[Bf-blender-cvs] [ce927e15e0e] master: Tweaks for threading schedule for Threadripper2 and EPYC

Sergey Sharybin noreply at git.blender.org
Wed Nov 28 14:42:05 CET 2018


Commit: ce927e15e0e3570a02834b6001519a024ca1e2b6
Author: Sergey Sharybin
Date:   Tue Nov 27 18:21:43 2018 +0100
Branches: master
https://developer.blender.org/rBce927e15e0e3570a02834b6001519a024ca1e2b6

Tweaks for threading schedule for Threadripper2 and EPYC

The idea is to make the main thread and job threads be scheduled
on CPU dies which have direct access to memory (those are NUMA
nodes 0 and 2).

We also do this for new EPYC CPUs since their NUMA nodes 1 and 3
do have access, but only to the higher-range DDR slots. By preferring
nodes 0 and 2 on EPYC we make sure users with partially filled
DDR slots have fast memory access.

One thing which is not really solved yet is localization of
memory allocation: we do not guarantee that memory is allocated
on the DDR slot closest to the NUMA node, and hope that the memory
manager of the OS is acting in our favor.

===================================================================

M	source/blender/blenlib/BLI_threads.h
M	source/blender/blenlib/CMakeLists.txt
M	source/blender/blenlib/intern/threads.c
M	source/blender/windowmanager/intern/wm_jobs.c
M	source/creator/creator.c

===================================================================

diff --git a/source/blender/blenlib/BLI_threads.h b/source/blender/blenlib/BLI_threads.h
index 81f8445783b..631a65ccade 100644
--- a/source/blender/blenlib/BLI_threads.h
+++ b/source/blender/blenlib/BLI_threads.h
@@ -204,6 +204,12 @@ void BLI_thread_queue_nowait(ThreadQueue *queue);
 #  define BLI_thread_local_set(name, value) name = value
 #endif  /* defined(__APPLE__) */
 
+/* **** Special functions to help performance on crazy NUMA setups. **** */
+
+/* Make sure process/thread is using NUMA node with fast memory access. */
+void BLI_thread_put_process_on_fast_node(void);
+void BLI_thread_put_thread_on_fast_node(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index e3f5773b1e4..16dfec77260 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -30,6 +30,7 @@ set(INC
 	../../../intern/guardedalloc
 	../../../intern/atomic
 	../../../intern/eigen
+	../../../intern/numaapi/include
 	../../../extern/wcwidth
 )
 
diff --git a/source/blender/blenlib/intern/threads.c b/source/blender/blenlib/intern/threads.c
index 862ce391109..f67d621f4a1 100644
--- a/source/blender/blenlib/intern/threads.c
+++ b/source/blender/blenlib/intern/threads.c
@@ -37,6 +37,7 @@
 
 #include "BLI_listbase.h"
 #include "BLI_gsqueue.h"
+#include "BLI_system.h"
 #include "BLI_task.h"
 #include "BLI_threads.h"
 
@@ -55,6 +56,7 @@
 #endif
 
 #include "atomic_ops.h"
+#include "numaapi.h"
 
 #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__)
 #  define USE_APPLE_OMP_FIX
@@ -126,6 +128,7 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_t mainid;
+static bool is_numa_available = false;
 static unsigned int thread_levels = 0;  /* threads can be invoked inside threads */
 static int num_threads_override = 0;
 
@@ -155,6 +158,9 @@ void BLI_threadapi_init(void)
 	mainid = pthread_self();
 
 	BLI_spin_init(&_malloc_lock);
+	if (numaAPI_Initialize() == NUMAAPI_SUCCESS) {
+		is_numa_available = true;
+	}
 }
 
 void BLI_threadapi_exit(void)
@@ -840,3 +846,98 @@ void BLI_threaded_malloc_end(void)
 		MEM_set_lock_callback(NULL, NULL);
 	}
 }
+
+/* **** Special functions to help performance on crazy NUMA setups. **** */
+
+/* Returns true when the CPU brand string names a Threadripper or EPYC part.
+ * The answer is computed once and cached in function-local statics. */
+static bool check_is_threadripper2_alike_topology(void)
+{
+	/* NOTE: We hope operating system does not support CPU hotswap to
+	 * a different brand. And that SMP of different types is also not
+	 * encouraged by the system. */
+	static bool is_initialized = false;
+	static bool is_threadripper2 = false;
+	if (is_initialized) {
+		return is_threadripper2;
+	}
+	is_initialized = true;
+	char *cpu_brand = BLI_cpu_brand_string();
+	if (cpu_brand == NULL) {
+		return false;
+	}
+	if (strstr(cpu_brand, "Threadripper")) {
+		/* NOTE: We consider all Threadrippers having similar topology to
+		 * the second one. This is because we are trying to utilize NUMA node
+		 * 0 as much as possible. This node does exist on earlier versions of
+		 * Threadripper and setting affinity to it should not have negative
+		 * effect.
+		 * Skipping the per-model check keeps the code reliable for CPUs
+		 * which are not yet released. */
+		is_threadripper2 = true;
+	}
+	/* NOTE: While all dies of EPYC have a memory controller, only two of them
+	 * have access to the lower-indexed DDR slots. Those dies are the same as
+	 * the Threadripper2 dies with a memory controller.
+	 * It is rather likely that many users don't fill all their DDR slots,
+	 * leaving only two dies connected to a slot with actual memory in it. */
+	if (strstr(cpu_brand, "EPYC")) {
+		/* NOTE: Similarly to Threadripper we do not do model check. */
+		is_threadripper2 = true;
+	}
+	/* The brand string is allocated for, and owned by, the caller. */
+	MEM_freeN(cpu_brand);
+	return is_threadripper2;
+}
+
+static void threadripper_put_process_on_fast_node(void)
+{
+	/* NOTE: Technically both NUMA nodes 0 and 2 qualify, and putting both
+	 * into the affinity mask would let the OS schedule threads more
+	 * flexibly, possibly increasing overall performance when multiple apps
+	 * are crunching numbers.
+	 *
+	 * However, if the scene fits into memory adjacent to a single die we
+	 * don't want the OS to re-schedule the process to another die, since
+	 * that moves it further away from memory allocated for the .blend file. */
+	/* NOTE: Even if NUMA is available in the API but is disabled in BIOS on
+	 * this workstation we still proceed here. With NUMA disabled there is a
+	 * single node, so pinning has no visible effect, while the code stays
+	 * simple and unified. */
+	if (is_numa_available) {
+		/* Node 0 only, for the reasons given above. */
+		numaAPI_RunProcessOnNode(0);
+	}
+}
+
+static void threadripper_put_thread_on_fast_node(void)
+{
+	/* NOTE: There is a trade-off here. Allowing both nodes 0 and 2 would
+	 * let the operating system balance processes/threads for maximum
+	 * performance when multiple apps are running.
+	 * On the other hand, we probably want the same node as the main
+	 * thread, since that's where the memory of the .blend file is likely
+	 * to be allocated.
+	 * The main thread is currently put on node 0, so this thread goes to
+	 * the same node. */
+	/* NOTE: As for the process, this is done even when NUMA is disabled in
+	 * BIOS: there is then a single node and the call changes nothing. */
+	if (is_numa_available) {
+		/* Node 0 only, matching the main thread. */
+		numaAPI_RunThreadOnNode(0);
+	}
+}
+
+void BLI_thread_put_process_on_fast_node(void)
+{
+	/* Only CPUs with a Threadripper2-alike topology get the process pinned. */
+	if (!check_is_threadripper2_alike_topology()) { return; }
+	threadripper_put_process_on_fast_node();
+}
+
+void BLI_thread_put_thread_on_fast_node(void)
+{
+	/* Only CPUs with a Threadripper2-alike topology get the thread pinned. */
+	if (!check_is_threadripper2_alike_topology()) { return; }
+	threadripper_put_thread_on_fast_node();
+}
diff --git a/source/blender/windowmanager/intern/wm_jobs.c b/source/blender/windowmanager/intern/wm_jobs.c
index 92d51c9a400..cb627b465f4 100644
--- a/source/blender/windowmanager/intern/wm_jobs.c
+++ b/source/blender/windowmanager/intern/wm_jobs.c
@@ -334,6 +334,7 @@ static void *do_job_thread(void *job_v)
 {
 	wmJob *wm_job = job_v;
 
+	BLI_thread_put_thread_on_fast_node();
 	wm_job->startjob(wm_job->run_customdata, &wm_job->stop, &wm_job->do_update, &wm_job->progress);
 	wm_job->ready = true;
 
diff --git a/source/creator/creator.c b/source/creator/creator.c
index e375b65fd75..1d39fd6f05a 100644
--- a/source/creator/creator.c
+++ b/source/creator/creator.c
@@ -52,6 +52,7 @@
 #include "BLI_callbacks.h"
 #include "BLI_string.h"
 #include "BLI_system.h"
+#include "BLI_threads.h"
 
 /* mostly init functions */
 #include "BKE_appdir.h"
@@ -364,6 +365,7 @@ int main(
 	BKE_appdir_program_path_init(argv[0]);
 
 	BLI_threadapi_init();
+	BLI_thread_put_process_on_fast_node();
 
 	DNA_sdna_current_init();



More information about the Bf-blender-cvs mailing list