[Bf-blender-cvs] [312aa67cc73] master: Remove dead numaapi code in blenlib

Fri Jan 7 12:19:06 CET 2022

Commit: 312aa67cc73c97f11e34be24623231f98fcdaa35
Author: Sergey Sharybin
Date:   Fri Jan 7 12:01:11 2022 +0100
Branches: master
https://developer.blender.org/rB312aa67cc73c97f11e34be24623231f98fcdaa35

Remove dead numaapi code in blenlib

It is rather an old experiment now which didn't pay off.
The initial idea was to have main and jobs threads on fast
nodes of TR2 processors. This didn't really work reliably
because in Blender we need to be able to create nested
threads without their affinity set. This is not how some
operating systems create nested threads, and we don't always
have access to child threads to reset their affinity.

So the overall complexity of implementing the initial idea
became too much compared to the performance gain.

===================================================================

M	source/blender/blenlib/BLI_threads.h
M	source/blender/blenlib/CMakeLists.txt
M	source/blender/blenlib/intern/threads.cc
M	source/blender/windowmanager/intern/wm_jobs.c
M	source/creator/creator.c

===================================================================

diff --git a/source/blender/blenlib/BLI_threads.h b/source/blender/blenlib/BLI_threads.h
index c7aae6d7972..60ed5f0544a 100644
--- a/source/blender/blenlib/BLI_threads.h
+++ b/source/blender/blenlib/BLI_threads.h
@@ -210,13 +210,6 @@ void BLI_thread_queue_nowait(ThreadQueue *queue);
 #  define BLI_thread_local_set(name, value) name = value
 #endif /* defined(__APPLE__) */
 
-/* **** Special functions to help performance on crazy NUMA setups. **** */
-
-/* Make sure process/thread is using NUMA node with fast memory access. */
-
-void BLI_thread_put_process_on_fast_node(void);
-void BLI_thread_put_thread_on_fast_node(void);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index 0db0d9a67b3..3958fd8e2d2 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -25,7 +25,6 @@ set(INC
   ../../../intern/atomic
   ../../../intern/eigen
   ../../../intern/guardedalloc
-  ../../../intern/numaapi/include
   ../../../extern/wcwidth
   ../../../extern/json/include
 )
@@ -334,7 +333,6 @@ set(SRC
 set(LIB
   bf_intern_eigen
   bf_intern_guardedalloc
-  bf_intern_numaapi
   extern_wcwidth
 
   ${ZLIB_LIBRARIES}
diff --git a/source/blender/blenlib/intern/threads.cc b/source/blender/blenlib/intern/threads.cc
index 3589c952409..f131a464650 100644
--- a/source/blender/blenlib/intern/threads.cc
+++ b/source/blender/blenlib/intern/threads.cc
@@ -52,7 +52,6 @@
 #endif
 
 #include "atomic_ops.h"
-#include "numaapi.h"
 
 #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && \
     !defined(__clang__)
@@ -125,7 +124,6 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_t mainid;
-static bool is_numa_available = false;
 static unsigned int thread_levels = 0; /* threads can be invoked inside threads */
 static int num_threads_override = 0;
 
@@ -143,9 +141,6 @@ struct ThreadSlot {
 void BLI_threadapi_init()
 {
   mainid = pthread_self();
-  if (numaAPI_Initialize() == NUMAAPI_SUCCESS) {
-    is_numa_available = true;
-  }
 }
 
 void BLI_threadapi_exit()
@@ -807,110 +802,3 @@ void BLI_thread_queue_wait_finish(ThreadQueue *queue)
 
   pthread_mutex_unlock(&queue->mutex);
 }
-
-/* **** Special functions to help performance on crazy NUMA setups. **** */
-
-#if 0  /* UNUSED */
-static bool check_is_threadripper2_alike_topology(){
-  /* NOTE: We hope operating system does not support CPU hot-swap to
-   * a different brand. And that SMP of different types is also not
-   * encouraged by the system. */
-  static bool is_initialized = false;
-  static bool is_threadripper2 = false;
-  if (is_initialized) {
-    return is_threadripper2;
-  }
-  is_initialized = true;
-  char *cpu_brand = BLI_cpu_brand_string();
-  if (cpu_brand == nullptr) {
-    return false;
-  }
-  if (strstr(cpu_brand, "Threadripper")) {
-    /* NOTE: We consider all Thread-rippers having similar topology to
-     * the second one. This is because we are trying to utilize NUMA node
-     * 0 as much as possible. This node does exist on earlier versions of
-     * thread-ripper and setting affinity to it should not have negative
-     * effect.
-     * This allows us to avoid per-model check, making the code more
-     * reliable for the CPUs which are not yet released.
-     */
-    if (strstr(cpu_brand, "2990WX") || strstr(cpu_brand, "2950X")) {
-      is_threadripper2 = true;
-    }
-  }
-  /* NOTE: While all dies of EPYC has memory controller, only two f them
-   * has access to a lower-indexed DDR slots. Those dies are same as on
-   * Threadripper2 with the memory controller.
-   * Now, it is rather likely that reasonable amount of users don't max
-   * up their DR slots, making it only two dies connected to a DDR slot
-   * with actual memory in it. */
-  if (strstr(cpu_brand, "EPYC")) {
-    /* NOTE: Similarly to Thread-ripper we do not do model check. */
-    is_threadripper2 = true;
-  }
-  MEM_freeN(cpu_brand);
-  return is_threadripper2;
-}
-
-static void threadripper_put_process_on_fast_node(){
-  if (!is_numa_available) {
-    return;
-  }
-  /* NOTE: Technically, we can use NUMA nodes 0 and 2 and using both of
-   * them in the affinity mask will allow OS to schedule threads more
-   * flexible,possibly increasing overall performance when multiple apps
-   * are crunching numbers.
-   *
-   * However, if scene fits into memory adjacent to a single die we don't
-   * want OS to re-schedule the process to another die since that will make
-   * it further away from memory allocated for .blend file. */
-  /* NOTE: Even if NUMA is available in the API but is disabled in BIOS on
-   * this workstation we still process here. If NUMA is disabled it will be a
-   * single node, so our action is no-visible-changes, but allows to keep
-   * things simple and unified. */
-  numaAPI_RunProcessOnNode(0);
-}
-
-static void threadripper_put_thread_on_fast_node(){
-  if (!is_numa_available) {
-    return;
-  }
-  /* NOTE: This is where things becomes more interesting. On the one hand
-   * we can use nodes 0 and 2 and allow operating system to do balancing
-   * of processes/threads for the maximum performance when multiple apps
-   * are running.
-   * On another hand, however, we probably want to use same node as the
-   * main thread since that's where the memory of .blend file is likely
-   * to be allocated.
-   * Since the main thread is currently on node 0, we also put thread on
-   * same node. */
-  /* See additional note about NUMA disabled in BIOS above. */
-  numaAPI_RunThreadOnNode(0);
-}
-#endif /* UNUSED */
-
-void BLI_thread_put_process_on_fast_node()
-{
-  /* Disabled for now since this causes only 16 threads to be used on a
-   * thread-ripper for computations like sculpting and fluid sim. The problem
-   * is that all threads created as children from this thread will inherit
-   * the NUMA node and so will end up on the same node. This can be fixed
-   * case-by-case by assigning the NUMA node for every child thread, however
-   * this is difficult for external libraries and OpenMP, and out of our
-   * control for plugins like external renderers. */
-#if 0
-  if (check_is_threadripper2_alike_topology()) {
-    threadripper_put_process_on_fast_node();
-  }
-#endif
-}
-
-void BLI_thread_put_thread_on_fast_node()
-{
-  /* Disabled for now, see comment above. */
-#if 0
-  if (check_is_threadripper2_alike_topology()) {
-    threadripper_put_thread_on_fast_node();
-  }
-#endif
-}
diff --git a/source/blender/windowmanager/intern/wm_jobs.c b/source/blender/windowmanager/intern/wm_jobs.c
index 66277ec57f4..e7ad654d3e5 100644
--- a/source/blender/windowmanager/intern/wm_jobs.c
+++ b/source/blender/windowmanager/intern/wm_jobs.c
@@ -381,7 +381,6 @@ static void *do_job_thread(void *job_v)
 {
   wmJob *wm_job = job_v;
 
-  BLI_thread_put_thread_on_fast_node();
   wm_job->startjob(wm_job->run_customdata, &wm_job->stop, &wm_job->do_update, &wm_job->progress);
   wm_job->ready = true;
 
diff --git a/source/creator/creator.c b/source/creator/creator.c
index c50b6ac4c79..d69e4d075fd 100644
--- a/source/creator/creator.c
+++ b/source/creator/creator.c
@@ -408,7 +408,6 @@ int main(int argc,
   BKE_appdir_program_path_init(argv[0]);
 
   BLI_threadapi_init();
-  BLI_thread_put_process_on_fast_node();
 
   DNA_sdna_current_init();