[Bf-blender-cvs] [826d7adde79] blender2.7: Fix T59874: Cycles CPU 25% load only during rendering

Sergey Sharybin noreply at git.blender.org
Thu Dec 27 19:13:17 CET 2018


Commit: 826d7adde79216d271b78059c05abd10b7559899
Author: Sergey Sharybin
Date:   Thu Dec 27 19:01:19 2018 +0100
Branches: blender2.7
https://developer.blender.org/rB826d7adde79216d271b78059c05abd10b7559899

Fix T59874: Cycles CPU 25% load only during rendering

The issue was introduced by a Threadripper2 commit back in
ce927e15e0e3. It boils down to threads inheriting affinity
from the parent thread. It is unclear how this slipped
through review (we definitely ran a benchmark round).

A quick fix could have been to always set CPU group affinity
in Cycles, which would have worked for Windows. On other
platforms the CPU groups API was not finished.

Ended up making Cycles aware of NUMA topology, so threads
are now bound to a specific NUMA node. This required adding
an external dependency to Cycles, but made some of the code
there shorter.
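
For illustration, a minimal sketch of the resulting scheduling
policy, using the new util_system API added in the diff below.
spawn_worker() and do_render_work() are hypothetical stand-ins
for the Cycles task scheduler internals, not real functions:

	/* Sketch only: spawn one worker per processor on every available
	 * NUMA node, pinning each worker to its node before it starts. */
	for (int node = 0; node < system_cpu_num_numa_nodes(); ++node) {
		if (!system_cpu_is_numa_node_available(node)) {
			continue;
		}
		const int num_processors = system_cpu_num_numa_node_processors(node);
		for (int i = 0; i < num_processors; ++i) {
			spawn_worker([node]() {
				/* Pin this thread (and any threads it spawns) to the node,
				 * so affinity is no longer blindly inherited. */
				system_cpu_run_thread_on_node(node);
				do_render_work();  /* Hypothetical render workload. */
			});
		}
	}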

===================================================================

M	intern/cycles/util/CMakeLists.txt
M	intern/cycles/util/util_system.cpp
M	intern/cycles/util/util_system.h
M	intern/cycles/util/util_task.cpp
M	intern/cycles/util/util_thread.cpp
M	intern/cycles/util/util_thread.h
D	intern/cycles/util/util_windows.cpp
M	intern/cycles/util/util_windows.h
M	intern/numaapi/include/numaapi.h
M	intern/numaapi/source/numaapi_linux.c

===================================================================

diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 92dfc9fa85d..42626d05cf9 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -25,13 +25,17 @@ set(SRC
 	util_thread.cpp
 	util_time.cpp
 	util_transform.cpp
-	util_windows.cpp
 )
 
-if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
-	list(APPEND SRC
-		util_view.cpp
-	)
+if(WITH_CYCLES_STANDALONE)
+	if (WITH_CYCLES_STANDALONE_GUI)
+		list(APPEND SRC
+			util_view.cpp
+		)
+	endif()
+	list(APPEND INC_SYS ../../third_party/numaapi/include)
+else()
+	list(APPEND INC_SYS ../../numaapi/include)
 endif()
 
 set(SRC_HEADERS
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 34f428f111c..cc2d7017fd8 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -20,6 +20,8 @@
 #include "util/util_types.h"
 #include "util/util_string.h"
 
+#include <numaapi.h>
+
 #ifdef _WIN32
 #  if(!defined(FREE_WINDOWS))
 #    include <intrin.h>
@@ -34,74 +36,81 @@
 
 CCL_NAMESPACE_BEGIN
 
-int system_cpu_group_count()
+bool system_cpu_ensure_initialized()
 {
-#ifdef _WIN32
-	util_windows_init_numa_groups();
-	return GetActiveProcessorGroupCount();
-#else
-	/* TODO(sergey): Need to adopt for other platforms. */
-	return 1;
-#endif
+	static bool is_initialized = false;
+	static bool result = false;
+	if (is_initialized) {
+		return result;
+	}
+	is_initialized = true;
+	const NUMAAPI_Result numa_result = numaAPI_Initialize();
+	result = (numa_result == NUMAAPI_SUCCESS);
+	return result;
 }
 
-int system_cpu_group_thread_count(int group)
+/* Fallback solution, which doesn't use NUMA/CPU groups. */
+static int system_cpu_thread_count_fallback()
 {
-	/* TODO(sergey): Need make other platforms aware of groups. */
 #ifdef _WIN32
-	util_windows_init_numa_groups();
-	return GetActiveProcessorCount(group);
+	SYSTEM_INFO info;
+	GetSystemInfo(&info);
+	return info.dwNumberOfProcessors;
 #elif defined(__APPLE__)
-	(void) group;
 	int count;
 	size_t len = sizeof(count);
 	int mib[2] = { CTL_HW, HW_NCPU };
 	sysctl(mib, 2, &count, &len, NULL, 0);
 	return count;
 #else
-	(void) group;
 	return sysconf(_SC_NPROCESSORS_ONLN);
 #endif
 }
 
 int system_cpu_thread_count()
 {
-	static uint count = 0;
-
-	if(count > 0) {
-		return count;
+	const int num_nodes = system_cpu_num_numa_nodes();
+	int num_threads = 0;
+	for (int node = 0; node < num_nodes; ++node) {
+		if (!system_cpu_is_numa_node_available(node)) {
+			continue;
+		}
+		num_threads += system_cpu_num_numa_node_processors(node);
 	}
+	return num_threads;
+}
 
-	int max_group = system_cpu_group_count();
-	VLOG(1) << "Detected " << max_group << " CPU groups.";
-	for(int group = 0; group < max_group; ++group) {
-		int num_threads = system_cpu_group_thread_count(group);
-		VLOG(1) << "Group " << group
-		        << " has " << num_threads << " threads.";
-		count += num_threads;
+int system_cpu_num_numa_nodes()
+{
+	if (!system_cpu_ensure_initialized()) {
+		/* Fallback to a single node with all the threads. */
+		return 1;
 	}
+	return numaAPI_GetNumNodes();
+}
 
-	if(count < 1) {
-		count = 1;
+bool system_cpu_is_numa_node_available(int node)
+{
+	if (!system_cpu_ensure_initialized()) {
+		return true;
 	}
+	return numaAPI_IsNodeAvailable(node);
+}
 
-	return count;
+int system_cpu_num_numa_node_processors(int node)
+{
+	if (!system_cpu_ensure_initialized()) {
+		return system_cpu_thread_count_fallback();
+	}
+	return numaAPI_GetNumNodeProcessors(node);
 }
 
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *groups)
+bool system_cpu_run_thread_on_node(int node)
 {
-#ifdef _WIN32
-	unsigned short group_count = max_groups;
-	if(!GetProcessGroupAffinity(GetCurrentProcess(), &group_count, groups)) {
-		return 0;
+	if (!system_cpu_ensure_initialized()) {
+		return true;
 	}
-	return group_count;
-#else
-	(void) max_groups;
-	(void) groups;
-	return 0;
-#endif
+	return numaAPI_RunThreadOnNode(node);
 }
 
 #if !defined(_WIN32) || defined(FREE_WINDOWS)
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 241ac897157..15f69bcf153 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -21,18 +21,28 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Get number of available CPU groups. */
-int system_cpu_group_count();
+/* Make sure CPU groups / NUMA API is initialized. */
+bool system_cpu_ensure_initialized();
 
-/* Get number of threads/processors in the specified group. */
-int system_cpu_group_thread_count(int group);
-
-/* Get total number of threads in all groups. */
+/* Get total number of threads in all NUMA nodes / CPU groups. */
 int system_cpu_thread_count();
 
-/* Get current process groups. */
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *grpups);
+/* Get number of available nodes.
+ *
+ * This is in fact the index of the last node plus one, and it is not
+ * guaranteed that all nodes up to this one are available. */
+int system_cpu_num_numa_nodes();
+
+/* Returns true if the given node is available for compute. */
+bool system_cpu_is_numa_node_available(int node);
+
+/* Get number of available processors on a given node. */
+int system_cpu_num_numa_node_processors(int node);
+
+/* Runs the current thread and its children on a specific node.
+ *
+ * Returns true if the affinity was successfully changed. */
+bool system_cpu_run_thread_on_node(int node);
 
 string system_cpu_brand_string();
 int system_cpu_bits();
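
A hedged usage sketch of the declarations above, e.g. to log the
detected topology (namespace handling omitted for brevity; Cycles
itself logs via VLOG, printf is used here to keep the example
self-contained):

	#include <cstdio>

	#include "util/util_system.h"

	void log_numa_topology()
	{
		if (!system_cpu_ensure_initialized()) {
			printf("NUMA API unavailable, using the single-node fallback.\n");
		}
		const int num_nodes = system_cpu_num_numa_nodes();
		for (int node = 0; node < num_nodes; ++node) {
			/* Node indices may be sparse, so check availability first. */
			if (!system_cpu_is_numa_node_available(node)) {
				continue;
			}
			printf("Node %d: %d processors.\n",
			       node, system_cpu_num_numa_node_processors(node));
		}
	}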
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 2d21d6b5a18..50a2bb160ff 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -204,50 +204,26 @@ void TaskScheduler::init(int num_threads)
 		/* launch threads that will be waiting for work */
 		threads.resize(num_threads);
 
-		const int num_groups = system_cpu_group_count();
-		unsigned short num_process_groups = 0;
-		vector<unsigned short> process_groups;
-		int current_group_threads = 0;
-		if(num_groups > 1) {
-			process_groups.resize(num_groups);
-			num_process_groups = system_cpu_process_groups(num_groups,
-			                                               &process_groups[0]);
-			if(num_process_groups == 1) {
-				current_group_threads = system_cpu_group_thread_count(process_groups[0]);
-			}
-		}
+		const int num_nodes = system_cpu_num_numa_nodes();
 		int thread_index = 0;
-		for(int group = 0; group < num_groups; ++group) {
-			/* NOTE: That's not really efficient from threading point of view,
-			 * but it is simple to read and it doesn't make sense to use more
-			 * user-specified threads than logical threads anyway.
-			 */
-			int num_group_threads = (group == num_groups - 1)
-			        ? (threads.size() - thread_index)
-			        : system_cpu_group_thread_count(group);
-			for(int group_thread = 0;
-				group_thread < num_group_threads && thread_index < threads.size();
-				++group_thread, ++thread_index)
+		for (int node = 0;
+		     node < num_nodes && thread_index < threads.size();
+		     ++node)
+		{
+			if (!system_cpu_is_numa_node_available(node)) {
+				continue;
+			}
+			const int num_node_processors =
+			        system_cpu_num_numa_node_processors(node);
+			for (int i = 0;
+			     i < num_node_processors && thread_index < threads.size();
+			     ++i)
 			{
-				/* NOTE: Thread group of -1 means we would not force thread affinity. */
-				int thread_group;
-				if(num_groups == 1) {
-					/* Use default affinity if there's only one CPU group in the system. */
-					thread_group = -1;
-				}
-				else if(use_auto_threads &&
-				        num_process_groups == 1 &&
-						num_threads <= current_group_threads)
-				{
-					/* If we fit into curent CPU group we also don't force any affinity. */
-					thread_group = -1;
-				}
-				else {
-					thread_group = group;
-				}
-				threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run,
-				                                                 thread_index + 1),
-				                                   thread_group);
+				threads[thread_index] = new thread(
+				        function_bind(&TaskScheduler::thread_run,
+				                      thread_index + 1),
+				        node);
+				thread_index++;
 			}
 		}
 	}
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 37d8bdbd4b0..4d30e3f564f 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -21,10 +21,10 @@
 
 CCL_NAMESPACE_BEGIN
 
-thread::thread(function<void()> run_cb, int group)
+thread::thread(function<void()> run_cb, int node)
   : run_cb_(run_cb),
     joined_(false),
-	group_(group)
+	node_(node)
 {
 	thread_ = std::thread(&thread::run, this);
 }
@@ -39,19 +39,8 @@ thread::~thread()
 void *thread::run(void *arg)
 {
 	thread *self = (thread*)(arg);
-	if(self->group_ != -1) {
-#ifdef _WIN32
-		HANDLE thread_handle = GetCurrentThread();
-		GROUP_AFFINITY group_affinity = { 0 };
-		int num_threads = system_cpu_group_thread_count(self->group_);
-		group_affinity.Group = self->group_;
-		group_affinity.Mask = (num_threads == 64)
-		                              ? -1
-		                              :  (1ull << num_threads) - 1;
-		if(SetThreadGroupAffinity(thread_handle, &group_affinity, NULL) == 0) {
-			fprintf(stderr, "Error setting thread affinity.\n");
-		}
-#endif
+	if (self->node_ != -1) {
+		system_cpu_run_thread_on_node(self->node_);
 	}
 	self->run_cb_();
 	return NULL;
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 6250bb95dcf..d54199a37fc 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -46,7 +46,7 @@ typedef std::condition_variable thread_condition_variable;
 
 class thread {
 public:
-	thread(function<void()> run_cb, int group = -1);
+	thread(function<void()> run_cb, int node = -1);
 	~thread();
 
 	static void *run(void *arg);
@@ -56,7 +56,7 @@ protected:
 	function<void()> run_cb_;
 	std::thread thread_;
 	bool joined_;
-	int group_;
+	int node_;
 };
 
 /* Own wrapper around pthread's spin lock t

@@ Diff output truncated at 10240 characters. @@
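
After the change, node pinning is just a constructor argument on the
Cycles thread wrapper; a sketch using the signatures from
util_thread.h above (the -1 default means, as before, that no
affinity is forced):

	/* Pin a worker to NUMA node 0; thread::run() will call
	 * system_cpu_run_thread_on_node(0) before invoking the callback. */
	thread *pinned = new thread(
	        function_bind(&TaskScheduler::thread_run, 1), 0);

	/* The default node of -1 keeps the thread's inherited affinity. */
	thread *unpinned = new thread(
	        function_bind(&TaskScheduler::thread_run, 2));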
