[Bf-blender-cvs] [01e8e7dc6d9] master: Task scheduler: Optimize parallel loop over lists

Tue Nov 20 15:02:02 CET 2018

Commit: 01e8e7dc6d9dd3710e0e3872b2d70196ac654d14
Author: Sergey Sharybin
Date:   Tue Nov 20 12:17:03 2018 +0100
Branches: master
https://developer.blender.org/rB01e8e7dc6d9dd3710e0e3872b2d70196ac654d14

Task scheduler: Optimize parallel loop over lists

The goal is to address performance regression when going from
few threads to 10s of threads. On a systems with more than 32
CPU threads the benefit of threaded loop was actually harmful.

There are following tweaks now:

- The chunk size is adaptive for the number of threads, which
  minimizes scheduling overhead.

- The number of tasks is adaptive to the list size and chunk
  size.

Here comes performance comparison on the production shot:

 Number of threads        DEG time before        DEG time after
       44                     0.09                   0.02
       32                     0.055                  0.025
       16                     0.025                  0.025
       8                      0.035                  0.033

===================================================================

M	source/blender/blenlib/intern/task.c

===================================================================

diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index 2bb5d5397a9..b6d704d8d82 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -1221,6 +1221,31 @@ static void parallel_listbase_func(
 	}
 }
 
+static void task_parallel_listbase_no_threads(
+        struct ListBase *listbase,
+        void *userdata,
+        TaskParallelListbaseFunc func)
+{
+	int i = 0;
+	for (Link *link = listbase->first; link != NULL; link = link->next, ++i) {
+		func(userdata, link, i);
+	}
+}
+
+/* NOTE: The idea here is to compensate for rather measurable threading
+ * overhead caused by fetching tasks. With too many CPU threads we are starting
+ * to spend too much time in those overheads. */
+BLI_INLINE int task_parallel_listbasecalc_chunk_size(const int num_threads)
+{
+	if (num_threads > 32) {
+		return 128;
+	}
+	else if (num_threads > 16) {
+		return 64;
+	}
+	return 32;
+}
+
 /**
  * This function allows to parallelize for loops over ListBase items.
  *
@@ -1238,41 +1263,37 @@ void BLI_task_parallel_listbase(
         TaskParallelListbaseFunc func,
         const bool use_threading)
 {
-	TaskScheduler *task_scheduler;
-	TaskPool *task_pool;
-	ParallelListState state;
-	int i, num_threads, num_tasks;
-
 	if (BLI_listbase_is_empty(listbase)) {
 		return;
 	}
-
 	if (!use_threading) {
-		i = 0;
-		for (Link *link = listbase->first; link != NULL; link = link->next, ++i) {
-			func(userdata, link, i);
-		}
+		task_parallel_listbase_no_threads(listbase, userdata, func);
+		return;
+	}
+	TaskScheduler *task_scheduler = BLI_task_scheduler_get();
+	const int num_threads = BLI_task_scheduler_num_threads(task_scheduler);
+	/* TODO(sergey): Consider making chunk size configurable. */
+	const int chunk_size = task_parallel_listbasecalc_chunk_size(num_threads);
+	const int num_tasks = min_ii(
+	        num_threads,
+	        BLI_listbase_count(listbase) / chunk_size);
+	if (num_tasks <= 1) {
+		task_parallel_listbase_no_threads(listbase, userdata, func);
 		return;
 	}
 
-	task_scheduler = BLI_task_scheduler_get();
-	task_pool = BLI_task_pool_create_suspended(task_scheduler, &state);
-	num_threads = BLI_task_scheduler_num_threads(task_scheduler);
-
-	/* The idea here is to prevent creating task for each of the loop iterations
-	 * and instead have tasks which are evenly distributed across CPU cores and
-	 * pull next iter to be crunched using the queue.
-	 */
-	num_tasks = num_threads + 2;
+	ParallelListState state;
+	TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, &state);
 
 	state.index = 0;
 	state.link = listbase->first;
 	state.userdata = userdata;
 	state.func = func;
-	state.chunk_size = 32;
+	state.chunk_size = chunk_size;
 	BLI_spin_init(&state.lock);
 
-	for (i = 0; i < num_tasks; i++) {
+	BLI_assert(num_tasks > 0);
+	for (int i = 0; i < num_tasks; i++) {
 		/* Use this pool's pre-allocated tasks. */
 		BLI_task_pool_push_from_thread(task_pool,
 		                               parallel_listbase_func,