[Bf-blender-cvs] [e43b74d] master: Optimization of parallel range

Sergey Sharybin noreply at git.blender.org
Mon Nov 3 18:45:33 CET 2014


Commit: e43b74d87a43ab919b86434db9881608c5b9f762
Author: Sergey Sharybin
Date:   Mon Nov 3 18:24:08 2014 +0100
Branches: master
https://developer.blender.org/rBe43b74d87a43ab919b86434db9881608c5b9f762

Optimization of parallel range

It now supports different scheduling schemas: dynamic and static.
Static one is the default and it splits work into equal number of
range iterations.

Dynamic one allocates chunks of 32 iterations which then being
dynamically send to a thread which is currently idling.

This gives slightly better performance. Still some tricks are
possible to have. For example we can use some smarter static scheduling
when one thread might steal tasks from another threads when it runs
out of work to be done.

Also removed unneeded spin lock in the mesh deform evaluation,
on the first glance it seemed to be a reduction involved here but
int fact threads are just adding value to the original vertex
coordinates. No write access to the same element of  vertexCos
happens from separate threads.

===================================================================

M	source/blender/blenlib/BLI_task.h
M	source/blender/blenlib/intern/task.c
M	source/blender/modifiers/intern/MOD_meshdeform.c

===================================================================

diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 8c22a25..28da673 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -106,7 +106,8 @@ void BLI_task_parallel_range_ex(
         int start, int stop,
         void *userdata,
         TaskParallelRangeFunc func,
-        const int range_threshold);
+        const int range_threshold,
+        const bool use_dynamic_scheduling);
 void BLI_task_parallel_range(
         int start, int stop,
         void *userdata,
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index 07c67f0..219ccb1 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -29,6 +29,7 @@
 #include "MEM_guardedalloc.h"
 
 #include "BLI_listbase.h"
+#include "BLI_math.h"
 #include "BLI_task.h"
 #include "BLI_threads.h"
 
@@ -452,18 +453,21 @@ typedef struct ParallelRangeState {
 	TaskParallelRangeFunc func;
 
 	int iter;
+	int chunk_size;
 	SpinLock lock;
 } ParallelRangeState;
 
 BLI_INLINE bool parallel_range_next_iter_get(
-        ParallelRangeState *state,
-        int *iter)
+        ParallelRangeState * __restrict state,
+        int * __restrict iter, int * __restrict count)
 {
 	bool result = false;
 	if (state->iter < state->stop) {
 		BLI_spin_lock(&state->lock);
 		if (state->iter < state->stop) {
-			*iter = state->iter++;
+			*count = min_ii(state->chunk_size, state->stop - state->iter);
+			*iter = state->iter;
+			state->iter += *count;
 			result = true;
 		}
 		BLI_spin_unlock(&state->lock);
@@ -472,14 +476,17 @@ BLI_INLINE bool parallel_range_next_iter_get(
 }
 
 static void parallel_range_func(
-        TaskPool *pool,
+        TaskPool * __restrict pool,
         void *UNUSED(taskdata),
         int UNUSED(threadid))
 {
-	ParallelRangeState *state = BLI_task_pool_userdata(pool);
-	int iter;
-	while (parallel_range_next_iter_get(state, &iter)) {
-		state->func(state->userdata, iter);
+	ParallelRangeState * __restrict state = BLI_task_pool_userdata(pool);
+	int iter, count;
+	while (parallel_range_next_iter_get(state, &iter, &count)) {
+		int i;
+		for (i = 0; i < count; ++i) {
+			state->func(state->userdata, iter + i);
+		}
 	}
 }
 
@@ -487,12 +494,13 @@ void BLI_task_parallel_range_ex(
         int start, int stop,
         void *userdata,
         TaskParallelRangeFunc func,
-        const int range_threshold)
+        const int range_threshold,
+        const bool use_dynamic_scheduling)
 {
 	TaskScheduler *task_scheduler;
 	TaskPool *task_pool;
 	ParallelRangeState state;
-	int i;
+	int i, num_threads, num_tasks;
 
 	BLI_assert(start < stop);
 
@@ -506,21 +514,30 @@ void BLI_task_parallel_range_ex(
 		return;
 	}
 
-	BLI_spin_init(&state.lock);
-	state.start = start;
-	state.stop = stop;
-	state.userdata = userdata;
-	state.func = func;
-	state.iter = start;
-
 	task_scheduler = BLI_task_scheduler_get();
 	task_pool = BLI_task_pool_create(task_scheduler, &state);
+	num_threads = BLI_task_scheduler_num_threads(task_scheduler);
 
 	/* The idea here is to prevent creating task for each of the loop iterations
 	 * and instead have tasks which are evenly distributed across CPU cores and
 	 * pull next iter to be crunched using the queue.
 	 */
-	for (i = 0; i < 2 * BLI_task_scheduler_num_threads(task_scheduler); i++) {
+	num_tasks = num_threads * 2;
+
+	BLI_spin_init(&state.lock);
+	state.start = start;
+	state.stop = stop;
+	state.userdata = userdata;
+	state.func = func;
+	state.iter = start;
+	if (use_dynamic_scheduling) {
+		state.chunk_size = 32;
+	}
+	else {
+		state.chunk_size = (stop - start) / (num_tasks);
+	}
+
+	for (i = 0; i < num_tasks; i++) {
 		BLI_task_pool_push(task_pool,
 		                   parallel_range_func,
 		                   NULL, false,
@@ -538,5 +555,5 @@ void BLI_task_parallel_range(
         void *userdata,
         TaskParallelRangeFunc func)
 {
-	BLI_task_parallel_range_ex(start, stop, userdata, func, 64);
+	BLI_task_parallel_range_ex(start, stop, userdata, func, 64, false);
 }
diff --git a/source/blender/modifiers/intern/MOD_meshdeform.c b/source/blender/modifiers/intern/MOD_meshdeform.c
index 472f35f..c253c03 100644
--- a/source/blender/modifiers/intern/MOD_meshdeform.c
+++ b/source/blender/modifiers/intern/MOD_meshdeform.c
@@ -213,10 +213,9 @@ typedef struct MeshdeformUserdata {
 	float (*vertexCos)[3];
 	float (*cagemat)[4];
 	float (*icagemat)[3];
-	SpinLock lock;
 } MeshdeformUserdata;
 
-static void meshdeform_vert_task(void *userdata, int iter)
+static void meshdeform_vert_task(void * userdata, int iter)
 {
 	MeshdeformUserdata *data = userdata;
 	/*const*/ MeshDeformModifierData *mmd = data->mmd;
@@ -265,12 +264,10 @@ static void meshdeform_vert_task(void *userdata, int iter)
 	if (totweight > 0.0f) {
 		mul_v3_fl(co, fac / totweight);
 		mul_m3_v3(data->icagemat, co);
-		BLI_spin_lock(&data->lock);
 		if (G.debug_value != 527)
 			add_v3_v3(vertexCos[iter], co);
 		else
 			copy_v3_v3(vertexCos[iter], co);
-		BLI_spin_unlock(&data->lock);
 	}
 }
 
@@ -395,14 +392,10 @@ static void meshdeformModifier_do(
 	data.vertexCos = vertexCos;
 	data.cagemat = cagemat;
 	data.icagemat = icagemat;
-	BLI_spin_init(&data.lock);
 
 	/* Do deformation. */
 	BLI_task_parallel_range(0, totvert, &data, meshdeform_vert_task);
 
-	/* Uninitialize user dtaa used by the task system. */
-	BLI_spin_end(&data.lock);
-
 	/* release cage derivedmesh */
 	MEM_freeN(dco);
 	MEM_freeN(cagecos);




More information about the Bf-blender-cvs mailing list