[Bf-blender-cvs] [98123ae] master: BLI_task: nano-optimizations to BLI_task_parallel_range feature.

Tue May 10 18:03:36 CEST 2016

Commit: 98123ae91680289255f5fa6cf6ae0ff6dcba251b
Author: Bastien Montagne
Date:   Tue May 10 17:49:27 2016 +0200
Branches: master
https://developer.blender.org/rB98123ae91680289255f5fa6cf6ae0ff6dcba251b

BLI_task: nano-optimizations to BLI_task_parallel_range feature.

This commit makes use of the new taskpool feature (instead of allocating its own tasks),
and removes the spinlock used to generate chunks (using atomic ops instead).

In the best cases (dynamically scheduled loop with a light processing func callback), we
get a few percent of speedup; in most cases there is no noticeable improvement.

===================================================================

M	source/blender/blenlib/intern/task.c

===================================================================

diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index b47931c..bebf331 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -777,23 +777,29 @@ typedef struct ParallelRangeState {
 
 	int iter;
 	int chunk_size;
-	SpinLock lock;
 } ParallelRangeState;
 
 BLI_INLINE bool parallel_range_next_iter_get(
         ParallelRangeState * __restrict state,
         int * __restrict iter, int * __restrict count)
 {
-	bool result = false;
-	BLI_spin_lock(&state->lock);
-	if (state->iter < state->stop) {
-		*count = min_ii(state->chunk_size, state->stop - state->iter);
-		*iter = state->iter;
-		state->iter += *count;
-		result = true;
+	uint32_t n, olditer, previter, newiter;
+
+	if (state->iter >= state->stop) {
+		return false;
 	}
-	BLI_spin_unlock(&state->lock);
-	return result;
+
+	do {
+		olditer = state->iter;
+		n = min_ii(state->chunk_size, state->stop - state->iter);
+		newiter = olditer + n;
+		previter = atomic_cas_uint32((uint32_t *)&state->iter, olditer, newiter);
+	} while (UNLIKELY(previter != olditer));
+
+	*iter = previter;
+	*count = n;
+
+	return (n != 0);
 }
 
 static void parallel_range_func(
@@ -898,7 +904,6 @@ static void task_parallel_range_ex(
 	 */
 	num_tasks = num_threads * 2;
 
-	BLI_spin_init(&state.lock);
 	state.start = start;
 	state.stop = stop;
 	state.userdata = userdata;
@@ -917,16 +922,15 @@ static void task_parallel_range_ex(
 	num_tasks = min_ii(num_tasks, (stop - start) / state.chunk_size);
 
 	for (i = 0; i < num_tasks; i++) {
-		BLI_task_pool_push(task_pool,
-		                   parallel_range_func,
-		                   NULL, false,
-		                   TASK_PRIORITY_HIGH);
+		/* Use this pool's pre-allocated tasks. */
+		BLI_task_pool_push_from_thread(task_pool,
+		                               parallel_range_func,
+		                               NULL, false,
+		                               TASK_PRIORITY_HIGH, 0);
 	}
 
 	BLI_task_pool_work_and_wait(task_pool);
 	BLI_task_pool_free(task_pool);
-
-	BLI_spin_end(&state.lock);
 }
 
 /**