[Bf-blender-cvs] [8ee96b1] temp_custom_loop_normals: Better implementation of threading for split lnor code.

Fri Aug 8 19:18:59 CEST 2014

Commit: 8ee96b142b09017a9f27f94b85239b985853b996
Author: Bastien Montagne
Date:   Fri Aug 8 18:09:30 2014 +0200
Branches: temp_custom_loop_normals
https://developer.blender.org/rB8ee96b142b09017a9f27f94b85239b985853b996

Better implementation of threading for split lnor code.

Not fully satisfied with it yet, though: this adds some complexity for not much gain in the end
(about 33% faster with 100000 loops and more, on an 8-core machine - with nearly no gain beyond 4
workers anyway)...

I have the feeling that the threadable parts of this function are chunks that are too small and need too much data
to be really efficient (we can't parallelize the main loop, since order is crucial there).

===================================================================

M	source/blender/blenkernel/intern/mesh_evaluate.c

===================================================================

diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c
index c8acead..82e354e 100644
--- a/source/blender/blenkernel/intern/mesh_evaluate.c
+++ b/source/blender/blenkernel/intern/mesh_evaluate.c
@@ -39,6 +39,7 @@
 
 #include "BLI_utildefines.h"
 #include "BLI_memarena.h"
+#include "BLI_mempool.h"
 #include "BLI_math.h"
 #include "BLI_edgehash.h"
 #include "BLI_bitmap.h"
@@ -486,15 +487,32 @@ void BKE_lnor_space_custom_normal_to_data(MLoopNorSpace *lnor_space, const float
 	}
 }
 
+typedef struct LoopSplitTaskData {
+	/* Specific to each instance (each task). */
+	MLoopNorSpace *lnor_space;  /* We have to create those outside of tasks, since afaik memarena is not threadsafe. */
+	float (*lnor)[3];
+	const MLoop *ml_curr;
+	const MLoop *ml_prev;
+	int ml_curr_index;
+	int ml_prev_index;
+	const int *e2l_prev;  /* Also used a flag to switch between single or fan process! */
+	int mp_index;
+
+	/* This one is special, it's owned and managed by worker tasks, avoid to have to create it for each fan! */
+	BLI_Stack *edge_vectors;
+
+	char c;
+} LoopSplitTaskData;
+
 typedef struct LoopSplitTaskDataCommon {
-	/* Common to all instances, read/write.
+	/* Read/write.
 	 * Note we do not need to protect it, though, since two different tasks will *always* affect different
 	 * elements in the arrays. */
 	MLoopsNorSpaces *lnors_spaces;
 	float (*loopnors)[3];
 	short (*clnors_data)[2];
 
-	/* Common to all instances, and read-only. */
+	/* Read-only. */
 	const MVert *mverts;
 	const MEdge *medges;
 	const MLoop *mloops;
@@ -502,32 +520,69 @@ typedef struct LoopSplitTaskDataCommon {
 	const int (*edge_to_loops)[2];
 	const int *loop_to_poly;
 	const float (*polynors)[3];
+
+	/* ***** Workers communication. ***** */
+	/* Spinlock-protected area. */
+	SpinLock lock;
+	BLI_Stack *tasks;
+	/* End of spinlock-protected area. */
+
+	bool finished;
 } LoopSplitTaskDataCommon;
 
-typedef struct LoopSplitTaskData {
-	/* Specific to each instance (each task). */
-	MLoopNorSpace *lnor_space;  /* We have to create those outside of tasks, since afaik memarena is not threadsafe. */
-	float (*lnor)[3];
-	const MLoop *ml_curr;
-	const MLoop *ml_prev;
-	int ml_curr_index;
-	int ml_prev_index;
-	const int *e2l_prev;
-	int mp_index;
+/* Main thread only! */
+static void loop_split_task_init(LoopSplitTaskDataCommon *common_data)
+{
+	common_data->tasks = BLI_stack_new(sizeof(LoopSplitTaskData), __func__);
+	BLI_spin_init(&common_data->lock);
+}
 
-	LoopSplitTaskDataCommon *common;
-} LoopSplitTaskData;
+/* Main thread only! */
+static void loop_split_task_clear(LoopSplitTaskDataCommon *common_data)
+{
+	BLI_assert(common_data->finished);
+
+	BLI_spin_end(&common_data->lock);
+	BLI_stack_free(common_data->tasks);
+}
+
+static void loop_split_task_data_push(LoopSplitTaskDataCommon *common_data, LoopSplitTaskData *data)
+{
+	BLI_spin_lock(&common_data->lock);
+	BLI_stack_push(common_data->tasks, data);
+	BLI_spin_unlock(&common_data->lock);
+}
 
-static void exec_split_loop_nor_single(TaskPool *UNUSED(pool), void *taskdata, int UNUSED(threadid))
+static bool loop_split_task_data_pop(LoopSplitTaskDataCommon *common_data, LoopSplitTaskData *r_data)
 {
-	LoopSplitTaskData *data = (LoopSplitTaskData *)taskdata;
+	bool ret;
+	BLI_spin_lock(&common_data->lock);
+	while (BLI_stack_is_empty(common_data->tasks) && !common_data->finished) {
+		BLI_spin_unlock(&common_data->lock);
+		PIL_sleep_ms(1);  /* Investigate other solutions, might become a real issue on win32 :/ */
+		BLI_spin_lock(&common_data->lock);
+	}
+	ret = !common_data->finished || !BLI_stack_is_empty(common_data->tasks);
+	if (ret) {
+		BLI_stack_pop(common_data->tasks, r_data);
+	}
+	BLI_spin_unlock(&common_data->lock);
+	return ret;
+}
 
-	MLoopsNorSpaces *lnors_spaces = data->common->lnors_spaces;
-	short (*clnors_data)[2] = data->common->clnors_data;
+#define INDEX_UNSET INT_MIN
+#define INDEX_INVALID -1
+/* See comment about edge_to_loops below. */
+#define IS_EDGE_SHARP(_e2l) (ELEM((_e2l)[1], INDEX_UNSET, INDEX_INVALID))
 
-	const MVert *mverts = data->common->mverts;
-	const MEdge *medges = data->common->medges;
-	const float (*polynors)[3] = data->common->polynors;
+static void split_loop_nor_single_do(LoopSplitTaskDataCommon *common_data, LoopSplitTaskData *data)
+{
+	MLoopsNorSpaces *lnors_spaces = common_data->lnors_spaces;
+	short (*clnors_data)[2] = common_data->clnors_data;
+
+	const MVert *mverts = common_data->mverts;
+	const MEdge *medges = common_data->medges;
+	const float (*polynors)[3] = common_data->polynors;
 
 	MLoopNorSpace *lnor_space = data->lnor_space;
 	float (*lnor)[3] = data->lnor;
@@ -573,26 +628,19 @@ static void exec_split_loop_nor_single(TaskPool *UNUSED(pool), void *taskdata, i
 	}
 }
 
-#define INDEX_UNSET INT_MIN
-#define INDEX_INVALID -1
-/* See comment about edge_to_loops below. */
-#define IS_EDGE_SHARP(_e2l) (ELEM((_e2l)[1], INDEX_UNSET, INDEX_INVALID))
-
-static void exec_split_loop_nor_fan(TaskPool *UNUSED(pool), void *taskdata, int UNUSED(threadid))
+static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSplitTaskData *data)
 {
-	LoopSplitTaskData *data = (LoopSplitTaskData *)taskdata;
-
-	MLoopsNorSpaces *lnors_spaces = data->common->lnors_spaces;
-	float (*loopnors)[3] = data->common->loopnors;
-	short (*clnors_data)[2] = data->common->clnors_data;
-
-	const MVert *mverts = data->common->mverts;
-	const MEdge *medges = data->common->medges;
-	const MLoop *mloops = data->common->mloops;
-	const MPoly *mpolys = data->common->mpolys;
-	const int (*edge_to_loops)[2] = data->common->edge_to_loops;
-	const int *loop_to_poly = data->common->loop_to_poly;
-	const float (*polynors)[3] = data->common->polynors;
+	MLoopsNorSpaces *lnors_spaces = common_data->lnors_spaces;
+	float (*loopnors)[3] = common_data->loopnors;
+	short (*clnors_data)[2] = common_data->clnors_data;
+
+	const MVert *mverts = common_data->mverts;
+	const MEdge *medges = common_data->medges;
+	const MLoop *mloops = common_data->mloops;
+	const MPoly *mpolys = common_data->mpolys;
+	const int (*edge_to_loops)[2] = common_data->edge_to_loops;
+	const int *loop_to_poly = common_data->loop_to_poly;
+	const float (*polynors)[3] = common_data->polynors;
 
 	MLoopNorSpace *lnor_space = data->lnor_space;
 #if 0  /* Not needed for 'fan' loops. */
@@ -605,6 +653,8 @@ static void exec_split_loop_nor_fan(TaskPool *UNUSED(pool), void *taskdata, int
 	const int mp_index = data->mp_index;
 	const int *e2l_prev = data->e2l_prev;
 
+	BLI_Stack *edge_vectors = data->edge_vectors;
+
 	/* Gah... We have to fan around current vertex, until we find the other non-smooth edge,
 	 * and accumulate face normals into the vertex!
 	 * Note in case this vertex has only one sharp edges, this is a waste because the normal is the same as
@@ -633,8 +683,6 @@ static void exec_split_loop_nor_fan(TaskPool *UNUSED(pool), void *taskdata, int
 	BLI_SMALLSTACK_DECLARE(normal, float *);
 	/* Temp clnors stack. */
 	BLI_SMALLSTACK_DECLARE(clnors, short *);
-	/* Temp edge vectors stack, only used when computing lnor spaces. */
-	BLI_Stack *edge_vectors = lnors_spaces ? BLI_stack_new(sizeof(float[3]), __func__) : NULL;
 
 	e2lfan_curr = e2l_prev;
 	mlfan_curr = ml_prev;
@@ -646,8 +694,6 @@ static void exec_split_loop_nor_fan(TaskPool *UNUSED(pool), void *taskdata, int
 	BLI_assert(mlfan_vert_index >= 0);
 	BLI_assert(mpfan_curr_index >= 0);
 
-	BLI_assert((edge_vectors == NULL) || BLI_stack_is_empty(edge_vectors));
-
 	/* Only need to compute previous edge's vector once, then we can just reuse old current one! */
 	{
 		const MVert *mv_2 = (me_org->v1 == mv_pivot_index) ? &mverts[me_org->v2] : &mverts[me_org->v1];
@@ -810,8 +856,29 @@ static void exec_split_loop_nor_fan(TaskPool *UNUSED(pool), void *taskdata, int
 		}
 		/* Extra bonus: since smallstack is local to this func, no more need to empty it at all cost! */
 	}
+}
 
-	if (lnors_spaces) {
+static void loop_split_worker(TaskPool *UNUSED(pool), void *taskdata, int UNUSED(threadid))
+{
+	LoopSplitTaskDataCommon *common_data = (LoopSplitTaskDataCommon *)taskdata;
+	LoopSplitTaskData data;
+
+	/* Temp edge vectors stack, only used when computing lnor spaces. */
+	BLI_Stack *edge_vectors = common_data->lnors_spaces ? BLI_stack_new(sizeof(float[3]), __func__) : NULL;
+
+	while (loop_split_task_data_pop(common_data, &data)) {
+		if (data.e2l_prev) {
+			BLI_assert((edge_vectors == NULL) || BLI_stack_is_empty(edge_vectors));
+			data.edge_vectors = edge_vectors;
+			split_loop_nor_fan_do(common_data, &data);
+		}
+		else {
+			/* No need for edge_vectors for 'single' case! */
+			split_loop_nor_single_do(common_data, &data);
+		}
+	}
+
+	if (edge_vectors) {
 		BLI_stack_free(edge_vectors);
 	}
 }
@@ -847,26 +914,23 @@ void BKE_mesh_normals_loop_split(MVert *mverts, const int numVerts, MEdge *medge
 	bool *sharp_verts = NULL;  /* Maybe we could use a BLI_bitmap here? */
 	MLoopsNorSpaces _lnors_spaces = {NULL};
 
-#ifdef USE_THREADS
-	const int totthread = 4;//TASK_SCHEDULER_AUTO_THREADS;
+//#ifdef USE_THREADS
+#if 1
+	const int totthread = TASK_SCHEDULER_AUTO_THREADS;
 #else
-	const int totthread = 4;//TASK_SCHEDULER_SINGLE_THREAD;
+	const int totthread = TASK_SCHEDULER_SINGLE_THREAD;
 #endif
 	TaskScheduler *task_scheduler = NULL;
 	TaskPool *task_pool = NULL;
 	LoopSplitTaskDataCommon common_taskdata = {NULL};
+	LoopSplitTaskData taskdata = {NULL};
+	/* Temp edge vectors stack, only used when computing lnor spaces. */
+	BLI_Stack *edge_vectors = NULL;
 
 #ifdef DEBUG_TIME
 	TIMEIT_START(BKE_mesh_normals_loop_split);
 #endif
 
-	printf("%d\n", totthread);
-
-	if (numLoops > 10000) {
-		task_scheduler = BLI_task_scheduler_create(totthread);
-		task_pool = BLI_task_pool_create(task_scheduler, NULL);
-	}
-
 	if (check_angle) {
 		/* When using custom loop normals, disable the angle feature! */
 		if (clnors_data) {
@@ -886,6 +950,7 @@ void BKE_mesh_normals_loop_split(MVert *mverts, const int numVerts, MEdge *medge
 			BKE_init_loops_normal_spaces(r_lnors_spaces, numLoops);
 		}
 		sharp_verts = MEM_callocN(sizeof(bool) * (size_t)numVerts, __func__);
+		 edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
 	}
 
 	/* This first loo

@@ Diff output truncated at 10240 characters. @@