[Bf-blender-cvs] [f29e0a3] temp_custom_loop_normals: Much better handling of multithreading - at least 80% quicker than previous code, more than two times quicker than without any threading...
Bastien Montagne
noreply at git.blender.org
Sun Aug 17 23:23:22 CEST 2014
Commit: f29e0a3f6db6e7f275aee118e7a11f4ff04e2160
Author: Bastien Montagne
Date: Sun Aug 17 23:07:47 2014 +0200
Branches: temp_custom_loop_normals
https://developer.blender.org/rBf29e0a3f6db6e7f275aee118e7a11f4ff04e2160
Much better handling of multithreading - at least 80% quicker than previous code,
more than two times quicker than without any threading...
Main idea was to make bigger chunks of tasks (currently, 1024 at once), saves a bunch
of mem management (alloc, copy, etc.) and spinlock locking.
===================================================================
M source/blender/blenkernel/intern/mesh_evaluate.c
===================================================================
diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c
index 7853e7c..d33c890 100644
--- a/source/blender/blenkernel/intern/mesh_evaluate.c
+++ b/source/blender/blenkernel/intern/mesh_evaluate.c
@@ -487,6 +487,8 @@ void BKE_lnor_space_custom_normal_to_data(MLoopNorSpace *lnor_space, const float
}
}
+#define LOOP_SPLIT_TASK_BLOCK_SIZE 1024
+
typedef struct LoopSplitTaskData {
/* Specific to each instance (each task). */
MLoopNorSpace *lnor_space; /* We have to create those outside of tasks, since afaik memarena is not threadsafe. */
@@ -501,7 +503,7 @@ typedef struct LoopSplitTaskData {
/* This one is special, it's owned and managed by worker tasks, avoid to have to create it for each fan! */
BLI_Stack *edge_vectors;
- char c;
+ char pad_c;
} LoopSplitTaskData;
typedef struct LoopSplitTaskDataCommon {
@@ -536,7 +538,8 @@ typedef struct LoopSplitTaskDataCommon {
/* Main thread only! */
static void loop_split_task_init(LoopSplitTaskDataCommon *common_data)
{
- common_data->tasks = BLI_stack_new(sizeof(LoopSplitTaskData), __func__);
+ common_data->tasks = BLI_stack_new_ex(sizeof(LoopSplitTaskData) * LOOP_SPLIT_TASK_BLOCK_SIZE, __func__,
+ sizeof(LoopSplitTaskData) * LOOP_SPLIT_TASK_BLOCK_SIZE * 32);
BLI_spin_init(&common_data->lock);
}
@@ -861,28 +864,41 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
}
}
-static void loop_split_worker(TaskPool *UNUSED(pool), void *taskdata, int threadid)
+static void loop_split_worker(TaskPool *UNUSED(pool), void *taskdata, int UNUSED(threadid))
{
LoopSplitTaskDataCommon *common_data = (LoopSplitTaskDataCommon *)taskdata;
- LoopSplitTaskData data;
+ LoopSplitTaskData data_buff[LOOP_SPLIT_TASK_BLOCK_SIZE];
/* Temp edge vectors stack, only used when computing lnor spaces. */
BLI_Stack *edge_vectors = common_data->lnors_spaces ? BLI_stack_new(sizeof(float[3]), __func__) : NULL;
+ int count = 0;
+
#ifdef DEBUG_TIME
TIMEIT_START(loop_split_worker);
#endif
- while (loop_split_task_data_pop(common_data, &data)) {
- if (data.e2l_prev) {
- BLI_assert((edge_vectors == NULL) || BLI_stack_is_empty(edge_vectors));
- data.edge_vectors = edge_vectors;
- split_loop_nor_fan_do(common_data, &data);
- }
- else {
- /* No need for edge_vectors for 'single' case! */
- split_loop_nor_single_do(common_data, &data);
+ while (loop_split_task_data_pop(common_data, data_buff)) {
+ LoopSplitTaskData *data = data_buff;
+ int i;
+
+ for (i = 0; i < LOOP_SPLIT_TASK_BLOCK_SIZE; i++, data++) {
+ /* A NULL ml_curr is used to tag ended data! */
+ if (data->ml_curr == NULL) {
+ break;
+ }
+ else if (data->e2l_prev) {
+ BLI_assert((edge_vectors == NULL) || BLI_stack_is_empty(edge_vectors));
+ data->edge_vectors = edge_vectors;
+ split_loop_nor_fan_do(common_data, data);
+ }
+ else {
+ /* No need for edge_vectors for 'single' case! */
+ split_loop_nor_single_do(common_data, data);
+ }
}
+
+ count += i;
}
if (edge_vectors) {
@@ -890,15 +906,15 @@ static void loop_split_worker(TaskPool *UNUSED(pool), void *taskdata, int thread
}
#ifdef DEBUG_TIME
- printf("%d - ", threadid);
TIMEIT_END(loop_split_worker);
#endif
}
-static void loop_split_generator(TaskPool *UNUSED(pool), void *taskdata, int threadid)
+static void loop_split_generator(TaskPool *UNUSED(pool), void *taskdata, int UNUSED(threadid))
{
LoopSplitTaskDataCommon *common_data = (LoopSplitTaskDataCommon *)taskdata;
- LoopSplitTaskData data;
+ LoopSplitTaskData data_buff[LOOP_SPLIT_TASK_BLOCK_SIZE];
+ int data_idx = 0;
MLoopsNorSpaces *lnors_spaces = common_data->lnors_spaces;
BLI_bitmap *sharp_verts = common_data->sharp_verts;
@@ -916,6 +932,8 @@ static void loop_split_generator(TaskPool *UNUSED(pool), void *taskdata, int thr
TIMEIT_START(loop_split_generator);
#endif
+ memset(data_buff, 0, sizeof(data_buff));
+
/* We now know edges that can be smoothed (with their vector, and their two loops), and edges that will be hard!
* Now, time to generate the normals.
*/
@@ -934,6 +952,8 @@ static void loop_split_generator(TaskPool *UNUSED(pool), void *taskdata, int thr
const int *e2l_curr = edge_to_loops[ml_curr->e];
const int *e2l_prev = edge_to_loops[ml_prev->e];
+ LoopSplitTaskData *data = &data_buff[data_idx % LOOP_SPLIT_TASK_BLOCK_SIZE];
+
if (!IS_EDGE_SHARP(e2l_curr) && (!lnors_spaces || BLI_BITMAP_TEST_BOOL(sharp_verts, ml_curr->v))) {
/* A smooth edge, and we are not generating lnor_spaces, or the related vertex is sharp.
* We skip it because it is either:
@@ -944,46 +964,52 @@ static void loop_split_generator(TaskPool *UNUSED(pool), void *taskdata, int thr
*/
/* printf("Skipping loop %d / edge %d / vert %d(%d)\n", ml_curr_index, ml_curr->e, ml_curr->v, sharp_verts[ml_curr->v]); */
}
- else if (IS_EDGE_SHARP(e2l_curr) && IS_EDGE_SHARP(e2l_prev)) {
- data.lnor = lnors;
- data.ml_curr = ml_curr;
- data.ml_prev = ml_prev;
- data.ml_curr_index = ml_curr_index;
+ else {
+ if (IS_EDGE_SHARP(e2l_curr) && IS_EDGE_SHARP(e2l_prev)) {
+ data->lnor = lnors;
+ data->ml_curr = ml_curr;
+ data->ml_prev = ml_prev;
+ data->ml_curr_index = ml_curr_index;
#if 0 /* Not needed for 'single' loop. */
- data.ml_prev_index = ml_prev_index;
+ data->ml_prev_index = ml_prev_index;
+ data->e2l_prev = NULL; /* Tag as 'single' task. */
#endif
- data.e2l_prev = NULL; /* Tag as 'single' task. */
- data.mp_index = mp_index;
- if (lnors_spaces) {
- data.lnor_space = BKE_lnor_space_create(lnors_spaces);
+ data->mp_index = mp_index;
+ if (lnors_spaces) {
+ data->lnor_space = BKE_lnor_space_create(lnors_spaces);
+ }
}
- loop_split_task_data_push(common_data, &data);
- }
- /* We *do not need* to check/tag loops as already computed!
- * Due to the fact a loop only links to one of its two edges, a same fan *will never be walked more than
- * once!*
- * Since we consider edges having neighbor polys with inverted (flipped) normals as sharp, we are sure that
- * no fan will be skipped, even only considering the case (sharp curr_edge, smooth prev_edge), and not the
- * alternative (smooth curr_edge, sharp prev_edge).
- * All this due/thanks to link between normals and loop ordering (i.e. winding).
- */
- else {
+ /* We *do not need* to check/tag loops as already computed!
+ * Due to the fact a loop only links to one of its two edges, a same fan *will never be walked
+ * more than once!*
+ * Since we consider edges having neighbor polys with inverted (flipped) normals as sharp, we are sure
+ * that no fan will be skipped, even only considering the case (sharp curr_edge, smooth prev_edge),
+ * and not the alternative (smooth curr_edge, sharp prev_edge).
+ * All this due/thanks to link between normals and loop ordering (i.e. winding).
+ */
+ else {
#if 0 /* Not needed for 'fan' loops. */
- data.lnor = lnors;
+ data->lnor = lnors;
#endif
- data.ml_curr = ml_curr;
- data.ml_prev = ml_prev;
- data.ml_curr_index = ml_curr_index;
- data.ml_prev_index = ml_prev_index;
- data.e2l_prev = e2l_prev; /* Also tag as 'fan' task. */
- data.mp_index = mp_index;
- if (lnors_spaces) {
- data.lnor_space = BKE_lnor_space_create(lnors_spaces);
- /* Tag related vertex as sharp, to avoid fanning around it again (in case it was a smooth one).
- * This *has* to be done outside of workers tasks! */
- BLI_BITMAP_ENABLE(sharp_verts, ml_curr->v);
+ data->ml_curr = ml_curr;
+ data->ml_prev = ml_prev;
+ data->ml_curr_index = ml_curr_index;
+ data->ml_prev_index = ml_prev_index;
+ data->e2l_prev = e2l_prev; /* Also tag as 'fan' task. */
+ data->mp_index = mp_index;
+ if (lnors_spaces) {
+ data->lnor_space = BKE_lnor_space_create(lnors_spaces);
+ /* Tag related vertex as sharp, to avoid fanning around it again (in case it was a smooth one).
+ * This *has* to be done outside of workers tasks! */
+ BLI_BITMAP_ENABLE(sharp_verts, ml_curr->v);
+ }
+ }
+
+ data_idx++;
+ if (!(data_idx % LOOP_SPLIT_TASK_BLOCK_SIZE)) {
+ loop_split_task_data_push(common_data, data_buff);
+ memset(data_buff, 0, sizeof(data_buff));
}
- loop_split_task_data_push(common_data, &data);
}
ml_prev = ml_curr;
@@ -991,12 +1017,16 @@ static void loop_split_generator(TaskPool *UNUSED(pool), void *taskdata, int thr
}
}
+ /* Last block of data, most likely not exactly fully filled, so we have to tag first invalid item. */
+ if (data_idx % LOOP_SPLIT_TASK_BLOCK_SIZE) {
+ loop_split_task_data_push(common_data, data_buff);
+ }
+
+ common_data->finished = true;
+
#ifdef DEBUG_TIME
- printf("%d - ", threadid);
TIMEIT_END(loop_split_generator);
#endif
-
- common_data->finished = true;
}
/**
@@ -1031,15 +1061,10 @@ void BKE_mesh_normals_loop_split(MVert *mverts, const int numVerts, MEdge *medge
BLI_bitmap *sharp_verts = NULL;
MLoopsNorSpaces _lnors_spaces = {NULL};
-//#ifdef USE_THREADS
-#if 1
- const int totthread = TASK_SCHEDULER_AUTO_THREADS;
-#else
- const int totthread = TASK_SCHEDULER_SINGLE_THREAD;
-#endif
- TaskScheduler *task_scheduler = NULL;
- TaskPool *task_pool = NULL;
+ TaskScheduler *task_scheduler;
+ TaskPool *task_pool;
LoopSplitTaskDataCommon common_taskdata = {NULL};
+ int nbr_workers;
#ifdef DEBUG_TIME
TIMEIT_START(BKE_mesh_normals_loop_split);
@@ -1151,12 +1176,12 @@ void BKE_mesh_normals_loop_split(MVert *mverts, const int numVerts, MEdge *medge
task_scheduler = BLI_task_scheduler_get();
task_pool = BLI_task_pool_create(task_scheduler, NULL);
- printf("%d (%d)\n", totthread, BLI_task_scheduler_num_threads(task_scheduler));
- BLI_task_pool_push(task_pool, loop_split_generator, &common_taskdata, false, TASK_PRIORITY_HIGH);
- for (i = 0; i < min_ii(6, BLI_task_scheduler_num_threads(task_scheduler)); i++) {
+ nbr_workers = max_ii(1, BLI_task_scheduler_num_threads(task_scheduler));
+ for (i = 0; i < nbr_workers; i++) {
BLI_task_pool_push(task_pool, loop_split_worker, &comm
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list