[Bf-blender-cvs] [ef25a1e] compositor-2016: DynamicPaint: massive refactor, replace all OpenMP usage by BLI_task and other optimizations.

Wed Jun 8 21:48:47 CEST 2016

Commit: ef25a1ed871e96397ea8e4c856ed1236d5051404
Author: Bastien Montagne
Date:   Sat May 21 15:22:04 2016 +0200
Branches: compositor-2016
https://developer.blender.org/rBef25a1ed871e96397ea8e4c856ed1236d5051404

DynamicPaint: massive refactor, replace all OpenMP usage by BLI_task and other optimizations.

This commit makes DynamicPaint modifier evaluation (during playback) a few percent quicker.
However, it makes DynamicPaint's 'image sequence' baking about 33% quicker (from 119 to 77 seconds
in my own heavy test), partly due to the switch to BLI_task itself (about 20%), and partly due to
other optimizations (the remaining ~13%).

As usual, I did a lot of tests here to ensure nothing is broken, but a lot more user testing would definitely
be welcome too! ;)

Note that some quite meaningless OpenMP for-loops have been removed (parallelizing thousands of vector copies does
make them two or three times quicker, but the few hundred microseconds gained do not make any difference
in a process that takes hundreds of milliseconds).

Also, this code could still use a lot more cleanup (naming etc.), the way it (tries to) handle malloc faults
is also totally flaky and makes the code horribly verbose and convoluted in some places - without actually
catching all possible faults (memarena could make it easier to handle here), etc.

===================================================================

M	source/blender/blenkernel/intern/dynamicpaint.c

===================================================================

diff --git a/source/blender/blenkernel/intern/dynamicpaint.c b/source/blender/blenkernel/intern/dynamicpaint.c
index 8af9750..2dc0388 100644
--- a/source/blender/blenkernel/intern/dynamicpaint.c
+++ b/source/blender/blenkernel/intern/dynamicpaint.c
@@ -65,6 +65,7 @@
 #include "BKE_image.h"
 #include "BKE_main.h"
 #include "BKE_material.h"
+#include "BKE_mesh_mapping.h"
 #include "BKE_modifier.h"
 #include "BKE_object.h"
 #include "BKE_particle.h"
@@ -141,8 +142,8 @@ typedef struct Bounds2D {
 } Bounds2D;
 
 typedef struct Bounds3D {
-	int valid;
 	float min[3], max[3];
+	bool valid;
 } Bounds3D;
 
 typedef struct VolumeGrid {
@@ -153,6 +154,8 @@ typedef struct VolumeGrid {
 	int *s_pos;  /* (x*y*z) t_index begin id */
 	int *s_num;  /* (x*y*z) number of t_index points */
 	int *t_index;  /* actual surface point index, access: (s_pos + s_num) */
+
+	int *temp_t_index;
 } VolumeGrid;
 
 typedef struct Vec3f {
@@ -178,6 +181,7 @@ typedef struct PaintBakeData {
 	int *s_num;  /* num of realCoord samples */
 	Vec3f *realCoord;  /* current pixel center world-space coordinates for each sample ordered as (s_pos + s_num) */
 	Bounds3D mesh_bounds;
+	float dim[3];
 
 	/* adjacency info */
 	BakeAdjPoint *bNeighs;  /* current global neighbor distances and directions, if required */
@@ -193,7 +197,6 @@ typedef struct PaintBakeData {
 	MVert *prev_verts;      /* copy of previous frame vertices. used to observe surface movement */
 	float prev_obmat[4][4]; /* previous frame object matrix */
 	int clear;              /* flag to check if surface was cleared/reset -> have to redo velocity etc. */
-
 } PaintBakeData;
 
 /* UV Image sequence format point	*/
@@ -422,7 +425,8 @@ static int surface_totalSamples(DynamicPaintSurface *surface)
 	return surface->data->total_points;
 }
 
-static void blendColors(const float t_color[3], float t_alpha, const float s_color[3], float s_alpha, float result[4])
+static void blendColors(
+        const float t_color[3], const float t_alpha, const float s_color[3], const float s_alpha, float result[4])
 {
 	/* Same thing as BLI's blend_color_mix_float(), but for non-premultiplied alpha. */
 	int i;
@@ -576,7 +580,7 @@ static void boundInsert(Bounds3D *b, float point[3])
 	if (!b->valid) {
 		copy_v3_v3(b->min, point);
 		copy_v3_v3(b->max, point);
-		b->valid = 1;
+		b->valid = true;
 		return;
 	}
 
@@ -603,27 +607,92 @@ static void freeGrid(PaintSurfaceData *data)
 	bData->grid = NULL;
 }
 
+static void grid_bound_insert_cb_ex(void *userdata, void *userdata_chunk, const int i, const int UNUSED(thread_id))
+{
+	PaintBakeData *bData = userdata;
+
+	Bounds3D *grid_bound = userdata_chunk;
+
+	boundInsert(grid_bound, bData->realCoord[bData->s_pos[i]].v);
+}
+
+static void grid_bound_insert_finalize(void *userdata, void *userdata_chunk)
+{
+	PaintBakeData *bData = userdata;
+	VolumeGrid *grid = bData->grid;
+
+	Bounds3D *grid_bound = userdata_chunk;
+
+	boundInsert(&grid->grid_bounds, grid_bound->min);
+	boundInsert(&grid->grid_bounds, grid_bound->max);
+}
+
+static void grid_cell_points_cb_ex(void *userdata, void *userdata_chunk, const int i, const int UNUSED(thread_id))
+{
+	PaintBakeData *bData = userdata;
+	VolumeGrid *grid = bData->grid;
+	int *temp_t_index = grid->temp_t_index;
+	int *s_num = userdata_chunk;
+
+	int co[3];
+
+	for (int j = 3; j--;) {
+		co[j] = (int)floorf((bData->realCoord[bData->s_pos[i]].v[j] - grid->grid_bounds.min[j]) /
+		                    bData->dim[j] * grid->dim[j]);
+		CLAMP(co[j], 0, grid->dim[j] - 1);
+	}
+
+	temp_t_index[i] = co[0] + co[1] * grid->dim[0] + co[2] * grid->dim[0] * grid->dim[1];
+	s_num[temp_t_index[i]]++;
+}
+
+static void grid_cell_points_finalize(void *userdata, void *userdata_chunk)
+{
+	PaintBakeData *bData = userdata;
+	VolumeGrid *grid = bData->grid;
+	const int grid_cells = grid->dim[0] * grid->dim[1] * grid->dim[2];
+
+	int *s_num = userdata_chunk;
+
+	/* calculate grid indexes */
+	for (int i = 0; i < grid_cells; i++) {
+		grid->s_num[i] += s_num[i];
+	}
+}
+
+static void grid_cell_bounds_cb(void *userdata, const int x)
+{
+	PaintBakeData *bData = userdata;
+	VolumeGrid *grid = bData->grid;
+	float *dim = bData->dim;
+	int *grid_dim = grid->dim;
+
+	for (int y = 0; y < grid_dim[1]; y++) {
+		for (int z = 0; z < grid_dim[2]; z++) {
+			const int b_index = x + y * grid_dim[0] + z * grid_dim[0] * grid_dim[1];
+			/* set bounds */
+			for (int j = 3; j--;) {
+				const int s = (j == 0) ? x : ((j == 1) ? y : z);
+				grid->bounds[b_index].min[j] = grid->grid_bounds.min[j] + dim[j] / grid_dim[j] * s;
+				grid->bounds[b_index].max[j] = grid->grid_bounds.min[j] + dim[j] / grid_dim[j] * (s + 1);
+			}
+			grid->bounds[b_index].valid = true;
+		}
+	}
+}
+
 static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
 {
 	PaintSurfaceData *sData = surface->data;
 	PaintBakeData *bData = sData->bData;
-	Bounds3D *grid_bounds;
 	VolumeGrid *grid;
 	int grid_cells, axis = 3;
 	int *temp_t_index = NULL;
 	int *temp_s_num = NULL;
 
-#ifdef _OPENMP
-	int num_of_threads = omp_get_max_threads();
-#else
-	int num_of_threads = 1;
-#endif
-
 	if (bData->grid)
 		freeGrid(sData);
 
-	/* allocate separate bounds for each thread */
-	grid_bounds = MEM_callocN(sizeof(Bounds3D) * num_of_threads, "Grid Bounds");
 	bData->grid = MEM_callocN(sizeof(VolumeGrid), "Surface Grid");
 	grid = bData->grid;
 
@@ -634,27 +703,16 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
 		float min_dim;
 
 		/* calculate canvas dimensions */
-#pragma omp parallel for schedule(static)
-		for (i = 0; i < sData->total_points; i++) {
-#ifdef _OPENMP
-			int id = omp_get_thread_num();
-			boundInsert(&grid_bounds[id], (bData->realCoord[bData->s_pos[i]].v));
-#else
-			boundInsert(&grid_bounds[0], (bData->realCoord[bData->s_pos[i]].v));
-#endif
-		}
-
-		/* get final dimensions */
-		for (i = 0; i < num_of_threads; i++) {
-			boundInsert(&grid->grid_bounds, grid_bounds[i].min);
-			boundInsert(&grid->grid_bounds, grid_bounds[i].max);
-		}
-
-		MEM_freeN(grid_bounds);
+		/* Important to init correctly our ref grid_bound... */
+		boundInsert(&grid->grid_bounds, bData->realCoord[bData->s_pos[0]].v);
+		BLI_task_parallel_range_finalize(
+		            0, sData->total_points, bData, &grid->grid_bounds, sizeof(grid->grid_bounds),
+		            grid_bound_insert_cb_ex, grid_bound_insert_finalize, sData->total_points > 1000, false);
 
 		/* get dimensions */
 		sub_v3_v3v3(dim, grid->grid_bounds.max, grid->grid_bounds.min);
 		copy_v3_v3(td, dim);
+		copy_v3_v3(bData->dim, dim);
 		min_dim = max_fff(td[0], td[1], td[2]) / 1000.f;
 
 		/* deactivate zero axises */
@@ -687,10 +745,11 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
 		/* allocate memory for grids */
 		grid->bounds = MEM_callocN(sizeof(Bounds3D) * grid_cells, "Surface Grid Bounds");
 		grid->s_pos = MEM_callocN(sizeof(int) * grid_cells, "Surface Grid Position");
-		grid->s_num = MEM_callocN(sizeof(int) * grid_cells * num_of_threads, "Surface Grid Points");
+
+		grid->s_num = MEM_callocN(sizeof(int) * grid_cells, "Surface Grid Points");
 		temp_s_num = MEM_callocN(sizeof(int) * grid_cells, "Temp Surface Grid Points");
 		grid->t_index = MEM_callocN(sizeof(int) * sData->total_points, "Surface Grid Target Ids");
-		temp_t_index = MEM_callocN(sizeof(int) * sData->total_points, "Temp Surface Grid Target Ids");
+		grid->temp_t_index = temp_t_index = MEM_callocN(sizeof(int) * sData->total_points, "Temp Surface Grid Target Ids");
 
 		/* in case of an allocation failure abort here */
 		if (!grid->bounds || !grid->s_pos || !grid->s_num || !grid->t_index || !temp_s_num || !temp_t_index)
@@ -698,33 +757,12 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
 
 		if (!error) {
 			/* calculate number of points withing each cell */
-#pragma omp parallel for schedule(static)
-			for (i = 0; i < sData->total_points; i++) {
-				int co[3], j;
-				for (j = 0; j < 3; j++) {
-					co[j] = (int)floor((bData->realCoord[bData->s_pos[i]].v[j] - grid->grid_bounds.min[j]) / dim[j] * grid->dim[j]);
-					CLAMP(co[j], 0, grid->dim[j] - 1);
-				}
-
-				temp_t_index[i] = co[0] + co[1] * grid->dim[0] + co[2] * grid->dim[0] * grid->dim[1];
-#ifdef _OPENMP
-				grid->s_num[temp_t_index[i] + omp_get_thread_num() * grid_cells]++;
-#else
-				grid->s_num[temp_t_index[i]]++;
-#endif
-			}
-
-			/* for first cell only calc s_num */
-			for (i = 1; i < num_of_threads; i++) {
-				grid->s_num[0] += grid->s_num[i * grid_cells];
-			}
+			BLI_task_parallel_range_finalize(
+			            0, sData->total_points, bData, grid->s_num, sizeof(*grid->s_num) * grid_cells,
+			            grid_cell_points_cb_ex, grid_cell_points_finalize, sData->total_points > 1000, false);
 
-			/* calculate grid indexes */
+			/* calculate grid indexes (not needed for first cell, which is zero). */
 			for (i = 1; i < grid_cells; i++) {
-				int id;
-				for (id = 1; id < num_of_threads; id++) {
-					grid->s_num[i] += grid->s_num[i + id * grid_cells];
-				}
 				grid->s_pos[i] = grid->s_pos[i - 1] + grid->s_num[i - 1];
 			}
 
@@ -737,35 +775,14 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
 			}
 
 			/* calculate cell bounds */
-			{
-				int x;
-#pragma omp parallel for schedule(static)
-				for (x = 0; x < grid->dim[0]; x++) {
-					int y;
-					for (y = 0; y < grid->dim[1]; y++) {
-						int z;
-						for (z = 0; z < grid->dim[2]; z++) {
-							int j, b_index = x + y * grid->dim[0] + z * grid->dim[0] * grid->dim[1];
-							/* set bounds */
-							for (j = 0; j < 3; j++) {
-								int s = (j == 0) ? x : ((j == 1) ? y : z);
-								grid->bounds[b_index].min[j] = grid->grid_bounds.min[j] + dim[j] / grid->dim[j] * s;
-								grid->bounds[b_index].max[j] = grid->grid_bounds.min[j] + dim[j] / grid->dim[j] * (s + 1);
-							}
-							grid->bounds[b_index].valid = 1;
-						}
-					}
-				}
-			}
+			BLI_task_parallel_range(0, grid->dim[0], bData, grid_cell_bounds_cb, grid_cells > 1000);
 		}
 
 		if (temp_s_num)
 			MEM_freeN(temp_s_num);
 		if (temp_t_index)
 			MEM_freeN(temp_t_index);
-
-		/* free per thread s_num values */
-		grid->s_num = MEM_reallocN(grid->s_num, sizeof(int) * grid_cells);
+		grid->temp_t_index = 

@@ Diff output truncated at 10240 characters. @@