[Bf-blender-cvs] SVN commit: /data/svn/bf-blender [47783] trunk/blender/source/blender/editors/sculpt_paint/sculpt.c: Decrease frequency of mallocs during multires sculpt smoothing

Tue Jun 12 13:22:11 CEST 2012

Revision: 47783
          http://projects.blender.org/scm/viewvc.php?view=rev&root=bf-blender&revision=47783
Author:   nicholasbishop
Date:     2012-06-12 11:22:10 +0000 (Tue, 12 Jun 2012)
Log Message:
-----------
Decrease frequency of mallocs during multires sculpt smoothing

Patch from Jason Wilkins.

Creates a pool of allocations (one for each OpenMP thread) rather than
allocating every time do_multires_smooth_brush() is called.

Modified Paths:
--------------
    trunk/blender/source/blender/editors/sculpt_paint/sculpt.c

Modified: trunk/blender/source/blender/editors/sculpt_paint/sculpt.c
===================================================================

--- trunk/blender/source/blender/editors/sculpt_paint/sculpt.c	2012-06-12 11:13:53 UTC (rev 47782)
+++ trunk/blender/source/blender/editors/sculpt_paint/sculpt.c	2012-06-12 11:22:10 UTC (rev 47783)
@@ -212,6 +212,11 @@
 	float clip_tolerance[3];
 	float initial_mouse[2];
 
+	/* Pre-allocated temporary storage used during smoothing */
+	int num_threads;
+	float (**tmpgrid_co)[3], (**tmprow_co)[3];
+	float **tmpgrid_mask, **tmprow_mask;
+
 	/* Variants */
 	float radius;
 	float radius_squared;
@@ -1257,6 +1262,7 @@
 	float (*tmpgrid_co)[3], (*tmprow_co)[3];
 	float *tmpgrid_mask, *tmprow_mask;
 	int v1, v2, v3, v4;
+	int thread_num;
 	int *grid_indices, totgrid, gridsize, i, x, y;
 
 	sculpt_brush_test_init(ss, &test);
@@ -1267,17 +1273,15 @@
 	                        NULL, &gridsize, &griddata, &gridadj);
 	BLI_pbvh_get_grid_key(ss->pbvh, &key);
 
-	#pragma omp critical
-	{
-		if (smooth_mask) {
-			tmpgrid_mask = MEM_mallocN(sizeof(float) * gridsize * gridsize, "tmpgrid_mask");
-			tmprow_mask = MEM_mallocN(sizeof(float) * gridsize, "tmprow_mask");
-		}
-		else {
-			tmpgrid_co = MEM_mallocN(sizeof(float) * 3 * gridsize * gridsize, "tmpgrid_co");
-			tmprow_co = MEM_mallocN(sizeof(float) * 3 * gridsize, "tmprow_co");
-		}
-	}
+	thread_num = 0;
+#ifdef _OPENMP
+	if (sd->flags & SCULPT_USE_OPENMP)
+		thread_num = omp_get_thread_num();
+#endif
+	tmpgrid_co = ss->cache->tmpgrid_co[thread_num];
+	tmprow_co = ss->cache->tmprow_co[thread_num];
+	tmpgrid_mask = ss->cache->tmpgrid_mask[thread_num];
+	tmprow_mask = ss->cache->tmprow_mask[thread_num];
 
 	for (i = 0; i < totgrid; ++i) {
 		data = griddata[grid_indices[i]];
@@ -1393,18 +1397,6 @@
 			}
 		}
 	}
-
-	#pragma omp critical
-	{
-		if (smooth_mask) {
-			MEM_freeN(tmpgrid_mask);
-			MEM_freeN(tmprow_mask);
-		}
-		else {
-			MEM_freeN(tmpgrid_co);
-			MEM_freeN(tmprow_co);
-		}
-	}
 }
 
 static void smooth(Sculpt *sd, Object *ob, PBVHNode **nodes, int totnode,
@@ -3233,6 +3225,69 @@
 	}
 }
 
+static void sculpt_omp_start(Sculpt *sd, SculptSession *ss)
+{
+	StrokeCache *cache = ss->cache;
+
+#ifdef _OPENMP
+	/* If using OpenMP then create a number of threads two times the
+	 * number of processor cores.
+	 * Justification: Empirically I've found that two threads per
+	 * processor gives higher throughput. */
+	if (sd->flags & SCULPT_USE_OPENMP) {
+		cache->num_threads = 2 * omp_get_num_procs();
+		omp_set_num_threads(cache->num_threads);
+	}
+	else
+#endif
+	{
+		(void)sd;
+		cache->num_threads = 1;
+	}
+
+	if (ss->multires) {
+		int i, gridsize, array_mem_size;
+		BLI_pbvh_node_get_grids(ss->pbvh, NULL, NULL, NULL, NULL,
+								&gridsize, NULL, NULL);
+
+		array_mem_size = cache->num_threads * sizeof(void*);
+
+		cache->tmpgrid_co = MEM_mallocN(array_mem_size, "tmpgrid_co array");
+		cache->tmprow_co = MEM_mallocN(array_mem_size, "tmprow_co array");
+		cache->tmpgrid_mask = MEM_mallocN(array_mem_size, "tmpgrid_mask array");
+		cache->tmprow_mask = MEM_mallocN(array_mem_size, "tmprow_mask array");
+
+		for (i = 0; i < cache->num_threads; i++) {
+			const size_t row_size = sizeof(float) * gridsize;
+			const size_t co_row_size = 3 * row_size;
+
+			cache->tmprow_co[i] = MEM_mallocN(co_row_size, "tmprow_co");
+			cache->tmpgrid_co[i] = MEM_mallocN(co_row_size * gridsize, "tmpgrid_co");
+			cache->tmprow_mask[i] = MEM_mallocN(row_size, "tmprow_mask");
+			cache->tmpgrid_mask[i] = MEM_mallocN(row_size * gridsize, "tmpgrid_mask");
+		}
+	}
+}
+
+static void sculpt_omp_done(SculptSession *ss)
+{
+	if (ss->multires) {
+		int i;
+
+		for (i = 0; i < ss->cache->num_threads; i++) {
+			MEM_freeN(ss->cache->tmpgrid_co[i]);
+			MEM_freeN(ss->cache->tmprow_co[i]);
+			MEM_freeN(ss->cache->tmpgrid_mask[i]);
+			MEM_freeN(ss->cache->tmprow_mask[i]);
+		}
+
+		MEM_freeN(ss->cache->tmpgrid_co);
+		MEM_freeN(ss->cache->tmprow_co);
+		MEM_freeN(ss->cache->tmpgrid_mask);
+		MEM_freeN(ss->cache->tmprow_mask);
+	}
+}
+
 /* Initialize the stroke cache invariants from operator properties */
 static void sculpt_update_cache_invariants(bContext *C, Sculpt *sd, SculptSession *ss, wmOperator *op, const float mouse[2])
 {
@@ -3346,6 +3401,8 @@
 	cache->first_time = 1;
 
 	cache->vertex_rotation = 0;
+
+	sculpt_omp_start(sd, ss);
 }
 
 static void sculpt_update_brush_delta(Sculpt *sd, Object *ob, Brush *brush)
@@ -3798,19 +3855,6 @@
 
 		sculpt_undo_push_begin(sculpt_tool_name(sd));
 
-#ifdef _OPENMP
-		/* If using OpenMP then create a number of threads two times the
-		 * number of processor cores.
-		 * Justification: Empirically I've found that two threads per
-		 * processor gives higher throughput. */
-		if (sd->flags & SCULPT_USE_OPENMP) {
-			int num_procs;
-
-			num_procs = omp_get_num_procs();
-			omp_set_num_threads(2 * num_procs);
-		}
-#endif
-
 		return 1;
 	}
 	else
@@ -3847,6 +3891,8 @@
 	SculptSession *ss = ob->sculpt;
 	Sculpt *sd = CTX_data_tool_settings(C)->sculpt;
 
+	sculpt_omp_done(ss);
+
 	/* reset values used to draw brush after completing the stroke */
 	sd->draw_anchored = 0;
 	sd->draw_pressure = 0;