[Bf-blender-cvs] [d22c061] compositor-2016: Cycles: Reduce amount of malloc() calls from the kernel

Sergey Sharybin noreply at git.blender.org
Wed Jun 8 21:47:47 CEST 2016


Commit: d22c061d15f6bb64ef8ce5138467869227bb4dc5
Author: Sergey Sharybin
Date:   Tue May 17 12:30:46 2016 +0200
Branches: compositor-2016
https://developer.blender.org/rBd22c061d15f6bb64ef8ce5138467869227bb4dc5

Cycles: Reduce amount of malloc() calls from the kernel

This commit makes it so malloc() only happens once per volume and
once per transparent shadow query (per thread), improving the
scalability of the code across multiple CPU cores.

This is hard to measure on the low-end i7 available here, but quick
tests suggest volume sampling gets about a 3-5% speedup.

The idea is to store the allocated memory in the kernel globals,
which are already per-thread on the CPU.

Reviewers: dingto, juicyfruit, lukasstockner97, maiself, brecht

Reviewed By: brecht

Subscribers: Blendify, nutel

Differential Revision: https://developer.blender.org/D1996
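
To make the idea above concrete, here is a minimal standalone sketch of
the pattern (hypothetical names, not the actual Cycles code): each
render thread copies the shared globals, lazily allocates its own
scratch buffer on first use, and frees it once when the thread
finishes, so after warm-up no malloc() happens inside the render loop.

#include <cstdlib>

struct ThreadGlobals {
	/* ...shared read-only data copied from the device... */
	void *scratch;  /* Per-thread scratch memory, allocated lazily. */
};

static ThreadGlobals thread_globals_init(const ThreadGlobals& shared)
{
	ThreadGlobals tg = shared;  /* Per-thread copy of the globals. */
	tg.scratch = NULL;          /* Allocated on first use. */
	return tg;
}

static void *thread_scratch(ThreadGlobals *tg, size_t worst_case_size)
{
	if(tg->scratch == NULL) {
		/* First use on this thread: allocate once, at the worst-case
		 * size, so every later query reuses the same buffer. */
		tg->scratch = malloc(worst_case_size);
	}
	return tg->scratch;
}

static void thread_globals_free(ThreadGlobals *tg)
{
	free(tg->scratch);  /* free(NULL) is a no-op. */
	tg->scratch = NULL;
}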

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/kernel_globals.h
M	intern/cycles/kernel/kernel_shadow.h
M	intern/cycles/kernel/kernel_volume.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 676b1279..275ee02 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -213,12 +213,7 @@ public:
 				return;
 		}
 
-		KernelGlobals kg = kernel_globals;
-
-#ifdef WITH_OSL
-		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
-
+		KernelGlobals kg = thread_kernel_globals_init();
 		RenderTile tile;
 
 		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
@@ -289,9 +284,7 @@ public:
 			}
 		}
 
-#ifdef WITH_OSL
-		OSLShader::thread_free(&kg);
-#endif
+		thread_kernel_globals_free(&kg);
 	}
 
 	void thread_film_convert(DeviceTask& task)
@@ -481,6 +474,40 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+protected:
+	inline KernelGlobals thread_kernel_globals_init()
+	{
+		KernelGlobals kg = kernel_globals;
+		kg.transparent_shadow_intersections = NULL;
+		const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
+		                            sizeof(*kg.decoupled_volume_steps);
+		for(int i = 0; i < decoupled_count; ++i) {
+			kg.decoupled_volume_steps[i] = NULL;
+		}
+		kg.decoupled_volume_steps_index = 0;
+#ifdef WITH_OSL
+		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
+#endif
+		return kg;
+	}
+
+	inline void thread_kernel_globals_free(KernelGlobals *kg)
+	{
+		if(kg->transparent_shadow_intersections != NULL) {
+			free(kg->transparent_shadow_intersections);
+		}
+		const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
+		                            sizeof(*kg->decoupled_volume_steps);
+		for(int i = 0; i < decoupled_count; ++i) {
+			if(kg->decoupled_volume_steps[i] != NULL) {
+				free(kg->decoupled_volume_steps[i]);
+			}
+		}
+#ifdef WITH_OSL
+		OSLShader::thread_free(kg);
+#endif
+	}
 };
 
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index c44ea1b..7e6cdf9 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -31,6 +31,9 @@ struct OSLThreadData;
 struct OSLShadingSystem;
 #  endif
 
+struct Intersection;
+struct VolumeStep;
+
 typedef struct KernelGlobals {
 	texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_IMAGES_CPU];
 	texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_IMAGES_CPU];
@@ -51,6 +54,14 @@ typedef struct KernelGlobals {
 	OSLThreadData *osl_tdata;
 #  endif
 
+	/* **** Run-time data ****  */
+
+	/* Heap-allocated storage for transparent shadows intersections. */
+	Intersection *transparent_shadow_intersections;
+
+	/* Storage for decoupled volume steps. */
+	VolumeStep *decoupled_volume_steps[2];
+	int decoupled_volume_steps_index;
 } KernelGlobals;
 
 #endif  /* __KERNEL_CPU__ */
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 3b1111e..504ac2e 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -59,14 +59,20 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 		/* intersect to find an opaque surface, or record all transparent surface hits */
 		Intersection hits_stack[STACK_MAX_HITS];
 		Intersection *hits = hits_stack;
-		uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1;
+		const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+		uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
 
 		/* prefer to use stack but use dynamic allocation if too deep max hits
 		 * we need max_hits + 1 storage space due to the logic in
 		 * scene_intersect_shadow_all which will first store and then check if
 		 * the limit is exceeded */
-		if(max_hits + 1 > STACK_MAX_HITS)
-			hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1));
+		if(max_hits + 1 > STACK_MAX_HITS) {
+			if(kg->transparent_shadow_intersections == NULL) {
+				kg->transparent_shadow_intersections =
+				    (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+			}
+			hits = kg->transparent_shadow_intersections;
+		}
 
 		uint num_hits;
 		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
@@ -147,14 +153,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 
 			*shadow = throughput;
 
-			if(hits != hits_stack)
-				free(hits);
 			return is_zero(throughput);
 		}
-
-		/* free dynamic storage */
-		if(hits != hits_stack)
-			free(hits);
 	}
 	else {
 		Intersection isect;
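
The kernel_shadow.h change above keeps the small on-stack buffer as the
fast path and only falls back to the heap for deep transparent bounce
limits; with this commit the fallback buffer is allocated once per
thread at the worst-case size and reused, instead of being malloc()'d
and free()'d per query. A simplified sketch of that pattern
(hypothetical names, not the actual kernel code):

#include <cstdlib>

#define STACK_MAX_HITS 64

struct Intersection { float t; };

struct Globals {
	Intersection *transparent_shadow_intersections;
};

static Intersection *get_hits_buffer(Globals *kg,
                                     Intersection *hits_stack,
                                     int max_hits,
                                     int transparent_max_bounce)
{
	if(max_hits + 1 <= STACK_MAX_HITS) {
		return hits_stack;  /* Shallow limit: stay on the stack. */
	}
	if(kg->transparent_shadow_intersections == NULL) {
		/* Deep limit: allocate once per thread at the worst-case size
		 * (the global bounce limit), so every later deep query reuses
		 * the same buffer; it is freed together with the globals. */
		kg->transparent_shadow_intersections = (Intersection*)
		        malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
	}
	return kg->transparent_shadow_intersections;
}
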
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index c499773..224c275 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -627,12 +627,30 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		step_size = kernel_data.integrator.volume_step_size;
 		/* compute exact steps in advance for malloc */
 		max_steps = max((int)ceilf(ray->t/step_size), 1);
+		/* NOTE: For branched path tracing it's possible for both direct
+		 * and indirect light integration to have volume segments allocated.
+		 * We detect this using an index into the pre-allocated memory.
+		 * Currently only two segments may be allocated at a time; if more
+		 * are needed, some modifications to KernelGlobals will be needed.
+		 *
+		 * This restricts decoupled record to a stack discipline: a nested
+		 * call to decoupled record must free its memory before its caller
+		 * frees memory.
+		 */
+		const int index = kg->decoupled_volume_steps_index;
+		assert(index < sizeof(kg->decoupled_volume_steps) /
+		               sizeof(*kg->decoupled_volume_steps));
 		if(max_steps > global_max_steps) {
 			max_steps = global_max_steps;
 			step_size = ray->t / (float)max_steps;
 		}
-		segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+		if(kg->decoupled_volume_steps[index] == NULL) {
+			kg->decoupled_volume_steps[index] =
+			        (VolumeStep*)malloc(sizeof(VolumeStep)*global_max_steps);
+		}
+		segment->steps = kg->decoupled_volume_steps[index];
 		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
+		++kg->decoupled_volume_steps_index;
 	}
 	else {
 		max_steps = 1;
@@ -745,8 +763,14 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
 {
-	if(segment->steps != &segment->stack_step)
-		free(segment->steps);
+	if(segment->steps != &segment->stack_step) {
+		/* NOTE: Only the last allocated segment may be freed.
+		 * Arbitrary alloc/free ordering is not supported.
+		 */
+		assert(kg->decoupled_volume_steps_index > 0);
+		assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]);
+		--kg->decoupled_volume_steps_index;
+	}
 }
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
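
To make the stack-discipline constraint from the kernel_volume.h NOTE
concrete, here is a simplified sketch (hypothetical names, not the
actual kernel code): at most two decoupled segments may be live at
once, and they must be released in the reverse order of their
allocation; the "free" only pops the index, the memory itself stays
cached for reuse.

#include <cassert>
#include <cstdlib>

struct VolumeStep { float t; };

struct Globals {
	VolumeStep *decoupled_volume_steps[2];
	int decoupled_volume_steps_index;
};

static VolumeStep *segment_record(Globals *kg, int global_max_steps)
{
	const int index = kg->decoupled_volume_steps_index;
	assert(index < 2);  /* Only two live segments are supported. */
	if(kg->decoupled_volume_steps[index] == NULL) {
		/* Allocate this slot once per thread, at the worst-case
		 * size, so later segments reuse the same memory. */
		kg->decoupled_volume_steps[index] =
		        (VolumeStep*)malloc(sizeof(VolumeStep)*global_max_steps);
	}
	++kg->decoupled_volume_steps_index;
	return kg->decoupled_volume_steps[index];
}

static void segment_free(Globals *kg, VolumeStep *steps)
{
	/* Only the most recently recorded segment may be freed. */
	assert(kg->decoupled_volume_steps_index > 0);
	assert(steps ==
	       kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]);
	--kg->decoupled_volume_steps_index;  /* Memory stays cached. */
}

With branched path tracing this means, for example, that a segment
recorded for indirect light while a direct-light segment is still live
must be freed first, matching how the integrator nests these calls.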



