[Bf-blender-cvs] [504b201] cycles_split_kernel: Cycles: Simplify shared buffer and argument passing in split kernel

Mai Lavelle noreply at git.blender.org
Fri Oct 14 15:50:59 CEST 2016


Commit: 504b201ba7a661f2e551c2fcd6b93f64ebf25b57
Author: Mai Lavelle
Date:   Wed Oct 12 14:54:46 2016 +0200
Branches: cycles_split_kernel
https://developer.blender.org/rB504b201ba7a661f2e551c2fcd6b93f64ebf25b57

Cycles: Simplify shared buffer and argument passing in split kernel

Adds a single buffer for storing and passing around data shared between kernels.
The idea is to reduce the amount of code so that things are easier to work with
and to deduplicate later.

Benefits:
 - reduces the number of buffers that need to be allocated and kept track of by
   using a single buffer for data shared between kernels
 - simplifies passing of arguments to kernels by using a pointer in kernel
   globals to point to shared data
 - simplifies calculation of shared data size

Things may need renaming / reorganizing but this is just to get things started.

===================================================================

M	intern/cycles/device/CMakeLists.txt
M	intern/cycles/device/opencl/opencl_split.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/kernel_globals.h
M	intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
M	intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
M	intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
M	intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
M	intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
M	intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
M	intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
M	intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
M	intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
M	intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
M	intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
M	intern/cycles/kernel/split/kernel_background_buffer_update.h
M	intern/cycles/kernel/split/kernel_data_init.h
M	intern/cycles/kernel/split/kernel_direct_lighting.h
M	intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M	intern/cycles/kernel/split/kernel_lamp_emission.h
M	intern/cycles/kernel/split/kernel_next_iteration_setup.h
M	intern/cycles/kernel/split/kernel_scene_intersect.h
M	intern/cycles/kernel/split/kernel_shader_eval.h
M	intern/cycles/kernel/split/kernel_shadow_blocked.h
M	intern/cycles/kernel/split/kernel_split_common.h
A	intern/cycles/kernel/split/kernel_split_data.h
M	intern/cycles/kernel/split/kernel_sum_all_radiance.h

===================================================================

diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 966ff5e..5c25434 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -3,6 +3,7 @@ set(INC
 	.
 	../graph
 	../kernel
+	../kernel/split
 	../kernel/svm
 	../kernel/osl
 	../util
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 239e73a..ce0c702 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -21,6 +21,7 @@
 #include "buffers.h"
 
 #include "kernel_types.h"
+#include "kernel_split_data.h"
 
 #include "util_md5.h"
 #include "util_path.h"
@@ -106,46 +107,14 @@ public:
 	 * kernel will be available to another kernel via this global
 	 * memory.
 	 */
-	cl_mem rng_coop;
-	cl_mem throughput_coop;
-	cl_mem L_transparent_coop;
-	cl_mem PathRadiance_coop;
-	cl_mem Ray_coop;
-	cl_mem PathState_coop;
-	cl_mem Intersection_coop;
 	cl_mem kgbuffer;  /* KernelGlobals buffer. */
 
-	/* Global buffers for ShaderData. */
-	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
-	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
-	                        * shadow_blocked kernel.
-	                        */
-
-	/* Global memory required for shadow blocked and accum_radiance. */
-	cl_mem BSDFEval_coop;
-	cl_mem ISLamp_coop;
-	cl_mem LightRay_coop;
-	cl_mem AOAlpha_coop;
-	cl_mem AOBSDF_coop;
-	cl_mem AOLightRay_coop;
-	cl_mem Intersection_coop_shadow;
-
-#ifdef WITH_CYCLES_DEBUG
-	/* DebugData memory */
-	cl_mem debugdata_coop;
-#endif
+	cl_mem split_data;
 
 	/* Global state array that tracks ray state. */
 	cl_mem ray_state;
 
-	/* Per sample buffers. */
-	cl_mem per_sample_output_buffers;
-
-	/* Denotes which sample each ray is being processed for. */
-	cl_mem work_array;
-
 	/* Queue */
-	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
 	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
 	                     * Tracks the size of each queue.
 	                     */
@@ -188,39 +157,13 @@ public:
 
 		/* Initialize cl_mem variables. */
 		kgbuffer = NULL;
-		sd = NULL;
-		sd_DL_shadow = NULL;
-
-		rng_coop = NULL;
-		throughput_coop = NULL;
-		L_transparent_coop = NULL;
-		PathRadiance_coop = NULL;
-		Ray_coop = NULL;
-		PathState_coop = NULL;
-		Intersection_coop = NULL;
+		split_data = NULL;
 		ray_state = NULL;
 
-		AOAlpha_coop = NULL;
-		AOBSDF_coop = NULL;
-		AOLightRay_coop = NULL;
-		BSDFEval_coop = NULL;
-		ISLamp_coop = NULL;
-		LightRay_coop = NULL;
-		Intersection_coop_shadow = NULL;
-
-#ifdef WITH_CYCLES_DEBUG
-		debugdata_coop = NULL;
-#endif
-
-		work_array = NULL;
-
 		/* Queue. */
-		Queue_data = NULL;
 		Queue_index = NULL;
 		use_queues_flag = NULL;
 
-		per_sample_output_buffers = NULL;
-
 		per_thread_output_buffer_size = 0;
 		hostRayStateArray = NULL;
 		PathIteration_times = PATH_ITER_INC_FACTOR;
@@ -265,12 +208,6 @@ public:
 		return ret_size;
 	}
 
-	size_t get_shader_data_size(size_t max_closure)
-	{
-		/* ShaderData size with variable size ShaderClosure array */
-		return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));
-	}
-
 	/* Returns size of KernelGlobals structure associated with OpenCL. */
 	size_t get_KernelGlobals_size()
 	{
@@ -285,6 +222,7 @@ public:
 #undef KERNEL_TEX
 			void *sd_input;
 			void *isect_shadow;
+			SplitData split_data;
 		} KernelGlobals;
 
 		return sizeof(KernelGlobals);
@@ -355,35 +293,14 @@ public:
 		program_sum_all_radiance.release();
 
 		/* Release global memory */
-		release_mem_object_safe(rng_coop);
-		release_mem_object_safe(throughput_coop);
-		release_mem_object_safe(L_transparent_coop);
-		release_mem_object_safe(PathRadiance_coop);
-		release_mem_object_safe(Ray_coop);
-		release_mem_object_safe(PathState_coop);
-		release_mem_object_safe(Intersection_coop);
 		release_mem_object_safe(kgbuffer);
-		release_mem_object_safe(sd);
-		release_mem_object_safe(sd_DL_shadow);
+		release_mem_object_safe(split_data);
 		release_mem_object_safe(ray_state);
-		release_mem_object_safe(AOAlpha_coop);
-		release_mem_object_safe(AOBSDF_coop);
-		release_mem_object_safe(AOLightRay_coop);
-		release_mem_object_safe(BSDFEval_coop);
-		release_mem_object_safe(ISLamp_coop);
-		release_mem_object_safe(LightRay_coop);
-		release_mem_object_safe(Intersection_coop_shadow);
-#ifdef WITH_CYCLES_DEBUG
-		release_mem_object_safe(debugdata_coop);
-#endif
 		release_mem_object_safe(use_queues_flag);
-		release_mem_object_safe(Queue_data);
 		release_mem_object_safe(Queue_index);
-		release_mem_object_safe(work_array);
 #ifdef __WORK_STEALING__
 		release_mem_object_safe(work_pool_wgs);
 #endif
-		release_mem_object_safe(per_sample_output_buffers);
 
 		if(hostRayStateArray != NULL) {
 			free(hostRayStateArray);
@@ -451,15 +368,11 @@ public:
 		assert(global_size[0] * global_size[1] <=
 		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
 
+		int num_global_elements = max_render_feasible_tile_size.x *
+		                          max_render_feasible_tile_size.y;
+
 		/* Allocate all required global memory once. */
 		if(first_tile) {
-			size_t num_global_elements = max_render_feasible_tile_size.x *
-			                             max_render_feasible_tile_size.y;
-			/* TODO(sergey): This will actually over-allocate if
-			 * particular kernel does not support multiclosure.
-			 */
-			size_t shaderdata_size = get_shader_data_size(current_max_closure);
-
 #ifdef __WORK_STEALING__
 			/* Calculate max groups */
 			size_t max_global_size[2];
@@ -477,62 +390,27 @@ public:
 			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
 			use_queues_flag = mem_alloc(sizeof(char));
 			kgbuffer = mem_alloc(get_KernelGlobals_size());
-
-			/* Create global buffers for ShaderData. */
-			sd = mem_alloc(num_global_elements * shaderdata_size);
-			sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
-
-			/* Creation of global memory buffers which are shared among
-			 * the kernels.
-			 */
-			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
-			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
-			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
-			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
-			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
-			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
-			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
-			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
-			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
-
-#ifdef WITH_CYCLES_DEBUG
-			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
-#endif
-
 			ray_state = mem_alloc(num_global_elements * sizeof(char));
+			split_data = mem_alloc(split_data_buffer_size(num_global_elements,
+			                                              current_max_closure,
+			                                              per_thread_output_buffer_size));
 
 			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
 			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
-
-			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
-			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
-			per_sample_output_buffers = mem_alloc(num_global_elements *
-			                                      per_thread_output_buffer_size);
 		}
 
 		cl_int dQueue_size = global_size[0] * global_size[1];
 
+		//printf("kernel_set_args data_init\n");
 		cl_uint start_arg_index =
 			kernel_set_args(program_data_init(),
 			                0,
 			                kgbuffer,
-			                sd_DL_shadow,
 			                d_data,
-			                per_sample_output_buffers,
-			                d_rng_state,
-			                rng_coop,
-			                throughput_coop,
-			                L_transparent_coop,
-			                PathRadiance_coop,
-			                Ray_coop,
-			                PathState_coop,
-			                Intersection_coop_shadow,
-			                ray_state);
+							split_data,
+			                num_global_elements,
+							ray_state,
+			                d_rng_state);
 
 /* TODO(sergey): Avoid map lookup here. */
 #define KERNEL_TEX(type, ttype, name) \
@@ -553,78 +431,53 @@ public:
 			                rtile.rng_state_offset_x,
 			                rtile.rng_state_offset_y,
 			                rtile.buffer_rng_state_stride,
-			                Queue_data,
 			                Queue_index,
 			                dQueue_size,
 			                use_queues_flag,
-			                work_array,
 #ifdef __WORK_STEALING__
 			                work_pool_wgs,
 			                num_samples,
 #endif
-#ifdef WITH_CYCLES_DEBUG
-			                debugdata_coop,
-#endif
 			                num_parallel_samples);
 
+		//printf("kernel_set_args scene_intersect\n");
 		kernel_set_args(program_scene_intersect(),
 		                0,
 		                kgbuffer,
 		                d_data,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
 		                d_w,
 		                d_h,
-		                Queue_data,
 		                Queue_index,
 		                dQueue_size,
 		                use_queues_flag,
-#ifdef WITH_CYCLES_DEBUG
-		                debugdata_coop,
-#endif
 		                num_parallel_samples);
 
+		//printf("kernel_set_args lamp_emission\n");
 		kernel_set_args(program_lamp_emission(),
 		                0,
 		                kgbuffer,
 		                d_data,
-		                throughput_coop,
-		          

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list