[Bf-blender-cvs] [dcc4260] cycles_split_kernel: Cycles: Finish implementing the split kernel for CPU devices

Mai Lavelle noreply at git.blender.org
Thu Oct 27 20:22:37 CEST 2016


Commit: dcc42603d87a40c371f695c20d27a1f324925b85
Author: Mai Lavelle
Date:   Thu Oct 27 19:12:59 2016 +0200
Branches: cycles_split_kernel
https://developer.blender.org/rBdcc42603d87a40c371f695c20d27a1f324925b85

Cycles: Finish implementing the split kernel for CPU devices

Implements the `data_init` kernel plus a few fixes so that it runs correctly.

The split kernel now works on CPU devices. This means we have the ability to
debug the kernel, get better compiler logs, and have a new device to compare
to, all of which will help in developing and keeping the split kernel
correct. With this implementing branched path tracing, volumes and subsurface
scattering will be much easier.

Even at this point the CPU implementation already paying off by making long
existing bugs in the kernel apparent.

It should be noted that while the kernel works, it is a bit slower than the
mega kernel (~13% with BMW). Will need to investigate whether this is a
bug / implementation issue or if split kernels are simply slower by nature.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_memory.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
M	intern/cycles/kernel/split/kernel_data_init.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 50b6ad1..be52fae 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -61,13 +61,22 @@ public:
 	CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
 	~CPUSplitKernelFunction() {}
 
-	virtual bool enqueue(const KernelDimensions& /*dim*/, device_memory& kg, device_memory& data)
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
 	{
 		if(!func) {
 			return false;
 		}
 
-		func((KernelGlobals*)kg.device_pointer, (KernelData*)data.device_pointer);
+		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+		for(int y = 0; y < dim.global_size[1]; y++) {
+			for(int x = 0; x < dim.global_size[0]; x++) {
+				kg->global_id = make_int2(x, y);
+
+				func(kg, (KernelData*)data.device_pointer);
+			}
+		}
 
 		return true;
 	}
@@ -428,6 +437,7 @@ public:
 
 		DeviceSplitKernel split_kernel(this);
 
+		requested_features.max_closure = MAX_CLOSURE;
 		if(!split_kernel.load_kernels(requested_features)) {
 			return;
 		}
@@ -676,20 +686,88 @@ protected:
 	}
 
 	/* split kernel */
-	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& /*dim*/,
-	                                            RenderTile& /*rtile*/,
-	                                            int /*num_global_elements*/,
-	                                            int /*num_parallel_samples*/,
-	                                            device_memory& /*kernel_globals*/,
-	                                            device_memory& /*kernel_data*/,
-	                                            device_memory& /*split_data*/,
-	                                            device_memory& /*ray_state*/,
-	                                            device_memory& /*queue_index*/,
-	                                            device_memory& /*use_queues_flag*/,
-	                                            device_memory& /*work_pool_wgs*/)
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            int num_parallel_samples,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& data,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flags,
+	                                            device_memory& work_pool_wgs)
 	{
-		assert(!"not implemented for this device");
-		return false;
+		typedef void(*data_init_t)(KernelGlobals *kg,
+                                   ccl_constant KernelData *data,
+                                   ccl_global void *split_data_buffer,
+                                   int num_elements,
+                                   ccl_global char *ray_state,
+                                   ccl_global uint *rng_state,
+                                   int start_sample,
+                                   int end_sample,
+                                   int sx, int sy, int sw, int sh, int offset, int stride,
+                                   int rng_state_offset_x,
+                                   int rng_state_offset_y,
+                                   int rng_state_stride,
+                                   ccl_global int *Queue_index,
+                                   int queuesize,
+                                   ccl_global char *use_queues_flag,
+#ifdef __WORK_STEALING__
+                                   ccl_global unsigned int *work_pool_wgs,
+                                   unsigned int num_samples,
+#endif
+                                   int parallel_samples,
+                                   int buffer_offset_x,
+                                   int buffer_offset_y,
+                                   int buffer_stride,
+                                   ccl_global float *buffer);
+
+		data_init_t data_init = get_kernel_function<data_init_t>("data_init");
+		if(!data_init) {
+			return false;
+		}
+
+		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+		for(int y = 0; y < dim.global_size[1]; y++) {
+			for(int x = 0; x < dim.global_size[0]; x++) {
+				kg->global_id = make_int2(x, y);
+
+				data_init((KernelGlobals*)kernel_globals.device_pointer,
+						  (KernelData*)data.device_pointer,
+						  (void*)split_data.device_pointer,
+						  num_global_elements,
+						  (char*)ray_state.device_pointer,
+						  (uint*)rtile.rng_state,
+						  rtile.start_sample,
+						  rtile.start_sample + rtile.num_samples,
+						  rtile.x,
+						  rtile.y,
+						  rtile.w,
+						  rtile.h,
+						  rtile.offset,
+						  rtile.stride,
+						  rtile.rng_state_offset_x,
+						  rtile.rng_state_offset_y,
+						  rtile.buffer_rng_state_stride,
+						  (int*)queue_index.device_pointer,
+						  dim.global_size[0] * dim.global_size[1],
+						  (char*)use_queues_flags.device_pointer,
+#ifdef __WORK_STEALING__
+						  (uint*)work_pool_wgs.device_pointer,
+						  rtile.num_samples,
+#endif
+						  num_parallel_samples,
+						  rtile.buffer_offset_x,
+						  rtile.buffer_offset_y,
+						  rtile.buffer_rng_state_stride,
+						  (float*)rtile.buffer);
+			}
+		}
+
+		return true;
 	}
 
 	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
@@ -697,6 +775,10 @@ protected:
 		CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(this);
 
 		kernel->func = get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+		if(!kernel->func) {
+			delete kernel;
+			return nullptr;
+		}
 
 		return kernel;
 	}
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 0093c93..cb97b79 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -276,6 +276,7 @@ public:
 		data_height = 0;
 		data_depth = 0;
 		data_size = 0;
+		device_pointer = 0;
 	}
 
 	size_t size()
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 2310b0f..8e3fa92 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -32,6 +32,11 @@
 #  define ccl_addr_space
 #endif
 
+#ifdef __SPLIT_KERNEL__
+/* TODO(mai): need to investigate how this effects the kernel, as cpu kernel crashes without this right now */
+#define __COMPUTE_DEVICE_GPU__
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* constants */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 30ffb05..8f5e329 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -37,6 +37,7 @@
 #else
 #  include "split/kernel_split_common.h"
 
+#  include "split/kernel_data_init.h"
 #  include "split/kernel_scene_intersect.h"
 #  include "split/kernel_lamp_emission.h"
 #  include "split/kernel_queue_enqueue.h"
@@ -184,6 +185,8 @@ void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name,
 	REGISTER(convert_to_half_float);
 	REGISTER(shader);
 
+	reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(data_init)), (void*)kernel_data_init);
+
 	REGISTER(scene_intersect);
 	REGISTER(lamp_emission);
 	REGISTER(queue_enqueue);
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 8808715..0e5d9b1 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -15,6 +15,7 @@
  */
 
 #include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
 #include "split/kernel_data_init.h"
 
 __kernel void kernel_ocl_path_trace_data_init(
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index d6a88ce..840c6ce 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
 /* Note on kernel_data_initialization kernel
  * This kernel Initializes structures needed in path-iteration kernels.
@@ -59,9 +59,11 @@ ccl_device void kernel_data_init(
         ccl_global char *ray_state,
         ccl_global uint *rng_state,
 
+#ifndef __KERNEL_CPU__
 #define KERNEL_TEX(type, ttype, name)                                   \
         ccl_global type *name,
 #include "../kernel_textures.h"
+#endif
 
         int start_sample,
         int end_sample,
@@ -82,7 +84,9 @@ ccl_device void kernel_data_init(
         int buffer_stride,
         ccl_global float *buffer)
 {
+#ifndef __KERNEL_CPU__
 	kg->data = data;
+#endif
 
 	split_params->x = sx;
 	split_params->y = sy;
@@ -120,9 +124,11 @@ ccl_device void kernel_data_init(
 
 	kg->sd_input = split_state->sd_DL_shadow;
 	kg->isect_shadow = split_state->isect_shadow;
+#ifndef __KERNEL_CPU__
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
 #include "../kernel_textures.h"
+#endif
 
 	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
 
@@ -266,3 +272,6 @@ ccl_device void kernel_data_init(
 		split_state->ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
 	}
 }
+
+CCL_NAMESPACE_END
+




More information about the Bf-blender-cvs mailing list