[Bf-blender-cvs] [60ad21badb] cycles_split_kernel: Cycles: Calculate size of split state buffer kernel side

Mai Lavelle noreply at git.blender.org
Sat Mar 4 12:38:59 CET 2017


Commit: 60ad21badbeff1584418f632b90254943a8b5bf8
Author: Mai Lavelle
Date:   Sat Mar 4 06:29:01 2017 -0500
Branches: cycles_split_kernel
https://developer.blender.org/rB60ad21badbeff1584418f632b90254943a8b5bf8

Cycles: Calculate size of split state buffer kernel side

By calculating the size of the state buffer in the kernel rather than the host
less code is needed and the size actually reflects the requested features.

Will also be a little faster in some cases because of larger global work size.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_split_kernel.cpp
M	intern/cycles/device/device_split_kernel.h
M	intern/cycles/device/opencl/opencl_split.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/kernels/cuda/kernel_split.cu
A	intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
M	intern/cycles/kernel/split/kernel_data_init.h
M	intern/cycles/kernel/split/kernel_split_data.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index b4d470747c..1589bbe1a3 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -71,7 +71,8 @@ public:
 
 	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
 	virtual int2 split_kernel_local_size();
-	virtual int2 split_kernel_global_size(DeviceTask *task);
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+	virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
 };
 
 class CPUDevice : public Device
@@ -854,11 +855,17 @@ int2 CPUSplitKernel::split_kernel_local_size()
 	return make_int2(1, 1);
 }
 
-int2 CPUSplitKernel::split_kernel_global_size(DeviceTask *task) {
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) {
 	/* TODO(mai): this needs investigation but cpu gives incorrect render if global size doesnt match tile size */
 	return task->requested_tile_size;
 }
 
+size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+
+	return split_data_buffer_size(kg, num_threads);
+}
+
 unordered_map<string, void*> CPUDevice::kernel_functions;
 
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 52f1b2a2a1..e6596a624d 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -88,6 +88,8 @@ class CUDASplitKernel : public DeviceSplitKernel {
 public:
 	explicit CUDASplitKernel(CUDADevice *device);
 
+	virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
 	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
 	                                            RenderTile& rtile,
 	                                            int num_global_elements,
@@ -101,7 +103,7 @@ public:
 
 	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
 	virtual int2 split_kernel_local_size();
-	virtual int2 split_kernel_global_size(DeviceTask *task);
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
 };
 
 class CUDADevice : public Device
@@ -1470,6 +1472,43 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
 {
 }
 
+size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+	device_vector<uint> size_buffer;
+	size_buffer.resize(1);
+	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+	device->cuda_push_context();
+
+	uint threads = num_threads;
+	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+	struct args_t {
+		uint* num_threads;
+		CUdeviceptr* size;
+	};
+
+	args_t args = {
+		&threads,
+		&d_size
+	};
+
+	CUfunction state_buffer_size;
+	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+	cuda_assert(cuLaunchKernel(state_buffer_size,
+		                       1, 1, 1,
+		                       1, 1, 1,
+		                       0, 0, &args, 0));
+
+	device->cuda_pop_context();
+
+	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+	device->mem_free(size_buffer);
+
+	return *size_buffer.get_data();
+}
+
 bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                     RenderTile& rtile,
                                     int num_global_elements,
@@ -1572,7 +1611,7 @@ int2 CUDASplitKernel::split_kernel_local_size()
 	return make_int2(32, 1);
 }
 
-int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
 {
 	/* TODO(mai): implement something here to detect ideal work size */
 	return make_int2(256, 256);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index f16fb6a1ea..799479ddb6 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -90,9 +90,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
 	return true;
 }
 
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size)
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size)
 {
-	size_t size_per_element = split_data_buffer_size(1024, current_max_closure, passes_size) / 1024;
+	size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
 	return max_buffer_size / size_per_element;
 }
 
@@ -113,13 +113,10 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 		local_size[1] = lsize[1];
 	}
 
-	/* Calculate per_thread_output_buffer_size. */
-	size_t per_thread_output_buffer_size = task->passes_size;
-
 	/* Set gloabl size */
 	size_t global_size[2];
 	{
-		int2 gsize = split_kernel_global_size(task);
+		int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
 
 		/* Make sure that set work size is a multiple of local
 		 * work size dimensions.
@@ -153,9 +150,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 		ray_state.resize(num_global_elements);
 		device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
 
-		split_data.resize(split_data_buffer_size(num_global_elements,
-		                                         current_max_closure,
-		                                         per_thread_output_buffer_size));
+		split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
 		device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
 	}
 
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 1c6a2709cf..cc3e1aa26a 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -100,7 +100,8 @@ public:
 	                device_memory& kgbuffer,
 	                device_memory& kernel_data);
 
-	size_t max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size);
+	virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+	size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size);
 
 	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
 	                                            RenderTile& rtile,
@@ -115,7 +116,7 @@ public:
 
 	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
 	virtual int2 split_kernel_local_size() = 0;
-	virtual int2 split_kernel_global_size(DeviceTask *task) = 0;
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 7e04c6fac2..a44f5da3a3 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -60,6 +60,7 @@ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
 public:
 	DeviceSplitKernel *split_kernel;
 	OpenCLProgram program_data_init;
+	OpenCLProgram program_state_buffer_size;
 
 	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
 
@@ -83,6 +84,13 @@ public:
 		program_data_init.add_kernel(ustring("path_trace_data_init"));
 		programs.push_back(&program_data_init);
 
+		program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
+		                                  "split_state_buffer_size",
+		                                  "kernel_state_buffer_size.cl",
+		                                  get_build_options(this, requested_features));
+		program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
+		programs.push_back(&program_state_buffer_size);
+
 		return split_kernel->load_kernels(requested_features);
 	}
 
@@ -216,6 +224,41 @@ public:
 		return kernel;
 	}
 
+	virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
+	{
+		device_vector<uint> size_buffer;
+		size_buffer.resize(1);
+		device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+		uint threads = num_threads;
+		device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
+
+		size_t global_size = 64;
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                               device->program_state_buffer_size(),
+		                               1,
+		                               NULL,
+		                               &global_size,
+		                               NULL,
+		                               0,
+		                               NULL,
+		                               NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+		device->mem_free(size_buffer);
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return 0;
+		}
+
+		return *size_buffer.get_data();
+	}
+
 	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
 	                                            RenderTile& rtile,
 	                                            int num_global_elements,
@@ -298,7 +341,7 @@ public:
 		return make_int2(64, 1);
 	}
 
-	virtual int2 split_kernel_global_size(DeviceTask *task)
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
 	{
 		size_t max_buffer_size;
 		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
@@ -306,7 +34

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list