[Bf-blender-cvs] [bc8e3a3] cycles_split_kernel: Cycles: Replace use of cl_mem with device_memory in split kernel device

Tue Oct 18 12:24:53 CEST 2016

Commit: bc8e3a3d868d19719f67ec2e5ed2d0b516a98312
Author: Mai Lavelle
Date:   Tue Oct 18 11:30:25 2016 +0200
Branches: cycles_split_kernel
https://developer.blender.org/rBbc8e3a3d868d19719f67ec2e5ed2d0b516a98312

Cycles: Replace use of cl_mem with device_memory in split kernel device

Working towards using only device agnostic types and methods in the host.

===================================================================

M	intern/cycles/device/device.h
M	intern/cycles/device/device_memory.h
M	intern/cycles/device/opencl/opencl.h
M	intern/cycles/device/opencl/opencl_split.cpp

===================================================================

diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 77dc1fa..014e5fc 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -226,6 +226,25 @@ public:
 	virtual void mem_zero(device_memory& mem) = 0;
 	virtual void mem_free(device_memory& mem) = 0;
 
+	/* setup and allocate a device_memory object for use on device only (no host side buffer)*/
+	void mem_alloc(device_memory& mem, size_t size, MemoryType type = MEM_READ_WRITE)
+	{
+		mem.data_type = device_type_traits<uchar>::data_type;
+		mem.data_elements = 1;
+		mem.data_pointer = 0;
+		mem.data_size = size;
+		mem.device_size = 0;
+		mem.data_width = size;
+		mem.data_height = 1;
+		mem.data_depth = 1;
+
+		assert(mem.data_elements > 0);
+
+		mem.device_pointer = 0;
+
+		mem_alloc(mem, type);
+	}
+
 	/* constant memory */
 	virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
 
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b5b4dc..0093c93 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -180,10 +180,20 @@ public:
 	/* device pointer */
 	device_ptr device_pointer;
 
-protected:
-	device_memory() {}
+	device_memory() {
+		data_type = device_type_traits<float>::data_type;
+		data_elements = device_type_traits<float>::num_elements;
+		data_pointer = 0;
+		data_size = 0;
+		device_size = 0;
+		data_width = 0;
+		data_height = 0;
+		data_depth = 0;
+		device_pointer = 0;
+	}
 	virtual ~device_memory() { assert(!device_pointer); }
 
+protected:
 	/* no copying */
 	device_memory(const device_memory&);
 	device_memory& operator = (const device_memory&);
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 30a35ac..83603b2 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -265,6 +265,7 @@ public:
 	                          vector<OpenCLProgram*> &programs) = 0;
 
 	void mem_alloc(device_memory& mem, MemoryType type);
+	using Device::mem_alloc;
 	void mem_copy_to(device_memory& mem);
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
@@ -321,16 +322,39 @@ protected:
 
 	class ArgumentWrapper {
 	public:
-		ArgumentWrapper() : size(0), pointer(NULL) {}
-		template <typename T>
+		ArgumentWrapper() : size(0), pointer(NULL)
+		{
+		}
+
+		ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
+		                                           pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
+		ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
+		                                              pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
 		ArgumentWrapper(T& argument) : size(sizeof(argument)),
-		                               pointer(&argument) { }
+		                               pointer(&argument)
+		{
+		}
+
 		ArgumentWrapper(int argument) : size(sizeof(int)),
 		                                int_value(argument),
-		                                pointer(&int_value) { }
+		                                pointer(&int_value)
+		{
+		}
+
 		ArgumentWrapper(float argument) : size(sizeof(float)),
 		                                  float_value(argument),
-		                                  pointer(&float_value) { }
+		                                  pointer(&float_value)
+		{
+		}
+
 		size_t size;
 		int int_value;
 		float float_value;
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index fc80173..e1e1f54 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -107,20 +107,13 @@ public:
 	 * kernel will be available to another kernel via this global
 	 * memory.
 	 */
-	cl_mem kgbuffer;  /* KernelGlobals buffer. */
-
-	cl_mem split_data;
-
-	/* Global state array that tracks ray state. */
-	cl_mem ray_state;
-
-	/* Queue */
-	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
-	                     * Tracks the size of each queue.
-	                     */
+	device_memory kgbuffer;
+	device_memory split_data;
+	device_vector<uchar> ray_state;
+	device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
 
 	/* Flag to make sceneintersect and lampemission kernel use queues. */
-	cl_mem use_queues_flag;
+	device_memory use_queues_flag;
 
 	/* Amount of memory in output buffer associated with one pixel/thread. */
 	size_t per_thread_output_buffer_size;
@@ -128,17 +121,12 @@ public:
 	/* Total allocatable available device memory. */
 	size_t total_allocatable_memory;
 
-	/* host version of ray_state; Used in checking host path-iteration
-	 * termination.
-	 */
-	char *hostRayStateArray;
-
 	/* Number of path-iterations to be done in one shot. */
 	unsigned int PathIteration_times;
 
 #ifdef __WORK_STEALING__
 	/* Work pool with respect to each work group. */
-	cl_mem work_pool_wgs;
+	device_memory work_pool_wgs;
 
 	/* Denotes the maximum work groups possible w.r.t. current tile size. */
 	unsigned int max_work_groups;
@@ -155,20 +143,9 @@ public:
 	{
 		background = background_;
 
-		/* Initialize cl_mem variables. */
-		kgbuffer = NULL;
-		split_data = NULL;
-		ray_state = NULL;
-
-		/* Queue. */
-		Queue_index = NULL;
-		use_queues_flag = NULL;
-
 		per_thread_output_buffer_size = 0;
-		hostRayStateArray = NULL;
 		PathIteration_times = PATH_ITER_INC_FACTOR;
 #ifdef __WORK_STEALING__
-		work_pool_wgs = NULL;
 		max_work_groups = 0;
 #endif
 		current_max_closure = -1;
@@ -294,18 +271,14 @@ public:
 		program_sum_all_radiance.release();
 
 		/* Release global memory */
-		release_mem_object_safe(kgbuffer);
-		release_mem_object_safe(split_data);
-		release_mem_object_safe(ray_state);
-		release_mem_object_safe(use_queues_flag);
-		release_mem_object_safe(Queue_index);
+		mem_free(kgbuffer);
+		mem_free(split_data);
+		mem_free(ray_state);
+		mem_free(use_queues_flag);
+		mem_free(queue_index);
 #ifdef __WORK_STEALING__
-		release_mem_object_safe(work_pool_wgs);
+		mem_free(work_pool_wgs);
 #endif
-
-		if(hostRayStateArray != NULL) {
-			free(hostRayStateArray);
-		}
 	}
 
 	void path_trace(DeviceTask *task,
@@ -313,9 +286,9 @@ public:
 	                int2 max_render_feasible_tile_size)
 	{
 		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		device_memory& d_data = *const_mem_map["__data"];
+		device_ptr d_buffer = rtile.buffer;
+		device_ptr d_rng_state = rtile.rng_state;
 		cl_int d_x = rtile.x;
 		cl_int d_y = rtile.y;
 		cl_int d_w = rtile.w;
@@ -383,26 +356,25 @@ public:
 			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
 			max_work_groups = (max_global_size[0] * max_global_size[1]) /
 			                  (local_size[0] * local_size[1]);
+
 			/* Allocate work_pool_wgs memory. */
-			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
+			mem_alloc(work_pool_wgs, max_work_groups * sizeof(unsigned int));
 #endif  /* __WORK_STEALING__ */
 
-			/* Allocate queue_index memory only once. */
-			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
-			use_queues_flag = mem_alloc(sizeof(char));
-			kgbuffer = mem_alloc(get_KernelGlobals_size());
-			ray_state = mem_alloc(num_global_elements * sizeof(char));
-			split_data = mem_alloc(split_data_buffer_size(num_global_elements,
-			                                              current_max_closure,
-			                                              per_thread_output_buffer_size));
-
-			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
-			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
+			mem_alloc(queue_index, NUM_QUEUES * sizeof(int));
+			mem_alloc(use_queues_flag, sizeof(char));
+			mem_alloc(kgbuffer, get_KernelGlobals_size());
+
+			ray_state.resize(num_global_elements);
+			mem_alloc(ray_state, MEM_READ_WRITE);
+
+			mem_alloc(split_data, split_data_buffer_size(num_global_elements,
+			                                             current_max_closure,
+			                                             per_thread_output_buffer_size));
 		}
 
 		cl_int dQueue_size = global_size[0] * global_size[1];
 
-		//printf("kernel_set_args data_init\n");
 		cl_uint start_arg_index =
 			kernel_set_args(program_data_init(),
 			                0,
@@ -433,7 +405,7 @@ public:
 			                rtile.rng_state_offset_x,
 			                rtile.rng_state_offset_y,
 			                rtile.buffer_rng_state_stride,
-			                Queue_index,
+			                queue_index,
 			                dQueue_size,
 			                use_queues_flag,
 #ifdef __WORK_STEALING__
@@ -463,7 +435,6 @@ public:
 #define GLUE(a, b) a ## b
 #define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
 		{ \
-			/*printf("enqueueing " #kernelName "\n");*/ \
 			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
 			                               GLUE(program_, \
 			                                    kernelName)(), \
@@ -517,20 +488,8 @@ public:
 				}
 			}
 
-			/* Read ray-state into Host memory to decide if we should exit
-			 * path-iteration in host.
-			 */
-			//printf("enqueue read\n");
-			ciErr = clEnqueueReadBuffer(cqCommandQueue,
-			                            ray_state,
-			                            CL_TRUE,
-			                            0,
-			                            global_size[0] * global_size[1] * sizeof(char),
-			                            hostRayStateArray,
-			                            0,
-			                            NULL,
-			                            NULL);
-			assert(ciErr == CL_SUCCESS);
+			/* Decide if we should exit path-iteration in host. */
+			mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
 
 			activeRaysAvailable = false;
 
@@ -538,7 +497,7 @@ public:
 			    rayStateIter < global_size[0] * global_size[1];
 			    ++rayStateIter)
 			{
-				if(int8_t(hostRayStateArra

@@ Diff output truncated at 10240 characters. @@