[Bf-blender-cvs] [5b3219a] cycles_split_kernel: Cycles: Remove tile splitting logic for split kernel

Fri Oct 21 15:26:31 CEST 2016

Commit: 5b3219a999d54e6fbb66583aab368c8e28d2b441
Author: Mai Lavelle
Date:   Fri Oct 21 14:46:58 2016 +0200
Branches: cycles_split_kernel
https://developer.blender.org/rB5b3219a999d54e6fbb66583aab368c8e28d2b441

Cycles: Remove tile splitting logic for split kernel

This is a huge reduction in code that was very hard to work with. It will be
much easier to go forward now that it is gone.

An unfortunate downside of no longer having tile splitting is that if the
tile size is set too large, the user could get an out-of-memory error. We will
have to fix this at some point, but for now getting the kernel to run on the
CPU is more important.

Also, at this point the host side of the split kernel is almost entirely
independent of the underlying device, so work on the CPU implementation can
finally start.

===================================================================

M	intern/cycles/blender/blender_sync.cpp
M	intern/cycles/device/device_split_kernel.cpp
M	intern/cycles/device/device_split_kernel.h
M	intern/cycles/device/opencl/opencl_split.cpp

===================================================================

diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 4ca202a..e4d6042 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -592,6 +592,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	}
 
 	/* tiles */
+#if 0
 	if(params.device.type != DEVICE_CPU && !background) {
 		/* currently GPU could be much slower than CPU when using tiles,
 		 * still need to be investigated, but meanwhile make it possible
@@ -601,7 +602,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 
 		params.tile_size = make_int2(debug_tile_size, debug_tile_size);
 	}
-	else {
+	else
+#endif
+	{
 		int tile_x = b_engine.tile_x();
 		int tile_y = b_engine.tile_y();
 
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 0a6ac77..2f6a777 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -21,6 +21,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+#define ROUND_UP(x, multiple) (((((x) - 1 ) / (multiple)) + 1) * (multiple))
+
 DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
 {
 	path_iteration_times = PATH_ITER_INC_FACTOR;
@@ -65,25 +67,61 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
 }
 
 bool DeviceSplitKernel::path_trace(DeviceTask *task,
-                                   RenderTile& rtile,
-                                   int2 max_render_feasible_tile_size,
-                                   size_t per_thread_output_buffer_size,
+                                   RenderTile& tile,
                                    device_memory& kernel_data)
 {
-	device_memory& d_data = kernel_data;
+	/* TODO(mai): should be easy enough to remove these variables from tile */
+	/* Buffer and rng_state offset calc. */
+	size_t offset_index = tile.offset + (tile.x + tile.y * tile.stride);
+	size_t offset_x = offset_index % tile.stride;
+	size_t offset_y = offset_index / tile.stride;
+
+	tile.rng_state_offset_x = offset_x;
+	tile.rng_state_offset_y = offset_y;
+	tile.buffer_offset_x = offset_x;
+	tile.buffer_offset_y = offset_y;
+
+	tile.buffer_rng_state_stride = tile.stride;
+	tile.stride = tile.w;
 
 	/* Make sure that set render feasible tile size is a multiple of local
 	 * work size dimensions.
 	 */
-	assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
-	assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+	int2 max_render_feasible_tile_size;
+	const int2 tile_size = task->requested_tile_size;
+	max_render_feasible_tile_size.x = ROUND_UP(tile_size.x, SPLIT_KERNEL_LOCAL_SIZE_X);
+	max_render_feasible_tile_size.y = ROUND_UP(tile_size.y, SPLIT_KERNEL_LOCAL_SIZE_Y);
+
+	/* Calculate per_thread_output_buffer_size. */
+	size_t per_thread_output_buffer_size;
+	size_t output_buffer_size = tile.buffers->buffer.device_size;
+
+#if 0
+	/* This value is different when running on AMD and NV. */
+	if(device->background) {
+		/* In offline render the number of buffer elements
+		 * associated with tile.buffer is the current tile size.
+		 */
+		per_thread_output_buffer_size =
+			output_buffer_size / (tile.w * tile.h);
+	}
+	else
+#endif
+	{
+		/* interactive rendering, unlike offline render, the number of buffer elements
+		 * associated with tile.buffer is the entire viewport size.
+		 */
+		per_thread_output_buffer_size =
+			output_buffer_size / (tile.buffers->params.width *
+			                      tile.buffers->params.height);
+	}
 
 	size_t global_size[2];
 	size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
 	                        SPLIT_KERNEL_LOCAL_SIZE_Y};
 
-	int d_w = rtile.w;
-	int d_h = rtile.h;
+	int d_w = tile.w;
+	int d_h = tile.h;
 
 #ifdef __WORK_STEALING__
 	global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
@@ -98,7 +136,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 	 * processed in parallel.
 	 */
 	unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
-	                                        rtile.num_samples);
+	                                        tile.num_samples);
 	/* Wavefront size in AMD is 64.
 	 * TODO(sergey): What about other platforms?
 	 */
@@ -148,11 +186,11 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 	}
 
 	if(!device->enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
-	                                           rtile,
+	                                           tile,
 	                                           num_global_elements,
 	                                           num_parallel_samples,
 	                                           kgbuffer,
-	                                           d_data,
+	                                           kernel_data,
 	                                           split_data,
 	                                           ray_state,
 	                                           queue_index,
@@ -164,7 +202,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 	}
 
 #define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
-		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, d_data)) { \
+		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
 			return false; \
 		}
 
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 201f8b7..6105143 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -92,8 +92,6 @@ public:
 	bool load_kernels(const DeviceRequestedFeatures& requested_features);
 	bool path_trace(DeviceTask *task,
 	                RenderTile& rtile,
-	                int2 max_render_feasible_tile_size,
-	                size_t per_thread_output_buffer_size,
 	                device_memory& kernel_data);
 };
 
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index d78b329..9615501 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -74,63 +74,12 @@ public:
 
 	OpenCLProgram program_data_init;
 
-	/* Global memory variables [porting]; These memory is used for
-	 * co-operation between different kernels; Data written by one
-	 * kernel will be available to another kernel via this global
-	 * memory.
-	 */
-
-	/* Amount of memory in output buffer associated with one pixel/thread. */
-	size_t per_thread_output_buffer_size;
-
-	/* Total allocatable available device memory. */
-	size_t total_allocatable_memory;
-
-	/* clos_max value for which the kernels have been loaded currently. */
-	int current_max_closure;
-
 	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
 	: OpenCLDeviceBase(info, stats, background_)
 	{
 		split_kernel = new DeviceSplitKernel(this);
 
 		background = background_;
-
-		per_thread_output_buffer_size = 0;
-		current_max_closure = -1;
-
-		/* Get device's maximum memory that can be allocated. */
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
-		                        sizeof(size_t),
-		                        &total_allocatable_memory,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(platform_name == "AMD Accelerated Parallel Processing") {
-			/* This value is tweak-able; AMD platform does not seem to
-			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
-			 * is considered for further computation.
-			 */
-			total_allocatable_memory /= 2;
-		}
-	}
-
-	/* Split kernel utility functions. */
-	size_t get_tex_size(const char *tex_name)
-	{
-		cl_mem ptr;
-		size_t ret_size = 0;
-		MemMap::iterator i = mem_map.find(tex_name);
-		if(i != mem_map.end()) {
-			ptr = CL_MEM_PTR(i->second);
-			ciErr = clGetMemObjectInfo(ptr,
-			                           CL_MEM_SIZE,
-			                           sizeof(ret_size),
-			                           &ret_size,
-			                           NULL);
-			assert(ciErr == CL_SUCCESS);
-		}
-		return ret_size;
 	}
 
 	/* Returns size of KernelGlobals structure associated with OpenCL. */
@@ -187,8 +136,6 @@ public:
 		program_data_init.add_kernel(ustring("path_trace_data_init"));
 		programs.push_back(&program_data_init);
 
-		current_max_closure = requested_features.max_closure;
-
 		return split_kernel->load_kernels(requested_features);
 	}
 
@@ -309,269 +256,6 @@ public:
 		return true;
 	}
 
-	void path_trace(DeviceTask *task,
-	                RenderTile& rtile,
-	                int2 max_render_feasible_tile_size)
-	{
-		split_kernel->path_trace(task,
-		                         rtile,
-		                         max_render_feasible_tile_size,
-		                         per_thread_output_buffer_size,
-		                         *const_mem_map["__data"]);
-	}
-
-	/* Calculates the amount of memory that has to be always
-	 * allocated in order for the split kernel to function.
-	 * This memory is tile/scene-property invariant (meaning,
-	 * the value returned by this function does not depend
-	 * on the user set tile size or scene properties.
-	 */
-	size_t get_invariable_mem_allocated()
-	{
-		size_t total_invariable_mem_allocated = 0;
-		size_t KernelGlobals_size = 0;
-
-		KernelGlobals_size = sizeof_KernelGlobals();
-
-		total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
-		total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
-		total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
-
-		return total_invariable_mem_allocated;
-	}
-
-	/* Calculate the memory that has-to-be/has-been allocated for
-	 * the split kernel to function.
-	 */
-	size_t get_tile_specific_mem_allocated(const int2 tile_size)
-	{
-		size_t tile_specific_mem_allocated = 0;
-
-		/* Get required tile info */
-		unsigned int user_set_tile_w = tile_size.x;
-		unsigned int user_set_tile_h = tile_size.y;
-
-#ifdef __WORK_STEALING__
-		/* Calculate memory to be allocated for work_pools in
-		 * case of work_stealing.
-		 */
-		size_t max_global_size[2];
-	

@@ Diff output truncated at 10240 characters. @@