[Bf-blender-cvs] [28b8751] cycles_kernel_split: Support rendering of big tile sizes

Wed Apr 15 17:36:56 CEST 2015

Commit: 28b87518a1ad20894b4e29fc50e04f0c424a125d
Author: varunsundar08
Date:   Wed Apr 8 19:40:47 2015 +0530
Branches: cycles_kernel_split
https://developer.blender.org/rB28b87518a1ad20894b4e29fc50e04f0c424a125d

Support rendering of big tile sizes

===================================================================

M	intern/cycles/device/device_opencl.cpp
M	intern/cycles/kernel/kernel_Background_BufferUpdate.cl
M	intern/cycles/kernel/kernel_DataInit.cl
M	intern/cycles/kernel/kernel_SumAllRadiance.cl
M	intern/cycles/render/buffers.h

===================================================================

diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 103f710..58de549 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -544,15 +544,6 @@ public:
 	size_t PathState_size;
 	size_t Intersection_size;
 
-	/* Volume of ShaderData; ShaderData (in split_kernel) is a
-	 * Structure-Of-Arrays implementation; We need to calculate memory
-	 * required for a single thread
-	 */
-	size_t ShaderData_volume;
-
-	/* This is total ShaderClosure size required for one thread */
-	size_t ShaderClosure_size;
-
 	/* Sizes of memory required for shadow blocked function */
 	size_t AOAlpha_size;
 	size_t AOBSDF_size;
@@ -562,7 +553,7 @@ public:
 	size_t Intersection_coop_AO_size;
 	size_t Intersection_coop_DL_size;
 
-	/* This is sizeof_output_buffer / tile_size */
+	/* Amount of memory in output buffer associated with one pixel */
 	size_t per_thread_output_buffer_size;
 
 	/* Total allocatable available device memory */
@@ -595,9 +586,6 @@ public:
 	unsigned int max_work_groups;
 #endif
 
-	/* Flag denoting if rendering the scene with current tile size is possible */
-	bool cannot_render_scene;
-
 	/* Marked True in constructor and marked false at the end of path_trace() */
 	bool first_tile;
 
@@ -857,6 +845,7 @@ public:
 		Intersection_coop_DL_size = sizeof(Intersection);
 
 		per_thread_output_buffer_size = 0;
+
 		per_thread_memory = 0;
 		render_scene_input_data_size = 0;
 		hostRayStateArray = NULL;
@@ -865,7 +854,6 @@ public:
 		work_pool_wgs = NULL;
 		max_work_groups = 0;
 #endif
-		cannot_render_scene = false;
 		first_tile = true;
 
 #else
@@ -2396,6 +2384,37 @@ public:
 	}
 #endif
 
+#ifdef __SPLIT_KERNEL__
+	/* Returns size of KernelGlobals structure associated with OpenCL */
+	size_t get_KernelGlobals_size() {
+		/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to fetch its size */
+		typedef struct KernelGlobals {
+			ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+			ccl_global type *name;
+#include "kernel_textures.h"
+		} KernelGlobals;
+
+		return sizeof(KernelGlobals);
+	}
+
+	/* Returns size of the Structure-of-Arrays (SoA) implementation of ShaderData */
+	size_t get_shaderdata_soa_size() {
+		size_t num_shader_soa_ptr = SD_NUM_FLOAT3 + SD_NUM_INT + SD_NUM_FLOAT
+#ifdef __DPDU__
+			+ SD_NUM_DPDU_FLOAT3
+#endif
+#ifdef __RAY_DIFFERENTIAL__
+			+ SD_NUM_RAY_DIFFERENTIALS_DIFFERENTIAL3
+			+ SD_NUM_DIFFERENTIAL
+#endif
+			+ SD_NUM_RAY_DP_DIFFERENTIAL3;
+
+		return (num_shader_soa_ptr * sizeof(void *));
+	}
+
+#endif
+
 	void path_trace(RenderTile& rtile, int sample)
 	{
 		/* cast arguments to cl types */
@@ -2409,11 +2428,6 @@ public:
 		cl_int d_offset = rtile.offset;
 		cl_int d_stride = rtile.stride;
 #ifdef __SPLIT_KERNEL__
-		(void)sample;
-
-		if(cannot_render_scene) {
-			return;
-		}
 
 		/* ray_state and hostRayStateArray should be of same size */
 		assert(hostRayState_size == rayState_size);
@@ -2422,25 +2436,39 @@ public:
 		size_t global_size[2];
 		size_t local_size[2] = { SPLIT_KERNEL_LOCAL_SIZE_X, SPLIT_KERNEL_LOCAL_SIZE_Y };
 
+		/* Set the range of samples to be processed for every ray in path-regeneration logic */
+		cl_int start_sample = rtile.start_sample;
+		cl_int end_sample = rtile.start_sample + rtile.num_samples;
+		cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
+		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+		unsigned int num_parallel_samples = 1;
+#else
+		/* We may not need all global_size[0] threads; We only need as much as num_parallel_samples * d_w */
+		global_size[0] = num_parallel_samples * d_w;
+		global_size[0] = (((global_size[0] - 1) / local_size[0]) + 1) * local_size[0];
+
+		assert(global_size[0] * global_size[1] <= num_parallel_threads);
+		assert(global_size[0] * global_size[1] >= d_w * d_h);
+#endif // __WORK_STEALING__
+
+		/* Allocate all required global memory once */
 		if(first_tile) {
+			size_t num_global_elements = rtile.max_render_feasible_tile_size.x * rtile.max_render_feasible_tile_size.y;
 
 #ifdef __MULTI_CLOSURE__
-			ShaderClosure_size = get_shader_closure_size(clos_max);
+			size_t ShaderClosure_size = get_shader_closure_size(clos_max);
 #else
-			ShaderClosure_size = get_shader_closure_size(MAX_CLOSURE);
+			size_t ShaderClosure_size = get_shader_closure_size(MAX_CLOSURE);
 #endif
-			ShaderData_volume = get_shader_data_size(ShaderClosure_size);
-
-			/* Determine texture memories once */
-#define KERNEL_TEX(type, ttype, name) \
-			render_scene_input_data_size += get_tex_size(#name);
-#include "kernel_textures.h"
 
 #ifdef __WORK_STEALING__
 			/* Calculate max groups */
 			size_t max_global_size[2];
-			size_t tile_x = rtile.tile_size.x;
-			size_t tile_y = rtile.tile_size.y;
+			size_t tile_x = rtile.max_render_feasible_tile_size.x;
+			size_t tile_y = rtile.max_render_feasible_tile_size.y;
 			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
 			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
 			max_work_groups = (max_global_size[0] * max_global_size[1]) / (local_size[0] * local_size[1]);
@@ -2457,457 +2485,314 @@ public:
 			use_queues_flag = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, sizeof(char), NULL, &ciErr);
 			assert(ciErr == CL_SUCCESS && "Can't create use_queues_flag memory");
 
-			/* Calculate per thread memory */
-			size_t output_buffer_size = 0;
-			ciErr = clGetMemObjectInfo(d_buffer, CL_MEM_SIZE, sizeof(output_buffer_size), &output_buffer_size, NULL);
-			assert(ciErr == CL_SUCCESS && "Can't get d_buffer mem object info");
-
-			/* This value is different when running on AMD and NV */
-			per_thread_output_buffer_size = output_buffer_size / (d_w * d_h);
-
-			per_thread_memory = rng_size + throughput_size + L_transparent_size + rayState_size + work_element_size
-				 + ISLamp_size + PathRadiance_size + Ray_size + PathState_size
-				 + Intersection_size                  /* Overall isect */
-				 + Intersection_coop_AO_size          /* Instersection_coop_AO */
-				 + Intersection_coop_DL_size          /* Intersection coop DL */
-				 + ShaderData_volume       /* Overall ShaderData */
-				 + ShaderData_volume       /* ShaderData_coop_DL */
-				 + (ShaderData_volume * 2) /* ShaderData coop shadow */
-				 + LightRay_size + BSDFEval_size + AOAlpha_size + AOBSDF_size + AOLightRay_size
-				 + (sizeof(int) * NUM_QUEUES)
-				 + per_thread_output_buffer_size;
-
-			int user_set_tile_w = rtile.tile_size.x;
-			int user_set_tile_h = rtile.tile_size.y;
-
-			total_allocatable_parallel_sample_processing_memory = total_allocatable_memory
-			- sizeof(int)* NUM_QUEUES                                                /* Queue index size */
-			- sizeof(char)                                                           /* use_queues */
-			-render_scene_input_data_size                                            /* size for textures, bvh etc */
-			- (user_set_tile_w * user_set_tile_h) * per_thread_output_buffer_size    /* max d_buffer size possible */
-			- (user_set_tile_w * user_set_tile_h) * sizeof(RNG)                      /* max d_rng_state size possible */
-#ifdef __WORK_STEALING__
-			- max_work_groups * sizeof(unsigned int)
-#endif
-			- DATA_ALLOCATION_MEM_FACTOR;
-		}
-
-		/* Set the range of samples to be processed for every ray in path-regeneration logic */
-		cl_int start_sample = rtile.start_sample;
-		cl_int end_sample = rtile.start_sample + rtile.num_samples;
-		cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
-		/* TODO : support dynamic num_parallel_samples in work_stealing
-		 * Do not change the values of num_parallel_samples/num_parallel_threads
-		 */
-		unsigned int num_parallel_samples = 0;
-		global_size[0] = (((rtile.tile_size.x - 1) / local_size[0]) + 1) * local_size[0];
-		global_size[1] = (((rtile.tile_size.y - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_parallel_threads = global_size[0] * global_size[1];
-
-		/* Check if we can process atleast one sample */
-		num_parallel_samples = (total_allocatable_parallel_sample_processing_memory / (per_thread_memory * num_parallel_threads));
-		num_parallel_samples = (num_parallel_samples > 0) ? 1 : 0;
-#else
-		unsigned int num_parallel_threads = total_allocatable_parallel_sample_processing_memory / per_thread_memory;
-
-		/* Estimate maximum global work size that can be launched */
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		global_size[0] = num_parallel_threads / global_size[1];
-		global_size[0] = (global_size[0] / local_size[0]) * local_size[0];
-
-		/* Estimate number of parallel samples that can be processed in parallel */
-		unsigned int num_parallel_samples = (global_size[0] / d_w) <= rtile.num_samples ? (global_size[0] / d_w) : rtile.num_samples;
-		/* Wavefront size in AMD is 64 */
-		num_parallel_samples = ((num_parallel_samples / 64) == 0) ?
-			num_parallel_samples :
-			(num_parallel_samples / 64) * 64;
-#endif
-
-		if(num_parallel_samples == 0) {
-			/* Rough estimate maximum rectangular tile size for this scene, to report to the user */
-			size_t scene_alloc_memory = total_allocatable_memory
-				- sizeof(int)* NUM_QUEUES
-				- sizeof(char)
-				-render_scene_input_data_size
-				- DATA_ALLOCATION_MEM_FACTOR;
-			unsigned int tile_max_x = 8, tile_max_y = 8;
-			bool max_rect_tile_reached = false;
-			while(!max_rect_tile_reached) {
-				unsigned int num_parallel_samples_possible = 0;
-#ifdef __WORK_STEALING__
-				unsigned int current_max_global_size[2];
-				current_max_global_size[0] = (((tile_max_x - 1) / local_size[0]) + 1) * local_size[0];
-				current_max_global_size[1] = (((tile_max_y - 1) / local_size[1]) + 1) * local_size[1];
-				unsigned int current_max_work_groups = (current_max_global_size[0] * current_max_global_size[1]) / (local_size[0] * local_size[1]);
-#endif
-				size_t memory_for_parallel_sample_processing = scene_alloc_memory
-#ifdef __WORK_STEALING__
-					- current_max_work_groups * sizeof(unsigned int)
-#endif
-					- (tile_max_x * tile_max_y) * per_thread_output_buffer_size
-					- (tile_max_x * tile_max_y) * size

@@ Diff output truncated at 10240 characters. @@