[Bf-blender-cvs] [28b8751] cycles_kernel_split: Support rendering of big tile sizes
varunsundar08
noreply at git.blender.org
Wed Apr 15 17:36:56 CEST 2015
Commit: 28b87518a1ad20894b4e29fc50e04f0c424a125d
Author: varunsundar08
Date: Wed Apr 8 19:40:47 2015 +0530
Branches: cycles_kernel_split
https://developer.blender.org/rB28b87518a1ad20894b4e29fc50e04f0c424a125d
Support rendering of big tile sizes
===================================================================
M intern/cycles/device/device_opencl.cpp
M intern/cycles/kernel/kernel_Background_BufferUpdate.cl
M intern/cycles/kernel/kernel_DataInit.cl
M intern/cycles/kernel/kernel_SumAllRadiance.cl
M intern/cycles/render/buffers.h
===================================================================
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 103f710..58de549 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -544,15 +544,6 @@ public:
size_t PathState_size;
size_t Intersection_size;
- /* Volume of ShaderData; ShaderData (in split_kernel) is a
- * Structure-Of-Arrays implementation; We need to calculate memory
- * required for a single thread
- */
- size_t ShaderData_volume;
-
- /* This is total ShaderClosure size required for one thread */
- size_t ShaderClosure_size;
-
/* Sizes of memory required for shadow blocked function */
size_t AOAlpha_size;
size_t AOBSDF_size;
@@ -562,7 +553,7 @@ public:
size_t Intersection_coop_AO_size;
size_t Intersection_coop_DL_size;
- /* This is sizeof_output_buffer / tile_size */
+ /* Amount of memory in output buffer associated with one pixel */
size_t per_thread_output_buffer_size;
/* Total allocatable available device memory */
@@ -595,9 +586,6 @@ public:
unsigned int max_work_groups;
#endif
- /* Flag denoting if rendering the scene with current tile size is possible */
- bool cannot_render_scene;
-
/* Marked True in constructor and marked false at the end of path_trace() */
bool first_tile;
@@ -857,6 +845,7 @@ public:
Intersection_coop_DL_size = sizeof(Intersection);
per_thread_output_buffer_size = 0;
+
per_thread_memory = 0;
render_scene_input_data_size = 0;
hostRayStateArray = NULL;
@@ -865,7 +854,6 @@ public:
work_pool_wgs = NULL;
max_work_groups = 0;
#endif
- cannot_render_scene = false;
first_tile = true;
#else
@@ -2396,6 +2384,37 @@ public:
}
#endif
+#ifdef __SPLIT_KERNEL__
+ /* Returns size of KernelGlobals structure associated with OpenCL */
+ size_t get_KernelGlobals_size() {
+ /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to fetch its size */
+ typedef struct KernelGlobals {
+ ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+ ccl_global type *name;
+#include "kernel_textures.h"
+ } KernelGlobals;
+
+ return sizeof(KernelGlobals);
+ }
+
+ /* Returns size of the Structure-of-Arrays implementation of ShaderData */
+ size_t get_shaderdata_soa_size() {
+ size_t num_shader_soa_ptr = SD_NUM_FLOAT3 + SD_NUM_INT + SD_NUM_FLOAT
+#ifdef __DPDU__
+ + SD_NUM_DPDU_FLOAT3
+#endif
+#ifdef __RAY_DIFFERENTIAL__
+ + SD_NUM_RAY_DIFFERENTIALS_DIFFERENTIAL3
+ + SD_NUM_DIFFERENTIAL
+#endif
+ + SD_NUM_RAY_DP_DIFFERENTIAL3;
+
+ return (num_shader_soa_ptr * sizeof(void *));
+ }
+
+#endif
+
void path_trace(RenderTile& rtile, int sample)
{
/* cast arguments to cl types */
@@ -2409,11 +2428,6 @@ public:
cl_int d_offset = rtile.offset;
cl_int d_stride = rtile.stride;
#ifdef __SPLIT_KERNEL__
- (void)sample;
-
- if(cannot_render_scene) {
- return;
- }
/* ray_state and hostRayStateArray should be of same size */
assert(hostRayState_size == rayState_size);
@@ -2422,25 +2436,39 @@ public:
size_t global_size[2];
size_t local_size[2] = { SPLIT_KERNEL_LOCAL_SIZE_X, SPLIT_KERNEL_LOCAL_SIZE_Y };
+ /* Set the range of samples to be processed for every ray in path-regeneration logic */
+ cl_int start_sample = rtile.start_sample;
+ cl_int end_sample = rtile.start_sample + rtile.num_samples;
+ cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+ global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
+ global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+ unsigned int num_parallel_samples = 1;
+#else
+ /* We may not need all global_size[0] threads; we only need as many as num_parallel_samples * d_w */
+ global_size[0] = num_parallel_samples * d_w;
+ global_size[0] = (((global_size[0] - 1) / local_size[0]) + 1) * local_size[0];
+
+ assert(global_size[0] * global_size[1] <= num_parallel_threads);
+ assert(global_size[0] * global_size[1] >= d_w * d_h);
+#endif // __WORK_STEALING__
+
+ /* Allocate all required global memory once */
if(first_tile) {
+ size_t num_global_elements = rtile.max_render_feasible_tile_size.x * rtile.max_render_feasible_tile_size.y;
#ifdef __MULTI_CLOSURE__
- ShaderClosure_size = get_shader_closure_size(clos_max);
+ size_t ShaderClosure_size = get_shader_closure_size(clos_max);
#else
- ShaderClosure_size = get_shader_closure_size(MAX_CLOSURE);
+ size_t ShaderClosure_size = get_shader_closure_size(MAX_CLOSURE);
#endif
- ShaderData_volume = get_shader_data_size(ShaderClosure_size);
-
- /* Determine texture memories once */
-#define KERNEL_TEX(type, ttype, name) \
- render_scene_input_data_size += get_tex_size(#name);
-#include "kernel_textures.h"
#ifdef __WORK_STEALING__
/* Calculate max groups */
size_t max_global_size[2];
- size_t tile_x = rtile.tile_size.x;
- size_t tile_y = rtile.tile_size.y;
+ size_t tile_x = rtile.max_render_feasible_tile_size.x;
+ size_t tile_y = rtile.max_render_feasible_tile_size.y;
max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
max_work_groups = (max_global_size[0] * max_global_size[1]) / (local_size[0] * local_size[1]);
@@ -2457,457 +2485,314 @@ public:
use_queues_flag = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, sizeof(char), NULL, &ciErr);
assert(ciErr == CL_SUCCESS && "Can't create use_queues_flag memory");
- /* Calculate per thread memory */
- size_t output_buffer_size = 0;
- ciErr = clGetMemObjectInfo(d_buffer, CL_MEM_SIZE, sizeof(output_buffer_size), &output_buffer_size, NULL);
- assert(ciErr == CL_SUCCESS && "Can't get d_buffer mem object info");
-
- /* This value is different when running on AMD and NV */
- per_thread_output_buffer_size = output_buffer_size / (d_w * d_h);
-
- per_thread_memory = rng_size + throughput_size + L_transparent_size + rayState_size + work_element_size
- + ISLamp_size + PathRadiance_size + Ray_size + PathState_size
- + Intersection_size /* Overall isect */
- + Intersection_coop_AO_size /* Instersection_coop_AO */
- + Intersection_coop_DL_size /* Intersection coop DL */
- + ShaderData_volume /* Overall ShaderData */
- + ShaderData_volume /* ShaderData_coop_DL */
- + (ShaderData_volume * 2) /* ShaderData coop shadow */
- + LightRay_size + BSDFEval_size + AOAlpha_size + AOBSDF_size + AOLightRay_size
- + (sizeof(int) * NUM_QUEUES)
- + per_thread_output_buffer_size;
-
- int user_set_tile_w = rtile.tile_size.x;
- int user_set_tile_h = rtile.tile_size.y;
-
- total_allocatable_parallel_sample_processing_memory = total_allocatable_memory
- - sizeof(int)* NUM_QUEUES /* Queue index size */
- - sizeof(char) /* use_queues */
- -render_scene_input_data_size /* size for textures, bvh etc */
- - (user_set_tile_w * user_set_tile_h) * per_thread_output_buffer_size /* max d_buffer size possible */
- - (user_set_tile_w * user_set_tile_h) * sizeof(RNG) /* max d_rng_state size possible */
-#ifdef __WORK_STEALING__
- - max_work_groups * sizeof(unsigned int)
-#endif
- - DATA_ALLOCATION_MEM_FACTOR;
- }
-
- /* Set the range of samples to be processed for every ray in path-regeneration logic */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
- cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
- /* TODO : support dynamic num_parallel_samples in work_stealing
- * Do not change the values of num_parallel_samples/num_parallel_threads
- */
- unsigned int num_parallel_samples = 0;
- global_size[0] = (((rtile.tile_size.x - 1) / local_size[0]) + 1) * local_size[0];
- global_size[1] = (((rtile.tile_size.y - 1) / local_size[1]) + 1) * local_size[1];
- unsigned int num_parallel_threads = global_size[0] * global_size[1];
-
- /* Check if we can process atleast one sample */
- num_parallel_samples = (total_allocatable_parallel_sample_processing_memory / (per_thread_memory * num_parallel_threads));
- num_parallel_samples = (num_parallel_samples > 0) ? 1 : 0;
-#else
- unsigned int num_parallel_threads = total_allocatable_parallel_sample_processing_memory / per_thread_memory;
-
- /* Estimate maximum global work size that can be launched */
- global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
- global_size[0] = num_parallel_threads / global_size[1];
- global_size[0] = (global_size[0] / local_size[0]) * local_size[0];
-
- /* Estimate number of parallel samples that can be processed in parallel */
- unsigned int num_parallel_samples = (global_size[0] / d_w) <= rtile.num_samples ? (global_size[0] / d_w) : rtile.num_samples;
- /* Wavefront size in AMD is 64 */
- num_parallel_samples = ((num_parallel_samples / 64) == 0) ?
- num_parallel_samples :
- (num_parallel_samples / 64) * 64;
-#endif
-
- if(num_parallel_samples == 0) {
- /* Rough estimate maximum rectangular tile size for this scene, to report to the user */
- size_t scene_alloc_memory = total_allocatable_memory
- - sizeof(int)* NUM_QUEUES
- - sizeof(char)
- -render_scene_input_data_size
- - DATA_ALLOCATION_MEM_FACTOR;
- unsigned int tile_max_x = 8, tile_max_y = 8;
- bool max_rect_tile_reached = false;
- while(!max_rect_tile_reached) {
- unsigned int num_parallel_samples_possible = 0;
-#ifdef __WORK_STEALING__
- unsigned int current_max_global_size[2];
- current_max_global_size[0] = (((tile_max_x - 1) / local_size[0]) + 1) * local_size[0];
- current_max_global_size[1] = (((tile_max_y - 1) / local_size[1]) + 1) * local_size[1];
- unsigned int current_max_work_groups = (current_max_global_size[0] * current_max_global_size[1]) / (local_size[0] * local_size[1]);
-#endif
- size_t memory_for_parallel_sample_processing = scene_alloc_memory
-#ifdef __WORK_STEALING__
- - current_max_work_groups * sizeof(unsigned int)
-#endif
- - (tile_max_x * tile_max_y) * per_thread_output_buffer_size
- - (tile_max_x * tile_max_y) * size
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list