[Bf-blender-cvs] [0f56987] cycles_kernel_split: Cycles: OpenCL kernel-splitting work
George Kyriazis
noreply at git.blender.org
Mon Mar 30 17:27:02 CEST 2015
Commit: 0f56987ebabdef2d0d71665c86b047e28926aef8
Author: George Kyriazis
Date: Mon Mar 30 15:23:12 2015 +0500
Branches: cycles_kernel_split
https://developer.blender.org/rB0f56987ebabdef2d0d71665c86b047e28926aef8
Cycles: OpenCL kernel-splitting work
This patch contains work to split the OpenCL mega-kernel into separate kernels
to obtain better GPU utilization and therefore performance.
A description of the optimizations included in this patch is located at
https://docs.google.com/document/d/1LuXW-CV-sVJkQaEGZlMJ86jZ8FmoPfecaMdR-oiWbUY/edit?usp=sharing
===================================================================
M intern/cycles/device/device.h
M intern/cycles/device/device_opencl.cpp
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/closure/bsdf.h
M intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
M intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
M intern/cycles/kernel/closure/bsdf_diffuse.h
M intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
M intern/cycles/kernel/closure/bsdf_hair.h
M intern/cycles/kernel/closure/bsdf_microfacet.h
M intern/cycles/kernel/closure/bsdf_oren_nayar.h
M intern/cycles/kernel/closure/bsdf_phong_ramp.h
M intern/cycles/kernel/closure/bsdf_reflection.h
M intern/cycles/kernel/closure/bsdf_refraction.h
M intern/cycles/kernel/closure/bsdf_toon.h
M intern/cycles/kernel/closure/bsdf_transparent.h
M intern/cycles/kernel/closure/bssrdf.h
M intern/cycles/kernel/geom/geom_attribute.h
M intern/cycles/kernel/geom/geom_bvh.h
M intern/cycles/kernel/geom/geom_bvh_shadow.h
M intern/cycles/kernel/geom/geom_bvh_subsurface.h
M intern/cycles/kernel/geom/geom_bvh_traversal.h
M intern/cycles/kernel/geom/geom_bvh_volume.h
M intern/cycles/kernel/geom/geom_curve.h
M intern/cycles/kernel/geom/geom_motion_curve.h
M intern/cycles/kernel/geom/geom_motion_triangle.h
M intern/cycles/kernel/geom/geom_object.h
M intern/cycles/kernel/geom/geom_primitive.h
M intern/cycles/kernel/geom/geom_qbvh.h
M intern/cycles/kernel/geom/geom_qbvh_traversal.h
M intern/cycles/kernel/geom/geom_triangle.h
M intern/cycles/kernel/geom/geom_triangle_intersect.h
M intern/cycles/kernel/geom/geom_volume.h
M intern/cycles/kernel/kernel.cl
A intern/cycles/kernel/kernel_Background_BufferUpdate.cl
A intern/cycles/kernel/kernel_DataInit.cl
A intern/cycles/kernel/kernel_DirectLighting.cl
A intern/cycles/kernel/kernel_Holdout_Emission_Blurring_Pathtermination_AO.cl
A intern/cycles/kernel/kernel_LampEmission.cl
A intern/cycles/kernel/kernel_NextIterationSetUp.cl
A intern/cycles/kernel/kernel_QueueEnqueue.cl
A intern/cycles/kernel/kernel_SceneIntersect.cl
A intern/cycles/kernel/kernel_ShaderEval.cl
A intern/cycles/kernel/kernel_ShadowBlocked.cl
A intern/cycles/kernel/kernel_Subsurface.cl
A intern/cycles/kernel/kernel_SumAllRadiance.cl
M intern/cycles/kernel/kernel_accumulate.h
M intern/cycles/kernel/kernel_camera.h
M intern/cycles/kernel/kernel_differential.h
M intern/cycles/kernel/kernel_emission.h
M intern/cycles/kernel/kernel_globals.h
M intern/cycles/kernel/kernel_light.h
M intern/cycles/kernel/kernel_passes.h
M intern/cycles/kernel/kernel_path.h
M intern/cycles/kernel/kernel_path_state.h
M intern/cycles/kernel/kernel_path_surface.h
M intern/cycles/kernel/kernel_path_volume.h
M intern/cycles/kernel/kernel_projection.h
A intern/cycles/kernel/kernel_queues.h
M intern/cycles/kernel/kernel_random.h
M intern/cycles/kernel/kernel_shader.h
M intern/cycles/kernel/kernel_shadow.h
A intern/cycles/kernel/kernel_split.h
M intern/cycles/kernel/kernel_subsurface.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernel_volume.h
A intern/cycles/kernel/kernel_work_stealing.h
M intern/cycles/kernel/svm/svm.h
M intern/cycles/kernel/svm/svm_attribute.h
M intern/cycles/kernel/svm/svm_blackbody.h
M intern/cycles/kernel/svm/svm_brick.h
M intern/cycles/kernel/svm/svm_brightness.h
M intern/cycles/kernel/svm/svm_camera.h
M intern/cycles/kernel/svm/svm_checker.h
M intern/cycles/kernel/svm/svm_closure.h
M intern/cycles/kernel/svm/svm_convert.h
M intern/cycles/kernel/svm/svm_displace.h
M intern/cycles/kernel/svm/svm_fresnel.h
M intern/cycles/kernel/svm/svm_gamma.h
M intern/cycles/kernel/svm/svm_geometry.h
M intern/cycles/kernel/svm/svm_gradient.h
M intern/cycles/kernel/svm/svm_hsv.h
M intern/cycles/kernel/svm/svm_image.h
M intern/cycles/kernel/svm/svm_invert.h
M intern/cycles/kernel/svm/svm_light_path.h
M intern/cycles/kernel/svm/svm_magic.h
M intern/cycles/kernel/svm/svm_mapping.h
M intern/cycles/kernel/svm/svm_math.h
M intern/cycles/kernel/svm/svm_mix.h
M intern/cycles/kernel/svm/svm_musgrave.h
M intern/cycles/kernel/svm/svm_noisetex.h
M intern/cycles/kernel/svm/svm_normal.h
M intern/cycles/kernel/svm/svm_ramp.h
M intern/cycles/kernel/svm/svm_sepcomb_hsv.h
M intern/cycles/kernel/svm/svm_sepcomb_vector.h
M intern/cycles/kernel/svm/svm_sky.h
M intern/cycles/kernel/svm/svm_tex_coord.h
M intern/cycles/kernel/svm/svm_types.h
M intern/cycles/kernel/svm/svm_value.h
M intern/cycles/kernel/svm/svm_vector_transform.h
M intern/cycles/kernel/svm/svm_voronoi.h
M intern/cycles/kernel/svm/svm_wave.h
M intern/cycles/kernel/svm/svm_wavelength.h
M intern/cycles/kernel/svm/svm_wireframe.h
M intern/cycles/render/buffers.h
M intern/cycles/render/graph.cpp
M intern/cycles/render/graph.h
M intern/cycles/render/nodes.cpp
M intern/cycles/render/session.cpp
M intern/cycles/render/svm.cpp
M intern/cycles/render/tile.h
===================================================================
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 7c17f7f..e89a10b 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -29,6 +29,10 @@
#include "util_types.h"
#include "util_vector.h"
+#include <set>
+
+#define SPLIT_KERNEL_CLOSURE_COUNT 1
+
CCL_NAMESPACE_BEGIN
class Progress;
@@ -94,6 +98,12 @@ public:
/* statistics */
Stats &stats;
+ /* Maximum closure count */
+ int clos_max;
+
+ /* Get all closure nodes associated with the scene */
+ std::set<int> closure_nodes;
+
/* regular memory */
virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
virtual void mem_copy_to(device_memory& mem) = 0;
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index a5bf35a..ae8a029 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -25,6 +25,13 @@
#include "device.h"
#include "device_intern.h"
+/* Hack: we need the OpenCL feature set that matches each vendor instead of the default CPU one;
+ * for now there is only one OpenCL feature set, shared by NVIDIA and AMD.
+ */
+
+#define __KERNEL_OPENCL__ 1
+#define __SPLIT_KERNEL__ 1
+
#include "buffers.h"
#include "util_foreach.h"
@@ -39,21 +46,57 @@ CCL_NAMESPACE_BEGIN
#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
+#if __SPLIT_KERNEL__
+/* This value may be tuned according to the scene we are rendering */
+#define PATH_ITER_INC_FACTOR 8
+
+/*
+ * When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes, since some
+ * bytes may be needed for aligning the chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000; //5MB
+
+/* Additional kernel build options regarding optimization */
+string opt;
+/* Additional kernel build option denoting compute device type */
+string compute_device_type_build_option;
+
+/* Shader data variable count - To calculate ShaderData size */
+#define SD_NUM_FLOAT3 5
+#ifdef __DPDU__
+#define SD_NUM_DPDU_FLOAT3 2
+#endif
+#define SD_NUM_INT 8
+#define SD_NUM_FLOAT 5
+#ifdef __RAY_DIFFERENTIALS__
+#define SD_NUM_RAY_DIFFERENTIALS_DIFFERENTIAL3 2
+#define SD_NUM_DIFFERENTIAL 2
+#endif
+#define SD_NUM_RAY_DP_DIFFERENTIAL3 1
+#endif
+
static cl_device_type opencl_device_type()
{
char *device = getenv("CYCLES_OPENCL_TEST");
if(device) {
- if(strcmp(device, "ALL") == 0)
+ if (strcmp(device, "ALL") == 0) {
return CL_DEVICE_TYPE_ALL;
- else if(strcmp(device, "DEFAULT") == 0)
+ }
+ else if (strcmp(device, "DEFAULT") == 0) {
return CL_DEVICE_TYPE_DEFAULT;
- else if(strcmp(device, "CPU") == 0)
+ }
+ else if (strcmp(device, "CPU") == 0) {
return CL_DEVICE_TYPE_CPU;
- else if(strcmp(device, "GPU") == 0)
+ }
+ else if (strcmp(device, "GPU") == 0) {
return CL_DEVICE_TYPE_GPU;
- else if(strcmp(device, "ACCELERATOR") == 0)
+ }
+ else if (strcmp(device, "ACCELERATOR") == 0) {
return CL_DEVICE_TYPE_ACCELERATOR;
+ }
}
return CL_DEVICE_TYPE_ALL;
@@ -81,7 +124,13 @@ static bool opencl_kernel_use_advanced_shading(const string& platform)
static string opencl_kernel_build_options(const string& platform, const string *debug_src = NULL)
{
- string build_options = " -cl-fast-relaxed-math ";
+#if __SPLIT_KERNEL__
+ string build_options = " -cl-fast-relaxed-math -D__SPLIT_KERNEL__=1 ";
+ build_options.append(opt);
+ build_options.append(compute_device_type_build_option);
+#else
+ string build_options = " -cl-fast-relaxed-math -D__SPLIT_KERNEL__=0 ";
+#endif
if(platform == "NVIDIA CUDA")
build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=32 -cl-nv-verbose ";
@@ -100,12 +149,15 @@ static string opencl_kernel_build_options(const string& platform, const string *
build_options += "-g -s \"" + *debug_src + "\"";
}
+#if !__SPLIT_KERNEL__
+ /* kernel debug currently not supported in __SPLIT_KERNEL__ */
if(opencl_kernel_use_debug())
build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
#ifdef WITH_CYCLES_DEBUG
build_options += "-D__KERNEL_DEBUG__ ";
#endif
+#endif
return build_options;
}
@@ -321,13 +373,243 @@ public:
cl_command_queue cqCommandQueue;
cl_platform_id cpPlatform;
cl_device_id cdDevice;
- cl_program cpProgram;
+ cl_int ciErr;
+
+#if __SPLIT_KERNEL__
+ /* Kernel declaration */
+ cl_kernel ckPathTraceKernel_DataInit_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_SceneIntersect_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_LampEmission_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_QueueEnqueue_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_BG_BufferUpdate_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_Shader_Lighting_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_Holdout_Emission_Blurring_Pathtermination_AO_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_Subsurface_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_DirectLighting_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_ShadowBlocked_DirectLighting_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_SetUpNextIteration_SPLIT_KERNEL;
+ cl_kernel ckPathTraceKernel_SumAllRadiance_SPLIT_KERNEL;
+
+ /* Global memory variables [porting]; This memory is used for
+ * co-operation between different kernels; Data written by one
+ * kernel will be available to another kernel via this global
+ * memory
+ */
+ cl_mem rng_coop;
+ cl_mem throughput_coop;
+ cl_mem L_transparent_coop;
+ cl_mem PathRadiance_coop;
+ cl_mem Ray_coop;
+ cl_mem PathState_coop;
+ cl_mem Intersection_coop;
+ cl_mem ShaderData_coop;
+ cl_mem ShaderData_coop_DL;
+ cl_mem ShaderData_coop_shadow;
+
+ /* KernelGlobals buffer */
+ cl_mem kgbuffer;
+
+ /* global buffers for ShaderData */
+ cl_mem sd; /* ShaderData used in the main path-iteration loop */
+ cl_mem sd_dl; /* ShaderData used in DirectLighting kernel */
+ cl_mem sd_shadow; /* ShaderData used in ShadowBlocked kernel */
+
+ /* global buffers of each member of ShaderData */
+ cl_mem P_sd;
+ cl_mem P_sd_dl;
+ cl_mem P_sd_shadow;
+ cl_mem N_sd;
+ cl_mem N_sd_dl;
+ cl_mem N_sd_shadow;
+ cl_mem Ng_sd;
+ cl_mem Ng_sd_dl;
+ cl_mem Ng_sd_shadow;
+ cl_mem I_sd;
+ cl_mem I_sd_dl;
+ cl_mem I_sd_shadow;
+ cl_mem shader_sd;
+ cl_mem shader_sd_dl;
+ cl_mem shader_sd_shadow;
+ cl_mem flag_sd;
+ cl_mem flag_sd_dl;
+ cl_mem flag_sd_shadow;
+ cl_mem prim_sd;
+ cl_mem prim_sd_dl;
+ cl_mem prim_sd_shadow;
+ cl_mem type_sd;
+ cl_mem type_sd_dl;
+ cl_mem type_sd_shadow;
+ cl_mem u_sd;
+ cl_mem u_sd_dl;
+ cl_mem u_sd_shadow;
+ cl_mem v_sd;
+ cl_mem v_sd_dl;
+ cl_mem v_sd_shadow;
+ cl_mem object_sd;
+ cl_mem object_sd_dl;
+ cl_mem object_sd_shadow;
+ cl_mem time_sd;
+ cl_mem time_sd_dl;
+ cl_mem time_sd_shadow;
+ cl_mem ray_length_sd;
+ cl_mem ray_length_sd_dl;
+ cl_mem ray_length_sd_shadow;
+ cl_mem ray_depth_sd;
+ cl_mem ray_depth_sd_dl;
+ cl_mem ray_depth_sd_shadow;
+ cl_mem transparent_depth_sd;
+ cl_mem transparent_depth_sd_dl;
+ cl_mem transparent_depth_sd_shadow;
+#ifdef __RAY_DIFFERENTIALS__
+ cl_mem dP_sd,dI_sd;
+ cl_mem dP_sd_dl, dI_sd_dl;
+ cl_mem dP_sd_shadow, dI_sd_shadow;
+ cl_mem du_sd, dv_sd;
+ cl_mem du_sd_dl, dv_sd_dl;
+ cl_mem du_sd_shadow, dv_sd_shadow;
+#endif
+#ifdef __DPDU__
+ cl_mem dPdu_sd, dPdv_sd;
+ cl_mem dPdu_sd_dl, dPdv_sd_dl;
+ cl_mem dPdu_sd_shadow, dPdv_sd_shadow;
+#endif
+ cl_mem closure_sd;
+ cl_mem closure_sd_dl;
+ cl_mem closure_sd_shadow;
+ cl_mem num_closure_sd;
+ cl_mem num_closure_sd_dl;
+ cl_mem num_closure_sd_shadow;
+ cl_mem randb_closure_sd;
+ cl_mem randb_closure_sd_dl;
+ cl_mem randb_closure_sd_shadow;
+ cl_mem ray_P_sd;
+ cl_mem ray_P_sd_dl;
+ cl_mem ray_P_sd_shadow;
+ cl_mem ray_dP_sd;
+ cl_mem ray_dP_sd_dl;
+ cl_mem ray_dP_sd_shadow;
+
+ /* Global memory required for shadow blocked and accum_radiance */
+ cl_mem BSDFEval_coop;
+ cl_mem ISLamp_coop;
+ cl_mem LightRay_coop;
+ cl_mem AOAlpha_coop;
+ cl_mem AOBSDF_coop;
+ cl_mem AOLightRay_coop;
+ cl_mem Intersection_coop_AO;
+ cl_mem Intersection_coop_DL;
+
+ /* Global state array that tracks ray state */
+ cl_mem ray_state;
+
+ /* per sample buffers */
+ cl_mem per_sample_output_buffers;
+
+ /* Denotes which sample each ray is being processed for */
+ cl_mem work_array;
+
+ /* Queue*/
+ cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int) */
+ cl_mem Queue_index; /* Array of size num_queues * sizeof(int); Tracks the size of each queue */
+
+ /* Flag to make sceneintersect and lampemission kernel use queues */
+ cl_mem use_queues_flag;
+
+ /* cl_program declaration */
+ cl_program dataInit_program;
+ cl_program sceneIntersect_program;
+ cl_program lampEmission_program;
+ cl_program QueueEnqueue_program;
+ cl_program background_BufferUpdate_program;
+ cl_program shaderEval_program;
+ cl_program holdout_emission_blurring_termination_ao_program;
+ cl_program subsurface_program;
+ cl_program directLighting_program;
+ cl_program shadowBlocked_program;
+ cl_program nextIterationSetUp_program;
+ cl_program sumAllRadiance_program;
+
+ /* Required memory size */
+ size_t rng_size = sizeof(RNG);
+ size_t throughput_size = sizeof(float3);
+ size_t L_transparent_size = sizeof(float);
+ size_t rayState_size = sizeof(char);
+ size_t hostRayState_size = sizeof(char);
+ size_t work_element_size = sizeof(unsigned int);
+ size_t ISLamp_size = sizeof(int);
+
+ /* size of structures declared in kernel_types.h */
+ size_t PathRadiance_size = sizeof(PathRadiance);
+ size_t Ray_size = sizeof(Ray);
+ size_t PathState_size = sizeof(PathState);
+ size_t Intersection_size = sizeof(Intersection);
+
+ /* Volume of ShaderData; ShaderData (in split_kernel) is a
+ * Structure-Of-Arrays implementation; We need to calculate memory
+ * required for a single thread
+ */
+ size_t ShaderData_volume = 0;
+
+ /* This is total ShaderClosure size required for one thread */
+ size_t ShaderClosure_size = 0;
+
+ /* Sizes of memory required for shadow blocked function */
+ size_t AOAlpha_size = sizeof(float3);
+ size_t AOBSDF_size = sizeof(float3);
+ size_t AOLightRay_size = sizeof(Ray);
+ size_t LightRay_size = sizeof(Ray);
+ size_t BSDFEval_size = sizeof(BsdfEval);
+ size_t Intersection_coop_AO_size = sizeof(Intersection);
+ size_t Intersection_coop_DL_size = siz
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list