[Bf-blender-cvs] [2851ed4] master: Cycles code refactor: use __launch_bounds__ instead of -maxrregcount for CUDA.

Brecht Van Lommel noreply at git.blender.org
Wed Apr 16 21:05:10 CEST 2014


Commit: 2851ed4a553d633c3ccbfcbbec6a4c12b79401d9
Author: Brecht Van Lommel
Date:   Wed Apr 16 19:04:58 2014 +0200
https://developer.blender.org/rB2851ed4a553d633c3ccbfcbbec6a4c12b79401d9

Cycles code refactor: use __launch_bounds__ instead of -maxrregcount for CUDA.

This makes it easier to set the number of registers per kernel. Also, all the
tunable parameters for this now live in kernel.cu, rather than being spread over
CMake, SCons and device_cuda.cpp.
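
For reference, the pattern adopted here is: the kernel declares its own block
size and register budget through __launch_bounds__(max threads per block,
min blocks per multiprocessor), and the host queries the resulting block size
at runtime instead of hardcoding 16x16 threads. With the sm_2x numbers in the
diff below, for example, the path trace kernel gets 32768/(16*16*32) = 4
minimum resident blocks per multiprocessor. Below is a minimal standalone
sketch of the same idea, using the CUDA runtime API rather than the driver API
used in device_cuda.cpp; demo_kernel and the DEMO_* defines are illustrative
only, not part of the Cycles sources.

#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

/* Tunable parameters, analogous to the ones this commit moves into kernel.cu:
 * a 16x16 thread block, limited to 32 registers per thread, on a device with
 * 32768 registers per multiprocessor (sm_2x). */
#define DEMO_THREADS_BLOCK_WIDTH 16
#define DEMO_KERNEL_MAX_REGISTERS 32
#define DEMO_MULTIPROCESSOR_MAX_REGISTERS 32768

/* __launch_bounds__(max threads per block, min blocks per multiprocessor)
 * replaces the global --maxrregcount flag: the compiler derives a per-kernel
 * register limit from these two numbers. */
extern "C" __global__ void
__launch_bounds__(
	DEMO_THREADS_BLOCK_WIDTH*DEMO_THREADS_BLOCK_WIDTH,
	DEMO_MULTIPROCESSOR_MAX_REGISTERS/(DEMO_THREADS_BLOCK_WIDTH*DEMO_THREADS_BLOCK_WIDTH*DEMO_KERNEL_MAX_REGISTERS))
demo_kernel(float *out, int w, int h)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y;

	if(x < w && y < h)
		out[y*w + x] = (float)(x + y);
}

int main()
{
	/* Host side: ask the kernel for the block size it was compiled for,
	 * the same way device_cuda.cpp now queries
	 * CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK through the driver API. */
	cudaFuncAttributes attr;
	cudaFuncGetAttributes(&attr, demo_kernel);

	int threads_per_block = attr.maxThreadsPerBlock;
	int xthreads = (int)sqrt((double)threads_per_block);
	int ythreads = xthreads;

	printf("threads_per_block %d, registers %d\n", threads_per_block, attr.numRegs);

	int w = 640, h = 480;
	float *d_out;
	cudaMalloc(&d_out, sizeof(float)*w*h);

	/* Same grid sizing as the patched device_cuda.cpp: square thread blocks,
	 * rounded-up number of blocks to cover the image. */
	dim3 threads(xthreads, ythreads);
	dim3 blocks((w + xthreads - 1)/xthreads, (h + ythreads - 1)/ythreads);
	demo_kernel<<<blocks, threads>>>(d_out, w, h);
	cudaDeviceSynchronize();

	cudaFree(d_out);
	return 0;
}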

===================================================================

M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/SConscript
M	intern/cycles/kernel/kernel.cu

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index edee32e..9200473 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -253,7 +253,6 @@ public:
 			return false;
 		}
 		
-
 		return true;
 	}
 
@@ -315,17 +314,6 @@ public:
 		string kernel = path_join(kernel_path, "kernel.cu");
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
-		string arch_flags;
-
-		/* CUDA 5.x build flags for different archs */
-		if(major == 2) {
-			/* sm_2x */
-			arch_flags = "--maxrregcount=40 --use_fast_math";
-		}
-		else if(major == 3) {
-			/* sm_3x */
-			arch_flags = "--maxrregcount=32 --use_fast_math";
-		}
 
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
@@ -333,8 +321,8 @@ public:
 		path_create_directories(cubin);
 
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-			"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
-			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
+			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
+			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
 
 		printf("%s\n", command.c_str());
 
@@ -665,9 +653,18 @@ public:
 
 		cuda_assert(cuParamSetSize(cuPathTrace, offset))
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int ythreads = 16;
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace))
+
+		/*int num_registers;
+		cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace))
+
+		printf("threads_per_block %d\n", threads_per_block);
+		printf("num_registers %d\n", num_registers);*/
+
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
 		int xblocks = (rtile.w + xthreads - 1)/xthreads;
 		int yblocks = (rtile.h + ythreads - 1)/ythreads;
 
@@ -730,9 +727,12 @@ public:
 
 		cuda_assert(cuParamSetSize(cuFilmConvert, offset))
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int ythreads = 16;
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert))
+
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
 		int xblocks = (task.w + xthreads - 1)/xthreads;
 		int yblocks = (task.h + ythreads - 1)/ythreads;
 
@@ -752,40 +752,42 @@ public:
 
 		cuda_push_context();
 
-		CUfunction cuDisplace;
+		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
 
 		/* get kernel function */
-		cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))
+		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"))
 		
 		/* pass in parameters */
 		int offset = 0;
 		
-		cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
+		cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)))
 		offset += sizeof(d_input);
 
-		cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)))
+		cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)))
 		offset += sizeof(d_output);
 
 		int shader_eval_type = task.shader_eval_type;
 		offset = align_up(offset, __alignof(shader_eval_type));
 
-		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
+		cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type))
 		offset += sizeof(task.shader_eval_type);
 
-		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
+		cuda_assert(cuParamSeti(cuShader, offset, task.shader_x))
 		offset += sizeof(task.shader_x);
 
-		cuda_assert(cuParamSetSize(cuDisplace, offset))
+		cuda_assert(cuParamSetSize(cuShader, offset))
+
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader))
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int xblocks = (task.shader_w + xthreads - 1)/xthreads;
+		int xblocks = (task.shader_w + threads_per_block - 1)/threads_per_block;
 
-		cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
-		cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
-		cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
+		cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1))
+		cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1))
+		cuda_assert(cuLaunchGrid(cuShader, xblocks, 1))
 
 		cuda_pop_context();
 	}
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 7dab65f..1527d15 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -161,16 +161,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_cubin kernel_${arch}.cubin)
 
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
-
-		# CUDA 5.x build flags for different archs
-		if(${arch} MATCHES "sm_2[0-9]")
-			# sm_2x
-			set(cuda_arch_flags "--maxrregcount=40")
-		elseif(${arch} MATCHES "sm_3[0-9]")
-			# sm_3x
-			set(cuda_arch_flags "--maxrregcount=32")
-		endif()
-
 		set(cuda_math_flags "--use_fast_math")
 		
 		if(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 61ddaff..347835e 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -87,14 +87,6 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     for arch in cuda_archs:
         cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
 
-        # CUDA 5.x build flags for different archs
-        if arch.startswith("sm_2"):
-            # sm_2x
-            cuda_arch_flags = "--maxrregcount=40 --use_fast_math"
-        elif arch.startswith("sm_3"):
-            # sm_3x
-            cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
-
         if env['BF_CYCLES_CUDA_ENV']:
             MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
             command = "\"%s\" & \"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 5e6748c..ade7271 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -24,7 +24,71 @@
 #include "kernel_path.h"
 #include "kernel_displace.h"
 
-extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+/* device data taken from CUDA occupancy calculator */
+
+#ifdef __CUDA_ARCH__
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 32
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 32
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* unknown architecture */
+#else
+#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		threads_block_width*threads_block_width, \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -34,7 +98,9 @@ extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state
 }
 
 #ifdef __BRANCHED_PATH__
-extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
+kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -44,7 +110,9 @@ extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *
 }
 #endif
 
-extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int 

@@ Diff output truncated at 10240 characters. @@



