[Bf-blender-cvs] [bae2b3a] master: Switch to Cuda 4.0 style api for kernel invocation. This is a small clean-up that has no functional changes but makes code a bit more readable.

Fri Jul 25 13:34:39 CEST 2014

Commit: bae2b3a688a2c2ee3eb8457c62af3a10bae76131
Author: Martijn Berger
Date:   Fri Jul 25 13:33:19 2014 +0200
Branches: master
https://developer.blender.org/rBbae2b3a688a2c2ee3eb8457c62af3a10bae76131

Switch to Cuda 4.0 style api for kernel invocation. This is a small clean-up that has no functional changes but makes code a bit more readable.

Differential revision: https://developer.blender.org/D659

Reviewed by: Sergey Sharybin, Thomas Dinges

===================================================================

M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/util/util_cuda.cpp
M	intern/cycles/util/util_cuda.h

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d1d227b..022dcd0 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -615,40 +615,17 @@ public:
 
 		if(have_error())
 			return;
-	
-		/* pass in parameters */
-		int offset = 0;
-		
-		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)));
-		offset += sizeof(d_buffer);
-
-		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)));
-		offset += sizeof(d_rng_state);
-
-		offset = align_up(offset, __alignof(sample));
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, sample));
-		offset += sizeof(sample);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x));
-		offset += sizeof(rtile.x);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y));
-		offset += sizeof(rtile.y);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w));
-		offset += sizeof(rtile.w);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h));
-		offset += sizeof(rtile.h);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset));
-		offset += sizeof(rtile.offset);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride));
-		offset += sizeof(rtile.stride);
-
-		cuda_assert(cuParamSetSize(cuPathTrace, offset));
+		/* pass in parameters */
+		void *args[] = {&d_buffer,
+						 &d_rng_state,
+						 &sample,
+						 &rtile.x,
+						 &rtile.y,
+						 &rtile.w,
+						 &rtile.h,
+						 &rtile.offset,
+						 &rtile.stride};
 
 		/* launch kernel */
 		int threads_per_block;
@@ -666,8 +643,11 @@ public:
 		int yblocks = (rtile.h + ythreads - 1)/ythreads;
 
 		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1));
-		cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks));
+
+		cuda_assert(cuLaunchKernel(cuPathTrace,
+								   xblocks , yblocks, 1, /* blocks */
+								   xthreads, ythreads, 1, /* threads */
+								   0, 0, args, 0));
 
 		cuda_assert(cuCtxSynchronize());
 
@@ -693,40 +673,19 @@ public:
 			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
 		}
 
-		/* pass in parameters */
-		int offset = 0;
-
-		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)));
-		offset += sizeof(d_rgba);
-		
-		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)));
-		offset += sizeof(d_buffer);
 
 		float sample_scale = 1.0f/(task.sample + 1);
-		offset = align_up(offset, __alignof(sample_scale));
-
-		cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale));
-		offset += sizeof(sample_scale);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x));
-		offset += sizeof(task.x);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y));
-		offset += sizeof(task.y);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w));
-		offset += sizeof(task.w);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h));
-		offset += sizeof(task.h);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset));
-		offset += sizeof(task.offset);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride));
-		offset += sizeof(task.stride);
-
-		cuda_assert(cuParamSetSize(cuFilmConvert, offset));
+		/* pass in parameters */
+		void *args[] = {&d_rgba,
+						 &d_buffer,
+						 &sample_scale,
+						 &task.x,
+						 &task.y,
+						 &task.w,
+						 &task.h,
+						 &task.offset,
+						 &task.stride};
 
 		/* launch kernel */
 		int threads_per_block;
@@ -738,8 +697,11 @@ public:
 		int yblocks = (task.h + ythreads - 1)/ythreads;
 
 		cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1));
-		cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks));
+
+		cuda_assert(cuLaunchKernel(cuFilmConvert,
+								   xblocks , yblocks, 1, /* blocks */
+								   xthreads, ythreads, 1, /* threads */
+								   0, 0, args, 0));
 
 		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
 
@@ -777,31 +739,14 @@ public:
 			int shader_w = min(shader_chunk_size, end - shader_x);
 
 			for(int sample = 0; sample < task.num_samples; sample++) {
-				/* pass in parameters */
-				int offset = 0;
 
-				cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
-				offset += sizeof(d_input);
-
-				cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
-				offset += sizeof(d_output);
-
-				int shader_eval_type = task.shader_eval_type;
-				offset = align_up(offset, __alignof(shader_eval_type));
-
-				cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
-				offset += sizeof(task.shader_eval_type);
-
-				cuda_assert(cuParamSeti(cuShader, offset, shader_x));
-				offset += sizeof(shader_x);
-
-				cuda_assert(cuParamSeti(cuShader, offset, shader_w));
-				offset += sizeof(shader_w);
-
-				cuda_assert(cuParamSeti(cuShader, offset, sample));
-				offset += sizeof(sample);
-
-				cuda_assert(cuParamSetSize(cuShader, offset));
+				/* pass in parameters */
+				void *args[] = {&d_input,
+								 &d_output,
+								 &task.shader_eval_type,
+								 &shader_x,
+								 &shader_w,
+								 &sample};
 
 				/* launch kernel */
 				int threads_per_block;
@@ -810,8 +755,10 @@ public:
 				int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
 
 				cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
-				cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
-				cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+				cuda_assert(cuLaunchKernel(cuShader,
+										   xblocks , 1, 1, /* blocks */
+										   threads_per_block, 1, 1, /* threads */
+										   0, 0, args, 0));
 
 				cuda_assert(cuCtxSynchronize());
 			}
diff --git a/intern/cycles/util/util_cuda.cpp b/intern/cycles/util/util_cuda.cpp
index 9404f45..5069043 100644
--- a/intern/cycles/util/util_cuda.cpp
+++ b/intern/cycles/util/util_cuda.cpp
@@ -149,6 +149,7 @@ tcuGLCtxCreate *cuGLCtxCreate;
 tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
 tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
 tcuCtxSetCurrent *cuCtxSetCurrent;
+tcuLaunchKernel *cuLaunchKernel;
 
 CCL_NAMESPACE_BEGIN
 
@@ -386,6 +387,7 @@ bool cuLibraryInit()
 
 	/* cuda 4.0 */
 	CUDA_LIBRARY_FIND(cuCtxSetCurrent);
+	CUDA_LIBRARY_FIND(cuLaunchKernel);
 
 	if(cuHavePrecompiledKernels())
 		result = true;
diff --git a/intern/cycles/util/util_cuda.h b/intern/cycles/util/util_cuda.h
index b5e9c71..a633fb2 100644
--- a/intern/cycles/util/util_cuda.h
+++ b/intern/cycles/util/util_cuda.h
@@ -509,6 +509,7 @@ typedef CUresult CUDAAPI tcuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUd
 typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
 typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
 typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
+typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f, unsigned gridDimX, unsigned gridDimY, unsigned gridDimZ, unsigned blockDimX, unsigned blockDimY, unsigned blockDimZ, unsigned sharedMemBytes, CUstream hStream, void* kernelParams, void* extra);
 
 /* function declarations */
 
@@ -629,6 +630,7 @@ extern tcuGLCtxCreate *cuGLCtxCreate;
 extern tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
 extern tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
 extern tcuCtxSetCurrent *cuCtxSetCurrent;
+extern tcuLaunchKernel *cuLaunchKernel;
 
 #endif /* __UTIL_CUDA_H__ */