[Bf-blender-cvs] SVN commit: /data/svn/bf-blender [57383] trunk/blender/intern/cycles/kernel : Fix #35665: more CUDA issues with recent kernel changes, tested on sm_20, sm_21

Brecht Van Lommel brechtvanlommel at pandora.be
Tue Jun 11 23:58:49 CEST 2013


Revision: 57383
          http://projects.blender.org/scm/viewvc.php?view=rev&root=bf-blender&revision=57383
Author:   blendix
Date:     2013-06-11 21:58:48 +0000 (Tue, 11 Jun 2013)
Log Message:
-----------
Fix #35665: more CUDA issues with recent kernel changes, tested on sm_20, sm_21
and sm_30 cards, so hopefully it should all work now.

Also includes some warning fixes related to nvcc compiler arguments; these
should make no difference otherwise.

Modified Paths:
--------------
    trunk/blender/intern/cycles/kernel/CMakeLists.txt
    trunk/blender/intern/cycles/kernel/kernel_jitter.h
    trunk/blender/intern/cycles/kernel/kernel_path.h
    trunk/blender/intern/cycles/kernel/kernel_random.h

Modified: trunk/blender/intern/cycles/kernel/CMakeLists.txt
===================================================================
--- trunk/blender/intern/cycles/kernel/CMakeLists.txt	2013-06-11 21:58:43 UTC (rev 57382)
+++ trunk/blender/intern/cycles/kernel/CMakeLists.txt	2013-06-11 21:58:48 UTC (rev 57383)
@@ -129,9 +129,20 @@
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		set(cuda_cubin kernel_${arch}.cubin)
 
+		if(${arch} MATCHES "sm_1[0-9]")
+			# sm_1x
+			set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
+		elseif(${arch} MATCHES "sm_2[0-9]")
+			# sm_2x
+			set(cuda_arch_flags "--maxrregcount=24")
+		else()
+			# sm_3x
+			set(cuda_arch_flags "--maxrregcount=32")
+		endif()
+		
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
-			COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
+			COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
 			DEPENDS ${cuda_sources})
 
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)

Modified: trunk/blender/intern/cycles/kernel/kernel_jitter.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_jitter.h	2013-06-11 21:58:43 UTC (rev 57382)
+++ trunk/blender/intern/cycles/kernel/kernel_jitter.h	2013-06-11 21:58:48 UTC (rev 57383)
@@ -146,7 +146,7 @@
 	return (x + jx)*invN;
 }
 
-__device_noinline float2 cmj_sample_2D(int s, int N, int p)
+__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 {
 	int m = float_to_int(sqrtf(N));
 	int n = (N + m - 1)/m;
@@ -173,7 +173,8 @@
 	float jx = cmj_randfloat(s, p * 0x967a889b);
 	float jy = cmj_randfloat(s, p * 0x368cc8b7);
 
-	return make_float2((sx + (sy + jx)*invn)*invm, (s + jy)*invN);
+	*fx = (sx + (sy + jx)*invn)*invm;
+	*fy = (s + jy)*invN;
 }
 #endif
 

Modified: trunk/blender/intern/cycles/kernel/kernel_path.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_path.h	2013-06-11 21:58:43 UTC (rev 57382)
+++ trunk/blender/intern/cycles/kernel/kernel_path.h	2013-06-11 21:58:48 UTC (rev 57383)
@@ -409,9 +409,8 @@
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
 			/* todo: solve correlation */
-			float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U);
-			float bsdf_u = bsdf_uv.x;
-			float bsdf_v = bsdf_uv.y;
+			float bsdf_u, bsdf_v;
+			path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
@@ -450,9 +449,8 @@
 #else
 				float light_o = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_F);
 #endif
-				float2 light_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_U);
-				float light_u = light_uv.x;
-				float light_v = light_uv.y;
+				float light_u, light_v;
+				path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
 
 				Ray light_ray;
 				BsdfEval L_light;
@@ -484,9 +482,8 @@
 		BsdfEval bsdf_eval;
 		float3 bsdf_omega_in;
 		differential3 bsdf_domega_in;
-		float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U);
-		float bsdf_u = bsdf_uv.x;
-		float bsdf_v = bsdf_uv.y;
+		float bsdf_u, bsdf_v;
+		path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 		int label;
 
 		label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -653,10 +650,8 @@
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			/* todo: solve correlation */
-			float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U);
-			float bsdf_u = bsdf_uv.x;
-			float bsdf_v = bsdf_uv.y;
+			float bsdf_u, bsdf_v;
+			path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
@@ -695,9 +690,8 @@
 #else
 				float light_o = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_F);
 #endif
-				float2 light_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_U);
-				float light_u = light_uv.x;
-				float light_v = light_uv.y;
+				float light_u, light_v;
+				path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
 
 				Ray light_ray;
 				BsdfEval L_light;
@@ -730,9 +724,8 @@
 		BsdfEval bsdf_eval;
 		float3 bsdf_omega_in;
 		differential3 bsdf_domega_in;
-		float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U);
-		float bsdf_u = bsdf_uv.x;
-		float bsdf_v = bsdf_uv.y;
+		float bsdf_u, bsdf_v;
+		path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 		int label;
 
 		label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -784,10 +777,8 @@
 		float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
 
 		for(int j = 0; j < num_samples; j++) {
-			/* todo: solve correlation */
-			float2 bsdf_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U);
-			float bsdf_u = bsdf_uv.x;
-			float bsdf_v = bsdf_uv.y;
+			float bsdf_u, bsdf_v;
+			path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 			float3 ao_D;
 			float ao_pdf;
@@ -836,9 +827,8 @@
 				num_samples_inv *= 0.5f;
 
 			for(int j = 0; j < num_samples; j++) {
-				float2 light_uv = path_rng_2D(kg, &lamp_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U);
-				float light_u = light_uv.x;
-				float light_v = light_uv.y;
+				float light_u, light_v;
+				path_rng_2D(kg, &lamp_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
 
 				if(direct_emission(kg, sd, i, 0.0f, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp)) {
 					/* trace shadow ray */
@@ -862,9 +852,8 @@
 
 			for(int j = 0; j < num_samples; j++) {
 				float light_t = path_rng_1D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT);
-				float2 light_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U);
-				float light_u = light_uv.x;
-				float light_v = light_uv.y;
+				float light_u, light_v;
+				path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
@@ -913,9 +902,8 @@
 			BsdfEval bsdf_eval;
 			float3 bsdf_omega_in;
 			differential3 bsdf_domega_in;
-			float2 bsdf_uv = path_rng_2D(kg, &bsdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U);
-			float bsdf_u = bsdf_uv.x;
-			float bsdf_v = bsdf_uv.y;
+			float bsdf_u, bsdf_v;
+			path_rng_2D(kg, &bsdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 			int label;
 
 			label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
@@ -1162,11 +1150,8 @@
 
 	float lens_u = 0.0f, lens_v = 0.0f;
 
-	if(kernel_data.cam.aperturesize > 0.0f) {
-		float2 lens_uv = path_rng_2D(kg, &rng, sample, num_samples, PRNG_LENS_U);
-		lens_u = lens_uv.x;
-		lens_v = lens_uv.y;
-	}
+	if(kernel_data.cam.aperturesize > 0.0f)
+		path_rng_2D(kg, &rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
 
 	float time = 0.0f;
 

Modified: trunk/blender/intern/cycles/kernel/kernel_random.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_random.h	2013-06-11 21:58:43 UTC (rev 57382)
+++ trunk/blender/intern/cycles/kernel/kernel_random.h	2013-06-11 21:58:48 UTC (rev 57383)
@@ -102,8 +102,16 @@
 	return index;
 }
 
-__device_inline float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dimension)
+__device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
 {
+#ifdef __CMJ__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
+		/* correlated multi-jittered */
+		int p = *rng + dimension;
+		return cmj_sample_1D(sample, num_samples, p);
+	}
+#endif
+
 #ifdef __SOBOL_FULL_SCREEN__
 	uint result = sobol_dimension(kg, *rng, dimension);
 	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
@@ -117,43 +125,29 @@
 	float shift;
 
 	if(dimension & 1)
-		shift = (*rng >> 16)*(1.0f/(float)0xFFFF);
+		shift = (*rng >> 16)/((float)0xFFFF);
 	else
-		shift = (*rng & 0xFFFF)*(1.0f/(float)0xFFFF);
+		shift = (*rng & 0xFFFF)/((float)0xFFFF);
 
 	return r + shift - floorf(r + shift);
 #endif
 }
 
-__device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+__device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
 		/* correlated multi-jittered */
 		int p = *rng + dimension;
-		return cmj_sample_1D(sample, num_samples, p);
+		cmj_sample_2D(sample, num_samples, p, fx, fy);
 	}
 #endif
 
 	/* sobol */
-	return path_rng(kg, rng, sample, dimension);
+	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
+	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
 }
 
-__device_inline float2 path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
-{
-#ifdef __CMJ__
-	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
-		/* correlated multi-jittered */
-		int p = *rng + dimension;
-		return cmj_sample_2D(sample, num_samples, p);
-	}
-#endif
-
-	/* sobol */
-	return make_float2(path_rng(kg, rng, sample, dimension),

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list