[Bf-blender-cvs] [466bd61857] soc-2016-cycles_denoising: Cycles: Implement new NLM kernels for CUDA

Fri Jan 20 05:46:41 CET 2017

Commit: 466bd61857f61b73b8006db8ccc2e78799f4ef30
Author: Lukas Stockner
Date:   Fri Jan 13 00:09:09 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB466bd61857f61b73b8006db8ccc2e78799f4ef30

Cycles: Implement new NLM kernels for CUDA

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/filter/filter.h
R098	intern/cycles/kernel/filter/filter_nlm.h	intern/cycles/kernel/filter/filter_nlm_cpu.h
A	intern/cycles/kernel/filter/filter_nlm_gpu.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a596097f15..dfc6995e53 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -336,7 +336,7 @@ public:
 				/* Reuse some passes of the filter_buffer for temporary storage. */
 				float *sampleV = PASSPTR(0), *sampleVV = PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3);
 				float *unfiltered = PASSPTR(4), *unfilteredB = PASSPTR(5);
-				float *diffI = PASSPTR(10), *blurDiffI = PASSPTR(11), *accumI = PASSPTR(12);
+				float *nlm_temp1 = PASSPTR(10), *nlm_temp2 = PASSPTR(11), *nlm_temp3 = PASSPTR(12);
 
 				/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
 				for(int y = rect.y; y < rect.w; y++) {
@@ -354,14 +354,14 @@ public:
 #endif
 
 				/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
-				non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, diffI, blurDiffI, accumI, 6, 3, 4.0f, 1.0f);
+				non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 6, 3, 4.0f, 1.0f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
 				WRITE_DEBUG("cleanV", cleanV);
 #endif
 
 				/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
-				non_local_means(rect, unfiltered, unfilteredB, sampleV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
-				non_local_means(rect, unfilteredB, unfiltered, bufferV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
+				non_local_means(rect, unfiltered, unfilteredB, sampleV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
+				non_local_means(rect, unfilteredB, unfiltered, bufferV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
 				WRITE_DEBUG("filteredA", sampleV);
 				WRITE_DEBUG("filteredB", bufferV);
@@ -378,8 +378,8 @@ public:
 #endif
 
 				/* Use the residual variance for a second filter pass. */
-				non_local_means(rect, sampleV, bufferV, unfiltered , sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
-				non_local_means(rect, bufferV, sampleV, unfilteredB, sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
+				non_local_means(rect, sampleV, bufferV, unfiltered , sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
+				non_local_means(rect, bufferV, sampleV, unfilteredB, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
 				WRITE_DEBUG("finalA", unfiltered);
 				WRITE_DEBUG("finalB", unfiltered + pass_stride);
@@ -403,7 +403,7 @@ public:
 			{
 
 				float *unfiltered = PASSPTR(16);
-				float *diffI = PASSPTR(17), *blurDiffI = PASSPTR(18), *accumI = PASSPTR(19);
+				float *nlm_temp1 = PASSPTR(17), *nlm_temp2 = PASSPTR(18), *nlm_temp3 = PASSPTR(19);
 				/* Order in render buffers:
 				 *   Normal[X, Y, Z] NormalVar[X, Y, Z] Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
 				 *          0  1  2            3  4  5         6  7  8            9  10 11  12    13
@@ -423,7 +423,7 @@ public:
 							filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, PASSPTR(offset_to[i]+1), &rect.x);
 						}
 					}
-					non_local_means(rect, unfiltered, unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), diffI, blurDiffI, accumI, 2, 2, 1, 0.25f);
+					non_local_means(rect, unfiltered, unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), nlm_temp1, nlm_temp2, nlm_temp3, 2, 2, 1, 0.25f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
 #define WRITE_DEBUG(name, var) debug.add_pass(string_printf("f%d_%s", i, name), var, 1, w);
 					WRITE_DEBUG("unfiltered", unfiltered);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 27e5b7af5b..5d347aea22 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -830,6 +830,76 @@ public:
 		}
 	}
 
+	void non_local_means(int4 rect, CUdeviceptr image, CUdeviceptr weight, CUdeviceptr out, CUdeviceptr variance, CUdeviceptr difference, CUdeviceptr blurDifference, CUdeviceptr weightAccum, int r, int f, float a, float k_2) { /* Host driver for the multi-pass NLM filter: one chain of five kernel launches per window offset, followed by a single normalize launch. */
+		int w = align_up(rect.z-rect.x, 4); /* Rect width passed through align_up(..., 4); handed to every kernel as w (presumably the row stride - confirm). */
+		int h = rect.w-rect.y; /* Rect height. */
+
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); /* Zero w*h floats of the weight accumulator. */
+		cuda_assert(cuMemsetD8(out, 0, sizeof(float)*w*h)); /* Zero w*h floats of the output buffer as well. */
+
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; /* Handles for the five NLM kernels looked up from cuModule below. */
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuModule, "kernel_cuda_filter_nlm_update_output"));
+		cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuModule, "kernel_cuda_filter_nlm_normalize"));
+
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); /* Request the L1-preferring cache configuration for each of the five kernels. */
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuNLMCalcDifference)); /* NOTE(review): the limit is queried for the calc_difference kernel only, yet it sizes the blocks of all six launches - confirm no other kernel has a lower limit. */
+
+		int xthreads = (int)sqrt((float)threads_per_block); /* Square thread blocks: side = truncated sqrt of the limit. */
+		int ythreads = (int)sqrt((float)threads_per_block);
+		int xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads; /* Ceil-divide the (unaligned) rect size by the block side. */
+		int yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads;
+
+		int dx, dy; /* Current window offset; the argument arrays below hold the addresses, so each launch reads the values set in the loop. */
+		int4 local_rect; /* Sub-rectangle handed to the kernels, expressed relative to the origin of rect. */
+		void *calc_difference_args[] = {&dx, &dy, &weight, &variance, &difference, &local_rect, &w, &a, &k_2};
+		void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+		void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; /* Same two buffers as blur_args, in swapped order. */
+		void *update_output_args[] = {&dx, &dy, &blurDifference, &image, &out, &weightAccum, &local_rect, &w, &f};
+
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) { /* Visit every offset of the (2r+1) x (2r+1) search window. */
+			dy = i / (2*r+1) - r; /* Window row, in [-r, r]. */
+			dx = i % (2*r+1) - r; /* Window column, in [-r, r]. */
+			local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); /* Shrink the rect so that (x+dx, y+dy) stays inside it for every covered (x, y). */
+
+			cuda_assert(cuLaunchKernel(cuNLMCalcDifference, /* Pass 1: receives weight, variance and difference (presumably fills difference - confirm against the kernel). */
+			                           xblocks , yblocks, 1, /* blocks */
+			                           xthreads, ythreads, 1, /* threads */
+			                           0, 0, calc_difference_args, 0)); /* No dynamic shared memory, stream 0; the same holds for every launch below. */
+			cuda_assert(cuLaunchKernel(cuNLMBlur, /* Pass 2: blur with (difference, blurDifference). */
+			                           xblocks , yblocks, 1, /* blocks */
+			                           xthreads, ythreads, 1, /* threads */
+			                           0, 0, blur_args, 0));
+			cuda_assert(cuLaunchKernel(cuNLMCalcWeight, /* Pass 3: calc_weight with (blurDifference, difference). */
+			                           xblocks , yblocks, 1, /* blocks */
+			                           xthreads, ythreads, 1, /* threads */
+			                           0, 0, calc_weight_args, 0));
+			cuda_assert(cuLaunchKernel(cuNLMBlur, /* Pass 4: the blur kernel again, with the same arguments as pass 2. */
+			                           xblocks , yblocks, 1, /* blocks */
+			                           xthreads, ythreads, 1, /* threads */
+			                           0, 0, blur_args, 0));
+			cuda_assert(cuLaunchKernel(cuNLMUpdateOutput, /* Pass 5: receives blurDifference, image, out and weightAccum (presumably accumulates into the latter two - confirm). */
+			                           xblocks , yblocks, 1, /* blocks */
+			                           xthreads, ythreads, 1, /* threads */
+			                           0, 0, update_output_args, 0));
+		}
+
+		local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); /* The final pass covers the full, unshrunk rect. */
+		void *normalize_args[] = {&out, &weightAccum, &local_rect, &w};
+		cuda_assert(cuLaunchKernel(cuNLMNormalize, /* Final pass over out and weightAccum (presumably divides the accumulated output by the accumulated weight - confirm). */
+		                           xblocks , yblocks, 1, /* blocks */
+		                           xthreads, ythreads, 1, /* threads */
+		                           0, 0, normalize_args, 0));
+	}
+
 	void denoise(RenderTile &rtile, int sample)
 	{
 		if(have_error())
@@ -892,62 +962,38 @@ public:
 		for(int frame = 0; frame < rtile.buffers->params.frames; frame++) {
 			CUdeviceptr d_denoise_buffer = CUDA_PTR_ADD(d_denoise_buffers, frame_stride*frame);
 			CUdeviceptr d_buffer = CUDA_PTR_ADD(d_buffers, frame*rtile.buffers->params.width*rtile.buffers->params.height*rtile.buffers->params.get_passes_size());
-			/* ==== Step 1: Prefilter general features. ==== */
-			{
-				int mean_from[]      = { 0, 1, 2,  6,  7,  8, 12 };
-				int variance_from[]  = { 3, 4, 5,  9, 10, 11, 13 };
-				int offset_to[]      = { 0, 2, 4, 10, 12, 14,  6 };
-				for(int i = 0; i < 7; i++) {
-					CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, offset_to[i]*pass_stride);
-					CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, (offset_to[i]+1)*pass_stride);
-					CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
-
-					void *get_feature_args[] = {&sample, &d_buffer, &mean_from[i], &variance_from[i],
-					                            &buffer_area,
-					                            &rtile.offset, &rtile.stride,
-					                            &d_unfiltered, &d_variance,
-					                            &rect};
-					cuda_assert(cuLaunchKernel(cuFilterGetFeature,
-					                           xblocks , yblocks, 1, /* blocks */
-					                           xthreads, ythreads, 1, /* threads */
-					                           0, 0, get_feature_args, 0));
-
-					/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
-					float a = 1.0f, k_2 = 0.25f;
-					int r = 4, f = 2;
-					void *filter_feature_args[] = {&d_unfiltered, &d_unfiltered, &d_variance, &d_mean,
-					                               &rect,
-					                               &r, &f, &a, &k_2};
-					cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
-					                           xblocks , yblocks, 1, /* blocks */
-					                           xthreads, ythreads, 1, /* threads */
-					                           0, 0, filter_feature_args, 0));
-				}
-			}
 
-			/* 

@@ Diff output truncated at 10240 characters. @@