[Bf-blender-cvs] [73c48d7347] soc-2016-cycles_denoising: Cycles Denoising: Use better and faster NLM implementation for feature pass prefiltering

Thu Jan 12 05:14:13 CET 2017

Commit: 73c48d7347ee9ca77aff7e558d97130c8eb43f87
Author: Lukas Stockner
Date:   Wed Jan 11 07:26:55 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB73c48d7347ee9ca77aff7e558d97130c8eb43f87

Cycles Denoising: Use better and faster NLM implementation for feature pass prefiltering

Previously, the prefiltering NLM kernel was implemented just as it's described in the paper:
For every pixel P, loop over every pixel Q in the search window. Then, loop over the small patches around P and Q, calculate the average difference between the two patches, and use that to compute the weight of Q for the denoised result at P.

However, that results in a time complexity of O(N^2 * R^2 * F^2), where N is the image size, R the search window size and F the patch size.
So, this patch implements the clever idea from "A Simple Trick to Speed Up and Improve the Non-Local Means": by reformulating the loop, it's actually possible to skip a lot of computation and replace it with a separable box filter convolution. This reduces the complexity to O(N^2 * R^2 * F), and the number of pixel differences that have to be calculated even drops to O(N^2 * R^2)!

Furthermore, by applying a second box-filter pass after calculating the weights, we get the "patchwise NLM" improvement basically for free!

This is CPU-only so far, but that will change soon.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/filter/filter_nlm.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 31191f3c84..a596097f15 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -135,12 +135,17 @@ public:
 	KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int*)> filter_divide_shadow_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*)>       filter_get_feature_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)>                               filter_non_local_means_kernel;
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                                  filter_combine_halves_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_construct_transform_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, float*, int*, int*)>      filter_reconstruct_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, int, int, float*, int, int)>                                         filter_divide_combined_kernel;
 
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
+
 #define KERNEL_FUNCTIONS(name) \
 	      KERNEL_NAME_EVAL(cpu, name), \
 	      KERNEL_NAME_EVAL(cpu_sse2, name), \
@@ -157,11 +162,15 @@ public:
 	  shader_kernel(KERNEL_FUNCTIONS(shader)),
 	  filter_divide_shadow_kernel(KERNEL_FUNCTIONS(filter_divide_shadow)),
 	  filter_get_feature_kernel(KERNEL_FUNCTIONS(filter_get_feature)),
-	  filter_non_local_means_kernel(KERNEL_FUNCTIONS(filter_non_local_means)),
 	  filter_combine_halves_kernel(KERNEL_FUNCTIONS(filter_combine_halves)),
 	  filter_construct_transform_kernel(KERNEL_FUNCTIONS(filter_construct_transform)),
 	  filter_reconstruct_kernel(KERNEL_FUNCTIONS(filter_reconstruct)),
-	  filter_divide_combined_kernel(KERNEL_FUNCTIONS(filter_divide_combined))
+	  filter_divide_combined_kernel(KERNEL_FUNCTIONS(filter_divide_combined)),
+	  filter_nlm_calc_difference_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_difference)),
+	  filter_nlm_blur_kernel(KERNEL_FUNCTIONS(filter_nlm_blur)),
+	  filter_nlm_calc_weight_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_weight)),
+	  filter_nlm_update_output_kernel(KERNEL_FUNCTIONS(filter_nlm_update_output)),
+	  filter_nlm_normalize_kernel(KERNEL_FUNCTIONS(filter_nlm_normalize))
 	{
 #ifdef WITH_OSL
 		kernel_globals.osl = &osl_globals;
@@ -275,6 +284,30 @@ public:
 		}
 	};
 
+	void non_local_means(int4 rect, float *image, float *weight, float *out, float *variance, float *difference, float *blurDifference, float *weightAccum, int r, int f, float a, float k_2)
+	{
+		int w = align_up(rect.z-rect.x, 4);
+		int h = rect.w-rect.y;
+
+		memset(weightAccum, 0, sizeof(float)*w*h);
+		memset(out, 0, sizeof(float)*w*h);
+
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+			filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, 0, a, k_2);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+			filter_nlm_update_output_kernel()(dx, dy, blurDifference, image, out, weightAccum, local_rect, w, f);
+		}
+
+		int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+		filter_nlm_normalize_kernel()(out, weightAccum, local_rect, w);
+	}
+
 	float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides, int frames, int *frame_strides)
 	{
 		bool cross_denoise = kg->__data.film.denoise_cross;
@@ -294,51 +327,16 @@ public:
 #ifdef WITH_CYCLES_DEBUG_FILTER
 			DenoiseDebug debug((rect.z - rect.x), h, 34);
 #endif
-			/* ==== Step 1: Prefilter general features. ==== */
-			{
-
-				float *unfiltered = filter_buffer + 16*pass_stride;
-				/* Order in render buffers:
-				 *   Normal[X, Y, Z] NormalVar[X, Y, Z] Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
-				 *          0  1  2            3  4  5         6  7  8            9  10 11  12    13
-				 *
-				 * Order in denoise buffer:
-				 *   Normal[X, XVar, Y, YVar, Z, ZVar] Depth DepthVar Shadow ShadowVar Albedo[R, RVar, G, GVar, B, BVar] Color[R, RVar, G, GVar, B, BVar]
-				 *          0  1     2  3     4  5     6     7        8      9                10 11    12 13    14 15          16 17    18 19    20 21
-				 *
-				 * Order of processing: |NormalXYZ|Depth|AlbedoXYZ |
-				 *                      |         |     |          | */
-				int mean_from[]      = { 0, 1, 2,   6,    7,  8, 12 };
-				int variance_from[]  = { 3, 4, 5,   9,   10, 11, 13 };
-				int offset_to[]      = { 0, 2, 4,  10,   12, 14,  6 };
-				for(int i = 0; i < 7; i++) {
-					for(int y = rect.y; y < rect.w; y++) {
-						for(int x = rect.x; x < rect.z; x++) {
-							filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, &rect.x);
-						}
-					}
-					for(int y = rect.y; y < rect.w; y++) {
-						for(int x = rect.x; x < rect.z; x++) {
-							filter_non_local_means_kernel()(x, y, unfiltered, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, filter_buffer + offset_to[i]*pass_stride, &rect.x, 2, 2, 1, 0.25f);
-						}
-					}
-#ifdef WITH_CYCLES_DEBUG_FILTER
-#define WRITE_DEBUG(name, var) debug.add_pass(string_printf("f%d_%s", i, name), var, 1, w);
-					WRITE_DEBUG("unfiltered", unfiltered);
-					WRITE_DEBUG("sampleV", filter_buffer + (offset_to[i]+1)*pass_stride);
-					WRITE_DEBUG("filtered", filter_buffer + offset_to[i]*pass_stride);
-#undef WRITE_DEBUG
-#endif
-				}
-			}
 
 
+#define PASSPTR(i) (filter_buffer + (i)*pass_stride)
 
-			/* ==== Step 2: Prefilter shadow feature. ==== */
+			/* ==== Step 1: Prefilter shadow feature. ==== */
 			{
 				/* Reuse some passes of the filter_buffer for temporary storage. */
-				float *sampleV = filter_buffer + 16*pass_stride, *sampleVV = filter_buffer + 17*pass_stride, *bufferV = filter_buffer + 18*pass_stride, *cleanV = filter_buffer + 19*pass_stride;
-				float *unfiltered = filter_buffer + 20*pass_stride;
+				float *sampleV = PASSPTR(0), *sampleVV = PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3);
+				float *unfiltered = PASSPTR(4), *unfilteredB = PASSPTR(5);
+				float *diffI = PASSPTR(10), *blurDiffI = PASSPTR(11), *accumI = PASSPTR(12);
 
 				/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
 				for(int y = rect.y; y < rect.w; y++) {
@@ -349,29 +347,21 @@ public:
 #ifdef WITH_CYCLES_DEBUG_FILTER
 #define WRITE_DEBUG(name, var) debug.add_pass(string_printf("shadow_%s", name), var, 1, w);
 				WRITE_DEBUG("unfilteredA", unfiltered);
-				WRITE_DEBUG("unfilteredB", unfiltered + pass_stride);
+				WRITE_DEBUG("unfilteredB", unfilteredB);
 				WRITE_DEBUG("bufferV", bufferV);
 				WRITE_DEBUG("sampleV", sampleV);
 				WRITE_DEBUG("sampleVV", sampleVV);
 #endif
 
 				/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
-				for(int y = rect.y; y < rect.w; y++) {
-					for(int x = rect.x; x < rect.z; x++) {
-						filter_non_local_means_kernel()(x, y, bufferV, sampleV, sampleVV, cleanV, &rect.x, 6, 3, 4, 1.0f);
-					}
-				}
+				non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, diffI, blurDiffI, accumI, 6, 3, 4.0f, 1.0f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
-			WRITE_DEBUG("cleanV", cleanV);
+				WRITE_DEBUG("cleanV", cleanV);
 #endif
 
 				/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
-				for(int y = rect.y; y < rect.w; y++) {
-					for(int x = rect.x; x < rect.z; x++) {
-						filter_non_local_means_kernel()(x, y, unfiltered, unfiltered + pass_stride, cleanV, sampleV, &rect.x, 5, 3, 1, 0.25f);
-						filter_non_local_means_kernel()(x, y, unfiltered + pass_stride, unfiltered, cleanV, bufferV, &rect.x, 5, 3, 1, 0.25f);
-					}
-				}
+				non_local_means(rect, unfiltered, unfilteredB, sampleV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
+				non_local_means(rect, unfilteredB, unfiltered, bufferV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
 				WRITE_DEBUG("filteredA", sampleV);
 				WRITE_DEBUG("filteredB", bufferV);
@@ -388,12 +378,8 @@ public:
 #endif
 
 				/* Use the residual variance for a second filter pass. */
-				for(int y = rect.y; y < rect.w; y++) {
-					for(int x = rect.x; x < rect.z; x++) {
-						filter_non_local_means_kernel()(x, y, sampleV, bufferV, sampleVV, unfiltered              , &rect.x, 4, 2, 1, 0.5f);
-						filter_non_local_means_kernel()(x, y, bufferV, sampleV, sampleVV, unfiltered + pass_stride, &rect.x, 4, 2, 1, 0.5f);
-					}
-				}
+				non_local_means(rect, sampleV, bufferV, unfiltered , sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
+				non_local_means(rect, bufferV, sampleV, unfilteredB, sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
 				WRITE_DEBUG("finalA", unfiltered

@@ Diff output truncated at 10240 characters. @@