[Bf-blender-cvs] [077116f] soc-2016-cycles_denoising: Cycles: Redesign the buffer accesses of the denoising kernel.

Sat Aug 6 05:41:09 CEST 2016

Commit: 077116f6570a4f22e6b1d0e875d8c68f518a3837
Author: Lukas Stockner
Date:   Sat Aug 6 05:04:12 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB077116f6570a4f22e6b1d0e875d8c68f518a3837

Cycles: Redesign the buffer accesses of the denoising kernel.

At first, the denoising kernel just directly accessed the RenderBuffers.
However, that introduced some addressing complexity since the filter window might cover multiple tiles, each with a separate buffer.
Apart from the addressing overhead, this also made it pretty much impossible to SIMDify the CPU code.

When feature prefiltering was added, it changed the buffer addressing.
First, it copied the various parts of the different buffers into one contiguous array. Then, it operated directly on that array.
With these changes, the regular buffer addressing was only still needed for the color image.

Now, this commit also copies the color image into the prefiltered buffer. Therefore, it's not really just a prefiltered buffer anymore, but actually contains all the data needed to denoise.
This makes it possible to redesign and clean up the kernel-device interface, which is also done in this commit.

Advantages are:
 - Lower addressing overhead: every pixel is addressed only twice (once to copy its data to the denoising buffer and once to store the final result) instead of hundreds of times per pixel when looping over the filter window.
 - Lower code complexity - one array with standard scanline addressing makes the code a lot cleaner.
 - For GPUs: More coherent memory access, since the passes are stored in SoA layout instead of the AoS layout used by the regular RenderBuffers.
 - For CPUs: Possibility to use SIMD instructions in the future due to the SoA layout.
The disadvantage is slightly higher memory usage - 22 floats per pixel instead of 16.

This commit doesn't include the CUDA changes yet.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/kernel_filter.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu
M	intern/cycles/util/util_types.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index adc0414..9ca1d03 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -208,7 +208,7 @@ public:
 		}
 	};
 
-	float* denoise_prefilter(int4 prefilter_rect, RenderTile &tile, KernelGlobals *kg, int sample, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides)
+	float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides)
 	{
 		void(*filter_divide_shadow)(KernelGlobals*, int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int4);
 		void(*filter_get_feature)(KernelGlobals*, int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int4);
@@ -267,129 +267,221 @@ public:
 			filter_combine_halves = kernel_cpu_filter_combine_halves;
 		}
 
-		int w = (prefilter_rect.z - prefilter_rect.x), h = (prefilter_rect.w - prefilter_rect.y);
-		float *prefiltered = new float[16*w*h];
-		float *unfiltered = new float[2*w*h];
+		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
+		float *filter_buffer = new float[22*w*h];
 
 
 
-		/* Prefilter general features. */
-		int m_offsets[] = {0, 1, 2, 6, 7, 8, 12};
-		int variances[] = {3, 4, 5, 9, 10, 11, 13};
-		for(int i = 0; i < 7; i++) {
-			for(int y = prefilter_rect.y; y < prefilter_rect.w; y++) {
-				for(int x = prefilter_rect.x; x < prefilter_rect.z; x++) {
-					filter_get_feature(kg, sample, buffers, m_offsets[i], variances[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, prefiltered + (2*i+1)*w*h, prefilter_rect);
+		/* ==== Step 1: Prefilter general features. ==== */
+		{
+			float *unfiltered = filter_buffer + 16*w*h;
+			/* Order in render buffers:
+			 *   Normal[X, Y, Z] NormalVar[X, Y, Z] Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
+			 *          0  1  2            3  4  5         6  7  8            9  10 11  12    13
+			 *
+			 * Order in denoise buffer:
+			 *   Normal[X, XVar, Y, YVar, Z, ZVar] Depth DepthVar Shadow ShadowVar Albedo[R, RVar, G, GVar, B, BVar] Color[R, RVar, G, GVar, B, BVar]
+			 *          0  1     2  3     4  5     6     7        8      9                10 11    12 13    14 15          16 17    18 19    20 21
+			 *
+			 * Order of processing: |NormalXYZ|Depth|AlbedoXYZ |
+			 *                      |         |     |          | */
+			int mean_from[]      = { 0, 1, 2,   6,    7,  8, 12 };
+			int variance_from[]  = { 3, 4, 5,   9,   10, 11, 13 };
+			int offset_to[]      = { 0, 2, 4,  10,   12, 14,  6 };
+			for(int i = 0; i < 7; i++) {
+				for(int y = rect.y; y < rect.w; y++) {
+					for(int x = rect.x; x < rect.z; x++) {
+						filter_get_feature(kg, sample, buffers, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, filter_buffer + (offset_to[i]+1)*w*h, rect);
+					}
 				}
-			}
-			for(int y = prefilter_rect.y; y < prefilter_rect.w; y++) {
-				for(int x = prefilter_rect.x; x < prefilter_rect.z; x++) {
-					filter_non_local_means(x, y, unfiltered, unfiltered, prefiltered + (2*i+1)*w*h, prefiltered + 2*i*w*h, prefilter_rect, 2, 2, 1, 0.25f);
+				for(int y = rect.y; y < rect.w; y++) {
+					for(int x = rect.x; x < rect.z; x++) {
+						filter_non_local_means(x, y, unfiltered, unfiltered, filter_buffer + (offset_to[i]+1)*w*h, filter_buffer + offset_to[i]*w*h, rect, 2, 2, 1, 0.25f);
+					}
 				}
-			}
 #ifdef WITH_CYCLES_DEBUG_FILTER
 #define WRITE_DEBUG(name, var) debug_write_pfm(string_printf("debug_%dx%d_feature%d_%s.pfm", tile.x, tile.y, i, name).c_str(), var, w, h, 1, w)
-			WRITE_DEBUG("unfiltered", unfiltered);
-			WRITE_DEBUG("sampleV", prefiltered + (2*i+1)*w*h);
-			WRITE_DEBUG("filtered", prefiltered + 2*i*w*h);
+				WRITE_DEBUG("unfiltered", unfiltered);
+				WRITE_DEBUG("sampleV", filter_buffer + (offset_to[i]+1)*w*h);
+				WRITE_DEBUG("filtered", filter_buffer + offset_to[i]*w*h);
 #undef WRITE_DEBUG
 #endif
+			}
 		}
 
 
 
-
-
-
-
-
-
-
-
-		float *sampleV = prefiltered + 14*w*h, *sampleVV = new float[w*h], *bufferV = prefiltered + 15*w*h, *cleanV = new float[w*h];
-
-		/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
-		for(int y = prefilter_rect.y; y < prefilter_rect.w; y++) {
-			for(int x = prefilter_rect.x; x < prefilter_rect.z; x++) {
-				filter_divide_shadow(kg, sample, buffers, x, y, tile_x, tile_y, offsets, strides, unfiltered, sampleV, sampleVV, bufferV, prefilter_rect);
+		/* ==== Step 2: Prefilter shadow feature. ==== */
+		{
+			/* Reuse some passes of the filter_buffer for temporary storage. */
+			float *sampleV = filter_buffer + 16*w*h, *sampleVV = filter_buffer + 17*w*h, *bufferV = filter_buffer + 18*w*h, *cleanV = filter_buffer + 19*w*h;
+			float *unfiltered = filter_buffer + 20*w*h;
+
+			/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+			for(int y = rect.y; y < rect.w; y++) {
+				for(int x = rect.x; x < rect.z; x++) {
+					filter_divide_shadow(kg, sample, buffers, x, y, tile_x, tile_y, offsets, strides, unfiltered, sampleV, sampleVV, bufferV, rect);
+				}
 			}
-		}
 #ifdef WITH_CYCLES_DEBUG_FILTER
 #define WRITE_DEBUG(name, var) debug_write_pfm(string_printf("debug_%dx%d_shadow_%s.pfm", tile.x, tile.y, name).c_str(), var, w, h, 1, w)
-		WRITE_DEBUG("unfilteredA", unfiltered);
-		WRITE_DEBUG("unfilteredB", unfiltered + w*h);
-		WRITE_DEBUG("bufferV", bufferV);
-		WRITE_DEBUG("sampleV", sampleV);
-		WRITE_DEBUG("sampleVV", sampleVV);
+			WRITE_DEBUG("unfilteredA", unfiltered);
+			WRITE_DEBUG("unfilteredB", unfiltered + w*h);
+			WRITE_DEBUG("bufferV", bufferV);
+			WRITE_DEBUG("sampleV", sampleV);
+			WRITE_DEBUG("sampleVV", sampleVV);
 #endif
 
-
-
-		/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
-		for(int y = prefilter_rect.y; y < prefilter_rect.w; y++) {
-			for(int x = prefilter_rect.x; x < prefilter_rect.z; x++) {
-				//filter_prefilter_features(&kg, sample, x, y, filteredA, filteredB, prefilter_rect);
-				filter_non_local_means(x, y, bufferV, sampleV, sampleVV, cleanV, prefilter_rect, 3, 1, 4, 1.0f);
+			/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+			for(int y = rect.y; y < rect.w; y++) {
+				for(int x = rect.x; x < rect.z; x++) {
+					filter_non_local_means(x, y, bufferV, sampleV, sampleVV, cleanV, rect, 3, 1, 4, 1.0f);
+				}
 			}
-		}
 #ifdef WITH_CYCLES_DEBUG_FILTER
 		WRITE_DEBUG("cleanV", cleanV);
 #endif
 
-
-
-		/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
-		for(int y = prefilter_rect.y; y < prefilter_rect.w; y++) {
-			for(int x = prefilter_rect.x; x < prefilter_rect.z; x++) {
-				filter_non_local_means(x, y, unfiltered, unfiltered + w*h, cleanV, sampleV, prefilter_rect, 5, 3, 1, 0.25f);
-				filter_non_local_means(x, y, unfiltered + w*h, unfiltered, cleanV, bufferV, prefilter_rect, 5, 3, 1, 0.25f);
+			/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+			for(int y = rect.y; y < rect.w; y++) {
+				for(int x = rect.x; x < rect.z; x++) {
+					filter_non_local_means(x, y, unfiltered, unfiltered + w*h, cleanV, sampleV, rect, 5, 3, 1, 0.25f);
+					filter_non_local_means(x, y, unfiltered + w*h, unfiltered, cleanV, bufferV, rect, 5, 3, 1, 0.25f);
+				}
 			}
-		}
-		delete[] cleanV;
 #ifdef WITH_CYCLES_DEBUG_FILTER
-		WRITE_DEBUG("filteredA", sampleV);
-		WRITE_DEBUG("filteredB", bufferV);
+			WRITE_DEBUG("filteredA", sampleV);
+			WRITE_DEBUG("filteredB", bufferV);
 #endif
 
+			/* Estimate the residual variance between the two filtered halves. */
+			for(int y = rect.y; y < rect.w; y++) {
+				for(int x = rect.x; x < rect.z; x++) {
+					filter_combine_halves(x, y, NULL, sampleVV, sampleV, bufferV, rect);
+				}
+			}
+#ifdef WITH_CYCLES_DEBUG_FILTER
+			WRITE_DEBUG("residualV", sampleVV);
+#endif
 
+			/* Use the residual variance for a second filter pass. */
+			for(int y = rect.y; y < rect.w; y++) {
+				for(int x = rect.x; x < rect.z; x++) {
+					filter_non_local_means(x, y, sampleV, bufferV, sampleVV, unfiltered      , rect, 4, 2, 1, 0.25f);
+					filter_non_local_means(x, y, bufferV, sampleV, sampleVV, unfiltered + w*h, rect, 4, 2, 1, 0.25f);
+				}
+			}
+#ifdef WITH_CYCLES_DEBUG_FILTER
+			WRITE_DEBUG("finalA", unfiltered);
+			WRITE_DEBUG("finalB", unfiltered + w*h);
+#endif
 
-		/* Estimate the residual variance between the two filtered halves. */
-		for(int y = prefilter_rect.y; y < prefilter_rect.w; y++) {
-			for(int x = prefilter_rect.x; x < prefilter_rect.z; x++) {
-				filter_combine_halves(x, y, NULL, sampleVV, sampleV, bufferV, prefilter_rect);
+			/* Combine the two double-filtered halves to a final shadow feature image and associated variance. */
+			for(int y = rect.y; y < rect.w; y++) {
+				for(int x = rect.x; x < rect.z; x++) {
+					filter_combine_halves(x, y, filter_buffer + 8*w*h, filter_buffer + 9*w*h, unfiltered, unfiltered + w*h, rect);
+				}
 			}
-		}
 #ifdef WITH_CYCLES_DEBUG_FILTER
-		WRITE_DEBUG("residualV", sampleVV);
+			WRITE_DEBUG("final", filter_buffer + 8*w*h);
+			WRITE_DEBUG("finalV", filter_buffer + 9*w*h);
+#undef WRITE_DEBUG
 #endif
+		}
+
+
 
-		/* Use the residual variance for a second filter pass. */
-		for(int y = prefilter_rect.y; y < prefilter_rect.w; y++) {
-			for(int x = prefilter_rect.x; x < prefilter_rect.z; x++) {
-				filter_non_local_means(x, y, sampleV, bufferV, sampleVV, unfiltered      , prefilter_rect, 4, 2, 1, 0.25f);
-				filter_non_local_means(x, y, bufferV, sampleV, sampleVV, unfiltered + w*h, prefilter_rect, 4, 2, 1, 0.25f);
+		/* ==== Step 3: Copy combined color pass. ==== */
+		{
+			int mean_from[]      = {20, 21, 22};
+			int variance_from[]  = {23, 24, 25};
+			int offset_to[]      = {16, 18, 20};
+			for(int i = 0; i < 3; i++) {
+				for(int y = rect.y; y < rect.w; y++) {
+					for(int x = rect.x; x < rect.z; x++) {
+						filter_get_feature(kg, sample, buffers, mean_from

@@ Diff output truncated at 10240 characters. @@