[Bf-blender-cvs] [2f6db0e227] soc-2016-cycles_denoising: Cycles Denoising: Use device-independent denoising in the CPUDevice

Thu Feb 9 14:39:38 CET 2017

Commit: 2f6db0e227d8835bc4b2ec5d0e181c5cf29da7dc
Author: Lukas Stockner
Date:   Wed Feb 8 16:59:38 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB2f6db0e227d8835bc4b2ec5d0e181c5cf29da7dc

Cycles Denoising: Use device-independent denoising in the CPUDevice

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/filter/filter_features.h
M	intern/cycles/filter/filter_nlm_cpu.h
M	intern/cycles/filter/filter_nlm_gpu.h
M	intern/cycles/filter/filter_prefilter.h
M	intern/cycles/filter/filter_reconstruction.h
M	intern/cycles/filter/kernels/cpu/filter_cpu.h
M	intern/cycles/filter/kernels/cpu/filter_cpu_impl.h
M	intern/cycles/filter/kernels/cuda/filter.cu

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index ebd4acb1e5..bd5630ae95 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -26,6 +26,7 @@
 
 #include "device.h"
 #include "device_intern.h"
+#include "device_denoising.h"
 
 #include "kernel.h"
 #include "kernel_compat_cpu.h"
@@ -136,10 +137,10 @@ public:
 	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_byte_kernel;
 	KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
 
-	KernelFunctions<void(*)(int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int*, int, int, int, bool)> filter_divide_shadow_kernel;
-	KernelFunctions<void(*)(int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*, int, int, bool)>            filter_get_feature_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                                       filter_combine_halves_kernel;
-	KernelFunctions<void(*)(int, int, int, float*, int, int, int, int)>                                                                 filter_divide_combined_kernel;
+	KernelFunctions<void(*)(int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
+	KernelFunctions<void(*)(int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*, int, int, bool)>               filter_get_feature_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                                          filter_combine_halves_kernel;
+	KernelFunctions<void(*)(int, int, int, float*, int, int, int, int)>                                                                    filter_divide_combined_kernel;
 
 	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
@@ -147,9 +148,9 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
 
-	KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float, int, int)>                          filter_construct_transform_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, int, int, float*, int*, float*, float3*, int*, int*, int, int, int)>  filter_nlm_construct_gramian_kernel;
-	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
+	KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float, int, int)>                               filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                                  filter_finalize_kernel;
 
 #define KERNEL_FUNCTIONS(name) \
 	      KERNEL_NAME_EVAL(cpu, name), \
@@ -221,12 +222,20 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
+			if(!mem.data_pointer) {
+				delete[] (char*) mem.device_pointer;
+			}
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
 		}
 	}
 
+	virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset)
+	{
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_offset(offset));
+	}
+
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
 		kernel_const_copy(&kernel_globals, name, host, size);
@@ -290,368 +299,216 @@ public:
 		}
 	};
 
-	void non_local_means(int4 rect, float *image, float *weight, float *out, float *variance, float *difference, float *blurDifference, float *weightAccum, int r, int f, float a, float k_2, int channel_ofs_in = 0, int channel_ofs_out = 0)
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
 	{
+		int4 rect = task->rect;
+		int   r   = task->nlm_state.r;
+		int   f   = task->nlm_state.f;
+		float a   = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
+
 		int w = align_up(rect.z-rect.x, 4);
 		int h = rect.w-rect.y;
 
-		int channels = channel_ofs_in? 3: 1;
-		memset(weightAccum, 0, sizeof(float)*w*h*channels);
-		memset(out, 0, sizeof(float)*w*h*channels);
+		float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
+		float *difference     = (float*) task->nlm_state.temporary_2_ptr;
+		float *weightAccum    = (float*) task->nlm_state.temporary_3_ptr;
+
+		memset(weightAccum, 0, sizeof(float)*w*h);
+		memset((float*) out_ptr, 0, sizeof(float)*w*h);
 
 		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
 			int dy = i / (2*r+1) - r;
 			int dx = i % (2*r+1) - r;
 
 			int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
-			filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, channel_ofs_in, a, k_2);
-			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+			filter_nlm_calc_difference_kernel()(dx, dy,
+			                                    (float*) guide_ptr,
+			                                    (float*) variance_ptr,
+			                                    difference,
+			                                    local_rect,
+			                                    0,
+			                                    w, a, k_2);
+
+			filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
 			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
-			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
-			for(int c = 0; c < channels; c++) {
-				filter_nlm_update_output_kernel()(dx, dy, blurDifference, image + channel_ofs_in*c, out + channel_ofs_out*c, weightAccum + w*h*c, local_rect, w, f);
-			}
+			filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
+
+			filter_nlm_update_output_kernel()(dx, dy,
+			                                  blurDifference,
+			                                  (float*) image_ptr,
+			                                  (float*) out_ptr,
+			                                  weightAccum,
+			                                  local_rect,
+			                                  w, f);
 		}
 
 		int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
-		for(int c = 0; c < channels; c++) {
-			filter_nlm_normalize_kernel()(out + channel_ofs_out*c, weightAccum + w*h*c, local_rect, w);
-		}
+		filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
+
+		return true;
 	}
 
-	float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides, int frames, int *frame_strides)
+	bool denoising_construct_transform(DenoisingTask *task)
 	{
-		bool use_cross_denoising = kg->__data.film.denoise_cross;
-		bool use_gradients = kg->__data.integrator.use_gradients;
-		int buffer_pass_stride = kg->__data.film.pass_stride;
-		int buffer_denoising_offset = kg->__data.film.pass_denoising;
-		int num_frames = 1;
-
-		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
-		int pass_stride = w*h*frames;
-		int passes = use_cross_denoising? 20 : 14;
-		float *filter_buffers = new float[passes*pass_stride];
-		memset(filter_buffers, 0, sizeof(float)*passes*pass_stride);
-
-		/* Denoising Buffer Pass allocation:
-		 *  0: Normal X
-		 *  1: Normal Y
-		 *  2: Normal Z
-		 *  3: Depth
-		 *  4: Shadowing
-		 *  5: Albedo R
-		 *  6: Albedo G
-		 *  7: Albedo B
-		 *  8: Color R
-		 *  9: Color G
-		 * 10: Color B
-		 * 11: Color Variance R
-		 * 12: Color Variance G
-		 * 13: Color Variance B
-		 * With Cross-denoising passes, this list is essentially repeated two times. */
-
-		for(int frame = 0; frame < frames; frame++) {
-			float *filter_buffer = filter_buffers + w*h*frame;
-			float *buffer[9];
-			for(int i = 0; i < 9; i++) {
-				buffer[i] = buffers[i] + frame_strides[i]*frame;
-			}
-			DebugPasses debug((rect.z - rect.x), h, 42, 1, w);
-
-#define PASSPTR(i) (filter_buffer + (i)*pass_stride)
-
-			/* ==== Step 1: Prefilter shadow feature. ==== */
-			{
-				/* Reuse some passes of the filter_buffer for temporary storage. */
-				float *sampleV = PASSPTR(0), *sampleVV = PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3);
-				float *unfilteredA = PASSPTR(5), *unfilteredB = PASSPTR(6);
-				float *nlm_temp1 = PASSPTR(7), *nlm_temp2 = PASSPTR(8), *nlm_temp3 = PASSPTR(9);
-
-				/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
-				for(int y = rect.y; y < rect.w; y++) {
-					for(int x = rect.x; x < rect.z; x++) {
-						filter_divide_shadow_kernel()(sample, buffer, x, y, tile_x, tile_y, offsets, strides, unfilteredA, sampleV, sampleVV, bufferV, &rect.x, buffer_pass_stride, buffer_denoising_offset, num_frames, use_gradients);
-					}
-				}
-				debug.add_pass("shadowUnfilteredA", unfilteredA);
-				debug.add_pass("shadowUnfilteredB", unfilteredB);
-				debug.add_pass("shadowBufferV", bufferV);
-				debug.add_pass("shadowSampleV", sampleV);
-				debug.add_pass("shadowSampleVV", sampleVV);
-
-				/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
-				non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 6, 3, 4.0f, 1.0f);
-				debug.add_pass("shadowCleanV", cleanV);
-
-				/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
-				non_local_means(rect, unfilteredA, unfilteredB, sample

@@ Diff output truncated at 10240 characters. @@