[Bf-blender-cvs] [8771846] soc-2016-cycles_denoising: Cycles Denoising: Tweak shadow filtering

Sun Aug 21 06:18:13 CEST 2016

Commit: 8771846b2bd7d558d7d0aa1843c5a42fa9cf3f09
Author: Lukas Stockner
Date:   Thu Aug 18 12:32:23 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB8771846b2bd7d558d7d0aa1843c5a42fa9cf3f09

Cycles Denoising: Tweak shadow filtering

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/kernel_filter_pre.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 8709030..ad52c90 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -213,7 +213,7 @@ public:
 		void(*filter_divide_shadow)(KernelGlobals*, int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int4);
 		void(*filter_get_feature)(KernelGlobals*, int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int4);
 		void(*filter_non_local_means)(int, int, float*, float*, float*, float*, int4, int, int, float, float);
-		void(*filter_combine_halves)(int, int, float*, float*, float*, float*, int4);
+		void(*filter_combine_halves)(int, int, float*, float*, float*, float*, int4, int);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(system_cpu_support_avx2()) {
@@ -341,7 +341,7 @@ public:
 				/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
 				for(int y = rect.y; y < rect.w; y++) {
 					for(int x = rect.x; x < rect.z; x++) {
-						filter_non_local_means(x, y, bufferV, sampleV, sampleVV, cleanV, rect, 3, 1, 4, 1.0f);
+						filter_non_local_means(x, y, bufferV, sampleV, sampleVV, cleanV, rect, 6, 3, 4, 1.0f);
 					}
 				}
 #ifdef WITH_CYCLES_DEBUG_FILTER
@@ -363,7 +363,7 @@ public:
 				/* Estimate the residual variance between the two filtered halves. */
 				for(int y = rect.y; y < rect.w; y++) {
 					for(int x = rect.x; x < rect.z; x++) {
-						filter_combine_halves(x, y, NULL, sampleVV, sampleV, bufferV, rect);
+						filter_combine_halves(x, y, NULL, sampleVV, sampleV, bufferV, rect, 2);
 					}
 				}
 #ifdef WITH_CYCLES_DEBUG_FILTER
@@ -373,8 +373,8 @@ public:
 				/* Use the residual variance for a second filter pass. */
 				for(int y = rect.y; y < rect.w; y++) {
 					for(int x = rect.x; x < rect.z; x++) {
-						filter_non_local_means(x, y, sampleV, bufferV, sampleVV, unfiltered      , rect, 4, 2, 1, 0.25f);
-						filter_non_local_means(x, y, bufferV, sampleV, sampleVV, unfiltered + pass_stride, rect, 4, 2, 1, 0.25f);
+						filter_non_local_means(x, y, sampleV, bufferV, sampleVV, unfiltered              , rect, 4, 2, 1, 0.5f);
+						filter_non_local_means(x, y, bufferV, sampleV, sampleVV, unfiltered + pass_stride, rect, 4, 2, 1, 0.5f);
 					}
 				}
 #ifdef WITH_CYCLES_DEBUG_FILTER
@@ -385,7 +385,7 @@ public:
 				/* Combine the two double-filtered halves to a final shadow feature image and associated variance. */
 				for(int y = rect.y; y < rect.w; y++) {
 					for(int x = rect.x; x < rect.z; x++) {
-						filter_combine_halves(x, y, filter_buffer + 8*pass_stride, filter_buffer + 9*pass_stride, unfiltered, unfiltered + pass_stride, rect);
+						filter_combine_halves(x, y, filter_buffer + 8*pass_stride, filter_buffer + 9*pass_stride, unfiltered, unfiltered + pass_stride, rect, 0);
 					}
 				}
 #ifdef WITH_CYCLES_DEBUG_FILTER
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 60fe45b..3d165ff 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -928,8 +928,8 @@ public:
 			                           0, 0, divide_args, 0));
 
 			/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
-			float a = 4.0f, k_2 = 1.0f;
-			int r = 3, f = 1;
+			float a = 2.0f, k_2 = 2.0f;
+			int r = 6, f = 3;
 			void *filter_variance_args[] = {&d_bufferV, &d_sampleV, &d_sampleVV, &d_cleanV,
 			                                &rect,
 			                                &r, &f, &a, &k_2};
@@ -959,8 +959,9 @@ public:
 			cuda_assert(cuCtxSynchronize());
 
 			/* Estimate the residual variance between the two filtered halves. */
+			int var_r = 2;
 			void *residual_variance_args[] = {&d_null, &d_sampleVV, &d_sampleV, &d_bufferV,
-			                                  &rect};
+			                                  &rect, &var_r};
 			cuda_assert(cuLaunchKernel(cuFilterCombineHalves,
 			                           xblocks , yblocks, 1, /* blocks */
 			                           xthreads, ythreads, 1, /* threads */
@@ -968,6 +969,7 @@ public:
 
 			/* Use the residual variance for a second filter pass. */
 			r = 4; f = 2;
+			k_2 = 1.0f;
 			void *filter_filteredA_args[] = {&d_sampleV, &d_bufferV, &d_sampleVV, &d_unfilteredA,
 			                                 &rect,
 			                                 &r, &f, &a, &k_2};
@@ -986,9 +988,10 @@ public:
 			cuda_assert(cuCtxSynchronize());
 
 			/* Combine the two double-filtered halves to a final shadow feature image and associated variance. */
+			var_r = 0;
 			void *final_prefiltered_args[] = {&d_mean, &d_variance,
 			                                  &d_unfilteredA, &d_unfilteredB,
-			                                  &rect};
+			                                  &rect, &var_r};
 			cuda_assert(cuLaunchKernel(cuFilterCombineHalves,
 			                           xblocks , yblocks, 1, /* blocks */
 			                           xthreads, ythreads, 1, /* threads */
diff --git a/intern/cycles/kernel/kernel_filter_pre.h b/intern/cycles/kernel/kernel_filter_pre.h
index 7f0cd30..b46753d 100644
--- a/intern/cycles/kernel/kernel_filter_pre.h
+++ b/intern/cycles/kernel/kernel_filter_pre.h
@@ -69,13 +69,35 @@ ccl_device void kernel_filter_get_feature(KernelGlobals *kg, int sample, float *
 
 /* Combine A/B buffers.
  * Calculates the combined mean and the buffer variance. */
-ccl_device void kernel_filter_combine_halves(int x, int y, float *mean, float *variance, float *a, float *b, int4 rect)
+ccl_device void kernel_filter_combine_halves(int x, int y, float *mean, float *variance, float *a, float *b, int4 rect, int r)
 {
 	int buffer_w = align_up(rect.z - rect.x, 4);
 	int idx = (y-rect.y)*buffer_w + (x - rect.x);
 
 	if(mean)     mean[idx] = 0.5f * (a[idx]+b[idx]);
-	if(variance) variance[idx] = 0.5f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+	if(variance) {
+		if(r == 0) variance[idx] = 0.5f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+		else {
+			variance[idx] = 0.0f;
+			float values[25];
+			int numValues = 0;
+			for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
+				for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
+					int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+					values[numValues++] = 0.5f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+				}
+			}
+			/* Insertion-sort the variances (fast enough for 25 elements). */
+			for(int i = 1; i < numValues; i++) {
+				float v = values[i];
+				int j;
+				for(j = i-1; j >= 0 && values[j] > v; j--)
+					values[j+1] = values[j];
+				values[j+1] = v;
+			}
+			variance[idx] = values[(7*numValues)/8];
+		}
+	}
 }
 
 /* General Non-Local Means filter implementation.
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 56897dc..cf632a7 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -93,7 +93,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
                                                       float *variance,
                                                       float *a,
                                                       float *b,
-                                                      int4 prefilter_rect);
+                                                      int4 prefilter_rect,
+                                                      int r);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_estimate_params)(KernelGlobals *kg,
                                                        int sample,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index fdbda08..ce2e656 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -183,9 +183,10 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
                                                       float *variance,
                                                       float *a,
                                                       float *b,
-                                                      int4 prefilter_rect)
+                                                      int4 prefilter_rect,
+                                                      int r)
 {
-	kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect);
+	kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_estimate_params)(KernelGlobals *kg,
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 70f6c2c..2e5a396 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -251,12 +251,12 @@ kernel_cuda_filter_non_local_means(float *noisyImage, float *weightImage, float
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
 {
 	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
 	if(x < prefilter_rect.z && y < prefilter_rect.w) {
-		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect);
+		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
 	}
 }