[Bf-blender-cvs] [4bc56ca9b1] soc-2016-cycles_denoising: Cycles Denoising: Move denoising kernels to a separate compilation unit

Lukas Stockner noreply at git.blender.org
Sat Feb 4 04:09:03 CET 2017


Commit: 4bc56ca9b1a414cb326b759ff65979e2ce40f966
Author: Lukas Stockner
Date:   Thu Feb 2 05:45:09 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB4bc56ca9b1a414cb326b759ff65979e2ce40f966

Cycles Denoising: Move denoising kernels to a separate compilation unit

Previously, the denoising kernels were just included with the other kernels.
However, that is not ideal, since the kernels already take a very long time to compile. It also isn't necessary, since the rendering and denoising kernels share essentially no code.

So, this commit adds intern/cycles/filter/, which contains the filtering kernels.

===================================================================

M	build_files/cmake/macros.cmake
M	intern/cycles/CMakeLists.txt
M	intern/cycles/app/CMakeLists.txt
M	intern/cycles/device/CMakeLists.txt
M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
A	intern/cycles/filter/CMakeLists.txt
A	intern/cycles/filter/filter.h
A	intern/cycles/filter/filter_compat_cpu.h
A	intern/cycles/filter/filter_compat_cuda.h
A	intern/cycles/filter/filter_compat_opencl.h
A	intern/cycles/filter/filter_defines.h
R093	intern/cycles/kernel/filter/filter_features.h	intern/cycles/filter/filter_features.h
R098	intern/cycles/kernel/filter/filter_features_sse.h	intern/cycles/filter/filter_features_sse.h
A	intern/cycles/filter/filter_kernel.h
R079	intern/cycles/kernel/filter/filter_nlm_cpu.h	intern/cycles/filter/filter_nlm_cpu.h
R091	intern/cycles/kernel/filter/filter_nlm_gpu.h	intern/cycles/filter/filter_nlm_gpu.h
R071	intern/cycles/kernel/filter/filter_prefilter.h	intern/cycles/filter/filter_prefilter.h
R063	intern/cycles/kernel/filter/filter_final_pass_impl.h	intern/cycles/filter/filter_reconstruction.h
R065	intern/cycles/kernel/filter/filter_wlr.h	intern/cycles/filter/filter_transform.h
R066	intern/cycles/kernel/filter/filter_wlr_cuda.h	intern/cycles/filter/filter_transform_cuda.h
R064	intern/cycles/kernel/filter/filter_wlr_sse.h	intern/cycles/filter/filter_transform_sse.h
A	intern/cycles/filter/kernels/cpu/filter.cpp
A	intern/cycles/filter/kernels/cpu/filter_avx.cpp
A	intern/cycles/filter/kernels/cpu/filter_avx2.cpp
A	intern/cycles/filter/kernels/cpu/filter_cpu.h
A	intern/cycles/filter/kernels/cpu/filter_cpu_impl.h
A	intern/cycles/filter/kernels/cpu/filter_sse2.cpp
A	intern/cycles/filter/kernels/cpu/filter_sse3.cpp
A	intern/cycles/filter/kernels/cpu/filter_sse41.cpp
A	intern/cycles/filter/kernels/cuda/filter.cu
A	intern/cycles/filter/kernels/opencl/filter.cl
M	intern/cycles/kernel/CMakeLists.txt
D	intern/cycles/kernel/filter/filter.h
M	intern/cycles/kernel/kernel_compat_cpu.h
M	intern/cycles/kernel/kernel_compat_cuda.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu
A	intern/cycles/util/util_cuda.h
A	intern/cycles/util/util_cuda_capabilities.h

===================================================================

diff --git a/build_files/cmake/macros.cmake b/build_files/cmake/macros.cmake
index 09428953a4..4ebeebbf1f 100644
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -536,6 +536,7 @@ function(SETUP_BLENDER_SORTED_LIBS)
 			cycles_bvh
 			cycles_device
 			cycles_kernel
+			cycles_filter
 			cycles_util
 			cycles_subd)
 		if(WITH_CYCLES_OSL)
@@ -651,6 +652,7 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		cycles_bvh
 		cycles_device
 		cycles_kernel
+		cycles_filter
 		cycles_util
 		cycles_subd
 		bf_intern_opencolorio
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 29285c6ad6..f0505795dc 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -251,6 +251,7 @@ endif()
 add_subdirectory(bvh)
 add_subdirectory(device)
 add_subdirectory(doc)
+add_subdirectory(filter)
 add_subdirectory(graph)
 add_subdirectory(kernel)
 add_subdirectory(render)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index ff72bd772a..9b78e168d4 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -18,6 +18,7 @@ set(INC_SYS
 set(LIBRARIES
 	cycles_device
 	cycles_kernel
+	cycles_filter
 	cycles_render
 	cycles_bvh
 	cycles_subd
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 966ff5e52b..2d20e7138f 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -3,6 +3,7 @@ set(INC
 	.
 	../graph
 	../kernel
+	../filter
 	../kernel/svm
 	../kernel/osl
 	../util
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index e449d97c86..a1c77894fe 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -32,6 +32,8 @@
 #include "kernel_types.h"
 #include "kernel_globals.h"
 
+#include "filter.h"
+
 #include "osl_shader.h"
 #include "osl_globals.h"
 
@@ -133,11 +135,11 @@ public:
 	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_half_float_kernel;
 	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_byte_kernel;
 	KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
-	KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int*)> filter_divide_shadow_kernel;
-	KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*)>       filter_get_feature_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                                  filter_combine_halves_kernel;
-	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_construct_transform_kernel;
-	KernelFunctions<void(*)(KernelGlobals*, int, int, int, float*, int, int)>                                         filter_divide_combined_kernel;
+
+	KernelFunctions<void(*)(int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int*, int, int, int, bool)> filter_divide_shadow_kernel;
+	KernelFunctions<void(*)(int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*, int, int, bool)>            filter_get_feature_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                                       filter_combine_halves_kernel;
+	KernelFunctions<void(*)(int, int, int, float*, int, int, int, int)>                                                                 filter_divide_combined_kernel;
 
 	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
@@ -145,8 +147,9 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
 
-	KernelFunctions<void(*)(int, int, float*, float*, int, int,  void*, float*, float3*, int*, int*, int, int, int)>  filter_nlm_construct_gramian_kernel;
-	KernelFunctions<void(*)(int, int, int, int, int, float*, void*, float*, float3*, int*, int)>                      filter_finalize_kernel;
+	KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float, int, int)>                          filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, int, int, float*, int*, float*, float3*, int*, int*, int, int, int)>  filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
 
 #define KERNEL_FUNCTIONS(name) \
 	      KERNEL_NAME_EVAL(cpu, name), \
@@ -165,13 +168,13 @@ public:
 	  filter_divide_shadow_kernel(KERNEL_FUNCTIONS(filter_divide_shadow)),
 	  filter_get_feature_kernel(KERNEL_FUNCTIONS(filter_get_feature)),
 	  filter_combine_halves_kernel(KERNEL_FUNCTIONS(filter_combine_halves)),
-	  filter_construct_transform_kernel(KERNEL_FUNCTIONS(filter_construct_transform)),
 	  filter_divide_combined_kernel(KERNEL_FUNCTIONS(filter_divide_combined)),
 	  filter_nlm_calc_difference_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_difference)),
 	  filter_nlm_blur_kernel(KERNEL_FUNCTIONS(filter_nlm_blur)),
 	  filter_nlm_calc_weight_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_weight)),
 	  filter_nlm_update_output_kernel(KERNEL_FUNCTIONS(filter_nlm_update_output)),
 	  filter_nlm_normalize_kernel(KERNEL_FUNCTIONS(filter_nlm_normalize)),
+	  filter_construct_transform_kernel(KERNEL_FUNCTIONS(filter_construct_transform)),
 	  filter_nlm_construct_gramian_kernel(KERNEL_FUNCTIONS(filter_nlm_construct_gramian)),
 	  filter_finalize_kernel(KERNEL_FUNCTIONS(filter_finalize))
 	{
@@ -318,10 +321,15 @@ public:
 
 	float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides, int frames, int *frame_strides)
 	{
-		bool cross_denoise = kg->__data.film.denoise_cross;
+		bool use_cross_denoising = kg->__data.film.denoise_cross;
+		bool use_gradients = kg->__data.integrator.use_gradients;
+		int buffer_pass_stride = kg->__data.film.pass_stride;
+		int buffer_denoising_offset = kg->__data.film.pass_denoising;
+		int num_frames = 1;
+
 		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
 		int pass_stride = w*h*frames;
-		int passes = cross_denoise? 20 : 14;
+		int passes = use_cross_denoising? 20 : 14;
 		float *filter_buffers = new float[passes*pass_stride];
 		memset(filter_buffers, 0, sizeof(float)*passes*pass_stride);
 
@@ -362,7 +370,7 @@ public:
 				/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
 				for(int y = rect.y; y < rect.w; y++) {
 					for(int x = rect.x; x < rect.z; x++) {
-						filter_divide_shadow_kernel()(kg, sample, buffer, x, y, tile_x, tile_y, offsets, strides, unfilteredA, sampleV, sampleVV, bufferV, &rect.x);
+						filter_divide_shadow_kernel()(sample, buffer, x, y, tile_x, tile_y, offsets, strides, unfilteredA, sampleV, sampleVV, bufferV, &rect.x, buffer_pass_stride, buffer_denoising_offset, num_frames, use_gradients);
 					}
 				}
 				debug.add_pass("shadowUnfilteredA", unfilteredA);
@@ -421,7 +429,7 @@ public:
 				for(int i = 0; i < 7; i++) {
 					for(int y = rect.y; y < rect.w; y++) {
 						for(int x = rect.x; x < rect.z; x++) {
-							filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, variance, &rect.x);
+							filter_get_feature_kernel()(sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, variance, &rect.x, buffer_pass_stride, buffer_denoising_offset, use_cross_denoising);
 						}
 					}
 					non_local_means(rect, unfiltered, unfiltered, PASSPTR(mean_to[i]), variance, nlm_temp1, nlm_temp2, nlm_temp3, 2, 2, 1, 0.25f);
@@ -439,10 +447,10 @@ public:
 				int variance_from[]  = {23, 24, 25, 29, 30, 31};
 				int mean_to[]        = { 8,  9, 10, 14, 15, 16};
 				int variance_to[]    = {11, 12, 13, 17, 18, 19};
-				for(int i = 0; i < (cross_denoise? 6 : 3); i++) {
+				for(int i = 0; i < (use_cross_denoising? 6 : 3); i++) {
 					for(int y = rect.y; y < rect.w; y++) {
 						for(int x = rect.x; x < rect.z; x++) {
-							filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, PASSPTR(mean_to[i]), PASSPTR(variance_to[i]), &rect.x);
+							filter_get_feature_kernel()(sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, PASSPTR(mean_to[i]), PASSPTR(variance_to[i]), &rect.x, buffer_pass_stride, buffer_denoising_offset, use_cross_denoising);
 						}
 					}
 				}
@@ -480,25 +488,28 @@ public:
 		scoped_fpe fpe(FPE_ENABLED);
 #endif
 
-		bool cross_denoise = kg->__data.film.denoise_cross;
-
-		int hw = kg->__data.integrator.half_window;
-		int storage_num = filter_area.z*filter_area.w;
-		FilterStorage *storage = new FilterStorage[storage_num];
+		bool use_cross_denoising = kg->__data.film.denoise_cross;
+		int half_window = kg->__data.integrator.half_window;
+		float pca_threshold = kg->__data.integrator.filter_strength;
+		int num_frames = 1; /* TODO(lukas) */
+		int prev_frames = 0;
 
 		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
 		int pass_stride = w*h;
 
-		float *XtWX = new float[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)*storage_num];
-		float3 *XtWY = new float3[(DENOISE_FEATURES+1)*storage_num];
+		int storage_num = filter_area.z*filter_area.w;
+		float *XtWX = new float[XTWX_SIZE*storage_num];
+		float3 *XtWY = new float3[XTWY_SIZE*storage_num];
+		float *transform = new float[TRANSFORM_SIZE*storage_num];
+		int *rank = new int[storage_num];
 
 		for(int y = 0; y < fil

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list