[Bf-blender-cvs] [b4e7747] soc-2016-cycles_denoising: Cycles: Move the final_pass functions to a unified implementation and generally shuffle a lot of code around

Tue Nov 22 04:25:38 CET 2016

Commit: b4e774722fc4f2e064e41117bd17132d98f1a61e
Author: Lukas Stockner
Date:   Tue Nov 22 04:11:16 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBb4e774722fc4f2e064e41117bd17132d98f1a61e

Cycles: Move the final_pass functions to a unified implementation and generally shuffle a lot of code around

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/CMakeLists.txt
A	intern/cycles/kernel/filter/filter_final_pass_impl.h
M	intern/cycles/kernel/kernel_filter.h
M	intern/cycles/kernel/kernel_filter_util.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu
M	intern/cycles/util/util_atomic.h
M	intern/cycles/util/util_math.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index fcce0ff..61c1e75 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -139,7 +139,7 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                                  filter_combine_halves_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_construct_transform_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_estimate_wlr_params_kernel;
-	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, int*, int*)>              filter_final_pass_wlr_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, float*, int*, int*)>      filter_final_pass_wlr_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, float*, int*, int*)>      filter_final_pass_nlm_kernel;
 	KernelFunctions<void(*)(int, int, float**, float**, float**, float**, int*, int, int, float, float)>              filter_non_local_means_3_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, float*, int, int, int, int, float, float*, int*)>                         filter_old_1_kernel;
@@ -453,8 +453,9 @@ public:
 		bool use_gradients = kg->__data.integrator.use_gradients;
 		bool nlm_weights = kg->__data.integrator.use_nlm_weights;
 
-		FilterStorage *storage = new FilterStorage[filter_area.z*filter_area.w];
 		int hw = kg->__data.integrator.half_window;
+		FilterStorage *storage = new FilterStorage[filter_area.z*filter_area.w];
+		float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
 
 		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
 		int pass_stride = w*h;
@@ -511,14 +512,12 @@ public:
 			}
 		}
 		else if(nlm_weights) {
-			float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
 			for(int y = 0; y < filter_area.w; y++) {
 				for(int x = 0; x < filter_area.z; x++) {
 					filter_construct_transform_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, storage + y*filter_area.z + x, &rect.x);
 					filter_final_pass_nlm_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
 				}
 			}
-			delete[] weight_cache;
 		}
 		else {
 			for(int y = 0; y < filter_area.w; y++) {
@@ -541,7 +540,7 @@ public:
 #endif
 			for(int y = 0; y < filter_area.w; y++) {
 				for(int x = 0; x < filter_area.z; x++) {
-					filter_final_pass_wlr_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, &filter_area.x, &rect.x);
+					filter_final_pass_wlr_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
 				}
 			}
 #ifdef WITH_CYCLES_DEBUG_FILTER
@@ -561,6 +560,7 @@ public:
 		}
 
 		delete[] storage;
+		delete[] weight_cache;
 	}
 
 	void thread_render(DeviceTask& task)
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 9d9764b..84a0636 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -209,6 +209,10 @@ set(SRC_SPLIT_HEADERS
 	split/kernel_sum_all_radiance.h
 )
 
+set(SRC_FILTER_HEADERS
+	filter/filter_final_pass_impl.h
+)
+
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
@@ -346,6 +350,7 @@ add_library(cycles_kernel
 	${SRC_SVM_HEADERS}
 	${SRC_GEOM_HEADERS}
 	${SRC_SPLIT_HEADERS}
+	${SRC_FILTER_HEADERS}
 )
 
 if(WITH_CYCLES_CUDA)
@@ -382,4 +387,5 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTAL
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/filter)
 
diff --git a/intern/cycles/kernel/filter/filter_final_pass_impl.h b/intern/cycles/kernel/filter/filter_final_pass_impl.h
new file mode 100644
index 0000000..9e2b9b1
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_final_pass_impl.h
@@ -0,0 +1,276 @@
+#ifdef __KERNEL_CUDA__
+#define STORAGE_TYPE CUDAFilterStorage
+#else
+#define STORAGE_TYPE FilterStorage
+#endif
+
+ccl_device void FUNCTION_NAME(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, int filtered_passes, int2 color_passes, STORAGE_TYPE *storage, float *weight_cache, float ccl_readonly_ptr transform, int transform_stride, int4 filter_area, int4 rect)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	int buffer_h = (rect.w - rect.y);
+	int pass_stride = buffer_h * buffer_w * kernel_data.film.num_frames;
+	color_passes *= pass_stride;
+	int num_frames = kernel_data.film.num_frames;
+	int prev_frames = kernel_data.film.prev_frames;
+
+	int2 low  = make_int2(max(rect.x, x - kernel_data.integrator.half_window),
+	                      max(rect.y, y - kernel_data.integrator.half_window));
+	int2 high = make_int2(min(rect.z, x + kernel_data.integrator.half_window + 1),
+	                      min(rect.w, y + kernel_data.integrator.half_window + 1));
+
+	float ccl_readonly_ptr pixel_buffer;
+	float ccl_readonly_ptr center_buffer = buffer + (y - rect.y) * buffer_w + (x - rect.x);
+	int3 pixel;
+
+	float3 center_color  = filter_get_pixel_color(center_buffer + color_passes.x, pass_stride);
+	float sqrt_center_variance = sqrtf(filter_get_pixel_variance(center_buffer + color_passes.x, pass_stride));
+
+	/* NFOR weighting directly writes to the design row, so it doesn't need the feature vector and always uses full rank. */
+#ifndef WEIGHTING_NFOR
+#  ifdef __KERNEL_CUDA__
+	/* On GPUs, store the feature vector in shared memory for faster access. */
+	__shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+	float *features = shared_features + DENOISE_FEATURES*(threadIdx.y*blockDim.x + threadIdx.x);
+#  else
+	float features[DENOISE_FEATURES];
+#  endif
+	const int rank = storage->rank;
+	const int matrix_size = rank+1;
+#else
+	const int matrix_size = DENOISE_FEATURES;
+	float *feature_scales = transform;
+#endif
+
+	float feature_means[DENOISE_FEATURES];
+	filter_get_features(make_int3(x, y, 0), center_buffer, feature_means, NULL, pass_stride);
+
+#ifdef WEIGHTING_WLR
+	/* Apply a median filter to the 3x3 window aroung the current pixel. */
+	int sort_idx = 0;
+	float global_bandwidths[9];
+	for(int dy = max(-1, filter_area.y - y); dy < min(2, filter_area.y+filter_area.w - y); dy++) {
+		for(int dx = max(-1, filter_area.x - x); dx < min(2, filter_area.x+filter_area.z - x); dx++) {
+			int ofs = dy*filter_area.z + dx;
+			if(storage[ofs].rank != rank) continue;
+			global_bandwidths[sort_idx++] = storage[ofs].global_bandwidth;
+		}
+	}
+	/* Insertion-sort the global bandwidths (fast enough for 9 elements). */
+	for(int i = 1; i < sort_idx; i++) {
+		float v = global_bandwidths[i];
+		int j;
+		for(j = i-1; j >= 0 && global_bandwidths[j] > v; j--)
+			global_bandwidths[j+1] = global_bandwidths[j];
+		global_bandwidths[j+1] = v;
+	}
+	float inv_global_bandwidth = 1.0f / (global_bandwidths[sort_idx/2] * kernel_data.integrator.weighting_adjust);
+
+	float bandwidth_factor[DENOISE_FEATURES];
+	for(int i = 0; i < rank; i++) {
+		/* Same as above, divide by the bandwidth since the bandwidth_factor actually is the inverse of the bandwidth. */
+		bandwidth_factor[i] = storage->bandwidth[i] * inv_global_bandwidth;
+	}
+#endif
+
+	/* Essentially, this function is just a first-order regression solver.
+	 * We model the pixel color as a linear function of the feature vectors.
+	 * So, we search the parameters S that minimize W*(X*S - y), where:
+	 * - X is the design matrix containing all the feature vectors
+	 * - y is the vector containing all the pixel colors
+	 * - W is the diagonal matrix containing all pixel weights
+	 * Since this is just regular least-squares, the solution is given by:
+	 * S = inv(Xt*W*X)*Xt*W*y */
+
+	float XtWX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
+
+	math_matrix_zero(XtWX, matrix_size);
+	/* Construct Xt*W*X matrix (and fill weight cache, if used). */
+	FOR_PIXEL_WINDOW {
+		float3 color = filter_get_pixel_color(pixel_buffer + color_passes.x, pass_stride);
+		float variance = filter_get_pixel_variance(pixel_buffer + color_passes.x, pass_stride);
+		if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) {
+#ifdef WEIGHT_CACHING_CUDA
+			if(cache_idx < CUDA_WEIGHT_CACHE_SIZE) weight_cache[cache_idx] = 0.0f;
+#elif defined(WEIGHT_CACHING_CPU)
+			weight_cache[cache_idx] = 0.0f;
+#endif
+			continue;
+		}
+
+#ifdef WEIGHTING_WLR
+		float weight = filter_get_design_row_transform_weight(pixel, pixel_buffer, feature_means, pass_stride, features, rank, design_row, transform, transform_stride, bandwidth_factor);
+#elif defined(WEIGHTING_NLM)
+		filter_get_design_row_transform(pixel, pixel_buffer, feature_means, pass_stride, features, rank, design_row, transform, transform_stride);
+		float weight = nlm_weight(x, y, pixel.x, pixel.y, center_buffer + color_passes.y, pixel_buffer + color_passes.y, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
+#else /* WEIGHTING_NFOR */
+		filter_get_design_row(pixel, pixel_buffer, feature_means, feature_scales, pass_stride, design_row);
+		float weight = nlm_weight(x, y, pixel.x, pixel.y, center_buffer + color_passes.y, pixel_buffer + color_passes.y, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
+#endif
+		if(weight < 1e-5f) {
+#ifdef WEIGHT_CACHING_CUDA
+			if(cache_idx < CUDA_WEIGHT_CACHE_SIZE) weight_cache[cache_idx] = 0.0f;
+#elif defined(WEIGHT_CACHING_CPU)
+			weight_cache[cache_idx]

@@ Diff output truncated at 10240 characters. @@