[Bf-blender-cvs] [0fe30e1] soc-2016-cycles_denoising: Cycles: Cache NLM weights

Tue Nov 22 04:25:28 CET 2016

Commit: 0fe30e156f289497f6354cc54481006b57bd5b8e
Author: Lukas Stockner
Date:   Thu Nov 17 13:23:30 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB0fe30e156f289497f6354cc54481006b57bd5b8e

Cycles: Cache NLM weights

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/kernel_filter.h
M	intern/cycles/kernel/kernel_filter_pre.h
M	intern/cycles/kernel/kernel_filter_util.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c2b6ea7..821879d 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -140,7 +140,7 @@ public:
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_construct_transform_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_estimate_wlr_params_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, int*, int*)>              filter_final_pass_wlr_kernel;
-	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, int*, int*)>              filter_final_pass_nlm_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, float*, int*, int*)>      filter_final_pass_nlm_kernel;
 	KernelFunctions<void(*)(int, int, float**, float**, float**, float**, int*, int, int, float, float)>              filter_non_local_means_3_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, float*, int, int, int, int, float, float*, int*)>                         filter_old_1_kernel;
 	KernelFunctions<void(*)(KernelGlobals*, float*, float*, int, int, int, int, int, int, float, float*, int*, int*)> filter_old_2_kernel;
@@ -495,12 +495,14 @@ public:
 			}
 		}
 		else if(nlm_weights) {
+			float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
 			for(int y = 0; y < filter_area.w; y++) {
 				for(int x = 0; x < filter_area.z; x++) {
 					filter_construct_transform_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, storage + y*filter_area.z + x, &rect.x);
-					filter_final_pass_nlm_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, &filter_area.x, &rect.x);
+					filter_final_pass_nlm_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
 				}
 			}
+			delete[] weight_cache;
 		}
 		else {
 			for(int y = 0; y < filter_area.w; y++) {
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index 27c1387..b8f6c42 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -494,6 +494,9 @@ ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, floa
 
 	/* === Calculate the final pixel color. === */
 	float XtX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
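+	/* Cache up to 441 (= 21*21) weights; pixels whose cache_idx is beyond that get their weight recomputed in the later passes, since a larger cache would raise memory requirements too far. */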
+#define WEIGHT_CACHE_SIZE 441
+	float weight_cache[WEIGHT_CACHE_SIZE];
 
 	int matrix_size = rank+1;
 	math_matrix_zero_lower(XtX, matrix_size);
@@ -501,14 +504,21 @@ ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, floa
 	FOR_PIXEL_WINDOW {
 		float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
 		float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
-		if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+		if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) {
+			if(cache_idx < WEIGHT_CACHE_SIZE) weight_cache[cache_idx] = 0.0f;
+			continue;
+		}
 
 		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
 		filter_fill_design_row_no_weight_cuda(features, rank, design_row, transform, transform_stride);
 
 		float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
-		if(weight == 0.0f) continue;
+		if(weight < 1e-5f) {
+			if(cache_idx < WEIGHT_CACHE_SIZE) weight_cache[cache_idx] = 0.0f;
+			continue;
+		}
 		weight /= max(1.0f, variance);
+		if(cache_idx < WEIGHT_CACHE_SIZE) weight_cache[cache_idx] = weight;
 
 		math_add_gramian(XtX, matrix_size, design_row, weight);
 	} END_FOR_PIXEL_WINDOW
@@ -544,17 +554,26 @@ ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, floa
 	float3 solution[DENOISE_FEATURES+1];
 	math_vec3_zero(solution, solution_size);
 	FOR_PIXEL_WINDOW {
-		float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
-		float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
-		if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+		float weight;
+		float3 color;
+		if(cache_idx < WEIGHT_CACHE_SIZE) {
+			weight = weight_cache[cache_idx];
+			if(weight == 0.0f) continue;
+			color = filter_get_pixel_color(pixel_buffer, pass_stride);
+		}
+		else {
+			color = filter_get_pixel_color(pixel_buffer, pass_stride);
+			float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
+			if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+
+			weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
+			if(weight < 1e-5f) continue;
+			weight /= max(1.0f, variance);
+		}
 
 		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
 		filter_fill_design_row_no_weight_cuda(features, rank, design_row, transform, transform_stride);
 
-		float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
-		if(weight == 0.0f) continue;
-		weight /= max(1.0f, variance);
-
 		for(int i = 0; i < solution_size; i++) {
 			float XtWXinvXt = math_dot(XtWXinv + i*matrix_size, design_row, matrix_size);
 			solution[i] += XtWXinvXt*weight*color;
@@ -563,17 +582,24 @@ ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, floa
 
 	if(kernel_data.integrator.use_collaborative_filtering) {
 		FOR_PIXEL_WINDOW {
-			float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
-			float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
-			if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+			float weight;
+			if(cache_idx < WEIGHT_CACHE_SIZE) {
+				weight = weight_cache[cache_idx];
+				if(weight == 0.0f) continue;
+			}
+			else {
+				float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
+				float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
+				if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+
+				weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
+				if(weight < 1e-5f) continue;
+				weight /= max(1.0f, variance);
+			}
 
 			filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
 			filter_fill_design_row_no_weight_cuda(features, rank, design_row, transform, transform_stride);
 
-			float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
-			if(weight == 0.0f) continue;
-			weight /= max(1.0f, variance);
-
 			float3 reconstruction = math_dot_vec3(design_row, solution, matrix_size);
 			if(py >= filter_area.y && py < filter_area.y+filter_area.w && px >= filter_area.x && px < filter_area.x+filter_area.z) {
 			float *combined_buffer = buffers + (offset + py*stride + px)*kernel_data.film.pass_stride;
@@ -1354,7 +1380,7 @@ ccl_device void kernel_filter_final_pass_wlr(KernelGlobals *kg, int sample, floa
 	}
 }
 
-ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, FilterStorage *storage, int4 filter_area, int4 rect)
+ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, FilterStorage *storage, float *weight_cache, int4 filter_area, int4 rect)
 {
 	int buffer_w = align_up(rect.z - rect.x, 4);
 	int buffer_h = (rect.w - rect.y);
@@ -1402,14 +1428,21 @@ ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, floa
 	FOR_PIXEL_WINDOW {
 		float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
 		float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
-		if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+		if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) {
+			weight_cache[cache_idx] = 0.0f;
+			continue;
+		}
 
 		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
 		filter_fill_design_row_no_weight(features, rank, design_row, feature_transform);
 
 		float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
-		if(weight < 1e-5f) continue;
+		if(weight < 1e-5f) {
+			weight_cache[cache_idx] = 0.0f;
+			continue;
+		}
 		weight /= max(1.0f, variance);
+		weight_cache[cache_idx] = weight;
 
 		math_add_gramian(XtX, matrix_size, design_row, weight);
 	} END_FOR_PIXEL_WINDOW
@@ -1443,16 +1476,13 @@ ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, floa
 	float3 solution[DENOISE_FEATURES+1];
 	math_vec3_zero(solution, matrix_size);
 	FOR_PIXEL_WINDOW {
-		float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
-		float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
-		if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+		float weight = weight_cache[cache_idx];
+		if(weight == 0.0f) continue;
 
 		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
 		filter_fill_design_row_no_weight(features, rank, design_row, feature_transform);
+		float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
 
-		float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
-		if(weight < 1e-5f) continue;
-		weight /= max(1.0f, variance);
 	

@@ Diff output truncated at 10240 characters. @@