[Bf-blender-cvs] [005d83a] soc-2016-cycles_denoising: Cycles: Add additional debugging output containing the estimated rMSE reduction per sample

Fri Jul 8 04:31:37 CEST 2016

Commit: 005d83a0bb076d280837f16f9bc435c4d6b4b1de
Author: Lukas Stockner
Date:   Thu Jul 7 20:12:35 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB005d83a0bb076d280837f16f9bc435c4d6b4b1de

Cycles: Add additional debugging output containing the estimated rMSE reduction per sample

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/kernel_filter.h
M	intern/cycles/kernel/kernel_passes.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c296c17..1f7e750 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -345,17 +345,21 @@ public:
 					}
 				}
 #ifdef WITH_CYCLES_DEBUG_FILTER
+				/* WRITE_DEBUG adds the "debug_<x>x<y>_" prefix and the ".pfm" extension itself, so pass the bare name only. */
+#define WRITE_DEBUG(name, var) debug_write_pfm(string_printf("debug_%dx%d_%s.pfm", tile.x, tile.y, name).c_str(), &storages[0].var, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w)
 				for(int i = 0; i < DENOISE_FEATURES; i++) {
-					debug_write_pfm(string_printf("debug_%dx%d_mean_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].means[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
-					debug_write_pfm(string_printf("debug_%dx%d_scale_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].scales[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
-					debug_write_pfm(string_printf("debug_%dx%d_singular_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].singular[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
-					debug_write_pfm(string_printf("debug_%dx%d_bandwidth_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].bandwidth[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
+					WRITE_DEBUG(string_printf("mean_%d", i).c_str(), means[i]);
+					WRITE_DEBUG(string_printf("scale_%d", i).c_str(), scales[i]);
+					WRITE_DEBUG(string_printf("singular_%d", i).c_str(), singular[i]);
+					WRITE_DEBUG(string_printf("bandwidth_%d", i).c_str(), bandwidth[i]);
 				}
-				debug_write_pfm(string_printf("debug_%dx%d_singular_threshold.pfm", tile.x, tile.y).c_str(), &storages[0].singular_threshold, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
-				debug_write_pfm(string_printf("debug_%dx%d_feature_matrix_norm.pfm", tile.x, tile.y).c_str(), &storages[0].feature_matrix_norm, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
-				debug_write_pfm(string_printf("debug_%dx%d_global_bandwidth.pfm", tile.x, tile.y).c_str(), &storages[0].global_bandwidth, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
-				debug_write_pfm(string_printf("debug_%dx%d_filtered_global_bandwidth.pfm", tile.x, tile.y).c_str(), &storages[0].filtered_global_bandwidth, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
-				debug_write_pfm(string_printf("debug_%dx%d_sum_weight.pfm", tile.x, tile.y).c_str(), &storages[0].sum_weight, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
+				WRITE_DEBUG("singular_threshold", singular_threshold);
+				WRITE_DEBUG("feature_matrix_norm", feature_matrix_norm);
+				WRITE_DEBUG("global_bandwidth", global_bandwidth);
+				WRITE_DEBUG("filtered_global_bandwidth", filtered_global_bandwidth);
+				WRITE_DEBUG("sum_weight", sum_weight);
+				WRITE_DEBUG("log_rmse_per_sample", log_rmse_per_sample);
+#undef WRITE_DEBUG
 #endif
 				tile.sample = sample;
 			}
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 912465b..c0bf5c1 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -831,13 +831,22 @@ public:
 #ifdef WITH_CYCLES_DEBUG_FILTER
 		FilterStorage *host_storage = new FilterStorage[filter_w*filter_h];
 		cuda_assert(cuMemcpyDtoH(host_storage, d_storage, sizeof(FilterStorage)*filter_w*filter_h));
-		std::string prefix = string_printf("debug_%dx%d_cuda", rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan);
-		for(int i = 0; i < DENOISE_FEATURES; i++)
-			debug_write_pfm(string_printf("%s_bandwidth_%d.pfm", prefix.c_str(), i).c_str(), &host_storage[0].bandwidth[i], filter_w, filter_h, sizeof(FilterStorage)/sizeof(float), filter_w);
-		debug_write_pfm(string_printf("%s_global_bandwidth.pfm", prefix.c_str()).c_str(), &host_storage[0].global_bandwidth, filter_w, filter_h, sizeof(FilterStorage)/sizeof(float), filter_w);
-		debug_write_pfm(string_printf("%s_filtered_global_bandwidth.pfm", prefix.c_str()).c_str(), &host_storage[0].filtered_global_bandwidth, filter_w, filter_h, sizeof(FilterStorage)/sizeof(float), filter_w);
-		debug_write_pfm(string_printf("%s_sum_weight.pfm", prefix.c_str()).c_str(), &host_storage[0].sum_weight, filter_w, filter_h, sizeof(FilterStorage)/sizeof(float), filter_w);
+		/* WRITE_DEBUG adds the "debug_<x>x<y>_cuda_" prefix and the ".pfm" extension itself, so pass the bare name only. */
+#define WRITE_DEBUG(name, var) debug_write_pfm(string_printf("debug_%dx%d_cuda_%s.pfm", rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan, name).c_str(), &host_storage[0].var, filter_w, filter_h, sizeof(FilterStorage)/sizeof(float), filter_w)
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			WRITE_DEBUG(string_printf("mean_%d", i).c_str(), means[i]);
+			WRITE_DEBUG(string_printf("scale_%d", i).c_str(), scales[i]);
+			WRITE_DEBUG(string_printf("singular_%d", i).c_str(), singular[i]);
+			WRITE_DEBUG(string_printf("bandwidth_%d", i).c_str(), bandwidth[i]);
+		}
+		WRITE_DEBUG("singular_threshold", singular_threshold);
+		WRITE_DEBUG("feature_matrix_norm", feature_matrix_norm);
+		WRITE_DEBUG("global_bandwidth", global_bandwidth);
+		WRITE_DEBUG("filtered_global_bandwidth", filtered_global_bandwidth);
+		WRITE_DEBUG("sum_weight", sum_weight);
+		WRITE_DEBUG("log_rmse_per_sample", log_rmse_per_sample);
 		delete[] host_storage;
+#undef WRITE_DEBUG
 #endif
 
 		cuda_assert(cuMemFree(d_storage));
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index 00c6509..ad3d256 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -363,12 +363,21 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 
 
 	/* === Estimate optimal global bandwidth. === */
-	double bias_coef = math_lsq_solve(lsq_bias);
-	double variance_coef = math_lsq_solve(lsq_variance);
+	double bias_coef = math_lsq_solve(lsq_bias, NULL);
+	double variance_zeroth;
+	double variance_coef = math_lsq_solve(lsq_variance, &variance_zeroth);
+	if(variance_coef < 0.0) {
+		variance_coef = -variance_coef;
+		variance_zeroth = 0.0;
+	}
 	float optimal_bw = (float) pow((rank * variance_coef) / (4.0 * bias_coef*bias_coef * sample), 1.0 / (rank + 4));
 
-
-
+#ifdef WITH_CYCLES_DEBUG_FILTER
+	double h2 = ((double) optimal_bw) * ((double) optimal_bw);
+	double bias = bias_coef*h2;
+	double variance = (variance_zeroth + variance_coef*pow(optimal_bw, -rank)) / sample;
+	storage->log_rmse_per_sample = ( (float) log(max(bias*bias + variance, 1e-20)) - 4.0f*logf(sample)/(rank + 4) );
+#endif
 
 	/* === Store the calculated data for the second kernel. === */
 	storage->rank = rank;
@@ -504,6 +513,10 @@ ccl_device void kernel_filter_final_pass(KernelGlobals *kg, int sample, float **
 	center_buffer[0] = final_color.x;
 	center_buffer[1] = final_color.y;
 	center_buffer[2] = final_color.z;
+
+#ifdef WITH_CYCLES_DEBUG_FILTER
+	storage->log_rmse_per_sample -= 2.0f * logf(linear_rgb_to_gray(final_color) + 0.001f);
+#endif
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index b80bdbc..18db922 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -161,7 +161,7 @@ ccl_device_inline void kernel_write_denoising_passes(KernelGlobals *kg, ccl_glob
 	else {
 		kernel_write_pass_float3_var(buffer, sample, make_float3(0.0f, 0.0f, 0.0f));
 		kernel_write_pass_float3_var(buffer + 6, sample, world_albedo);
-		kernel_write_pass_float_var(buffer + 12, sample, 1e10f);
+		kernel_write_pass_float_var(buffer + 12, sample, 0.0f);
 	}
 
 	state->flag |= PATH_RAY_DENOISING_PASS_DONE;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 15dbcae..7ad03d8 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1286,6 +1286,7 @@ typedef struct FilterStorage {
 	float sum_weight;
 	float means[DENOISE_FEATURES], scales[DENOISE_FEATURES], singular[DENOISE_FEATURES];
 	float singular_threshold, feature_matrix_norm;
+	float log_rmse_per_sample;
 #endif
 } FilterStorage;
 
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index cb426dc..ba81e9f 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -252,9 +252,11 @@ ccl_device_inline void math_lsq_add(double *lsq, double x, double y)
 }
 
 /* Returns the first-order coefficient a of the fitted function. */
-ccl_device_inline double math_lsq_solve(double *lsq)
+ccl_device_inline double math_lsq_solve(double *lsq, double *zeroth)
 {
 	double inv_det = 1.0 / (lsq[0]*lsq[2] - lsq[1]*lsq[1] + 1e-4);
+	if(zeroth)
+		*zeroth = (lsq[2]*lsq[3] - lsq[1]*lsq[3]) * inv_det;
 	return (lsq[0]*lsq[4] - lsq[1]*lsq[3]) * inv_det;
 }