[Bf-blender-cvs] [3830bf4] soc-2016-cycles_denoising: Cycles: Add experimental alternative SVD-threshold norm based on power iteration

Fri Jul 8 04:31:33 CEST 2016

Commit: 3830bf4bbc812aea436102e45970f13f417a2174
Author: Lukas Stockner
Date:   Thu Jul 7 00:50:45 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB3830bf4bbc812aea436102e45970f13f417a2174

Cycles: Add experimental alternative SVD-threshold norm based on power iteration

One of the first steps of the algorithm is to apply a truncated SVD to a
matrix formed by the features in order to reduce the dimensionality of the
problem and decorrelate dimensions.

The truncation threshold, according to the paper, is defined by twice the
spectral norm of the matrix that is formed like the first one, but from the
variances of the features.

The reference implementation doesn't compute the spectral norm, but instead
computes the Frobenius norm and multiplies it with sqrt(Rank)/2.
That makes sense since it's guaranteed that the Frobenius norm lies somewhere
between the one and sqrt(Rank) times the spectral norm.

However, it's still an approximation. Therefore, in this commit I've tried
to directly compute the spectral norm since the runtime performance is currently
mainly limited by the per-pixel loops, so a small constant increase shouldn't matter too much.
In order to compute it, the code just constructs the Gramian matrix of the variance matrix
and computes the square root of its largest eigenvalue, which is found via power iteration.

I haven't tested the code too much yet, but it seems that the improvement is quite negligible
and not really worth the effort. Still, it might be interesting to tweak it further.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/kernel_filter.h
A	intern/cycles/kernel/kernel_filter.h.orig
A	intern/cycles/kernel/kernel_filter.h.rej
M	intern/cycles/kernel/kernel_types.h
A	intern/cycles/render/denoising.cpp
A	intern/cycles/render/denoising.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 83ef715..7e5162f 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -345,8 +345,14 @@ public:
 					}
 				}
 #ifdef WITH_CYCLES_DEBUG_FILTER
-				for(int i = 0; i < DENOISE_FEATURES; i++)
+				for(int i = 0; i < DENOISE_FEATURES; i++) {
+					debug_write_pfm(string_printf("debug_%dx%d_mean_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].means[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
+					debug_write_pfm(string_printf("debug_%dx%d_scale_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].scales[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
+					debug_write_pfm(string_printf("debug_%dx%d_singular_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].singular[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
 					debug_write_pfm(string_printf("debug_%dx%d_bandwidth_%d.pfm", tile.x, tile.y, i).c_str(), &storages[0].bandwidth[i], tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
+				}
+				debug_write_pfm(string_printf("debug_%dx%d_singular_threshold.pfm", tile.x, tile.y).c_str(), &storages[0].singular_threshold, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
+				debug_write_pfm(string_printf("debug_%dx%d_feature_matrix_norm.pfm", tile.x, tile.y).c_str(), &storages[0].feature_matrix_norm, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
 				debug_write_pfm(string_printf("debug_%dx%d_global_bandwidth.pfm", tile.x, tile.y).c_str(), &storages[0].global_bandwidth, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
 				debug_write_pfm(string_printf("debug_%dx%d_filtered_global_bandwidth.pfm", tile.x, tile.y).c_str(), &storages[0].filtered_global_bandwidth, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
 				debug_write_pfm(string_printf("debug_%dx%d_sum_weight.pfm", tile.x, tile.y).c_str(), &storages[0].sum_weight, tile.w, tile.h, sizeof(FilterStorage)/sizeof(float), tile.w);
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index b6984a5..00c6509 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -128,7 +128,7 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 	storage += (y-filter_rect.y)*(filter_rect.z-filter_rect.x) + (x-filter_rect.x);
 
 	/* Temporary storage, used in different steps of the algorithm. */
-	float tempmatrix[(2*DENOISE_FEATURES+1)*(2*DENOISE_FEATURES+1)], tempvector[2*DENOISE_FEATURES+1];
+	float tempmatrix[(2*DENOISE_FEATURES+1)*(2*DENOISE_FEATURES+1)], tempvector[4*DENOISE_FEATURES+1];
 	float *buffer, features[DENOISE_FEATURES];
 
 	/* === Get center pixel color and variance. === */
@@ -187,6 +187,10 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
 	float* feature_matrix = tempmatrix, feature_matrix_norm = 0.0f;
 	math_matrix_zero_lower(feature_matrix, DENOISE_FEATURES);
+#ifdef FULL_EIGENVALUE_NORM
+	float *perturbation_matrix = tempmatrix + DENOISE_FEATURES*DENOISE_FEATURES;
+	math_matrix_zero_lower(perturbation_matrix, DENOISE_FEATURES);
+#endif
 	FOR_PIXEL_WINDOW {
 		filter_get_features(px, py, buffer, sample, features, feature_means);
 		for(int i = 0; i < FEATURE_PASSES; i++)
@@ -194,14 +198,26 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 		math_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
 
 		filter_get_feature_variance(px, py, buffer, sample, features, feature_scale);
+#ifdef FULL_EIGENVALUE_NORM
+		math_add_gramian(perturbation_matrix, DENOISE_FEATURES, features, 1.0f);
+#else
 		for(int i = 0; i < FEATURE_PASSES; i++)
 			feature_matrix_norm += features[i];
+#endif
 	} END_FOR_PIXEL_WINDOW
 	math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
 
 	float *feature_transform = &storage->transform[0], *singular = tempvector + DENOISE_FEATURES;
 	int rank = svd(feature_matrix, feature_transform, singular, DENOISE_FEATURES);
+
+#ifdef FULL_EIGENVALUE_NORM
+	float *eigenvector_guess = tempvector + 2*DENOISE_FEATURES;
+	for(int i = 0; i < DENOISE_FEATURES; i++)
+		eigenvector_guess[i] = 1.0f;
+	float singular_threshold = 0.01f + 2.0f * sqrtf(math_largest_eigenvalue(perturbation_matrix, DENOISE_FEATURES, eigenvector_guess, tempvector + 3*DENOISE_FEATURES));
+#else
 	float singular_threshold = 0.01f + 2.0f * (sqrtf(feature_matrix_norm) / (sqrtf(rank) * 0.5f));
+#endif
 
 	rank = 0;
 	for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
@@ -213,6 +229,16 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 			feature_transform[rank*DENOISE_FEATURES + j] *= feature_scale[j];
 	}
 
+#ifdef WITH_CYCLES_DEBUG_FILTER
+	storage->feature_matrix_norm = feature_matrix_norm;
+	storage->singular_threshold = singular_threshold;
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		storage->means[i] = feature_means[i];
+		storage->scales[i] = feature_scale[i];
+		storage->singular[i] = sqrtf(singular[i]);
+	}
+#endif
+
 	/* From here on, the mean of the features will be shifted to the central pixel's values. */
 	filter_get_features(x, y, center_buffer, sample, feature_means, NULL);
 
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h.orig
similarity index 93%
copy from intern/cycles/kernel/kernel_filter.h
copy to intern/cycles/kernel/kernel_filter.h.orig
index b6984a5..7209431 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h.orig
@@ -185,8 +185,9 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 	/* === Generate the feature transformation. ===
 	 * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
 	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
-	float* feature_matrix = tempmatrix, feature_matrix_norm = 0.0f;
+	float *feature_matrix = tempmatrix, *perturbation_matrix = tempmatrix + DENOISE_FEATURES*DENOISE_FEATURES;
 	math_matrix_zero_lower(feature_matrix, DENOISE_FEATURES);
+	math_matrix_zero_lower(perturbation_matrix, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW {
 		filter_get_features(px, py, buffer, sample, features, feature_means);
 		for(int i = 0; i < FEATURE_PASSES; i++)
@@ -194,14 +195,26 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 		math_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
 
 		filter_get_feature_variance(px, py, buffer, sample, features, feature_scale);
-		for(int i = 0; i < FEATURE_PASSES; i++)
-			feature_matrix_norm += features[i];
+		math_add_gramian(perturbation_matrix, DENOISE_FEATURES, features, 1.0f);
 	} END_FOR_PIXEL_WINDOW
 	math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
 
 	float *feature_transform = &storage->transform[0], *singular = tempvector + DENOISE_FEATURES;
 	int rank = svd(feature_matrix, feature_transform, singular, DENOISE_FEATURES);
-	float singular_threshold = 0.01f + 2.0f * (sqrtf(feature_matrix_norm) / (sqrtf(rank) * 0.5f));
+
+	float *eigenvector_guess = tempvector + DENOISE_FEATURES;
+	for(int i = 0; i < DENOISE_FEATURES; i++)
+		eigenvector_guess[i] = 1.0f;
+	float singular_threshold = 0.01f + 2.0f * sqrtf(math_largest_eigenvalue(perturbation_matrix, DENOISE_FEATURES, eigenvector_guess, tempvector + 2*DENOISE_FEATURES));
+	if (x%100== 0 && y%100 == 0) {
+		for(int r = 0; r < DENOISE_FEATURES; r++) {
+			int c;
+			for(c = 0; c <= r; c++) printf("%f ", (double) perturbation_matrix[r*DENOISE_FEATURES+c]);
+			for(; c < DENOISE_FEATURES; c++) printf("%f ", (double) perturbation_matrix[c*DENOISE_FEATURES+r]);
+			printf("\n");
+		}
+		printf("Singular val: %f\n", (double) (0.5f*(singular_threshold - 0.01f)));
+	}
 
 	rank = 0;
 	for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
@@ -213,6 +226,16 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 			feature_transform[rank*DENOISE_FEATURES + j] *= feature_scale[j];
 	}
 
+#ifdef WITH_CYCLES_DEBUG_FILTER
+	storage->feature_matrix_norm = 0.0f;//feature_matrix_norm;
+	storage->singular_threshold = singular_threshold;
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		storage->means[i] = feature_means[i];
+		storage->scales[i] = feature_scale[i];
+		storage->singular[i] = sqrtf(singular[i]);
+	}
+#endif
+
 	/* From here on, the mean of the features will be shifted to the central pixel's values. */
 	filter_get_features(x, y, center_buffer, sample, feature_means, NULL);
 
diff --git a/intern/cycles/kernel/kernel_filter.h.rej b/intern/cycles/kernel/kernel_filter.h.rej
new file mode 100644
index 0000000..0646560
--- /dev/null
+++ b/intern/cycles/kernel/kernel_filter.h.rej
@@ -0,0 +1,43 @@
+--- intern/cycles/kernel/kernel_filter.h
++++ intern/cycles/kernel/kernel_filter.h
+@@ -185,9 +185,8 @@
+        /* === Generate the feature transformation. ===
+         * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+         * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+-       float *feature_matrix = tempmatrix, *perturbation_matrix = tempmatrix + DENOISE_FEATURES*DENOISE_FEATURES;
++       float* feature_matrix = tempmatrix, feature_matrix_norm = 0.0f;
+        math_matrix_zero_lower(feature_matrix, DENOISE_FEATURES);
+-       math_matrix_zero_lower(perturbation_matrix, DENOISE_FEATURES);
+        FOR_PIXEL_WINDOW {
+                filter_get_features(px, py, buffer, sample, features, feature_means);
+                for(int i = 0; i < FEATURE_PASSES; i++)
+@@ -195,26 +194,14 @@
+                math_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ 
+                filter_get_feature_variance(px, py, buffer, sample, features, feature_scale);
+-               math_add_gramian(perturbation_matrix, DENOISE_FEATURES, features, 1.0f);
++               for(int i = 0; i < FEATURE_PASSES; i++)
++                       feature_matrix_norm += features[i];


@@ Diff output truncated at 10240 characters. @@