[Bf-blender-cvs] [8a65d6c] soc-2016-cycles_denoising: Cycles: Replace T-SVD algorithm with new Jacobi Eigendecomposition solver

Lukas Stockner noreply at git.blender.org
Tue Dec 20 16:06:54 CET 2016


Commit: 8a65d6cf11f727ab0e99a0433edd3ce1e3cf09db
Author: Lukas Stockner
Date:   Tue Dec 20 05:13:20 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB8a65d6cf11f727ab0e99a0433edd3ce1e3cf09db

Cycles: Replace T-SVD algorithm with new Jacobi Eigendecomposition solver

The code that was used for the T-SVD before came from the WLR reference implementation,
but had numerical problems on Windows and would often cause NaNs.

This commit replaces it with a new implementation using Eigendecomposition based on the Jacobi Eigenvalue Method.
That should:
- Give a slight performance boost (probably not noticable, since the T-SVD was no bottleneck to begin with)
- Improve numerical accuracy of the results (not very important either since the eigenvalues are only compared against a threshold)
- FINALLY solve the black spot issue on Windows
- Slightly reduce memory usage (singular values are now constructed on the diagonal of the input matrix) with the potential of more in the future (now only the lower-triangular part is required).
- Resolve potential licensing issues - the specific file containing the original code didn't come with any licensing information, and the main file contains an apparently custom license...

===================================================================

M	intern/cycles/kernel/filter/filter_wlr.h
M	intern/cycles/kernel/filter/filter_wlr_cuda.h
M	intern/cycles/kernel/filter/filter_wlr_sse.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/kernel/filter/filter_wlr.h b/intern/cycles/kernel/filter/filter_wlr.h
index d5fab6e..2239a11 100644
--- a/intern/cycles/kernel/filter/filter_wlr.h
+++ b/intern/cycles/kernel/filter/filter_wlr.h
@@ -92,10 +92,9 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
 			feature_matrix_norm += features[i + NORM_FEATURE_OFFSET]*kernel_data.integrator.filter_strength;
 #endif
 	} END_FOR_PIXEL_WINDOW
-	math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
 
-	float *feature_transform = &storage->transform[0], *singular = tempvector + DENOISE_FEATURES;
-	int rank = svd(feature_matrix, feature_transform, singular, DENOISE_FEATURES);
+	float *feature_transform = &storage->transform[0];
+	int rank = math_jacobi_eigendecomposition(feature_matrix, feature_transform, DENOISE_FEATURES, 1);
 
 #ifdef FULL_EIGENVALUE_NORM
 	float tempvector_2[2*DENOISE_FEATURES];
@@ -109,7 +108,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
 
 	rank = 0;
 	for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
-		float s = sqrtf(fabsf(singular[i]));
+		float s = feature_matrix[i*DENOISE_FEATURES+i];
 		if(i >= 2 && s < singular_threshold)
 			break;
 		/* Bake the feature scaling into the transformation matrix. */
@@ -123,7 +122,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
 		storage->means[i] = feature_means[i];
 		storage->scales[i] = feature_scale[i];
-		storage->singular[i] = sqrtf(fabsf(singular[i]));
+		storage->singular[i] = feature_matrix[i*DENOISE_FEATURES+i];
 	}
 #endif
 	storage->rank = rank;
diff --git a/intern/cycles/kernel/filter/filter_wlr_cuda.h b/intern/cycles/kernel/filter/filter_wlr_cuda.h
index 8dcc447..81454f5 100644
--- a/intern/cycles/kernel/filter/filter_wlr_cuda.h
+++ b/intern/cycles/kernel/filter/filter_wlr_cuda.h
@@ -78,17 +78,15 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
 		for(int i = 0; i < NORM_FEATURE_NUM; i++)
 			feature_matrix_norm += features[i + NORM_FEATURE_OFFSET]*kernel_data.integrator.filter_strength;
 	} END_FOR_PIXEL_WINDOW
-	math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
 
-	float singular[DENOISE_FEATURES];
-	int rank = svd_cuda(feature_matrix, transform, transform_stride, singular, DENOISE_FEATURES);
+	int rank = math_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
 
 	float singular_threshold = 0.01f + 2.0f * (sqrtf(feature_matrix_norm) / (sqrtf(rank) * 0.5f));
 	singular_threshold *= singular_threshold;
 
 	rank = 0;
 	for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
-		float s = sqrtf(fabsf(singular[i]));
+		float s = feature_matrix[i*DENOISE_FEATURES+i];
 		if(i >= 2 && s < singular_threshold)
 			break;
 		/* Bake the feature scaling into the transformation matrix. */
@@ -103,7 +101,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
 		storage->means[i] = feature_means[i];
 		storage->scales[i] = feature_scale[i];
-		storage->singular[i] = sqrtf(fabsf(singular[i]));
+		storage->singular[i] = feature_matrix[i*DENOISE_FEATURES+i];
 	}
 #endif
 }
diff --git a/intern/cycles/kernel/filter/filter_wlr_sse.h b/intern/cycles/kernel/filter/filter_wlr_sse.h
index a0829d0..baca96f 100644
--- a/intern/cycles/kernel/filter/filter_wlr_sse.h
+++ b/intern/cycles/kernel/filter/filter_wlr_sse.h
@@ -70,16 +70,14 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
 	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
 	math_hsum_matrix_lower(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
 
-	math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
-
-	float *feature_transform = &storage->transform[0], singular[DENOISE_FEATURES];
-	int rank = svd(feature_matrix, feature_transform, singular, DENOISE_FEATURES);
+	float *feature_transform = &storage->transform[0];
+	int rank = math_jacobi_eigendecomposition(feature_matrix, feature_transform, DENOISE_FEATURES, 1);
 	float singular_threshold = 0.01f + 2.0f * (sqrtf(_mm_hsum_ss(feature_matrix_norm)) / (sqrtf(rank) * 0.5f));
 	singular_threshold *= singular_threshold;
 
 	rank = 0;
 	for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
-		float s = sqrtf(fabsf(singular[i]));
+		float s = feature_matrix[i*DENOISE_FEATURES+i];
 		if(i >= 2 && s < singular_threshold)
 			break;
 		/* Bake the feature scaling into the transformation matrix. */
@@ -94,7 +92,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
 		storage->means[i] = _mm_cvtss_f32(feature_means[i]);
 		storage->scales[i] = _mm_cvtss_f32(feature_scale[i]);
-		storage->singular[i] = sqrtf(fabsf(singular[i]));
+		storage->singular[i] = feature_matrix[i*DENOISE_FEATURES+i];
 	}
 #endif
 
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 4ea0ca3..e2e7e9d 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -21,6 +21,13 @@ CCL_NAMESPACE_BEGIN
 
 #define MAT(A, size, row, col) A[(row)*(size)+(col)]
 
+/* Variants that use a constant stride on GPUS. */
+#ifdef __KERNEL_GPU__
+#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+#else
+#define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#endif
+
 ccl_device_inline void math_matrix_zero(float *A, int n)
 {
 	for(int i = 0; i < n*n; i++)
@@ -75,76 +82,6 @@ ccl_device void math_cholesky(float *A, int n)
 	}
 }
 
-/* Perform a Singular Value decomposition on A.
- * Returns the transpose of V and the squared singular values. A is destroyed in the process.
- * Note: Still buggy, therefore the old version from the proof-of-concept implementation is used for now. */
-ccl_device int math_svd(float *A, float *V, float *S2, int n)
-{
-	/* Initialize V to the identity matrix. */
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col < n; col++)
-			MAT(V, n, row, col) = (row == col)? 1.0f: 0.0f;
-
-	const float eps1 = 1e-7f;
-	const float eps2 = 10.0f * n * eps1*eps1;
-	const float eps3 = 0.1f * eps1;
-
-	int estimated_rank = n;
-	for(int rotations = n, i = 0; rotations > 0 && i < 7; i++) {
-		rotations = estimated_rank * (estimated_rank - 1)/2;
-		for(int col1 = 0; col1 < estimated_rank-1; col1++) {
-			for(int col2 = col1+1; col2 < estimated_rank; col2++) {
-				float p = 0.0f, q = 0.0f, r = 0.0f;
-				for(int row = 0; row < n; row++) {
-					float c1 = MAT(A, n, row, col1), c2 = MAT(A, n, row, col2);
-					p += c1*c2;
-					q += c1*c1;
-					r += c2*c2;
-				}
-				S2[col1] = q;
-				S2[col2] = r;
-
-				float x, y;
-				if(q >= r) {
-					if(q < eps2*S2[0] || fabsf(p) < eps3*q) {
-						rotations--;
-						continue;
-					}
-					const float invQ = 1.0f / q;
-					p *= invQ;
-					r = 1.0f - r*invQ;
-					const float pr = p*r;
-					const float invVt = 1.0f / sqrtf(4.0f * pr*pr);
-					x = sqrtf(0.5f * (1.0f + r*invVt));
-					y = p*invVt / x;
-				}
-				else {
-					const float invR = 1.0f / r;
-					p *= invR;
-					q = q*invR - 1.0f;
-					const float pq = p*q;
-					const float invVt = 1.0f / sqrtf(4.0f * pq*pq);
-					y = sqrtf(0.5f * (1.0f - q*invVt));
-					if(p < 0.0f) y = -y;
-					x = p*invVt / y;
-				}
-
-#define ROT(A, n, row1, row2, col1, col2, x, y) { float c1 = MAT(A, n, row1, col1), c2 = MAT(A, n, row2, col2); MAT(A, n, row1, col1) = c1*(x)+c2*(y); MAT(A, n, row2, col2) = -c1*(y)+c2*(x); }
-				for(int row = 0; row < n; row++) {
-					ROT(A, n, row, row, col1, col2, x, y);
-					/* V is stored as its transpose. */
-					ROT(V, n, col1, col2, row, row, x, y);
-				}
-#undef ROT
-			}
-		}
-		while(estimated_rank > 2 && S2[estimated_rank-1] < (S2[0] + eps3)*eps3)
-			estimated_rank--;
-	}
-
-	return estimated_rank;
-}
-
 ccl_device_inline void math_matrix_add_diagonal(float *A, int n, float val)
 {
 	for(int row = 0; row < n; row++)
@@ -167,13 +104,6 @@ ccl_device_inline void math_add_vec3(float3 *v, int n, float *x, float3 w)
 		v[i] += w*x[i];
 }
 
-ccl_device_inline void math_lower_tri_to_full(float *A, int n)
-{
-	for(int row = 0; row < n; row++)
-		for(int col = row+1; col < n; col++)
-			MAT(A, n, row, col) = MAT(A, n, col, row);
-}
-
 ccl_device_inline float math_dot(float ccl_readonly_ptr a, float ccl_readonly_ptr b, int n)
 {
 	float d = 0.0f;
@@ -297,195 +227,138 @@ ccl_device float math_largest_eigenvalue(float *A, int n, float *vec, float *tmp
 	return 0.0f;
 }
 
-/* TODO(lukas): Fix new code and remove this. */
-ccl_device int svd(float *A, float *V, float *S2, int n)
+/* Perform the Jacobi Eigenvalue Methon on matrix A.
+ * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
+ * The algorithm overwrites the contents of A.
+ *
+ * After returning, A will be overwritten with D, which is (almost) diagonal,
+ * and V will contain the eigenvectors of the original A in its rows (!),
+ * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
+ *
+ * Additionally, the function returns an estimate of the rank of A.
+ */
+ccl_device int math_jacobi_eigendecomposition(float *A, float *V, int n, int v_stride)
 {
-    int  i, j, k, EstColRank = n, RotCount = n, SweepCount = 0;
-    int slimit = 8;
-    float eps = 1e-8f;
-    float e2 = 10.f * n * eps * eps;
-    float tol = 0.1f * eps;
-    float vt, p, x0, y0, q, r, c0, s0, d1, d2;
-
-    for(int r = 0; r < n; r++)
-        for(int c = 0; c < n; c++)
-            V[r*n+c] = (c == r)? 1.0f: 0.0f;
-
-    while (RotCount != 0 && SweepCount++ <= slimit) {
-        RotCount = EstColRank * (EstColRank - 1) / 2;
-
-        for (j = 0; j < EstColRank-1; ++j) {
-            for (k = j+1; k < EstColRank; ++k) {
-                p = q = r = 0.0;
-
-                for (i = 0; i < n; ++i) {
-                    x0 = A[i * n + j];
-                    y0 = A[i * n + k];
-                    p += x0 * y0;
-                    q += x0 * x0;
-                    r += y0 * y0;
-                }
-
-                S2[j] = q;
-                S2[k] = r;
-
-                if (q >= r) {
-                    if (q <

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list