[Bf-blender-cvs] [8a65d6c] soc-2016-cycles_denoising: Cycles: Replace T-SVD algorithm with new Jacobi Eigendecomposition solver
Lukas Stockner
noreply at git.blender.org
Tue Dec 20 16:06:54 CET 2016
Commit: 8a65d6cf11f727ab0e99a0433edd3ce1e3cf09db
Author: Lukas Stockner
Date: Tue Dec 20 05:13:20 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB8a65d6cf11f727ab0e99a0433edd3ce1e3cf09db
Cycles: Replace T-SVD algorithm with new Jacobi Eigendecomposition solver
The code that was used for the T-SVD before came from the WLR reference implementation,
but had numerical problems on Windows and would often cause NaNs.
This commit replaces it with a new implementation using Eigendecomposition based on the Jacobi Eigenvalue Method.
That should:
- Give a slight performance boost (probably not noticable, since the T-SVD was no bottleneck to begin with)
- Improve numerical accuracy of the results (not very important either since the eigenvalues are only compared against a threshold)
- FINALLY solve the black spot issue on Windows
- Slightly reduce memory usage (singular values are now constructed on the diagonal of the input matrix) with the potential of more in the future (now only the lower-triangular part is required).
- Resolve potential licensing issues - the specific file containing the original code didn't come with any licensing information, and the main file contains an apparently custom license...
===================================================================
M intern/cycles/kernel/filter/filter_wlr.h
M intern/cycles/kernel/filter/filter_wlr_cuda.h
M intern/cycles/kernel/filter/filter_wlr_sse.h
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/kernel/filter/filter_wlr.h b/intern/cycles/kernel/filter/filter_wlr.h
index d5fab6e..2239a11 100644
--- a/intern/cycles/kernel/filter/filter_wlr.h
+++ b/intern/cycles/kernel/filter/filter_wlr.h
@@ -92,10 +92,9 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
feature_matrix_norm += features[i + NORM_FEATURE_OFFSET]*kernel_data.integrator.filter_strength;
#endif
} END_FOR_PIXEL_WINDOW
- math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
- float *feature_transform = &storage->transform[0], *singular = tempvector + DENOISE_FEATURES;
- int rank = svd(feature_matrix, feature_transform, singular, DENOISE_FEATURES);
+ float *feature_transform = &storage->transform[0];
+ int rank = math_jacobi_eigendecomposition(feature_matrix, feature_transform, DENOISE_FEATURES, 1);
#ifdef FULL_EIGENVALUE_NORM
float tempvector_2[2*DENOISE_FEATURES];
@@ -109,7 +108,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
rank = 0;
for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
- float s = sqrtf(fabsf(singular[i]));
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && s < singular_threshold)
break;
/* Bake the feature scaling into the transformation matrix. */
@@ -123,7 +122,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
for(int i = 0; i < DENOISE_FEATURES; i++) {
storage->means[i] = feature_means[i];
storage->scales[i] = feature_scale[i];
- storage->singular[i] = sqrtf(fabsf(singular[i]));
+ storage->singular[i] = feature_matrix[i*DENOISE_FEATURES+i];
}
#endif
storage->rank = rank;
diff --git a/intern/cycles/kernel/filter/filter_wlr_cuda.h b/intern/cycles/kernel/filter/filter_wlr_cuda.h
index 8dcc447..81454f5 100644
--- a/intern/cycles/kernel/filter/filter_wlr_cuda.h
+++ b/intern/cycles/kernel/filter/filter_wlr_cuda.h
@@ -78,17 +78,15 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
for(int i = 0; i < NORM_FEATURE_NUM; i++)
feature_matrix_norm += features[i + NORM_FEATURE_OFFSET]*kernel_data.integrator.filter_strength;
} END_FOR_PIXEL_WINDOW
- math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
- float singular[DENOISE_FEATURES];
- int rank = svd_cuda(feature_matrix, transform, transform_stride, singular, DENOISE_FEATURES);
+ int rank = math_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
float singular_threshold = 0.01f + 2.0f * (sqrtf(feature_matrix_norm) / (sqrtf(rank) * 0.5f));
singular_threshold *= singular_threshold;
rank = 0;
for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
- float s = sqrtf(fabsf(singular[i]));
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && s < singular_threshold)
break;
/* Bake the feature scaling into the transformation matrix. */
@@ -103,7 +101,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
for(int i = 0; i < DENOISE_FEATURES; i++) {
storage->means[i] = feature_means[i];
storage->scales[i] = feature_scale[i];
- storage->singular[i] = sqrtf(fabsf(singular[i]));
+ storage->singular[i] = feature_matrix[i*DENOISE_FEATURES+i];
}
#endif
}
diff --git a/intern/cycles/kernel/filter/filter_wlr_sse.h b/intern/cycles/kernel/filter/filter_wlr_sse.h
index a0829d0..baca96f 100644
--- a/intern/cycles/kernel/filter/filter_wlr_sse.h
+++ b/intern/cycles/kernel/filter/filter_wlr_sse.h
@@ -70,16 +70,14 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
math_hsum_matrix_lower(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
- math_lower_tri_to_full(feature_matrix, DENOISE_FEATURES);
-
- float *feature_transform = &storage->transform[0], singular[DENOISE_FEATURES];
- int rank = svd(feature_matrix, feature_transform, singular, DENOISE_FEATURES);
+ float *feature_transform = &storage->transform[0];
+ int rank = math_jacobi_eigendecomposition(feature_matrix, feature_transform, DENOISE_FEATURES, 1);
float singular_threshold = 0.01f + 2.0f * (sqrtf(_mm_hsum_ss(feature_matrix_norm)) / (sqrtf(rank) * 0.5f));
singular_threshold *= singular_threshold;
rank = 0;
for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
- float s = sqrtf(fabsf(singular[i]));
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && s < singular_threshold)
break;
/* Bake the feature scaling into the transformation matrix. */
@@ -94,7 +92,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
for(int i = 0; i < DENOISE_FEATURES; i++) {
storage->means[i] = _mm_cvtss_f32(feature_means[i]);
storage->scales[i] = _mm_cvtss_f32(feature_scale[i]);
- storage->singular[i] = sqrtf(fabsf(singular[i]));
+ storage->singular[i] = feature_matrix[i*DENOISE_FEATURES+i];
}
#endif
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 4ea0ca3..e2e7e9d 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -21,6 +21,13 @@ CCL_NAMESPACE_BEGIN
#define MAT(A, size, row, col) A[(row)*(size)+(col)]
+/* Variants that use a constant stride on GPUS. */
+#ifdef __KERNEL_GPU__
+#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+#else
+#define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#endif
+
ccl_device_inline void math_matrix_zero(float *A, int n)
{
for(int i = 0; i < n*n; i++)
@@ -75,76 +82,6 @@ ccl_device void math_cholesky(float *A, int n)
}
}
-/* Perform a Singular Value decomposition on A.
- * Returns the transpose of V and the squared singular values. A is destroyed in the process.
- * Note: Still buggy, therefore the old version from the proof-of-concept implementation is used for now. */
-ccl_device int math_svd(float *A, float *V, float *S2, int n)
-{
- /* Initialize V to the identity matrix. */
- for(int row = 0; row < n; row++)
- for(int col = 0; col < n; col++)
- MAT(V, n, row, col) = (row == col)? 1.0f: 0.0f;
-
- const float eps1 = 1e-7f;
- const float eps2 = 10.0f * n * eps1*eps1;
- const float eps3 = 0.1f * eps1;
-
- int estimated_rank = n;
- for(int rotations = n, i = 0; rotations > 0 && i < 7; i++) {
- rotations = estimated_rank * (estimated_rank - 1)/2;
- for(int col1 = 0; col1 < estimated_rank-1; col1++) {
- for(int col2 = col1+1; col2 < estimated_rank; col2++) {
- float p = 0.0f, q = 0.0f, r = 0.0f;
- for(int row = 0; row < n; row++) {
- float c1 = MAT(A, n, row, col1), c2 = MAT(A, n, row, col2);
- p += c1*c2;
- q += c1*c1;
- r += c2*c2;
- }
- S2[col1] = q;
- S2[col2] = r;
-
- float x, y;
- if(q >= r) {
- if(q < eps2*S2[0] || fabsf(p) < eps3*q) {
- rotations--;
- continue;
- }
- const float invQ = 1.0f / q;
- p *= invQ;
- r = 1.0f - r*invQ;
- const float pr = p*r;
- const float invVt = 1.0f / sqrtf(4.0f * pr*pr);
- x = sqrtf(0.5f * (1.0f + r*invVt));
- y = p*invVt / x;
- }
- else {
- const float invR = 1.0f / r;
- p *= invR;
- q = q*invR - 1.0f;
- const float pq = p*q;
- const float invVt = 1.0f / sqrtf(4.0f * pq*pq);
- y = sqrtf(0.5f * (1.0f - q*invVt));
- if(p < 0.0f) y = -y;
- x = p*invVt / y;
- }
-
-#define ROT(A, n, row1, row2, col1, col2, x, y) { float c1 = MAT(A, n, row1, col1), c2 = MAT(A, n, row2, col2); MAT(A, n, row1, col1) = c1*(x)+c2*(y); MAT(A, n, row2, col2) = -c1*(y)+c2*(x); }
- for(int row = 0; row < n; row++) {
- ROT(A, n, row, row, col1, col2, x, y);
- /* V is stored as its transpose. */
- ROT(V, n, col1, col2, row, row, x, y);
- }
-#undef ROT
- }
- }
- while(estimated_rank > 2 && S2[estimated_rank-1] < (S2[0] + eps3)*eps3)
- estimated_rank--;
- }
-
- return estimated_rank;
-}
-
ccl_device_inline void math_matrix_add_diagonal(float *A, int n, float val)
{
for(int row = 0; row < n; row++)
@@ -167,13 +104,6 @@ ccl_device_inline void math_add_vec3(float3 *v, int n, float *x, float3 w)
v[i] += w*x[i];
}
-ccl_device_inline void math_lower_tri_to_full(float *A, int n)
-{
- for(int row = 0; row < n; row++)
- for(int col = row+1; col < n; col++)
- MAT(A, n, row, col) = MAT(A, n, col, row);
-}
-
ccl_device_inline float math_dot(float ccl_readonly_ptr a, float ccl_readonly_ptr b, int n)
{
float d = 0.0f;
@@ -297,195 +227,138 @@ ccl_device float math_largest_eigenvalue(float *A, int n, float *vec, float *tmp
return 0.0f;
}
-/* TODO(lukas): Fix new code and remove this. */
-ccl_device int svd(float *A, float *V, float *S2, int n)
+/* Perform the Jacobi Eigenvalue Methon on matrix A.
+ * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
+ * The algorithm overwrites the contents of A.
+ *
+ * After returning, A will be overwritten with D, which is (almost) diagonal,
+ * and V will contain the eigenvectors of the original A in its rows (!),
+ * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
+ *
+ * Additionally, the function returns an estimate of the rank of A.
+ */
+ccl_device int math_jacobi_eigendecomposition(float *A, float *V, int n, int v_stride)
{
- int i, j, k, EstColRank = n, RotCount = n, SweepCount = 0;
- int slimit = 8;
- float eps = 1e-8f;
- float e2 = 10.f * n * eps * eps;
- float tol = 0.1f * eps;
- float vt, p, x0, y0, q, r, c0, s0, d1, d2;
-
- for(int r = 0; r < n; r++)
- for(int c = 0; c < n; c++)
- V[r*n+c] = (c == r)? 1.0f: 0.0f;
-
- while (RotCount != 0 && SweepCount++ <= slimit) {
- RotCount = EstColRank * (EstColRank - 1) / 2;
-
- for (j = 0; j < EstColRank-1; ++j) {
- for (k = j+1; k < EstColRank; ++k) {
- p = q = r = 0.0;
-
- for (i = 0; i < n; ++i) {
- x0 = A[i * n + j];
- y0 = A[i * n + k];
- p += x0 * y0;
- q += x0 * x0;
- r += y0 * y0;
- }
-
- S2[j] = q;
- S2[k] = r;
-
- if (q >= r) {
- if (q <
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list