[Bf-blender-cvs] [b538a5c085e] temp-cycles-denoising: Cycles Denoising: Only store lower-triangular parts of XtWX matrix

Lukas Stockner noreply at git.blender.org
Tue May 2 13:43:33 CEST 2017


Commit: b538a5c085e2fcf5303a6ab459a6974594531875
Author: Lukas Stockner
Date:   Tue May 2 13:16:21 2017 +0200
Branches: temp-cycles-denoising
https://developer.blender.org/rBb538a5c085e2fcf5303a6ab459a6974594531875

Cycles Denoising: Only store lower-triangular parts of XtWX matrix

This saves 55 floats per pixel, which is quite significant with GPU tile sizes.

===================================================================

M	intern/cycles/kernel/filter/filter_defines.h
M	intern/cycles/kernel/filter/filter_transform.h
M	intern/cycles/kernel/filter/filter_transform_gpu.h
M	intern/cycles/kernel/filter/filter_transform_sse.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index fb46d8df6d9..ce96f733aff 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -19,7 +19,7 @@
 
 #define DENOISE_FEATURES 10
 #define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
-#define XTWX_SIZE      ((DENOISE_FEATURES+1)*(DENOISE_FEATURES+1))
+#define XTWX_SIZE      (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
 #define XTWY_SIZE      (DENOISE_FEATURES+1)
 
 typedef struct TilesInfo {
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index 3fe3d949f4a..10cec4bc3f4 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -71,14 +71,14 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 	 * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
 	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
 	float* feature_matrix = tempmatrix;
-	math_trimatrix_zero(feature_matrix, DENOISE_FEATURES);
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW {
 		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
 		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
-		math_trimatrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
 	} END_FOR_PIXEL_WINDOW
 
-	math_trimatrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
 	*rank = 0;
 	if(pca_threshold < 0.0f) {
 		float threshold_energy = 0.0f;
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index a424d561989..e391ac62b15 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -72,14 +72,14 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
 	 * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
 	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
 	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-	math_trimatrix_zero(feature_matrix, DENOISE_FEATURES);
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW {
 		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
 		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
-		math_trimatrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
 	} END_FOR_PIXEL_WINDOW
 
-	math_trimatrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
 	*rank = 0;
 	if(pca_threshold < 0.0f) {
 		float threshold_energy = 0.0f;
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 317e360c4eb..93495f3861c 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -56,17 +56,17 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 	filter_calculate_scale_sse(feature_scale);
 
 	__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
-	math_trimatrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+	math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
 		math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
-		math_trimatrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
 	} END_FOR_PIXEL_WINDOW_SSE
 
 	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-	math_trimatrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+	math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
 
-	math_trimatrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
 
 	*rank = 0;
 	if(pca_threshold < 0.0f) {
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 0f41ec2efe3..d3f7b13283b 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -24,9 +24,12 @@ CCL_NAMESPACE_BEGIN
 /* Variants that use a constant stride on GPUS. */
 #ifdef __KERNEL_GPU__
 #define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+/* Element access when only the lower-triangular elements are stored. */
+#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
 #define VECS(V, i, s) V[(i)*(s)]
 #else
 #define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
 #define VECS(V, i, s) V[i]
 #endif
 
@@ -38,7 +41,7 @@ ccl_device_inline void math_vector_zero(float *v, int n)
 		v[i] = 0.0f;
 }
 
-ccl_device_inline void math_trimatrix_zero(float *A, int n)
+ccl_device_inline void math_matrix_zero(float *A, int n)
 {
 	for(int row = 0; row < n; row++)
 		for(int col = 0; col <= row; col++)
@@ -92,16 +95,15 @@ ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float
 /* Elementary matrix operations.
  * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */
 
-ccl_device_inline void math_matrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
+ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
 {
 	for(int row = 0; row < n; row++)
-		MATS(A, n, row, row, stride) += val;
+		MATHS(A, row, row, stride) += val;
 }
 
 /* Add Gramian matrix of v to A.
- * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j].
- * Obviously, the resulting matrix is symmetric, so only the lower triangluar part is stored. */
-ccl_device_inline void math_trimatrix_add_gramian(float *A,
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian(float *A,
                                                   int n,
                                                   float ccl_restrict_ptr v,
                                                   float weight)
@@ -112,8 +114,7 @@ ccl_device_inline void math_trimatrix_add_gramian(float *A,
 }
 
 /* Add Gramian matrix of v to A.
- * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j].
- * Obviously, the resulting matrix is symmetric, so only the lower triangluar part is stored. */
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
 ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
                                                           int n,
                                                           float ccl_restrict_ptr v,
@@ -122,7 +123,7 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
 {
 	for(int row = 0; row < n; row++)
 		for(int col = 0; col <= row; col++)
-			MATS(A, n, row, col, stride) += v[row]*v[col]*weight;
+			MATHS(A, row, col, stride) += v[row]*v[col]*weight;
 }
 
 /* Transpose matrix A inplace. */
@@ -149,17 +150,17 @@ ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			float sum_col = MATS(A, n, row, col, stride);
+			float sum_col = MATHS(A, row, col, stride);
 			for(int k = 0; k < col; k++) {
-				sum_col -= MATS(A, n, row, k, stride) * MATS(A, n, col, k, stride);
+				sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
 			}
 			if(row == col) {
 				sum_col = sqrtf(max(sum_col, 0.0f));
 			}
 			else {
-				sum_col /= MATS(A, n, col, col, stride);
+				sum_col /= MATHS(A, col, col, stride);
 			}
-			MATS(A, n, row, col, stride) = sum_col;
+			MATHS(A, row, col, stride) = sum_col;
 		}
 	}
 }
@@ -175,23 +176,23 @@ ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
  * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
 ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
 {
-	math_matrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */
+	math_trimatrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */
 	math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
 
 	/* Use forward substitution to solve L*b = y, replacing y by b. */
 	for(int row = 0; row < n; row++) {
 		float3 sum = VECS(y, row, stride);
 		for(int col = 0; col < row; col++)
-			sum -= MATS(A, n, row, col, stride) * VECS(y, col, stride);
-		VECS(y, row, stride) = sum / MATS(A, n, row, row, stride);
+			sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
+		VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
 	}
 
 	/* Use backward substitution to solve Lt*S = b, replacing b by S. */
 	for(int row = n-1; row >= 0; row--) {
 		float3 sum = VECS(y, row, stride);
 		for(int col = row+1; col < n; col++)
-			sum -= MATS(A, n, col, row, stride) * VECS(y, col, stride);
-		VECS(y, row, stride) = sum / MATS(A, n, row, row, stride);
+			sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
+		VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
 	}
 }
 
@@ -211,7 +212,7 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global
  *
  * Additionally, the function returns an estimate of the rank of A.
  */
-ccl_device int math_trimatrix_jacobi_

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list