[Bf-blender-cvs] [b538a5c085e] temp-cycles-denoising: Cycles Denoising: Only store lower-triangular parts of XtWX matrix
Lukas Stockner
noreply at git.blender.org
Tue May 2 13:43:33 CEST 2017
Commit: b538a5c085e2fcf5303a6ab459a6974594531875
Author: Lukas Stockner
Date: Tue May 2 13:16:21 2017 +0200
Branches: temp-cycles-denoising
https://developer.blender.org/rBb538a5c085e2fcf5303a6ab459a6974594531875
Cycles Denoising: Only store lower-triangular parts of XtWX matrix
This saves 55 floats per pixel, which is quite significant with GPU tile sizes.
===================================================================
M intern/cycles/kernel/filter/filter_defines.h
M intern/cycles/kernel/filter/filter_transform.h
M intern/cycles/kernel/filter/filter_transform_gpu.h
M intern/cycles/kernel/filter/filter_transform_sse.h
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index fb46d8df6d9..ce96f733aff 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -19,7 +19,7 @@
#define DENOISE_FEATURES 10
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
-#define XTWX_SIZE ((DENOISE_FEATURES+1)*(DENOISE_FEATURES+1))
+#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
#define XTWY_SIZE (DENOISE_FEATURES+1)
typedef struct TilesInfo {
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index 3fe3d949f4a..10cec4bc3f4 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -71,14 +71,14 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
* This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float* feature_matrix = tempmatrix;
- math_trimatrix_zero(feature_matrix, DENOISE_FEATURES);
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
- math_trimatrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
- math_trimatrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index a424d561989..e391ac62b15 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -72,14 +72,14 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
* This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_trimatrix_zero(feature_matrix, DENOISE_FEATURES);
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
- math_trimatrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
- math_trimatrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 317e360c4eb..93495f3861c 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -56,17 +56,17 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
filter_calculate_scale_sse(feature_scale);
__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
- math_trimatrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
- math_trimatrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_trimatrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+ math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
- math_trimatrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
if(pca_threshold < 0.0f) {
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 0f41ec2efe3..d3f7b13283b 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -24,9 +24,12 @@ CCL_NAMESPACE_BEGIN
/* Variants that use a constant stride on GPUS. */
#ifdef __KERNEL_GPU__
#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+/* Element access when only the lower-triangular elements are stored. */
+#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
#define VECS(V, i, s) V[(i)*(s)]
#else
#define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
#define VECS(V, i, s) V[i]
#endif
@@ -38,7 +41,7 @@ ccl_device_inline void math_vector_zero(float *v, int n)
v[i] = 0.0f;
}
-ccl_device_inline void math_trimatrix_zero(float *A, int n)
+ccl_device_inline void math_matrix_zero(float *A, int n)
{
for(int row = 0; row < n; row++)
for(int col = 0; col <= row; col++)
@@ -92,16 +95,15 @@ ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float
/* Elementary matrix operations.
* Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */
-ccl_device_inline void math_matrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
+ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
{
for(int row = 0; row < n; row++)
- MATS(A, n, row, row, stride) += val;
+ MATHS(A, row, row, stride) += val;
}
/* Add Gramian matrix of v to A.
- * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j].
- * Obviously, the resulting matrix is symmetric, so only the lower triangluar part is stored. */
-ccl_device_inline void math_trimatrix_add_gramian(float *A,
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian(float *A,
int n,
float ccl_restrict_ptr v,
float weight)
@@ -112,8 +114,7 @@ ccl_device_inline void math_trimatrix_add_gramian(float *A,
}
/* Add Gramian matrix of v to A.
- * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j].
- * Obviously, the resulting matrix is symmetric, so only the lower triangluar part is stored. */
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
int n,
float ccl_restrict_ptr v,
@@ -122,7 +123,7 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
{
for(int row = 0; row < n; row++)
for(int col = 0; col <= row; col++)
- MATS(A, n, row, col, stride) += v[row]*v[col]*weight;
+ MATHS(A, row, col, stride) += v[row]*v[col]*weight;
}
/* Transpose matrix A inplace. */
@@ -149,17 +150,17 @@ ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
{
for(int row = 0; row < n; row++) {
for(int col = 0; col <= row; col++) {
- float sum_col = MATS(A, n, row, col, stride);
+ float sum_col = MATHS(A, row, col, stride);
for(int k = 0; k < col; k++) {
- sum_col -= MATS(A, n, row, k, stride) * MATS(A, n, col, k, stride);
+ sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
}
if(row == col) {
sum_col = sqrtf(max(sum_col, 0.0f));
}
else {
- sum_col /= MATS(A, n, col, col, stride);
+ sum_col /= MATHS(A, col, col, stride);
}
- MATS(A, n, row, col, stride) = sum_col;
+ MATHS(A, row, col, stride) = sum_col;
}
}
}
@@ -175,23 +176,23 @@ ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
* symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
{
- math_matrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */
+ math_trimatrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */
math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
/* Use forward substitution to solve L*b = y, replacing y by b. */
for(int row = 0; row < n; row++) {
float3 sum = VECS(y, row, stride);
for(int col = 0; col < row; col++)
- sum -= MATS(A, n, row, col, stride) * VECS(y, col, stride);
- VECS(y, row, stride) = sum / MATS(A, n, row, row, stride);
+ sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
+ VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
}
/* Use backward substitution to solve Lt*S = b, replacing b by S. */
for(int row = n-1; row >= 0; row--) {
float3 sum = VECS(y, row, stride);
for(int col = row+1; col < n; col++)
- sum -= MATS(A, n, col, row, stride) * VECS(y, col, stride);
- VECS(y, row, stride) = sum / MATS(A, n, row, row, stride);
+ sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
+ VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
}
}
@@ -211,7 +212,7 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global
*
* Additionally, the function returns an estimate of the rank of A.
*/
-ccl_device int math_trimatrix_jacobi_
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list