[Bf-blender-cvs] [5857c9c172] soc-2016-cycles_denoising: Cycles: Deduplicate numerical code in the WLR bandwidth estimation
Lukas Stockner
noreply at git.blender.org
Thu Jan 12 05:14:05 CET 2017
Commit: 5857c9c172ccb0a3a2281d28c40940c9e45b7ed1
Author: Lukas Stockner
Date: Wed Dec 21 02:29:56 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB5857c9c172ccb0a3a2281d28c40940c9e45b7ed1
Cycles: Deduplicate numerical code in the WLR bandwidth estimation
Also includes some documentation about what this code actually does.
===================================================================
M intern/cycles/kernel/filter/filter_final_pass_impl.h
M intern/cycles/kernel/filter/filter_wlr.h
M intern/cycles/kernel/filter/filter_wlr_cuda.h
M intern/cycles/kernel/filter/filter_wlr_sse.h
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/kernel/filter/filter_final_pass_impl.h b/intern/cycles/kernel/filter/filter_final_pass_impl.h
index 20b500bbec..7ab0a856f9 100644
--- a/intern/cycles/kernel/filter/filter_final_pass_impl.h
+++ b/intern/cycles/kernel/filter/filter_final_pass_impl.h
@@ -102,7 +102,7 @@ ccl_device void FUNCTION_NAME(KernelGlobals *kg, int sample, float ccl_readonly_
float XtWX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
float3 solution[(DENOISE_FEATURES+1)];
- math_matrix_zero(XtWX, matrix_size);
+ math_matrix_zero_lower(XtWX, matrix_size);
math_vec3_zero(solution, matrix_size);
/* Construct Xt*W*X matrix and Xt*W*y vector (and fill weight cache, if used). */
FOR_PIXEL_WINDOW {
diff --git a/intern/cycles/kernel/filter/filter_wlr.h b/intern/cycles/kernel/filter/filter_wlr.h
index c2a7f257b8..7e7f2b6465 100644
--- a/intern/cycles/kernel/filter/filter_wlr.h
+++ b/intern/cycles/kernel/filter/filter_wlr.h
@@ -229,15 +229,8 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
math_add_gramian(XtWX, matrix_size, design_row, weight);
} END_FOR_PIXEL_WINDOW
- math_matrix_add_diagonal(XtWX, matrix_size, 1e-4f); /* Improve the numerical stability. */
- math_cholesky(XtWX, matrix_size);
- math_inverse_lower_tri_inplace(XtWX, matrix_size);
-
- float r_feature_weight[DENOISE_FEATURES+1];
- math_vector_zero(r_feature_weight, matrix_size);
- for(int col = 0; col < matrix_size; col++)
- for(int row = col; row < matrix_size; row++)
- r_feature_weight[col] += XtWX[row]*XtWX[col*matrix_size+row];
+ float inverse_row[DENOISE_FEATURES+1];
+ math_matrix_inverse_row(XtWX, inverse_row, matrix_size, 0);
float3 est_color = make_float3(0.0f, 0.0f, 0.0f), est_pos_color = make_float3(0.0f, 0.0f, 0.0f);
float est_variance = 0.0f, est_pos_variance = 0.0f;
@@ -252,7 +245,7 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
if(weight == 0.0f) continue;
weight /= max(1.0f, variance);
- weight *= math_dot(design_row, r_feature_weight, matrix_size);
+ weight *= math_dot(design_row, inverse_row, matrix_size);
est_color += weight * color;
est_variance += weight*weight * max(variance, 0.0f);
diff --git a/intern/cycles/kernel/filter/filter_wlr_cuda.h b/intern/cycles/kernel/filter/filter_wlr_cuda.h
index 0322d89e54..a98d835ab6 100644
--- a/intern/cycles/kernel/filter/filter_wlr_cuda.h
+++ b/intern/cycles/kernel/filter/filter_wlr_cuda.h
@@ -211,15 +211,8 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
math_add_gramian(XtWX, matrix_size, design_row, weight);
} END_FOR_PIXEL_WINDOW
- math_matrix_add_diagonal(XtWX, matrix_size, 1e-4f); /* Improve the numerical stability. */
- math_cholesky(XtWX, matrix_size);
- math_inverse_lower_tri_inplace(XtWX, matrix_size);
-
- float r_feature_weight[DENOISE_FEATURES+1];
- math_vector_zero(r_feature_weight, matrix_size);
- for(int col = 0; col < matrix_size; col++)
- for(int row = col; row < matrix_size; row++)
- r_feature_weight[col] += XtWX[row]*XtWX[col*matrix_size+row];
+ float inverse_row[DENOISE_FEATURES+1];
+ math_matrix_inverse_row(XtWX, inverse_row, matrix_size, 0);
float3 est_color = make_float3(0.0f, 0.0f, 0.0f), est_pos_color = make_float3(0.0f, 0.0f, 0.0f);
float est_variance = 0.0f, est_pos_variance = 0.0f;
@@ -234,7 +227,7 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
if(weight == 0.0f) continue;
weight /= max(1.0f, variance);
- weight *= math_dot(design_row, r_feature_weight, matrix_size);
+ weight *= math_dot(design_row, inverse_row, matrix_size);
est_color += weight * color;
est_variance += weight*weight * max(variance, 0.0f);
diff --git a/intern/cycles/kernel/filter/filter_wlr_sse.h b/intern/cycles/kernel/filter/filter_wlr_sse.h
index 98838153d8..7af5970c38 100644
--- a/intern/cycles/kernel/filter/filter_wlr_sse.h
+++ b/intern/cycles/kernel/filter/filter_wlr_sse.h
@@ -209,18 +209,11 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
} END_FOR_PIXEL_WINDOW_SSE
math_hsum_matrix_lower(XtWX, matrix_size, XtWX_sse);
- math_matrix_add_diagonal(XtWX, matrix_size, 1e-4f); /* Improve the numerical stability. */
- math_cholesky(XtWX, matrix_size);
- math_inverse_lower_tri_inplace(XtWX, matrix_size);
-
- float r_feature_weight_scalar[DENOISE_FEATURES+1];
- math_vector_zero(r_feature_weight_scalar, matrix_size);
- for(int col = 0; col < matrix_size; col++)
- for(int row = col; row < matrix_size; row++)
- r_feature_weight_scalar[col] += XtWX[row]*XtWX[col*matrix_size+row];
- __m128 r_feature_weight[DENOISE_FEATURES+1];
+ float inverse_row_scalar[DENOISE_FEATURES+1];
+ math_matrix_inverse_row(XtWX, inverse_row_scalar, matrix_size, 0);
+ __m128 inverse_row[DENOISE_FEATURES+1];
for(int col = 0; col < matrix_size; col++)
- r_feature_weight[col] = _mm_set1_ps(r_feature_weight_scalar[col]);
+ inverse_row[col] = _mm_set1_ps(inverse_row_scalar[col]);
__m128 est_pos_color[3] = {_mm_setzero_ps()}, est_color[3] = {_mm_setzero_ps()};
__m128 est_variance = _mm_setzero_ps(), est_pos_variance = _mm_setzero_ps(), pos_weight_sse = _mm_setzero_ps();
@@ -238,7 +231,7 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
/* Early out if all pixels were masked away. */
if(!_mm_movemask_ps(active_pixels)) continue;
- weight = _mm_mul_ps(weight, _mm_mul_ps(math_dot_sse(design_row, r_feature_weight, matrix_size), _mm_rcp_ps(_mm_max_ps(_mm_set1_ps(1.0f), variance))));
+ weight = _mm_mul_ps(weight, _mm_mul_ps(math_dot_sse(design_row, inverse_row, matrix_size), _mm_rcp_ps(_mm_max_ps(_mm_set1_ps(1.0f), variance))));
math_mul_vector_scalar_sse(color, 3, weight);
math_add_vector_sse(est_color, 3, color);
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 271b7f704a..4d9f2dc596 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -28,12 +28,6 @@ CCL_NAMESPACE_BEGIN
#define MATS(A, n, r, c, s) MAT(A, n, r, c)
#endif
-ccl_device_inline void math_matrix_zero(float *A, int n)
-{
- for(int i = 0; i < n*n; i++)
- A[i] = 0.0f;
-}
-
ccl_device_inline void math_vector_zero(float *v, int n)
{
for(int i = 0; i < n; i++)
@@ -53,13 +47,6 @@ ccl_device_inline void math_matrix_zero_lower(float *A, int n)
MAT(A, n, row, col) = 0.0f;
}
-ccl_device_inline void math_matrix_identity(float *A, int n)
-{
- for(int row = 0; row < n; row++)
- for(int col = 0; col < n; col++)
- MAT(A, n, row, col) = (row == col)? 1.0f: 0.0f;
-}
-
/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
* into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
* Also, only the lower triangular part of A is ever accessed. */
@@ -155,24 +142,8 @@ ccl_device_inline void math_substitute_back_vec3(float *LT, int n, float3 *x)
}
}
-ccl_device_inline void math_inverse_lower_tri(float *L, float *invL, int n)
-{
- for(int comp = 0; comp < n; comp++) {
- for(int row = 0; row < comp; row++)
- MAT(invL, n, row, comp) = 0.0f;
- MAT(invL, n, comp, comp) = 1.0f / MAT(L, n, comp, comp);
- for(int row = comp+1; row < n; row++) {
- float sum = 0.0f;
- for(int col = comp; col < row; col++)
- sum += MAT(L, n, row, col) * MAT(invL, n, col, comp);
- MAT(invL, n, row, comp) = -sum/MAT(L, n, row, row);
- }
- }
-}
-
-/* Inverts the lower triangular matrix L and overwrites it with the transpose
- * of the result. */
-ccl_device_inline void math_inverse_lower_tri_inplace(float *L, int n)
+/* Inverts the lower triangular matrix L and overwrites it with the transpose of the result. */
+ccl_device_inline void math_inverse_lower_tri(float *L, int n)
{
for(int row = 0; row < n; row++)
MAT(L, n, row, row) = 1.0f / MAT(L, n, row, row);
@@ -205,6 +176,32 @@ ccl_device_inline void math_solve_normal_equation(float *XtWX, float3 *XtWy, int
math_substitute_back_vec3(XtWX, n, XtWy); /* Solve L^T*S = b, replacing b by S. */
}
+/* Find only the i-th row of the inverse of A, destroying A in the process.
+ * This can be used to solve the normal equation for only the i-th element of the result vector:
+ * If S = inv(Xt*W*X)*Xt*W*y, then S[i] is equal to dot(inv(Xt*W*X)[i], X[k])*W[k]*y[k] summed over all data points k, where [j] is the j-th row.
+ *
+ * For most applications, it makes more sense to directly use math_solve_normal_equation
+ * since that only requires one pass over the data, accumulating both Xt*W*X and Xt*W*y.
+ *
+ * But, for variance estimation the squared value of every datapoint's combined weight (= inv(Xt*W*X)*Xt*W) is needed. Calculating the combined weight
+ * explicitly is too expensive, so we need to be able to find every datapoint's own combined weight in the pass over the data.
+ *
+ * Therefore, we compute Xt*W*X in the first pass, use this function to find v, the i-th row of its inverse,
+ * and then use dot(v, X[k])*W[k] as the combined weight in the second pass. */
+ccl_device_inline void math_matrix_inverse_row(float *A, float *v, int n, int i)
+{
+ math_matrix_add_diagonal(A, n, 1e-4f); /* Improve the numerical stability. */
+ math_cholesky(A, n); /* Find L so that L*Lt = A. */
+ math_inverse_lower_tri(A, n); /* Find inv(L). */
+ /* Now, inv(A) = inv(L*Lt) = inv(Lt)*inv(L) = inv(L)t*inv(L).
+ * From that, inv(A)[i,j] = sum_k((inv(L)t)[i, k] * inv(L)[k, j]) = sum_k(inv(L)[k, i] * inv(L)[k, j]).
+ * But, since math_inverse_lower_tri stores the *transpose* of the result to be able to work inplace, we must swap the indices. */
+ math_vector_zero(v, n);
+ for(int j = 0; j < n; j++)
+ for(int k = j; k < n; k++)
+ v[j] += MAT(A, n, i, k)*MAT(A, n, j, k);
+}
+
ccl_device float math_largest_eigenvalue(float *A, int n, float *vec, float *tmp)
{
/* Matrix-Vector-Multiplication that only accesses the lower triangular part of A. */
More information about the Bf-blender-cvs
mailing list