[Bf-blender-cvs] [5857c9c172] soc-2016-cycles_denoising: Cycles: Deduplicate numerical code in the WLR bandwidth estimation

Thu Jan 12 05:14:05 CET 2017

Commit: 5857c9c172ccb0a3a2281d28c40940c9e45b7ed1
Author: Lukas Stockner
Date:   Wed Dec 21 02:29:56 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB5857c9c172ccb0a3a2281d28c40940c9e45b7ed1

Cycles: Deduplicate numerical code in the WLR bandwidth estimation

Also includes some documentation about what this code actually does.

===================================================================

M	intern/cycles/kernel/filter/filter_final_pass_impl.h
M	intern/cycles/kernel/filter/filter_wlr.h
M	intern/cycles/kernel/filter/filter_wlr_cuda.h
M	intern/cycles/kernel/filter/filter_wlr_sse.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/kernel/filter/filter_final_pass_impl.h b/intern/cycles/kernel/filter/filter_final_pass_impl.h
index 20b500bbec..7ab0a856f9 100644
--- a/intern/cycles/kernel/filter/filter_final_pass_impl.h
+++ b/intern/cycles/kernel/filter/filter_final_pass_impl.h
@@ -102,7 +102,7 @@ ccl_device void FUNCTION_NAME(KernelGlobals *kg, int sample, float ccl_readonly_
 	float XtWX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
 	float3 solution[(DENOISE_FEATURES+1)];
 
-	math_matrix_zero(XtWX, matrix_size);
+	math_matrix_zero_lower(XtWX, matrix_size);
 	math_vec3_zero(solution, matrix_size);
 	/* Construct Xt*W*X matrix and Xt*W*y vector (and fill weight cache, if used). */
 	FOR_PIXEL_WINDOW {
diff --git a/intern/cycles/kernel/filter/filter_wlr.h b/intern/cycles/kernel/filter/filter_wlr.h
index c2a7f257b8..7e7f2b6465 100644
--- a/intern/cycles/kernel/filter/filter_wlr.h
+++ b/intern/cycles/kernel/filter/filter_wlr.h
@@ -229,15 +229,8 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
 			math_add_gramian(XtWX, matrix_size, design_row, weight);
 		} END_FOR_PIXEL_WINDOW
 
-		math_matrix_add_diagonal(XtWX, matrix_size, 1e-4f); /* Improve the numerical stability. */
-		math_cholesky(XtWX, matrix_size);
-		math_inverse_lower_tri_inplace(XtWX, matrix_size);
-
-		float r_feature_weight[DENOISE_FEATURES+1];
-		math_vector_zero(r_feature_weight, matrix_size);
-		for(int col = 0; col < matrix_size; col++)
-			for(int row = col; row < matrix_size; row++)
-				r_feature_weight[col] += XtWX[row]*XtWX[col*matrix_size+row];
+		float inverse_row[DENOISE_FEATURES+1];
+		math_matrix_inverse_row(XtWX, inverse_row, matrix_size, 0);
 
 		float3 est_color = make_float3(0.0f, 0.0f, 0.0f), est_pos_color = make_float3(0.0f, 0.0f, 0.0f);
 		float est_variance = 0.0f, est_pos_variance = 0.0f;
@@ -252,7 +245,7 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
 
 			if(weight == 0.0f) continue;
 			weight /= max(1.0f, variance);
-			weight *= math_dot(design_row, r_feature_weight, matrix_size);
+			weight *= math_dot(design_row, inverse_row, matrix_size);
 
 			est_color += weight * color;
 			est_variance += weight*weight * max(variance, 0.0f);
diff --git a/intern/cycles/kernel/filter/filter_wlr_cuda.h b/intern/cycles/kernel/filter/filter_wlr_cuda.h
index 0322d89e54..a98d835ab6 100644
--- a/intern/cycles/kernel/filter/filter_wlr_cuda.h
+++ b/intern/cycles/kernel/filter/filter_wlr_cuda.h
@@ -211,15 +211,8 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
 		math_add_gramian(XtWX, matrix_size, design_row, weight);
 	} END_FOR_PIXEL_WINDOW
 
-	math_matrix_add_diagonal(XtWX, matrix_size, 1e-4f); /* Improve the numerical stability. */
-	math_cholesky(XtWX, matrix_size);
-	math_inverse_lower_tri_inplace(XtWX, matrix_size);
-
-	float r_feature_weight[DENOISE_FEATURES+1];
-	math_vector_zero(r_feature_weight, matrix_size);
-	for(int col = 0; col < matrix_size; col++)
-		for(int row = col; row < matrix_size; row++)
-			r_feature_weight[col] += XtWX[row]*XtWX[col*matrix_size+row];
+	float inverse_row[DENOISE_FEATURES+1];
+	math_matrix_inverse_row(XtWX, inverse_row, matrix_size, 0);
 
 	float3 est_color = make_float3(0.0f, 0.0f, 0.0f), est_pos_color = make_float3(0.0f, 0.0f, 0.0f);
 	float est_variance = 0.0f, est_pos_variance = 0.0f;
@@ -234,7 +227,7 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
 
 		if(weight == 0.0f) continue;
 		weight /= max(1.0f, variance);
-		weight *= math_dot(design_row, r_feature_weight, matrix_size);
+		weight *= math_dot(design_row, inverse_row, matrix_size);
 
 		est_color += weight * color;
 		est_variance += weight*weight * max(variance, 0.0f);
diff --git a/intern/cycles/kernel/filter/filter_wlr_sse.h b/intern/cycles/kernel/filter/filter_wlr_sse.h
index 98838153d8..7af5970c38 100644
--- a/intern/cycles/kernel/filter/filter_wlr_sse.h
+++ b/intern/cycles/kernel/filter/filter_wlr_sse.h
@@ -209,18 +209,11 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
 		} END_FOR_PIXEL_WINDOW_SSE
 		math_hsum_matrix_lower(XtWX, matrix_size, XtWX_sse);
 
-		math_matrix_add_diagonal(XtWX, matrix_size, 1e-4f); /* Improve the numerical stability. */
-		math_cholesky(XtWX, matrix_size);
-		math_inverse_lower_tri_inplace(XtWX, matrix_size);
-
-		float r_feature_weight_scalar[DENOISE_FEATURES+1];
-		math_vector_zero(r_feature_weight_scalar, matrix_size);
-		for(int col = 0; col < matrix_size; col++)
-			for(int row = col; row < matrix_size; row++)
-				r_feature_weight_scalar[col] += XtWX[row]*XtWX[col*matrix_size+row];
-		__m128 r_feature_weight[DENOISE_FEATURES+1];
+		float inverse_row_scalar[DENOISE_FEATURES+1];
+		math_matrix_inverse_row(XtWX, inverse_row_scalar, matrix_size, 0);
+		__m128 inverse_row[DENOISE_FEATURES+1];
 		for(int col = 0; col < matrix_size; col++)
-			r_feature_weight[col] = _mm_set1_ps(r_feature_weight_scalar[col]);
+			inverse_row[col] = _mm_set1_ps(inverse_row_scalar[col]);
 
 		__m128 est_pos_color[3] = {_mm_setzero_ps()}, est_color[3] = {_mm_setzero_ps()};
 		__m128 est_variance = _mm_setzero_ps(), est_pos_variance = _mm_setzero_ps(), pos_weight_sse = _mm_setzero_ps();
@@ -238,7 +231,7 @@ ccl_device void kernel_filter_estimate_wlr_params(KernelGlobals *kg, int sample,
 			/* Early out if all pixels were masked away. */
 			if(!_mm_movemask_ps(active_pixels)) continue;
 
-			weight = _mm_mul_ps(weight, _mm_mul_ps(math_dot_sse(design_row, r_feature_weight, matrix_size), _mm_rcp_ps(_mm_max_ps(_mm_set1_ps(1.0f), variance))));
+			weight = _mm_mul_ps(weight, _mm_mul_ps(math_dot_sse(design_row, inverse_row, matrix_size), _mm_rcp_ps(_mm_max_ps(_mm_set1_ps(1.0f), variance))));
 
 			math_mul_vector_scalar_sse(color, 3, weight);
 			math_add_vector_sse(est_color, 3, color);
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 271b7f704a..4d9f2dc596 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -28,12 +28,6 @@ CCL_NAMESPACE_BEGIN
 #define MATS(A, n, r, c, s) MAT(A, n, r, c)
 #endif
 
-ccl_device_inline void math_matrix_zero(float *A, int n)
-{
-	for(int i = 0; i < n*n; i++)
-		A[i] = 0.0f;
-}
-
 ccl_device_inline void math_vector_zero(float *v, int n)
 {
 	for(int i = 0; i < n; i++)
@@ -53,13 +47,6 @@ ccl_device_inline void math_matrix_zero_lower(float *A, int n)
 			MAT(A, n, row, col) = 0.0f;
 }
 
-ccl_device_inline void math_matrix_identity(float *A, int n)
-{
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col < n; col++)
-			MAT(A, n, row, col) = (row == col)? 1.0f: 0.0f;
-}
-
 /* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
  * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
  * Also, only the lower triangular part of A is ever accessed. */
@@ -155,24 +142,8 @@ ccl_device_inline void math_substitute_back_vec3(float *LT, int n, float3 *x)
 	}
 }
 
-ccl_device_inline void math_inverse_lower_tri(float *L, float *invL, int n)
-{
-	for(int comp = 0; comp < n; comp++) {
-		for(int row = 0; row < comp; row++)
-			MAT(invL, n, row, comp) = 0.0f;
-		MAT(invL, n, comp, comp) = 1.0f / MAT(L, n, comp, comp);
-		for(int row = comp+1; row < n; row++) {
-			float sum = 0.0f;
-			for(int col = comp; col < row; col++)
-				sum += MAT(L, n, row, col) * MAT(invL, n, col, comp);
-			MAT(invL, n, row, comp) = -sum/MAT(L, n, row, row);
-		}
-	}
-}
-
-/* Inverts the lower triangular matrix L and overwrites it with the transpose
- * of the result. */
-ccl_device_inline void math_inverse_lower_tri_inplace(float *L, int n)
+/* Inverts the lower triangular matrix L and overwrites it with the transpose of the result. */
+ccl_device_inline void math_inverse_lower_tri(float *L, int n)
 {
 	for(int row = 0; row < n; row++)
 		MAT(L, n, row, row) = 1.0f / MAT(L, n, row, row);
@@ -205,6 +176,32 @@ ccl_device_inline void math_solve_normal_equation(float *XtWX, float3 *XtWy, int
 	math_substitute_back_vec3(XtWX, n, XtWy); /* Solve L^T*S = b, replacing b by S. */
 }
 
+/* Find only the i-th row of the inverse of A, destroying A in the process.
+ * This can be used to solve the normal equation for only the i-th element of the result vector:
+ * If S = inv(Xt*W*X)*Xt*W*y, then S[i] is equal to dot(inv(Xt*W*X)[i], X[k])*W[k]*y[k] summed over all data points k, where [j] is the j-th row.
+ *
+ * For most applications, it makes more sense to directly use math_solve_normal_equation
+ * since that only requires one pass over the data, accumulating both Xt*W*X and Xt*W*y.
+ *
+ * But, for variance estimation the squared value of every datapoint's combined weight (= inv(Xt*W*X)*Xt*W) is needed. Calculating the combined weight
+ * explicitly is too expensive, so we need to be able to find every datapoint's own combined weight in the pass over the data.
+ *
+ * Therefore, we compute Xt*W*X in the first pass, use this function to find v, the i-th row of its inverse,
+ * and then use dot(v, X[k])*W[k] as the combined weight in the second pass. */
+ccl_device_inline void math_matrix_inverse_row(float *A, float *v, int n, int i)
+{
+	math_matrix_add_diagonal(A, n, 1e-4f); /* Improve the numerical stability. */
+	math_cholesky(A, n); /* Find L so that L*Lt = A. */
+	math_inverse_lower_tri(A, n); /* Find inv(L). */
+	/* Now, inv(A) = inv(L*Lt) = inv(Lt)*inv(L) = inv(L)t*inv(L).
+	 * From that, inv(A)[i,j] = sum_k((inv(L)t)[i, k] * inv(L)[k, j]) = sum_k(inv(L)[k, i] * inv(L)[k, j]).
+	 * But, since math_inverse_lower_tri stores the *transpose* of the result to be able to work inplace, we must swap the indices. */
+	math_vector_zero(v, n);
+	for(int j = 0; j < n; j++)
+		for(int k = j; k < n; k++)
+			v[j] += MAT(A, n, i, k)*MAT(A, n, j, k);
+}
+
 ccl_device float math_largest_eigenvalue(float *A, int n, float *vec, float *tmp)
 {
 	/* Matrix-Vector-Multiplication that only accesses the lower triangular part of A. */