[Bf-blender-cvs] [a72222c] soc-2016-cycles_denoising: Cycles: Split up the denoising feature row functions

Tue Nov 22 04:25:06 CET 2016

Commit: a72222c6be21f12c3493d44a77a3b9d125079c1c
Author: Lukas Stockner
Date:   Mon Nov 14 10:58:27 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBa72222c6be21f12c3493d44a77a3b9d125079c1c

Cycles: Split up the denoising feature row functions

===================================================================

M	intern/cycles/kernel/kernel_filter.h
M	intern/cycles/kernel/kernel_filter_util.h

===================================================================

diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index e4031f6..2dce0d1 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -153,7 +153,7 @@ ccl_device void kernel_filter_estimate_bandwidths(KernelGlobals *kg, int sample,
 	math_vec3_zero(XtY, matrix_size);
 	FOR_PIXEL_WINDOW {
 		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
-		float weight = filter_fill_design_row_cuda(features, rank, design_row, transform, transform_stride, NULL);
+		float weight = filter_fill_design_row_quadratic_cuda(features, rank, design_row, transform, transform_stride);
 	
 		if(weight == 0.0f) continue;
 		weight /= max(1.0f, filter_get_pixel_variance(pixel_buffer, pass_stride));
@@ -539,7 +539,7 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 	math_vec3_zero(XtY, matrix_size);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
-		__m128 weight = filter_fill_design_row_sse(features, active_pixels, rank, design_row, feature_transform_sse, NULL);
+		__m128 weight = filter_fill_design_row_quadratic_sse(features, active_pixels, rank, design_row, feature_transform_sse);
 		active_pixels = _mm_and_ps(active_pixels, _mm_cmpneq_ps(weight, _mm_setzero_ps()));
 
 		if(!_mm_movemask_ps(active_pixels)) continue;
@@ -829,7 +829,7 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 	math_vec3_zero(XtY, matrix_size);
 	FOR_PIXEL_WINDOW {
 		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
-		float weight = filter_fill_design_row(features, rank, design_row, feature_transform, NULL);
+		float weight = filter_fill_design_row_quadratic(features, rank, design_row, feature_transform);
 	
 		if(weight == 0.0f) continue;
 		weight /= max(1.0f, filter_get_pixel_variance(pixel_buffer, pass_stride));
diff --git a/intern/cycles/kernel/kernel_filter_util.h b/intern/cycles/kernel/kernel_filter_util.h
index 5981358..79095ff 100644
--- a/intern/cycles/kernel/kernel_filter_util.h
+++ b/intern/cycles/kernel/kernel_filter_util.h
@@ -154,14 +154,38 @@ ccl_device_inline float filter_get_pixel_variance(float ccl_readonly_ptr buffer,
 	return average(make_float3(ccl_get_feature(17), ccl_get_feature(19), ccl_get_feature(21)));
 }
 
+/* Fill design row and compute WLR weight.
+ * Doing both at the same time allows for a nice early-out as soon as the weight is zero. */
 ccl_device_inline float filter_fill_design_row(float *features, int rank, float *design_row, float *feature_transform, float *bandwidth_factor)
 {
 	design_row[0] = 1.0f;
 	float weight = 1.0f;
 	for(int d = 0; d < rank; d++) {
 		float x = math_dot(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
+		float x2 = x*bandwidth_factor[d];
+		x2 *= x2;
+		if(x2 < 1.0f) {
+			/* Pixels are weighted by Epanechnikov kernels. */
+			weight *= 0.75f * (1.0f - x2);
+		}
+		else {
+			weight = 0.0f;
+			break;
+		}
+		design_row[1+d] = x;
+	}
+	return weight;
+}
+
+/* Fill design row for the quadratic fit and compute WLR weight.
+ * Doing both at the same time allows for a nice early-out as soon as the weight is zero. */
+ccl_device_inline float filter_fill_design_row_quadratic(float *features, int rank, float *design_row, float *feature_transform)
+{
+	design_row[0] = 1.0f;
+	float weight = 1.0f;
+	for(int d = 0; d < rank; d++) {
+		float x = math_dot(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
 		float x2 = x*x;
-		if(bandwidth_factor) x2 *= bandwidth_factor[d]*bandwidth_factor[d];
 		if(x2 < 1.0f) {
 			/* Pixels are weighted by Epanechnikov kernels. */
 			weight *= 0.75f * (1.0f - x2);
@@ -171,7 +195,7 @@ ccl_device_inline float filter_fill_design_row(float *features, int rank, float
 			break;
 		}
 		design_row[1+d] = x;
-		if(!bandwidth_factor) design_row[1+rank+d] = x2;
+		design_row[1+rank+d] = x*x;
 	}
 	return weight;
 }
@@ -354,11 +378,24 @@ ccl_device_inline __m128 filter_fill_design_row_sse(__m128 *features, __m128 act
 	design_row[0] = weight;
 	for(int d = 0; d < rank; d++) {
 		__m128 x = math_dot_sse(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
+		__m128 x2 = _mm_mul_ps(x, bandwidth_factor[d]);
+		x2 = _mm_mul_ps(x2, x2);
+		weight = _mm_mask_ps(_mm_mul_ps(weight, _mm_mul_ps(_mm_set1_ps(0.75f), _mm_sub_ps(_mm_set1_ps(1.0f), x2))), _mm_and_ps(_mm_cmplt_ps(x2, _mm_set1_ps(1.0f)), active_pixels));
+		design_row[1+d] = x;
+	}
+	return weight;
+}
+
+ccl_device_inline __m128 filter_fill_design_row_quadratic_sse(__m128 *features, __m128 active_pixels, int rank, __m128 *design_row, __m128 *feature_transform)
+{
+	__m128 weight = _mm_mask_ps(_mm_set1_ps(1.0f), active_pixels);
+	design_row[0] = weight;
+	for(int d = 0; d < rank; d++) {
+		__m128 x = math_dot_sse(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
 		__m128 x2 = _mm_mul_ps(x, x);
-		if(bandwidth_factor) x2 = _mm_mul_ps(x2, _mm_mul_ps(bandwidth_factor[d], bandwidth_factor[d]));
 		weight = _mm_mask_ps(_mm_mul_ps(weight, _mm_mul_ps(_mm_set1_ps(0.75f), _mm_sub_ps(_mm_set1_ps(1.0f), x2))), _mm_and_ps(_mm_cmplt_ps(x2, _mm_set1_ps(1.0f)), active_pixels));
 		design_row[1+d] = x;
-		if(!bandwidth_factor) design_row[1+rank+d] = x2;
+		design_row[1+rank+d] = x2;
 	}
 	return weight;
 }
@@ -378,8 +415,28 @@ ccl_device_inline float filter_fill_design_row_cuda(float *features, int rank, f
 	float weight = 1.0f;
 	for(int d = 0; d < rank; d++) {
 		float x = math_dot_cuda(features, feature_transform + d*DENOISE_FEATURES*transform_stride, transform_stride, DENOISE_FEATURES);
+		float x2 = x*bandwidth_factor[d];
+		x2 *= x2;
+		if(x2 < 1.0f) {
+			/* Pixels are weighted by Epanechnikov kernels. */
+			weight *= 0.75f * (1.0f - x2);
+		}
+		else {
+			weight = 0.0f;
+			break;
+		}
+		design_row[1+d] = x;
+	}
+	return weight;
+}
+
+ccl_device_inline float filter_fill_design_row_quadratic_cuda(float *features, int rank, float *design_row, float ccl_readonly_ptr feature_transform, int transform_stride)
+{
+	design_row[0] = 1.0f;
+	float weight = 1.0f;
+	for(int d = 0; d < rank; d++) {
+		float x = math_dot_cuda(features, feature_transform + d*DENOISE_FEATURES*transform_stride, transform_stride, DENOISE_FEATURES);
 		float x2 = x*x;
-		if(bandwidth_factor) x2 *= bandwidth_factor[d]*bandwidth_factor[d];
 		if(x2 < 1.0f) {
 			/* Pixels are weighted by Epanechnikov kernels. */
 			weight *= 0.75f * (1.0f - x2);
@@ -389,7 +446,7 @@ ccl_device_inline float filter_fill_design_row_cuda(float *features, int rank, f
 			break;
 		}
 		design_row[1+d] = x;
-		if(!bandwidth_factor) design_row[1+rank+d] = x2;
+		design_row[1+rank+d] = x2;
 	}
 	return weight;
 }