[Bf-blender-cvs] [30f4576d51b] temp-cycles-denoising: Cycles Denoising: Refactor design row construction

Lukas Stockner noreply at git.blender.org
Fri Apr 14 00:57:24 CEST 2017


Commit: 30f4576d51bd247488da8d250b541539a3bd5fa8
Author: Lukas Stockner
Date:   Thu Mar 30 00:25:45 2017 +0200
Branches: temp-cycles-denoising
https://developer.blender.org/rB30f4576d51bd247488da8d250b541539a3bd5fa8

Cycles Denoising: Refactor design row construction

This change improves the shared memory access pattern, reduces the local memory requirementy by 11 floats per thread and saves some memory copying.

===================================================================

M	intern/cycles/filter/filter_features.h
M	intern/cycles/filter/filter_reconstruction.h
M	intern/cycles/filter/filter_transform.h
M	intern/cycles/filter/filter_transform_gpu.h
M	intern/cycles/filter/filter_transform_sse.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/filter/filter_features.h b/intern/cycles/filter/filter_features.h
index b25649f497a..f4f6e1f7639 100644
--- a/intern/cycles/filter/filter_features.h
+++ b/intern/cycles/filter/filter_features.h
@@ -16,7 +16,7 @@
 
  CCL_NAMESPACE_BEGIN
 
-#define ccl_get_feature(pass) buffer[(pass)*pass_stride]
+#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
  * pixel_buffer always points to the current pixel in the first pass. */
@@ -28,32 +28,18 @@
                                  pixel_buffer += buffer_w - (high.x - low.x); \
                              }
 
-ccl_device_inline void filter_get_feature_mean(int2 pixel, ccl_global float ccl_readonly_ptr buffer, float *features, int pass_stride)
-{
-	features[0] = pixel.x;
-	features[1] = pixel.y;
-	features[2] = ccl_get_feature(0);
-	features[3] = ccl_get_feature(1);
-	features[4] = ccl_get_feature(2);
-	features[5] = ccl_get_feature(3);
-	features[6] = ccl_get_feature(4);
-	features[7] = ccl_get_feature(5);
-	features[8] = ccl_get_feature(6);
-	features[9] = ccl_get_feature(7);
-}
-
 ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *features, float ccl_readonly_ptr mean, int pass_stride)
 {
 	features[0] = pixel.x;
 	features[1] = pixel.y;
-	features[2] = ccl_get_feature(0);
-	features[3] = ccl_get_feature(1);
-	features[4] = ccl_get_feature(2);
-	features[5] = ccl_get_feature(3);
-	features[6] = ccl_get_feature(4);
-	features[7] = ccl_get_feature(5);
-	features[8] = ccl_get_feature(6);
-	features[9] = ccl_get_feature(7);
+	features[2] = ccl_get_feature(buffer, 0);
+	features[3] = ccl_get_feature(buffer, 1);
+	features[4] = ccl_get_feature(buffer, 2);
+	features[5] = ccl_get_feature(buffer, 3);
+	features[6] = ccl_get_feature(buffer, 4);
+	features[7] = ccl_get_feature(buffer, 5);
+	features[8] = ccl_get_feature(buffer, 6);
+	features[9] = ccl_get_feature(buffer, 7);
 	if(mean) {
 		for(int i = 0; i < DENOISE_FEATURES; i++)
 			features[i] -= mean[i];
@@ -64,14 +50,14 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float cc
 {
 	scales[0] = fabsf(pixel.x - mean[0]);
 	scales[1] = fabsf(pixel.y - mean[1]);
-	scales[2] = fabsf(ccl_get_feature(0) - mean[2]);
-	scales[3] = len_squared(make_float3(ccl_get_feature(1) - mean[3],
-	                                    ccl_get_feature(2) - mean[4],
-	                                    ccl_get_feature(3) - mean[5]));
-	scales[4] = fabsf(ccl_get_feature(4) - mean[6]);
-	scales[5] = len_squared(make_float3(ccl_get_feature(5) - mean[7],
-	                                    ccl_get_feature(6) - mean[8],
-	                                    ccl_get_feature(7) - mean[9]));
+	scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
+	scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+	                                    ccl_get_feature(buffer, 2) - mean[4],
+	                                    ccl_get_feature(buffer, 3) - mean[5]));
+	scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+	scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+	                                    ccl_get_feature(buffer, 6) - mean[8],
+	                                    ccl_get_feature(buffer, 7) - mean[9]));
 }
 
 ccl_device_inline void filter_calculate_scale(float *scale)
@@ -86,12 +72,12 @@ ccl_device_inline void filter_calculate_scale(float *scale)
 
 ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
 {
-	return make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2));
+	return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
 }
 
 ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
 {
-	return average(make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2)));
+	return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
 }
 
 ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_variance, float3 center_color, float sqrt_center_variance)
@@ -101,27 +87,41 @@ ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_
 	return (color_diff > 3.0f*variance);
 }
 
+ccl_device_inline void design_row_add(float ccl_local_param *design_row,
+                                      int rank,
+                                      ccl_global float ccl_readonly_ptr transform,
+                                      int stride,
+                                      int row,
+                                      float feature)
+{
+	for(int i = 0; i < rank; i++) {
+		design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+	}
+}
+
 /* Fill the design row without computing the weight. */
-ccl_device_inline void filter_get_design_row_transform(int2 pixel,
-                                                       ccl_global float ccl_readonly_ptr buffer,
-                                                       float ccl_readonly_ptr feature_means,
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+                                                       ccl_global float ccl_readonly_ptr p_buffer,
+                                                       int2 q_pixel,
+                                                       ccl_global float ccl_readonly_ptr q_buffer,
                                                        int pass_stride,
-                                                       ccl_local_param float *features,
                                                        int rank,
-                                                       float *design_row,
-                                                       ccl_global float ccl_readonly_ptr feature_transform,
-                                                       int transform_stride)
+                                                       float ccl_local_param *design_row,
+                                                       ccl_global float ccl_readonly_ptr transform,
+                                                       int stride)
 {
-	filter_get_features(pixel, buffer, features, feature_means, pass_stride);
 	design_row[0] = 1.0f;
-	for(int d = 0; d < rank; d++) {
-#ifdef __KERNEL_GPU__
-		float x = math_vector_dot_strided(features, feature_transform + d*DENOISE_FEATURES*transform_stride, transform_stride, DENOISE_FEATURES);
-#else
-		float x = math_vector_dot(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
-#endif
-		design_row[1+d] = x;
-	}
+	math_local_vector_zero(design_row+1, rank);
+	design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+	design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+	design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
+	design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+	design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+	design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+	design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+	design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+	design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+	design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/filter/filter_reconstruction.h b/intern/cycles/filter/filter_reconstruction.h
index f665e7e5f1e..70dfedce453 100644
--- a/intern/cycles/filter/filter_reconstruction.h
+++ b/intern/cycles/filter/filter_reconstruction.h
@@ -38,11 +38,11 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
 	const int stride = 1;
 	(void)storage_stride;
 	(void)localIdx;
-	float features[DENOISE_FEATURES];
+	float design_row[DENOISE_FEATURES+1];
 #else
 	const int stride = storage_stride;
-	ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
-	ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
 #endif
 
 	float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
@@ -55,11 +55,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
 		return;
 	}
 
-	float feature_means[DENOISE_FEATURES];
-	filter_get_feature_mean(make_int2(x, y), buffer + p_offset, feature_means, pass_stride);
-
-	float design_row[DENOISE_FEATURES+1];
-	filter_get_design_row_transform(make_int2(x+dx, y+dy), buffer + q_offset, feature_means, pass_stride, features, *rank, design_row, transform, stride);
+	filter_get_design_row_transform(make_int2(x, y),       buffer + p_offset,
+	                                make_int2(x+dx, y+dy), buffer + q_offset,
+	                                pass_stride, *rank, design_row, transform, stride);
 
 	math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
 	math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
diff --git a/intern/cycles/filter/filter_transform.h b/intern/cycles/filter/filter_transform.h
index 3f161b06024..7e2504612e7 100644
--- a/intern/cycles/filter/filter_transform.h
+++ b/intern/cycles/filter/filter_transform.h
@@ -106,6 +106,7 @@ ccl_device void kernel_filter_construct_transform(int sample, float ccl_readonly
 			math_vector_mul(transform + (*rank)*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
 		}
 	}
+	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
 }
 
 CCL_NAMESPACE_END
diff

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list