[Bf-blender-cvs] [30f4576d51b] temp-cycles-denoising: Cycles Denoising: Refactor design row construction
Lukas Stockner
noreply at git.blender.org
Fri Apr 14 00:57:24 CEST 2017
Commit: 30f4576d51bd247488da8d250b541539a3bd5fa8
Author: Lukas Stockner
Date: Thu Mar 30 00:25:45 2017 +0200
Branches: temp-cycles-denoising
https://developer.blender.org/rB30f4576d51bd247488da8d250b541539a3bd5fa8
Cycles Denoising: Refactor design row construction
This change improves the shared memory access pattern, reduces the local memory requirement by 11 floats per thread and saves some memory copying.
===================================================================
M intern/cycles/filter/filter_features.h
M intern/cycles/filter/filter_reconstruction.h
M intern/cycles/filter/filter_transform.h
M intern/cycles/filter/filter_transform_gpu.h
M intern/cycles/filter/filter_transform_sse.h
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/filter/filter_features.h b/intern/cycles/filter/filter_features.h
index b25649f497a..f4f6e1f7639 100644
--- a/intern/cycles/filter/filter_features.h
+++ b/intern/cycles/filter/filter_features.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-#define ccl_get_feature(pass) buffer[(pass)*pass_stride]
+#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
* pixel_buffer always points to the current pixel in the first pass. */
@@ -28,32 +28,18 @@
pixel_buffer += buffer_w - (high.x - low.x); \
}
-ccl_device_inline void filter_get_feature_mean(int2 pixel, ccl_global float ccl_readonly_ptr buffer, float *features, int pass_stride)
-{
- features[0] = pixel.x;
- features[1] = pixel.y;
- features[2] = ccl_get_feature(0);
- features[3] = ccl_get_feature(1);
- features[4] = ccl_get_feature(2);
- features[5] = ccl_get_feature(3);
- features[6] = ccl_get_feature(4);
- features[7] = ccl_get_feature(5);
- features[8] = ccl_get_feature(6);
- features[9] = ccl_get_feature(7);
-}
-
ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *features, float ccl_readonly_ptr mean, int pass_stride)
{
features[0] = pixel.x;
features[1] = pixel.y;
- features[2] = ccl_get_feature(0);
- features[3] = ccl_get_feature(1);
- features[4] = ccl_get_feature(2);
- features[5] = ccl_get_feature(3);
- features[6] = ccl_get_feature(4);
- features[7] = ccl_get_feature(5);
- features[8] = ccl_get_feature(6);
- features[9] = ccl_get_feature(7);
+ features[2] = ccl_get_feature(buffer, 0);
+ features[3] = ccl_get_feature(buffer, 1);
+ features[4] = ccl_get_feature(buffer, 2);
+ features[5] = ccl_get_feature(buffer, 3);
+ features[6] = ccl_get_feature(buffer, 4);
+ features[7] = ccl_get_feature(buffer, 5);
+ features[8] = ccl_get_feature(buffer, 6);
+ features[9] = ccl_get_feature(buffer, 7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] -= mean[i];
@@ -64,14 +50,14 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float cc
{
scales[0] = fabsf(pixel.x - mean[0]);
scales[1] = fabsf(pixel.y - mean[1]);
- scales[2] = fabsf(ccl_get_feature(0) - mean[2]);
- scales[3] = len_squared(make_float3(ccl_get_feature(1) - mean[3],
- ccl_get_feature(2) - mean[4],
- ccl_get_feature(3) - mean[5]));
- scales[4] = fabsf(ccl_get_feature(4) - mean[6]);
- scales[5] = len_squared(make_float3(ccl_get_feature(5) - mean[7],
- ccl_get_feature(6) - mean[8],
- ccl_get_feature(7) - mean[9]));
+ scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
+ scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+ ccl_get_feature(buffer, 2) - mean[4],
+ ccl_get_feature(buffer, 3) - mean[5]));
+ scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+ scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+ ccl_get_feature(buffer, 6) - mean[8],
+ ccl_get_feature(buffer, 7) - mean[9]));
}
ccl_device_inline void filter_calculate_scale(float *scale)
@@ -86,12 +72,12 @@ ccl_device_inline void filter_calculate_scale(float *scale)
ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
{
- return make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2));
+ return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
}
ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
{
- return average(make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2)));
+ return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
}
ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_variance, float3 center_color, float sqrt_center_variance)
@@ -101,27 +87,41 @@ ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_
return (color_diff > 3.0f*variance);
}
+ccl_device_inline void design_row_add(float ccl_local_param *design_row,
+ int rank,
+ ccl_global float ccl_readonly_ptr transform,
+ int stride,
+ int row,
+ float feature)
+{
+ for(int i = 0; i < rank; i++) {
+ design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ }
+}
+
/* Fill the design row without computing the weight. */
-ccl_device_inline void filter_get_design_row_transform(int2 pixel,
- ccl_global float ccl_readonly_ptr buffer,
- float ccl_readonly_ptr feature_means,
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ ccl_global float ccl_readonly_ptr p_buffer,
+ int2 q_pixel,
+ ccl_global float ccl_readonly_ptr q_buffer,
int pass_stride,
- ccl_local_param float *features,
int rank,
- float *design_row,
- ccl_global float ccl_readonly_ptr feature_transform,
- int transform_stride)
+ float ccl_local_param *design_row,
+ ccl_global float ccl_readonly_ptr transform,
+ int stride)
{
- filter_get_features(pixel, buffer, features, feature_means, pass_stride);
design_row[0] = 1.0f;
- for(int d = 0; d < rank; d++) {
-#ifdef __KERNEL_GPU__
- float x = math_vector_dot_strided(features, feature_transform + d*DENOISE_FEATURES*transform_stride, transform_stride, DENOISE_FEATURES);
-#else
- float x = math_vector_dot(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
-#endif
- design_row[1+d] = x;
- }
+ math_local_vector_zero(design_row+1, rank);
+ design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+ design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+ design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
+ design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/filter/filter_reconstruction.h b/intern/cycles/filter/filter_reconstruction.h
index f665e7e5f1e..70dfedce453 100644
--- a/intern/cycles/filter/filter_reconstruction.h
+++ b/intern/cycles/filter/filter_reconstruction.h
@@ -38,11 +38,11 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
const int stride = 1;
(void)storage_stride;
(void)localIdx;
- float features[DENOISE_FEATURES];
+ float design_row[DENOISE_FEATURES+1];
#else
const int stride = storage_stride;
- ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
#endif
float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
@@ -55,11 +55,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
return;
}
- float feature_means[DENOISE_FEATURES];
- filter_get_feature_mean(make_int2(x, y), buffer + p_offset, feature_means, pass_stride);
-
- float design_row[DENOISE_FEATURES+1];
- filter_get_design_row_transform(make_int2(x+dx, y+dy), buffer + q_offset, feature_means, pass_stride, features, *rank, design_row, transform, stride);
+ filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
+ make_int2(x+dx, y+dy), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride);
math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
diff --git a/intern/cycles/filter/filter_transform.h b/intern/cycles/filter/filter_transform.h
index 3f161b06024..7e2504612e7 100644
--- a/intern/cycles/filter/filter_transform.h
+++ b/intern/cycles/filter/filter_transform.h
@@ -106,6 +106,7 @@ ccl_device void kernel_filter_construct_transform(int sample, float ccl_readonly
math_vector_mul(transform + (*rank)*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
}
}
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
}
CCL_NAMESPACE_END
diff
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list