[Bf-blender-cvs] [fba2b77] soc-2016-cycles_denoising: Cycles: Further improve CUDA denoising speed by redesigning the design_row
Lukas Stockner
noreply at git.blender.org
Sun Aug 21 06:18:25 CEST 2016
Commit: fba2b77c2a12950802491c3112b3922f5805f98a
Author: Lukas Stockner
Date: Sun Aug 21 05:04:08 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBfba2b77c2a12950802491c3112b3922f5805f98a
Cycles: Further improve CUDA denoising speed by redesigning the design_row
The previous algorithm was:
- Fetch buffer data into the feature vector which was in shared (faster) memory
- Use the feature vector to calculate the weight and the design_row, which was stored in local (slower) memory
- Update the Gramian matrix using the design_row
Now, the problem there is that the most expensive part in terms of memory accesses is the third step, which means that having the design_row in shared memory would be a great improvement.
However, shared memory is extremely limited - for good performance, the number of elements per thread should be odd (to avoid bank comflicts), but even going from the 11 floats that the feature vector needs to 13 already significantly hurts the occupancy.
Therefore, in order to make room for the design_row, it would be great to get rid of the feature vector.
That's the first part of the commit: By changing the order in whoch the design_row is built, the first two steps can be merged so that the design_row is constructed directly from the buffer data instead of going through the feature vector.
This has a disadvantage - the old design_row construction had an early-abort for zero weights, which was pretty common. With the new structure, that's not possible anymore. However, this is less of a problem on GPUs due to divergence - in order to save any speed, all 32 threads in the warp had to abort anyways.
Now the feature vector doesn't take up memory anymore, but the design_row is still to big - it has up to 23 elements, which is far too much.
It has a useful property, though - the first element is always one, and the last 11 elements are just the squares of the first 11. So, storing 11 floats is enough to have all information, and the squaring can be performed when the design_row is used.
Therefore, the second part of the commit adds specialized functions that accept this reduced design_row and account for these missing elements.
===================================================================
M intern/cycles/kernel/kernel_filter.h
M intern/cycles/kernel/kernel_filter_util.h
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index b2f93b8..58399fa 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -117,8 +117,8 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
ccl_device void kernel_filter_estimate_bandwidths(KernelGlobals *kg, int sample, float const* __restrict__ buffer, int x, int y, float const* __restrict__ transform, FilterStorage *storage, int4 rect, int transform_stride, int localIdx)
{
- __shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
- float *features = shared_features + localIdx*DENOISE_FEATURES;
+ __shared__ float shared_design_row[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+ float *design_row = shared_design_row + localIdx*DENOISE_FEATURES;
int buffer_w = align_up(rect.z - rect.x, 4);
int buffer_h = (rect.w - rect.y);
@@ -144,20 +144,19 @@ ccl_device void kernel_filter_estimate_bandwidths(KernelGlobals *kg, int sample,
* both the r-feature vector z as well as z^T*z and using the resulting parameter for
* that dimension of the z^T*z vector times two as the derivative. */
int matrix_size = 2*rank + 1; /* Constant term (1 dim) + z (rank dims) + z^T*z (rank dims) */
- float XtX[(2*DENOISE_FEATURES+1)*(2*DENOISE_FEATURES+1)], design_row[2*DENOISE_FEATURES+1];
+ float XtX[(2*DENOISE_FEATURES+1)*(2*DENOISE_FEATURES+1)];
float3 XtY[2*DENOISE_FEATURES+1];
math_matrix_zero_lower(XtX, matrix_size);
math_vec3_zero(XtY, matrix_size);
FOR_PIXEL_WINDOW {
- filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
- float weight = filter_fill_design_row_cuda(features, rank, design_row, transform, transform_stride, NULL);
+ float weight = filter_fill_design_row_cuda(design_row, rank, transform, transform_stride, NULL, px, py, pt, pixel_buffer, feature_means, pass_stride);
if(weight == 0.0f) continue;
weight /= max(1.0f, filter_get_pixel_variance(pixel_buffer, pass_stride));
- math_add_gramian(XtX, matrix_size, design_row, weight);
- math_add_vec3(XtY, matrix_size, design_row, weight * filter_get_pixel_color(pixel_buffer, pass_stride));
+ math_add_gramian_one_sqr(XtX, matrix_size, design_row, weight);
+ math_add_vec3_one_sqr(XtY, matrix_size, design_row, weight * filter_get_pixel_color(pixel_buffer, pass_stride));
} END_FOR_PIXEL_WINDOW
/* Solve the normal equation of the linear least squares system: Decompose A = X^T*X into L
@@ -178,8 +177,8 @@ ccl_device void kernel_filter_estimate_bandwidths(KernelGlobals *kg, int sample,
ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int sample, float const* __restrict__ buffer, int x, int y, float const* __restrict__ transform, FilterStorage *storage, int4 rect, int candidate, int transform_stride, int localIdx)
{
- __shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
- float *features = shared_features + DENOISE_FEATURES*localIdx;
+ __shared__ float shared_design_row[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+ float *design_row = shared_design_row + localIdx*DENOISE_FEATURES;
int buffer_w = align_up(rect.z - rect.x, 4);
int buffer_h = (rect.w - rect.y);
@@ -209,7 +208,7 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
g_bandwidth_factor[i] = storage->bandwidth[i]/candidate_bw[candidate];
int matrix_size = rank+1;
- float XtX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
+ float XtX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)];
math_matrix_zero_lower(XtX, matrix_size);
FOR_PIXEL_WINDOW {
@@ -217,13 +216,12 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
- filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
- float weight = filter_fill_design_row_cuda(features, rank, design_row, transform, transform_stride, g_bandwidth_factor);
+ float weight = filter_fill_design_row_cuda(design_row, rank, transform, transform_stride, g_bandwidth_factor, px, py, pt, pixel_buffer, feature_means, pass_stride);
if(weight == 0.0f) continue;
weight /= max(1.0f, variance);
- math_add_gramian(XtX, matrix_size, design_row, weight);
+ math_add_gramian_one(XtX, matrix_size, design_row, weight);
} END_FOR_PIXEL_WINDOW
math_matrix_add_diagonal(XtX, matrix_size, 1e-4f); /* Improve the numerical stability. */
@@ -245,12 +243,11 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
- filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
- float weight = filter_fill_design_row_cuda(features, rank, design_row, transform, transform_stride, g_bandwidth_factor);
+ float weight = filter_fill_design_row_cuda(design_row, rank, transform, transform_stride, g_bandwidth_factor, px, py, pt, pixel_buffer, feature_means, pass_stride);
if(weight == 0.0f) continue;
weight /= max(1.0f, variance);
- weight *= math_dot(design_row, r_feature_weight, matrix_size);
+ weight *= math_dot_one(r_feature_weight, design_row, matrix_size);
est_color += weight * color;
est_variance += weight*weight * max(variance, 0.0f);
@@ -274,29 +271,27 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
ccl_device void kernel_filter_calculate_bandwidth(KernelGlobals *kg, int sample, FilterStorage *storage)
{
- double lsq_bias[LSQ_SIZE], lsq_variance[LSQ_SIZE];
- math_lsq_init(lsq_bias);
- math_lsq_init(lsq_variance);
-
const float candidate_bw[6] = {0.05f, 0.1f, 0.25f, 0.5f, 1.0f, 2.0f};
+ double bias_XtX = 0.0, bias_XtY = 0.0, var_XtX = 0.0, var_XtY = 0.0;
for(int g = 0; g < 6; g++) {
- math_lsq_add(lsq_bias, (double) (candidate_bw[g]*candidate_bw[g]), (double) storage->est_bias[g]);
- math_lsq_add(lsq_variance, pow(candidate_bw[g], -storage->rank), max(sample*storage->est_variance[g], 0.0f));
+ double bias_x = (double) (candidate_bw[g]*candidate_bw[g]);
+ bias_XtX += bias_x*bias_x;
+ bias_XtY += bias_x * (double) storage->est_bias[g];
+ double var_x = 1.0 / (pow(candidate_bw[g], storage->rank) * sample);
+ var_XtX += var_x*var_x;
+ var_XtY += var_x * (double) storage->est_variance[g];
}
/* === Estimate optimal global bandwidth. === */
- double bias_coef = math_lsq_solve(lsq_bias, NULL);
- double variance_coef = math_lsq_solve(lsq_variance, NULL);
- if(variance_coef < 0.0) {
- variance_coef = -variance_coef;
- }
+ double bias_coef = bias_XtY / bias_XtX;
+ double variance_coef = var_XtY / var_XtX;
storage->global_bandwidth = (float) pow((storage->rank * variance_coef) / (4.0 * bias_coef*bias_coef * sample), 1.0 / (storage->rank + 4));
}
ccl_device void kernel_filter_final_pass(KernelGlobals *kg, int sample, float const* __restrict__ buffer, int x, int y, int offset, int stride, float *buffers, float const* __restrict__ transform, FilterStorage *storage, int4 filter_area, int4 rect, int transform_stride, int localIdx)
{
- __shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
- float *features = shared_features + DENOISE_FEATURES*localIdx;
+ __shared__ float shared_design_row[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+ float *design_row = shared_design_row + localIdx*DENOISE_FEATURES;
int buffer_w = align_up(rect.z - rect.x, 4);
int buffer_h = (rect.w - rect.y);
@@ -355,7 +350,7 @@ ccl_device void kernel_filter_final_pass(KernelGlobals *kg, int sample, float co
/* === Calculate the final pixel color. === */
- float XtX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
+ float XtX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)];
int matrix_size = rank+1;
math_matrix_zero_lower(XtX, matrix_size);
@@ -365,13 +360,12 @@ ccl_device void kernel_filter_final_pass(KernelGlobals *kg, int sample, float co
float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
- filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
- float weight = filter_fill_design_row_cuda(features, rank, design_row, transform, transform_stride, bandwidth_factor);
+ float weight = filter_fill_design_row_cuda(design_row, rank, transform, transform_stride, bandwidth_factor, px, py, pt, pixel_buffer, feature_means, pass_stride);
if(weight == 0.0f) continue;
weight /= max(1.0f, variance);
- math_add_gramian(XtX, matrix_size, design_row, weight);
+ math_add_gramian_one(XtX, matrix_size, design_row, weight);
} END_FOR_PIXEL_WINDOW
#ifdef WITH_CYCLES_DEBUG_FILTER
@@ -397,12 +391,11 @@ ccl_device void kernel_filter_final_pass(KernelGlobals *kg, int sample, float co
float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
- filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
- float weight = filter_fill_design_row_cuda(features, rank, design_row, transform, transform_stride, bandwidth_factor);
+ float weight = filter_fill_design_row_cuda(design_row, rank, transform, transform_stride, bandwidth_factor, px, py, pt, pixel_buffer, feature_means, pass_stride);
if(weight == 0.0f) continue;
weight /= max(1.0f, variance);
- weight *= math_dot(design_row, r_feature_weight, matrix_size);
+ weight *= math_dot_one(r_feature_weight, design_row, matrix_size);
final_color += weight * color;
diff --git a/intern/cycles/kernel/kernel_filter_util.h b/intern/cycles/kernel/kernel_filter_util.h
index 3aabe32..c48beed 100644
--- a/intern/cycles/kernel/kernel_filter_util.h
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list