[Bf-blender-cvs] [cf60e3f] soc-2016-cycles_denoising: Cycles: Implement NLM-weight filtering kernel
Lukas Stockner
noreply at git.blender.org
Tue Nov 22 04:25:10 CET 2016
Commit: cf60e3f5dbb05e1433f2fe1cf0e5840220ca5896
Author: Lukas Stockner
Date: Mon Nov 14 11:06:40 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBcf60e3f5dbb05e1433f2fe1cf0e5840220ca5896
Cycles: Implement NLM-weight filtering kernel
===================================================================
M intern/cycles/kernel/kernel_filter.h
M intern/cycles/kernel/kernel_filter_pre.h
M intern/cycles/kernel/kernel_filter_util.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
===================================================================
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index 34e62e5..29a0742 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "kernel_filter_pre.h"
#include "kernel_filter_util.h"
+#include "kernel_filter_pre.h"
#include "kernel_filter_old.h"
@@ -432,6 +432,122 @@ ccl_device void kernel_filter_final_pass_wlr(KernelGlobals *kg, int sample, floa
#endif
}
+ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, float ccl_readonly_ptr transform, CUDAFilterStorage *storage, int4 filter_area, int4 rect, int transform_stride, int localIdx)
+{
+ __shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+ float *features = shared_features + DENOISE_FEATURES*localIdx;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int buffer_h = (rect.w - rect.y);
+ int pass_stride = buffer_h * buffer_w * kernel_data.film.num_frames;
+ int num_frames = kernel_data.film.num_frames;
+ int prev_frames = kernel_data.film.prev_frames;
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - kernel_data.integrator.half_window),
+ max(rect.y, y - kernel_data.integrator.half_window));
+ int2 high = make_int2(min(rect.z, x + kernel_data.integrator.half_window + 1),
+ min(rect.w, y + kernel_data.integrator.half_window + 1));
+ float ccl_readonly_ptr pixel_buffer;
+ /* === Get center pixel. === */
+ float ccl_readonly_ptr center_buffer = buffer + (y - rect.y) * buffer_w + (x - rect.x);
+ float3 center_color = filter_get_pixel_color(center_buffer, pass_stride);
+ float sqrt_center_variance = sqrtf(filter_get_pixel_variance(center_buffer, pass_stride));
+
+ float feature_means[DENOISE_FEATURES];
+ filter_get_features(x, y, 0, center_buffer, feature_means, NULL, pass_stride);
+
+
+ /* === Fetch stored data from the previous kernel. === */
+ int rank = storage->rank;
+
+
+
+
+
+
+
+
+
+ /* === Calculate the final pixel color. === */
+ float XtX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
+
+ int matrix_size = rank+1;
+ math_matrix_zero_lower(XtX, matrix_size);
+
+ FOR_PIXEL_WINDOW {
+ float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
+ float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
+ if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+
+ filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
+ filter_fill_design_row_no_weight_cuda(features, rank, design_row, transform, transform_stride);
+
+ float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, 0.5f, 4, rect);
+ if(weight == 0.0f) continue;
+ weight /= max(1.0f, variance);
+
+ math_add_gramian(XtX, matrix_size, design_row, weight);
+ } END_FOR_PIXEL_WINDOW
+
+#ifdef WITH_CYCLES_DEBUG_FILTER
+ storage->filtered_global_bandwidth = global_bandwidth;
+ storage->sum_weight = XtX[0];
+#endif
+
+ math_matrix_add_diagonal(XtX, matrix_size, 1e-4f); /* Improve the numerical stability. */
+ math_cholesky(XtX, matrix_size);
+ math_inverse_lower_tri_inplace(XtX, matrix_size);
+
+ float r_feature_weight[DENOISE_FEATURES+1];
+ math_vector_zero(r_feature_weight, matrix_size);
+ for(int col = 0; col < matrix_size; col++)
+ for(int row = col; row < matrix_size; row++)
+ r_feature_weight[col] += XtX[row]*XtX[col*matrix_size+row];
+
+ float3 final_color = make_float3(0.0f, 0.0f, 0.0f);
+ float3 final_pos_color = make_float3(0.0f, 0.0f, 0.0f);
+ float pos_weight = 0.0f;
+ FOR_PIXEL_WINDOW {
+ float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
+ float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
+ if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+
+ filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
+ filter_fill_design_row_no_weight_cuda(features, rank, design_row, transform, transform_stride);
+
+ float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, 0.5f, 4, rect);
+ if(weight == 0.0f) continue;
+ weight /= max(1.0f, variance);
+ weight *= math_dot(design_row, r_feature_weight, matrix_size);
+
+ final_color += weight * color;
+
+ if(weight >= 0.0f) {
+ final_pos_color += weight * color;
+ pos_weight += weight;
+ }
+ } END_FOR_PIXEL_WINDOW
+
+ if(final_color.x < 0.0f || final_color.y < 0.0f || final_color.z < 0.0f) {
+ final_color = final_pos_color / max(pos_weight, 1e-5f);
+ }
+ final_color *= sample;
+
+ float *combined_buffer = buffers + (offset + y*stride + x)*kernel_data.film.pass_stride;
+ if(kernel_data.film.pass_no_denoising)
+ final_color += make_float3(combined_buffer[kernel_data.film.pass_no_denoising],
+ combined_buffer[kernel_data.film.pass_no_denoising+1],
+ combined_buffer[kernel_data.film.pass_no_denoising+2]);
+
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
+
+#ifdef WITH_CYCLES_DEBUG_FILTER
+ storage->log_rmse_per_sample -= 2.0f * logf(linear_rgb_to_gray(final_color) + 0.001f);
+#endif
+}
+
#else
# ifdef __KERNEL_SSE3__
@@ -1162,6 +1278,125 @@ ccl_device void kernel_filter_final_pass_wlr(KernelGlobals *kg, int sample, floa
#endif
}
+ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, FilterStorage *storage, int4 filter_area, int4 rect)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int buffer_h = (rect.w - rect.y);
+ int pass_stride = buffer_h * buffer_w * kernel_data.film.num_frames;
+ int num_frames = kernel_data.film.num_frames;
+ int prev_frames = kernel_data.film.prev_frames;
+
+ float features[DENOISE_FEATURES];
+ float ccl_readonly_ptr pixel_buffer;
+
+ /* === Get center pixel. === */
+ float ccl_readonly_ptr center_buffer = buffer + (y - rect.y) * buffer_w + (x - rect.x);
+ float3 center_color = filter_get_pixel_color(center_buffer, pass_stride);
+ float sqrt_center_variance = sqrtf(filter_get_pixel_variance(center_buffer, pass_stride));
+
+ float feature_means[DENOISE_FEATURES];
+ filter_get_features(x, y, 0, center_buffer, feature_means, NULL, pass_stride);
+
+
+
+
+ /* === Fetch stored data from the previous kernel. === */
+ float *feature_transform = &storage->transform[0];
+ int rank = storage->rank;
+
+
+
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - kernel_data.integrator.half_window),
+ max(rect.y, y - kernel_data.integrator.half_window));
+ int2 high = make_int2(min(rect.z, x + kernel_data.integrator.half_window + 1),
+ min(rect.w, y + kernel_data.integrator.half_window + 1));
+
+
+
+
+
+ /* === Calculate the final pixel color. === */
+ float XtX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
+
+ int matrix_size = rank+1;
+ math_matrix_zero_lower(XtX, matrix_size);
+
+ FOR_PIXEL_WINDOW {
+ float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
+ float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
+ if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+
+ filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
+ filter_fill_design_row_no_weight(features, rank, design_row, feature_transform);
+
+ float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, 0.5f, 4, rect);
+ if(weight < 1e-5f) continue;
+ weight /= max(1.0f, variance);
+
+ math_add_gramian(XtX, matrix_size, design_row, weight);
+ } END_FOR_PIXEL_WINDOW
+
+#ifdef WITH_CYCLES_DEBUG_FILTER
+ storage->filtered_global_bandwidth = global_bandwidth;
+ storage->sum_weight = XtX[0];
+#endif
+
+ math_matrix_add_diagonal(XtX, matrix_size, 1e-4f); /* Improve the numerical stability. */
+ math_cholesky(XtX, matrix_size);
+ math_inverse_lower_tri_inplace(XtX, matrix_size);
+
+ float r_feature_weight[DENOISE_FEATURES+1];
+ math_vector_zero(r_feature_weight, matrix_size);
+ for(int col = 0; col < matrix_size; col++)
+ for(int row = col; row < matrix_size; row++)
+ r_feature_weight[col] += XtX[row]*XtX[col*matrix_size+row];
+
+ float3 final_color = make_float3(0.0f, 0.0f, 0.0f);
+ float3 final_pos_color = make_float3(0.0f, 0.0f, 0.0f);
+ float pos_weight = 0.0f;
+ FOR_PIXEL_WINDOW {
+ float3 color = filter_get_pixel_color(pixel_buffer, pass_stride);
+ float variance = filter_get_pixel_variance(pixel_buffer, pass_stride);
+ if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
+
+ filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
+ filter_fill_design_row_no_weight(features, rank, design_row, feature_transform);
+
+ float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, 0.5f, 4, rect);
+ if(weight < 1e-5f) continue;
+ weight /= max(1.0f, variance);
+ weight *= math_dot(design_row, r_feature_weight, matrix_size);
+
+ final_color += weight * color;
+
+ if(weight >= 0.0f) {
+ final_pos_color += weight * color;
+ pos_weight += weight;
+ }
+ } END_FOR_PIXEL_WINDOW
+
+ if(final_color.x < 0.0f || final_color.y < 0.0f || final_color.z < 0.0f) {
+ final_color = final_pos_color / max(pos_weight, 1e-5f);
+ }
+ final_color *= sample;
+
+ float *combined_buffer = buffers + (offset + y*stride + x)*kernel_data.film.pass_stride;
+ if(kernel_data.film.pass_no_denoising)
+ final_color += make_float3(combined_buffer[kernel_data.film.pass_no_denoising],
+ combined_buffer[kernel_data.film.pass_no_denoising+1],
+ combined_buffer[kernel_data.film.pass_no_denoising+2]);
+
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
+
+#ifdef WITH_CYCLES_DEBUG_
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list