[Bf-blender-cvs] [73c48d7347] soc-2016-cycles_denoising: Cycles Denoising: Use better and faster NLM implementation for feature pass prefiltering
Lukas Stockner
noreply at git.blender.org
Thu Jan 12 05:14:13 CET 2017
Commit: 73c48d7347ee9ca77aff7e558d97130c8eb43f87
Author: Lukas Stockner
Date: Wed Jan 11 07:26:55 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB73c48d7347ee9ca77aff7e558d97130c8eb43f87
Cycles Denoising: Use better and faster NLM implementation for feature pass prefiltering
Previously, the prefiltering NLM kernel was implemented just as it's described in the paper:
For every pixel P, loop over every pixel Q in the search window. Then, loop over the small patches around them, calculate the average difference, and use that to compute the weight of Q for the denoised result at P.
However, that gives you a time complexity of O(N^2 * R^2 * F^2), where N is the image size, R the search window size and F the patch size.
So, this patch implements the clever idea from "A Simple Trick to Speed Up and Improve the Non-Local Means" - by reformulating the loop, it's actually possible to skip a lot of computation and replace it with a separable box filter convolution. This reduces the complexity to O(N^2 * R^2 * F), and the number of pixel differences calculated to just O(N^2 * R^2)!
Furthermore, by applying a second box-filter pass after calculating the weights, we get the "patchwise NLM" improvement basically for free!
This is CPU-only so far, but that will change soon.
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/kernel/filter/filter_nlm.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 31191f3c84..a596097f15 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -135,12 +135,17 @@ public:
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int*)> filter_divide_shadow_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*)> filter_get_feature_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)> filter_non_local_means_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)> filter_construct_transform_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, float*, int*, int*)> filter_reconstruct_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, int, int, float*, int, int)> filter_divide_combined_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+
#define KERNEL_FUNCTIONS(name) \
KERNEL_NAME_EVAL(cpu, name), \
KERNEL_NAME_EVAL(cpu_sse2, name), \
@@ -157,11 +162,15 @@ public:
shader_kernel(KERNEL_FUNCTIONS(shader)),
filter_divide_shadow_kernel(KERNEL_FUNCTIONS(filter_divide_shadow)),
filter_get_feature_kernel(KERNEL_FUNCTIONS(filter_get_feature)),
- filter_non_local_means_kernel(KERNEL_FUNCTIONS(filter_non_local_means)),
filter_combine_halves_kernel(KERNEL_FUNCTIONS(filter_combine_halves)),
filter_construct_transform_kernel(KERNEL_FUNCTIONS(filter_construct_transform)),
filter_reconstruct_kernel(KERNEL_FUNCTIONS(filter_reconstruct)),
- filter_divide_combined_kernel(KERNEL_FUNCTIONS(filter_divide_combined))
+ filter_divide_combined_kernel(KERNEL_FUNCTIONS(filter_divide_combined)),
+ filter_nlm_calc_difference_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_difference)),
+ filter_nlm_blur_kernel(KERNEL_FUNCTIONS(filter_nlm_blur)),
+ filter_nlm_calc_weight_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_weight)),
+ filter_nlm_update_output_kernel(KERNEL_FUNCTIONS(filter_nlm_update_output)),
+ filter_nlm_normalize_kernel(KERNEL_FUNCTIONS(filter_nlm_normalize))
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
@@ -275,6 +284,30 @@ public:
}
};
+ void non_local_means(int4 rect, float *image, float *weight, float *out, float *variance, float *difference, float *blurDifference, float *weightAccum, int r, int f, float a, float k_2)
+ {
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+
+ memset(weightAccum, 0, sizeof(float)*w*h);
+ memset(out, 0, sizeof(float)*w*h);
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, 0, a, k_2);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+ filter_nlm_update_output_kernel()(dx, dy, blurDifference, image, out, weightAccum, local_rect, w, f);
+ }
+
+ int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+ filter_nlm_normalize_kernel()(out, weightAccum, local_rect, w);
+ }
+
float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides, int frames, int *frame_strides)
{
bool cross_denoise = kg->__data.film.denoise_cross;
@@ -294,51 +327,16 @@ public:
#ifdef WITH_CYCLES_DEBUG_FILTER
DenoiseDebug debug((rect.z - rect.x), h, 34);
#endif
- /* ==== Step 1: Prefilter general features. ==== */
- {
-
- float *unfiltered = filter_buffer + 16*pass_stride;
- /* Order in render buffers:
- * Normal[X, Y, Z] NormalVar[X, Y, Z] Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
- * 0 1 2 3 4 5 6 7 8 9 10 11 12 13
- *
- * Order in denoise buffer:
- * Normal[X, XVar, Y, YVar, Z, ZVar] Depth DepthVar Shadow ShadowVar Albedo[R, RVar, G, GVar, B, BVar] Color[R, RVar, G, GVar, B, BVar]
- * 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
- *
- * Order of processing: |NormalXYZ|Depth|AlbedoXYZ |
- * | | | | */
- int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
- int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
- int offset_to[] = { 0, 2, 4, 10, 12, 14, 6 };
- for(int i = 0; i < 7; i++) {
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, &rect.x);
- }
- }
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- filter_non_local_means_kernel()(x, y, unfiltered, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, filter_buffer + offset_to[i]*pass_stride, &rect.x, 2, 2, 1, 0.25f);
- }
- }
-#ifdef WITH_CYCLES_DEBUG_FILTER
-#define WRITE_DEBUG(name, var) debug.add_pass(string_printf("f%d_%s", i, name), var, 1, w);
- WRITE_DEBUG("unfiltered", unfiltered);
- WRITE_DEBUG("sampleV", filter_buffer + (offset_to[i]+1)*pass_stride);
- WRITE_DEBUG("filtered", filter_buffer + offset_to[i]*pass_stride);
-#undef WRITE_DEBUG
-#endif
- }
- }
+#define PASSPTR(i) (filter_buffer + (i)*pass_stride)
- /* ==== Step 2: Prefilter shadow feature. ==== */
+ /* ==== Step 1: Prefilter shadow feature. ==== */
{
/* Reuse some passes of the filter_buffer for temporary storage. */
- float *sampleV = filter_buffer + 16*pass_stride, *sampleVV = filter_buffer + 17*pass_stride, *bufferV = filter_buffer + 18*pass_stride, *cleanV = filter_buffer + 19*pass_stride;
- float *unfiltered = filter_buffer + 20*pass_stride;
+ float *sampleV = PASSPTR(0), *sampleVV = PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3);
+ float *unfiltered = PASSPTR(4), *unfilteredB = PASSPTR(5);
+ float *diffI = PASSPTR(10), *blurDiffI = PASSPTR(11), *accumI = PASSPTR(12);
/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
for(int y = rect.y; y < rect.w; y++) {
@@ -349,29 +347,21 @@ public:
#ifdef WITH_CYCLES_DEBUG_FILTER
#define WRITE_DEBUG(name, var) debug.add_pass(string_printf("shadow_%s", name), var, 1, w);
WRITE_DEBUG("unfilteredA", unfiltered);
- WRITE_DEBUG("unfilteredB", unfiltered + pass_stride);
+ WRITE_DEBUG("unfilteredB", unfilteredB);
WRITE_DEBUG("bufferV", bufferV);
WRITE_DEBUG("sampleV", sampleV);
WRITE_DEBUG("sampleVV", sampleVV);
#endif
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- filter_non_local_means_kernel()(x, y, bufferV, sampleV, sampleVV, cleanV, &rect.x, 6, 3, 4, 1.0f);
- }
- }
+ non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, diffI, blurDiffI, accumI, 6, 3, 4.0f, 1.0f);
#ifdef WITH_CYCLES_DEBUG_FILTER
- WRITE_DEBUG("cleanV", cleanV);
+ WRITE_DEBUG("cleanV", cleanV);
#endif
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- filter_non_local_means_kernel()(x, y, unfiltered, unfiltered + pass_stride, cleanV, sampleV, &rect.x, 5, 3, 1, 0.25f);
- filter_non_local_means_kernel()(x, y, unfiltered + pass_stride, unfiltered, cleanV, bufferV, &rect.x, 5, 3, 1, 0.25f);
- }
- }
+ non_local_means(rect, unfiltered, unfilteredB, sampleV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
+ non_local_means(rect, unfilteredB, unfiltered, bufferV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("filteredA", sampleV);
WRITE_DEBUG("filteredB", bufferV);
@@ -388,12 +378,8 @@ public:
#endif
/* Use the residual variance for a second filter pass. */
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- filter_non_local_means_kernel()(x, y, sampleV, bufferV, sampleVV, unfiltered , &rect.x, 4, 2, 1, 0.5f);
- filter_non_local_means_kernel()(x, y, bufferV, sampleV, sampleVV, unfiltered + pass_stride, &rect.x, 4, 2, 1, 0.5f);
- }
- }
+ non_local_means(rect, sampleV, bufferV, unfiltered , sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
+ non_local_means(rect, bufferV, sampleV, unfilteredB, sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("finalA", unfiltered
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list