[Bf-blender-cvs] [a0cc7bd961e] master: Cycles: Implement vectorized NLM kernels for faster CPU denoising
Lukas Stockner
noreply at git.blender.org
Sat Oct 6 21:56:12 CEST 2018
Commit: a0cc7bd961ef2cd501468dce08998992a88b3bed
Author: Lukas Stockner
Date: Sat Oct 6 20:39:01 2018 +0200
Branches: master
https://developer.blender.org/rBa0cc7bd961ef2cd501468dce08998992a88b3bed
Cycles: Implement vectorized NLM kernels for faster CPU denoising
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_denoising.cpp
M intern/cycles/kernel/filter/filter_nlm_cpu.h
M intern/cycles/kernel/kernels/cpu/filter_cpu.h
M intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
M intern/cycles/util/util_math.h
M intern/cycles/util/util_math_fast.h
M intern/cycles/util/util_math_float4.h
M intern/cycles/util/util_math_int4.h
M intern/cycles/util/util_types_int4.h
M intern/cycles/util/util_types_int4_impl.h
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 7c72ab1a009..4986bb809fb 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -184,11 +184,11 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
@@ -475,6 +475,7 @@ public:
float *blurDifference = temporary_mem;
float *difference = temporary_mem + task->buffer.pass_stride;
float *weightAccum = temporary_mem + 2*task->buffer.pass_stride;
+ float *temp_image = temporary_mem + 3*task->buffer.pass_stride;
memset(weightAccum, 0, sizeof(float)*w*h);
memset((float*) out_ptr, 0, sizeof(float)*w*h);
@@ -499,6 +500,7 @@ public:
filter_nlm_update_output_kernel()(dx, dy,
blurDifference,
(float*) image_ptr,
+ temp_image,
(float*) out_ptr,
weightAccum,
local_rect,
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 23c18fa15b2..3384e1d81fd 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer()
buffer.mem.alloc_to_device(mem_size, false);
/* CPUs process shifts sequentially while GPUs process them in parallel. */
- int num_shifts = 1;
+ int num_layers;
if(buffer.gpu_temporary_mem) {
/* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
int max_radius = max(radius, 6);
- num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+ int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+ num_layers = 2*num_shifts + 1;
+ }
+ else {
+ num_layers = 4;
}
/* Allocate two layers per shift as well as one for the weight accumulation. */
- buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride);
+ buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
}
void DenoisingTask::prefilter_shadowing()
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index e2da0fd872b..af73c0dadf2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,6 +16,9 @@
CCL_NAMESPACE_BEGIN
+#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
+#define load4_u(buf, ofs) load_float4((buf)+(ofs))
+
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
const float *ccl_restrict weight_image,
const float *ccl_restrict variance_image,
@@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
float a,
float k_2)
{
+ /* Strides need to be aligned to 16 bytes. */
+ kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+ int aligned_lowx = rect.x & (~3);
+ const int numChannels = (channel_offset > 0)? 3 : 1;
+ const float4 channel_fac = make_float4(1.0f / numChannels);
+
for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- float diff = 0.0f;
- int numChannels = channel_offset? 3 : 1;
- for(int c = 0; c < numChannels; c++) {
- float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
- float pvar = variance_image[c*channel_offset + y*stride + x];
- float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
- diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
- }
- if(numChannels > 1) {
- diff *= 1.0f/numChannels;
+ int idx_p = y*stride + aligned_lowx;
+ int idx_q = (y+dy)*stride + aligned_lowx + dx;
+ for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+ float4 diff = make_float4(0.0f);
+ for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+ /* idx_p is guaranteed to be aligned, but idx_q isn't. */
+ float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+ float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+ float4 cdiff = color_p - color_q;
+ float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+ float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+ diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
}
- difference_image[y*stride + x] = diff;
+ load4_a(difference_image, idx_p) = diff*channel_fac;
}
}
}
@@ -50,52 +61,77 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
int stride,
int f)
{
- int aligned_lowx = rect.x / 4;
- int aligned_highx = (rect.z + 3) / 4;
+ int aligned_lowx = round_down(rect.x, 4);
for(int y = rect.y; y < rect.w; y++) {
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
- for(int x = rect.x; x < rect.z; x++) {
- out_image[y*stride + x] = 0.0f;
+ for(int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y*stride + x) = make_float4(0.0f);
}
for(int y1 = low; y1 < high; y1++) {
- float4* out_image4 = (float4*)(out_image + y*stride);
- float4* difference_image4 = (float4*)(difference_image + y1*stride);
- for(int x = aligned_lowx; x < aligned_highx; x++) {
- out_image4[x] += difference_image4[x];
+ for(int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
}
}
- for(int x = rect.x; x < rect.z; x++) {
- out_image[y*stride + x] *= 1.0f/(high - low);
+ float fac = 1.0f/(high - low);
+ for(int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y*stride + x) *= fac;
}
}
}
-ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
- float *out_image,
- int4 rect,
- int stride,
- int f)
+ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
+ float *out_image,
+ int4 rect,
+ int stride,
+ int f)
{
+ int aligned_lowx = round_down(rect.x, 4);
for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- out_image[y*stride + x] = 0.0f;
+ for(int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y*stride + x) = make_float4(0.0f);
}
}
+
for(int dx = -f; dx <= f; dx++) {
- int pos_dx = max(0, dx);
- int neg_dx = min(0, dx);
+ aligned_lowx = round_down(rect.x - min(0, dx), 4);
+ int highx = rect.z - max(0, dx);
+ int4 lowx4 = make_int4(rect.x - min(0, dx));
+ int4 highx4 = make_int4(rect.z - max(0, dx));
for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
- out_image[y*stride + x] += difference_image[y*stride + x+dx];
+ for(int x = aligned_lowx; x < highx; x += 4) {
+ int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+ int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+ float4 diff = load4_u(difference_image, y*stride + x + dx);
+ load4_a(out_image, y*stride + x) += mask(active, diff);
}
}
}
+
+ aligned_lowx = round_down(rect.x, 4);
for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
+ for(int x = aligned_lowx; x < rect.z; x += 4) {
+ float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+ float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+ float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
+ load4_a(out_image, y*stride + x) *= rcp(high - low);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const flo
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list