[Bf-blender-cvs] [9aabef3] soc-2016-cycles_denoising: Cycles: Add a NLM filter for testing purposes
Lukas Stockner
noreply at git.blender.org
Tue Nov 22 04:25:07 CET 2016
Commit: 9aabef397515353ecc04270ee28d06364f3fd265
Author: Lukas Stockner
Date: Mon Nov 14 11:00:45 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB9aabef397515353ecc04270ee28d06364f3fd265
Cycles: Add a NLM filter for testing purposes
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/kernel/kernel_filter_pre.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 180bc5a..ec78cd0 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -420,6 +420,7 @@ public:
{
void(*filter_estimate_params_kernel)(KernelGlobals*, int, float*, int, int, void*, int*);
void(*filter_final_pass_kernel)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, int*, int*);
+ void(*filter_non_local_means_3)(int, int, float**, float**, float**, float**, int*, int, int, float, float);
void(*filter_old_1)(KernelGlobals*, float*, int, int, int, int, float, float*, int*);
void(*filter_old_2)(KernelGlobals*, float*, float*, int, int, int, int, int, int, float, float*, int*, int*);
@@ -427,6 +428,7 @@ public:
if(system_cpu_support_avx2()) {
filter_estimate_params_kernel = kernel_cpu_avx2_filter_estimate_params;
filter_final_pass_kernel = kernel_cpu_avx2_filter_final_pass;
+ filter_non_local_means_3 = kernel_cpu_avx2_filter_non_local_means_3;
filter_old_1 = kernel_cpu_avx2_filter_old_1;
filter_old_2 = kernel_cpu_avx2_filter_old_2;
}
@@ -436,6 +438,7 @@ public:
if(system_cpu_support_avx()) {
filter_estimate_params_kernel = kernel_cpu_avx_filter_estimate_params;
filter_final_pass_kernel = kernel_cpu_avx_filter_final_pass;
+ filter_non_local_means_3 = kernel_cpu_avx_filter_non_local_means_3;
filter_old_1 = kernel_cpu_avx_filter_old_1;
filter_old_2 = kernel_cpu_avx_filter_old_2;
}
@@ -445,6 +448,7 @@ public:
if(system_cpu_support_sse41()) {
filter_estimate_params_kernel = kernel_cpu_sse41_filter_estimate_params;
filter_final_pass_kernel = kernel_cpu_sse41_filter_final_pass;
+ filter_non_local_means_3 = kernel_cpu_sse41_filter_non_local_means_3;
filter_old_1 = kernel_cpu_sse41_filter_old_1;
filter_old_2 = kernel_cpu_sse41_filter_old_2;
}
@@ -454,6 +458,7 @@ public:
if(system_cpu_support_sse3()) {
filter_estimate_params_kernel = kernel_cpu_sse3_filter_estimate_params;
filter_final_pass_kernel = kernel_cpu_sse3_filter_final_pass;
+ filter_non_local_means_3 = kernel_cpu_sse3_filter_non_local_means_3;
filter_old_1 = kernel_cpu_sse3_filter_old_1;
filter_old_2 = kernel_cpu_sse3_filter_old_2;
}
@@ -463,6 +468,7 @@ public:
if(system_cpu_support_sse2()) {
filter_estimate_params_kernel = kernel_cpu_sse2_filter_estimate_params;
filter_final_pass_kernel = kernel_cpu_sse2_filter_final_pass;
+ filter_non_local_means_3 = kernel_cpu_sse2_filter_non_local_means_3;
filter_old_1 = kernel_cpu_sse2_filter_old_1;
filter_old_2 = kernel_cpu_sse2_filter_old_2;
}
@@ -471,15 +477,20 @@ public:
{
filter_estimate_params_kernel = kernel_cpu_filter_estimate_params;
filter_final_pass_kernel = kernel_cpu_filter_final_pass;
+ filter_non_local_means_3 = kernel_cpu_filter_non_local_means_3;
filter_old_1 = kernel_cpu_filter_old_1;
filter_old_2 = kernel_cpu_filter_old_2;
}
bool old_filter = getenv("OLD_FILTER");
+ bool nlm_filter = getenv("NLM_FILTER");
FilterStorage *storage = new FilterStorage[filter_area.z*filter_area.w];
int hw = kg->__data.integrator.half_window;
+ int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
+ int pass_stride = w*h;
+
if(old_filter) {
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
@@ -509,7 +520,29 @@ public:
WRITE_DEBUG("log_rmse_per_sample", log_rmse_per_sample);
#undef WRITE_DEBUG
#endif
- } else {
+ }
+ else if(nlm_filter) {
+ float *img[3] = {filter_buffer + 16*pass_stride, filter_buffer + 18*pass_stride, filter_buffer + 20*pass_stride};
+ float *var[3] = {filter_buffer + 17*pass_stride, filter_buffer + 19*pass_stride, filter_buffer + 21*pass_stride};
+ float *out[3] = {filter_buffer + 0*pass_stride, filter_buffer + 1*pass_stride, filter_buffer + 2*pass_stride};
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ filter_non_local_means_3(x, y, img, img, var, out, &rect.x, 10, 4, 1, 0.04f);
+ }
+ }
+ for(int y = 0; y < filter_area.w; y++) {
+ int py = y + filter_area.y;
+ for(int x = 0; x < filter_area.z; x++) {
+ int px = x + filter_area.x;
+ int i = (py - rect.y)*w + (px - rect.x);
+ float *loc_buf = buffers + (offset + py*stride + px)*kg->__data.film.pass_stride;
+ loc_buf[0] = sample*filter_buffer[0*pass_stride + i];
+ loc_buf[1] = sample*filter_buffer[1*pass_stride + i];
+ loc_buf[2] = sample*filter_buffer[2*pass_stride + i];
+ }
+ }
+ }
+ else {
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
filter_estimate_params_kernel(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, storage + y*filter_area.z + x, &rect.x);
diff --git a/intern/cycles/kernel/kernel_filter_pre.h b/intern/cycles/kernel/kernel_filter_pre.h
index 04f5c03..47c69f3 100644
--- a/intern/cycles/kernel/kernel_filter_pre.h
+++ b/intern/cycles/kernel/kernel_filter_pre.h
@@ -185,4 +185,74 @@ ccl_device void kernel_filter_non_local_means(int x, int y, float ccl_readonly_p
filteredImage[p_idx] = sum_image / sum_weight;
}
+ccl_device void kernel_filter_non_local_means_3(int x, int y, float ccl_readonly_ptr noisyImage[3], float ccl_readonly_ptr weightImage[3], float ccl_readonly_ptr variance[3], float *filteredImage[3], int4 rect, int r, int f, float a, float k_2)
+{ /* Three-channel non-local means for one pixel (x, y): writes to filteredImage[k] a weighted average of noisyImage[k] over a search window of radius r; weights come from comparing patches of half-size f in weightImage, using variance, a and k_2. */
+ int2 low = make_int2(max(rect.x, x - r), /* Search window bounds, clamped to rect; low is inclusive. */
+ max(rect.y, y - r));
+ int2 high = make_int2(min(rect.z, x + r + 1), /* high is exclusive (the loops below compare with <). */
+ min(rect.w, y + r + 1));
+ /* Accumulators: one weighted sum per channel, and a single weight sum shared by all three channels. */
+ float sum_image[3] = {0.0f}, sum_weight = 0.0f;
+ /* All buffers are indexed relative to the rect origin, using w as the row stride. */
+ int w = align_up(rect.z - rect.x, 4); /* Rect width passed through align_up(..., 4) -- presumably rounded up to a multiple of 4. */
+ int p_idx = (y-rect.y)*w + (x - rect.x); /* Index of the pixel being filtered. */
+ int q_idx = (low.y-rect.y)*w + (low.x-rect.x); /* Index of the first search window pixel; advanced incrementally in the loops below. */
+#ifdef __KERNEL_SSE41__
+ __m128 a_sse = _mm_set1_ps(a), k_2_sse = _mm_set1_ps(k_2); /* a and k_2 broadcast to all four lanes. */
+#endif
+ /* Loop over the q's, the center pixels of all patches inside the search window. */
+ for(int qy = low.y; qy < high.y; qy++) {
+ for(int qx = low.x; qx < high.x; qx++, q_idx++) {
+ int2 low_dPatch = make_int2(max(max(rect.x - qx, rect.x - x), -f), max(max(rect.y - qy, rect.y - y), -f)); /* Lowest patch offset (inclusive): at least -f, and keeps both p+offset and q+offset inside rect. */
+ int2 high_dPatch = make_int2(min(min(rect.z - qx, rect.z - x), f+1), min(min(rect.w - qy, rect.w - y), f+1)); /* Highest patch offset (exclusive): at most f+1, with the same rect constraint. */
+ /* Loop over the pixels in the patch.
+ * Note that the patch must be small enough to be fully inside the rect, both at p and q.
+ * To avoid doing all the coordinate calculations twice, the code here computes both weights at once. */
+#ifdef __KERNEL_SSE41__
+ __m128 dI_sse = _mm_setzero_ps(); /* Four partial sums of the patch distance, accumulated over all three channels. */
+ __m128 highX_sse = _mm_set1_ps(high_dPatch.x); /* Exclusive upper x offset, used to mask lanes past the end of a patch row. */
+ for(int k = 0; k < 3; k++) {
+ int dIdx = low_dPatch.x + low_dPatch.y*w; /* Buffer offset of the first patch pixel relative to p_idx / q_idx. */
+ for(int dy = low_dPatch.y; dy < high_dPatch.y; dy++) {
+ int dx; /* Declared outside the loop because its final value is needed for the row advance below. */
+ for(dx = low_dPatch.x; dx < high_dPatch.x; dx+=4, dIdx+=4) { /* Four x offsets per iteration; the last iteration of a row may cover offsets past high_dPatch.x. */
+ __m128 diff = _mm_sub_ps(_mm_loadu_ps(weightImage[k] + p_idx + dIdx), _mm_loadu_ps(weightImage[k] + q_idx + dIdx));
+ __m128 pvar = _mm_loadu_ps(variance[k] + p_idx + dIdx);
+ __m128 qvar = _mm_loadu_ps(variance[k] + q_idx + dIdx); /* Next line: same formula as the scalar path, with _mm_rcp_ps (an approximate reciprocal) in place of the exact 1.0f / (...). */
+ __m128 d = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(diff, diff), _mm_mul_ps(a_sse, _mm_add_ps(pvar, _mm_min_ps(pvar, qvar)))), _mm_rcp_ps(_mm_add_ps(_mm_set1_ps(1e-7f), _mm_mul_ps(k_2_sse, _mm_add_ps(pvar, qvar)))));
+ dI_sse = _mm_add_ps(dI_sse, _mm_mask_ps(d, _mm_cmplt_ps(_mm_add_ps(_mm_set1_ps(dx), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)), highX_sse))); /* Lane i holds offset dx+i; _mm_mask_ps presumably zeroes lanes whose offset is not below high_dPatch.x -- confirm against its definition. */
+ }
+ dIdx += w-(dx - low_dPatch.x); /* Move to the start of the next patch row: one stride minus the columns actually advanced (a multiple of 4). */
+ }
+ }
+ float dI = _mm_hsum_ss(dI_sse); /* Presumably the horizontal sum of the four lanes -- helper is defined elsewhere. */
+#else
+ float dI = 0.0f; /* Scalar path: patch distance accumulated over all three channels. */
+ for(int k = 0; k < 3; k++) {
+ int dIdx = low_dPatch.x + low_dPatch.y*w; /* Buffer offset of the first patch pixel relative to p_idx / q_idx. */
+ for(int dy = low_dPatch.y; dy < high_dPatch.y; dy++) {
+ for(int dx = low_dPatch.x; dx < high_dPatch.x; dx++, dIdx++) {
+ float diff = weightImage[k][p_idx+dIdx] - weightImage[k][q_idx+dIdx];
+ dI += (diff*diff - a*(variance[k][p_idx+dIdx] + min(variance[k][p_idx+dIdx], variance[k][q_idx+dIdx]))) * (1.0f / (1e-7f + k_2*(variance[k][p_idx+dIdx] + variance[k][q_idx+dIdx]))); /* Squared difference minus a variance term scaled by a, divided by k_2 times the summed variances; 1e-7f keeps the denominator non-zero when both variances are 0. */
+ }
+ dIdx += w-(high_dPatch.x - low_dPatch.x); /* Move to the start of the next patch row. */
+ }
+ }
+#endif
+ dI *= 1.0f / (3.0f * (high_dPatch.x - low_dPatch.x) * (high_dPatch.y - low_dPatch.y)); /* Average over the 3 channels and the number of pixels in the (clamped) patch. */
+ /* Negative distances are clamped to 0 before negation, so the argument passed to fast_expf is never positive. */
+ float wI = fast_expf(-max(0.0f, dI));
+ sum_image[0] += wI*noisyImage[0][q_idx];
+ sum_image[1] += wI*noisyImage[1][q_idx];
+ sum_image[2] += wI*noisyImage[2][q_idx];
+ sum_weight += wI; /* The same weight is applied to all three channels. */
+ }
+ q_idx += w-(high.x-low.x); /* Skip from the end of this window row to the start of the next one. */
+ }
+ /* Normalize by the total weight and store the result at the filtered pixel's index. */
+ filteredImage[0][p_idx] = sum_image[0] / sum_weight;
+ filteredImage[1][p_idx] = sum_image[1] / sum_weight;
+ filteredImage[2][p_idx] = sum_image[2] / sum_weight;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 50396ed..3253828 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -88,6 +88,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_non_local_means)(int x, int y,
int r, int f,
float a, float k_2);
+void KERNEL_FUNCTION_FULL_NAME(filter_non_local_means_3)(int x, int y,
+ float *noisyImage[3],
+ float *weightImage[3],
+ float *variance[3],
+ float *filteredImage[3],
+ int* rect,
+ int r, int f,
+ float a, float k_2);
+
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index d0f2e61..b4cc703 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kern
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list