[Bf-blender-cvs] [85db3f2866] temp-cycles-denoising: Cycles Denoising: Implement OpenCL denoising kernels
Lukas Stockner
noreply at git.blender.org
Fri Mar 24 20:18:45 CET 2017
Commit: 85db3f2866fbd88b63b416b0ec65081b3406cca7
Author: Lukas Stockner
Date: Fri Mar 24 00:07:20 2017 +0100
Branches: temp-cycles-denoising
https://developer.blender.org/rB85db3f2866fbd88b63b416b0ec65081b3406cca7
Cycles Denoising: Implement OpenCL denoising kernels
Unfortunately, as always when making kernel code OpenCL compatible, that mainly means adding a huge amount of annoying pointer qualifiers...
===================================================================
M intern/cycles/filter/CMakeLists.txt
M intern/cycles/filter/filter_compat_cuda.h
M intern/cycles/filter/filter_compat_opencl.h
M intern/cycles/filter/filter_features.h
M intern/cycles/filter/filter_kernel.h
M intern/cycles/filter/filter_nlm_cpu.h
M intern/cycles/filter/filter_nlm_gpu.h
M intern/cycles/filter/filter_prefilter.h
M intern/cycles/filter/filter_reconstruction.h
R088 intern/cycles/filter/filter_transform_cuda.h intern/cycles/filter/filter_transform_gpu.h
M intern/cycles/filter/kernels/cuda/filter.cu
M intern/cycles/filter/kernels/opencl/filter.cl
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/filter/CMakeLists.txt b/intern/cycles/filter/CMakeLists.txt
index 1c12188768..79a8e12c12 100644
--- a/intern/cycles/filter/CMakeLists.txt
+++ b/intern/cycles/filter/CMakeLists.txt
@@ -33,7 +33,7 @@ set(SRC_HEADERS
filter_prefilter.h
filter_reconstruction.h
filter_transform.h
- filter_transform_cuda.h
+ filter_transform_gpu.h
filter_transform_sse.h
)
diff --git a/intern/cycles/filter/filter_compat_cuda.h b/intern/cycles/filter/filter_compat_cuda.h
index 55d7151c21..e056807996 100644
--- a/intern/cycles/filter/filter_compat_cuda.h
+++ b/intern/cycles/filter/filter_compat_cuda.h
@@ -43,6 +43,10 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
#define ccl_readonly_ptr const * __restrict__
+#define ccl_local __shared__
+#define ccl_local_param
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
/* No assert supported for CUDA */
diff --git a/intern/cycles/filter/filter_compat_opencl.h b/intern/cycles/filter/filter_compat_opencl.h
index 146775bc68..ad06125694 100644
--- a/intern/cycles/filter/filter_compat_opencl.h
+++ b/intern/cycles/filter/filter_compat_opencl.h
@@ -42,6 +42,11 @@
#define ccl_private __private
#define ccl_restrict restrict
#define ccl_align(n) __attribute__((aligned(n)))
+#define ccl_readonly_ptr const * __restrict__
+#define ccl_local __local
+#define ccl_local_param __local
+
+#define CCL_MAX_LOCAL_SIZE 256
/* no assert in opencl */
#define kernel_assert(cond)
diff --git a/intern/cycles/filter/filter_features.h b/intern/cycles/filter/filter_features.h
index 678f255d37..a0b0e1325e 100644
--- a/intern/cycles/filter/filter_features.h
+++ b/intern/cycles/filter/filter_features.h
@@ -28,7 +28,21 @@
pixel_buffer += buffer_w - (high.x - low.x); \
}
-ccl_device_inline void filter_get_features(int2 pixel, float ccl_readonly_ptr buffer, float *features, float ccl_readonly_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_feature_mean(int2 pixel, ccl_global float ccl_readonly_ptr buffer, float *features, int pass_stride)
+{
+ features[0] = pixel.x;
+ features[1] = pixel.y;
+ features[2] = ccl_get_feature(0);
+ features[3] = ccl_get_feature(1);
+ features[4] = ccl_get_feature(2);
+ features[5] = ccl_get_feature(3);
+ features[6] = ccl_get_feature(4);
+ features[7] = ccl_get_feature(5);
+ features[8] = ccl_get_feature(6);
+ features[9] = ccl_get_feature(7);
+}
+
+ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *features, float ccl_readonly_ptr mean, int pass_stride)
{
features[0] = pixel.x;
features[1] = pixel.y;
@@ -51,7 +65,7 @@ ccl_device_inline void filter_get_features(int2 pixel, float ccl_readonly_ptr bu
#endif
}
-ccl_device_inline void filter_get_feature_scales(int2 pixel, float ccl_readonly_ptr buffer, float *scales, float ccl_readonly_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *scales, float ccl_readonly_ptr mean, int pass_stride)
{
scales[0] = fabsf(pixel.x - mean[0]);
scales[1] = fabsf(pixel.y - mean[1]);
@@ -75,12 +89,12 @@ ccl_device_inline void filter_calculate_scale(float *scale)
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
}
-ccl_device_inline float3 filter_get_pixel_color(float ccl_readonly_ptr buffer, int pass_stride)
+ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
{
return make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2));
}
-ccl_device_inline float filter_get_pixel_variance(float ccl_readonly_ptr buffer, int pass_stride)
+ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
{
return average(make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2)));
}
@@ -93,12 +107,20 @@ ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_
}
/* Fill the design row without computing the weight. */
-ccl_device_inline void filter_get_design_row_transform(int2 pixel, float ccl_readonly_ptr buffer, float ccl_readonly_ptr feature_means, int pass_stride, float *features, int rank, float *design_row, float ccl_readonly_ptr feature_transform, int transform_stride)
+ccl_device_inline void filter_get_design_row_transform(int2 pixel,
+ ccl_global float ccl_readonly_ptr buffer,
+ float ccl_readonly_ptr feature_means,
+ int pass_stride,
+ ccl_local_param float *features,
+ int rank,
+ float *design_row,
+ ccl_global float ccl_readonly_ptr feature_transform,
+ int transform_stride)
{
filter_get_features(pixel, buffer, features, feature_means, pass_stride);
design_row[0] = 1.0f;
for(int d = 0; d < rank; d++) {
-#ifdef __KERNEL_CUDA__
+#ifdef __KERNEL_GPU__
float x = math_vector_dot_strided(features, feature_transform + d*DENOISE_FEATURES*transform_stride, transform_stride, DENOISE_FEATURES);
#else
float x = math_vector_dot(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
diff --git a/intern/cycles/filter/filter_kernel.h b/intern/cycles/filter/filter_kernel.h
index 4a68fbcbc5..0db2673f63 100644
--- a/intern/cycles/filter/filter_kernel.h
+++ b/intern/cycles/filter/filter_kernel.h
@@ -31,8 +31,8 @@
#include "filter_prefilter.h"
-#ifdef __KERNEL_CUDA__
-# include "filter_transform_cuda.h"
+#ifdef __KERNEL_GPU__
+# include "filter_transform_gpu.h"
#else
# ifdef __KERNEL_SSE3__
# include "filter_transform_sse.h"
@@ -51,9 +51,9 @@
CCL_NAMESPACE_BEGIN
-ccl_device void kernel_filter_divide_combined(int x, int y, int sample, float *buffers, int offset, int stride, int pass_stride, int no_denoising_offset)
+ccl_device void kernel_filter_divide_combined(int x, int y, int sample, ccl_global float *buffers, int offset, int stride, int pass_stride, int no_denoising_offset)
{
- float *combined_buffer = buffers + (offset + y*stride + x);
+ ccl_global float *combined_buffer = buffers + (offset + y*stride + x);
float fac = sample / combined_buffer[3];
combined_buffer[0] *= fac;
combined_buffer[1] *= fac;
diff --git a/intern/cycles/filter/filter_nlm_cpu.h b/intern/cycles/filter/filter_nlm_cpu.h
index 3b03865a7f..c498057f82 100644
--- a/intern/cycles/filter/filter_nlm_cpu.h
+++ b/intern/cycles/filter/filter_nlm_cpu.h
@@ -145,7 +145,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
buffer,
color_pass, variance_pass,
l_transform, l_rank,
- weight, l_XtWX, l_XtWY);
+ weight, l_XtWX, l_XtWY, 0);
}
}
}
diff --git a/intern/cycles/filter/filter_nlm_gpu.h b/intern/cycles/filter/filter_nlm_gpu.h
index 195fa15ed9..10330e313a 100644
--- a/intern/cycles/filter/filter_nlm_gpu.h
+++ b/intern/cycles/filter/filter_nlm_gpu.h
@@ -16,7 +16,14 @@
CCL_NAMESPACE_BEGIN
-ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, float ccl_readonly_ptr weightImage, float ccl_readonly_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+ int dx, int dy,
+ ccl_global float ccl_readonly_ptr weightImage,
+ ccl_global float ccl_readonly_ptr varianceImage,
+ ccl_global float *differenceImage,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
{
float diff = 0.0f;
int numChannels = channel_offset? 3 : 1;
@@ -32,7 +39,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, i
differenceImage[y*w+x] = diff;
}
-ccl_device_inline void kernel_filter_nlm_blur(int x, int y, float ccl_readonly_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+ ccl_global float ccl_readonly_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.y, y-f);
@@ -44,7 +54,10 @@ ccl_device_inline void kernel_filter_nlm_blur(int x, int y, float ccl_readonly_p
outImage[y*w+x] = sum;
}
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, float ccl_readonly_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+ ccl_global float ccl_readonly_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
@@ -58,11 +71,11 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, float ccl_rea
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
int dx, int dy,
- float ccl_readonly_ptr differenceImage,
-
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list