[Bf-blender-cvs] [85db3f2866] temp-cycles-denoising: Cycles Denoising: Implement OpenCL denoising kernels

Lukas Stockner noreply at git.blender.org
Fri Mar 24 20:18:45 CET 2017


Commit: 85db3f2866fbd88b63b416b0ec65081b3406cca7
Author: Lukas Stockner
Date:   Fri Mar 24 00:07:20 2017 +0100
Branches: temp-cycles-denoising
https://developer.blender.org/rB85db3f2866fbd88b63b416b0ec65081b3406cca7

Cycles Denoising: Implement OpenCL denoising kernels

Unfortunately, as always when making kernel code OpenCL compatible, that mainly means adding a huge amount of annoying pointer qualifiers...

===================================================================

M	intern/cycles/filter/CMakeLists.txt
M	intern/cycles/filter/filter_compat_cuda.h
M	intern/cycles/filter/filter_compat_opencl.h
M	intern/cycles/filter/filter_features.h
M	intern/cycles/filter/filter_kernel.h
M	intern/cycles/filter/filter_nlm_cpu.h
M	intern/cycles/filter/filter_nlm_gpu.h
M	intern/cycles/filter/filter_prefilter.h
M	intern/cycles/filter/filter_reconstruction.h
R088	intern/cycles/filter/filter_transform_cuda.h	intern/cycles/filter/filter_transform_gpu.h
M	intern/cycles/filter/kernels/cuda/filter.cu
M	intern/cycles/filter/kernels/opencl/filter.cl
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/filter/CMakeLists.txt b/intern/cycles/filter/CMakeLists.txt
index 1c12188768..79a8e12c12 100644
--- a/intern/cycles/filter/CMakeLists.txt
+++ b/intern/cycles/filter/CMakeLists.txt
@@ -33,7 +33,7 @@ set(SRC_HEADERS
 	filter_prefilter.h
 	filter_reconstruction.h
 	filter_transform.h
-	filter_transform_cuda.h
+	filter_transform_gpu.h
 	filter_transform_sse.h
 )
 
diff --git a/intern/cycles/filter/filter_compat_cuda.h b/intern/cycles/filter/filter_compat_cuda.h
index 55d7151c21..e056807996 100644
--- a/intern/cycles/filter/filter_compat_cuda.h
+++ b/intern/cycles/filter/filter_compat_cuda.h
@@ -43,6 +43,10 @@
 #define ccl_restrict __restrict__
 #define ccl_align(n) __align__(n)
 #define ccl_readonly_ptr const * __restrict__
+#define ccl_local __shared__
+#define ccl_local_param
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
 
 /* No assert supported for CUDA */
 
diff --git a/intern/cycles/filter/filter_compat_opencl.h b/intern/cycles/filter/filter_compat_opencl.h
index 146775bc68..ad06125694 100644
--- a/intern/cycles/filter/filter_compat_opencl.h
+++ b/intern/cycles/filter/filter_compat_opencl.h
@@ -42,6 +42,11 @@
 #define ccl_private __private
 #define ccl_restrict restrict
 #define ccl_align(n) __attribute__((aligned(n)))
+#define ccl_readonly_ptr const * __restrict__
+#define ccl_local __local
+#define ccl_local_param __local
+
+#define CCL_MAX_LOCAL_SIZE 256
 
 /* no assert in opencl */
 #define kernel_assert(cond)
diff --git a/intern/cycles/filter/filter_features.h b/intern/cycles/filter/filter_features.h
index 678f255d37..a0b0e1325e 100644
--- a/intern/cycles/filter/filter_features.h
+++ b/intern/cycles/filter/filter_features.h
@@ -28,7 +28,21 @@
                                  pixel_buffer += buffer_w - (high.x - low.x); \
                              }
 
-ccl_device_inline void filter_get_features(int2 pixel, float ccl_readonly_ptr buffer, float *features, float ccl_readonly_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_feature_mean(int2 pixel, ccl_global float ccl_readonly_ptr buffer, float *features, int pass_stride)
+{
+	features[0] = pixel.x;
+	features[1] = pixel.y;
+	features[2] = ccl_get_feature(0);
+	features[3] = ccl_get_feature(1);
+	features[4] = ccl_get_feature(2);
+	features[5] = ccl_get_feature(3);
+	features[6] = ccl_get_feature(4);
+	features[7] = ccl_get_feature(5);
+	features[8] = ccl_get_feature(6);
+	features[9] = ccl_get_feature(7);
+}
+
+ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *features, float ccl_readonly_ptr mean, int pass_stride)
 {
 	features[0] = pixel.x;
 	features[1] = pixel.y;
@@ -51,7 +65,7 @@ ccl_device_inline void filter_get_features(int2 pixel, float ccl_readonly_ptr bu
 #endif
 }
 
-ccl_device_inline void filter_get_feature_scales(int2 pixel, float ccl_readonly_ptr buffer, float *scales, float ccl_readonly_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *scales, float ccl_readonly_ptr mean, int pass_stride)
 {
 	scales[0] = fabsf(pixel.x - mean[0]);
 	scales[1] = fabsf(pixel.y - mean[1]);
@@ -75,12 +89,12 @@ ccl_device_inline void filter_calculate_scale(float *scale)
 	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
 }
 
-ccl_device_inline float3 filter_get_pixel_color(float ccl_readonly_ptr buffer, int pass_stride)
+ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
 {
 	return make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2));
 }
 
-ccl_device_inline float filter_get_pixel_variance(float ccl_readonly_ptr buffer, int pass_stride)
+ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_readonly_ptr buffer, int pass_stride)
 {
 	return average(make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2)));
 }
@@ -93,12 +107,20 @@ ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_
 }
 
 /* Fill the design row without computing the weight. */
-ccl_device_inline void filter_get_design_row_transform(int2 pixel, float ccl_readonly_ptr buffer, float ccl_readonly_ptr feature_means, int pass_stride, float *features, int rank, float *design_row, float ccl_readonly_ptr feature_transform, int transform_stride)
+ccl_device_inline void filter_get_design_row_transform(int2 pixel,
+                                                       ccl_global float ccl_readonly_ptr buffer,
+                                                       float ccl_readonly_ptr feature_means,
+                                                       int pass_stride,
+                                                       ccl_local_param float *features,
+                                                       int rank,
+                                                       float *design_row,
+                                                       ccl_global float ccl_readonly_ptr feature_transform,
+                                                       int transform_stride)
 {
 	filter_get_features(pixel, buffer, features, feature_means, pass_stride);
 	design_row[0] = 1.0f;
 	for(int d = 0; d < rank; d++) {
-#ifdef __KERNEL_CUDA__
+#ifdef __KERNEL_GPU__
 		float x = math_vector_dot_strided(features, feature_transform + d*DENOISE_FEATURES*transform_stride, transform_stride, DENOISE_FEATURES);
 #else
 		float x = math_vector_dot(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES);
diff --git a/intern/cycles/filter/filter_kernel.h b/intern/cycles/filter/filter_kernel.h
index 4a68fbcbc5..0db2673f63 100644
--- a/intern/cycles/filter/filter_kernel.h
+++ b/intern/cycles/filter/filter_kernel.h
@@ -31,8 +31,8 @@
 
 #include "filter_prefilter.h"
 
-#ifdef __KERNEL_CUDA__
-#  include "filter_transform_cuda.h"
+#ifdef __KERNEL_GPU__
+#  include "filter_transform_gpu.h"
 #else
 #  ifdef __KERNEL_SSE3__
 #    include "filter_transform_sse.h"
@@ -51,9 +51,9 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device void kernel_filter_divide_combined(int x, int y, int sample, float *buffers, int offset, int stride, int pass_stride, int no_denoising_offset)
+ccl_device void kernel_filter_divide_combined(int x, int y, int sample, ccl_global float *buffers, int offset, int stride, int pass_stride, int no_denoising_offset)
 {
-	float *combined_buffer = buffers + (offset + y*stride + x);
+	ccl_global float *combined_buffer = buffers + (offset + y*stride + x);
 	float fac = sample / combined_buffer[3];
 	combined_buffer[0] *= fac;
 	combined_buffer[1] *= fac;
diff --git a/intern/cycles/filter/filter_nlm_cpu.h b/intern/cycles/filter/filter_nlm_cpu.h
index 3b03865a7f..c498057f82 100644
--- a/intern/cycles/filter/filter_nlm_cpu.h
+++ b/intern/cycles/filter/filter_nlm_cpu.h
@@ -145,7 +145,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
 			                                buffer,
 			                                color_pass, variance_pass,
 			                                l_transform, l_rank,
-			                                weight, l_XtWX, l_XtWY);
+			                                weight, l_XtWX, l_XtWY, 0);
 		}
 	}
 }
diff --git a/intern/cycles/filter/filter_nlm_gpu.h b/intern/cycles/filter/filter_nlm_gpu.h
index 195fa15ed9..10330e313a 100644
--- a/intern/cycles/filter/filter_nlm_gpu.h
+++ b/intern/cycles/filter/filter_nlm_gpu.h
@@ -16,7 +16,14 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, float ccl_readonly_ptr weightImage, float ccl_readonly_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+                                                         int dx, int dy,
+                                                         ccl_global float ccl_readonly_ptr weightImage,
+                                                         ccl_global float ccl_readonly_ptr varianceImage,
+                                                         ccl_global float *differenceImage,
+                                                         int4 rect, int w,
+                                                         int channel_offset,
+                                                         float a, float k_2)
 {
 	float diff = 0.0f;
 	int numChannels = channel_offset? 3 : 1;
@@ -32,7 +39,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, i
 	differenceImage[y*w+x] = diff;
 }
 
-ccl_device_inline void kernel_filter_nlm_blur(int x, int y, float ccl_readonly_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+                                              ccl_global float ccl_readonly_ptr differenceImage,
+                                              ccl_global float *outImage,
+                                              int4 rect, int w, int f)
 {
 	float sum = 0.0f;
 	const int low = max(rect.y, y-f);
@@ -44,7 +54,10 @@ ccl_device_inline void kernel_filter_nlm_blur(int x, int y, float ccl_readonly_p
 	outImage[y*w+x] = sum;
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, float ccl_readonly_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+                                                     ccl_global float ccl_readonly_ptr differenceImage,
+                                                     ccl_global float *outImage,
+                                                     int4 rect, int w, int f)
 {
 	float sum = 0.0f;
 	const int low = max(rect.x, x-f);
@@ -58,11 +71,11 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, float ccl_rea
 
 ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
                                                        int dx, int dy,
-                                                       float ccl_readonly_ptr differenceImage,
-                                           

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list