[Bf-blender-cvs] [374efaf9975] temp-cycles-denoising: Cycles Denoising: Disable local (aka shared) memory on OpenCL

Lukas Stockner noreply at git.blender.org
Wed Apr 19 20:46:58 CEST 2017


Commit: 374efaf9975bd2af726c76b3f92934060a50cb1b
Author: Lukas Stockner
Date:   Wed Apr 19 20:30:13 2017 +0200
Branches: temp-cycles-denoising
https://developer.blender.org/rB374efaf9975bd2af726c76b3f92934060a50cb1b

Cycles Denoising: Disable local (aka shared) memory on OpenCL

On GPU architectures, storing the design row in local memory improves performance due to lower global memory bandwidth requirements.
However, if the GPU doesn't have enough local memory available, occupancy suffers which makes it even slower than the global memory version.

On CUDA, the amount of available local memory (shared memory in CUDA terminology) can be controlled, but that's not possible on OpenCL. So, to avoid a huge performance hit when the local memory isn't enough, it's disabled on OpenCL.

===================================================================

M	intern/cycles/kernel/filter/filter_features.h
M	intern/cycles/kernel/filter/filter_reconstruction.h
M	intern/cycles/kernel/filter/filter_transform_gpu.h
M	intern/cycles/kernel/kernel_compat_opencl.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index f4f6e1f7639..a76b9005751 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -28,7 +28,7 @@
                                  pixel_buffer += buffer_w - (high.x - low.x); \
                              }
 
-ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *features, float ccl_readonly_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_readonly_ptr buffer, float *features, float ccl_readonly_ptr mean, int pass_stride)
 {
 	features[0] = pixel.x;
 	features[1] = pixel.y;
@@ -46,7 +46,7 @@ ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_read
 	}
 }
 
-ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *scales, float ccl_readonly_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_readonly_ptr buffer, float *scales, float ccl_readonly_ptr mean, int pass_stride)
 {
 	scales[0] = fabsf(pixel.x - mean[0]);
 	scales[1] = fabsf(pixel.y - mean[1]);
@@ -87,7 +87,7 @@ ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_
 	return (color_diff > 3.0f*variance);
 }
 
-ccl_device_inline void design_row_add(float ccl_local_param *design_row,
+ccl_device_inline void design_row_add(float *design_row,
                                       int rank,
                                       ccl_global float ccl_readonly_ptr transform,
                                       int stride,
@@ -106,12 +106,12 @@ ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
                                                        ccl_global float ccl_readonly_ptr q_buffer,
                                                        int pass_stride,
                                                        int rank,
-                                                       float ccl_local_param *design_row,
+                                                       float *design_row,
                                                        ccl_global float ccl_readonly_ptr transform,
                                                        int stride)
 {
 	design_row[0] = 1.0f;
-	math_local_vector_zero(design_row+1, rank);
+	math_vector_zero(design_row+1, rank);
 	design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
 	design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
 	design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 70dfedce453..23e667a0dcf 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -39,10 +39,13 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
 	(void)storage_stride;
 	(void)localIdx;
 	float design_row[DENOISE_FEATURES+1];
-#else
+#elif defined(__KERNEL_CUDA__)
 	const int stride = storage_stride;
 	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
 	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+	const int stride = storage_stride;
+	float design_row[DENOISE_FEATURES+1];
 #endif
 
 	float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index 3595f749fc4..6f27c9bdbec 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -26,8 +26,12 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_readonly_
 {
 	int buffer_w = align_up(rect.z - rect.x, 4);
 
+#ifdef __KERNEL_CUDA__
 	ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
 	ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+	float features[DENOISE_FEATURES];
+#endif
 
 	/* === Calculate denoising window. === */
 	int2 low  = make_int2(max(rect.x, x - radius),
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 244b632e227..f1e6c7f5ad1 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -51,7 +51,6 @@
 #endif
 
 #define ccl_readonly_ptr const * __restrict__
-#define CCL_MAX_LOCAL_SIZE 256
 
 #define ccl_local_id(d) get_local_id(d)
 #define ccl_global_id(d) get_global_id(d)
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 4aaea55f58e..c0ab4e65c9f 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -38,12 +38,6 @@ ccl_device_inline void math_vector_zero(float *v, int n)
 		v[i] = 0.0f;
 }
 
-ccl_device_inline void math_local_vector_zero(float ccl_local_param *v, int n)
-{
-	for(int i = 0; i < n; i++)
-		v[i] = 0;
-}
-
 ccl_device_inline void math_trimatrix_zero(float *A, int n)
 {
 	for(int row = 0; row < n; row++)
@@ -53,13 +47,13 @@ ccl_device_inline void math_trimatrix_zero(float *A, int n)
 
 /* Elementary vector operations. */
 
-ccl_device_inline void math_vector_add(float *a, ccl_local_param float ccl_readonly_ptr b, int n)
+ccl_device_inline void math_vector_add(float *a, float ccl_readonly_ptr b, int n)
 {
 	for(int i = 0; i < n; i++)
 		a[i] += b[i];
 }
 
-ccl_device_inline void math_vector_mul(ccl_local_param float *a, float ccl_readonly_ptr b, int n)
+ccl_device_inline void math_vector_mul(float *a, float ccl_readonly_ptr b, int n)
 {
 	for(int i = 0; i < n; i++)
 		a[i] *= b[i];
@@ -77,7 +71,7 @@ ccl_device_inline void math_vector_scale(float *a, float b, int n)
 		a[i] *= b;
 }
 
-ccl_device_inline void math_vector_max(float *a, ccl_local_param float ccl_readonly_ptr b, int n)
+ccl_device_inline void math_vector_max(float *a, float ccl_readonly_ptr b, int n)
 {
 	for(int i = 0; i < n; i++)
 		a[i] = max(a[i], b[i]);
@@ -89,7 +83,7 @@ ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
 		v[i] += w*x[i];
 }
 
-ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float ccl_local_param *x, float3 w, int stride)
+ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
 {
 	for(int i = 0; i < n; i++)
 		v[i*stride] += w*x[i];
@@ -109,7 +103,7 @@ ccl_device_inline void math_matrix_add_diagonal(ccl_global float *A, int n, floa
  * Obviously, the resulting matrix is symmetric, so only the lower triangluar part is stored. */
 ccl_device_inline void math_trimatrix_add_gramian(float *A,
                                                   int n,
-                                                  ccl_local_param float ccl_readonly_ptr v,
+                                                  float ccl_readonly_ptr v,
                                                   float weight)
 {
 	for(int row = 0; row < n; row++)
@@ -122,7 +116,7 @@ ccl_device_inline void math_trimatrix_add_gramian(float *A,
  * Obviously, the resulting matrix is symmetric, so only the lower triangluar part is stored. */
 ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
                                                           int n,
-                                                          ccl_local_param float ccl_readonly_ptr v,
+                                                          float ccl_readonly_ptr v,
                                                           float weight,
                                                           int stride)
 {




More information about the Bf-blender-cvs mailing list