[Bf-blender-cvs] [1db96fa] soc-2016-cycles_denoising: Cycles: Redesign CUDA kernels to increase denoising performance

Sun Aug 21 06:18:11 CEST 2016

Commit: 1db96fa89c16f8d823f084659ddc99f726544a8f
Author: Lukas Stockner
Date:   Wed Aug 17 11:34:52 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB1db96fa89c16f8d823f084659ddc99f726544a8f

Cycles: Redesign CUDA kernels to increase denoising performance

This commit contains essentially a complete overhaul of the CUDA denoising kernels.

One of the main changes is splitting up the huge estimate_params kernel into multiple smaller ones:
- One Kernel calculates the reduced feature space transform.
- One Kernel estimates the feature bandwidths.
- One Kernel estimates bias and variance for a given global bandwidth. This kernel is executed multiple times for different global bandwidths.
- One Kernel calculates the optimal global bandwidth.

This improves UI responsiveness since the individual kernel launches are shorter.
Also, smaller kernels are always a good thing on GPUs - from register allocation to warp divergence.

The next major improvement concerns the transform - before this commit, transform loads from global memory were the main bottleneck.
First of all, it's now stored in a SoA layout instead of AoS, which makes all transform loads coalesced.
Furthermore, the transform pointer is declared as "float const* __restricted__" instead of float*, which allows NVCC to cache the transform reads. Since only the first kernel writes the transforms, this increases speed again.

The third mayor change is that the feature vector, which is used in every per-pixel loop, now is stored in shared memory.
Since the feature vector is involved in a lot of operations, this improves performance again.
On the other hand, shared memory is rather limited on Kepler and older, so even the 11 floats per thread are already a lot.
With the default "16KB shared - 48KB L1 Cache" split on a GTX780, occupancy is only 12.5% - way too low.
With "48KB shared - 16KB L1 Cache", occupancy is back up at 50%, but of course there are more cache misses - in the end, though, the benefits of having the feature vector local make up for that.

I expect the performance boost to be even higher on Maxwell and Pascal, since these have much larger shared memory and L1.

===================================================================

M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/kernel_filter.h
M	intern/cycles/kernel/kernel_filter_util.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 6ddf001..60fe45b 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -823,22 +823,31 @@ public:
 
 		cuda_push_context();
 
-		CUfunction cuFilterDivideShadow, cuFilterGetFeature, cuFilterNonLocalMeans, cuFilterCombineHalves, cuFilterEstimateParams, cuFilterFinalPass;
+		CUfunction cuFilterDivideShadow, cuFilterGetFeature, cuFilterNonLocalMeans, cuFilterCombineHalves;
+		CUfunction cuFilterConstructTransform, cuFilterEstimateBandwidths, cuFilterEstimateBiasVariance, cuFilterCalculateBandwidth, cuFilterFinalPass;
 		CUdeviceptr d_buffers = cuda_device_ptr(rtile.buffer);
 
 		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuModule, "kernel_cuda_filter_divide_shadow"));
 		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuModule, "kernel_cuda_filter_get_feature"));
 		cuda_assert(cuModuleGetFunction(&cuFilterNonLocalMeans, cuModule, "kernel_cuda_filter_non_local_means"));
 		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuModule, "kernel_cuda_filter_combine_halves"));
-		cuda_assert(cuModuleGetFunction(&cuFilterEstimateParams, cuModule, "kernel_cuda_filter_estimate_params"));
+
+		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuModule, "kernel_cuda_filter_construct_transform"));
+		cuda_assert(cuModuleGetFunction(&cuFilterEstimateBandwidths, cuModule, "kernel_cuda_filter_estimate_bandwidths"));
+		cuda_assert(cuModuleGetFunction(&cuFilterEstimateBiasVariance, cuModule, "kernel_cuda_filter_estimate_bias_variance"));
+		cuda_assert(cuModuleGetFunction(&cuFilterCalculateBandwidth, cuModule, "kernel_cuda_filter_calculate_bandwidth"));
 		cuda_assert(cuModuleGetFunction(&cuFilterFinalPass, cuModule, "kernel_cuda_filter_final_pass"));
 
 		cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
 		cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
 		cuda_assert(cuFuncSetCacheConfig(cuFilterNonLocalMeans, CU_FUNC_CACHE_PREFER_L1));
 		cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuFilterEstimateParams, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuFilterFinalPass, CU_FUNC_CACHE_PREFER_L1));
+
+		cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterEstimateBandwidths, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterEstimateBiasVariance, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterCalculateBandwidth, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterFinalPass, CU_FUNC_CACHE_PREFER_SHARED));
 
 		if(have_error())
 			return;
@@ -848,7 +857,7 @@ public:
 		int4 rect = make_int4(rtile.x, rtile.y, rtile.x + rtile.w, rtile.y + rtile.h);
 
 		int threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterEstimateParams));
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterFinalPass));
 
 		int xthreads = (int)sqrt((float)threads_per_block);
 		int ythreads = (int)sqrt((float)threads_per_block);
@@ -1009,36 +1018,75 @@ public:
 		}
 #undef CUDA_PTR_ADD
 
+#ifdef WITH_CYCLES_DEBUG_FILTER
+#define WRITE_DEBUG(name, pass) debug_write_pfm(string_printf("debug_%dx%d_cuda_feature%d_%s.pfm", rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan, i, name).c_str(), host_denoise_buffer+pass*pass_stride, rtile.w, rtile.h, 1, w)
+		float *host_denoise_buffer = new float[22*pass_stride];
+		cuda_assert(cuMemcpyDtoH(host_denoise_buffer, d_denoise_buffer, 22*pass_stride*sizeof(float)));
+		for(int i = 0; i < 11; i++) {
+			WRITE_DEBUG("filtered", 2*i);
+			WRITE_DEBUG("variance", 2*i+1);
+		}
+		delete[] host_denoise_buffer;
+#undef WRITE_DEBUG
+#endif
+
 		/* Use the prefiltered feature to denoise the image. */
 		int4 filter_area = make_int4(rtile.x + overscan, rtile.y + overscan, rtile.w - 2*overscan, rtile.h - 2*overscan);
-		CUdeviceptr d_storage;
+		CUdeviceptr d_storage, d_transforms;
 		cuda_assert(cuMemAlloc(&d_storage, filter_area.z*filter_area.w*sizeof(FilterStorage)));
-
-		void *estimate_args[] = {&sample,
-		                         &d_denoise_buffer,
-		                         &d_storage,
-		                         &filter_area,
-		                         &rect};
+		cuda_assert(cuMemAlloc(&d_transforms, filter_area.z*filter_area.w*sizeof(float)*DENOISE_FEATURES*DENOISE_FEATURES));
 
 		xthreads = (int)sqrt((float)threads_per_block);
 		ythreads = (int)sqrt((float)threads_per_block);
 		xblocks = (filter_area.z + xthreads - 1)/xthreads;
 		yblocks = (filter_area.w + ythreads - 1)/ythreads;
 
-		cuda_assert(cuLaunchKernel(cuFilterEstimateParams,
+		void *transform_args[] = {&sample,
+		                          &d_denoise_buffer,
+		                          &d_transforms,
+		                          &d_storage,
+		                          &filter_area,
+		                          &rect};
+		cuda_assert(cuLaunchKernel(cuFilterConstructTransform,
 		                           xblocks , yblocks, 1, /* blocks */
 		                           xthreads, ythreads, 1, /* threads */
-		                           0, 0, estimate_args, 0));
+		                           0, 0, transform_args, 0));
+		cuda_assert(cuLaunchKernel(cuFilterEstimateBandwidths,
+		                           xblocks , yblocks, 1, /* blocks */
+		                           xthreads, ythreads, 1, /* threads */
+		                           0, 0, transform_args, 0));
+
+		for(int g = 0; g < 5; g++) {
+			void *bias_variance_args[] = {&sample,
+			                              &d_denoise_buffer,
+			                              &d_transforms,
+			                              &d_storage,
+			                              &filter_area,
+			                              &rect,
+			                              &g};
+			cuda_assert(cuLaunchKernel(cuFilterEstimateBiasVariance,
+			                           xblocks , yblocks, 1, /* blocks */
+			                           xthreads, ythreads, 1, /* threads */
+			                           0, 0, bias_variance_args, 0));
+		}
+
+		void *bandwidth_args[] = {&sample,
+		                          &d_storage,
+		                          &filter_area};
+		cuda_assert(cuLaunchKernel(cuFilterCalculateBandwidth,
+		                           xblocks , yblocks, 1, /* blocks */
+		                           xthreads, ythreads, 1, /* threads */
+		                           0, 0, bandwidth_args, 0));
 
 		void *final_args[] = {&sample,
 		                      &d_denoise_buffer,
 		                      &rtile.offset,
 		                      &rtile.stride,
+		                      &d_transforms,
 		                      &d_storage,
 		                      &d_buffers,
 		                      &filter_area,
 		                      &rect};
-
 		cuda_assert(cuLaunchKernel(cuFilterFinalPass,
 		                           xblocks , yblocks, 1, /* blocks */
 		                           xthreads, ythreads, 1, /* threads */
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index f4db090..1ceb3e6 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -23,7 +23,418 @@ CCL_NAMESPACE_BEGIN
 #define NORM_FEATURE_OFFSET 2
 #define NORM_FEATURE_NUM 8
 
-#ifdef __KERNEL_SSE3__
+#ifdef __KERNEL_CUDA__
+ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample, float const* __restrict__ buffer, int x, int y, float *transform, FilterStorage *storage, int4 rect, int transform_stride, int localIdx)
+{
+	__shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+	float *features = shared_features + localIdx*DENOISE_FEATURES;
+
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	int buffer_h = (rect.w - rect.y);
+	int pass_stride = buffer_h * buffer_w * kernel_data.film.num_frames;
+	int num_frames = kernel_data.film.num_frames;
+	int prev_frames = kernel_data.film.prev_frames;
+	/* === Calculate denoising window. === */
+	int2 low  = make_int2(max(rect.x, x - kernel_data.integrator.half_window),
+	                      max(rect.y, y - kernel_data.integrator.half_window));
+	int2 high = make_int2(min(rect.z, x + kernel_data.integrator.half_window + 1),
+	                      min(rect.w, y + kernel_data.integrator.half_window + 1));
+	float const* __restrict__ pixel_buffer;
+
+
+
+
+	/* === Shift feature passes to have mean 0. === */
+	float feature_means[DENOISE_FEATURES] = {0.0f};
+	FOR_PIXEL_WINDOW {
+		filter_get_features(px, py, pt, pixel_buffer, features, NULL, pass_stride);
+		for(int i = 0; i < DENOISE_FEATURES; i++)
+			feature_means[i] += features[i];
+	} END_FOR_PIXEL_WINDOW
+
+	float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
+	for(int i = 0; i < DENOISE_FEATURES; i++)
+		feature_means[i] *= pixel_scale;
+
+	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+	float feature_scale[DENOISE_FEATURES];
+	math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+	FOR_PIXEL_WINDOW {
+		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
+		for(int i = 0; i < DENOISE_FEATURES; i++)
+			feature_scale[i] = max(feature_scale[i], fabsf(features[i]));
+	} END_FOR_PIXEL_WINDOW
+
+	for(int i = 0; i < DENOISE_FEATURES; i++)
+		feature_scale[i] = 1.0f / max(feature_scale[i], 0.01f);
+
+
+
+	/* === Generate the feature transformation. ===
+	 * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES], feature_matrix_norm = 0.0f;
+	math_matrix_zero_lower(feature_matrix, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
+		for(int i = 0; i < DENOISE_FEATUR

@@ Diff output truncated at 10240 characters. @@