[Bf-blender-cvs] [fa3d50af95f] master: Cycles: Improve denoising speed on GPUs with small tile sizes

Thu Nov 30 07:38:13 CET 2017

Commit: fa3d50af95fde76ef08590d2f86444f2f9fdca95
Author: Lukas Stockner
Date:   Fri Nov 10 04:34:14 2017 +0100
Branches: master
https://developer.blender.org/rBfa3d50af95fde76ef08590d2f86444f2f9fdca95

Cycles: Improve denoising speed on GPUs with small tile sizes

Previously, the NLM kernels would be launched once per offset with one thread per pixel.
However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs which results in a significant slowdown.

Therefore, the kernels are now launched in a single call that handles all offsets at once.
This has two downsides: Memory accesses to accumulating buffers are now atomic, and more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory.
On the other hand, of course, the smaller tiles significantly reduce the size of the memory.

The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there, one thread per pixel is the maximum.
I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere.

To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented, it should be easier to understand what's going on now.
Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_denoising.cpp
M	intern/cycles/device/device_denoising.h
M	intern/cycles/device/opencl/opencl.h
M	intern/cycles/device/opencl/opencl_base.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/filter/filter_nlm_cpu.h
M	intern/cycles/kernel/filter/filter_nlm_gpu.h
M	intern/cycles/kernel/filter/filter_reconstruction.h
M	intern/cycles/kernel/kernels/cpu/filter_cpu.h
M	intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
M	intern/cycles/kernel/kernels/cuda/filter.cu
M	intern/cycles/kernel/kernels/opencl/filter.cl
M	intern/cycles/util/CMakeLists.txt
M	intern/cycles/util/util_math.h
M	intern/cycles/util/util_math_matrix.h
A	intern/cycles/util/util_rect.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 999b9230d29..2d28ccd2b49 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -190,9 +190,9 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
 
-	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                              filter_construct_transform_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
-	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                       filter_finalize_kernel;
+	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
 
 	KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
 	                       int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -565,13 +565,13 @@ public:
 			                                    (float*) color_variance_ptr,
 			                                    difference,
 			                                    local_rect,
-			                                    task->buffer.w,
+			                                    task->buffer.stride,
 			                                    task->buffer.pass_stride,
 			                                    1.0f,
 			                                    task->nlm_k_2);
-			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
-			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
-			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
 			filter_nlm_construct_gramian_kernel()(dx, dy,
 			                                      blurDifference,
 			                                      (float*)  task->buffer.mem.device_pointer,
@@ -580,9 +580,8 @@ public:
 			                                      (float*)  task->storage.XtWX.device_pointer,
 			                                      (float3*) task->storage.XtWY.device_pointer,
 			                                      local_rect,
-			                                      &task->reconstruction_state.filter_rect.x,
-			                                      task->buffer.w,
-			                                      task->buffer.h,
+			                                      &task->reconstruction_state.filter_window.x,
+			                                      task->buffer.stride,
 			                                      4,
 			                                      task->buffer.pass_stride);
 		}
@@ -591,8 +590,6 @@ public:
 				filter_finalize_kernel()(x,
 				                         y,
 				                         y*task->filter_area.z + x,
-				                         task->buffer.w,
-				                         task->buffer.h,
 				                         (float*)  output_ptr,
 				                         (int*)    task->storage.rank.device_pointer,
 				                         (float*)  task->storage.XtWX.device_pointer,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d8d787ba706..a663da748df 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1087,6 +1087,19 @@ public:
 			                           threads, threads, 1, \
 			                           0, 0, args, 0));
 
+/* Similar as above, but for 1-dimensional blocks. */
+#define CUDA_GET_BLOCKSIZE_1D(func, w, h)                                                                       \
+			int threads_per_block;                                                                              \
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+			int xblocks = ((w) + threads_per_block - 1)/threads_per_block;                                      \
+			int yblocks = h;
+
+#define CUDA_LAUNCH_KERNEL_1D(func, args)                       \
+			cuda_assert(cuLaunchKernel(func,                    \
+			                           xblocks, yblocks, 1,     \
+			                           threads_per_block, 1, 1, \
+			                           0, 0, args, 0));
+
 	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
 	                               DenoisingTask *task)
 	{
@@ -1095,60 +1108,65 @@ public:
 
 		CUDAContextScope scope(this);
 
-		int4 rect = task->rect;
-		int w = align_up(rect.z-rect.x, 4);
-		int h = rect.w-rect.y;
+		int stride = task->buffer.stride;
+		int w = task->buffer.width;
+		int h = task->buffer.h;
 		int r = task->nlm_state.r;
 		int f = task->nlm_state.f;
 		float a = task->nlm_state.a;
 		float k_2 = task->nlm_state.k_2;
 
-		CUdeviceptr difference     = task->nlm_state.temporary_1_ptr;
-		CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
-		CUdeviceptr weightAccum    = task->nlm_state.temporary_3_ptr;
+		int shift_stride = stride*h;
+		int num_shifts = (2*r+1)*(2*r+1);
+		int mem_size = sizeof(float)*shift_stride*2*num_shifts;
+		int channel_offset = 0;
 
-		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
-		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+		CUdeviceptr temporary_mem;
+		cuda_assert(cuMemAlloc(&temporary_mem, mem_size));
+		CUdeviceptr difference     = temporary_mem;
+		CUdeviceptr blurDifference = temporary_mem + sizeof(float)*shift_stride * num_shifts;
 
-		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
-		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-		cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-		cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-		cuda_assert(cuModuleGetFunction(&cuNLMNormalize,      cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+		CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride));
+		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride));
 
-		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize,      CU_FUNC_CACHE_PREFER_L1));
+		{
+			CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+			cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+			cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
 
-		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
 
-		int dx, dy;
-		int4 local_rect;
-		int channel_offset = 0;
-		void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
-		void *blur_args[]            = {&difference, &blurDifference, &local_rect, &w, &f};
-		void *calc_weight_args[]     = {&blurDifference, &difference, &local_rect, &w, &f};
-		void *update_output_args[]   = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
-
-		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-			dy = i / (2*r+1) - r;
-			dx = i % (2*r+1) - r;
-			local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
-
-			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
-			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
-			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-			CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
-		}
-
-		local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
-		void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
-		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
-		cuda_assert(cuCtxSynchronize());
+			CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
+
+			void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2};
+			void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shif

@@ Diff output truncated at 10240 characters. @@