[Bf-blender-cvs] [6da6f8d33f6] master: Cycles: CUDA faster rendering of small tiles, using multiple samples like OpenCL.

Brecht Van Lommel noreply at git.blender.org
Wed Oct 4 22:24:08 CEST 2017


Commit: 6da6f8d33f65b427162d0c8b13a5a5f5043bc8a5
Author: Brecht Van Lommel
Date:   Wed Sep 27 01:38:19 2017 +0200
Branches: master
https://developer.blender.org/rB6da6f8d33f65b427162d0c8b13a5a5f5043bc8a5

Cycles: CUDA faster rendering of small tiles, using multiple samples like OpenCL.

The work size is still very conservative, and this doesn't help for progressive
refine. For that we will need to render multiple tiles at the same time. But this
should already help for denoising renders that require too much memory with big
tiles, and just generally soften the performance dropoff with small tiles.

Differential Revision: https://developer.blender.org/D2856
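
To make the effect on small tiles concrete, here is a standalone C++ sketch of the per-launch sample sizing that the patch below introduces. It is not part of the commit; the occupancy numbers and the 32x32 tile are hypothetical, and only divide_up() mirrors the Cycles utility of the same name.

#include <algorithm>
#include <cstdio>

static unsigned divide_up(unsigned x, unsigned y) { return (x + y - 1) / y; }

int main()
{
	/* Pretend the occupancy query asked for 40 blocks of 256 threads. */
	const unsigned min_blocks = 40, threads_per_block = 256;
	/* A small 32x32 tile only offers 1024 pixels of parallel work. */
	const unsigned tile_w = 32, tile_h = 32, pixels = tile_w * tile_h;

	/* Samples per launch needed so pixels * samples fills the device. */
	const unsigned step_samples = divide_up(min_blocks * threads_per_block, pixels);

	/* Walk the sample range in those steps, one kernel launch each. */
	const unsigned start_sample = 0, end_sample = 32;
	for(unsigned sample = start_sample; sample < end_sample; sample += step_samples) {
		unsigned num_samples = std::min(step_samples, end_sample - sample);
		unsigned total_work_size = pixels * num_samples;
		printf("launch: samples %u-%u, %u work items, %u blocks\n",
		       sample, sample + num_samples - 1, total_work_size,
		       divide_up(total_work_size, threads_per_block));
	}
	return 0;
}

With these made-up numbers the 32x32 tile renders 10 samples per launch instead of 1, so each launch still dispatches 40 blocks of work.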

===================================================================

M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/kernel_passes.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 54e012191ae..d84771aefda 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1281,17 +1281,16 @@ public:
 		task.unmap_neighbor_tiles(rtiles, this);
 	}
 
-	void path_trace(RenderTile& rtile, int sample, bool branched)
+	void path_trace(DeviceTask& task, RenderTile& rtile)
 	{
 		if(have_error())
 			return;
 
 		CUDAContextScope scope(this);
-
 		CUfunction cuPathTrace;
 
-		/* get kernel function */
-		if(branched) {
+		/* Get kernel function. */
+		if(task.integrator_branched) {
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
 		}
 		else {
@@ -1304,7 +1303,7 @@ public:
 
 		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
-		/* allocate work tile */
+		/* Allocate work tile. */
 		device_vector<WorkTile> work_tiles;
 		work_tiles.resize(1);
 
@@ -1315,32 +1314,50 @@ public:
 		wtile->h = rtile.h;
 		wtile->offset = rtile.offset;
 		wtile->stride = rtile.stride;
-		wtile->start_sample = sample;
-		wtile->num_samples = 1;
 		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
-
 		mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
-		mem_copy_to(work_tiles);
 
 		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 
-		uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+		/* Prepare work size. More step samples render faster, but for now we
+		 * remain conservative to avoid driver timeouts. */
+		int min_blocks, num_threads_per_block;
+		cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+		uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
 
-		/* pass in parameters */
-		void *args[] = {&d_work_tiles,
-		                &total_work_size};
+		/* Render all samples. */
+		int start_sample = rtile.start_sample;
+		int end_sample = rtile.start_sample + rtile.num_samples;
 
-		/* launch kernel */
-		int num_threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
-		int num_blocks = divide_up(total_work_size, num_threads_per_block);
+		for(int sample = start_sample; sample < end_sample; sample += step_samples) {
+			/* Setup and copy work tile to device. */
+			wtile->start_sample = sample;
+			wtile->num_samples = min(step_samples, end_sample - sample);
+			mem_copy_to(work_tiles);
 
-		cuda_assert(cuLaunchKernel(cuPathTrace,
-		                           num_blocks, 1, 1,
-		                           num_threads_per_block, 1, 1,
-		                           0, 0, args, 0));
+			uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+			uint num_blocks = divide_up(total_work_size, num_threads_per_block);
 
-		cuda_assert(cuCtxSynchronize());
+			/* Launch kernel. */
+			void *args[] = {&d_work_tiles,
+			                &total_work_size};
+
+			cuda_assert(cuLaunchKernel(cuPathTrace,
+			                           num_blocks, 1, 1,
+			                           num_threads_per_block, 1, 1,
+			                           0, 0, args, 0));
+
+			cuda_assert(cuCtxSynchronize());
+
+			/* Update progress. */
+			rtile.sample = sample + wtile->num_samples;
+			task.update_progress(&rtile, rtile.w*rtile.h);
+
+			if(task.get_cancel()) {
+				if(task.need_finish_queue == false)
+					break;
+			}
+		}
 
 		mem_free(work_tiles);
 	}
@@ -1700,8 +1717,6 @@ public:
 		if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 
-			bool branched = task->integrator_branched;
-
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
 
@@ -1725,21 +1740,7 @@ public:
 						split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 					}
 					else {
-						int start_sample = tile.start_sample;
-						int end_sample = tile.start_sample + tile.num_samples;
-
-						for(int sample = start_sample; sample < end_sample; sample++) {
-							if(task->get_cancel()) {
-								if(task->need_finish_queue == false)
-									break;
-							}
-
-							path_trace(tile, sample, branched);
-
-							tile.sample = sample + 1;
-
-							task->update_progress(&tile, tile.w*tile.h);
-						}
+						path_trace(*task, tile);
 					}
 				}
 				else if(tile.task == RenderTile::DENOISE) {
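
For reference on the occupancy query used above: cuOccupancyMaxPotentialBlockSize() reports a block size together with the minimum grid size that keeps the device busy, and that product is what the conservative step_samples estimate is derived from. A minimal driver-API usage sketch outside of Cycles might look roughly like the following; the module and kernel names are placeholders and error checking is omitted.

#include <cuda.h>
#include <stdio.h>

int main(void)
{
	CUdevice device;
	CUcontext context;
	CUmodule module;
	CUfunction kernel;

	cuInit(0);
	cuDeviceGet(&device, 0);
	cuCtxCreate(&context, 0, device);
	cuModuleLoad(&module, "kernel.cubin");              /* placeholder */
	cuModuleGetFunction(&kernel, module, "my_kernel");  /* placeholder */

	int min_blocks, threads_per_block;
	/* No dynamic shared memory callback, no block size limit, matching the
	 * NULL, 0, 0 arguments used in the patch above. */
	cuOccupancyMaxPotentialBlockSize(&min_blocks, &threads_per_block,
	                                 kernel, NULL, 0, 0);

	printf("%d blocks of %d threads keep the device occupied (%d threads)\n",
	       min_blocks, threads_per_block, min_blocks * threads_per_block);

	cuCtxDestroy(context);
	return 0;
}
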
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 239598f7dab..644cc173571 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -16,19 +16,23 @@
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#define __ATOMIC_PASS_WRITE__
+#endif
+
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	atomic_add_and_fetch_float(buf, value);
 #else
 	*buf += value;
-#endif  /* __SPLIT_KERNEL__ */
+#endif
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
 {
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -39,12 +43,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf += value;
-#endif  /* __SPLIT_KERNEL__ */
+#endif
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
 {
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -57,7 +61,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf += value;
-#endif  /* __SPLIT_KERNEL__ */
+#endif
 }
 
 #ifdef __DENOISING_FEATURES__
@@ -70,7 +74,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer
 	kernel_write_pass_float(buffer+1, value*value);
 }
 
-#  if defined(__SPLIT_KERNEL__)
+#  ifdef __ATOMIC_PASS_WRITE__
 #    define kernel_write_pass_float3_unaligned kernel_write_pass_float3
 #  else
 ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
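
The switch from __SPLIT_KERNEL__ to __ATOMIC_PASS_WRITE__ above matters because the CUDA kernel can now have several samples of the same pixel in flight within one launch, so pass accumulation must be atomic. A small hypothetical CUDA sketch of the race that the atomic add avoids (not Cycles code, just the underlying pattern):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void accumulate(float *pixel_pass)
{
	/* Every thread stands in for one sample of the same pixel. A plain
	 * "*pixel_pass += 1.0f" here would be a data race and lose samples;
	 * the atomic add is what __ATOMIC_PASS_WRITE__ selects. */
	atomicAdd(pixel_pass, 1.0f);
}

int main()
{
	float host_value = 0.0f, *device_value;
	cudaMalloc(&device_value, sizeof(float));
	cudaMemcpy(device_value, &host_value, sizeof(float), cudaMemcpyHostToDevice);

	accumulate<<<1, 64>>>(device_value);  /* 64 "samples" of one pixel */
	cudaDeviceSynchronize();

	cudaMemcpy(&host_value, device_value, sizeof(float), cudaMemcpyDeviceToHost);
	printf("accumulated: %f (expect 64 with atomics)\n", host_value);

	cudaFree(device_value);
	return 0;
}
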
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 799cd587fcf..e72edfa7bdf 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -21,6 +21,8 @@
 #include "kernel/kernel_compat_cuda.h"
 #include "kernel_config.h"
 
+#include "util/util_atomic.h"
+
 #include "kernel/kernel_math.h"
 #include "kernel/kernel_types.h"
 #include "kernel/kernel_globals.h"


