[Bf-blender-cvs] [11c9e0eb97] cycles-tiles-rework: Cycles: Rendering multiple tiles at once for CUDA
Mai Lavelle
noreply at git.blender.org
Fri Jan 13 02:52:37 CET 2017
Commit: 11c9e0eb97b0dc8b16a1ce21d78c6a8ad7ddd8ae
Author: Mai Lavelle
Date: Fri Jan 6 21:44:08 2017 -0500
Branches: cycles-tiles-rework
https://developer.blender.org/rB11c9e0eb97b0dc8b16a1ce21d78c6a8ad7ddd8ae
Cycles: Rendering multiple tiles at once for CUDA
There's no major performance difference compared to rendering a single large
tile. However, if there are too many tiles (such as when the tiles are
16x16), the overhead of updating the tile display can become noticeable.
This should be easy to fix by restructuring the tile update code a little.
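Note: the diff relies on a SampleRange struct defined elsewhere in the
branch; it does not appear in this commit. A minimal sketch of its presumed
layout, inferred from the field accesses in kernel.cu below (the actual
definition may differ):

struct SampleRange {
	float *buffer;      /* render buffer of the tile */
	uint *rng_state;    /* RNG state of the tile */
	int sample;         /* sample being rendered */
	int x, y, w, h;     /* tile rectangle within the frame */
	int offset, stride; /* addressing into the render buffer */
	int work_offset;    /* running total of w*h over preceding ranges */
};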
===================================================================
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/kernels/cuda/kernel.cu
===================================================================
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 233f94be1b..6754c91702 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -32,6 +32,7 @@
# include <cudaGL.h>
#endif
#include "util_debug.h"
+#include "util_foreach.h"
#include "util_logging.h"
#include "util_map.h"
#include "util_md5.h"
@@ -826,16 +827,50 @@ public:
}
}
- void path_trace(RenderTile& rtile, int sample, bool branched)
+ void path_trace(vector<RenderTile>& rtiles, int sample, bool branched)
{
if(have_error())
return;
cuda_push_context();
+ /* set the sample ranges */
+ CUdeviceptr d_sample_ranges;
+ cuda_assert(cuMemAlloc(&d_sample_ranges, sizeof(SampleRange) * rtiles.size()));
+
+ CUfunction cuSetSampleRange;
+ cuda_assert(cuModuleGetFunction(&cuSetSampleRange, cuModule, "kernel_cuda_set_sample_range"));
+
+ if(have_error())
+ return;
+
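+ /* each range is initialized by its own single-thread launch; launches on
+  * the same stream execute in order, so each one can read the previous
+  * range's work_offset on the device without a host round trip */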
+ for(int i = 0; i < rtiles.size(); i++) {
+ RenderTile& rtile = rtiles[i];
+
+ CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
+ CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
+
+ void *args[] = {
+ &d_sample_ranges,
+ &i,
+ &d_buffer,
+ &d_rng_state,
+ &sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride,
+ };
+
+ cuda_assert(cuLaunchKernel(cuSetSampleRange,
+ 1, 1, 1, /* blocks */
+ 1, 1, 1, /* threads */
+ 0, 0, args, 0));
+ }
+
CUfunction cuPathTrace;
- CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
- CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
/* get kernel function */
if(branched) {
@@ -849,15 +884,8 @@ public:
return;
/* pass in parameters */
- void *args[] = {&d_buffer,
- &d_rng_state,
- &sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride};
+ int num_sample_ranges = rtiles.size();
+ void *args[] = {&d_sample_ranges, &num_sample_ranges};
/* launch kernel */
int threads_per_block;
@@ -871,8 +899,9 @@ public:
int xthreads = (int)sqrt(threads_per_block);
int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (rtile.w + xthreads - 1)/xthreads;
- int yblocks = (rtile.h + ythreads - 1)/ythreads;
+ /* TODO(mai): calculate a reasonable grid size for the device */
+ int xblocks = (256 + xthreads - 1)/xthreads;
+ int yblocks = (256 + ythreads - 1)/ythreads;
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
@@ -883,6 +912,8 @@ public:
cuda_assert(cuCtxSynchronize());
+ cuda_assert(cuMemFree(d_sample_ranges));
+
cuda_pop_context();
}
@@ -1251,17 +1282,26 @@ public:
void thread_run(DeviceTask *task)
{
if(task->type == DeviceTask::PATH_TRACE) {
- RenderTile tile;
-
bool branched = task->integrator_branched;
/* Upload Bindless Mapping */
load_bindless_mapping();
+ /* TODO(mai): calculate a reasonable grid size for the device */
+ RenderWorkRequest work_request = {256*256, 256*256};
+ vector<RenderTile> tiles;
+
/* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ while(task->acquire_tiles(this, tiles, work_request)) {
+ int start_sample = tiles[0].start_sample;
+ int end_sample = tiles[0].start_sample + tiles[0].num_samples;
+
+#ifndef NDEBUG
+ foreach(RenderTile& tile, tiles) {
+ assert(start_sample == tile.start_sample);
+ assert(end_sample == tile.start_sample + tile.num_samples);
+ }
+#endif
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
@@ -1269,14 +1309,22 @@ public:
break;
}
- path_trace(tile, sample, branched);
+ path_trace(tiles, sample, branched);
- tile.sample = sample + 1;
+ int pixel_samples = 0;
+ foreach(RenderTile& tile, tiles) {
+ tile.sample = sample + 1;
+ pixel_samples += tile.w * tile.h;
+ }
+
+ task->update_progress(tiles, pixel_samples);
+ }
- task->update_progress(&tile, tile.w*tile.h);
+ foreach(RenderTile& tile, tiles) {
+ task->release_tile(tile);
}
- task->release_tile(tile);
+ tiles.clear();
}
}
else if(task->type == DeviceTask::SHADER) {
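(Aside: the hard-coded 256x256 thread pool in the launch above has to stay in
sync with the RenderWorkRequest here. A hedged sketch of sizing the grid from
the tiles actually acquired instead, per the TODOs; hypothetical code, not
part of this commit:)

int total_work = 0;
foreach(RenderTile& rtile, rtiles)
	total_work += rtile.w * rtile.h;

/* square grid just large enough to cover every (tile, pixel) work item */
int side = (int)ceilf(sqrtf((float)total_work));
int xblocks = (side + xthreads - 1)/xthreads;
int yblocks = (side + ythreads - 1)/ythreads;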
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index eb2b6ea541..2c73ba4888 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -122,28 +122,117 @@
#endif
/* kernels */
+extern "C" __global__ void
+kernel_cuda_set_sample_range(SampleRange *sample_ranges, int range, float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ SampleRange* sample_range = &sample_ranges[range];
+
+ sample_range->buffer = buffer;
+ sample_range->rng_state = rng_state;
+ sample_range->sample = sample;
+ sample_range->x = sx;
+ sample_range->y = sy;
+ sample_range->w = sw;
+ sample_range->h = sh;
+ sample_range->offset = offset;
+ sample_range->stride = stride;
+
+ if(range == 0) {
+ sample_range->work_offset = 0;
+ }
+ else {
+ SampleRange* prev_range = &sample_ranges[range-1];
+ sample_range->work_offset = prev_range->work_offset + prev_range->w * prev_range->h;
+ }
+}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_path_trace(SampleRange *sample_ranges, int num_sample_ranges)
{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+ /* order threads to maintain inner block coherency */
+ const int group_id = blockIdx.x + gridDim.x * blockIdx.y;
+ const int local_thread_id = threadIdx.x + threadIdx.y * blockDim.x;
- if(x < sx + sw && y < sy + sh)
- kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+ const int thread_id = group_id * (blockDim.x * blockDim.y) + local_thread_id;
+
+ /* find which sample range belongs to this thread */
+ SampleRange* sample_range = NULL;
+
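+ /* linear scan: the ranges are few and disjoint, so at most one matches */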
+ for(int i = 0; i < num_sample_ranges; i++) {
+ if(thread_id >= sample_ranges[i].work_offset &&
+ thread_id < sample_ranges[i].work_offset + sample_ranges[i].w * sample_ranges[i].h)
+ {
+ sample_range = &sample_ranges[i];
+ }
+ }
+
+ /* check if there's work for this thread */
+ if(!sample_range) {
+ return;
+ }
+
+ int work_offset = thread_id - sample_range->work_offset;
+
+ if(work_offset < 0 || work_offset >= sample_range->w * sample_range->h) {
+ return;
+ }
+
+ int x = (work_offset % sample_range->w) + sample_range->x;
+ int y = (work_offset / sample_range->w) + sample_range->y;
+
+ kernel_path_trace(NULL,
+ sample_range->buffer,
+ sample_range->rng_state,
+ sample_range->sample,
+ x, y,
+ sample_range->offset,
+ sample_range->stride);
}
#ifdef __BRANCHED_PATH__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_branched_path_trace(SampleRange *sample_ranges, int num_sample_ranges)
{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+ /* order threads to maintain inner block coherency */
+ const int group_id = blockIdx.x + gridDim.x * blockIdx.y;
+ const int local_thread_id = threadIdx.x + threadIdx.y * blockDim.x;
- if(x < sx + sw && y < sy + sh)
- kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+ const int thread_id = group_id * (blockDim.x * blockDim.y) + local_thread_id;
+
+ /* find which sample range belongs to this thread */
+ SampleRange* sample_range = NULL;
+
+ for(int i = 0; i < num_sample_ranges; i++) {
+ if(thread_id >= sample_ranges[i].work_offset &&
+ thread_id < sample_ranges[i].work_offset + sample_ranges[i].w * sample_ranges[i].h)
+ {
+ sample_range = &sample_ranges[i];
+ }
+ }
+
+ /* check if there's work for this thread */
+ if(!sample_range) {
+ return;
+ }
+
+ int work_offset = thread_id - sample_range->work_offset;
+
+ if(work_offset < 0 || work_offset >= sample_range->w * sample_range->h) {
+ return;
+ }
+
+ int x = (work_offset % sample_range->w) + sample_range->x;
+ int y = (work_offset / sample_range->w) + sample_range->y;
+
+ kernel_branched_path_trace(NULL,
+ sample_range->buffer,
+ sample_range->rng_state,
+ sample_range->sample,
+ x, y,
+ sample_range->offset,
+ sample_range->stride);
}
#endif
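To make the flattened-index mapping concrete, a worked example with two
hypothetical tiles:

/* tile 0: 16x16 at (0,0)  -> work_offset = 0,   work items [0, 256)
 * tile 1: 32x32 at (16,0) -> work_offset = 256, work items [256, 1280)
 *
 * thread_id 300 falls in tile 1:
 *   work_offset = 300 - 256 = 44
 *   x = (44 % 32) + 16 = 28
 *   y = (44 / 32) + 0  = 1
 *
 * threads with thread_id >= 1280 find no range and return early */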