[Bf-blender-cvs] [26bcfb0] soc-2016-cycles_denoising: Cycles: Implement GPU denoising
Lukas Stockner
noreply at git.blender.org
Mon Jun 20 22:51:49 CEST 2016
Commit: 26bcfb0f9dc1bc2d7824f13d646a159f66b25a36
Author: Lukas Stockner
Date: Mon Jun 20 22:48:25 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB26bcfb0f9dc1bc2d7824f13d646a159f66b25a36
Cycles: Implement GPU denoising
This commit adds the CUDA denoising kernels and host code.
===================================================================
M intern/cycles/blender/blender_session.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/kernels/cuda/kernel.cu
M intern/cycles/render/session.cpp
===================================================================
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index ebda63f..a201c38 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -457,6 +457,7 @@ void BlenderSession::render()
/* get buffer parameters */
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+ const bool is_cpu = session_params.device.type == DEVICE_CPU;
BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
/* render each layer */
@@ -502,7 +503,7 @@ void BlenderSession::render()
buffer_params.passes = passes;
buffer_params.denoising_passes = b_layer_iter->keep_denoise_data() || b_layer_iter->denoise_result();
- session->tile_manager.schedule_denoising = b_layer_iter->denoise_result();
+ session->tile_manager.schedule_denoising = b_layer_iter->denoise_result() && is_cpu;
session->params.denoise_result = b_layer_iter->denoise_result();
scene->film->denoising_passes = buffer_params.denoising_passes;
scene->film->denoise_flags = 0;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 968c4ed..2ecc447 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -683,6 +683,68 @@ public:
}
}
+ void denoise(RenderTile &rtile, int sample)
+ {
+ if(have_error())
+ return;
+
+ cuda_push_context();
+
+ CUfunction cuFilterEstimateParams, cuFilterFinalPass;
+ CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
+
+ cuda_assert(cuModuleGetFunction(&cuFilterEstimateParams, cuModule, "kernel_cuda_filter_estimate_params"));
+ cuda_assert(cuModuleGetFunction(&cuFilterFinalPass, cuModule, "kernel_cuda_filter_final_pass"));
+
+ if(have_error())
+ return;
+
+ int filter_x = rtile.x + rtile.buffers->params.overscan, filter_y = rtile.y + rtile.buffers->params.overscan;
+ int filter_w = rtile.buffers->params.final_width, filter_h = rtile.buffers->params.final_height;
+
+ CUdeviceptr d_storage;
+ int storage_size = filter_w*filter_h*sizeof(FilterStorage);
+ cuda_assert(cuMemAlloc(&d_storage, storage_size));
+
+ void *args[] = {&sample,
+ &d_buffer,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.buffers->params.overscan,
+ &rtile.offset,
+ &rtile.stride,
+ &d_storage};
+
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterEstimateParams));
+
+ int xthreads = (int)sqrt((float)threads_per_block);
+ int ythreads = (int)sqrt((float)threads_per_block);
+ int xblocks = (filter_w + xthreads - 1)/xthreads;
+ int yblocks = (filter_h + ythreads - 1)/ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(cuFilterEstimateParams, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterFinalPass, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(cuFilterEstimateParams,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, args, 0));
+
+ cuda_assert(cuLaunchKernel(cuFilterFinalPass,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_assert(cuMemFree(d_storage));
+
+ cuda_pop_context();
+ }
+
void path_trace(RenderTile& rtile, int sample, bool branched)
{
if(have_error())
@@ -1130,9 +1192,13 @@ public:
task->update_progress(&tile);
}
+
+ if(tile.buffers->params.overscan) { /* TODO(lukas) Works, but seems hacky? */
+ denoise(tile, end_sample);
+ }
}
else if(tile.task == RenderTile::DENOISE) {
- printf("TODO: Implement Denoising kernel, was called for tile at (%d, %d) with size %dx%d!\n", tile.x, tile.y, tile.w, tile.h);
+ assert(!"Explicitly scheduling tiles for denoising isn't supported on GPUs!");
}
task->release_tile(tile);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 37fae54..b743c00 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -24,6 +24,7 @@
#include "../../kernel_path.h"
#include "../../kernel_path_branched.h"
#include "../../kernel_bake.h"
+#include "../../kernel_filter.h"
/* device data taken from CUDA occupancy calculator */
@@ -205,5 +206,43 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int
}
#endif
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_estimate_params(int sample, float* buffers, int sx, int sy, int w, int h, int overscan, int offset, int stride, void *storage)
+{
+ int4 filter_rect = make_int4(sx + overscan, sy + overscan, sx+w - overscan, sy+h - overscan);
+ int lx = blockDim.x*blockIdx.x + threadIdx.x;
+ int ly = blockDim.y*blockIdx.y + threadIdx.y;
+ int x = filter_rect.x + lx;
+ int y = filter_rect.y + ly;
+ if(x < filter_rect.z && y < filter_rect.w) {
+ int tile_x[4] = {sx, sx, sx+w, sx+w};
+ int tile_y[4] = {sy, sy, sy+h, sy+h};
+ float *tile_buffers[9] = {NULL, NULL, NULL, NULL, buffers, NULL, NULL, NULL, NULL};
+ int tile_offset[9] = {0, 0, 0, 0, offset, 0, 0, 0, 0};
+ int tile_stride[9] = {0, 0, 0, 0, stride, 0, 0, 0, 0};
+ kernel_filter_estimate_params(NULL, sample, tile_buffers, x, y, tile_x, tile_y, tile_offset, tile_stride, (FilterStorage*) storage, filter_rect);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_final_pass(int sample, float* buffers, int sx, int sy, int w, int h, int overscan, int offset, int stride, void *storage)
+{
+ int4 filter_rect = make_int4(sx + overscan, sy + overscan, sx+w - overscan, sy+h - overscan);
+ int lx = blockDim.x*blockIdx.x + threadIdx.x;
+ int ly = blockDim.y*blockIdx.y + threadIdx.y;
+ int x = filter_rect.x + lx;
+ int y = filter_rect.y + ly;
+ if(x < filter_rect.z && y < filter_rect.w) {
+ int tile_x[4] = {sx, sx, sx+w, sx+w};
+ int tile_y[4] = {sy, sy, sy+h, sy+h};
+ float *tile_buffers[9] = {NULL, NULL, NULL, NULL, buffers, NULL, NULL, NULL, NULL};
+ int tile_offset[9] = {0, 0, 0, 0, offset, 0, 0, 0, 0};
+ int tile_stride[9] = {0, 0, 0, 0, stride, 0, 0, 0, 0};
+ kernel_filter_final_pass(NULL, sample, tile_buffers, x, y, tile_x, tile_y, tile_offset, tile_stride, (FilterStorage*) storage, filter_rect);
+ }
+}
+
#endif
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c02a891..63091a1 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -380,6 +380,16 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
rtile.tile_index = tile->index;
rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE;
+ int overscan = 0;
+ const bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
+ if(params.denoise_result && is_gpu) {
+ overscan = scene->integrator->half_window;
+ rtile.x -= overscan;
+ rtile.y -= overscan;
+ rtile.w += 2*overscan;
+ rtile.h += 2*overscan;
+ }
+
tile_lock.unlock();
/* in case of a permanent buffer, return it, otherwise we will allocate
@@ -403,7 +413,7 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
buffer_params.full_y = rtile.y;
buffer_params.width = rtile.w;
buffer_params.height = rtile.h;
- buffer_params.overscan = 0;
+ buffer_params.overscan = overscan;
buffer_params.final_width = rtile.w - 2*overscan;
buffer_params.final_height = rtile.h - 2*overscan;
More information about the Bf-blender-cvs
mailing list