[Bf-blender-cvs] [26bcfb0] soc-2016-cycles_denoising: Cycles: Implement GPU denoising

Lukas Stockner noreply at git.blender.org
Mon Jun 20 22:51:49 CEST 2016


Commit: 26bcfb0f9dc1bc2d7824f13d646a159f66b25a36
Author: Lukas Stockner
Date:   Mon Jun 20 22:48:25 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB26bcfb0f9dc1bc2d7824f13d646a159f66b25a36

Cycles: Implement GPU denoising

This commit adds the CUDA denoising kernels and host code.

===================================================================

M	intern/cycles/blender/blender_session.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/kernels/cuda/kernel.cu
M	intern/cycles/render/session.cpp

===================================================================

diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index ebda63f..a201c38 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -457,6 +457,7 @@ void BlenderSession::render()
 
 	/* get buffer parameters */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+	const bool is_cpu = session_params.device.type == DEVICE_CPU;
 	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
 	/* render each layer */
@@ -502,7 +503,7 @@ void BlenderSession::render()
 
 		buffer_params.passes = passes;
 		buffer_params.denoising_passes = b_layer_iter->keep_denoise_data() || b_layer_iter->denoise_result();
-		session->tile_manager.schedule_denoising = b_layer_iter->denoise_result();
+		session->tile_manager.schedule_denoising = b_layer_iter->denoise_result() && is_cpu;
 		session->params.denoise_result = b_layer_iter->denoise_result();
 		scene->film->denoising_passes = buffer_params.denoising_passes;
 		scene->film->denoise_flags = 0;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 968c4ed..2ecc447 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -683,6 +683,68 @@ public:
 		}
 	}
 
+	void denoise(RenderTile &rtile, int sample)
+	{
+		if(have_error())
+			return;
+
+		cuda_push_context();
+
+		CUfunction cuFilterEstimateParams, cuFilterFinalPass;
+		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
+
+		cuda_assert(cuModuleGetFunction(&cuFilterEstimateParams, cuModule, "kernel_cuda_filter_estimate_params"));
+		cuda_assert(cuModuleGetFunction(&cuFilterFinalPass, cuModule, "kernel_cuda_filter_final_pass"));
+
+		if(have_error())
+			return;
+
+		int filter_x = rtile.x + rtile.buffers->params.overscan, filter_y = rtile.y + rtile.buffers->params.overscan;
+		int filter_w = rtile.buffers->params.final_width, filter_h = rtile.buffers->params.final_height;
+
+		CUdeviceptr d_storage;
+		int storage_size = filter_w*filter_h*sizeof(FilterStorage);
+		cuda_assert(cuMemAlloc(&d_storage, storage_size));
+
+		void *args[] = {&sample,
+		                &d_buffer,
+		                &rtile.x,
+		                &rtile.y,
+		                &rtile.w,
+		                &rtile.h,
+		                &rtile.buffers->params.overscan,
+		                &rtile.offset,
+		                &rtile.stride,
+		                &d_storage};
+
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterEstimateParams));
+
+		int xthreads = (int)sqrt((float)threads_per_block);
+		int ythreads = (int)sqrt((float)threads_per_block);
+		int xblocks = (filter_w + xthreads - 1)/xthreads;
+		int yblocks = (filter_h + ythreads - 1)/ythreads;
+
+		cuda_assert(cuFuncSetCacheConfig(cuFilterEstimateParams, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterFinalPass, CU_FUNC_CACHE_PREFER_L1));
+
+		cuda_assert(cuLaunchKernel(cuFilterEstimateParams,
+		                           xblocks , yblocks, 1, /* blocks */
+		                           xthreads, ythreads, 1, /* threads */
+		                           0, 0, args, 0));
+
+		cuda_assert(cuLaunchKernel(cuFilterFinalPass,
+		                           xblocks , yblocks, 1, /* blocks */
+		                           xthreads, ythreads, 1, /* threads */
+		                           0, 0, args, 0));
+
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_assert(cuMemFree(d_storage));
+
+		cuda_pop_context();
+	}
+
 	void path_trace(RenderTile& rtile, int sample, bool branched)
 	{
 		if(have_error())
@@ -1130,9 +1192,13 @@ public:
 
 						task->update_progress(&tile);
 					}
+
+					if(tile.buffers->params.overscan) { /* TODO(lukas) Works, but seems hacky? */
+						denoise(tile, end_sample);
+					}
 				}
 				else if(tile.task == RenderTile::DENOISE) {
-					printf("TODO: Implement Denoising kernel, was called for tile at (%d, %d) with size %dx%d!\n", tile.x, tile.y, tile.w, tile.h);
+					assert(!"Explicitly scheduling tiles for denoising isn't supported on GPUs!");
 				}
 
 				task->release_tile(tile);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 37fae54..b743c00 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -24,6 +24,7 @@
 #include "../../kernel_path.h"
 #include "../../kernel_path_branched.h"
 #include "../../kernel_bake.h"
+#include "../../kernel_filter.h"
 
 /* device data taken from CUDA occupancy calculator */
 
@@ -205,5 +206,43 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int
 }
 #endif
 
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_estimate_params(int sample, float* buffers, int sx, int sy, int w, int h, int overscan, int offset, int stride, void *storage)
+{
+	int4 filter_rect = make_int4(sx + overscan, sy + overscan, sx+w - overscan, sy+h - overscan);
+	int lx = blockDim.x*blockIdx.x + threadIdx.x;
+	int ly = blockDim.y*blockIdx.y + threadIdx.y;
+	int x = filter_rect.x + lx;
+	int y = filter_rect.y + ly;
+	if(x < filter_rect.z && y < filter_rect.w) {
+		int tile_x[4] = {sx, sx, sx+w, sx+w};
+		int tile_y[4] = {sy, sy, sy+h, sy+h};
+		float *tile_buffers[9] = {NULL, NULL, NULL, NULL, buffers, NULL, NULL, NULL, NULL};
+		int tile_offset[9] = {0, 0, 0, 0, offset, 0, 0, 0, 0};
+		int tile_stride[9] = {0, 0, 0, 0, stride, 0, 0, 0, 0};
+		kernel_filter_estimate_params(NULL, sample, tile_buffers, x, y, tile_x, tile_y, tile_offset, tile_stride, (FilterStorage*) storage, filter_rect);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_final_pass(int sample, float* buffers, int sx, int sy, int w, int h, int overscan, int offset, int stride, void *storage)
+{
+	int4 filter_rect = make_int4(sx + overscan, sy + overscan, sx+w - overscan, sy+h - overscan);
+	int lx = blockDim.x*blockIdx.x + threadIdx.x;
+	int ly = blockDim.y*blockIdx.y + threadIdx.y;
+	int x = filter_rect.x + lx;
+	int y = filter_rect.y + ly;
+	if(x < filter_rect.z && y < filter_rect.w) {
+		int tile_x[4] = {sx, sx, sx+w, sx+w};
+		int tile_y[4] = {sy, sy, sy+h, sy+h};
+		float *tile_buffers[9] = {NULL, NULL, NULL, NULL, buffers, NULL, NULL, NULL, NULL};
+		int tile_offset[9] = {0, 0, 0, 0, offset, 0, 0, 0, 0};
+		int tile_stride[9] = {0, 0, 0, 0, stride, 0, 0, 0, 0};
+		kernel_filter_final_pass(NULL, sample, tile_buffers, x, y, tile_x, tile_y, tile_offset, tile_stride, (FilterStorage*) storage, filter_rect);
+	}
+}
+
 #endif
 
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c02a891..63091a1 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -380,6 +380,16 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 	rtile.tile_index = tile->index;
 	rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE;
 
+	int overscan = 0;
+	const bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
+	if(params.denoise_result && is_gpu) {
+		overscan = scene->integrator->half_window;
+		rtile.x -= overscan;
+		rtile.y -= overscan;
+		rtile.w += 2*overscan;
+		rtile.h += 2*overscan;
+	}
+
 	tile_lock.unlock();
 
 	/* in case of a permanent buffer, return it, otherwise we will allocate
@@ -403,7 +413,7 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 	buffer_params.full_y = rtile.y;
 	buffer_params.width = rtile.w;
 	buffer_params.height = rtile.h;
-	buffer_params.overscan = 0;
+	buffer_params.overscan = overscan;
 	buffer_params.final_width = rtile.w - 2*overscan;
 	buffer_params.final_height = rtile.h - 2*overscan;




More information about the Bf-blender-cvs mailing list