[Bf-blender-cvs] [29c15f9] soc-2016-cycles_denoising: Cycles: Support cross-frame denoising on CUDA
Lukas Stockner
noreply at git.blender.org
Tue Aug 23 19:06:05 CEST 2016
Commit: 29c15f9d68893c1bfe3fb66429c7f572d1816979
Author: Lukas Stockner
Date: Tue Aug 23 17:33:32 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB29c15f9d68893c1bfe3fb66429c7f572d1816979
Cycles: Support cross-frame denoising on CUDA
===================================================================
M intern/cycles/device/device_cuda.cpp
===================================================================
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 078c22b..f98fd76 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -866,195 +866,199 @@ public:
int xblocks = (rtile.w + xthreads - 1)/xthreads;
int yblocks = (rtile.h + ythreads - 1)/ythreads;
- CUdeviceptr d_denoise_buffer;
+ CUdeviceptr d_denoise_buffers;
int w = align_up(rtile.w, 4);
- int pass_stride = w*rtile.h;
- cuda_assert(cuMemAlloc(&d_denoise_buffer, 22*pass_stride*sizeof(float)));
+ int pass_stride = w*rtile.h*rtile.buffers->params.frames;
+ cuda_assert(cuMemAlloc(&d_denoise_buffers, 22*pass_stride*sizeof(float)));
#define CUDA_PTR_ADD(ptr, x) ((CUdeviceptr) (((float*) (ptr)) + (x)))
- /* ==== Step 1: Prefilter general features. ==== */
- {
- int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
- int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
- int offset_to[] = { 0, 2, 4, 10, 12, 14, 6 };
- for(int i = 0; i < 7; i++) {
- CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, offset_to[i]*pass_stride);
- CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, (offset_to[i]+1)*pass_stride);
- CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
-
- void *get_feature_args[] = {&sample, &d_buffers, &mean_from[i], &variance_from[i],
- &rtile.x, &rtile.y, &rtile.w, &rtile.h,
- &rtile.offset, &rtile.stride,
- &d_unfiltered, &d_variance,
- &rect};
- cuda_assert(cuLaunchKernel(cuFilterGetFeature,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, get_feature_args, 0));
+ for(int frame = 0; frame < rtile.buffers->params.frames; frame++) {
+ CUdeviceptr d_denoise_buffer = CUDA_PTR_ADD(d_denoise_buffers, w*rtile.h*frame);
+ CUdeviceptr d_buffer = CUDA_PTR_ADD(d_buffers, frame*rtile.buffers->params.width*rtile.buffers->params.height*rtile.buffers->params.get_passes_size());
+ /* ==== Step 1: Prefilter general features. ==== */
+ {
+ int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
+ int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
+ int offset_to[] = { 0, 2, 4, 10, 12, 14, 6 };
+ for(int i = 0; i < 7; i++) {
+ CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, offset_to[i]*pass_stride);
+ CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, (offset_to[i]+1)*pass_stride);
+ CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
+
+ void *get_feature_args[] = {&sample, &d_buffer, &mean_from[i], &variance_from[i],
+ &rtile.x, &rtile.y, &rtile.w, &rtile.h,
+ &rtile.offset, &rtile.stride,
+ &d_unfiltered, &d_variance,
+ &rect};
+ cuda_assert(cuLaunchKernel(cuFilterGetFeature,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, get_feature_args, 0));
+
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ float a = 1.0f, k_2 = 0.25f;
+ int r = 4, f = 2;
+ void *filter_feature_args[] = {&d_unfiltered, &d_unfiltered, &d_variance, &d_mean,
+ &rect,
+ &r, &f, &a, &k_2};
+ cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, filter_feature_args, 0));
+ }
+ }
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- float a = 1.0f, k_2 = 0.25f;
- int r = 4, f = 2;
- void *filter_feature_args[] = {&d_unfiltered, &d_unfiltered, &d_variance, &d_mean,
- &rect,
- &r, &f, &a, &k_2};
- cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
+ /* ==== Step 2: Prefilter shadow feature. ==== */
+ {
+ CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, 8*pass_stride);
+ CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, 9*pass_stride);
+ /* Reuse some passes of the filter_buffer for temporary storage. */
+ CUdeviceptr d_sampleV = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
+ CUdeviceptr d_sampleVV = CUDA_PTR_ADD(d_denoise_buffer, 17*pass_stride);
+ CUdeviceptr d_bufferV = CUDA_PTR_ADD(d_denoise_buffer, 18*pass_stride);
+ CUdeviceptr d_cleanV = CUDA_PTR_ADD(d_denoise_buffer, 19*pass_stride);
+ CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 20*pass_stride);
+ CUdeviceptr d_unfilteredA = CUDA_PTR_ADD(d_denoise_buffer, 20*pass_stride);
+ CUdeviceptr d_unfilteredB = CUDA_PTR_ADD(d_denoise_buffer, 21*pass_stride);
+ CUdeviceptr d_null = (CUdeviceptr) 0;
+ /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+ void *divide_args[] = {&sample, &d_buffer,
+ &rtile.x, &rtile.y, &rtile.w, &rtile.h,
+ &rtile.offset, &rtile.stride,
+ &d_unfiltered, &d_sampleV, &d_sampleVV, &d_bufferV,
+ &rect};
+ cuda_assert(cuLaunchKernel(cuFilterDivideShadow,
xblocks , yblocks, 1, /* blocks */
xthreads, ythreads, 1, /* threads */
- 0, 0, filter_feature_args, 0));
- }
- }
-
- /* ==== Step 2: Prefilter shadow feature. ==== */
- {
- CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, 8*pass_stride);
- CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, 9*pass_stride);
- /* Reuse some passes of the filter_buffer for temporary storage. */
- CUdeviceptr d_sampleV = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
- CUdeviceptr d_sampleVV = CUDA_PTR_ADD(d_denoise_buffer, 17*pass_stride);
- CUdeviceptr d_bufferV = CUDA_PTR_ADD(d_denoise_buffer, 18*pass_stride);
- CUdeviceptr d_cleanV = CUDA_PTR_ADD(d_denoise_buffer, 19*pass_stride);
- CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 20*pass_stride);
- CUdeviceptr d_unfilteredA = CUDA_PTR_ADD(d_denoise_buffer, 20*pass_stride);
- CUdeviceptr d_unfilteredB = CUDA_PTR_ADD(d_denoise_buffer, 21*pass_stride);
- CUdeviceptr d_null = (CUdeviceptr) 0;
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
- void *divide_args[] = {&sample, &d_buffers,
- &rtile.x, &rtile.y, &rtile.w, &rtile.h,
- &rtile.offset, &rtile.stride,
- &d_unfiltered, &d_sampleV, &d_sampleVV, &d_bufferV,
- &rect};
- cuda_assert(cuLaunchKernel(cuFilterDivideShadow,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, divide_args, 0));
+ 0, 0, divide_args, 0));
#ifdef WITH_CYCLES_DEBUG_FILTER
#define WRITE_DEBUG(name, ptr) debug_write_pfm(string_printf("debug_%dx%d_cuda_shadow_%s.pfm", rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan, name).c_str(), ptr, rtile.w, rtile.h, 1, w)
- float *temp = new float[pass_stride*6];
- cuda_assert(cuMemcpyDtoH(temp, d_sampleV, 6*pass_stride*sizeof(float)));
-
- WRITE_DEBUG("unfilteredA", temp + 4*pass_stride);
- WRITE_DEBUG("unfilteredB", temp + 5*pass_stride);
- WRITE_DEBUG("bufferV", temp + 2*pass_stride);
- WRITE_DEBUG("sampleV", temp + 0*pass_stride);
- WRITE_DEBUG("sampleVV", temp + 1*pass_stride);
+ float *temp = new float[pass_stride*6];
+ cuda_assert(cuMemcpyDtoH(temp, d_sampleV, 6*pass_stride*sizeof(float)));
+
+ WRITE_DEBUG("unfilteredA", temp + 4*pass_stride);
+ WRITE_DEBUG("unfilteredB", temp + 5*pass_stride);
+ WRITE_DEBUG("bufferV", temp + 2*pass_stride);
+ WRITE_DEBUG("sampleV", temp + 0*pass_stride);
+ WRITE_DEBUG("sampleVV", temp + 1*pass_stride);
#endif
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- float a = 2.0f, k_2 = 2.0f;
- int r = 6, f = 3;
- void *filter_variance_args[] = {&d_bufferV, &d_sampleV, &d_sampleVV, &d_cleanV,
- &rect,
- &r, &f, &a, &k_2};
- cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, filter_variance_args, 0));
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ float a = 2.0f, k_2 = 2.0f;
+ int r = 6, f = 3;
+ void *filter_variance_args[] = {&d_bufferV, &d_sampleV, &d_sampleVV, &d_cleanV,
+ &rect,
+ &r, &f, &a, &k_2};
+ cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, filter_variance_args, 0));
#ifdef WITH_CYCLES_DEBUG_FILTER
- cuda_assert(cuMemcpyDtoH(temp, d_cleanV, pass_stride*sizeof(float)));
- WRITE_DEBUG("cleanV", temp);
+ cuda_assert(cuMemcpyDtoH(temp, d_cleanV, pass_stride*sizeof(float)));
+ WRITE_DEBUG("cleanV", temp);
#endif
- /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- a = 1.0f; k_2 = 0.25f;
- r = 5; f = 3;
- void *filter_unfilteredA_args[] = {&d_unfilteredA, &d_u
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list