[Bf-blender-cvs] [466bd61857] soc-2016-cycles_denoising: Cycles: Implement new NLM kernels for CUDA
Lukas Stockner
noreply at git.blender.org
Fri Jan 20 05:46:41 CET 2017
Commit: 466bd61857f61b73b8006db8ccc2e78799f4ef30
Author: Lukas Stockner
Date: Fri Jan 13 00:09:09 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB466bd61857f61b73b8006db8ccc2e78799f4ef30
Cycles: Implement new NLM kernels for CUDA
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/filter/filter.h
R098 intern/cycles/kernel/filter/filter_nlm.h intern/cycles/kernel/filter/filter_nlm_cpu.h
A intern/cycles/kernel/filter/filter_nlm_gpu.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a596097f15..dfc6995e53 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -336,7 +336,7 @@ public:
/* Reuse some passes of the filter_buffer for temporary storage. */
float *sampleV = PASSPTR(0), *sampleVV = PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3);
float *unfiltered = PASSPTR(4), *unfilteredB = PASSPTR(5);
- float *diffI = PASSPTR(10), *blurDiffI = PASSPTR(11), *accumI = PASSPTR(12);
+ float *nlm_temp1 = PASSPTR(10), *nlm_temp2 = PASSPTR(11), *nlm_temp3 = PASSPTR(12);
/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
for(int y = rect.y; y < rect.w; y++) {
@@ -354,14 +354,14 @@ public:
#endif
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, diffI, blurDiffI, accumI, 6, 3, 4.0f, 1.0f);
+ non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 6, 3, 4.0f, 1.0f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("cleanV", cleanV);
#endif
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- non_local_means(rect, unfiltered, unfilteredB, sampleV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
- non_local_means(rect, unfilteredB, unfiltered, bufferV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
+ non_local_means(rect, unfiltered, unfilteredB, sampleV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
+ non_local_means(rect, unfilteredB, unfiltered, bufferV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("filteredA", sampleV);
WRITE_DEBUG("filteredB", bufferV);
@@ -378,8 +378,8 @@ public:
#endif
/* Use the residual variance for a second filter pass. */
- non_local_means(rect, sampleV, bufferV, unfiltered , sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
- non_local_means(rect, bufferV, sampleV, unfilteredB, sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
+ non_local_means(rect, sampleV, bufferV, unfiltered , sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
+ non_local_means(rect, bufferV, sampleV, unfilteredB, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("finalA", unfiltered);
WRITE_DEBUG("finalB", unfiltered + pass_stride);
@@ -403,7 +403,7 @@ public:
{
float *unfiltered = PASSPTR(16);
- float *diffI = PASSPTR(17), *blurDiffI = PASSPTR(18), *accumI = PASSPTR(19);
+ float *nlm_temp1 = PASSPTR(17), *nlm_temp2 = PASSPTR(18), *nlm_temp3 = PASSPTR(19);
/* Order in render buffers:
* Normal[X, Y, Z] NormalVar[X, Y, Z] Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
* 0 1 2 3 4 5 6 7 8 9 10 11 12 13
@@ -423,7 +423,7 @@ public:
filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, PASSPTR(offset_to[i]+1), &rect.x);
}
}
- non_local_means(rect, unfiltered, unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), diffI, blurDiffI, accumI, 2, 2, 1, 0.25f);
+ non_local_means(rect, unfiltered, unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), nlm_temp1, nlm_temp2, nlm_temp3, 2, 2, 1, 0.25f);
#ifdef WITH_CYCLES_DEBUG_FILTER
#define WRITE_DEBUG(name, var) debug.add_pass(string_printf("f%d_%s", i, name), var, 1, w);
WRITE_DEBUG("unfiltered", unfiltered);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 27e5b7af5b..5d347aea22 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -830,6 +830,76 @@ public:
}
}
+ void non_local_means(int4 rect, CUdeviceptr image, CUdeviceptr weight, CUdeviceptr out, CUdeviceptr variance, CUdeviceptr difference, CUdeviceptr blurDifference, CUdeviceptr weightAccum, int r, int f, float a, float k_2) { /* Host-side driver: runs the kernel_cuda_filter_nlm_* kernel sequence once per offset (dx, dy) in a (2r+1)x(2r+1) window, then launches the normalize kernel on out/weightAccum. */
+ int w = align_up(rect.z-rect.x, 4); /* Rect width passed through align_up(..., 4); handed to every kernel as "w" (presumably the row stride of the buffers -- confirm against the kernels). */
+ int h = rect.w-rect.y; /* Rect height, unaligned. */
+ /* Clear sizeof(float)*w*h bytes of both the weight accumulator and the output buffer before the offset loop. */
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out, 0, sizeof(float)*w*h));
+ /* Look up the five NLM kernels by name in cuModule. Note this lookup is repeated on every call of this function. */
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuModule, "kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuModule, "kernel_cuda_filter_nlm_normalize"));
+ /* Request the L1-preferring cache configuration for each of the five kernels. */
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+ /* The block size limit is queried for cuNLMCalcDifference only, yet the resulting launch shape is reused for all five kernels.
+  * NOTE(review): this is only valid if the other kernels accept at least as many threads per block -- confirm. */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuNLMCalcDifference));
+ /* Square thread block of side floor(sqrt(threads_per_block)); the grid is the ceil-division of the unaligned rect width/height by that side. */
+ int xthreads = (int)sqrt((float)threads_per_block);
+ int ythreads = (int)sqrt((float)threads_per_block);
+ int xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads;
+ int yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads;
+ /* The argument arrays below store the ADDRESSES of dx, dy and local_rect, so they are built once and each launch
+  * picks up whatever values those variables hold at launch time (they are rewritten on every loop iteration). */
+ int dx, dy;
+ int4 local_rect;
+ void *calc_difference_args[] = {&dx, &dy, &weight, &variance, &difference, &local_rect, &w, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+ void *update_output_args[] = {&dx, &dy, &blurDifference, &image, &out, &weightAccum, &local_rect, &w, &f};
+ /* Visit every offset of the (2r+1)x(2r+1) window in row-major order: dy and dx each run from -r to +r. */
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ dy = i / (2*r+1) - r;
+ dx = i % (2*r+1) - r;
+ local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+ /* local_rect is relative to the rect origin and shrunk on the side the offset points away from: for x in [local_rect.x, local_rect.z)
+  * and y in [local_rect.y, local_rect.w), (x+dx, y+dy) stays inside [0, width) x [0, height) (assuming the kernels treat z/w as exclusive -- confirm).
+  * Launch order per offset: calc_difference, blur, calc_weight, blur (same arguments as the first blur), update_output.
+  * Every launch uses the same grid/block shape, 0 bytes of dynamic shared memory and stream 0. */
+ cuda_assert(cuLaunchKernel(cuNLMCalcDifference,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, calc_difference_args, 0));
+ cuda_assert(cuLaunchKernel(cuNLMBlur,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, blur_args, 0));
+ cuda_assert(cuLaunchKernel(cuNLMCalcWeight,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, calc_weight_args, 0));
+ cuda_assert(cuLaunchKernel(cuNLMBlur,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, blur_args, 0));
+ cuda_assert(cuLaunchKernel(cuNLMUpdateOutput,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, update_output_args, 0));
+ }
+ /* After all offsets: reset local_rect to the full (origin-relative) rect and run the normalize kernel on out and weightAccum. */
+ local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+ void *normalize_args[] = {&out, &weightAccum, &local_rect, &w};
+ cuda_assert(cuLaunchKernel(cuNLMNormalize,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, normalize_args, 0));
+ }
+
void denoise(RenderTile &rtile, int sample)
{
if(have_error())
@@ -892,62 +962,38 @@ public:
for(int frame = 0; frame < rtile.buffers->params.frames; frame++) {
CUdeviceptr d_denoise_buffer = CUDA_PTR_ADD(d_denoise_buffers, frame_stride*frame);
CUdeviceptr d_buffer = CUDA_PTR_ADD(d_buffers, frame*rtile.buffers->params.width*rtile.buffers->params.height*rtile.buffers->params.get_passes_size());
- /* ==== Step 1: Prefilter general features. ==== */
- {
- int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
- int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
- int offset_to[] = { 0, 2, 4, 10, 12, 14, 6 };
- for(int i = 0; i < 7; i++) {
- CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, offset_to[i]*pass_stride);
- CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, (offset_to[i]+1)*pass_stride);
- CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
-
- void *get_feature_args[] = {&sample, &d_buffer, &mean_from[i], &variance_from[i],
- &buffer_area,
- &rtile.offset, &rtile.stride,
- &d_unfiltered, &d_variance,
- &rect};
- cuda_assert(cuLaunchKernel(cuFilterGetFeature,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, get_feature_args, 0));
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- float a = 1.0f, k_2 = 0.25f;
- int r = 4, f = 2;
- void *filter_feature_args[] = {&d_unfiltered, &d_unfiltered, &d_variance, &d_mean,
- &rect,
- &r, &f, &a, &k_2};
- cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, filter_feature_args, 0));
- }
- }
- /*
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list