[Bf-blender-cvs] [38f8ff4] soc-2016-cycles_denoising: Cycles: Adapt CUDA device to the new denoise buffer architecture
Lukas Stockner
noreply at git.blender.org
Sat Aug 6 05:41:10 CEST 2016
Commit: 38f8ff430421621d98d18efb352f9c6e53ad7e52
Author: Lukas Stockner
Date: Sat Aug 6 05:15:46 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB38f8ff430421621d98d18efb352f9c6e53ad7e52
Cycles: Adapt CUDA device to the new denoise buffer architecture
===================================================================
M intern/cycles/device/device_cuda.cpp
===================================================================
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index b6e48c7..b30c4e1 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -779,7 +779,7 @@ public:
cuda_push_context();
CUfunction cuFilterDivideShadow, cuFilterGetFeature, cuFilterNonLocalMeans, cuFilterCombineHalves, cuFilterEstimateParams, cuFilterFinalPass;
- CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
+ CUdeviceptr d_buffers = cuda_device_ptr(rtile.buffer);
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuModule, "kernel_cuda_filter_divide_shadow"));
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuModule, "kernel_cuda_filter_get_feature"));
@@ -799,9 +799,8 @@ public:
return;
int overscan = rtile.buffers->params.overscan;
- int filter_w = rtile.buffers->params.final_width, filter_h = rtile.buffers->params.final_height;
- int4 prefilter_rect = make_int4(rtile.x, rtile.y, rtile.x + rtile.w, rtile.y + rtile.h);
+ int4 rect = make_int4(rtile.x, rtile.y, rtile.x + rtile.w, rtile.y + rtile.h);
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterEstimateParams));
@@ -811,163 +810,193 @@ public:
int xblocks = (rtile.w + xthreads - 1)/xthreads;
int yblocks = (rtile.h + ythreads - 1)/ythreads;
- CUdeviceptr d_prefiltered, d_unfiltered, d_sampleVV, d_cleanV, d_sampleV, d_bufferV, d_unfilteredA, d_unfilteredB, d_null, d_prefiltered1;
- cuda_assert(cuMemAlloc(&d_prefiltered, 16*rtile.w*rtile.h*sizeof(float)));
- cuda_assert(cuMemAlloc(&d_unfiltered, rtile.w*rtile.h*2*sizeof(float)));
-
- int m_offsets[] = {0, 1, 2, 6, 7, 8, 12};
- int variances[] = {3, 4, 5, 9, 10, 11, 13};
- for(int i = 0; i < 7; i++) {
- CUdeviceptr d_prefiltered_mean = (CUdeviceptr) (((float*) d_prefiltered) + 2*i*rtile.w*rtile.h);
- CUdeviceptr d_prefiltered_var = (CUdeviceptr) (((float*) d_prefiltered) + (2*i+1)*rtile.w*rtile.h);
-
- void *get_feature_args[] = {&sample, &d_buffer, &m_offsets[i], &variances[i],
- &rtile.x, &rtile.y, &rtile.w, &rtile.h,
- &rtile.offset, &rtile.stride,
- &d_unfiltered, &d_prefiltered_var,
- &prefilter_rect};
- cuda_assert(cuLaunchKernel(cuFilterGetFeature,
+ CUdeviceptr d_denoise_buffer;
+ cuda_assert(cuMemAlloc(&d_denoise_buffer, 22*rtile.w*rtile.h*sizeof(float)));
+ int pass_stride = rtile.w*rtile.h;
+#define CUDA_PTR_ADD(ptr, x) ((CUdeviceptr) (((float*) (ptr)) + (x)))
+
+ /* ==== Step 1: Prefilter general features. ==== */
+ {
+ int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
+ int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
+ int offset_to[] = { 0, 2, 4, 10, 12, 14, 6 };
+ for(int i = 0; i < 7; i++) {
+ CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, offset_to[i]*pass_stride);
+ CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, (offset_to[i]+1)*pass_stride);
+ CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
+
+ void *get_feature_args[] = {&sample, &d_buffers, &mean_from[i], &variance_from[i],
+ &rtile.x, &rtile.y, &rtile.w, &rtile.h,
+ &rtile.offset, &rtile.stride,
+ &d_unfiltered, &d_variance,
+ &rect};
+ cuda_assert(cuLaunchKernel(cuFilterGetFeature,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, get_feature_args, 0));
+
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ float a = 1.0f, k_2 = 0.25f;
+ int r = 4, f = 2;
+ void *filter_feature_args[] = {&d_unfiltered, &d_unfiltered, &d_variance, &d_mean,
+ &rect,
+ &r, &f, &a, &k_2};
+ cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, filter_feature_args, 0));
+ }
+ }
+
+ /* ==== Step 2: Prefilter shadow feature. ==== */
+ {
+ CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, 8*pass_stride);
+ CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, 9*pass_stride);
+ /* Reuse some passes of the filter_buffer for temporary storage. */
+ CUdeviceptr d_sampleV = CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
+ CUdeviceptr d_sampleVV = CUDA_PTR_ADD(d_denoise_buffer, 17*pass_stride);
+ CUdeviceptr d_bufferV = CUDA_PTR_ADD(d_denoise_buffer, 18*pass_stride);
+ CUdeviceptr d_cleanV = CUDA_PTR_ADD(d_denoise_buffer, 19*pass_stride);
+ CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 20*pass_stride);
+ CUdeviceptr d_unfilteredA = CUDA_PTR_ADD(d_denoise_buffer, 20*pass_stride);
+ CUdeviceptr d_unfilteredB = CUDA_PTR_ADD(d_denoise_buffer, 21*pass_stride);
+ CUdeviceptr d_null = (CUdeviceptr) 0;
+ /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+ void *divide_args[] = {&sample, &d_buffers,
+ &rtile.x, &rtile.y, &rtile.w, &rtile.h,
+ &rtile.offset, &rtile.stride,
+ &d_unfiltered, &d_sampleV, &d_sampleVV, &d_bufferV,
+ &rect};
+ cuda_assert(cuLaunchKernel(cuFilterDivideShadow,
xblocks , yblocks, 1, /* blocks */
xthreads, ythreads, 1, /* threads */
- 0, 0, get_feature_args, 0));
+ 0, 0, divide_args, 0));
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- float a = 1.0f, k_2 = 0.25f;
- int r = 4, f = 2;
- void *filter_variance_args[] = {&d_unfiltered, &d_unfiltered, &d_prefiltered_var, &d_prefiltered_mean,
- &prefilter_rect,
+ float a = 4.0f, k_2 = 1.0f;
+ int r = 3, f = 1;
+ void *filter_variance_args[] = {&d_bufferV, &d_sampleV, &d_sampleVV, &d_cleanV,
+ &rect,
&r, &f, &a, &k_2};
cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
xblocks , yblocks, 1, /* blocks */
xthreads, ythreads, 1, /* threads */
0, 0, filter_variance_args, 0));
- }
+ /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+ a = 1.0f; k_2 = 0.25f;
+ r = 5; f = 3;
+ void *filter_unfilteredA_args[] = {&d_unfilteredA, &d_unfilteredB, &d_cleanV, &d_sampleV,
+ &rect,
+ &r, &f, &a, &k_2};
+ cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, filter_unfilteredA_args, 0));
- cuda_assert(cuMemAlloc(&d_sampleVV, rtile.w*rtile.h*sizeof(float)));
- cuda_assert(cuMemAlloc(&d_cleanV, rtile.w*rtile.h*sizeof(float)));
- d_unfilteredA = d_unfiltered;
- d_unfilteredB = (CUdeviceptr) (((float*) d_unfiltered) + rtile.w*rtile.h);
- d_null = (CUdeviceptr) NULL;
- CUdeviceptr d_prefiltered_mean = (CUdeviceptr) (((float*) d_prefiltered) + 14*rtile.w*rtile.h);
- CUdeviceptr d_prefiltered_var = (CUdeviceptr) (((float*) d_prefiltered) + 15*rtile.w*rtile.h);
- d_sampleV = d_prefiltered_mean; /* Reuse memory since they're not both needed at the same time. */
- d_bufferV = d_prefiltered_var;
-
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
- void *divide_args[] = {&sample, &d_buffer,
- &rtile.x, &rtile.y, &rtile.w, &rtile.h,
- &rtile.offset, &rtile.stride,
- &d_unfiltered, &d_sampleV, &d_sampleVV, &d_bufferV,
- &prefilter_rect};
- cuda_assert(cuLaunchKernel(cuFilterDivideShadow,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, divide_args, 0));
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- float a = 4.0f, k_2 = 1.0f;
- int r = 3, f = 1;
- void *filter_variance_args[] = {&d_bufferV, &d_sampleV, &d_sampleVV, &d_cleanV,
- &prefilter_rect,
- &r, &f, &a, &k_2};
- cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, filter_variance_args, 0));
-
- /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- a = 1.0f; k_2 = 0.25f;
- r = 5; f = 3;
- void *filter_unfilteredA_args[] = {&d_unfilteredA, &d_unfilteredB, &d_cleanV, &d_sampleV,
- &prefilter_rect,
- &r, &f, &a, &k_2};
- cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, filter_unfilteredA_args, 0));
+ void *filter_unfilteredB_args[] = {&d_unfilteredB, &d_unfilteredA, &d_cleanV, &d_bufferV,
+ &rect,
+ &r, &f, &a, &k_2};
+ cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
+ xblocks , yblocks, 1, /* blocks */
+
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list