[Bf-blender-cvs] [58c353d] soc-2016-cycles_denoising: Cycles: Fix CUDA compilation with NLM
Lukas Stockner
noreply at git.blender.org
Tue Nov 22 04:25:23 CET 2016
Commit: 58c353d4de8231779250cf7c893b3f61a662d4b6
Author: Lukas Stockner
Date: Wed Nov 16 15:58:15 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB58c353d4de8231779250cf7c893b3f61a662d4b6
Cycles: Fix CUDA compilation with NLM
===================================================================
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/kernel_filter.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
===================================================================
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 377a9e5..ec7a4e7 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -832,7 +832,8 @@ public:
cuda_push_context();
CUfunction cuFilterDivideShadow, cuFilterGetFeature, cuFilterNonLocalMeans, cuFilterCombineHalves;
- CUfunction cuFilterConstructTransform, cuFilterEstimateBandwidths, cuFilterEstimateBiasVariance, cuFilterCalculateBandwidth, cuFilterFinalPass;
+ CUfunction cuFilterConstructTransform, cuFilterEstimateBandwidths, cuFilterEstimateBiasVariance, cuFilterCalculateBandwidth;
+ CUfunction cuFilterFinalPassWLR, cuFilterFinalPassNLM, cuFilterDivideCombined;
CUdeviceptr d_buffers = cuda_device_ptr(rtile.buffer);
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuModule, "kernel_cuda_filter_divide_shadow"));
@@ -844,7 +845,9 @@ public:
cuda_assert(cuModuleGetFunction(&cuFilterEstimateBandwidths, cuModule, "kernel_cuda_filter_estimate_bandwidths"));
cuda_assert(cuModuleGetFunction(&cuFilterEstimateBiasVariance, cuModule, "kernel_cuda_filter_estimate_bias_variance"));
cuda_assert(cuModuleGetFunction(&cuFilterCalculateBandwidth, cuModule, "kernel_cuda_filter_calculate_bandwidth"));
- cuda_assert(cuModuleGetFunction(&cuFilterFinalPass, cuModule, "kernel_cuda_filter_final_pass"));
+ cuda_assert(cuModuleGetFunction(&cuFilterFinalPassWLR, cuModule, "kernel_cuda_filter_final_pass_wlr"));
+ cuda_assert(cuModuleGetFunction(&cuFilterFinalPassNLM, cuModule, "kernel_cuda_filter_final_pass_nlm"));
+ cuda_assert(cuModuleGetFunction(&cuFilterDivideCombined, cuModule, "kernel_cuda_filter_divide_combined"));
cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
@@ -857,7 +860,9 @@ public:
cuda_assert(cuFuncSetCacheConfig(cuFilterEstimateBandwidths, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED));
cuda_assert(cuFuncSetCacheConfig(cuFilterEstimateBiasVariance, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED));
cuda_assert(cuFuncSetCacheConfig(cuFilterCalculateBandwidth, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED));
- cuda_assert(cuFuncSetCacheConfig(cuFilterFinalPass, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterFinalPassWLR, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterFinalPassNLM, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideCombined, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED));
if(have_error())
return;
@@ -873,7 +878,7 @@ public:
min(filter_area.y + filter_area.w + hw, buffer_area.y + buffer_area.w));
int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterFinalPass));
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterFinalPassWLR));
int xthreads = (int)sqrt((float)threads_per_block);
int ythreads = (int)sqrt((float)threads_per_block);
@@ -1110,48 +1115,78 @@ public:
xblocks , yblocks, 1, /* blocks */
xthreads, ythreads, 1, /* threads */
0, 0, transform_args, 0));
- cuda_assert(cuLaunchKernel(cuFilterEstimateBandwidths,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, transform_args, 0));
- for(int g = 0; g < 6; g++) {
- void *bias_variance_args[] = {&sample,
- &d_denoise_buffers,
- &d_transforms,
- &d_storage,
- &filter_area,
- &rect,
- &g};
- cuda_assert(cuLaunchKernel(cuFilterEstimateBiasVariance,
+ if(getenv("NLM_FILTER")) {
+ void *final_args[] = {&sample,
+ &d_denoise_buffers,
+ &rtile.offset,
+ &rtile.stride,
+ &d_transforms,
+ &d_storage,
+ &d_buffers,
+ &filter_area,
+ &rect};
+ cuda_assert(cuLaunchKernel(cuFilterFinalPassNLM,
xblocks , yblocks, 1, /* blocks */
xthreads, ythreads, 1, /* threads */
- 0, 0, bias_variance_args, 0));
+ 0, 0, final_args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ void *divide_args[] = {&d_buffers,
+ &sample,
+ &rtile.offset,
+ &rtile.stride,
+ &filter_area};
+ cuda_assert(cuLaunchKernel(cuFilterDivideCombined,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, divide_args, 0));
}
+ else {
+ cuda_assert(cuLaunchKernel(cuFilterEstimateBandwidths,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, transform_args, 0));
+
+ for(int g = 0; g < 6; g++) {
+ void *bias_variance_args[] = {&sample,
+ &d_denoise_buffers,
+ &d_transforms,
+ &d_storage,
+ &filter_area,
+ &rect,
+ &g};
+ cuda_assert(cuLaunchKernel(cuFilterEstimateBiasVariance,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, bias_variance_args, 0));
+ }
- void *bandwidth_args[] = {&sample,
- &d_storage,
- &filter_area};
- cuda_assert(cuLaunchKernel(cuFilterCalculateBandwidth,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, bandwidth_args, 0));
-
- void *final_args[] = {&sample,
- &d_denoise_buffers,
- &rtile.offset,
- &rtile.stride,
- &d_transforms,
- &d_storage,
- &d_buffers,
- &filter_area,
- &rect};
- cuda_assert(cuLaunchKernel(cuFilterFinalPass,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, final_args, 0));
+ void *bandwidth_args[] = {&sample,
+ &d_storage,
+ &filter_area};
+ cuda_assert(cuLaunchKernel(cuFilterCalculateBandwidth,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, bandwidth_args, 0));
+
+ void *final_args[] = {&sample,
+ &d_denoise_buffers,
+ &rtile.offset,
+ &rtile.stride,
+ &d_transforms,
+ &d_storage,
+ &d_buffers,
+ &filter_area,
+ &rect};
+ cuda_assert(cuLaunchKernel(cuFilterFinalPassWLR,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, final_args, 0));
- cuda_assert(cuCtxSynchronize());
+ cuda_assert(cuCtxSynchronize());
+ }
#ifdef WITH_CYCLES_DEBUG_FILTER
CUDAFilterStorage *host_storage = new CUDAFilterStorage[filter_area.z*filter_area.w];
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index 4f6290f..27c1387 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -427,7 +427,7 @@ ccl_device void kernel_filter_final_pass_wlr(KernelGlobals *kg, int sample, floa
if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
- float weight = filter_fill_design_row_cuda(features, rank, design_row, feature_transform, bandwidth_factor);
+ float weight = filter_fill_design_row_cuda(features, rank, design_row, transform, transform_stride, bandwidth_factor);
if(weight == 0.0f) continue;
weight /= max(1.0f, variance);
@@ -568,7 +568,7 @@ ccl_device void kernel_filter_final_pass_nlm(KernelGlobals *kg, int sample, floa
if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) continue;
filter_get_features(px, py, pt, pixel_buffer, features, feature_means, pass_stride);
- filter_fill_design_row_no_weight_cuda(features, rank, design_row, feature_transform, bandwidth_factor);
+ filter_fill_design_row_no_weight_cuda(features, rank, design_row, transform, transform_stride);
float weight = nlm_weight(x, y, px, py, center_buffer, pixel_buffer, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
if(weight == 0.0f) continue;
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 7e1f032..fa194a3 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -313,14 +313,38 @@ kernel_cuda_filter_calculate_bandwidth(int sample, void *storage, int4 filter_ar
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH,
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list