[Bf-blender-cvs] [ca6d583008] soc-2016-cycles_denoising: Cycles: Implement new NLM reconstruction kernels
Lukas Stockner
noreply at git.blender.org
Fri Jan 20 05:46:43 CET 2017
Commit: ca6d583008ffb5f0d23ab66a324f5ad5311da951
Author: Lukas Stockner
Date: Fri Jan 13 16:45:13 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBca6d583008ffb5f0d23ab66a324f5ad5311da951
Cycles: Implement new NLM reconstruction kernels
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/filter/filter.h
M intern/cycles/kernel/filter/filter_final_pass_impl.h
M intern/cycles/kernel/filter/filter_nlm_cpu.h
M intern/cycles/kernel/filter/filter_nlm_gpu.h
M intern/cycles/kernel/filter/filter_prefilter.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index dfc6995e53..ae7e24f0ef 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -146,6 +146,9 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, int, void*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, int, int, float*, void*, float*, float3*, int*, int)> filter_finalize_kernel;
+
#define KERNEL_FUNCTIONS(name) \
KERNEL_NAME_EVAL(cpu, name), \
KERNEL_NAME_EVAL(cpu_sse2, name), \
@@ -170,7 +173,9 @@ public:
filter_nlm_blur_kernel(KERNEL_FUNCTIONS(filter_nlm_blur)),
filter_nlm_calc_weight_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_weight)),
filter_nlm_update_output_kernel(KERNEL_FUNCTIONS(filter_nlm_update_output)),
- filter_nlm_normalize_kernel(KERNEL_FUNCTIONS(filter_nlm_normalize))
+ filter_nlm_normalize_kernel(KERNEL_FUNCTIONS(filter_nlm_normalize)),
+ filter_nlm_construct_gramian_kernel(KERNEL_FUNCTIONS(filter_nlm_construct_gramian)),
+ filter_finalize_kernel(KERNEL_FUNCTIONS(filter_finalize))
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
@@ -473,29 +478,57 @@ public:
bool use_gradients = kg->__data.integrator.use_gradients;
int hw = kg->__data.integrator.half_window;
- FilterStorage *storage = new FilterStorage[filter_area.z*filter_area.w];
- float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
+ int storage_num = filter_area.z*filter_area.w;
+ FilterStorage *storage = new FilterStorage[storage_num];
int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
int pass_stride = w*h;
+ float *XtWX = new float[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)*storage_num];
+ float3 *XtWY = new float3[(DENOISE_FEATURES+1)*storage_num];
+ memset(XtWX, 0, sizeof(float)*(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)*storage_num);
+ memset(XtWY, 0, sizeof(float3)*(DENOISE_FEATURES+1)*storage_num);
+
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
filter_construct_transform_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, storage + y*filter_area.z + x, &rect.x);
- filter_reconstruct_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
+ //filter_reconstruct_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
}
}
- if(use_gradients) {
+ {
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = kg->__data.integrator.weighting_adjust;
+ float *weight = filter_buffer + 16*pass_stride;
+ float *variance = filter_buffer + 17*pass_stride;
+ float *difference = new float[pass_stride];
+ float *blurDifference = new float[pass_stride];
+ int local_filter_rect[4] = {filter_area.x-rect.x, filter_area.y-rect.y, filter_area.z, filter_area.w};
+ for(int i = 0; i < (2*hw+1)*(2*hw+1); i++) {
+ int dy = i / (2*hw+1) - hw;
+ int dx = i % (2*hw+1) - hw;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, 2*pass_stride, a, k_2);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+ filter_nlm_construct_gramian_kernel()(dx, dy, blurDifference, filter_buffer, 0*pass_stride, storage, XtWX, XtWY, local_rect, local_filter_rect, w, h, 4);
+ }
+ delete[] difference;
+ delete[] blurDifference;
+ int buffer_params[4] = {offset, stride, kg->__data.film.pass_stride, kg->__data.film.pass_no_denoising};
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
- filter_divide_combined_kernel()(kg, x + filter_area.x, y + filter_area.y, sample, buffers, offset, stride);
+ filter_finalize_kernel()(x + filter_area.x, y + filter_area.y, y*filter_area.z + x, w, h, buffers, storage, XtWX, XtWY, buffer_params, sample);
}
}
}
delete[] storage;
- delete[] weight_cache;
+ delete[] XtWX;
+ delete[] XtWY;
}
void thread_render(DeviceTask& task)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 5d347aea22..8ec6ca6b91 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -907,13 +907,12 @@ public:
cuda_push_context();
- CUfunction cuFilterDivideShadow, cuFilterGetFeature, cuFilterNonLocalMeans, cuFilterCombineHalves;
+ CUfunction cuFilterDivideShadow, cuFilterGetFeature, cuFilterCombineHalves;
CUfunction cuFilterConstructTransform, cuFilterReconstruct, cuFilterDivideCombined;
CUdeviceptr d_buffers = cuda_device_ptr(rtile.buffer);
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuModule, "kernel_cuda_filter_divide_shadow"));
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuModuleGetFunction(&cuFilterNonLocalMeans, cuModule, "kernel_cuda_filter_non_local_means"));
cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuModule, "kernel_cuda_filter_combine_halves"));
cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuModule, "kernel_cuda_filter_construct_transform"));
@@ -922,7 +921,6 @@ public:
cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuFilterNonLocalMeans, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
bool l1 = false;
@@ -954,6 +952,7 @@ public:
CUdeviceptr d_denoise_buffers;
int w = align_up(rect.z - rect.x, 4);
+ int h = (rect.w - rect.y);
int frame_stride = w*(rect.w - rect.y);
int pass_stride = frame_stride*rtile.buffers->params.frames;
cuda_assert(cuMemAlloc(&d_denoise_buffers, 22*pass_stride*sizeof(float)));
@@ -1113,7 +1112,6 @@ public:
}
}
}
-#undef CUDA_PTR_ADD
#ifdef WITH_CYCLES_DEBUG_FILTER
#define WRITE_DEBUG(name, pass) debug_write_pfm(string_printf("debug_%dx%d_cuda_feature%d_%s.pfm", rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan, i, name).c_str(), host_denoise_buffer+pass*pass_stride, rtile.w, rtile.h, 1, w)
@@ -1128,9 +1126,10 @@ public:
#endif
/* Use the prefiltered feature to denoise the image. */
+ int storage_num = filter_area.z*filter_area.w;
CUdeviceptr d_storage, d_transforms;
- cuda_assert(cuMemAlloc(&d_storage, filter_area.z*filter_area.w*sizeof(CUDAFilterStorage)));
- cuda_assert(cuMemAlloc(&d_transforms, filter_area.z*filter_area.w*sizeof(float)*DENOISE_FEATURES*DENOISE_FEATURES));
+ cuda_assert(cuMemAlloc(&d_storage, storage_num*sizeof(CUDAFilterStorage)));
+ cuda_assert(cuMemAlloc(&d_transforms, storage_num*sizeof(float)*DENOISE_FEATURES*DENOISE_FEATURES));
xthreads = (int)sqrt((float)threads_per_block);
ythreads = (int)sqrt((float)threads_per_block);
@@ -1148,33 +1147,85 @@ public:
xthreads, ythreads, 1, /* threads */
0, 0, transform_args, 0));
- void *final_args[] = {&sample,
- &d_denoise_buffers,
- &rtile.offset,
- &rtile.stride,
- &d_transforms,
- &d_storage,
- &d_buffers,
- &filter_area,
- &rect};
- cuda_assert(cuLaunchKernel(cuFilterReconstruct,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, final_args, 0));
- cuda_assert(cuCtxSynchronize());
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuModule, "kernel_cuda_filter_nlm_construct_gramian"));
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuModule, "kernel_cuda_filter_finalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+
+ xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads;
+ yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads;
+
+ int dx, dy;
+ int4 local_rect, local_filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, filter_area.z, filter_area.w);
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = kernel_globals.integrator.weighting_adjust;
+ int color_pass = 0;
+
+ CUdeviceptr color_buffer = CUDA_PTR_ADD(d_denoise_buffers, 16*pass_stride);
+ CUdeviceptr variance_buffer = CUDA_PTR_ADD(d_denoise_buffers, 17*pass_stride);
+ CUdeviceptr d_difference, d_blurDifference, d_XtWX, d_XtWY;
+ cuda_assert(cuMemAlloc(&d_difference, pass_stride*sizeof(float)));
+ cuda_assert(cuMemAlloc(&d_blurDifference, pass_stride*sizeof(float)));
+ cuda_assert(cuMemAlloc(&d_XtWX, storage_num*sizeo
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs mailing list