[Bf-blender-cvs] [04abe01b6c] soc-2016-cycles_denoising: Cycles Denoising: Use device-independent denoising code for CUDA as well
Lukas Stockner
noreply at git.blender.org
Thu Feb 9 14:39:39 CET 2017
Commit: 04abe01b6c65de2681af57caed43ef1aa4d1eb9f
Author: Lukas Stockner
Date: Wed Feb 8 22:53:06 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB04abe01b6c65de2681af57caed43ef1aa4d1eb9f
Cycles Denoising: Use device-independent denoising code for CUDA as well
As a result, cross-denoising on CUDA works now.
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/device/device_denoising.cpp
M intern/cycles/device/device_denoising.h
M intern/cycles/filter/filter_defines.h
M intern/cycles/filter/filter_prefilter.h
M intern/cycles/filter/filter_transform.h
M intern/cycles/filter/filter_transform_cuda.h
M intern/cycles/filter/filter_transform_sse.h
M intern/cycles/filter/kernels/cpu/filter_cpu.h
M intern/cycles/filter/kernels/cpu/filter_cpu_impl.h
M intern/cycles/filter/kernels/cuda/filter.cu
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index bd5630ae95..4c12556bf2 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -137,10 +137,10 @@ public:
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
- KernelFunctions<void(*)(int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
- KernelFunctions<void(*)(int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
- KernelFunctions<void(*)(int, int, int, float*, int, int, int, int)> filter_divide_combined_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, int, int, int, int)> filter_divide_combined_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
@@ -148,7 +148,7 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
- KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float, int, int)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float)> filter_construct_transform_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
@@ -363,9 +363,7 @@ public:
(int*) task->storage.rank.device_pointer,
&task->rect.x,
task->half_window,
- task->pca_threshold,
- 1,
- 0);
+ task->pca_threshold);
}
}
return true;
@@ -463,12 +461,8 @@ public:
for(int y = task->rect.y; y < task->rect.w; y++) {
for(int x = task->rect.x; x < task->rect.z; x++) {
filter_divide_shadow_kernel()(task->render_buffer.samples,
- (float**) task->neighbors.buffers,
+ task->tiles,
x, y,
- task->neighbors.tile_x,
- task->neighbors.tile_y,
- task->neighbors.offsets,
- task->neighbors.strides,
(float*) a_ptr,
(float*) b_ptr,
(float*) sample_variance_ptr,
@@ -492,14 +486,10 @@ public:
for(int y = task->rect.y; y < task->rect.w; y++) {
for(int x = task->rect.x; x < task->rect.z; x++) {
filter_get_feature_kernel()(task->render_buffer.samples,
- (float**) task->neighbors.buffers,
+ task->tiles,
mean_offset,
variance_offset,
x, y,
- task->neighbors.tile_x,
- task->neighbors.tile_y,
- task->neighbors.offsets,
- task->neighbors.strides,
(float*) mean_ptr,
(float*) variance_ptr,
&task->rect.x,
@@ -560,7 +550,7 @@ public:
denoising.filter_area = make_int4(tile.x + overscan, tile.y + overscan, tile.w - 2*overscan, tile.h - 2*overscan);
denoising.render_buffer.samples = end_sample;
- denoising.neighbors.init_from_single_tile(tile);
+ denoising.tiles_from_single_tile(tile);
denoising.init_from_kerneldata(&kg.__data);
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
@@ -583,7 +573,7 @@ public:
RenderTile rtiles[9];
rtiles[4] = tile;
task.get_neighbor_tiles(rtiles);
- denoising.neighbors.init_from_rendertiles(rtiles);
+ denoising.tiles_from_rendertiles(rtiles);
denoising.init_from_kerneldata(&kg.__data);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index ba4424f844..1bbe98113e 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,6 +21,7 @@
#include "device.h"
#include "device_intern.h"
+#include "device_denoising.h"
#include "buffers.h"
@@ -143,7 +144,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -520,7 +521,9 @@ public:
void mem_zero(device_memory& mem)
{
- memset((void*)mem.data_pointer, 0, mem.memory_size());
+ if(mem.data_pointer) {
+ memset((void*)mem.data_pointer, 0, mem.memory_size());
+ }
cuda_push_context();
if(mem.device_pointer)
@@ -542,6 +545,11 @@ public:
}
}
+ virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_offset(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
CUdeviceptr mem;
@@ -845,368 +853,343 @@ public:
}
}
- void non_local_means(int4 rect, CUdeviceptr image, CUdeviceptr weight, CUdeviceptr out, CUdeviceptr variance, CUdeviceptr difference, CUdeviceptr blurDifference, CUdeviceptr weightAccum, int r, int f, float a, float k_2) {
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1)/threads; \
+ int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, \
+ xblocks, yblocks, 1, \
+ threads, threads, 1, \
+ 0, 0, args, 0));
+
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ int4 rect = task->rect;
int w = align_up(rect.z-rect.x, 4);
int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+ CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
- cuda_assert(cuMemsetD8(out, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
- cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuF
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list