[Bf-blender-cvs] [29105ad] soc-2016-cycles_denoising: Cycles: Use separate struct for CUDA denoising storage to avoid allocating the transform memory twice
Lukas Stockner
noreply at git.blender.org
Tue Aug 23 19:06:07 CEST 2016
Commit: 29105adbab70bc25d9624bdab52523b409ee3570
Author: Lukas Stockner
Date: Tue Aug 23 17:34:04 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB29105adbab70bc25d9624bdab52523b409ee3570
Cycles: Use separate struct for CUDA denoising storage to avoid allocating the transform memory twice
===================================================================
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/kernel_filter.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
===================================================================
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index f98fd76..d2e5ed3 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1078,7 +1078,7 @@ public:
/* Use the prefiltered feature to denoise the image. */
int4 filter_area = make_int4(rtile.x + overscan, rtile.y + overscan, rtile.w - 2*overscan, rtile.h - 2*overscan);
CUdeviceptr d_storage, d_transforms;
- cuda_assert(cuMemAlloc(&d_storage, filter_area.z*filter_area.w*sizeof(FilterStorage)));
+ cuda_assert(cuMemAlloc(&d_storage, filter_area.z*filter_area.w*sizeof(CUDAFilterStorage)));
cuda_assert(cuMemAlloc(&d_transforms, filter_area.z*filter_area.w*sizeof(float)*DENOISE_FEATURES*DENOISE_FEATURES));
xthreads = (int)sqrt((float)threads_per_block);
@@ -1140,9 +1140,9 @@ public:
cuda_assert(cuCtxSynchronize());
#ifdef WITH_CYCLES_DEBUG_FILTER
- FilterStorage *host_storage = new FilterStorage[filter_area.z*filter_area.w];
- cuda_assert(cuMemcpyDtoH(host_storage, d_storage, sizeof(FilterStorage)*filter_area.z*filter_area.w));
-#define WRITE_DEBUG(name, var) debug_write_pfm(string_printf("debug_%dx%d_cuda_%s.pfm", rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan, name).c_str(), &host_storage[0].var, filter_area.z, filter_area.w, sizeof(FilterStorage)/sizeof(float), filter_area.z);
+ CUDAFilterStorage *host_storage = new CUDAFilterStorage[filter_area.z*filter_area.w];
+ cuda_assert(cuMemcpyDtoH(host_storage, d_storage, sizeof(CUDAFilterStorage)*filter_area.z*filter_area.w));
+#define WRITE_DEBUG(name, var) debug_write_pfm(string_printf("debug_%dx%d_cuda_%s.pfm", rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan, name).c_str(), &host_storage[0].var, filter_area.z, filter_area.w, sizeof(CUDAFilterStorage)/sizeof(float), filter_area.z);
for(int i = 0; i < DENOISE_FEATURES; i++) {
WRITE_DEBUG(string_printf("mean_%d", i).c_str(), means[i]);
WRITE_DEBUG(string_printf("scale_%d", i).c_str(), scales[i]);
diff --git a/intern/cycles/kernel/kernel_filter.h b/intern/cycles/kernel/kernel_filter.h
index b9f865f..e842ab2 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN
#define NORM_FEATURE_NUM 8
#ifdef __KERNEL_CUDA__
-ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, float *transform, FilterStorage *storage, int4 rect, int transform_stride, int localIdx)
+ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, float *transform, CUDAFilterStorage *storage, int4 rect, int transform_stride, int localIdx)
{
__shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
float *features = shared_features + localIdx*DENOISE_FEATURES;
@@ -115,7 +115,7 @@ ccl_device void kernel_filter_construct_transform(KernelGlobals *kg, int sample,
#endif
}
-ccl_device void kernel_filter_estimate_bandwidths(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, float ccl_readonly_ptr transform, FilterStorage *storage, int4 rect, int transform_stride, int localIdx)
+ccl_device void kernel_filter_estimate_bandwidths(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, float ccl_readonly_ptr transform, CUDAFilterStorage *storage, int4 rect, int transform_stride, int localIdx)
{
__shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
float *features = shared_features + localIdx*DENOISE_FEATURES;
@@ -176,7 +176,7 @@ ccl_device void kernel_filter_estimate_bandwidths(KernelGlobals *kg, int sample,
storage->bandwidth[i] = 0.0f;
}
-ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, float ccl_readonly_ptr transform, FilterStorage *storage, int4 rect, int candidate, int transform_stride, int localIdx)
+ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, float ccl_readonly_ptr transform, CUDAFilterStorage *storage, int4 rect, int candidate, int transform_stride, int localIdx)
{
__shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
float *features = shared_features + DENOISE_FEATURES*localIdx;
@@ -272,7 +272,7 @@ ccl_device void kernel_filter_estimate_bias_variance(KernelGlobals *kg, int samp
storage->est_variance[candidate] = est_variance;
}
-ccl_device void kernel_filter_calculate_bandwidth(KernelGlobals *kg, int sample, FilterStorage *storage)
+ccl_device void kernel_filter_calculate_bandwidth(KernelGlobals *kg, int sample, CUDAFilterStorage *storage)
{
const float candidate_bw[6] = {0.05f, 0.1f, 0.25f, 0.5f, 1.0f, 2.0f};
double bias_XtX = 0.0, bias_XtY = 0.0, var_XtX = 0.0, var_XtY = 0.0;
@@ -291,7 +291,7 @@ ccl_device void kernel_filter_calculate_bandwidth(KernelGlobals *kg, int sample,
storage->global_bandwidth = (float) pow((storage->rank * variance_coef) / (4.0 * bias_coef*bias_coef * sample), 1.0 / (storage->rank + 4));
}
-ccl_device void kernel_filter_final_pass(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, float ccl_readonly_ptr transform, FilterStorage *storage, int4 filter_area, int4 rect, int transform_stride, int localIdx)
+ccl_device void kernel_filter_final_pass(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, float ccl_readonly_ptr transform, CUDAFilterStorage *storage, int4 filter_area, int4 rect, int transform_stride, int localIdx)
{
__shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
float *features = shared_features + DENOISE_FEATURES*localIdx;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 2acab17..47d61bb 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1325,7 +1325,6 @@ typedef struct FilterStorage {
float bandwidth[DENOISE_FEATURES];
int rank;
float global_bandwidth;
- float est_bias[6], est_variance[6];
#ifdef WITH_CYCLES_DEBUG_FILTER
float filtered_global_bandwidth;
float sum_weight;
@@ -1335,6 +1334,20 @@ typedef struct FilterStorage {
#endif
} FilterStorage;
+typedef struct CUDAFilterStorage {
+ float bandwidth[DENOISE_FEATURES];
+ int rank;
+ float global_bandwidth;
+ float est_bias[6], est_variance[6];
+#ifdef WITH_CYCLES_DEBUG_FILTER
+ float filtered_global_bandwidth;
+ float sum_weight;
+ float means[DENOISE_FEATURES], scales[DENOISE_FEATURES], singular[DENOISE_FEATURES];
+ float singular_threshold, feature_matrix_norm;
+ float log_rmse_per_sample;
+#endif
+} CUDAFilterStorage;
+
CCL_NAMESPACE_END
#endif /* __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 2e5a396..444d4b4 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -267,7 +267,7 @@ kernel_cuda_filter_construct_transform(int sample, float const* __restrict__ buf
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
- FilterStorage *l_storage = ((FilterStorage*) storage) + y*filter_area.z + x;
+ CUDAFilterStorage *l_storage = ((CUDAFilterStorage*) storage) + y*filter_area.z + x;
float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(NULL, sample, buffer, x + filter_area.x, y + filter_area.y, l_transform, l_storage, rect, filter_area.z*filter_area.w, threadIdx.y*blockDim.x + threadIdx.x);
}
@@ -280,7 +280,7 @@ kernel_cuda_filter_estimate_bandwidths(int sample, float const* __restrict__ buf
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
- FilterStorage *l_storage = ((FilterStorage*) storage) + y*filter_area.z + x;
+ CUDAFilterStorage *l_storage = ((CUDAFilterStorage*) storage) + y*filter_area.z + x;
float const* __restrict__ l_transform = transform + y*filter_area.z + x;
kernel_filter_estimate_bandwidths(NULL, sample, buffer, x + filter_area.x, y + filter_area.y, l_transform, l_storage, rect, filter_area.z*filter_area.w, threadIdx.y*blockDim.x + threadIdx.x);
}
@@ -293,7 +293,7 @@ kernel_cuda_filter_estimate_bias_variance(int sample, float const* __restrict__
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
- FilterStorage *l_storage = ((FilterStorage*) storage) + y*filter_area.z + x;
+ CUDAFilterStorage *l_storage = ((CUDAFilterStorage*) storage) + y*filter_area.z + x;
float const* __restrict__ l_transform = transform + y*filter_area.z + x;
kernel_filter_estimate_bias_variance(NULL, sample, buffer, x + filter_area.x, y + filter_area.y, l_transform, l_storage, rect, candidate, filter_area.z*filter_area.w, threadIdx.y*blockDim.x + threadIdx.x);
}
@@ -306,7 +306,7 @@ kernel_cuda_filter_calculate_bandwidth(int sample, void *storage, int4 filter_ar
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
- FilterStorage *l_storage = ((FilterStorage*) storage) + y*filter_area.z + x;
+ CUDAFilterStorage *l_storage = ((CUDAFilterStorage*) storage) + y*filter_area.z + x;
kernel_filter_calculate_bandwidth(NULL, sample, l_storage);
}
}
@@ -318,7 +318,7 @@ kernel_cuda_filter_final_pass(int sample, float* buffer, int offset, int stride,
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
- FilterStorage *l_storage = ((FilterStorage*) storage) + y*filter_area.z + x;
+ CUDAFilterStorage *l_storage = ((CUDAFilterStorage*) storage) + y*filter_area.z + x;
floa
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list