[Bf-blender-cvs] [b4e7747] soc-2016-cycles_denoising: Cycles: Move the final_pass functions to a unified implementation and generally shuffle a lot of code around
Lukas Stockner
noreply at git.blender.org
Tue Nov 22 04:25:38 CET 2016
Commit: b4e774722fc4f2e064e41117bd17132d98f1a61e
Author: Lukas Stockner
Date: Tue Nov 22 04:11:16 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBb4e774722fc4f2e064e41117bd17132d98f1a61e
Cycles: Move the final_pass functions to a unified implementation and generally shuffle a lot of code around
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/kernel/CMakeLists.txt
A intern/cycles/kernel/filter/filter_final_pass_impl.h
M intern/cycles/kernel/kernel_filter.h
M intern/cycles/kernel/kernel_filter_util.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
M intern/cycles/util/util_atomic.h
M intern/cycles/util/util_math.h
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index fcce0ff..61c1e75 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -139,7 +139,7 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)> filter_construct_transform_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)> filter_estimate_wlr_params_kernel;
- KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, int*, int*)> filter_final_pass_wlr_kernel;
+ KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, float*, int*, int*)> filter_final_pass_wlr_kernel;
KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, float*, int*, int*)> filter_final_pass_nlm_kernel;
KernelFunctions<void(*)(int, int, float**, float**, float**, float**, int*, int, int, float, float)> filter_non_local_means_3_kernel;
KernelFunctions<void(*)(KernelGlobals*, float*, int, int, int, int, float, float*, int*)> filter_old_1_kernel;
@@ -453,8 +453,9 @@ public:
bool use_gradients = kg->__data.integrator.use_gradients;
bool nlm_weights = kg->__data.integrator.use_nlm_weights;
- FilterStorage *storage = new FilterStorage[filter_area.z*filter_area.w];
int hw = kg->__data.integrator.half_window;
+ FilterStorage *storage = new FilterStorage[filter_area.z*filter_area.w];
+ float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
int pass_stride = w*h;
@@ -511,14 +512,12 @@ public:
}
}
else if(nlm_weights) {
- float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
filter_construct_transform_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, storage + y*filter_area.z + x, &rect.x);
filter_final_pass_nlm_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
}
}
- delete[] weight_cache;
}
else {
for(int y = 0; y < filter_area.w; y++) {
@@ -541,7 +540,7 @@ public:
#endif
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
- filter_final_pass_wlr_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, &filter_area.x, &rect.x);
+ filter_final_pass_wlr_kernel()(kg, sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
}
}
#ifdef WITH_CYCLES_DEBUG_FILTER
@@ -561,6 +560,7 @@ public:
}
delete[] storage;
+ delete[] weight_cache;
}
void thread_render(DeviceTask& task)
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 9d9764b..84a0636 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -209,6 +209,10 @@ set(SRC_SPLIT_HEADERS
split/kernel_sum_all_radiance.h
)
+set(SRC_FILTER_HEADERS
+ filter/filter_final_pass_impl.h
+)
+
# CUDA module
if(WITH_CYCLES_CUDA_BINARIES)
@@ -346,6 +350,7 @@ add_library(cycles_kernel
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
+ ${SRC_FILTER_HEADERS}
)
if(WITH_CYCLES_CUDA)
@@ -382,4 +387,5 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTAL
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/filter)
diff --git a/intern/cycles/kernel/filter/filter_final_pass_impl.h b/intern/cycles/kernel/filter/filter_final_pass_impl.h
new file mode 100644
index 0000000..9e2b9b1
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_final_pass_impl.h
@@ -0,0 +1,276 @@
+#ifdef __KERNEL_CUDA__
+#define STORAGE_TYPE CUDAFilterStorage
+#else
+#define STORAGE_TYPE FilterStorage
+#endif
+
+ccl_device void FUNCTION_NAME(KernelGlobals *kg, int sample, float ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, int filtered_passes, int2 color_passes, STORAGE_TYPE *storage, float *weight_cache, float ccl_readonly_ptr transform, int transform_stride, int4 filter_area, int4 rect)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int buffer_h = (rect.w - rect.y);
+ int pass_stride = buffer_h * buffer_w * kernel_data.film.num_frames;
+ color_passes *= pass_stride;
+ int num_frames = kernel_data.film.num_frames;
+ int prev_frames = kernel_data.film.prev_frames;
+
+ int2 low = make_int2(max(rect.x, x - kernel_data.integrator.half_window),
+ max(rect.y, y - kernel_data.integrator.half_window));
+ int2 high = make_int2(min(rect.z, x + kernel_data.integrator.half_window + 1),
+ min(rect.w, y + kernel_data.integrator.half_window + 1));
+
+ float ccl_readonly_ptr pixel_buffer;
+ float ccl_readonly_ptr center_buffer = buffer + (y - rect.y) * buffer_w + (x - rect.x);
+ int3 pixel;
+
+ float3 center_color = filter_get_pixel_color(center_buffer + color_passes.x, pass_stride);
+ float sqrt_center_variance = sqrtf(filter_get_pixel_variance(center_buffer + color_passes.x, pass_stride));
+
+ /* NFOR weighting directly writes to the design row, so it doesn't need the feature vector and always uses full rank. */
+#ifndef WEIGHTING_NFOR
+# ifdef __KERNEL_CUDA__
+ /* On GPUs, store the feature vector in shared memory for faster access. */
+ __shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+ float *features = shared_features + DENOISE_FEATURES*(threadIdx.y*blockDim.x + threadIdx.x);
+# else
+ float features[DENOISE_FEATURES];
+# endif
+ const int rank = storage->rank;
+ const int matrix_size = rank+1;
+#else
+ const int matrix_size = DENOISE_FEATURES;
+ float *feature_scales = transform;
+#endif
+
+ float feature_means[DENOISE_FEATURES];
+ filter_get_features(make_int3(x, y, 0), center_buffer, feature_means, NULL, pass_stride);
+
+#ifdef WEIGHTING_WLR
+ /* Apply a median filter to the 3x3 window aroung the current pixel. */
+ int sort_idx = 0;
+ float global_bandwidths[9];
+ for(int dy = max(-1, filter_area.y - y); dy < min(2, filter_area.y+filter_area.w - y); dy++) {
+ for(int dx = max(-1, filter_area.x - x); dx < min(2, filter_area.x+filter_area.z - x); dx++) {
+ int ofs = dy*filter_area.z + dx;
+ if(storage[ofs].rank != rank) continue;
+ global_bandwidths[sort_idx++] = storage[ofs].global_bandwidth;
+ }
+ }
+ /* Insertion-sort the global bandwidths (fast enough for 9 elements). */
+ for(int i = 1; i < sort_idx; i++) {
+ float v = global_bandwidths[i];
+ int j;
+ for(j = i-1; j >= 0 && global_bandwidths[j] > v; j--)
+ global_bandwidths[j+1] = global_bandwidths[j];
+ global_bandwidths[j+1] = v;
+ }
+ float inv_global_bandwidth = 1.0f / (global_bandwidths[sort_idx/2] * kernel_data.integrator.weighting_adjust);
+
+ float bandwidth_factor[DENOISE_FEATURES];
+ for(int i = 0; i < rank; i++) {
+ /* Same as above, divide by the bandwidth since the bandwidth_factor actually is the inverse of the bandwidth. */
+ bandwidth_factor[i] = storage->bandwidth[i] * inv_global_bandwidth;
+ }
+#endif
+
+ /* Essentially, this function is just a first-order regression solver.
+ * We model the pixel color as a linear function of the feature vectors.
+ * So, we search the parameters S that minimize W*(X*S - y), where:
+ * - X is the design matrix containing all the feature vectors
+ * - y is the vector containing all the pixel colors
+ * - W is the diagonal matrix containing all pixel weights
+ * Since this is just regular least-squares, the solution is given by:
+ * S = inv(Xt*W*X)*Xt*W*y */
+
+ float XtWX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], design_row[DENOISE_FEATURES+1];
+
+ math_matrix_zero(XtWX, matrix_size);
+ /* Construct Xt*W*X matrix (and fill weight cache, if used). */
+ FOR_PIXEL_WINDOW {
+ float3 color = filter_get_pixel_color(pixel_buffer + color_passes.x, pass_stride);
+ float variance = filter_get_pixel_variance(pixel_buffer + color_passes.x, pass_stride);
+ if(filter_firefly_rejection(color, variance, center_color, sqrt_center_variance)) {
+#ifdef WEIGHT_CACHING_CUDA
+ if(cache_idx < CUDA_WEIGHT_CACHE_SIZE) weight_cache[cache_idx] = 0.0f;
+#elif defined(WEIGHT_CACHING_CPU)
+ weight_cache[cache_idx] = 0.0f;
+#endif
+ continue;
+ }
+
+#ifdef WEIGHTING_WLR
+ float weight = filter_get_design_row_transform_weight(pixel, pixel_buffer, feature_means, pass_stride, features, rank, design_row, transform, transform_stride, bandwidth_factor);
+#elif defined(WEIGHTING_NLM)
+ filter_get_design_row_transform(pixel, pixel_buffer, feature_means, pass_stride, features, rank, design_row, transform, transform_stride);
+ float weight = nlm_weight(x, y, pixel.x, pixel.y, center_buffer + color_passes.y, pixel_buffer + color_passes.y, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
+#else /* WEIGHTING_NFOR */
+ filter_get_design_row(pixel, pixel_buffer, feature_means, feature_scales, pass_stride, design_row);
+ float weight = nlm_weight(x, y, pixel.x, pixel.y, center_buffer + color_passes.y, pixel_buffer + color_passes.y, pass_stride, 1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
+#endif
+ if(weight < 1e-5f) {
+#ifdef WEIGHT_CACHING_CUDA
+ if(cache_idx < CUDA_WEIGHT_CACHE_SIZE) weight_cache[cache_idx] = 0.0f;
+#elif defined(WEIGHT_CACHING_CPU)
+ weight_cache[cache_idx]
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list