[Bf-blender-cvs] [90590d7ddc3] cycles-x: Cycles X: Make converge check return number of active pixels
Sergey Sharybin
noreply at git.blender.org
Wed May 12 16:10:45 CEST 2021
Commit: 90590d7ddc37f7ed3586279549c21b56c7122563
Author: Sergey Sharybin
Date: Thu Apr 22 14:54:01 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB90590d7ddc37f7ed3586279549c21b56c7122563
Cycles X: Make converge check return number of active pixels
Currently no functional changes, but this allows implementing different
adaptive sampling filtering: for example, one which progressively
lowers the noise floor.
From benchmarks on RTX 6000 there is no performance impact.
===================================================================
M intern/cycles/integrator/path_trace.cpp
M intern/cycles/integrator/path_trace_work.h
M intern/cycles/integrator/path_trace_work_cpu.cpp
M intern/cycles/integrator/path_trace_work_cpu.h
M intern/cycles/integrator/path_trace_work_gpu.cpp
M intern/cycles/integrator/path_trace_work_gpu.h
M intern/cycles/kernel/device/cuda/kernel.cu
===================================================================
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index d1d1aee5737..e1a0b4c24e7 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -207,7 +207,7 @@ void PathTrace::adaptive_sample(RenderWork &render_work)
}
tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
- if (!path_trace_work->adaptive_sampling_converge_and_filter(
+ if (path_trace_work->adaptive_sampling_converge_filter_count_active(
render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset)) {
all_pixels_converged = false;
}
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index c34698e2d8d..387d6d6d0b9 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -64,8 +64,8 @@ class PathTraceWork {
virtual void copy_to_gpu_display(GPUDisplay *gpu_display, float sample_scale) = 0;
/* Perform convergence test on the render buffer, and filter the convergence mask.
- * Returns true if all pixels did converge. */
- virtual bool adaptive_sampling_converge_and_filter(float threshold, bool reset) = 0;
+ * Returns number of active pixels (the ones which did not converge yet). */
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
/* Cheap-ish request to see whether rendering is requested and is to be stopped as soon as
* possible, without waiting for any samples to be finished. */
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 247cbe7b054..96ae38c2f9e 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -23,6 +23,7 @@
#include "render/gpu_display.h"
#include "render/scene.h"
+#include "util/util_atomic.h"
#include "util/util_logging.h"
#include "util/util_tbb.h"
@@ -168,7 +169,7 @@ void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, float sample
gpu_display->unmap_texture_buffer();
}
-bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bool reset)
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
{
const int full_x = effective_buffer_params_.full_x;
const int full_y = effective_buffer_params_.full_y;
@@ -179,7 +180,7 @@ bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bo
float *render_buffer = render_buffers_->buffer.data();
- bool all_pixels_converged = true;
+ uint num_active_pixels = 0;
tbb::task_arena local_arena = local_tbb_arena_create(device_);
@@ -189,20 +190,25 @@ bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bo
CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
bool row_converged = true;
+ uint num_row_pixels_active = 0;
for (int x = 0; x < width; ++x) {
- row_converged &= kernels_.adaptive_sampling_convergence_check(
- kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride);
+ if (!kernels_.adaptive_sampling_convergence_check(
+ kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+ ++num_row_pixels_active;
+ row_converged = false;
+ }
}
+ atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
if (!row_converged) {
kernels_.adaptive_sampling_filter_x(
kernel_globals, render_buffer, y, full_x, width, offset, stride);
- all_pixels_converged = false;
}
});
});
- if (!all_pixels_converged) {
+ if (num_active_pixels) {
local_arena.execute([&]() {
tbb::parallel_for(full_x, full_x + width, [&](int x) {
CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
@@ -212,7 +218,7 @@ bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bo
});
}
- return all_pixels_converged;
+ return num_active_pixels;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 4d8dc1d6a57..4b4bdf08546 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -50,7 +50,7 @@ class PathTraceWorkCPU : public PathTraceWork {
virtual void copy_to_gpu_display(GPUDisplay *gpu_display, float sample_scale) override;
- virtual bool adaptive_sampling_converge_and_filter(float threshold, bool reset) override;
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
protected:
/* Core path tracing routine. Renders given work time on the given queue. */
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 5d37cd2caf4..aef4ad27c6a 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -646,26 +646,25 @@ void PathTraceWorkGPU::enqueue_film_convert(device_ptr d_rgba_half, float sample
queue_->enqueue(DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT, work_size, args);
}
-bool PathTraceWorkGPU::adaptive_sampling_converge_and_filter(float threshold, bool reset)
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
{
- if (adaptive_sampling_convergence_check(threshold, reset)) {
- return true;
- }
+ const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
- enqueue_adaptive_sampling_filter_x();
- enqueue_adaptive_sampling_filter_y();
- queue_->synchronize();
+ if (num_active_pixels) {
+ enqueue_adaptive_sampling_filter_x();
+ enqueue_adaptive_sampling_filter_y();
+ queue_->synchronize();
+ }
- return false;
+ return num_active_pixels;
}
-bool PathTraceWorkGPU::adaptive_sampling_convergence_check(float threshold, bool reset)
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
{
- device_vector<int> all_pixels_converged(device_, "all_pixels_converged", MEM_READ_WRITE);
- all_pixels_converged.alloc(1);
- all_pixels_converged.data()[0] = 1;
+ device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+ num_active_pixels.alloc(1);
- queue_->copy_to_device(all_pixels_converged);
+ queue_->zero_to_device(num_active_pixels);
const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
@@ -678,14 +677,14 @@ bool PathTraceWorkGPU::adaptive_sampling_convergence_check(float threshold, bool
&reset,
&effective_buffer_params_.offset,
&effective_buffer_params_.stride,
- &all_pixels_converged.device_pointer};
+ &num_active_pixels.device_pointer};
queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
- queue_->copy_from_device(all_pixels_converged);
+ queue_->copy_from_device(num_active_pixels);
queue_->synchronize();
- return all_pixels_converged.data()[0];
+ return num_active_pixels.data()[0];
}
void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 87758dbdae5..b35f95a31bc 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -47,7 +47,7 @@ class PathTraceWorkGPU : public PathTraceWork {
virtual void copy_to_gpu_display(GPUDisplay *gpu_display, float sample_scale) override;
- virtual bool adaptive_sampling_converge_and_filter(float threshold, bool reset) override;
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
protected:
void alloc_integrator_soa();
@@ -84,7 +84,7 @@ class PathTraceWorkGPU : public PathTraceWork {
* This is a common part of both `copy_to_gpu_display` implementations. */
void enqueue_film_convert(device_ptr d_rgba_half, float sample_scale);
- bool adaptive_sampling_convergence_check(float threshold, bool reset);
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
void enqueue_adaptive_sampling_filter_x();
void enqueue_adaptive_sampling_filter_y();
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index a922e935352..c38f17b714b 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -309,7 +309,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
bool reset,
int offset,
int stride,
- int *all_pixels_converged)
+ uint *num_active_pixels)
{
const int work_index = ccl_global_id(0);
const int y = work_index / sw;
@@ -323,10 +323,9 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- if (__any_sync(0xffffffff, !converged)) {
- if (threadIdx.x == 0) {
- all_pixels_converged[0] = 0;
- }
+ const uint num_active_pixels_mask = __ballot_sync(0xffffffff, !converged);
+ if (threadIdx.x == 0) {
+ atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
}
}
More information about the Bf-blender-cvs
mailing list