[Bf-blender-cvs] [90590d7ddc3] cycles-x: Cycles X: Make converge check return number of active pixels

Sergey Sharybin noreply at git.blender.org
Wed May 12 16:10:45 CEST 2021


Commit: 90590d7ddc37f7ed3586279549c21b56c7122563
Author: Sergey Sharybin
Date:   Thu Apr 22 14:54:01 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB90590d7ddc37f7ed3586279549c21b56c7122563

Cycles X: Make converge check return number of active pixels

Currently there are no functional changes, but this allows implementing
different adaptive sampling filtering strategies: for example, one which
progressively lowers the noise floor.

Benchmarks on an RTX 6000 show no performance impact.
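
To illustrate the intended use, here is a minimal, hypothetical sketch of a
scheduler that drives a progressively lowered noise floor off the returned
active-pixel count. Only the method name
adaptive_sampling_converge_filter_count_active matches the patch below; the
FakePathTraceWork stub, the halving schedule and all constants are
illustrative assumptions, not Cycles code.

#include <algorithm>

/* Hypothetical stand-in for the real path trace work object (sketch only). */
struct FakePathTraceWork {
  /* Pretend convergence check: returns the number of pixels whose error is
   * still above `threshold` (0 means everything converged). */
  int adaptive_sampling_converge_filter_count_active(float threshold, bool /*reset*/)
  {
    return threshold > 0.02f ? 0 : 137; /* dummy values for illustration */
  }
};

/* Sketch: keep sampling while tightening the noise floor until the
 * user-requested threshold is reached and no active pixels remain. */
void progressive_noise_floor_example(FakePathTraceWork &work, const float user_threshold)
{
  float noise_floor = 0.1f; /* start coarse, end at the user threshold */

  for (int i = 0; i < 64; ++i) {
    /* ... path trace a few more samples here ... */

    const int num_active_pixels =
        work.adaptive_sampling_converge_filter_count_active(noise_floor, false);

    if (num_active_pixels == 0) {
      if (noise_floor <= user_threshold) {
        break; /* fully converged at the final threshold */
      }
      /* All pixels passed the coarse floor: tighten it and keep refining. */
      noise_floor = std::max(noise_floor * 0.5f, user_threshold);
    }
  }
}

A boolean return could only say "converged or not"; the pixel count
additionally lets such a scheduler reason about how much work remains at the
current floor.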

===================================================================

M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/integrator/path_trace_work.h
M	intern/cycles/integrator/path_trace_work_cpu.cpp
M	intern/cycles/integrator/path_trace_work_cpu.h
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/cuda/kernel.cu

===================================================================

diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index d1d1aee5737..e1a0b4c24e7 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -207,7 +207,7 @@ void PathTrace::adaptive_sample(RenderWork &render_work)
   }
 
   tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
-    if (!path_trace_work->adaptive_sampling_converge_and_filter(
+    if (path_trace_work->adaptive_sampling_converge_filter_count_active(
             render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset)) {
       all_pixels_converged = false;
     }
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index c34698e2d8d..387d6d6d0b9 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -64,8 +64,8 @@ class PathTraceWork {
   virtual void copy_to_gpu_display(GPUDisplay *gpu_display, float sample_scale) = 0;
 
   /* Perform convergence test on the render buffer, and filter the convergence mask.
-   * Returns true if all pixels did converge. */
-  virtual bool adaptive_sampling_converge_and_filter(float threshold, bool reset) = 0;
+   * Returns number of active pixels (the ones which did not converge yet). */
+  virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
 
   /* Cheap-ish request to see whether rendering is requested and is to be stopped as soon as
    * possible, without waiting for any samples to be finished. */
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 247cbe7b054..96ae38c2f9e 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -23,6 +23,7 @@
 #include "render/gpu_display.h"
 #include "render/scene.h"
 
+#include "util/util_atomic.h"
 #include "util/util_logging.h"
 #include "util/util_tbb.h"
 
@@ -168,7 +169,7 @@ void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, float sample
   gpu_display->unmap_texture_buffer();
 }
 
-bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bool reset)
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
 {
   const int full_x = effective_buffer_params_.full_x;
   const int full_y = effective_buffer_params_.full_y;
@@ -179,7 +180,7 @@ bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bo
 
   float *render_buffer = render_buffers_->buffer.data();
 
-  bool all_pixels_converged = true;
+  uint num_active_pixels = 0;
 
   tbb::task_arena local_arena = local_tbb_arena_create(device_);
 
@@ -189,20 +190,25 @@ bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bo
       CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
 
       bool row_converged = true;
+      uint num_row_pixels_active = 0;
       for (int x = 0; x < width; ++x) {
-        row_converged &= kernels_.adaptive_sampling_convergence_check(
-            kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride);
+        if (!kernels_.adaptive_sampling_convergence_check(
+                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+          ++num_row_pixels_active;
+          row_converged = false;
+        }
       }
 
+      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
       if (!row_converged) {
         kernels_.adaptive_sampling_filter_x(
             kernel_globals, render_buffer, y, full_x, width, offset, stride);
-        all_pixels_converged = false;
       }
     });
   });
 
-  if (!all_pixels_converged) {
+  if (num_active_pixels) {
     local_arena.execute([&]() {
       tbb::parallel_for(full_x, full_x + width, [&](int x) {
         CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
@@ -212,7 +218,7 @@ bool PathTraceWorkCPU::adaptive_sampling_converge_and_filter(float threshold, bo
     });
   }
 
-  return all_pixels_converged;
+  return num_active_pixels;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 4d8dc1d6a57..4b4bdf08546 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -50,7 +50,7 @@ class PathTraceWorkCPU : public PathTraceWork {
 
   virtual void copy_to_gpu_display(GPUDisplay *gpu_display, float sample_scale) override;
 
-  virtual bool adaptive_sampling_converge_and_filter(float threshold, bool reset) override;
+  virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
 
  protected:
   /* Core path tracing routine. Renders given work time on the given queue. */
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 5d37cd2caf4..aef4ad27c6a 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -646,26 +646,25 @@ void PathTraceWorkGPU::enqueue_film_convert(device_ptr d_rgba_half, float sample
   queue_->enqueue(DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT, work_size, args);
 }
 
-bool PathTraceWorkGPU::adaptive_sampling_converge_and_filter(float threshold, bool reset)
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
 {
-  if (adaptive_sampling_convergence_check(threshold, reset)) {
-    return true;
-  }
+  const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
 
-  enqueue_adaptive_sampling_filter_x();
-  enqueue_adaptive_sampling_filter_y();
-  queue_->synchronize();
+  if (num_active_pixels) {
+    enqueue_adaptive_sampling_filter_x();
+    enqueue_adaptive_sampling_filter_y();
+    queue_->synchronize();
+  }
 
-  return false;
+  return num_active_pixels;
 }
 
-bool PathTraceWorkGPU::adaptive_sampling_convergence_check(float threshold, bool reset)
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
 {
-  device_vector<int> all_pixels_converged(device_, "all_pixels_converged", MEM_READ_WRITE);
-  all_pixels_converged.alloc(1);
-  all_pixels_converged.data()[0] = 1;
+  device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+  num_active_pixels.alloc(1);
 
-  queue_->copy_to_device(all_pixels_converged);
+  queue_->zero_to_device(num_active_pixels);
 
   const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
 
@@ -678,14 +677,14 @@ bool PathTraceWorkGPU::adaptive_sampling_convergence_check(float threshold, bool
                   &reset,
                   &effective_buffer_params_.offset,
                   &effective_buffer_params_.stride,
-                  &all_pixels_converged.device_pointer};
+                  &num_active_pixels.device_pointer};
 
   queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
 
-  queue_->copy_from_device(all_pixels_converged);
+  queue_->copy_from_device(num_active_pixels);
   queue_->synchronize();
 
-  return all_pixels_converged.data()[0];
+  return num_active_pixels.data()[0];
 }
 
 void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 87758dbdae5..b35f95a31bc 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -47,7 +47,7 @@ class PathTraceWorkGPU : public PathTraceWork {
 
   virtual void copy_to_gpu_display(GPUDisplay *gpu_display, float sample_scale) override;
 
-  virtual bool adaptive_sampling_converge_and_filter(float threshold, bool reset) override;
+  virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
 
  protected:
   void alloc_integrator_soa();
@@ -84,7 +84,7 @@ class PathTraceWorkGPU : public PathTraceWork {
    * This is a common part of both `copy_to_gpu_display` implementations. */
   void enqueue_film_convert(device_ptr d_rgba_half, float sample_scale);
 
-  bool adaptive_sampling_convergence_check(float threshold, bool reset);
+  int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
   void enqueue_adaptive_sampling_filter_x();
   void enqueue_adaptive_sampling_filter_y();
 
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index a922e935352..c38f17b714b 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -309,7 +309,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
                                                     bool reset,
                                                     int offset,
                                                     int stride,
-                                                    int *all_pixels_converged)
+                                                    uint *num_active_pixels)
 {
   const int work_index = ccl_global_id(0);
   const int y = work_index / sw;
@@ -323,10 +323,9 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
   }
 
   /* NOTE: All threads specified in the mask must execute the intrinsic. */
-  if (__any_sync(0xffffffff, !converged)) {
-    if (threadIdx.x == 0) {
-      all_pixels_converged[0] = 0;
-    }
+  const uint num_active_pixels_mask = __ballot_sync(0xffffffff, !converged);
+  if (threadIdx.x == 0) {
+    atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
   }
 }

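The CUDA hunk above replaces the __any_sync "did any pixel fail" test with a
per-warp count: __ballot_sync returns a 32-bit mask of the lanes whose pixel
has not converged, __popc counts the set bits, and a single atomic per
warp/block accumulates the total (the CPU path does the equivalent with a
per-row counter and atomic_fetch_and_add_uint32). Below is a minimal
standalone CUDA sketch of that counting pattern, assuming a 1-D launch with a
block size that is a multiple of 32; the kernel name and the error-buffer
layout are hypothetical, not the Cycles kernel itself.

/* Hypothetical kernel (sketch only): count entries of `error` that are
 * still above `threshold`, using one atomic per warp. */
__global__ void count_active_kernel(const float *error,
                                    int num_pixels,
                                    float threshold,
                                    unsigned int *num_active)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;

  /* Out-of-range threads still take part in the ballot, as "not active". */
  const bool active = (i < num_pixels) && (error[i] > threshold);

  /* Every lane of the warp must execute the intrinsic with the full mask. */
  const unsigned int ballot = __ballot_sync(0xffffffffu, active);

  /* Lane 0 of each warp adds the popcount of the warp's ballot mask. */
  if ((threadIdx.x & 31) == 0) {
    atomicAdd(num_active, (unsigned int)__popc(ballot));
  }
}

Adding one __popc result per warp instead of issuing an atomic per active
pixel keeps contention on the counter low, which is presumably why the
benchmark above shows no measurable cost for the extra information.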

