[Bf-blender-cvs] [ae741692727] cycles-x: Cycles X: Implement path compaction for shadow catcher

Sergey Sharybin noreply at git.blender.org
Thu Jul 15 17:15:02 CEST 2021


Commit: ae741692727bc14c7e40c8fe06126b1ecc1bc36e
Author: Sergey Sharybin
Date:   Wed Jul 14 16:16:16 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBae741692727bc14c7e40c8fe06126b1ecc1bc36e

Cycles X: Implement path compaction for shadow catcher

The demo file is BMW27 with the ground set as a shadow catcher.
The observed performance improvement is about 5% on RTX5000.

The general idea is to schedule new tiles in a way that always leaves
space for shadow catcher splits. Roughly, we first schedule 50% of the
maximum number of path states, then 25%, and so on.
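
As a rough sketch of that halving schedule (illustrative only, not part of
the patch; max_num_paths and num_active_paths mirror the integrator's
counters in path_trace_work_gpu.cpp):

    /* Illustrative only: each scheduling round hands out half of the
     * currently free path states to new camera rays, keeping the other
     * half free so that every new path can still split on a shadow
     * catcher hit. */
    int count_new_camera_paths(const int max_num_paths, const int num_active_paths)
    {
      const int num_available_paths = max_num_paths - num_active_paths;
      return num_available_paths / 2;
    }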

Summary of changes:

- Replace constant offset of shadow catcher state with an atomically
  incrementing index (sketched after this list).

- Add new kernel to count the number of states which can still split
  (also sketched after this list).

  Could experiment with atomics so that a path split decreases the value,
  as does path termination, and adding new paths increases it. Not sure
  this would give better performance.

- Remove the terminated-paths kernel from scheduling.
  The paths are compacted, so we know they are at the beginning of
  the array.
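
A minimal sketch of the atomic index from the first point above (standard
C++ atomics stand in for the device-side atomic; the names below are
placeholders, not the actual patch):

    #include <atomic>

    /* A split reserves the next free path state by bumping a shared counter
     * instead of writing to a constant offset. The host resets the counter
     * to the number of scheduled paths before enqueueing the init kernels. */
    std::atomic<int> next_shadow_catcher_path_index{0};

    int reserve_shadow_catcher_state()
    {
      return next_shadow_catcher_path_index.fetch_add(1);
    }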

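A conceptual sketch of the counting kernel from the second point
(placeholder state layout; the real kernel runs one thread per path state
and accumulates with an atomic add):

    /* Count in-flight states that could still split: active states which
     * have not yet produced their shadow catcher copy. The host reads this
     * back before deciding how many new camera paths to schedule. */
    int count_possible_splits(const bool *is_active, const bool *has_split, const int num_states)
    {
      int num_possible_splits = 0;
      for (int i = 0; i < num_states; i++) {
        if (is_active[i] && !has_split[i]) {
          num_possible_splits++;
        }
      }
      return num_possible_splits;
    }
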
Differential Revision: https://developer.blender.org/D11932

===================================================================

M	intern/cycles/device/device_kernel.cpp
M	intern/cycles/integrator/path_trace_work_cpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/integrator/integrator_init_from_camera.h
M	intern/cycles/kernel/integrator/integrator_state.h
M	intern/cycles/kernel/integrator/integrator_state_flow.h
M	intern/cycles/kernel/integrator/integrator_state_util.h
M	intern/cycles/kernel/kernel_shadow_catcher.h
M	intern/cycles/kernel/kernel_types.h

===================================================================

diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index 535e1df7c22..ec34d976b75 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -66,6 +66,8 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "integrator_compact_states";
     case DEVICE_KERNEL_INTEGRATOR_RESET:
       return "integrator_reset";
+    case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+      return "integrator_shadow_catcher_count_possible_splits";
 
     /* Shader evaluation. */
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 5f4ef5a9dbc..dcddcd3a264 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -106,7 +106,7 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global
   const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
   const bool has_bake = device_scene_->data.bake.use;
 
-  IntegratorState integrator_states[2];
+  IntegratorState integrator_states[2] = {{0}, {0}};
 
   IntegratorState *state = &integrator_states[0];
   IntegratorState *shadow_catcher_state = &integrator_states[1];
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index e9aacb58dd8..6e63a7a3aa9 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -40,6 +40,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
       integrator_shader_raytrace_sort_counter_(
           device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+      integrator_next_shadow_catcher_path_index_(
+          device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
       queued_paths_(device, "queued_paths", MEM_READ_WRITE),
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
@@ -146,19 +148,32 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
   }
 }
 
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+  if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+    return;
+  }
+
+  integrator_next_shadow_catcher_path_index_.alloc(1);
+  /* TODO(sergey): Use queue? */
+  integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+  integrator_state_gpu_.next_shadow_catcher_path_index =
+      (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
 void PathTraceWorkGPU::alloc_work_memory()
 {
   alloc_integrator_soa();
   alloc_integrator_queue();
   alloc_integrator_sorting();
+  alloc_integrator_path_split();
 }
 
 void PathTraceWorkGPU::init_execution()
 {
   queue_->init_execution();
 
-  integrator_state_gpu_.shadow_catcher_state_offset = get_shadow_catcher_state_offset();
-
   /* Copy to device side struct in constant memory. */
   device_->const_copy_to(
       "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
@@ -166,6 +181,10 @@ void PathTraceWorkGPU::init_execution()
 
 void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
 {
+  /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
+   * add more work (because tiles are smaller, so there is higher chance that more paths will
+   * become busy after adding new tiles). This is especially important for the shadow catcher which
+   * schedules work in halves of available number of paths. */
   work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
 
   work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
@@ -415,12 +434,6 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
     max_active_path_index_ = 0;
   }
 
-  /* TODO: not supported for shadow catcher yet. That needs to switch to an atomic
-   * counter for new paths so that we can fill in the space left after compaction. */
-  if (has_shadow_catcher()) {
-    return;
-  }
-
   /* Compact fragmented path states into the start of the array, moving any paths
    * with index higher than the number of active paths into the gaps. */
   if (max_active_path_index_ == num_active_paths) {
@@ -496,7 +509,28 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
 
   vector<KernelWorkTile> work_tiles;
 
-  const int max_num_camera_paths = get_max_num_camera_paths();
+  int max_num_camera_paths = max_num_paths_;
+  int num_predicted_splits = 0;
+
+  if (has_shadow_catcher()) {
+    /* When there are shadow catchers in the scene bounce from them will split the state. So we
+     * make sure there is enough space in the path states array to fit split states.
+     *
+     * Basically, when adding N new paths we ensure that there is 2*N available path states, so
+     * that all the new paths can be split.
+     *
+     * Note that it is possible that some of the current states can still split, so need to make
+     * sure there is enough space for them as well. */
+
+    /* Number of currently in-flight states which can still split. */
+    const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+    const int num_available_paths = max_num_paths_ - num_active_paths;
+    const int num_new_paths = num_available_paths / 2;
+    max_num_camera_paths = max(num_active_paths,
+                               num_active_paths + num_new_paths - num_scheduled_possible_split);
+    num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+  }
 
   /* Schedule when we're out of paths or there are too few paths to keep the
    * device occupied. */
@@ -530,23 +564,33 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
    * known maximum path index, which makes computing active index arrays slow. */
   compact_states(num_active_paths);
 
+  if (has_shadow_catcher()) {
+    integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+    queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+  }
+
   enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
                                                       DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
                      work_tiles.data(),
-                     work_tiles.size());
+                     work_tiles.size(),
+                     num_active_paths,
+                     num_predicted_splits);
+
   return true;
 }
 
 void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
                                           const KernelWorkTile work_tiles[],
-                                          const int num_work_tiles)
+                                          const int num_work_tiles,
+                                          const int num_active_paths,
+                                          const int num_predicted_splits)
 {
   /* Copy work tiles to device. */
   if (work_tiles_.size() < num_work_tiles) {
     work_tiles_.alloc(num_work_tiles);
   }
 
-  int path_index_offset = 0;
+  int path_index_offset = num_active_paths;
   int max_tile_work_size = 0;
   for (int i = 0; i < num_work_tiles; i++) {
     KernelWorkTile &work_tile = work_tiles_.data()[i];
@@ -565,46 +609,17 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
   queue_->copy_to_device(work_tiles_);
 
   void *d_work_tiles = (void *)work_tiles_.device_pointer;
-  void *d_path_index = (void *)nullptr;
   void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
 
-  if (max_active_path_index_ != 0) {
-    queue_->zero_to_device(num_queued_paths_);
-
-    /* Limit work size to max known active path index + the number of paths we are going
-     * to enqueue, which may be smaller than the total number of paths possible. */
-    const int work_size = min(max_num_paths_, max_active_path_index_ + path_index_offset);
-    int queued_kernel = 0;
-
-    void *d_queued_paths = (void *)queued_paths_.device_pointer;
-    void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
-    void *args[] = {
-        const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &queued_kernel};
-
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
-
-    d_path_index = (void *)queued_paths_.device_pointer;
-  }
-
   /* Launch kernel. */
-  void *args[] = {&d_path_index,
-                  &d_work_tiles,
+  void *args[] = {&d_work_tiles,
                   const_cast<int *>(&num_work_tiles),
                   &d_render_buffer,
                   const_cast<int *>(&max_tile_work_size)};
 
   queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
 
-  /* TODO: this could be computed more accurately using on the last entry
-   * in the queued_paths array passed to the kernel? */
-  /* When there is a shadow catcher in the scene provision that the shadow catcher state will
-   * become active at some point.
-   *
-   * TODO: What is more accurate approach here? What if the shadow catcher is hit after some
-   * transparent bounce? Do we need to calculate this somewhere else as well? */
-  max_active_path_index_ = min(max_active_path_index_ + path_index_offset +
-                                   get_shadow_catcher_state_offset(),
-                               max_num_paths_);
+  max_active_path_index_ = path_index_offset + num_predicted_splits;
 }
 
 int PathTraceWorkGPU::get_num_active_paths()
@@ -623,17 +638,6 @@ int PathTraceWorkGPU::get_num_active_paths()
   return num_paths;
 }
 
-int PathTraceWorkGPU::get_max_num_camera_paths() const
-{
-  /* When shadow catcher is used reserve half of the states for the shadow catcher needs (so that
-   * when path hits shadow catcher it can split). */
-  if (has_shadow_catcher()) {
-    return max_num_paths_ / 2;
-  }
-
-  return max_num_paths_;
-}
-
 bool PathTraceWorkGPU::should_use_graphics_interop()
 {
   /* There are few aspects with the graphics interop when using multiple devices caused by the fact
@@ -866,

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list