[Bf-blender-cvs] [2ee51362151] cycles-x: Cycles X: add path state compaction for GPU rendering

Brecht Van Lommel noreply at git.blender.org
Fri Jun 4 15:23:43 CEST 2021


Commit: 2ee5136215134a4f4a8b87e256528c6de25d86dc
Author: Brecht Van Lommel
Date:   Wed May 5 20:53:24 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB2ee5136215134a4f4a8b87e256528c6de25d86dc

Cycles X: add path state compaction for GPU rendering

Before enqueuing more work tiles, compact the path states so that all active
states are at the start. This improves coherence by preventing paths from
different tiles from being interleaved.

At this point it provides only minor performance benefits, and only on some
GPUs, but it can be a useful building block for further optimizations.

Differential Revision: https://developer.blender.org/D11346
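
To illustrate the compaction idea described above, here is a minimal CPU-side
sketch (not the actual GPU kernels; the PathState struct, the function name and
the sequential scan are illustrative assumptions): active states with an index
at or above the active-path count are moved down into terminated slots, so the
active states end up packed at the start of the array.

#include <cstdio>
#include <vector>

/* Hypothetical stand-in for the integrator path state. */
struct PathState {
  bool active;
  int data;
};

/* Move active states with index >= num_active_paths into terminated slots
 * below num_active_paths, so all active states are packed at the start. */
static void compact_states(std::vector<PathState> &states, const int num_active_paths)
{
  int free_slot = 0; /* Next terminated slot in the front part of the array. */
  for (int i = num_active_paths; i < (int)states.size(); i++) {
    if (!states[i].active) {
      continue;
    }
    while (free_slot < num_active_paths && states[free_slot].active) {
      free_slot++;
    }
    if (free_slot == num_active_paths) {
      break; /* No gaps left; the front of the array is already full. */
    }
    states[free_slot] = states[i];
    states[i].active = false;
  }
}

int main()
{
  /* Six path states, four of them still active. */
  std::vector<PathState> states = {
      {true, 0}, {false, -1}, {true, 2}, {false, -1}, {true, 4}, {true, 5}};
  compact_states(states, 4);
  for (const PathState &s : states) {
    printf("%c%d ", s.active ? '*' : ' ', s.data);
  }
  printf("\n"); /* Active states (*) are now all at indices 0..3. */
  return 0;
}

On the GPU the same effect is achieved in compact_states() in the diff below by
enqueuing DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY and
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, which build the index arrays in
parallel instead of scanning sequentially.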

===================================================================

M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/device_kernel.cpp
M	intern/cycles/device/optix/queue.cpp
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/integrator/integrator_state_util.h
M	intern/cycles/kernel/kernel_types.h

===================================================================

diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 1746f45f840..09131176223 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -94,6 +94,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
     case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
       /* See parall_active_index.h for why this amount of shared memory is needed. */
       shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
       break;
@@ -108,6 +109,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
     case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
     case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index e8edbe5582d..49ab49d404d 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -56,6 +56,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "integrator_terminated_paths_array";
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
       return "integrator_sorted_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      return "integrator_compact_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+      return "integrator_compact_states";
     case DEVICE_KERNEL_INTEGRATOR_RESET:
       return "integrator_reset";
 
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 703e55c99a2..573a618a7c7 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -116,6 +116,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
     case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
     case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 6cfc24595fd..e12b978bafa 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -240,12 +240,12 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
   /* Find kernel to execute, with max number of queued paths. */
   const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
 
-  int num_paths = 0;
+  int num_active_paths = 0;
   for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
-    num_paths += queue_counter->num_queued[i];
+    num_active_paths += queue_counter->num_queued[i];
   }
 
-  if (num_paths == 0) {
+  if (num_active_paths == 0) {
     return false;
   }
 
@@ -306,8 +306,6 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
       /* Compute array of active paths for specific kernel. */
       compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
     }
-
-    queue_->zero_to_device(num_queued_paths_);
   }
 
   DCHECK_LE(work_size, max_num_paths_);
@@ -342,6 +340,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
     case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
@@ -386,6 +386,8 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
 
+  queue_->zero_to_device(num_queued_paths_);
+
   /* Launch kernel to fill the active paths arrays. */
   {
     /* TODO: this could be smaller for terminated paths based on amount of work we want
@@ -403,7 +405,6 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
     queue_->enqueue(kernel, work_size, args);
   }
 
-  queue_->zero_to_device(num_queued_paths_);
   if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
     queue_->zero_to_device(integrator_shader_sort_counter_);
   }
@@ -417,22 +418,81 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
 
 void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
 {
-  /* Launch kernel to fill the active paths arrays. */
-  /* TODO: this could be smaller for terminated paths based on amount of work we want
-   * to schedule. */
-  const int work_size = (kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY) ?
-                            min(max_num_paths_, get_max_num_camera_paths()) :
-                            max_active_path_index_;
   int d_queued_kernel = queued_kernel;
 
+  /* Launch kernel to fill the active paths arrays. */
+  const int work_size = max_active_path_index_;
   void *d_queued_paths = (void *)queued_paths_.device_pointer;
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
   void *args[] = {
       const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
 
+  queue_->zero_to_device(num_queued_paths_);
   queue_->enqueue(kernel, work_size, args);
 }
 
+void PathTraceWorkGPU::compact_states(const int num_active_paths)
+{
+  if (num_active_paths == 0) {
+    max_active_path_index_ = 0;
+  }
+
+  /* TODO: not supported for shadow catcher yet. That needs to switch to an atomic
+   * counter for new paths so that we can fill in the space left after compaction. */
+  if (has_shadow_catcher()) {
+    return;
+  }
+
+  /* Compact fragmented path states into the start of the array, moving any paths
+   * with index higher than the number of active paths into the gaps. */
+  if (max_active_path_index_ == num_active_paths) {
+    return;
+  }
+
+  void *d_compact_paths = (void *)queued_paths_.device_pointer;
+  void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+
+  /* Create array with terminated paths that we can write to. */
+  {
+    /* TODO: can the work size be reduced here? */
+    int offset = num_active_paths;
+    int work_size = num_active_paths;
+    void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+    queue_->zero_to_device(num_queued_paths_);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+  }
+
+  /* Create array of paths that we need to compact, where the path index is bigger
+   * than the number of active paths. */
+  {
+    int work_size = max_active_path_index_;
+    void *args[] = {
+        &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+    queue_->zero_to_device(num_queued_paths_);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+  }
+
+  queue_->copy_from_device(num_queued_paths_);
+  queue_->synchronize();
+
+  int num_compact_paths = num_queued_paths_.data()[0];
+
+  /* Move paths into gaps. */
+  if (num_compact_paths > 0) {
+    int work_size = num_compact_paths;
+    int active_states_offset = 0;
+    int terminated_states_offset = num_active_paths;
+    void *args[] = {
+        &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+  }
+
+  queue_->synchronize();
+
+  /* Adjust max active path index now we know which part of the array is actually used. */
+  max_active_path_index_ = num_active_paths;
+}
+
 bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
 {
   /* If there are existing paths wait them to go to intersect closest kernel, which will align the
@@ -444,15 +504,11 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
     return false;
   }
 
-  int num_paths = get_num_active_paths();
-
-  if (num_paths == 0) {
-    max_active_path_index_ = 0;
-  }
+  int num_active_paths = get_num_active_paths();
 
   /* Don't schedule more work if cancelling. */
   if (is_cancel_requested()) {
-    if (num_paths == 0) {
+    if (num_active_paths == 0) {
       finished = true;
     }
     return false;
@@ -466,6 +522,7 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
 
   /* Schedule when we're out of paths or there are too few paths to keep the
    * device occupied. */
+  int num_paths = num_active_paths;
   if (num_paths == 0 || num_paths < min_num_active_paths_) {
     /* Get work tiles until the maximum number of path is reached. */
     while (num_paths < max_num_camera_paths) {
@@ -491,6 +548,10 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
     return false;
   }
 
+  /* Compact state array when number of paths becomes small relative to the
+   * known maximum path index, which makes computing active index arrays slow. */
+  compact_states(num_active_paths);
+
   enqueue_work_tiles(
       DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, work_tiles.data(), work_tiles.size());
   return true;
@@ -528,8 +589,20 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
   void *d_render_buffer = (void *)render_buffers_->buffer.device_pointer;
 
   if (max_active_path_index_ != 0) {
-    compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, (DeviceKernel)0);
     queue_->zero_to_device(num_queued_paths_);
+
+    /* Limit work size to max known active path index + the number of paths we are going
+     * to enqueue, which may be smaller than the total n

@@ Diff output truncated at 10240 characters. @@


