[Bf-blender-cvs] [3336bc6af66] cycles-x: Cycles X: Implement early cancel in GPU rendering

Sergey Sharybin noreply at git.blender.org
Wed May 19 10:14:56 CEST 2021


Commit: 3336bc6af66a1b9f0e15d6816aa59f3880d4ae44
Author: Sergey Sharybin
Date:   Tue May 18 11:58:41 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB3336bc6af66a1b9f0e15d6816aa59f3880d4ae44

Cycles X: Implement early cancel in GPU rendering

Allow the GPU rendering to stop rendering before all scheduled paths
have been terminated. This significantly lowers the latency of viewport
navigation.

With some files from the Sprite Fright project the time spent in
`PathTrace::cancel()` is reduced from 0.6 to below 0.01 sec.

Differential Revision: https://developer.blender.org/D11286

===================================================================

M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/device_kernel.cpp
M	intern/cycles/device/optix/queue.cpp
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/kernel_types.h

===================================================================

diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index f8db047a663..f1336f6eed6 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -95,6 +95,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
     case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+    case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
     case DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT:
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index 2492fd17149..007f5cccbc5 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -54,6 +54,8 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "integrator_terminated_paths_array";
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
       return "integrator_sorted_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_RESET:
+      return "integrator_reset";
 
     /* Shader evaluation. */
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 59203dedb35..1741b958ecc 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -116,6 +116,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
     case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
     case DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT:
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index dc51f9f350e..dafd4ec7314 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -56,8 +56,7 @@ void PathTraceWorkGPU::alloc_integrator_soa()
    * Allocate a device only memory buffer before for each struct member, and then
    * write the pointers into a struct that resides in constant memory.
    *
-   * TODO: store float3 in separate XYZ arrays.
-   * TODO: skip zeroing most arrays and leave uninitialized. */
+   * TODO: store float3 in separate XYZ arrays. */
 
   if (!integrator_state_soa_.empty()) {
     return;
@@ -69,7 +68,6 @@ void PathTraceWorkGPU::alloc_integrator_soa()
     device_only_memory<type> *array = new device_only_memory<type>(device_, \
                                                                    "integrator_state_" #name); \
     array->alloc_to_device(max_num_paths_); \
-    array->zero_to_device(); \
     integrator_state_soa_.emplace_back(array); \
     integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
   }
@@ -78,7 +76,6 @@ void PathTraceWorkGPU::alloc_integrator_soa()
     device_only_memory<type> *array = new device_only_memory<type>(device_, \
                                                                    "integrator_state_" #name); \
     array->alloc_to_device(max_num_paths_); \
-    array->zero_to_device(); \
     integrator_state_soa_.emplace_back(array); \
     integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
   }
@@ -155,6 +152,8 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
 
   work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
 
+  enqueue_reset();
+
   /* TODO: set a hard limit in case of undetected kernel failures? */
   while (true) {
     /* Enqueue work from the scheduler, on start or when there are not enough
@@ -169,6 +168,10 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
       }
     }
 
+    if (is_cancel_requested()) {
+      break;
+    }
+
     /* Stop if no more work remaining. */
     if (finished) {
       break;
@@ -183,6 +186,24 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
         break; /* Stop on error. */
       }
     }
+
+    if (is_cancel_requested()) {
+      break;
+    }
+  }
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+  const int num_keys = integrator_sort_key_counter_.size();
+  void *args[] = {&max_num_paths_, const_cast<int *>(&num_keys)};
+  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max(max_num_paths_, num_keys), args);
+  queue_->zero_to_device(integrator_queue_counter_);
+
+  /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
+   * counter on the host side because `zero_to_device()` is not doing it. */
+  if (integrator_queue_counter_.host_pointer) {
+    memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
   }
 }
 
@@ -328,6 +349,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
     case DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT:
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index b35f95a31bc..e3b67c08cac 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -54,6 +54,8 @@ class PathTraceWorkGPU : public PathTraceWork {
   void alloc_integrator_queue();
   void alloc_integrator_sorting();
 
+  void enqueue_reset();
+
   bool enqueue_work_tiles(bool &finished);
   void enqueue_work_tiles(DeviceKernel kernel,
                           const KernelWorkTile work_tiles[],
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index d753932e01b..e3ed5ea35f7 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -75,6 +75,22 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS, CUD
  * Integrator.
  */
 
+extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
+                                              CUDA_KERNEL_MAX_REGISTERS)
+    kernel_cuda_integrator_reset(int num_states, int num_keys)
+{
+  const int path_index = ccl_global_id(0);
+
+  if (path_index < num_states) {
+    INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+    INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+  }
+
+  if (path_index < num_keys) {
+    kernel_integrator_state.sort_key_counter[path_index] = 0;
+  }
+}
+
 extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
                                               CUDA_KERNEL_MAX_REGISTERS)
     kernel_cuda_integrator_init_from_camera(const int *path_index_array,
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index d64b6a32304..403a2d0bf2e 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1616,6 +1616,7 @@ typedef enum DeviceKernel {
   DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_RESET,
 
   DEVICE_KERNEL_SHADER_EVAL_DISPLACE,
   DEVICE_KERNEL_SHADER_EVAL_BACKGROUND,



More information about the Bf-blender-cvs mailing list