[Bf-blender-cvs] [cb8f94022ab] cycles-x: Cleanup: refactor shader sorting to support it for more kernels later

Thu May 27 19:15:40 CEST 2021

Commit: cb8f94022ab0142c51b70a34f09d6715fb430505
Author: Brecht Van Lommel
Date:   Thu May 27 11:28:35 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBcb8f94022ab0142c51b70a34f09d6715fb430505

Cleanup: refactor shader sorting to support it for more kernels later

===================================================================

M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/integrator/integrator_intersect_closest.h
M	intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
M	intern/cycles/kernel/integrator/integrator_state.h
M	intern/cycles/kernel/integrator/integrator_state_flow.h
M	intern/cycles/kernel/integrator/integrator_subsurface.h

===================================================================

diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index e8958ef1147..eee3ce0fd40 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -126,7 +126,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
   if (integrator_sort_key_counter_.size() < num_shaders) {
     integrator_sort_key_counter_.alloc(num_shaders);
     integrator_sort_key_counter_.zero_to_device();
-    integrator_state_gpu_.sort_key_counter = (int *)integrator_sort_key_counter_.device_pointer;
+
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+        (int *)integrator_sort_key_counter_.device_pointer;
   }
 }
 
@@ -213,10 +215,10 @@ DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
 
 void PathTraceWorkGPU::enqueue_reset()
 {
-  const int num_keys = integrator_sort_key_counter_.size();
-  void *args[] = {&max_num_paths_, const_cast<int *>(&num_keys)};
-  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max(max_num_paths_, num_keys), args);
+  void *args[] = {&max_num_paths_};
+  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
   queue_->zero_to_device(integrator_queue_counter_);
+  queue_->zero_to_device(integrator_sort_key_counter_);
 
   /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
    * counter on the host side because `zero_to_device()` is not doing it. */
@@ -346,15 +348,17 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
   }
 }
 
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, int queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
 {
-  void *d_key_counter = (void *)integrator_sort_key_counter_.device_pointer;
+  int d_queued_kernel = queued_kernel;
+  void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+  assert(d_counter != nullptr);
 
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
     int num_shaders = integrator_sort_key_counter_.size();
-    void *args[] = {&d_key_counter, &num_shaders};
+    void *args[] = {&d_counter, &num_shaders};
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
 
@@ -369,18 +373,23 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, int queu
     void *args[] = {const_cast<int *>(&work_size),
                     &d_queued_paths,
                     &d_num_queued_paths,
-                    &d_key_counter,
-                    &queued_kernel};
+                    &d_counter,
+                    &d_queued_kernel};
 
     queue_->enqueue(kernel, work_size, args);
   }
 
-  /* TODO: ensure this happens as part of queue stream. */
   queue_->zero_to_device(num_queued_paths_);
-  queue_->zero_to_device(integrator_sort_key_counter_);
+  if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+    queue_->zero_to_device(integrator_sort_key_counter_);
+  }
+  else {
+    /* TODO */
+    assert(0);
+  }
 }
 
-void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, int queued_kernel)
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
 {
   /* Launch kernel to fill the active paths arrays. */
   /* TODO: this could be smaller for terminated paths based on amount of work we want
@@ -388,11 +397,12 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, int queued_kern
   const int work_size = (kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY) ?
                             min(max_num_paths_, get_max_num_camera_paths()) :
                             max_active_path_index_;
+  int d_queued_kernel = queued_kernel;
 
   void *d_queued_paths = (void *)queued_paths_.device_pointer;
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
   void *args[] = {
-      const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &queued_kernel};
+      const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
 
   queue_->enqueue(kernel, work_size, args);
 }
@@ -492,7 +502,7 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
   void *d_render_buffer = (void *)render_buffers_->buffer.device_pointer;
 
   if (max_active_path_index_ != 0) {
-    compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, 0);
+    compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, (DeviceKernel)0);
     queue_->zero_to_device(num_queued_paths_);
     d_path_index = (void *)queued_paths_.device_pointer;
   }
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index dd83286094c..4b8f84bee5e 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -67,8 +67,8 @@ class PathTraceWorkGPU : public PathTraceWork {
   bool enqueue_path_iteration();
   void enqueue_path_iteration(DeviceKernel kernel);
 
-  void compute_queued_paths(DeviceKernel kernel, int queued_kernel);
-  void compute_sorted_queued_paths(DeviceKernel kernel, int queued_kernel);
+  void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+  void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
 
   int get_num_active_paths();
 
@@ -114,7 +114,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   vector<unique_ptr<device_memory>> integrator_state_soa_;
   /* Keep track of number of queued kernels. */
   device_vector<IntegratorQueueCounter> integrator_queue_counter_;
-  /* Key for shader sorting. */
+  /* Shader sorting. */
   device_vector<int> integrator_sort_key_counter_;
 
   /* Temporary buffer to get an array of queued path for a particular kernel. */
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index 950c724aa25..0ce3c1abd71 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -76,7 +76,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS, CUD
 
 extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
                                               CUDA_KERNEL_MAX_REGISTERS)
-    kernel_cuda_integrator_reset(int num_states, int num_keys)
+    kernel_cuda_integrator_reset(int num_states)
 {
   const int path_index = ccl_global_id(0);
 
@@ -84,10 +84,6 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
     INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
     INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
   }
-
-  if (path_index < num_keys) {
-    kernel_integrator_state.sort_key_counter[path_index] = 0;
-  }
 }
 
 extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
index b30012a8f45..ed671a6d595 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -28,21 +28,20 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_forceinline int integrator_intersect_next_shader(INTEGRATOR_STATE_ARGS,
-                                                            const Intersection *ccl_restrict isect)
+ccl_device_forceinline bool integrator_intersect_shader_next_kernel(
+    INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect)
 {
   /* Find shader from intersection. */
   const int shader = intersection_get_shader(kg, isect);
+  const int flags = kernel_tex_fetch(__shaders, shader).flags;
 
   /* Optional AO bounce termination. */
   if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
-    const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
     if (flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
       INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
     }
     else {
-      return SHADER_NONE;
+      return false;
     }
   }
 
@@ -60,19 +59,21 @@ ccl_device_forceinline int integrator_intersect_next_shader(INTEGRATOR_STATE_ARG
     const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
 
     if (probability == 0.0f || terminate >= probability) {
-      const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
       if (flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
         /* Mark path to be terminated right after shader evaluation. */
         INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IMMEDIATE;
       }
       else {
-        return SHADER_NONE;
+        return false;
       }
     }
   }
 
-  return shader;
+  /* Setup next kernel to execute. */
+  const int next_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
+  INTEGRATOR_PATH_NEXT_SORTED(INTERSECT_CLOSEST, next_kernel, shader);
+
+  return true;
 }
 
 ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
@@ -129,11 +130,7 @@ ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
     }
     else {
       /* Hit a surface, continue with surface kernel unless terminated. */
-      const int shader = integrator_intersect_next_shader(INTEGRATOR_STATE_PASS, &isect);
-      if (shader != SHADER_NONE) {
-        INTEGRATOR_PATH_SET_SORT_KEY(shader);
-        INTEGRATOR_PATH_NEXT(INTERSECT_CLOSEST, SHADE_SURFACE);
-
+      if (integrator_intersect_shader_next_kernel(INTEGRATOR_STATE_PASS, &isect)) {
         const int object_flags = intersection_get_object_flags(kg, &isect);
         kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags);
         return;
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
index a96f5cc5ec9..59e151ad287 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
@@ -24,7 +24,6 @@ ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS)
 {
 #ifdef __SUBSURF

@@ Diff output truncated at 10240 characters. @@