[Bf-blender-cvs] [0d28f5ff495] cycles-x: Cycles X: Implement pass accessor for GPU

Thu Jun 3 16:22:00 CEST 2021

Commit: 0d28f5ff49523260be058b33cc5d63330eae7dcf
Author: Sergey Sharybin
Date:   Thu Jun 3 10:35:00 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB0d28f5ff49523260be058b33cc5d63330eae7dcf

Cycles X: Implement pass accessor for GPU

It replaces film_convert_half_float with more generic code path.

Currently only half4 destination is supported. There is no real blocker
for supporting float destination, it is just a bit harder to verify at
this point as there will be no users of this code path.

From quick benchmarks the new code seems to be 10-15% faster than
film_convert_half_float when measuring overall display update time for
non-interop rendering. Partially because we no longer convert global
index to x/y and back to pixel index, partially due to a smaller kernel.

Note that the timing is only taking display update into account, the
path tracing is not included in the speedup calculation (this means
that the viewport is not that much faster, but it means we can update
the viewport more often now).

Differential Revision: https://developer.blender.org/D11481

===================================================================

M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/device_kernel.cpp
M	intern/cycles/device/optix/queue.cpp
M	intern/cycles/integrator/CMakeLists.txt
M	intern/cycles/integrator/pass_accessor.cpp
M	intern/cycles/integrator/pass_accessor.h
M	intern/cycles/integrator/pass_accessor_cpu.cpp
M	intern/cycles/integrator/pass_accessor_cpu.h
A	intern/cycles/integrator/pass_accessor_gpu.cpp
A	intern/cycles/integrator/pass_accessor_gpu.h
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/kernel_film.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/render/film.cpp

===================================================================

diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 653c1bbae1d..02315b8e116 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -110,7 +110,20 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
     case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
-    case DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT:
+    case DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_MIST_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SAMPLE_COUNT_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW3_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_DIVIDE_EVEN_COLOR_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT3_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW4_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_MOTION_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_CRYPTOMATTE_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_DENOISING_COLOR_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_MATTE_WITH_SHADOW_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT4_HALF_RGBA:
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X:
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y:
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index 007f5cccbc5..d9ee7838c63 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -64,8 +64,34 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "shader_eval_background";
 
     /* Film. */
-    case DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT:
-      return "convert_to_half_float";
+    case DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA:
+      return "film_convert_depth_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_MIST_HALF_RGBA:
+      return "film_convert_mist_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_SAMPLE_COUNT_HALF_RGBA:
+      return "film_convert_sample_count_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT_HALF_RGBA:
+      return "film_convert_float_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW3_HALF_RGBA:
+      return "film_convert_shadow3_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_DIVIDE_EVEN_COLOR_HALF_RGBA:
+      return "film_convert_divide_even_color_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT3_HALF_RGBA:
+      return "film_convert_float3_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW4_HALF_RGBA:
+      return "film_convert_shadow4_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_MOTION_HALF_RGBA:
+      return "film_convert_motion_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_CRYPTOMATTE_HALF_RGBA:
+      return "film_convert_cryptomatte_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_DENOISING_COLOR_HALF_RGBA:
+      return "film_convert_denoising_color_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_HALF_RGBA:
+      return "film_convert_shadow_catcher_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_MATTE_WITH_SHADOW_HALF_RGBA:
+      return "film_convert_shadow_catcher_matte_with_shadow_half_rgba";
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT4_HALF_RGBA:
+      return "film_convert_float4_half_rgba";
 
     /* Adaptive sampling. */
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 444b97baf17..5860632c364 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -107,7 +107,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
     case DEVICE_KERNEL_INTEGRATOR_RESET:
     case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
-    case DEVICE_KERNEL_CONVERT_TO_HALF_FLOAT:
+    case DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_MIST_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SAMPLE_COUNT_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW3_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_DIVIDE_EVEN_COLOR_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT3_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW4_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_MOTION_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_CRYPTOMATTE_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_DENOISING_COLOR_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_MATTE_WITH_SHADOW_HALF_RGBA:
+    case DEVICE_KERNEL_FILM_CONVERT_FLOAT4_HALF_RGBA:
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X:
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y:
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt
index 9896ed9a2f3..e8950445bb7 100644
--- a/intern/cycles/integrator/CMakeLists.txt
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -26,6 +26,7 @@ set(SRC
   tile.cpp
   pass_accessor.cpp
   pass_accessor_cpu.cpp
+  pass_accessor_gpu.cpp
   path_trace_work.cpp
   path_trace_work_cpu.cpp
   path_trace_work_gpu.cpp
@@ -44,6 +45,7 @@ set(SRC_HEADERS
   tile.h
   pass_accessor.h
   pass_accessor_cpu.h
+  pass_accessor_gpu.h
   path_trace_work.h
   path_trace_work_cpu.h
   path_trace_work_gpu.h
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index e46b202e684..0d8312932c1 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -19,6 +19,11 @@
 #include "render/buffers.h"
 #include "util/util_logging.h"
 
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/kernel_types.h"
+// clang-format on
+
 CCL_NAMESPACE_BEGIN
 
 /* --------------------------------------------------------------------
@@ -44,7 +49,12 @@ PassAccessor::Destination::Destination(float *pixels, int num_components)
 }
 
 PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels)
-    : pixels_half_rgba(pixels)
+    : Destination(pass_type)
+{
+  pixels_half_rgba = pixels;
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type)
 {
   const PassInfo pass_info = Pass::get_info(pass_type);
 
@@ -214,4 +224,46 @@ bool PassAccessor::set_pass_rect(PassType type, int components, float *pixels, i
 }
 #endif
 
+void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+                                            const BufferParams &buffer_params) const
+{
+  const PassInfo &pass_info = Pass::get_info(pass_access_info_.type);
+
+  kfilm_convert->pass_offset = pass_access_info_.offset;
+  kfilm_convert->pass_stride = buffer_params.pass_stride;
+
+  kfilm_convert->pass_use_exposure = pass_info.use_exposure;
+  kfilm_convert->pass_use_filter = pass_info.use_filter;
+
+  kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type);
+
+  kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED);
+  kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+  kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset(
+      PASS_ADAPTIVE_AUX_BUFFER);
+  kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT);
+  kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER);
+  kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset(
+      PASS_SHADOW_CATCHER_MATTE);
+
+  if (pass_info.use_filter) {
+    kfilm_convert->scale = 1.0f / num_samples_;
+  }
+  else {
+    kfilm_convert->scale = 1.0f;
+  }
+
+  if (pass_info.use_exposure) {
+    kfilm_convert->exposure = exposure_;
+  }
+  else {
+    kfilm_convert->exposure = 1.0f;
+  }
+
+  kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure;
+
+  kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher;
+  kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
index d478e9a9918..88e38e91003 100644
--- a/intern/cycles/integrator/pass_accessor.h
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -19,6 +19,7 @@
 #include "render/pass.h"
 #include "util/util_half.h"
 #include "util/util_string.h"
+#include "util/util_types.h"
 #include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
@@ -26,6 +27,7 @@ CCL_NAMESPACE_BEGIN
 class Film;
 class RenderBuffers;
 class BufferParams;
+struct KernelFilmConvert;
 
 /* Helper class which allows to access pass data.
  * Is designed in a way that it is created once when the pass data is known, and then pixels gets
@@ -54,9 +56,17 @@ class PassAccessor {
     Destination(float *pixels, int num_components);
     Destination(const PassType pass_type, half4 *pixels);
 
+    /* Destination will be initialized with the number of components which is native for the given
+     * pass type. */
+    explicit Destination(const PassType pass_type);
+
+    /* CPU-side pointers. only usable by the `PassAccessorCPU`. */
     float *pixels = nullptr;
     half4 *pixels_half_rgba = nullptr;
 
+    /* Device-side pointers. */
+    device_ptr d_pixels_half_rgba;
+
     int num_components = 0;
   };
 
@@ -78,6 +88,9 @@ class PassAccessor {
 #endif
 
  protected:
+  virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+                                        const BufferParams &buffer_params) const;
+
 #define DECLARE_PASS_ACCESSOR(pass) \
   virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
                                const BufferParams &buffer_params, \
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor

@@ Diff output truncated at 10240 characters. @@