[Bf-blender-cvs] [97ff37bf544] blender-v3.0-release: Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion

Brecht Van Lommel noreply at git.blender.org
Fri Nov 5 22:05:36 CET 2021


Commit: 97ff37bf54474efbce39653a1387ad55091d4964
Author: Brecht Van Lommel
Date:   Fri Nov 5 21:01:23 2021 +0100
Branches: blender-v3.0-release
https://developer.blender.org/rB97ff37bf54474efbce39653a1387ad55091d4964

Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion

Adds a bunch of CPU kernel functions to process one row of pixels, and uses those
instead of calling unoptimized implementations.

Fixes T92598

===================================================================

M	intern/cycles/device/cpu/device_impl.cpp
M	intern/cycles/device/cpu/device_impl.h
M	intern/cycles/device/cpu/kernel.cpp
M	intern/cycles/device/cpu/kernel.h
M	intern/cycles/device/device.cpp
M	intern/cycles/device/device.h
M	intern/cycles/integrator/pass_accessor_cpu.cpp
M	intern/cycles/integrator/pass_accessor_cpu.h
M	intern/cycles/integrator/path_trace_work_cpu.cpp
M	intern/cycles/integrator/shader_eval.cpp
M	intern/cycles/kernel/device/cpu/kernel.h
M	intern/cycles/kernel/device/cpu/kernel_arch.h
M	intern/cycles/kernel/device/cpu/kernel_arch_impl.h

===================================================================

diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index d494b40f71d..68dec7f0af2 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
 {
   /* Pick any kernel, all of them are supposed to have same level of microarchitecture
    * optimization. */
-  VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
+  VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
+          << " CPU kernels.";
 
   if (info.cpu_threads == 0) {
     info.cpu_threads = TaskScheduler::num_threads();
@@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
     Device::build_bvh(bvh, progress, refit);
 }
 
-const CPUKernels *CPUDevice::get_cpu_kernels() const
-{
-  return &kernels;
-}
-
 void CPUDevice::get_cpu_kernel_thread_globals(
     vector<CPUKernelThreadGlobals> &kernel_thread_globals)
 {
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index 553728ccc3b..90d217bb624 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -57,8 +57,6 @@ class CPUDevice : public Device {
   RTCDevice embree_device;
 #endif
 
-  CPUKernels kernels;
-
   CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
   ~CPUDevice();
 
@@ -90,7 +88,6 @@ class CPUDevice : public Device {
 
   void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
 
-  virtual const CPUKernels *get_cpu_kernels() const override;
   virtual void get_cpu_kernel_thread_globals(
       vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
   virtual void *get_cpu_osl_memory() override;
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
index 3b253c094fd..91c472d41e8 100644
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN
       KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
 
 #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+#define REGISTER_KERNEL_FILM_CONVERT(name) \
+  film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
+      film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))
 
 CPUKernels::CPUKernels()
     : /* Integrator. */
@@ -50,11 +53,25 @@ CPUKernels::CPUKernels()
       REGISTER_KERNEL(adaptive_sampling_filter_x),
       REGISTER_KERNEL(adaptive_sampling_filter_y),
       /* Cryptomatte. */
-      REGISTER_KERNEL(cryptomatte_postprocess)
+      REGISTER_KERNEL(cryptomatte_postprocess),
+      /* Film Convert. */
+      REGISTER_KERNEL_FILM_CONVERT(depth),
+      REGISTER_KERNEL_FILM_CONVERT(mist),
+      REGISTER_KERNEL_FILM_CONVERT(sample_count),
+      REGISTER_KERNEL_FILM_CONVERT(float),
+      REGISTER_KERNEL_FILM_CONVERT(light_path),
+      REGISTER_KERNEL_FILM_CONVERT(float3),
+      REGISTER_KERNEL_FILM_CONVERT(motion),
+      REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
+      REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
+      REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
+      REGISTER_KERNEL_FILM_CONVERT(combined),
+      REGISTER_KERNEL_FILM_CONVERT(float4)
 {
 }
 
 #undef REGISTER_KERNEL
+#undef REGISTER_KERNEL_FILM_CONVERT
 #undef KERNEL_FUNCTIONS
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
index 5beeaf148a1..406bd07ab3d 100644
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -17,11 +17,13 @@
 #pragma once
 
 #include "device/cpu/kernel_function.h"
+#include "util/half.h"
 #include "util/types.h"
 
 CCL_NAMESPACE_BEGIN
 
 struct KernelGlobalsCPU;
+struct KernelFilmConvert;
 struct IntegratorStateCPU;
 struct TileInfo;
 
@@ -102,6 +104,41 @@ class CPUKernels {
 
   CryptomattePostprocessFunction cryptomatte_postprocess;
 
+  /* Film Convert. */
+  using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+                                                         const float *buffer,
+                                                         float *pixel,
+                                                         const int width,
+                                                         const int buffer_stride,
+                                                         const int pixel_stride)>;
+  using FilmConvertHalfRGBAFunction =
+      CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+                                 const float *buffer,
+                                 half4 *pixel,
+                                 const int width,
+                                 const int buffer_stride)>;
+
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+  FilmConvertFunction film_convert_##name; \
+  FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
+
+  KERNEL_FILM_CONVERT_FUNCTION(depth)
+  KERNEL_FILM_CONVERT_FUNCTION(mist)
+  KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+  KERNEL_FILM_CONVERT_FUNCTION(float)
+
+  KERNEL_FILM_CONVERT_FUNCTION(light_path)
+  KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+  KERNEL_FILM_CONVERT_FUNCTION(motion)
+  KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+  KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+  KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+  KERNEL_FILM_CONVERT_FUNCTION(combined)
+  KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
   CPUKernels();
 };
 
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 69e959b6f7b..63d0a49d3eb 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -23,6 +23,7 @@
 #include "device/queue.h"
 
 #include "device/cpu/device.h"
+#include "device/cpu/kernel.h"
 #include "device/cuda/device.h"
 #include "device/dummy/device.h"
 #include "device/hip/device.h"
@@ -363,10 +364,11 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
   return nullptr;
 }
 
-const CPUKernels *Device::get_cpu_kernels() const
+const CPUKernels &Device::get_cpu_kernels()
 {
-  LOG(FATAL) << "Device does not support CPU kernels.";
-  return nullptr;
+  /* Initialize CPU kernels once and reuse. */
+  static CPUKernels kernels;
+  return kernels;
 }
 
 void Device::get_cpu_kernel_thread_globals(
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 3cb177adde7..65188459c2c 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -180,7 +180,7 @@ class Device {
    * These may not be used on GPU or multi-devices. */
 
   /* Get CPU kernel functions for native instruction set. */
-  virtual const CPUKernels *get_cpu_kernels() const;
+  static const CPUKernels &get_cpu_kernels();
   /* Get kernel globals to pass to kernels. */
   virtual void get_cpu_kernel_thread_globals(
       vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 820da757be0..77ca332d142 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -14,9 +14,12 @@
  * limitations under the License.
  */
 
+#include "device/device.h"
+
 #include "integrator/pass_accessor_cpu.h"
 
 #include "session/buffers.h"
+
 #include "util/log.h"
 #include "util/tbb.h"
 
@@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN
  * Kernel processing.
  */
 
-template<typename Processor>
-inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
-                                                           const BufferParams &buffer_params,
-                                                           const Destination &destination,
-                                                           const Processor &processor) const
-{
-  KernelFilmConvert kfilm_convert;
-  init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
-
-  if (destination.pixels) {
-    /* NOTE: No overlays are applied since they are not used for final renders.
-     * Can be supported via some sort of specialization to avoid code duplication. */
-
-    run_get_pass_kernel_processor_float(
-        &kfilm_convert, render_buffers, buffer_params, destination, processor);
-  }
-
-  if (destination.pixels_half_rgba) {
-    /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
-
-    if (destination.num_components == 1) {
-      run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
-                                              render_buffers,
-                                              buffer_params,
-                                              destination,
-                                              [&processor](const KernelFilmConvert *kfilm_convert,
-                                                           ccl_global const float *buffer,
-                                                           float *pixel_rgba) {
-                                                float pixel;
-                                                processor(kfilm_convert, buffer, &pixel);
-
-                                                pixel_rgba[0] = pixel;
-                                                pixel_rgba[1] = pixel;
-                                                pixel_rgba[2] = pixel;
-                                                pixel_rgba[3] = 1.0f;
-                                              });
-    }
-    else if (destination.num_components == 3) {
-      run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
-                                              render_buffers,
-                                              buffer_params,
-                                              destination,
-                                              [&processor](const KernelFilmConvert *kfilm_convert,
-                                                           ccl_global const float *buffer,
-                                                           float *pixel_rgba) {
-      

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list