[Bf-blender-cvs] [062287596e1] cycles-x: Cycles X: Reduce OIDN memory usage for shadow catcher and multi-device

Thu Jul 8 09:06:11 CEST 2021

Commit: 062287596e1c06bf8fa09c942959725d72647b16
Author: Sergey Sharybin
Date:   Wed Jul 7 10:35:16 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB062287596e1c06bf8fa09c942959725d72647b16

Cycles X: Reduce OIDN memory usage for shadow catcher and multi-device

Read compositing passes in-place, avoiding extra memory allocation for
the OIDN pass.

Differential Revision: https://developer.blender.org/D11840

===================================================================

M	intern/cycles/integrator/denoiser_oidn.cpp
M	intern/cycles/integrator/pass_accessor.h
M	intern/cycles/integrator/pass_accessor_cpu.cpp

===================================================================

diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
index 9f3e4b0b1de..12a0a1bad69 100644
--- a/intern/cycles/integrator/denoiser_oidn.cpp
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -146,8 +146,10 @@ class OIDNPass {
   /* For the scaled passes, the data which holds values of scaled pixels. */
   array<float> scaled_buffer;
 
-  /* For the in-place usable passes denotes whether the underlying data has been scaled. */
-  bool is_scaled = false;
+  /* For the in-place usable passes denotes whether the data is prepared to be used as-is.
+   * For example, for compositing passes this means that the compositing result has been read,
+   * and for scaling passes it means that scaling has been performed. */
+  bool is_inplace_ready = false;
 };
 
 class OIDNDenoiseContext {
@@ -240,14 +242,8 @@ class OIDNDenoiseContext {
                            stride * pass_stride * sizeof(float));
   }
 
-  void read_pass_pixels(OIDNPass &oidn_pass)
+  void read_pass_pixels(OIDNPass &oidn_pass, const PassAccessor::Destination &destination)
   {
-    const int64_t width = buffer_params_.width;
-    const int64_t height = buffer_params_.height;
-
-    array<float> &scaled_buffer = oidn_pass.scaled_buffer;
-    scaled_buffer.resize(width * height * 3);
-
     PassAccessor::PassAccessInfo pass_access_info;
     pass_access_info.type = oidn_pass.type;
     pass_access_info.mode = oidn_pass.mode;
@@ -263,17 +259,50 @@ class OIDNDenoiseContext {
      * by users. What is important is to use same exposure for read and write access of the pass
      * pixels. */
     const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_);
-    const PassAccessor::Destination destination(scaled_buffer.data(), 3);
 
     pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination);
   }
 
-  void set_pass_scaled(OIDNPass &oidn_pass)
+  void read_pass_pixels_inplace_if_needed(OIDNPass &oidn_pass)
+  {
+    if (oidn_pass.is_inplace_ready) {
+      return;
+    }
+    oidn_pass.is_inplace_ready = true;
+
+    float *buffer_data = render_buffers_->buffer.data();
+    float *pass_data = buffer_data + oidn_pass.offset;
+
+    PassAccessor::Destination destination(pass_data, 3);
+    destination.pixel_stride = buffer_params_.pass_stride;
+
+    read_pass_pixels(oidn_pass, destination);
+  }
+
+  void read_pass_pixels_into_buffer_if_needed(OIDNPass &oidn_pass)
   {
-    if (oidn_pass.scaled_buffer.empty()) {
-      read_pass_pixels(oidn_pass);
+    if (!oidn_pass.scaled_buffer.empty()) {
+      return;
     }
 
+    VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " ("
+            << pass_type_as_string(oidn_pass.type) << ")";
+
+    const int64_t width = buffer_params_.width;
+    const int64_t height = buffer_params_.height;
+
+    array<float> &scaled_buffer = oidn_pass.scaled_buffer;
+    scaled_buffer.resize(width * height * 3);
+
+    const PassAccessor::Destination destination(scaled_buffer.data(), 3);
+
+    read_pass_pixels(oidn_pass, destination);
+  }
+
+  void set_pass_scaled(OIDNPass &oidn_pass)
+  {
+    read_pass_pixels_into_buffer_if_needed(oidn_pass);
+
     const int64_t width = buffer_params_.width;
     const int64_t height = buffer_params_.height;
 
@@ -289,23 +318,23 @@ class OIDNDenoiseContext {
 
   void set_pass(OIDNPass &oidn_pass)
   {
-    if (oidn_pass.use_compositing) {
-      /* TODO(sergey): Avoid extra memory for compositing passes. */
-      set_pass_scaled(oidn_pass);
-      return;
-    }
+    const bool use_compositing = oidn_pass.use_compositing;
 
-    /* When adaptive sampling is involved scaling is always needed.
-     * If the avoid scaling if there is only one sample, to save up time (so we dont divide buffer
-     * by 1). */
-    if (pass_sample_count_ == PASS_UNUSED && (!oidn_pass.need_scale || num_samples_ == 1)) {
+    /* Simple case: no compositing is involved, no scaling is needed. Reference the pass from the
+     * render buffers without extra compute. */
+    if (!use_compositing && !is_pass_scale_needed(oidn_pass)) {
       set_pass_referenced(oidn_pass);
       return;
     }
 
     if (allow_inplace_modification_) {
       set_pass_referenced(oidn_pass);
-      scale_pass_if_needed(oidn_pass);
+      if (use_compositing) {
+        read_pass_pixels_inplace_if_needed(oidn_pass);
+      }
+      else {
+        scale_pass_inplace_if_needed(oidn_pass);
+      }
       return;
     }
 
@@ -384,15 +413,37 @@ class OIDNDenoiseContext {
     }
   }
 
-  void scale_pass_if_needed(OIDNPass &oidn_pass)
+  bool is_pass_scale_needed(OIDNPass &oidn_pass) const
   {
+    if (oidn_pass.is_inplace_ready) {
+      return false;
+    }
+
+    if (pass_sample_count_ != PASS_UNUSED) {
+      /* With adaptive sampling pixels will have different number of samples in them, so need to
+       * always scale the pass to make pixels uniformly sampled. */
+      return true;
+    }
+
     if (!oidn_pass.need_scale) {
-      return;
+      return false;
     }
-    if (oidn_pass.is_scaled) {
+
+    if (num_samples_ == 1) {
+      /* Avoid scaling if there is only one sample, to save time (so we don't divide the
+       * buffer by 1). */
+      return false;
+    }
+
+    return true;
+  }
+
+  void scale_pass_inplace_if_needed(OIDNPass &oidn_pass)
+  {
+    if (!is_pass_scale_needed(oidn_pass)) {
       return;
     }
-    oidn_pass.is_scaled = true;
+    oidn_pass.is_inplace_ready = true;
 
     const int64_t x = buffer_params_.full_x;
     const int64_t y = buffer_params_.full_y;
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
index 401917e744a..9610ba0d1fb 100644
--- a/intern/cycles/integrator/pass_accessor.h
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -76,6 +76,9 @@ class PassAccessor {
     /* Offset in pixels from the beginning of pixels storage.
      * Allows to get pixels of render buffer into a partial slice of the destination. */
     int offset = 0;
+
+    /* Number of floats per pixel. When zero, `num_components` is used instead. */
+    int pixel_stride = 0;
   };
 
   class Source {
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 7591d8bf643..af6890f304b 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -98,14 +98,15 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
     const Processor &processor) const
 {
   const float *buffer_data = render_buffers->buffer.data();
+  const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+                                                      destination.num_components;
 
   tbb::parallel_for(0, buffer_params.height, [&](int y) {
     int64_t pixel_index = int64_t(y) * buffer_params.width;
     for (int x = 0; x < buffer_params.width; ++x, ++pixel_index) {
       const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
       const float *buffer = buffer_data + input_pixel_offset;
-      float *pixel = destination.pixels +
-                     (pixel_index + destination.offset) * destination.num_components;
+      float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride;
 
       processor(kfilm_convert, buffer, pixel);
     }