[Bf-blender-cvs] [79787bf8e1e] blender-v3.3-release: Cycles: Improve denoiser update performance when rendering with multiple GPUs

Patrick Mours noreply at git.blender.org
Fri Aug 12 16:03:07 CEST 2022


Commit: 79787bf8e1e1d766e34dc6f8c5eda2efcceaa6cc
Author: Patrick Mours
Date:   Fri Aug 12 15:49:30 2022 +0200
Branches: blender-v3.3-release
https://developer.blender.org/rB79787bf8e1e1d766e34dc6f8c5eda2efcceaa6cc

Cycles: Improve denoiser update performance when rendering with multiple GPUs

This patch causes the render buffers to be copied to the denoiser
device only once before denoising; output and display are then fed
from that single buffer on the denoiser device. That way, usually all
but one copy (the one from the render devices to the denoiser device)
can be eliminated, provided that the denoiser device is also the
display device (in which case interop is used to update the display).
As such, this patch also adds some logic that tries to ensure the
chosen denoiser device is the same as the display device.

Differential Revision: https://developer.blender.org/D15657

===================================================================

M	intern/cycles/device/cuda/device_impl.cpp
M	intern/cycles/device/optix/device_impl.cpp
M	intern/cycles/integrator/denoiser.cpp
M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/integrator/path_trace.h
M	intern/cycles/integrator/path_trace_tile.cpp
M	intern/cycles/integrator/path_trace_tile.h
M	intern/cycles/session/session.cpp

===================================================================

diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 00851a8e91c..01c021551f3 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -1202,11 +1202,11 @@ bool CUDADevice::should_use_graphics_interop()
   }
 
   vector<CUdevice> gl_devices(num_all_devices);
-  uint num_gl_devices;
+  uint num_gl_devices = 0;
   cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
 
-  for (CUdevice gl_device : gl_devices) {
-    if (gl_device == cuDevice) {
+  for (uint i = 0; i < num_gl_devices; ++i) {
+    if (gl_devices[i] == cuDevice) {
       return true;
     }
   }
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 151983667c0..94a46acaf18 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -39,6 +39,9 @@ CCL_NAMESPACE_BEGIN
 // The original code is Copyright NVIDIA Corporation, BSD-3-Clause.
 namespace {
 
+#  if OPTIX_ABI_VERSION >= 60
+using ::optixUtilDenoiserInvokeTiled;
+#  else
 static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input,
                                                const OptixImage2D &output,
                                                unsigned int overlapWindowSizeInPixels,
@@ -215,6 +218,7 @@ static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
   }
   return OPTIX_SUCCESS;
 }
+#  endif
 
 #  if OPTIX_ABI_VERSION >= 55
 static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
index 94991d63e4c..831bd3a4407 100644
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -101,10 +101,17 @@ static Device *find_best_device(Device *device, DenoiserType type)
     if ((sub_device->info.denoisers & type) == 0) {
       return;
     }
+
     if (!best_device) {
       best_device = sub_device;
     }
     else {
+      /* Prefer a device that can use graphics interop for faster display update. */
+      if (sub_device->should_use_graphics_interop() &&
+          !best_device->should_use_graphics_interop()) {
+        best_device = sub_device;
+      }
+
       /* TODO(sergey): Choose fastest device from available ones. Taking into account performance
        * of the device and data transfer cost. */
     }
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index ed278821b46..3ec7b601d9f 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -26,6 +26,7 @@ PathTrace::PathTrace(Device *device,
                      RenderScheduler &render_scheduler,
                      TileManager &tile_manager)
     : device_(device),
+      film_(film),
       device_scene_(device_scene),
       render_scheduler_(render_scheduler),
       tile_manager_(tile_manager)
@@ -60,7 +61,17 @@ PathTrace::~PathTrace()
 void PathTrace::load_kernels()
 {
   if (denoiser_) {
+    /* Activate graphics interop while denoiser device is created, so that it can choose a device
+     * that supports interop for faster display updates. */
+    if (display_ && path_trace_works_.size() > 1) {
+      display_->graphics_interop_activate();
+    }
+
     denoiser_->load_kernels(progress_);
+
+    if (display_ && path_trace_works_.size() > 1) {
+      display_->graphics_interop_deactivate();
+    }
   }
 }
 
@@ -506,28 +517,30 @@ void PathTrace::denoise(const RenderWork &render_work)
   const double start_time = time_dt();
 
   RenderBuffers *buffer_to_denoise = nullptr;
-
-  unique_ptr<RenderBuffers> multi_device_buffers;
   bool allow_inplace_modification = false;
 
-  if (path_trace_works_.size() == 1) {
-    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  Device *denoiser_device = denoiser_->get_denoiser_device();
+  if (path_trace_works_.size() > 1 && denoiser_device && !big_tile_denoise_work_) {
+    big_tile_denoise_work_ = PathTraceWork::create(denoiser_device, film_, device_scene_, nullptr);
   }
-  else {
-    Device *denoiser_device = denoiser_->get_denoiser_device();
-    if (!denoiser_device) {
-      return;
-    }
 
-    multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
-    multi_device_buffers->reset(render_state_.effective_big_tile_params);
+  if (big_tile_denoise_work_) {
+    big_tile_denoise_work_->set_effective_buffer_params(render_state_.effective_big_tile_params,
+                                                        render_state_.effective_big_tile_params,
+                                                        render_state_.effective_big_tile_params);
 
-    buffer_to_denoise = multi_device_buffers.get();
+    buffer_to_denoise = big_tile_denoise_work_->get_render_buffers();
+    buffer_to_denoise->reset(render_state_.effective_big_tile_params);
 
-    copy_to_render_buffers(multi_device_buffers.get());
+    copy_to_render_buffers(buffer_to_denoise);
 
     allow_inplace_modification = true;
   }
+  else {
+    DCHECK_EQ(path_trace_works_.size(), 1);
+
+    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  }
 
   if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
                                 buffer_to_denoise,
@@ -536,14 +549,6 @@ void PathTrace::denoise(const RenderWork &render_work)
     render_state_.has_denoised_result = true;
   }
 
-  if (multi_device_buffers) {
-    multi_device_buffers->copy_from_device();
-    parallel_for_each(
-        path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
-          path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
-        });
-  }
-
   render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
 }
 
@@ -635,8 +640,13 @@ void PathTrace::update_display(const RenderWork &render_work)
     /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
      * all works in parallel. */
     const int num_samples = get_num_samples_in_buffer();
-    for (auto &&path_trace_work : path_trace_works_) {
-      path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+    if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+      big_tile_denoise_work_->copy_to_display(display_.get(), pass_mode, num_samples);
+    }
+    else {
+      for (auto &&path_trace_work : path_trace_works_) {
+        path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+      }
     }
 
     display_->update_end();
@@ -721,11 +731,10 @@ void PathTrace::write_tile_buffer(const RenderWork &render_work)
     VLOG_WORK << "Write tile result via buffer write callback.";
     tile_buffer_write();
   }
-
   /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
    */
-  if (has_multiple_tiles) {
-    VLOG_WORK << "Write tile result into .";
+  else {
+    VLOG_WORK << "Write tile result to disk.";
     tile_buffer_write_to_disk();
   }
 }
@@ -901,6 +910,10 @@ bool PathTrace::copy_render_tile_from_device()
     return true;
   }
 
+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    return big_tile_denoise_work_->copy_render_buffers_from_device();
+  }
+
   bool success = true;
 
   parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1002,6 +1015,10 @@ bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
     return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
   }
 
+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    return big_tile_denoise_work_->get_render_tile_pixels(pass_accessor, destination);
+  }
+
   bool success = true;
 
   parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1082,6 +1099,10 @@ void PathTrace::destroy_gpu_resources()
     for (auto &&path_trace_work : path_trace_works_) {
       path_trace_work->destroy_gpu_resources(display_.get());
     }
+
+    if (big_tile_denoise_work_) {
+      big_tile_denoise_work_->destroy_gpu_resources(display_.get());
+    }
   }
 }
 
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
index a470a6e1402..9531e4fb186 100644
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -236,6 +236,7 @@ class PathTrace {
   /* CPU device for creating temporary render buffers on the CPU side. */
   unique_ptr<Device> cpu_device_;
 
+  Film *film_;
   DeviceScene *device_scene_;
 
   RenderScheduler &render_scheduler_;
@@ -261,6 +262,9 @@ class PathTrace {
   /* Denoiser which takes care of denoising the big tile. */
   unique_ptr<Denoiser> denoiser_;
 
+  /* Denoiser device descriptor which holds the denoised big tile for multi-device workloads. */
+  unique_ptr<PathTraceWork> big_tile_denoise_work_;
+
   /* State which is common for all the steps of the render work.
    * Is brought up to date in the `render()` call and is accessed from all the steps involved into
    * rendering the work. */
diff --git a/intern/cycles/integrator/path_trace_tile.cpp b/intern/cycles/integrator/path_trace_tile.cpp
index 2f1f4e810a3..dfe88695013 100644
--- a/intern/cycles/integrator/path_trace_tile.cpp
+++ b/intern/cycles/integrator/path_trace_tile.cpp
@@ -33,7 +33,7 @@ bool PathTraceTile::get_pass_pixels(const string_view pass_name,
   if (!copied_from_device_) {
     /* Copy from device on demand. */
     path_trace_.copy_render_tile_from_device();
-    const_cast<PathTraceTile *>(this)->copied_from_device_ = true;
+    copied_from_device_ = true;
   }
 
   const BufferParams &buffer_params = path_trace_.get_render_tile_params();
diff --git a/intern/cycles/integrator/path_trace_tile.h b/intern/cycles/integrator/path_trace_tile.h
index 99ae08d04d1..223fa96e113 100644
--- a/intern/cycles/integrator/path_trace_tile.h
+++ b/intern/cycles/integrator/path_trace_tile.h
@@ -24,7 +24,7 @@ class PathTraceTile : public OutputDriver::Tile {
 
  private:
   P

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list