[Bf-blender-cvs] [8b4acade6c1] cycles-x: Cycles X: Initial support of multi-GPU and GPU+CPU rendering

Sergey Sharybin noreply at git.blender.org
Tue Jun 29 12:20:22 CEST 2021


Commit: 8b4acade6c10d2978584e5819aace106046435f8
Author: Sergey Sharybin
Date:   Wed Jun 23 09:57:06 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB8b4acade6c10d2978584e5819aace106046435f8

Cycles X: Initial support of multi-GPU and GPU+CPU rendering

This change makes it possible to render a single frame on multiple GPUs
and/or GPU(s)+CPU (as configured in the User Preferences).

Work is split equally along the height of the big tile.
In the future this will be improved with a better initial guess based on
device performance, dynamic re-scheduling, and interleaving of scanlines
across devices.
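
As a rough illustration (hypothetical names, not the actual PathTrace
scheduling code), an equal split of the big tile height across devices
could look like:

  #include <vector>

  /* Hypothetical sketch of splitting the big tile height equally across the
   * available devices; remainder scanlines go to the first few devices. */
  static void split_height_equally(int full_height, int num_devices, std::vector<int> &heights)
  {
    heights.resize(num_devices);
    for (int i = 0; i < num_devices; ++i) {
      heights[i] = full_height / num_devices + (i < full_height % num_devices ? 1 : 0);
    }
  }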

The main idea is to move render buffers to a per-work basis, so that the
render buffers are always associated with the device the work is being
done on. Upon access, the read/write is then delegated to the work, so
that it operates on a specific slice of the source/destination.
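
As a minimal sketch of that delegation (hypothetical names, assuming a
simple scanline slice per work):

  #include <cstddef>
  #include <cstring>

  /* Hypothetical sketch: every work owns buffers for its slice of the big
   * tile and copies them into the full-frame destination at its own offset. */
  struct WorkSlice {
    int start_y; /* First scanline of the slice within the big tile. */
    int height;  /* Number of scanlines covered by this work. */
  };

  static void copy_slice_to_destination(const WorkSlice &slice,
                                        const float *slice_pixels,
                                        float *full_pixels,
                                        int width,
                                        int num_components)
  {
    /* Offset in pixels from the beginning of the full-frame destination,
     * similar in spirit to the `offset` field added to PassAccessor. */
    const size_t pixel_offset = size_t(slice.start_y) * width;
    memcpy(full_pixels + pixel_offset * num_components,
           slice_pixels,
           sizeof(float) * size_t(slice.height) * width * num_components);
  }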

There are multiple memory and performance improvements possible,
such as:

- Copy render result to GPUDisplay from multiple threads (now that it
  is clear graphics interop cannot be mixed with the naive update);
  see the sketch after this list.
- Avoid denoiser buffer re-allocation.
- Avoid creation of temporary buffers in the denoisers when we know
  that we have a copy of real buffers.
- Only copy passes needed for denoiser, and results of denoiser.
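
For the first point above, a hedged sketch of what the multi-threaded
update could look like (using tbb::parallel_for as elsewhere in the
integrator; `copy_to_gpu_display` is a hypothetical per-work method):

  #include <memory>
  #include <vector>

  #include <tbb/parallel_for.h>

  /* Hypothetical sketch: once graphics interop is ruled out for the naive
   * update, each work's slice could be copied to the GPUDisplay texture from
   * its own thread, each work writing at its own texture_y offset. */
  static void update_display_from_all_works(
      const std::vector<std::unique_ptr<PathTraceWork>> &path_trace_works,
      GPUDisplay *gpu_display)
  {
    tbb::parallel_for(size_t(0), path_trace_works.size(), [&](size_t i) {
      /* Each work writes only its own slice, so no synchronization is needed. */
      path_trace_works[i]->copy_to_gpu_display(gpu_display);
    });
  }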

The current state of `PathTrace::denoise()` is not entirely ideal:
it could be split up, and memory usage could be improved. But it is
good enough for the initial implementation. Further improvements
would require changes in the Denoiser API.

Differential Revision: https://developer.blender.org/D11727

===================================================================

M	intern/cycles/blender/blender_gpu_display.cpp
M	intern/cycles/blender/blender_gpu_display.h
M	intern/cycles/integrator/denoiser_device.cpp
M	intern/cycles/integrator/pass_accessor.cpp
M	intern/cycles/integrator/pass_accessor.h
M	intern/cycles/integrator/pass_accessor_cpu.cpp
M	intern/cycles/integrator/pass_accessor_gpu.cpp
M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/integrator/path_trace.h
M	intern/cycles/integrator/path_trace_work.cpp
M	intern/cycles/integrator/path_trace_work.h
M	intern/cycles/integrator/path_trace_work_cpu.cpp
M	intern/cycles/integrator/path_trace_work_cpu.h
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/render/gpu_display.cpp
M	intern/cycles/render/gpu_display.h

===================================================================

diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_gpu_display.cpp
index 2cac31d826f..a02547fb5c6 100644
--- a/intern/cycles/blender/blender_gpu_display.cpp
+++ b/intern/cycles/blender/blender_gpu_display.cpp
@@ -372,22 +372,35 @@ void BlenderGPUDisplay::do_update_end()
  * Texture update from CPU buffer.
  */
 
-void BlenderGPUDisplay::do_copy_pixels_to_texture(const half4 *rgba_pixels)
+void BlenderGPUDisplay::do_copy_pixels_to_texture(
+    const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
 {
   /* This call copies pixels to a Pixel Buffer Object (PBO) which is much cheaper from CPU time
    * point of view than to copy data directly to the OpenGL texture.
    *
    * The possible downside of this approach is that it might require a higher peak memory when
    * doing partial updates of the texture (although, in practice even partial updates might peak
-   * with a full-frame buffer stored on the CPU if the GPU is currently occupied), */
+   * with a full-frame buffer stored on the CPU if the GPU is currently occupied). */
 
   half4 *mapped_rgba_pixels = map_texture_buffer();
   if (!mapped_rgba_pixels) {
     return;
   }
 
-  const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height;
-  memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
+  if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width &&
+      pixels_height == texture_.height) {
+    const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height;
+    memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
+  }
+  else {
+    const half4 *rgba_row = rgba_pixels;
+    half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width;
+    for (int y = 0; y < pixels_height;
+         ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) {
+      memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
+    }
+  }
+
   unmap_texture_buffer();
 }
 
diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_gpu_display.h
index dc4d9bbcca0..ccfbc358171 100644
--- a/intern/cycles/blender/blender_gpu_display.h
+++ b/intern/cycles/blender/blender_gpu_display.h
@@ -104,7 +104,11 @@ class BlenderGPUDisplay : public GPUDisplay {
   virtual bool do_update_begin(int texture_width, int texture_height) override;
   virtual void do_update_end() override;
 
-  virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels) override;
+  virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
+                                         int texture_x,
+                                         int texture_y,
+                                         int pixels_width,
+                                         int pixels_height) override;
   virtual void do_draw() override;
 
   virtual half4 *do_map_texture_buffer() override;
diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp
index 30097baf130..dec5b69e9a1 100644
--- a/intern/cycles/integrator/denoiser_device.cpp
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -60,7 +60,20 @@ void DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
 /* Check whether given device is single (not a MultiDevice) and supports requested denoiser. */
 static bool is_single_supported_device(Device *device, DenoiserType type)
 {
-  return (device->info.type != DEVICE_MULTI) && (device->info.denoisers & type);
+  if (device->info.type == DEVICE_MULTI) {
+    /* Assume multi-device is never created with a single sub-device.
+     * If one requests such configuration it should be checked on the session level. */
+    return false;
+  }
+
+  if (!device->info.multi_devices.empty()) {
+    /* Some configurations will use multi_devices, but keep the type of an individual device.
+     * This does simplify checks for homogenous setups, but here we really need a single device. */
+    return false;
+  }
+
+  /* Check the denoiser type is supported. */
+  return (device->info.denoisers & type);
 }
 
 /* Find best suitable device to perform denoiser on. Will iterate over possible sub-devices of
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index 8d54a96f992..aed9f053f35 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -354,7 +354,7 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
   const int num_components = source.num_components;
 
   float *out = buffer_data + pass_access_info_.offset;
-  const float *in = source.pixels;
+  const float *in = source.pixels + source.offset * num_components;
 
   for (int i = 0; i < size; i++, out += pass_stride, in += num_components) {
     memcpy(out, in, sizeof(float) * num_components);
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
index 859f3a65dd0..401917e744a 100644
--- a/intern/cycles/integrator/pass_accessor.h
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -69,7 +69,13 @@ class PassAccessor {
     device_ptr d_pixels = 0;
     device_ptr d_pixels_half_rgba = 0;
 
+    /* Number of components per pixel in the floating-point destination.
+     * Is ignored for half4 destination (where number of components is implied to be 4). */
     int num_components = 0;
+
+    /* Offset in pixels from the beginning of pixels storage.
+     * Allows to get pixels of render buffer into a partial slice of the destination. */
+    int offset = 0;
   };
 
   class Source {
@@ -80,6 +86,10 @@ class PassAccessor {
     /* CPU-side pointers. only usable by the `PassAccessorCPU`. */
     const float *pixels = nullptr;
     int num_components = 0;
+
+    /* Offset in pixels from the beginning of pixels storage.
+     * Allows to get pixels of render buffer into a partial slice of the destination. */
+    int offset = 0;
   };
 
   PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples);
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 406f300c59a..7591d8bf643 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -100,11 +100,12 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
   const float *buffer_data = render_buffers->buffer.data();
 
   tbb::parallel_for(0, buffer_params.height, [&](int y) {
-    int64_t pixel_index = y * buffer_params.width;
+    int64_t pixel_index = int64_t(y) * buffer_params.width;
     for (int x = 0; x < buffer_params.width; ++x, ++pixel_index) {
       const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
       const float *buffer = buffer_data + input_pixel_offset;
-      float *pixel = destination.pixels + pixel_index * destination.num_components;
+      float *pixel = destination.pixels +
+                     (pixel_index + destination.offset) * destination.num_components;
 
       processor(kfilm_convert, buffer, pixel);
     }
@@ -132,7 +133,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
 
       film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
 
-      half4 *pixel_half_rgba = destination.pixels_half_rgba + pixel_index;
+      half4 *pixel_half_rgba = destination.pixels_half_rgba + pixel_index + destination.offset;
       float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
     }
   });
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
index a62ea250f5a..a4a3d78ffed 100644
--- a/intern/cycles/integrator/pass_accessor_gpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -51,7 +51,8 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
                     const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
                     const_cast<int *>(&work_size),
                     const_cast<int *>(&buffer_params.offset),
-                    const_cast<int *>(&buffer_params.stride)};
+                    const_cast<int *>(&buffer_params.stride),
+                    const_cast<int *>(&destination.offset)};
 
     queue_->enqueue(kernel, work_size, args);
   }
@@ -63,7 +64,8 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
                     const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
                     const_cast<int *>(&work_size),
                     const_cast<int *>(&buffer_params.offset),
-                    const_cast<int *>(&buffer_params.stride)};
+                    const_cast<int *>(&buffer_params.stride),
+                    const_cast<int *>(&destination.offset)};
 
     queue_->enqueue(kernel_half_float, work_size, args);
   }
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 42efb845c66..09aa4e8a8da 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -16,6 +16,7 @@
 
 #include "integrator/path_trace.h"
 
+#include "device/cpu/device.h"
 #include "device/device.h"
 #include "integrator/pass_accessor.h"
 #include "integrator/render_scheduler.h"
@@ -37,22 +38,8 @@ PathTrace::PathTrace(Device *device, DeviceScene *device_scene, RenderScheduler
   /* Create path tracing work in advance, so that it can be reused by incremental sampling as much
    * as possible. */
   device_->foreach_device([&](Device *path_trace_device) {
-    if (!path_trace_works_.empty()) {
-      if (path_trace_works_.size() == 1) {
-        LOG(ERROR)
-            << "Multi-devices are not yet fully implemented, will render on a single device.";
-      }
-      return;
-    }
-
-    /* TODO(sergey): Need to create render buffer for every individual device, so that they can
-     * write directly to it. */
-    full_render_buffers_ = make_unique<RenderBuffers>(path_trace_device);
-
-    path_trace_works_.emplace_back(PathTraceWork::create(path_trace_device,
-                                                         device_scene,
-                          

@@ Diff output truncated at 10240 characters. @@


