[Bf-blender-cvs] [f367f1e5a55] master: Cycles: Improve OptiX viewport denoising performance with CUDA rendering

Patrick Mours noreply at git.blender.org
Wed Jun 10 14:12:43 CEST 2020


Commit: f367f1e5a55e1c657f9d2088f6537fb2e73492f0
Author: Patrick Mours
Date:   Tue Jun 9 19:46:16 2020 +0200
Branches: master
https://developer.blender.org/rBf367f1e5a55e1c657f9d2088f6537fb2e73492f0

Cycles: Improve OptiX viewport denoising performance with CUDA rendering

With this patch Cycles recognizes when a logical OptiX and CUDA device represent the same
physical GPU and attempts to eliminate unnecessary tile copies for viewport rendering if that
is the case for all active devices. In addition, denoising is now no longer performed on the first
available OptiX device only, but instead it will try to match CUDA and OptiX
rendering/denoising devices exactly to maximize utilization.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D7975

===================================================================

M	intern/cycles/blender/blender_device.cpp
M	intern/cycles/device/device_multi.cpp
M	intern/cycles/device/device_task.cpp
M	intern/cycles/render/session.cpp

===================================================================

diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index 3a923459782..ac52948806c 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -141,10 +141,25 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
         device.multi_devices.push_back(device);
       }
 
-      /* Simply use the first available OptiX device. */
-      const DeviceInfo optix_device = optix_devices.front();
-      device.id += optix_device.id; /* Uniquely identify this special multi device. */
-      device.denoising_devices.push_back(optix_device);
+      /* Try to use the same physical devices for denoising. */
+      for (const DeviceInfo &cuda_device : device.multi_devices) {
+        if (cuda_device.type == DEVICE_CUDA) {
+          for (const DeviceInfo &optix_device : optix_devices) {
+            if (cuda_device.num == optix_device.num) {
+              device.id += optix_device.id;
+              device.denoising_devices.push_back(optix_device);
+              break;
+            }
+          }
+        }
+      }
+
+      if (device.denoising_devices.empty()) {
+        /* Simply use the first available OptiX device. */
+        const DeviceInfo optix_device = optix_devices.front();
+        device.id += optix_device.id; /* Uniquely identify this special multi device. */
+        device.denoising_devices.push_back(optix_device);
+      }
     }
   }
 
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 77ede3bf62a..ef1687ddd3a 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -43,6 +43,7 @@ class MultiDevice : public Device {
   list<SubDevice> devices, denoising_devices;
   device_ptr unique_key;
   vector<vector<SubDevice *>> peer_islands;
+  bool matching_rendering_and_denoising_devices;
 
   MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
       : Device(info, stats, profiler, background_), unique_key(1)
@@ -66,8 +67,8 @@ class MultiDevice : public Device {
     }
 
     foreach (DeviceInfo &subinfo, info.denoising_devices) {
-      denoising_devices.emplace_back();
-      SubDevice *sub = &denoising_devices.back();
+      denoising_devices.emplace_front();
+      SubDevice *sub = &denoising_devices.front();
 
       sub->device = Device::create(subinfo, sub->stats, profiler, background);
     }
@@ -96,6 +97,27 @@ class MultiDevice : public Device {
       }
     }
 
+    /* Try to re-use memory when denoising and render devices use the same physical devices
+     * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU).
+     * Ordering has to match as well, so that 'DeviceTask::split' behaves consistent. */
+    matching_rendering_and_denoising_devices = denoising_devices.empty() ||
+                                               (devices.size() == denoising_devices.size());
+    if (matching_rendering_and_denoising_devices) {
+      for (list<SubDevice>::iterator device_it = devices.begin(),
+                                     denoising_device_it = denoising_devices.begin();
+           device_it != devices.end() && denoising_device_it != denoising_devices.end();
+           ++device_it, ++denoising_device_it) {
+        const DeviceInfo &info = device_it->device->info;
+        const DeviceInfo &denoising_info = denoising_device_it->device->info;
+        if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) ||
+            (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) ||
+            info.num != denoising_info.num) {
+          matching_rendering_and_denoising_devices = false;
+          break;
+        }
+      }
+    }
+
 #ifdef WITH_NETWORK
     /* try to add network devices */
     ServerDiscovery discovery(true);
@@ -232,7 +254,7 @@ class MultiDevice : public Device {
 
   SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
   {
-    assert(sub.peer_island_index >= 0 && key != 0);
+    assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
 
     /* Get the memory owner of this key (first try current device, then peer devices) */
     SubDevice *owner_sub = &sub;
@@ -377,6 +399,9 @@ class MultiDevice : public Device {
      * Similarily the tile buffers also need to be allocated separately on all devices so any
      * overlap rendered for denoising does not interfer with each other */
     if (strcmp(mem.name, "RenderBuffers") == 0) {
+      vector<device_ptr> device_pointers;
+      device_pointers.reserve(devices.size());
+
       foreach (SubDevice &sub, devices) {
         mem.device = sub.device;
         mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
@@ -384,14 +409,22 @@ class MultiDevice : public Device {
 
         sub.device->mem_zero(mem);
         sub.ptr_map[key] = mem.device_pointer;
+
+        device_pointers.push_back(mem.device_pointer);
       }
       foreach (SubDevice &sub, denoising_devices) {
-        mem.device = sub.device;
-        mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
-        mem.device_size = existing_size;
+        if (matching_rendering_and_denoising_devices) {
+          sub.ptr_map[key] = device_pointers.front();
+          device_pointers.erase(device_pointers.begin());
+        }
+        else {
+          mem.device = sub.device;
+          mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+          mem.device_size = existing_size;
 
-        sub.device->mem_zero(mem);
-        sub.ptr_map[key] = mem.device_pointer;
+          sub.device->mem_zero(mem);
+          sub.ptr_map[key] = mem.device_pointer;
+        }
       }
     }
     else {
@@ -427,12 +460,17 @@ class MultiDevice : public Device {
         sub.ptr_map.erase(sub.ptr_map.find(key));
       }
       foreach (SubDevice &sub, denoising_devices) {
-        mem.device = sub.device;
-        mem.device_pointer = sub.ptr_map[key];
-        mem.device_size = existing_size;
+        if (matching_rendering_and_denoising_devices) {
+          sub.ptr_map.erase(key);
+        }
+        else {
+          mem.device = sub.device;
+          mem.device_pointer = sub.ptr_map[key];
+          mem.device_size = existing_size;
 
-        sub.device->mem_free(mem);
-        sub.ptr_map.erase(sub.ptr_map.find(key));
+          sub.device->mem_free(mem);
+          sub.ptr_map.erase(sub.ptr_map.find(key));
+        }
       }
     }
     else {
@@ -553,7 +591,7 @@ class MultiDevice : public Device {
       device_vector<float> &mem = tiles[i].buffers->buffer;
       tiles[i].buffer = mem.device_pointer;
 
-      if (mem.device == this && denoising_devices.empty()) {
+      if (mem.device == this && matching_rendering_and_denoising_devices) {
         /* Skip unnecessary copies in viewport mode (buffer covers the
          * whole image), but still need to fix up the tile device pointer. */
         map_tile(sub_device, tiles[i]);
@@ -597,7 +635,7 @@ class MultiDevice : public Device {
   {
     device_vector<float> &mem = tiles[9].buffers->buffer;
 
-    if (mem.device == this && denoising_devices.empty()) {
+    if (mem.device == this && matching_rendering_and_denoising_devices) {
       return;
     }
 
@@ -670,23 +708,23 @@ class MultiDevice : public Device {
         DeviceTask subtask = tasks.front();
         tasks.pop_front();
 
-        if (task.type == DeviceTask::DENOISE_BUFFER && !denoising_devices.empty()) {
-          subtask.buffer = sub.ptr_map[task.buffer];
-        }
-        else {
-          if (task.buffer)
-            subtask.buffer = find_matching_mem(task.buffer, sub);
-          if (task.rgba_byte)
-            subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
-          if (task.rgba_half)
-            subtask.rgba_half = sub.ptr_map[task.rgba_half];
-          if (task.shader_input)
-            subtask.shader_input = find_matching_mem(task.shader_input, sub);
-          if (task.shader_output)
-            subtask.shader_output = find_matching_mem(task.shader_output, sub);
-        }
+        if (task.buffer)
+          subtask.buffer = find_matching_mem(task.buffer, sub);
+        if (task.rgba_byte)
+          subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
+        if (task.rgba_half)
+          subtask.rgba_half = sub.ptr_map[task.rgba_half];
+        if (task.shader_input)
+          subtask.shader_input = find_matching_mem(task.shader_input, sub);
+        if (task.shader_output)
+          subtask.shader_output = find_matching_mem(task.shader_output, sub);
 
         sub.device->task_add(subtask);
+
+        if (task.buffers && task.buffers->buffer.device == this) {
+          /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */
+          sub.device->task_wait();
+        }
       }
     }
   }
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index d2447eae867..7485e1b41de 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -44,7 +44,8 @@ DeviceTask::DeviceTask(Type type_)
       shader_eval_type(0),
       shader_filter(0),
       shader_x(0),
-      shader_w(0)
+      shader_w(0),
+      buffers(nullptr)
 {
   last_update_time = time_dt();
 }
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 361a1465aac..7c50140ecfe 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -436,6 +436,12 @@ bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_typ
     /* Reset copy state, since buffer contents change after the tile was acquired */
     buffers->map_neighbor_copied = false;
 
+    /* This hack ensures that the copy in 'MultiDevice::map_neighbor_tiles' accounts
+     * for the buffer resolution divider. */
+    buffers->buffer.data_width = (buffers->params.width * buffers->params.get_passes_size()) /
+                                 tile_manager.state.resolution_divider;
+    buffers->buffer.data_height = buffers->params.height / tile_manager.state.resolution_divider;
+
     return true;
 

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list