[Bf-blender-cvs] [9f7d84b656f] master: Cycles: Add support for P2P memory distribution (e.g. via NVLink)

Patrick Mours noreply at git.blender.org
Mon Jun 8 17:56:33 CEST 2020


Commit: 9f7d84b656fbb56966620ecc249ce5bc7089a1d1
Author: Patrick Mours
Date:   Mon Jun 8 17:16:10 2020 +0200
Branches: master
https://developer.blender.org/rB9f7d84b656fbb56966620ecc249ce5bc7089a1d1

Cycles: Add support for P2P memory distribution (e.g. via NVLink)

This change modifies the multi-device implementation to support distributing memory
across devices, which reduces the overall memory footprint of large scenes and lets
scenes that previously had to fall back to host memory fit entirely into the combined GPU memory.
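
Conceptually, instead of duplicating every allocation on every sub-device, the
multi-device can pick one resident device per group of peer-connected devices
(a "peer island") and let the other members access that allocation over the
interconnect. A minimal sketch of that idea, with illustrative names
(peer_islands, owner) that are not the verbatim Cycles code:

  /* Sketch only: one resident copy per peer island instead of one per device. */
  for (std::vector<Device *> &island : peer_islands) {
    Device *owner = island.front(); /* resident device for this island */
    owner->mem_alloc(mem);          /* peers reach the copy via P2P/NVLink */
  }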

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D7426

===================================================================

M	intern/cycles/blender/addon/properties.py
M	intern/cycles/blender/blender_device.cpp
M	intern/cycles/blender/blender_python.cpp
M	intern/cycles/device/cuda/device_cuda.h
M	intern/cycles/device/cuda/device_cuda_impl.cpp
M	intern/cycles/device/device.cpp
M	intern/cycles/device/device.h
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_memory.cpp
M	intern/cycles/device/device_memory.h
M	intern/cycles/device/device_multi.cpp

===================================================================

diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index da18ac7c693..1635afab210 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1535,6 +1535,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 
     devices: bpy.props.CollectionProperty(type=CyclesDeviceSettings)
 
+    peer_memory: BoolProperty(
+        name="Distribute memory across devices",
+        description="Make more room for large scenes to fit by distributing memory across interconnected devices (e.g. via NVLink) rather than duplicating it",
+        default=False,
+    )
+
     def find_existing_device_entry(self, device):
         for device_entry in self.devices:
             if device_entry.id == device[2] and device_entry.type == device[1]:
@@ -1632,14 +1638,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         row = layout.row()
         row.prop(self, "compute_device_type", expand=True)
 
-        devices = self.get_devices_for_type(self.compute_device_type)
+        if self.compute_device_type == 'NONE':
+            return
         row = layout.row()
-        if self.compute_device_type == 'CUDA':
-            self._draw_devices(row, 'CUDA', devices)
-        elif self.compute_device_type == 'OPTIX':
-            self._draw_devices(row, 'OPTIX', devices)
-        elif self.compute_device_type == 'OPENCL':
-            self._draw_devices(row, 'OPENCL', devices)
+        devices = self.get_devices_for_type(self.compute_device_type)
+        self._draw_devices(row, self.compute_device_type, devices)
+
+        import _cycles
+        has_peer_memory = 0
+        for device in _cycles.available_devices(self.compute_device_type):
+            if device[3] and self.find_existing_device_entry(device).use:
+                has_peer_memory += 1
+        if has_peer_memory > 1:
+            row = layout.row()
+            row.use_property_split = True
+            row.prop(self, "peer_memory")
 
     def draw(self, context):
         self.draw_impl(self.layout, context)
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index 5140f190f36..3a923459782 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -113,6 +113,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
         device = Device::get_multi_device(used_devices, threads, background);
       }
       /* Else keep using the CPU device that was set before. */
+
+      if (!get_boolean(cpreferences, "peer_memory")) {
+        device.has_peer_memory = false;
+      }
     }
   }
 
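The has_peer_memory flag that the preference can veto above is presumably the
aggregate of the sub-device flags; a plausible sketch (not the verbatim
Device::get_multi_device implementation) of how it could be derived:

  DeviceInfo info;
  info.has_peer_memory = false;
  for (const DeviceInfo &subdevice : subdevices) {
    /* The multi-device advertises peer memory if any sub-device supports it. */
    info.has_peer_memory |= subdevice.has_peer_memory;
  }
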
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 79c16856462..0be19dbffd1 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -416,10 +416,11 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
   for (size_t i = 0; i < devices.size(); i++) {
     DeviceInfo &device = devices[i];
     string type_name = Device::string_from_type(device.type);
-    PyObject *device_tuple = PyTuple_New(3);
+    PyObject *device_tuple = PyTuple_New(4);
     PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
     PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
     PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
+    PyTuple_SET_ITEM(device_tuple, 3, PyBool_FromLong(device.has_peer_memory));
     PyTuple_SET_ITEM(ret, i, device_tuple);
   }
 
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 3f23f0fe4c5..9f31ed12cf4 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -51,6 +51,7 @@ class CUDADevice : public Device {
   size_t map_host_used;
   size_t map_host_limit;
   int can_map_host;
+  int pitch_alignment;
   int cuDevId;
   int cuDevArchitecture;
   bool first_error;
@@ -111,6 +112,8 @@ class CUDADevice : public Device {
 
   bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
 
+  bool check_peer_access(Device *peer_device);
+
   bool use_adaptive_compilation();
 
   bool use_split_kernel();
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index acf53c3eb1b..64c7f5e7d34 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -207,6 +207,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
   map_host_limit = 0;
   map_host_used = 0;
   can_map_host = 0;
+  pitch_alignment = 0;
 
   functions.loaded = false;
 
@@ -224,6 +225,9 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
   cuda_assert(
       cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
 
+  cuda_assert(cuDeviceGetAttribute(
+      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
   unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
   if (can_map_host) {
     ctx_flags |= CU_CTX_MAP_HOST;
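
The texture pitch alignment queried here is presumably needed so that rows of
2D textures can be padded to a pitch that peer devices can also bind a texture
object to. A minimal sketch of such rounding (hypothetical variables, not the
truncated implementation):

  /* Round the row pitch up to the device's texture pitch alignment. */
  size_t dst_pitch = src_pitch;
  if (pitch_alignment > 0) {
    dst_pitch = ((src_pitch + pitch_alignment - 1) / pitch_alignment) *
                pitch_alignment;
  }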
@@ -286,6 +290,49 @@ bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_feat
   return true;
 }
 
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+  if (peer_device == this) {
+    return false;
+  }
+  if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+    return false;
+  }
+
+  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+  int can_access = 0;
+  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+  if (can_access == 0) {
+    return false;
+  }
+
+  // Ensure array access over the link is possible as well (for 3D textures)
+  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+                                      CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
+                                      cuDevice,
+                                      peer_device_cuda->cuDevice));
+  if (can_access == 0) {
+    return false;
+  }
+
+  // Enable peer access in both directions
+  {
+    const CUDAContextScope scope(this);
+    if (cuda_error(cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0))) {
+      return false;
+    }
+  }
+  {
+    const CUDAContextScope scope(peer_device_cuda);
+    if (cuda_error(cuCtxEnablePeerAccess(cuContext, 0))) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 bool CUDADevice::use_adaptive_compilation()
 {
   return DebugFlags().cuda.adaptive_compile;
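
check_peer_access() gives the multi-device what it needs to group sub-devices
into fully connected peer islands. A hedged sketch of such grouping, assuming a
virtual Device::check_peer_access() as suggested by the modified device.h (the
actual device_multi.cpp change is truncated below):

  /* Sketch: put each device into the first island it has P2P access to,
   * otherwise start a new island. Illustrative only. */
  std::vector<std::vector<Device *>> peer_islands;
  for (Device *device : devices) {
    bool placed = false;
    for (std::vector<Device *> &island : peer_islands) {
      if (device->check_peer_access(island.front())) {
        island.push_back(device);
        placed = true;
        break;
      }
    }
    if (!placed) {
      peer_islands.push_back({device});
    }
  }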
@@ -674,6 +721,12 @@ void CUDADevice::load_texture_info()
 
 void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
 {
+  /* Break out of recursive call, which can happen when moving memory on a multi device. */
+  static bool any_device_moving_textures_to_host = false;
+  if (any_device_moving_textures_to_host) {
+    return;
+  }
+
   /* Signal to reallocate textures in host memory only. */
   move_texture_to_host = true;
 
@@ -687,6 +740,12 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
       device_memory &mem = *pair.first;
       CUDAMem *cmem = &pair.second;
 
+      /* Can only move textures allocated on this device (and not those from peer devices).
+       * And need to ignore memory that is already on the host. */
+      if (!mem.is_resident(this) || cmem->use_mapped_host) {
+        continue;
+      }
+
       bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
                         (&mem != &texture_info);
       bool is_image = is_texture && (mem.data_height > 1);
@@ -696,11 +755,6 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
         continue;
       }
 
-      /* Already in host memory. */
-      if (cmem->use_mapped_host) {
-        continue;
-      }
-
       /* For other textures, only move image textures. */
       if (for_texture && !is_image) {
         continue;
@@ -723,26 +777,30 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
       static thread_mutex move_mutex;
       thread_scoped_lock lock(move_mutex);
 
-      /* Preserve the original device pointer, in case of multi device
-       * we can't change it because the pointer mapping would break. */
-      device_ptr prev_pointer = max_mem->device_pointer;
-      size_t prev_size = max_mem->device_size;
+      any_device_moving_textures_to_host = true;
 
-      mem_copy_to(*max_mem);
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it so happens to do an allocation at the same time as well). */
+      max_mem->device_copy_to();
       size = (max_size >= size) ? 0 : size - max_size;
 
-      max_mem->device_pointer = prev_pointer;
-      max_mem->device_size = prev_size;
+      any_device_moving_textures_to_host = false;
     }
     else {
       break;
     }
   }
 
+  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+  move_texture_to_host = false;
+
   /* Update texture info array with new pointers. */
   load_texture_info();
-
-  move_texture_to_host = false;
 }
 
 CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
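
The static any_device_moving_textures_to_host flag added above breaks a
recursion that the comments hint at; simplified, the call chain it guards
against looks roughly like this (inferred, not verbatim):

  /* CUDADevice::generic_alloc()              -- device A runs out of memory
   *   -> move_textures_to_host()
   *     -> device_memory::device_copy_to()   -- re-enters via the multi-device
   *       -> CUDADevice::generic_alloc()     -- reallocates on another device
   *         -> move_textures_to_host()       -- would recurse without the flag */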
@@ -808,9 +866,6 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
       map_host_used += size;
       status = " in host memory";
     }
-    else {
-      status = " failed, out of host memory";
-    }
   }
 
   if (mem_alloc_result != CUDA_SUCCESS) {
@@ -906,7 +961,7 @@ void CUDADevice::generic_free(device_memory &mem)
     }
     else {
       /* Free device memory. */
-     

@@ Diff output truncated at 10240 characters. @@


