[Bf-blender-cvs] [336ca6796a7] blender-v3.0-release: Fix T90308: Cycles crash copying memory from device to host

Sergey Sharybin noreply at git.blender.org
Mon Nov 22 17:32:12 CET 2021


Commit: 336ca6796a7a6ee26ff6d889643df07a37efa554
Author: Sergey Sharybin
Date:   Mon Nov 22 17:07:55 2021 +0100
Branches: blender-v3.0-release
https://developer.blender.org/rB336ca6796a7a6ee26ff6d889643df07a37efa554

Fix T90308: Cycles crash copying memory from device to host

Happens when device runs out of memory and Cycles is moving some
textures to the host memory.

The delayed memory free for OptiX BVH was moving data from one
device_memory to another, leaving the original device memory in
an invalid state. This was ruining the allocation map in the CUDA
device which is using pointer to the device_memory.

This change makes it so the memory pointer is stolen from BVH
into the delayed memory free list.

Additionally, forbid copying and moving instances of device_memory
and added sanity checks in the device implementation.

Differential Revision: https://developer.blender.org/D13316

===================================================================

M	intern/cycles/bvh/optix.cpp
M	intern/cycles/bvh/optix.h
M	intern/cycles/device/cuda/device_impl.cpp
M	intern/cycles/device/hip/device_impl.cpp
M	intern/cycles/device/memory.cpp
M	intern/cycles/device/memory.h
M	intern/cycles/device/optix/device_impl.cpp
M	intern/cycles/device/optix/device_impl.h

===================================================================

diff --git a/intern/cycles/bvh/optix.cpp b/intern/cycles/bvh/optix.cpp
index ebc3fa68b97..671e1a42a31 100644
--- a/intern/cycles/bvh/optix.cpp
+++ b/intern/cycles/bvh/optix.cpp
@@ -30,15 +30,17 @@ BVHOptiX::BVHOptiX(const BVHParams &params_,
     : BVH(params_, geometry_, objects_),
       device(device),
       traversable_handle(0),
-      as_data(device, params_.top_level ? "optix tlas" : "optix blas", false),
-      motion_transform_data(device, "optix motion transform", false)
+      as_data(make_unique<device_only_memory<char>>(
+          device, params.top_level ? "optix tlas" : "optix blas", false)),
+      motion_transform_data(
+          make_unique<device_only_memory<char>>(device, "optix motion transform", false))
 {
 }
 
 BVHOptiX::~BVHOptiX()
 {
-  // Acceleration structure memory is delayed freed on device, since deleting the
-  // BVH may happen while still being used for rendering.
+  /* Acceleration structure memory is delayed freed on device, since deleting the
+   * BVH may happen while still being used for rendering. */
   device->release_optix_bvh(this);
 }
 
diff --git a/intern/cycles/bvh/optix.h b/intern/cycles/bvh/optix.h
index 037e54980bd..cb855d786bf 100644
--- a/intern/cycles/bvh/optix.h
+++ b/intern/cycles/bvh/optix.h
@@ -25,14 +25,16 @@
 
 #  include "device/memory.h"
 
+#  include "util/unique_ptr.h"
+
 CCL_NAMESPACE_BEGIN
 
 class BVHOptiX : public BVH {
  public:
   Device *device;
   uint64_t traversable_handle;
-  device_only_memory<char> as_data;
-  device_only_memory<char> motion_transform_data;
+  unique_ptr<device_only_memory<char>> as_data;
+  unique_ptr<device_only_memory<char>> motion_transform_data;
 
  protected:
   friend class BVH;
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 2bb0592bcc5..20945796a2d 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -777,6 +777,7 @@ void CUDADevice::generic_free(device_memory &mem)
   if (mem.device_pointer) {
     CUDAContextScope scope(this);
     thread_scoped_lock lock(cuda_mem_map_mutex);
+    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
     const CUDAMem &cmem = cuda_mem_map[&mem];
 
     /* If cmem.use_mapped_host is true, reference counting is used
@@ -1145,6 +1146,7 @@ void CUDADevice::tex_free(device_texture &mem)
   if (mem.device_pointer) {
     CUDAContextScope scope(this);
     thread_scoped_lock lock(cuda_mem_map_mutex);
+    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
     const CUDAMem &cmem = cuda_mem_map[&mem];
 
     if (cmem.texobject) {
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 76e300eb132..5b0e2951c39 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -745,6 +745,7 @@ void HIPDevice::generic_free(device_memory &mem)
   if (mem.device_pointer) {
     HIPContextScope scope(this);
     thread_scoped_lock lock(hip_mem_map_mutex);
+    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
     const HIPMem &cmem = hip_mem_map[&mem];
 
     /* If cmem.use_mapped_host is true, reference counting is used
@@ -1114,6 +1115,7 @@ void HIPDevice::tex_free(device_texture &mem)
   if (mem.device_pointer) {
     HIPContextScope scope(this);
     thread_scoped_lock lock(hip_mem_map_mutex);
+    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
     const HIPMem &cmem = hip_mem_map[&mem];
 
     if (cmem.texobject) {
diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp
index f162b00d9f7..86bf2542c92 100644
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@@ -44,45 +44,6 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
 {
 }
 
-device_memory::device_memory(device_memory &&other) noexcept
-    : data_type(other.data_type),
-      data_elements(other.data_elements),
-      data_size(other.data_size),
-      device_size(other.device_size),
-      data_width(other.data_width),
-      data_height(other.data_height),
-      data_depth(other.data_depth),
-      type(other.type),
-      name(other.name),
-      device(other.device),
-      device_pointer(other.device_pointer),
-      host_pointer(other.host_pointer),
-      shared_pointer(other.shared_pointer),
-      shared_counter(other.shared_counter),
-      original_device_ptr(other.original_device_ptr),
-      original_device_size(other.original_device_size),
-      original_device(other.original_device),
-      need_realloc_(other.need_realloc_),
-      modified(other.modified)
-{
-  other.data_elements = 0;
-  other.data_size = 0;
-  other.device_size = 0;
-  other.data_width = 0;
-  other.data_height = 0;
-  other.data_depth = 0;
-  other.device = 0;
-  other.device_pointer = 0;
-  other.host_pointer = 0;
-  other.shared_pointer = 0;
-  other.shared_counter = 0;
-  other.original_device_ptr = 0;
-  other.original_device_size = 0;
-  other.original_device = 0;
-  other.need_realloc_ = false;
-  other.modified = false;
-}
-
 device_memory::~device_memory()
 {
   assert(shared_pointer == 0);
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index 281c54cc6a5..e04142117aa 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -281,11 +281,16 @@ class device_memory {
 
   /* Only create through subclasses. */
   device_memory(Device *device, const char *name, MemoryType type);
-  device_memory(device_memory &&other) noexcept;
 
-  /* No copying allowed. */
+  /* No copying and allowed.
+   *
+   * This is because device implementation might need to register device memory in an allocation
+   * map of some sort and use pointer as a key to identify blocks. Moving data from one place to
+   * another bypassing device allocation routines will make those maps hard to maintain. */
   device_memory(const device_memory &) = delete;
+  device_memory(device_memory &&other) noexcept = delete;
   device_memory &operator=(const device_memory &) = delete;
+  device_memory &operator=(device_memory &&) = delete;
 
   /* Host allocation on the device. All host_pointer memory should be
    * allocated with these functions, for devices that support using
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index bb690551c04..6e897e3831f 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -1032,7 +1032,7 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
     return false;
   }
 
-  device_only_memory<char> &out_data = bvh->as_data;
+  device_only_memory<char> &out_data = *bvh->as_data;
   if (operation == OPTIX_BUILD_OPERATION_BUILD) {
     assert(out_data.device == this);
     out_data.alloc_to_device(sizes.outputSizeInBytes);
@@ -1123,7 +1123,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       operation = OPTIX_BUILD_OPERATION_UPDATE;
     }
     else {
-      bvh_optix->as_data.free();
+      bvh_optix->as_data->free();
       bvh_optix->traversable_handle = 0;
     }
 
@@ -1344,9 +1344,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
     unsigned int num_instances = 0;
     unsigned int max_num_instances = 0xFFFFFFFF;
 
-    bvh_optix->as_data.free();
+    bvh_optix->as_data->free();
     bvh_optix->traversable_handle = 0;
-    bvh_optix->motion_transform_data.free();
+    bvh_optix->motion_transform_data->free();
 
     optixDeviceContextGetProperty(context,
                                   OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
@@ -1379,8 +1379,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
         }
       }
 
-      assert(bvh_optix->motion_transform_data.device == this);
-      bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+      assert(bvh_optix->motion_transform_data->device == this);
+      bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
     }
 
     for (Object *ob : bvh->objects) {
@@ -1441,7 +1441,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
         motion_transform_offset = align_up(motion_transform_offset,
                                            OPTIX_TRANSFORM_BYTE_ALIGNMENT);
-        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
                                            motion_transform_offset;
         motion_transform_offset += motion_transform_size;
 
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 5cfc249b430..1b43972d99f 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -23,6 +23,7 @@
 #  include "device/optix/queue.h"
 #  include "device/optix/util.h"
 #  include "kernel/types.h"
+#  include "util/unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -76,7 +77,7 @@ class OptiXDevice : public CUDADevice {
   device_only_memory<KernelParamsOptiX> launch_params;
   OptixTraversableHandle tlas_handle = 0;
 
-  vector<device_only_memory<char>> delayed_free_bvh_memory;
+  vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
   thread_mutex delayed_free_bvh_mutex;
 
   class Denoiser {



More information about the Bf-blender-cvs mailing list