[Bf-blender-cvs] [3d5dbc1c449] blender-v3.0-release: Cycles: Reintroduce device-only memory handling that got lost in Cycles X merge

Patrick Mours noreply at git.blender.org
Tue Jan 11 10:38:37 CET 2022


Commit: 3d5dbc1c44907c73d2e6e57a146cbadaea9623bd
Author: Patrick Mours
Date:   Mon Dec 6 14:58:35 2021 +0100
Branches: blender-v3.0-release
https://developer.blender.org/rB3d5dbc1c44907c73d2e6e57a146cbadaea9623bd

Cycles: Reintroduce device-only memory handling that got lost in Cycles X merge

Somehow only part of rBf4f8b6dde32b0438e0b97a6d8ebeb89802987127 ended up in
Cycles X, so the issue that commit fixed ("OPTIX_ERROR_INVALID_VALUE" when the
system runs out of memory) showed up again.
This adds the missing changes to fix that problem.

Maniphest Tasks: T93620

Differential Revision: https://developer.blender.org/D13488
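
(Editor's note, not part of the commit: a minimal, self-contained C++ sketch of the
allocation behaviour this patch restores. The helper names and the fake "device_free"
budget are placeholders, not Cycles or CUDA APIs; in Cycles the real logic lives in
CUDADevice::generic_alloc in the diff below.)

  #include <cstddef>
  #include <cstdio>

  enum MemoryType { MEM_READ_ONLY, MEM_DEVICE_ONLY };

  /* Stand-ins for cuMemAlloc / cuMemHostAlloc: pretend the device has 1 MiB
   * free and that pinned host memory is always available. */
  static std::size_t device_free = 1024 * 1024;

  static bool try_device_alloc(std::size_t size)
  {
    if (size > device_free) {
      return false;
    }
    device_free -= size;
    return true;
  }

  static bool try_pinned_host_alloc(std::size_t /*size*/)
  {
    return true;
  }

  /* Device-only memory must never spill into shared host memory; it either
   * stays on the device or fails with a clear out-of-memory error, instead of
   * surfacing later as OPTIX_ERROR_INVALID_VALUE. */
  static bool generic_alloc(MemoryType type, std::size_t size, bool can_map_host)
  {
    if (try_device_alloc(size)) {
      return true;
    }
    if (type != MEM_DEVICE_ONLY && can_map_host && try_pinned_host_alloc(size)) {
      return true;
    }
    if (type == MEM_DEVICE_ONLY) {
      std::fprintf(stderr, "System is out of GPU memory\n");
    }
    else {
      std::fprintf(stderr, "System is out of GPU and shared host memory\n");
    }
    return false;
  }

  int main()
  {
    generic_alloc(MEM_READ_ONLY, 2 * 1024 * 1024, true);   /* falls back to host */
    generic_alloc(MEM_DEVICE_ONLY, 2 * 1024 * 1024, true); /* GPU-only OOM error */
    return 0;
  }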

===================================================================

M	intern/cycles/device/cpu/device_impl.cpp
M	intern/cycles/device/cuda/device_impl.cpp
M	intern/cycles/device/optix/device_impl.cpp

===================================================================

diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index 68dec7f0af2..5db89d1e4fb 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -134,8 +134,7 @@ void CPUDevice::mem_alloc(device_memory &mem)
               << string_human_readable_size(mem.memory_size()) << ")";
     }
 
-    if (mem.type == MEM_DEVICE_ONLY) {
-      assert(!mem.host_pointer);
+    if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
       size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
       void *data = util_aligned_malloc(mem.memory_size(), alignment);
       mem.device_pointer = (device_ptr)data;
@@ -194,7 +193,7 @@ void CPUDevice::mem_free(device_memory &mem)
     tex_free((device_texture &)mem);
   }
   else if (mem.device_pointer) {
-    if (mem.type == MEM_DEVICE_ONLY) {
+    if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
       util_aligned_free((void *)mem.device_pointer);
     }
     mem.device_pointer = 0;
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 20945796a2d..8c5779f4a72 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -680,7 +680,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
 
   void *shared_pointer = 0;
 
-  if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
     if (mem.shared_pointer) {
       /* Another device already allocated host memory. */
       mem_alloc_result = CUDA_SUCCESS;
@@ -703,8 +703,14 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
   }
 
   if (mem_alloc_result != CUDA_SUCCESS) {
-    status = " failed, out of device and host memory";
-    set_error("System is out of GPU and shared host memory");
+    if (mem.type == MEM_DEVICE_ONLY) {
+      status = " failed, out of device memory";
+      set_error("System is out of GPU memory");
+    }
+    else {
+      status = " failed, out of device and host memory";
+      set_error("System is out of GPU and shared host memory");
+    }
   }
 
   if (mem.name) {
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index f230f865f60..b33b5e21eee 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -44,14 +44,14 @@
 CCL_NAMESPACE_BEGIN
 
 OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
-    : device(device), queue(device), state(device, "__denoiser_state")
+    : device(device), queue(device), state(device, "__denoiser_state", true)
 {
 }
 
 OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     : CUDADevice(info, stats, profiler),
       sbt_data(this, "__sbt", MEM_READ_ONLY),
-      launch_params(this, "__params"),
+      launch_params(this, "__params", false),
       denoiser_(this)
 {
   /* Make the CUDA context current. */
@@ -507,7 +507,7 @@ class OptiXDevice::DenoiseContext {
       : denoise_params(task.params),
         render_buffers(task.render_buffers),
         buffer_params(task.buffer_params),
-        guiding_buffer(device, "denoiser guiding passes buffer"),
+        guiding_buffer(device, "denoiser guiding passes buffer", true),
         num_samples(task.num_samples)
   {
     num_input_passes = 1;
@@ -1001,6 +1001,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
                                   const OptixBuildInput &build_input,
                                   uint16_t num_motion_steps)
 {
+  /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
+   * from running out of memory (since both original and compacted acceleration structure memory
+   * may be allocated at the same time for the duration of this function). The builds would
+   * otherwise happen on the same CUDA stream anyway. */
+  static thread_mutex mutex;
+  thread_scoped_lock lock(mutex);
+
   const CUDAContextScope scope(this);
 
   const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
@@ -1026,13 +1033,14 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
   optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
 
   /* Allocate required output buffers. */
-  device_only_memory<char> temp_mem(this, "optix temp as build mem");
+  device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
   temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
   if (!temp_mem.device_pointer) {
     /* Make sure temporary memory allocation succeeded. */
     return false;
   }
 
+  /* Acceleration structure memory has to be allocated on the device (not allowed on the host). */
   device_only_memory<char> &out_data = *bvh->as_data;
   if (operation == OPTIX_BUILD_OPERATION_BUILD) {
     assert(out_data.device == this);
@@ -1081,12 +1089,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
 
     /* There is no point compacting if the size does not change. */
     if (compacted_size < sizes.outputSizeInBytes) {
-      device_only_memory<char> compacted_data(this, "optix compacted as");
+      device_only_memory<char> compacted_data(this, "optix compacted as", false);
       compacted_data.alloc_to_device(compacted_size);
-      if (!compacted_data.device_pointer)
+      if (!compacted_data.device_pointer) {
         /* Do not compact if memory allocation for compacted acceleration structure fails.
          * Can just use the uncompacted one then, so succeed here regardless. */
         return !have_error();
+      }
 
       optix_assert(optixAccelCompact(
           context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
@@ -1097,6 +1106,8 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
 
       std::swap(out_data.device_size, compacted_data.device_size);
       std::swap(out_data.device_pointer, compacted_data.device_pointer);
+      /* Original acceleration structure memory is freed when 'compacted_data' goes out of scope.
+       */
     }
   }
 
@@ -1185,7 +1196,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
               const float4 pw = make_float4(
                   curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
 
-              /* Convert Catmull-Rom data to Bezier spline. */
+              /* Convert Catmull-Rom data to B-spline. */
               static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
               static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
               static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
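
(Editor's note, not part of the commit: the recurring third argument in the
device_only_memory constructor calls above ("__denoiser_state", true / "__params",
false / "optix compacted as", false, etc.) is the switch between the two allocation
paths sketched earlier. The parameter name and the exact mapping below are an
assumption inferred from this diff, not a quote of the Cycles headers.)

  /* Sketch of the presumed constructor: when a host-memory fallback is acceptable
   * the buffer is treated as ordinary read/write memory, otherwise it is tagged
   * MEM_DEVICE_ONLY so the allocators refuse to map it into host memory and report
   * "System is out of GPU memory" on failure. */
  struct Device;

  enum MemoryType { MEM_READ_WRITE, MEM_DEVICE_ONLY };

  template<typename T> struct device_only_memory_sketch {
    device_only_memory_sketch(Device *device,
                              const char *name,
                              bool allow_host_memory_fallback = false)
        : device(device),
          name(name),
          type(allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
    {
    }

    Device *device;
    const char *name;
    MemoryType type;
  };

In the hunks above, the denoiser state, the denoiser guiding buffer and the temporary
BVH build memory pass true (they may fall back to slower host memory), while the launch
parameters and the compacted acceleration structure pass false (they must live on the
device or fail with an error).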



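(Editor's note, not part of the commit: the new static mutex in build_optix_bvh() above
is a plain "serialize this function" pattern. A standalone sketch with standard C++
primitives; Cycles' thread_mutex / thread_scoped_lock are thin wrappers over the same
idea.)

  #include <mutex>

  bool build_acceleration_structure(/* ...build inputs... */)
  {
    /* Only one build at a time: the original and the compacted acceleration
     * structure can both be allocated for the duration of this function, so
     * parallel builds could exhaust device memory. */
    static std::mutex build_mutex;
    const std::lock_guard<std::mutex> lock(build_mutex);

    /* ...allocate temp memory, build, optionally compact, free temp memory... */
    return true;
  }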