[Bf-blender-cvs] [baeb11826b9] master: Cycles: Add OptiX acceleration structure compaction

Patrick Mours noreply at git.blender.org
Mon Dec 9 14:33:23 CET 2019


Commit: baeb11826b9fe5525db6dd05ba5271949079fc1e
Author: Patrick Mours
Date:   Thu Dec 5 19:17:01 2019 +0100
Branches: master
https://developer.blender.org/rBbaeb11826b9fe5525db6dd05ba5271949079fc1e

Cycles: Add OptiX acceleration structure compaction

This adds compaction support for OptiX acceleration structures, which reduces the device memory footprint in a post-processing step after building. Depending on the scene this can reduce the amount of device memory used considerably and can even improve performance, since a smaller acceleration structure improves cache usage. It is only enabled for background renders, to keep acceleration structure builds fast in the viewport.
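
For reference, the build-then-compact sequence used here follows the standard OptiX 7 pattern. A condensed sketch, closely mirroring the device_optix.cpp changes below (parameter names are illustrative, error checking is omitted, and the buffers are assumed to be pre-allocated):

    #include <cstdint>
    #include <cuda.h>
    #include <optix.h>

    // 'compacted_size_ptr' must point at 8 bytes of device memory, 8-byte aligned.
    OptixTraversableHandle build_and_compact(OptixDeviceContext context,
                                             const OptixBuildInput &build_input,
                                             CUdeviceptr temp_ptr, size_t temp_size,
                                             CUdeviceptr output_ptr, size_t output_size,
                                             CUdeviceptr compacted_size_ptr)
    {
      // Build with compaction allowed and ask OptiX to emit the compacted size
      OptixAccelBuildOptions options = {};
      options.operation = OPTIX_BUILD_OPERATION_BUILD;
      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;

      OptixAccelEmitDesc compacted_size_prop = {};
      compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
      compacted_size_prop.result = compacted_size_ptr;

      OptixTraversableHandle handle = 0;
      optixAccelBuild(context, NULL, &options, &build_input, 1,
                      temp_ptr, temp_size, output_ptr, output_size,
                      &handle, &compacted_size_prop, 1);
      cuStreamSynchronize(NULL);

      // Read back the compacted size and only compact if it actually saves memory
      uint64_t compacted_size = output_size;
      cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size));
      if (compacted_size < output_size) {
        CUdeviceptr compacted_ptr = 0;
        cuMemAlloc(&compacted_ptr, compacted_size);
        optixAccelCompact(context, NULL, handle, compacted_ptr, compacted_size, &handle);
        cuStreamSynchronize(NULL);
        cuMemFree(output_ptr);  // The uncompacted structure is no longer needed
      }
      return handle;
    }

The actual change additionally skips compaction for viewport renders and falls back to the uncompacted structure if allocating the compacted buffer fails (see device_optix.cpp below).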

Also fixes a bug in the memory management for OptiX acceleration structures: these were held in a dynamic vector of 'device_memory' instances and allocated with the mem_alloc/mem_free functions. However, those functions track memory instances in the 'cuda_mem_map' via pointers to 'device_memory' (which works fine everywhere else, since those instances are never copied or moved). In the case of the vector, however, it may decide to reallocate at some point, which invalidates those pointers and would result in some nasty [...]
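
The failure mode is the usual dangling-pointer problem when storing addresses of std::vector elements. A minimal illustration (the struct, map and function here are hypothetical stand-ins for 'device_memory' and 'cuda_mem_map'):

    #include <map>
    #include <vector>

    struct device_memory { /* ... */ };  // stand-in for Cycles' device_memory

    void illustrate_dangling_key()
    {
      std::map<device_memory *, int> cuda_mem_map;  // keyed by the instance's address
      std::vector<device_memory> as_mem;

      as_mem.emplace_back();
      cuda_mem_map[&as_mem.back()] = 0;  // records the element's current address

      as_mem.emplace_back();  // may reallocate: existing elements are moved to new
                              // storage, so the pointer key stored above now dangles
    }

The fix below sidesteps this by storing plain CUdeviceptr allocations in 'as_mem' (freed with cuMemFree) instead of 'device_memory' objects, and by removing the move constructors from device_memory and its subclasses, which are no longer needed once the vector no longer holds them.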

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D6369

===================================================================

M	intern/cycles/device/device_memory.cpp
M	intern/cycles/device/device_memory.h
M	intern/cycles/device/device_optix.cpp
M	intern/cycles/kernel/svm/svm_bevel.h

===================================================================

diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index c106b4505db..3a99a49dffc 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -47,31 +47,6 @@ device_memory::~device_memory()
   assert(shared_counter == 0);
 }
 
-device_memory::device_memory(device_memory &&other)
-    : data_type(other.data_type),
-      data_elements(other.data_elements),
-      data_size(other.data_size),
-      device_size(other.device_size),
-      data_width(other.data_width),
-      data_height(other.data_height),
-      data_depth(other.data_depth),
-      type(other.type),
-      name(other.name),
-      interpolation(other.interpolation),
-      extension(other.extension),
-      device(other.device),
-      device_pointer(other.device_pointer),
-      host_pointer(other.host_pointer),
-      shared_pointer(other.shared_pointer),
-      shared_counter(other.shared_counter)
-{
-  other.device_size = 0;
-  other.device_pointer = 0;
-  other.host_pointer = 0;
-  other.shared_pointer = 0;
-  other.shared_counter = 0;
-}
-
 void *device_memory::host_alloc(size_t size)
 {
   if (!size) {
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index f8324e2a214..60740807568 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -235,9 +235,6 @@ class device_memory {
   device_memory(const device_memory &) = delete;
   device_memory &operator=(const device_memory &) = delete;
 
-  /* But moving is possible. */
-  device_memory(device_memory &&);
-
   /* Host allocation on the device. All host_pointer memory should be
    * allocated with these functions, for devices that support using
    * the same pointer for host and device. */
@@ -275,11 +272,6 @@ template<typename T> class device_only_memory : public device_memory {
     free();
   }
 
-  device_only_memory(device_only_memory &&other)
-      : device_memory(static_cast<device_memory &&>(other))
-  {
-  }
-
   void alloc_to_device(size_t num, bool shrink_to_fit = true)
   {
     size_t new_size = num;
@@ -338,10 +330,6 @@ template<typename T> class device_vector : public device_memory {
     free();
   }
 
-  device_vector(device_vector &&other) : device_memory(static_cast<device_memory &&>(other))
-  {
-  }
-
   /* Host memory allocation. */
   T *alloc(size_t width, size_t height = 0, size_t depth = 0)
   {
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ae3ab7e1fc2..7335e0bc64d 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -174,7 +174,7 @@ class OptiXDevice : public Device {
   device_vector<SbtRecord> sbt_data;
   device_vector<TextureInfo> texture_info;
   device_only_memory<KernelParams> launch_params;
-  vector<device_only_memory<uint8_t>> as_mem;
+  vector<CUdeviceptr> as_mem;
   OptixTraversableHandle tlas_handle = 0;
 
   // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
@@ -269,6 +269,9 @@ class OptiXDevice : public Device {
     task_pool.stop();
 
     // Free all acceleration structures
+    for (CUdeviceptr mem : as_mem) {
+      cuMemFree(mem);
+    }
     as_mem.clear();
 
     sbt_data.free();
@@ -831,7 +834,6 @@ class OptiXDevice : public Device {
 
   bool build_optix_bvh(const OptixBuildInput &build_input,
                        uint16_t num_motion_steps,
-                       device_memory &out_data,
                        OptixTraversableHandle &out_handle)
   {
     out_handle = 0;
@@ -842,7 +844,15 @@ class OptiXDevice : public Device {
     OptixAccelBufferSizes sizes = {};
     OptixAccelBuildOptions options;
     options.operation = OPTIX_BUILD_OPERATION_BUILD;
-    options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+    if (background) {
+      // Prefer best performance and lowest memory consumption in background
+      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+    }
+    else {
+      // Prefer fast updates in viewport
+      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
+    }
+
     options.motionOptions.numKeys = num_motion_steps;
     options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
     options.motionOptions.timeBegin = 0.0f;
@@ -853,31 +863,75 @@ class OptiXDevice : public Device {
 
     // Allocate required output buffers
     device_only_memory<char> temp_mem(this, "temp_build_mem");
-    temp_mem.alloc_to_device(sizes.tempSizeInBytes);
+    temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+    if (!temp_mem.device_pointer)
+      return false;  // Make sure temporary memory allocation succeeded
+
+    // Move textures to host memory if there is not enough room
+    size_t size = 0, free = 0;
+    cuMemGetInfo(&free, &size);
+    size = sizes.outputSizeInBytes + device_working_headroom;
+    if (size >= free && can_map_host) {
+      move_textures_to_host(size - free, false);
+    }
 
-    out_data.type = MEM_DEVICE_ONLY;
-    out_data.data_type = TYPE_UNKNOWN;
-    out_data.data_elements = 1;
-    out_data.data_size = sizes.outputSizeInBytes;
-    mem_alloc(out_data);
+    CUdeviceptr out_data = 0;
+    check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+    as_mem.push_back(out_data);
 
     // Finally build the acceleration structure
+    OptixAccelEmitDesc compacted_size_prop;
+    compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+    // A tiny space was allocated for this property at the end of the temporary buffer above
+    // Make sure this pointer is 8-byte aligned
+    compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
     check_result_optix_ret(optixAccelBuild(context,
                                            NULL,
                                            &options,
                                            &build_input,
                                            1,
                                            temp_mem.device_pointer,
-                                           sizes.tempSizeInBytes,
-                                           out_data.device_pointer,
+                                           temp_mem.device_size,
+                                           out_data,
                                            sizes.outputSizeInBytes,
                                            &out_handle,
-                                           NULL,
-                                           0));
+                                           &compacted_size_prop,
+                                           1));
 
     // Wait for all operations to finish
     check_result_cuda_ret(cuStreamSynchronize(NULL));
 
+    // Compact acceleration structure to save memory (do not do this in viewport for faster builds)
+    if (background) {
+      uint64_t compacted_size = sizes.outputSizeInBytes;
+      check_result_cuda_ret(
+          cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+      // Temporary memory is no longer needed, so free it now to make space
+      temp_mem.free();
+
+      // There is no point compacting if the size does not change
+      if (compacted_size < sizes.outputSizeInBytes) {
+        CUdeviceptr compacted_data = 0;
+        if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+          // Do not compact if memory allocation for compacted acceleration structure fails
+          // Can just use the uncompacted one then, so succeed here regardless
+          return true;
+        as_mem.push_back(compacted_data);
+
+        check_result_optix_ret(optixAccelCompact(
+            context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+
+        // Wait for compaction to finish
+        check_result_cuda_ret(cuStreamSynchronize(NULL));
+
+        // Free uncompacted acceleration structure
+        cuMemFree(out_data);
+        as_mem.erase(as_mem.end() - 2);  // Remove 'out_data' from 'as_mem' array
+      }
+    }
+
     return true;
   }
 
@@ -889,7 +943,10 @@ class OptiXDevice : public Device {
     unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
     meshes.reserve(bvh->meshes.size());
 
-    // Free all previous acceleration structure
+    // Free all previous acceleration structures
+    for (CUdeviceptr mem : as_mem) {
+      cuMemFree(mem);
+    }
     as_mem.clear();
 
     // Build bottom level acceleration structures (BLAS)
@@ -968,9 +1025,8 @@ class OptiXDevice : public Device {
         build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
 
         // Allocate memory for new BLAS and build it
-        as_mem.emplace_back(this, "blas");
         handles.emplace_back();
-        if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+        if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
           return false;
       }
 
@@ -1034,9 +1090,8 @@ class OptiXDevice : public Device {
         build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();
 
         // Allocate memory for new BLAS and build it
-        as_mem.emplace_back(this, "blas");
         handles.emplace_back();
-        if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+        if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
           return false;
       }
 
@@ -1081,15 +1136,17 @@ class OptiXDevice : public Device {
 
         // Insert motion traversable if object has motion
         if (motion_blur && ob->use_motion()) {
-          as_mem.emplace_back(this, "motion_transform");
-          device_only_memory<uint8_t> &motion_transform_gpu = as_mem.back();
-          motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) +
-                                               (max(ob->motion.size(), 2) - 2) *
-                                                   sizeof(OptixSRTData));
+          size_t motion_keys = max(ob->motion.size(), 2) - 2;
+          size_t motion_transform_size = sizeof(OptixSRTMotionTransfo

@@ Diff output truncated at 10240 characters. @@


