[Bf-blender-cvs] [8378db40c71] blender-v2.81-release: Cycles: Fix out of memory when rendering some scenes with OptiX that work with CUDA

Patrick Mours noreply at git.blender.org
Fri Oct 18 12:24:28 CEST 2019


Commit: 8378db40c71af53e3a48ef797cdacb38a1a26edc
Author: Patrick Mours
Date:   Fri Oct 18 12:06:28 2019 +0200
Branches: blender-v2.81-release
https://developer.blender.org/rB8378db40c71af53e3a48ef797cdacb38a1a26edc

Cycles: Fix out of memory when rendering some scenes with OptiX that work with CUDA

The OptiX implementation wasn't falling back to allocating memory on the host when device allocation failed, while the CUDA implementation did. This copies that implementation over to OptiX to remedy the problem.

Differential Revision: https://developer.blender.org/D6068
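
For reference, the fallback pattern this commit ports from device_cuda.cpp boils down to: try device memory first (keeping some headroom free), and if that fails, allocate pinned host memory that the device can map into its own address space. Below is a minimal standalone sketch of that pattern using the CUDA driver API. The function name, headroom constant, and error handling are illustrative only, not Cycles code, and a CUDA context is assumed to be current.

    #include <cuda.h>

    /* Illustrative headroom; Cycles reserves 32MB for working memory
     * and 128MB for textures (see device_working_headroom and
     * device_texture_headroom in the diff below). */
    static const size_t kHeadroom = 32 * 1024 * 1024;

    /* Allocate `size` bytes for the device, preferring device memory
     * and falling back to pinned host memory mapped into the device
     * address space. Returns 0 if both allocations fail. */
    static CUdeviceptr alloc_with_host_fallback(CUdevice dev, size_t size, void **host_out)
    {
      CUdeviceptr ptr = 0;
      size_t free = 0, total = 0;
      cuMemGetInfo(&free, &total);

      /* Device memory first, keeping some headroom free. */
      if (size + kHeadroom < free && cuMemAlloc(&ptr, size) == CUDA_SUCCESS) {
        return ptr;
      }

      /* Fall back to pinned host memory if the device can map it. */
      int can_map_host = 0;
      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
      if (can_map_host &&
          cuMemHostAlloc(host_out,
                         size,
                         CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED) ==
              CUDA_SUCCESS) {
        /* Get the device-visible alias of the pinned host allocation. */
        cuMemHostGetDevicePointer(&ptr, *host_out, 0);
      }

      return ptr; /* 0 means out of both device and host memory. */
    }

The actual generic_alloc() in the diff additionally caps mapped host memory against map_host_limit, tracks usage in map_host_used, and can first evict textures from the device via move_textures_to_host; the sketch omits those parts.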

===================================================================

M	intern/cycles/device/device_memory.h
M	intern/cycles/device/device_optix.cpp

===================================================================

diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b43ce8b0bc..272da6a9ad3 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -224,6 +224,7 @@ class device_memory {
 
  protected:
   friend class CUDADevice;
+  friend class OptiXDevice;
 
   /* Only create through subclasses. */
   device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 6f4734059da..e230662e698 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -177,7 +177,14 @@ class OptiXDevice : public Device {
   vector<device_only_memory<uint8_t>> blas;
   OptixTraversableHandle tlas_handle = 0;
 
+  // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
+  int can_map_host = 0;
+  size_t map_host_used = 0;
+  size_t map_host_limit = 0;
+  size_t device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
+  size_t device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
   map<device_memory *, CUDAMem> cuda_mem_map;
+  bool move_texture_to_host = false;
 
  public:
   OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
@@ -199,6 +206,25 @@ class OptiXDevice : public Device {
     // Make that CUDA context current
     const CUDAContextScope scope(cuda_context);
 
+    // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp)
+    size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+    size_t system_ram = system_physical_ram();
+    if (system_ram > 0) {
+      if (system_ram / 2 > default_limit) {
+        map_host_limit = system_ram - default_limit;
+      }
+      else {
+        map_host_limit = system_ram / 2;
+      }
+    }
+    else {
+      VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+    }
+
+    // Check device support for pinned host memory
+    check_result_cuda(
+        cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device));
+
     // Create OptiX context for this device
     OptixDeviceContextOptions options = {};
 #  ifdef WITH_CYCLES_LOGGING
@@ -826,6 +852,7 @@ class OptiXDevice : public Device {
     device_only_memory<char> temp_mem(this, "temp_build_mem");
     temp_mem.alloc_to_device(sizes.tempSizeInBytes);
 
+    out_data.type = MEM_DEVICE_ONLY;
     out_data.data_type = TYPE_UNKNOWN;
     out_data.data_elements = 1;
     out_data.data_size = sizes.outputSizeInBytes;
@@ -1168,131 +1195,162 @@ class OptiXDevice : public Device {
 
   void mem_alloc(device_memory &mem) override
   {
-    const CUDAContextScope scope(cuda_context);
+    if (mem.type == MEM_PIXELS && !background) {
+      assert(!"mem_alloc not supported for pixels.");
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      assert(!"mem_alloc not supported for textures.");
+    }
+    else {
+      generic_alloc(mem);
+    }
+  }
 
-    mem.device_size = mem.memory_size();
-
-    if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) {
-      CUDAMem &cmem = cuda_mem_map[&mem];  // Lock and get associated memory information
-
-      CUDA_TEXTURE_DESC tex_desc = {};
-      tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-      CUDA_RESOURCE_DESC res_desc = {};
-
-      switch (mem.extension) {
-        default:
-          assert(0);
-        case EXTENSION_REPEAT:
-          tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
-              CU_TR_ADDRESS_MODE_WRAP;
-          break;
-        case EXTENSION_EXTEND:
-          tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
-              CU_TR_ADDRESS_MODE_CLAMP;
-          break;
-        case EXTENSION_CLIP:
-          tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
-              CU_TR_ADDRESS_MODE_BORDER;
-          break;
-      }
+  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
+  {
+    CUDAContextScope scope(cuda_context);
 
-      switch (mem.interpolation) {
-        default:  // Default to linear for unsupported interpolation types
-        case INTERPOLATION_LINEAR:
-          tex_desc.filterMode = CU_TR_FILTER_MODE_LINEAR;
-          break;
-        case INTERPOLATION_CLOSEST:
-          tex_desc.filterMode = CU_TR_FILTER_MODE_POINT;
-          break;
-      }
+    CUdeviceptr device_pointer = 0;
+    size_t size = mem.memory_size() + pitch_padding;
 
-      CUarray_format format;
-      switch (mem.data_type) {
-        default:
-          assert(0);
-        case TYPE_UCHAR:
-          format = CU_AD_FORMAT_UNSIGNED_INT8;
-          break;
-        case TYPE_UINT16:
-          format = CU_AD_FORMAT_UNSIGNED_INT16;
-          break;
-        case TYPE_UINT:
-          format = CU_AD_FORMAT_UNSIGNED_INT32;
-          break;
-        case TYPE_INT:
-          format = CU_AD_FORMAT_SIGNED_INT32;
-          break;
-        case TYPE_FLOAT:
-          format = CU_AD_FORMAT_FLOAT;
-          break;
-        case TYPE_HALF:
-          format = CU_AD_FORMAT_HALF;
-          break;
-      }
+    CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+    const char *status = "";
 
-      if (mem.data_depth > 1) { /* 3D texture using array. */
-        CUDA_ARRAY3D_DESCRIPTOR desc;
-        desc.Width = mem.data_width;
-        desc.Height = mem.data_height;
-        desc.Depth = mem.data_depth;
-        desc.Format = format;
-        desc.NumChannels = mem.data_elements;
-        desc.Flags = 0;
+    /* First try allocating in device memory, respecting headroom. We make
+     * an exception for texture info. It is small and frequently accessed,
+     * so treat it as working memory.
+     *
+     * If there is not enough room for working memory, we will try to move
+     * textures to host memory, assuming the performance impact would have
+     * been worse for working memory. */
+    bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+    bool is_image = is_texture && (mem.data_height > 1);
 
-        check_result_cuda(cuArray3DCreate(&cmem.array, &desc));
-        mem.device_pointer = (device_ptr)cmem.array;
+    size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
 
-        res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
-        res_desc.res.array.hArray = cmem.array;
+    size_t total = 0, free = 0;
+    cuMemGetInfo(&free, &total);
+
+    /* Move textures to host memory if needed. */
+    if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+      move_textures_to_host(size + headroom - free, is_texture);
+      cuMemGetInfo(&free, &total);
+    }
+
+    /* Allocate in device memory. */
+    if (!move_texture_to_host && (size + headroom) < free) {
+      mem_alloc_result = cuMemAlloc(&device_pointer, size);
+      if (mem_alloc_result == CUDA_SUCCESS) {
+        status = " in device memory";
       }
-      else if (mem.data_height > 0) { /* 2D texture using array. */
-        CUDA_ARRAY_DESCRIPTOR desc;
-        desc.Width = mem.data_width;
-        desc.Height = mem.data_height;
-        desc.Format = format;
-        desc.NumChannels = mem.data_elements;
-
-        check_result_cuda(cuArrayCreate(&cmem.array, &desc));
-        mem.device_pointer = (device_ptr)cmem.array;
-
-        res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
-        res_desc.res.array.hArray = cmem.array;
+    }
+
+    /* Fall back to mapped host memory if needed and possible. */
+    void *map_host_pointer = 0;
+    bool free_map_host = false;
+
+    if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+        map_host_used + size < map_host_limit) {
+      if (mem.shared_pointer) {
+        /* Another device already allocated host memory. */
+        mem_alloc_result = CUDA_SUCCESS;
+        map_host_pointer = mem.shared_pointer;
       }
       else {
-        check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size));
+        /* Allocate host memory ourselves. */
+        mem_alloc_result = cuMemHostAlloc(
+            &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+        mem.shared_pointer = map_host_pointer;
+        free_map_host = true;
+      }
 
-        res_desc.resType = CU_RESOURCE_TYPE_LINEAR;
-        res_desc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer;
-        res_desc.res.linear.format = format;
-        res_desc.res.linear.numChannels = mem.data_elements;
-        res_desc.res.linear.sizeInBytes = mem.device_size;
+      if (mem_alloc_result == CUDA_SUCCESS) {
+        cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0);
+        map_host_used += size;
+        status = " in host memory";
+
+        /* Replace host pointer with our host allocation. Only works if
+         * CUDA memory layout is the same and has no pitch padding. Also
+         * does not work if we move textures to host during a render,
+         * since other devices might be using the memory. */
+        if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+            mem.host_pointer != mem.shared_pointer) {
+          memcpy(mem.shared_pointer, mem.host_pointer, size);
+          mem.host_free();
+          mem.host_pointer = mem.shared_pointer;
+        }
       }
+      else {
+        status = " failed, out of host memory";
+      }
+    }
+    else if (mem_alloc_result != CUDA_SUCCESS) {
+      status = " failed, out of device and host memory";
+    }
 
-      check_result_cuda(cuTexObjectCreate(&cmem.texobject, &res_desc, &tex_desc, NULL));
+    if (mem.name) {
+      VLOG(1) << "Buffer allocate: " << mem.name << ", "
+              << string_human_readable_number(mem.memory_size()) << " bytes. ("
+              << string_human_readable_size(mem.memory_size()) << ")" << status;
+    }
 
-      int flat_slot = 0;
-      if (string_startswith(mem.name, "__tex_image")) {
-        flat_slot = atoi(mem.name + string(mem.name).rfind("_") + 1);
-      }
+    if (mem_alloc_result != CUDA_SUCCESS) {
+      set_error(string_printf("Buffer allocate %s", status));
+      return NULL;
+    }
+
+    mem.device_poin

@@ Diff output truncated at 10240 characters. @@


