[Bf-blender-cvs] [6dcfb6df9ce] master: Cycles: Abstract host memory fallback for GPU devices
Nikita Sirgienko
noreply at git.blender.org
Mon Feb 6 22:20:36 CET 2023
Commit: 6dcfb6df9ce671996fcb39df1a1abadefd4f1d47
Author: Nikita Sirgienko
Date: Wed Feb 1 17:22:53 2023 +0100
Branches: master
https://developer.blender.org/rB6dcfb6df9ce671996fcb39df1a1abadefd4f1d47
Cycles: Abstract host memory fallback for GPU devices
Host memory fallback in CUDA and HIP devices is almost identical.
We remove duplicated code and create a shared generic version that
other devices (oneAPI) will be able to use.
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D17173
===================================================================
M intern/cycles/device/cuda/device_impl.cpp
M intern/cycles/device/cuda/device_impl.h
M intern/cycles/device/device.cpp
M intern/cycles/device/device.h
M intern/cycles/device/hip/device_impl.cpp
M intern/cycles/device/hip/device_impl.h
M intern/cycles/device/memory.h
===================================================================
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index f354ba6aee1..c19a0ade332 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
- : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+ : GPUDevice(info, stats, profiler)
{
+ /* Verify that base class types can be used with specific backend types */
+ static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
+ static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
+
first_error = true;
cuDevId = info.num;
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
pitch_alignment = 0;
/* Initialize CUDA. */
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+ int value;
+ cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+ can_map_host = value != 0;
cuda_assert(cuDeviceGetAttribute(
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
# endif
}
-void CUDADevice::init_host_memory()
+void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
- /* Limit amount of host mapped memory, because allocating too much can
- * cause system instability. Leave at least half or 4 GB of system
- * memory free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep is free after texture memory
- * and working memory allocations respectively. We set the working
- * memory limit headroom lower so that some space is left after all
- * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+ CUDAContextScope scope(this);
- VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+ cuMemGetInfo(&free, &total);
}
-void CUDADevice::load_texture_info()
+bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
- if (need_texture_info) {
- /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
- * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
- need_texture_info = false;
- texture_info.copy_to_device();
- }
+ CUDAContextScope scope(this);
+
+ CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
+ return mem_alloc_result == CUDA_SUCCESS;
}
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+void CUDADevice::free_device(void *device_pointer)
{
- /* Break out of recursive call, which can happen when moving memory on a multi device. */
- static bool any_device_moving_textures_to_host = false;
- if (any_device_moving_textures_to_host) {
- return;
- }
-
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- /* Can only move textures allocated on this device (and not those from peer devices).
- * And need to ignore memory that is already on the host. */
- if (!mem.is_resident(this) || cmem->use_mapped_host) {
- continue;
- }
-
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
- (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
- lock.unlock();
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- any_device_moving_textures_to_host = true;
-
- /* Potentially need to call back into multi device, so pointer mapping
- * and peer devices are updated. This is also necessary since the device
- * pointer may just be a key here, so cannot be accessed and freed directly.
- * Unfortunately it does mean that memory is reallocated on all other
- * devices as well, which is potentially dangerous when still in use (since
- * a thread rendering on another devices would only be caught in this mutex
- * if it so happens to do an allocation at the same time as well. */
- max_mem->device_copy_to();
- size = (max_size >= size) ? 0 : size - max_size;
-
- any_device_moving_textures_to_host = false;
- }
- else {
- break;
- }
- }
-
- /* Unset flag before texture info is reloaded, since it should stay in device memory. */
- move_texture_to_host = false;
+ CUDAContextScope scope(this);
- /* Update texture info array with new pointers. */
- load_texture_info();
+ cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
CUDAContextScope scope(this);
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
-
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
- }
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- if (mem.type == MEM_DEVICE_ONLY) {
- status = " failed, out of device memory";
- set_error("System is out of GPU memory");
- }
- else {
- status = " failed, out of device and host memory";
- set_error("System is out of GPU and shared host memory");
- }
- }
-
- if (mem.name) {
- VLOG_WORK << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list