[Bf-blender-cvs] [ba4e227def1] master: HIP device code cleanup and fix for high VRAM usage

Wed Oct 20 14:06:02 CEST 2021

Commit: ba4e227def13b4b953775ff4a2fd9c154bb07751
Author: Sayak Biswas
Date:   Wed Oct 20 13:37:39 2021 +0200
Branches: master
https://developer.blender.org/rBba4e227def13b4b953775ff4a2fd9c154bb07751

HIP device code cleanup and fix for high VRAM usage

This patch cleans up code for HIP device and makes it more consistent with the CUDA code.
It also fixes the issue with high VRAM usage on AMD cards using HIP allowing better performance and usage on cards like 6600XT.
Added a check in intern/cycles/kernel/bvh/bvh_util.h to prevent compiler error with hipcc

Reviewed By: brecht, leesonw

Maniphest Tasks: T92124

Differential Revision: https://developer.blender.org/D12834

===================================================================

M	CMakeLists.txt
M	extern/hipew/include/hipew.h
M	intern/cycles/device/hip/device_impl.cpp
M	intern/cycles/device/hip/queue.cpp
M	intern/cycles/device/hip/queue.h
M	intern/cycles/kernel/bvh/bvh_util.h

===================================================================

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 544e22f342b..ac11b495948 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -430,7 +430,7 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
 
 option(WITH_CYCLES_DEVICE_CUDA              "Enable Cycles CUDA compute support" ON)
 option(WITH_CYCLES_DEVICE_OPTIX             "Enable Cycles OptiX support" ON)
-option(WITH_CYCLES_DEVICE_HIP               "Enable Cycles HIP support" OFF)
+option(WITH_CYCLES_DEVICE_HIP               "Enable Cycles HIP support" ON)
 mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
 mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
 
diff --git a/extern/hipew/include/hipew.h b/extern/hipew/include/hipew.h
index 02fffc331bf..aa42fdf8ecd 100644
--- a/extern/hipew/include/hipew.h
+++ b/extern/hipew/include/hipew.h
@@ -24,9 +24,13 @@ extern "C" {
 #include <stdlib.h>
 
 #define HIP_IPC_HANDLE_SIZE 64
+#define hipHostMallocDefault 0x00
 #define hipHostMallocPortable 0x01
 #define hipHostMallocMapped 0x02
 #define hipHostMallocWriteCombined 0x04
+#define hipHostMallocNumaUser 0x20000000
+#define hipHostMallocCoherent 0x40000000
+#define hipHostMallocNonCoherent 0x80000000
 #define hipHostRegisterPortable 0x01
 #define hipHostRegisterMapped 0x02
 #define hipHostRegisterIoMemory 0x04
@@ -989,7 +993,7 @@ typedef hipError_t HIPAPI thipMalloc(hipDeviceptr_t* dptr, size_t bytesize);
 typedef hipError_t HIPAPI thipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
 typedef hipError_t HIPAPI thipFree(hipDeviceptr_t dptr);
 typedef hipError_t HIPAPI thipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
-typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize);
+typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize, unsigned int flags);
 typedef hipError_t HIPAPI thipHostFree(void* p);
 typedef hipError_t HIPAPI thipMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags);
 typedef hipError_t HIPAPI thipHostGetDevicePointer(hipDeviceptr_t* pdptr, void* p, unsigned int Flags);
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 0e5ac6ce401..964783a08bf 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -108,6 +108,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     return;
   }
 
+  /* hipDeviceMapHost for mapping host memory when out of device memory.
+   * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
+   * so we can predict which memory to map to host. */
   hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
 
   hip_assert(
@@ -657,7 +660,8 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad
     }
     else if (map_host_used + size < map_host_limit) {
       /* Allocate host memory ourselves. */
-      mem_alloc_result = hipHostMalloc(&shared_pointer, size);
+      mem_alloc_result = hipHostMalloc(
+          &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
 
       assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
              (mem_alloc_result != hipSuccess && shared_pointer == 0));
@@ -874,7 +878,6 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
   size_t bytes;
 
   hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name));
-  assert(bytes == size);
   hip_assert(hipMemcpyHtoD(mem, host, size));
 }
 
@@ -1142,141 +1145,6 @@ void HIPDevice::tex_free(device_texture &mem)
   }
 }
 
-#  if 0
-void HIPDevice::render(DeviceTask &task,
-                        RenderTile &rtile,
-                        device_vector<KernelWorkTile> &work_tiles)
-{
-  scoped_timer timer(&rtile.buffers->render_time);
-
-  if (have_error())
-    return;
-
-  HIPContextScope scope(this);
-  hipFunction_t hipRender;
-
-  /* Get kernel function. */
-  if (rtile.task == RenderTile::BAKE) {
-    hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake"));
-  }
-  else {
-    hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace"));
-  }
-
-  if (have_error()) {
-    return;
-  }
-
-  hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1));
-
-  /* Allocate work tile. */
-  work_tiles.alloc(1);
-
-  KernelWorkTile *wtile = work_tiles.data();
-  wtile->x = rtile.x;
-  wtile->y = rtile.y;
-  wtile->w = rtile.w;
-  wtile->h = rtile.h;
-  wtile->offset = rtile.offset;
-  wtile->stride = rtile.stride;
-  wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer;
-
-  /* Prepare work size. More step samples render faster, but for now we
-   * remain conservative for GPUs connected to a display to avoid driver
-   * timeouts and display freezing. */
-  int min_blocks, num_threads_per_block;
-  hip_assert(
-      hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0));
-  if (!info.display_device) {
-    min_blocks *= 8;
-  }
-
-  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
-  /* Render all samples. */
-  uint start_sample = rtile.start_sample;
-  uint end_sample = rtile.start_sample + rtile.num_samples;
-
-  for (int sample = start_sample; sample < end_sample;) {
-    /* Setup and copy work tile to device. */
-    wtile->start_sample = sample;
-    wtile->num_samples = step_samples;
-    if (task.adaptive_sampling.use) {
-      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
-    }
-    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
-    work_tiles.copy_to_device();
-
-    hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
-    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
-    /* Launch kernel. */
-    void *args[] = {&d_work_tiles, &total_work_size};
-
-    hip_assert(
-        hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
-    /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
-    uint filter_sample = sample + wtile->num_samples - 1;
-    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
-      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
-    }
-
-    hip_assert(hipDeviceSynchronize());
-
-    /* Update progress. */
-    sample += wtile->num_samples;
-    rtile.sample = sample;
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
-    if (task.get_cancel()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-  }
-
-  /* Finalize adaptive sampling. */
-  if (task.adaptive_sampling.use) {
-    hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
-    adaptive_sampling_post(rtile, wtile, d_work_tiles);
-    hip_assert(hipDeviceSynchronize());
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-  }
-}
-
-void HIPDevice::thread_run(DeviceTask &task)
-{
-  HIPContextScope scope(this);
-
-  if (task.type == DeviceTask::RENDER) {
-    device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
-    /* keep rendering tiles until done */
-    RenderTile tile;
-    DenoisingTask denoising(this, task);
-
-    while (task.acquire_tile(this, tile, task.tile_types)) {
-      if (tile.task == RenderTile::PATH_TRACE) {
-        render(task, tile, work_tiles);
-      }
-      else if (tile.task == RenderTile::BAKE) {
-        render(task, tile, work_tiles);
-      }
-
-      task.release_tile(tile);
-
-      if (task.get_cancel()) {
-        if (task.need_finish_queue == false)
-          break;
-      }
-    }
-
-    work_tiles.free();
-  }
-}
-#  endif
-
 unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
 {
   return make_unique<HIPDeviceQueue>(this);
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 78c77e5fdae..0d9f5916d30 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -39,11 +39,30 @@ HIPDeviceQueue::~HIPDeviceQueue()
   hipStreamDestroy(hip_stream_);
 }
 
-int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  /* TODO: compute automatically. */
-  /* TODO: must have at least num_threads_per_block. */
-  return 14416128;
+  int num_states = 0;
+  const int max_num_threads = hip_device_->get_num_multiprocessors() *
+                              hip_device_->get_max_num_threads_per_multiprocessor();
+  if (max_num_threads == 0) {
+    num_states = 1048576;  // 65536 * 16
+  }
+  else {
+    num_states = max_num_threads * 16;
+  }
+
+  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+  if (factor_str) {
+    float factor = atof(factor_str);
+    if (!factor)
+      VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
+    num_states = max((int)(num_states * factor), 1024);
+  }
+
+  VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+          << string_human_readable_size(num_states * state_size);
+
+  return num_states;
 }
 
 int HIPDeviceQueue::num_concurrent_busy_states() const
@@ -105,18 +124,19 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
   }
 
   /* Launch kernel. */
-  hip_device_assert(hip_device_,
-                    hipModuleLaunchKernel(hip_kernel.function,
-                                          num_blocks,
-                                          1,
-                                          1,
-                                          num_threads_per_block,
-                                          1,
-                                          1,
-                                          shared_mem_bytes,
-                                          hip_stream_,
-                                          args,
-                                          0));
+  assert_success(hipModuleLaunchKernel(hip_kernel.function,
+                                       num_block

@@ Diff output truncated at 10240 characters. @@