[Bf-blender-cvs] [c232be7866f] cycles-x: Cycles X: restore estimation of kernel memory usage for host memory fallback

Brecht Van Lommel noreply at git.blender.org
Thu Jul 15 13:55:35 CEST 2021


Commit: c232be7866f03aec6c3167f88fe116bb8472b8a0
Author: Brecht Van Lommel
Date:   Wed Jul 14 17:43:11 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBc232be7866f03aec6c3167f88fe116bb8472b8a0

Cycles X: restore estimation of kernel memory usage for host memory fallback

This avoids allocating scene memory on the device only to find out later
that it has to move back to the host.

Integrator working memory is now allocated before loading the kernels and
allocating scene memory. This way it is included in the estimated kernel
memory usage, which makes it less likely to be moved to the host.

Differential Revision: https://developer.blender.org/D11922
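
The session.cpp hunk that wires this up is truncated at the end of this
message. As a rough, hypothetical sketch only (the Session wiring and the
scene->update() call below are assumptions, not taken from the truncated
hunk; only PathTrace::alloc_work_memory() comes from this diff), the intended
ordering is roughly:

  /* Hypothetical sketch of the allocation order described above, not the
   * actual session.cpp change. */
  static void update_scene_sketch(PathTrace *path_trace, Scene *scene, Progress &progress)
  {
    /* Integrator working memory (SoA state, queues, sorting buffers) is
     * allocated up front, so it is already counted when scene device memory
     * is allocated below. Per path_trace_work.h it must also run before
     * init_execution(). */
    path_trace->alloc_work_memory();

    /* Scene device memory is synced afterwards; if the device runs short on
     * space, the scene data is what falls back to mapped host memory rather
     * than the integrator working memory. */
    scene->update(progress);
  }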

===================================================================

M	intern/cycles/device/cuda/device_impl.cpp
M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/integrator/path_trace.h
M	intern/cycles/integrator/path_trace_work.h
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/integrator/integrator_subsurface.h
M	intern/cycles/kernel/kernel_shader.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/render/scene.cpp
M	intern/cycles/render/scene.h
M	intern/cycles/render/session.cpp

===================================================================

diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 2ea465fc3b1..0eaf787dbd7 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -448,8 +448,8 @@ bool CUDADevice::load_kernels(const uint kernel_features)
         "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
 
   if (result == CUDA_SUCCESS) {
-    reserve_local_memory(kernel_features);
     kernels.load(this);
+    reserve_local_memory(kernel_features);
   }
 
   return (result == CUDA_SUCCESS);
@@ -460,45 +460,36 @@ void CUDADevice::reserve_local_memory(const uint /* kernel_features */)
   /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
    * needed for kernel launches, so that we can reliably figure out when
    * to allocate scene data in mapped host memory. */
-  CUDAContextScope scope(this);
-
   size_t total = 0, free_before = 0, free_after = 0;
-  cuMemGetInfo(&free_before, &total);
-
-  /* TODO: implement for new integrator kernels. */
-#  if 0
-  /* Get kernel function. */
-  CUfunction cuRender;
 
-  if (kernel_features & KERNEL_FEATURE_BAKING) {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
-  }
-  else {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+  {
+    CUDAContextScope scope(this);
+    cuMemGetInfo(&free_before, &total);
   }
 
-  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
-  int min_blocks, num_threads_per_block;
-  cuda_assert(
-      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-
-  /* Launch kernel, using just 1 block appears sufficient to reserve
-   * memory for all multiprocessors. It would be good to do this in
-   * parallel for the multi GPU case still to make it faster. */
-  CUdeviceptr d_work_tiles = 0;
-  uint total_work_size = 0;
+  {
+    /* Use the biggest kernel for estimation. */
+    const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
 
-  void *args[] = {&d_work_tiles, &total_work_size};
+    /* Launch kernel, using just 1 block appears sufficient to reserve memory for all
+     * multiprocessors. It would be good to do this in parallel for the multi GPU case
+     * still to make it faster. */
+    CUDADeviceQueue queue(this);
 
-  cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+    void *d_path_index = nullptr;
+    void *d_render_buffer = nullptr;
+    int d_work_size = 0;
+    void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
 
-  cuda_assert(cuCtxSynchronize());
+    queue.init_execution();
+    queue.enqueue(test_kernel, 1, args);
+    queue.synchronize();
+  }
 
-  cuMemGetInfo(&free_after, &total);
-#  else
-  free_after = free_before;
-#  endif
+  {
+    CUDAContextScope scope(this);
+    cuMemGetInfo(&free_after, &total);
+  }
 
   VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
           << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index b799bfa009e..439d3eff48f 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -77,6 +77,13 @@ void PathTrace::load_kernels()
   }
 }
 
+void PathTrace::alloc_work_memory()
+{
+  for (auto &&path_trace_work : path_trace_works_) {
+    path_trace_work->alloc_work_memory();
+  }
+}
+
 bool PathTrace::ready_to_reset()
 {
   /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
index 39e81c854e3..b1911b82cb7 100644
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -54,6 +54,10 @@ class PathTrace {
    * The progress is reported to the currently configured progress object (via `set_progress`). */
   void load_kernels();
 
+  /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+   * more accurately which scene device memory may need to be allocated on the host. */
+  void alloc_work_memory();
+
   /* Check whether now it is a good time to reset rendering.
    * Used to avoid very often resets in the viewport, giving it a chance to draw intermediate
    * render result. */
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index 3ab8e371156..c3760d29734 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -56,6 +56,9 @@ class PathTraceWork {
   /* Check whether the big tile is being worked on by multiple path trace works. */
   bool has_multiple_works() const;
 
+  /* Allocate working memory for execution. Must be called before init_execution(). */
+  virtual void alloc_work_memory(){};
+
   /* Initialize execution of kernels.
    * Will ensure that all device queues are initialized for execution.
    *
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index b0ba7054543..66518172f47 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -131,12 +131,12 @@ void PathTraceWorkGPU::alloc_integrator_queue()
 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
   /* Allocate arrays for shader sorting. */
-  const int num_shaders = device_scene_->shaders.size();
-  if (integrator_shader_sort_counter_.size() < num_shaders) {
-    integrator_shader_sort_counter_.alloc(num_shaders);
+  const int max_shaders = device_scene_->data.max_shaders;
+  if (integrator_shader_sort_counter_.size() < max_shaders) {
+    integrator_shader_sort_counter_.alloc(max_shaders);
     integrator_shader_sort_counter_.zero_to_device();
 
-    integrator_shader_raytrace_sort_counter_.alloc(num_shaders);
+    integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
     integrator_shader_raytrace_sort_counter_.zero_to_device();
 
     integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
@@ -146,13 +146,16 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
   }
 }
 
-void PathTraceWorkGPU::init_execution()
+void PathTraceWorkGPU::alloc_work_memory()
 {
-  queue_->init_execution();
-
   alloc_integrator_soa();
   alloc_integrator_queue();
   alloc_integrator_sorting();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+  queue_->init_execution();
 
   integrator_state_gpu_.shadow_catcher_state_offset = get_shadow_catcher_state_offset();
 
@@ -358,8 +361,8 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
-    int num_shaders = device_scene_->shaders.size();
-    void *args[] = {&d_counter, &num_shaders};
+    int max_shaders = device_scene_->data.max_shaders;
+    void *args[] = {&d_counter, &max_shaders};
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
 
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 76018d8501f..768e8802162 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -38,6 +38,7 @@ class PathTraceWorkGPU : public PathTraceWork {
  public:
   PathTraceWorkGPU(Device *device, DeviceScene *device_scene, bool *cancel_requested_flag);
 
+  virtual void alloc_work_memory() override;
   virtual void init_execution() override;
 
   virtual void render_samples(int start_sample, int samples_num) override;
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
index 84f4e49b695..5522b6b4028 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface.h
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -143,7 +143,7 @@ ccl_device void subsurface_shader_data_setup(INTEGRATOR_STATE_ARGS, ShaderData *
   /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. */
   sd->flag &= ~SD_CLOSURE_FLAGS;
   sd->num_closure = 0;
-  sd->num_closure_left = kernel_data.integrator.max_closures;
+  sd->num_closure_left = kernel_data.max_closures;
 
   const float3 weight = one_float3();
   const float roughness = INTEGRATOR_STATE(subsurface, roughness);
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index a678138050f..831e930b792 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -1026,7 +1026,7 @@ ccl_device void shader_eval_surface(INTEGRATOR_STATE_CONST_ARGS,
     max_closures = 0;
   }
   else {
-    max_closures = kernel_data.integrator.max_closures;
+    max_closures = kernel_data.max_closures;
   }
 
   sd->num_closure = 0;
@@ -1229,7 +1229,7 @@ ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
     max_closures = 0;
   }
   else {
-    max_closures = kernel_data.integrator.max_closures;
+    max_closures = kernel_data.max_closures;
   }
 
   /* reset closures once at the start, we will be accumulating the closures
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 5f229eafe3b..612f8bf7660 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1126,12 +1126,10 @@ typedef struct KernelIntegrator {
   int volume_max_steps;
   float volume_step_rate;
 
-  int max_closures;
-
   int has_shadow_catcher;
 
   /* padding */
-  int pad1;
+  int pad1, pad2;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
@@ -1190,7 +1188,9 @@ static_assert_align(KernelBake, 16);
 
 typedef struct KernelData {
   uint kernel_features;
-  uint pad1, pad2, pad3;
+  uint max_closures;
+  uint max_shaders;
+  uint pad;
 
   KernelCamera cam;
   KernelFilm film;
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 8e744e72a76..2f79a4a1e82 100644
--- a/intern/cycles/rend

@@ Diff output truncated at 10240 characters. @@


