[Bf-blender-cvs] [8dd7b5b26b3] master: Cycles: Metal integrator state size tuning

Mon Oct 24 18:14:35 CEST 2022

Commit: 8dd7b5b26b394207b5941d49750f7e3abadaf82a
Author: Michael Jones
Date:   Mon Oct 24 10:23:56 2022 +0100
Branches: master
https://developer.blender.org/rB8dd7b5b26b394207b5941d49750f7e3abadaf82a

Cycles: Metal integrator state size tuning

This patch tunes the integrator state sizing for Metal (`num_concurrent_states` and `num_concurrent_busy_states`).

On all GPU architectures, we adjust the busy:total states ratio to be 1:4, which gives better rendering performance than the previous 1:16 ratio (independent of total state count). This gives a small performance uplift (e.g. 2-3% on M1 Ultra).

Additionally, for M2 architectures, we double the overall state size if there is available headroom. Inclusive of the first change, we can expect an uplift of close to 10% in the future, as this results in larger dispatch sizes and minimises work submission overheads. In order to make an accurate determination of available headroom, we defer the calculation of `num_concurrent_states` and `num_concurrent_busy_states` until the time of integrator state allocation (i.e. after all of the scene data h [...]

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16313

===================================================================

M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/cuda/queue.h
M	intern/cycles/device/hip/queue.cpp
M	intern/cycles/device/hip/queue.h
M	intern/cycles/device/metal/device_impl.mm
M	intern/cycles/device/metal/kernel.mm
M	intern/cycles/device/metal/queue.h
M	intern/cycles/device/metal/queue.mm
M	intern/cycles/device/oneapi/queue.cpp
M	intern/cycles/device/oneapi/queue.h
M	intern/cycles/device/queue.h
M	intern/cycles/integrator/path_trace_work_gpu.cpp

===================================================================

diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 84b0a1e0dd6..69fae03e32c 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int CUDADeviceQueue::num_concurrent_busy_states() const
+int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                               cuda_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index b450f5b3592..7107afe70c9 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
   ~CUDADeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 3f8b6267100..e93a9b4df3a 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int HIPDeviceQueue::num_concurrent_busy_states() const
+int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = hip_device_->get_num_multiprocessors() *
                               hip_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 729d8a19acb..df0678108af 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
   ~HIPDeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 4b929b6bc0a..6f1042b1e55 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
   }
 
   source = global_defines + source;
+#  if 0
   metal_printf("================\n%s================\n\%s================\n",
                global_defines.c_str(),
                baked_constants.c_str());
+#  endif
 
   /* Generate an MD5 from the source and include any baked constants. This is used when caching
    * PSOs. */
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 8ccc50e57a3..55938d1a03a 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
     }
   }
 
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+    if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
+      /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
+      return false;
+    }
+  }
+
   if (pso_type != PSO_GENERIC) {
     /* Only specialize kernels where it can make an impact. */
     if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index fc32740f3e1..2a6c12e2a60 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
   ~MetalDeviceQueue();
 
   virtual int num_concurrent_states(const size_t) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t) const override;
   virtual int num_sort_partition_elements() const override;
 
   virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 5ac63a16c61..c0df2c8553f 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue()
   }
 }
 
-int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  /* TODO: must have at least num_threads_per_block. */
-  int result = 1048576;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
+  static int result = 0;
+  if (result) {
+    return result;
   }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
+
+  result = 1048576;
+  if (metal_device_->device_vendor == METAL_GPU_APPLE) {
     result *= 4;
+
+    if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+      size_t system_ram = system_physical_ram();
+      size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
+      size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
+
+      /* Determine whether we can double the state count, and leave enough GPU-available memory
+       * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
+       * keep dispatch sizes high and minimize work submission overheads. */
+      size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
+      size_t total_state_size = result * state_size;
+      if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
+        result *= 2;
+        metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
+      }
+    }
+  }
+  else if (metal_device_->device_vendor == METAL_GPU_AMD) {
+    /* METAL_WIP */
+    /* TODO: compute automatically. */
+    /* TODO: must have at least num_threads_per_block. */
+    result *= 2;
   }
   return result;
 }
 
-int MetalDeviceQueue::num_concurrent_busy_states() const
+int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  int result = 65536;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
-  }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
-    result *= 4;
-  }
-  return result;
+  /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
+  return num_concurrent_states(state_size) / 4;
 }
 
 int MetalDeviceQueue::num_sort_partition_elements() const
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9632b14d485..3d019661aa8 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int OneapiDeviceQueue::num_concurrent_busy_states() const
+int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
                               oneapi_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 32363bf2a6e..bbd947b49cb 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t state_size) const override;
 
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 1d6a8d736b7..e27e081a407 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -103,7 +103,7 @@ class DeviceQueue {
   /* Number of states which keeps the device occupied with work without losing performance.
    * The renderer will add more work (when available) when number of active paths falls below this
    * value. */
-  virtual int num_concurrent_busy_states() const = 0;
+  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
 
   /* Number of elements in a partition of sorted shaders, that improves memory locality of
    * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index ee250a6916b..48f6cf3c903 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -18,13 +18,15 @@
 
 CCL_NAMESPACE_BEGIN
 
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(const uint kernel_features)
 {
   size_t state_size = 0;
 
 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
-#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
 #define KERNEL_STRUCT_END(name) \
   break; \
   }
@@ -76,16 +78,11 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "

@@ Diff output truncated at 10240 characters. @@