[Bf-blender-cvs] [22717ed3517] cycles-x: Cycles X: set integrator state size relative to the number of GPU cores

Thu Sep 16 19:22:56 CEST 2021

Commit: 22717ed35171a1b5f49c5fa59747edb9e345a851
Author: Brecht Van Lommel
Date:   Thu Sep 16 19:20:57 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB22717ed35171a1b5f49c5fa59747edb9e345a851

Cycles X: set integrator state size relative to the number of GPU cores

More specifically, the number of states is set to 16x the max number of
threads on all multiprocessors, with a minimum of 1048576.

What this effectively does is double the state size on the very high-end
GPUs like RTX A6000 and RTX 3080 while leaving the size unchanged for
others. On the RTX A6000 there are 2-10% render time reductions on our
benchmark scenes. The biggest reduction is on the barbershop interior, as
scenes with more objects and shaders are more likely to benefit from
improved coherence.

This also adds an environment variable for developers to test different
sizes, and debug logging about the size and memory usage.

Reviewed By: sergey

Differential Revision: https://developer.blender.org/D12432

===================================================================

M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/cuda/queue.h
M	intern/cycles/device/device_queue.h

===================================================================

diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 0ab387d38cd..b7f86c10553 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -39,14 +39,24 @@ CUDADeviceQueue::~CUDADeviceQueue()
   cuStreamDestroy(cuda_stream_);
 }
 
-int CUDADeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  /* TODO: compute automatically. */
-  /* TODO: must have at least num_threads_per_block. */
-  return 1048576;
+  int num_states = max(cuda_device_->get_num_multiprocessors() *
+                           cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+                       1048576);
+
+  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+  if (factor_str) {
+    num_states = max((int)(num_states * atof(factor_str)), 1024);
+  }
+
+  VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+          << string_human_readable_size(num_states * state_size);
+
+  return num_states;
 }
 
-int CUDADeviceQueue::num_concurrent_busy_states()
+int CUDADeviceQueue::num_concurrent_busy_states() const
 {
   const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                               cuda_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 9302616734f..62e3aa3d6c2 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -36,7 +36,7 @@ class CUDADeviceQueue : public DeviceQueue {
   ~CUDADeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() override;
+  virtual int num_concurrent_busy_states() const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
index c7b0591122c..edda3e61d51 100644
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -46,7 +46,7 @@ class DeviceQueue {
   /* Number of states which keeps the device occupied with work without loosing performance.
    * The renderer will add more work (when available) when number of active paths falls below this
    * value. */
-  virtual int num_concurrent_busy_states() = 0;
+  virtual int num_concurrent_busy_states() const = 0;
 
   /* Initialize execution of kernels on this queue.
    *