[Bf-blender-cvs] [a117a9c63c3] cycles-x: Cycles X: Experiment with tile reschedule heuristic

Sergey Sharybin noreply at git.blender.org
Fri May 21 20:04:55 CEST 2021


Commit: a117a9c63c3af774496422f787579a9d8c6f3346
Author: Sergey Sharybin
Date:   Fri May 21 15:01:49 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBa117a9c63c3af774496422f787579a9d8c6f3346

Cycles X: Experiment with tile reschedule heuristic

The idea is to add new tiles for rendering when the GPU starts to feel
hungry (as opposed to the previous logic, which added new work tiles
once the number of paths dropped below a certain threshold). Some
motivation behind this decision:

- There are only so many threads the GPU can run at once. Having many
  more active threads might hide some scheduling latency, but there is
  a limit to how much it helps.

- Scheduling new tiles early on might have a negative effect on
  coherency, so allowing more paths to terminate before re-scheduling
  keeps the wavefront more coherent and more efficient to compute.

The new code derives the reschedule threshold from the maximum number
of threads the GPU can run concurrently.
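
Condensed, the new heuristic amounts to the sketch below. GPUInfo and
the free functions are illustrative stand-ins rather than the exact
branch code; the real logic lives in
CUDADeviceQueue::num_concurrent_busy_states() and
PathTraceWorkGPU::enqueue_work_tiles() in the diff.

```
/* Sketch of the reschedule heuristic; GPUInfo and these free functions
 * stand in for the CUDADevice/CUDADeviceQueue methods in the diff. */
struct GPUInfo {
  int num_multiprocessors;            /* CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT */
  int max_threads_per_multiprocessor; /* CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR */
};

/* Threshold below which the GPU is considered "hungry" for more work. */
int num_concurrent_busy_states(const GPUInfo &gpu)
{
  const int max_num_threads = gpu.num_multiprocessors * gpu.max_threads_per_multiprocessor;
  if (max_num_threads == 0) {
    return 65536; /* Attribute query failed: conservative fallback. */
  }
  return 4 * max_num_threads;
}

/* Previously new tiles were scheduled once active paths dropped below a
 * fixed fraction (0.5) of the maximum number of camera paths; now they
 * are scheduled once active paths drop below the busy-state threshold. */
bool should_schedule_more_tiles(const int num_active_paths, const int min_num_active_paths)
{
  return num_active_paths == 0 || num_active_paths < min_num_active_paths;
}
```

For example, a hypothetical GPU with 72 multiprocessors and 2048
resident threads per multiprocessor would get new tiles scheduled once
fewer than 4 * 72 * 2048 = 589824 paths remain active.

Benchmark timings with the new heuristic against the previous
configurations: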

```
                              new           old(1)        cycles-x(2)   megakernel(3)
bmw27.blend                   10.2251       10.198        10.7419       10.4269
classroom.blend               15.8454       16.7821       17.2907       16.6609
pabellon.blend                9.34677       9.39898       9.61772       9.14966
monster.blend                 10.374        10.5923       10.5886       12.0106
barbershop_interior.blend     11.5124       11.777        11.8522       12.5769
junkshop.blend                15.6783       16.085        16.2821       16.5213
pvt_flat.blend                16.3432       16.5704       16.2637       17.4047

(1) cycles-x branch, previous commit e0716af1a4f
(2) cycles-x branch, hash ad81074fab1
(3) cycles-x branch, hash ef6ce4fa8ca (right before disabling the megakernel)
```

===================================================================

M	intern/cycles/device/cuda/device_impl.cpp
M	intern/cycles/device/cuda/device_impl.h
M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/cuda/queue.h
M	intern/cycles/device/device_queue.h
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h

===================================================================

diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 0e7f88291cc..18ba51a2d03 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -1445,6 +1445,32 @@ unique_ptr<DeviceGraphicsInterop> CUDADevice::graphics_interop_create()
   return make_unique<CUDADeviceGraphicsInterop>(this);
 }
 
+int CUDADevice::get_num_multiprocessors()
+{
+  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+}
+
+int CUDADevice::get_max_num_threads_per_multiprocessor()
+{
+  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
+}
+
+bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
+{
+  CUDAContextScope scope(this);
+
+  return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
+}
+
+int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
+{
+  int value = 0;
+  if (!get_device_attribute(attribute, &value)) {
+    return default_value;
+  }
+  return value;
+}
+
 CCL_NAMESPACE_END
 
 #endif
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
index c776ad09b57..13a238c486a 100644
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -152,6 +152,13 @@ class CUDADevice : public Device {
   virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
 
   virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+  int get_num_multiprocessors();
+  int get_max_num_threads_per_multiprocessor();
+
+ protected:
+  bool get_device_attribute(CUdevice_attribute attribute, int *value);
+  int get_device_default_attribute(CUdevice_attribute attribute, int default_value);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index f1336f6eed6..653c1bbae1d 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -37,13 +37,25 @@ CUDADeviceQueue::~CUDADeviceQueue()
   cuStreamDestroy(cuda_stream_);
 }
 
-int CUDADeviceQueue::num_concurrent_states(const size_t) const
+int CUDADeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
 {
   /* TODO: compute automatically. */
   /* TODO: must have at least num_threads_per_block. */
   return 1048576;
 }
 
+int CUDADeviceQueue::num_concurrent_busy_states()
+{
+  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+                              cuda_device_->get_max_num_threads_per_multiprocessor();
+
+  if (max_num_threads == 0) {
+    return 65536;
+  }
+
+  return 4 * max_num_threads;
+}
+
 void CUDADeviceQueue::init_execution()
 {
   /* Synchronize all textures and memory copies before executing task. */
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 54088f0b6a5..48c97d69301 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -36,6 +36,7 @@ class CUDADeviceQueue : public DeviceQueue {
   ~CUDADeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
+  virtual int num_concurrent_busy_states() override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
index d1aee95f963..db9345bdef4 100644
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -40,6 +40,11 @@ class DeviceQueue {
    * based on number of cores and/or available memory. */
   virtual int num_concurrent_states(const size_t state_size) const = 0;
 
+  /* Number of states which keeps the device occupied with work without losing performance.
+   * The renderer will add more work (when available) when the number of active paths falls
+   * below this value. */
+  virtual int num_concurrent_busy_states() = 0;
+
   /* Initialize execution of kernels on this queue.
    *
    * Will, for example, load all data required by the kernels from Device to global or path state.
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 6a50feab497..e8958ef1147 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -44,6 +44,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
       gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
       max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorState))),
+      min_num_active_paths_(queue_->num_concurrent_busy_states()),
       max_active_path_index_(0)
 {
   memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
@@ -407,7 +408,6 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
     return false;
   }
 
-  const float regenerate_threshold = 0.5f;
   int num_paths = get_num_active_paths();
 
   if (num_paths == 0) {
@@ -430,7 +430,7 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
 
   /* Schedule when we're out of paths or there are too few paths to keep the
    * device occupied. */
-  if (num_paths == 0 || num_paths < regenerate_threshold * max_num_camera_paths) {
+  if (num_paths == 0 || num_paths < min_num_active_paths_) {
     /* Get work tiles until the maximum number of path is reached. */
     while (num_paths < max_num_camera_paths) {
       KernelWorkTile work_tile;
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 3cd193e606f..dd83286094c 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -137,6 +137,10 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Maximum number of concurrent integrator states. */
   int max_num_paths_;
 
+  /* Minimum number of paths which keeps the device busy. If the actual number of paths falls
+   * below this value, more work will be scheduled. */
+  int min_num_active_paths_;
+
   /* Maximum path index, effective number of paths used may be smaller than
    * the size of the integrator_state_ buffer so can avoid iterating over the
    * full buffer. */


