[Bf-blender-cvs] [cb7ea0cff03] cycles-x: Cycles: add GPU queue kernel time stats

Wed Apr 28 20:07:49 CEST 2021

Commit: cb7ea0cff03d6f161c3eb88ec701a294ddde231a
Author: Brecht Van Lommel
Date:   Wed Apr 28 16:33:20 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBcb7ea0cff03d6f161c3eb88ec701a294ddde231a

Cycles: add GPU queue kernel time stats

* Print summary of kernel execution times for GPU queues with --verbose 3
* This is for combinations of kernels rather than individual ones, since
  measuring individual ones has a performance impact.
* Refactor code to be independent of CUDA queue

Differential Revision: https://developer.blender.org/D11112

===================================================================

M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/cuda/queue.h
M	intern/cycles/device/device_kernel.cpp
M	intern/cycles/device/device_kernel.h
M	intern/cycles/device/device_queue.cpp
M	intern/cycles/device/device_queue.h
M	intern/cycles/device/optix/queue.cpp
M	intern/cycles/util/util_logging.h

===================================================================

diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index d41a41435f3..9ee0298aa2a 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -20,14 +20,12 @@
 #  include "device/cuda/device_impl.h"
 #  include "device/cuda/kernel.h"
 
-#  include "util/util_time.h"
-
 CCL_NAMESPACE_BEGIN
 
 /* CUDADeviceQueue */
 
 CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
-    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr), last_sync_time_(0.0)
+    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
 {
   const CUDAContextScope scope(cuda_device_);
   cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
@@ -46,7 +44,7 @@ void CUDADeviceQueue::init_execution()
   cuda_device_->load_texture_info();
   cuda_device_assert(cuda_device_, cuCtxSynchronize());
 
-  last_sync_time_ = time_dt();
+  debug_init_execution();
 }
 
 bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
@@ -55,8 +53,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
     return false;
   }
 
-  VLOG(3) << "CUDA queue launch " << device_kernel_as_string(kernel) << ", work_size "
-          << work_size;
+  debug_enqueue(kernel, work_size);
 
   const CUDAContextScope scope(cuda_device_);
   const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
@@ -124,10 +121,7 @@ bool CUDADeviceQueue::synchronize()
 
   const CUDAContextScope scope(cuda_device_);
   cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
-
-  double new_time = time_dt();
-  VLOG(3) << "CUDA queue synchronize, elapsed " << new_time - last_sync_time_ << "s";
-  last_sync_time_ = new_time;
+  debug_synchronize();
 
   return !(cuda_device_->have_error());
 }
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 13460fa669c..59e10bcd711 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -53,7 +53,6 @@ class CUDADeviceQueue : public DeviceQueue {
  protected:
   CUDADevice *cuda_device_;
   CUstream cuda_stream_;
-  double last_sync_time_;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index b6389c2a5f1..2492fd17149 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -96,4 +96,20 @@ std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
   return os;
 }
 
+string device_kernel_mask_as_string(DeviceKernelMask mask)
+{
+  string str;
+
+  for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) {
+    if (mask & (uint64_t(1) << i)) {
+      if (!str.empty()) {
+        str += " ";
+      }
+      str += device_kernel_as_string((DeviceKernel)i);
+    }
+  }
+
+  return str;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_kernel.h b/intern/cycles/device/device_kernel.h
index 871ecfcff6f..83d959ca87b 100644
--- a/intern/cycles/device/device_kernel.h
+++ b/intern/cycles/device/device_kernel.h
@@ -18,6 +18,8 @@
 
 #include "kernel/kernel_types.h"
 
+#include "util/util_string.h"
+
 #include <ostream>  // NOLINT
 
 CCL_NAMESPACE_BEGIN
@@ -25,4 +27,7 @@ CCL_NAMESPACE_BEGIN
 const char *device_kernel_as_string(DeviceKernel kernel);
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
 
+typedef uint64_t DeviceKernelMask;
+string device_kernel_mask_as_string(DeviceKernelMask mask);
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
index be3d8702b5d..62b853fdfd5 100644
--- a/intern/cycles/device/device_queue.cpp
+++ b/intern/cycles/device/device_queue.cpp
@@ -16,13 +16,72 @@
 
 #include "device/device_queue.h"
 
+#include "util/util_algorithm.h"
 #include "util/util_logging.h"
+#include "util/util_time.h"
+
+#include <iomanip>
 
 CCL_NAMESPACE_BEGIN
 
-DeviceQueue::DeviceQueue(Device *device) : device(device)
+DeviceQueue::DeviceQueue(Device *device)
+    : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
 {
   DCHECK_NE(device, nullptr);
 }
 
+DeviceQueue::~DeviceQueue()
+{
+  if (VLOG_IS_ON(3)) {
+    /* Print kernel execution times sorted by time. */
+    vector<pair<DeviceKernelMask, double>> stats_sorted;
+    for (const auto &stat : stats_kernel_time_) {
+      stats_sorted.push_back(stat);
+    }
+
+    sort(stats_sorted.begin(),
+         stats_sorted.end(),
+         [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) {
+           return a.second > b.second;
+         });
+
+    VLOG(3) << "GPU queue stats:";
+    for (const auto &[mask, time] : stats_sorted) {
+      VLOG(3) << "  " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
+              << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+    }
+  }
+}
+
+void DeviceQueue::debug_init_execution()
+{
+  if (VLOG_IS_ON(3)) {
+    last_sync_time_ = time_dt();
+    last_kernels_enqueued_ = 0;
+  }
+}
+
+void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+{
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+            << work_size;
+    last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
+  }
+}
+
+void DeviceQueue::debug_synchronize()
+{
+  if (VLOG_IS_ON(3)) {
+    const double new_time = time_dt();
+    const double elapsed_time = new_time - last_sync_time_;
+    VLOG(3) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+
+    stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+
+    last_sync_time_ = new_time;
+    last_kernels_enqueued_ = 0;
+  }
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
index 0ccb6ce7cfb..5d556ec44dc 100644
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -18,6 +18,8 @@
 
 #include "device/device_kernel.h"
 
+#include "util/util_map.h"
+
 CCL_NAMESPACE_BEGIN
 
 class Device;
@@ -32,7 +34,7 @@ struct KernelWorkTile;
  * This class encapsulates all properties needed for commands execution. */
 class DeviceQueue {
  public:
-  virtual ~DeviceQueue() = default;
+  virtual ~DeviceQueue();
 
   /* Initialize execution of kernels on this queue.
    *
@@ -66,6 +68,18 @@ class DeviceQueue {
  protected:
   /* Hide construction so that allocation via `Device` API is enforced. */
   explicit DeviceQueue(Device *device);
+
+  /* Implementations call these from the corresponding methods to generate debugging logs. */
+  void debug_init_execution();
+  void debug_enqueue(DeviceKernel kernel, const int work_size);
+  void debug_synchronize();
+
+  /* Combination of kernels enqueued together sync last synchronize. */
+  DeviceKernelMask last_kernels_enqueued_;
+  /* Time of synchronize call. */
+  double last_sync_time_;
+  /* Accumulated execution time for combinations of kernels launched together. */
+  map<DeviceKernelMask, double> stats_kernel_time_;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index e770ce06169..211df631bcb 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -58,6 +58,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
     return false;
   }
 
+  debug_enqueue(kernel, work_size);
+
   const CUDAContextScope scope(cuda_device_);
 
   OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 3e56f0a0193..c8826435afe 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -49,6 +49,7 @@ class LogMessageVoidify {
 #  define LOG(severity) LOG_SUPPRESS()
 #  define VLOG(severity) LOG_SUPPRESS()
 #  define VLOG_IF(severity, condition) LOG_SUPPRESS()
+#  define VLOG_IS_ON(severity) false
 #endif
 
 #define VLOG_ONCE(level, flag) \