[Bf-blender-cvs] [cb7ea0cff03] cycles-x: Cycles: add GPU queue kernel time stats
Brecht Van Lommel
noreply at git.blender.org
Wed Apr 28 20:07:49 CEST 2021
Commit: cb7ea0cff03d6f161c3eb88ec701a294ddde231a
Author: Brecht Van Lommel
Date: Wed Apr 28 16:33:20 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBcb7ea0cff03d6f161c3eb88ec701a294ddde231a
Cycles: add GPU queue kernel time stats
* Print summary of kernel execution times for GPU queues with --verbose 3
* This is for combinations of kernels rather than individual ones, since
measuring individual ones has a performance impact.
* Refactor code to be independent of CUDA queue
Differential Revision: https://developer.blender.org/D11112
===================================================================
M intern/cycles/device/cuda/queue.cpp
M intern/cycles/device/cuda/queue.h
M intern/cycles/device/device_kernel.cpp
M intern/cycles/device/device_kernel.h
M intern/cycles/device/device_queue.cpp
M intern/cycles/device/device_queue.h
M intern/cycles/device/optix/queue.cpp
M intern/cycles/util/util_logging.h
===================================================================
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index d41a41435f3..9ee0298aa2a 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -20,14 +20,12 @@
# include "device/cuda/device_impl.h"
# include "device/cuda/kernel.h"
-# include "util/util_time.h"
-
CCL_NAMESPACE_BEGIN
/* CUDADeviceQueue */
CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
- : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr), last_sync_time_(0.0)
+ : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
{
const CUDAContextScope scope(cuda_device_);
cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
@@ -46,7 +44,7 @@ void CUDADeviceQueue::init_execution()
cuda_device_->load_texture_info();
cuda_device_assert(cuda_device_, cuCtxSynchronize());
- last_sync_time_ = time_dt();
+ debug_init_execution();
}
bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
@@ -55,8 +53,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
return false;
}
- VLOG(3) << "CUDA queue launch " << device_kernel_as_string(kernel) << ", work_size "
- << work_size;
+ debug_enqueue(kernel, work_size);
const CUDAContextScope scope(cuda_device_);
const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
@@ -124,10 +121,7 @@ bool CUDADeviceQueue::synchronize()
const CUDAContextScope scope(cuda_device_);
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
-
- double new_time = time_dt();
- VLOG(3) << "CUDA queue synchronize, elapsed " << new_time - last_sync_time_ << "s";
- last_sync_time_ = new_time;
+ debug_synchronize();
return !(cuda_device_->have_error());
}
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 13460fa669c..59e10bcd711 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -53,7 +53,6 @@ class CUDADeviceQueue : public DeviceQueue {
protected:
CUDADevice *cuda_device_;
CUstream cuda_stream_;
- double last_sync_time_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index b6389c2a5f1..2492fd17149 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -96,4 +96,20 @@ std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
return os;
}
+string device_kernel_mask_as_string(DeviceKernelMask mask)
+{
+ string str;
+
+ for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) {
+ if (mask & (uint64_t(1) << i)) {
+ if (!str.empty()) {
+ str += " ";
+ }
+ str += device_kernel_as_string((DeviceKernel)i);
+ }
+ }
+
+ return str;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_kernel.h b/intern/cycles/device/device_kernel.h
index 871ecfcff6f..83d959ca87b 100644
--- a/intern/cycles/device/device_kernel.h
+++ b/intern/cycles/device/device_kernel.h
@@ -18,6 +18,8 @@
#include "kernel/kernel_types.h"
+#include "util/util_string.h"
+
#include <ostream> // NOLINT
CCL_NAMESPACE_BEGIN
@@ -25,4 +27,7 @@ CCL_NAMESPACE_BEGIN
const char *device_kernel_as_string(DeviceKernel kernel);
std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
+typedef uint64_t DeviceKernelMask;
+string device_kernel_mask_as_string(DeviceKernelMask mask);
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
index be3d8702b5d..62b853fdfd5 100644
--- a/intern/cycles/device/device_queue.cpp
+++ b/intern/cycles/device/device_queue.cpp
@@ -16,13 +16,72 @@
#include "device/device_queue.h"
+#include "util/util_algorithm.h"
#include "util/util_logging.h"
+#include "util/util_time.h"
+
+#include <iomanip>
CCL_NAMESPACE_BEGIN
-DeviceQueue::DeviceQueue(Device *device) : device(device)
+DeviceQueue::DeviceQueue(Device *device)
+ : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
{
DCHECK_NE(device, nullptr);
}
+DeviceQueue::~DeviceQueue()
+{
+ if (VLOG_IS_ON(3)) {
+ /* Print kernel execution times sorted by time. */
+ vector<pair<DeviceKernelMask, double>> stats_sorted;
+ for (const auto &stat : stats_kernel_time_) {
+ stats_sorted.push_back(stat);
+ }
+
+ sort(stats_sorted.begin(),
+ stats_sorted.end(),
+ [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) {
+ return a.second > b.second;
+ });
+
+ VLOG(3) << "GPU queue stats:";
+ for (const auto &[mask, time] : stats_sorted) {
+ VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
+ << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+ }
+ }
+}
+
+void DeviceQueue::debug_init_execution()
+{
+ if (VLOG_IS_ON(3)) {
+ last_sync_time_ = time_dt();
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+{
+ if (VLOG_IS_ON(3)) {
+ VLOG(3) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
+ last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
+ }
+}
+
+void DeviceQueue::debug_synchronize()
+{
+ if (VLOG_IS_ON(3)) {
+ const double new_time = time_dt();
+ const double elapsed_time = new_time - last_sync_time_;
+ VLOG(3) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+
+ stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+
+ last_sync_time_ = new_time;
+ last_kernels_enqueued_ = 0;
+ }
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
index 0ccb6ce7cfb..5d556ec44dc 100644
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -18,6 +18,8 @@
#include "device/device_kernel.h"
+#include "util/util_map.h"
+
CCL_NAMESPACE_BEGIN
class Device;
@@ -32,7 +34,7 @@ struct KernelWorkTile;
* This class encapsulates all properties needed for commands execution. */
class DeviceQueue {
public:
- virtual ~DeviceQueue() = default;
+ virtual ~DeviceQueue();
/* Initialize execution of kernels on this queue.
*
@@ -66,6 +68,18 @@ class DeviceQueue {
protected:
/* Hide construction so that allocation via `Device` API is enforced. */
explicit DeviceQueue(Device *device);
+
+ /* Implementations call these from the corresponding methods to generate debugging logs. */
+ void debug_init_execution();
+ void debug_enqueue(DeviceKernel kernel, const int work_size);
+ void debug_synchronize();
+
+ /* Combination of kernels enqueued together sync last synchronize. */
+ DeviceKernelMask last_kernels_enqueued_;
+ /* Time of synchronize call. */
+ double last_sync_time_;
+ /* Accumulated execution time for combinations of kernels launched together. */
+ map<DeviceKernelMask, double> stats_kernel_time_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index e770ce06169..211df631bcb 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -58,6 +58,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
return false;
}
+ debug_enqueue(kernel, work_size);
+
const CUDAContextScope scope(cuda_device_);
OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 3e56f0a0193..c8826435afe 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -49,6 +49,7 @@ class LogMessageVoidify {
# define LOG(severity) LOG_SUPPRESS()
# define VLOG(severity) LOG_SUPPRESS()
# define VLOG_IF(severity, condition) LOG_SUPPRESS()
+# define VLOG_IS_ON(severity) false
#endif
#define VLOG_ONCE(level, flag) \
More information about the Bf-blender-cvs
mailing list