[Bf-blender-cvs] [a6b53ef9949] master: Cycles: print name of kernels on errors in CUDA queue, for debugging

Mon Sep 27 15:24:26 CEST 2021

Commit: a6b53ef99492267f8f27fd58ea35104b88e1bec8
Author: Brecht Van Lommel
Date:   Mon Sep 27 14:47:51 2021 +0200
Branches: master
https://developer.blender.org/rBa6b53ef99492267f8f27fd58ea35104b88e1bec8

Cycles: print name of kernels on errors in CUDA queue, for debugging

===================================================================

M	intern/cycles/device/cuda/queue.cpp
M	intern/cycles/device/cuda/queue.h
M	intern/cycles/device/device_queue.cpp
M	intern/cycles/device/device_queue.h

===================================================================

diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index b7f86c10553..1149a835b14 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -116,18 +116,18 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
   }
 
   /* Launch kernel. */
-  cuda_device_assert(cuda_device_,
-                     cuLaunchKernel(cuda_kernel.function,
-                                    num_blocks,
-                                    1,
-                                    1,
-                                    num_threads_per_block,
-                                    1,
-                                    1,
-                                    shared_mem_bytes,
-                                    cuda_stream_,
-                                    args,
-                                    0));
+  assert_success(cuLaunchKernel(cuda_kernel.function,
+                                num_blocks,
+                                1,
+                                1,
+                                num_threads_per_block,
+                                1,
+                                1,
+                                shared_mem_bytes,
+                                cuda_stream_,
+                                args,
+                                0),
+                 "enqueue");
 
   return !(cuda_device_->have_error());
 }
@@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize()
   }
 
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
+
   debug_synchronize();
 
   return !(cuda_device_->have_error());
@@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem)
   assert(mem.device_pointer != 0);
 
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
-      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+  assert_success(
+      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
+      "zero_to_device");
 }
 
 void CUDADeviceQueue::copy_to_device(device_memory &mem)
@@ -185,10 +186,10 @@ void CUDADeviceQueue::copy_to_device(device_memory &mem)
 
   /* Copy memory to device. */
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
       cuMemcpyHtoDAsync(
-          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
+      "copy_to_device");
 }
 
 void CUDADeviceQueue::copy_from_device(device_memory &mem)
@@ -204,10 +205,19 @@ void CUDADeviceQueue::copy_from_device(device_memory &mem)
 
   /* Copy memory from device. */
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
       cuMemcpyDtoHAsync(
-          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
+      "copy_from_device");
+}
+
+void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
+{
+  if (result != CUDA_SUCCESS) {
+    const char *name = cuewErrorString(result);
+    cuda_device_->set_error(string_printf(
+        "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+  }
 }
 
 unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 62e3aa3d6c2..4d1995ed69e 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue {
  protected:
   CUDADevice *cuda_device_;
   CUstream cuda_stream_;
+
+  void assert_success(CUresult result, const char *operation);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
index a89ba68d62c..f2b2f3496e0 100644
--- a/intern/cycles/device/device_queue.cpp
+++ b/intern/cycles/device/device_queue.cpp
@@ -57,8 +57,9 @@ void DeviceQueue::debug_init_execution()
 {
   if (VLOG_IS_ON(3)) {
     last_sync_time_ = time_dt();
-    last_kernels_enqueued_ = 0;
   }
+
+  last_kernels_enqueued_ = 0;
 }
 
 void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
@@ -66,8 +67,9 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
   if (VLOG_IS_ON(3)) {
     VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
             << work_size;
-    last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
   }
+
+  last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
 }
 
 void DeviceQueue::debug_synchronize()
@@ -80,8 +82,14 @@ void DeviceQueue::debug_synchronize()
     stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
 
     last_sync_time_ = new_time;
-    last_kernels_enqueued_ = 0;
   }
+
+  last_kernels_enqueued_ = 0;
+}
+
+string DeviceQueue::debug_active_kernels()
+{
+  return device_kernel_mask_as_string(last_kernels_enqueued_);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
index edda3e61d51..e6835b787cf 100644
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -21,6 +21,7 @@
 #include "device/device_graphics_interop.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
+#include "util/util_string.h"
 #include "util/util_unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
@@ -101,6 +102,7 @@ class DeviceQueue {
   void debug_init_execution();
   void debug_enqueue(DeviceKernel kernel, const int work_size);
   void debug_synchronize();
+  string debug_active_kernels();
 
   /* Combination of kernels enqueued together sync last synchronize. */
   DeviceKernelMask last_kernels_enqueued_;