[Bf-blender-cvs] [4412e14708c] master: Cycles: Useful Metal backend debug & profiling functionality

Michael Jones noreply at git.blender.org
Tue Jun 7 12:08:47 CEST 2022


Commit: 4412e14708c5625c3fe84bc75fce2ca6de6f58c9
Author: Michael Jones
Date:   Tue Jun 7 11:08:21 2022 +0100
Branches: master
https://developer.blender.org/rB4412e14708c5625c3fe84bc75fce2ca6de6f58c9

Cycles: Useful Metal backend debug & profiling functionality

This patch adds some useful debugging & profiling env vars to the Metal backend:

- `CYCLES_METAL_PROFILING`: output a per-kernel timing report at the end of the render
- `CYCLES_METAL_DEBUG`: enable per-dispatch tracing (very verbose)
- `CYCLES_DEBUG_METAL_CAPTURE_KERNEL`: enable programatic .gputrace capture for a specified kernel index

Here's an example of the timing report with `CYCLES_METAL_PROFILING` enabled:

```
---------------------------------------------------------------------------------------------------
Kernel name                                 Total threads   Dispatches     Avg. T/D    Time   Time%
---------------------------------------------------------------------------------------------------
integrator_init_from_camera                   657,407,232          161    4,083,274   0.24s   0.51%
integrator_intersect_closest                1,629,288,440          681    2,392,494  15.18s  32.12%
integrator_intersect_shadow                   751,652,291          470    1,599,260   5.80s  12.28%
integrator_shade_background                   304,612,074          263    1,158,220   1.16s   2.45%
integrator_shade_surface                    1,159,764,041          676    1,715,627  20.57s  43.52%
integrator_shade_shadow                       598,885,847          418    1,432,741   1.27s   2.69%
integrator_queued_paths_array               2,969,650,130          805    3,689,006   0.35s   0.74%
integrator_queued_shadow_paths_array          593,936,619          379    1,567,115   0.14s   0.29%
integrator_terminated_paths_array              22,205,417          155      143,260   0.05s   0.10%
integrator_sorted_paths_array               2,517,140,043          676    3,723,579   1.65s   3.50%
integrator_compact_paths_array                648,912,748          155    4,186,533   0.03s   0.07%
integrator_compact_states                      20,872,687          155      134,662   0.14s   0.29%
integrator_terminated_shadow_paths_array      374,100,675          438      854,111   0.16s   0.33%
integrator_compact_shadow_paths_array         503,768,657          438    1,150,156   0.05s   0.10%
integrator_compact_shadow_states               37,664,941          202      186,460   0.23s   0.50%
integrator_reset                               25,165,824            6    4,194,304   0.06s   0.12%
film_convert_combined_half_rgba                 3,110,400            6      518,400   0.00s   0.01%
prefix_sum                                            676          676            1   0.19s   0.40%
---------------------------------------------------------------------------------------------------
                                                                 6,760               47.27s 100.00%
---------------------------------------------------------------------------------------------------
```

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D15044

===================================================================

M	intern/cycles/device/metal/bvh.mm
M	intern/cycles/device/metal/device_impl.h
M	intern/cycles/device/metal/device_impl.mm
M	intern/cycles/device/metal/queue.h
M	intern/cycles/device/metal/queue.mm
M	intern/cycles/device/metal/util.h
M	intern/cycles/kernel/device/gpu/kernel.h

===================================================================

diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm
index 086fbb093ba..09c4ace081e 100644
--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -11,6 +11,7 @@
 #  include "util/progress.h"
 
 #  include "device/metal/bvh.h"
+#  include "device/metal/util.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -18,6 +19,7 @@ CCL_NAMESPACE_BEGIN
     { \
       string str = string_printf(__VA_ARGS__); \
       progress.set_substatus(str); \
+      metal_printf("%s\n", str.c_str()); \
     }
 
 BVHMetal::BVHMetal(const BVHParams &params_,
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 7506b9b069f..0e6817d94f8 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -31,6 +31,8 @@ class MetalDevice : public Device {
   string source[PSO_NUM];
   string source_md5[PSO_NUM];
 
+  bool capture_enabled = false;
+
   KernelParamsMetal launch_params = {0};
 
   /* MetalRT members ----------------------------------*/
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 16aabacb4cf..086bf0af979 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -86,6 +86,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
     use_metalrt = (atoi(metalrt) != 0);
   }
 
+  if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
+    capture_enabled = true;
+  }
+
   MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
   arg_desc_params.dataType = MTLDataTypePointer;
   arg_desc_params.access = MTLArgumentAccessReadOnly;
@@ -394,7 +398,7 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
   }
 
   if (size > 0) {
-    if (mem.type == MEM_DEVICE_ONLY) {
+    if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
       options = MTLResourceStorageModePrivate;
     }
 
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index 6cc84a20787..de20514de0b 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -12,8 +12,6 @@
 #  include "device/metal/util.h"
 #  include "kernel/device/metal/globals.h"
 
-#  define metal_printf VLOG(4) << string_printf
-
 CCL_NAMESPACE_BEGIN
 
 class MetalDevice;
@@ -77,6 +75,38 @@ class MetalDeviceQueue : public DeviceQueue {
 
   void close_compute_encoder();
   void close_blit_encoder();
+
+  bool verbose_tracing = false;
+
+  /* Per-kernel profiling (see CYCLES_METAL_PROFILING). */
+
+  struct TimingData {
+    DeviceKernel kernel;
+    int work_size;
+    uint64_t timing_id;
+  };
+  std::vector<TimingData> command_encoder_labels;
+  id<MTLSharedEvent> timing_shared_event = nil;
+  uint64_t timing_shared_event_id;
+  uint64_t command_buffer_start_timing_id;
+
+  struct TimingStats {
+    double total_time = 0.0;
+    uint64_t total_work_size = 0;
+    uint64_t num_dispatches = 0;
+  };
+  TimingStats timing_stats[DEVICE_KERNEL_NUM];
+  double last_completion_time = 0.0;
+
+  /* .gputrace capture (see CYCLES_DEBUG_METAL_CAPTURE_...). */
+
+  id<MTLCaptureScope> mtlCaptureScope = nil;
+  DeviceKernel capture_kernel;
+  int capture_dispatch = 0;
+  int capture_dispatch_counter = 0;
+  bool is_capturing = false;
+  bool is_capturing_to_disk = false;
+  bool has_captured_to_disk = false;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index c1dab5b0d8f..8b2d5d81859 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
   }
 
   wait_semaphore = dispatch_semaphore_create(0);
+
+  if (@available(macos 10.14, *)) {
+    if (getenv("CYCLES_METAL_PROFILING")) {
+      /* Enable per-kernel timing breakdown (shown at end of render). */
+      timing_shared_event = [mtlDevice newSharedEvent];
+    }
+    if (getenv("CYCLES_METAL_DEBUG")) {
+      /* Enable very verbose tracing (shows every dispatch). */
+      verbose_tracing = true;
+    }
+    timing_shared_event_id = 1;
+  }
+
+  capture_kernel = DeviceKernel(-1);
+  if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
+    /* Enable .gputrace capture for the specified DeviceKernel. */
+    MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
+    mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice];
+    mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
+    [captureManager setDefaultCaptureScope:mtlCaptureScope];
+
+    capture_dispatch = -1;
+    if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
+      capture_dispatch = atoi(capture_dispatch_str);
+      capture_dispatch_counter = 0;
+    }
+
+    capture_kernel = DeviceKernel(atoi(capture_kernel_str));
+    printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel));
+
+    if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
+      if (@available(macos 10.15, *)) {
+        if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
+
+          MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
+          captureDescriptor.captureObject = mtlCaptureScope;
+          captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
+          captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];
+
+          NSError *error;
+          if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
+            NSString *err = [error localizedDescription];
+            printf("Start capture failed: %s\n", [err UTF8String]);
+          }
+          else {
+            printf("Capture started (URL: %s)\n", capture_url);
+            is_capturing_to_disk = true;
+          }
+        }
+        else {
+          printf("Capture to file is not supported\n");
+        }
+      }
+    }
+  }
 }
 
 MetalDeviceQueue::~MetalDeviceQueue()
@@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue()
     [mtlCommandQueue release];
     mtlCommandQueue = nil;
   }
+
+  if (mtlCaptureScope) {
+    [mtlCaptureScope release];
+  }
+
+  double total_time = 0.0;
+
+  /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
+  int64_t total_work_size = 0;
+  int64_t num_dispatches = 0;
+  for (auto &stat : timing_stats) {
+    total_time += stat.total_time;
+    total_work_size += stat.total_work_size;
+    num_dispatches += stat.num_dispatches;
+  }
+
+  if (num_dispatches) {
+    printf("\nMetal dispatch stats:\n\n");
+    auto header = string_printf("%-40s %16s %12s %12s %7s %7s",
+                                "Kernel name",
+                                "Total threads",
+                                "Dispatches",
+                                "Avg. T/D",
+                                "Time",
+                                "Time%");
+    auto divider = string(header.length(), '-');
+    printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());
+
+    for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
+      auto &stat = timing_stats[i];
+      if (stat.num_dispatches > 0) {
+        printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
+               device_kernel_as_string(DeviceKernel(i)),
+               string_human_readable_number(stat.total_work_size).c_str(),
+               string_human_readable_number(stat.num_dispatches).c_str(),
+               string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(),
+               stat.total_time,
+               stat.total_time * 100.0 / total_time);
+      }
+    }
+    printf("%s\n", divider.c_str());
+    printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
+           "",
+           "",
+           string_human_readable_number(num_dispatches).c_str(),
+           "",
+           total_time,
+           100.0);
+    printf("%s\n\n", divider.c_str());
+  }
 }
 
 int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
@@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                                const int work_size,
                                DeviceKernelArguments const &args)
 {
+  if (kernel == capture_kernel) {
+    if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) {
+      /* Start gputrace capture. */
+      if (mtlCommandBuffer) {
+        synchronize();
+      }
+      [mtlCaptureScope beginScope];
+      printf("[mtlCaptureScope beginScope]\n");
+      is_capturing = true;
+    }
+    capture_dispatch_counter += 1;
+  }
+
   if (metal_device->have_error()) {
     return false;
   }
@@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
   id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
 
+  if (timing_shared_event) {
+    command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id});
+  }
+
   /* Determine size requirement for argument buffer. */
   size_t arg_buffer_length = 0;
   for (size_t i = 0; i < args.count; i++) {
@@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   /* Encode KernelParamsMetal buffers */
   [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets];
 
+  if (verbose_tracing || timing_shared_event || is_capturing) {
+    /* Add human-readable labels if we're doing any form of debugging / profiling. */
+    mtlComputeCommandEncoder.label = [[NSString alloc]
+        initWithFormat:@"Metal queue launch %s, work_size %d",
+                       device_kernel_as_string(kernel),
+                       work_size];
+  }
+
   /* this relies on IntegratorStateGPU layout being contiguous device_ptrs  */
   const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
                                    sizeof(IntegratorStateGPU);
@@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
     int pointer_index = offset / sizeof(device_ptr);
     MetalDevice::MetalMem *mmem =

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list