[Bf-blender-cvs] [b82de02e7ce] master: Cycles: Enable inlining on Apple Silicon for 1.1x speedup

Michael Jones noreply at git.blender.org
Tue Apr 26 23:17:18 CEST 2022


Commit: b82de02e7ce857e20b842a074c0068b146a9fd79
Author: Michael Jones
Date:   Tue Apr 26 19:00:35 2022 +0100
Branches: master
https://developer.blender.org/rBb82de02e7ce857e20b842a074c0068b146a9fd79

Cycles: Enable inlining on Apple Silicon for 1.1x speedup

This is a stripped down version of D14645 without the scene specialisation optimisations.

The two major changes in this patch are:

- Enables more aggressive inlining on Apple Silicon resulting in a 1.1x speedup and 10% reduction in spill, at the cost of longer pipeline build times
- Revival of shader binary archives through a new ShaderCache which is shared between MetalDevice instances using the same physical MTLDevice. This mitigates the extra compile times via explicit caching (rather than, as before, relying on the implicit system shader cache which can be purged without notice)

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14763

===================================================================

M	intern/cycles/device/metal/device_impl.h
M	intern/cycles/device/metal/device_impl.mm
M	intern/cycles/device/metal/kernel.h
M	intern/cycles/device/metal/kernel.mm
M	intern/cycles/device/metal/queue.mm
M	intern/cycles/kernel/device/metal/compat.h

===================================================================

diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 27c58ce6d2f..d7311ee985f 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -28,7 +28,8 @@ class MetalDevice : public Device {
   id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
   id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
       nil; /* encoder used for fetching device pointers from MTLBuffers */
-  string source_used_for_compile[PSO_NUM];
+  string source[PSO_NUM];
+  string source_md5[PSO_NUM];
 
   KernelParamsMetal launch_params = {0};
 
@@ -110,6 +111,12 @@ class MetalDevice : public Device {
 
   virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
 
+  id<MTLLibrary> compile(string const &source);
+
+  const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const;
+
+  bool kernel_available(DeviceKernel kernel) const;
+
   /* ------------------------------------------------------------------ */
   /* low-level memory management */
 
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index c01f51fb506..7d1212cb37c 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
    * active, but may still need to be rendered without motion blur if that isn't active as well. */
   motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
 
-  NSError *error = NULL;
+  source[PSO_GENERIC] = get_source(kernel_features);
+  mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
 
-  for (int i = 0; i < PSO_NUM; i++) {
-    if (mtlLibrary[i]) {
-      [mtlLibrary[i] release];
-      mtlLibrary[i] = nil;
-    }
-  }
+  MD5Hash md5;
+  md5.append(source[PSO_GENERIC]);
+  source_md5[PSO_GENERIC] = md5.get_hex();
+
+  metal_printf("Front-end compilation finished (generic)\n");
+
+  bool result = kernels.load(this, false);
 
+  reserve_local_memory(kernel_features);
+
+  return result;
+}
+
+id<MTLLibrary> MetalDevice::compile(string const &source)
+{
   MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
 
   options.fastMathEnabled = YES;
   if (@available(macOS 12.0, *)) {
     options.languageVersion = MTLLanguageVersion2_4;
   }
-  else {
-    return false;
-  }
-
-  string metalsrc;
-
-  /* local helper: dump source to disk and return filepath */
-  auto dump_source = [&](int kernel_type) -> string {
-    string &source = source_used_for_compile[kernel_type];
-    string metalsrc = path_cache_get(path_join("kernels",
-                                               string_printf("%s.%s.metal",
-                                                             kernel_type_as_string(kernel_type),
-                                                             util_md5_string(source).c_str())));
-    path_write_text(metalsrc, source);
-    return metalsrc;
-  };
-
-  /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
-   * then compile it into a MTLLibrary */
-  auto fetch_and_compile_source = [&](int kernel_type) {
-    /* Record the source used to compile this library, for hash building later. */
-    string &source = source_used_for_compile[kernel_type];
-
-    switch (kernel_type) {
-      case PSO_GENERIC: {
-        source = get_source(kernel_features);
-        break;
-      }
-      case PSO_SPECIALISED: {
-        /* PSO_SPECIALISED derives from PSO_GENERIC */
-        string &generic_source = source_used_for_compile[PSO_GENERIC];
-        if (generic_source.empty()) {
-          generic_source = get_source(kernel_features);
-        }
-        source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
-        break;
-      }
-      default:
-        assert(0);
-    }
 
-    /* create MTLLibrary (front-end compilation) */
-    mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
+  NSError *error = NULL;
+  id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                       options:options
                                                         error:&error];
 
-    bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
-
-    if (!mtlLibrary[kernel_type] || do_source_dump) {
-      string metalsrc = dump_source(kernel_type);
-
-      if (!mtlLibrary[kernel_type]) {
-        NSString *err = [error localizedDescription];
-        set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
-
-        return false;
-      }
-    }
-    return true;
-  };
-
-  fetch_and_compile_source(PSO_GENERIC);
-
-  if (use_function_specialisation) {
-    fetch_and_compile_source(PSO_SPECIALISED);
+  if (!mtlLibrary) {
+    NSString *err = [error localizedDescription];
+    set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
   }
 
-  metal_printf("Front-end compilation finished\n");
-
-  bool result = kernels.load(this, PSO_GENERIC);
-
   [options release];
-  reserve_local_memory(kernel_features);
 
-  return result;
+  return mtlLibrary;
 }
 
 void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -671,6 +619,11 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
   return 0;
 }
 
+const MetalKernelPipeline &MetalDevice::get_best_pipeline(DeviceKernel kernel) const
+{
+  return kernels.get_best_pipeline(this, kernel);
+}
+
 void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
 {
   if (strcmp(name, "__data") == 0) {
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index b12491d820d..7e398d1cf41 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -54,98 +54,41 @@ enum {
 const char *kernel_type_as_string(int kernel_type);
 
 struct MetalKernelPipeline {
-  void release()
-  {
-    if (pipeline) {
-      [pipeline release];
-      pipeline = nil;
-      if (@available(macOS 11.0, *)) {
-        for (int i = 0; i < METALRT_TABLE_NUM; i++) {
-          if (intersection_func_table[i]) {
-            [intersection_func_table[i] release];
-            intersection_func_table[i] = nil;
-          }
-        }
-      }
-    }
-    if (function) {
-      [function release];
-      function = nil;
-    }
-    if (@available(macOS 11.0, *)) {
-      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
-        if (intersection_func_table[i]) {
-          [intersection_func_table[i] release];
-        }
-      }
-    }
-  }
 
-  bool loaded = false;
-  id<MTLFunction> function = nil;
-  id<MTLComputePipelineState> pipeline = nil;
-
-  API_AVAILABLE(macos(11.0))
-  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
-};
+  void compile();
 
-struct MetalKernelLoadDesc {
-  int pso_index = 0;
-  const char *function_name = nullptr;
-  int kernel_index = 0;
-  int threads_per_threadgroup = 0;
-  MTLFunctionConstantValues *constant_values = nullptr;
-  NSArray *linked_functions = nullptr;
-
-  struct IntersectorFunctions {
-    NSArray *defaults;
-    NSArray *shadow;
-    NSArray *local;
-    NSArray *operator[](int index) const
-    {
-      if (index == METALRT_TABLE_DEFAULT)
-        return defaults;
-      if (index == METALRT_TABLE_SHADOW)
-        return shadow;
-      return local;
-    }
-  } intersector_functions = {nullptr};
-};
-
-/* Metal kernel and associate occupancy information. */
-class MetalDeviceKernel {
- public:
-  ~MetalDeviceKernel();
+  id<MTLLibrary> mtlLibrary = nil;
+  bool scene_specialized;
+  string source_md5;
 
-  bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
+  bool use_metalrt;
+  bool metalrt_hair;
+  bool metalrt_hair_thick;
+  bool metalrt_pointcloud;
 
-  void mark_loaded(int pso_index)
-  {
-    pso[pso_index].loaded = true;
-  }
+  int threads_per_threadgroup;
 
-  int get_num_threads_per_block() const
-  {
-    return num_threads_per_block;
-  }
-  const MetalKernelPipeline &get_pso() const;
-
-  double load_duration = 0.0;
+  DeviceKernel device_kernel;
+  bool loaded = false;
+  id<MTLDevice> mtlDevice = nil;
+  id<MTLFunction> function = nil;
+  id<MTLComputePipelineState> pipeline = nil;
+  int num_threads_per_block = 0;
 
- private:
-  MetalKernelPipeline pso[PSO_NUM];
+  string error_str;
 
-  int num_threads_per_block = 0;
+  API_AVAILABLE(macos(11.0))
+  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
+  id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
 };
 
 /* Cache of Metal kernels for each DeviceKernel. */
 class MetalDeviceKernels {
  public:
-  bool load(MetalDevice *device, int kernel_type);
-  bool available(DeviceKernel kernel) const;
-  const MetalDeviceKernel &get(DeviceKernel kernel) const;
-
-  MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+  bool load(MetalDevice *device, bool scene_specialized);
+  bool available(const MetalDevice *device, DeviceKernel kernel) const;
+  const MetalKernelPipeline &get_best_pipeline(const MetalDevice *device,
+                                               DeviceKernel kernel) const;
 
   id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
 
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 9555ca03c8e..44a5e23d00f 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -9,6 +9,7 @@
 #  include "util/path.h"
 #  include "util/tbb.h"
 #  include "util/time.h"
+#  include "util/unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -28,82 +29,370 @@ const char *kernel_type_as_string(int kernel_type)
   return "";
 }
 
-MetalDeviceKernel::~MetalDeviceKernel()
+bool kernel_has_intersection(DeviceKernel device_kernel)
 {
-  for (int i = 0; i < PSO_NUM; i++) {
-    pso[i].release();
+  return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+          device_kernel ==

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list