[Bf-blender-cvs] [3a4c8f406a3] master: Cycles: Adapt shared kernel/device/gpu layer for MSL

Tue Nov 9 22:43:17 CET 2021

Commit: 3a4c8f406a3a3bf0627477c6183a594fa707a6e2
Author: Michael Jones
Date:   Tue Nov 9 21:30:46 2021 +0000
Branches: master
https://developer.blender.org/rB3a4c8f406a3a3bf0627477c6183a594fa707a6e2

Cycles: Adapt shared kernel/device/gpu layer for MSL

This patch adapts the shared kernel entrypoints so that they can be compiled as MSL (Metal Shading Language). Where possible, the adaptations avoid changes in common code.

In MSL, kernel function inputs are explicitly bound to resources. In the case of argument buffers, we declare a struct containing the kernel arguments, accessible via device pointer. This differs from CUDA and HIP where kernel function arguments are declared as traditional C-style function parameters. This patch adapts the entrypoints declared in kernel.h so that they can be translated via a new `ccl_gpu_kernel_signature` macro into the required parameter struct + kernel entrypoint pairin [...]

MSL buffer attribution must be applied to function parameters or non-static class data members. To allow universal access to the integrator state, kernel data, and texture fetch adapters, we wrap all of the shared kernel code in a `MetalKernelContext` class. This is achieved by bracketing the appropriate kernel headers with "context_begin.h" and "context_end.h" on Metal. When calling deeper into the kernel code, we must reference the context class (e.g. `context.integrator_init_from_camer [...]

Lambda expressions are not supported on MSL, so a new `ccl_gpu_kernel_lambda` macro generates an inline function object and optionally capturing any required state. This yields the same behaviour. This approach is applied to all parallel_... implementations which are templated by operation. The lambda expressions in the film_convert... kernels don't adapt cleanly to use function objects. However, these entrypoints can be macro-generated more concisely to avoid lambda expressions entirely, [...]

A separate implementation of `gpu_parallel_active_index_array` is provided for Metal to workaround some subtle differences in SIMD width, and also to encapsulate some required thread parameters which must be declared as explicit entrypoint function parameters.

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D13109

===================================================================

M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/device/cuda/compat.h
M	intern/cycles/kernel/device/cuda/config.h
M	intern/cycles/kernel/device/gpu/image.h
M	intern/cycles/kernel/device/gpu/kernel.h
M	intern/cycles/kernel/device/gpu/parallel_active_index.h
M	intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
M	intern/cycles/kernel/device/gpu/parallel_sorted_index.h
M	intern/cycles/kernel/device/hip/compat.h
M	intern/cycles/kernel/device/hip/config.h
M	intern/cycles/kernel/device/metal/compat.h
A	intern/cycles/kernel/device/metal/context_begin.h
A	intern/cycles/kernel/device/metal/context_end.h
A	intern/cycles/kernel/device/metal/globals.h
A	intern/cycles/kernel/device/metal/kernel.metal
M	intern/cycles/kernel/device/optix/compat.h

===================================================================

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 29ff69df864..f311b0e74bb 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP
   device/hip/kernel.cpp
 )
 
+set(SRC_KERNEL_DEVICE_METAL
+  device/metal/kernel.metal
+)
+
 set(SRC_KERNEL_DEVICE_OPTIX
   device/optix/kernel.cu
   device/optix/kernel_shader_raytrace.cu
@@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
   device/optix/globals.h
 )
 
+set(SRC_KERNEL_DEVICE_METAL_HEADERS
+  device/metal/compat.h
+  device/metal/context_begin.h
+  device/metal/context_end.h
+  device/metal/globals.h
+)
+
 set(SRC_KERNEL_CLOSURE_HEADERS
   closure/alloc.h
   closure/bsdf.h
@@ -368,6 +379,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
     ${SRC_KERNEL_HEADERS}
     ${SRC_KERNEL_DEVICE_GPU_HEADERS}
     ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
+    ${SRC_KERNEL_DEVICE_METAL_HEADERS}
     ${SRC_UTIL_HEADERS}
   )
   set(cuda_cubins)
@@ -723,12 +735,14 @@ cycles_add_library(cycles_kernel "${LIB}"
   ${SRC_KERNEL_DEVICE_CUDA}
   ${SRC_KERNEL_DEVICE_HIP}
   ${SRC_KERNEL_DEVICE_OPTIX}
+  ${SRC_KERNEL_DEVICE_METAL}
   ${SRC_KERNEL_HEADERS}
   ${SRC_KERNEL_DEVICE_CPU_HEADERS}
   ${SRC_KERNEL_DEVICE_GPU_HEADERS}
   ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
   ${SRC_KERNEL_DEVICE_HIP_HEADERS}
   ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
+  ${SRC_KERNEL_DEVICE_METAL_HEADERS}
 )
 
 source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -740,6 +754,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_
 source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
 source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
 source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
+source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
 source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
 source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -772,6 +787,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 1ee82e6eb7c..2feebad074f 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -75,6 +75,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
index 46196dcdb51..e333fe90332 100644
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -93,11 +93,35 @@
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. */
 
-#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_threads_registers(block_num_threads, thread_num_registers) \
   extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                                GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                    (block_num_threads * thread_num_registers))
 
+/* allow ccl_gpu_kernel to accept 1 or 2 parameters */
+#define SELECT_MACRO(_1, _2, NAME, ...) NAME
+#define ccl_gpu_kernel(...) \
+  SELECT_MACRO(__VA_ARGS__, ccl_gpu_kernel_threads_registers, ccl_gpu_kernel_threads)(__VA_ARGS__)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state  */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass; \
+  ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */
 
 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 95a37c693ae..0900a45c83d 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)
 
 /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
 template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+                                                      float x,
+                                                      float y)
 {
   ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
 
@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
 /* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
 ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
 {
   ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
 
@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
 
 template<typename T>
 ccl_device_noinline T kernel_tex_image_interp_nanovdb(
-    const TextureInfo &info, float x, float y, float z, uint interpolation)
+    ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
 {
   using namespace nanovdb;
 
@@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
   /* float4, byte4, ushort4 and half4 */
   const int texture_type = info.data_type;
@@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                              float3 P,
                                              InterpolationType interp)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
   if (info.use_transform_3d) {
     P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 5848ba5df9d..2ec6a49ec7b 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -21,6 +21,10 @@
 #include "kernel/device/gpu/parallel_sorted_index.h"
 #include "kernel/device/gpu/work_stealing.h"
 
+#ifdef __KERNEL_METAL__
+#  include "kernel/device/metal/context_begin.h"
+#endif
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -40,6 +44,11 @@
 #include "kernel/bake/bake.h"
 
 #include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+#  include "kernel/device/metal/context_end.h"
+#endif
+
 #include "kernel/film/read.h"
 
 /* --------------------------------------------------------------------
@@ -47,7 +56,8 @@
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_reset(int num_states)
+ccl_gpu_kernel_signature(integrator_reset,
+                          int num_states)
 {
   const int state = ccl_gpu_global_id_x();
 
@@ -58,10 +68,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
-                                           const int num_tiles,
-                                           float *render_buffer,
-                                           const int max_tile_work_size)
+    ccl_gpu_kernel_signature(integrator_init_from_camera,
+                             ccl_global KernelWorkTile *tiles,
+                             const int num_tiles,
+                             ccl_global float *render_buffer,
+                             const int max_tile_work_size)
 {
   const int work_index = ccl_gpu_global_id_x();
 
@@ -72,7 +83,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int tile_index = work_index / max_tile_work_size;
   const int tile_work_index = work_index - tile_index * max_tile_work_size;
 
-  const Ker

@@ Diff output truncated at 10240 characters. @@