[Bf-blender-cvs] [33009ac0650] cycles-x: Cycles X: abstract new GPU kernels to be less CUDA specific

Brecht Van Lommel noreply at git.blender.org
Wed Aug 18 20:11:15 CEST 2021


Commit: 33009ac0650ad11ebb3a6e80a86a068daba939c5
Author: Brecht Van Lommel
Date:   Wed Aug 18 18:44:37 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB33009ac0650ad11ebb3a6e80a86a068daba939c5

Cycles X: abstract new GPU kernels to be less CUDA specific

With the idea of being able to reuse this for HIP and hopefully other devices.
Also cleans up compat.h for CUDA and OptiX a bit and makes them more consistent.

===================================================================

M	intern/cycles/device/cuda/kernel.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/device/cpu/compat.h
M	intern/cycles/kernel/device/cuda/compat.h
M	intern/cycles/kernel/device/cuda/config.h
M	intern/cycles/kernel/device/cuda/kernel.cu
R083	intern/cycles/kernel/device/cuda/image.h	intern/cycles/kernel/device/gpu/image.h
A	intern/cycles/kernel/device/gpu/kernel.h
R068	intern/cycles/kernel/device/cuda/parallel_active_index.h	intern/cycles/kernel/device/gpu/parallel_active_index.h
R083	intern/cycles/kernel/device/cuda/parallel_prefix_sum.h	intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
R071	intern/cycles/kernel/device/cuda/parallel_reduce.h	intern/cycles/kernel/device/gpu/parallel_reduce.h
R064	intern/cycles/kernel/device/cuda/parallel_sorted_index.h	intern/cycles/kernel/device/gpu/parallel_sorted_index.h
M	intern/cycles/kernel/device/optix/compat.h
M	intern/cycles/kernel/device/optix/kernel.cu
M	intern/cycles/util/util_defines.h

===================================================================

diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
index 8026715d7b3..0ed20ddf8e6 100644
--- a/intern/cycles/device/cuda/kernel.cpp
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -33,7 +33,7 @@ void CUDADeviceKernels::load(CUDADevice *device)
       continue;
     }
 
-    const std::string function_name = std::string("kernel_cuda_") +
+    const std::string function_name = std::string("kernel_gpu_") +
                                       device_kernel_as_string((DeviceKernel)i);
     cuda_device_assert(device,
                        cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str()));
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 23045034af5..b56e1c438ae 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -91,16 +91,19 @@ set(SRC_DEVICE_CPU_HEADERS
   device/cpu/kernel_arch.h
   device/cpu/kernel_arch_impl.h
 )
+set(SRC_DEVICE_GPU_HEADERS
+  device/gpu/image.h
+  device/gpu/kernel.h
+  device/gpu/parallel_active_index.h
+  device/gpu/parallel_prefix_sum.h
+  device/gpu/parallel_reduce.h
+  device/gpu/parallel_sorted_index.h
+)
 
 set(SRC_DEVICE_CUDA_HEADERS
   device/cuda/compat.h
   device/cuda/config.h
   device/cuda/globals.h
-  device/cuda/image.h
-  device/cuda/parallel_active_index.h
-  device/cuda/parallel_prefix_sum.h
-  device/cuda/parallel_reduce.h
-  device/cuda/parallel_sorted_index.h
 )
 
 set(SRC_DEVICE_OPTIX_HEADERS
@@ -316,6 +319,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
   # build for each arch
   set(cuda_sources device/cuda/kernel.cu
     ${SRC_HEADERS}
+    ${SRC_DEVICE_GPU_HEADERS}
     ${SRC_DEVICE_CUDA_HEADERS}
     ${SRC_BVH_HEADERS}
     ${SRC_SVM_HEADERS}
@@ -490,6 +494,7 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
         DEPENDS
           ${input}
           ${SRC_HEADERS}
+          ${SRC_DEVICE_GPU_HEADERS}
           ${SRC_DEVICE_CUDA_HEADERS}
           ${SRC_DEVICE_OPTIX_HEADERS}
           ${SRC_BVH_HEADERS}
@@ -514,6 +519,7 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
         DEPENDS
           ${input}
           ${SRC_HEADERS}
+          ${SRC_DEVICE_GPU_HEADERS}
           ${SRC_DEVICE_CUDA_HEADERS}
           ${SRC_DEVICE_OPTIX_HEADERS}
           ${SRC_BVH_HEADERS}
@@ -599,6 +605,7 @@ cycles_add_library(cycles_kernel "${LIB}"
   ${SRC_DEVICE_OPTIX}
   ${SRC_HEADERS}
   ${SRC_DEVICE_CPU_HEADERS}
+  ${SRC_DEVICE_GPU_HEADERS}
   ${SRC_DEVICE_CUDA_HEADERS}
   ${SRC_DEVICE_OPTIX_HEADERS}
   ${SRC_BVH_HEADERS}
@@ -614,6 +621,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
 source_group("kernel" FILES ${SRC_HEADERS})
 source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
 source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
 source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
 source_group("svm" FILES ${SRC_SVM_HEADERS})
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
index 0090b77a6ae..bfd936c7bbd 100644
--- a/intern/cycles/kernel/device/cpu/compat.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -34,15 +34,6 @@
 
 #define ccl_addr_space
 
-#define ccl_local_id(d) 0
-#define ccl_global_id(d) (kg->global_id[d])
-
-#define ccl_local_size(d) 1
-#define ccl_global_size(d) (kg->global_size[d])
-
-#define ccl_group_id(d) ccl_global_id(d)
-#define ccl_num_groups(d) ccl_global_size(d)
-
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
  */
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 5bd3e4bbea7..665da43e1a1 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -21,6 +21,10 @@
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
 
+#ifndef ATTR_FALLTHROUGH
+#  define ATTR_FALLTHROUGH
+#endif
+
 /* Manual definitions so we can compile without CUDA toolkit. */
 
 #ifdef __CUDACC_RTC__
@@ -29,8 +33,6 @@ typedef unsigned long long uint64_t;
 #else
 #  include <stdint.h>
 #endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
 
 #ifdef CYCLES_CUBIN_CC
 #  define FLT_MIN 1.175494350822287507969e-38f
@@ -38,14 +40,7 @@ typedef unsigned long long CUtexObject;
 #  define FLT_EPSILON 1.192092896e-07F
 #endif
 
-__device__ half __float2half(const float f)
-{
-  half val;
-  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
-  return val;
-}
-
-/* Qualifier wrappers for different names on different devices */
+/* Qualifiers */
 
 #define ccl_device __device__ __inline__
 #if __CUDA_ARCH__ < 500
@@ -61,95 +56,57 @@ __device__ half __float2half(const float f)
 #define ccl_static_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
-#define ccl_local __shared__
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
 #define ccl_private
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
 #define ccl_loop_no_unroll
-/* TODO(sergey): In theory we might use references with CUDA, however
- * performance impact yet to be investigated.
- */
-#define ccl_ref
 #define ccl_align(n) __align__(n)
 #define ccl_optional_struct_init
 
-#define ccl_attr_maybe_unused [[maybe_unused]]
-
-#define ATTR_FALLTHROUGH
-
-#define CCL_MAX_LOCAL_SIZE CUDA_KERNEL_BLOCK_NUM_THREADS
-
 /* No assert supported for CUDA */
 
 #define kernel_assert(cond)
 
-/* Types */
+/* GPU thread, block, grid size and index */
 
-#include "util/util_half.h"
-#include "util/util_types.h"
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
 
-/* Work item functions */
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
 
-ccl_device_inline uint ccl_local_id(uint d)
-{
-  switch (d) {
-    case 0:
-      return threadIdx.x;
-    case 1:
-      return threadIdx.y;
-    case 2:
-      return threadIdx.z;
-    default:
-      return 0;
-  }
-}
+/* GPU warp synchronization */
 
-#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
 
-ccl_device_inline uint ccl_local_size(uint d)
-{
-  switch (d) {
-    case 0:
-      return blockDim.x;
-    case 1:
-      return blockDim.y;
-    case 2:
-      return blockDim.z;
-    default:
-      return 0;
-  }
-}
+/* GPU texture objects */
 
-#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
 
-ccl_device_inline uint ccl_group_id(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
 {
-  switch (d) {
-    case 0:
-      return blockIdx.x;
-    case 1:
-      return blockIdx.y;
-    case 2:
-      return blockIdx.z;
-    default:
-      return 0;
-  }
+  return tex2D<T>(texobj, x, y);
 }
 
-ccl_device_inline uint ccl_num_groups(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
 {
-  switch (d) {
-    case 0:
-      return gridDim.x;
-    case 1:
-      return gridDim.y;
-    case 2:
-      return gridDim.z;
-    default:
-      return 0;
-  }
+  return tex3D<T>(texobj, x, y, z);
 }
 
 /* Use fast math functions */
@@ -160,3 +117,19 @@ ccl_device_inline uint ccl_num_groups(uint d)
 #define tanf(x) __tanf(((float)(x)))
 #define logf(x) __logf(((float)(x)))
 #define expf(x) __expf(((float)(x)))
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+  half val;
+  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+  return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
index 4e6268b37ce..46196dcdb51 100644
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -26,94 +26,89 @@
 
 /* 3.0 and 3.5 */
 #if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
+#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define GPU_BLOCK_MAX_THREADS 1024
+#  define GPU_THREAD_MAX_REGISTERS 63
 
 /* tunable parameters */
-#  define CUDA_KERNEL_BLOCK_NUM_THREADS 256
-#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
+#  define GPU_KERNEL_MAX_REGISTERS 63
 
 /* 3.2 */
 #elif __CUDA_ARCH__ == 320
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
+#  define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define GPU_BLOCK_MAX_THREADS 1024
+#  define GPU_THREAD_MAX_REGISTERS 63
 
 /* tunable parameters */
-#  define CUDA_KERNEL_BLOCK_NUM_THREADS 256
-#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
+#  define GPU_KERNEL_MAX_REGISTERS 63
 
 /* 3.7 */
 #elif __CUDA_ARCH__ == 370
-#  defi

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list