[Bf-blender-cvs] [e2ff730fd98] cycles-hip-binaries: Cycles: patch for precompiled HIP binaries and additional fixes

Thu Oct 21 21:05:20 CEST 2021

Commit: e2ff730fd9830ef399924cf04b2dac5235097dde
Author: Sayak Biswas
Date:   Thu Oct 21 20:57:17 2021 +0200
Branches: cycles-hip-binaries
https://developer.blender.org/rBe2ff730fd9830ef399924cf04b2dac5235097dde

Cycles: patch for precompiled HIP binaries and additional fixes

Committing to a branch to test it on the buildbot.

Ref T92393, D12958

===================================================================

M	CMakeLists.txt
M	extern/hipew/include/hipew.h
M	extern/hipew/src/hipew.c
M	intern/cycles/blender/addon/properties.py
M	intern/cycles/device/hip/device.cpp
M	intern/cycles/device/hip/device_impl.cpp
M	intern/cycles/device/hip/device_impl.h
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/device/hip/globals.h

===================================================================

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 94a5ff27491..715e9dd01d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -420,7 +420,9 @@ mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
 set(CYCLES_TEST_DEVICES CPU CACHE STRING "Run regression tests on the specified device types (CPU CUDA OPTIX)" )
 set(CYCLES_CUDA_BINARIES_ARCH sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 sm_70 sm_75 sm_86 compute_75 CACHE STRING "CUDA architectures to build binaries for")
 mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
-option(WITH_CYCLES_HIP_BINARIES     "Build Cycles HIP binaries" OFF)
+option(WITH_CYCLES_HIP_BINARIES     "Build Cycles HIP binaries" ON)
+set(CYCLES_HIP_BINARIES_ARCH gfx1030 gfx1031 gfx1032 gfx1034 CACHE STRING "HIP architectures to build binaries for")
+mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
 unset(PLATFORM_DEFAULT)
 option(WITH_CYCLES_LOGGING      "Build Cycles with logging support" ON)
 option(WITH_CYCLES_DEBUG_NAN    "Build Cycles with additional asserts for detecting NaNs and invalid values" OFF)
diff --git a/extern/hipew/include/hipew.h b/extern/hipew/include/hipew.h
index aa42fdf8ecd..d18cf67524d 100644
--- a/extern/hipew/include/hipew.h
+++ b/extern/hipew/include/hipew.h
@@ -425,6 +425,105 @@ typedef struct HIPdevprop_st {
   int textureAlign;
 } HIPdevprop;
 
+typedef struct {
+    // 32-bit Atomics
+    unsigned hasGlobalInt32Atomics : 1;     ///< 32-bit integer atomics for global memory.
+    unsigned hasGlobalFloatAtomicExch : 1;  ///< 32-bit float atomic exch for global memory.
+    unsigned hasSharedInt32Atomics : 1;     ///< 32-bit integer atomics for shared memory.
+    unsigned hasSharedFloatAtomicExch : 1;  ///< 32-bit float atomic exch for shared memory.
+    unsigned hasFloatAtomicAdd : 1;  ///< 32-bit float atomic add in global and shared memory.
+
+    // 64-bit Atomics
+    unsigned hasGlobalInt64Atomics : 1;  ///< 64-bit integer atomics for global memory.
+    unsigned hasSharedInt64Atomics : 1;  ///< 64-bit integer atomics for shared memory.
+
+    // Doubles
+    unsigned hasDoubles : 1;  ///< Double-precision floating point.
+
+    // Warp cross-lane operations
+    unsigned hasWarpVote : 1;     ///< Warp vote instructions (__any, __all).
+    unsigned hasWarpBallot : 1;   ///< Warp ballot instructions (__ballot).
+    unsigned hasWarpShuffle : 1;  ///< Warp shuffle operations. (__shfl_*).
+    unsigned hasFunnelShift : 1;  ///< Funnel two words into one with shift&mask caps.
+
+    // Sync
+    unsigned hasThreadFenceSystem : 1;  ///< __threadfence_system.
+    unsigned hasSyncThreadsExt : 1;     ///< __syncthreads_count, __syncthreads_and, __syncthreads_or.
+
+    // Misc
+    unsigned hasSurfaceFuncs : 1;        ///< Surface functions.
+    unsigned has3dGrid : 1;              ///< Grid and group dims are 3D (rather than 2D).
+    unsigned hasDynamicParallelism : 1;  ///< Dynamic parallelism.
+} hipDeviceArch_t;
+
+typedef struct hipDeviceProp_t {
+    char name[256];            ///< Device name.
+    size_t totalGlobalMem;     ///< Size of global memory region (in bytes).
+    size_t sharedMemPerBlock;  ///< Size of shared memory region (in bytes).
+    int regsPerBlock;          ///< Registers per block.
+    int warpSize;              ///< Warp size.
+    int maxThreadsPerBlock;    ///< Max work items per work group or workgroup max size.
+    int maxThreadsDim[3];      ///< Max number of threads in each dimension (XYZ) of a block.
+    int maxGridSize[3];        ///< Max grid dimensions (XYZ).
+    int clockRate;             ///< Max clock frequency of the multiProcessors in kHz.
+    int memoryClockRate;       ///< Max global memory clock frequency in kHz.
+    int memoryBusWidth;        ///< Global memory bus width in bits.
+    size_t totalConstMem;      ///< Size of constant memory region (in bytes).
+    int major;  ///< Major compute capability.  On HCC, this is an approximation and features may
+                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+                ///< feature caps.
+    int minor;  ///< Minor compute capability.  On HCC, this is an approximation and features may
+                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
+                ///< feature caps.
+    int multiProcessorCount;          ///< Number of multi-processors (compute units).
+    int l2CacheSize;                  ///< L2 cache size.
+    int maxThreadsPerMultiProcessor;  ///< Maximum resident threads per multi-processor.
+    int computeMode;                  ///< Compute mode.
+    int clockInstructionRate;  ///< Frequency in kHz of the timer used by the device-side "clock*"
+                               ///< instructions.  New for HIP.
+    hipDeviceArch_t arch;      ///< Architectural feature flags.  New for HIP.
+    int concurrentKernels;     ///< Device can possibly execute multiple kernels concurrently.
+    int pciDomainID;           ///< PCI Domain ID
+    int pciBusID;              ///< PCI Bus ID.
+    int pciDeviceID;           ///< PCI Device ID.
+    size_t maxSharedMemoryPerMultiProcessor;  ///< Maximum Shared Memory Per Multiprocessor.
+    int isMultiGpuBoard;                      ///< 1 if device is on a multi-GPU board, 0 if not.
+    int canMapHostMemory;                     ///< Check whether HIP can map host memory
+    int gcnArch;                              ///< DEPRECATED: use gcnArchName instead
+    char gcnArchName[256];                    ///< AMD GCN Arch Name.
+    int integrated;            ///< APU vs dGPU
+    int cooperativeLaunch;            ///< HIP device supports cooperative launch
+    int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple devices
+    int maxTexture1DLinear;    ///< Maximum size for 1D textures bound to linear memory
+    int maxTexture1D;          ///< Maximum number of elements in 1D images
+    int maxTexture2D[2];       ///< Maximum dimensions (width, height) of 2D images, in image elements
+    int maxTexture3D[3];       ///< Maximum dimensions (width, height, depth) of 3D images, in image elements
+    unsigned int* hdpMemFlushCntl;      ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+    unsigned int* hdpRegFlushCntl;      ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+    size_t memPitch;                 ///< Maximum pitch in bytes allowed by memory copies
+    size_t textureAlignment;         ///< Alignment requirement for textures
+    size_t texturePitchAlignment;    ///< Pitch alignment requirement for texture references bound to pitched memory
+    int kernelExecTimeoutEnabled;    ///< Run time limit for kernels executed on the device
+    int ECCEnabled;                  ///< Device has ECC support enabled
+    int tccDriver;                   ///< 1:If device is Tesla device using TCC driver, else 0
+    int cooperativeMultiDeviceUnmatchedFunc;        ///< HIP device supports cooperative launch on multiple
+                                                    ///< devices with unmatched functions
+    int cooperativeMultiDeviceUnmatchedGridDim;     ///< HIP device supports cooperative launch on multiple
+                                                    ///< devices with unmatched grid dimensions
+    int cooperativeMultiDeviceUnmatchedBlockDim;    ///< HIP device supports cooperative launch on multiple
+                                                    ///< devices with unmatched block dimensions
+    int cooperativeMultiDeviceUnmatchedSharedMem;   ///< HIP device supports cooperative launch on multiple
+                                                    ///< devices with unmatched shared memories
+    int isLargeBar;                  ///< 1: if it is a large PCI bar device, else 0
+    int asicRevision;                ///< Revision of the GPU in this device
+    int managedMemory;               ///< Device supports allocating managed memory on this system
+    int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device without migration
+    int concurrentManagedAccess;     ///< Device can coherently access managed memory concurrently with the CPU
+    int pageableMemoryAccess;        ///< Device supports coherently accessing pageable memory
+                                     ///< without calling hipHostRegister on it
+    int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's page tables
+} hipDeviceProp_t;
+
 typedef enum HIPpointer_attribute_enum {
   HIP_POINTER_ATTRIBUTE_CONTEXT = 1,
   HIP_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,
@@ -951,6 +1050,25 @@ typedef enum HIPGLmap_flags_enum {
   HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
 } HIPGLmap_flags;
 
+/**
+* hipRTC related: opaque program handle and result codes.
+*/
+typedef struct _hiprtcProgram* hiprtcProgram;
+
+typedef enum hiprtcResult {
+    HIPRTC_SUCCESS = 0,
+    HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+    HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+    HIPRTC_ERROR_INVALID_INPUT = 3,
+    HIPRTC_ERROR_INVALID_PROGRAM = 4,
+    HIPRTC_ERROR_INVALID_OPTION = 5,
+    HIPRTC_ERROR_COMPILATION = 6,
+    HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+    HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+    HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+    HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+    HIPRTC_ERROR_INTERNAL_ERROR = 11
+} hiprtcResult;
 
 /* Function types. */
 typedef hipError_t HIPAPI thipGetErrorName(hipError_t error, const char** pStr);
@@ -958,6 +1076,7 @@ typedef hipError_t HIPAPI thipInit(unsigned int Flags);
 typedef hipError_t HIPAPI thipDriverGetVersion(int* driverVersion);
 typedef hipError_t HIPAPI thipGetDevice(hipDevice_t* device, int ordinal);
 typedef hipError_t HIPAPI thipGetDeviceCount(int* count);
+typedef hipError_t HIPAPI thipGetDeviceProperties(hipDeviceProp_t* props, int deviceId);
 typedef hipError_t HIPAPI thipDeviceGetName(char* name, int len, hipDevice_t dev);
 typedef hipError_t HIPAPI thipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
 typedef hipError_t HIPAPI thipDeviceComputeCapability(int* major, int* minor, hipDevice_t dev);
@@ -1071,6 +1190,16 @@ typedef hipError_t HIPAPI thipGraphicsMapResources(unsigned int count, hipGraphi
 typedef 

@@ Diff output truncated at 10240 characters. @@