[Bf-blender-cvs] [b22d5faba2d] tmp-macs-arm-cycles: Cycles: refactor intrinsic functions implementation

Brecht Van Lommel noreply at git.blender.org
Mon Feb 15 19:06:50 CET 2021


Commit: b22d5faba2da2beec3adee8f3cdd06dcac7443ed
Author: Brecht Van Lommel
Date:   Sun Feb 14 15:34:23 2021 +0100
Branches: tmp-macs-arm-cycles
https://developer.blender.org/rBb22d5faba2da2beec3adee8f3cdd06dcac7443ed

Cycles: refactor intrinsic functions implementation

* Add processor-independent fallbacks (see the sketch below)
* Use uint32_t and uint64_t types
* Remove unused functions
* Better comments and less indentation

Ref D8237, T78710
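
The fallback implementations themselves fall past the 10240-character
truncation point of the diff below, so as a rough sketch of the idea (not
the exact code added by this commit), a processor-independent bit scan can
be written as a plain loop:

#include <cstdint>

/* Illustrative portable fallback for a "find lowest set bit" intrinsic.
 * The name portable_bsf is hypothetical; the commit's real fallbacks sit
 * in the truncated part of the diff. */
inline uint32_t portable_bsf(const uint32_t x)
{
  /* Scan upward from the least significant bit. */
  for (uint32_t i = 0; i < 32; i++) {
    if (x & (1U << i)) {
      return i;
    }
  }
  return 32; /* Result for x == 0, matching _tzcnt_u32(0). */
}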

===================================================================

M	intern/cycles/util/util_color.h
M	intern/cycles/util/util_half.h
M	intern/cycles/util/util_simd.h

===================================================================

diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index c6937ca78fe..1b493d0ed5e 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -20,7 +20,7 @@
 #include "util/util_math.h"
 #include "util/util_types.h"
 
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
 #  include "util/util_simd.h"
 #endif
 
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 3bac7008905..a8d4ee75e20 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -20,7 +20,7 @@
 #include "util/util_math.h"
 #include "util/util_types.h"
 
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
 #  include "util/util_simd.h"
 #endif
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index de0e3c39f30..7308a3207ad 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,49 +18,40 @@
 #ifndef __UTIL_SIMD_TYPES_H__
 #define __UTIL_SIMD_TYPES_H__
 
-#ifndef __KERNEL_GPU__
+#include <limits>
 
-#  include <limits>
-
-#  include "util/util_defines.h"
+#include "util/util_defines.h"
 
 /* SSE Intrinsics includes
  *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#  ifndef FREE_WINDOWS64
-
-#    ifdef _MSC_VER
-#      include <intrin.h>
-#    elif (defined(__x86_64__) || defined(__i386__))
-#      include <x86intrin.h>
-#    endif
-
-#  else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * We assume the __KERNEL_SSEX__ flags have been defined at this point.
+ *
+ * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
-#    include "util/util_windows.h"
-
-#  endif
-
-#  if defined(__x86_64__) || defined(_M_X64)
-#    define SIMD_SET_FLUSH_TO_ZERO \
-      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
-      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#  else
-#    define SIMD_SET_FLUSH_TO_ZERO
-#  endif
+#if defined(FREE_WINDOWS64)
+#  include "util/util_windows.h"
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+#  include <x86intrin.h>
+#endif
+
+/* Floating Point Control, for Embree. */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define SIMD_SET_FLUSH_TO_ZERO \
+    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#else
+#  define SIMD_SET_FLUSH_TO_ZERO
+#endif
 
 CCL_NAMESPACE_BEGIN
 
-#  ifdef __KERNEL_SSE2__
+/* Data structures used by SSE classes. */
+#ifdef __KERNEL_SSE2__
 
 extern const __m128 _mm_lookupmask_ps[16];
 
-/* Special Types */
-
 static struct TrueTy {
   __forceinline operator bool() const
   {
@@ -122,377 +113,282 @@ static struct PosInfTy {
 static struct StepTy {
 } step ccl_maybe_unused;
 
-/* Intrinsics Functions */
+#endif
 
-#    if defined(__BMI__) && defined(__GNUC__)
-#      ifndef _tzcnt_u32
-#        define _tzcnt_u32 __tzcnt_u32
-#      endif
-#      ifndef _tzcnt_u64
-#        define _tzcnt_u64 __tzcnt_u64
-#      endif
-#    endif
-
-#    if defined(__LZCNT__)
-#      define _lzcnt_u32 __lzcnt32
-#      define _lzcnt_u64 __lzcnt64
-#    endif
-
-#    if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
-
-__forceinline int __popcnt(int in)
-{
-  return _mm_popcnt_u32(in);
-}
+/* Intrinsic Functions
+ *
+ * For fast bit operations. */
 
-#      if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in)
-{
-  return _mm_popcnt_u32(in);
-}
-#      endif
+#if defined(__BMI__) && defined(__GNUC__)
+#  ifndef _tzcnt_u32
+#    define _tzcnt_u32 __tzcnt_u32
+#  endif
+#  ifndef _tzcnt_u64
+#    define _tzcnt_u64 __tzcnt_u64
+#  endif
+#endif
 
-#      if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in)
-{
-  return _mm_popcnt_u64(in);
-}
-__forceinline size_t __popcnt(size_t in)
-{
-  return _mm_popcnt_u64(in);
-}
-#      endif
+#if defined(__LZCNT__)
+#  define _lzcnt_u32 __lzcnt32
+#  define _lzcnt_u64 __lzcnt64
+#endif
 
-__forceinline int __bsf(int v)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+/* Intrinsic functions on Windows. */
+__forceinline uint32_t __bsf(uint32_t v)
 {
-#      if defined(__KERNEL_AVX2__)
+#  if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#      else
+#  else
   unsigned long r = 0;
   _BitScanForward(&r, v);
   return r;
-#      endif
+#  endif
 }
 
-__forceinline unsigned int __bsf(unsigned int v)
-{
-#      if defined(__KERNEL_AVX2__)
-  return _tzcnt_u32(v);
-#      else
-  unsigned long r = 0;
-  _BitScanForward(&r, v);
-  return r;
-#      endif
-}
-
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(uint32_t v)
 {
   unsigned long r = 0;
   _BitScanReverse(&r, v);
   return r;
 }
 
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(uint32_t v, uint32_t i)
 {
   long r = v;
   _bittestandcomplement(&r, i);
   return r;
 }
 
-__forceinline int __bts(int v, int i)
+__forceinline uint32_t bitscan(uint32_t v)
 {
-  long r = v;
-  _bittestandset(&r, i);
-  return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
-  long r = v;
-  _bittestandreset(&r, i);
-  return r;
-}
-
-__forceinline int bitscan(int v)
-{
-#      if defined(__KERNEL_AVX2__)
+#  if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#      else
+#  else
   return __bsf(v);
-#      endif
-}
-
-__forceinline int clz(const int x)
-{
-#      if defined(__KERNEL_AVX2__)
-  return _lzcnt_u32(x);
-#      else
-  if (UNLIKELY(x == 0))
-    return 32;
-  return 31 - __bsr(x);
-#      endif
-}
-
-__forceinline int __bscf(int &v)
-{
-  int i = __bsf(v);
-  v &= v - 1;
-  return i;
-}
-
-__forceinline unsigned int __bscf(unsigned int &v)
-{
-  unsigned int i = __bsf(v);
-  v &= v - 1;
-  return i;
+#  endif
 }
 
-#      if defined(__KERNEL_64_BIT__)
+#  if defined(__KERNEL_64_BIT__)
 
-__forceinline size_t __bsf(size_t v)
+__forceinline uint64_t __bsf(uint64_t v)
 {
-#        if defined(__KERNEL_AVX2__)
+#    if defined(__KERNEL_AVX2__)
   return _tzcnt_u64(v);
-#        else
+#    else
   unsigned long r = 0;
   _BitScanForward64(&r, v);
   return r;
-#        endif
+#    endif
 }
 
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(uint64_t v)
 {
   unsigned long r = 0;
   _BitScanReverse64(&r, v);
   return r;
 }
 
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(uint64_t v, uint64_t i)
 {
-  size_t r = v;
+  uint64_t r = v;
   _bittestandcomplement64((__int64 *)&r, i);
   return r;
 }
 
-__forceinline size_t __bts(size_t v, size_t i)
+__forceinline uint64_t bitscan(uint64_t v)
 {
-  __int64 r = v;
-  _bittestandset64(&r, i);
-  return r;
-}
-
-__forceinline size_t __btr(size_t v, size_t i)
-{
-  __int64 r = v;
-  _bittestandreset64(&r, i);
-  return r;
-}
-
-__forceinline size_t bitscan(size_t v)
-{
-#        if defined(__KERNEL_AVX2__)
-#          if defined(__KERNEL_64_BIT__)
+#    if defined(__KERNEL_AVX2__)
+#      if defined(__KERNEL_64_BIT__)
   return _tzcnt_u64(v);
-#          else
+#      else
   return _tzcnt_u32(v);
-#          endif
-#        else
+#      endif
+#    else
   return __bsf(v);
-#        endif
-}
-
-__forceinline size_t __bscf(size_t &v)
-{
-  size_t i = __bsf(v);
-  v &= v - 1;
-  return i;
+#    endif
 }
 
-#      endif /* __KERNEL_64_BIT__ */
+#  endif /* __KERNEL_64_BIT__ */
 
-#    else /* _WIN32 */
-
-__forceinline unsigned int __popcnt(unsigned int in)
-{
-  int r = 0;
-  asm("popcnt %1,%0" : "=r"(r) : "r"(in));
-  return r;
-}
+#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
+/* Intrinsic functions with x86 SSE. */
 
-__forceinline int __bsf(int v)
+__forceinline uint32_t __bsf(const uint32_t v)
 {
-  int r = 0;
+  uint32_t r = 0;
   asm("bsf %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
 
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(const uint32_t v)
 {
-  int r = 0;
+  uint32_t r = 0;
   asm("bsr %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
 
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(const uint32_t v, uint32_t i)
 {
-  int r = 0;
+  uint32_t r = 0;
   asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
   return r;
 }
 
-__forceinline int __bts(int v, int i)
-{
-  int r = 0;
-  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
-  int r = 0;
-  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
-}
-
-#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
-          !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bsf(size_t v)
-{
-  size_t r = 0;
-  asm("bsf %1,%0" : "=r"(r) : "r"(v));
-  return r;
-}
-#      endif
-
-__forceinline unsigned int __bsf(unsigned int v)
+#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+      !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline uint64_t __bsf(const uint64_t v)
 {
-  unsigned int r = 0;
+  uint64_t r = 0;
   asm("bsf %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
+#  endif
 
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(const uint64_t v)
 {
-  size_t r = 0;
+  uint64_t r = 0;
   asm("bsr %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
 
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
 {
-  size_t r = 0;
+  uint64_t r = 0;
   asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
   return r;
 }
 
-__forceinline size_t __bts(size_t v, size_t i)
+__forceinline uint32_t bitscan(uint32_t v)
 {
-  size_t r = 0;
-  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
+#  if defined(__KERNEL_AVX2__)
+  return _tzcnt_u32(v);
+#  else
+  return __bsf(v);
+#  endif
 }
 
-__forceinline size_t __btr(size_t v, size_t i)
+__forceinline uint64_t bitscan(uint64_t v)
 {
-  size_t r = 0;
-  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
+#  if defined(__KERNEL_AVX2__)
+  return _tzcnt_u64(v);
+#  el

@@ Diff output truncated at 10240 characters. @@
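
As a side note on the SIMD_SET_FLUSH_TO_ZERO macro kept by this refactor:
per the "Floating Point Control, for Embree" comment, Embree wants the MXCSR
flush-to-zero and denormals-are-zero flags enabled on every thread that
calls into it, which is why the macro bundles both mode switches. A minimal
usage sketch (the function name is hypothetical, and on non-x86 targets the
macro expands to nothing):

#include "util/util_simd.h"

/* Hypothetical worker entry point: enable FTZ/DAZ on this thread before
 * any ray-tracing work that calls into Embree. */
static void worker_thread_main()
{
  /* Expands to _MM_SET_FLUSH_ZERO_MODE(...) and
   * _MM_SET_DENORMALS_ZERO_MODE(...) on x86-64 builds. */
  SIMD_SET_FLUSH_TO_ZERO

  /* ... intersect rays, shade, etc. ... */
}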


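On the switch from int/unsigned/size_t overloads to uint32_t/uint64_t ones:
callers select the overload by the width of their mask, e.g. a 32-bit SSE
movemask. A sketch of the usual traversal idiom built on the header's
bitscan() (the visit_set_bits name is illustrative):

#include "util/util_simd.h" /* Assumed include path, for bitscan(). */

/* Visit every set bit of a 32-bit mask, lowest index first; assumes this
 * code lives inside the Cycles namespace where bitscan() is visible.
 * Clearing the lowest set bit with mask &= mask - 1 advances the scan. */
inline void visit_set_bits(uint32_t mask)
{
  while (mask != 0) {
    const uint32_t i = bitscan(mask); /* Index of the lowest set bit. */
    mask &= mask - 1;                 /* Clear that bit. */
    (void)i;                          /* ... process lane i here ... */
  }
}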

More information about the Bf-blender-cvs mailing list