[Bf-blender-cvs] [b22d5faba2d] tmp-macs-arm-cycles: Cycles: refactor intrinsic functions implementation
Brecht Van Lommel
noreply at git.blender.org
Mon Feb 15 19:06:50 CET 2021
Commit: b22d5faba2da2beec3adee8f3cdd06dcac7443ed
Author: Brecht Van Lommel
Date: Sun Feb 14 15:34:23 2021 +0100
Branches: tmp-macs-arm-cycles
https://developer.blender.org/rBb22d5faba2da2beec3adee8f3cdd06dcac7443ed
Cycles: refactor intrinsic functions implementation
* Add processor independent fallbacks
* Use uint32_t and uint64_t types
* Remove unused functions
* Better comments and less indentation
Ref D8237, T78710
===================================================================
M intern/cycles/util/util_color.h
M intern/cycles/util/util_half.h
M intern/cycles/util/util_simd.h
===================================================================
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index c6937ca78fe..1b493d0ed5e 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -20,7 +20,7 @@
#include "util/util_math.h"
#include "util/util_types.h"
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
# include "util/util_simd.h"
#endif
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 3bac7008905..a8d4ee75e20 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -20,7 +20,7 @@
#include "util/util_math.h"
#include "util/util_types.h"
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
# include "util/util_simd.h"
#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index de0e3c39f30..7308a3207ad 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,49 +18,40 @@
#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__
-#ifndef __KERNEL_GPU__
+#include <limits>
-# include <limits>
-
-# include "util/util_defines.h"
+#include "util/util_defines.h"
/* SSE Intrinsics includes
*
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-# ifndef FREE_WINDOWS64
-
-# ifdef _MSC_VER
-# include <intrin.h>
-# elif (defined(__x86_64__) || defined(__i386__))
-# include <x86intrin.h>
-# endif
-
-# else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point.
+ *
+ * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
* Since we can't avoid including <windows.h>, better only include that */
-# include "util/util_windows.h"
-
-# endif
-
-# if defined(__x86_64__) || defined(_M_X64)
-# define SIMD_SET_FLUSH_TO_ZERO \
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-# else
-# define SIMD_SET_FLUSH_TO_ZERO
-# endif
+#if defined(FREE_WINDOWS64)
+# include "util/util_windows.h"
+#elif defined(_MSC_VER)
+# include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+# include <x86intrin.h>
+#endif
+
+/* Floating Point Control, for Embree. */
+#if defined(__x86_64__) || defined(_M_X64)
+# define SIMD_SET_FLUSH_TO_ZERO \
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#else
+# define SIMD_SET_FLUSH_TO_ZERO
+#endif
CCL_NAMESPACE_BEGIN
-# ifdef __KERNEL_SSE2__
+/* Data structures used by SSE classes. */
+#ifdef __KERNEL_SSE2__
extern const __m128 _mm_lookupmask_ps[16];
-/* Special Types */
-
static struct TrueTy {
__forceinline operator bool() const
{
@@ -122,377 +113,282 @@ static struct PosInfTy {
static struct StepTy {
} step ccl_maybe_unused;
-/* Intrinsics Functions */
+#endif
-# if defined(__BMI__) && defined(__GNUC__)
-# ifndef _tzcnt_u32
-# define _tzcnt_u32 __tzcnt_u32
-# endif
-# ifndef _tzcnt_u64
-# define _tzcnt_u64 __tzcnt_u64
-# endif
-# endif
-
-# if defined(__LZCNT__)
-# define _lzcnt_u32 __lzcnt32
-# define _lzcnt_u64 __lzcnt64
-# endif
-
-# if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
-
-__forceinline int __popcnt(int in)
-{
- return _mm_popcnt_u32(in);
-}
+/* Intrinsics Functions
+ *
+ * For fast bit operations. */
-# if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in)
-{
- return _mm_popcnt_u32(in);
-}
-# endif
+#if defined(__BMI__) && defined(__GNUC__)
+# ifndef _tzcnt_u32
+# define _tzcnt_u32 __tzcnt_u32
+# endif
+# ifndef _tzcnt_u64
+# define _tzcnt_u64 __tzcnt_u64
+# endif
+#endif
-# if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in)
-{
- return _mm_popcnt_u64(in);
-}
-__forceinline size_t __popcnt(size_t in)
-{
- return _mm_popcnt_u64(in);
-}
-# endif
+#if defined(__LZCNT__)
+# define _lzcnt_u32 __lzcnt32
+# define _lzcnt_u64 __lzcnt64
+#endif
-__forceinline int __bsf(int v)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+/* Intrinsic functions on Windows. */
+__forceinline uint32_t __bsf(uint32_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward(&r, v);
return r;
-# endif
+# endif
}
-__forceinline unsigned int __bsf(unsigned int v)
+__forceinline uint32_t __bsf(uint32_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward(&r, v);
return r;
-# endif
+# endif
}
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(uint32_t v)
{
unsigned long r = 0;
_BitScanReverse(&r, v);
return r;
}
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(uint32_t v, uint32_t i)
{
long r = v;
_bittestandcomplement(&r, i);
return r;
}
-__forceinline int __bts(int v, int i)
+__forceinline uint32_t bitscan(uint32_t v)
{
- long r = v;
- _bittestandset(&r, i);
- return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
- long r = v;
- _bittestandreset(&r, i);
- return r;
-}
-
-__forceinline int bitscan(int v)
-{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
return __bsf(v);
-# endif
-}
-
-__forceinline int clz(const int x)
-{
-# if defined(__KERNEL_AVX2__)
- return _lzcnt_u32(x);
-# else
- if (UNLIKELY(x == 0))
- return 32;
- return 31 - __bsr(x);
-# endif
-}
-
-__forceinline int __bscf(int &v)
-{
- int i = __bsf(v);
- v &= v - 1;
- return i;
-}
-
-__forceinline unsigned int __bscf(unsigned int &v)
-{
- unsigned int i = __bsf(v);
- v &= v - 1;
- return i;
+# endif
}
-# if defined(__KERNEL_64_BIT__)
+# if defined(__KERNEL_64_BIT__)
-__forceinline size_t __bsf(size_t v)
+__forceinline uint64_t __bsf(uint64_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u64(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward64(&r, v);
return r;
-# endif
+# endif
}
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(uint64_t v)
{
unsigned long r = 0;
_BitScanReverse64(&r, v);
return r;
}
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(uint64_t v, uint64_t i)
{
- size_t r = v;
+ uint64_t r = v;
_bittestandcomplement64((__int64 *)&r, i);
return r;
}
-__forceinline size_t __bts(size_t v, size_t i)
+__forceinline uint64_t bitscan(uint64_t v)
{
- __int64 r = v;
- _bittestandset64(&r, i);
- return r;
-}
-
-__forceinline size_t __btr(size_t v, size_t i)
-{
- __int64 r = v;
- _bittestandreset64(&r, i);
- return r;
-}
-
-__forceinline size_t bitscan(size_t v)
-{
-# if defined(__KERNEL_AVX2__)
-# if defined(__KERNEL_64_BIT__)
+# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_64_BIT__)
return _tzcnt_u64(v);
-# else
+# else
return _tzcnt_u32(v);
-# endif
-# else
+# endif
+# else
return __bsf(v);
-# endif
-}
-
-__forceinline size_t __bscf(size_t &v)
-{
- size_t i = __bsf(v);
- v &= v - 1;
- return i;
+# endif
}
-# endif /* __KERNEL_64_BIT__ */
+# endif /* __KERNEL_64_BIT__ */
-# else /* _WIN32 */
-
-__forceinline unsigned int __popcnt(unsigned int in)
-{
- int r = 0;
- asm("popcnt %1,%0" : "=r"(r) : "r"(in));
- return r;
-}
+#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
+/* Intrinsic functions with x86 SSE. */
-__forceinline int __bsf(int v)
+__forceinline uint32_t __bsf(const uint32_t v)
{
- int r = 0;
+ uint32_t r = 0;
asm("bsf %1,%0" : "=r"(r) : "r"(v));
return r;
}
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(const uint32_t v)
{
- int r = 0;
+ uint32_t r = 0;
asm("bsr %1,%0" : "=r"(r) : "r"(v));
return r;
}
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(const uint32_t v, uint32_t i)
{
- int r = 0;
+ uint32_t r = 0;
asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
return r;
}
-__forceinline int __bts(int v, int i)
-{
- int r = 0;
- asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
- int r = 0;
- asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
-}
-
-# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
- !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bsf(size_t v)
-{
- size_t r = 0;
- asm("bsf %1,%0" : "=r"(r) : "r"(v));
- return r;
-}
-# endif
-
-__forceinline unsigned int __bsf(unsigned int v)
+# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+ !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline uint64_t __bsf(const uint64_t v)
{
- unsigned int r = 0;
+ uint64_t r = 0;
asm("bsf %1,%0" : "=r"(r) : "r"(v));
return r;
}
+# endif
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(const uint64_t v)
{
- size_t r = 0;
+ uint64_t r = 0;
asm("bsr %1,%0" : "=r"(r) : "r"(v));
return r;
}
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
{
- size_t r = 0;
+ uint64_t r = 0;
asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
return r;
}
-__forceinline size_t __bts(size_t v, size_t i)
+__forceinline uint32_t bitscan(uint32_t v)
{
- size_t r = 0;
- asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
+# if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
+# else
+ return __bsf(v);
+# endif
}
-__forceinline size_t __btr(size_t v, size_t i)
+__forceinline uint32_t bitscan(uint32_t v)
{
- size_t r = 0;
- asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
+# if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
+# el
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list