[Bf-blender-cvs] [8119f0aad21] master: Cycles: refactor intrinsic functions implementation
Brecht Van Lommel
noreply at git.blender.org
Wed Feb 17 16:26:39 CET 2021
Commit: 8119f0aad21c3ce88e82d68ed20cd5a8edc99703
Author: Brecht Van Lommel
Date: Sun Feb 14 15:34:23 2021 +0100
Branches: master
https://developer.blender.org/rB8119f0aad21c3ce88e82d68ed20cd5a8edc99703
Cycles: refactor intrinsic functions implementation
* Add processor independent fallbacks
* Use uint32_t and uint64_t types
* Remove unused functions
* Better comments and less indentation
Ref D8237, T78710
===================================================================
M intern/cycles/bvh/bvh.cpp
M intern/cycles/bvh/bvh_build.cpp
M intern/cycles/util/util_avxb.h
M intern/cycles/util/util_avxi.h
M intern/cycles/util/util_color.h
M intern/cycles/util/util_half.h
M intern/cycles/util/util_simd.h
M intern/cycles/util/util_sseb.h
M intern/cycles/util/util_ssef.h
M intern/cycles/util/util_ssei.h
M intern/cycles/util/util_types.h
===================================================================
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 256382e63ba..050e090bddf 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -69,7 +69,7 @@ BVHLayout BVHParams::best_bvh_layout(BVHLayout requested_layout, BVHLayoutMask s
allowed_layouts_mask = supported_layouts;
}
/* We get widest from allowed ones and convert mask to actual layout. */
- const BVHLayoutMask widest_allowed_layout_mask = __bsr(allowed_layouts_mask);
+ const BVHLayoutMask widest_allowed_layout_mask = __bsr((uint32_t)allowed_layouts_mask);
return (BVHLayout)(1 << widest_allowed_layout_mask);
}
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 296f9130f43..ec85cef0851 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -851,7 +851,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
for (int i = 0; i < range.size(); i++) {
const BVHReference &ref = references[range.start() + i];
if (ref.prim_index() != -1) {
- int type_index = bitscan(ref.prim_type() & PRIMITIVE_ALL);
+ uint32_t type_index = bitscan((uint32_t)(ref.prim_type() & PRIMITIVE_ALL));
p_ref[type_index].push_back(ref);
p_type[type_index].push_back(ref.prim_type());
p_index[type_index].push_back(ref.prim_index());
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 34fafd188de..17d505c077a 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -191,12 +191,12 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
////////////////////////////////////////////////////////////////////////////////
#if defined(__KERNEL_SSE41__)
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
{
- return __popcnt(_mm256_movemask_ps(a));
+ return _mm_popcnt_u32(_mm256_movemask_ps(a));
}
#else
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
bool(a[7]);
@@ -224,7 +224,7 @@ __forceinline bool none(const avxb &b)
return _mm256_movemask_ps(b) == 0x0;
}
-__forceinline size_t movemask(const avxb &a)
+__forceinline uint32_t movemask(const avxb &a)
{
return _mm256_movemask_ps(a);
}
diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h
index e658a4f848f..3db646e61f4 100644
--- a/intern/cycles/util/util_avxi.h
+++ b/intern/cycles/util/util_avxi.h
@@ -711,21 +711,21 @@ __forceinline int reduce_add(const avxi &v)
return extract<0>(extract<0>(vreduce_add(v)));
}
-__forceinline size_t select_min(const avxi &v)
+__forceinline uint32_t select_min(const avxi &v)
{
return __bsf(movemask(v == vreduce_min(v)));
}
-__forceinline size_t select_max(const avxi &v)
+__forceinline uint32_t select_max(const avxi &v)
{
return __bsf(movemask(v == vreduce_max(v)));
}
-__forceinline size_t select_min(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
{
const avxi a = select(valid, v, avxi(pos_inf));
return __bsf(movemask(valid & (a == vreduce_min(a))));
}
-__forceinline size_t select_max(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
{
const avxi a = select(valid, v, avxi(neg_inf));
return __bsf(movemask(valid & (a == vreduce_max(a))));
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index c6937ca78fe..1b493d0ed5e 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -20,7 +20,7 @@
#include "util/util_math.h"
#include "util/util_types.h"
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
# include "util/util_simd.h"
#endif
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 3bac7008905..a8d4ee75e20 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -20,7 +20,7 @@
#include "util/util_math.h"
#include "util/util_types.h"
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
# include "util/util_simd.h"
#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index de0e3c39f30..3a6761c6a2f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,49 +18,41 @@
#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__
-#ifndef __KERNEL_GPU__
+#include <limits>
+#include <stdint.h>
-# include <limits>
-
-# include "util/util_defines.h"
+#include "util/util_defines.h"
/* SSE Intrinsics includes
*
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-# ifndef FREE_WINDOWS64
-
-# ifdef _MSC_VER
-# include <intrin.h>
-# elif (defined(__x86_64__) || defined(__i386__))
-# include <x86intrin.h>
-# endif
-
-# else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point.
+ *
+ * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
* Since we can't avoid including <windows.h>, better only include that */
-# include "util/util_windows.h"
-
-# endif
-
-# if defined(__x86_64__) || defined(_M_X64)
-# define SIMD_SET_FLUSH_TO_ZERO \
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-# else
-# define SIMD_SET_FLUSH_TO_ZERO
-# endif
+#if defined(FREE_WINDOWS64)
+# include "util/util_windows.h"
+#elif defined(_MSC_VER)
+# include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+# include <x86intrin.h>
+#endif
+
+/* Floating Point Control, for Embree. */
+#if defined(__x86_64__) || defined(_M_X64)
+# define SIMD_SET_FLUSH_TO_ZERO \
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#else
+# define SIMD_SET_FLUSH_TO_ZERO
+#endif
CCL_NAMESPACE_BEGIN
-# ifdef __KERNEL_SSE2__
+/* Data structures used by SSE classes. */
+#ifdef __KERNEL_SSE2__
extern const __m128 _mm_lookupmask_ps[16];
-/* Special Types */
-
static struct TrueTy {
__forceinline operator bool() const
{
@@ -122,377 +114,281 @@ static struct PosInfTy {
static struct StepTy {
} step ccl_maybe_unused;
-/* Intrinsics Functions */
+#endif
-# if defined(__BMI__) && defined(__GNUC__)
-# ifndef _tzcnt_u32
-# define _tzcnt_u32 __tzcnt_u32
-# endif
-# ifndef _tzcnt_u64
-# define _tzcnt_u64 __tzcnt_u64
-# endif
-# endif
-
-# if defined(__LZCNT__)
-# define _lzcnt_u32 __lzcnt32
-# define _lzcnt_u64 __lzcnt64
-# endif
-
-# if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
-
-__forceinline int __popcnt(int in)
-{
- return _mm_popcnt_u32(in);
-}
+/* Intrinsics Functions
+ *
+ * For fast bit operations. */
-# if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in)
-{
- return _mm_popcnt_u32(in);
-}
-# endif
+#if defined(__BMI__) && defined(__GNUC__)
+# ifndef _tzcnt_u32
+# define _tzcnt_u32 __tzcnt_u32
+# endif
+# ifndef _tzcnt_u64
+# define _tzcnt_u64 __tzcnt_u64
+# endif
+#endif
-# if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in)
-{
- return _mm_popcnt_u64(in);
-}
-__forceinline size_t __popcnt(size_t in)
-{
- return _mm_popcnt_u64(in);
-}
-# endif
+#if defined(__LZCNT__)
+# define _lzcnt_u32 __lzcnt32
+# define _lzcnt_u64 __lzcnt64
+#endif
-__forceinline int __bsf(int v)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+/* Intrinsic functions on Windows. */
+__forceinline uint32_t __bsf(uint32_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward(&r, v);
return r;
-# endif
+# endif
}
-__forceinline unsigned int __bsf(unsigned int v)
+__forceinline uint32_t __bsf(uint32_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward(&r, v);
return r;
-# endif
+# endif
}
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(uint32_t v)
{
unsigned long r = 0;
_BitScanReverse(&r, v);
return r;
}
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(uint32_t v, uint32_t i)
{
long r = v;
_bittestandcomplement(&r, i);
return r;
}
-__forceinline int __bts(int v, int i)
+__forceinline uint32_t bitscan(uint32_t v)
{
- long r = v;
- _bittestandset(&r, i);
- return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
- long r = v;
- _bittestandreset(&r, i);
- return r;
-}
-
-__forceinline int bitscan(int v)
-{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
return __bsf(v);
-# endif
-}
-
-__forceinline int clz(const int x)
-{
-# if defined(__KERNEL_AVX2__)
- return _lzcnt_u32(x);
-# else
- if (UNLIKELY(x == 0))
- return 32;
- return 31 - __bsr(x);
-# endif
-}
-
-__forceinline int __bscf(int &v)
-{
- int i = __bsf(v);
- v &= v - 1;
- return i;
-}
-
-__forceinline unsigned int __bscf(unsigned int &v)
-{
- unsigned int i = __bsf(v);
- v &= v - 1;
- return i;
+# endif
}
-# if defined(__KERNEL_64_BIT__)
+# if defined(__KERNEL_64_BIT__)
-__forceinline size_t __bsf(size_t v)
+__forceinline uint64_t __bsf(uint64_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u64(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward64(&r, v);
return r;
-# endif
+# endif
}
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(uint64_t v)
{
unsigned long r = 0;
_BitScanReverse64(&r, v);
return r;
}
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(uint64_t v, uint64_t i)
{
- size_t r = v;
+ u
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list