[Bf-blender-cvs] [8119f0aad21] master: Cycles: refactor intrinsic functions implementation

Brecht Van Lommel noreply at git.blender.org
Wed Feb 17 16:26:39 CET 2021


Commit: 8119f0aad21c3ce88e82d68ed20cd5a8edc99703
Author: Brecht Van Lommel
Date:   Sun Feb 14 15:34:23 2021 +0100
Branches: master
https://developer.blender.org/rB8119f0aad21c3ce88e82d68ed20cd5a8edc99703

Cycles: refactor intrinsic functions implementation

* Add processor-independent fallbacks
* Use uint32_t and uint64_t types
* Remove unused functions
* Better comments and less indentation

Ref D8237, T78710
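
The processor-independent fallbacks themselves fall past the truncation
point of this diff, so the body below is an assumption rather than the
commit's code: a portable bit-scan-forward matching the uint32_t
signatures introduced here could look like this.

  #include <stdint.h>

  /* Hypothetical portable fallback: index of the lowest set bit.
   * Mirrors the uint32_t __bsf() signature used in this commit; the
   * actual fallback body is not visible in this truncated diff. */
  inline uint32_t portable_bsf(uint32_t v)
  {
    for (uint32_t i = 0; i < 32; i++) {
      if (v & (1u << i)) {
        return i;
      }
    }
    return 32; /* v == 0 is outside the contract, as with the intrinsic. */
  }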

===================================================================

M	intern/cycles/bvh/bvh.cpp
M	intern/cycles/bvh/bvh_build.cpp
M	intern/cycles/util/util_avxb.h
M	intern/cycles/util/util_avxi.h
M	intern/cycles/util/util_color.h
M	intern/cycles/util/util_half.h
M	intern/cycles/util/util_simd.h
M	intern/cycles/util/util_sseb.h
M	intern/cycles/util/util_ssef.h
M	intern/cycles/util/util_ssei.h
M	intern/cycles/util/util_types.h

===================================================================

diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 256382e63ba..050e090bddf 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -69,7 +69,7 @@ BVHLayout BVHParams::best_bvh_layout(BVHLayout requested_layout, BVHLayoutMask s
     allowed_layouts_mask = supported_layouts;
   }
   /* We get widest from allowed ones and convert mask to actual layout. */
-  const BVHLayoutMask widest_allowed_layout_mask = __bsr(allowed_layouts_mask);
+  const BVHLayoutMask widest_allowed_layout_mask = __bsr((uint32_t)allowed_layouts_mask);
   return (BVHLayout)(1 << widest_allowed_layout_mask);
 }
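
For context on this hunk: __bsr() returns the index of the highest set
bit, so (1 << __bsr(mask)) isolates the widest allowed layout. A small
self-contained illustration, with portable_bsr as a stand-in for the
intrinsic (not code from the commit):

  #include <assert.h>
  #include <stdint.h>

  /* Stand-in for __bsr(): index of the highest set bit (v must be non-zero). */
  inline uint32_t portable_bsr(uint32_t v)
  {
    uint32_t r = 0;
    while (v >>= 1) {
      r++;
    }
    return r;
  }

  int main()
  {
    /* Layout bits 1, 2 and 4 allowed: the widest one (bit 4) wins. */
    const uint32_t allowed = (1u << 1) | (1u << 2) | (1u << 4);
    assert((1u << portable_bsr(allowed)) == (1u << 4));
    return 0;
  }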
 
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 296f9130f43..ec85cef0851 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -851,7 +851,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
   for (int i = 0; i < range.size(); i++) {
     const BVHReference &ref = references[range.start() + i];
     if (ref.prim_index() != -1) {
-      int type_index = bitscan(ref.prim_type() & PRIMITIVE_ALL);
+      uint32_t type_index = bitscan((uint32_t)(ref.prim_type() & PRIMITIVE_ALL));
       p_ref[type_index].push_back(ref);
       p_type[type_index].push_back(ref.prim_type());
       p_index[type_index].push_back(ref.prim_index());
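
Here bitscan() turns a one-hot primitive-type flag into a small bucket
index for the p_ref/p_type/p_index arrays. A hedged sketch of that
mapping (portable_bitscan and the PRIM_* flags are illustrative, not
the commit's definitions):

  #include <stdint.h>
  #include <stdio.h>

  /* Stand-in for bitscan(): index of the lowest set bit (v must be non-zero). */
  inline uint32_t portable_bitscan(uint32_t v)
  {
    uint32_t i = 0;
    while (!(v & 1u)) {
      v >>= 1;
      i++;
    }
    return i;
  }

  int main()
  {
    const uint32_t PRIM_TRIANGLE = 1u << 0; /* hypothetical one-hot flags */
    const uint32_t PRIM_CURVE = 1u << 1;
    printf("%u %u\n",
           (unsigned)portable_bitscan(PRIM_TRIANGLE),
           (unsigned)portable_bitscan(PRIM_CURVE)); /* prints: 0 1 */
    return 0;
  }
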
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 34fafd188de..17d505c077a 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -191,12 +191,12 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
 ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__KERNEL_SSE41__)
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
 {
-  return __popcnt(_mm256_movemask_ps(a));
+  return _mm_popcnt_u32(_mm256_movemask_ps(a));
 }
 #else
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
 {
   return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
          bool(a[7]);
@@ -224,7 +224,7 @@ __forceinline bool none(const avxb &b)
   return _mm256_movemask_ps(b) == 0x0;
 }
 
-__forceinline size_t movemask(const avxb &a)
+__forceinline uint32_t movemask(const avxb &a)
 {
   return _mm256_movemask_ps(a);
 }
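
_mm256_movemask_ps() packs the sign bit of each of the eight float lanes
into the low byte of an int, so counting the set bits in that mask counts
the true lanes; that is all popcnt(avxb) does. A scalar sketch of the
same idea (a portable stand-in, not the commit's code):

  #include <stdint.h>

  /* Count true lanes in an 8-bit lane mask, as popcnt(avxb) does after
   * _mm256_movemask_ps(). Portable stand-in for _mm_popcnt_u32(). */
  inline uint32_t count_true_lanes(uint32_t mask8)
  {
    uint32_t n = 0;
    for (uint32_t i = 0; i < 8; i++) {
      n += (mask8 >> i) & 1u;
    }
    return n;
  }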
diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h
index e658a4f848f..3db646e61f4 100644
--- a/intern/cycles/util/util_avxi.h
+++ b/intern/cycles/util/util_avxi.h
@@ -711,21 +711,21 @@ __forceinline int reduce_add(const avxi &v)
   return extract<0>(extract<0>(vreduce_add(v)));
 }
 
-__forceinline size_t select_min(const avxi &v)
+__forceinline uint32_t select_min(const avxi &v)
 {
   return __bsf(movemask(v == vreduce_min(v)));
 }
-__forceinline size_t select_max(const avxi &v)
+__forceinline uint32_t select_max(const avxi &v)
 {
   return __bsf(movemask(v == vreduce_max(v)));
 }
 
-__forceinline size_t select_min(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
 {
   const avxi a = select(valid, v, avxi(pos_inf));
   return __bsf(movemask(valid & (a == vreduce_min(a))));
 }
-__forceinline size_t select_max(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
 {
   const avxi a = select(valid, v, avxi(neg_inf));
   return __bsf(movemask(valid & (a == vreduce_max(a))));
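
select_min() works in three steps: broadcast the minimum to every lane
(vreduce_min), compare to build a lane mask (movemask), then bit-scan the
mask for the first matching lane. A scalar sketch of those steps over
eight int lanes (illustrative only):

  #include <stdint.h>

  /* Scalar sketch of select_min(): returns the index of the first lane
   * holding the minimum value. */
  inline uint32_t sketch_select_min(const int v[8])
  {
    int m = v[0];
    for (int i = 1; i < 8; i++) {
      if (v[i] < m) {
        m = v[i]; /* vreduce_min: minimum across all lanes */
      }
    }
    uint32_t mask = 0;
    for (int i = 0; i < 8; i++) {
      if (v[i] == m) {
        mask |= 1u << i; /* movemask(v == vreduce_min(v)) */
      }
    }
    uint32_t i = 0;
    while (!((mask >> i) & 1u)) {
      i++; /* __bsf: first set bit; mask is never zero here */
    }
    return i;
  }
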
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index c6937ca78fe..1b493d0ed5e 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -20,7 +20,7 @@
 #include "util/util_math.h"
 #include "util/util_types.h"
 
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
 #  include "util/util_simd.h"
 #endif
 
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 3bac7008905..a8d4ee75e20 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -20,7 +20,7 @@
 #include "util/util_math.h"
 #include "util/util_types.h"
 
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
 #  include "util/util_simd.h"
 #endif
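
util_color.h and util_half.h gain the same guard: util_simd.h is CPU-only,
so it must not be included in a GPU compile even if an SSE2 flag is
visible there. The pattern, shown on its own for clarity (illustrative
fragment, macro names as in the diff):

  /* Only CPU builds with SSE2 may pull in the SSE wrapper header. */
  #if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
  #  include "util/util_simd.h"
  #endif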
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index de0e3c39f30..3a6761c6a2f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,49 +18,41 @@
 #ifndef __UTIL_SIMD_TYPES_H__
 #define __UTIL_SIMD_TYPES_H__
 
-#ifndef __KERNEL_GPU__
+#include <limits>
+#include <stdint.h>
 
-#  include <limits>
-
-#  include "util/util_defines.h"
+#include "util/util_defines.h"
 
 /* SSE Intrinsics includes
  *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#  ifndef FREE_WINDOWS64
-
-#    ifdef _MSC_VER
-#      include <intrin.h>
-#    elif (defined(__x86_64__) || defined(__i386__))
-#      include <x86intrin.h>
-#    endif
-
-#  else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point.
+ *
+ * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
-#    include "util/util_windows.h"
-
-#  endif
-
-#  if defined(__x86_64__) || defined(_M_X64)
-#    define SIMD_SET_FLUSH_TO_ZERO \
-      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
-      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#  else
-#    define SIMD_SET_FLUSH_TO_ZERO
-#  endif
+#if defined(FREE_WINDOWS64)
+#  include "util/util_windows.h"
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+#  include <x86intrin.h>
+#endif
+
+/* Floating Point Control, for Embree. */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define SIMD_SET_FLUSH_TO_ZERO \
+    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#else
+#  define SIMD_SET_FLUSH_TO_ZERO
+#endif
 
 CCL_NAMESPACE_BEGIN
 
-#  ifdef __KERNEL_SSE2__
+/* Data structures used by SSE classes. */
+#ifdef __KERNEL_SSE2__
 
 extern const __m128 _mm_lookupmask_ps[16];
 
-/* Special Types */
-
 static struct TrueTy {
   __forceinline operator bool() const
   {
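
An aside on the SIMD_SET_FLUSH_TO_ZERO block above: MXCSR state is
per-thread, so the macro is meant to run once in each thread that does
Embree work. A sketch of what it expands to on x86-64, wrapped in a
helper for illustration (the helper name is not from the commit):

  #include <pmmintrin.h>
  #include <xmmintrin.h>

  /* Flush denormals to zero so they cannot cause large slowdowns. */
  static void set_flush_to_zero()
  {
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  }
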
@@ -122,377 +114,281 @@ static struct PosInfTy {
 static struct StepTy {
 } step ccl_maybe_unused;
 
-/* Intrinsics Functions */
+#endif
 
-#    if defined(__BMI__) && defined(__GNUC__)
-#      ifndef _tzcnt_u32
-#        define _tzcnt_u32 __tzcnt_u32
-#      endif
-#      ifndef _tzcnt_u64
-#        define _tzcnt_u64 __tzcnt_u64
-#      endif
-#    endif
-
-#    if defined(__LZCNT__)
-#      define _lzcnt_u32 __lzcnt32
-#      define _lzcnt_u64 __lzcnt64
-#    endif
-
-#    if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
-
-__forceinline int __popcnt(int in)
-{
-  return _mm_popcnt_u32(in);
-}
+/* Intrinsics Functions
+ *
+ * For fast bit operations. */
 
-#      if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in)
-{
-  return _mm_popcnt_u32(in);
-}
-#      endif
+#if defined(__BMI__) && defined(__GNUC__)
+#  ifndef _tzcnt_u32
+#    define _tzcnt_u32 __tzcnt_u32
+#  endif
+#  ifndef _tzcnt_u64
+#    define _tzcnt_u64 __tzcnt_u64
+#  endif
+#endif
 
-#      if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in)
-{
-  return _mm_popcnt_u64(in);
-}
-__forceinline size_t __popcnt(size_t in)
-{
-  return _mm_popcnt_u64(in);
-}
-#      endif
+#if defined(__LZCNT__)
+#  define _lzcnt_u32 __lzcnt32
+#  define _lzcnt_u64 __lzcnt64
+#endif
 
-__forceinline int __bsf(int v)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+/* Intrinsic functions on Windows. */
+__forceinline uint32_t __bsf(uint32_t v)
 {
-#      if defined(__KERNEL_AVX2__)
+#  if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#      else
+#  else
   unsigned long r = 0;
   _BitScanForward(&r, v);
   return r;
-#      endif
+#  endif
 }
 
-__forceinline unsigned int __bsf(unsigned int v)
-{
-#      if defined(__KERNEL_AVX2__)
-  return _tzcnt_u32(v);
-#      else
-  unsigned long r = 0;
-  _BitScanForward(&r, v);
-  return r;
-#      endif
-}
-
 
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(uint32_t v)
 {
   unsigned long r = 0;
   _BitScanReverse(&r, v);
   return r;
 }
 
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(uint32_t v, uint32_t i)
 {
   long r = v;
   _bittestandcomplement(&r, i);
   return r;
 }
 
-__forceinline int __bts(int v, int i)
+__forceinline uint32_t bitscan(uint32_t v)
 {
-  long r = v;
-  _bittestandset(&r, i);
-  return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
-  long r = v;
-  _bittestandreset(&r, i);
-  return r;
-}
-
-__forceinline int bitscan(int v)
-{
-#      if defined(__KERNEL_AVX2__)
+#  if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#      else
+#  else
   return __bsf(v);
-#      endif
-}
-
-__forceinline int clz(const int x)
-{
-#      if defined(__KERNEL_AVX2__)
-  return _lzcnt_u32(x);
-#      else
-  if (UNLIKELY(x == 0))
-    return 32;
-  return 31 - __bsr(x);
-#      endif
-}
-
-__forceinline int __bscf(int &v)
-{
-  int i = __bsf(v);
-  v &= v - 1;
-  return i;
-}
-
-__forceinline unsigned int __bscf(unsigned int &v)
-{
-  unsigned int i = __bsf(v);
-  v &= v - 1;
-  return i;
+#  endif
 }
 
-#      if defined(__KERNEL_64_BIT__)
+#  if defined(__KERNEL_64_BIT__)
 
-__forceinline size_t __bsf(size_t v)
+__forceinline uint64_t __bsf(uint64_t v)
 {
-#        if defined(__KERNEL_AVX2__)
+#    if defined(__KERNEL_AVX2__)
   return _tzcnt_u64(v);
-#        else
+#    else
   unsigned long r = 0;
   _BitScanForward64(&r, v);
   return r;
-#        endif
+#    endif
 }
 
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(uint64_t v)
 {
   unsigned long r = 0;
   _BitScanReverse64(&r, v);
   return r;
 }
 
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(uint64_t v, uint64_t i)
 {
-  size_t r = v;
+  uint64_t r = v;

@@ Diff output truncated at 10240 characters. @@
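
The truncated remainder presumably continues the same mechanical
conversions (size_t to uint64_t, int to uint32_t). For illustration only,
portable stand-ins for the 64-bit pair cut off above, mirroring the shape
of their 32-bit counterparts (not the commit's MSVC-intrinsic versions):

  #include <stdint.h>

  /* Stand-in for the uint64_t __bsf() overload (v must be non-zero). */
  inline uint64_t portable_bsf64(uint64_t v)
  {
    uint64_t i = 0;
    while (!((v >> i) & 1u)) {
      i++;
    }
    return i;
  }

  /* Stand-in for the uint64_t __btc() overload: toggle bit i of v,
   * like _bittestandcomplement() does on Windows. */
  inline uint64_t portable_btc64(uint64_t v, uint64_t i)
  {
    return v ^ (uint64_t(1) << i);
  }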


