[Bf-blender-cvs] [4d457a4abef] tmp-macs-arm-cycles: Cycles: add support for Arm Neon instructions using sse2neon
Brecht Van Lommel
noreply at git.blender.org
Mon Feb 15 19:06:50 CET 2021
Commit: 4d457a4abef75be4a5a3424b1f7d221385e68b95
Author: Brecht Van Lommel
Date: Sun Feb 14 15:01:26 2021 +0100
Branches: tmp-macs-arm-cycles
https://developer.blender.org/rB4d457a4abef75be4a5a3424b1f7d221385e68b95
Cycles: add support for Arm Neon instructions using sse2neon
Based on patch contributed by Apple and Stefan Werner.
Ref D8237, T78710
===================================================================
M CMakeLists.txt
M intern/cycles/util/util_math_float3.h
M intern/cycles/util/util_math_float4.h
M intern/cycles/util/util_optimization.h
M intern/cycles/util/util_simd.h
M intern/cycles/util/util_sseb.h
M intern/cycles/util/util_ssef.h
M intern/cycles/util/util_ssei.h
===================================================================
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e3578d6632..cfa751d4720 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -962,11 +962,17 @@ if(WITH_CPU_SIMD)
set(COMPILER_SSE_FLAG)
set(COMPILER_SSE2_FLAG)
- TEST_NEON_SUPPORT()
- if(SUPPORT_NEON_BUILD)
- blender_include_dirs_sys("${SSE2NEON_INCLUDE_DIRS}")
- add_definitions(-DWITH_SSE2NEON)
- else()
+ # Neon
+ if(SSE2NEON_FOUND)
+ TEST_NEON_SUPPORT()
+ if(SUPPORT_NEON_BUILD)
+ blender_include_dirs_sys("${SSE2NEON_INCLUDE_DIRS}")
+ add_definitions(-DWITH_SSE2NEON)
+ endif()
+ endif()
+
+ # SSE
+ if(NOT SUPPORT_NEON_BUILD)
TEST_SSE_SUPPORT(COMPILER_SSE_FLAG COMPILER_SSE2_FLAG)
if(SUPPORT_SSE_BUILD)
string(PREPEND PLATFORM_CFLAGS "${COMPILER_SSE_FLAG} ")
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index dd2010715ba..8b9ff997824 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -290,8 +290,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float3(vabsq_f32(a.m128));
+# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
+# endif
# else
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
# endif
@@ -434,7 +438,13 @@ ccl_device_inline bool is_zero(const float3 a)
ccl_device_inline float reduce_add(const float3 a)
{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
+ __m128 t = a.m128;
+ t[3] = 0.0f;
+ return vaddvq_f32(t);
+#else
return (a.x + a.y + a.z);
+#endif
}
ccl_device_inline float average(const float3 a)
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index ec5328adb31..006e45207e0 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -243,7 +243,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ __m128 t = vmulq_f32(a, b);
+ return vaddvq_f32(t);
+# else
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+# endif
# else
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
# endif
@@ -299,8 +304,10 @@ ccl_device_inline bool is_zero(const float4 &a)
ccl_device_inline float4 reduce_add(const float4 &a)
{
-# ifdef __KERNEL_SSE__
-# ifdef __KERNEL_SSE3__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vdupq_n_f32(vaddvq_f32(a)));
+# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
# else
@@ -359,8 +366,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vabsq_f32(a));
+# else
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+# endif
# else
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
# endif
@@ -386,14 +397,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
{
+# if defined(__KERNEL_NEON__)
+ return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+# else
return float4(_mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+# endif
}
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &a, const float4 &b)
{
+# if defined(__KERNEL_NEON__)
+ return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+# else
return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
@@ -443,9 +462,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
ccl_device_inline float4 reduce_min(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vdupq_n_f32(vminvq_f32(a)));
+# else
float4 h = min(shuffle<1, 0, 3, 2>(a), a);
return min(shuffle<2, 3, 0, 1>(h), h);
+# endif
# else
return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
# endif
@@ -453,9 +476,13 @@ ccl_device_inline float4 reduce_min(const float4 &a)
ccl_device_inline float4 reduce_max(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vdupq_n_f32(vmaxvq_f32(a)));
+# else
float4 h = max(shuffle<1, 0, 3, 2>(a), a);
return max(shuffle<2, 3, 0, 1>(h), h);
+# endif
# else
return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
# endif
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 46dd883282a..7e8cc88c08c 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -27,44 +27,49 @@
/* We require minimum SSE2 support on x86, so auto enable. */
# define __KERNEL_SSE2__
-
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# endif
-
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
-# endif /* defined(i386) || defined(_M_IX86) */
-
/* x86-64
*
* Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
-# if defined(__x86_64__) || defined(_M_X64)
+# elif defined(__x86_64__) || defined(_M_X64)
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
-
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
-
# ifdef WITH_KERNEL_SSE41
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# endif
-
# ifdef WITH_KERNEL_AVX
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# endif
-
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# endif
-# endif /* defined(__x86_64__) || defined(_M_X64) */
+/* Arm Neon
+ *
+ * Compile a SSE4 kernel emulated with Neon. Most code is shared with
+ * SSE, some specializations for performance and compatibility are made
+ * by testing for __KERNEL_NEON__. */
+
+# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
+
+# define __KERNEL_NEON__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSE41__
+
+# endif
#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 7308a3207ad..acf8a343592 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -34,6 +34,11 @@
# include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
# include <x86intrin.h>
+#elif defined(__KERNEL_NEON__)
+# define SSE2NEON_PRECISE_MINMAX 1
+# define SSE2NEON_PRECISE_DIV 1
+# define SSE2NEON_PRECISE_SQRT 1
+# include <sse2neon.h>
#endif
/* Floating Point Control, for Embree. */
@@ -115,6 +120,80 @@ static struct StepTy {
#endif
+/* Utilities used by Neon */
+#if defined(__KERNEL_NEON__)
+template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
+{
+ if (i0 == i1 && i0 == i2 && i0 == i3) {
+ return vdupq_laneq_s32(a, i0);
+ }
+ static const uint8_t tbl[16] = {(i0 * 4) + 0,
+ (i0 * 4) + 1,
+ (i0 * 4) + 2,
+ (i0 * 4) + 3,
+ (i1 * 4) + 0,
+ (i1 * 4) + 1,
+ (i1 * 4) + 2,
+ (i1 * 4) + 3,
+ (i2 * 4) + 0,
+ (i2 * 4) + 1,
+ (i2 * 4) + 2,
+ (i2 * 4) + 3,
+ (i3 * 4) + 0,
+ (i3 * 4) + 1,
+ (i3 * 4) + 2,
+ (i3 * 4) + 3};
+
+ return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
+}
+
+template<class type, int i0, int i1, int i2, int i3>
+type shuffle_neon(const type &a, const type &b)
+{
+ if (&a == &b) {
+ static const uint8_t tbl[16] = {(i0 * 4) + 0,
+ (i0 * 4) + 1,
+ (i0 * 4) + 2,
+ (i0 * 4) + 3,
+ (i1 * 4) + 0,
+ (i1 * 4) + 1,
+ (i1 * 4) + 2,
+ (i1 * 4) + 3,
+ (i2 * 4) + 0,
+ (i2 * 4) + 1,
+ (i2 * 4) + 2,
+ (i2 * 4) + 3,
+ (i3 * 4) + 0,
+ (i3 * 4) + 1,
+ (i3 * 4) + 2,
+ (i3 * 4) + 3};
+
+ return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
+ }
+ else {
+
+ static const uint8_t tbl[16] = {(i0 * 4) + 0,
+ (i0 * 4) + 1,
+ (i0 * 4) + 2,
+ (i0 * 4) + 3,
+ (i1 * 4) + 0,
+ (i1 * 4) + 1,
+ (i1 * 4) + 2,
+ (i1 * 4) + 3,
+ (i2 * 4) + 0 + 16,
+ (i2 * 4) + 1 + 16,
+ (i2 * 4) + 2
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list