[Bf-blender-cvs] [0f28751337d] tmp-macs-arm-cycles: Cycles: add support for Arm Neon instructions using sse2neon

Mon Feb 15 20:03:59 CET 2021

Commit: 0f28751337d1836a16dca01b8c9b513bc813bfa6
Author: Brecht Van Lommel
Date:   Sun Feb 14 15:01:26 2021 +0100
Branches: tmp-macs-arm-cycles
https://developer.blender.org/rB0f28751337d1836a16dca01b8c9b513bc813bfa6

Cycles: add support for Arm Neon instructions using sse2neon

Based on patch contributed by Apple and Stefan Werner.

Ref D8237, T78710

===================================================================

M	CMakeLists.txt
M	intern/cycles/util/util_math_float3.h
M	intern/cycles/util/util_math_float4.h
M	intern/cycles/util/util_optimization.h
M	intern/cycles/util/util_simd.h
M	intern/cycles/util/util_sseb.h
M	intern/cycles/util/util_ssef.h
M	intern/cycles/util/util_ssei.h

===================================================================

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e3578d6632..cfa751d4720 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -962,11 +962,17 @@ if(WITH_CPU_SIMD)
   set(COMPILER_SSE_FLAG)
   set(COMPILER_SSE2_FLAG)
 
-  TEST_NEON_SUPPORT()
-  if(SUPPORT_NEON_BUILD)
-    blender_include_dirs_sys("${SSE2NEON_INCLUDE_DIRS}")
-    add_definitions(-DWITH_SSE2NEON)
-  else()
+  # Neon
+  if(SSE2NEON_FOUND)
+    TEST_NEON_SUPPORT()
+    if(SUPPORT_NEON_BUILD)
+      blender_include_dirs_sys("${SSE2NEON_INCLUDE_DIRS}")
+      add_definitions(-DWITH_SSE2NEON)
+    endif()
+  endif()
+
+  # SSE
+  if(NOT SUPPORT_NEON_BUILD)
     TEST_SSE_SUPPORT(COMPILER_SSE_FLAG COMPILER_SSE2_FLAG)
     if(SUPPORT_SSE_BUILD)
       string(PREPEND PLATFORM_CFLAGS "${COMPILER_SSE_FLAG} ")
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index dd2010715ba..8b9ff997824 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -290,8 +290,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
 ccl_device_inline float3 fabs(const float3 &a)
 {
 #  ifdef __KERNEL_SSE__
+#    ifdef __KERNEL_NEON__
+  return float3(vabsq_f32(a.m128));
+#    else
   __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
   return float3(_mm_and_ps(a.m128, mask));
+#    endif
 #  else
   return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
 #  endif
@@ -434,7 +438,13 @@ ccl_device_inline bool is_zero(const float3 a)
 
 ccl_device_inline float reduce_add(const float3 a)
 {
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
+  __m128 t = a.m128;
+  t[3] = 0.0f;
+  return vaddvq_f32(t);
+#else
   return (a.x + a.y + a.z);
+#endif
 }
 
 ccl_device_inline float average(const float3 a)
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index ec5328adb31..006e45207e0 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -243,7 +243,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
 ccl_device_inline float dot(const float4 &a, const float4 &b)
 {
 #  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  __m128 t = vmulq_f32(a, b);
+  return vaddvq_f32(t);
+#    else
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+#    endif
 #  else
   return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
 #  endif
@@ -299,8 +304,10 @@ ccl_device_inline bool is_zero(const float4 &a)
 
 ccl_device_inline float4 reduce_add(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
-#    ifdef __KERNEL_SSE3__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vdupq_n_f32(vaddvq_f32(a)));
+#    elif defined(__KERNEL_SSE3__)
   float4 h(_mm_hadd_ps(a.m128, a.m128));
   return float4(_mm_hadd_ps(h.m128, h.m128));
 #    else
@@ -359,8 +366,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
 
 ccl_device_inline float4 fabs(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vabsq_f32(a));
+#    else
   return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#    endif
 #  else
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 #  endif
@@ -386,14 +397,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
 {
+#  if defined(__KERNEL_NEON__)
+  return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+#  else
   return float4(_mm_castsi128_ps(
       _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+#  endif
 }
 
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &a, const float4 &b)
 {
+#  if defined(__KERNEL_NEON__)
+  return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+#  else
   return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+#  endif
 }
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
@@ -443,9 +462,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
 
 ccl_device_inline float4 reduce_min(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vdupq_n_f32(vminvq_f32(a)));
+#    else
   float4 h = min(shuffle<1, 0, 3, 2>(a), a);
   return min(shuffle<2, 3, 0, 1>(h), h);
+#    endif
 #  else
   return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
 #  endif
@@ -453,9 +476,13 @@ ccl_device_inline float4 reduce_min(const float4 &a)
 
 ccl_device_inline float4 reduce_max(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vdupq_n_f32(vmaxvq_f32(a)));
+#    else
   float4 h = max(shuffle<1, 0, 3, 2>(a), a);
   return max(shuffle<2, 3, 0, 1>(h), h);
+#    endif
 #  else
   return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
 #  endif
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 46dd883282a..7e8cc88c08c 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -27,44 +27,49 @@
 
 /* We require minimum SSE2 support on x86, so auto enable. */
 #    define __KERNEL_SSE2__
-
 #    ifdef WITH_KERNEL_SSE2
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #    endif
-
 #    ifdef WITH_KERNEL_SSE3
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 #    endif
 
-#  endif /* defined(i386) || defined(_M_IX86) */
-
 /* x86-64
  *
  * Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
 
-#  if defined(__x86_64__) || defined(_M_X64)
+#  elif defined(__x86_64__) || defined(_M_X64)
 
 /* SSE2 is always available on x86-64 CPUs, so auto enable */
 #    define __KERNEL_SSE2__
-
 /* no SSE2 kernel on x86-64, part of regular kernel */
 #    ifdef WITH_KERNEL_SSE3
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 #    endif
-
 #    ifdef WITH_KERNEL_SSE41
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 #    endif
-
 #    ifdef WITH_KERNEL_AVX
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 #    endif
-
 #    ifdef WITH_KERNEL_AVX2
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 #    endif
 
-#  endif /* defined(__x86_64__) || defined(_M_X64) */
+/* Arm Neon
+ *
+ * Compile an SSE 4.1 kernel emulated with Neon. Most code is shared with
+ * SSE, some specializations for performance and compatibility are made
+ * by testing for __KERNEL_NEON__. */
+
+#  elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
+
+#    define __KERNEL_NEON__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSE41__
+
+#  endif
 
 #endif
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 3a6761c6a2f..9a044b4b57a 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -35,6 +35,11 @@
 #  include <intrin.h>
 #elif (defined(__x86_64__) || defined(__i386__))
 #  include <x86intrin.h>
+#elif defined(__KERNEL_NEON__)
+#  define SSE2NEON_PRECISE_MINMAX 1
+#  define SSE2NEON_PRECISE_DIV 1
+#  define SSE2NEON_PRECISE_SQRT 1
+#  include <sse2neon.h>
 #endif
 
 /* Floating Point Control, for Embree. */
@@ -116,6 +121,80 @@ static struct StepTy {
 
 #endif
 
+/* Utilities used by Neon */
+#if defined(__KERNEL_NEON__)
+template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
+{
+  if (i0 == i1 && i0 == i2 && i0 == i3) {
+    return vdupq_laneq_s32(a, i0);
+  }
+  static const uint8_t tbl[16] = {(i0 * 4) + 0,
+                                  (i0 * 4) + 1,
+                                  (i0 * 4) + 2,
+                                  (i0 * 4) + 3,
+                                  (i1 * 4) + 0,
+                                  (i1 * 4) + 1,
+                                  (i1 * 4) + 2,
+                                  (i1 * 4) + 3,
+                                  (i2 * 4) + 0,
+                                  (i2 * 4) + 1,
+                                  (i2 * 4) + 2,
+                                  (i2 * 4) + 3,
+                                  (i3 * 4) + 0,
+                                  (i3 * 4) + 1,
+                                  (i3 * 4) + 2,
+                                  (i3 * 4) + 3};
+
+  return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
+}
+
+template<class type, int i0, int i1, int i2, int i3>
+type shuffle_neon(const type &a, const type &b)
+{
+  if (&a == &b) {
+    static const uint8_t tbl[16] = {(i0 * 4) + 0,
+                                    (i0 * 4) + 1,
+                                    (i0 * 4) + 2,
+                                    (i0 * 4) + 3,
+                                    (i1 * 4) + 0,
+                                    (i1 * 4) + 1,
+                                    (i1 * 4) + 2,
+                                    (i1 * 4) + 3,
+                                    (i2 * 4) + 0,
+                                    (i2 * 4) + 1,
+                                    (i2 * 4) + 2,
+                                    (i2 * 4) + 3,
+                                    (i3 * 4) + 0,
+                                    (i3 * 4) + 1,
+                                    (i3 * 4) + 2,
+                                    (i3 * 4) + 3};
+
+    return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
+  }
+  else {
+
+    static const uint8_t tbl[16] = {(i0 * 4) + 0,
+                                    (i0 * 4) + 1,
+                                    (i0 * 4) + 2,
+                                    (i0 * 4) + 3,
+                                    (i1 * 4) + 0,
+                                    (i1 * 4) + 1,
+                                    (i1 * 4) + 2,
+                                    (i1 * 4) + 3,
+                                    (i2 * 4) + 0 + 16,
+                                    (i2 * 4) + 1 + 16,
+                                    (i2 * 4) + 2 

@@ Diff output truncated at 10240 characters. @@