[Bf-blender-cvs] [0e9497e8869] master: Cycles: add support for Arm Neon instructions using sse2neon

Wed Feb 17 16:26:39 CET 2021

Commit: 0e9497e886924cb75ca67f2c14e2fdda29f2b583
Author: Brecht Van Lommel
Date:   Sun Feb 14 15:01:26 2021 +0100
Branches: master
https://developer.blender.org/rB0e9497e886924cb75ca67f2c14e2fdda29f2b583

Cycles: add support for Arm Neon instructions using sse2neon

Based on patch contributed by Apple and Stefan Werner.

Ref D8237, T78710

===================================================================

M	intern/cycles/graph/node_type.cpp
M	intern/cycles/render/camera.cpp
M	intern/cycles/render/nodes.cpp
M	intern/cycles/render/nodes.h
M	intern/cycles/util/util_math_float3.h
M	intern/cycles/util/util_math_float4.h
M	intern/cycles/util/util_optimization.h
M	intern/cycles/util/util_simd.h
M	intern/cycles/util/util_sseb.h
M	intern/cycles/util/util_ssef.h
M	intern/cycles/util/util_ssei.h

===================================================================

diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp
index 2b11af70d71..d1eadf21b1b 100644
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -102,7 +102,7 @@ size_t SocketType::max_size()
 
 void *SocketType::zero_default_value()
 {
-  static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
+  static Transform zero_transform = transform_zero();
   return &zero_transform;
 }
 
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 1f932135a57..afe788eb4be 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -741,7 +741,8 @@ float Camera::world_to_raster_size(float3 P)
     float3 D = transform_point(&worldtocamera, P);
     float dist = len(D);
 
-    Ray ray = {{0}};
+    Ray ray;
+    memset(&ray, 0, sizeof(ray));
 
     /* Distortion can become so great that the results become meaningless, there
      * may be a better way to do this, but calculating differentials from the
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 84286c9b1a3..b17f1ec0b2f 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -2081,6 +2081,16 @@ ConvertNode::ConvertNode(SocketType::Type from_, SocketType::Type to_, bool auto
     special_type = SHADER_SPECIAL_TYPE_AUTOCONVERT;
 }
 
+/* Union usage requires a manual copy constructor. */
+ConvertNode::ConvertNode(const ConvertNode &other)
+    : ShaderNode(other),
+      from(other.from),
+      to(other.to),
+      value_color(other.value_color),
+      value_string(other.value_string)
+{
+}
+
 void ConvertNode::constant_fold(const ConstantFolder &folder)
 {
   /* proxy nodes should have been removed at this point */
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index d4603143ef4..fb9cf0c9836 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -501,6 +501,7 @@ class RGBToBWNode : public ShaderNode {
 class ConvertNode : public ShaderNode {
  public:
   ConvertNode(SocketType::Type from, SocketType::Type to, bool autoconvert = false);
+  ConvertNode(const ConvertNode &other);
   SHADER_NODE_BASE_CLASS(ConvertNode)
 
   void constant_fold(const ConstantFolder &folder);
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 162bc900d9f..67c5c61e4c0 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -304,8 +304,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
 ccl_device_inline float3 fabs(const float3 &a)
 {
 #  ifdef __KERNEL_SSE__
+#    ifdef __KERNEL_NEON__
+  return float3(vabsq_f32(a.m128));
+#    else
   __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
   return float3(_mm_and_ps(a.m128, mask));
+#    endif
 #  else
   return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
 #  endif
@@ -447,7 +451,13 @@ ccl_device_inline bool is_zero(const float3 a)
 
 ccl_device_inline float reduce_add(const float3 a)
 {
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
+  __m128 t = a.m128;
+  t[3] = 0.0f;
+  return vaddvq_f32(t);
+#else
   return (a.x + a.y + a.z);
+#endif
 }
 
 ccl_device_inline float average(const float3 a)
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 38fdd9e3146..0ba2bafa2f0 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -257,7 +257,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
 ccl_device_inline float dot(const float4 &a, const float4 &b)
 {
 #  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  __m128 t = vmulq_f32(a, b);
+  return vaddvq_f32(t);
+#    else
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+#    endif
 #  else
   return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
 #  endif
@@ -313,8 +318,10 @@ ccl_device_inline bool is_zero(const float4 &a)
 
 ccl_device_inline float4 reduce_add(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
-#    ifdef __KERNEL_SSE3__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vdupq_n_f32(vaddvq_f32(a)));
+#    elif defined(__KERNEL_SSE3__)
   float4 h(_mm_hadd_ps(a.m128, a.m128));
   return float4(_mm_hadd_ps(h.m128, h.m128));
 #    else
@@ -373,8 +380,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
 
 ccl_device_inline float4 fabs(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vabsq_f32(a));
+#    else
   return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#    endif
 #  else
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 #  endif
@@ -400,14 +411,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
 {
+#  if defined(__KERNEL_NEON__)
+  return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+#  else
   return float4(_mm_castsi128_ps(
       _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+#  endif
 }
 
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &a, const float4 &b)
 {
+#  if defined(__KERNEL_NEON__)
+  return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+#  else
   return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+#  endif
 }
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
@@ -457,9 +476,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
 
 ccl_device_inline float4 reduce_min(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vdupq_n_f32(vminvq_f32(a)));
+#    else
   float4 h = min(shuffle<1, 0, 3, 2>(a), a);
   return min(shuffle<2, 3, 0, 1>(h), h);
+#    endif
 #  else
   return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
 #  endif
@@ -467,9 +490,13 @@ ccl_device_inline float4 reduce_min(const float4 &a)
 
 ccl_device_inline float4 reduce_max(const float4 &a)
 {
-#  ifdef __KERNEL_SSE__
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vdupq_n_f32(vmaxvq_f32(a)));
+#    else
   float4 h = max(shuffle<1, 0, 3, 2>(a), a);
   return max(shuffle<2, 3, 0, 1>(h), h);
+#    endif
 #  else
   return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
 #  endif
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 46dd883282a..7ecd3893cf4 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -27,44 +27,50 @@
 
 /* We require minimum SSE2 support on x86, so auto enable. */
 #    define __KERNEL_SSE2__
-
 #    ifdef WITH_KERNEL_SSE2
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #    endif
-
 #    ifdef WITH_KERNEL_SSE3
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 #    endif
 
-#  endif /* defined(i386) || defined(_M_IX86) */
-
 /* x86-64
  *
  * Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
 
-#  if defined(__x86_64__) || defined(_M_X64)
+#  elif defined(__x86_64__) || defined(_M_X64)
 
 /* SSE2 is always available on x86-64 CPUs, so auto enable */
 #    define __KERNEL_SSE2__
-
 /* no SSE2 kernel on x86-64, part of regular kernel */
 #    ifdef WITH_KERNEL_SSE3
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 #    endif
-
 #    ifdef WITH_KERNEL_SSE41
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 #    endif
-
 #    ifdef WITH_KERNEL_AVX
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 #    endif
-
 #    ifdef WITH_KERNEL_AVX2
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 #    endif
 
-#  endif /* defined(__x86_64__) || defined(_M_X64) */
+/* Arm Neon
+ *
+ * Compile a SSE4 kernel emulated with Neon. Most code is shared with
+ * SSE, some specializations for performance and compatibility are made
+ * by testing for __KERNEL_NEON__. */
+
+#  elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
+
+#    define __KERNEL_NEON__
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSE41__
+
+#  endif
 
 #endif
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 3a6761c6a2f..c51c3c957e0 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -35,6 +35,9 @@
 #  include <intrin.h>
 #elif (defined(__x86_64__) || defined(__i386__))
 #  include <x86intrin.h>
+#elif defined(__KERNEL_NEON__)
+#  define SSE2NEON_PRECISE_MINMAX 1
+#  include <sse2neon.h>
 #endif
 
 /* Floating Point Control, for Embree. */
@@ -116,6 +119,80 @@ static struct StepTy {
 
 #endif
 
+/* Utilities used by Neon */
+#if defined(__KERNEL_NEON__)
+template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
+{
+  if (i0 == i1 && i0 == i2 && i0 == i3) {
+    return vdupq_laneq_s32(a, i0);
+  }
+  static const uint8_t tbl[16] = {(i0 * 4) + 0,
+                                  (i0 * 4) + 1,
+                                  (i0 * 4) + 2,
+                                  (i0 * 4) + 3,
+                                  (i1 * 4) + 0,
+                                  (i1 * 4) + 1,
+                                  (i1 * 4) + 2,
+                                  (i1 * 4) + 3,
+                                  (i2 * 4) + 0,
+                                  (i2 * 4) + 1,
+                                  (i2 * 4) + 2,
+                                  (i2 * 4) + 3,
+                                  (i3 * 4) + 0,
+                                  (i3 * 4) + 1,
+                                  (i3 * 4) + 2,
+                                  (i3 * 4) + 3};
+
+  return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
+}
+
+template<class type, int i0, int i1, int i2, int i3>
+type shuffle_neon(const type &a, const type &b)
+{
+  if 

@@ Diff output truncated at 10240 characters. @@