[Bf-blender-cvs] [e1b3d911273] master: Refactor: replace Cycles sse/avx types by vectorized float4/int4/float8/int8

Tue Nov 8 12:28:58 CET 2022

Commit: e1b3d9112730bc3b569ffff732a1558752ded146
Author: Brecht Van Lommel
Date:   Tue Nov 1 15:16:55 2022 +0100
Branches: master
https://developer.blender.org/rBe1b3d9112730bc3b569ffff732a1558752ded146

Refactor: replace Cycles sse/avx types by vectorized float4/int4/float8/int8

The distinction existed for legacy reasons, to make it easy to port Embree
intersection code without affecting the main vector types. However, we are now
using SIMD for these types as well, so there is no good reason to keep the
distinction.

Also more consistently pass these vector types by value in inline functions.
Previously this was only partially done, for functions used by Metal, to avoid
having to add address space qualifiers; it is simpler to do it everywhere.

Also removes the function declarations from the vector math headers, as they
serve no real purpose.

Differential Revision: https://developer.blender.org/D16146

===================================================================

M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/device/cpu/kernel.cpp
M	intern/cycles/kernel/svm/noise.h
M	intern/cycles/test/CMakeLists.txt
D	intern/cycles/test/util_avxf_test.h
R077	intern/cycles/test/util_avxf_avx2_test.cpp	intern/cycles/test/util_float8_avx2_test.cpp
R082	intern/cycles/test/util_avxf_avx_test.cpp	intern/cycles/test/util_float8_avx_test.cpp
A	intern/cycles/test/util_float8_sse2_test.cpp
A	intern/cycles/test/util_float8_test.h
M	intern/cycles/util/CMakeLists.txt
D	intern/cycles/util/avxb.h
D	intern/cycles/util/avxf.h
D	intern/cycles/util/avxi.h
M	intern/cycles/util/color.h
M	intern/cycles/util/half.h
M	intern/cycles/util/hash.h
M	intern/cycles/util/math.h
M	intern/cycles/util/math_float2.h
M	intern/cycles/util/math_float3.h
M	intern/cycles/util/math_float4.h
M	intern/cycles/util/math_float8.h
M	intern/cycles/util/math_int2.h
M	intern/cycles/util/math_int3.h
M	intern/cycles/util/math_int4.h
A	intern/cycles/util/math_int8.h
M	intern/cycles/util/math_intersect.h
D	intern/cycles/util/sseb.h
D	intern/cycles/util/ssef.h
D	intern/cycles/util/ssei.h
M	intern/cycles/util/transform.cpp
M	intern/cycles/util/transform.h
M	intern/cycles/util/transform_inverse.h
M	intern/cycles/util/types.h
M	intern/cycles/util/types_float8.h
M	intern/cycles/util/types_float8_impl.h
A	intern/cycles/util/types_int8.h
A	intern/cycles/util/types_int8_impl.h

===================================================================

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3779fdc697a..3fbb346e94f 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -328,6 +328,7 @@ set(SRC_UTIL_HEADERS
   ../util/math_int2.h
   ../util/math_int3.h
   ../util/math_int4.h
+  ../util/math_int8.h
   ../util/math_matrix.h
   ../util/projection.h
   ../util/rect.h
@@ -350,6 +351,8 @@ set(SRC_UTIL_HEADERS
   ../util/types_int3_impl.h
   ../util/types_int4.h
   ../util/types_int4_impl.h
+  ../util/types_int8.h
+  ../util/types_int8_impl.h
   ../util/types_spectrum.h
   ../util/types_uchar2.h
   ../util/types_uchar2_impl.h
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index 01087c96dd6..558431961ab 100644
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -7,6 +7,7 @@
  * one with SSE2 intrinsics.
  */
 #if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #endif
 
@@ -29,11 +30,15 @@
 #    define __KERNEL_SSE41__
 #  endif
 #  ifdef __AVX__
-#    define __KERNEL_SSE__
+#    ifndef __KERNEL_SSE__
+#      define __KERNEL_SSE__
+#    endif
 #    define __KERNEL_AVX__
 #  endif
 #  ifdef __AVX2__
-#    define __KERNEL_SSE__
+#    ifndef __KERNEL_SSE__
+#      define __KERNEL_SSE__
+#    endif
 #    define __KERNEL_AVX2__
 #  endif
 #endif
diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h
index 31e77d87413..209195a03f1 100644
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x)
 }
 
 /* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if
- * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not
+ * SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not
  * supported, we do a standard implementation, but if it is supported, we
  * do an implementation using SSE intrinsics.
  */
-#if !defined(__KERNEL_SSE2__)
+#if !defined(__KERNEL_SSE__)
 
 /* ** Standard Implementation ** */
 
@@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 
 /* SSE Bilinear Interpolation:
  *
- * The function takes two ssef inputs:
+ * The function takes two float4 inputs:
  * - p : Contains the values at the points (v0, v1, v2, v3).
  * - f : Contains the values (x, y, _, _). The third and fourth values are unused.
  *
  * The interpolation is done in two steps:
  * 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
  *    (v2, v3) is generated by moving v2 and v3 to the first and second
- *    places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ *    places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
  *    fourth values are unused.
  * 2. Interpolate g0 and g1 along the y axis to get the final value.
- *    g1 is generated by populating an ssef with the second value of g.
- *    Only the first value is important in the final ssef.
+ *    g1 is generated by populating an float4 with the second value of g.
+ *    Only the first value is important in the final float4.
  *
  * v1          v3          g1
  *  @ + + + + @            @                    y
@@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
  * v0          v2          g0
  *
  */
-ccl_device_inline ssef bi_mix(ssef p, ssef f)
+ccl_device_inline float4 bi_mix(float4 p, float4 f)
 {
-  ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
+  float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
   return mix(g, shuffle<1>(g), shuffle<1>(f));
 }
 
-ccl_device_inline ssef fade(const ssef &t)
+ccl_device_inline float4 fade(const float4 t)
 {
-  ssef a = madd(t, 6.0f, -15.0f);
-  ssef b = madd(t, a, 10.0f);
+  float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f));
+  float4 b = madd(t, a, make_float4(10.0f));
   return (t * t) * (t * b);
 }
 
 /* Negate val if the nth bit of h is 1. */
 #  define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
 
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y)
 {
-  ssei h = hash & 7;
-  ssef u = select(h < 4, x, y);
-  ssef v = 2.0f * select(h < 4, y, x);
+  int4 h = hash & 7;
+  float4 u = select(h < 4, x, y);
+  float4 v = 2.0f * select(h < 4, y, x);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }
 
@@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
  */
 ccl_device_noinline_cpu float perlin_2d(float x, float y)
 {
-  ssei XY;
-  ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
-  ssef uv = fade(fxy);
+  int4 XY;
+  float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY);
+  float4 uv = fade(fxy);
 
-  ssei XY1 = XY + 1;
-  ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
-  ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+  int4 XY1 = XY + make_int4(1);
+  int4 X = shuffle<0, 0, 0, 0>(XY, XY1);
+  int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
 
-  ssei h = hash_ssei2(X, Y);
+  int4 h = hash_int4_2(X, Y);
 
-  ssef fxy1 = fxy - 1.0f;
-  ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
-  ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+  float4 fxy1 = fxy - make_float4(1.0f);
+  float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+  float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
 
-  ssef g = grad(h, fx, fy);
+  float4 g = grad(h, fx, fy);
 
   return extract<0>(bi_mix(g, uv));
 }
 
 /* SSE Trilinear Interpolation:
  *
- * The function takes three ssef inputs:
+ * The function takes three float4 inputs:
  * - p : Contains the values at the points (v0, v1, v2, v3).
  * - q : Contains the values at the points (v4, v5, v6, v7).
  * - f : Contains the values (x, y, z, _). The fourth value is unused.
@@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
  * 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
  * 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
  *    (s2, s3) is generated by moving v2 and v3 to the first and second
- *    places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ *    places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
  *    fourth values are unused.
  * 3. Interpolate g0 and g1 along the z axis to get the final value.
- *    g1 is generated by populating an ssef with the second value of g.
- *    Only the first value is important in the final ssef.
+ *    g1 is generated by populating an float4 with the second value of g.
+ *    Only the first value is important in the final float4.
  *
  *   v3               v7
  *     @ + + + + + + @               s3 @
@@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
  *          @ + + + + + + @                  @
  *        v0               v4                 s0
  */
-ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
+ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f)
 {
-  ssef s = mix(p, q, shuffle<0>(f));
-  ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
+  float4 s = mix(p, q, shuffle<0>(f));
+  float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
   return mix(g, shuffle<1>(g), shuffle<2>(f));
 }
 
@@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
  * supported, we do an SSE implementation, but if it is supported,
  * we do an implementation using AVX intrinsics.
  */
-#  if !defined(__KERNEL_AVX__)
+#  if !defined(__KERNEL_AVX2__)
 
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z)
 {
-  ssei h = hash & 15;
-  ssef u = select(h < 8, x, y);
-  ssef vt = select((h == 12) | (h == 14), x, z);
-  ssef v = select(h < 4, y, vt);
+  int4 h = hash & 15;
+  float4 u = select(h < 8, x, y);
+  float4 vt = select((h == 12) | (h == 14), x, z);
+  float4 v = select(h < 4, y, vt);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }
 
-ccl_device_inline ssef
-grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
+ccl_device_inline float4
+grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w)
 {
-  ssei h = hash & 31;
-  ssef u = select(h < 24, x, y);
-  ssef v = select(h < 16, y, z);
-  ssef s = select(h < 8, z, w);
+  int4 h = hash & 31;
+  float4 u = select(h < 24, x, y);
+  float4 v = select(h < 16, y, z);
+  float4 s = select(h < 8, z, w);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
 }
 
@@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &
  * between two trilinear interpolations.
  *
  */
-ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
+ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f)
 {
   return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
 }
@@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
  */
 ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
 {
-  ssei XYZ;
-  ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
-  ssef uvw = fade(fxyz);
+  int4 XYZ;
+  float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+  float4 uvw = fade(fxyz);
 
-  ssei XYZ1 = XYZ + 1;
-  ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+  int4 XYZ1 = XYZ + make_int4(1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
 
-  ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
-  ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
+  int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z);
+  int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z);
 
-  ssef fxyz1 = fxyz - 1.0f;
-  ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>

@@ Diff output truncated at 10240 characters. @@