[Bf-blender-cvs] [004d858138f] master: math: improve accuracy of Linear->sRGB conversion SIMD path

Wed Jun 15 19:51:38 CEST 2022

Commit: 004d858138fced20d45cf3cc0149fcef1922c8a0
Author: Aras Pranckevicius
Date:   Wed Jun 15 20:51:12 2022 +0300
Branches: master
https://developer.blender.org/rB004d858138fced20d45cf3cc0149fcef1922c8a0

math: improve accuracy of Linear->sRGB conversion SIMD path

linearrgb_to_srgb_v3_v3 uses a SIMD approximation of powf. While its
accuracy is acceptable, a larger issue is that it produces different
results on Intel and ARM architectures.

On ARM (e.g. Apple Silicon), the result of the SIMD code path is much
closer to the reference implementation. This seems to be due to the
use of _mm_rsqrt_ps in _bli_math_fastpow512. The ARM/NEON code path
emulates inverse square root with a combination of vrsqrteq_f32
followed by two Newton-Raphson iterations, because Blender uses the
SSE2NEON_PRECISE_SQRT define.

This commit adds similar NR iterations to the "actual SSE" code path
as well.

Max error of srgb->linear->srgb conversion roundtrip goes from
0.000211 down to about 0.000062.

Reviewed By: Sergey Sharybin
Differential Revision: https://developer.blender.org/D15193

===================================================================

M	source/blender/blenlib/intern/math_base_inline.c
M	source/blender/blenlib/tests/BLI_math_color_test.cc

===================================================================

diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index a983821f15e..4a213f5fe74 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -767,6 +767,20 @@ MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
   return _mm_mul_ps(x, _mm_mul_ps(x, x));
 }
 
+MALWAYS_INLINE __m128 _bli_math_rsqrt(__m128 in)
+{
+  __m128 r = _mm_rsqrt_ps(in);
+  /* Refine the estimate with one Newton-Raphson step, r * (1.5 - 0.5 * in * r * r),
+   * but only when using the actual SSE code path. When we are emulating SSE
+   * on NEON via sse2neon, the additional NR iterations are already done
+   * inside the _mm_rsqrt_ps emulation. */
+#  if defined(__SSE2__)
+  r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
+                 _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(in, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#  endif
+  return r;
+}
+
 /* Calculate powf(x, 1.0f / 2.4) */
 MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
 {
@@ -776,14 +790,14 @@ MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
    */
   __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
   __m128 xover = _mm_mul_ps(arg, xf);
-  __m128 xfm1 = _mm_rsqrt_ps(xf);
+  __m128 xfm1 = _bli_math_rsqrt(xf);
   __m128 x2 = _mm_mul_ps(arg, arg);
   __m128 xunder = _mm_mul_ps(x2, xfm1);
   /* sqrt2 * over + 2 * sqrt2 * under */
   __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
                            _mm_add_ps(xover, xunder));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
   return xavg;
 }
 
diff --git a/source/blender/blenlib/tests/BLI_math_color_test.cc b/source/blender/blenlib/tests/BLI_math_color_test.cc
index 7f2c0a3f1ca..4d928477870 100644
--- a/source/blender/blenlib/tests/BLI_math_color_test.cc
+++ b/source/blender/blenlib/tests/BLI_math_color_test.cc
@@ -74,3 +74,71 @@ TEST(math_color, LinearRGBTosRGBRoundtrip)
     EXPECT_NEAR(orig_linear_color, linear_color, 1e-5);
   }
 }
+
+TEST(math_color, linearrgb_to_srgb_v3_v3)
+{
+  float srgb_color[3];
+  {
+    const float kTolerance = 1.0e-8f;
+    const float linear_color[3] = {0.0023f, 0.0024f, 0.0025f};
+    linearrgb_to_srgb_v3_v3(srgb_color, linear_color);
+    EXPECT_NEAR(0.029716f, srgb_color[0], kTolerance);
+    EXPECT_NEAR(0.031008f, srgb_color[1], kTolerance);
+    EXPECT_NEAR(0.032300f, srgb_color[2], kTolerance);
+  }
+
+  {
+    /* SIMD implementation of linear->srgb for larger inputs
+     * is less accurate; use larger tolerance. */
+    const float kTolerance = 3.6e-5f;
+    const float linear_color[3] = {0.71f, 0.75f, 0.78f};
+    linearrgb_to_srgb_v3_v3(srgb_color, linear_color);
+    EXPECT_NEAR(0.859696f, srgb_color[0], kTolerance);
+    EXPECT_NEAR(0.880825f, srgb_color[1], kTolerance);
+    EXPECT_NEAR(0.896244f, srgb_color[2], kTolerance);
+  }
+
+  {
+    /* Uncommon but possible case: values beyond the 1.0 range. */
+    const float kTolerance = 2.3e-4f;
+    const float linear_color[3] = {1.5f, 2.8f, 5.6f};
+    linearrgb_to_srgb_v3_v3(srgb_color, linear_color);
+    EXPECT_NEAR(1.19418f, srgb_color[0], kTolerance);
+    EXPECT_NEAR(1.56520f, srgb_color[1], kTolerance);
+    EXPECT_NEAR(2.10771f, srgb_color[2], kTolerance);
+  }
+}
+
+TEST(math_color, srgb_to_linearrgb_v3_v3)
+{
+  float linear_color[3];
+  {
+    const float kTolerance = 1.0e-8f;
+    const float srgb_color[3] = {0.0023f, 0.0024f, 0.0025f};
+    srgb_to_linearrgb_v3_v3(linear_color, srgb_color);
+    EXPECT_NEAR(0.000178019f, linear_color[0], kTolerance);
+    EXPECT_NEAR(0.000185759f, linear_color[1], kTolerance);
+    EXPECT_NEAR(0.000193498f, linear_color[2], kTolerance);
+  }
+
+  {
+    /* SIMD implementation of srgb->linear for larger inputs
+     * is less accurate; use larger tolerance. */
+    const float kTolerance = 1.5e-7f;
+    const float srgb_color[3] = {0.71f, 0.72f, 0.73f};
+    srgb_to_linearrgb_v3_v3(linear_color, srgb_color);
+    EXPECT_NEAR(0.4623615f, linear_color[0], kTolerance);
+    EXPECT_NEAR(0.4770000f, linear_color[1], kTolerance);
+    EXPECT_NEAR(0.4919052f, linear_color[2], kTolerance);
+  }
+
+  {
+    /* Uncommon but possible case: values beyond the 1.0 range. */
+    const float kTolerance = 7.7e-6f;
+    const float srgb_color[3] = {1.1f, 2.5f, 5.6f};
+    srgb_to_linearrgb_v3_v3(linear_color, srgb_color);
+    EXPECT_NEAR(1.24277f, linear_color[0], kTolerance);
+    EXPECT_NEAR(8.35473f, linear_color[1], kTolerance);
+    EXPECT_NEAR(56.23833f, linear_color[2], kTolerance);
+  }
+}