[Bf-blender-cvs] [282516e53eb] master: Cleanup: refactor float/half conversions for clarity

Fri Oct 22 13:03:37 CEST 2021

Commit: 282516e53eba9bb3aaddd67b2b099fea98bd4c1f
Author: Brecht Van Lommel
Date:   Thu Oct 21 19:25:38 2021 +0200
Branches: master
https://developer.blender.org/rB282516e53eba9bb3aaddd67b2b099fea98bd4c1f

Cleanup: refactor float/half conversions for clarity

===================================================================

M	intern/cycles/integrator/pass_accessor.cpp
M	intern/cycles/integrator/pass_accessor_cpu.cpp
M	intern/cycles/kernel/device/cpu/image.h
M	intern/cycles/kernel/device/cuda/compat.h
M	intern/cycles/kernel/device/gpu/kernel.h
M	intern/cycles/kernel/device/optix/compat.h
M	intern/cycles/util/util_half.h
M	intern/cycles/util/util_image.h

===================================================================

diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index 4ef9ce7ef42..1308b03b06c 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -115,7 +115,7 @@ static void pad_pixels(const BufferParams &buffer_params,
   }
 
   if (destination.pixels_half_rgba) {
-    const half one = float_to_half(1.0f);
+    const half one = float_to_half_display(1.0f);
     half4 *pixel = destination.pixels_half_rgba + destination.offset;
 
     for (size_t i = 0; i < size; i++, pixel++) {
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 80908271ff6..e3cb81d31b7 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -148,8 +148,8 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
 
       film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
 
-      float4_store_half(&pixel->x,
-                        make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
+      *pixel = float4_to_half4_display(
+          make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
     }
   });
 }
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
index 44c5d7ef065..93f956e354d 100644
--- a/intern/cycles/kernel/device/cpu/image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -72,12 +72,12 @@ template<typename T> struct TextureInterpolator {
 
   static ccl_always_inline float4 read(half4 r)
   {
-    return half4_to_float4(r);
+    return half4_to_float4_image(r);
   }
 
   static ccl_always_inline float4 read(half r)
   {
-    float f = half_to_float(r);
+    float f = half_to_float_image(r);
     return make_float4(f, f, f, 1.0f);
   }
 
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 685c7a5b753..8a50eb1a3d5 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -128,6 +128,13 @@ __device__ half __float2half(const float f)
   return val;
 }
 
+__device__ float __half2float(const half h)
+{
+  float val;
+  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+  return val;
+}
+
 /* Types */
 
 #include "util/util_half.h"
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index eeac09d4b29..335cb1ec0c0 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -516,7 +516,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
   film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
 
   ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
-  float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+  *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
 }
 
 /* Common implementation for half4 destination and 3-channel input pass. */
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index c9ec9be05df..d27b7d55475 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -120,6 +120,13 @@ __device__ half __float2half(const float f)
   return val;
 }
 
+__device__ float __half2float(const half h)
+{
+  float val;
+  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+  return val;
+}
+
 /* Types */
 
 #include "util/util_half.h"
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 81723abe1e2..0db5acd319a 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -59,99 +59,16 @@ struct half4 {
   half x, y, z, w;
 };
 
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+/* Conversion to/from half float for image textures
+ *
+ * Simplified float to half for fast sampling on processor without a native
+ * instruction, and eliminating any NaN and inf values. */
 
-ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
+ccl_device_inline half float_to_half_image(float f)
 {
-  h[0] = __float2half(f.x);
-  h[1] = __float2half(f.y);
-  h[2] = __float2half(f.z);
-  h[3] = __float2half(f.w);
-}
-
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+  return __float2half(f);
 #else
-
-ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
-{
-
-#  ifndef __KERNEL_SSE2__
-  for (int i = 0; i < 4; i++) {
-    /* optimized float to half for pixels:
-     * assumes no negative, no nan, no inf, and sets denormal to 0 */
-    union {
-      uint i;
-      float f;
-    } in;
-    in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f;
-    int x = in.i;
-
-    int absolute = x & 0x7FFFFFFF;
-    int Z = absolute + 0xC8000000;
-    int result = (absolute < 0x38800000) ? 0 : Z;
-    int rshift = (result >> 13);
-
-    h[i] = (rshift & 0x7FFF);
-  }
-#  else
-  /* same as above with SSE */
-  ssef x = min(max(load4f(f), 0.0f), 65504.0f);
-
-#    ifdef __KERNEL_AVX2__
-  ssei rpack = _mm_cvtps_ph(x, 0);
-#    else
-  ssei absolute = cast(x) & 0x7FFFFFFF;
-  ssei Z = absolute + 0xC8000000;
-  ssei result = andnot(absolute < 0x38800000, Z);
-  ssei rshift = (result >> 13) & 0x7FFF;
-  ssei rpack = _mm_packs_epi32(rshift, rshift);
-#    endif
-
-  _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack));
-#  endif
-}
-
-#  ifndef __KERNEL_HIP__
-
-ccl_device_inline float half_to_float(half h)
-{
-  float f;
-
-  *((int *)&f) = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
-
-  return f;
-}
-#  else
-
-ccl_device_inline float half_to_float(std::uint32_t a) noexcept
-{
-
-  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
-
-  std::uint32_t v = __float_as_uint(__uint_as_float(u) *
-                                    __uint_as_float(0x77800000U) /*0x1.0p+112f*/) +
-                    0x38000000U;
-
-  u = (a & 0x7fff) != 0 ? v : u;
-
-  return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/;
-}
-
-#  endif /* __KERNEL_HIP__ */
-
-ccl_device_inline float4 half4_to_float4(half4 h)
-{
-  float4 f;
-
-  f.x = half_to_float(h.x);
-  f.y = half_to_float(h.y);
-  f.z = half_to_float(h.z);
-  f.w = half_to_float(h.w);
-
-  return f;
-}
-
-ccl_device_inline half float_to_half(float f)
-{
   const uint u = __float_as_uint(f);
   /* Sign bit, shifted to its position. */
   uint sign_bit = u & 0x80000000;
@@ -170,10 +87,83 @@ ccl_device_inline half float_to_half(float f)
   value_bits = (exponent_bits == 0 ? 0 : value_bits);
   /* Re-insert sign bit and return. */
   return (value_bits | sign_bit);
+#endif
+}
+
+ccl_device_inline float half_to_float_image(half h)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+  return __half2float(h);
+#else
+  const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
+  return __int_as_float(x);
+#endif
 }
 
+ccl_device_inline float4 half4_to_float4_image(const half4 h)
+{
+  /* Unable to use because it gives different results half_to_float_image, can we
+   * modify float_to_half_image so the conversion results are identical? */
+#if 0 /* defined(__KERNEL_AVX2__) */
+  /* CPU: AVX. */
+  __m128i x = _mm_castpd_si128(_mm_load_sd((const double *)&h));
+  return float4(_mm_cvtph_ps(x));
 #endif
 
+  const float4 f = make_float4(half_to_float_image(h.x),
+                               half_to_float_image(h.y),
+                               half_to_float_image(h.z),
+                               half_to_float_image(h.w));
+  return f;
+}
+
+/* Conversion to half float texture for display.
+ *
+ * Simplified float to half for fast display texture conversion on processors
+ * without a native instruction. Assumes no negative, no NaN, no inf, and sets
+ * denormal to 0. */
+
+ccl_device_inline half float_to_half_display(const float f)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+  return __float2half(f);
+#else
+  const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
+  const int absolute = x & 0x7FFFFFFF;
+  const int Z = absolute + 0xC8000000;
+  const int result = (absolute < 0x38800000) ? 0 : Z;
+  const int rshift = (result >> 13);
+  return (rshift & 0x7FFF);
+#endif
+}
+
+ccl_device_inline half4 float4_to_half4_display(const float4 f)
+{
+#ifdef __KERNEL_SSE2__
+  /* CPU: SSE and AVX. */
+  ssef x = min(max(load4f(f), 0.0f), 65504.0f);
+#  ifdef __KERNEL_AVX2__
+  ssei rpack = _mm_cvtps_ph(x, 0);
+#  else
+  ssei absolute = cast(x) & 0x7FFFFFFF;
+  ssei Z = absolute + 0xC8000000;
+  ssei result = andnot(absolute < 0x38800000, Z);
+  ssei rshift = (result >> 13) & 0x7FFF;
+  ssei rpack = _mm_packs_epi32(rshift, rshift);
+#  endif
+  half4 h;
+  _mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
+  return h;
+#else
+  /* GPU and scalar fallback. */
+  const half4 h = {float_to_half_display(f.x),
+                   float_to_half_display(f.y),
+                   float_to_half_display(f.z),
+                   float_to_half_display(f.w)};
+  return h;
+#endif
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_HALF_H__ */
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index 27ec7ffb423..b082b971613 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -56,7 +56,7 @@ template<> inline float util_image_cast_to_float(uint16_t value)
 }
 template<> inline float util_image_cast_to_float(half value)
 {
-  return half_to_float(value);
+  return half_to_float_image(value);
 }
 
 /* Cast float value to output pixel type. */
@@ -88,7 +88,7 @@ template<> inline uint16_t util_image_cast_from_float(float value)
 }
 template<> inline half util_image_cast_from_float(float value)
 {
-  return float_to_half(value);
+  return float_to_half_image(value);
 }
 
 CCL_NAMESPACE_END