[Bf-blender-cvs] [869a46df298] master: Cycles fp consistency for Apple Silicon CPUs

Michael Jones noreply at git.blender.org
Tue Apr 12 20:43:52 CEST 2022


Commit: 869a46df2980818644db4823fb1d29e9d525b645
Author: Michael Jones
Date:   Tue Apr 12 19:36:55 2022 +0100
Branches: master
https://developer.blender.org/rB869a46df2980818644db4823fb1d29e9d525b645

Cycles fp consistency for Apple Silicon CPUs

Propagate the fp settings from the main thread to all the worker threads (the fp settings include the FZ setting, among other things). This guarantees consistent execution of floating-point math regardless of whether it is executed in a tbb thread arena or on the main thread.

Add FZ mode to arm64/aarch64, in parallel to the way it has been done on Intel processors. Currently, compiling for an ARM target does not set this mode at all, so the code potentially runs slower and may produce results that do not match those of Intel x86.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14454

===================================================================

M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/util/simd.h

===================================================================

diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index ab134179602..f1e70b7f28f 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -355,6 +355,9 @@ void PathTrace::path_trace(RenderWork &render_work)
 
   const int num_works = path_trace_works_.size();
 
+  tbb::task_group_context *tbb_ctx = tbb::task::self().group();
+  tbb_ctx->capture_fp_settings();
+
   tbb::parallel_for(0, num_works, [&](int i) {
     const double work_start_time = time_dt();
     const int num_samples = render_work.path_trace.num_samples;
diff --git a/intern/cycles/util/simd.h b/intern/cycles/util/simd.h
index 15dda4e76a8..6772025d1de 100644
--- a/intern/cycles/util/simd.h
+++ b/intern/cycles/util/simd.h
@@ -32,6 +32,12 @@
 #  define SIMD_SET_FLUSH_TO_ZERO \
     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#elif defined(__aarch64__) || defined(_M_ARM64)
+/* Bit index of the FZ (flush-to-zero) flag in the FPCR register. Unlike the x86 constant of the
+ * same name this is a bit position, not a mask: set_fz()/get_fz() shift 1 by this amount. */
+#  define _MM_FLUSH_ZERO_ON 24
+/* Read FPCR into, or write FPCR from, the given variable.
+ * The write uses a plain "r" constraint: MSR to a system register only accepts a general-purpose
+ * register operand, so an immediate must never be selected for it.
+ * NOTE(review): this is GNU-style inline assembly; confirm it is accepted by every compiler that
+ * defines _M_ARM64 for this project. */
+#  define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
+#  define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "r"(__fpcr))
+/* Statement macro (includes its own trailing semicolon) that turns flush-to-zero on. */
+#  define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
+/* Expression macro that evaluates to 1 when flush-to-zero is on and 0 otherwise. */
+#  define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
 #else
 #  define SIMD_SET_FLUSH_TO_ZERO
 #endif
@@ -104,6 +110,21 @@ static struct PosInfTy {
 static struct StepTy {
 } step ccl_attr_maybe_unused;
 
+#endif
+#if defined(__aarch64__) || defined(_M_ARM64)
+/* Set bit number `flag` in the FPCR register.
+ * Returns 1 when the value read back after the write equals the value that was written,
+ * and 0 otherwise. */
+__forceinline int set_fz(uint32_t flag) {
+  uint64_t fpcr_before;
+  __get_fpcr(fpcr_before);
+
+  const uint64_t fpcr_wanted = fpcr_before | (1ULL << flag);
+  __set_fpcr(fpcr_wanted);
+
+  /* Read the register again so the caller can tell whether the write took effect. */
+  uint64_t fpcr_after;
+  __get_fpcr(fpcr_after);
+  return fpcr_after == fpcr_wanted;
+}
+/* Report whether bit number `flag` is set in the current FPCR value.
+ * Returns 1 if the bit is set and 0 if it is clear. */
+__forceinline int get_fz(uint32_t flag) {
+  uint64_t fpcr_now;
+  __get_fpcr(fpcr_now);
+
+  const uint64_t bit_mask = 1ULL << flag;
+  return (fpcr_now & bit_mask) != 0;
+}
 #endif
 
 /* Utilities used by Neon */



More information about the Bf-blender-cvs mailing list