[Bf-blender-cvs] [dc1043d] master: Cycles: Add fast math function module
Sergey Sharybin
noreply at git.blender.org
Fri Jan 30 21:51:00 CET 2015
Commit: dc1043dda0552af72396fec15dccd9d7eefee803
Author: Sergey Sharybin
Date: Fri Jan 30 17:56:47 2015 +0500
Branches: master
https://developer.blender.org/rBdc1043dda0552af72396fec15dccd9d7eefee803
Cycles: Add fast math function module
It is based on fmath.h from OIIO and could be used to give some speedup
in areas where absolute accuracy is not so critical.
===================================================================
M SConstruct
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/kernel_math.h
M intern/cycles/util/CMakeLists.txt
A intern/cycles/util/util_math_fast.h
===================================================================
diff --git a/SConstruct b/SConstruct
index 9e5d434..7de9f2c 100644
--- a/SConstruct
+++ b/SConstruct
@@ -1010,6 +1010,7 @@ if env['OURPLATFORM']!='darwin':
source.append('intern/cycles/util/util_color.h')
source.append('intern/cycles/util/util_half.h')
source.append('intern/cycles/util/util_math.h')
+ source.append('intern/cycles/util/util_math_fast.h')
source.append('intern/cycles/util/util_transform.h')
source.append('intern/cycles/util/util_types.h')
scriptinstall.append(env.Install(dir=dir,source=source))
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 0b2a543..a25eb3f 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -137,6 +137,7 @@ set(SRC_UTIL_HEADERS
../util/util_color.h
../util/util_half.h
../util/util_math.h
+ ../util/util_math_fast.h
../util/util_transform.h
../util/util_types.h
)
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 132b042..453f4c8 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -19,6 +19,7 @@
#include "util_color.h"
#include "util_math.h"
+#include "util_math_fast.h"
#include "util_transform.h"
#endif /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index d52bcd6..1961ee8 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -47,6 +47,7 @@ set(SRC_HEADERS
util_logging.h
util_map.h
util_math.h
+ util_math_fast.h
util_md5.h
util_opengl.h
util_optimization.h
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
new file mode 100644
index 0000000..9b432fc
--- /dev/null
+++ b/intern/cycles/util/util_math_fast.h
@@ -0,0 +1,611 @@
+/*
+ * Adapted from OpenImageIO library with this license:
+ *
+ * Copyright 2008-2014 Larry Gritz and the other authors and contributors.
+ * All Rights Reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the software's owners nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * (This is the Modified BSD License)
+ *
+ * A few bits here are based upon code from NVIDIA that was also released
+ * under the same modified BSD license, and marked as:
+ * Copyright 2004 NVIDIA Corporation. All Rights Reserved.
+ *
+ * Some parts of this file were first open-sourced in Open Shading Language,
+ * then later moved here. The original copyright notice was:
+ * Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
+ *
+ * Many of the math functions were copied from or inspired by other
+ * public domain sources or open source packages with compatible licenses.
+ * The individual functions give references were applicable.
+ */
+
+#ifndef __UTIL_FAST_MATH__
+#define __UTIL_FAST_MATH__
+
+CCL_NAMESPACE_BEGIN
+
+/* TODO(sergey): Make sure it does not conflict with SSE intrinsics. */
+ccl_device_inline float madd(const float a, const float b, const float c)
+{
+ /* NOTE: In the future we may want to explicitly ask for a fused
+ * multiply-add in a specialized version for float.
+ *
+ * NOTE: GCC/ICC will turn this (for float) into a FMA unless
+ * explicitly asked not to, clang seems to leave the code alone.
+ */
+ return a * b + c;
+}
+
+/*
+ * FAST & APPROXIMATE MATH
+ *
+ * The functions named "fast_*" provide a set of replacements to libm that
+ * are much faster at the expense of some accuracy and robust handling of
+ * extreme values. One design goal for these approximation was to avoid
+ * branches as much as possible and operate on single precision values only
+ * so that SIMD versions should be straightforward ports We also try to
+ * implement "safe" semantics (ie: clamp to valid range where possible)
+ * natively since wrapping these inline calls in another layer would be
+ * wasteful.
+ *
+ * Some functions are fast_safe_*, which is both a faster approximation as
+ * well as clamped input domain to ensure no NaN, Inf, or divide by zero.
+ */
+
+/* Round to nearest integer, returning as an int. */
+ccl_device_inline int fast_rint(float x)
+{
+ /* used by sin/cos/tan range reduction. */
+#ifdef __KERNEL_SSE4__
+ /* Single roundps instruction on SSE4.1+ (for gcc/clang at least). */
+ return float_to_int(rintf(x));
+#else
+ /* emulate rounding by adding/substracting 0.5. */
+ return float_to_int(x + copysignf(0.5f, x));
+#endif
+}
+
+ccl_device float fast_sinf(float x)
+{
+ /* Very accurate argument reduction from SLEEF,
+ * starts failing around x=262000
+ *
+ * Results on: [-2pi,2pi].
+ *
+ * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
+ * 1.19209e-07 max error
+ */
+ int q = fast_rint(x * M_1_PI_F);
+ float qf = q;
+ x = madd(qf, -0.78515625f*4, x);
+ x = madd(qf, -0.00024187564849853515625f*4, x);
+ x = madd(qf, -3.7747668102383613586e-08f*4, x);
+ x = madd(qf, -1.2816720341285448015e-12f*4, x);
+ x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals */
+ float s = x * x;
+ if((q & 1) != 0) x = -x;
+ /* This polynomial approximation has very low error on [-pi/2,+pi/2]
+ * 1.19209e-07 max error in total over [-2pi,+2pi]. */
+ float u = 2.6083159809786593541503e-06f;
+ u = madd(u, s, -0.0001981069071916863322258f);
+ u = madd(u, s, +0.00833307858556509017944336f);
+ u = madd(u, s, -0.166666597127914428710938f);
+ u = madd(s, u * x, x);
+ /* For large x, the argument reduction can fail and the polynomial can be
+ * evaluated with arguments outside the valid internal. Just clamp the bad
+ * values away (setting to 0.0f means no branches need to be generated). */
+ if(fabsf(u) > 1.0f) {
+ u = 0.0f;
+ }
+ return u;
+}
+
+ccl_device float fast_cosf(float x)
+{
+ /* Same argument reduction as fast_sinf(). */
+ int q = fast_rint(x * M_1_PI_F);
+ float qf = q;
+ x = madd(qf, -0.78515625f*4, x);
+ x = madd(qf, -0.00024187564849853515625f*4, x);
+ x = madd(qf, -3.7747668102383613586e-08f*4, x);
+ x = madd(qf, -1.2816720341285448015e-12f*4, x);
+ x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals. */
+ float s = x * x;
+ /* Polynomial from SLEEF's sincosf, max error is
+ * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
+ float u = -2.71811842367242206819355e-07f;
+ u = madd(u, s, +2.47990446951007470488548e-05f);
+ u = madd(u, s, -0.00138888787478208541870117f);
+ u = madd(u, s, +0.0416666641831398010253906f);
+ u = madd(u, s, -0.5f);
+ u = madd(u, s, +1.0f);
+ if((q & 1) != 0) {
+ u = -u;
+ }
+ if(fabsf(u) > 1.0f) {
+ u = 0.0f;
+ }
+ return u;
+}
+
+ccl_device void fast_sincosf(float x, float* sine, float* cosine)
+{
+ /* Same argument reduction as fast_sin. */
+ int q = fast_rint(x * float(M_1_PI));
+ float qf = q;
+ x = madd(qf, -0.78515625f*4, x);
+ x = madd(qf, -0.00024187564849853515625f*4, x);
+ x = madd(qf, -3.7747668102383613586e-08f*4, x);
+ x = madd(qf, -1.2816720341285448015e-12f*4, x);
+ x = M_PI_2_F - (M_PI_2_F - x); // crush denormals
+ float s = x * x;
+ /* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
+ if((q & 1) != 0) {
+ x = -x;
+ }
+ float su = 2.6083159809786593541503e-06f;
+ su = madd(su, s, -0.0001981069071916863322258f);
+ su = madd(su, s, +0.00833307858556509017944336f);
+ su = madd(su, s, -0.166666597127914428710938f);
+ su = madd(s, su * x, x);
+ float cu = -2.71811842367242206819355e-07f;
+ cu = madd(cu, s, +2.47990446951007470488548e-05f);
+ cu = madd(cu, s, -0.00138888787478208541870117f);
+ cu = madd(cu, s, +0.0416666641831398010253906f);
+ cu = madd(cu, s, -0.5f);
+ cu = madd(cu, s, +1.0f);
+ if((q & 1) != 0) {
+ cu = -cu;
+ }
+ if(fabsf(su) > 1.0f) {
+ su = 0.0f;
+ }
+ if(fabsf(cu) > 1.0f) {
+ cu = 0.0f;
+ }
+ *sine = su;
+ *cosine = cu;
+}
+
+/* NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts
+ * becoming really poor outside of this range because the reciprocal amplifies
+ * errors.
+ */
+ccl_device float fast_tanf(float x)
+{
+ /* Derived from SLEEF implementation.
+ *
+ * Note that we cannot apply the "denormal crush" trick everywhere because
+ * we sometimes need to take the reciprocal of the polynomial
+ */
+ int q = fast_rint(x * 2.0f * M_1_PI_F);
+ float qf = q;
+ x = madd(qf, -0.78515625f*2, x);
+ x = madd(qf, -0.00024187564849853515625f*2, x);
+ x = madd(qf, -3.7747668102383613586e-08f*2, x);
+ x = madd(qf, -1.2816720341285448015e-12f*2, x);
+ if((q & 1) == 0) {
+ /* Crush denormals (only if we aren't inverting the result later). */
+ x = M_PI_4_F - (M_PI_4_F - x);
+ }
+ float s = x * x;
+ float u = 0.00927245803177356719970703f;
+ u = madd(u, s, 0.00331984995864331722259521f);
+ u = madd(u, s, 0.0242998078465461730957031f);
+ u = madd(u, s, 0.0534495301544666290283203f);
+ u = madd(u, s, 0.133383005857467651367188f);
+ u = madd(u, s, 0.3333318
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list