[Bf-blender-cvs] [dc1043d] master: Cycles: Add fast math function module

Fri Jan 30 21:51:00 CET 2015

Commit: dc1043dda0552af72396fec15dccd9d7eefee803
Author: Sergey Sharybin
Date:   Fri Jan 30 17:56:47 2015 +0500
Branches: master
https://developer.blender.org/rBdc1043dda0552af72396fec15dccd9d7eefee803

Cycles: Add fast math function module

It is based on fmath.h from OIIO and could be used to give some speedup
in areas where absolute accuracy is not so critical.

===================================================================

M	SConstruct
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/kernel_math.h
M	intern/cycles/util/CMakeLists.txt
A	intern/cycles/util/util_math_fast.h

===================================================================

diff --git a/SConstruct b/SConstruct
index 9e5d434..7de9f2c 100644
--- a/SConstruct
+++ b/SConstruct
@@ -1010,6 +1010,7 @@ if env['OURPLATFORM']!='darwin':
             source.append('intern/cycles/util/util_color.h')
             source.append('intern/cycles/util/util_half.h')
             source.append('intern/cycles/util/util_math.h')
+            source.append('intern/cycles/util/util_math_fast.h')
             source.append('intern/cycles/util/util_transform.h')
             source.append('intern/cycles/util/util_types.h')
             scriptinstall.append(env.Install(dir=dir,source=source))
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 0b2a543..a25eb3f 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -137,6 +137,7 @@ set(SRC_UTIL_HEADERS
 	../util/util_color.h
 	../util/util_half.h
 	../util/util_math.h
+	../util/util_math_fast.h
 	../util/util_transform.h
 	../util/util_types.h
 )
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 132b042..453f4c8 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -19,6 +19,7 @@
 
 #include "util_color.h"
 #include "util_math.h"
+#include "util_math_fast.h"
 #include "util_transform.h"
 
 #endif /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index d52bcd6..1961ee8 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -47,6 +47,7 @@ set(SRC_HEADERS
 	util_logging.h
 	util_map.h
 	util_math.h
+	util_math_fast.h
 	util_md5.h
 	util_opengl.h
 	util_optimization.h
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
new file mode 100644
index 0000000..9b432fc
--- /dev/null
+++ b/intern/cycles/util/util_math_fast.h
@@ -0,0 +1,611 @@
+/*
+ * Adapted from OpenImageIO library with this license:
+ *
+ * Copyright 2008-2014 Larry Gritz and the other authors and contributors.
+ * All Rights Reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of the software's owners nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * (This is the Modified BSD License)
+ *
+ * A few bits here are based upon code from NVIDIA that was also released
+ * under the same modified BSD license, and marked as:
+ *    Copyright 2004 NVIDIA Corporation. All Rights Reserved.
+ *
+ * Some parts of this file were first open-sourced in Open Shading Language,
+ * then later moved here. The original copyright notice was:
+ *    Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
+ *
+ * Many of the math functions were copied from or inspired by other
+ * public domain sources or open source packages with compatible licenses.
+ * The individual functions give references where applicable.
+ */
+
+#ifndef __UTIL_FAST_MATH__
+#define __UTIL_FAST_MATH__
+
+CCL_NAMESPACE_BEGIN
+
+/* Multiply-add helper: returns a * b + c.
+ *
+ * The fast_* functions in this file use it for every step of their range
+ * reduction and polynomial evaluation.
+ *
+ * TODO(sergey): Make sure it does not conflict with SSE intrinsics. */
+ccl_device_inline float madd(const float a, const float b, const float c)
+{
+	/* NOTE: In the future we may want to explicitly ask for a fused
+	 * multiply-add in a specialized version for float.
+	 *
+	 * NOTE: GCC/ICC will turn this (for float) into a FMA unless
+	 * explicitly asked not to, clang seems to leave the code alone.
+	 */
+	return a * b + c;
+}
+
+/*
+ * FAST & APPROXIMATE MATH
+ *
+ * The functions named "fast_*" provide a set of replacements to libm that
+ * are much faster at the expense of some accuracy and robust handling of
+ * extreme values. One design goal for these approximations was to avoid
+ * branches as much as possible and operate on single precision values only
+ * so that SIMD versions should be straightforward ports. We also try to
+ * implement "safe" semantics (ie: clamp to valid range where possible)
+ * natively since wrapping these inline calls in another layer would be
+ * wasteful.
+ *
+ * Some functions are fast_safe_*, which is both a faster approximation as
+ * well as clamped input domain to ensure no NaN, Inf, or divide by zero.
+ */
+
+/* Round to nearest integer, returning as an int.
+ *
+ * With __KERNEL_SSE4__ defined the value goes through rintf(); otherwise
+ * 0.5 carrying the sign of x is added first. Both paths finish with
+ * float_to_int().
+ *
+ * NOTE(review): the fallback presumably relies on float_to_int()
+ * truncating toward zero, and the two paths may disagree on exact .5
+ * ties -- confirm against the float_to_int() definition. */
+ccl_device_inline int fast_rint(float x)
+{
+	/* used by sin/cos/tan range reduction. */
+#ifdef __KERNEL_SSE4__
+	/* Single roundps instruction on SSE4.1+ (for gcc/clang at least). */
+	return float_to_int(rintf(x));
+#else
+	/* emulate rounding by adding/subtracting 0.5. */
+	return float_to_int(x + copysignf(0.5f, x));
+#endif
+}
+
+ccl_device float fast_sinf(float x)
+{
+	/* Approximate sin(x): subtract the nearest multiple of pi from x, then
+	 * evaluate an odd polynomial in the reduced argument. Results whose
+	 * magnitude exceeds 1 are replaced by 0.
+	 *
+	 * Very accurate argument reduction from SLEEF,
+	 * starts failing around x=262000
+	 *
+	 * Results on: [-2pi,2pi].
+	 *
+	 * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
+	 * 1.19209e-07 max error
+	 */
+	int q = fast_rint(x * M_1_PI_F);  /* Nearest integer to x/pi. */
+	float qf = q;
+	x = madd(qf, -0.78515625f*4, x);  /* The four constants below sum to ~pi, */
+	x = madd(qf, -0.00024187564849853515625f*4, x);  /* so q*pi is removed in */
+	x = madd(qf, -3.7747668102383613586e-08f*4, x);  /* pieces of decreasing */
+	x = madd(qf, -1.2816720341285448015e-12f*4, x);  /* magnitude. */
+	x = M_PI_2_F - (M_PI_2_F - x);  /* Crush denormals */
+	float s = x * x;
+	if((q & 1) != 0) x = -x;  /* sin(x) = -sin(x - q*pi) for odd q. */
+	/* This polynomial approximation has very low error on [-pi/2,+pi/2]
+	 * 1.19209e-07 max error in total over [-2pi,+2pi]. */
+	float u = 2.6083159809786593541503e-06f;
+	u = madd(u, s, -0.0001981069071916863322258f);
+	u = madd(u, s, +0.00833307858556509017944336f);
+	u = madd(u, s, -0.166666597127914428710938f);
+	u = madd(s, u * x, x);  /* x + x*s*P(s): odd in x. */
+	/* For large x, the argument reduction can fail and the polynomial can be
+	 * evaluated with arguments outside the valid interval. Just clamp the bad
+	 * values away (setting to 0.0f means no branches need to be generated). */
+	if(fabsf(u) > 1.0f) {
+		u = 0.0f;
+	}
+	return u;
+}
+
+ccl_device float fast_cosf(float x)
+{
+	/* Approximate cos(x): subtract the nearest multiple of pi from x, then
+	 * evaluate an even polynomial (in s = x*x) of the reduced argument.
+	 * Results whose magnitude exceeds 1 are replaced by 0.
+	 *
+	 * Same argument reduction as fast_sinf(). */
+	int q = fast_rint(x * M_1_PI_F);  /* Nearest integer to x/pi. */
+	float qf = q;
+	x = madd(qf, -0.78515625f*4, x);  /* The four constants below sum to ~pi, */
+	x = madd(qf, -0.00024187564849853515625f*4, x);  /* so q*pi is removed in */
+	x = madd(qf, -3.7747668102383613586e-08f*4, x);  /* pieces of decreasing */
+	x = madd(qf, -1.2816720341285448015e-12f*4, x);  /* magnitude. */
+	x = M_PI_2_F - (M_PI_2_F - x);  /* Crush denormals. */
+	float s = x * x;
+	/* Polynomial from SLEEF's sincosf, max error is
+	 * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
+	float u = -2.71811842367242206819355e-07f;
+	u = madd(u, s, +2.47990446951007470488548e-05f);
+	u = madd(u, s, -0.00138888787478208541870117f);
+	u = madd(u, s, +0.0416666641831398010253906f);
+	u = madd(u, s, -0.5f);
+	u = madd(u, s, +1.0f);
+	if((q & 1) != 0) {  /* cos(x) = -cos(x - q*pi) for odd q. */
+		u = -u;
+	}
+	if(fabsf(u) > 1.0f) {  /* Reduction failed (large x): return 0 instead. */
+		u = 0.0f;
+	}
+	return u;
+}
+
+/* Approximate sin(x) and cos(x) together, sharing one range reduction.
+ *
+ * x:      angle; reduced by the nearest multiple of pi like fast_sinf().
+ * sine:   receives the sine approximation (written unconditionally).
+ * cosine: receives the cosine approximation (written unconditionally).
+ *
+ * Either result is replaced by 0 when its magnitude exceeds 1, which
+ * happens when the range reduction breaks down for large x.
+ */
+ccl_device void fast_sincosf(float x, float* sine, float* cosine)
+{
+	/* Same argument reduction as fast_sinf(); uses the same M_1_PI_F
+	 * constant as fast_sinf()/fast_cosf() rather than a function-style
+	 * cast of the double constant, which only C++ accepts. */
+	int q = fast_rint(x * M_1_PI_F);
+	float qf = q;
+	/* The four constants sum to ~pi, so q*pi is removed in pieces of
+	 * decreasing magnitude. */
+	x = madd(qf, -0.78515625f*4, x);
+	x = madd(qf, -0.00024187564849853515625f*4, x);
+	x = madd(qf, -3.7747668102383613586e-08f*4, x);
+	x = madd(qf, -1.2816720341285448015e-12f*4, x);
+	x = M_PI_2_F - (M_PI_2_F - x);  /* Crush denormals. */
+	float s = x * x;
+	/* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
+	if((q & 1) != 0) {
+		/* sin(x) = -sin(x - q*pi) for odd q. */
+		x = -x;
+	}
+	float su = 2.6083159809786593541503e-06f;
+	su = madd(su, s, -0.0001981069071916863322258f);
+	su = madd(su, s, +0.00833307858556509017944336f);
+	su = madd(su, s, -0.166666597127914428710938f);
+	su = madd(s, su * x, x);
+	float cu = -2.71811842367242206819355e-07f;
+	cu = madd(cu, s, +2.47990446951007470488548e-05f);
+	cu = madd(cu, s, -0.00138888787478208541870117f);
+	cu = madd(cu, s, +0.0416666641831398010253906f);
+	cu = madd(cu, s, -0.5f);
+	cu = madd(cu, s, +1.0f);
+	if((q & 1) != 0) {
+		/* cos(x) = -cos(x - q*pi) for odd q. */
+		cu = -cu;
+	}
+	/* Clamp away results produced by a failed reduction. */
+	if(fabsf(su) > 1.0f) {
+		su = 0.0f;
+	}
+	if(fabsf(cu) > 1.0f) {
+		cu = 0.0f;
+	}
+	*sine   = su;
+	*cosine = cu;
+}
+
+/* NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts
+ * becoming really poor outside of this range because the reciprocal amplifies
+ * errors.
+ */
+ccl_device float fast_tanf(float x)
+{
+	/* Derived from SLEEF implementation.
+	 *
+	 * Note that we cannot apply the "denormal crush" trick everywhere because
+	 * we sometimes need to take the reciprocal of the polynomial
+	 */
+	int q = fast_rint(x * 2.0f * M_1_PI_F);
+	float qf = q;
+	x = madd(qf, -0.78515625f*2, x);
+	x = madd(qf, -0.00024187564849853515625f*2, x);
+	x = madd(qf, -3.7747668102383613586e-08f*2, x);
+	x = madd(qf, -1.2816720341285448015e-12f*2, x);
+	if((q & 1) == 0) {
+		/* Crush denormals (only if we aren't inverting the result later). */
+		x = M_PI_4_F - (M_PI_4_F - x);
+	}
+	float s = x * x;
+	float u = 0.00927245803177356719970703f;
+	u = madd(u, s, 0.00331984995864331722259521f);
+	u = madd(u, s, 0.0242998078465461730957031f);
+	u = madd(u, s, 0.0534495301544666290283203f);
+	u = madd(u, s, 0.133383005857467651367188f);
+	u = madd(u, s, 0.3333318

@@ Diff output truncated at 10240 characters. @@