[Bf-blender-cvs] [e588106] master: Cycles: Use more SSE intrinsics for float3 type

Wed Oct 12 15:15:04 CEST 2016

Commit: e588106d459207f04d28cfc3456355343d413446
Author: Sergey Sharybin
Date:   Wed Oct 12 14:23:29 2016 +0200
Branches: master
https://developer.blender.org/rBe588106d459207f04d28cfc3456355343d413446

Cycles: Use more SSE intrinsics for float3 type

This gives about a 5% speedup on AVX2 kernels (other kernels still
have SSE disabled for math operations), and it solves the slowdown
of the koro scene mentioned in the previous commit.

The title says it all, actually. This commit also contains
changes to pass float3 by const reference in the affected functions.

This should make MSVC happier without breaking OpenCL because it's
only done in areas which are ifdef-ed for non-OpenCL.

Another patch based on inspiration from Maxym Dmytrychenko, thanks!

===================================================================

M	intern/cycles/util/util_math.h
M	intern/cycles/util/util_types.h

===================================================================

diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 89a882d..c98407b 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -424,53 +424,87 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float3 operator-(const float3 a)
+ccl_device_inline float3 operator-(const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
 	return make_float3(-a.x, -a.y, -a.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, const float3 b)
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,b.m128));
+#else
 	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, float f)
+ccl_device_inline float3 operator*(const float3& a, const float f)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator*(float f, const float3 a)
+ccl_device_inline float3 operator*(const float f, const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator/(float f, const float3 a)
+ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	return make_float3(f/a.x, f/a.y, f/a.z);
+#ifdef __KERNEL_SSE__
+	__m128 rc = _mm_rcp_ps(a.m128);
+	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#else
+	return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
 }
 
-ccl_device_inline float3 operator/(const float3 a, float f)
+ccl_device_inline float3 operator/(const float3& a, const float f)
 {
 	float invf = 1.0f/f;
-	return make_float3(a.x*invf, a.y*invf, a.z*invf);
+	return a * invf;
 }
 
-ccl_device_inline float3 operator/(const float3 a, const float3 b)
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+#ifdef __KERNEL_SSE__
+	__m128 rc = _mm_rcp_ps(b.m128);
+	return float3(_mm_mul_ps(a, rc));
+#else
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+(const float3 a, const float3 b)
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
 {
-	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
 }
 
-ccl_device_inline float3 operator-(const float3 a, const float3 b)
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
 {
-	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+=(float3& a, const float3 b)
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
 {
 	return a = a + b;
 }
@@ -505,6 +539,15 @@ ccl_device_inline float dot(const float3 a, const float3 b)
 #endif
 }
 
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
+#else
+	return a.x*b.x + a.y*b.y;
+#endif
+}
+
 ccl_device_inline float dot(const float4 a, const float4 b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
 ccl_device_inline float3 normalize_len(const float3 a, float *t)
 {
 	*t = len(a);
-	return a/(*t);
+	float x = 1.0f / *t;
+	return a*x;
 }
 
 ccl_device_inline float3 safe_normalize(const float3 a)
 {
 	float t = len(a);
-	return (t != 0.0f)? a/t: a;
+	return (t != 0.0f)? a * (1.0f/t) : a;
 }
 
 ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 6af65f8..a000fae 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
 	__forceinline int3(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	int3(const int3& a) { m128 = a.m128; }
+	int3& operator =(const int3& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
 	__forceinline int4(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	int4(const int4& a) : m128(a.m128) {}
+	int4& operator=(const int4& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
 	};
 
 	__forceinline float3() {}
-	__forceinline float3(const __m128 a) : m128(a) {}
+	__forceinline float3(const __m128& a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float3(const float3& a) : m128(a.m128) {}
+	__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
 #else
 	float x, y, z, w;
 #endif
@@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
 	__forceinline float4(const __m128 a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float4(const float4& a) : m128(a.m128) {}
+	__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
+
 #else
 	float x, y, z, w;
 #endif