[Bf-blender-cvs] [cb96cf0] master: Cycles: small optimization for SSE 4.1 bvh intersector

Sv. Lockal noreply at git.blender.org
Mon Feb 3 17:51:14 CET 2014


Commit: cb96cf0b637360a84d86f4f375f28ccc79a53294
Author: Sv. Lockal
Date:   Mon Feb 3 20:46:13 2014 +0400
https://developer.blender.org/rBcb96cf0b637360a84d86f4f375f28ccc79a53294

Cycles: small optimization for SSE 4.1 bvh intersector

Gives a 0.7%-1.3% speedup on the BMW1M-MikePan scene.

Reviewers: juicyfruit

Differential Revision: https://developer.blender.org/D280

===================================================================

M	intern/cycles/kernel/kernel_bvh_subsurface.h
M	intern/cycles/kernel/kernel_bvh_traversal.h
M	intern/cycles/util/util_simd.h

===================================================================

diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h
index bb51986..df82dda 100644
--- a/intern/cycles/kernel/kernel_bvh_subsurface.h
+++ b/intern/cycles/kernel/kernel_bvh_subsurface.h
@@ -66,20 +66,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	
 	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
 	__m128 Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
 
 	Psplat[0] = _mm_set_ps1(P.x);
 	Psplat[1] = _mm_set_ps1(P.y);
 	Psplat[2] = _mm_set_ps1(P.z);
 
-	idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
-	idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
-	idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
 	__m128 tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
 
-	shuffle_swap_t shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
-	shuffle_swap_t shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
-	shuffle_swap_t shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 	/* traversal loop */
@@ -139,9 +134,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
+				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
 
 				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
 				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
@@ -242,15 +237,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 						Psplat[1] = _mm_set_ps1(P.y);
 						Psplat[2] = _mm_set_ps1(P.z);
 
-						idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
-						idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
-						idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
 						tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
 
-						shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
-						shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
-						shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 						++stackPtr;
@@ -285,15 +274,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 			Psplat[1] = _mm_set_ps1(P.y);
 			Psplat[2] = _mm_set_ps1(P.z);
 
-			idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
-			idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
-			idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
 			tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
 
-			shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
-			shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
-			shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 			object = ~0;
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
index 1ee1fbc..b4c63f5 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -75,20 +75,15 @@ ccl_device bool BVH_FUNCTION_NAME
 	
 	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
 	__m128 Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
 
 	Psplat[0] = _mm_set_ps1(P.x);
 	Psplat[1] = _mm_set_ps1(P.y);
 	Psplat[2] = _mm_set_ps1(P.z);
 
-	idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
-	idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
-	idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
 	__m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
 
-	shuffle_swap_t shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
-	shuffle_swap_t shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
-	shuffle_swap_t shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 	/* traversal loop */
@@ -163,9 +158,9 @@ ccl_device bool BVH_FUNCTION_NAME
 				float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
+				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
 
 				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
 				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
@@ -286,15 +281,9 @@ ccl_device bool BVH_FUNCTION_NAME
 					Psplat[1] = _mm_set_ps1(P.y);
 					Psplat[2] = _mm_set_ps1(P.z);
 
-					idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
-					idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
-					idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
 					tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
 
-					shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
-					shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
-					shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 					++stackPtr;
@@ -322,15 +311,9 @@ ccl_device bool BVH_FUNCTION_NAME
 			Psplat[1] = _mm_set_ps1(P.y);
 			Psplat[2] = _mm_set_ps1(P.z);
 
-			idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
-			idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
-			idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
 			tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
 
-			shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
-			shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
-			shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 			object = ~0;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index ac4e38e..2d3a927 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -69,6 +69,36 @@ ccl_device_inline const __m128 shuffle_swap(const __m128& a, shuffle_swap_t shuf
 
 #endif
 
+#ifdef __KERNEL_SSE41__
+ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
+										  const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
+{
+	const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) };
+	idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
+	idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
+	idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);
+
+	const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+	const __m128 shuf_identity_f = _mm_castsi128_ps(shuf_identity);
+	const __m128 shuf_swap_f = _mm_castsi128_ps(shuf_swap);
+	shufflexyz[0] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
+	shufflexyz[1] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
+	shufflexyz[2] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
+}
+#else
+ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
+										  const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
+{
+	idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
+	idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
+	idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
+
+	shufflexyz[0] = (idir.x >= 0)? shuf_identity: shuf_swap;
+	shufflexyz[1] = (idir.y >= 0)? shuf_identity: shuf_swap;
+	shufflexyz[2] = (idir.z >= 0)? shuf_identity: shuf_swap;
+}
+#endif
+
 template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a, const __m128& b)
 {
 	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));




More information about the Bf-blender-cvs mailing list