[Bf-blender-cvs] [cd5e1ff] master: Cycles Refactor: Add SSE Utility code from Embree for cleaner SSE code.

Fri Jun 13 21:59:46 CEST 2014

Commit: cd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed
Author: Thomas Dinges
Date:   Fri Jun 13 21:13:18 2014 +0200
https://developer.blender.org/rBcd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed

Cycles Refactor: Add SSE Utility code from Embree for cleaner SSE code.

This makes the code a bit easier to understand, and might come in handy
if we want to reuse more Embree code.

Differential Revision: https://developer.blender.org/D482

Code by Brecht, with fixes by Lockal, Sergey and myself.

===================================================================

M	intern/cycles/kernel/geom/geom_bvh_shadow.h
M	intern/cycles/kernel/geom/geom_bvh_subsurface.h
M	intern/cycles/kernel/geom/geom_bvh_traversal.h
M	intern/cycles/kernel/geom/geom_curve.h
M	intern/cycles/kernel/kernel_avx.cpp
M	intern/cycles/kernel/kernel_compat_cpu.h
M	intern/cycles/kernel/svm/svm_image.h
M	intern/cycles/kernel/svm/svm_noise.h
M	intern/cycles/kernel/svm/svm_texture.h
M	intern/cycles/render/curves.cpp
M	intern/cycles/render/tile.cpp
M	intern/cycles/util/CMakeLists.txt
M	intern/cycles/util/util_color.h
M	intern/cycles/util/util_half.h
M	intern/cycles/util/util_optimization.h
A	intern/cycles/util/util_simd.cpp
M	intern/cycles/util/util_simd.h
A	intern/cycles/util/util_sseb.h
A	intern/cycles/util/util_ssef.h
A	intern/cycles/util/util_ssei.h
M	intern/cycles/util/util_types.h

===================================================================

diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 98bf82b..48876da 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -68,15 +68,15 @@ ccl_device bool BVH_FUNCTION_NAME
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
 	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psplat[3], idirsplat[3];
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
 	shuffle_swap_t shufflexyz[3];
 
-	Psplat[0] = _mm_set_ps1(P.x);
-	Psplat[1] = _mm_set_ps1(P.y);
-	Psplat[2] = _mm_set_ps1(P.z);
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
 
-	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
 
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -132,27 +132,27 @@ ccl_device bool BVH_FUNCTION_NAME
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
 
 				/* fetch node data */
-				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
 				/* calculate { c0min, c1min, -c0max, -c1max} */
-				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
-				const __m128 tminmax = _mm_xor_ps(minmax, pn);
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+				const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
 
 				/* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
 				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
 #else
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
 #endif
 #endif // __KERNEL_SSE2__
 
@@ -164,9 +164,7 @@ ccl_device bool BVH_FUNCTION_NAME
 #if !defined(__KERNEL_SSE2__)
 					bool closestChild1 = (c1min < c0min);
 #else
-					union { __m128 m128; float v[4]; } uminmax;
-					uminmax.m128 = tminmax;
-					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+					bool closestChild1 = tminmax[1] < tminmax[0];
 #endif
 
 					if(closestChild1) {
@@ -301,12 +299,12 @@ ccl_device bool BVH_FUNCTION_NAME
 					num_hits_in_instance = 0;
 
 #if defined(__KERNEL_SSE2__)
-					Psplat[0] = _mm_set_ps1(P.x);
-					Psplat[1] = _mm_set_ps1(P.y);
-					Psplat[2] = _mm_set_ps1(P.z);
+					Psplat[0] = ssef(P.x);
+					Psplat[1] = ssef(P.y);
+					Psplat[2] = ssef(P.z);
 
 					isect_array->t = isect_t;
-					tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+					tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -348,13 +346,13 @@ ccl_device bool BVH_FUNCTION_NAME
 			}
 
 #if defined(__KERNEL_SSE2__)
-			Psplat[0] = _mm_set_ps1(P.x);
-			Psplat[1] = _mm_set_ps1(P.y);
-			Psplat[2] = _mm_set_ps1(P.z);
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
 
 			isect_t = tmax;
 			isect_array->t = isect_t;
-			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a19f05d..a8f57cf 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
 	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psplat[3], idirsplat[3];
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
 	shuffle_swap_t shufflexyz[3];
 
-	Psplat[0] = _mm_set_ps1(P.x);
-	Psplat[1] = _mm_set_ps1(P.y);
-	Psplat[2] = _mm_set_ps1(P.z);
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
 
-	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
 
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
 
 				/* fetch node data */
-				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
-				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
 
 				/* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
 				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
 #else
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
 #endif
 #endif // __KERNEL_SSE2__
 
@@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #if !defined(__KERNEL_SSE2__)
 					bool closestChild1 = (c1min < c0min);
 #else
-					union { __m128 m128; float v[4]; } uminmax;
-					uminmax.m128 = tminmax;
-					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+					bool closestChild1 = tminmax[1] < tminmax[0];
 #endif
 
 					if(closestChild1) {
@@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 #if defined(__KERNEL_SSE2__)
-						Psplat[0] = _mm_set_ps1(P.x);
-						Psplat[1] = _mm_set_ps1(P.y);
-						Psplat[2] = _mm_set_ps1(P.z);
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
 
-						tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 #if defined(__KERNEL_SSE2__)
-			Psplat[0] = _mm_set_ps1(P.x);
-			Psplat[1] = _mm_set_ps1(P.y);
-			Psplat[2] = _mm_set_ps1(P.z);
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
 
-			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 9fd40f9..e39228c 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -72,15 +72,15 @@ ccl_device bool BVH_FUNCTION_NAME
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
 	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psp

@@ Diff output truncated at 10240 characters. @@