[Bf-blender-cvs] [cd5e1ff] master: Cycles Refactor: Add SSE Utility code from Embree for cleaner SSE code.
Thomas Dinges
noreply at git.blender.org
Fri Jun 13 21:59:46 CEST 2014
Commit: cd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed
Author: Thomas Dinges
Date: Fri Jun 13 21:13:18 2014 +0200
https://developer.blender.org/rBcd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed
Cycles Refactor: Add SSE Utility code from Embree for cleaner SSE code.
This makes the code a bit easier to understand, and might come in handy
if we want to reuse more Embree code.
Differential Revision: https://developer.blender.org/D482
Code by Brecht, with fixes by Lockal, Sergey and myself.
===================================================================
M intern/cycles/kernel/geom/geom_bvh_shadow.h
M intern/cycles/kernel/geom/geom_bvh_subsurface.h
M intern/cycles/kernel/geom/geom_bvh_traversal.h
M intern/cycles/kernel/geom/geom_curve.h
M intern/cycles/kernel/kernel_avx.cpp
M intern/cycles/kernel/kernel_compat_cpu.h
M intern/cycles/kernel/svm/svm_image.h
M intern/cycles/kernel/svm/svm_noise.h
M intern/cycles/kernel/svm/svm_texture.h
M intern/cycles/render/curves.cpp
M intern/cycles/render/tile.cpp
M intern/cycles/util/CMakeLists.txt
M intern/cycles/util/util_color.h
M intern/cycles/util/util_half.h
M intern/cycles/util/util_optimization.h
A intern/cycles/util/util_simd.cpp
M intern/cycles/util/util_simd.h
A intern/cycles/util/util_sseb.h
A intern/cycles/util/util_ssef.h
A intern/cycles/util/util_ssei.h
M intern/cycles/util/util_types.h
===================================================================
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 98bf82b..48876da 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -68,15 +68,15 @@ ccl_device bool BVH_FUNCTION_NAME
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
- const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
- __m128 Psplat[3], idirsplat[3];
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
shuffle_swap_t shufflexyz[3];
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -132,27 +132,27 @@ ccl_device bool BVH_FUNCTION_NAME
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
- const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
- const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
- const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
- const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
- __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
- const __m128 tminmax = _mm_xor_ps(minmax, pn);
- const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+ const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+ traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+ traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
#else
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+ traverseChild0 = (movemask(lrhit) & 1);
+ traverseChild1 = (movemask(lrhit) & 2);
#endif
#endif // __KERNEL_SSE2__
@@ -164,9 +164,7 @@ ccl_device bool BVH_FUNCTION_NAME
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
- union { __m128 m128; float v[4]; } uminmax;
- uminmax.m128 = tminmax;
- bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+ bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(closestChild1) {
@@ -301,12 +299,12 @@ ccl_device bool BVH_FUNCTION_NAME
num_hits_in_instance = 0;
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
isect_array->t = isect_t;
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -348,13 +346,13 @@ ccl_device bool BVH_FUNCTION_NAME
}
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
isect_t = tmax;
isect_array->t = isect_t;
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a19f05d..a8f57cf 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
- const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
- __m128 Psplat[3], idirsplat[3];
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
shuffle_swap_t shufflexyz[3];
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
- const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
- const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
- const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
- const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
- const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
- const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+ /* calculate { c0min, c1min, -c0max, -c1max} */
+ const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+ traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+ traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
#else
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+ traverseChild0 = (movemask(lrhit) & 1);
+ traverseChild1 = (movemask(lrhit) & 2);
#endif
#endif // __KERNEL_SSE2__
@@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
- union { __m128 m128; float v[4]; } uminmax;
- uminmax.m128 = tminmax;
- bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+ bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(closestChild1) {
@@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
#endif
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
#endif
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 9fd40f9..e39228c 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -72,15 +72,15 @@ ccl_device bool BVH_FUNCTION_NAME
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
- const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
- __m128 Psp
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list