[Bf-blender-cvs] [16eb202] cycles_hair_bvh: Cycles: SIMD optimization of unaligned BVH traversal
Sergey Sharybin
noreply at git.blender.org
Thu Apr 28 18:33:21 CEST 2016
Commit: 16eb2026b67d0c989365b77d7e592ed4f12a5e54
Author: Sergey Sharybin
Date: Thu Apr 28 18:29:51 2016 +0200
Branches: cycles_hair_bvh
https://developer.blender.org/rB16eb2026b67d0c989365b77d7e592ed4f12a5e54
Cycles: SIMD optimization of unaligned BVH traversal
Some movement towards SIMD-optimized traversal of unaligned nodes.
It is not fully optimal yet, but the branch is now about 4% faster on
a test scene with a hairy monkey with 40K hairs.
Should still be possible to optimize matrix multiplication and
maybe some parts could be re-formulated a bit to become more
friendly for vectorization.
But the likely next step would be to support QBVH for hair, which should
be easier to vectorize with a formulation similar to the current one.
===================================================================
M intern/cycles/bvh/bvh.cpp
M intern/cycles/bvh/bvh.h
M intern/cycles/kernel/geom/geom_bvh_hair.h
M intern/cycles/kernel/geom/geom_bvh_traversal_hair.h
===================================================================
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 2f8ceaa..0bf746c 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -492,6 +492,8 @@ void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
const BVHStackEntry& e1)
{
pack_unaligned_node(e.idx,
+ e0.node->is_unaligned(),
+ e1.node->is_unaligned(),
e0.node->m_aligned_space,
e1.node->m_aligned_space,
e0.node->m_bounds,
@@ -501,6 +503,8 @@ void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
}
void RegularBVH::pack_unaligned_node(int idx,
+ const bool is_unaligned0,
+ const bool is_unaligned1,
const Transform& aligned_space0,
const Transform& aligned_space1,
const BoundBox& bounds0,
@@ -508,17 +512,39 @@ void RegularBVH::pack_unaligned_node(int idx,
int c0, int c1,
uint visibility0, uint visibility1)
{
- Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
- aligned_space0);
- Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
- aligned_space1);
- float4 data[BVH_UNALIGNED_NODE_SIZE] =
- {
- space0.x, space0.y, space0.z, space0.w,
- space1.x, space1.y, space1.z, space1.w,
- make_float4(__int_as_float(c0), __int_as_float(c1),
- __int_as_float(visibility0), __int_as_float(visibility1))
- };
+ float4 data[BVH_UNALIGNED_NODE_SIZE];
+ if (is_unaligned0 || is_unaligned1) {
+ Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
+ aligned_space0);
+ Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
+ aligned_space1);
+ data[0] = space0.x;
+ data[1] = space0.y;
+ data[2] = space0.z;
+ data[3] = space0.w;
+ data[4] = space1.x;
+ data[5] = space1.y;
+ data[6] = space1.z;
+ data[7] = space1.w;
+ }
+ else {
+ data[0] = make_float4(bounds0.min.x, bounds1.min.x,
+ bounds0.max.x, bounds1.max.x);
+ data[1] = make_float4(bounds0.min.y, bounds1.min.y,
+ bounds0.max.y, bounds1.max.y);
+ data[2] = make_float4(bounds0.min.z, bounds1.min.z,
+ bounds0.max.z, bounds1.max.z);
+ data[3] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ data[4] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ data[5] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ data[6] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ data[7] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ data[8] = make_float4(__int_as_float(c0),
+ __int_as_float(c1),
+ __int_as_float(visibility0),
+ __int_as_float(visibility1));
memcpy(&pack.nodes[idx * BVH_UNALIGNED_NODE_SIZE],
data,
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 3e670bc..3099e2c 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -135,6 +135,8 @@ protected:
const BVHStackEntry& e0,
const BVHStackEntry& e1);
void pack_unaligned_node(int idx,
+ const bool is_unaligned0,
+ const bool is_unaligned1,
const Transform& aligned_space0,
const Transform& aligned_space1,
const BoundBox& b0,
diff --git a/intern/cycles/kernel/geom/geom_bvh_hair.h b/intern/cycles/kernel/geom/geom_bvh_hair.h
index 79cf16d..f7c6283 100644
--- a/intern/cycles/kernel/geom/geom_bvh_hair.h
+++ b/intern/cycles/kernel/geom/geom_bvh_hair.h
@@ -34,6 +34,7 @@ ccl_device_inline Transform bvh_hair_fetch_aligned_space(KernelGlobals *kg,
return aligned_space;
}
+#if !defined(__KERNEL_SSE2__)
ccl_device_inline bool bvh_hair_intersect_child(KernelGlobals *kg,
const float3 P,
const float3 dir,
@@ -48,21 +49,20 @@ ccl_device_inline bool bvh_hair_intersect_child(KernelGlobals *kg,
float3 aligned_dir = transform_direction(&aligned_space, dir);
float3 aligned_P = transform_point(&aligned_space, P);
float3 nrdir = -1.0f * bvh_inverse_direction(aligned_dir);
+ /* TODO(sergey): Do we need NO_EXTENDED_PRECISION here as well? */
float3 tLowerXYZ = make_float3(aligned_P.x * nrdir.x,
aligned_P.y * nrdir.y,
aligned_P.z * nrdir.z);
float3 tUpperXYZ = tLowerXYZ - nrdir;
- const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
- const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
- const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
- const float tFarX = max(tLowerXYZ.x, tUpperXYZ.x);
- const float tFarY = max(tLowerXYZ.y, tUpperXYZ.y);
- const float tFarZ = max(tLowerXYZ.z, tUpperXYZ.z);
- const float tNear = max4(0.0f, tNearX, tNearY, tNearZ);
- const float tFar = min4(t, tFarX, tFarY, tFarZ);
- if(dist != NULL) {
- *dist = tNear;
- }
+ NO_EXTENDED_PRECISION const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+ NO_EXTENDED_PRECISION const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+ NO_EXTENDED_PRECISION const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+ NO_EXTENDED_PRECISION const float tFarX = max(tLowerXYZ.x, tUpperXYZ.x);
+ NO_EXTENDED_PRECISION const float tFarY = max(tLowerXYZ.y, tUpperXYZ.y);
+ NO_EXTENDED_PRECISION const float tFarZ = max(tLowerXYZ.z, tUpperXYZ.z);
+ NO_EXTENDED_PRECISION const float tNear = max4(0.0f, tNearX, tNearY, tNearZ);
+ NO_EXTENDED_PRECISION const float tFar = min4(t, tFarX, tFarY, tFarZ);
+ *dist = tNear;
if(difl != 0.0f) {
/* TODO(sergey): Same as for QBVH, needs a proper use. */
(void)extmax;
@@ -86,6 +86,7 @@ int ccl_device bvh_hair_intersect_node(KernelGlobals *kg,
float dist[2])
{
int mask = 0;
+ /* TODO(sergey): Add visibility check. */
if(bvh_hair_intersect_child(kg, P, dir, t, difl, extmax, nodeAddr, 0, &dist[0])) {
mask |= 1;
}
@@ -94,3 +95,141 @@ int ccl_device bvh_hair_intersect_node(KernelGlobals *kg,
}
return mask;
}
+#else /* !defined(__KERNEL_SSE2__) */
+int ccl_device bvh_hair_intersect_node_unaligned(KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const ssef& tnear,
+ const ssef& tfar,
+ const float difl,
+ const float extmax,
+ const uint visibility,
+ int nodeAddr,
+ float dist[2])
+{
+ Transform aligned_space0 = bvh_hair_fetch_aligned_space(kg, nodeAddr, 0);
+ Transform aligned_space1 = bvh_hair_fetch_aligned_space(kg, nodeAddr, 1);
+
+ float3 aligned_dir0 = transform_direction(&aligned_space0, dir),
+ aligned_dir1 = transform_direction(&aligned_space1, dir);;
+ float3 aligned_P0 = transform_point(&aligned_space0, P),
+ aligned_P1 = transform_point(&aligned_space1, P);
+ float3 nrdir0 = -1.0f * bvh_inverse_direction(aligned_dir0),
+ nrdir1 = -1.0f * bvh_inverse_direction(aligned_dir1);
+
+ ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+ aligned_P1.x * nrdir1.x,
+ 0.0f, 0.0f),
+ tLowerY = ssef(aligned_P0.y * nrdir0.y,
+ aligned_P1.y * nrdir1.y,
+ 0.0f,
+ 0.0f),
+ tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+ aligned_P1.z * nrdir1.z,
+ 0.0f,
+ 0.0f);
+
+ ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+ tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+ tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+ ssef tnear_x = min(tLowerX, tUpperX);
+ ssef tnear_y = min(tLowerY, tUpperY);
+ ssef tnear_z = min(tLowerZ, tUpperZ);
+ ssef tfar_x = max(tLowerX, tUpperX);
+ ssef tfar_y = max(tLowerY, tUpperY);
+ ssef tfar_z = max(tLowerZ, tUpperZ);
+
+ const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+ const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+ const sseb vmask = tNear <= tFar;
+ int mask = (int)movemask(vmask);
+
+ dist[0] = tNear.f[0];
+ dist[1] = tNear.f[1];
+
+ return mask & 3;
+}
+
+int ccl_device_inline bvh_hair_intersect_node_aligned(KernelGlobals *kg,
+ const float3& P,
+ const float3& dir,
+ const ssef& tsplat,
+ const ssef Psplat[3],
+ const ssef idirsplat[3],
+ const shuffle_swap_t shufflexyz[3],
+ const float difl,
+ const float extmax,
+ const uint visibility,
+ int nodeAddr,
+ float dist[2])
+{
+ /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+ /* fetch node data */
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_curve_nodes.data + nodeAddr*BVH_UNALIGNED_NODE_SIZE;
+
+ /* intersect ray against child nodes */
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) *
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list