[Bf-blender-cvs] [16eb202] cycles_hair_bvh: Cycles: SIMD optimization of unaligned BVH traversal

Sergey Sharybin noreply at git.blender.org
Thu Apr 28 18:33:21 CEST 2016


Commit: 16eb2026b67d0c989365b77d7e592ed4f12a5e54
Author: Sergey Sharybin
Date:   Thu Apr 28 18:29:51 2016 +0200
Branches: cycles_hair_bvh
https://developer.blender.org/rB16eb2026b67d0c989365b77d7e592ed4f12a5e54

Cycles: SIMD optimization of unaligned BVH traversal

Some movement towards SIMD-optimized traversal of unaligned nodes,
not fully optimal yet, but now the branch is about 4% faster on
a test scene with a hairy monkey with 40K hairs.

Should still be possible to optimize matrix multiplication and
maybe some parts could be re-formulated a bit to become more
friendly for vectorization.

But the likely next step would be to support QBVH for hair, which should
be easier to vectorize with a formulation similar to the current one.

===================================================================

M	intern/cycles/bvh/bvh.cpp
M	intern/cycles/bvh/bvh.h
M	intern/cycles/kernel/geom/geom_bvh_hair.h
M	intern/cycles/kernel/geom/geom_bvh_traversal_hair.h

===================================================================

diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 2f8ceaa..0bf746c 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -492,6 +492,8 @@ void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
                                       const BVHStackEntry& e1)
 {
 	pack_unaligned_node(e.idx,
+	                    e0.node->is_unaligned(),
+	                    e1.node->is_unaligned(),
 	                    e0.node->m_aligned_space,
 	                    e1.node->m_aligned_space,
 	                    e0.node->m_bounds,
@@ -501,6 +503,8 @@ void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
 }
 
 void RegularBVH::pack_unaligned_node(int idx,
+                                     const bool is_unaligned0,
+                                     const bool is_unaligned1,
                                      const Transform& aligned_space0,
                                      const Transform& aligned_space1,
                                      const BoundBox& bounds0,
@@ -508,17 +512,39 @@ void RegularBVH::pack_unaligned_node(int idx,
                                      int c0, int c1,
                                      uint visibility0, uint visibility1)
 {
-	Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
-	                                                        aligned_space0);
-	Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
-	                                                        aligned_space1);
-	float4 data[BVH_UNALIGNED_NODE_SIZE] =
-	{
-		space0.x, space0.y, space0.z, space0.w,
-		space1.x, space1.y, space1.z, space1.w,
-		make_float4(__int_as_float(c0), __int_as_float(c1),
-		            __int_as_float(visibility0), __int_as_float(visibility1))
-	};
+	float4 data[BVH_UNALIGNED_NODE_SIZE];
+	if (is_unaligned0 || is_unaligned1) {
+		Transform space0 = BVHUnaligned::compute_node_transform(bounds0,
+		                                                        aligned_space0);
+		Transform space1 = BVHUnaligned::compute_node_transform(bounds1,
+		                                                        aligned_space1);
+		data[0] = space0.x;
+		data[1] = space0.y;
+		data[2] = space0.z;
+		data[3] = space0.w;
+		data[4] = space1.x;
+		data[5] = space1.y;
+		data[6] = space1.z;
+		data[7] = space1.w;
+	}
+	else {
+		data[0] = make_float4(bounds0.min.x, bounds1.min.x,
+		                      bounds0.max.x, bounds1.max.x);
+		data[1] = make_float4(bounds0.min.y, bounds1.min.y,
+		                      bounds0.max.y, bounds1.max.y);
+		data[2] = make_float4(bounds0.min.z, bounds1.min.z,
+		                      bounds0.max.z, bounds1.max.z);
+		data[3] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		data[4] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		data[5] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		data[6] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		data[7] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	}
+
+	data[8] = make_float4(__int_as_float(c0),
+	                      __int_as_float(c1),
+	                      __int_as_float(visibility0),
+	                      __int_as_float(visibility1));
 
 	memcpy(&pack.nodes[idx * BVH_UNALIGNED_NODE_SIZE],
 	       data,
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 3e670bc..3099e2c 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -135,6 +135,8 @@ protected:
 	                          const BVHStackEntry& e0,
 	                          const BVHStackEntry& e1);
 	void pack_unaligned_node(int idx,
+	                         const bool is_unaligned0,
+	                         const bool is_unaligned1,
 	                         const Transform& aligned_space0,
 	                         const Transform& aligned_space1,
 	                         const BoundBox& b0,
diff --git a/intern/cycles/kernel/geom/geom_bvh_hair.h b/intern/cycles/kernel/geom/geom_bvh_hair.h
index 79cf16d..f7c6283 100644
--- a/intern/cycles/kernel/geom/geom_bvh_hair.h
+++ b/intern/cycles/kernel/geom/geom_bvh_hair.h
@@ -34,6 +34,7 @@ ccl_device_inline Transform bvh_hair_fetch_aligned_space(KernelGlobals *kg,
 	return aligned_space;
 }
 
+#if !defined(__KERNEL_SSE2__)
 ccl_device_inline bool bvh_hair_intersect_child(KernelGlobals *kg,
                                                 const float3 P,
                                                 const float3 dir,
@@ -48,21 +49,20 @@ ccl_device_inline bool bvh_hair_intersect_child(KernelGlobals *kg,
 	float3 aligned_dir = transform_direction(&aligned_space, dir);
 	float3 aligned_P = transform_point(&aligned_space, P);
 	float3 nrdir = -1.0f * bvh_inverse_direction(aligned_dir);
+	/* TODO(sergey): Do we need NO_EXTENDED_PRECISION here as well? */
 	float3 tLowerXYZ = make_float3(aligned_P.x * nrdir.x,
 	                               aligned_P.y * nrdir.y,
 	                               aligned_P.z * nrdir.z);
 	float3 tUpperXYZ = tLowerXYZ - nrdir;
-	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
-	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
-	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
-	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
-	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
-	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
-	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
-	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
-	if(dist != NULL) {
-		*dist = tNear;
-	}
+	NO_EXTENDED_PRECISION const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+	NO_EXTENDED_PRECISION const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+	NO_EXTENDED_PRECISION const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+	NO_EXTENDED_PRECISION const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
+	NO_EXTENDED_PRECISION const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
+	NO_EXTENDED_PRECISION const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
+	NO_EXTENDED_PRECISION const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
+	NO_EXTENDED_PRECISION const float tFar   = min4(t, tFarX, tFarY, tFarZ);
+	*dist = tNear;
 	if(difl != 0.0f) {
 		/* TODO(sergey): Same as for QBVH, needs a proper use. */
 		(void)extmax;
@@ -86,6 +86,7 @@ int ccl_device bvh_hair_intersect_node(KernelGlobals *kg,
                                        float dist[2])
 {
 	int mask = 0;
+	/* TODO(sergey): Add visibility check. */
 	if(bvh_hair_intersect_child(kg, P, dir, t, difl, extmax, nodeAddr, 0, &dist[0])) {
 		mask |= 1;
 	}
@@ -94,3 +95,141 @@ int ccl_device bvh_hair_intersect_node(KernelGlobals *kg,
 	}
 	return mask;
 }
+#else  /* !defined(__KERNEL_SSE2__) */
+int ccl_device bvh_hair_intersect_node_unaligned(KernelGlobals *kg,
+                                                 const float3 P,
+                                                 const float3 dir,
+                                                 const ssef& tnear,
+                                                 const ssef& tfar,
+                                                 const float difl,
+                                                 const float extmax,
+                                                 const uint visibility,
+                                                 int nodeAddr,
+                                                 float dist[2])
+{
+	Transform aligned_space0 = bvh_hair_fetch_aligned_space(kg, nodeAddr, 0);
+	Transform aligned_space1 = bvh_hair_fetch_aligned_space(kg, nodeAddr, 1);
+
+	float3 aligned_dir0 = transform_direction(&aligned_space0, dir),
+	       aligned_dir1 = transform_direction(&aligned_space1, dir);;
+	float3 aligned_P0 = transform_point(&aligned_space0, P),
+	       aligned_P1 = transform_point(&aligned_space1, P);
+	float3 nrdir0 = -1.0f * bvh_inverse_direction(aligned_dir0),
+	       nrdir1 = -1.0f * bvh_inverse_direction(aligned_dir1);
+
+	ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+	                    aligned_P1.x * nrdir1.x,
+	                    0.0f, 0.0f),
+	     tLowerY = ssef(aligned_P0.y * nrdir0.y,
+	                    aligned_P1.y * nrdir1.y,
+	                    0.0f,
+	                    0.0f),
+	     tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+	                    aligned_P1.z * nrdir1.z,
+	                    0.0f,
+	                    0.0f);
+
+	ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+	     tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+	     tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+	ssef tnear_x = min(tLowerX, tUpperX);
+	ssef tnear_y = min(tLowerY, tUpperY);
+	ssef tnear_z = min(tLowerZ, tUpperZ);
+	ssef tfar_x = max(tLowerX, tUpperX);
+	ssef tfar_y = max(tLowerY, tUpperY);
+	ssef tfar_z = max(tLowerZ, tUpperZ);
+
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = tNear <= tFar;
+	int mask = (int)movemask(vmask);
+
+	dist[0] = tNear.f[0];
+	dist[1] = tNear.f[1];
+
+	return mask & 3;
+}
+
+int ccl_device_inline bvh_hair_intersect_node_aligned(KernelGlobals *kg,
+                                               const float3& P,
+                                               const float3& dir,
+                                               const ssef& tsplat,
+                                               const ssef Psplat[3],
+                                               const ssef idirsplat[3],
+                                               const shuffle_swap_t shufflexyz[3],
+                                               const float difl,
+                                               const float extmax,
+                                               const uint visibility,
+                                               int nodeAddr,
+                                               float dist[2])
+{
+	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+	/* fetch node data */
+	const ssef *bvh_nodes = (ssef*)kg->__bvh_curve_nodes.data + nodeAddr*BVH_UNALIGNED_NODE_SIZE;
+
+	/* intersect ray against child nodes */
+	const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+	const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * 

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list