[Bf-blender-cvs] [064caae] master: Cycles: BVH-related SSE optimization

Sergey Sharybin noreply at git.blender.org
Tue Oct 25 15:34:30 CEST 2016


Commit: 064caae7b2943aa35953642fd4b15d0e9ec05a87
Author: Sergey Sharybin
Date:   Tue Oct 25 14:47:34 2016 +0200
Branches: master
https://developer.blender.org/rB064caae7b2943aa35953642fd4b15d0e9ec05a87

Cycles: BVH-related SSE optimization

Several ideas here:

- Optimize calculation of near_{x,y,z} in a way that does not require
  3 if() statements per update, which avoids the negative effect of
  branch misprediction.

- Optimization of direction clamping for BVH.

- Optimization of point/direction transform.

Brings ~1.5% speedup again, depending on the scene (unfortunately, the
speedups can't simply be summed across all previous commits because the
speedup of each change varies from scene to scene, but it still seems to
be a nice solid speedup of a few percent on Linux, and a bigger speedup
was reported on Windows).

Once again, thanks Maxym for the inspiration!

Still TODO: We have multiple places where we need to calculate near
x,y,z indices in BVH; for now it's only done for the main BVH traversal.
Will try to move this calculation to a utility function and see if
that can be easily re-used across all the BVH flavors.

===================================================================

M	intern/cycles/kernel/bvh/qbvh_traversal.h
M	intern/cycles/kernel/geom/geom_object.h
M	intern/cycles/util/util_transform.h

===================================================================

diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index a1e154d..b9da539 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -100,12 +100,27 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif
 
 	/* Offsets to select the side that becomes the lower or upper bound. */
+#ifdef __KERNEL_SSE__
+	int near_x = 0, near_y = 2, near_z = 4;
+	int far_x = 1, far_y = 3, far_z = 5;
+
+	const size_t mask = movemask(ssef(idir.m128));
+
+	const int mask_x = mask & 1;
+	const int mask_y = (mask & 2) >> 1;
+	const int mask_z = (mask & 4) >> 2;
+
+	near_x += mask_x; far_x -= mask_x;
+	near_y += mask_y; far_y -= mask_y;
+	near_z += mask_z; far_z -= mask_z;
+#else
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
 
 	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
 	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+#endif
 
 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -427,9 +442,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
 #  endif
 
+#ifdef __KERNEL_SSE__
+					near_x = 0; near_y = 2; near_z = 4;
+					far_x = 1; far_y = 3; far_z = 5;
+
+					const size_t mask = movemask(ssef(idir.m128));
+
+					const int mask_x = mask & 1;
+					const int mask_y = (mask & 2) >> 1;
+					const int mask_z = (mask & 4) >> 2;
+
+					near_x += mask_x; far_x -= mask_x;
+					near_y += mask_y; far_y -= mask_y;
+					near_z += mask_z; far_z -= mask_z;
+#else
 					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
 					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+#endif
 					tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -469,9 +499,25 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif
 
+#ifdef __KERNEL_SSE__
+			near_x = 0; near_y = 2; near_z = 4;
+			far_x = 1; far_y = 3; far_z = 5;
+
+			const size_t mask = movemask(ssef(idir.m128));
+
+			const int mask_x = mask & 1;
+			const int mask_y = (mask & 2) >> 1;
+			const int mask_z = (mask & 4) >> 2;
+
+			near_x += mask_x; far_x -= mask_x;
+			near_y += mask_y; far_y -= mask_y;
+			near_z += mask_z; far_z -= mask_z;
+#else
 			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
 			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+#endif
+
 			tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index c2ec774..0e09325 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -376,15 +376,32 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
 ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 {
 	/* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse direction */
-	float ooeps = 8.271806E-25f;
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
+	const ssef oopes(8.271806E-25f,8.271806E-25f,8.271806E-25f,0.0f);
+	const ssef mask = _mm_cmpgt_ps(fabs(dir),oopes);
+	const ssef signdir = signmsk(dir.m128) | oopes;
+#  ifndef __KERNEL_AVX__
+	ssef res = mask & dir;
+	res = _mm_or_ps(res,_mm_andnot_ps(mask, signdir));
+#  else
+	ssef res = _mm_blendv_ps(signdir,dir,mask);
+#  endif
+	return float3(res);
+#else  /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
+	const float ooeps = 8.271806E-25f;
 	return make_float3((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x),
 	                   (fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y),
 	                   (fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
+#endif  /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
 }
 
 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
+#ifdef __KERNEL_SSE__
+	return rcp(dir);
+#else
 	return 1.0f / dir;
+#endif
 }
 
 /* Transform ray into object space to enter static object in BVH */
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index bfc8f55..771a944 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -73,22 +73,57 @@ ccl_device_inline float3 transform_perspective(const Transform *t, const float3
 
 ccl_device_inline float3 transform_point(const Transform *t, const float3 a)
 {
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
+	ssef x, y, z, w, aa;
+	aa = a.m128;
+
+	x = _mm_loadu_ps(&t->x.x);
+	y = _mm_loadu_ps(&t->y.x);
+	z = _mm_loadu_ps(&t->z.x);
+	w = _mm_loadu_ps(&t->w.x);
+
+	_MM_TRANSPOSE4_PS(x, y, z, w);
+
+	ssef tmp = madd(x, shuffle<0>(aa), w);
+	tmp = madd(y, shuffle<1>(aa), tmp);
+	tmp = madd(z, shuffle<2>(aa), tmp);
+
+	return float3(tmp.m128);
+#else
 	float3 c = make_float3(
 		a.x*t->x.x + a.y*t->x.y + a.z*t->x.z + t->x.w,
 		a.x*t->y.x + a.y*t->y.y + a.z*t->y.z + t->y.w,
 		a.x*t->z.x + a.y*t->z.y + a.z*t->z.z + t->z.w);
 
 	return c;
+#endif
 }
 
 ccl_device_inline float3 transform_direction(const Transform *t, const float3 a)
 {
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
+	ssef x, y, z, w, aa;
+	aa = a.m128;
+	x = _mm_loadu_ps(&t->x.x);
+	y = _mm_loadu_ps(&t->y.x);
+	z = _mm_loadu_ps(&t->z.x);
+	w = _mm_setzero_ps();
+
+	_MM_TRANSPOSE4_PS(x, y, z, w);
+
+	ssef tmp = x * shuffle<0>(aa);
+	tmp = madd(y, shuffle<1>(aa), tmp);
+	tmp = madd(z, shuffle<2>(aa), tmp);
+
+	return float3(tmp.m128);
+#else
 	float3 c = make_float3(
 		a.x*t->x.x + a.y*t->x.y + a.z*t->x.z,
 		a.x*t->y.x + a.y*t->y.y + a.z*t->y.z,
 		a.x*t->z.x + a.y*t->z.y + a.z*t->z.z);
 
 	return c;
+#endif
 }
 
 ccl_device_inline float3 transform_direction_transposed(const Transform *t, const float3 a)




More information about the Bf-blender-cvs mailing list