[Bf-blender-cvs] SVN commit: /data/svn/bf-blender [57543] trunk/blender/intern/cycles: Cycles: optimization for BVH traveral on CPU's with SSE3, using code from Embree.

Brecht Van Lommel brechtvanlommel at pandora.be
Tue Jun 18 11:36:06 CEST 2013


Revision: 57543
          http://projects.blender.org/scm/viewvc.php?view=rev&root=bf-blender&revision=57543
Author:   blendix
Date:     2013-06-18 09:36:06 +0000 (Tue, 18 Jun 2013)
Log Message:
-----------
Cycles: optimization for BVH traveral on CPU's with SSE3, using code from Embree.

On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and 30%
speedup with visual studio (2008). It turns out visual studio was optimizing the
existing code quite poorly compared to pretty good autovectorization by clang/gcc,
but hand written SSE code also gives a smaller speed boost there.

This code isn't enabled when using the hair minimum width feature yet, need to
make that work with the SSE code still.

Modified Paths:
--------------
    trunk/blender/intern/cycles/bvh/bvh.cpp
    trunk/blender/intern/cycles/kernel/kernel_bvh.h
    trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h
    trunk/blender/intern/cycles/kernel/kernel_sse3.cpp
    trunk/blender/intern/cycles/util/util_types.h

Modified: trunk/blender/intern/cycles/bvh/bvh.cpp
===================================================================
--- trunk/blender/intern/cycles/bvh/bvh.cpp	2013-06-18 09:36:00 UTC (rev 57542)
+++ trunk/blender/intern/cycles/bvh/bvh.cpp	2013-06-18 09:36:06 UTC (rev 57543)
@@ -552,9 +552,9 @@
 {
 	int4 data[BVH_NODE_SIZE] =
 	{
-		make_int4(__float_as_int(b0.min.x), __float_as_int(b0.max.x), __float_as_int(b0.min.y), __float_as_int(b0.max.y)),
-		make_int4(__float_as_int(b1.min.x), __float_as_int(b1.max.x), __float_as_int(b1.min.y), __float_as_int(b1.max.y)),
-		make_int4(__float_as_int(b0.min.z), __float_as_int(b0.max.z), __float_as_int(b1.min.z), __float_as_int(b1.max.z)),
+		make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)),
+		make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)),
+		make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)),
 		make_int4(c0, c1, visibility0, visibility1)
 	};
 

Modified: trunk/blender/intern/cycles/kernel/kernel_bvh.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_bvh.h	2013-06-18 09:36:00 UTC (rev 57542)
+++ trunk/blender/intern/cycles/kernel/kernel_bvh.h	2013-06-18 09:36:06 UTC (rev 57543)
@@ -112,80 +112,8 @@
 }
 #endif
 
-/* intersect two bounding boxes */
-#ifdef __HAIR__
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
-	bool *traverseChild0, bool *traverseChild1,
-	bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
-	float3 P, float3 idir, float t, uint visibility, int nodeAddr, float difl, float extmax)
-{
-#else
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
-	bool *traverseChild0, bool *traverseChild1,
-	bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
-	float3 P, float3 idir, float t, uint visibility, int nodeAddr)
-{
-#endif
-
-	/* fetch node data */
-	float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
-	float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
-	float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
-
-	/* intersect ray against child nodes */
-	float3 ood = P * idir;
-	NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-	NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z;
-	NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x;
-	NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y;
-	NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-#ifdef __HAIR__
-	if(difl != 0.0f) {
-		float hdiff = 1.0f + difl;
-		float ldiff = 1.0f - difl;
-		if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-			c0min = max(ldiff * c0min, c0min - extmax);
-			c0max = min(hdiff * c0max, c0max + extmax);
-		}
-		if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-			c1min = max(ldiff * c1min, c1min - extmax);
-			c1max = min(hdiff * c1max, c1max + extmax);
-		}
-	}
-#endif
-
-	/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	*traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
-	*traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
-	*traverseChild0 = (c0max >= c0min);
-	*traverseChild1 = (c1max >= c1min);
-#endif
-
-	*nodeAddr0 = __float_as_int(cnodes.x);
-	*nodeAddr1 = __float_as_int(cnodes.y);
-
-	*closestChild1 = (c1min < c0min);
-}
-
 /* Sven Woop's algorithm */
-__device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int triAddr)
 {
 	/* compute and check intersection t-value */
@@ -223,10 +151,13 @@
 					isect->u = u;
 					isect->v = v;
 					isect->t = t;
+					return true;
 				}
 			}
 		}
 	}
+
+	return false;
 }
 
 #ifdef __HAIR__
@@ -280,7 +211,7 @@
 	}
 }
 
-__device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
 {
 	float epsilon = 0.0f;
@@ -346,7 +277,7 @@
 	float zextrem[4];
 	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
 	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return;
+		return false;
 
 	/*minimum width extension*/
 	float mw_extension = min(difl * fabsf(upper), extmax);
@@ -355,17 +286,18 @@
 	float xextrem[4];
 	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
 	if(lower > r_ext || upper < -r_ext)
-		return;
+		return false;
 
 	float yextrem[4];
 	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
 	if(lower > r_ext || upper < -r_ext)
-		return;
+		return false;
 
 	/*setup recurrent loop*/
 	int level = 1 << depth;
 	int tree = 0;
 	float resol = 1.0f / (float)level;
+	bool hit = false;
 
 	/*begin loop*/
 	while(!(tree >> (depth))) {
@@ -557,7 +489,7 @@
 			/*stochastic fade from minimum width*/
 			if(lcg_state && coverage != 1.0f) {
 				if(lcg_step(lcg_state) > coverage)
-					return;
+					return hit;
 			}
 
 #ifdef __VISIBILITY_FLAG__
@@ -574,6 +506,7 @@
 				isect->v = 0.0f;
 				/*isect->v = 1.0f - coverage; */
 				isect->t = t;
+				hit = true;
 			}
 			
 			tree++;
@@ -584,9 +517,11 @@
 			level = level >> 1;
 		}
 	}
+
+	return hit;
 }
 
-__device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
 {
 	/* curve Intersection check */
@@ -630,7 +565,7 @@
 	sphere_b = dot(dir,sphere_dif);
 	float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r;
 	if(sdisc < 0.0f)
-		return;
+		return false;
 
 	/* obtain parameters and test midpoint distance for suitable modes*/
 	float3 tg = (p2 - p1) / l;
@@ -645,9 +580,9 @@
 	float zcentre = difz + (dirz * tcentre);
 
 	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return;
+		return false;
 	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return;
+		return false;
 
 	/* test minimum separation*/
 	float3 cprod = cross(tg, dir);
@@ -662,7 +597,7 @@
 		distscaled = (distscaled*distscaled)/cprodsq;
 
 	if(distscaled > mr*mr)
-		return;
+		return false;
 
 	/* calculate true intersection*/
 	float3 tdif = P - p1 + tcentre * dir;
@@ -672,7 +607,7 @@
 	float td = tb*tb - 4*a*tc;
 
 	if (td < 0.0f)
-		return;
+		return false;
 
 	float rootd = 0.0f;
 	float correction = 0.0f;
@@ -706,7 +641,7 @@
 		adjradius = adjradius / (r1 + z * gd);
 		if(lcg_state && adjradius != 1.0f) {
 			if(lcg_step(lcg_state) > adjradius)
-				return;
+				return false;
 		}
 		/* --- */
 
@@ -719,7 +654,7 @@
 					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
 					float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
 					if(a2*c2 < 0.0f)
-						return;
+						return false;
 				}
 			}
 
@@ -740,9 +675,13 @@
 
 				if(backface) 
 					isect->u = -isect->u;
+				
+				return true;
 			}
 		}
 	}
+
+	return false;
 }
 #endif
 
@@ -751,7 +690,7 @@
  * only want to intersect with primitives in the same object, and if case of
  * multiple hits we pick a single random primitive as the intersection point. */
 
-__device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, int object, int triAddr, float tmax, int *num_hits, float subsurface_random)
 {
 	/* compute and check intersection t-value */
@@ -786,10 +725,13 @@
 					isect->u = u;
 					isect->v = v;
 					isect->t = t;
+					return true;
 				}
 			}
 		}
 	}
+
+	return false;
 }
 #endif
 

Modified: trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h	2013-06-18 09:36:00 UTC (rev 57542)
+++ trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h	2013-06-18 09:36:06 UTC (rev 57543)
@@ -1,7 +1,9 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
  *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
@@ -41,6 +43,14 @@
 #endif
 )
 {
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - SSE for hair
+	 * - test restrict attribute for pointers
+	 */
+	

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list