[Bf-blender-cvs] SVN commit: /data/svn/bf-blender [57543] trunk/blender/intern/cycles: Cycles: optimization for BVH traveral on CPU's with SSE3, using code from Embree.
Brecht Van Lommel
brechtvanlommel at pandora.be
Tue Jun 18 11:36:06 CEST 2013
Revision: 57543
http://projects.blender.org/scm/viewvc.php?view=rev&root=bf-blender&revision=57543
Author: blendix
Date: 2013-06-18 09:36:06 +0000 (Tue, 18 Jun 2013)
Log Message:
-----------
Cycles: optimization for BVH traveral on CPU's with SSE3, using code from Embree.
On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and 30%
speedup with visual studio (2008). It turns out visual studio was optimizing the
existing code quite poorly compared to pretty good autovectorization by clang/gcc,
but hand written SSE code also gives a smaller speed boost there.
This code isn't enabled when using the hair minimum width feature yet, need to
make that work with the SSE code still.
Modified Paths:
--------------
trunk/blender/intern/cycles/bvh/bvh.cpp
trunk/blender/intern/cycles/kernel/kernel_bvh.h
trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h
trunk/blender/intern/cycles/kernel/kernel_sse3.cpp
trunk/blender/intern/cycles/util/util_types.h
Modified: trunk/blender/intern/cycles/bvh/bvh.cpp
===================================================================
--- trunk/blender/intern/cycles/bvh/bvh.cpp 2013-06-18 09:36:00 UTC (rev 57542)
+++ trunk/blender/intern/cycles/bvh/bvh.cpp 2013-06-18 09:36:06 UTC (rev 57543)
@@ -552,9 +552,9 @@
{
int4 data[BVH_NODE_SIZE] =
{
- make_int4(__float_as_int(b0.min.x), __float_as_int(b0.max.x), __float_as_int(b0.min.y), __float_as_int(b0.max.y)),
- make_int4(__float_as_int(b1.min.x), __float_as_int(b1.max.x), __float_as_int(b1.min.y), __float_as_int(b1.max.y)),
- make_int4(__float_as_int(b0.min.z), __float_as_int(b0.max.z), __float_as_int(b1.min.z), __float_as_int(b1.max.z)),
+ make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)),
+ make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)),
+ make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)),
make_int4(c0, c1, visibility0, visibility1)
};
Modified: trunk/blender/intern/cycles/kernel/kernel_bvh.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_bvh.h 2013-06-18 09:36:00 UTC (rev 57542)
+++ trunk/blender/intern/cycles/kernel/kernel_bvh.h 2013-06-18 09:36:06 UTC (rev 57543)
@@ -112,80 +112,8 @@
}
#endif
-/* intersect two bounding boxes */
-#ifdef __HAIR__
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
- bool *traverseChild0, bool *traverseChild1,
- bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
- float3 P, float3 idir, float t, uint visibility, int nodeAddr, float difl, float extmax)
-{
-#else
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
- bool *traverseChild0, bool *traverseChild1,
- bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
- float3 P, float3 idir, float t, uint visibility, int nodeAddr)
-{
-#endif
-
- /* fetch node data */
- float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
- float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
- float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
-
- /* intersect ray against child nodes */
- float3 ood = P * idir;
- NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
- NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-#ifdef __HAIR__
- if(difl != 0.0f) {
- float hdiff = 1.0f + difl;
- float ldiff = 1.0f - difl;
- if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
- c0min = max(ldiff * c0min, c0min - extmax);
- c0max = min(hdiff * c0max, c0max + extmax);
- }
- if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
- c1min = max(ldiff * c1min, c1min - extmax);
- c1max = min(hdiff * c1max, c1max + extmax);
- }
- }
-#endif
-
- /* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- *traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
- *traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
- *traverseChild0 = (c0max >= c0min);
- *traverseChild1 = (c1max >= c1min);
-#endif
-
- *nodeAddr0 = __float_as_int(cnodes.x);
- *nodeAddr1 = __float_as_int(cnodes.y);
-
- *closestChild1 = (c1min < c0min);
-}
-
/* Sven Woop's algorithm */
-__device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int triAddr)
{
/* compute and check intersection t-value */
@@ -223,10 +151,13 @@
isect->u = u;
isect->v = v;
isect->t = t;
+ return true;
}
}
}
}
+
+ return false;
}
#ifdef __HAIR__
@@ -280,7 +211,7 @@
}
}
-__device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
{
float epsilon = 0.0f;
@@ -346,7 +277,7 @@
float zextrem[4];
curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
if(lower - r_curr > isect->t || upper + r_curr < epsilon)
- return;
+ return false;
/*minimum width extension*/
float mw_extension = min(difl * fabsf(upper), extmax);
@@ -355,17 +286,18 @@
float xextrem[4];
curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
if(lower > r_ext || upper < -r_ext)
- return;
+ return false;
float yextrem[4];
curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
if(lower > r_ext || upper < -r_ext)
- return;
+ return false;
/*setup recurrent loop*/
int level = 1 << depth;
int tree = 0;
float resol = 1.0f / (float)level;
+ bool hit = false;
/*begin loop*/
while(!(tree >> (depth))) {
@@ -557,7 +489,7 @@
/*stochastic fade from minimum width*/
if(lcg_state && coverage != 1.0f) {
if(lcg_step(lcg_state) > coverage)
- return;
+ return hit;
}
#ifdef __VISIBILITY_FLAG__
@@ -574,6 +506,7 @@
isect->v = 0.0f;
/*isect->v = 1.0f - coverage; */
isect->t = t;
+ hit = true;
}
tree++;
@@ -584,9 +517,11 @@
level = level >> 1;
}
}
+
+ return hit;
}
-__device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
{
/* curve Intersection check */
@@ -630,7 +565,7 @@
sphere_b = dot(dir,sphere_dif);
float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r;
if(sdisc < 0.0f)
- return;
+ return false;
/* obtain parameters and test midpoint distance for suitable modes*/
float3 tg = (p2 - p1) / l;
@@ -645,9 +580,9 @@
float zcentre = difz + (dirz * tcentre);
if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
- return;
+ return false;
if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
- return;
+ return false;
/* test minimum separation*/
float3 cprod = cross(tg, dir);
@@ -662,7 +597,7 @@
distscaled = (distscaled*distscaled)/cprodsq;
if(distscaled > mr*mr)
- return;
+ return false;
/* calculate true intersection*/
float3 tdif = P - p1 + tcentre * dir;
@@ -672,7 +607,7 @@
float td = tb*tb - 4*a*tc;
if (td < 0.0f)
- return;
+ return false;
float rootd = 0.0f;
float correction = 0.0f;
@@ -706,7 +641,7 @@
adjradius = adjradius / (r1 + z * gd);
if(lcg_state && adjradius != 1.0f) {
if(lcg_step(lcg_state) > adjradius)
- return;
+ return false;
}
/* --- */
@@ -719,7 +654,7 @@
float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
if(a2*c2 < 0.0f)
- return;
+ return false;
}
}
@@ -740,9 +675,13 @@
if(backface)
isect->u = -isect->u;
+
+ return true;
}
}
}
+
+ return false;
}
#endif
@@ -751,7 +690,7 @@
* only want to intersect with primitives in the same object, and if case of
* multiple hits we pick a single random primitive as the intersection point. */
-__device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, int object, int triAddr, float tmax, int *num_hits, float subsurface_random)
{
/* compute and check intersection t-value */
@@ -786,10 +725,13 @@
isect->u = u;
isect->v = v;
isect->t = t;
+ return true;
}
}
}
}
+
+ return false;
}
#endif
Modified: trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h 2013-06-18 09:36:00 UTC (rev 57542)
+++ trunk/blender/intern/cycles/kernel/kernel_bvh_traversal.h 2013-06-18 09:36:06 UTC (rev 57543)
@@ -1,7 +1,9 @@
/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
*
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
@@ -41,6 +43,14 @@
#endif
)
{
+ /* todo:
+ * - test if pushing distance on the stack helps (for non shadow rays)
+ * - separate version for shadow rays
+ * - likely and unlikely for if() statements
+ * - SSE for hair
+ * - test restrict attribute for pointers
+ */
+
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list