[Bf-blender-cvs] [7dabfb2] master: Cycles: Speedup of kernel side camera-in-volume detection

Fri Oct 3 08:56:16 CEST 2014

Commit: 7dabfb2048958c60afccf353c21eca2f5c08bc4d
Author: Sergey Sharybin
Date:   Fri Oct 3 03:42:16 2014 +0600
Branches: master
https://developer.blender.org/rB7dabfb2048958c60afccf353c21eca2f5c08bc4d

Cycles: Speedup of kernel side camera-in-volume detection

The idea is to only count intersections with objects which has volumetric shader
and ignore all other objects.

This is probably as fast as we can go without involving some forth level magic.

===================================================================

M	intern/cycles/kernel/geom/geom_bvh.h
A	intern/cycles/kernel/geom/geom_bvh_volume.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/kernel/kernel_volume.h
M	intern/cycles/render/mesh.cpp
M	intern/cycles/render/object.cpp

===================================================================

diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index dd6abe3..59e811f 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -123,6 +123,36 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_shadow.h"
 #endif
 
+#if defined(__VOLUME__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
 /* to work around titan bug when using arrays instead of textures */
 #if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
 ccl_device_inline
@@ -311,5 +341,50 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
+/* to work around titan bug when using arrays instead of textures */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+bool scene_intersect_volume(KernelGlobals *kg,
+                            const Ray *ray,
+                            Intersection *isect)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_volume_hair_motion(kg, ray, isect);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_volume_motion(kg, ray, isect);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_volume_hair(kg, ray, isect);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_volume_instancing(kg, ray, isect);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_volume(kg, ray, isect);
+#else /* __KERNEL_CPU__ */
+
+#ifdef __INSTANCING__
+	return bvh_intersect_volume_instancing(kg, ray, isect);
+#else
+	return bvh_intersect_volume(kg, ray, isect);
+#endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
new file mode 100644
index 0000000..aac0a18
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -0,0 +1,322 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+ccl_device bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                  const Ray *ray,
+                                  Intersection *isect)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	isect->t = ray->t;
+	isect->object = OBJECT_NONE;
+	isect->prim = PRIM_NONE;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					bool closestChild1 = tminmax[1] < tminmax[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fet

@@ Diff output truncated at 10240 characters. @@