[Bf-blender-cvs] [cd095aa] master: Cycles: Distance optimization for QBVH

Sergey Sharybin noreply at git.blender.org
Thu Dec 25 18:50:14 CET 2014


Commit: cd095aae139ecbcfdf2103f635eae8d5bc5f3b8e
Author: Sergey Sharybin
Date:   Thu Dec 25 22:40:02 2014 +0500
Branches: master
https://developer.blender.org/rBcd095aae139ecbcfdf2103f635eae8d5bc5f3b8e

Cycles: Distance optimization for QBVH

This commit implements heuristic which allows to skip nodes pushed to the stack
from intersection if distance to them is larger than the distance to the current
intersection.

This should solve speed regression which i didn't notice in the original QBVH
commit (which could have because i had WIP version of this patch applied in my
local branch).

>From quick tests speed seems to be much closer to what is was with regular BVH.

There's still some possible code cleanup, but they'll need a bit of assembly
code check and now i want to make it so artists can happily use Cycles over the
holidays.

===================================================================

M	intern/cycles/kernel/geom/geom_object.h
M	intern/cycles/kernel/geom/geom_qbvh.h
M	intern/cycles/kernel/geom/geom_qbvh_shadow.h
M	intern/cycles/kernel/geom/geom_qbvh_subsurface.h
M	intern/cycles/kernel/geom/geom_qbvh_traversal.h
M	intern/cycles/kernel/geom/geom_qbvh_volume.h

===================================================================

diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 91edd58..79a5668 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -391,6 +391,38 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
 		*t *= len;
 }
 
+#ifdef __QBVH__
+/* Same as above, but optimized for QBVH scene intersection,
+ * which needs to modify two max distances.
+ *
+ * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
+ * so we can avoid having this duplication.
+ */
+ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
+                                          int object,
+                                          const Ray *ray,
+                                          float3 *P,
+                                          float3 *dir,
+                                          float3 *idir,
+                                          float *t,
+                                          float *t1)
+{
+	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+
+	*P = transform_point(&tfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+	if(*t != FLT_MAX)
+		*t *= len;
+
+	if(*t1 != -FLT_MAX)
+		*t1 *= len;
+}
+#endif
+
 /* Transorm ray to exit static object in BVH */
 
 ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
@@ -436,6 +468,33 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c
 		*t *= len;
 }
 
+#ifdef __QBVH__
+/* Same as above, but optimized for QBVH scene intersection,
+ * which needs to modify two max distances.
+ *
+ * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
+ * so we can avoid having this duplication.
+ */
+ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, float *t1, Transform *tfm)
+{
+	Transform itfm;
+	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
+
+	*P = transform_point(&itfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&itfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+
+	if(*t != FLT_MAX)
+		*t *= len;
+
+	if(*t1 != -FLT_MAX)
+		*t1 *= len;
+}
+#endif
+
 /* Transorm ray to exit motion blurred object in BVH */
 
 ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h
index a1dd89c..7a35437 100644
--- a/intern/cycles/kernel/geom/geom_qbvh.h
+++ b/intern/cycles/kernel/geom/geom_qbvh.h
@@ -14,32 +14,31 @@
  * limitations under the License.
  */
 
-ccl_device_inline void qbvh_stack_sort(int *__restrict s1,
-                                       int *__restrict s2,
-                                       int *__restrict s3,
-                                       float *__restrict d1,
-                                       float *__restrict d2,
-                                       float *__restrict d3)
+struct QBVHStackItem {
+	int addr;
+	float dist;
+};
+
+/* TOOD(sergey): Investigate if using instrinsics helps here. */
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3)
 {
-	if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }
-	if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); }
-	if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }
+	if(s2->dist < s1->dist) { util_swap(s2, s1); }
+	if(s3->dist < s2->dist) { util_swap(s3, s2); }
+	if(s2->dist < s1->dist) { util_swap(s2, s1); }
 }
 
-ccl_device_inline void qbvh_stack_sort(int *__restrict s1,
-                                       int *__restrict s2,
-                                       int *__restrict s3,
-                                       int *__restrict s4,
-                                       float *__restrict d1,
-                                       float *__restrict d2,
-                                       float *__restrict d3,
-                                       float *__restrict d4)
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3,
+                                       QBVHStackItem *__restrict s4)
 {
-	if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }
-	if(*d4 < *d3) { util_swap(s4, s3); util_swap(d4, d3); }
-	if(*d3 < *d1) { util_swap(s3, s1); util_swap(d3, d1); }
-	if(*d4 < *d2) { util_swap(s4, s2); util_swap(d4, d2); }
-	if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); }
+	if(s2->dist < s1->dist) { util_swap(s2, s1); }
+	if(s4->dist < s3->dist) { util_swap(s4, s3); }
+	if(s3->dist < s1->dist) { util_swap(s3, s1); }
+	if(s4->dist < s2->dist) { util_swap(s4, s2); }
+	if(s3->dist < s2->dist) { util_swap(s3, s2); }
 }
 
 ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
index f827999..2d1ad49 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
@@ -39,8 +39,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	 */
 
 	/* Traversal stack in CUDA thread-local memory. */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
+	QBVHStackItem traversalStack[BVH_STACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
 
 	/* Traversal variables in registers. */
 	int stackPtr = 0;
@@ -128,13 +128,15 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						if(d1 < d0) {
 							nodeAddr = c1;
 							++stackPtr;
-							traversalStack[stackPtr] = c0;
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
 							continue;
 						}
 						else {
 							nodeAddr = c0;
 							++stackPtr;
-							traversalStack[stackPtr] = c1;
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
 							continue;
 						}
 					}
@@ -143,9 +145,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					 * all nodes onto the stack to sort them there.
 					 */
 					++stackPtr;
-					traversalStack[stackPtr] = c1;
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = c1;
 					++stackPtr;
-					traversalStack[stackPtr] = c0;
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = c0;
 
 					/* Three children are hit, push all onto stack and sort 3
 					 * stack items, continue with closest child.
@@ -155,12 +159,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					float d2 = ((float*)&dist)[r];
 					if(traverseChild == 0) {
 						++stackPtr;
-						traversalStack[stackPtr] = c2;
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
 						qbvh_stack_sort(&traversalStack[stackPtr],
 						                &traversalStack[stackPtr - 1],
-						                &traversalStack[stackPtr - 2],
-						                &d2, &d1, &d0);
-						nodeAddr = traversalStack[stackPtr];
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
 						--stackPtr;
 						continue;
 					}
@@ -172,17 +176,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int c3 = __float_as_int(cnodes[r]);
 					float d3 = ((float*)&dist)[r];
 					++stackPtr;
-					traversalStack[stackPtr] = c3;
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
 					++stackPtr;
-					traversalStack[stackPtr] = c2;
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
 					qbvh_stack_sort(&traversalStack[stackPtr],
 					                &traversalStack[stackPtr - 1],
 					                &traversalStack[stackPtr - 2],
-					                &traversalStack[stackPtr - 3],
-					                &d3, &d2, &d1, &d0);
+					                &traversalStack[stackPtr - 3]);
 				}
 
-				nodeAddr = traversalStack[stackPtr];
+				nodeAddr = traversalStack[stackPtr].addr;
 				--stackPtr;
 			}
 
@@ -197,7 +202,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int primAddr2 = __float_as_int(leaf.y);
 
 					/* Pop. */
-					nodeAddr = traversalStack[stackPtr];
+					nodeAddr = traversalStack[stackPtr].addr;
 					--stackPtr;
 
 #ifdef __VISIBILITY_FLAG__
@@ -315,7 +320,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					triangle_intersect_precalc(dir, &isect_precalc);
 
 					++stackPtr;
-					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
 
 					nodeAddr = kernel_tex_fetch(__object_node, object);
 
@@ -368,7 +373,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			triangle_intersect_precalc(dir, &isect_precalc);
 
 			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr];
+			nodeAddr = traversalStack[stackPtr].addr;
 			--stackPtr;
 		}
 #endif  /* FEATURE(BVH_INSTANCING) */
diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
index bc43d81..acb1bbd 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
@@ -42,8 +42,8 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	 */
 
 	/* Traversal stack in CUDA thread-local memory. */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
+	QBVHStackItem traversalStack[BVH_STACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
 
 	/* Traversal variables in registers. */
 	int stackPtr = 0;
@@ -124,13 +124,15 @@ ccl_devic

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list