[Bf-blender-cvs] [9830eeb44b] master: Cycles: Implement record-all transparent shadow function for GPU

Wed Feb 8 16:21:44 CET 2017

Commit: 9830eeb44b84cdb12e477cbb067f3b7e56634024
Author: Sergey Sharybin
Date:   Wed Sep 21 17:34:15 2016 +0200
Branches: master
https://developer.blender.org/rB9830eeb44b84cdb12e477cbb067f3b7e56634024

Cycles: Implement record-all transparent shadow function for GPU

The idea is to record all possible transparent intersections when
shooting a transparent ray on GPU (similar to what we were already
doing on CPU).

This avoids the need to do a whole ray-to-scene intersection query
for each intersection, and speeds up cases like transparent hair
a lot, at the cost of extra memory.

This commit only lays the groundwork for now; the feature is kept
disabled until some further tweaks are made.

===================================================================

M	intern/cycles/kernel/bvh/bvh.h
M	intern/cycles/kernel/kernel_shadow.h

===================================================================

diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 3679898265..2667f23606 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -357,7 +357,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
-#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+#if defined(__SHADOW_RECORD_ALL__) || defined(__VOLUME_RECORD_ALL__)
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@@ -371,7 +371,25 @@ ccl_device int intersections_compare(const void *a, const void *b)
 	else
 		return 0;
 }
+
+ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
+{
+#ifdef __KERNEL_GPU__
+	/* Use bubble sort which has more friendly memory pattern on GPU. */
+	int i, j;
+	for(i = 0; i < num_hits; ++i) {
+		for(j = 0; j < num_hits - 1; ++j) {
+			if(hits[j].t < hits[j + 1].t) {
+				Intersection tmp = hits[j];
+				hits[j] = hits[j + 1];
+				hits[j + 1] = tmp;
+			}
+		}
+	}
+#else
+	qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
 #endif
+}
+#endif  /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index b74902f8dd..05a6c7d182 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -34,10 +34,8 @@ ccl_device_inline bool shadow_handle_transparent_isect(KernelGlobals *kg,
 		kernel_volume_shadow(kg, shadow_sd, state, &segment_ray, throughput);
 	}
 #endif
-
 	/* Setup shader data at surface. */
 	shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
 	/* Attenuation from transparent surface. */
 	if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
 		path_state_modify_bounce(state, true);
@@ -51,42 +49,19 @@ ccl_device_inline bool shadow_handle_transparent_isect(KernelGlobals *kg,
 		path_state_modify_bounce(state, false);
 		*throughput *= shader_bsdf_transparency(kg, shadow_sd);
 	}
-
 	/* Stop if all light is blocked. */
 	if(is_zero(*throughput)) {
 		return true;
 	}
-
 #ifdef __VOLUME__
 	/* Exit/enter volume. */
 	kernel_volume_stack_enter_exit(kg, shadow_sd, state->volume_stack);
 #endif
-
 	return false;
 }
 
 #ifdef __SHADOW_RECORD_ALL__
-
-ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
-{
-#ifdef __KERNEL_GPU__
-	/* Use bubble sort which has more friendly memory pattern on GPU. */
-	int i, j;
-	for(i = 0; i < num_hits; ++i) {
-		for(j = 0; j < num_hits - 1; ++j) {
-			if(hits[j].t < hits[j + 1].t) {
-				Intersection tmp = hits[j];
-				hits[j] = hits[j + 1];
-				hits[j + 1] = tmp;
-			}
-		}
-	}
-#else
-	qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-#endif
-}
-
-/* Shadow function to compute how much light is blocked, CPU variation.
+/* Shadow function to compute how much light is blocked,
  *
  * We trace a single ray. If it hits any opaque surface, or more than a given
  * number of transparent surfaces is hit, then we consider the geometry to be
@@ -104,12 +79,20 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
  * or there is a performance increase anyway due to avoiding the need to send
  * two rays with transparent shadows.
  *
- * This is CPU only because of qsort, and malloc or high stack space usage to
- * record all these intersections. */
+ * On CPU it'll handle all transparent bounces (by allocating storage for
+ * intersections when they don't fit into the stack storage).
+ *
+ * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
+ * is something to be kept an eye on.
+ */
 
-#define STACK_MAX_HITS 64
+#define SHADOW_STACK_MAX_HITS 64
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
+ccl_device_inline bool shadow_blocked_all(KernelGlobals *kg,
+                                          ShaderData *shadow_sd,
+                                          PathState *state,
+                                          Ray *ray,
+                                          float3 *shadow)
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
 	if(ray->t == 0.0f) {
@@ -126,7 +109,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
 		/* Intersect to find an opaque surface, or record all transparent
 		 * surface hits.
 		 */
-		Intersection hits_stack[STACK_MAX_HITS];
+		Intersection hits_stack[SHADOW_STACK_MAX_HITS];
 		Intersection *hits = hits_stack;
 		const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
 		uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
@@ -138,7 +121,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
 		 *
 		 * Ignore this on GPU because of slow/unavailable malloc().
 		 */
-		if(max_hits + 1 > STACK_MAX_HITS) {
+		if(max_hits + 1 > SHADOW_STACK_MAX_HITS) {
 			if(kg->transparent_shadow_intersections == NULL) {
 				kg->transparent_shadow_intersections =
 				    (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
@@ -211,30 +194,27 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
 #endif
 	return blocked;
 }
+#endif  /* __SHADOW_RECORD_ALL__ */
 
-#undef STACK_MAX_HITS
-
-#else
-
-/* Shadow function to compute how much light is blocked, GPU variation.
+#ifndef __KERNEL_CPU__
+/* Shadow function to compute how much light is blocked,
  *
  * Here we raytrace from one transparent surface to the next step by step.
  * To minimize overhead in cases where we don't need transparent shadows, we
  * first trace a regular shadow ray. We check if the hit primitive was
  * potentially transparent, and only in that case start marching. this gives
- * one extra ray cast for the cases were we do want transparency. */
-
-ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
-                                        ShaderData *shadow_sd,
-                                        ccl_addr_space PathState *state,
-                                        ccl_addr_space Ray *ray_input,
-                                        float3 *shadow)
+ * one extra ray cast for the cases were we do want transparency.
+ */
+ccl_device_noinline bool shadow_blocked_stepped(KernelGlobals *kg,
+                                                ShaderData *shadow_sd,
+                                                ccl_addr_space PathState *state,
+                                                ccl_addr_space Ray *ray_input,
+                                                float3 *shadow)
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray_input->t == 0.0f)
+	if(ray_input->t == 0.0f) {
 		return false;
-
+	}
 #ifdef __SPLIT_KERNEL__
 	Ray private_ray = *ray_input;
 	Ray *ray = &private_ray;
@@ -313,10 +293,32 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
 	}
 #  endif
 #endif
-
 	return blocked;
 }
+#endif  /* __KERNEL_CPU__ */
 
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      PathState *state,
+                                      Ray *ray,
+                                      float3 *shadow)
+{
+#ifdef __SHADOW_RECORD_ALL__
+#  ifdef __KERNEL_CPU__
+	return shadow_blocked_all(kg, shadow_sd, state, ray, shadow);
+#  else
+	const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+	const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
+	if(max_hits + 1 < SHADOW_STACK_MAX_HITS) {
+		return shadow_blocked_all(kg, shadow_sd, state, ray, shadow);
+	}
+#  endif
+#endif
+#ifndef __KERNEL_CPU__
+	return shadow_blocked_stepped(kg, shadow_sd, state, ray, shadow);
 #endif
+}
+
+#undef SHADOW_STACK_MAX_HITS
 
 CCL_NAMESPACE_END