[Bf-blender-cvs] [26f1c51] master: Cycles: Trace indirect subsurface rays by restarting the integrator loop

Sergey Sharybin noreply at git.blender.org
Wed Nov 25 09:01:32 CET 2015


Commit: 26f1c51ca6fb1cc8d0b9b39781ed50d074e8c84f
Author: Sergey Sharybin
Date:   Sun Nov 22 16:08:03 2015 +0500
Branches: master
https://developer.blender.org/rB26f1c51ca6fb1cc8d0b9b39781ed50d074e8c84f

Cycles: Trace indirect subsurface rays by restarting the integrator loop

This gives much lower stack usage on GPU and reduces kernel memory size to
around 448MB on GTX560Ti (compared to 652MB with the previous commit and
946MB with the official release). There's also a barely measurable speedup
of around 5%, but this still needs to be confirmed.

At this stage we're using only ~3% for the experimental kernel, and SSS
rendering seems to be about 40% faster. After some further testing we might
consider making SSS and CMJ official features and removing the experimental
precompiled kernels.

===================================================================

M	intern/cycles/kernel/kernel_bake.h
M	intern/cycles/kernel/kernel_path.h

===================================================================

diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 0f572b3..3efd7ec 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -74,7 +74,22 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 			                                  &throughput,
 			                                  &ss_indirect))
 			{
-				kernel_path_subsurface_scatter_indirect(kg, &L_sample, &state, &rng, &ray, &ss_indirect);
+				while(ss_indirect.num_rays) {
+					kernel_path_subsurface_setup_indirect(kg,
+					                                      &ss_indirect,
+					                                      &L_sample,
+					                                      &state,
+					                                      &ray,
+					                                      &ray,
+					                                      &throughput);
+					kernel_path_indirect(kg,
+					                     &rng,
+					                     &ray,
+					                     throughput,
+					                     state.num_samples,
+					                     &state,
+					                     &L_sample);
+				}
 				is_sss_sample = true;
 			}
 		}
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 1f385b8..885782f 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -448,6 +448,12 @@ ccl_device bool kernel_path_subsurface_scatter(
 
 	/* do bssrdf scatter step if we picked a bssrdf closure */
 	if(sc) {
+		/* We should never have two consecutive BSSRDF bounces,
+		 * the second one should be converted to a diffuse BSDF to
+		 * avoid this.
+		 */
+		kernel_assert(ss_indirect->num_rays == 0);
+
 		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
 
 		SubsurfaceIntersection ss_isect;
@@ -510,50 +516,44 @@ ccl_device bool kernel_path_subsurface_scatter(
 	return false;
 }
 
-/* Trace subsurface indirect rays separately after the path loop, to reduce
- * GPU stack memory usage. this way ShaderData and other data structures
- * used during the loop are not needed during kernel_path_indirect.
- */
-ccl_device void kernel_path_subsurface_scatter_indirect(
+ccl_device void kernel_path_subsurface_setup_indirect(
         KernelGlobals *kg,
+        SubsurfaceIndirectRays *ss_indirect,
         PathRadiance *L,
         PathState *state,
-        RNG *rng,
+        Ray *orig_ray,
         Ray *ray,
-        SubsurfaceIndirectRays *ss_indirect)
+        float3 *throughput)
 {
-	for (int i = 0; i < ss_indirect->num_rays; i++) {
-		Ray *indirect_ray = &ss_indirect->rays[i];
-		float3 indirect_throughput = ss_indirect->throughputs[i];
+	/* Setup state, ray and throughput for indirect SSS rays. */
+	ss_indirect->num_rays--;
 
-		*state = ss_indirect->state;
+	Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
 
-#ifdef __VOLUME__
-		if(ss_indirect->need_update_volume_stack) {
-			/* TODO(sergey): Single assignment per scatter. */
-			Ray volume_ray = *ray;
+	*state = ss_indirect->state;
+	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
 
-			/* Setup ray from previous surface point to the new one. */
-			volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
-			                             &volume_ray.t);
+#ifdef __VOLUME__
+	if(ss_indirect->need_update_volume_stack) {
+		Ray volume_ray = *orig_ray;
 
-			kernel_volume_stack_update_for_subsurface(
-			        kg,
-			        &volume_ray,
-			        state->volume_stack);
+		/* Setup ray from previous surface point to the new one. */
+		volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
+		                             &volume_ray.t);
 
-		}
+		kernel_volume_stack_update_for_subsurface(kg,
+		                                          &volume_ray,
+		                                          state->volume_stack);
+	}
 #endif
 
-		/* Note that this modifies state. */
-		kernel_path_indirect(kg, rng, indirect_ray, indirect_throughput, state->num_samples, state, L);
+	*ray = *indirect_ray;
 
-		/* For render passes, sum and reset indirect light pass variables
-		 * for the next samples.
-		 */
-		path_radiance_sum_indirect(L);
-		path_radiance_reset_indirect(L);
-	}
+	/* For render passes, sum and reset indirect light pass variables
+	 * for the next samples.
+	 */
+	path_radiance_sum_indirect(L);
+	path_radiance_reset_indirect(L);
 }
 #endif
 
@@ -577,6 +577,14 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 #ifdef __SUBSURFACE__
 	SubsurfaceIndirectRays ss_indirect;
 	ss_indirect.num_rays = 0;
+
+	/* TODO(sergey): Avoid having explicit copy of the pre-subsurface scatter
+	 * ray by storing an updated version of state in the ss_indirect which will
+	 * be updated to the new volume stack.
+	 */
+	Ray ss_orig_ray;
+
+	for(;;) {
 #endif
 
 	/* path iteration */
@@ -825,6 +833,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			                                  &throughput,
 			                                  &ss_indirect))
 			{
+				ss_orig_ray = ray;
 				break;
 			}
 		}
@@ -839,16 +848,21 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 	}
 
 #ifdef __SUBSURFACE__
-	/* Trace indirect subsurface afterwards to reduce GPU stack size.
-	 * note that this modifies state.
-	 */
-	if (ss_indirect.num_rays) {
-		kernel_path_subsurface_scatter_indirect(kg,
-		                                        &L,
-		                                        &state,
-		                                        rng,
-		                                        &ray,
-		                                        &ss_indirect);
+		/* Trace indirect subsurface rays by restarting the loop. this uses less
+		 * stack memory than invoking kernel_path_indirect.
+		 */
+		if(ss_indirect.num_rays) {
+			kernel_path_subsurface_setup_indirect(kg,
+			                                      &ss_indirect,
+			                                      &L,
+			                                      &state,
+			                                      &ss_orig_ray,
+			                                      &ray,
+			                                      &throughput);
+		}
+		else {
+			break;
+		}
 	}
 #endif




More information about the Bf-blender-cvs mailing list