[Bf-blender-cvs] [915766f42df] master: Cycles: Branched path tracing for the split kernel

Mai Lavelle noreply at git.blender.org
Tue May 2 20:27:31 CEST 2017


Commit: 915766f42df56a0fee47e8eb21f8b024f01afe18
Author: Mai Lavelle
Date:   Mon Mar 20 22:31:54 2017 -0400
Branches: master
https://developer.blender.org/rB915766f42df56a0fee47e8eb21f8b024f01afe18

Cycles: Branched path tracing for the split kernel

This implements branched path tracing for the split kernel.

General approach is to store the ray state at a branch point, trace the
branched ray as normal, then restore the state as necessary before iterating
to the next part of the path. A state machine is used to advance the indirect
loop state, which avoids the need to add any new kernels. Each iteration the
state machine recreates as much state as possible from the stored ray to keep
overall storage down.

Its kind of hard to keep all the different integration loops in sync, so this
needs lots of testing to make sure everything is working correctly. We should
probably start trying to deduplicate the integration loops more now.

Nonbranched BMW is ~2% slower, while classroom is ~2% faster, other scenes
could use more testing still.

Reviewers: sergey, nirved

Reviewed By: nirved

Subscribers: Blendify, bliblubli

Differential Revision: https://developer.blender.org/D2611

===================================================================

M	intern/cycles/blender/addon/ui.py
M	intern/cycles/device/device_split_kernel.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/kernel_path.h
M	intern/cycles/kernel/kernel_path_branched.h
M	intern/cycles/kernel/kernel_path_surface.h
M	intern/cycles/kernel/kernel_subsurface.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/cuda/kernel_split.cu
M	intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
A	intern/cycles/kernel/split/kernel_branched.h
M	intern/cycles/kernel/split/kernel_data_init.h
M	intern/cycles/kernel/split/kernel_direct_lighting.h
M	intern/cycles/kernel/split/kernel_do_volume.h
M	intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M	intern/cycles/kernel/split/kernel_indirect_background.h
M	intern/cycles/kernel/split/kernel_indirect_subsurface.h
M	intern/cycles/kernel/split/kernel_next_iteration_setup.h
M	intern/cycles/kernel/split/kernel_shader_eval.h
M	intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
M	intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
M	intern/cycles/kernel/split/kernel_split_common.h
M	intern/cycles/kernel/split/kernel_split_data_types.h
M	intern/cycles/kernel/split/kernel_subsurface_scatter.h

===================================================================

diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 19ab9053e9a..ab1fe8924bc 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -78,7 +78,7 @@ def use_cuda(context):
 def use_branched_path(context):
     cscene = context.scene.cycles
 
-    return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))
+    return (cscene.progressive == 'BRANCHED_PATH')
 
 
 def use_sample_all_lights(context):
@@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         row = layout.row()
         sub = row.row()
-        sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
         sub.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 71d52bb8097..abd26f5be49 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -240,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3750225571d..a92e8bc4aee 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -235,6 +235,7 @@ set(SRC_UTIL_HEADERS
 )
 
 set(SRC_SPLIT_HEADERS
+	split/kernel_branched.h
 	split/kernel_buffer_update.h
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index e7957042182..58da141aed3 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                         ShaderData *sd,
                                         ShaderData *emission_sd,
                                         PathRadiance *L,
-                                        PathState *state,
+                                        ccl_addr_space PathState *state,
                                         RNG *rng,
                                         float3 throughput,
                                         float3 ao_alpha)
@@ -98,6 +98,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 	}
 }
 
+#ifndef __SPLIT_KERNEL__
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
@@ -818,5 +820,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
 	path_rng_end(kg, rng_state, rng);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 085eb42325d..ddcb57161ea 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
                                                ShaderData *sd,
                                                ShaderData *emission_sd,
                                                PathRadiance *L,
-                                               PathState *state,
+                                               ccl_addr_space PathState *state,
                                                RNG *rng,
                                                float3 throughput)
 {
@@ -65,6 +65,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 	}
 }
 
+#ifndef __SPLIT_KERNEL__
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
@@ -648,6 +649,8 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
 	path_rng_end(kg, rng_state, rng);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 #endif  /* __BRANCHED_PATH__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 076c82f3853..bd4ba775b4d 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -155,7 +155,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
         ccl_addr_space float3 *throughput,
         ccl_addr_space PathState *state,
         PathRadiance *L,
-        Ray *ray)
+        ccl_addr_space Ray *ray)
 {
 	/* sample BSDF */
 	float bsdf_pdf;
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index baf629342b9..274713addc2 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -417,9 +417,8 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
 	subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
 }
 
-#ifndef __SPLIT_KERNEL__
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state,
 	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -507,7 +506,6 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	/* setup diffuse bsdf */
 	subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
 }
-#endif /* ! __SPLIT_KERNEL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index b1269cdb6b4..6417f621c8f 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -71,22 +71,18 @@ CCL_NAMESPACE_BEGIN
 #  endif
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  ifndef __SPLIT_KERNEL__
-#    define __BRANCHED_PATH__
-#  endif
+#  define __BRANCHED_PATH__
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
-#  define __SUBSURFACE__
 #  define __PRINCIPLED__
+#  define __SUBSURFACE__
 #  define __CMJ__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
 #  define __SHADOW_RECORD_ALL__
-#  ifndef __SPLIT_KERNEL__
-#    define __VOLUME_DECOUPLED__
-#    define __VOLUME_RECORD_ALL__
-#  endif
+#  define __VOLUME_DECOUPLED__
+#  define __VOLUME_RECORD_ALL__
 #endif  /* __KERNEL_CPU__ */
 
 #ifdef __KERNEL_CUDA__
@@ -138,6 +134,7 @@ CCL_NAMESPACE_BEGIN
 #    define __VOLUME_SCATTER__
 #    define __SHADOW_RECORD_ALL__
 #    define __CMJ__
+#    define __BRANCHED_PATH__
 #  endif  /* __KERNEL_OPENCL_AMD__ */
 
 #  ifdef __KERNEL_OPENCL_INTEL_CPU__
@@ -1300,7 +1297,6 @@ typedef ccl_addr_space struct DebugData {
  * Queue 3 - Shadow ray cast kernel - AO
  * Queeu 4 - Shadow ray cast kernel - direct lighting
  */
-#define NUM_QUEUES 4
 
 /* Queue names */
 enum QueueNumber {
@@ -1313,22 +1309,37 @@ enum QueueNumber {
 	 * 3. Rays to be regenerated
 	 * are enqueued here.
 	 */
-	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
 
 	/* All rays for which a shadow ray should be cast to determine radiance
 	 * contribution for AO are enqueued here.
 	 */
-	QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+	QUEUE_SHADOW_RAY_CAST_AO_RAYS,
 
 	/* All rays for which a shadow ray should be cast to determine radiance
 	 * contributing for direct lighting are enqueued here.
 	 */
-	QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+	QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+#ifdef __BRANCHED_PATH__
+	/* All rays moving to next iteration of the indirect loop for light */
+	QUEUE_LIGHT_INDIRECT_ITER,
+#  ifdef __VOLUME__
+	/* All rays moving to next iteration of the indirect loop for volumes */
+	QUEUE_VOLUME_INDIRECT_ITER,
+#  endif
+#  ifdef __SUBSURFACE__
+	/* All rays moving to next iteration of the indirect loop for subsurface */
+	QUEUE_SUBSURFACE_INDIRECT_ITER,
+#  endif
+#endif  /* __BRANCHED_PATH__ */
+
+	NUM_QUEUES
 };
 
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* We use RAY_STATE_MASK to get ray_state */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
 enum RayState {
 	RAY_INVALID = 0,
 	/* Denotes ray is actively involved in path-iteration. */
@@ -1343,14 +1354,22 @@ enum RayState {
 	RAY_TO_REGENERATE,
 	/* Denotes ray has been regenerated */
 	RAY_REGENERATED,
-	/* Flag's ray has to execute shadow blocked function in AO part */
-	RAY_SHADOW_RAY_CAST_AO = 16,
-	/* Flag's ray has to execute shadow blocked function in direct lighting part. */
-	RAY_SHADOW_RAY_CAST_DL = 32,
+	/* Denotes ray is moving to next iteration of the branched indirect loop */
+	RAY_LIGHT_INDIRECT_NEXT_ITER,
+	RAY_VOLUME_INDIRECT_NEXT_ITER,
+	RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+	/* Ray flags */
+
+	/* Flags to denote that the ray is currently evaluating the branched indirect loop */
+	RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+	RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+	RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+	RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
 };
 
 #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
 #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | fla

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list