[Bf-blender-cvs] [365ad27] cycles_split_kernel: Cycles: Actually implement work item functions for CPU

Thu Oct 27 17:48:59 CEST 2016

Commit: 365ad278f6ceff5ba83d596cbd4e348dff5de76b
Author: Mai Lavelle
Date:   Thu Oct 27 17:36:52 2016 +0200
Branches: cycles_split_kernel
https://developer.blender.org/rB365ad278f6ceff5ba83d596cbd4e348dff5de76b

Cycles: Actually implement work item functions for CPU

The work item functions for CPU were only stubs returning constants until now.
They now read the work item id and size from `kg`, so unfortunately `kg` has to
be passed around to a lot more functions.

===================================================================

M	intern/cycles/kernel/closure/alloc.h
M	intern/cycles/kernel/closure/bssrdf.h
M	intern/cycles/kernel/kernel_compat_cpu.h
M	intern/cycles/kernel/kernel_compat_opencl.h
M	intern/cycles/kernel/kernel_globals.h
M	intern/cycles/kernel/kernel_queues.h
M	intern/cycles/kernel/kernel_subsurface.h
M	intern/cycles/kernel/kernel_work_stealing.h
M	intern/cycles/kernel/split/kernel_background_buffer_update.h
M	intern/cycles/kernel/split/kernel_data_init.h
M	intern/cycles/kernel/split/kernel_direct_lighting.h
M	intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M	intern/cycles/kernel/split/kernel_lamp_emission.h
M	intern/cycles/kernel/split/kernel_next_iteration_setup.h
M	intern/cycles/kernel/split/kernel_queue_enqueue.h
M	intern/cycles/kernel/split/kernel_scene_intersect.h
M	intern/cycles/kernel/split/kernel_shader_eval.h
M	intern/cycles/kernel/split/kernel_shadow_blocked.h
M	intern/cycles/kernel/svm/svm.h
M	intern/cycles/kernel/svm/svm_closure.h
M	intern/cycles/kernel/svm/svm_fresnel.h
M	intern/cycles/kernel/svm/svm_light_path.h

===================================================================

diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index b7abc1e..76563ce 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight)
+ccl_device ShaderClosure *closure_alloc(KernelGlobals *kg, ShaderData *sd, int size, ClosureType type, float3 weight)
 {
 	kernel_assert(size <= sizeof(ShaderClosure));
 
@@ -35,7 +35,7 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
 	return sc;
 }
 
-ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
+ccl_device ccl_addr_space void *closure_alloc_extra(KernelGlobals *kg, ShaderData *sd, int size)
 {
 	/* Allocate extra space for closure that need more parameters. We allocate
 	 * in chunks of sizeof(ShaderClosure) starting from the end of the closure
@@ -58,9 +58,9 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
 	return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
 }
 
-ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
+ccl_device_inline ShaderClosure *bsdf_alloc(KernelGlobals *kg, ShaderData *sd, int size, float3 weight)
 {
-	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+	ShaderClosure *sc = closure_alloc(kg, sd, size, CLOSURE_NONE_ID, weight);
 
 	if(!sc)
 		return NULL;
@@ -71,9 +71,9 @@ ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 wei
 }
 
 #ifdef __OSL__
-ccl_device_inline ShaderClosure *bsdf_alloc_osl(ShaderData *sd, int size, float3 weight, void *data)
+ccl_device_inline ShaderClosure *bsdf_alloc_osl(KernelGlobals *kg, ShaderData *sd, int size, float3 weight, void *data)
 {
-	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+	ShaderClosure *sc = closure_alloc(kg, sd, size, CLOSURE_NONE_ID, weight);
 
 	if(!sc)
 		return NULL;
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index af0bbd8..a342025 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -344,9 +344,9 @@ ccl_device void bssrdf_none_sample(const ShaderClosure *sc, float xi, float *r,
 
 /* Generic */
 
-ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
+ccl_device_inline Bssrdf *bssrdf_alloc(KernelGlobals *kg, ShaderData *sd, float3 weight)
 {
-	Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
+	Bssrdf *bssrdf = (Bssrdf*)closure_alloc(kg, sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
 
 	if(!bssrdf)
 		return NULL;
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index e6aa8f8..e347a1e 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -45,12 +45,13 @@
 #define ccl_addr_space
 
 #define ccl_local_id(d) 0
-#define ccl_global_id(d) 0
+#define ccl_global_id(d) (kg->global_id[d])
 
 #define ccl_local_size(d) 1
-#define ccl_global_size(d) 1
+#define ccl_global_size(d) (kg->global_size[d])
 
-#define ccl_num_groups(d) 1
+#define ccl_group_id(d) ccl_global_id(d)
+#define ccl_num_groups(d) ccl_global_size(d)
 
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index b60eb14..ea99fdb 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -55,6 +55,7 @@
 #define ccl_local_size(d) get_local_size(d)
 #define ccl_global_size(d) get_global_size(d)
 
+#define ccl_group_id(d) get_group_id(d)
 #define ccl_num_groups(d) get_num_groups(d)
 
 /* Selective nodes compilation. */
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index a2d0057..121b840 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -70,6 +70,9 @@ typedef struct KernelGlobals {
 	Intersection *isect_shadow;
 	SplitData split_data;
 	SplitParams split_param_data;
+
+	int2 global_size;
+	int2 global_id;
 } KernelGlobals;
 
 #endif  /* __KERNEL_CPU__ */
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index 8d3176f..212ef98 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -49,6 +49,7 @@ ccl_device void enqueue_ray_index(
  * is no more ray to allocate to other threads.
  */
 ccl_device int get_ray_index(
+        KernelGlobals *kg,
         int thread_index,       /* Global thread index. */
         int queue_number,       /* Queue to operate on. */
         ccl_global int *queues, /* Buffer of all queues. */
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 52c05b8..5bdb3a6 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -140,7 +140,7 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
 }
 
 /* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(KernelGlobals *kg, ShaderData *sd, float3 weight, bool hit, float3 N)
 {
 	sd->flag &= ~SD_CLOSURE_FLAGS;
 	sd->randb_closure = 0.0f;
@@ -148,7 +148,7 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 wei
 	sd->num_closure_extra = 0;
 
 	if(hit) {
-		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(kg, sd, sizeof(DiffuseBsdf), weight);
 
 		if(bsdf) {
 			bsdf->N = N;
@@ -373,7 +373,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
 	subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N);
 
 	/* Setup diffuse BSDF. */
-	subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N);
+	subsurface_scatter_setup_diffuse_bsdf(kg, sd, weight, true, N);
 }
 
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
@@ -463,7 +463,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N);
 
 	/* setup diffuse bsdf */
-	subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N);
+	subsurface_scatter_setup_diffuse_bsdf(kg, sd, eval, (ss_isect.num_hits > 0), N);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index afb9ac7..859994e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -46,7 +46,8 @@ ccl_device uint get_group_id_with_ray_index(uint ray_index,
 	}
 }
 
-ccl_device uint get_total_work(uint tile_dim_x,
+ccl_device uint get_total_work(KernelGlobals *kg,
+                    uint tile_dim_x,
                     uint tile_dim_y,
                     uint grp_idx,
                     uint grp_idy,
@@ -73,7 +74,8 @@ ccl_device uint get_total_work(uint tile_dim_x,
 
 /* Returns 0 in case there is no next work available */
 /* Returns 1 in case work assigned is valid */
-ccl_device int get_next_work(ccl_global uint *work_pool,
+ccl_device int get_next_work(KernelGlobals *kg,
+                  ccl_global uint *work_pool,
                   ccl_private uint *my_work,
                   uint tile_dim_x,
                   uint tile_dim_y,
@@ -91,7 +93,8 @@ ccl_device int get_next_work(ccl_global uint *work_pool,
 	                                           tile_dim_y,
 	                                           parallel_samples,
 	                                           1);
-	uint total_work = get_total_work(tile_dim_x,
+	uint total_work = get_total_work(kg,
+	                                 tile_dim_x,
 	                                 tile_dim_y,
 	                                 grp_idx,
 	                                 grp_idy,
@@ -103,7 +106,8 @@ ccl_device int get_next_work(ccl_global uint *work_pool,
 
 /* This function assumes that the passed my_work is valid. */
 /* Decode sample number w.r.t. assigned my_work. */
-ccl_device uint get_my_sample(uint my_work,
+ccl_device uint get_my_sample(KernelGlobals *kg,
+                   uint my_work,
                    uint tile_dim_x,
                    uint tile_dim_y,
                    uint parallel_samples,
@@ -138,7 +142,8 @@ ccl_device uint get_my_sample(uint my_work,
 }
 
 /* Decode pixel and tile position w.r.t. assigned my_work. */
-ccl_device void get_pixel_tile_position(ccl_private uint *pixel_x,
+ccl_device void get_pixel_tile_position(KernelGlobals *kg,
+                             ccl_private uint *pixel_x,
                              ccl_private uint *pixel_y,
                              ccl_private uint *tile_x,
                              ccl_private uint *tile_y,
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
index 24c821d..eb40473 100644
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -83,7 +83,7 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 		split_params->queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
 	}
 	char enqueue_flag = 0;
-	ray_index = get_ray_index(ray_index,
+	ray_index = get_ray_index(kg, ray_index,
 	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
 	                          split_state->queue_data,
 	                          split_params->queue_size,
@@ -142,8 +142,8 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 
 #ifdef __WORK_STEALING__
 	my_work = split_state->work_array[ray_index];
-	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + split_params->start_sample;
-	g

@@ Diff output truncated at 10240 characters. @@