[Bf-blender-cvs] [07ec0effb61] master: Code cleanup: simplify kernel side work stealing code.
Brecht Van Lommel
noreply at git.blender.org
Thu Sep 21 22:36:40 CEST 2017
Commit: 07ec0effb61e18a3d2f1bad97ebf7f6cb5bb6b87
Author: Brecht Van Lommel
Date: Thu Sep 21 03:37:22 2017 +0200
Branches: master
https://developer.blender.org/rB07ec0effb61e18a3d2f1bad97ebf7f6cb5bb6b87
Code cleanup: simplify kernel side work stealing code.
===================================================================
M intern/cycles/kernel/kernel_work_stealing.h
M intern/cycles/kernel/split/kernel_buffer_update.h
M intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M intern/cycles/kernel/split/kernel_path_init.h
M intern/cycles/kernel/split/kernel_split_data_types.h
===================================================================
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 28fc5ce1c30..0c11158e8da 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
-ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
-{
- return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
-}
-
-ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
-{
- return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
-{
- return ray_index / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
-{
- uint total_work_size = kernel_total_work_size(kg);
- uint num_pools = kernel_num_work_pools(kg);
-
- if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
- return 0;
- }
-
- uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
-
- uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
- if(work_pool < remainder / WORK_POOL_SIZE) {
- work_size += WORK_POOL_SIZE;
- }
- else if(work_pool == remainder / WORK_POOL_SIZE) {
- work_size += remainder % WORK_POOL_SIZE;
- }
-
- return work_size;
-}
-
-ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
-{
- uint num_pools = kernel_num_work_pools(kg);
- uint pool = work_pool_from_ray_index(kg, ray_index);
-
- return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
- + (pool * WORK_POOL_SIZE)
- + (work_index % WORK_POOL_SIZE);
-}
-
/* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
+ccl_device bool get_next_work(KernelGlobals *kg,
+ uint thread_index,
+ ccl_private uint *global_work_index)
{
- uint work_pool = work_pool_from_ray_index(kg, ray_index);
- uint pool_size = work_pool_work_size(kg, work_pool);
+ uint total_work_size = kernel_split_params.w
+ * kernel_split_params.h
+ * kernel_split_params.num_samples;
- if(pool_size == 0) {
+ /* With a small amount of work there may be more threads than work due to
+ * rounding up of global size, stop such threads immediately. */
+ if(thread_index >= total_work_size) {
return false;
}
- *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
- return (*work_index < pool_size);
-}
+ /* Increase atomic work index counter in pool. */
+ uint pool = thread_index / WORK_POOL_SIZE;
+ uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
-/* This function assumes that the passed `work` is valid. */
-/* Decode sample number w.r.t. assigned `work`. */
-ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
-{
- return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
-}
+ /* Map per-pool work index to a global work index. */
+ uint global_size = ccl_global_size(0) * ccl_global_size(1);
+ kernel_assert(global_size % WORK_POOL_SIZE == 0);
+ kernel_assert(thread_index < global_size);
-/* Decode pixel and tile position w.r.t. assigned `work`. */
-ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
- ccl_private uint *pixel_x,
- ccl_private uint *pixel_y,
- ccl_private uint *tile_x,
- ccl_private uint *tile_y,
- uint work_index,
- uint ray_index)
-{
- uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+ *global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+ + (pool * WORK_POOL_SIZE)
+ + (work_index % WORK_POOL_SIZE);
- *tile_x = pixel_index % kernel_split_params.w;
- *tile_y = pixel_index / kernel_split_params.w;
+ /* Test if all work for this pool is done. */
+ return (*global_work_index < total_work_size);
+}
- *pixel_x = *tile_x + kernel_split_params.x;
- *pixel_y = *tile_y + kernel_split_params.y;
+/* Map global work index to pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+ uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+ uint sample_offset = global_work_index / tile_pixels;
+ uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+ uint y_offset = pixel_offset / kernel_split_params.w;
+ uint x_offset = pixel_offset - y_offset * kernel_split_params.w;
+
+ *x = kernel_split_params.x + x_offset;
+ *y = kernel_split_params.y + y_offset;
+ *sample = kernel_split_params.start_sample + sample_offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 7b4d1299c12..c9e7deddafa 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -84,14 +84,9 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- uint work_index = kernel_split_state.work_array[ray_index];
- uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
- uint tile_x, tile_y, pixel_x, pixel_y;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
-
- ccl_global float *buffer = kernel_split_params.buffer;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+ uint sample = state->sample;
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
/* accumulate result in output buffer */
kernel_write_result(kg, buffer, sample, L);
@@ -102,31 +97,26 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
/* We have completed current work; So get next work */
uint work_index;
- int valid_work = get_next_work(kg, &work_index, ray_index);
- if(!valid_work) {
+ if(!get_next_work(kg, ray_index, &work_index)) {
/* If work is invalid, this means no more work is available and the thread may exit */
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
}
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- kernel_split_state.work_array[ray_index] = work_index;
- /* Get the sample associated with the current work */
- uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
- /* Get pixel and tile position associated with current work */
- uint tile_x, tile_y, pixel_x, pixel_y;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
-
- /* Remap rng_state according to the current work */
+ uint x, y, sample;
+ get_work_pixel(kg, work_index, &x, &y, &sample);
+
+ /* Remap rng_state to current pixel. */
ccl_global uint *rng_state = kernel_split_params.rng_state;
- rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+ rng_state += kernel_split_params.offset + x + y*stride;
- /* Remap buffer according to the current work */
- ccl_global float *buffer = kernel_split_params.buffer;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+ /* Store buffer offset for writing to passes. */
+ uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride;
+ kernel_split_state.buffer_offset[ray_index] = buffer_offset;
/* Initialize random numbers and ray. */
uint rng_hash;
- kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng_hash, ray);
+ kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray);
if(ray->t != 0.0f) {
/* Initialize throughput, path radiance, Ray, PathState;
@@ -145,6 +135,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
kernel_write_pass_float4(buffer, sample, L_rad);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 4d9e08becc4..dffd291012d 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -90,8 +90,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
- int stride = kernel_split_params.stride;
-
ccl_global PathState *state = 0x0;
float3 throughput;
@@ -99,15 +97,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
ShaderData *sd = &kernel_split_state.sd[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- uint work_index = kernel_split_state.work_array[ray_index];
- uint pixel_x, pixel_y, tile_x, tile_y;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
- &tile_x, &tile_y,
- work_index,
- ray_index);
-
- ccl_global float *buffer = kernel_split_pa
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs mailing list