[Bf-blender-cvs] [df8a964232] cycles_split_kernel: Cycles: Remove everything parallel samples from the split kernel
Mai Lavelle
noreply at git.blender.org
Tue Jan 24 13:33:43 CET 2017
Commit: df8a964232b64eadca03600935965d0b1c7ea668
Author: Mai Lavelle
Date: Tue Jan 24 04:55:40 2017 -0500
Branches: cycles_split_kernel
https://developer.blender.org/rBdf8a964232b64eadca03600935965d0b1c7ea668
Cycles: Remove everything related to parallel samples from the split kernel
Parallel samples never actually worked: the feature produced incorrect results
or crashes and wasn't any faster than work stealing, so it is being removed.
===================================================================
M intern/cycles/device/device.h
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/device/device_split_kernel.cpp
M intern/cycles/device/opencl/opencl_split.cpp
M intern/cycles/kernel/kernel_passes.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernel_work_stealing.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cuda/kernel_split.cu
M intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
M intern/cycles/kernel/split/kernel_background_buffer_update.h
M intern/cycles/kernel/split/kernel_data_init.h
M intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M intern/cycles/kernel/split/kernel_lamp_emission.h
M intern/cycles/kernel/split/kernel_scene_intersect.h
M intern/cycles/kernel/split/kernel_split_data.h
M intern/cycles/kernel/split/kernel_sum_all_radiance.h
===================================================================
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 1564d9f3d8..13b7c08f50 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -296,7 +296,6 @@ private:
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& /*dim*/,
RenderTile& /*rtile*/,
int /*num_global_elements*/,
- int /*num_parallel_samples*/,
device_memory& /*kernel_globals*/,
device_memory& /*kernel_data*/,
device_memory& /*split_data*/,
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 3811e43ab9..c131bea521 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -700,7 +700,6 @@ protected:
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
- int num_parallel_samples,
device_memory& kernel_globals,
device_memory& data,
device_memory& split_data,
@@ -724,11 +723,8 @@ protected:
ccl_global int *Queue_index,
int queuesize,
ccl_global char *use_queues_flag,
-#ifdef __WORK_STEALING__
ccl_global unsigned int *work_pool_wgs,
unsigned int num_samples,
-#endif
- int parallel_samples,
int buffer_offset_x,
int buffer_offset_y,
int buffer_stride,
@@ -797,11 +793,8 @@ protected:
(int*)queue_index.device_pointer,
dim.global_size[0] * dim.global_size[1],
(char*)use_queues_flags.device_pointer,
-#ifdef __WORK_STEALING__
(uint*)work_pool_wgs.device_pointer,
rtile.num_samples,
-#endif
- num_parallel_samples,
rtile.buffer_offset_x,
rtile.buffer_offset_y,
rtile.buffer_rng_state_stride,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 1b8b09bb07..5aaed093f3 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1419,7 +1419,6 @@ public:
bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
- int num_parallel_samples,
device_memory& /*kernel_globals*/,
device_memory& /*kernel_data*/,
device_memory& split_data,
@@ -1460,12 +1459,9 @@ public:
int* rng_state_stride;
CUdeviceptr* queue_index;
int* queuesize;
-#ifdef __WORK_STEALING__
CUdeviceptr* use_queues_flag;
CUdeviceptr* work_pool_wgs;
-#endif
int* num_samples;
- int* parallel_samples;
int* buffer_offset_x;
int* buffer_offset_y;
int* buffer_stride;
@@ -1491,11 +1487,8 @@ public:
&d_queue_index,
&queue_size,
&d_use_queues_flag,
-#ifdef __WORK_STEALING__
&d_work_pool_wgs,
&rtile.num_samples,
-#endif
- &num_parallel_samples,
&rtile.buffer_offset_x,
&rtile.buffer_offset_y,
&rtile.buffer_rng_state_stride,
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index d7a0297983..484416d297 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -143,36 +143,9 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
- /* set global_size and num_parallel_samples */
- size_t global_size[2];
- unsigned int num_parallel_samples;
- {
-#ifdef __WORK_STEALING__
- global_size[0] = round_up(tile.w, local_size[0]);
- global_size[1] = round_up(tile.h, local_size[1]);
- num_parallel_samples = 1;
-#else
- global_size[1] = round_up(tile.h, local_size[1]);
- unsigned int num_threads = max_render_feasible_tile_size.x * max_render_feasible_tile_size.y;
- unsigned int num_tile_columns_possible = num_threads / global_size[1];
- /* Estimate number of parallel samples that can be
- * processed in parallel.
- */
- num_parallel_samples = min(num_tile_columns_possible / tile.w, tile.num_samples);
- /* Wavefront size in AMD is 64.
- * TODO(sergey): What about other platforms?
- */
- if(num_parallel_samples >= 64) {
- /* TODO(sergey): Could use generic round-up here. */
- num_parallel_samples = (num_parallel_samples / 64) * 64;
- }
- assert(num_parallel_samples != 0);
-
- global_size[0] = tile.w * num_parallel_samples;
-#endif /* __WORK_STEALING__ */
-
- assert(global_size[0] * global_size[1] <= max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
- }
+ /* set global_size */
+ size_t global_size[2] = {round_up(tile.w, local_size[0]), round_up(tile.h, local_size[1])};
+ assert(global_size[0] * global_size[1] <= max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
/* Number of elements in the global state buffer */
int num_global_elements = max_render_feasible_tile_size.x * max_render_feasible_tile_size.y;
@@ -181,7 +154,6 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
if(first_tile) {
first_tile = false;
-#ifdef __WORK_STEALING__
/* Calculate max groups */
/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
@@ -191,7 +163,6 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
/* Allocate work_pool_wgs memory. */
work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
-#endif /* __WORK_STEALING__ */
queue_index.resize(NUM_QUEUES * sizeof(int));
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
@@ -241,7 +212,6 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
if(!device->enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
subtile,
num_global_elements,
- num_parallel_samples,
kgbuffer,
kernel_data,
split_data,
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index e93b0dc7a2..8d3e2598c5 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -174,7 +174,6 @@ public:
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
- int num_parallel_samples,
device_memory& kernel_globals,
device_memory& kernel_data,
device_memory& split_data,
@@ -225,11 +224,8 @@ public:
queue_index,
dQueue_size,
use_queues_flag,
-#ifdef __WORK_STEALING__
work_pool_wgs,
rtile.num_samples,
-#endif
- num_parallel_samples,
rtile.buffer_offset_x,
rtile.buffer_offset_y,
rtile.buffer_rng_state_stride,
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 7aec47e495..7790cce067 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -19,16 +19,16 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
{
ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
atomic_add_and_fetch_float(buf, value);
#else
*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif /* __SPLIT_KERNEL__ */
}
ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
{
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
ccl_global float *buf_x = buffer + 0;
ccl_global float *buf_y = buffer + 1;
ccl_global float *buf_z = buffer + 2;
@@ -39,12 +39,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
#else
ccl_global float3 *buf = (ccl_global float3*)buffer;
*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif /* __SPLIT_KERNEL__ */
}
ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
{
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
ccl_global float *buf_x = buffer + 0;
ccl_global float *buf_y = buffer + 1;
ccl_global float *buf_z = buffer + 2;
@@ -57,7 +57,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#else
ccl_global float4 *buf = (ccl_global float4*)buffer;
*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif /* __SPLIT_KERNEL__ */
}
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 5e7b8fef5c..b49c602823 100644
--- a/intern/cycles/ke
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list