[Bf-blender-cvs] [5b7d6ea54b2] master: Code refactor: add WorkTile struct for passing work to kernel.
Brecht Van Lommel
noreply at git.blender.org
Wed Oct 4 21:22:29 CEST 2017
Commit: 5b7d6ea54b2fc35b8b12c667f5bf9a1c9c46d5c2
Author: Brecht Van Lommel
Date: Tue Sep 26 23:42:36 2017 +0200
Branches: master
https://developer.blender.org/rB5b7d6ea54b2fc35b8b12c667f5bf9a1c9c46d5c2
Code refactor: add WorkTile struct for passing work to kernel.
This makes sharing some code between mega/split in following commits a bit
easier, and also paves the way for rendering multiple tiles later.
===================================================================
M intern/cycles/device/device_cuda.cpp
M intern/cycles/device/device_memory.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernel_work_stealing.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
M intern/cycles/kernel/kernels/opencl/kernel_split_function.h
M intern/cycles/kernel/split/kernel_buffer_update.h
M intern/cycles/kernel/split/kernel_data_init.h
M intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M intern/cycles/kernel/split/kernel_path_init.h
M intern/cycles/kernel/split/kernel_split_data_types.h
===================================================================
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 29b5bd70789..7ee74e9a512 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1293,8 +1293,6 @@ public:
CUDAContextScope scope(this);
CUfunction cuPathTrace;
- CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
- CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
/* get kernel function */
if(branched) {
@@ -1308,40 +1306,48 @@ public:
return;
}
- /* pass in parameters */
- void *args[] = {&d_buffer,
- &d_rng_state,
- &sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride};
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+ /* allocate work tile */
+ device_vector<WorkTile> work_tiles;
+ work_tiles.resize(1);
- /*int num_registers;
- cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
+ WorkTile *wtile = work_tiles.get_data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->start_sample = sample;
+ wtile->num_samples = 1;
+ wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
+ wtile->rng_state = (uint*)cuda_device_ptr(rtile.rng_state);
- printf("threads_per_block %d\n", threads_per_block);
- printf("num_registers %d\n", num_registers);*/
+ mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
+ mem_copy_to(work_tiles);
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (rtile.w + xthreads - 1)/xthreads;
- int yblocks = (rtile.h + ythreads - 1)/ythreads;
+ CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+
+ /* pass in parameters */
+ void *args[] = {&d_work_tiles,
+ &total_work_size};
+
+ /* launch kernel */
+ int num_threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+ int num_blocks = divide_up(total_work_size, num_threads_per_block);
cuda_assert(cuLaunchKernel(cuPathTrace,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
+ num_blocks, 1, 1,
+ num_threads_per_block, 1, 1,
0, 0, args, 0));
cuda_assert(cuCtxSynchronize());
+
+ mem_free(work_tiles);
}
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index b63dd00068b..20707ad04c9 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -46,6 +46,7 @@ enum MemoryType {
/* Supported Data Types */
enum DataType {
+ TYPE_UNKNOWN,
TYPE_UCHAR,
TYPE_UINT,
TYPE_INT,
@@ -57,6 +58,7 @@ enum DataType {
static inline size_t datatype_size(DataType datatype)
{
switch(datatype) {
+ case TYPE_UNKNOWN: return 1;
case TYPE_UCHAR: return sizeof(uchar);
case TYPE_FLOAT: return sizeof(float);
case TYPE_UINT: return sizeof(uint);
@@ -70,8 +72,8 @@ static inline size_t datatype_size(DataType datatype)
/* Traits for data types */
template<typename T> struct device_type_traits {
- static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 0;
+ static const DataType data_type = TYPE_UNKNOWN;
+ static const int num_elements = sizeof(T);
};
template<> struct device_type_traits<uchar> {
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 6c5b6ca3b2d..bf3a2881666 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1448,6 +1448,21 @@ enum RayState {
#define PATCH_MAP_NODE_IS_LEAF (1u << 31)
#define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF))
+/* Work Tiles */
+
+typedef struct WorkTile {
+ uint x, y, w, h;
+
+ uint start_sample;
+ uint num_samples;
+
+ uint offset;
+ uint stride;
+
+ ccl_global float *buffer;
+ ccl_global uint *rng_state;
+} WorkTile;
+
CCL_NAMESPACE_END
#endif /* __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 0c11158e8da..0c2d9379b63 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,29 +27,28 @@ CCL_NAMESPACE_BEGIN
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
+#ifdef __SPLIT_KERNEL__
/* Returns true if there is work */
ccl_device bool get_next_work(KernelGlobals *kg,
- uint thread_index,
+ ccl_global uint *work_pools,
+ uint total_work_size,
+ uint ray_index,
ccl_private uint *global_work_index)
{
- uint total_work_size = kernel_split_params.w
- * kernel_split_params.h
- * kernel_split_params.num_samples;
-
/* With a small amount of work there may be more threads than work due to
* rounding up of global size, stop such threads immediately. */
- if(thread_index >= total_work_size) {
+ if(ray_index >= total_work_size) {
return false;
}
/* Increase atomic work index counter in pool. */
- uint pool = thread_index / WORK_POOL_SIZE;
- uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
+ uint pool = ray_index / WORK_POOL_SIZE;
+ uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
/* Map per-pool work index to a global work index. */
uint global_size = ccl_global_size(0) * ccl_global_size(1);
kernel_assert(global_size % WORK_POOL_SIZE == 0);
- kernel_assert(thread_index < global_size);
+ kernel_assert(ray_index < global_size);
*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+ (pool * WORK_POOL_SIZE)
@@ -58,23 +57,24 @@ ccl_device bool get_next_work(KernelGlobals *kg,
/* Test if all work for this pool is done. */
return (*global_work_index < total_work_size);
}
+#endif
-/* Map global work index to pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
uint global_work_index,
ccl_private uint *x,
ccl_private uint *y,
ccl_private uint *sample)
{
- uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+ uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
- uint y_offset = pixel_offset / kernel_split_params.w;
- uint x_offset = pixel_offset - y_offset * kernel_split_params.w;
+ uint y_offset = pixel_offset / tile->w;
+ uint x_offset = pixel_offset - y_offset * tile->w;
- *x = kernel_split_params.x + x_offset;
- *y = kernel_split_params.y + y_offset;
- *sample = kernel_split_params.start_sample + sample_offset;
+ *x = tile->x + x_offset;
+ *y = tile->y + y_offset;
+ *sample = tile->start_sample + sample_offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index dc343cb387a..4d100634421 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -20,6 +20,7 @@
#include "kernel/kernel_compat_cuda.h"
#include "kernel_config.h"
+
#include "kernel/kernel_math.h"
#include "kernel/kernel_types.h"
#include "kernel/kernel_globals.h"
@@ -27,32 +28,37 @@
#include "kernel/kernel_path.h"
#include "kernel/kernel_path_branched.h"
#include "kernel/kernel_bake.h"
+#include "kernel/kernel_work_stealing.h"
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+ int work_index = ccl_global_id(0);
+
+ if(work_index < total_work_size) {
+ uint x, y, sample;
+ get_work_pixel(tile, work_index, &x, &y, &sample);
- if(x < sx + sw && y < sy + sh) {
KernelGlobals kg;
- kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+ kernel_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset, tile->stride);
}
}
#ifdef __BRANCHED_PATH__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+ int work_index = ccl_global_id(0);
+
+ if(work_index < total_work_size) {
+ uint x, y, sample;
+ get_work_pixel(tile, work_index, &x, &y, &sample);
- if(x < sx + sw && y < sy + sh) {
KernelGlobals kg;
- kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+ kernel_branched_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list