[Bf-blender-cvs] [bc8e3a3] cycles_split_kernel: Cycles: Replace use of cl_mem with device_memory in split kernel device
Mai Lavelle
noreply at git.blender.org
Tue Oct 18 12:24:53 CEST 2016
Commit: bc8e3a3d868d19719f67ec2e5ed2d0b516a98312
Author: Mai Lavelle
Date: Tue Oct 18 11:30:25 2016 +0200
Branches: cycles_split_kernel
https://developer.blender.org/rBbc8e3a3d868d19719f67ec2e5ed2d0b516a98312
Cycles: Replace use of cl_mem with device_memory in split kernel device
Working towards using only device agnostic types and methods in the host.
===================================================================
M intern/cycles/device/device.h
M intern/cycles/device/device_memory.h
M intern/cycles/device/opencl/opencl.h
M intern/cycles/device/opencl/opencl_split.cpp
===================================================================
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 77dc1fa..014e5fc 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -226,6 +226,25 @@ public:
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
+ /* setup and allocate a device_memory object for use on device only (no host side buffer)*/
+ void mem_alloc(device_memory& mem, size_t size, MemoryType type = MEM_READ_WRITE)
+ {
+ mem.data_type = device_type_traits<uchar>::data_type;
+ mem.data_elements = 1;
+ mem.data_pointer = 0;
+ mem.data_size = size;
+ mem.device_size = 0;
+ mem.data_width = size;
+ mem.data_height = 1;
+ mem.data_depth = 1;
+
+ assert(mem.data_elements > 0);
+
+ mem.device_pointer = 0;
+
+ mem_alloc(mem, type);
+ }
+
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b5b4dc..0093c93 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -180,10 +180,20 @@ public:
/* device pointer */
device_ptr device_pointer;
-protected:
- device_memory() {}
+ device_memory() {
+ data_type = device_type_traits<float>::data_type;
+ data_elements = device_type_traits<float>::num_elements;
+ data_pointer = 0;
+ data_size = 0;
+ device_size = 0;
+ data_width = 0;
+ data_height = 0;
+ data_depth = 0;
+ device_pointer = 0;
+ }
virtual ~device_memory() { assert(!device_pointer); }
+protected:
/* no copying */
device_memory(const device_memory&);
device_memory& operator = (const device_memory&);
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 30a35ac..83603b2 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -265,6 +265,7 @@ public:
vector<OpenCLProgram*> &programs) = 0;
void mem_alloc(device_memory& mem, MemoryType type);
+ using Device::mem_alloc;
void mem_copy_to(device_memory& mem);
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
@@ -321,16 +322,39 @@ protected:
class ArgumentWrapper {
public:
- ArgumentWrapper() : size(0), pointer(NULL) {}
- template <typename T>
+ ArgumentWrapper() : size(0), pointer(NULL)
+ {
+ }
+
+ ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
+ pointer((void*)(&argument.device_pointer))
+ {
+ }
+
+ template<typename T>
+ ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
+ pointer((void*)(&argument.device_pointer))
+ {
+ }
+
+ template<typename T>
ArgumentWrapper(T& argument) : size(sizeof(argument)),
- pointer(&argument) { }
+ pointer(&argument)
+ {
+ }
+
ArgumentWrapper(int argument) : size(sizeof(int)),
int_value(argument),
- pointer(&int_value) { }
+ pointer(&int_value)
+ {
+ }
+
ArgumentWrapper(float argument) : size(sizeof(float)),
float_value(argument),
- pointer(&float_value) { }
+ pointer(&float_value)
+ {
+ }
+
size_t size;
int int_value;
float float_value;
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index fc80173..e1e1f54 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -107,20 +107,13 @@ public:
* kernel will be available to another kernel via this global
* memory.
*/
- cl_mem kgbuffer; /* KernelGlobals buffer. */
-
- cl_mem split_data;
-
- /* Global state array that tracks ray state. */
- cl_mem ray_state;
-
- /* Queue */
- cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
- * Tracks the size of each queue.
- */
+ device_memory kgbuffer;
+ device_memory split_data;
+ device_vector<uchar> ray_state;
+ device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
/* Flag to make sceneintersect and lampemission kernel use queues. */
- cl_mem use_queues_flag;
+ device_memory use_queues_flag;
/* Amount of memory in output buffer associated with one pixel/thread. */
size_t per_thread_output_buffer_size;
@@ -128,17 +121,12 @@ public:
/* Total allocatable available device memory. */
size_t total_allocatable_memory;
- /* host version of ray_state; Used in checking host path-iteration
- * termination.
- */
- char *hostRayStateArray;
-
/* Number of path-iterations to be done in one shot. */
unsigned int PathIteration_times;
#ifdef __WORK_STEALING__
/* Work pool with respect to each work group. */
- cl_mem work_pool_wgs;
+ device_memory work_pool_wgs;
/* Denotes the maximum work groups possible w.r.t. current tile size. */
unsigned int max_work_groups;
@@ -155,20 +143,9 @@ public:
{
background = background_;
- /* Initialize cl_mem variables. */
- kgbuffer = NULL;
- split_data = NULL;
- ray_state = NULL;
-
- /* Queue. */
- Queue_index = NULL;
- use_queues_flag = NULL;
-
per_thread_output_buffer_size = 0;
- hostRayStateArray = NULL;
PathIteration_times = PATH_ITER_INC_FACTOR;
#ifdef __WORK_STEALING__
- work_pool_wgs = NULL;
max_work_groups = 0;
#endif
current_max_closure = -1;
@@ -294,18 +271,14 @@ public:
program_sum_all_radiance.release();
/* Release global memory */
- release_mem_object_safe(kgbuffer);
- release_mem_object_safe(split_data);
- release_mem_object_safe(ray_state);
- release_mem_object_safe(use_queues_flag);
- release_mem_object_safe(Queue_index);
+ mem_free(kgbuffer);
+ mem_free(split_data);
+ mem_free(ray_state);
+ mem_free(use_queues_flag);
+ mem_free(queue_index);
#ifdef __WORK_STEALING__
- release_mem_object_safe(work_pool_wgs);
+ mem_free(work_pool_wgs);
#endif
-
- if(hostRayStateArray != NULL) {
- free(hostRayStateArray);
- }
}
void path_trace(DeviceTask *task,
@@ -313,9 +286,9 @@ public:
int2 max_render_feasible_tile_size)
{
/* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+ device_memory& d_data = *const_mem_map["__data"];
+ device_ptr d_buffer = rtile.buffer;
+ device_ptr d_rng_state = rtile.rng_state;
cl_int d_x = rtile.x;
cl_int d_y = rtile.y;
cl_int d_w = rtile.w;
@@ -383,26 +356,25 @@ public:
max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
max_work_groups = (max_global_size[0] * max_global_size[1]) /
(local_size[0] * local_size[1]);
+
/* Allocate work_pool_wgs memory. */
- work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
+ mem_alloc(work_pool_wgs, max_work_groups * sizeof(unsigned int));
#endif /* __WORK_STEALING__ */
- /* Allocate queue_index memory only once. */
- Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
- use_queues_flag = mem_alloc(sizeof(char));
- kgbuffer = mem_alloc(get_KernelGlobals_size());
- ray_state = mem_alloc(num_global_elements * sizeof(char));
- split_data = mem_alloc(split_data_buffer_size(num_global_elements,
- current_max_closure,
- per_thread_output_buffer_size));
-
- hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
- assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
+ mem_alloc(queue_index, NUM_QUEUES * sizeof(int));
+ mem_alloc(use_queues_flag, sizeof(char));
+ mem_alloc(kgbuffer, get_KernelGlobals_size());
+
+ ray_state.resize(num_global_elements);
+ mem_alloc(ray_state, MEM_READ_WRITE);
+
+ mem_alloc(split_data, split_data_buffer_size(num_global_elements,
+ current_max_closure,
+ per_thread_output_buffer_size));
}
cl_int dQueue_size = global_size[0] * global_size[1];
- //printf("kernel_set_args data_init\n");
cl_uint start_arg_index =
kernel_set_args(program_data_init(),
0,
@@ -433,7 +405,7 @@ public:
rtile.rng_state_offset_x,
rtile.rng_state_offset_y,
rtile.buffer_rng_state_stride,
- Queue_index,
+ queue_index,
dQueue_size,
use_queues_flag,
#ifdef __WORK_STEALING__
@@ -463,7 +435,6 @@ public:
#define GLUE(a, b) a ## b
#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
{ \
- /*printf("enqueueing " #kernelName "\n");*/ \
ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
GLUE(program_, \
kernelName)(), \
@@ -517,20 +488,8 @@ public:
}
}
- /* Read ray-state into Host memory to decide if we should exit
- * path-iteration in host.
- */
- //printf("enqueue read\n");
- ciErr = clEnqueueReadBuffer(cqCommandQueue,
- ray_state,
- CL_TRUE,
- 0,
- global_size[0] * global_size[1] * sizeof(char),
- hostRayStateArray,
- 0,
- NULL,
- NULL);
- assert(ciErr == CL_SUCCESS);
+ /* Decide if we should exit path-iteration in host. */
+ mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
activeRaysAvailable = false;
@@ -538,7 +497,7 @@ public:
rayStateIter < global_size[0] * global_size[1];
++rayStateIter)
{
- if(int8_t(hostRayStateArra
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list