[Bf-blender-cvs] [aa50a5de17] cycles-tiles-rework: Cycles: Rendering multiple tiles at once for OpenCL mega kernel
Mai Lavelle
noreply at git.blender.org
Fri Jan 13 02:52:42 CET 2017
Commit: aa50a5de171ab3cc3d0b2512ddc80f62c9e73da3
Author: Mai Lavelle
Date: Thu Jan 12 01:44:21 2017 -0500
Branches: cycles-tiles-rework
https://developer.blender.org/rBaa50a5de171ab3cc3d0b2512ddc80f62c9e73da3
Cycles: Rendering multiple tiles at once for OpenCL mega kernel
Basically the same as for CUDA. No major performance difference, but in
some cases this is a little faster than single tile rendering.
===================================================================
M intern/cycles/device/opencl/opencl_base.cpp
M intern/cycles/device/opencl/opencl_mega.cpp
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernels/opencl/kernel.cl
===================================================================
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index a2b900312e..60df8f439a 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -206,6 +206,7 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("convert_to_half_float"));
base_program.add_kernel(ustring("shader"));
base_program.add_kernel(ustring("bake"));
+ base_program.add_kernel(ustring("set_sample_range"));
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 6ea7619e02..519dc8d176 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -22,6 +22,7 @@
#include "kernel_types.h"
+#include "util_foreach.h"
#include "util_md5.h"
#include "util_path.h"
#include "util_time.h"
@@ -56,30 +57,59 @@ public:
path_trace_program.release();
}
- void path_trace(RenderTile& rtile, int sample)
+ void path_trace(vector<RenderTile>& rtiles, int sample)
{
+ /* set the sample ranges */
+ cl_mem d_sample_ranges = clCreateBuffer(cxContext,
+ CL_MEM_READ_WRITE,
+ sizeof(SampleRange) * rtiles.size(),
+ NULL,
+ &ciErr);
+ opencl_assert_err(ciErr, "clCreateBuffer");
+
+ cl_kernel ckSetSampleRange = base_program(ustring("set_sample_range"));
+
+ for(int i = 0; i < rtiles.size(); i++) {
+ RenderTile& rtile = rtiles[i];
+
+ /* Cast arguments to cl types. */
+ cl_int d_range = i;
+ cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+ cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+ cl_int d_sample = sample;
+ cl_int d_x = rtile.x;
+ cl_int d_y = rtile.y;
+ cl_int d_w = rtile.w;
+ cl_int d_h = rtile.h;
+ cl_int d_offset = rtile.offset;
+ cl_int d_stride = rtile.stride;
+
+ kernel_set_args(ckSetSampleRange, 0,
+ d_sample_ranges,
+ d_range,
+ d_buffer,
+ d_rng_state,
+ d_sample,
+ d_x,
+ d_y,
+ d_w,
+ d_h,
+ d_offset,
+ d_stride);
+
+ enqueue_kernel(ckSetSampleRange, 1, 1);
+ }
+
/* Cast arguments to cl types. */
cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- /* Sample arguments. */
- cl_int d_sample = sample;
+ cl_int d_num_sample_ranges = rtiles.size();
cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));
cl_uint start_arg_index =
kernel_set_args(ckPathTraceKernel,
0,
- d_data,
- d_buffer,
- d_rng_state);
+ d_data);
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
@@ -88,15 +118,13 @@ public:
start_arg_index += kernel_set_args(ckPathTraceKernel,
start_arg_index,
- d_sample,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride);
-
- enqueue_kernel(ckPathTraceKernel, d_w, d_h);
+ d_sample_ranges,
+ d_num_sample_ranges);
+
+ /* TODO(mai): calculate a reasonable grid size for the device */
+ enqueue_kernel(ckPathTraceKernel, 256, 256);
+
+ opencl_assert(clReleaseMemObject(d_sample_ranges));
}
void thread_run(DeviceTask *task)
@@ -108,11 +136,21 @@ public:
shader(*task);
}
else if(task->type == DeviceTask::PATH_TRACE) {
- RenderTile tile;
+ /* TODO(mai): calculate a reasonable grid size for the device */
+ RenderWorkRequest work_request = {256*256, 256*256};
+ vector<RenderTile> tiles;
+
/* Keep rendering tiles until done. */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ while(task->acquire_tiles(this, tiles, work_request)) {
+ int start_sample = tiles[0].start_sample;
+ int end_sample = tiles[0].start_sample + tiles[0].num_samples;
+
+#ifndef NDEBUG
+ foreach(RenderTile& tile, tiles) {
+ assert(start_sample == tile.start_sample);
+ assert(end_sample == tile.start_sample + tile.num_samples);
+ }
+#endif
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
@@ -120,11 +158,20 @@ public:
break;
}
- path_trace(tile, sample);
+ path_trace(tiles, sample);
+
+ int pixel_samples = 0;
+ foreach(RenderTile& tile, tiles) {
+ tile.sample = sample + 1;
+ pixel_samples += tile.w * tile.h;
+ }
- tile.sample = sample + 1;
+ /* TODO(mai): without this we cant see tile updates, however this has a huge impact on
+ * performace. it should be posible to solve this by using a more async style loop
+ */
+ clFinish(cqCommandQueue);
- task->update_progress(&tile, tile.w*tile.h);
+ task->update_progress(tiles, pixel_samples);
}
/* Complete kernel execution before release tile */
@@ -138,7 +185,11 @@ public:
*/
clFinish(cqCommandQueue);
- task->release_tile(tile);
+ foreach(RenderTile& tile, tiles) {
+ task->release_tile(tile);
+ }
+
+ tiles.clear();
}
}
}
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 969907391b..84d2f912c2 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1277,8 +1277,8 @@ enum RayState {
/* Sample Range */
typedef struct SampleRange {
- float *buffer;
- uint *rng_state;
+ ccl_global float *buffer;
+ ccl_global uint *rng_state;
int x;
int y;
int w;
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index a68f97857b..1816d01d3b 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -49,15 +49,13 @@
__kernel void kernel_ocl_path_trace(
ccl_constant KernelData *data,
- ccl_global float *buffer,
- ccl_global uint *rng_state,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "../../kernel_textures.h"
- int sample,
- int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_global SampleRange *sample_ranges,
+ int num_sample_ranges)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -67,15 +65,83 @@ __kernel void kernel_ocl_path_trace(
kg->name = name;
#include "../../kernel_textures.h"
- int x = sx + get_global_id(0);
- int y = sy + get_global_id(1);
+ /* order threads to maintain inner block coherency */
+ const int group_id = get_group_id(0) + get_num_groups(0) * get_group_id(1);
+ const int local_thread_id = get_local_id(0) + get_local_id(1) * get_local_size(0);
- if(x < sx + sw && y < sy + sh)
- kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+ const int thread_id = group_id * (get_local_size(0) * get_local_size(1)) + local_thread_id;
+
+ /* find which sample range belongs to this thread */
+ ccl_global SampleRange* sample_range = NULL;
+
+ for(int i = 0; i < num_sample_ranges; i++) {
+ if(thread_id >= sample_ranges[i].work_offset &&
+ thread_id < sample_ranges[i].work_offset + sample_ranges[i].w * sample_ranges[i].h)
+ {
+ sample_range = &sample_ranges[i];
+ }
+ }
+
+ /* check if theres work for this thread */
+ if(!sample_range) {
+ return;
+ }
+
+ int work_offset = thread_id - sample_range->work_offset;
+
+ if(work_offset < 0 || work_offset >= sample_range->w * sample_range->h) {
+ return;
+ }
+
+ int x = (work_offset % sample_range->w) + sample_range->x;
+ int y = (work_offset / sample_range->w) + sample_range->y;
+
+ kernel_path_trace(kg,
+ sample_range->buffer,
+ sample_range->rng_state,
+ sample_range->sample,
+ x, y,
+ sample_range->offset,
+ sample_range->stride);
}
#else /* __COMPILE_ONLY_MEGAKERNEL__ */
+/* kernels */
+__kernel void kernel_ocl_set_sample_range(
+ ccl_global SampleRange *sample_ranges,
+ int range,
+ ccl_global float *buffer,
+ ccl_global uint *rng_state,
+ int sample,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
+{
+ ccl_global SampleRange* sample_range = &sample_ranges[range];
+
+ sample_range->buffer = buffer;
+ sample_range->rng_state = rng_state;
+ sample_range->sample = sample;
+ sample_range->x = sx;
+ sample_range->y = sy;
+ sample_range->w = sw;
+ sample_range->h = sh;
+ sample_range->offset = offset;
+ sample_range->stride = stride;
+
+ if(range == 0) {
+ sample_range->work_offset = 0;
+ }
+ else {
+ ccl_global SampleRange* prev_range = &sample_ranges[range-1];
+ sample_range->work_offset = prev_range->work_offset + prev_range->w * prev_range->h;
+ }
+}
+
__kernel void kernel_ocl_shader(
ccl_constant KernelData *data,
ccl_global uint4 *input,
More information about the Bf-blender-cvs
mailing list