[Bf-blender-cvs] [0892352bfe] master: Cycles: CPU implementation of split kernel
Mai Lavelle
noreply at git.blender.org
Wed Mar 8 07:53:21 CET 2017
Commit: 0892352bfe6d5a9aa6ec4c088e67f8bbbbfae610
Author: Mai Lavelle
Date: Tue Feb 14 06:20:48 2017 -0500
Branches: master
https://developer.blender.org/rB0892352bfe6d5a9aa6ec4c088e67f8bbbbfae610
Cycles: CPU implementation of split kernel
===================================================================
M intern/cycles/blender/addon/properties.py
M intern/cycles/blender/addon/ui.py
M intern/cycles/blender/blender_python.cpp
M intern/cycles/device/device_cpu.cpp
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/kernel.h
M intern/cycles/kernel/kernel_compat_cpu.h
M intern/cycles/kernel/kernel_globals.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
A intern/cycles/kernel/kernels/cpu/kernel_split.cpp
A intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
A intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
A intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
A intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
A intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
M intern/cycles/kernel/osl/osl_closures.cpp
M intern/cycles/kernel/osl/osl_services.cpp
M intern/cycles/kernel/osl/osl_shader.cpp
M intern/cycles/kernel/split/kernel_data_init.h
M intern/cycles/kernel/split/kernel_split_common.h
M intern/cycles/util/util_debug.cpp
M intern/cycles/util/util_debug.h
M intern/cycles/util/util_types.h
===================================================================
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5c51f9afc2..1f0b712c93 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -665,6 +665,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
+ cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 44af5f7efe..8d3fe87759 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1518,6 +1518,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_use_qbvh")
+ col.prop(cscene, "debug_use_cpu_split_kernel")
col = layout.column()
col.label('CUDA Flags:')
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 438abc49f8..ed410e15e7 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -67,6 +67,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
+ flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
/* Synchronize OpenCL kernel type. */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 78e2e3ea71..702f2a9136 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -26,10 +26,12 @@
#include "device.h"
#include "device_intern.h"
+#include "device_split_kernel.h"
#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
+#include "split/kernel_split_data.h"
#include "kernel_globals.h"
#include "osl_shader.h"
@@ -41,6 +43,7 @@
#include "util_foreach.h"
#include "util_function.h"
#include "util_logging.h"
+#include "util_map.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
@@ -48,8 +51,92 @@
CCL_NAMESPACE_BEGIN
+class CPUDevice;
+
+class CPUSplitKernel : public DeviceSplitKernel { /* CPU flavour of DeviceSplitKernel, tied to one CPUDevice. */
+ CPUDevice *device; /* Set from the constructor argument. */
+public:
+ explicit CPUSplitKernel(CPUDevice *device);
+ /* Only declared here; the out-of-class definition comes after CPUDevice. */
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
+ /* Remaining hooks, presumably overriding DeviceSplitKernel virtuals (base class not shown here). */
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(DeviceTask *task);
+};
+
class CPUDevice : public Device
{
+ static unordered_map<string, void*> kernel_functions;
+
+ static void register_kernel_function(const char* name, void* func) /* Callback passed to the kernel_cpu*_register_functions() calls in the constructor. */
+ {
+ kernel_functions[name] = func; /* Inserts the entry, or overwrites an existing one with the same name. */
+ }
+
+ static const char* get_arch_name() /* Returns the name of the first variant, tried from AVX2 down to SSE2, that is compiled in and passes its system_cpu_support_*() check. */
+ {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ return "cpu_avx2";
+ }
+ else /* Dangling else: chains into whichever branch the next enabled #ifdef provides. */
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if(system_cpu_support_avx()) {
+ return "cpu_avx";
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if(system_cpu_support_sse41()) {
+ return "cpu_sse41";
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if(system_cpu_support_sse3()) {
+ return "cpu_sse3";
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
+ return "cpu_sse2";
+ }
+ else
+#endif
+ { /* Fallback when no optimized variant is both compiled in and supported. */
+ return "cpu";
+ }
+ }
+
+ template<typename F>
+ static F get_kernel_function(string name) /* Resolves a registered kernel by short name and returns it cast to F. */
+ {
+ name = string("kernel_") + get_arch_name() + "_" + name; /* Lookup key has the form kernel_<arch>_<name>. */
+ /* Search the registry that register_kernel_function() fills. */
+ unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+ /* Missing entry: assert fires unless NDEBUG is defined, otherwise NULL is returned. */
+ if(it == kernel_functions.end()) {
+ assert(!"kernel function not found");
+ return NULL;
+ }
+ /* Entries are stored as void*, so cast back to the caller-supplied type F. */
+ return (F)it->second;
+ }
+
+ friend class CPUSplitKernel;
+
public:
TaskPool task_pool;
KernelGlobals kernel_globals;
@@ -57,10 +144,15 @@ public:
#ifdef WITH_OSL
OSLGlobals osl_globals;
#endif
+
+ bool use_split_kernel;
+
+ DeviceRequestedFeatures requested_features;
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
: Device(info, stats, background)
{
+
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
#endif
@@ -105,6 +197,28 @@ public:
{
VLOG(1) << "Will be using regular kernels.";
}
+
+ use_split_kernel = DebugFlags().cpu.split_kernel;
+ if(use_split_kernel) {
+ VLOG(1) << "Will be using split kernel.";
+ }
+
+ kernel_cpu_register_functions(register_kernel_function);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ kernel_cpu_sse2_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ kernel_cpu_sse3_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ kernel_cpu_sse41_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ kernel_cpu_avx_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ kernel_cpu_avx2_register_functions(register_kernel_function);
+#endif
}
~CPUDevice()
@@ -205,8 +319,14 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE)
- thread_path_trace(*task);
+ if(task->type == DeviceTask::PATH_TRACE) {
+ if(!use_split_kernel) {
+ thread_path_trace(*task);
+ }
+ else {
+ thread_path_trace_split(*task);
+ }
+ }
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
else if(task->type == DeviceTask::SHADER)
@@ -267,7 +387,7 @@ public:
{
path_trace_kernel = kernel_cpu_path_trace;
}
-
+
while(task.acquire_tile(this, tile)) {
float *render_buffer = (float*)tile.buffer;
uint *rng_state = (uint*)tile.rng_state;
@@ -303,6 +423,49 @@ public:
thread_kernel_globals_free(&kg);
}
+ void thread_path_trace_split(DeviceTask& task) /* Split-kernel counterpart of thread_path_trace(); thread_run() calls it when use_split_kernel is set. */
+ {
+ if(task_pool.canceled()) { /* Leave before allocating anything, unless the task wants its queue finished. */
+ if(task.need_finish_queue == false)
+ return;
+ }
+
+ RenderTile tile;
+
+ CPUSplitKernel split_kernel(this);
+
+ /* allocate buffer for kernel globals */
+ device_memory kgbuffer;
+ kgbuffer.resize(sizeof(KernelGlobals));
+ mem_alloc(kgbuffer, MEM_READ_WRITE);
+ /* The allocated buffer is used directly as this call's KernelGlobals. */
+ KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
+ *kg = thread_kernel_globals_init();
+
+ requested_features.max_closure = MAX_CLOSURE;
+ if(!split_kernel.load_kernels(requested_features)) { /* On failure, release what was set up above and give up. */
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
+
+ return;
+ }
+ /* One split_kernel.path_trace() call per acquired tile. */
+ while(task.acquire_tile(this, tile)) {
+ device_memory data; /* Default-constructed for every tile; nothing in this function allocates it. */
+ split_kernel.path_trace(&task, tile, kgbuffer, data);
+
+ task.release_tile(tile);
+ /* Cancellation is only checked between tiles, after the current one is released. */
+ if(task_pool.canceled()) {
+ if(task.need_finish_queue == false)
+ break;
+ }
+ }
+ /* Normal exit: same cleanup as the load_kernels() failure path. */
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
+ }
+
void thread_film_convert(DeviceTask& task)
{
float sample_scale = 1.0f/(task.sample + 1);
@@ -510,6 +673,10 @@ protected:
inline void thread_kernel_globals_free(KernelGlobals *kg)
{
+ if(kg == NULL) {
+ return;
+ }
+
if(kg->transparent_shadow_intersections != NULL) {
free(kg->transparent_shadow_intersections);
}
@@ -524,8 +691,170 @@ protected:
OSLShader::thread_free(kg);
#endif
}
+
+ virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) { /* Only records the requested features; always returns true. */
+ requested_features = requested_features_;
+ /* thread_path_trace_split() later passes this stored copy to CPUSplitKernel::load_kernels(). */
+ return true;
+ }
};
+/* split kernel */
+
+class CPUSplitKernelFunction : public SplitKernelFunction { /* SplitKernelFunction that wraps a plain function pointer. */
+public:
+ CPUDevice* device;
+ void (*func)(KernelGlobals *kg, KernelData *data); /* Kernel entry point; NULL until someone assigns it. */
+ /* The constructor stores the device and leaves func unset. */
+ CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
+ ~CPUSplitKernelFunction() {}
+ /* Calls func once for every (x, y) in dim.global_size, one after another in nested loops; returns false if func is unset. */
+ virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
+ {
+ if(!func) {
+ return false;
+ }
+ /* kernel_globals.device_pointer is treated as a KernelGlobals that this call may write to. */
+ KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+ kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+ for(int y = 0; y < dim.global_size[1]; y++) {
+ for(int x = 0; x < dim.global_size[0]; x++) {
+ kg->global_id = make_int2(x, y); /* Presumably how the kernel learns its work-item index; confirm in the kernel sources. */
+
+ func(kg, (KernelData*)data.device_pointer);
+ }
+ }
+
+ return true;
+ }
+};
+
+CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) /* Passes the device to the base class and also keeps it as a CPUDevice* member. */
+{
+}
+
+bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& data,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& q
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list