[Bf-blender-cvs] [9800837b987] blender2.7: Cycles: Support multithreaded compilation of kernels

Brecht Van Lommel noreply at git.blender.org
Fri Feb 15 08:56:35 CET 2019


Commit: 9800837b987930e6152c2dc27cae5bd55873d306
Author: Brecht Van Lommel
Date:   Fri Feb 15 08:18:38 2019 +0100
Branches: blender2.7
https://developer.blender.org/rB9800837b987930e6152c2dc27cae5bd55873d306

Cycles: Support multithreaded compilation of kernels

This patch implements a workaround to get the multithreaded compilation from D2231 working.
So far, it only works for Blender, not for Cycles Standalone. Also, I have only tested the Linux codepath in the helper function.
Depends on D2231.

Patch by lukasstockner97, jbakker, brecht

    job      |   scene_name    | compilation_time
-------------+-----------------+------------------
    Baseline | empty           |            22.73
    D2264    | empty           |            13.94
    Baseline | bmw             |            56.44
    D2264    | bmw             |            41.32
    Baseline | fishycat        |            59.50
    D2264    | fishycat        |            45.19
    Baseline | barbershop      |           212.28
    D2264    | barbershop      |           169.81
    Baseline | victor          |            67.51
    D2264    | victor          |            53.60
    Baseline | classroom       |            51.46
    D2264    | classroom       |            39.02
    Baseline | koro            |            62.48
    D2264    | koro            |            49.03
    Baseline | pavillion       |            54.37
    D2264    | pavillion       |            38.82
    Baseline | splash279       |            47.43
    D2264    | splash279       |            37.94
    Baseline | volume_emission |           145.22
    D2264    | volume_emission |           121.10

This patch reduces compilation time, as the split kernels and base
kernels are compiled in parallel. In Cycles debug mode (256) you can
unmark the OpenCL single program file option, which reduces the
compilation time even further (bmw 17 seconds, barbershop 53 seconds).

Reviewers: brecht, dingto, sergey, juicyfruit, lukasstockner97

Reviewed By: brecht

Subscribers: Loner, jbakker, candreacchio, 3dLuver, LazyDodo, bliblubli

Differential Revision: https://developer.blender.org/D2264

===================================================================

M	intern/cycles/blender/CMakeLists.txt
M	intern/cycles/blender/blender_python.cpp
M	intern/cycles/device/device_intern.h
M	intern/cycles/device/opencl/opencl.h
M	intern/cycles/device/opencl/opencl_base.cpp
M	intern/cycles/device/opencl/opencl_mega.cpp
M	intern/cycles/device/opencl/opencl_split.cpp
M	intern/cycles/device/opencl/opencl_util.cpp
M	intern/cycles/kernel/CMakeLists.txt
A	intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
M	intern/cycles/util/util_system.cpp
M	intern/cycles/util/util_system.h

===================================================================

diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 84e2690333e..f8720de366f 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -51,6 +51,10 @@ set(ADDON_FILES
 
 add_definitions(${GL_DEFINITIONS})
 
+if(WITH_CYCLES_DEVICE_OPENCL)
+    add_definitions(-DWITH_OPENCL)
+endif()
+
 if(WITH_CYCLES_NETWORK)
 	add_definitions(-DWITH_NETWORK)
 endif()
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index bf7605ed5b1..513941b1fcc 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -40,6 +40,10 @@
 #include <OSL/oslconfig.h>
 #endif
 
+#ifdef WITH_OPENCL
+#include "device/device_intern.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 namespace {
@@ -624,6 +628,31 @@ static PyObject *opencl_disable_func(PyObject * /*self*/, PyObject * /*value*/)
 	DebugFlags().opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE;
 	Py_RETURN_NONE;
 }
+
+static PyObject *opencl_compile_func(PyObject * /*self*/, PyObject *args)
+{
+	PyObject *sequence = PySequence_Fast(args, "Arguments must be a sequence");
+	if(sequence == NULL) {
+		Py_RETURN_FALSE;
+	}
+
+	vector<string> parameters;
+	for(Py_ssize_t i = 0; i < PySequence_Fast_GET_SIZE(sequence); i++) {
+		PyObject *item = PySequence_Fast_GET_ITEM(sequence, i);
+		PyObject *item_as_string = PyObject_Str(item);
+		const char *parameter_string = PyUnicode_AsUTF8(item_as_string);
+		parameters.push_back(parameter_string);
+		Py_DECREF(item_as_string);
+	}
+	Py_DECREF(sequence);
+
+	if (device_opencl_compile_kernel(parameters)) {
+		Py_RETURN_TRUE;
+	}
+	else {
+		Py_RETURN_FALSE;
+	}
+}
 #endif
 
 static bool denoise_parse_filepaths(PyObject *pyfilepaths, vector<string>& filepaths)
@@ -899,6 +928,7 @@ static PyMethodDef methods[] = {
 	{"system_info", system_info_func, METH_NOARGS, ""},
 #ifdef WITH_OPENCL
 	{"opencl_disable", opencl_disable_func, METH_NOARGS, ""},
+ 	{"opencl_compile", opencl_compile_func, METH_VARARGS, ""},
 #endif
 
 	/* Standalone denoising */
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 0b26057c3ba..94df1e009eb 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -24,6 +24,7 @@ class Device;
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
 bool device_opencl_init();
 Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
+bool device_opencl_compile_kernel(const vector<string>& parameters);
 bool device_cuda_init();
 Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
 Device *device_network_create(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address);
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 9b763167459..a2c0e53b3e7 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -268,6 +268,7 @@ public:
 	cl_platform_id cpPlatform;
 	cl_device_id cdDevice;
 	cl_int ciErr;
+	int device_num;
 
 	class OpenCLProgram {
 	public:
@@ -293,7 +294,15 @@ public:
 
 	private:
 		bool build_kernel(const string *debug_src);
+		/* Build the program by calling the own process.
+		 * This is required for multithreaded OpenCL compilation, since most Frameworks serialize
+		 * build calls internally if they come from the same process.
+		 * If that is not supported, this function just returns false.
+		 */
+		bool compile_separate(const string& clbin);
+		/* Build the program by calling OpenCL directly. */
 		bool compile_kernel(const string *debug_src);
+		/* Loading and saving the program from/to disk. */
 		bool load_binary(const string& clbin, const string *debug_src = NULL);
 		bool save_binary(const string& clbin);
 
@@ -342,12 +351,17 @@ public:
 	bool opencl_version_check();
 
 	string device_md5_hash(string kernel_custom_build_options = "");
-	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features);
 
 	/* Has to be implemented by the real device classes.
 	 * The base device will then load all these programs. */
-	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
-	                          vector<OpenCLProgram*> &programs) = 0;
+	virtual bool add_kernel_programs(const DeviceRequestedFeatures& requested_features,
+	                                 vector<OpenCLProgram*> &programs) = 0;
+
+	/* Get the name of the opencl program for the given kernel */
+	virtual const string get_opencl_program_name(bool single_program, const string& kernel_name) = 0;
+	/* Get the program file name to compile (*.cl) for the given kernel */
+	virtual const string get_opencl_program_filename(bool single_program, const string& kernel_name) = 0;
 
 	void mem_alloc(device_memory& mem);
 	void mem_copy_to(device_memory& mem);
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 4417065bb7f..d8f9a242ac8 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -93,6 +93,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, Profiler &pro
 	}
 	assert(info.num < usable_devices.size());
 	OpenCLPlatformDevice& platform_device = usable_devices[info.num];
+	device_num = info.num;
 	cpPlatform = platform_device.platform_id;
 	cdDevice = platform_device.device_id;
 	platform_name = platform_device.platform_name;
@@ -143,7 +144,6 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, Profiler &pro
 	texture_info.resize(1);
 	memory_manager.alloc("texture_info", texture_info);
 
-	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
 }
 
@@ -251,15 +251,13 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	programs.push_back(&base_program);
 	programs.push_back(&denoising_program);
 	/* Call actual class to fill the vector with its programs. */
-	if(!load_kernels(requested_features, programs)) {
+	if(!add_kernel_programs(requested_features, programs)) {
 		return false;
 	}
 
-	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
-	 * serialize the calls internally, so it's not much use right now.
-	 * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
-	 * should be set to false as well. */
-#if 0
+	/* Parallel compilation of Cycles kernels, this launches multiple
+	 * processes to workaround OpenCL frameworks serializing the calls
+	 * internally within a single process. */
 	TaskPool task_pool;
 	foreach(OpenCLProgram *program, programs) {
 		task_pool.push(function_bind(&OpenCLProgram::load, program));
@@ -273,14 +271,6 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 			return false;
 		}
 	}
-#else
-	foreach(OpenCLProgram *program, programs) {
-		program->load();
-		if(!program->is_loaded()) {
-			return false;
-		}
-	}
-#endif
 
 	return true;
 }
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 0a7bf96fed7..c0b9e81d4d3 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -35,19 +35,35 @@ public:
 
 	OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_)
 	: OpenCLDeviceBase(info, stats, profiler, background_),
-	  path_trace_program(this, "megakernel", "kernel.cl", "-D__COMPILE_ONLY_MEGAKERNEL__ ")
+	  path_trace_program(this,
+	                     get_opencl_program_name(false, "megakernel"),
+	                     get_opencl_program_filename(false, "megakernel"),
+	                     "-D__COMPILE_ONLY_MEGAKERNEL__ ")
 	{
 	}
 
-	virtual bool show_samples() const {
+
+	virtual bool show_samples() const
+	{
 		return true;
 	}
 
-	virtual BVHLayoutMask get_bvh_layout_mask() const {
+	virtual BVHLayoutMask get_bvh_layout_mask() const
+	{
 		return BVH_LAYOUT_BVH2;
 	}
 
-	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	const string get_opencl_program_name(bool /*single_program*/, const string& kernel_name)
+	{
+		return kernel_name;
+	}
+
+	const string get_opencl_program_filename(bool /*single_program*/, const string& /*kernel_name*/)
+	{
+		return "kernel.cl";
+	}
+
+	virtual bool add_kernel_programs(const DeviceRequestedFeatures& /*requested_features*/,
 	                          vector<OpenCLProgram*> &programs)
 	{
 		path_trace_program.add_kernel(ustring("path_trace"));
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 5a2555f9f80..b759f69d3ab 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -79,6 +79,27 @@ public:
 	OpenCLProgram program_data_init;
 	OpenCLProgram program_state_buffer_size;
 
+	OpenCLProgram program_split;
+
+	OpenCLProgram program_path_init;
+	OpenCLProgram program_scene_intersect;
+	OpenCLProgram program_lamp_emission;
+	OpenCLProgram program_do_volume;
+	OpenCLProgram program_queue_enqueue;
+	OpenCLProgram program_indirect_background;
+	OpenCLProgram program_shader_setup;
+	OpenCLProgram program_shader_sort;
+	OpenCLProgram program_shader_eval;
+	OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
+	OpenCLProgram program_subsurface_scatter;
+	OpenCLProgram program_direct_lighting;
+	OpenCLProgram program_shadow_blocked_ao;
+	OpenCLProgram program_shadow_blocked_dl;
+	OpenCLProgram program_enqueue_inactive;
+	OpenCLProgram program_next_iteration_setup;
+	OpenCLProgram program_indirect_subsurface;
+	OpenCLProgram program_buffer_update;
+
 	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_);
 
 	~OpenCLDeviceSplitKernel()
@@ -99,26 +120,150 @@ public:
 		return BVH_LAYOUT_BVH2;
 	}
 
-	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_featur

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list