[Bf-blender-cvs] [ec8ae4d5e9f] master: Cycles: Pack kernel textures into buffers for OpenCL

Mai Lavelle noreply at git.blender.org
Tue Aug 8 13:16:51 CEST 2017


Commit: ec8ae4d5e9f735ab5aeb149dea8aa47ab8f8f977
Author: Mai Lavelle
Date:   Tue Aug 8 07:12:04 2017 -0400
Branches: master
https://developer.blender.org/rBec8ae4d5e9f735ab5aeb149dea8aa47ab8f8f977

Cycles: Pack kernel textures into buffers for OpenCL

Image textures were being packed into a single buffer for OpenCL, which
limited the amount of memory available for images to the size of one
buffer (usually 4 GB on AMD hardware). By packing textures into multiple
buffers, that limit is removed, while simultaneously reducing the number
of buffers that need to be passed to each kernel.

Benchmarks were within 2%.

Fixes T51554.

Differential Revision: https://developer.blender.org/D2745

===================================================================

M	intern/cycles/device/CMakeLists.txt
M	intern/cycles/device/device.cpp
M	intern/cycles/device/device.h
M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_opencl.cpp
A	intern/cycles/device/opencl/memory_manager.cpp
A	intern/cycles/device/opencl/memory_manager.h
M	intern/cycles/device/opencl/opencl.h
M	intern/cycles/device/opencl/opencl_base.cpp
M	intern/cycles/device/opencl/opencl_mega.cpp
M	intern/cycles/device/opencl/opencl_split.cpp
M	intern/cycles/kernel/kernel_compat_opencl.h
M	intern/cycles/kernel/kernel_globals.h
M	intern/cycles/kernel/kernel_image_opencl.h
M	intern/cycles/kernel/kernel_textures.h
M	intern/cycles/kernel/kernels/opencl/kernel.cl
M	intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
M	intern/cycles/kernel/kernels/opencl/kernel_split_function.h
M	intern/cycles/kernel/split/kernel_data_init.h
M	intern/cycles/render/image.cpp
M	intern/cycles/render/image.h
M	intern/cycles/render/mesh.cpp
M	intern/cycles/render/scene.cpp
M	intern/cycles/render/scene.h

===================================================================

diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 74ec57ddf74..3c632160fbd 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -34,11 +34,13 @@ set(SRC
 
 set(SRC_OPENCL
 	opencl/opencl.h
+	opencl/memory_manager.h
 
 	opencl/opencl_base.cpp
 	opencl/opencl_mega.cpp
 	opencl/opencl_split.cpp
 	opencl/opencl_util.cpp
+	opencl/memory_manager.cpp
 )
 
 if(WITH_CYCLES_NETWORK)
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index a54bb77f9f3..f64436aec7b 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -379,11 +379,9 @@ DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
 	info.num = 0;
 
 	info.has_bindless_textures = true;
-	info.pack_images = false;
 	foreach(DeviceInfo &device, subdevices) {
 		assert(device.type == info.multi_devices[0].type);
 
-		info.pack_images |= device.pack_images;
 		info.has_bindless_textures &= device.has_bindless_textures;
 	}
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index b3b693c630c..26d6d380a10 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -53,7 +53,6 @@ public:
 	int num;
 	bool display_device;
 	bool advanced_shading;
-	bool pack_images;
 	bool has_bindless_textures; /* flag for GPU and Multi device */
 	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;
@@ -65,7 +64,6 @@ public:
 		num = 0;
 		display_device = false;
 		advanced_shading = true;
-		pack_images = false;
 		has_bindless_textures = false;
 		use_split_kernel = false;
 	}
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a00be3eeaab..6e09c5f88c2 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -977,7 +977,6 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.id = "CPU";
 	info.num = 0;
 	info.advanced_shading = true;
-	info.pack_images = false;
 
 	devices.insert(devices.begin(), info);
 }
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dbf636e1405..6769ed0229e 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -2164,7 +2164,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 		info.advanced_shading = (major >= 2);
 		info.has_bindless_textures = (major >= 3);
-		info.pack_images = false;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 681b8214b03..aa380ec4b94 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -95,7 +95,6 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		/* We don't know if it's used for display, but assume it is. */
 		info.display_device = true;
 		info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
-		info.pack_images = true;
 		info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
 		                                                     device_type);
 		info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
new file mode 100644
index 00000000000..b67dfef88aa
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "util/util_foreach.h"
+
+#include "device/opencl/opencl.h"
+#include "device/opencl/memory_manager.h"
+
+CCL_NAMESPACE_BEGIN
+
+void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation)
+{
+	allocations.push_back(&allocation);
+}
+
+void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device)
+{
+	bool need_realloc = false;
+
+	/* Calculate total size and remove any freed. */
+	size_t total_size = 0;
+
+	for(int i = allocations.size()-1; i >= 0; i--) {
+		Allocation* allocation = allocations[i];
+
+		/* Remove allocations that have been freed. */
+		if(!allocation->mem || allocation->mem->memory_size() == 0) {
+			allocation->device_buffer = NULL;
+			allocation->size = 0;
+
+			allocations.erase(allocations.begin()+i);
+
+			need_realloc = true;
+
+			continue;
+		}
+
+		/* Get actual size for allocation. */
+		size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
+
+		if(allocation->size != alloc_size) {
+			/* Allocation is either new or resized. */
+			allocation->size = alloc_size;
+			allocation->needs_copy_to_device = true;
+
+			need_realloc = true;
+		}
+
+		total_size += alloc_size;
+	}
+
+	if(need_realloc) {
+		cl_ulong max_buffer_size;
+		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+
+		if(total_size > max_buffer_size) {
+			device->set_error("Scene too complex to fit in available memory.");
+			return;
+		}
+
+		device_memory *new_buffer = new device_memory;
+
+		new_buffer->resize(total_size);
+		device->mem_alloc(string_printf("buffer_%p", this).data(), *new_buffer, MEM_READ_ONLY);
+
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(new_buffer->device_pointer),
+					CL_FALSE,
+					offset,
+					allocation->mem->memory_size(),
+					(void*)allocation->mem->data_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+			else {
+				/* Fast copy from memory already on device. */
+				opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_MEM_PTR(new_buffer->device_pointer),
+					allocation->desc.offset,
+					offset,
+					allocation->mem->memory_size(),
+					0, NULL, NULL
+				));
+			}
+
+			allocation->desc.offset = offset;
+			offset += allocation->size;
+		}
+
+		device->mem_free(*buffer);
+		delete buffer;
+
+		buffer = new_buffer;
+	}
+	else {
+		assert(total_size == buffer->data_size);
+
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_FALSE,
+					offset,
+					allocation->mem->memory_size(),
+					(void*)allocation->mem->data_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+
+			offset += allocation->size;
+		}
+	}
+
+	/* Not really necessary, but seems to improve responsiveness for some reason. */
+	clFinish(device->cqCommandQueue);
+}
+
+void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *device)
+{
+	device->mem_free(*buffer);
+}
+
+MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer()
+{
+	DeviceBuffer* smallest = device_buffers;
+
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		if(device_buffer.size < smallest->size) {
+			smallest = &device_buffer;
+		}
+	}
+
+	return smallest;
+}
+
+MemoryManager::MemoryManager(OpenCLDeviceBase *device) : device(device), need_update(false)
+{
+}
+
+void MemoryManager::free()
+{
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		device_buffer.free(device);
+	}
+}
+
+void MemoryManager::alloc(const char *name, device_memory& mem)
+{
+	Allocation& allocation = allocations[name];
+
+	allocation.mem = &mem;
+	allocation.needs_copy_to_device = true;
+
+	if(!allocation.device_buffer) {
+		DeviceBuffer* device_buffer = smallest_device_buffer();
+		allocation.device_buffer = device_buffer;
+
+		allocation.desc.device_buffer = device_buffer - device_buffers;
+
+		device_buffer->add_allocation(allocation);
+
+		device_buffer->size += mem.memory_size();
+	}
+
+	need_update = true;
+}
+
+bool MemoryManager::free(device_memory& mem)
+{
+	foreach(AllocationsMap::value_type& value, allocations) {
+		Allocation& allocation = value.second;
+		if(allocation.mem == &mem) {
+
+			allocation.device_buffer->size -= mem.memory_size();
+
+			allocation.mem = NULL;
+			allocation.needs_copy_to_device = false;
+
+			need_update = true;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
+{
+	update_device_memory();
+
+	Allocation& allocation = allocations[name];
+	return allocation.desc;
+}
+
+void MemoryManager::update_device_memory()
+{
+	if(!need_update) {
+		return;
+	}
+
+	need_update = false;
+
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		device_buffer.update_device_memory(device);
+	}
+}
+
+void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
+{
+	update_device_memory();
+
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		if(device_buffer.buffer->device_pointer) {
+			device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
+		}
+		else {
+			device->kernel_set_args(kernel, (*narg)++, device->null_mem);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* WITH_OPENCL */
+
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
new file mode 100644
index 00000000000..3714405d026
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ 

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list