[Bf-blender-cvs] [314cf40] compositor-2016: Cycles: Add support for bindless textures.

Wed Jun 8 21:48:05 CEST 2016

Commit: 314cf40f06dca493aeefe24bb0a14c5f071d8cd4
Author: Thomas Dinges
Date:   Thu May 19 12:47:41 2016 +0200
Branches: compositor-2016
https://developer.blender.org/rB314cf40f06dca493aeefe24bb0a14c5f071d8cd4

Cycles: Add support for bindless textures.

This adds support for CUDA Texture objects (also known as Bindless textures) for Kepler GPUs (GeForce 6xx and above).
This is used for all 2D/3D textures; data still uses arrays as before.

User benefits:
* No more tight limits on the number of image textures on Kepler.
 We had 5 float4 and 145 byte4 slots there before; now we have 1024 float4 and 1024 byte4.
 This can be extended further if we need to (just change the define).

* Single-channel texture slots (byte and float) are now supported on Kepler as well (1024 slots for each type).

ToDo / Issues:
* 3D textures don't work yet; at least, they don't show up during render. I have no idea what's wrong yet.
* Dynamically allocate bindless_mapping array?

I hope Fermi still works fine, but that should be tested on a Fermi card before pushing to master.

Part of my GSoC 2016.

Reviewers: sergey, #cycles, brecht

Subscribers: swerner, jtheninja, brecht, sergey

Differential Revision: https://developer.blender.org/D1999

===================================================================

M	intern/cycles/device/device.h
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_multi.cpp
M	intern/cycles/kernel/geom/geom_volume.h
M	intern/cycles/kernel/kernel_compat_cuda.h
M	intern/cycles/kernel/kernel_textures.h
M	intern/cycles/kernel/svm/svm_image.h
M	intern/cycles/kernel/svm/svm_voxel.h
M	intern/cycles/render/image.cpp
M	intern/cycles/util/util_texture.h

===================================================================

diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 4c1b722..e11bb7f 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -54,7 +54,7 @@ public:
 	bool display_device;
 	bool advanced_shading;
 	bool pack_images;
-	bool extended_images; /* flag for GPU and Multi device */
+	bool has_bindless_textures; /* flag for GPU and Multi device */
 	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;
 
@@ -66,7 +66,7 @@ public:
 		display_device = false;
 		advanced_shading = true;
 		pack_images = false;
-		extended_images = false;
+		has_bindless_textures = false;
 		use_split_kernel = false;
 	}
 };
@@ -230,6 +230,7 @@ public:
 		(void)interpolation;  /* Ignored. */
 		(void)extension;  /* Ignored. */
 	};
+
 	virtual void tex_free(device_memory& /*mem*/) {};
 
 	/* pixel memory */
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 12c62c0..39bb442 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -85,10 +85,10 @@ public:
 	CUcontext cuContext;
 	CUmodule cuModule;
 	map<device_ptr, bool> tex_interp_map;
+	map<device_ptr, uint> tex_bindless_map;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
-	bool use_texture_storage;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -99,6 +99,10 @@ public:
 
 	map<device_ptr, PixelMem> pixel_mem_map;
 
+	/* Bindless Textures */
+	device_vector<uint> bindless_mapping;
+	bool need_bindless_mapping;
+
 	CUdeviceptr cuda_device_ptr(device_ptr mem)
 	{
 		return (CUdeviceptr)mem;
@@ -176,12 +180,13 @@ public:
 	{
 		first_error = true;
 		background = background_;
-		use_texture_storage = true;
 
 		cuDevId = info.num;
 		cuDevice = 0;
 		cuContext = 0;
 
+		need_bindless_mapping = false;
+
 		/* intialize */
 		if(cuda_error(cuInit(0)))
 			return;
@@ -211,11 +216,6 @@ public:
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;
 
-		/* In order to use full 6GB of memory on Titan cards, use arrays instead
-		 * of textures. On earlier cards this seems slower, but on Titan it is
-		 * actually slightly faster in tests. */
-		use_texture_storage = (cuDevArchitecture < 300);
-
 		cuda_pop_context();
 	}
 
@@ -223,6 +223,10 @@ public:
 	{
 		task_pool.stop();
 
+		if(info.has_bindless_textures) {
+			tex_free(bindless_mapping);
+		}
+
 		cuda_assert(cuCtxDestroy(cuContext));
 	}
 
@@ -400,6 +404,15 @@ public:
 		return (result == CUDA_SUCCESS);
 	}
 
+	void load_bindless_mapping()
+	{
+		if(info.has_bindless_textures && need_bindless_mapping) {
+			tex_free(bindless_mapping);
+			tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
+			need_bindless_mapping = false;
+		}
+	}
+
 	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
 		cuda_push_context();
@@ -479,126 +492,99 @@ public:
 	{
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
+		/* Check if we are on sm_30 or above.
+		 * We use arrays and bindles textures for storage there */
+		bool has_bindless_textures = info.has_bindless_textures;
+
+		/* General variables for both architectures */
 		string bind_name = name;
-		if(mem.data_depth > 1) {
-			/* Kernel uses different bind names for 2d and 3d float textures,
-			 * so we have to adjust couple of things here.
-			 */
-			vector<string> tokens;
-			string_split(tokens, name, "_");
-			bind_name = string_printf("__tex_image_%s_3d_%s",
-			                          tokens[2].c_str(),
-			                          tokens[3].c_str());
+		size_t dsize = datatype_size(mem.data_type);
+		size_t size = mem.memory_size();
+
+		CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+		switch(extension) {
+			case EXTENSION_REPEAT:
+				address_mode = CU_TR_ADDRESS_MODE_WRAP;
+				break;
+			case EXTENSION_EXTEND:
+				address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+				break;
+			case EXTENSION_CLIP:
+				address_mode = CU_TR_ADDRESS_MODE_BORDER;
+				break;
+			default:
+				assert(0);
+				break;
+		}
+
+		CUfilter_mode filter_mode;
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			filter_mode = CU_TR_FILTER_MODE_POINT;
+		}
+		else {
+			filter_mode = CU_TR_FILTER_MODE_LINEAR;
 		}
 
-		/* determine format */
 		CUarray_format_enum format;
-		size_t dsize = datatype_size(mem.data_type);
-		size_t size = mem.memory_size();
-		bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
+		switch(mem.data_type) {
+			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+			default: assert(0); return;
+		}
 
-		if(use_texture) {
+		/* General variables for Fermi */
+		CUtexref texref = NULL;
 
-			switch(mem.data_type) {
-				case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-				case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-				case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-				case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-				default: assert(0); return;
+		if(!has_bindless_textures) {
+			if(mem.data_depth > 1) {
+				/* Kernel uses different bind names for 2d and 3d float textures,
+				 * so we have to adjust couple of things here.
+				 */
+				vector<string> tokens;
+				string_split(tokens, name, "_");
+				bind_name = string_printf("__tex_image_%s_3d_%s",
+				                          tokens[2].c_str(),
+				                          tokens[3].c_str());
 			}
 
-			CUtexref texref = NULL;
-
 			cuda_push_context();
 			cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
+			cuda_pop_context();
 
 			if(!texref) {
-				cuda_pop_context();
 				return;
 			}
+		}
 
-			if(interpolation != INTERPOLATION_NONE) {
-				CUarray handle = NULL;
-
-				if(mem.data_depth > 1) {
-					CUDA_ARRAY3D_DESCRIPTOR desc;
-
-					desc.Width = mem.data_width;
-					desc.Height = mem.data_height;
-					desc.Depth = mem.data_depth;
-					desc.Format = format;
-					desc.NumChannels = mem.data_elements;
-					desc.Flags = 0;
-
-					cuda_assert(cuArray3DCreate(&handle, &desc));
-				}
-				else {
-					CUDA_ARRAY_DESCRIPTOR desc;
-
-					desc.Width = mem.data_width;
-					desc.Height = mem.data_height;
-					desc.Format = format;
-					desc.NumChannels = mem.data_elements;
-
-					cuda_assert(cuArrayCreate(&handle, &desc));
-				}
+		/* Data Storage */
+		if(interpolation == INTERPOLATION_NONE) {
+			if(has_bindless_textures) {
+				mem_alloc(mem, MEM_READ_ONLY);
+				mem_copy_to(mem);
 
-				if(!handle) {
-					cuda_pop_context();
-					return;
-				}
+				cuda_push_context();
 
-				if(mem.data_depth > 1) {
-					CUDA_MEMCPY3D param;
-					memset(&param, 0, sizeof(param));
-					param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-					param.dstArray = handle;
-					param.srcMemoryType = CU_MEMORYTYPE_HOST;
-					param.srcHost = (void*)mem.data_pointer;
-					param.srcPitch = mem.data_width*dsize*mem.data_elements;
-					param.WidthInBytes = param.srcPitch;
-					param.Height = mem.data_height;
-					param.Depth = mem.data_depth;
-
-					cuda_assert(cuMemcpy3D(&param));
-				}
-				else if(mem.data_height > 1) {
-					CUDA_MEMCPY2D param;
-					memset(&param, 0, sizeof(param));
-					param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-					param.dstArray = handle;
-					param.srcMemoryType = CU_MEMORYTYPE_HOST;
-					param.srcHost = (void*)mem.data_pointer;
-					param.srcPitch = mem.data_width*dsize*mem.data_elements;
-					param.WidthInBytes = param.srcPitch;
-					param.Height = mem.data_height;
-
-					cuda_assert(cuMemcpy2D(&param));
-				}
-				else
-					cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+				CUdeviceptr cumem;
+				size_t cubytes;
 
-				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+				cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
 
-				if(interpolation == INTERPOLATION_CLOSEST) {
-					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+				if(cubytes == 8) {
+					/* 64 bit device pointer */
+					uint64_t ptr = mem.device_pointer;
+					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 				}
-				else if(interpolation == INTERPOLATION_LINEAR) {
-					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-				}
-				else {/* CUBIC and SMART are unsupported for CUDA */
-					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+				else {
+					/* 32 bit device pointer */
+					uint32_t ptr = (uint32_t)mem.device_pointer;
+					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 				}
-				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-
-				mem.device_pointer = (device_ptr)handle;
-				mem.device_size = size;
 
-				stats.mem_alloc(size);
+				cuda_pop_context();
 			}
 			else {
-				cuda_pop_context();
-
 				mem_alloc(mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
@@ -607,58 +593,149 @@ public:
 				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
 				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+
+				cuda_pop_context();
 			}
+		}
+		/* Texture Storage */
+		else {
+			CUarray handle = NULL;
 
-			CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					address_mode = CU_TR_ADDRESS_MODE_WRAP;
-					break;
-				case EXTENSION_EXTEND:
-					address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-					break;
-				case EXTENSION_CLIP:
-					address_mode = CU_TR_ADDRESS_MODE_BORDER;
-					break;
-				default:
-					assert(0);
-					break;
+			cuda_push_context();
+
+			if(mem.data_depth > 1) {
+				CUDA_ARRAY3D_DESCRIPTOR desc;
+
+				desc.Width = mem.data_width;
+				desc.Height = mem.data_height;
+				desc.Depth = mem.data_depth;
+				desc.Format = format;
+				desc.NumChannels = mem.data_elements;
+				desc.Flags = 0;
+
+				cuda_assert(cuArray3DCreate(&handle, &desc

@@ Diff output truncated at 10240 characters. @@