[Bf-blender-cvs] [d2e6824] soc-2016-cycles_images: Bindless Textures are now usable.

Mon May 16 19:41:06 CEST 2016

Commit: d2e6824599d36a6cfe9f32fab521ebc74a2e9d48
Author: Thomas Dinges
Date:   Mon May 16 19:38:56 2016 +0200
Branches: soc-2016-cycles_images
https://developer.blender.org/rBd2e6824599d36a6cfe9f32fab521ebc74a2e9d48

Bindless Textures are now usable.

* 2D Textures work fine (float4, byte4, float and byte types). The new limit on Kepler cards is now 1024 per type, so 4096 textures in total.
If we ever need more, this is a one line change.

* 3D Textures (Smoke) do not show up yet, need to investigate this.

* Some cleanup of previous commits; can still deduplicate some stuff though.

===================================================================

M	intern/cycles/device/device.h
M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_multi.cpp
M	intern/cycles/device/device_network.cpp
M	intern/cycles/device/device_opencl.cpp
M	intern/cycles/kernel/geom/geom_volume.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/kernel/svm/svm_image.h
M	intern/cycles/kernel/svm/svm_voxel.h
M	intern/cycles/render/image.cpp
M	intern/cycles/util/util_texture.h

===================================================================

diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 144ed0d..181abba 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -226,11 +226,11 @@ public:
 	                       device_memory& /*mem*/,
 	                       InterpolationType interpolation = INTERPOLATION_NONE,
 	                       ExtensionType extension = EXTENSION_REPEAT,
-	                       int *flat_slot = 0)
+	                       uint *bindless_slot = 0)
 	{
 		(void)interpolation;  /* Ignored. */
 		(void)extension;  /* Ignored. */
-		(void)flat_slot; /* Ignored. */
+		(void)bindless_slot; /* Ignored. */
 	};
 
 	virtual void tex_free(device_memory& /*mem*/) {};
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 1fa4bd0..a585bce 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -154,7 +154,7 @@ public:
 	               device_memory& mem,
 	               InterpolationType interpolation,
 	               ExtensionType extension,
-	               int /*flat_slot*/)
+	               uint* /*bindless_slot*/)
 	{
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 		kernel_tex_copy(&kernel_globals,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 0ed944c..9aa7d56 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -469,7 +469,7 @@ public:
 	               device_memory& mem,
 	               InterpolationType interpolation,
 	               ExtensionType extension,
-	               int *flat_slot)
+	               uint *bindless_slot)
 	{
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
@@ -509,10 +509,8 @@ public:
 				cuda_pop_context();
 			}
 
-			/* Texture Storage */
+			/* Bindless Texture Storage */
 			else {
-				/* TODO(dingto): Complete Bindless textures */
-
 				CUarray_format_enum format;
 				switch(mem.data_type) {
 					case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
@@ -606,19 +604,25 @@ public:
 						break;
 				}
 
+				CUfilter_mode filter_mode;
+				if(interpolation == INTERPOLATION_CLOSEST) {
+					filter_mode = CU_TR_FILTER_MODE_POINT;
+				}
+				else {
+					filter_mode = CU_TR_FILTER_MODE_LINEAR;
+				}
+
 				CUDA_TEXTURE_DESC texDesc;
 				memset(&texDesc, 0, sizeof(texDesc));
 				texDesc.addressMode[0] = address_mode;
 				texDesc.addressMode[1] = address_mode;
 				texDesc.addressMode[2] = address_mode;
-				texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
-				texDesc.flags = 0;
+				texDesc.filterMode = filter_mode;
+				texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
 
 				CUtexObject tex = 0;
 				cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
-
-				printf("Tex: %i - Slot: %i\n\n", tex, *flat_slot);
-				*flat_slot = (int)tex;
+				*bindless_slot = tex;
 			}
 		}
 		/* Geforce 4xx and 5xx */
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index a8e2628..ae857ab 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -174,13 +174,13 @@ public:
 	               InterpolationType
 	               interpolation,
 	               ExtensionType extension,
-	               int *flat_slot)
+	               uint *bindless_slot)
 	{
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
-			sub.device->tex_alloc(name, mem, interpolation, extension, flat_slot);
+			sub.device->tex_alloc(name, mem, interpolation, extension, bindless_slot);
 			sub.ptr_map[unique_ptr] = mem.device_pointer;
 		}
 
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index c756805..6bd24cd 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -167,7 +167,7 @@ public:
 	               device_memory& mem,
 	               InterpolationType interpolation,
 	               ExtensionType extension,
-	               int *flat_slot)
+	               uint *bindless_slot)
 	{
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
@@ -183,7 +183,7 @@ public:
 		snd.add(mem);
 		snd.add(interpolation);
 		snd.add(extension);
-		snd.add(flat_slot);
+		snd.add(bindless_slot);
 		snd.write();
 		snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
 	}
@@ -583,7 +583,7 @@ protected:
 			rcv.read(mem);
 			rcv.read(interpolation);
 			rcv.read(extension_type);
-			rcv.read(flat_slot);
+			rcv.read(bindless_slot);
 			lock.unlock();
 
 			client_pointer = mem.device_pointer;
@@ -599,7 +599,7 @@ protected:
 
 			rcv.read_buffer((uint8_t*)mem.data_pointer, data_size);
 
-			device->tex_alloc(name.c_str(), mem, interpolation, extension_type, flat_slot);
+			device->tex_alloc(name.c_str(), mem, interpolation, extension_type, bindless_slot);
 
 			pointer_mapping_insert(client_pointer, mem.device_pointer);
 		}
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 61f83f2..ddd282b 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1186,7 +1186,7 @@ public:
 	               device_memory& mem,
 	               InterpolationType /*interpolation*/,
 	               ExtensionType /*extension*/,
-	               int /*flat_slot*/)
+	               uint* /*bindless_slot*/)
 	{
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 		mem_alloc(mem, MEM_READ_ONLY);
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index ef02c01..83a163b 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -65,7 +65,13 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+#  if __CUDA_ARCH__ >= 300
+	CUtexObject tex = kernel_data.bindless_mapping[id];
+	float g = tex3D<float>(tex, P.x, P.y, P.z);
+	float4 r = make_float4(g, g, g, 1.0);
+#  else
 	float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+#  endif
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
@@ -84,7 +90,12 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+#  if __CUDA_ARCH__ >= 300
+	CUtexObject tex = kernel_data.bindless_mapping[id];
+	float4 r = tex3D<float4>(tex, P.x, P.y, P.z);
+#  else
 	float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+#  endif
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 1c54a75..922fdab 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1157,7 +1157,7 @@ typedef struct KernelData {
 	KernelBVH bvh;
 	KernelCurves curve;
 	KernelTables tables;
-	int bindless_mapping[4096];
+	uint bindless_mapping[4096]; /*TODO(dingto): Dynamic alloc */
 } KernelData;
 
 #ifdef __KERNEL_DEBUG__
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 2da4563..9ab627d 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -260,12 +260,12 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 	}
 #else
-	CUtexObject tex = (uint)kernel_data.bindless_mapping[id];
-	if(id < 5)
+	CUtexObject tex = kernel_data.bindless_mapping[id];
+	if(id < 2048) /* TODO(dingto): Make this a variable */
 		r = tex2D<float4>(tex, x, y);
 	else {
-		uchar4 f = tex2D<uchar4>(tex, x, y);
-		r = make_float4(f.x/255, f.y/255, f.z/255, f.w/255);
+		float g = tex2D<float>(tex, x, y);
+		r = make_float4(g, g, g, 1.0);
 	}
 #endif
 #endif
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 85ba2f9..9d85d97 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -42,10 +42,21 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 		tfm.w = read_node_float(kg, offset);
 		co = transform_point(&tfm, co);
 	}
+	float4 r;
 #  if defined(__KERNEL_GPU__)
-	float4 r = volume_image_texture_3d(id, co.x, co.y, co.z);
-#  else
-	float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
+#    if __CUDA_ARCH__ >= 300
+	CUtexObject tex = kernel_data.bindless_mapping[id];
+	if(id < 2048) /* TODO(dingto): Make this a variable */
+		r = tex3D<float4>(tex, co.x, co.y, co.z);
+	else {
+		float g = tex3D<float>(tex, co.x, co.y, co.z);
+		r = make_float4(g, g, g, 1.0);
+	}
+#    else /* __CUDA_ARCH__ >= 300 */
+	r = volume_image_texture_3d(id, co.x, co.y, co.z);
+#    endif
+#  else /* __KERNEL_GPU__ */
+	r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
 #  endif
 #else
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index fc40447..476947d 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -296,7 +296,7 @@ int ImageManager::add_image(const string& filename,
 	if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
 		is_float = true;
 
-	/* No float and byte textures on GPU yet */
+	/* No single channel textures on Fermi GPUs, use available slots */
 	if(type == IMAGE_DATA_TYPE_FLOAT && tex_num_images[type] == 0)
 		type = IMAGE_DATA_TYPE_FLOAT4;
 	if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0)
@@ -768,6 +768,9 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 	else
 		name = string_printf("__tex_image_%s_00%d", name_from_type(type).c_str(), flat_slot);
 
+	/* Bindless slot for CUDA */
+	uint bindless_slot = 0;
+
 	if(type == IMAGE_DATA_TYPE_FLOAT4) {
 		device_vector<float4>& tex_img = dscene->tex_float4_image[slot];
 
@@ -792,7 +795,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			            

@@ Diff output truncated at 10240 characters. @@