[Bf-blender-cvs] [314cf40] compositor-2016: Cycles: Add support for bindless textures.
Thomas Dinges
noreply at git.blender.org
Wed Jun 8 21:48:05 CEST 2016
Commit: 314cf40f06dca493aeefe24bb0a14c5f071d8cd4
Author: Thomas Dinges
Date: Thu May 19 12:47:41 2016 +0200
Branches: compositor-2016
https://developer.blender.org/rB314cf40f06dca493aeefe24bb0a14c5f071d8cd4
Cycles: Add support for bindless textures.
This adds support for CUDA Texture objects (also known as Bindless textures) for Kepler GPUs (GeForce 6xx and above).
This is used for all 2D/3D textures, data still uses arrays as before.
User benefits:
* No more limits of image textures on Kepler.
We had 5 float4 and 145 byte4 slots there before; now we have 1024 float4 and 1024 byte4 slots.
This can be extended further if we need to (just change the define).
* Single-channel texture slots (byte and float) are now supported on Kepler as well (1024 slots for each type).
ToDo / Issues:
* 3D textures don't work yet, or at least they don't show up during render. I have no idea what's wrong yet.
* Dynamically allocate bindless_mapping array?
I hope Fermi still works fine, but that should be tested on a Fermi card before pushing to master.
Part of my GSoC 2016.
Reviewers: sergey, #cycles, brecht
Subscribers: swerner, jtheninja, brecht, sergey
Differential Revision: https://developer.blender.org/D1999
===================================================================
M intern/cycles/device/device.h
M intern/cycles/device/device_cuda.cpp
M intern/cycles/device/device_multi.cpp
M intern/cycles/kernel/geom/geom_volume.h
M intern/cycles/kernel/kernel_compat_cuda.h
M intern/cycles/kernel/kernel_textures.h
M intern/cycles/kernel/svm/svm_image.h
M intern/cycles/kernel/svm/svm_voxel.h
M intern/cycles/render/image.cpp
M intern/cycles/util/util_texture.h
===================================================================
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 4c1b722..e11bb7f 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -54,7 +54,7 @@ public:
bool display_device;
bool advanced_shading;
bool pack_images;
- bool extended_images; /* flag for GPU and Multi device */
+ bool has_bindless_textures; /* flag for GPU and Multi device */
bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
vector<DeviceInfo> multi_devices;
@@ -66,7 +66,7 @@ public:
display_device = false;
advanced_shading = true;
pack_images = false;
- extended_images = false;
+ has_bindless_textures = false;
use_split_kernel = false;
}
};
@@ -230,6 +230,7 @@ public:
(void)interpolation; /* Ignored. */
(void)extension; /* Ignored. */
};
+
virtual void tex_free(device_memory& /*mem*/) {};
/* pixel memory */
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 12c62c0..39bb442 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -85,10 +85,10 @@ public:
CUcontext cuContext;
CUmodule cuModule;
map<device_ptr, bool> tex_interp_map;
+ map<device_ptr, uint> tex_bindless_map;
int cuDevId;
int cuDevArchitecture;
bool first_error;
- bool use_texture_storage;
struct PixelMem {
GLuint cuPBO;
@@ -99,6 +99,10 @@ public:
map<device_ptr, PixelMem> pixel_mem_map;
+ /* Bindless Textures */
+ device_vector<uint> bindless_mapping;
+ bool need_bindless_mapping;
+
CUdeviceptr cuda_device_ptr(device_ptr mem)
{
return (CUdeviceptr)mem;
@@ -176,12 +180,13 @@ public:
{
first_error = true;
background = background_;
- use_texture_storage = true;
cuDevId = info.num;
cuDevice = 0;
cuContext = 0;
+ need_bindless_mapping = false;
+
/* intialize */
if(cuda_error(cuInit(0)))
return;
@@ -211,11 +216,6 @@ public:
cuDeviceComputeCapability(&major, &minor, cuDevId);
cuDevArchitecture = major*100 + minor*10;
- /* In order to use full 6GB of memory on Titan cards, use arrays instead
- * of textures. On earlier cards this seems slower, but on Titan it is
- * actually slightly faster in tests. */
- use_texture_storage = (cuDevArchitecture < 300);
-
cuda_pop_context();
}
@@ -223,6 +223,10 @@ public:
{
task_pool.stop();
+ if(info.has_bindless_textures) {
+ tex_free(bindless_mapping);
+ }
+
cuda_assert(cuCtxDestroy(cuContext));
}
@@ -400,6 +404,15 @@ public:
return (result == CUDA_SUCCESS);
}
+ void load_bindless_mapping()
+ {
+ if(info.has_bindless_textures && need_bindless_mapping) {
+ tex_free(bindless_mapping);
+ tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
+ need_bindless_mapping = false;
+ }
+ }
+
void mem_alloc(device_memory& mem, MemoryType /*type*/)
{
cuda_push_context();
@@ -479,126 +492,99 @@ public:
{
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+ /* Check if we are on sm_30 or above.
+ * We use arrays and bindles textures for storage there */
+ bool has_bindless_textures = info.has_bindless_textures;
+
+ /* General variables for both architectures */
string bind_name = name;
- if(mem.data_depth > 1) {
- /* Kernel uses different bind names for 2d and 3d float textures,
- * so we have to adjust couple of things here.
- */
- vector<string> tokens;
- string_split(tokens, name, "_");
- bind_name = string_printf("__tex_image_%s_3d_%s",
- tokens[2].c_str(),
- tokens[3].c_str());
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch(extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if(interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
}
- /* determine format */
CUarray_format_enum format;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
- bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
+ switch(mem.data_type) {
+ case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+ case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+ case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+ case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+ default: assert(0); return;
+ }
- if(use_texture) {
+ /* General variables for Fermi */
+ CUtexref texref = NULL;
- switch(mem.data_type) {
- case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
- case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
- case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
- case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
- default: assert(0); return;
+ if(!has_bindless_textures) {
+ if(mem.data_depth > 1) {
+ /* Kernel uses different bind names for 2d and 3d float textures,
+ * so we have to adjust couple of things here.
+ */
+ vector<string> tokens;
+ string_split(tokens, name, "_");
+ bind_name = string_printf("__tex_image_%s_3d_%s",
+ tokens[2].c_str(),
+ tokens[3].c_str());
}
- CUtexref texref = NULL;
-
cuda_push_context();
cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
+ cuda_pop_context();
if(!texref) {
- cuda_pop_context();
return;
}
+ }
- if(interpolation != INTERPOLATION_NONE) {
- CUarray handle = NULL;
-
- if(mem.data_depth > 1) {
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- cuda_assert(cuArray3DCreate(&handle, &desc));
- }
- else {
- CUDA_ARRAY_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
-
- cuda_assert(cuArrayCreate(&handle, &desc));
- }
+ /* Data Storage */
+ if(interpolation == INTERPOLATION_NONE) {
+ if(has_bindless_textures) {
+ mem_alloc(mem, MEM_READ_ONLY);
+ mem_copy_to(mem);
- if(!handle) {
- cuda_pop_context();
- return;
- }
+ cuda_push_context();
- if(mem.data_depth > 1) {
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = handle;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = (void*)mem.data_pointer;
- param.srcPitch = mem.data_width*dsize*mem.data_elements;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
- }
- else if(mem.data_height > 1) {
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = handle;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = (void*)mem.data_pointer;
- param.srcPitch = mem.data_width*dsize*mem.data_elements;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2D(&param));
- }
- else
- cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+ CUdeviceptr cumem;
+ size_t cubytes;
- cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+ cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
- if(interpolation == INTERPOLATION_CLOSEST) {
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+ if(cubytes == 8) {
+ /* 64 bit device pointer */
+ uint64_t ptr = mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
}
- else if(interpolation == INTERPOLATION_LINEAR) {
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
- }
- else {/* CUBIC and SMART are unsupported for CUDA */
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+ else {
+ /* 32 bit device pointer */
+ uint32_t ptr = (uint32_t)mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
}
- cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-
- mem.device_pointer = (device_ptr)handle;
- mem.device_size = size;
- stats.mem_alloc(size);
+ cuda_pop_context();
}
else {
- cuda_pop_context();
-
mem_alloc(mem, MEM_READ_ONLY);
mem_copy_to(mem);
@@ -607,58 +593,149 @@ public:
cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+
+ cuda_pop_context();
}
+ }
+ /* Texture Storage */
+ else {
+ CUarray handle = NULL;
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch(extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
+ cuda_push_context();
+
+ if(mem.data_depth > 1) {
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ cuda_assert(cuArray3DCreate(&handle, &desc
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list