[Bf-blender-cvs] [420d88d] soc-2016-cycles_images: Bindless Textures: Move bindless_mapping to CUDADevice.
Thomas Dinges
noreply at git.blender.org
Thu May 19 01:49:01 CEST 2016
Commit: 420d88d4b6881614935c1107a407a2f7f00db432
Author: Thomas Dinges
Date: Wed May 18 22:11:45 2016 +0200
Branches: soc-2016-cycles_images
https://developer.blender.org/rB420d88d4b6881614935c1107a407a2f7f00db432
Bindless Textures: Move bindless_mapping to CUDADevice.
===================================================================
M intern/cycles/device/device.h
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/device/device_multi.cpp
M intern/cycles/device/device_network.cpp
M intern/cycles/device/device_opencl.cpp
M intern/cycles/kernel/geom/geom_volume.h
M intern/cycles/kernel/kernel_compat_cuda.h
M intern/cycles/kernel/kernel_textures.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/svm/svm_image.h
M intern/cycles/kernel/svm/svm_voxel.h
M intern/cycles/render/image.cpp
===================================================================
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 7d48692..2bfcf67 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -226,11 +226,11 @@ public:
device_memory& /*mem*/,
InterpolationType interpolation = INTERPOLATION_NONE,
ExtensionType extension = EXTENSION_REPEAT,
- uint *bindless_slot = 0)
+ int flat_slot = 0)
{
(void)interpolation; /* Ignored. */
(void)extension; /* Ignored. */
- (void)bindless_slot; /* Ignored. */
+ (void)flat_slot; /* Ignored. */
};
virtual void tex_free(device_memory& /*mem*/) {};
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 3265626..6b6be7c 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -154,7 +154,7 @@ public:
device_memory& mem,
InterpolationType interpolation,
ExtensionType extension,
- uint* /*bindless_slot*/)
+ int /*flat_slot*/)
{
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
kernel_tex_copy(&kernel_globals,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 9cbdde2..c28c41e 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -98,6 +98,11 @@ public:
map<device_ptr, PixelMem> pixel_mem_map;
+ /* Bindless Textures */
+ CUtexObject bindless_mapping[4096];
+ device_vector<uint> bindless_mapping_device;
+ bool sync_bindless_mapping;
+
CUdeviceptr cuda_device_ptr(device_ptr mem)
{
return (CUdeviceptr)mem;
@@ -180,6 +185,8 @@ public:
cuDevice = 0;
cuContext = 0;
+ sync_bindless_mapping = false;
+
/* intialize */
if(cuda_error(cuInit(0)))
return;
@@ -216,6 +223,8 @@ public:
{
task_pool.stop();
+ tex_free(bindless_mapping_device);
+
cuda_assert(cuCtxDestroy(cuContext));
}
@@ -469,7 +478,7 @@ public:
device_memory& mem,
InterpolationType interpolation,
ExtensionType extension,
- uint *bindless_slot)
+ int flat_slot)
{
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
@@ -661,7 +670,9 @@ public:
CUtexObject tex = 0;
cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
- *bindless_slot = tex;
+ bindless_mapping[flat_slot] = tex;
+
+ sync_bindless_mapping = true;
}
/* Regular Textures - Fermi */
else {
@@ -720,6 +731,18 @@ public:
if(have_error())
return;
+ /* Upload bindless_mapping vector */
+ if(cuDevArchitecture >= 300) {
+ if(sync_bindless_mapping) {
+ uint *tmp = bindless_mapping_device.resize(4096);
+ for(size_t i = 0; i < 4096; i++) {
+ tmp[i] = (uint)bindless_mapping[i];
+ }
+ tex_alloc("__bindless_mapping", bindless_mapping_device, INTERPOLATION_NONE, EXTENSION_REPEAT, 0);
+ sync_bindless_mapping = false;
+ }
+ }
+
cuda_push_context();
CUfunction cuPathTrace;
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index f41c65d..34a97db 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -174,13 +174,13 @@ public:
InterpolationType
interpolation,
ExtensionType extension,
- uint *bindless_slot)
+ int flat_slot)
{
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
foreach(SubDevice& sub, devices) {
mem.device_pointer = 0;
- sub.device->tex_alloc(name, mem, interpolation, extension, bindless_slot);
+ sub.device->tex_alloc(name, mem, interpolation, extension, flat_slot);
sub.ptr_map[unique_ptr] = mem.device_pointer;
}
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 6bd24cd..449f543 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -167,7 +167,7 @@ public:
device_memory& mem,
InterpolationType interpolation,
ExtensionType extension,
- uint *bindless_slot)
+ int flat_slot)
{
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
@@ -183,7 +183,7 @@ public:
snd.add(mem);
snd.add(interpolation);
snd.add(extension);
- snd.add(bindless_slot);
+ snd.add(flat_slot);
snd.write();
snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
}
@@ -583,7 +583,7 @@ protected:
rcv.read(mem);
rcv.read(interpolation);
rcv.read(extension_type);
- rcv.read(bindless_slot);
+ rcv.read(flat_slot);
lock.unlock();
client_pointer = mem.device_pointer;
@@ -599,7 +599,7 @@ protected:
rcv.read_buffer((uint8_t*)mem.data_pointer, data_size);
- device->tex_alloc(name.c_str(), mem, interpolation, extension_type, bindless_slot);
+ device->tex_alloc(name.c_str(), mem, interpolation, extension_type, flat_slot);
pointer_mapping_insert(client_pointer, mem.device_pointer);
}
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index ddd282b..61f83f2 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1186,7 +1186,7 @@ public:
device_memory& mem,
InterpolationType /*interpolation*/,
ExtensionType /*extension*/,
- uint* /*bindless_slot*/)
+ int /*flat_slot*/)
{
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
mem_alloc(mem, MEM_READ_ONLY);
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 95d2888..2044aaf 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -66,7 +66,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
float3 P = volume_normalized_position(kg, sd, sd->P);
#ifdef __KERNEL_GPU__
# if __CUDA_ARCH__ >= 300
- CUtexObject tex = kernel_data.bindless_mapping[id];
+ CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
float4 r = make_float4(f, f, f, 1.0);
# else
@@ -91,7 +91,7 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
float3 P = volume_normalized_position(kg, sd, sd->P);
#ifdef __KERNEL_GPU__
# if __CUDA_ARCH__ >= 300
- CUtexObject tex = kernel_data.bindless_mapping[id];
+ CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
# else
float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 5d9c307..4231475 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -73,7 +73,6 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
* Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
* Using Arrays on Fermi turned out to be slower.*/
-
/* Fermi */
#if __CUDA_ARCH__ < 300
# define __KERNEL_CUDA_TEX_STORAGE__
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 87c77ef..285da14 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -175,6 +175,9 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_090)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_091)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_092)
+/* bindless textures */
+KERNEL_TEX(uint, texture_uint, __bindless_mapping)
+
/* packed image (opencl) */
KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 8a1bc1b..cc261ed 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1158,7 +1158,6 @@ typedef struct KernelData {
KernelBVH bvh;
KernelCurves curve;
KernelTables tables;
- uint bindless_mapping[4096]; /*TODO(dingto): Dynamic alloc */
} KernelData;
#ifdef __KERNEL_DEBUG__
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 0fd04a0..cf101f0 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -18,11 +18,15 @@ CCL_NAMESPACE_BEGIN
/* Float4 textures on various devices. */
#if defined(__KERNEL_CPU__)
- #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU
#elif defined(__KERNEL_CUDA__)
- #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA
+# if __CUDA_ARCH__ < 300
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA
+# else
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER
+# endif
#else
- #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL
#endif
#ifdef __KERNEL_OPENCL__
@@ -260,7 +264,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
#else
- CUtexObject tex = kernel_data.bindless_mapping[id];
+ CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
if(id < 2048) /* TODO(dingto): Make this a variable */
r = kernel_tex_image_interp_float4(tex, x, y);
else {
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 825d76d..d2cc2c3 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -45,7 +45,7 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
float4 r;
# if defined(__KERNEL_GPU__)
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs mailing list