[Bf-blender-cvs] [5801ef71e40] master: Code refactor: device memory cleanups, preparing for mapped host memory.

Sun Nov 5 15:52:00 CET 2017

Commit: 5801ef71e40bc932c69e67f06076cd8b41132e52
Author: Brecht Van Lommel
Date:   Sun Nov 5 00:34:30 2017 +0100
Branches: master
https://developer.blender.org/rB5801ef71e40bc932c69e67f06076cd8b41132e52

Code refactor: device memory cleanups, preparing for mapped host memory.

===================================================================

M	intern/cycles/device/device.cpp
M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_memory.cpp
M	intern/cycles/device/device_memory.h
M	intern/cycles/device/device_network.cpp
M	intern/cycles/device/device_network.h
M	intern/cycles/device/device_split_kernel.cpp
M	intern/cycles/device/opencl/memory_manager.cpp
M	intern/cycles/device/opencl/opencl_base.cpp
M	intern/cycles/kernel/kernel.h
M	intern/cycles/kernel/kernels/cpu/kernel.cpp
M	intern/cycles/render/bake.cpp
M	intern/cycles/render/buffers.cpp
M	intern/cycles/render/light.cpp
M	intern/cycles/render/mesh_displace.cpp
M	intern/cycles/render/object.cpp
M	intern/cycles/render/tables.cpp

===================================================================

diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index b2f20bab58b..641e3fde140 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -102,17 +102,17 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int d
 	if(rgba.data_type == TYPE_HALF) {
 		/* for multi devices, this assumes the inefficient method that we allocate
 		 * all pixels on the device even though we only render to a subset */
-		GLhalf *data_pointer = (GLhalf*)rgba.data_pointer;
+		GLhalf *host_pointer = (GLhalf*)rgba.host_pointer;
 		float vbuffer[16], *basep;
 		float *vp = NULL;
 
-		data_pointer += 4*y*w;
+		host_pointer += 4*y*w;
 
 		/* draw half float texture, GLSL shader for display transform assumed to be bound */
 		GLuint texid;
 		glGenTextures(1, &texid);
 		glBindTexture(GL_TEXTURE_2D, texid);
-		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
+		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, host_pointer);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 
@@ -194,7 +194,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int d
 		glPixelZoom((float)width/(float)w, (float)height/(float)h);
 		glRasterPos2f(dx, dy);
 
-		uint8_t *pixels = (uint8_t*)rgba.data_pointer;
+		uint8_t *pixels = (uint8_t*)rgba.host_pointer;
 
 		pixels += 4*y*w;
 
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 0c0e6af7eb4..1a54c3380ee 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -297,10 +297,14 @@ public:
 						<< string_human_readable_size(mem.memory_size()) << ")";
 			}
 
-			mem.device_pointer = mem.data_pointer;
-
-			if(!mem.device_pointer) {
-				mem.device_pointer = (device_ptr)malloc(mem.memory_size());
+			if(mem.type == MEM_DEVICE_ONLY) {
+				assert(!mem.host_pointer);
+				size_t alignment = mem_address_alignment();
+				void *data = util_aligned_malloc(mem.memory_size(), alignment);
+				mem.device_pointer = (device_ptr)data;
+			}
+			else {
+				mem.device_pointer = (device_ptr)mem.host_pointer;
 			}
 
 			mem.device_size = mem.memory_size();
@@ -350,8 +354,8 @@ public:
 			tex_free(mem);
 		}
 		else if(mem.device_pointer) {
-			if(!mem.data_pointer) {
-				free((void*)mem.device_pointer);
+			if(mem.type == MEM_DEVICE_ONLY) {
+				util_aligned_free((void*)mem.device_pointer);
 			}
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
@@ -379,7 +383,7 @@ public:
 			/* Data texture. */
 			kernel_tex_copy(&kernel_globals,
 							mem.name,
-							mem.data_pointer,
+							mem.host_pointer,
 							mem.data_size);
 		}
 		else {
@@ -400,7 +404,7 @@ public:
 			}
 
 			TextureInfo& info = texture_info[flat_slot];
-			info.data = (uint64_t)mem.data_pointer;
+			info.data = (uint64_t)mem.host_pointer;
 			info.cl_buffer = 0;
 			info.interpolation = mem.interpolation;
 			info.extension = mem.extension;
@@ -411,7 +415,7 @@ public:
 			need_texture_info = true;
 		}
 
-		mem.device_pointer = mem.data_pointer;
+		mem.device_pointer = (device_ptr)mem.host_pointer;
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
@@ -457,7 +461,7 @@ public:
 
 	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
 	{
-		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer;
 		for(int i = 0; i < 9; i++) {
 			tiles->buffers[i] = buffers[i];
 		}
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 59d4fb055d0..4ab3cb9da75 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -128,20 +128,26 @@ public:
 	CUdevice cuDevice;
 	CUcontext cuContext;
 	CUmodule cuModule, cuFilterModule;
-	map<device_ptr, bool> tex_interp_map;
-	map<device_ptr, CUtexObject> tex_bindless_map;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
 	CUDASplitKernel *split_kernel;
 
+	struct CUDAMem {
+		CUDAMem()
+		: texobject(0), array(0) {}
+
+		CUtexObject texobject;
+		CUarray array;
+	};
+	map<device_memory*, CUDAMem> cuda_mem_map;
+
 	struct PixelMem {
 		GLuint cuPBO;
 		CUgraphicsResource cuPBOresource;
 		GLuint cuTexId;
 		int w, h;
 	};
-
 	map<device_ptr, PixelMem> pixel_mem_map;
 
 	/* Bindless Textures */
@@ -615,7 +621,7 @@ public:
 		}
 	}
 
-	void generic_alloc(device_memory& mem, size_t padding = 0)
+	CUDAMem *generic_alloc(device_memory& mem, size_t padding = 0)
 	{
 		CUDAContextScope scope(this);
 
@@ -625,19 +631,28 @@ public:
 					<< string_human_readable_size(mem.memory_size()) << ")";
 		}
 
-		CUdeviceptr device_pointer;
+		/* Allocate memory on device. */
+		CUdeviceptr device_pointer = 0;
 		size_t size = mem.memory_size();
 		cuda_assert(cuMemAlloc(&device_pointer, size + padding));
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
+
+		if(!mem.device_pointer) {
+			return NULL;
+		}
+
+		/* Insert into map of allocations. */
+		CUDAMem *cmem = &cuda_mem_map[&mem];
+		return cmem;
 	}
 
 	void generic_copy_to(device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			CUDAContextScope scope(this);
-			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
+			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
 		}
 	}
 
@@ -648,10 +663,11 @@ public:
 
 			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
 
-			mem.device_pointer = 0;
-
 			stats.mem_free(mem.device_size);
+			mem.device_pointer = 0;
 			mem.device_size = 0;
+
+			cuda_mem_map.erase(cuda_mem_map.find(&mem));
 		}
 	}
 
@@ -700,11 +716,11 @@ public:
 			size_t size = elem*w*h;
 
 			if(mem.device_pointer) {
-				cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
+				cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset,
 										 (CUdeviceptr)(mem.device_pointer + offset), size));
 			}
 			else {
-				memset((char*)mem.data_pointer + offset, 0, size);
+				memset((char*)mem.host_pointer + offset, 0, size);
 			}
 		}
 	}
@@ -715,8 +731,8 @@ public:
 			mem_alloc(mem);
 		}
 
-		if(mem.data_pointer) {
-			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		if(mem.host_pointer) {
+			memset(mem.host_pointer, 0, mem.memory_size());
 		}
 
 		if(mem.device_pointer) {
@@ -814,8 +830,6 @@ public:
 				uint32_t ptr = (uint32_t)mem.device_pointer;
 				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 			}
-
-			tex_interp_map[mem.device_pointer] = false;
 			return;
 		}
 
@@ -851,7 +865,7 @@ public:
 			default: assert(0); return;
 		}
 
-
+		CUDAMem *cmem = NULL;
 		CUarray array_3d = NULL;
 		size_t src_pitch = mem.data_width * dsize * mem.data_elements;
 		size_t dst_pitch = src_pitch;
@@ -878,7 +892,7 @@ public:
 			param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
 			param.dstArray = array_3d;
 			param.srcMemoryType = CU_MEMORYTYPE_HOST;
-			param.srcHost = (void*)mem.data_pointer;
+			param.srcHost = mem.host_pointer;
 			param.srcPitch = src_pitch;
 			param.WidthInBytes = param.srcPitch;
 			param.Height = mem.data_height;
@@ -889,6 +903,10 @@ public:
 			mem.device_pointer = (device_ptr)array_3d;
 			mem.device_size = size;
 			stats.mem_alloc(size);
+
+			cmem = &cuda_mem_map[&mem];
+			cmem->texobject = 0;
+			cmem->array = array_3d;
 		}
 		else if(mem.data_height > 1) {
 			/* 2D texture, using pitch aligned linear memory. */
@@ -897,7 +915,10 @@ public:
 			dst_pitch = align_up(src_pitch, alignment);
 			size_t dst_size = dst_pitch * mem.data_height;
 
-			generic_alloc(mem, dst_size - mem.memory_size());
+			cmem = generic_alloc(mem, dst_size - mem.memory_size());
+			if(!cmem) {
+				return;
+			}
 
 			CUDA_MEMCPY2D param;
 			memset(&param, 0, sizeof(param));
@@ -905,7 +926,7 @@ public:
 			param.dstDevice = mem.device_pointer;
 			param.dstPitch = dst_pitch;
 			param.srcMemoryType = CU_MEMORYTYPE_HOST;
-			param.srcHost = (void*)mem.data_pointer;
+			param.srcHost = mem.host_pointer;
 			param.srcPitch = src_pitch;
 			param.WidthInBytes = param.srcPitch;
 			param.Height = mem.data_height;
@@ -914,8 +935,12 @@ public:
 		}
 		else {
 			/* 1D texture, using linear memory. */
-			generic_alloc(mem);
-			cuda_assert(cuMemcpyHtoD(mem.device_pointer, (void*)mem.data_pointer, size));
+			cmem = generic_alloc(mem);
+			if(!cmem) {
+				return;
+			}
+
+			cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
 		}
 
 		if(!has_fermi_limits) {
@@ -932,7 +957,7 @@ public:
 			CUDA_RESOURCE_DESC resDesc;
 			memset(&resDesc, 0, sizeof(resDesc));
 
-			if(mem.data_depth > 1) {
+			if(array_3d) {
 				resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
 				resDesc.res.array.hArray = array_3d;
 				resDesc.flags = 0;
@@ -962,13 +987,7 @@ public:
 			texDesc.filterMode = filter_mode;
 			texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
 
-			CUtexObject tex = 0;
-			cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
-
-			/* Safety check */
-			if((uint)tex > UINT_MAX) {
-				assert(0);
-			}
+			cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
 
 			/* Resize once */
 			if(flat_slot >= texture_info.size()) {
@@ -979,20 +998,18 @@ public:
 
 			/* Set Mapping and tag that we need to (re-)upload to device */
 			TextureInfo& info = texture_info[flat_slot];
-			info.data = (uint64_t)tex;
+			info.data = (uint64_t)cmem->texobject;
 			info.cl_buffer = 0;
 			info.interpolation = mem.interpolation;
 			info.extension = mem.extension;
 			info.width = mem.data_width;
 			info.height = mem.data_height;
 			info.depth = mem.data_depth;
-
-			tex_bindless_map[mem.device_pointer] = tex;
 			need_texture_info = true;
 		}
 		else {
 			/* Fermi, fixed texture slots. */
-			if(mem.data_depth > 1) {
+			if(array_3d) {
 				cuda_assert(cuTexRefSetArray(texref, array_3d, CU_TRSA_OVERRIDE_FORMAT));
 			}
 			else if(mem.data_height > 1) {
@@ -1017,38 +1034,27 @@ public:
 				cuda_assert(cuTexRefSetAddressMode(texre

@@ Diff output truncated at 10240 characters. @@