[Bf-blender-cvs] [bd4bea3e98a] master: Cycles: avoid reallocating tile denoising memory many times during render.

Brecht Van Lommel noreply at git.blender.org
Thu Nov 9 20:39:57 CET 2017


Commit: bd4bea3e98a436521f9a7effcfed19cdf46eadfb
Author: Brecht Van Lommel
Date:   Wed Nov 8 20:15:38 2017 +0100
Branches: master
https://developer.blender.org/rBbd4bea3e98a436521f9a7effcfed19cdf46eadfb

Cycles: avoid reallocating tile denoising memory many times during render.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_denoising.cpp
M	intern/cycles/device/device_denoising.h
M	intern/cycles/device/device_memory.h
M	intern/cycles/device/opencl/opencl.h
M	intern/cycles/device/opencl/opencl_base.cpp
M	intern/cycles/device/opencl/opencl_mega.cpp
M	intern/cycles/device/opencl/opencl_split.cpp

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 0f4001ab1a6..ce02a5a932e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -712,12 +712,10 @@ public:
 		}
 	}
 
-	void denoise(DeviceTask &task, RenderTile &tile)
+	void denoise(DeviceTask &task, DenoisingTask& denoising, RenderTile &tile)
 	{
 		tile.sample = tile.start_sample + tile.num_samples;
 
-		DenoisingTask denoising(this);
-
 		denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
 		denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
 		denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
@@ -769,6 +767,8 @@ public:
 		}
 
 		RenderTile tile;
+		DenoisingTask denoising(this);
+
 		while(task.acquire_tile(this, tile)) {
 			if(tile.task == RenderTile::PATH_TRACE) {
 				if(use_split_kernel) {
@@ -780,7 +780,7 @@ public:
 				}
 			}
 			else if(tile.task == RenderTile::DENOISE) {
-				denoise(task, tile);
+				denoise(task, denoising, tile);
 			}
 
 			task.release_tile(tile);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d230a0c565d..a38340cb286 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1408,10 +1408,8 @@ public:
 		return !have_error();
 	}
 
-	void denoise(RenderTile &rtile, const DeviceTask &task)
+	void denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task)
 	{
-		DenoisingTask denoising(this);
-
 		denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
 		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
 		denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
@@ -1857,8 +1855,6 @@ public:
 		CUDAContextScope scope(this);
 
 		if(task->type == DeviceTask::RENDER) {
-			RenderTile tile;
-
 			DeviceRequestedFeatures requested_features;
 			if(use_split_kernel()) {
 				if(split_kernel == NULL) {
@@ -1870,6 +1866,9 @@ public:
 			device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
 
 			/* keep rendering tiles until done */
+			RenderTile tile;
+			DenoisingTask denoising(this);
+
 			while(task->acquire_tile(this, tile)) {
 				if(tile.task == RenderTile::PATH_TRACE) {
 					if(use_split_kernel()) {
@@ -1883,7 +1882,7 @@ public:
 				else if(tile.task == RenderTile::DENOISE) {
 					tile.sample = tile.start_sample + tile.num_samples;
 
-					denoise(tile, *task);
+					denoise(tile, denoising, *task);
 
 					task->update_progress(&tile, tile.w*tile.h);
 				}
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 2d39721e3d3..69c43e4a8cf 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -20,6 +20,27 @@
 
 CCL_NAMESPACE_BEGIN
 
+DenoisingTask::DenoisingTask(Device *device)
+: tiles_mem(device, "denoising tiles_mem", MEM_READ_WRITE),
+  storage(device),
+  buffer(device),
+  device(device)
+{
+}
+
+DenoisingTask::~DenoisingTask()
+{
+	storage.XtWX.free();
+	storage.XtWY.free();
+	storage.transform.free();
+	storage.rank.free();
+	storage.temporary_1.free();
+	storage.temporary_2.free();
+	storage.temporary_color.free();
+	buffer.mem.free();
+	tiles_mem.free();
+}
+
 void DenoisingTask::init_from_devicetask(const DeviceTask &task)
 {
 	radius = task.denoising_radius;
@@ -75,7 +96,7 @@ bool DenoisingTask::run_denoising()
 	buffer.w = align_up(rect.z - rect.x, 4);
 	buffer.h = rect.w - rect.y;
 	buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
-	buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes);
+	buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false);
 
 	device_ptr null_ptr = (device_ptr) 0;
 
@@ -159,11 +180,10 @@ bool DenoisingTask::run_denoising()
 		int variance_to[]   = {11, 12, 13};
 		int num_color_passes = 3;
 
-		device_only_memory<float> temp_color(device, "Denoising temporary color");
-		temp_color.alloc_to_device(3*buffer.pass_stride);
+		storage.temporary_color.alloc_to_device(3*buffer.pass_stride, false);
 
 		for(int pass = 0; pass < num_color_passes; pass++) {
-			device_sub_ptr color_pass(temp_color, pass*buffer.pass_stride, buffer.pass_stride);
+			device_sub_ptr color_pass(storage.temporary_color, pass*buffer.pass_stride, buffer.pass_stride);
 			device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride);
 			functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
 		}
@@ -172,28 +192,24 @@ bool DenoisingTask::run_denoising()
 			device_sub_ptr depth_pass    (buffer.mem,                                 0,   buffer.pass_stride);
 			device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
 			device_sub_ptr output_pass   (buffer.mem,     mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
-			functions.detect_outliers(temp_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+			functions.detect_outliers(storage.temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
 		}
-
-		temp_color.free();
 	}
 
 	storage.w = filter_area.z;
 	storage.h = filter_area.w;
-	storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE);
-	storage.rank.alloc_to_device(storage.w*storage.h);
+	storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE, false);
+	storage.rank.alloc_to_device(storage.w*storage.h, false);
 
 	functions.construct_transform();
 
-	device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1");
-	device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2");
-	temporary_1.alloc_to_device(buffer.w*buffer.h);
-	temporary_2.alloc_to_device(buffer.w*buffer.h);
-	reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
-	reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
+	storage.temporary_1.alloc_to_device(buffer.w*buffer.h, false);
+	storage.temporary_2.alloc_to_device(buffer.w*buffer.h, false);
+	reconstruction_state.temporary_1_ptr = storage.temporary_1.device_pointer;
+	reconstruction_state.temporary_2_ptr = storage.temporary_2.device_pointer;
 
-	storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE);
-	storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE);
+	storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
+	storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
 
 	reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
 	int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
@@ -210,14 +226,6 @@ bool DenoisingTask::run_denoising()
 		functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr);
 	}
 
-	storage.XtWX.free();
-	storage.XtWY.free();
-	storage.transform.free();
-	storage.rank.free();
-	temporary_1.free();
-	temporary_2.free();
-	buffer.mem.free();
-	tiles_mem.free();
 	return true;
 }
 
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 606f7422ac8..ec4e7933cdc 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -121,6 +121,9 @@ public:
 		device_only_memory<int>    rank;
 		device_only_memory<float>  XtWX;
 		device_only_memory<float3> XtWY;
+		device_only_memory<float>  temporary_1;
+		device_only_memory<float>  temporary_2;
+		device_only_memory<float>  temporary_color;
 		int w;
 		int h;
 
@@ -128,16 +131,15 @@ public:
 		: transform(device, "denoising transform"),
 		  rank(device, "denoising rank"),
 		  XtWX(device, "denoising XtWX"),
-		  XtWY(device, "denoising XtWY")
+		  XtWY(device, "denoising XtWY"),
+		  temporary_1(device, "denoising NLM temporary 1"),
+		  temporary_2(device, "denoising NLM temporary 2"),
+		  temporary_color(device, "denoising temporary color")
 		{}
 	} storage;
 
-	DenoisingTask(Device *device)
-	: tiles_mem(device, "denoising tiles_mem", MEM_READ_WRITE),
-	  storage(device),
-	  buffer(device),
-	  device(device)
-	{}
+	DenoisingTask(Device *device);
+	~DenoisingTask();
 
 	void init_from_devicetask(const DeviceTask &task);
 
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index a2866ae3984..453dab9bfb3 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -243,15 +243,29 @@ public:
 		free();
 	}
 
-	void alloc_to_device(size_t num)
+	void alloc_to_device(size_t num, bool shrink_to_fit = true)
 	{
-		data_size = num*sizeof(T);
-		device_alloc();
+		size_t new_size = num*sizeof(T);
+		bool reallocate;
+
+		if(shrink_to_fit) {
+			reallocate = (data_size != new_size);
+		}
+		else {
+			reallocate = (data_size < new_size);
+		}
+
+		if(reallocate) {
+			device_free();
+			data_size = new_size;
+			device_alloc();
+		}
 	}
 
 	void free()
 	{
 		device_free();
+		data_size = 0;
 	}
 
 	void zero_to_device()
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 55848c8112d..c02f8ffafe6 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -360,7 +360,7 @@ public:
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
 	void shader(DeviceTask& task);
 
-	void denoise(RenderTile& tile, const DeviceTask& task);
+	void denoise(RenderTile& tile, DenoisingTask& denoising, const DeviceTask& task);
 
 	class OpenCLDeviceTask : public DeviceTask {
 	public:
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index d4af392fdd2..f43177247ef 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -1066,10 +1066,8 @@ bool OpenCLDeviceBase::d

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list