[Bf-blender-cvs] [1ae08a9978] temp-cycles-denoising: Cycles Denoising: Use correct alignment of denoising buffer offsets for OpenCL

Lukas Stockner noreply at git.blender.org
Sat Mar 25 01:45:51 CET 2017


Commit: 1ae08a9978e4495e7c0ae4735615bfddc676cb9d
Author: Lukas Stockner
Date:   Sat Mar 25 01:28:42 2017 +0100
Branches: temp-cycles-denoising
https://developer.blender.org/rB1ae08a9978e4495e7c0ae4735615bfddc676cb9d

Cycles Denoising: Use correct alignment of denoising buffer offsets for OpenCL

===================================================================

M	intern/cycles/device/device.h
M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_denoising.cpp
M	intern/cycles/device/opencl/opencl.h
M	intern/cycles/device/opencl/opencl_base.cpp
M	intern/cycles/device/opencl/opencl_util.cpp
M	intern/cycles/filter/filter_nlm_cpu.h
M	intern/cycles/filter/filter_nlm_gpu.h
M	intern/cycles/filter/filter_reconstruction.h
M	intern/cycles/filter/filter_transform.h
M	intern/cycles/filter/filter_transform_gpu.h
M	intern/cycles/filter/filter_transform_sse.h
M	intern/cycles/filter/kernels/cpu/filter_cpu.h
M	intern/cycles/filter/kernels/cpu/filter_cpu_impl.h
M	intern/cycles/filter/kernels/cuda/filter.cu
M	intern/cycles/filter/kernels/opencl/filter.cl

===================================================================

diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 397a911f1b..bd26cb66ba 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -256,6 +256,7 @@ public:
 		int y, int w, int h, int elem) = 0;
 	virtual void mem_zero(device_memory& mem) = 0;
 	virtual void mem_free(device_memory& mem) = 0;
+	virtual int mem_get_offset_alignment() { return 1; }
 	virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type)
 	{
 		/* Only required for devices that implement denoising. */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index b0fd77eb07..45e3fe491e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -185,9 +185,9 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
 
-	KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float)>                                         filter_construct_transform_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
-	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                                  filter_finalize_kernel;
+	KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, int, float)>                                         filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                                       filter_finalize_kernel;
 
 	KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
 	                       ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -454,6 +454,7 @@ public:
 				                                    (float*) task->storage.transform.device_pointer,
 				                                    (int*)   task->storage.rank.device_pointer,
 				                                    &task->rect.x,
+				                                    task->buffer.pass_stride,
 				                                    task->radius,
 				                                    task->pca_threshold);
 			}
@@ -507,7 +508,8 @@ public:
 			                                      &task->reconstruction_state.filter_rect.x,
 			                                      task->buffer.w,
 			                                      task->buffer.h,
-			                                      4);
+			                                      4,
+			                                      task->buffer.pass_stride);
 		}
 		for(int y = 0; y < task->filter_area.w; y++) {
 			for(int x = 0; x < task->filter_area.z; x++) {
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 8c9db9bab2..daccc4d122 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1022,7 +1022,8 @@ public:
 		                &task->filter_area,
 		                &task->rect,
 		                &task->radius,
-		                &task->pca_threshold};
+		                &task->pca_threshold,
+		                &task->buffer.pass_stride};
 		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
 		cuda_assert(cuCtxSynchronize());
 
@@ -1117,7 +1118,8 @@ public:
 			                                  &task->reconstruction_state.filter_rect,
 			                                  &task->buffer.w,
 			                                  &task->buffer.h,
-			                                  &f};
+			                                  &f,
+		                                      &task->buffer.pass_stride};
 			CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
 		}
 
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 308c8e0ff1..72a56435ec 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -71,7 +71,7 @@ bool DenoisingTask::run_denoising()
 	buffer.passes = use_cross_denoising? 20 : 14;
 	buffer.w = align_up(rect.z - rect.x, 4);
 	buffer.h = rect.w - rect.y;
-	buffer.pass_stride = buffer.w * buffer.h;
+	buffer.pass_stride = align_up(buffer.w * buffer.h, device->mem_get_offset_alignment());
 	buffer.mem.resize(buffer.pass_stride * buffer.passes);
 	device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
 
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index a06420ddfe..a05feccccc 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -133,6 +133,8 @@ public:
 	                            cl_int* error = NULL);
 	static cl_device_type get_device_type(cl_device_id device_id);
 
+	static int get_base_align_bytes(cl_device_id device_id);
+
 	/* Get somewhat more readable device name.
 	 * Main difference is AMD OpenCL here which only gives code name
 	 * for the regular device name. This will give more sane device
@@ -324,7 +326,8 @@ public:
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
 	void mem_free(device_memory& mem);
-	virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type);
+	int mem_get_offset_alignment();
+	device_ptr mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type);
 	void const_copy_to(const char *name, void *host, size_t size);
 	void tex_alloc(const char *name,
 	               device_memory& mem,
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index f10e45bf35..db1efe14e7 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -417,6 +417,11 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
 	}
 }
 
+int OpenCLDeviceBase::mem_get_offset_alignment()
+{
+	return OpenCLInfo::get_base_align_bytes(cdDevice);
+}
+
 device_ptr OpenCLDeviceBase::mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type)
 {
 	cl_mem_flags mem_flag;
@@ -669,6 +674,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
 	                rank_mem,
 	                task->filter_area,
 	                task->rect,
+	                task->buffer.pass_stride,
 	                task->radius,
 	                task->pca_threshold);
 
@@ -774,7 +780,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 		                task->reconstruction_state.filter_rect,
 		                task->buffer.w,
 		                task->buffer.h,
-		                f);
+		                f,
+	                    task->buffer.pass_stride);
 		enqueue_kernel(ckNLMConstructGramian,
 		               task->reconstruction_state.source_w,
 		               task->reconstruction_state.source_h,
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 1f5b9ee089..1a8417d3f3 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -1069,6 +1069,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
 	return get_device_name(device_id);
 }
 
+int OpenCLInfo::get_base_align_bytes(cl_device_id device_id)
+{
+	int base_align_bits;
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+	                   sizeof(int),
+	                   &base_align_bits,
+	                   NULL) == CL_SUCCESS)
+	{
+		return base_align_bits/8;
+	}
+	return 1;
+}
+
 CCL_NAMESPACE_END
 
 #endif
diff --git a/intern/cycles/filter/filter_nlm_cpu.h b/intern/cycles/filter/filter_nlm_cpu.h
index c498057f82..1967f35fde 100644
--- a/intern/cycles/filter/filter_nlm_cpu.h
+++ b/intern/cycles/filter/filter_nlm_cpu.h
@@ -120,7 +120,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                            float3 *XtWY,
                                                            int4 rect,
                                                            int4 filter_rect,
-                                                           int w, int h, int f)
+                                                           int w, int h, int f,
+                                                           int pass_stride)
 {
 	/* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
 	for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
@@ -141,7 +142,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
 			float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
 			int    *l_rank = rank + storage_ofs;
 
-			kernel_filter_construct_gramian(x, y, 1, dx, dy, w, h,
+			kernel_filter_construct_gramian(x, y, 1,
+			                                dx, dy, w, h,
+			                                pass_stride,
 			                                buffer,
 			                                color_pass, variance_pass,
 			                                l_transform, l_rank,
diff --git a/intern/cycles/filter/filter_nlm_gpu.h b/intern/cycles/filter/filter_nlm_gpu.h
index 10330e313a..a3fcc2e27b 100644
--- a/intern/cycles/filter/filter_nlm_gpu.h
+++ b/intern/cycles/filter/filter_nlm_gpu.h
@@ -106,6 +106,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
                                                            int4 rect,
                                                            int4 filter_rect,
                                                            int w, int h, int f,
+                                                           i

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list