[Bf-blender-cvs] [1ae08a9978] temp-cycles-denoising: Cycles Denoising: Use correct alignment of denoising buffer offsets for OpenCL
Lukas Stockner
noreply at git.blender.org
Sat Mar 25 01:45:51 CET 2017
Commit: 1ae08a9978e4495e7c0ae4735615bfddc676cb9d
Author: Lukas Stockner
Date: Sat Mar 25 01:28:42 2017 +0100
Branches: temp-cycles-denoising
https://developer.blender.org/rB1ae08a9978e4495e7c0ae4735615bfddc676cb9d
Cycles Denoising: Use correct alignment of denoising buffer offsets for OpenCL
===================================================================
M intern/cycles/device/device.h
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/device/device_denoising.cpp
M intern/cycles/device/opencl/opencl.h
M intern/cycles/device/opencl/opencl_base.cpp
M intern/cycles/device/opencl/opencl_util.cpp
M intern/cycles/filter/filter_nlm_cpu.h
M intern/cycles/filter/filter_nlm_gpu.h
M intern/cycles/filter/filter_reconstruction.h
M intern/cycles/filter/filter_transform.h
M intern/cycles/filter/filter_transform_gpu.h
M intern/cycles/filter/filter_transform_sse.h
M intern/cycles/filter/kernels/cpu/filter_cpu.h
M intern/cycles/filter/kernels/cpu/filter_cpu_impl.h
M intern/cycles/filter/kernels/cuda/filter.cu
M intern/cycles/filter/kernels/opencl/filter.cl
===================================================================
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 397a911f1b..bd26cb66ba 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -256,6 +256,7 @@ public:
int y, int w, int h, int elem) = 0;
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
+ virtual int mem_get_offset_alignment() { return 1; }
virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type)
{
/* Only required for devices that implement denoising. */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index b0fd77eb07..45e3fe491e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -185,9 +185,9 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
- KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float)> filter_construct_transform_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
- KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+ KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -454,6 +454,7 @@ public:
(float*) task->storage.transform.device_pointer,
(int*) task->storage.rank.device_pointer,
&task->rect.x,
+ task->buffer.pass_stride,
task->radius,
task->pca_threshold);
}
@@ -507,7 +508,8 @@ public:
&task->reconstruction_state.filter_rect.x,
task->buffer.w,
task->buffer.h,
- 4);
+ 4,
+ task->buffer.pass_stride);
}
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 8c9db9bab2..daccc4d122 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1022,7 +1022,8 @@ public:
&task->filter_area,
&task->rect,
&task->radius,
- &task->pca_threshold};
+ &task->pca_threshold,
+ &task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
cuda_assert(cuCtxSynchronize());
@@ -1117,7 +1118,8 @@ public:
&task->reconstruction_state.filter_rect,
&task->buffer.w,
&task->buffer.h,
- &f};
+ &f,
+ &task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
}
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 308c8e0ff1..72a56435ec 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -71,7 +71,7 @@ bool DenoisingTask::run_denoising()
buffer.passes = use_cross_denoising? 20 : 14;
buffer.w = align_up(rect.z - rect.x, 4);
buffer.h = rect.w - rect.y;
- buffer.pass_stride = buffer.w * buffer.h;
+ buffer.pass_stride = align_up(buffer.w * buffer.h, device->mem_get_offset_alignment());
buffer.mem.resize(buffer.pass_stride * buffer.passes);
device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index a06420ddfe..a05feccccc 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -133,6 +133,8 @@ public:
cl_int* error = NULL);
static cl_device_type get_device_type(cl_device_id device_id);
+ static int get_base_align_bytes(cl_device_id device_id);
+
/* Get somewhat more readable device name.
* Main difference is AMD OpenCL here which only gives code name
* for the regular device name. This will give more sane device
@@ -324,7 +326,8 @@ public:
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
void mem_free(device_memory& mem);
- virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type);
+ int mem_get_offset_alignment();
+ device_ptr mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type);
void const_copy_to(const char *name, void *host, size_t size);
void tex_alloc(const char *name,
device_memory& mem,
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index f10e45bf35..db1efe14e7 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -417,6 +417,11 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
}
}
+int OpenCLDeviceBase::mem_get_offset_alignment()
+{
+ return OpenCLInfo::get_base_align_bytes(cdDevice);
+}
+
device_ptr OpenCLDeviceBase::mem_get_offset_ptr(device_memory& mem, int offset, int size, MemoryType type)
{
cl_mem_flags mem_flag;
@@ -669,6 +674,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
rank_mem,
task->filter_area,
task->rect,
+ task->buffer.pass_stride,
task->radius,
task->pca_threshold);
@@ -774,7 +780,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
task->reconstruction_state.filter_rect,
task->buffer.w,
task->buffer.h,
- f);
+ f,
+ task->buffer.pass_stride);
enqueue_kernel(ckNLMConstructGramian,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h,
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 1f5b9ee089..1a8417d3f3 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -1069,6 +1069,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
return get_device_name(device_id);
}
+int OpenCLInfo::get_base_align_bytes(cl_device_id device_id)
+{
+ int base_align_bits;
+ if(clGetDeviceInfo(device_id,
+ CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+ sizeof(int),
+ &base_align_bits,
+ NULL) == CL_SUCCESS)
+ {
+ return base_align_bits/8;
+ }
+ return 1;
+}
+
CCL_NAMESPACE_END
#endif
diff --git a/intern/cycles/filter/filter_nlm_cpu.h b/intern/cycles/filter/filter_nlm_cpu.h
index c498057f82..1967f35fde 100644
--- a/intern/cycles/filter/filter_nlm_cpu.h
+++ b/intern/cycles/filter/filter_nlm_cpu.h
@@ -120,7 +120,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
float3 *XtWY,
int4 rect,
int4 filter_rect,
- int w, int h, int f)
+ int w, int h, int f,
+ int pass_stride)
{
/* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
@@ -141,7 +142,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
int *l_rank = rank + storage_ofs;
- kernel_filter_construct_gramian(x, y, 1, dx, dy, w, h,
+ kernel_filter_construct_gramian(x, y, 1,
+ dx, dy, w, h,
+ pass_stride,
buffer,
color_pass, variance_pass,
l_transform, l_rank,
diff --git a/intern/cycles/filter/filter_nlm_gpu.h b/intern/cycles/filter/filter_nlm_gpu.h
index 10330e313a..a3fcc2e27b 100644
--- a/intern/cycles/filter/filter_nlm_gpu.h
+++ b/intern/cycles/filter/filter_nlm_gpu.h
@@ -106,6 +106,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
int4 rect,
int4 filter_rect,
int w, int h, int f,
+ i
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list