[Bf-blender-cvs] [2e50add1643] master: Fix OpenCL performance regression after cubic interpolation.
Brecht Van Lommel
noreply at git.blender.org
Sun Oct 15 20:02:53 CEST 2017
Commit: 2e50add1643d1f37dd9bd412348135477f1c3504
Author: Brecht Van Lommel
Date: Sun Oct 15 17:40:01 2017 +0200
Branches: master
https://developer.blender.org/rB2e50add1643d1f37dd9bd412348135477f1c3504
Fix OpenCL performance regression after cubic interpolation.
Reorganize code to reduce register pressure.
===================================================================
M intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
M intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
===================================================================
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
index b7be4fe4409..5ca07eaeb05 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -87,7 +87,7 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObjec
g1x * tex2D<T>(tex, x1, y1));
}
-/* Fast tricubic texture lookup using 8 bilinear lookups. */
+/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z)
{
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index d908af78c7a..faa9dd66d0e 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -27,9 +27,21 @@ ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uin
#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)]
-ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+ x %= width;
+ if(x < 0)
+ x += width;
+ return x;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+ return clamp(x, 0, width-1);
+}
+
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_global TextureInfo *info, int id, int offset)
{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
const int texture_type = kernel_tex_type(id);
/* Float4 */
@@ -55,19 +67,45 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int o
}
}
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y)
{
- x %= width;
- if(x < 0)
- x += width;
- return x;
+ const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+ /* Wrap */
+ if(info->extension == EXTENSION_REPEAT) {
+ x = svm_image_texture_wrap_periodic(x, info->width);
+ y = svm_image_texture_wrap_periodic(y, info->height);
+ }
+ else {
+ x = svm_image_texture_wrap_clamp(x, info->width);
+ y = svm_image_texture_wrap_clamp(y, info->height);
+ }
+
+ int offset = x + info->width * y;
+ return svm_image_texture_read(kg, info, id, offset);
}
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z)
{
- return clamp(x, 0, width-1);
+ const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+ /* Wrap */
+ if(info->extension == EXTENSION_REPEAT) {
+ x = svm_image_texture_wrap_periodic(x, info->width);
+ y = svm_image_texture_wrap_periodic(y, info->height);
+ z = svm_image_texture_wrap_periodic(z, info->depth);
+ }
+ else {
+ x = svm_image_texture_wrap_clamp(x, info->width);
+ y = svm_image_texture_wrap_clamp(y, info->height);
+ z = svm_image_texture_wrap_clamp(z, info->depth);
+ }
+
+ int offset = x + info->width * y + info->width * info->height * z;
+ return svm_image_texture_read(kg, info, id, offset);
}
+
ccl_device_inline float svm_image_texture_frac(float x, int *ix)
{
int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
@@ -87,107 +125,52 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
{
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
- uint width = info->width;
- uint height = info->height;
- uint interpolation = info->interpolation;
- uint extension = info->extension;
+ if(info->extension == EXTENSION_CLIP) {
+ if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
- /* Actual sampling. */
- if(interpolation == INTERPOLATION_CLOSEST) {
+ if(info->interpolation == INTERPOLATION_CLOSEST) {
+ /* Closest interpolation. */
int ix, iy;
- svm_image_texture_frac(x*width, &ix);
- svm_image_texture_frac(y*height, &iy);
-
- if(extension == EXTENSION_REPEAT) {
- ix = svm_image_texture_wrap_periodic(ix, width);
- iy = svm_image_texture_wrap_periodic(iy, height);
- }
- else {
- if(extension == EXTENSION_CLIP) {
- if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
- /* Fall through. */
- /* EXTENSION_EXTEND */
- ix = svm_image_texture_wrap_clamp(ix, width);
- iy = svm_image_texture_wrap_clamp(iy, height);
- }
+ svm_image_texture_frac(x*info->width, &ix);
+ svm_image_texture_frac(y*info->height, &iy);
- return svm_image_texture_read(kg, id, ix + iy*width);
+ return svm_image_texture_read_2d(kg, id, ix, iy);
+ }
+ else if(info->interpolation == INTERPOLATION_LINEAR) {
+ /* Bilinear interpolation. */
+ int ix, iy;
+ float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+
+ float4 r;
+ r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy);
+ r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy);
+ r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1);
+ r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1);
+ return r;
}
else {
- /* Bilinear or bicubic interpolation. */
- int ix, iy, nix, niy;
- float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
-
- if(extension == EXTENSION_REPEAT) {
- ix = svm_image_texture_wrap_periodic(ix, width);
- iy = svm_image_texture_wrap_periodic(iy, height);
- nix = svm_image_texture_wrap_periodic(ix+1, width);
- niy = svm_image_texture_wrap_periodic(iy+1, height);
- }
- else {
- if(extension == EXTENSION_CLIP) {
- if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
- ix = svm_image_texture_wrap_clamp(ix, width);
- iy = svm_image_texture_wrap_clamp(iy, height);
- nix = svm_image_texture_wrap_clamp(ix+1, width);
- niy = svm_image_texture_wrap_clamp(iy+1, height);
- }
-
- if(interpolation == INTERPOLATION_LINEAR) {
- /* Bilinear interpolation. */
- float4 r;
- r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width);
- r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width);
- r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width);
- r += ty*tx*svm_image_texture_read(kg, id, nix + niy*width);
- return r;
- }
-
/* Bicubic interpolation. */
- int pix, piy, nnix, nniy;
- if(extension == EXTENSION_REPEAT) {
- pix = svm_image_texture_wrap_periodic(ix-1, width);
- piy = svm_image_texture_wrap_periodic(iy-1, height);
- nnix = svm_image_texture_wrap_periodic(ix+2, width);
- nniy = svm_image_texture_wrap_periodic(iy+2, height);
- }
- else {
- pix = svm_image_texture_wrap_clamp(ix-1, width);
- piy = svm_image_texture_wrap_clamp(iy-1, height);
- nnix = svm_image_texture_wrap_clamp(ix+2, width);
- nniy = svm_image_texture_wrap_clamp(iy+2, height);
- }
+ int ix, iy;
+ float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
- const int xc[4] = {pix, ix, nix, nnix};
- const int yc[4] = {width * piy,
- width * iy,
- width * niy,
- width * nniy};
float u[4], v[4];
- /* Some helper macro to keep code reasonable size,
- * let compiler to inline all the matrix multiplications.
- */
-#define DATA(x, y) (svm_image_texture_read(kg, id, xc[x] + yc[y]))
-#define TERM(col) \
- (v[col] * (u[0] * DATA(0, col) + \
- u[1] * DATA(1, col) + \
- u[2] * DATA(2, col) + \
- u[3] * DATA(3, col)))
-
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
- /* Actual interpolation. */
- return TERM(0) + TERM(1) + TERM(2) + TERM(3);
-#undef TERM
-#undef DATA
+ float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for(int y = 0; y < 4; y++) {
+ for(int x = 0; x < 4; x++) {
+ float weight = u[x]*v[y];
+ r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1);
+ }
+ }
+ return r;
}
}
@@ -196,145 +179,67 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
{
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
- uint width = info->width;
- uint height = info->height;
- uint depth = info->depth;
+ if(info->extension == EXTENSION_CLIP) {
+ if(x < 0.0f || y < 0.0f || z < 0.0f ||
+ x > 1.0f || y > 1.0f || z > 1.0f)
+ {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
+
uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp;
- uint extension = info->extension;
- /* Actual sampling. */
if(interpolation == INTERPOLATION_CLOSEST) {
+ /* Closest interpolation. */
int ix, iy, iz;
- svm_image_texture_frac(x*width, &ix);
- svm_image_texture_frac(y*height, &iy);
- svm_image_texture_frac(z*depth, &iz);
-
- if(extension == EXTENSION_REPEAT) {
- ix = svm_image_texture_wrap_periodic(ix, width);
- iy = svm_image_texture_wrap_periodic(iy, height);
- iz = svm_image_texture_wrap_periodic(iz, depth);
- }
- else {
- if(extension == EXTENSION_CLIP) {
- if(x < 0.0f || y < 0.0f || z < 0.0f ||
- x > 1.0f || y > 1.0f || z > 1.0f)
- {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
- /* Fall through. */
- /* EXTENSION_EXTEND */
- ix = svm_image_texture_wrap_clamp(ix, width);
- iy = svm_image_texture_wrap_clamp(iy, height);
- iz = svm_image_texture_wrap_clamp(iz, depth);
- }
- return svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+ svm_image_texture_frac(x*info->width, &ix);
+ svm_image_texture_frac(y*info->height, &iy);
+ svm_image_texture_frac(z*info->depth, &iz);
+
+ return svm_image_texture_read_3d(kg, id, ix, iy, iz);
+ }
+ else if(interpolation == INTERPOLATION_LINEAR) {
+ /* Trilinear interpolation. */
+ int ix, iy, iz
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list