[Bf-blender-cvs] [2e50add1643] master: Fix OpenCL performance regression after cubic interpolation.

Brecht Van Lommel noreply at git.blender.org
Sun Oct 15 20:02:53 CEST 2017


Commit: 2e50add1643d1f37dd9bd412348135477f1c3504
Author: Brecht Van Lommel
Date:   Sun Oct 15 17:40:01 2017 +0200
Branches: master
https://developer.blender.org/rB2e50add1643d1f37dd9bd412348135477f1c3504

Fix OpenCL performance regression after cubic interpolation.

Reorganize code to reduce register pressure.

===================================================================

M	intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
M	intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h

===================================================================

diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
index b7be4fe4409..5ca07eaeb05 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -87,7 +87,7 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObjec
 	                       g1x * tex2D<T>(tex, x1, y1));
 }
 
-/* Fast tricubic texture lookup using 8 bilinear lookups. */
+/* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
 ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z)
 {
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index d908af78c7a..faa9dd66d0e 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -27,9 +27,21 @@ ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uin
 
 #define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)]
 
-ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+	x %= width;
+	if(x < 0)
+		x += width;
+	return x;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+	return clamp(x, 0, width-1);
+}
+
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_global TextureInfo *info, int id, int offset)
 {
-	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
 	const int texture_type = kernel_tex_type(id);
 
 	/* Float4 */
@@ -55,19 +67,45 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int o
 	}
 }
 
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y)
 {
-	x %= width;
-	if(x < 0)
-		x += width;
-	return x;
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+	/* Wrap */
+	if(info->extension == EXTENSION_REPEAT) {
+		x = svm_image_texture_wrap_periodic(x, info->width);
+		y = svm_image_texture_wrap_periodic(y, info->height);
+	}
+	else {
+		x = svm_image_texture_wrap_clamp(x, info->width);
+		y = svm_image_texture_wrap_clamp(y, info->height);
+	}
+
+	int offset = x + info->width * y;
+	return svm_image_texture_read(kg, info, id, offset);
 }
 
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z)
 {
-	return clamp(x, 0, width-1);
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+	/* Wrap */
+	if(info->extension == EXTENSION_REPEAT) {
+		x = svm_image_texture_wrap_periodic(x, info->width);
+		y = svm_image_texture_wrap_periodic(y, info->height);
+		z = svm_image_texture_wrap_periodic(z, info->depth);
+	}
+	else {
+		x = svm_image_texture_wrap_clamp(x, info->width);
+		y = svm_image_texture_wrap_clamp(y, info->height);
+		z = svm_image_texture_wrap_clamp(z, info->depth);
+	}
+
+	int offset = x + info->width * y + info->width * info->height * z;
+	return svm_image_texture_read(kg, info, id, offset);
 }
 
+
 ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 {
 	int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
@@ -87,107 +125,52 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
 {
 	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
 
-	uint width = info->width;
-	uint height = info->height;
-	uint interpolation = info->interpolation;
-	uint extension = info->extension;
+	if(info->extension == EXTENSION_CLIP) {
+		if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+	}
 
-	/* Actual sampling. */
-	if(interpolation == INTERPOLATION_CLOSEST) {
+	if(info->interpolation == INTERPOLATION_CLOSEST) {
+		/* Closest interpolation. */
 		int ix, iy;
-		svm_image_texture_frac(x*width, &ix);
-		svm_image_texture_frac(y*height, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			/* Fall through. */
-			/* EXTENSION_EXTEND */
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-		}
+		svm_image_texture_frac(x*info->width, &ix);
+		svm_image_texture_frac(y*info->height, &iy);
 
-		return svm_image_texture_read(kg, id, ix + iy*width);
+		return svm_image_texture_read_2d(kg, id, ix, iy);
+	}
+	else if(info->interpolation == INTERPOLATION_LINEAR) {
+		/* Bilinear interpolation. */
+		int ix, iy;
+		float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+
+		float4 r;
+		r =  (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy);
+		r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy);
+		r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1);
+		r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1);
+		return r;
 	}
 	else {
-		/* Bilinear or bicubic interpolation. */
-		int ix, iy, nix, niy;
-		float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
-		float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-			nix = svm_image_texture_wrap_periodic(ix+1, width);
-			niy = svm_image_texture_wrap_periodic(iy+1, height);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-			nix = svm_image_texture_wrap_clamp(ix+1, width);
-			niy = svm_image_texture_wrap_clamp(iy+1, height);
-		}
-
-		if(interpolation == INTERPOLATION_LINEAR) {
-			/* Bilinear interpolation. */
-			float4 r;
-			r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width);
-			r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width);
-			r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width);
-			r += ty*tx*svm_image_texture_read(kg, id, nix + niy*width);
-			return r;
-		}
-
 		/* Bicubic interpolation. */
-		int pix, piy, nnix, nniy;
-		if(extension == EXTENSION_REPEAT) {
-			pix = svm_image_texture_wrap_periodic(ix-1, width);
-			piy = svm_image_texture_wrap_periodic(iy-1, height);
-			nnix = svm_image_texture_wrap_periodic(ix+2, width);
-			nniy = svm_image_texture_wrap_periodic(iy+2, height);
-		}
-		else {
-			pix = svm_image_texture_wrap_clamp(ix-1, width);
-			piy = svm_image_texture_wrap_clamp(iy-1, height);
-			nnix = svm_image_texture_wrap_clamp(ix+2, width);
-			nniy = svm_image_texture_wrap_clamp(iy+2, height);
-		}
+		int ix, iy;
+		float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
 
-		const int xc[4] = {pix, ix, nix, nnix};
-		const int yc[4] = {width * piy,
-		                   width * iy,
-		                   width * niy,
-		                   width * nniy};
 		float u[4], v[4];
-		/* Some helper macro to keep code reasonable size,
-		 * let compiler to inline all the matrix multiplications.
-		 */
-#define DATA(x, y) (svm_image_texture_read(kg, id, xc[x] + yc[y]))
-#define TERM(col) \
-		(v[col] * (u[0] * DATA(0, col) + \
-		           u[1] * DATA(1, col) + \
-		           u[2] * DATA(2, col) + \
-		           u[3] * DATA(3, col)))
-
 		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
 		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
 
-		/* Actual interpolation. */
-		return TERM(0) + TERM(1) + TERM(2) + TERM(3);
-#undef TERM
-#undef DATA
+		float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		for(int y = 0; y < 4; y++) {
+			for(int x = 0; x < 4; x++) {
+				float weight = u[x]*v[y];
+				r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1);
+			}
+		}
+		return r;
 	}
 }
 
@@ -196,145 +179,67 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
 {
 	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
 
-	uint width = info->width;
-	uint height = info->height;
-	uint depth = info->depth;
+	if(info->extension == EXTENSION_CLIP) {
+		if(x < 0.0f || y < 0.0f || z < 0.0f ||
+		   x > 1.0f || y > 1.0f || z > 1.0f)
+		{
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+	}
+
 	uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp;
-	uint extension = info->extension;
 
-	/* Actual sampling. */
 	if(interpolation == INTERPOLATION_CLOSEST) {
+		/* Closest interpolation. */
 		int ix, iy, iz;
-		svm_image_texture_frac(x*width, &ix);
-		svm_image_texture_frac(y*height, &iy);
-		svm_image_texture_frac(z*depth, &iz);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-			iz = svm_image_texture_wrap_periodic(iz, depth);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || z < 0.0f ||
-				   x > 1.0f || y > 1.0f || z > 1.0f)
-				{
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			/* Fall through. */
-			/* EXTENSION_EXTEND */
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-			iz = svm_image_texture_wrap_clamp(iz, depth);
-		}
-		return svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+		svm_image_texture_frac(x*info->width, &ix);
+		svm_image_texture_frac(y*info->height, &iy);
+		svm_image_texture_frac(z*info->depth, &iz);
+
+		return svm_image_texture_read_3d(kg, id, ix, iy, iz);
+	}
+	else if(interpolation == INTERPOLATION_LINEAR) {
+		/* Trilinear interpolation. */
+		int ix, iy, iz

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list