[Bf-blender-cvs] [adadfaa] master: Fix (unreported) fully broken 'sanitize utf-8' helper.

Sun Jan 1 02:29:24 CET 2017

Commit: adadfaad880b2ad56fde72a38b13d42b512de81d
Author: Bastien Montagne
Date:   Sun Jan 1 02:15:42 2017 +0100
Branches: master
https://developer.blender.org/rBadadfaad880b2ad56fde72a38b13d42b512de81d

Fix (unreported) fully broken 'sanitize utf-8' helper.

That code was a joke: it let some invalid utf-8 bytes pass, returned a
wrong offset for some invalid sequences, let length and pointer easily
go out of sync, 'forgot' the final NUL byte in the memmove call, etc.
etc.

The miracle here is that we could survive using this for so long!
Probably because we do not use utf-8 sanitizing enough in Blender,
actually... :/

===================================================================

M	source/blender/blenlib/intern/string_utf8.c

===================================================================

diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index 9603361..0743786 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -47,6 +47,19 @@
 
 // #define DEBUG_STRSIZE
 
+/* array copied from glib's gutf8.c, */
+/* Note: last two values (0xfe and 0xff) are forbidden in utf-8, so they are considered 1 byte length too. */
+static const size_t utf8_skip_data[256] = {
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
+};
+
 /* from libswish3, originally called u8_isvalid(),
  * modified to return the index of the bad character (byte index not utf).
  * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
@@ -56,73 +69,91 @@
  * length is in bytes, since without knowing whether the string is valid
  * it's hard to know how many characters there are! */
 
-static const char trailingBytesForUTF8[256] = {
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
-};
-
+/**
+ * Find first utf-8 invalid byte in given \a str, of \a length bytes.
+ *
+ * \return the offset of the first invalid byte.
+ */
 int BLI_utf8_invalid_byte(const char *str, int length)
 {
-	const unsigned char *p, *pend = (const unsigned char *)str + length;
+	const unsigned char *p, *perr, *pend = (const unsigned char *)str + length;
 	unsigned char c;
 	int ab;
 
-	for (p = (const unsigned char *)str; p < pend; p++) {
+	for (p = (const unsigned char *)str; p < pend; p++, length--) {
 		c = *p;
+		perr = p;  /* Erroneous char is always the first of an invalid utf8 sequence... */
+		if (ELEM(c, 0xfe, 0xff, 0x00))  /* Those three values are not allowed in utf8 string. */
+			goto utf8_error;
 		if (c < 128)
 			continue;
 		if ((c & 0xc0) != 0xc0)
 			goto utf8_error;
-		ab = trailingBytesForUTF8[c];
-		if (length < ab)
+
+		/* Note that since we always increase p (and decrease length) by one byte in main loop, we only add/subtract
+		 * extra utf8 bytes in code below
+		 * (ab number, aka number of bytes remaining in the utf8 sequence after the initial one). */
+		ab = utf8_skip_data[c] - 1;
+		if (length <= ab) {
 			goto utf8_error;
-		length -= ab;
+		}
 
-		p++;
 		/* Check top bits in the second byte */
+		p++;
+		length--;
 		if ((*p & 0xc0) != 0x80)
 			goto utf8_error;
 
 		/* Check for overlong sequences for each different length */
 		switch (ab) {
-			/* Check for xx00 000x */
-		case 1:
-			if ((c & 0x3e) == 0) goto utf8_error;
-			continue;   /* We know there aren't any more bytes to check */
-
-			/* Check for 1110 0000, xx0x xxxx */
-		case 2:
-			if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
-			break;
-
-			/* Check for 1111 0000, xx00 xxxx */
-		case 3:
-			if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
-			break;
-
-			/* Check for 1111 1000, xx00 0xxx */
-		case 4:
-			if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
-			break;
-
-			/* Check for leading 0xfe or 0xff,
-			 * and then for 1111 1100, xx00 00xx */
-		case 5:
-			if (c == 0xfe || c == 0xff ||
-			    (c == 0xfc && (*p & 0x3c) == 0)) goto utf8_error;
-			break;
+			case 1:
+				/* Check for xx00 000x */
+				if ((c & 0x3e) == 0) goto utf8_error;
+				continue;   /* We know there aren't any more bytes to check */
+
+			case 2:
+				/* Check for 1110 0000, xx0x xxxx */
+				if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
+				/* Some special cases, see section 5 of utf-8 decoder stress-test by Markus Kuhn
+				 * (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */
+				/* From section 5.1 (and 5.2) */
+				if (c == 0xed) {
+					if (*p == 0xa0 && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xad && *(p + 1) == 0xbf) goto utf8_error;
+					if (*p == 0xae && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xaf && *(p + 1) == 0xbf) goto utf8_error;
+					if (*p == 0xb0 && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xbe && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error;
+				}
+				/* From section 5.3 */
+				if (c == 0xef) {
+					if (*p == 0xbf && *(p + 1) == 0xbe) goto utf8_error;
+					if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error;
+				}
+				break;
+
+			case 3:
+				/* Check for 1111 0000, xx00 xxxx */
+				if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
+				break;
+
+			case 4:
+				/* Check for 1111 1000, xx00 0xxx */
+				if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
+				break;
+
+			case 5:
+				/* Check for 1111 1100, xx00 00xx */
+				if (c == 0xfc && (*p & 0x3c) == 0) goto utf8_error;
+				break;
 		}
 
 		/* Check for valid bytes after the 2nd, if any; all must start 10 */
 		while (--ab > 0) {
-			if ((*(p + 1) & 0xc0) != 0x80) goto utf8_error;
-			p++; /* do this after so we get usable offset - campbell */
+			p++;
+			length--;
+			if ((*p & 0xc0) != 0x80) goto utf8_error;
 		}
 	}
 
@@ -130,7 +161,7 @@ int BLI_utf8_invalid_byte(const char *str, int length)
 
 utf8_error:
 
-	return (int)((const char *)p - (const char *)str) - 1;
+	return (int)((const char *)perr - (const char *)str);
 }
 
 int BLI_utf8_invalid_strip(char *str, int length)
@@ -141,7 +172,7 @@ int BLI_utf8_invalid_strip(char *str, int length)
 
 	while ((bad_char = BLI_utf8_invalid_byte(str, length)) != -1) {
 		str += bad_char;
-		length -= bad_char;
+		length -= (bad_char + 1);
 
 		if (length == 0) {
 			/* last character bad, strip it */
@@ -151,7 +182,7 @@ int BLI_utf8_invalid_strip(char *str, int length)
 		}
 		else {
 			/* strip, keep looking */
-			memmove(str, str + 1, (size_t)length);
+			memmove(str, str + 1, (size_t)length + 1);  /* +1 for NULL char! */
 			tot++;
 		}
 	}
@@ -162,20 +193,6 @@ int BLI_utf8_invalid_strip(char *str, int length)
 
 /* compatible with BLI_strncpy, but esnure no partial utf8 chars */
 
-/* array copied from glib's gutf8.c,
- * note: this looks to be at odd's with 'trailingBytesForUTF8',
- * need to find out what gives here! - campbell */
-static const size_t utf8_skip_data[256] = {
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
-};
-
 #define BLI_STR_UTF8_CPY(dst, src, maxncpy)                                   \
 	{                                                                         \
 		size_t utf8_size;                                                     \