[Bf-blender-cvs] [acbb84b] master: Add BLI_string_utf8 specific test.

Sun Jan 1 02:29:23 CET 2017

Commit: acbb84b021a145ee2dde73ec5923990f42fc18fb
Author: Bastien Montagne
Date:   Sat Dec 31 16:06:51 2016 +0100
Branches: master
https://developer.blender.org/rBacbb84b021a145ee2dde73ec5923990f42fc18fb

Add BLI_string_utf8 specific test.

This test should ensure we correctly detect all invalid utf-8 sequences in a given string.

DISCLAIMER:
Do not run this with current code - you'll either laugh or cry, nearly *all* checks fail!

Based on utf-8 decoder stress-test (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt)
by Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0

===================================================================

A	tests/gtests/blenlib/BLI_string_utf8_test.cc
M	tests/gtests/blenlib/CMakeLists.txt

===================================================================

diff --git a/tests/gtests/blenlib/BLI_string_utf8_test.cc b/tests/gtests/blenlib/BLI_string_utf8_test.cc
new file mode 100644
index 0000000..c0beb92
--- /dev/null
+++ b/tests/gtests/blenlib/BLI_string_utf8_test.cc
@@ -0,0 +1,304 @@
+/* Apache License, Version 2.0 */
+
+#include "testing/testing.h"
+
+extern "C" {
+#include "BLI_utildefines.h"
+#include "BLI_string.h"
+#include "BLI_string_utf8.h"
+}
+
+/* Note that 'common' utf-8 variants of string functions (like copy, etc.) are tested in BLI_string_test.cc.
+ * However, the tests below are utf-8 conformance specific, and since they take up quite a lot of lines,
+ * they deserve their own file. */
+
+/* -------------------------------------------------------------------- */
+/* stubs */
+
+extern "C" {  /* give the stubs below C linkage */
+
+int mk_wcwidth(wchar_t ucs);  /* NOTE(review): presumably declared here to satisfy references from the tested library - confirm */
+int mk_wcswidth(const wchar_t *pwcs, size_t n);
+
+int mk_wcwidth(wchar_t ucs)  /* stub: ignores `ucs` and always returns 0 */
+{
+	return 0;
+}
+
+int mk_wcswidth(const wchar_t *pwcs, size_t n)  /* stub: ignores `pwcs` and `n`, always returns 0 */
+{
+	return 0;
+}
+
+}  /* extern "C" */
+
+/* -------------------------------------------------------------------- */
+/* tests */
+
+/* Each test is made of a 79-byte (80 with the NUL terminator) string to test, the expected result after
+ * stripping invalid utf8 bytes, and a single-byte string encoding the expected number of errors.
+ *
+ * Based on utf-8 decoder stress-test (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt)
+ *     by Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
+ */
+const char *utf8_invalid_tests[][3] = {
+//    1  Some correct UTF-8 text
+    {"You should see the Greek word 'kosme':       \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\"                    |",
+     "You should see the Greek word 'kosme':       \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\"                    |", "\x00"},
+
+//    2  Boundary condition test cases
+//    Note that these will pass for us; they are not erroneous Unicode code points
+//    (aside from \x00, which is only valid as a string terminator).
+//    2.1  First possible sequence of a certain length
+    {"2.1.1  1 byte  (U-00000000):        \"\x00\"                                       |",
+     "2.1.1  1 byte  (U-00000000):        \"\"                                       |", "\x01"},
+    {"2.1.2  2 bytes (U-00000080):        \"\xc2\x80\"                                      |",
+     "2.1.2  2 bytes (U-00000080):        \"\xc2\x80\"                                      |", "\x00"},
+    {"2.1.3  3 bytes (U-00000800):        \"\xe0\xa0\x80\"                                     |",
+     "2.1.3  3 bytes (U-00000800):        \"\xe0\xa0\x80\"                                     |", "\x00"},
+    {"2.1.4  4 bytes (U-00010000):        \"\xf0\x90\x80\x80\"                                    |",
+     "2.1.4  4 bytes (U-00010000):        \"\xf0\x90\x80\x80\"                                    |", "\x00"},
+    {"2.1.5  5 bytes (U-00200000):        \"\xf8\x88\x80\x80\x80\"                                   |",
+     "2.1.5  5 bytes (U-00200000):        \"\xf8\x88\x80\x80\x80\"                                   |", "\x00"},
+    {"2.1.6  6 bytes (U-04000000):        \"\xfc\x84\x80\x80\x80\x80\"                                  |",
+     "2.1.6  6 bytes (U-04000000):        \"\xfc\x84\x80\x80\x80\x80\"                                  |", "\x00"},
+//    2.2  Last possible sequence of a certain length
+    {"2.2.1  1 byte  (U-0000007F):        \"\x7f\"                                       |",
+     "2.2.1  1 byte  (U-0000007F):        \"\x7f\"                                       |", "\x00"},
+    {"2.2.2  2 bytes (U-000007FF):        \"\xdf\xbf\"                                      |",
+     "2.2.2  2 bytes (U-000007FF):        \"\xdf\xbf\"                                      |", "\x00"},
+    {"2.2.3  3 bytes (U-0000FFFF):        \"\xef\xbf\xbf\"                                     |",
+     "2.2.3  3 bytes (U-0000FFFF):        \"\"                                     |", "\x03"},  /* matches one of 5.3 sequences... */
+    {"2.2.4  4 bytes (U-001FFFFF):        \"\xf7\xbf\xbf\xbf\"                                    |",
+     "2.2.4  4 bytes (U-001FFFFF):        \"\xf7\xbf\xbf\xbf\"                                    |", "\x00"},
+    {"2.2.5  5 bytes (U-03FFFFFF):        \"\xfb\xbf\xbf\xbf\xbf\"                                   |",
+     "2.2.5  5 bytes (U-03FFFFFF):        \"\xfb\xbf\xbf\xbf\xbf\"                                   |", "\x00"},
+    {"2.2.6  6 bytes (U-7FFFFFFF):        \"\xfd\xbf\xbf\xbf\xbf\xbf\"                                  |",
+     "2.2.6  6 bytes (U-7FFFFFFF):        \"\xfd\xbf\xbf\xbf\xbf\xbf\"                                  |", "\x00"},
+//    2.3  Other boundary conditions
+    {"2.3.1  U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\"                                          |",
+     "2.3.1  U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\"                                          |", "\x00"},
+    {"2.3.2  U-0000E000 = ee 80 80 = \"\xee\x80\x80\"                                          |",
+     "2.3.2  U-0000E000 = ee 80 80 = \"\xee\x80\x80\"                                          |", "\x00"},
+    {"2.3.3  U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\"                                          |",
+     "2.3.3  U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\"                                          |", "\x00"},
+    {"2.3.4  U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\"                                      |",
+     "2.3.4  U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\"                                      |", "\x00"},
+    {"2.3.5  U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\"                                      |",
+     "2.3.5  U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\"                                      |", "\x00"},
+
+//    3  Malformed sequences
+//    3.1  Unexpected continuation bytes
+//         Each unexpected continuation byte should be separately signalled as a malformed sequence of its own.
+    {"3.1.1  First continuation byte 0x80: \"\x80\"                                      |",
+     "3.1.1  First continuation byte 0x80: \"\"                                      |", "\x01"},
+    {"3.1.2  Last  continuation byte 0xbf: \"\xbf\"                                      |",
+     "3.1.2  Last  continuation byte 0xbf: \"\"                                      |", "\x01"},
+    {"3.1.3  2 continuation bytes: \"\x80\xbf\"                                             |",
+     "3.1.3  2 continuation bytes: \"\"                                             |", "\x02"},
+    {"3.1.4  3 continuation bytes: \"\x80\xbf\x80\"                                            |",
+     "3.1.4  3 continuation bytes: \"\"                                            |", "\x03"},
+    {"3.1.5  4 continuation bytes: \"\x80\xbf\x80\xbf\"                                           |",
+     "3.1.5  4 continuation bytes: \"\"                                           |", "\x04"},
+    {"3.1.6  5 continuation bytes: \"\x80\xbf\x80\xbf\x80\"                                          |",
+     "3.1.6  5 continuation bytes: \"\"                                          |", "\x05"},
+    {"3.1.7  6 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\"                                         |",
+     "3.1.7  6 continuation bytes: \"\"                                         |", "\x06"},
+    {"3.1.8  7 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\x80\"                                        |",
+     "3.1.8  7 continuation bytes: \"\"                                        |", "\x07"},
+//    3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
+    {"3.1.9      \"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+                  "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+                  "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+                  "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\" |",
+     "3.1.9      \"\" |", "\x40"},
+//    3.2  Lonely start characters
+//    3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character:
+    {"3.2.1      \"\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf "
+                  "\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \" |",
+     "3.2.1      \"                                \" |", "\x20"},
+//    3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character:
+    {"3.2.2      \"\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \"                                 |",
+     "3.2.2      \"                \"                                 |", "\x10"},
+//    3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character:
+    {"3.2.3      \"\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \"                                                 |",
+     "3.2.3      \"        \"                                                 |", "\x08"},
+//    3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character:
+    {"3.2.4      \"\xf8 \xf9 \xfa \xfb \"                                                         |",
+     "3.2.4      \"    \"                                                         |", "\x04"},
+//    3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character:
+    {"3.2.4      \"\xfc \xfd \"                                                             |",
+     "3.2.4      \"  \"                                                             |", "\x02"},
+//    3.3  Sequences with last continuation byte missing
+//         All bytes of an incomplete sequence should be signalled as a single malformed sequence,
+//         i.e., you should see only a single replacement character in each of the next 10 tests.
+//         (Characters as in section 2)
+    {"3.3.1  2-byte sequence with last byte missing (U+0000):     \"\xc0\"               |",
+     "3.3.1  2-byte sequence with last byte missing (U+0000):     \"\"               |", "\x01"},
+    {"3.3.2  3-byte sequence with last byte missing (U+0000):     \"\xe0\x80\"              |",
+     

@@ Diff output truncated at 10240 characters. @@