[Bf-blender-cvs] [38630711a02] master: BLI_string_utf8: remove unnecessary utf8 decoding functions

Wed Aug 25 07:32:00 CEST 2021

Commit: 38630711a02e553f209ace9a8627a7a851820a2d
Author: Campbell Barton
Date:   Wed Aug 25 15:19:00 2021 +1000
Branches: master
https://developer.blender.org/rB38630711a02e553f209ace9a8627a7a851820a2d

BLI_string_utf8: remove unnecessary utf8 decoding functions

Remove BLI_str_utf8_as_unicode_and_size and
BLI_str_utf8_as_unicode_and_size_safe.

Use BLI_str_utf8_as_unicode_step instead, since it takes
a buffer-bounds argument that prevents buffer over-reads.

===================================================================

M	source/blender/blenkernel/intern/text.c
M	source/blender/blenlib/BLI_string_utf8.h
M	source/blender/blenlib/intern/string_cursor_utf8.c
M	source/blender/blenlib/intern/string_search.cc
M	source/blender/blenlib/intern/string_utf8.c
M	source/blender/editors/space_text/text_autocomplete.c
M	source/blender/makesrna/intern/rna_wm.c

===================================================================

diff --git a/source/blender/blenkernel/intern/text.c b/source/blender/blenkernel/intern/text.c
index c2ab91251b6..7f1f6590e48 100644
--- a/source/blender/blenkernel/intern/text.c
+++ b/source/blender/blenkernel/intern/text.c
@@ -1888,8 +1888,9 @@ void txt_delete_char(Text *text)
     }
   }
   else { /* Just deleting a char */
-    size_t c_len = 0;
-    c = BLI_str_utf8_as_unicode_and_size(text->curl->line + text->curc, &c_len);
+    size_t c_len = text->curc;
+    c = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &c_len);
+    c_len -= text->curc;
     UNUSED_VARS(c);
 
     memmove(text->curl->line + text->curc,
@@ -1937,9 +1938,11 @@ void txt_backspace_char(Text *text)
     txt_pop_sel(text);
   }
   else { /* Just backspacing a char */
-    size_t c_len = 0;
     const char *prev = BLI_str_prev_char_utf8(text->curl->line + text->curc);
-    c = BLI_str_utf8_as_unicode_and_size(prev, &c_len);
+    size_t c_len = prev - text->curl->line;
+    c = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &c_len);
+    c_len -= prev - text->curl->line;
+
     UNUSED_VARS(c);
 
     /* source and destination overlap, don't use memcpy() */
@@ -2053,7 +2056,9 @@ bool txt_replace_char(Text *text, unsigned int add)
     return txt_add_char(text, add);
   }
 
-  del = BLI_str_utf8_as_unicode_and_size(text->curl->line + text->curc, &del_size);
+  del_size = text->curc;
+  del = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &del_size);
+  del_size -= text->curc;
   UNUSED_VARS(del);
   add_size = BLI_str_utf8_from_unicode(add, ch);
 
diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h
index 1b12147fe0f..b97c8748ca4 100644
--- a/source/blender/blenlib/BLI_string_utf8.h
+++ b/source/blender/blenlib/BLI_string_utf8.h
@@ -39,10 +39,6 @@ int BLI_str_utf8_size(const char *p) ATTR_NONNULL();
 int BLI_str_utf8_size_safe(const char *p) ATTR_NONNULL();
 /* copied from glib */
 unsigned int BLI_str_utf8_as_unicode(const char *p) ATTR_NONNULL();
-unsigned int BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
-    ATTR_NONNULL();
-unsigned int BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p,
-                                                   size_t *__restrict index) ATTR_NONNULL();
 unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p,
                                           size_t p_len,
                                           size_t *__restrict index) ATTR_NONNULL(1, 3);
diff --git a/source/blender/blenlib/intern/string_cursor_utf8.c b/source/blender/blenlib/intern/string_cursor_utf8.c
index 90fde02b11f..59b9f4eeca0 100644
--- a/source/blender/blenlib/intern/string_cursor_utf8.c
+++ b/source/blender/blenlib/intern/string_cursor_utf8.c
@@ -101,11 +101,14 @@ static eStrCursorDelimType cursor_delim_type_unicode(const uint uch)
   return STRCUR_DELIM_ALPHANUMERIC; /* Not quite true, but ok for now */
 }
 
-static eStrCursorDelimType cursor_delim_type_utf8(const char *ch_utf8)
+static eStrCursorDelimType cursor_delim_type_utf8(const char *ch_utf8,
+                                                  const size_t ch_utf8_len,
+                                                  const int pos)
 {
   /* for full unicode support we really need to have large lookup tables to figure
    * out what's what in every possible char set - and python, glib both have these. */
-  uint uch = BLI_str_utf8_as_unicode(ch_utf8);
+  size_t index = (size_t)pos;
+  uint uch = BLI_str_utf8_as_unicode_step_or_error(ch_utf8, ch_utf8_len, &index);
   return cursor_delim_type_unicode(uch);
 }
 
@@ -157,14 +160,16 @@ void BLI_str_cursor_step_utf8(const char *str,
     }
 
     if (jump != STRCUR_JUMP_NONE) {
-      const eStrCursorDelimType delim_type = (*pos) < maxlen ? cursor_delim_type_utf8(&str[*pos]) :
-                                                               STRCUR_DELIM_NONE;
+      const eStrCursorDelimType delim_type = (*pos) < maxlen ?
+                                                 cursor_delim_type_utf8(str, maxlen, *pos) :
+                                                 STRCUR_DELIM_NONE;
       /* jump between special characters (/,\,_,-, etc.),
        * look at function cursor_delim_type() for complete
        * list of special character, ctr -> */
       while ((*pos) < maxlen) {
         if (BLI_str_cursor_step_next_utf8(str, maxlen, pos)) {
-          if ((jump != STRCUR_JUMP_ALL) && (delim_type != cursor_delim_type_utf8(&str[*pos]))) {
+          if ((jump != STRCUR_JUMP_ALL) &&
+              (delim_type != cursor_delim_type_utf8(str, maxlen, *pos))) {
             break;
           }
         }
@@ -184,7 +189,7 @@ void BLI_str_cursor_step_utf8(const char *str,
 
     if (jump != STRCUR_JUMP_NONE) {
       const eStrCursorDelimType delim_type = (*pos) > 0 ?
-                                                 cursor_delim_type_utf8(&str[(*pos) - 1]) :
+                                                 cursor_delim_type_utf8(str, maxlen, *pos - 1) :
                                                  STRCUR_DELIM_NONE;
       /* jump between special characters (/,\,_,-, etc.),
        * look at function cursor_delim_type() for complete
@@ -192,7 +197,8 @@ void BLI_str_cursor_step_utf8(const char *str,
       while ((*pos) > 0) {
         const int pos_prev = *pos;
         if (BLI_str_cursor_step_prev_utf8(str, maxlen, pos)) {
-          if ((jump != STRCUR_JUMP_ALL) && (delim_type != cursor_delim_type_utf8(&str[*pos]))) {
+          if ((jump != STRCUR_JUMP_ALL) &&
+              (delim_type != cursor_delim_type_utf8(str, maxlen, (size_t)*pos))) {
             /* left only: compensate for index/change in direction */
             if ((pos_orig - (*pos)) >= 1) {
               *pos = pos_prev;
diff --git a/source/blender/blenlib/intern/string_search.cc b/source/blender/blenlib/intern/string_search.cc
index 25a13674932..a466c124073 100644
--- a/source/blender/blenlib/intern/string_search.cc
+++ b/source/blender/blenlib/intern/string_search.cc
@@ -71,12 +71,12 @@ int damerau_levenshtein_distance(StringRef a, StringRef b)
   for (const int i : IndexRange(size_a)) {
     v2[0] = (i + 1) * deletion_cost;
 
-    const uint32_t unicode_a = BLI_str_utf8_as_unicode_and_size(a.data() + offset_a, &offset_a);
+    const uint32_t unicode_a = BLI_str_utf8_as_unicode_step(a.data(), a.size(), &offset_a);
 
     uint32_t prev_unicode_b;
     size_t offset_b = 0;
     for (const int j : IndexRange(size_b)) {
-      const uint32_t unicode_b = BLI_str_utf8_as_unicode_and_size(b.data() + offset_b, &offset_b);
+      const uint32_t unicode_b = BLI_str_utf8_as_unicode_step(b.data(), b.size(), &offset_b);
 
       /* Check how costly the different operations would be and pick the cheapest - the one with
        * minimal cost. */
@@ -202,8 +202,8 @@ static bool match_word_initials(StringRef query,
   int first_found_word_index = -1;
 
   while (query_index < query.size()) {
-    const uint query_unicode = BLI_str_utf8_as_unicode_and_size(query.data() + query_index,
-                                                                &query_index);
+    const uint query_unicode = BLI_str_utf8_as_unicode_step(
+        query.data(), query.size(), &query_index);
     while (true) {
       /* We are at the end of words, no complete match has been found yet. */
       if (word_index >= words.size()) {
@@ -226,8 +226,8 @@ static bool match_word_initials(StringRef query,
       StringRef word = words[word_index];
       /* Try to match the current character with the current word. */
       if (static_cast<int>(char_index) < word.size()) {
-        const uint32_t char_unicode = BLI_str_utf8_as_unicode_and_size(word.data() + char_index,
-                                                                       &char_index);
+        const uint32_t char_unicode = BLI_str_utf8_as_unicode_step(
+            word.data(), word.size(), &char_index);
         if (query_unicode == char_unicode) {
           r_word_is_matched[word_index] = true;
           if (first_found_word_index == -1) {
@@ -368,8 +368,9 @@ void extract_normalized_words(StringRef str,
   size_t word_start = 0;
   size_t offset = 0;
   while (offset < str_size_in_bytes) {
-    size_t size = 0;
-    uint32_t unicode = BLI_str_utf8_as_unicode_and_size(str.data() + offset, &size);
+    size_t size = offset;
+    uint32_t unicode = BLI_str_utf8_as_unicode_step(str.data(), str.size(), &size);
+    size -= offset;
     if (is_separator(unicode)) {
       if (is_in_word) {
         r_words.append(
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index 06fd3168c24..7a01077bb44 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -546,40 +546,6 @@ uint BLI_str_utf8_as_unicode(const char *p)
   return result;
 }
 
-/* variant that increments the length */
-uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
-{
-  int i, len;
-  uint mask = 0;
-  uint result;
-  const unsigned char c = (unsigned char)*p;
-
-  UTF8_COMPUTE(c, mask, len, -1);
-  if (UNLIKELY(len == -1)) {
-    return BLI_UTF8_ERR;
-  }
-  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-  *index += (size_t)len;
-  return result;
-}
-
-uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index)
-{
-  int i, len;
-  uint mask = 0;
-  uint result;
-  const unsigned char c = (unsigned char)*p;
-
-  UTF8_COMPUTE(c, mask, len, -1);
-  if (UNLIKELY(len == -1)) {
-    *index += 1;
-    return c;
-  }
-  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-  *index += (size_t)len;
-  return result;
-}
-
 /**
  * UTF8 decoding that steps over the index (unless an error is encountered).
  *
@@ -709,16 +675,23 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
   memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
 #endif
 
+  const size_t src_c_len = strlen(src_c);
+  const char *src_c_end = src_c + src_c_len;
+  size_t index = 0;
   while (*src_c && len != maxlen) {
-    size_t step = 0;
-    uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
+    const uint 

@@ Diff output truncated at 10240 characters. @@