[Bf-blender-cvs] [45bd8fdc2b0] master: BLI: new string search api that supports fuzzy and prefix matching

Wed Sep 9 13:52:09 CEST 2020

Commit: 45bd8fdc2b086e994fa3e907a64e587013112603
Author: Jacques Lucke
Date:   Wed Sep 9 13:40:14 2020 +0200
Branches: master
https://developer.blender.org/rB45bd8fdc2b086e994fa3e907a64e587013112603

BLI: new string search api that supports fuzzy and prefix matching

This adds a generic string search library in `BLI_string_search.h`.
The library has a simple to use C api that allows it's users to
filter and sort a set of possible search results based on some
query string.

Reviewers: Severin

Differential Revision: https://developer.blender.org/D8825

===================================================================

A	source/blender/blenlib/BLI_string_search.h
M	source/blender/blenlib/CMakeLists.txt
A	source/blender/blenlib/intern/string_search.cc
A	source/blender/blenlib/tests/BLI_string_search_test.cc

===================================================================

diff --git a/source/blender/blenlib/BLI_string_search.h b/source/blender/blenlib/BLI_string_search.h
new file mode 100644
index 00000000000..8057e5b75cb
--- /dev/null
+++ b/source/blender/blenlib/BLI_string_search.h
@@ -0,0 +1,51 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct StringSearch StringSearch;
+
+StringSearch *BLI_string_search_new(void);
+void BLI_string_search_add(StringSearch *search, const char *str, void *user_data);
+int BLI_string_search_query(StringSearch *search, const char *query, void ***r_data);
+void BLI_string_search_free(StringSearch *search);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+#  include "BLI_linear_allocator.hh"
+#  include "BLI_span.hh"
+#  include "BLI_string_ref.hh"
+#  include "BLI_vector.hh"
+
+namespace blender::string_search {
+
+int damerau_levenshtein_distance(StringRef a, StringRef b);
+int get_fuzzy_match_errors(StringRef query, StringRef full);
+void extract_normalized_words(StringRef str,
+                              LinearAllocator<> &allocator,
+                              Vector<StringRef, 64> &r_words);
+
+}  // namespace blender::string_search
+
+#endif
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index a0569ad3dd4..e01459ac80e 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -124,6 +124,7 @@ set(SRC
   intern/storage.c
   intern/string.c
   intern/string_cursor_utf8.c
+  intern/string_search.cc
   intern/string_utf8.c
   intern/string_utils.c
   intern/system.c
@@ -267,6 +268,7 @@ set(SRC
   BLI_strict_flags.h
   BLI_string.h
   BLI_string_cursor_utf8.h
+  BLI_string_search.h
   BLI_string_ref.hh
   BLI_string_utf8.h
   BLI_string_utils.h
@@ -411,6 +413,7 @@ if(WITH_GTESTS)
     tests/BLI_span_test.cc
     tests/BLI_stack_cxx_test.cc
     tests/BLI_stack_test.cc
+    tests/BLI_string_search_test.cc
     tests/BLI_string_ref_test.cc
     tests/BLI_string_test.cc
     tests/BLI_string_utf8_test.cc
diff --git a/source/blender/blenlib/intern/string_search.cc b/source/blender/blenlib/intern/string_search.cc
new file mode 100644
index 00000000000..17da3b9f493
--- /dev/null
+++ b/source/blender/blenlib/intern/string_search.cc
@@ -0,0 +1,475 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_array.hh"
+#include "BLI_linear_allocator.hh"
+#include "BLI_multi_value_map.hh"
+#include "BLI_span.hh"
+#include "BLI_string.h"
+#include "BLI_string_ref.hh"
+#include "BLI_string_search.h"
+#include "BLI_string_utf8.h"
+#include "BLI_timeit.hh"
+
+namespace blender::string_search {
+
+static int64_t count_utf8_code_points(StringRef str)
+{
+  return static_cast<int64_t>(BLI_strnlen_utf8(str.data(), static_cast<size_t>(str.size())));
+}
+
+/**
+ * Computes the cost of transforming string a into b. The cost/distance is the minimal number of
+ * operations that need to be executed. Valid operations are deletion, insertion, substitution and
+ * transposition.
+ *
+ * This function is utf8 aware in the sense that it works at the level of individual code points
+ * (1-4 bytes long) instead of on individual bytes.
+ */
+int damerau_levenshtein_distance(StringRef a, StringRef b)
+{
+  constexpr int deletion_cost = 1;
+  constexpr int insertion_cost = 1;
+  constexpr int substitution_cost = 1;
+  constexpr int transposition_cost = 1;
+
+  const int size_a = count_utf8_code_points(a);
+  const int size_b = count_utf8_code_points(b);
+
+  /* Instead of keeping the entire table in memory, only keep three rows. The algorithm only
+   * accesses these rows and nothing older.
+   * All three rows are usually allocated on the stack. At most a single heap allocation is done,
+   * if the reserved stack space is too small. */
+  const int row_length = size_b + 1;
+  Array<int, 64> rows(row_length * 3);
+
+  /* Store rows as spans so that it is cheap to swap them. */
+  MutableSpan v0{rows.data() + row_length * 0, row_length};
+  MutableSpan v1{rows.data() + row_length * 1, row_length};
+  MutableSpan v2{rows.data() + row_length * 2, row_length};
+
+  /* Only v1 needs to be initialized. */
+  for (const int i : IndexRange(row_length)) {
+    v1[i] = i * insertion_cost;
+  }
+
+  uint32_t prev_unicode_a;
+  size_t offset_a = 0;
+  for (const int i : IndexRange(size_a)) {
+    v2[0] = (i + 1) * deletion_cost;
+
+    const uint32_t unicode_a = BLI_str_utf8_as_unicode_and_size(a.data() + offset_a, &offset_a);
+
+    uint32_t prev_unicode_b;
+    size_t offset_b = 0;
+    for (const int j : IndexRange(size_b)) {
+      const uint32_t unicode_b = BLI_str_utf8_as_unicode_and_size(b.data() + offset_b, &offset_b);
+
+      /* Check how costly the different operations would be and pick the cheapest - the one with
+       * minimal cost. */
+      int new_cost = std::min({v1[j + 1] + deletion_cost,
+                               v2[j] + insertion_cost,
+                               v1[j] + (unicode_a != unicode_b) * substitution_cost});
+      if (i > 0 && j > 0) {
+        if (unicode_a == prev_unicode_b && prev_unicode_a == unicode_b) {
+          new_cost = std::min(new_cost, v0[j - 1] + transposition_cost);
+        }
+      }
+
+      v2[j + 1] = new_cost;
+      prev_unicode_b = unicode_b;
+    }
+
+    /* Swap the three rows, so that the next row can be computed. */
+    std::tie(v0, v1, v2) = std::tuple<MutableSpan<int>, MutableSpan<int>, MutableSpan<int>>(
+        v1, v2, v0);
+    prev_unicode_a = unicode_a;
+  }
+
+  return v1.last();
+}
+
+/**
+ * Returns -1 when this is no reasonably good match.
+ * Otherwise returns the number of errors in the match.
+ */
+int get_fuzzy_match_errors(StringRef query, StringRef full)
+{
+  /* If it is a perfect partial match, return immediatly. */
+  if (full.find(query) != StringRef::not_found) {
+    return 0;
+  }
+
+  const int query_size = count_utf8_code_points(query);
+  const int full_size = count_utf8_code_points(full);
+
+  /* If there is only a single character which is not in the full string, this is not a match. */
+  if (query_size == 1) {
+    return -1;
+  }
+  BLI_assert(query.size() >= 2);
+
+  /* Allow more errors when the size grows larger. */
+  const int max_errors = query_size <= 1 ? 0 : query_size / 8 + 1;
+
+  /* If the query is too large, this cannot be a match. */
+  if (query_size - full_size > max_errors) {
+    return -1;
+  }
+
+  const uint32_t query_first_unicode = BLI_str_utf8_as_unicode(query.data());
+  const uint32_t query_second_unicode = BLI_str_utf8_as_unicode(query.data() +
+                                                                BLI_str_utf8_size(query.data()));
+
+  const char *full_begin = full.begin();
+  const char *full_end = full.end();
+
+  const char *window_begin = full_begin;
+  const char *window_end = window_begin;
+  const int window_size = std::min(query_size + max_errors, full_size);
+  const int extra_chars = window_size - query_size;
+  const int max_acceptable_distance = max_errors + extra_chars;
+
+  for (int i = 0; i < window_size; i++) {
+    window_end += BLI_str_utf8_size(window_end);
+  }
+
+  while (true) {
+    StringRef window{window_begin, window_end};
+    const uint32_t window_begin_unicode = BLI_str_utf8_as_unicode(window_begin);
+    int distance = 0;
+    /* Expect that the first or second character of the query is correct. This helps to avoid
+     * computing the more expensive distance function. */
+    if (ELEM(window_begin_unicode, query_first_unicode, query_second_unicode)) {
+      distance = damerau_levenshtein_distance(query, window);
+      if (distance <= max_acceptable_distance) {
+        return distance;
+      }
+    }
+    if (window_end == full_end) {
+      return -1;
+    }
+
+    /* When the distance is way too large, we can skip a couple of code points, because the
+     * distance can't possibly become as short as required. */
+    const int window_offset = std::max(1, distance / 2);
+    for (int i = 0; i < window_offset && window_end < full_end; i++) {
+      window_begin += BLI_str_utf8_size(window_begin);
+      window_end += BLI_str_utf8_size(window_end);
+    }
+  }
+}
+
+/**
+ * Takes a query and tries to match it with the first characters of some words. For example, "msfv"
+ * matches "Mark Sharp from Vertices". Multiple letters of the beginning of a word can be matched
+ * as well. For example, "seboulo" matches "select boundary loop". The order of words is important.
+ * So "bose" does not match "select boundary". However, individual words can be skipped. For
+ * example, "rocc" matches "rotate edge ccw".
+ *
+ * Returns true when the match was successfull. If it was successfull, the used words are tagged in
+ * r_word_is_matched.
+ */
+static bool match_word_initials(StringRef query,
+                          

@@ Diff output truncated at 10240 characters. @@