[Bf-blender-cvs] [ceff86aafe4] master: Various Exact Boolean parallelizations and optimizations.

Tue Jul 6 00:16:25 CEST 2021

Commit: ceff86aafe46a6fb66e023500f5a47260964b0a2
Author: Erik Abrahamsson
Date:   Mon Jul 5 18:09:36 2021 -0400
Branches: master
https://developer.blender.org/rBceff86aafe46a6fb66e023500f5a47260964b0a2

Various Exact Boolean parallelizations and optimizations.

>From patch D11780 from Erik Abrahamsson.
It parallelizes making the vertices, destruction of map entries,
finding if the result is PWN, finding triangle adjacencies,
and finding the ambient cell.
The latter needs a parallel_reduce from tbb, so added one into
BLI_task.hh so that if WITH_TBB is false, the code will still work.

On Erik's 6-core machine, the elapsed time went from 17.5s to 11.8s
(33% faster) on an intersection of two spheres with 3.1M faces.
On Howard's 24-core machine, the elapsed time went from 18.7s to 10.8s
for the same test.

===================================================================

M	source/blender/blenkernel/intern/mesh_boolean_convert.cc
M	source/blender/blenlib/BLI_mesh_intersect.hh
M	source/blender/blenlib/BLI_task.hh
M	source/blender/blenlib/intern/mesh_boolean.cc
M	source/blender/blenlib/intern/mesh_intersect.cc

===================================================================

diff --git a/source/blender/blenkernel/intern/mesh_boolean_convert.cc b/source/blender/blenkernel/intern/mesh_boolean_convert.cc
index 9bfb01a257c..798d9562150 100644
--- a/source/blender/blenkernel/intern/mesh_boolean_convert.cc
+++ b/source/blender/blenkernel/intern/mesh_boolean_convert.cc
@@ -38,6 +38,7 @@
 #include "BLI_mesh_boolean.hh"
 #include "BLI_mesh_intersect.hh"
 #include "BLI_span.hh"
+#include "BLI_task.hh"
 
 namespace blender::meshintersect {
 
@@ -309,22 +310,38 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
                                                         clean_obmat(*obmats[mi]);
     r_info->to_target_transform[mi] = inv_target_mat * objn_mat;
 
-    /* Skip the matrix multiplication for each point when there is no transform for a mesh,
-     * for example when the first mesh is already in the target space. (Note the logic directly
-     * above, which uses an identity matrix with a null input transform). */
+    Vector<Vert *> verts(me->totvert);
+    Span<MVert> mverts = Span(me->mvert, me->totvert);
+
+    /* Allocate verts
+     * Skip the matrix multiplication for each point when there is no transform for a mesh,
+     * for example when the first mesh is already in the target space. (Note the logic
+     * directly above, which uses an identity matrix with a null input transform). */
     if (obmats[mi] == nullptr) {
-      for (const MVert &vert : Span(me->mvert, me->totvert)) {
-        const float3 co = float3(vert.co);
-        r_info->mesh_to_imesh_vert[v] = arena.add_or_find_vert(mpq3(co.x, co.y, co.z), v);
-        ++v;
-      }
+      threading::parallel_for(mverts.index_range(), 2048, [&](IndexRange range) {
+        float3 co;
+        for (int i : range) {
+          co = float3(mverts[i].co);
+          mpq3 mco = mpq3(co.x, co.y, co.z);
+          double3 dco(mco[0].get_d(), mco[1].get_d(), mco[2].get_d());
+          verts[i] = new Vert(mco, dco, NO_INDEX, i);
+        }
+      });
     }
     else {
-      for (const MVert &vert : Span(me->mvert, me->totvert)) {
-        const float3 co = r_info->to_target_transform[mi] * float3(vert.co);
-        r_info->mesh_to_imesh_vert[v] = arena.add_or_find_vert(mpq3(co.x, co.y, co.z), v);
-        ++v;
-      }
+      threading::parallel_for(mverts.index_range(), 2048, [&](IndexRange range) {
+        float3 co;
+        for (int i : range) {
+          co = r_info->to_target_transform[mi] * float3(mverts[i].co);
+          mpq3 mco = mpq3(co.x, co.y, co.z);
+          double3 dco(mco[0].get_d(), mco[1].get_d(), mco[2].get_d());
+          verts[i] = new Vert(mco, dco, NO_INDEX, i);
+        }
+      });
+    }
+    for (int i : mverts.index_range()) {
+      r_info->mesh_to_imesh_vert[v] = arena.add_or_find_vert(verts[i]);
+      ++v;
     }
 
     for (const MPoly &poly : Span(me->mpoly, me->totpoly)) {
diff --git a/source/blender/blenlib/BLI_mesh_intersect.hh b/source/blender/blenlib/BLI_mesh_intersect.hh
index a19682327a5..f28be9bf59b 100644
--- a/source/blender/blenlib/BLI_mesh_intersect.hh
+++ b/source/blender/blenlib/BLI_mesh_intersect.hh
@@ -225,6 +225,7 @@ class IMeshArena : NonCopyable, NonMovable {
    */
   const Vert *add_or_find_vert(const mpq3 &co, int orig);
   const Vert *add_or_find_vert(const double3 &co, int orig);
+  const Vert *add_or_find_vert(Vert *vert);
 
   Face *add_face(Span<const Vert *> verts,
                  int orig,
diff --git a/source/blender/blenlib/BLI_task.hh b/source/blender/blenlib/BLI_task.hh
index 5f5a17f6b58..e2446ad143e 100644
--- a/source/blender/blenlib/BLI_task.hh
+++ b/source/blender/blenlib/BLI_task.hh
@@ -28,6 +28,7 @@
 #    define NOMINMAX
 #    define TBB_MIN_MAX_CLEANUP
 #  endif
+#  include "tbb/parallel_reduce.h"
 #  include <tbb/blocked_range.h>
 #  include <tbb/parallel_for.h>
 #  include <tbb/parallel_for_each.h>
@@ -76,6 +77,27 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
 #endif
 }
 
+template<typename Value, typename Function, typename Reduction>
+Value parallel_reduce(IndexRange range,
+                      int64_t grain_size,
+                      const Value &identity,
+                      const Function &function,
+                      const Reduction &reduction)
+{
+#ifdef WITH_TBB
+  return tbb::parallel_reduce(
+      tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
+      identity,
+      [&](const tbb::blocked_range<int64_t> &subrange, const Value &ident) {
+        return function(IndexRange(subrange.begin(), subrange.size()), ident);
+      },
+      reduction);
+#else
+  UNUSED_VARS(grain_size, reduction);
+  return function(range, identity);
+#endif
+}
+
 /** See #BLI_task_isolate for a description of what isolating a task means. */
 template<typename Function> void isolate_task(const Function &function)
 {
diff --git a/source/blender/blenlib/intern/mesh_boolean.cc b/source/blender/blenlib/intern/mesh_boolean.cc
index 9f7824a0029..dddf8851767 100644
--- a/source/blender/blenlib/intern/mesh_boolean.cc
+++ b/source/blender/blenlib/intern/mesh_boolean.cc
@@ -21,6 +21,7 @@
 #ifdef WITH_GMP
 
 #  include <algorithm>
+#  include <atomic>
 #  include <fstream>
 #  include <iostream>
 
@@ -50,6 +51,7 @@
 #  include "BLI_mesh_boolean.hh"
 
 #  ifdef WITH_TBB
+#    include "tbb/parallel_reduce.h"
 #    include "tbb/spin_mutex.h"
 #  endif
 
@@ -201,9 +203,14 @@ TriMeshTopology::TriMeshTopology(const IMesh &tm)
         BLI_assert(edges != nullptr);
       }
       edges->append_non_duplicates(e);
-      auto createf = [t](Vector<int> **pvec) { *pvec = new Vector<int>{t}; };
-      auto modifyf = [t](Vector<int> **pvec) { (*pvec)->append_non_duplicates(t); };
-      this->edge_tri_.add_or_modify(Edge(v, vnext), createf, modifyf);
+
+      auto p = edge_tri_.lookup_ptr(Edge(v, vnext));
+      if (p == nullptr) {
+        edge_tri_.add_new(e, new Vector<int>{t});
+      }
+      else {
+        (*p)->append_non_duplicates(t);
+      }
     }
   }
   /* Debugging. */
@@ -228,9 +235,18 @@ TriMeshTopology::TriMeshTopology(const IMesh &tm)
 
 TriMeshTopology::~TriMeshTopology()
 {
-  for (const Vector<int> *vec : edge_tri_.values()) {
-    delete vec;
+  Vector<Vector<int> *> values;
+
+  /* Deconstructing is faster in parallel, so it is worth building an array of things to delete. */
+  for (auto item : edge_tri_.values()) {
+    values.append(item);
   }
+
+  threading::parallel_for(values.index_range(), 256, [&](IndexRange range) {
+    for (int i : range) {
+      delete values[i];
+    }
+  });
 }
 
 /** A Patch is a maximal set of triangles that share manifold edges only. */
@@ -719,6 +735,18 @@ static PatchesInfo find_patches(const IMesh &tm, const TriMeshTopology &tmtopo)
   PatchesInfo pinfo(ntri);
   /* Algorithm: Grow patches across manifold edges as long as there are unassigned triangles. */
   Stack<int> cur_patch_grow;
+
+  /* Create an Array containing indices of adjacent faces. */
+  Array<std::array<int, 3>> t_others(tm.face_size());
+  threading::parallel_for(tm.face_index_range(), 2048, [&](IndexRange range) {
+    for (int t : range) {
+      const Face &tri = *tm.face(t);
+      for (int i = 0; i < 3; ++i) {
+        Edge e(tri[i], tri[(i + 1) % 3]);
+        t_others[t][i] = tmtopo.other_tri_if_manifold(e, t);
+      }
+    }
+  });
   for (int t : tm.face_index_range()) {
     if (pinfo.tri_patch(t) == -1) {
       cur_patch_grow.push(t);
@@ -739,7 +767,7 @@ static PatchesInfo find_patches(const IMesh &tm, const TriMeshTopology &tmtopo)
         const Face &tri = *tm.face(tcand);
         for (int i = 0; i < 3; ++i) {
           Edge e(tri[i], tri[(i + 1) % 3]);
-          int t_other = tmtopo.other_tri_if_manifold(e, tcand);
+          int t_other = t_others[tcand][i];
           if (dbg_level > 1) {
             std::cout << "  edge " << e << " generates t_other=" << t_other << "\n";
           }
@@ -953,12 +981,8 @@ static void sort_by_signed_triangle_index(Vector<int> &g,
  * To accommodate this:
  * If extra_tri is non-null, then an index of EXTRA_TRI_INDEX should use it for the triangle.
  */
-static Array<int> sort_tris_around_edge(const IMesh &tm,
-                                        const TriMeshTopology &tmtopo,
-                                        const Edge e,
-                                        const Span<int> tris,
-                                        const int t0,
-                                        const Face *extra_tri)
+static Array<int> sort_tris_around_edge(
+    const IMesh &tm, const Edge e, const Span<int> tris, const int t0, const Face *extra_tri)
 {
   /* Divide and conquer, quick-sort-like sort.
    * Pick a triangle t0, then partition into groups:
@@ -1023,14 +1047,14 @@ static Array<int> sort_tris_around_edge(const IMesh &tm,
     }
   }
   if (g3.size() > 1) {
-    Array<int> g3sorted = sort_tris_around_edge(tm, tmtopo, e, g3, t0, extra_tri);
+    Array<int> g3sorted = sort_tris_around_edge(tm, e, g3, t0, extra_tri);
     std::copy(g3sorted.begin(), g3sorted.end(), g3.begin());
     if (dbg_level > 1) {
       std::cout << "g3 sorted: " << g3 << "\n";
     }
   }
   if (g4.size() > 1) {
-    Array<int> g4sorted = sort_tris_around_edge(tm, tmtopo, e, g4, t0, extra_tri);
+    Array<int> g4sorted = sort_tris_around_edge(tm, e, g4, t0, extra_tri);
     std::copy(g4sorted.begin(), g4sorted.end(), g4.begin());
     if (dbg_level > 1) {
       std::cout << "g4 sorted: " << g4 << "\n";
@@ -1076,7 +1100,7 @@ static void find_cells_from_edge(const IMesh &tm,
   const Vector<int> *edge_tris = tmtopo.edge_tris(e);
   BLI_assert(edge_tris != nullptr);
   Array<int> sorted_tris = sort_tris_around_edge(
-      tm, tmtopo, e, Span<int>(*edge_tris), (*edge_tris)[0], nullptr);
+      tm, e, Span<int>(*edge_tris), (*edge_tris)[0], nullptr);
 
   int n_edge_tris = edge_tris->size();
   Array<int> edge_patches(n_edge_tris);
@@ -1338,34 +1362,46 @@ static bool patch_cell_graph_ok(const CellsInfo &cinfo, const PatchesInfo &pinfo
 static bool is_pwn(const IMesh &tm, const TriMeshTopology &tmtopo)
 {
   constexpr int dbg_level = 0;
+  std::atomic<bool> is_pwn = true;
+  Vector<std::pair<Edge, Vector<int> *>> tris;
+
   for (auto item : tmtopo.edge_tri_map_items()) 

@@ Diff output truncated at 10240 characters. @@