[Bf-blender-cvs] [6bef2559047] master: BMesh: remove unit-length edge-vector cache from normal calculation

Mon Jun 14 15:04:27 CEST 2021

Commit: 6bef2559047461794eb3ff27de15f4caf5ddcf1e
Author: Campbell Barton
Date:   Mon Jun 14 22:56:02 2021 +1000
Branches: master
https://developer.blender.org/rB6bef2559047461794eb3ff27de15f4caf5ddcf1e

BMesh: remove unit-length edge-vector cache from normal calculation

Bypass stored edge-vectors for ~16% performance gains.

While this increases unit-length edge-vector calculations by around ~4x
the overhead of a parallel loop over all edges makes it worthwhile.

Note that caching edge-vectors per-vertex performs better and may be
worth investigating further, although in my tests this increases code
complexity with barley measurable benefits over not using cache at all.

Details about performance and possible optimizations are noted in
bm_vert_calc_normals_impl.

===================================================================

M	source/blender/bmesh/intern/bmesh_mesh_normals.c
M	source/blender/bmesh/intern/bmesh_mesh_partial_update.c
M	source/blender/bmesh/intern/bmesh_mesh_partial_update.h

===================================================================

diff --git a/source/blender/bmesh/intern/bmesh_mesh_normals.c b/source/blender/bmesh/intern/bmesh_mesh_normals.c
index a3eae6dabe8..bf30f3a52e1 100644
--- a/source/blender/bmesh/intern/bmesh_mesh_normals.c
+++ b/source/blender/bmesh/intern/bmesh_mesh_normals.c
@@ -49,67 +49,9 @@
  * assuming no other tool using it would run concurrently to clnors editing. */
 #define BM_LNORSPACE_UPDATE _FLAG_MF
 
-typedef struct BMEdgesCalcVectorsData {
-  /* Read-only data. */
-  const float (*vcos)[3];
-
-  /* Read-write data, but no need to protect it, no concurrency to fear here. */
-  float (*edgevec)[3];
-} BMEdgesCalcVectorsData;
-
-static void bm_edge_calc_vectors_cb(void *userdata,
-                                    MempoolIterData *mp_e,
-                                    const TaskParallelTLS *__restrict UNUSED(tls))
-{
-  BMEdge *e = (BMEdge *)mp_e;
-  /* The edge vector will not be needed when the edge has no radial. */
-  if (e->l != NULL) {
-    float(*edgevec)[3] = userdata;
-    float *e_diff = edgevec[BM_elem_index_get(e)];
-    sub_v3_v3v3(e_diff, e->v2->co, e->v1->co);
-    normalize_v3(e_diff);
-  }
-}
-
-static void bm_edge_calc_vectors_with_coords_cb(void *userdata,
-                                                MempoolIterData *mp_e,
-                                                const TaskParallelTLS *__restrict UNUSED(tls))
-{
-  BMEdge *e = (BMEdge *)mp_e;
-  /* The edge vector will not be needed when the edge has no radial. */
-  if (e->l != NULL) {
-    BMEdgesCalcVectorsData *data = userdata;
-    float *e_diff = data->edgevec[BM_elem_index_get(e)];
-    sub_v3_v3v3(
-        e_diff, data->vcos[BM_elem_index_get(e->v2)], data->vcos[BM_elem_index_get(e->v1)]);
-    normalize_v3(e_diff);
-  }
-}
-
-static void bm_mesh_edges_calc_vectors(BMesh *bm, float (*edgevec)[3], const float (*vcos)[3])
-{
-  BM_mesh_elem_index_ensure(bm, BM_EDGE | (vcos ? BM_VERT : 0));
-
-  TaskParallelSettings settings;
-  BLI_parallel_mempool_settings_defaults(&settings);
-  settings.use_threading = bm->totedge >= BM_OMP_LIMIT;
-
-  if (vcos == NULL) {
-    BM_iter_parallel(bm, BM_EDGES_OF_MESH, bm_edge_calc_vectors_cb, edgevec, &settings);
-  }
-  else {
-    BMEdgesCalcVectorsData data = {
-        .edgevec = edgevec,
-        .vcos = vcos,
-    };
-    BM_iter_parallel(bm, BM_EDGES_OF_MESH, bm_edge_calc_vectors_with_coords_cb, &data, &settings);
-  }
-}
-
 typedef struct BMVertsCalcNormalsWithCoordsData {
   /* Read-only data. */
   const float (*fnos)[3];
-  const float (*edgevec)[3];
   const float (*vcos)[3];
 
   /* Write data. */
@@ -117,13 +59,12 @@ typedef struct BMVertsCalcNormalsWithCoordsData {
 } BMVertsCalcNormalsWithCoordsData;
 
 BLI_INLINE void bm_vert_calc_normals_accum_loop(const BMLoop *l_iter,
-                                                const float (*edgevec)[3],
+                                                const float e1diff[3],
+                                                const float e2diff[3],
                                                 const float f_no[3],
                                                 float v_no[3])
 {
   /* Calculate the dot product of the two edges that meet at the loop's vertex. */
-  const float *e1diff = edgevec[BM_elem_index_get(l_iter->prev->e)];
-  const float *e2diff = edgevec[BM_elem_index_get(l_iter->e)];
   /* Edge vectors are calculated from e->v1 to e->v2, so adjust the dot product if one but not
    * both loops actually runs from from e->v2 to e->v1. */
   float dotprod = dot_v3v3(e1diff, e2diff);
@@ -132,25 +73,61 @@ BLI_INLINE void bm_vert_calc_normals_accum_loop(const BMLoop *l_iter,
   }
   const float fac = saacos(-dotprod);
   /* NAN detection, otherwise this is a degenerated case, ignore that vertex in this case. */
-  if (fac == fac) { /* NAN detection. */
+  if (fac == fac) {
     madd_v3_v3fl(v_no, f_no, fac);
   }
 }
 
-static void bm_vert_calc_normals_impl(const float (*edgevec)[3], BMVert *v)
+static void bm_vert_calc_normals_impl(BMVert *v)
 {
+  /* Note on redundant unit-length edge-vector calculation:
+   *
+   * This functions calculates unit-length edge-vector for every loop edge
+   * in practice this means 2x `sqrt` calls per face-corner connected to each vertex.
+   *
+   * Previously (2.9x and older), the edge vectors were calculated and stored for reuse.
+   * However the overhead of did not perform well (~16% slower - single & multi-threaded)
+   * when compared with calculating the values as they are needed.
+   *
+   * For simple grid topologies this function calculates the edge-vectors 4x times.
+   * There is some room for improved performance by storing the edge-vectors for reuse locally
+   * in this function, reducing the number of redundant `sqrtf` in half (2x instead of 4x).
+   * so face loops that share an edge would not calculate it multiple times.
+   * From my tests the performance improvements are so small they're difficult to measure,
+   * the time saved removing `sqrtf` calls is lost on storing and looking up the information,
+   * even in the case of `BLI_smallhash.h` & small inline lookup tables.
+   *
+   * Further, local data structures would need to support cases where
+   * stack memory isn't sufficient - adding additional complexity for corner-cases
+   * (a vertex that has thousands of connected edges for example).
+   * Unless there are important use-cases that benefit from edge-vector caching,
+   * keep this simple and calculate ~4x as many edge-vectors.
+   *
+   * In conclusion, the cost of caching & looking up edge-vectors both globally or per-vertex
+   * doesn't save enough time to make it worthwhile.
+   * - Campbell. */
+
   float *v_no = v->no;
   zero_v3(v_no);
+
   BMEdge *e_first = v->e;
   if (e_first != NULL) {
+    float e1diff[3], e2diff[3];
     BMEdge *e_iter = e_first;
     do {
       BMLoop *l_first = e_iter->l;
       if (l_first != NULL) {
+        sub_v3_v3v3(e2diff, e_iter->v1->co, e_iter->v2->co);
+        normalize_v3(e2diff);
+
         BMLoop *l_iter = l_first;
         do {
           if (l_iter->v == v) {
-            bm_vert_calc_normals_accum_loop(l_iter, edgevec, l_iter->f->no, v_no);
+            BMEdge *e_prev = l_iter->prev->e;
+            sub_v3_v3v3(e1diff, e_prev->v1->co, e_prev->v2->co);
+            normalize_v3(e1diff);
+
+            bm_vert_calc_normals_accum_loop(l_iter, e1diff, e2diff, l_iter->f->no, v_no);
           }
         } while ((l_iter = l_iter->radial_next) != l_first);
       }
@@ -164,32 +141,44 @@ static void bm_vert_calc_normals_impl(const float (*edgevec)[3], BMVert *v)
   normalize_v3_v3(v_no, v->co);
 }
 
-static void bm_vert_calc_normals_cb(void *userdata,
+static void bm_vert_calc_normals_cb(void *UNUSED(userdata),
                                     MempoolIterData *mp_v,
                                     const TaskParallelTLS *__restrict UNUSED(tls))
 {
-  const float(*edgevec)[3] = userdata;
   BMVert *v = (BMVert *)mp_v;
-  bm_vert_calc_normals_impl(edgevec, v);
+  bm_vert_calc_normals_impl(v);
 }
 
 static void bm_vert_calc_normals_with_coords(BMVert *v, BMVertsCalcNormalsWithCoordsData *data)
 {
+  /* See #bm_vert_calc_normals_impl note on performance. */
   float *v_no = data->vnos[BM_elem_index_get(v)];
   zero_v3(v_no);
 
   /* Loop over edges. */
   BMEdge *e_first = v->e;
   if (e_first != NULL) {
+    float e1diff[3], e2diff[3];
     BMEdge *e_iter = e_first;
     do {
       BMLoop *l_first = e_iter->l;
       if (l_first != NULL) {
+        sub_v3_v3v3(e2diff,
+                    data->vcos[BM_elem_index_get(e_iter->v1)],
+                    data->vcos[BM_elem_index_get(e_iter->v2)]);
+        normalize_v3(e2diff);
+
         BMLoop *l_iter = l_first;
         do {
           if (l_iter->v == v) {
+            BMEdge *e_prev = l_iter->prev->e;
+            sub_v3_v3v3(e1diff,
+                        data->vcos[BM_elem_index_get(e_prev->v1)],
+                        data->vcos[BM_elem_index_get(e_prev->v2)]);
+            normalize_v3(e1diff);
+
             bm_vert_calc_normals_accum_loop(
-                l_iter, data->edgevec, data->fnos[BM_elem_index_get(l_iter->f)], v_no);
+                l_iter, e1diff, e2diff, data->fnos[BM_elem_index_get(l_iter->f)], v_no);
           }
         } while ((l_iter = l_iter->radial_next) != l_first);
       }
@@ -213,24 +202,22 @@ static void bm_vert_calc_normals_with_coords_cb(void *userdata,
 }
 
 static void bm_mesh_verts_calc_normals(BMesh *bm,
-                                       const float (*edgevec)[3],
                                        const float (*fnos)[3],
                                        const float (*vcos)[3],
                                        float (*vnos)[3])
 {
-  BM_mesh_elem_index_ensure(bm, (BM_EDGE | BM_FACE) | ((vnos || vcos) ? BM_VERT : 0));
+  BM_mesh_elem_index_ensure(bm, BM_FACE | ((vnos || vcos) ? BM_VERT : 0));
 
   TaskParallelSettings settings;
   BLI_parallel_mempool_settings_defaults(&settings);
   settings.use_threading = bm->totvert >= BM_OMP_LIMIT;
 
   if (vcos == NULL) {
-    BM_iter_parallel(bm, BM_VERTS_OF_MESH, bm_vert_calc_normals_cb, (void *)edgevec, &settings);
+    BM_iter_parallel(bm, BM_VERTS_OF_MESH, bm_vert_calc_normals_cb, NULL, &settings);
   }
   else {
     BLI_assert(!ELEM(NULL, fnos, vnos));
     BMVertsCalcNormalsWithCoordsData data = {
-        .edgevec = edgevec,
         .fnos = fnos,
         .vcos = vcos,
         .vnos = vnos,
@@ -255,11 +242,6 @@ static void bm_face_calc_normals_cb(void *UNUSED(userdata),
  */
 void BM_mesh_normals_update(BMesh *bm)
 {
-  float(*edgevec)[3] = MEM_mallocN(sizeof(*edgevec) * bm->totedge, __func__);
-
-  /* Parallel mempool iteration does not allow generating indices inline anymore. */
-  BM_mesh_elem_index_ensure(bm, (BM_EDGE | BM_FACE));
-
   /* Calculate all face normals. */
   TaskParallelSettings settings;
   BLI_parallel_mempool_settings_defaults(&settings);
@@ -267,11 +249,8 @@ void BM_mesh_normals_update(BMesh *bm)
 
   BM_iter_parallel(bm, BM_FACES_OF_MESH, bm_face_calc_normals_cb, NULL, &settings);
 
-  bm_mesh_edges_calc_vectors(bm, edgevec, NULL);
-
   /* Add weighted face normals to vertices, and normalize vert normals. */
-  bm_mesh_verts_calc_normals(bm, edgevec, NU

@@ Diff output truncated at 10240 characters. @@