[Bf-blender-cvs] [399b6ec76c8] master: Mesh: optimize normal calculation

Fri Aug 13 02:22:46 CEST 2021

Commit: 399b6ec76c80a8f343747a0459bb3d3df514555f
Author: Campbell Barton
Date:   Fri Aug 13 09:27:29 2021 +1000
Branches: master
https://developer.blender.org/rB399b6ec76c80a8f343747a0459bb3d3df514555f

Mesh: optimize normal calculation

Optimize mesh normal calculation.

- Remove the intermediate `lnors_weighted` array, accumulate directly
  into the normal array using a spin-lock for thread safety.
- Remove single threaded iteration over loops
  (normal calculation is now fully multi-threaded).
- Remove stack array (alloca) for pre-calculating edge-directions.

Summary of Performance Characteristics:

- The largest gains are for single high poly meshes, with isolated
  normal-calculation benchmarks of meshes over ~1.5 million showing
  2x+ speedup, ~25 million polygons are ~2.85x faster.

- Single lower poly meshes (250k polys) can be ~2x slower.

  Since these meshes aren't normally a bottleneck,
  and this problem isn't noticeable on large scenes,
  we considered the performance trade-off reasonable.

- The performance difference reduces with larger scenes,
  tests with production files from "Sprite Fight" showing
  the same or slightly better overall performance.

NOTE: tested on a AMD Ryzen TR 3970X 32-Core.

For more details & benchmarking scripts, see the patch description.

Reviewed By: mont29

Ref D11993

===================================================================

M	source/blender/blenkernel/intern/mesh_normals.cc

===================================================================

diff --git a/source/blender/blenkernel/intern/mesh_normals.cc b/source/blender/blenkernel/intern/mesh_normals.cc
index 6bd08b3dbe0..b86332097fa 100644
--- a/source/blender/blenkernel/intern/mesh_normals.cc
+++ b/source/blender/blenkernel/intern/mesh_normals.cc
@@ -50,6 +50,8 @@
 #include "BKE_global.h"
 #include "BKE_mesh.h"
 
+#include "atomic_ops.h"
+
 // #define DEBUG_TIME
 
 #ifdef DEBUG_TIME
@@ -59,6 +61,46 @@
 
 static CLG_LogRef LOG = {"bke.mesh_normals"};
 
+/* -------------------------------------------------------------------- */
+/** \name Private Utility Functions
+ * \{ */
+
+/**
+ * A thread-safe version of #add_v3_v3 that uses a spin-lock.
+ *
+ * \note Avoid using this when the chance of contention is high.
+ */
+static void add_v3_v3_atomic(float r[3], const float a[3])
+{
+#define FLT_EQ_NONAN(_fa, _fb) (*((const uint32_t *)&_fa) == *((const uint32_t *)&_fb))
+
+  float virtual_lock = r[0];
+  while (true) {
+    /* This loops until following conditions are met:
+     * - `r[0]` has same value as virtual_lock (i.e. it did not change since last try).
+     * - `r[0]` was not `FLT_MAX`, i.e. it was not locked by another thread. */
+    const float test_lock = atomic_cas_float(&r[0], virtual_lock, FLT_MAX);
+    if (_ATOMIC_LIKELY(FLT_EQ_NONAN(test_lock, virtual_lock) && (test_lock != FLT_MAX))) {
+      break;
+    }
+    virtual_lock = test_lock;
+  }
+  virtual_lock += a[0];
+  r[1] += a[1];
+  r[2] += a[2];
+
+  /* Second atomic operation to 'release'
+   * our lock on that vector and set its first scalar value. */
+  /* Note that we do not need to loop here, since we 'locked' `r[0]`,
+   * nobody should have changed it in the mean time. */
+  virtual_lock = atomic_cas_float(&r[0], FLT_MAX, virtual_lock);
+  BLI_assert(virtual_lock == FLT_MAX);
+
+#undef FLT_EQ_NONAN
+}
+
+/** \} */
+
 /* -------------------------------------------------------------------- */
 /** \name Mesh Normal Calculation
  * \{ */
@@ -210,7 +252,6 @@ struct MeshCalcNormalsData {
   const MLoop *mloop;
   MVert *mverts;
   float (*pnors)[3];
-  float (*lnors_weighted)[3];
   float (*vnors)[3];
 };
 
@@ -224,65 +265,65 @@ static void mesh_calc_normals_poly_cb(void *__restrict userdata,
   BKE_mesh_calc_poly_normal(mp, data->mloop + mp->loopstart, data->mverts, data->pnors[pidx]);
 }
 
-static void mesh_calc_normals_poly_prepare_cb(void *__restrict userdata,
-                                              const int pidx,
-                                              const TaskParallelTLS *__restrict UNUSED(tls))
+static void mesh_calc_normals_poly_and_accum_cb(void *__restrict userdata,
+                                                const int pidx,
+                                                const TaskParallelTLS *__restrict UNUSED(tls))
 {
-  MeshCalcNormalsData *data = (MeshCalcNormalsData *)userdata;
+  const MeshCalcNormalsData *data = (MeshCalcNormalsData *)userdata;
   const MPoly *mp = &data->mpolys[pidx];
   const MLoop *ml = &data->mloop[mp->loopstart];
   const MVert *mverts = data->mverts;
+  float(*vnors)[3] = data->vnors;
 
   float pnor_temp[3];
   float *pnor = data->pnors ? data->pnors[pidx] : pnor_temp;
-  float(*lnors_weighted)[3] = data->lnors_weighted;
 
-  const int nverts = mp->totloop;
-  float(*edgevecbuf)[3] = (float(*)[3])BLI_array_alloca(edgevecbuf, (size_t)nverts);
+  const int i_end = mp->totloop - 1;
 
   /* Polygon Normal and edge-vector */
   /* inline version of #BKE_mesh_calc_poly_normal, also does edge-vectors */
   {
-    int i_prev = nverts - 1;
-    const float *v_prev = mverts[ml[i_prev].v].co;
-    const float *v_curr;
-
     zero_v3(pnor);
     /* Newell's Method */
-    for (int i = 0; i < nverts; i++) {
-      v_curr = mverts[ml[i].v].co;
-      add_newell_cross_v3_v3v3(pnor, v_prev, v_curr);
-
-      /* Unrelated to normalize, calculate edge-vector */
-      sub_v3_v3v3(edgevecbuf[i_prev], v_prev, v_curr);
-      normalize_v3(edgevecbuf[i_prev]);
-      i_prev = i;
-
-      v_prev = v_curr;
+    const float *v_curr = mverts[ml[i_end].v].co;
+    for (int i_next = 0; i_next <= i_end; i_next++) {
+      const float *v_next = mverts[ml[i_next].v].co;
+      add_newell_cross_v3_v3v3(pnor, v_curr, v_next);
+      v_curr = v_next;
     }
     if (UNLIKELY(normalize_v3(pnor) == 0.0f)) {
       pnor[2] = 1.0f; /* other axes set to 0.0 */
     }
   }
 
-  /* accumulate angle weighted face normal */
-  /* inline version of #accumulate_vertex_normals_poly_v3,
-   * split between this threaded callback and #mesh_calc_normals_poly_accum_cb. */
+  /* Accumulate angle weighted face normal into the vertex normal. */
+  /* inline version of #accumulate_vertex_normals_poly_v3. */
   {
-    const float *prev_edge = edgevecbuf[nverts - 1];
-
-    for (int i = 0; i < nverts; i++) {
-      const int lidx = mp->loopstart + i;
-      const float *cur_edge = edgevecbuf[i];
-
-      /* calculate angle between the two poly edges incident on
-       * this vertex */
-      const float fac = saacos(-dot_v3v3(cur_edge, prev_edge));
+    float edvec_prev[3], edvec_next[3], edvec_end[3];
+    const float *v_curr = mverts[ml[i_end].v].co;
+    sub_v3_v3v3(edvec_prev, mverts[ml[i_end - 1].v].co, v_curr);
+    normalize_v3(edvec_prev);
+    copy_v3_v3(edvec_end, edvec_prev);
+
+    for (int i_next = 0, i_curr = i_end; i_next <= i_end; i_curr = i_next++) {
+      const float *v_next = mverts[ml[i_next].v].co;
+
+      /* Skip an extra normalization by reusing the first calculated edge. */
+      if (i_next != i_end) {
+        sub_v3_v3v3(edvec_next, v_curr, v_next);
+        normalize_v3(edvec_next);
+      }
+      else {
+        copy_v3_v3(edvec_next, edvec_end);
+      }
 
-      /* Store for later accumulation */
-      mul_v3_v3fl(lnors_weighted[lidx], pnor, fac);
+      /* Calculate angle between the two poly edges incident on this vertex. */
+      const float fac = saacos(-dot_v3v3(edvec_prev, edvec_next));
+      const float vnor_add[3] = {pnor[0] * fac, pnor[1] * fac, pnor[2] * fac};
 
-      prev_edge = cur_edge;
+      add_v3_v3_atomic(vnors[ml[i_curr].v], vnor_add);
+      v_curr = v_next;
+      copy_v3_v3(edvec_prev, edvec_next);
     }
   }
 }
@@ -309,7 +350,7 @@ void BKE_mesh_calc_normals_poly(MVert *mverts,
                                 int numVerts,
                                 const MLoop *mloop,
                                 const MPoly *mpolys,
-                                int numLoops,
+                                int UNUSED(numLoops),
                                 int numPolys,
                                 float (*r_polynors)[3],
                                 const bool only_face_normals)
@@ -335,8 +376,6 @@ void BKE_mesh_calc_normals_poly(MVert *mverts,
   }
 
   float(*vnors)[3] = r_vertnors;
-  float(*lnors_weighted)[3] = (float(*)[3])MEM_malloc_arrayN(
-      (size_t)numLoops, sizeof(*lnors_weighted), __func__);
   bool free_vnors = false;
 
   /* first go through and calculate normals for all the polys */
@@ -353,27 +392,17 @@ void BKE_mesh_calc_normals_poly(MVert *mverts,
   data.mloop = mloop;
   data.mverts = mverts;
   data.pnors = pnors;
-  data.lnors_weighted = lnors_weighted;
   data.vnors = vnors;
 
-  /* Compute poly normals, and prepare weighted loop normals. */
-  BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_prepare_cb, &settings);
-
-  /* Actually accumulate weighted loop normals into vertex ones. */
-  /* Unfortunately, not possible to thread that
-   * (not in a reasonable, totally lock- and barrier-free fashion),
-   * since several loops will point to the same vertex... */
-  for (int lidx = 0; lidx < numLoops; lidx++) {
-    add_v3_v3(vnors[mloop[lidx].v], data.lnors_weighted[lidx]);
-  }
+  /* Compute poly normals (`pnors`), accumulating them into vertex normals (`vnors`). */
+  BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_and_accum_cb, &settings);
 
-  /* Normalize and validate computed vertex normals. */
+  /* Normalize and validate computed vertex normals (`vnors`). */
   BLI_task_parallel_range(0, numVerts, &data, mesh_calc_normals_poly_finalize_cb, &settings);
 
   if (free_vnors) {
     MEM_freeN(vnors);
   }
-  MEM_freeN(lnors_weighted);
 }
 
 void BKE_mesh_ensure_normals(Mesh *mesh)