[Bf-blender-cvs] [042143440d7] blender-v2.91-release: LatticeDeform: Performance

Jeroen Bakker noreply at git.blender.org
Mon Oct 26 11:02:40 CET 2020


Commit: 042143440d7668d3e357805ffdd20b1a4d2e2975
Author: Jeroen Bakker
Date:   Mon Oct 26 11:01:18 2020 +0100
Branches: blender-v2.91-release
https://developer.blender.org/rB042143440d7668d3e357805ffdd20b1a4d2e2975

LatticeDeform: Performance

This patch improves the single-core performance of the lattice deform.

1. Prefetch the deform vert weights during initialization. This data is
   constant for each inner loop. This reduces the complexity of the inner
   loop, which frees up CPU resources for other optimizations.
2. Prefetch the Lattice instance, which is constant during evaluation.
   Although this isn't noticeable performance-wise, it is always good to
   free some space in the branch prediction tables.
3. Remove branching in all loops by no longer skipping iterations that have
   no effect. The checks in the inner loops detected whether an iteration
   had no effect on the final result and, if so, continued to the next
   iteration. This made the branches hard to predict and caused a lot of
   mispredictions. For smaller inner loops it is always better to remove
   unpredictable if statements by using branchless code patterns.
4. Use SSE2 instructions when available.

This gives a 50% performance increase, measured on an
Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz with GCC 9.3.
Also check other compilers.

Before:
```
performance_no_dvert_10000 (4 ms)
performance_no_dvert_100000 (30 ms)
performance_no_dvert_1000000 (268 ms)
performance_no_dvert_10000000 (2637 ms)
```

After:
```
performance_no_dvert_10000 (3 ms)
performance_no_dvert_100000 (21 ms)
performance_no_dvert_1000000 (180 ms)
performance_no_dvert_10000000 (1756 ms)
```

Reviewed By: Campbell Barton

Differential Revision: https://developer.blender.org/D9087

===================================================================

M	source/blender/blenkernel/CMakeLists.txt
M	source/blender/blenkernel/intern/lattice_deform.c
A	source/blender/blenkernel/intern/lattice_deform_test.cc

===================================================================

diff --git a/source/blender/blenkernel/CMakeLists.txt b/source/blender/blenkernel/CMakeLists.txt
index 0fbc8c4c229..f6df3f1bb62 100644
--- a/source/blender/blenkernel/CMakeLists.txt
+++ b/source/blender/blenkernel/CMakeLists.txt
@@ -718,6 +718,7 @@ if(WITH_GTESTS)
   set(TEST_SRC
     intern/armature_test.cc
     intern/fcurve_test.cc
+    intern/lattice_deform_test.cc
   )
   set(TEST_INC
     ../editors/include
diff --git a/source/blender/blenkernel/intern/lattice_deform.c b/source/blender/blenkernel/intern/lattice_deform.c
index 919093f3630..43965813b84 100644
--- a/source/blender/blenkernel/intern/lattice_deform.c
+++ b/source/blender/blenkernel/intern/lattice_deform.c
@@ -49,14 +49,24 @@
 
 #include "BKE_deform.h"
 
+#ifdef __SSE2__
+#  include <emmintrin.h>
+#endif
+
 /* -------------------------------------------------------------------- */
 /** \name Lattice Deform API
  * \{ */
 
 typedef struct LatticeDeformData {
-  const Object *object;
-  float *latticedata;
+  /* Convert from object space to deform space */
   float latmat[4][4];
+  /* Cached reference to the lattice to use for evaluation. When in edit mode this attribute
+   * is set to the edit mode lattice. */
+  const Lattice *lt;
+  /* Preprocessed lattice points (converted to deform space). */
+  float *latticedata;
+  /* Prefetched DeformWeights of the lattice. */
+  float *lattice_weights;
 } LatticeDeformData;
 
 LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Object *ob)
@@ -72,6 +82,7 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
   float fu, fv, fw;
   int u, v, w;
   float *latticedata;
+  float *lattice_weights = NULL;
   float latmat[4][4];
   LatticeDeformData *lattice_deform_data;
 
@@ -80,8 +91,10 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
   }
   bp = lt->def;
 
-  fp = latticedata = MEM_mallocN(sizeof(float[3]) * lt->pntsu * lt->pntsv * lt->pntsw,
-                                 "latticedata");
+  const int32_t num_points = lt->pntsu * lt->pntsv * lt->pntsw;
+  /* We allocate one additional float for SSE2 optimizations. Without this
+   * the SSE2 instructions for the last item would read in unallocated memory. */
+  fp = latticedata = MEM_mallocN(sizeof(float[3]) * num_points + sizeof(float), "latticedata");
 
   /* for example with a particle system: (ob == NULL) */
   if (ob == NULL) {
@@ -100,6 +113,20 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
     invert_m4_m4(imat, latmat);
   }
 
+  /* Prefetch lattice deform group weights. */
+  int defgrp_index = -1;
+  const MDeformVert *dvert = BKE_lattice_deform_verts_get(oblatt);
+  if (lt->vgroup[0] && dvert) {
+    defgrp_index = BKE_object_defgroup_name_index(ob, lt->vgroup);
+
+    if (defgrp_index != -1) {
+      lattice_weights = MEM_malloc_arrayN(sizeof(float), num_points, "lattice_weights");
+      for (int index = 0; index < num_points; index++) {
+        lattice_weights[index] = BKE_defvert_find_weight(dvert + index, defgrp_index);
+      }
+    }
+  }
+
   for (w = 0, fw = lt->fw; w < lt->pntsw; w++, fw += lt->dw) {
     for (v = 0, fv = lt->fv; v < lt->pntsv; v++, fv += lt->dv) {
       for (u = 0, fu = lt->fu; u < lt->pntsu; u++, bp++, co += 3, fp += 3, fu += lt->du) {
@@ -121,7 +148,8 @@ LatticeDeformData *BKE_lattice_deform_data_create(const Object *oblatt, const Ob
 
   lattice_deform_data = MEM_mallocN(sizeof(LatticeDeformData), "Lattice Deform Data");
   lattice_deform_data->latticedata = latticedata;
-  lattice_deform_data->object = oblatt;
+  lattice_deform_data->lattice_weights = lattice_weights;
+  lattice_deform_data->lt = lt;
   copy_m4_m4(lattice_deform_data->latmat, latmat);
 
   return lattice_deform_data;
@@ -131,30 +159,21 @@ void BKE_lattice_deform_data_eval_co(LatticeDeformData *lattice_deform_data,
                                      float co[3],
                                      float weight)
 {
-  const Object *ob = lattice_deform_data->object;
-  Lattice *lt = ob->data;
+  float *latticedata = lattice_deform_data->latticedata;
+  float *lattice_weights = lattice_deform_data->lattice_weights;
+  BLI_assert(latticedata);
+  const Lattice *lt = lattice_deform_data->lt;
   float u, v, w, tu[4], tv[4], tw[4];
   float vec[3];
   int idx_w, idx_v, idx_u;
   int ui, vi, wi, uu, vv, ww;
 
   /* vgroup influence */
-  int defgrp_index = -1;
   float co_prev[3], weight_blend = 0.0f;
-  const MDeformVert *dvert = BKE_lattice_deform_verts_get(ob);
-  float *__restrict latticedata = lattice_deform_data->latticedata;
-
-  if (lt->editlatt) {
-    lt = lt->editlatt->latt;
-  }
-  if (latticedata == NULL) {
-    return;
-  }
-
-  if (lt->vgroup[0] && dvert) {
-    defgrp_index = BKE_object_defgroup_name_index(ob, lt->vgroup);
-    copy_v3_v3(co_prev, co);
-  }
+  copy_v3_v3(co_prev, co);
+#ifdef __SSE2__
+  __m128 co_vec = _mm_loadu_ps(co_prev);
+#endif
 
   /* co is in local coords, treat with latmat */
   mul_v3_m4v3(vec, lattice_deform_data->latmat, co);
@@ -197,67 +216,47 @@ void BKE_lattice_deform_data_eval_co(LatticeDeformData *lattice_deform_data,
     wi = 0;
   }
 
-  for (ww = wi - 1; ww <= wi + 2; ww++) {
-    w = tw[ww - wi + 1];
+  const int w_stride = lt->pntsu * lt->pntsv;
+  const int idx_w_max = (lt->pntsw - 1) * lt->pntsu * lt->pntsv;
+  const int v_stride = lt->pntsu;
+  const int idx_v_max = (lt->pntsv - 1) * lt->pntsu;
+  const int idx_u_max = (lt->pntsu - 1);
 
-    if (w != 0.0f) {
-      if (ww > 0) {
-        if (ww < lt->pntsw) {
-          idx_w = ww * lt->pntsu * lt->pntsv;
-        }
-        else {
-          idx_w = (lt->pntsw - 1) * lt->pntsu * lt->pntsv;
+  for (ww = wi - 1; ww <= wi + 2; ww++) {
+    w = weight * tw[ww - wi + 1];
+    idx_w = CLAMPIS(ww * w_stride, 0, idx_w_max);
+    for (vv = vi - 1; vv <= vi + 2; vv++) {
+      v = w * tv[vv - vi + 1];
+      idx_v = CLAMPIS(vv * v_stride, 0, idx_v_max);
+      for (uu = ui - 1; uu <= ui + 2; uu++) {
+        u = v * tu[uu - ui + 1];
+        idx_u = CLAMPIS(uu, 0, idx_u_max);
+        const int idx = idx_w + idx_v + idx_u;
+#ifdef __SSE2__
+        {
+          __m128 weight_vec = _mm_set1_ps(u);
+          /* This will load one extra element, this is ok because
+           * we ignore that part of register anyway.
+           */
+          __m128 lattice_vec = _mm_loadu_ps(&latticedata[idx * 3]);
+          co_vec = _mm_add_ps(co_vec, _mm_mul_ps(lattice_vec, weight_vec));
         }
-      }
-      else {
-        idx_w = 0;
-      }
-
-      for (vv = vi - 1; vv <= vi + 2; vv++) {
-        v = w * tv[vv - vi + 1];
-
-        if (v != 0.0f) {
-          if (vv > 0) {
-            if (vv < lt->pntsv) {
-              idx_v = idx_w + vv * lt->pntsu;
-            }
-            else {
-              idx_v = idx_w + (lt->pntsv - 1) * lt->pntsu;
-            }
-          }
-          else {
-            idx_v = idx_w;
-          }
-
-          for (uu = ui - 1; uu <= ui + 2; uu++) {
-            u = weight * v * tu[uu - ui + 1];
-
-            if (u != 0.0f) {
-              if (uu > 0) {
-                if (uu < lt->pntsu) {
-                  idx_u = idx_v + uu;
-                }
-                else {
-                  idx_u = idx_v + (lt->pntsu - 1);
-                }
-              }
-              else {
-                idx_u = idx_v;
-              }
-
-              madd_v3_v3fl(co, &latticedata[idx_u * 3], u);
-
-              if (defgrp_index != -1) {
-                weight_blend += (u * BKE_defvert_find_weight(dvert + idx_u, defgrp_index));
-              }
-            }
-          }
+#else
+        madd_v3_v3fl(co, &latticedata[idx * 3], u);
+#endif
+        if (lattice_weights) {
+          weight_blend += (u * lattice_weights[idx]);
         }
       }
     }
   }
+#ifdef __SSE2__
+  {
+    copy_v3_v3(co, (float *)&co_vec);
+  }
+#endif
 
-  if (defgrp_index != -1) {
+  if (lattice_weights) {
     interp_v3_v3v3(co, co_prev, co, weight_blend);
   }
 }
diff --git a/source/blender/blenkernel/intern/lattice_deform_test.cc b/source/blender/blenkernel/intern/lattice_deform_test.cc
new file mode 100644
index 00000000000..33a4cc1d871
--- /dev/null
+++ b/source/blender/blenkernel/intern/lattice_deform_test.cc
@@ -0,0 +1,138 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2020 by Blender Foundation.
+ */
+#include "testing/testing.h"
+
+#include "BKE_idtype.h"
+#include "BKE_lattice.h"
+
+#include "MEM_guardedalloc.h"
+
+#include "DNA_lattice_types.h"
+#include "DNA_mesh_types.h"
+#include "DNA_object_types.h"
+
+#include "BLI_rand.hh"
+
+namespace blender::bke::tests {
+
+struct LatticeDeformTestContext {
+  Lattice lattice;
+  Object ob_lattice;
+  Mesh mesh;
+  Object ob_mesh;
+  float (*coords)[3];
+  LatticeDeformData *ldd;
+};
+
+static void test_lattice_deform_init(LatticeDeformTestContext *ctx,
+                                     RandomNumberGenerator *rng,
+                                     int32_t num_items)
+{
+  /* Generate random input data between -5 and 5. */
+  ctx->coords = (float(*)[3])MEM_malloc_arrayN(sizeof(float[3]), num_items, __func__);
+  for (uint32_t index = 0; index < num_items; index++) {
+    ctx->coords[index][0] = (rng->get_float() - 0.5f) * 10;
+    ctx->coords[index][1] = (rng->get_float() - 0.5f) * 10;
+    ctx->coords[index][2] = (rng->get_float() - 0.5f) * 10;
+  }
+  IDType_ID_LT.init_data(&ctx->lattice.id);
+  IDType_ID_OB.init_data(&ctx->ob_lattice.id);
+  ctx->ob_lattice.type = OB

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list