[Bf-blender-cvs] [cfa67535181] temp_bmesh_multires: wrote a fast and inaccurate distance to triangle function for DynTopo. Profiling consistently showed it to be a bottleneck. I've now written scalar versions of this function four times.
Joseph Eagar
noreply at git.blender.org
Wed Mar 31 04:04:50 CEST 2021
Commit: cfa6753518150a9e4174f4f5c4a1ca188e1d195d
Author: Joseph Eagar
Date: Tue Mar 30 19:00:08 2021 -0700
Branches: temp_bmesh_multires
https://developer.blender.org/rBcfa6753518150a9e4174f4f5c4a1ca188e1d195d
wrote a fast and inaccurate distance to triangle function
for DynTopo. Profiling consistently showed it to be a bottleneck.
I've now written scalar versions of this function four times.
For now I'm committing this inaccuration version. I'm also committing
code for a vectorized version I found in a paper. Still need to rejigger
the dyntopo code to use it though.
===================================================================
M source/blender/blenkernel/CMakeLists.txt
A source/blender/blenkernel/intern/closest_point_tri_sse.cc
M source/blender/blenkernel/intern/pbvh_bmesh.c
M source/blender/blenlib/intern/BLI_ghash_utils.c
M source/blender/blenloader/intern/versioning_290.c
M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_cone.cc
M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_cube.cc
M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_ico_sphere.cc
M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_uv_sphere.cc
===================================================================
diff --git a/source/blender/blenkernel/CMakeLists.txt b/source/blender/blenkernel/CMakeLists.txt
index 8fbf49b31d4..e71c48fcbbc 100644
--- a/source/blender/blenkernel/CMakeLists.txt
+++ b/source/blender/blenkernel/CMakeLists.txt
@@ -275,6 +275,7 @@ set(SRC
intern/workspace.c
intern/world.c
intern/writeavi.c
+ intern/closest_point_tri_sse.cc
BKE_DerivedMesh.h
BKE_action.h
diff --git a/source/blender/blenkernel/intern/closest_point_tri_sse.cc b/source/blender/blenkernel/intern/closest_point_tri_sse.cc
new file mode 100644
index 00000000000..b42cc370f6e
--- /dev/null
+++ b/source/blender/blenkernel/intern/closest_point_tri_sse.cc
@@ -0,0 +1,396 @@
+// see Fast Distance Queries for Triangles, Lines, and
+// Points using SSE Instructions
+// http://jcgt.org/published/0003/04/05/paper.pdf
+
+typedef struct DistRet {
+ float dist[4];
+} DistRet;
+
+#if defined(__SSE2__) || defined(__SSE4__)
+# include <cstdint>
+# include <immintrin.h>
+# include <smmintrin.h>
+# include <xmmintrin.h>
+
+struct sseb {
+ // data
+ union {
+ __m128 m128;
+ uint32_t v[4];
+ };
+
+ sseb()
+ {
+ }
+
+ sseb(__m128 m)
+ {
+ m128 = m;
+ }
+};
+
+struct ssef {
+ // data
+ union {
+ __m128 m128;
+ float v[4];
+ int i[4];
+ };
+
+ ssef(float a, float b, float c, float d)
+ {
+ v[0] = a;
+ v[1] = b;
+ v[2] = c;
+ v[3] = d;
+ }
+
+ ssef()
+ {
+ }
+
+ ssef(__m128 f)
+ {
+ m128 = f;
+ }
+
+ ssef(float f)
+ {
+ v[0] = v[1] = v[2] = v[3] = f;
+ }
+};
+#endif
+
+#if defined(__SSE2__) && !defined(__SSE4__)
+
+__m128 my_mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
+{
+ ssef fa(a);
+ ssef fb(b);
+ sseb fm(mask);
+
+ ssef ret;
+ for (int i = 0; i < 4; i++) {
+ if (fm.v[i] & (1 << 31)) {
+ ret.v[i] = fb.v[i];
+ }
+ else {
+ ret.v[i] = fa.v[i];
+ }
+ }
+}
+# define _mm_blendv_ps my_mm_blendv_ps
+#endif
+
+#ifdef __SSE2__
+
+# include <array>
+# include <vector>
+
+static inline struct sseb makeSSSEB(__m128 val)
+{
+ sseb r;
+ r.m128 = val;
+
+ return r;
+}
+
+// operations
+const sseb operator&(const sseb &a, const sseb &b)
+{
+ return makeSSSEB(_mm_and_ps(a.m128, b.m128));
+}
+const sseb operator|(const sseb &a, const sseb &b)
+{
+ return makeSSSEB(_mm_or_ps(a.m128, b.m128));
+}
+const sseb operator|=(const sseb &a, const sseb &b)
+{
+ return a | b;
+}
+const sseb operator^(const sseb &a, const sseb &b)
+{
+ return makeSSSEB(_mm_xor_ps(a.m128, b.m128));
+}
+bool all(const sseb &b)
+{
+ return _mm_movemask_ps(b.m128) == 0xf;
+}
+bool any(const sseb &b)
+{
+ return _mm_movemask_ps(b.m128) != 0x0;
+}
+bool none(const sseb &b)
+{
+ return _mm_movemask_ps(b.m128) == 0x0;
+}
+static inline struct ssef makeSSSEF(__m128 val)
+{
+ ssef r;
+ r.m128 = val;
+
+ return r;
+}
+
+// operations
+const ssef operator+(const ssef &a, const ssef &b)
+{
+ return makeSSSEF(_mm_add_ps(a.m128, b.m128));
+}
+const ssef operator-(const ssef &a, const ssef &b)
+{
+ return makeSSSEF(_mm_sub_ps(a.m128, b.m128));
+}
+const ssef operator*(const ssef &a, const ssef &b)
+{
+ return makeSSSEF(_mm_mul_ps(a.m128, b.m128));
+}
+const ssef operator/(const ssef &a, const ssef &b)
+{
+ return makeSSSEF(_mm_div_ps(a.m128, b.m128));
+}
+
+const sseb operator>=(const ssef &a, const ssef &b)
+{
+ __m128 r1 = _mm_cmpgt_ss(a.m128, b.m128);
+ __m128 r2 = _mm_cmpeq_ss(a.m128, b.m128);
+
+ return makeSSSEB(_mm_or_ps(r1, r2));
+}
+const sseb operator<=(const ssef &a, const ssef &b)
+{
+ __m128 r1 = _mm_cmplt_ss(a.m128, b.m128);
+ __m128 r2 = _mm_cmpeq_ss(a.m128, b.m128);
+
+ return makeSSSEB(_mm_or_ps(r1, r2));
+}
+
+const sseb operator>(const ssef &a, const ssef &b)
+{
+ return makeSSSEB(_mm_cmpgt_ss(a.m128, b.m128));
+}
+
+const sseb operator<(const ssef &a, const ssef &b)
+{
+ return makeSSSEB(_mm_cmplt_ss(a.m128, b.m128));
+}
+
+const ssef min(const ssef &a, const ssef &b)
+{
+ return makeSSSEF(_mm_min_ps(a.m128, b.m128));
+}
+const ssef sqr(const ssef &a)
+{
+ return makeSSSEF(_mm_mul_ps(a.m128, a.m128));
+}
+const ssef sqrt(const ssef &a)
+{
+ return makeSSSEF(_mm_sqrt_ps(a.m128));
+}
+const ssef select(const sseb &mask, const ssef &t, const ssef &f)
+{
+ return makeSSSEF(_mm_blendv_ps(f.m128, t.m128, mask.m128));
+}
+
+template<typename T> struct Vec3 {
+ // data
+ T x, y, z;
+
+ Vec3(T x, T y, T z) : x(x), y(y), z(z)
+ {
+ }
+
+ Vec3(T v) : x(v), y(v), z(v)
+ {
+ }
+};
+
+// operations
+template<typename T> Vec3<T> operator+(const Vec3<T> &a, const Vec3<T> &b)
+{
+ return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+template<typename T> Vec3<T> operator-(const Vec3<T> &a, const Vec3<T> &b)
+{
+ return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+template<typename T> Vec3<T> operator*(const Vec3<T> &a, const Vec3<T> &b)
+{
+ return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+template<typename T> Vec3<T> operator*(const T &a, const Vec3<T> &b)
+{
+ return Vec3<T>(b.x * a, b.y * a, b.z * a);
+}
+template<typename T> Vec3<T> operator*(const Vec3<T> &b, const T &a)
+{
+ return Vec3<T>(b.x * a, b.y * a, b.z * a);
+}
+
+template<typename T> inline Vec3<T> rcp(const Vec3<T> &a)
+{
+ return Vec3<T>(rcp(a.x), rcp(a.y), rcp(a.z));
+}
+template<typename T> inline Vec3<T> rsqrt(const Vec3<T> &a)
+{
+ return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z));
+}
+template<typename T> T dot(const Vec3<T> &a, const Vec3<T> &b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+template<typename T> T length2b(const Vec3<T> &a)
+{
+ return dot(a, a);
+}
+
+typedef sseb simdBool;
+typedef ssef simdFloat;
+typedef Vec3<ssef> simdFloatVec;
+typedef std::array<simdFloatVec, 3> simdTriangle_type;
+typedef std::array<simdFloatVec, 2> simdLine_type;
+typedef simdFloatVec simdPoint_type;
+
+static inline simdFloat length2(const simdFloatVec &a)
+{
+ return a.x * a.x + a.y * a.y + a.z * a.z;
+}
+
+static simdFloat zero = {0};
+static simdFloat one = {1.0f, 1.0f, 1.0f, 1.0f};
+
+const simdFloatVec select(const sseb &mask, const simdFloatVec &t, const simdFloatVec &f)
+{
+ simdFloatVec r2(select(mask, t.x, f.x), select(mask, t.y, f.y), select(mask, t.z, f.z));
+
+ return r2;
+}
+
+template<typename T> T clamp(const T &x, const T &lower, const T &upper)
+{
+ return max(lower, min(x, upper));
+}
+
+const simdFloat simdTriPoint2(simdFloatVec &oTriPoint,
+ const simdTriangle_type &iTri,
+ const simdPoint_type &iPoint)
+{
+ const simdFloatVec ab = iTri[1] - iTri[0];
+ const simdFloatVec ac = iTri[2] - iTri[0];
+ const simdFloatVec ap = iPoint - iTri[0];
+ const simdFloat d1 = dot(ab, ap);
+ const simdFloat d2 = dot(ac, ap);
+ const simdBool mask1 = (d1 <= simdFloat(zero)) & (d2 <= simdFloat(zero));
+ oTriPoint = iTri[0];
+ simdBool exit(mask1);
+ if (all(exit))
+ return length2(oTriPoint - iPoint);
+
+ const simdFloatVec bp = iPoint - iTri[1];
+ const simdFloat d3 = dot(ab, bp);
+ const simdFloat d4 = dot(ac, bp);
+ const simdBool mask2 = (d3 >= simdFloat(zero)) & (d4 <= d3);
+ // Closest point is the point iTri[1]. Update if necessary.
+ oTriPoint = select(exit, oTriPoint, select(mask2, iTri[1], oTriPoint));
+ exit = exit | mask2;
+ if (all(exit))
+ return length2(oTriPoint - iPoint);
+
+ const simdFloatVec cp = iPoint - iTri[2];
+ const simdFloat d5 = dot(ab, cp);
+ const simdFloat d6 = dot(ac, cp);
+ const simdBool mask3 = (d6 >= simdFloat(zero)) & (d5 <= d6);
+ // Closest point is the point iTri[2]. Update if necessary.
+ oTriPoint = select(exit, oTriPoint, select(mask3, iTri[2], oTriPoint));
+ exit |= mask3;
+ if (all(exit))
+ return length2(oTriPoint - iPoint);
+
+ const simdFloat vc = d1 * d4 - d3 * d2;
+ const simdBool mask4 = (vc <= simdFloat(zero)) & (d1 >= simdFloat(zero)) &
+ (d3 <= simdFloat(zero));
+ const simdFloat v1 = d1 / (d1 - d3);
+ const simdFloatVec answer1 = iTri[0] + v1 * ab;
+ // Closest point is on the line ab. Update if necessary.
+ oTriPoint = select(exit, oTriPoint, select(mask4, answer1, oTriPoint));
+ exit |= mask4;
+ if (all(exit))
+ return length2(oTriPoint - iPoint);
+
+ const simdFloat vb = d5 * d2 - d1 * d6;
+ const simdBool mask5 = (vb <= simdFloat(zero)) & (d2 >= simdFloat(zero)) &
+ (d6 <= simdFloat(zero));
+ const simdFloat w1 = d2 / (d2 - d6);
+ const simdFloatVec answer2 = iTri[0] + w1 * ac;
+ // Closest point is on the line ac. Update if necessary.
+ oTriPoint = select(exit, oTriPoint, select(mask5, answer2, oTriPoint));
+ exit |= mask5;
+ if (all(exit))
+ return length2(oTriPoint - iPoint);
+
+ const simdFloat va = d3 * d6 - d5 * d4;
+ const simdBool mask6 = (va <= simdFloat(zero)) & ((d4 - d3) >= simdFloat(zero)) &
+ ((d5 - d6) >= simdFloat(zero));
+ simdFloat w2 = (d4 - d3) / ((d4 - d3) + (d5 - d6));
+ const simdFloatVec answer3 = iTri[1] + w2 * (iTri[2] - iTri[1]);
+ // Closest point is on the line bc. Update if necessary.
+ oTriPoint = select(exit, oTriPoint, select(mask6, answer3, oTriPoint));
+ exit |= mask6;
+ if (all(exit))
+ return length2(oTriPoint - iPoint);
+
+ const simdFloat denom = simdFloat(one) / (va + vb + vc);
+ const simdFloat v2 = vb * denom;
+ const simdFloat w3 = vc * denom;
+ const simdFloatVec answer4 = iTri[0] + ab * v2 + ac * w3;
+ const simdBool mask7 = length2(answer4 - iPoint) < length2(oTriPoint - iPoint);
+ // Closest point is inside triangle. Update if necessary.
+ oTriPoint = select(exit, oTriPoint, select(mask7, answer4, oTriPoint));
+ return length2(oTriPoint - iPoint);
+}
+
+extern "C" struct DistRet dist_to_tri_sphere_fast_4(
+ float p[3], float v1[4][3], float v2[4][3], float v3[4][3], float n[4][3])
+{
+ simdFloatVec mp((simdFloat(p[0]), simdFloat(p[1]), simdFloat(p[2])));
+ simdFloatVec t1(simdFloat(v1[0][0], v1[1][0], v1[2][0], v1[3][0]),
+ simdFloat(v1[0][1], v1[1][1], v1[2][1], v1[3][1]),
+ simdFloat(v1[0][2], v1[1][2], v1[2][2], v1[3][2]));
+ simdFloatVec t2(simdFloat(v2[0][0], v2[1][0], v2[2][0], v2[3][0]),
+ simdFloat(v2[0][1], v2[1][1], v2[2][1], v2[3][1]),
+ simdFloat(v2[0][2], v2[1][2], v2[2][2], v2[3][2]));
+ simdFloatVec t3(simdFloat(v3[
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list