[Bf-blender-cvs] [de8b4ef2175] temp-lanpr-staging: BLI: SSE2 support for double version matrix.
YimingWu
noreply at git.blender.org
Sun Aug 18 08:27:27 CEST 2019
Commit: de8b4ef2175c51271abcfb1c0b67a2a30e77121a
Author: YimingWu
Date: Sun Aug 18 14:26:28 2019 +0800
Branches: temp-lanpr-staging
https://developer.blender.org/rBde8b4ef2175c51271abcfb1c0b67a2a30e77121a
BLI: SSE2 support for double version matrix.
===================================================================
M source/blender/blenlib/BLI_math_matrix.h
M source/blender/blenlib/intern/math_matrix.c
===================================================================
diff --git a/source/blender/blenlib/BLI_math_matrix.h b/source/blender/blenlib/BLI_math_matrix.h
index 432b1f8d4f8..f5d87667b73 100644
--- a/source/blender/blenlib/BLI_math_matrix.h
+++ b/source/blender/blenlib/BLI_math_matrix.h
@@ -88,6 +88,7 @@ void mul_m3_m3m3_uniq(float R[3][3], const float A[3][3], const float B[3][3]);
void mul_m3_m3_pre(float R[3][3], const float A[3][3]);
void mul_m3_m3_post(float R[3][3], const float B[3][3]);
void mul_m4_m4m4_uniq(float R[4][4], const float A[4][4], const float B[4][4]);
+void mul_m4_m4m4_db_uniq(double R[4][4], const double A[4][4], const double B[4][4]);
void mul_m4db_m4db_m4fl_uniq(double R[4][4], const double A[4][4], const float B[4][4]);
void mul_m4_m4_pre(float R[4][4], const float A[4][4]);
void mul_m4_m4_post(float R[4][4], const float B[4][4]);
diff --git a/source/blender/blenlib/intern/math_matrix.c b/source/blender/blenlib/intern/math_matrix.c
index ffd04e5aa29..516227b642e 100644
--- a/source/blender/blenlib/intern/math_matrix.c
+++ b/source/blender/blenlib/intern/math_matrix.c
@@ -268,27 +268,75 @@ void mul_m4_m4m4_uniq(float R[4][4], const float A[4][4], const float B[4][4])
#endif
}
/* Guarded so this block also stands alone: math_matrix.c already provides both. */
#ifdef __SSE2__
#  include <emmintrin.h>
#endif
#ifndef BLI_assert
#  define BLI_assert(a) ((void)0)
#endif

/**
 * Double-precision 4x4 matrix product: R = B * A in the storage convention used
 * here, i.e. R[i][k] = sum_j B[i][j] * A[j][k] (matches mul_m4_m4m4_uniq).
 *
 * \param R: Result matrix; must not alias A or B (hence `_uniq`).
 * \param A: Right-hand factor.
 * \param B: Left-hand factor.
 *
 * NOTE: unlike the float version, one __m128d holds only TWO doubles, so each
 * 4-double row must be processed as two halves (columns 0-1 and 2-3). A naive
 * _ps -> _pd substitution would leave R[i][2..3] unwritten.
 */
void mul_m4_m4m4_db_uniq(double R[4][4], const double A[4][4], const double B[4][4])
{
  BLI_assert(R != A && R != B);

  /* matrix product: R[j][k] = A[j][i] . B[i][k] */
#ifdef __SSE2__
  /* Low half (columns 0-1) and high half (columns 2-3) of each row of A. */
  __m128d A0l = _mm_loadu_pd(&A[0][0]), A0h = _mm_loadu_pd(&A[0][2]);
  __m128d A1l = _mm_loadu_pd(&A[1][0]), A1h = _mm_loadu_pd(&A[1][2]);
  __m128d A2l = _mm_loadu_pd(&A[2][0]), A2h = _mm_loadu_pd(&A[2][2]);
  __m128d A3l = _mm_loadu_pd(&A[3][0]), A3h = _mm_loadu_pd(&A[3][2]);

  for (int i = 0; i < 4; i++) {
    __m128d B0 = _mm_set1_pd(B[i][0]);
    __m128d B1 = _mm_set1_pd(B[i][1]);
    __m128d B2 = _mm_set1_pd(B[i][2]);
    __m128d B3 = _mm_set1_pd(B[i][3]);

    __m128d lo = _mm_add_pd(_mm_add_pd(_mm_mul_pd(B0, A0l), _mm_mul_pd(B1, A1l)),
                            _mm_add_pd(_mm_mul_pd(B2, A2l), _mm_mul_pd(B3, A3l)));
    __m128d hi = _mm_add_pd(_mm_add_pd(_mm_mul_pd(B0, A0h), _mm_mul_pd(B1, A1h)),
                            _mm_add_pd(_mm_mul_pd(B2, A2h), _mm_mul_pd(B3, A3h)));

    _mm_storeu_pd(&R[i][0], lo);
    _mm_storeu_pd(&R[i][2], hi);
  }
#else
  R[0][0] = B[0][0] * A[0][0] + B[0][1] * A[1][0] + B[0][2] * A[2][0] + B[0][3] * A[3][0];
  R[0][1] = B[0][0] * A[0][1] + B[0][1] * A[1][1] + B[0][2] * A[2][1] + B[0][3] * A[3][1];
  R[0][2] = B[0][0] * A[0][2] + B[0][1] * A[1][2] + B[0][2] * A[2][2] + B[0][3] * A[3][2];
  R[0][3] = B[0][0] * A[0][3] + B[0][1] * A[1][3] + B[0][2] * A[2][3] + B[0][3] * A[3][3];

  R[1][0] = B[1][0] * A[0][0] + B[1][1] * A[1][0] + B[1][2] * A[2][0] + B[1][3] * A[3][0];
  R[1][1] = B[1][0] * A[0][1] + B[1][1] * A[1][1] + B[1][2] * A[2][1] + B[1][3] * A[3][1];
  R[1][2] = B[1][0] * A[0][2] + B[1][1] * A[1][2] + B[1][2] * A[2][2] + B[1][3] * A[3][2];
  R[1][3] = B[1][0] * A[0][3] + B[1][1] * A[1][3] + B[1][2] * A[2][3] + B[1][3] * A[3][3];

  R[2][0] = B[2][0] * A[0][0] + B[2][1] * A[1][0] + B[2][2] * A[2][0] + B[2][3] * A[3][0];
  R[2][1] = B[2][0] * A[0][1] + B[2][1] * A[1][1] + B[2][2] * A[2][1] + B[2][3] * A[3][1];
  R[2][2] = B[2][0] * A[0][2] + B[2][1] * A[1][2] + B[2][2] * A[2][2] + B[2][3] * A[3][2];
  R[2][3] = B[2][0] * A[0][3] + B[2][1] * A[1][3] + B[2][2] * A[2][3] + B[2][3] * A[3][3];

  R[3][0] = B[3][0] * A[0][0] + B[3][1] * A[1][0] + B[3][2] * A[2][0] + B[3][3] * A[3][0];
  R[3][1] = B[3][0] * A[0][1] + B[3][1] * A[1][1] + B[3][2] * A[2][1] + B[3][3] * A[3][1];
  R[3][2] = B[3][0] * A[0][2] + B[3][1] * A[1][2] + B[3][2] * A[2][2] + B[3][3] * A[3][2];
  R[3][3] = B[3][0] * A[0][3] + B[3][1] * A[1][3] + B[3][2] * A[2][3] + B[3][3] * A[3][3];
#endif
}
+
void mul_m4db_m4db_m4fl_uniq(double R[4][4], const double A[4][4], const float B[4][4])
{
BLI_assert(R != A && R != B);
+ double temp[4][4];
+
+ copy_m4d_m4(temp,B);
/* matrix product: R[j][k] = A[j][i] . B[i][k] */
-#if 0 /* Help needed to redo __SSE2__ implementation for double version */
- __m128 A0 = _mm_loadu_ps(A[0]);
- __m128 A1 = _mm_loadu_ps(A[1]);
- __m128 A2 = _mm_loadu_ps(A[2]);
- __m128 A3 = _mm_loadu_ps(A[3]);
+#ifdef __SSE2__
+ __m128d A0 = _mm_loadu_pd(A[0]);
+ __m128d A1 = _mm_loadu_pd(A[1]);
+ __m128d A2 = _mm_loadu_pd(A[2]);
+ __m128d A3 = _mm_loadu_pd(A[3]);
for (int i = 0; i < 4; i++) {
- __m128 B0 = _mm_set1_ps(B[i][0]);
- __m128 B1 = _mm_set1_ps(B[i][1]);
- __m128 B2 = _mm_set1_ps(B[i][2]);
- __m128 B3 = _mm_set1_ps(B[i][3]);
+ __m128d B0 = _mm_set1_pd(temp[i][0]);
+ __m128d B1 = _mm_set1_pd(temp[i][1]);
+ __m128d B2 = _mm_set1_pd(temp[i][2]);
+ __m128d B3 = _mm_set1_pd(temp[i][3]);
- __m128 sum = _mm_add_ps(_mm_add_ps(_mm_mul_ps(B0, A0), _mm_mul_ps(B1, A1)),
- _mm_add_ps(_mm_mul_ps(B2, A2), _mm_mul_ps(B3, A3)));
+ __m128d sum = _mm_add_pd(_mm_add_pd(_mm_mul_pd(B0, A0), _mm_mul_pd(B1, A1)),
+ _mm_add_pd(_mm_mul_pd(B2, A2), _mm_mul_pd(B3, A3)));
- _mm_storeu_ps(R[i], sum);
+ _mm_storeu_pd(R[i], sum);
}
#else
R[0][0] = B[0][0] * A[0][0] + B[0][1] * A[1][0] + B[0][2] * A[2][0] + B[0][3] * A[3][0];
More information about the Bf-blender-cvs
mailing list