From f4e4349877cbe50d438bb72e50ab1bba4e66b9ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?=
Date: Fri, 11 Jul 2025 09:28:26 +0200
Subject: [PATCH] feat(mat4): add only x64 SSE mul

---
 src/math/mat4.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 src/math/mat4.h |  6 +++---
 2 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/src/math/mat4.c b/src/math/mat4.c
index e13c511..3b46aa9 100644
--- a/src/math/mat4.c
+++ b/src/math/mat4.c
@@ -164,16 +164,64 @@ Mat4f_t mat4f_scale(const Mat4f_t *__restrict m, float scalar)
     return mout;
 }
 
-// Mat4_t mat4_scl(const Mat4_t* m, float scalar)
-// {
-// Mat4_t mat;
+// Multiplies out by m2 in place (out = out * m2, row * col) and returns out.
+// m2 is read in full before out is written, so m2 may point to the same matrix as out.
+// Uses SSE when SIMD_X86 is defined and a scalar loop on every other build.
+Mat4f_t* mat4_mul_r(Mat4f_t* out, const Mat4f_t* m2)
+{
+    // Snapshot the left operand: out is overwritten element by element below.
+    Mat4f_t clone = mat4f_clone(out);
 
-// for(int i = 0; i<16; i++) {
-// mat.m[i] = m->m[i] * scalar;
-// }
+#if defined (SIMD_X86)
+    // The columns of m2 do not depend on the row, so gather them once up front.
+    __m128 mcols[4];
+    for (int col = 0; col<4; col++) {
+        mcols[col] = _mm_set_ps(
+            m2->m[3 * 4 + col],
+            m2->m[2 * 4 + col],
+            m2->m[1 * 4 + col],
+            m2->m[0 * 4 + col]
+        );
+    }
 
-// return mat;
-// }
+    for (int row = 0; row<4; row++) {
+        // Unaligned load: 16-byte alignment of clone.m is not established here.
+        __m128 mrow = _mm_loadu_ps(&clone.m[row * 4]);
+
+        for (int col = 0; col<4; col++) {
+            __m128 mmul = _mm_mul_ps(mrow, mcols[col]); // [x, y, z, w]
+
+            // Horizontal add of the four products into lane 0.
+            __m128 shuf = _mm_shuffle_ps(mmul, mmul, _MM_SHUFFLE(2, 3, 0, 1)); // [y, x, w, z]
+            __m128 sum = _mm_add_ps(mmul, shuf); // [x+y, x+y, z+w, z+w]
+
+            shuf = _mm_movehl_ps(shuf, sum); // [z+w, z+w, w, z]
+            sum = _mm_add_ss(sum, shuf); // [x+y+z+w, x+y, z+w, z+w]
+
+            out->m[row * 4 + col] = _mm_cvtss_f32(sum);
+        }
+    }
+#else
+    // Scalar fallback so builds without SIMD_X86 still compute the product.
+    // m2 is copied first so that writing out cannot disturb it when both alias.
+    float rhs[16];
+    for (int i = 0; i<16; i++) {
+        rhs[i] = m2->m[i];
+    }
+
+    for (int row = 0; row<4; row++) {
+        const float* lhs = &clone.m[row * 4];
+
+        for (int col = 0; col<4; col++) {
+            // Same pairing as the SSE path: (x + y) + (z + w).
+            out->m[row * 4 + col] =
+                (lhs[0] * rhs[0 * 4 + col] + lhs[1] * rhs[1 * 4 + col]) +
+                (lhs[2] * rhs[2 * 4 + col] + lhs[3] * rhs[3 * 4 + col]);
+        }
+    }
+#endif
+    return out;
+}
 
 // Mat4_t mat4_mul(const Mat4_t* m1, const Mat4_t* m2)
 // {
diff --git a/src/math/mat4.h b/src/math/mat4.h
index 7e7b5dc..1f575aa 100644
--- a/src/math/mat4.h
+++ b/src/math/mat4.h
@@ -28,10 +28,10 @@ Mat4f_t* mat4f_sub_r(Mat4f_t* out, const Mat4f_t* m2);
 Mat4f_t mat4f_scale(const Mat4f_t *__restrict m, float scalar);
 Mat4f_t* mat4f_scale_r(Mat4f_t *out, float scalar);
 
-// Mat4_t mat4_scl(const Mat4_t* m, float scalar);
 
-// // row * col
-// Mat4_t mat4_mul(const Mat4_t* m1, const Mat4_t* m2);
+// row * col
+Mat4f_t mat4_mul(const Mat4f_t* m1, const Mat4f_t* m2);
+Mat4f_t* mat4_mul_r(Mat4f_t* out, const Mat4f_t* m2);
 
 // Mat4_t mat4_tpo(const Mat4_t* m);
 