From 8d3a55e7f59d06c735aad73f977219191bb8fa67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?=
Date: Wed, 3 Sep 2025 18:20:34 +0200
Subject: [PATCH] feat: add mat4f_mul_vec4f function (x64 / arm / none)

---
 src/math/mv4.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/math/mv4.h | 11 ++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 src/math/mv4.c
 create mode 100644 src/math/mv4.h

diff --git a/src/math/mv4.c b/src/math/mv4.c
new file mode 100644
index 0000000..2effcd5
--- /dev/null
+++ b/src/math/mv4.c
@@ -0,0 +1,70 @@
+#include "mv4.h"
+
+#ifdef SIMD_X86
+    #include <xmmintrin.h>
+#elif defined(SIMD_ARCH)
+    #include <arm_neon.h>
+#endif
+
+/*
+ * Multiply a 4x4 float matrix by a 4-component float vector.
+ *
+ * Row i of the matrix is read from mat.m[4*i .. 4*i+3]; out.data[i] is the
+ * dot product of that row with v.data[0..3].
+ *
+ * The code path is picked at compile time: SSE when SIMD_X86 is defined,
+ * NEON when SIMD_ARCH is defined, portable scalar C otherwise.
+ * NOTE(review): SIMD_ARCH gates the ARM NEON path; confirm that is the macro
+ * the build system actually defines for ARM targets.
+ */
+Vec4f_t mat4f_mul_vec4f(const Mat4f_t mat, Vec4f_t v)
+{
+    Vec4f_t out;
+#if defined (SIMD_X86)
+    /* Unaligned loads: nothing visible here guarantees that the by-value
+     * copies of mat and v are 16-byte aligned, which _mm_load_ps requires. */
+    __m128 vec = _mm_loadu_ps(v.data);
+
+    for (int i = 0; i < 4; i++)
+    {
+        __m128 row = _mm_loadu_ps(&mat.m[4*i]);
+        __m128 mul = _mm_mul_ps(row, vec);
+
+        /* Horizontal add of the products p0..p3.
+         * Swapping neighbours gives lane 0 = p0+p1 and lane 2 = p2+p3. */
+        __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
+        __m128 sum = _mm_add_ps(mul, shuf);
+        /* Move p2+p3 down to lane 0 and add it to p0+p1. */
+        shuf = _mm_movehl_ps(shuf, sum);
+        sum = _mm_add_ss(sum, shuf);
+
+        out.data[i] = _mm_cvtss_f32(sum);
+    }
+
+#elif defined (SIMD_ARCH)
+    float32x4_t vec = vld1q_f32(v.data);
+
+    for (int i = 0; i < 4; i++)
+    {
+        float32x4_t row = vld1q_f32(&mat.m[i*4]);
+        float32x4_t mul = vmulq_f32(row, vec);
+
+        /* Pairwise adds: (p0+p1, p2+p3), then their sum in lane 0. */
+        float32x2_t sum_low = vget_low_f32(mul);
+        float32x2_t sum_high = vget_high_f32(mul);
+        float32x2_t sum_pair = vpadd_f32(sum_low, sum_high);
+        sum_pair = vpadd_f32(sum_pair, sum_pair);
+
+        out.data[i] = vget_lane_f32(sum_pair, 0);
+    }
+
+#else
+    for (int i = 0; i < 4; i++)
+    {
+        out.data[i] = 0.f;
+        for (int j = 0; j < 4; j++)
+            out.data[i] += mat.m[i*4 + j] * v.data[j];
+    }
+#endif
+    return out;
+}
diff --git a/src/math/mv4.h b/src/math/mv4.h
new file mode 100644
index 0000000..a74d9b3
--- /dev/null
+++ b/src/math/mv4.h
@@ -0,0 +1,11 @@
+#ifndef MV4_H
+#define MV4_H
+
+#include "mat4.h"
+#include "vec4.h"
+
+/* Returns mat * v: component i of the result is the dot product of matrix
+ * row i (mat.m[4*i .. 4*i+3]) with v. */
+Vec4f_t mat4f_mul_vec4f(const Mat4f_t mat, Vec4f_t v);
+
+#endif /* MV4_H */