From 8d3a55e7f59d06c735aad73f977219191bb8fa67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?= <loicguezo@gmail.com>
Date: Wed, 3 Sep 2025 18:20:34 +0200
Subject: [PATCH] feat: add mat4f_mul_vec4f function (x64 / arm / none)

---
 src/math/mv4.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/math/mv4.h |  4 ++++
 2 files changed, 58 insertions(+)
 create mode 100644 src/math/mv4.c
 create mode 100644 src/math/mv4.h

diff --git a/src/math/mv4.c b/src/math/mv4.c
new file mode 100644
index 0000000..2effcd5
--- /dev/null
+++ b/src/math/mv4.c
@@ -0,0 +1,54 @@
+#include "mv4.h"
+
+#ifdef SIMD_X86
+    #include <xmmintrin.h>
+#elif defined(SIMD_ARCH)
+    #include <arm_neon.h>
+#endif
+
+
+/* Multiply a 4x4 matrix by a 4-vector: out.data[i] = dot(row i of mat, v).
+ * mat.m is read as 16 floats, row i starting at m[4*i]. One of three
+ * implementations is selected at compile time: SSE, NEON, or plain C. */
+Vec4f_t mat4f_mul_vec4f(const Mat4f_t mat, Vec4f_t v)
+{
+    Vec4f_t out;
+#if defined (SIMD_X86)
+    /* Unaligned loads: _mm_load_ps needs 16-byte alignment, which nothing here guarantees. */
+    __m128 vec = _mm_loadu_ps(v.data);
+    for (int i = 0; i < 4; i++)
+    {
+        __m128 row = _mm_loadu_ps(&mat.m[4*i]);
+        __m128 mul = _mm_mul_ps(row, vec);
+        /* Swap lanes inside each pair, so sum = [m0+m1, m0+m1, m2+m3, m2+m3]. */
+        __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
+        __m128 sum = _mm_add_ps(mul, shuf);
+        /* Move m2+m3 into lane 0 and add it to m0+m1: lane 0 is the dot product. */
+        shuf = _mm_movehl_ps(shuf, sum);
+        sum = _mm_add_ss(sum, shuf);
+        out.data[i] = _mm_cvtss_f32(sum);
+    }
+#elif defined (SIMD_ARCH)
+    float32x4_t vec = vld1q_f32(v.data);
+    for (int i = 0; i < 4; i++)
+    {
+        float32x4_t row = vld1q_f32(&mat.m[i*4]);
+        float32x4_t mul = vmulq_f32(row, vec);
+        /* Pairwise adds: first [m0+m1, m2+m3], then the full sum in both lanes. */
+        float32x2_t sum_low = vget_low_f32(mul);
+        float32x2_t sum_high = vget_high_f32(mul);
+        float32x2_t sum_pair = vpadd_f32(sum_low, sum_high);
+        sum_pair = vpadd_f32(sum_pair, sum_pair);
+        out.data[i] = vget_lane_f32(sum_pair, 0);
+    }
+#else
+    /* Portable scalar fallback used when neither SIMD macro is defined. */
+    for (int i = 0; i < 4; i++)
+    {
+        out.data[i] = 0.f;
+        for (int j = 0; j < 4; j++)
+            out.data[i] += mat.m[i*4 + j] * v.data[j];
+    }
+#endif
+    return out;
+}
\ No newline at end of file
diff --git a/src/math/mv4.h b/src/math/mv4.h
new file mode 100644
index 0000000..a74d9b3
--- /dev/null
+++ b/src/math/mv4.h
@@ -0,0 +1,4 @@
+#include "mat4.h"
+#include "vec4.h"
+/* Returns out where out.data[i] = sum over j of mat.m[4*i + j] * v.data[j], for i in 0..3. */
+Vec4f_t mat4f_mul_vec4f(const Mat4f_t mat, Vec4f_t v);
\ No newline at end of file