From 28e072330d1262d3f6b5414f8a56362346ff960f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?= Date: Fri, 11 Jul 2025 21:31:51 +0200 Subject: [PATCH] feat: add MAT_DIM to most mat4f_t functions --- src/math/mat4.c | 63 ++++++++++++++++++++++++++----------------------- src/math/mat4.h | 4 +++- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/math/mat4.c b/src/math/mat4.c index 3eb2d23..2d38326 100644 --- a/src/math/mat4.c +++ b/src/math/mat4.c @@ -3,7 +3,7 @@ Mat4f_t mat4f_from_array(const float arr[16]) { Mat4f_t mat; - for(int i = 0; im[i]); __m128 mb = _mm_load_ps(&m2->m[i]); @@ -102,7 +102,7 @@ Mat4f_t mat4_add(const Mat4f_t* m1, const Mat4f_t* m2) Mat4f_t* mat4f_sub_r(Mat4f_t *out, const Mat4f_t *m2) { - for(int i = 0; im[i]); __m128 mb = _mm_load_ps(&m2->m[i]); @@ -136,7 +136,7 @@ Mat4f_t mat4_sub(const Mat4f_t* m1, const Mat4f_t* m2) Mat4f_t* mat4f_scale_r(Mat4f_t *out, float scalar) { - for(int i = 0; im[i]); __m128 mb = _mm_set1_ps(scalar); @@ -149,7 +149,7 @@ Mat4f_t* mat4f_scale_r(Mat4f_t *out, float scalar) float32x4_t mres = vmulq_f32(ma, mb); vst1q_f32(&out->m[i], mres); #else - for(int j = 0; j<4; j++) { + for(int j = 0; jm[i+j] *= scalar; } #endif @@ -168,16 +168,16 @@ Mat4f_t* mat4f_mul_r(Mat4f_t* out, const Mat4f_t* m2) { Mat4f_t clone = mat4f_clone(out); - for (int row = 0; row<4; row++) { + for (int row = 0; rowm[3 * 4 + col], - m2->m[2 * 4 + col], - m2->m[1 * 4 + col], - m2->m[0 * 4 + col] + m2->m[3 * MAT_DIM + col], + m2->m[2 * MAT_DIM + col], + m2->m[1 * MAT_DIM + col], + m2->m[0 * MAT_DIM + col] ); __m128 mmul = _mm_mul_ps(mrow, mcol); @@ -188,17 +188,17 @@ Mat4f_t* mat4f_mul_r(Mat4f_t* out, const Mat4f_t* m2) sum = _mm_add_ss(sum, shuf); // [x+y+z+w, y+y, z+w, w+w] float mres = _mm_cvtss_f32(sum); - out->m[row * 4 + col] = mres; + out->m[row * MAT_DIM + col] = mres; } #elif defined (SIMD_ARCH) - float32x4_t mrow = vld1q_f32(&clone.m[row*4]); + float32x4_t mrow = vld1q_f32(&clone.m[row*MAT_DIM]); - for (int col = 0; col<4; col++) { + for (int col = 0; colm[0 * 4 + col], - m2->m[1 * 4 + col], - m2->m[2 * 4 + col], - m2->m[3 * 4 + col] + m2->m[0 * MAT_DIM + col], + m2->m[1 * MAT_DIM + col], + m2->m[2 * MAT_DIM + col], + m2->m[3 * MAT_DIM + col] }; float32x4_t mmul = vmulq_f32(mrow, mcol); @@ -206,15 +206,15 @@ Mat4f_t* mat4f_mul_r(Mat4f_t* out, const Mat4f_t* m2) float32x2_t final_sum = vpadd_f32(sum_pair, sum_pair); float mres = vget_lane_f32(final_sum, 0); - out->m[row * 4 + col] = mres; + out->m[row * MAT_DIM + col] = mres; } #else - for (int col = 0; col < 4; col++) { + for (int col = 0; col < MAT_DIM; col++) { float sum = 0.0f; - for (int k = 0; k < 4; k++) { - sum += clone.m[row * 4 + k] * m2->m[k * 4 + col]; + for (int k = 0; k < MAT_DIM; k++) { + sum += clone.m[row * MAT_DIM + k] * m2->m[k * MAT_DIM + col]; } - out->m[row * 4 + col] = sum; + out->m[row * MAT_DIM + col] = sum; } #endif } @@ -225,4 +225,9 @@ Mat4f_t mat4_mul(const Mat4f_t* m1, const Mat4f_t* m2) Mat4f_t mout = mat4f_clone(m1); mat4f_mul_r(&mout, m2); return mout; +} + +Mat4f_t* mat4_tpo_r(Mat4f_t *__restrict m) +{ + return m; } \ No newline at end of file diff --git a/src/math/mat4.h b/src/math/mat4.h index fe25bea..a0327e2 100644 --- a/src/math/mat4.h +++ b/src/math/mat4.h @@ -4,6 +4,7 @@ #include "mconfig.h" #define MAT_SIZE 16 +#define MAT_DIM 4 typedef struct { @@ -33,7 +34,8 @@ Mat4f_t* mat4f_scale_r(Mat4f_t *out, float scalar); Mat4f_t mat4f_mul(const Mat4f_t* m1, const Mat4f_t* m2); Mat4f_t* mat4f_mul_r(Mat4f_t* out, const Mat4f_t* m2); -// Mat4_t mat4_tpo(const Mat4_t* m); +Mat4f_t mat4_tpo(const Mat4f_t *__restrict m); +Mat4f_t* mat4_tpo_r(Mat4f_t *__restrict m); // float mat4_det(const Mat4_t* m);