feat(mat4): add x86 transpose function

2026-01-25 03:34:22 +00:00 · 2025-08-07 08:07:17 +02:00
parent bff9c46bb0
commit 2b1e00305c
4 changed files with 61 additions and 12 deletions
--- a/.github/workflows/cmake-multi-platform.yml
+++ b/.github/workflows/cmake-multi-platform.yml
@@ -73,4 +73,4 @@ jobs:
      working-directory: ${{ steps.strings.outputs.build-output-dir }}
      # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
-      run: ctest --build-config ${{ matrix.build_type }} --verbose
+      run: ctest --build-config ${{ matrix.build_type }}
--- a/src/main.c
+++ b/src/main.c
@@ -3,11 +3,34 @@

 int main(void) 
 {
-    float arr[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+    float arr[16] = {
+        1, 2, 3, 4, 
+        5, 6, 7, 8, 
+        9, 10, 11, 12, 
+        13, 14, 15, 16
+    };
+
    Mat4f_t mat1 =  mat4f_from_array(arr);
-    Mat4f_t mat2 =  mat4f_scalar(1);
-    Mat4f_t mat3 =  mat4f_zero();
-    Mat4f_t mat4 =  mat4f_identity();
-    printf("%f", arr[1]);
+    Mat4f_t* mat_tps = mat4f_tpo_r(&mat1);
+
+    printf("%f %f %f %f\n%f %f %f %f\n%f %f %f %f\n%f %f %f %f\n", 
+        mat1.m[0],
+        mat1.m[1],
+        mat1.m[2],
+        mat1.m[3],
+        mat1.m[4],
+        mat1.m[5],
+        mat1.m[6],
+        mat1.m[7],
+        mat1.m[8],
+        mat1.m[9],
+        mat1.m[10],
+        mat1.m[11],
+        mat1.m[12],
+        mat1.m[13],
+        mat1.m[14],
+        mat1.m[15]
+    
+    );
    return 0;
 }
--- a/src/math/mat4.c
+++ b/src/math/mat4.c
@@ -227,22 +227,48 @@ Mat4f_t mat4_mul(const Mat4f_t* m1, const Mat4f_t* m2)
    return mout;
 }

-Mat4f_t* mat4_tpo_r(Mat4f_t *__restrict out)
+Mat4f_t* mat4f_tpo_r(Mat4f_t *__restrict out)
 {
    Mat4f_t clone = mat4f_clone(out);
    
-    for(int i = 0; i < MAT_DIM; i++) {
    #if defined (SIMD_X86)
+    __m128 res[4];
+    for (int i=0; i<MAT_DIM; i++) {
+        __m128 mrow = _mm_load_ps(&clone.m[i*MAT_DIM]);
+    }
+    
+    __m128 t0 = _mm_unpacklo_ps(row[0], row[1]);
+    __m128 t1 = _mm_unpackhi_ps(row[0], row[1]);
+    __m128 t2 = _mm_unpacklo_ps(row[2], row[3]);
+    __m128 t3 = _mm_unpackhi_ps(row[2], row[3]);
+
+    __m128 r0 = _mm_movelh_ps(t0, t2);
+    __m128 r1 = _mm_movehl_ps(t2, t0);
+    __m128 r2 = _mm_movelh_ps(t1, t3);
+    __m128 r3 = _mm_movehl_ps(t3, t1);
+
+    _mm_store_ps(&out->m[0 * MAT_DIM], r0);
+    _mm_store_ps(&out->m[1 * MAT_DIM], r1);
+    _mm_store_ps(&out->m[2 * MAT_DIM], r2);
+    _mm_store_ps(&out->m[3 * MAT_DIM], r3);


 #elif defined (SIMD_ARCH)
 #else
+    for(int i = 0; i < MAT_DIM; i++) {
        int dim_i = i * MAT_DIM;

        for (int j = 0; j < MAT_DIM; j++) {
            out->m[dim_i + j] = clone.m[(j * MAT_DIM) + i];
        }
-#endif
    }
+#endif
    return out;    
 }
+
+Mat4f_t mat4f_tpo(const Mat4f_t *restrict m)
+{
+    Mat4f_t res = mat4f_clone(m);
+    mat4f_clone(&res);
+    return res;
+}
--- a/src/math/mat4.h
+++ b/src/math/mat4.h
@@ -34,8 +34,8 @@ Mat4f_t* mat4f_scale_r(Mat4f_t *out, float scalar);
 Mat4f_t mat4f_mul(const Mat4f_t* m1, const Mat4f_t* m2);
 Mat4f_t* mat4f_mul_r(Mat4f_t* out, const Mat4f_t* m2);

-Mat4f_t mat4_tpo(const Mat4f_t *__restrict m);
-Mat4f_t* mat4_tpo_r(Mat4f_t *__restrict m);
+Mat4f_t mat4f_tpo(const Mat4f_t *__restrict m);
+Mat4f_t* mat4f_tpo_r(Mat4f_t *__restrict m);

 // float mat4_det(const Mat4_t* m);