# HG changeset patch
# User Matti Hamalainen <ccr@tnsp.org>
# Date 1349876143 -10800
# Node ID 159264c2792988735264068c3c74eefda2965aa0
# Parent  f07ddc4982c661f6f22324ec768971442ecef5a2
Add some new vector and matrix operations, and introduce some SSE inline
assembler optimized versions.

diff -r f07ddc4982c6 -r 159264c27929 dmvecmat.c
--- a/dmvecmat.c	Wed Oct 10 14:08:39 2012 +0300
+++ b/dmvecmat.c	Wed Oct 10 16:35:43 2012 +0300
@@ -7,20 +7,164 @@
 #include "dmvecmat.h"
 
 
+/* Apply dm_vector_add() to each pair (dst[k], src[k]), k = 0 .. nlist-1. */
+void dm_vector_add_n(DMVector *dst, const DMVector *src, const int nlist)
+{
+    int k;
+    for (k = 0; k < nlist; k++) dm_vector_add(&dst[k], &src[k]);
+}
+
+
+/* Apply dm_vector_add_r() to each triple (dst[k], src1[k], src2[k]). */
+void dm_vector_add_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist)
+{
+    int k;
+    for (k = 0; k < nlist; k++) dm_vector_add_r(&dst[k], &src1[k], &src2[k]);
+}
+
+
+void dm_vector_sub_n(DMVector *dst, const DMVector *src, const int nlist)
+{
+    int i;
+    for (i = 0; i < nlist; i++)
+        dm_vector_sub(dst + i, src + i); /* subtract; was dm_vector_add (copy-paste slip) */
+}
+
+
+/* Apply dm_vector_sub_r() to each triple (dst[k], src1[k], src2[k]). */
+void dm_vector_sub_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist)
+{
+    int k;
+    for (k = 0; k < nlist; k++) dm_vector_sub_r(&dst[k], &src1[k], &src2[k]);
+}
+
+
+/* Multiply vector *vs with rows 0..2 of matrix *mat, result to *vd (W is
+ * disabled). NOTE(review): SIMD path assumes 16-byte vectors/rows; confirm. */
+void dm_vector_mul_by_mat(DMVector *vd, const DMVector *vs, const DMMatrix *mat)
+{
+#ifdef DM_USE_SIMD
+    asm volatile(
+        // matrix rows -> xmm4..xmm7; the pointer itself is passed in a register
+        "movups        (%1),      %%xmm4\n"
+        "movups      16(%1),      %%xmm5\n"
+        "movups      32(%1),      %%xmm6\n"
+        "movups      48(%1),      %%xmm7\n"
+
+        // vector -> xmm0
+        "movups      %2,     %%xmm0\n"
+
+        // zero final result in xmm2
+        "xorps       %%xmm2, %%xmm2\n"
+
+        // perform shuffle and multiply and add whole "column" "X"
+        "movups      %%xmm0, %%xmm1\n"
+        "shufps      $0x00,  %%xmm1, %%xmm1\n"
+        "mulps       %%xmm4, %%xmm1\n"
+        "addps       %%xmm1, %%xmm2\n"
+
+        // Y
+        "movups      %%xmm0, %%xmm1\n"
+        "shufps      $0x55,  %%xmm1, %%xmm1\n"
+        "mulps       %%xmm5, %%xmm1\n"
+        "addps       %%xmm1, %%xmm2\n"
+
+        // Z
+        "movups      %%xmm0, %%xmm1\n"
+        "shufps      $0xAA,  %%xmm1, %%xmm1\n"
+        "mulps       %%xmm6, %%xmm1\n"
+        "addps       %%xmm1, %%xmm2\n"
+
+#if 0
+        // W
+        "movups      %%xmm0, %%xmm1\n"
+        "shufps      $0xFF,  %%xmm1, %%xmm1\n"
+        "mulps       %%xmm7, %%xmm1\n"
+        "addps       %%xmm1, %%xmm2\n"
+#endif
+
+        // Result -> *vd (memory operands name the objects, not the pointers)
+        "movups      %%xmm2, %0\n"
+        : "=m" (*vd)
+        : "r" (mat), "m" (*vs)
+        : "memory", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+        );
+#else
+    vd->x = (vs->x * mat->m[0][0]) + (vs->y * mat->m[1][0]) + (vs->z * mat->m[2][0]);
+    vd->y = (vs->x * mat->m[0][1]) + (vs->y * mat->m[1][1]) + (vs->z * mat->m[2][1]);
+    vd->z = (vs->x * mat->m[0][2]) + (vs->y * mat->m[1][2]) + (vs->z * mat->m[2][2]);
+#endif
+}
+
+
 /* Multiply list of given vectors with given matrix.
  */
 void dm_vector_mul_by_mat_n(DMVector *list, const int nlist, const DMMatrix *mat)
 {
     int i;
 
+    /* SIMD build: each list[i] is transformed in place by a single asm
+     * statement that loads the matrix rows itself. The rows are not
+     * preloaded by a separate asm statement before the loop, because
+     * the compiler does not guarantee that xmm register contents
+     * survive from one asm statement to the next.
+     * NOTE(review): 16-byte movups assumes 4-float vectors/rows - confirm.
+     */
     for (i = 0; i < nlist; i++)
     {
+#ifdef DM_USE_SIMD
+        asm volatile
+            (
+            // matrix rows -> xmm4..xmm7 (pointer passed in a register)
+            "movups        (%2),      %%xmm4\n"
+            "movups      16(%2),      %%xmm5\n"
+            "movups      32(%2),      %%xmm6\n"
+            "movups      48(%2),      %%xmm7\n"
+
+            // list[i] -> xmm0
+            "movups      %1,     %%xmm0\n"
+
+            // zero final result in xmm2
+            "xorps       %%xmm2, %%xmm2\n"
+
+            // perform shuffle and multiply and add whole "column" "X"
+            "movups      %%xmm0, %%xmm1\n"
+            "shufps      $0x00,  %%xmm1, %%xmm1\n"
+            "mulps       %%xmm4, %%xmm1\n"
+            "addps       %%xmm1, %%xmm2\n"
+
+            // Y
+            "movups      %%xmm0, %%xmm1\n"
+            "shufps      $0x55,  %%xmm1, %%xmm1\n"
+            "mulps       %%xmm5, %%xmm1\n"
+            "addps       %%xmm1, %%xmm2\n"
+
+            // Z
+            "movups      %%xmm0, %%xmm1\n"
+            "shufps      $0xAA,  %%xmm1, %%xmm1\n"
+            "mulps       %%xmm6, %%xmm1\n"
+            "addps       %%xmm1, %%xmm2\n"
+
+#if 0
+            // W
+            "movups      %%xmm0, %%xmm1\n"
+            "shufps      $0xFF,  %%xmm1, %%xmm1\n"
+            "mulps       %%xmm7, %%xmm1\n"
+            "addps       %%xmm1, %%xmm2\n"
+#endif
+            // Result -> list[i]
+            "movups      %%xmm2, %0\n"
+            : "=m" (list[i])
+            : "m" (list[i]), "r" (mat)
+            : "memory", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7");
+#else
         DMVector q;
         memcpy(&q, &list[i], sizeof(DMVector));
 
         list[i].x = (q.x * mat->m[0][0]) + (q.y * mat->m[1][0]) + (q.z * mat->m[2][0]);
         list[i].y = (q.x * mat->m[0][1]) + (q.y * mat->m[1][1]) + (q.z * mat->m[2][1]);
         list[i].z = (q.x * mat->m[0][2]) + (q.y * mat->m[1][2]) + (q.z * mat->m[2][2]);
+#endif
     }
 }
 
@@ -66,6 +210,19 @@
 }
 
 
+/* dst = mat1 * mat2, with each element summed over k = 0..2 only. */
+void dm_matrix_mul_r(DMMatrix *dst, const DMMatrix *mat1, const DMMatrix *mat2)
+{
+    int row, col;
+    for (row = 0; row < DM_MATRIX_SIZE; row++)
+    {
+        const DMFloat *a = mat1->m[row];
+        for (col = 0; col < DM_MATRIX_SIZE; col++)
+            dst->m[row][col] = (a[0] * mat2->m[0][col]) + (a[1] * mat2->m[1][col]) + (a[2] * mat2->m[2][col]);
+    }
+}
+
+
 /* Multiply given list of matrices (size of nMatrices units) with given matrix.
  */
 void dm_matrix_mul_n(DMMatrix * list, const int nlist, const DMMatrix *mat)
diff -r f07ddc4982c6 -r 159264c27929 dmvecmat.h
--- a/dmvecmat.h	Wed Oct 10 14:08:39 2012 +0300
+++ b/dmvecmat.h	Wed Oct 10 16:35:43 2012 +0300
@@ -26,12 +26,20 @@
     DMFloat m[DM_MATRIX_SIZE][DM_MATRIX_SIZE];
 } DMMatrix;
 
+
+void    dm_vector_add_n(DMVector *dst, const DMVector *src, const int nlist);
+void    dm_vector_add_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist);
+void    dm_vector_sub_n(DMVector *dst, const DMVector *src, const int nlist);
+void    dm_vector_sub_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist);
+
+void    dm_vector_mul_by_mat(DMVector *vd, const DMVector *vs, const DMMatrix *mat);
 void    dm_vector_mul_by_mat_n(DMVector *list, const int nlist, const DMMatrix *mat);
 
 void    dm_matrix_unit(DMMatrix *mat);
 void    dm_matrix_transpose(DMMatrix *mat1, const DMMatrix *mat2);
 
 void    dm_matrix_mul(DMMatrix *mat1, const DMMatrix *mat2);
+void    dm_matrix_mul_r(DMMatrix *dst, const DMMatrix *mat1, const DMMatrix *mat2);
 void    dm_matrix_mul_n(DMMatrix *list, const int nlist, const DMMatrix *mat);
 
 void    dm_matrix_rot(DMMatrix *mat,
@@ -39,14 +47,14 @@
         const DMFloat cx, const DMFloat cy, const DMFloat cz);
 
 
+/* Basic vector operations
+ */
 static inline void dm_vector_copy(DMVector *vd, const DMVector *vs)
 {
     memcpy(vd, vs, sizeof(DMVector));
 }
 
 
-/* Basic vector operations
- */
 static inline void dm_vector_add(DMVector *vr, const DMVector *v2)
 {
 #ifdef DM_USE_SIMD
@@ -171,16 +179,6 @@
 }
 
 
-/* Multiply given vector with a matrix
- */
-static inline void dm_vector_mul_by_mat(DMVector *vd, const DMVector *vs, const DMMatrix *mat)
-{
-    vd->x = (vs->x * mat->m[0][0]) + (vs->y * mat->m[1][0]) + (vs->z * mat->m[2][0]);
-    vd->y = (vs->x * mat->m[0][1]) + (vs->y * mat->m[1][1]) + (vs->z * mat->m[2][1]);
-    vd->z = (vs->x * mat->m[0][2]) + (vs->y * mat->m[1][2]) + (vs->z * mat->m[2][2]);
-}
-
-
 /* Make rotation matrix from given angles (radians)
  */
 static inline void dm_matrix_rot_a(DMMatrix *mat, const DMFloat ax, const DMFloat ay, const DMFloat az)