Mercurial > hg > dmlib
changeset 270:89a05a5e7a82
Add (untested) SSE asm version of matrix product (matrix x matrix multiplication).
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 10 Oct 2012 19:31:09 +0300 |
parents | 159264c27929 |
children | 6d9a3577741b |
files | dmvecmat.c |
diffstat | 1 files changed, 159 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/dmvecmat.c Wed Oct 10 16:35:43 2012 +0300 +++ b/dmvecmat.c Wed Oct 10 19:31:09 2012 +0300 @@ -87,7 +87,7 @@ "movups %%xmm2, %0\n" : "=m" (vd) : "m" (mat), "m" (vs) - : "memory", "%edx", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + : "memory", "%edx", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7" ); #else vd->x = (vs->x * mat->m[0][0]) + (vs->y * mat->m[1][0]) + (vs->z * mat->m[2][0]); @@ -156,7 +156,7 @@ "movups %%xmm2, %0\n" : "=m" (list[i]) : "m" (list[i]) - : "memory", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7"); + : "memory", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7"); #else DMVector q; memcpy(&q, &list[i], sizeof(DMVector)); @@ -194,32 +194,172 @@ /* Multiply matrices mat1 and mat2, putting result into mat1 */ -void dm_matrix_mul(DMMatrix *mat1, const DMMatrix *mat2) -{ - int i, j; - DMMatrix tmpM; - - for (i = 0; i < DM_MATRIX_SIZE; i++) - for (j = 0; j < DM_MATRIX_SIZE; j++) - tmpM.m[i][j] = - (mat1->m[i][0] * mat2->m[0][j]) + - (mat1->m[i][1] * mat2->m[1][j]) + - (mat1->m[i][2] * mat2->m[2][j]); - - memcpy(mat1, &tmpM, sizeof(DMMatrix)); -} - - void dm_matrix_mul_r(DMMatrix *dst, const DMMatrix *mat1, const DMMatrix *mat2) { +#ifdef DM_USE_SIMD + asm volatile( + "mov %1, %%ebx\n" + "mov %2, %%edx\n" + + // -------------------------------------------------- + + // 0 + "movups (%%ebx), %%xmm0\n" // mat1[0] + "movups (%%edx), %%xmm1\n" // mat2[0] + "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0] + "mulps %%xmm0, %%xmm1\n" + "movups %%xmm1, %%xmm3\n" + + // 1 + "movups 16(%%ebx), %%xmm0\n" // mat1[0] + "movups (%%edx), %%xmm1\n" // mat2[0] + "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 2 + "movups 32(%%ebx), %%xmm0\n" // mat1[0] + "movups (%%edx), %%xmm1\n" // mat2[0] + "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 3 + "movups 48(%%ebx), %%xmm0\n" // mat1[0] + "movups (%%edx), %%xmm1\n" // mat2[0] + "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + "mov %0, %%ebx\n" + "movups %%xmm3, (%%ebx)\n" + + // -------------------------------------------------- + + "mov %1, %%ebx\n" + + // 0 + "movups (%%ebx), %%xmm0\n" // mat1[0] + "movups 16(%%edx), %%xmm1\n" // mat2[1] + "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0] + "mulps %%xmm0, %%xmm1\n" + "movups %%xmm1, %%xmm3\n" + + // 1 + "movups 16(%%ebx), %%xmm0\n" // mat1[0] + "movups 16(%%edx), %%xmm1\n" // mat2[1] + "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 2 + "movups 32(%%ebx), %%xmm0\n" // mat1[0] + "movups 16(%%edx), %%xmm1\n" // mat2[1] + "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 3 + "movups 48(%%ebx), %%xmm0\n" // mat1[0] + "movups 16(%%edx), %%xmm1\n" // mat2[1] + "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + "mov %0, %%ebx\n" + "movups %%xmm3, 16(%%ebx)\n" + + // -------------------------------------------------- + + "mov %1, %%ebx\n" + + // 0 + "movups (%%ebx), %%xmm0\n" // mat1[0] + "movups 32(%%edx), %%xmm1\n" // mat2[1] + "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0] + "mulps %%xmm0, %%xmm1\n" + "movups %%xmm1, %%xmm3\n" + + // 1 + "movups 16(%%ebx), %%xmm0\n" // mat1[0] + "movups 32(%%edx), %%xmm1\n" // mat2[1] + "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 2 + "movups 32(%%ebx), %%xmm0\n" // mat1[0] + "movups 32(%%edx), %%xmm1\n" // mat2[1] + "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 3 + "movups 48(%%ebx), %%xmm0\n" // mat1[0] + "movups 32(%%edx), %%xmm1\n" // mat2[1] + "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + "mov %0, %%ebx\n" + "movups %%xmm3, 32(%%ebx)\n" + + // -------------------------------------------------- + + "mov %1, %%ebx\n" + + // 0 + "movups (%%ebx), %%xmm0\n" // mat1[0] + "movups 48(%%edx), %%xmm1\n" // mat2[1] + "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0] + "mulps %%xmm0, %%xmm1\n" + "movups %%xmm1, %%xmm3\n" + + // 1 + "movups 16(%%ebx), %%xmm0\n" // mat1[0] + "movups 48(%%edx), %%xmm1\n" // mat2[1] + "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 2 + "movups 32(%%ebx), %%xmm0\n" // mat1[0] + "movups 48(%%edx), %%xmm1\n" // mat2[1] + "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + // 3 + "movups 48(%%ebx), %%xmm0\n" // mat1[0] + "movups 48(%%edx), %%xmm1\n" // mat2[1] + "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3] + "mulps %%xmm0, %%xmm1\n" + "addps %%xmm1, %%xmm3\n" + + "mov %0, %%ebx\n" + "movups %%xmm3, 48(%%ebx)\n" + + : "=m" (dst) + : "m" (mat1), "m" (mat2) + : "memory", "%edx", "%ebx", "%xmm0", "%xmm2", "%xmm3" + ); +#else int i, j; - for (i = 0; i < DM_MATRIX_SIZE; i++) for (j = 0; j < DM_MATRIX_SIZE; j++) dst->m[i][j] = (mat1->m[i][0] * mat2->m[0][j]) + (mat1->m[i][1] * mat2->m[1][j]) + (mat1->m[i][2] * mat2->m[2][j]); +#endif +} + + +void dm_matrix_mul(DMMatrix *mat1, const DMMatrix *mat2) +{ + DMMatrix tmpM; + dm_matrix_mul_r(&tmpM, mat1, mat2); + memcpy(mat1, &tmpM, sizeof(DMMatrix)); }