comparison src/dmvecmat.c @ 1306:696c58784635

Remove the SSE2 SIMD vector math paths; they apparently no longer work (segfault), possibly because of a 32-bit/64-bit issue.
author Matti Hamalainen <ccr@tnsp.org>
date Sat, 19 Aug 2017 13:02:34 +0300
parents 848a88ce7a57
children 69a5af2eb1ea
comparison
equal deleted inserted replaced
1305:582d49915059 1306:696c58784635
10 void dm_vector_add_n(DMVector *dst, const DMVector *src, const int nlist) 10 void dm_vector_add_n(DMVector *dst, const DMVector *src, const int nlist)
11 { 11 {
12 int i; 12 int i;
13 for (i = 0; i < nlist; i++) 13 for (i = 0; i < nlist; i++)
14 { 14 {
15 #ifdef DM_USE_SIMD
16 asm("movups %2, %%xmm1\n"
17 "movups %1, %%xmm2\n"
18 "addps %%xmm2, %%xmm1\n"
19 "movups %%xmm1, %0\n"
20 : "=m" (dst[i])
21 : "m" (dst[i]), "m" (src[i])
22 : "memory", "%xmm1", "%xmm2");
23 #else
24 dm_vector_add(dst + i, src + i); 15 dm_vector_add(dst + i, src + i);
25 #endif
26 } 16 }
27 } 17 }
28 18
29 19
30 void dm_vector_add_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist) 20 void dm_vector_add_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist)
31 { 21 {
32 int i; 22 int i;
33 for (i = 0; i < nlist; i++) 23 for (i = 0; i < nlist; i++)
34 { 24 {
35 #ifdef DM_USE_SIMD
36 asm("movups %2, %%xmm1\n"
37 "movups %1, %%xmm2\n"
38 "addps %%xmm2, %%xmm1\n"
39 "movups %%xmm1, %0\n"
40 : "=m" (dst[i])
41 : "m" (src1[i]), "m" (src2[i])
42 : "memory", "%xmm1", "%xmm2");
43 #else
44 dm_vector_add_r(dst + i, src1 + i, src2 + i); 25 dm_vector_add_r(dst + i, src1 + i, src2 + i);
45 #endif
46 } 26 }
47 } 27 }
48 28
49 29
50 void dm_vector_sub_n(DMVector *dst, const DMVector *src, const int nlist) 30 void dm_vector_sub_n(DMVector *dst, const DMVector *src, const int nlist)
51 { 31 {
52 int i; 32 int i;
53 for (i = 0; i < nlist; i++) 33 for (i = 0; i < nlist; i++)
54 { 34 {
55 #ifdef DM_USE_SIMD
56 asm("movups %2, %%xmm1\n"
57 "movups %1, %%xmm2\n"
58 "subps %%xmm2, %%xmm1\n"
59 "movups %%xmm1, %0\n"
60 : "=m" (dst[i])
61 : "m" (dst[i]), "m" (src[i])
62 : "memory", "%xmm1", "%xmm2");
63 #else
64 dm_vector_add(dst + i, src + i); 35 dm_vector_add(dst + i, src + i);
65 #endif
66 } 36 }
67 } 37 }
68 38
69 39
70 void dm_vector_sub_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist) 40 void dm_vector_sub_r_n(DMVector *dst, const DMVector *src1, const DMVector *src2, const int nlist)
71 { 41 {
72 int i; 42 int i;
73 for (i = 0; i < nlist; i++) 43 for (i = 0; i < nlist; i++)
74 { 44 {
75 #ifdef DM_USE_SIMD
76 asm("movups %2, %%xmm1\n"
77 "movups %1, %%xmm2\n"
78 "subps %%xmm2, %%xmm1\n"
79 "movups %%xmm1, %0\n"
80 : "=m" (dst[i])
81 : "m" (src1[i]), "m" (src2[i])
82 : "memory", "%xmm1", "%xmm2");
83 #else
84 dm_vector_sub_r(dst + i, src1 + i, src2 + i); 45 dm_vector_sub_r(dst + i, src1 + i, src2 + i);
85 #endif
86 } 46 }
87 } 47 }
88 48
89 49
90 /* Multiply given vector with a matrix 50 /* Multiply given vector with a matrix
91 */ 51 */
92 void dm_vector_mul_by_mat(DMVector *vd, const DMVector *vs, const DMMatrix *mat) 52 void dm_vector_mul_by_mat(DMVector *vd, const DMVector *vs, const DMMatrix *mat)
93 { 53 {
94 #ifdef DM_USE_SIMD
95 asm volatile(
96 "mov %1, %%edx\n"
97 "movups (%%edx), %%xmm4\n"
98 "movups 16(%%edx), %%xmm5\n"
99 "movups 32(%%edx), %%xmm6\n"
100 "movups 48(%%edx), %%xmm7\n"
101
102 // vector -> xmm0
103 "movups %2, %%xmm0\n"
104
105 // zero final result in xmm2
106 "xorps %%xmm2, %%xmm2\n"
107
108 // perform shuffle and multiply and add whole "column" "X"
109 "movups %%xmm0, %%xmm1\n"
110 "shufps $0x00, %%xmm1, %%xmm1\n"
111 "mulps %%xmm4, %%xmm1\n"
112 "addps %%xmm1, %%xmm2\n"
113
114 // Y
115 "movups %%xmm0, %%xmm1\n"
116 "shufps $0x55, %%xmm1, %%xmm1\n"
117 "mulps %%xmm5, %%xmm1\n"
118 "addps %%xmm1, %%xmm2\n"
119
120 // Z
121 "movups %%xmm0, %%xmm1\n"
122 "shufps $0xAA, %%xmm1, %%xmm1\n"
123 "mulps %%xmm6, %%xmm1\n"
124 "addps %%xmm1, %%xmm2\n"
125
126 // W
127 "movups %%xmm0, %%xmm1\n"
128 "shufps $0xFF, %%xmm1, %%xmm1\n"
129 "mulps %%xmm7, %%xmm1\n"
130 "addps %%xmm1, %%xmm2\n"
131
132 // Result ->
133 "movups %%xmm2, %0\n"
134 : "=m" (vd)
135 : "m" (mat), "m" (vs)
136 : "memory", "%edx", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
137 );
138 #else
139 vd->x = (vs->x * mat->m[0][0]) + (vs->y * mat->m[1][0]) + (vs->z * mat->m[2][0]); 54 vd->x = (vs->x * mat->m[0][0]) + (vs->y * mat->m[1][0]) + (vs->z * mat->m[2][0]);
140 vd->y = (vs->x * mat->m[0][1]) + (vs->y * mat->m[1][1]) + (vs->z * mat->m[2][1]); 55 vd->y = (vs->x * mat->m[0][1]) + (vs->y * mat->m[1][1]) + (vs->z * mat->m[2][1]);
141 vd->z = (vs->x * mat->m[0][2]) + (vs->y * mat->m[1][2]) + (vs->z * mat->m[2][2]); 56 vd->z = (vs->x * mat->m[0][2]) + (vs->y * mat->m[1][2]) + (vs->z * mat->m[2][2]);
142 #endif
143 } 57 }
144 58
145 59
146 /* Multiply list of given vectors with given matrix. 60 /* Multiply list of given vectors with given matrix.
147 */ 61 */
148 void dm_vector_mul_by_mat_n(DMVector *list, const int nlist, const DMMatrix *mat) 62 void dm_vector_mul_by_mat_n(DMVector *list, const int nlist, const DMMatrix *mat)
149 { 63 {
150 int i; 64 int i;
151 65
152 #ifdef DM_USE_SIMD
153 asm volatile(
154 "mov %0, %%edx\n"
155 "movups (%%edx), %%xmm4\n"
156 "movups 16(%%edx), %%xmm5\n"
157 "movups 32(%%edx), %%xmm6\n"
158 "movups 48(%%edx), %%xmm7\n"
159 :
160 : "m" (mat)
161 : "%edx", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
162 );
163 #endif
164
165 for (i = 0; i < nlist; i++) 66 for (i = 0; i < nlist; i++)
166 { 67 {
167 #ifdef DM_USE_SIMD
168 asm volatile
169 (
170 // list[i] -> xmm0
171 "movups %1, %%xmm0\n"
172
173 // zero final result in xmm2
174 "xorps %%xmm2, %%xmm2\n"
175
176 // perform shuffle and multiply and add whole "column" "X"
177 "movups %%xmm0, %%xmm1\n"
178 "shufps $0x00, %%xmm1, %%xmm1\n"
179 "mulps %%xmm4, %%xmm1\n"
180 "addps %%xmm1, %%xmm2\n"
181
182 // Y
183 "movups %%xmm0, %%xmm1\n"
184 "shufps $0x55, %%xmm1, %%xmm1\n"
185 "mulps %%xmm5, %%xmm1\n"
186 "addps %%xmm1, %%xmm2\n"
187
188 // Z
189 "movups %%xmm0, %%xmm1\n"
190 "shufps $0xAA, %%xmm1, %%xmm1\n"
191 "mulps %%xmm6, %%xmm1\n"
192 "addps %%xmm1, %%xmm2\n"
193
194 // W
195 "movups %%xmm0, %%xmm1\n"
196 "shufps $0xFF, %%xmm1, %%xmm1\n"
197 "mulps %%xmm7, %%xmm1\n"
198 "addps %%xmm1, %%xmm2\n"
199
200 // Result ->
201 "movups %%xmm2, %0\n"
202 : "=m" (list[i])
203 : "m" (list[i])
204 : "memory", "%xmm0", "%xmm1", "%xmm2", "%xmm4", "%xmm5", "%xmm6", "%xmm7");
205 #else
206 DMVector q; 68 DMVector q;
207 memcpy(&q, &list[i], sizeof(DMVector)); 69 memcpy(&q, &list[i], sizeof(DMVector));
208 70
209 list[i].x = (q.x * mat->m[0][0]) + (q.y * mat->m[1][0]) + (q.z * mat->m[2][0]); 71 list[i].x = (q.x * mat->m[0][0]) + (q.y * mat->m[1][0]) + (q.z * mat->m[2][0]);
210 list[i].y = (q.x * mat->m[0][1]) + (q.y * mat->m[1][1]) + (q.z * mat->m[2][1]); 72 list[i].y = (q.x * mat->m[0][1]) + (q.y * mat->m[1][1]) + (q.z * mat->m[2][1]);
211 list[i].z = (q.x * mat->m[0][2]) + (q.y * mat->m[1][2]) + (q.z * mat->m[2][2]); 73 list[i].z = (q.x * mat->m[0][2]) + (q.y * mat->m[1][2]) + (q.z * mat->m[2][2]);
212 #endif
213 } 74 }
214 } 75 }
215 76
216 77
217 /* Set matrix to unit-matrix 78 /* Set matrix to unit-matrix
240 101
241 /* Multiply matrices mat1 and mat2, putting result into mat1 102 /* Multiply matrices mat1 and mat2, putting result into mat1
242 */ 103 */
243 void dm_matrix_mul_r(DMMatrix *dst, const DMMatrix *mat1, const DMMatrix *mat2) 104 void dm_matrix_mul_r(DMMatrix *dst, const DMMatrix *mat1, const DMMatrix *mat2)
244 { 105 {
245 #ifdef DM_USE_SIMD
246 asm volatile(
247 "mov %1, %%ebx\n"
248 "mov %2, %%edx\n"
249
250 // --------------------------------------------------
251
252 // 0
253 "movups (%%ebx), %%xmm0\n" // mat1[0]
254 "movups (%%edx), %%xmm1\n" // mat2[0]
255 "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
256 "mulps %%xmm0, %%xmm1\n"
257 "movups %%xmm1, %%xmm3\n"
258
259 // 1
260 "movups 16(%%ebx), %%xmm0\n" // mat1[0]
261 "movups (%%edx), %%xmm1\n" // mat2[0]
262 "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
263 "mulps %%xmm0, %%xmm1\n"
264 "addps %%xmm1, %%xmm3\n"
265
266 // 2
267 "movups 32(%%ebx), %%xmm0\n" // mat1[0]
268 "movups (%%edx), %%xmm1\n" // mat2[0]
269 "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
270 "mulps %%xmm0, %%xmm1\n"
271 "addps %%xmm1, %%xmm3\n"
272
273 // 3
274 "movups 48(%%ebx), %%xmm0\n" // mat1[0]
275 "movups (%%edx), %%xmm1\n" // mat2[0]
276 "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
277 "mulps %%xmm0, %%xmm1\n"
278 "addps %%xmm1, %%xmm3\n"
279
280 "mov %0, %%ebx\n"
281 "movups %%xmm3, (%%ebx)\n"
282
283 // --------------------------------------------------
284
285 "mov %1, %%ebx\n"
286
287 // 0
288 "movups (%%ebx), %%xmm0\n" // mat1[0]
289 "movups 16(%%edx), %%xmm1\n" // mat2[1]
290 "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
291 "mulps %%xmm0, %%xmm1\n"
292 "movups %%xmm1, %%xmm3\n"
293
294 // 1
295 "movups 16(%%ebx), %%xmm0\n" // mat1[0]
296 "movups 16(%%edx), %%xmm1\n" // mat2[1]
297 "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
298 "mulps %%xmm0, %%xmm1\n"
299 "addps %%xmm1, %%xmm3\n"
300
301 // 2
302 "movups 32(%%ebx), %%xmm0\n" // mat1[0]
303 "movups 16(%%edx), %%xmm1\n" // mat2[1]
304 "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
305 "mulps %%xmm0, %%xmm1\n"
306 "addps %%xmm1, %%xmm3\n"
307
308 // 3
309 "movups 48(%%ebx), %%xmm0\n" // mat1[0]
310 "movups 16(%%edx), %%xmm1\n" // mat2[1]
311 "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
312 "mulps %%xmm0, %%xmm1\n"
313 "addps %%xmm1, %%xmm3\n"
314
315 "mov %0, %%ebx\n"
316 "movups %%xmm3, 16(%%ebx)\n"
317
318 // --------------------------------------------------
319
320 "mov %1, %%ebx\n"
321
322 // 0
323 "movups (%%ebx), %%xmm0\n" // mat1[0]
324 "movups 32(%%edx), %%xmm1\n" // mat2[1]
325 "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
326 "mulps %%xmm0, %%xmm1\n"
327 "movups %%xmm1, %%xmm3\n"
328
329 // 1
330 "movups 16(%%ebx), %%xmm0\n" // mat1[0]
331 "movups 32(%%edx), %%xmm1\n" // mat2[1]
332 "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
333 "mulps %%xmm0, %%xmm1\n"
334 "addps %%xmm1, %%xmm3\n"
335
336 // 2
337 "movups 32(%%ebx), %%xmm0\n" // mat1[0]
338 "movups 32(%%edx), %%xmm1\n" // mat2[1]
339 "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
340 "mulps %%xmm0, %%xmm1\n"
341 "addps %%xmm1, %%xmm3\n"
342
343 // 3
344 "movups 48(%%ebx), %%xmm0\n" // mat1[0]
345 "movups 32(%%edx), %%xmm1\n" // mat2[1]
346 "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
347 "mulps %%xmm0, %%xmm1\n"
348 "addps %%xmm1, %%xmm3\n"
349
350 "mov %0, %%ebx\n"
351 "movups %%xmm3, 32(%%ebx)\n"
352
353 // --------------------------------------------------
354
355 "mov %1, %%ebx\n"
356
357 // 0
358 "movups (%%ebx), %%xmm0\n" // mat1[0]
359 "movups 48(%%edx), %%xmm1\n" // mat2[1]
360 "shufps $0x00, %%xmm1, %%xmm1\n" // mat2[0][0]
361 "mulps %%xmm0, %%xmm1\n"
362 "movups %%xmm1, %%xmm3\n"
363
364 // 1
365 "movups 16(%%ebx), %%xmm0\n" // mat1[0]
366 "movups 48(%%edx), %%xmm1\n" // mat2[1]
367 "shufps $0x55, %%xmm1, %%xmm1\n" // mat2[0][1]
368 "mulps %%xmm0, %%xmm1\n"
369 "addps %%xmm1, %%xmm3\n"
370
371 // 2
372 "movups 32(%%ebx), %%xmm0\n" // mat1[0]
373 "movups 48(%%edx), %%xmm1\n" // mat2[1]
374 "shufps $0xAA, %%xmm1, %%xmm1\n" // mat2[0][2]
375 "mulps %%xmm0, %%xmm1\n"
376 "addps %%xmm1, %%xmm3\n"
377
378 // 3
379 "movups 48(%%ebx), %%xmm0\n" // mat1[0]
380 "movups 48(%%edx), %%xmm1\n" // mat2[1]
381 "shufps $0xFF, %%xmm1, %%xmm1\n" // mat2[0][3]
382 "mulps %%xmm0, %%xmm1\n"
383 "addps %%xmm1, %%xmm3\n"
384
385 "mov %0, %%ebx\n"
386 "movups %%xmm3, 48(%%ebx)\n"
387
388 : "=m" (dst)
389 : "m" (mat1), "m" (mat2)
390 : "memory", "%edx", "%ebx", "%xmm0", "%xmm2", "%xmm3"
391 );
392 #else
393 int i, j; 106 int i, j;
394 for (i = 0; i < DM_MATRIX_SIZE; i++) 107 for (i = 0; i < DM_MATRIX_SIZE; i++)
395 for (j = 0; j < DM_MATRIX_SIZE; j++) 108 for (j = 0; j < DM_MATRIX_SIZE; j++)
396 dst->m[i][j] = 109 dst->m[i][j] =
397 (mat1->m[i][0] * mat2->m[0][j]) + 110 (mat1->m[i][0] * mat2->m[0][j]) +
398 (mat1->m[i][1] * mat2->m[1][j]) + 111 (mat1->m[i][1] * mat2->m[1][j]) +
399 (mat1->m[i][2] * mat2->m[2][j]); 112 (mat1->m[i][2] * mat2->m[2][j]);
400 #endif
401 } 113 }
402 114
403 115
404 void dm_matrix_mul(DMMatrix *mat1, const DMMatrix *mat2) 116 void dm_matrix_mul(DMMatrix *mat1, const DMMatrix *mat2)
405 { 117 {
450 162
451 163
452 mat->m[2][0] = (sy * l) + i; 164 mat->m[2][0] = (sy * l) + i;
453 mat->m[2][1] = (sy * q) - j; 165 mat->m[2][1] = (sy * q) - j;
454 mat->m[2][2] = cx * cy; 166 mat->m[2][2] = cx * cy;
455
456 } 167 }