Mercurial > hg > forks > libbpg
comparison x265/source/common/x86/pixeladd8.asm @ 0:772086c29cc7
Initial import.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 16 Nov 2016 11:16:33 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:772086c29cc7 |
---|---|
1 ;***************************************************************************** | |
2 ;* Copyright (C) 2013 x265 project | |
3 ;* | |
4 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com> | |
5 ;* | |
6 ;* This program is free software; you can redistribute it and/or modify | |
7 ;* it under the terms of the GNU General Public License as published by | |
8 ;* the Free Software Foundation; either version 2 of the License, or | |
9 ;* (at your option) any later version. | |
10 ;* | |
11 ;* This program is distributed in the hope that it will be useful, | |
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 ;* GNU General Public License for more details. | |
15 ;* | |
16 ;* You should have received a copy of the GNU General Public License | |
17 ;* along with this program; if not, write to the Free Software | |
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
19 ;* | |
20 ;* This program is also available under a commercial proprietary license. | |
21 ;* For more information, contact us at license @ x265.com. | |
22 ;*****************************************************************************/ | |
23 | |
24 %include "x86inc.asm" | |
25 %include "x86util.asm" | |
26 | |
27 SECTION_RODATA 32 | |
28 | |
29 SECTION .text | |
30 | |
31 cextern pw_pixel_max | |
32 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest = clip(src0 + scr1) for one 4x4 block: adds a 16-bit residual (scr1)
; to reconstructed pixels (src0) and clamps to the valid pixel range.
; NOTE(review): "scr1" is upstream's typo for "src1"; kept as-is since it is
; a code identifier, not a comment.
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m1, [pw_pixel_max]      ; m1 = upper clamp bound per word lane
    pxor        m0, m0                  ; m0 = 0, lower clamp bound
    add         r4, r4                  ; element strides -> byte strides
    add         r5, r5                  ; (2 bytes per element in HBD)
    add         r1, r1
    movh        m2, [r2]                ; m2 = src0 rows 0..1 (4 words each)
    movhps      m2, [r2 + r4]
    movh        m3, [r3]                ; m3 = scr1 rows 0..1
    movhps      m3, [r3 + r5]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]
    movh        m4, [r2]                ; m4 = src0 rows 2..3
    movhps      m4, [r2 + r4]
    movh        m5, [r3]                ; m5 = scr1 rows 2..3
    movhps      m5, [r3 + r5]

    paddw       m2, m3
    paddw       m4, m5
    CLIPW2      m2, m4, m0, m1          ; clamp both sums to [0, pixel_max]

    movh        [r0], m2
    movhps      [r0 + r1], m2
    lea         r0, [r0 + r1 * 2]
    movh        [r0], m4
    movhps      [r0 + r1], m4

    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    add         r5, r5                  ; scr1 stride is in int16 units -> bytes
    pmovzxbw    m0, [r2]                ; zero-extend 8-bit src0 rows to words
    pmovzxbw    m2, [r2 + r4]
    movh        m1, [r3]                ; 16-bit residual rows
    movh        m3, [r3 + r5]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]
    pmovzxbw    m4, [r2]
    pmovzxbw    m6, [r2 + r4]
    movh        m5, [r3]
    movh        m7, [r3 + r5]

    paddw       m0, m1
    paddw       m2, m3
    paddw       m4, m5
    paddw       m6, m7
    packuswb    m0, m0                  ; unsigned saturating pack = clamp to [0,255]
    packuswb    m2, m2
    packuswb    m4, m4
    packuswb    m6, m6

    movd        [r0], m0                ; 4 bytes per destination row
    movd        [r0 + r1], m2
    lea         r0, [r0 + r1 * 2]
    movd        [r0], m4
    movd        [r0 + r1], m6

    RET
%endif
98 | |
99 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-4 variant parameterized on height (%2, must be a multiple of 4).
; Same operation as pixel_add_ps_4x4 but loops %2/4 times, processing four
; rows per iteration. r6d holds the remaining iteration count.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W4_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m1, [pw_pixel_max]      ; clamp bounds, constant across the loop
    pxor        m0, m0
    mov         r6d, %2/4               ; iterations = height / 4
    add         r4, r4                  ; element strides -> byte strides
    add         r5, r5
    add         r1, r1
.loop:
    movh        m2, [r2]                ; rows 0..1 of this group
    movhps      m2, [r2 + r4]
    movh        m3, [r3]
    movhps      m3, [r3 + r5]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]
    movh        m4, [r2]                ; rows 2..3
    movhps      m4, [r2 + r4]
    movh        m5, [r3]
    movhps      m5, [r3 + r5]
    dec         r6d                     ; count down early; jnz below uses ZF
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m2, m3
    paddw       m4, m5
    CLIPW2      m2, m4, m0, m1          ; clamp to [0, pixel_max]

    movh        [r0], m2
    movhps      [r0 + r1], m2
    lea         r0, [r0 + r1 * 2]
    movh        [r0], m4
    movhps      [r0 + r1], m4
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, %2/4               ; iterations = height / 4
    add         r5, r5                  ; residual stride int16 units -> bytes
.loop:
    pmovzxbw    m0, [r2]                ; u8 -> u16 widen, rows 0..1
    pmovzxbw    m2, [r2 + r4]
    movh        m1, [r3]
    movh        m3, [r3 + r5]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]
    pmovzxbw    m4, [r2]                ; rows 2..3
    pmovzxbw    m6, [r2 + r4]
    movh        m5, [r3]
    movh        m7, [r3 + r5]
    dec         r6d
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    paddw       m4, m5
    paddw       m6, m7
    packuswb    m0, m0                  ; saturating pack clamps to [0,255]
    packuswb    m2, m2
    packuswb    m4, m4
    packuswb    m6, m6

    movd        [r0], m0
    movd        [r0 + r1], m2
    lea         r0, [r0 + r1 * 2]
    movd        [r0], m4
    movd        [r0 + r1], m6
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W4_H4 4, 8
183 | |
184 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-8 variant parameterized on height (%2, multiple of 4). One full row
; fits a single xmm register of words, so each loop iteration handles four
; rows as two 2-row passes (HBD) or four widened rows (LBD).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W8_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m5, [pw_pixel_max]      ; clamp bounds live in m4/m5 here
    pxor        m4, m4
    mov         r6d, %2/4               ; iterations = height / 4
    add         r4, r4                  ; element strides -> byte strides
    add         r5, r5
    add         r1, r1
.loop:
    ; --- rows 0..1 of this 4-row group ---
    movu        m0, [r2]
    movu        m2, [r2 + r4]
    movu        m1, [r3]
    movu        m3, [r3 + r5]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu        [r0], m0
    movu        [r0 + r1], m2

    ; --- rows 2..3 ---
    movu        m0, [r2]
    movu        m2, [r2 + r4]
    movu        m1, [r3]
    movu        m3, [r3 + r5]
    dec         r6d                     ; count down; jnz below uses ZF
    lea         r0, [r0 + r1 * 2]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0], m0
    movu        [r0 + r1], m2
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, %2/4               ; iterations = height / 4
    add         r5, r5                  ; residual stride int16 units -> bytes
.loop:
    pmovzxbw    m0, [r2]                ; widen 8 pixels of rows 0..1
    pmovzxbw    m2, [r2 + r4]
    movu        m1, [r3]
    movu        m3, [r3 + r5]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]
    pmovzxbw    m4, [r2]                ; rows 2..3
    pmovzxbw    m6, [r2 + r4]
    movu        m5, [r3]
    movu        m7, [r3 + r5]
    dec         r6d
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    paddw       m4, m5
    paddw       m6, m7
    packuswb    m0, m0                  ; saturating pack clamps to [0,255]
    packuswb    m2, m2
    packuswb    m4, m4
    packuswb    m6, m6

    movh        [r0], m0                ; 8 bytes per destination row
    movh        [r0 + r1], m2
    lea         r0, [r0 + r1 * 2]
    movh        [r0], m4
    movh        [r0 + r1], m6
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W8_H4 8, 8
PIXEL_ADD_PS_W8_H4 8, 16
275 | |
276 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-16 variant parameterized on height (%2, multiple of 4). In HBD one
; row needs two xmm loads (2 x 8 words), so each loop iteration does four
; rows of two halves each. In LBD a row of 16 pixels widens into two xmm
; registers, and two rows are processed per half-iteration.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W16_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m5, [pw_pixel_max]      ; clamp bounds in m4/m5
    pxor        m4, m4
    mov         r6d, %2/4               ; iterations = height / 4
    add         r4, r4                  ; element strides -> byte strides
    add         r5, r5
    add         r1, r1
.loop:
    ; --- row 0: both 8-word halves ---
    movu        m0, [r2]
    movu        m2, [r2 + 16]
    movu        m1, [r3]
    movu        m3, [r3 + 16]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu        [r0], m0
    movu        [r0 + 16], m2

    ; --- row 1 ---
    movu        m0, [r2 + r4]
    movu        m2, [r2 + r4 + 16]
    movu        m1, [r3 + r5]
    movu        m3, [r3 + r5 + 16]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m2

    ; --- row 2 (sources already advanced) ---
    movu        m0, [r2]
    movu        m2, [r2 + 16]
    movu        m1, [r3]
    movu        m3, [r3 + 16]
    lea         r0, [r0 + r1 * 2]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0], m0
    movu        [r0 + 16], m2

    ; --- row 3 ---
    movu        m0, [r2 + r4]
    movu        m2, [r2 + r4 + 16]
    movu        m1, [r3 + r5]
    movu        m3, [r3 + r5 + 16]
    dec         r6d                     ; count down; jnz below uses ZF
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m2
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, %2/4               ; iterations = height / 4
    add         r5, r5                  ; residual stride int16 units -> bytes
.loop:
    ; --- rows 0..1: each row widens to two xmm word vectors ---
    pmovzxbw    m0, [r2]
    pmovzxbw    m1, [r2 + 8]
    pmovzxbw    m4, [r2 + r4]
    pmovzxbw    m5, [r2 + r4 + 8]
    movu        m2, [r3]
    movu        m3, [r3 + 16]
    movu        m6, [r3 + r5]
    movu        m7, [r3 + r5 + 16]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m2
    paddw       m1, m3
    paddw       m4, m6
    paddw       m5, m7
    packuswb    m0, m1                  ; pack row back to 16 bytes, clamped
    packuswb    m4, m5

    movu        [r0], m0
    movu        [r0 + r1], m4

    ; --- rows 2..3 ---
    pmovzxbw    m0, [r2]
    pmovzxbw    m1, [r2 + 8]
    pmovzxbw    m4, [r2 + r4]
    pmovzxbw    m5, [r2 + r4 + 8]
    movu        m2, [r3]
    movu        m3, [r3 + 16]
    movu        m6, [r3 + r5]
    movu        m7, [r3 + r5 + 16]
    dec         r6d
    lea         r0, [r0 + r1 * 2]
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m2
    paddw       m1, m3
    paddw       m4, m6
    paddw       m5, m7
    packuswb    m0, m1
    packuswb    m4, m5

    movu        [r0], m0
    movu        [r0 + r1], m4
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro
PIXEL_ADD_PS_W16_H4 16, 16
PIXEL_ADD_PS_W16_H4 16, 32
405 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_16x%1(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; AVX2 width-16 variant, height %1 (multiple of 4). In HBD one ymm holds a
; whole 16-word row; x86-64 only because r7..r9 cache 3x strides. In LBD,
; pmovzxbw widens a full 16-byte row into one ymm of words.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W16_H4_avx2 1
%if HIGH_BIT_DEPTH
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m3, [pw_pixel_max]      ; clamp bounds in m2/m3
    pxor        m2, m2
    mov         r6d, %1/4               ; iterations = height / 4
    add         r4d, r4d                ; element strides -> byte strides
    add         r5d, r5d
    add         r1d, r1d
    lea         r7, [r4 * 3]            ; precompute 3*stride for row 3
    lea         r8, [r5 * 3]            ; addressing without extra leas
    lea         r9, [r1 * 3]

.loop:
    ; row 0
    movu        m0, [r2]
    movu        m1, [r3]
    paddw       m0, m1
    CLIPW       m0, m2, m3              ; clamp to [0, pixel_max]
    movu        [r0], m0

    ; row 1
    movu        m0, [r2 + r4]
    movu        m1, [r3 + r5]
    paddw       m0, m1
    CLIPW       m0, m2, m3
    movu        [r0 + r1], m0

    ; row 2
    movu        m0, [r2 + r4 * 2]
    movu        m1, [r3 + r5 * 2]
    paddw       m0, m1
    CLIPW       m0, m2, m3
    movu        [r0 + r1 * 2], m0

    ; row 3 (via precomputed 3*stride)
    movu        m0, [r2 + r7]
    movu        m1, [r3 + r8]
    paddw       m0, m1
    CLIPW       m0, m2, m3
    movu        [r0 + r9], m0

    dec         r6d
    lea         r0, [r0 + r1 * 4]       ; advance all pointers by 4 rows
    lea         r2, [r2 + r4 * 4]
    lea         r3, [r3 + r5 * 4]
    jnz .loop
    RET
%endif
%else
INIT_YMM avx2
cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, %1/4               ; iterations = height / 4
    add         r5, r5                  ; residual stride int16 units -> bytes
.loop:

    pmovzxbw    m0, [r2]                ; row 0 of src0
    pmovzxbw    m1, [r2 + r4]           ; row 1 of src0
    movu        m2, [r3]                ; row 0 of src1
    movu        m3, [r3 + r5]           ; row 1 of src1
    paddw       m0, m2
    paddw       m1, m3
    packuswb    m0, m1                  ; rows 0+1 packed, but lane-interleaved

    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    pmovzxbw    m2, [r2]                ; row 2 of src0
    pmovzxbw    m3, [r2 + r4]           ; row 3 of src0
    movu        m4, [r3]                ; row 2 of src1
    movu        m5, [r3 + r5]           ; row 3 of src1
    paddw       m2, m4
    paddw       m3, m5
    packuswb    m2, m3

    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    vpermq      m0, m0, 11011000b       ; undo packuswb's 128-bit lane interleave
    movu        [r0], xm0               ; row 0 of dst
    vextracti128 xm3, m0, 1
    movu        [r0 + r1], xm3          ; row 1 of dst

    lea         r0, [r0 + r1 * 2]
    vpermq      m2, m2, 11011000b
    movu        [r0], xm2               ; row 2 of dst
    vextracti128 xm3, m2, 1
    movu        [r0 + r1], xm3          ; row 3 of dst

    lea         r0, [r0 + r1 * 2]

    dec         r6d
    jnz .loop

    RET
%endif
%endmacro

PIXEL_ADD_PS_W16_H4_avx2 16
PIXEL_ADD_PS_W16_H4_avx2 32
507 | |
508 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-32 variant, height %2 (multiple of 2). Two rows per loop iteration;
; in HBD each row takes four 16-byte halves (2 x CLIPW2 per row), in LBD
; each row widens into four xmm word vectors and packs back to 32 bytes.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W32_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m5, [pw_pixel_max]      ; clamp bounds in m4/m5
    pxor        m4, m4
    mov         r6d, %2/2               ; iterations = height / 2
    add         r4, r4                  ; element strides -> byte strides
    add         r5, r5
    add         r1, r1
.loop:
    ; --- row 0, words 0..15 ---
    movu        m0, [r2]
    movu        m2, [r2 + 16]
    movu        m1, [r3]
    movu        m3, [r3 + 16]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu        [r0], m0
    movu        [r0 + 16], m2

    ; --- row 0, words 16..31 ---
    movu        m0, [r2 + 32]
    movu        m2, [r2 + 48]
    movu        m1, [r3 + 32]
    movu        m3, [r3 + 48]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + 32], m0
    movu        [r0 + 48], m2

    ; --- row 1, words 0..15 ---
    movu        m0, [r2 + r4]
    movu        m2, [r2 + r4 + 16]
    movu        m1, [r3 + r5]
    movu        m3, [r3 + r5 + 16]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m2

    ; --- row 1, words 16..31 ---
    movu        m0, [r2 + r4 + 32]
    movu        m2, [r2 + r4 + 48]
    movu        m1, [r3 + r5 + 32]
    movu        m3, [r3 + r5 + 48]
    dec         r6d                     ; count down; jnz below uses ZF
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1 + 32], m0
    movu        [r0 + r1 + 48], m2
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, %2/2               ; iterations = height / 2
    add         r5, r5                  ; residual stride int16 units -> bytes
.loop:
    ; --- row 0: four widened 8-pixel groups ---
    pmovzxbw    m0, [r2]
    pmovzxbw    m1, [r2 + 8]
    pmovzxbw    m2, [r2 + 16]
    pmovzxbw    m3, [r2 + 24]
    movu        m4, [r3]
    movu        m5, [r3 + 16]
    movu        m6, [r3 + 32]
    movu        m7, [r3 + 48]

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1                  ; saturating pack clamps to [0,255]
    packuswb    m2, m3

    movu        [r0], m0
    movu        [r0 + 16], m2

    ; --- row 1 ---
    pmovzxbw    m0, [r2 + r4]
    pmovzxbw    m1, [r2 + r4 + 8]
    pmovzxbw    m2, [r2 + r4 + 16]
    pmovzxbw    m3, [r2 + r4 + 24]
    movu        m4, [r3 + r5]
    movu        m5, [r3 + r5 + 16]
    movu        m6, [r3 + r5 + 32]
    movu        m7, [r3 + r5 + 48]
    dec         r6d
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1
    packuswb    m2, m3

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m2
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro
PIXEL_ADD_PS_W32_H2 32, 32
PIXEL_ADD_PS_W32_H2 32, 64
631 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_32x%1(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; AVX2 width-32 variant, height %1 (multiple of 4). Four rows per loop
; iteration; x86-64 only because r7..r9 cache 3x strides. In HBD each row
; takes two ymm halves clamped via CLIPW2; in LBD each row widens into two
; ymm word vectors and packs back to 32 bytes (vpermq fixes the 128-bit
; lane interleave produced by packuswb).
; Fix vs. upstream: removed a stray doubled '+' in the row-2 src1 address
; ("[r3 + + r5 * 2 + 32]" -> "[r3 + r5 * 2 + 32]"); NASM parsed the extra
; unary plus harmlessly, but it was a typo.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W32_H4_avx2 1
%if HIGH_BIT_DEPTH
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m5, [pw_pixel_max]      ; clamp bounds in m4/m5
    pxor        m4, m4
    mov         r6d, %1/4               ; iterations = height / 4
    add         r4d, r4d                ; element strides -> byte strides
    add         r5d, r5d
    add         r1d, r1d
    lea         r7, [r4 * 3]            ; precompute 3*stride for row 3
    lea         r8, [r5 * 3]
    lea         r9, [r1 * 3]

.loop:
    ; row 0
    movu        m0, [r2]
    movu        m2, [r2 + 32]
    movu        m1, [r3]
    movu        m3, [r3 + 32]
    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu        [r0], m0
    movu        [r0 + 32], m2

    ; row 1
    movu        m0, [r2 + r4]
    movu        m2, [r2 + r4 + 32]
    movu        m1, [r3 + r5]
    movu        m3, [r3 + r5 + 32]
    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 32], m2

    ; row 2
    movu        m0, [r2 + r4 * 2]
    movu        m2, [r2 + r4 * 2 + 32]
    movu        m1, [r3 + r5 * 2]
    movu        m3, [r3 + r5 * 2 + 32]
    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 2 + 32], m2

    ; row 3 (via precomputed 3*stride)
    movu        m0, [r2 + r7]
    movu        m2, [r2 + r7 + 32]
    movu        m1, [r3 + r8]
    movu        m3, [r3 + r8 + 32]
    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r9], m0
    movu        [r0 + r9 + 32], m2

    dec         r6d
    lea         r0, [r0 + r1 * 4]       ; advance all pointers by 4 rows
    lea         r2, [r2 + r4 * 4]
    lea         r3, [r3 + r5 * 4]
    jnz .loop
    RET
%endif
%else
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, %1/4               ; iterations = height / 4
    add         r5, r5                  ; residual stride int16 units -> bytes
    lea         r7, [r4 * 3]            ; 3x strides for row 3 addressing
    lea         r8, [r5 * 3]
    lea         r9, [r1 * 3]
.loop:
    pmovzxbw    m0, [r2]                ; first half of row 0 of src0
    pmovzxbw    m1, [r2 + 16]           ; second half of row 0 of src0
    movu        m2, [r3]                ; first half of row 0 of src1
    movu        m3, [r3 + 32]           ; second half of row 0 of src1

    paddw       m0, m2
    paddw       m1, m3
    packuswb    m0, m1
    vpermq      m0, m0, 11011000b       ; undo packuswb's lane interleave
    movu        [r0], m0                ; row 0 of dst

    pmovzxbw    m0, [r2 + r4]           ; first half of row 1 of src0
    pmovzxbw    m1, [r2 + r4 + 16]      ; second half of row 1 of src0
    movu        m2, [r3 + r5]           ; first half of row 1 of src1
    movu        m3, [r3 + r5 + 32]      ; second half of row 1 of src1

    paddw       m0, m2
    paddw       m1, m3
    packuswb    m0, m1
    vpermq      m0, m0, 11011000b
    movu        [r0 + r1], m0           ; row 1 of dst

    pmovzxbw    m0, [r2 + r4 * 2]       ; first half of row 2 of src0
    pmovzxbw    m1, [r2 + r4 * 2 + 16]  ; second half of row 2 of src0
    movu        m2, [r3 + r5 * 2]       ; first half of row 2 of src1
    movu        m3, [r3 + r5 * 2 + 32]  ; second half of row 2 of src1

    paddw       m0, m2
    paddw       m1, m3
    packuswb    m0, m1
    vpermq      m0, m0, 11011000b
    movu        [r0 + r1 * 2], m0       ; row 2 of dst

    pmovzxbw    m0, [r2 + r7]           ; first half of row 3 of src0
    pmovzxbw    m1, [r2 + r7 + 16]      ; second half of row 3 of src0
    movu        m2, [r3 + r8]           ; first half of row 3 of src1
    movu        m3, [r3 + r8 + 32]      ; second half of row 3 of src1

    paddw       m0, m2
    paddw       m1, m3
    packuswb    m0, m1
    vpermq      m0, m0, 11011000b
    movu        [r0 + r9], m0           ; row 3 of dst

    lea         r2, [r2 + r4 * 4]       ; advance all pointers by 4 rows
    lea         r3, [r3 + r5 * 4]
    lea         r0, [r0 + r1 * 4]

    dec         r6d
    jnz .loop
    RET
%endif
%endif
%endmacro

PIXEL_ADD_PS_W32_H4_avx2 32
PIXEL_ADD_PS_W32_H4_avx2 64
769 | |
770 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-64 variant, height %2 (multiple of 2). Two rows per loop iteration;
; in HBD each row is eight 16-byte chunks (four CLIPW2 passes per row), in
; LBD each row widens into eight xmm word vectors across two passes.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W64_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m5, [pw_pixel_max]      ; clamp bounds in m4/m5
    pxor        m4, m4
    mov         r6d, %2/2               ; iterations = height / 2
    add         r4, r4                  ; element strides -> byte strides
    add         r5, r5
    add         r1, r1
.loop:
    ; --- row 0, words 0..15 ---
    movu        m0, [r2]
    movu        m2, [r2 + 16]
    movu        m1, [r3]
    movu        m3, [r3 + 16]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu        [r0], m0
    movu        [r0 + 16], m2

    ; --- row 0, words 16..31 ---
    movu        m0, [r2 + 32]
    movu        m2, [r2 + 48]
    movu        m1, [r3 + 32]
    movu        m3, [r3 + 48]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + 32], m0
    movu        [r0 + 48], m2

    ; --- row 0, words 32..47 ---
    movu        m0, [r2 + 64]
    movu        m2, [r2 + 80]
    movu        m1, [r3 + 64]
    movu        m3, [r3 + 80]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + 64], m0
    movu        [r0 + 80], m2

    ; --- row 0, words 48..63 ---
    movu        m0, [r2 + 96]
    movu        m2, [r2 + 112]
    movu        m1, [r3 + 96]
    movu        m3, [r3 + 112]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + 96], m0
    movu        [r0 + 112], m2

    ; --- row 1, words 0..15 ---
    movu        m0, [r2 + r4]
    movu        m2, [r2 + r4 + 16]
    movu        m1, [r3 + r5]
    movu        m3, [r3 + r5 + 16]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m2

    ; --- row 1, words 16..31 ---
    movu        m0, [r2 + r4 + 32]
    movu        m2, [r2 + r4 + 48]
    movu        m1, [r3 + r5 + 32]
    movu        m3, [r3 + r5 + 48]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1 + 32], m0
    movu        [r0 + r1 + 48], m2

    ; --- row 1, words 32..47 ---
    movu        m0, [r2 + r4 + 64]
    movu        m2, [r2 + r4 + 80]
    movu        m1, [r3 + r5 + 64]
    movu        m3, [r3 + r5 + 80]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1 + 64], m0
    movu        [r0 + r1 + 80], m2

    ; --- row 1, words 48..63 ---
    movu        m0, [r2 + r4 + 96]
    movu        m2, [r2 + r4 + 112]
    movu        m1, [r3 + r5 + 96]
    movu        m3, [r3 + r5 + 112]
    dec         r6d                     ; count down; jnz below uses ZF
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m1
    paddw       m2, m3
    CLIPW2      m0, m2, m4, m5

    movu        [r0 + r1 + 96], m0
    movu        [r0 + r1 + 112], m2
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, %2/2               ; iterations = height / 2
    add         r5, r5                  ; residual stride int16 units -> bytes
.loop:
    ; --- row 0, pixels 0..31 ---
    pmovzxbw    m0, [r2]
    pmovzxbw    m1, [r2 + 8]
    pmovzxbw    m2, [r2 + 16]
    pmovzxbw    m3, [r2 + 24]
    movu        m4, [r3]
    movu        m5, [r3 + 16]
    movu        m6, [r3 + 32]
    movu        m7, [r3 + 48]

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1                  ; saturating pack clamps to [0,255]
    packuswb    m2, m3

    movu        [r0], m0
    movu        [r0 + 16], m2

    ; --- row 0, pixels 32..63 ---
    pmovzxbw    m0, [r2 + 32]
    pmovzxbw    m1, [r2 + 40]
    pmovzxbw    m2, [r2 + 48]
    pmovzxbw    m3, [r2 + 56]
    movu        m4, [r3 + 64]
    movu        m5, [r3 + 80]
    movu        m6, [r3 + 96]
    movu        m7, [r3 + 112]

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1
    packuswb    m2, m3

    movu        [r0 + 32], m0
    movu        [r0 + 48], m2

    ; --- row 1, pixels 0..31 ---
    pmovzxbw    m0, [r2 + r4]
    pmovzxbw    m1, [r2 + r4 + 8]
    pmovzxbw    m2, [r2 + r4 + 16]
    pmovzxbw    m3, [r2 + r4 + 24]
    movu        m4, [r3 + r5]
    movu        m5, [r3 + r5 + 16]
    movu        m6, [r3 + r5 + 32]
    movu        m7, [r3 + r5 + 48]

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1
    packuswb    m2, m3

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m2

    ; --- row 1, pixels 32..63 ---
    pmovzxbw    m0, [r2 + r4 + 32]
    pmovzxbw    m1, [r2 + r4 + 40]
    pmovzxbw    m2, [r2 + r4 + 48]
    pmovzxbw    m3, [r2 + r4 + 56]
    movu        m4, [r3 + r5 + 64]
    movu        m5, [r3 + r5 + 80]
    movu        m6, [r3 + r5 + 96]
    movu        m7, [r3 + r5 + 112]
    dec         r6d
    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1
    packuswb    m2, m3

    movu        [r0 + r1 + 32], m0
    movu        [r0 + r1 + 48], m2
    lea         r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro
PIXEL_ADD_PS_W64_H2 64, 64
978 | |
;-----------------------------------------------------------------------------
; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; AVX2 64x64 variant. In HBD each row is four ymm chunks and four rows are
; processed per iteration (16 iterations; x86-64 only since r7..r9 cache
; 3x strides). In LBD each row widens into four ymm word vectors and two
; rows are processed per iteration (32 iterations).
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_64x64, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova        m5, [pw_pixel_max]      ; clamp bounds in m4/m5
    pxor        m4, m4
    mov         r6d, 16                 ; 64 rows / 4 rows per iteration
    add         r4d, r4d                ; element strides -> byte strides
    add         r5d, r5d
    add         r1d, r1d
    lea         r7, [r4 * 3]            ; precompute 3*stride for row 3
    lea         r8, [r5 * 3]
    lea         r9, [r1 * 3]

.loop:
    ; row 0, first half
    movu        m0, [r2]
    movu        m1, [r2 + 32]
    movu        m2, [r3]
    movu        m3, [r3 + 32]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5          ; clamp to [0, pixel_max]
    movu        [r0], m0
    movu        [r0 + 32], m1

    ; row 0, second half
    movu        m0, [r2 + 64]
    movu        m1, [r2 + 96]
    movu        m2, [r3 + 64]
    movu        m3, [r3 + 96]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5
    movu        [r0 + 64], m0
    movu        [r0 + 96], m1

    ; row 1, first half
    movu        m0, [r2 + r4]
    movu        m1, [r2 + r4 + 32]
    movu        m2, [r3 + r5]
    movu        m3, [r3 + r5 + 32]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5
    movu        [r0 + r1], m0
    movu        [r0 + r1 + 32], m1

    ; row 1, second half
    movu        m0, [r2 + r4 + 64]
    movu        m1, [r2 + r4 + 96]
    movu        m2, [r3 + r5 + 64]
    movu        m3, [r3 + r5 + 96]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5
    movu        [r0 + r1 + 64], m0
    movu        [r0 + r1 + 96], m1

    ; row 2, first half
    movu        m0, [r2 + r4 * 2]
    movu        m1, [r2 + r4 * 2 + 32]
    movu        m2, [r3 + r5 * 2]
    movu        m3, [r3 + r5 * 2 + 32]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 2 + 32], m1

    ; row 2, second half
    movu        m0, [r2 + r4 * 2 + 64]
    movu        m1, [r2 + r4 * 2 + 96]
    movu        m2, [r3 + r5 * 2 + 64]
    movu        m3, [r3 + r5 * 2 + 96]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5
    movu        [r0 + r1 * 2 + 64], m0
    movu        [r0 + r1 * 2 + 96], m1

    ; row 3, first half (via precomputed 3*stride)
    movu        m0, [r2 + r7]
    movu        m1, [r2 + r7 + 32]
    movu        m2, [r3 + r8]
    movu        m3, [r3 + r8 + 32]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5
    movu        [r0 + r9], m0
    movu        [r0 + r9 + 32], m1

    ; row 3, second half
    movu        m0, [r2 + r7 + 64]
    movu        m1, [r2 + r7 + 96]
    movu        m2, [r3 + r8 + 64]
    movu        m3, [r3 + r8 + 96]
    paddw       m0, m2
    paddw       m1, m3

    CLIPW2      m0, m1, m4, m5
    movu        [r0 + r9 + 64], m0
    movu        [r0 + r9 + 96], m1

    dec         r6d
    lea         r0, [r0 + r1 * 4]       ; advance all pointers by 4 rows
    lea         r2, [r2 + r4 * 4]
    lea         r3, [r3 + r5 * 4]
    jnz .loop
    RET
%endif
%else
INIT_YMM avx2
cglobal pixel_add_ps_64x64, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov         r6d, 32                 ; 64 rows / 2 rows per iteration
    add         r5, r5                  ; residual stride int16 units -> bytes
.loop:
    pmovzxbw    m0, [r2]                ; first 16 of row 0 of src0
    pmovzxbw    m1, [r2 + 16]           ; second 16 of row 0 of src0
    pmovzxbw    m2, [r2 + 32]           ; third 16 of row 0 of src0
    pmovzxbw    m3, [r2 + 48]           ; forth 16 of row 0 of src0
    movu        m4, [r3]                ; first 16 of row 0 of src1
    movu        m5, [r3 + 32]           ; second 16 of row 0 of src1
    movu        m6, [r3 + 64]           ; third 16 of row 0 of src1
    movu        m7, [r3 + 96]           ; forth 16 of row 0 of src1

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1                  ; saturating pack clamps to [0,255]
    packuswb    m2, m3
    vpermq      m0, m0, 11011000b       ; undo packuswb's lane interleave
    movu        [r0], m0                ; first 32 of row 0 of dst
    vpermq      m2, m2, 11011000b
    movu        [r0 + 32], m2           ; second 32 of row 0 of dst

    pmovzxbw    m0, [r2 + r4]           ; first 16 of row 1 of src0
    pmovzxbw    m1, [r2 + r4 + 16]      ; second 16 of row 1 of src0
    pmovzxbw    m2, [r2 + r4 + 32]      ; third 16 of row 1 of src0
    pmovzxbw    m3, [r2 + r4 + 48]      ; forth 16 of row 1 of src0
    movu        m4, [r3 + r5]           ; first 16 of row 1 of src1
    movu        m5, [r3 + r5 + 32]      ; second 16 of row 1 of src1
    movu        m6, [r3 + r5 + 64]      ; third 16 of row 1 of src1
    movu        m7, [r3 + r5 + 96]      ; forth 16 of row 1 of src1

    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    packuswb    m0, m1
    packuswb    m2, m3
    vpermq      m0, m0, 11011000b
    movu        [r0 + r1], m0           ; first 32 of row 1 of dst
    vpermq      m2, m2, 11011000b
    movu        [r0 + r1 + 32], m2      ; second 32 of row 1 of dst

    lea         r2, [r2 + r4 * 2]
    lea         r3, [r3 + r5 * 2]
    lea         r0, [r0 + r1 * 2]

    dec         r6d
    jnz .loop
    RET

%endif