comparison x265/source/common/x86/pixeladd8.asm @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5 ;*
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
10 ;*
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
15 ;*
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 ;*
20 ;* This program is also available under a commercial proprietary license.
21 ;* For more information, contact us at license @ x265.com.
22 ;*****************************************************************************
23
24 %include "x86inc.asm"
25 %include "x86util.asm"
26
27 SECTION_RODATA 32
28
29 SECTION .text
30
31 cextern pw_pixel_max
32
33 ;-----------------------------------------------------------------------------
34 ; void pixel_add_ps_4x4(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
35 ;-----------------------------------------------------------------------------
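;
; All kernels below compute dest = clip(src0 + src1): an int16_t residual
; (src1) is added to the prediction (src0) and the sum is clamped to the
; valid pixel range. A rough C equivalent, an illustrative sketch rather
; than code from the x265 tree (Clip3, PIXEL_MAX and the bw/bh bounds are
; placeholder names; Clip3(lo, hi, v) clamps v to [lo, hi]):
;
;     for (int y = 0; y < bh; y++)
;     {
;         for (int x = 0; x < bw; x++)
;             dest[x] = (pixel)Clip3(0, PIXEL_MAX, src0[x] + src1[x]);
;         dest += destStride;
;         src0 += srcStride0;
;         src1 += srcStride1;
;     }
;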
36 %if HIGH_BIT_DEPTH
37 INIT_XMM sse2
38 cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destStride, src0, src1, srcStride0, srcStride1
39 mova m1, [pw_pixel_max]
40 pxor m0, m0
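    ; strides arrive in element units; double them to byte offsets, since
    ; both the 16-bit pixels and the int16_t residuals are two bytes wide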
41 add r4, r4
42 add r5, r5
43 add r1, r1
44 movh m2, [r2]
45 movhps m2, [r2 + r4]
46 movh m3, [r3]
47 movhps m3, [r3 + r5]
48 lea r2, [r2 + r4 * 2]
49 lea r3, [r3 + r5 * 2]
50 movh m4, [r2]
51 movhps m4, [r2 + r4]
52 movh m5, [r3]
53 movhps m5, [r3 + r5]
54
55 paddw m2, m3
56 paddw m4, m5
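    ; CLIPW2 clamps both sum registers to [0, pw_pixel_max] (m0 = zero, m1 = max)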
57 CLIPW2 m2, m4, m0, m1
58
59 movh [r0], m2
60 movhps [r0 + r1], m2
61 lea r0, [r0 + r1 * 2]
62 movh [r0], m4
63 movhps [r0 + r1], m4
64
65 RET
66 %else
67 INIT_XMM sse4
68 cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destStride, src0, src1, srcStride0, srcStride1
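    ; only src1 (int16_t residuals) needs its stride doubled to bytes; at
    ; 8-bit depth the pixel strides are byte offsets already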
69 add r5, r5
70 pmovzxbw m0, [r2]
71 pmovzxbw m2, [r2 + r4]
72 movh m1, [r3]
73 movh m3, [r3 + r5]
74 lea r2, [r2 + r4 * 2]
75 lea r3, [r3 + r5 * 2]
76 pmovzxbw m4, [r2]
77 pmovzxbw m6, [r2 + r4]
78 movh m5, [r3]
79 movh m7, [r3 + r5]
80
81 paddw m0, m1
82 paddw m2, m3
83 paddw m4, m5
84 paddw m6, m7
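    ; packuswb saturates each 16-bit sum to [0, 255], so the pack itself
    ; performs the clip at 8-bit depth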
85 packuswb m0, m0
86 packuswb m2, m2
87 packuswb m4, m4
88 packuswb m6, m6
89
90 movd [r0], m0
91 movd [r0 + r1], m2
92 lea r0, [r0 + r1 * 2]
93 movd [r0], m4
94 movd [r0 + r1], m6
95
96 RET
97 %endif
98
99
100 ;-----------------------------------------------------------------------------
101 ; void pixel_add_ps_4x%2(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
102 ;-----------------------------------------------------------------------------
103 %macro PIXEL_ADD_PS_W4_H4 2
104 %if HIGH_BIT_DEPTH
105 INIT_XMM sse2
106 cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destStride, src0, src1, srcStride0, srcStride1
107 mova m1, [pw_pixel_max]
108 pxor m0, m0
109 mov r6d, %2/4
110 add r4, r4
111 add r5, r5
112 add r1, r1
113 .loop:
114 movh m2, [r2]
115 movhps m2, [r2 + r4]
116 movh m3, [r3]
117 movhps m3, [r3 + r5]
118 lea r2, [r2 + r4 * 2]
119 lea r3, [r3 + r5 * 2]
120 movh m4, [r2]
121 movhps m4, [r2 + r4]
122 movh m5, [r3]
123 movhps m5, [r3 + r5]
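    ; dec is hoisted above the pointer updates: lea and the vector ops below
    ; never write EFLAGS, so its ZF survives to the jnz at the loop bottom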
124 dec r6d
125 lea r2, [r2 + r4 * 2]
126 lea r3, [r3 + r5 * 2]
127
128 paddw m2, m3
129 paddw m4, m5
130 CLIPW2 m2, m4, m0, m1
131
132 movh [r0], m2
133 movhps [r0 + r1], m2
134 lea r0, [r0 + r1 * 2]
135 movh [r0], m4
136 movhps [r0 + r1], m4
137 lea r0, [r0 + r1 * 2]
138
139 jnz .loop
140 RET
141 %else
142 INIT_XMM sse4
143 cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destStride, src0, src1, srcStride0, srcStride1
144 mov r6d, %2/4
145 add r5, r5
146 .loop:
147 pmovzxbw m0, [r2]
148 pmovzxbw m2, [r2 + r4]
149 movh m1, [r3]
150 movh m3, [r3 + r5]
151 lea r2, [r2 + r4 * 2]
152 lea r3, [r3 + r5 * 2]
153 pmovzxbw m4, [r2]
154 pmovzxbw m6, [r2 + r4]
155 movh m5, [r3]
156 movh m7, [r3 + r5]
157 dec r6d
158 lea r2, [r2 + r4 * 2]
159 lea r3, [r3 + r5 * 2]
160
161 paddw m0, m1
162 paddw m2, m3
163 paddw m4, m5
164 paddw m6, m7
165 packuswb m0, m0
166 packuswb m2, m2
167 packuswb m4, m4
168 packuswb m6, m6
169
170 movd [r0], m0
171 movd [r0 + r1], m2
172 lea r0, [r0 + r1 * 2]
173 movd [r0], m4
174 movd [r0 + r1], m6
175 lea r0, [r0 + r1 * 2]
176
177 jnz .loop
178 RET
179 %endif
180 %endmacro
181
182 PIXEL_ADD_PS_W4_H4 4, 8
183
184
185 ;-----------------------------------------------------------------------------
186 ; void pixel_add_ps_8x%2(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
187 ;-----------------------------------------------------------------------------
188 %macro PIXEL_ADD_PS_W8_H4 2
189 %if HIGH_BIT_DEPTH
190 INIT_XMM sse2
191 cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destStride, src0, src1, srcStride0, srcStride1
192 mova m5, [pw_pixel_max]
193 pxor m4, m4
194 mov r6d, %2/4
195 add r4, r4
196 add r5, r5
197 add r1, r1
198 .loop:
199 movu m0, [r2]
200 movu m2, [r2 + r4]
201 movu m1, [r3]
202 movu m3, [r3 + r5]
203 lea r2, [r2 + r4 * 2]
204 lea r3, [r3 + r5 * 2]
205
206 paddw m0, m1
207 paddw m2, m3
208 CLIPW2 m0, m2, m4, m5
209
210 movu [r0], m0
211 movu [r0 + r1], m2
212
213 movu m0, [r2]
214 movu m2, [r2 + r4]
215 movu m1, [r3]
216 movu m3, [r3 + r5]
217 dec r6d
218 lea r0, [r0 + r1 * 2]
219 lea r2, [r2 + r4 * 2]
220 lea r3, [r3 + r5 * 2]
221
222 paddw m0, m1
223 paddw m2, m3
224 CLIPW2 m0, m2, m4, m5
225
226 movu [r0], m0
227 movu [r0 + r1], m2
228 lea r0, [r0 + r1 * 2]
229
230 jnz .loop
231 RET
232 %else
233 INIT_XMM sse4
234 cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destStride, src0, src1, srcStride0, srcStride1
235 mov r6d, %2/4
236 add r5, r5
237 .loop:
238 pmovzxbw m0, [r2]
239 pmovzxbw m2, [r2 + r4]
240 movu m1, [r3]
241 movu m3, [r3 + r5]
242 lea r2, [r2 + r4 * 2]
243 lea r3, [r3 + r5 * 2]
244 pmovzxbw m4, [r2]
245 pmovzxbw m6, [r2 + r4]
246 movu m5, [r3]
247 movu m7, [r3 + r5]
248 dec r6d
249 lea r2, [r2 + r4 * 2]
250 lea r3, [r3 + r5 * 2]
251
252 paddw m0, m1
253 paddw m2, m3
254 paddw m4, m5
255 paddw m6, m7
256 packuswb m0, m0
257 packuswb m2, m2
258 packuswb m4, m4
259 packuswb m6, m6
260
261 movh [r0], m0
262 movh [r0 + r1], m2
263 lea r0, [r0 + r1 * 2]
264 movh [r0], m4
265 movh [r0 + r1], m6
266 lea r0, [r0 + r1 * 2]
267
268 jnz .loop
269 RET
270 %endif
271 %endmacro
272
273 PIXEL_ADD_PS_W8_H4 8, 8
274 PIXEL_ADD_PS_W8_H4 8, 16
275
276
277 ;-----------------------------------------------------------------------------
278 ; void pixel_add_ps_16x%2(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
279 ;-----------------------------------------------------------------------------
280 %macro PIXEL_ADD_PS_W16_H4 2
281 %if HIGH_BIT_DEPTH
282 INIT_XMM sse2
283 cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destStride, src0, src1, srcStride0, srcStride1
284 mova m5, [pw_pixel_max]
285 pxor m4, m4
286 mov r6d, %2/4
287 add r4, r4
288 add r5, r5
289 add r1, r1
290 .loop:
291 movu m0, [r2]
292 movu m2, [r2 + 16]
293 movu m1, [r3]
294 movu m3, [r3 + 16]
295
296 paddw m0, m1
297 paddw m2, m3
298 CLIPW2 m0, m2, m4, m5
299
300 movu [r0], m0
301 movu [r0 + 16], m2
302
303 movu m0, [r2 + r4]
304 movu m2, [r2 + r4 + 16]
305 movu m1, [r3 + r5]
306 movu m3, [r3 + r5 + 16]
307 lea r2, [r2 + r4 * 2]
308 lea r3, [r3 + r5 * 2]
309
310 paddw m0, m1
311 paddw m2, m3
312 CLIPW2 m0, m2, m4, m5
313
314 movu [r0 + r1], m0
315 movu [r0 + r1 + 16], m2
316
317 movu m0, [r2]
318 movu m2, [r2 + 16]
319 movu m1, [r3]
320 movu m3, [r3 + 16]
321 lea r0, [r0 + r1 * 2]
322
323 paddw m0, m1
324 paddw m2, m3
325 CLIPW2 m0, m2, m4, m5
326
327 movu [r0], m0
328 movu [r0 + 16], m2
329
330 movu m0, [r2 + r4]
331 movu m2, [r2 + r4 + 16]
332 movu m1, [r3 + r5]
333 movu m3, [r3 + r5 + 16]
334 dec r6d
335 lea r2, [r2 + r4 * 2]
336 lea r3, [r3 + r5 * 2]
337
338 paddw m0, m1
339 paddw m2, m3
340 CLIPW2 m0, m2, m4, m5
341
342 movu [r0 + r1], m0
343 movu [r0 + r1 + 16], m2
344 lea r0, [r0 + r1 * 2]
345
346 jnz .loop
347 RET
348 %else
349 INIT_XMM sse4
350 cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destStride, src0, src1, srcStride0, srcStride1
351 mov r6d, %2/4
352 add r5, r5
353 .loop:
354 pmovzxbw m0, [r2]
355 pmovzxbw m1, [r2 + 8]
356 pmovzxbw m4, [r2 + r4]
357 pmovzxbw m5, [r2 + r4 + 8]
358 movu m2, [r3]
359 movu m3, [r3 + 16]
360 movu m6, [r3 + r5]
361 movu m7, [r3 + r5 + 16]
362 lea r2, [r2 + r4 * 2]
363 lea r3, [r3 + r5 * 2]
364
365 paddw m0, m2
366 paddw m1, m3
367 paddw m4, m6
368 paddw m5, m7
369 packuswb m0, m1
370 packuswb m4, m5
371
372 movu [r0], m0
373 movu [r0 + r1], m4
374
375 pmovzxbw m0, [r2]
376 pmovzxbw m1, [r2 + 8]
377 pmovzxbw m4, [r2 + r4]
378 pmovzxbw m5, [r2 + r4 + 8]
379 movu m2, [r3]
380 movu m3, [r3 + 16]
381 movu m6, [r3 + r5]
382 movu m7, [r3 + r5 + 16]
383 dec r6d
384 lea r0, [r0 + r1 * 2]
385 lea r2, [r2 + r4 * 2]
386 lea r3, [r3 + r5 * 2]
387
388 paddw m0, m2
389 paddw m1, m3
390 paddw m4, m6
391 paddw m5, m7
392 packuswb m0, m1
393 packuswb m4, m5
394
395 movu [r0], m0
396 movu [r0 + r1], m4
397 lea r0, [r0 + r1 * 2]
398
399 jnz .loop
400 RET
401 %endif
402 %endmacro
403 PIXEL_ADD_PS_W16_H4 16, 16
404 PIXEL_ADD_PS_W16_H4 16, 32
405
406 ;-----------------------------------------------------------------------------
407 ; void pixel_add_ps_16x%1(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
408 ;-----------------------------------------------------------------------------
409 %macro PIXEL_ADD_PS_W16_H4_avx2 1
410 %if HIGH_BIT_DEPTH
411 %if ARCH_X86_64
412 INIT_YMM avx2
413 cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destStride, src0, src1, srcStride0, srcStride1
414 mova m3, [pw_pixel_max]
415 pxor m2, m2
416 mov r6d, %1/4
417 add r4d, r4d
418 add r5d, r5d
419 add r1d, r1d
420 lea r7, [r4 * 3]
421 lea r8, [r5 * 3]
422 lea r9, [r1 * 3]
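    ; r7/r8/r9 = 3 * stride for src0, src1 and dest, letting each iteration
    ; address four rows with a single pointer update per block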
423
424 .loop:
425 movu m0, [r2]
426 movu m1, [r3]
427 paddw m0, m1
428 CLIPW m0, m2, m3
429 movu [r0], m0
430
431 movu m0, [r2 + r4]
432 movu m1, [r3 + r5]
433 paddw m0, m1
434 CLIPW m0, m2, m3
435 movu [r0 + r1], m0
436
437 movu m0, [r2 + r4 * 2]
438 movu m1, [r3 + r5 * 2]
439 paddw m0, m1
440 CLIPW m0, m2, m3
441 movu [r0 + r1 * 2], m0
442
443 movu m0, [r2 + r7]
444 movu m1, [r3 + r8]
445 paddw m0, m1
446 CLIPW m0, m2, m3
447 movu [r0 + r9], m0
448
449 dec r6d
450 lea r0, [r0 + r1 * 4]
451 lea r2, [r2 + r4 * 4]
452 lea r3, [r3 + r5 * 4]
453 jnz .loop
454 RET
455 %endif
456 %else
457 INIT_YMM avx2
458 cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, destStride, src0, src1, srcStride0, srcStride1
459 mov r6d, %1/4
460 add r5, r5
461 .loop:
462
463 pmovzxbw m0, [r2] ; row 0 of src0
464 pmovzxbw m1, [r2 + r4] ; row 1 of src0
465 movu m2, [r3] ; row 0 of src1
466 movu m3, [r3 + r5] ; row 1 of src1
467 paddw m0, m2
468 paddw m1, m3
469 packuswb m0, m1
470
471 lea r2, [r2 + r4 * 2]
472 lea r3, [r3 + r5 * 2]
473
474 pmovzxbw m2, [r2] ; row 2 of src0
475 pmovzxbw m3, [r2 + r4] ; row 3 of src0
476 movu m4, [r3] ; row 2 of src1
477 movu m5, [r3 + r5] ; row 3 of src1
478 paddw m2, m4
479 paddw m3, m5
480 packuswb m2, m3
481
482 lea r2, [r2 + r4 * 2]
483 lea r3, [r3 + r5 * 2]
484
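    ; packuswb packs within each 128-bit lane, leaving the two rows
    ; interleaved; vpermq with 11011000b (0xD8) reorders the qwords so row 0
    ; lands in the low lane and row 1 in the high lane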
485 vpermq m0, m0, 11011000b
486 movu [r0], xm0 ; row 0 of dst
487 vextracti128 xm3, m0, 1
488 movu [r0 + r1], xm3 ; row 1 of dst
489
490 lea r0, [r0 + r1 * 2]
491 vpermq m2, m2, 11011000b
492 movu [r0], xm2 ; row 2 of dst
493 vextracti128 xm3, m2, 1
494 movu [r0 + r1], xm3 ; row 3 of dst
495
496 lea r0, [r0 + r1 * 2]
497
498 dec r6d
499 jnz .loop
500
501 RET
502 %endif
503 %endmacro
504
505 PIXEL_ADD_PS_W16_H4_avx2 16
506 PIXEL_ADD_PS_W16_H4_avx2 32
507
508
509 ;-----------------------------------------------------------------------------
510 ; void pixel_add_ps_32x%2(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
511 ;-----------------------------------------------------------------------------
512 %macro PIXEL_ADD_PS_W32_H2 2
513 %if HIGH_BIT_DEPTH
514 INIT_XMM sse2
515 cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destStride, src0, src1, srcStride0, srcStride1
516 mova m5, [pw_pixel_max]
517 pxor m4, m4
518 mov r6d, %2/2
519 add r4, r4
520 add r5, r5
521 add r1, r1
522 .loop:
523 movu m0, [r2]
524 movu m2, [r2 + 16]
525 movu m1, [r3]
526 movu m3, [r3 + 16]
527
528 paddw m0, m1
529 paddw m2, m3
530 CLIPW2 m0, m2, m4, m5
531
532 movu [r0], m0
533 movu [r0 + 16], m2
534
535 movu m0, [r2 + 32]
536 movu m2, [r2 + 48]
537 movu m1, [r3 + 32]
538 movu m3, [r3 + 48]
539
540 paddw m0, m1
541 paddw m2, m3
542 CLIPW2 m0, m2, m4, m5
543
544 movu [r0 + 32], m0
545 movu [r0 + 48], m2
546
547 movu m0, [r2 + r4]
548 movu m2, [r2 + r4 + 16]
549 movu m1, [r3 + r5]
550 movu m3, [r3 + r5 + 16]
551
552 paddw m0, m1
553 paddw m2, m3
554 CLIPW2 m0, m2, m4, m5
555
556 movu [r0 + r1], m0
557 movu [r0 + r1 + 16], m2
558
559 movu m0, [r2 + r4 + 32]
560 movu m2, [r2 + r4 + 48]
561 movu m1, [r3 + r5 + 32]
562 movu m3, [r3 + r5 + 48]
563 dec r6d
564 lea r2, [r2 + r4 * 2]
565 lea r3, [r3 + r5 * 2]
566
567 paddw m0, m1
568 paddw m2, m3
569 CLIPW2 m0, m2, m4, m5
570
571 movu [r0 + r1 + 32], m0
572 movu [r0 + r1 + 48], m2
573 lea r0, [r0 + r1 * 2]
574
575 jnz .loop
576 RET
577 %else
578 INIT_XMM sse4
579 cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destStride, src0, src1, srcStride0, srcStride1
580 mov r6d, %2/2
581 add r5, r5
582 .loop:
583 pmovzxbw m0, [r2]
584 pmovzxbw m1, [r2 + 8]
585 pmovzxbw m2, [r2 + 16]
586 pmovzxbw m3, [r2 + 24]
587 movu m4, [r3]
588 movu m5, [r3 + 16]
589 movu m6, [r3 + 32]
590 movu m7, [r3 + 48]
591
592 paddw m0, m4
593 paddw m1, m5
594 paddw m2, m6
595 paddw m3, m7
596 packuswb m0, m1
597 packuswb m2, m3
598
599 movu [r0], m0
600 movu [r0 + 16], m2
601
602 pmovzxbw m0, [r2 + r4]
603 pmovzxbw m1, [r2 + r4 + 8]
604 pmovzxbw m2, [r2 + r4 + 16]
605 pmovzxbw m3, [r2 + r4 + 24]
606 movu m4, [r3 + r5]
607 movu m5, [r3 + r5 + 16]
608 movu m6, [r3 + r5 + 32]
609 movu m7, [r3 + r5 + 48]
610 dec r6d
611 lea r2, [r2 + r4 * 2]
612 lea r3, [r3 + r5 * 2]
613
614 paddw m0, m4
615 paddw m1, m5
616 paddw m2, m6
617 paddw m3, m7
618 packuswb m0, m1
619 packuswb m2, m3
620
621 movu [r0 + r1], m0
622 movu [r0 + r1 + 16], m2
623 lea r0, [r0 + r1 * 2]
624
625 jnz .loop
626 RET
627 %endif
628 %endmacro
629 PIXEL_ADD_PS_W32_H2 32, 32
630 PIXEL_ADD_PS_W32_H2 32, 64
631
632 ;-----------------------------------------------------------------------------
633 ; void pixel_add_ps_32x%1(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
634 ;-----------------------------------------------------------------------------
635 %macro PIXEL_ADD_PS_W32_H4_avx2 1
636 %if HIGH_BIT_DEPTH
637 %if ARCH_X86_64
638 INIT_YMM avx2
639 cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destStride, src0, src1, srcStride0, srcStride1
640 mova m5, [pw_pixel_max]
641 pxor m4, m4
642 mov r6d, %1/4
643 add r4d, r4d
644 add r5d, r5d
645 add r1d, r1d
646 lea r7, [r4 * 3]
647 lea r8, [r5 * 3]
648 lea r9, [r1 * 3]
649
650 .loop:
651 movu m0, [r2]
652 movu m2, [r2 + 32]
653 movu m1, [r3]
654 movu m3, [r3 + 32]
655 paddw m0, m1
656 paddw m2, m3
657 CLIPW2 m0, m2, m4, m5
658
659 movu [r0], m0
660 movu [r0 + 32], m2
661
662 movu m0, [r2 + r4]
663 movu m2, [r2 + r4 + 32]
664 movu m1, [r3 + r5]
665 movu m3, [r3 + r5 + 32]
666 paddw m0, m1
667 paddw m2, m3
668 CLIPW2 m0, m2, m4, m5
669
670 movu [r0 + r1], m0
671 movu [r0 + r1 + 32], m2
672
673 movu m0, [r2 + r4 * 2]
674 movu m2, [r2 + r4 * 2 + 32]
675 movu m1, [r3 + r5 * 2]
676 movu m3, [r3 + r5 * 2 + 32]
677 paddw m0, m1
678 paddw m2, m3
679 CLIPW2 m0, m2, m4, m5
680
681 movu [r0 + r1 * 2], m0
682 movu [r0 + r1 * 2 + 32], m2
683
684 movu m0, [r2 + r7]
685 movu m2, [r2 + r7 + 32]
686 movu m1, [r3 + r8]
687 movu m3, [r3 + r8 + 32]
688 paddw m0, m1
689 paddw m2, m3
690 CLIPW2 m0, m2, m4, m5
691
692 movu [r0 + r9], m0
693 movu [r0 + r9 + 32], m2
694
695 dec r6d
696 lea r0, [r0 + r1 * 4]
697 lea r2, [r2 + r4 * 4]
698 lea r3, [r3 + r5 * 4]
699 jnz .loop
700 RET
701 %endif
702 %else
703 %if ARCH_X86_64
704 INIT_YMM avx2
705 cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, destStride, src0, src1, srcStride0, srcStride1
706 mov r6d, %1/4
707 add r5, r5
708 lea r7, [r4 * 3]
709 lea r8, [r5 * 3]
710 lea r9, [r1 * 3]
711 .loop:
712 pmovzxbw m0, [r2] ; first half of row 0 of src0
713 pmovzxbw m1, [r2 + 16] ; second half of row 0 of src0
714 movu m2, [r3] ; first half of row 0 of src1
715 movu m3, [r3 + 32] ; second half of row 0 of src1
716
717 paddw m0, m2
718 paddw m1, m3
719 packuswb m0, m1
720 vpermq m0, m0, 11011000b
721 movu [r0], m0 ; row 0 of dst
722
723 pmovzxbw m0, [r2 + r4] ; first half of row 1 of src0
724 pmovzxbw m1, [r2 + r4 + 16] ; second half of row 1 of src0
725 movu m2, [r3 + r5] ; first half of row 1 of src1
726 movu m3, [r3 + r5 + 32] ; second half of row 1 of src1
727
728 paddw m0, m2
729 paddw m1, m3
730 packuswb m0, m1
731 vpermq m0, m0, 11011000b
732 movu [r0 + r1], m0 ; row 1 of dst
733
734 pmovzxbw m0, [r2 + r4 * 2] ; first half of row 2 of src0
735 pmovzxbw m1, [r2 + r4 * 2 + 16] ; second half of row 2 of src0
736 movu m2, [r3 + r5 * 2] ; first half of row 2 of src1
737 movu m3, [r3 + r5 * 2 + 32] ; second half of row 2 of src1
738
739 paddw m0, m2
740 paddw m1, m3
741 packuswb m0, m1
742 vpermq m0, m0, 11011000b
743 movu [r0 + r1 * 2], m0 ; row 2 of dst
744
745 pmovzxbw m0, [r2 + r7] ; first half of row 3 of src0
746 pmovzxbw m1, [r2 + r7 + 16] ; second half of row 3 of src0
747 movu m2, [r3 + r8] ; first half of row 3 of src1
748 movu m3, [r3 + r8 + 32] ; second half of row 3 of src1
749
750 paddw m0, m2
751 paddw m1, m3
752 packuswb m0, m1
753 vpermq m0, m0, 11011000b
754 movu [r0 + r9], m0 ; row 3 of dst
755
756 lea r2, [r2 + r4 * 4]
757 lea r3, [r3 + r5 * 4]
758 lea r0, [r0 + r1 * 4]
759
760 dec r6d
761 jnz .loop
762 RET
763 %endif
764 %endif
765 %endmacro
766
767 PIXEL_ADD_PS_W32_H4_avx2 32
768 PIXEL_ADD_PS_W32_H4_avx2 64
769
770
771 ;-----------------------------------------------------------------------------
772 ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
773 ;-----------------------------------------------------------------------------
774 %macro PIXEL_ADD_PS_W64_H2 2
775 %if HIGH_BIT_DEPTH
776 INIT_XMM sse2
777 cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destStride, src0, src1, srcStride0, srcStride1
778 mova m5, [pw_pixel_max]
779 pxor m4, m4
780 mov r6d, %2/2
781 add r4, r4
782 add r5, r5
783 add r1, r1
784 .loop:
785 movu m0, [r2]
786 movu m2, [r2 + 16]
787 movu m1, [r3]
788 movu m3, [r3 + 16]
789
790 paddw m0, m1
791 paddw m2, m3
792 CLIPW2 m0, m2, m4, m5
793
794 movu [r0], m0
795 movu [r0 + 16], m2
796
797 movu m0, [r2 + 32]
798 movu m2, [r2 + 48]
799 movu m1, [r3 + 32]
800 movu m3, [r3 + 48]
801
802 paddw m0, m1
803 paddw m2, m3
804 CLIPW2 m0, m2, m4, m5
805
806 movu [r0 + 32], m0
807 movu [r0 + 48], m2
808
809 movu m0, [r2 + 64]
810 movu m2, [r2 + 80]
811 movu m1, [r3 + 64]
812 movu m3, [r3 + 80]
813
814 paddw m0, m1
815 paddw m2, m3
816 CLIPW2 m0, m2, m4, m5
817
818 movu [r0 + 64], m0
819 movu [r0 + 80], m2
820
821 movu m0, [r2 + 96]
822 movu m2, [r2 + 112]
823 movu m1, [r3 + 96]
824 movu m3, [r3 + 112]
825
826 paddw m0, m1
827 paddw m2, m3
828 CLIPW2 m0, m2, m4, m5
829
830 movu [r0 + 96], m0
831 movu [r0 + 112], m2
832
833 movu m0, [r2 + r4]
834 movu m2, [r2 + r4 + 16]
835 movu m1, [r3 + r5]
836 movu m3, [r3 + r5 + 16]
837
838 paddw m0, m1
839 paddw m2, m3
840 CLIPW2 m0, m2, m4, m5
841
842 movu [r0 + r1], m0
843 movu [r0 + r1 + 16], m2
844
845 movu m0, [r2 + r4 + 32]
846 movu m2, [r2 + r4 + 48]
847 movu m1, [r3 + r5 + 32]
848 movu m3, [r3 + r5 + 48]
849
850 paddw m0, m1
851 paddw m2, m3
852 CLIPW2 m0, m2, m4, m5
853
854 movu [r0 + r1 + 32], m0
855 movu [r0 + r1 + 48], m2
856
857 movu m0, [r2 + r4 + 64]
858 movu m2, [r2 + r4 + 80]
859 movu m1, [r3 + r5 + 64]
860 movu m3, [r3 + r5 + 80]
861
862 paddw m0, m1
863 paddw m2, m3
864 CLIPW2 m0, m2, m4, m5
865
866 movu [r0 + r1 + 64], m0
867 movu [r0 + r1 + 80], m2
868
869 movu m0, [r2 + r4 + 96]
870 movu m2, [r2 + r4 + 112]
871 movu m1, [r3 + r5 + 96]
872 movu m3, [r3 + r5 + 112]
873 dec r6d
874 lea r2, [r2 + r4 * 2]
875 lea r3, [r3 + r5 * 2]
876
877 paddw m0, m1
878 paddw m2, m3
879 CLIPW2 m0, m2, m4, m5
880
881 movu [r0 + r1 + 96], m0
882 movu [r0 + r1 + 112], m2
883 lea r0, [r0 + r1 * 2]
884
885 jnz .loop
886 RET
887 %else
888 INIT_XMM sse4
889 cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destStride, src0, src1, srcStride0, srcStride1
890 mov r6d, %2/2
891 add r5, r5
892 .loop:
893 pmovzxbw m0, [r2]
894 pmovzxbw m1, [r2 + 8]
895 pmovzxbw m2, [r2 + 16]
896 pmovzxbw m3, [r2 + 24]
897 movu m4, [r3]
898 movu m5, [r3 + 16]
899 movu m6, [r3 + 32]
900 movu m7, [r3 + 48]
901
902 paddw m0, m4
903 paddw m1, m5
904 paddw m2, m6
905 paddw m3, m7
906 packuswb m0, m1
907 packuswb m2, m3
908
909 movu [r0], m0
910 movu [r0 + 16], m2
911
912 pmovzxbw m0, [r2 + 32]
913 pmovzxbw m1, [r2 + 40]
914 pmovzxbw m2, [r2 + 48]
915 pmovzxbw m3, [r2 + 56]
916 movu m4, [r3 + 64]
917 movu m5, [r3 + 80]
918 movu m6, [r3 + 96]
919 movu m7, [r3 + 112]
920
921 paddw m0, m4
922 paddw m1, m5
923 paddw m2, m6
924 paddw m3, m7
925 packuswb m0, m1
926 packuswb m2, m3
927
928 movu [r0 + 32], m0
929 movu [r0 + 48], m2
930
931 pmovzxbw m0, [r2 + r4]
932 pmovzxbw m1, [r2 + r4 + 8]
933 pmovzxbw m2, [r2 + r4 + 16]
934 pmovzxbw m3, [r2 + r4 + 24]
935 movu m4, [r3 + r5]
936 movu m5, [r3 + r5 + 16]
937 movu m6, [r3 + r5 + 32]
938 movu m7, [r3 + r5 + 48]
939
940 paddw m0, m4
941 paddw m1, m5
942 paddw m2, m6
943 paddw m3, m7
944 packuswb m0, m1
945 packuswb m2, m3
946
947 movu [r0 + r1], m0
948 movu [r0 + r1 + 16], m2
949
950 pmovzxbw m0, [r2 + r4 + 32]
951 pmovzxbw m1, [r2 + r4 + 40]
952 pmovzxbw m2, [r2 + r4 + 48]
953 pmovzxbw m3, [r2 + r4 + 56]
954 movu m4, [r3 + r5 + 64]
955 movu m5, [r3 + r5 + 80]
956 movu m6, [r3 + r5 + 96]
957 movu m7, [r3 + r5 + 112]
958 dec r6d
959 lea r2, [r2 + r4 * 2]
960 lea r3, [r3 + r5 * 2]
961
962 paddw m0, m4
963 paddw m1, m5
964 paddw m2, m6
965 paddw m3, m7
966 packuswb m0, m1
967 packuswb m2, m3
968
969 movu [r0 + r1 + 32], m0
970 movu [r0 + r1 + 48], m2
971 lea r0, [r0 + r1 * 2]
972
973 jnz .loop
974 RET
975 %endif
976 %endmacro
977 PIXEL_ADD_PS_W64_H2 64, 64
978
979 ;-----------------------------------------------------------------------------
980 ; void pixel_add_ps_64x64(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
981 ;-----------------------------------------------------------------------------
982 %if HIGH_BIT_DEPTH
983 %if ARCH_X86_64
984 INIT_YMM avx2
985 cglobal pixel_add_ps_64x64, 6, 10, 6, dest, destStride, src0, src1, srcStride0, srcStride1
986 mova m5, [pw_pixel_max]
987 pxor m4, m4
988 mov r6d, 16
989 add r4d, r4d
990 add r5d, r5d
991 add r1d, r1d
992 lea r7, [r4 * 3]
993 lea r8, [r5 * 3]
994 lea r9, [r1 * 3]
995
996 .loop:
997 movu m0, [r2]
998 movu m1, [r2 + 32]
999 movu m2, [r3]
1000 movu m3, [r3 + 32]
1001 paddw m0, m2
1002 paddw m1, m3
1003
1004 CLIPW2 m0, m1, m4, m5
1005 movu [r0], m0
1006 movu [r0 + 32], m1
1007
1008 movu m0, [r2 + 64]
1009 movu m1, [r2 + 96]
1010 movu m2, [r3 + 64]
1011 movu m3, [r3 + 96]
1012 paddw m0, m2
1013 paddw m1, m3
1014
1015 CLIPW2 m0, m1, m4, m5
1016 movu [r0 + 64], m0
1017 movu [r0 + 96], m1
1018
1019 movu m0, [r2 + r4]
1020 movu m1, [r2 + r4 + 32]
1021 movu m2, [r3 + r5]
1022 movu m3, [r3 + r5 + 32]
1023 paddw m0, m2
1024 paddw m1, m3
1025
1026 CLIPW2 m0, m1, m4, m5
1027 movu [r0 + r1], m0
1028 movu [r0 + r1 + 32], m1
1029
1030 movu m0, [r2 + r4 + 64]
1031 movu m1, [r2 + r4 + 96]
1032 movu m2, [r3 + r5 + 64]
1033 movu m3, [r3 + r5 + 96]
1034 paddw m0, m2
1035 paddw m1, m3
1036
1037 CLIPW2 m0, m1, m4, m5
1038 movu [r0 + r1 + 64], m0
1039 movu [r0 + r1 + 96], m1
1040
1041 movu m0, [r2 + r4 * 2]
1042 movu m1, [r2 + r4 * 2 + 32]
1043 movu m2, [r3 + r5 * 2]
1044 movu m3, [r3 + r5 * 2 + 32]
1045 paddw m0, m2
1046 paddw m1, m3
1047
1048 CLIPW2 m0, m1, m4, m5
1049 movu [r0 + r1 * 2], m0
1050 movu [r0 + r1 * 2 + 32], m1
1051
1052 movu m0, [r2 + r4 * 2 + 64]
1053 movu m1, [r2 + r4 * 2 + 96]
1054 movu m2, [r3 + r5 * 2 + 64]
1055 movu m3, [r3 + r5 * 2 + 96]
1056 paddw m0, m2
1057 paddw m1, m3
1058
1059 CLIPW2 m0, m1, m4, m5
1060 movu [r0 + r1 * 2 + 64], m0
1061 movu [r0 + r1 * 2 + 96], m1
1062
1063 movu m0, [r2 + r7]
1064 movu m1, [r2 + r7 + 32]
1065 movu m2, [r3 + r8]
1066 movu m3, [r3 + r8 + 32]
1067 paddw m0, m2
1068 paddw m1, m3
1069
1070 CLIPW2 m0, m1, m4, m5
1071 movu [r0 + r9], m0
1072 movu [r0 + r9 + 32], m1
1073
1074 movu m0, [r2 + r7 + 64]
1075 movu m1, [r2 + r7 + 96]
1076 movu m2, [r3 + r8 + 64]
1077 movu m3, [r3 + r8 + 96]
1078 paddw m0, m2
1079 paddw m1, m3
1080
1081 CLIPW2 m0, m1, m4, m5
1082 movu [r0 + r9 + 64], m0
1083 movu [r0 + r9 + 96], m1
1084
1085 dec r6d
1086 lea r0, [r0 + r1 * 4]
1087 lea r2, [r2 + r4 * 4]
1088 lea r3, [r3 + r5 * 4]
1089 jnz .loop
1090 RET
1091 %endif
1092 %else
1093 INIT_YMM avx2
1094 cglobal pixel_add_ps_64x64, 6, 7, 8, dest, destStride, src0, src1, srcStride0, srcStride1
1095 mov r6d, 32
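    ; 64 rows, two per iteration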
1096 add r5, r5
1097 .loop:
1098 pmovzxbw m0, [r2] ; first 16 of row 0 of src0
1099 pmovzxbw m1, [r2 + 16] ; second 16 of row 0 of src0
1100 pmovzxbw m2, [r2 + 32] ; third 16 of row 0 of src0
1101 pmovzxbw m3, [r2 + 48] ; fourth 16 of row 0 of src0
1102 movu m4, [r3] ; first 16 of row 0 of src1
1103 movu m5, [r3 + 32] ; second 16 of row 0 of src1
1104 movu m6, [r3 + 64] ; third 16 of row 0 of src1
1105 movu m7, [r3 + 96] ; fourth 16 of row 0 of src1
1106
1107 paddw m0, m4
1108 paddw m1, m5
1109 paddw m2, m6
1110 paddw m3, m7
1111 packuswb m0, m1
1112 packuswb m2, m3
1113 vpermq m0, m0, 11011000b
1114 movu [r0], m0 ; first 32 of row 0 of dst
1115 vpermq m2, m2, 11011000b
1116 movu [r0 + 32], m2 ; second 32 of row 0 of dst
1117
1118 pmovzxbw m0, [r2 + r4] ; first 16 of row 1 of src0
1119 pmovzxbw m1, [r2 + r4 + 16] ; second 16 of row 1 of src0
1120 pmovzxbw m2, [r2 + r4 + 32] ; third 16 of row 1 of src0
1121 pmovzxbw m3, [r2 + r4 + 48] ; fourth 16 of row 1 of src0
1122 movu m4, [r3 + r5] ; first 16 of row 1 of src1
1123 movu m5, [r3 + r5 + 32] ; second 16 of row 1 of src1
1124 movu m6, [r3 + r5 + 64] ; third 16 of row 1 of src1
1125 movu m7, [r3 + r5 + 96] ; fourth 16 of row 1 of src1
1126
1127 paddw m0, m4
1128 paddw m1, m5
1129 paddw m2, m6
1130 paddw m3, m7
1131 packuswb m0, m1
1132 packuswb m2, m3
1133 vpermq m0, m0, 11011000b
1134 movu [r0 + r1], m0 ; first 32 of row 1 of dst
1135 vpermq m2, m2, 11011000b
1136 movu [r0 + r1 + 32], m2 ; second 32 of row 1 of dst
1137
1138 lea r2, [r2 + r4 * 2]
1139 lea r3, [r3 + r5 * 2]
1140 lea r0, [r0 + r1 * 2]
1141
1142 dec r6d
1143 jnz .loop
1144 RET
1145
1146 %endif