comparison x265/source/common/x86/pixel-a.asm @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
1 ;*****************************************************************************
2 ;* pixel.asm: x86 pixel metrics
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2013 x264 project
5 ;*
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* Fiona Glaser <fiona@x264.com>
11 ;* Oskar Arvidsson <oskar@irock.se>
12 ;* Min Chen <chenm003@163.com>
13 ;*
14 ;* This program is free software; you can redistribute it and/or modify
15 ;* it under the terms of the GNU General Public License as published by
16 ;* the Free Software Foundation; either version 2 of the License, or
17 ;* (at your option) any later version.
18 ;*
19 ;* This program is distributed in the hope that it will be useful,
20 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;* GNU General Public License for more details.
23 ;*
24 ;* You should have received a copy of the GNU General Public License
25 ;* along with this program; if not, write to the Free Software
26 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 ;*
28 ;* This program is also available under a commercial proprietary license.
29 ;* For more information, contact us at license @ x265.com.
30 ;*****************************************************************************
31
32 %include "x86inc.asm"
33 %include "x86util.asm"
34
35 SECTION_RODATA 32
36 hmul_8p: times 8 db 1
37 times 4 db 1, -1
38 times 8 db 1
39 times 4 db 1, -1
40 hmul_4p: times 4 db 1, 1, 1, 1, 1, -1, 1, -1
41 mask_10: times 4 dw 0, -1
42 mask_1100: times 2 dd 0, -1
43 hmul_8w: times 4 dw 1
44 times 2 dw 1, -1
45 times 4 dw 1
46 times 2 dw 1, -1
47
48 ALIGN 32
49 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
50 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
51
52 sw_f0: dq 0xfff0, 0
53 pd_f0: times 4 dd 0xffff0000
54
55 SECTION .text
56
57 cextern pb_0
58 cextern pb_1
59 cextern pw_1
60 cextern pw_8
61 cextern pw_16
62 cextern pw_32
63 cextern pw_00ff
64 cextern pw_ppppmmmm
65 cextern pw_ppmmppmm
66 cextern pw_pmpmpmpm
67 cextern pw_pmmpzzzz
68 cextern pd_1
69 cextern popcnt_table
70 cextern pd_2
71 cextern hmul_16p
72 cextern pb_movemask
73 cextern pb_movemask_32
74 cextern pw_pixel_max
75
76 ;=============================================================================
77 ; SATD
78 ;=============================================================================
79
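; SATD = sum of absolute transformed differences: the residual of each 4x4
; sub-block is pushed through an unnormalized 2-D Hadamard transform and the
; absolute values of the coefficients are summed (and halved, matching the
; convention of the C satd primitives). As a rough reference model only --
; not compiled here, and satd_4x4_ref is an illustrative name -- the code
; below computes:
;
;   /* needs <stdint.h>, <stddef.h>, <stdlib.h> */
;   static int satd_4x4_ref(const uint8_t *pix1, intptr_t s1,
;                           const uint8_t *pix2, intptr_t s2)
;   {
;       int d[4][4], t[4][4], sum = 0;
;       for (int y = 0; y < 4; y++)               /* residual */
;           for (int x = 0; x < 4; x++)
;               d[y][x] = pix1[y*s1 + x] - pix2[y*s2 + x];
;       for (int y = 0; y < 4; y++) {             /* horizontal butterflies */
;           int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
;           int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
;           t[y][0] = a0 + a2;  t[y][2] = a0 - a2;
;           t[y][1] = a1 + a3;  t[y][3] = a1 - a3;
;       }
;       for (int x = 0; x < 4; x++) {             /* vertical + abs + sum */
;           int a0 = t[0][x] + t[1][x], a1 = t[0][x] - t[1][x];
;           int a2 = t[2][x] + t[3][x], a3 = t[2][x] - t[3][x];
;           sum += abs(a0 + a2) + abs(a0 - a2) + abs(a1 + a3) + abs(a1 - a3);
;       }
;       return sum >> 1;
;   }
;
; Larger block sizes are built by summing 4x4 / 8x4 / 16x4 partial SATDs.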
80 %macro JDUP 2
81 %if cpuflag(sse4)
82 ; just use shufps on anything post conroe
83 shufps %1, %2, 0
84 %elif cpuflag(ssse3) && notcpuflag(atom)
85 ; join 2x 32 bit and duplicate them
86 ; emulating shufps is faster on conroe
87 punpcklqdq %1, %2
88 movsldup %1, %1
89 %else
90 ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
91 punpckldq %1, %2
92 %endif
93 %endmacro
94
95 %macro HSUMSUB 5
96 pmaddubsw m%2, m%5
97 pmaddubsw m%1, m%5
98 pmaddubsw m%4, m%5
99 pmaddubsw m%3, m%5
100 %endmacro
101
102 %macro DIFF_UNPACK_SSE2 5
103 punpcklbw m%1, m%5
104 punpcklbw m%2, m%5
105 punpcklbw m%3, m%5
106 punpcklbw m%4, m%5
107 psubw m%1, m%2
108 psubw m%3, m%4
109 %endmacro
110
111 %macro DIFF_SUMSUB_SSSE3 5
112 HSUMSUB %1, %2, %3, %4, %5
113 psubw m%1, m%2
114 psubw m%3, m%4
115 %endmacro
116
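; The hmul_* tables above are +1/-1 byte patterns for pmaddubsw. With a row
; duplicated into both halves of a register (movddup/punpcklqdq in the
; loaders below), one pmaddubsw against hmul_8p produces the pairwise sums
; of adjacent pixels in the low half and the pairwise differences in the
; high half, folding the first horizontal butterfly into the load; the
; psubw in DIFF_SUMSUB_SSSE3 then subtracts the reference block, which is
; valid because the transform is linear. Rough scalar model of one such
; pmaddubsw (hmul8_butterfly is an illustrative name, not code in this
; tree):
;
;   static void hmul8_butterfly(int16_t out[8], const uint8_t row[8])
;   {
;       for (int i = 0; i < 4; i++) {
;           out[i]     = (int16_t)(row[2*i] + row[2*i + 1]); /* low: sums  */
;           out[i + 4] = (int16_t)(row[2*i] - row[2*i + 1]); /* high: diffs */
;       }
;   }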
117 %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
118 movd %1, %3
119 movd %2, %4
120 JDUP %1, %2
121 %endmacro
122
123 %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
124 movddup m%3, %6
125 movddup m%4, %8
126 movddup m%1, %5
127 movddup m%2, %7
128 %endmacro
129
130 %macro LOAD_DUP_4x8P_PENRYN 8
131 ; penryn and nehalem run punpcklqdq and movddup in different units
132 movh m%3, %6
133 movh m%4, %8
134 punpcklqdq m%3, m%3
135 movddup m%1, %5
136 punpcklqdq m%4, m%4
137 movddup m%2, %7
138 %endmacro
139
140 %macro LOAD_SUMSUB_8x2P 9
141 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
142 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
143 %endmacro
144
145 %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
146 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
147 LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
148 LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
149 %if %10
150 lea %8, [%8+4*r1]
151 lea %9, [%9+4*r3]
152 %endif
153 %endmacro
154
155 %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
156 movddup m%1, [%7]
157 movddup m%2, [%7+8]
158 mova m%4, [%6]
159 movddup m%3, m%4
160 punpckhqdq m%4, m%4
161 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
162 %endmacro
163
164 %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
165 movu m%4, [%7]
166 mova m%2, [%6]
167 DEINTB %1, %2, %3, %4, %5
168 psubw m%1, m%3
169 psubw m%2, m%4
170 SUMSUB_BA w, %1, %2, %3
171 %endmacro
172
173 %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
174 ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
175 LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
176 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
177 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
178 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
179 %endmacro
180
181 %macro LOAD_SUMSUB_16x2P_AVX2 9
182 ; 2*dst, 2*tmp, mul, 4*ptr
183 vbroadcasti128 m%1, [%6]
184 vbroadcasti128 m%3, [%7]
185 vbroadcasti128 m%2, [%8]
186 vbroadcasti128 m%4, [%9]
187 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
188 %endmacro
189
190 %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
191 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
192 LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
193 LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
194 %if %10
195 lea %8, [%8+4*r1]
196 lea %9, [%9+4*r3]
197 %endif
198 %endmacro
199
200 %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
201 mova xm%3, %6
202 mova xm%4, %8
203 mova xm%1, %5
204 mova xm%2, %7
205 vpermq m%3, m%3, q0011
206 vpermq m%4, m%4, q0011
207 vpermq m%1, m%1, q0011
208 vpermq m%2, m%2, q0011
209 %endmacro
210
211 %macro LOAD_SUMSUB8_16x2P_AVX2 9
212 ; 2*dst, 2*tmp, mul, 4*ptr
213 LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
214 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
215 %endmacro
216
217 %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
218 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
219 LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
220 LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
221 %if %10
222 lea %8, [%8+4*r1]
223 lea %9, [%9+4*r3]
224 %endif
225 %endmacro
226
227 ; in: r4=3*stride1, r5=3*stride2
228 ; in: %2 = horizontal offset
229 ; in: %3 = whether we need to increment pix1 and pix2
230 ; clobber: m3..m7
231 ; out: %1 = satd
232 %macro SATD_4x4_MMX 3
233 %xdefine %%n n%1
234 %assign offset %2*SIZEOF_PIXEL
235 LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
236 LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
237 LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
238 LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
239 %if %3
240 lea r0, [r0+4*r1]
241 lea r2, [r2+4*r3]
242 %endif
243 HADAMARD4_2D 4, 5, 6, 7, 3, %%n
244 paddw m4, m6
245 ;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
246 ; pxor m5, m5
247 ; punpcklwd m6, m4, m5
248 ; punpckhwd m4, m5
249 ; paddd m4, m6
250 ;%endif
251 SWAP %%n, 4
252 %endmacro
253
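; HADAMARD4_2D (from x86util.asm) applies the separable transform used
; above: one 4-point unnormalized butterfly per row, then one per column.
; Sketch of the 1-D step (hadamard4 is an illustrative name only):
;
;   static void hadamard4(int d[4])
;   {
;       int s01 = d[0] + d[1], d01 = d[0] - d[1];
;       int s23 = d[2] + d[3], d23 = d[2] - d[3];
;       d[0] = s01 + s23;  d[2] = s01 - s23;
;       d[1] = d01 + d23;  d[3] = d01 - d23;
;   }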
254 ; in: %1 = horizontal if 0, vertical if 1
255 %macro SATD_8x4_SSE 8-9
256 %if %1
257 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
258 %else
259 HADAMARD4_V %2, %3, %4, %5, %6
260 ; doing the abs first is a slight advantage
261 ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
262 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
263 HADAMARD 1, max, %2, %4, %6, %7
264 %endif
265 %ifnidn %9, swap
266 %if (BIT_DEPTH == 12)
267 pxor m%6, m%6
268 punpcklwd m%7, m%2, m%6
269 punpckhwd m%2, m%6
270 paddd m%8, m%7
271 paddd m%8, m%2
272 %else
273 paddw m%8, m%2
274 %endif
275 %else
276 SWAP %8, %2
277 %if (BIT_DEPTH == 12)
278 pxor m%6, m%6
279 punpcklwd m%7, m%8, m%6
280 punpckhwd m%8, m%6
281 paddd m%8, m%7
282 %endif
283 %endif
284 %if %1
285 %if (BIT_DEPTH == 12)
286 pxor m%6, m%6
287 punpcklwd m%7, m%4, m%6
288 punpckhwd m%4, m%6
289 paddd m%8, m%7
290 paddd m%8, m%4
291 %else
292 paddw m%8, m%4
293 %endif
294 %else
295 HADAMARD 1, max, %3, %5, %6, %7
296 %if (BIT_DEPTH == 12)
297 pxor m%6, m%6
298 punpcklwd m%7, m%3, m%6
299 punpckhwd m%3, m%6
300 paddd m%8, m%7
301 paddd m%8, m%3
302 %else
303 paddw m%8, m%3
304 %endif
305 %endif
306 %endmacro
307
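; Notes on SATD_8x4_SSE: the ABSW2 + "HADAMARD 1, max" sequence relies on
; |a+b| + |a-b| == 2*max(|a|,|b|), so max-combining absolute values stands
; in for the last butterfly stage and folds in the final /2 of the SATD
; definition. For BIT_DEPTH == 12 the running total no longer fits in
; 16-bit lanes, so partial sums are zero-extended (punpcklwd/punpckhwd) and
; added with paddd; SATD_8x4_1_SSE below always takes that widened path.
; Rough scalar model of the widening accumulate (accum_widen is an
; illustrative name):
;
;   static void accum_widen(uint32_t acc[4], const uint16_t v[8])
;   {
;       for (int i = 0; i < 8; i++)
;           acc[i & 3] += v[i];    /* lanes 0..3 and 4..7 fold together */
;   }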
308 %macro SATD_8x4_1_SSE 10
309 %if %1
310 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
311 %else
312 HADAMARD4_V %2, %3, %4, %5, %6
313 ; doing the abs first is a slight advantage
314 ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
315 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
316 HADAMARD 1, max, %2, %4, %6, %7
317 %endif
318
319 pxor m%10, m%10
320 punpcklwd m%9, m%2, m%10
321 paddd m%8, m%9
322 punpckhwd m%9, m%2, m%10
323 paddd m%8, m%9
324
325 %if %1
326 pxor m%10, m%10
327 punpcklwd m%9, m%4, m%10
328 paddd m%8, m%9
329 punpckhwd m%9, m%4, m%10
330 paddd m%8, m%9
331 %else
332 HADAMARD 1, max, %3, %5, %6, %7
333 pxor m%10, m%10
334 punpcklwd m%9, m%3, m%10
335 paddd m%8, m%9
336 punpckhwd m%9, m%3, m%10
337 paddd m%8, m%9
338 %endif
339 %endmacro
340
341 %macro SATD_START_MMX 0
342 FIX_STRIDES r1, r3
343 lea r4, [3*r1] ; 3*stride1
344 lea r5, [3*r3] ; 3*stride2
345 %endmacro
346
347 %macro SATD_END_MMX 0
348 %if HIGH_BIT_DEPTH
349 HADDUW m0, m1
350 movd eax, m0
351 %else ; !HIGH_BIT_DEPTH
352 pshufw m1, m0, q1032
353 paddw m0, m1
354 pshufw m1, m0, q2301
355 paddw m0, m1
356 movd eax, m0
357 and eax, 0xffff
358 %endif ; HIGH_BIT_DEPTH
359 EMMS
360 RET
361 %endmacro
362
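; SATD_END_MMX reduces the four 16-bit partial sums in m0 to a scalar with
; two shuffle+add steps (8-bit depth path). Roughly, with hsum4_u16 as an
; illustrative helper name:
;
;   static unsigned hsum4_u16(const uint16_t v[4])
;   {
;       unsigned a = v[0] + v[2];     /* pshufw q1032 + paddw */
;       unsigned b = v[1] + v[3];
;       return (a + b) & 0xffff;      /* pshufw q2301 + paddw, movd, and */
;   }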
363 ; FIXME avoid the spilling of regs to hold 3*stride.
364 ; for small blocks on x86_32, modify pixel pointer instead.
365
366 ;-----------------------------------------------------------------------------
367 ; int pixel_satd_4x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
368 ;-----------------------------------------------------------------------------
369 INIT_MMX mmx2
370 cglobal pixel_satd_4x4, 4,6
371 SATD_START_MMX
372 SATD_4x4_MMX m0, 0, 0
373 SATD_END_MMX
374
375 %macro SATD_START_SSE2 2-3 0
376 FIX_STRIDES r1, r3
377 %if HIGH_BIT_DEPTH && %3
378 pxor %2, %2
379 %elif cpuflag(ssse3) && notcpuflag(atom)
380 %if mmsize==32
381 mova %2, [hmul_16p]
382 %else
383 mova %2, [hmul_8p]
384 %endif
385 %endif
386 lea r4, [3*r1]
387 lea r5, [3*r3]
388 pxor %1, %1
389 %endmacro
390
391 %macro SATD_END_SSE2 1-2
392 %if HIGH_BIT_DEPTH
393 %if BIT_DEPTH == 12
394 HADDD %1, xm0
395 %else ; BIT_DEPTH != 12
396 HADDUW %1, xm0
397 %endif ; BIT_DEPTH == 12
398 %if %0 == 2
399 paddd %1, %2
400 %endif
401 %else
402 HADDW %1, xm7
403 %endif
404 movd eax, %1
405 RET
406 %endmacro
407
408 %macro SATD_ACCUM 3
409 %if HIGH_BIT_DEPTH
410 HADDUW %1, %2
411 paddd %3, %1
412 pxor %1, %1
413 %endif
414 %endmacro
415
416 %macro BACKUP_POINTERS 0
417 %if ARCH_X86_64
418 %if WIN64
419 PUSH r7
420 %endif
421 mov r6, r0
422 mov r7, r2
423 %endif
424 %endmacro
425
426 %macro RESTORE_AND_INC_POINTERS 0
427 %if ARCH_X86_64
428 lea r0, [r6+8*SIZEOF_PIXEL]
429 lea r2, [r7+8*SIZEOF_PIXEL]
430 %if WIN64
431 POP r7
432 %endif
433 %else
434 mov r0, r0mp
435 mov r2, r2mp
436 add r0, 8*SIZEOF_PIXEL
437 add r2, 8*SIZEOF_PIXEL
438 %endif
439 %endmacro
440
441 %macro SATD_4x8_SSE 3-4
442 %if HIGH_BIT_DEPTH
443 movh m0, [r0+0*r1]
444 movh m4, [r2+0*r3]
445 movh m1, [r0+1*r1]
446 movh m5, [r2+1*r3]
447 movhps m0, [r0+4*r1]
448 movhps m4, [r2+4*r3]
449 movh m2, [r0+2*r1]
450 movh m6, [r2+2*r3]
451 psubw m0, m4
452 movh m3, [r0+r4]
453 movh m4, [r2+r5]
454 lea r0, [r0+4*r1]
455 lea r2, [r2+4*r3]
456 movhps m1, [r0+1*r1]
457 movhps m5, [r2+1*r3]
458 movhps m2, [r0+2*r1]
459 movhps m6, [r2+2*r3]
460 psubw m1, m5
461 movhps m3, [r0+r4]
462 movhps m4, [r2+r5]
463 psubw m2, m6
464 psubw m3, m4
465 %else ; !HIGH_BIT_DEPTH
466 movd m4, [r2]
467 movd m5, [r2+r3]
468 movd m6, [r2+2*r3]
469 add r2, r5
470 movd m0, [r0]
471 movd m1, [r0+r1]
472 movd m2, [r0+2*r1]
473 add r0, r4
474 movd m3, [r2+r3]
475 JDUP m4, m3
476 movd m3, [r0+r1]
477 JDUP m0, m3
478 movd m3, [r2+2*r3]
479 JDUP m5, m3
480 movd m3, [r0+2*r1]
481 JDUP m1, m3
482 %if %1==0 && %2==1
483 mova m3, [hmul_4p]
484 DIFFOP 0, 4, 1, 5, 3
485 %else
486 DIFFOP 0, 4, 1, 5, 7
487 %endif
488 movd m5, [r2]
489 add r2, r5
490 movd m3, [r0]
491 add r0, r4
492 movd m4, [r2]
493 JDUP m6, m4
494 movd m4, [r0]
495 JDUP m2, m4
496 movd m4, [r2+r3]
497 JDUP m5, m4
498 movd m4, [r0+r1]
499 JDUP m3, m4
500 %if %1==0 && %2==1
501 mova m4, [hmul_4p]
502 DIFFOP 2, 6, 3, 5, 4
503 %else
504 DIFFOP 2, 6, 3, 5, 7
505 %endif
506 %endif ; HIGH_BIT_DEPTH
507 %if %0 == 4
508 SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
509 %else
510 SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
511 %endif
512 %endmacro
513
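; SATD_4x8_SSE handles 4-wide blocks by packing two 4-pixel rows into one
; 8-lane register (movh fills the low half, movhps the high half in the
; HIGH_BIT_DEPTH branch), so the 8x4 kernel above does the work. Rough
; scalar model of that packing (pack_rows4 and its parameters are
; illustrative only):
;
;   static void pack_rows4(int16_t dst[8],
;                          const uint16_t *p1a, const uint16_t *p1b,
;                          const uint16_t *p2a, const uint16_t *p2b)
;   {
;       for (int i = 0; i < 4; i++) {
;           dst[i]     = (int16_t)(p1a[i] - p2a[i]);   /* low half  */
;           dst[i + 4] = (int16_t)(p1b[i] - p2b[i]);   /* high half */
;       }
;   }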
514 ;-----------------------------------------------------------------------------
515 ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
516 ;-----------------------------------------------------------------------------
517 %macro SATDS_SSE2 0
518 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
519
520 %if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
521 cglobal pixel_satd_4x4, 4, 6, 6
522 SATD_START_MMX
523 mova m4, [hmul_4p]
524 LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
525 LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
526 LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
527 LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
528 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
529 HADAMARD 0, sumsub, 0, 1, 2, 3
530 HADAMARD 4, sumsub, 0, 1, 2, 3
531 HADAMARD 1, amax, 0, 1, 2, 3
532 HADDW m0, m1
533 movd eax, m0
534 RET
535 %endif
536
537 cglobal pixel_satd_4x8, 4, 6, 8
538 SATD_START_MMX
539 %if vertical==0
540 mova m7, [hmul_4p]
541 %endif
542 SATD_4x8_SSE vertical, 0, swap
543 %if BIT_DEPTH == 12
544 HADDD m7, m1
545 %else
546 HADDUW m7, m1
547 %endif
548 movd eax, m7
549 RET
550
551 cglobal pixel_satd_4x16, 4, 6, 8
552 SATD_START_MMX
553 %if vertical==0
554 mova m7, [hmul_4p]
555 %endif
556 SATD_4x8_SSE vertical, 0, swap
557 lea r0, [r0+r1*2*SIZEOF_PIXEL]
558 lea r2, [r2+r3*2*SIZEOF_PIXEL]
559 SATD_4x8_SSE vertical, 1, add
560 %if BIT_DEPTH == 12
561 HADDD m7, m1
562 %else
563 HADDUW m7, m1
564 %endif
565 movd eax, m7
566 RET
567
568 cglobal pixel_satd_8x8_internal
569 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
570 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
571 %%pixel_satd_8x4_internal:
572 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
573 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
574 ret
575
576 cglobal pixel_satd_8x8_internal2
577 %if WIN64
578 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
579 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
580 %%pixel_satd_8x4_internal2:
581 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
582 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
583 %else
584 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
585 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
586 %%pixel_satd_8x4_internal2:
587 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
588 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
589 %endif
590 ret
591
592 ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
593 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
594 %if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
595
596 cglobal pixel_satd_16x4_internal2
597 LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
598 lea r2, [r2+4*r3]
599 lea r0, [r0+4*r1]
600 SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
601 SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
602 ret
603
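; The larger block sizes below are tiled from this 16x4 kernel (and from
; pixel_satd_8x8_internal2 above): walk down one 16-wide column four rows
; per call, then rewind to r6/r7 and step 16 pixels right. Rough model of
; that composition (satd_tiled and its parameters are illustrative, not an
; API in this tree):
;
;   static int satd_tiled(const uint8_t *p1, intptr_t s1,
;                         const uint8_t *p2, intptr_t s2, int w, int h,
;                         int (*kern16x4)(const uint8_t*, intptr_t,
;                                         const uint8_t*, intptr_t))
;   {
;       int sum = 0;
;       for (int x = 0; x < w; x += 16)       /* lea r0,[r6+16], ...    */
;           for (int y = 0; y < h; y += 4)    /* repeated internal calls */
;               sum += kern16x4(p1 + y*s1 + x, s1, p2 + y*s2 + x, s2);
;       return sum;
;   }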
604 cglobal pixel_satd_16x4, 4,6,14
605 SATD_START_SSE2 m10, m7
606 %if vertical
607 mova m7, [pw_00ff]
608 %endif
609 call pixel_satd_16x4_internal2
610 HADDD m10, m0
611 movd eax, m10
612 RET
613
614 cglobal pixel_satd_16x8, 4,6,14
615 SATD_START_SSE2 m10, m7
616 %if vertical
617 mova m7, [pw_00ff]
618 %endif
619 jmp %%pixel_satd_16x8_internal
620
621 cglobal pixel_satd_16x12, 4,6,14
622 SATD_START_SSE2 m10, m7
623 %if vertical
624 mova m7, [pw_00ff]
625 %endif
626 call pixel_satd_16x4_internal2
627 jmp %%pixel_satd_16x8_internal
628
629 cglobal pixel_satd_16x32, 4,6,14
630 SATD_START_SSE2 m10, m7
631 %if vertical
632 mova m7, [pw_00ff]
633 %endif
634 call pixel_satd_16x4_internal2
635 call pixel_satd_16x4_internal2
636 call pixel_satd_16x4_internal2
637 call pixel_satd_16x4_internal2
638 call pixel_satd_16x4_internal2
639 call pixel_satd_16x4_internal2
640 jmp %%pixel_satd_16x8_internal
641
642 cglobal pixel_satd_16x64, 4,6,14
643 SATD_START_SSE2 m10, m7
644 %if vertical
645 mova m7, [pw_00ff]
646 %endif
647 call pixel_satd_16x4_internal2
648 call pixel_satd_16x4_internal2
649 call pixel_satd_16x4_internal2
650 call pixel_satd_16x4_internal2
651 call pixel_satd_16x4_internal2
652 call pixel_satd_16x4_internal2
653 call pixel_satd_16x4_internal2
654 call pixel_satd_16x4_internal2
655 call pixel_satd_16x4_internal2
656 call pixel_satd_16x4_internal2
657 call pixel_satd_16x4_internal2
658 call pixel_satd_16x4_internal2
659 call pixel_satd_16x4_internal2
660 call pixel_satd_16x4_internal2
661 jmp %%pixel_satd_16x8_internal
662
663 cglobal pixel_satd_16x16, 4,6,14
664 SATD_START_SSE2 m10, m7
665 %if vertical
666 mova m7, [pw_00ff]
667 %endif
668 call pixel_satd_16x4_internal2
669 call pixel_satd_16x4_internal2
670 %%pixel_satd_16x8_internal:
671 call pixel_satd_16x4_internal2
672 call pixel_satd_16x4_internal2
673 HADDD m10, m0
674 movd eax, m10
675 RET
676
677 cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
678 SATD_START_SSE2 m10, m7
679 mov r6, r0
680 mov r7, r2
681 %if vertical
682 mova m7, [pw_00ff]
683 %endif
684 call pixel_satd_16x4_internal2
685 call pixel_satd_16x4_internal2
686 lea r0, [r6 + 16]
687 lea r2, [r7 + 16]
688 call pixel_satd_16x4_internal2
689 call pixel_satd_16x4_internal2
690 HADDD m10, m0
691 movd eax, m10
692 RET
693
694 cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
695 SATD_START_SSE2 m10, m7
696 mov r6, r0
697 mov r7, r2
698 %if vertical
699 mova m7, [pw_00ff]
700 %endif
701 call pixel_satd_16x4_internal2
702 call pixel_satd_16x4_internal2
703 call pixel_satd_16x4_internal2
704 call pixel_satd_16x4_internal2
705 lea r0, [r6 + 16]
706 lea r2, [r7 + 16]
707 call pixel_satd_16x4_internal2
708 call pixel_satd_16x4_internal2
709 call pixel_satd_16x4_internal2
710 call pixel_satd_16x4_internal2
711 HADDD m10, m0
712 movd eax, m10
713 RET
714
715 cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx)
716 SATD_START_SSE2 m10, m7
717 mov r6, r0
718 mov r7, r2
719 %if vertical
720 mova m7, [pw_00ff]
721 %endif
722 call pixel_satd_16x4_internal2
723 call pixel_satd_16x4_internal2
724 call pixel_satd_16x4_internal2
725 call pixel_satd_16x4_internal2
726 call pixel_satd_16x4_internal2
727 call pixel_satd_16x4_internal2
728 lea r0, [r6 + 16]
729 lea r2, [r7 + 16]
730 call pixel_satd_16x4_internal2
731 call pixel_satd_16x4_internal2
732 call pixel_satd_16x4_internal2
733 call pixel_satd_16x4_internal2
734 call pixel_satd_16x4_internal2
735 call pixel_satd_16x4_internal2
736 HADDD m10, m0
737 movd eax, m10
738 RET
739
740 cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
741 SATD_START_SSE2 m10, m7
742 mov r6, r0
743 mov r7, r2
744 %if vertical
745 mova m7, [pw_00ff]
746 %endif
747 call pixel_satd_16x4_internal2
748 call pixel_satd_16x4_internal2
749 call pixel_satd_16x4_internal2
750 call pixel_satd_16x4_internal2
751 call pixel_satd_16x4_internal2
752 call pixel_satd_16x4_internal2
753 call pixel_satd_16x4_internal2
754 call pixel_satd_16x4_internal2
755 lea r0, [r6 + 16]
756 lea r2, [r7 + 16]
757 call pixel_satd_16x4_internal2
758 call pixel_satd_16x4_internal2
759 call pixel_satd_16x4_internal2
760 call pixel_satd_16x4_internal2
761 call pixel_satd_16x4_internal2
762 call pixel_satd_16x4_internal2
763 call pixel_satd_16x4_internal2
764 call pixel_satd_16x4_internal2
765 HADDD m10, m0
766 movd eax, m10
767 RET
768
769 cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
770 SATD_START_SSE2 m10, m7
771 mov r6, r0
772 mov r7, r2
773 %if vertical
774 mova m7, [pw_00ff]
775 %endif
776 call pixel_satd_16x4_internal2
777 call pixel_satd_16x4_internal2
778 call pixel_satd_16x4_internal2
779 call pixel_satd_16x4_internal2
780 call pixel_satd_16x4_internal2
781 call pixel_satd_16x4_internal2
782 call pixel_satd_16x4_internal2
783 call pixel_satd_16x4_internal2
784 call pixel_satd_16x4_internal2
785 call pixel_satd_16x4_internal2
786 call pixel_satd_16x4_internal2
787 call pixel_satd_16x4_internal2
788 call pixel_satd_16x4_internal2
789 call pixel_satd_16x4_internal2
790 call pixel_satd_16x4_internal2
791 call pixel_satd_16x4_internal2
792 lea r0, [r6 + 16]
793 lea r2, [r7 + 16]
794 call pixel_satd_16x4_internal2
795 call pixel_satd_16x4_internal2
796 call pixel_satd_16x4_internal2
797 call pixel_satd_16x4_internal2
798 call pixel_satd_16x4_internal2
799 call pixel_satd_16x4_internal2
800 call pixel_satd_16x4_internal2
801 call pixel_satd_16x4_internal2
802 call pixel_satd_16x4_internal2
803 call pixel_satd_16x4_internal2
804 call pixel_satd_16x4_internal2
805 call pixel_satd_16x4_internal2
806 call pixel_satd_16x4_internal2
807 call pixel_satd_16x4_internal2
808 call pixel_satd_16x4_internal2
809 call pixel_satd_16x4_internal2
810 HADDD m10, m0
811 movd eax, m10
812 RET
813
814 cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
815 SATD_START_SSE2 m10, m7
816 mov r6, r0
817 mov r7, r2
818 %if vertical
819 mova m7, [pw_00ff]
820 %endif
821 call pixel_satd_16x4_internal2
822 call pixel_satd_16x4_internal2
823 call pixel_satd_16x4_internal2
824 call pixel_satd_16x4_internal2
825 call pixel_satd_16x4_internal2
826 call pixel_satd_16x4_internal2
827 call pixel_satd_16x4_internal2
828 call pixel_satd_16x4_internal2
829 call pixel_satd_16x4_internal2
830 call pixel_satd_16x4_internal2
831 call pixel_satd_16x4_internal2
832 call pixel_satd_16x4_internal2
833 call pixel_satd_16x4_internal2
834 call pixel_satd_16x4_internal2
835 call pixel_satd_16x4_internal2
836 call pixel_satd_16x4_internal2
837 lea r0, [r6 + 16]
838 lea r2, [r7 + 16]
839 call pixel_satd_16x4_internal2
840 call pixel_satd_16x4_internal2
841 call pixel_satd_16x4_internal2
842 call pixel_satd_16x4_internal2
843 call pixel_satd_16x4_internal2
844 call pixel_satd_16x4_internal2
845 call pixel_satd_16x4_internal2
846 call pixel_satd_16x4_internal2
847 call pixel_satd_16x4_internal2
848 call pixel_satd_16x4_internal2
849 call pixel_satd_16x4_internal2
850 call pixel_satd_16x4_internal2
851 call pixel_satd_16x4_internal2
852 call pixel_satd_16x4_internal2
853 call pixel_satd_16x4_internal2
854 call pixel_satd_16x4_internal2
855 lea r0, [r6 + 32]
856 lea r2, [r7 + 32]
857 call pixel_satd_16x4_internal2
858 call pixel_satd_16x4_internal2
859 call pixel_satd_16x4_internal2
860 call pixel_satd_16x4_internal2
861 call pixel_satd_16x4_internal2
862 call pixel_satd_16x4_internal2
863 call pixel_satd_16x4_internal2
864 call pixel_satd_16x4_internal2
865 call pixel_satd_16x4_internal2
866 call pixel_satd_16x4_internal2
867 call pixel_satd_16x4_internal2
868 call pixel_satd_16x4_internal2
869 call pixel_satd_16x4_internal2
870 call pixel_satd_16x4_internal2
871 call pixel_satd_16x4_internal2
872 call pixel_satd_16x4_internal2
873 HADDD m10, m0
874 movd eax, m10
875 RET
876
877 cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
878 SATD_START_SSE2 m10, m7
879 mov r6, r0
880 mov r7, r2
881 %if vertical
882 mova m7, [pw_00ff]
883 %endif
884 call pixel_satd_16x4_internal2
885 call pixel_satd_16x4_internal2
886 call pixel_satd_16x4_internal2
887 call pixel_satd_16x4_internal2
888 lea r0, [r6 + 16]
889 lea r2, [r7 + 16]
890 call pixel_satd_16x4_internal2
891 call pixel_satd_16x4_internal2
892 call pixel_satd_16x4_internal2
893 call pixel_satd_16x4_internal2
894 lea r0, [r6 + 32]
895 lea r2, [r7 + 32]
896 call pixel_satd_16x4_internal2
897 call pixel_satd_16x4_internal2
898 call pixel_satd_16x4_internal2
899 call pixel_satd_16x4_internal2
900 lea r0, [r6 + 48]
901 lea r2, [r7 + 48]
902 call pixel_satd_16x4_internal2
903 call pixel_satd_16x4_internal2
904 call pixel_satd_16x4_internal2
905 call pixel_satd_16x4_internal2
906 HADDD m10, m0
907 movd eax, m10
908 RET
909
910 cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
911 SATD_START_SSE2 m10, m7
912 mov r6, r0
913 mov r7, r2
914 %if vertical
915 mova m7, [pw_00ff]
916 %endif
917 call pixel_satd_16x4_internal2
918 call pixel_satd_16x4_internal2
919 call pixel_satd_16x4_internal2
920 call pixel_satd_16x4_internal2
921 call pixel_satd_16x4_internal2
922 call pixel_satd_16x4_internal2
923 call pixel_satd_16x4_internal2
924 call pixel_satd_16x4_internal2
925 lea r0, [r6 + 16]
926 lea r2, [r7 + 16]
927 call pixel_satd_16x4_internal2
928 call pixel_satd_16x4_internal2
929 call pixel_satd_16x4_internal2
930 call pixel_satd_16x4_internal2
931 call pixel_satd_16x4_internal2
932 call pixel_satd_16x4_internal2
933 call pixel_satd_16x4_internal2
934 call pixel_satd_16x4_internal2
935 lea r0, [r6 + 32]
936 lea r2, [r7 + 32]
937 call pixel_satd_16x4_internal2
938 call pixel_satd_16x4_internal2
939 call pixel_satd_16x4_internal2
940 call pixel_satd_16x4_internal2
941 call pixel_satd_16x4_internal2
942 call pixel_satd_16x4_internal2
943 call pixel_satd_16x4_internal2
944 call pixel_satd_16x4_internal2
945 lea r0, [r6 + 48]
946 lea r2, [r7 + 48]
947 call pixel_satd_16x4_internal2
948 call pixel_satd_16x4_internal2
949 call pixel_satd_16x4_internal2
950 call pixel_satd_16x4_internal2
951 call pixel_satd_16x4_internal2
952 call pixel_satd_16x4_internal2
953 call pixel_satd_16x4_internal2
954 call pixel_satd_16x4_internal2
955
956 HADDD m10, m0
957 movd eax, m10
958 RET
959
960 cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
961 SATD_START_SSE2 m10, m7
962 mov r6, r0
963 mov r7, r2
964 %if vertical
965 mova m7, [pw_00ff]
966 %endif
967 call pixel_satd_16x4_internal2
968 call pixel_satd_16x4_internal2
969 call pixel_satd_16x4_internal2
970 call pixel_satd_16x4_internal2
971 call pixel_satd_16x4_internal2
972 call pixel_satd_16x4_internal2
973 call pixel_satd_16x4_internal2
974 call pixel_satd_16x4_internal2
975 call pixel_satd_16x4_internal2
976 call pixel_satd_16x4_internal2
977 call pixel_satd_16x4_internal2
978 call pixel_satd_16x4_internal2
979 lea r0, [r6 + 16]
980 lea r2, [r7 + 16]
981 call pixel_satd_16x4_internal2
982 call pixel_satd_16x4_internal2
983 call pixel_satd_16x4_internal2
984 call pixel_satd_16x4_internal2
985 call pixel_satd_16x4_internal2
986 call pixel_satd_16x4_internal2
987 call pixel_satd_16x4_internal2
988 call pixel_satd_16x4_internal2
989 call pixel_satd_16x4_internal2
990 call pixel_satd_16x4_internal2
991 call pixel_satd_16x4_internal2
992 call pixel_satd_16x4_internal2
993 lea r0, [r6 + 32]
994 lea r2, [r7 + 32]
995 call pixel_satd_16x4_internal2
996 call pixel_satd_16x4_internal2
997 call pixel_satd_16x4_internal2
998 call pixel_satd_16x4_internal2
999 call pixel_satd_16x4_internal2
1000 call pixel_satd_16x4_internal2
1001 call pixel_satd_16x4_internal2
1002 call pixel_satd_16x4_internal2
1003 call pixel_satd_16x4_internal2
1004 call pixel_satd_16x4_internal2
1005 call pixel_satd_16x4_internal2
1006 call pixel_satd_16x4_internal2
1007 lea r0, [r6 + 48]
1008 lea r2, [r7 + 48]
1009 call pixel_satd_16x4_internal2
1010 call pixel_satd_16x4_internal2
1011 call pixel_satd_16x4_internal2
1012 call pixel_satd_16x4_internal2
1013 call pixel_satd_16x4_internal2
1014 call pixel_satd_16x4_internal2
1015 call pixel_satd_16x4_internal2
1016 call pixel_satd_16x4_internal2
1017 call pixel_satd_16x4_internal2
1018 call pixel_satd_16x4_internal2
1019 call pixel_satd_16x4_internal2
1020 call pixel_satd_16x4_internal2
1021
1022 HADDD m10, m0
1023 movd eax, m10
1024 RET
1025
1026 cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
1027 SATD_START_SSE2 m10, m7
1028 mov r6, r0
1029 mov r7, r2
1030 %if vertical
1031 mova m7, [pw_00ff]
1032 %endif
1033 call pixel_satd_16x4_internal2
1034 call pixel_satd_16x4_internal2
1035 call pixel_satd_16x4_internal2
1036 call pixel_satd_16x4_internal2
1037 call pixel_satd_16x4_internal2
1038 call pixel_satd_16x4_internal2
1039 call pixel_satd_16x4_internal2
1040 call pixel_satd_16x4_internal2
1041 call pixel_satd_16x4_internal2
1042 call pixel_satd_16x4_internal2
1043 call pixel_satd_16x4_internal2
1044 call pixel_satd_16x4_internal2
1045 call pixel_satd_16x4_internal2
1046 call pixel_satd_16x4_internal2
1047 call pixel_satd_16x4_internal2
1048 call pixel_satd_16x4_internal2
1049 lea r0, [r6 + 16]
1050 lea r2, [r7 + 16]
1051 call pixel_satd_16x4_internal2
1052 call pixel_satd_16x4_internal2
1053 call pixel_satd_16x4_internal2
1054 call pixel_satd_16x4_internal2
1055 call pixel_satd_16x4_internal2
1056 call pixel_satd_16x4_internal2
1057 call pixel_satd_16x4_internal2
1058 call pixel_satd_16x4_internal2
1059 call pixel_satd_16x4_internal2
1060 call pixel_satd_16x4_internal2
1061 call pixel_satd_16x4_internal2
1062 call pixel_satd_16x4_internal2
1063 call pixel_satd_16x4_internal2
1064 call pixel_satd_16x4_internal2
1065 call pixel_satd_16x4_internal2
1066 call pixel_satd_16x4_internal2
1067 lea r0, [r6 + 32]
1068 lea r2, [r7 + 32]
1069 call pixel_satd_16x4_internal2
1070 call pixel_satd_16x4_internal2
1071 call pixel_satd_16x4_internal2
1072 call pixel_satd_16x4_internal2
1073 call pixel_satd_16x4_internal2
1074 call pixel_satd_16x4_internal2
1075 call pixel_satd_16x4_internal2
1076 call pixel_satd_16x4_internal2
1077 call pixel_satd_16x4_internal2
1078 call pixel_satd_16x4_internal2
1079 call pixel_satd_16x4_internal2
1080 call pixel_satd_16x4_internal2
1081 call pixel_satd_16x4_internal2
1082 call pixel_satd_16x4_internal2
1083 call pixel_satd_16x4_internal2
1084 call pixel_satd_16x4_internal2
1085 lea r0, [r6 + 48]
1086 lea r2, [r7 + 48]
1087 call pixel_satd_16x4_internal2
1088 call pixel_satd_16x4_internal2
1089 call pixel_satd_16x4_internal2
1090 call pixel_satd_16x4_internal2
1091 call pixel_satd_16x4_internal2
1092 call pixel_satd_16x4_internal2
1093 call pixel_satd_16x4_internal2
1094 call pixel_satd_16x4_internal2
1095 call pixel_satd_16x4_internal2
1096 call pixel_satd_16x4_internal2
1097 call pixel_satd_16x4_internal2
1098 call pixel_satd_16x4_internal2
1099 call pixel_satd_16x4_internal2
1100 call pixel_satd_16x4_internal2
1101 call pixel_satd_16x4_internal2
1102 call pixel_satd_16x4_internal2
1103
1104 HADDD m10, m0
1105 movd eax, m10
1106 RET
1107
1108 %else
1109 %if WIN64
1110 cglobal pixel_satd_16x24, 4,8,14 ;if WIN64 && cpuflag(avx)
1111 SATD_START_SSE2 m6, m7
1112 mov r6, r0
1113 mov r7, r2
1114 call pixel_satd_8x8_internal2
1115 call pixel_satd_8x8_internal2
1116 call pixel_satd_8x8_internal2
1117 lea r0, [r6 + 8*SIZEOF_PIXEL]
1118 lea r2, [r7 + 8*SIZEOF_PIXEL]
1119 call pixel_satd_8x8_internal2
1120 call pixel_satd_8x8_internal2
1121 call pixel_satd_8x8_internal2
1122 HADDD m6, m0
1123 movd eax, m6
1124 RET
1125 %else
1126 cglobal pixel_satd_16x24, 4,7,8,0-gprsize ;if !WIN64
1127 SATD_START_SSE2 m6, m7
1128 mov r6, r0
1129 mov [rsp], r2
1130 call pixel_satd_8x8_internal2
1131 call pixel_satd_8x8_internal2
1132 call pixel_satd_8x8_internal2
1133 lea r0, [r6 + 8*SIZEOF_PIXEL]
1134 mov r2, [rsp]
1135 add r2, 8*SIZEOF_PIXEL
1136 call pixel_satd_8x8_internal2
1137 call pixel_satd_8x8_internal2
1138 call pixel_satd_8x8_internal2
1139 HADDD m6, m0
1140 movd eax, m6
1141 RET
1142 %endif
1143 %if WIN64
1144 cglobal pixel_satd_32x48, 4,8,14 ;if WIN64 && cpuflag(avx)
1145 SATD_START_SSE2 m6, m7
1146 mov r6, r0
1147 mov r7, r2
1148 call pixel_satd_8x8_internal2
1149 call pixel_satd_8x8_internal2
1150 call pixel_satd_8x8_internal2
1151 call pixel_satd_8x8_internal2
1152 call pixel_satd_8x8_internal2
1153 call pixel_satd_8x8_internal2
1154 lea r0, [r6 + 8*SIZEOF_PIXEL]
1155 lea r2, [r7 + 8*SIZEOF_PIXEL]
1156 call pixel_satd_8x8_internal2
1157 call pixel_satd_8x8_internal2
1158 call pixel_satd_8x8_internal2
1159 call pixel_satd_8x8_internal2
1160 call pixel_satd_8x8_internal2
1161 call pixel_satd_8x8_internal2
1162 lea r0, [r6 + 16*SIZEOF_PIXEL]
1163 lea r2, [r7 + 16*SIZEOF_PIXEL]
1164 call pixel_satd_8x8_internal2
1165 call pixel_satd_8x8_internal2
1166 call pixel_satd_8x8_internal2
1167 call pixel_satd_8x8_internal2
1168 call pixel_satd_8x8_internal2
1169 call pixel_satd_8x8_internal2
1170 lea r0, [r6 + 24*SIZEOF_PIXEL]
1171 lea r2, [r7 + 24*SIZEOF_PIXEL]
1172 call pixel_satd_8x8_internal2
1173 call pixel_satd_8x8_internal2
1174 call pixel_satd_8x8_internal2
1175 call pixel_satd_8x8_internal2
1176 call pixel_satd_8x8_internal2
1177 call pixel_satd_8x8_internal2
1178 HADDD m6, m0
1179 movd eax, m6
1180 RET
1181 %else
1182 cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64
1183 SATD_START_SSE2 m6, m7
1184 mov r6, r0
1185 mov [rsp], r2
1186 call pixel_satd_8x8_internal2
1187 call pixel_satd_8x8_internal2
1188 call pixel_satd_8x8_internal2
1189 call pixel_satd_8x8_internal2
1190 call pixel_satd_8x8_internal2
1191 call pixel_satd_8x8_internal2
1192 lea r0, [r6 + 8*SIZEOF_PIXEL]
1193 mov r2, [rsp]
1194 add r2, 8*SIZEOF_PIXEL
1195 call pixel_satd_8x8_internal2
1196 call pixel_satd_8x8_internal2
1197 call pixel_satd_8x8_internal2
1198 call pixel_satd_8x8_internal2
1199 call pixel_satd_8x8_internal2
1200 call pixel_satd_8x8_internal2
1201 lea r0, [r6 + 16*SIZEOF_PIXEL]
1202 mov r2, [rsp]
1203 add r2, 16*SIZEOF_PIXEL
1204 call pixel_satd_8x8_internal2
1205 call pixel_satd_8x8_internal2
1206 call pixel_satd_8x8_internal2
1207 call pixel_satd_8x8_internal2
1208 call pixel_satd_8x8_internal2
1209 call pixel_satd_8x8_internal2
1210 lea r0, [r6 + 24*SIZEOF_PIXEL]
1211 mov r2, [rsp]
1212 add r2, 24*SIZEOF_PIXEL
1213 call pixel_satd_8x8_internal2
1214 call pixel_satd_8x8_internal2
1215 call pixel_satd_8x8_internal2
1216 call pixel_satd_8x8_internal2
1217 call pixel_satd_8x8_internal2
1218 call pixel_satd_8x8_internal2
1219 HADDD m6, m0
1220 movd eax, m6
1221 RET
1222 %endif
1223
1224 %if WIN64
1225 cglobal pixel_satd_24x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1226 SATD_START_SSE2 m6, m7
1227 mov r6, r0
1228 mov r7, r2
1229 call pixel_satd_8x8_internal2
1230 call pixel_satd_8x8_internal2
1231 call pixel_satd_8x8_internal2
1232 call pixel_satd_8x8_internal2
1233 call pixel_satd_8x8_internal2
1234 call pixel_satd_8x8_internal2
1235 call pixel_satd_8x8_internal2
1236 call pixel_satd_8x8_internal2
1237 lea r0, [r6 + 8*SIZEOF_PIXEL]
1238 lea r2, [r7 + 8*SIZEOF_PIXEL]
1239 call pixel_satd_8x8_internal2
1240 call pixel_satd_8x8_internal2
1241 call pixel_satd_8x8_internal2
1242 call pixel_satd_8x8_internal2
1243 call pixel_satd_8x8_internal2
1244 call pixel_satd_8x8_internal2
1245 call pixel_satd_8x8_internal2
1246 call pixel_satd_8x8_internal2
1247 lea r0, [r6 + 16*SIZEOF_PIXEL]
1248 lea r2, [r7 + 16*SIZEOF_PIXEL]
1249 call pixel_satd_8x8_internal2
1250 call pixel_satd_8x8_internal2
1251 call pixel_satd_8x8_internal2
1252 call pixel_satd_8x8_internal2
1253 call pixel_satd_8x8_internal2
1254 call pixel_satd_8x8_internal2
1255 call pixel_satd_8x8_internal2
1256 call pixel_satd_8x8_internal2
1257 HADDD m6, m0
1258 movd eax, m6
1259 RET
1260 %else
1261 cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64
1262 SATD_START_SSE2 m6, m7
1263 mov r6, r0
1264 mov [rsp], r2
1265 call pixel_satd_8x8_internal2
1266 call pixel_satd_8x8_internal2
1267 call pixel_satd_8x8_internal2
1268 call pixel_satd_8x8_internal2
1269 call pixel_satd_8x8_internal2
1270 call pixel_satd_8x8_internal2
1271 call pixel_satd_8x8_internal2
1272 call pixel_satd_8x8_internal2
1273 lea r0, [r6 + 8*SIZEOF_PIXEL]
1274 mov r2, [rsp]
1275 add r2, 8*SIZEOF_PIXEL
1276 call pixel_satd_8x8_internal2
1277 call pixel_satd_8x8_internal2
1278 call pixel_satd_8x8_internal2
1279 call pixel_satd_8x8_internal2
1280 call pixel_satd_8x8_internal2
1281 call pixel_satd_8x8_internal2
1282 call pixel_satd_8x8_internal2
1283 call pixel_satd_8x8_internal2
1284 lea r0, [r6 + 16*SIZEOF_PIXEL]
1285 mov r2, [rsp]
1286 add r2, 16*SIZEOF_PIXEL
1287 call pixel_satd_8x8_internal2
1288 call pixel_satd_8x8_internal2
1289 call pixel_satd_8x8_internal2
1290 call pixel_satd_8x8_internal2
1291 call pixel_satd_8x8_internal2
1292 call pixel_satd_8x8_internal2
1293 call pixel_satd_8x8_internal2
1294 call pixel_satd_8x8_internal2
1295 HADDD m6, m0
1296 movd eax, m6
1297 RET
1298 %endif
1299
1300 %if WIN64
1301 cglobal pixel_satd_8x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1302 SATD_START_SSE2 m6, m7
1303 mov r6, r0
1304 mov r7, r2
1305 call pixel_satd_8x8_internal2
1306 call pixel_satd_8x8_internal2
1307 call pixel_satd_8x8_internal2
1308 call pixel_satd_8x8_internal2
1309 call pixel_satd_8x8_internal2
1310 call pixel_satd_8x8_internal2
1311 call pixel_satd_8x8_internal2
1312 call pixel_satd_8x8_internal2
1313 HADDD m6, m0
1314 movd eax, m6
1315 RET
1316 %else
1317 cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64
1318 SATD_START_SSE2 m6, m7
1319 mov r6, r0
1320 mov [rsp], r2
1321 call pixel_satd_8x8_internal2
1322 call pixel_satd_8x8_internal2
1323 call pixel_satd_8x8_internal2
1324 call pixel_satd_8x8_internal2
1325 call pixel_satd_8x8_internal2
1326 call pixel_satd_8x8_internal2
1327 call pixel_satd_8x8_internal2
1328 call pixel_satd_8x8_internal2
1329 HADDD m6, m0
1330 movd eax, m6
1331 RET
1332 %endif
1333
1334 %if WIN64
1335 cglobal pixel_satd_8x12, 4,8,14 ;if WIN64 && cpuflag(avx)
1336 SATD_START_SSE2 m6, m7
1337 mov r6, r0
1338 mov r7, r2
1339 call pixel_satd_8x8_internal2
1340 call %%pixel_satd_8x4_internal2
1341 pxor m7, m7
1342 movhlps m7, m6
1343 paddd m6, m7
1344 pshufd m7, m6, 1
1345 paddd m6, m7
1346 movd eax, m6
1347 RET
1348 %else
1349 cglobal pixel_satd_8x12, 4,7,8,0-gprsize ;if !WIN64
1350 SATD_START_SSE2 m6, m7
1351 mov r6, r0
1352 mov [rsp], r2
1353 call pixel_satd_8x8_internal2
1354 call %%pixel_satd_8x4_internal2
1355 HADDD m6, m0
1356 movd eax, m6
1357 RET
1358 %endif
1359
1360 %if HIGH_BIT_DEPTH
1361 %if WIN64
1362 cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx)
1363 SATD_START_MMX
1364 mov r6, r0
1365 mov r7, r2
1366 pxor m7, m7
1367 SATD_4x8_SSE vertical, 0, 4, 5
1368 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1369 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1370 SATD_4x8_SSE vertical, 1, 4, 5
1371 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1372 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1373 SATD_4x8_SSE vertical, 1, 4, 5
1374 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1375 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1376 SATD_4x8_SSE vertical, 1, 4, 5
1377 lea r0, [r6 + 4*SIZEOF_PIXEL]
1378 lea r2, [r7 + 4*SIZEOF_PIXEL]
1379 SATD_4x8_SSE vertical, 1, 4, 5
1380 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1381 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1382 SATD_4x8_SSE vertical, 1, 4, 5
1383 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1384 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1385 SATD_4x8_SSE vertical, 1, 4, 5
1386 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1387 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1388 SATD_4x8_SSE vertical, 1, 4, 5
1389 lea r0, [r6 + 8*SIZEOF_PIXEL]
1390 lea r2, [r7 + 8*SIZEOF_PIXEL]
1391 SATD_4x8_SSE vertical, 1, 4, 5
1392 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1393 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1394 SATD_4x8_SSE vertical, 1, 4, 5
1395 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1396 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1397 SATD_4x8_SSE vertical, 1, 4, 5
1398 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1399 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1400 SATD_4x8_SSE vertical, 1, 4, 5
1401 HADDD m7, m0
1402 movd eax, m7
1403 RET
1404 %else
1405 cglobal pixel_satd_12x32, 4,7,8,0-gprsize
1406 SATD_START_MMX
1407 mov r6, r0
1408 mov [rsp], r2
1409 pxor m7, m7
1410 SATD_4x8_SSE vertical, 0, 4, 5
1411 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1412 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1413 SATD_4x8_SSE vertical, 1, 4, 5
1414 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1415 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1416 SATD_4x8_SSE vertical, 1, 4, 5
1417 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1418 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1419 SATD_4x8_SSE vertical, 1, 4, 5
1420 lea r0, [r6 + 4*SIZEOF_PIXEL]
1421 mov r2, [rsp]
1422 add r2, 4*SIZEOF_PIXEL
1423 SATD_4x8_SSE vertical, 1, 4, 5
1424 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1425 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1426 SATD_4x8_SSE vertical, 1, 4, 5
1427 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1428 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1429 SATD_4x8_SSE vertical, 1, 4, 5
1430 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1431 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1432 SATD_4x8_SSE vertical, 1, 4, 5
1433 lea r0, [r6 + 8*SIZEOF_PIXEL]
1434 mov r2, [rsp]
1435 add r2, 8*SIZEOF_PIXEL
1436 SATD_4x8_SSE vertical, 1, 4, 5
1437 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1438 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1439 SATD_4x8_SSE vertical, 1, 4, 5
1440 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1441 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1442 SATD_4x8_SSE vertical, 1, 4, 5
1443 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1444 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1445 SATD_4x8_SSE vertical, 1, 4, 5
1446 HADDD m7, m0
1447 movd eax, m7
1448 RET
1449 %endif
1450 %else ;HIGH_BIT_DEPTH
1451 %if WIN64
1452 cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx)
1453 SATD_START_MMX
1454 mov r6, r0
1455 mov r7, r2
1456 %if vertical==0
1457 mova m7, [hmul_4p]
1458 %endif
1459 SATD_4x8_SSE vertical, 0, swap
1460 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1461 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1462 SATD_4x8_SSE vertical, 1, add
1463 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1464 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1465 SATD_4x8_SSE vertical, 1, add
1466 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1467 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1468 SATD_4x8_SSE vertical, 1, add
1469 lea r0, [r6 + 4*SIZEOF_PIXEL]
1470 lea r2, [r7 + 4*SIZEOF_PIXEL]
1471 SATD_4x8_SSE vertical, 1, add
1472 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1473 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1474 SATD_4x8_SSE vertical, 1, add
1475 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1476 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1477 SATD_4x8_SSE vertical, 1, add
1478 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1479 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1480 SATD_4x8_SSE vertical, 1, add
1481 lea r0, [r6 + 8*SIZEOF_PIXEL]
1482 lea r2, [r7 + 8*SIZEOF_PIXEL]
1483 SATD_4x8_SSE vertical, 1, add
1484 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1485 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1486 SATD_4x8_SSE vertical, 1, add
1487 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1488 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1489 SATD_4x8_SSE vertical, 1, add
1490 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1491 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1492 SATD_4x8_SSE vertical, 1, add
1493 HADDW m7, m1
1494 movd eax, m7
1495 RET
1496 %else
1497 cglobal pixel_satd_12x32, 4,7,8,0-gprsize
1498 SATD_START_MMX
1499 mov r6, r0
1500 mov [rsp], r2
1501 %if vertical==0
1502 mova m7, [hmul_4p]
1503 %endif
1504 SATD_4x8_SSE vertical, 0, swap
1505 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1506 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1507 SATD_4x8_SSE vertical, 1, add
1508 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1509 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1510 SATD_4x8_SSE vertical, 1, add
1511 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1512 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1513 SATD_4x8_SSE vertical, 1, add
1514 lea r0, [r6 + 4*SIZEOF_PIXEL]
1515 mov r2, [rsp]
1516 add r2, 4*SIZEOF_PIXEL
1517 SATD_4x8_SSE vertical, 1, add
1518 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1519 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1520 SATD_4x8_SSE vertical, 1, add
1521 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1522 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1523 SATD_4x8_SSE vertical, 1, add
1524 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1525 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1526 SATD_4x8_SSE vertical, 1, add
1527 lea r0, [r6 + 8*SIZEOF_PIXEL]
1528 mov r2, [rsp]
1529 add r2, 8*SIZEOF_PIXEL
1530 SATD_4x8_SSE vertical, 1, add
1531 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1532 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1533 SATD_4x8_SSE vertical, 1, add
1534 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1535 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1536 SATD_4x8_SSE vertical, 1, add
1537 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1538 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1539 SATD_4x8_SSE vertical, 1, add
1540 HADDW m7, m1
1541 movd eax, m7
1542 RET
1543 %endif
1544 %endif
1545
1546 %if HIGH_BIT_DEPTH
1547 %if WIN64
1548 cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx)
1549 SATD_START_MMX
1550 mov r6, r0
1551 mov r7, r2
1552 pxor m7, m7
1553 SATD_4x8_SSE vertical, 0, 4, 5
1554 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1555 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1556 SATD_4x8_SSE vertical, 1, 4, 5
1557 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1558 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1559 SATD_4x8_SSE vertical, 1, 4, 5
1560 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1561 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1562 SATD_4x8_SSE vertical, 1, 4, 5
1563 HADDD m7, m0
1564 movd eax, m7
1565 RET
1566 %else
1567 cglobal pixel_satd_4x32, 4,7,8,0-gprsize
1568 SATD_START_MMX
1569 mov r6, r0
1570 mov [rsp], r2
1571 pxor m7, m7
1572 SATD_4x8_SSE vertical, 0, 4, 5
1573 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1574 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1575 SATD_4x8_SSE vertical, 1, 4, 5
1576 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1577 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1578 SATD_4x8_SSE vertical, 1, 4, 5
1579 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1580 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1581 SATD_4x8_SSE vertical, 1, 4, 5
1582 pxor m1, m1
1583 movhlps m1, m7
1584 paddd m7, m1
1585 pshufd m1, m7, 1
1586 paddd m7, m1
1587 movd eax, m7
1588 RET
1589 %endif
1590 %else
1591 %if WIN64
1592 cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx)
1593 SATD_START_MMX
1594 mov r6, r0
1595 mov r7, r2
1596 %if vertical==0
1597 mova m7, [hmul_4p]
1598 %endif
1599 SATD_4x8_SSE vertical, 0, swap
1600 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1601 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1602 SATD_4x8_SSE vertical, 1, add
1603 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1604 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1605 SATD_4x8_SSE vertical, 1, add
1606 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1607 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1608 SATD_4x8_SSE vertical, 1, add
1609 HADDW m7, m1
1610 movd eax, m7
1611 RET
1612 %else
1613 cglobal pixel_satd_4x32, 4,7,8,0-gprsize
1614 SATD_START_MMX
1615 mov r6, r0
1616 mov [rsp], r2
1617 %if vertical==0
1618 mova m7, [hmul_4p]
1619 %endif
1620 SATD_4x8_SSE vertical, 0, swap
1621 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1622 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1623 SATD_4x8_SSE vertical, 1, add
1624 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1625 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1626 SATD_4x8_SSE vertical, 1, add
1627 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1628 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1629 SATD_4x8_SSE vertical, 1, add
1630 HADDW m7, m1
1631 movd eax, m7
1632 RET
1633 %endif
1634 %endif
1635
1636 %if WIN64
1637 cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx)
1638 SATD_START_SSE2 m6, m7
1639 mov r6, r0
1640 mov r7, r2
1641 call pixel_satd_8x8_internal2
1642 lea r0, [r6 + 8*SIZEOF_PIXEL]
1643 lea r2, [r7 + 8*SIZEOF_PIXEL]
1644 call pixel_satd_8x8_internal2
1645 lea r0, [r6 + 16*SIZEOF_PIXEL]
1646 lea r2, [r7 + 16*SIZEOF_PIXEL]
1647 call pixel_satd_8x8_internal2
1648 lea r0, [r6 + 24*SIZEOF_PIXEL]
1649 lea r2, [r7 + 24*SIZEOF_PIXEL]
1650 call pixel_satd_8x8_internal2
1651 HADDD m6, m0
1652 movd eax, m6
1653 RET
1654 %else
1655 cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
1656 SATD_START_SSE2 m6, m7
1657 mov r6, r0
1658 mov [rsp], r2
1659 call pixel_satd_8x8_internal2
1660 lea r0, [r6 + 8*SIZEOF_PIXEL]
1661 mov r2, [rsp]
1662 add r2, 8*SIZEOF_PIXEL
1663 call pixel_satd_8x8_internal2
1664 lea r0, [r6 + 16*SIZEOF_PIXEL]
1665 mov r2, [rsp]
1666 add r2, 16*SIZEOF_PIXEL
1667 call pixel_satd_8x8_internal2
1668 lea r0, [r6 + 24*SIZEOF_PIXEL]
1669 mov r2, [rsp]
1670 add r2, 24*SIZEOF_PIXEL
1671 call pixel_satd_8x8_internal2
1672 HADDD m6, m0
1673 movd eax, m6
1674 RET
1675 %endif
1676
1677 %if WIN64
1678 cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx)
1679 SATD_START_SSE2 m6, m7
1680 mov r6, r0
1681 mov r7, r2
1682 call pixel_satd_8x8_internal2
1683 call pixel_satd_8x8_internal2
1684 lea r0, [r6 + 8*SIZEOF_PIXEL]
1685 lea r2, [r7 + 8*SIZEOF_PIXEL]
1686 call pixel_satd_8x8_internal2
1687 call pixel_satd_8x8_internal2
1688 lea r0, [r6 + 16*SIZEOF_PIXEL]
1689 lea r2, [r7 + 16*SIZEOF_PIXEL]
1690 call pixel_satd_8x8_internal2
1691 call pixel_satd_8x8_internal2
1692 lea r0, [r6 + 24*SIZEOF_PIXEL]
1693 lea r2, [r7 + 24*SIZEOF_PIXEL]
1694 call pixel_satd_8x8_internal2
1695 call pixel_satd_8x8_internal2
1696 HADDD m6, m0
1697 movd eax, m6
1698 RET
1699 %else
1700 cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
1701 SATD_START_SSE2 m6, m7
1702 mov r6, r0
1703 mov [rsp], r2
1704 call pixel_satd_8x8_internal2
1705 call pixel_satd_8x8_internal2
1706 lea r0, [r6 + 8*SIZEOF_PIXEL]
1707 mov r2, [rsp]
1708 add r2, 8*SIZEOF_PIXEL
1709 call pixel_satd_8x8_internal2
1710 call pixel_satd_8x8_internal2
1711 lea r0, [r6 + 16*SIZEOF_PIXEL]
1712 mov r2, [rsp]
1713 add r2, 16*SIZEOF_PIXEL
1714 call pixel_satd_8x8_internal2
1715 call pixel_satd_8x8_internal2
1716 lea r0, [r6 + 24*SIZEOF_PIXEL]
1717 mov r2, [rsp]
1718 add r2, 24*SIZEOF_PIXEL
1719 call pixel_satd_8x8_internal2
1720 call pixel_satd_8x8_internal2
1721 HADDD m6, m0
1722 movd eax, m6
1723 RET
1724 %endif
1725
1726 %if WIN64
1727 cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx)
1728 SATD_START_SSE2 m6, m7
1729 mov r6, r0
1730 mov r7, r2
1731 call pixel_satd_8x8_internal2
1732 call pixel_satd_8x8_internal2
1733 call pixel_satd_8x8_internal2
1734 lea r0, [r6 + 8*SIZEOF_PIXEL]
1735 lea r2, [r7 + 8*SIZEOF_PIXEL]
1736 call pixel_satd_8x8_internal2
1737 call pixel_satd_8x8_internal2
1738 call pixel_satd_8x8_internal2
1739 lea r0, [r6 + 16*SIZEOF_PIXEL]
1740 lea r2, [r7 + 16*SIZEOF_PIXEL]
1741 call pixel_satd_8x8_internal2
1742 call pixel_satd_8x8_internal2
1743 call pixel_satd_8x8_internal2
1744 lea r0, [r6 + 24*SIZEOF_PIXEL]
1745 lea r2, [r7 + 24*SIZEOF_PIXEL]
1746 call pixel_satd_8x8_internal2
1747 call pixel_satd_8x8_internal2
1748 call pixel_satd_8x8_internal2
1749 HADDD m6, m0
1750 movd eax, m6
1751 RET
1752 %else
1753 cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
1754 SATD_START_SSE2 m6, m7
1755 mov r6, r0
1756 mov [rsp], r2
1757 call pixel_satd_8x8_internal2
1758 call pixel_satd_8x8_internal2
1759 call pixel_satd_8x8_internal2
1760 lea r0, [r6 + 8*SIZEOF_PIXEL]
1761 mov r2, [rsp]
1762 add r2, 8*SIZEOF_PIXEL
1763 call pixel_satd_8x8_internal2
1764 call pixel_satd_8x8_internal2
1765 call pixel_satd_8x8_internal2
1766 lea r0, [r6 + 16*SIZEOF_PIXEL]
1767 mov r2, [rsp]
1768 add r2, 16*SIZEOF_PIXEL
1769 call pixel_satd_8x8_internal2
1770 call pixel_satd_8x8_internal2
1771 call pixel_satd_8x8_internal2
1772 lea r0, [r6 + 24*SIZEOF_PIXEL]
1773 mov r2, [rsp]
1774 add r2, 24*SIZEOF_PIXEL
1775 call pixel_satd_8x8_internal2
1776 call pixel_satd_8x8_internal2
1777 call pixel_satd_8x8_internal2
1778 HADDD m6, m0
1779 movd eax, m6
1780 RET
1781 %endif
1782
1783 %if WIN64
1784 cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx)
1785 SATD_START_SSE2 m6, m7
1786 mov r6, r0
1787 mov r7, r2
1788 call pixel_satd_8x8_internal2
1789 call pixel_satd_8x8_internal2
1790 call pixel_satd_8x8_internal2
1791 call pixel_satd_8x8_internal2
1792 lea r0, [r6 + 8*SIZEOF_PIXEL]
1793 lea r2, [r7 + 8*SIZEOF_PIXEL]
1794 call pixel_satd_8x8_internal2
1795 call pixel_satd_8x8_internal2
1796 call pixel_satd_8x8_internal2
1797 call pixel_satd_8x8_internal2
1798 lea r0, [r6 + 16*SIZEOF_PIXEL]
1799 lea r2, [r7 + 16*SIZEOF_PIXEL]
1800 call pixel_satd_8x8_internal2
1801 call pixel_satd_8x8_internal2
1802 call pixel_satd_8x8_internal2
1803 call pixel_satd_8x8_internal2
1804 lea r0, [r6 + 24*SIZEOF_PIXEL]
1805 lea r2, [r7 + 24*SIZEOF_PIXEL]
1806 call pixel_satd_8x8_internal2
1807 call pixel_satd_8x8_internal2
1808 call pixel_satd_8x8_internal2
1809 call pixel_satd_8x8_internal2
1810 HADDD m6, m0
1811 movd eax, m6
1812 RET
1813 %else
1814 cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
1815 SATD_START_SSE2 m6, m7
1816 mov r6, r0
1817 mov [rsp], r2
1818 call pixel_satd_8x8_internal2
1819 call pixel_satd_8x8_internal2
1820 call pixel_satd_8x8_internal2
1821 call pixel_satd_8x8_internal2
1822 lea r0, [r6 + 8*SIZEOF_PIXEL]
1823 mov r2, [rsp]
1824 add r2, 8*SIZEOF_PIXEL
1825 call pixel_satd_8x8_internal2
1826 call pixel_satd_8x8_internal2
1827 call pixel_satd_8x8_internal2
1828 call pixel_satd_8x8_internal2
1829 lea r0, [r6 + 16*SIZEOF_PIXEL]
1830 mov r2, [rsp]
1831 add r2, 16*SIZEOF_PIXEL
1832 call pixel_satd_8x8_internal2
1833 call pixel_satd_8x8_internal2
1834 call pixel_satd_8x8_internal2
1835 call pixel_satd_8x8_internal2
1836 lea r0, [r6 + 24*SIZEOF_PIXEL]
1837 mov r2, [rsp]
1838 add r2, 24*SIZEOF_PIXEL
1839 call pixel_satd_8x8_internal2
1840 call pixel_satd_8x8_internal2
1841 call pixel_satd_8x8_internal2
1842 call pixel_satd_8x8_internal2
1843 HADDD m6, m0
1844 movd eax, m6
1845 RET
1846 %endif
1847
1848 %if WIN64
1849 cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1850 SATD_START_SSE2 m6, m7
1851 mov r6, r0
1852 mov r7, r2
1853 call pixel_satd_8x8_internal2
1854 call pixel_satd_8x8_internal2
1855 call pixel_satd_8x8_internal2
1856 call pixel_satd_8x8_internal2
1857 call pixel_satd_8x8_internal2
1858 call pixel_satd_8x8_internal2
1859 call pixel_satd_8x8_internal2
1860 call pixel_satd_8x8_internal2
1861 lea r0, [r6 + 8*SIZEOF_PIXEL]
1862 lea r2, [r7 + 8*SIZEOF_PIXEL]
1863 call pixel_satd_8x8_internal2
1864 call pixel_satd_8x8_internal2
1865 call pixel_satd_8x8_internal2
1866 call pixel_satd_8x8_internal2
1867 call pixel_satd_8x8_internal2
1868 call pixel_satd_8x8_internal2
1869 call pixel_satd_8x8_internal2
1870 call pixel_satd_8x8_internal2
1871 lea r0, [r6 + 16*SIZEOF_PIXEL]
1872 lea r2, [r7 + 16*SIZEOF_PIXEL]
1873 call pixel_satd_8x8_internal2
1874 call pixel_satd_8x8_internal2
1875 call pixel_satd_8x8_internal2
1876 call pixel_satd_8x8_internal2
1877 call pixel_satd_8x8_internal2
1878 call pixel_satd_8x8_internal2
1879 call pixel_satd_8x8_internal2
1880 call pixel_satd_8x8_internal2
1881 lea r0, [r6 + 24*SIZEOF_PIXEL]
1882 lea r2, [r7 + 24*SIZEOF_PIXEL]
1883 call pixel_satd_8x8_internal2
1884 call pixel_satd_8x8_internal2
1885 call pixel_satd_8x8_internal2
1886 call pixel_satd_8x8_internal2
1887 call pixel_satd_8x8_internal2
1888 call pixel_satd_8x8_internal2
1889 call pixel_satd_8x8_internal2
1890 call pixel_satd_8x8_internal2
1891 HADDD m6, m0
1892 movd eax, m6
1893 RET
1894 %else
1895 cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
1896 SATD_START_SSE2 m6, m7
1897 mov r6, r0
1898 mov [rsp], r2
1899 call pixel_satd_8x8_internal2
1900 call pixel_satd_8x8_internal2
1901 call pixel_satd_8x8_internal2
1902 call pixel_satd_8x8_internal2
1903 call pixel_satd_8x8_internal2
1904 call pixel_satd_8x8_internal2
1905 call pixel_satd_8x8_internal2
1906 call pixel_satd_8x8_internal2
1907 lea r0, [r6 + 8*SIZEOF_PIXEL]
1908 mov r2, [rsp]
1909 add r2, 8*SIZEOF_PIXEL
1910 call pixel_satd_8x8_internal2
1911 call pixel_satd_8x8_internal2
1912 call pixel_satd_8x8_internal2
1913 call pixel_satd_8x8_internal2
1914 call pixel_satd_8x8_internal2
1915 call pixel_satd_8x8_internal2
1916 call pixel_satd_8x8_internal2
1917 call pixel_satd_8x8_internal2
1918 lea r0, [r6 + 16*SIZEOF_PIXEL]
1919 mov r2, [rsp]
1920 add r2, 16*SIZEOF_PIXEL
1921 call pixel_satd_8x8_internal2
1922 call pixel_satd_8x8_internal2
1923 call pixel_satd_8x8_internal2
1924 call pixel_satd_8x8_internal2
1925 call pixel_satd_8x8_internal2
1926 call pixel_satd_8x8_internal2
1927 call pixel_satd_8x8_internal2
1928 call pixel_satd_8x8_internal2
1929 lea r0, [r6 + 24*SIZEOF_PIXEL]
1930 mov r2, [rsp]
1931 add r2, 24*SIZEOF_PIXEL
1932 call pixel_satd_8x8_internal2
1933 call pixel_satd_8x8_internal2
1934 call pixel_satd_8x8_internal2
1935 call pixel_satd_8x8_internal2
1936 call pixel_satd_8x8_internal2
1937 call pixel_satd_8x8_internal2
1938 call pixel_satd_8x8_internal2
1939 call pixel_satd_8x8_internal2
1940 HADDD m6, m0
1941 movd eax, m6
1942 RET
1943 %endif
1944
1945 %if WIN64
1946 cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1947 SATD_START_SSE2 m6, m7
1948 mov r6, r0
1949 mov r7, r2
1950 call pixel_satd_8x8_internal2
1951 call pixel_satd_8x8_internal2
1952 call pixel_satd_8x8_internal2
1953 call pixel_satd_8x8_internal2
1954 call pixel_satd_8x8_internal2
1955 call pixel_satd_8x8_internal2
1956 call pixel_satd_8x8_internal2
1957 call pixel_satd_8x8_internal2
1958 lea r0, [r6 + 8*SIZEOF_PIXEL]
1959 lea r2, [r7 + 8*SIZEOF_PIXEL]
1960 call pixel_satd_8x8_internal2
1961 call pixel_satd_8x8_internal2
1962 call pixel_satd_8x8_internal2
1963 call pixel_satd_8x8_internal2
1964 call pixel_satd_8x8_internal2
1965 call pixel_satd_8x8_internal2
1966 call pixel_satd_8x8_internal2
1967 call pixel_satd_8x8_internal2
1968 lea r0, [r6 + 16*SIZEOF_PIXEL]
1969 lea r2, [r7 + 16*SIZEOF_PIXEL]
1970 call pixel_satd_8x8_internal2
1971 call pixel_satd_8x8_internal2
1972 call pixel_satd_8x8_internal2
1973 call pixel_satd_8x8_internal2
1974 call pixel_satd_8x8_internal2
1975 call pixel_satd_8x8_internal2
1976 call pixel_satd_8x8_internal2
1977 call pixel_satd_8x8_internal2
1978 lea r0, [r6 + 24*SIZEOF_PIXEL]
1979 lea r2, [r7 + 24*SIZEOF_PIXEL]
1980 call pixel_satd_8x8_internal2
1981 call pixel_satd_8x8_internal2
1982 call pixel_satd_8x8_internal2
1983 call pixel_satd_8x8_internal2
1984 call pixel_satd_8x8_internal2
1985 call pixel_satd_8x8_internal2
1986 call pixel_satd_8x8_internal2
1987 call pixel_satd_8x8_internal2
1988 lea r0, [r6 + 32*SIZEOF_PIXEL]
1989 lea r2, [r7 + 32*SIZEOF_PIXEL]
1990 call pixel_satd_8x8_internal2
1991 call pixel_satd_8x8_internal2
1992 call pixel_satd_8x8_internal2
1993 call pixel_satd_8x8_internal2
1994 call pixel_satd_8x8_internal2
1995 call pixel_satd_8x8_internal2
1996 call pixel_satd_8x8_internal2
1997 call pixel_satd_8x8_internal2
1998 lea r0, [r6 + 40*SIZEOF_PIXEL]
1999 lea r2, [r7 + 40*SIZEOF_PIXEL]
2000 call pixel_satd_8x8_internal2
2001 call pixel_satd_8x8_internal2
2002 call pixel_satd_8x8_internal2
2003 call pixel_satd_8x8_internal2
2004 call pixel_satd_8x8_internal2
2005 call pixel_satd_8x8_internal2
2006 call pixel_satd_8x8_internal2
2007 call pixel_satd_8x8_internal2
2008 HADDD m6, m0
2009 movd eax, m6
2010 RET
2011 %else
2012 cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
2013 SATD_START_SSE2 m6, m7
2014 mov r6, r0
2015 mov [rsp], r2
2016 call pixel_satd_8x8_internal2
2017 call pixel_satd_8x8_internal2
2018 call pixel_satd_8x8_internal2
2019 call pixel_satd_8x8_internal2
2020 call pixel_satd_8x8_internal2
2021 call pixel_satd_8x8_internal2
2022 call pixel_satd_8x8_internal2
2023 call pixel_satd_8x8_internal2
2024 lea r0, [r6 + 8*SIZEOF_PIXEL]
2025 mov r2, [rsp]
2026 add r2, 8*SIZEOF_PIXEL
2027 call pixel_satd_8x8_internal2
2028 call pixel_satd_8x8_internal2
2029 call pixel_satd_8x8_internal2
2030 call pixel_satd_8x8_internal2
2031 call pixel_satd_8x8_internal2
2032 call pixel_satd_8x8_internal2
2033 call pixel_satd_8x8_internal2
2034 call pixel_satd_8x8_internal2
2035 lea r0, [r6 + 16*SIZEOF_PIXEL]
2036 mov r2, [rsp]
2037 add r2, 16*SIZEOF_PIXEL
2038 call pixel_satd_8x8_internal2
2039 call pixel_satd_8x8_internal2
2040 call pixel_satd_8x8_internal2
2041 call pixel_satd_8x8_internal2
2042 call pixel_satd_8x8_internal2
2043 call pixel_satd_8x8_internal2
2044 call pixel_satd_8x8_internal2
2045 call pixel_satd_8x8_internal2
2046 lea r0, [r6 + 24*SIZEOF_PIXEL]
2047 mov r2, [rsp]
2048 add r2, 24*SIZEOF_PIXEL
2049 call pixel_satd_8x8_internal2
2050 call pixel_satd_8x8_internal2
2051 call pixel_satd_8x8_internal2
2052 call pixel_satd_8x8_internal2
2053 call pixel_satd_8x8_internal2
2054 call pixel_satd_8x8_internal2
2055 call pixel_satd_8x8_internal2
2056 call pixel_satd_8x8_internal2
2057 lea r0, [r6 + 32*SIZEOF_PIXEL]
2058 mov r2, [rsp]
2059 add r2, 32*SIZEOF_PIXEL
2060 call pixel_satd_8x8_internal2
2061 call pixel_satd_8x8_internal2
2062 call pixel_satd_8x8_internal2
2063 call pixel_satd_8x8_internal2
2064 call pixel_satd_8x8_internal2
2065 call pixel_satd_8x8_internal2
2066 call pixel_satd_8x8_internal2
2067 call pixel_satd_8x8_internal2
2068 lea r0, [r6 + 40*SIZEOF_PIXEL]
2069 mov r2, [rsp]
2070 add r2, 40*SIZEOF_PIXEL
2071 call pixel_satd_8x8_internal2
2072 call pixel_satd_8x8_internal2
2073 call pixel_satd_8x8_internal2
2074 call pixel_satd_8x8_internal2
2075 call pixel_satd_8x8_internal2
2076 call pixel_satd_8x8_internal2
2077 call pixel_satd_8x8_internal2
2078 call pixel_satd_8x8_internal2
2079 HADDD m6, m0
2080 movd eax, m6
2081 RET
2082 %endif
2083
2084
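; pixel_satd_64xN: eight 8-pixel-wide columns walked left to right, height/8
; 8x8 blocks per column, with the running total kept in m6.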
2085 %if WIN64
2086 cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx)
2087 SATD_START_SSE2 m6, m7
2088 mov r6, r0
2089 mov r7, r2
2090 call pixel_satd_8x8_internal2
2091 call pixel_satd_8x8_internal2
2092 lea r0, [r6 + 8*SIZEOF_PIXEL]
2093 lea r2, [r7 + 8*SIZEOF_PIXEL]
2094 call pixel_satd_8x8_internal2
2095 call pixel_satd_8x8_internal2
2096 lea r0, [r6 + 16*SIZEOF_PIXEL]
2097 lea r2, [r7 + 16*SIZEOF_PIXEL]
2098 call pixel_satd_8x8_internal2
2099 call pixel_satd_8x8_internal2
2100 lea r0, [r6 + 24*SIZEOF_PIXEL]
2101 lea r2, [r7 + 24*SIZEOF_PIXEL]
2102 call pixel_satd_8x8_internal2
2103 call pixel_satd_8x8_internal2
2104 lea r0, [r6 + 32*SIZEOF_PIXEL]
2105 lea r2, [r7 + 32*SIZEOF_PIXEL]
2106 call pixel_satd_8x8_internal2
2107 call pixel_satd_8x8_internal2
2108 lea r0, [r6 + 40*SIZEOF_PIXEL]
2109 lea r2, [r7 + 40*SIZEOF_PIXEL]
2110 call pixel_satd_8x8_internal2
2111 call pixel_satd_8x8_internal2
2112 lea r0, [r6 + 48*SIZEOF_PIXEL]
2113 lea r2, [r7 + 48*SIZEOF_PIXEL]
2114 call pixel_satd_8x8_internal2
2115 call pixel_satd_8x8_internal2
2116 lea r0, [r6 + 56*SIZEOF_PIXEL]
2117 lea r2, [r7 + 56*SIZEOF_PIXEL]
2118 call pixel_satd_8x8_internal2
2119 call pixel_satd_8x8_internal2
2120 HADDD m6, m0
2121 movd eax, m6
2122 RET
2123 %else
2124 cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
2125 SATD_START_SSE2 m6, m7
2126 mov r6, r0
2127 mov [rsp], r2
2128 call pixel_satd_8x8_internal2
2129 call pixel_satd_8x8_internal2
2130 lea r0, [r6 + 8*SIZEOF_PIXEL]
2131 mov r2, [rsp]
2132 add r2, 8*SIZEOF_PIXEL
2133 call pixel_satd_8x8_internal2
2134 call pixel_satd_8x8_internal2
2135 lea r0, [r6 + 16*SIZEOF_PIXEL]
2136 mov r2, [rsp]
2137 add r2, 16*SIZEOF_PIXEL
2138 call pixel_satd_8x8_internal2
2139 call pixel_satd_8x8_internal2
2140 lea r0, [r6 + 24*SIZEOF_PIXEL]
2141 mov r2, [rsp]
2142 add r2, 24*SIZEOF_PIXEL
2143 call pixel_satd_8x8_internal2
2144 call pixel_satd_8x8_internal2
2145 lea r0, [r6 + 32*SIZEOF_PIXEL]
2146 mov r2, [rsp]
2147 add r2, 32*SIZEOF_PIXEL
2148 call pixel_satd_8x8_internal2
2149 call pixel_satd_8x8_internal2
2150 lea r0, [r6 + 40*SIZEOF_PIXEL]
2151 mov r2, [rsp]
2152 add r2, 40*SIZEOF_PIXEL
2153 call pixel_satd_8x8_internal2
2154 call pixel_satd_8x8_internal2
2155 lea r0, [r6 + 48*SIZEOF_PIXEL]
2156 mov r2, [rsp]
2157 add r2, 48*SIZEOF_PIXEL
2158 call pixel_satd_8x8_internal2
2159 call pixel_satd_8x8_internal2
2160 lea r0, [r6 + 56*SIZEOF_PIXEL]
2161 mov r2, [rsp]
2162 add r2, 56*SIZEOF_PIXEL
2163 call pixel_satd_8x8_internal2
2164 call pixel_satd_8x8_internal2
2165 HADDD m6, m0
2166 movd eax, m6
2167 RET
2168 %endif
2169
2170 %if WIN64
2171 cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx)
2172 SATD_START_SSE2 m6, m7
2173 mov r6, r0
2174 mov r7, r2
2175 call pixel_satd_8x8_internal2
2176 call pixel_satd_8x8_internal2
2177 call pixel_satd_8x8_internal2
2178 call pixel_satd_8x8_internal2
2179 lea r0, [r6 + 8*SIZEOF_PIXEL]
2180 lea r2, [r7 + 8*SIZEOF_PIXEL]
2181 call pixel_satd_8x8_internal2
2182 call pixel_satd_8x8_internal2
2183 call pixel_satd_8x8_internal2
2184 call pixel_satd_8x8_internal2
2185 lea r0, [r6 + 16*SIZEOF_PIXEL]
2186 lea r2, [r7 + 16*SIZEOF_PIXEL]
2187 call pixel_satd_8x8_internal2
2188 call pixel_satd_8x8_internal2
2189 call pixel_satd_8x8_internal2
2190 call pixel_satd_8x8_internal2
2191 lea r0, [r6 + 24*SIZEOF_PIXEL]
2192 lea r2, [r7 + 24*SIZEOF_PIXEL]
2193 call pixel_satd_8x8_internal2
2194 call pixel_satd_8x8_internal2
2195 call pixel_satd_8x8_internal2
2196 call pixel_satd_8x8_internal2
2197 lea r0, [r6 + 32*SIZEOF_PIXEL]
2198 lea r2, [r7 + 32*SIZEOF_PIXEL]
2199 call pixel_satd_8x8_internal2
2200 call pixel_satd_8x8_internal2
2201 call pixel_satd_8x8_internal2
2202 call pixel_satd_8x8_internal2
2203 lea r0, [r6 + 40*SIZEOF_PIXEL]
2204 lea r2, [r7 + 40*SIZEOF_PIXEL]
2205 call pixel_satd_8x8_internal2
2206 call pixel_satd_8x8_internal2
2207 call pixel_satd_8x8_internal2
2208 call pixel_satd_8x8_internal2
2209 lea r0, [r6 + 48*SIZEOF_PIXEL]
2210 lea r2, [r7 + 48*SIZEOF_PIXEL]
2211 call pixel_satd_8x8_internal2
2212 call pixel_satd_8x8_internal2
2213 call pixel_satd_8x8_internal2
2214 call pixel_satd_8x8_internal2
2215 lea r0, [r6 + 56*SIZEOF_PIXEL]
2216 lea r2, [r7 + 56*SIZEOF_PIXEL]
2217 call pixel_satd_8x8_internal2
2218 call pixel_satd_8x8_internal2
2219 call pixel_satd_8x8_internal2
2220 call pixel_satd_8x8_internal2
2221 HADDD m6, m0
2222 movd eax, m6
2223 RET
2224 %else
2225 cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
2226 SATD_START_SSE2 m6, m7
2227 mov r6, r0
2228 mov [rsp], r2
2229 call pixel_satd_8x8_internal2
2230 call pixel_satd_8x8_internal2
2231 call pixel_satd_8x8_internal2
2232 call pixel_satd_8x8_internal2
2233 lea r0, [r6 + 8*SIZEOF_PIXEL]
2234 mov r2, [rsp]
2235 add r2, 8*SIZEOF_PIXEL
2236 call pixel_satd_8x8_internal2
2237 call pixel_satd_8x8_internal2
2238 call pixel_satd_8x8_internal2
2239 call pixel_satd_8x8_internal2
2240 lea r0, [r6 + 16*SIZEOF_PIXEL]
2241 mov r2, [rsp]
2242 add r2, 16*SIZEOF_PIXEL
2243 call pixel_satd_8x8_internal2
2244 call pixel_satd_8x8_internal2
2245 call pixel_satd_8x8_internal2
2246 call pixel_satd_8x8_internal2
2247 lea r0, [r6 + 24*SIZEOF_PIXEL]
2248 mov r2, [rsp]
2249 add r2, 24*SIZEOF_PIXEL
2250 call pixel_satd_8x8_internal2
2251 call pixel_satd_8x8_internal2
2252 call pixel_satd_8x8_internal2
2253 call pixel_satd_8x8_internal2
2254 lea r0, [r6 + 32*SIZEOF_PIXEL]
2255 mov r2, [rsp]
2256 add r2, 32*SIZEOF_PIXEL
2257 call pixel_satd_8x8_internal2
2258 call pixel_satd_8x8_internal2
2259 call pixel_satd_8x8_internal2
2260 call pixel_satd_8x8_internal2
2261 lea r0, [r6 + 40*SIZEOF_PIXEL]
2262 mov r2, [rsp]
2263 add r2, 40*SIZEOF_PIXEL
2264 call pixel_satd_8x8_internal2
2265 call pixel_satd_8x8_internal2
2266 call pixel_satd_8x8_internal2
2267 call pixel_satd_8x8_internal2
2268 lea r0, [r6 + 48*SIZEOF_PIXEL]
2269 mov r2, [rsp]
2270 add r2, 48*SIZEOF_PIXEL
2271 call pixel_satd_8x8_internal2
2272 call pixel_satd_8x8_internal2
2273 call pixel_satd_8x8_internal2
2274 call pixel_satd_8x8_internal2
2275 lea r0, [r6 + 56*SIZEOF_PIXEL]
2276 mov r2, [rsp]
2277 add r2, 56*SIZEOF_PIXEL
2278 call pixel_satd_8x8_internal2
2279 call pixel_satd_8x8_internal2
2280 call pixel_satd_8x8_internal2
2281 call pixel_satd_8x8_internal2
2282 HADDD m6, m0
2283 movd eax, m6
2284 RET
2285 %endif
2286
2287 %if WIN64
2288 cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx)
2289 SATD_START_SSE2 m6, m7
2290 mov r6, r0
2291 mov r7, r2
2292 call pixel_satd_8x8_internal2
2293 call pixel_satd_8x8_internal2
2294 call pixel_satd_8x8_internal2
2295 call pixel_satd_8x8_internal2
2296 call pixel_satd_8x8_internal2
2297 call pixel_satd_8x8_internal2
2298 lea r0, [r6 + 8*SIZEOF_PIXEL]
2299 lea r2, [r7 + 8*SIZEOF_PIXEL]
2300 call pixel_satd_8x8_internal2
2301 call pixel_satd_8x8_internal2
2302 call pixel_satd_8x8_internal2
2303 call pixel_satd_8x8_internal2
2304 call pixel_satd_8x8_internal2
2305 call pixel_satd_8x8_internal2
2306 lea r0, [r6 + 16*SIZEOF_PIXEL]
2307 lea r2, [r7 + 16*SIZEOF_PIXEL]
2308 call pixel_satd_8x8_internal2
2309 call pixel_satd_8x8_internal2
2310 call pixel_satd_8x8_internal2
2311 call pixel_satd_8x8_internal2
2312 call pixel_satd_8x8_internal2
2313 call pixel_satd_8x8_internal2
2314 lea r0, [r6 + 24*SIZEOF_PIXEL]
2315 lea r2, [r7 + 24*SIZEOF_PIXEL]
2316 call pixel_satd_8x8_internal2
2317 call pixel_satd_8x8_internal2
2318 call pixel_satd_8x8_internal2
2319 call pixel_satd_8x8_internal2
2320 call pixel_satd_8x8_internal2
2321 call pixel_satd_8x8_internal2
2322 lea r0, [r6 + 32*SIZEOF_PIXEL]
2323 lea r2, [r7 + 32*SIZEOF_PIXEL]
2324 call pixel_satd_8x8_internal2
2325 call pixel_satd_8x8_internal2
2326 call pixel_satd_8x8_internal2
2327 call pixel_satd_8x8_internal2
2328 call pixel_satd_8x8_internal2
2329 call pixel_satd_8x8_internal2
2330 lea r0, [r6 + 40*SIZEOF_PIXEL]
2331 lea r2, [r7 + 40*SIZEOF_PIXEL]
2332 call pixel_satd_8x8_internal2
2333 call pixel_satd_8x8_internal2
2334 call pixel_satd_8x8_internal2
2335 call pixel_satd_8x8_internal2
2336 call pixel_satd_8x8_internal2
2337 call pixel_satd_8x8_internal2
2338 lea r0, [r6 + 48*SIZEOF_PIXEL]
2339 lea r2, [r7 + 48*SIZEOF_PIXEL]
2340 call pixel_satd_8x8_internal2
2341 call pixel_satd_8x8_internal2
2342 call pixel_satd_8x8_internal2
2343 call pixel_satd_8x8_internal2
2344 call pixel_satd_8x8_internal2
2345 call pixel_satd_8x8_internal2
2346 lea r0, [r6 + 56*SIZEOF_PIXEL]
2347 lea r2, [r7 + 56*SIZEOF_PIXEL]
2348 call pixel_satd_8x8_internal2
2349 call pixel_satd_8x8_internal2
2350 call pixel_satd_8x8_internal2
2351 call pixel_satd_8x8_internal2
2352 call pixel_satd_8x8_internal2
2353 call pixel_satd_8x8_internal2
2354 HADDD m6, m0
2355 movd eax, m6
2356 RET
2357 %else
2358 cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
2359 SATD_START_SSE2 m6, m7
2360 mov r6, r0
2361 mov [rsp], r2
2362 call pixel_satd_8x8_internal2
2363 call pixel_satd_8x8_internal2
2364 call pixel_satd_8x8_internal2
2365 call pixel_satd_8x8_internal2
2366 call pixel_satd_8x8_internal2
2367 call pixel_satd_8x8_internal2
2368 lea r0, [r6 + 8*SIZEOF_PIXEL]
2369 mov r2, [rsp]
2370 add r2, 8*SIZEOF_PIXEL
2371 call pixel_satd_8x8_internal2
2372 call pixel_satd_8x8_internal2
2373 call pixel_satd_8x8_internal2
2374 call pixel_satd_8x8_internal2
2375 call pixel_satd_8x8_internal2
2376 call pixel_satd_8x8_internal2
2377 lea r0, [r6 + 16*SIZEOF_PIXEL]
2378 mov r2, [rsp]
2379 add r2, 16*SIZEOF_PIXEL
2380 call pixel_satd_8x8_internal2
2381 call pixel_satd_8x8_internal2
2382 call pixel_satd_8x8_internal2
2383 call pixel_satd_8x8_internal2
2384 call pixel_satd_8x8_internal2
2385 call pixel_satd_8x8_internal2
2386 lea r0, [r6 + 24*SIZEOF_PIXEL]
2387 mov r2, [rsp]
2388 add r2, 24*SIZEOF_PIXEL
2389 call pixel_satd_8x8_internal2
2390 call pixel_satd_8x8_internal2
2391 call pixel_satd_8x8_internal2
2392 call pixel_satd_8x8_internal2
2393 call pixel_satd_8x8_internal2
2394 call pixel_satd_8x8_internal2
2395 lea r0, [r6 + 32*SIZEOF_PIXEL]
2396 mov r2, [rsp]
2397 add r2, 32*SIZEOF_PIXEL
2398 call pixel_satd_8x8_internal2
2399 call pixel_satd_8x8_internal2
2400 call pixel_satd_8x8_internal2
2401 call pixel_satd_8x8_internal2
2402 call pixel_satd_8x8_internal2
2403 call pixel_satd_8x8_internal2
2404 lea r0, [r6 + 40*SIZEOF_PIXEL]
2405 mov r2, [rsp]
2406 add r2, 40*SIZEOF_PIXEL
2407 call pixel_satd_8x8_internal2
2408 call pixel_satd_8x8_internal2
2409 call pixel_satd_8x8_internal2
2410 call pixel_satd_8x8_internal2
2411 call pixel_satd_8x8_internal2
2412 call pixel_satd_8x8_internal2
2413 lea r0, [r6 + 48*SIZEOF_PIXEL]
2414 mov r2, [rsp]
2415 add r2, 48*SIZEOF_PIXEL
2416 call pixel_satd_8x8_internal2
2417 call pixel_satd_8x8_internal2
2418 call pixel_satd_8x8_internal2
2419 call pixel_satd_8x8_internal2
2420 call pixel_satd_8x8_internal2
2421 call pixel_satd_8x8_internal2
2422 lea r0, [r6 + 56*SIZEOF_PIXEL]
2423 mov r2, [rsp]
2424 add r2, 56*SIZEOF_PIXEL
2425 call pixel_satd_8x8_internal2
2426 call pixel_satd_8x8_internal2
2427 call pixel_satd_8x8_internal2
2428 call pixel_satd_8x8_internal2
2429 call pixel_satd_8x8_internal2
2430 call pixel_satd_8x8_internal2
2431 HADDD m6, m0
2432 movd eax, m6
2433 RET
2434 %endif
2435
2436 %if WIN64
2437 cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && cpuflag(avx)
2438 SATD_START_SSE2 m6, m7
2439 mov r6, r0
2440 mov r7, r2
2441 call pixel_satd_8x8_internal2
2442 call pixel_satd_8x8_internal2
2443 call pixel_satd_8x8_internal2
2444 call pixel_satd_8x8_internal2
2445 call pixel_satd_8x8_internal2
2446 call pixel_satd_8x8_internal2
2447 call pixel_satd_8x8_internal2
2448 call pixel_satd_8x8_internal2
2449 lea r0, [r6 + 8*SIZEOF_PIXEL]
2450 lea r2, [r7 + 8*SIZEOF_PIXEL]
2451 call pixel_satd_8x8_internal2
2452 call pixel_satd_8x8_internal2
2453 call pixel_satd_8x8_internal2
2454 call pixel_satd_8x8_internal2
2455 call pixel_satd_8x8_internal2
2456 call pixel_satd_8x8_internal2
2457 call pixel_satd_8x8_internal2
2458 call pixel_satd_8x8_internal2
2459 lea r0, [r6 + 16*SIZEOF_PIXEL]
2460 lea r2, [r7 + 16*SIZEOF_PIXEL]
2461 call pixel_satd_8x8_internal2
2462 call pixel_satd_8x8_internal2
2463 call pixel_satd_8x8_internal2
2464 call pixel_satd_8x8_internal2
2465 call pixel_satd_8x8_internal2
2466 call pixel_satd_8x8_internal2
2467 call pixel_satd_8x8_internal2
2468 call pixel_satd_8x8_internal2
2469 lea r0, [r6 + 24*SIZEOF_PIXEL]
2470 lea r2, [r7 + 24*SIZEOF_PIXEL]
2471 call pixel_satd_8x8_internal2
2472 call pixel_satd_8x8_internal2
2473 call pixel_satd_8x8_internal2
2474 call pixel_satd_8x8_internal2
2475 call pixel_satd_8x8_internal2
2476 call pixel_satd_8x8_internal2
2477 call pixel_satd_8x8_internal2
2478 call pixel_satd_8x8_internal2
2479 lea r0, [r6 + 32*SIZEOF_PIXEL]
2480 lea r2, [r7 + 32*SIZEOF_PIXEL]
2481 call pixel_satd_8x8_internal2
2482 call pixel_satd_8x8_internal2
2483 call pixel_satd_8x8_internal2
2484 call pixel_satd_8x8_internal2
2485 call pixel_satd_8x8_internal2
2486 call pixel_satd_8x8_internal2
2487 call pixel_satd_8x8_internal2
2488 call pixel_satd_8x8_internal2
2489 lea r0, [r6 + 40*SIZEOF_PIXEL]
2490 lea r2, [r7 + 40*SIZEOF_PIXEL]
2491 call pixel_satd_8x8_internal2
2492 call pixel_satd_8x8_internal2
2493 call pixel_satd_8x8_internal2
2494 call pixel_satd_8x8_internal2
2495 call pixel_satd_8x8_internal2
2496 call pixel_satd_8x8_internal2
2497 call pixel_satd_8x8_internal2
2498 call pixel_satd_8x8_internal2
2499 lea r0, [r6 + 48*SIZEOF_PIXEL]
2500 lea r2, [r7 + 48*SIZEOF_PIXEL]
2501 call pixel_satd_8x8_internal2
2502 call pixel_satd_8x8_internal2
2503 call pixel_satd_8x8_internal2
2504 call pixel_satd_8x8_internal2
2505 call pixel_satd_8x8_internal2
2506 call pixel_satd_8x8_internal2
2507 call pixel_satd_8x8_internal2
2508 call pixel_satd_8x8_internal2
2509 lea r0, [r6 + 56*SIZEOF_PIXEL]
2510 lea r2, [r7 + 56*SIZEOF_PIXEL]
2511 call pixel_satd_8x8_internal2
2512 call pixel_satd_8x8_internal2
2513 call pixel_satd_8x8_internal2
2514 call pixel_satd_8x8_internal2
2515 call pixel_satd_8x8_internal2
2516 call pixel_satd_8x8_internal2
2517 call pixel_satd_8x8_internal2
2518 call pixel_satd_8x8_internal2
2519 HADDD m6, m0
2520 movd eax, m6
2521 RET
2522 %else
2523 cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
2524 SATD_START_SSE2 m6, m7
2525 mov r6, r0
2526 mov [rsp], r2
2527 call pixel_satd_8x8_internal2
2528 call pixel_satd_8x8_internal2
2529 call pixel_satd_8x8_internal2
2530 call pixel_satd_8x8_internal2
2531 call pixel_satd_8x8_internal2
2532 call pixel_satd_8x8_internal2
2533 call pixel_satd_8x8_internal2
2534 call pixel_satd_8x8_internal2
2535 lea r0, [r6 + 8*SIZEOF_PIXEL]
2536 mov r2, [rsp]
2537 add r2, 8*SIZEOF_PIXEL
2538 call pixel_satd_8x8_internal2
2539 call pixel_satd_8x8_internal2
2540 call pixel_satd_8x8_internal2
2541 call pixel_satd_8x8_internal2
2542 call pixel_satd_8x8_internal2
2543 call pixel_satd_8x8_internal2
2544 call pixel_satd_8x8_internal2
2545 call pixel_satd_8x8_internal2
2546 lea r0, [r6 + 16*SIZEOF_PIXEL]
2547 mov r2, [rsp]
2548 add r2, 16*SIZEOF_PIXEL
2549 call pixel_satd_8x8_internal2
2550 call pixel_satd_8x8_internal2
2551 call pixel_satd_8x8_internal2
2552 call pixel_satd_8x8_internal2
2553 call pixel_satd_8x8_internal2
2554 call pixel_satd_8x8_internal2
2555 call pixel_satd_8x8_internal2
2556 call pixel_satd_8x8_internal2
2557 lea r0, [r6 + 24*SIZEOF_PIXEL]
2558 mov r2, [rsp]
2559 add r2, 24*SIZEOF_PIXEL
2560 call pixel_satd_8x8_internal2
2561 call pixel_satd_8x8_internal2
2562 call pixel_satd_8x8_internal2
2563 call pixel_satd_8x8_internal2
2564 call pixel_satd_8x8_internal2
2565 call pixel_satd_8x8_internal2
2566 call pixel_satd_8x8_internal2
2567 call pixel_satd_8x8_internal2
2568 lea r0, [r6 + 32*SIZEOF_PIXEL]
2569 mov r2, [rsp]
2570 add r2, 32*SIZEOF_PIXEL
2571 call pixel_satd_8x8_internal2
2572 call pixel_satd_8x8_internal2
2573 call pixel_satd_8x8_internal2
2574 call pixel_satd_8x8_internal2
2575 call pixel_satd_8x8_internal2
2576 call pixel_satd_8x8_internal2
2577 call pixel_satd_8x8_internal2
2578 call pixel_satd_8x8_internal2
2579 lea r0, [r6 + 40*SIZEOF_PIXEL]
2580 mov r2, [rsp]
2581 add r2, 40*SIZEOF_PIXEL
2582 call pixel_satd_8x8_internal2
2583 call pixel_satd_8x8_internal2
2584 call pixel_satd_8x8_internal2
2585 call pixel_satd_8x8_internal2
2586 call pixel_satd_8x8_internal2
2587 call pixel_satd_8x8_internal2
2588 call pixel_satd_8x8_internal2
2589 call pixel_satd_8x8_internal2
2590 lea r0, [r6 + 48*SIZEOF_PIXEL]
2591 mov r2, [rsp]
2592 add r2, 48*SIZEOF_PIXEL
2593 call pixel_satd_8x8_internal2
2594 call pixel_satd_8x8_internal2
2595 call pixel_satd_8x8_internal2
2596 call pixel_satd_8x8_internal2
2597 call pixel_satd_8x8_internal2
2598 call pixel_satd_8x8_internal2
2599 call pixel_satd_8x8_internal2
2600 call pixel_satd_8x8_internal2
2601 lea r0, [r6 + 56*SIZEOF_PIXEL]
2602 mov r2, [rsp]
2603 add r2, 56*SIZEOF_PIXEL
2604 call pixel_satd_8x8_internal2
2605 call pixel_satd_8x8_internal2
2606 call pixel_satd_8x8_internal2
2607 call pixel_satd_8x8_internal2
2608 call pixel_satd_8x8_internal2
2609 call pixel_satd_8x8_internal2
2610 call pixel_satd_8x8_internal2
2611 call pixel_satd_8x8_internal2
2612 HADDD m6, m0
2613 movd eax, m6
2614 RET
2615 %endif
2616
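; pixel_satd_16xN: left 8-pixel column first, then RESTORE_AND_INC_POINTERS steps
; both pointers to the right column; totals are gathered in m6.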
2617 %if WIN64
2618 cglobal pixel_satd_16x4, 4,6,14
2619 %else
2620 cglobal pixel_satd_16x4, 4,6,8
2621 %endif
2622 SATD_START_SSE2 m6, m7
2623 BACKUP_POINTERS
2624 call %%pixel_satd_8x4_internal2
2625 RESTORE_AND_INC_POINTERS
2626 call %%pixel_satd_8x4_internal2
2627 HADDD m6, m0
2628 movd eax, m6
2629 RET
2630
2631 %if WIN64
2632 cglobal pixel_satd_16x8, 4,6,14
2633 %else
2634 cglobal pixel_satd_16x8, 4,6,8
2635 %endif
2636 SATD_START_SSE2 m6, m7
2637 BACKUP_POINTERS
2638 call pixel_satd_8x8_internal2
2639 RESTORE_AND_INC_POINTERS
2640 call pixel_satd_8x8_internal2
2641 HADDD m6, m0
2642 movd eax, m6
2643 RET
2644
2645 %if WIN64
2646 cglobal pixel_satd_16x12, 4,6,14
2647 %else
2648 cglobal pixel_satd_16x12, 4,6,8
2649 %endif
2650 SATD_START_SSE2 m6, m7, 1
2651 BACKUP_POINTERS
2652 call pixel_satd_8x8_internal2
2653 call %%pixel_satd_8x4_internal2
2654 RESTORE_AND_INC_POINTERS
2655 call pixel_satd_8x8_internal2
2656 call %%pixel_satd_8x4_internal2
2657 HADDD m6, m0
2658 movd eax, m6
2659 RET
2660
2661 %if WIN64
2662 cglobal pixel_satd_16x16, 4,6,14
2663 %else
2664 cglobal pixel_satd_16x16, 4,6,8
2665 %endif
2666 SATD_START_SSE2 m6, m7, 1
2667 BACKUP_POINTERS
2668 call pixel_satd_8x8_internal2
2669 call pixel_satd_8x8_internal2
2670 RESTORE_AND_INC_POINTERS
2671 call pixel_satd_8x8_internal2
2672 call pixel_satd_8x8_internal2
2673 HADDD m6, m0
2674 movd eax, m6
2675 RET
2676
2677 %if WIN64
2678 cglobal pixel_satd_16x32, 4,6,14
2679 %else
2680 cglobal pixel_satd_16x32, 4,6,8
2681 %endif
2682 SATD_START_SSE2 m6, m7, 1
2683 BACKUP_POINTERS
2684 call pixel_satd_8x8_internal2
2685 call pixel_satd_8x8_internal2
2686 call pixel_satd_8x8_internal2
2687 call pixel_satd_8x8_internal2
2688 RESTORE_AND_INC_POINTERS
2689 call pixel_satd_8x8_internal2
2690 call pixel_satd_8x8_internal2
2691 call pixel_satd_8x8_internal2
2692 call pixel_satd_8x8_internal2
2693 HADDD m6, m0
2694 movd eax, m6
2695 RET
2696
2697 %if WIN64
2698 cglobal pixel_satd_16x64, 4,6,14
2699 %else
2700 cglobal pixel_satd_16x64, 4,6,8
2701 %endif
2702 SATD_START_SSE2 m6, m7, 1
2703 BACKUP_POINTERS
2704 call pixel_satd_8x8_internal2
2705 call pixel_satd_8x8_internal2
2706 call pixel_satd_8x8_internal2
2707 call pixel_satd_8x8_internal2
2708 call pixel_satd_8x8_internal2
2709 call pixel_satd_8x8_internal2
2710 call pixel_satd_8x8_internal2
2711 call pixel_satd_8x8_internal2
2712 RESTORE_AND_INC_POINTERS
2713 call pixel_satd_8x8_internal2
2714 call pixel_satd_8x8_internal2
2715 call pixel_satd_8x8_internal2
2716 call pixel_satd_8x8_internal2
2717 call pixel_satd_8x8_internal2
2718 call pixel_satd_8x8_internal2
2719 call pixel_satd_8x8_internal2
2720 call pixel_satd_8x8_internal2
2721 HADDD m6, m0
2722 movd eax, m6
2723 RET
2724 %endif
2725
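; pixel_satd_12x16: three 4-pixel-wide columns of two 4x8 blocks (SATD_4x8_SSE),
; accumulated in m7.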
2726 %if HIGH_BIT_DEPTH
2727 %if WIN64
2728 cglobal pixel_satd_12x16, 4,8,8
2729 SATD_START_MMX
2730 mov r6, r0
2731 mov r7, r2
2732 pxor m7, m7
2733 SATD_4x8_SSE vertical, 0, 4, 5
2734 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2735 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2736 SATD_4x8_SSE vertical, 1, 4, 5
2737 lea r0, [r6 + 4*SIZEOF_PIXEL]
2738 lea r2, [r7 + 4*SIZEOF_PIXEL]
2739 SATD_4x8_SSE vertical, 1, 4, 5
2740 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2741 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2742 SATD_4x8_SSE vertical, 1, 4, 5
2743 lea r0, [r6 + 8*SIZEOF_PIXEL]
2744 lea r2, [r7 + 8*SIZEOF_PIXEL]
2745 SATD_4x8_SSE vertical, 1, 4, 5
2746 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2747 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2748 SATD_4x8_SSE vertical, 1, 4, 5
2749 HADDD m7, m0
2750 movd eax, m7
2751 RET
2752 %else
2753 cglobal pixel_satd_12x16, 4,7,8,0-gprsize
2754 SATD_START_MMX
2755 mov r6, r0
2756 mov [rsp], r2
2757 pxor m7, m7
2758 SATD_4x8_SSE vertical, 0, 4, 5
2759 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2760 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2761 SATD_4x8_SSE vertical, 1, 4, 5
2762 lea r0, [r6 + 4*SIZEOF_PIXEL]
2763 mov r2, [rsp]
2764 add r2, 4*SIZEOF_PIXEL
2765 SATD_4x8_SSE vertical, 1, 4, 5
2766 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2767 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2768 SATD_4x8_SSE vertical, 1, 4, 5
2769 lea r0, [r6 + 8*SIZEOF_PIXEL]
2770 mov r2, [rsp]
2771 add r2, 8*SIZEOF_PIXEL
2772 SATD_4x8_SSE vertical, 1, 4, 5
2773 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2774 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2775 SATD_4x8_SSE vertical, 1, 4, 5
2776 HADDD m7, m0
2777 movd eax, m7
2778 RET
2779 %endif
2780 %else ;HIGH_BIT_DEPTH
2781 %if WIN64
2782 cglobal pixel_satd_12x16, 4,8,8
2783 SATD_START_MMX
2784 mov r6, r0
2785 mov r7, r2
2786 %if vertical == 0
2787 mova m7, [hmul_4p]
2788 %endif
2789 SATD_4x8_SSE vertical, 0, swap
2790 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2791 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2792 SATD_4x8_SSE vertical, 1, add
2793 lea r0, [r6 + 4*SIZEOF_PIXEL]
2794 lea r2, [r7 + 4*SIZEOF_PIXEL]
2795 SATD_4x8_SSE vertical, 1, add
2796 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2797 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2798 SATD_4x8_SSE vertical, 1, add
2799 lea r0, [r6 + 8*SIZEOF_PIXEL]
2800 lea r2, [r7 + 8*SIZEOF_PIXEL]
2801 SATD_4x8_SSE vertical, 1, add
2802 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2803 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2804 SATD_4x8_SSE vertical, 1, add
2805 HADDW m7, m1
2806 movd eax, m7
2807 RET
2808 %else
2809 cglobal pixel_satd_12x16, 4,7,8,0-gprsize
2810 SATD_START_MMX
2811 mov r6, r0
2812 mov [rsp], r2
2813 %if vertical == 0
2814 mova m7, [hmul_4p]
2815 %endif
2816 SATD_4x8_SSE vertical, 0, swap
2817 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2818 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2819 SATD_4x8_SSE vertical, 1, add
2820 lea r0, [r6 + 4*SIZEOF_PIXEL]
2821 mov r2, [rsp]
2822 add r2, 4*SIZEOF_PIXEL
2823 SATD_4x8_SSE vertical, 1, add
2824 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2825 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2826 SATD_4x8_SSE vertical, 1, add
2827 lea r0, [r6 + 8*SIZEOF_PIXEL]
2828 mov r2, [rsp]
2829 add r2, 8*SIZEOF_PIXEL
2830 SATD_4x8_SSE vertical, 1, add
2831 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2832 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2833 SATD_4x8_SSE vertical, 1, add
2834 HADDW m7, m1
2835 movd eax, m7
2836 RET
2837 %endif
2838 %endif
2839
2840 %if WIN64
2841 cglobal pixel_satd_24x32, 4,8,14
2842 SATD_START_SSE2 m6, m7
2843 mov r6, r0
2844 mov r7, r2
2845 call pixel_satd_8x8_internal2
2846 call pixel_satd_8x8_internal2
2847 call pixel_satd_8x8_internal2
2848 call pixel_satd_8x8_internal2
2849 lea r0, [r6 + 8*SIZEOF_PIXEL]
2850 lea r2, [r7 + 8*SIZEOF_PIXEL]
2851 call pixel_satd_8x8_internal2
2852 call pixel_satd_8x8_internal2
2853 call pixel_satd_8x8_internal2
2854 call pixel_satd_8x8_internal2
2855 lea r0, [r6 + 16*SIZEOF_PIXEL]
2856 lea r2, [r7 + 16*SIZEOF_PIXEL]
2857 call pixel_satd_8x8_internal2
2858 call pixel_satd_8x8_internal2
2859 call pixel_satd_8x8_internal2
2860 call pixel_satd_8x8_internal2
2861 HADDD m6, m0
2862 movd eax, m6
2863 RET
2864 %else
2865 cglobal pixel_satd_24x32, 4,7,8,0-gprsize
2866 SATD_START_SSE2 m6, m7
2867 mov r6, r0
2868 mov [rsp], r2
2869 call pixel_satd_8x8_internal2
2870 call pixel_satd_8x8_internal2
2871 call pixel_satd_8x8_internal2
2872 call pixel_satd_8x8_internal2
2873 lea r0, [r6 + 8*SIZEOF_PIXEL]
2874 mov r2, [rsp]
2875 add r2, 8*SIZEOF_PIXEL
2876 call pixel_satd_8x8_internal2
2877 call pixel_satd_8x8_internal2
2878 call pixel_satd_8x8_internal2
2879 call pixel_satd_8x8_internal2
2880 lea r0, [r6 + 16*SIZEOF_PIXEL]
2881 mov r2, [rsp]
2882 add r2, 16*SIZEOF_PIXEL
2883 call pixel_satd_8x8_internal2
2884 call pixel_satd_8x8_internal2
2885 call pixel_satd_8x8_internal2
2886 call pixel_satd_8x8_internal2
2887 HADDD m6, m0
2888 movd eax, m6
2889 RET
2890 %endif ;WIN64
2891
2892 %if WIN64
2893 cglobal pixel_satd_8x32, 4,6,14
2894 %else
2895 cglobal pixel_satd_8x32, 4,6,8
2896 %endif
2897 SATD_START_SSE2 m6, m7
2898 %if vertical
2899 mova m7, [pw_00ff]
2900 %endif
2901 call pixel_satd_8x8_internal2
2902 call pixel_satd_8x8_internal2
2903 call pixel_satd_8x8_internal2
2904 call pixel_satd_8x8_internal2
2905 HADDD m6, m0
2906 movd eax, m6
2907 RET
2908
2909 %if WIN64
2910 cglobal pixel_satd_8x16, 4,6,14
2911 %else
2912 cglobal pixel_satd_8x16, 4,6,8
2913 %endif
2914 SATD_START_SSE2 m6, m7
2915 call pixel_satd_8x8_internal2
2916 call pixel_satd_8x8_internal2
2917 HADDD m6, m0
2918 movd eax, m6
2919 RET
2920
2921 cglobal pixel_satd_8x8, 4,6,8
2922 SATD_START_SSE2 m6, m7
2923 call pixel_satd_8x8_internal
2924 SATD_END_SSE2 m6
2925
2926 %if WIN64
2927 cglobal pixel_satd_8x4, 4,6,14
2928 %else
2929 cglobal pixel_satd_8x4, 4,6,8
2930 %endif
2931 SATD_START_SSE2 m6, m7
2932 call %%pixel_satd_8x4_internal2
2933 SATD_END_SSE2 m6
2934 %endmacro ; SATDS_SSE2
2935
2936
2937 ;=============================================================================
2938 ; SA8D
2939 ;=============================================================================
2940
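; SA8D_INTER: fold the result of the most recent 8x8 call (rh) into the running
; sum (lh); lh/rh live in m10/m0 on x86-64 and in m0/[esp+48] on x86-32.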
2941 %macro SA8D_INTER 0
2942 %if ARCH_X86_64
2943 %define lh m10
2944 %define rh m0
2945 %else
2946 %define lh m0
2947 %define rh [esp+48]
2948 %endif
2949 %if HIGH_BIT_DEPTH
2950 HADDUW m0, m1
2951 paddd lh, rh
2952 %else
2953 paddusw lh, rh
2954 %endif ; HIGH_BIT_DEPTH
2955 %endmacro
2956
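; SA8D_8x8: one 8x8 block - transform, horizontal add, round with (x+1)>>1 and
; accumulate the dword result into m12.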
2957 %macro SA8D_8x8 0
2958 call pixel_sa8d_8x8_internal
2959 %if HIGH_BIT_DEPTH
2960 HADDUW m0, m1
2961 %else
2962 HADDW m0, m1
2963 %endif ; HIGH_BIT_DEPTH
2964 paddd m0, [pd_1]
2965 psrld m0, 1
2966 paddd m12, m0
2967 %endmacro
2968
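; SA8D_16x16: four 8x8 calls covering pix[0], pix[8], pix[8*stride+8] and
; pix[8*stride]; leaves r0/r2 8 rows below their entry position and adds the
; rounded sum to m12.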
2969 %macro SA8D_16x16 0
2970 call pixel_sa8d_8x8_internal ; pix[0]
2971 add r2, 8*SIZEOF_PIXEL
2972 add r0, 8*SIZEOF_PIXEL
2973 %if HIGH_BIT_DEPTH
2974 HADDUW m0, m1
2975 %endif
2976 mova m10, m0
2977 call pixel_sa8d_8x8_internal ; pix[8]
2978 lea r2, [r2+8*r3]
2979 lea r0, [r0+8*r1]
2980 SA8D_INTER
2981 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
2982 sub r2, 8*SIZEOF_PIXEL
2983 sub r0, 8*SIZEOF_PIXEL
2984 SA8D_INTER
2985 call pixel_sa8d_8x8_internal ; pix[8*stride]
2986 SA8D_INTER
2987 SWAP 0, 10
2988 %if HIGH_BIT_DEPTH == 0
2989 HADDUW m0, m1
2990 %endif
2991 paddd m0, [pd_1]
2992 psrld m0, 1
2993 paddd m12, m0
2994 %endmacro
2995
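; AVG_16x16 (x86-32 only): round the combined 16x16 sum and add it to the running
; total kept in the dword at [esp+36].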
2996 %macro AVG_16x16 0
2997 SA8D_INTER
2998 %if HIGH_BIT_DEPTH == 0
2999 HADDUW m0, m1
3000 %endif
3001 movd r4d, m0
3002 add r4d, 1
3003 shr r4d, 1
3004 add r4d, dword [esp+36]
3005 mov dword [esp+36], r4d
3006 %endmacro
3007
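; SA8D: instantiates pixel_sa8d_NxM for all block sizes; the x86-64 body keeps
; partial sums in registers (m10/m12), the x86-32 body spills to the stack.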
3008 %macro SA8D 0
3009 ; sse2 doesn't seem to like the horizontal way of doing things
3010 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
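; vertical==1 selects the plain sse2 word transform; the pmaddubsw-based
; horizontal path needs ssse3 and is skipped on Atom and for HIGH_BIT_DEPTH.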
3011
3012 %if ARCH_X86_64
3013 ;-----------------------------------------------------------------------------
3014 ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
3015 ;-----------------------------------------------------------------------------
3016 cglobal pixel_sa8d_8x8_internal
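; transforms one 8x8 difference block and returns the summed absolute Hadamard
; coefficients as words in m0; the caller does the final (x+1)>>1 rounding.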
3017 lea r6, [r0+4*r1]
3018 lea r7, [r2+4*r3]
3019 LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
3020 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
3021 %if vertical
3022 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
3023 %else ; non-sse2
3024 HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
3025 %endif
3026 paddw m0, m1
3027 paddw m0, m2
3028 paddw m0, m8
3029 SAVE_MM_PERMUTATION
3030 ret
3031
3032 cglobal pixel_sa8d_8x8, 4,8,12
3033 FIX_STRIDES r1, r3
3034 lea r4, [3*r1]
3035 lea r5, [3*r3]
3036 %if vertical == 0
3037 mova m7, [hmul_8p]
3038 %endif
3039 call pixel_sa8d_8x8_internal
3040 %if HIGH_BIT_DEPTH
3041 HADDUW m0, m1
3042 %else
3043 HADDW m0, m1
3044 %endif ; HIGH_BIT_DEPTH
3045 movd eax, m0
3046 add eax, 1
3047 shr eax, 1
3048 RET
3049
3050 cglobal pixel_sa8d_16x16, 4,8,12
3051 FIX_STRIDES r1, r3
3052 lea r4, [3*r1]
3053 lea r5, [3*r3]
3054 %if vertical == 0
3055 mova m7, [hmul_8p]
3056 %endif
3057 call pixel_sa8d_8x8_internal ; pix[0]
3058 add r2, 8*SIZEOF_PIXEL
3059 add r0, 8*SIZEOF_PIXEL
3060 %if HIGH_BIT_DEPTH
3061 HADDUW m0, m1
3062 %endif
3063 mova m10, m0
3064 call pixel_sa8d_8x8_internal ; pix[8]
3065 lea r2, [r2+8*r3]
3066 lea r0, [r0+8*r1]
3067 SA8D_INTER
3068 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
3069 sub r2, 8*SIZEOF_PIXEL
3070 sub r0, 8*SIZEOF_PIXEL
3071 SA8D_INTER
3072 call pixel_sa8d_8x8_internal ; pix[8*stride]
3073 SA8D_INTER
3074 SWAP 0, 10
3075 %if HIGH_BIT_DEPTH == 0
3076 HADDUW m0, m1
3077 %endif
3078 movd eax, m0
3079 add eax, 1
3080 shr eax, 1
3081 RET
3082
3083 cglobal pixel_sa8d_8x16, 4,8,13
3084 FIX_STRIDES r1, r3
3085 lea r4, [3*r1]
3086 lea r5, [3*r3]
3087 pxor m12, m12
3088 %if vertical == 0
3089 mova m7, [hmul_8p]
3090 %endif
3091 SA8D_8x8
3092 lea r0, [r0 + 8*r1]
3093 lea r2, [r2 + 8*r3]
3094 SA8D_8x8
3095 movd eax, m12
3096 RET
3097
3098 cglobal pixel_sa8d_8x32, 4,8,13
3099 FIX_STRIDES r1, r3
3100 lea r4, [3*r1]
3101 lea r5, [3*r3]
3102 pxor m12, m12
3103 %if vertical == 0
3104 mova m7, [hmul_8p]
3105 %endif
3106 SA8D_8x8
3107 lea r0, [r0 + r1*8]
3108 lea r2, [r2 + r3*8]
3109 SA8D_8x8
3110 lea r0, [r0 + r1*8]
3111 lea r2, [r2 + r3*8]
3112 SA8D_8x8
3113 lea r0, [r0 + r1*8]
3114 lea r2, [r2 + r3*8]
3115 SA8D_8x8
3116 movd eax, m12
3117 RET
3118
3119 cglobal pixel_sa8d_16x8, 4,8,13
3120 FIX_STRIDES r1, r3
3121 lea r4, [3*r1]
3122 lea r5, [3*r3]
3123 pxor m12, m12
3124 %if vertical == 0
3125 mova m7, [hmul_8p]
3126 %endif
3127 SA8D_8x8
3128 add r0, 8*SIZEOF_PIXEL
3129 add r2, 8*SIZEOF_PIXEL
3130 SA8D_8x8
3131 movd eax, m12
3132 RET
3133
3134 cglobal pixel_sa8d_16x32, 4,8,13
3135 FIX_STRIDES r1, r3
3136 lea r4, [3*r1]
3137 lea r5, [3*r3]
3138 pxor m12, m12
3139 %if vertical == 0
3140 mova m7, [hmul_8p]
3141 %endif
3142 SA8D_16x16
3143 lea r0, [r0+8*r1]
3144 lea r2, [r2+8*r3]
3145 SA8D_16x16
3146 movd eax, m12
3147 RET
3148
3149 cglobal pixel_sa8d_16x64, 4,8,13
3150 FIX_STRIDES r1, r3
3151 lea r4, [3*r1]
3152 lea r5, [3*r3]
3153 pxor m12, m12
3154 %if vertical == 0
3155 mova m7, [hmul_8p]
3156 %endif
3157 SA8D_16x16
3158 lea r0, [r0+8*r1]
3159 lea r2, [r2+8*r3]
3160 SA8D_16x16
3161 lea r0, [r0+8*r1]
3162 lea r2, [r2+8*r3]
3163 SA8D_16x16
3164 lea r0, [r0+8*r1]
3165 lea r2, [r2+8*r3]
3166 SA8D_16x16
3167 movd eax, m12
3168 RET
3169
3170 cglobal pixel_sa8d_24x32, 4,8,13
3171 FIX_STRIDES r1, r3
3172 lea r4, [3*r1]
3173 lea r5, [3*r3]
3174 pxor m12, m12
3175 %if vertical == 0
3176 mova m7, [hmul_8p]
3177 %endif
3178 SA8D_8x8
3179 add r0, 8*SIZEOF_PIXEL
3180 add r2, 8*SIZEOF_PIXEL
3181 SA8D_8x8
3182 add r0, 8*SIZEOF_PIXEL
3183 add r2, 8*SIZEOF_PIXEL
3184 SA8D_8x8
3185 lea r0, [r0 + r1*8]
3186 lea r2, [r2 + r3*8]
3187 SA8D_8x8
3188 sub r0, 8*SIZEOF_PIXEL
3189 sub r2, 8*SIZEOF_PIXEL
3190 SA8D_8x8
3191 sub r0, 8*SIZEOF_PIXEL
3192 sub r2, 8*SIZEOF_PIXEL
3193 SA8D_8x8
3194 lea r0, [r0 + r1*8]
3195 lea r2, [r2 + r3*8]
3196 SA8D_8x8
3197 add r0, 8*SIZEOF_PIXEL
3198 add r2, 8*SIZEOF_PIXEL
3199 SA8D_8x8
3200 add r0, 8*SIZEOF_PIXEL
3201 add r2, 8*SIZEOF_PIXEL
3202 SA8D_8x8
3203 lea r0, [r0 + r1*8]
3204 lea r2, [r2 + r3*8]
3205 SA8D_8x8
3206 sub r0, 8*SIZEOF_PIXEL
3207 sub r2, 8*SIZEOF_PIXEL
3208 SA8D_8x8
3209 sub r0, 8*SIZEOF_PIXEL
3210 sub r2, 8*SIZEOF_PIXEL
3211 SA8D_8x8
3212 movd eax, m12
3213 RET
3214
3215 cglobal pixel_sa8d_32x8, 4,8,13
3216 FIX_STRIDES r1, r3
3217 lea r4, [3*r1]
3218 lea r5, [3*r3]
3219 pxor m12, m12
3220 %if vertical == 0
3221 mova m7, [hmul_8p]
3222 %endif
3223 SA8D_8x8
3224 add r0, 8*SIZEOF_PIXEL
3225 add r2, 8*SIZEOF_PIXEL
3226 SA8D_8x8
3227 add r0, 8*SIZEOF_PIXEL
3228 add r2, 8*SIZEOF_PIXEL
3229 SA8D_8x8
3230 add r0, 8*SIZEOF_PIXEL
3231 add r2, 8*SIZEOF_PIXEL
3232 SA8D_8x8
3233 movd eax, m12
3234 RET
3235
3236 cglobal pixel_sa8d_32x16, 4,8,13
3237 FIX_STRIDES r1, r3
3238 lea r4, [3*r1]
3239 lea r5, [3*r3]
3240 pxor m12, m12
3241 %if vertical == 0
3242 mova m7, [hmul_8p]
3243 %endif
3244 SA8D_16x16
3245 lea r4, [8*r1]
3246 lea r5, [8*r3]
3247 sub r0, r4
3248 sub r2, r5
3249 add r2, 16*SIZEOF_PIXEL
3250 add r0, 16*SIZEOF_PIXEL
3251 lea r4, [3*r1]
3252 lea r5, [3*r3]
3253 SA8D_16x16
3254 movd eax, m12
3255 RET
3256
3257 cglobal pixel_sa8d_32x24, 4,8,13
3258 FIX_STRIDES r1, r3
3259 lea r4, [3*r1]
3260 lea r5, [3*r3]
3261 pxor m12, m12
3262 %if vertical == 0
3263 mova m7, [hmul_8p]
3264 %endif
3265 SA8D_8x8
3266 add r0, 8*SIZEOF_PIXEL
3267 add r2, 8*SIZEOF_PIXEL
3268 SA8D_8x8
3269 add r0, 8*SIZEOF_PIXEL
3270 add r2, 8*SIZEOF_PIXEL
3271 SA8D_8x8
3272 add r0, 8*SIZEOF_PIXEL
3273 add r2, 8*SIZEOF_PIXEL
3274 SA8D_8x8
3275 lea r0, [r0 + r1*8]
3276 lea r2, [r2 + r3*8]
3277 SA8D_8x8
3278 sub r0, 8*SIZEOF_PIXEL
3279 sub r2, 8*SIZEOF_PIXEL
3280 SA8D_8x8
3281 sub r0, 8*SIZEOF_PIXEL
3282 sub r2, 8*SIZEOF_PIXEL
3283 SA8D_8x8
3284 sub r0, 8*SIZEOF_PIXEL
3285 sub r2, 8*SIZEOF_PIXEL
3286 SA8D_8x8
3287 lea r0, [r0 + r1*8]
3288 lea r2, [r2 + r3*8]
3289 SA8D_8x8
3290 add r0, 8*SIZEOF_PIXEL
3291 add r2, 8*SIZEOF_PIXEL
3292 SA8D_8x8
3293 add r0, 8*SIZEOF_PIXEL
3294 add r2, 8*SIZEOF_PIXEL
3295 SA8D_8x8
3296 add r0, 8*SIZEOF_PIXEL
3297 add r2, 8*SIZEOF_PIXEL
3298 SA8D_8x8
3299 movd eax, m12
3300 RET
3301
3302 cglobal pixel_sa8d_32x32, 4,8,13
3303 FIX_STRIDES r1, r3
3304 lea r4, [3*r1]
3305 lea r5, [3*r3]
3306 pxor m12, m12
3307 %if vertical == 0
3308 mova m7, [hmul_8p]
3309 %endif
3310 SA8D_16x16
3311 lea r4, [8*r1]
3312 lea r5, [8*r3]
3313 sub r0, r4
3314 sub r2, r5
3315 add r2, 16*SIZEOF_PIXEL
3316 add r0, 16*SIZEOF_PIXEL
3317 lea r4, [3*r1]
3318 lea r5, [3*r3]
3319 SA8D_16x16
3320 lea r0, [r0+8*r1]
3321 lea r2, [r2+8*r3]
3322 SA8D_16x16
3323 lea r4, [8*r1]
3324 lea r5, [8*r3]
3325 sub r0, r4
3326 sub r2, r5
3327 sub r2, 16*SIZEOF_PIXEL
3328 sub r0, 16*SIZEOF_PIXEL
3329 lea r4, [3*r1]
3330 lea r5, [3*r3]
3331 SA8D_16x16
3332 movd eax, m12
3333 RET
3334
3335 cglobal pixel_sa8d_32x64, 4,8,13
3336 FIX_STRIDES r1, r3
3337 lea r4, [3*r1]
3338 lea r5, [3*r3]
3339 pxor m12, m12
3340 %if vertical == 0
3341 mova m7, [hmul_8p]
3342 %endif
3343 SA8D_16x16
3344 lea r4, [8*r1]
3345 lea r5, [8*r3]
3346 sub r0, r4
3347 sub r2, r5
3348 add r2, 16*SIZEOF_PIXEL
3349 add r0, 16*SIZEOF_PIXEL
3350 lea r4, [3*r1]
3351 lea r5, [3*r3]
3352 SA8D_16x16
3353 lea r0, [r0+8*r1]
3354 lea r2, [r2+8*r3]
3355 SA8D_16x16
3356 lea r4, [8*r1]
3357 lea r5, [8*r3]
3358 sub r0, r4
3359 sub r2, r5
3360 sub r2, 16*SIZEOF_PIXEL
3361 sub r0, 16*SIZEOF_PIXEL
3362 lea r4, [3*r1]
3363 lea r5, [3*r3]
3364 SA8D_16x16
3365 lea r0, [r0+8*r1]
3366 lea r2, [r2+8*r3]
3367 SA8D_16x16
3368 lea r4, [8*r1]
3369 lea r5, [8*r3]
3370 sub r0, r4
3371 sub r2, r5
3372 add r2, 16*SIZEOF_PIXEL
3373 add r0, 16*SIZEOF_PIXEL
3374 lea r4, [3*r1]
3375 lea r5, [3*r3]
3376 SA8D_16x16
3377 lea r0, [r0+8*r1]
3378 lea r2, [r2+8*r3]
3379 SA8D_16x16
3380 lea r4, [8*r1]
3381 lea r5, [8*r3]
3382 sub r0, r4
3383 sub r2, r5
3384 sub r2, 16*SIZEOF_PIXEL
3385 sub r0, 16*SIZEOF_PIXEL
3386 lea r4, [3*r1]
3387 lea r5, [3*r3]
3388 SA8D_16x16
3389 movd eax, m12
3390 RET
3391
3392 cglobal pixel_sa8d_48x64, 4,8,13
3393 FIX_STRIDES r1, r3
3394 lea r4, [3*r1]
3395 lea r5, [3*r3]
3396 pxor m12, m12
3397 %if vertical == 0
3398 mova m7, [hmul_8p]
3399 %endif
3400 SA8D_16x16
3401 lea r4, [8*r1]
3402 lea r5, [8*r3]
3403 sub r0, r4
3404 sub r2, r5
3405 add r2, 16*SIZEOF_PIXEL
3406 add r0, 16*SIZEOF_PIXEL
3407 lea r4, [3*r1]
3408 lea r5, [3*r3]
3409 SA8D_16x16
3410 lea r4, [8*r1]
3411 lea r5, [8*r3]
3412 sub r0, r4
3413 sub r2, r5
3414 add r2, 16*SIZEOF_PIXEL
3415 add r0, 16*SIZEOF_PIXEL
3416 lea r4, [3*r1]
3417 lea r5, [3*r3]
3418 SA8D_16x16
3419 lea r0, [r0+8*r1]
3420 lea r2, [r2+8*r3]
3421 SA8D_16x16
3422 lea r4, [8*r1]
3423 lea r5, [8*r3]
3424 sub r0, r4
3425 sub r2, r5
3426 sub r2, 16*SIZEOF_PIXEL
3427 sub r0, 16*SIZEOF_PIXEL
3428 lea r4, [3*r1]
3429 lea r5, [3*r3]
3430 SA8D_16x16
3431 lea r4, [8*r1]
3432 lea r5, [8*r3]
3433 sub r0, r4
3434 sub r2, r5
3435 sub r2, 16*SIZEOF_PIXEL
3436 sub r0, 16*SIZEOF_PIXEL
3437 lea r4, [3*r1]
3438 lea r5, [3*r3]
3439 SA8D_16x16
3440 lea r0, [r0+8*r1]
3441 lea r2, [r2+8*r3]
3442 SA8D_16x16
3443 lea r4, [8*r1]
3444 lea r5, [8*r3]
3445 sub r0, r4
3446 sub r2, r5
3447 add r2, 16*SIZEOF_PIXEL
3448 add r0, 16*SIZEOF_PIXEL
3449 lea r4, [3*r1]
3450 lea r5, [3*r3]
3451 SA8D_16x16
3452 lea r4, [8*r1]
3453 lea r5, [8*r3]
3454 sub r0, r4
3455 sub r2, r5
3456 add r2, 16*SIZEOF_PIXEL
3457 add r0, 16*SIZEOF_PIXEL
3458 lea r4, [3*r1]
3459 lea r5, [3*r3]
3460 SA8D_16x16
3461 lea r0, [r0+8*r1]
3462 lea r2, [r2+8*r3]
3463 SA8D_16x16
3464 lea r4, [8*r1]
3465 lea r5, [8*r3]
3466 sub r0, r4
3467 sub r2, r5
3468 sub r2, 16*SIZEOF_PIXEL
3469 sub r0, 16*SIZEOF_PIXEL
3470 lea r4, [3*r1]
3471 lea r5, [3*r3]
3472 SA8D_16x16
3473 lea r4, [8*r1]
3474 lea r5, [8*r3]
3475 sub r0, r4
3476 sub r2, r5
3477 sub r2, 16*SIZEOF_PIXEL
3478 sub r0, 16*SIZEOF_PIXEL
3479 lea r4, [3*r1]
3480 lea r5, [3*r3]
3481 SA8D_16x16
3482 movd eax, m12
3483 RET
3484
3485 cglobal pixel_sa8d_64x16, 4,8,13
3486 FIX_STRIDES r1, r3
3487 lea r4, [3*r1]
3488 lea r5, [3*r3]
3489 pxor m12, m12
3490 %if vertical == 0
3491 mova m7, [hmul_8p]
3492 %endif
3493 SA8D_16x16
3494 lea r4, [8*r1]
3495 lea r5, [8*r3]
3496 sub r0, r4
3497 sub r2, r5
3498 add r2, 16*SIZEOF_PIXEL
3499 add r0, 16*SIZEOF_PIXEL
3500 lea r4, [3*r1]
3501 lea r5, [3*r3]
3502 SA8D_16x16
3503 lea r4, [8*r1]
3504 lea r5, [8*r3]
3505 sub r0, r4
3506 sub r2, r5
3507 add r2, 16*SIZEOF_PIXEL
3508 add r0, 16*SIZEOF_PIXEL
3509 lea r4, [3*r1]
3510 lea r5, [3*r3]
3511 SA8D_16x16
3512 lea r4, [8*r1]
3513 lea r5, [8*r3]
3514 sub r0, r4
3515 sub r2, r5
3516 add r2, 16*SIZEOF_PIXEL
3517 add r0, 16*SIZEOF_PIXEL
3518 lea r4, [3*r1]
3519 lea r5, [3*r3]
3520 SA8D_16x16
3521 movd eax, m12
3522 RET
3523
3524 cglobal pixel_sa8d_64x32, 4,8,13
3525 FIX_STRIDES r1, r3
3526 lea r4, [3*r1]
3527 lea r5, [3*r3]
3528 pxor m12, m12
3529 %if vertical == 0
3530 mova m7, [hmul_8p]
3531 %endif
3532 SA8D_16x16
3533 lea r4, [8*r1]
3534 lea r5, [8*r3]
3535 sub r0, r4
3536 sub r2, r5
3537 add r2, 16*SIZEOF_PIXEL
3538 add r0, 16*SIZEOF_PIXEL
3539 lea r4, [3*r1]
3540 lea r5, [3*r3]
3541 SA8D_16x16
3542 lea r4, [8*r1]
3543 lea r5, [8*r3]
3544 sub r0, r4
3545 sub r2, r5
3546 add r2, 16*SIZEOF_PIXEL
3547 add r0, 16*SIZEOF_PIXEL
3548 lea r4, [3*r1]
3549 lea r5, [3*r3]
3550 SA8D_16x16
3551 lea r4, [8*r1]
3552 lea r5, [8*r3]
3553 sub r0, r4
3554 sub r2, r5
3555 add r2, 16*SIZEOF_PIXEL
3556 add r0, 16*SIZEOF_PIXEL
3557 lea r4, [3*r1]
3558 lea r5, [3*r3]
3559 SA8D_16x16
3560 lea r0, [r0+8*r1]
3561 lea r2, [r2+8*r3]
3562 SA8D_16x16
3563 lea r4, [8*r1]
3564 lea r5, [8*r3]
3565 sub r0, r4
3566 sub r2, r5
3567 sub r2, 16*SIZEOF_PIXEL
3568 sub r0, 16*SIZEOF_PIXEL
3569 lea r4, [3*r1]
3570 lea r5, [3*r3]
3571 SA8D_16x16
3572 lea r4, [8*r1]
3573 lea r5, [8*r3]
3574 sub r0, r4
3575 sub r2, r5
3576 sub r2, 16*SIZEOF_PIXEL
3577 sub r0, 16*SIZEOF_PIXEL
3578 lea r4, [3*r1]
3579 lea r5, [3*r3]
3580 SA8D_16x16
3581 lea r4, [8*r1]
3582 lea r5, [8*r3]
3583 sub r0, r4
3584 sub r2, r5
3585 sub r2, 16*SIZEOF_PIXEL
3586 sub r0, 16*SIZEOF_PIXEL
3587 lea r4, [3*r1]
3588 lea r5, [3*r3]
3589 SA8D_16x16
3590 movd eax, m12
3591 RET
3592
3593 cglobal pixel_sa8d_64x48, 4,8,13
3594 FIX_STRIDES r1, r3
3595 lea r4, [3*r1]
3596 lea r5, [3*r3]
3597 pxor m12, m12
3598 %if vertical == 0
3599 mova m7, [hmul_8p]
3600 %endif
3601 SA8D_16x16
3602 lea r4, [8*r1]
3603 lea r5, [8*r3]
3604 sub r0, r4
3605 sub r2, r5
3606 add r2, 16*SIZEOF_PIXEL
3607 add r0, 16*SIZEOF_PIXEL
3608 lea r4, [3*r1]
3609 lea r5, [3*r3]
3610 SA8D_16x16
3611 lea r4, [8*r1]
3612 lea r5, [8*r3]
3613 sub r0, r4
3614 sub r2, r5
3615 add r2, 16*SIZEOF_PIXEL
3616 add r0, 16*SIZEOF_PIXEL
3617 lea r4, [3*r1]
3618 lea r5, [3*r3]
3619 SA8D_16x16
3620 lea r4, [8*r1]
3621 lea r5, [8*r3]
3622 sub r0, r4
3623 sub r2, r5
3624 add r2, 16*SIZEOF_PIXEL
3625 add r0, 16*SIZEOF_PIXEL
3626 lea r4, [3*r1]
3627 lea r5, [3*r3]
3628 SA8D_16x16
3629 lea r0, [r0+8*r1]
3630 lea r2, [r2+8*r3]
3631 SA8D_16x16
3632 lea r4, [8*r1]
3633 lea r5, [8*r3]
3634 sub r0, r4
3635 sub r2, r5
3636 sub r2, 16*SIZEOF_PIXEL
3637 sub r0, 16*SIZEOF_PIXEL
3638 lea r4, [3*r1]
3639 lea r5, [3*r3]
3640 SA8D_16x16
3641 lea r4, [8*r1]
3642 lea r5, [8*r3]
3643 sub r0, r4
3644 sub r2, r5
3645 sub r2, 16*SIZEOF_PIXEL
3646 sub r0, 16*SIZEOF_PIXEL
3647 lea r4, [3*r1]
3648 lea r5, [3*r3]
3649 SA8D_16x16
3650 lea r4, [8*r1]
3651 lea r5, [8*r3]
3652 sub r0, r4
3653 sub r2, r5
3654 sub r2, 16*SIZEOF_PIXEL
3655 sub r0, 16*SIZEOF_PIXEL
3656 lea r4, [3*r1]
3657 lea r5, [3*r3]
3658 SA8D_16x16
3659 lea r0, [r0+8*r1]
3660 lea r2, [r2+8*r3]
3661 SA8D_16x16
3662 lea r4, [8*r1]
3663 lea r5, [8*r3]
3664 sub r0, r4
3665 sub r2, r5
3666 add r2, 16*SIZEOF_PIXEL
3667 add r0, 16*SIZEOF_PIXEL
3668 lea r4, [3*r1]
3669 lea r5, [3*r3]
3670 SA8D_16x16
3671 lea r4, [8*r1]
3672 lea r5, [8*r3]
3673 sub r0, r4
3674 sub r2, r5
3675 add r2, 16*SIZEOF_PIXEL
3676 add r0, 16*SIZEOF_PIXEL
3677 lea r4, [3*r1]
3678 lea r5, [3*r3]
3679 SA8D_16x16
3680 lea r4, [8*r1]
3681 lea r5, [8*r3]
3682 sub r0, r4
3683 sub r2, r5
3684 add r2, 16*SIZEOF_PIXEL
3685 add r0, 16*SIZEOF_PIXEL
3686 lea r4, [3*r1]
3687 lea r5, [3*r3]
3688 SA8D_16x16
3689 movd eax, m12
3690 RET
3691
3692 cglobal pixel_sa8d_64x64, 4,8,13
3693 FIX_STRIDES r1, r3
3694 lea r4, [3*r1]
3695 lea r5, [3*r3]
3696 pxor m12, m12
3697 %if vertical == 0
3698 mova m7, [hmul_8p]
3699 %endif
3700 SA8D_16x16
3701 lea r4, [8*r1]
3702 lea r5, [8*r3]
3703 sub r0, r4
3704 sub r2, r5
3705 add r2, 16*SIZEOF_PIXEL
3706 add r0, 16*SIZEOF_PIXEL
3707 lea r4, [3*r1]
3708 lea r5, [3*r3]
3709 SA8D_16x16
3710 lea r4, [8*r1]
3711 lea r5, [8*r3]
3712 sub r0, r4
3713 sub r2, r5
3714 add r2, 16*SIZEOF_PIXEL
3715 add r0, 16*SIZEOF_PIXEL
3716 lea r4, [3*r1]
3717 lea r5, [3*r3]
3718 SA8D_16x16
3719 lea r4, [8*r1]
3720 lea r5, [8*r3]
3721 sub r0, r4
3722 sub r2, r5
3723 add r2, 16*SIZEOF_PIXEL
3724 add r0, 16*SIZEOF_PIXEL
3725 lea r4, [3*r1]
3726 lea r5, [3*r3]
3727 SA8D_16x16
3728 lea r0, [r0+8*r1]
3729 lea r2, [r2+8*r3]
3730 SA8D_16x16
3731 lea r4, [8*r1]
3732 lea r5, [8*r3]
3733 sub r0, r4
3734 sub r2, r5
3735 sub r2, 16*SIZEOF_PIXEL
3736 sub r0, 16*SIZEOF_PIXEL
3737 lea r4, [3*r1]
3738 lea r5, [3*r3]
3739 SA8D_16x16
3740 lea r4, [8*r1]
3741 lea r5, [8*r3]
3742 sub r0, r4
3743 sub r2, r5
3744 sub r2, 16*SIZEOF_PIXEL
3745 sub r0, 16*SIZEOF_PIXEL
3746 lea r4, [3*r1]
3747 lea r5, [3*r3]
3748 SA8D_16x16
3749 lea r4, [8*r1]
3750 lea r5, [8*r3]
3751 sub r0, r4
3752 sub r2, r5
3753 sub r2, 16*SIZEOF_PIXEL
3754 sub r0, 16*SIZEOF_PIXEL
3755 lea r4, [3*r1]
3756 lea r5, [3*r3]
3757 SA8D_16x16
3758 lea r0, [r0+8*r1]
3759 lea r2, [r2+8*r3]
3760 SA8D_16x16
3761 lea r4, [8*r1]
3762 lea r5, [8*r3]
3763 sub r0, r4
3764 sub r2, r5
3765 add r2, 16*SIZEOF_PIXEL
3766 add r0, 16*SIZEOF_PIXEL
3767 lea r4, [3*r1]
3768 lea r5, [3*r3]
3769 SA8D_16x16
3770 lea r4, [8*r1]
3771 lea r5, [8*r3]
3772 sub r0, r4
3773 sub r2, r5
3774 add r2, 16*SIZEOF_PIXEL
3775 add r0, 16*SIZEOF_PIXEL
3776 lea r4, [3*r1]
3777 lea r5, [3*r3]
3778 SA8D_16x16
3779 lea r4, [8*r1]
3780 lea r5, [8*r3]
3781 sub r0, r4
3782 sub r2, r5
3783 add r2, 16*SIZEOF_PIXEL
3784 add r0, 16*SIZEOF_PIXEL
3785 lea r4, [3*r1]
3786 lea r5, [3*r3]
3787 SA8D_16x16
3788 lea r0, [r0+8*r1]
3789 lea r2, [r2+8*r3]
3790 SA8D_16x16
3791 lea r4, [8*r1]
3792 lea r5, [8*r3]
3793 sub r0, r4
3794 sub r2, r5
3795 sub r2, 16*SIZEOF_PIXEL
3796 sub r0, 16*SIZEOF_PIXEL
3797 lea r4, [3*r1]
3798 lea r5, [3*r3]
3799 SA8D_16x16
3800 lea r4, [8*r1]
3801 lea r5, [8*r3]
3802 sub r0, r4
3803 sub r2, r5
3804 sub r2, 16*SIZEOF_PIXEL
3805 sub r0, 16*SIZEOF_PIXEL
3806 lea r4, [3*r1]
3807 lea r5, [3*r3]
3808 SA8D_16x16
3809 lea r4, [8*r1]
3810 lea r5, [8*r3]
3811 sub r0, r4
3812 sub r2, r5
3813 sub r2, 16*SIZEOF_PIXEL
3814 sub r0, 16*SIZEOF_PIXEL
3815 lea r4, [3*r1]
3816 lea r5, [3*r3]
3817 SA8D_16x16
3818 movd eax, m12
3819 RET
3820
3821 %else ; ARCH_X86_32
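; x86-32: only 8 xmm registers, so intermediate sums are spilled to an aligned
; scratch area below esp set up by each function (and esp, ~15).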
3822 %if mmsize == 16
3823 cglobal pixel_sa8d_8x8_internal
3824 %define spill0 [esp+4]
3825 %define spill1 [esp+20]
3826 %define spill2 [esp+36]
3827 %if vertical
3828 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
3829 HADAMARD4_2D 0, 1, 2, 3, 4
3830 movdqa spill0, m3
3831 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
3832 HADAMARD4_2D 4, 5, 6, 7, 3
3833 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
3834 movdqa m3, spill0
3835 paddw m0, m1
3836 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
3837 %else ; non-sse2
3838 mova m7, [hmul_8p]
3839 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
3840 ; could do first HADAMARD4_V here to save spilling later
3841 ; surprisingly, not a win on conroe or even p4
3842 mova spill0, m2
3843 mova spill1, m3
3844 mova spill2, m1
3845 SWAP 1, 7
3846 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
3847 HADAMARD4_V 4, 5, 6, 7, 3
3848 mova m1, spill2
3849 mova m2, spill0
3850 mova m3, spill1
3851 mova spill0, m6
3852 mova spill1, m7
3853 HADAMARD4_V 0, 1, 2, 3, 7
3854 SUMSUB_BADC w, 0, 4, 1, 5, 7
3855 HADAMARD 2, sumsub, 0, 4, 7, 6
3856 HADAMARD 2, sumsub, 1, 5, 7, 6
3857 HADAMARD 1, amax, 0, 4, 7, 6
3858 HADAMARD 1, amax, 1, 5, 7, 6
3859 mova m6, spill0
3860 mova m7, spill1
3861 paddw m0, m1
3862 SUMSUB_BADC w, 2, 6, 3, 7, 4
3863 HADAMARD 2, sumsub, 2, 6, 4, 5
3864 HADAMARD 2, sumsub, 3, 7, 4, 5
3865 HADAMARD 1, amax, 2, 6, 4, 5
3866 HADAMARD 1, amax, 3, 7, 4, 5
3867 %endif ; sse2/non-sse2
3868 paddw m0, m2
3869 paddw m0, m3
3870 SAVE_MM_PERMUTATION
3871 ret
3872 %endif ; mmsize == 16
3873
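; identical to the vertical (sse2) path of pixel_sa8d_8x8_internal above; used as
; the building block for the 32-bit NxM functions that follow.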
3874 cglobal pixel_sa8d_8x8_internal2
3875 %define spill0 [esp+4]
3876 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
3877 HADAMARD4_2D 0, 1, 2, 3, 4
3878 movdqa spill0, m3
3879 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
3880 HADAMARD4_2D 4, 5, 6, 7, 3
3881 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
3882 movdqa m3, spill0
3883 paddw m0, m1
3884 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
3885 paddw m0, m2
3886 paddw m0, m3
3887 SAVE_MM_PERMUTATION
3888 ret
3889
3890 cglobal pixel_sa8d_8x8, 4,7
3891 FIX_STRIDES r1, r3
3892 mov r6, esp
3893 and esp, ~15
3894 sub esp, 48
3895 lea r4, [3*r1]
3896 lea r5, [3*r3]
3897 call pixel_sa8d_8x8_internal
3898 %if HIGH_BIT_DEPTH
3899 HADDUW m0, m1
3900 %else
3901 HADDW m0, m1
3902 %endif ; HIGH_BIT_DEPTH
3903 movd eax, m0
3904 add eax, 1
3905 shr eax, 1
3906 mov esp, r6
3907 RET
3908
3909 cglobal pixel_sa8d_16x16, 4,7
3910 FIX_STRIDES r1, r3
3911 mov r6, esp
3912 and esp, ~15
3913 sub esp, 64
3914 lea r4, [3*r1]
3915 lea r5, [3*r3]
3916 call pixel_sa8d_8x8_internal
3917 %if mmsize == 8
3918 lea r0, [r0+4*r1]
3919 lea r2, [r2+4*r3]
3920 %endif
3921 %if HIGH_BIT_DEPTH
3922 HADDUW m0, m1
3923 %endif
3924 mova [esp+48], m0
3925 call pixel_sa8d_8x8_internal
3926 mov r0, [r6+20]
3927 mov r2, [r6+28]
3928 add r0, 8*SIZEOF_PIXEL
3929 add r2, 8*SIZEOF_PIXEL
3930 SA8D_INTER
3931 mova [esp+48], m0
3932 call pixel_sa8d_8x8_internal
3933 %if mmsize == 8
3934 lea r0, [r0+4*r1]
3935 lea r2, [r2+4*r3]
3936 %else
3937 SA8D_INTER
3938 %endif
3939 mova [esp+64-mmsize], m0
3940 call pixel_sa8d_8x8_internal
3941 %if HIGH_BIT_DEPTH
3942 SA8D_INTER
3943 %else ; !HIGH_BIT_DEPTH
3944 paddusw m0, [esp+64-mmsize]
3945 %if mmsize == 16
3946 HADDUW m0, m1
3947 %else
3948 mova m2, [esp+48]
3949 pxor m7, m7
3950 mova m1, m0
3951 mova m3, m2
3952 punpcklwd m0, m7
3953 punpckhwd m1, m7
3954 punpcklwd m2, m7
3955 punpckhwd m3, m7
3956 paddd m0, m1
3957 paddd m2, m3
3958 paddd m0, m2
3959 HADDD m0, m1
3960 %endif
3961 %endif ; HIGH_BIT_DEPTH
3962 movd eax, m0
3963 add eax, 1
3964 shr eax, 1
3965 mov esp, r6
3966 RET
3967
3968 cglobal pixel_sa8d_8x16, 4,7,8
3969 FIX_STRIDES r1, r3
3970 mov r6, esp
3971 and esp, ~15
3972 sub esp, 64
3973
3974 lea r4, [r1 + 2*r1]
3975 lea r5, [r3 + 2*r3]
3976 call pixel_sa8d_8x8_internal2
3977 HADDUW m0, m1
3978 movd r4d, m0
3979 add r4d, 1
3980 shr r4d, 1
3981 mov dword [esp+36], r4d
3982
3983 mov r0, [r6+20]
3984 mov r2, [r6+28]
3985 lea r0, [r0 + r1*8]
3986 lea r2, [r2 + r3*8]
3987 lea r4, [r1 + 2*r1]
3988 call pixel_sa8d_8x8_internal2
3989 HADDUW m0, m1
3990 movd r4d, m0
3991 add r4d, 1
3992 shr r4d, 1
3993 add r4d, dword [esp+36]
3994 mov eax, r4d
3995 mov esp, r6
3996 RET
3997
3998 cglobal pixel_sa8d_8x32, 4,7,8
3999 FIX_STRIDES r1, r3
4000 mov r6, esp
4001 and esp, ~15
4002 sub esp, 64
4003
4004 lea r4, [r1 + 2*r1]
4005 lea r5, [r3 + 2*r3]
4006 call pixel_sa8d_8x8_internal2
4007 HADDUW m0, m1
4008 movd r4d, m0
4009 add r4d, 1
4010 shr r4d, 1
4011 mov dword [esp+36], r4d
4012
4013 mov r0, [r6+20]
4014 mov r2, [r6+28]
4015 lea r0, [r0 + r1*8]
4016 lea r2, [r2 + r3*8]
4017 lea r4, [r1 + 2*r1]
4018 call pixel_sa8d_8x8_internal2
4019 HADDUW m0, m1
4020 movd r4d, m0
4021 add r4d, 1
4022 shr r4d, 1
4023 add r4d, dword [esp+36]
4024 mov dword [esp+36], r4d
4025
4026 mov r0, [r6+20]
4027 mov r2, [r6+28]
4028 lea r0, [r0 + r1*8]
4029 lea r2, [r2 + r3*8]
4030 lea r0, [r0 + r1*8]
4031 lea r2, [r2 + r3*8]
4032 lea r4, [r1 + 2*r1]
4033 call pixel_sa8d_8x8_internal2
4034 HADDUW m0, m1
4035 movd r4d, m0
4036 add r4d, 1
4037 shr r4d, 1
4038 add r4d, dword [esp+36]
4039 mov dword [esp+36], r4d
4040
4041 mov r0, [r6+20]
4042 mov r2, [r6+28]
4043 lea r0, [r0 + r1*8]
4044 lea r2, [r2 + r3*8]
4045 lea r0, [r0 + r1*8]
4046 lea r2, [r2 + r3*8]
4047 lea r0, [r0 + r1*8]
4048 lea r2, [r2 + r3*8]
4049 lea r4, [r1 + 2*r1]
4050 call pixel_sa8d_8x8_internal2
4051 HADDUW m0, m1
4052 movd r4d, m0
4053 add r4d, 1
4054 shr r4d, 1
4055 add r4d, dword [esp+36]
4056 mov eax, r4d
4057 mov esp, r6
4058 RET
4059
4060 cglobal pixel_sa8d_16x8, 4,7,8
4061 FIX_STRIDES r1, r3
4062 mov r6, esp
4063 and esp, ~15
4064 sub esp, 64
4065
4066 lea r4, [r1 + 2*r1]
4067 lea r5, [r3 + 2*r3]
4068 call pixel_sa8d_8x8_internal2
4069 HADDUW m0, m1
4070 movd r4d, m0
4071 add r4d, 1
4072 shr r4d, 1
4073 mov dword [esp+36], r4d
4074
4075 mov r0, [r6+20]
4076 mov r2, [r6+28]
4077 add r0, 8*SIZEOF_PIXEL
4078 add r2, 8*SIZEOF_PIXEL
4079 lea r4, [r1 + 2*r1]
4080 call pixel_sa8d_8x8_internal2
4081 HADDUW m0, m1
4082 movd r4d, m0
4083 add r4d, 1
4084 shr r4d, 1
4085 add r4d, dword [esp+36]
4086 mov eax, r4d
4087 mov esp, r6
4088 RET
4089
4090 cglobal pixel_sa8d_16x32, 4,7,8
4091 FIX_STRIDES r1, r3
4092 mov r6, esp
4093 and esp, ~15
4094 sub esp, 64
4095
4096 lea r4, [r1 + 2*r1]
4097 lea r5, [r3 + 2*r3]
4098 call pixel_sa8d_8x8_internal2
4099 %if HIGH_BIT_DEPTH
4100 HADDUW m0, m1
4101 %endif
4102 mova [esp+48], m0
4103 call pixel_sa8d_8x8_internal2
4104 SA8D_INTER
4105 mova [esp+48], m0
4106
4107 mov r0, [r6+20]
4108 mov r2, [r6+28]
4109 add r0, 8*SIZEOF_PIXEL
4110 add r2, 8*SIZEOF_PIXEL
4111 call pixel_sa8d_8x8_internal2
4112 SA8D_INTER
4113 mova [esp+48], m0
4114 call pixel_sa8d_8x8_internal2
4115 SA8D_INTER
4116 %if HIGH_BIT_DEPTH == 0
4117 HADDUW m0, m1
4118 %endif
4119 movd r4d, m0
4120 add r4d, 1
4121 shr r4d, 1
4122 mov dword [esp+36], r4d
4123
4124 mov r0, [r6+20]
4125 mov r2, [r6+28]
4126 lea r0, [r0 + r1*8]
4127 lea r2, [r2 + r3*8]
4128 lea r0, [r0 + r1*8]
4129 lea r2, [r2 + r3*8]
4130 lea r4, [r1 + 2*r1]
4131 call pixel_sa8d_8x8_internal2
4132 %if HIGH_BIT_DEPTH
4133 HADDUW m0, m1
4134 %endif
4135 mova [esp+48], m0
4136 call pixel_sa8d_8x8_internal2
4137 SA8D_INTER
4138 mova [esp+48], m0
4139
4140 mov r0, [r6+20]
4141 mov r2, [r6+28]
4142 lea r0, [r0 + r1*8]
4143 lea r2, [r2 + r3*8]
4144 lea r0, [r0 + r1*8]
4145 lea r2, [r2 + r3*8]
4146 add r0, 8*SIZEOF_PIXEL
4147 add r2, 8*SIZEOF_PIXEL
4148 call pixel_sa8d_8x8_internal2
4149 SA8D_INTER
4150 mova [esp+48], m0
4151 call pixel_sa8d_8x8_internal2
4152 SA8D_INTER
4153 %if HIGH_BIT_DEPTH == 0
4154 HADDUW m0, m1
4155 %endif
4156 movd r4d, m0
4157 add r4d, 1
4158 shr r4d, 1
4159 add r4d, dword [esp+36]
4160 mov eax, r4d
4161 mov esp, r6
4162 RET
4163
4164 cglobal pixel_sa8d_16x64, 4,7,8
4165 FIX_STRIDES r1, r3
4166 mov r6, esp
4167 and esp, ~15
4168 sub esp, 64
4169
4170 lea r4, [r1 + 2*r1]
4171 lea r5, [r3 + 2*r3]
4172 call pixel_sa8d_8x8_internal2
4173 %if HIGH_BIT_DEPTH
4174 HADDUW m0, m1
4175 %endif
4176 mova [esp+48], m0
4177 call pixel_sa8d_8x8_internal2
4178 SA8D_INTER
4179 mova [esp+48], m0
4180
4181 mov r0, [r6+20]
4182 mov r2, [r6+28]
4183 add r0, 8*SIZEOF_PIXEL
4184 add r2, 8*SIZEOF_PIXEL
4185 call pixel_sa8d_8x8_internal2
4186 SA8D_INTER
4187 mova [esp+48], m0
4188 call pixel_sa8d_8x8_internal2
4189 SA8D_INTER
4190 %if HIGH_BIT_DEPTH == 0
4191 HADDUW m0, m1
4192 %endif
4193 movd r4d, m0
4194 add r4d, 1
4195 shr r4d, 1
4196 mov dword [esp+36], r4d
4197
4198 mov r0, [r6+20]
4199 mov r2, [r6+28]
4200 lea r0, [r0 + r1*8]
4201 lea r2, [r2 + r3*8]
4202 lea r0, [r0 + r1*8]
4203 lea r2, [r2 + r3*8]
4204 mov [r6+20], r0
4205 mov [r6+28], r2
4206
4207 lea r4, [r1 + 2*r1]
4208 call pixel_sa8d_8x8_internal2
4209 %if HIGH_BIT_DEPTH
4210 HADDUW m0, m1
4211 %endif
4212 mova [esp+48], m0
4213 call pixel_sa8d_8x8_internal2
4214 SA8D_INTER
4215 mova [esp+48], m0
4216
4217 mov r0, [r6+20]
4218 mov r2, [r6+28]
4219 add r0, 8*SIZEOF_PIXEL
4220 add r2, 8*SIZEOF_PIXEL
4221 call pixel_sa8d_8x8_internal2
4222 SA8D_INTER
4223 mova [esp+64-mmsize], m0
4224 call pixel_sa8d_8x8_internal2
4225 AVG_16x16
4226
4227 mov r0, [r6+20]
4228 mov r2, [r6+28]
4229 lea r0, [r0 + r1*8]
4230 lea r2, [r2 + r3*8]
4231 lea r0, [r0 + r1*8]
4232 lea r2, [r2 + r3*8]
4233 mov [r6+20], r0
4234 mov [r6+28], r2
4235
4236 lea r4, [r1 + 2*r1]
4237 call pixel_sa8d_8x8_internal2
4238 %if HIGH_BIT_DEPTH
4239 HADDUW m0, m1
4240 %endif
4241 mova [esp+48], m0
4242 call pixel_sa8d_8x8_internal2
4243 SA8D_INTER
4244 mova [esp+48], m0
4245
4246 mov r0, [r6+20]
4247 mov r2, [r6+28]
4248 add r0, 8*SIZEOF_PIXEL
4249 add r2, 8*SIZEOF_PIXEL
4250 call pixel_sa8d_8x8_internal2
4251 SA8D_INTER
4252 mova [esp+64-mmsize], m0
4253 call pixel_sa8d_8x8_internal2
4254 AVG_16x16
4255
4256 mov r0, [r6+20]
4257 mov r2, [r6+28]
4258 lea r0, [r0 + r1*8]
4259 lea r2, [r2 + r3*8]
4260 lea r0, [r0 + r1*8]
4261 lea r2, [r2 + r3*8]
4262 mov [r6+20], r0
4263 mov [r6+28], r2
4264
4265 lea r4, [r1 + 2*r1]
4266 call pixel_sa8d_8x8_internal2
4267 %if HIGH_BIT_DEPTH
4268 HADDUW m0, m1
4269 %endif
4270 mova [esp+48], m0
4271 call pixel_sa8d_8x8_internal2
4272 SA8D_INTER
4273 mova [esp+48], m0
4274
4275 mov r0, [r6+20]
4276 mov r2, [r6+28]
4277 add r0, 8*SIZEOF_PIXEL
4278 add r2, 8*SIZEOF_PIXEL
4279 call pixel_sa8d_8x8_internal2
4280 SA8D_INTER
4281 mova [esp+64-mmsize], m0
4282 call pixel_sa8d_8x8_internal2
4283 SA8D_INTER
4284 %if HIGH_BIT_DEPTH == 0
4285 HADDUW m0, m1
4286 %endif
4287 movd r4d, m0
4288 add r4d, 1
4289 shr r4d, 1
4290 add r4d, dword [esp+36]
4291 mov eax, r4d
4292 mov esp, r6
4293 RET
4294
4295 cglobal pixel_sa8d_24x32, 4,7,8
4296 FIX_STRIDES r1, r3
4297 mov r6, esp
4298 and esp, ~15
4299 sub esp, 64
4300
4301 lea r4, [r1 + 2*r1]
4302 lea r5, [r3 + 2*r3]
4303 call pixel_sa8d_8x8_internal2
4304 HADDUW m0, m1
4305 movd r4d, m0
4306 add r4d, 1
4307 shr r4d, 1
4308 mov dword [esp+36], r4d
4309
4310 mov r0, [r6+20]
4311 mov r2, [r6+28]
4312 add r0, 8*SIZEOF_PIXEL
4313 add r2, 8*SIZEOF_PIXEL
4314 lea r4, [r1 + 2*r1]
4315 call pixel_sa8d_8x8_internal2
4316 HADDUW m0, m1
4317 movd r4d, m0
4318 add r4d, 1
4319 shr r4d, 1
4320 add r4d, dword [esp+36]
4321 mov dword [esp+36], r4d
4322
4323 mov r0, [r6+20]
4324 mov r2, [r6+28]
4325 add r0, 16*SIZEOF_PIXEL
4326 add r2, 16*SIZEOF_PIXEL
4327 lea r4, [r1 + 2*r1]
4328 call pixel_sa8d_8x8_internal2
4329 HADDUW m0, m1
4330 movd r4d, m0
4331 add r4d, 1
4332 shr r4d, 1
4333 add r4d, dword [esp+36]
4334 mov dword [esp+36], r4d
4335
4336 mov r0, [r6+20]
4337 mov r2, [r6+28]
4338 lea r0, [r0 + r1*8]
4339 lea r2, [r2 + r3*8]
4340 mov [r6+20], r0
4341 mov [r6+28], r2
4342 lea r4, [r1 + 2*r1]
4343 call pixel_sa8d_8x8_internal2
4344 HADDUW m0, m1
4345 movd r4d, m0
4346 add r4d, 1
4347 shr r4d, 1
4348 add r4d, dword [esp+36]
4349 mov dword [esp+36], r4d
4350
4351 mov r0, [r6+20]
4352 mov r2, [r6+28]
4353 add r0, 8*SIZEOF_PIXEL
4354 add r2, 8*SIZEOF_PIXEL
4355 lea r4, [r1 + 2*r1]
4356 call pixel_sa8d_8x8_internal2
4357 HADDUW m0, m1
4358 movd r4d, m0
4359 add r4d, 1
4360 shr r4d, 1
4361 add r4d, dword [esp+36]
4362 mov dword [esp+36], r4d
4363
4364 mov r0, [r6+20]
4365 mov r2, [r6+28]
4366 add r0, 16*SIZEOF_PIXEL
4367 add r2, 16*SIZEOF_PIXEL
4368 lea r4, [r1 + 2*r1]
4369 call pixel_sa8d_8x8_internal2
4370 HADDUW m0, m1
4371 movd r4d, m0
4372 add r4d, 1
4373 shr r4d, 1
4374 add r4d, dword [esp+36]
4375 mov dword [esp+36], r4d
4376
4377 mov r0, [r6+20]
4378 mov r2, [r6+28]
4379 lea r0, [r0 + r1*8]
4380 lea r2, [r2 + r3*8]
4381 mov [r6+20], r0
4382 mov [r6+28], r2
4383 lea r4, [r1 + 2*r1]
4384 call pixel_sa8d_8x8_internal2
4385 HADDUW m0, m1
4386 movd r4d, m0
4387 add r4d, 1
4388 shr r4d, 1
4389 add r4d, dword [esp+36]
4390 mov dword [esp+36], r4d
4391
4392 mov r0, [r6+20]
4393 mov r2, [r6+28]
4394 add r0, 8*SIZEOF_PIXEL
4395 add r2, 8*SIZEOF_PIXEL
4396 lea r4, [r1 + 2*r1]
4397 call pixel_sa8d_8x8_internal2
4398 HADDUW m0, m1
4399 movd r4d, m0
4400 add r4d, 1
4401 shr r4d, 1
4402 add r4d, dword [esp+36]
4403 mov dword [esp+36], r4d
4404
4405 mov r0, [r6+20]
4406 mov r2, [r6+28]
4407 add r0, 16*SIZEOF_PIXEL
4408 add r2, 16*SIZEOF_PIXEL
4409 lea r4, [r1 + 2*r1]
4410 call pixel_sa8d_8x8_internal2
4411 HADDUW m0, m1
4412 movd r4d, m0
4413 add r4d, 1
4414 shr r4d, 1
4415 add r4d, dword [esp+36]
4416 mov dword [esp+36], r4d
4417
4418 mov r0, [r6+20]
4419 mov r2, [r6+28]
4420 lea r0, [r0 + r1*8]
4421 lea r2, [r2 + r3*8]
4422 mov [r6+20], r0
4423 mov [r6+28], r2
4424 lea r4, [r1 + 2*r1]
4425 call pixel_sa8d_8x8_internal2
4426 HADDUW m0, m1
4427 movd r4d, m0
4428 add r4d, 1
4429 shr r4d, 1
4430 add r4d, dword [esp+36]
4431 mov dword [esp+36], r4d
4432
4433 mov r0, [r6+20]
4434 mov r2, [r6+28]
4435 add r0, 8*SIZEOF_PIXEL
4436 add r2, 8*SIZEOF_PIXEL
4437 lea r4, [r1 + 2*r1]
4438 call pixel_sa8d_8x8_internal2
4439 HADDUW m0, m1
4440 movd r4d, m0
4441 add r4d, 1
4442 shr r4d, 1
4443 add r4d, dword [esp+36]
4444 mov dword [esp+36], r4d
4445
4446 mov r0, [r6+20]
4447 mov r2, [r6+28]
4448 add r0, 16*SIZEOF_PIXEL
4449 add r2, 16*SIZEOF_PIXEL
4450 lea r4, [r1 + 2*r1]
4451 call pixel_sa8d_8x8_internal2
4452 HADDUW m0, m1
4453 movd r4d, m0
4454 add r4d, 1
4455 shr r4d, 1
4456 add r4d, dword [esp+36]
4457 mov eax, r4d
4458 mov esp, r6
4459 RET
4460
4461 cglobal pixel_sa8d_32x8, 4,7,8
4462 FIX_STRIDES r1, r3
4463 mov r6, esp
4464 and esp, ~15
4465 sub esp, 64
4466
4467 lea r4, [r1 + 2*r1]
4468 lea r5, [r3 + 2*r3]
4469 call pixel_sa8d_8x8_internal2
4470 HADDUW m0, m1
4471 movd r4d, m0
4472 add r4d, 1
4473 shr r4d, 1
4474 mov dword [esp+36], r4d
4475
4476 mov r0, [r6+20]
4477 mov r2, [r6+28]
4478 add r0, 8*SIZEOF_PIXEL
4479 add r2, 8*SIZEOF_PIXEL
4480 lea r4, [r1 + 2*r1]
4481 call pixel_sa8d_8x8_internal2
4482 HADDUW m0, m1
4483 movd r4d, m0
4484 add r4d, 1
4485 shr r4d, 1
4486 add r4d, dword [esp+36]
4487 mov dword [esp+36], r4d
4488
4489 mov r0, [r6+20]
4490 mov r2, [r6+28]
4491 add r0, 16*SIZEOF_PIXEL
4492 add r2, 16*SIZEOF_PIXEL
4493 lea r4, [r1 + 2*r1]
4494 call pixel_sa8d_8x8_internal2
4495 HADDUW m0, m1
4496 movd r4d, m0
4497 add r4d, 1
4498 shr r4d, 1
4499 add r4d, dword [esp+36]
4500 mov dword [esp+36], r4d
4501
4502 mov r0, [r6+20]
4503 mov r2, [r6+28]
4504 add r0, 24*SIZEOF_PIXEL
4505 add r2, 24*SIZEOF_PIXEL
4506 lea r4, [r1 + 2*r1]
4507 call pixel_sa8d_8x8_internal2
4508 HADDUW m0, m1
4509 movd r4d, m0
4510 add r4d, 1
4511 shr r4d, 1
4512 add r4d, dword [esp+36]
4513 mov eax, r4d
4514 mov esp, r6
4515 RET
4516
4517 cglobal pixel_sa8d_32x16, 4,7,8
4518 FIX_STRIDES r1, r3
4519 mov r6, esp
4520 and esp, ~15
4521 sub esp, 64
4522
4523 lea r4, [r1 + 2*r1]
4524 lea r5, [r3 + 2*r3]
4525 call pixel_sa8d_8x8_internal2
4526 %if HIGH_BIT_DEPTH
4527 HADDUW m0, m1
4528 %endif
4529 mova [rsp+48], m0
4530 call pixel_sa8d_8x8_internal2
4531 SA8D_INTER
4532 mova [esp+48], m0
4533
4534 mov r0, [r6+20]
4535 mov r2, [r6+28]
4536 add r0, 8*SIZEOF_PIXEL
4537 add r2, 8*SIZEOF_PIXEL
4538 call pixel_sa8d_8x8_internal2
4539 SA8D_INTER
4540 mova [esp+48], m0
4541 call pixel_sa8d_8x8_internal2
4542 SA8D_INTER
4543 %if HIGH_BIT_DEPTH == 0
4544 HADDUW m0, m1
4545 %endif
4546 movd r4d, m0
4547 add r4d, 1
4548 shr r4d, 1
4549 mov dword [esp+36], r4d
4550
4551 mov r0, [r6+20]
4552 mov r2, [r6+28]
4553 add r0, 16*SIZEOF_PIXEL
4554 add r2, 16*SIZEOF_PIXEL
4555 lea r4, [r1 + 2*r1]
4556 call pixel_sa8d_8x8_internal2
4557 %if HIGH_BIT_DEPTH
4558 HADDUW m0, m1
4559 %endif
4560 mova [esp+48], m0
4561 call pixel_sa8d_8x8_internal2
4562 SA8D_INTER
4563 mova [esp+48], m0
4564
4565 mov r0, [r6+20]
4566 mov r2, [r6+28]
4567 add r0, 24*SIZEOF_PIXEL
4568 add r2, 24*SIZEOF_PIXEL
4569 call pixel_sa8d_8x8_internal2
4570 SA8D_INTER
4571 mova [esp+64-mmsize], m0
4572 call pixel_sa8d_8x8_internal2
4573 SA8D_INTER
4574 %if HIGH_BIT_DEPTH == 0
4575 HADDUW m0, m1
4576 %endif
4577 movd r4d, m0
4578 add r4d, 1
4579 shr r4d, 1
4580 add r4d, dword [esp+36]
4581 mov eax, r4d
4582 mov esp, r6
4583 RET
4584
4585 cglobal pixel_sa8d_32x24, 4,7,8
4586 FIX_STRIDES r1, r3
4587 mov r6, esp
4588 and esp, ~15
4589 sub esp, 64
4590
4591 lea r4, [r1 + 2*r1]
4592 lea r5, [r3 + 2*r3]
4593 call pixel_sa8d_8x8_internal2
4594 HADDUW m0, m1
4595 movd r4d, m0
4596 add r4d, 1
4597 shr r4d, 1
4598 mov dword [esp+36], r4d
4599
4600 mov r0, [r6+20]
4601 mov r2, [r6+28]
4602 add r0, 8*SIZEOF_PIXEL
4603 add r2, 8*SIZEOF_PIXEL
4604 lea r4, [r1 + 2*r1]
4605 call pixel_sa8d_8x8_internal2
4606 HADDUW m0, m1
4607 movd r4d, m0
4608 add r4d, 1
4609 shr r4d, 1
4610 add r4d, dword [esp+36]
4611 mov dword [esp+36], r4d
4612
4613 mov r0, [r6+20]
4614 mov r2, [r6+28]
4615 add r0, 16*SIZEOF_PIXEL
4616 add r2, 16*SIZEOF_PIXEL
4617 lea r4, [r1 + 2*r1]
4618 call pixel_sa8d_8x8_internal2
4619 HADDUW m0, m1
4620 movd r4d, m0
4621 add r4d, 1
4622 shr r4d, 1
4623 add r4d, dword [esp+36]
4624 mov dword [esp+36], r4d
4625
4626 mov r0, [r6+20]
4627 mov r2, [r6+28]
4628 add r0, 24*SIZEOF_PIXEL
4629 add r2, 24*SIZEOF_PIXEL
4630 lea r4, [r1 + 2*r1]
4631 call pixel_sa8d_8x8_internal2
4632 HADDUW m0, m1
4633 movd r4d, m0
4634 add r4d, 1
4635 shr r4d, 1
4636 add r4d, dword [esp+36]
4637 mov dword [esp+36], r4d
4638
4639 mov r0, [r6+20]
4640 mov r2, [r6+28]
4641 lea r0, [r0 + r1*8]
4642 lea r2, [r2 + r3*8]
4643 mov [r6+20], r0
4644 mov [r6+28], r2
4645 lea r4, [r1 + 2*r1]
4646 call pixel_sa8d_8x8_internal2
4647 HADDUW m0, m1
4648 movd r4d, m0
4649 add r4d, 1
4650 shr r4d, 1
4651 add r4d, dword [esp+36]
4652 mov dword [esp+36], r4d
4653
4654 mov r0, [r6+20]
4655 mov r2, [r6+28]
4656 add r0, 8*SIZEOF_PIXEL
4657 add r2, 8*SIZEOF_PIXEL
4658 lea r4, [r1 + 2*r1]
4659 call pixel_sa8d_8x8_internal2
4660 HADDUW m0, m1
4661 movd r4d, m0
4662 add r4d, 1
4663 shr r4d, 1
4664 add r4d, dword [esp+36]
4665 mov dword [esp+36], r4d
4666
4667 mov r0, [r6+20]
4668 mov r2, [r6+28]
4669 add r0, 16*SIZEOF_PIXEL
4670 add r2, 16*SIZEOF_PIXEL
4671 lea r4, [r1 + 2*r1]
4672 call pixel_sa8d_8x8_internal2
4673 HADDUW m0, m1
4674 movd r4d, m0
4675 add r4d, 1
4676 shr r4d, 1
4677 add r4d, dword [esp+36]
4678 mov dword [esp+36], r4d
4679
4680 mov r0, [r6+20]
4681 mov r2, [r6+28]
4682 add r0, 24*SIZEOF_PIXEL
4683 add r2, 24*SIZEOF_PIXEL
4684 lea r4, [r1 + 2*r1]
4685 call pixel_sa8d_8x8_internal2
4686 HADDUW m0, m1
4687 movd r4d, m0
4688 add r4d, 1
4689 shr r4d, 1
4690 add r4d, dword [esp+36]
4691 mov dword [esp+36], r4d
4692
4693 mov r0, [r6+20]
4694 mov r2, [r6+28]
4695 lea r0, [r0 + r1*8]
4696 lea r2, [r2 + r3*8]
4697 mov [r6+20], r0
4698 mov [r6+28], r2
4699 lea r4, [r1 + 2*r1]
4700 call pixel_sa8d_8x8_internal2
4701 HADDUW m0, m1
4702 movd r4d, m0
4703 add r4d, 1
4704 shr r4d, 1
4705 add r4d, dword [esp+36]
4706 mov dword [esp+36], r4d
4707
4708 mov r0, [r6+20]
4709 mov r2, [r6+28]
4710 add r0, 8*SIZEOF_PIXEL
4711 add r2, 8*SIZEOF_PIXEL
4712 lea r4, [r1 + 2*r1]
4713 call pixel_sa8d_8x8_internal2
4714 HADDUW m0, m1
4715 movd r4d, m0
4716 add r4d, 1
4717 shr r4d, 1
4718 add r4d, dword [esp+36]
4719 mov dword [esp+36], r4d
4720
4721 mov r0, [r6+20]
4722 mov r2, [r6+28]
4723 add r0, 16*SIZEOF_PIXEL
4724 add r2, 16*SIZEOF_PIXEL
4725 lea r4, [r1 + 2*r1]
4726 call pixel_sa8d_8x8_internal2
4727 HADDUW m0, m1
4728 movd r4d, m0
4729 add r4d, 1
4730 shr r4d, 1
4731 add r4d, dword [esp+36]
4732 mov dword [esp+36], r4d
4733
4734 mov r0, [r6+20]
4735 mov r2, [r6+28]
4736 add r0, 24*SIZEOF_PIXEL
4737 add r2, 24*SIZEOF_PIXEL
4738 lea r4, [r1 + 2*r1]
4739 call pixel_sa8d_8x8_internal2
4740 HADDUW m0, m1
4741 movd r4d, m0
4742 add r4d, 1
4743 shr r4d, 1
4744 add r4d, dword [esp+36]
4745 mov eax, r4d
4746 mov esp, r6
4747 RET
4748
4749 cglobal pixel_sa8d_32x32, 4,7,8
4750 FIX_STRIDES r1, r3
4751 mov r6, esp
4752 and esp, ~15
4753 sub esp, 64
4754
4755 lea r4, [r1 + 2*r1]
4756 lea r5, [r3 + 2*r3]
4757 call pixel_sa8d_8x8_internal2
4758 %if HIGH_BIT_DEPTH
4759 HADDUW m0, m1
4760 %endif
4761 mova [rsp+48], m0
4762 call pixel_sa8d_8x8_internal2
4763 SA8D_INTER
4764 mova [esp+48], m0
4765
4766 mov r0, [r6+20]
4767 mov r2, [r6+28]
4768 add r0, 8*SIZEOF_PIXEL
4769 add r2, 8*SIZEOF_PIXEL
4770 call pixel_sa8d_8x8_internal2
4771 SA8D_INTER
4772 mova [esp+48], m0
4773 call pixel_sa8d_8x8_internal2
4774 SA8D_INTER
4775 %if HIGH_BIT_DEPTH == 0
4776 HADDUW m0, m1
4777 %endif
4778 movd r4d, m0
4779 add r4d, 1
4780 shr r4d, 1
4781 mov dword [esp+36], r4d
4782
4783 mov r0, [r6+20]
4784 mov r2, [r6+28]
4785 add r0, 16*SIZEOF_PIXEL
4786 add r2, 16*SIZEOF_PIXEL
4787 lea r4, [r1 + 2*r1]
4788 call pixel_sa8d_8x8_internal2
4789 %if HIGH_BIT_DEPTH
4790 HADDUW m0, m1
4791 %endif
4792 mova [esp+48], m0
4793 call pixel_sa8d_8x8_internal2
4794 SA8D_INTER
4795 mova [esp+48], m0
4796
4797 mov r0, [r6+20]
4798 mov r2, [r6+28]
4799 add r0, 24*SIZEOF_PIXEL
4800 add r2, 24*SIZEOF_PIXEL
4801 call pixel_sa8d_8x8_internal2
4802 SA8D_INTER
4803 mova [esp+64-mmsize], m0
4804 call pixel_sa8d_8x8_internal2
4805 AVG_16x16
4806
4807 mov r0, [r6+20]
4808 mov r2, [r6+28]
4809 lea r0, [r0 + r1*8]
4810 lea r2, [r2 + r3*8]
4811 lea r0, [r0 + r1*8]
4812 lea r2, [r2 + r3*8]
4813 lea r4, [r1 + 2*r1]
4814 call pixel_sa8d_8x8_internal2
4815 %if HIGH_BIT_DEPTH
4816 HADDUW m0, m1
4817 %endif
4818 mova [esp+48], m0
4819 call pixel_sa8d_8x8_internal2
4820 SA8D_INTER
4821 mova [esp+48], m0
4822
4823 mov r0, [r6+20]
4824 mov r2, [r6+28]
4825 lea r0, [r0 + r1*8]
4826 lea r2, [r2 + r3*8]
4827 lea r0, [r0 + r1*8]
4828 lea r2, [r2 + r3*8]
4829 add r0, 8*SIZEOF_PIXEL
4830 add r2, 8*SIZEOF_PIXEL
4831 call pixel_sa8d_8x8_internal2
4832 SA8D_INTER
4833 mova [esp+64-mmsize], m0
4834 call pixel_sa8d_8x8_internal2
4835 AVG_16x16
4836
4837 mov r0, [r6+20]
4838 mov r2, [r6+28]
4839 lea r0, [r0 + r1*8]
4840 lea r2, [r2 + r3*8]
4841 lea r0, [r0 + r1*8]
4842 lea r2, [r2 + r3*8]
4843 add r0, 16*SIZEOF_PIXEL
4844 add r2, 16*SIZEOF_PIXEL
4845 lea r4, [r1 + 2*r1]
4846 call pixel_sa8d_8x8_internal2
4847 %if HIGH_BIT_DEPTH
4848 HADDUW m0, m1
4849 %endif
4850 mova [esp+48], m0
4851 call pixel_sa8d_8x8_internal2
4852 SA8D_INTER
4853 mova [esp+48], m0
4854
4855 mov r0, [r6+20]
4856 mov r2, [r6+28]
4857 lea r0, [r0 + r1*8]
4858 lea r2, [r2 + r3*8]
4859 lea r0, [r0 + r1*8]
4860 lea r2, [r2 + r3*8]
4861 add r0, 24*SIZEOF_PIXEL
4862 add r2, 24*SIZEOF_PIXEL
4863 call pixel_sa8d_8x8_internal2
4864 SA8D_INTER
4865 mova [esp+64-mmsize], m0
4866 call pixel_sa8d_8x8_internal2
4867 SA8D_INTER
4868 %if HIGH_BIT_DEPTH == 0
4869 HADDUW m0, m1
4870 %endif
4871 movd r4d, m0
4872 add r4d, 1
4873 shr r4d, 1
4874 add r4d, dword [esp+36]
4875 mov eax, r4d
4876 mov esp, r6
4877 RET
4878
4879 cglobal pixel_sa8d_32x64, 4,7,8
4880 FIX_STRIDES r1, r3
4881 mov r6, esp
4882 and esp, ~15
4883 sub esp, 64
4884
4885 lea r4, [r1 + 2*r1]
4886 lea r5, [r3 + 2*r3]
4887 call pixel_sa8d_8x8_internal2
4888 %if HIGH_BIT_DEPTH
4889 HADDUW m0, m1
4890 %endif
4891 mova [rsp+48], m0
4892 call pixel_sa8d_8x8_internal2
4893 SA8D_INTER
4894 mova [esp+48], m0
4895
4896 mov r0, [r6+20]
4897 mov r2, [r6+28]
4898 add r0, 8*SIZEOF_PIXEL
4899 add r2, 8*SIZEOF_PIXEL
4900 call pixel_sa8d_8x8_internal2
4901 SA8D_INTER
4902 mova [esp+48], m0
4903 call pixel_sa8d_8x8_internal2
4904 SA8D_INTER
4905 %if HIGH_BIT_DEPTH == 0
4906 HADDUW m0, m1
4907 %endif
4908 movd r4d, m0
4909 add r4d, 1
4910 shr r4d, 1
4911 mov dword [esp+36], r4d
4912
4913 mov r0, [r6+20]
4914 mov r2, [r6+28]
4915 add r0, 16*SIZEOF_PIXEL
4916 add r2, 16*SIZEOF_PIXEL
4917 lea r4, [r1 + 2*r1]
4918 call pixel_sa8d_8x8_internal2
4919 %if HIGH_BIT_DEPTH
4920 HADDUW m0, m1
4921 %endif
4922 mova [esp+48], m0
4923 call pixel_sa8d_8x8_internal2
4924 SA8D_INTER
4925 mova [esp+48], m0
4926
4927 mov r0, [r6+20]
4928 mov r2, [r6+28]
4929 add r0, 24*SIZEOF_PIXEL
4930 add r2, 24*SIZEOF_PIXEL
4931 call pixel_sa8d_8x8_internal2
4932 SA8D_INTER
4933 mova [esp+64-mmsize], m0
4934 call pixel_sa8d_8x8_internal2
4935 AVG_16x16
4936
4937 mov r0, [r6+20]
4938 mov r2, [r6+28]
4939 lea r0, [r0 + r1*8]
4940 lea r2, [r2 + r3*8]
4941 lea r0, [r0 + r1*8]
4942 lea r2, [r2 + r3*8]
4943 mov [r6+20], r0
4944 mov [r6+28], r2
4945
4946 lea r4, [r1 + 2*r1]
4947 call pixel_sa8d_8x8_internal2
4948 %if HIGH_BIT_DEPTH
4949 HADDUW m0, m1
4950 %endif
4951 mova [esp+48], m0
4952 call pixel_sa8d_8x8_internal2
4953 SA8D_INTER
4954 mova [esp+48], m0
4955
4956 mov r0, [r6+20]
4957 mov r2, [r6+28]
4958 add r0, 8*SIZEOF_PIXEL
4959 add r2, 8*SIZEOF_PIXEL
4960 call pixel_sa8d_8x8_internal2
4961 SA8D_INTER
4962 mova [esp+64-mmsize], m0
4963 call pixel_sa8d_8x8_internal2
4964 AVG_16x16
4965
4966 mov r0, [r6+20]
4967 mov r2, [r6+28]
4968 add r0, 16*SIZEOF_PIXEL
4969 add r2, 16*SIZEOF_PIXEL
4970 lea r4, [r1 + 2*r1]
4971 call pixel_sa8d_8x8_internal2
4972 %if HIGH_BIT_DEPTH
4973 HADDUW m0, m1
4974 %endif
4975 mova [esp+48], m0
4976 call pixel_sa8d_8x8_internal2
4977 SA8D_INTER
4978 mova [esp+48], m0
4979
4980 mov r0, [r6+20]
4981 mov r2, [r6+28]
4982 add r0, 24*SIZEOF_PIXEL
4983 add r2, 24*SIZEOF_PIXEL
4984 call pixel_sa8d_8x8_internal2
4985 SA8D_INTER
4986 mova [esp+64-mmsize], m0
4987 call pixel_sa8d_8x8_internal2
4988 AVG_16x16
4989
4990 mov r0, [r6+20]
4991 mov r2, [r6+28]
4992 lea r0, [r0 + r1*8]
4993 lea r2, [r2 + r3*8]
4994 lea r0, [r0 + r1*8]
4995 lea r2, [r2 + r3*8]
4996 mov [r6+20], r0
4997 mov [r6+28], r2
4998
4999 lea r4, [r1 + 2*r1]
5000 call pixel_sa8d_8x8_internal2
5001 %if HIGH_BIT_DEPTH
5002 HADDUW m0, m1
5003 %endif
5004 mova [esp+48], m0
5005 call pixel_sa8d_8x8_internal2
5006 SA8D_INTER
5007 mova [esp+48], m0
5008
5009 mov r0, [r6+20]
5010 mov r2, [r6+28]
5011 add r0, 8*SIZEOF_PIXEL
5012 add r2, 8*SIZEOF_PIXEL
5013 call pixel_sa8d_8x8_internal2
5014 SA8D_INTER
5015 mova [esp+64-mmsize], m0
5016 call pixel_sa8d_8x8_internal2
5017 AVG_16x16
5018
5019 mov r0, [r6+20]
5020 mov r2, [r6+28]
5021 add r0, 16*SIZEOF_PIXEL
5022 add r2, 16*SIZEOF_PIXEL
5023 lea r4, [r1 + 2*r1]
5024 call pixel_sa8d_8x8_internal2
5025 %if HIGH_BIT_DEPTH
5026 HADDUW m0, m1
5027 %endif
5028 mova [esp+48], m0
5029 call pixel_sa8d_8x8_internal2
5030 SA8D_INTER
5031 mova [esp+48], m0
5032
5033 mov r0, [r6+20]
5034 mov r2, [r6+28]
5035 add r0, 24*SIZEOF_PIXEL
5036 add r2, 24*SIZEOF_PIXEL
5037 call pixel_sa8d_8x8_internal2
5038 SA8D_INTER
5039 mova [esp+64-mmsize], m0
5040 call pixel_sa8d_8x8_internal2
5041 AVG_16x16
5042
5043 mov r0, [r6+20]
5044 mov r2, [r6+28]
5045 lea r0, [r0 + r1*8]
5046 lea r2, [r2 + r3*8]
5047 lea r0, [r0 + r1*8]
5048 lea r2, [r2 + r3*8]
5049 mov [r6+20], r0
5050 mov [r6+28], r2
5051
5052 lea r4, [r1 + 2*r1]
5053 call pixel_sa8d_8x8_internal2
5054 %if HIGH_BIT_DEPTH
5055 HADDUW m0, m1
5056 %endif
5057 mova [esp+48], m0
5058 call pixel_sa8d_8x8_internal2
5059 SA8D_INTER
5060 mova [esp+48], m0
5061
5062 mov r0, [r6+20]
5063 mov r2, [r6+28]
5064 add r0, 8*SIZEOF_PIXEL
5065 add r2, 8*SIZEOF_PIXEL
5066 call pixel_sa8d_8x8_internal2
5067 SA8D_INTER
5068 mova [esp+64-mmsize], m0
5069 call pixel_sa8d_8x8_internal2
5070 AVG_16x16
5071
5072 mov r0, [r6+20]
5073 mov r2, [r6+28]
5074 add r0, 16*SIZEOF_PIXEL
5075 add r2, 16*SIZEOF_PIXEL
5076 lea r4, [r1 + 2*r1]
5077 call pixel_sa8d_8x8_internal2
5078 %if HIGH_BIT_DEPTH
5079 HADDUW m0, m1
5080 %endif
5081 mova [esp+48], m0
5082 call pixel_sa8d_8x8_internal2
5083 SA8D_INTER
5084 mova [esp+48], m0
5085
5086 mov r0, [r6+20]
5087 mov r2, [r6+28]
5088 add r0, 24*SIZEOF_PIXEL
5089 add r2, 24*SIZEOF_PIXEL
5090 call pixel_sa8d_8x8_internal2
5091 SA8D_INTER
5092 mova [esp+64-mmsize], m0
5093 call pixel_sa8d_8x8_internal2
5094 SA8D_INTER
5095 %if HIGH_BIT_DEPTH == 0
5096 HADDUW m0, m1
5097 %endif
5098 movd r4d, m0
5099 add r4d, 1
5100 shr r4d, 1
5101 add r4d, dword [esp+36]
5102 mov eax, r4d
5103 mov esp, r6
5104 RET
5105
5106 cglobal pixel_sa8d_48x64, 4,7,8
5107 FIX_STRIDES r1, r3
5108 mov r6, esp
5109 and esp, ~15
5110 sub esp, 64
5111
5112 lea r4, [r1 + 2*r1]
5113 lea r5, [r3 + 2*r3]
5114 call pixel_sa8d_8x8_internal2
5115 %if HIGH_BIT_DEPTH
5116 HADDUW m0, m1
5117 %endif
5118 mova [rsp+48], m0
5119 call pixel_sa8d_8x8_internal2
5120 SA8D_INTER
5121 mova [esp+48], m0
5122
5123 mov r0, [r6+20]
5124 mov r2, [r6+28]
5125 add r0, 8*SIZEOF_PIXEL
5126 add r2, 8*SIZEOF_PIXEL
5127 call pixel_sa8d_8x8_internal2
5128 SA8D_INTER
5129 mova [esp+48], m0
5130 call pixel_sa8d_8x8_internal2
5131 SA8D_INTER
5132 %if HIGH_BIT_DEPTH == 0
5133 HADDUW m0, m1
5134 %endif
5135 movd r4d, m0
5136 add r4d, 1
5137 shr r4d, 1
5138 mov dword [esp+36], r4d
5139
5140 mov r0, [r6+20]
5141 mov r2, [r6+28]
5142 add r0, 16*SIZEOF_PIXEL
5143 add r2, 16*SIZEOF_PIXEL
5144 lea r4, [r1 + 2*r1]
5145 call pixel_sa8d_8x8_internal2
5146 %if HIGH_BIT_DEPTH
5147 HADDUW m0, m1
5148 %endif
5149 mova [esp+48], m0
5150 call pixel_sa8d_8x8_internal2
5151 SA8D_INTER
5152 mova [esp+48], m0
5153
5154 mov r0, [r6+20]
5155 mov r2, [r6+28]
5156 add r0, 24*SIZEOF_PIXEL
5157 add r2, 24*SIZEOF_PIXEL
5158 call pixel_sa8d_8x8_internal2
5159 SA8D_INTER
5160 mova [esp+64-mmsize], m0
5161 call pixel_sa8d_8x8_internal2
5162 AVG_16x16
5163
5164 mov r0, [r6+20]
5165 mov r2, [r6+28]
5166 add r0, 32*SIZEOF_PIXEL
5167 add r2, 32*SIZEOF_PIXEL
5168 lea r4, [r1 + 2*r1]
5169 call pixel_sa8d_8x8_internal2
5170 %if HIGH_BIT_DEPTH
5171 HADDUW m0, m1
5172 %endif
5173 mova [esp+48], m0
5174 call pixel_sa8d_8x8_internal2
5175 SA8D_INTER
5176 mova [esp+48], m0
5177
5178 mov r0, [r6+20]
5179 mov r2, [r6+28]
5180 add r0, 40*SIZEOF_PIXEL
5181 add r2, 40*SIZEOF_PIXEL
5182 call pixel_sa8d_8x8_internal2
5183 SA8D_INTER
5184 mova [esp+64-mmsize], m0
5185 call pixel_sa8d_8x8_internal2
5186 AVG_16x16
5187
5188 mov r0, [r6+20]
5189 mov r2, [r6+28]
5190 lea r0, [r0 + r1*8]
5191 lea r2, [r2 + r3*8]
5192 lea r0, [r0 + r1*8]
5193 lea r2, [r2 + r3*8]
5194 mov [r6+20], r0
5195 mov [r6+28], r2
5196
5197 lea r4, [r1 + 2*r1]
5198 call pixel_sa8d_8x8_internal2
5199 %if HIGH_BIT_DEPTH
5200 HADDUW m0, m1
5201 %endif
5202 mova [esp+48], m0
5203 call pixel_sa8d_8x8_internal2
5204 SA8D_INTER
5205 mova [esp+48], m0
5206
5207 mov r0, [r6+20]
5208 mov r2, [r6+28]
5209 add r0, 8*SIZEOF_PIXEL
5210 add r2, 8*SIZEOF_PIXEL
5211 call pixel_sa8d_8x8_internal2
5212 SA8D_INTER
5213 mova [esp+64-mmsize], m0
5214 call pixel_sa8d_8x8_internal2
5215 AVG_16x16
5216
5217 mov r0, [r6+20]
5218 mov r2, [r6+28]
5219 add r0, 16*SIZEOF_PIXEL
5220 add r2, 16*SIZEOF_PIXEL
5221 lea r4, [r1 + 2*r1]
5222 call pixel_sa8d_8x8_internal2
5223 %if HIGH_BIT_DEPTH
5224 HADDUW m0, m1
5225 %endif
5226 mova [esp+48], m0
5227 call pixel_sa8d_8x8_internal2
5228 SA8D_INTER
5229 mova [esp+48], m0
5230
5231 mov r0, [r6+20]
5232 mov r2, [r6+28]
5233 add r0, 24*SIZEOF_PIXEL
5234 add r2, 24*SIZEOF_PIXEL
5235 call pixel_sa8d_8x8_internal2
5236 SA8D_INTER
5237 mova [esp+64-mmsize], m0
5238 call pixel_sa8d_8x8_internal2
5239 AVG_16x16
5240
5241 mov r0, [r6+20]
5242 mov r2, [r6+28]
5243 add r0, 32*SIZEOF_PIXEL
5244 add r2, 32*SIZEOF_PIXEL
5245 lea r4, [r1 + 2*r1]
5246 call pixel_sa8d_8x8_internal2
5247 %if HIGH_BIT_DEPTH
5248 HADDUW m0, m1
5249 %endif
5250 mova [esp+48], m0
5251 call pixel_sa8d_8x8_internal2
5252 SA8D_INTER
5253 mova [esp+48], m0
5254
5255 mov r0, [r6+20]
5256 mov r2, [r6+28]
5257 add r0, 40*SIZEOF_PIXEL
5258 add r2, 40*SIZEOF_PIXEL
5259 call pixel_sa8d_8x8_internal2
5260 SA8D_INTER
5261 mova [esp+64-mmsize], m0
5262 call pixel_sa8d_8x8_internal2
5263 AVG_16x16
5264
5265 mov r0, [r6+20]
5266 mov r2, [r6+28]
5267 lea r0, [r0 + r1*8]
5268 lea r2, [r2 + r3*8]
5269 lea r0, [r0 + r1*8]
5270 lea r2, [r2 + r3*8]
5271 mov [r6+20], r0
5272 mov [r6+28], r2
5273
5274 lea r4, [r1 + 2*r1]
5275 call pixel_sa8d_8x8_internal2
5276 %if HIGH_BIT_DEPTH
5277 HADDUW m0, m1
5278 %endif
5279 mova [esp+48], m0
5280 call pixel_sa8d_8x8_internal2
5281 SA8D_INTER
5282 mova [esp+48], m0
5283
5284 mov r0, [r6+20]
5285 mov r2, [r6+28]
5286 add r0, 8*SIZEOF_PIXEL
5287 add r2, 8*SIZEOF_PIXEL
5288 call pixel_sa8d_8x8_internal2
5289 SA8D_INTER
5290 mova [esp+64-mmsize], m0
5291 call pixel_sa8d_8x8_internal2
5292 AVG_16x16
5293
5294 mov r0, [r6+20]
5295 mov r2, [r6+28]
5296 add r0, 16*SIZEOF_PIXEL
5297 add r2, 16*SIZEOF_PIXEL
5298 lea r4, [r1 + 2*r1]
5299 call pixel_sa8d_8x8_internal2
5300 %if HIGH_BIT_DEPTH
5301 HADDUW m0, m1
5302 %endif
5303 mova [esp+48], m0
5304 call pixel_sa8d_8x8_internal2
5305 SA8D_INTER
5306 mova [esp+48], m0
5307
5308 mov r0, [r6+20]
5309 mov r2, [r6+28]
5310 add r0, 24*SIZEOF_PIXEL
5311 add r2, 24*SIZEOF_PIXEL
5312 call pixel_sa8d_8x8_internal2
5313 SA8D_INTER
5314 mova [esp+64-mmsize], m0
5315 call pixel_sa8d_8x8_internal2
5316 AVG_16x16
5317
5318 mov r0, [r6+20]
5319 mov r2, [r6+28]
5320 add r0, 32*SIZEOF_PIXEL
5321 add r2, 32*SIZEOF_PIXEL
5322 lea r4, [r1 + 2*r1]
5323 call pixel_sa8d_8x8_internal2
5324 %if HIGH_BIT_DEPTH
5325 HADDUW m0, m1
5326 %endif
5327 mova [esp+48], m0
5328 call pixel_sa8d_8x8_internal2
5329 SA8D_INTER
5330 mova [esp+48], m0
5331
5332 mov r0, [r6+20]
5333 mov r2, [r6+28]
5334 add r0, 40*SIZEOF_PIXEL
5335 add r2, 40*SIZEOF_PIXEL
5336 call pixel_sa8d_8x8_internal2
5337 SA8D_INTER
5338 mova [esp+64-mmsize], m0
5339 call pixel_sa8d_8x8_internal2
5340 AVG_16x16
5341
5342 mov r0, [r6+20]
5343 mov r2, [r6+28]
5344 lea r0, [r0 + r1*8]
5345 lea r2, [r2 + r3*8]
5346 lea r0, [r0 + r1*8]
5347 lea r2, [r2 + r3*8]
5348 mov [r6+20], r0
5349 mov [r6+28], r2
5350
5351 lea r4, [r1 + 2*r1]
5352 call pixel_sa8d_8x8_internal2
5353 %if HIGH_BIT_DEPTH
5354 HADDUW m0, m1
5355 %endif
5356 mova [esp+48], m0
5357 call pixel_sa8d_8x8_internal2
5358 SA8D_INTER
5359 mova [esp+48], m0
5360
5361 mov r0, [r6+20]
5362 mov r2, [r6+28]
5363 add r0, 8*SIZEOF_PIXEL
5364 add r2, 8*SIZEOF_PIXEL
5365 call pixel_sa8d_8x8_internal2
5366 SA8D_INTER
5367 mova [esp+64-mmsize], m0
5368 call pixel_sa8d_8x8_internal2
5369 AVG_16x16
5370
5371 mov r0, [r6+20]
5372 mov r2, [r6+28]
5373 add r0, 16*SIZEOF_PIXEL
5374 add r2, 16*SIZEOF_PIXEL
5375 lea r4, [r1 + 2*r1]
5376 call pixel_sa8d_8x8_internal2
5377 %if HIGH_BIT_DEPTH
5378 HADDUW m0, m1
5379 %endif
5380 mova [esp+48], m0
5381 call pixel_sa8d_8x8_internal2
5382 SA8D_INTER
5383 mova [esp+48], m0
5384
5385 mov r0, [r6+20]
5386 mov r2, [r6+28]
5387 add r0, 24*SIZEOF_PIXEL
5388 add r2, 24*SIZEOF_PIXEL
5389 call pixel_sa8d_8x8_internal2
5390 SA8D_INTER
5391 mova [esp+64-mmsize], m0
5392 call pixel_sa8d_8x8_internal2
5393 AVG_16x16
5394
5395 mov r0, [r6+20]
5396 mov r2, [r6+28]
5397 add r0, 32*SIZEOF_PIXEL
5398 add r2, 32*SIZEOF_PIXEL
5399 lea r4, [r1 + 2*r1]
5400 call pixel_sa8d_8x8_internal2
5401 %if HIGH_BIT_DEPTH
5402 HADDUW m0, m1
5403 %endif
5404 mova [esp+48], m0
5405 call pixel_sa8d_8x8_internal2
5406 SA8D_INTER
5407 mova [esp+48], m0
5408
5409 mov r0, [r6+20]
5410 mov r2, [r6+28]
5411 add r0, 40*SIZEOF_PIXEL
5412 add r2, 40*SIZEOF_PIXEL
5413 call pixel_sa8d_8x8_internal2
5414 SA8D_INTER
5415 mova [esp+64-mmsize], m0
5416 call pixel_sa8d_8x8_internal2
5417 SA8D_INTER
5418 %if HIGH_BIT_DEPTH == 0
5419 HADDUW m0, m1
5420 %endif
5421 movd r4d, m0
5422 add r4d, 1
5423 shr r4d, 1
5424 add r4d, dword [esp+36]
5425 mov eax, r4d
5426 mov esp, r6
5427 RET
5428
5429 cglobal pixel_sa8d_64x16, 4,7,8
5430 FIX_STRIDES r1, r3
5431 mov r6, esp
5432 and esp, ~15
5433 sub esp, 64
5434
5435 lea r4, [r1 + 2*r1]
5436 lea r5, [r3 + 2*r3]
5437 call pixel_sa8d_8x8_internal2
5438 %if HIGH_BIT_DEPTH
5439 HADDUW m0, m1
5440 %endif
5441 mova [rsp+48], m0
5442 call pixel_sa8d_8x8_internal2
5443 SA8D_INTER
5444 mova [esp+48], m0
5445
5446 mov r0, [r6+20]
5447 mov r2, [r6+28]
5448 add r0, 8*SIZEOF_PIXEL
5449 add r2, 8*SIZEOF_PIXEL
5450 call pixel_sa8d_8x8_internal2
5451 SA8D_INTER
5452 mova [esp+48], m0
5453 call pixel_sa8d_8x8_internal2
5454 SA8D_INTER
5455 %if HIGH_BIT_DEPTH == 0
5456 HADDUW m0, m1
5457 %endif
5458 movd r4d, m0
5459 add r4d, 1
5460 shr r4d, 1
5461 mov dword [esp+36], r4d
5462
5463 mov r0, [r6+20]
5464 mov r2, [r6+28]
5465 add r0, 16*SIZEOF_PIXEL
5466 add r2, 16*SIZEOF_PIXEL
5467 lea r4, [r1 + 2*r1]
5468 call pixel_sa8d_8x8_internal2
5469 %if HIGH_BIT_DEPTH
5470 HADDUW m0, m1
5471 %endif
5472 mova [esp+48], m0
5473 call pixel_sa8d_8x8_internal2
5474 SA8D_INTER
5475 mova [esp+48], m0
5476
5477 mov r0, [r6+20]
5478 mov r2, [r6+28]
5479 add r0, 24*SIZEOF_PIXEL
5480 add r2, 24*SIZEOF_PIXEL
5481 call pixel_sa8d_8x8_internal2
5482 SA8D_INTER
5483 mova [esp+64-mmsize], m0
5484 call pixel_sa8d_8x8_internal2
5485 AVG_16x16
5486
5487 mov r0, [r6+20]
5488 mov r2, [r6+28]
5489 add r0, 32*SIZEOF_PIXEL
5490 add r2, 32*SIZEOF_PIXEL
5491 lea r4, [r1 + 2*r1]
5492 call pixel_sa8d_8x8_internal2
5493 %if HIGH_BIT_DEPTH
5494 HADDUW m0, m1
5495 %endif
5496 mova [esp+48], m0
5497 call pixel_sa8d_8x8_internal2
5498 SA8D_INTER
5499 mova [esp+48], m0
5500
5501 mov r0, [r6+20]
5502 mov r2, [r6+28]
5503 add r0, 40*SIZEOF_PIXEL
5504 add r2, 40*SIZEOF_PIXEL
5505 call pixel_sa8d_8x8_internal2
5506 SA8D_INTER
5507 mova [esp+64-mmsize], m0
5508 call pixel_sa8d_8x8_internal2
5509 AVG_16x16
5510
5511 mov r0, [r6+20]
5512 mov r2, [r6+28]
5513 add r0, 48*SIZEOF_PIXEL
5514 add r2, 48*SIZEOF_PIXEL
5515 lea r4, [r1 + 2*r1]
5516 call pixel_sa8d_8x8_internal2
5517 %if HIGH_BIT_DEPTH
5518 HADDUW m0, m1
5519 %endif
5520 mova [esp+48], m0
5521 call pixel_sa8d_8x8_internal2
5522 SA8D_INTER
5523 mova [esp+48], m0
5524
5525 mov r0, [r6+20]
5526 mov r2, [r6+28]
5527 add r0, 56*SIZEOF_PIXEL
5528 add r2, 56*SIZEOF_PIXEL
5529 call pixel_sa8d_8x8_internal2
5530 SA8D_INTER
5531 mova [esp+64-mmsize], m0
5532 call pixel_sa8d_8x8_internal2
5533 SA8D_INTER
5534 %if HIGH_BIT_DEPTH == 0
5535 HADDUW m0, m1
5536 %endif
5537 movd r4d, m0
5538 add r4d, 1
5539 shr r4d, 1
5540 add r4d, dword [esp+36]
5541 mov eax, r4d
5542 mov esp, r6
5543 RET
5544
5545 cglobal pixel_sa8d_64x32, 4,7,8
5546 FIX_STRIDES r1, r3
5547 mov r6, esp
5548 and esp, ~15
5549 sub esp, 64
5550
5551 lea r4, [r1 + 2*r1]
5552 lea r5, [r3 + 2*r3]
5553 call pixel_sa8d_8x8_internal2
5554 %if HIGH_BIT_DEPTH
5555 HADDUW m0, m1
5556 %endif
5557 mova [rsp+48], m0
5558 call pixel_sa8d_8x8_internal2
5559 SA8D_INTER
5560 mova [esp+48], m0
5561
5562 mov r0, [r6+20]
5563 mov r2, [r6+28]
5564 add r0, 8*SIZEOF_PIXEL
5565 add r2, 8*SIZEOF_PIXEL
5566 call pixel_sa8d_8x8_internal2
5567 SA8D_INTER
5568 mova [esp+48], m0
5569 call pixel_sa8d_8x8_internal2
5570 SA8D_INTER
5571 %if HIGH_BIT_DEPTH == 0
5572 HADDUW m0, m1
5573 %endif
5574 movd r4d, m0
5575 add r4d, 1
5576 shr r4d, 1
5577 mov dword [esp+36], r4d
5578
5579 mov r0, [r6+20]
5580 mov r2, [r6+28]
5581 add r0, 16*SIZEOF_PIXEL
5582 add r2, 16*SIZEOF_PIXEL
5583 lea r4, [r1 + 2*r1]
5584 call pixel_sa8d_8x8_internal2
5585 %if HIGH_BIT_DEPTH
5586 HADDUW m0, m1
5587 %endif
5588 mova [esp+48], m0
5589 call pixel_sa8d_8x8_internal2
5590 SA8D_INTER
5591 mova [esp+48], m0
5592
5593 mov r0, [r6+20]
5594 mov r2, [r6+28]
5595 add r0, 24*SIZEOF_PIXEL
5596 add r2, 24*SIZEOF_PIXEL
5597 call pixel_sa8d_8x8_internal2
5598 SA8D_INTER
5599 mova [esp+64-mmsize], m0
5600 call pixel_sa8d_8x8_internal2
5601 AVG_16x16
5602
5603 mov r0, [r6+20]
5604 mov r2, [r6+28]
5605 add r0, 32*SIZEOF_PIXEL
5606 add r2, 32*SIZEOF_PIXEL
5607 lea r4, [r1 + 2*r1]
5608 call pixel_sa8d_8x8_internal2
5609 %if HIGH_BIT_DEPTH
5610 HADDUW m0, m1
5611 %endif
5612 mova [esp+48], m0
5613 call pixel_sa8d_8x8_internal2
5614 SA8D_INTER
5615 mova [esp+48], m0
5616
5617 mov r0, [r6+20]
5618 mov r2, [r6+28]
5619 add r0, 40*SIZEOF_PIXEL
5620 add r2, 40*SIZEOF_PIXEL
5621 call pixel_sa8d_8x8_internal2
5622 SA8D_INTER
5623 mova [esp+64-mmsize], m0
5624 call pixel_sa8d_8x8_internal2
5625 AVG_16x16
5626
5627 mov r0, [r6+20]
5628 mov r2, [r6+28]
5629 add r0, 48*SIZEOF_PIXEL
5630 add r2, 48*SIZEOF_PIXEL
5631 lea r4, [r1 + 2*r1]
5632 call pixel_sa8d_8x8_internal2
5633 %if HIGH_BIT_DEPTH
5634 HADDUW m0, m1
5635 %endif
5636 mova [esp+48], m0
5637 call pixel_sa8d_8x8_internal2
5638 SA8D_INTER
5639 mova [esp+48], m0
5640
5641 mov r0, [r6+20]
5642 mov r2, [r6+28]
5643 add r0, 56*SIZEOF_PIXEL
5644 add r2, 56*SIZEOF_PIXEL
5645 call pixel_sa8d_8x8_internal2
5646 SA8D_INTER
5647 mova [esp+64-mmsize], m0
5648 call pixel_sa8d_8x8_internal2
5649 AVG_16x16
5650
5651 mov r0, [r6+20]
5652 mov r2, [r6+28]
5653 lea r0, [r0 + r1*8]
5654 lea r2, [r2 + r3*8]
5655 lea r0, [r0 + r1*8]
5656 lea r2, [r2 + r3*8]
5657 mov [r6+20], r0
5658 mov [r6+28], r2
5659
5660 lea r4, [r1 + 2*r1]
5661 call pixel_sa8d_8x8_internal2
5662 %if HIGH_BIT_DEPTH
5663 HADDUW m0, m1
5664 %endif
5665 mova [esp+48], m0
5666 call pixel_sa8d_8x8_internal2
5667 SA8D_INTER
5668 mova [esp+48], m0
5669
5670 mov r0, [r6+20]
5671 mov r2, [r6+28]
5672 add r0, 8*SIZEOF_PIXEL
5673 add r2, 8*SIZEOF_PIXEL
5674 call pixel_sa8d_8x8_internal2
5675 SA8D_INTER
5676 mova [esp+64-mmsize], m0
5677 call pixel_sa8d_8x8_internal2
5678 AVG_16x16
5679
5680 mov r0, [r6+20]
5681 mov r2, [r6+28]
5682 add r0, 16*SIZEOF_PIXEL
5683 add r2, 16*SIZEOF_PIXEL
5684 lea r4, [r1 + 2*r1]
5685 call pixel_sa8d_8x8_internal2
5686 %if HIGH_BIT_DEPTH
5687 HADDUW m0, m1
5688 %endif
5689 mova [esp+48], m0
5690 call pixel_sa8d_8x8_internal2
5691 SA8D_INTER
5692 mova [esp+48], m0
5693
5694 mov r0, [r6+20]
5695 mov r2, [r6+28]
5696 add r0, 24*SIZEOF_PIXEL
5697 add r2, 24*SIZEOF_PIXEL
5698 call pixel_sa8d_8x8_internal2
5699 SA8D_INTER
5700 mova [esp+64-mmsize], m0
5701 call pixel_sa8d_8x8_internal2
5702 AVG_16x16
5703
5704 mov r0, [r6+20]
5705 mov r2, [r6+28]
5706 add r0, 32*SIZEOF_PIXEL
5707 add r2, 32*SIZEOF_PIXEL
5708 lea r4, [r1 + 2*r1]
5709 call pixel_sa8d_8x8_internal2
5710 %if HIGH_BIT_DEPTH
5711 HADDUW m0, m1
5712 %endif
5713 mova [esp+48], m0
5714 call pixel_sa8d_8x8_internal2
5715 SA8D_INTER
5716 mova [esp+48], m0
5717
5718 mov r0, [r6+20]
5719 mov r2, [r6+28]
5720 add r0, 40*SIZEOF_PIXEL
5721 add r2, 40*SIZEOF_PIXEL
5722 call pixel_sa8d_8x8_internal2
5723 SA8D_INTER
5724 mova [esp+64-mmsize], m0
5725 call pixel_sa8d_8x8_internal2
5726 AVG_16x16
5727
5728 mov r0, [r6+20]
5729 mov r2, [r6+28]
5730 add r0, 48*SIZEOF_PIXEL
5731 add r2, 48*SIZEOF_PIXEL
5732 lea r4, [r1 + 2*r1]
5733 call pixel_sa8d_8x8_internal2
5734 %if HIGH_BIT_DEPTH
5735 HADDUW m0, m1
5736 %endif
5737 mova [esp+48], m0
5738 call pixel_sa8d_8x8_internal2
5739 SA8D_INTER
5740 mova [esp+48], m0
5741
5742 mov r0, [r6+20]
5743 mov r2, [r6+28]
5744 add r0, 56*SIZEOF_PIXEL
5745 add r2, 56*SIZEOF_PIXEL
5746 call pixel_sa8d_8x8_internal2
5747 SA8D_INTER
5748 mova [esp+64-mmsize], m0
5749 call pixel_sa8d_8x8_internal2
5750 SA8D_INTER
5751 %if HIGH_BIT_DEPTH == 0
5752 HADDUW m0, m1
5753 %endif
5754 movd r4d, m0
5755 add r4d, 1
5756 shr r4d, 1
5757 add r4d, dword [esp+36]
5758 mov eax, r4d
5759 mov esp, r6
5760 RET
5761
5762 cglobal pixel_sa8d_64x48, 4,7,8
5763 FIX_STRIDES r1, r3
5764 mov r6, esp
5765 and esp, ~15
5766 sub esp, 64
5767
5768 lea r4, [r1 + 2*r1]
5769 lea r5, [r3 + 2*r3]
5770 call pixel_sa8d_8x8_internal2
5771 %if HIGH_BIT_DEPTH
5772 HADDUW m0, m1
5773 %endif
5774 mova [rsp+48], m0
5775 call pixel_sa8d_8x8_internal2
5776 SA8D_INTER
5777 mova [esp+48], m0
5778
5779 mov r0, [r6+20]
5780 mov r2, [r6+28]
5781 add r0, 8*SIZEOF_PIXEL
5782 add r2, 8*SIZEOF_PIXEL
5783 call pixel_sa8d_8x8_internal2
5784 SA8D_INTER
5785 mova [esp+48], m0
5786 call pixel_sa8d_8x8_internal2
5787 SA8D_INTER
5788 %if HIGH_BIT_DEPTH == 0
5789 HADDUW m0, m1
5790 %endif
5791 movd r4d, m0
5792 add r4d, 1
5793 shr r4d, 1
5794 mov dword [esp+36], r4d
5795
5796 mov r0, [r6+20]
5797 mov r2, [r6+28]
5798 add r0, 16*SIZEOF_PIXEL
5799 add r2, 16*SIZEOF_PIXEL
5800 lea r4, [r1 + 2*r1]
5801 call pixel_sa8d_8x8_internal2
5802 %if HIGH_BIT_DEPTH
5803 HADDUW m0, m1
5804 %endif
5805 mova [esp+48], m0
5806 call pixel_sa8d_8x8_internal2
5807 SA8D_INTER
5808 mova [esp+48], m0
5809
5810 mov r0, [r6+20]
5811 mov r2, [r6+28]
5812 add r0, 24*SIZEOF_PIXEL
5813 add r2, 24*SIZEOF_PIXEL
5814 call pixel_sa8d_8x8_internal2
5815 SA8D_INTER
5816 mova [esp+64-mmsize], m0
5817 call pixel_sa8d_8x8_internal2
5818 AVG_16x16
5819
5820 mov r0, [r6+20]
5821 mov r2, [r6+28]
5822 add r0, 32*SIZEOF_PIXEL
5823 add r2, 32*SIZEOF_PIXEL
5824 lea r4, [r1 + 2*r1]
5825 call pixel_sa8d_8x8_internal2
5826 %if HIGH_BIT_DEPTH
5827 HADDUW m0, m1
5828 %endif
5829 mova [esp+48], m0
5830 call pixel_sa8d_8x8_internal2
5831 SA8D_INTER
5832 mova [esp+48], m0
5833
5834 mov r0, [r6+20]
5835 mov r2, [r6+28]
5836 add r0, 40*SIZEOF_PIXEL
5837 add r2, 40*SIZEOF_PIXEL
5838 call pixel_sa8d_8x8_internal2
5839 SA8D_INTER
5840 mova [esp+64-mmsize], m0
5841 call pixel_sa8d_8x8_internal2
5842 AVG_16x16
5843
5844 mov r0, [r6+20]
5845 mov r2, [r6+28]
5846 add r0, 48*SIZEOF_PIXEL
5847 add r2, 48*SIZEOF_PIXEL
5848 lea r4, [r1 + 2*r1]
5849 call pixel_sa8d_8x8_internal2
5850 %if HIGH_BIT_DEPTH
5851 HADDUW m0, m1
5852 %endif
5853 mova [esp+48], m0
5854 call pixel_sa8d_8x8_internal2
5855 SA8D_INTER
5856 mova [esp+48], m0
5857
5858 mov r0, [r6+20]
5859 mov r2, [r6+28]
5860 add r0, 56*SIZEOF_PIXEL
5861 add r2, 56*SIZEOF_PIXEL
5862 call pixel_sa8d_8x8_internal2
5863 SA8D_INTER
5864 mova [esp+64-mmsize], m0
5865 call pixel_sa8d_8x8_internal2
5866 AVG_16x16
5867
5868 mov r0, [r6+20]
5869 mov r2, [r6+28]
5870 lea r0, [r0 + r1*8]
5871 lea r2, [r2 + r3*8]
5872 lea r0, [r0 + r1*8]
5873 lea r2, [r2 + r3*8]
5874 mov [r6+20], r0
5875 mov [r6+28], r2
5876
5877 lea r4, [r1 + 2*r1]
5878 call pixel_sa8d_8x8_internal2
5879 %if HIGH_BIT_DEPTH
5880 HADDUW m0, m1
5881 %endif
5882 mova [esp+48], m0
5883 call pixel_sa8d_8x8_internal2
5884 SA8D_INTER
5885 mova [esp+48], m0
5886
5887 mov r0, [r6+20]
5888 mov r2, [r6+28]
5889 add r0, 8*SIZEOF_PIXEL
5890 add r2, 8*SIZEOF_PIXEL
5891 call pixel_sa8d_8x8_internal2
5892 SA8D_INTER
5893 mova [esp+64-mmsize], m0
5894 call pixel_sa8d_8x8_internal2
5895 AVG_16x16
5896
5897 mov r0, [r6+20]
5898 mov r2, [r6+28]
5899 add r0, 16*SIZEOF_PIXEL
5900 add r2, 16*SIZEOF_PIXEL
5901 lea r4, [r1 + 2*r1]
5902 call pixel_sa8d_8x8_internal2
5903 %if HIGH_BIT_DEPTH
5904 HADDUW m0, m1
5905 %endif
5906 mova [esp+48], m0
5907 call pixel_sa8d_8x8_internal2
5908 SA8D_INTER
5909 mova [esp+48], m0
5910
5911 mov r0, [r6+20]
5912 mov r2, [r6+28]
5913 add r0, 24*SIZEOF_PIXEL
5914 add r2, 24*SIZEOF_PIXEL
5915 call pixel_sa8d_8x8_internal2
5916 SA8D_INTER
5917 mova [esp+64-mmsize], m0
5918 call pixel_sa8d_8x8_internal2
5919 AVG_16x16
5920
5921 mov r0, [r6+20]
5922 mov r2, [r6+28]
5923 add r0, 32*SIZEOF_PIXEL
5924 add r2, 32*SIZEOF_PIXEL
5925 lea r4, [r1 + 2*r1]
5926 call pixel_sa8d_8x8_internal2
5927 %if HIGH_BIT_DEPTH
5928 HADDUW m0, m1
5929 %endif
5930 mova [esp+48], m0
5931 call pixel_sa8d_8x8_internal2
5932 SA8D_INTER
5933 mova [esp+48], m0
5934
5935 mov r0, [r6+20]
5936 mov r2, [r6+28]
5937 add r0, 40*SIZEOF_PIXEL
5938 add r2, 40*SIZEOF_PIXEL
5939 call pixel_sa8d_8x8_internal2
5940 SA8D_INTER
5941 mova [esp+64-mmsize], m0
5942 call pixel_sa8d_8x8_internal2
5943 AVG_16x16
5944
5945 mov r0, [r6+20]
5946 mov r2, [r6+28]
5947 add r0, 48*SIZEOF_PIXEL
5948 add r2, 48*SIZEOF_PIXEL
5949 lea r4, [r1 + 2*r1]
5950 call pixel_sa8d_8x8_internal2
5951 %if HIGH_BIT_DEPTH
5952 HADDUW m0, m1
5953 %endif
5954 mova [esp+48], m0
5955 call pixel_sa8d_8x8_internal2
5956 SA8D_INTER
5957 mova [esp+48], m0
5958
5959 mov r0, [r6+20]
5960 mov r2, [r6+28]
5961 add r0, 56*SIZEOF_PIXEL
5962 add r2, 56*SIZEOF_PIXEL
5963 call pixel_sa8d_8x8_internal2
5964 SA8D_INTER
5965 mova [esp+64-mmsize], m0
5966 call pixel_sa8d_8x8_internal2
5967 AVG_16x16
5968
5969 mov r0, [r6+20]
5970 mov r2, [r6+28]
5971 lea r0, [r0 + r1*8]
5972 lea r2, [r2 + r3*8]
5973 lea r0, [r0 + r1*8]
5974 lea r2, [r2 + r3*8]
5975 mov [r6+20], r0
5976 mov [r6+28], r2
5977
5978 lea r4, [r1 + 2*r1]
5979 call pixel_sa8d_8x8_internal2
5980 %if HIGH_BIT_DEPTH
5981 HADDUW m0, m1
5982 %endif
5983 mova [esp+48], m0
5984 call pixel_sa8d_8x8_internal2
5985 SA8D_INTER
5986 mova [esp+48], m0
5987
5988 mov r0, [r6+20]
5989 mov r2, [r6+28]
5990 add r0, 8*SIZEOF_PIXEL
5991 add r2, 8*SIZEOF_PIXEL
5992 call pixel_sa8d_8x8_internal2
5993 SA8D_INTER
5994 mova [esp+64-mmsize], m0
5995 call pixel_sa8d_8x8_internal2
5996 AVG_16x16
5997
5998 mov r0, [r6+20]
5999 mov r2, [r6+28]
6000 add r0, 16*SIZEOF_PIXEL
6001 add r2, 16*SIZEOF_PIXEL
6002 lea r4, [r1 + 2*r1]
6003 call pixel_sa8d_8x8_internal2
6004 %if HIGH_BIT_DEPTH
6005 HADDUW m0, m1
6006 %endif
6007 mova [esp+48], m0
6008 call pixel_sa8d_8x8_internal2
6009 SA8D_INTER
6010 mova [esp+48], m0
6011
6012 mov r0, [r6+20]
6013 mov r2, [r6+28]
6014 add r0, 24*SIZEOF_PIXEL
6015 add r2, 24*SIZEOF_PIXEL
6016 call pixel_sa8d_8x8_internal2
6017 SA8D_INTER
6018 mova [esp+64-mmsize], m0
6019 call pixel_sa8d_8x8_internal2
6020 AVG_16x16
6021
6022 mov r0, [r6+20]
6023 mov r2, [r6+28]
6024 add r0, 32*SIZEOF_PIXEL
6025 add r2, 32*SIZEOF_PIXEL
6026 lea r4, [r1 + 2*r1]
6027 call pixel_sa8d_8x8_internal2
6028 %if HIGH_BIT_DEPTH
6029 HADDUW m0, m1
6030 %endif
6031 mova [esp+48], m0
6032 call pixel_sa8d_8x8_internal2
6033 SA8D_INTER
6034 mova [esp+48], m0
6035
6036 mov r0, [r6+20]
6037 mov r2, [r6+28]
6038 add r0, 40*SIZEOF_PIXEL
6039 add r2, 40*SIZEOF_PIXEL
6040 call pixel_sa8d_8x8_internal2
6041 SA8D_INTER
6042 mova [esp+64-mmsize], m0
6043 call pixel_sa8d_8x8_internal2
6044 AVG_16x16
6045
6046 mov r0, [r6+20]
6047 mov r2, [r6+28]
6048 add r0, 48*SIZEOF_PIXEL
6049 add r2, 48*SIZEOF_PIXEL
6050 lea r4, [r1 + 2*r1]
6051 call pixel_sa8d_8x8_internal2
6052 %if HIGH_BIT_DEPTH
6053 HADDUW m0, m1
6054 %endif
6055 mova [esp+48], m0
6056 call pixel_sa8d_8x8_internal2
6057 SA8D_INTER
6058 mova [esp+48], m0
6059
6060 mov r0, [r6+20]
6061 mov r2, [r6+28]
6062 add r0, 56*SIZEOF_PIXEL
6063 add r2, 56*SIZEOF_PIXEL
6064 call pixel_sa8d_8x8_internal2
6065 SA8D_INTER
6066 mova [esp+64-mmsize], m0
6067 call pixel_sa8d_8x8_internal2
6068 SA8D_INTER
6069 %if HIGH_BIT_DEPTH == 0
6070 HADDUW m0, m1
6071 %endif
6072 movd r4d, m0
6073 add r4d, 1
6074 shr r4d, 1
6075 add r4d, dword [esp+36]
6076 mov eax, r4d
6077 mov esp, r6
6078 RET
6079
6080 cglobal pixel_sa8d_64x64, 4,7,8
6081 FIX_STRIDES r1, r3
6082 mov r6, esp
6083 and esp, ~15
6084 sub esp, 64
6085
6086 lea r4, [r1 + 2*r1]
6087 lea r5, [r3 + 2*r3]
6088 call pixel_sa8d_8x8_internal2
6089 %if HIGH_BIT_DEPTH
6090 HADDUW m0, m1
6091 %endif
6092 mova [rsp+48], m0
6093 call pixel_sa8d_8x8_internal2
6094 SA8D_INTER
6095 mova [esp+48], m0
6096
6097 mov r0, [r6+20]
6098 mov r2, [r6+28]
6099 add r0, 8*SIZEOF_PIXEL
6100 add r2, 8*SIZEOF_PIXEL
6101 call pixel_sa8d_8x8_internal2
6102 SA8D_INTER
6103 mova [esp+48], m0
6104 call pixel_sa8d_8x8_internal2
6105 SA8D_INTER
6106 %if HIGH_BIT_DEPTH == 0
6107 HADDUW m0, m1
6108 %endif
6109 movd r4d, m0
6110 add r4d, 1
6111 shr r4d, 1
6112 mov dword [esp+36], r4d
6113
6114 mov r0, [r6+20]
6115 mov r2, [r6+28]
6116 add r0, 16*SIZEOF_PIXEL
6117 add r2, 16*SIZEOF_PIXEL
6118 lea r4, [r1 + 2*r1]
6119 call pixel_sa8d_8x8_internal2
6120 %if HIGH_BIT_DEPTH
6121 HADDUW m0, m1
6122 %endif
6123 mova [esp+48], m0
6124 call pixel_sa8d_8x8_internal2
6125 SA8D_INTER
6126 mova [esp+48], m0
6127
6128 mov r0, [r6+20]
6129 mov r2, [r6+28]
6130 add r0, 24*SIZEOF_PIXEL
6131 add r2, 24*SIZEOF_PIXEL
6132 call pixel_sa8d_8x8_internal2
6133 SA8D_INTER
6134 mova [esp+64-mmsize], m0
6135 call pixel_sa8d_8x8_internal2
6136 AVG_16x16
6137
6138 mov r0, [r6+20]
6139 mov r2, [r6+28]
6140 add r0, 32*SIZEOF_PIXEL
6141 add r2, 32*SIZEOF_PIXEL
6142 lea r4, [r1 + 2*r1]
6143 call pixel_sa8d_8x8_internal2
6144 %if HIGH_BIT_DEPTH
6145 HADDUW m0, m1
6146 %endif
6147 mova [esp+48], m0
6148 call pixel_sa8d_8x8_internal2
6149 SA8D_INTER
6150 mova [esp+48], m0
6151
6152 mov r0, [r6+20]
6153 mov r2, [r6+28]
6154 add r0, 40*SIZEOF_PIXEL
6155 add r2, 40*SIZEOF_PIXEL
6156 call pixel_sa8d_8x8_internal2
6157 SA8D_INTER
6158 mova [esp+64-mmsize], m0
6159 call pixel_sa8d_8x8_internal2
6160 AVG_16x16
6161
6162 mov r0, [r6+20]
6163 mov r2, [r6+28]
6164 add r0, 48*SIZEOF_PIXEL
6165 add r2, 48*SIZEOF_PIXEL
6166 lea r4, [r1 + 2*r1]
6167 call pixel_sa8d_8x8_internal2
6168 %if HIGH_BIT_DEPTH
6169 HADDUW m0, m1
6170 %endif
6171 mova [esp+48], m0
6172 call pixel_sa8d_8x8_internal2
6173 SA8D_INTER
6174 mova [esp+48], m0
6175
6176 mov r0, [r6+20]
6177 mov r2, [r6+28]
6178 add r0, 56*SIZEOF_PIXEL
6179 add r2, 56*SIZEOF_PIXEL
6180 call pixel_sa8d_8x8_internal2
6181 SA8D_INTER
6182 mova [esp+64-mmsize], m0
6183 call pixel_sa8d_8x8_internal2
6184 AVG_16x16
6185
6186 mov r0, [r6+20]
6187 mov r2, [r6+28]
6188 lea r0, [r0 + r1*8]
6189 lea r2, [r2 + r3*8]
6190 lea r0, [r0 + r1*8]
6191 lea r2, [r2 + r3*8]
6192 mov [r6+20], r0
6193 mov [r6+28], r2
6194
6195 lea r4, [r1 + 2*r1]
6196 call pixel_sa8d_8x8_internal2
6197 %if HIGH_BIT_DEPTH
6198 HADDUW m0, m1
6199 %endif
6200 mova [esp+48], m0
6201 call pixel_sa8d_8x8_internal2
6202 SA8D_INTER
6203 mova [esp+48], m0
6204
6205 mov r0, [r6+20]
6206 mov r2, [r6+28]
6207 add r0, 8*SIZEOF_PIXEL
6208 add r2, 8*SIZEOF_PIXEL
6209 call pixel_sa8d_8x8_internal2
6210 SA8D_INTER
6211 mova [esp+64-mmsize], m0
6212 call pixel_sa8d_8x8_internal2
6213 AVG_16x16
6214
6215 mov r0, [r6+20]
6216 mov r2, [r6+28]
6217 add r0, 16*SIZEOF_PIXEL
6218 add r2, 16*SIZEOF_PIXEL
6219 lea r4, [r1 + 2*r1]
6220 call pixel_sa8d_8x8_internal2
6221 %if HIGH_BIT_DEPTH
6222 HADDUW m0, m1
6223 %endif
6224 mova [esp+48], m0
6225 call pixel_sa8d_8x8_internal2
6226 SA8D_INTER
6227 mova [esp+48], m0
6228
6229 mov r0, [r6+20]
6230 mov r2, [r6+28]
6231 add r0, 24*SIZEOF_PIXEL
6232 add r2, 24*SIZEOF_PIXEL
6233 call pixel_sa8d_8x8_internal2
6234 SA8D_INTER
6235 mova [esp+64-mmsize], m0
6236 call pixel_sa8d_8x8_internal2
6237 AVG_16x16
6238
6239 mov r0, [r6+20]
6240 mov r2, [r6+28]
6241 add r0, 32*SIZEOF_PIXEL
6242 add r2, 32*SIZEOF_PIXEL
6243 lea r4, [r1 + 2*r1]
6244 call pixel_sa8d_8x8_internal2
6245 %if HIGH_BIT_DEPTH
6246 HADDUW m0, m1
6247 %endif
6248 mova [esp+48], m0
6249 call pixel_sa8d_8x8_internal2
6250 SA8D_INTER
6251 mova [esp+48], m0
6252
6253 mov r0, [r6+20]
6254 mov r2, [r6+28]
6255 add r0, 40*SIZEOF_PIXEL
6256 add r2, 40*SIZEOF_PIXEL
6257 call pixel_sa8d_8x8_internal2
6258 SA8D_INTER
6259 mova [esp+64-mmsize], m0
6260 call pixel_sa8d_8x8_internal2
6261 AVG_16x16
6262
6263 mov r0, [r6+20]
6264 mov r2, [r6+28]
6265 add r0, 48*SIZEOF_PIXEL
6266 add r2, 48*SIZEOF_PIXEL
6267 lea r4, [r1 + 2*r1]
6268 call pixel_sa8d_8x8_internal2
6269 %if HIGH_BIT_DEPTH
6270 HADDUW m0, m1
6271 %endif
6272 mova [esp+48], m0
6273 call pixel_sa8d_8x8_internal2
6274 SA8D_INTER
6275 mova [esp+48], m0
6276
6277 mov r0, [r6+20]
6278 mov r2, [r6+28]
6279 add r0, 56*SIZEOF_PIXEL
6280 add r2, 56*SIZEOF_PIXEL
6281 call pixel_sa8d_8x8_internal2
6282 SA8D_INTER
6283 mova [esp+64-mmsize], m0
6284 call pixel_sa8d_8x8_internal2
6285 AVG_16x16
6286
6287 mov r0, [r6+20]
6288 mov r2, [r6+28]
6289 lea r0, [r0 + r1*8]
6290 lea r2, [r2 + r3*8]
6291 lea r0, [r0 + r1*8]
6292 lea r2, [r2 + r3*8]
6293 mov [r6+20], r0
6294 mov [r6+28], r2
6295
6296 lea r4, [r1 + 2*r1]
6297 call pixel_sa8d_8x8_internal2
6298 %if HIGH_BIT_DEPTH
6299 HADDUW m0, m1
6300 %endif
6301 mova [esp+48], m0
6302 call pixel_sa8d_8x8_internal2
6303 SA8D_INTER
6304 mova [esp+48], m0
6305
6306 mov r0, [r6+20]
6307 mov r2, [r6+28]
6308 add r0, 8*SIZEOF_PIXEL
6309 add r2, 8*SIZEOF_PIXEL
6310 call pixel_sa8d_8x8_internal2
6311 SA8D_INTER
6312 mova [esp+64-mmsize], m0
6313 call pixel_sa8d_8x8_internal2
6314 AVG_16x16
6315
6316 mov r0, [r6+20]
6317 mov r2, [r6+28]
6318 add r0, 16*SIZEOF_PIXEL
6319 add r2, 16*SIZEOF_PIXEL
6320 lea r4, [r1 + 2*r1]
6321 call pixel_sa8d_8x8_internal2
6322 %if HIGH_BIT_DEPTH
6323 HADDUW m0, m1
6324 %endif
6325 mova [esp+48], m0
6326 call pixel_sa8d_8x8_internal2
6327 SA8D_INTER
6328 mova [esp+48], m0
6329
6330 mov r0, [r6+20]
6331 mov r2, [r6+28]
6332 add r0, 24*SIZEOF_PIXEL
6333 add r2, 24*SIZEOF_PIXEL
6334 call pixel_sa8d_8x8_internal2
6335 SA8D_INTER
6336 mova [esp+64-mmsize], m0
6337 call pixel_sa8d_8x8_internal2
6338 AVG_16x16
6339
6340 mov r0, [r6+20]
6341 mov r2, [r6+28]
6342 add r0, 32*SIZEOF_PIXEL
6343 add r2, 32*SIZEOF_PIXEL
6344 lea r4, [r1 + 2*r1]
6345 call pixel_sa8d_8x8_internal2
6346 %if HIGH_BIT_DEPTH
6347 HADDUW m0, m1
6348 %endif
6349 mova [esp+48], m0
6350 call pixel_sa8d_8x8_internal2
6351 SA8D_INTER
6352 mova [esp+48], m0
6353
6354 mov r0, [r6+20]
6355 mov r2, [r6+28]
6356 add r0, 40*SIZEOF_PIXEL
6357 add r2, 40*SIZEOF_PIXEL
6358 call pixel_sa8d_8x8_internal2
6359 SA8D_INTER
6360 mova [esp+64-mmsize], m0
6361 call pixel_sa8d_8x8_internal2
6362 AVG_16x16
6363
6364 mov r0, [r6+20]
6365 mov r2, [r6+28]
6366 add r0, 48*SIZEOF_PIXEL
6367 add r2, 48*SIZEOF_PIXEL
6368 lea r4, [r1 + 2*r1]
6369 call pixel_sa8d_8x8_internal2
6370 %if HIGH_BIT_DEPTH
6371 HADDUW m0, m1
6372 %endif
6373 mova [esp+48], m0
6374 call pixel_sa8d_8x8_internal2
6375 SA8D_INTER
6376 mova [esp+48], m0
6377
6378 mov r0, [r6+20]
6379 mov r2, [r6+28]
6380 add r0, 56*SIZEOF_PIXEL
6381 add r2, 56*SIZEOF_PIXEL
6382 call pixel_sa8d_8x8_internal2
6383 SA8D_INTER
6384 mova [esp+64-mmsize], m0
6385 call pixel_sa8d_8x8_internal2
6386 AVG_16x16
6387
6388 mov r0, [r6+20]
6389 mov r2, [r6+28]
6390 lea r0, [r0 + r1*8]
6391 lea r2, [r2 + r3*8]
6392 lea r0, [r0 + r1*8]
6393 lea r2, [r2 + r3*8]
6394 mov [r6+20], r0
6395 mov [r6+28], r2
6396
6397 lea r4, [r1 + 2*r1]
6398 call pixel_sa8d_8x8_internal2
6399 %if HIGH_BIT_DEPTH
6400 HADDUW m0, m1
6401 %endif
6402 mova [esp+48], m0
6403 call pixel_sa8d_8x8_internal2
6404 SA8D_INTER
6405 mova [esp+48], m0
6406
6407 mov r0, [r6+20]
6408 mov r2, [r6+28]
6409 add r0, 8*SIZEOF_PIXEL
6410 add r2, 8*SIZEOF_PIXEL
6411 call pixel_sa8d_8x8_internal2
6412 SA8D_INTER
6413 mova [esp+64-mmsize], m0
6414 call pixel_sa8d_8x8_internal2
6415 AVG_16x16
6416
6417 mov r0, [r6+20]
6418 mov r2, [r6+28]
6419 add r0, 16*SIZEOF_PIXEL
6420 add r2, 16*SIZEOF_PIXEL
6421 lea r4, [r1 + 2*r1]
6422 call pixel_sa8d_8x8_internal2
6423 %if HIGH_BIT_DEPTH
6424 HADDUW m0, m1
6425 %endif
6426 mova [esp+48], m0
6427 call pixel_sa8d_8x8_internal2
6428 SA8D_INTER
6429 mova [esp+48], m0
6430
6431 mov r0, [r6+20]
6432 mov r2, [r6+28]
6433 add r0, 24*SIZEOF_PIXEL
6434 add r2, 24*SIZEOF_PIXEL
6435 call pixel_sa8d_8x8_internal2
6436 SA8D_INTER
6437 mova [esp+64-mmsize], m0
6438 call pixel_sa8d_8x8_internal2
6439 AVG_16x16
6440
6441 mov r0, [r6+20]
6442 mov r2, [r6+28]
6443 add r0, 32*SIZEOF_PIXEL
6444 add r2, 32*SIZEOF_PIXEL
6445 lea r4, [r1 + 2*r1]
6446 call pixel_sa8d_8x8_internal2
6447 %if HIGH_BIT_DEPTH
6448 HADDUW m0, m1
6449 %endif
6450 mova [esp+48], m0
6451 call pixel_sa8d_8x8_internal2
6452 SA8D_INTER
6453 mova [esp+48], m0
6454
6455 mov r0, [r6+20]
6456 mov r2, [r6+28]
6457 add r0, 40*SIZEOF_PIXEL
6458 add r2, 40*SIZEOF_PIXEL
6459 call pixel_sa8d_8x8_internal2
6460 SA8D_INTER
6461 mova [esp+64-mmsize], m0
6462 call pixel_sa8d_8x8_internal2
6463 AVG_16x16
6464
6465 mov r0, [r6+20]
6466 mov r2, [r6+28]
6467 add r0, 48*SIZEOF_PIXEL
6468 add r2, 48*SIZEOF_PIXEL
6469 lea r4, [r1 + 2*r1]
6470 call pixel_sa8d_8x8_internal2
6471 %if HIGH_BIT_DEPTH
6472 HADDUW m0, m1
6473 %endif
6474 mova [esp+48], m0
6475 call pixel_sa8d_8x8_internal2
6476 SA8D_INTER
6477 mova [esp+48], m0
6478
6479 mov r0, [r6+20]
6480 mov r2, [r6+28]
6481 add r0, 56*SIZEOF_PIXEL
6482 add r2, 56*SIZEOF_PIXEL
6483 call pixel_sa8d_8x8_internal2
6484 SA8D_INTER
6485 mova [esp+64-mmsize], m0
6486 call pixel_sa8d_8x8_internal2
6487 SA8D_INTER
6488 %if HIGH_BIT_DEPTH == 0
6489 HADDUW m0, m1
6490 %endif
6491 movd r4d, m0
6492 add r4d, 1
6493 shr r4d, 1
6494 add r4d, dword [esp+36]
6495 mov eax, r4d
6496 mov esp, r6
6497 RET
6498 %endif ; !ARCH_X86_64
6499 %endmacro ; SA8D
6500
6501 ;=============================================================================
6502 ; INTRA SATD
6503 ;=============================================================================
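; The SATDS_SSE2 and SA8D macro bodies defined above are instantiated below
; once per instruction set; the %defines retarget the transpose, differencing
; and load helpers to the variant best suited to each target.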
6504 %define TRANS TRANS_SSE2
6505 %define DIFFOP DIFF_UNPACK_SSE2
6506 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
6507 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
6508 %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
6509 %define movdqu movups
6510 %define punpcklqdq movlhps
6511 INIT_XMM sse2
6512 SA8D
6513 SATDS_SSE2
6514
6515 %if HIGH_BIT_DEPTH == 0
6516 INIT_XMM ssse3,atom
6517 SATDS_SSE2
6518 SA8D
6519 %endif
6520
6521 %define DIFFOP DIFF_SUMSUB_SSSE3
6522 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
6523 %if HIGH_BIT_DEPTH == 0
6524 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
6525 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
6526 %endif
6527 INIT_XMM ssse3
6528 SATDS_SSE2
6529 SA8D
6530 %undef movdqa ; nehalem doesn't like movaps
6531 %undef movdqu ; movups
6532 %undef punpcklqdq ; or movlhps
6533
6534 %define TRANS TRANS_SSE4
6535 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
6536 INIT_XMM sse4
6537 SATDS_SSE2
6538 SA8D
6539
6540 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
6541 ; it's effectively free.
6542 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
6543 INIT_XMM avx
6544 SATDS_SSE2
6545 SA8D
6546
6547 %define TRANS TRANS_XOP
6548 INIT_XMM xop
6549 SATDS_SSE2
6550 SA8D
6551
6552
6553 %if HIGH_BIT_DEPTH == 0
6554 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
6555 %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
6556 %define TRANS TRANS_SSE4
6557
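; Load an 8x8 block from both sources, two rows per ymm register (one row per
; 128-bit lane, duplicated across its qwords), and convert each pair into
; sum/diff form with the +/-1 multiplier in %7 (DIFF_SUMSUB_SSSE3).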
6558 %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
6559 movq xm%1, [r0]
6560 movq xm%3, [r2]
6561 movq xm%2, [r0+r1]
6562 movq xm%4, [r2+r3]
6563 vinserti128 m%1, m%1, [r0+4*r1], 1
6564 vinserti128 m%3, m%3, [r2+4*r3], 1
6565 vinserti128 m%2, m%2, [r0+r4], 1
6566 vinserti128 m%4, m%4, [r2+r5], 1
6567 punpcklqdq m%1, m%1
6568 punpcklqdq m%3, m%3
6569 punpcklqdq m%2, m%2
6570 punpcklqdq m%4, m%4
6571 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
6572 lea r0, [r0+2*r1]
6573 lea r2, [r2+2*r3]
6574
6575 movq xm%3, [r0]
6576 movq xm%5, [r2]
6577 movq xm%4, [r0+r1]
6578 movq xm%6, [r2+r3]
6579 vinserti128 m%3, m%3, [r0+4*r1], 1
6580 vinserti128 m%5, m%5, [r2+4*r3], 1
6581 vinserti128 m%4, m%4, [r0+r4], 1
6582 vinserti128 m%6, m%6, [r2+r5], 1
6583 punpcklqdq m%3, m%3
6584 punpcklqdq m%5, m%5
6585 punpcklqdq m%4, m%4
6586 punpcklqdq m%6, m%6
6587 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
6588 %endmacro
6589
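; SATD_START_AVX2 acc, mul[, 8wide]: zero the accumulator, load the +/-1
; multiplier (hmul_8p for the 8-pixel-wide kernels, hmul_16p otherwise) and
; precompute r4/r5 = 5*stride resp. 3*stride for the row addressing below.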
6590 %macro SATD_START_AVX2 2-3 0
6591 FIX_STRIDES r1, r3
6592 %if %3
6593 mova %2, [hmul_8p]
6594 lea r4, [5*r1]
6595 lea r5, [5*r3]
6596 %else
6597 mova %2, [hmul_16p]
6598 lea r4, [3*r1]
6599 lea r5, [3*r3]
6600 %endif
6601 pxor %1, %1
6602 %endmacro
6603
6604 %define TRANS TRANS_SSE4
6605 INIT_YMM avx2
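; SATD of one 16x8 block; the running sum stays in m6 (set up by
; SATD_START_AVX2) and is reduced to a scalar by the public entry points below.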
6606 cglobal pixel_satd_16x8_internal
6607 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
6608 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6609 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
6610 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6611 ret
6612
6613 cglobal pixel_satd_16x16, 4,6,8
6614 SATD_START_AVX2 m6, m7
6615 call pixel_satd_16x8_internal
6616 lea r0, [r0+4*r1]
6617 lea r2, [r2+4*r3]
6618 pixel_satd_16x8_internal:
6619 call pixel_satd_16x8_internal
6620 vextracti128 xm0, m6, 1
6621 paddw xm0, xm6
6622 SATD_END_SSE2 xm0
6623 RET
6624
6625 cglobal pixel_satd_16x8, 4,6,8
6626 SATD_START_AVX2 m6, m7
6627 jmp pixel_satd_16x8_internal
6628
6629 cglobal pixel_satd_8x8_internal
6630 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
6631 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6632 ret
6633
6634 cglobal pixel_satd_8x16, 4,6,8
6635 SATD_START_AVX2 m6, m7, 1
6636 call pixel_satd_8x8_internal
6637 lea r0, [r0+2*r1]
6638 lea r2, [r2+2*r3]
6639 lea r0, [r0+4*r1]
6640 lea r2, [r2+4*r3]
6641 call pixel_satd_8x8_internal
6642 vextracti128 xm0, m6, 1
6643 paddw xm0, xm6
6644 SATD_END_SSE2 xm0
6645 RET
6646
6647 cglobal pixel_satd_8x8, 4,6,8
6648 SATD_START_AVX2 m6, m7, 1
6649 call pixel_satd_8x8_internal
6650 vextracti128 xm0, m6, 1
6651 paddw xm0, xm6
6652 SATD_END_SSE2 xm0
6653 RET
6654
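; 8x8 SA8D kernel: Hadamard-transforms the pixel differences loaded by
; LOAD_SUMSUB_8x8P_AVX2 and accumulates the absolute coefficients into m6.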
6655 cglobal pixel_sa8d_8x8_internal
6656 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
6657 HADAMARD4_V 0, 1, 2, 3, 4
6658 HADAMARD 8, sumsub, 0, 1, 4, 5
6659 HADAMARD 8, sumsub, 2, 3, 4, 5
6660 HADAMARD 2, sumsub, 0, 1, 4, 5
6661 HADAMARD 2, sumsub, 2, 3, 4, 5
6662 HADAMARD 1, amax, 0, 1, 4, 5
6663 HADAMARD 1, amax, 2, 3, 4, 5
6664 paddw m6, m0
6665 paddw m6, m2
6666 ret
6667
6668 cglobal pixel_sa8d_8x8, 4,6,8
6669 SATD_START_AVX2 m6, m7, 1
6670 call pixel_sa8d_8x8_internal
6671 vextracti128 xm1, m6, 1
6672 paddw xm6, xm1
6673 HADDW xm6, xm1
6674 movd eax, xm6
6675 add eax, 1
6676 shr eax, 1
6677 RET
6678
6679 cglobal pixel_sa8d_16x16, 4,6,8
6680 SATD_START_AVX2 m6, m7, 1
6681
6682 call pixel_sa8d_8x8_internal ; pix[0]
6683
6684 sub r0, r1
6685 sub r0, r1
6686 add r0, 8*SIZEOF_PIXEL
6687 sub r2, r3
6688 sub r2, r3
6689 add r2, 8*SIZEOF_PIXEL
6690 call pixel_sa8d_8x8_internal ; pix[8]
6691
6692 add r0, r4
6693 add r0, r1
6694 add r2, r5
6695 add r2, r3
6696 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
6697
6698 sub r0, r1
6699 sub r0, r1
6700 sub r0, 8*SIZEOF_PIXEL
6701 sub r2, r3
6702 sub r2, r3
6703 sub r2, 8*SIZEOF_PIXEL
6704 call pixel_sa8d_8x8_internal ; pix[8*stride]
6705
6706 ; TODO: analyze Dynamic Range
6707 vextracti128 xm0, m6, 1
6708 paddusw xm6, xm0
6709 HADDUW xm6, xm0
6710 movd eax, xm6
6711 add eax, 1
6712 shr eax, 1
6713 RET
6714
6715 cglobal pixel_sa8d_16x16_internal
6716 call pixel_sa8d_8x8_internal ; pix[0]
6717
6718 sub r0, r1
6719 sub r0, r1
6720 add r0, 8*SIZEOF_PIXEL
6721 sub r2, r3
6722 sub r2, r3
6723 add r2, 8*SIZEOF_PIXEL
6724 call pixel_sa8d_8x8_internal ; pix[8]
6725
6726 add r0, r4
6727 add r0, r1
6728 add r2, r5
6729 add r2, r3
6730 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
6731
6732 sub r0, r1
6733 sub r0, r1
6734 sub r0, 8*SIZEOF_PIXEL
6735 sub r2, r3
6736 sub r2, r3
6737 sub r2, 8*SIZEOF_PIXEL
6738 call pixel_sa8d_8x8_internal ; pix[8*stride]
6739
6740 ; TODO: analyze Dynamic Range
6741 vextracti128 xm0, m6, 1
6742 paddusw xm6, xm0
6743 HADDUW xm6, xm0
6744 movd eax, xm6
6745 add eax, 1
6746 shr eax, 1
6747 ret
6748
6749 %if ARCH_X86_64
6750 cglobal pixel_sa8d_32x32, 4,8,8
6751     ; TODO: r6 is RAX on the x64 platform, so we use it directly
6752
6753 SATD_START_AVX2 m6, m7, 1
6754 xor r7d, r7d
6755
6756 call pixel_sa8d_16x16_internal ; [0]
6757 pxor m6, m6
6758 add r7d, eax
6759
6760 add r0, r4
6761 add r0, r1
6762 add r2, r5
6763 add r2, r3
6764 call pixel_sa8d_16x16_internal ; [2]
6765 pxor m6, m6
6766 add r7d, eax
6767
6768 lea eax, [r4 * 5 - 16]
6769 sub r0, rax
6770 sub r0, r1
6771 lea eax, [r5 * 5 - 16]
6772 sub r2, rax
6773 sub r2, r3
6774 call pixel_sa8d_16x16_internal ; [1]
6775 pxor m6, m6
6776 add r7d, eax
6777
6778 add r0, r4
6779 add r0, r1
6780 add r2, r5
6781 add r2, r3
6782 call pixel_sa8d_16x16_internal ; [3]
6783 add eax, r7d
6784 RET
6785 %endif ; ARCH_X86_64=1
6786 %endif ; HIGH_BIT_DEPTH
6787
6788 ; Input 10bit, Output 8bit
6789 ;------------------------------------------------------------------------------------------------------------------------
6790 ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
6791 ;------------------------------------------------------------------------------------------------------------------------
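; A rough C sketch of what downShift_16 computes (orientation only, not the
; reference implementation; the last row is handled by separate tail code so
; that widths which are not a multiple of the vector size never over-read,
; and the vector code packs with unsigned saturation rather than a plain cast):
;
;   for (int y = 0; y < height; y++)
;       for (int x = 0; x < width; x++)
;           dst[y * dstStride + x] = (uint8_t)(src[y * srcStride + x] >> shift);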
6792 INIT_XMM sse2
6793 cglobal downShift_16, 7,7,3
6794 movd m0, r6d ; m0 = shift
6795 add r1, r1
6796 dec r5d
6797 .loopH:
6798 xor r6, r6
6799 .loopW:
6800 movu m1, [r0 + r6 * 2]
6801 movu m2, [r0 + r6 * 2 + 16]
6802 psrlw m1, m0
6803 psrlw m2, m0
6804 packuswb m1, m2
6805 movu [r2 + r6], m1
6806
6807 add r6, 16
6808 cmp r6d, r4d
6809 jl .loopW
6810
6811 ; move to next row
6812 add r0, r1
6813 add r2, r3
6814 dec r5d
6815 jnz .loopH
6816
6817 ; process the last row of the frame (handles widths that are not a multiple of 16)
6818
6819 .loop16:
6820 movu m1, [r0]
6821 movu m2, [r0 + 16]
6822 psrlw m1, m0
6823 psrlw m2, m0
6824 packuswb m1, m2
6825 movu [r2], m1
6826
6827 add r0, 2 * mmsize
6828 add r2, mmsize
6829 sub r4d, 16
6830 jz .end
6831 cmp r4d, 15
6832 jg .loop16
6833
6834 cmp r4d, 8
6835 jl .process4
6836 movu m1, [r0]
6837 psrlw m1, m0
6838 packuswb m1, m1
6839 movh [r2], m1
6840
6841 add r0, mmsize
6842 add r2, 8
6843 sub r4d, 8
6844 jz .end
6845
6846 .process4:
6847 cmp r4d, 4
6848 jl .process2
6849 movh m1,[r0]
6850 psrlw m1, m0
6851 packuswb m1, m1
6852 movd [r2], m1
6853
6854 add r0, 8
6855 add r2, 4
6856 sub r4d, 4
6857 jz .end
6858
6859 .process2:
6860 cmp r4d, 2
6861 jl .process1
6862 movd m1, [r0]
6863 psrlw m1, m0
6864 packuswb m1, m1
6865 movd r6, m1
6866 mov [r2], r6w
6867
6868 add r0, 4
6869 add r2, 2
6870 sub r4d, 2
6871 jz .end
6872
6873 .process1:
6874 movd m1, [r0]
6875 psrlw m1, m0
6876 packuswb m1, m1
6877 movd r3, m1
6878 mov [r2], r3b
6879 .end:
6880 RET
6881
6882 ; Input 10bit, Output 8bit
6883 ;-------------------------------------------------------------------------------------------------------------------------------------
6884 ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
6885 ;-------------------------------------------------------------------------------------------------------------------------------------
6886 INIT_YMM avx2
6887 cglobal downShift_16, 6,7,3
6888 movd xm0, r6m ; m0 = shift
6889 add r1d, r1d
6890 dec r5d
6891 .loopH:
6892 xor r6, r6
6893 .loopW:
6894 movu m1, [r0 + r6 * 2 + 0]
6895 movu m2, [r0 + r6 * 2 + 32]
6896 vpsrlw m1, xm0
6897 vpsrlw m2, xm0
6898 packuswb m1, m2
6899 vpermq m1, m1, 11011000b
6900 movu [r2 + r6], m1
6901
6902 add r6d, mmsize
6903 cmp r6d, r4d
6904 jl .loopW
6905
6906 ; move to next row
6907 add r0, r1
6908 add r2, r3
6909 dec r5d
6910 jnz .loopH
6911
6912 ; process the last row of the frame (handles widths that are not a multiple of 32)
6913 mov r6d, r4d
6914 and r4d, 31
6915 shr r6d, 5
6916
6917 .loop32:
6918 movu m1, [r0]
6919 movu m2, [r0 + 32]
6920 psrlw m1, xm0
6921 psrlw m2, xm0
6922 packuswb m1, m2
6923 vpermq m1, m1, 11011000b
6924 movu [r2], m1
6925
6926 add r0, 2*mmsize
6927 add r2, mmsize
6928 dec r6d
6929 jnz .loop32
6930
6931 cmp r4d, 16
6932 jl .process8
6933 movu m1, [r0]
6934 psrlw m1, xm0
6935 packuswb m1, m1
6936 vpermq m1, m1, 10001000b
6937 movu [r2], xm1
6938
6939 add r0, mmsize
6940 add r2, 16
6941 sub r4d, 16
6942 jz .end
6943
6944 .process8:
6945 cmp r4d, 8
6946 jl .process4
6947 movu m1, [r0]
6948 psrlw m1, xm0
6949 packuswb m1, m1
6950 movq [r2], xm1
6951
6952 add r0, 16
6953 add r2, 8
6954 sub r4d, 8
6955 jz .end
6956
6957 .process4:
6958 cmp r4d, 4
6959 jl .process2
6960 movq xm1,[r0]
6961 psrlw m1, xm0
6962 packuswb m1, m1
6963 movd [r2], xm1
6964
6965 add r0, 8
6966 add r2, 4
6967 sub r4d, 4
6968 jz .end
6969
6970 .process2:
6971 cmp r4d, 2
6972 jl .process1
6973 movd xm1, [r0]
6974 psrlw m1, xm0
6975 packuswb m1, m1
6976 movd r6d, xm1
6977 mov [r2], r6w
6978
6979 add r0, 4
6980 add r2, 2
6981 sub r4d, 2
6982 jz .end
6983
6984 .process1:
6985 movd xm1, [r0]
6986 psrlw m1, xm0
6987 packuswb m1, m1
6988 movd r3d, xm1
6989 mov [r2], r3b
6990 .end:
6991 RET
6992
6993 ; Input 8bit, Output 10bit
6994 ;---------------------------------------------------------------------------------------------------------------------
6995 ;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
6996 ;---------------------------------------------------------------------------------------------------------------------
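; A rough C sketch of upShift_8 (orientation only): each 8-bit sample is
; zero-extended and shifted left into the target bit depth.
;
;   for (int y = 0; y < height; y++)
;       for (int x = 0; x < width; x++)
;           dst[y * dstStride + x] = (pixel)(src[y * srcStride + x] << shift);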
6997 INIT_XMM sse4
6998 cglobal upShift_8, 6,7,3
6999 movd xm2, r6m
7000 add r3d, r3d
7001 dec r5d
7002
7003 .loopH:
7004 xor r6, r6
7005 .loopW:
7006 pmovzxbw m0,[r0 + r6]
7007 pmovzxbw m1,[r0 + r6 + mmsize/2]
7008 psllw m0, m2
7009 psllw m1, m2
7010 movu [r2 + r6 * 2], m0
7011 movu [r2 + r6 * 2 + mmsize], m1
7012
7013 add r6d, mmsize
7014 cmp r6d, r4d
7015 jl .loopW
7016
7017 ; move to next row
7018 add r0, r1
7019 add r2, r3
7020 dec r5d
7021 jg .loopH
7022
7023 ; process the last row of the frame (handles widths that are not a multiple of 16)
7024 mov r1d, (mmsize/2 - 1)
7025 and r1d, r4d
7026 sub r1, mmsize/2
7027
7028 ; NOTE: width must be greater than or equal to 8
7029 shr r4d, 3 ; log2(mmsize/2): each iteration consumes 8 source pixels
7030 .loopW8:
7031 pmovzxbw m0,[r0]
7032 psllw m0, m2
7033 movu [r2], m0
7034 add r0, mmsize/2
7035 add r2, mmsize
7036 dec r4d
7037 jg .loopW8
7038
7039 ; reading past the end of the buffer can fault on Mac OS X, so step back and redo the last (possibly overlapping) chunk flush with the end of the row
7040 pmovzxbw m0,[r0 + r1]
7041 psllw m0, m2
7042 movu [r2 + r1 * 2], m0
7043 RET
7044
7045
7046 ;---------------------------------------------------------------------------------------------------------------------
7047 ;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
7048 ;---------------------------------------------------------------------------------------------------------------------
7049 %if ARCH_X86_64
7050 INIT_YMM avx2
7051 cglobal upShift_8, 6,7,3
7052 movd xm2, r6m
7053 add r3d, r3d
7054 dec r5d
7055
7056 .loopH:
7057 xor r6, r6
7058 .loopW:
7059 pmovzxbw m0,[r0 + r6]
7060 pmovzxbw m1,[r0 + r6 + mmsize/2]
7061 psllw m0, xm2
7062 psllw m1, xm2
7063 movu [r2 + r6 * 2], m0
7064 movu [r2 + r6 * 2 + mmsize], m1
7065
7066 add r6d, mmsize
7067 cmp r6d, r4d
7068 jl .loopW
7069
7070 ; move to next row
7071 add r0, r1
7072 add r2, r3
7073 dec r5d
7074 jg .loopH
7075
7076 ; process the last row of the frame (handles widths that are not a multiple of 32)
7077 mov r1d, (mmsize/2 - 1)
7078 and r1d, r4d
7079 sub r1, mmsize/2
7080
7081 ; NOTE: width must be greater than or equal to 16
7082 shr r4d, 4 ; log2(mmsize/2): each iteration consumes 16 source pixels
7083 .loopW16:
7084 pmovzxbw m0,[r0]
7085 psllw m0, xm2
7086 movu [r2], m0
7087 add r0, mmsize/2
7088 add r2, mmsize
7089 dec r4d
7090 jg .loopW16
7091
7092 ; reading past the end of the buffer can fault on Mac OS X, so step back and redo the last (possibly overlapping) chunk flush with the end of the row
7093 pmovzxbw m0,[r0 + r1]
7094 psllw m0, xm2
7095 movu [r2 + r1 * 2], m0
7096 RET
7097 %endif
7098
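; ABSD2 computes the packed 32-bit absolute value of two registers at once:
; %1 = |%3| and %2 = |%4|, with %5/%6 as scratch. On SSSE3+ this is a plain
; pabsd; the fallback emulates it as max(x, 0 - x).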
7099 %macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
7100 %if cpuflag(ssse3)
7101 pabsd %1, %3
7102 pabsd %2, %4
7103 %elifidn %1, %3
7104 pxor %5, %5
7105 pxor %6, %6
7106 psubd %5, %1
7107 psubd %6, %2
7108 pmaxsd %1, %5
7109 pmaxsd %2, %6
7110 %else
7111 pxor %1, %1
7112 pxor %2, %2
7113 psubd %1, %3
7114 psubd %2, %4
7115 pmaxsd %1, %3
7116 pmaxsd %2, %4
7117 %endif
7118 %endmacro
7119
7120
7121 ; Input 10bit, Output 12bit
7122 ;------------------------------------------------------------------------------------------------------------------------
7123 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
7124 ;------------------------------------------------------------------------------------------------------------------------
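; A rough C sketch of upShift_16 (orientation only; PIXEL_MAX stands in for
; the pw_pixel_max constant loaded below):
;
;   for (int y = 0; y < height; y++)
;       for (int x = 0; x < width; x++)
;           dst[y * dstStride + x] = (src[y * srcStride + x] << shift) & PIXEL_MAX;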
7125 INIT_XMM sse2
7126 cglobal upShift_16, 6,7,4
7127 movd m0, r6m ; m0 = shift
7128 mova m3, [pw_pixel_max]
7129 FIX_STRIDES r1d, r3d
7130 dec r5d
7131 .loopH:
7132 xor r6d, r6d
7133 .loopW:
7134 movu m1, [r0 + r6 * SIZEOF_PIXEL]
7135 movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
7136 psllw m1, m0
7137 psllw m2, m0
7138 ; TODO: if the input is always in valid range, the two masking instructions below can be removed.
7139 pand m1, m3
7140 pand m2, m3
7141 movu [r2 + r6 * SIZEOF_PIXEL], m1
7142 movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
7143
7144 add r6, mmsize * 2 / SIZEOF_PIXEL
7145 cmp r6d, r4d
7146 jl .loopW
7147
7148 ; move to next row
7149 add r0, r1
7150 add r2, r3
7151 dec r5d
7152 jnz .loopH
7153
7154 ; process the last row of the frame (handles widths that are not a multiple of 16)
7155
7156 .loop16:
7157 movu m1, [r0]
7158 movu m2, [r0 + mmsize]
7159 psllw m1, m0
7160 psllw m2, m0
7161 pand m1, m3
7162 pand m2, m3
7163 movu [r2], m1
7164 movu [r2 + mmsize], m2
7165
7166 add r0, 2 * mmsize
7167 add r2, 2 * mmsize
7168 sub r4d, 16
7169 jz .end
7170 jg .loop16
7171
7172 cmp r4d, 8
7173 jl .process4
7174 movu m1, [r0]
7175 psllw m1, m0
7176 pand m1, m3
7177 movu [r2], m1
7178
7179 add r0, mmsize
7180 add r2, mmsize
7181 sub r4d, 8
7182 jz .end
7183
7184 .process4:
7185 cmp r4d, 4
7186 jl .process2
7187 movh m1,[r0]
7188 psllw m1, m0
7189 pand m1, m3
7190 movh [r2], m1
7191
7192 add r0, 8
7193 add r2, 8
7194 sub r4d, 4
7195 jz .end
7196
7197 .process2:
7198 cmp r4d, 2
7199 jl .process1
7200 movd m1, [r0]
7201 psllw m1, m0
7202 pand m1, m3
7203 movd [r2], m1
7204
7205 add r0, 4
7206 add r2, 4
7207 sub r4d, 2
7208 jz .end
7209
7210 .process1:
7211 movd m1, [r0]
7212 psllw m1, m0
7213 pand m1, m3
7214 movd r3, m1
7215 mov [r2], r3w
7216 .end:
7217 RET
7218
7219 ; Input 10bit, Output 12bit
7220 ;-------------------------------------------------------------------------------------------------------------------------------------
7221 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
7222 ;-------------------------------------------------------------------------------------------------------------------------------------
7223 ; TODO: this code path has no test coverage yet!
7224 INIT_YMM avx2
7225 cglobal upShift_16, 6,7,4
7226 movd xm0, r6m ; m0 = shift
7227 vbroadcasti128 m3, [pw_pixel_max]
7228 FIX_STRIDES r1d, r3d
7229 dec r5d
7230 .loopH:
7231 xor r6d, r6d
7232 .loopW:
7233 movu m1, [r0 + r6 * SIZEOF_PIXEL]
7234 movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
7235 psllw m1, xm0
7236 psllw m2, xm0
7237 pand m1, m3
7238 pand m2, m3
7239 movu [r2 + r6 * SIZEOF_PIXEL], m1
7240 movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
7241
7242 add r6, mmsize * 2 / SIZEOF_PIXEL
7243 cmp r6d, r4d
7244 jl .loopW
7245
7246 ; move to next row
7247 add r0, r1
7248 add r2, r3
7249 dec r5d
7250 jnz .loopH
7251
7252 ; process the last row of the frame (handles widths that are not a multiple of 32)
7253 mov r6d, r4d
7254 and r4d, 31
7255 shr r6d, 5
7256
7257 .loop32:
7258 movu m1, [r0]
7259 movu m2, [r0 + mmsize]
7260 psllw m1, xm0
7261 psllw m2, xm0
7262 pand m1, m3
7263 pand m2, m3
7264 movu [r2], m1
7265 movu [r2 + mmsize], m2
7266
7267 add r0, 2*mmsize
7268 add r2, 2*mmsize
7269 dec r6d
7270 jnz .loop32
7271
7272 cmp r4d, 16
7273 jl .process8
7274 movu m1, [r0]
7275 psllw m1, xm0
7276 pand m1, m3
7277 movu [r2], m1
7278
7279 add r0, mmsize
7280 add r2, mmsize
7281 sub r4d, 16
7282 jz .end
7283
7284 .process8:
7285 cmp r4d, 8
7286 jl .process4
7287 movu xm1, [r0]
7288 psllw xm1, xm0
7289 pand xm1, xm3
7290 movu [r2], xm1
7291
7292 add r0, 16
7293 add r2, 16
7294 sub r4d, 8
7295 jz .end
7296
7297 .process4:
7298 cmp r4d, 4
7299 jl .process2
7300 movq xm1,[r0]
7301 psllw xm1, xm0
7302 pand xm1, xm3
7303 movq [r2], xm1
7304
7305 add r0, 8
7306 add r2, 8
7307 sub r4d, 4
7308 jz .end
7309
7310 .process2:
7311 cmp r4d, 2
7312 jl .process1
7313 movd xm1, [r0]
7314 psllw xm1, xm0
7315 pand xm1, xm3
7316 movd [r2], xm1
7317
7318 add r0, 4
7319 add r2, 4
7320 sub r4d, 2
7321 jz .end
7322
7323 .process1:
7324 movd xm1, [r0]
7325 psllw xm1, xm0
7326 pand xm1, xm3
7327 movd r3d, xm1
7328 mov [r2], r3w
7329 .end:
7330 RET
7331
7332
7333 ;---------------------------------------------------------------------------------------------------------------------
7334 ;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
7335 ;---------------------------------------------------------------------------------------------------------------------
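; psyCost compares the "texture energy" of the source and reconstructed
; blocks. Roughly (a sketch; the exact rounding/normalization constants vary
; with block size and bit depth):
;
;   energy(blk) = SATD(blk, zeros) - (SAD(blk, zeros) >> 2)   ; AC energy
;   psyCost     = abs(energy(source) - energy(recon))
;
; Sizes above 8x8 accumulate this cost over their 8x8 sub-blocks, using sa8d
; in place of satd.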
7336 INIT_XMM sse4
7337 cglobal psyCost_pp_4x4, 4, 5, 8
7338
7339 %if HIGH_BIT_DEPTH
7340 FIX_STRIDES r1, r3
7341 lea r4, [3 * r1]
7342 movddup m0, [r0]
7343 movddup m1, [r0 + r1]
7344 movddup m2, [r0 + r1 * 2]
7345 movddup m3, [r0 + r4]
7346 mova m4, [hmul_8w]
7347 pmaddwd m0, m4
7348 pmaddwd m1, m4
7349 pmaddwd m2, m4
7350 pmaddwd m3, m4
7351
7352 paddd m5, m0, m1
7353 paddd m5, m2
7354 paddd m5, m3
7355 psrldq m4, m5, 4
7356 paddd m5, m4
7357 psrld m5, 2
7358
7359 SUMSUB_BA d, 0, 1, 4
7360 SUMSUB_BA d, 2, 3, 4
7361 SUMSUB_BA d, 0, 2, 4
7362 SUMSUB_BA d, 1, 3, 4
7363 %define ORDER unord
7364 TRANS q, ORDER, 0, 2, 4, 6
7365 TRANS q, ORDER, 1, 3, 4, 6
7366 ABSD2 m0, m2, m0, m2, m4, m6
7367 pmaxsd m0, m2
7368 ABSD2 m1, m3, m1, m3, m4, m6
7369 pmaxsd m1, m3
7370 paddd m0, m1
7371 movhlps m1, m0
7372 paddd m0, m1
7373 psrldq m1, m0, 4
7374 paddd m0, m1
7375
7376 psubd m7, m0, m5
7377
7378 lea r4, [3 * r3]
7379 movddup m0, [r2]
7380 movddup m1, [r2 + r3]
7381 movddup m2, [r2 + r3 * 2]
7382 movddup m3, [r2 + r4]
7383 mova m4, [hmul_8w]
7384 pmaddwd m0, m4
7385 pmaddwd m1, m4
7386 pmaddwd m2, m4
7387 pmaddwd m3, m4
7388
7389 paddd m5, m0, m1
7390 paddd m5, m2
7391 paddd m5, m3
7392 psrldq m4, m5, 4
7393 paddd m5, m4
7394 psrld m5, 2
7395
7396 SUMSUB_BA d, 0, 1, 4
7397 SUMSUB_BA d, 2, 3, 4
7398 SUMSUB_BA d, 0, 2, 4
7399 SUMSUB_BA d, 1, 3, 4
7400 %define ORDER unord
7401 TRANS q, ORDER, 0, 2, 4, 6
7402 TRANS q, ORDER, 1, 3, 4, 6
7403 ABSD2 m0, m2, m0, m2, m4, m6
7404 pmaxsd m0, m2
7405 ABSD2 m1, m3, m1, m3, m4, m6
7406 pmaxsd m1, m3
7407 paddd m0, m1
7408 movhlps m1, m0
7409 paddd m0, m1
7410 psrldq m1, m0, 4
7411 paddd m0, m1
7412
7413 psubd m0, m5
7414
7415 psubd m7, m0
7416 pabsd m0, m7
7417 movd eax, m0
7418
7419 %else ; !HIGH_BIT_DEPTH
7420 lea r4, [3 * r1]
7421 movd m0, [r0]
7422 movd m1, [r0 + r1]
7423 movd m2, [r0 + r1 * 2]
7424 movd m3, [r0 + r4]
7425 shufps m0, m1, 0
7426 shufps m2, m3, 0
7427 mova m4, [hmul_4p]
7428 pmaddubsw m0, m4
7429 pmaddubsw m2, m4
7430
7431 paddw m5, m0, m2
7432 movhlps m4, m5
7433 paddw m5, m4
7434 pmaddwd m5, [pw_1]
7435 psrld m5, 2
7436
7437 HADAMARD 0, sumsub, 0, 2, 1, 3
7438 HADAMARD 4, sumsub, 0, 2, 1, 3
7439 HADAMARD 1, amax, 0, 2, 1, 3
7440 HADDW m0, m2
7441
7442 psubd m6, m0, m5
7443
7444 lea r4, [3 * r3]
7445 movd m0, [r2]
7446 movd m1, [r2 + r3]
7447 movd m2, [r2 + r3 * 2]
7448 movd m3, [r2 + r4]
7449 shufps m0, m1, 0
7450 shufps m2, m3, 0
7451 mova m4, [hmul_4p]
7452 pmaddubsw m0, m4
7453 pmaddubsw m2, m4
7454
7455 paddw m5, m0, m2
7456 movhlps m4, m5
7457 paddw m5, m4
7458 pmaddwd m5, [pw_1]
7459 psrld m5, 2
7460
7461 HADAMARD 0, sumsub, 0, 2, 1, 3
7462 HADAMARD 4, sumsub, 0, 2, 1, 3
7463 HADAMARD 1, amax, 0, 2, 1, 3
7464 HADDW m0, m2
7465
7466 psubd m0, m5
7467
7468 psubd m6, m0
7469 pabsd m0, m6
7470 movd eax, m0
7471 %endif ; HIGH_BIT_DEPTH
7472 RET
7473
7474 %if ARCH_X86_64
7475 INIT_XMM sse4
7476 cglobal psyCost_pp_8x8, 4, 6, 13
7477
7478 %if HIGH_BIT_DEPTH
7479 FIX_STRIDES r1, r3
7480 lea r4, [3 * r1]
7481 pxor m10, m10
7482 movu m0, [r0]
7483 movu m1, [r0 + r1]
7484 movu m2, [r0 + r1 * 2]
7485 movu m3, [r0 + r4]
7486 lea r5, [r0 + r1 * 4]
7487 movu m4, [r5]
7488 movu m5, [r5 + r1]
7489 movu m6, [r5 + r1 * 2]
7490 movu m7, [r5 + r4]
7491
7492 paddw m8, m0, m1
7493 paddw m8, m2
7494 paddw m8, m3
7495 paddw m8, m4
7496 paddw m8, m5
7497 paddw m8, m6
7498 paddw m8, m7
7499 pmaddwd m8, [pw_1]
7500 movhlps m9, m8
7501 paddd m8, m9
7502 psrldq m9, m8, 4
7503 paddd m8, m9
7504 psrld m8, 2
7505
7506 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
7507
7508 paddd m0, m1
7509 paddd m0, m2
7510 paddd m0, m3
7511 HADDUW m0, m1
7512 paddd m0, [pd_1]
7513 psrld m0, 1
7514 psubd m10, m0, m8
7515
7516 lea r4, [3 * r3]
7517 movu m0, [r2]
7518 movu m1, [r2 + r3]
7519 movu m2, [r2 + r3 * 2]
7520 movu m3, [r2 + r4]
7521 lea r5, [r2 + r3 * 4]
7522 movu m4, [r5]
7523 movu m5, [r5 + r3]
7524 movu m6, [r5 + r3 * 2]
7525 movu m7, [r5 + r4]
7526
7527 paddw m8, m0, m1
7528 paddw m8, m2
7529 paddw m8, m3
7530 paddw m8, m4
7531 paddw m8, m5
7532 paddw m8, m6
7533 paddw m8, m7
7534 pmaddwd m8, [pw_1]
7535 movhlps m9, m8
7536 paddd m8, m9
7537 psrldq m9, m8, 4
7538 paddd m8, m9
7539 psrld m8, 2
7540
7541 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
7542
7543 paddd m0, m1
7544 paddd m0, m2
7545 paddd m0, m3
7546 HADDUW m0, m1
7547 paddd m0, [pd_1]
7548 psrld m0, 1
7549 psubd m0, m8
7550 psubd m10, m0
7551 pabsd m0, m10
7552 movd eax, m0
7553 %else ; !HIGH_BIT_DEPTH
7554 lea r4, [3 * r1]
7555 mova m8, [hmul_8p]
7556
7557 movddup m0, [r0]
7558 movddup m1, [r0 + r1]
7559 movddup m2, [r0 + r1 * 2]
7560 movddup m3, [r0 + r4]
7561 lea r5, [r0 + r1 * 4]
7562 movddup m4, [r5]
7563 movddup m5, [r5 + r1]
7564 movddup m6, [r5 + r1 * 2]
7565 movddup m7, [r5 + r4]
7566
7567 pmaddubsw m0, m8
7568 pmaddubsw m1, m8
7569 pmaddubsw m2, m8
7570 pmaddubsw m3, m8
7571 pmaddubsw m4, m8
7572 pmaddubsw m5, m8
7573 pmaddubsw m6, m8
7574 pmaddubsw m7, m8
7575
7576 paddw m11, m0, m1
7577 paddw m11, m2
7578 paddw m11, m3
7579 paddw m11, m4
7580 paddw m11, m5
7581 paddw m11, m6
7582 paddw m11, m7
7583
7584 pmaddwd m11, [pw_1]
7585 psrldq m10, m11, 4
7586 paddd m11, m10
7587 psrld m11, 2
7588
7589 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
7590
7591 paddw m0, m1
7592 paddw m0, m2
7593 paddw m0, m3
7594 HADDW m0, m1
7595
7596 paddd m0, [pd_1]
7597 psrld m0, 1
7598 psubd m12, m0, m11
7599
7600 lea r4, [3 * r3]
7601
7602 movddup m0, [r2]
7603 movddup m1, [r2 + r3]
7604 movddup m2, [r2 + r3 * 2]
7605 movddup m3, [r2 + r4]
7606 lea r5, [r2 + r3 * 4]
7607 movddup m4, [r5]
7608 movddup m5, [r5 + r3]
7609 movddup m6, [r5 + r3 * 2]
7610 movddup m7, [r5 + r4]
7611
7612 pmaddubsw m0, m8
7613 pmaddubsw m1, m8
7614 pmaddubsw m2, m8
7615 pmaddubsw m3, m8
7616 pmaddubsw m4, m8
7617 pmaddubsw m5, m8
7618 pmaddubsw m6, m8
7619 pmaddubsw m7, m8
7620
7621 paddw m11, m0, m1
7622 paddw m11, m2
7623 paddw m11, m3
7624 paddw m11, m4
7625 paddw m11, m5
7626 paddw m11, m6
7627 paddw m11, m7
7628
7629 pmaddwd m11, [pw_1]
7630 psrldq m10, m11, 4
7631 paddd m11, m10
7632 psrld m11, 2
7633
7634 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
7635
7636 paddw m0, m1
7637 paddw m0, m2
7638 paddw m0, m3
7639 HADDW m0, m1
7640
7641 paddd m0, [pd_1]
7642 psrld m0, 1
7643 psubd m0, m11
7644 psubd m12, m0
7645 pabsd m0, m12
7646 movd eax, m0
7647 %endif ; HIGH_BIT_DEPTH
7648 RET
7649 %endif
7650
7651 %if ARCH_X86_64
7652 %if HIGH_BIT_DEPTH
7653 INIT_XMM sse4
7654 cglobal psyCost_pp_16x16, 4, 9, 14
7655
7656 FIX_STRIDES r1, r3
7657 lea r4, [3 * r1]
7658 lea r8, [3 * r3]
7659 mova m12, [pw_1]
7660 mova m13, [pd_1]
7661 pxor m11, m11
7662 mov r7d, 2
7663 .loopH:
7664 mov r6d, 2
7665 .loopW:
7666 pxor m10, m10
7667 movu m0, [r0]
7668 movu m1, [r0 + r1]
7669 movu m2, [r0 + r1 * 2]
7670 movu m3, [r0 + r4]
7671 lea r5, [r0 + r1 * 4]
7672 movu m4, [r5]
7673 movu m5, [r5 + r1]
7674 movu m6, [r5 + r1 * 2]
7675 movu m7, [r5 + r4]
7676
7677 paddw m8, m0, m1
7678 paddw m8, m2
7679 paddw m8, m3
7680 paddw m8, m4
7681 paddw m8, m5
7682 paddw m8, m6
7683 paddw m8, m7
7684 pmaddwd m8, m12
7685 movhlps m9, m8
7686 paddd m8, m9
7687 psrldq m9, m8, 4
7688 paddd m8, m9
7689 psrld m8, 2
7690
7691 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
7692
7693 paddd m0, m1
7694 paddd m0, m2
7695 paddd m0, m3
7696 HADDUW m0, m1
7697 paddd m0, m13
7698 psrld m0, 1
7699 psubd m10, m0, m8
7700
7701 movu m0, [r2]
7702 movu m1, [r2 + r3]
7703 movu m2, [r2 + r3 * 2]
7704 movu m3, [r2 + r8]
7705 lea r5, [r2 + r3 * 4]
7706 movu m4, [r5]
7707 movu m5, [r5 + r3]
7708 movu m6, [r5 + r3 * 2]
7709 movu m7, [r5 + r8]
7710
7711 paddw m8, m0, m1
7712 paddw m8, m2
7713 paddw m8, m3
7714 paddw m8, m4
7715 paddw m8, m5
7716 paddw m8, m6
7717 paddw m8, m7
7718 pmaddwd m8, m12
7719 movhlps m9, m8
7720 paddd m8, m9
7721 psrldq m9, m8, 4
7722 paddd m8, m9
7723 psrld m8, 2
7724
7725 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
7726
7727 paddd m0, m1
7728 paddd m0, m2
7729 paddd m0, m3
7730 HADDUW m0, m1
7731 paddd m0, m13
7732 psrld m0, 1
7733 psubd m0, m8
7734 psubd m10, m0
7735 pabsd m0, m10
7736 paddd m11, m0
7737 add r0, 16
7738 add r2, 16
7739 dec r6d
7740 jnz .loopW
7741 lea r0, [r0 + r1 * 8 - 32]
7742 lea r2, [r2 + r3 * 8 - 32]
7743 dec r7d
7744 jnz .loopH
7745 movd eax, m11
7746 RET
7747 %else ; !HIGH_BIT_DEPTH
7748 INIT_XMM sse4
7749 cglobal psyCost_pp_16x16, 4, 9, 15
7750 lea r4, [3 * r1]
7751 lea r8, [3 * r3]
7752 mova m8, [hmul_8p]
7753 mova m10, [pw_1]
7754 mova m14, [pd_1]
7755 pxor m13, m13
7756 mov r7d, 2
7757 .loopH:
7758 mov r6d, 2
7759 .loopW:
7760 pxor m12, m12
7761 movddup m0, [r0]
7762 movddup m1, [r0 + r1]
7763 movddup m2, [r0 + r1 * 2]
7764 movddup m3, [r0 + r4]
7765 lea r5, [r0 + r1 * 4]
7766 movddup m4, [r5]
7767 movddup m5, [r5 + r1]
7768 movddup m6, [r5 + r1 * 2]
7769 movddup m7, [r5 + r4]
7770
7771 pmaddubsw m0, m8
7772 pmaddubsw m1, m8
7773 pmaddubsw m2, m8
7774 pmaddubsw m3, m8
7775 pmaddubsw m4, m8
7776 pmaddubsw m5, m8
7777 pmaddubsw m6, m8
7778 pmaddubsw m7, m8
7779
7780 paddw m11, m0, m1
7781 paddw m11, m2
7782 paddw m11, m3
7783 paddw m11, m4
7784 paddw m11, m5
7785 paddw m11, m6
7786 paddw m11, m7
7787
7788 pmaddwd m11, m10
7789 psrldq m9, m11, 4
7790 paddd m11, m9
7791 psrld m11, 2
7792
7793 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
7794
7795 paddw m0, m1
7796 paddw m0, m2
7797 paddw m0, m3
7798 HADDW m0, m1
7799
7800 paddd m0, m14
7801 psrld m0, 1
7802 psubd m12, m0, m11
7803
7804 movddup m0, [r2]
7805 movddup m1, [r2 + r3]
7806 movddup m2, [r2 + r3 * 2]
7807 movddup m3, [r2 + r8]
7808 lea r5, [r2 + r3 * 4]
7809 movddup m4, [r5]
7810 movddup m5, [r5 + r3]
7811 movddup m6, [r5 + r3 * 2]
7812 movddup m7, [r5 + r8]
7813
7814 pmaddubsw m0, m8
7815 pmaddubsw m1, m8
7816 pmaddubsw m2, m8
7817 pmaddubsw m3, m8
7818 pmaddubsw m4, m8
7819 pmaddubsw m5, m8
7820 pmaddubsw m6, m8
7821 pmaddubsw m7, m8
7822
7823 paddw m11, m0, m1
7824 paddw m11, m2
7825 paddw m11, m3
7826 paddw m11, m4
7827 paddw m11, m5
7828 paddw m11, m6
7829 paddw m11, m7
7830
7831 pmaddwd m11, m10
7832 psrldq m9, m11, 4
7833 paddd m11, m9
7834 psrld m11, 2
7835
7836 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
7837
7838 paddw m0, m1
7839 paddw m0, m2
7840 paddw m0, m3
7841 HADDW m0, m1
7842
7843 paddd m0, m14
7844 psrld m0, 1
7845 psubd m0, m11
7846 psubd m12, m0
7847 pabsd m0, m12
7848 paddd m13, m0
7849 add r0, 8
7850 add r2, 8
7851 dec r6d
7852 jnz .loopW
7853 lea r0, [r0 + r1 * 8 - 16]
7854 lea r2, [r2 + r3 * 8 - 16]
7855 dec r7d
7856 jnz .loopH
7857 movd eax, m13
7858 RET
7859 %endif ; HIGH_BIT_DEPTH
7860 %endif
7861
7862 %if ARCH_X86_64
7863 %if HIGH_BIT_DEPTH
7864 INIT_XMM sse4
7865 cglobal psyCost_pp_32x32, 4, 9, 14
7866
7867 FIX_STRIDES r1, r3
7868 lea r4, [3 * r1]
7869 lea r8, [3 * r3]
7870 mova m12, [pw_1]
7871 mova m13, [pd_1]
7872 pxor m11, m11
7873 mov r7d, 4
7874 .loopH:
7875 mov r6d, 4
7876 .loopW:
7877 pxor m10, m10
7878 movu m0, [r0]
7879 movu m1, [r0 + r1]
7880 movu m2, [r0 + r1 * 2]
7881 movu m3, [r0 + r4]
7882 lea r5, [r0 + r1 * 4]
7883 movu m4, [r5]
7884 movu m5, [r5 + r1]
7885 movu m6, [r5 + r1 * 2]
7886 movu m7, [r5 + r4]
7887
7888 paddw m8, m0, m1
7889 paddw m8, m2
7890 paddw m8, m3
7891 paddw m8, m4
7892 paddw m8, m5
7893 paddw m8, m6
7894 paddw m8, m7
7895 pmaddwd m8, m12
7896 movhlps m9, m8
7897 paddd m8, m9
7898 psrldq m9, m8, 4
7899 paddd m8, m9
7900 psrld m8, 2
7901
7902 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
7903
7904 paddd m0, m1
7905 paddd m0, m2
7906 paddd m0, m3
7907 HADDUW m0, m1
7908 paddd m0, m13
7909 psrld m0, 1
7910 psubd m10, m0, m8
7911
7912 movu m0, [r2]
7913 movu m1, [r2 + r3]
7914 movu m2, [r2 + r3 * 2]
7915 movu m3, [r2 + r8]
7916 lea r5, [r2 + r3 * 4]
7917 movu m4, [r5]
7918 movu m5, [r5 + r3]
7919 movu m6, [r5 + r3 * 2]
7920 movu m7, [r5 + r8]
7921
7922 paddw m8, m0, m1
7923 paddw m8, m2
7924 paddw m8, m3
7925 paddw m8, m4
7926 paddw m8, m5
7927 paddw m8, m6
7928 paddw m8, m7
7929 pmaddwd m8, m12
7930 movhlps m9, m8
7931 paddd m8, m9
7932 psrldq m9, m8, 4
7933 paddd m8, m9
7934 psrld m8, 2
7935
7936 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
7937
7938 paddd m0, m1
7939 paddd m0, m2
7940 paddd m0, m3
7941 HADDUW m0, m1
7942 paddd m0, m13
7943 psrld m0, 1
7944 psubd m0, m8
7945 psubd m10, m0
7946 pabsd m0, m10
7947 paddd m11, m0
7948 add r0, 16
7949 add r2, 16
7950 dec r6d
7951 jnz .loopW
7952 lea r0, [r0 + r1 * 8 - 64]
7953 lea r2, [r2 + r3 * 8 - 64]
7954 dec r7d
7955 jnz .loopH
7956 movd eax, m11
7957 RET
7958
7959 %else ; !HIGH_BIT_DEPTH
7960 INIT_XMM sse4
7961 cglobal psyCost_pp_32x32, 4, 9, 15
7962
7963 lea r4, [3 * r1]
7964 lea r8, [3 * r3]
7965 mova m8, [hmul_8p]
7966 mova m10, [pw_1]
7967 mova m14, [pd_1]
7968 pxor m13, m13
7969 mov r7d, 4
7970 .loopH:
7971 mov r6d, 4
7972 .loopW:
7973 pxor m12, m12
7974 movddup m0, [r0]
7975 movddup m1, [r0 + r1]
7976 movddup m2, [r0 + r1 * 2]
7977 movddup m3, [r0 + r4]
7978 lea r5, [r0 + r1 * 4]
7979 movddup m4, [r5]
7980 movddup m5, [r5 + r1]
7981 movddup m6, [r5 + r1 * 2]
7982 movddup m7, [r5 + r4]
7983
7984 pmaddubsw m0, m8
7985 pmaddubsw m1, m8
7986 pmaddubsw m2, m8
7987 pmaddubsw m3, m8
7988 pmaddubsw m4, m8
7989 pmaddubsw m5, m8
7990 pmaddubsw m6, m8
7991 pmaddubsw m7, m8
7992
7993 paddw m11, m0, m1
7994 paddw m11, m2
7995 paddw m11, m3
7996 paddw m11, m4
7997 paddw m11, m5
7998 paddw m11, m6
7999 paddw m11, m7
8000
8001 pmaddwd m11, m10
8002 psrldq m9, m11, 4
8003 paddd m11, m9
8004 psrld m11, 2
8005
8006 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
8007
8008 paddw m0, m1
8009 paddw m0, m2
8010 paddw m0, m3
8011 HADDW m0, m1
8012
8013 paddd m0, m14
8014 psrld m0, 1
8015 psubd m12, m0, m11
8016
8017 movddup m0, [r2]
8018 movddup m1, [r2 + r3]
8019 movddup m2, [r2 + r3 * 2]
8020 movddup m3, [r2 + r8]
8021 lea r5, [r2 + r3 * 4]
8022 movddup m4, [r5]
8023 movddup m5, [r5 + r3]
8024 movddup m6, [r5 + r3 * 2]
8025 movddup m7, [r5 + r8]
8026
8027 pmaddubsw m0, m8
8028 pmaddubsw m1, m8
8029 pmaddubsw m2, m8
8030 pmaddubsw m3, m8
8031 pmaddubsw m4, m8
8032 pmaddubsw m5, m8
8033 pmaddubsw m6, m8
8034 pmaddubsw m7, m8
8035
8036 paddw m11, m0, m1
8037 paddw m11, m2
8038 paddw m11, m3
8039 paddw m11, m4
8040 paddw m11, m5
8041 paddw m11, m6
8042 paddw m11, m7
8043
8044 pmaddwd m11, m10
8045 psrldq m9, m11, 4
8046 paddd m11, m9
8047 psrld m11, 2
8048
8049 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
8050
8051 paddw m0, m1
8052 paddw m0, m2
8053 paddw m0, m3
8054 HADDW m0, m1
8055
8056 paddd m0, m14
8057 psrld m0, 1
8058 psubd m0, m11
8059 psubd m12, m0
8060 pabsd m0, m12
8061 paddd m13, m0
8062 add r0, 8
8063 add r2, 8
8064 dec r6d
8065 jnz .loopW
8066 lea r0, [r0 + r1 * 8 - 32]
8067 lea r2, [r2 + r3 * 8 - 32]
8068 dec r7d
8069 jnz .loopH
8070 movd eax, m13
8071 RET
8072 %endif ; HIGH_BIT_DEPTH
8073 %endif
8074
8075 %if ARCH_X86_64
8076 %if HIGH_BIT_DEPTH
8077 INIT_XMM sse4
8078 cglobal psyCost_pp_64x64, 4, 9, 14
8079
8080 FIX_STRIDES r1, r3
8081 lea r4, [3 * r1]
8082 lea r8, [3 * r3]
8083 mova m12, [pw_1]
8084 mova m13, [pd_1]
8085 pxor m11, m11
8086 mov r7d, 8
8087 .loopH:
8088 mov r6d, 8
8089 .loopW:
8090 pxor m10, m10
8091 movu m0, [r0]
8092 movu m1, [r0 + r1]
8093 movu m2, [r0 + r1 * 2]
8094 movu m3, [r0 + r4]
8095 lea r5, [r0 + r1 * 4]
8096 movu m4, [r5]
8097 movu m5, [r5 + r1]
8098 movu m6, [r5 + r1 * 2]
8099 movu m7, [r5 + r4]
8100
8101 paddw m8, m0, m1
8102 paddw m8, m2
8103 paddw m8, m3
8104 paddw m8, m4
8105 paddw m8, m5
8106 paddw m8, m6
8107 paddw m8, m7
8108 pmaddwd m8, m12
8109 movhlps m9, m8
8110 paddd m8, m9
8111 psrldq m9, m8, 4
8112 paddd m8, m9
8113 psrld m8, 2
8114
8115 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
8116
8117 paddd m0, m1
8118 paddd m0, m2
8119 paddd m0, m3
8120 HADDUW m0, m1
8121 paddd m0, m13
8122 psrld m0, 1
8123 psubd m10, m0, m8
8124
8125 movu m0, [r2]
8126 movu m1, [r2 + r3]
8127 movu m2, [r2 + r3 * 2]
8128 movu m3, [r2 + r8]
8129 lea r5, [r2 + r3 * 4]
8130 movu m4, [r5]
8131 movu m5, [r5 + r3]
8132 movu m6, [r5 + r3 * 2]
8133 movu m7, [r5 + r8]
8134
8135 paddw m8, m0, m1
8136 paddw m8, m2
8137 paddw m8, m3
8138 paddw m8, m4
8139 paddw m8, m5
8140 paddw m8, m6
8141 paddw m8, m7
8142 pmaddwd m8, m12
8143 movhlps m9, m8
8144 paddd m8, m9
8145 psrldq m9, m8, 4
8146 paddd m8, m9
8147 psrld m8, 2
8148
8149 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
8150
8151 paddd m0, m1
8152 paddd m0, m2
8153 paddd m0, m3
8154 HADDUW m0, m1
8155 paddd m0, m13
8156 psrld m0, 1
8157 psubd m0, m8
8158 psubd m10, m0
8159 pabsd m0, m10
8160 paddd m11, m0
8161 add r0, 16
8162 add r2, 16
8163 dec r6d
8164 jnz .loopW
8165 lea r0, [r0 + r1 * 8 - 128]
8166 lea r2, [r2 + r3 * 8 - 128]
8167 dec r7d
8168 jnz .loopH
8169 movd eax, m11
8170 RET
8171
8172 %else ; !HIGH_BIT_DEPTH
8173 INIT_XMM sse4
8174 cglobal psyCost_pp_64x64, 4, 9, 15
8175
8176 lea r4, [3 * r1]
8177 lea r8, [3 * r3]
8178 mova m8, [hmul_8p]
8179 mova m10, [pw_1]
8180 mova m14, [pd_1]
8181 pxor m13, m13
8182 mov r7d, 8
8183 .loopH:
8184 mov r6d, 8
8185 .loopW:
8186 pxor m12, m12
8187 movddup m0, [r0]
8188 movddup m1, [r0 + r1]
8189 movddup m2, [r0 + r1 * 2]
8190 movddup m3, [r0 + r4]
8191 lea r5, [r0 + r1 * 4]
8192 movddup m4, [r5]
8193 movddup m5, [r5 + r1]
8194 movddup m6, [r5 + r1 * 2]
8195 movddup m7, [r5 + r4]
8196
8197 pmaddubsw m0, m8
8198 pmaddubsw m1, m8
8199 pmaddubsw m2, m8
8200 pmaddubsw m3, m8
8201 pmaddubsw m4, m8
8202 pmaddubsw m5, m8
8203 pmaddubsw m6, m8
8204 pmaddubsw m7, m8
8205
8206 paddw m11, m0, m1
8207 paddw m11, m2
8208 paddw m11, m3
8209 paddw m11, m4
8210 paddw m11, m5
8211 paddw m11, m6
8212 paddw m11, m7
8213
8214 pmaddwd m11, m10
8215 psrldq m9, m11, 4
8216 paddd m11, m9
8217 psrld m11, 2
8218
8219 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
8220
8221 paddw m0, m1
8222 paddw m0, m2
8223 paddw m0, m3
8224 HADDW m0, m1
8225
8226 paddd m0, m14
8227 psrld m0, 1
8228 psubd m12, m0, m11
8229
8230 movddup m0, [r2]
8231 movddup m1, [r2 + r3]
8232 movddup m2, [r2 + r3 * 2]
8233 movddup m3, [r2 + r8]
8234 lea r5, [r2 + r3 * 4]
8235 movddup m4, [r5]
8236 movddup m5, [r5 + r3]
8237 movddup m6, [r5 + r3 * 2]
8238 movddup m7, [r5 + r8]
8239
8240 pmaddubsw m0, m8
8241 pmaddubsw m1, m8
8242 pmaddubsw m2, m8
8243 pmaddubsw m3, m8
8244 pmaddubsw m4, m8
8245 pmaddubsw m5, m8
8246 pmaddubsw m6, m8
8247 pmaddubsw m7, m8
8248
8249 paddw m11, m0, m1
8250 paddw m11, m2
8251 paddw m11, m3
8252 paddw m11, m4
8253 paddw m11, m5
8254 paddw m11, m6
8255 paddw m11, m7
8256
8257 pmaddwd m11, m10
8258 psrldq m9, m11, 4
8259 paddd m11, m9
8260 psrld m11, 2
8261
8262 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
8263
8264 paddw m0, m1
8265 paddw m0, m2
8266 paddw m0, m3
8267 HADDW m0, m1
8268
8269 paddd m0, m14
8270 psrld m0, 1
8271 psubd m0, m11
8272 psubd m12, m0
8273 pabsd m0, m12
8274 paddd m13, m0
8275 add r0, 8
8276 add r2, 8
8277 dec r6d
8278 jnz .loopW
8279 lea r0, [r0 + r1 * 8 - 64]
8280 lea r2, [r2 + r3 * 8 - 64]
8281 dec r7d
8282 jnz .loopH
8283 movd eax, m13
8284 RET
8285 %endif ; HIGH_BIT_DEPTH
8286 %endif
8287
8288 INIT_YMM avx2
8289 %if HIGH_BIT_DEPTH
8290 cglobal psyCost_pp_4x4, 4, 5, 6
8291 add r1d, r1d
8292 add r3d, r3d
8293 lea r4, [r1 * 3]
8294 movddup xm0, [r0]
8295 movddup xm1, [r0 + r1]
8296 movddup xm2, [r0 + r1 * 2]
8297 movddup xm3, [r0 + r4]
8298
8299 lea r4, [r3 * 3]
8300 movddup xm4, [r2]
8301 movddup xm5, [r2 + r3]
8302 vinserti128 m0, m0, xm4, 1
8303 vinserti128 m1, m1, xm5, 1
8304 movddup xm4, [r2 + r3 * 2]
8305 movddup xm5, [r2 + r4]
8306 vinserti128 m2, m2, xm4, 1
8307 vinserti128 m3, m3, xm5, 1
8308
8309 mova m4, [hmul_8w]
8310 pmaddwd m0, m4
8311 pmaddwd m1, m4
8312 pmaddwd m2, m4
8313 pmaddwd m3, m4
8314 paddd m5, m0, m1
8315 paddd m4, m2, m3
8316 paddd m5, m4
8317 psrldq m4, m5, 4
8318 paddd m5, m4
8319 psrld m5, 2
8320
8321 mova m4, m0
8322 paddd m0, m1
8323 psubd m1, m4
8324 mova m4, m2
8325 paddd m2, m3
8326 psubd m3, m4
8327 mova m4, m0
8328 paddd m0, m2
8329 psubd m2, m4
8330 mova m4, m1
8331 paddd m1, m3
8332 psubd m3, m4
8333 movaps m4, m0
8334 vshufps m4, m4, m2, 11011101b
8335 vshufps m0, m0, m2, 10001000b
8336 movaps m2, m1
8337 vshufps m2, m2, m3, 11011101b
8338 vshufps m1, m1, m3, 10001000b
8339 pabsd m0, m0
8340 pabsd m4, m4
8341 pmaxsd m0, m4
8342 pabsd m1, m1
8343 pabsd m2, m2
8344 pmaxsd m1, m2
8345 paddd m0, m1
8346
8347 vpermq m1, m0, 11110101b
8348 paddd m0, m1
8349 psrldq m1, m0, 4
8350 paddd m0, m1
8351 psubd m0, m5
8352
8353 vextracti128 xm1, m0, 1
8354 psubd xm1, xm0
8355 pabsd xm1, xm1
8356 movd eax, xm1
8357 RET
8358 %else ; !HIGH_BIT_DEPTH
8359 cglobal psyCost_pp_4x4, 4, 5, 6
8360 lea r4, [3 * r1]
8361 movd xm0, [r0]
8362 movd xm1, [r0 + r1]
8363 movd xm2, [r0 + r1 * 2]
8364 movd xm3, [r0 + r4]
8365 vshufps xm0, xm1, 0
8366 vshufps xm2, xm3, 0
8367
8368 lea r4, [3 * r3]
8369 movd xm1, [r2]
8370 movd xm3, [r2 + r3]
8371 movd xm4, [r2 + r3 * 2]
8372 movd xm5, [r2 + r4]
8373 vshufps xm1, xm3, 0
8374 vshufps xm4, xm5, 0
8375
8376 vinserti128 m0, m0, xm1, 1
8377 vinserti128 m2, m2, xm4, 1
8378
8379 mova m4, [hmul_4p]
8380 pmaddubsw m0, m4
8381 pmaddubsw m2, m4
8382
8383 paddw m5, m0, m2
8384 mova m1, m5
8385 psrldq m4, m5, 8
8386 paddw m5, m4
8387 pmaddwd m5, [pw_1]
8388 psrld m5, 2
8389
8390 vpsubw m2, m2, m0
8391 vpunpckhqdq m0, m1, m2
8392 vpunpcklqdq m1, m1, m2
8393 vpaddw m2, m1, m0
8394 vpsubw m0, m0, m1
8395 vpblendw m1, m2, m0, 10101010b
8396 vpslld m0, m0, 10h
8397 vpsrld m2, m2, 10h
8398 vpor m0, m0, m2
8399 vpabsw m1, m1
8400 vpabsw m0, m0
8401 vpmaxsw m1, m1, m0
8402 vpmaddwd m1, m1, [pw_1]
8403 psrldq m2, m1, 8
8404 paddd m1, m2
8405 psrldq m3, m1, 4
8406 paddd m1, m3
8407 psubd m1, m5
8408 vextracti128 xm2, m1, 1
8409 psubd m1, m2
8410 pabsd m1, m1
8411 movd eax, xm1
8412 RET
8413 %endif
8414
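; PSY_PP_8x8 evaluates the 8x8 psy-cost kernel for one source/recon pair, with
; the source block packed in the low 128-bit lane and the recon block in the
; high lane of each ymm register so both energies are computed in parallel;
; the absolute difference ends up in m0.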
8415 %macro PSY_PP_8x8 0
8416 movddup m0, [r0 + r1 * 0]
8417 movddup m1, [r0 + r1 * 1]
8418 movddup m2, [r0 + r1 * 2]
8419 movddup m3, [r0 + r4 * 1]
8420
8421 lea r5, [r0 + r1 * 4]
8422
8423 movddup m4, [r2 + r3 * 0]
8424 movddup m5, [r2 + r3 * 1]
8425 movddup m6, [r2 + r3 * 2]
8426 movddup m7, [r2 + r7 * 1]
8427
8428 lea r6, [r2 + r3 * 4]
8429
8430 vinserti128 m0, m0, xm4, 1
8431 vinserti128 m1, m1, xm5, 1
8432 vinserti128 m2, m2, xm6, 1
8433 vinserti128 m3, m3, xm7, 1
8434
8435 movddup m4, [r5 + r1 * 0]
8436 movddup m5, [r5 + r1 * 1]
8437 movddup m6, [r5 + r1 * 2]
8438 movddup m7, [r5 + r4 * 1]
8439
8440 movddup m9, [r6 + r3 * 0]
8441 movddup m10, [r6 + r3 * 1]
8442 movddup m11, [r6 + r3 * 2]
8443 movddup m12, [r6 + r7 * 1]
8444
8445 vinserti128 m4, m4, xm9, 1
8446 vinserti128 m5, m5, xm10, 1
8447 vinserti128 m6, m6, xm11, 1
8448 vinserti128 m7, m7, xm12, 1
8449
8450 pmaddubsw m0, m8
8451 pmaddubsw m1, m8
8452 pmaddubsw m2, m8
8453 pmaddubsw m3, m8
8454 pmaddubsw m4, m8
8455 pmaddubsw m5, m8
8456 pmaddubsw m6, m8
8457 pmaddubsw m7, m8
8458
8459 paddw m11, m0, m1
8460 paddw m11, m2
8461 paddw m11, m3
8462 paddw m11, m4
8463 paddw m11, m5
8464 paddw m11, m6
8465 paddw m11, m7
8466
8467 pmaddwd m11, [pw_1]
8468 psrldq m10, m11, 4
8469 paddd m11, m10
8470 psrld m11, 2
8471
8472 mova m9, m0
8473 paddw m0, m1 ; m0+m1
8474 psubw m1, m9 ; m1-m0
8475 mova m9, m2
8476 paddw m2, m3 ; m2+m3
8477 psubw m3, m9 ; m3-m2
8478 mova m9, m0
8479 paddw m0, m2 ; m0+m1+m2+m3
8480 psubw m2, m9 ; (m2+m3)-(m0+m1)
8481 mova m9, m1
8482 paddw m1, m3 ; m1-m0+m3-m2
8483 psubw m3, m9 ; (m3-m2)-(m1-m0)
8484
8485 movdqa m9, m4
8486 paddw m4, m5 ; m4+m5
8487 psubw m5, m9 ; m5-m4
8488 movdqa m9, m6
8489 paddw m6, m7 ; m6+m7
8490 psubw m7, m9 ; m7-m6
8491 movdqa m9, m4
8492 paddw m4, m6 ; m4+m5+m6+m7
8493 psubw m6, m9 ; (m6+m7)-(m4+m5)
8494 movdqa m9, m5
8495 paddw m5, m7 ; m5-m4+m7-m6
8496 psubw m7, m9 ; (m7-m6)-(m5-m4)
8497
8498 movdqa m9, m0
8499 paddw m0, m4 ; (m0+m1+m2+m3)+(m4+m5+m6+m7)
8500 psubw m4, m9 ; (m4+m5+m6+m7)-(m0+m1+m2+m3)
8501 movdqa m9, m1
8502 paddw m1, m5 ; (m1-m0+m3-m2)+(m5-m4+m7-m6)
8503 psubw m5, m9 ; (m5-m4+m7-m6)-(m1-m0+m3-m2)
8504
8505 mova m9, m0
8506 vshufps m9, m9, m4, 11011101b
8507 vshufps m0, m0, m4, 10001000b
8508
8509 movdqa m4, m0
8510 paddw m0, m9 ; (a0 + a4) + (a4 - a0)
8511 psubw m9, m4 ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4)
8512
8513 movaps m4, m1
8514 vshufps m4, m4, m5, 11011101b
8515 vshufps m1, m1, m5, 10001000b
8516
8517 movdqa m5, m1
8518 paddw m1, m4
8519 psubw m4, m5
8520 movdqa m5, m2
8521 paddw m2, m6
8522 psubw m6, m5
8523 movdqa m5, m3
8524 paddw m3, m7
8525 psubw m7, m5
8526
8527 movaps m5, m2
8528 vshufps m5, m5, m6, 11011101b
8529 vshufps m2, m2, m6, 10001000b
8530
8531 movdqa m6, m2
8532 paddw m2, m5
8533 psubw m5, m6
8534 movaps m6, m3
8535
8536 vshufps m6, m6, m7, 11011101b
8537 vshufps m3, m3, m7, 10001000b
8538
8539 movdqa m7, m3
8540 paddw m3, m6
8541 psubw m6, m7
8542 movdqa m7, m0
8543
8544 pblendw m0, m9, 10101010b
8545 pslld m9, 10h
8546 psrld m7, 10h
8547 por m9, m7
8548 pabsw m0, m0
8549 pabsw m9, m9
8550 pmaxsw m0, m9
8551 movdqa m7, m1
8552 pblendw m1, m4, 10101010b
8553 pslld m4, 10h
8554 psrld m7, 10h
8555 por m4, m7
8556 pabsw m1, m1
8557 pabsw m4, m4
8558 pmaxsw m1, m4
8559 movdqa m7, m2
8560 pblendw m2, m5, 10101010b
8561 pslld m5, 10h
8562 psrld m7, 10h
8563 por m5, m7
8564 pabsw m2, m2
8565 pabsw m5, m5
8566 pmaxsw m2, m5
8567 mova m7, m3
8568
8569 pblendw m3, m6, 10101010b
8570 pslld m6, 10h
8571 psrld m7, 10h
8572 por m6, m7
8573 pabsw m3, m3
8574 pabsw m6, m6
8575 pmaxsw m3, m6
8576 paddw m0, m1
8577 paddw m0, m2
8578 paddw m0, m3
8579 pmaddwd m0, [pw_1]
8580 psrldq m1, m0, 8
8581 paddd m0, m1
8582
8583 pshuflw m1, m0, 00001110b
8584 paddd m0, m1
8585 paddd m0, [pd_1]
8586 psrld m0, 1
8587
8588 psubd m0, m11
8589
8590 vextracti128 xm1, m0, 1
8591 psubd m0, m1
8592 pabsd m0, m0
8593 %endmacro
8594
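; PSY_PP_8x8_AVX2 is the HIGH_BIT_DEPTH counterpart of PSY_PP_8x8: it loads
; 16-bit samples, uses the same low-lane/high-lane source/recon layout, and
; leaves the absolute energy difference in xm1.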
8595 %macro PSY_PP_8x8_AVX2 0
8596 lea r4, [r1 * 3]
8597 movu xm0, [r0]
8598 movu xm1, [r0 + r1]
8599 movu xm2, [r0 + r1 * 2]
8600 movu xm3, [r0 + r4]
8601 lea r5, [r0 + r1 * 4]
8602 movu xm4, [r5]
8603 movu xm5, [r5 + r1]
8604 movu xm6, [r5 + r1 * 2]
8605 movu xm7, [r5 + r4]
8606
8607 lea r4, [r3 * 3]
8608 vinserti128 m0, m0, [r2], 1
8609 vinserti128 m1, m1, [r2 + r3], 1
8610 vinserti128 m2, m2, [r2 + r3 * 2], 1
8611 vinserti128 m3, m3, [r2 + r4], 1
8612 lea r5, [r2 + r3 * 4]
8613 vinserti128 m4, m4, [r5], 1
8614 vinserti128 m5, m5, [r5 + r3], 1
8615 vinserti128 m6, m6, [r5 + r3 * 2], 1
8616 vinserti128 m7, m7, [r5 + r4], 1
8617
8618 paddw m8, m0, m1
8619 paddw m8, m2
8620 paddw m8, m3
8621 paddw m8, m4
8622 paddw m8, m5
8623 paddw m8, m6
8624 paddw m8, m7
8625 pmaddwd m8, [pw_1]
8626
8627 psrldq m9, m8, 8
8628 paddd m8, m9
8629 psrldq m9, m8, 4
8630 paddd m8, m9
8631 psrld m8, 2
8632
8633 psubw m9, m1, m0
8634 paddw m0, m1
8635 psubw m1, m3, m2
8636 paddw m2, m3
8637 punpckhwd m3, m0, m9
8638 punpcklwd m0, m9
8639 psubw m9, m3, m0
8640 paddw m0, m3
8641 punpckhwd m3, m2, m1
8642 punpcklwd m2, m1
8643 psubw m10, m3, m2
8644 paddw m2, m3
8645 psubw m3, m5, m4
8646 paddw m4, m5
8647 psubw m5, m7, m6
8648 paddw m6, m7
8649 punpckhwd m1, m4, m3
8650 punpcklwd m4, m3
8651 psubw m7, m1, m4
8652 paddw m4, m1
8653 punpckhwd m3, m6, m5
8654 punpcklwd m6, m5
8655 psubw m1, m3, m6
8656 paddw m6, m3
8657 psubw m3, m2, m0
8658 paddw m0, m2
8659 psubw m2, m10, m9
8660 paddw m9, m10
8661 punpckhdq m5, m0, m3
8662 punpckldq m0, m3
8663 psubw m10, m5, m0
8664 paddw m0, m5
8665 punpckhdq m3, m9, m2
8666 punpckldq m9, m2
8667 psubw m5, m3, m9
8668 paddw m9, m3
8669 psubw m3, m6, m4
8670 paddw m4, m6
8671 psubw m6, m1, m7
8672 paddw m7, m1
8673 punpckhdq m2, m4, m3
8674 punpckldq m4, m3
8675 psubw m1, m2, m4
8676 paddw m4, m2
8677 punpckhdq m3, m7, m6
8678 punpckldq m7, m6
8679 psubw m2, m3, m7
8680 paddw m7, m3
8681 psubw m3, m4, m0
8682 paddw m0, m4
8683 psubw m4, m1, m10
8684 paddw m10, m1
8685 punpckhqdq m6, m0, m3
8686 punpcklqdq m0, m3
8687 pabsw m0, m0
8688 pabsw m6, m6
8689 pmaxsw m0, m6
8690 punpckhqdq m3, m10, m4
8691 punpcklqdq m10, m4
8692 pabsw m10, m10
8693 pabsw m3, m3
8694 pmaxsw m10, m3
8695 psubw m3, m7, m9
8696 paddw m9, m7
8697 psubw m7, m2, m5
8698 paddw m5, m2
8699 punpckhqdq m4, m9, m3
8700 punpcklqdq m9, m3
8701 pabsw m9, m9
8702 pabsw m4, m4
8703 pmaxsw m9, m4
8704 punpckhqdq m3, m5, m7
8705 punpcklqdq m5, m7
8706 pabsw m5, m5
8707 pabsw m3, m3
8708 pmaxsw m5, m3
8709 paddd m0, m9
8710 paddd m0, m10
8711 paddd m0, m5
8712 psrld m9, m0, 16
8713 pslld m0, 16
8714 psrld m0, 16
8715 paddd m0, m9
8716 psrldq m9, m0, 8
8717 paddd m0, m9
8718 psrldq m9, m0, 4
8719 paddd m0, m9
8720 paddd m0, [pd_1]
8721 psrld m0, 1
8722 psubd m0, m8
8723
8724 vextracti128 xm1, m0, 1
8725 psubd xm1, xm0
8726 pabsd xm1, xm1
8727 %endmacro
8728
8729 %if ARCH_X86_64
8730 %if HIGH_BIT_DEPTH
8731 cglobal psyCost_pp_8x8, 4, 8, 11
8732 add r1d, r1d
8733 add r3d, r3d
8734 PSY_PP_8x8_AVX2
8735 movd eax, xm1
8736 RET
8737 %else ; !HIGH_BIT_DEPTH
8738 INIT_YMM avx2
8739 cglobal psyCost_pp_8x8, 4, 8, 13
8740 lea r4, [3 * r1]
8741 lea r7, [3 * r3]
8742 mova m8, [hmul_8p]
8743
8744 PSY_PP_8x8
8745
8746 movd eax, xm0
8747 RET
8748 %endif
8749 %endif
8750 %if ARCH_X86_64
8751 INIT_YMM avx2
8752 %if HIGH_BIT_DEPTH
8753 cglobal psyCost_pp_16x16, 4, 10, 12
8754 add r1d, r1d
8755 add r3d, r3d
8756 pxor m11, m11
8757
8758 mov r8d, 2
8759 .loopH:
8760 mov r9d, 2
8761 .loopW:
8762 PSY_PP_8x8_AVX2
8763
8764 paddd xm11, xm1
8765 add r0, 16
8766 add r2, 16
8767 dec r9d
8768 jnz .loopW
8769 lea r0, [r0 + r1 * 8 - 32]
8770 lea r2, [r2 + r3 * 8 - 32]
8771 dec r8d
8772 jnz .loopH
8773 movd eax, xm11
8774 RET
8775 %else ; !HIGH_BIT_DEPTH
8776 cglobal psyCost_pp_16x16, 4, 10, 14
8777 lea r4, [3 * r1]
8778 lea r7, [3 * r3]
8779 mova m8, [hmul_8p]
8780 pxor m13, m13
8781
8782 mov r8d, 2
8783 .loopH:
8784 mov r9d, 2
8785 .loopW:
8786 PSY_PP_8x8
8787
8788 paddd m13, m0
8789 add r0, 8
8790 add r2, 8
8791 dec r9d
8792 jnz .loopW
8793 lea r0, [r0 + r1 * 8 - 16]
8794 lea r2, [r2 + r3 * 8 - 16]
8795 dec r8d
8796 jnz .loopH
8797 movd eax, xm13
8798 RET
8799 %endif
8800 %endif
8801 %if ARCH_X86_64
8802 INIT_YMM avx2
8803 %if HIGH_BIT_DEPTH
8804 cglobal psyCost_pp_32x32, 4, 10, 12
8805 add r1d, r1d
8806 add r3d, r3d
8807 pxor m11, m11
8808
8809 mov r8d, 4
8810 .loopH:
8811 mov r9d, 4
8812 .loopW:
8813 PSY_PP_8x8_AVX2
8814
8815 paddd xm11, xm1
8816 add r0, 16
8817 add r2, 16
8818 dec r9d
8819 jnz .loopW
8820 lea r0, [r0 + r1 * 8 - 64]
8821 lea r2, [r2 + r3 * 8 - 64]
8822 dec r8d
8823 jnz .loopH
8824 movd eax, xm11
8825 RET
8826 %else ; !HIGH_BIT_DEPTH
8827 cglobal psyCost_pp_32x32, 4, 10, 14
8828 lea r4, [3 * r1]
8829 lea r7, [3 * r3]
8830 mova m8, [hmul_8p]
8831 pxor m13, m13
8832
8833 mov r8d, 4
8834 .loopH:
8835 mov r9d, 4
8836 .loopW:
8837 PSY_PP_8x8
8838
8839 paddd m13, m0
8840 add r0, 8
8841 add r2, 8
8842 dec r9d
8843 jnz .loopW
8844 lea r0, [r0 + r1 * 8 - 32]
8845 lea r2, [r2 + r3 * 8 - 32]
8846 dec r8d
8847 jnz .loopH
8848 movd eax, xm13
8849 RET
8850 %endif
8851 %endif
8852 %if ARCH_X86_64
8853 INIT_YMM avx2
8854 %if HIGH_BIT_DEPTH
8855 cglobal psyCost_pp_64x64, 4, 10, 12
8856 add r1d, r1d
8857 add r3d, r3d
8858 pxor m11, m11
8859
8860 mov r8d, 8
8861 .loopH:
8862 mov r9d, 8
8863 .loopW:
8864 PSY_PP_8x8_AVX2
8865
8866 paddd xm11, xm1
8867 add r0, 16
8868 add r2, 16
8869 dec r9d
8870 jnz .loopW
8871 lea r0, [r0 + r1 * 8 - 128]
8872 lea r2, [r2 + r3 * 8 - 128]
8873 dec r8d
8874 jnz .loopH
8875 movd eax, xm11
8876 RET
8877 %else ; !HIGH_BIT_DEPTH
8878 cglobal psyCost_pp_64x64, 4, 10, 14
8879 lea r4, [3 * r1]
8880 lea r7, [3 * r3]
8881 mova m8, [hmul_8p]
8882 pxor m13, m13
8883
8884 mov r8d, 8
8885 .loopH:
8886 mov r9d, 8
8887 .loopW:
8888 PSY_PP_8x8
8889
8890 paddd m13, m0
8891 add r0, 8
8892 add r2, 8
8893 dec r9d
8894 jnz .loopW
8895 lea r0, [r0 + r1 * 8 - 64]
8896 lea r2, [r2 + r3 * 8 - 64]
8897 dec r8d
8898 jnz .loopH
8899 movd eax, xm13
8900 RET
8901 %endif
8902 %endif
8903
8904 ;---------------------------------------------------------------------------------------------------------------------
8905 ;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
8906 ;---------------------------------------------------------------------------------------------------------------------
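; psyCost_ss applies the same energy-difference idea as psyCost_pp, but to
; 16-bit residual blocks; note the rounding offsets (e.g. pd_2 below) differ
; slightly from the pixel-domain version.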
8907 INIT_XMM sse4
8908 cglobal psyCost_ss_4x4, 4, 5, 8
8909
8910 add r1, r1
8911 lea r4, [3 * r1]
8912 movddup m0, [r0]
8913 movddup m1, [r0 + r1]
8914 movddup m2, [r0 + r1 * 2]
8915 movddup m3, [r0 + r4]
8916
8917 pabsw m4, m0
8918 pabsw m5, m1
8919 paddw m5, m4
8920 pabsw m4, m2
8921 paddw m5, m4
8922 pabsw m4, m3
8923 paddw m5, m4
8924 pmaddwd m5, [pw_1]
8925 psrldq m4, m5, 4
8926 paddd m5, m4
8927 psrld m6, m5, 2
8928
8929 mova m4, [hmul_8w]
8930 pmaddwd m0, m4
8931 pmaddwd m1, m4
8932 pmaddwd m2, m4
8933 pmaddwd m3, m4
8934
8935 psrldq m4, m0, 4
8936 psubd m5, m0, m4
8937 paddd m0, m4
8938 shufps m0, m5, 10001000b
8939
8940 psrldq m4, m1, 4
8941 psubd m5, m1, m4
8942 paddd m1, m4
8943 shufps m1, m5, 10001000b
8944
8945 psrldq m4, m2, 4
8946 psubd m5, m2, m4
8947 paddd m2, m4
8948 shufps m2, m5, 10001000b
8949
8950 psrldq m4, m3, 4
8951 psubd m5, m3, m4
8952 paddd m3, m4
8953 shufps m3, m5, 10001000b
8954
8955 mova m4, m0
8956 paddd m0, m1
8957 psubd m1, m4
8958 mova m4, m2
8959 paddd m2, m3
8960 psubd m3, m4
8961 mova m4, m0
8962 paddd m0, m2
8963 psubd m2, m4
8964 mova m4, m1
8965 paddd m1, m3
8966 psubd m3, m4
8967
8968 pabsd m0, m0
8969 pabsd m2, m2
8970 pabsd m1, m1
8971 pabsd m3, m3
8972 paddd m0, m2
8973 paddd m1, m3
8974 paddd m0, m1
8975 movhlps m1, m0
8976 paddd m0, m1
8977 psrldq m1, m0, 4
8978 paddd m0, m1
8979 psrld m0, 1
8980 psubd m7, m0, m6
8981
8982 add r3, r3
8983 lea r4, [3 * r3]
8984 movddup m0, [r2]
8985 movddup m1, [r2 + r3]
8986 movddup m2, [r2 + r3 * 2]
8987 movddup m3, [r2 + r4]
8988
8989 pabsw m4, m0
8990 pabsw m5, m1
8991 paddw m5, m4
8992 pabsw m4, m2
8993 paddw m5, m4
8994 pabsw m4, m3
8995 paddw m5, m4
8996 pmaddwd m5, [pw_1]
8997 psrldq m4, m5, 4
8998 paddd m5, m4
8999 psrld m6, m5, 2
9000
9001 mova m4, [hmul_8w]
9002 pmaddwd m0, m4
9003 pmaddwd m1, m4
9004 pmaddwd m2, m4
9005 pmaddwd m3, m4
9006
9007 psrldq m4, m0, 4
9008 psubd m5, m0, m4
9009 paddd m0, m4
9010 shufps m0, m5, 10001000b
9011
9012 psrldq m4, m1, 4
9013 psubd m5, m1, m4
9014 paddd m1, m4
9015 shufps m1, m5, 10001000b
9016
9017 psrldq m4, m2, 4
9018 psubd m5, m2, m4
9019 paddd m2, m4
9020 shufps m2, m5, 10001000b
9021
9022 psrldq m4, m3, 4
9023 psubd m5, m3, m4
9024 paddd m3, m4
9025 shufps m3, m5, 10001000b
9026
9027 mova m4, m0
9028 paddd m0, m1
9029 psubd m1, m4
9030 mova m4, m2
9031 paddd m2, m3
9032 psubd m3, m4
9033 mova m4, m0
9034 paddd m0, m2
9035 psubd m2, m4
9036 mova m4, m1
9037 paddd m1, m3
9038 psubd m3, m4
9039
9040 pabsd m0, m0
9041 pabsd m2, m2
9042 pabsd m1, m1
9043 pabsd m3, m3
9044 paddd m0, m2
9045 paddd m1, m3
9046 paddd m0, m1
9047 movhlps m1, m0
9048 paddd m0, m1
9049 psrldq m1, m0, 4
9050 paddd m0, m1
9051 psrld m0, 1
9052 psubd m0, m6
9053 psubd m7, m0
9054 pabsd m0, m7
9055 movd eax, m0
9056 RET
9057
9058 %if ARCH_X86_64
9059 INIT_XMM sse4
9060 cglobal psyCost_ss_8x8, 4, 6, 15
9061
9062 mova m13, [pw_pmpmpmpm]
9063 mova m14, [pw_1]
9064 add r1, r1
9065 add r3, r3
9066 lea r4, [3 * r1]
9067 movu m0, [r0]
9068 movu m1, [r0 + r1]
9069 movu m2, [r0 + r1 * 2]
9070 movu m3, [r0 + r4]
9071 lea r5, [r0 + r1 * 4]
9072 movu m4, [r5]
9073 movu m5, [r5 + r1]
9074 movu m6, [r5 + r1 * 2]
9075 movu m7, [r5 + r4]
9076
9077 pabsw m8, m0
9078 pabsw m9, m1
9079 paddw m8, m9
9080 pabsw m10, m2
9081 pabsw m11, m3
9082 paddw m10, m11
9083 paddw m8, m10
9084 pabsw m9, m4
9085 pabsw m10, m5
9086 paddw m9, m10
9087 pabsw m11, m6
9088 pabsw m12, m7
9089 paddw m11, m12
9090 paddw m9, m11
9091 paddw m8, m9
9092 movhlps m9, m8
9093 pmovzxwd m8, m8
9094 pmovzxwd m9, m9
9095 paddd m8, m9
9096 movhlps m9, m8
9097 paddd m8, m9
9098 psrldq m9, m8, 4
9099 paddd m8, m9
9100 psrld m8, 2
9101
9102 pmaddwd m0, m13
9103 pmaddwd m1, m13
9104 pmaddwd m2, m13
9105 pmaddwd m3, m13
9106
9107 psrldq m9, m0, 4
9108 psubd m10, m0, m9
9109 paddd m0, m9
9110 shufps m0, m10, 10001000b
9111 psrldq m9, m0, 4
9112 psubd m10, m0, m9
9113 paddd m0, m9
9114 shufps m0, m10, 10001000b
9115
9116 psrldq m9, m1, 4
9117 psubd m10, m1, m9
9118 paddd m1, m9
9119 shufps m1, m10, 10001000b
9120 psrldq m9, m1, 4
9121 psubd m10, m1, m9
9122 paddd m1, m9
9123 shufps m1, m10, 10001000b
9124
9125 psrldq m9, m2, 4
9126 psubd m10, m2, m9
9127 paddd m2, m9
9128 shufps m2, m10, 10001000b
9129 psrldq m9, m2, 4
9130 psubd m10, m2, m9
9131 paddd m2, m9
9132 shufps m2, m10, 10001000b
9133
9134 psrldq m9, m3, 4
9135 psubd m10, m3, m9
9136 paddd m3, m9
9137 shufps m3, m10, 10001000b
9138 psrldq m9, m3, 4
9139 psubd m10, m3, m9
9140 paddd m3, m9
9141 shufps m3, m10, 10001000b
9142
9143 SUMSUB_BA d, 0, 1, 9
9144 SUMSUB_BA d, 2, 3, 9
9145 SUMSUB_BA d, 0, 2, 9
9146 SUMSUB_BA d, 1, 3, 9
9147
9148 pmaddwd m4, m13
9149 pmaddwd m5, m13
9150 pmaddwd m6, m13
9151 pmaddwd m7, m13
9152
9153 psrldq m9, m4, 4
9154 psubd m10, m4, m9
9155 paddd m4, m9
9156 shufps m4, m10, 10001000b
9157 psrldq m9, m4, 4
9158 psubd m10, m4, m9
9159 paddd m4, m9
9160 shufps m4, m10, 10001000b
9161
9162 psrldq m9, m5, 4
9163 psubd m10, m5, m9
9164 paddd m5, m9
9165 shufps m5, m10, 10001000b
9166 psrldq m9, m5, 4
9167 psubd m10, m5, m9
9168 paddd m5, m9
9169 shufps m5, m10, 10001000b
9170
9171 psrldq m9, m6, 4
9172 psubd m10, m6, m9
9173 paddd m6, m9
9174 shufps m6, m10, 10001000b
9175 psrldq m9, m6, 4
9176 psubd m10, m6, m9
9177 paddd m6, m9
9178 shufps m6, m10, 10001000b
9179
9180 psrldq m9, m7, 4
9181 psubd m10, m7, m9
9182 paddd m7, m9
9183 shufps m7, m10, 10001000b
9184 psrldq m9, m7, 4
9185 psubd m10, m7, m9
9186 paddd m7, m9
9187 shufps m7, m10, 10001000b
9188
9189 SUMSUB_BA d, 4, 5, 9
9190 SUMSUB_BA d, 6, 7, 9
9191 SUMSUB_BA d, 4, 6, 9
9192 SUMSUB_BA d, 5, 7, 9
9193
9194 SUMSUB_BA d, 0, 4, 9
9195 SUMSUB_BA d, 1, 5, 9
9196 SUMSUB_BA d, 2, 6, 9
9197 SUMSUB_BA d, 3, 7, 9
9198
9199 pabsd m0, m0
9200 pabsd m2, m2
9201 pabsd m1, m1
9202 pabsd m3, m3
9203 pabsd m4, m4
9204 pabsd m5, m5
9205 pabsd m6, m6
9206 pabsd m7, m7
9207
9208 paddd m0, m2
9209 paddd m1, m3
9210 paddd m0, m1
9211 paddd m5, m4
9212 paddd m0, m5
9213 paddd m7, m6
9214 paddd m11, m0, m7
9215
9216 movu m0, [r0]
9217 movu m1, [r0 + r1]
9218 movu m2, [r0 + r1 * 2]
9219 movu m3, [r0 + r4]
9220
9221 pmaddwd m0, m14
9222 pmaddwd m1, m14
9223 pmaddwd m2, m14
9224 pmaddwd m3, m14
9225
9226 psrldq m9, m0, 4
9227 psubd m10, m0, m9
9228 paddd m0, m9
9229 shufps m0, m10, 10001000b
9230 psrldq m9, m0, 4
9231 psubd m10, m0, m9
9232 paddd m0, m9
9233 shufps m0, m10, 10001000b
9234
9235 psrldq m9, m1, 4
9236 psubd m10, m1, m9
9237 paddd m1, m9
9238 shufps m1, m10, 10001000b
9239 psrldq m9, m1, 4
9240 psubd m10, m1, m9
9241 paddd m1, m9
9242 shufps m1, m10, 10001000b
9243
9244 psrldq m9, m2, 4
9245 psubd m10, m2, m9
9246 paddd m2, m9
9247 shufps m2, m10, 10001000b
9248 psrldq m9, m2, 4
9249 psubd m10, m2, m9
9250 paddd m2, m9
9251 shufps m2, m10, 10001000b
9252
9253 psrldq m9, m3, 4
9254 psubd m10, m3, m9
9255 paddd m3, m9
9256 shufps m3, m10, 10001000b
9257 psrldq m9, m3, 4
9258 psubd m10, m3, m9
9259 paddd m3, m9
9260 shufps m3, m10, 10001000b
9261
9262 SUMSUB_BA d, 0, 1, 9
9263 SUMSUB_BA d, 2, 3, 9
9264 SUMSUB_BA d, 0, 2, 9
9265 SUMSUB_BA d, 1, 3, 9
9266
9267 movu m4, [r5]
9268 movu m5, [r5 + r1]
9269 movu m6, [r5 + r1 * 2]
9270 movu m7, [r5 + r4]
9271
9272 pmaddwd m4, m14
9273 pmaddwd m5, m14
9274 pmaddwd m6, m14
9275 pmaddwd m7, m14
9276
9277 psrldq m9, m4, 4
9278 psubd m10, m4, m9
9279 paddd m4, m9
9280 shufps m4, m10, 10001000b
9281 psrldq m9, m4, 4
9282 psubd m10, m4, m9
9283 paddd m4, m9
9284 shufps m4, m10, 10001000b
9285
9286 psrldq m9, m5, 4
9287 psubd m10, m5, m9
9288 paddd m5, m9
9289 shufps m5, m10, 10001000b
9290 psrldq m9, m5, 4
9291 psubd m10, m5, m9
9292 paddd m5, m9
9293 shufps m5, m10, 10001000b
9294
9295 psrldq m9, m6, 4
9296 psubd m10, m6, m9
9297 paddd m6, m9
9298 shufps m6, m10, 10001000b
9299 psrldq m9, m6, 4
9300 psubd m10, m6, m9
9301 paddd m6, m9
9302 shufps m6, m10, 10001000b
9303
9304 psrldq m9, m7, 4
9305 psubd m10, m7, m9
9306 paddd m7, m9
9307 shufps m7, m10, 10001000b
9308 psrldq m9, m7, 4
9309 psubd m10, m7, m9
9310 paddd m7, m9
9311 shufps m7, m10, 10001000b
9312
9313 SUMSUB_BA d, 4, 5, 9
9314 SUMSUB_BA d, 6, 7, 9
9315 SUMSUB_BA d, 4, 6, 9
9316 SUMSUB_BA d, 5, 7, 9
9317
9318 SUMSUB_BA d, 0, 4, 9
9319 SUMSUB_BA d, 1, 5, 9
9320 SUMSUB_BA d, 2, 6, 9
9321 SUMSUB_BA d, 3, 7, 9
9322
9323 pabsd m0, m0
9324 pabsd m2, m2
9325 pabsd m1, m1
9326 pabsd m3, m3
9327 pabsd m4, m4
9328 pabsd m5, m5
9329 pabsd m6, m6
9330 pabsd m7, m7
9331
9332 paddd m0, m2
9333 paddd m1, m3
9334 paddd m0, m1
9335 paddd m5, m4
9336 paddd m0, m5
9337 paddd m7, m6
9338 paddd m0, m7
9339 paddd m0, m11
9340
9341 movhlps m1, m0
9342 paddd m0, m1
9343 psrldq m1, m0, 4
9344 paddd m0, m1
9345 paddd m0, [pd_2]
9346 psrld m0, 2
9347 psubd m12, m0, m8
9348
9349 lea r4, [3 * r3]
9350 movu m0, [r2]
9351 movu m1, [r2 + r3]
9352 movu m2, [r2 + r3 * 2]
9353 movu m3, [r2 + r4]
9354 lea r5, [r2 + r3 * 4]
9355 movu m4, [r5]
9356 movu m5, [r5 + r3]
9357 movu m6, [r5 + r3 * 2]
9358 movu m7, [r5 + r4]
9359
9360 pabsw m8, m0
9361 pabsw m9, m1
9362 paddw m8, m9
9363 pabsw m10, m2
9364 pabsw m11, m3
9365 paddw m10, m11
9366 paddw m8, m10
9367 pabsw m9, m4
9368 pabsw m10, m5
9369 paddw m9, m10
9370 pabsw m11, m6
9371 pabsw m10, m7
9372 paddw m11, m10
9373 paddw m9, m11
9374 paddw m8, m9
9375 movhlps m9, m8
9376 pmovzxwd m8, m8
9377 pmovzxwd m9, m9
9378 paddd m8, m9
9379 movhlps m9, m8
9380 paddd m8, m9
9381 psrldq m9, m8, 4
9382 paddd m8, m9
9383 psrld m8, 2
9384
9385 pmaddwd m0, m13
9386 pmaddwd m1, m13
9387 pmaddwd m2, m13
9388 pmaddwd m3, m13
9389
9390 psrldq m9, m0, 4
9391 psubd m10, m0, m9
9392 paddd m0, m9
9393 shufps m0, m10, 10001000b
9394 psrldq m9, m0, 4
9395 psubd m10, m0, m9
9396 paddd m0, m9
9397 shufps m0, m10, 10001000b
9398
9399 psrldq m9, m1, 4
9400 psubd m10, m1, m9
9401 paddd m1, m9
9402 shufps m1, m10, 10001000b
9403 psrldq m9, m1, 4
9404 psubd m10, m1, m9
9405 paddd m1, m9
9406 shufps m1, m10, 10001000b
9407
9408 psrldq m9, m2, 4
9409 psubd m10, m2, m9
9410 paddd m2, m9
9411 shufps m2, m10, 10001000b
9412 psrldq m9, m2, 4
9413 psubd m10, m2, m9
9414 paddd m2, m9
9415 shufps m2, m10, 10001000b
9416
9417 psrldq m9, m3, 4
9418 psubd m10, m3, m9
9419 paddd m3, m9
9420 shufps m3, m10, 10001000b
9421 psrldq m9, m3, 4
9422 psubd m10, m3, m9
9423 paddd m3, m9
9424 shufps m3, m10, 10001000b
9425
9426 SUMSUB_BA d, 0, 1, 9
9427 SUMSUB_BA d, 2, 3, 9
9428 SUMSUB_BA d, 0, 2, 9
9429 SUMSUB_BA d, 1, 3, 9
9430
9431 pmaddwd m4, m13
9432 pmaddwd m5, m13
9433 pmaddwd m6, m13
9434 pmaddwd m7, m13
9435
9436 psrldq m9, m4, 4
9437 psubd m10, m4, m9
9438 paddd m4, m9
9439 shufps m4, m10, 10001000b
9440 psrldq m9, m4, 4
9441 psubd m10, m4, m9
9442 paddd m4, m9
9443 shufps m4, m10, 10001000b
9444
9445 psrldq m9, m5, 4
9446 psubd m10, m5, m9
9447 paddd m5, m9
9448 shufps m5, m10, 10001000b
9449 psrldq m9, m5, 4
9450 psubd m10, m5, m9
9451 paddd m5, m9
9452 shufps m5, m10, 10001000b
9453
9454 psrldq m9, m6, 4
9455 psubd m10, m6, m9
9456 paddd m6, m9
9457 shufps m6, m10, 10001000b
9458 psrldq m9, m6, 4
9459 psubd m10, m6, m9
9460 paddd m6, m9
9461 shufps m6, m10, 10001000b
9462
9463 psrldq m9, m7, 4
9464 psubd m10, m7, m9
9465 paddd m7, m9
9466 shufps m7, m10, 10001000b
9467 psrldq m9, m7, 4
9468 psubd m10, m7, m9
9469 paddd m7, m9
9470 shufps m7, m10, 10001000b
9471
9472 SUMSUB_BA d, 4, 5, 9
9473 SUMSUB_BA d, 6, 7, 9
9474 SUMSUB_BA d, 4, 6, 9
9475 SUMSUB_BA d, 5, 7, 9
9476
9477 SUMSUB_BA d, 0, 4, 9
9478 SUMSUB_BA d, 1, 5, 9
9479 SUMSUB_BA d, 2, 6, 9
9480 SUMSUB_BA d, 3, 7, 9
9481
9482 pabsd m0, m0
9483 pabsd m2, m2
9484 pabsd m1, m1
9485 pabsd m3, m3
9486 pabsd m4, m4
9487 pabsd m5, m5
9488 pabsd m6, m6
9489 pabsd m7, m7
9490
9491 paddd m0, m2
9492 paddd m1, m3
9493 paddd m0, m1
9494 paddd m5, m4
9495 paddd m0, m5
9496 paddd m7, m6
9497 paddd m11, m0, m7
9498
9499 movu m0, [r2]
9500 movu m1, [r2 + r3]
9501 movu m2, [r2 + r3 * 2]
9502 movu m3, [r2 + r4]
9503
9504 pmaddwd m0, m14
9505 pmaddwd m1, m14
9506 pmaddwd m2, m14
9507 pmaddwd m3, m14
9508
9509 psrldq m9, m0, 4
9510 psubd m10, m0, m9
9511 paddd m0, m9
9512 shufps m0, m10, 10001000b
9513 psrldq m9, m0, 4
9514 psubd m10, m0, m9
9515 paddd m0, m9
9516 shufps m0, m10, 10001000b
9517
9518 psrldq m9, m1, 4
9519 psubd m10, m1, m9
9520 paddd m1, m9
9521 shufps m1, m10, 10001000b
9522 psrldq m9, m1, 4
9523 psubd m10, m1, m9
9524 paddd m1, m9
9525 shufps m1, m10, 10001000b
9526
9527 psrldq m9, m2, 4
9528 psubd m10, m2, m9
9529 paddd m2, m9
9530 shufps m2, m10, 10001000b
9531 psrldq m9, m2, 4
9532 psubd m10, m2, m9
9533 paddd m2, m9
9534 shufps m2, m10, 10001000b
9535
9536 psrldq m9, m3, 4
9537 psubd m10, m3, m9
9538 paddd m3, m9
9539 shufps m3, m10, 10001000b
9540 psrldq m9, m3, 4
9541 psubd m10, m3, m9
9542 paddd m3, m9
9543 shufps m3, m10, 10001000b
9544
9545 SUMSUB_BA d, 0, 1, 9
9546 SUMSUB_BA d, 2, 3, 9
9547 SUMSUB_BA d, 0, 2, 9
9548 SUMSUB_BA d, 1, 3, 9
9549
9550 movu m4, [r5]
9551 movu m5, [r5 + r3]
9552 movu m6, [r5 + r3 * 2]
9553 movu m7, [r5 + r4]
9554
9555 pmaddwd m4, m14
9556 pmaddwd m5, m14
9557 pmaddwd m6, m14
9558 pmaddwd m7, m14
9559
9560 psrldq m9, m4, 4
9561 psubd m10, m4, m9
9562 paddd m4, m9
9563 shufps m4, m10, 10001000b
9564 psrldq m9, m4, 4
9565 psubd m10, m4, m9
9566 paddd m4, m9
9567 shufps m4, m10, 10001000b
9568
9569 psrldq m9, m5, 4
9570 psubd m10, m5, m9
9571 paddd m5, m9
9572 shufps m5, m10, 10001000b
9573 psrldq m9, m5, 4
9574 psubd m10, m5, m9
9575 paddd m5, m9
9576 shufps m5, m10, 10001000b
9577
9578 psrldq m9, m6, 4
9579 psubd m10, m6, m9
9580 paddd m6, m9
9581 shufps m6, m10, 10001000b
9582 psrldq m9, m6, 4
9583 psubd m10, m6, m9
9584 paddd m6, m9
9585 shufps m6, m10, 10001000b
9586
9587 psrldq m9, m7, 4
9588 psubd m10, m7, m9
9589 paddd m7, m9
9590 shufps m7, m10, 10001000b
9591 psrldq m9, m7, 4
9592 psubd m10, m7, m9
9593 paddd m7, m9
9594 shufps m7, m10, 10001000b
9595
9596 SUMSUB_BA d, 4, 5, 9
9597 SUMSUB_BA d, 6, 7, 9
9598 SUMSUB_BA d, 4, 6, 9
9599 SUMSUB_BA d, 5, 7, 9
9600
9601 SUMSUB_BA d, 0, 4, 9
9602 SUMSUB_BA d, 1, 5, 9
9603 SUMSUB_BA d, 2, 6, 9
9604 SUMSUB_BA d, 3, 7, 9
9605
9606 pabsd m0, m0
9607 pabsd m2, m2
9608 pabsd m1, m1
9609 pabsd m3, m3
9610 pabsd m4, m4
9611 pabsd m5, m5
9612 pabsd m6, m6
9613 pabsd m7, m7
9614
9615 paddd m0, m2
9616 paddd m1, m3
9617 paddd m0, m1
9618 paddd m5, m4
9619 paddd m0, m5
9620 paddd m7, m6
9621 paddd m0, m7
9622 paddd m0, m11
9623
9624 movhlps m1, m0
9625 paddd m0, m1
9626 psrldq m1, m0, 4
9627 paddd m0, m1
9628 paddd m0, [pd_2]
9629 psrld m0, 2
9630 psubd m0, m8
9631
9632 psubd m12, m0
9633 pabsd m0, m12
9634 movd eax, m0
9635 RET
9636 %endif
9637
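; psy_cost_ss: one 8x8 step used by the SSE4 psyCost_ss_NxN kernels below, working on
; int16 samples (source block at r0/r1, recon block at r2/r3, with r4 = 3*r1, r6 = 3*r3,
; m13 = [pw_pmpmpmpm], m14 = [pw_1] set up by the caller).  For each block it forms a
; SAD-style term (sum of absolute samples, >> 2) and an 8x8 Hadamard absolute sum
; (rounded, >> 2), takes their difference as the block energy, and adds
; |energy(source) - energy(recon)| to the accumulator in m15.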
9638 %macro psy_cost_ss 0
9639 movu m0, [r0]
9640 movu m1, [r0 + r1]
9641 movu m2, [r0 + r1 * 2]
9642 movu m3, [r0 + r4]
9643 lea r5, [r0 + r1 * 4]
9644 movu m4, [r5]
9645 movu m5, [r5 + r1]
9646 movu m6, [r5 + r1 * 2]
9647 movu m7, [r5 + r4]
9648
9649 pabsw m8, m0
9650 pabsw m9, m1
9651 paddw m8, m9
9652 pabsw m10, m2
9653 pabsw m11, m3
9654 paddw m10, m11
9655 paddw m8, m10
9656 pabsw m9, m4
9657 pabsw m10, m5
9658 paddw m9, m10
9659 pabsw m11, m6
9660 pabsw m12, m7
9661 paddw m11, m12
9662 paddw m9, m11
9663 paddw m8, m9
9664 movhlps m9, m8
9665 pmovzxwd m8, m8
9666 pmovzxwd m9, m9
9667 paddd m8, m9
9668 movhlps m9, m8
9669 paddd m8, m9
9670 psrldq m9, m8, 4
9671 paddd m8, m9
9672 psrld m8, 2
9673
9674 pmaddwd m0, m13
9675 pmaddwd m1, m13
9676 pmaddwd m2, m13
9677 pmaddwd m3, m13
9678
9679 psrldq m9, m0, 4
9680 psubd m10, m0, m9
9681 paddd m0, m9
9682 shufps m0, m10, 10001000b
9683 psrldq m9, m0, 4
9684 psubd m10, m0, m9
9685 paddd m0, m9
9686 shufps m0, m10, 10001000b
9687
9688 psrldq m9, m1, 4
9689 psubd m10, m1, m9
9690 paddd m1, m9
9691 shufps m1, m10, 10001000b
9692 psrldq m9, m1, 4
9693 psubd m10, m1, m9
9694 paddd m1, m9
9695 shufps m1, m10, 10001000b
9696
9697 psrldq m9, m2, 4
9698 psubd m10, m2, m9
9699 paddd m2, m9
9700 shufps m2, m10, 10001000b
9701 psrldq m9, m2, 4
9702 psubd m10, m2, m9
9703 paddd m2, m9
9704 shufps m2, m10, 10001000b
9705
9706 psrldq m9, m3, 4
9707 psubd m10, m3, m9
9708 paddd m3, m9
9709 shufps m3, m10, 10001000b
9710 psrldq m9, m3, 4
9711 psubd m10, m3, m9
9712 paddd m3, m9
9713 shufps m3, m10, 10001000b
9714
9715 SUMSUB_BA d, 0, 1, 9
9716 SUMSUB_BA d, 2, 3, 9
9717 SUMSUB_BA d, 0, 2, 9
9718 SUMSUB_BA d, 1, 3, 9
9719
9720 pmaddwd m4, m13
9721 pmaddwd m5, m13
9722 pmaddwd m6, m13
9723 pmaddwd m7, m13
9724
9725 psrldq m9, m4, 4
9726 psubd m10, m4, m9
9727 paddd m4, m9
9728 shufps m4, m10, 10001000b
9729 psrldq m9, m4, 4
9730 psubd m10, m4, m9
9731 paddd m4, m9
9732 shufps m4, m10, 10001000b
9733
9734 psrldq m9, m5, 4
9735 psubd m10, m5, m9
9736 paddd m5, m9
9737 shufps m5, m10, 10001000b
9738 psrldq m9, m5, 4
9739 psubd m10, m5, m9
9740 paddd m5, m9
9741 shufps m5, m10, 10001000b
9742
9743 psrldq m9, m6, 4
9744 psubd m10, m6, m9
9745 paddd m6, m9
9746 shufps m6, m10, 10001000b
9747 psrldq m9, m6, 4
9748 psubd m10, m6, m9
9749 paddd m6, m9
9750 shufps m6, m10, 10001000b
9751
9752 psrldq m9, m7, 4
9753 psubd m10, m7, m9
9754 paddd m7, m9
9755 shufps m7, m10, 10001000b
9756 psrldq m9, m7, 4
9757 psubd m10, m7, m9
9758 paddd m7, m9
9759 shufps m7, m10, 10001000b
9760
9761 SUMSUB_BA d, 4, 5, 9
9762 SUMSUB_BA d, 6, 7, 9
9763 SUMSUB_BA d, 4, 6, 9
9764 SUMSUB_BA d, 5, 7, 9
9765
9766 SUMSUB_BA d, 0, 4, 9
9767 SUMSUB_BA d, 1, 5, 9
9768 SUMSUB_BA d, 2, 6, 9
9769 SUMSUB_BA d, 3, 7, 9
9770
9771 pabsd m0, m0
9772 pabsd m2, m2
9773 pabsd m1, m1
9774 pabsd m3, m3
9775 pabsd m4, m4
9776 pabsd m5, m5
9777 pabsd m6, m6
9778 pabsd m7, m7
9779
9780 paddd m0, m2
9781 paddd m1, m3
9782 paddd m0, m1
9783 paddd m5, m4
9784 paddd m0, m5
9785 paddd m7, m6
9786 paddd m11, m0, m7
9787
9788 movu m0, [r0]
9789 movu m1, [r0 + r1]
9790 movu m2, [r0 + r1 * 2]
9791 movu m3, [r0 + r4]
9792
9793 pmaddwd m0, m14
9794 pmaddwd m1, m14
9795 pmaddwd m2, m14
9796 pmaddwd m3, m14
9797
9798 psrldq m9, m0, 4
9799 psubd m10, m0, m9
9800 paddd m0, m9
9801 shufps m0, m10, 10001000b
9802 psrldq m9, m0, 4
9803 psubd m10, m0, m9
9804 paddd m0, m9
9805 shufps m0, m10, 10001000b
9806
9807 psrldq m9, m1, 4
9808 psubd m10, m1, m9
9809 paddd m1, m9
9810 shufps m1, m10, 10001000b
9811 psrldq m9, m1, 4
9812 psubd m10, m1, m9
9813 paddd m1, m9
9814 shufps m1, m10, 10001000b
9815
9816 psrldq m9, m2, 4
9817 psubd m10, m2, m9
9818 paddd m2, m9
9819 shufps m2, m10, 10001000b
9820 psrldq m9, m2, 4
9821 psubd m10, m2, m9
9822 paddd m2, m9
9823 shufps m2, m10, 10001000b
9824
9825 psrldq m9, m3, 4
9826 psubd m10, m3, m9
9827 paddd m3, m9
9828 shufps m3, m10, 10001000b
9829 psrldq m9, m3, 4
9830 psubd m10, m3, m9
9831 paddd m3, m9
9832 shufps m3, m10, 10001000b
9833
9834 SUMSUB_BA d, 0, 1, 9
9835 SUMSUB_BA d, 2, 3, 9
9836 SUMSUB_BA d, 0, 2, 9
9837 SUMSUB_BA d, 1, 3, 9
9838
9839 movu m4, [r5]
9840 movu m5, [r5 + r1]
9841 movu m6, [r5 + r1 * 2]
9842 movu m7, [r5 + r4]
9843
9844 pmaddwd m4, m14
9845 pmaddwd m5, m14
9846 pmaddwd m6, m14
9847 pmaddwd m7, m14
9848
9849 psrldq m9, m4, 4
9850 psubd m10, m4, m9
9851 paddd m4, m9
9852 shufps m4, m10, 10001000b
9853 psrldq m9, m4, 4
9854 psubd m10, m4, m9
9855 paddd m4, m9
9856 shufps m4, m10, 10001000b
9857
9858 psrldq m9, m5, 4
9859 psubd m10, m5, m9
9860 paddd m5, m9
9861 shufps m5, m10, 10001000b
9862 psrldq m9, m5, 4
9863 psubd m10, m5, m9
9864 paddd m5, m9
9865 shufps m5, m10, 10001000b
9866
9867 psrldq m9, m6, 4
9868 psubd m10, m6, m9
9869 paddd m6, m9
9870 shufps m6, m10, 10001000b
9871 psrldq m9, m6, 4
9872 psubd m10, m6, m9
9873 paddd m6, m9
9874 shufps m6, m10, 10001000b
9875
9876 psrldq m9, m7, 4
9877 psubd m10, m7, m9
9878 paddd m7, m9
9879 shufps m7, m10, 10001000b
9880 psrldq m9, m7, 4
9881 psubd m10, m7, m9
9882 paddd m7, m9
9883 shufps m7, m10, 10001000b
9884
9885 SUMSUB_BA d, 4, 5, 9
9886 SUMSUB_BA d, 6, 7, 9
9887 SUMSUB_BA d, 4, 6, 9
9888 SUMSUB_BA d, 5, 7, 9
9889
9890 SUMSUB_BA d, 0, 4, 9
9891 SUMSUB_BA d, 1, 5, 9
9892 SUMSUB_BA d, 2, 6, 9
9893 SUMSUB_BA d, 3, 7, 9
9894
9895 pabsd m0, m0
9896 pabsd m2, m2
9897 pabsd m1, m1
9898 pabsd m3, m3
9899 pabsd m4, m4
9900 pabsd m5, m5
9901 pabsd m6, m6
9902 pabsd m7, m7
9903
9904 paddd m0, m2
9905 paddd m1, m3
9906 paddd m0, m1
9907 paddd m5, m4
9908 paddd m0, m5
9909 paddd m7, m6
9910 paddd m0, m7
9911 paddd m0, m11
9912
9913 movhlps m1, m0
9914 paddd m0, m1
9915 psrldq m1, m0, 4
9916 paddd m0, m1
9917 paddd m0, [pd_2]
9918 psrld m0, 2
9919 psubd m12, m0, m8
9920
9921 movu m0, [r2]
9922 movu m1, [r2 + r3]
9923 movu m2, [r2 + r3 * 2]
9924 movu m3, [r2 + r6]
9925 lea r5, [r2 + r3 * 4]
9926 movu m4, [r5]
9927 movu m5, [r5 + r3]
9928 movu m6, [r5 + r3 * 2]
9929 movu m7, [r5 + r6]
9930
9931 pabsw m8, m0
9932 pabsw m9, m1
9933 paddw m8, m9
9934 pabsw m10, m2
9935 pabsw m11, m3
9936 paddw m10, m11
9937 paddw m8, m10
9938 pabsw m9, m4
9939 pabsw m10, m5
9940 paddw m9, m10
9941 pabsw m11, m6
9942 pabsw m10, m7
9943 paddw m11, m10
9944 paddw m9, m11
9945 paddw m8, m9
9946 movhlps m9, m8
9947 pmovzxwd m8, m8
9948 pmovzxwd m9, m9
9949 paddd m8, m9
9950 movhlps m9, m8
9951 paddd m8, m9
9952 psrldq m9, m8, 4
9953 paddd m8, m9
9954 psrld m8, 2
9955
9956 pmaddwd m0, m13
9957 pmaddwd m1, m13
9958 pmaddwd m2, m13
9959 pmaddwd m3, m13
9960
9961 psrldq m9, m0, 4
9962 psubd m10, m0, m9
9963 paddd m0, m9
9964 shufps m0, m10, 10001000b
9965 psrldq m9, m0, 4
9966 psubd m10, m0, m9
9967 paddd m0, m9
9968 shufps m0, m10, 10001000b
9969
9970 psrldq m9, m1, 4
9971 psubd m10, m1, m9
9972 paddd m1, m9
9973 shufps m1, m10, 10001000b
9974 psrldq m9, m1, 4
9975 psubd m10, m1, m9
9976 paddd m1, m9
9977 shufps m1, m10, 10001000b
9978
9979 psrldq m9, m2, 4
9980 psubd m10, m2, m9
9981 paddd m2, m9
9982 shufps m2, m10, 10001000b
9983 psrldq m9, m2, 4
9984 psubd m10, m2, m9
9985 paddd m2, m9
9986 shufps m2, m10, 10001000b
9987
9988 psrldq m9, m3, 4
9989 psubd m10, m3, m9
9990 paddd m3, m9
9991 shufps m3, m10, 10001000b
9992 psrldq m9, m3, 4
9993 psubd m10, m3, m9
9994 paddd m3, m9
9995 shufps m3, m10, 10001000b
9996
9997 SUMSUB_BA d, 0, 1, 9
9998 SUMSUB_BA d, 2, 3, 9
9999 SUMSUB_BA d, 0, 2, 9
10000 SUMSUB_BA d, 1, 3, 9
10001
10002 pmaddwd m4, m13
10003 pmaddwd m5, m13
10004 pmaddwd m6, m13
10005 pmaddwd m7, m13
10006
10007 psrldq m9, m4, 4
10008 psubd m10, m4, m9
10009 paddd m4, m9
10010 shufps m4, m10, 10001000b
10011 psrldq m9, m4, 4
10012 psubd m10, m4, m9
10013 paddd m4, m9
10014 shufps m4, m10, 10001000b
10015
10016 psrldq m9, m5, 4
10017 psubd m10, m5, m9
10018 paddd m5, m9
10019 shufps m5, m10, 10001000b
10020 psrldq m9, m5, 4
10021 psubd m10, m5, m9
10022 paddd m5, m9
10023 shufps m5, m10, 10001000b
10024
10025 psrldq m9, m6, 4
10026 psubd m10, m6, m9
10027 paddd m6, m9
10028 shufps m6, m10, 10001000b
10029 psrldq m9, m6, 4
10030 psubd m10, m6, m9
10031 paddd m6, m9
10032 shufps m6, m10, 10001000b
10033
10034 psrldq m9, m7, 4
10035 psubd m10, m7, m9
10036 paddd m7, m9
10037 shufps m7, m10, 10001000b
10038 psrldq m9, m7, 4
10039 psubd m10, m7, m9
10040 paddd m7, m9
10041 shufps m7, m10, 10001000b
10042
10043 SUMSUB_BA d, 4, 5, 9
10044 SUMSUB_BA d, 6, 7, 9
10045 SUMSUB_BA d, 4, 6, 9
10046 SUMSUB_BA d, 5, 7, 9
10047
10048 SUMSUB_BA d, 0, 4, 9
10049 SUMSUB_BA d, 1, 5, 9
10050 SUMSUB_BA d, 2, 6, 9
10051 SUMSUB_BA d, 3, 7, 9
10052
10053 pabsd m0, m0
10054 pabsd m2, m2
10055 pabsd m1, m1
10056 pabsd m3, m3
10057 pabsd m4, m4
10058 pabsd m5, m5
10059 pabsd m6, m6
10060 pabsd m7, m7
10061
10062 paddd m0, m2
10063 paddd m1, m3
10064 paddd m0, m1
10065 paddd m5, m4
10066 paddd m0, m5
10067 paddd m7, m6
10068 paddd m11, m0, m7
10069
10070 movu m0, [r2]
10071 movu m1, [r2 + r3]
10072 movu m2, [r2 + r3 * 2]
10073 movu m3, [r2 + r6]
10074
10075 pmaddwd m0, m14
10076 pmaddwd m1, m14
10077 pmaddwd m2, m14
10078 pmaddwd m3, m14
10079
10080 psrldq m9, m0, 4
10081 psubd m10, m0, m9
10082 paddd m0, m9
10083 shufps m0, m10, 10001000b
10084 psrldq m9, m0, 4
10085 psubd m10, m0, m9
10086 paddd m0, m9
10087 shufps m0, m10, 10001000b
10088
10089 psrldq m9, m1, 4
10090 psubd m10, m1, m9
10091 paddd m1, m9
10092 shufps m1, m10, 10001000b
10093 psrldq m9, m1, 4
10094 psubd m10, m1, m9
10095 paddd m1, m9
10096 shufps m1, m10, 10001000b
10097
10098 psrldq m9, m2, 4
10099 psubd m10, m2, m9
10100 paddd m2, m9
10101 shufps m2, m10, 10001000b
10102 psrldq m9, m2, 4
10103 psubd m10, m2, m9
10104 paddd m2, m9
10105 shufps m2, m10, 10001000b
10106
10107 psrldq m9, m3, 4
10108 psubd m10, m3, m9
10109 paddd m3, m9
10110 shufps m3, m10, 10001000b
10111 psrldq m9, m3, 4
10112 psubd m10, m3, m9
10113 paddd m3, m9
10114 shufps m3, m10, 10001000b
10115
10116 SUMSUB_BA d, 0, 1, 9
10117 SUMSUB_BA d, 2, 3, 9
10118 SUMSUB_BA d, 0, 2, 9
10119 SUMSUB_BA d, 1, 3, 9
10120
10121 movu m4, [r5]
10122 movu m5, [r5 + r3]
10123 movu m6, [r5 + r3 * 2]
10124 movu m7, [r5 + r6]
10125
10126 pmaddwd m4, m14
10127 pmaddwd m5, m14
10128 pmaddwd m6, m14
10129 pmaddwd m7, m14
10130
10131 psrldq m9, m4, 4
10132 psubd m10, m4, m9
10133 paddd m4, m9
10134 shufps m4, m10, 10001000b
10135 psrldq m9, m4, 4
10136 psubd m10, m4, m9
10137 paddd m4, m9
10138 shufps m4, m10, 10001000b
10139
10140 psrldq m9, m5, 4
10141 psubd m10, m5, m9
10142 paddd m5, m9
10143 shufps m5, m10, 10001000b
10144 psrldq m9, m5, 4
10145 psubd m10, m5, m9
10146 paddd m5, m9
10147 shufps m5, m10, 10001000b
10148
10149 psrldq m9, m6, 4
10150 psubd m10, m6, m9
10151 paddd m6, m9
10152 shufps m6, m10, 10001000b
10153 psrldq m9, m6, 4
10154 psubd m10, m6, m9
10155 paddd m6, m9
10156 shufps m6, m10, 10001000b
10157
10158 psrldq m9, m7, 4
10159 psubd m10, m7, m9
10160 paddd m7, m9
10161 shufps m7, m10, 10001000b
10162 psrldq m9, m7, 4
10163 psubd m10, m7, m9
10164 paddd m7, m9
10165 shufps m7, m10, 10001000b
10166
10167 SUMSUB_BA d, 4, 5, 9
10168 SUMSUB_BA d, 6, 7, 9
10169 SUMSUB_BA d, 4, 6, 9
10170 SUMSUB_BA d, 5, 7, 9
10171
10172 SUMSUB_BA d, 0, 4, 9
10173 SUMSUB_BA d, 1, 5, 9
10174 SUMSUB_BA d, 2, 6, 9
10175 SUMSUB_BA d, 3, 7, 9
10176
10177 pabsd m0, m0
10178 pabsd m2, m2
10179 pabsd m1, m1
10180 pabsd m3, m3
10181 pabsd m4, m4
10182 pabsd m5, m5
10183 pabsd m6, m6
10184 pabsd m7, m7
10185
10186 paddd m0, m2
10187 paddd m1, m3
10188 paddd m0, m1
10189 paddd m5, m4
10190 paddd m0, m5
10191 paddd m7, m6
10192 paddd m0, m7
10193 paddd m0, m11
10194
10195 movhlps m1, m0
10196 paddd m0, m1
10197 psrldq m1, m0, 4
10198 paddd m0, m1
10199 paddd m0, [pd_2]
10200 psrld m0, 2
10201 psubd m0, m8
10202
10203 psubd m12, m0
10204 pabsd m0, m12
10205 paddd m15, m0
10206 %endmacro
10207
10208 %if ARCH_X86_64
10209 INIT_XMM sse4
10210 cglobal psyCost_ss_16x16, 4, 9, 16
10211
10212 mova m13, [pw_pmpmpmpm]
10213 mova m14, [pw_1]
10214 add r1, r1
10215 add r3, r3
10216 lea r4, [3 * r1]
10217 lea r6, [3 * r3]
10218 pxor m15, m15
10219 mov r7d, 2
10220 .loopH:
10221 mov r8d, 2
10222 .loopW:
10223 psy_cost_ss
10224 add r0, 16
10225 add r2, 16
10226 dec r8d
10227 jnz .loopW
10228 lea r0, [r0 + r1 * 8 - 32]
10229 lea r2, [r2 + r3 * 8 - 32]
10230 dec r7d
10231 jnz .loopH
10232 movd eax, m15
10233 RET
10234 %endif
10235
10236 %if ARCH_X86_64
10237 INIT_XMM sse4
10238 cglobal psyCost_ss_32x32, 4, 9, 16
10239
10240 mova m13, [pw_pmpmpmpm]
10241 mova m14, [pw_1]
10242 add r1, r1
10243 add r3, r3
10244 lea r4, [3 * r1]
10245 lea r6, [3 * r3]
10246 pxor m15, m15
10247 mov r7d, 4
10248 .loopH:
10249 mov r8d, 4
10250 .loopW:
10251 psy_cost_ss
10252 add r0, 16
10253 add r2, 16
10254 dec r8d
10255 jnz .loopW
10256 lea r0, [r0 + r1 * 8 - 64]
10257 lea r2, [r2 + r3 * 8 - 64]
10258 dec r7d
10259 jnz .loopH
10260 movd eax, m15
10261 RET
10262 %endif
10263
10264 %if ARCH_X86_64
10265 INIT_XMM sse4
10266 cglobal psyCost_ss_64x64, 4, 9, 16
10267
10268 mova m13, [pw_pmpmpmpm]
10269 mova m14, [pw_1]
10270 add r1, r1
10271 add r3, r3
10272 lea r4, [3 * r1]
10273 lea r6, [3 * r3]
10274 pxor m15, m15
10275 mov r7d, 8
10276 .loopH:
10277 mov r8d, 8
10278 .loopW:
10279 psy_cost_ss
10280 add r0, 16
10281 add r2, 16
10282 dec r8d
10283 jnz .loopW
10284 lea r0, [r0 + r1 * 8 - 128]
10285 lea r2, [r2 + r3 * 8 - 128]
10286 dec r7d
10287 jnz .loopH
10288 movd eax, m15
10289 RET
10290 %endif
10291
10292 INIT_YMM avx2
10293 cglobal psyCost_ss_4x4, 4, 5, 8
10294 add r1, r1
10295 add r3, r3
10296 lea r4, [3 * r1]
10297 movddup m0, [r0]
10298 movddup m1, [r0 + r1]
10299 movddup m2, [r0 + r1 * 2]
10300 movddup m3, [r0 + r4]
10301
10302 lea r4, [3 * r3]
10303 movddup m4, [r2]
10304 movddup m5, [r2 + r3]
10305 movddup m6, [r2 + r3 * 2]
10306 movddup m7, [r2 + r4]
10307
10308 vinserti128 m0, m0, xm4, 1
10309 vinserti128 m1, m1, xm5, 1
10310 vinserti128 m2, m2, xm6, 1
10311 vinserti128 m3, m3, xm7, 1
10312
10313 pabsw m4, m0
10314 pabsw m5, m1
10315 paddw m5, m4
10316 pabsw m4, m2
10317 paddw m5, m4
10318 pabsw m4, m3
10319 paddw m5, m4
10320 pmaddwd m5, [pw_1]
10321 psrldq m4, m5, 4
10322 paddd m5, m4
10323 psrld m6, m5, 2
10324
10325 mova m4, [hmul_8w]
10326 pmaddwd m0, m4
10327 pmaddwd m1, m4
10328 pmaddwd m2, m4
10329 pmaddwd m3, m4
10330
10331 psrldq m4, m0, 4
10332 psubd m5, m0, m4
10333 paddd m0, m4
10334 shufps m0, m0, m5, 10001000b
10335
10336 psrldq m4, m1, 4
10337 psubd m5, m1, m4
10338 paddd m1, m4
10339 shufps m1, m1, m5, 10001000b
10340
10341 psrldq m4, m2, 4
10342 psubd m5, m2, m4
10343 paddd m2, m4
10344 shufps m2, m2, m5, 10001000b
10345
10346 psrldq m4, m3, 4
10347 psubd m5, m3, m4
10348 paddd m3, m4
10349 shufps m3, m3, m5, 10001000b
10350
10351 mova m4, m0
10352 paddd m0, m1
10353 psubd m1, m4
10354 mova m4, m2
10355 paddd m2, m3
10356 psubd m3, m4
10357 mova m4, m0
10358 paddd m0, m2
10359 psubd m2, m4
10360 mova m4, m1
10361 paddd m1, m3
10362 psubd m3, m4
10363
10364 pabsd m0, m0
10365 pabsd m2, m2
10366 pabsd m1, m1
10367 pabsd m3, m3
10368 paddd m0, m2
10369 paddd m1, m3
10370 paddd m0, m1
10371 psrldq m1, m0, 8
10372 paddd m0, m1
10373 psrldq m1, m0, 4
10374 paddd m0, m1
10375 psrld m0, 1
10376 psubd m0, m6
10377 vextracti128 xm1, m0, 1
10378 psubd m0, m1
10379 pabsd m0, m0
10380 movd eax, xm0
10381 RET
10382
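; PSY_SS_8x8: AVX2 variant of the 8x8 psy-cost step above.  The source and recon 8x8
; blocks are packed into the low and high 128-bit lanes of each ymm register, so both
; block energies are computed in parallel; the final vextracti128/psubd/pabsd leaves the
; absolute energy difference in m0.  Expects m12 = [pw_1], m13 = [pw_pmpmpmpm] and
; 8*mmsize of scratch space at [rsp].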
10383 %macro PSY_SS_8x8 0
10384 lea r4, [3 * r1]
10385 lea r6, [r0 + r1 * 4]
10386 movu xm0, [r0]
10387 movu xm1, [r0 + r1]
10388 movu xm2, [r0 + r1 * 2]
10389 movu xm3, [r0 + r4]
10390 movu xm4, [r6]
10391 movu xm5, [r6 + r1]
10392 movu xm6, [r6 + r1 * 2]
10393 movu xm7, [r6 + r4]
10394
10395 lea r4, [3 * r3]
10396 lea r6, [r2 + r3 * 4]
10397 movu xm8, [r2]
10398 movu xm9, [r2 + r3]
10399 movu xm10, [r2 + r3 * 2]
10400 movu xm11, [r2 + r4]
10401 vinserti128 m0, m0, xm8, 1
10402 vinserti128 m1, m1, xm9, 1
10403 vinserti128 m2, m2, xm10, 1
10404 vinserti128 m3, m3, xm11, 1
10405 movu xm8, [r6]
10406 movu xm9, [r6 + r3]
10407 movu xm10, [r6 + r3 * 2]
10408 movu xm11, [r6 + r4]
10409 vinserti128 m4, m4, xm8, 1
10410 vinserti128 m5, m5, xm9, 1
10411 vinserti128 m6, m6, xm10, 1
10412 vinserti128 m7, m7, xm11, 1
10413
10414 ;; store on stack to use later
10415 mova [rsp + 0 * mmsize], m0
10416 mova [rsp + 1 * mmsize], m1
10417 mova [rsp + 2 * mmsize], m2
10418 mova [rsp + 3 * mmsize], m3
10419 mova [rsp + 4 * mmsize], m4
10420 mova [rsp + 5 * mmsize], m5
10421 mova [rsp + 6 * mmsize], m6
10422 mova [rsp + 7 * mmsize], m7
10423
10424 pabsw m8, m0
10425 pabsw m9, m1
10426 paddw m8, m9
10427 pabsw m10, m2
10428 pabsw m11, m3
10429 paddw m10, m11
10430 paddw m8, m10
10431 pabsw m9, m4
10432 pabsw m10, m5
10433 paddw m9, m10
10434 pabsw m11, m6
10435 pabsw m10, m7
10436 paddw m11, m10
10437 paddw m9, m11
10438 paddw m8, m9
10439 psrldq m9, m8, 8
10440
10441 vextracti128 xm10, m8, 1
10442 vextracti128 xm11, m9, 1
10443
10444 vpmovzxwd m8, xm8
10445 vpmovzxwd m9, xm9
10446 vpmovzxwd m10, xm10
10447 vpmovzxwd m11, xm11
10448
10449 vinserti128 m8, m8, xm10, 1
10450 vinserti128 m9, m9, xm11, 1
10451
10452 paddd m8, m9
10453 psrldq m9, m8, 8
10454 paddd m8, m9
10455 psrldq m9, m8, 4
10456 paddd m8, m9
10457 psrld m8, 2 ; sad_4x4
10458
10459 pmaddwd m0, m13
10460 pmaddwd m1, m13
10461 pmaddwd m2, m13
10462 pmaddwd m3, m13
10463
10464 psrldq m9, m0, 4
10465 psubd m10, m0, m9
10466 paddd m0, m9
10467 vshufps m0, m0, m10, 10001000b
10468 psrldq m9, m0, 4
10469 psubd m10, m0, m9
10470 paddd m0, m9
10471 vshufps m0, m0, m10, 10001000b
10472
10473 psrldq m9, m1, 4
10474 psubd m10, m1, m9
10475 paddd m1, m9
10476 vshufps m1, m1, m10, 10001000b
10477 psrldq m9, m1, 4
10478 psubd m10, m1, m9
10479 paddd m1, m9
10480 vshufps m1, m1, m10, 10001000b
10481
10482 psrldq m9, m2, 4
10483 psubd m10, m2, m9
10484 paddd m2, m9
10485 vshufps m2, m2, m10, 10001000b
10486 psrldq m9, m2, 4
10487 psubd m10, m2, m9
10488 paddd m2, m9
10489 vshufps m2, m2, m10, 10001000b
10490
10491 psrldq m9, m3, 4
10492 psubd m10, m3, m9
10493 paddd m3, m9
10494 vshufps m3, m3, m10, 10001000b
10495 psrldq m9, m3, 4
10496 psubd m10, m3, m9
10497 paddd m3, m9
10498 vshufps m3, m3, m10, 10001000b
10499
10500 SUMSUB_BA d, 0, 1, 9
10501 SUMSUB_BA d, 2, 3, 9
10502 SUMSUB_BA d, 0, 2, 9
10503 SUMSUB_BA d, 1, 3, 9
10504
10505 pmaddwd m4, m13
10506 pmaddwd m5, m13
10507 pmaddwd m6, m13
10508 pmaddwd m7, m13
10509
10510 psrldq m9, m4, 4
10511 psubd m10, m4, m9
10512 paddd m4, m9
10513 vshufps m4, m4, m10, 10001000b
10514 psrldq m9, m4, 4
10515 psubd m10, m4, m9
10516 paddd m4, m9
10517 vshufps m4, m4, m10, 10001000b
10518
10519 psrldq m9, m5, 4
10520 psubd m10, m5, m9
10521 paddd m5, m9
10522 vshufps m5, m5, m10, 10001000b
10523 psrldq m9, m5, 4
10524 psubd m10, m5, m9
10525 paddd m5, m9
10526 vshufps m5, m5, m10, 10001000b
10527
10528 psrldq m9, m6, 4
10529 psubd m10, m6, m9
10530 paddd m6, m9
10531 vshufps m6, m6, m10, 10001000b
10532 psrldq m9, m6, 4
10533 psubd m10, m6, m9
10534 paddd m6, m9
10535 vshufps m6, m6, m10, 10001000b
10536
10537 psrldq m9, m7, 4
10538 psubd m10, m7, m9
10539 paddd m7, m9
10540 vshufps m7, m7, m10, 10001000b
10541 psrldq m9, m7, 4
10542 psubd m10, m7, m9
10543 paddd m7, m9
10544 vshufps m7, m7, m10, 10001000b
10545
10546 SUMSUB_BA d, 4, 5, 9
10547 SUMSUB_BA d, 6, 7, 9
10548 SUMSUB_BA d, 4, 6, 9
10549 SUMSUB_BA d, 5, 7, 9
10550
10551 SUMSUB_BA d, 0, 4, 9
10552 SUMSUB_BA d, 1, 5, 9
10553 SUMSUB_BA d, 2, 6, 9
10554 SUMSUB_BA d, 3, 7, 9
10555
10556 pabsd m0, m0
10557 pabsd m2, m2
10558 pabsd m1, m1
10559 pabsd m3, m3
10560 pabsd m4, m4
10561 pabsd m5, m5
10562 pabsd m6, m6
10563 pabsd m7, m7
10564
10565 paddd m0, m2
10566 paddd m1, m3
10567 paddd m0, m1
10568 paddd m5, m4
10569 paddd m0, m5
10570 paddd m7, m6
10571 paddd m11, m0, m7
10572
10573 pmaddwd m0, m12, [rsp + 0 * mmsize]
10574 pmaddwd m1, m12, [rsp + 1 * mmsize]
10575 pmaddwd m2, m12, [rsp + 2 * mmsize]
10576 pmaddwd m3, m12, [rsp + 3 * mmsize]
10577
10578 psrldq m9, m0, 4
10579 psubd m10, m0, m9
10580 paddd m0, m9
10581 vshufps m0, m0, m10, 10001000b
10582 psrldq m9, m0, 4
10583 psubd m10, m0, m9
10584 paddd m0, m9
10585 vshufps m0, m0, m10, 10001000b
10586
10587 psrldq m9, m1, 4
10588 psubd m10, m1, m9
10589 paddd m1, m9
10590 vshufps m1, m1, m10, 10001000b
10591 psrldq m9, m1, 4
10592 psubd m10, m1, m9
10593 paddd m1, m9
10594 vshufps m1, m1, m10, 10001000b
10595
10596 psrldq m9, m2, 4
10597 psubd m10, m2, m9
10598 paddd m2, m9
10599 vshufps m2, m2, m10, 10001000b
10600 psrldq m9, m2, 4
10601 psubd m10, m2, m9
10602 paddd m2, m9
10603 vshufps m2, m2, m10, 10001000b
10604
10605 psrldq m9, m3, 4
10606 psubd m10, m3, m9
10607 paddd m3, m9
10608 vshufps m3, m3, m10, 10001000b
10609 psrldq m9, m3, 4
10610 psubd m10, m3, m9
10611 paddd m3, m9
10612 vshufps m3, m3, m10, 10001000b
10613
10614 SUMSUB_BA d, 0, 1, 9
10615 SUMSUB_BA d, 2, 3, 9
10616 SUMSUB_BA d, 0, 2, 9
10617 SUMSUB_BA d, 1, 3, 9
10618
10619 pmaddwd m4, m12, [rsp + 4 * mmsize]
10620 pmaddwd m5, m12, [rsp + 5 * mmsize]
10621 pmaddwd m6, m12, [rsp + 6 * mmsize]
10622 pmaddwd m7, m12, [rsp + 7 * mmsize]
10623
10624 psrldq m9, m4, 4
10625 psubd m10, m4, m9
10626 paddd m4, m9
10627 vshufps m4, m4, m10, 10001000b
10628 psrldq m9, m4, 4
10629 psubd m10, m4, m9
10630 paddd m4, m9
10631 vshufps m4, m4, m10, 10001000b
10632
10633 psrldq m9, m5, 4
10634 psubd m10, m5, m9
10635 paddd m5, m9
10636 vshufps m5, m5, m10, 10001000b
10637 psrldq m9, m5, 4
10638 psubd m10, m5, m9
10639 paddd m5, m9
10640 vshufps m5, m5, m10, 10001000b
10641
10642 psrldq m9, m6, 4
10643 psubd m10, m6, m9
10644 paddd m6, m9
10645 vshufps m6, m6, m10, 10001000b
10646 psrldq m9, m6, 4
10647 psubd m10, m6, m9
10648 paddd m6, m9
10649 vshufps m6, m6, m10, 10001000b
10650
10651 psrldq m9, m7, 4
10652 psubd m10, m7, m9
10653 paddd m7, m9
10654 vshufps m7, m7, m10, 10001000b
10655 psrldq m9, m7, 4
10656 psubd m10, m7, m9
10657 paddd m7, m9
10658 vshufps m7, m7, m10, 10001000b
10659
10660 SUMSUB_BA d, 4, 5, 9
10661 SUMSUB_BA d, 6, 7, 9
10662 SUMSUB_BA d, 4, 6, 9
10663 SUMSUB_BA d, 5, 7, 9
10664
10665 SUMSUB_BA d, 0, 4, 9
10666 SUMSUB_BA d, 1, 5, 9
10667 SUMSUB_BA d, 2, 6, 9
10668 SUMSUB_BA d, 3, 7, 9
10669
10670 pabsd m0, m0
10671 pabsd m2, m2
10672 pabsd m1, m1
10673 pabsd m3, m3
10674 pabsd m4, m4
10675 pabsd m5, m5
10676 pabsd m6, m6
10677 pabsd m7, m7
10678
10679 paddd m0, m2
10680 paddd m1, m3
10681 paddd m0, m1
10682 paddd m5, m4
10683 paddd m0, m5
10684 paddd m7, m6
10685 paddd m0, m7
10686 paddd m0, m11
10687
10688 psrldq m1, m0, 8
10689 paddd m0, m1
10690 psrldq m1, m0, 4
10691 paddd m0, m1
10692 paddd m0, [pd_2]
10693 psrld m0, 2
10694 psubd m0, m8
10695 vextracti128 xm1, m0, 1
10696 psubd m0, m1
10697 pabsd m0, m0
10698 %endmacro
10699
10700 %if ARCH_X86_64
10701 INIT_YMM avx2
10702 cglobal psyCost_ss_8x8, 4, 7, 14
10703     ; NOTE: align stack to 64 bytes so the local data starts on a cache-line boundary
10704 mov r5, rsp
10705 sub rsp, 8*mmsize
10706 and rsp, ~63
10707
10708 mova m12, [pw_1]
10709 mova m13, [pw_pmpmpmpm]
10710 add r1, r1
10711 add r3, r3
10712
10713 PSY_SS_8x8
10714
10715 movd eax, xm0
10716 mov rsp, r5
10717 RET
10718 %endif
10719
10720 %if ARCH_X86_64
10721 INIT_YMM avx2
10722 cglobal psyCost_ss_16x16, 4, 9, 15
10723     ; NOTE: align stack to 64 bytes so the local data starts on a cache-line boundary
10724 mov r5, rsp
10725 sub rsp, 8*mmsize
10726 and rsp, ~63
10727
10728 mova m12, [pw_1]
10729 mova m13, [pw_pmpmpmpm]
10730 add r1, r1
10731 add r3, r3
10732 pxor m14, m14
10733
10734 mov r7d, 2
10735 .loopH:
10736 mov r8d, 2
10737 .loopW:
10738 PSY_SS_8x8
10739
10740 paddd m14, m0
10741 add r0, 16
10742 add r2, 16
10743 dec r8d
10744 jnz .loopW
10745 lea r0, [r0 + r1 * 8 - 32]
10746 lea r2, [r2 + r3 * 8 - 32]
10747 dec r7d
10748 jnz .loopH
10749 movd eax, xm14
10750 mov rsp, r5
10751 RET
10752 %endif
10753
10754 %if ARCH_X86_64
10755 INIT_YMM avx2
10756 cglobal psyCost_ss_32x32, 4, 9, 15
10757     ; NOTE: align stack to 64 bytes so the local data starts on a cache-line boundary
10758 mov r5, rsp
10759 sub rsp, 8*mmsize
10760 and rsp, ~63
10761
10762 mova m12, [pw_1]
10763 mova m13, [pw_pmpmpmpm]
10764 add r1, r1
10765 add r3, r3
10766 pxor m14, m14
10767
10768 mov r7d, 4
10769 .loopH:
10770 mov r8d, 4
10771 .loopW:
10772 PSY_SS_8x8
10773
10774 paddd m14, m0
10775 add r0, 16
10776 add r2, 16
10777 dec r8d
10778 jnz .loopW
10779 lea r0, [r0 + r1 * 8 - 64]
10780 lea r2, [r2 + r3 * 8 - 64]
10781 dec r7d
10782 jnz .loopH
10783 movd eax, xm14
10784 mov rsp, r5
10785 RET
10786 %endif
10787
10788 %if ARCH_X86_64
10789 INIT_YMM avx2
10790 cglobal psyCost_ss_64x64, 4, 9, 15
10791     ; NOTE: align stack to 64 bytes so the local data starts on a cache-line boundary
10792 mov r5, rsp
10793 sub rsp, 8*mmsize
10794 and rsp, ~63
10795
10796 mova m12, [pw_1]
10797 mova m13, [pw_pmpmpmpm]
10798 add r1, r1
10799 add r3, r3
10800 pxor m14, m14
10801
10802 mov r7d, 8
10803 .loopH:
10804 mov r8d, 8
10805 .loopW:
10806 PSY_SS_8x8
10807
10808 paddd m14, m0
10809 add r0, 16
10810 add r2, 16
10811 dec r8d
10812 jnz .loopW
10813 lea r0, [r0 + r1 * 8 - 128]
10814 lea r2, [r2 + r3 * 8 - 128]
10815 dec r7d
10816 jnz .loopH
10817 movd eax, xm14
10818 mov rsp, r5
10819 RET
10820 %endif
10821
10822 ;;---------------------------------------------------------------
10823 ;; SATD AVX2
10824 ;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t)
10825 ;;---------------------------------------------------------------
10826 ;; r0 - pix0
10827 ;; r1 - pix0Stride
10828 ;; r2 - pix1
10829 ;; r3 - pix1Stride
10830
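;; The kernels below return the sum of absolute transformed differences: each 4x4
;; sub-block of the difference pix0 - pix1 is passed through a 2-D Hadamard transform
;; and the absolute coefficients are summed.  A rough scalar reference follows (kept in
;; comments, not part of this file; the function name is illustrative, it needs
;; <stdint.h>/<stdlib.h>, and the usual x264/x265 /2 normalisation of the sum is omitted):
;;
;;   static int satd_4x4_ref(const uint8_t *pix0, intptr_t s0,
;;                           const uint8_t *pix1, intptr_t s1)
;;   {
;;       int d[4][4], t[4], satd = 0;
;;       for (int i = 0; i < 4; i++)
;;           for (int j = 0; j < 4; j++)
;;               d[i][j] = pix0[i * s0 + j] - pix1[i * s1 + j];   /* differences */
;;       for (int i = 0; i < 4; i++) {                            /* horizontal pass */
;;           t[0] = d[i][0] + d[i][1]; t[1] = d[i][0] - d[i][1];
;;           t[2] = d[i][2] + d[i][3]; t[3] = d[i][2] - d[i][3];
;;           d[i][0] = t[0] + t[2];    d[i][2] = t[0] - t[2];
;;           d[i][1] = t[1] + t[3];    d[i][3] = t[1] - t[3];
;;       }
;;       for (int j = 0; j < 4; j++) {                            /* vertical pass + abs sum */
;;           t[0] = d[0][j] + d[1][j]; t[1] = d[0][j] - d[1][j];
;;           t[2] = d[2][j] + d[3][j]; t[3] = d[2][j] - d[3][j];
;;           satd += abs(t[0] + t[2]) + abs(t[0] - t[2])
;;                 + abs(t[1] + t[3]) + abs(t[1] - t[3]);
;;       }
;;       return satd;
;;   }
;;
;; calc_satd_16x8/calc_satd_16x4 below evaluate this metric for a 16-pixel-wide strip and
;; accumulate into m8/m9; the pixel_satd_WxH entry points chain those calls and reduce
;; the accumulators to a scalar result in eax.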
10831 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
10832 INIT_YMM avx2
10833 cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
10834 pxor m6, m6
10835 vbroadcasti128 m0, [r0]
10836 vbroadcasti128 m4, [r2]
10837 vbroadcasti128 m1, [r0 + r1]
10838 vbroadcasti128 m5, [r2 + r3]
10839 pmaddubsw m4, m7
10840 pmaddubsw m0, m7
10841 pmaddubsw m5, m7
10842 pmaddubsw m1, m7
10843 psubw m0, m4
10844 psubw m1, m5
10845 vbroadcasti128 m2, [r0 + r1 * 2]
10846 vbroadcasti128 m4, [r2 + r3 * 2]
10847 vbroadcasti128 m3, [r0 + r4]
10848 vbroadcasti128 m5, [r2 + r5]
10849 pmaddubsw m4, m7
10850 pmaddubsw m2, m7
10851 pmaddubsw m5, m7
10852 pmaddubsw m3, m7
10853 psubw m2, m4
10854 psubw m3, m5
10855 lea r0, [r0 + r1 * 4]
10856 lea r2, [r2 + r3 * 4]
10857 paddw m4, m0, m1
10858 psubw m1, m1, m0
10859 paddw m0, m2, m3
10860 psubw m3, m2
10861 paddw m2, m4, m0
10862 psubw m0, m4
10863 paddw m4, m1, m3
10864 psubw m3, m1
10865 pabsw m2, m2
10866 pabsw m0, m0
10867 pabsw m4, m4
10868 pabsw m3, m3
10869 pblendw m1, m2, m0, 10101010b
10870 pslld m0, 16
10871 psrld m2, 16
10872 por m0, m2
10873 pmaxsw m1, m0
10874 paddw m6, m1
10875 pblendw m2, m4, m3, 10101010b
10876 pslld m3, 16
10877 psrld m4, 16
10878 por m3, m4
10879 pmaxsw m2, m3
10880 paddw m6, m2
10881 vbroadcasti128 m1, [r0]
10882 vbroadcasti128 m4, [r2]
10883 vbroadcasti128 m2, [r0 + r1]
10884 vbroadcasti128 m5, [r2 + r3]
10885 pmaddubsw m4, m7
10886 pmaddubsw m1, m7
10887 pmaddubsw m5, m7
10888 pmaddubsw m2, m7
10889 psubw m1, m4
10890 psubw m2, m5
10891 vbroadcasti128 m0, [r0 + r1 * 2]
10892 vbroadcasti128 m4, [r2 + r3 * 2]
10893 vbroadcasti128 m3, [r0 + r4]
10894 vbroadcasti128 m5, [r2 + r5]
10895 lea r0, [r0 + r1 * 4]
10896 lea r2, [r2 + r3 * 4]
10897 pmaddubsw m4, m7
10898 pmaddubsw m0, m7
10899 pmaddubsw m5, m7
10900 pmaddubsw m3, m7
10901 psubw m0, m4
10902 psubw m3, m5
10903 paddw m4, m1, m2
10904 psubw m2, m1
10905 paddw m1, m0, m3
10906 psubw m3, m0
10907 paddw m0, m4, m1
10908 psubw m1, m4
10909 paddw m4, m2, m3
10910 psubw m3, m2
10911 pabsw m0, m0
10912 pabsw m1, m1
10913 pabsw m4, m4
10914 pabsw m3, m3
10915 pblendw m2, m0, m1, 10101010b
10916 pslld m1, 16
10917 psrld m0, 16
10918 por m1, m0
10919 pmaxsw m2, m1
10920 paddw m6, m2
10921 pblendw m0, m4, m3, 10101010b
10922 pslld m3, 16
10923 psrld m4, 16
10924 por m3, m4
10925 pmaxsw m0, m3
10926 paddw m6, m0
10927 vextracti128 xm0, m6, 1
10928 pmovzxwd m6, xm6
10929 pmovzxwd m0, xm0
10930 paddd m8, m6
10931 paddd m9, m0
10932 ret
10933
10934 cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows
10935 pxor m6, m6
10936 vbroadcasti128 m0, [r0]
10937 vbroadcasti128 m4, [r2]
10938 vbroadcasti128 m1, [r0 + r1]
10939 vbroadcasti128 m5, [r2 + r3]
10940 pmaddubsw m4, m7
10941 pmaddubsw m0, m7
10942 pmaddubsw m5, m7
10943 pmaddubsw m1, m7
10944 psubw m0, m4
10945 psubw m1, m5
10946 vbroadcasti128 m2, [r0 + r1 * 2]
10947 vbroadcasti128 m4, [r2 + r3 * 2]
10948 vbroadcasti128 m3, [r0 + r4]
10949 vbroadcasti128 m5, [r2 + r5]
10950 pmaddubsw m4, m7
10951 pmaddubsw m2, m7
10952 pmaddubsw m5, m7
10953 pmaddubsw m3, m7
10954 psubw m2, m4
10955 psubw m3, m5
10956 paddw m4, m0, m1
10957 psubw m1, m1, m0
10958 paddw m0, m2, m3
10959 psubw m3, m2
10960 paddw m2, m4, m0
10961 psubw m0, m4
10962 paddw m4, m1, m3
10963 psubw m3, m1
10964 pabsw m2, m2
10965 pabsw m0, m0
10966 pabsw m4, m4
10967 pabsw m3, m3
10968 pblendw m1, m2, m0, 10101010b
10969 pslld m0, 16
10970 psrld m2, 16
10971 por m0, m2
10972 pmaxsw m1, m0
10973 paddw m6, m1
10974 pblendw m2, m4, m3, 10101010b
10975 pslld m3, 16
10976 psrld m4, 16
10977 por m3, m4
10978 pmaxsw m2, m3
10979 paddw m6, m2
10980 vextracti128 xm0, m6, 1
10981 pmovzxwd m6, xm6
10982 pmovzxwd m0, xm0
10983 paddd m8, m6
10984 paddd m9, m0
10985 ret
10986
10987 cglobal pixel_satd_16x4, 4,6,10 ; if WIN64 && cpuflag(avx2)
10988 mova m7, [hmul_16p]
10989 lea r4, [3 * r1]
10990 lea r5, [3 * r3]
10991 pxor m8, m8
10992 pxor m9, m9
10993
10994 call calc_satd_16x4
10995
10996 paddd m8, m9
10997 vextracti128 xm0, m8, 1
10998 paddd xm0, xm8
10999 movhlps xm1, xm0
11000 paddd xm0, xm1
11001 pshuflw xm1, xm0, q0032
11002 paddd xm0, xm1
11003 movd eax, xm0
11004 RET
11005
11006 cglobal pixel_satd_16x12, 4,6,10 ; if WIN64 && cpuflag(avx2)
11007 mova m7, [hmul_16p]
11008 lea r4, [3 * r1]
11009 lea r5, [3 * r3]
11010 pxor m8, m8
11011 pxor m9, m9
11012
11013 call calc_satd_16x8
11014 call calc_satd_16x4
11015
11016 paddd m8, m9
11017 vextracti128 xm0, m8, 1
11018 paddd xm0, xm8
11019 movhlps xm1, xm0
11020 paddd xm0, xm1
11021 pshuflw xm1, xm0, q0032
11022 paddd xm0, xm1
11023 movd eax, xm0
11024 RET
11025
11026 cglobal pixel_satd_16x32, 4,6,10 ; if WIN64 && cpuflag(avx2)
11027 mova m7, [hmul_16p]
11028 lea r4, [3 * r1]
11029 lea r5, [3 * r3]
11030 pxor m8, m8
11031 pxor m9, m9
11032
11033 call calc_satd_16x8
11034 call calc_satd_16x8
11035 call calc_satd_16x8
11036 call calc_satd_16x8
11037
11038 paddd m8, m9
11039 vextracti128 xm0, m8, 1
11040 paddd xm0, xm8
11041 movhlps xm1, xm0
11042 paddd xm0, xm1
11043 pshuflw xm1, xm0, q0032
11044 paddd xm0, xm1
11045 movd eax, xm0
11046 RET
11047
11048 cglobal pixel_satd_16x64, 4,6,10 ; if WIN64 && cpuflag(avx2)
11049 mova m7, [hmul_16p]
11050 lea r4, [3 * r1]
11051 lea r5, [3 * r3]
11052 pxor m8, m8
11053 pxor m9, m9
11054
11055 call calc_satd_16x8
11056 call calc_satd_16x8
11057 call calc_satd_16x8
11058 call calc_satd_16x8
11059 call calc_satd_16x8
11060 call calc_satd_16x8
11061 call calc_satd_16x8
11062 call calc_satd_16x8
11063
11064 paddd m8, m9
11065 vextracti128 xm0, m8, 1
11066 paddd xm0, xm8
11067 movhlps xm1, xm0
11068 paddd xm0, xm1
11069 pshuflw xm1, xm0, q0032
11070 paddd xm0, xm1
11071 movd eax, xm0
11072 RET
11073
11074 cglobal pixel_satd_32x8, 4,8,10 ; if WIN64 && cpuflag(avx2)
11075 mova m7, [hmul_16p]
11076 lea r4, [3 * r1]
11077 lea r5, [3 * r3]
11078 pxor m8, m8
11079 pxor m9, m9
11080 mov r6, r0
11081 mov r7, r2
11082
11083 call calc_satd_16x8
11084
11085 lea r0, [r6 + 16]
11086 lea r2, [r7 + 16]
11087
11088 call calc_satd_16x8
11089
11090 paddd m8, m9
11091 vextracti128 xm0, m8, 1
11092 paddd xm0, xm8
11093 movhlps xm1, xm0
11094 paddd xm0, xm1
11095 pshuflw xm1, xm0, q0032
11096 paddd xm0, xm1
11097 movd eax, xm0
11098 RET
11099
11100 cglobal pixel_satd_32x16, 4,8,10 ; if WIN64 && cpuflag(avx2)
11101 mova m7, [hmul_16p]
11102 lea r4, [3 * r1]
11103 lea r5, [3 * r3]
11104 pxor m8, m8
11105 pxor m9, m9
11106 mov r6, r0
11107 mov r7, r2
11108
11109 call calc_satd_16x8
11110 call calc_satd_16x8
11111
11112 lea r0, [r6 + 16]
11113 lea r2, [r7 + 16]
11114
11115 call calc_satd_16x8
11116 call calc_satd_16x8
11117
11118 paddd m8, m9
11119 vextracti128 xm0, m8, 1
11120 paddd xm0, xm8
11121 movhlps xm1, xm0
11122 paddd xm0, xm1
11123 pshuflw xm1, xm0, q0032
11124 paddd xm0, xm1
11125 movd eax, xm0
11126 RET
11127
11128 cglobal pixel_satd_32x24, 4,8,10 ; if WIN64 && cpuflag(avx2)
11129 mova m7, [hmul_16p]
11130 lea r4, [3 * r1]
11131 lea r5, [3 * r3]
11132 pxor m8, m8
11133 pxor m9, m9
11134 mov r6, r0
11135 mov r7, r2
11136
11137 call calc_satd_16x8
11138 call calc_satd_16x8
11139 call calc_satd_16x8
11140
11141 lea r0, [r6 + 16]
11142 lea r2, [r7 + 16]
11143
11144 call calc_satd_16x8
11145 call calc_satd_16x8
11146 call calc_satd_16x8
11147
11148 paddd m8, m9
11149 vextracti128 xm0, m8, 1
11150 paddd xm0, xm8
11151 movhlps xm1, xm0
11152 paddd xm0, xm1
11153 pshuflw xm1, xm0, q0032
11154 paddd xm0, xm1
11155 movd eax, xm0
11156 RET
11157
11158 cglobal pixel_satd_32x32, 4,8,10 ; if WIN64 && cpuflag(avx2)
11159 mova m7, [hmul_16p]
11160 lea r4, [3 * r1]
11161 lea r5, [3 * r3]
11162 pxor m8, m8
11163 pxor m9, m9
11164 mov r6, r0
11165 mov r7, r2
11166
11167 call calc_satd_16x8
11168 call calc_satd_16x8
11169 call calc_satd_16x8
11170 call calc_satd_16x8
11171
11172 lea r0, [r6 + 16]
11173 lea r2, [r7 + 16]
11174
11175 call calc_satd_16x8
11176 call calc_satd_16x8
11177 call calc_satd_16x8
11178 call calc_satd_16x8
11179
11180 paddd m8, m9
11181 vextracti128 xm0, m8, 1
11182 paddd xm0, xm8
11183 movhlps xm1, xm0
11184 paddd xm0, xm1
11185 pshuflw xm1, xm0, q0032
11186 paddd xm0, xm1
11187 movd eax, xm0
11188 RET
11189
11190 cglobal pixel_satd_32x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
11191 mova m7, [hmul_16p]
11192 lea r4, [3 * r1]
11193 lea r5, [3 * r3]
11194 pxor m8, m8
11195 pxor m9, m9
11196 mov r6, r0
11197 mov r7, r2
11198
11199 call calc_satd_16x8
11200 call calc_satd_16x8
11201 call calc_satd_16x8
11202 call calc_satd_16x8
11203 call calc_satd_16x8
11204 call calc_satd_16x8
11205 call calc_satd_16x8
11206 call calc_satd_16x8
11207
11208 lea r0, [r6 + 16]
11209 lea r2, [r7 + 16]
11210
11211 call calc_satd_16x8
11212 call calc_satd_16x8
11213 call calc_satd_16x8
11214 call calc_satd_16x8
11215 call calc_satd_16x8
11216 call calc_satd_16x8
11217 call calc_satd_16x8
11218 call calc_satd_16x8
11219
11220 paddd m8, m9
11221 vextracti128 xm0, m8, 1
11222 paddd xm0, xm8
11223 movhlps xm1, xm0
11224 paddd xm0, xm1
11225 pshuflw xm1, xm0, q0032
11226 paddd xm0, xm1
11227 movd eax, xm0
11228 RET
11229
11230 cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
11231 mova m7, [hmul_16p]
11232 lea r4, [3 * r1]
11233 lea r5, [3 * r3]
11234 pxor m8, m8
11235 pxor m9, m9
11236 mov r6, r0
11237 mov r7, r2
11238
11239 call calc_satd_16x8
11240 call calc_satd_16x8
11241 call calc_satd_16x8
11242 call calc_satd_16x8
11243 call calc_satd_16x8
11244 call calc_satd_16x8
11245 call calc_satd_16x8
11246 call calc_satd_16x8
11247 lea r0, [r6 + 16]
11248 lea r2, [r7 + 16]
11249 call calc_satd_16x8
11250 call calc_satd_16x8
11251 call calc_satd_16x8
11252 call calc_satd_16x8
11253 call calc_satd_16x8
11254 call calc_satd_16x8
11255 call calc_satd_16x8
11256 call calc_satd_16x8
11257 lea r0, [r6 + 32]
11258 lea r2, [r7 + 32]
11259 call calc_satd_16x8
11260 call calc_satd_16x8
11261 call calc_satd_16x8
11262 call calc_satd_16x8
11263 call calc_satd_16x8
11264 call calc_satd_16x8
11265 call calc_satd_16x8
11266 call calc_satd_16x8
11267
11268 paddd m8, m9
11269 vextracti128 xm0, m8, 1
11270 paddd xm0, xm8
11271 movhlps xm1, xm0
11272 paddd xm0, xm1
11273 pshuflw xm1, xm0, q0032
11274 paddd xm0, xm1
11275 movd eax, xm0
11276 RET
11277
11278 cglobal pixel_satd_64x16, 4,8,10 ; if WIN64 && cpuflag(avx2)
11279 mova m7, [hmul_16p]
11280 lea r4, [3 * r1]
11281 lea r5, [3 * r3]
11282 pxor m8, m8
11283 pxor m9, m9
11284 mov r6, r0
11285 mov r7, r2
11286
11287 call calc_satd_16x8
11288 call calc_satd_16x8
11289 lea r0, [r6 + 16]
11290 lea r2, [r7 + 16]
11291 call calc_satd_16x8
11292 call calc_satd_16x8
11293 lea r0, [r6 + 32]
11294 lea r2, [r7 + 32]
11295 call calc_satd_16x8
11296 call calc_satd_16x8
11297 lea r0, [r6 + 48]
11298 lea r2, [r7 + 48]
11299 call calc_satd_16x8
11300 call calc_satd_16x8
11301
11302 paddd m8, m9
11303 vextracti128 xm0, m8, 1
11304 paddd xm0, xm8
11305 movhlps xm1, xm0
11306 paddd xm0, xm1
11307 pshuflw xm1, xm0, q0032
11308 paddd xm0, xm1
11309 movd eax, xm0
11310 RET
11311
11312 cglobal pixel_satd_64x32, 4,8,10 ; if WIN64 && cpuflag(avx2)
11313 mova m7, [hmul_16p]
11314 lea r4, [3 * r1]
11315 lea r5, [3 * r3]
11316 pxor m8, m8
11317 pxor m9, m9
11318 mov r6, r0
11319 mov r7, r2
11320
11321 call calc_satd_16x8
11322 call calc_satd_16x8
11323 call calc_satd_16x8
11324 call calc_satd_16x8
11325 lea r0, [r6 + 16]
11326 lea r2, [r7 + 16]
11327 call calc_satd_16x8
11328 call calc_satd_16x8
11329 call calc_satd_16x8
11330 call calc_satd_16x8
11331 lea r0, [r6 + 32]
11332 lea r2, [r7 + 32]
11333 call calc_satd_16x8
11334 call calc_satd_16x8
11335 call calc_satd_16x8
11336 call calc_satd_16x8
11337 lea r0, [r6 + 48]
11338 lea r2, [r7 + 48]
11339 call calc_satd_16x8
11340 call calc_satd_16x8
11341 call calc_satd_16x8
11342 call calc_satd_16x8
11343
11344 paddd m8, m9
11345 vextracti128 xm0, m8, 1
11346 paddd xm0, xm8
11347 movhlps xm1, xm0
11348 paddd xm0, xm1
11349 pshuflw xm1, xm0, q0032
11350 paddd xm0, xm1
11351 movd eax, xm0
11352 RET
11353
11354 cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2)
11355 mova m7, [hmul_16p]
11356 lea r4, [3 * r1]
11357 lea r5, [3 * r3]
11358 pxor m8, m8
11359 pxor m9, m9
11360 mov r6, r0
11361 mov r7, r2
11362
11363 call calc_satd_16x8
11364 call calc_satd_16x8
11365 call calc_satd_16x8
11366 call calc_satd_16x8
11367 call calc_satd_16x8
11368 call calc_satd_16x8
11369 lea r0, [r6 + 16]
11370 lea r2, [r7 + 16]
11371 call calc_satd_16x8
11372 call calc_satd_16x8
11373 call calc_satd_16x8
11374 call calc_satd_16x8
11375 call calc_satd_16x8
11376 call calc_satd_16x8
11377 lea r0, [r6 + 32]
11378 lea r2, [r7 + 32]
11379 call calc_satd_16x8
11380 call calc_satd_16x8
11381 call calc_satd_16x8
11382 call calc_satd_16x8
11383 call calc_satd_16x8
11384 call calc_satd_16x8
11385 lea r0, [r6 + 48]
11386 lea r2, [r7 + 48]
11387 call calc_satd_16x8
11388 call calc_satd_16x8
11389 call calc_satd_16x8
11390 call calc_satd_16x8
11391 call calc_satd_16x8
11392 call calc_satd_16x8
11393
11394 paddd m8, m9
11395 vextracti128 xm0, m8, 1
11396 paddd xm0, xm8
11397 movhlps xm1, xm0
11398 paddd xm0, xm1
11399 pshuflw xm1, xm0, q0032
11400 paddd xm0, xm1
11401 movd eax, xm0
11402 RET
11403
11404 cglobal pixel_satd_64x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
11405 mova m7, [hmul_16p]
11406 lea r4, [3 * r1]
11407 lea r5, [3 * r3]
11408 pxor m8, m8
11409 pxor m9, m9
11410 mov r6, r0
11411 mov r7, r2
11412
11413 call calc_satd_16x8
11414 call calc_satd_16x8
11415 call calc_satd_16x8
11416 call calc_satd_16x8
11417 call calc_satd_16x8
11418 call calc_satd_16x8
11419 call calc_satd_16x8
11420 call calc_satd_16x8
11421 lea r0, [r6 + 16]
11422 lea r2, [r7 + 16]
11423 call calc_satd_16x8
11424 call calc_satd_16x8
11425 call calc_satd_16x8
11426 call calc_satd_16x8
11427 call calc_satd_16x8
11428 call calc_satd_16x8
11429 call calc_satd_16x8
11430 call calc_satd_16x8
11431 lea r0, [r6 + 32]
11432 lea r2, [r7 + 32]
11433 call calc_satd_16x8
11434 call calc_satd_16x8
11435 call calc_satd_16x8
11436 call calc_satd_16x8
11437 call calc_satd_16x8
11438 call calc_satd_16x8
11439 call calc_satd_16x8
11440 call calc_satd_16x8
11441 lea r0, [r6 + 48]
11442 lea r2, [r7 + 48]
11443 call calc_satd_16x8
11444 call calc_satd_16x8
11445 call calc_satd_16x8
11446 call calc_satd_16x8
11447 call calc_satd_16x8
11448 call calc_satd_16x8
11449 call calc_satd_16x8
11450 call calc_satd_16x8
11451
11452 paddd m8, m9
11453 vextracti128 xm0, m8, 1
11454 paddd xm0, xm8
11455 movhlps xm1, xm0
11456 paddd xm0, xm1
11457 pshuflw xm1, xm0, q0032
11458 paddd xm0, xm1
11459 movd eax, xm0
11460 RET
11461 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
11462
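; 16-bit (HIGH_BIT_DEPTH) variants: the samples are already words, so the kernels below
; subtract the word pixels directly and transpose with punpck* instructions instead of
; using the pmaddubsw/hmul_16p trick of the 8-bit path above.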
11463 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
11464 INIT_YMM avx2
11465 cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
11466 ; rows 0-3
11467 movu m0, [r0]
11468 movu m4, [r2]
11469 psubw m0, m4
11470 movu m1, [r0 + r1]
11471 movu m5, [r2 + r3]
11472 psubw m1, m5
11473 movu m2, [r0 + r1 * 2]
11474 movu m4, [r2 + r3 * 2]
11475 psubw m2, m4
11476 movu m3, [r0 + r4]
11477 movu m5, [r2 + r5]
11478 psubw m3, m5
11479 lea r0, [r0 + r1 * 4]
11480 lea r2, [r2 + r3 * 4]
11481 paddw m4, m0, m1
11482 psubw m1, m0
11483 paddw m0, m2, m3
11484 psubw m3, m2
11485 punpckhwd m2, m4, m1
11486 punpcklwd m4, m1
11487 punpckhwd m1, m0, m3
11488 punpcklwd m0, m3
11489 paddw m3, m4, m0
11490 psubw m0, m4
11491 paddw m4, m2, m1
11492 psubw m1, m2
11493 punpckhdq m2, m3, m0
11494 punpckldq m3, m0
11495 paddw m0, m3, m2
11496 psubw m2, m3
11497 punpckhdq m3, m4, m1
11498 punpckldq m4, m1
11499 paddw m1, m4, m3
11500 psubw m3, m4
11501 punpckhqdq m4, m0, m1
11502 punpcklqdq m0, m1
11503 pabsw m0, m0
11504 pabsw m4, m4
11505 pmaxsw m0, m0, m4
11506 punpckhqdq m1, m2, m3
11507 punpcklqdq m2, m3
11508 pabsw m2, m2
11509 pabsw m1, m1
11510 pmaxsw m2, m1
11511 pxor m7, m7
11512 mova m1, m0
11513 punpcklwd m1, m7
11514 paddd m6, m1
11515 mova m1, m0
11516 punpckhwd m1, m7
11517 paddd m6, m1
11518 pxor m7, m7
11519 mova m1, m2
11520 punpcklwd m1, m7
11521 paddd m6, m1
11522 mova m1, m2
11523 punpckhwd m1, m7
11524 paddd m6, m1
11525 ; rows 4-7
11526 movu m0, [r0]
11527 movu m4, [r2]
11528 psubw m0, m4
11529 movu m1, [r0 + r1]
11530 movu m5, [r2 + r3]
11531 psubw m1, m5
11532 movu m2, [r0 + r1 * 2]
11533 movu m4, [r2 + r3 * 2]
11534 psubw m2, m4
11535 movu m3, [r0 + r4]
11536 movu m5, [r2 + r5]
11537 psubw m3, m5
11538 lea r0, [r0 + r1 * 4]
11539 lea r2, [r2 + r3 * 4]
11540 paddw m4, m0, m1
11541 psubw m1, m0
11542 paddw m0, m2, m3
11543 psubw m3, m2
11544 punpckhwd m2, m4, m1
11545 punpcklwd m4, m1
11546 punpckhwd m1, m0, m3
11547 punpcklwd m0, m3
11548 paddw m3, m4, m0
11549 psubw m0, m4
11550 paddw m4, m2, m1
11551 psubw m1, m2
11552 punpckhdq m2, m3, m0
11553 punpckldq m3, m0
11554 paddw m0, m3, m2
11555 psubw m2, m3
11556 punpckhdq m3, m4, m1
11557 punpckldq m4, m1
11558 paddw m1, m4, m3
11559 psubw m3, m4
11560 punpckhqdq m4, m0, m1
11561 punpcklqdq m0, m1
11562 pabsw m0, m0
11563 pabsw m4, m4
11564 pmaxsw m0, m0, m4
11565 punpckhqdq m1, m2, m3
11566 punpcklqdq m2, m3
11567 pabsw m2, m2
11568 pabsw m1, m1
11569 pmaxsw m2, m1
11570 pxor m7, m7
11571 mova m1, m0
11572 punpcklwd m1, m7
11573 paddd m6, m1
11574 mova m1, m0
11575 punpckhwd m1, m7
11576 paddd m6, m1
11577 pxor m7, m7
11578 mova m1, m2
11579 punpcklwd m1, m7
11580 paddd m6, m1
11581 mova m1, m2
11582 punpckhwd m1, m7
11583 paddd m6, m1
11584 ret
11585
11586 cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows
11587 ; rows 0-3
11588 movu m0, [r0]
11589 movu m4, [r2]
11590 psubw m0, m4
11591 movu m1, [r0 + r1]
11592 movu m5, [r2 + r3]
11593 psubw m1, m5
11594 movu m2, [r0 + r1 * 2]
11595 movu m4, [r2 + r3 * 2]
11596 psubw m2, m4
11597 movu m3, [r0 + r4]
11598 movu m5, [r2 + r5]
11599 psubw m3, m5
11600 lea r0, [r0 + r1 * 4]
11601 lea r2, [r2 + r3 * 4]
11602 paddw m4, m0, m1
11603 psubw m1, m0
11604 paddw m0, m2, m3
11605 psubw m3, m2
11606 punpckhwd m2, m4, m1
11607 punpcklwd m4, m1
11608 punpckhwd m1, m0, m3
11609 punpcklwd m0, m3
11610 paddw m3, m4, m0
11611 psubw m0, m4
11612 paddw m4, m2, m1
11613 psubw m1, m2
11614 punpckhdq m2, m3, m0
11615 punpckldq m3, m0
11616 paddw m0, m3, m2
11617 psubw m2, m3
11618 punpckhdq m3, m4, m1
11619 punpckldq m4, m1
11620 paddw m1, m4, m3
11621 psubw m3, m4
11622 punpckhqdq m4, m0, m1
11623 punpcklqdq m0, m1
11624 pabsw m0, m0
11625 pabsw m4, m4
11626 pmaxsw m0, m0, m4
11627 punpckhqdq m1, m2, m3
11628 punpcklqdq m2, m3
11629 pabsw m2, m2
11630 pabsw m1, m1
11631 pmaxsw m2, m1
11632 pxor m7, m7
11633 mova m1, m0
11634 punpcklwd m1, m7
11635 paddd m6, m1
11636 mova m1, m0
11637 punpckhwd m1, m7
11638 paddd m6, m1
11639 pxor m7, m7
11640 mova m1, m2
11641 punpcklwd m1, m7
11642 paddd m6, m1
11643 mova m1, m2
11644 punpckhwd m1, m7
11645 paddd m6, m1
11646 ret
11647
11648 cglobal pixel_satd_16x4, 4,6,8
11649 add r1d, r1d
11650 add r3d, r3d
11651 lea r4, [3 * r1]
11652 lea r5, [3 * r3]
11653 pxor m6, m6
11654
11655 call calc_satd_16x4
11656
11657 vextracti128 xm7, m6, 1
11658 paddd xm6, xm7
11659 pxor xm7, xm7
11660 movhlps xm7, xm6
11661 paddd xm6, xm7
11662 pshufd xm7, xm6, 1
11663 paddd xm6, xm7
11664 movd eax, xm6
11665 RET
11666
11667 cglobal pixel_satd_16x8, 4,6,8
11668 add r1d, r1d
11669 add r3d, r3d
11670 lea r4, [3 * r1]
11671 lea r5, [3 * r3]
11672 pxor m6, m6
11673
11674 call calc_satd_16x8
11675
11676 vextracti128 xm7, m6, 1
11677 paddd xm6, xm7
11678 pxor xm7, xm7
11679 movhlps xm7, xm6
11680 paddd xm6, xm7
11681 pshufd xm7, xm6, 1
11682 paddd xm6, xm7
11683 movd eax, xm6
11684 RET
11685
11686 cglobal pixel_satd_16x12, 4,6,8
11687 add r1d, r1d
11688 add r3d, r3d
11689 lea r4, [3 * r1]
11690 lea r5, [3 * r3]
11691 pxor m6, m6
11692
11693 call calc_satd_16x8
11694 call calc_satd_16x4
11695
11696 vextracti128 xm7, m6, 1
11697 paddd xm6, xm7
11698 pxor xm7, xm7
11699 movhlps xm7, xm6
11700 paddd xm6, xm7
11701 pshufd xm7, xm6, 1
11702 paddd xm6, xm7
11703 movd eax, xm6
11704 RET
11705
11706 cglobal pixel_satd_16x16, 4,6,8
11707 add r1d, r1d
11708 add r3d, r3d
11709 lea r4, [3 * r1]
11710 lea r5, [3 * r3]
11711 pxor m6, m6
11712
11713 call calc_satd_16x8
11714 call calc_satd_16x8
11715
11716 vextracti128 xm7, m6, 1
11717 paddd xm6, xm7
11718 pxor xm7, xm7
11719 movhlps xm7, xm6
11720 paddd xm6, xm7
11721 pshufd xm7, xm6, 1
11722 paddd xm6, xm7
11723 movd eax, xm6
11724 RET
11725
11726 cglobal pixel_satd_16x32, 4,6,8
11727 add r1d, r1d
11728 add r3d, r3d
11729 lea r4, [3 * r1]
11730 lea r5, [3 * r3]
11731 pxor m6, m6
11732
11733 call calc_satd_16x8
11734 call calc_satd_16x8
11735 call calc_satd_16x8
11736 call calc_satd_16x8
11737
11738 vextracti128 xm7, m6, 1
11739 paddd xm6, xm7
11740 pxor xm7, xm7
11741 movhlps xm7, xm6
11742 paddd xm6, xm7
11743 pshufd xm7, xm6, 1
11744 paddd xm6, xm7
11745 movd eax, xm6
11746 RET
11747
11748 cglobal pixel_satd_16x64, 4,6,8
11749 add r1d, r1d
11750 add r3d, r3d
11751 lea r4, [3 * r1]
11752 lea r5, [3 * r3]
11753 pxor m6, m6
11754
11755 call calc_satd_16x8
11756 call calc_satd_16x8
11757 call calc_satd_16x8
11758 call calc_satd_16x8
11759 call calc_satd_16x8
11760 call calc_satd_16x8
11761 call calc_satd_16x8
11762 call calc_satd_16x8
11763
11764 vextracti128 xm7, m6, 1
11765 paddd xm6, xm7
11766 pxor xm7, xm7
11767 movhlps xm7, xm6
11768 paddd xm6, xm7
11769 pshufd xm7, xm6, 1
11770 paddd xm6, xm7
11771 movd eax, xm6
11772 RET
11773
11774 cglobal pixel_satd_32x8, 4,8,8
11775 add r1d, r1d
11776 add r3d, r3d
11777 lea r4, [3 * r1]
11778 lea r5, [3 * r3]
11779 pxor m6, m6
11780 mov r6, r0
11781 mov r7, r2
11782
11783 call calc_satd_16x8
11784
11785 lea r0, [r6 + 32]
11786 lea r2, [r7 + 32]
11787
11788 call calc_satd_16x8
11789
11790 vextracti128 xm7, m6, 1
11791 paddd xm6, xm7
11792 pxor xm7, xm7
11793 movhlps xm7, xm6
11794 paddd xm6, xm7
11795 pshufd xm7, xm6, 1
11796 paddd xm6, xm7
11797 movd eax, xm6
11798 RET
11799
11800 cglobal pixel_satd_32x16, 4,8,8
11801 add r1d, r1d
11802 add r3d, r3d
11803 lea r4, [3 * r1]
11804 lea r5, [3 * r3]
11805 pxor m6, m6
11806 mov r6, r0
11807 mov r7, r2
11808
11809 call calc_satd_16x8
11810 call calc_satd_16x8
11811
11812 lea r0, [r6 + 32]
11813 lea r2, [r7 + 32]
11814
11815 call calc_satd_16x8
11816 call calc_satd_16x8
11817
11818 vextracti128 xm7, m6, 1
11819 paddd xm6, xm7
11820 pxor xm7, xm7
11821 movhlps xm7, xm6
11822 paddd xm6, xm7
11823 pshufd xm7, xm6, 1
11824 paddd xm6, xm7
11825 movd eax, xm6
11826 RET
11827
11828 cglobal pixel_satd_32x24, 4,8,8
11829 add r1d, r1d
11830 add r3d, r3d
11831 lea r4, [3 * r1]
11832 lea r5, [3 * r3]
11833 pxor m6, m6
11834 mov r6, r0
11835 mov r7, r2
11836
11837 call calc_satd_16x8
11838 call calc_satd_16x8
11839 call calc_satd_16x8
11840
11841 lea r0, [r6 + 32]
11842 lea r2, [r7 + 32]
11843
11844 call calc_satd_16x8
11845 call calc_satd_16x8
11846 call calc_satd_16x8
11847
11848 vextracti128 xm7, m6, 1
11849 paddd xm6, xm7
11850 pxor xm7, xm7
11851 movhlps xm7, xm6
11852 paddd xm6, xm7
11853 pshufd xm7, xm6, 1
11854 paddd xm6, xm7
11855 movd eax, xm6
11856 RET
11857
11858 cglobal pixel_satd_32x32, 4,8,8
11859 add r1d, r1d
11860 add r3d, r3d
11861 lea r4, [3 * r1]
11862 lea r5, [3 * r3]
11863 pxor m6, m6
11864 mov r6, r0
11865 mov r7, r2
11866
11867 call calc_satd_16x8
11868 call calc_satd_16x8
11869 call calc_satd_16x8
11870 call calc_satd_16x8
11871
11872 lea r0, [r6 + 32]
11873 lea r2, [r7 + 32]
11874
11875 call calc_satd_16x8
11876 call calc_satd_16x8
11877 call calc_satd_16x8
11878 call calc_satd_16x8
11879
11880 vextracti128 xm7, m6, 1
11881 paddd xm6, xm7
11882 pxor xm7, xm7
11883 movhlps xm7, xm6
11884 paddd xm6, xm7
11885 pshufd xm7, xm6, 1
11886 paddd xm6, xm7
11887 movd eax, xm6
11888 RET
11889
11890 cglobal pixel_satd_32x64, 4,8,8
11891 add r1d, r1d
11892 add r3d, r3d
11893 lea r4, [3 * r1]
11894 lea r5, [3 * r3]
11895 pxor m6, m6
11896 mov r6, r0
11897 mov r7, r2
11898
11899 call calc_satd_16x8
11900 call calc_satd_16x8
11901 call calc_satd_16x8
11902 call calc_satd_16x8
11903 call calc_satd_16x8
11904 call calc_satd_16x8
11905 call calc_satd_16x8
11906 call calc_satd_16x8
11907
11908 lea r0, [r6 + 32]
11909 lea r2, [r7 + 32]
11910
11911 call calc_satd_16x8
11912 call calc_satd_16x8
11913 call calc_satd_16x8
11914 call calc_satd_16x8
11915 call calc_satd_16x8
11916 call calc_satd_16x8
11917 call calc_satd_16x8
11918 call calc_satd_16x8
11919
11920 vextracti128 xm7, m6, 1
11921 paddd xm6, xm7
11922 pxor xm7, xm7
11923 movhlps xm7, xm6
11924 paddd xm6, xm7
11925 pshufd xm7, xm6, 1
11926 paddd xm6, xm7
11927 movd eax, xm6
11928 RET
11929
11930 cglobal pixel_satd_48x64, 4,8,8
11931 add r1d, r1d
11932 add r3d, r3d
11933 lea r4, [3 * r1]
11934 lea r5, [3 * r3]
11935 pxor m6, m6
11936 mov r6, r0
11937 mov r7, r2
11938
11939 call calc_satd_16x8
11940 call calc_satd_16x8
11941 call calc_satd_16x8
11942 call calc_satd_16x8
11943 call calc_satd_16x8
11944 call calc_satd_16x8
11945 call calc_satd_16x8
11946 call calc_satd_16x8
11947
11948 lea r0, [r6 + 32]
11949 lea r2, [r7 + 32]
11950
11951 call calc_satd_16x8
11952 call calc_satd_16x8
11953 call calc_satd_16x8
11954 call calc_satd_16x8
11955 call calc_satd_16x8
11956 call calc_satd_16x8
11957 call calc_satd_16x8
11958 call calc_satd_16x8
11959
11960 lea r0, [r6 + 64]
11961 lea r2, [r7 + 64]
11962
11963 call calc_satd_16x8
11964 call calc_satd_16x8
11965 call calc_satd_16x8
11966 call calc_satd_16x8
11967 call calc_satd_16x8
11968 call calc_satd_16x8
11969 call calc_satd_16x8
11970 call calc_satd_16x8
11971
11972 vextracti128 xm7, m6, 1
11973 paddd xm6, xm7
11974 pxor xm7, xm7
11975 movhlps xm7, xm6
11976 paddd xm6, xm7
11977 pshufd xm7, xm6, 1
11978 paddd xm6, xm7
11979 movd eax, xm6
11980 RET
11981
11982 cglobal pixel_satd_64x16, 4,8,8
11983 add r1d, r1d
11984 add r3d, r3d
11985 lea r4, [3 * r1]
11986 lea r5, [3 * r3]
11987 pxor m6, m6
11988 mov r6, r0
11989 mov r7, r2
11990
11991 call calc_satd_16x8
11992 call calc_satd_16x8
11993
11994 lea r0, [r6 + 32]
11995 lea r2, [r7 + 32]
11996
11997 call calc_satd_16x8
11998 call calc_satd_16x8
11999
12000 lea r0, [r6 + 64]
12001 lea r2, [r7 + 64]
12002
12003 call calc_satd_16x8
12004 call calc_satd_16x8
12005
12006 lea r0, [r6 + 96]
12007 lea r2, [r7 + 96]
12008
12009 call calc_satd_16x8
12010 call calc_satd_16x8
12011
12012 vextracti128 xm7, m6, 1
12013 paddd xm6, xm7
12014 pxor xm7, xm7
12015 movhlps xm7, xm6
12016 paddd xm6, xm7
12017 pshufd xm7, xm6, 1
12018 paddd xm6, xm7
12019 movd eax, xm6
12020 RET
12021
12022 cglobal pixel_satd_64x32, 4,8,8
12023 add r1d, r1d
12024 add r3d, r3d
12025 lea r4, [3 * r1]
12026 lea r5, [3 * r3]
12027 pxor m6, m6
12028 mov r6, r0
12029 mov r7, r2
12030
12031 call calc_satd_16x8
12032 call calc_satd_16x8
12033 call calc_satd_16x8
12034 call calc_satd_16x8
12035
12036 lea r0, [r6 + 32]
12037 lea r2, [r7 + 32]
12038
12039 call calc_satd_16x8
12040 call calc_satd_16x8
12041 call calc_satd_16x8
12042 call calc_satd_16x8
12043
12044 lea r0, [r6 + 64]
12045 lea r2, [r7 + 64]
12046
12047 call calc_satd_16x8
12048 call calc_satd_16x8
12049 call calc_satd_16x8
12050 call calc_satd_16x8
12051
12052 lea r0, [r6 + 96]
12053 lea r2, [r7 + 96]
12054
12055 call calc_satd_16x8
12056 call calc_satd_16x8
12057 call calc_satd_16x8
12058 call calc_satd_16x8
12059
12060 vextracti128 xm7, m6, 1
12061 paddd xm6, xm7
12062 pxor xm7, xm7
12063 movhlps xm7, xm6
12064 paddd xm6, xm7
12065 pshufd xm7, xm6, 1
12066 paddd xm6, xm7
12067 movd eax, xm6
12068 RET
12069
12070 cglobal pixel_satd_64x48, 4,8,8
12071 add r1d, r1d
12072 add r3d, r3d
12073 lea r4, [3 * r1]
12074 lea r5, [3 * r3]
12075 pxor m6, m6
12076 mov r6, r0
12077 mov r7, r2
12078
12079 call calc_satd_16x8
12080 call calc_satd_16x8
12081 call calc_satd_16x8
12082 call calc_satd_16x8
12083 call calc_satd_16x8
12084 call calc_satd_16x8
12085
12086 lea r0, [r6 + 32]
12087 lea r2, [r7 + 32]
12088
12089 call calc_satd_16x8
12090 call calc_satd_16x8
12091 call calc_satd_16x8
12092 call calc_satd_16x8
12093 call calc_satd_16x8
12094 call calc_satd_16x8
12095
12096 lea r0, [r6 + 64]
12097 lea r2, [r7 + 64]
12098
12099 call calc_satd_16x8
12100 call calc_satd_16x8
12101 call calc_satd_16x8
12102 call calc_satd_16x8
12103 call calc_satd_16x8
12104 call calc_satd_16x8
12105
12106 lea r0, [r6 + 96]
12107 lea r2, [r7 + 96]
12108
12109 call calc_satd_16x8
12110 call calc_satd_16x8
12111 call calc_satd_16x8
12112 call calc_satd_16x8
12113 call calc_satd_16x8
12114 call calc_satd_16x8
12115
12116 vextracti128 xm7, m6, 1
12117 paddd xm6, xm7
12118 pxor xm7, xm7
12119 movhlps xm7, xm6
12120 paddd xm6, xm7
12121 pshufd xm7, xm6, 1
12122 paddd xm6, xm7
12123 movd eax, xm6
12124 RET
12125
12126 cglobal pixel_satd_64x64, 4,8,8
12127 add r1d, r1d
12128 add r3d, r3d
12129 lea r4, [3 * r1]
12130 lea r5, [3 * r3]
12131 pxor m6, m6
12132 mov r6, r0
12133 mov r7, r2
12134
12135 call calc_satd_16x8
12136 call calc_satd_16x8
12137 call calc_satd_16x8
12138 call calc_satd_16x8
12139 call calc_satd_16x8
12140 call calc_satd_16x8
12141 call calc_satd_16x8
12142 call calc_satd_16x8
12143
12144 lea r0, [r6 + 32]
12145 lea r2, [r7 + 32]
12146
12147 call calc_satd_16x8
12148 call calc_satd_16x8
12149 call calc_satd_16x8
12150 call calc_satd_16x8
12151 call calc_satd_16x8
12152 call calc_satd_16x8
12153 call calc_satd_16x8
12154 call calc_satd_16x8
12155
12156 lea r0, [r6 + 64]
12157 lea r2, [r7 + 64]
12158
12159 call calc_satd_16x8
12160 call calc_satd_16x8
12161 call calc_satd_16x8
12162 call calc_satd_16x8
12163 call calc_satd_16x8
12164 call calc_satd_16x8
12165 call calc_satd_16x8
12166 call calc_satd_16x8
12167
12168 lea r0, [r6 + 96]
12169 lea r2, [r7 + 96]
12170
12171 call calc_satd_16x8
12172 call calc_satd_16x8
12173 call calc_satd_16x8
12174 call calc_satd_16x8
12175 call calc_satd_16x8
12176 call calc_satd_16x8
12177 call calc_satd_16x8
12178 call calc_satd_16x8
12179
12180 vextracti128 xm7, m6, 1
12181 paddd xm6, xm7
12182 pxor xm7, xm7
12183 movhlps xm7, xm6
12184 paddd xm6, xm7
12185 pshufd xm7, xm6, 1
12186 paddd xm6, xm7
12187 movd eax, xm6
12188 RET
12189 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
12190
12191
12192 ;-------------------------------------------------------------------------------------------------------------------------------------
12193 ; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
12194 ;-------------------------------------------------------------------------------------------------------------------------------------
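; Clamps every pixel of the plane to [minPix, maxPix] in place, returns the resulting
; maximum pixel value and writes the sum of all clamped pixels to *outsum.  A minimal
; scalar sketch of the same behaviour follows (kept in comments, illustrative only;
; assumes 8-bit pixels as in this HIGH_BIT_DEPTH == 0 path and needs <stdint.h>):
;
;   uint8_t planeClipAndMax_ref(uint8_t *src, intptr_t stride, int width, int height,
;                               uint64_t *outsum, uint8_t minPix, uint8_t maxPix)
;   {
;       uint64_t sum = 0;
;       uint8_t maxLumaLevel = 0;
;       for (int y = 0; y < height; y++, src += stride)
;           for (int x = 0; x < width; x++) {
;               uint8_t v = src[x];
;               if (v < minPix) v = minPix;          /* clamp low  */
;               if (v > maxPix) v = maxPix;          /* clamp high */
;               src[x] = v;                          /* store back */
;               if (v > maxLumaLevel) maxLumaLevel = v;
;               sum += v;
;           }
;       *outsum = sum;
;       return maxLumaLevel;
;   }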
12195 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
12196 INIT_YMM avx2
12197 cglobal planeClipAndMax, 5,7,8
12198 movd xm0, r5m
12199 vpbroadcastb m0, xm0 ; m0 = [min]
12200 vpbroadcastb m1, r6m ; m1 = [max]
12201 pxor m2, m2 ; m2 = sumLuma
12202 pxor m3, m3 ; m3 = maxLumaLevel
12203 pxor m4, m4 ; m4 = zero
12204
12205     ; build the mask for the partial (width % mmsize) pixels at the end of each row
12206 mov r5d, r2d
12207 and r2d, ~(mmsize - 1)
12208 sub r5d, r2d
12209 lea r6, [pb_movemask_32 + mmsize]
12210 sub r6, r5
12211     movu            m5, [r6]                    ; m5 = mask for the last partial columns
12212
12213 .loopH:
12214 lea r5d, [r2 - mmsize]
12215
12216 .loopW:
12217 movu m6, [r0 + r5]
12218 pmaxub m6, m0
12219 pminub m6, m1
12220 movu [r0 + r5], m6 ; store back
12221 pmaxub m3, m6 ; update maxLumaLevel
12222 psadbw m6, m4
12223 paddq m2, m6
12224
12225 sub r5d, mmsize
12226 jge .loopW
12227
12228 ; partial pixels
12229 movu m7, [r0 + r2]
12230 pmaxub m6, m7, m0
12231 pminub m6, m1
12232
12233     pand            m7, m5                      ; keep original values of the invalid pixels
12234 pandn m6, m5, m6 ; clear invalid pixels
12235     por             m7, m6                      ; combine valid & invalid pixels
12236 movu [r0 + r2], m7 ; store back
12237 pmaxub m3, m6 ; update maxLumaLevel
12238 psadbw m6, m4
12239 paddq m2, m6
12240
12241 .next:
12242 add r0, r1
12243 dec r3d
12244 jg .loopH
12245
12246 ; sumLuma
12247 vextracti128 xm0, m2, 1
12248 paddq xm0, xm2
12249 movhlps xm1, xm0
12250 paddq xm0, xm1
12251 movq [r4], xm0
12252
12253 ; maxLumaLevel
12254 vextracti128 xm0, m3, 1
12255 pmaxub xm0, xm3
12256 movhlps xm3, xm0
12257 pmaxub xm0, xm3
12258 pmovzxbw xm0, xm0
12259 pxor xm0, [pb_movemask + 16]
12260 phminposuw xm0, xm0
12261
12262 movd eax, xm0
12263 not al
12264 movzx eax, al
12265 RET
12266 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0