Mercurial > hg > forks > libbpg
comparison: x265/source/common/x86/pixel-a.asm @ revision 0:772086c29cc7
Commit message: Initial import.
author:   Matti Hamalainen <ccr@tnsp.org>
date:     Wed, 16 Nov 2016 11:16:33 +0200
parents:  (none)
children: (none)
comparison mode: equal / deleted / inserted / replaced
-1:000000000000 -> 0:772086c29cc7
;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*          Min Chen <chenm003@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
31 | |
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

; Sign-pattern multipliers consumed by pmaddubsw in the horizontal SATD path.
hmul_8p:    times 8 db 1
            times 4 db 1, -1
            times 8 db 1
            times 4 db 1, -1
hmul_4p:    times 4 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:    times 4 dw 0, -1
mask_1100:  times 2 dd 0, -1
hmul_8w:    times 4 dw 1
            times 2 dw 1, -1
            times 4 dw 1
            times 2 dw 1, -1

ALIGN 32
; Word-shuffle masks: gather even / odd words from a pair of registers.
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15

sw_f0:      dq 0xfff0, 0
pd_f0:      times 4 dd 0xffff0000
54 | |
SECTION .text

; Constants shared with the rest of the codebase (defined in const-a.asm).
cextern pb_0
cextern pb_1
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern pd_1
cextern popcnt_table
cextern pd_2
cextern hmul_16p
cextern pb_movemask
cextern pb_movemask_32
cextern pw_pixel_max
75 | |
76 ;============================================================================= | |
77 ; SATD | |
78 ;============================================================================= | |
79 | |
; JDUP dst, src
; Duplicate a pair of 32-bit values so each dword appears in both xmm halves.
%macro JDUP 2
%if cpuflag(sse4)
    ; shufps is the cheapest choice on anything newer than Conroe
    shufps      %1, %2, 0
%elif cpuflag(ssse3) && notcpuflag(atom)
    ; join two 32-bit values, then duplicate them;
    ; emulating shufps this way is faster on Conroe
    punpcklqdq  %1, %2
    movsldup    %1, %1
%else
    ; no duplication needed: the SSE2 path zero-extends to words
    ; and runs a full horizontal 2D transform instead
    punpckldq   %1, %2
%endif
%endmacro
94 | |
; HSUMSUB: apply the +/- multiplier mask in m%5 to four registers via
; pmaddubsw, producing per-pair sums/differences as words.
%macro HSUMSUB 5
    pmaddubsw   m%2, m%5
    pmaddubsw   m%1, m%5
    pmaddubsw   m%4, m%5
    pmaddubsw   m%3, m%5
%endmacro
101 | |
; DIFF_UNPACK_SSE2: widen four byte registers to words (unpack against the
; zero register m%5), then form the two pixel differences.
%macro DIFF_UNPACK_SSE2 5
    punpcklbw   m%1, m%5
    punpcklbw   m%2, m%5
    punpcklbw   m%3, m%5
    punpcklbw   m%4, m%5
    psubw       m%1, m%2
    psubw       m%3, m%4
%endmacro
110 | |
; DIFF_SUMSUB_SSSE3: pmaddubsw-based sum/sub of two register pairs,
; leaving word differences in m%1 and m%3.
%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
    psubw       m%1, m%2
    psubw       m%3, m%4
%endmacro
116 | |
; LOAD_DUP_2x4P dst, tmp, ptr1, ptr2
; Load two 4-pixel rows and pack both (duplicated) into dst.
%macro LOAD_DUP_2x4P 4
    movd        %1, %3
    movd        %2, %4
    JDUP        %1, %2
%endmacro
122 | |
; LOAD_DUP_4x8P_CONROE: 4*dst, 4*pointer
; Load four 8-pixel rows, each duplicated into both qword halves (movddup).
%macro LOAD_DUP_4x8P_CONROE 8
    movddup     m%3, %6
    movddup     m%4, %8
    movddup     m%1, %5
    movddup     m%2, %7
%endmacro
129 | |
; LOAD_DUP_4x8P_PENRYN: 4*dst, 4*pointer
; Same result as the Conroe variant, but deliberately interleaves
; punpcklqdq and movddup: Penryn and Nehalem run them in different
; execution units, so mixing them improves throughput.
%macro LOAD_DUP_4x8P_PENRYN 8
    movh        m%3, %6
    movh        m%4, %8
    punpcklqdq  m%3, m%3
    movddup     m%1, %5
    punpcklqdq  m%4, m%4
    movddup     m%2, %7
%endmacro
139 | |
; LOAD_SUMSUB_8x2P: load two duplicated 8-pixel row pairs and reduce them
; to word differences (horizontal SATD front end).
%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
144 | |
; 4x dest, 2x tmp, 1x mul, [2*ptr], [increment?]
; Load and difference four 8-pixel rows; optionally advance both pointers
; by four strides when %10 is nonzero.
%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea         %8, [%8+4*r1]
    lea         %9, [%9+4*r3]
%endif
%endmacro
154 | |
; LOAD_SUMSUB_16P_SSSE3: 2*dst, 2*tmp, mul, 2*ptr
; Load one 16-pixel row from each source and reduce to word differences.
%macro LOAD_SUMSUB_16P_SSSE3 7
    movddup     m%1, [%7]
    movddup     m%2, [%7+8]
    mova        m%4, [%6]
    movddup     m%3, m%4
    punpckhqdq  m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
163 | |
; LOAD_SUMSUB_16P_SSE2: 2*dst, 2*tmp, mask, 2*ptr
; SSE2 fallback: deinterleave with the byte mask, difference, then sum/sub.
%macro LOAD_SUMSUB_16P_SSE2 7
    movu        m%4, [%7]
    mova        m%2, [%6]
    DEINTB      %1, %2, %3, %4, %5
    psubw       m%1, m%3
    psubw       m%2, m%4
    SUMSUB_BA w, %1, %2, %3
%endmacro
172 | |
; 8x dest, 1x tmp, 1x mul, [2*ptr], [2nd tmp]
; Load and difference four 16-pixel rows (one LOAD_SUMSUB_16P per row).
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro
180 | |
; LOAD_SUMSUB_16x2P_AVX2: 2*dst, 2*tmp, mul, 4*ptr
; Broadcast four 16-byte rows into full ymm registers, then difference.
%macro LOAD_SUMSUB_16x2P_AVX2 9
    vbroadcasti128 m%1, [%6]
    vbroadcasti128 m%3, [%7]
    vbroadcasti128 m%2, [%8]
    vbroadcasti128 m%4, [%9]
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
189 | |
; 4x dest, 2x tmp, 1x mul, [2*ptr], [increment?]
; AVX2 variant of LOAD_SUMSUB_8x4P: four rows per invocation, optional
; pointer advance by four strides when %10 is nonzero.
%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
    LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
    LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
%if %10
    lea         %8, [%8+4*r1]
    lea         %9, [%9+4*r3]
%endif
%endmacro
199 | |
; LOAD_DUP_4x16P_AVX2: 4*dst, 4*pointer
; Load four 16-byte rows into xmm lanes, then mirror each low lane into
; the high lane of its ymm (q0011 = low 128 bits duplicated).
%macro LOAD_DUP_4x16P_AVX2 8
    mova        xm%3, %6
    mova        xm%4, %8
    mova        xm%1, %5
    mova        xm%2, %7
    vpermq      m%3, m%3, q0011
    vpermq      m%4, m%4, q0011
    vpermq      m%1, m%1, q0011
    vpermq      m%2, m%2, q0011
%endmacro
210 | |
; LOAD_SUMSUB8_16x2P_AVX2: 2*dst, 2*tmp, mul, 4*ptr
; 8-wide rows packed two-per-ymm, then reduced to word differences.
%macro LOAD_SUMSUB8_16x2P_AVX2 9
    LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
216 | |
; 4x dest, 2x tmp, 1x mul, [2*ptr], [increment?]
; Four 8-pixel rows per invocation; optional pointer advance when %10 != 0.
%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
    LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea         %8, [%8+4*r1]
    lea         %9, [%9+4*r3]
%endif
%endmacro
226 | |
; SATD of one 4x4 block (MMX).
; in:      r4 = 3*stride1, r5 = 3*stride2
; %2 =     horizontal pixel offset
; %3 =     nonzero -> advance r0/r2 by four rows afterwards
; out:     m%1 = satd
; clobber: m3..m7
%macro SATD_4x4_MMX 3
%xdefine %%n n%1
%assign offset %2*SIZEOF_PIXEL
    LOAD_DIFF   m4, m3, none, [r0+ offset], [r2+ offset]
    LOAD_DIFF   m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
    LOAD_DIFF   m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
    LOAD_DIFF   m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
%if %3
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
%endif
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
    paddw       m4, m6
; (disabled 12-bit widening path, kept for reference)
;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
;    pxor        m5, m5
;    punpcklwd   m6, m4, m5
;    punpckhwd   m4, m5
;    paddd       m4, m6
;%endif
    SWAP %%n, 4
%endmacro
253 | |
; SATD of an 8x4 difference already held in m%2..m%5.
; %1    = 0 -> horizontal (rows pre-transformed by pmaddubsw), 1 -> vertical
; %2-%5 = data registers, %6-%7 = scratch, %8 = accumulator
; %9    = 'swap' -> move the first result into %8 instead of adding to it
; At BIT_DEPTH == 12 the word sums can overflow, so results are widened
; to dwords before accumulation.
%macro SATD_8x4_SSE 8-9
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; taking absolute values before the final pass is a slight win
    ABSW2       m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2       m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
%if (BIT_DEPTH == 12)
    pxor        m%6, m%6
    punpcklwd   m%7, m%2, m%6
    punpckhwd   m%2, m%6
    paddd       m%8, m%7
    paddd       m%8, m%2
%else
    paddw       m%8, m%2
%endif
%else
    SWAP %8, %2
%if (BIT_DEPTH == 12)
    pxor        m%6, m%6
    punpcklwd   m%7, m%8, m%6
    punpckhwd   m%8, m%6
    paddd       m%8, m%7
%endif
%endif
%if %1
%if (BIT_DEPTH == 12)
    pxor        m%6, m%6
    punpcklwd   m%7, m%4, m%6
    punpckhwd   m%4, m%6
    paddd       m%8, m%7
    paddd       m%8, m%4
%else
    paddw       m%8, m%4
%endif
%else
    HADAMARD 1, max, %3, %5, %6, %7
%if (BIT_DEPTH == 12)
    pxor        m%6, m%6
    punpcklwd   m%7, m%3, m%6
    punpckhwd   m%3, m%6
    paddd       m%8, m%7
    paddd       m%8, m%3
%else
    paddw       m%8, m%3
%endif
%endif
%endmacro
307 | |
; Like SATD_8x4_SSE, but always widens to a dword accumulator (m%8),
; using m%9/m%10 as widening scratch; used when word sums could overflow.
%macro SATD_8x4_1_SSE 10
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; taking absolute values before the final pass is a slight win
    ABSW2       m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2       m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif

    pxor        m%10, m%10
    punpcklwd   m%9, m%2, m%10
    paddd       m%8, m%9
    punpckhwd   m%9, m%2, m%10
    paddd       m%8, m%9

%if %1
    pxor        m%10, m%10
    punpcklwd   m%9, m%4, m%10
    paddd       m%8, m%9
    punpckhwd   m%9, m%4, m%10
    paddd       m%8, m%9
%else
    HADAMARD 1, max, %3, %5, %6, %7
    pxor        m%10, m%10
    punpcklwd   m%9, m%3, m%10
    paddd       m%8, m%9
    punpckhwd   m%9, m%3, m%10
    paddd       m%8, m%9
%endif
%endmacro
340 | |
; Common MMX SATD prologue: scale strides for pixel size and precompute
; the 3*stride offsets used by the row loads.
%macro SATD_START_MMX 0
    FIX_STRIDES r1, r3
    lea         r4, [3*r1]              ; r4 = 3*stride1
    lea         r5, [3*r3]              ; r5 = 3*stride2
%endmacro
346 | |
; Common MMX SATD epilogue: horizontally reduce m0 into eax and return.
%macro SATD_END_MMX 0
%if HIGH_BIT_DEPTH
    HADDUW      m0, m1
    movd        eax, m0
%else ; !HIGH_BIT_DEPTH
    pshufw      m1, m0, q1032
    paddw       m0, m1
    pshufw      m1, m0, q2301
    paddw       m0, m1
    movd        eax, m0
    and         eax, 0xffff
%endif ; HIGH_BIT_DEPTH
    EMMS
    RET
%endmacro
362 | |
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.

;-----------------------------------------------------------------------------
; int pixel_satd_4x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_satd_4x4, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 0
    SATD_END_MMX
374 | |
; Common SSE2 SATD prologue.
; %1 = accumulator to clear, %2 = multiplier/zero register,
; %3 = nonzero -> just zero %2 in the high-bit-depth path.
%macro SATD_START_SSE2 2-3 0
    FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH && %3
    pxor        %2, %2
%elif cpuflag(ssse3) && notcpuflag(atom)
%if mmsize==32
    mova        %2, [hmul_16p]
%else
    mova        %2, [hmul_8p]
%endif
%endif
    lea         r4, [3*r1]
    lea         r5, [3*r3]
    pxor        %1, %1
%endmacro
390 | |
; Common SSE2 SATD epilogue: horizontally reduce accumulator %1 into eax.
; %2 (optional) = extra dword accumulator folded in after the reduction.
%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
%if BIT_DEPTH == 12
    HADDD       %1, xm0                 ; 12-bit: accumulator holds dwords
%else
    HADDUW      %1, xm0                 ; <=10-bit: word accumulator is safe
%endif
%if %0 == 2
    paddd       %1, %2
%endif
%else
    HADDW       %1, xm7
%endif
    movd        eax, %1
    RET
%endmacro
407 | |
; Drain the word accumulator %1 into the dword accumulator %3 (and clear it)
; to prevent overflow; only needed at high bit depth.
%macro SATD_ACCUM 3
%if HIGH_BIT_DEPTH
    HADDUW      %1, %2
    paddd       %3, %1
    pxor        %1, %1
%endif
%endmacro
415 | |
; Stash the pixel pointers (r0/r2) in r6/r7 on x86-64 so a wide block can
; be processed as two half-width passes. On Win64, r7 is callee-saved and
; must be pushed first.
%macro BACKUP_POINTERS 0
%if ARCH_X86_64
%if WIN64
    PUSH        r7
%endif
    mov         r6, r0
    mov         r7, r2
%endif
%endmacro
425 | |
; Restore the saved pixel pointers advanced 8 pixels to the right
; (second half of a 16-wide block). On x86-32 there were no spare
; registers, so reload the pointers from the stack arguments instead.
%macro RESTORE_AND_INC_POINTERS 0
%if ARCH_X86_64
    lea         r0, [r6+8*SIZEOF_PIXEL]
    lea         r2, [r7+8*SIZEOF_PIXEL]
%if WIN64
    POP         r7
%endif
%else
    mov         r0, r0mp
    mov         r2, r2mp
    add         r0, 8*SIZEOF_PIXEL
    add         r2, 8*SIZEOF_PIXEL
%endif
%endmacro
440 | |
; SATD of a 4x8 block, packed as two 4x4 halves per xmm register.
; %1 = vertical flag (passed through to SATD_8x4*_SSE)
; %2 = 1 -> load hmul_4p locally (m7 not preloaded with it)
; %3 = accumulation mode ('swap' or add), %4 = optional dword accumulator
%macro SATD_4x8_SSE 3-4
%if HIGH_BIT_DEPTH
    ; 16-bit pixels: load rows directly and take word differences
    movh        m0, [r0+0*r1]
    movh        m4, [r2+0*r3]
    movh        m1, [r0+1*r1]
    movh        m5, [r2+1*r3]
    movhps      m0, [r0+4*r1]
    movhps      m4, [r2+4*r3]
    movh        m2, [r0+2*r1]
    movh        m6, [r2+2*r3]
    psubw       m0, m4
    movh        m3, [r0+r4]
    movh        m4, [r2+r5]
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    movhps      m1, [r0+1*r1]
    movhps      m5, [r2+1*r3]
    movhps      m2, [r0+2*r1]
    movhps      m6, [r2+2*r3]
    psubw       m1, m5
    movhps      m3, [r0+r4]
    movhps      m4, [r2+r5]
    psubw       m2, m6
    psubw       m3, m4
%else ; !HIGH_BIT_DEPTH
    ; 8-bit pixels: duplicate 4-pixel rows via JDUP, then DIFFOP
    movd        m4, [r2]
    movd        m5, [r2+r3]
    movd        m6, [r2+2*r3]
    add         r2, r5
    movd        m0, [r0]
    movd        m1, [r0+r1]
    movd        m2, [r0+2*r1]
    add         r0, r4
    movd        m3, [r2+r3]
    JDUP        m4, m3
    movd        m3, [r0+r1]
    JDUP        m0, m3
    movd        m3, [r2+2*r3]
    JDUP        m5, m3
    movd        m3, [r0+2*r1]
    JDUP        m1, m3
%if %1==0 && %2==1
    mova        m3, [hmul_4p]
    DIFFOP 0, 4, 1, 5, 3
%else
    DIFFOP 0, 4, 1, 5, 7
%endif
    movd        m5, [r2]
    add         r2, r5
    movd        m3, [r0]
    add         r0, r4
    movd        m4, [r2]
    JDUP        m6, m4
    movd        m4, [r0]
    JDUP        m2, m4
    movd        m4, [r2+r3]
    JDUP        m5, m4
    movd        m4, [r0+r1]
    JDUP        m3, m4
%if %1==0 && %2==1
    mova        m4, [hmul_4p]
    DIFFOP 2, 6, 3, 5, 4
%else
    DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
%if %0 == 4
    SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
%else
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endif
%endmacro
513 | |
514 ;----------------------------------------------------------------------------- | |
515 ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) | |
516 ;----------------------------------------------------------------------------- | |
517 %macro SATDS_SSE2 0 | |
518 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) | |
519 | |
520 %if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH) | |
521 cglobal pixel_satd_4x4, 4, 6, 6 | |
522 SATD_START_MMX | |
523 mova m4, [hmul_4p] | |
524 LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] | |
525 LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] | |
526 LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] | |
527 LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] | |
528 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 | |
529 HADAMARD 0, sumsub, 0, 1, 2, 3 | |
530 HADAMARD 4, sumsub, 0, 1, 2, 3 | |
531 HADAMARD 1, amax, 0, 1, 2, 3 | |
532 HADDW m0, m1 | |
533 movd eax, m0 | |
534 RET | |
535 %endif | |
536 | |
537 cglobal pixel_satd_4x8, 4, 6, 8 | |
538 SATD_START_MMX | |
539 %if vertical==0 | |
540 mova m7, [hmul_4p] | |
541 %endif | |
542 SATD_4x8_SSE vertical, 0, swap | |
543 %if BIT_DEPTH == 12 | |
544 HADDD m7, m1 | |
545 %else | |
546 HADDUW m7, m1 | |
547 %endif | |
548 movd eax, m7 | |
549 RET | |
550 | |
551 cglobal pixel_satd_4x16, 4, 6, 8 | |
552 SATD_START_MMX | |
553 %if vertical==0 | |
554 mova m7, [hmul_4p] | |
555 %endif | |
556 SATD_4x8_SSE vertical, 0, swap | |
557 lea r0, [r0+r1*2*SIZEOF_PIXEL] | |
558 lea r2, [r2+r3*2*SIZEOF_PIXEL] | |
559 SATD_4x8_SSE vertical, 1, add | |
560 %if BIT_DEPTH == 12 | |
561 HADDD m7, m1 | |
562 %else | |
563 HADDUW m7, m1 | |
564 %endif | |
565 movd eax, m7 | |
566 RET | |
567 | |
568 cglobal pixel_satd_8x8_internal | |
569 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 | |
570 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 | |
571 %%pixel_satd_8x4_internal: | |
572 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 | |
573 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 | |
574 ret | |
575 | |
576 cglobal pixel_satd_8x8_internal2 | |
577 %if WIN64 | |
578 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 | |
579 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13 | |
580 %%pixel_satd_8x4_internal2: | |
581 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 | |
582 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13 | |
583 %else | |
584 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 | |
585 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5 | |
586 %%pixel_satd_8x4_internal2: | |
587 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 | |
588 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5 | |
589 %endif | |
590 ret | |
591 | |
592 ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) | |
593 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) | |
594 %if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx) | |
595 | |
596 cglobal pixel_satd_16x4_internal2 | |
597 LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 | |
598 lea r2, [r2+4*r3] | |
599 lea r0, [r0+4*r1] | |
600 SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13 | |
601 SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13 | |
602 ret | |
603 | |
604 cglobal pixel_satd_16x4, 4,6,14 | |
605 SATD_START_SSE2 m10, m7 | |
606 %if vertical | |
607 mova m7, [pw_00ff] | |
608 %endif | |
609 call pixel_satd_16x4_internal2 | |
610 HADDD m10, m0 | |
611 movd eax, m10 | |
612 RET | |
613 | |
614 cglobal pixel_satd_16x8, 4,6,14 | |
615 SATD_START_SSE2 m10, m7 | |
616 %if vertical | |
617 mova m7, [pw_00ff] | |
618 %endif | |
619 jmp %%pixel_satd_16x8_internal | |
620 | |
621 cglobal pixel_satd_16x12, 4,6,14 | |
622 SATD_START_SSE2 m10, m7 | |
623 %if vertical | |
624 mova m7, [pw_00ff] | |
625 %endif | |
626 call pixel_satd_16x4_internal2 | |
627 jmp %%pixel_satd_16x8_internal | |
628 | |
629 cglobal pixel_satd_16x32, 4,6,14 | |
630 SATD_START_SSE2 m10, m7 | |
631 %if vertical | |
632 mova m7, [pw_00ff] | |
633 %endif | |
634 call pixel_satd_16x4_internal2 | |
635 call pixel_satd_16x4_internal2 | |
636 call pixel_satd_16x4_internal2 | |
637 call pixel_satd_16x4_internal2 | |
638 call pixel_satd_16x4_internal2 | |
639 call pixel_satd_16x4_internal2 | |
640 jmp %%pixel_satd_16x8_internal | |
641 | |
642 cglobal pixel_satd_16x64, 4,6,14 | |
643 SATD_START_SSE2 m10, m7 | |
644 %if vertical | |
645 mova m7, [pw_00ff] | |
646 %endif | |
647 call pixel_satd_16x4_internal2 | |
648 call pixel_satd_16x4_internal2 | |
649 call pixel_satd_16x4_internal2 | |
650 call pixel_satd_16x4_internal2 | |
651 call pixel_satd_16x4_internal2 | |
652 call pixel_satd_16x4_internal2 | |
653 call pixel_satd_16x4_internal2 | |
654 call pixel_satd_16x4_internal2 | |
655 call pixel_satd_16x4_internal2 | |
656 call pixel_satd_16x4_internal2 | |
657 call pixel_satd_16x4_internal2 | |
658 call pixel_satd_16x4_internal2 | |
659 call pixel_satd_16x4_internal2 | |
660 call pixel_satd_16x4_internal2 | |
661 jmp %%pixel_satd_16x8_internal | |
662 | |
663 cglobal pixel_satd_16x16, 4,6,14 | |
664 SATD_START_SSE2 m10, m7 | |
665 %if vertical | |
666 mova m7, [pw_00ff] | |
667 %endif | |
668 call pixel_satd_16x4_internal2 | |
669 call pixel_satd_16x4_internal2 | |
670 %%pixel_satd_16x8_internal: | |
671 call pixel_satd_16x4_internal2 | |
672 call pixel_satd_16x4_internal2 | |
673 HADDD m10, m0 | |
674 movd eax, m10 | |
675 RET | |
676 | |
677 cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
678 SATD_START_SSE2 m10, m7 | |
679 mov r6, r0 | |
680 mov r7, r2 | |
681 %if vertical | |
682 mova m7, [pw_00ff] | |
683 %endif | |
684 call pixel_satd_16x4_internal2 | |
685 call pixel_satd_16x4_internal2 | |
686 lea r0, [r6 + 16] | |
687 lea r2, [r7 + 16] | |
688 call pixel_satd_16x4_internal2 | |
689 call pixel_satd_16x4_internal2 | |
690 HADDD m10, m0 | |
691 movd eax, m10 | |
692 RET | |
693 | |
694 cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
695 SATD_START_SSE2 m10, m7 | |
696 mov r6, r0 | |
697 mov r7, r2 | |
698 %if vertical | |
699 mova m7, [pw_00ff] | |
700 %endif | |
701 call pixel_satd_16x4_internal2 | |
702 call pixel_satd_16x4_internal2 | |
703 call pixel_satd_16x4_internal2 | |
704 call pixel_satd_16x4_internal2 | |
705 lea r0, [r6 + 16] | |
706 lea r2, [r7 + 16] | |
707 call pixel_satd_16x4_internal2 | |
708 call pixel_satd_16x4_internal2 | |
709 call pixel_satd_16x4_internal2 | |
710 call pixel_satd_16x4_internal2 | |
711 HADDD m10, m0 | |
712 movd eax, m10 | |
713 RET | |
714 | |
715 cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
716 SATD_START_SSE2 m10, m7 | |
717 mov r6, r0 | |
718 mov r7, r2 | |
719 %if vertical | |
720 mova m7, [pw_00ff] | |
721 %endif | |
722 call pixel_satd_16x4_internal2 | |
723 call pixel_satd_16x4_internal2 | |
724 call pixel_satd_16x4_internal2 | |
725 call pixel_satd_16x4_internal2 | |
726 call pixel_satd_16x4_internal2 | |
727 call pixel_satd_16x4_internal2 | |
728 lea r0, [r6 + 16] | |
729 lea r2, [r7 + 16] | |
730 call pixel_satd_16x4_internal2 | |
731 call pixel_satd_16x4_internal2 | |
732 call pixel_satd_16x4_internal2 | |
733 call pixel_satd_16x4_internal2 | |
734 call pixel_satd_16x4_internal2 | |
735 call pixel_satd_16x4_internal2 | |
736 HADDD m10, m0 | |
737 movd eax, m10 | |
738 RET | |
739 | |
740 cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
741 SATD_START_SSE2 m10, m7 | |
742 mov r6, r0 | |
743 mov r7, r2 | |
744 %if vertical | |
745 mova m7, [pw_00ff] | |
746 %endif | |
747 call pixel_satd_16x4_internal2 | |
748 call pixel_satd_16x4_internal2 | |
749 call pixel_satd_16x4_internal2 | |
750 call pixel_satd_16x4_internal2 | |
751 call pixel_satd_16x4_internal2 | |
752 call pixel_satd_16x4_internal2 | |
753 call pixel_satd_16x4_internal2 | |
754 call pixel_satd_16x4_internal2 | |
755 lea r0, [r6 + 16] | |
756 lea r2, [r7 + 16] | |
757 call pixel_satd_16x4_internal2 | |
758 call pixel_satd_16x4_internal2 | |
759 call pixel_satd_16x4_internal2 | |
760 call pixel_satd_16x4_internal2 | |
761 call pixel_satd_16x4_internal2 | |
762 call pixel_satd_16x4_internal2 | |
763 call pixel_satd_16x4_internal2 | |
764 call pixel_satd_16x4_internal2 | |
765 HADDD m10, m0 | |
766 movd eax, m10 | |
767 RET | |
768 | |
769 cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
770 SATD_START_SSE2 m10, m7 | |
771 mov r6, r0 | |
772 mov r7, r2 | |
773 %if vertical | |
774 mova m7, [pw_00ff] | |
775 %endif | |
776 call pixel_satd_16x4_internal2 | |
777 call pixel_satd_16x4_internal2 | |
778 call pixel_satd_16x4_internal2 | |
779 call pixel_satd_16x4_internal2 | |
780 call pixel_satd_16x4_internal2 | |
781 call pixel_satd_16x4_internal2 | |
782 call pixel_satd_16x4_internal2 | |
783 call pixel_satd_16x4_internal2 | |
784 call pixel_satd_16x4_internal2 | |
785 call pixel_satd_16x4_internal2 | |
786 call pixel_satd_16x4_internal2 | |
787 call pixel_satd_16x4_internal2 | |
788 call pixel_satd_16x4_internal2 | |
789 call pixel_satd_16x4_internal2 | |
790 call pixel_satd_16x4_internal2 | |
791 call pixel_satd_16x4_internal2 | |
792 lea r0, [r6 + 16] | |
793 lea r2, [r7 + 16] | |
794 call pixel_satd_16x4_internal2 | |
795 call pixel_satd_16x4_internal2 | |
796 call pixel_satd_16x4_internal2 | |
797 call pixel_satd_16x4_internal2 | |
798 call pixel_satd_16x4_internal2 | |
799 call pixel_satd_16x4_internal2 | |
800 call pixel_satd_16x4_internal2 | |
801 call pixel_satd_16x4_internal2 | |
802 call pixel_satd_16x4_internal2 | |
803 call pixel_satd_16x4_internal2 | |
804 call pixel_satd_16x4_internal2 | |
805 call pixel_satd_16x4_internal2 | |
806 call pixel_satd_16x4_internal2 | |
807 call pixel_satd_16x4_internal2 | |
808 call pixel_satd_16x4_internal2 | |
809 call pixel_satd_16x4_internal2 | |
810 HADDD m10, m0 | |
811 movd eax, m10 | |
812 RET | |
813 | |
814 cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
815 SATD_START_SSE2 m10, m7 | |
816 mov r6, r0 | |
817 mov r7, r2 | |
818 %if vertical | |
819 mova m7, [pw_00ff] | |
820 %endif | |
821 call pixel_satd_16x4_internal2 | |
822 call pixel_satd_16x4_internal2 | |
823 call pixel_satd_16x4_internal2 | |
824 call pixel_satd_16x4_internal2 | |
825 call pixel_satd_16x4_internal2 | |
826 call pixel_satd_16x4_internal2 | |
827 call pixel_satd_16x4_internal2 | |
828 call pixel_satd_16x4_internal2 | |
829 call pixel_satd_16x4_internal2 | |
830 call pixel_satd_16x4_internal2 | |
831 call pixel_satd_16x4_internal2 | |
832 call pixel_satd_16x4_internal2 | |
833 call pixel_satd_16x4_internal2 | |
834 call pixel_satd_16x4_internal2 | |
835 call pixel_satd_16x4_internal2 | |
836 call pixel_satd_16x4_internal2 | |
837 lea r0, [r6 + 16] | |
838 lea r2, [r7 + 16] | |
839 call pixel_satd_16x4_internal2 | |
840 call pixel_satd_16x4_internal2 | |
841 call pixel_satd_16x4_internal2 | |
842 call pixel_satd_16x4_internal2 | |
843 call pixel_satd_16x4_internal2 | |
844 call pixel_satd_16x4_internal2 | |
845 call pixel_satd_16x4_internal2 | |
846 call pixel_satd_16x4_internal2 | |
847 call pixel_satd_16x4_internal2 | |
848 call pixel_satd_16x4_internal2 | |
849 call pixel_satd_16x4_internal2 | |
850 call pixel_satd_16x4_internal2 | |
851 call pixel_satd_16x4_internal2 | |
852 call pixel_satd_16x4_internal2 | |
853 call pixel_satd_16x4_internal2 | |
854 call pixel_satd_16x4_internal2 | |
855 lea r0, [r6 + 32] | |
856 lea r2, [r7 + 32] | |
857 call pixel_satd_16x4_internal2 | |
858 call pixel_satd_16x4_internal2 | |
859 call pixel_satd_16x4_internal2 | |
860 call pixel_satd_16x4_internal2 | |
861 call pixel_satd_16x4_internal2 | |
862 call pixel_satd_16x4_internal2 | |
863 call pixel_satd_16x4_internal2 | |
864 call pixel_satd_16x4_internal2 | |
865 call pixel_satd_16x4_internal2 | |
866 call pixel_satd_16x4_internal2 | |
867 call pixel_satd_16x4_internal2 | |
868 call pixel_satd_16x4_internal2 | |
869 call pixel_satd_16x4_internal2 | |
870 call pixel_satd_16x4_internal2 | |
871 call pixel_satd_16x4_internal2 | |
872 call pixel_satd_16x4_internal2 | |
873 HADDD m10, m0 | |
874 movd eax, m10 | |
875 RET | |
876 | |
877 cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
878 SATD_START_SSE2 m10, m7 | |
879 mov r6, r0 | |
880 mov r7, r2 | |
881 %if vertical | |
882 mova m7, [pw_00ff] | |
883 %endif | |
884 call pixel_satd_16x4_internal2 | |
885 call pixel_satd_16x4_internal2 | |
886 call pixel_satd_16x4_internal2 | |
887 call pixel_satd_16x4_internal2 | |
888 lea r0, [r6 + 16] | |
889 lea r2, [r7 + 16] | |
890 call pixel_satd_16x4_internal2 | |
891 call pixel_satd_16x4_internal2 | |
892 call pixel_satd_16x4_internal2 | |
893 call pixel_satd_16x4_internal2 | |
894 lea r0, [r6 + 32] | |
895 lea r2, [r7 + 32] | |
896 call pixel_satd_16x4_internal2 | |
897 call pixel_satd_16x4_internal2 | |
898 call pixel_satd_16x4_internal2 | |
899 call pixel_satd_16x4_internal2 | |
900 lea r0, [r6 + 48] | |
901 lea r2, [r7 + 48] | |
902 call pixel_satd_16x4_internal2 | |
903 call pixel_satd_16x4_internal2 | |
904 call pixel_satd_16x4_internal2 | |
905 call pixel_satd_16x4_internal2 | |
906 HADDD m10, m0 | |
907 movd eax, m10 | |
908 RET | |
909 | |
910 cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
911 SATD_START_SSE2 m10, m7 | |
912 mov r6, r0 | |
913 mov r7, r2 | |
914 %if vertical | |
915 mova m7, [pw_00ff] | |
916 %endif | |
917 call pixel_satd_16x4_internal2 | |
918 call pixel_satd_16x4_internal2 | |
919 call pixel_satd_16x4_internal2 | |
920 call pixel_satd_16x4_internal2 | |
921 call pixel_satd_16x4_internal2 | |
922 call pixel_satd_16x4_internal2 | |
923 call pixel_satd_16x4_internal2 | |
924 call pixel_satd_16x4_internal2 | |
925 lea r0, [r6 + 16] | |
926 lea r2, [r7 + 16] | |
927 call pixel_satd_16x4_internal2 | |
928 call pixel_satd_16x4_internal2 | |
929 call pixel_satd_16x4_internal2 | |
930 call pixel_satd_16x4_internal2 | |
931 call pixel_satd_16x4_internal2 | |
932 call pixel_satd_16x4_internal2 | |
933 call pixel_satd_16x4_internal2 | |
934 call pixel_satd_16x4_internal2 | |
935 lea r0, [r6 + 32] | |
936 lea r2, [r7 + 32] | |
937 call pixel_satd_16x4_internal2 | |
938 call pixel_satd_16x4_internal2 | |
939 call pixel_satd_16x4_internal2 | |
940 call pixel_satd_16x4_internal2 | |
941 call pixel_satd_16x4_internal2 | |
942 call pixel_satd_16x4_internal2 | |
943 call pixel_satd_16x4_internal2 | |
944 call pixel_satd_16x4_internal2 | |
945 lea r0, [r6 + 48] | |
946 lea r2, [r7 + 48] | |
947 call pixel_satd_16x4_internal2 | |
948 call pixel_satd_16x4_internal2 | |
949 call pixel_satd_16x4_internal2 | |
950 call pixel_satd_16x4_internal2 | |
951 call pixel_satd_16x4_internal2 | |
952 call pixel_satd_16x4_internal2 | |
953 call pixel_satd_16x4_internal2 | |
954 call pixel_satd_16x4_internal2 | |
955 | |
956 HADDD m10, m0 | |
957 movd eax, m10 | |
958 RET | |
959 | |
960 cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
961 SATD_START_SSE2 m10, m7 | |
962 mov r6, r0 | |
963 mov r7, r2 | |
964 %if vertical | |
965 mova m7, [pw_00ff] | |
966 %endif | |
967 call pixel_satd_16x4_internal2 | |
968 call pixel_satd_16x4_internal2 | |
969 call pixel_satd_16x4_internal2 | |
970 call pixel_satd_16x4_internal2 | |
971 call pixel_satd_16x4_internal2 | |
972 call pixel_satd_16x4_internal2 | |
973 call pixel_satd_16x4_internal2 | |
974 call pixel_satd_16x4_internal2 | |
975 call pixel_satd_16x4_internal2 | |
976 call pixel_satd_16x4_internal2 | |
977 call pixel_satd_16x4_internal2 | |
978 call pixel_satd_16x4_internal2 | |
979 lea r0, [r6 + 16] | |
980 lea r2, [r7 + 16] | |
981 call pixel_satd_16x4_internal2 | |
982 call pixel_satd_16x4_internal2 | |
983 call pixel_satd_16x4_internal2 | |
984 call pixel_satd_16x4_internal2 | |
985 call pixel_satd_16x4_internal2 | |
986 call pixel_satd_16x4_internal2 | |
987 call pixel_satd_16x4_internal2 | |
988 call pixel_satd_16x4_internal2 | |
989 call pixel_satd_16x4_internal2 | |
990 call pixel_satd_16x4_internal2 | |
991 call pixel_satd_16x4_internal2 | |
992 call pixel_satd_16x4_internal2 | |
993 lea r0, [r6 + 32] | |
994 lea r2, [r7 + 32] | |
995 call pixel_satd_16x4_internal2 | |
996 call pixel_satd_16x4_internal2 | |
997 call pixel_satd_16x4_internal2 | |
998 call pixel_satd_16x4_internal2 | |
999 call pixel_satd_16x4_internal2 | |
1000 call pixel_satd_16x4_internal2 | |
1001 call pixel_satd_16x4_internal2 | |
1002 call pixel_satd_16x4_internal2 | |
1003 call pixel_satd_16x4_internal2 | |
1004 call pixel_satd_16x4_internal2 | |
1005 call pixel_satd_16x4_internal2 | |
1006 call pixel_satd_16x4_internal2 | |
1007 lea r0, [r6 + 48] | |
1008 lea r2, [r7 + 48] | |
1009 call pixel_satd_16x4_internal2 | |
1010 call pixel_satd_16x4_internal2 | |
1011 call pixel_satd_16x4_internal2 | |
1012 call pixel_satd_16x4_internal2 | |
1013 call pixel_satd_16x4_internal2 | |
1014 call pixel_satd_16x4_internal2 | |
1015 call pixel_satd_16x4_internal2 | |
1016 call pixel_satd_16x4_internal2 | |
1017 call pixel_satd_16x4_internal2 | |
1018 call pixel_satd_16x4_internal2 | |
1019 call pixel_satd_16x4_internal2 | |
1020 call pixel_satd_16x4_internal2 | |
1021 | |
1022 HADDD m10, m0 | |
1023 movd eax, m10 | |
1024 RET | |
1025 | |
;-----------------------------------------------------------------------------
; int pixel_satd_64x64(pix1, stride1, pix2, stride2) -- SSE2 non-AVX WIN64 path
; Sums 16x4 sub-block SATDs, one 16-wide column at a time; r6/r7 keep the
; row-0 base pointers so each column restarts from the top.  Accumulates in
; m10 (set up by SATD_START_SSE2).
; NOTE(review): assumes pixel_satd_16x4_internal2 (defined outside this
; chunk) advances r0/r2 by 4 rows and folds its result into m10 -- confirm.
; NOTE(review): column offsets are raw bytes (16/32/48), not *SIZEOF_PIXEL;
; presumably this path is 8-bit only -- verify against the enclosing %if.
;-----------------------------------------------------------------------------
1026 cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx) | |
1027 SATD_START_SSE2 m10, m7 | |
1028 mov r6, r0 | |
1029 mov r7, r2 | |
1030 %if vertical | |
1031 mova m7, [pw_00ff] | |
1032 %endif | |
; column 0 (pixels 0..15), 16 x 4 rows = 64 rows
1033 call pixel_satd_16x4_internal2 | |
1034 call pixel_satd_16x4_internal2 | |
1035 call pixel_satd_16x4_internal2 | |
1036 call pixel_satd_16x4_internal2 | |
1037 call pixel_satd_16x4_internal2 | |
1038 call pixel_satd_16x4_internal2 | |
1039 call pixel_satd_16x4_internal2 | |
1040 call pixel_satd_16x4_internal2 | |
1041 call pixel_satd_16x4_internal2 | |
1042 call pixel_satd_16x4_internal2 | |
1043 call pixel_satd_16x4_internal2 | |
1044 call pixel_satd_16x4_internal2 | |
1045 call pixel_satd_16x4_internal2 | |
1046 call pixel_satd_16x4_internal2 | |
1047 call pixel_satd_16x4_internal2 | |
1048 call pixel_satd_16x4_internal2 | |
; column 1 (pixels 16..31): restart from row 0
1049 lea r0, [r6 + 16] | |
1050 lea r2, [r7 + 16] | |
1051 call pixel_satd_16x4_internal2 | |
1052 call pixel_satd_16x4_internal2 | |
1053 call pixel_satd_16x4_internal2 | |
1054 call pixel_satd_16x4_internal2 | |
1055 call pixel_satd_16x4_internal2 | |
1056 call pixel_satd_16x4_internal2 | |
1057 call pixel_satd_16x4_internal2 | |
1058 call pixel_satd_16x4_internal2 | |
1059 call pixel_satd_16x4_internal2 | |
1060 call pixel_satd_16x4_internal2 | |
1061 call pixel_satd_16x4_internal2 | |
1062 call pixel_satd_16x4_internal2 | |
; column 2 (pixels 32..47): restart from row 0
1067 lea r0, [r6 + 32] | |
1068 lea r2, [r7 + 32] | |
1069 call pixel_satd_16x4_internal2 | |
1070 call pixel_satd_16x4_internal2 | |
1071 call pixel_satd_16x4_internal2 | |
1072 call pixel_satd_16x4_internal2 | |
1073 call pixel_satd_16x4_internal2 | |
1074 call pixel_satd_16x4_internal2 | |
1075 call pixel_satd_16x4_internal2 | |
1076 call pixel_satd_16x4_internal2 | |
1077 call pixel_satd_16x4_internal2 | |
1078 call pixel_satd_16x4_internal2 | |
1079 call pixel_satd_16x4_internal2 | |
1080 call pixel_satd_16x4_internal2 | |
1081 call pixel_satd_16x4_internal2 | |
1082 call pixel_satd_16x4_internal2 | |
1083 call pixel_satd_16x4_internal2 | |
1084 call pixel_satd_16x4_internal2 | |
; column 3 (pixels 48..63): restart from row 0
1085 lea r0, [r6 + 48] | |
1086 lea r2, [r7 + 48] | |
1087 call pixel_satd_16x4_internal2 | |
1088 call pixel_satd_16x4_internal2 | |
1089 call pixel_satd_16x4_internal2 | |
1090 call pixel_satd_16x4_internal2 | |
1091 call pixel_satd_16x4_internal2 | |
1092 call pixel_satd_16x4_internal2 | |
1093 call pixel_satd_16x4_internal2 | |
1094 call pixel_satd_16x4_internal2 | |
1095 call pixel_satd_16x4_internal2 | |
1096 call pixel_satd_16x4_internal2 | |
1097 call pixel_satd_16x4_internal2 | |
1098 call pixel_satd_16x4_internal2 | |
1099 call pixel_satd_16x4_internal2 | |
1100 call pixel_satd_16x4_internal2 | |
1101 call pixel_satd_16x4_internal2 | |
1102 call pixel_satd_16x4_internal2 | |
1103 | |
; horizontal add of the dword partial sums in m10; result -> eax
1104 HADDD m10, m0 | |
1105 movd eax, m10 | |
1106 RET | |
1107 | |
1108 %else | |
1109 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_16x24 -- WIN64 path (8 GPRs): r6/r7 hold the row-0 base
; pointers so the second 8-wide column restarts from the top after 24 rows.
; NOTE(review): assumes pixel_satd_8x8_internal2 (defined outside this
; chunk) advances r0/r2 by 8 rows and accumulates into m6 -- confirm.
;-----------------------------------------------------------------------------
1110 cglobal pixel_satd_16x24, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1111 SATD_START_SSE2 m6, m7 | |
1112 mov r6, r0 | |
1113 mov r7, r2 | |
1114 call pixel_satd_8x8_internal2 | |
1115 call pixel_satd_8x8_internal2 | |
1116 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15): restart from row 0
1117 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1118 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1119 call pixel_satd_8x8_internal2 | |
1120 call pixel_satd_8x8_internal2 | |
1121 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1122 HADDD m6, m0 | |
1123 movd eax, m6 | |
1124 RET | |
1125 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_16x24 -- non-WIN64 path: only 7 GPRs, so the ref base
; pointer is spilled to the stack slot reserved by "0-gprsize" and
; reloaded + re-offset at the column switch.
; NOTE(review): assumes pixel_satd_8x8_internal2 advances r0/r2 by 8 rows
; and accumulates into m6 -- confirm against its definition.
;-----------------------------------------------------------------------------
1126 cglobal pixel_satd_16x24, 4,7,8,0-gprsize ;if !WIN64 | |
1127 SATD_START_SSE2 m6, m7 | |
1128 mov r6, r0 | |
1129 mov [rsp], r2 | |
1130 call pixel_satd_8x8_internal2 | |
1131 call pixel_satd_8x8_internal2 | |
1132 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15): restart from row 0
1133 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1134 mov r2, [rsp] | |
1135 add r2, 8*SIZEOF_PIXEL | |
1136 call pixel_satd_8x8_internal2 | |
1137 call pixel_satd_8x8_internal2 | |
1138 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1139 HADDD m6, m0 | |
1140 movd eax, m6 | |
1141 RET | |
1142 %endif | |
1143 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x48 -- WIN64 path: four 8-wide columns of six 8x8 SATDs;
; r6/r7 keep the row-0 base pointers for the per-column restarts.
; NOTE(review): assumes pixel_satd_8x8_internal2 advances r0/r2 by 8 rows
; and accumulates into m6 -- confirm against its definition.
;-----------------------------------------------------------------------------
1144 cglobal pixel_satd_32x48, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1145 SATD_START_SSE2 m6, m7 | |
1146 mov r6, r0 | |
1147 mov r7, r2 | |
1148 call pixel_satd_8x8_internal2 | |
1149 call pixel_satd_8x8_internal2 | |
1150 call pixel_satd_8x8_internal2 | |
1151 call pixel_satd_8x8_internal2 | |
1152 call pixel_satd_8x8_internal2 | |
1153 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15)
1154 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1155 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1156 call pixel_satd_8x8_internal2 | |
1157 call pixel_satd_8x8_internal2 | |
1158 call pixel_satd_8x8_internal2 | |
1159 call pixel_satd_8x8_internal2 | |
1160 call pixel_satd_8x8_internal2 | |
1161 call pixel_satd_8x8_internal2 | |
; column 2 (pixels 16..23)
1162 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1163 lea r2, [r7 + 16*SIZEOF_PIXEL] | |
1164 call pixel_satd_8x8_internal2 | |
1165 call pixel_satd_8x8_internal2 | |
1166 call pixel_satd_8x8_internal2 | |
1167 call pixel_satd_8x8_internal2 | |
1168 call pixel_satd_8x8_internal2 | |
1169 call pixel_satd_8x8_internal2 | |
; column 3 (pixels 24..31)
1170 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1171 lea r2, [r7 + 24*SIZEOF_PIXEL] | |
1172 call pixel_satd_8x8_internal2 | |
1173 call pixel_satd_8x8_internal2 | |
1174 call pixel_satd_8x8_internal2 | |
1175 call pixel_satd_8x8_internal2 | |
1176 call pixel_satd_8x8_internal2 | |
1177 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1178 HADDD m6, m0 | |
1179 movd eax, m6 | |
1180 RET | |
1181 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x48 -- non-WIN64 path: ref base pointer lives in the
; gprsize stack slot and is reloaded + re-offset at every column switch.
; NOTE(review): assumes pixel_satd_8x8_internal2 advances r0/r2 by 8 rows
; and accumulates into m6 -- confirm against its definition.
;-----------------------------------------------------------------------------
1182 cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64 | |
1183 SATD_START_SSE2 m6, m7 | |
1184 mov r6, r0 | |
1185 mov [rsp], r2 | |
1186 call pixel_satd_8x8_internal2 | |
1187 call pixel_satd_8x8_internal2 | |
1188 call pixel_satd_8x8_internal2 | |
1189 call pixel_satd_8x8_internal2 | |
1190 call pixel_satd_8x8_internal2 | |
1191 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15)
1192 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1193 mov r2, [rsp] | |
1194 add r2, 8*SIZEOF_PIXEL | |
1195 call pixel_satd_8x8_internal2 | |
1196 call pixel_satd_8x8_internal2 | |
1197 call pixel_satd_8x8_internal2 | |
1198 call pixel_satd_8x8_internal2 | |
1199 call pixel_satd_8x8_internal2 | |
1200 call pixel_satd_8x8_internal2 | |
; column 2 (pixels 16..23)
1201 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1202 mov r2, [rsp] | |
1203 add r2, 16*SIZEOF_PIXEL | |
1204 call pixel_satd_8x8_internal2 | |
1205 call pixel_satd_8x8_internal2 | |
1206 call pixel_satd_8x8_internal2 | |
1207 call pixel_satd_8x8_internal2 | |
1208 call pixel_satd_8x8_internal2 | |
1209 call pixel_satd_8x8_internal2 | |
; column 3 (pixels 24..31)
1210 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1211 mov r2, [rsp] | |
1212 add r2, 24*SIZEOF_PIXEL | |
1213 call pixel_satd_8x8_internal2 | |
1214 call pixel_satd_8x8_internal2 | |
1215 call pixel_satd_8x8_internal2 | |
1216 call pixel_satd_8x8_internal2 | |
1217 call pixel_satd_8x8_internal2 | |
1218 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1219 HADDD m6, m0 | |
1220 movd eax, m6 | |
1221 RET | |
1222 %endif | |
1223 | |
1224 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_24x64 -- WIN64 path: three 8-wide columns of eight 8x8
; SATDs; r6/r7 keep the row-0 base pointers for the per-column restarts.
; NOTE(review): assumes pixel_satd_8x8_internal2 advances r0/r2 by 8 rows
; and accumulates into m6 -- confirm against its definition.
;-----------------------------------------------------------------------------
1225 cglobal pixel_satd_24x64, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1226 SATD_START_SSE2 m6, m7 | |
1227 mov r6, r0 | |
1228 mov r7, r2 | |
1229 call pixel_satd_8x8_internal2 | |
1230 call pixel_satd_8x8_internal2 | |
1231 call pixel_satd_8x8_internal2 | |
1232 call pixel_satd_8x8_internal2 | |
1233 call pixel_satd_8x8_internal2 | |
1234 call pixel_satd_8x8_internal2 | |
1235 call pixel_satd_8x8_internal2 | |
1236 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15)
1237 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1238 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1239 call pixel_satd_8x8_internal2 | |
1240 call pixel_satd_8x8_internal2 | |
1241 call pixel_satd_8x8_internal2 | |
1242 call pixel_satd_8x8_internal2 | |
1243 call pixel_satd_8x8_internal2 | |
1244 call pixel_satd_8x8_internal2 | |
1245 call pixel_satd_8x8_internal2 | |
1246 call pixel_satd_8x8_internal2 | |
; column 2 (pixels 16..23)
1247 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1248 lea r2, [r7 + 16*SIZEOF_PIXEL] | |
1249 call pixel_satd_8x8_internal2 | |
1250 call pixel_satd_8x8_internal2 | |
1251 call pixel_satd_8x8_internal2 | |
1252 call pixel_satd_8x8_internal2 | |
1253 call pixel_satd_8x8_internal2 | |
1254 call pixel_satd_8x8_internal2 | |
1255 call pixel_satd_8x8_internal2 | |
1256 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1257 HADDD m6, m0 | |
1258 movd eax, m6 | |
1259 RET | |
1260 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_24x64 -- non-WIN64 path: ref base pointer lives in the
; gprsize stack slot and is reloaded + re-offset at each column switch.
; NOTE(review): assumes pixel_satd_8x8_internal2 advances r0/r2 by 8 rows
; and accumulates into m6 -- confirm against its definition.
;-----------------------------------------------------------------------------
1261 cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64 | |
1262 SATD_START_SSE2 m6, m7 | |
1263 mov r6, r0 | |
1264 mov [rsp], r2 | |
1265 call pixel_satd_8x8_internal2 | |
1266 call pixel_satd_8x8_internal2 | |
1267 call pixel_satd_8x8_internal2 | |
1268 call pixel_satd_8x8_internal2 | |
1269 call pixel_satd_8x8_internal2 | |
1270 call pixel_satd_8x8_internal2 | |
1271 call pixel_satd_8x8_internal2 | |
1272 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15)
1273 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1274 mov r2, [rsp] | |
1275 add r2, 8*SIZEOF_PIXEL | |
1276 call pixel_satd_8x8_internal2 | |
1277 call pixel_satd_8x8_internal2 | |
1278 call pixel_satd_8x8_internal2 | |
1279 call pixel_satd_8x8_internal2 | |
1280 call pixel_satd_8x8_internal2 | |
1281 call pixel_satd_8x8_internal2 | |
1282 call pixel_satd_8x8_internal2 | |
1283 call pixel_satd_8x8_internal2 | |
; column 2 (pixels 16..23)
1284 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1285 mov r2, [rsp] | |
1286 add r2, 16*SIZEOF_PIXEL | |
1287 call pixel_satd_8x8_internal2 | |
1288 call pixel_satd_8x8_internal2 | |
1289 call pixel_satd_8x8_internal2 | |
1290 call pixel_satd_8x8_internal2 | |
1291 call pixel_satd_8x8_internal2 | |
1292 call pixel_satd_8x8_internal2 | |
1293 call pixel_satd_8x8_internal2 | |
1294 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1295 HADDD m6, m0 | |
1296 movd eax, m6 | |
1297 RET | |
1298 %endif | |
1299 | |
1300 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_8x64 -- WIN64 path: single 8-wide column, eight 8x8 SATDs.
; NOTE(review): r6/r7 are saved but never read in this block (no column
; switch for width 8) -- harmless, kept for symmetry with wider variants.
; NOTE(review): assumes pixel_satd_8x8_internal2 advances r0/r2 by 8 rows
; and accumulates into m6 -- confirm against its definition.
;-----------------------------------------------------------------------------
1301 cglobal pixel_satd_8x64, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1302 SATD_START_SSE2 m6, m7 | |
1303 mov r6, r0 | |
1304 mov r7, r2 | |
1305 call pixel_satd_8x8_internal2 | |
1306 call pixel_satd_8x8_internal2 | |
1307 call pixel_satd_8x8_internal2 | |
1308 call pixel_satd_8x8_internal2 | |
1309 call pixel_satd_8x8_internal2 | |
1310 call pixel_satd_8x8_internal2 | |
1311 call pixel_satd_8x8_internal2 | |
1312 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1313 HADDD m6, m0 | |
1314 movd eax, m6 | |
1315 RET | |
1316 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_8x64 -- non-WIN64 path: single 8-wide column.
; NOTE(review): r6 and the [rsp] spill are written but never read here (no
; column switch for width 8) -- harmless, kept for symmetry.
; NOTE(review): assumes pixel_satd_8x8_internal2 advances r0/r2 by 8 rows
; and accumulates into m6 -- confirm against its definition.
;-----------------------------------------------------------------------------
1317 cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64 | |
1318 SATD_START_SSE2 m6, m7 | |
1319 mov r6, r0 | |
1320 mov [rsp], r2 | |
1321 call pixel_satd_8x8_internal2 | |
1322 call pixel_satd_8x8_internal2 | |
1323 call pixel_satd_8x8_internal2 | |
1324 call pixel_satd_8x8_internal2 | |
1325 call pixel_satd_8x8_internal2 | |
1326 call pixel_satd_8x8_internal2 | |
1327 call pixel_satd_8x8_internal2 | |
1328 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1329 HADDD m6, m0 | |
1330 movd eax, m6 | |
1331 RET | |
1332 %endif | |
1333 | |
1334 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_8x12 -- WIN64 path: one 8x8 SATD plus one 8x4 SATD.
; The %%-prefixed call target implies this code lives inside a %macro
; expansion (macro-local label).
; NOTE(review): assumes the internal helpers accumulate into m6 -- confirm.
;-----------------------------------------------------------------------------
1335 cglobal pixel_satd_8x12, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1336 SATD_START_SSE2 m6, m7 | |
1337 mov r6, r0 | |
1338 mov r7, r2 | |
1339 call pixel_satd_8x8_internal2 | |
1340 call %%pixel_satd_8x4_internal2 | |
; manual horizontal reduction of the 4 dword partials in m6
; (equivalent to HADDD m6, m0): high qword + low qword, then dword 1 + 0
1341 pxor m7, m7 | |
1342 movhlps m7, m6 | |
1343 paddd m6, m7 | |
1344 pshufd m7, m6, 1 | |
1345 paddd m6, m7 | |
1346 movd eax, m6 | |
1347 RET | |
1348 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_8x12 -- non-WIN64 path: one 8x8 SATD plus one 8x4 SATD,
; reduced with HADDD (unlike the WIN64 twin's manual reduction).
; NOTE(review): r6 and the [rsp] spill are written but never read here.
;-----------------------------------------------------------------------------
1349 cglobal pixel_satd_8x12, 4,7,8,0-gprsize ;if !WIN64 | |
1350 SATD_START_SSE2 m6, m7 | |
1351 mov r6, r0 | |
1352 mov [rsp], r2 | |
1353 call pixel_satd_8x8_internal2 | |
1354 call %%pixel_satd_8x4_internal2 | |
1355 HADDD m6, m0 | |
1356 movd eax, m6 | |
1357 RET | |
1358 %endif | |
1359 | |
1360 %if HIGH_BIT_DEPTH | |
1361 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_12x32 -- HIGH_BIT_DEPTH, WIN64 path: three 4-wide strips
; of 32 rows, each built from four SATD_4x8_SSE invocations; accumulates
; dword sums in m7, r6/r7 hold the row-0 base pointers.
; NOTE(review): the SATD_4x8_SSE macro plus the r1*2/r3*2 lea advances must
; together cover 8 rows per step -- the macro presumably advances the
; pointers internally; confirm against its definition (outside this chunk).
;-----------------------------------------------------------------------------
1362 cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) | |
1363 SATD_START_MMX | |
1364 mov r6, r0 | |
1365 mov r7, r2 | |
1366 pxor m7, m7 | |
; strip 0 (pixels 0..3)
1367 SATD_4x8_SSE vertical, 0, 4, 5 | |
1368 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1369 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1370 SATD_4x8_SSE vertical, 1, 4, 5 | |
1371 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1372 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1373 SATD_4x8_SSE vertical, 1, 4, 5 | |
1374 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1375 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1376 SATD_4x8_SSE vertical, 1, 4, 5 | |
; strip 1 (pixels 4..7): restart from row 0
1377 lea r0, [r6 + 4*SIZEOF_PIXEL] | |
1378 lea r2, [r7 + 4*SIZEOF_PIXEL] | |
1379 SATD_4x8_SSE vertical, 1, 4, 5 | |
1380 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1381 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1382 SATD_4x8_SSE vertical, 1, 4, 5 | |
1383 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1384 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1385 SATD_4x8_SSE vertical, 1, 4, 5 | |
1386 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1387 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1388 SATD_4x8_SSE vertical, 1, 4, 5 | |
; strip 2 (pixels 8..11): restart from row 0
1389 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1390 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1391 SATD_4x8_SSE vertical, 1, 4, 5 | |
1392 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1393 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1394 SATD_4x8_SSE vertical, 1, 4, 5 | |
1395 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1396 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1397 SATD_4x8_SSE vertical, 1, 4, 5 | |
1398 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1399 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1400 SATD_4x8_SSE vertical, 1, 4, 5 | |
; horizontal add of dword partial sums in m7 -> eax
1401 HADDD m7, m0 | |
1402 movd eax, m7 | |
1403 RET | |
1404 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_12x32 -- HIGH_BIT_DEPTH, non-WIN64 path: same three-strip
; layout as the WIN64 twin, but the ref base pointer is kept in the
; gprsize stack slot and reloaded + re-offset per strip.
; NOTE(review): the SATD_4x8_SSE macro presumably advances r0/r2
; internally; confirm against its definition (outside this chunk).
;-----------------------------------------------------------------------------
1405 cglobal pixel_satd_12x32, 4,7,8,0-gprsize | |
1406 SATD_START_MMX | |
1407 mov r6, r0 | |
1408 mov [rsp], r2 | |
1409 pxor m7, m7 | |
; strip 0 (pixels 0..3)
1410 SATD_4x8_SSE vertical, 0, 4, 5 | |
1411 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1412 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1413 SATD_4x8_SSE vertical, 1, 4, 5 | |
1414 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1415 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1416 SATD_4x8_SSE vertical, 1, 4, 5 | |
1417 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1418 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1419 SATD_4x8_SSE vertical, 1, 4, 5 | |
; strip 1 (pixels 4..7): restart from row 0
1420 lea r0, [r6 + 4*SIZEOF_PIXEL] | |
1421 mov r2, [rsp] | |
1422 add r2, 4*SIZEOF_PIXEL | |
1423 SATD_4x8_SSE vertical, 1, 4, 5 | |
1424 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1425 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1426 SATD_4x8_SSE vertical, 1, 4, 5 | |
1427 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1428 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1429 SATD_4x8_SSE vertical, 1, 4, 5 | |
1430 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1431 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1432 SATD_4x8_SSE vertical, 1, 4, 5 | |
; strip 2 (pixels 8..11): restart from row 0
1433 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1434 mov r2, [rsp] | |
1435 add r2, 8*SIZEOF_PIXEL | |
1436 SATD_4x8_SSE vertical, 1, 4, 5 | |
1437 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1438 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1439 SATD_4x8_SSE vertical, 1, 4, 5 | |
1440 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1441 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1442 SATD_4x8_SSE vertical, 1, 4, 5 | |
1443 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1444 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1445 SATD_4x8_SSE vertical, 1, 4, 5 | |
; horizontal add of dword partial sums in m7 -> eax
1446 HADDD m7, m0 | |
1447 movd eax, m7 | |
1448 RET | |
1449 %endif | |
1450 %else ;HIGH_BIT_DEPTH | |
1451 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_12x32 -- 8bpp, WIN64 path: three 4-wide strips using the
; swap/add forms of SATD_4x8_SSE; hmul_4p is loaded for the horizontal
; (non-vertical) transform variant.
; NOTE(review): final reduction is HADDW, i.e. 16-bit lanes in m7 -- verify
; the word accumulator cannot overflow for a 12x32 block at 8bpp.
;-----------------------------------------------------------------------------
1452 cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) | |
1453 SATD_START_MMX | |
1454 mov r6, r0 | |
1455 mov r7, r2 | |
1456 %if vertical==0 | |
1457 mova m7, [hmul_4p] | |
1458 %endif | |
; strip 0 (pixels 0..3)
1459 SATD_4x8_SSE vertical, 0, swap | |
1460 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1461 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1462 SATD_4x8_SSE vertical, 1, add | |
1463 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1464 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1465 SATD_4x8_SSE vertical, 1, add | |
1466 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1467 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1468 SATD_4x8_SSE vertical, 1, add | |
; strip 1 (pixels 4..7): restart from row 0
1469 lea r0, [r6 + 4*SIZEOF_PIXEL] | |
1470 lea r2, [r7 + 4*SIZEOF_PIXEL] | |
1471 SATD_4x8_SSE vertical, 1, add | |
1472 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1473 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1474 SATD_4x8_SSE vertical, 1, add | |
1475 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1476 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1477 SATD_4x8_SSE vertical, 1, add | |
1478 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1479 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1480 SATD_4x8_SSE vertical, 1, add | |
; strip 2 (pixels 8..11): restart from row 0
1481 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1482 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1483 SATD_4x8_SSE vertical, 1, add | |
1484 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1485 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1486 SATD_4x8_SSE vertical, 1, add | |
1487 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1488 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1489 SATD_4x8_SSE vertical, 1, add | |
1490 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1491 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1492 SATD_4x8_SSE vertical, 1, add | |
; horizontal add of word partial sums in m7 -> eax
1493 HADDW m7, m1 | |
1494 movd eax, m7 | |
1495 RET | |
1496 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_12x32 -- 8bpp, non-WIN64 path: same strip layout as the
; WIN64 twin; ref base pointer kept in the gprsize stack slot.
; NOTE(review): final reduction is HADDW (16-bit lanes) -- verify no
; overflow for 12x32 at 8bpp.
;-----------------------------------------------------------------------------
1497 cglobal pixel_satd_12x32, 4,7,8,0-gprsize | |
1498 SATD_START_MMX | |
1499 mov r6, r0 | |
1500 mov [rsp], r2 | |
1501 %if vertical==0 | |
1502 mova m7, [hmul_4p] | |
1503 %endif | |
; strip 0 (pixels 0..3)
1504 SATD_4x8_SSE vertical, 0, swap | |
1505 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1506 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1507 SATD_4x8_SSE vertical, 1, add | |
1508 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1509 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1510 SATD_4x8_SSE vertical, 1, add | |
1511 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1512 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1513 SATD_4x8_SSE vertical, 1, add | |
; strip 1 (pixels 4..7): restart from row 0
1514 lea r0, [r6 + 4*SIZEOF_PIXEL] | |
1515 mov r2, [rsp] | |
1516 add r2, 4*SIZEOF_PIXEL | |
1517 SATD_4x8_SSE vertical, 1, add | |
1518 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1519 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1520 SATD_4x8_SSE vertical, 1, add | |
1521 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1522 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1523 SATD_4x8_SSE vertical, 1, add | |
1524 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1525 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1526 SATD_4x8_SSE vertical, 1, add | |
; strip 2 (pixels 8..11): restart from row 0
1527 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1528 mov r2, [rsp] | |
1529 add r2, 8*SIZEOF_PIXEL | |
1530 SATD_4x8_SSE vertical, 1, add | |
1531 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1532 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1533 SATD_4x8_SSE vertical, 1, add | |
1534 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1535 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1536 SATD_4x8_SSE vertical, 1, add | |
1537 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1538 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1539 SATD_4x8_SSE vertical, 1, add | |
; horizontal add of word partial sums in m7 -> eax
1540 HADDW m7, m1 | |
1541 movd eax, m7 | |
1542 RET | |
1543 %endif | |
1544 %endif | |
1545 | |
1546 %if HIGH_BIT_DEPTH | |
1547 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_4x32 -- HIGH_BIT_DEPTH, WIN64 path: single 4-wide strip,
; four SATD_4x8_SSE steps; dword sums accumulate in m7.
; NOTE(review): r6/r7 are saved but never read (no strip switch for width
; 4) -- harmless.  SATD_4x8_SSE presumably advances r0/r2 internally;
; confirm against its definition (outside this chunk).
;-----------------------------------------------------------------------------
1548 cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) | |
1549 SATD_START_MMX | |
1550 mov r6, r0 | |
1551 mov r7, r2 | |
1552 pxor m7, m7 | |
1553 SATD_4x8_SSE vertical, 0, 4, 5 | |
1554 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1555 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1556 SATD_4x8_SSE vertical, 1, 4, 5 | |
1557 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1558 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1559 SATD_4x8_SSE vertical, 1, 4, 5 | |
1560 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1561 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1562 SATD_4x8_SSE vertical, 1, 4, 5 | |
; horizontal add of dword partial sums in m7 -> eax
1563 HADDD m7, m0 | |
1564 movd eax, m7 | |
1565 RET | |
1566 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_4x32 -- HIGH_BIT_DEPTH, non-WIN64 path: single 4-wide
; strip; r6/[rsp] are written but never read (no strip switch) -- harmless.
; Ends with a manual horizontal reduction of the 4 dword partials in m7
; (equivalent to HADDD m7, m0).
;-----------------------------------------------------------------------------
1567 cglobal pixel_satd_4x32, 4,7,8,0-gprsize | |
1568 SATD_START_MMX | |
1569 mov r6, r0 | |
1570 mov [rsp], r2 | |
1571 pxor m7, m7 | |
1572 SATD_4x8_SSE vertical, 0, 4, 5 | |
1573 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1574 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1575 SATD_4x8_SSE vertical, 1, 4, 5 | |
1576 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1577 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1578 SATD_4x8_SSE vertical, 1, 4, 5 | |
1579 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1580 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1581 SATD_4x8_SSE vertical, 1, 4, 5 | |
; manual reduction: high qword + low qword, then dword 1 + dword 0
1582 pxor m1, m1 | |
1583 movhlps m1, m7 | |
1584 paddd m7, m1 | |
1585 pshufd m1, m7, 1 | |
1586 paddd m7, m1 | |
1587 movd eax, m7 | |
1588 RET | |
1589 %endif | |
1590 %else | |
1591 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_4x32 -- 8bpp, WIN64 path: single 4-wide strip using the
; swap/add forms of SATD_4x8_SSE; hmul_4p loaded for the horizontal
; transform variant.  r6/r7 are saved but never read -- harmless.
; NOTE(review): HADDW reduces 16-bit lanes -- verify no word overflow for
; 4x32 at 8bpp.
;-----------------------------------------------------------------------------
1592 cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) | |
1593 SATD_START_MMX | |
1594 mov r6, r0 | |
1595 mov r7, r2 | |
1596 %if vertical==0 | |
1597 mova m7, [hmul_4p] | |
1598 %endif | |
1599 SATD_4x8_SSE vertical, 0, swap | |
1600 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1601 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1602 SATD_4x8_SSE vertical, 1, add | |
1603 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1604 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1605 SATD_4x8_SSE vertical, 1, add | |
1606 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1607 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1608 SATD_4x8_SSE vertical, 1, add | |
; horizontal add of word partial sums in m7 -> eax
1609 HADDW m7, m1 | |
1610 movd eax, m7 | |
1611 RET | |
1612 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_4x32 -- 8bpp, non-WIN64 path: single 4-wide strip;
; r6/[rsp] are written but never read -- harmless.
; NOTE(review): HADDW reduces 16-bit lanes -- verify no word overflow for
; 4x32 at 8bpp.
;-----------------------------------------------------------------------------
1613 cglobal pixel_satd_4x32, 4,7,8,0-gprsize | |
1614 SATD_START_MMX | |
1615 mov r6, r0 | |
1616 mov [rsp], r2 | |
1617 %if vertical==0 | |
1618 mova m7, [hmul_4p] | |
1619 %endif | |
1620 SATD_4x8_SSE vertical, 0, swap | |
1621 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1622 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1623 SATD_4x8_SSE vertical, 1, add | |
1624 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1625 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1626 SATD_4x8_SSE vertical, 1, add | |
1627 lea r0, [r0 + r1*2*SIZEOF_PIXEL] | |
1628 lea r2, [r2 + r3*2*SIZEOF_PIXEL] | |
1629 SATD_4x8_SSE vertical, 1, add | |
; horizontal add of word partial sums in m7 -> eax
1630 HADDW m7, m1 | |
1631 movd eax, m7 | |
1632 RET | |
1633 %endif | |
1634 %endif | |
1635 | |
1636 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x8 -- WIN64 path: four 8-wide columns of one 8x8 SATD
; each; r6/r7 keep the row-0 base pointers for the column restarts.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1637 cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1638 SATD_START_SSE2 m6, m7 | |
1639 mov r6, r0 | |
1640 mov r7, r2 | |
1641 call pixel_satd_8x8_internal2 | |
1642 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1643 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1644 call pixel_satd_8x8_internal2 | |
1645 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1646 lea r2, [r7 + 16*SIZEOF_PIXEL] | |
1647 call pixel_satd_8x8_internal2 | |
1648 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1649 lea r2, [r7 + 24*SIZEOF_PIXEL] | |
1650 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1651 HADDD m6, m0 | |
1652 movd eax, m6 | |
1653 RET | |
1654 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x8 -- non-WIN64 path: four 8-wide columns; ref base
; pointer kept in the gprsize stack slot, reloaded + re-offset per column.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1655 cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64 | |
1656 SATD_START_SSE2 m6, m7 | |
1657 mov r6, r0 | |
1658 mov [rsp], r2 | |
1659 call pixel_satd_8x8_internal2 | |
1660 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1661 mov r2, [rsp] | |
1662 add r2, 8*SIZEOF_PIXEL | |
1663 call pixel_satd_8x8_internal2 | |
1664 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1665 mov r2, [rsp] | |
1666 add r2, 16*SIZEOF_PIXEL | |
1667 call pixel_satd_8x8_internal2 | |
1668 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1669 mov r2, [rsp] | |
1670 add r2, 24*SIZEOF_PIXEL | |
1671 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1672 HADDD m6, m0 | |
1673 movd eax, m6 | |
1674 RET | |
1675 %endif | |
1676 | |
1677 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x16 -- WIN64 path: four 8-wide columns of two 8x8 SATDs;
; r6/r7 keep the row-0 base pointers for the column restarts.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1678 cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1679 SATD_START_SSE2 m6, m7 | |
1680 mov r6, r0 | |
1681 mov r7, r2 | |
1682 call pixel_satd_8x8_internal2 | |
1683 call pixel_satd_8x8_internal2 | |
1684 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1685 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1686 call pixel_satd_8x8_internal2 | |
1687 call pixel_satd_8x8_internal2 | |
1688 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1689 lea r2, [r7 + 16*SIZEOF_PIXEL] | |
1690 call pixel_satd_8x8_internal2 | |
1691 call pixel_satd_8x8_internal2 | |
1692 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1693 lea r2, [r7 + 24*SIZEOF_PIXEL] | |
1694 call pixel_satd_8x8_internal2 | |
1695 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1696 HADDD m6, m0 | |
1697 movd eax, m6 | |
1698 RET | |
1699 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x16 -- non-WIN64 path: four 8-wide columns of two 8x8
; SATDs; ref base pointer kept in the gprsize stack slot.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1700 cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64 | |
1701 SATD_START_SSE2 m6, m7 | |
1702 mov r6, r0 | |
1703 mov [rsp], r2 | |
1704 call pixel_satd_8x8_internal2 | |
1705 call pixel_satd_8x8_internal2 | |
1706 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1707 mov r2, [rsp] | |
1708 add r2, 8*SIZEOF_PIXEL | |
1709 call pixel_satd_8x8_internal2 | |
1710 call pixel_satd_8x8_internal2 | |
1711 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1712 mov r2, [rsp] | |
1713 add r2, 16*SIZEOF_PIXEL | |
1714 call pixel_satd_8x8_internal2 | |
1715 call pixel_satd_8x8_internal2 | |
1716 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1717 mov r2, [rsp] | |
1718 add r2, 24*SIZEOF_PIXEL | |
1719 call pixel_satd_8x8_internal2 | |
1720 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1721 HADDD m6, m0 | |
1722 movd eax, m6 | |
1723 RET | |
1724 %endif | |
1725 | |
1726 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x24 -- WIN64 path: four 8-wide columns of three 8x8
; SATDs; r6/r7 keep the row-0 base pointers for the column restarts.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1727 cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1728 SATD_START_SSE2 m6, m7 | |
1729 mov r6, r0 | |
1730 mov r7, r2 | |
1731 call pixel_satd_8x8_internal2 | |
1732 call pixel_satd_8x8_internal2 | |
1733 call pixel_satd_8x8_internal2 | |
1734 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1735 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1736 call pixel_satd_8x8_internal2 | |
1737 call pixel_satd_8x8_internal2 | |
1738 call pixel_satd_8x8_internal2 | |
1739 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1740 lea r2, [r7 + 16*SIZEOF_PIXEL] | |
1741 call pixel_satd_8x8_internal2 | |
1742 call pixel_satd_8x8_internal2 | |
1743 call pixel_satd_8x8_internal2 | |
1744 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1745 lea r2, [r7 + 24*SIZEOF_PIXEL] | |
1746 call pixel_satd_8x8_internal2 | |
1747 call pixel_satd_8x8_internal2 | |
1748 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1749 HADDD m6, m0 | |
1750 movd eax, m6 | |
1751 RET | |
1752 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x24 -- non-WIN64 path: four 8-wide columns of three 8x8
; SATDs; ref base pointer kept in the gprsize stack slot.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1753 cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64 | |
1754 SATD_START_SSE2 m6, m7 | |
1755 mov r6, r0 | |
1756 mov [rsp], r2 | |
1757 call pixel_satd_8x8_internal2 | |
1758 call pixel_satd_8x8_internal2 | |
1759 call pixel_satd_8x8_internal2 | |
1760 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1761 mov r2, [rsp] | |
1762 add r2, 8*SIZEOF_PIXEL | |
1763 call pixel_satd_8x8_internal2 | |
1764 call pixel_satd_8x8_internal2 | |
1765 call pixel_satd_8x8_internal2 | |
1766 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1767 mov r2, [rsp] | |
1768 add r2, 16*SIZEOF_PIXEL | |
1769 call pixel_satd_8x8_internal2 | |
1770 call pixel_satd_8x8_internal2 | |
1771 call pixel_satd_8x8_internal2 | |
1772 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1773 mov r2, [rsp] | |
1774 add r2, 24*SIZEOF_PIXEL | |
1775 call pixel_satd_8x8_internal2 | |
1776 call pixel_satd_8x8_internal2 | |
1777 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1778 HADDD m6, m0 | |
1779 movd eax, m6 | |
1780 RET | |
1781 %endif | |
1782 | |
1783 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x32 -- WIN64 path: four 8-wide columns of four 8x8
; SATDs; r6/r7 keep the row-0 base pointers for the column restarts.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1784 cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1785 SATD_START_SSE2 m6, m7 | |
1786 mov r6, r0 | |
1787 mov r7, r2 | |
1788 call pixel_satd_8x8_internal2 | |
1789 call pixel_satd_8x8_internal2 | |
1790 call pixel_satd_8x8_internal2 | |
1791 call pixel_satd_8x8_internal2 | |
1792 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1793 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1794 call pixel_satd_8x8_internal2 | |
1795 call pixel_satd_8x8_internal2 | |
1796 call pixel_satd_8x8_internal2 | |
1797 call pixel_satd_8x8_internal2 | |
1798 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1799 lea r2, [r7 + 16*SIZEOF_PIXEL] | |
1800 call pixel_satd_8x8_internal2 | |
1801 call pixel_satd_8x8_internal2 | |
1802 call pixel_satd_8x8_internal2 | |
1803 call pixel_satd_8x8_internal2 | |
1804 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1805 lea r2, [r7 + 24*SIZEOF_PIXEL] | |
1806 call pixel_satd_8x8_internal2 | |
1807 call pixel_satd_8x8_internal2 | |
1808 call pixel_satd_8x8_internal2 | |
1809 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1810 HADDD m6, m0 | |
1811 movd eax, m6 | |
1812 RET | |
1813 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x32 -- non-WIN64 path: four 8-wide columns of four 8x8
; SATDs; ref base pointer kept in the gprsize stack slot.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1814 cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64 | |
1815 SATD_START_SSE2 m6, m7 | |
1816 mov r6, r0 | |
1817 mov [rsp], r2 | |
1818 call pixel_satd_8x8_internal2 | |
1819 call pixel_satd_8x8_internal2 | |
1820 call pixel_satd_8x8_internal2 | |
1821 call pixel_satd_8x8_internal2 | |
1822 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1823 mov r2, [rsp] | |
1824 add r2, 8*SIZEOF_PIXEL | |
1825 call pixel_satd_8x8_internal2 | |
1826 call pixel_satd_8x8_internal2 | |
1827 call pixel_satd_8x8_internal2 | |
1828 call pixel_satd_8x8_internal2 | |
1829 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1830 mov r2, [rsp] | |
1831 add r2, 16*SIZEOF_PIXEL | |
1832 call pixel_satd_8x8_internal2 | |
1833 call pixel_satd_8x8_internal2 | |
1834 call pixel_satd_8x8_internal2 | |
1835 call pixel_satd_8x8_internal2 | |
1836 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1837 mov r2, [rsp] | |
1838 add r2, 24*SIZEOF_PIXEL | |
1839 call pixel_satd_8x8_internal2 | |
1840 call pixel_satd_8x8_internal2 | |
1841 call pixel_satd_8x8_internal2 | |
1842 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1843 HADDD m6, m0 | |
1844 movd eax, m6 | |
1845 RET | |
1846 %endif | |
1847 | |
1848 %if WIN64 | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x64 -- WIN64 path: four 8-wide columns of eight 8x8
; SATDs; r6/r7 keep the row-0 base pointers for the column restarts.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1849 cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx) | |
1850 SATD_START_SSE2 m6, m7 | |
1851 mov r6, r0 | |
1852 mov r7, r2 | |
1853 call pixel_satd_8x8_internal2 | |
1854 call pixel_satd_8x8_internal2 | |
1855 call pixel_satd_8x8_internal2 | |
1856 call pixel_satd_8x8_internal2 | |
1857 call pixel_satd_8x8_internal2 | |
1858 call pixel_satd_8x8_internal2 | |
1859 call pixel_satd_8x8_internal2 | |
1860 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15)
1861 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1862 lea r2, [r7 + 8*SIZEOF_PIXEL] | |
1863 call pixel_satd_8x8_internal2 | |
1864 call pixel_satd_8x8_internal2 | |
1865 call pixel_satd_8x8_internal2 | |
1866 call pixel_satd_8x8_internal2 | |
1867 call pixel_satd_8x8_internal2 | |
1868 call pixel_satd_8x8_internal2 | |
1869 call pixel_satd_8x8_internal2 | |
1870 call pixel_satd_8x8_internal2 | |
; column 2 (pixels 16..23)
1871 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1872 lea r2, [r7 + 16*SIZEOF_PIXEL] | |
1873 call pixel_satd_8x8_internal2 | |
1874 call pixel_satd_8x8_internal2 | |
1875 call pixel_satd_8x8_internal2 | |
1876 call pixel_satd_8x8_internal2 | |
1877 call pixel_satd_8x8_internal2 | |
1878 call pixel_satd_8x8_internal2 | |
1879 call pixel_satd_8x8_internal2 | |
1880 call pixel_satd_8x8_internal2 | |
; column 3 (pixels 24..31)
1881 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1882 lea r2, [r7 + 24*SIZEOF_PIXEL] | |
1883 call pixel_satd_8x8_internal2 | |
1884 call pixel_satd_8x8_internal2 | |
1885 call pixel_satd_8x8_internal2 | |
1886 call pixel_satd_8x8_internal2 | |
1887 call pixel_satd_8x8_internal2 | |
1888 call pixel_satd_8x8_internal2 | |
1889 call pixel_satd_8x8_internal2 | |
1890 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1891 HADDD m6, m0 | |
1892 movd eax, m6 | |
1893 RET | |
1894 %else | |
;-----------------------------------------------------------------------------
; int pixel_satd_32x64 -- non-WIN64 path: four 8-wide columns of eight 8x8
; SATDs; ref base pointer kept in the gprsize stack slot.
; NOTE(review): assumes pixel_satd_8x8_internal2 accumulates into m6 and
; advances r0/r2 by 8 rows -- confirm against its definition.
;-----------------------------------------------------------------------------
1895 cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64 | |
1896 SATD_START_SSE2 m6, m7 | |
1897 mov r6, r0 | |
1898 mov [rsp], r2 | |
1899 call pixel_satd_8x8_internal2 | |
1900 call pixel_satd_8x8_internal2 | |
1901 call pixel_satd_8x8_internal2 | |
1902 call pixel_satd_8x8_internal2 | |
1903 call pixel_satd_8x8_internal2 | |
1904 call pixel_satd_8x8_internal2 | |
1905 call pixel_satd_8x8_internal2 | |
1906 call pixel_satd_8x8_internal2 | |
; column 1 (pixels 8..15)
1907 lea r0, [r6 + 8*SIZEOF_PIXEL] | |
1908 mov r2, [rsp] | |
1909 add r2, 8*SIZEOF_PIXEL | |
1910 call pixel_satd_8x8_internal2 | |
1911 call pixel_satd_8x8_internal2 | |
1912 call pixel_satd_8x8_internal2 | |
1913 call pixel_satd_8x8_internal2 | |
1914 call pixel_satd_8x8_internal2 | |
1915 call pixel_satd_8x8_internal2 | |
1916 call pixel_satd_8x8_internal2 | |
1917 call pixel_satd_8x8_internal2 | |
; column 2 (pixels 16..23)
1918 lea r0, [r6 + 16*SIZEOF_PIXEL] | |
1919 mov r2, [rsp] | |
1920 add r2, 16*SIZEOF_PIXEL | |
1921 call pixel_satd_8x8_internal2 | |
1922 call pixel_satd_8x8_internal2 | |
1923 call pixel_satd_8x8_internal2 | |
1924 call pixel_satd_8x8_internal2 | |
1925 call pixel_satd_8x8_internal2 | |
1926 call pixel_satd_8x8_internal2 | |
1927 call pixel_satd_8x8_internal2 | |
1928 call pixel_satd_8x8_internal2 | |
; column 3 (pixels 24..31)
1929 lea r0, [r6 + 24*SIZEOF_PIXEL] | |
1930 mov r2, [rsp] | |
1931 add r2, 24*SIZEOF_PIXEL | |
1932 call pixel_satd_8x8_internal2 | |
1933 call pixel_satd_8x8_internal2 | |
1934 call pixel_satd_8x8_internal2 | |
1935 call pixel_satd_8x8_internal2 | |
1936 call pixel_satd_8x8_internal2 | |
1937 call pixel_satd_8x8_internal2 | |
1938 call pixel_satd_8x8_internal2 | |
1939 call pixel_satd_8x8_internal2 | |
; horizontal add of dword partial sums in m6 -> eax
1940 HADDD m6, m0 | |
1941 movd eax, m6 | |
1942 RET | |
1943 %endif | |
1944 | |
%if WIN64
;-----------------------------------------------------------------------------
; 48x64 SATD: six 8-pixel-wide columns, eight stacked 8x8 blocks per column.
; Win64 has enough GPRs to cache both bases: r6 = pix1, r7 = pix2.
; pixel_satd_8x8_internal2 walks r0/r2 down the rows and accumulates into m6.
;-----------------------------------------------------------------------------
cglobal pixel_satd_48x64, 4,8,14          ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov  r6, r0                           ; save pix1 base
    mov  r7, r2                           ; save pix2 base
%rep 8                                    ; column 0: r0/r2 already in place
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 5                                    ; columns 1..5 at x = 8,16,24,32,40
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    lea  r2, [r7 + %%x*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0                          ; horizontal reduce dword partial sums
    movd  eax, m6                         ; return total SATD
    RET
%else
;-----------------------------------------------------------------------------
; Non-Win64 variant: only 7 GPRs, so the pix2 base is spilled to the stack
; slot reserved by "0-gprsize" and reloaded at the top of every column.
;-----------------------------------------------------------------------------
cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov  r6, r0                           ; save pix1 base in a register
    mov  [rsp], r2                        ; spill pix2 base
%rep 8                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 5                                    ; columns 1..5 at x = 8,16,24,32,40
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    mov  r2, [rsp]                        ; reload pix2 base
    add  r2, %%x*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%endif
2083 | |
2084 | |
%if WIN64
;-----------------------------------------------------------------------------
; 64x16 SATD: eight 8-pixel-wide columns, two stacked 8x8 blocks per column.
; r6/r7 cache pix1/pix2 bases; the SATD sum accumulates in m6.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x16, 4,8,14          ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov  r6, r0                           ; save pix1 base
    mov  r7, r2                           ; save pix2 base
%rep 2                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    lea  r2, [r7 + %%x*SIZEOF_PIXEL]
%rep 2
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0                          ; reduce dword partial sums
    movd  eax, m6
    RET
%else
;-----------------------------------------------------------------------------
; Non-Win64: pix2 base lives in the reserved stack slot (only 7 GPRs).
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov  r6, r0
    mov  [rsp], r2                        ; spill pix2 base
%rep 2                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    mov  r2, [rsp]
    add  r2, %%x*SIZEOF_PIXEL
%rep 2
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%endif
2169 | |
%if WIN64
;-----------------------------------------------------------------------------
; 64x32 SATD: eight 8-pixel-wide columns, four stacked 8x8 blocks per column.
; r6/r7 cache pix1/pix2 bases; m6 accumulates the SATD.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x32, 4,8,14          ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov  r6, r0                           ; save pix1 base
    mov  r7, r2                           ; save pix2 base
%rep 4                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    lea  r2, [r7 + %%x*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%else
;-----------------------------------------------------------------------------
; Non-Win64: pix2 base spilled to the reserved stack slot.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov  r6, r0
    mov  [rsp], r2
%rep 4                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    mov  r2, [rsp]
    add  r2, %%x*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%endif
2286 | |
%if WIN64
;-----------------------------------------------------------------------------
; 64x48 SATD: eight 8-pixel-wide columns, six stacked 8x8 blocks per column.
; r6/r7 cache pix1/pix2 bases; m6 accumulates the SATD.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x48, 4,8,14          ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov  r6, r0                           ; save pix1 base
    mov  r7, r2                           ; save pix2 base
%rep 6                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    lea  r2, [r7 + %%x*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%else
;-----------------------------------------------------------------------------
; Non-Win64: pix2 base spilled to the reserved stack slot.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov  r6, r0
    mov  [rsp], r2
%rep 6                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    mov  r2, [rsp]
    add  r2, %%x*SIZEOF_PIXEL
%rep 6
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%endif
2435 | |
%if WIN64
;-----------------------------------------------------------------------------
; 64x64 SATD: eight 8-pixel-wide columns, eight stacked 8x8 blocks per column.
; r6/r7 cache pix1/pix2 bases; pixel_satd_8x8_internal2 advances r0/r2 down
; the rows and accumulates each block's SATD into m6.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x64, 4,8,14          ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov  r6, r0                           ; save pix1 base
    mov  r7, r2                           ; save pix2 base
%rep 8                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    lea  r2, [r7 + %%x*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0                          ; reduce dword partial sums
    movd  eax, m6                         ; return total SATD
    RET
%else
;-----------------------------------------------------------------------------
; Non-Win64: only 7 GPRs, so the pix2 base is kept in the reserved stack slot
; and reloaded at the top of every column.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov  r6, r0
    mov  [rsp], r2
%rep 8                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 7                                    ; columns 1..7 at x = 8..56
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    mov  r2, [rsp]
    add  r2, %%x*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%endif
2616 | |
;-----------------------------------------------------------------------------
; 16x4 SATD: two side-by-side 8x4 halves, summed in m6.
; Registers per the other satd_* entries: r0/r2 = source blocks, r1/r3 =
; strides (consumed inside the internal routine); result returned in eax.
;-----------------------------------------------------------------------------
%if WIN64
cglobal pixel_satd_16x4, 4,6,14
%else
cglobal pixel_satd_16x4, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS                     ; remember r0/r2 before the left half
    call %%pixel_satd_8x4_internal2     ; left 8x4 (macro-local label)
    RESTORE_AND_INC_POINTERS            ; saved base advanced to the right half
    call %%pixel_satd_8x4_internal2     ; right 8x4
    HADDD m6, m0                        ; horizontal reduce dword partial sums
    movd eax, m6                        ; return accumulated SATD
    RET
2630 | |
;-----------------------------------------------------------------------------
; 16x8 SATD: two side-by-side 8x8 blocks, summed in m6 and returned in eax.
;-----------------------------------------------------------------------------
%if WIN64
cglobal pixel_satd_16x8, 4,6,14
%else
cglobal pixel_satd_16x8, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS                     ; remember r0/r2 before the left half
    call pixel_satd_8x8_internal2       ; left 8x8
    RESTORE_AND_INC_POINTERS            ; saved base advanced to the right half
    call pixel_satd_8x8_internal2       ; right 8x8
    HADDD m6, m0                        ; horizontal reduce dword partial sums
    movd eax, m6
    RET
2644 | |
;-----------------------------------------------------------------------------
; 16x12 SATD: each 8-wide half is one 8x8 block plus one 8x4 block.
; The trailing "1" to SATD_START_SSE2 is a mode flag for that macro
; (defined earlier in this file) — see its definition for the exact meaning.
;-----------------------------------------------------------------------------
%if WIN64
cglobal pixel_satd_16x12, 4,6,14
%else
cglobal pixel_satd_16x12, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS                     ; remember r0/r2 before the left half
    call pixel_satd_8x8_internal2       ; left half, rows 0-7
    call %%pixel_satd_8x4_internal2     ; left half, rows 8-11
    RESTORE_AND_INC_POINTERS            ; saved base advanced to the right half
    call pixel_satd_8x8_internal2       ; right half, rows 0-7
    call %%pixel_satd_8x4_internal2     ; right half, rows 8-11
    HADDD m6, m0                        ; horizontal reduce dword partial sums
    movd eax, m6
    RET
2660 | |
;-----------------------------------------------------------------------------
; 16x16 SATD: two side-by-side columns of two stacked 8x8 blocks each.
;-----------------------------------------------------------------------------
%if WIN64
cglobal pixel_satd_16x16, 4,6,14
%else
cglobal pixel_satd_16x16, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS                     ; remember r0/r2 before the left column
    call pixel_satd_8x8_internal2       ; left column, rows 0-7
    call pixel_satd_8x8_internal2       ; left column, rows 8-15
    RESTORE_AND_INC_POINTERS            ; saved base advanced to the right column
    call pixel_satd_8x8_internal2       ; right column, rows 0-7
    call pixel_satd_8x8_internal2       ; right column, rows 8-15
    HADDD m6, m0                        ; horizontal reduce dword partial sums
    movd eax, m6
    RET
2676 | |
;-----------------------------------------------------------------------------
; 16x32 SATD: two side-by-side columns of four stacked 8x8 blocks each.
; m6 accumulates across all calls; result is returned in eax.
;-----------------------------------------------------------------------------
%if WIN64
cglobal pixel_satd_16x32, 4,6,14
%else
cglobal pixel_satd_16x32, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS                     ; remember r0/r2 before the left column
%rep 4                                  ; left column, four 8x8 blocks
    call pixel_satd_8x8_internal2
%endrep
    RESTORE_AND_INC_POINTERS            ; saved base advanced to the right column
%rep 4                                  ; right column, four 8x8 blocks
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0                        ; horizontal reduce dword partial sums
    movd eax, m6
    RET
2696 | |
;-----------------------------------------------------------------------------
; 16x64 SATD: two side-by-side columns of eight stacked 8x8 blocks each.
; m6 accumulates across all calls; result is returned in eax.
;-----------------------------------------------------------------------------
%if WIN64
cglobal pixel_satd_16x64, 4,6,14
%else
cglobal pixel_satd_16x64, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS                     ; remember r0/r2 before the left column
%rep 8                                  ; left column, eight 8x8 blocks
    call pixel_satd_8x8_internal2
%endrep
    RESTORE_AND_INC_POINTERS            ; saved base advanced to the right column
%rep 8                                  ; right column, eight 8x8 blocks
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0                        ; horizontal reduce dword partial sums
    movd eax, m6
    RET
2724 %endif | |
2725 | |
%if HIGH_BIT_DEPTH
%if WIN64
;-----------------------------------------------------------------------------
; 12x16 SATD, HIGH_BIT_DEPTH, Win64: three 4-pixel-wide columns, each as two
; SATD_4x8_SSE halves. r6/r7 cache the pix1/pix2 bases so every column can
; restart from the top row; m7 is the dword accumulator.
; NOTE(review): the r1*2/r3*2 steps assume SATD_4x8_SSE leaves r0/r2 partway
; down its 8 rows — confirm against the SATD_4x8_SSE definition.
;-----------------------------------------------------------------------------
cglobal pixel_satd_12x16, 4,8,8
    SATD_START_MMX
    mov r6, r0                          ; save pix1 base
    mov r7, r2                          ; save pix2 base
    pxor m7, m7                         ; zero the SATD accumulator
    SATD_4x8_SSE vertical, 0, 4, 5      ; column 0, upper 4x8 (flag 0: first)
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]    ; step down to the lower half
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5      ; column 0, lower 4x8 (flag 1: accumulate)
    lea r0, [r6 + 4*SIZEOF_PIXEL]       ; column 1, back to the top row
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 8*SIZEOF_PIXEL]       ; column 2
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    HADDD m7, m0                        ; horizontal reduce dword partial sums
    movd eax, m7                        ; return total SATD
    RET
%else
;-----------------------------------------------------------------------------
; HIGH_BIT_DEPTH, non-Win64: same traversal, but only 7 GPRs are available,
; so the pix2 base lives in the stack slot reserved by "0-gprsize".
;-----------------------------------------------------------------------------
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0                          ; save pix1 base
    mov [rsp], r2                       ; spill pix2 base
    pxor m7, m7                         ; zero the SATD accumulator
    SATD_4x8_SSE vertical, 0, 4, 5      ; column 0, upper 4x8
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5      ; column 0, lower 4x8
    lea r0, [r6 + 4*SIZEOF_PIXEL]       ; column 1
    mov r2, [rsp]                       ; reload pix2 base
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 8*SIZEOF_PIXEL]       ; column 2
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    HADDD m7, m0
    movd eax, m7
    RET
%endif
%else ;HIGH_BIT_DEPTH
%if WIN64
;-----------------------------------------------------------------------------
; 12x16 SATD, 8-bit, Win64. Same column traversal as above; the "swap" call
; seeds the accumulator and the "add" calls accumulate into it (word sums,
; hence HADDW). m7 holds the hmul_4p constant when the horizontal (vertical==0)
; transform variant is compiled in.
;-----------------------------------------------------------------------------
cglobal pixel_satd_12x16, 4,8,8
    SATD_START_MMX
    mov r6, r0                          ; save pix1 base
    mov r7, r2                          ; save pix2 base
%if vertical==0
    mova m7, [hmul_4p]                  ; multiplier constant for the h-SATD path
%endif
    SATD_4x8_SSE vertical, 0, swap      ; column 0, upper 4x8 — seed accumulator
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]    ; step down to the lower half
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add       ; column 0, lower 4x8 — accumulate
    lea r0, [r6 + 4*SIZEOF_PIXEL]       ; column 1, back to the top row
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 8*SIZEOF_PIXEL]       ; column 2
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    HADDW m7, m1                        ; horizontal reduce word partial sums
    movd eax, m7                        ; return total SATD
    RET
%else
;-----------------------------------------------------------------------------
; 8-bit, non-Win64: pix2 base spilled to the reserved stack slot.
;-----------------------------------------------------------------------------
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0                          ; save pix1 base
    mov [rsp], r2                       ; spill pix2 base
%if vertical==0
    mova m7, [hmul_4p]                  ; multiplier constant for the h-SATD path
%endif
    SATD_4x8_SSE vertical, 0, swap      ; column 0, upper 4x8 — seed accumulator
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add       ; column 0, lower 4x8 — accumulate
    lea r0, [r6 + 4*SIZEOF_PIXEL]       ; column 1
    mov r2, [rsp]                       ; reload pix2 base
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 8*SIZEOF_PIXEL]       ; column 2
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    HADDW m7, m1
    movd eax, m7
    RET
%endif
%endif
2839 | |
%if WIN64
;-----------------------------------------------------------------------------
; 24x32 SATD: three 8-pixel-wide columns, four stacked 8x8 blocks per column.
; r6/r7 cache the pix1/pix2 bases; m6 accumulates the SATD.
;-----------------------------------------------------------------------------
cglobal pixel_satd_24x32, 4,8,14
    SATD_START_SSE2 m6, m7
    mov  r6, r0                           ; save pix1 base
    mov  r7, r2                           ; save pix2 base
%rep 4                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 2                                    ; columns 1..2 at x = 8,16
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    lea  r2, [r7 + %%x*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0                          ; reduce dword partial sums
    movd  eax, m6
    RET
%else
;-----------------------------------------------------------------------------
; Non-Win64: pix2 base spilled to the reserved stack slot (only 7 GPRs).
;-----------------------------------------------------------------------------
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
    SATD_START_SSE2 m6, m7
    mov  r6, r0
    mov  [rsp], r2
%rep 4                                    ; column 0
    call pixel_satd_8x8_internal2
%endrep
%assign %%x 8
%rep 2                                    ; columns 1..2 at x = 8,16
    lea  r0, [r6 + %%x*SIZEOF_PIXEL]
    mov  r2, [rsp]
    add  r2, %%x*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
%assign %%x %%x+8
%endrep
    HADDD m6, m0
    movd  eax, m6
    RET
%endif ;WIN64
2891 | |
2892 %if WIN64 | |
2893 cglobal pixel_satd_8x32, 4,6,14 | |
2894 %else | |
2895 cglobal pixel_satd_8x32, 4,6,8 | |
2896 %endif | |
2897 SATD_START_SSE2 m6, m7 | |
2898 %if vertical | |
2899 mova m7, [pw_00ff] | |
2900 %endif | |
2901 call pixel_satd_8x8_internal2 | |
2902 call pixel_satd_8x8_internal2 | |
2903 call pixel_satd_8x8_internal2 | |
2904 call pixel_satd_8x8_internal2 | |
2905 HADDD m6, m0 | |
2906 movd eax, m6 | |
2907 RET | |
2908 | |
2909 %if WIN64 | |
2910 cglobal pixel_satd_8x16, 4,6,14 | |
2911 %else | |
2912 cglobal pixel_satd_8x16, 4,6,8 | |
2913 %endif | |
2914 SATD_START_SSE2 m6, m7 | |
2915 call pixel_satd_8x8_internal2 | |
2916 call pixel_satd_8x8_internal2 | |
2917 HADDD m6, m0 | |
2918 movd eax, m6 | |
2919 RET | |
2920 | |
2921 cglobal pixel_satd_8x8, 4,6,8 | |
2922 SATD_START_SSE2 m6, m7 | |
2923 call pixel_satd_8x8_internal | |
2924 SATD_END_SSE2 m6 | |
2925 | |
2926 %if WIN64 | |
2927 cglobal pixel_satd_8x4, 4,6,14 | |
2928 %else | |
2929 cglobal pixel_satd_8x4, 4,6,8 | |
2930 %endif | |
2931 SATD_START_SSE2 m6, m7 | |
2932 call %%pixel_satd_8x4_internal2 | |
2933 SATD_END_SSE2 m6 | |
2934 %endmacro ; SATDS_SSE2 | |
2935 | |
2936 | |
2937 ;============================================================================= | |
2938 ; SA8D | |
2939 ;============================================================================= | |
2940 | |
;-----------------------------------------------------------------------------
; SA8D_INTER: fold the result of the most recent pixel_sa8d_8x8_internal call
; (in m0, word sums) into the running SA8D total.
; x86-64: running total lives in m10 (lh); the fresh result m0 (rh) is added
;         into it.
; x86-32: too few xmm registers, so the prior total was spilled to the stack
;         at [esp+48] (rh) and is added into m0 (lh) instead; callers re-spill
;         m0 afterwards as needed.
%macro SA8D_INTER 0
%if ARCH_X86_64
    %define lh m10                      ; lh = accumulator, rh = new 8x8 result
    %define rh m0
%else
    %define lh m0                       ; accumulate into m0 itself
    %define rh [esp+48]                 ; previously spilled partial sum
%endif
%if HIGH_BIT_DEPTH
    HADDUW  m0, m1                      ; >8bpp word sums can overflow 16 bits:
                                        ; widen m0 to dwords before accumulating
    paddd   lh, rh                      ; dword add, no saturation required
%else
    paddusw lh, rh                      ; 8bpp sums fit in words; saturating add
%endif ; HIGH_BIT_DEPTH
%endmacro
2956 | |
;-----------------------------------------------------------------------------
; SA8D_8x8: compute the SA8D cost of one 8x8 block at (r0, r2), reduce it to a
; single dword, round it as (sum + 1) >> 1, and accumulate into m12.
; Expects the register setup of the enclosing SA8D functions: r1/r3 = strides,
; r4/r5 = 3*stride, m12 = running total, and (when !vertical) m7 = [hmul_8p].
%macro SA8D_8x8 0
    call pixel_sa8d_8x8_internal        ; m0 = per-lane word sums for this 8x8
%if HIGH_BIT_DEPTH
    HADDUW m0, m1                       ; unsigned horizontal add (words may be large)
%else
    HADDW m0, m1                        ; 8bpp: plain horizontal word add suffices
%endif ; HIGH_BIT_DEPTH
    paddd m0, [pd_1]                    ; round: (sum + 1) >> 1
    psrld m0, 1
    paddd m12, m0                       ; accumulate block cost into running total
%endmacro
2968 | |
;-----------------------------------------------------------------------------
; SA8D_16x16: SA8D cost of a 16x16 block at (r0, r2) as four 8x8 sub-blocks,
; visited in the order: top-left, top-right, bottom-right, bottom-left
; (right, then down, then back left).  Partial sums are merged with SA8D_INTER
; (m10 = left-half accumulator on x86-64), then rounded (sum + 1) >> 1 and
; added into m12.
; On exit r0/r2 point at pix[8*stride] of this 16x16 (8 rows down, column
; restored); callers adjust from there for the next 16x16.
%macro SA8D_16x16 0
    call pixel_sa8d_8x8_internal ; pix[0]
    add  r2, 8*SIZEOF_PIXEL             ; step one 8x8 column right
    add  r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
    HADDUW m0, m1                       ; widen to dwords now; 8bpp defers to the end
%endif
    mova m10, m0                        ; start accumulator with first block's sums
    call pixel_sa8d_8x8_internal ; pix[8]
    lea  r2, [r2+8*r3]                  ; step 8 rows down
    lea  r0, [r0+8*r1]
    SA8D_INTER                          ; m10 += block 2
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub  r2, 8*SIZEOF_PIXEL             ; step back to the left 8x8 column
    sub  r0, 8*SIZEOF_PIXEL
    SA8D_INTER                          ; m10 += block 3
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER                          ; m10 += block 4
    SWAP 0, 10                          ; bring full 16x16 sum into m0
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1                       ; 8bpp: single widening reduction at the end
%endif
    paddd m0, [pd_1]                    ; round: (sum + 1) >> 1
    psrld m0, 1
    paddd m12, m0                       ; accumulate into running total
%endmacro
2995 | |
;-----------------------------------------------------------------------------
; AVG_16x16: x86-32 helper (uses esp-relative slots) that merges the latest
; 8x8 partial sum with the spilled one via SA8D_INTER, rounds the combined
; result as (sum + 1) >> 1, and adds it to the scalar running total kept in
; the stack slot [esp+36].  Clobbers r4d.
%macro AVG_16x16 0
    SA8D_INTER                          ; m0 += spilled partial at [esp+48]
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1                       ; 8bpp: widen/reduce words to one dword
%endif
    movd r4d, m0
    add  r4d, 1                         ; round: (sum + 1) >> 1
    shr  r4d, 1
    add  r4d, dword [esp+36]            ; accumulate into scalar total on the stack
    mov  dword [esp+36], r4d
%endmacro
3007 | |
3008 %macro SA8D 0 | |
3009 ; sse2 doesn't seem to like the horizontal way of doing things | |
3010 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) | |
3011 | |
3012 %if ARCH_X86_64 | |
3013 ;----------------------------------------------------------------------------- | |
3014 ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) | |
3015 ;----------------------------------------------------------------------------- | |
3016 cglobal pixel_sa8d_8x8_internal | |
3017 lea r6, [r0+4*r1] | |
3018 lea r7, [r2+4*r3] | |
3019 LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 | |
3020 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 | |
3021 %if vertical | |
3022 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax | |
3023 %else ; non-sse2 | |
3024 HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11 | |
3025 %endif | |
3026 paddw m0, m1 | |
3027 paddw m0, m2 | |
3028 paddw m0, m8 | |
3029 SAVE_MM_PERMUTATION | |
3030 ret | |
3031 | |
3032 cglobal pixel_sa8d_8x8, 4,8,12 | |
3033 FIX_STRIDES r1, r3 | |
3034 lea r4, [3*r1] | |
3035 lea r5, [3*r3] | |
3036 %if vertical == 0 | |
3037 mova m7, [hmul_8p] | |
3038 %endif | |
3039 call pixel_sa8d_8x8_internal | |
3040 %if HIGH_BIT_DEPTH | |
3041 HADDUW m0, m1 | |
3042 %else | |
3043 HADDW m0, m1 | |
3044 %endif ; HIGH_BIT_DEPTH | |
3045 movd eax, m0 | |
3046 add eax, 1 | |
3047 shr eax, 1 | |
3048 RET | |
3049 | |
3050 cglobal pixel_sa8d_16x16, 4,8,12 | |
3051 FIX_STRIDES r1, r3 | |
3052 lea r4, [3*r1] | |
3053 lea r5, [3*r3] | |
3054 %if vertical == 0 | |
3055 mova m7, [hmul_8p] | |
3056 %endif | |
3057 call pixel_sa8d_8x8_internal ; pix[0] | |
3058 add r2, 8*SIZEOF_PIXEL | |
3059 add r0, 8*SIZEOF_PIXEL | |
3060 %if HIGH_BIT_DEPTH | |
3061 HADDUW m0, m1 | |
3062 %endif | |
3063 mova m10, m0 | |
3064 call pixel_sa8d_8x8_internal ; pix[8] | |
3065 lea r2, [r2+8*r3] | |
3066 lea r0, [r0+8*r1] | |
3067 SA8D_INTER | |
3068 call pixel_sa8d_8x8_internal ; pix[8*stride+8] | |
3069 sub r2, 8*SIZEOF_PIXEL | |
3070 sub r0, 8*SIZEOF_PIXEL | |
3071 SA8D_INTER | |
3072 call pixel_sa8d_8x8_internal ; pix[8*stride] | |
3073 SA8D_INTER | |
3074 SWAP 0, 10 | |
3075 %if HIGH_BIT_DEPTH == 0 | |
3076 HADDUW m0, m1 | |
3077 %endif | |
3078 movd eax, m0 | |
3079 add eax, 1 | |
3080 shr eax, 1 | |
3081 RET | |
3082 | |
3083 cglobal pixel_sa8d_8x16, 4,8,13 | |
3084 FIX_STRIDES r1, r3 | |
3085 lea r4, [3*r1] | |
3086 lea r5, [3*r3] | |
3087 pxor m12, m12 | |
3088 %if vertical == 0 | |
3089 mova m7, [hmul_8p] | |
3090 %endif | |
3091 SA8D_8x8 | |
3092 lea r0, [r0 + 8*r1] | |
3093 lea r2, [r2 + 8*r3] | |
3094 SA8D_8x8 | |
3095 movd eax, m12 | |
3096 RET | |
3097 | |
3098 cglobal pixel_sa8d_8x32, 4,8,13 | |
3099 FIX_STRIDES r1, r3 | |
3100 lea r4, [3*r1] | |
3101 lea r5, [3*r3] | |
3102 pxor m12, m12 | |
3103 %if vertical == 0 | |
3104 mova m7, [hmul_8p] | |
3105 %endif | |
3106 SA8D_8x8 | |
3107 lea r0, [r0 + r1*8] | |
3108 lea r2, [r2 + r3*8] | |
3109 SA8D_8x8 | |
3110 lea r0, [r0 + r1*8] | |
3111 lea r2, [r2 + r3*8] | |
3112 SA8D_8x8 | |
3113 lea r0, [r0 + r1*8] | |
3114 lea r2, [r2 + r3*8] | |
3115 SA8D_8x8 | |
3116 movd eax, m12 | |
3117 RET | |
3118 | |
3119 cglobal pixel_sa8d_16x8, 4,8,13 | |
3120 FIX_STRIDES r1, r3 | |
3121 lea r4, [3*r1] | |
3122 lea r5, [3*r3] | |
3123 pxor m12, m12 | |
3124 %if vertical == 0 | |
3125 mova m7, [hmul_8p] | |
3126 %endif | |
3127 SA8D_8x8 | |
3128 add r0, 8*SIZEOF_PIXEL | |
3129 add r2, 8*SIZEOF_PIXEL | |
3130 SA8D_8x8 | |
3131 movd eax, m12 | |
3132 RET | |
3133 | |
3134 cglobal pixel_sa8d_16x32, 4,8,13 | |
3135 FIX_STRIDES r1, r3 | |
3136 lea r4, [3*r1] | |
3137 lea r5, [3*r3] | |
3138 pxor m12, m12 | |
3139 %if vertical == 0 | |
3140 mova m7, [hmul_8p] | |
3141 %endif | |
3142 SA8D_16x16 | |
3143 lea r0, [r0+8*r1] | |
3144 lea r2, [r2+8*r3] | |
3145 SA8D_16x16 | |
3146 movd eax, m12 | |
3147 RET | |
3148 | |
3149 cglobal pixel_sa8d_16x64, 4,8,13 | |
3150 FIX_STRIDES r1, r3 | |
3151 lea r4, [3*r1] | |
3152 lea r5, [3*r3] | |
3153 pxor m12, m12 | |
3154 %if vertical == 0 | |
3155 mova m7, [hmul_8p] | |
3156 %endif | |
3157 SA8D_16x16 | |
3158 lea r0, [r0+8*r1] | |
3159 lea r2, [r2+8*r3] | |
3160 SA8D_16x16 | |
3161 lea r0, [r0+8*r1] | |
3162 lea r2, [r2+8*r3] | |
3163 SA8D_16x16 | |
3164 lea r0, [r0+8*r1] | |
3165 lea r2, [r2+8*r3] | |
3166 SA8D_16x16 | |
3167 movd eax, m12 | |
3168 RET | |
3169 | |
3170 cglobal pixel_sa8d_24x32, 4,8,13 | |
3171 FIX_STRIDES r1, r3 | |
3172 lea r4, [3*r1] | |
3173 lea r5, [3*r3] | |
3174 pxor m12, m12 | |
3175 %if vertical == 0 | |
3176 mova m7, [hmul_8p] | |
3177 %endif | |
3178 SA8D_8x8 | |
3179 add r0, 8*SIZEOF_PIXEL | |
3180 add r2, 8*SIZEOF_PIXEL | |
3181 SA8D_8x8 | |
3182 add r0, 8*SIZEOF_PIXEL | |
3183 add r2, 8*SIZEOF_PIXEL | |
3184 SA8D_8x8 | |
3185 lea r0, [r0 + r1*8] | |
3186 lea r2, [r2 + r3*8] | |
3187 SA8D_8x8 | |
3188 sub r0, 8*SIZEOF_PIXEL | |
3189 sub r2, 8*SIZEOF_PIXEL | |
3190 SA8D_8x8 | |
3191 sub r0, 8*SIZEOF_PIXEL | |
3192 sub r2, 8*SIZEOF_PIXEL | |
3193 SA8D_8x8 | |
3194 lea r0, [r0 + r1*8] | |
3195 lea r2, [r2 + r3*8] | |
3196 SA8D_8x8 | |
3197 add r0, 8*SIZEOF_PIXEL | |
3198 add r2, 8*SIZEOF_PIXEL | |
3199 SA8D_8x8 | |
3200 add r0, 8*SIZEOF_PIXEL | |
3201 add r2, 8*SIZEOF_PIXEL | |
3202 SA8D_8x8 | |
3203 lea r0, [r0 + r1*8] | |
3204 lea r2, [r2 + r3*8] | |
3205 SA8D_8x8 | |
3206 sub r0, 8*SIZEOF_PIXEL | |
3207 sub r2, 8*SIZEOF_PIXEL | |
3208 SA8D_8x8 | |
3209 sub r0, 8*SIZEOF_PIXEL | |
3210 sub r2, 8*SIZEOF_PIXEL | |
3211 SA8D_8x8 | |
3212 movd eax, m12 | |
3213 RET | |
3214 | |
3215 cglobal pixel_sa8d_32x8, 4,8,13 | |
3216 FIX_STRIDES r1, r3 | |
3217 lea r4, [3*r1] | |
3218 lea r5, [3*r3] | |
3219 pxor m12, m12 | |
3220 %if vertical == 0 | |
3221 mova m7, [hmul_8p] | |
3222 %endif | |
3223 SA8D_8x8 | |
3224 add r0, 8*SIZEOF_PIXEL | |
3225 add r2, 8*SIZEOF_PIXEL | |
3226 SA8D_8x8 | |
3227 add r0, 8*SIZEOF_PIXEL | |
3228 add r2, 8*SIZEOF_PIXEL | |
3229 SA8D_8x8 | |
3230 add r0, 8*SIZEOF_PIXEL | |
3231 add r2, 8*SIZEOF_PIXEL | |
3232 SA8D_8x8 | |
3233 movd eax, m12 | |
3234 RET | |
3235 | |
3236 cglobal pixel_sa8d_32x16, 4,8,13 | |
3237 FIX_STRIDES r1, r3 | |
3238 lea r4, [3*r1] | |
3239 lea r5, [3*r3] | |
3240 pxor m12, m12 | |
3241 %if vertical == 0 | |
3242 mova m7, [hmul_8p] | |
3243 %endif | |
3244 SA8D_16x16 | |
3245 lea r4, [8*r1] | |
3246 lea r5, [8*r3] | |
3247 sub r0, r4 | |
3248 sub r2, r5 | |
3249 add r2, 16*SIZEOF_PIXEL | |
3250 add r0, 16*SIZEOF_PIXEL | |
3251 lea r4, [3*r1] | |
3252 lea r5, [3*r3] | |
3253 SA8D_16x16 | |
3254 movd eax, m12 | |
3255 RET | |
3256 | |
3257 cglobal pixel_sa8d_32x24, 4,8,13 | |
3258 FIX_STRIDES r1, r3 | |
3259 lea r4, [3*r1] | |
3260 lea r5, [3*r3] | |
3261 pxor m12, m12 | |
3262 %if vertical == 0 | |
3263 mova m7, [hmul_8p] | |
3264 %endif | |
3265 SA8D_8x8 | |
3266 add r0, 8*SIZEOF_PIXEL | |
3267 add r2, 8*SIZEOF_PIXEL | |
3268 SA8D_8x8 | |
3269 add r0, 8*SIZEOF_PIXEL | |
3270 add r2, 8*SIZEOF_PIXEL | |
3271 SA8D_8x8 | |
3272 add r0, 8*SIZEOF_PIXEL | |
3273 add r2, 8*SIZEOF_PIXEL | |
3274 SA8D_8x8 | |
3275 lea r0, [r0 + r1*8] | |
3276 lea r2, [r2 + r3*8] | |
3277 SA8D_8x8 | |
3278 sub r0, 8*SIZEOF_PIXEL | |
3279 sub r2, 8*SIZEOF_PIXEL | |
3280 SA8D_8x8 | |
3281 sub r0, 8*SIZEOF_PIXEL | |
3282 sub r2, 8*SIZEOF_PIXEL | |
3283 SA8D_8x8 | |
3284 sub r0, 8*SIZEOF_PIXEL | |
3285 sub r2, 8*SIZEOF_PIXEL | |
3286 SA8D_8x8 | |
3287 lea r0, [r0 + r1*8] | |
3288 lea r2, [r2 + r3*8] | |
3289 SA8D_8x8 | |
3290 add r0, 8*SIZEOF_PIXEL | |
3291 add r2, 8*SIZEOF_PIXEL | |
3292 SA8D_8x8 | |
3293 add r0, 8*SIZEOF_PIXEL | |
3294 add r2, 8*SIZEOF_PIXEL | |
3295 SA8D_8x8 | |
3296 add r0, 8*SIZEOF_PIXEL | |
3297 add r2, 8*SIZEOF_PIXEL | |
3298 SA8D_8x8 | |
3299 movd eax, m12 | |
3300 RET | |
3301 | |
3302 cglobal pixel_sa8d_32x32, 4,8,13 | |
3303 FIX_STRIDES r1, r3 | |
3304 lea r4, [3*r1] | |
3305 lea r5, [3*r3] | |
3306 pxor m12, m12 | |
3307 %if vertical == 0 | |
3308 mova m7, [hmul_8p] | |
3309 %endif | |
3310 SA8D_16x16 | |
3311 lea r4, [8*r1] | |
3312 lea r5, [8*r3] | |
3313 sub r0, r4 | |
3314 sub r2, r5 | |
3315 add r2, 16*SIZEOF_PIXEL | |
3316 add r0, 16*SIZEOF_PIXEL | |
3317 lea r4, [3*r1] | |
3318 lea r5, [3*r3] | |
3319 SA8D_16x16 | |
3320 lea r0, [r0+8*r1] | |
3321 lea r2, [r2+8*r3] | |
3322 SA8D_16x16 | |
3323 lea r4, [8*r1] | |
3324 lea r5, [8*r3] | |
3325 sub r0, r4 | |
3326 sub r2, r5 | |
3327 sub r2, 16*SIZEOF_PIXEL | |
3328 sub r0, 16*SIZEOF_PIXEL | |
3329 lea r4, [3*r1] | |
3330 lea r5, [3*r3] | |
3331 SA8D_16x16 | |
3332 movd eax, m12 | |
3333 RET | |
3334 | |
3335 cglobal pixel_sa8d_32x64, 4,8,13 | |
3336 FIX_STRIDES r1, r3 | |
3337 lea r4, [3*r1] | |
3338 lea r5, [3*r3] | |
3339 pxor m12, m12 | |
3340 %if vertical == 0 | |
3341 mova m7, [hmul_8p] | |
3342 %endif | |
3343 SA8D_16x16 | |
3344 lea r4, [8*r1] | |
3345 lea r5, [8*r3] | |
3346 sub r0, r4 | |
3347 sub r2, r5 | |
3348 add r2, 16*SIZEOF_PIXEL | |
3349 add r0, 16*SIZEOF_PIXEL | |
3350 lea r4, [3*r1] | |
3351 lea r5, [3*r3] | |
3352 SA8D_16x16 | |
3353 lea r0, [r0+8*r1] | |
3354 lea r2, [r2+8*r3] | |
3355 SA8D_16x16 | |
3356 lea r4, [8*r1] | |
3357 lea r5, [8*r3] | |
3358 sub r0, r4 | |
3359 sub r2, r5 | |
3360 sub r2, 16*SIZEOF_PIXEL | |
3361 sub r0, 16*SIZEOF_PIXEL | |
3362 lea r4, [3*r1] | |
3363 lea r5, [3*r3] | |
3364 SA8D_16x16 | |
3365 lea r0, [r0+8*r1] | |
3366 lea r2, [r2+8*r3] | |
3367 SA8D_16x16 | |
3368 lea r4, [8*r1] | |
3369 lea r5, [8*r3] | |
3370 sub r0, r4 | |
3371 sub r2, r5 | |
3372 add r2, 16*SIZEOF_PIXEL | |
3373 add r0, 16*SIZEOF_PIXEL | |
3374 lea r4, [3*r1] | |
3375 lea r5, [3*r3] | |
3376 SA8D_16x16 | |
3377 lea r0, [r0+8*r1] | |
3378 lea r2, [r2+8*r3] | |
3379 SA8D_16x16 | |
3380 lea r4, [8*r1] | |
3381 lea r5, [8*r3] | |
3382 sub r0, r4 | |
3383 sub r2, r5 | |
3384 sub r2, 16*SIZEOF_PIXEL | |
3385 sub r0, 16*SIZEOF_PIXEL | |
3386 lea r4, [3*r1] | |
3387 lea r5, [3*r3] | |
3388 SA8D_16x16 | |
3389 movd eax, m12 | |
3390 RET | |
3391 | |
3392 cglobal pixel_sa8d_48x64, 4,8,13 | |
3393 FIX_STRIDES r1, r3 | |
3394 lea r4, [3*r1] | |
3395 lea r5, [3*r3] | |
3396 pxor m12, m12 | |
3397 %if vertical == 0 | |
3398 mova m7, [hmul_8p] | |
3399 %endif | |
3400 SA8D_16x16 | |
3401 lea r4, [8*r1] | |
3402 lea r5, [8*r3] | |
3403 sub r0, r4 | |
3404 sub r2, r5 | |
3405 add r2, 16*SIZEOF_PIXEL | |
3406 add r0, 16*SIZEOF_PIXEL | |
3407 lea r4, [3*r1] | |
3408 lea r5, [3*r3] | |
3409 SA8D_16x16 | |
3410 lea r4, [8*r1] | |
3411 lea r5, [8*r3] | |
3412 sub r0, r4 | |
3413 sub r2, r5 | |
3414 add r2, 16*SIZEOF_PIXEL | |
3415 add r0, 16*SIZEOF_PIXEL | |
3416 lea r4, [3*r1] | |
3417 lea r5, [3*r3] | |
3418 SA8D_16x16 | |
3419 lea r0, [r0+8*r1] | |
3420 lea r2, [r2+8*r3] | |
3421 SA8D_16x16 | |
3422 lea r4, [8*r1] | |
3423 lea r5, [8*r3] | |
3424 sub r0, r4 | |
3425 sub r2, r5 | |
3426 sub r2, 16*SIZEOF_PIXEL | |
3427 sub r0, 16*SIZEOF_PIXEL | |
3428 lea r4, [3*r1] | |
3429 lea r5, [3*r3] | |
3430 SA8D_16x16 | |
3431 lea r4, [8*r1] | |
3432 lea r5, [8*r3] | |
3433 sub r0, r4 | |
3434 sub r2, r5 | |
3435 sub r2, 16*SIZEOF_PIXEL | |
3436 sub r0, 16*SIZEOF_PIXEL | |
3437 lea r4, [3*r1] | |
3438 lea r5, [3*r3] | |
3439 SA8D_16x16 | |
3440 lea r0, [r0+8*r1] | |
3441 lea r2, [r2+8*r3] | |
3442 SA8D_16x16 | |
3443 lea r4, [8*r1] | |
3444 lea r5, [8*r3] | |
3445 sub r0, r4 | |
3446 sub r2, r5 | |
3447 add r2, 16*SIZEOF_PIXEL | |
3448 add r0, 16*SIZEOF_PIXEL | |
3449 lea r4, [3*r1] | |
3450 lea r5, [3*r3] | |
3451 SA8D_16x16 | |
3452 lea r4, [8*r1] | |
3453 lea r5, [8*r3] | |
3454 sub r0, r4 | |
3455 sub r2, r5 | |
3456 add r2, 16*SIZEOF_PIXEL | |
3457 add r0, 16*SIZEOF_PIXEL | |
3458 lea r4, [3*r1] | |
3459 lea r5, [3*r3] | |
3460 SA8D_16x16 | |
3461 lea r0, [r0+8*r1] | |
3462 lea r2, [r2+8*r3] | |
3463 SA8D_16x16 | |
3464 lea r4, [8*r1] | |
3465 lea r5, [8*r3] | |
3466 sub r0, r4 | |
3467 sub r2, r5 | |
3468 sub r2, 16*SIZEOF_PIXEL | |
3469 sub r0, 16*SIZEOF_PIXEL | |
3470 lea r4, [3*r1] | |
3471 lea r5, [3*r3] | |
3472 SA8D_16x16 | |
3473 lea r4, [8*r1] | |
3474 lea r5, [8*r3] | |
3475 sub r0, r4 | |
3476 sub r2, r5 | |
3477 sub r2, 16*SIZEOF_PIXEL | |
3478 sub r0, 16*SIZEOF_PIXEL | |
3479 lea r4, [3*r1] | |
3480 lea r5, [3*r3] | |
3481 SA8D_16x16 | |
3482 movd eax, m12 | |
3483 RET | |
3484 | |
3485 cglobal pixel_sa8d_64x16, 4,8,13 | |
3486 FIX_STRIDES r1, r3 | |
3487 lea r4, [3*r1] | |
3488 lea r5, [3*r3] | |
3489 pxor m12, m12 | |
3490 %if vertical == 0 | |
3491 mova m7, [hmul_8p] | |
3492 %endif | |
3493 SA8D_16x16 | |
3494 lea r4, [8*r1] | |
3495 lea r5, [8*r3] | |
3496 sub r0, r4 | |
3497 sub r2, r5 | |
3498 add r2, 16*SIZEOF_PIXEL | |
3499 add r0, 16*SIZEOF_PIXEL | |
3500 lea r4, [3*r1] | |
3501 lea r5, [3*r3] | |
3502 SA8D_16x16 | |
3503 lea r4, [8*r1] | |
3504 lea r5, [8*r3] | |
3505 sub r0, r4 | |
3506 sub r2, r5 | |
3507 add r2, 16*SIZEOF_PIXEL | |
3508 add r0, 16*SIZEOF_PIXEL | |
3509 lea r4, [3*r1] | |
3510 lea r5, [3*r3] | |
3511 SA8D_16x16 | |
3512 lea r4, [8*r1] | |
3513 lea r5, [8*r3] | |
3514 sub r0, r4 | |
3515 sub r2, r5 | |
3516 add r2, 16*SIZEOF_PIXEL | |
3517 add r0, 16*SIZEOF_PIXEL | |
3518 lea r4, [3*r1] | |
3519 lea r5, [3*r3] | |
3520 SA8D_16x16 | |
3521 movd eax, m12 | |
3522 RET | |
3523 | |
3524 cglobal pixel_sa8d_64x32, 4,8,13 | |
3525 FIX_STRIDES r1, r3 | |
3526 lea r4, [3*r1] | |
3527 lea r5, [3*r3] | |
3528 pxor m12, m12 | |
3529 %if vertical == 0 | |
3530 mova m7, [hmul_8p] | |
3531 %endif | |
3532 SA8D_16x16 | |
3533 lea r4, [8*r1] | |
3534 lea r5, [8*r3] | |
3535 sub r0, r4 | |
3536 sub r2, r5 | |
3537 add r2, 16*SIZEOF_PIXEL | |
3538 add r0, 16*SIZEOF_PIXEL | |
3539 lea r4, [3*r1] | |
3540 lea r5, [3*r3] | |
3541 SA8D_16x16 | |
3542 lea r4, [8*r1] | |
3543 lea r5, [8*r3] | |
3544 sub r0, r4 | |
3545 sub r2, r5 | |
3546 add r2, 16*SIZEOF_PIXEL | |
3547 add r0, 16*SIZEOF_PIXEL | |
3548 lea r4, [3*r1] | |
3549 lea r5, [3*r3] | |
3550 SA8D_16x16 | |
3551 lea r4, [8*r1] | |
3552 lea r5, [8*r3] | |
3553 sub r0, r4 | |
3554 sub r2, r5 | |
3555 add r2, 16*SIZEOF_PIXEL | |
3556 add r0, 16*SIZEOF_PIXEL | |
3557 lea r4, [3*r1] | |
3558 lea r5, [3*r3] | |
3559 SA8D_16x16 | |
3560 lea r0, [r0+8*r1] | |
3561 lea r2, [r2+8*r3] | |
3562 SA8D_16x16 | |
3563 lea r4, [8*r1] | |
3564 lea r5, [8*r3] | |
3565 sub r0, r4 | |
3566 sub r2, r5 | |
3567 sub r2, 16*SIZEOF_PIXEL | |
3568 sub r0, 16*SIZEOF_PIXEL | |
3569 lea r4, [3*r1] | |
3570 lea r5, [3*r3] | |
3571 SA8D_16x16 | |
3572 lea r4, [8*r1] | |
3573 lea r5, [8*r3] | |
3574 sub r0, r4 | |
3575 sub r2, r5 | |
3576 sub r2, 16*SIZEOF_PIXEL | |
3577 sub r0, 16*SIZEOF_PIXEL | |
3578 lea r4, [3*r1] | |
3579 lea r5, [3*r3] | |
3580 SA8D_16x16 | |
3581 lea r4, [8*r1] | |
3582 lea r5, [8*r3] | |
3583 sub r0, r4 | |
3584 sub r2, r5 | |
3585 sub r2, 16*SIZEOF_PIXEL | |
3586 sub r0, 16*SIZEOF_PIXEL | |
3587 lea r4, [3*r1] | |
3588 lea r5, [3*r3] | |
3589 SA8D_16x16 | |
3590 movd eax, m12 | |
3591 RET | |
3592 | |
3593 cglobal pixel_sa8d_64x48, 4,8,13 | |
3594 FIX_STRIDES r1, r3 | |
3595 lea r4, [3*r1] | |
3596 lea r5, [3*r3] | |
3597 pxor m12, m12 | |
3598 %if vertical == 0 | |
3599 mova m7, [hmul_8p] | |
3600 %endif | |
3601 SA8D_16x16 | |
3602 lea r4, [8*r1] | |
3603 lea r5, [8*r3] | |
3604 sub r0, r4 | |
3605 sub r2, r5 | |
3606 add r2, 16*SIZEOF_PIXEL | |
3607 add r0, 16*SIZEOF_PIXEL | |
3608 lea r4, [3*r1] | |
3609 lea r5, [3*r3] | |
3610 SA8D_16x16 | |
3611 lea r4, [8*r1] | |
3612 lea r5, [8*r3] | |
3613 sub r0, r4 | |
3614 sub r2, r5 | |
3615 add r2, 16*SIZEOF_PIXEL | |
3616 add r0, 16*SIZEOF_PIXEL | |
3617 lea r4, [3*r1] | |
3618 lea r5, [3*r3] | |
3619 SA8D_16x16 | |
3620 lea r4, [8*r1] | |
3621 lea r5, [8*r3] | |
3622 sub r0, r4 | |
3623 sub r2, r5 | |
3624 add r2, 16*SIZEOF_PIXEL | |
3625 add r0, 16*SIZEOF_PIXEL | |
3626 lea r4, [3*r1] | |
3627 lea r5, [3*r3] | |
3628 SA8D_16x16 | |
3629 lea r0, [r0+8*r1] | |
3630 lea r2, [r2+8*r3] | |
3631 SA8D_16x16 | |
3632 lea r4, [8*r1] | |
3633 lea r5, [8*r3] | |
3634 sub r0, r4 | |
3635 sub r2, r5 | |
3636 sub r2, 16*SIZEOF_PIXEL | |
3637 sub r0, 16*SIZEOF_PIXEL | |
3638 lea r4, [3*r1] | |
3639 lea r5, [3*r3] | |
3640 SA8D_16x16 | |
3641 lea r4, [8*r1] | |
3642 lea r5, [8*r3] | |
3643 sub r0, r4 | |
3644 sub r2, r5 | |
3645 sub r2, 16*SIZEOF_PIXEL | |
3646 sub r0, 16*SIZEOF_PIXEL | |
3647 lea r4, [3*r1] | |
3648 lea r5, [3*r3] | |
3649 SA8D_16x16 | |
3650 lea r4, [8*r1] | |
3651 lea r5, [8*r3] | |
3652 sub r0, r4 | |
3653 sub r2, r5 | |
3654 sub r2, 16*SIZEOF_PIXEL | |
3655 sub r0, 16*SIZEOF_PIXEL | |
3656 lea r4, [3*r1] | |
3657 lea r5, [3*r3] | |
3658 SA8D_16x16 | |
3659 lea r0, [r0+8*r1] | |
3660 lea r2, [r2+8*r3] | |
3661 SA8D_16x16 | |
3662 lea r4, [8*r1] | |
3663 lea r5, [8*r3] | |
3664 sub r0, r4 | |
3665 sub r2, r5 | |
3666 add r2, 16*SIZEOF_PIXEL | |
3667 add r0, 16*SIZEOF_PIXEL | |
3668 lea r4, [3*r1] | |
3669 lea r5, [3*r3] | |
3670 SA8D_16x16 | |
3671 lea r4, [8*r1] | |
3672 lea r5, [8*r3] | |
3673 sub r0, r4 | |
3674 sub r2, r5 | |
3675 add r2, 16*SIZEOF_PIXEL | |
3676 add r0, 16*SIZEOF_PIXEL | |
3677 lea r4, [3*r1] | |
3678 lea r5, [3*r3] | |
3679 SA8D_16x16 | |
3680 lea r4, [8*r1] | |
3681 lea r5, [8*r3] | |
3682 sub r0, r4 | |
3683 sub r2, r5 | |
3684 add r2, 16*SIZEOF_PIXEL | |
3685 add r0, 16*SIZEOF_PIXEL | |
3686 lea r4, [3*r1] | |
3687 lea r5, [3*r3] | |
3688 SA8D_16x16 | |
3689 movd eax, m12 | |
3690 RET | |
3691 | |
3692 cglobal pixel_sa8d_64x64, 4,8,13 | |
3693 FIX_STRIDES r1, r3 | |
3694 lea r4, [3*r1] | |
3695 lea r5, [3*r3] | |
3696 pxor m12, m12 | |
3697 %if vertical == 0 | |
3698 mova m7, [hmul_8p] | |
3699 %endif | |
3700 SA8D_16x16 | |
3701 lea r4, [8*r1] | |
3702 lea r5, [8*r3] | |
3703 sub r0, r4 | |
3704 sub r2, r5 | |
3705 add r2, 16*SIZEOF_PIXEL | |
3706 add r0, 16*SIZEOF_PIXEL | |
3707 lea r4, [3*r1] | |
3708 lea r5, [3*r3] | |
3709 SA8D_16x16 | |
3710 lea r4, [8*r1] | |
3711 lea r5, [8*r3] | |
3712 sub r0, r4 | |
3713 sub r2, r5 | |
3714 add r2, 16*SIZEOF_PIXEL | |
3715 add r0, 16*SIZEOF_PIXEL | |
3716 lea r4, [3*r1] | |
3717 lea r5, [3*r3] | |
3718 SA8D_16x16 | |
3719 lea r4, [8*r1] | |
3720 lea r5, [8*r3] | |
3721 sub r0, r4 | |
3722 sub r2, r5 | |
3723 add r2, 16*SIZEOF_PIXEL | |
3724 add r0, 16*SIZEOF_PIXEL | |
3725 lea r4, [3*r1] | |
3726 lea r5, [3*r3] | |
3727 SA8D_16x16 | |
3728 lea r0, [r0+8*r1] | |
3729 lea r2, [r2+8*r3] | |
3730 SA8D_16x16 | |
3731 lea r4, [8*r1] | |
3732 lea r5, [8*r3] | |
3733 sub r0, r4 | |
3734 sub r2, r5 | |
3735 sub r2, 16*SIZEOF_PIXEL | |
3736 sub r0, 16*SIZEOF_PIXEL | |
3737 lea r4, [3*r1] | |
3738 lea r5, [3*r3] | |
3739 SA8D_16x16 | |
3740 lea r4, [8*r1] | |
3741 lea r5, [8*r3] | |
3742 sub r0, r4 | |
3743 sub r2, r5 | |
3744 sub r2, 16*SIZEOF_PIXEL | |
3745 sub r0, 16*SIZEOF_PIXEL | |
3746 lea r4, [3*r1] | |
3747 lea r5, [3*r3] | |
3748 SA8D_16x16 | |
3749 lea r4, [8*r1] | |
3750 lea r5, [8*r3] | |
3751 sub r0, r4 | |
3752 sub r2, r5 | |
3753 sub r2, 16*SIZEOF_PIXEL | |
3754 sub r0, 16*SIZEOF_PIXEL | |
3755 lea r4, [3*r1] | |
3756 lea r5, [3*r3] | |
3757 SA8D_16x16 | |
3758 lea r0, [r0+8*r1] | |
3759 lea r2, [r2+8*r3] | |
3760 SA8D_16x16 | |
3761 lea r4, [8*r1] | |
3762 lea r5, [8*r3] | |
3763 sub r0, r4 | |
3764 sub r2, r5 | |
3765 add r2, 16*SIZEOF_PIXEL | |
3766 add r0, 16*SIZEOF_PIXEL | |
3767 lea r4, [3*r1] | |
3768 lea r5, [3*r3] | |
3769 SA8D_16x16 | |
3770 lea r4, [8*r1] | |
3771 lea r5, [8*r3] | |
3772 sub r0, r4 | |
3773 sub r2, r5 | |
3774 add r2, 16*SIZEOF_PIXEL | |
3775 add r0, 16*SIZEOF_PIXEL | |
3776 lea r4, [3*r1] | |
3777 lea r5, [3*r3] | |
3778 SA8D_16x16 | |
3779 lea r4, [8*r1] | |
3780 lea r5, [8*r3] | |
3781 sub r0, r4 | |
3782 sub r2, r5 | |
3783 add r2, 16*SIZEOF_PIXEL | |
3784 add r0, 16*SIZEOF_PIXEL | |
3785 lea r4, [3*r1] | |
3786 lea r5, [3*r3] | |
3787 SA8D_16x16 | |
3788 lea r0, [r0+8*r1] | |
3789 lea r2, [r2+8*r3] | |
3790 SA8D_16x16 | |
3791 lea r4, [8*r1] | |
3792 lea r5, [8*r3] | |
3793 sub r0, r4 | |
3794 sub r2, r5 | |
3795 sub r2, 16*SIZEOF_PIXEL | |
3796 sub r0, 16*SIZEOF_PIXEL | |
3797 lea r4, [3*r1] | |
3798 lea r5, [3*r3] | |
3799 SA8D_16x16 | |
3800 lea r4, [8*r1] | |
3801 lea r5, [8*r3] | |
3802 sub r0, r4 | |
3803 sub r2, r5 | |
3804 sub r2, 16*SIZEOF_PIXEL | |
3805 sub r0, 16*SIZEOF_PIXEL | |
3806 lea r4, [3*r1] | |
3807 lea r5, [3*r3] | |
3808 SA8D_16x16 | |
3809 lea r4, [8*r1] | |
3810 lea r5, [8*r3] | |
3811 sub r0, r4 | |
3812 sub r2, r5 | |
3813 sub r2, 16*SIZEOF_PIXEL | |
3814 sub r0, 16*SIZEOF_PIXEL | |
3815 lea r4, [3*r1] | |
3816 lea r5, [3*r3] | |
3817 SA8D_16x16 | |
3818 movd eax, m12 | |
3819 RET | |
3820 | |
3821 %else ; ARCH_X86_32 | |
3822 %if mmsize == 16 | |
3823 cglobal pixel_sa8d_8x8_internal | |
3824 %define spill0 [esp+4] | |
3825 %define spill1 [esp+20] | |
3826 %define spill2 [esp+36] | |
3827 %if vertical | |
3828 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 | |
3829 HADAMARD4_2D 0, 1, 2, 3, 4 | |
3830 movdqa spill0, m3 | |
3831 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 | |
3832 HADAMARD4_2D 4, 5, 6, 7, 3 | |
3833 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax | |
3834 movdqa m3, spill0 | |
3835 paddw m0, m1 | |
3836 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax | |
3837 %else ; mmsize == 8 | |
3838 mova m7, [hmul_8p] | |
3839 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1 | |
3840 ; could do first HADAMARD4_V here to save spilling later | |
3841 ; surprisingly, not a win on conroe or even p4 | |
3842 mova spill0, m2 | |
3843 mova spill1, m3 | |
3844 mova spill2, m1 | |
3845 SWAP 1, 7 | |
3846 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1 | |
3847 HADAMARD4_V 4, 5, 6, 7, 3 | |
3848 mova m1, spill2 | |
3849 mova m2, spill0 | |
3850 mova m3, spill1 | |
3851 mova spill0, m6 | |
3852 mova spill1, m7 | |
3853 HADAMARD4_V 0, 1, 2, 3, 7 | |
3854 SUMSUB_BADC w, 0, 4, 1, 5, 7 | |
3855 HADAMARD 2, sumsub, 0, 4, 7, 6 | |
3856 HADAMARD 2, sumsub, 1, 5, 7, 6 | |
3857 HADAMARD 1, amax, 0, 4, 7, 6 | |
3858 HADAMARD 1, amax, 1, 5, 7, 6 | |
3859 mova m6, spill0 | |
3860 mova m7, spill1 | |
3861 paddw m0, m1 | |
3862 SUMSUB_BADC w, 2, 6, 3, 7, 4 | |
3863 HADAMARD 2, sumsub, 2, 6, 4, 5 | |
3864 HADAMARD 2, sumsub, 3, 7, 4, 5 | |
3865 HADAMARD 1, amax, 2, 6, 4, 5 | |
3866 HADAMARD 1, amax, 3, 7, 4, 5 | |
3867 %endif ; sse2/non-sse2 | |
3868 paddw m0, m2 | |
3869 paddw m0, m3 | |
3870 SAVE_MM_PERMUTATION | |
3871 ret | |
3872 %endif ; ifndef mmx2 | |
3873 | |
3874 cglobal pixel_sa8d_8x8_internal2 | |
3875 %define spill0 [esp+4] | |
3876 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 | |
3877 HADAMARD4_2D 0, 1, 2, 3, 4 | |
3878 movdqa spill0, m3 | |
3879 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 | |
3880 HADAMARD4_2D 4, 5, 6, 7, 3 | |
3881 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax | |
3882 movdqa m3, spill0 | |
3883 paddw m0, m1 | |
3884 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax | |
3885 paddw m0, m2 | |
3886 paddw m0, m3 | |
3887 SAVE_MM_PERMUTATION | |
3888 ret | |
3889 | |
3890 cglobal pixel_sa8d_8x8, 4,7 | |
3891 FIX_STRIDES r1, r3 | |
3892 mov r6, esp | |
3893 and esp, ~15 | |
3894 sub esp, 48 | |
3895 lea r4, [3*r1] | |
3896 lea r5, [3*r3] | |
3897 call pixel_sa8d_8x8_internal | |
3898 %if HIGH_BIT_DEPTH | |
3899 HADDUW m0, m1 | |
3900 %else | |
3901 HADDW m0, m1 | |
3902 %endif ; HIGH_BIT_DEPTH | |
3903 movd eax, m0 | |
3904 add eax, 1 | |
3905 shr eax, 1 | |
3906 mov esp, r6 | |
3907 RET | |
3908 | |
3909 cglobal pixel_sa8d_16x16, 4,7 | |
3910 FIX_STRIDES r1, r3 | |
3911 mov r6, esp | |
3912 and esp, ~15 | |
3913 sub esp, 64 | |
3914 lea r4, [3*r1] | |
3915 lea r5, [3*r3] | |
3916 call pixel_sa8d_8x8_internal | |
3917 %if mmsize == 8 | |
3918 lea r0, [r0+4*r1] | |
3919 lea r2, [r2+4*r3] | |
3920 %endif | |
3921 %if HIGH_BIT_DEPTH | |
3922 HADDUW m0, m1 | |
3923 %endif | |
3924 mova [esp+48], m0 | |
3925 call pixel_sa8d_8x8_internal | |
3926 mov r0, [r6+20] | |
3927 mov r2, [r6+28] | |
3928 add r0, 8*SIZEOF_PIXEL | |
3929 add r2, 8*SIZEOF_PIXEL | |
3930 SA8D_INTER | |
3931 mova [esp+48], m0 | |
3932 call pixel_sa8d_8x8_internal | |
3933 %if mmsize == 8 | |
3934 lea r0, [r0+4*r1] | |
3935 lea r2, [r2+4*r3] | |
3936 %else | |
3937 SA8D_INTER | |
3938 %endif | |
3939 mova [esp+64-mmsize], m0 | |
3940 call pixel_sa8d_8x8_internal | |
3941 %if HIGH_BIT_DEPTH | |
3942 SA8D_INTER | |
3943 %else ; !HIGH_BIT_DEPTH | |
3944 paddusw m0, [esp+64-mmsize] | |
3945 %if mmsize == 16 | |
3946 HADDUW m0, m1 | |
3947 %else | |
3948 mova m2, [esp+48] | |
3949 pxor m7, m7 | |
3950 mova m1, m0 | |
3951 mova m3, m2 | |
3952 punpcklwd m0, m7 | |
3953 punpckhwd m1, m7 | |
3954 punpcklwd m2, m7 | |
3955 punpckhwd m3, m7 | |
3956 paddd m0, m1 | |
3957 paddd m2, m3 | |
3958 paddd m0, m2 | |
3959 HADDD m0, m1 | |
3960 %endif | |
3961 %endif ; HIGH_BIT_DEPTH | |
3962 movd eax, m0 | |
3963 add eax, 1 | |
3964 shr eax, 1 | |
3965 mov esp, r6 | |
3966 RET | |
3967 | |
3968 cglobal pixel_sa8d_8x16, 4,7,8 | |
3969 FIX_STRIDES r1, r3 | |
3970 mov r6, esp | |
3971 and esp, ~15 | |
3972 sub esp, 64 | |
3973 | |
3974 lea r4, [r1 + 2*r1] | |
3975 lea r5, [r3 + 2*r3] | |
3976 call pixel_sa8d_8x8_internal2 | |
3977 HADDUW m0, m1 | |
3978 movd r4d, m0 | |
3979 add r4d, 1 | |
3980 shr r4d, 1 | |
3981 mov dword [esp+36], r4d | |
3982 | |
3983 mov r0, [r6+20] | |
3984 mov r2, [r6+28] | |
3985 lea r0, [r0 + r1*8] | |
3986 lea r2, [r2 + r3*8] | |
3987 lea r4, [r1 + 2*r1] | |
3988 call pixel_sa8d_8x8_internal2 | |
3989 HADDUW m0, m1 | |
3990 movd r4d, m0 | |
3991 add r4d, 1 | |
3992 shr r4d, 1 | |
3993 add r4d, dword [esp+36] | |
3994 mov eax, r4d | |
3995 mov esp, r6 | |
3996 RET | |
3997 | |
3998 cglobal pixel_sa8d_8x32, 4,7,8 | |
3999 FIX_STRIDES r1, r3 | |
4000 mov r6, esp | |
4001 and esp, ~15 | |
4002 sub esp, 64 | |
4003 | |
4004 lea r4, [r1 + 2*r1] | |
4005 lea r5, [r3 + 2*r3] | |
4006 call pixel_sa8d_8x8_internal2 | |
4007 HADDUW m0, m1 | |
4008 movd r4d, m0 | |
4009 add r4d, 1 | |
4010 shr r4d, 1 | |
4011 mov dword [esp+36], r4d | |
4012 | |
4013 mov r0, [r6+20] | |
4014 mov r2, [r6+28] | |
4015 lea r0, [r0 + r1*8] | |
4016 lea r2, [r2 + r3*8] | |
4017 lea r4, [r1 + 2*r1] | |
4018 call pixel_sa8d_8x8_internal2 | |
4019 HADDUW m0, m1 | |
4020 movd r4d, m0 | |
4021 add r4d, 1 | |
4022 shr r4d, 1 | |
4023 add r4d, dword [esp+36] | |
4024 mov dword [esp+36], r4d | |
4025 | |
4026 mov r0, [r6+20] | |
4027 mov r2, [r6+28] | |
4028 lea r0, [r0 + r1*8] | |
4029 lea r2, [r2 + r3*8] | |
4030 lea r0, [r0 + r1*8] | |
4031 lea r2, [r2 + r3*8] | |
4032 lea r4, [r1 + 2*r1] | |
4033 call pixel_sa8d_8x8_internal2 | |
4034 HADDUW m0, m1 | |
4035 movd r4d, m0 | |
4036 add r4d, 1 | |
4037 shr r4d, 1 | |
4038 add r4d, dword [esp+36] | |
4039 mov dword [esp+36], r4d | |
4040 | |
4041 mov r0, [r6+20] | |
4042 mov r2, [r6+28] | |
4043 lea r0, [r0 + r1*8] | |
4044 lea r2, [r2 + r3*8] | |
4045 lea r0, [r0 + r1*8] | |
4046 lea r2, [r2 + r3*8] | |
4047 lea r0, [r0 + r1*8] | |
4048 lea r2, [r2 + r3*8] | |
4049 lea r4, [r1 + 2*r1] | |
4050 call pixel_sa8d_8x8_internal2 | |
4051 HADDUW m0, m1 | |
4052 movd r4d, m0 | |
4053 add r4d, 1 | |
4054 shr r4d, 1 | |
4055 add r4d, dword [esp+36] | |
4056 mov eax, r4d | |
4057 mov esp, r6 | |
4058 RET | |
4059 | |
4060 cglobal pixel_sa8d_16x8, 4,7,8 | |
4061 FIX_STRIDES r1, r3 | |
4062 mov r6, esp | |
4063 and esp, ~15 | |
4064 sub esp, 64 | |
4065 | |
4066 lea r4, [r1 + 2*r1] | |
4067 lea r5, [r3 + 2*r3] | |
4068 call pixel_sa8d_8x8_internal2 | |
4069 HADDUW m0, m1 | |
4070 movd r4d, m0 | |
4071 add r4d, 1 | |
4072 shr r4d, 1 | |
4073 mov dword [esp+36], r4d | |
4074 | |
4075 mov r0, [r6+20] | |
4076 mov r2, [r6+28] | |
4077 add r0, 8*SIZEOF_PIXEL | |
4078 add r2, 8*SIZEOF_PIXEL | |
4079 lea r4, [r1 + 2*r1] | |
4080 call pixel_sa8d_8x8_internal2 | |
4081 HADDUW m0, m1 | |
4082 movd r4d, m0 | |
4083 add r4d, 1 | |
4084 shr r4d, 1 | |
4085 add r4d, dword [esp+36] | |
4086 mov eax, r4d | |
4087 mov esp, r6 | |
4088 RET | |
4089 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_16x32( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Two stacked 16x16 halves; each half is four 8x8 SA8D calls
; combined with SA8D_INTER through the spill slot at [esp+48], then rounded
; (x+1)>>1 and accumulated in [esp+36].  r6 preserves the caller's esp;
; [r6+20]/[r6+28] re-read the pix1/pix2 stack arguments.
; Fix: the first spill used [rsp+48] where every sibling store in this 32-bit
; path uses [esp+48]; x86inc aliases rsp to esp on x86-32, so this normalizes
; spelling without changing the generated code.
;-----------------------------------------------------------------------------
4090 cglobal pixel_sa8d_16x32, 4,7,8 | |
4091 FIX_STRIDES r1, r3 | |
4092 mov r6, esp | |
4093 and esp, ~15 | |
4094 sub esp, 64 | |
4095 | |
4096 lea r4, [r1 + 2*r1] | |
4097 lea r5, [r3 + 2*r3] | |
4098 call pixel_sa8d_8x8_internal2 | |
4099 %if HIGH_BIT_DEPTH | |
4100 HADDUW m0, m1 | |
4101 %endif | |
4102 mova [esp+48], m0 | |
4103 call pixel_sa8d_8x8_internal2 | |
4104 SA8D_INTER | |
4105 mova [esp+48], m0 | |
4106 | |
4107 mov r0, [r6+20] | |
4108 mov r2, [r6+28] | |
4109 add r0, 8*SIZEOF_PIXEL | |
4110 add r2, 8*SIZEOF_PIXEL | |
4111 call pixel_sa8d_8x8_internal2 | |
4112 SA8D_INTER | |
4113 mova [esp+48], m0 | |
4114 call pixel_sa8d_8x8_internal2 | |
4115 SA8D_INTER | |
4116 %if HIGH_BIT_DEPTH == 0 | |
4117 HADDUW m0, m1 | |
4118 %endif | |
4119 movd r4d, m0 | |
4120 add r4d, 1 | |
4121 shr r4d, 1 | |
4122 mov dword [esp+36], r4d | |
4123 | |
; lower 16x16 half: advance both planes by 16 rows
4124 mov r0, [r6+20] | |
4125 mov r2, [r6+28] | |
4126 lea r0, [r0 + r1*8] | |
4127 lea r2, [r2 + r3*8] | |
4128 lea r0, [r0 + r1*8] | |
4129 lea r2, [r2 + r3*8] | |
4130 lea r4, [r1 + 2*r1] | |
4131 call pixel_sa8d_8x8_internal2 | |
4132 %if HIGH_BIT_DEPTH | |
4133 HADDUW m0, m1 | |
4134 %endif | |
4135 mova [esp+48], m0 | |
4136 call pixel_sa8d_8x8_internal2 | |
4137 SA8D_INTER | |
4138 mova [esp+48], m0 | |
4139 | |
4140 mov r0, [r6+20] | |
4141 mov r2, [r6+28] | |
4142 lea r0, [r0 + r1*8] | |
4143 lea r2, [r2 + r3*8] | |
4144 lea r0, [r0 + r1*8] | |
4145 lea r2, [r2 + r3*8] | |
4146 add r0, 8*SIZEOF_PIXEL | |
4147 add r2, 8*SIZEOF_PIXEL | |
4148 call pixel_sa8d_8x8_internal2 | |
4149 SA8D_INTER | |
4150 mova [esp+48], m0 | |
4151 call pixel_sa8d_8x8_internal2 | |
4152 SA8D_INTER | |
4153 %if HIGH_BIT_DEPTH == 0 | |
4154 HADDUW m0, m1 | |
4155 %endif | |
4156 movd r4d, m0 | |
4157 add r4d, 1 | |
4158 shr r4d, 1 | |
4159 add r4d, dword [esp+36] | |
4160 mov eax, r4d | |
4161 mov esp, r6 | |
4162 RET | |
4163 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_16x64( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Four stacked 16x16 bands; between bands the advanced row
; pointers are written back to the argument slots [r6+20]/[r6+28] so the next
; band can reload them.  Partials combine via SA8D_INTER/AVG_16x16 through the
; spill slots at [esp+48] and [esp+64-mmsize]; the running total lives in
; [esp+36].
; Fix: the first spill used [rsp+48] where every sibling store in this 32-bit
; path uses [esp+48]; x86inc aliases rsp to esp on x86-32, so this normalizes
; spelling without changing the generated code.
;-----------------------------------------------------------------------------
4164 cglobal pixel_sa8d_16x64, 4,7,8 | |
4165 FIX_STRIDES r1, r3 | |
4166 mov r6, esp | |
4167 and esp, ~15 | |
4168 sub esp, 64 | |
4169 | |
4170 lea r4, [r1 + 2*r1] | |
4171 lea r5, [r3 + 2*r3] | |
4172 call pixel_sa8d_8x8_internal2 | |
4173 %if HIGH_BIT_DEPTH | |
4174 HADDUW m0, m1 | |
4175 %endif | |
4176 mova [esp+48], m0 | |
4177 call pixel_sa8d_8x8_internal2 | |
4178 SA8D_INTER | |
4179 mova [esp+48], m0 | |
4180 | |
4181 mov r0, [r6+20] | |
4182 mov r2, [r6+28] | |
4183 add r0, 8*SIZEOF_PIXEL | |
4184 add r2, 8*SIZEOF_PIXEL | |
4185 call pixel_sa8d_8x8_internal2 | |
4186 SA8D_INTER | |
4187 mova [esp+48], m0 | |
4188 call pixel_sa8d_8x8_internal2 | |
4189 SA8D_INTER | |
4190 %if HIGH_BIT_DEPTH == 0 | |
4191 HADDUW m0, m1 | |
4192 %endif | |
4193 movd r4d, m0 | |
4194 add r4d, 1 | |
4195 shr r4d, 1 | |
4196 mov dword [esp+36], r4d | |
4197 | |
; band 2 of 4: advance 16 rows and persist the pointers for later reloads
4198 mov r0, [r6+20] | |
4199 mov r2, [r6+28] | |
4200 lea r0, [r0 + r1*8] | |
4201 lea r2, [r2 + r3*8] | |
4202 lea r0, [r0 + r1*8] | |
4203 lea r2, [r2 + r3*8] | |
4204 mov [r6+20], r0 | |
4205 mov [r6+28], r2 | |
4206 | |
4207 lea r4, [r1 + 2*r1] | |
4208 call pixel_sa8d_8x8_internal2 | |
4209 %if HIGH_BIT_DEPTH | |
4210 HADDUW m0, m1 | |
4211 %endif | |
4212 mova [esp+48], m0 | |
4213 call pixel_sa8d_8x8_internal2 | |
4214 SA8D_INTER | |
4215 mova [esp+48], m0 | |
4216 | |
4217 mov r0, [r6+20] | |
4218 mov r2, [r6+28] | |
4219 add r0, 8*SIZEOF_PIXEL | |
4220 add r2, 8*SIZEOF_PIXEL | |
4221 call pixel_sa8d_8x8_internal2 | |
4222 SA8D_INTER | |
4223 mova [esp+64-mmsize], m0 | |
4224 call pixel_sa8d_8x8_internal2 | |
4225 AVG_16x16 | |
4226 | |
; band 3 of 4
4227 mov r0, [r6+20] | |
4228 mov r2, [r6+28] | |
4229 lea r0, [r0 + r1*8] | |
4230 lea r2, [r2 + r3*8] | |
4231 lea r0, [r0 + r1*8] | |
4232 lea r2, [r2 + r3*8] | |
4233 mov [r6+20], r0 | |
4234 mov [r6+28], r2 | |
4235 | |
4236 lea r4, [r1 + 2*r1] | |
4237 call pixel_sa8d_8x8_internal2 | |
4238 %if HIGH_BIT_DEPTH | |
4239 HADDUW m0, m1 | |
4240 %endif | |
4241 mova [esp+48], m0 | |
4242 call pixel_sa8d_8x8_internal2 | |
4243 SA8D_INTER | |
4244 mova [esp+48], m0 | |
4245 | |
4246 mov r0, [r6+20] | |
4247 mov r2, [r6+28] | |
4248 add r0, 8*SIZEOF_PIXEL | |
4249 add r2, 8*SIZEOF_PIXEL | |
4250 call pixel_sa8d_8x8_internal2 | |
4251 SA8D_INTER | |
4252 mova [esp+64-mmsize], m0 | |
4253 call pixel_sa8d_8x8_internal2 | |
4254 AVG_16x16 | |
4255 | |
; band 4 of 4
4256 mov r0, [r6+20] | |
4257 mov r2, [r6+28] | |
4258 lea r0, [r0 + r1*8] | |
4259 lea r2, [r2 + r3*8] | |
4260 lea r0, [r0 + r1*8] | |
4261 lea r2, [r2 + r3*8] | |
4262 mov [r6+20], r0 | |
4263 mov [r6+28], r2 | |
4264 | |
4265 lea r4, [r1 + 2*r1] | |
4266 call pixel_sa8d_8x8_internal2 | |
4267 %if HIGH_BIT_DEPTH | |
4268 HADDUW m0, m1 | |
4269 %endif | |
4270 mova [esp+48], m0 | |
4271 call pixel_sa8d_8x8_internal2 | |
4272 SA8D_INTER | |
4273 mova [esp+48], m0 | |
4274 | |
4275 mov r0, [r6+20] | |
4276 mov r2, [r6+28] | |
4277 add r0, 8*SIZEOF_PIXEL | |
4278 add r2, 8*SIZEOF_PIXEL | |
4279 call pixel_sa8d_8x8_internal2 | |
4280 SA8D_INTER | |
4281 mova [esp+64-mmsize], m0 | |
4282 call pixel_sa8d_8x8_internal2 | |
4283 SA8D_INTER | |
4284 %if HIGH_BIT_DEPTH == 0 | |
4285 HADDUW m0, m1 | |
4286 %endif | |
4287 movd r4d, m0 | |
4288 add r4d, 1 | |
4289 shr r4d, 1 | |
4290 add r4d, dword [esp+36] | |
4291 mov eax, r4d | |
4292 mov esp, r6 | |
4293 RET | |
4294 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_24x32( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Twelve independent 8x8 SA8D calls (3 across x 4 down); each
; partial is reduced with HADDUW, rounded (x+1)>>1, and accumulated in
; [esp+36].  Between row bands the advanced pointers are written back to the
; argument slots [r6+20]/[r6+28] so the next band reloads from there.
;-----------------------------------------------------------------------------
4295 cglobal pixel_sa8d_24x32, 4,7,8 | |
4296 FIX_STRIDES r1, r3 | |
4297 mov r6, esp | |
4298 and esp, ~15 | |
4299 sub esp, 64 | |
4300 | |
4301 lea r4, [r1 + 2*r1] | |
4302 lea r5, [r3 + 2*r3] | |
4303 call pixel_sa8d_8x8_internal2 | |
4304 HADDUW m0, m1 | |
4305 movd r4d, m0 | |
4306 add r4d, 1 | |
4307 shr r4d, 1 | |
4308 mov dword [esp+36], r4d | |
4309 | |
4310 mov r0, [r6+20] | |
4311 mov r2, [r6+28] | |
4312 add r0, 8*SIZEOF_PIXEL | |
4313 add r2, 8*SIZEOF_PIXEL | |
4314 lea r4, [r1 + 2*r1] | |
4315 call pixel_sa8d_8x8_internal2 | |
4316 HADDUW m0, m1 | |
4317 movd r4d, m0 | |
4318 add r4d, 1 | |
4319 shr r4d, 1 | |
4320 add r4d, dword [esp+36] | |
4321 mov dword [esp+36], r4d | |
4322 | |
4323 mov r0, [r6+20] | |
4324 mov r2, [r6+28] | |
4325 add r0, 16*SIZEOF_PIXEL | |
4326 add r2, 16*SIZEOF_PIXEL | |
4327 lea r4, [r1 + 2*r1] | |
4328 call pixel_sa8d_8x8_internal2 | |
4329 HADDUW m0, m1 | |
4330 movd r4d, m0 | |
4331 add r4d, 1 | |
4332 shr r4d, 1 | |
4333 add r4d, dword [esp+36] | |
4334 mov dword [esp+36], r4d | |
4335 | |
; row band 2: advance 8 rows and persist pointers
4336 mov r0, [r6+20] | |
4337 mov r2, [r6+28] | |
4338 lea r0, [r0 + r1*8] | |
4339 lea r2, [r2 + r3*8] | |
4340 mov [r6+20], r0 | |
4341 mov [r6+28], r2 | |
4342 lea r4, [r1 + 2*r1] | |
4343 call pixel_sa8d_8x8_internal2 | |
4344 HADDUW m0, m1 | |
4345 movd r4d, m0 | |
4346 add r4d, 1 | |
4347 shr r4d, 1 | |
4348 add r4d, dword [esp+36] | |
4349 mov dword [esp+36], r4d | |
4350 | |
4351 mov r0, [r6+20] | |
4352 mov r2, [r6+28] | |
4353 add r0, 8*SIZEOF_PIXEL | |
4354 add r2, 8*SIZEOF_PIXEL | |
4355 lea r4, [r1 + 2*r1] | |
4356 call pixel_sa8d_8x8_internal2 | |
4357 HADDUW m0, m1 | |
4358 movd r4d, m0 | |
4359 add r4d, 1 | |
4360 shr r4d, 1 | |
4361 add r4d, dword [esp+36] | |
4362 mov dword [esp+36], r4d | |
4363 | |
4364 mov r0, [r6+20] | |
4365 mov r2, [r6+28] | |
4366 add r0, 16*SIZEOF_PIXEL | |
4367 add r2, 16*SIZEOF_PIXEL | |
4368 lea r4, [r1 + 2*r1] | |
4369 call pixel_sa8d_8x8_internal2 | |
4370 HADDUW m0, m1 | |
4371 movd r4d, m0 | |
4372 add r4d, 1 | |
4373 shr r4d, 1 | |
4374 add r4d, dword [esp+36] | |
4375 mov dword [esp+36], r4d | |
4376 | |
; row band 3
4377 mov r0, [r6+20] | |
4378 mov r2, [r6+28] | |
4379 lea r0, [r0 + r1*8] | |
4380 lea r2, [r2 + r3*8] | |
4381 mov [r6+20], r0 | |
4382 mov [r6+28], r2 | |
4383 lea r4, [r1 + 2*r1] | |
4384 call pixel_sa8d_8x8_internal2 | |
4385 HADDUW m0, m1 | |
4386 movd r4d, m0 | |
4387 add r4d, 1 | |
4388 shr r4d, 1 | |
4389 add r4d, dword [esp+36] | |
4390 mov dword [esp+36], r4d | |
4391 | |
4392 mov r0, [r6+20] | |
4393 mov r2, [r6+28] | |
4394 add r0, 8*SIZEOF_PIXEL | |
4395 add r2, 8*SIZEOF_PIXEL | |
4396 lea r4, [r1 + 2*r1] | |
4397 call pixel_sa8d_8x8_internal2 | |
4398 HADDUW m0, m1 | |
4399 movd r4d, m0 | |
4400 add r4d, 1 | |
4401 shr r4d, 1 | |
4402 add r4d, dword [esp+36] | |
4403 mov dword [esp+36], r4d | |
4404 | |
4405 mov r0, [r6+20] | |
4406 mov r2, [r6+28] | |
4407 add r0, 16*SIZEOF_PIXEL | |
4408 add r2, 16*SIZEOF_PIXEL | |
4409 lea r4, [r1 + 2*r1] | |
4410 call pixel_sa8d_8x8_internal2 | |
4411 HADDUW m0, m1 | |
4412 movd r4d, m0 | |
4413 add r4d, 1 | |
4414 shr r4d, 1 | |
4415 add r4d, dword [esp+36] | |
4416 mov dword [esp+36], r4d | |
4417 | |
; row band 4
4418 mov r0, [r6+20] | |
4419 mov r2, [r6+28] | |
4420 lea r0, [r0 + r1*8] | |
4421 lea r2, [r2 + r3*8] | |
4422 mov [r6+20], r0 | |
4423 mov [r6+28], r2 | |
4424 lea r4, [r1 + 2*r1] | |
4425 call pixel_sa8d_8x8_internal2 | |
4426 HADDUW m0, m1 | |
4427 movd r4d, m0 | |
4428 add r4d, 1 | |
4429 shr r4d, 1 | |
4430 add r4d, dword [esp+36] | |
4431 mov dword [esp+36], r4d | |
4432 | |
4433 mov r0, [r6+20] | |
4434 mov r2, [r6+28] | |
4435 add r0, 8*SIZEOF_PIXEL | |
4436 add r2, 8*SIZEOF_PIXEL | |
4437 lea r4, [r1 + 2*r1] | |
4438 call pixel_sa8d_8x8_internal2 | |
4439 HADDUW m0, m1 | |
4440 movd r4d, m0 | |
4441 add r4d, 1 | |
4442 shr r4d, 1 | |
4443 add r4d, dword [esp+36] | |
4444 mov dword [esp+36], r4d | |
4445 | |
4446 mov r0, [r6+20] | |
4447 mov r2, [r6+28] | |
4448 add r0, 16*SIZEOF_PIXEL | |
4449 add r2, 16*SIZEOF_PIXEL | |
4450 lea r4, [r1 + 2*r1] | |
4451 call pixel_sa8d_8x8_internal2 | |
4452 HADDUW m0, m1 | |
4453 movd r4d, m0 | |
4454 add r4d, 1 | |
4455 shr r4d, 1 | |
4456 add r4d, dword [esp+36] | |
4457 mov eax, r4d | |
4458 mov esp, r6 | |
4459 RET | |
4460 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_32x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  One row of four 8x8 SA8D blocks at column offsets 0/8/16/24;
; each partial is reduced with HADDUW, rounded (x+1)>>1, and accumulated in
; [esp+36].  [r6+20]/[r6+28] re-read the pix1/pix2 stack arguments.
;-----------------------------------------------------------------------------
4461 cglobal pixel_sa8d_32x8, 4,7,8 | |
4462 FIX_STRIDES r1, r3 | |
4463 mov r6, esp | |
4464 and esp, ~15 | |
4465 sub esp, 64 | |
4466 | |
4467 lea r4, [r1 + 2*r1] | |
4468 lea r5, [r3 + 2*r3] | |
4469 call pixel_sa8d_8x8_internal2 | |
4470 HADDUW m0, m1 | |
4471 movd r4d, m0 | |
4472 add r4d, 1 | |
4473 shr r4d, 1 | |
4474 mov dword [esp+36], r4d | |
4475 | |
4476 mov r0, [r6+20] | |
4477 mov r2, [r6+28] | |
4478 add r0, 8*SIZEOF_PIXEL | |
4479 add r2, 8*SIZEOF_PIXEL | |
4480 lea r4, [r1 + 2*r1] | |
4481 call pixel_sa8d_8x8_internal2 | |
4482 HADDUW m0, m1 | |
4483 movd r4d, m0 | |
4484 add r4d, 1 | |
4485 shr r4d, 1 | |
4486 add r4d, dword [esp+36] | |
4487 mov dword [esp+36], r4d | |
4488 | |
4489 mov r0, [r6+20] | |
4490 mov r2, [r6+28] | |
4491 add r0, 16*SIZEOF_PIXEL | |
4492 add r2, 16*SIZEOF_PIXEL | |
4493 lea r4, [r1 + 2*r1] | |
4494 call pixel_sa8d_8x8_internal2 | |
4495 HADDUW m0, m1 | |
4496 movd r4d, m0 | |
4497 add r4d, 1 | |
4498 shr r4d, 1 | |
4499 add r4d, dword [esp+36] | |
4500 mov dword [esp+36], r4d | |
4501 | |
4502 mov r0, [r6+20] | |
4503 mov r2, [r6+28] | |
4504 add r0, 24*SIZEOF_PIXEL | |
4505 add r2, 24*SIZEOF_PIXEL | |
4506 lea r4, [r1 + 2*r1] | |
4507 call pixel_sa8d_8x8_internal2 | |
4508 HADDUW m0, m1 | |
4509 movd r4d, m0 | |
4510 add r4d, 1 | |
4511 shr r4d, 1 | |
4512 add r4d, dword [esp+36] | |
4513 mov eax, r4d | |
4514 mov esp, r6 | |
4515 RET | |
4516 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_32x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Two 16x16 halves side by side; within a half the four 8x8
; partials are combined with SA8D_INTER through the spill slots [esp+48] and
; [esp+64-mmsize], then rounded (x+1)>>1 and accumulated in [esp+36].
; Fix: the first spill used [rsp+48] where every sibling store in this 32-bit
; path uses [esp+48]; x86inc aliases rsp to esp on x86-32, so this normalizes
; spelling without changing the generated code.
;-----------------------------------------------------------------------------
4517 cglobal pixel_sa8d_32x16, 4,7,8 | |
4518 FIX_STRIDES r1, r3 | |
4519 mov r6, esp | |
4520 and esp, ~15 | |
4521 sub esp, 64 | |
4522 | |
4523 lea r4, [r1 + 2*r1] | |
4524 lea r5, [r3 + 2*r3] | |
4525 call pixel_sa8d_8x8_internal2 | |
4526 %if HIGH_BIT_DEPTH | |
4527 HADDUW m0, m1 | |
4528 %endif | |
4529 mova [esp+48], m0 | |
4530 call pixel_sa8d_8x8_internal2 | |
4531 SA8D_INTER | |
4532 mova [esp+48], m0 | |
4533 | |
4534 mov r0, [r6+20] | |
4535 mov r2, [r6+28] | |
4536 add r0, 8*SIZEOF_PIXEL | |
4537 add r2, 8*SIZEOF_PIXEL | |
4538 call pixel_sa8d_8x8_internal2 | |
4539 SA8D_INTER | |
4540 mova [esp+48], m0 | |
4541 call pixel_sa8d_8x8_internal2 | |
4542 SA8D_INTER | |
4543 %if HIGH_BIT_DEPTH == 0 | |
4544 HADDUW m0, m1 | |
4545 %endif | |
4546 movd r4d, m0 | |
4547 add r4d, 1 | |
4548 shr r4d, 1 | |
4549 mov dword [esp+36], r4d | |
4550 | |
; right 16x16 half at column offset 16
4551 mov r0, [r6+20] | |
4552 mov r2, [r6+28] | |
4553 add r0, 16*SIZEOF_PIXEL | |
4554 add r2, 16*SIZEOF_PIXEL | |
4555 lea r4, [r1 + 2*r1] | |
4556 call pixel_sa8d_8x8_internal2 | |
4557 %if HIGH_BIT_DEPTH | |
4558 HADDUW m0, m1 | |
4559 %endif | |
4560 mova [esp+48], m0 | |
4561 call pixel_sa8d_8x8_internal2 | |
4562 SA8D_INTER | |
4563 mova [esp+48], m0 | |
4564 | |
4565 mov r0, [r6+20] | |
4566 mov r2, [r6+28] | |
4567 add r0, 24*SIZEOF_PIXEL | |
4568 add r2, 24*SIZEOF_PIXEL | |
4569 call pixel_sa8d_8x8_internal2 | |
4570 SA8D_INTER | |
4571 mova [esp+64-mmsize], m0 | |
4572 call pixel_sa8d_8x8_internal2 | |
4573 SA8D_INTER | |
4574 %if HIGH_BIT_DEPTH == 0 | |
4575 HADDUW m0, m1 | |
4576 %endif | |
4577 movd r4d, m0 | |
4578 add r4d, 1 | |
4579 shr r4d, 1 | |
4580 add r4d, dword [esp+36] | |
4581 mov eax, r4d | |
4582 mov esp, r6 | |
4583 RET | |
4584 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_32x24( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Twelve independent 8x8 SA8D calls (4 across x 3 down); each
; partial is reduced with HADDUW, rounded (x+1)>>1, and accumulated in
; [esp+36].  Between row bands the advanced pointers are written back to the
; argument slots [r6+20]/[r6+28] so the next band reloads from there.
;-----------------------------------------------------------------------------
4585 cglobal pixel_sa8d_32x24, 4,7,8 | |
4586 FIX_STRIDES r1, r3 | |
4587 mov r6, esp | |
4588 and esp, ~15 | |
4589 sub esp, 64 | |
4590 | |
4591 lea r4, [r1 + 2*r1] | |
4592 lea r5, [r3 + 2*r3] | |
4593 call pixel_sa8d_8x8_internal2 | |
4594 HADDUW m0, m1 | |
4595 movd r4d, m0 | |
4596 add r4d, 1 | |
4597 shr r4d, 1 | |
4598 mov dword [esp+36], r4d | |
4599 | |
4600 mov r0, [r6+20] | |
4601 mov r2, [r6+28] | |
4602 add r0, 8*SIZEOF_PIXEL | |
4603 add r2, 8*SIZEOF_PIXEL | |
4604 lea r4, [r1 + 2*r1] | |
4605 call pixel_sa8d_8x8_internal2 | |
4606 HADDUW m0, m1 | |
4607 movd r4d, m0 | |
4608 add r4d, 1 | |
4609 shr r4d, 1 | |
4610 add r4d, dword [esp+36] | |
4611 mov dword [esp+36], r4d | |
4612 | |
4613 mov r0, [r6+20] | |
4614 mov r2, [r6+28] | |
4615 add r0, 16*SIZEOF_PIXEL | |
4616 add r2, 16*SIZEOF_PIXEL | |
4617 lea r4, [r1 + 2*r1] | |
4618 call pixel_sa8d_8x8_internal2 | |
4619 HADDUW m0, m1 | |
4620 movd r4d, m0 | |
4621 add r4d, 1 | |
4622 shr r4d, 1 | |
4623 add r4d, dword [esp+36] | |
4624 mov dword [esp+36], r4d | |
4625 | |
4626 mov r0, [r6+20] | |
4627 mov r2, [r6+28] | |
4628 add r0, 24*SIZEOF_PIXEL | |
4629 add r2, 24*SIZEOF_PIXEL | |
4630 lea r4, [r1 + 2*r1] | |
4631 call pixel_sa8d_8x8_internal2 | |
4632 HADDUW m0, m1 | |
4633 movd r4d, m0 | |
4634 add r4d, 1 | |
4635 shr r4d, 1 | |
4636 add r4d, dword [esp+36] | |
4637 mov dword [esp+36], r4d | |
4638 | |
; row band 2: advance 8 rows and persist pointers
4639 mov r0, [r6+20] | |
4640 mov r2, [r6+28] | |
4641 lea r0, [r0 + r1*8] | |
4642 lea r2, [r2 + r3*8] | |
4643 mov [r6+20], r0 | |
4644 mov [r6+28], r2 | |
4645 lea r4, [r1 + 2*r1] | |
4646 call pixel_sa8d_8x8_internal2 | |
4647 HADDUW m0, m1 | |
4648 movd r4d, m0 | |
4649 add r4d, 1 | |
4650 shr r4d, 1 | |
4651 add r4d, dword [esp+36] | |
4652 mov dword [esp+36], r4d | |
4653 | |
4654 mov r0, [r6+20] | |
4655 mov r2, [r6+28] | |
4656 add r0, 8*SIZEOF_PIXEL | |
4657 add r2, 8*SIZEOF_PIXEL | |
4658 lea r4, [r1 + 2*r1] | |
4659 call pixel_sa8d_8x8_internal2 | |
4660 HADDUW m0, m1 | |
4661 movd r4d, m0 | |
4662 add r4d, 1 | |
4663 shr r4d, 1 | |
4664 add r4d, dword [esp+36] | |
4665 mov dword [esp+36], r4d | |
4666 | |
4667 mov r0, [r6+20] | |
4668 mov r2, [r6+28] | |
4669 add r0, 16*SIZEOF_PIXEL | |
4670 add r2, 16*SIZEOF_PIXEL | |
4671 lea r4, [r1 + 2*r1] | |
4672 call pixel_sa8d_8x8_internal2 | |
4673 HADDUW m0, m1 | |
4674 movd r4d, m0 | |
4675 add r4d, 1 | |
4676 shr r4d, 1 | |
4677 add r4d, dword [esp+36] | |
4678 mov dword [esp+36], r4d | |
4679 | |
4680 mov r0, [r6+20] | |
4681 mov r2, [r6+28] | |
4682 add r0, 24*SIZEOF_PIXEL | |
4683 add r2, 24*SIZEOF_PIXEL | |
4684 lea r4, [r1 + 2*r1] | |
4685 call pixel_sa8d_8x8_internal2 | |
4686 HADDUW m0, m1 | |
4687 movd r4d, m0 | |
4688 add r4d, 1 | |
4689 shr r4d, 1 | |
4690 add r4d, dword [esp+36] | |
4691 mov dword [esp+36], r4d | |
4692 | |
; row band 3
4693 mov r0, [r6+20] | |
4694 mov r2, [r6+28] | |
4695 lea r0, [r0 + r1*8] | |
4696 lea r2, [r2 + r3*8] | |
4697 mov [r6+20], r0 | |
4698 mov [r6+28], r2 | |
4699 lea r4, [r1 + 2*r1] | |
4700 call pixel_sa8d_8x8_internal2 | |
4701 HADDUW m0, m1 | |
4702 movd r4d, m0 | |
4703 add r4d, 1 | |
4704 shr r4d, 1 | |
4705 add r4d, dword [esp+36] | |
4706 mov dword [esp+36], r4d | |
4707 | |
4708 mov r0, [r6+20] | |
4709 mov r2, [r6+28] | |
4710 add r0, 8*SIZEOF_PIXEL | |
4711 add r2, 8*SIZEOF_PIXEL | |
4712 lea r4, [r1 + 2*r1] | |
4713 call pixel_sa8d_8x8_internal2 | |
4714 HADDUW m0, m1 | |
4715 movd r4d, m0 | |
4716 add r4d, 1 | |
4717 shr r4d, 1 | |
4718 add r4d, dword [esp+36] | |
4719 mov dword [esp+36], r4d | |
4720 | |
4721 mov r0, [r6+20] | |
4722 mov r2, [r6+28] | |
4723 add r0, 16*SIZEOF_PIXEL | |
4724 add r2, 16*SIZEOF_PIXEL | |
4725 lea r4, [r1 + 2*r1] | |
4726 call pixel_sa8d_8x8_internal2 | |
4727 HADDUW m0, m1 | |
4728 movd r4d, m0 | |
4729 add r4d, 1 | |
4730 shr r4d, 1 | |
4731 add r4d, dword [esp+36] | |
4732 mov dword [esp+36], r4d | |
4733 | |
4734 mov r0, [r6+20] | |
4735 mov r2, [r6+28] | |
4736 add r0, 24*SIZEOF_PIXEL | |
4737 add r2, 24*SIZEOF_PIXEL | |
4738 lea r4, [r1 + 2*r1] | |
4739 call pixel_sa8d_8x8_internal2 | |
4740 HADDUW m0, m1 | |
4741 movd r4d, m0 | |
4742 add r4d, 1 | |
4743 shr r4d, 1 | |
4744 add r4d, dword [esp+36] | |
4745 mov eax, r4d | |
4746 mov esp, r6 | |
4747 RET | |
4748 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_32x32( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Four 16x16 quadrants; partials combine with SA8D_INTER /
; AVG_16x16 through the spill slots [esp+48] and [esp+64-mmsize], then round
; (x+1)>>1 and accumulate in [esp+36].  [r6+20]/[r6+28] re-read the pix1/pix2
; stack arguments.
; Fix: the first spill used [rsp+48] where every sibling store in this 32-bit
; path uses [esp+48]; x86inc aliases rsp to esp on x86-32, so this normalizes
; spelling without changing the generated code.
;-----------------------------------------------------------------------------
4749 cglobal pixel_sa8d_32x32, 4,7,8 | |
4750 FIX_STRIDES r1, r3 | |
4751 mov r6, esp | |
4752 and esp, ~15 | |
4753 sub esp, 64 | |
4754 | |
4755 lea r4, [r1 + 2*r1] | |
4756 lea r5, [r3 + 2*r3] | |
4757 call pixel_sa8d_8x8_internal2 | |
4758 %if HIGH_BIT_DEPTH | |
4759 HADDUW m0, m1 | |
4760 %endif | |
4761 mova [esp+48], m0 | |
4762 call pixel_sa8d_8x8_internal2 | |
4763 SA8D_INTER | |
4764 mova [esp+48], m0 | |
4765 | |
4766 mov r0, [r6+20] | |
4767 mov r2, [r6+28] | |
4768 add r0, 8*SIZEOF_PIXEL | |
4769 add r2, 8*SIZEOF_PIXEL | |
4770 call pixel_sa8d_8x8_internal2 | |
4771 SA8D_INTER | |
4772 mova [esp+48], m0 | |
4773 call pixel_sa8d_8x8_internal2 | |
4774 SA8D_INTER | |
4775 %if HIGH_BIT_DEPTH == 0 | |
4776 HADDUW m0, m1 | |
4777 %endif | |
4778 movd r4d, m0 | |
4779 add r4d, 1 | |
4780 shr r4d, 1 | |
4781 mov dword [esp+36], r4d | |
4782 | |
; top-right quadrant at column offset 16
4783 mov r0, [r6+20] | |
4784 mov r2, [r6+28] | |
4785 add r0, 16*SIZEOF_PIXEL | |
4786 add r2, 16*SIZEOF_PIXEL | |
4787 lea r4, [r1 + 2*r1] | |
4788 call pixel_sa8d_8x8_internal2 | |
4789 %if HIGH_BIT_DEPTH | |
4790 HADDUW m0, m1 | |
4791 %endif | |
4792 mova [esp+48], m0 | |
4793 call pixel_sa8d_8x8_internal2 | |
4794 SA8D_INTER | |
4795 mova [esp+48], m0 | |
4796 | |
4797 mov r0, [r6+20] | |
4798 mov r2, [r6+28] | |
4799 add r0, 24*SIZEOF_PIXEL | |
4800 add r2, 24*SIZEOF_PIXEL | |
4801 call pixel_sa8d_8x8_internal2 | |
4802 SA8D_INTER | |
4803 mova [esp+64-mmsize], m0 | |
4804 call pixel_sa8d_8x8_internal2 | |
4805 AVG_16x16 | |
4806 | |
; bottom-left quadrant: advance 16 rows
4807 mov r0, [r6+20] | |
4808 mov r2, [r6+28] | |
4809 lea r0, [r0 + r1*8] | |
4810 lea r2, [r2 + r3*8] | |
4811 lea r0, [r0 + r1*8] | |
4812 lea r2, [r2 + r3*8] | |
4813 lea r4, [r1 + 2*r1] | |
4814 call pixel_sa8d_8x8_internal2 | |
4815 %if HIGH_BIT_DEPTH | |
4816 HADDUW m0, m1 | |
4817 %endif | |
4818 mova [esp+48], m0 | |
4819 call pixel_sa8d_8x8_internal2 | |
4820 SA8D_INTER | |
4821 mova [esp+48], m0 | |
4822 | |
4823 mov r0, [r6+20] | |
4824 mov r2, [r6+28] | |
4825 lea r0, [r0 + r1*8] | |
4826 lea r2, [r2 + r3*8] | |
4827 lea r0, [r0 + r1*8] | |
4828 lea r2, [r2 + r3*8] | |
4829 add r0, 8*SIZEOF_PIXEL | |
4830 add r2, 8*SIZEOF_PIXEL | |
4831 call pixel_sa8d_8x8_internal2 | |
4832 SA8D_INTER | |
4833 mova [esp+64-mmsize], m0 | |
4834 call pixel_sa8d_8x8_internal2 | |
4835 AVG_16x16 | |
4836 | |
; bottom-right quadrant
4837 mov r0, [r6+20] | |
4838 mov r2, [r6+28] | |
4839 lea r0, [r0 + r1*8] | |
4840 lea r2, [r2 + r3*8] | |
4841 lea r0, [r0 + r1*8] | |
4842 lea r2, [r2 + r3*8] | |
4843 add r0, 16*SIZEOF_PIXEL | |
4844 add r2, 16*SIZEOF_PIXEL | |
4845 lea r4, [r1 + 2*r1] | |
4846 call pixel_sa8d_8x8_internal2 | |
4847 %if HIGH_BIT_DEPTH | |
4848 HADDUW m0, m1 | |
4849 %endif | |
4850 mova [esp+48], m0 | |
4851 call pixel_sa8d_8x8_internal2 | |
4852 SA8D_INTER | |
4853 mova [esp+48], m0 | |
4854 | |
4855 mov r0, [r6+20] | |
4856 mov r2, [r6+28] | |
4857 lea r0, [r0 + r1*8] | |
4858 lea r2, [r2 + r3*8] | |
4859 lea r0, [r0 + r1*8] | |
4860 lea r2, [r2 + r3*8] | |
4861 add r0, 24*SIZEOF_PIXEL | |
4862 add r2, 24*SIZEOF_PIXEL | |
4863 call pixel_sa8d_8x8_internal2 | |
4864 SA8D_INTER | |
4865 mova [esp+64-mmsize], m0 | |
4866 call pixel_sa8d_8x8_internal2 | |
4867 SA8D_INTER | |
4868 %if HIGH_BIT_DEPTH == 0 | |
4869 HADDUW m0, m1 | |
4870 %endif | |
4871 movd r4d, m0 | |
4872 add r4d, 1 | |
4873 shr r4d, 1 | |
4874 add r4d, dword [esp+36] | |
4875 mov eax, r4d | |
4876 mov esp, r6 | |
4877 RET | |
4878 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_32x64( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Eight 16x16 tiles (2 across x 4 down); between row bands the
; advanced pointers are written back to the argument slots [r6+20]/[r6+28].
; Partials combine with SA8D_INTER / AVG_16x16 through the spill slots
; [esp+48] and [esp+64-mmsize]; the running total lives in [esp+36].
; Fix: the first spill used [rsp+48] where every sibling store in this 32-bit
; path uses [esp+48]; x86inc aliases rsp to esp on x86-32, so this normalizes
; spelling without changing the generated code.
;-----------------------------------------------------------------------------
4879 cglobal pixel_sa8d_32x64, 4,7,8 | |
4880 FIX_STRIDES r1, r3 | |
4881 mov r6, esp | |
4882 and esp, ~15 | |
4883 sub esp, 64 | |
4884 | |
4885 lea r4, [r1 + 2*r1] | |
4886 lea r5, [r3 + 2*r3] | |
4887 call pixel_sa8d_8x8_internal2 | |
4888 %if HIGH_BIT_DEPTH | |
4889 HADDUW m0, m1 | |
4890 %endif | |
4891 mova [esp+48], m0 | |
4892 call pixel_sa8d_8x8_internal2 | |
4893 SA8D_INTER | |
4894 mova [esp+48], m0 | |
4895 | |
4896 mov r0, [r6+20] | |
4897 mov r2, [r6+28] | |
4898 add r0, 8*SIZEOF_PIXEL | |
4899 add r2, 8*SIZEOF_PIXEL | |
4900 call pixel_sa8d_8x8_internal2 | |
4901 SA8D_INTER | |
4902 mova [esp+48], m0 | |
4903 call pixel_sa8d_8x8_internal2 | |
4904 SA8D_INTER | |
4905 %if HIGH_BIT_DEPTH == 0 | |
4906 HADDUW m0, m1 | |
4907 %endif | |
4908 movd r4d, m0 | |
4909 add r4d, 1 | |
4910 shr r4d, 1 | |
4911 mov dword [esp+36], r4d | |
4912 | |
4913 mov r0, [r6+20] | |
4914 mov r2, [r6+28] | |
4915 add r0, 16*SIZEOF_PIXEL | |
4916 add r2, 16*SIZEOF_PIXEL | |
4917 lea r4, [r1 + 2*r1] | |
4918 call pixel_sa8d_8x8_internal2 | |
4919 %if HIGH_BIT_DEPTH | |
4920 HADDUW m0, m1 | |
4921 %endif | |
4922 mova [esp+48], m0 | |
4923 call pixel_sa8d_8x8_internal2 | |
4924 SA8D_INTER | |
4925 mova [esp+48], m0 | |
4926 | |
4927 mov r0, [r6+20] | |
4928 mov r2, [r6+28] | |
4929 add r0, 24*SIZEOF_PIXEL | |
4930 add r2, 24*SIZEOF_PIXEL | |
4931 call pixel_sa8d_8x8_internal2 | |
4932 SA8D_INTER | |
4933 mova [esp+64-mmsize], m0 | |
4934 call pixel_sa8d_8x8_internal2 | |
4935 AVG_16x16 | |
4936 | |
; row band 2: advance 16 rows and persist pointers
4937 mov r0, [r6+20] | |
4938 mov r2, [r6+28] | |
4939 lea r0, [r0 + r1*8] | |
4940 lea r2, [r2 + r3*8] | |
4941 lea r0, [r0 + r1*8] | |
4942 lea r2, [r2 + r3*8] | |
4943 mov [r6+20], r0 | |
4944 mov [r6+28], r2 | |
4945 | |
4946 lea r4, [r1 + 2*r1] | |
4947 call pixel_sa8d_8x8_internal2 | |
4948 %if HIGH_BIT_DEPTH | |
4949 HADDUW m0, m1 | |
4950 %endif | |
4951 mova [esp+48], m0 | |
4952 call pixel_sa8d_8x8_internal2 | |
4953 SA8D_INTER | |
4954 mova [esp+48], m0 | |
4955 | |
4956 mov r0, [r6+20] | |
4957 mov r2, [r6+28] | |
4958 add r0, 8*SIZEOF_PIXEL | |
4959 add r2, 8*SIZEOF_PIXEL | |
4960 call pixel_sa8d_8x8_internal2 | |
4961 SA8D_INTER | |
4962 mova [esp+64-mmsize], m0 | |
4963 call pixel_sa8d_8x8_internal2 | |
4964 AVG_16x16 | |
4965 | |
4966 mov r0, [r6+20] | |
4967 mov r2, [r6+28] | |
4968 add r0, 16*SIZEOF_PIXEL | |
4969 add r2, 16*SIZEOF_PIXEL | |
4970 lea r4, [r1 + 2*r1] | |
4971 call pixel_sa8d_8x8_internal2 | |
4972 %if HIGH_BIT_DEPTH | |
4973 HADDUW m0, m1 | |
4974 %endif | |
4975 mova [esp+48], m0 | |
4976 call pixel_sa8d_8x8_internal2 | |
4977 SA8D_INTER | |
4978 mova [esp+48], m0 | |
4979 | |
4980 mov r0, [r6+20] | |
4981 mov r2, [r6+28] | |
4982 add r0, 24*SIZEOF_PIXEL | |
4983 add r2, 24*SIZEOF_PIXEL | |
4984 call pixel_sa8d_8x8_internal2 | |
4985 SA8D_INTER | |
4986 mova [esp+64-mmsize], m0 | |
4987 call pixel_sa8d_8x8_internal2 | |
4988 AVG_16x16 | |
4989 | |
; row band 3
4990 mov r0, [r6+20] | |
4991 mov r2, [r6+28] | |
4992 lea r0, [r0 + r1*8] | |
4993 lea r2, [r2 + r3*8] | |
4994 lea r0, [r0 + r1*8] | |
4995 lea r2, [r2 + r3*8] | |
4996 mov [r6+20], r0 | |
4997 mov [r6+28], r2 | |
4998 | |
4999 lea r4, [r1 + 2*r1] | |
5000 call pixel_sa8d_8x8_internal2 | |
5001 %if HIGH_BIT_DEPTH | |
5002 HADDUW m0, m1 | |
5003 %endif | |
5004 mova [esp+48], m0 | |
5005 call pixel_sa8d_8x8_internal2 | |
5006 SA8D_INTER | |
5007 mova [esp+48], m0 | |
5008 | |
5009 mov r0, [r6+20] | |
5010 mov r2, [r6+28] | |
5011 add r0, 8*SIZEOF_PIXEL | |
5012 add r2, 8*SIZEOF_PIXEL | |
5013 call pixel_sa8d_8x8_internal2 | |
5014 SA8D_INTER | |
5015 mova [esp+64-mmsize], m0 | |
5016 call pixel_sa8d_8x8_internal2 | |
5017 AVG_16x16 | |
5018 | |
5019 mov r0, [r6+20] | |
5020 mov r2, [r6+28] | |
5021 add r0, 16*SIZEOF_PIXEL | |
5022 add r2, 16*SIZEOF_PIXEL | |
5023 lea r4, [r1 + 2*r1] | |
5024 call pixel_sa8d_8x8_internal2 | |
5025 %if HIGH_BIT_DEPTH | |
5026 HADDUW m0, m1 | |
5027 %endif | |
5028 mova [esp+48], m0 | |
5029 call pixel_sa8d_8x8_internal2 | |
5030 SA8D_INTER | |
5031 mova [esp+48], m0 | |
5032 | |
5033 mov r0, [r6+20] | |
5034 mov r2, [r6+28] | |
5035 add r0, 24*SIZEOF_PIXEL | |
5036 add r2, 24*SIZEOF_PIXEL | |
5037 call pixel_sa8d_8x8_internal2 | |
5038 SA8D_INTER | |
5039 mova [esp+64-mmsize], m0 | |
5040 call pixel_sa8d_8x8_internal2 | |
5041 AVG_16x16 | |
5042 | |
; row band 4
5043 mov r0, [r6+20] | |
5044 mov r2, [r6+28] | |
5045 lea r0, [r0 + r1*8] | |
5046 lea r2, [r2 + r3*8] | |
5047 lea r0, [r0 + r1*8] | |
5048 lea r2, [r2 + r3*8] | |
5049 mov [r6+20], r0 | |
5050 mov [r6+28], r2 | |
5051 | |
5052 lea r4, [r1 + 2*r1] | |
5053 call pixel_sa8d_8x8_internal2 | |
5054 %if HIGH_BIT_DEPTH | |
5055 HADDUW m0, m1 | |
5056 %endif | |
5057 mova [esp+48], m0 | |
5058 call pixel_sa8d_8x8_internal2 | |
5059 SA8D_INTER | |
5060 mova [esp+48], m0 | |
5061 | |
5062 mov r0, [r6+20] | |
5063 mov r2, [r6+28] | |
5064 add r0, 8*SIZEOF_PIXEL | |
5065 add r2, 8*SIZEOF_PIXEL | |
5066 call pixel_sa8d_8x8_internal2 | |
5067 SA8D_INTER | |
5068 mova [esp+64-mmsize], m0 | |
5069 call pixel_sa8d_8x8_internal2 | |
5070 AVG_16x16 | |
5071 | |
5072 mov r0, [r6+20] | |
5073 mov r2, [r6+28] | |
5074 add r0, 16*SIZEOF_PIXEL | |
5075 add r2, 16*SIZEOF_PIXEL | |
5076 lea r4, [r1 + 2*r1] | |
5077 call pixel_sa8d_8x8_internal2 | |
5078 %if HIGH_BIT_DEPTH | |
5079 HADDUW m0, m1 | |
5080 %endif | |
5081 mova [esp+48], m0 | |
5082 call pixel_sa8d_8x8_internal2 | |
5083 SA8D_INTER | |
5084 mova [esp+48], m0 | |
5085 | |
5086 mov r0, [r6+20] | |
5087 mov r2, [r6+28] | |
5088 add r0, 24*SIZEOF_PIXEL | |
5089 add r2, 24*SIZEOF_PIXEL | |
5090 call pixel_sa8d_8x8_internal2 | |
5091 SA8D_INTER | |
5092 mova [esp+64-mmsize], m0 | |
5093 call pixel_sa8d_8x8_internal2 | |
5094 SA8D_INTER | |
5095 %if HIGH_BIT_DEPTH == 0 | |
5096 HADDUW m0, m1 | |
5097 %endif | |
5098 movd r4d, m0 | |
5099 add r4d, 1 | |
5100 shr r4d, 1 | |
5101 add r4d, dword [esp+36] | |
5102 mov eax, r4d | |
5103 mov esp, r6 | |
5104 RET | |
5105 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_48x64( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
; x86-32 path.  Twelve 16x16 tiles (3 across x 4 down); between row bands the
; advanced pointers are written back to the argument slots [r6+20]/[r6+28].
; Partials combine with SA8D_INTER / AVG_16x16 through the spill slots
; [esp+48] and [esp+64-mmsize]; the running total lives in [esp+36].
; Fix: the first spill used [rsp+48] where every sibling store in this 32-bit
; path uses [esp+48]; x86inc aliases rsp to esp on x86-32, so this normalizes
; spelling without changing the generated code.
;-----------------------------------------------------------------------------
5106 cglobal pixel_sa8d_48x64, 4,7,8 | |
5107 FIX_STRIDES r1, r3 | |
5108 mov r6, esp | |
5109 and esp, ~15 | |
5110 sub esp, 64 | |
5111 | |
5112 lea r4, [r1 + 2*r1] | |
5113 lea r5, [r3 + 2*r3] | |
5114 call pixel_sa8d_8x8_internal2 | |
5115 %if HIGH_BIT_DEPTH | |
5116 HADDUW m0, m1 | |
5117 %endif | |
5118 mova [esp+48], m0 | |
5119 call pixel_sa8d_8x8_internal2 | |
5120 SA8D_INTER | |
5121 mova [esp+48], m0 | |
5122 | |
5123 mov r0, [r6+20] | |
5124 mov r2, [r6+28] | |
5125 add r0, 8*SIZEOF_PIXEL | |
5126 add r2, 8*SIZEOF_PIXEL | |
5127 call pixel_sa8d_8x8_internal2 | |
5128 SA8D_INTER | |
5129 mova [esp+48], m0 | |
5130 call pixel_sa8d_8x8_internal2 | |
5131 SA8D_INTER | |
5132 %if HIGH_BIT_DEPTH == 0 | |
5133 HADDUW m0, m1 | |
5134 %endif | |
5135 movd r4d, m0 | |
5136 add r4d, 1 | |
5137 shr r4d, 1 | |
5138 mov dword [esp+36], r4d | |
5139 | |
5140 mov r0, [r6+20] | |
5141 mov r2, [r6+28] | |
5142 add r0, 16*SIZEOF_PIXEL | |
5143 add r2, 16*SIZEOF_PIXEL | |
5144 lea r4, [r1 + 2*r1] | |
5145 call pixel_sa8d_8x8_internal2 | |
5146 %if HIGH_BIT_DEPTH | |
5147 HADDUW m0, m1 | |
5148 %endif | |
5149 mova [esp+48], m0 | |
5150 call pixel_sa8d_8x8_internal2 | |
5151 SA8D_INTER | |
5152 mova [esp+48], m0 | |
5153 | |
5154 mov r0, [r6+20] | |
5155 mov r2, [r6+28] | |
5156 add r0, 24*SIZEOF_PIXEL | |
5157 add r2, 24*SIZEOF_PIXEL | |
5158 call pixel_sa8d_8x8_internal2 | |
5159 SA8D_INTER | |
5160 mova [esp+64-mmsize], m0 | |
5161 call pixel_sa8d_8x8_internal2 | |
5162 AVG_16x16 | |
5163 | |
5164 mov r0, [r6+20] | |
5165 mov r2, [r6+28] | |
5166 add r0, 32*SIZEOF_PIXEL | |
5167 add r2, 32*SIZEOF_PIXEL | |
5168 lea r4, [r1 + 2*r1] | |
5169 call pixel_sa8d_8x8_internal2 | |
5170 %if HIGH_BIT_DEPTH | |
5171 HADDUW m0, m1 | |
5172 %endif | |
5173 mova [esp+48], m0 | |
5174 call pixel_sa8d_8x8_internal2 | |
5175 SA8D_INTER | |
5176 mova [esp+48], m0 | |
5177 | |
5178 mov r0, [r6+20] | |
5179 mov r2, [r6+28] | |
5180 add r0, 40*SIZEOF_PIXEL | |
5181 add r2, 40*SIZEOF_PIXEL | |
5182 call pixel_sa8d_8x8_internal2 | |
5183 SA8D_INTER | |
5184 mova [esp+64-mmsize], m0 | |
5185 call pixel_sa8d_8x8_internal2 | |
5186 AVG_16x16 | |
5187 | |
; row band 2: advance 16 rows and persist pointers
5188 mov r0, [r6+20] | |
5189 mov r2, [r6+28] | |
5190 lea r0, [r0 + r1*8] | |
5191 lea r2, [r2 + r3*8] | |
5192 lea r0, [r0 + r1*8] | |
5193 lea r2, [r2 + r3*8] | |
5194 mov [r6+20], r0 | |
5195 mov [r6+28], r2 | |
5196 | |
5197 lea r4, [r1 + 2*r1] | |
5198 call pixel_sa8d_8x8_internal2 | |
5199 %if HIGH_BIT_DEPTH | |
5200 HADDUW m0, m1 | |
5201 %endif | |
5202 mova [esp+48], m0 | |
5203 call pixel_sa8d_8x8_internal2 | |
5204 SA8D_INTER | |
5205 mova [esp+48], m0 | |
5206 | |
5207 mov r0, [r6+20] | |
5208 mov r2, [r6+28] | |
5209 add r0, 8*SIZEOF_PIXEL | |
5210 add r2, 8*SIZEOF_PIXEL | |
5211 call pixel_sa8d_8x8_internal2 | |
5212 SA8D_INTER | |
5213 mova [esp+64-mmsize], m0 | |
5214 call pixel_sa8d_8x8_internal2 | |
5215 AVG_16x16 | |
5216 | |
5217 mov r0, [r6+20] | |
5218 mov r2, [r6+28] | |
5219 add r0, 16*SIZEOF_PIXEL | |
5220 add r2, 16*SIZEOF_PIXEL | |
5221 lea r4, [r1 + 2*r1] | |
5222 call pixel_sa8d_8x8_internal2 | |
5223 %if HIGH_BIT_DEPTH | |
5224 HADDUW m0, m1 | |
5225 %endif | |
5226 mova [esp+48], m0 | |
5227 call pixel_sa8d_8x8_internal2 | |
5228 SA8D_INTER | |
5229 mova [esp+48], m0 | |
5230 | |
5231 mov r0, [r6+20] | |
5232 mov r2, [r6+28] | |
5233 add r0, 24*SIZEOF_PIXEL | |
5234 add r2, 24*SIZEOF_PIXEL | |
5235 call pixel_sa8d_8x8_internal2 | |
5236 SA8D_INTER | |
5237 mova [esp+64-mmsize], m0 | |
5238 call pixel_sa8d_8x8_internal2 | |
5239 AVG_16x16 | |
5240 | |
5241 mov r0, [r6+20] | |
5242 mov r2, [r6+28] | |
5243 add r0, 32*SIZEOF_PIXEL | |
5244 add r2, 32*SIZEOF_PIXEL | |
5245 lea r4, [r1 + 2*r1] | |
5246 call pixel_sa8d_8x8_internal2 | |
5247 %if HIGH_BIT_DEPTH | |
5248 HADDUW m0, m1 | |
5249 %endif | |
5250 mova [esp+48], m0 | |
5251 call pixel_sa8d_8x8_internal2 | |
5252 SA8D_INTER | |
5253 mova [esp+48], m0 | |
5254 | |
5255 mov r0, [r6+20] | |
5256 mov r2, [r6+28] | |
5257 add r0, 40*SIZEOF_PIXEL | |
5258 add r2, 40*SIZEOF_PIXEL | |
5259 call pixel_sa8d_8x8_internal2 | |
5260 SA8D_INTER | |
5261 mova [esp+64-mmsize], m0 | |
5262 call pixel_sa8d_8x8_internal2 | |
5263 AVG_16x16 | |
5264 | |
; row band 3
5265 mov r0, [r6+20] | |
5266 mov r2, [r6+28] | |
5267 lea r0, [r0 + r1*8] | |
5268 lea r2, [r2 + r3*8] | |
5269 lea r0, [r0 + r1*8] | |
5270 lea r2, [r2 + r3*8] | |
5271 mov [r6+20], r0 | |
5272 mov [r6+28], r2 | |
5273 | |
5274 lea r4, [r1 + 2*r1] | |
5275 call pixel_sa8d_8x8_internal2 | |
5276 %if HIGH_BIT_DEPTH | |
5277 HADDUW m0, m1 | |
5278 %endif | |
5279 mova [esp+48], m0 | |
5280 call pixel_sa8d_8x8_internal2 | |
5281 SA8D_INTER | |
5282 mova [esp+48], m0 | |
5283 | |
5284 mov r0, [r6+20] | |
5285 mov r2, [r6+28] | |
5286 add r0, 8*SIZEOF_PIXEL | |
5287 add r2, 8*SIZEOF_PIXEL | |
5288 call pixel_sa8d_8x8_internal2 | |
5289 SA8D_INTER | |
5290 mova [esp+64-mmsize], m0 | |
5291 call pixel_sa8d_8x8_internal2 | |
5292 AVG_16x16 | |
5293 | |
5294 mov r0, [r6+20] | |
5295 mov r2, [r6+28] | |
5296 add r0, 16*SIZEOF_PIXEL | |
5297 add r2, 16*SIZEOF_PIXEL | |
5298 lea r4, [r1 + 2*r1] | |
5299 call pixel_sa8d_8x8_internal2 | |
5300 %if HIGH_BIT_DEPTH | |
5301 HADDUW m0, m1 | |
5302 %endif | |
5303 mova [esp+48], m0 | |
5304 call pixel_sa8d_8x8_internal2 | |
5305 SA8D_INTER | |
5306 mova [esp+48], m0 | |
5307 | |
5308 mov r0, [r6+20] | |
5309 mov r2, [r6+28] | |
5310 add r0, 24*SIZEOF_PIXEL | |
5311 add r2, 24*SIZEOF_PIXEL | |
5312 call pixel_sa8d_8x8_internal2 | |
5313 SA8D_INTER | |
5314 mova [esp+64-mmsize], m0 | |
5315 call pixel_sa8d_8x8_internal2 | |
5316 AVG_16x16 | |
5317 | |
5318 mov r0, [r6+20] | |
5319 mov r2, [r6+28] | |
5320 add r0, 32*SIZEOF_PIXEL | |
5321 add r2, 32*SIZEOF_PIXEL | |
5322 lea r4, [r1 + 2*r1] | |
5323 call pixel_sa8d_8x8_internal2 | |
5324 %if HIGH_BIT_DEPTH | |
5325 HADDUW m0, m1 | |
5326 %endif | |
5327 mova [esp+48], m0 | |
5328 call pixel_sa8d_8x8_internal2 | |
5329 SA8D_INTER | |
5330 mova [esp+48], m0 | |
5331 | |
5332 mov r0, [r6+20] | |
5333 mov r2, [r6+28] | |
5334 add r0, 40*SIZEOF_PIXEL | |
5335 add r2, 40*SIZEOF_PIXEL | |
5336 call pixel_sa8d_8x8_internal2 | |
5337 SA8D_INTER | |
5338 mova [esp+64-mmsize], m0 | |
5339 call pixel_sa8d_8x8_internal2 | |
5340 AVG_16x16 | |
5341 | |
; row band 4
5342 mov r0, [r6+20] | |
5343 mov r2, [r6+28] | |
5344 lea r0, [r0 + r1*8] | |
5345 lea r2, [r2 + r3*8] | |
5346 lea r0, [r0 + r1*8] | |
5347 lea r2, [r2 + r3*8] | |
5348 mov [r6+20], r0 | |
5349 mov [r6+28], r2 | |
5350 | |
5351 lea r4, [r1 + 2*r1] | |
5352 call pixel_sa8d_8x8_internal2 | |
5353 %if HIGH_BIT_DEPTH | |
5354 HADDUW m0, m1 | |
5355 %endif | |
5356 mova [esp+48], m0 | |
5357 call pixel_sa8d_8x8_internal2 | |
5358 SA8D_INTER | |
5359 mova [esp+48], m0 | |
5360 | |
5361 mov r0, [r6+20] | |
5362 mov r2, [r6+28] | |
5363 add r0, 8*SIZEOF_PIXEL | |
5364 add r2, 8*SIZEOF_PIXEL | |
5365 call pixel_sa8d_8x8_internal2 | |
5366 SA8D_INTER | |
5367 mova [esp+64-mmsize], m0 | |
5368 call pixel_sa8d_8x8_internal2 | |
5369 AVG_16x16 | |
5370 | |
5371 mov r0, [r6+20] | |
5372 mov r2, [r6+28] | |
5373 add r0, 16*SIZEOF_PIXEL | |
5374 add r2, 16*SIZEOF_PIXEL | |
5375 lea r4, [r1 + 2*r1] | |
5376 call pixel_sa8d_8x8_internal2 | |
5377 %if HIGH_BIT_DEPTH | |
5378 HADDUW m0, m1 | |
5379 %endif | |
5380 mova [esp+48], m0 | |
5381 call pixel_sa8d_8x8_internal2 | |
5382 SA8D_INTER | |
5383 mova [esp+48], m0 | |
5384 | |
5385 mov r0, [r6+20] | |
5386 mov r2, [r6+28] | |
5387 add r0, 24*SIZEOF_PIXEL | |
5388 add r2, 24*SIZEOF_PIXEL | |
5389 call pixel_sa8d_8x8_internal2 | |
5390 SA8D_INTER | |
5391 mova [esp+64-mmsize], m0 | |
5392 call pixel_sa8d_8x8_internal2 | |
5393 AVG_16x16 | |
5394 | |
5395 mov r0, [r6+20] | |
5396 mov r2, [r6+28] | |
5397 add r0, 32*SIZEOF_PIXEL | |
5398 add r2, 32*SIZEOF_PIXEL | |
5399 lea r4, [r1 + 2*r1] | |
5400 call pixel_sa8d_8x8_internal2 | |
5401 %if HIGH_BIT_DEPTH | |
5402 HADDUW m0, m1 | |
5403 %endif | |
5404 mova [esp+48], m0 | |
5405 call pixel_sa8d_8x8_internal2 | |
5406 SA8D_INTER | |
5407 mova [esp+48], m0 | |
5408 | |
5409 mov r0, [r6+20] | |
5410 mov r2, [r6+28] | |
5411 add r0, 40*SIZEOF_PIXEL | |
5412 add r2, 40*SIZEOF_PIXEL | |
5413 call pixel_sa8d_8x8_internal2 | |
5414 SA8D_INTER | |
5415 mova [esp+64-mmsize], m0 | |
5416 call pixel_sa8d_8x8_internal2 | |
5417 SA8D_INTER | |
5418 %if HIGH_BIT_DEPTH == 0 | |
5419 HADDUW m0, m1 | |
5420 %endif | |
5421 movd r4d, m0 | |
5422 add r4d, 1 | |
5423 shr r4d, 1 | |
5424 add r4d, dword [esp+36] | |
5425 mov eax, r4d | |
5426 mov esp, r6 | |
5427 RET | |
5428 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_64x16( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 )
; x86_32-only path.  Walks the 64x16 block as eight 8x16 strips (two stacked
; pixel_sa8d_8x8_internal2 calls each); strips are paired into 16x16 quarters
; and each quarter's sum is rounded with (sum+1)>>1 before accumulation.
; Stack layout (r6 = original esp saved by the prologue):
;   [r6+20]/[r6+28]           caller-stack copies of the pix1/pix2 arguments,
;                             reloaded to rewind r0/r2 for each new strip
;   [esp+36]                  running rounded total
;   [esp+48], [esp+64-mmsize] xmm spill slots for partial sums
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_64x16, 4,7,8
    FIX_STRIDES r1, r3
    mov  r6, esp                  ; keep original esp; args stay addressable via r6
    and  esp, ~15                 ; 16-byte align esp so mova spills are legal
    sub  esp, 64

    ; ---- strips x=0 and x=8: first 16x16 quarter; rounded sum seeds [esp+36]
    lea  r4, [r1 + 2*r1]
    lea  r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0             ; was [rsp+48]: x86inc aliases rsp==esp on x86_32,
                                  ; so this only normalizes the spelling used elsewhere
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1                   ; rounded quarter sum
    mov  dword [esp+36], r4d

    ; ---- strips x=16 and x=24 (accumulated by AVG_16x16)
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16                     ; fold this quarter into the running total

    ; ---- strips x=32 and x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; ---- strips x=48 and x=56: last quarter is summed inline
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1
    add  r4d, dword [esp+36]      ; total = last quarter (rounded) + accumulator
    mov  eax, r4d
    mov  esp, r6                  ; restore caller's esp
    RET
5544 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_64x32( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 )
; x86_32-only path.  Two 64x16 bands; within each band the block is walked as
; eight 8x16 strips paired into 16x16 quarters, each rounded ((sum+1)>>1)
; before accumulation.  Stack layout (r6 = original esp):
;   [r6+20]/[r6+28]           caller-stack pix1/pix2 argument slots (also
;                             rewritten at the band boundary to advance 16 rows)
;   [esp+36]                  running rounded total
;   [esp+48], [esp+64-mmsize] xmm spill slots for partial sums
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_64x32, 4,7,8
    FIX_STRIDES r1, r3
    mov  r6, esp
    and  esp, ~15
    sub  esp, 64

    ; ======== rows 0-15 ========
    ; strips x=0 / x=8: rounded sum seeds [esp+36]
    lea  r4, [r1 + 2*r1]
    lea  r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0             ; was [rsp+48]: rsp==esp on x86_32 (x86inc alias);
                                  ; spelled esp for consistency with the rest
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1
    mov  dword [esp+36], r4d

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; advance the saved pix pointers by 16 rows (2 x 8*stride)
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    mov  [r6+20], r0
    mov  [r6+28], r2

    ; ======== rows 16-31 ========
    ; strips x=0 / x=8
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56: last quarter summed inline
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1
    add  r4d, dword [esp+36]
    mov  eax, r4d
    mov  esp, r6
    RET
5761 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_64x48( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 )
; x86_32-only path.  Three 64x16 bands; each band is eight 8x16 strips paired
; into 16x16 quarters rounded with (sum+1)>>1 before accumulation.
; Stack layout (r6 = original esp):
;   [r6+20]/[r6+28]           caller-stack pix1/pix2 argument slots, advanced
;                             16 rows at each band boundary
;   [esp+36]                  running rounded total
;   [esp+48], [esp+64-mmsize] xmm spill slots for partial sums
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_64x48, 4,7,8
    FIX_STRIDES r1, r3
    mov  r6, esp
    and  esp, ~15
    sub  esp, 64

    ; ======== rows 0-15 ========
    ; strips x=0 / x=8: rounded sum seeds [esp+36]
    lea  r4, [r1 + 2*r1]
    lea  r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0             ; was [rsp+48]: rsp==esp on x86_32 (x86inc alias);
                                  ; spelled esp for consistency with the rest
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1
    mov  dword [esp+36], r4d

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; advance the saved pix pointers by 16 rows
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    mov  [r6+20], r0
    mov  [r6+28], r2

    ; ======== rows 16-31 ========
    ; strips x=0 / x=8
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; advance the saved pix pointers by another 16 rows
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    mov  [r6+20], r0
    mov  [r6+28], r2

    ; ======== rows 32-47 ========
    ; strips x=0 / x=8
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56: last quarter summed inline
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1
    add  r4d, dword [esp+36]
    mov  eax, r4d
    mov  esp, r6
    RET
6079 | |
;-----------------------------------------------------------------------------
; int pixel_sa8d_64x64( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 )
; x86_32-only path.  Four 64x16 bands; each band is eight 8x16 strips paired
; into 16x16 quarters rounded with (sum+1)>>1 before accumulation.
; Stack layout (r6 = original esp):
;   [r6+20]/[r6+28]           caller-stack pix1/pix2 argument slots, advanced
;                             16 rows at each band boundary
;   [esp+36]                  running rounded total
;   [esp+48], [esp+64-mmsize] xmm spill slots for partial sums
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_64x64, 4,7,8
    FIX_STRIDES r1, r3
    mov  r6, esp
    and  esp, ~15
    sub  esp, 64

    ; ======== rows 0-15 ========
    ; strips x=0 / x=8: rounded sum seeds [esp+36]
    lea  r4, [r1 + 2*r1]
    lea  r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0             ; was [rsp+48]: rsp==esp on x86_32 (x86inc alias);
                                  ; spelled esp for consistency with the rest
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1
    mov  dword [esp+36], r4d

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; advance the saved pix pointers by 16 rows
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    mov  [r6+20], r0
    mov  [r6+28], r2

    ; ======== rows 16-31 ========
    ; strips x=0 / x=8
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; advance the saved pix pointers by another 16 rows
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    mov  [r6+20], r0
    mov  [r6+28], r2

    ; ======== rows 32-47 ========
    ; strips x=0 / x=8
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; advance the saved pix pointers by another 16 rows
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    lea  r0, [r0 + r1*8]
    lea  r2, [r2 + r3*8]
    mov  [r6+20], r0
    mov  [r6+28], r2

    ; ======== rows 48-63 ========
    ; strips x=0 / x=8
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 8*SIZEOF_PIXEL
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=16 / x=24
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 16*SIZEOF_PIXEL
    add  r2, 16*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 24*SIZEOF_PIXEL
    add  r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=32 / x=40
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 32*SIZEOF_PIXEL
    add  r2, 32*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 40*SIZEOF_PIXEL
    add  r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    ; strips x=48 / x=56: last quarter summed inline
    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 48*SIZEOF_PIXEL
    add  r2, 48*SIZEOF_PIXEL
    lea  r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov  r0, [r6+20]
    mov  r2, [r6+28]
    add  r0, 56*SIZEOF_PIXEL
    add  r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add  r4d, 1
    shr  r4d, 1
    add  r4d, dword [esp+36]
    mov  eax, r4d
    mov  esp, r6
    RET
6498 %endif ; !ARCH_X86_64 | |
6499 %endmacro ; SA8D | |
6500 | |
6501 ;============================================================================= | |
6502 ; INTRA SATD | |
6503 ;============================================================================= | |
; Instantiate the SATD/SA8D macro templates once per instruction-set flavor.
; Each INIT_XMM re-binds the mmN register names and the function-name suffix;
; the %defines select which helper-macro variants the templates expand to.
%define TRANS TRANS_SSE2
%define DIFFOP DIFF_UNPACK_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
INIT_XMM sse2
SA8D
SATDS_SSE2

; 8-bit only: Atom-tuned SSSE3 flavor
%if HIGH_BIT_DEPTH == 0
INIT_XMM ssse3,atom
SATDS_SSE2
SA8D
%endif

; generic SSSE3: switch to the fused sub+hmul diff op
%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%if HIGH_BIT_DEPTH == 0
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
%endif
INIT_XMM ssse3
SATDS_SSE2
SA8D
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps

; SSE4 (Penryn): pblendw-based transpose
%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
INIT_XMM sse4
SATDS_SSE2
SA8D

; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
; it's effectively free.
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
INIT_XMM avx
SATDS_SSE2
SA8D

; XOP (Bulldozer): vpperm-based transpose
%define TRANS TRANS_XOP
INIT_XMM xop
SATDS_SSE2
SA8D
6551 | |
6552 | |
%if HIGH_BIT_DEPTH == 0
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
%define TRANS TRANS_SSE4

; Load an 8x8 block pair (pix1 = r0, pix2 = r2) and produce the per-row
; sum/difference terms used by the SATD transform, packing two rows into
; each ymm register.
; %1-%4 = dst regs, %5-%6 = tmp regs, %7 = hmul constant register.
; Expects r4 = 5*r1 and r5 = 5*r3 (as set by SATD_START_AVX2 with %3=1);
; advances r0 and r2 by two rows.
%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
    movq        xm%1, [r0]              ; low lanes: rows 0/1
    movq        xm%3, [r2]
    movq        xm%2, [r0+r1]
    movq        xm%4, [r2+r3]
    vinserti128 m%1, m%1, [r0+4*r1], 1  ; high lanes: rows 4/5
    vinserti128 m%3, m%3, [r2+4*r3], 1
    vinserti128 m%2, m%2, [r0+r4], 1    ; r4 = 5*stride
    vinserti128 m%4, m%4, [r2+r5], 1
    punpcklqdq  m%1, m%1                ; duplicate each row for hmul
    punpcklqdq  m%3, m%3
    punpcklqdq  m%2, m%2
    punpcklqdq  m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]

    movq        xm%3, [r0]              ; rows 2/3 and 6/7
    movq        xm%5, [r2]
    movq        xm%4, [r0+r1]
    movq        xm%6, [r2+r3]
    vinserti128 m%3, m%3, [r0+4*r1], 1
    vinserti128 m%5, m%5, [r2+4*r3], 1
    vinserti128 m%4, m%4, [r0+r4], 1
    vinserti128 m%6, m%6, [r2+r5], 1
    punpcklqdq  m%3, m%3
    punpcklqdq  m%5, m%5
    punpcklqdq  m%4, m%4
    punpcklqdq  m%6, m%6
    DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
%endmacro
6589 | |
; Common AVX2 SATD/SA8D prologue.
; %1 = accumulator register to zero, %2 = register to hold the hmul constant,
; %3 (optional, default 0) = 1 for 8-pixel-wide blocks (r4/r5 = 5*stride,
; used by the movddup-based loaders), 0 for 16-pixel-wide blocks
; (r4/r5 = 3*stride).
%macro SATD_START_AVX2 2-3 0
    FIX_STRIDES r1, r3                  ; pixel strides -> byte strides
%if %3
    mova m%2 %+ _, [hmul_8p]
%endif
%if %3
    mova %2, [hmul_8p]
    lea  r4, [5*r1]
    lea  r5, [5*r3]
%else
    mova %2, [hmul_16p]
    lea  r4, [3*r1]
    lea  r5, [3*r3]
%endif
    pxor %1, %1                         ; clear SATD accumulator
%endmacro
6603 | |
%define TRANS TRANS_SSE4
INIT_YMM avx2
; 16x8 SATD worker: accumulates partial sums into m6.  Requires the
; SATD_START_AVX2 setup (m7 = hmul, r4 = 3*r1, r5 = 3*r3); advances r0/r2.
; Declared with bare 'cglobal name' (no arg spec) so no prologue is emitted
; and it can be call'ed from the public entry points below.
cglobal pixel_satd_16x8_internal
    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
    ret
6612 | |
; int pixel_satd_16x16(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2)
; Two 16x8 passes; the second pass' entry doubles as the tail of
; pixel_satd_16x8 below.
cglobal pixel_satd_16x16, 4,6,8
    SATD_START_AVX2 m6, m7
    call pixel_satd_16x8_internal
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
; NOTE(review): this tail label spells the same name as the cglobal'd
; worker routine above -- presumably it resolves to a distinct symbol
; under the assembler/macro framework used, but a differently named tail
; label (x264 uses 'satd_16x8_avx2') would be unambiguous; verify.
pixel_satd_16x8_internal:
    call pixel_satd_16x8_internal
    vextracti128 xm0, m6, 1             ; fold high lane into low
    paddw        xm0, xm6
    SATD_END_SSE2 xm0
    RET

; int pixel_satd_16x8(...): single pass, shares the reduction tail above.
cglobal pixel_satd_16x8, 4,6,8
    SATD_START_AVX2 m6, m7
    jmp pixel_satd_16x8_internal
6628 | |
; 8x8 SATD worker: loads/diffs one 8x8 block pair and accumulates into m6.
; Requires SATD_START_AVX2 with %3=1 (m7 = hmul_8p, r4/r5 = 5*stride);
; advances r0/r2 by two rows.
cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
    ret
6633 | |
; int pixel_satd_8x16(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2)
; Two 8x8 worker passes; the worker leaves r0/r2 two rows in, so step a
; further six rows (2+4) between passes.
cglobal pixel_satd_8x16, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_satd_8x8_internal
    lea  r0, [r0+2*r1]
    lea  r2, [r2+2*r3]
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    call pixel_satd_8x8_internal
    vextracti128 xm0, m6, 1             ; reduce ymm accumulator
    paddw        xm0, xm6
    SATD_END_SSE2 xm0
    RET
6646 | |
; int pixel_satd_8x8(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2)
cglobal pixel_satd_8x8, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_satd_8x8_internal
    vextracti128 xm0, m6, 1             ; reduce ymm accumulator
    paddw        xm0, xm6
    SATD_END_SSE2 xm0
    RET
6654 | |
; 8x8 SA8D worker: full 8x8 Hadamard (vertical 4-point, then 8/2/1-distance
; stages with abs-max folding), accumulating into m6.
; Requires SATD_START_AVX2 with %3=1; clobbers m0-m5; advances r0/r2.
cglobal pixel_sa8d_8x8_internal
    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
    HADAMARD4_V 0, 1, 2, 3, 4
    HADAMARD 8, sumsub, 0, 1, 4, 5
    HADAMARD 8, sumsub, 2, 3, 4, 5
    HADAMARD 2, sumsub, 0, 1, 4, 5
    HADAMARD 2, sumsub, 2, 3, 4, 5
    HADAMARD 1, amax, 0, 1, 4, 5        ; final stage takes |a|+|b| maxima
    HADAMARD 1, amax, 2, 3, 4, 5
    paddw m6, m0                        ; accumulate into caller's m6
    paddw m6, m2
    ret
6667 | |
; int pixel_sa8d_8x8(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2)
; Returns (sum + 1) >> 1, the standard SA8D normalization.
cglobal pixel_sa8d_8x8, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_sa8d_8x8_internal
    vextracti128 xm1, m6, 1             ; fold high lane
    paddw xm6, xm1
    HADDW xm6, xm1                      ; horizontal sum of words
    movd  eax, xm6
    add   eax, 1
    shr   eax, 1                        ; (sum + 1) >> 1
    RET
6678 | |
; int pixel_sa8d_16x16(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2)
; Sums four 8x8 SA8D quadrants in the order [0], [8], [8*stride+8],
; [8*stride], rewinding the two rows the worker advanced between calls,
; then returns (sum + 1) >> 1.
cglobal pixel_sa8d_16x16, 4,6,8
    SATD_START_AVX2 m6, m7, 1

    call pixel_sa8d_8x8_internal ; pix[0]

    sub  r0, r1                         ; undo worker's 2-row advance
    sub  r0, r1
    add  r0, 8*SIZEOF_PIXEL             ; move right one 8x8 block
    sub  r2, r3
    sub  r2, r3
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8]

    add  r0, r4                         ; r4 = 5*stride: down to next block row
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]

    sub  r0, r1
    sub  r0, r1
    sub  r0, 8*SIZEOF_PIXEL             ; back to the left column
    sub  r2, r3
    sub  r2, r3
    sub  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8*stride]

    ; TODO: analyze Dynamic Range
    vextracti128 xm0, m6, 1
    paddusw      xm6, xm0               ; saturating add guards against wrap
    HADDUW       xm6, xm0
    movd         eax, xm6
    add          eax, 1
    shr          eax, 1                 ; (sum + 1) >> 1
    RET
6714 | |
; Internal 16x16 SA8D: identical quadrant walk to pixel_sa8d_16x16 but ends
; with a plain 'ret' so larger block sizes (32x32) can call it repeatedly.
; Result (normalized) is left in eax; caller must clear m6 between calls.
cglobal pixel_sa8d_16x16_internal
    call pixel_sa8d_8x8_internal ; pix[0]

    sub  r0, r1                         ; undo worker's 2-row advance
    sub  r0, r1
    add  r0, 8*SIZEOF_PIXEL
    sub  r2, r3
    sub  r2, r3
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8]

    add  r0, r4                         ; r4 = 5*stride
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]

    sub  r0, r1
    sub  r0, r1
    sub  r0, 8*SIZEOF_PIXEL
    sub  r2, r3
    sub  r2, r3
    sub  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8*stride]

    ; TODO: analyze Dynamic Range
    vextracti128 xm0, m6, 1
    paddusw      xm6, xm0
    HADDUW       xm6, xm0
    movd         eax, xm6
    add          eax, 1
    shr          eax, 1                 ; (sum + 1) >> 1
    ret
6748 | |
%if ARCH_X86_64
; int pixel_sa8d_32x32(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2)
; Sums four 16x16 SA8D quadrants.  Each internal call leaves its normalized
; partial in eax; r7d accumulates, and m6 is re-zeroed between quadrants.
cglobal pixel_sa8d_32x32, 4,8,8
    ; TODO: R6 is RAX on x64 platform, so we use it directly

    SATD_START_AVX2 m6, m7, 1
    xor  r7d, r7d                       ; running total

    call pixel_sa8d_16x16_internal ; [0]
    pxor m6, m6                         ; reset accumulator for next quadrant
    add  r7d, eax

    add  r0, r4                         ; step down to the next 16-row band
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_16x16_internal ; [2]
    pxor m6, m6
    add  r7d, eax

    lea  eax, [r4 * 5 - 16]             ; rewind rows, advance 16 pixels right
    sub  r0, rax
    sub  r0, r1
    lea  eax, [r5 * 5 - 16]
    sub  r2, rax
    sub  r2, r3
    call pixel_sa8d_16x16_internal ; [1]
    pxor m6, m6
    add  r7d, eax

    add  r0, r4
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_16x16_internal ; [3]
    add  eax, r7d                       ; return total in eax
    RET
%endif ; ARCH_X86_64=1
%endif ; HIGH_BIT_DEPTH
6787 | |
; Input 10bit, Output 8bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
; Shifts each 16-bit sample right by 'shift' and packs to 8 bits with
; unsigned saturation.  Full rows are processed 16 pixels at a time; the
; final row is handled separately in 16/8/4/2/1-pixel chunks so any width
; is supported on that row.  The 'mask' argument is unused here (packuswb
; saturates instead).
; NOTE(review): non-final rows assume width is effectively a multiple of 16
; (the inner loop may over-read/over-write up to the rounded-up width) --
; confirm against callers.
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal downShift_16, 7,7,3
    movd     m0, r6d          ; m0 = shift (r6 is then reused as a counter)
    add      r1, r1           ; srcStride in bytes (16-bit samples)
    dec      r5d              ; last row handled after the main loop
.loopH:
    xor      r6, r6           ; r6 = column counter (in pixels)
.loopW:
    movu     m1, [r0 + r6 * 2]
    movu     m2, [r0 + r6 * 2 + 16]
    psrlw    m1, m0
    psrlw    m2, m0
    packuswb m1, m2           ; 16 words -> 16 saturated bytes
    movu     [r2 + r6], m1

    add      r6, 16
    cmp      r6d, r4d
    jl       .loopW

    ; move to next row
    add      r0, r1
    add      r2, r3
    dec      r5d
    jnz      .loopH

;processing last row of every frame [To handle width which not a multiple of 16]

.loop16:
    movu     m1, [r0]
    movu     m2, [r0 + 16]
    psrlw    m1, m0
    psrlw    m2, m0
    packuswb m1, m2
    movu     [r2], m1

    add      r0, 2 * mmsize
    add      r2, mmsize
    sub      r4d, 16
    jz       .end
    cmp      r4d, 15
    jg       .loop16

    ; 8-pixel chunk
    cmp      r4d, 8
    jl       .process4
    movu     m1, [r0]
    psrlw    m1, m0
    packuswb m1, m1
    movh     [r2], m1

    add      r0, mmsize
    add      r2, 8
    sub      r4d, 8
    jz       .end

.process4:
    cmp      r4d, 4
    jl       .process2
    movh     m1,[r0]
    psrlw    m1, m0
    packuswb m1, m1
    movd     [r2], m1

    add      r0, 8
    add      r2, 4
    sub      r4d, 4
    jz       .end

.process2:
    cmp      r4d, 2
    jl       .process1
    movd     m1, [r0]
    psrlw    m1, m0
    packuswb m1, m1
    movd     r6, m1
    mov      [r2], r6w        ; store 2 bytes

    add      r0, 4
    add      r2, 2
    sub      r4d, 2
    jz       .end

.process1:
    movd     m1, [r0]
    psrlw    m1, m0
    packuswb m1, m1
    movd     r3, m1           ; r3 (dstStride) is dead past this point
    mov      [r2], r3b        ; store last byte
.end:
    RET
6881 | |
; Input 10bit, Output 8bit
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
; AVX2 variant: 32 pixels per iteration on full rows; the final row is
; split into 32-pixel groups plus 16/8/4/2/1-pixel remainders.  'mask' is
; unused (packuswb saturates).
; NOTE(review): .loop32 uses dec/jnz on width>>5, so the last row assumes
; width >= 32 (width < 32 would underflow the counter) -- confirm callers.
;-------------------------------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal downShift_16, 6,7,3
    movd     xm0, r6m         ; m0 = shift
    add      r1d, r1d         ; srcStride in bytes
    dec      r5d              ; last row handled separately
.loopH:
    xor      r6, r6           ; r6 = column counter (in pixels)
.loopW:
    movu     m1, [r0 + r6 * 2 + 0]
    movu     m2, [r0 + r6 * 2 + 32]
    vpsrlw   m1, xm0
    vpsrlw   m2, xm0
    packuswb m1, m2
    vpermq   m1, m1, 11011000b ; fix 128-bit-lane interleave from packuswb
    movu     [r2 + r6], m1

    add      r6d, mmsize
    cmp      r6d, r4d
    jl       .loopW

    ; move to next row
    add      r0, r1
    add      r2, r3
    dec      r5d
    jnz      .loopH

    ; processing last row of every frame [To handle width which not a multiple of 32]
    mov      r6d, r4d
    and      r4d, 31          ; r4d = remainder pixels (< 32)
    shr      r6d, 5           ; r6d = number of full 32-pixel groups

.loop32:
    movu     m1, [r0]
    movu     m2, [r0 + 32]
    psrlw    m1, xm0
    psrlw    m2, xm0
    packuswb m1, m2
    vpermq   m1, m1, 11011000b
    movu     [r2], m1

    add      r0, 2*mmsize
    add      r2, mmsize
    dec      r6d
    jnz      .loop32

    ; 16-pixel chunk
    cmp      r4d, 16
    jl       .process8
    movu     m1, [r0]
    psrlw    m1, xm0
    packuswb m1, m1
    vpermq   m1, m1, 10001000b ; gather low halves of both lanes
    movu     [r2], xm1

    add      r0, mmsize
    add      r2, 16
    sub      r4d, 16
    jz       .end

.process8:
    cmp      r4d, 8
    jl       .process4
    movu     m1, [r0]
    psrlw    m1, xm0
    packuswb m1, m1
    movq     [r2], xm1

    add      r0, 16
    add      r2, 8
    sub      r4d, 8
    jz       .end

.process4:
    cmp      r4d, 4
    jl       .process2
    movq     xm1,[r0]
    psrlw    m1, xm0
    packuswb m1, m1
    movd     [r2], xm1

    add      r0, 8
    add      r2, 4
    sub      r4d, 4
    jz       .end

.process2:
    cmp      r4d, 2
    jl       .process1
    movd     xm1, [r0]
    psrlw    m1, xm0
    packuswb m1, m1
    movd     r6d, xm1
    mov      [r2], r6w        ; store 2 bytes

    add      r0, 4
    add      r2, 2
    sub      r4d, 2
    jz       .end

.process1:
    movd     xm1, [r0]
    psrlw    m1, xm0
    packuswb m1, m1
    movd     r3d, xm1         ; r3 (dstStride) is dead past this point
    mov      [r2], r3b        ; store last byte
.end:
    RET
6992 | |
; Input 8bit, Output 10bit
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
; Zero-extends each byte to 16 bits and shifts left by 'shift'.  Full rows
; go 16 pixels per iteration; the last row goes 8 at a time with a final
; backward-overlapping load/store to cover the tail without reading past
; the array end.
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal upShift_8, 6,7,3
    movd     xm2, r6m          ; m2 = shift
    add      r3d, r3d          ; dstStride in bytes (16-bit output)
    dec      r5d               ; last row handled separately

.loopH:
    xor      r6, r6            ; r6 = column counter (in pixels)
.loopW:
    pmovzxbw m0,[r0 + r6]
    pmovzxbw m1,[r0 + r6 + mmsize/2]
    psllw    m0, m2
    psllw    m1, m2
    movu     [r2 + r6 * 2], m0
    movu     [r2 + r6 * 2 + mmsize], m1

    add      r6d, mmsize
    cmp      r6d, r4d
    jl       .loopW

    ; move to next row
    add      r0, r1
    add      r2, r3
    dec      r5d
    jg       .loopH

    ; processing last row of every frame [To handle width which not a multiple of 16]
    mov      r1d, (mmsize/2 - 1)
    and      r1d, r4d          ; r1 = width % 8 ...
    sub      r1, mmsize/2      ; ... minus 8: backward offset for the tail

    ; NOTE: Width MUST BE more than or equal to 8
    shr      r4d, 3            ; width / 8 (8 source bytes per iteration)
.loopW8:
    pmovzxbw m0,[r0]
    psllw    m0, m2
    movu     [r2], m0
    add      r0, mmsize/2
    add      r2, mmsize
    dec      r4d
    jg       .loopW8

    ; Mac OS X can't read beyond array bound, so rollback some bytes
    pmovzxbw m0,[r0 + r1]      ; r1 <= 0: overlapping tail load
    psllw    m0, m2
    movu     [r2 + r1 * 2], m0
    RET
7044 | |
7045 | |
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
; AVX2 variant of upShift_8: zero-extend bytes to words and shift left by
; 'shift'.  Full rows process 32 pixels per iteration; the last row goes
; 16 at a time with a backward-overlapping tail to avoid over-reading.
;---------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_YMM avx2
cglobal upShift_8, 6,7,3
    movd     xm2, r6m          ; m2 = shift
    add      r3d, r3d          ; dstStride in bytes (16-bit output)
    dec      r5d               ; last row handled separately

.loopH:
    xor      r6, r6            ; r6 = column counter (in pixels)
.loopW:
    pmovzxbw m0,[r0 + r6]
    pmovzxbw m1,[r0 + r6 + mmsize/2]
    psllw    m0, xm2
    psllw    m1, xm2
    movu     [r2 + r6 * 2], m0
    movu     [r2 + r6 * 2 + mmsize], m1

    add      r6d, mmsize
    cmp      r6d, r4d
    jl       .loopW

    ; move to next row
    add      r0, r1
    add      r2, r3
    dec      r5d
    jg       .loopH

    ; processing last row of every frame [To handle width which not a multiple of 32]
    mov      r1d, (mmsize/2 - 1)
    and      r1d, r4d          ; r1 = width % 16 ...
    sub      r1, mmsize/2      ; ... minus 16: backward offset for the tail

    ; NOTE: Width MUST BE more than or equal to 16
    shr      r4d, 4            ; width / 16 (16 source bytes per iteration)
.loopW16:
    pmovzxbw m0,[r0]
    psllw    m0, xm2
    movu     [r2], m0
    add      r0, mmsize/2
    add      r2, mmsize
    dec      r4d
    jg       .loopW16

    ; Mac OS X can't read beyond array bound, so rollback some bytes
    pmovzxbw m0,[r0 + r1]      ; r1 <= 0: overlapping tail load
    psllw    m0, xm2
    movu     [r2 + r1 * 2], m0
    RET
%endif
7098 | |
; Packed 32-bit absolute value of two registers: %1 = |%3|, %2 = |%4|.
; %5/%6 are scratch (unused on SSSE3+, which has pabsd).
; Note: the fallback paths use pmaxsd (an SSE4.1 instruction); in this
; file's visible uses the macro is only instantiated under INIT_XMM sse4,
; so that is safe.
%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
    pabsd  %1, %3
    pabsd  %2, %4
%elifidn %1, %3               ; in-place: negate into tmp, then max
    pxor   %5, %5
    pxor   %6, %6
    psubd  %5, %1             ; tmp = -src
    psubd  %6, %2
    pmaxsd %1, %5             ; dst = max(src, -src) = |src|
    pmaxsd %2, %6
%else                         ; distinct dst: negate into dst, then max
    pxor   %1, %1
    pxor   %2, %2
    psubd  %1, %3
    psubd  %2, %4
    pmaxsd %1, %3
    pmaxsd %2, %4
%endif
%endmacro
7119 | |
7120 | |
; Input 10bit, Output 12bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
; Shifts each 16-bit sample LEFT by 'shift' and clamps with pw_pixel_max.
; Full rows are processed 16 pixels per iteration; the last row is handled
; in 16/8/4/2/1-pixel chunks so any width is supported there.  The 'mask'
; argument is unused (clamping uses pw_pixel_max).
; BUGFIX: the 8-pixel remainder chunk previously used psrlw (shift RIGHT),
; inconsistent with every other path of this left-shift routine (and with
; the AVX2 twin below); corrected to psllw.
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal upShift_16, 6,7,4
    movd  m0, r6m             ; m0 = shift
    mova  m3, [pw_pixel_max]  ; m3 = clamp value
    FIX_STRIDES r1d, r3d      ; strides given in pixels
    dec   r5d                 ; last row handled separately
.loopH:
    xor   r6d, r6d            ; r6 = column counter (in pixels)
.loopW:
    movu  m1, [r0 + r6 * SIZEOF_PIXEL]
    movu  m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
    psllw m1, m0
    psllw m2, m0
    ; TODO: if input always valid, we can remove below 2 instructions.
    pand  m1, m3
    pand  m2, m3
    movu  [r2 + r6 * SIZEOF_PIXEL], m1
    movu  [r2 + r6 * SIZEOF_PIXEL + mmsize], m2

    add   r6, mmsize * 2 / SIZEOF_PIXEL
    cmp   r6d, r4d
    jl    .loopW

    ; move to next row
    add   r0, r1
    add   r2, r3
    dec   r5d
    jnz   .loopH

;processing last row of every frame [To handle width which not a multiple of 16]

.loop16:
    movu  m1, [r0]
    movu  m2, [r0 + mmsize]
    psllw m1, m0
    psllw m2, m0
    pand  m1, m3
    pand  m2, m3
    movu  [r2], m1
    movu  [r2 + mmsize], m2

    add   r0, 2 * mmsize
    add   r2, 2 * mmsize
    sub   r4d, 16
    jz    .end
    jg    .loop16

    ; 8-pixel chunk
    cmp   r4d, 8
    jl    .process4
    movu  m1, [r0]
    psllw m1, m0              ; was psrlw: shifted right instead of left
    pand  m1, m3
    movu  [r2], m1

    add   r0, mmsize
    add   r2, mmsize
    sub   r4d, 8
    jz    .end

.process4:
    cmp   r4d, 4
    jl    .process2
    movh  m1,[r0]
    psllw m1, m0
    pand  m1, m3
    movh  [r2], m1

    add   r0, 8
    add   r2, 8
    sub   r4d, 4
    jz    .end

.process2:
    cmp   r4d, 2
    jl    .process1
    movd  m1, [r0]
    psllw m1, m0
    pand  m1, m3
    movd  [r2], m1

    add   r0, 4
    add   r2, 4
    sub   r4d, 2
    jz    .end

.process1:
    movd  m1, [r0]
    psllw m1, m0
    pand  m1, m3
    movd  r3, m1              ; r3 (dstStride) is dead past this point
    mov   [r2], r3w           ; store last pixel
.end:
    RET
7218 | |
; Input 10bit, Output 12bit
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
; AVX2 variant: shift each 16-bit sample left by 'shift' and clamp with
; pw_pixel_max; 32 pixels per iteration on full rows, last row handled in
; 32/16/8/4/2/1-pixel chunks.  'mask' is unused.
; NOTE(review): .loop32 uses dec/jnz on width>>5, so the last row assumes
; width >= 32 (width < 32 would underflow the counter) -- confirm callers.
;-------------------------------------------------------------------------------------------------------------------------------------
; TODO: NO TEST CODE!
INIT_YMM avx2
cglobal upShift_16, 6,7,4
    movd  xm0, r6m            ; m0 = shift
    vbroadcasti128 m3, [pw_pixel_max]
    FIX_STRIDES r1d, r3d      ; strides given in pixels
    dec   r5d                 ; last row handled separately
.loopH:
    xor   r6d, r6d            ; r6 = column counter (in pixels)
.loopW:
    movu  m1, [r0 + r6 * SIZEOF_PIXEL]
    movu  m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
    psllw m1, xm0
    psllw m2, xm0
    pand  m1, m3
    pand  m2, m3
    movu  [r2 + r6 * SIZEOF_PIXEL], m1
    movu  [r2 + r6 * SIZEOF_PIXEL + mmsize], m2

    add   r6, mmsize * 2 / SIZEOF_PIXEL
    cmp   r6d, r4d
    jl    .loopW

    ; move to next row
    add   r0, r1
    add   r2, r3
    dec   r5d
    jnz   .loopH

    ; processing last row of every frame [To handle width which not a multiple of 32]
    mov   r6d, r4d
    and   r4d, 31             ; r4d = remainder pixels (< 32)
    shr   r6d, 5              ; r6d = number of full 32-pixel groups

.loop32:
    movu  m1, [r0]
    movu  m2, [r0 + mmsize]
    psllw m1, xm0
    psllw m2, xm0
    pand  m1, m3
    pand  m2, m3
    movu  [r2], m1
    movu  [r2 + mmsize], m2

    add   r0, 2*mmsize
    add   r2, 2*mmsize
    dec   r6d
    jnz   .loop32

    ; 16-pixel chunk
    cmp   r4d, 16
    jl    .process8
    movu  m1, [r0]
    psllw m1, xm0
    pand  m1, m3
    movu  [r2], m1

    add   r0, mmsize
    add   r2, mmsize
    sub   r4d, 16
    jz    .end

.process8:
    cmp   r4d, 8
    jl    .process4
    movu  xm1, [r0]
    psllw xm1, xm0
    pand  xm1, xm3
    movu  [r2], xm1

    add   r0, 16
    add   r2, 16
    sub   r4d, 8
    jz    .end

.process4:
    cmp   r4d, 4
    jl    .process2
    movq  xm1,[r0]
    psllw xm1, xm0
    pand  xm1, xm3
    movq  [r2], xm1

    add   r0, 8
    add   r2, 8
    sub   r4d, 4
    jz    .end

.process2:
    cmp   r4d, 2
    jl    .process1
    movd  xm1, [r0]
    psllw xm1, xm0
    pand  xm1, xm3
    movd  [r2], xm1

    add   r0, 4
    add   r2, 4
    sub   r4d, 2
    jz    .end

.process1:
    movd  xm1, [r0]
    psllw xm1, xm0
    pand  xm1, xm3
    movd  r3d, xm1            ; r3 (dstStride) is dead past this point
    mov   [r2], r3w           ; store last pixel
.end:
    RET
7331 | |
7332 | |
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
; 4x4 psy-rd cost: for each block compute energy = SATD(block) - DC
; (where DC = pixel sum >> 2), then return |energy(source) - energy(recon)|.
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal psyCost_pp_4x4, 4, 5, 8

%if HIGH_BIT_DEPTH
    FIX_STRIDES r1, r3
    ; -------- source block: rows duplicated into qword halves --------
    lea     r4, [3 * r1]
    movddup m0, [r0]
    movddup m1, [r0 + r1]
    movddup m2, [r0 + r1 * 2]
    movddup m3, [r0 + r4]
    mova    m4, [hmul_8w]
    pmaddwd m0, m4
    pmaddwd m1, m4
    pmaddwd m2, m4
    pmaddwd m3, m4

    ; m5 = DC term (sum of pixels >> 2)
    paddd   m5, m0, m1
    paddd   m5, m2
    paddd   m5, m3
    psrldq  m4, m5, 4
    paddd   m5, m4
    psrld   m5, 2

    ; 4x4 Hadamard with abs-max reduction -> SATD
    SUMSUB_BA d, 0, 1, 4
    SUMSUB_BA d, 2, 3, 4
    SUMSUB_BA d, 0, 2, 4
    SUMSUB_BA d, 1, 3, 4
%define ORDER unord
    TRANS q, ORDER, 0, 2, 4, 6
    TRANS q, ORDER, 1, 3, 4, 6
    ABSD2   m0, m2, m0, m2, m4, m6
    pmaxsd  m0, m2
    ABSD2   m1, m3, m1, m3, m4, m6
    pmaxsd  m1, m3
    paddd   m0, m1
    movhlps m1, m0
    paddd   m0, m1
    psrldq  m1, m0, 4
    paddd   m0, m1

    psubd   m7, m0, m5        ; m7 = energy(source)

    ; -------- recon block: same computation --------
    lea     r4, [3 * r3]
    movddup m0, [r2]
    movddup m1, [r2 + r3]
    movddup m2, [r2 + r3 * 2]
    movddup m3, [r2 + r4]
    mova    m4, [hmul_8w]
    pmaddwd m0, m4
    pmaddwd m1, m4
    pmaddwd m2, m4
    pmaddwd m3, m4

    paddd   m5, m0, m1
    paddd   m5, m2
    paddd   m5, m3
    psrldq  m4, m5, 4
    paddd   m5, m4
    psrld   m5, 2

    SUMSUB_BA d, 0, 1, 4
    SUMSUB_BA d, 2, 3, 4
    SUMSUB_BA d, 0, 2, 4
    SUMSUB_BA d, 1, 3, 4
%define ORDER unord
    TRANS q, ORDER, 0, 2, 4, 6
    TRANS q, ORDER, 1, 3, 4, 6
    ABSD2   m0, m2, m0, m2, m4, m6
    pmaxsd  m0, m2
    ABSD2   m1, m3, m1, m3, m4, m6
    pmaxsd  m1, m3
    paddd   m0, m1
    movhlps m1, m0
    paddd   m0, m1
    psrldq  m1, m0, 4
    paddd   m0, m1

    psubd   m0, m5            ; m0 = energy(recon)

    psubd   m7, m0            ; |energy(source) - energy(recon)|
    pabsd   m0, m7
    movd    eax, m0

%else ; !HIGH_BIT_DEPTH
    ; -------- source block (8-bit: pack rows in pairs) --------
    lea       r4, [3 * r1]
    movd      m0, [r0]
    movd      m1, [r0 + r1]
    movd      m2, [r0 + r1 * 2]
    movd      m3, [r0 + r4]
    shufps    m0, m1, 0
    shufps    m2, m3, 0
    mova      m4, [hmul_4p]
    pmaddubsw m0, m4
    pmaddubsw m2, m4

    ; m5 = DC term (sum of pixels >> 2)
    paddw     m5, m0, m2
    movhlps   m4, m5
    paddw     m5, m4
    pmaddwd   m5, [pw_1]
    psrld     m5, 2

    ; 4x4 Hadamard -> SATD
    HADAMARD 0, sumsub, 0, 2, 1, 3
    HADAMARD 4, sumsub, 0, 2, 1, 3
    HADAMARD 1, amax, 0, 2, 1, 3
    HADDW     m0, m2

    psubd     m6, m0, m5      ; m6 = energy(source)

    ; -------- recon block: same computation --------
    lea       r4, [3 * r3]
    movd      m0, [r2]
    movd      m1, [r2 + r3]
    movd      m2, [r2 + r3 * 2]
    movd      m3, [r2 + r4]
    shufps    m0, m1, 0
    shufps    m2, m3, 0
    mova      m4, [hmul_4p]
    pmaddubsw m0, m4
    pmaddubsw m2, m4

    paddw     m5, m0, m2
    movhlps   m4, m5
    paddw     m5, m4
    pmaddwd   m5, [pw_1]
    psrld     m5, 2

    HADAMARD 0, sumsub, 0, 2, 1, 3
    HADAMARD 4, sumsub, 0, 2, 1, 3
    HADAMARD 1, amax, 0, 2, 1, 3
    HADDW     m0, m2

    psubd     m0, m5          ; m0 = energy(recon)

    psubd     m6, m0          ; |energy(source) - energy(recon)|
    pabsd     m0, m6
    movd      eax, m0
%endif ; HIGH_BIT_DEPTH
    RET
7473 | |
%if ARCH_X86_64
; int psyCost_pp_8x8(const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride)
; 8x8 psy-rd cost: energy = (SA8D(block) + 1)/2 - (pixel sum >> 2);
; returns |energy(source) - energy(recon)|.
INIT_XMM sse4
cglobal psyCost_pp_8x8, 4, 6, 13

%if HIGH_BIT_DEPTH
    FIX_STRIDES r1, r3
    ; -------- source block --------
    lea     r4, [3 * r1]
    pxor    m10, m10
    movu    m0, [r0]
    movu    m1, [r0 + r1]
    movu    m2, [r0 + r1 * 2]
    movu    m3, [r0 + r4]
    lea     r5, [r0 + r1 * 4]
    movu    m4, [r5]
    movu    m5, [r5 + r1]
    movu    m6, [r5 + r1 * 2]
    movu    m7, [r5 + r4]

    ; m8 = DC term (sum of all 64 pixels >> 2)
    paddw   m8, m0, m1
    paddw   m8, m2
    paddw   m8, m3
    paddw   m8, m4
    paddw   m8, m5
    paddw   m8, m6
    paddw   m8, m7
    pmaddwd m8, [pw_1]
    movhlps m9, m8
    paddd   m8, m9
    psrldq  m9, m8, 4
    paddd   m8, m9
    psrld   m8, 2

    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    ; (sa8d sum + 1) >> 1
    paddd   m0, m1
    paddd   m0, m2
    paddd   m0, m3
    HADDUW  m0, m1
    paddd   m0, [pd_1]
    psrld   m0, 1
    psubd   m10, m0, m8       ; m10 = energy(source)

    ; -------- recon block: same computation --------
    lea     r4, [3 * r3]
    movu    m0, [r2]
    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 * 2]
    movu    m3, [r2 + r4]
    lea     r5, [r2 + r3 * 4]
    movu    m4, [r5]
    movu    m5, [r5 + r3]
    movu    m6, [r5 + r3 * 2]
    movu    m7, [r5 + r4]

    paddw   m8, m0, m1
    paddw   m8, m2
    paddw   m8, m3
    paddw   m8, m4
    paddw   m8, m5
    paddw   m8, m6
    paddw   m8, m7
    pmaddwd m8, [pw_1]
    movhlps m9, m8
    paddd   m8, m9
    psrldq  m9, m8, 4
    paddd   m8, m9
    psrld   m8, 2

    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    paddd   m0, m1
    paddd   m0, m2
    paddd   m0, m3
    HADDUW  m0, m1
    paddd   m0, [pd_1]
    psrld   m0, 1
    psubd   m0, m8            ; m0 = energy(recon)
    psubd   m10, m0
    pabsd   m0, m10           ; |energy(source) - energy(recon)|
    movd    eax, m0

%else ; !HIGH_BIT_DEPTH
    ; -------- source block (8-bit: rows duplicated for hmul) --------
    lea       r4, [3 * r1]
    mova      m8, [hmul_8p]

    movddup   m0, [r0]
    movddup   m1, [r0 + r1]
    movddup   m2, [r0 + r1 * 2]
    movddup   m3, [r0 + r4]
    lea       r5, [r0 + r1 * 4]
    movddup   m4, [r5]
    movddup   m5, [r5 + r1]
    movddup   m6, [r5 + r1 * 2]
    movddup   m7, [r5 + r4]

    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m8
    pmaddubsw m3, m8
    pmaddubsw m4, m8
    pmaddubsw m5, m8
    pmaddubsw m6, m8
    pmaddubsw m7, m8

    ; m11 = DC term (sum of pixels >> 2)
    paddw     m11, m0, m1
    paddw     m11, m2
    paddw     m11, m3
    paddw     m11, m4
    paddw     m11, m5
    paddw     m11, m6
    paddw     m11, m7

    pmaddwd   m11, [pw_1]
    psrldq    m10, m11, 4
    paddd     m11, m10
    psrld     m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10

    ; (sa8d sum + 1) >> 1
    paddw     m0, m1
    paddw     m0, m2
    paddw     m0, m3
    HADDW     m0, m1

    paddd     m0, [pd_1]
    psrld     m0, 1
    psubd     m12, m0, m11    ; m12 = energy(source)

    ; -------- recon block: same computation --------
    lea       r4, [3 * r3]

    movddup   m0, [r2]
    movddup   m1, [r2 + r3]
    movddup   m2, [r2 + r3 * 2]
    movddup   m3, [r2 + r4]
    lea       r5, [r2 + r3 * 4]
    movddup   m4, [r5]
    movddup   m5, [r5 + r3]
    movddup   m6, [r5 + r3 * 2]
    movddup   m7, [r5 + r4]

    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m8
    pmaddubsw m3, m8
    pmaddubsw m4, m8
    pmaddubsw m5, m8
    pmaddubsw m6, m8
    pmaddubsw m7, m8

    paddw     m11, m0, m1
    paddw     m11, m2
    paddw     m11, m3
    paddw     m11, m4
    paddw     m11, m5
    paddw     m11, m6
    paddw     m11, m7

    pmaddwd   m11, [pw_1]
    psrldq    m10, m11, 4
    paddd     m11, m10
    psrld     m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10

    paddw     m0, m1
    paddw     m0, m2
    paddw     m0, m3
    HADDW     m0, m1

    paddd     m0, [pd_1]
    psrld     m0, 1
    psubd     m0, m11         ; m0 = energy(recon)
    psubd     m12, m0
    pabsd     m0, m12         ; |energy(source) - energy(recon)|
    movd      eax, m0
%endif ; HIGH_BIT_DEPTH
    RET
%endif
7650 | |
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
; int psyCost_pp_16x16(const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride)
; Walks a 2x2 grid of 8x8 sub-blocks, computing the 8x8 psy cost for each
; (energy = (SA8D+1)/2 - (sum>>2); cost = |energy(src) - energy(recon)|)
; and accumulating the total in m11.
INIT_XMM sse4
cglobal psyCost_pp_16x16, 4, 9, 14

    FIX_STRIDES r1, r3
    lea     r4, [3 * r1]
    lea     r8, [3 * r3]
    mova    m12, [pw_1]
    mova    m13, [pd_1]
    pxor    m11, m11          ; m11 = running cost total
    mov     r7d, 2            ; 2 rows of 8x8 sub-blocks
.loopH:
    mov     r6d, 2            ; 2 columns of 8x8 sub-blocks
.loopW:
    ; -------- source 8x8 sub-block --------
    pxor    m10, m10
    movu    m0, [r0]
    movu    m1, [r0 + r1]
    movu    m2, [r0 + r1 * 2]
    movu    m3, [r0 + r4]
    lea     r5, [r0 + r1 * 4]
    movu    m4, [r5]
    movu    m5, [r5 + r1]
    movu    m6, [r5 + r1 * 2]
    movu    m7, [r5 + r4]

    ; m8 = DC term (sum >> 2)
    paddw   m8, m0, m1
    paddw   m8, m2
    paddw   m8, m3
    paddw   m8, m4
    paddw   m8, m5
    paddw   m8, m6
    paddw   m8, m7
    pmaddwd m8, m12
    movhlps m9, m8
    paddd   m8, m9
    psrldq  m9, m8, 4
    paddd   m8, m9
    psrld   m8, 2

    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    paddd   m0, m1
    paddd   m0, m2
    paddd   m0, m3
    HADDUW  m0, m1
    paddd   m0, m13
    psrld   m0, 1             ; (sa8d + 1) >> 1
    psubd   m10, m0, m8       ; m10 = energy(source sub-block)

    ; -------- recon 8x8 sub-block --------
    movu    m0, [r2]
    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 * 2]
    movu    m3, [r2 + r8]
    lea     r5, [r2 + r3 * 4]
    movu    m4, [r5]
    movu    m5, [r5 + r3]
    movu    m6, [r5 + r3 * 2]
    movu    m7, [r5 + r8]

    paddw   m8, m0, m1
    paddw   m8, m2
    paddw   m8, m3
    paddw   m8, m4
    paddw   m8, m5
    paddw   m8, m6
    paddw   m8, m7
    pmaddwd m8, m12
    movhlps m9, m8
    paddd   m8, m9
    psrldq  m9, m8, 4
    paddd   m8, m9
    psrld   m8, 2

    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    paddd   m0, m1
    paddd   m0, m2
    paddd   m0, m3
    HADDUW  m0, m1
    paddd   m0, m13
    psrld   m0, 1
    psubd   m0, m8            ; energy(recon sub-block)
    psubd   m10, m0
    pabsd   m0, m10           ; per-sub-block cost
    paddd   m11, m0           ; accumulate
    add     r0, 16            ; next sub-block column (8 pixels * 2 bytes)
    add     r2, 16
    dec     r6d
    jnz     .loopW
    lea     r0, [r0 + r1 * 8 - 32] ; next sub-block row, rewind columns
    lea     r2, [r2 + r3 * 8 - 32]
    dec     r7d
    jnz     .loopH
    movd    eax, m11
    RET
%else ; !HIGH_BIT_DEPTH
; 8-bit variant: same 2x2 sub-block walk; accumulator is m13.
INIT_XMM sse4
cglobal psyCost_pp_16x16, 4, 9, 15
    lea       r4, [3 * r1]
    lea       r8, [3 * r3]
    mova      m8, [hmul_8p]
    mova      m10, [pw_1]
    mova      m14, [pd_1]
    pxor      m13, m13        ; m13 = running cost total
    mov       r7d, 2
.loopH:
    mov       r6d, 2
.loopW:
    ; -------- source 8x8 sub-block --------
    pxor      m12, m12
    movddup   m0, [r0]
    movddup   m1, [r0 + r1]
    movddup   m2, [r0 + r1 * 2]
    movddup   m3, [r0 + r4]
    lea       r5, [r0 + r1 * 4]
    movddup   m4, [r5]
    movddup   m5, [r5 + r1]
    movddup   m6, [r5 + r1 * 2]
    movddup   m7, [r5 + r4]

    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m8
    pmaddubsw m3, m8
    pmaddubsw m4, m8
    pmaddubsw m5, m8
    pmaddubsw m6, m8
    pmaddubsw m7, m8

    ; m11 = DC term (sum >> 2)
    paddw     m11, m0, m1
    paddw     m11, m2
    paddw     m11, m3
    paddw     m11, m4
    paddw     m11, m5
    paddw     m11, m6
    paddw     m11, m7

    pmaddwd   m11, m10
    psrldq    m9, m11, 4
    paddd     m11, m9
    psrld     m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9

    paddw     m0, m1
    paddw     m0, m2
    paddw     m0, m3
    HADDW     m0, m1

    paddd     m0, m14
    psrld     m0, 1           ; (sa8d + 1) >> 1
    psubd     m12, m0, m11    ; m12 = energy(source sub-block)

    ; -------- recon 8x8 sub-block --------
    movddup   m0, [r2]
    movddup   m1, [r2 + r3]
    movddup   m2, [r2 + r3 * 2]
    movddup   m3, [r2 + r8]
    lea       r5, [r2 + r3 * 4]
    movddup   m4, [r5]
    movddup   m5, [r5 + r3]
    movddup   m6, [r5 + r3 * 2]
    movddup   m7, [r5 + r8]

    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m8
    pmaddubsw m3, m8
    pmaddubsw m4, m8
    pmaddubsw m5, m8
    pmaddubsw m6, m8
    pmaddubsw m7, m8

    paddw     m11, m0, m1
    paddw     m11, m2
    paddw     m11, m3
    paddw     m11, m4
    paddw     m11, m5
    paddw     m11, m6
    paddw     m11, m7

    pmaddwd   m11, m10
    psrldq    m9, m11, 4
    paddd     m11, m9
    psrld     m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9

    paddw     m0, m1
    paddw     m0, m2
    paddw     m0, m3
    HADDW     m0, m1

    paddd     m0, m14
    psrld     m0, 1
    psubd     m0, m11         ; energy(recon sub-block)
    psubd     m12, m0
    pabsd     m0, m12         ; per-sub-block cost
    paddd     m13, m0         ; accumulate
    add       r0, 8           ; next sub-block column (8 pixels)
    add       r2, 8
    dec       r6d
    jnz       .loopW
    lea       r0, [r0 + r1 * 8 - 16] ; next sub-block row, rewind columns
    lea       r2, [r2 + r3 * 8 - 16]
    dec       r7d
    jnz       .loopH
    movd      eax, m13
    RET
%endif ; HIGH_BIT_DEPTH
%endif
7861 | |
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
INIT_XMM sse4
;-----------------------------------------------------------------------------
; int psyCost_pp_32x32(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
; HBD (16-bit pixels): walks a 4x4 grid of 8x8 sub-blocks; per block it
; computes a Hadamard energy minus a DC estimate for source and recon and
; accumulates |energy(src) - energy(rec)| in m11.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_32x32, 4, 9, 14

    FIX_STRIDES     r1, r3                  ; strides in bytes (2 bytes/pixel)
    lea             r4, [3 * r1]
    lea             r8, [3 * r3]
    mova            m12, [pw_1]
    mova            m13, [pd_1]
    pxor            m11, m11                ; m11 = running cost accumulator
    mov             r7d, 4                  ; 4 rows of 8x8 blocks
.loopH:
    mov             r6d, 4                  ; 4 columns of 8x8 blocks
.loopW:
    pxor            m10, m10
    ; load 8x8 source block (8 words per row)
    movu            m0, [r0]
    movu            m1, [r0 + r1]
    movu            m2, [r0 + r1 * 2]
    movu            m3, [r0 + r4]
    lea             r5, [r0 + r1 * 4]
    movu            m4, [r5]
    movu            m5, [r5 + r1]
    movu            m6, [r5 + r1 * 2]
    movu            m7, [r5 + r4]

    ; DC estimate: (sum of all source pixels) >> 2
    paddw           m8, m0, m1
    paddw           m8, m2
    paddw           m8, m3
    paddw           m8, m4
    paddw           m8, m5
    paddw           m8, m6
    paddw           m8, m7
    pmaddwd         m8, m12
    movhlps         m9, m8
    paddd           m8, m9
    psrldq          m9, m8, 4
    paddd           m8, m9
    psrld           m8, 2

    HADAMARD8_2D    0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    paddd           m0, m1
    paddd           m0, m2
    paddd           m0, m3
    HADDUW          m0, m1
    paddd           m0, m13
    psrld           m0, 1
    psubd           m10, m0, m8             ; m10 = source block energy

    ; repeat the same computation for the recon block
    movu            m0, [r2]
    movu            m1, [r2 + r3]
    movu            m2, [r2 + r3 * 2]
    movu            m3, [r2 + r8]
    lea             r5, [r2 + r3 * 4]
    movu            m4, [r5]
    movu            m5, [r5 + r3]
    movu            m6, [r5 + r3 * 2]
    movu            m7, [r5 + r8]

    paddw           m8, m0, m1
    paddw           m8, m2
    paddw           m8, m3
    paddw           m8, m4
    paddw           m8, m5
    paddw           m8, m6
    paddw           m8, m7
    pmaddwd         m8, m12
    movhlps         m9, m8
    paddd           m8, m9
    psrldq          m9, m8, 4
    paddd           m8, m9
    psrld           m8, 2

    HADAMARD8_2D    0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    paddd           m0, m1
    paddd           m0, m2
    paddd           m0, m3
    HADDUW          m0, m1
    paddd           m0, m13
    psrld           m0, 1
    psubd           m0, m8                  ; m0 = recon block energy
    psubd           m10, m0
    pabsd           m0, m10                 ; |src energy - rec energy|
    paddd           m11, m0
    add             r0, 16                  ; next 8x8 column (2 bytes/pixel)
    add             r2, 16
    dec             r6d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 64]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 64]
    dec             r7d
    jnz             .loopH
    movd            eax, m11
    RET

%else ; !HIGH_BIT_DEPTH
INIT_XMM sse4
;-----------------------------------------------------------------------------
; int psyCost_pp_32x32(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
; 8bpp: walks a 4x4 grid of 8x8 sub-blocks; per block it computes a Hadamard
; energy minus a DC estimate for source and recon and accumulates
; |energy(src) - energy(rec)| in m13.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_32x32, 4, 9, 15

    lea             r4, [3 * r1]
    lea             r8, [3 * r3]
    mova            m8, [hmul_8p]
    mova            m10, [pw_1]
    mova            m14, [pd_1]
    pxor            m13, m13                ; m13 = running cost accumulator
    mov             r7d, 4                  ; 4 rows of 8x8 blocks
.loopH:
    mov             r6d, 4                  ; 4 columns of 8x8 blocks
.loopW:
    pxor            m12, m12
    ; load 8x8 source block, rows duplicated for pmaddubsw with hmul_8p
    movddup         m0, [r0]
    movddup         m1, [r0 + r1]
    movddup         m2, [r0 + r1 * 2]
    movddup         m3, [r0 + r4]
    lea             r5, [r0 + r1 * 4]
    movddup         m4, [r5]
    movddup         m5, [r5 + r1]
    movddup         m6, [r5 + r1 * 2]
    movddup         m7, [r5 + r4]

    pmaddubsw       m0, m8
    pmaddubsw       m1, m8
    pmaddubsw       m2, m8
    pmaddubsw       m3, m8
    pmaddubsw       m4, m8
    pmaddubsw       m5, m8
    pmaddubsw       m6, m8
    pmaddubsw       m7, m8

    ; DC estimate: (sum of all source pixels) >> 2
    paddw           m11, m0, m1
    paddw           m11, m2
    paddw           m11, m3
    paddw           m11, m4
    paddw           m11, m5
    paddw           m11, m6
    paddw           m11, m7

    pmaddwd         m11, m10
    psrldq          m9, m11, 4
    paddd           m11, m9
    psrld           m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9

    paddw           m0, m1
    paddw           m0, m2
    paddw           m0, m3
    HADDW           m0, m1

    paddd           m0, m14
    psrld           m0, 1
    psubd           m12, m0, m11            ; m12 = source block energy

    ; repeat the same computation for the recon block
    movddup         m0, [r2]
    movddup         m1, [r2 + r3]
    movddup         m2, [r2 + r3 * 2]
    movddup         m3, [r2 + r8]
    lea             r5, [r2 + r3 * 4]
    movddup         m4, [r5]
    movddup         m5, [r5 + r3]
    movddup         m6, [r5 + r3 * 2]
    movddup         m7, [r5 + r8]

    pmaddubsw       m0, m8
    pmaddubsw       m1, m8
    pmaddubsw       m2, m8
    pmaddubsw       m3, m8
    pmaddubsw       m4, m8
    pmaddubsw       m5, m8
    pmaddubsw       m6, m8
    pmaddubsw       m7, m8

    paddw           m11, m0, m1
    paddw           m11, m2
    paddw           m11, m3
    paddw           m11, m4
    paddw           m11, m5
    paddw           m11, m6
    paddw           m11, m7

    pmaddwd         m11, m10
    psrldq          m9, m11, 4
    paddd           m11, m9
    psrld           m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9

    paddw           m0, m1
    paddw           m0, m2
    paddw           m0, m3
    HADDW           m0, m1

    paddd           m0, m14
    psrld           m0, 1
    psubd           m0, m11                 ; m0 = recon block energy
    psubd           m12, m0
    pabsd           m0, m12                 ; |src energy - rec energy|
    paddd           m13, m0
    add             r0, 8                   ; next 8x8 column (1 byte/pixel)
    add             r2, 8
    dec             r6d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 32]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 32]
    dec             r7d
    jnz             .loopH
    movd            eax, m13
    RET
%endif ; HIGH_BIT_DEPTH
%endif
8074 | |
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
INIT_XMM sse4
;-----------------------------------------------------------------------------
; int psyCost_pp_64x64(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
; HBD (16-bit pixels): walks an 8x8 grid of 8x8 sub-blocks; per block it
; computes a Hadamard energy minus a DC estimate for source and recon and
; accumulates |energy(src) - energy(rec)| in m11.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_64x64, 4, 9, 14

    FIX_STRIDES     r1, r3                  ; strides in bytes (2 bytes/pixel)
    lea             r4, [3 * r1]
    lea             r8, [3 * r3]
    mova            m12, [pw_1]
    mova            m13, [pd_1]
    pxor            m11, m11                ; m11 = running cost accumulator
    mov             r7d, 8                  ; 8 rows of 8x8 blocks
.loopH:
    mov             r6d, 8                  ; 8 columns of 8x8 blocks
.loopW:
    pxor            m10, m10
    ; load 8x8 source block (8 words per row)
    movu            m0, [r0]
    movu            m1, [r0 + r1]
    movu            m2, [r0 + r1 * 2]
    movu            m3, [r0 + r4]
    lea             r5, [r0 + r1 * 4]
    movu            m4, [r5]
    movu            m5, [r5 + r1]
    movu            m6, [r5 + r1 * 2]
    movu            m7, [r5 + r4]

    ; DC estimate: (sum of all source pixels) >> 2
    paddw           m8, m0, m1
    paddw           m8, m2
    paddw           m8, m3
    paddw           m8, m4
    paddw           m8, m5
    paddw           m8, m6
    paddw           m8, m7
    pmaddwd         m8, m12
    movhlps         m9, m8
    paddd           m8, m9
    psrldq          m9, m8, 4
    paddd           m8, m9
    psrld           m8, 2

    HADAMARD8_2D    0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    paddd           m0, m1
    paddd           m0, m2
    paddd           m0, m3
    HADDUW          m0, m1
    paddd           m0, m13
    psrld           m0, 1
    psubd           m10, m0, m8             ; m10 = source block energy

    ; repeat the same computation for the recon block
    movu            m0, [r2]
    movu            m1, [r2 + r3]
    movu            m2, [r2 + r3 * 2]
    movu            m3, [r2 + r8]
    lea             r5, [r2 + r3 * 4]
    movu            m4, [r5]
    movu            m5, [r5 + r3]
    movu            m6, [r5 + r3 * 2]
    movu            m7, [r5 + r8]

    paddw           m8, m0, m1
    paddw           m8, m2
    paddw           m8, m3
    paddw           m8, m4
    paddw           m8, m5
    paddw           m8, m6
    paddw           m8, m7
    pmaddwd         m8, m12
    movhlps         m9, m8
    paddd           m8, m9
    psrldq          m9, m8, 4
    paddd           m8, m9
    psrld           m8, 2

    HADAMARD8_2D    0, 1, 2, 3, 4, 5, 6, 7, 9, amax

    paddd           m0, m1
    paddd           m0, m2
    paddd           m0, m3
    HADDUW          m0, m1
    paddd           m0, m13
    psrld           m0, 1
    psubd           m0, m8                  ; m0 = recon block energy
    psubd           m10, m0
    pabsd           m0, m10                 ; |src energy - rec energy|
    paddd           m11, m0
    add             r0, 16                  ; next 8x8 column (2 bytes/pixel)
    add             r2, 16
    dec             r6d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 128] ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 128]
    dec             r7d
    jnz             .loopH
    movd            eax, m11
    RET

%else ; !HIGH_BIT_DEPTH
INIT_XMM sse4
;-----------------------------------------------------------------------------
; int psyCost_pp_64x64(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
; 8bpp: walks an 8x8 grid of 8x8 sub-blocks; per block it computes a Hadamard
; energy minus a DC estimate for source and recon and accumulates
; |energy(src) - energy(rec)| in m13.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_64x64, 4, 9, 15

    lea             r4, [3 * r1]
    lea             r8, [3 * r3]
    mova            m8, [hmul_8p]
    mova            m10, [pw_1]
    mova            m14, [pd_1]
    pxor            m13, m13                ; m13 = running cost accumulator
    mov             r7d, 8                  ; 8 rows of 8x8 blocks
.loopH:
    mov             r6d, 8                  ; 8 columns of 8x8 blocks
.loopW:
    pxor            m12, m12
    ; load 8x8 source block, rows duplicated for pmaddubsw with hmul_8p
    movddup         m0, [r0]
    movddup         m1, [r0 + r1]
    movddup         m2, [r0 + r1 * 2]
    movddup         m3, [r0 + r4]
    lea             r5, [r0 + r1 * 4]
    movddup         m4, [r5]
    movddup         m5, [r5 + r1]
    movddup         m6, [r5 + r1 * 2]
    movddup         m7, [r5 + r4]

    pmaddubsw       m0, m8
    pmaddubsw       m1, m8
    pmaddubsw       m2, m8
    pmaddubsw       m3, m8
    pmaddubsw       m4, m8
    pmaddubsw       m5, m8
    pmaddubsw       m6, m8
    pmaddubsw       m7, m8

    ; DC estimate: (sum of all source pixels) >> 2
    paddw           m11, m0, m1
    paddw           m11, m2
    paddw           m11, m3
    paddw           m11, m4
    paddw           m11, m5
    paddw           m11, m6
    paddw           m11, m7

    pmaddwd         m11, m10
    psrldq          m9, m11, 4
    paddd           m11, m9
    psrld           m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9

    paddw           m0, m1
    paddw           m0, m2
    paddw           m0, m3
    HADDW           m0, m1

    paddd           m0, m14
    psrld           m0, 1
    psubd           m12, m0, m11            ; m12 = source block energy

    ; repeat the same computation for the recon block
    movddup         m0, [r2]
    movddup         m1, [r2 + r3]
    movddup         m2, [r2 + r3 * 2]
    movddup         m3, [r2 + r8]
    lea             r5, [r2 + r3 * 4]
    movddup         m4, [r5]
    movddup         m5, [r5 + r3]
    movddup         m6, [r5 + r3 * 2]
    movddup         m7, [r5 + r8]

    pmaddubsw       m0, m8
    pmaddubsw       m1, m8
    pmaddubsw       m2, m8
    pmaddubsw       m3, m8
    pmaddubsw       m4, m8
    pmaddubsw       m5, m8
    pmaddubsw       m6, m8
    pmaddubsw       m7, m8

    paddw           m11, m0, m1
    paddw           m11, m2
    paddw           m11, m3
    paddw           m11, m4
    paddw           m11, m5
    paddw           m11, m6
    paddw           m11, m7

    pmaddwd         m11, m10
    psrldq          m9, m11, 4
    paddd           m11, m9
    psrld           m11, 2

    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9

    paddw           m0, m1
    paddw           m0, m2
    paddw           m0, m3
    HADDW           m0, m1

    paddd           m0, m14
    psrld           m0, 1
    psubd           m0, m11                 ; m0 = recon block energy
    psubd           m12, m0
    pabsd           m0, m12                 ; |src energy - rec energy|
    paddd           m13, m0
    add             r0, 8                   ; next 8x8 column (1 byte/pixel)
    add             r2, 8
    dec             r6d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 64]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 64]
    dec             r7d
    jnz             .loopH
    movd            eax, m13
    RET
%endif ; HIGH_BIT_DEPTH
%endif
8287 | |
INIT_YMM avx2
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_4x4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
; HBD: source rows in the low 128-bit lane, recon rows in the high lane, so
; both 4x4 Hadamard energies are computed in one pass; the final
; vextracti128/psubd yields |energy(src) - energy(rec)|.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_4x4, 4, 5, 6
    add             r1d, r1d                ; stride in bytes (2 bytes/pixel)
    add             r3d, r3d
    lea             r4, [r1 * 3]
    movddup         xm0, [r0]
    movddup         xm1, [r0 + r1]
    movddup         xm2, [r0 + r1 * 2]
    movddup         xm3, [r0 + r4]

    lea             r4, [r3 * 3]
    movddup         xm4, [r2]
    movddup         xm5, [r2 + r3]
    vinserti128     m0, m0, xm4, 1          ; lane0 = source, lane1 = recon
    vinserti128     m1, m1, xm5, 1
    movddup         xm4, [r2 + r3 * 2]
    movddup         xm5, [r2 + r4]
    vinserti128     m2, m2, xm4, 1
    vinserti128     m3, m3, xm5, 1

    mova            m4, [hmul_8w]
    pmaddwd         m0, m4
    pmaddwd         m1, m4
    pmaddwd         m2, m4
    pmaddwd         m3, m4
    ; DC estimate per lane: (sum of pixels) >> 2
    paddd           m5, m0, m1
    paddd           m4, m2, m3
    paddd           m5, m4
    psrldq          m4, m5, 4
    paddd           m5, m4
    psrld           m5, 2

    ; vertical butterflies of the 4x4 Hadamard
    mova            m4, m0
    paddd           m0, m1
    psubd           m1, m4
    mova            m4, m2
    paddd           m2, m3
    psubd           m3, m4
    mova            m4, m0
    paddd           m0, m2
    psubd           m2, m4
    mova            m4, m1
    paddd           m1, m3
    psubd           m3, m4
    ; horizontal stage + SA8D-style abs/max combine
    movaps          m4, m0
    vshufps         m4, m4, m2, 11011101b
    vshufps         m0, m0, m2, 10001000b
    movaps          m2, m1
    vshufps         m2, m2, m3, 11011101b
    vshufps         m1, m1, m3, 10001000b
    pabsd           m0, m0
    pabsd           m4, m4
    pmaxsd          m0, m4
    pabsd           m1, m1
    pabsd           m2, m2
    pmaxsd          m1, m2
    paddd           m0, m1

    vpermq          m1, m0, 11110101b
    paddd           m0, m1
    psrldq          m1, m0, 4
    paddd           m0, m1
    psubd           m0, m5                  ; per-lane energy = satd - DC term

    vextracti128    xm1, m0, 1
    psubd           xm1, xm0
    pabsd           xm1, xm1                ; |energy(src) - energy(rec)|
    movd            eax, xm1
    RET
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_4x4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
; 8bpp: source block in the low 128-bit lane, recon block in the high lane;
; both 4x4 Hadamard energies are computed in one pass and differenced at the
; end.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_4x4, 4, 5, 6
    lea             r4, [3 * r1]
    movd            xm0, [r0]
    movd            xm1, [r0 + r1]
    movd            xm2, [r0 + r1 * 2]
    movd            xm3, [r0 + r4]
    vshufps         xm0, xm1, 0             ; pack source rows 0/1 and 2/3
    vshufps         xm2, xm3, 0

    lea             r4, [3 * r3]
    movd            xm1, [r2]
    movd            xm3, [r2 + r3]
    movd            xm4, [r2 + r3 * 2]
    movd            xm5, [r2 + r4]
    vshufps         xm1, xm3, 0             ; pack recon rows 0/1 and 2/3
    vshufps         xm4, xm5, 0

    vinserti128     m0, m0, xm1, 1          ; lane0 = source, lane1 = recon
    vinserti128     m2, m2, xm4, 1

    mova            m4, [hmul_4p]
    pmaddubsw       m0, m4
    pmaddubsw       m2, m4

    ; DC estimate per lane: (sum of pixels) >> 2
    paddw           m5, m0, m2
    mova            m1, m5
    psrldq          m4, m5, 8
    paddw           m5, m4
    pmaddwd         m5, [pw_1]
    psrld           m5, 2

    ; 4x4 Hadamard butterflies + abs/max combine
    vpsubw          m2, m2, m0
    vpunpckhqdq     m0, m1, m2
    vpunpcklqdq     m1, m1, m2
    vpaddw          m2, m1, m0
    vpsubw          m0, m0, m1
    vpblendw        m1, m2, m0, 10101010b
    vpslld          m0, m0, 10h
    vpsrld          m2, m2, 10h
    vpor            m0, m0, m2
    vpabsw          m1, m1
    vpabsw          m0, m0
    vpmaxsw         m1, m1, m0
    vpmaddwd        m1, m1, [pw_1]
    psrldq          m2, m1, 8
    paddd           m1, m2
    psrldq          m3, m1, 4
    paddd           m1, m3
    psubd           m1, m5                  ; per-lane energy = satd - DC term
    vextracti128    xm2, m1, 1
    psubd           m1, m2
    pabsd           m1, m1                  ; |energy(src) - energy(rec)|
    movd            eax, xm1
    RET
%endif
8414 | |
;-----------------------------------------------------------------------------
; PSY_PP_8x8: 8bpp psy-cost kernel for one 8x8 block pair (AVX2).
; In:  r0/r1 = source/stride, r2/r3 = recon/stride, r4 = 3*r1, r7 = 3*r3,
;      m8 = [hmul_8p]
; Out: xm0 = |energy(source) - energy(recon)| for the 8x8 block.
; Source rows are placed in the low 128-bit lane and recon rows in the high
; lane, so both blocks go through the Hadamard transform together.
; Clobbers m0-m12, r5, r6.
;-----------------------------------------------------------------------------
%macro PSY_PP_8x8 0
    movddup         m0, [r0 + r1 * 0]
    movddup         m1, [r0 + r1 * 1]
    movddup         m2, [r0 + r1 * 2]
    movddup         m3, [r0 + r4 * 1]

    lea             r5, [r0 + r1 * 4]

    movddup         m4, [r2 + r3 * 0]
    movddup         m5, [r2 + r3 * 1]
    movddup         m6, [r2 + r3 * 2]
    movddup         m7, [r2 + r7 * 1]

    lea             r6, [r2 + r3 * 4]

    vinserti128     m0, m0, xm4, 1          ; lane0 = source, lane1 = recon
    vinserti128     m1, m1, xm5, 1
    vinserti128     m2, m2, xm6, 1
    vinserti128     m3, m3, xm7, 1

    movddup         m4, [r5 + r1 * 0]
    movddup         m5, [r5 + r1 * 1]
    movddup         m6, [r5 + r1 * 2]
    movddup         m7, [r5 + r4 * 1]

    movddup         m9, [r6 + r3 * 0]
    movddup         m10, [r6 + r3 * 1]
    movddup         m11, [r6 + r3 * 2]
    movddup         m12, [r6 + r7 * 1]

    vinserti128     m4, m4, xm9, 1
    vinserti128     m5, m5, xm10, 1
    vinserti128     m6, m6, xm11, 1
    vinserti128     m7, m7, xm12, 1

    pmaddubsw       m0, m8
    pmaddubsw       m1, m8
    pmaddubsw       m2, m8
    pmaddubsw       m3, m8
    pmaddubsw       m4, m8
    pmaddubsw       m5, m8
    pmaddubsw       m6, m8
    pmaddubsw       m7, m8

    ; DC estimate per lane: (sum of all pixels) >> 2
    paddw           m11, m0, m1
    paddw           m11, m2
    paddw           m11, m3
    paddw           m11, m4
    paddw           m11, m5
    paddw           m11, m6
    paddw           m11, m7

    pmaddwd         m11, [pw_1]
    psrldq          m10, m11, 4
    paddd           m11, m10
    psrld           m11, 2

    ; vertical 8-point Hadamard butterflies
    mova            m9, m0
    paddw           m0, m1                  ; m0+m1
    psubw           m1, m9                  ; m1-m0
    mova            m9, m2
    paddw           m2, m3                  ; m2+m3
    psubw           m3, m9                  ; m3-m2
    mova            m9, m0
    paddw           m0, m2                  ; m0+m1+m2+m3
    psubw           m2, m9                  ; m2+m3-m0+m1
    mova            m9, m1
    paddw           m1, m3                  ; m1-m0+m3-m2
    psubw           m3, m9                  ; m3-m2-m1-m0

    movdqa          m9, m4
    paddw           m4, m5                  ; m4+m5
    psubw           m5, m9                  ; m5-m4
    movdqa          m9, m6
    paddw           m6, m7                  ; m6+m7
    psubw           m7, m9                  ; m7-m6
    movdqa          m9, m4
    paddw           m4, m6                  ; m4+m5+m6+m7
    psubw           m6, m9                  ; m6+m7-m4+m5
    movdqa          m9, m5
    paddw           m5, m7                  ; m5-m4+m7-m6
    psubw           m7, m9                  ; m7-m6-m5-m4

    movdqa          m9, m0
    paddw           m0, m4                  ; (m0+m1+m2+m3)+(m4+m5+m6+m7)
    psubw           m4, m9                  ; (m4+m5+m6+m7)-(m0+m1+m2+m3)
    movdqa          m9, m1
    paddw           m1, m5                  ; (m1-m0+m3-m2)+(m5-m4+m7-m6)
    psubw           m5, m9                  ; (m5-m4+m7-m6)-(m1-m0+m3-m2)

    ; horizontal stage: shuffle even/odd columns then butterfly
    mova            m9, m0
    vshufps         m9, m9, m4, 11011101b
    vshufps         m0, m0, m4, 10001000b

    movdqa          m4, m0
    paddw           m0, m9                  ; (a0 + a4) + (a4 - a0)
    psubw           m9, m4                  ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4)

    movaps          m4, m1
    vshufps         m4, m4, m5, 11011101b
    vshufps         m1, m1, m5, 10001000b

    movdqa          m5, m1
    paddw           m1, m4
    psubw           m4, m5
    movdqa          m5, m2
    paddw           m2, m6
    psubw           m6, m5
    movdqa          m5, m3
    paddw           m3, m7
    psubw           m7, m5

    movaps          m5, m2
    vshufps         m5, m5, m6, 11011101b
    vshufps         m2, m2, m6, 10001000b

    movdqa          m6, m2
    paddw           m2, m5
    psubw           m5, m6
    movaps          m6, m3

    vshufps         m6, m6, m7, 11011101b
    vshufps         m3, m3, m7, 10001000b

    movdqa          m7, m3
    paddw           m3, m6
    psubw           m6, m7
    movdqa          m7, m0

    ; interleave +/- halves, take abs, keep per-pair maximum (SA8D-style)
    pblendw         m0, m9, 10101010b
    pslld           m9, 10h
    psrld           m7, 10h
    por             m9, m7
    pabsw           m0, m0
    pabsw           m9, m9
    pmaxsw          m0, m9
    movdqa          m7, m1
    pblendw         m1, m4, 10101010b
    pslld           m4, 10h
    psrld           m7, 10h
    por             m4, m7
    pabsw           m1, m1
    pabsw           m4, m4
    pmaxsw          m1, m4
    movdqa          m7, m2
    pblendw         m2, m5, 10101010b
    pslld           m5, 10h
    psrld           m7, 10h
    por             m5, m7
    pabsw           m2, m2
    pabsw           m5, m5
    pmaxsw          m2, m5
    mova            m7, m3

    pblendw         m3, m6, 10101010b
    pslld           m6, 10h
    psrld           m7, 10h
    por             m6, m7
    pabsw           m3, m3
    pabsw           m6, m6
    pmaxsw          m3, m6
    paddw           m0, m1
    paddw           m0, m2
    paddw           m0, m3
    pmaddwd         m0, [pw_1]
    psrldq          m1, m0, 8
    paddd           m0, m1

    pshuflw         m1, m0, 00001110b
    paddd           m0, m1
    paddd           m0, [pd_1]
    psrld           m0, 1                   ; per-lane (satd + 1) >> 1

    psubd           m0, m11                 ; per-lane energy = satd - DC term

    vextracti128    xm1, m0, 1
    psubd           m0, m1
    pabsd           m0, m0                  ; |energy(src) - energy(rec)|
%endmacro
8594 | |
;-----------------------------------------------------------------------------
; PSY_PP_8x8_AVX2: HBD psy-cost kernel for one 8x8 block pair (AVX2).
; In:  r0/r1 = source/stride (bytes), r2/r3 = recon/stride (bytes),
;      16-bit pixels.
; Out: xm1 = |energy(source) - energy(recon)| for the 8x8 block.
; Source rows go in the low 128-bit lane, recon rows in the high lane; the
; inlined 8x8 Hadamard transform processes both lanes at once.
; Clobbers m0-m10, r4, r5.
;-----------------------------------------------------------------------------
%macro PSY_PP_8x8_AVX2 0
    lea             r4, [r1 * 3]
    movu            xm0, [r0]
    movu            xm1, [r0 + r1]
    movu            xm2, [r0 + r1 * 2]
    movu            xm3, [r0 + r4]
    lea             r5, [r0 + r1 * 4]
    movu            xm4, [r5]
    movu            xm5, [r5 + r1]
    movu            xm6, [r5 + r1 * 2]
    movu            xm7, [r5 + r4]

    lea             r4, [r3 * 3]
    vinserti128     m0, m0, [r2], 1         ; lane0 = source, lane1 = recon
    vinserti128     m1, m1, [r2 + r3], 1
    vinserti128     m2, m2, [r2 + r3 * 2], 1
    vinserti128     m3, m3, [r2 + r4], 1
    lea             r5, [r2 + r3 * 4]
    vinserti128     m4, m4, [r5], 1
    vinserti128     m5, m5, [r5 + r3], 1
    vinserti128     m6, m6, [r5 + r3 * 2], 1
    vinserti128     m7, m7, [r5 + r4], 1

    ; DC estimate per lane: (sum of all pixels) >> 2
    paddw           m8, m0, m1
    paddw           m8, m2
    paddw           m8, m3
    paddw           m8, m4
    paddw           m8, m5
    paddw           m8, m6
    paddw           m8, m7
    pmaddwd         m8, [pw_1]

    psrldq          m9, m8, 8
    paddd           m8, m9
    psrldq          m9, m8, 4
    paddd           m8, m9
    psrld           m8, 2

    ; inlined 8x8 Hadamard: butterflies interleaved with word/dword/qword
    ; transposition via punpck*
    psubw           m9, m1, m0
    paddw           m0, m1
    psubw           m1, m3, m2
    paddw           m2, m3
    punpckhwd       m3, m0, m9
    punpcklwd       m0, m9
    psubw           m9, m3, m0
    paddw           m0, m3
    punpckhwd       m3, m2, m1
    punpcklwd       m2, m1
    psubw           m10, m3, m2
    paddw           m2, m3
    psubw           m3, m5, m4
    paddw           m4, m5
    psubw           m5, m7, m6
    paddw           m6, m7
    punpckhwd       m1, m4, m3
    punpcklwd       m4, m3
    psubw           m7, m1, m4
    paddw           m4, m1
    punpckhwd       m3, m6, m5
    punpcklwd       m6, m5
    psubw           m1, m3, m6
    paddw           m6, m3
    psubw           m3, m2, m0
    paddw           m0, m2
    psubw           m2, m10, m9
    paddw           m9, m10
    punpckhdq       m5, m0, m3
    punpckldq       m0, m3
    psubw           m10, m5, m0
    paddw           m0, m5
    punpckhdq       m3, m9, m2
    punpckldq       m9, m2
    psubw           m5, m3, m9
    paddw           m9, m3
    psubw           m3, m6, m4
    paddw           m4, m6
    psubw           m6, m1, m7
    paddw           m7, m1
    punpckhdq       m2, m4, m3
    punpckldq       m4, m3
    psubw           m1, m2, m4
    paddw           m4, m2
    punpckhdq       m3, m7, m6
    punpckldq       m7, m6
    psubw           m2, m3, m7
    paddw           m7, m3
    psubw           m3, m4, m0
    paddw           m0, m4
    psubw           m4, m1, m10
    paddw           m10, m1
    ; final stage: abs of +/- halves, keep per-pair maximum
    punpckhqdq      m6, m0, m3
    punpcklqdq      m0, m3
    pabsw           m0, m0
    pabsw           m6, m6
    pmaxsw          m0, m6
    punpckhqdq      m3, m10, m4
    punpcklqdq      m10, m4
    pabsw           m10, m10
    pabsw           m3, m3
    pmaxsw          m10, m3
    psubw           m3, m7, m9
    paddw           m9, m7
    psubw           m7, m2, m5
    paddw           m5, m2
    punpckhqdq      m4, m9, m3
    punpcklqdq      m9, m3
    pabsw           m9, m9
    pabsw           m4, m4
    pmaxsw          m9, m4
    punpckhqdq      m3, m5, m7
    punpcklqdq      m5, m7
    pabsw           m5, m5
    pabsw           m3, m3
    pmaxsw          m5, m3
    ; horizontal reduction of the word sums to one dword per lane
    paddd           m0, m9
    paddd           m0, m10
    paddd           m0, m5
    psrld           m9, m0, 16
    pslld           m0, 16
    psrld           m0, 16
    paddd           m0, m9
    psrldq          m9, m0, 8
    paddd           m0, m9
    psrldq          m9, m0, 4
    paddd           m0, m9
    paddd           m0, [pd_1]
    psrld           m0, 1                   ; per-lane (satd + 1) >> 1
    psubd           m0, m8                  ; per-lane energy = satd - DC term

    vextracti128    xm1, m0, 1
    psubd           xm1, xm0
    pabsd           xm1, xm1                ; |energy(src) - energy(rec)|
%endmacro
8728 | |
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_8x8 (AVX2, HBD): single invocation of PSY_PP_8x8_AVX2.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_8x8, 4, 8, 11
    add             r1d, r1d                ; strides in bytes (2 bytes/pixel)
    add             r3d, r3d
    PSY_PP_8x8_AVX2
    movd            eax, xm1
    RET
%else ; !HIGH_BIT_DEPTH
INIT_YMM avx2
;-----------------------------------------------------------------------------
; int psyCost_pp_8x8 (AVX2, 8bpp): single invocation of PSY_PP_8x8.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_8x8, 4, 8, 13
    lea             r4, [3 * r1]
    lea             r7, [3 * r3]
    mova            m8, [hmul_8p]

    PSY_PP_8x8

    movd            eax, xm0
    RET
%endif
%endif
%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_16x16 (AVX2, HBD): 2x2 grid of PSY_PP_8x8_AVX2 calls,
; accumulating each 8x8 result in xm11.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_16x16, 4, 10, 12
    add             r1d, r1d                ; strides in bytes (2 bytes/pixel)
    add             r3d, r3d
    pxor            m11, m11                ; cost accumulator

    mov             r8d, 2
.loopH:
    mov             r9d, 2
.loopW:
    PSY_PP_8x8_AVX2

    paddd           xm11, xm1
    add             r0, 16                  ; next 8x8 column
    add             r2, 16
    dec             r9d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 32]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 32]
    dec             r8d
    jnz             .loopH
    movd            eax, xm11
    RET
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_16x16 (AVX2, 8bpp): 2x2 grid of PSY_PP_8x8 calls,
; accumulating each 8x8 result in m13.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_16x16, 4, 10, 14
    lea             r4, [3 * r1]
    lea             r7, [3 * r3]
    mova            m8, [hmul_8p]
    pxor            m13, m13                ; cost accumulator

    mov             r8d, 2
.loopH:
    mov             r9d, 2
.loopW:
    PSY_PP_8x8

    paddd           m13, m0
    add             r0, 8                   ; next 8x8 column
    add             r2, 8
    dec             r9d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 16]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 16]
    dec             r8d
    jnz             .loopH
    movd            eax, xm13
    RET
%endif
%endif
%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_32x32 (AVX2, HBD): 4x4 grid of PSY_PP_8x8_AVX2 calls,
; accumulating each 8x8 result in xm11.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_32x32, 4, 10, 12
    add             r1d, r1d                ; strides in bytes (2 bytes/pixel)
    add             r3d, r3d
    pxor            m11, m11                ; cost accumulator

    mov             r8d, 4
.loopH:
    mov             r9d, 4
.loopW:
    PSY_PP_8x8_AVX2

    paddd           xm11, xm1
    add             r0, 16                  ; next 8x8 column
    add             r2, 16
    dec             r9d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 64]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 64]
    dec             r8d
    jnz             .loopH
    movd            eax, xm11
    RET
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_32x32 (AVX2, 8bpp): 4x4 grid of PSY_PP_8x8 calls,
; accumulating each 8x8 result in m13.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_32x32, 4, 10, 14
    lea             r4, [3 * r1]
    lea             r7, [3 * r3]
    mova            m8, [hmul_8p]
    pxor            m13, m13                ; cost accumulator

    mov             r8d, 4
.loopH:
    mov             r9d, 4
.loopW:
    PSY_PP_8x8

    paddd           m13, m0
    add             r0, 8                   ; next 8x8 column
    add             r2, 8
    dec             r9d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 32]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 32]
    dec             r8d
    jnz             .loopH
    movd            eax, xm13
    RET
%endif
%endif
%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_64x64 (AVX2, HBD): 8x8 grid of PSY_PP_8x8_AVX2 calls,
; accumulating each 8x8 result in xm11.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_64x64, 4, 10, 12
    add             r1d, r1d                ; strides in bytes (2 bytes/pixel)
    add             r3d, r3d
    pxor            m11, m11                ; cost accumulator

    mov             r8d, 8
.loopH:
    mov             r9d, 8
.loopW:
    PSY_PP_8x8_AVX2

    paddd           xm11, xm1
    add             r0, 16                  ; next 8x8 column
    add             r2, 16
    dec             r9d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 128] ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 128]
    dec             r8d
    jnz             .loopH
    movd            eax, xm11
    RET
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int psyCost_pp_64x64 (AVX2, 8bpp): 8x8 grid of PSY_PP_8x8 calls,
; accumulating each 8x8 result in m13.
;-----------------------------------------------------------------------------
cglobal psyCost_pp_64x64, 4, 10, 14
    lea             r4, [3 * r1]
    lea             r7, [3 * r3]
    mova            m8, [hmul_8p]
    pxor            m13, m13                ; cost accumulator

    mov             r8d, 8
.loopH:
    mov             r9d, 8
.loopW:
    PSY_PP_8x8

    paddd           m13, m0
    add             r0, 8                   ; next 8x8 column
    add             r2, 8
    dec             r9d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 64]  ; next block row, rewind columns
    lea             r2, [r2 + r3 * 8 - 64]
    dec             r8d
    jnz             .loopH
    movd            eax, xm13
    RET
%endif
%endif
8903 | |
;---------------------------------------------------------------------------------------------------------------------
; int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; psyCost_ss_4x4: psy cost on 16-bit residual (ss) blocks. For each 4x4 block
; it computes SAD>>2 (DC estimate, m6) and a 4x4 Hadamard sum >> 1 (energy),
; then returns |energy(source) - energy(recon)|.
;-----------------------------------------------------------------------------
cglobal psyCost_ss_4x4, 4, 5, 8

    add             r1, r1                  ; source stride in bytes
    lea             r4, [3 * r1]
    movddup         m0, [r0]
    movddup         m1, [r0 + r1]
    movddup         m2, [r0 + r1 * 2]
    movddup         m3, [r0 + r4]

    ; m6 = (sum |coeff|) >> 2
    pabsw           m4, m0
    pabsw           m5, m1
    paddw           m5, m4
    pabsw           m4, m2
    paddw           m5, m4
    pabsw           m4, m3
    paddw           m5, m4
    pmaddwd         m5, [pw_1]
    psrldq          m4, m5, 4
    paddd           m5, m4
    psrld           m6, m5, 2

    ; horizontal Hadamard via +/- multipliers, then in-register butterflies
    mova            m4, [hmul_8w]
    pmaddwd         m0, m4
    pmaddwd         m1, m4
    pmaddwd         m2, m4
    pmaddwd         m3, m4

    psrldq          m4, m0, 4
    psubd           m5, m0, m4
    paddd           m0, m4
    shufps          m0, m5, 10001000b

    psrldq          m4, m1, 4
    psubd           m5, m1, m4
    paddd           m1, m4
    shufps          m1, m5, 10001000b

    psrldq          m4, m2, 4
    psubd           m5, m2, m4
    paddd           m2, m4
    shufps          m2, m5, 10001000b

    psrldq          m4, m3, 4
    psubd           m5, m3, m4
    paddd           m3, m4
    shufps          m3, m5, 10001000b

    ; vertical butterflies
    mova            m4, m0
    paddd           m0, m1
    psubd           m1, m4
    mova            m4, m2
    paddd           m2, m3
    psubd           m3, m4
    mova            m4, m0
    paddd           m0, m2
    psubd           m2, m4
    mova            m4, m1
    paddd           m1, m3
    psubd           m3, m4

    pabsd           m0, m0
    pabsd           m2, m2
    pabsd           m1, m1
    pabsd           m3, m3
    paddd           m0, m2
    paddd           m1, m3
    paddd           m0, m1
    movhlps         m1, m0
    paddd           m0, m1
    psrldq          m1, m0, 4
    paddd           m0, m1
    psrld           m0, 1
    psubd           m7, m0, m6              ; m7 = source energy

    ; repeat the same computation for the recon block
    add             r3, r3                  ; recon stride in bytes
    lea             r4, [3 * r3]
    movddup         m0, [r2]
    movddup         m1, [r2 + r3]
    movddup         m2, [r2 + r3 * 2]
    movddup         m3, [r2 + r4]

    pabsw           m4, m0
    pabsw           m5, m1
    paddw           m5, m4
    pabsw           m4, m2
    paddw           m5, m4
    pabsw           m4, m3
    paddw           m5, m4
    pmaddwd         m5, [pw_1]
    psrldq          m4, m5, 4
    paddd           m5, m4
    psrld           m6, m5, 2

    mova            m4, [hmul_8w]
    pmaddwd         m0, m4
    pmaddwd         m1, m4
    pmaddwd         m2, m4
    pmaddwd         m3, m4

    psrldq          m4, m0, 4
    psubd           m5, m0, m4
    paddd           m0, m4
    shufps          m0, m5, 10001000b

    psrldq          m4, m1, 4
    psubd           m5, m1, m4
    paddd           m1, m4
    shufps          m1, m5, 10001000b

    psrldq          m4, m2, 4
    psubd           m5, m2, m4
    paddd           m2, m4
    shufps          m2, m5, 10001000b

    psrldq          m4, m3, 4
    psubd           m5, m3, m4
    paddd           m3, m4
    shufps          m3, m5, 10001000b

    mova            m4, m0
    paddd           m0, m1
    psubd           m1, m4
    mova            m4, m2
    paddd           m2, m3
    psubd           m3, m4
    mova            m4, m0
    paddd           m0, m2
    psubd           m2, m4
    mova            m4, m1
    paddd           m1, m3
    psubd           m3, m4

    pabsd           m0, m0
    pabsd           m2, m2
    pabsd           m1, m1
    pabsd           m3, m3
    paddd           m0, m2
    paddd           m1, m3
    paddd           m0, m1
    movhlps         m1, m0
    paddd           m0, m1
    psrldq          m1, m0, 4
    paddd           m0, m1
    psrld           m0, 1
    psubd           m0, m6                  ; m0 = recon energy
    psubd           m7, m0
    pabsd           m0, m7                  ; |src energy - rec energy|
    movd            eax, m0
    RET
9057 | |
;-----------------------------------------------------------------------------
; int psyCost_ss_8x8(const int16_t* pix1, intptr_t stride1,
;                    const int16_t* pix2, intptr_t stride2)
; "ss" = both inputs are 16-bit samples: r1/r3 are doubled on entry, which
; implies the incoming strides are in elements of 2 bytes each.
; Computes a psy-cost energy for each 8x8 block (transform energy minus a
; scaled SAD term) and returns |cost(pix1) - cost(pix2)| in eax.
; m13 = [pw_pmpmpmpm] (+1/-1 word pattern for the difference half of the
; horizontal transform), m14 = [pw_1] (pair-sum half).
;-----------------------------------------------------------------------------
9058 %if ARCH_X86_64 | |
9059 INIT_XMM sse4 | |
9060 cglobal psyCost_ss_8x8, 4, 6, 15 | |
9061 | |
9062 mova m13, [pw_pmpmpmpm] | |
9063 mova m14, [pw_1] | |
; convert element strides to byte strides (16-bit samples)
9064 add r1, r1 | |
9065 add r3, r3 | |
; load the 8 rows of block 1 (r4 = 3*stride, r5 = row 4)
9066 lea r4, [3 * r1] | |
9067 movu m0, [r0] | |
9068 movu m1, [r0 + r1] | |
9069 movu m2, [r0 + r1 * 2] | |
9070 movu m3, [r0 + r4] | |
9071 lea r5, [r0 + r1 * 4] | |
9072 movu m4, [r5] | |
9073 movu m5, [r5 + r1] | |
9074 movu m6, [r5 + r1 * 2] | |
9075 movu m7, [r5 + r4] | |
9076 | |
; sum |sample| over all 64 words, widen to dwords, reduce horizontally:
; m8 (low dword) = SAD(block 1) >> 2
9077 pabsw m8, m0 | |
9078 pabsw m9, m1 | |
9079 paddw m8, m9 | |
9080 pabsw m10, m2 | |
9081 pabsw m11, m3 | |
9082 paddw m10, m11 | |
9083 paddw m8, m10 | |
9084 pabsw m9, m4 | |
9085 pabsw m10, m5 | |
9086 paddw m9, m10 | |
9087 pabsw m11, m6 | |
9088 pabsw m12, m7 | |
9089 paddw m11, m12 | |
9090 paddw m9, m11 | |
9091 paddw m8, m9 | |
9092 movhlps m9, m8 | |
9093 pmovzxwd m8, m8 | |
9094 pmovzxwd m9, m9 | |
9095 paddd m8, m9 | |
9096 movhlps m9, m8 | |
9097 paddd m8, m9 | |
9098 psrldq m9, m8, 4 | |
9099 paddd m8, m9 | |
9100 psrld m8, 2 | |
9101 | |
; horizontal stage, difference half: pmaddwd with the +1/-1 pattern gives
; per-pair differences; each shift/add/sub + shufps step is one butterfly
; level (sums packed low, differences packed high)
9102 pmaddwd m0, m13 | |
9103 pmaddwd m1, m13 | |
9104 pmaddwd m2, m13 | |
9105 pmaddwd m3, m13 | |
9106 | |
9107 psrldq m9, m0, 4 | |
9108 psubd m10, m0, m9 | |
9109 paddd m0, m9 | |
9110 shufps m0, m10, 10001000b | |
9111 psrldq m9, m0, 4 | |
9112 psubd m10, m0, m9 | |
9113 paddd m0, m9 | |
9114 shufps m0, m10, 10001000b | |
9115 | |
9116 psrldq m9, m1, 4 | |
9117 psubd m10, m1, m9 | |
9118 paddd m1, m9 | |
9119 shufps m1, m10, 10001000b | |
9120 psrldq m9, m1, 4 | |
9121 psubd m10, m1, m9 | |
9122 paddd m1, m9 | |
9123 shufps m1, m10, 10001000b | |
9124 | |
9125 psrldq m9, m2, 4 | |
9126 psubd m10, m2, m9 | |
9127 paddd m2, m9 | |
9128 shufps m2, m10, 10001000b | |
9129 psrldq m9, m2, 4 | |
9130 psubd m10, m2, m9 | |
9131 paddd m2, m9 | |
9132 shufps m2, m10, 10001000b | |
9133 | |
9134 psrldq m9, m3, 4 | |
9135 psubd m10, m3, m9 | |
9136 paddd m3, m9 | |
9137 shufps m3, m10, 10001000b | |
9138 psrldq m9, m3, 4 | |
9139 psubd m10, m3, m9 | |
9140 paddd m3, m9 | |
9141 shufps m3, m10, 10001000b | |
9142 | |
; vertical butterflies across rows 0-3
9143 SUMSUB_BA d, 0, 1, 9 | |
9144 SUMSUB_BA d, 2, 3, 9 | |
9145 SUMSUB_BA d, 0, 2, 9 | |
9146 SUMSUB_BA d, 1, 3, 9 | |
9147 | |
; same horizontal stage for rows 4-7
9148 pmaddwd m4, m13 | |
9149 pmaddwd m5, m13 | |
9150 pmaddwd m6, m13 | |
9151 pmaddwd m7, m13 | |
9152 | |
9153 psrldq m9, m4, 4 | |
9154 psubd m10, m4, m9 | |
9155 paddd m4, m9 | |
9156 shufps m4, m10, 10001000b | |
9157 psrldq m9, m4, 4 | |
9158 psubd m10, m4, m9 | |
9159 paddd m4, m9 | |
9160 shufps m4, m10, 10001000b | |
9161 | |
9162 psrldq m9, m5, 4 | |
9163 psubd m10, m5, m9 | |
9164 paddd m5, m9 | |
9165 shufps m5, m10, 10001000b | |
9166 psrldq m9, m5, 4 | |
9167 psubd m10, m5, m9 | |
9168 paddd m5, m9 | |
9169 shufps m5, m10, 10001000b | |
9170 | |
9171 psrldq m9, m6, 4 | |
9172 psubd m10, m6, m9 | |
9173 paddd m6, m9 | |
9174 shufps m6, m10, 10001000b | |
9175 psrldq m9, m6, 4 | |
9176 psubd m10, m6, m9 | |
9177 paddd m6, m9 | |
9178 shufps m6, m10, 10001000b | |
9179 | |
9180 psrldq m9, m7, 4 | |
9181 psubd m10, m7, m9 | |
9182 paddd m7, m9 | |
9183 shufps m7, m10, 10001000b | |
9184 psrldq m9, m7, 4 | |
9185 psubd m10, m7, m9 | |
9186 paddd m7, m9 | |
9187 shufps m7, m10, 10001000b | |
9188 | |
; vertical butterflies across rows 4-7, then cross rows 0-3/4-7
9189 SUMSUB_BA d, 4, 5, 9 | |
9190 SUMSUB_BA d, 6, 7, 9 | |
9191 SUMSUB_BA d, 4, 6, 9 | |
9192 SUMSUB_BA d, 5, 7, 9 | |
9193 | |
9194 SUMSUB_BA d, 0, 4, 9 | |
9195 SUMSUB_BA d, 1, 5, 9 | |
9196 SUMSUB_BA d, 2, 6, 9 | |
9197 SUMSUB_BA d, 3, 7, 9 | |
9198 | |
; accumulate |coeff| of the difference half into m11
9199 pabsd m0, m0 | |
9200 pabsd m2, m2 | |
9201 pabsd m1, m1 | |
9202 pabsd m3, m3 | |
9203 pabsd m4, m4 | |
9204 pabsd m5, m5 | |
9205 pabsd m6, m6 | |
9206 pabsd m7, m7 | |
9207 | |
9208 paddd m0, m2 | |
9209 paddd m1, m3 | |
9210 paddd m0, m1 | |
9211 paddd m5, m4 | |
9212 paddd m0, m5 | |
9213 paddd m7, m6 | |
9214 paddd m11, m0, m7 | |
9215 | |
; second pass over the same 8 rows, sum half (pmaddwd with pw_1 = pair sums)
9216 movu m0, [r0] | |
9217 movu m1, [r0 + r1] | |
9218 movu m2, [r0 + r1 * 2] | |
9219 movu m3, [r0 + r4] | |
9220 | |
9221 pmaddwd m0, m14 | |
9222 pmaddwd m1, m14 | |
9223 pmaddwd m2, m14 | |
9224 pmaddwd m3, m14 | |
9225 | |
9226 psrldq m9, m0, 4 | |
9227 psubd m10, m0, m9 | |
9228 paddd m0, m9 | |
9229 shufps m0, m10, 10001000b | |
9230 psrldq m9, m0, 4 | |
9231 psubd m10, m0, m9 | |
9232 paddd m0, m9 | |
9233 shufps m0, m10, 10001000b | |
9234 | |
9235 psrldq m9, m1, 4 | |
9236 psubd m10, m1, m9 | |
9237 paddd m1, m9 | |
9238 shufps m1, m10, 10001000b | |
9239 psrldq m9, m1, 4 | |
9240 psubd m10, m1, m9 | |
9241 paddd m1, m9 | |
9242 shufps m1, m10, 10001000b | |
9243 | |
9244 psrldq m9, m2, 4 | |
9245 psubd m10, m2, m9 | |
9246 paddd m2, m9 | |
9247 shufps m2, m10, 10001000b | |
9248 psrldq m9, m2, 4 | |
9249 psubd m10, m2, m9 | |
9250 paddd m2, m9 | |
9251 shufps m2, m10, 10001000b | |
9252 | |
9253 psrldq m9, m3, 4 | |
9254 psubd m10, m3, m9 | |
9255 paddd m3, m9 | |
9256 shufps m3, m10, 10001000b | |
9257 psrldq m9, m3, 4 | |
9258 psubd m10, m3, m9 | |
9259 paddd m3, m9 | |
9260 shufps m3, m10, 10001000b | |
9261 | |
9262 SUMSUB_BA d, 0, 1, 9 | |
9263 SUMSUB_BA d, 2, 3, 9 | |
9264 SUMSUB_BA d, 0, 2, 9 | |
9265 SUMSUB_BA d, 1, 3, 9 | |
9266 | |
9267 movu m4, [r5] | |
9268 movu m5, [r5 + r1] | |
9269 movu m6, [r5 + r1 * 2] | |
9270 movu m7, [r5 + r4] | |
9271 | |
9272 pmaddwd m4, m14 | |
9273 pmaddwd m5, m14 | |
9274 pmaddwd m6, m14 | |
9275 pmaddwd m7, m14 | |
9276 | |
9277 psrldq m9, m4, 4 | |
9278 psubd m10, m4, m9 | |
9279 paddd m4, m9 | |
9280 shufps m4, m10, 10001000b | |
9281 psrldq m9, m4, 4 | |
9282 psubd m10, m4, m9 | |
9283 paddd m4, m9 | |
9284 shufps m4, m10, 10001000b | |
9285 | |
9286 psrldq m9, m5, 4 | |
9287 psubd m10, m5, m9 | |
9288 paddd m5, m9 | |
9289 shufps m5, m10, 10001000b | |
9290 psrldq m9, m5, 4 | |
9291 psubd m10, m5, m9 | |
9292 paddd m5, m9 | |
9293 shufps m5, m10, 10001000b | |
9294 | |
9295 psrldq m9, m6, 4 | |
9296 psubd m10, m6, m9 | |
9297 paddd m6, m9 | |
9298 shufps m6, m10, 10001000b | |
9299 psrldq m9, m6, 4 | |
9300 psubd m10, m6, m9 | |
9301 paddd m6, m9 | |
9302 shufps m6, m10, 10001000b | |
9303 | |
9304 psrldq m9, m7, 4 | |
9305 psubd m10, m7, m9 | |
9306 paddd m7, m9 | |
9307 shufps m7, m10, 10001000b | |
9308 psrldq m9, m7, 4 | |
9309 psubd m10, m7, m9 | |
9310 paddd m7, m9 | |
9311 shufps m7, m10, 10001000b | |
9312 | |
9313 SUMSUB_BA d, 4, 5, 9 | |
9314 SUMSUB_BA d, 6, 7, 9 | |
9315 SUMSUB_BA d, 4, 6, 9 | |
9316 SUMSUB_BA d, 5, 7, 9 | |
9317 | |
9318 SUMSUB_BA d, 0, 4, 9 | |
9319 SUMSUB_BA d, 1, 5, 9 | |
9320 SUMSUB_BA d, 2, 6, 9 | |
9321 SUMSUB_BA d, 3, 7, 9 | |
9322 | |
9323 pabsd m0, m0 | |
9324 pabsd m2, m2 | |
9325 pabsd m1, m1 | |
9326 pabsd m3, m3 | |
9327 pabsd m4, m4 | |
9328 pabsd m5, m5 | |
9329 pabsd m6, m6 | |
9330 pabsd m7, m7 | |
9331 | |
; combine both halves (m11 = difference half from the first pass)
9332 paddd m0, m2 | |
9333 paddd m1, m3 | |
9334 paddd m0, m1 | |
9335 paddd m5, m4 | |
9336 paddd m0, m5 | |
9337 paddd m7, m6 | |
9338 paddd m0, m7 | |
9339 paddd m0, m11 | |
9340 | |
; horizontal reduce, round (+2) and >>2, subtract SAD term:
; m12 = psy energy of block 1
9341 movhlps m1, m0 | |
9342 paddd m0, m1 | |
9343 psrldq m1, m0, 4 | |
9344 paddd m0, m1 | |
9345 paddd m0, [pd_2] | |
9346 psrld m0, 2 | |
9347 psubd m12, m0, m8 | |
9348 | |
; identical computation for block 2 (r2 / r3); r4/r5 are re-derived
9349 lea r4, [3 * r3] | |
9350 movu m0, [r2] | |
9351 movu m1, [r2 + r3] | |
9352 movu m2, [r2 + r3 * 2] | |
9353 movu m3, [r2 + r4] | |
9354 lea r5, [r2 + r3 * 4] | |
9355 movu m4, [r5] | |
9356 movu m5, [r5 + r3] | |
9357 movu m6, [r5 + r3 * 2] | |
9358 movu m7, [r5 + r4] | |
9359 | |
; m8 = SAD(block 2) >> 2
9360 pabsw m8, m0 | |
9361 pabsw m9, m1 | |
9362 paddw m8, m9 | |
9363 pabsw m10, m2 | |
9364 pabsw m11, m3 | |
9365 paddw m10, m11 | |
9366 paddw m8, m10 | |
9367 pabsw m9, m4 | |
9368 pabsw m10, m5 | |
9369 paddw m9, m10 | |
9370 pabsw m11, m6 | |
9371 pabsw m10, m7 | |
9372 paddw m11, m10 | |
9373 paddw m9, m11 | |
9374 paddw m8, m9 | |
9375 movhlps m9, m8 | |
9376 pmovzxwd m8, m8 | |
9377 pmovzxwd m9, m9 | |
9378 paddd m8, m9 | |
9379 movhlps m9, m8 | |
9380 paddd m8, m9 | |
9381 psrldq m9, m8, 4 | |
9382 paddd m8, m9 | |
9383 psrld m8, 2 | |
9384 | |
9385 pmaddwd m0, m13 | |
9386 pmaddwd m1, m13 | |
9387 pmaddwd m2, m13 | |
9388 pmaddwd m3, m13 | |
9389 | |
9390 psrldq m9, m0, 4 | |
9391 psubd m10, m0, m9 | |
9392 paddd m0, m9 | |
9393 shufps m0, m10, 10001000b | |
9394 psrldq m9, m0, 4 | |
9395 psubd m10, m0, m9 | |
9396 paddd m0, m9 | |
9397 shufps m0, m10, 10001000b | |
9398 | |
9399 psrldq m9, m1, 4 | |
9400 psubd m10, m1, m9 | |
9401 paddd m1, m9 | |
9402 shufps m1, m10, 10001000b | |
9403 psrldq m9, m1, 4 | |
9404 psubd m10, m1, m9 | |
9405 paddd m1, m9 | |
9406 shufps m1, m10, 10001000b | |
9407 | |
9408 psrldq m9, m2, 4 | |
9409 psubd m10, m2, m9 | |
9410 paddd m2, m9 | |
9411 shufps m2, m10, 10001000b | |
9412 psrldq m9, m2, 4 | |
9413 psubd m10, m2, m9 | |
9414 paddd m2, m9 | |
9415 shufps m2, m10, 10001000b | |
9416 | |
9417 psrldq m9, m3, 4 | |
9418 psubd m10, m3, m9 | |
9419 paddd m3, m9 | |
9420 shufps m3, m10, 10001000b | |
9421 psrldq m9, m3, 4 | |
9422 psubd m10, m3, m9 | |
9423 paddd m3, m9 | |
9424 shufps m3, m10, 10001000b | |
9425 | |
9426 SUMSUB_BA d, 0, 1, 9 | |
9427 SUMSUB_BA d, 2, 3, 9 | |
9428 SUMSUB_BA d, 0, 2, 9 | |
9429 SUMSUB_BA d, 1, 3, 9 | |
9430 | |
9431 pmaddwd m4, m13 | |
9432 pmaddwd m5, m13 | |
9433 pmaddwd m6, m13 | |
9434 pmaddwd m7, m13 | |
9435 | |
9436 psrldq m9, m4, 4 | |
9437 psubd m10, m4, m9 | |
9438 paddd m4, m9 | |
9439 shufps m4, m10, 10001000b | |
9440 psrldq m9, m4, 4 | |
9441 psubd m10, m4, m9 | |
9442 paddd m4, m9 | |
9443 shufps m4, m10, 10001000b | |
9444 | |
9445 psrldq m9, m5, 4 | |
9446 psubd m10, m5, m9 | |
9447 paddd m5, m9 | |
9448 shufps m5, m10, 10001000b | |
9449 psrldq m9, m5, 4 | |
9450 psubd m10, m5, m9 | |
9451 paddd m5, m9 | |
9452 shufps m5, m10, 10001000b | |
9453 | |
9454 psrldq m9, m6, 4 | |
9455 psubd m10, m6, m9 | |
9456 paddd m6, m9 | |
9457 shufps m6, m10, 10001000b | |
9458 psrldq m9, m6, 4 | |
9459 psubd m10, m6, m9 | |
9460 paddd m6, m9 | |
9461 shufps m6, m10, 10001000b | |
9462 | |
9463 psrldq m9, m7, 4 | |
9464 psubd m10, m7, m9 | |
9465 paddd m7, m9 | |
9466 shufps m7, m10, 10001000b | |
9467 psrldq m9, m7, 4 | |
9468 psubd m10, m7, m9 | |
9469 paddd m7, m9 | |
9470 shufps m7, m10, 10001000b | |
9471 | |
9472 SUMSUB_BA d, 4, 5, 9 | |
9473 SUMSUB_BA d, 6, 7, 9 | |
9474 SUMSUB_BA d, 4, 6, 9 | |
9475 SUMSUB_BA d, 5, 7, 9 | |
9476 | |
9477 SUMSUB_BA d, 0, 4, 9 | |
9478 SUMSUB_BA d, 1, 5, 9 | |
9479 SUMSUB_BA d, 2, 6, 9 | |
9480 SUMSUB_BA d, 3, 7, 9 | |
9481 | |
9482 pabsd m0, m0 | |
9483 pabsd m2, m2 | |
9484 pabsd m1, m1 | |
9485 pabsd m3, m3 | |
9486 pabsd m4, m4 | |
9487 pabsd m5, m5 | |
9488 pabsd m6, m6 | |
9489 pabsd m7, m7 | |
9490 | |
9491 paddd m0, m2 | |
9492 paddd m1, m3 | |
9493 paddd m0, m1 | |
9494 paddd m5, m4 | |
9495 paddd m0, m5 | |
9496 paddd m7, m6 | |
9497 paddd m11, m0, m7 | |
9498 | |
9499 movu m0, [r2] | |
9500 movu m1, [r2 + r3] | |
9501 movu m2, [r2 + r3 * 2] | |
9502 movu m3, [r2 + r4] | |
9503 | |
9504 pmaddwd m0, m14 | |
9505 pmaddwd m1, m14 | |
9506 pmaddwd m2, m14 | |
9507 pmaddwd m3, m14 | |
9508 | |
9509 psrldq m9, m0, 4 | |
9510 psubd m10, m0, m9 | |
9511 paddd m0, m9 | |
9512 shufps m0, m10, 10001000b | |
9513 psrldq m9, m0, 4 | |
9514 psubd m10, m0, m9 | |
9515 paddd m0, m9 | |
9516 shufps m0, m10, 10001000b | |
9517 | |
9518 psrldq m9, m1, 4 | |
9519 psubd m10, m1, m9 | |
9520 paddd m1, m9 | |
9521 shufps m1, m10, 10001000b | |
9522 psrldq m9, m1, 4 | |
9523 psubd m10, m1, m9 | |
9524 paddd m1, m9 | |
9525 shufps m1, m10, 10001000b | |
9526 | |
9527 psrldq m9, m2, 4 | |
9528 psubd m10, m2, m9 | |
9529 paddd m2, m9 | |
9530 shufps m2, m10, 10001000b | |
9531 psrldq m9, m2, 4 | |
9532 psubd m10, m2, m9 | |
9533 paddd m2, m9 | |
9534 shufps m2, m10, 10001000b | |
9535 | |
9536 psrldq m9, m3, 4 | |
9537 psubd m10, m3, m9 | |
9538 paddd m3, m9 | |
9539 shufps m3, m10, 10001000b | |
9540 psrldq m9, m3, 4 | |
9541 psubd m10, m3, m9 | |
9542 paddd m3, m9 | |
9543 shufps m3, m10, 10001000b | |
9544 | |
9545 SUMSUB_BA d, 0, 1, 9 | |
9546 SUMSUB_BA d, 2, 3, 9 | |
9547 SUMSUB_BA d, 0, 2, 9 | |
9548 SUMSUB_BA d, 1, 3, 9 | |
9549 | |
9550 movu m4, [r5] | |
9551 movu m5, [r5 + r3] | |
9552 movu m6, [r5 + r3 * 2] | |
9553 movu m7, [r5 + r4] | |
9554 | |
9555 pmaddwd m4, m14 | |
9556 pmaddwd m5, m14 | |
9557 pmaddwd m6, m14 | |
9558 pmaddwd m7, m14 | |
9559 | |
9560 psrldq m9, m4, 4 | |
9561 psubd m10, m4, m9 | |
9562 paddd m4, m9 | |
9563 shufps m4, m10, 10001000b | |
9564 psrldq m9, m4, 4 | |
9565 psubd m10, m4, m9 | |
9566 paddd m4, m9 | |
9567 shufps m4, m10, 10001000b | |
9568 | |
9569 psrldq m9, m5, 4 | |
9570 psubd m10, m5, m9 | |
9571 paddd m5, m9 | |
9572 shufps m5, m10, 10001000b | |
9573 psrldq m9, m5, 4 | |
9574 psubd m10, m5, m9 | |
9575 paddd m5, m9 | |
9576 shufps m5, m10, 10001000b | |
9577 | |
9578 psrldq m9, m6, 4 | |
9579 psubd m10, m6, m9 | |
9580 paddd m6, m9 | |
9581 shufps m6, m10, 10001000b | |
9582 psrldq m9, m6, 4 | |
9583 psubd m10, m6, m9 | |
9584 paddd m6, m9 | |
9585 shufps m6, m10, 10001000b | |
9586 | |
9587 psrldq m9, m7, 4 | |
9588 psubd m10, m7, m9 | |
9589 paddd m7, m9 | |
9590 shufps m7, m10, 10001000b | |
9591 psrldq m9, m7, 4 | |
9592 psubd m10, m7, m9 | |
9593 paddd m7, m9 | |
9594 shufps m7, m10, 10001000b | |
9595 | |
9596 SUMSUB_BA d, 4, 5, 9 | |
9597 SUMSUB_BA d, 6, 7, 9 | |
9598 SUMSUB_BA d, 4, 6, 9 | |
9599 SUMSUB_BA d, 5, 7, 9 | |
9600 | |
9601 SUMSUB_BA d, 0, 4, 9 | |
9602 SUMSUB_BA d, 1, 5, 9 | |
9603 SUMSUB_BA d, 2, 6, 9 | |
9604 SUMSUB_BA d, 3, 7, 9 | |
9605 | |
9606 pabsd m0, m0 | |
9607 pabsd m2, m2 | |
9608 pabsd m1, m1 | |
9609 pabsd m3, m3 | |
9610 pabsd m4, m4 | |
9611 pabsd m5, m5 | |
9612 pabsd m6, m6 | |
9613 pabsd m7, m7 | |
9614 | |
9615 paddd m0, m2 | |
9616 paddd m1, m3 | |
9617 paddd m0, m1 | |
9618 paddd m5, m4 | |
9619 paddd m0, m5 | |
9620 paddd m7, m6 | |
9621 paddd m0, m7 | |
9622 paddd m0, m11 | |
9623 | |
; m0 = psy energy of block 2 (rounded >>2 minus its SAD term)
9624 movhlps m1, m0 | |
9625 paddd m0, m1 | |
9626 psrldq m1, m0, 4 | |
9627 paddd m0, m1 | |
9628 paddd m0, [pd_2] | |
9629 psrld m0, 2 | |
9630 psubd m0, m8 | |
9631 | |
; eax = |cost(block 1) - cost(block 2)|
9632 psubd m12, m0 | |
9633 pabsd m0, m12 | |
9634 movd eax, m0 | |
9635 RET | |
9636 %endif | |
9637 | |
;-----------------------------------------------------------------------------
; psy_cost_ss - one 8x8 tile of the ss psy-cost, accumulated into m15.
; Same algorithm as psyCost_ss_8x8 above, packaged for the tiled
; 16x16/32x32/64x64 drivers below. Caller contract (see those drivers):
;   r0/r2 = current 8x8 tile pointers, r1/r3 = byte strides (already doubled)
;   r4 = 3*r1, r6 = 3*r3 (unlike the 8x8 function, r4 is NOT reused for the
;   second block -- the second block indexes with r6)
;   m13 = [pw_pmpmpmpm], m14 = [pw_1], m15 = running |costA - costB| sum
; Clobbers m0-m12 and r5 (row-4 pointer).
;-----------------------------------------------------------------------------
9638 %macro psy_cost_ss 0 | |
; load the 8 rows of block 1
9639 movu m0, [r0] | |
9640 movu m1, [r0 + r1] | |
9641 movu m2, [r0 + r1 * 2] | |
9642 movu m3, [r0 + r4] | |
9643 lea r5, [r0 + r1 * 4] | |
9644 movu m4, [r5] | |
9645 movu m5, [r5 + r1] | |
9646 movu m6, [r5 + r1 * 2] | |
9647 movu m7, [r5 + r4] | |
9648 | |
; m8 = SAD(block 1) >> 2
9649 pabsw m8, m0 | |
9650 pabsw m9, m1 | |
9651 paddw m8, m9 | |
9652 pabsw m10, m2 | |
9653 pabsw m11, m3 | |
9654 paddw m10, m11 | |
9655 paddw m8, m10 | |
9656 pabsw m9, m4 | |
9657 pabsw m10, m5 | |
9658 paddw m9, m10 | |
9659 pabsw m11, m6 | |
9660 pabsw m12, m7 | |
9661 paddw m11, m12 | |
9662 paddw m9, m11 | |
9663 paddw m8, m9 | |
9664 movhlps m9, m8 | |
9665 pmovzxwd m8, m8 | |
9666 pmovzxwd m9, m9 | |
9667 paddd m8, m9 | |
9668 movhlps m9, m8 | |
9669 paddd m8, m9 | |
9670 psrldq m9, m8, 4 | |
9671 paddd m8, m9 | |
9672 psrld m8, 2 | |
9673 | |
; horizontal difference half (+1/-1 weights), butterfly via shift/add/sub
9674 pmaddwd m0, m13 | |
9675 pmaddwd m1, m13 | |
9676 pmaddwd m2, m13 | |
9677 pmaddwd m3, m13 | |
9678 | |
9679 psrldq m9, m0, 4 | |
9680 psubd m10, m0, m9 | |
9681 paddd m0, m9 | |
9682 shufps m0, m10, 10001000b | |
9683 psrldq m9, m0, 4 | |
9684 psubd m10, m0, m9 | |
9685 paddd m0, m9 | |
9686 shufps m0, m10, 10001000b | |
9687 | |
9688 psrldq m9, m1, 4 | |
9689 psubd m10, m1, m9 | |
9690 paddd m1, m9 | |
9691 shufps m1, m10, 10001000b | |
9692 psrldq m9, m1, 4 | |
9693 psubd m10, m1, m9 | |
9694 paddd m1, m9 | |
9695 shufps m1, m10, 10001000b | |
9696 | |
9697 psrldq m9, m2, 4 | |
9698 psubd m10, m2, m9 | |
9699 paddd m2, m9 | |
9700 shufps m2, m10, 10001000b | |
9701 psrldq m9, m2, 4 | |
9702 psubd m10, m2, m9 | |
9703 paddd m2, m9 | |
9704 shufps m2, m10, 10001000b | |
9705 | |
9706 psrldq m9, m3, 4 | |
9707 psubd m10, m3, m9 | |
9708 paddd m3, m9 | |
9709 shufps m3, m10, 10001000b | |
9710 psrldq m9, m3, 4 | |
9711 psubd m10, m3, m9 | |
9712 paddd m3, m9 | |
9713 shufps m3, m10, 10001000b | |
9714 | |
; vertical butterflies
9715 SUMSUB_BA d, 0, 1, 9 | |
9716 SUMSUB_BA d, 2, 3, 9 | |
9717 SUMSUB_BA d, 0, 2, 9 | |
9718 SUMSUB_BA d, 1, 3, 9 | |
9719 | |
9720 pmaddwd m4, m13 | |
9721 pmaddwd m5, m13 | |
9722 pmaddwd m6, m13 | |
9723 pmaddwd m7, m13 | |
9724 | |
9725 psrldq m9, m4, 4 | |
9726 psubd m10, m4, m9 | |
9727 paddd m4, m9 | |
9728 shufps m4, m10, 10001000b | |
9729 psrldq m9, m4, 4 | |
9730 psubd m10, m4, m9 | |
9731 paddd m4, m9 | |
9732 shufps m4, m10, 10001000b | |
9733 | |
9734 psrldq m9, m5, 4 | |
9735 psubd m10, m5, m9 | |
9736 paddd m5, m9 | |
9737 shufps m5, m10, 10001000b | |
9738 psrldq m9, m5, 4 | |
9739 psubd m10, m5, m9 | |
9740 paddd m5, m9 | |
9741 shufps m5, m10, 10001000b | |
9742 | |
9743 psrldq m9, m6, 4 | |
9744 psubd m10, m6, m9 | |
9745 paddd m6, m9 | |
9746 shufps m6, m10, 10001000b | |
9747 psrldq m9, m6, 4 | |
9748 psubd m10, m6, m9 | |
9749 paddd m6, m9 | |
9750 shufps m6, m10, 10001000b | |
9751 | |
9752 psrldq m9, m7, 4 | |
9753 psubd m10, m7, m9 | |
9754 paddd m7, m9 | |
9755 shufps m7, m10, 10001000b | |
9756 psrldq m9, m7, 4 | |
9757 psubd m10, m7, m9 | |
9758 paddd m7, m9 | |
9759 shufps m7, m10, 10001000b | |
9760 | |
9761 SUMSUB_BA d, 4, 5, 9 | |
9762 SUMSUB_BA d, 6, 7, 9 | |
9763 SUMSUB_BA d, 4, 6, 9 | |
9764 SUMSUB_BA d, 5, 7, 9 | |
9765 | |
9766 SUMSUB_BA d, 0, 4, 9 | |
9767 SUMSUB_BA d, 1, 5, 9 | |
9768 SUMSUB_BA d, 2, 6, 9 | |
9769 SUMSUB_BA d, 3, 7, 9 | |
9770 | |
; m11 = sum of |coeff| for the difference half
9771 pabsd m0, m0 | |
9772 pabsd m2, m2 | |
9773 pabsd m1, m1 | |
9774 pabsd m3, m3 | |
9775 pabsd m4, m4 | |
9776 pabsd m5, m5 | |
9777 pabsd m6, m6 | |
9778 pabsd m7, m7 | |
9779 | |
9780 paddd m0, m2 | |
9781 paddd m1, m3 | |
9782 paddd m0, m1 | |
9783 paddd m5, m4 | |
9784 paddd m0, m5 | |
9785 paddd m7, m6 | |
9786 paddd m11, m0, m7 | |
9787 | |
; second pass, sum half (pw_1 weights) over the same rows
9788 movu m0, [r0] | |
9789 movu m1, [r0 + r1] | |
9790 movu m2, [r0 + r1 * 2] | |
9791 movu m3, [r0 + r4] | |
9792 | |
9793 pmaddwd m0, m14 | |
9794 pmaddwd m1, m14 | |
9795 pmaddwd m2, m14 | |
9796 pmaddwd m3, m14 | |
9797 | |
9798 psrldq m9, m0, 4 | |
9799 psubd m10, m0, m9 | |
9800 paddd m0, m9 | |
9801 shufps m0, m10, 10001000b | |
9802 psrldq m9, m0, 4 | |
9803 psubd m10, m0, m9 | |
9804 paddd m0, m9 | |
9805 shufps m0, m10, 10001000b | |
9806 | |
9807 psrldq m9, m1, 4 | |
9808 psubd m10, m1, m9 | |
9809 paddd m1, m9 | |
9810 shufps m1, m10, 10001000b | |
9811 psrldq m9, m1, 4 | |
9812 psubd m10, m1, m9 | |
9813 paddd m1, m9 | |
9814 shufps m1, m10, 10001000b | |
9815 | |
9816 psrldq m9, m2, 4 | |
9817 psubd m10, m2, m9 | |
9818 paddd m2, m9 | |
9819 shufps m2, m10, 10001000b | |
9820 psrldq m9, m2, 4 | |
9821 psubd m10, m2, m9 | |
9822 paddd m2, m9 | |
9823 shufps m2, m10, 10001000b | |
9824 | |
9825 psrldq m9, m3, 4 | |
9826 psubd m10, m3, m9 | |
9827 paddd m3, m9 | |
9828 shufps m3, m10, 10001000b | |
9829 psrldq m9, m3, 4 | |
9830 psubd m10, m3, m9 | |
9831 paddd m3, m9 | |
9832 shufps m3, m10, 10001000b | |
9833 | |
9834 SUMSUB_BA d, 0, 1, 9 | |
9835 SUMSUB_BA d, 2, 3, 9 | |
9836 SUMSUB_BA d, 0, 2, 9 | |
9837 SUMSUB_BA d, 1, 3, 9 | |
9838 | |
9839 movu m4, [r5] | |
9840 movu m5, [r5 + r1] | |
9841 movu m6, [r5 + r1 * 2] | |
9842 movu m7, [r5 + r4] | |
9843 | |
9844 pmaddwd m4, m14 | |
9845 pmaddwd m5, m14 | |
9846 pmaddwd m6, m14 | |
9847 pmaddwd m7, m14 | |
9848 | |
9849 psrldq m9, m4, 4 | |
9850 psubd m10, m4, m9 | |
9851 paddd m4, m9 | |
9852 shufps m4, m10, 10001000b | |
9853 psrldq m9, m4, 4 | |
9854 psubd m10, m4, m9 | |
9855 paddd m4, m9 | |
9856 shufps m4, m10, 10001000b | |
9857 | |
9858 psrldq m9, m5, 4 | |
9859 psubd m10, m5, m9 | |
9860 paddd m5, m9 | |
9861 shufps m5, m10, 10001000b | |
9862 psrldq m9, m5, 4 | |
9863 psubd m10, m5, m9 | |
9864 paddd m5, m9 | |
9865 shufps m5, m10, 10001000b | |
9866 | |
9867 psrldq m9, m6, 4 | |
9868 psubd m10, m6, m9 | |
9869 paddd m6, m9 | |
9870 shufps m6, m10, 10001000b | |
9871 psrldq m9, m6, 4 | |
9872 psubd m10, m6, m9 | |
9873 paddd m6, m9 | |
9874 shufps m6, m10, 10001000b | |
9875 | |
9876 psrldq m9, m7, 4 | |
9877 psubd m10, m7, m9 | |
9878 paddd m7, m9 | |
9879 shufps m7, m10, 10001000b | |
9880 psrldq m9, m7, 4 | |
9881 psubd m10, m7, m9 | |
9882 paddd m7, m9 | |
9883 shufps m7, m10, 10001000b | |
9884 | |
9885 SUMSUB_BA d, 4, 5, 9 | |
9886 SUMSUB_BA d, 6, 7, 9 | |
9887 SUMSUB_BA d, 4, 6, 9 | |
9888 SUMSUB_BA d, 5, 7, 9 | |
9889 | |
9890 SUMSUB_BA d, 0, 4, 9 | |
9891 SUMSUB_BA d, 1, 5, 9 | |
9892 SUMSUB_BA d, 2, 6, 9 | |
9893 SUMSUB_BA d, 3, 7, 9 | |
9894 | |
9895 pabsd m0, m0 | |
9896 pabsd m2, m2 | |
9897 pabsd m1, m1 | |
9898 pabsd m3, m3 | |
9899 pabsd m4, m4 | |
9900 pabsd m5, m5 | |
9901 pabsd m6, m6 | |
9902 pabsd m7, m7 | |
9903 | |
9904 paddd m0, m2 | |
9905 paddd m1, m3 | |
9906 paddd m0, m1 | |
9907 paddd m5, m4 | |
9908 paddd m0, m5 | |
9909 paddd m7, m6 | |
9910 paddd m0, m7 | |
9911 paddd m0, m11 | |
9912 | |
; reduce, round (+2) >> 2, subtract SAD: m12 = psy energy of block 1
9913 movhlps m1, m0 | |
9914 paddd m0, m1 | |
9915 psrldq m1, m0, 4 | |
9916 paddd m0, m1 | |
9917 paddd m0, [pd_2] | |
9918 psrld m0, 2 | |
9919 psubd m12, m0, m8 | |
9920 | |
; identical computation for block 2 (r2 / r3, r6 = 3*r3 from the caller)
9921 movu m0, [r2] | |
9922 movu m1, [r2 + r3] | |
9923 movu m2, [r2 + r3 * 2] | |
9924 movu m3, [r2 + r6] | |
9925 lea r5, [r2 + r3 * 4] | |
9926 movu m4, [r5] | |
9927 movu m5, [r5 + r3] | |
9928 movu m6, [r5 + r3 * 2] | |
9929 movu m7, [r5 + r6] | |
9930 | |
9931 pabsw m8, m0 | |
9932 pabsw m9, m1 | |
9933 paddw m8, m9 | |
9934 pabsw m10, m2 | |
9935 pabsw m11, m3 | |
9936 paddw m10, m11 | |
9937 paddw m8, m10 | |
9938 pabsw m9, m4 | |
9939 pabsw m10, m5 | |
9940 paddw m9, m10 | |
9941 pabsw m11, m6 | |
9942 pabsw m10, m7 | |
9943 paddw m11, m10 | |
9944 paddw m9, m11 | |
9945 paddw m8, m9 | |
9946 movhlps m9, m8 | |
9947 pmovzxwd m8, m8 | |
9948 pmovzxwd m9, m9 | |
9949 paddd m8, m9 | |
9950 movhlps m9, m8 | |
9951 paddd m8, m9 | |
9952 psrldq m9, m8, 4 | |
9953 paddd m8, m9 | |
9954 psrld m8, 2 | |
9955 | |
9956 pmaddwd m0, m13 | |
9957 pmaddwd m1, m13 | |
9958 pmaddwd m2, m13 | |
9959 pmaddwd m3, m13 | |
9960 | |
9961 psrldq m9, m0, 4 | |
9962 psubd m10, m0, m9 | |
9963 paddd m0, m9 | |
9964 shufps m0, m10, 10001000b | |
9965 psrldq m9, m0, 4 | |
9966 psubd m10, m0, m9 | |
9967 paddd m0, m9 | |
9968 shufps m0, m10, 10001000b | |
9969 | |
9970 psrldq m9, m1, 4 | |
9971 psubd m10, m1, m9 | |
9972 paddd m1, m9 | |
9973 shufps m1, m10, 10001000b | |
9974 psrldq m9, m1, 4 | |
9975 psubd m10, m1, m9 | |
9976 paddd m1, m9 | |
9977 shufps m1, m10, 10001000b | |
9978 | |
9979 psrldq m9, m2, 4 | |
9980 psubd m10, m2, m9 | |
9981 paddd m2, m9 | |
9982 shufps m2, m10, 10001000b | |
9983 psrldq m9, m2, 4 | |
9984 psubd m10, m2, m9 | |
9985 paddd m2, m9 | |
9986 shufps m2, m10, 10001000b | |
9987 | |
9988 psrldq m9, m3, 4 | |
9989 psubd m10, m3, m9 | |
9990 paddd m3, m9 | |
9991 shufps m3, m10, 10001000b | |
9992 psrldq m9, m3, 4 | |
9993 psubd m10, m3, m9 | |
9994 paddd m3, m9 | |
9995 shufps m3, m10, 10001000b | |
9996 | |
9997 SUMSUB_BA d, 0, 1, 9 | |
9998 SUMSUB_BA d, 2, 3, 9 | |
9999 SUMSUB_BA d, 0, 2, 9 | |
10000 SUMSUB_BA d, 1, 3, 9 | |
10001 | |
10002 pmaddwd m4, m13 | |
10003 pmaddwd m5, m13 | |
10004 pmaddwd m6, m13 | |
10005 pmaddwd m7, m13 | |
10006 | |
10007 psrldq m9, m4, 4 | |
10008 psubd m10, m4, m9 | |
10009 paddd m4, m9 | |
10010 shufps m4, m10, 10001000b | |
10011 psrldq m9, m4, 4 | |
10012 psubd m10, m4, m9 | |
10013 paddd m4, m9 | |
10014 shufps m4, m10, 10001000b | |
10015 | |
10016 psrldq m9, m5, 4 | |
10017 psubd m10, m5, m9 | |
10018 paddd m5, m9 | |
10019 shufps m5, m10, 10001000b | |
10020 psrldq m9, m5, 4 | |
10021 psubd m10, m5, m9 | |
10022 paddd m5, m9 | |
10023 shufps m5, m10, 10001000b | |
10024 | |
10025 psrldq m9, m6, 4 | |
10026 psubd m10, m6, m9 | |
10027 paddd m6, m9 | |
10028 shufps m6, m10, 10001000b | |
10029 psrldq m9, m6, 4 | |
10030 psubd m10, m6, m9 | |
10031 paddd m6, m9 | |
10032 shufps m6, m10, 10001000b | |
10033 | |
10034 psrldq m9, m7, 4 | |
10035 psubd m10, m7, m9 | |
10036 paddd m7, m9 | |
10037 shufps m7, m10, 10001000b | |
10038 psrldq m9, m7, 4 | |
10039 psubd m10, m7, m9 | |
10040 paddd m7, m9 | |
10041 shufps m7, m10, 10001000b | |
10042 | |
10043 SUMSUB_BA d, 4, 5, 9 | |
10044 SUMSUB_BA d, 6, 7, 9 | |
10045 SUMSUB_BA d, 4, 6, 9 | |
10046 SUMSUB_BA d, 5, 7, 9 | |
10047 | |
10048 SUMSUB_BA d, 0, 4, 9 | |
10049 SUMSUB_BA d, 1, 5, 9 | |
10050 SUMSUB_BA d, 2, 6, 9 | |
10051 SUMSUB_BA d, 3, 7, 9 | |
10052 | |
10053 pabsd m0, m0 | |
10054 pabsd m2, m2 | |
10055 pabsd m1, m1 | |
10056 pabsd m3, m3 | |
10057 pabsd m4, m4 | |
10058 pabsd m5, m5 | |
10059 pabsd m6, m6 | |
10060 pabsd m7, m7 | |
10061 | |
10062 paddd m0, m2 | |
10063 paddd m1, m3 | |
10064 paddd m0, m1 | |
10065 paddd m5, m4 | |
10066 paddd m0, m5 | |
10067 paddd m7, m6 | |
10068 paddd m11, m0, m7 | |
10069 | |
10070 movu m0, [r2] | |
10071 movu m1, [r2 + r3] | |
10072 movu m2, [r2 + r3 * 2] | |
10073 movu m3, [r2 + r6] | |
10074 | |
10075 pmaddwd m0, m14 | |
10076 pmaddwd m1, m14 | |
10077 pmaddwd m2, m14 | |
10078 pmaddwd m3, m14 | |
10079 | |
10080 psrldq m9, m0, 4 | |
10081 psubd m10, m0, m9 | |
10082 paddd m0, m9 | |
10083 shufps m0, m10, 10001000b | |
10084 psrldq m9, m0, 4 | |
10085 psubd m10, m0, m9 | |
10086 paddd m0, m9 | |
10087 shufps m0, m10, 10001000b | |
10088 | |
10089 psrldq m9, m1, 4 | |
10090 psubd m10, m1, m9 | |
10091 paddd m1, m9 | |
10092 shufps m1, m10, 10001000b | |
10093 psrldq m9, m1, 4 | |
10094 psubd m10, m1, m9 | |
10095 paddd m1, m9 | |
10096 shufps m1, m10, 10001000b | |
10097 | |
10098 psrldq m9, m2, 4 | |
10099 psubd m10, m2, m9 | |
10100 paddd m2, m9 | |
10101 shufps m2, m10, 10001000b | |
10102 psrldq m9, m2, 4 | |
10103 psubd m10, m2, m9 | |
10104 paddd m2, m9 | |
10105 shufps m2, m10, 10001000b | |
10106 | |
10107 psrldq m9, m3, 4 | |
10108 psubd m10, m3, m9 | |
10109 paddd m3, m9 | |
10110 shufps m3, m10, 10001000b | |
10111 psrldq m9, m3, 4 | |
10112 psubd m10, m3, m9 | |
10113 paddd m3, m9 | |
10114 shufps m3, m10, 10001000b | |
10115 | |
10116 SUMSUB_BA d, 0, 1, 9 | |
10117 SUMSUB_BA d, 2, 3, 9 | |
10118 SUMSUB_BA d, 0, 2, 9 | |
10119 SUMSUB_BA d, 1, 3, 9 | |
10120 | |
10121 movu m4, [r5] | |
10122 movu m5, [r5 + r3] | |
10123 movu m6, [r5 + r3 * 2] | |
10124 movu m7, [r5 + r6] | |
10125 | |
10126 pmaddwd m4, m14 | |
10127 pmaddwd m5, m14 | |
10128 pmaddwd m6, m14 | |
10129 pmaddwd m7, m14 | |
10130 | |
10131 psrldq m9, m4, 4 | |
10132 psubd m10, m4, m9 | |
10133 paddd m4, m9 | |
10134 shufps m4, m10, 10001000b | |
10135 psrldq m9, m4, 4 | |
10136 psubd m10, m4, m9 | |
10137 paddd m4, m9 | |
10138 shufps m4, m10, 10001000b | |
10139 | |
10140 psrldq m9, m5, 4 | |
10141 psubd m10, m5, m9 | |
10142 paddd m5, m9 | |
10143 shufps m5, m10, 10001000b | |
10144 psrldq m9, m5, 4 | |
10145 psubd m10, m5, m9 | |
10146 paddd m5, m9 | |
10147 shufps m5, m10, 10001000b | |
10148 | |
10149 psrldq m9, m6, 4 | |
10150 psubd m10, m6, m9 | |
10151 paddd m6, m9 | |
10152 shufps m6, m10, 10001000b | |
10153 psrldq m9, m6, 4 | |
10154 psubd m10, m6, m9 | |
10155 paddd m6, m9 | |
10156 shufps m6, m10, 10001000b | |
10157 | |
10158 psrldq m9, m7, 4 | |
10159 psubd m10, m7, m9 | |
10160 paddd m7, m9 | |
10161 shufps m7, m10, 10001000b | |
10162 psrldq m9, m7, 4 | |
10163 psubd m10, m7, m9 | |
10164 paddd m7, m9 | |
10165 shufps m7, m10, 10001000b | |
10166 | |
10167 SUMSUB_BA d, 4, 5, 9 | |
10168 SUMSUB_BA d, 6, 7, 9 | |
10169 SUMSUB_BA d, 4, 6, 9 | |
10170 SUMSUB_BA d, 5, 7, 9 | |
10171 | |
10172 SUMSUB_BA d, 0, 4, 9 | |
10173 SUMSUB_BA d, 1, 5, 9 | |
10174 SUMSUB_BA d, 2, 6, 9 | |
10175 SUMSUB_BA d, 3, 7, 9 | |
10176 | |
10177 pabsd m0, m0 | |
10178 pabsd m2, m2 | |
10179 pabsd m1, m1 | |
10180 pabsd m3, m3 | |
10181 pabsd m4, m4 | |
10182 pabsd m5, m5 | |
10183 pabsd m6, m6 | |
10184 pabsd m7, m7 | |
10185 | |
10186 paddd m0, m2 | |
10187 paddd m1, m3 | |
10188 paddd m0, m1 | |
10189 paddd m5, m4 | |
10190 paddd m0, m5 | |
10191 paddd m7, m6 | |
10192 paddd m0, m7 | |
10193 paddd m0, m11 | |
10194 | |
10195 movhlps m1, m0 | |
10196 paddd m0, m1 | |
10197 psrldq m1, m0, 4 | |
10198 paddd m0, m1 | |
10199 paddd m0, [pd_2] | |
10200 psrld m0, 2 | |
10201 psubd m0, m8 | |
10202 | |
; accumulate |cost(block 1) - cost(block 2)| into m15
10203 psubd m12, m0 | |
10204 pabsd m0, m12 | |
10205 paddd m15, m0 | |
10206 %endmacro | |
10207 | |
;-----------------------------------------------------------------------------
; int psyCost_ss_16x16(const int16_t* pix1, intptr_t stride1,
;                      const int16_t* pix2, intptr_t stride2)
; Tiled driver: walks a 2x2 grid of 8x8 tiles, invoking psy_cost_ss on each
; and accumulating the per-tile costs in m15. Sets up the macro's register
; contract (r4 = 3*r1, r6 = 3*r3, m13/m14 constants, m15 = 0).
;-----------------------------------------------------------------------------
10208 %if ARCH_X86_64 | |
10209 INIT_XMM sse4 | |
10210 cglobal psyCost_ss_16x16, 4, 9, 16 | |
10211 | |
10212 mova m13, [pw_pmpmpmpm] | |
10213 mova m14, [pw_1] | |
; element strides -> byte strides (16-bit samples)
10214 add r1, r1 | |
10215 add r3, r3 | |
10216 lea r4, [3 * r1] | |
10217 lea r6, [3 * r3] | |
10218 pxor m15, m15 | |
10219 mov r7d, 2 | |
10220 .loopH: | |
10221 mov r8d, 2 | |
10222 .loopW: | |
10223 psy_cost_ss | |
; next tile to the right: 8 samples * 2 bytes = 16 bytes
10224 add r0, 16 | |
10225 add r2, 16 | |
10226 dec r8d | |
10227 jnz .loopW | |
; next tile row: down 8 rows, rewind the 32 bytes (2 tiles) advanced above
10228 lea r0, [r0 + r1 * 8 - 32] | |
10229 lea r2, [r2 + r3 * 8 - 32] | |
10230 dec r7d | |
10231 jnz .loopH | |
10232 movd eax, m15 | |
10233 RET | |
10234 %endif | |
10235 | |
;-----------------------------------------------------------------------------
; int psyCost_ss_32x32(const int16_t* pix1, intptr_t stride1,
;                      const int16_t* pix2, intptr_t stride2)
; Tiled driver: 4x4 grid of 8x8 tiles via psy_cost_ss, accumulated in m15.
; Identical structure to psyCost_ss_16x16, only the loop counts and the
; end-of-row rewind (64 bytes = 4 tiles * 16 bytes) differ.
;-----------------------------------------------------------------------------
10236 %if ARCH_X86_64 | |
10237 INIT_XMM sse4 | |
10238 cglobal psyCost_ss_32x32, 4, 9, 16 | |
10239 | |
10240 mova m13, [pw_pmpmpmpm] | |
10241 mova m14, [pw_1] | |
10242 add r1, r1 | |
10243 add r3, r3 | |
10244 lea r4, [3 * r1] | |
10245 lea r6, [3 * r3] | |
10246 pxor m15, m15 | |
10247 mov r7d, 4 | |
10248 .loopH: | |
10249 mov r8d, 4 | |
10250 .loopW: | |
10251 psy_cost_ss | |
10252 add r0, 16 | |
10253 add r2, 16 | |
10254 dec r8d | |
10255 jnz .loopW | |
10256 lea r0, [r0 + r1 * 8 - 64] | |
10257 lea r2, [r2 + r3 * 8 - 64] | |
10258 dec r7d | |
10259 jnz .loopH | |
10260 movd eax, m15 | |
10261 RET | |
10262 %endif | |
10263 | |
;-----------------------------------------------------------------------------
; int psyCost_ss_64x64(const int16_t* pix1, intptr_t stride1,
;                      const int16_t* pix2, intptr_t stride2)
; Tiled driver: 8x8 grid of 8x8 tiles via psy_cost_ss, accumulated in m15.
; Same structure as the 16x16/32x32 drivers; rewind is 128 bytes (8 tiles).
;-----------------------------------------------------------------------------
10264 %if ARCH_X86_64 | |
10265 INIT_XMM sse4 | |
10266 cglobal psyCost_ss_64x64, 4, 9, 16 | |
10267 | |
10268 mova m13, [pw_pmpmpmpm] | |
10269 mova m14, [pw_1] | |
10270 add r1, r1 | |
10271 add r3, r3 | |
10272 lea r4, [3 * r1] | |
10273 lea r6, [3 * r3] | |
10274 pxor m15, m15 | |
10275 mov r7d, 8 | |
10276 .loopH: | |
10277 mov r8d, 8 | |
10278 .loopW: | |
10279 psy_cost_ss | |
10280 add r0, 16 | |
10281 add r2, 16 | |
10282 dec r8d | |
10283 jnz .loopW | |
10284 lea r0, [r0 + r1 * 8 - 128] | |
10285 lea r2, [r2 + r3 * 8 - 128] | |
10286 dec r7d | |
10287 jnz .loopH | |
10288 movd eax, m15 | |
10289 RET | |
10290 %endif | |
10291 | |
;-----------------------------------------------------------------------------
; int psyCost_ss_4x4(const int16_t* pix1, intptr_t stride1,
;                    const int16_t* pix2, intptr_t stride2)  [AVX2]
; Both 4x4 blocks are processed simultaneously: block 1 sits in the low
; 128-bit lane of each ymm, block 2 in the high lane, so a single pass
; yields both per-block costs; the final lane subtraction gives the result.
;-----------------------------------------------------------------------------
10292 INIT_YMM avx2 | |
10293 cglobal psyCost_ss_4x4, 4, 5, 8 | |
; element strides -> byte strides (16-bit samples)
10294 add r1, r1 | |
10295 add r3, r3 | |
; each movddup loads one 4-sample row (8 bytes) into the xmm
10296 lea r4, [3 * r1] | |
10297 movddup m0, [r0] | |
10298 movddup m1, [r0 + r1] | |
10299 movddup m2, [r0 + r1 * 2] | |
10300 movddup m3, [r0 + r4] | |
10301 | |
10302 lea r4, [3 * r3] | |
10303 movddup m4, [r2] | |
10304 movddup m5, [r2 + r3] | |
10305 movddup m6, [r2 + r3 * 2] | |
10306 movddup m7, [r2 + r4] | |
10307 | |
; pack block 2's rows into the high lanes alongside block 1
10308 vinserti128 m0, m0, xm4, 1 | |
10309 vinserti128 m1, m1, xm5, 1 | |
10310 vinserti128 m2, m2, xm6, 1 | |
10311 vinserti128 m3, m3, xm7, 1 | |
10312 | |
; per-lane SAD: sum |sample|, reduce, >>2 -> m6 holds SAD>>2 per block
10313 pabsw m4, m0 | |
10314 pabsw m5, m1 | |
10315 paddw m5, m4 | |
10316 pabsw m4, m2 | |
10317 paddw m5, m4 | |
10318 pabsw m4, m3 | |
10319 paddw m5, m4 | |
10320 pmaddwd m5, [pw_1] | |
10321 psrldq m4, m5, 4 | |
10322 paddd m5, m4 | |
10323 psrld m6, m5, 2 | |
10324 | |
; horizontal transform: pmaddwd with the [hmul_8w] +/- pattern, then one
; add/sub + shufps butterfly level per row
10325 mova m4, [hmul_8w] | |
10326 pmaddwd m0, m4 | |
10327 pmaddwd m1, m4 | |
10328 pmaddwd m2, m4 | |
10329 pmaddwd m3, m4 | |
10330 | |
10331 psrldq m4, m0, 4 | |
10332 psubd m5, m0, m4 | |
10333 paddd m0, m4 | |
10334 shufps m0, m0, m5, 10001000b | |
10335 | |
10336 psrldq m4, m1, 4 | |
10337 psubd m5, m1, m4 | |
10338 paddd m1, m4 | |
10339 shufps m1, m1, m5, 10001000b | |
10340 | |
10341 psrldq m4, m2, 4 | |
10342 psubd m5, m2, m4 | |
10343 paddd m2, m4 | |
10344 shufps m2, m2, m5, 10001000b | |
10345 | |
10346 psrldq m4, m3, 4 | |
10347 psubd m5, m3, m4 | |
10348 paddd m3, m4 | |
10349 shufps m3, m3, m5, 10001000b | |
10350 | |
; vertical butterflies across the 4 rows (open-coded sum/difference pairs)
10351 mova m4, m0 | |
10352 paddd m0, m1 | |
10353 psubd m1, m4 | |
10354 mova m4, m2 | |
10355 paddd m2, m3 | |
10356 psubd m3, m4 | |
10357 mova m4, m0 | |
10358 paddd m0, m2 | |
10359 psubd m2, m4 | |
10360 mova m4, m1 | |
10361 paddd m1, m3 | |
10362 psubd m3, m4 | |
10363 | |
; sum |coeff| per lane, reduce, >>1, subtract the SAD term in m6
10364 pabsd m0, m0 | |
10365 pabsd m2, m2 | |
10366 pabsd m1, m1 | |
10367 pabsd m3, m3 | |
10368 paddd m0, m2 | |
10369 paddd m1, m3 | |
10370 paddd m0, m1 | |
10371 psrldq m1, m0, 8 | |
10372 paddd m0, m1 | |
10373 psrldq m1, m0, 4 | |
10374 paddd m0, m1 | |
10375 psrld m0, 1 | |
10376 psubd m0, m6 | |
; lanes now hold cost(block 1) / cost(block 2); eax = |difference|
10377 vextracti128 xm1, m0, 1 | |
10378 psubd m0, m1 | |
10379 pabsd m0, m0 | |
10380 movd eax, xm0 | |
10381 RET | |
10382 | |
;-----------------------------------------------------------------------------
; PSY_SS_8x8 (AVX2 kernel shared by the psyCost_ss_* entry points below)
; Computes the psy cost of one 8x8 block of 16-bit residual samples.
;
; Inputs set up by the caller:
;   r0 / r1   source pointer / stride (caller has doubled strides to bytes)
;   r2 / r3   recon  pointer / stride
;   m12       [pw_1]          all-ones words, for the plain horizontal sums
;   m13       [pw_pmpmpmpm]   +1/-1 word pattern, for the signed first pass
;   rsp       at least 8*mmsize bytes of 32-byte-aligned scratch
;
; The 8 source rows are loaded into the LOW 128-bit lanes of m0-m7 and the
; 8 recon rows into the HIGH lanes, so both blocks are processed at once;
; the two per-lane energies are differenced at the very end.
;
; Output:  m0 dword 0 = |energy(source) - energy(recon)|
;          where energy = ((abs transform sum + 2) >> 2) - (sad >> 2)
; Clobbers: r4, r6, m0-m11
;-----------------------------------------------------------------------------
%macro PSY_SS_8x8 0
    ; --- load 8 rows of source into the low lanes ---
    lea             r4, [3 * r1]
    lea             r6, [r0 + r1 * 4]
    movu            xm0, [r0]
    movu            xm1, [r0 + r1]
    movu            xm2, [r0 + r1 * 2]
    movu            xm3, [r0 + r4]
    movu            xm4, [r6]
    movu            xm5, [r6 + r1]
    movu            xm6, [r6 + r1 * 2]
    movu            xm7, [r6 + r4]

    ; --- load 8 rows of recon into the high lanes ---
    lea             r4, [3 * r3]
    lea             r6, [r2 + r3 * 4]
    movu            xm8, [r2]
    movu            xm9, [r2 + r3]
    movu            xm10, [r2 + r3 * 2]
    movu            xm11, [r2 + r4]
    vinserti128     m0, m0, xm8, 1
    vinserti128     m1, m1, xm9, 1
    vinserti128     m2, m2, xm10, 1
    vinserti128     m3, m3, xm11, 1
    movu            xm8, [r6]
    movu            xm9, [r6 + r3]
    movu            xm10, [r6 + r3 * 2]
    movu            xm11, [r6 + r4]
    vinserti128     m4, m4, xm8, 1
    vinserti128     m5, m5, xm9, 1
    vinserti128     m6, m6, xm10, 1
    vinserti128     m7, m7, xm11, 1

    ; store on stack to use later (second transform pass re-reads the rows)
    mova            [rsp + 0 * mmsize], m0
    mova            [rsp + 1 * mmsize], m1
    mova            [rsp + 2 * mmsize], m2
    mova            [rsp + 3 * mmsize], m3
    mova            [rsp + 4 * mmsize], m4
    mova            [rsp + 5 * mmsize], m5
    mova            [rsp + 6 * mmsize], m6
    mova            [rsp + 7 * mmsize], m7

    ; --- per-lane SAD: sum |sample| over all 8 rows ---
    pabsw           m8, m0
    pabsw           m9, m1
    paddw           m8, m9
    pabsw           m10, m2
    pabsw           m11, m3
    paddw           m10, m11
    paddw           m8, m10
    pabsw           m9, m4
    pabsw           m10, m5
    paddw           m9, m10
    pabsw           m11, m6
    pabsw           m10, m7
    paddw           m11, m10
    paddw           m9, m11
    paddw           m8, m9
    psrldq          m9, m8, 8

    ; widen the word sums to dwords before the final reduction (avoids
    ; 16-bit overflow), then fold down to one dword per lane and >> 2
    vextracti128    xm10, m8, 1
    vextracti128    xm11, m9, 1

    vpmovzxwd       m8, xm8
    vpmovzxwd       m9, xm9
    vpmovzxwd       m10, xm10
    vpmovzxwd       m11, xm11

    vinserti128     m8, m8, xm10, 1
    vinserti128     m9, m9, xm11, 1

    paddd           m8, m9
    psrldq          m9, m8, 8
    paddd           m8, m9
    psrldq          m9, m8, 4
    paddd           m8, m9
    psrld           m8, 2               ; sad_4x4

    ; --- pass 1: rows weighted by the +/- pattern (m13) ---
    pmaddwd         m0, m13
    pmaddwd         m1, m13
    pmaddwd         m2, m13
    pmaddwd         m3, m13

    ; horizontal butterfly on dwords, twice per row: each
    ; psrldq/psubd/paddd/vshufps group is one sum/difference stage
    psrldq          m9, m0, 4
    psubd           m10, m0, m9
    paddd           m0, m9
    vshufps         m0, m0, m10, 10001000b
    psrldq          m9, m0, 4
    psubd           m10, m0, m9
    paddd           m0, m9
    vshufps         m0, m0, m10, 10001000b

    psrldq          m9, m1, 4
    psubd           m10, m1, m9
    paddd           m1, m9
    vshufps         m1, m1, m10, 10001000b
    psrldq          m9, m1, 4
    psubd           m10, m1, m9
    paddd           m1, m9
    vshufps         m1, m1, m10, 10001000b

    psrldq          m9, m2, 4
    psubd           m10, m2, m9
    paddd           m2, m9
    vshufps         m2, m2, m10, 10001000b
    psrldq          m9, m2, 4
    psubd           m10, m2, m9
    paddd           m2, m9
    vshufps         m2, m2, m10, 10001000b

    psrldq          m9, m3, 4
    psubd           m10, m3, m9
    paddd           m3, m9
    vshufps         m3, m3, m10, 10001000b
    psrldq          m9, m3, 4
    psubd           m10, m3, m9
    paddd           m3, m9
    vshufps         m3, m3, m10, 10001000b

    ; vertical butterflies across rows 0-3
    SUMSUB_BA       d, 0, 1, 9
    SUMSUB_BA       d, 2, 3, 9
    SUMSUB_BA       d, 0, 2, 9
    SUMSUB_BA       d, 1, 3, 9

    ; same horizontal stages for rows 4-7
    pmaddwd         m4, m13
    pmaddwd         m5, m13
    pmaddwd         m6, m13
    pmaddwd         m7, m13

    psrldq          m9, m4, 4
    psubd           m10, m4, m9
    paddd           m4, m9
    vshufps         m4, m4, m10, 10001000b
    psrldq          m9, m4, 4
    psubd           m10, m4, m9
    paddd           m4, m9
    vshufps         m4, m4, m10, 10001000b

    psrldq          m9, m5, 4
    psubd           m10, m5, m9
    paddd           m5, m9
    vshufps         m5, m5, m10, 10001000b
    psrldq          m9, m5, 4
    psubd           m10, m5, m9
    paddd           m5, m9
    vshufps         m5, m5, m10, 10001000b

    psrldq          m9, m6, 4
    psubd           m10, m6, m9
    paddd           m6, m9
    vshufps         m6, m6, m10, 10001000b
    psrldq          m9, m6, 4
    psubd           m10, m6, m9
    paddd           m6, m9
    vshufps         m6, m6, m10, 10001000b

    psrldq          m9, m7, 4
    psubd           m10, m7, m9
    paddd           m7, m9
    vshufps         m7, m7, m10, 10001000b
    psrldq          m9, m7, 4
    psubd           m10, m7, m9
    paddd           m7, m9
    vshufps         m7, m7, m10, 10001000b

    SUMSUB_BA       d, 4, 5, 9
    SUMSUB_BA       d, 6, 7, 9
    SUMSUB_BA       d, 4, 6, 9
    SUMSUB_BA       d, 5, 7, 9

    ; cross butterflies between the two row halves
    SUMSUB_BA       d, 0, 4, 9
    SUMSUB_BA       d, 1, 5, 9
    SUMSUB_BA       d, 2, 6, 9
    SUMSUB_BA       d, 3, 7, 9

    ; abs-sum of pass 1 -> m11
    pabsd           m0, m0
    pabsd           m2, m2
    pabsd           m1, m1
    pabsd           m3, m3
    pabsd           m4, m4
    pabsd           m5, m5
    pabsd           m6, m6
    pabsd           m7, m7

    paddd           m0, m2
    paddd           m1, m3
    paddd           m0, m1
    paddd           m5, m4
    paddd           m0, m5
    paddd           m7, m6
    paddd           m11, m0, m7

    ; --- pass 2: unweighted rows (m12 = pw_1) re-read from the spill area ---
    pmaddwd         m0, m12, [rsp + 0 * mmsize]
    pmaddwd         m1, m12, [rsp + 1 * mmsize]
    pmaddwd         m2, m12, [rsp + 2 * mmsize]
    pmaddwd         m3, m12, [rsp + 3 * mmsize]

    psrldq          m9, m0, 4
    psubd           m10, m0, m9
    paddd           m0, m9
    vshufps         m0, m0, m10, 10001000b
    psrldq          m9, m0, 4
    psubd           m10, m0, m9
    paddd           m0, m9
    vshufps         m0, m0, m10, 10001000b

    psrldq          m9, m1, 4
    psubd           m10, m1, m9
    paddd           m1, m9
    vshufps         m1, m1, m10, 10001000b
    psrldq          m9, m1, 4
    psubd           m10, m1, m9
    paddd           m1, m9
    vshufps         m1, m1, m10, 10001000b

    psrldq          m9, m2, 4
    psubd           m10, m2, m9
    paddd           m2, m9
    vshufps         m2, m2, m10, 10001000b
    psrldq          m9, m2, 4
    psubd           m10, m2, m9
    paddd           m2, m9
    vshufps         m2, m2, m10, 10001000b

    psrldq          m9, m3, 4
    psubd           m10, m3, m9
    paddd           m3, m9
    vshufps         m3, m3, m10, 10001000b
    psrldq          m9, m3, 4
    psubd           m10, m3, m9
    paddd           m3, m9
    vshufps         m3, m3, m10, 10001000b

    SUMSUB_BA       d, 0, 1, 9
    SUMSUB_BA       d, 2, 3, 9
    SUMSUB_BA       d, 0, 2, 9
    SUMSUB_BA       d, 1, 3, 9

    pmaddwd         m4, m12, [rsp + 4 * mmsize]
    pmaddwd         m5, m12, [rsp + 5 * mmsize]
    pmaddwd         m6, m12, [rsp + 6 * mmsize]
    pmaddwd         m7, m12, [rsp + 7 * mmsize]

    psrldq          m9, m4, 4
    psubd           m10, m4, m9
    paddd           m4, m9
    vshufps         m4, m4, m10, 10001000b
    psrldq          m9, m4, 4
    psubd           m10, m4, m9
    paddd           m4, m9
    vshufps         m4, m4, m10, 10001000b

    psrldq          m9, m5, 4
    psubd           m10, m5, m9
    paddd           m5, m9
    vshufps         m5, m5, m10, 10001000b
    psrldq          m9, m5, 4
    psubd           m10, m5, m9
    paddd           m5, m9
    vshufps         m5, m5, m10, 10001000b

    psrldq          m9, m6, 4
    psubd           m10, m6, m9
    paddd           m6, m9
    vshufps         m6, m6, m10, 10001000b
    psrldq          m9, m6, 4
    psubd           m10, m6, m9
    paddd           m6, m9
    vshufps         m6, m6, m10, 10001000b

    psrldq          m9, m7, 4
    psubd           m10, m7, m9
    paddd           m7, m9
    vshufps         m7, m7, m10, 10001000b
    psrldq          m9, m7, 4
    psubd           m10, m7, m9
    paddd           m7, m9
    vshufps         m7, m7, m10, 10001000b

    SUMSUB_BA       d, 4, 5, 9
    SUMSUB_BA       d, 6, 7, 9
    SUMSUB_BA       d, 4, 6, 9
    SUMSUB_BA       d, 5, 7, 9

    SUMSUB_BA       d, 0, 4, 9
    SUMSUB_BA       d, 1, 5, 9
    SUMSUB_BA       d, 2, 6, 9
    SUMSUB_BA       d, 3, 7, 9

    pabsd           m0, m0
    pabsd           m2, m2
    pabsd           m1, m1
    pabsd           m3, m3
    pabsd           m4, m4
    pabsd           m5, m5
    pabsd           m6, m6
    pabsd           m7, m7

    ; total abs transform sum = pass2 + pass1 (m11)
    paddd           m0, m2
    paddd           m1, m3
    paddd           m0, m1
    paddd           m5, m4
    paddd           m0, m5
    paddd           m7, m6
    paddd           m0, m7
    paddd           m0, m11

    ; reduce to one dword per lane, round ((x + 2) >> 2), subtract SAD,
    ; then psy cost = |low lane (source) - high lane (recon)|
    psrldq          m1, m0, 8
    paddd           m0, m1
    psrldq          m1, m0, 4
    paddd           m0, m1
    paddd           m0, [pd_2]
    psrld           m0, 2
    psubd           m0, m8
    vextracti128    xm1, m0, 1
    psubd           m0, m1
    pabsd           m0, m0
%endmacro
10699 | |
%if ARCH_X86_64
INIT_YMM avx2
;-----------------------------------------------------------------------------
; int psyCost_ss_8x8(const int16_t* source, intptr_t sstride,
;                    const int16_t* recon,  intptr_t rstride)
; Strides arrive in elements and are doubled to bytes below (16-bit data).
;-----------------------------------------------------------------------------
cglobal psyCost_ss_8x8, 4, 7, 14
    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
    mov             r5, rsp             ; save original rsp (restored before RET)
    sub             rsp, 8*mmsize       ; scratch area used by PSY_SS_8x8
    and             rsp, ~63

    mova            m12, [pw_1]         ; constants expected by PSY_SS_8x8
    mova            m13, [pw_pmpmpmpm]
    add             r1, r1              ; element stride -> byte stride
    add             r3, r3

    PSY_SS_8x8                          ; cost of the single 8x8 block -> m0

    movd            eax, xm0
    mov             rsp, r5             ; undo the alignment adjustment
    RET
%endif
10719 | |
%if ARCH_X86_64
INIT_YMM avx2
;-----------------------------------------------------------------------------
; int psyCost_ss_16x16(const int16_t* source, intptr_t sstride,
;                      const int16_t* recon,  intptr_t rstride)
; Sums PSY_SS_8x8 over a 2x2 grid of 8x8 blocks; accumulator in m14.
;-----------------------------------------------------------------------------
cglobal psyCost_ss_16x16, 4, 9, 15
    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
    mov             r5, rsp
    sub             rsp, 8*mmsize
    and             rsp, ~63

    mova            m12, [pw_1]
    mova            m13, [pw_pmpmpmpm]
    add             r1, r1              ; element stride -> byte stride
    add             r3, r3
    pxor            m14, m14            ; running cost

    mov             r7d, 2              ; 2 block rows
.loopH:
    mov             r8d, 2              ; 2 block columns
.loopW:
    PSY_SS_8x8

    paddd           m14, m0
    add             r0, 16              ; next 8x8 column (8 * sizeof(int16_t))
    add             r2, 16
    dec             r8d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 32]  ; down 8 rows, back to column 0
    lea             r2, [r2 + r3 * 8 - 32]
    dec             r7d
    jnz             .loopH
    movd            eax, xm14
    mov             rsp, r5             ; restore unaligned rsp
    RET
%endif
10753 | |
%if ARCH_X86_64
INIT_YMM avx2
;-----------------------------------------------------------------------------
; int psyCost_ss_32x32(const int16_t* source, intptr_t sstride,
;                      const int16_t* recon,  intptr_t rstride)
; Sums PSY_SS_8x8 over a 4x4 grid of 8x8 blocks; accumulator in m14.
;-----------------------------------------------------------------------------
cglobal psyCost_ss_32x32, 4, 9, 15
    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
    mov             r5, rsp
    sub             rsp, 8*mmsize
    and             rsp, ~63

    mova            m12, [pw_1]
    mova            m13, [pw_pmpmpmpm]
    add             r1, r1              ; element stride -> byte stride
    add             r3, r3
    pxor            m14, m14            ; running cost

    mov             r7d, 4              ; 4 block rows
.loopH:
    mov             r8d, 4              ; 4 block columns
.loopW:
    PSY_SS_8x8

    paddd           m14, m0
    add             r0, 16              ; next 8x8 column (8 * sizeof(int16_t))
    add             r2, 16
    dec             r8d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 64]  ; down 8 rows, back to column 0
    lea             r2, [r2 + r3 * 8 - 64]
    dec             r7d
    jnz             .loopH
    movd            eax, xm14
    mov             rsp, r5             ; restore unaligned rsp
    RET
%endif
10787 | |
%if ARCH_X86_64
INIT_YMM avx2
;-----------------------------------------------------------------------------
; int psyCost_ss_64x64(const int16_t* source, intptr_t sstride,
;                      const int16_t* recon,  intptr_t rstride)
; Sums PSY_SS_8x8 over an 8x8 grid of 8x8 blocks; accumulator in m14.
;-----------------------------------------------------------------------------
cglobal psyCost_ss_64x64, 4, 9, 15
    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
    mov             r5, rsp
    sub             rsp, 8*mmsize
    and             rsp, ~63

    mova            m12, [pw_1]
    mova            m13, [pw_pmpmpmpm]
    add             r1, r1              ; element stride -> byte stride
    add             r3, r3
    pxor            m14, m14            ; running cost

    mov             r7d, 8              ; 8 block rows
.loopH:
    mov             r8d, 8              ; 8 block columns
.loopW:
    PSY_SS_8x8

    paddd           m14, m0
    add             r0, 16              ; next 8x8 column (8 * sizeof(int16_t))
    add             r2, 16
    dec             r8d
    jnz             .loopW
    lea             r0, [r0 + r1 * 8 - 128] ; down 8 rows, back to column 0
    lea             r2, [r2 + r3 * 8 - 128]
    dec             r7d
    jnz             .loopH
    movd            eax, xm14
    mov             rsp, r5             ; restore unaligned rsp
    RET
%endif
10821 | |
10822 ;;--------------------------------------------------------------- | |
10823 ;; SATD AVX2 | |
10824 ;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t) | |
10825 ;;--------------------------------------------------------------- | |
10826 ;; r0 - pix0 | |
10827 ;; r1 - pix0Stride | |
10828 ;; r2 - pix1 | |
10829 ;; r3 - pix1Stride | |
10830 | |
%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
INIT_YMM avx2
;-----------------------------------------------------------------------------
; calc_satd_16x8 — internal helper, not C-callable (custom register contract).
; SATD partial sums for 16 columns x 8 rows of 8-bit pixels.
; In:   r0/r1 = pix0/stride0, r2/r3 = pix1/stride1, r4 = 3*r1, r5 = 3*r3,
;       m7 = [hmul_16p] (+1/-1 byte pattern used with pmaddubsw)
; Out:  dword partials accumulated into m8 and m9;
;       r0/r2 advanced by 8 rows.  Clobbers m0-m6.
;-----------------------------------------------------------------------------
cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
    pxor            m6, m6              ; word accumulator for this call
    ; rows 0-3: broadcast 16 pixels to both lanes; pmaddubsw against the
    ; +/- pattern performs the first horizontal transform stage while
    ; widening bytes to words; psubw forms the pix0-pix1 difference
    vbroadcasti128  m0, [r0]
    vbroadcasti128  m4, [r2]
    vbroadcasti128  m1, [r0 + r1]
    vbroadcasti128  m5, [r2 + r3]
    pmaddubsw       m4, m7
    pmaddubsw       m0, m7
    pmaddubsw       m5, m7
    pmaddubsw       m1, m7
    psubw           m0, m4
    psubw           m1, m5
    vbroadcasti128  m2, [r0 + r1 * 2]
    vbroadcasti128  m4, [r2 + r3 * 2]
    vbroadcasti128  m3, [r0 + r4]
    vbroadcasti128  m5, [r2 + r5]
    pmaddubsw       m4, m7
    pmaddubsw       m2, m7
    pmaddubsw       m5, m7
    pmaddubsw       m3, m7
    psubw           m2, m4
    psubw           m3, m5
    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]
    ; vertical butterflies across the 4 rows
    paddw           m4, m0, m1
    psubw           m1, m1, m0
    paddw           m0, m2, m3
    psubw           m3, m2
    paddw           m2, m4, m0
    psubw           m0, m4
    paddw           m4, m1, m3
    psubw           m3, m1
    pabsw           m2, m2
    pabsw           m0, m0
    pabsw           m4, m4
    pabsw           m3, m3
    ; blend/shift trick: combine adjacent words of the two halves and keep
    ; the per-pair max, completing the horizontal transform cheaply
    pblendw         m1, m2, m0, 10101010b
    pslld           m0, 16
    psrld           m2, 16
    por             m0, m2
    pmaxsw          m1, m0
    paddw           m6, m1
    pblendw         m2, m4, m3, 10101010b
    pslld           m3, 16
    psrld           m4, 16
    por             m3, m4
    pmaxsw          m2, m3
    paddw           m6, m2
    ; rows 4-7: identical processing
    vbroadcasti128  m1, [r0]
    vbroadcasti128  m4, [r2]
    vbroadcasti128  m2, [r0 + r1]
    vbroadcasti128  m5, [r2 + r3]
    pmaddubsw       m4, m7
    pmaddubsw       m1, m7
    pmaddubsw       m5, m7
    pmaddubsw       m2, m7
    psubw           m1, m4
    psubw           m2, m5
    vbroadcasti128  m0, [r0 + r1 * 2]
    vbroadcasti128  m4, [r2 + r3 * 2]
    vbroadcasti128  m3, [r0 + r4]
    vbroadcasti128  m5, [r2 + r5]
    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]
    pmaddubsw       m4, m7
    pmaddubsw       m0, m7
    pmaddubsw       m5, m7
    pmaddubsw       m3, m7
    psubw           m0, m4
    psubw           m3, m5
    paddw           m4, m1, m2
    psubw           m2, m1
    paddw           m1, m0, m3
    psubw           m3, m0
    paddw           m0, m4, m1
    psubw           m1, m4
    paddw           m4, m2, m3
    psubw           m3, m2
    pabsw           m0, m0
    pabsw           m1, m1
    pabsw           m4, m4
    pabsw           m3, m3
    pblendw         m2, m0, m1, 10101010b
    pslld           m1, 16
    psrld           m0, 16
    por             m1, m0
    pmaxsw          m2, m1
    paddw           m6, m2
    pblendw         m0, m4, m3, 10101010b
    pslld           m3, 16
    psrld           m4, 16
    por             m3, m4
    pmaxsw          m0, m3
    paddw           m6, m0
    ; widen the word sums to dwords (prevents overflow across many calls)
    ; and fold into the caller's persistent accumulators m8/m9
    vextracti128    xm0, m6, 1
    pmovzxwd        m6, xm6
    pmovzxwd        m0, xm0
    paddd           m8, m6
    paddd           m9, m0
    ret

;-----------------------------------------------------------------------------
; calc_satd_16x4 — internal helper, not C-callable (custom register contract).
; SATD partial sums for 16 columns x 4 rows of 8-bit pixels; same contract
; as calc_satd_16x8 (m7 = hmul_16p pattern, r4/r5 = 3*stride) but processes
; a single 4-row group and does NOT advance r0/r2 past the loaded rows.
; Out: dword partials accumulated into m8/m9.  Clobbers m0-m6.
;-----------------------------------------------------------------------------
cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows
    pxor            m6, m6              ; word accumulator for this call
    ; load + horizontal stage via pmaddubsw, then difference pix0 - pix1
    vbroadcasti128  m0, [r0]
    vbroadcasti128  m4, [r2]
    vbroadcasti128  m1, [r0 + r1]
    vbroadcasti128  m5, [r2 + r3]
    pmaddubsw       m4, m7
    pmaddubsw       m0, m7
    pmaddubsw       m5, m7
    pmaddubsw       m1, m7
    psubw           m0, m4
    psubw           m1, m5
    vbroadcasti128  m2, [r0 + r1 * 2]
    vbroadcasti128  m4, [r2 + r3 * 2]
    vbroadcasti128  m3, [r0 + r4]
    vbroadcasti128  m5, [r2 + r5]
    pmaddubsw       m4, m7
    pmaddubsw       m2, m7
    pmaddubsw       m5, m7
    pmaddubsw       m3, m7
    psubw           m2, m4
    psubw           m3, m5
    ; vertical butterflies across the 4 rows
    paddw           m4, m0, m1
    psubw           m1, m1, m0
    paddw           m0, m2, m3
    psubw           m3, m2
    paddw           m2, m4, m0
    psubw           m0, m4
    paddw           m4, m1, m3
    psubw           m3, m1
    pabsw           m2, m2
    pabsw           m0, m0
    pabsw           m4, m4
    pabsw           m3, m3
    ; blend/shift + max completes the horizontal transform
    pblendw         m1, m2, m0, 10101010b
    pslld           m0, 16
    psrld           m2, 16
    por             m0, m2
    pmaxsw          m1, m0
    paddw           m6, m1
    pblendw         m2, m4, m3, 10101010b
    pslld           m3, 16
    psrld           m4, 16
    por             m3, m4
    pmaxsw          m2, m3
    paddw           m6, m2
    ; widen to dwords and fold into the persistent accumulators m8/m9
    vextracti128    xm0, m6, 1
    pmovzxwd        m6, xm6
    pmovzxwd        m0, xm0
    paddd           m8, m6
    paddd           m9, m0
    ret

;-----------------------------------------------------------------------------
; int pixel_satd_16x4(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: one calc_satd_16x4 tile, then horizontal dword reduction.
;-----------------------------------------------------------------------------
cglobal pixel_satd_16x4, 4,6,10         ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9

    call            calc_satd_16x4

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_16x12(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: one 16x8 tile plus one 16x4 tile (helpers advance r0/r2).
;-----------------------------------------------------------------------------
cglobal pixel_satd_16x12, 4,6,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helpers
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9

    call            calc_satd_16x8
    call            calc_satd_16x4

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_16x32(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: four stacked 16x8 tiles (the helper advances r0/r2 each call).
;-----------------------------------------------------------------------------
cglobal pixel_satd_16x32, 4,6,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9

    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_16x64(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: eight stacked 16x8 tiles.
;-----------------------------------------------------------------------------
cglobal pixel_satd_16x64, 4,6,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9

    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_32x8(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: two 16-wide columns; r6/r7 keep the original base pointers so
; each column can restart from the top.
;-----------------------------------------------------------------------------
cglobal pixel_satd_32x8, 4,8,10         ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0

    lea             r0, [r6 + 16]       ; column 1 (16 pixels right)
    lea             r2, [r7 + 16]

    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_32x16(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: two 16-wide columns x two 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_32x16, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8

    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]

    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_32x24(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: two 16-wide columns x three 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_32x24, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    call            calc_satd_16x8

    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]

    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_32x32(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: two 16-wide columns x four 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_32x32, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]

    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_32x64(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: two 16-wide columns x eight 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_32x64, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]

    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_48x64(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: three 16-wide columns x eight 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_48x64, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 32]       ; column 2
    lea             r2, [r7 + 32]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_64x16(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: four 16-wide columns x two 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x16, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 32]       ; column 2
    lea             r2, [r7 + 32]
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 48]       ; column 3
    lea             r2, [r7 + 48]
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_64x32(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: four 16-wide columns x four 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x32, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 32]       ; column 2
    lea             r2, [r7 + 32]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 48]       ; column 3
    lea             r2, [r7 + 48]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_64x48(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: four 16-wide columns x six 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x48, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 32]       ; column 2
    lea             r2, [r7 + 32]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 48]       ; column 3
    lea             r2, [r7 + 48]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-----------------------------------------------------------------------------
; int pixel_satd_64x64(const pixel*, intptr_t, const pixel*, intptr_t)
; 8-bit AVX2: four 16-wide columns x eight 16x8 tiles each.
;-----------------------------------------------------------------------------
cglobal pixel_satd_64x64, 4,8,10        ; if WIN64 && cpuflag(avx2)
    mova            m7, [hmul_16p]      ; +/- byte pattern for the helper
    lea             r4, [3 * r1]
    lea             r5, [3 * r3]
    pxor            m8, m8              ; persistent dword accumulators
    pxor            m9, m9
    mov             r6, r0              ; remember block origin
    mov             r7, r2

    call            calc_satd_16x8      ; column 0
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 16]       ; column 1
    lea             r2, [r7 + 16]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 32]       ; column 2
    lea             r2, [r7 + 32]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    lea             r0, [r6 + 48]       ; column 3
    lea             r2, [r7 + 48]
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8
    call            calc_satd_16x8

    ; reduce the 8 dword partials in m8/m9 to a scalar in eax
    paddd           m8, m9
    vextracti128    xm0, m8, 1
    paddd           xm0, xm8
    movhlps         xm1, xm0
    paddd           xm0, xm1
    pshuflw         xm1, xm0, q0032
    paddd           xm0, xm1
    movd            eax, xm0
    RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0

%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
INIT_YMM avx2
;-----------------------------------------------------------------------------
; calc_satd_16x8 (high bit depth) — internal helper, not C-callable.
; SATD partial sums for 16 columns x 8 rows of 16-bit pixels.
; In:   r0/r1 = pix0/stride0 (bytes), r2/r3 = pix1/stride1,
;       r4 = 3*r1, r5 = 3*r3  (callers double the strides first)
; Out:  dword sums accumulated into m6; r0/r2 advanced by 8 rows.
; Clobbers: m0-m5, m7.
; Unlike the 8-bit version, the horizontal transform is done with
; punpck interleaves (word data needs no pmaddubsw widening trick).
;-----------------------------------------------------------------------------
cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
    ; rows 0-3
    movu            m0, [r0]
    movu            m4, [r2]
    psubw           m0, m4              ; residual row 0
    movu            m1, [r0 + r1]
    movu            m5, [r2 + r3]
    psubw           m1, m5
    movu            m2, [r0 + r1 * 2]
    movu            m4, [r2 + r3 * 2]
    psubw           m2, m4
    movu            m3, [r0 + r4]
    movu            m5, [r2 + r5]
    psubw           m3, m5
    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]
    ; vertical butterflies, then interleave stages to transpose/transform
    paddw           m4, m0, m1
    psubw           m1, m0
    paddw           m0, m2, m3
    psubw           m3, m2
    punpckhwd       m2, m4, m1
    punpcklwd       m4, m1
    punpckhwd       m1, m0, m3
    punpcklwd       m0, m3
    paddw           m3, m4, m0
    psubw           m0, m4
    paddw           m4, m2, m1
    psubw           m1, m2
    punpckhdq       m2, m3, m0
    punpckldq       m3, m0
    paddw           m0, m3, m2
    psubw           m2, m3
    punpckhdq       m3, m4, m1
    punpckldq       m4, m1
    paddw           m1, m4, m3
    psubw           m3, m4
    punpckhqdq      m4, m0, m1
    punpcklqdq      m0, m1
    pabsw           m0, m0
    pabsw           m4, m4
    pmaxsw          m0, m0, m4
    punpckhqdq      m1, m2, m3
    punpcklqdq      m2, m3
    pabsw           m2, m2
    pabsw           m1, m1
    pmaxsw          m2, m1
    ; widen the word maxima to dwords and accumulate into m6
    pxor            m7, m7
    mova            m1, m0
    punpcklwd       m1, m7
    paddd           m6, m1
    mova            m1, m0
    punpckhwd       m1, m7
    paddd           m6, m1
    pxor            m7, m7
    mova            m1, m2
    punpcklwd       m1, m7
    paddd           m6, m1
    mova            m1, m2
    punpckhwd       m1, m7
    paddd           m6, m1
    ; rows 4-7: identical processing
    movu            m0, [r0]
    movu            m4, [r2]
    psubw           m0, m4
    movu            m1, [r0 + r1]
    movu            m5, [r2 + r3]
    psubw           m1, m5
    movu            m2, [r0 + r1 * 2]
    movu            m4, [r2 + r3 * 2]
    psubw           m2, m4
    movu            m3, [r0 + r4]
    movu            m5, [r2 + r5]
    psubw           m3, m5
    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]
    paddw           m4, m0, m1
    psubw           m1, m0
    paddw           m0, m2, m3
    psubw           m3, m2
    punpckhwd       m2, m4, m1
    punpcklwd       m4, m1
    punpckhwd       m1, m0, m3
    punpcklwd       m0, m3
    paddw           m3, m4, m0
    psubw           m0, m4
    paddw           m4, m2, m1
    psubw           m1, m2
    punpckhdq       m2, m3, m0
    punpckldq       m3, m0
    paddw           m0, m3, m2
    psubw           m2, m3
    punpckhdq       m3, m4, m1
    punpckldq       m4, m1
    paddw           m1, m4, m3
    psubw           m3, m4
    punpckhqdq      m4, m0, m1
    punpcklqdq      m0, m1
    pabsw           m0, m0
    pabsw           m4, m4
    pmaxsw          m0, m0, m4
    punpckhqdq      m1, m2, m3
    punpcklqdq      m2, m3
    pabsw           m2, m2
    pabsw           m1, m1
    pmaxsw          m2, m1
    pxor            m7, m7
    mova            m1, m0
    punpcklwd       m1, m7
    paddd           m6, m1
    mova            m1, m0
    punpckhwd       m1, m7
    paddd           m6, m1
    pxor            m7, m7
    mova            m1, m2
    punpcklwd       m1, m7
    paddd           m6, m1
    mova            m1, m2
    punpckhwd       m1, m7
    paddd           m6, m1
    ret

;-----------------------------------------------------------------------------
; calc_satd_16x4 (high bit depth) — internal helper, not C-callable.
; SATD partial sums for 16 columns x 4 rows of 16-bit pixels; same register
; contract as the HBD calc_satd_16x8, but processes one 4-row group and
; advances r0/r2 by 4 rows.  Dword sums accumulate in m6; clobbers m0-m5, m7.
;-----------------------------------------------------------------------------
cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows
    ; rows 0-3
    movu            m0, [r0]
    movu            m4, [r2]
    psubw           m0, m4              ; residual row 0
    movu            m1, [r0 + r1]
    movu            m5, [r2 + r3]
    psubw           m1, m5
    movu            m2, [r0 + r1 * 2]
    movu            m4, [r2 + r3 * 2]
    psubw           m2, m4
    movu            m3, [r0 + r4]
    movu            m5, [r2 + r5]
    psubw           m3, m5
    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]
    ; vertical butterflies, then interleave stages to transpose/transform
    paddw           m4, m0, m1
    psubw           m1, m0
    paddw           m0, m2, m3
    psubw           m3, m2
    punpckhwd       m2, m4, m1
    punpcklwd       m4, m1
    punpckhwd       m1, m0, m3
    punpcklwd       m0, m3
    paddw           m3, m4, m0
    psubw           m0, m4
    paddw           m4, m2, m1
    psubw           m1, m2
    punpckhdq       m2, m3, m0
    punpckldq       m3, m0
    paddw           m0, m3, m2
    psubw           m2, m3
    punpckhdq       m3, m4, m1
    punpckldq       m4, m1
    paddw           m1, m4, m3
    psubw           m3, m4
    punpckhqdq      m4, m0, m1
    punpcklqdq      m0, m1
    pabsw           m0, m0
    pabsw           m4, m4
    pmaxsw          m0, m0, m4
    punpckhqdq      m1, m2, m3
    punpcklqdq      m2, m3
    pabsw           m2, m2
    pabsw           m1, m1
    pmaxsw          m2, m1
    ; widen the word maxima to dwords and accumulate into m6
    pxor            m7, m7
    mova            m1, m0
    punpcklwd       m1, m7
    paddd           m6, m1
    mova            m1, m0
    punpckhwd       m1, m7
    paddd           m6, m1
    pxor            m7, m7
    mova            m1, m2
    punpcklwd       m1, m7
    paddd           m6, m1
    mova            m1, m2
    punpckhwd       m1, m7
    paddd           m6, m1
    ret

cglobal pixel_satd_16x4, 4,6,8
    ; int pixel_satd_16x4(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Incoming strides are in pixels; double them to byte units.
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator

    call    calc_satd_16x4          ; single 16x4 slice

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11666 | |
cglobal pixel_satd_16x8, 4,6,8
    ; int pixel_satd_16x8(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Incoming strides are in pixels; double them to byte units.
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator

    call    calc_satd_16x8          ; single 16x8 slice

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11685 | |
cglobal pixel_satd_16x12, 4,6,8
    ; int pixel_satd_16x12(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Incoming strides are in pixels; double them to byte units.
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator

    ; 12 rows = one 16x8 slice followed by one 16x4 slice
    ; (each helper advances r0/r2 past the rows it consumed)
    call    calc_satd_16x8
    call    calc_satd_16x4

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11705 | |
cglobal pixel_satd_16x16, 4,6,8
    ; int pixel_satd_16x16(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Incoming strides are in pixels; double them to byte units.
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator

    ; 16 rows = 2 passes of the 16x8 helper (it advances r0/r2 itself)
%rep 2
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11725 | |
cglobal pixel_satd_16x32, 4,6,8
    ; int pixel_satd_16x32(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Incoming strides are in pixels; double them to byte units.
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator

    ; 32 rows = 4 passes of the 16x8 helper (it advances r0/r2 itself)
%rep 4
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11747 | |
cglobal pixel_satd_16x64, 4,6,8
    ; int pixel_satd_16x64(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Incoming strides are in pixels; double them to byte units.
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator

    ; 64 rows = 8 passes of the 16x8 helper (it advances r0/r2 itself)
%rep 8
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11773 | |
cglobal pixel_satd_32x8, 4,8,8
    ; int pixel_satd_32x8(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as two 16-pixel-wide columns of 8 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    call    calc_satd_16x8          ; left column

    lea     r0, [r6 + 32]           ; +16 pixels (32 bytes): right column
    lea     r2, [r7 + 32]

    call    calc_satd_16x8

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11799 | |
cglobal pixel_satd_32x16, 4,8,8
    ; int pixel_satd_32x16(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as two 16-pixel-wide columns of 16 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; left column
%rep 2
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; +16 pixels (32 bytes): right column
    lea     r2, [r7 + 32]

%rep 2
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11827 | |
cglobal pixel_satd_32x24, 4,8,8
    ; int pixel_satd_32x24(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as two 16-pixel-wide columns of 24 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; left column
%rep 3
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; +16 pixels (32 bytes): right column
    lea     r2, [r7 + 32]

%rep 3
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11857 | |
cglobal pixel_satd_32x32, 4,8,8
    ; int pixel_satd_32x32(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as two 16-pixel-wide columns of 32 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; left column
%rep 4
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; +16 pixels (32 bytes): right column
    lea     r2, [r7 + 32]

%rep 4
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11889 | |
cglobal pixel_satd_32x64, 4,8,8
    ; int pixel_satd_32x64(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as two 16-pixel-wide columns of 64 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; left column
%rep 8
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; +16 pixels (32 bytes): right column
    lea     r2, [r7 + 32]

%rep 8
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11929 | |
cglobal pixel_satd_48x64, 4,8,8
    ; int pixel_satd_48x64(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as three 16-pixel-wide columns of 64 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; column 0
%rep 8
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; column 1: +16 pixels (32 bytes)
    lea     r2, [r7 + 32]

%rep 8
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 64]           ; column 2: +32 pixels (64 bytes)
    lea     r2, [r7 + 64]

%rep 8
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
11981 | |
cglobal pixel_satd_64x16, 4,8,8
    ; int pixel_satd_64x16(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as four 16-pixel-wide columns of 16 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; column 0
%rep 2
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; column 1: +16 pixels (32 bytes)
    lea     r2, [r7 + 32]

%rep 2
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 64]           ; column 2: +32 pixels (64 bytes)
    lea     r2, [r7 + 64]

%rep 2
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 96]           ; column 3: +48 pixels (96 bytes)
    lea     r2, [r7 + 96]

%rep 2
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
12021 | |
cglobal pixel_satd_64x32, 4,8,8
    ; int pixel_satd_64x32(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as four 16-pixel-wide columns of 32 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; column 0
%rep 4
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; column 1: +16 pixels (32 bytes)
    lea     r2, [r7 + 32]

%rep 4
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 64]           ; column 2: +32 pixels (64 bytes)
    lea     r2, [r7 + 64]

%rep 4
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 96]           ; column 3: +48 pixels (96 bytes)
    lea     r2, [r7 + 96]

%rep 4
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
12069 | |
cglobal pixel_satd_64x48, 4,8,8
    ; int pixel_satd_64x48(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as four 16-pixel-wide columns of 48 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; column 0
%rep 6
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; column 1: +16 pixels (32 bytes)
    lea     r2, [r7 + 32]

%rep 6
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 64]           ; column 2: +32 pixels (64 bytes)
    lea     r2, [r7 + 64]

%rep 6
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 96]           ; column 3: +48 pixels (96 bytes)
    lea     r2, [r7 + 96]

%rep 6
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
12125 | |
cglobal pixel_satd_64x64, 4,8,8
    ; int pixel_satd_64x64(pix1, stride1, pix2, stride2), 16-bit pixels.
    ; Processed as four 16-pixel-wide columns of 64 rows each.
    add     r3d, r3d                ; strides: pixels -> bytes
    add     r1d, r1d
    lea     r5, [3 * r3]            ; byte offset of row 3 in pix2
    lea     r4, [3 * r1]            ; byte offset of row 3 in pix1
    pxor    m6, m6                  ; m6 = dword SATD accumulator
    mov     r6, r0                  ; remember row-0 base of pix1
    mov     r7, r2                  ; remember row-0 base of pix2

    ; column 0
%rep 8
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 32]           ; column 1: +16 pixels (32 bytes)
    lea     r2, [r7 + 32]

%rep 8
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 64]           ; column 2: +32 pixels (64 bytes)
    lea     r2, [r7 + 64]

%rep 8
    call    calc_satd_16x8
%endrep

    lea     r0, [r6 + 96]           ; column 3: +48 pixels (96 bytes)
    lea     r2, [r7 + 96]

%rep 8
    call    calc_satd_16x8
%endrep

    ; reduce m6's eight dword partials to eax
    vextracti128 xm7, m6, 1
    paddd   xm6, xm7
    pxor    xm7, xm7
    movhlps xm7, xm6
    paddd   xm6, xm7
    pshufd  xm7, xm6, 1
    paddd   xm6, xm7
    movd    eax, xm6
    RET
12189 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1 | |
12190 | |
12191 | |
12192 ;------------------------------------------------------------------------------------------------------------------------------------- | |
12193 ; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix) | |
12194 ;------------------------------------------------------------------------------------------------------------------------------------- | |
12195 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 | |
12196 INIT_YMM avx2 | |
cglobal planeClipAndMax, 5,7,8
    ; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height,
    ;                       uint64_t *outsum, const pixel minPix, const pixel maxPix)
    ; Clamps every pixel of the plane into [minPix, maxPix] in place, stores the
    ; sum of the (clamped) pixels through outsum, and returns the maximum clamped
    ; pixel value. 8-bit pixels only (this path is guarded by HIGH_BIT_DEPTH == 0).
    ; NOTE(review): the main loop's first iteration reads at offset
    ; (width & ~31) - 32, so this appears to assume width >= mmsize (32) --
    ; TODO confirm callers guarantee that.
    movd            xm0, r5m
    vpbroadcastb    m0, xm0                 ; m0 = [min] splatted to all 32 bytes
    vpbroadcastb    m1, r6m                 ; m1 = [max] splatted to all 32 bytes
    pxor            m2, m2                  ; m2 = sumLuma (qword lanes, via psadbw)
    pxor            m3, m3                  ; m3 = maxLumaLevel (bytewise running max)
    pxor            m4, m4                  ; m4 = zero (psadbw reference operand)

    ; get mask to partial register pixels
    ; r2d = width rounded down to a multiple of mmsize; r5d = leftover columns
    mov             r5d, r2d
    and             r2d, ~(mmsize - 1)
    sub             r5d, r2d
    ; index pb_movemask_32 so the loaded vector flags the trailing columns
    ; beyond 'width' (presumably 0x00 = valid lane, 0xFF = invalid lane --
    ; verify against the table's definition)
    lea             r6, [pb_movemask_32 + mmsize]
    sub             r6, r5
    movu            m5, [r6]                ; m5 = mask for last couple column

.loopH:
    ; walk the aligned part of the row from the last full vector down to 0
    lea             r5d, [r2 - mmsize]

.loopW:
    movu            m6, [r0 + r5]
    pmaxub          m6, m0                  ; clamp low:  max(pix, min)
    pminub          m6, m1                  ; clamp high: min(pix, max)
    movu            [r0 + r5], m6           ; store back
    pmaxub          m3, m6                  ; update maxLumaLevel
    psadbw          m6, m4                  ; sum of 32 bytes -> two qword lanes
    paddq           m2, m6

    sub             r5d, mmsize
    jge             .loopW                  ; signed: stops once r5d goes negative

    ; partial pixels: one vector straddling the end of the row
    movu            m7, [r0 + r2]
    pmaxub          m6, m7, m0
    pminub          m6, m1

    pand            m7, m5                  ; get invalid/unchange pixel
    pandn           m6, m5, m6              ; clear invalid pixels (zeros are
                                            ; neutral for unsigned max and sum)
    por             m7, m6                  ; combin valid & invalid pixels
    movu            [r0 + r2], m7           ; store back
    pmaxub          m3, m6                  ; update maxLumaLevel
    psadbw          m6, m4
    paddq           m2, m6

.next:
    add             r0, r1                  ; next row
    dec             r3d
    jg              .loopH

    ; sumLuma: fold the four qword partials of m2 and store through outsum
    vextracti128    xm0, m2, 1
    paddq           xm0, xm2
    movhlps         xm1, xm0
    paddq           xm0, xm1
    movq            [r4], xm0

    ; maxLumaLevel: reduce 32 byte maxima to 8, widen to words, then find the
    ; max via phminposuw (an unsigned horizontal *min*) on the complemented
    ; values; the final 'not al' undoes the complement
    vextracti128    xm0, m3, 1
    pmaxub          xm0, xm3
    movhlps         xm3, xm0
    pmaxub          xm0, xm3
    pmovzxbw        xm0, xm0                ; low 8 byte-maxima -> 8 words
    pxor            xm0, [pb_movemask + 16] ; complement (presumably all-ones words
                                            ; at that offset -- verify table)
    phminposuw      xm0, xm0                ; low word = min(complemented) = ~max

    movd            eax, xm0
    not             al                      ; recover the maximum byte value
    movzx           eax, al
    RET
12266 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 |