Mercurial > hg > forks > libbpg
comparison x265/source/common/x86/intrapred16.asm @ 0:772086c29cc7
Initial import.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 16 Nov 2016 11:16:33 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:772086c29cc7 |
---|---|
1 ;***************************************************************************** | |
2 ;* Copyright (C) 2013 x265 project | |
3 ;* | |
4 ;* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com> | |
5 ;* Yuvaraj Venkatesh <yuvaraj@multicorewareinc.com> | |
6 ;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> | |
7 ;* | |
8 ;* This program is free software; you can redistribute it and/or modify | |
9 ;* it under the terms of the GNU General Public License as published by | |
10 ;* the Free Software Foundation; either version 2 of the License, or | |
11 ;* (at your option) any later version. | |
12 ;* | |
13 ;* This program is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 ;* GNU General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU General Public License | |
19 ;* along with this program; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
21 ;* | |
22 ;* This program is also available under a commercial proprietary license. | |
23 ;* For more information, contact us at license @ x265.com. | |
24 ;*****************************************************************************/ | |
25 | |
26 %include "x86inc.asm" | |
27 %include "x86util.asm" | |
28 | |
SECTION_RODATA 32

; ---------------------------------------------------------------------------
; Interpolation weight pairs.
; Entry x (x = 0..31) is the word pair (32 - x, x) repeated four times, so
; every entry is 16 bytes and entry k lives at ang_table + k * 16.
; ---------------------------------------------------------------------------
const ang_table
%assign x 0
%rep 32
    times 4 dw (32-x), x
%assign x x+1
%endrep

; Same weight pairs, repeated eight times per entry (32 bytes per entry).
const ang_table_avx2
%assign x 0
%rep 32
    times 8 dw (32-x), x
%assign x x+1
%endrep

; ---------------------------------------------------------------------------
; 16-byte tables of byte indices. The code that consumes them is outside this
; part of the file; presumably they are byte-shuffle control masks selecting
; 16-bit words (TODO confirm against the angular predictors that use them).
; ---------------------------------------------------------------------------
const pw_ang16_12_24,   db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 14, 15,  0,  1,  0,  1
const pw_ang16_13_23,   db  2,  3,  2,  3, 14, 15, 14, 15,  6,  7,  6,  7,  0,  1,  0,  1
const pw_ang16_14_22,   db  2,  3,  2,  3, 10, 11, 10, 11,  6,  7,  6,  7,  0,  1,  0,  1
const pw_ang16_15_21,   db 12, 13, 12, 13,  8,  9,  8,  9,  4,  5,  4,  5,  0,  1,  0,  1
const pw_ang16_16_20,   db  8,  9,  8,  9,  6,  7,  6,  7,  2,  3,  2,  3,  0,  1,  0,  1

const pw_ang32_12_24,   db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
const pw_ang32_13_23,   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  6,  7,  0,  1
const pw_ang32_14_22,   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 11,  6,  7,  0,  1
const pw_ang32_15_21,   db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
const pw_ang32_16_20,   db  0,  0,  0,  0,  0,  0,  0,  0,  8,  9,  6,  7,  2,  3,  0,  1
const pw_ang32_17_19_0, db  0,  0,  0,  0, 12, 13, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1

const shuf_mode_13_23,  db  0,  0, 14, 15,  6,  7,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_14_22,  db 14, 15, 10, 11,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_15_21,  db 12, 13,  8,  9,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_16_20,  db  2,  3,  0,  1, 14, 15, 12, 13,  8,  9,  6,  7,  2,  3,  0,  1
const shuf_mode_17_19,  db  0,  1, 14, 15, 12, 13, 10, 11,  6,  7,  4,  5,  2,  3,  0,  1
const shuf_mode32_18,   db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
const pw_punpcklwd,     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
const c_mode32_10_0,    db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1

const pw_ang8_12,       db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  0,  1
const pw_ang8_13,       db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  8,  9,  0,  1
const pw_ang8_14,       db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 10, 11,  4,  5,  0,  1
const pw_ang8_15,       db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
const pw_ang8_16,       db  0,  0,  0,  0,  0,  0, 12, 13, 10, 11,  6,  7,  4,  5,  0,  1
const pw_ang8_17,       db  0,  0, 14, 15, 12, 13, 10, 11,  8,  9,  4,  5,  2,  3,  0,  1
; 32 bytes: the 16-byte pattern below emitted twice.
const pw_swap16,        times 2 db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1

const pw_ang16_13,      db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
const pw_ang16_16,      db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1

; File-local (not exported through `const`) byte-index tables.
intra_filter4_shuf0:    db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
intra_filter4_shuf1:    db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
; 32 bytes: the 16-byte pattern below emitted twice.
intra_filter4_shuf2:    times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

; (blkSize - 1 - x) for blkSize = 4, stored twice
pw_planar4_0:           dw 3, 2, 1, 0, 3, 2, 1, 0

; 32 dwords counting down from 31 to 0.
const planar32_table
%assign x 31
%rep 8
    dd x, x-1, x-2, x-3
%assign x x-4
%endrep

; 32 dwords counting up from 1 to 32.
const planar32_table1
%assign x 1
%rep 8
    dd x, x+1, x+2, x+3
%assign x x+4
%endrep
98 | |
99 SECTION .text | |
100 | |
101 cextern pb_01 | |
102 cextern pw_1 | |
103 cextern pw_2 | |
104 cextern pw_3 | |
105 cextern pw_7 | |
106 cextern pw_4 | |
107 cextern pw_8 | |
108 cextern pw_15 | |
109 cextern pw_16 | |
110 cextern pw_31 | |
111 cextern pw_32 | |
112 cextern pd_16 | |
113 cextern pd_31 | |
114 cextern pd_32 | |
115 cextern pw_4096 | |
116 cextern pw_pixel_max | |
117 cextern multiL | |
118 cextern multiH | |
119 cextern multiH2 | |
120 cextern multiH3 | |
121 cextern multi_2Row | |
122 cextern pw_swap | |
123 cextern pb_unpackwq1 | |
124 cextern pb_unpackwq2 | |
125 cextern pw_planar16_mul | |
126 cextern pw_planar32_mul | |
127 | |
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;
; 4x4 DC prediction, 16-bit pixels (dstStride is in pixels, hence the "* 2"
; in every row address).
;   r0 = dst, r1 = dstStride, r2 = srcPix, r4d = filter flag (0 = no edge filter)
;   [r2 + 2]  : four neighbours used by the "top" filter (row 0)
;   [r2 + 18] : four neighbours used by the "left" filter (column 0)
; Scratch: r3, r4, r5, m0, m1.
;-----------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc4, 5,6,2
    movh        m0, [r2 + 18]           ; 4 left neighbours
    movh        m1, [r2 + 2]            ; 4 top neighbours

    ; horizontal add: afterwards each of the low four words holds the full sum
    paddw       m0, m1
    pshuflw     m1, m0, 0x4E
    paddw       m0, m1
    pshuflw     m1, m0, 0xB1
    paddw       m0, m1

    paddw       m0, [pw_4]
    psrlw       m0, 3                   ; dc = (sum + 4) >> 3

    ; unfiltered 4x4 fill
    lea         r5, [r0 + r1 * 4]       ; r5 -> row 2
    movh        [r0], m0
    movh        [r0 + r1 * 2], m0
    movh        [r5], m0
    movh        [r5 + r1 * 2], m0

    ; edge filter requested?
    test        r4d, r4d
    jz          .done

    movh        m1, m0
    psllw       m1, 1
    paddw       m1, [pw_2]
    movd        r3d, m1                 ; low word of r3d = 2 * dc + 2
    paddw       m0, m1                  ; m0 = 3 * dc + 2

    ; row 0: (top[x] + 3 * dc + 2) >> 2
    movh        m1, [r2 + 2]
    paddw       m1, m0
    psrlw       m1, 2
    movh        [r0], m1                ; dst[0] is rewritten just below

    ; corner: (left[0] + top[0] + 2 * dc + 2) >> 2
    movzx       r3d, r3w
    movzx       r4d, word [r2 + 18]
    add         r3d, r4d
    movzx       r4d, word [r2 + 2]
    add         r4d, r3d
    shr         r4d, 2
    mov         [r0], r4w

    ; column 0, rows 1..3: (left[y] + 3 * dc + 2) >> 2
    movu        m1, [r2 + 20]
    paddw       m1, m0
    psrlw       m1, 2
    movd        r3d, m1
    mov         [r0 + r1 * 2], r3w
    shr         r3d, 16
    mov         [r5], r3w
    pextrw      r3d, m1, 2
    mov         [r5 + r1 * 2], r3w
.done:
    RET
188 | |
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;
; 8x8 DC prediction, 16-bit pixels (dstStride is in pixels).
;   r0 = dst, r1 = dstStride, r2 = srcPix, r4d = filter flag (0 = no edge filter)
;   [r2 + 2]  : eight neighbours used by the "top" filter (row 0)
;   [r2 + 34] : eight neighbours used by the "left" filter (column 0)
; Row byte offsets: r6 = row 3, r5 = row 5, r7 = row 7.
; Uses 64-bit GPR moves (movh r3, m0), so this is x86-64 only.
;-----------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc8, 5, 8, 2
    movu        m0, [r2 + 34]
    movu        m1, [r2 + 2]

    ; reduce the 16 neighbours to a single sum in the low dword
    paddw       m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    pshufd      m1, m0, 1
    paddw       m0, m1
    pmaddwd     m0, [pw_1]

    paddw       m0, [pw_8]
    psrlw       m0, 4                   ; dc = (sum + 8) >> 4
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0               ; m0 = dc in every word

    ; row offsets (stride is in 2-byte pixels)
    lea         r6, [r1 + r1 * 4]
    lea         r6, [r6 + r1]           ; r6 = 6 * stride  (row 3)
    lea         r5, [r6 + r1 * 4]       ; r5 = 10 * stride (row 5)
    lea         r7, [r6 + r1 * 8]       ; r7 = 14 * stride (row 7)

    ; unfiltered 8x8 fill
    movu        [r0], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 4], m0
    movu        [r0 + r6], m0
    movu        [r0 + r1 * 8], m0
    movu        [r0 + r5], m0
    movu        [r0 + r6 * 2], m0
    movu        [r0 + r7], m0

    ; edge filter requested?
    test        r4d, r4d
    jz          .done

    mova        m1, [pw_2]
    pmullw      m1, m0
    paddw       m1, [pw_2]
    movd        r4d, m1                 ; low word of r4d = 2 * dc + 2
    paddw       m1, m0                  ; 3 * dc + 2
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0               ; broadcast 3 * dc + 2

    ; row 0: (top[x] + 3 * dc + 2) >> 2
    movu        m0, [r2 + 2]
    paddw       m0, m1
    psrlw       m0, 2
    movu        [r0], m0

    ; corner: (left[0] + top[0] + 2 * dc + 2) >> 2
    movzx       r4d, r4w
    movzx       r3d, word [r2 + 34]
    add         r4d, r3d
    movzx       r3d, word [r2 + 2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3w

    ; column 0, rows 1..7: (left[y] + 3 * dc + 2) >> 2
    movu        m0, [r2 + 36]
    paddw       m0, m1
    psrlw       m0, 2

    movh        r3, m0                  ; words for rows 1..4
    mov         [r0 + r1 * 2], r3w
    shr         r3, 16
    mov         [r0 + r1 * 4], r3w
    shr         r3, 16
    mov         [r0 + r6], r3w
    shr         r3, 16
    mov         [r0 + r1 * 8], r3w

    pshufd      m0, m0, 0x6E            ; bring the upper qword down
    movh        r3, m0                  ; words for rows 5..7
    mov         [r0 + r5], r3w
    shr         r3, 16
    mov         [r0 + r6 * 2], r3w
    shr         r3, 16
    mov         [r0 + r7], r3w
.done:
    RET
271 | |
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;
; 16x16 DC prediction, 16-bit pixels, SSE2.
;   r0 = dst, r1 = dstStride (pixels; doubled to bytes on entry),
;   r2 = srcPix, r4d = filter flag (5th argument, 0 = no edge filter)
;   [r2 + 2]  : 16 neighbours used by the "top" filter (row 0)
;   [r2 + 66] : 16 neighbours used by the "left" filter (column 0); r3 points here
; Row byte offsets: r6 = 3 rows, r7 = 5 rows, r8 = 7 rows, r9 = dst + 7 rows.
; r3 (4th argument slot) is overwritten and used as a pointer/scratch.
;-----------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc16, 5, 10, 4
    lea         r3, [r2 + 66]
    add         r1, r1                  ; stride in bytes
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r2 + 2]
    movu        m3, [r2 + 18]

    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2
    HADDUW      m0, m1                  ; x86util helper; result consumed as dwords
    paddd       m0, [pd_16]
    psrld       m0, 5                   ; dc = (sum + 16) >> 5

    pshuflw     m0, m0, 0               ; m0 = word [dc_val ...]
    pshufd      m0, m0, 0

    ; flags are consumed by the jz after the fill; nothing in between
    ; (lea / movu) modifies them
    test        r4d, r4d

    ; store DC 16x16
    lea         r6, [r1 + r1 * 2]       ; index 3
    lea         r7, [r1 + r1 * 4]       ; index 5
    lea         r8, [r6 + r1 * 4]       ; index 7
    lea         r9, [r0 + r8]           ; base + 7
    movu        [r0], m0                ; row 0
    movu        [r0 + 16], m0
    movu        [r0 + r1], m0           ; row 1
    movu        [r0 + 16 + r1], m0
    movu        [r0 + r1 * 2], m0       ; row 2
    movu        [r0 + r1 * 2 + 16], m0
    movu        [r0 + r6], m0           ; row 3
    movu        [r0 + r6 + 16], m0
    movu        [r0 + r1 * 4], m0       ; row 4
    movu        [r0 + r1 * 4 + 16], m0
    movu        [r0 + r7], m0           ; row 5
    movu        [r0 + r7 + 16], m0
    movu        [r0 + r6 * 2], m0       ; row 6
    movu        [r0 + r6 * 2 + 16], m0
    movu        [r9], m0                ; row 7
    movu        [r9 + 16], m0
    movu        [r0 + r1 * 8], m0       ; row 8
    movu        [r0 + r1 * 8 + 16], m0
    movu        [r9 + r1 * 2], m0       ; row 9
    movu        [r9 + r1 * 2 + 16], m0
    movu        [r0 + r7 * 2], m0       ; row 10
    movu        [r0 + r7 * 2 + 16], m0
    movu        [r9 + r1 * 4], m0       ; row 11
    movu        [r9 + r1 * 4 + 16], m0
    movu        [r0 + r6 * 4], m0       ; row 12
    movu        [r0 + r6 * 4 + 16], m0
    movu        [r9 + r6 * 2], m0       ; row 13
    movu        [r9 + r6 * 2 + 16], m0
    movu        [r9 + r8], m0           ; row 14
    movu        [r9 + r8 + 16], m0
    movu        [r9 + r1 * 8], m0       ; row 15
    movu        [r9 + r1 * 8 + 16], m0

    ; Do DC Filter
    jz          .end
    mova        m1, [pw_2]
    pmullw      m1, m0
    paddw       m1, [pw_2]
    movd        r4d, m1                 ; low word of r4d = 2 * dc + 2
    paddw       m1, m0                  ; m1 = 3 * dc + 2 in every word

    ; filter top: row 0 = (top[x] + 3 * dc + 2) >> 2
    movu        m2, [r2 + 2]
    paddw       m2, m1
    psrlw       m2, 2
    movu        [r0], m2
    movu        m3, [r2 + 18]
    paddw       m3, m1
    psrlw       m3, 2
    movu        [r0 + 16], m3

    ; filter top-left: (left[0] + top[0] + 2 * dc + 2) >> 2
    movzx       r4d, r4w
    movzx       r5d, word [r3]
    add         r4d, r5d
    movzx       r5d, word [r2 + 2]
    add         r5d, r4d
    shr         r5d, 2
    mov         [r0], r5w

    ; filter left, rows 1..8 (r2 is reused as scratch from here on)
    movu        m2, [r3 + 2]
    paddw       m2, m1
    psrlw       m2, 2

    movq        r2, m2
    pshufd      m2, m2, 0xEE
    mov         [r0 + r1], r2w
    shr         r2, 16
    mov         [r0 + r1 * 2], r2w
    shr         r2, 16
    mov         [r0 + r6], r2w
    shr         r2, 16
    mov         [r0 + r1 * 4], r2w
    movq        r2, m2
    mov         [r0 + r7], r2w
    shr         r2, 16
    mov         [r0 + r6 * 2], r2w
    shr         r2, 16
    mov         [r9], r2w
    shr         r2, 16
    mov         [r0 + r1 * 8], r2w

    ; filter left, rows 9..15 (r3 is reused as scratch after this load)
    movu        m3, [r3 + 18]
    paddw       m3, m1
    psrlw       m3, 2

    movq        r3, m3
    pshufd      m3, m3, 0xEE
    mov         [r9 + r1 * 2], r3w
    shr         r3, 16
    mov         [r0 + r7 * 2], r3w
    shr         r3, 16
    mov         [r9 + r1 * 4], r3w
    shr         r3, 16
    mov         [r0 + r6 * 4], r3w
    movq        r3, m3
    mov         [r9 + r6 * 2], r3w
    shr         r3, 16
    mov         [r9 + r8], r3w
    shr         r3, 16
    mov         [r9 + r1 * 8], r3w
.end:
    RET
406 | |
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, ...)
;
; 32x32 DC prediction, 16-bit pixels, SSE2. Only the first three arguments
; are loaded; no edge filter is applied in this routine.
;   r0 = dst, r1 = dstStride (pixels; doubled to bytes on entry), r2 = srcPix
;   [srcPix + 2]   : first run of 32 neighbours
;   [srcPix + 130] : second run of 32 neighbours
; r2 and r3 are reused as scratch once the neighbours have been summed.
;-----------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc32, 3, 4, 6
    lea         r3, [r2 + 130]          ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)
    add         r2, 2
    add         r1, r1                  ; stride in bytes

    ; second run of 32 neighbours
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r3 + 32]
    movu        m3, [r3 + 48]
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2
    HADDUWD     m0, m1                  ; x86util helper: word sums -> dwords

    ; first run of 32 neighbours
    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r2 + 32]
    movu        m4, [r2 + 48]
    paddw       m1, m2
    paddw       m3, m4
    paddw       m1, m3
    HADDUWD     m1, m2

    paddd       m0, m1
    HADDD       m0, m1                  ; x86util helper: horizontal dword add
    paddd       m0, [pd_32]             ; sum + 32
    psrld       m0, 6                   ; dc = (sum + 32) >> 6
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0               ; dc in every word

    lea         r2, [r1 * 3]            ; byte offset of row 3

    ; fill 32 rows x 64 bytes, four rows per outer iteration; within a group
    ; the stores are issued one 16-byte column at a time
%assign x 1
%rep 8
%assign dc32_col 0
%rep 4
    movu        [r0 + dc32_col], m0
    movu        [r0 + r1 + dc32_col], m0
    movu        [r0 + r1 * 2 + dc32_col], m0
    movu        [r0 + r2 + dc32_col], m0
%assign dc32_col dc32_col + 16
%endrep
%if x < 8
    lea         r0, [r0 + r1 * 4]       ; next group of four rows
%endif
%assign x x + 1
%endrep
    RET
466 | |
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 16x16 DC prediction, 16-bit pixels, AVX2.
;   r0 = dst, r1 = dstStride (pixels; doubled to bytes on entry), r2 = srcPix
;   r3d = filter flag, fetched from the 5th argument slot (r4m)
;   [r2 + 2]  : 16 neighbours used by the "top" filter (row 0)
;   [r2 + 66] : 16 neighbours used by the "left" filter (column 0)
; Row byte offsets: r6 = 3 rows, r7 = 5 rows, r8 = 7 rows, r4 = dst + 7 rows.
;---------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_dc16, 3, 9, 4
    mov         r3d, r4m
    add         r1d, r1d
    movu        m0, [r2 + 66]
    movu        m2, [r2 + 2]
    paddw       m0, m2                  ; dynamic range 13 bits

    vextracti128 xm1, m0, 1
    paddw       xm0, xm1                ; dynamic range 14 bits
    movhlps     xm1, xm0
    paddw       xm0, xm1                ; dynamic range 15 bits
    pmaddwd     xm0, [pw_1]
    phaddd      xm0, xm0                ; low dword = total of all 32 neighbours
    paddd       xm0, [pd_16]
    psrld       xm0, 5                  ; dc = (sum + 16) >> 5
    vpbroadcastw m0, xm0

    ; flags are consumed by the jz after the fill; nothing in between
    ; (lea / movu) modifies them
    test        r3d, r3d

    ; store DC 16x16
    lea         r6, [r1 + r1 * 2]       ; index 3
    lea         r7, [r1 + r1 * 4]       ; index 5
    lea         r8, [r6 + r1 * 4]       ; index 7
    lea         r4, [r0 + r8 * 1]       ; base + 7

    movu        [r0], m0                ; row 0
    movu        [r0 + r1], m0           ; row 1
    movu        [r0 + r1 * 2], m0       ; row 2
    movu        [r0 + r6], m0           ; row 3
    movu        [r0 + r1 * 4], m0       ; row 4
    movu        [r0 + r7], m0           ; row 5
    movu        [r0 + r6 * 2], m0       ; row 6
    movu        [r4], m0                ; row 7
    movu        [r0 + r1 * 8], m0       ; row 8
    movu        [r4 + r1 * 2], m0       ; row 9
    movu        [r0 + r7 * 2], m0       ; row 10
    movu        [r4 + r1 * 4], m0       ; row 11
    movu        [r0 + r6 * 4], m0       ; row 12
    movu        [r4 + r6 * 2], m0       ; row 13
    movu        [r4 + r8], m0           ; row 14
    movu        [r4 + r1 * 8], m0       ; row 15

    ; Do DC Filter
    jz          .end
    mova        m1, [pw_2]
    pmullw      m1, m0
    paddw       m1, [pw_2]
    movd        r3d, xm1                ; low word of r3d = 2 * dc + 2
    paddw       m1, m0                  ; m1 = 3 * dc + 2 in every word

    ; filter top: row 0 = (top[x] + 3 * dc + 2) >> 2
    movu        m2, [r2 + 2]
    paddw       m2, m1
    psrlw       m2, 2
    movu        [r0], m2

    ; filter top-left: (left[0] + top[0] + 2 * dc + 2) >> 2
    movzx       r3d, r3w
    movzx       r5d, word [r2 + 66]
    add         r3d, r5d
    movzx       r5d, word [r2 + 2]
    add         r5d, r3d
    shr         r5d, 2
    mov         [r0], r5w

    ; filter left: column 0, rows 1..15
    movu        m2, [r2 + 68]
    paddw       m2, m1
    psrlw       m2, 2
    vextracti128 xm3, m2, 1             ; xm2 = rows 1..8, xm3 = rows 9..16

    movq        r3, xm2
    pshufd      xm2, xm2, 0xEE
    mov         [r0 + r1], r3w
    shr         r3, 16
    mov         [r0 + r1 * 2], r3w
    shr         r3, 16
    mov         [r0 + r6], r3w
    shr         r3, 16
    mov         [r0 + r1 * 4], r3w
    movq        r3, xm2
    mov         [r0 + r7], r3w
    shr         r3, 16
    mov         [r0 + r6 * 2], r3w
    shr         r3, 16
    mov         [r4], r3w
    shr         r3, 16
    mov         [r0 + r1 * 8], r3w

    movq        r3, xm3
    pshufd      xm3, xm3, 0xEE
    mov         [r4 + r1 * 2], r3w
    shr         r3, 16
    mov         [r0 + r7 * 2], r3w
    shr         r3, 16
    mov         [r4 + r1 * 4], r3w
    shr         r3, 16
    mov         [r0 + r6 * 4], r3w
    movq        r3, xm3
    mov         [r4 + r6 * 2], r3w
    shr         r3, 16
    mov         [r4 + r8], r3w
    shr         r3, 16
    mov         [r4 + r1 * 8], r3w
.end:
    RET
578 | |
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 32x32 DC prediction, 16-bit pixels, AVX2. Only dst, dstStride and srcPix
; are read; no edge filter is applied in this routine.
;   r0 = dst, r1 = dstStride (pixels; doubled to bytes on entry), r2 = srcPix
;   [srcPix + 2]   : first run of 32 neighbours
;   [srcPix + 130] : second run of 32 neighbours
; r2 is reused as the byte offset of row 3 once the sum is complete.
;---------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_dc32, 3,3,3
    add         r2, 2
    add         r1d, r1d

    movu        m0, [r2]
    movu        m1, [r2 + 32]
    add         r2, mmsize*4            ; r2 += 128
    paddw       m0, m1                  ; dynamic range 13 bits
    movu        m1, [r2]
    movu        m2, [r2 + 32]
    paddw       m1, m2                  ; dynamic range 13 bits
    paddw       m0, m1                  ; dynamic range 14 bits

    vextracti128 xm1, m0, 1
    paddw       xm0, xm1                ; dynamic range 15 bits
    pmaddwd     xm0, [pw_1]
    movhlps     xm1, xm0
    paddd       xm0, xm1
    phaddd      xm0, xm0                ; low dword = total of all 64 neighbours
    paddd       xm0, [pd_32]            ; sum + 32
    psrld       xm0, 6                  ; dc = (sum + 32) >> 6
    vpbroadcastw m0, xm0

    lea         r2, [r1 * 3]            ; byte offset of row 3

    ; fill 32 rows of 2 * mmsize bytes: seven groups of four rows that
    ; advance r0, then a final group that leaves r0 alone
%rep 7
    movu        [r0], m0
    movu        [r0 + mmsize], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 + mmsize], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 2 + mmsize], m0
    movu        [r0 + r2], m0
    movu        [r0 + r2 + mmsize], m0
    lea         r0, [r0 + r1 * 4]
%endrep
    movu        [r0], m0
    movu        [r0 + mmsize], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 + mmsize], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 2 + mmsize], m0
    movu        [r0 + r2], m0
    movu        [r0 + r2 + mmsize], m0
    RET
678 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 8x8 planar prediction, 16-bit pixels, SSE2 (only the first three arguments
; are read).
;   m1 = above[0..7]  ([r2 + 2])      m2 = left[0..7]   ([r2 + 34])
;   m3 = running per-column term      m4 = bottomLeft - above[x] (row step)
; multiL / pw_planar16_mul are external tables; per the comments carried over
; from the original they supply (x + 1) and (blkSize - 1 - x) — TODO confirm.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar8, 3,3,5
    movu        m1, [r2 + 2]            ; above[0..7]

    ; m3 = (x + 1) * topRight + 8
    movd        m3, [r2 + 18]           ; topRight = above[8]
    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0               ; v_topRight
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    paddw       m3, [pw_8]

    ; m4 = bottomLeft in every word
    movd        m4, [r2 + 50]           ; bottomLeft = left[8]
    pshuflw     m4, m4, 0
    pshufd      m4, m4, 0               ; v_bottomLeft

    ; row-0 value of the accumulator, then the per-row increment
    paddw       m3, m4
    pmullw      m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x], y = 0
    paddw       m3, m0
    psubw       m4, m1                  ; bottomLeft - above[x]

    movu        m2, [r2 + 34]           ; left[0..7]

; %1 = row index (0..7). Broadcasts left[%1], adds the weighted term to the
; accumulator m3, shifts by 4 and stores one row; steps m3/r0 unless last row.
%macro INTRA_PRED_PLANAR_8 1
%if (%1 < 4)
    pshuflw     m1, m2, 0x55 * %1
    pshufd      m1, m1, 0
%else
    pshufhw     m1, m2, 0x55 * (%1 - 4)
    pshufd      m1, m1, 0xAA
%endif
    pmullw      m1, [pw_planar16_mul + mmsize]
    paddw       m1, m3
    psraw       m1, 4
    movu        [r0], m1
%if (%1 < 7)
    paddw       m3, m4
    lea         r0, [r0 + r1 * 2]
%endif
%endmacro

%assign y 0
%rep 8
    INTRA_PRED_PLANAR_8 y
%assign y y+1
%endrep
    RET
729 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 16x16 planar prediction, 16-bit pixels, SSE2 (only the first three
; arguments are read). Each row is produced as two 8-pixel halves.
;   m3 / m4 = running per-column term for columns 0..7 / 8..15
;   m6 / m1 = bottomLeft - above[x] for columns 0..7 / 8..15 (row step)
;   m2 / m7 = left[0..7] / left[8..15] once the setup is finished
; multiL / multiH / pw_planar16_mul are external tables; per the comments
; carried over from the original they supply (x + 1) and the column weights
; — TODO confirm.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar16, 3,3,8
    movu        m2, [r2 + 2]            ; above[0..7]
    movu        m7, [r2 + 18]           ; above[8..15]

    movd        m3, [r2 + 34]           ; topRight = above[16]
    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0               ; v_topRight

    movd        m6, [r2 + 98]           ; bottomLeft = left[16]
    pshuflw     m6, m6, 0
    pshufd      m6, m6, 0               ; v_bottomLeft

    ; upper half of the row (columns 8..15)
    pmullw      m4, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m5, m7, [pw_15]         ; (blkSize - 1 - y) * above[x], y = 0
    paddw       m4, [pw_16]
    paddw       m4, m6
    paddw       m4, m5

    ; lower half of the row (columns 0..7)
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m1, m2, [pw_15]         ; (blkSize - 1 - y) * above[x], y = 0
    paddw       m3, [pw_16]
    paddw       m3, m6
    paddw       m3, m1

    ; per-row increments
    psubw       m1, m6, m7              ; bottomLeft - above[8..15]
    psubw       m6, m2                  ; bottomLeft - above[0..7]

    movu        m2, [r2 + 66]           ; left[0..7]
    movu        m7, [r2 + 82]           ; left[8..15]

; %1 = row index (0..15). Broadcasts left[%1] into m5, steps the accumulators
; and r0 for every row after the first, then emits one 16-pixel row.
%macro INTRA_PRED_PLANAR_16 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%elif (%1 < 8)
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%elif (%1 < 12)
    pshuflw     m5, m7, 0x55 * (%1 - 8)
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m7, 0x55 * (%1 - 12)
    pshufd      m5, m5, 0xAA
%endif
%if (%1 > 0)
    paddw       m3, m6
    paddw       m4, m1
    lea         r0, [r0 + r1 * 2]
%endif
    pmullw      m0, m5, [pw_planar16_mul + mmsize]
    pmullw      m5, [pw_planar16_mul]
    paddw       m0, m4
    paddw       m5, m3
    psraw       m5, 5
    psraw       m0, 5
    movu        [r0], m5
    movu        [r0 + 16], m0
%endmacro

%assign y 0
%rep 16
    INTRA_PRED_PLANAR_16 y
%assign y y+1
%endrep
    RET
812 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 planar prediction, 16-bit pixels, SSE2 (only the first three
; arguments are read). Declares 16 vector registers (m0..m15).
; Each row is produced as four 8-pixel groups:
;   m0..m3          running per-column term for groups 0..3
;   m8, m9, m10, m6 bottomLeft - above[x] for groups 0..3 (row step)
;   m12..m15        multiplier tables applied to the broadcast left[y]
;   m4              eight left[] pixels for the current batch of rows
;   m5, m7, m11     scratch
; multiL..multiH3 / pw_planar32_mul / pw_planar16_mul are external tables —
; their contents are not visible here.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar32, 3,3,16
    movd        m3, [r2 + 66]           ; topRight = above[32]
    pshuflw     m3, m3, 0x00
    pshufd      m3, m3, 0x44            ; topRight in every word

    pmullw      m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw      m1, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m2, m3, [multiH2]       ; (x + 1) * topRight
    pmullw      m3, [multiH3]           ; (x + 1) * topRight

    movd        m6, [r2 + 194]          ; bottomLeft = left[32]
    pshuflw     m6, m6, 0x00
    pshufd      m6, m6, 0x44            ; bottomLeft in every word

    mova        m5, m6
    paddw       m5, [pw_32]             ; bottomLeft + 32
    paddw       m0, m5
    paddw       m1, m5
    paddw       m2, m5
    paddw       m3, m5

    mova        m8, m6
    mova        m9, m6
    mova        m10, m6

    mova        m12, [pw_31]

    ; group 0: above[0..7]
    movu        m4, [r2 + 2]
    psubw       m8, m4                  ; row step for group 0
    pmullw      m4, m12                 ; 31 * above[x]
    paddw       m0, m4

    ; group 1: above[8..15]
    movu        m5, [r2 + 18]
    psubw       m9, m5
    pmullw      m5, m12
    paddw       m1, m5

    ; group 2: above[16..23]
    movu        m4, [r2 + 34]
    psubw       m10, m4
    pmullw      m4, m12
    paddw       m2, m4

    ; group 3: above[24..31]
    movu        m5, [r2 + 50]
    psubw       m6, m5
    pmullw      m5, m12
    paddw       m3, m5

    mova        m12, [pw_planar32_mul]
    mova        m13, [pw_planar32_mul + mmsize]
    mova        m14, [pw_planar16_mul]
    mova        m15, [pw_planar16_mul + mmsize]
    add         r1, r1                  ; stride in bytes

; %1 = register holding left[y] in every word (clobbered).
; Emits one 32-pixel row at [r0]; uses m5 and m11 as scratch.
%macro PROCESS 1
    pmullw      m5, %1, m12
    pmullw      m11, %1, m13
    paddw       m5, m0
    paddw       m11, m1
    psrlw       m5, 6
    psrlw       m11, 6
    movu        [r0], m5
    movu        [r0 + 16], m11

    pmullw      m5, %1, m14
    pmullw      %1, m15
    paddw       m5, m2
    paddw       %1, m3
    psrlw       m5, 6
    psrlw       %1, 6
    movu        [r0 + 32], m5
    movu        [r0 + 48], %1
%endmacro

; Step the four accumulators and the destination pointer to the next row.
%macro INCREMENT 0
    paddw       m2, m10
    paddw       m3, m6
    paddw       m0, m8
    paddw       m1, m9
    add         r0, r1
%endmacro

    add         r2, 130                 ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)

    ; four batches of eight rows; x = batch, y = row within the batch
%assign x 0
%rep 4
    movu        m4, [r2]                ; next eight left[] pixels
    add         r2, 16
%assign y 0
%rep 8
%if y < 4
    pshuflw     m7, m4, 0x55 * y
    pshufd      m7, m7, 0x44
%else
    pshufhw     m7, m4, 0x55 * (y - 4)
    pshufd      m7, m7, 0xEE
%endif
    PROCESS m7
%if (x * 8 + y) < 31                    ; every row except the very last one
    INCREMENT
%endif
%assign y y+1
%endrep
%assign x x+1
%endrep
    RET
920 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 planar prediction, 16-bit pixels, AVX2 (only the first three
; arguments are read). Each row is produced as two 16-pixel halves:
;   m0 / m2 = running per-column term for columns 0..15 / 16..31
;   m3 / m5 = bottomLeft - above[x] for columns 0..15 / 16..31 (row step)
;   m6 / m4 = multiplier tables applied to the broadcast left[y]
;   m1, m7  = scratch
; multiL / multiH2 / pw_planar32_mul / pw_planar16_mul are external tables —
; their contents are not visible here.
;---------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_planar32, 3,3,8
    movu        m1, [r2 + 2]            ; above[0..15]
    movu        m4, [r2 + 34]           ; above[16..31]
    add         r2, 66                  ; r2 -> above[32]

    vpbroadcastw m3, [r2]               ; topRight = above[32]
    pmullw      m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw      m2, m3, [multiH2]       ; (x + 1) * topRight

    vpbroadcastw m6, [r2 + 128]         ; bottomLeft = left[32]
    paddw       m5, m6, [pw_32]         ; bottomLeft + 32
    paddw       m0, m5
    paddw       m2, m5

    ; columns 0..15
    psubw       m3, m6, m1              ; row step
    pmullw      m1, [pw_31]             ; 31 * above[x]
    paddw       m0, m1

    ; columns 16..31
    psubw       m5, m6, m4              ; row step
    pmullw      m4, [pw_31]
    paddw       m2, m4

    mova        m6, [pw_planar32_mul]
    mova        m4, [pw_planar16_mul]
    add         r1, r1                  ; stride in bytes

; %1 = index of the left[] pixel relative to r2.
; Emits one 32-pixel row at [r0]; uses m1 and m7 as scratch.
%macro PROCESS_AVX2 1
    vpbroadcastw m7, [r2 + %1 * 2]
    pmullw      m1, m7, m6
    pmullw      m7, m4
    paddw       m1, m0
    paddw       m7, m2
    psrlw       m1, 6
    psrlw       m7, 6
    movu        [r0], m1
    movu        [r0 + mmsize], m7
%endmacro

; Step both accumulators and the destination pointer to the next row.
%macro INCREMENT_AVX2 0
    paddw       m2, m5
    paddw       m0, m3
    add         r0, r1
%endmacro

    add         r2, mmsize*2            ; r2 -> srcPix + 130 (first left[] pixel)

    ; four batches of eight rows; x = batch, y = row within the batch
%assign x 0
%rep 4
%assign y 0
%rep 8
    PROCESS_AVX2 y
%if (x * 8 + y) < 31                    ; every row except the very last one
    INCREMENT_AVX2
%endif
%assign y y+1
%endrep
    lea         r2, [r2 + 16]           ; next eight left[] pixels
%assign x x+1
%endrep
    RET
983 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 16x16 planar prediction, 16-bit pixels, AVX2 (only the first three
; arguments are read). One ymm register holds a whole 16-pixel row.
;   m0 = pw_planar16_mul (multiplier applied to the broadcast left[y])
;   m3 = running per-column term     m4 = bottomLeft - above[x] (row step)
;   m1, m2 = scratch / the two rows produced per macro invocation
; Five vector registers (m0..m4) are used, and the cglobal line declares
; exactly that many.
;---------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_planar16, 3,3,5
    add         r1d, r1d                ; stride in bytes
    vpbroadcastw m3, [r2 + 34]          ; topRight = above[16]
    vpbroadcastw m4, [r2 + 98]          ; bottomLeft = left[16]
    mova        m0, [pw_planar16_mul]
    movu        m2, [r2 + 2]            ; above[0..15]

    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m1, m2, [pw_15]         ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_16]
    paddw       m3, m4
    paddw       m3, m1
    psubw       m4, m2                  ; bottomLeft - above[x]
    add         r2, 66                  ; r2 -> left[0]

; %1 = byte offset of the first of two consecutive left[] pixels.
; Emits two rows ([r0] and [r0 + r1]) and steps the accumulator m3 twice;
; r0 is advanced by two rows except after the final pair (%1 = 28).
%macro INTRA_PRED_PLANAR16_AVX2 1
    vpbroadcastw m1, [r2 + %1]
    vpbroadcastw m2, [r2 + %1 + 2]

    pmullw      m1, m0
    pmullw      m2, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 5
    paddw       m2, m3
    psraw       m2, 5
    paddw       m3, m4
    movu        [r0], m1
    movu        [r0 + r1], m2
%if %1 <= 24
    lea         r0, [r0 + r1 * 2]
%endif
%endmacro
    INTRA_PRED_PLANAR16_AVX2 0
    INTRA_PRED_PLANAR16_AVX2 4
    INTRA_PRED_PLANAR16_AVX2 8
    INTRA_PRED_PLANAR16_AVX2 12
    INTRA_PRED_PLANAR16_AVX2 16
    INTRA_PRED_PLANAR16_AVX2 20
    INTRA_PRED_PLANAR16_AVX2 24
    INTRA_PRED_PLANAR16_AVX2 28
; NOTE(review): %undef removes single-line macros only; a multi-line %macro
; would need %unmacro. Kept as in the original — confirm the intent.
%undef INTRA_PRED_PLANAR16_AVX2
    RET
1031 | |
; Transpose a 4x4 block of 16-bit words held in two registers.
;   in : m1 = rows 0 and 1 (low / high half), m3 = rows 2 and 3
;   out: m1 = columns 0 and 1,                m3 = columns 2 and 3
;   clobbers m0
%macro TRANSPOSE_4x4 0
    punpckhwd   m0, m1, m3              ; interleave row 1 with row 3
    punpcklwd   m1, m3                  ; interleave row 0 with row 2
    punpckhwd   m3, m1, m0              ; columns 2 and 3
    punpcklwd   m1, m0                  ; columns 0 and 1
%endmacro
1038 | |
; Store four 4-pixel rows (8 bytes each) from m1 and m3 to dst.
;   in : r0 = dst, r1 = stride in pixels, m1 = rows 0/1, m3 = rows 2/3
;   r1 is destroyed: it is doubled to bytes and then tripled for row 3.
%macro STORE_4x4 0
    movh        [r0], m1                ; row 0
    add         r1, r1                  ; stride in bytes
    movhps      [r0 + r1], m1           ; row 1
    movh        [r0 + r1 * 2], m3       ; row 2
    lea         r1, [r1 * 3]
    movhps      [r0 + r1], m3           ; row 3
%endmacro
1047 | |
; Weighted interpolation of four rows of interleaved word pairs.
;   %1..%4 = ang_table entry index used for m1, m2, m3, m4 respectively
;   in : m1..m4 = interleaved neighbour pairs for the four rows
;   out: m1 = rows from m1/m2 packed to words, m3 = rows from m3/m4 packed
;   Each dword result is (pair . weights + 16) >> 5. Clobbers m0, m2, m4.
%macro CALC_4x4 4
    mova        m0, [pd_16]             ; rounding term

    pmaddwd     m1, [ang_table + %1 * 16]
    pmaddwd     m2, [ang_table + %2 * 16]
    pmaddwd     m3, [ang_table + %3 * 16]
    pmaddwd     m4, [ang_table + %4 * 16]

    paddd       m1, m0
    paddd       m2, m0
    paddd       m3, m0
    paddd       m4, m0

    psrld       m1, 5
    psrld       m2, 5
    psrld       m3, 5
    psrld       m4, 5

    packssdw    m1, m2
    packssdw    m3, m4
%endmacro
1068 | |
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
; Pure diagonal copy: every row is the reference run advanced by one more sample.
; The run starts at src + 4 bytes when dirMode == 34, otherwise at src + 20 bytes.
INIT_XMM sse2
cglobal intra_pred_ang4_2, 3,5,4
    add         r1, r1                      ; stride for 2-byte samples
    lea         r4, [r2 + 4]                ; base used for dirMode 34
    add         r2, 20                      ; base used otherwise
    cmp         r3m, byte 34
    cmove       r2, r4

    movu        m0, [r2]
    lea         r4, [r1 * 3]                ; r4 = offset of row 3
    movh        [r0], m0                    ; row 0
    psrldq      m0, 2
    movh        [r0 + r1], m0               ; row 1
    psrldq      m0, 2
    movh        [r0 + r1 * 2], m0           ; row 2
    psrldq      m0, 2
    movh        [r0 + r4], m0               ; row 3
    RET
1090 | |
; Mode 3: reference run at src + 18; every output group uses a pair window one
; sample further along. Result is transposed before it is stored.
cglobal intra_pred_ang4_3, 3,3,5
    movu        m0, [r2 + 18]               ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m2, m0
    psrldq      m0, 2
    punpcklwd   m2, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m3, m0
    psrldq      m0, 2
    punpcklwd   m3, m0                      ; pairs    [7 6 6 5 5 4 4 3]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [8 7 7 6 6 5 5 4]

    CALC_4x4    26, 20, 14, 8

    TRANSPOSE_4x4

    STORE_4x4
    RET
1112 | |
; Mode 33: same pair windows and weights as mode 3, but the run is read from
; src + 2 and the result is stored without a transpose.
cglobal intra_pred_ang4_33, 3,3,5
    movu        m0, [r2 + 2]                ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m2, m0
    psrldq      m0, 2
    punpcklwd   m2, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m3, m0
    psrldq      m0, 2
    punpcklwd   m3, m0                      ; pairs    [7 6 6 5 5 4 4 3]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [8 7 7 6 6 5 5 4]

    CALC_4x4    26, 20, 14, 8

    STORE_4x4
    RET
1132 | |
; Mode 4: run at src + 18; groups 2 and 3 share one pair window. Transposed store.
cglobal intra_pred_ang4_4, 3,3,5
    movu        m0, [r2 + 18]               ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m2, m0
    psrldq      m0, 2
    punpcklwd   m2, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [7 6 6 5 5 4 4 3]
    mova        m3, m2                      ; third group reuses the second window

    CALC_4x4    21, 10, 31, 20

    TRANSPOSE_4x4

    STORE_4x4
    RET
1152 | |
; Mode 6: run at src + 18; two pair windows, each used by two groups. Transposed store.
cglobal intra_pred_ang4_6, 3,3,5
    movu        m0, [r2 + 18]               ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m3, m0
    psrldq      m0, 2
    punpcklwd   m3, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m2, m1                      ; groups 1,2 share the first window
    mova        m4, m3                      ; groups 3,4 share the second window

    CALC_4x4    13, 26, 7, 20

    TRANSPOSE_4x4

    STORE_4x4
    RET
1170 | |
; Mode 7: run at src + 18; first three groups share one pair window, the
; fourth uses the next one. Transposed store.
cglobal intra_pred_ang4_7, 3,3,5
    movu        m0, [r2 + 18]               ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    9, 18, 27, 4

    TRANSPOSE_4x4

    STORE_4x4
    RET
1188 | |
; Mode 8: run at src + 18; all four groups interpolate within the same pair
; window and differ only in weight. Transposed store.
cglobal intra_pred_ang4_8, 3,3,5
    movu        m0, [r2 + 18]               ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    5, 10, 15, 20

    TRANSPOSE_4x4

    STORE_4x4
    RET
1204 | |
; Mode 9: run at src + 18; one shared pair window, weights 2/4/6/8. Transposed store.
cglobal intra_pred_ang4_9, 3,3,5
    movu        m0, [r2 + 18]               ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    2, 4, 6, 8

    TRANSPOSE_4x4

    STORE_4x4
    RET
1220 | |
; Mode 10: row y is filled with the y-th sample of the run at src + 18.
; When the bFilter argument (r4m) is non-zero, row 0 additionally receives
; (src[1 + x] - src[0]) >> 1 and is clamped to [0, pw_pixel_max].
; Stride arithmetic is done at full register width because dstStride is intptr_t.
cglobal intra_pred_ang4_10, 3,3,3
    movh        m0, [r2 + 18]               ; [x x x x 4 3 2 1]

    punpcklwd   m0, m0                      ; [4 4 3 3 2 2 1 1]
    pshufd      m1, m0, 0xFA                ; [4 4 4 4 3 3 3 3]
    add         r1, r1                      ; stride for 2-byte samples
    pshufd      m0, m0, 0x50                ; [2 2 2 2 1 1 1 1]
    movhps      [r0 + r1], m0               ; row 1
    movh        [r0 + r1 * 2], m1           ; row 2
    lea         r1, [r1 * 3]
    movhps      [r0 + r1], m1               ; row 3

    cmp         r4m, byte 0
    jz          .quit

    ; edge filter for row 0
    movd        m2, [r2]                    ; word 0 = src[0]
    pshuflw     m2, m2, 0x00                ; broadcast across the low four words
    movh        m1, [r2 + 2]                ; src[1..4]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1                      ; clamp below at 0
    pminsw      m0, [pw_pixel_max]          ; clamp above
.quit:
    movh        [r0], m0                    ; row 0 (filtered or plain)
    RET
1249 | |
; Mode 11: pair windows start at src[0] followed by the run at src + 18;
; all four groups use the same window. Transposed store.
; src[0] is inserted with pinsrw so that no bytes below src are read.
; Sample numbering in the comments: 0 = src[0], 1..4 = run at src + 18.
cglobal intra_pred_ang4_11, 3,3,5
    movh        m0, [r2 + 18]               ; [x x x x 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                       ; [x x x 4 3 2 1 -]
    pinsrw      m1, [r2], 0                 ; [x x x 4 3 2 1 0]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    30, 28, 26, 24

    TRANSPOSE_4x4

    STORE_4x4
    RET
1266 | |
; Mode 12: same pair window as mode 11 (src[0] followed by the run at
; src + 18) with weights 27/22/17/12. Transposed store.
; src[0] is inserted with pinsrw so that no bytes below src are read.
; Sample numbering in the comments: 0 = src[0], 1..4 = run at src + 18.
cglobal intra_pred_ang4_12, 3,3,5
    movh        m0, [r2 + 18]               ; [x x x x 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                       ; [x x x 4 3 2 1 -]
    pinsrw      m1, [r2], 0                 ; [x x x 4 3 2 1 0]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    27, 22, 17, 12

    TRANSPOSE_4x4

    STORE_4x4
    RET
1283 | |
; Mode 13: main run at src + 18, preceded by src[0] and one projected sample
; (the word at src + 8). Transposed store.
; src[0] is loaded from [r2] and shifted into place so that no bytes below
; src are read. Numbering: x = projected sample, 0 = src[0], 1..4 = main run.
cglobal intra_pred_ang4_13, 3,3,5
    movd        m4, [r2 + 6]                ; word 1 = projected sample x
    movd        m1, [r2]                    ; word 0 = src[0]
    pslldq      m1, 2                       ; src[0] -> word 1, word 0 = 0
    movh        m0, [r2 + 18]               ; [- - - - 4 3 2 1]
    punpcklwd   m4, m1                      ; low qword = [0 x - -]
    punpcklqdq  m4, m0                      ; [4 3 2 1 0 x - -]
    psrldq      m4, 4                       ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                       ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1

    CALC_4x4    23, 14, 5, 28

    TRANSPOSE_4x4

    STORE_4x4
    RET
1304 | |
; Mode 14: main run at src + 18, preceded by src[0] and one projected sample
; (the word at src + 4). Transposed store.
; src[0] is loaded from [r2] and shifted into place so that no bytes below
; src are read. Numbering: x = projected sample, 0 = src[0], 1..4 = main run.
cglobal intra_pred_ang4_14, 3,3,5
    movd        m4, [r2 + 2]                ; word 1 = projected sample x
    movd        m1, [r2]                    ; word 0 = src[0]
    pslldq      m1, 2                       ; src[0] -> word 1, word 0 = 0
    movh        m0, [r2 + 18]               ; [- - - - 4 3 2 1]
    punpcklwd   m4, m1                      ; low qword = [0 x - -]
    punpcklqdq  m4, m0                      ; [4 3 2 1 0 x - -]
    psrldq      m4, 4                       ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                       ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m4

    CALC_4x4    19, 6, 25, 12

    TRANSPOSE_4x4

    STORE_4x4
    RET
1325 | |
; Mode 15: main run at src + 18, preceded by src[0] (A) and two projected
; samples taken from src + 4 (B) and src + 8 (C). Transposed store.
; Numbering in pair comments: y, x = projected samples, 0 = A, 1..4 = main run.
cglobal intra_pred_ang4_15, 3,3,5
    movh        m0, [r2 + 18]               ; [4 3 2 1]
    movh        m4, [r2 + 4]                ; [x C x B]
    movd        m3, [r2]                    ; [x x x A]
    pshuflw     m4, m4, 0x22                ; [B C B C]
    punpcklqdq  m4, m3                      ; [x x x A B C B C]
    psrldq      m4, 2                       ; [x x x x A B C B]
    punpcklqdq  m4, m0                      ; [4 3 2 1 A B C B]
    psrldq      m4, 2                       ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2
    mova        m1, m4
    psrldq      m1, 4
    punpcklwd   m4, m2                      ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    15, 30, 13, 28

    TRANSPOSE_4x4

    STORE_4x4
    RET
1350 | |
; Mode 16: main run at src + 18, preceded by src[0] (A) and two projected
; samples taken from src + 4 (B) and src + 6 (C). Transposed store.
; Numbering in pair comments: y, x = projected samples, 0 = A, 1..4 = main run.
cglobal intra_pred_ang4_16, 3,3,5
    movh        m0, [r2 + 18]               ; [4 3 2 1]
    movd        m4, [r2 + 4]                ; [x x C B]
    movd        m3, [r2]                    ; [x x x A]
    punpcklwd   m4, m3                      ; [x C A B]
    pshuflw     m4, m4, 0x4A                ; [A B C C]
    punpcklqdq  m4, m0                      ; [4 3 2 1 A B C C]
    psrldq      m4, 2                       ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2
    mova        m1, m4
    psrldq      m1, 4
    punpcklwd   m4, m2                      ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    11, 22, 1, 12

    TRANSPOSE_4x4

    STORE_4x4
    RET
1374 | |
; Mode 17: main run at src + 18, preceded by src[0] (A) and three projected
; samples from src + 2 (B), src + 4 (C) and src + 8 (D). Transposed store.
cglobal intra_pred_ang4_17, 3,3,5
    movd        m3, [r2]                    ; [x x x A]
    movh        m4, [r2 + 2]                ; [D x C B]
    pshuflw     m4, m4, 0x1F                ; [B C D D]
    punpcklqdq  m4, m3                      ; [x x x A B C D D]
    psrldq      m4, 2                       ; [x x x x A B C D]
    movhps      m4, [r2 + 18]               ; [4 3 2 1 A B C D]

    ; successive one-sample shifts of the assembled run
    mova        m3, m4
    psrldq      m3, 2
    mova        m2, m3
    psrldq      m2, 2
    mova        m1, m2
    psrldq      m1, 2
    mova        m0, m1
    psrldq      m0, 2

    ; adjacent-sample pairs for the four groups
    punpcklwd   m4, m3
    punpcklwd   m3, m2
    punpcklwd   m2, m1
    punpcklwd   m1, m0

    CALC_4x4    6, 12, 18, 24

    TRANSPOSE_4x4

    STORE_4x4
    RET
1402 | |
; Mode 18: pure diagonal copy. A single register is built holding the first
; three samples of the run at src + 18 in reverse order, then src[0], then the
; four samples at src + 2. Row 3 is its low qword and each row above it is the
; same data advanced by one sample.
cglobal intra_pred_ang4_18, 3,3,1
    movh        m0, [r2 + 16]               ; words 1..3 = first samples of the src + 18 run
    pinsrw      m0, [r2], 0                 ; word 0 = src[0]
    pshuflw     m0, m0, q0123               ; reverse the low four words
    movhps      m0, [r2 + 2]                ; high qword = samples at src + 2
    add         r1, r1                      ; stride for 2-byte samples
    lea         r2, [r1 * 3]                ; r2 is free now: offset of row 3
    movh        [r0 + r2], m0               ; row 3
    psrldq      m0, 2
    movh        [r0 + r1 * 2], m0           ; row 2
    psrldq      m0, 2
    movh        [r0 + r1], m0               ; row 1
    psrldq      m0, 2
    movh        [r0], m0                    ; row 0
    RET
1418 | |
; Mode 19: counterpart of mode 17 with the two runs swapped. Main run at
; src + 2, preceded by src[0] (A) and three projected samples from src + 18 (B),
; src + 20 (C) and src + 24 (D). Stored without a transpose.
cglobal intra_pred_ang4_19, 3,3,5
    movd        m3, [r2]                    ; [x x x A]
    movh        m4, [r2 + 18]               ; [D x C B]
    pshuflw     m4, m4, 0x1F                ; [B C D D]
    punpcklqdq  m4, m3                      ; [x x x A B C D D]
    psrldq      m4, 2                       ; [x x x x A B C D]
    movhps      m4, [r2 + 2]                ; [4 3 2 1 A B C D]

    ; successive one-sample shifts of the assembled run
    mova        m3, m4
    psrldq      m3, 2
    mova        m2, m3
    psrldq      m2, 2
    mova        m1, m2
    psrldq      m1, 2
    mova        m0, m1
    psrldq      m0, 2

    ; adjacent-sample pairs for the four groups
    punpcklwd   m4, m3
    punpcklwd   m3, m2
    punpcklwd   m2, m1
    punpcklwd   m1, m0

    CALC_4x4    6, 12, 18, 24

    STORE_4x4
    RET
1444 | |
; Mode 20: counterpart of mode 16 with the two runs swapped. Main run at
; src + 2, preceded by src[0] (A) and projected samples from src + 20 (B) and
; src + 22 (C). Stored without a transpose.
cglobal intra_pred_ang4_20, 3,3,5
    movh        m0, [r2 + 2]                ; [4 3 2 1]
    movd        m4, [r2 + 20]               ; [x x C B]
    movd        m3, [r2]                    ; [x x x A]
    punpcklwd   m4, m3                      ; [x C A B]
    pshuflw     m4, m4, 0x4A                ; [A B C C]
    punpcklqdq  m4, m0                      ; [4 3 2 1 A B C C]
    psrldq      m4, 2                       ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2
    mova        m1, m4
    psrldq      m1, 4
    punpcklwd   m4, m2                      ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    11, 22, 1, 12

    STORE_4x4
    RET
1466 | |
; Mode 21: counterpart of mode 15 with the two runs swapped. Main run at
; src + 2, preceded by src[0] (A) and projected samples from src + 20 (B) and
; src + 24 (C). Stored without a transpose.
cglobal intra_pred_ang4_21, 3,3,5
    movh        m0, [r2 + 2]                ; [4 3 2 1]
    movh        m4, [r2 + 20]               ; [x C x B]
    movd        m3, [r2]                    ; [x x x A]
    pshuflw     m4, m4, 0x22                ; [B C B C]
    punpcklqdq  m4, m3                      ; [x x x A B C B C]
    psrldq      m4, 2                       ; [x x x x A B C B]
    punpcklqdq  m4, m0                      ; [4 3 2 1 A B C B]
    psrldq      m4, 2                       ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2
    mova        m1, m4
    psrldq      m1, 4
    punpcklwd   m4, m2                      ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    15, 30, 13, 28

    STORE_4x4
    RET
1489 | |
; Mode 22: counterpart of mode 14 with the two runs swapped. Main run at
; src + 2, preceded by src[0] and one projected sample (the word at src + 20).
; Stored without a transpose.
; src[0] is loaded from [r2] and shifted into place so that no bytes below
; src are read. Numbering: x = projected sample, 0 = src[0], 1..4 = main run.
cglobal intra_pred_ang4_22, 3,3,5
    movd        m4, [r2 + 18]               ; word 1 = projected sample x
    movd        m1, [r2]                    ; word 0 = src[0]
    pslldq      m1, 2                       ; src[0] -> word 1, word 0 = 0
    movh        m0, [r2 + 2]                ; [- - - - 4 3 2 1]
    punpcklwd   m4, m1                      ; low qword = [0 x - -]
    punpcklqdq  m4, m0                      ; [4 3 2 1 0 x - -]
    psrldq      m4, 4                       ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                       ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m4

    CALC_4x4    19, 6, 25, 12

    STORE_4x4
    RET
1508 | |
; Mode 23: counterpart of mode 13 with the two runs swapped. Main run at
; src + 2, preceded by src[0] and one projected sample (the word at src + 24).
; Stored without a transpose.
; src[0] is loaded from [r2] and shifted into place so that no bytes below
; src are read. Numbering: x = projected sample, 0 = src[0], 1..4 = main run.
cglobal intra_pred_ang4_23, 3,3,5
    movd        m4, [r2 + 22]               ; word 1 = projected sample x
    movd        m1, [r2]                    ; word 0 = src[0]
    pslldq      m1, 2                       ; src[0] -> word 1, word 0 = 0
    movh        m0, [r2 + 2]                ; [- - - - 4 3 2 1]
    punpcklwd   m4, m1                      ; low qword = [0 x - -]
    punpcklqdq  m4, m0                      ; [4 3 2 1 0 x - -]
    psrldq      m4, 4                       ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                       ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                      ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1

    CALC_4x4    23, 14, 5, 28

    STORE_4x4
    RET
1527 | |
; Mode 24: counterpart of mode 12 with the run taken from src + 2; pair window
; starts at src[0]. Stored without a transpose.
; src[0] is inserted with pinsrw so that no bytes below src are read.
; Sample numbering in the comments: 0 = src[0], 1..4 = run at src + 2.
cglobal intra_pred_ang4_24, 3,3,5
    movh        m0, [r2 + 2]                ; [x x x x 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                       ; [x x x 4 3 2 1 -]
    pinsrw      m1, [r2], 0                 ; [x x x 4 3 2 1 0]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    27, 22, 17, 12

    STORE_4x4
    RET
1542 | |
; Mode 25: counterpart of mode 11 with the run taken from src + 2; pair window
; starts at src[0]. Stored without a transpose.
; src[0] is inserted with pinsrw so that no bytes below src are read.
; Sample numbering in the comments: 0 = src[0], 1..4 = run at src + 2.
cglobal intra_pred_ang4_25, 3,3,5
    movh        m0, [r2 + 2]                ; [x x x x 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                       ; [x x x 4 3 2 1 -]
    pinsrw      m1, [r2], 0                 ; [x x x 4 3 2 1 0]
    punpcklwd   m1, m0                      ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    30, 28, 26, 24

    STORE_4x4
    RET
1557 | |
; Mode 26: every row is a copy of the four samples at src + 2.
; When the bFilter argument (r4m) is non-zero, the first column is then
; rewritten as src[1] + ((sample_y - src[0]) >> 1), sample_y taken from the run
; at src + 18, clamped to [0, pw_pixel_max].
; Four GPRs are declared because r3 holds the row-3 offset.
cglobal intra_pred_ang4_26, 3,4,3
    movh        m0, [r2 + 2]                ; [x x x x 4 3 2 1]
    add         r1, r1                      ; full-width: dstStride is intptr_t
    ; store
    movh        [r0], m0
    movh        [r0 + r1], m0
    movh        [r0 + r1 * 2], m0
    lea         r3, [r1 * 3]
    movh        [r0 + r3], m0

    ; filter
    cmp         r4m, byte 0
    jz          .quit

    pshuflw     m0, m0, 0x00                ; broadcast src[1]
    movd        m2, [r2]
    pshuflw     m2, m2, 0x00                ; broadcast src[0]
    movh        m1, [r2 + 18]               ; four samples of the other run
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1                      ; clamp below at 0
    pminsw      m0, [pw_pixel_max]          ; clamp above

    ; scatter the four filtered words down column 0
    movh        r2, m0                      ; 64-bit GPR move
    mov         [r0], r2w
    shr         r2, 16
    mov         [r0 + r1], r2w
    shr         r2, 16
    mov         [r0 + r1 * 2], r2w
    shr         r2, 16
    mov         [r0 + r3], r2w
.quit:
    RET
1593 | |
; Mode 27: run at src + 2; one shared pair window, weights 2/4/6/8.
; Stored without a transpose.
cglobal intra_pred_ang4_27, 3,3,5
    movu        m0, [r2 + 2]                ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    2, 4, 6, 8

    STORE_4x4
    RET
1607 | |
; Mode 28: run at src + 2; one shared pair window, weights 5/10/15/20.
; Stored without a transpose.
cglobal intra_pred_ang4_28, 3,3,5
    movu        m0, [r2 + 2]                ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    5, 10, 15, 20

    STORE_4x4
    RET
1622 | |
; Mode 29: run at src + 2; first three groups share one pair window, the
; fourth uses the next one. Stored without a transpose.
cglobal intra_pred_ang4_29, 3,3,5
    movu        m0, [r2 + 2]                ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    9, 18, 27, 4

    STORE_4x4
    RET
1638 | |
; Mode 30: run at src + 2; two pair windows, each used by two groups.
; Stored without a transpose.
cglobal intra_pred_ang4_30, 3,3,5
    movu        m0, [r2 + 2]                ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m3, m0
    psrldq      m0, 2
    punpcklwd   m3, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m2, m1                      ; groups 1,2 share the first window
    mova        m4, m3                      ; groups 3,4 share the second window

    CALC_4x4    13, 26, 7, 20

    STORE_4x4
    RET
1654 | |
; Mode 5: run at src + 18; groups 2 and 3 share one pair window. Transposed store.
cglobal intra_pred_ang4_5, 3,3,5
    movu        m0, [r2 + 18]               ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m2, m0
    psrldq      m0, 2
    punpcklwd   m2, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [7 6 6 5 5 4 4 3]
    mova        m3, m2                      ; third group reuses the second window

    CALC_4x4    17, 2, 19, 4

    TRANSPOSE_4x4

    STORE_4x4
    RET
1674 | |
; Mode 31: same windows and weights as mode 5, but the run is read from
; src + 2 and the result is stored without a transpose.
cglobal intra_pred_ang4_31, 3,3,5
    movu        m0, [r2 + 2]                ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m2, m0
    psrldq      m0, 2
    punpcklwd   m2, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [7 6 6 5 5 4 4 3]
    mova        m3, m2                      ; third group reuses the second window

    CALC_4x4    17, 2, 19, 4

    STORE_4x4
    RET
1692 | |
; Mode 32: same windows and weights as mode 4, but the run is read from
; src + 2 and the result is stored without a transpose.
cglobal intra_pred_ang4_32, 3,3,5
    movu        m0, [r2 + 2]                ; samples  [8 7 6 5 4 3 2 1]
    mova        m1, m0
    psrldq      m0, 2
    punpcklwd   m1, m0                      ; pairs    [5 4 4 3 3 2 2 1]
    mova        m2, m0
    psrldq      m0, 2
    punpcklwd   m2, m0                      ; pairs    [6 5 5 4 4 3 3 2]
    mova        m4, m0
    psrldq      m0, 2
    punpcklwd   m4, m0                      ; pairs    [7 6 6 5 5 4 4 3]
    mova        m3, m2                      ; third group reuses the second window

    CALC_4x4    21, 10, 31, 20

    STORE_4x4
    RET
1710 | |
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
;-----------------------------------------------------------------------------------
; 4x4 DC prediction on 16-bit samples.
;   r0 = dst, r1 = dstStride (rows addressed as r1 * 2 bytes), r2 = neighbour array,
;   r4 = filter flag. r3 and r5 are scratch.
; The block is filled with (sum of 4 above + 4 left + 4) >> 3. If filter is
; non-zero the first row and first column are smoothed against their neighbours.
; Neighbour loads are 8 bytes wide: only four samples are consumed from each.
INIT_XMM sse4
cglobal intra_pred_dc4, 5,6,2
    lea         r3, [r2 + 18]               ; r3 -> left[0]
    add         r2, 2                       ; r2 -> above[0]

    movh        m0, [r3]                    ; left[0..3]
    movh        m1, [r2]                    ; above[0..3]

    paddw       m0, m1
    pshufd      m1, m0, 1
    paddw       m0, m1
    phaddw      m0, m0                      ; word 0 = sum of all eight samples

    test        r4d, r4d                    ; ZF is consumed by "jz .end" below;
                                            ; nothing in between modifies flags

    pmulhrsw    m0, [pw_4096]               ; m0 = (sum + 4) / 8
    movd        r4d, m0                     ; r4d = dc_val
    movzx       r4d, r4w
    pshuflw     m0, m0, 0                   ; m0 = word [dc_val ...]

    ; store DC 4x4
    movh        [r0], m0
    movh        [r0 + r1 * 2], m0
    movh        [r0 + r1 * 4], m0
    lea         r5, [r0 + r1 * 4]
    movh        [r5 + r1 * 2], m0

    ; do DC filter
    jz          .end
    lea         r5d, [r4d * 2 + 2]          ; r5d = DC * 2 + 2
    add         r4d, r5d                    ; r4d = DC * 3 + 2
    movd        m0, r4d
    pshuflw     m0, m0, 0                   ; m0 = pixDCx3

    ; filter top
    movh        m1, [r2]                    ; above[0..3]
    paddw       m1, m0
    psrlw       m1, 2
    movh        [r0], m1                    ; top-left is overwritten again just below

    ; filter top-left: (left[0] + above[0] + 2 * DC + 2) >> 2
    movzx       r4d, word [r3]
    add         r5d, r4d
    movzx       r4d, word [r2]
    add         r4d, r5d
    shr         r4d, 2
    mov         [r0], r4w

    ; filter left (rows 1..3)
    lea         r0, [r0 + r1 * 2]
    movh        m1, [r3 + 2]                ; left[1..4]; a 16-byte load here would
                                            ; read past sample 16 of the array
    paddw       m1, m0
    psrlw       m1, 2
    movd        r3d, m1
    mov         [r0], r3w                   ; row 1
    shr         r3d, 16
    mov         [r0 + r1 * 2], r3w          ; row 2
    pextrw      [r0 + r1 * 4], m1, 2        ; row 3
.end:
    RET
1774 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; SSE2 4x4 planar prediction on 16-bit samples.
;   m3 = running row accumulator, m4 = per-row step (bottomLeft - above[x]),
;   m0 = column weights from pw_planar4_0 (after its use as a temporary).
INIT_XMM sse2
cglobal intra_pred_planar4, 3,3,5
    add         r1, r1                      ; stride for 2-byte samples
    movu        m1, [r2 + 2]                ; above row
    movu        m2, [r2 + 18]               ; left column
    pshufhw     m3, m1, 0
    pshufd      m3, m3, 0xAA                ; topRight in every lane
    pshufhw     m4, m2, 0
    pshufd      m4, m4, 0xAA                ; bottomLeft in every lane

    pmullw      m0, m1, [pw_3]              ; (blkSize - 1 - y) * above[x]
    pmullw      m3, [multi_2Row]            ; (x + 1) * topRight

    paddw       m3, [pw_4]                  ; rounding term
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                      ; per-row step
    mova        m0, [pw_planar4_0]

    ; row 0
    pshuflw     m1, m2, 0x00
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0], m1

    ; row 1
    pshuflw     m1, m2, 0x55
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]

    ; row 2
    pshuflw     m1, m2, 0xAA
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0], m1

    ; row 3
    pshuflw     m1, m2, 0xFF
    pmullw      m1, m0
    paddw       m1, m3
    psraw       m1, 3
    movh        [r0 + r1], m1
    RET
1823 | |
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
;-----------------------------------------------------------------------------------
; 8x8 DC prediction on 16-bit samples.
;   r0 = dst, r1 = dstStride (doubled on entry), r2 = neighbour array, r4 = filter
;   r3 -> left[0], r2 -> above[0] after setup; r5, r6 scratch.
; Fills the block with (sum of 8 above + 8 left + 8) >> 4 and, if filter is
; non-zero, smooths the first row and first column.
INIT_XMM sse4
cglobal intra_pred_dc8, 5, 7, 2
    lea         r3, [r2 + 34]               ; left[0]
    add         r2, 2                       ; above[0]
    add         r1, r1
    movu        m0, [r3]
    movu        m1, [r2]

    ; horizontal sum of all 16 samples
    paddw       m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    phaddw      m0, m0
    pmaddwd     m0, [pw_1]

    movd        r5d, m0
    add         r5d, 8
    shr         r5d, 4                      ; r5d = dc_val
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0                   ; m1 = dc_val in every word

    ; fill the 8x8 block
    mov         r6, r0                      ; keep the block origin for the filter
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0 + r1], m1

    ; edge filter requested?
    test        r4d, r4d
    jz          .end
    lea         r4d, [r5d * 2 + 2]          ; DC * 2 + 2
    add         r5d, r4d                    ; DC * 3 + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0                   ; m1 = DC * 3 + 2 in every word

    ; first row: (above[x] + 3 * DC + 2) >> 2
    movu        m0, [r2]
    paddw       m0, m1
    psrlw       m0, 2
    movu        [r6], m0

    ; corner: (left[0] + above[0] + 2 * DC + 2) >> 2
    movzx       r5d, word [r3]
    add         r4d, r5d
    movzx       r5d, word [r2]
    add         r5d, r4d
    shr         r5d, 2
    mov         [r6], r5w

    ; first column, rows 1..7: (left[y] + 3 * DC + 2) >> 2
    add         r6, r1
    movu        m0, [r3 + 2]
    paddw       m0, m1
    psrlw       m0, 2
    pextrw      [r6], m0, 0
    pextrw      [r6 + r1], m0, 1
    pextrw      [r6 + r1 * 2], m0, 2
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6 + r1], m0, 3
    pextrw      [r6 + r1 * 2], m0, 4
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6 + r1], m0, 5
    pextrw      [r6 + r1 * 2], m0, 6
.end:
    RET
1902 | |
;-----------------------------------------------------------------------------------
; intra_pred_dc16: 16x16 DC prediction on 16-bit samples.
;   r0 = dst, r1 = dstStride (doubled on entry), r2 = neighbour array, r4 = filter
;   r3 -> left[0], r2 -> above[0] after setup; r5, r6 scratch.
; Fills the block with (sum of 16 above + 16 left + 16) >> 5 and, if filter is
; non-zero, smooths the first row and first column.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 5, 7, 4
    lea         r3, [r2 + 66]               ; left[0]
    add         r2, 2                       ; above[0]
    add         r1, r1
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r2]
    movu        m3, [r2 + 16]

    ; horizontal sum of all 32 samples
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2
    movhlps     m1, m0
    paddw       m0, m1
    pmaddwd     m0, [pw_1]
    phaddd      m0, m0

    movd        r5d, m0
    add         r5d, 16
    shr         r5d, 5                      ; r5d = dc_val = (sum + 16) / 32
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0                   ; m1 = dc_val in every word

    ; fill the 16x16 block, two rows per step
    mov         r6, r0                      ; keep the block origin for the filter
%rep 7
    movu        [r0], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16 + r1], m1
    lea         r0, [r0 + r1 * 2]
%endrep
    movu        [r0], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16 + r1], m1

    ; edge filter requested?
    test        r4d, r4d
    jz          .end
    lea         r4d, [r5d * 2 + 2]          ; DC * 2 + 2
    add         r5d, r4d                    ; DC * 3 + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0                   ; m1 = DC * 3 + 2 in every word

    ; first row: (above[x] + 3 * DC + 2) >> 2
    movu        m2, [r2]
    paddw       m2, m1
    psrlw       m2, 2
    movu        [r6], m2
    movu        m3, [r2 + 16]
    paddw       m3, m1
    psrlw       m3, 2
    movu        [r6 + 16], m3

    ; corner: (left[0] + above[0] + 2 * DC + 2) >> 2
    movzx       r5d, word [r3]
    add         r4d, r5d
    movzx       r5d, word [r2]
    add         r5d, r4d
    shr         r5d, 2
    mov         [r6], r5w

    ; first column, rows 1..8
    add         r6, r1
    movu        m2, [r3 + 2]
    paddw       m2, m1
    psrlw       m2, 2

    pextrw      [r6], m2, 0
    pextrw      [r6 + r1], m2, 1
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m2, 2
    pextrw      [r6 + r1], m2, 3
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m2, 4
    pextrw      [r6 + r1], m2, 5
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m2, 6
    pextrw      [r6 + r1], m2, 7

    ; first column, rows 9..15
    lea         r6, [r6 + r1 * 2]
    movu        m3, [r3 + 18]
    paddw       m3, m1
    psrlw       m3, 2

    pextrw      [r6], m3, 0
    pextrw      [r6 + r1], m3, 1
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m3, 2
    pextrw      [r6 + r1], m3, 3
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m3, 4
    pextrw      [r6 + r1], m3, 5
    lea         r6, [r6 + r1 * 2]
    pextrw      [r6], m3, 6
.end:
    RET
2036 | |
;-----------------------------------------------------------------------------------
; intra_pred_dc32: 32x32 DC prediction on 16-bit samples (no edge filter).
;   r0 = dst, r1 = dstStride (doubled on entry), r2 = neighbour array
; Fills the block with (sum of the 32 samples at r2 + 2 and the 32 samples at
; r2 + 130, plus 32) >> 6.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc32, 3, 5, 6
    lea         r3, [r2 + 130]              ; second run of 32 samples
    add         r2, 2                       ; first run of 32 samples
    add         r1, r1

    ; per-lane sums of the second run
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r3 + 32]
    movu        m3, [r3 + 48]
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2

    ; per-lane sums of the first run
    movu        m1, [r2]
    movu        m3, [r2 + 16]
    movu        m4, [r2 + 32]
    movu        m5, [r2 + 48]
    paddw       m1, m3
    paddw       m4, m5
    paddw       m1, m4

    ; reduce to a single 32-bit total
    paddw       m0, m1
    pmaddwd     m0, [pw_1]
    movhlps     m1, m0
    paddd       m0, m1
    phaddd      m0, m0

    paddd       m0, [pd_32]                 ; + 32 for rounding
    psrld       m0, 6                       ; / 64
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0                   ; dc_val in every word

    lea         r2, [r1 * 3]                ; r2 is free now: offset of the 4th row
    mov         r3d, 4                      ; 4 iterations x 8 rows
.loop:
    ; each %rep pass writes four full rows and advances dst by four rows
%rep 2
    movu        [r0 + 0], m0
    movu        [r0 + 16], m0
    movu        [r0 + 32], m0
    movu        [r0 + 48], m0
    movu        [r0 + r1 + 0], m0
    movu        [r0 + r1 + 16], m0
    movu        [r0 + r1 + 32], m0
    movu        [r0 + r1 + 48], m0
    movu        [r0 + r1 * 2 + 0], m0
    movu        [r0 + r1 * 2 + 16], m0
    movu        [r0 + r1 * 2 + 32], m0
    movu        [r0 + r1 * 2 + 48], m0
    movu        [r0 + r2 + 0], m0
    movu        [r0 + r2 + 16], m0
    movu        [r0 + r2 + 32], m0
    movu        [r0 + r2 + 48], m0
    lea         r0, [r0 + r1 * 4]
%endrep
    dec         r3d
    jnz         .loop
    RET
2111 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; SSE4 4x4 planar prediction on 16-bit samples.
;   r0 = dst, r1 = dstStride (doubled on entry), r2 = srcPix
;   m0 = column weights (pw_planar4_0), m2 = left column,
;   m3 = running row accumulator, m4 = per-row step (bottomLeft - above[x])
; The accumulator is not advanced after the last row since nothing reads it.
INIT_XMM sse4
cglobal intra_pred_planar4, 3,3,5
    add         r1, r1
    movu        m1, [r2 + 2]                ; above row
    movu        m2, [r2 + 18]               ; left column
    pshufhw     m3, m1, 0                   ; topRight
    pshufd      m3, m3, 0xAA
    pshufhw     m4, m2, 0                   ; bottomLeft
    pshufd      m4, m4, 0xAA

    pmullw      m3, [multi_2Row]            ; (x + 1) * topRight
    pmullw      m0, m1, [pw_3]              ; (blkSize - 1 - y) * above[x]

    paddw       m3, [pw_4]                  ; rounding term
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                      ; per-row step
    mova        m0, [pw_planar4_0]

    ; row 0
    pshuflw     m1, m2, 0
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0], m1

    ; row 1
    pshuflw     m1, m2, 01010101b
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]

    ; row 2
    pshuflw     m1, m2, 10101010b
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0], m1

    ; row 3
    pshuflw     m1, m2, 11111111b
    pmullw      m1, m0
    paddw       m1, m3
    psraw       m1, 3
    movh        [r0 + r1], m1
    RET
2163 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; SSE4 8x8 planar prediction on 16-bit samples.
;   r0 = dst, r1 = dstStride (doubled on entry), r2 = srcPix
;   m0 = column weights (second half of pw_planar16_mul), m2 = left[0..7],
;   m3 = running row accumulator, m4 = per-row step (bottomLeft - above[x])
INIT_XMM sse4
cglobal intra_pred_planar8, 3,3,5
    add         r1, r1
    movu        m1, [r2 + 2]                ; above[0..7]
    movu        m2, [r2 + 34]               ; left[0..7]

    movd        m3, [r2 + 18]               ; topRight = above[8]
    movd        m4, [r2 + 50]               ; bottomLeft = left[8]

    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0                   ; topRight in every word
    pshuflw     m4, m4, 0
    pshufd      m4, m4, 0                   ; bottomLeft in every word

    pmullw      m3, [multiL]                ; (x + 1) * topRight
    pmullw      m0, m1, [pw_7]              ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_8]                  ; rounding term
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                      ; per-row step
    mova        m0, [pw_planar16_mul + mmsize]

; Emit output row %1 (0..7): broadcast left[%1], weight it, add the
; accumulator, shift, store, then step the accumulator and dst to the next row.
%macro INTRA_PRED_PLANAR8 1
%if (%1 < 4)
    pshuflw     m1, m2, 0x55 * %1
    pshufd      m1, m1, 0
%else
    pshufhw     m1, m2, 0x55 * (%1 - 4)
    pshufd      m1, m1, 0xAA
%endif
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 4
    movu        [r0], m1
    lea         r0, [r0 + r1]
%endmacro

%assign x 0
%rep 8
    INTRA_PRED_PLANAR8 x
%assign x x+1
%endrep
    RET
2214 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; SSE4 16x16 planar prediction on 16-bit samples; each row is handled as two
; 8-word halves.
;   r0 = dst, r1 = dstStride (doubled on entry), r2 = srcPix
;   m3 / m4 = running accumulators for columns 0..7 / 8..15
;   m6 / m1 = per-row steps for columns 0..7 / 8..15
;   m2 / m7 = left[0..7] / left[8..15] (after the setup phase)
INIT_XMM sse4
cglobal intra_pred_planar16, 3,3,8
    add         r1, r1
    movu        m2, [r2 + 2]                ; above[0..7]
    movu        m7, [r2 + 18]               ; above[8..15]

    movd        m3, [r2 + 34]               ; topRight = above[16]
    movd        m6, [r2 + 98]               ; bottomLeft = left[16]

    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0                   ; topRight in every word
    pshuflw     m6, m6, 0
    pshufd      m6, m6, 0                   ; bottomLeft in every word

    pmullw      m4, m3, [multiH]            ; (x + 1) * topRight, columns 8..15
    pmullw      m3, [multiL]                ; (x + 1) * topRight, columns 0..7
    pmullw      m1, m2, [pw_15]             ; (blkSize - 1 - y) * above[x], columns 0..7
    pmullw      m5, m7, [pw_15]             ; (blkSize - 1 - y) * above[x], columns 8..15
    paddw       m3, [pw_16]                 ; rounding term
    paddw       m4, [pw_16]
    paddw       m3, m6
    paddw       m4, m6
    paddw       m3, m1
    paddw       m4, m5
    psubw       m1, m6, m7                  ; per-row step, columns 8..15
    psubw       m6, m2                      ; per-row step, columns 0..7

    movu        m2, [r2 + 66]               ; left[0..7]
    movu        m7, [r2 + 82]               ; left[8..15]

; Emit output row %1 (0..15): broadcast left[%1] into m5, weight both halves,
; add the accumulators, shift, store, then step the accumulators and dst.
%macro INTRA_PRED_PLANAR16 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%elif (%1 < 8)
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%elif (%1 < 12)
    pshuflw     m5, m7, 0x55 * (%1 - 8)
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m7, 0x55 * (%1 - 12)
    pshufd      m5, m5, 0xAA
%endif
    pmullw      m0, m5, [pw_planar16_mul + mmsize]
    pmullw      m5, [pw_planar16_mul]
    paddw       m0, m4
    paddw       m5, m3
    paddw       m3, m6
    paddw       m4, m1
    psraw       m5, 5
    psraw       m0, 5
    movu        [r0], m5
    movu        [r0 + 16], m0
    lea         r0, [r0 + r1]
%endmacro

%assign x 0
%rep 16
    INTRA_PRED_PLANAR16 x
%assign x x+1
%endrep
    RET
2296 | |
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 variant on 16-bit pixels, computed with 32-bit lanes (4 columns per
; vector, 8 vectors per row).
;   accumulators : m0-m3 and m12-m15  (row-invariant part of the sum)
;   row steps    : m8-m11 and m16-m19 (bottomLeft - above[x])
; Vector names that are not backed by a register are %define'd to stack slots,
; so they may only be used as a memory operand.
;---------------------------------------------------------------------------------------

; Add bottomLeft (m6) and the rounding constant to two accumulators, going
; through m4/m5 because the accumulators may live in memory.
%macro PLANAR32_ADD_BIAS_SPILLED 2              ; %1, %2 = accumulator indices
    mova            m4, m%1
    mova            m5, m%2
    paddd           m4, m6
    paddd           m5, m6
    paddd           m4, [pd_32]
    paddd           m5, [pd_32]
    mova            m%1, m4
    mova            m%2, m5
%endmacro

; Fold four above[] samples into a register accumulator and record its row step.
%macro PLANAR32_ABOVE_DIRECT 3                  ; %1 = accumulator, %2 = step slot, %3 = byte offset from r2
    pmovzxwd        m4, [r2 + %3]
    pmulld          m5, m4, [pd_31]             ; above[x] * (blkSize - 1 - y)
    paddd           m%1, m5
    psubd           m5, m6, m4                  ; bottomLeft - above[x]
    mova            m%2, m5
%endmacro

; Same as above for an accumulator that may live in memory (staged through m7).
%macro PLANAR32_ABOVE_SPILLED 3                 ; %1 = accumulator, %2 = step slot, %3 = byte offset from r2
    pmovzxwd        m4, [r2 + %3]
    mova            m7, m%1
    pmulld          m5, m4, [pd_31]             ; above[x] * (blkSize - 1 - y)
    paddd           m7, m5
    mova            m%1, m7
    psubd           m5, m6, m4                  ; bottomLeft - above[x]
    mova            m%2, m5
%endmacro

INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,7,16
    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
    mov             r6, rsp                     ; restored right before RET
    sub             rsp, 4*mmsize
    and             rsp, ~63
    %define         m16     [rsp + 0 * mmsize]
    %define         m17     [rsp + 1 * mmsize]
    %define         m18     [rsp + 2 * mmsize]
    %define         m19     [rsp + 3 * mmsize]
%else
cglobal intra_pred_planar32, 3,7,8
    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
    mov             r6, rsp                     ; restored right before RET
    sub             rsp, 12*mmsize
    and             rsp, ~63
    %define         m8      [rsp + 0 * mmsize]
    %define         m9      [rsp + 1 * mmsize]
    %define         m10     [rsp + 2 * mmsize]
    %define         m11     [rsp + 3 * mmsize]
    %define         m12     [rsp + 4 * mmsize]
    %define         m13     [rsp + 5 * mmsize]
    %define         m14     [rsp + 6 * mmsize]
    %define         m15     [rsp + 7 * mmsize]
    %define         m16     [rsp + 8 * mmsize]
    %define         m17     [rsp + 9 * mmsize]
    %define         m18     [rsp + 10 * mmsize]
    %define         m19     [rsp + 11 * mmsize]
%endif
    add             r1, r1                      ; stride: pixels -> bytes
    lea             r5, [planar32_table1]

    movzx           r3d, word [r2 + 66]         ; topRight = above[32]
    movd            m7, r3d
    pshufd          m7, m7, 0                   ; v_topRight

    pmulld          m0, m7, [r5 +   0]          ; (x + 1) * topRight
    pmulld          m1, m7, [r5 +  16]
    pmulld          m2, m7, [r5 +  32]
    pmulld          m3, m7, [r5 +  48]
    pmulld          m4, m7, [r5 +  64]
    pmulld          m5, m7, [r5 +  80]
    pmulld          m6, m7, [r5 +  96]
    pmulld          m7, m7, [r5 + 112]

    ; park columns 16-31 in their accumulators before m6 is reused
    mova            m12, m4
    mova            m13, m5
    mova            m14, m6
    mova            m15, m7

    movzx           r3d, word [r2 + 194]        ; bottomLeft = left[32]
    movd            m6, r3d
    pshufd          m6, m6, 0                   ; v_bottomLeft

    ; + bottomLeft + rounding, columns 0-15
    paddd           m0, m6
    paddd           m0, [pd_32]
    paddd           m1, m6
    paddd           m1, [pd_32]
    paddd           m2, m6
    paddd           m2, [pd_32]
    paddd           m3, m6
    paddd           m3, [pd_32]

    ; + bottomLeft + rounding, columns 16-31
    PLANAR32_ADD_BIAS_SPILLED 12, 13
    PLANAR32_ADD_BIAS_SPILLED 14, 15

    ; above[x] * (blkSize - 1 - y) and the per-row steps, four columns at a time
    PLANAR32_ABOVE_DIRECT   0,  8,  2           ; above[0-3]
    PLANAR32_ABOVE_DIRECT   1,  9, 10           ; above[4-7]
    PLANAR32_ABOVE_DIRECT   2, 10, 18           ; above[8-11]
    PLANAR32_ABOVE_DIRECT   3, 11, 26           ; above[12-15]
    PLANAR32_ABOVE_SPILLED 12, 16, 34           ; above[16-19]
    PLANAR32_ABOVE_SPILLED 13, 17, 42           ; above[20-23]
    PLANAR32_ABOVE_SPILLED 14, 18, 50           ; above[24-27]
    PLANAR32_ABOVE_SPILLED 15, 19, 58           ; above[28-31]

    add             r2, 130                     ; (2 * blkSize + 1)
    lea             r5, [planar32_table]

; Emit one row: broadcast the sample at [r2] into m4, weight it with the
; table at r5, add the accumulators, shift by 6, pack to words and store
; 32 pixels; then step every accumulator, dst (r0) and the sample pointer (r2).
%macro INTRA_PRED_PLANAR32 0
    movzx           r3d, word [r2]
    movd            m4, r3d
    pshufd          m4, m4, 0

    ; columns 0-7
    pmulld          m5, m4, [r5]
    pmulld          m6, m4, [r5 + 16]
    paddd           m5, m0
    paddd           m6, m1
    psrad           m5, 6
    psrad           m6, 6
    packusdw        m5, m6
    movu            [r0], m5
    paddd           m0, m8
    paddd           m1, m9

    ; columns 8-15
    pmulld          m5, m4, [r5 + 32]
    pmulld          m6, m4, [r5 + 48]
    paddd           m5, m2
    paddd           m6, m3
    psrad           m5, 6
    psrad           m6, 6
    packusdw        m5, m6
    movu            [r0 + 16], m5
    paddd           m2, m10
    paddd           m3, m11

    ; columns 16-23
    pmulld          m5, m4, [r5 + 64]
    pmulld          m6, m4, [r5 + 80]
    paddd           m5, m12
    paddd           m6, m13
    psrad           m5, 6
    psrad           m6, 6
    packusdw        m5, m6
    movu            [r0 + 32], m5
    mova            m5, m12
    mova            m6, m13
    paddd           m5, m16
    paddd           m6, m17
    mova            m12, m5
    mova            m13, m6

    ; columns 24-31 (m4 is consumed here)
    pmulld          m5, m4, [r5 + 96]
    pmulld          m4, [r5 + 112]
    paddd           m5, m14
    paddd           m4, m15
    psrad           m5, 6
    psrad           m4, 6
    packusdw        m5, m4
    movu            [r0 + 48], m5
    mova            m4, m14
    mova            m5, m15
    paddd           m4, m18
    paddd           m5, m19
    mova            m14, m4
    mova            m15, m5

    lea             r0, [r0 + r1]
    add             r2, 2
%endmacro

    mov             r4d, 8                      ; 8 iterations x 4 rows
.loop:
%rep 4
    INTRA_PRED_PLANAR32
%endrep
    dec             r4
    jnz             .loop
    mov             rsp, r6                     ; drop the aligned scratch area
    RET
2520 | |
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
; Pure diagonal copy: row y is the four samples starting y words after the
; base.  The base is src + 4 bytes when dirMode == 34 and src + 20 bytes
; otherwise.
cglobal intra_pred_ang4_2, 3,5,4
    add             r1, r1                      ; stride: pixels -> bytes
    lea             r4, [r2 + 4]                ; base used when dirMode == 34
    add             r2, 20                      ; base used otherwise
    cmp             r3m, byte 34
    cmove           r2, r4

    movu            m0, [r2]
    palignr         m1, m0, 2                   ; low half = samples 1..4
    palignr         m2, m0, 4                   ; low half = samples 2..5
    movh            [r0], m0
    movh            [r0 + r1], m1
    movh            [r0 + r1 * 2], m2
    psrldq          m0, 6                       ; low half = samples 3..6
    lea             r1, [r1 * 3]
    movh            [r0 + r1], m0
    RET
2542 | |
INIT_XMM sse4
; intra_pred_ang4_3: 4x4 angular prediction with per-line fractions 26/20/14/8.
; Reads the reference at src + 2 when dirMode == 33, at src + 18 otherwise.
;
; This function also owns the tail shared by the other 4x4 angular routines,
; which jump to .do_filter4x4 with:
;   m2-m5 = interleaved sample pairs for the four lines
;   m0, m1, m6, m7 = the ang_table rows (weights) for those lines
;   ZF    = result of the caller's "cmp r3m, byte <mode>"; when set, the
;           4x4 transpose is skipped.
; Nothing between that compare and the "jz .store" below may modify EFLAGS.
cglobal intra_pred_ang4_3, 3,5,8
    cmp             r3m, byte 33                ; r3m may alias r3: compare before r3 is overwritten
    mov             r3, 18
    mov             r4, 2
    cmove           r3, r4

    movu            m0, [r2 + r3]               ; [8 7 6 5 4 3 2 1]
    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    palignr         m5, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd       m3, m1, m5                  ; [6 5 5 4 4 3 3 2]
    palignr         m1, m0, 6                   ; [x x x 8 7 6 5 4]
    punpcklwd       m4, m5, m1                  ; [7 6 6 5 5 4 4 3]
    movhlps         m0, m0                      ; [x x x x 8 7 6 5]
    punpcklwd       m5, m1, m0                  ; [8 7 7 6 6 5 5 4]

    lea             r3, [ang_table + 20 * 16]
    mova            m7, [r3 - 12 * 16]          ; [ 8]
    mova            m6, [r3 -  6 * 16]          ; [14]
    mova            m1, [r3]                    ; [20]
    mova            m0, [r3 +  6 * 16]          ; [26]
    jmp            .do_filter4x4

ALIGN 16
.do_filter4x4:
    ; weighted sum of each sample pair, rounded and scaled: (a*w0 + b*w1 + 16) >> 5
    pmaddwd         m2, m0
    pmaddwd         m3, m1
    pmaddwd         m4, m6
    pmaddwd         m5, m7
    paddd           m2, [pd_16]
    paddd           m3, [pd_16]
    paddd           m4, [pd_16]
    paddd           m5, [pd_16]
    psrld           m2, 5
    psrld           m3, 5
    psrld           m4, 5
    psrld           m5, 5
    packusdw        m2, m3                      ; lines 0 and 1
    packusdw        m4, m5                      ; lines 2 and 3

    jz             .store                       ; ZF still comes from the caller's cmp

    ; transpose 4x4
    punpckhwd       m0, m2, m4
    punpcklwd       m2, m4
    punpckhwd       m4, m2, m0
    punpcklwd       m2, m0

.store:
    movh            [r0], m2
    add             r1, r1                      ; stride: pixels -> bytes
    movhps          [r0 + r1], m2
    movh            [r0 + r1 * 2], m4
    lea             r1, [r1 * 3]
    movhps          [r0 + r1], m4
    RET
2603 | |
; intra_pred_ang4_4: 4x4 angular prediction with per-line fractions 21/10/31/20.
; Reads the reference at src + 2 when dirMode == 32, at src + 18 otherwise,
; then finishes in the shared tail of intra_pred_ang4_3 (which consumes ZF
; from the compare below, so no flag-writing instruction may follow it).
cglobal intra_pred_ang4_4, 3,5,8
    cmp             r3m, byte 32                ; r3m may alias r3: compare before r3 is overwritten
    mov             r3, 18
    mov             r4, 2
    cmove           r3, r4

    movu            m0, [r2 + r3]               ; [8 7 6 5 4 3 2 1]
    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    palignr         m7, m0, 6                   ; [x x x 8 7 6 5 4]
    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    punpcklwd       m3, m1, m6                  ; [6 5 5 4 4 3 3 2]
    punpcklwd       m5, m6, m7                  ; [7 6 6 5 5 4 4 3]
    mova            m4, m3                      ; line 2 uses the same pairs as line 1

    lea             r3, [ang_table + 18 * 16]
    mova            m1, [r3 -  8 * 16]          ; [10]
    mova            m7, [r3 +  2 * 16]          ; [20]
    mova            m0, [r3 +  3 * 16]          ; [21]
    mova            m6, [r3 + 13 * 16]          ; [31]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2625 | |
; intra_pred_ang4_5: 4x4 angular prediction with per-line fractions 17/2/19/4.
; Reads the reference at src + 2 when dirMode == 31, at src + 18 otherwise,
; then finishes in the shared tail of intra_pred_ang4_3 (which consumes ZF
; from the compare below, so no flag-writing instruction may follow it).
cglobal intra_pred_ang4_5, 3,5,8
    cmp             r3m, byte 31                ; r3m may alias r3: compare before r3 is overwritten
    mov             r3, 18
    mov             r4, 2
    cmove           r3, r4

    movu            m0, [r2 + r3]               ; [8 7 6 5 4 3 2 1]
    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    palignr         m7, m0, 6                   ; [x x x 8 7 6 5 4]
    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    punpcklwd       m3, m1, m6                  ; [6 5 5 4 4 3 3 2]
    punpcklwd       m5, m6, m7                  ; [7 6 6 5 5 4 4 3]
    mova            m4, m3                      ; line 2 uses the same pairs as line 1

    lea             r3, [ang_table + 10 * 16]
    mova            m1, [r3 - 8 * 16]           ; [ 2]
    mova            m7, [r3 - 6 * 16]           ; [ 4]
    mova            m0, [r3 + 7 * 16]           ; [17]
    mova            m6, [r3 + 9 * 16]           ; [19]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2647 | |
; intra_pred_ang4_6: 4x4 angular prediction with per-line fractions 13/26/7/20.
; Reads the reference at src + 2 when dirMode == 30, at src + 18 otherwise,
; then finishes in the shared tail of intra_pred_ang4_3 (which consumes ZF
; from the compare below, so no flag-writing instruction may follow it).
cglobal intra_pred_ang4_6, 3,5,8
    cmp             r3m, byte 30                ; r3m may alias r3: compare before r3 is overwritten
    mov             r3, 18
    mov             r4, 2
    cmove           r3, r4

    movu            m0, [r2 + r3]               ; [8 7 6 5 4 3 2 1]
    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    punpcklwd       m4, m1, m6                  ; [6 5 5 4 4 3 3 2]
    mova            m3, m2                      ; line 1 shares the pairs of line 0
    mova            m5, m4                      ; line 3 shares the pairs of line 2

    lea             r3, [ang_table + 19 * 16]
    mova            m6, [r3 - 12 * 16]          ; [ 7]
    mova            m0, [r3 -  6 * 16]          ; [13]
    mova            m7, [r3 +  1 * 16]          ; [20]
    mova            m1, [r3 +  7 * 16]          ; [26]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2668 | |
; intra_pred_ang4_7: 4x4 angular prediction with per-line fractions 9/18/27/4.
; Reads the reference at src + 2 when dirMode == 29, at src + 18 otherwise,
; then finishes in the shared tail of intra_pred_ang4_3 (which consumes ZF
; from the compare below, so no flag-writing instruction may follow it).
cglobal intra_pred_ang4_7, 3,5,8
    cmp             r3m, byte 29                ; r3m may alias r3: compare before r3 is overwritten
    mov             r3, 18
    mov             r4, 2
    cmove           r3, r4

    movu            m0, [r2 + r3]               ; [8 7 6 5 4 3 2 1]
    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    palignr         m6, m0, 4                   ; [x x 8 7 6 5 4 3]
    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    punpcklwd       m5, m1, m6                  ; [6 5 5 4 4 3 3 2]
    mova            m3, m2                      ; lines 1 and 2 share the pairs of line 0
    mova            m4, m2

    lea             r3, [ang_table + 20 * 16]
    mova            m7, [r3 - 16 * 16]          ; [ 4]
    mova            m0, [r3 - 11 * 16]          ; [ 9]
    mova            m1, [r3 -  2 * 16]          ; [18]
    mova            m6, [r3 +  7 * 16]          ; [27]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2689 | |
; intra_pred_ang4_8: 4x4 angular prediction with per-line fractions 5/10/15/20.
; Reads the reference at src + 2 when dirMode == 28, at src + 18 otherwise,
; then finishes in the shared tail of intra_pred_ang4_3 (which consumes ZF
; from the compare below, so no flag-writing instruction may follow it).
cglobal intra_pred_ang4_8, 3,5,8
    cmp             r3m, byte 28                ; r3m may alias r3: compare before r3 is overwritten
    mov             r3, 18
    mov             r4, 2
    cmove           r3, r4

    movu            m0, [r2 + r3]               ; [8 7 6 5 4 3 2 1]
    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    mova            m5, m2                      ; all four lines use the same pairs
    mova            m4, m2
    mova            m3, m2

    lea             r3, [ang_table + 13 * 16]
    mova            m0, [r3 - 8 * 16]           ; [ 5]
    mova            m1, [r3 - 3 * 16]           ; [10]
    mova            m6, [r3 + 2 * 16]           ; [15]
    mova            m7, [r3 + 7 * 16]           ; [20]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2709 | |
; intra_pred_ang4_9: 4x4 angular prediction with per-line fractions 2/4/6/8.
; Reads the reference at src + 2 when dirMode == 27, at src + 18 otherwise,
; then finishes in the shared tail of intra_pred_ang4_3 (which consumes ZF
; from the compare below, so no flag-writing instruction may follow it).
cglobal intra_pred_ang4_9, 3,5,8
    cmp             r3m, byte 27                ; r3m may alias r3: compare before r3 is overwritten
    mov             r3, 18
    mov             r4, 2
    cmove           r3, r4

    movu            m0, [r2 + r3]               ; [8 7 6 5 4 3 2 1]
    palignr         m1, m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd       m2, m0, m1                  ; [5 4 4 3 3 2 2 1]
    mova            m5, m2                      ; all four lines use the same pairs
    mova            m4, m2
    mova            m3, m2

    lea             r3, [ang_table + 4 * 16]
    mova            m0, [r3 - 2 * 16]           ; [ 2]
    mova            m1, [r3 - 0 * 16]           ; [ 4]
    mova            m6, [r3 + 2 * 16]           ; [ 6]
    mova            m7, [r3 + 4 * 16]           ; [ 8]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2729 | |
; intra_pred_ang4_10: every output row y is the sample at src + 18 + 2*y
; replicated four times.  When bFilter (r4m) is non-zero, row 0 additionally
; receives ((src[1 + x] - src[0]) >> 1) per column and is clamped to
; [0, pw_pixel_max].
cglobal intra_pred_ang4_10, 3,3,4
    add             r1, r1                              ; stride: pixels -> bytes
    movh            m0, [r2 + 18]                       ; [4 3 2 1]
    pshufb          m2, m0, [pb_unpackwq2]              ; [4 4 4 4 3 3 3 3]
    pshufb          m0, [pb_unpackwq1]                  ; [2 2 2 2 1 1 1 1]
    movhlps         m1, m0                              ; [2 2 2 2]
    movhlps         m3, m2                              ; [4 4 4 4]

    ; rows 1..3 never change; row 0 (low half of m0) is stored last
    movh            [r0 + r1], m1
    movh            [r0 + r1 * 2], m2
    lea             r1, [r1 * 3]
    movh            [r0 + r1], m3

    cmp             r4m, byte 0
    jz             .quit

    ; filter
    movu            m1, [r2]                            ; [7 6 5 4 3 2 1 0]
    pshufb          m2, m1, [pb_unpackwq1]              ; [0 0 0 0]
    palignr         m1, m1, 2                           ; [4 3 2 1]
    psubw           m1, m2
    psraw           m1, 1
    paddw           m0, m1
    pxor            m1, m1
    pmaxsw          m0, m1                              ; clamp below at 0
    pminsw          m0, [pw_pixel_max]                  ; clamp above at pixel max
.quit:
    movh            [r0], m0
    RET
2758 | |
; intra_pred_ang4_26: every output row is a copy of the four samples at
; src + 2.  When bFilter (r4m) is non-zero, column 0 is then rewritten as
; sample[0] + ((side[y] - src[0]) >> 1), clamped to [0, pw_pixel_max], where
; side[] are the four words starting at src + 18.
cglobal intra_pred_ang4_26, 3,4,3
    add             r1, r1                              ; stride: pixels -> bytes
    lea             r3, [r1 * 3]
    movh            m0, [r2 + 2]                        ; [8 7 6 5 4 3 2 1]

    ; store
    movh            [r0], m0
    movh            [r0 + r1], m0
    movh            [r0 + r1 * 2], m0
    movh            [r0 + r3], m0

    ; filter
    cmp             r4m, byte 0
    jz             .quit

    movu            m1, [r2 + 16]
    pinsrw          m1, [r2], 0                         ; [7 6 5 4 3 2 1 0]
    pshufb          m0, [pb_unpackwq1]                  ; [2 2 2 2 1 1 1 1]
    pshufb          m2, m1, [pb_unpackwq1]              ; [0 0 0 0]
    palignr         m1, m1, 2                           ; [4 3 2 1]
    psubw           m1, m2
    psraw           m1, 1
    paddw           m0, m1
    pxor            m1, m1
    pmaxsw          m0, m1                              ; clamp below at 0
    pminsw          m0, [pw_pixel_max]                  ; clamp above at pixel max

    ; overwrite the first pixel of each row
    pextrw          [r0], m0, 0
    pextrw          [r0 + r1], m0, 1
    pextrw          [r0 + r1 * 2], m0, 2
    pextrw          [r0 + r3], m0, 3
.quit:
    RET
2791 | |
; intra_pred_ang4_11: 4x4 angular prediction with per-line fractions 30/28/26/24.
; Reads the reference at src + 0 when dirMode == 25, at src + 16 otherwise;
; word 0 is always replaced by src[0].  Finishes in the shared tail of
; intra_pred_ang4_3, which consumes ZF from the compare below, so no
; flag-writing instruction may follow it.
cglobal intra_pred_ang4_11, 3,5,8
    xor             r4, r4                      ; must precede the cmp (xor writes flags)
    cmp             r3m, byte 25
    mov             r3, 16                      ; r3m may alias r3: overwrite only after the cmp
    cmove           r3, r4

    movu            m2, [r2 + r3]               ; [x x x 4 3 2 1 0]
    pinsrw          m2, [r2], 0
    palignr         m1, m2, 2                   ; [x x x x 4 3 2 1]
    punpcklwd       m2, m1                      ; [4 3 3 2 2 1 1 0]
    mova            m5, m2                      ; all four lines use the same pairs
    mova            m4, m2
    mova            m3, m2

    lea             r3, [ang_table + 24 * 16]
    mova            m7, [r3 + 0 * 16]           ; [24]
    mova            m6, [r3 + 2 * 16]           ; [26]
    mova            m1, [r3 + 4 * 16]           ; [28]
    mova            m0, [r3 + 6 * 16]           ; [30]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2812 | |
; intra_pred_ang4_12: 4x4 angular prediction with per-line fractions 27/22/17/12.
; Reads the reference at src + 0 when dirMode == 24, at src + 16 otherwise;
; word 0 is always replaced by src[0].  Finishes in the shared tail of
; intra_pred_ang4_3, which consumes ZF from the compare below, so no
; flag-writing instruction may follow it.
cglobal intra_pred_ang4_12, 3,5,8
    xor             r4, r4                      ; must precede the cmp (xor writes flags)
    cmp             r3m, byte 24
    mov             r3, 16                      ; r3m may alias r3: overwrite only after the cmp
    cmove           r3, r4

    movu            m2, [r2 + r3]               ; [x x x 4 3 2 1 0]
    pinsrw          m2, [r2], 0
    palignr         m1, m2, 2                   ; [x x x x 4 3 2 1]
    punpcklwd       m2, m1                      ; [4 3 3 2 2 1 1 0]
    mova            m5, m2                      ; all four lines use the same pairs
    mova            m4, m2
    mova            m3, m2

    lea             r3, [ang_table + 20 * 16]
    mova            m7, [r3 - 8 * 16]           ; [12]
    mova            m6, [r3 - 3 * 16]           ; [17]
    mova            m1, [r3 + 2 * 16]           ; [22]
    mova            m0, [r3 + 7 * 16]           ; [27]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2833 | |
; intra_pred_ang4_13: 4x4 angular prediction with per-line fractions 23/14/5/28.
; After the selection below, r4 is the byte offset of the main reference and
; r3 the byte offset of the other side (0/16, swapped when dirMode != 23).
; One extra sample (word 4 of the other side) is prepended to the main
; reference for the last line.  Finishes in the shared tail of
; intra_pred_ang4_3, which consumes ZF from the compare below.
cglobal intra_pred_ang4_13, 3,5,8
    xor             r4, r4                      ; must precede the cmp (xor writes flags)
    cmp             r3m, byte 23
    mov             r3, 16                      ; r3m may alias r3: overwrite only after the cmp
    jz             .next
    xchg            r3, r4                      ; xchg leaves flags untouched
.next:
    ; NOTE(review): when dirMode == 23, r4 is 0 and this load starts 2 bytes
    ; before src; that word is replaced before use, but the read itself
    ; assumes the memory is accessible - confirm against the caller's buffer.
    movu            m5, [r2 + r4 - 2]           ; [x x 4 3 2 1 0 x]
    pinsrw          m5, [r2], 1
    palignr         m2, m5, 2                   ; [x x x 4 3 2 1 0]
    palignr         m0, m5, 4                   ; [x x x x 4 3 2 1]
    pinsrw          m5, [r2 + r3 + 8], 0        ; prepend the projected sample
    punpcklwd       m5, m2                      ; [3 2 2 1 1 0 0 x]
    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
    mova            m4, m2                      ; lines 1 and 2 share the pairs of line 0
    mova            m3, m2

    lea             r3, [ang_table + 21 * 16]
    mova            m6, [r3 - 16 * 16]          ; [ 5]
    mova            m1, [r3 -  7 * 16]          ; [14]
    mova            m0, [r3 +  2 * 16]          ; [23]
    mova            m7, [r3 +  7 * 16]          ; [28]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2857 | |
; intra_pred_ang4_14: 4x4 angular prediction with per-line fractions 19/6/25/12.
; After the selection below, r4 is the byte offset of the main reference and
; r3 the byte offset of the other side (0/16, swapped when dirMode != 22).
; One extra sample (word 2 of the other side) is prepended to the main
; reference for the last two lines.  Finishes in the shared tail of
; intra_pred_ang4_3, which consumes ZF from the compare below.
cglobal intra_pred_ang4_14, 3,5,8
    xor             r4, r4                      ; must precede the cmp (xor writes flags)
    cmp             r3m, byte 22
    mov             r3, 16                      ; r3m may alias r3: overwrite only after the cmp
    jz             .next
    xchg            r3, r4                      ; xchg leaves flags untouched
.next:
    ; NOTE(review): when dirMode == 22, r4 is 0 and this load starts 2 bytes
    ; before src; that word is replaced before use, but the read itself
    ; assumes the memory is accessible - confirm against the caller's buffer.
    movu            m5, [r2 + r4 - 2]           ; [x x 4 3 2 1 0 x]
    pinsrw          m5, [r2], 1
    palignr         m2, m5, 2                   ; [x x x 4 3 2 1 0]
    palignr         m0, m5, 4                   ; [x x x x 4 3 2 1]
    pinsrw          m5, [r2 + r3 + 4], 0        ; prepend the projected sample
    punpcklwd       m5, m2                      ; [3 2 2 1 1 0 0 x]
    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
    mova            m4, m5                      ; line 2 shares the pairs of line 3
    mova            m3, m2                      ; line 1 shares the pairs of line 0

    lea             r3, [ang_table + 19 * 16]
    mova            m1, [r3 - 13 * 16]          ; [ 6]
    mova            m7, [r3 -  7 * 16]          ; [12]
    mova            m0, [r3 +  0 * 16]          ; [19]
    mova            m6, [r3 +  6 * 16]          ; [25]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2881 | |
; intra_pred_ang4_15: 4x4 angular prediction with per-line fractions 15/30/13/28.
; After the selection below, r4 is the byte offset of the main reference and
; r3 the byte offset of the other side (0/16, swapped when dirMode != 21).
; Two extra samples (words 2 and 4 of the other side) are prepended to the
; main reference.  Finishes in the shared tail of intra_pred_ang4_3, which
; consumes ZF from the compare below.
cglobal intra_pred_ang4_15, 3,5,8
    xor             r4, r4                      ; must precede the cmp (xor writes flags)
    cmp             r3m, byte 21
    mov             r3, 16                      ; r3m may alias r3: overwrite only after the cmp
    jz             .next
    xchg            r3, r4                      ; xchg leaves flags untouched
.next:
    ; NOTE(review): when dirMode == 21, r4 is 0 and this load starts 2 bytes
    ; before src; that word is replaced before use, but the read itself
    ; assumes the memory is accessible - confirm against the caller's buffer.
    movu            m3, [r2 + r4 - 2]           ; [x x 4 3 2 1 0 x]
    pinsrw          m3, [r2], 1
    palignr         m2, m3, 2                   ; [x x x 4 3 2 1 0]
    palignr         m0, m3, 4                   ; [x x x x 4 3 2 1]
    pinsrw          m3, [r2 + r3 + 4], 0        ; first projected sample
    pslldq          m5, m3, 2                   ; [x 4 3 2 1 0 x y]
    pinsrw          m5, [r2 + r3 + 8], 0        ; second projected sample
    punpcklwd       m5, m3                      ; [2 1 1 0 0 x x y]
    punpcklwd       m3, m2                      ; [3 2 2 1 1 0 0 x]
    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
    mova            m4, m3                      ; line 2 shares the pairs of line 1

    lea             r3, [ang_table + 23 * 16]
    mova            m6, [r3 - 10 * 16]          ; [13]
    mova            m0, [r3 -  8 * 16]          ; [15]
    mova            m7, [r3 +  5 * 16]          ; [28]
    mova            m1, [r3 +  7 * 16]          ; [30]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2907 | |
; intra_pred_ang4_16: 4x4 angular prediction with per-line fractions 11/22/1/12.
; After the selection below, r4 is the byte offset of the main reference and
; r3 the byte offset of the other side (0/16, swapped when dirMode != 20).
; Two extra samples (words 2 and 3 of the other side) are prepended to the
; main reference.  Finishes in the shared tail of intra_pred_ang4_3, which
; consumes ZF from the compare below.
cglobal intra_pred_ang4_16, 3,5,8
    xor             r4, r4                      ; must precede the cmp (xor writes flags)
    cmp             r3m, byte 20
    mov             r3, 16                      ; r3m may alias r3: overwrite only after the cmp
    jz             .next
    xchg            r3, r4                      ; xchg leaves flags untouched
.next:
    ; NOTE(review): when dirMode == 20, r4 is 0 and this load starts 2 bytes
    ; before src; that word is replaced before use, but the read itself
    ; assumes the memory is accessible - confirm against the caller's buffer.
    movu            m3, [r2 + r4 - 2]           ; [x x 4 3 2 1 0 x]
    pinsrw          m3, [r2], 1
    palignr         m2, m3, 2                   ; [x x x 4 3 2 1 0]
    palignr         m0, m3, 4                   ; [x x x x 4 3 2 1]
    pinsrw          m3, [r2 + r3 + 4], 0        ; first projected sample
    pslldq          m5, m3, 2                   ; [x 4 3 2 1 0 x y]
    pinsrw          m5, [r2 + r3 + 6], 0        ; second projected sample
    punpcklwd       m5, m3                      ; [2 1 1 0 0 x x y]
    punpcklwd       m3, m2                      ; [3 2 2 1 1 0 0 x]
    punpcklwd       m2, m0                      ; [4 3 3 2 2 1 1 0]
    mova            m4, m3                      ; line 2 shares the pairs of line 1

    lea             r3, [ang_table + 19 * 16]
    mova            m6, [r3 - 18 * 16]          ; [ 1]
    mova            m0, [r3 -  8 * 16]          ; [11]
    mova            m7, [r3 -  7 * 16]          ; [12]
    mova            m1, [r3 +  3 * 16]          ; [22]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2933 | |
; intra_pred_ang4_17: 4x4 angular prediction with per-line fractions 6/12/18/24.
; After the selection below, r4 is the byte offset of the main reference and
; r3 the byte offset of the other side (0/16, swapped when dirMode != 19).
; Three extra samples (words 1, 2 and 4 of the other side) are prepended to
; the main reference, one more per line.  Finishes in the shared tail of
; intra_pred_ang4_3, which consumes ZF from the compare below.
cglobal intra_pred_ang4_17, 3,5,8
    xor             r4, r4                      ; must precede the cmp (xor writes flags)
    cmp             r3m, byte 19
    mov             r3, 16                      ; r3m may alias r3: overwrite only after the cmp
    jz             .next
    xchg            r3, r4                      ; xchg leaves flags untouched
.next:
    ; NOTE(review): when dirMode == 19, r4 is 0 and this load starts 2 bytes
    ; before src; that word is replaced before use, but the read itself
    ; assumes the memory is accessible - confirm against the caller's buffer.
    movu            m6, [r2 + r4 - 2]           ; [- - 4 3 2 1 0 x]
    pinsrw          m6, [r2], 1
    palignr         m2, m6, 2                   ; [- - - 4 3 2 1 0]
    palignr         m1, m6, 4                   ; [- - - - 4 3 2 1]
    mova            m4, m2
    punpcklwd       m2, m1                      ; [4 3 3 2 2 1 1 0]

    pinsrw          m6, [r2 + r3 + 2], 0        ; first projected sample
    punpcklwd       m3, m6, m4                  ; [3 2 2 1 1 0 0 x]

    pslldq          m4, m6, 2                   ; [- 4 3 2 1 0 x y]
    pinsrw          m4, [r2 + r3 + 4], 0        ; second projected sample
    pslldq          m5, m4, 2                   ; [4 3 2 1 0 x y z]
    pinsrw          m5, [r2 + r3 + 8], 0        ; third projected sample
    punpcklwd       m5, m4                      ; [1 0 0 x x y y z]
    punpcklwd       m4, m6                      ; [2 1 1 0 0 x x y]

    lea             r3, [ang_table + 14 * 16]
    mova            m7, [r3 + 10 * 16]          ; [24]
    mova            m6, [r3 +  4 * 16]          ; [18]
    mova            m1, [r3 -  2 * 16]          ; [12]
    mova            m0, [r3 -  8 * 16]          ; [ 6]
    jmp             mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2964 | |
; intra_pred_ang4_18: builds one 8-word vector from the words at src + 16
; (word 0 replaced by src[0], then shuffled with pw_swap) in the low half and
; the four words at src + 2 in the high half.  The bottom row is the low four
; words of that vector; each row above it starts one word further along.
cglobal intra_pred_ang4_18, 3,3,1
    add             r1, r1                      ; stride: pixels -> bytes
    movh            m0, [r2 + 16]
    pinsrw          m0, [r2], 0
    pshufb          m0, [pw_swap]
    movhps          m0, [r2 + 2]
    lea             r2, [r1 * 3]                ; r2 is free once the loads are done

    movh            [r0 + r2], m0               ; row 3
    psrldq          m0, 2
    movh            [r0 + r1 * 2], m0           ; row 2
    psrldq          m0, 2
    movh            [r0 + r1], m0               ; row 1
    psrldq          m0, 2
    movh            [r0], m0                    ; row 0
    RET
2980 | |
;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
; Pure diagonal copy: row y is the eight samples starting y words after
; base + 4 bytes, where base is src when dirMode == 34 and src + 32 otherwise.
cglobal intra_pred_ang8_2, 3,5,3
    mov             r4, r2                      ; base used when dirMode == 34
    add             r2, 32                      ; base used otherwise
    cmp             r3m, byte 34
    cmove           r2, r4
    add             r1, r1                      ; stride: pixels -> bytes
    lea             r3, [r1 * 3]                ; r3 is free once dirMode has been tested

    movu            m0, [r2 + 4]                ; samples 0..7
    movu            m1, [r2 + 20]               ; samples 8..15

    ; rows 0-3
    movu            [r0], m0
    palignr         m2, m1, m0, 2
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 4
    movu            [r0 + r1 * 2], m2
    palignr         m2, m1, m0, 6
    movu            [r0 + r3], m2

    ; rows 4-7
    lea             r0, [r0 + r1 * 4]
    palignr         m2, m1, m0, 8
    movu            [r0], m2
    palignr         m2, m1, m0, 10
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 12
    movu            [r0 + r1 * 2], m2
    palignr         m1, m0, 14                  ; last row: m1 can be consumed in place
    movu            [r0 + r3], m1
    RET
3011 | |
INIT_XMM sse4

; Weight the sample pairs in m%1 with ang_table row %2, then round and scale:
; (a * w0 + b * w1 + 16) >> 5.  r3 must point at ang_table row 14.
%macro ANG8_3_FILTER 2                          ; %1 = register index, %2 = ang_table row
    pmaddwd         m%1, [r3 + (%2 - 14) * 16]
    paddd           m%1, [pd_16]
    psrld           m%1, 5
%endmacro

; Transpose the four 8-sample lines in m4, m2, m6, m7 into 4-pixel row
; groups: m7 = rows 0/1, m4 = rows 2/3, m6 = rows 4/5, m3 = rows 6/7.
%macro ANG8_3_TRANSPOSE 0
    punpckhwd       m3, m4, m2
    punpcklwd       m4, m2
    punpckhwd       m2, m6, m7
    punpcklwd       m6, m7

    punpckldq       m7, m4, m6
    punpckhdq       m4, m6
    punpckldq       m6, m3, m2
    punpckhdq       m3, m2
%endmacro

; Store the transposed groups as 4 pixels in each of 8 rows.
; %1 receives dst + 4 rows, %2 is the byte offset inside each row.
%macro ANG8_3_STORE 2
    movh            [r0 + %2], m7
    movhps          [r0 + r1 + %2], m7
    movh            [r0 + r1 * 2 + %2], m4
    movhps          [r0 + r4 + %2], m4
    lea             %1, [r0 + r1 * 4]
    movh            [%1 + %2], m6
    movhps          [%1 + r1 + %2], m6
    movh            [%1 + r1 * 2 + %2], m3
    movhps          [%1 + r4 + %2], m3
%endmacro

; intra_pred_ang8_3: 8x8 angular prediction from the reference at src + 32,
; per-line fractions 26/20/14/8/2/28/22/16.  Lines are computed four at a
; time and written transposed: the first four fill bytes 0-7 of every row,
; the last four fill bytes 8-15.
cglobal intra_pred_ang8_3, 3,5,8
    add             r2,        32
    lea             r3,        [ang_table + 14 * 16]
    add             r1,        r1               ; stride: pixels -> bytes
    lea             r4,        [r1 * 3]

    movu            m0,        [r2 + 2]         ; [8 7 6 5 4 3 2 1]
    movu            m1,        [r2 + 18]        ; [16 15 14 13 12 11 10 9]
    palignr         m2,        m1, m0, 2        ; [9 8 7 6 5 4 3 2]
    psrldq          m4,        m1, 2            ; [x 16 15 14 13 12 11 10]

    punpcklwd       m3,        m0, m2           ; [5 4 4 3 3 2 2 1]
    punpckhwd       m0,        m2               ; [9 8 8 7 7 6 6 5]
    punpcklwd       m5,        m1, m4           ; [13 12 12 11 11 10 10 9]
    punpckhwd       m1,        m4               ; [x 16 16 15 15 14 14 13]

    ; ---- lines 0-3 ----
    mova            m4,        m3
    ANG8_3_FILTER   4, 26
    mova            m2,        m0
    ANG8_3_FILTER   2, 26
    packusdw        m4,        m2

    palignr         m2,        m0, m3, 4        ; [6 5 5 4 4 3 3 2]
    ANG8_3_FILTER   2, 20
    palignr         m6,        m5, m0, 4        ; [10 9 9 8 8 7 7 6]
    ANG8_3_FILTER   6, 20
    packusdw        m2,        m6

    palignr         m6,        m0, m3, 8        ; [7 6 6 5 5 4 4 3]
    ANG8_3_FILTER   6, 14
    palignr         m7,        m5, m0, 8        ; [11 10 10 9 9 8 8 7]
    ANG8_3_FILTER   7, 14
    packusdw        m6,        m7

    palignr         m7,        m0, m3, 12       ; [8 7 7 6 6 5 5 4]
    ANG8_3_FILTER   7, 8
    palignr         m3,        m5, m0, 12       ; [12 11 11 10 10 9 9 8]
    ANG8_3_FILTER   3, 8
    packusdw        m7,        m3

    ANG8_3_TRANSPOSE
    ANG8_3_STORE    r2, 0                       ; r2 is no longer needed as the source pointer

    ; ---- lines 4-7 ----
    mova            m4,        m0
    ANG8_3_FILTER   4, 2
    mova            m2,        m5
    ANG8_3_FILTER   2, 2
    packusdw        m4,        m2

    mova            m2,        m0
    ANG8_3_FILTER   2, 28
    mova            m6,        m5
    ANG8_3_FILTER   6, 28
    packusdw        m2,        m6

    palignr         m6,        m5, m0, 4        ; [10 9 9 8 8 7 7 6]
    ANG8_3_FILTER   6, 22
    palignr         m7,        m1, m5, 4        ; [14 13 13 12 12 11 11 10]
    ANG8_3_FILTER   7, 22
    packusdw        m6,        m7

    palignr         m7,        m5, m0, 8        ; [11 10 10 9 9 8 8 7]
    ANG8_3_FILTER   7, 16
    palignr         m1,        m5, 8            ; [15 14 14 13 13 12 12 11]
    ANG8_3_FILTER   1, 16
    packusdw        m7,        m1

    ANG8_3_TRANSPOSE
    ANG8_3_STORE    r0, 8
    RET
3149 | |
; Weight the sample pairs in m%1 with ang_table row %2, then round and scale:
; (a * w0 + b * w1 + 16) >> 5.  r3 must point at ang_table row 19.
%macro ANG8_4_FILTER 2                          ; %1 = register index, %2 = ang_table row
    pmaddwd         m%1, [r3 + (%2 - 19) * 16]
    paddd           m%1, [pd_16]
    psrld           m%1, 5
%endmacro

; Transpose the four 8-sample lines in m4, m2, m6, m7 into 4-pixel row
; groups: m7 = rows 0/1, m4 = rows 2/3, m6 = rows 4/5, m%1 = rows 6/7.
%macro ANG8_4_TRANSPOSE 1                       ; %1 = scratch/result register index
    punpckhwd       m%1, m4, m2
    punpcklwd       m4, m2
    punpckhwd       m2, m6, m7
    punpcklwd       m6, m7

    punpckldq       m7, m4, m6
    punpckhdq       m4, m6
    punpckldq       m6, m%1, m2
    punpckhdq       m%1, m2
%endmacro

; Store the transposed groups as 4 pixels in each of 8 rows.
; %1 receives dst + 4 rows, %2 is the byte offset inside each row,
; %3 is the register index holding rows 6/7.
%macro ANG8_4_STORE 3
    movh            [r0 + %2], m7
    movhps          [r0 + r1 + %2], m7
    movh            [r0 + r1 * 2 + %2], m4
    movhps          [r0 + r4 + %2], m4
    lea             %1, [r0 + r1 * 4]
    movh            [%1 + %2], m6
    movhps          [%1 + r1 + %2], m6
    movh            [%1 + r1 * 2 + %2], m%3
    movhps          [%1 + r4 + %2], m%3
%endmacro

; intra_pred_ang8_4: 8x8 angular prediction from the reference at src + 32,
; per-line fractions 21/10/31/20/9/30/19/8.  Lines are computed four at a
; time and written transposed: the first four fill bytes 0-7 of every row,
; the last four fill bytes 8-15.
cglobal intra_pred_ang8_4, 3,6,8
    add             r2,        32
    lea             r3,        [ang_table + 19 * 16]
    add             r1,        r1               ; stride: pixels -> bytes
    lea             r4,        [r1 * 3]

    movu            m0,        [r2 + 2]         ; [8 7 6 5 4 3 2 1]
    movu            m1,        [r2 + 18]        ; [16 15 14 13 12 11 10 9]
    palignr         m2,        m1, m0, 2        ; [9 8 7 6 5 4 3 2]
    psrldq          m4,        m1, 2            ; [x 16 15 14 13 12 11 10]

    punpcklwd       m3,        m0, m2           ; [5 4 4 3 3 2 2 1]
    punpckhwd       m0,        m2               ; [9 8 8 7 7 6 6 5]
    punpcklwd       m5,        m1, m4           ; [13 12 12 11 11 10 10 9]

    ; ---- lines 0-3 ----
    mova            m4,        m3
    ANG8_4_FILTER   4, 21
    mova            m2,        m0
    ANG8_4_FILTER   2, 21
    packusdw        m4,        m2

    palignr         m2,        m0, m3, 4        ; [6 5 5 4 4 3 3 2]
    mova            m6,        m2               ; same pairs, reused for line 2
    ANG8_4_FILTER   2, 10
    palignr         m1,        m5, m0, 4        ; [10 9 9 8 8 7 7 6]
    mova            m7,        m1               ; same pairs, reused for line 2
    ANG8_4_FILTER   1, 10
    packusdw        m2,        m1

    ANG8_4_FILTER   6, 31
    ANG8_4_FILTER   7, 31
    packusdw        m6,        m7

    palignr         m7,        m0, m3, 8        ; [7 6 6 5 5 4 4 3]
    ANG8_4_FILTER   7, 20
    palignr         m1,        m5, m0, 8        ; [11 10 10 9 9 8 8 7]
    ANG8_4_FILTER   1, 20
    packusdw        m7,        m1

    ANG8_4_TRANSPOSE 1
    ANG8_4_STORE    r5, 0, 1                    ; r2 is still needed below, so use r5

    ; ---- lines 4-7 ----
    palignr         m4,        m0, m3, 12       ; [8 7 7 6 6 5 5 4]
    mova            m2,        m4               ; same pairs, reused for line 5
    ANG8_4_FILTER   4, 9
    palignr         m3,        m5, m0, 12       ; [12 11 11 10 10 9 9 8]
    mova            m6,        m3               ; same pairs, reused for line 5
    ANG8_4_FILTER   3, 9
    packusdw        m4,        m3

    ANG8_4_FILTER   2, 30
    ANG8_4_FILTER   6, 30
    packusdw        m2,        m6

    mova            m6,        m0
    ANG8_4_FILTER   6, 19
    mova            m7,        m5
    ANG8_4_FILTER   7, 19
    packusdw        m6,        m7

    movh            m1,        [r2 + 26]        ; [16 15 14 13]
    palignr         m7,        m5, m0, 4        ; [10 9 9 8 8 7 7 6]
    ANG8_4_FILTER   7, 8
    palignr         m1,        m5, 4            ; [14 13 13 12 12 11 11 10]
    ANG8_4_FILTER   1, 8
    packusdw        m7,        m1

    ANG8_4_TRANSPOSE 3
    ANG8_4_STORE    r0, 8, 3
    RET
3286 | |
; intra_pred_ang8_5: 8x8 angular intra prediction on 16-bit samples (mode 5, going by the name).
; cglobal 3,5,8 (x86inc convention): 3 arguments loaded into r0-r2, 5 GPRs, 8 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; advanced by 32 bytes before use
;        (NOTE(review): presumably skips the first of two neighbour arrays -- confirm)
;   r3 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
3287 cglobal intra_pred_ang8_5, 3,5,8 | |
3288 add r2, 32 | |
3289 lea r3, [ang_table + 13 * 16] | |
3290 add r1, r1 | |
3291 | |
3292 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
3293 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
3294 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
3295 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
3296 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
3297 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
3298 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
3299 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
3300 | |
3301 mova m4, m3 | |
3302 pmaddwd m4, [r3 + 4 * 16] ; [17] | |
3303 paddd m4, [pd_16] | |
3304 psrld m4, 5 | |
3305 mova m2, m0 | |
3306 pmaddwd m2, [r3 + 4 * 16] | |
3307 paddd m2, [pd_16] | |
3308 psrld m2, 5 | |
3309 packusdw m4, m2 | |
3310 | |
3311 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
3312 mova m6, m2 | |
3313 pmaddwd m2, [r3 - 11 * 16] ; [2] | |
3314 paddd m2, [pd_16] | |
3315 psrld m2, 5 | |
3316 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
3317 mova m7, m1 | |
3318 pmaddwd m1, [r3 - 11 * 16] | |
3319 paddd m1, [pd_16] | |
3320 psrld m1, 5 | |
3321 packusdw m2, m1 | |
3322 | |
3323 pmaddwd m6, [r3 + 6 * 16] ; [19] | |
3324 paddd m6, [pd_16] | |
3325 psrld m6, 5 | |
3326 pmaddwd m7, [r3 + 6 * 16] | |
3327 paddd m7, [pd_16] | |
3328 psrld m7, 5 | |
3329 packusdw m6, m7 | |
3330 | |
3331 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
3332 pmaddwd m7, [r3 - 9 * 16] ; [4] | |
3333 paddd m7, [pd_16] | |
3334 psrld m7, 5 | |
3335 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
3336 pmaddwd m1, [r3 - 9 * 16] | |
3337 paddd m1, [pd_16] | |
3338 psrld m1, 5 | |
3339 packusdw m7, m1 | |
3340 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m7: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
3341 punpckhwd m1, m4, m2 | |
3342 punpcklwd m4, m2 | |
3343 punpckhwd m2, m6, m7 | |
3344 punpcklwd m6, m7 | |
3345 | |
3346 punpckldq m7, m4, m6 | |
3347 punpckhdq m4, m6 | |
3348 punpckldq m6, m1, m2 | |
3349 punpckhdq m1, m2 | |
3350 | |
; r4 = 3 * stride; write columns 0-3 of rows 0-7 (r2 is reused as the row-4 pointer,
; the source samples are already in registers)
3351 lea r4, [r1 * 3] | |
3352 movh [r0], m7 | |
3353 movhps [r0 + r1], m7 | |
3354 movh [r0 + r1 * 2], m4 | |
3355 movhps [r0 + r4], m4 | |
3356 lea r2, [r0 + r1 * 4] | |
3357 movh [r2], m6 | |
3358 movhps [r2 + r1], m6 | |
3359 movh [r2 + r1 * 2], m1 | |
3360 movhps [r2 + r4], m1 | |
3361 | |
; second group of four vectors: these become columns 4-7
3362 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
3363 pmaddwd m4, [r3 + 8 * 16] ; [21] | |
3364 paddd m4, [pd_16] | |
3365 psrld m4, 5 | |
3366 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
3367 pmaddwd m2, [r3 + 8 * 16] | |
3368 paddd m2, [pd_16] | |
3369 psrld m2, 5 | |
3370 packusdw m4, m2 | |
3371 | |
3372 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
3373 mova m6, m2 | |
3374 pmaddwd m2, [r3 - 7 * 16] ; [6] | |
3375 paddd m2, [pd_16] | |
3376 psrld m2, 5 | |
3377 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
3378 mova m7, m1 | |
3379 pmaddwd m1, [r3 - 7 * 16] | |
3380 paddd m1, [pd_16] | |
3381 psrld m1, 5 | |
3382 packusdw m2, m1 | |
3383 | |
3384 pmaddwd m6, [r3 + 10 * 16] ; [23] | |
3385 paddd m6, [pd_16] | |
3386 psrld m6, 5 | |
3387 pmaddwd m7, [r3 + 10 * 16] | |
3388 paddd m7, [pd_16] | |
3389 psrld m7, 5 | |
3390 packusdw m6, m7 | |
3391 | |
3392 mova m7, m0 | |
3393 pmaddwd m7, [r3 - 5 * 16] ; [8] | |
3394 paddd m7, [pd_16] | |
3395 psrld m7, 5 | |
3396 mova m1, m5 | |
3397 pmaddwd m1, [r3 - 5 * 16] | |
3398 paddd m1, [pd_16] | |
3399 psrld m1, 5 | |
3400 packusdw m7, m1 | |
3401 | |
; same transpose as above, this time for m4, m2, m6, m7 of the second group
3402 punpckhwd m3, m4, m2 | |
3403 punpcklwd m4, m2 | |
3404 punpckhwd m2, m6, m7 | |
3405 punpcklwd m6, m7 | |
3406 | |
3407 punpckldq m7, m4, m6 | |
3408 punpckhdq m4, m6 | |
3409 punpckldq m6, m3, m2 | |
3410 punpckhdq m3, m2 | |
3411 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
3412 movh [r0 + 8], m7 | |
3413 movhps [r0 + r1 + 8], m7 | |
3414 movh [r0 + r1 * 2 + 8], m4 | |
3415 movhps [r0 + r4 + 8], m4 | |
3416 lea r0, [r0 + r1 * 4] | |
3417 movh [r0 + 8], m6 | |
3418 movhps [r0 + r1 + 8], m6 | |
3419 movh [r0 + r1 * 2 + 8], m3 | |
3420 movhps [r0 + r4 + 8], m3 | |
3421 RET | |
3422 | |
; intra_pred_ang8_6: 8x8 angular intra prediction on 16-bit samples (mode 6, going by the name).
; cglobal 3,5,8 (x86inc convention): 3 arguments loaded into r0-r2, 5 GPRs, 8 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; advanced by 32 bytes before use
;        (NOTE(review): presumably skips the first of two neighbour arrays -- confirm)
;   r3 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
3423 cglobal intra_pred_ang8_6, 3,5,8 | |
3424 add r2, 32 | |
3425 lea r3, [ang_table + 14 * 16] | |
3426 add r1, r1 | |
3427 | |
3428 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
3429 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
3430 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
3431 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
3432 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
3433 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
3434 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
3435 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
3436 | |
3437 mova m4, m3 | |
3438 pmaddwd m4, [r3 - 1 * 16] ; [13] | |
3439 paddd m4, [pd_16] | |
3440 psrld m4, 5 | |
3441 mova m2, m0 | |
3442 pmaddwd m2, [r3 - 1 * 16] | |
3443 paddd m2, [pd_16] | |
3444 psrld m2, 5 | |
3445 packusdw m4, m2 | |
3446 | |
3447 mova m2, m3 | |
3448 pmaddwd m2, [r3 + 12 * 16] ; [26] | |
3449 paddd m2, [pd_16] | |
3450 psrld m2, 5 | |
3451 mova m1, m0 | |
3452 pmaddwd m1, [r3 + 12 * 16] | |
3453 paddd m1, [pd_16] | |
3454 psrld m1, 5 | |
3455 packusdw m2, m1 | |
3456 | |
3457 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
3458 mova m7, m6 | |
3459 pmaddwd m6, [r3 - 7 * 16] ; [7] | |
3460 paddd m6, [pd_16] | |
3461 psrld m6, 5 | |
3462 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
3463 pmaddwd m1, [r3 - 7 * 16] | |
3464 paddd m1, [pd_16] | |
3465 psrld m1, 5 | |
3466 packusdw m6, m1 | |
3467 | |
3468 pmaddwd m7, [r3 + 6 * 16] ; [20] | |
3469 paddd m7, [pd_16] | |
3470 psrld m7, 5 | |
; the upper-half window is rebuilt here because m1 was consumed by the previous pmaddwd
3471 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
3472 pmaddwd m1, [r3 + 6 * 16] | |
3473 paddd m1, [pd_16] | |
3474 psrld m1, 5 | |
3475 packusdw m7, m1 | |
3476 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m7: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
3477 punpckhwd m1, m4, m2 | |
3478 punpcklwd m4, m2 | |
3479 punpckhwd m2, m6, m7 | |
3480 punpcklwd m6, m7 | |
3481 | |
3482 punpckldq m7, m4, m6 | |
3483 punpckhdq m4, m6 | |
3484 punpckldq m6, m1, m2 | |
3485 punpckhdq m1, m2 | |
3486 | |
; r4 = 3 * stride; write columns 0-3 of rows 0-7 (r2 is reused as the row-4 pointer,
; the source samples are already in registers)
3487 lea r4, [r1 * 3] | |
3488 movh [r0], m7 | |
3489 movhps [r0 + r1], m7 | |
3490 movh [r0 + r1 * 2], m4 | |
3491 movhps [r0 + r4], m4 | |
3492 lea r2, [r0 + r1 * 4] | |
3493 movh [r2], m6 | |
3494 movhps [r2 + r1], m6 | |
3495 movh [r2 + r1 * 2], m1 | |
3496 movhps [r2 + r4], m1 | |
3497 | |
; second group of four vectors: these become columns 4-7
3498 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
3499 mova m6, m4 | |
3500 pmaddwd m4, [r3 - 13 * 16] ; [1] | |
3501 paddd m4, [pd_16] | |
3502 psrld m4, 5 | |
3503 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
3504 mova m7, m2 | |
3505 pmaddwd m2, [r3 - 13 * 16] | |
3506 paddd m2, [pd_16] | |
3507 psrld m2, 5 | |
3508 packusdw m4, m2 | |
3509 | |
3510 pmaddwd m2, m6, [r3] ; [14] | |
3511 paddd m2, [pd_16] | |
3512 psrld m2, 5 | |
3513 pmaddwd m1, m7, [r3] | |
3514 paddd m1, [pd_16] | |
3515 psrld m1, 5 | |
3516 packusdw m2, m1 | |
3517 | |
3518 pmaddwd m6, [r3 + 13 * 16] ; [27] | |
3519 paddd m6, [pd_16] | |
3520 psrld m6, 5 | |
3521 pmaddwd m7, [r3 + 13 * 16] | |
3522 paddd m7, [pd_16] | |
3523 psrld m7, 5 | |
3524 packusdw m6, m7 | |
3525 | |
3526 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
3527 pmaddwd m7, [r3 - 6 * 16] ; [8] | |
3528 paddd m7, [pd_16] | |
3529 psrld m7, 5 | |
3530 palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
3531 pmaddwd m5, [r3 - 6 * 16] | |
3532 paddd m5, [pd_16] | |
3533 psrld m5, 5 | |
3534 packusdw m7, m5 | |
3535 | |
; same transpose as above, this time for m4, m2, m6, m7 of the second group
3536 punpckhwd m3, m4, m2 | |
3537 punpcklwd m4, m2 | |
3538 punpckhwd m2, m6, m7 | |
3539 punpcklwd m6, m7 | |
3540 | |
3541 punpckldq m7, m4, m6 | |
3542 punpckhdq m4, m6 | |
3543 punpckldq m6, m3, m2 | |
3544 punpckhdq m3, m2 | |
3545 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
3546 movh [r0 + 8], m7 | |
3547 movhps [r0 + r1 + 8], m7 | |
3548 movh [r0 + r1 * 2 + 8], m4 | |
3549 movhps [r0 + r4 + 8], m4 | |
3550 lea r0, [r0 + r1 * 4] | |
3551 movh [r0 + 8], m6 | |
3552 movhps [r0 + r1 + 8], m6 | |
3553 movh [r0 + r1 * 2 + 8], m3 | |
3554 movhps [r0 + r4 + 8], m3 | |
3555 RET | |
3556 | |
; intra_pred_ang8_7: 8x8 angular intra prediction on 16-bit samples (mode 7, going by the name).
; cglobal 3,5,8 (x86inc convention): 3 arguments loaded into r0-r2, 5 GPRs, 8 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; advanced by 32 bytes before use
;        (NOTE(review): presumably skips the first of two neighbour arrays -- confirm)
;   r3 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
3557 cglobal intra_pred_ang8_7, 3,5,8 | |
3558 add r2, 32 | |
3559 lea r3, [ang_table + 18 * 16] | |
3560 add r1, r1 | |
3561 | |
3562 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
3563 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
3564 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
3565 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
3566 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
3567 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
3568 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
3569 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
3570 | |
3571 mova m4, m3 | |
3572 pmaddwd m4, [r3 - 9 * 16] ; [9] | |
3573 paddd m4, [pd_16] | |
3574 psrld m4, 5 | |
3575 mova m2, m0 | |
3576 pmaddwd m2, [r3 - 9 * 16] | |
3577 paddd m2, [pd_16] | |
3578 psrld m2, 5 | |
3579 packusdw m4, m2 | |
3580 | |
3581 mova m2, m3 | |
3582 pmaddwd m2, [r3] ; [18] | |
3583 paddd m2, [pd_16] | |
3584 psrld m2, 5 | |
3585 mova m1, m0 | |
3586 pmaddwd m1, [r3] | |
3587 paddd m1, [pd_16] | |
3588 psrld m1, 5 | |
3589 packusdw m2, m1 | |
3590 | |
3591 mova m6, m3 | |
3592 pmaddwd m6, [r3 + 9 * 16] ; [27] | |
3593 paddd m6, [pd_16] | |
3594 psrld m6, 5 | |
3595 mova m1, m0 | |
3596 pmaddwd m1, [r3 + 9 * 16] | |
3597 paddd m1, [pd_16] | |
3598 psrld m1, 5 | |
3599 packusdw m6, m1 | |
3600 | |
3601 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
3602 pmaddwd m7, [r3 - 14 * 16] ; [4] | |
3603 paddd m7, [pd_16] | |
3604 psrld m7, 5 | |
3605 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
3606 pmaddwd m1, [r3 - 14 * 16] | |
3607 paddd m1, [pd_16] | |
3608 psrld m1, 5 | |
3609 packusdw m7, m1 | |
3610 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m7: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
3611 punpckhwd m1, m4, m2 | |
3612 punpcklwd m4, m2 | |
3613 punpckhwd m2, m6, m7 | |
3614 punpcklwd m6, m7 | |
3615 | |
3616 punpckldq m7, m4, m6 | |
3617 punpckhdq m4, m6 | |
3618 punpckldq m6, m1, m2 | |
3619 punpckhdq m1, m2 | |
3620 | |
; r4 = 3 * stride; write columns 0-3 of rows 0-7 (r2 is reused as the row-4 pointer,
; the source samples are already in registers)
3621 lea r4, [r1 * 3] | |
3622 movh [r0], m7 | |
3623 movhps [r0 + r1], m7 | |
3624 movh [r0 + r1 * 2], m4 | |
3625 movhps [r0 + r4], m4 | |
3626 lea r2, [r0 + r1 * 4] | |
3627 movh [r2], m6 | |
3628 movhps [r2 + r1], m6 | |
3629 movh [r2 + r1 * 2], m1 | |
3630 movhps [r2 + r4], m1 | |
3631 | |
; second group of four vectors: these become columns 4-7
3632 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
3633 mova m6, m4 | |
3634 pmaddwd m4, [r3 - 5 * 16] ; [13] | |
3635 paddd m4, [pd_16] | |
3636 psrld m4, 5 | |
3637 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
3638 mova m7, m2 | |
3639 pmaddwd m2, [r3 - 5 * 16] | |
3640 paddd m2, [pd_16] | |
3641 psrld m2, 5 | |
3642 packusdw m4, m2 | |
3643 | |
3644 pmaddwd m2, m6, [r3 + 4 * 16] ; [22] | |
3645 paddd m2, [pd_16] | |
3646 psrld m2, 5 | |
3647 pmaddwd m1, m7, [r3 + 4 * 16] | |
3648 paddd m1, [pd_16] | |
3649 psrld m1, 5 | |
3650 packusdw m2, m1 | |
3651 | |
3652 pmaddwd m6, [r3 + 13 * 16] ; [31] | |
3653 paddd m6, [pd_16] | |
3654 psrld m6, 5 | |
3655 pmaddwd m7, [r3 + 13 * 16] | |
3656 paddd m7, [pd_16] | |
3657 psrld m7, 5 | |
3658 packusdw m6, m7 | |
3659 | |
3660 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
3661 pmaddwd m7, [r3 - 10 * 16] ; [8] | |
3662 paddd m7, [pd_16] | |
3663 psrld m7, 5 | |
3664 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
3665 pmaddwd m5, [r3 - 10 * 16] | |
3666 paddd m5, [pd_16] | |
3667 psrld m5, 5 | |
3668 packusdw m7, m5 | |
3669 | |
; same transpose as above, this time for m4, m2, m6, m7 of the second group
3670 punpckhwd m3, m4, m2 | |
3671 punpcklwd m4, m2 | |
3672 punpckhwd m2, m6, m7 | |
3673 punpcklwd m6, m7 | |
3674 | |
3675 punpckldq m7, m4, m6 | |
3676 punpckhdq m4, m6 | |
3677 punpckldq m6, m3, m2 | |
3678 punpckhdq m3, m2 | |
3679 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
3680 movh [r0 + 8], m7 | |
3681 movhps [r0 + r1 + 8], m7 | |
3682 movh [r0 + r1 * 2 + 8], m4 | |
3683 movhps [r0 + r4 + 8], m4 | |
3684 lea r0, [r0 + r1 * 4] | |
3685 movh [r0 + 8], m6 | |
3686 movhps [r0 + r1 + 8], m6 | |
3687 movh [r0 + r1 * 2 + 8], m3 | |
3688 movhps [r0 + r4 + 8], m3 | |
3689 RET | |
3690 | |
; intra_pred_ang8_8: 8x8 angular intra prediction on 16-bit samples (mode 8, going by the name).
; cglobal 3,6,7 (x86inc convention): 3 arguments loaded into r0-r2, 6 GPRs, 7 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; advanced by 32 bytes before use
;        (NOTE(review): presumably skips the first of two neighbour arrays -- confirm)
;   r3 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
;   r5 = row-4 pointer for the first store group (r2 is still read later, so it is not reused)
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
3691 cglobal intra_pred_ang8_8, 3,6,7 | |
3692 add r2, 32 | |
3693 lea r3, [ang_table + 17 * 16] | |
3694 add r1, r1 | |
3695 | |
3696 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
3697 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] | |
3698 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
3699 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] | |
3700 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] | |
3701 | |
3702 mova m4, m3 | |
3703 pmaddwd m4, [r3 - 12 * 16] ; [5] | |
3704 paddd m4, [pd_16] | |
3705 psrld m4, 5 | |
3706 mova m2, m0 | |
3707 pmaddwd m2, [r3 - 12 * 16] | |
3708 paddd m2, [pd_16] | |
3709 psrld m2, 5 | |
3710 packusdw m4, m2 | |
3711 | |
3712 mova m2, m3 | |
3713 pmaddwd m2, [r3 - 7 * 16] ; [10] | |
3714 paddd m2, [pd_16] | |
3715 psrld m2, 5 | |
3716 mova m1, m0 | |
3717 pmaddwd m1, [r3 - 7 * 16] | |
3718 paddd m1, [pd_16] | |
3719 psrld m1, 5 | |
3720 packusdw m2, m1 | |
3721 | |
3722 mova m6, m3 | |
3723 pmaddwd m6, [r3 - 2 * 16] ; [15] | |
3724 paddd m6, [pd_16] | |
3725 psrld m6, 5 | |
3726 mova m1, m0 | |
3727 pmaddwd m1, [r3 - 2 * 16] | |
3728 paddd m1, [pd_16] | |
3729 psrld m1, 5 | |
3730 packusdw m6, m1 | |
3731 | |
3732 mova m5, m3 | |
3733 pmaddwd m5, [r3 + 3 * 16] ; [20] | |
3734 paddd m5, [pd_16] | |
3735 psrld m5, 5 | |
3736 mova m1, m0 | |
3737 pmaddwd m1, [r3 + 3 * 16] | |
3738 paddd m1, [pd_16] | |
3739 psrld m1, 5 | |
3740 packusdw m5, m1 | |
3741 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m5: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
3742 punpckhwd m1, m4, m2 | |
3743 punpcklwd m4, m2 | |
3744 punpckhwd m2, m6, m5 | |
3745 punpcklwd m6, m5 | |
3746 | |
3747 punpckldq m5, m4, m6 | |
3748 punpckhdq m4, m6 | |
3749 punpckldq m6, m1, m2 | |
3750 punpckhdq m1, m2 | |
3751 | |
; r4 = 3 * stride, r5 = row 4; write columns 0-3 of rows 0-7
3752 lea r4, [r1 * 3] | |
3753 movh [r0], m5 | |
3754 movhps [r0 + r1], m5 | |
3755 movh [r0 + r1 * 2], m4 | |
3756 movhps [r0 + r4], m4 | |
3757 lea r5, [r0 + r1 * 4] | |
3758 movh [r5], m6 | |
3759 movhps [r5 + r1], m6 | |
3760 movh [r5 + r1 * 2], m1 | |
3761 movhps [r5 + r4], m1 | |
3762 | |
; second group of four vectors: these become columns 4-7
3763 mova m4, m3 | |
3764 pmaddwd m4, [r3 + 8 * 16] ; [25] | |
3765 paddd m4, [pd_16] | |
3766 psrld m4, 5 | |
3767 mova m2, m0 | |
3768 pmaddwd m2, [r3 + 8 * 16] | |
3769 paddd m2, [pd_16] | |
3770 psrld m2, 5 | |
3771 packusdw m4, m2 | |
3772 | |
3773 mova m2, m3 | |
3774 pmaddwd m2, [r3 + 13 * 16] ; [30] | |
3775 paddd m2, [pd_16] | |
3776 psrld m2, 5 | |
3777 mova m1, m0 | |
3778 pmaddwd m1, [r3 + 13 * 16] | |
3779 paddd m1, [pd_16] | |
3780 psrld m1, 5 | |
3781 packusdw m2, m1 | |
3782 | |
; the last two vectors use the sample window shifted by one, which needs samples 9 and 10
3783 movh m1, [r2 + 18] ; [12 11 10 9] | |
3784 | |
3785 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
3786 mova m5, m6 | |
3787 pmaddwd m6, [r3 - 14 * 16] ; [3] | |
3788 paddd m6, [pd_16] | |
3789 psrld m6, 5 | |
3790 palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6] | |
3791 mova m3, m1 | |
3792 pmaddwd m1, [r3 - 14 * 16] | |
3793 paddd m1, [pd_16] | |
3794 psrld m1, 5 | |
3795 packusdw m6, m1 | |
3796 | |
3797 pmaddwd m5, [r3 - 9 * 16] ; [8] | |
3798 paddd m5, [pd_16] | |
3799 psrld m5, 5 | |
3800 pmaddwd m3, [r3 - 9 * 16] | |
3801 paddd m3, [pd_16] | |
3802 psrld m3, 5 | |
3803 packusdw m5, m3 | |
3804 | |
; same transpose as above, this time for m4, m2, m6, m5 of the second group
3805 punpckhwd m3, m4, m2 | |
3806 punpcklwd m4, m2 | |
3807 punpckhwd m2, m6, m5 | |
3808 punpcklwd m6, m5 | |
3809 | |
3810 punpckldq m5, m4, m6 | |
3811 punpckhdq m4, m6 | |
3812 punpckldq m6, m3, m2 | |
3813 punpckhdq m3, m2 | |
3814 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
3815 movh [r0 + 8], m5 | |
3816 movhps [r0 + r1 + 8], m5 | |
3817 movh [r0 + r1 * 2 + 8], m4 | |
3818 movhps [r0 + r4 + 8], m4 | |
3819 lea r0, [r0 + r1 * 4] | |
3820 movh [r0 + 8], m6 | |
3821 movhps [r0 + r1 + 8], m6 | |
3822 movh [r0 + r1 * 2 + 8], m3 | |
3823 movhps [r0 + r4 + 8], m3 | |
3824 RET | |
3825 | |
; intra_pred_ang8_9: 8x8 angular intra prediction on 16-bit samples (mode 9, going by the name).
; cglobal 3,5,7 (x86inc convention): 3 arguments loaded into r0-r2, 5 GPRs, 7 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; advanced by 32 bytes before use
;        (NOTE(review): presumably skips the first of two neighbour arrays -- confirm)
;   r3 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; All eight vectors use the same sample window (m3/m0); only the fraction changes (2, 4, ..., 16).
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
3826 cglobal intra_pred_ang8_9, 3,5,7 | |
3827 add r2, 32 | |
3828 lea r3, [ang_table + 9 * 16] | |
3829 add r1, r1 | |
3830 | |
3831 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
3832 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] | |
3833 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
3834 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] | |
3835 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] | |
3836 | |
3837 mova m4, m3 | |
3838 pmaddwd m4, [r3 - 7 * 16] ; [2] | |
3839 paddd m4, [pd_16] | |
3840 psrld m4, 5 | |
3841 mova m2, m0 | |
3842 pmaddwd m2, [r3 - 7 * 16] | |
3843 paddd m2, [pd_16] | |
3844 psrld m2, 5 | |
3845 packusdw m4, m2 | |
3846 | |
3847 mova m2, m3 | |
3848 pmaddwd m2, [r3 - 5 * 16] ; [4] | |
3849 paddd m2, [pd_16] | |
3850 psrld m2, 5 | |
3851 mova m1, m0 | |
3852 pmaddwd m1, [r3 - 5 * 16] | |
3853 paddd m1, [pd_16] | |
3854 psrld m1, 5 | |
3855 packusdw m2, m1 | |
3856 | |
3857 mova m6, m3 | |
3858 pmaddwd m6, [r3 - 3 * 16] ; [6] | |
3859 paddd m6, [pd_16] | |
3860 psrld m6, 5 | |
3861 mova m1, m0 | |
3862 pmaddwd m1, [r3 - 3 * 16] | |
3863 paddd m1, [pd_16] | |
3864 psrld m1, 5 | |
3865 packusdw m6, m1 | |
3866 | |
3867 mova m5, m3 | |
3868 pmaddwd m5, [r3 - 1 * 16] ; [8] | |
3869 paddd m5, [pd_16] | |
3870 psrld m5, 5 | |
3871 mova m1, m0 | |
3872 pmaddwd m1, [r3 - 1 * 16] | |
3873 paddd m1, [pd_16] | |
3874 psrld m1, 5 | |
3875 packusdw m5, m1 | |
3876 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m5: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
3877 punpckhwd m1, m4, m2 | |
3878 punpcklwd m4, m2 | |
3879 punpckhwd m2, m6, m5 | |
3880 punpcklwd m6, m5 | |
3881 | |
3882 punpckldq m5, m4, m6 | |
3883 punpckhdq m4, m6 | |
3884 punpckldq m6, m1, m2 | |
3885 punpckhdq m1, m2 | |
3886 | |
; r4 = 3 * stride; write columns 0-3 of rows 0-7 (r2 is reused as the row-4 pointer,
; the source samples are already in registers)
3887 lea r4, [r1 * 3] | |
3888 movh [r0], m5 | |
3889 movhps [r0 + r1], m5 | |
3890 movh [r0 + r1 * 2], m4 | |
3891 movhps [r0 + r4], m4 | |
3892 lea r2, [r0 + r1 * 4] | |
3893 movh [r2], m6 | |
3894 movhps [r2 + r1], m6 | |
3895 movh [r2 + r1 * 2], m1 | |
3896 movhps [r2 + r4], m1 | |
3897 | |
; second group of four vectors: these become columns 4-7
3898 mova m4, m3 | |
3899 pmaddwd m4, [r3 + 1 * 16] ; [10] | |
3900 paddd m4, [pd_16] | |
3901 psrld m4, 5 | |
3902 mova m2, m0 | |
3903 pmaddwd m2, [r3 + 1 * 16] | |
3904 paddd m2, [pd_16] | |
3905 psrld m2, 5 | |
3906 packusdw m4, m2 | |
3907 | |
3908 mova m2, m3 | |
3909 pmaddwd m2, [r3 + 3 * 16] ; [12] | |
3910 paddd m2, [pd_16] | |
3911 psrld m2, 5 | |
3912 mova m1, m0 | |
3913 pmaddwd m1, [r3 + 3 * 16] | |
3914 paddd m1, [pd_16] | |
3915 psrld m1, 5 | |
3916 packusdw m2, m1 | |
3917 | |
3918 mova m6, m3 | |
3919 pmaddwd m6, [r3 + 5 * 16] ; [14] | |
3920 paddd m6, [pd_16] | |
3921 psrld m6, 5 | |
3922 mova m5, m0 | |
3923 pmaddwd m5, [r3 + 5 * 16] | |
3924 paddd m5, [pd_16] | |
3925 psrld m5, 5 | |
3926 packusdw m6, m5 | |
3927 | |
; last vector: m3 and m0 are consumed in place, no further copies are needed
3928 pmaddwd m3, [r3 + 7 * 16] ; [16] | |
3929 paddd m3, [pd_16] | |
3930 psrld m3, 5 | |
3931 pmaddwd m0, [r3 + 7 * 16] | |
3932 paddd m0, [pd_16] | |
3933 psrld m0, 5 | |
3934 packusdw m3, m0 | |
3935 | |
; same transpose as above, this time for m4, m2, m6, m3 of the second group
3936 punpckhwd m5, m4, m2 | |
3937 punpcklwd m4, m2 | |
3938 punpckhwd m2, m6, m3 | |
3939 punpcklwd m6, m3 | |
3940 | |
3941 punpckldq m3, m4, m6 | |
3942 punpckhdq m4, m6 | |
3943 punpckldq m6, m5, m2 | |
3944 punpckhdq m5, m2 | |
3945 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
3946 movh [r0 + 8], m3 | |
3947 movhps [r0 + r1 + 8], m3 | |
3948 movh [r0 + r1 * 2 + 8], m4 | |
3949 movhps [r0 + r4 + 8], m4 | |
3950 lea r0, [r0 + r1 * 4] | |
3951 movh [r0 + 8], m6 | |
3952 movhps [r0 + r1 + 8], m6 | |
3953 movh [r0 + r1 * 2 + 8], m5 | |
3954 movhps [r0 + r4 + 8], m5 | |
3955 RET | |
3956 | |
; intra_pred_ang8_10: 8x8 intra prediction on 16-bit samples where every destination row is a
; single reference sample replicated eight times (mode 10, going by the name).
; cglobal 3,6,3 (x86inc convention): 3 arguments loaded into r0-r2, 6 GPRs, 3 vector registers.
;   r0  = destination, written as 8 rows of 8 words
;   r1  = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2  = reference sample buffer; the replicated samples are the 8 words at [r2 + 34]
;   r4m = fifth argument, read in place rather than loaded into a register; a non-zero low byte
;         enables the extra adjustment of row 0 (NOTE(review): presumably an edge-filter flag)
; Row k (k = 1..7) is written immediately; row 0 is kept in m0 and stored last at .quit so the
; optional adjustment can be applied to it first.
3957 cglobal intra_pred_ang8_10, 3,6,3 | |
3958 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
3959 pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1] | |
3960 add r1, r1 | |
3961 lea r3, [r1 * 3] | |
3962 | |
; each step shifts the next sample into word 0 and replicates it with the pb_01 shuffle
3963 psrldq m1, 2 | |
3964 pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2] | |
3965 movu [r0 + r1], m2 | |
3966 psrldq m1, 2 | |
3967 pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3] | |
3968 movu [r0 + r1 * 2], m2 | |
3969 psrldq m1, 2 | |
3970 pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4] | |
3971 movu [r0 + r3], m2 | |
3972 | |
; r5 = row 4
3973 lea r5, [r0 + r1 *4] | |
3974 psrldq m1, 2 | |
3975 pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5] | |
3976 movu [r5], m2 | |
3977 psrldq m1, 2 | |
3978 pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6] | |
3979 movu [r5 + r1], m2 | |
3980 psrldq m1, 2 | |
3981 pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7] | |
3982 movu [r5 + r1 * 2], m2 | |
3983 psrldq m1, 2 | |
3984 pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8] | |
3985 movu [r5 + r3], m2 | |
3986 | |
; skip the row-0 adjustment when the flag byte is zero
3987 cmp r4m, byte 0 | |
3988 jz .quit | |
3989 | |
; row0[i] = clamp(row0[i] + ((ref[i + 1] - ref[0]) >> 1), 0, [pw_pixel_max]),
; where ref[] are the words starting at [r2]; the shift is arithmetic (psraw)
3990 ; filter | |
3991 | |
3992 movh m1, [r2] ; [3 2 1 0] | |
3993 pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0] | |
3994 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
3995 psubw m1, m2 | |
3996 psraw m1, 1 | |
3997 paddw m0, m1 | |
3998 pxor m1, m1 | |
3999 pmaxsw m0, m1 | |
4000 pminsw m0, [pw_pixel_max] | |
; row 0 is stored on both paths
4001 .quit: | |
4002 movu [r0], m0 | |
4003 RET | |
4004 | |
; intra_pred_ang8_11: 8x8 angular intra prediction on 16-bit samples (mode 11, going by the name).
; cglobal 3,5,7 (x86inc convention): 3 arguments loaded into r0-r2, 5 GPRs, 7 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; samples are read at byte offset 32, and sample 0 is replaced
;        by the word at [r2] (NOTE(review): presumably the shared corner sample -- confirm)
;   r3 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; All eight vectors use the same sample window (m3/m0); only the fraction changes (30, 28, ..., 16).
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
4005 cglobal intra_pred_ang8_11, 3,5,7 | |
4006 lea r3, [ang_table + 23 * 16] | |
4007 add r1, r1 | |
4008 | |
4009 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
4010 pinsrw m0, [r2], 0 | |
4011 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
4012 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
4013 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
4014 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
4015 | |
4016 mova m4, m3 | |
4017 pmaddwd m4, [r3 + 7 * 16] ; [30] | |
4018 paddd m4, [pd_16] | |
4019 psrld m4, 5 | |
4020 mova m2, m0 | |
4021 pmaddwd m2, [r3 + 7 * 16] | |
4022 paddd m2, [pd_16] | |
4023 psrld m2, 5 | |
4024 packusdw m4, m2 | |
4025 | |
4026 mova m2, m3 | |
4027 pmaddwd m2, [r3 + 5 * 16] ; [28] | |
4028 paddd m2, [pd_16] | |
4029 psrld m2, 5 | |
4030 mova m1, m0 | |
4031 pmaddwd m1, [r3 + 5 * 16] | |
4032 paddd m1, [pd_16] | |
4033 psrld m1, 5 | |
4034 packusdw m2, m1 | |
4035 | |
4036 mova m6, m3 | |
4037 pmaddwd m6, [r3 + 3 * 16] ; [26] | |
4038 paddd m6, [pd_16] | |
4039 psrld m6, 5 | |
4040 mova m1, m0 | |
4041 pmaddwd m1, [r3 + 3 * 16] | |
4042 paddd m1, [pd_16] | |
4043 psrld m1, 5 | |
4044 packusdw m6, m1 | |
4045 | |
4046 mova m5, m3 | |
4047 pmaddwd m5, [r3 + 1 * 16] ; [24] | |
4048 paddd m5, [pd_16] | |
4049 psrld m5, 5 | |
4050 mova m1, m0 | |
4051 pmaddwd m1, [r3 + 1 * 16] | |
4052 paddd m1, [pd_16] | |
4053 psrld m1, 5 | |
4054 packusdw m5, m1 | |
4055 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m5: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
4056 punpckhwd m1, m4, m2 | |
4057 punpcklwd m4, m2 | |
4058 punpckhwd m2, m6, m5 | |
4059 punpcklwd m6, m5 | |
4060 | |
4061 punpckldq m5, m4, m6 | |
4062 punpckhdq m4, m6 | |
4063 punpckldq m6, m1, m2 | |
4064 punpckhdq m1, m2 | |
4065 | |
; r4 = 3 * stride; write columns 0-3 of rows 0-7 (r2 is reused as the row-4 pointer,
; the source samples are already in registers)
4066 lea r4, [r1 * 3] | |
4067 movh [r0], m5 | |
4068 movhps [r0 + r1], m5 | |
4069 movh [r0 + r1 * 2], m4 | |
4070 movhps [r0 + r4], m4 | |
4071 lea r2, [r0 + r1 * 4] | |
4072 movh [r2], m6 | |
4073 movhps [r2 + r1], m6 | |
4074 movh [r2 + r1 * 2], m1 | |
4075 movhps [r2 + r4], m1 | |
4076 | |
; second group of four vectors: these become columns 4-7
4077 mova m4, m3 | |
4078 pmaddwd m4, [r3 - 1 * 16] ; [22] | |
4079 paddd m4, [pd_16] | |
4080 psrld m4, 5 | |
4081 mova m2, m0 | |
4082 pmaddwd m2, [r3 - 1 * 16] | |
4083 paddd m2, [pd_16] | |
4084 psrld m2, 5 | |
4085 packusdw m4, m2 | |
4086 | |
4087 mova m2, m3 | |
4088 pmaddwd m2, [r3 - 3 * 16] ; [20] | |
4089 paddd m2, [pd_16] | |
4090 psrld m2, 5 | |
4091 mova m1, m0 | |
4092 pmaddwd m1, [r3 - 3 * 16] | |
4093 paddd m1, [pd_16] | |
4094 psrld m1, 5 | |
4095 packusdw m2, m1 | |
4096 | |
4097 mova m6, m3 | |
4098 pmaddwd m6, [r3 - 5 * 16] ; [18] | |
4099 paddd m6, [pd_16] | |
4100 psrld m6, 5 | |
4101 mova m5, m0 | |
4102 pmaddwd m5, [r3 - 5 * 16] | |
4103 paddd m5, [pd_16] | |
4104 psrld m5, 5 | |
4105 packusdw m6, m5 | |
4106 | |
; last vector: m3 and m0 are consumed in place, no further copies are needed
4107 pmaddwd m3, [r3 - 7 * 16] ; [16] | |
4108 paddd m3, [pd_16] | |
4109 psrld m3, 5 | |
4110 pmaddwd m0, [r3 - 7 * 16] | |
4111 paddd m0, [pd_16] | |
4112 psrld m0, 5 | |
4113 packusdw m3, m0 | |
4114 | |
; same transpose as above, this time for m4, m2, m6, m3 of the second group
4115 punpckhwd m5, m4, m2 | |
4116 punpcklwd m4, m2 | |
4117 punpckhwd m2, m6, m3 | |
4118 punpcklwd m6, m3 | |
4119 | |
4120 punpckldq m3, m4, m6 | |
4121 punpckhdq m4, m6 | |
4122 punpckldq m6, m5, m2 | |
4123 punpckhdq m5, m2 | |
4124 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
4125 movh [r0 + 8], m3 | |
4126 movhps [r0 + r1 + 8], m3 | |
4127 movh [r0 + r1 * 2 + 8], m4 | |
4128 movhps [r0 + r4 + 8], m4 | |
4129 lea r0, [r0 + r1 * 4] | |
4130 movh [r0 + 8], m6 | |
4131 movhps [r0 + r1 + 8], m6 | |
4132 movh [r0 + r1 * 2 + 8], m5 | |
4133 movhps [r0 + r4 + 8], m5 | |
4134 RET | |
4135 | |
; intra_pred_ang8_12: 8x8 angular intra prediction on 16-bit samples (mode 12, going by the name).
; cglobal 3,6,7 (x86inc convention): 3 arguments loaded into r0-r2, 6 GPRs, 7 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; samples are read at byte offset 32, and sample 0 is replaced
;        by the word at [r2] (NOTE(review): presumably the shared corner sample -- confirm)
;   r3 = row-4 pointer for the first store group (r2 is still read later, so it is not reused)
;   r5 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
4136 cglobal intra_pred_ang8_12, 3,6,7 | |
4137 lea r5, [ang_table + 16 * 16] | |
4138 add r1, r1 | |
4139 | |
4140 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
4141 pinsrw m0, [r2], 0 | |
4142 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
4143 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
4144 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
4145 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
4146 | |
4147 mova m4, m3 | |
4148 pmaddwd m4, [r5 + 11 * 16] ; [27] | |
4149 paddd m4, [pd_16] | |
4150 psrld m4, 5 | |
4151 mova m2, m0 | |
4152 pmaddwd m2, [r5 + 11 * 16] | |
4153 paddd m2, [pd_16] | |
4154 psrld m2, 5 | |
4155 packusdw m4, m2 | |
4156 | |
4157 mova m2, m3 | |
4158 pmaddwd m2, [r5 + 6 * 16] ; [22] | |
4159 paddd m2, [pd_16] | |
4160 psrld m2, 5 | |
4161 mova m1, m0 | |
4162 pmaddwd m1, [r5 + 6 * 16] | |
4163 paddd m1, [pd_16] | |
4164 psrld m1, 5 | |
4165 packusdw m2, m1 | |
4166 | |
4167 mova m6, m3 | |
4168 pmaddwd m6, [r5 + 1 * 16] ; [17] | |
4169 paddd m6, [pd_16] | |
4170 psrld m6, 5 | |
4171 mova m1, m0 | |
4172 pmaddwd m1, [r5 + 1 * 16] | |
4173 paddd m1, [pd_16] | |
4174 psrld m1, 5 | |
4175 packusdw m6, m1 | |
4176 | |
4177 mova m5, m3 | |
4178 pmaddwd m5, [r5 - 4 * 16] ; [12] | |
4179 paddd m5, [pd_16] | |
4180 psrld m5, 5 | |
4181 mova m1, m0 | |
4182 pmaddwd m1, [r5 - 4 * 16] | |
4183 paddd m1, [pd_16] | |
4184 psrld m1, 5 | |
4185 packusdw m5, m1 | |
4186 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m5: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
4187 punpckhwd m1, m4, m2 | |
4188 punpcklwd m4, m2 | |
4189 punpckhwd m2, m6, m5 | |
4190 punpcklwd m6, m5 | |
4191 | |
4192 punpckldq m5, m4, m6 | |
4193 punpckhdq m4, m6 | |
4194 punpckldq m6, m1, m2 | |
4195 punpckhdq m1, m2 | |
4196 | |
; r4 = 3 * stride, r3 = row 4; write columns 0-3 of rows 0-7
4197 lea r4, [r1 * 3] | |
4198 movh [r0], m5 | |
4199 movhps [r0 + r1], m5 | |
4200 movh [r0 + r1 * 2], m4 | |
4201 movhps [r0 + r4], m4 | |
4202 lea r3, [r0 + r1 * 4] | |
4203 movh [r3], m6 | |
4204 movhps [r3 + r1], m6 | |
4205 movh [r3 + r1 * 2], m1 | |
4206 movhps [r3 + r4], m1 | |
4207 | |
; second group of four vectors: these become columns 4-7
4208 mova m4, m3 | |
4209 pmaddwd m4, [r5 - 9 * 16] ; [7] | |
4210 paddd m4, [pd_16] | |
4211 psrld m4, 5 | |
4212 mova m2, m0 | |
4213 pmaddwd m2, [r5 - 9 * 16] | |
4214 paddd m2, [pd_16] | |
4215 psrld m2, 5 | |
4216 packusdw m4, m2 | |
4217 | |
4218 mova m2, m3 | |
4219 pmaddwd m2, [r5 - 14 * 16] ; [2] | |
4220 paddd m2, [pd_16] | |
4221 psrld m2, 5 | |
4222 mova m1, m0 | |
4223 pmaddwd m1, [r5 - 14 * 16] | |
4224 paddd m1, [pd_16] | |
4225 psrld m1, 5 | |
4226 packusdw m2, m1 | |
4227 | |
; slide both pair windows down by one pair: m0 takes the top dword of m3 as its lowest pair,
; and m3 takes the top dword of the shuffled m1 as its lowest pair.
; NOTE(review): pw_ang8_12 is defined outside this chunk; presumably it gathers the samples
; that extend the reference below sample 0 -- confirm against its definition.
4228 palignr m0, m3, 12 | |
4229 movu m1, [r2] | |
4230 pshufb m1, [pw_ang8_12] | |
4231 palignr m3, m1, 12 | |
4232 | |
4233 mova m6, m3 | |
4234 pmaddwd m6, [r5 + 13 * 16] ; [29] | |
4235 paddd m6, [pd_16] | |
4236 psrld m6, 5 | |
4237 mova m5, m0 | |
4238 pmaddwd m5, [r5 + 13 * 16] | |
4239 paddd m5, [pd_16] | |
4240 psrld m5, 5 | |
4241 packusdw m6, m5 | |
4242 | |
4243 pmaddwd m3, [r5 + 8 * 16] ; [24] | |
4244 paddd m3, [pd_16] | |
4245 psrld m3, 5 | |
4246 pmaddwd m0, [r5 + 8 * 16] | |
4247 paddd m0, [pd_16] | |
4248 psrld m0, 5 | |
4249 packusdw m3, m0 | |
4250 | |
; same transpose as above, this time for m4, m2, m6, m3 of the second group
4251 punpckhwd m5, m4, m2 | |
4252 punpcklwd m4, m2 | |
4253 punpckhwd m2, m6, m3 | |
4254 punpcklwd m6, m3 | |
4255 | |
4256 punpckldq m3, m4, m6 | |
4257 punpckhdq m4, m6 | |
4258 punpckldq m6, m5, m2 | |
4259 punpckhdq m5, m2 | |
4260 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
4261 movh [r0 + 8], m3 | |
4262 movhps [r0 + r1 + 8], m3 | |
4263 movh [r0 + r1 * 2 + 8], m4 | |
4264 movhps [r0 + r4 + 8], m4 | |
4265 lea r0, [r0 + r1 * 4] | |
4266 movh [r0 + 8], m6 | |
4267 movhps [r0 + r1 + 8], m6 | |
4268 movh [r0 + r1 * 2 + 8], m5 | |
4269 movhps [r0 + r4 + 8], m5 | |
4270 RET | |
4271 | |
; intra_pred_ang8_13: 8x8 angular intra prediction on 16-bit samples (mode 13, going by the name).
; cglobal 3,6,8 (x86inc convention): 3 arguments loaded into r0-r2, 6 GPRs, 8 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; samples are read at byte offset 32, and sample 0 is replaced
;        by the word at [r2] (NOTE(review): presumably the shared corner sample -- confirm)
;   r5 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; The sample window (m3/m0) is slid down by one pair twice along the way, each time pulling its
; new lowest pair from m1.
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
4272 cglobal intra_pred_ang8_13, 3,6,8 | |
4273 lea r5, [ang_table + 14 * 16] | |
4274 add r1, r1 | |
4275 | |
4276 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
4277 pinsrw m0, [r2], 0 | |
4278 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
4279 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
4280 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
4281 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
4282 | |
4283 mova m4, m3 | |
4284 pmaddwd m4, [r5 + 9 * 16] ; [23] | |
4285 paddd m4, [pd_16] | |
4286 psrld m4, 5 | |
4287 mova m2, m0 | |
4288 pmaddwd m2, [r5 + 9 * 16] | |
4289 paddd m2, [pd_16] | |
4290 psrld m2, 5 | |
4291 packusdw m4, m2 | |
4292 | |
4293 mova m2, m3 | |
4294 pmaddwd m2, [r5] ; [14] | |
4295 paddd m2, [pd_16] | |
4296 psrld m2, 5 | |
4297 mova m1, m0 | |
4298 pmaddwd m1, [r5] | |
4299 paddd m1, [pd_16] | |
4300 psrld m1, 5 | |
4301 packusdw m2, m1 | |
4302 | |
4303 mova m6, m3 | |
4304 pmaddwd m6, [r5 - 9 * 16] ; [5] | |
4305 paddd m6, [pd_16] | |
4306 psrld m6, 5 | |
4307 mova m1, m0 | |
4308 pmaddwd m1, [r5 - 9 * 16] | |
4309 paddd m1, [pd_16] | |
4310 psrld m1, 5 | |
4311 packusdw m6, m1 | |
4312 | |
; slide both pair windows down by one pair: m0 takes the top dword of m3 as its lowest pair,
; and m3 takes the top dword of the shuffled m1 as its lowest pair.
; NOTE(review): pw_ang8_13 is defined outside this chunk; presumably it gathers the samples
; that extend the reference below sample 0 -- confirm against its definition.
4313 palignr m0, m3, 12 | |
4314 movu m1, [r2] | |
4315 pshufb m1, [pw_ang8_13] | |
4316 palignr m3, m1, 12 | |
4317 | |
4318 mova m5, m3 | |
4319 pmaddwd m5, [r5 + 14 * 16] ; [28] | |
4320 paddd m5, [pd_16] | |
4321 psrld m5, 5 | |
4322 mova m7, m0 | |
4323 pmaddwd m7, [r5 + 14 * 16] | |
4324 paddd m7, [pd_16] | |
4325 psrld m7, 5 | |
4326 packusdw m5, m7 | |
4327 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m5: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
4328 punpckhwd m7, m4, m2 | |
4329 punpcklwd m4, m2 | |
4330 punpckhwd m2, m6, m5 | |
4331 punpcklwd m6, m5 | |
4332 | |
4333 punpckldq m5, m4, m6 | |
4334 punpckhdq m4, m6 | |
4335 punpckldq m6, m7, m2 | |
4336 punpckhdq m7, m2 | |
4337 | |
; r4 = 3 * stride; write columns 0-3 of rows 0-7 (r2 is reused as the row-4 pointer,
; nothing further is loaded through it)
4338 lea r4, [r1 * 3] | |
4339 movh [r0], m5 | |
4340 movhps [r0 + r1], m5 | |
4341 movh [r0 + r1 * 2], m4 | |
4342 movhps [r0 + r4], m4 | |
4343 lea r2, [r0 + r1 * 4] | |
4344 movh [r2], m6 | |
4345 movhps [r2 + r1], m6 | |
4346 movh [r2 + r1 * 2], m7 | |
4347 movhps [r2 + r4], m7 | |
4348 | |
; second group of four vectors: these become columns 4-7
4349 mova m4, m3 | |
4350 pmaddwd m4, [r5 + 5 * 16] ; [19] | |
4351 paddd m4, [pd_16] | |
4352 psrld m4, 5 | |
4353 mova m2, m0 | |
4354 pmaddwd m2, [r5 + 5 * 16] | |
4355 paddd m2, [pd_16] | |
4356 psrld m2, 5 | |
4357 packusdw m4, m2 | |
4358 | |
4359 mova m2, m3 | |
4360 pmaddwd m2, [r5 - 4 * 16] ; [10] | |
4361 paddd m2, [pd_16] | |
4362 psrld m2, 5 | |
4363 mova m5, m0 | |
4364 pmaddwd m5, [r5 - 4 * 16] | |
4365 paddd m5, [pd_16] | |
4366 psrld m5, 5 | |
4367 packusdw m2, m5 | |
4368 | |
4369 mova m6, m3 | |
4370 pmaddwd m6, [r5 - 13 * 16] ; [1] | |
4371 paddd m6, [pd_16] | |
4372 psrld m6, 5 | |
4373 mova m5, m0 | |
4374 pmaddwd m5, [r5 - 13 * 16] | |
4375 paddd m5, [pd_16] | |
4376 psrld m5, 5 | |
4377 packusdw m6, m5 | |
4378 | |
; slide the window down once more: shifting m1 up by one word moves the next pair
; into its top dword, which then becomes the lowest pair of m3
4379 pslldq m1, 2 | |
4380 palignr m0, m3, 12 | |
4381 palignr m3, m1, 12 | |
4382 | |
4383 pmaddwd m3, [r5 + 10 * 16] ; [24] | |
4384 paddd m3, [pd_16] | |
4385 psrld m3, 5 | |
4386 pmaddwd m0, [r5 + 10 * 16] | |
4387 paddd m0, [pd_16] | |
4388 psrld m0, 5 | |
4389 packusdw m3, m0 | |
4390 | |
; same transpose as above, this time for m4, m2, m6, m3 of the second group
4391 punpckhwd m5, m4, m2 | |
4392 punpcklwd m4, m2 | |
4393 punpckhwd m2, m6, m3 | |
4394 punpcklwd m6, m3 | |
4395 | |
4396 punpckldq m3, m4, m6 | |
4397 punpckhdq m4, m6 | |
4398 punpckldq m6, m5, m2 | |
4399 punpckhdq m5, m2 | |
4400 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
4401 movh [r0 + 8], m3 | |
4402 movhps [r0 + r1 + 8], m3 | |
4403 movh [r0 + r1 * 2 + 8], m4 | |
4404 movhps [r0 + r4 + 8], m4 | |
4405 lea r0, [r0 + r1 * 4] | |
4406 movh [r0 + 8], m6 | |
4407 movhps [r0 + r1 + 8], m6 | |
4408 movh [r0 + r1 * 2 + 8], m5 | |
4409 movhps [r0 + r4 + 8], m5 | |
4410 RET | |
4411 | |
; intra_pred_ang8_14: 8x8 angular intra prediction on 16-bit samples (mode 14, going by the name).
; cglobal 3,6,8 (x86inc convention): 3 arguments loaded into r0-r2, 6 GPRs, 8 vector registers.
;   r0 = destination, written as 8 rows of 8 words
;   r1 = row stride; doubled below (presumably samples -> bytes, TODO confirm against the caller)
;   r2 = reference sample buffer; samples are read at byte offset 32, and sample 0 is replaced
;        by the word at [r2] (NOTE(review): presumably the shared corner sample -- confirm)
;   r5 = pointer into ang_table; entry f holds the word pairs (32-f, f), so the bracketed
;        number on each pmaddwd line is the fraction f being applied
; Each result vector is ((32-f)*ref[i] + f*ref[i+1] + [pd_16]) >> 5, packed to words by packusdw.
; The sample window (m3/m0) is slid down by one pair three times along the way, each time
; pulling its new lowest pair from m1.
; Vectors are produced four at a time and transposed, so vector k becomes column k of the
; destination: the first group fills columns 0-3, the second group columns 4-7.
4412 cglobal intra_pred_ang8_14, 3,6,8 | |
4413 lea r5, [ang_table + 18 * 16] | |
4414 add r1, r1 | |
4415 | |
4416 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
4417 pinsrw m0, [r2], 0 | |
4418 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
4419 | |
; interleave neighbouring samples so that pmaddwd sees (ref[i], ref[i+1]) word pairs
4420 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
4421 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
4422 | |
4423 mova m4, m3 | |
4424 pmaddwd m4, [r5 + 1 * 16] ; [19] | |
4425 paddd m4, [pd_16] | |
4426 psrld m4, 5 | |
4427 mova m2, m0 | |
4428 pmaddwd m2, [r5 + 1 * 16] | |
4429 paddd m2, [pd_16] | |
4430 psrld m2, 5 | |
4431 packusdw m4, m2 | |
4432 | |
4433 mova m2, m3 | |
4434 pmaddwd m2, [r5 - 12 * 16] ; [6] | |
4435 paddd m2, [pd_16] | |
4436 psrld m2, 5 | |
4437 mova m1, m0 | |
4438 pmaddwd m1, [r5 - 12 * 16] | |
4439 paddd m1, [pd_16] | |
4440 psrld m1, 5 | |
4441 packusdw m2, m1 | |
4442 | |
; slide both pair windows down by one pair: m0 takes the top dword of m3 as its lowest pair,
; and m3 takes the top dword of the shuffled m1 as its lowest pair.
; NOTE(review): pw_ang8_14 is defined outside this chunk; presumably it gathers the samples
; that extend the reference below sample 0 -- confirm against its definition.
4443 palignr m0, m3, 12 | |
4444 movu m1, [r2] | |
4445 pshufb m1, [pw_ang8_14] | |
4446 palignr m3, m1, 12 | |
4447 | |
4448 mova m6, m3 | |
4449 pmaddwd m6, [r5 + 7 * 16] ; [25] | |
4450 paddd m6, [pd_16] | |
4451 psrld m6, 5 | |
4452 mova m5, m0 | |
4453 pmaddwd m5, [r5 + 7 * 16] | |
4454 paddd m5, [pd_16] | |
4455 psrld m5, 5 | |
4456 packusdw m6, m5 | |
4457 | |
4458 mova m5, m3 | |
4459 pmaddwd m5, [r5 - 6 * 16] ; [12] | |
4460 paddd m5, [pd_16] | |
4461 psrld m5, 5 | |
4462 mova m7, m0 | |
4463 pmaddwd m7, [r5 - 6 * 16] | |
4464 paddd m7, [pd_16] | |
4465 psrld m7, 5 | |
4466 packusdw m5, m7 | |
4467 | |
; 4x8 word transpose of the vectors in m4, m2, m6, m5: afterwards every 64-bit half
; holds one destination row's four words (one word from each vector)
4468 punpckhwd m7, m4, m2 | |
4469 punpcklwd m4, m2 | |
4470 punpckhwd m2, m6, m5 | |
4471 punpcklwd m6, m5 | |
4472 | |
4473 punpckldq m5, m4, m6 | |
4474 punpckhdq m4, m6 | |
4475 punpckldq m6, m7, m2 | |
4476 punpckhdq m7, m2 | |
4477 | |
; r4 = 3 * stride; write columns 0-3 of rows 0-7 (r2 is reused as the row-4 pointer,
; nothing further is loaded through it)
4478 lea r4, [r1 * 3] | |
4479 movh [r0], m5 | |
4480 movhps [r0 + r1], m5 | |
4481 movh [r0 + r1 * 2], m4 | |
4482 movhps [r0 + r4], m4 | |
4483 lea r2, [r0 + r1 * 4] | |
4484 movh [r2], m6 | |
4485 movhps [r2 + r1], m6 | |
4486 movh [r2 + r1 * 2], m7 | |
4487 movhps [r2 + r4], m7 | |
4488 | |
; second group of four vectors (columns 4-7) starts with another one-pair slide: shifting m1
; up by one word moves the next pair into its top dword, which becomes the lowest pair of m3
4489 pslldq m1, 2 | |
4490 palignr m0, m3, 12 | |
4491 palignr m3, m1, 12 | |
4492 | |
4493 mova m4, m3 | |
4494 pmaddwd m4, [r5 + 13 * 16] ; [31] | |
4495 paddd m4, [pd_16] | |
4496 psrld m4, 5 | |
4497 mova m2, m0 | |
4498 pmaddwd m2, [r5 + 13 * 16] | |
4499 paddd m2, [pd_16] | |
4500 psrld m2, 5 | |
4501 packusdw m4, m2 | |
4502 | |
4503 mova m2, m3 | |
4504 pmaddwd m2, [r5] ; [18] | |
4505 paddd m2, [pd_16] | |
4506 psrld m2, 5 | |
4507 mova m5, m0 | |
4508 pmaddwd m5, [r5] | |
4509 paddd m5, [pd_16] | |
4510 psrld m5, 5 | |
4511 packusdw m2, m5 | |
4512 | |
4513 mova m6, m3 | |
4514 pmaddwd m6, [r5 - 13 * 16] ; [5] | |
4515 paddd m6, [pd_16] | |
4516 psrld m6, 5 | |
4517 mova m5, m0 | |
4518 pmaddwd m5, [r5 - 13 * 16] | |
4519 paddd m5, [pd_16] | |
4520 psrld m5, 5 | |
4521 packusdw m6, m5 | |
4522 | |
; third and final one-pair slide of the window, done the same way
4523 pslldq m1, 2 | |
4524 palignr m0, m3, 12 | |
4525 palignr m3, m1, 12 | |
4526 | |
4527 pmaddwd m3, [r5 + 6 * 16] ; [24] | |
4528 paddd m3, [pd_16] | |
4529 psrld m3, 5 | |
4530 pmaddwd m0, [r5 + 6 * 16] | |
4531 paddd m0, [pd_16] | |
4532 psrld m0, 5 | |
4533 packusdw m3, m0 | |
4534 | |
; same transpose as above, this time for m4, m2, m6, m3 of the second group
4535 punpckhwd m5, m4, m2 | |
4536 punpcklwd m4, m2 | |
4537 punpckhwd m2, m6, m3 | |
4538 punpcklwd m6, m3 | |
4539 | |
4540 punpckldq m3, m4, m6 | |
4541 punpckhdq m4, m6 | |
4542 punpckldq m6, m5, m2 | |
4543 punpckhdq m5, m2 | |
4544 | |
; write columns 4-7 (byte offset 8) of rows 0-3, then advance r0 by four rows for rows 4-7
4545 movh [r0 + 8], m3 | |
4546 movhps [r0 + r1 + 8], m3 | |
4547 movh [r0 + r1 * 2 + 8], m4 | |
4548 movhps [r0 + r4 + 8], m4 | |
4549 lea r0, [r0 + r1 * 4] | |
4550 movh [r0 + 8], m6 | |
4551 movhps [r0 + r1 + 8], m6 | |
4552 movh [r0 + r1 * 2 + 8], m5 | |
4553 movhps [r0 + r4 + 8], m5 | |
4554 RET | |
4555 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_15 -- 8x8 angular predictor on 16-bit samples; results
; are transposed before being stored.
; cglobal 3,6,8 (x86inc): 3 arguments, 6 GPRs (r0-r5), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r5 = ang_table + 20 * 16, so [r5 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 20 + d; f is the bracketed number
; Each result vector is ((32 - f) * a + f * b + rounding) >> 5 for the
; eight neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -17) & 31 for k = 0..7.
; Result vector k ends up as column k of the destination.
;-----------------------------------------------------------------------
4556 cglobal intra_pred_ang8_15, 3,6,8 | |
4557 lea r5, [ang_table + 20 * 16] | |
4558 add r1, r1 | |
4559 | |
; main reference: words at [r2 + 32] with word 0 replaced by the word at
; [r2], plus the same run one word further on ([r2 + 34])
4560 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
4561 pinsrw m0, [r2], 0 | |
4562 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
4563 | |
4564 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
4565 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
4566 | |
4567 mova m4, m3 | |
4568 pmaddwd m4, [r5 - 5 * 16] ; [15] | |
4569 paddd m4, [pd_16] | |
4570 psrld m4, 5 | |
4571 mova m2, m0 | |
4572 pmaddwd m2, [r5 - 5 * 16] | |
4573 paddd m2, [pd_16] | |
4574 psrld m2, 5 | |
4575 packusdw m4, m2 | |
4576 | |
; m1 = 16 bytes at [r2] reordered by pw_ang8_15 (table defined outside
; this excerpt). The palignr pair moves the m0:m3 pair window up by one
; dword; the new lowest pair is the top dword of m1.
4577 palignr m0, m3, 12 | |
4578 movu m1, [r2] | |
4579 pshufb m1, [pw_ang8_15] | |
4580 palignr m3, m1, 12 | |
4581 | |
4582 mova m2, m3 | |
4583 pmaddwd m2, [r5 + 10 * 16] ; [30] | |
4584 paddd m2, [pd_16] | |
4585 psrld m2, 5 | |
4586 mova m5, m0 | |
4587 pmaddwd m5, [r5 + 10 * 16] | |
4588 paddd m5, [pd_16] | |
4589 psrld m5, 5 | |
4590 packusdw m2, m5 | |
4591 | |
4592 mova m6, m3 | |
4593 pmaddwd m6, [r5 - 7 * 16] ; [13] | |
4594 paddd m6, [pd_16] | |
4595 psrld m6, 5 | |
4596 mova m5, m0 | |
4597 pmaddwd m5, [r5 - 7 * 16] | |
4598 paddd m5, [pd_16] | |
4599 psrld m5, 5 | |
4600 packusdw m6, m5 | |
4601 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window again
4602 pslldq m1, 2 | |
4603 palignr m0, m3, 12 | |
4604 palignr m3, m1, 12 | |
4605 | |
4606 mova m5, m3 | |
4607 pmaddwd m5, [r5 + 8 * 16] ; [28] | |
4608 paddd m5, [pd_16] | |
4609 psrld m5, 5 | |
4610 mova m7, m0 | |
4611 pmaddwd m7, [r5 + 8 * 16] | |
4612 paddd m7, [pd_16] | |
4613 psrld m7, 5 | |
4614 packusdw m5, m7 | |
4615 | |
; transpose result vectors 0-3 (m4, m2, m6, m5) into 4-word row pieces
4616 punpckhwd m7, m4, m2 | |
4617 punpcklwd m4, m2 | |
4618 punpckhwd m2, m6, m5 | |
4619 punpcklwd m6, m5 | |
4620 | |
4621 punpckldq m5, m4, m6 | |
4622 punpckhdq m4, m6 | |
4623 punpckldq m6, m7, m2 | |
4624 punpckhdq m7, m2 | |
4625 | |
; write bytes 0-7 of rows 0-7 (r4 = 3 * stride, r3 = address of row 4)
4626 lea r4, [r1 * 3] | |
4627 movh [r0], m5 | |
4628 movhps [r0 + r1], m5 | |
4629 movh [r0 + r1 * 2], m4 | |
4630 movhps [r0 + r4], m4 | |
4631 lea r3, [r0 + r1 * 4] | |
4632 movh [r3], m6 | |
4633 movhps [r3 + r1], m6 | |
4634 movh [r3 + r1 * 2], m7 | |
4635 movhps [r3 + r4], m7 | |
4636 | |
; result vectors 4-7; the first one reuses the current window position
4637 mova m4, m3 | |
4638 pmaddwd m4, [r5 - 9 * 16] ; [11] | |
4639 paddd m4, [pd_16] | |
4640 psrld m4, 5 | |
4641 mova m2, m0 | |
4642 pmaddwd m2, [r5 - 9 * 16] | |
4643 paddd m2, [pd_16] | |
4644 psrld m2, 5 | |
4645 packusdw m4, m2 | |
4646 | |
4647 pslldq m1, 2 | |
4648 palignr m0, m3, 12 | |
4649 palignr m3, m1, 12 | |
4650 | |
4651 mova m2, m3 | |
4652 pmaddwd m2, [r5 + 6 * 16] ; [26] | |
4653 paddd m2, [pd_16] | |
4654 psrld m2, 5 | |
4655 mova m5, m0 | |
4656 pmaddwd m5, [r5 + 6 * 16] | |
4657 paddd m5, [pd_16] | |
4658 psrld m5, 5 | |
4659 packusdw m2, m5 | |
4660 | |
4661 mova m6, m3 | |
4662 pmaddwd m6, [r5 - 11 * 16] ; [9] | |
4663 paddd m6, [pd_16] | |
4664 psrld m6, 5 | |
4665 mova m5, m0 | |
4666 pmaddwd m5, [r5 - 11 * 16] | |
4667 paddd m5, [pd_16] | |
4668 psrld m5, 5 | |
4669 packusdw m6, m5 | |
4670 | |
; last slide; the lowest word of m3 is then overwritten with the word
; stored at [r2 + 16]
4671 pslldq m1, 2 | |
4672 palignr m0, m3, 12 | |
4673 palignr m3, m1, 12 | |
4674 pinsrw m3, [r2 + 16], 0 | |
4675 | |
4676 pmaddwd m3, [r5 + 4 * 16] ; [24] | |
4677 paddd m3, [pd_16] | |
4678 psrld m3, 5 | |
4679 pmaddwd m0, [r5 + 4 * 16] | |
4680 paddd m0, [pd_16] | |
4681 psrld m0, 5 | |
4682 packusdw m3, m0 | |
4683 | |
; transpose result vectors 4-7 (m4, m2, m6, m3)
4684 punpckhwd m5, m4, m2 | |
4685 punpcklwd m4, m2 | |
4686 punpckhwd m2, m6, m3 | |
4687 punpcklwd m6, m3 | |
4688 | |
4689 punpckldq m3, m4, m6 | |
4690 punpckhdq m4, m6 | |
4691 punpckldq m6, m5, m2 | |
4692 punpckhdq m5, m2 | |
4693 | |
; write bytes 8-15 of rows 0-7
4694 movh [r0 + 8], m3 | |
4695 movhps [r0 + r1 + 8], m3 | |
4696 movh [r0 + r1 * 2 + 8], m4 | |
4697 movhps [r0 + r4 + 8], m4 | |
4698 lea r0, [r0 + r1 * 4] | |
4699 movh [r0 + 8], m6 | |
4700 movhps [r0 + r1 + 8], m6 | |
4701 movh [r0 + r1 * 2 + 8], m5 | |
4702 movhps [r0 + r4 + 8], m5 | |
4703 RET | |
4704 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_16 -- 8x8 angular predictor on 16-bit samples; results
; are transposed before being stored.
; cglobal 3,6,8 (x86inc): 3 arguments, 6 GPRs (r0-r5), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r5 = ang_table + 13 * 16, so [r5 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 13 + d; f is the bracketed number
; Each result vector is ((32 - f) * a + f * b + rounding) >> 5 for the
; eight neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -21) & 31 for k = 0..7.
; Result vector k ends up as column k of the destination.
;-----------------------------------------------------------------------
4705 cglobal intra_pred_ang8_16, 3,6,8 | |
4706 lea r5, [ang_table + 13 * 16] | |
4707 add r1, r1 | |
4708 | |
; main reference: words at [r2 + 32] with word 0 replaced by the word at
; [r2], plus the same run one word further on ([r2 + 34])
4709 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
4710 pinsrw m0, [r2], 0 | |
4711 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
4712 | |
4713 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
4714 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
4715 | |
4716 mova m4, m3 | |
4717 pmaddwd m4, [r5 - 2 * 16] ; [11] | |
4718 paddd m4, [pd_16] | |
4719 psrld m4, 5 | |
4720 mova m2, m0 | |
4721 pmaddwd m2, [r5 - 2 * 16] | |
4722 paddd m2, [pd_16] | |
4723 psrld m2, 5 | |
4724 packusdw m4, m2 | |
4725 | |
; m1 = 16 bytes at [r2] reordered by pw_ang8_16 (table defined outside
; this excerpt). The palignr pair moves the m0:m3 pair window up by one
; dword; the new lowest pair is the top dword of m1.
4726 palignr m0, m3, 12 | |
4727 movu m1, [r2] | |
4728 pshufb m1, [pw_ang8_16] | |
4729 palignr m3, m1, 12 | |
4730 | |
4731 mova m2, m3 | |
4732 pmaddwd m2, [r5 + 9 * 16] ; [22] | |
4733 paddd m2, [pd_16] | |
4734 psrld m2, 5 | |
4735 mova m5, m0 | |
4736 pmaddwd m5, [r5 + 9 * 16] | |
4737 paddd m5, [pd_16] | |
4738 psrld m5, 5 | |
4739 packusdw m2, m5 | |
4740 | |
4741 mova m6, m3 | |
4742 pmaddwd m6, [r5 - 12 * 16] ; [1] | |
4743 paddd m6, [pd_16] | |
4744 psrld m6, 5 | |
4745 mova m5, m0 | |
4746 pmaddwd m5, [r5 - 12 * 16] | |
4747 paddd m5, [pd_16] | |
4748 psrld m5, 5 | |
4749 packusdw m6, m5 | |
4750 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window again
4751 pslldq m1, 2 | |
4752 palignr m0, m3, 12 | |
4753 palignr m3, m1, 12 | |
4754 | |
4755 mova m5, m3 | |
4756 pmaddwd m5, [r5 - 1 * 16] ; [12] | |
4757 paddd m5, [pd_16] | |
4758 psrld m5, 5 | |
4759 mova m7, m0 | |
4760 pmaddwd m7, [r5 - 1 * 16] | |
4761 paddd m7, [pd_16] | |
4762 psrld m7, 5 | |
4763 packusdw m5, m7 | |
4764 | |
; transpose result vectors 0-3 (m4, m2, m6, m5) into 4-word row pieces
4765 punpckhwd m7, m4, m2 | |
4766 punpcklwd m4, m2 | |
4767 punpckhwd m2, m6, m5 | |
4768 punpcklwd m6, m5 | |
4769 | |
4770 punpckldq m5, m4, m6 | |
4771 punpckhdq m4, m6 | |
4772 punpckldq m6, m7, m2 | |
4773 punpckhdq m7, m2 | |
4774 | |
; write bytes 0-7 of rows 0-7 (r4 = 3 * stride, r3 = address of row 4)
4775 lea r4, [r1 * 3] | |
4776 movh [r0], m5 | |
4777 movhps [r0 + r1], m5 | |
4778 movh [r0 + r1 * 2], m4 | |
4779 movhps [r0 + r4], m4 | |
4780 lea r3, [r0 + r1 * 4] | |
4781 movh [r3], m6 | |
4782 movhps [r3 + r1], m6 | |
4783 movh [r3 + r1 * 2], m7 | |
4784 movhps [r3 + r4], m7 | |
4785 | |
; result vectors 4-7
4786 pslldq m1, 2 | |
4787 palignr m0, m3, 12 | |
4788 palignr m3, m1, 12 | |
4789 | |
4790 mova m4, m3 | |
4791 pmaddwd m4, [r5 + 10 * 16] ; [23] | |
4792 paddd m4, [pd_16] | |
4793 psrld m4, 5 | |
4794 mova m2, m0 | |
4795 pmaddwd m2, [r5 + 10 * 16] | |
4796 paddd m2, [pd_16] | |
4797 psrld m2, 5 | |
4798 packusdw m4, m2 | |
4799 | |
4800 mova m2, m3 | |
4801 pmaddwd m2, [r5 - 11 * 16] ; [2] | |
4802 paddd m2, [pd_16] | |
4803 psrld m2, 5 | |
4804 mova m5, m0 | |
4805 pmaddwd m5, [r5 - 11 * 16] | |
4806 paddd m5, [pd_16] | |
4807 psrld m5, 5 | |
4808 packusdw m2, m5 | |
4809 | |
4810 pslldq m1, 2 | |
4811 palignr m0, m3, 12 | |
4812 palignr m3, m1, 12 | |
4813 | |
4814 mova m6, m3 | |
4815 pmaddwd m6, [r5] ; [13] | |
4816 paddd m6, [pd_16] | |
4817 psrld m6, 5 | |
4818 mova m5, m0 | |
4819 pmaddwd m5, [r5] | |
4820 paddd m5, [pd_16] | |
4821 psrld m5, 5 | |
4822 packusdw m6, m5 | |
4823 | |
; last slide; the lowest word of m3 is then overwritten with the word
; stored at [r2 + 16]
4824 pslldq m1, 2 | |
4825 palignr m0, m3, 12 | |
4826 palignr m3, m1, 12 | |
4827 pinsrw m3, [r2 + 16], 0 | |
4828 | |
4829 pmaddwd m3, [r5 + 11 * 16] ; [24] | |
4830 paddd m3, [pd_16] | |
4831 psrld m3, 5 | |
4832 pmaddwd m0, [r5 + 11 * 16] | |
4833 paddd m0, [pd_16] | |
4834 psrld m0, 5 | |
4835 packusdw m3, m0 | |
4836 | |
; transpose result vectors 4-7 (m4, m2, m6, m3)
4837 punpckhwd m5, m4, m2 | |
4838 punpcklwd m4, m2 | |
4839 punpckhwd m2, m6, m3 | |
4840 punpcklwd m6, m3 | |
4841 | |
4842 punpckldq m3, m4, m6 | |
4843 punpckhdq m4, m6 | |
4844 punpckldq m6, m5, m2 | |
4845 punpckhdq m5, m2 | |
4846 | |
; write bytes 8-15 of rows 0-7
4847 movh [r0 + 8], m3 | |
4848 movhps [r0 + r1 + 8], m3 | |
4849 movh [r0 + r1 * 2 + 8], m4 | |
4850 movhps [r0 + r4 + 8], m4 | |
4851 lea r0, [r0 + r1 * 4] | |
4852 movh [r0 + 8], m6 | |
4853 movhps [r0 + r1 + 8], m6 | |
4854 movh [r0 + r1 * 2 + 8], m5 | |
4855 movhps [r0 + r4 + 8], m5 | |
4856 RET | |
4857 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_17 -- 8x8 angular predictor on 16-bit samples; results
; are transposed before being stored.
; cglobal 3,6,8 (x86inc): 3 arguments, 6 GPRs (r0-r5), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r5 = ang_table + 17 * 16, so [r5 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 17 + d; f is the bracketed number
; Each result vector is ((32 - f) * a + f * b + rounding) >> 5 for the
; eight neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -26) & 31 for k = 0..7.
; Result vector k ends up as column k of the destination.
;-----------------------------------------------------------------------
4858 cglobal intra_pred_ang8_17, 3,6,8 | |
4859 lea r5, [ang_table + 17 * 16] | |
4860 add r1, r1 | |
4861 | |
; main reference: words at [r2 + 32] with word 0 replaced by the word at
; [r2], plus the same run one word further on ([r2 + 34])
4862 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
4863 pinsrw m0, [r2], 0 | |
4864 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] | |
4865 | |
4866 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
4867 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
4868 | |
4869 mova m4, m3 | |
4870 pmaddwd m4, [r5 - 11 * 16] ; [6] | |
4871 paddd m4, [pd_16] | |
4872 psrld m4, 5 | |
4873 mova m2, m0 | |
4874 pmaddwd m2, [r5 - 11 * 16] | |
4875 paddd m2, [pd_16] | |
4876 psrld m2, 5 | |
4877 packusdw m4, m2 | |
4878 | |
; m1 = 16 bytes at [r2] reordered by pw_ang8_17 (table defined outside
; this excerpt). The palignr pair moves the m0:m3 pair window up by one
; dword; the new lowest pair is the top dword of m1.
4879 palignr m0, m3, 12 | |
4880 movu m1, [r2] | |
4881 pshufb m1, [pw_ang8_17] | |
4882 palignr m3, m1, 12 | |
4883 | |
4884 mova m2, m3 | |
4885 pmaddwd m2, [r5 - 5 * 16] ; [12] | |
4886 paddd m2, [pd_16] | |
4887 psrld m2, 5 | |
4888 mova m5, m0 | |
4889 pmaddwd m5, [r5 - 5 * 16] | |
4890 paddd m5, [pd_16] | |
4891 psrld m5, 5 | |
4892 packusdw m2, m5 | |
4893 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window again (this mode slides before every vector)
4894 pslldq m1, 2 | |
4895 palignr m0, m3, 12 | |
4896 palignr m3, m1, 12 | |
4897 | |
4898 mova m6, m3 | |
4899 pmaddwd m6, [r5 + 1 * 16] ; [18] | |
4900 paddd m6, [pd_16] | |
4901 psrld m6, 5 | |
4902 mova m5, m0 | |
4903 pmaddwd m5, [r5 + 1 * 16] | |
4904 paddd m5, [pd_16] | |
4905 psrld m5, 5 | |
4906 packusdw m6, m5 | |
4907 | |
4908 pslldq m1, 2 | |
4909 palignr m0, m3, 12 | |
4910 palignr m3, m1, 12 | |
4911 | |
4912 mova m5, m3 | |
4913 pmaddwd m5, [r5 + 7 * 16] ; [24] | |
4914 paddd m5, [pd_16] | |
4915 psrld m5, 5 | |
4916 mova m7, m0 | |
4917 pmaddwd m7, [r5 + 7 * 16] | |
4918 paddd m7, [pd_16] | |
4919 psrld m7, 5 | |
4920 packusdw m5, m7 | |
4921 | |
; transpose result vectors 0-3 (m4, m2, m6, m5) into 4-word row pieces
4922 punpckhwd m7, m4, m2 | |
4923 punpcklwd m4, m2 | |
4924 punpckhwd m2, m6, m5 | |
4925 punpcklwd m6, m5 | |
4926 | |
4927 punpckldq m5, m4, m6 | |
4928 punpckhdq m4, m6 | |
4929 punpckldq m6, m7, m2 | |
4930 punpckhdq m7, m2 | |
4931 | |
; write bytes 0-7 of rows 0-7 (r4 = 3 * stride, r3 = address of row 4)
4932 lea r4, [r1 * 3] | |
4933 movh [r0], m5 | |
4934 movhps [r0 + r1], m5 | |
4935 movh [r0 + r1 * 2], m4 | |
4936 movhps [r0 + r4], m4 | |
4937 lea r3, [r0 + r1 * 4] | |
4938 movh [r3], m6 | |
4939 movhps [r3 + r1], m6 | |
4940 movh [r3 + r1 * 2], m7 | |
4941 movhps [r3 + r4], m7 | |
4942 | |
; result vectors 4-7
4943 pslldq m1, 2 | |
4944 palignr m0, m3, 12 | |
4945 palignr m3, m1, 12 | |
4946 | |
4947 mova m4, m3 | |
4948 pmaddwd m4, [r5 + 13 * 16] ; [30] | |
4949 paddd m4, [pd_16] | |
4950 psrld m4, 5 | |
4951 mova m2, m0 | |
4952 pmaddwd m2, [r5 + 13 * 16] | |
4953 paddd m2, [pd_16] | |
4954 psrld m2, 5 | |
4955 packusdw m4, m2 | |
4956 | |
; same window position as the previous vector, different fraction
4957 mova m2, m3 | |
4958 pmaddwd m2, [r5 - 13 * 16] ; [4] | |
4959 paddd m2, [pd_16] | |
4960 psrld m2, 5 | |
4961 mova m5, m0 | |
4962 pmaddwd m5, [r5 - 13 * 16] | |
4963 paddd m5, [pd_16] | |
4964 psrld m5, 5 | |
4965 packusdw m2, m5 | |
4966 | |
4967 pslldq m1, 2 | |
4968 palignr m0, m3, 12 | |
4969 palignr m3, m1, 12 | |
4970 | |
4971 mova m6, m3 | |
4972 pmaddwd m6, [r5 - 7 * 16] ; [10] | |
4973 paddd m6, [pd_16] | |
4974 psrld m6, 5 | |
4975 mova m5, m0 | |
4976 pmaddwd m5, [r5 - 7 * 16] | |
4977 paddd m5, [pd_16] | |
4978 psrld m5, 5 | |
4979 packusdw m6, m5 | |
4980 | |
4981 pslldq m1, 2 | |
4982 palignr m0, m3, 12 | |
4983 palignr m3, m1, 12 | |
4984 | |
4985 pmaddwd m3, [r5 - 1 * 16] ; [16] | |
4986 paddd m3, [pd_16] | |
4987 psrld m3, 5 | |
4988 pmaddwd m0, [r5 - 1 * 16] | |
4989 paddd m0, [pd_16] | |
4990 psrld m0, 5 | |
4991 packusdw m3, m0 | |
4992 | |
; transpose result vectors 4-7 (m4, m2, m6, m3)
4993 punpckhwd m5, m4, m2 | |
4994 punpcklwd m4, m2 | |
4995 punpckhwd m2, m6, m3 | |
4996 punpcklwd m6, m3 | |
4997 | |
4998 punpckldq m3, m4, m6 | |
4999 punpckhdq m4, m6 | |
5000 punpckldq m6, m5, m2 | |
5001 punpckhdq m5, m2 | |
5002 | |
; write bytes 8-15 of rows 0-7
5003 movh [r0 + 8], m3 | |
5004 movhps [r0 + r1 + 8], m3 | |
5005 movh [r0 + r1 * 2 + 8], m4 | |
5006 movhps [r0 + r4 + 8], m4 | |
5007 lea r0, [r0 + r1 * 4] | |
5008 movh [r0 + 8], m6 | |
5009 movhps [r0 + r1 + 8], m6 | |
5010 movh [r0 + r1 * 2 + 8], m5 | |
5011 movhps [r0 + r4 + 8], m5 | |
5012 RET | |
5013 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_18 -- 8x8 predictor on 16-bit samples that only copies
; reference samples (no weighting).
; cglobal 3,4,3 (x86inc): 3 arguments, 4 GPRs (r0-r3), 3 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r3 = 3 * stride
; m1 = 8 words at [r2]; m0 = 8 words at [r2 + 34] reordered by pw_swap16
; (table defined outside this excerpt; presumably reverses word order).
; Row 0 is m1. Row k (k = 1..7) is the top k words of m0 followed by the
; lowest 8 - k words of m1, produced by palignr with shift 16 - 2 * k.
;-----------------------------------------------------------------------
5014 cglobal intra_pred_ang8_18, 3,4,3 | |
5015 add r1, r1 | |
5016 lea r3, [r1 * 3] | |
5017 movu m1, [r2] | |
5018 movu m0, [r2 + 34] | |
5019 pshufb m0, [pw_swap16] | |
5020 movu [r0], m1 | |
5021 palignr m2, m1, m0, 14 | |
5022 movu [r0 + r1], m2 | |
5023 palignr m2, m1, m0, 12 | |
5024 movu [r0 + r1 * 2], m2 | |
5025 palignr m2, m1, m0, 10 | |
5026 movu [r0 + r3], m2 | |
; advance r0 to row 4
5027 lea r0, [r0 + r1 * 4] | |
5028 palignr m2, m1, m0, 8 | |
5029 movu [r0], m2 | |
5030 palignr m2, m1, m0, 6 | |
5031 movu [r0 + r1], m2 | |
5032 palignr m2, m1, m0, 4 | |
5033 movu [r0 + r1 * 2], m2 | |
; last row: m1 is no longer needed, so it is used as the destination
5034 palignr m1, m0, 2 | |
5035 movu [r0 + r3], m1 | |
5036 RET | |
5037 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_19 -- 8x8 angular predictor on 16-bit samples; each
; result vector is stored directly as one destination row.
; cglobal 3,5,8 (x86inc): 3 arguments, 5 GPRs (r0-r4), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r3 = ang_table + 17 * 16, so [r3 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 17 + d; f is the bracketed number
;   r4 = 3 * stride
; Each row is ((32 - f) * a + f * b + rounding) >> 5 for the eight
; neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -26) & 31 for k = 0..7.
;-----------------------------------------------------------------------
5038 cglobal intra_pred_ang8_19, 3,5,8 | |
5039 lea r3, [ang_table + 17 * 16] | |
5040 add r1, r1 | |
5041 | |
5042 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
5043 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5044 | |
5045 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
5046 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
5047 | |
5048 mova m4, m3 | |
5049 pmaddwd m4, [r3 - 11 * 16] ; [6] | |
5050 paddd m4, [pd_16] | |
5051 psrld m4, 5 | |
5052 mova m2, m0 | |
5053 pmaddwd m2, [r3 - 11 * 16] | |
5054 paddd m2, [pd_16] | |
5055 psrld m2, 5 | |
5056 packusdw m4, m2 | |
5057 | |
; m1 = 16 bytes at [r2 + 32] with word 0 replaced by the word at [r2],
; reordered by pw_ang8_17 (table defined outside this excerpt). The
; palignr pair moves the m0:m3 pair window up by one dword; the new
; lowest pair is the top dword of m1.
5058 palignr m0, m3, 12 | |
5059 movu m1, [r2 + 32] | |
5060 pinsrw m1, [r2], 0 | |
5061 pshufb m1, [pw_ang8_17] | |
5062 palignr m3, m1, 12 | |
5063 | |
5064 mova m2, m3 | |
5065 pmaddwd m2, [r3 - 5 * 16] ; [12] | |
5066 paddd m2, [pd_16] | |
5067 psrld m2, 5 | |
5068 mova m5, m0 | |
5069 pmaddwd m5, [r3 - 5 * 16] | |
5070 paddd m5, [pd_16] | |
5071 psrld m5, 5 | |
5072 packusdw m2, m5 | |
5073 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window again
5074 pslldq m1, 2 | |
5075 palignr m0, m3, 12 | |
5076 palignr m3, m1, 12 | |
5077 | |
5078 mova m6, m3 | |
5079 pmaddwd m6, [r3 + 1 * 16] ; [18] | |
5080 paddd m6, [pd_16] | |
5081 psrld m6, 5 | |
5082 mova m5, m0 | |
5083 pmaddwd m5, [r3 + 1 * 16] | |
5084 paddd m5, [pd_16] | |
5085 psrld m5, 5 | |
5086 packusdw m6, m5 | |
5087 | |
5088 pslldq m1, 2 | |
5089 palignr m0, m3, 12 | |
5090 palignr m3, m1, 12 | |
5091 | |
5092 mova m5, m3 | |
5093 pmaddwd m5, [r3 + 7 * 16] ; [24] | |
5094 paddd m5, [pd_16] | |
5095 psrld m5, 5 | |
5096 mova m7, m0 | |
5097 pmaddwd m7, [r3 + 7 * 16] | |
5098 paddd m7, [pd_16] | |
5099 psrld m7, 5 | |
5100 packusdw m5, m7 | |
5101 | |
; store rows 0-3
5102 lea r4, [r1 * 3] | |
5103 movu [r0], m4 | |
5104 movu [r0 + r1], m2 | |
5105 movu [r0 + r1 * 2], m6 | |
5106 movu [r0 + r4], m5 | |
5107 | |
5108 pslldq m1, 2 | |
5109 palignr m0, m3, 12 | |
5110 palignr m3, m1, 12 | |
5111 | |
5112 mova m4, m3 | |
5113 pmaddwd m4, [r3 + 13 * 16] ; [30] | |
5114 paddd m4, [pd_16] | |
5115 psrld m4, 5 | |
5116 mova m2, m0 | |
5117 pmaddwd m2, [r3 + 13 * 16] | |
5118 paddd m2, [pd_16] | |
5119 psrld m2, 5 | |
5120 packusdw m4, m2 | |
5121 | |
; same window position as the previous row, different fraction
5122 mova m2, m3 | |
5123 pmaddwd m2, [r3 - 13 * 16] ; [4] | |
5124 paddd m2, [pd_16] | |
5125 psrld m2, 5 | |
5126 mova m5, m0 | |
5127 pmaddwd m5, [r3 - 13 * 16] | |
5128 paddd m5, [pd_16] | |
5129 psrld m5, 5 | |
5130 packusdw m2, m5 | |
5131 | |
5132 pslldq m1, 2 | |
5133 palignr m0, m3, 12 | |
5134 palignr m3, m1, 12 | |
5135 | |
5136 mova m6, m3 | |
5137 pmaddwd m6, [r3 - 7 * 16] ; [10] | |
5138 paddd m6, [pd_16] | |
5139 psrld m6, 5 | |
5140 mova m5, m0 | |
5141 pmaddwd m5, [r3 - 7 * 16] | |
5142 paddd m5, [pd_16] | |
5143 psrld m5, 5 | |
5144 packusdw m6, m5 | |
5145 | |
5146 pslldq m1, 2 | |
5147 palignr m0, m3, 12 | |
5148 palignr m3, m1, 12 | |
5149 | |
5150 pmaddwd m3, [r3 - 1 * 16] ; [16] | |
5151 paddd m3, [pd_16] | |
5152 psrld m3, 5 | |
5153 pmaddwd m0, [r3 - 1 * 16] | |
5154 paddd m0, [pd_16] | |
5155 psrld m0, 5 | |
5156 packusdw m3, m0 | |
5157 | |
; store rows 4-7
5158 lea r0, [r0 + r1 * 4] | |
5159 movu [r0], m4 | |
5160 movu [r0 + r1], m2 | |
5161 movu [r0 + r1 * 2], m6 | |
5162 movu [r0 + r4], m3 | |
5163 RET | |
5164 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_20 -- 8x8 angular predictor on 16-bit samples; each
; result vector is stored directly as one destination row.
; cglobal 3,5,8 (x86inc): 3 arguments, 5 GPRs (r0-r4), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r3 = ang_table + 13 * 16, so [r3 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 13 + d; f is the bracketed number
;   r4 = 3 * stride
; Each row is ((32 - f) * a + f * b + rounding) >> 5 for the eight
; neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -21) & 31 for k = 0..7.
;-----------------------------------------------------------------------
5165 cglobal intra_pred_ang8_20, 3,5,8 | |
5166 lea r3, [ang_table + 13 * 16] | |
5167 add r1, r1 | |
5168 | |
5169 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
5170 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5171 | |
5172 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
5173 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
5174 | |
5175 mova m4, m3 | |
5176 pmaddwd m4, [r3 - 2 * 16] ; [11] | |
5177 paddd m4, [pd_16] | |
5178 psrld m4, 5 | |
5179 mova m2, m0 | |
5180 pmaddwd m2, [r3 - 2 * 16] | |
5181 paddd m2, [pd_16] | |
5182 psrld m2, 5 | |
5183 packusdw m4, m2 | |
5184 | |
; m1 = 16 bytes at [r2 + 32] with word 0 replaced by the word at [r2],
; reordered by pw_ang8_16 (table defined outside this excerpt). The
; palignr pair moves the m0:m3 pair window up by one dword; the new
; lowest pair is the top dword of m1.
5185 palignr m0, m3, 12 | |
5186 movu m1, [r2 + 32] | |
5187 pinsrw m1, [r2], 0 | |
5188 pshufb m1, [pw_ang8_16] | |
5189 palignr m3, m1, 12 | |
5190 | |
5191 mova m2, m3 | |
5192 pmaddwd m2, [r3 + 9 * 16] ; [22] | |
5193 paddd m2, [pd_16] | |
5194 psrld m2, 5 | |
5195 mova m5, m0 | |
5196 pmaddwd m5, [r3 + 9 * 16] | |
5197 paddd m5, [pd_16] | |
5198 psrld m5, 5 | |
5199 packusdw m2, m5 | |
5200 | |
5201 mova m6, m3 | |
5202 pmaddwd m6, [r3 - 12 * 16] ; [1] | |
5203 paddd m6, [pd_16] | |
5204 psrld m6, 5 | |
5205 mova m5, m0 | |
5206 pmaddwd m5, [r3 - 12 * 16] | |
5207 paddd m5, [pd_16] | |
5208 psrld m5, 5 | |
5209 packusdw m6, m5 | |
5210 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window again
5211 pslldq m1, 2 | |
5212 palignr m0, m3, 12 | |
5213 palignr m3, m1, 12 | |
5214 | |
5215 mova m5, m3 | |
5216 pmaddwd m5, [r3 - 1 * 16] ; [12] | |
5217 paddd m5, [pd_16] | |
5218 psrld m5, 5 | |
5219 mova m7, m0 | |
5220 pmaddwd m7, [r3 - 1 * 16] | |
5221 paddd m7, [pd_16] | |
5222 psrld m7, 5 | |
5223 packusdw m5, m7 | |
5224 | |
; store rows 0-3
5225 lea r4, [r1 * 3] | |
5226 movu [r0], m4 | |
5227 movu [r0 + r1], m2 | |
5228 movu [r0 + r1 * 2], m6 | |
5229 movu [r0 + r4], m5 | |
5230 | |
5231 pslldq m1, 2 | |
5232 palignr m0, m3, 12 | |
5233 palignr m3, m1, 12 | |
5234 | |
5235 mova m4, m3 | |
5236 pmaddwd m4, [r3 + 10 * 16] ; [23] | |
5237 paddd m4, [pd_16] | |
5238 psrld m4, 5 | |
5239 mova m2, m0 | |
5240 pmaddwd m2, [r3 + 10 * 16] | |
5241 paddd m2, [pd_16] | |
5242 psrld m2, 5 | |
5243 packusdw m4, m2 | |
5244 | |
5245 mova m2, m3 | |
5246 pmaddwd m2, [r3 - 11 * 16] ; [2] | |
5247 paddd m2, [pd_16] | |
5248 psrld m2, 5 | |
5249 mova m5, m0 | |
5250 pmaddwd m5, [r3 - 11 * 16] | |
5251 paddd m5, [pd_16] | |
5252 psrld m5, 5 | |
5253 packusdw m2, m5 | |
5254 | |
5255 pslldq m1, 2 | |
5256 palignr m0, m3, 12 | |
5257 palignr m3, m1, 12 | |
5258 | |
5259 mova m6, m3 | |
5260 pmaddwd m6, [r3] ; [13] | |
5261 paddd m6, [pd_16] | |
5262 psrld m6, 5 | |
5263 mova m5, m0 | |
5264 pmaddwd m5, [r3] | |
5265 paddd m5, [pd_16] | |
5266 psrld m5, 5 | |
5267 packusdw m6, m5 | |
5268 | |
; last slide; the lowest word of m3 is then overwritten with the word
; stored at [r2 + 16 + 32]
5269 pslldq m1, 2 | |
5270 palignr m0, m3, 12 | |
5271 palignr m3, m1, 12 | |
5272 pinsrw m3, [r2 + 16 + 32], 0 | |
5273 | |
5274 pmaddwd m3, [r3 + 11 * 16] ; [24] | |
5275 paddd m3, [pd_16] | |
5276 psrld m3, 5 | |
5277 pmaddwd m0, [r3 + 11 * 16] | |
5278 paddd m0, [pd_16] | |
5279 psrld m0, 5 | |
5280 packusdw m3, m0 | |
5281 | |
; store rows 4-7
5282 lea r0, [r0 + r1 * 4] | |
5283 movu [r0], m4 | |
5284 movu [r0 + r1], m2 | |
5285 movu [r0 + r1 * 2], m6 | |
5286 movu [r0 + r4], m3 | |
5287 RET | |
5288 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_21 -- 8x8 angular predictor on 16-bit samples; each
; result vector is stored directly as one destination row.
; cglobal 3,5,8 (x86inc): 3 arguments, 5 GPRs (r0-r4), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r3 = ang_table + 20 * 16, so [r3 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 20 + d; f is the bracketed number
;   r4 = 3 * stride
; Each row is ((32 - f) * a + f * b + rounding) >> 5 for the eight
; neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -17) & 31 for k = 0..7.
;-----------------------------------------------------------------------
5289 cglobal intra_pred_ang8_21, 3,5,8 | |
5290 lea r3, [ang_table + 20 * 16] | |
5291 add r1, r1 | |
5292 | |
5293 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
5294 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5295 | |
5296 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
5297 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
5298 | |
5299 mova m4, m3 | |
5300 pmaddwd m4, [r3 - 5 * 16] ; [15] | |
5301 paddd m4, [pd_16] | |
5302 psrld m4, 5 | |
5303 mova m2, m0 | |
5304 pmaddwd m2, [r3 - 5 * 16] | |
5305 paddd m2, [pd_16] | |
5306 psrld m2, 5 | |
5307 packusdw m4, m2 | |
5308 | |
; m1 = 16 bytes at [r2 + 32] with word 0 replaced by the word at [r2],
; reordered by pw_ang8_15 (table defined outside this excerpt). The
; palignr pair moves the m0:m3 pair window up by one dword; the new
; lowest pair is the top dword of m1.
5309 palignr m0, m3, 12 | |
5310 movu m1, [r2 + 32] | |
5311 pinsrw m1, [r2], 0 | |
5312 pshufb m1, [pw_ang8_15] | |
5313 palignr m3, m1, 12 | |
5314 | |
5315 mova m2, m3 | |
5316 pmaddwd m2, [r3 + 10 * 16] ; [30] | |
5317 paddd m2, [pd_16] | |
5318 psrld m2, 5 | |
5319 mova m5, m0 | |
5320 pmaddwd m5, [r3 + 10 * 16] | |
5321 paddd m5, [pd_16] | |
5322 psrld m5, 5 | |
5323 packusdw m2, m5 | |
5324 | |
5325 mova m6, m3 | |
5326 pmaddwd m6, [r3 - 7 * 16] ; [13] | |
5327 paddd m6, [pd_16] | |
5328 psrld m6, 5 | |
5329 mova m5, m0 | |
5330 pmaddwd m5, [r3 - 7 * 16] | |
5331 paddd m5, [pd_16] | |
5332 psrld m5, 5 | |
5333 packusdw m6, m5 | |
5334 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window again
5335 pslldq m1, 2 | |
5336 palignr m0, m3, 12 | |
5337 palignr m3, m1, 12 | |
5338 | |
5339 mova m5, m3 | |
5340 pmaddwd m5, [r3 + 8 * 16] ; [28] | |
5341 paddd m5, [pd_16] | |
5342 psrld m5, 5 | |
5343 mova m7, m0 | |
5344 pmaddwd m7, [r3 + 8 * 16] | |
5345 paddd m7, [pd_16] | |
5346 psrld m7, 5 | |
5347 packusdw m5, m7 | |
5348 | |
; store rows 0-3
5349 lea r4, [r1 * 3] | |
5350 movu [r0], m4 | |
5351 movu [r0 + r1], m2 | |
5352 movu [r0 + r1 * 2], m6 | |
5353 movu [r0 + r4], m5 | |
5354 | |
; row 4 reuses the window position of row 3
5355 mova m4, m3 | |
5356 pmaddwd m4, [r3 - 9 * 16] ; [11] | |
5357 paddd m4, [pd_16] | |
5358 psrld m4, 5 | |
5359 mova m2, m0 | |
5360 pmaddwd m2, [r3 - 9 * 16] | |
5361 paddd m2, [pd_16] | |
5362 psrld m2, 5 | |
5363 packusdw m4, m2 | |
5364 | |
5365 pslldq m1, 2 | |
5366 palignr m0, m3, 12 | |
5367 palignr m3, m1, 12 | |
5368 | |
5369 mova m2, m3 | |
5370 pmaddwd m2, [r3 + 6 * 16] ; [26] | |
5371 paddd m2, [pd_16] | |
5372 psrld m2, 5 | |
5373 mova m5, m0 | |
5374 pmaddwd m5, [r3 + 6 * 16] | |
5375 paddd m5, [pd_16] | |
5376 psrld m5, 5 | |
5377 packusdw m2, m5 | |
5378 | |
5379 mova m6, m3 | |
5380 pmaddwd m6, [r3 - 11 * 16] ; [9] | |
5381 paddd m6, [pd_16] | |
5382 psrld m6, 5 | |
5383 mova m5, m0 | |
5384 pmaddwd m5, [r3 - 11 * 16] | |
5385 paddd m5, [pd_16] | |
5386 psrld m5, 5 | |
5387 packusdw m6, m5 | |
5388 | |
; last slide; the lowest word of m3 is then overwritten with the word
; stored at [r2 + 16 + 32]
5389 pslldq m1, 2 | |
5390 palignr m0, m3, 12 | |
5391 palignr m3, m1, 12 | |
5392 pinsrw m3, [r2 + 16 + 32], 0 | |
5393 | |
5394 pmaddwd m3, [r3 + 4 * 16] ; [24] | |
5395 paddd m3, [pd_16] | |
5396 psrld m3, 5 | |
5397 pmaddwd m0, [r3 + 4 * 16] | |
5398 paddd m0, [pd_16] | |
5399 psrld m0, 5 | |
5400 packusdw m3, m0 | |
5401 | |
; store rows 4-7
5402 lea r0, [r0 + r1 * 4] | |
5403 movu [r0], m4 | |
5404 movu [r0 + r1], m2 | |
5405 movu [r0 + r1 * 2], m6 | |
5406 movu [r0 + r4], m3 | |
5407 RET | |
5408 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_22 -- 8x8 angular predictor on 16-bit samples; each
; result vector is stored directly as one destination row.
; cglobal 3,5,8 (x86inc): 3 arguments, 5 GPRs (r0-r4), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r3 = ang_table + 18 * 16, so [r3 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 18 + d; f is the bracketed number
;   r4 = 3 * stride
; Each row is ((32 - f) * a + f * b + rounding) >> 5 for the eight
; neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -13) & 31 for k = 0..7.
;-----------------------------------------------------------------------
5409 cglobal intra_pred_ang8_22, 3,5,8 | |
5410 lea r3, [ang_table + 18 * 16] | |
5411 add r1, r1 | |
5412 | |
5413 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
5414 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5415 | |
5416 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
5417 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
5418 | |
5419 mova m4, m3 | |
5420 pmaddwd m4, [r3 + 1 * 16] ; [19] | |
5421 paddd m4, [pd_16] | |
5422 psrld m4, 5 | |
5423 mova m2, m0 | |
5424 pmaddwd m2, [r3 + 1 * 16] | |
5425 paddd m2, [pd_16] | |
5426 psrld m2, 5 | |
5427 packusdw m4, m2 | |
5428 | |
; m1 is free here (its samples already live in m3/m0), so it serves as
; the scratch register for the high half
5429 mova m2, m3 | |
5430 pmaddwd m2, [r3 - 12 * 16] ; [6] | |
5431 paddd m2, [pd_16] | |
5432 psrld m2, 5 | |
5433 mova m1, m0 | |
5434 pmaddwd m1, [r3 - 12 * 16] | |
5435 paddd m1, [pd_16] | |
5436 psrld m1, 5 | |
5437 packusdw m2, m1 | |
5438 | |
; m1 = 16 bytes at [r2 + 32] with word 0 replaced by the word at [r2],
; reordered by pw_ang8_14 (table defined outside this excerpt). The
; palignr pair moves the m0:m3 pair window up by one dword; the new
; lowest pair is the top dword of m1.
5439 palignr m0, m3, 12 | |
5440 movu m1, [r2 + 32] | |
5441 pinsrw m1, [r2], 0 | |
5442 pshufb m1, [pw_ang8_14] | |
5443 palignr m3, m1, 12 | |
5444 | |
5445 mova m6, m3 | |
5446 pmaddwd m6, [r3 + 7 * 16] ; [25] | |
5447 paddd m6, [pd_16] | |
5448 psrld m6, 5 | |
5449 mova m5, m0 | |
5450 pmaddwd m5, [r3 + 7 * 16] | |
5451 paddd m5, [pd_16] | |
5452 psrld m5, 5 | |
5453 packusdw m6, m5 | |
5454 | |
5455 mova m5, m3 | |
5456 pmaddwd m5, [r3 - 6 * 16] ; [12] | |
5457 paddd m5, [pd_16] | |
5458 psrld m5, 5 | |
5459 mova m7, m0 | |
5460 pmaddwd m7, [r3 - 6 * 16] | |
5461 paddd m7, [pd_16] | |
5462 psrld m7, 5 | |
5463 packusdw m5, m7 | |
5464 | |
; store rows 0-3
5465 lea r4, [r1 * 3] | |
5466 movu [r0], m4 | |
5467 movu [r0 + r1], m2 | |
5468 movu [r0 + r1 * 2], m6 | |
5469 movu [r0 + r4], m5 | |
5470 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window again
5471 pslldq m1, 2 | |
5472 palignr m0, m3, 12 | |
5473 palignr m3, m1, 12 | |
5474 | |
5475 mova m4, m3 | |
5476 pmaddwd m4, [r3 + 13 * 16] ; [31] | |
5477 paddd m4, [pd_16] | |
5478 psrld m4, 5 | |
5479 mova m2, m0 | |
5480 pmaddwd m2, [r3 + 13 * 16] | |
5481 paddd m2, [pd_16] | |
5482 psrld m2, 5 | |
5483 packusdw m4, m2 | |
5484 | |
5485 mova m2, m3 | |
5486 pmaddwd m2, [r3] ; [18] | |
5487 paddd m2, [pd_16] | |
5488 psrld m2, 5 | |
5489 mova m5, m0 | |
5490 pmaddwd m5, [r3] | |
5491 paddd m5, [pd_16] | |
5492 psrld m5, 5 | |
5493 packusdw m2, m5 | |
5494 | |
5495 mova m6, m3 | |
5496 pmaddwd m6, [r3 - 13 * 16] ; [5] | |
5497 paddd m6, [pd_16] | |
5498 psrld m6, 5 | |
5499 mova m5, m0 | |
5500 pmaddwd m5, [r3 - 13 * 16] | |
5501 paddd m5, [pd_16] | |
5502 psrld m5, 5 | |
5503 packusdw m6, m5 | |
5504 | |
5505 pslldq m1, 2 | |
5506 palignr m0, m3, 12 | |
5507 palignr m3, m1, 12 | |
5508 | |
5509 pmaddwd m3, [r3 + 6 * 16] ; [24] | |
5510 paddd m3, [pd_16] | |
5511 psrld m3, 5 | |
5512 pmaddwd m0, [r3 + 6 * 16] | |
5513 paddd m0, [pd_16] | |
5514 psrld m0, 5 | |
5515 packusdw m3, m0 | |
5516 | |
; store rows 4-7
5517 lea r0, [r0 + r1 * 4] | |
5518 movu [r0], m4 | |
5519 movu [r0 + r1], m2 | |
5520 movu [r0 + r1 * 2], m6 | |
5521 movu [r0 + r4], m3 | |
5522 RET | |
5523 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_23 -- 8x8 angular predictor on 16-bit samples; each
; result vector is stored directly as one destination row.
; cglobal 3,5,8 (x86inc): 3 arguments, 5 GPRs (r0-r4), 8 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r3 = ang_table + 14 * 16, so [r3 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 14 + d; f is the bracketed number
;   r4 = 3 * stride
; Each row is ((32 - f) * a + f * b + rounding) >> 5 for the eight
; neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -9) & 31 for k = 0..7.
;-----------------------------------------------------------------------
5524 cglobal intra_pred_ang8_23, 3,5,8 | |
5525 lea r3, [ang_table + 14 * 16] | |
5526 add r1, r1 | |
5527 | |
5528 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
5529 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5530 | |
5531 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
5532 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
5533 | |
; rows 0-2 all use the initial window position
5534 mova m4, m3 | |
5535 pmaddwd m4, [r3 + 9 * 16] ; [23] | |
5536 paddd m4, [pd_16] | |
5537 psrld m4, 5 | |
5538 mova m2, m0 | |
5539 pmaddwd m2, [r3 + 9 * 16] | |
5540 paddd m2, [pd_16] | |
5541 psrld m2, 5 | |
5542 packusdw m4, m2 | |
5543 | |
5544 mova m2, m3 | |
5545 pmaddwd m2, [r3] ; [14] | |
5546 paddd m2, [pd_16] | |
5547 psrld m2, 5 | |
5548 mova m1, m0 | |
5549 pmaddwd m1, [r3] | |
5550 paddd m1, [pd_16] | |
5551 psrld m1, 5 | |
5552 packusdw m2, m1 | |
5553 | |
5554 mova m6, m3 | |
5555 pmaddwd m6, [r3 - 9 * 16] ; [5] | |
5556 paddd m6, [pd_16] | |
5557 psrld m6, 5 | |
5558 mova m1, m0 | |
5559 pmaddwd m1, [r3 - 9 * 16] | |
5560 paddd m1, [pd_16] | |
5561 psrld m1, 5 | |
5562 packusdw m6, m1 | |
5563 | |
; m1 = 16 bytes at [r2 + 32] with word 0 replaced by the word at [r2],
; reordered by pw_ang8_13 (table defined outside this excerpt). The
; palignr pair moves the m0:m3 pair window up by one dword; the new
; lowest pair is the top dword of m1.
5564 palignr m0, m3, 12 | |
5565 movu m1, [r2 + 32] | |
5566 pinsrw m1, [r2], 0 | |
5567 pshufb m1, [pw_ang8_13] | |
5568 palignr m3, m1, 12 | |
5569 | |
5570 mova m5, m3 | |
5571 pmaddwd m5, [r3 + 14 * 16] ; [28] | |
5572 paddd m5, [pd_16] | |
5573 psrld m5, 5 | |
5574 mova m7, m0 | |
5575 pmaddwd m7, [r3 + 14 * 16] | |
5576 paddd m7, [pd_16] | |
5577 psrld m7, 5 | |
5578 packusdw m5, m7 | |
5579 | |
; store rows 0-3
5580 lea r4, [r1 * 3] | |
5581 movu [r0], m4 | |
5582 movu [r0 + r1], m2 | |
5583 movu [r0 + r1 * 2], m6 | |
5584 movu [r0 + r4], m5 | |
5585 | |
; rows 4-6 reuse the window position of row 3
5586 mova m4, m3 | |
5587 pmaddwd m4, [r3 + 5 * 16] ; [19] | |
5588 paddd m4, [pd_16] | |
5589 psrld m4, 5 | |
5590 mova m2, m0 | |
5591 pmaddwd m2, [r3 + 5 * 16] | |
5592 paddd m2, [pd_16] | |
5593 psrld m2, 5 | |
5594 packusdw m4, m2 | |
5595 | |
5596 mova m2, m3 | |
5597 pmaddwd m2, [r3 - 4 * 16] ; [10] | |
5598 paddd m2, [pd_16] | |
5599 psrld m2, 5 | |
5600 mova m5, m0 | |
5601 pmaddwd m5, [r3 - 4 * 16] | |
5602 paddd m5, [pd_16] | |
5603 psrld m5, 5 | |
5604 packusdw m2, m5 | |
5605 | |
5606 mova m6, m3 | |
5607 pmaddwd m6, [r3 - 13 * 16] ; [1] | |
5608 paddd m6, [pd_16] | |
5609 psrld m6, 5 | |
5610 mova m5, m0 | |
5611 pmaddwd m5, [r3 - 13 * 16] | |
5612 paddd m5, [pd_16] | |
5613 psrld m5, 5 | |
5614 packusdw m6, m5 | |
5615 | |
; shift m1 up one word so its next sample pair is in the top dword, then
; slide the pair window once more for row 7
5616 pslldq m1, 2 | |
5617 palignr m0, m3, 12 | |
5618 palignr m3, m1, 12 | |
5619 | |
5620 pmaddwd m3, [r3 + 10 * 16] ; [24] | |
5621 paddd m3, [pd_16] | |
5622 psrld m3, 5 | |
5623 pmaddwd m0, [r3 + 10 * 16] | |
5624 paddd m0, [pd_16] | |
5625 psrld m0, 5 | |
5626 packusdw m3, m0 | |
5627 | |
; store rows 4-7
5628 lea r0, [r0 + r1 * 4] | |
5629 movu [r0], m4 | |
5630 movu [r0 + r1], m2 | |
5631 movu [r0 + r1 * 2], m6 | |
5632 movu [r0 + r4], m3 | |
5633 RET | |
5634 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_24 -- 8x8 angular predictor on 16-bit samples; each
; result vector is stored directly as one destination row.
; cglobal 3,5,7 (x86inc): 3 arguments, 5 GPRs (r0-r4), 7 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read)
;   r3 = ang_table + 16 * 16, so [r3 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 16 + d; f is the bracketed number
;   r4 = 3 * stride
; Each row is ((32 - f) * a + f * b + rounding) >> 5 for the eight
; neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -5) & 31 for k = 0..7.
;-----------------------------------------------------------------------
5635 cglobal intra_pred_ang8_24, 3,5,7 | |
5636 lea r3, [ang_table + 16 * 16] | |
5637 add r1, r1 | |
5638 | |
5639 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
5640 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5641 | |
5642 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
5643 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
5644 | |
; rows 0-5 all use the initial window position
5645 mova m4, m3 | |
5646 pmaddwd m4, [r3 + 11 * 16] ; [27] | |
5647 paddd m4, [pd_16] | |
5648 psrld m4, 5 | |
5649 mova m2, m0 | |
5650 pmaddwd m2, [r3 + 11 * 16] | |
5651 paddd m2, [pd_16] | |
5652 psrld m2, 5 | |
5653 packusdw m4, m2 | |
5654 | |
5655 mova m2, m3 | |
5656 pmaddwd m2, [r3 + 6 * 16] ; [22] | |
5657 paddd m2, [pd_16] | |
5658 psrld m2, 5 | |
5659 mova m1, m0 | |
5660 pmaddwd m1, [r3 + 6 * 16] | |
5661 paddd m1, [pd_16] | |
5662 psrld m1, 5 | |
5663 packusdw m2, m1 | |
5664 | |
5665 mova m6, m3 | |
5666 pmaddwd m6, [r3 + 1 * 16] ; [17] | |
5667 paddd m6, [pd_16] | |
5668 psrld m6, 5 | |
5669 mova m1, m0 | |
5670 pmaddwd m1, [r3 + 1 * 16] | |
5671 paddd m1, [pd_16] | |
5672 psrld m1, 5 | |
5673 packusdw m6, m1 | |
5674 | |
5675 mova m5, m3 | |
5676 pmaddwd m5, [r3 - 4 * 16] ; [12] | |
5677 paddd m5, [pd_16] | |
5678 psrld m5, 5 | |
5679 mova m1, m0 | |
5680 pmaddwd m1, [r3 - 4 * 16] | |
5681 paddd m1, [pd_16] | |
5682 psrld m1, 5 | |
5683 packusdw m5, m1 | |
5684 | |
; store rows 0-3
5685 lea r4, [r1 * 3] | |
5686 movu [r0], m4 | |
5687 movu [r0 + r1], m2 | |
5688 movu [r0 + r1 * 2], m6 | |
5689 movu [r0 + r4], m5 | |
5690 | |
5691 mova m4, m3 | |
5692 pmaddwd m4, [r3 - 9 * 16] ; [7] | |
5693 paddd m4, [pd_16] | |
5694 psrld m4, 5 | |
5695 mova m2, m0 | |
5696 pmaddwd m2, [r3 - 9 * 16] | |
5697 paddd m2, [pd_16] | |
5698 psrld m2, 5 | |
5699 packusdw m4, m2 | |
5700 | |
5701 mova m2, m3 | |
5702 pmaddwd m2, [r3 - 14 * 16] ; [2] | |
5703 paddd m2, [pd_16] | |
5704 psrld m2, 5 | |
5705 mova m1, m0 | |
5706 pmaddwd m1, [r3 - 14 * 16] | |
5707 paddd m1, [pd_16] | |
5708 psrld m1, 5 | |
5709 packusdw m2, m1 | |
5710 | |
; m1 = 16 bytes at [r2 + 32] with word 0 replaced by the word at [r2],
; reordered by pw_ang8_12 (table defined outside this excerpt). The
; palignr pair moves the m0:m3 pair window up by one dword; the new
; lowest pair is the top dword of m1. This is the only slide in this
; function; rows 6 and 7 both use the moved window.
5711 palignr m0, m3, 12 | |
5712 movu m1, [r2 + 32] | |
5713 pinsrw m1, [r2], 0 | |
5714 pshufb m1, [pw_ang8_12] | |
5715 palignr m3, m1, 12 | |
5716 | |
5717 mova m6, m3 | |
5718 pmaddwd m6, [r3 + 13 * 16] ; [29] | |
5719 paddd m6, [pd_16] | |
5720 psrld m6, 5 | |
5721 mova m5, m0 | |
5722 pmaddwd m5, [r3 + 13 * 16] | |
5723 paddd m5, [pd_16] | |
5724 psrld m5, 5 | |
5725 packusdw m6, m5 | |
5726 | |
5727 pmaddwd m3, [r3 + 8 * 16] ; [24] | |
5728 paddd m3, [pd_16] | |
5729 psrld m3, 5 | |
5730 pmaddwd m0, [r3 + 8 * 16] | |
5731 paddd m0, [pd_16] | |
5732 psrld m0, 5 | |
5733 packusdw m3, m0 | |
5734 | |
; store rows 4-7
5735 lea r0, [r0 + r1 * 4] | |
5736 movu [r0], m4 | |
5737 movu [r0 + r1], m2 | |
5738 movu [r0 + r1 * 2], m6 | |
5739 movu [r0 + r4], m3 | |
5740 RET | |
5741 | |
;-----------------------------------------------------------------------
; intra_pred_ang8_25 -- 8x8 angular predictor on 16-bit samples; each
; result vector is stored directly as one destination row.
; cglobal 3,5,7 (x86inc): 3 arguments, 5 GPRs (r0-r4), 7 vector regs.
;   r0 = destination block (only written)
;   r1 = destination stride; doubled on entry (presumably passed in
;        16-bit samples -- confirm against the caller)
;   r2 = reference sample buffer (only read; this function reads only
;        the words at [r2] and [r2 + 2])
;   r3 = ang_table + 23 * 16, so [r3 + d * 16] is the 16-byte weight
;        entry (32 - f, f) x 4 for f = 23 + d; f is the bracketed number
;   r4 = 3 * stride
; Each row is ((32 - f) * a + f * b + rounding) >> 5 for the eight
; neighbouring sample pairs (a, b) held in m3 (low) and m0 (high).
; pd_16 is defined outside this excerpt (presumably four dwords of 16).
; The fractions used, in order, equal ((k + 1) * -2) & 31 for k = 0..7.
; The pair window is never moved: all eight rows use the same samples
; with different weights.
;-----------------------------------------------------------------------
5742 cglobal intra_pred_ang8_25, 3,5,7 | |
5743 lea r3, [ang_table + 23 * 16] | |
5744 add r1, r1 | |
5745 | |
5746 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
5747 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5748 | |
5749 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
5750 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
5751 | |
5752 mova m4, m3 | |
5753 pmaddwd m4, [r3 + 7 * 16] ; [30] | |
5754 paddd m4, [pd_16] | |
5755 psrld m4, 5 | |
5756 mova m2, m0 | |
5757 pmaddwd m2, [r3 + 7 * 16] | |
5758 paddd m2, [pd_16] | |
5759 psrld m2, 5 | |
5760 packusdw m4, m2 | |
5761 | |
; m1 is free from here on (its samples already live in m3/m0) and is
; used as the scratch register for the high half
5762 mova m2, m3 | |
5763 pmaddwd m2, [r3 + 5 * 16] ; [28] | |
5764 paddd m2, [pd_16] | |
5765 psrld m2, 5 | |
5766 mova m1, m0 | |
5767 pmaddwd m1, [r3 + 5 * 16] | |
5768 paddd m1, [pd_16] | |
5769 psrld m1, 5 | |
5770 packusdw m2, m1 | |
5771 | |
5772 mova m6, m3 | |
5773 pmaddwd m6, [r3 + 3 * 16] ; [26] | |
5774 paddd m6, [pd_16] | |
5775 psrld m6, 5 | |
5776 mova m1, m0 | |
5777 pmaddwd m1, [r3 + 3 * 16] | |
5778 paddd m1, [pd_16] | |
5779 psrld m1, 5 | |
5780 packusdw m6, m1 | |
5781 | |
5782 mova m5, m3 | |
5783 pmaddwd m5, [r3 + 1 * 16] ; [24] | |
5784 paddd m5, [pd_16] | |
5785 psrld m5, 5 | |
5786 mova m1, m0 | |
5787 pmaddwd m1, [r3 + 1 * 16] | |
5788 paddd m1, [pd_16] | |
5789 psrld m1, 5 | |
5790 packusdw m5, m1 | |
5791 | |
; store rows 0-3
5792 lea r4, [r1 * 3] | |
5793 movu [r0], m4 | |
5794 movu [r0 + r1], m2 | |
5795 movu [r0 + r1 * 2], m6 | |
5796 movu [r0 + r4], m5 | |
5797 | |
5798 mova m4, m3 | |
5799 pmaddwd m4, [r3 - 1 * 16] ; [22] | |
5800 paddd m4, [pd_16] | |
5801 psrld m4, 5 | |
5802 mova m2, m0 | |
5803 pmaddwd m2, [r3 - 1 * 16] | |
5804 paddd m2, [pd_16] | |
5805 psrld m2, 5 | |
5806 packusdw m4, m2 | |
5807 | |
5808 mova m2, m3 | |
5809 pmaddwd m2, [r3 - 3 * 16] ; [20] | |
5810 paddd m2, [pd_16] | |
5811 psrld m2, 5 | |
5812 mova m1, m0 | |
5813 pmaddwd m1, [r3 - 3 * 16] | |
5814 paddd m1, [pd_16] | |
5815 psrld m1, 5 | |
5816 packusdw m2, m1 | |
5817 | |
5818 mova m6, m3 | |
5819 pmaddwd m6, [r3 - 5 * 16] ; [18] | |
5820 paddd m6, [pd_16] | |
5821 psrld m6, 5 | |
5822 mova m5, m0 | |
5823 pmaddwd m5, [r3 - 5 * 16] | |
5824 paddd m5, [pd_16] | |
5825 psrld m5, 5 | |
5826 packusdw m6, m5 | |
5827 | |
; last row: m3/m0 are consumed in place because they are not needed again
5828 pmaddwd m3, [r3 - 7 * 16] ; [16] | |
5829 paddd m3, [pd_16] | |
5830 psrld m3, 5 | |
5831 pmaddwd m0, [r3 - 7 * 16] | |
5832 paddd m0, [pd_16] | |
5833 psrld m0, 5 | |
5834 packusdw m3, m0 | |
5835 | |
; store rows 4-7
5836 lea r0, [r0 + r1 * 4] | |
5837 movu [r0], m4 | |
5838 movu [r0 + r1], m2 | |
5839 movu [r0 + r1 * 2], m6 | |
5840 movu [r0 + r4], m3 | |
5841 RET | |
5842 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_26: 8x8 prediction that replicates one reference row.
; cglobal 3,6,3 = three arguments loaded (r0-r2), six GPRs, three vector regs.
;   r0 = destination, r1 = row stride (doubled on entry; presumably elements ->
;        bytes, confirm), r2 = reference samples (words)
;   r3 = r0 + 4 * stride (rows 4-7), r5 = 3 * stride
;   r4m = fifth argument, accessed through x86inc's rNm form because only three
;        arguments are auto-loaded; non-zero enables the first-column filter
; Step 1: the eight words at [r2 + 2] are copied unchanged into all eight rows.
; Step 2 (only when r4m != 0): the first word of row y is replaced by
;   clamp(ref[1] + ((side[y] - ref[0]) >> 1), 0, pw_pixel_max)
; where ref[k] is the word at [r2 + 2 * k] and side[y] is the y-th word at
; [r2 + 2 + 32] (presumably the other neighbour edge; confirm the buffer layout
; with the caller). The shift is arithmetic (psraw).
;-----------------------------------------------------------------------------
5843 cglobal intra_pred_ang8_26, 3,6,3 | |
5844 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5845 add r1, r1 | |
5846 lea r5, [r1 * 3] | |
5847 | |
; rows 0-3: plain copy of the reference row
5848 movu [r0], m0 | |
5849 movu [r0 + r1], m0 | |
5850 movu [r0 + r1 * 2], m0 | |
5851 movu [r0 + r5], m0 | |
5852 | |
; rows 4-7: same copy, addressed through r3
5853 lea r3, [r0 + r1 *4] | |
5854 movu [r3], m0 | |
5855 movu [r3 + r1], m0 | |
5856 movu [r3 + r1 * 2], m0 | |
5857 movu [r3 + r5], m0 | |
5858 | |
; skip the filter when the fifth argument is zero
5859 cmp r4m, byte 0 | |
5860 jz .quit | |
5861 | |
5862 ; filter | |
; NOTE(review): pb_01 is defined outside this view; from its use here it is
; presumably a byte shuffle that broadcasts word 0 to every lane. pinsrw only
; writes lane 0 of m1; the remaining lanes are not initialised at that point.
5863 pshufb m0, [pb_01] | |
5864 pinsrw m1, [r2], 0 ; [3 2 1 0] | |
5865 pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0] | |
5866 movu m1, [r2 + 2 + 32] ; [8 7 6 5 4 3 2 1] | |
5867 psubw m1, m2 | |
5868 psraw m1, 1 | |
5869 paddw m0, m1 | |
; clamp to [0, pw_pixel_max]
5870 pxor m1, m1 | |
5871 pmaxsw m0, m1 | |
5872 pminsw m0, [pw_pixel_max] | |
; lane y of m0 overwrites the first word of row y
5873 pextrw [r0], m0, 0 | |
5874 pextrw [r0 + r1], m0, 1 | |
5875 pextrw [r0 + r1 * 2], m0, 2 | |
5876 pextrw [r0 + r5], m0, 3 | |
5877 pextrw [r3], m0, 4 | |
5878 pextrw [r3 + r1], m0, 5 | |
5879 pextrw [r3 + r1 * 2], m0, 6 | |
5880 pextrw [r3 + r5], m0, 7 | |
5881 .quit: | |
5882 RET | |
5883 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_27: 8x8 angular intra prediction on 16-bit samples.
; cglobal 3,5,7 = three arguments loaded (r0-r2), five GPRs, seven vector regs.
;   r0 = destination; one 16-byte row (8 words) is stored per output row
;   r1 = row stride, doubled on entry (presumably element count -> bytes; confirm)
;   r2 = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3 = ang_table + 9 * 16; each table entry is 16 bytes holding the word pair
;        (32 - f, f), so [r3 + k * 16] selects fraction f = 9 + k
;   r4 = 3 * stride
; All eight rows interpolate the same pairs (ref[1 + i], ref[2 + i]), i = 0..7
; (m3 = pairs 0..3, m0 = pairs 4..7):
;   out[i] = ((32 - f) * ref[1 + i] + f * ref[2 + i] + pd_16) >> 5
; with f = 2, 4, 6, 8, 10, 12, 14, 16 from the first row to the last. pd_16 is
; defined outside this view; by its name it is the packed-dword rounding term.
;-----------------------------------------------------------------------------
5884 cglobal intra_pred_ang8_27, 3,5,7 | |
5885 lea r3, [ang_table + 9 * 16] | |
5886 add r1, r1 | |
5887 | |
5888 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5889 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] | |
5890 | |
5891 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] | |
5892 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] | |
5893 | |
; rows 0-3 -> m4, m2, m6, m5
5894 mova m4, m3 | |
5895 pmaddwd m4, [r3 - 7 * 16] ; [2] | |
5896 paddd m4, [pd_16] | |
5897 psrld m4, 5 | |
5898 mova m2, m0 | |
5899 pmaddwd m2, [r3 - 7 * 16] | |
5900 paddd m2, [pd_16] | |
5901 psrld m2, 5 | |
5902 packusdw m4, m2 | |
5903 | |
5904 mova m2, m3 | |
5905 pmaddwd m2, [r3 - 5 * 16] ; [4] | |
5906 paddd m2, [pd_16] | |
5907 psrld m2, 5 | |
5908 mova m1, m0 | |
5909 pmaddwd m1, [r3 - 5 * 16] | |
5910 paddd m1, [pd_16] | |
5911 psrld m1, 5 | |
5912 packusdw m2, m1 | |
5913 | |
5914 mova m6, m3 | |
5915 pmaddwd m6, [r3 - 3 * 16] ; [6] | |
5916 paddd m6, [pd_16] | |
5917 psrld m6, 5 | |
5918 mova m1, m0 | |
5919 pmaddwd m1, [r3 - 3 * 16] | |
5920 paddd m1, [pd_16] | |
5921 psrld m1, 5 | |
5922 packusdw m6, m1 | |
5923 | |
5924 mova m5, m3 | |
5925 pmaddwd m5, [r3 - 1 * 16] ; [8] | |
5926 paddd m5, [pd_16] | |
5927 psrld m5, 5 | |
5928 mova m1, m0 | |
5929 pmaddwd m1, [r3 - 1 * 16] | |
5930 paddd m1, [pd_16] | |
5931 psrld m1, 5 | |
5932 packusdw m5, m1 | |
5933 | |
; store rows 0-3
5934 lea r4, [r1 * 3] | |
5935 movu [r0], m4 | |
5936 movu [r0 + r1], m2 | |
5937 movu [r0 + r1 * 2], m6 | |
5938 movu [r0 + r4], m5 | |
5939 | |
; rows 4-7 -> m4, m2, m6, m3 (the last row multiplies m3/m0 in place)
5940 mova m4, m3 | |
5941 pmaddwd m4, [r3 + 1 * 16] ; [10] | |
5942 paddd m4, [pd_16] | |
5943 psrld m4, 5 | |
5944 mova m2, m0 | |
5945 pmaddwd m2, [r3 + 1 * 16] | |
5946 paddd m2, [pd_16] | |
5947 psrld m2, 5 | |
5948 packusdw m4, m2 | |
5949 | |
5950 mova m2, m3 | |
5951 pmaddwd m2, [r3 + 3 * 16] ; [12] | |
5952 paddd m2, [pd_16] | |
5953 psrld m2, 5 | |
5954 mova m1, m0 | |
5955 pmaddwd m1, [r3 + 3 * 16] | |
5956 paddd m1, [pd_16] | |
5957 psrld m1, 5 | |
5958 packusdw m2, m1 | |
5959 | |
5960 mova m6, m3 | |
5961 pmaddwd m6, [r3 + 5 * 16] ; [14] | |
5962 paddd m6, [pd_16] | |
5963 psrld m6, 5 | |
5964 mova m5, m0 | |
5965 pmaddwd m5, [r3 + 5 * 16] | |
5966 paddd m5, [pd_16] | |
5967 psrld m5, 5 | |
5968 packusdw m6, m5 | |
5969 | |
5970 pmaddwd m3, [r3 + 7 * 16] ; [16] | |
5971 paddd m3, [pd_16] | |
5972 psrld m3, 5 | |
5973 pmaddwd m0, [r3 + 7 * 16] | |
5974 paddd m0, [pd_16] | |
5975 psrld m0, 5 | |
5976 packusdw m3, m0 | |
5977 | |
; advance r0 by four rows and store rows 4-7
5978 lea r0, [r0 + r1 * 4] | |
5979 movu [r0], m4 | |
5980 movu [r0 + r1], m2 | |
5981 movu [r0 + r1 * 2], m6 | |
5982 movu [r0 + r4], m3 | |
5983 RET | |
5984 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_28: 8x8 angular intra prediction on 16-bit samples.
; cglobal 3,5,7 = three arguments loaded (r0-r2), five GPRs, seven vector regs.
;   r0 = destination; one 16-byte row (8 words) is stored per output row
;   r1 = row stride, doubled on entry (presumably element count -> bytes; confirm)
;   r2 = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3 = ang_table + 17 * 16; each table entry is 16 bytes holding the word pair
;        (32 - f, f), so [r3 + k * 16] selects fraction f = 17 + k
;   r4 = 3 * stride
; Row formula, with s the whole-sample shift of the row and f its fraction:
;   out[i] = ((32 - f) * ref[1 + s + i] + f * ref[2 + s + i] + pd_16) >> 5
; (s, f) per row: (0,5) (0,10) (0,15) (0,20) (0,25) (0,30) (1,3) (1,8).
; pd_16 is defined outside this view; by its name it is the rounding term.
;-----------------------------------------------------------------------------
5985 cglobal intra_pred_ang8_28, 3,5,7 | |
5986 lea r3, [ang_table + 17 * 16] | |
5987 add r1, r1 | |
5988 | |
5989 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
5990 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] | |
5991 | |
5992 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] | |
5993 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] | |
5994 | |
; rows 0-3 (shift 0) -> m4, m2, m6, m5
5995 mova m4, m3 | |
5996 pmaddwd m4, [r3 - 12 * 16] ; [5] | |
5997 paddd m4, [pd_16] | |
5998 psrld m4, 5 | |
5999 mova m2, m0 | |
6000 pmaddwd m2, [r3 - 12 * 16] | |
6001 paddd m2, [pd_16] | |
6002 psrld m2, 5 | |
6003 packusdw m4, m2 | |
6004 | |
6005 mova m2, m3 | |
6006 pmaddwd m2, [r3 - 7 * 16] ; [10] | |
6007 paddd m2, [pd_16] | |
6008 psrld m2, 5 | |
6009 mova m1, m0 | |
6010 pmaddwd m1, [r3 - 7 * 16] | |
6011 paddd m1, [pd_16] | |
6012 psrld m1, 5 | |
6013 packusdw m2, m1 | |
6014 | |
6015 mova m6, m3 | |
6016 pmaddwd m6, [r3 - 2 * 16] ; [15] | |
6017 paddd m6, [pd_16] | |
6018 psrld m6, 5 | |
6019 mova m1, m0 | |
6020 pmaddwd m1, [r3 - 2 * 16] | |
6021 paddd m1, [pd_16] | |
6022 psrld m1, 5 | |
6023 packusdw m6, m1 | |
6024 | |
6025 mova m5, m3 | |
6026 pmaddwd m5, [r3 + 3 * 16] ; [20] | |
6027 paddd m5, [pd_16] | |
6028 psrld m5, 5 | |
6029 mova m1, m0 | |
6030 pmaddwd m1, [r3 + 3 * 16] | |
6031 paddd m1, [pd_16] | |
6032 psrld m1, 5 | |
6033 packusdw m5, m1 | |
6034 | |
; store rows 0-3
6035 lea r4, [r1 * 3] | |
6036 movu [r0], m4 | |
6037 movu [r0 + r1], m2 | |
6038 movu [r0 + r1 * 2], m6 | |
6039 movu [r0 + r4], m5 | |
6040 | |
; rows 4-5 (shift 0) -> m4, m2
6041 mova m4, m3 | |
6042 pmaddwd m4, [r3 + 8 * 16] ; [25] | |
6043 paddd m4, [pd_16] | |
6044 psrld m4, 5 | |
6045 mova m2, m0 | |
6046 pmaddwd m2, [r3 + 8 * 16] | |
6047 paddd m2, [pd_16] | |
6048 psrld m2, 5 | |
6049 packusdw m4, m2 | |
6050 | |
6051 mova m2, m3 | |
6052 pmaddwd m2, [r3 + 13 * 16] ; [30] | |
6053 paddd m2, [pd_16] | |
6054 psrld m2, 5 | |
6055 mova m1, m0 | |
6056 pmaddwd m1, [r3 + 13 * 16] | |
6057 paddd m1, [pd_16] | |
6058 psrld m1, 5 | |
6059 packusdw m2, m1 | |
6060 | |
; rows 6-7 (shift 1): the pair vectors are rebuilt one pair further along; the
; 4-byte palignr below takes only the two lowest words (ref[9], ref[10]) of m1
6061 movh m1, [r2 + 18] ; [12 11 10 9] (movh loads the low 64 bits only) | |
6062 | |
6063 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6064 mova m5, m6 | |
6065 pmaddwd m6, [r3 - 14 * 16] ; [3] | |
6066 paddd m6, [pd_16] | |
6067 psrld m6, 5 | |
6068 palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6069 mova m3, m1 | |
6070 pmaddwd m1, [r3 - 14 * 16] | |
6071 paddd m1, [pd_16] | |
6072 psrld m1, 5 | |
6073 packusdw m6, m1 | |
6074 | |
6075 pmaddwd m5, [r3 - 9 * 16] ; [8] | |
6076 paddd m5, [pd_16] | |
6077 psrld m5, 5 | |
6078 pmaddwd m3, [r3 - 9 * 16] | |
6079 paddd m3, [pd_16] | |
6080 psrld m3, 5 | |
6081 packusdw m5, m3 | |
6082 | |
; advance r0 by four rows and store rows 4-7
6083 lea r0, [r0 + r1 * 4] | |
6084 movu [r0], m4 | |
6085 movu [r0 + r1], m2 | |
6086 movu [r0 + r1 * 2], m6 | |
6087 movu [r0 + r4], m5 | |
6088 RET | |
6089 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_29: 8x8 angular intra prediction on 16-bit samples.
; cglobal 3,5,8 = three arguments loaded (r0-r2), five GPRs, eight vector regs.
;   r0 = destination; one 16-byte row (8 words) is stored per output row
;   r1 = row stride, doubled on entry (presumably element count -> bytes; confirm)
;   r2 = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3 = ang_table + 18 * 16; each table entry is 16 bytes holding the word pair
;        (32 - f, f), so [r3 + k * 16] selects fraction f = 18 + k
;   r4 = 3 * stride
; Pair vectors: m3 = pairs starting at ref[1..4], m0 = ref[5..8], m5 = ref[9..12];
; palignr by 4 * s bytes across them yields the pairs for whole-sample shift s.
;   out[i] = ((32 - f) * ref[1 + s + i] + f * ref[2 + s + i] + pd_16) >> 5
; (s, f) per row: (0,9) (0,18) (0,27) (1,4) (1,13) (1,22) (1,31) (2,8).
; pd_16 is defined outside this view; by its name it is the rounding term.
;-----------------------------------------------------------------------------
6090 cglobal intra_pred_ang8_29, 3,5,8 | |
6091 lea r3, [ang_table + 18 * 16] | |
6092 add r1, r1 | |
6093 | |
6094 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
6095 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
6096 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
6097 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
6098 | |
6099 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
6100 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
6101 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
6102 | |
; rows 0-2 (shift 0) -> m4, m2, m6
6103 mova m4, m3 | |
6104 pmaddwd m4, [r3 - 9 * 16] ; [9] | |
6105 paddd m4, [pd_16] | |
6106 psrld m4, 5 | |
6107 mova m2, m0 | |
6108 pmaddwd m2, [r3 - 9 * 16] | |
6109 paddd m2, [pd_16] | |
6110 psrld m2, 5 | |
6111 packusdw m4, m2 | |
6112 | |
6113 mova m2, m3 | |
6114 pmaddwd m2, [r3] ; [18] | |
6115 paddd m2, [pd_16] | |
6116 psrld m2, 5 | |
6117 mova m1, m0 | |
6118 pmaddwd m1, [r3] | |
6119 paddd m1, [pd_16] | |
6120 psrld m1, 5 | |
6121 packusdw m2, m1 | |
6122 | |
6123 mova m6, m3 | |
6124 pmaddwd m6, [r3 + 9 * 16] ; [27] | |
6125 paddd m6, [pd_16] | |
6126 psrld m6, 5 | |
6127 mova m1, m0 | |
6128 pmaddwd m1, [r3 + 9 * 16] | |
6129 paddd m1, [pd_16] | |
6130 psrld m1, 5 | |
6131 packusdw m6, m1 | |
6132 | |
; row 3 (shift 1) -> m7
6133 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6134 pmaddwd m7, [r3 - 14 * 16] ; [4] | |
6135 paddd m7, [pd_16] | |
6136 psrld m7, 5 | |
6137 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6138 pmaddwd m1, [r3 - 14 * 16] | |
6139 paddd m1, [pd_16] | |
6140 psrld m1, 5 | |
6141 packusdw m7, m1 | |
6142 | |
; store rows 0-3
6143 lea r4, [r1 * 3] | |
6144 movu [r0], m4 | |
6145 movu [r0 + r1], m2 | |
6146 movu [r0 + r1 * 2], m6 | |
6147 movu [r0 + r4], m7 | |
6148 | |
; rows 4-6 (shift 1): m6/m7 keep copies of the shifted pairs for reuse
6149 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6150 mova m6, m4 | |
6151 pmaddwd m4, [r3 - 5 * 16] ; [13] | |
6152 paddd m4, [pd_16] | |
6153 psrld m4, 5 | |
6154 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6155 mova m7, m2 | |
6156 pmaddwd m2, [r3 - 5 * 16] | |
6157 paddd m2, [pd_16] | |
6158 psrld m2, 5 | |
6159 packusdw m4, m2 | |
6160 | |
6161 pmaddwd m2, m6, [r3 + 4 * 16] ; [22] | |
6162 paddd m2, [pd_16] | |
6163 psrld m2, 5 | |
6164 pmaddwd m1, m7, [r3 + 4 * 16] | |
6165 paddd m1, [pd_16] | |
6166 psrld m1, 5 | |
6167 packusdw m2, m1 | |
6168 | |
6169 pmaddwd m6, [r3 + 13 * 16] ; [31] | |
6170 paddd m6, [pd_16] | |
6171 psrld m6, 5 | |
6172 pmaddwd m7, [r3 + 13 * 16] | |
6173 paddd m7, [pd_16] | |
6174 psrld m7, 5 | |
6175 packusdw m6, m7 | |
6176 | |
; row 7 (shift 2) -> m7; m5 is consumed in place
6177 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6178 pmaddwd m7, [r3 - 10 * 16] ; [8] | |
6179 paddd m7, [pd_16] | |
6180 psrld m7, 5 | |
6181 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6182 pmaddwd m5, [r3 - 10 * 16] | |
6183 paddd m5, [pd_16] | |
6184 psrld m5, 5 | |
6185 packusdw m7, m5 | |
6186 | |
; advance r0 by four rows and store rows 4-7
6187 lea r0, [r0 + r1 * 4] | |
6188 movu [r0], m4 | |
6189 movu [r0 + r1], m2 | |
6190 movu [r0 + r1 * 2], m6 | |
6191 movu [r0 + r4], m7 | |
6192 RET | |
6193 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_30: 8x8 angular intra prediction on 16-bit samples.
; cglobal 3,5,8 = three arguments loaded (r0-r2), five GPRs, eight vector regs.
;   r0 = destination; one 16-byte row (8 words) is stored per output row
;   r1 = row stride, doubled on entry (presumably element count -> bytes; confirm)
;   r2 = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3 = ang_table + 14 * 16; each table entry is 16 bytes holding the word pair
;        (32 - f, f), so [r3 + k * 16] selects fraction f = 14 + k
;   r4 = 3 * stride
; Pair vectors: m3 = pairs starting at ref[1..4], m0 = ref[5..8], m5 = ref[9..12];
; palignr by 4 * s bytes across them yields the pairs for whole-sample shift s.
;   out[i] = ((32 - f) * ref[1 + s + i] + f * ref[2 + s + i] + pd_16) >> 5
; (s, f) per row: (0,13) (0,26) (1,7) (1,20) (2,1) (2,14) (2,27) (3,8).
; pd_16 is defined outside this view; by its name it is the rounding term.
;-----------------------------------------------------------------------------
6194 cglobal intra_pred_ang8_30, 3,5,8 | |
6195 lea r3, [ang_table + 14 * 16] | |
6196 add r1, r1 | |
6197 | |
6198 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
6199 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
6200 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
6201 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
6202 | |
6203 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
6204 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
6205 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
6206 | |
; rows 0-1 (shift 0) -> m4, m2
6207 mova m4, m3 | |
6208 pmaddwd m4, [r3 - 1 * 16] ; [13] | |
6209 paddd m4, [pd_16] | |
6210 psrld m4, 5 | |
6211 mova m2, m0 | |
6212 pmaddwd m2, [r3 - 1 * 16] | |
6213 paddd m2, [pd_16] | |
6214 psrld m2, 5 | |
6215 packusdw m4, m2 | |
6216 | |
6217 mova m2, m3 | |
6218 pmaddwd m2, [r3 + 12 * 16] ; [26] | |
6219 paddd m2, [pd_16] | |
6220 psrld m2, 5 | |
6221 mova m1, m0 | |
6222 pmaddwd m1, [r3 + 12 * 16] | |
6223 paddd m1, [pd_16] | |
6224 psrld m1, 5 | |
6225 packusdw m2, m1 | |
6226 | |
; rows 2-3 (shift 1) -> m6, m7; the high pair vector is rebuilt for each row
6227 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6228 mova m7, m6 | |
6229 pmaddwd m6, [r3 - 7 * 16] ; [7] | |
6230 paddd m6, [pd_16] | |
6231 psrld m6, 5 | |
6232 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6233 pmaddwd m1, [r3 - 7 * 16] | |
6234 paddd m1, [pd_16] | |
6235 psrld m1, 5 | |
6236 packusdw m6, m1 | |
6237 | |
6238 pmaddwd m7, [r3 + 6 * 16] ; [20] | |
6239 paddd m7, [pd_16] | |
6240 psrld m7, 5 | |
6241 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6242 pmaddwd m1, [r3 + 6 * 16] | |
6243 paddd m1, [pd_16] | |
6244 psrld m1, 5 | |
6245 packusdw m7, m1 | |
6246 | |
; store rows 0-3
6247 lea r4, [r1 * 3] | |
6248 movu [r0], m4 | |
6249 movu [r0 + r1], m2 | |
6250 movu [r0 + r1 * 2], m6 | |
6251 movu [r0 + r4], m7 | |
6252 | |
; rows 4-6 (shift 2): m6/m7 keep copies of the shifted pairs for reuse
6253 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6254 mova m6, m4 | |
6255 pmaddwd m4, [r3 - 13 * 16] ; [1] | |
6256 paddd m4, [pd_16] | |
6257 psrld m4, 5 | |
6258 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6259 mova m7, m2 | |
6260 pmaddwd m2, [r3 - 13 * 16] | |
6261 paddd m2, [pd_16] | |
6262 psrld m2, 5 | |
6263 packusdw m4, m2 | |
6264 | |
6265 pmaddwd m2, m6, [r3] ; [14] | |
6266 paddd m2, [pd_16] | |
6267 psrld m2, 5 | |
6268 pmaddwd m1, m7, [r3] | |
6269 paddd m1, [pd_16] | |
6270 psrld m1, 5 | |
6271 packusdw m2, m1 | |
6272 | |
6273 pmaddwd m6, [r3 + 13 * 16] ; [27] | |
6274 paddd m6, [pd_16] | |
6275 psrld m6, 5 | |
6276 pmaddwd m7, [r3 + 13 * 16] | |
6277 paddd m7, [pd_16] | |
6278 psrld m7, 5 | |
6279 packusdw m6, m7 | |
6280 | |
; row 7 (shift 3) -> m7; m5 is consumed in place
6281 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
6282 pmaddwd m7, [r3 - 6 * 16] ; [8] | |
6283 paddd m7, [pd_16] | |
6284 psrld m7, 5 | |
6285 palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6286 pmaddwd m5, [r3 - 6 * 16] | |
6287 paddd m5, [pd_16] | |
6288 psrld m5, 5 | |
6289 packusdw m7, m5 | |
6290 | |
; advance r0 by four rows and store rows 4-7
6291 lea r0, [r0 + r1 * 4] | |
6292 movu [r0], m4 | |
6293 movu [r0 + r1], m2 | |
6294 movu [r0 + r1 * 2], m6 | |
6295 movu [r0 + r4], m7 | |
6296 RET | |
6297 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_31: 8x8 angular intra prediction on 16-bit samples.
; cglobal 3,5,8 = three arguments loaded (r0-r2), five GPRs, eight vector regs.
;   r0 = destination; one 16-byte row (8 words) is stored per output row
;   r1 = row stride, doubled on entry (presumably element count -> bytes; confirm)
;   r2 = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3 = ang_table + 13 * 16; each table entry is 16 bytes holding the word pair
;        (32 - f, f), so [r3 + k * 16] selects fraction f = 13 + k
;   r4 = 3 * stride
; Pair vectors: m3 = pairs starting at ref[1..4], m0 = ref[5..8], m5 = ref[9..12];
; palignr by 4 * s bytes across them yields the pairs for whole-sample shift s
; (s = 4 uses m0/m5 directly).
;   out[i] = ((32 - f) * ref[1 + s + i] + f * ref[2 + s + i] + pd_16) >> 5
; (s, f) per row: (0,17) (1,2) (1,19) (2,4) (2,21) (3,6) (3,23) (4,8).
; pd_16 is defined outside this view; by its name it is the rounding term.
;-----------------------------------------------------------------------------
6298 cglobal intra_pred_ang8_31, 3,5,8 | |
6299 lea r3, [ang_table + 13 * 16] | |
6300 add r1, r1 | |
6301 | |
6302 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
6303 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
6304 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
6305 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
6306 | |
6307 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
6308 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
6309 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
6310 | |
; row 0 (shift 0) -> m4
6311 mova m4, m3 | |
6312 pmaddwd m4, [r3 + 4 * 16] ; [17] | |
6313 paddd m4, [pd_16] | |
6314 psrld m4, 5 | |
6315 mova m2, m0 | |
6316 pmaddwd m2, [r3 + 4 * 16] | |
6317 paddd m2, [pd_16] | |
6318 psrld m2, 5 | |
6319 packusdw m4, m2 | |
6320 | |
; rows 1-2 (shift 1) -> m2, m6; m6/m7 hold copies of the shifted pairs
6321 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6322 mova m6, m2 | |
6323 pmaddwd m2, [r3 - 11 * 16] ; [2] | |
6324 paddd m2, [pd_16] | |
6325 psrld m2, 5 | |
6326 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6327 mova m7, m1 | |
6328 pmaddwd m1, [r3 - 11 * 16] | |
6329 paddd m1, [pd_16] | |
6330 psrld m1, 5 | |
6331 packusdw m2, m1 | |
6332 | |
6333 pmaddwd m6, [r3 + 6 * 16] ; [19] | |
6334 paddd m6, [pd_16] | |
6335 psrld m6, 5 | |
6336 pmaddwd m7, [r3 + 6 * 16] | |
6337 paddd m7, [pd_16] | |
6338 psrld m7, 5 | |
6339 packusdw m6, m7 | |
6340 | |
; row 3 (shift 2) -> m7
6341 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6342 pmaddwd m7, [r3 - 9 * 16] ; [4] | |
6343 paddd m7, [pd_16] | |
6344 psrld m7, 5 | |
6345 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6346 pmaddwd m1, [r3 - 9 * 16] | |
6347 paddd m1, [pd_16] | |
6348 psrld m1, 5 | |
6349 packusdw m7, m1 | |
6350 | |
; store rows 0-3
6351 lea r4, [r1 * 3] | |
6352 movu [r0], m4 | |
6353 movu [r0 + r1], m2 | |
6354 movu [r0 + r1 * 2], m6 | |
6355 movu [r0 + r4], m7 | |
6356 | |
; row 4 (shift 2) -> m4
6357 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6358 pmaddwd m4, [r3 + 8 * 16] ; [21] | |
6359 paddd m4, [pd_16] | |
6360 psrld m4, 5 | |
6361 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6362 pmaddwd m2, [r3 + 8 * 16] | |
6363 paddd m2, [pd_16] | |
6364 psrld m2, 5 | |
6365 packusdw m4, m2 | |
6366 | |
; rows 5-6 (shift 3) -> m2, m6
6367 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
6368 mova m6, m2 | |
6369 pmaddwd m2, [r3 - 7 * 16] ; [6] | |
6370 paddd m2, [pd_16] | |
6371 psrld m2, 5 | |
6372 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6373 mova m7, m1 | |
6374 pmaddwd m1, [r3 - 7 * 16] | |
6375 paddd m1, [pd_16] | |
6376 psrld m1, 5 | |
6377 packusdw m2, m1 | |
6378 | |
6379 pmaddwd m6, [r3 + 10 * 16] ; [23] | |
6380 paddd m6, [pd_16] | |
6381 psrld m6, 5 | |
6382 pmaddwd m7, [r3 + 10 * 16] | |
6383 paddd m7, [pd_16] | |
6384 psrld m7, 5 | |
6385 packusdw m6, m7 | |
6386 | |
; row 7 (shift 4) -> m7, straight from the m0/m5 pair vectors
6387 mova m7, m0 | |
6388 pmaddwd m7, [r3 - 5 * 16] ; [8] | |
6389 paddd m7, [pd_16] | |
6390 psrld m7, 5 | |
6391 mova m1, m5 | |
6392 pmaddwd m1, [r3 - 5 * 16] | |
6393 paddd m1, [pd_16] | |
6394 psrld m1, 5 | |
6395 packusdw m7, m1 | |
6396 | |
; advance r0 by four rows and store rows 4-7
6397 lea r0, [r0 + r1 * 4] | |
6398 movu [r0], m4 | |
6399 movu [r0 + r1], m2 | |
6400 movu [r0 + r1 * 2], m6 | |
6401 movu [r0 + r4], m7 | |
6402 RET | |
6403 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_32: 8x8 angular intra prediction on 16-bit samples.
; cglobal 3,5,8 = three arguments loaded (r0-r2), five GPRs, eight vector regs.
;   r0 = destination; one 16-byte row (8 words) is stored per output row
;   r1 = row stride, doubled on entry (presumably element count -> bytes; confirm)
;   r2 = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3 = ang_table + 19 * 16; each table entry is 16 bytes holding the word pair
;        (32 - f, f), so [r3 + k * 16] selects fraction f = 19 + k
;   r4 = 3 * stride
; Pair vectors: m3 = pairs starting at ref[1..4], m0 = ref[5..8], m5 = ref[9..12];
; palignr across them yields the pairs for whole-sample shift s. The last row
; needs ref[13] and ref[14] as well, fetched by the movh load at [r2 + 26].
;   out[i] = ((32 - f) * ref[1 + s + i] + f * ref[2 + s + i] + pd_16) >> 5
; (s, f) per row: (0,21) (1,10) (1,31) (2,20) (3,9) (3,30) (4,19) (5,8).
; pd_16 is defined outside this view; by its name it is the rounding term.
;-----------------------------------------------------------------------------
6404 cglobal intra_pred_ang8_32, 3,5,8 | |
6405 lea r3, [ang_table + 19 * 16] | |
6406 add r1, r1 | |
6407 | |
6408 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
6409 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
6410 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
6411 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
6412 | |
6413 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
6414 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
6415 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
6416 | |
; row 0 (shift 0) -> m4
6417 mova m4, m3 | |
6418 pmaddwd m4, [r3 + 2 * 16] ; [21] | |
6419 paddd m4, [pd_16] | |
6420 psrld m4, 5 | |
6421 mova m2, m0 | |
6422 pmaddwd m2, [r3 + 2 * 16] | |
6423 paddd m2, [pd_16] | |
6424 psrld m2, 5 | |
6425 packusdw m4, m2 | |
6426 | |
; rows 1-2 (shift 1) -> m2, m6; m6/m7 hold copies of the shifted pairs
6427 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6428 mova m6, m2 | |
6429 pmaddwd m2, [r3 - 9 * 16] ; [10] | |
6430 paddd m2, [pd_16] | |
6431 psrld m2, 5 | |
6432 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6433 mova m7, m1 | |
6434 pmaddwd m1, [r3 - 9 * 16] | |
6435 paddd m1, [pd_16] | |
6436 psrld m1, 5 | |
6437 packusdw m2, m1 | |
6438 | |
6439 pmaddwd m6, [r3 + 12 * 16] ; [31] | |
6440 paddd m6, [pd_16] | |
6441 psrld m6, 5 | |
6442 pmaddwd m7, [r3 + 12 * 16] | |
6443 paddd m7, [pd_16] | |
6444 psrld m7, 5 | |
6445 packusdw m6, m7 | |
6446 | |
; row 3 (shift 2) -> m7
6447 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6448 pmaddwd m7, [r3 + 1 * 16] ; [20] | |
6449 paddd m7, [pd_16] | |
6450 psrld m7, 5 | |
6451 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6452 pmaddwd m1, [r3 + 1 * 16] | |
6453 paddd m1, [pd_16] | |
6454 psrld m1, 5 | |
6455 packusdw m7, m1 | |
6456 | |
; store rows 0-3
6457 lea r4, [r1 * 3] | |
6458 movu [r0], m4 | |
6459 movu [r0 + r1], m2 | |
6460 movu [r0 + r1 * 2], m6 | |
6461 movu [r0 + r4], m7 | |
6462 | |
; rows 4-5 (shift 3) -> m4, m2; m3 is overwritten here (its last use was above)
6463 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
6464 mova m2, m4 | |
6465 pmaddwd m4, [r3 - 10 * 16] ; [ 9] | |
6466 paddd m4, [pd_16] | |
6467 psrld m4, 5 | |
6468 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6469 mova m6, m3 | |
6470 pmaddwd m3, [r3 - 10 * 16] | |
6471 paddd m3, [pd_16] | |
6472 psrld m3, 5 | |
6473 packusdw m4, m3 | |
6474 | |
6475 pmaddwd m2, [r3 + 11 * 16] ; [30] | |
6476 paddd m2, [pd_16] | |
6477 psrld m2, 5 | |
6478 pmaddwd m6, [r3 + 11 * 16] | |
6479 paddd m6, [pd_16] | |
6480 psrld m6, 5 | |
6481 packusdw m2, m6 | |
6482 | |
; row 6 (shift 4) -> m6, straight from the m0/m5 pair vectors
6483 mova m6, m0 | |
6484 pmaddwd m6, [r3] ; [19] | |
6485 paddd m6, [pd_16] | |
6486 psrld m6, 5 | |
6487 mova m7, m5 | |
6488 pmaddwd m7, [r3] | |
6489 paddd m7, [pd_16] | |
6490 psrld m7, 5 | |
6491 packusdw m6, m7 | |
6492 | |
; row 7 (shift 5) -> m7; movh loads four words, of which the 4-byte palignr
; below takes the two lowest (ref[13], ref[14])
6493 movh m1, [r2 + 26] ; [16 15 14 13] | |
6494 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6495 pmaddwd m7, [r3 - 11 * 16] ; [8] | |
6496 paddd m7, [pd_16] | |
6497 psrld m7, 5 | |
6498 palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10] | |
6499 pmaddwd m1, [r3 - 11 * 16] | |
6500 paddd m1, [pd_16] | |
6501 psrld m1, 5 | |
6502 packusdw m7, m1 | |
6503 | |
; advance r0 by four rows and store rows 4-7
6504 lea r0, [r0 + r1 * 4] | |
6505 movu [r0], m4 | |
6506 movu [r0 + r1], m2 | |
6507 movu [r0 + r1 * 2], m6 | |
6508 movu [r0 + r4], m7 | |
6509 RET | |
6510 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_33: 8x8 angular intra prediction on 16-bit samples.
; cglobal 3,5,8 = three arguments loaded (r0-r2), five GPRs, eight vector regs.
;   r0 = destination; one 16-byte row (8 words) is stored per output row
;   r1 = row stride, doubled on entry (presumably element count -> bytes; confirm)
;   r2 = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3 = ang_table + 14 * 16; each table entry is 16 bytes holding the word pair
;        (32 - f, f), so [r3 + k * 16] selects fraction f = 14 + k
;   r4 = 3 * stride
; Pair vectors: m3 = pairs starting at ref[1..4], m0 = ref[5..8], m5 = ref[9..12],
; m1 = ref[13..15] (top lane undefined, never selected); palignr across them
; yields the pairs for whole-sample shift s.
;   out[i] = ((32 - f) * ref[1 + s + i] + f * ref[2 + s + i] + pd_16) >> 5
; (s, f) per row: (0,26) (1,20) (2,14) (3,8) (4,2) (4,28) (5,22) (6,16).
; pd_16 is defined outside this view; by its name it is the rounding term.
;-----------------------------------------------------------------------------
6511 cglobal intra_pred_ang8_33, 3,5,8 | |
6512 lea r3, [ang_table + 14 * 16] | |
6513 add r1, r1 | |
6514 | |
6515 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
6516 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
6517 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
6518 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
6519 | |
6520 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
6521 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
6522 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
6523 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13] | |
6524 | |
; row 0 (shift 0) -> m4
6525 mova m4, m3 | |
6526 pmaddwd m4, [r3 + 12 * 16] ; [26] | |
6527 paddd m4, [pd_16] | |
6528 psrld m4, 5 | |
6529 mova m2, m0 | |
6530 pmaddwd m2, [r3 + 12 * 16] | |
6531 paddd m2, [pd_16] | |
6532 psrld m2, 5 | |
6533 packusdw m4, m2 | |
6534 | |
; row 1 (shift 1) -> m2
6535 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6536 pmaddwd m2, [r3 + 6 * 16] ; [20] | |
6537 paddd m2, [pd_16] | |
6538 psrld m2, 5 | |
6539 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6540 pmaddwd m6, [r3 + 6 * 16] | |
6541 paddd m6, [pd_16] | |
6542 psrld m6, 5 | |
6543 packusdw m2, m6 | |
6544 | |
; row 2 (shift 2) -> m6
6545 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6546 pmaddwd m6, [r3] ; [14] | |
6547 paddd m6, [pd_16] | |
6548 psrld m6, 5 | |
6549 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6550 pmaddwd m7, [r3] | |
6551 paddd m7, [pd_16] | |
6552 psrld m7, 5 | |
6553 packusdw m6, m7 | |
6554 | |
; row 3 (shift 3) -> m7; m3 is overwritten here (its last use is this row)
6555 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
6556 pmaddwd m7, [r3 - 6 * 16] ; [ 8] | |
6557 paddd m7, [pd_16] | |
6558 psrld m7, 5 | |
6559 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6560 pmaddwd m3, [r3 - 6 * 16] | |
6561 paddd m3, [pd_16] | |
6562 psrld m3, 5 | |
6563 packusdw m7, m3 | |
6564 | |
; store rows 0-3
6565 lea r4, [r1 * 3] | |
6566 movu [r0], m4 | |
6567 movu [r0 + r1], m2 | |
6568 movu [r0 + r1 * 2], m6 | |
6569 movu [r0 + r4], m7 | |
6570 | |
; rows 4-5 (shift 4) -> m4, m2, straight from the m0/m5 pair vectors
6571 mova m4, m0 | |
6572 pmaddwd m4, [r3 - 12 * 16] ; [ 2] | |
6573 paddd m4, [pd_16] | |
6574 psrld m4, 5 | |
6575 mova m2, m5 | |
6576 pmaddwd m2, [r3 - 12 * 16] | |
6577 paddd m2, [pd_16] | |
6578 psrld m2, 5 | |
6579 packusdw m4, m2 | |
6580 | |
6581 mova m2, m0 | |
6582 pmaddwd m2, [r3 + 14 * 16] ; [28] | |
6583 paddd m2, [pd_16] | |
6584 psrld m2, 5 | |
6585 mova m6, m5 | |
6586 pmaddwd m6, [r3 + 14 * 16] | |
6587 paddd m6, [pd_16] | |
6588 psrld m6, 5 | |
6589 packusdw m2, m6 | |
6590 | |
; row 6 (shift 5) -> m6
6591 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6592 pmaddwd m6, [r3 + 8 * 16] ; [22] | |
6593 paddd m6, [pd_16] | |
6594 psrld m6, 5 | |
6595 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10] | |
6596 pmaddwd m7, [r3 + 8 * 16] | |
6597 paddd m7, [pd_16] | |
6598 psrld m7, 5 | |
6599 packusdw m6, m7 | |
6600 | |
; row 7 (shift 6) -> m7; m1 is consumed in place
6601 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6602 pmaddwd m7, [r3 + 2 * 16] ; [16] | |
6603 paddd m7, [pd_16] | |
6604 psrld m7, 5 | |
6605 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11] | |
6606 pmaddwd m1, [r3 + 2 * 16] | |
6607 paddd m1, [pd_16] | |
6608 psrld m1, 5 | |
6609 packusdw m7, m1 | |
6610 | |
; advance r0 by four rows and store rows 4-7
6611 lea r0, [r0 + r1 * 4] | |
6612 movu [r0], m4 | |
6613 movu [r0 + r1], m2 | |
6614 movu [r0 + r1 * 2], m6 | |
6615 movu [r0 + r4], m7 | |
6616 RET | |
6617 | |
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE %1, %2, %3, %4, %5, %6
; Stores four 8-word vectors in one of two layouts, chosen by the zero flag
; that the invoking code must have set beforehand (the macro itself starts with
; a jnz and never sets flags).
;   %1-%4 = the four input vectors (a, b, c, d); overwritten in the ZF=1 path
;   %5    = scratch vector register (written in the ZF=1 path only)
;   %6    = numeric constant used both as a byte offset into each destination
;           row and as the suffix of the local labels .skip%6 / .end%6, so it
;           must be unique per invocation within one enclosing global label
; ZF = 1 (fall through): 4x8 -> 8x4 word transpose. Row k (k = 0..7) receives
;   the four words a[k], b[k], c[k], d[k] at [r0 + k * r1 + %6]. r5 is
;   overwritten with r0 + 4 * r1.
; ZF = 0 (.skip%6): the vectors are stored untransposed as four full 16-byte
;   rows at [r5], [r5 + r1], [r5 + 2 * r1], [r5 + r4]; %6 is not applied.
; Register assumptions: r0 = destination base, r1 = row stride in bytes,
; r5 = destination of the current four-row group (ZF=0 path). r4 is used as the
; fourth-row offset and is not set in the code visible here; presumably
; 3 * r1, supplied by the caller (confirm).
;-----------------------------------------------------------------------------
6618 %macro TRANSPOSE_STORE 6 | |
6619 jnz .skip%6 | |
; interleave words: a/b and c/d, low halves in %1/%3, high halves in %5/%2
6620 punpckhwd %5, %1, %2 | |
6621 punpcklwd %1, %2 | |
6622 punpckhwd %2, %3, %4 | |
6623 punpcklwd %3, %4 | |
6624 | |
; interleave dwords: %4 = rows 0-1, %1 = rows 2-3, %3 = rows 4-5, %5 = rows 6-7
6625 punpckldq %4, %1, %3 | |
6626 punpckhdq %1, %3 | |
6627 punpckldq %3, %5, %2 | |
6628 punpckhdq %5, %2 | |
6629 | |
; each register holds two 4-word rows: low half via movh, high half via movhps
6630 movh [r0 + %6], %4 | |
6631 movhps [r0 + r1 + %6], %4 | |
6632 movh [r0 + r1 * 2 + %6], %1 | |
6633 movhps [r0 + r4 + %6], %1 | |
6634 lea r5, [r0 + r1 * 4] | |
6635 movh [r5 + %6], %3 | |
6636 movhps [r5 + r1 + %6], %3 | |
6637 movh [r5 + r1 * 2 + %6], %5 | |
6638 movhps [r5 + r4 + %6], %5 | |
6639 jmp .end%6 | |
6640 | |
; untransposed layout: one full vector per row, starting at r5
6641 .skip%6: | |
6642 movu [r5], %1 | |
6643 movu [r5 + r1], %2 | |
6644 movu [r5 + r1 * 2], %3 | |
6645 movu [r5 + r4], %4 | |
6646 .end%6: | |
6647 %endmacro | |
6648 | |
6649 INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; ang16_mode_3_33: helper that produces sixteen 8-word prediction vectors and
; hands them, four at a time, to TRANSPOSE_STORE.
; Declared with cglobal but without argument/register counts and left with a
; plain `ret`; nothing is loaded here, so every register below must already be
; set by the caller (callers are outside this view):
;   r0  = destination base
;   r1  = row stride in bytes (used unscaled as a byte offset)
;   r2  = reference samples; ref[k] below is the word at [r2 + 2 * k]
;   r3  = pointer into ang_table; the inline [f] comments imply it addresses
;         the fraction-16 entry, i.e. [r3 + k * 16] selects f = 16 + k (confirm)
;   r4  = fourth-row offset used inside TRANSPOSE_STORE (presumably 3 * r1)
;   r6d = layout selector: tested once on entry; zero selects the transposed
;         store, non-zero the row-wise store. The flags from that single test
;         are consumed by the jnz in all four TRANSPOSE_STORE expansions; the
;         instructions in between (SIMD ops, mov, lea, jmp) do not write EFLAGS.
;   r5  = scratch: destination of the current four-row group (row-wise layout)
; Vector v (v = 0..15) uses whole-sample shift s and fraction f:
;   out[i] = ((32 - f) * ref[1 + s + i] + f * ref[2 + s + i] + pd_16) >> 5
; (s, f): (0,26) (1,20) (2,14) (3,8) | (4,2) (4,28) (5,22) (6,16) |
;         (7,10) (8,4) (8,30) (9,24) | (10,18) (11,12) (12,6) (13,0)
; The last vector has f = 0 and is loaded directly from ref[14..21].
; pd_16 is defined outside this view; by its name it is the rounding term.
;-----------------------------------------------------------------------------
6650 cglobal ang16_mode_3_33 | |
6651 test r6d, r6d | |
6652 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
6653 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
6654 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
6655 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
6656 | |
6657 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
6658 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
6659 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
6660 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13] | |
6661 | |
; vectors 0-3 -> m4, m2, m6, m7
6662 mova m4, m3 | |
6663 pmaddwd m4, [r3 + 10 * 16] ; [26] | |
6664 paddd m4, [pd_16] | |
6665 psrld m4, 5 | |
6666 mova m2, m0 | |
6667 pmaddwd m2, [r3 + 10 * 16] | |
6668 paddd m2, [pd_16] | |
6669 psrld m2, 5 | |
6670 packusdw m4, m2 | |
6671 | |
6672 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6673 pmaddwd m2, [r3 + 4 * 16] ; [20] | |
6674 paddd m2, [pd_16] | |
6675 psrld m2, 5 | |
6676 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6677 pmaddwd m6, [r3 + 4 * 16] | |
6678 paddd m6, [pd_16] | |
6679 psrld m6, 5 | |
6680 packusdw m2, m6 | |
6681 | |
6682 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6683 pmaddwd m6, [r3 - 2 * 16] ; [14] | |
6684 paddd m6, [pd_16] | |
6685 psrld m6, 5 | |
6686 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6687 pmaddwd m7, [r3 - 2 * 16] | |
6688 paddd m7, [pd_16] | |
6689 psrld m7, 5 | |
6690 packusdw m6, m7 | |
6691 | |
6692 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
6693 pmaddwd m7, [r3 - 8 * 16] ; [ 8] | |
6694 paddd m7, [pd_16] | |
6695 psrld m7, 5 | |
6696 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6697 pmaddwd m3, [r3 - 8 * 16] | |
6698 paddd m3, [pd_16] | |
6699 psrld m3, 5 | |
6700 packusdw m7, m3 | |
6701 | |
; row-wise layout: first group goes to r0 itself
6702 mov r5, r0 | |
6703 | |
6704 TRANSPOSE_STORE m4, m2, m6, m7, m3, 0 | |
6705 | |
; vectors 4-7 -> m4, m2, m6, m7
6706 mova m4, m0 | |
6707 pmaddwd m4, [r3 - 14 * 16] ; [ 2] | |
6708 paddd m4, [pd_16] | |
6709 psrld m4, 5 | |
6710 mova m2, m5 | |
6711 pmaddwd m2, [r3 - 14 * 16] | |
6712 paddd m2, [pd_16] | |
6713 psrld m2, 5 | |
6714 packusdw m4, m2 | |
6715 | |
6716 mova m2, m0 | |
6717 pmaddwd m2, [r3 + 12 * 16] ; [28] | |
6718 paddd m2, [pd_16] | |
6719 psrld m2, 5 | |
6720 mova m6, m5 | |
6721 pmaddwd m6, [r3 + 12 * 16] | |
6722 paddd m6, [pd_16] | |
6723 psrld m6, 5 | |
6724 packusdw m2, m6 | |
6725 | |
6726 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6727 pmaddwd m6, [r3 + 6 * 16] ; [22] | |
6728 paddd m6, [pd_16] | |
6729 psrld m6, 5 | |
6730 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10] | |
6731 pmaddwd m7, [r3 + 6 * 16] | |
6732 paddd m7, [pd_16] | |
6733 psrld m7, 5 | |
6734 packusdw m6, m7 | |
6735 | |
6736 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6737 pmaddwd m7, [r3] ; [16] | |
6738 paddd m7, [pd_16] | |
6739 psrld m7, 5 | |
6740 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11] | |
6741 pmaddwd m1, [r3] | |
6742 paddd m1, [pd_16] | |
6743 psrld m1, 5 | |
6744 packusdw m7, m1 | |
6745 | |
; row-wise layout: second group starts four rows below r0
6746 lea r5, [r0 + r1 * 4] | |
6747 | |
6748 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 | |
6749 | |
; rebuild the upper pair vectors from ref[13..20]: m3 = pairs starting at
; ref[13..16], m1 = pairs starting at ref[17..19] (top lane undefined for now)
6750 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] | |
6751 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] | |
6752 | |
6753 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] | |
6754 punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17] | |
6755 | |
; vectors 8-11 -> m4, m2, m6, m7
6756 palignr m4, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6757 pmaddwd m4, [r3 - 6 * 16] ; [10] | |
6758 paddd m4, [pd_16] | |
6759 psrld m4, 5 | |
6760 palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12] | |
6761 pmaddwd m2, [r3 - 6 * 16] | |
6762 paddd m2, [pd_16] | |
6763 psrld m2, 5 | |
6764 packusdw m4, m2 | |
6765 | |
6766 mova m2, m5 | |
6767 pmaddwd m2, [r3 - 12 * 16] ; [4] | |
6768 paddd m2, [pd_16] | |
6769 psrld m2, 5 | |
6770 mova m6, m3 | |
6771 pmaddwd m6, [r3 - 12 * 16] | |
6772 paddd m6, [pd_16] | |
6773 psrld m6, 5 | |
6774 packusdw m2, m6 | |
6775 | |
6776 mova m6, m5 | |
6777 pmaddwd m6, [r3 + 14 * 16] ; [30] | |
6778 paddd m6, [pd_16] | |
6779 psrld m6, 5 | |
6780 mova m7, m3 | |
6781 pmaddwd m7, [r3 + 14 * 16] | |
6782 paddd m7, [pd_16] | |
6783 psrld m7, 5 | |
6784 packusdw m6, m7 | |
6785 | |
6786 palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10] | |
6787 pmaddwd m7, [r3 + 8 * 16] ; [24] | |
6788 paddd m7, [pd_16] | |
6789 psrld m7, 5 | |
6790 palignr m0, m1, m3, 4 ; [18 17 17 16 16 15 15 14] | |
6791 pmaddwd m0, [r3 + 8 * 16] | |
6792 paddd m0, [pd_16] | |
6793 psrld m0, 5 | |
6794 packusdw m7, m0 | |
6795 | |
; row-wise layout: next group, four rows further down. In the transposed
; layout r5 was overwritten by the macro and is recomputed there before use.
6796 lea r5, [r5 + r1 * 4] | |
6797 | |
6798 TRANSPOSE_STORE m4, m2, m6, m7, m0, 16 | |
6799 | |
; vectors 12-15 -> m4, m2, m3, m7
6800 palignr m4, m3, m5, 8 ; [15 14 14 13 13 12 12 11] | |
6801 pmaddwd m4, [r3 + 2 * 16] ; [18] | |
6802 paddd m4, [pd_16] | |
6803 psrld m4, 5 | |
6804 palignr m2, m1, m3, 8 ; [19 18 18 17 17 16 16 15] | |
6805 pmaddwd m2, [r3 + 2 * 16] | |
6806 paddd m2, [pd_16] | |
6807 psrld m2, 5 | |
6808 packusdw m4, m2 | |
6809 | |
6810 palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12] | |
6811 pmaddwd m2, [r3 - 4 * 16] ; [12] | |
6812 paddd m2, [pd_16] | |
6813 psrld m2, 5 | |
6814 palignr m6, m1, m3, 12 ; [20 19 19 18 18 17 17 16] | |
6815 pmaddwd m6, [r3 - 4 * 16] | |
6816 paddd m6, [pd_16] | |
6817 psrld m6, 5 | |
6818 packusdw m2, m6 | |
6819 | |
; fill the previously undefined top lane of m1 with ref[21] so the pair
; (ref[20], ref[21]) is complete for the next vector
6820 pinsrw m1, [r2 + 42], 7 | |
6821 pmaddwd m3, [r3 - 10 * 16] ; [6] | |
6822 paddd m3, [pd_16] | |
6823 psrld m3, 5 | |
6824 pmaddwd m1, [r3 - 10 * 16] | |
6825 paddd m1, [pd_16] | |
6826 psrld m1, 5 | |
6827 packusdw m3, m1 | |
6828 | |
; last vector: plain copy of ref[14..21], no interpolation
6829 movu m7, [r2 + 28] | |
6830 | |
6831 lea r5, [r5 + r1 * 4] | |
6832 | |
6833 TRANSPOSE_STORE m4, m2, m3, m7, m0, 24 | |
6834 | |
6835 ret | |
6836 | |
6837 cglobal ang16_mode_4_32 | |
6838 test r6d, r6d | |
6839 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
6840 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
6841 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
6842 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
6843 | |
6844 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
6845 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
6846 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
6847 | |
6848 mova m4, m3 | |
6849 pmaddwd m4, [r3 + 3 * 16] ; [21] | |
6850 paddd m4, [pd_16] | |
6851 psrld m4, 5 | |
6852 mova m2, m0 | |
6853 pmaddwd m2, [r3 + 3 * 16] | |
6854 paddd m2, [pd_16] | |
6855 psrld m2, 5 | |
6856 packusdw m4, m2 | |
6857 | |
6858 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
6859 mova m6, m2 | |
6860 pmaddwd m2, [r3 - 8 * 16] ; [10] | |
6861 paddd m2, [pd_16] | |
6862 psrld m2, 5 | |
6863 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6864 mova m7, m1 | |
6865 pmaddwd m1, [r3 - 8 * 16] | |
6866 paddd m1, [pd_16] | |
6867 psrld m1, 5 | |
6868 packusdw m2, m1 | |
6869 | |
6870 pmaddwd m6, [r3 + 13 * 16] ; [31] | |
6871 paddd m6, [pd_16] | |
6872 psrld m6, 5 | |
6873 pmaddwd m7, [r3 + 13 * 16] | |
6874 paddd m7, [pd_16] | |
6875 psrld m7, 5 | |
6876 packusdw m6, m7 | |
6877 | |
6878 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
6879 pmaddwd m7, [r3 + 2 * 16] ; [20] | |
6880 paddd m7, [pd_16] | |
6881 psrld m7, 5 | |
6882 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6883 pmaddwd m1, [r3 + 2 * 16] | |
6884 paddd m1, [pd_16] | |
6885 psrld m1, 5 | |
6886 packusdw m7, m1 | |
6887 | |
6888 mov r5, r0 | |
6889 | |
6890 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
6891 | |
6892 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
6893 mova m2, m4 | |
6894 pmaddwd m4, [r3 - 9 * 16] ; [9] | |
6895 paddd m4, [pd_16] | |
6896 psrld m4, 5 | |
6897 palignr m7, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6898 mova m6, m7 | |
6899 pmaddwd m7, [r3 - 9 * 16] | |
6900 paddd m7, [pd_16] | |
6901 psrld m7, 5 | |
6902 packusdw m4, m7 | |
6903 | |
6904 pmaddwd m2, [r3 + 12 * 16] ; [30] | |
6905 paddd m2, [pd_16] | |
6906 psrld m2, 5 | |
6907 pmaddwd m6, [r3 + 12 * 16] | |
6908 paddd m6, [pd_16] | |
6909 psrld m6, 5 | |
6910 packusdw m2, m6 | |
6911 | |
6912 mova m6, m0 | |
6913 pmaddwd m6, [r3 + 1 * 16] ; [19] | |
6914 paddd m6, [pd_16] | |
6915 psrld m6, 5 | |
6916 mova m7, m5 | |
6917 pmaddwd m7, [r3 + 1 * 16] | |
6918 paddd m7, [pd_16] | |
6919 psrld m7, 5 | |
6920 packusdw m6, m7 | |
6921 | |
6922 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] | |
6923 | |
6924 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6925 pmaddwd m7, [r3 - 10 * 16] ; [8] | |
6926 paddd m7, [pd_16] | |
6927 psrld m7, 5 | |
6928 palignr m3, m1, m5, 4 ; [14 13 13 12 12 11 11 10] | |
6929 pmaddwd m3, [r3 - 10 * 16] | |
6930 paddd m3, [pd_16] | |
6931 psrld m3, 5 | |
6932 packusdw m7, m3 | |
6933 | |
6934 lea r5, [r0 + r1 * 4] | |
6935 | |
6936 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 | |
6937 | |
6938 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] | |
6939 | |
6940 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] | |
6941 punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17] | |
6942 | |
6943 palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
6944 pmaddwd m4, [r3 + 11 * 16] ; [29] | |
6945 paddd m4, [pd_16] | |
6946 psrld m4, 5 | |
6947 palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10] | |
6948 pmaddwd m2, [r3 + 11 * 16] | |
6949 paddd m2, [pd_16] | |
6950 psrld m2, 5 | |
6951 packusdw m4, m2 | |
6952 | |
6953 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
6954 pmaddwd m2, [r3] ; [18] | |
6955 paddd m2, [pd_16] | |
6956 psrld m2, 5 | |
6957 palignr m6, m3, m5, 8 ; [15 14 14 13 13 12 12 11] | |
6958 pmaddwd m6, [r3] | |
6959 paddd m6, [pd_16] | |
6960 psrld m6, 5 | |
6961 packusdw m2, m6 | |
6962 | |
6963 palignr m6, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
6964 mova m7, m6 | |
6965 pmaddwd m6, [r3 - 11 * 16] ; [7] | |
6966 paddd m6, [pd_16] | |
6967 psrld m6, 5 | |
6968 palignr m0, m3, m5, 12 ; [15 16 15 14 14 13 13 12] | |
6969 pmaddwd m0, [r3 - 11 * 16] | |
6970 paddd m0, [pd_16] | |
6971 psrld m0, 5 | |
6972 packusdw m6, m0 | |
6973 | |
6974 pmaddwd m7, [r3 + 10 * 16] ; [28] | |
6975 paddd m7, [pd_16] | |
6976 psrld m7, 5 | |
6977 palignr m0, m3, m5, 12 ; [15 16 15 14 14 13 13 12] | |
6978 pmaddwd m0, [r3 + 10 * 16] | |
6979 paddd m0, [pd_16] | |
6980 psrld m0, 5 | |
6981 packusdw m7, m0 | |
6982 | |
6983 lea r5, [r5 + r1 * 4] | |
6984 | |
6985 TRANSPOSE_STORE m4, m2, m6, m7, m0, 16 | |
6986 | |
6987 mova m4, m5 | |
6988 pmaddwd m4, [r3 - 1 * 16] ; [17] | |
6989 paddd m4, [pd_16] | |
6990 psrld m4, 5 | |
6991 mova m2, m3 | |
6992 pmaddwd m2, [r3 - 1 * 16] | |
6993 paddd m2, [pd_16] | |
6994 psrld m2, 5 | |
6995 packusdw m4, m2 | |
6996 | |
6997 palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10] | |
6998 mova m7, m2 | |
6999 pmaddwd m2, [r3 - 12 * 16] ; [6] | |
7000 paddd m2, [pd_16] | |
7001 psrld m2, 5 | |
7002 palignr m6, m1, m3, 4 ; [18 17 17 16 16 15 15 14] | |
7003 mova m0, m6 | |
7004 pmaddwd m6, [r3 - 12 * 16] | |
7005 paddd m6, [pd_16] | |
7006 psrld m6, 5 | |
7007 packusdw m2, m6 | |
7008 | |
7009 pmaddwd m7, [r3 + 9 * 16] ; [27] | |
7010 paddd m7, [pd_16] | |
7011 psrld m7, 5 | |
7012 pmaddwd m0, [r3 + 9 * 16] | |
7013 paddd m0, [pd_16] | |
7014 psrld m0, 5 | |
7015 packusdw m7, m0 | |
7016 | |
7017 palignr m0, m3, m5, 8 ; [15 14 14 13 13 12 12 11] | |
7018 pmaddwd m0, [r3 - 2 * 16] ; [16] | |
7019 paddd m0, [pd_16] | |
7020 psrld m0, 5 | |
7021 palignr m1, m3, 8 ; [19 18 18 17 17 16 16 15] | |
7022 pmaddwd m1, [r3 - 2 * 16] | |
7023 paddd m1, [pd_16] | |
7024 psrld m1, 5 | |
7025 packusdw m0, m1 | |
7026 | |
7027 lea r5, [r5 + r1 * 4] | |
7028 | |
7029 TRANSPOSE_STORE m4, m2, m7, m0, m3, 24 | |
7030 | |
7031 ret | |
7032 | |
;-----------------------------------------------------------------------
; ang16_mode_5_31
; Computes 16 interpolated 8-word vectors and hands them to
; TRANSPOSE_STORE in four batches (offset arguments 0, 8, 16, 24).
;
; In:   r2  = pointer to 16-bit reference samples; per the load
;             annotations, sample k sits at r2 + 2*k
;       r3  = pointer into a table of 16-byte pmaddwd weight entries;
;             per the bracketed notes [r3] is entry 16 and
;             [r3 + n*16] is entry 16+n (presumably ang_table biased
;             by the caller - TODO confirm)
;       r0, r1 = only used to form r5 = r0 + k*4*r1 before batch k
;             (presumably destination pointer and stride - confirm)
;       r6d = tested on entry; none of the instructions up to the first
;             TRANSPOSE_STORE modify the flags, so the macro presumably
;             branches on this result - confirm against the macro
; Each vector = ((sample pair . weight pair) + [pd_16]) >> 5, narrowed
; to words with unsigned saturation (packusdw).
; Writes m0-m7, r5 and the flags; the macro may clobber more.
;-----------------------------------------------------------------------
7033 cglobal ang16_mode_5_31 | |
7034 test r6d, r6d | |
7035 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
7036 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
7037 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
7038 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
7039 | |
; interleave each sample with its successor so one pmaddwd blends the pair
7040 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
7041 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
7042 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
7043 | |
7044 mova m4, m3 | |
7045 pmaddwd m4, [r3 + 1 * 16] ; [17] | |
7046 paddd m4, [pd_16] | |
7047 psrld m4, 5 | |
7048 mova m2, m0 | |
7049 pmaddwd m2, [r3 + 1 * 16] | |
7050 paddd m2, [pd_16] | |
7051 psrld m2, 5 | |
7052 packusdw m4, m2 | |
7053 | |
7054 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
7055 mova m6, m2 | |
7056 pmaddwd m2, [r3 - 14 * 16] ; [2] | |
7057 paddd m2, [pd_16] | |
7058 psrld m2, 5 | |
7059 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7060 mova m7, m1 | |
7061 pmaddwd m1, [r3 - 14 * 16] | |
7062 paddd m1, [pd_16] | |
7063 psrld m1, 5 | |
7064 packusdw m2, m1 | |
7065 | |
7066 pmaddwd m6, [r3 + 3 * 16] ; [19] | |
7067 paddd m6, [pd_16] | |
7068 psrld m6, 5 | |
7069 pmaddwd m7, [r3 + 3 * 16] | |
7070 paddd m7, [pd_16] | |
7071 psrld m7, 5 | |
7072 packusdw m6, m7 | |
7073 | |
7074 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
7075 pmaddwd m7, [r3 - 12 * 16] ; [4] | |
7076 paddd m7, [pd_16] | |
7077 psrld m7, 5 | |
7078 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7079 pmaddwd m1, [r3 - 12 * 16] | |
7080 paddd m1, [pd_16] | |
7081 psrld m1, 5 | |
7082 packusdw m7, m1 | |
7083 | |
7084 mov r5, r0 | |
7085 | |
7086 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
7087 | |
7088 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
7089 pmaddwd m4, [r3 + 5 * 16] ; [21] | |
7090 paddd m4, [pd_16] | |
7091 psrld m4, 5 | |
7092 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7093 pmaddwd m7, [r3 + 5 * 16] | |
7094 paddd m7, [pd_16] | |
7095 psrld m7, 5 | |
7096 packusdw m4, m7 | |
7097 | |
7098 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
7099 mova m6, m2 | |
7100 pmaddwd m2, [r3 - 10 * 16] ; [6] | |
7101 paddd m2, [pd_16] | |
7102 psrld m2, 5 | |
7103 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
7104 mova m7, m1 | |
7105 pmaddwd m1, [r3 - 10 * 16] | |
7106 paddd m1, [pd_16] | |
7107 psrld m1, 5 | |
7108 packusdw m2, m1 | |
7109 | |
7110 pmaddwd m6, [r3 + 7 * 16] ; [23] | |
7111 paddd m6, [pd_16] | |
7112 psrld m6, 5 | |
7113 pmaddwd m7, [r3 + 7 * 16] | |
7114 paddd m7, [pd_16] | |
7115 psrld m7, 5 | |
7116 packusdw m6, m7 | |
7117 | |
7118 mova m7, m0 | |
7119 pmaddwd m7, [r3 - 8 * 16] ; [8] | |
7120 paddd m7, [pd_16] | |
7121 psrld m7, 5 | |
7122 mova m3, m5 | |
7123 pmaddwd m3, [r3 - 8 * 16] | |
7124 paddd m3, [pd_16] | |
7125 psrld m3, 5 | |
7126 packusdw m7, m3 | |
7127 | |
7128 lea r5, [r0 + r1 * 4] | |
7129 | |
7130 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 | |
7131 | |
; m3 (pairs 1..5) is dead from here on; reload it with pairs 13..17
7132 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] | |
7133 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] | |
7134 | |
7135 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] | |
7136 | |
7137 mova m4, m0 | |
7138 pmaddwd m4, [r3 + 9 * 16] ; [25] | |
7139 paddd m4, [pd_16] | |
7140 psrld m4, 5 | |
7141 mova m2, m5 | |
7142 pmaddwd m2, [r3 + 9 * 16] | |
7143 paddd m2, [pd_16] | |
7144 psrld m2, 5 | |
7145 packusdw m4, m2 | |
7146 | |
7147 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7148 mova m6, m2 | |
7149 pmaddwd m2, [r3 - 6 * 16] ; [10] | |
7150 paddd m2, [pd_16] | |
7151 psrld m2, 5 | |
7152 palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10] | |
7153 mova m1, m7 | |
7154 pmaddwd m7, [r3 - 6 * 16] | |
7155 paddd m7, [pd_16] | |
7156 psrld m7, 5 | |
7157 packusdw m2, m7 | |
7158 | |
7159 pmaddwd m6, [r3 + 11 * 16] ; [27] | |
7160 paddd m6, [pd_16] | |
7161 psrld m6, 5 | |
7162 pmaddwd m1, [r3 + 11 * 16] | |
7163 paddd m1, [pd_16] | |
7164 psrld m1, 5 | |
7165 packusdw m6, m1 | |
7166 | |
7167 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7168 pmaddwd m7, [r3 - 4 * 16] ; [12] | |
7169 paddd m7, [pd_16] | |
7170 psrld m7, 5 | |
7171 palignr m1, m3, m5, 8 ; [15 14 14 13 13 12 12 11] | |
7172 pmaddwd m1, [r3 - 4 * 16] | |
7173 paddd m1, [pd_16] | |
7174 psrld m1, 5 | |
7175 packusdw m7, m1 | |
7176 | |
7177 lea r5, [r5 + r1 * 4] | |
7178 | |
7179 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
7180 | |
7181 palignr m4, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7182 pmaddwd m4, [r3 + 13 * 16] ; [29] | |
7183 paddd m4, [pd_16] | |
7184 psrld m4, 5 | |
7185 palignr m2, m3, m5, 8 ; [15 14 14 13 13 12 12 11] | |
7186 pmaddwd m2, [r3 + 13 * 16] | |
7187 paddd m2, [pd_16] | |
7188 psrld m2, 5 | |
7189 packusdw m4, m2 | |
7190 | |
7191 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
7192 mova m7, m2 | |
7193 pmaddwd m2, [r3 - 2 * 16] ; [14] | |
7194 paddd m2, [pd_16] | |
7195 psrld m2, 5 | |
7196 palignr m6, m3, m5, 12 ; [16 15 15 14 14 13 13 12] | |
7197 mova m0, m6 | |
7198 pmaddwd m6, [r3 - 2 * 16] | |
7199 paddd m6, [pd_16] | |
7200 psrld m6, 5 | |
7201 packusdw m2, m6 | |
7202 | |
7203 pmaddwd m7, [r3 + 15 * 16] ; [31] | |
7204 paddd m7, [pd_16] | |
7205 psrld m7, 5 | |
7206 pmaddwd m0, [r3 + 15 * 16] | |
7207 paddd m0, [pd_16] | |
7208 psrld m0, 5 | |
7209 packusdw m7, m0 | |
7210 | |
; last vector: m5/m3 are consumed in place, nothing reads them afterwards
7211 pmaddwd m5, [r3] ; [16] | |
7212 paddd m5, [pd_16] | |
7213 psrld m5, 5 | |
7214 pmaddwd m3, [r3] | |
7215 paddd m3, [pd_16] | |
7216 psrld m3, 5 | |
7217 packusdw m5, m3 | |
7218 | |
7219 lea r5, [r5 + r1 * 4] | |
7220 | |
7221 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24 | |
7222 | |
7223 ret | |
7224 | |
;-----------------------------------------------------------------------
; ang16_mode_6_30
; Computes 16 interpolated 8-word vectors and hands them to
; TRANSPOSE_STORE in four batches (offset arguments 0, 8, 16, 24).
;
; In:   r2  = pointer to 16-bit reference samples; per the load
;             annotations, sample k sits at r2 + 2*k
;       r3  = pointer into a table of 16-byte pmaddwd weight entries;
;             per the bracketed notes [r3] is entry 15 and
;             [r3 + n*16] is entry 15+n (presumably ang_table biased
;             by the caller - TODO confirm)
;       r0, r1 = only used to form r5 = r0 + k*4*r1 before batch k
;             (presumably destination pointer and stride - confirm)
;       r6d = tested on entry; none of the instructions up to the first
;             TRANSPOSE_STORE modify the flags, so the macro presumably
;             branches on this result - confirm against the macro
; Each vector = ((sample pair . weight pair) + [pd_16]) >> 5, narrowed
; to words with unsigned saturation (packusdw).
; Writes m0-m7, r5 and the flags; the macro may clobber more.
;-----------------------------------------------------------------------
7225 cglobal ang16_mode_6_30 | |
7226 test r6d, r6d | |
7227 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
7228 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
7229 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
7230 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
7231 | |
; interleave each sample with its successor so one pmaddwd blends the pair
7232 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
7233 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
7234 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
7235 | |
7236 mova m4, m3 | |
7237 pmaddwd m4, [r3 - 2 * 16] ; [13] | |
7238 paddd m4, [pd_16] | |
7239 psrld m4, 5 | |
7240 mova m2, m0 | |
7241 pmaddwd m2, [r3 - 2 * 16] | |
7242 paddd m2, [pd_16] | |
7243 psrld m2, 5 | |
7244 packusdw m4, m2 | |
7245 | |
7246 mova m2, m3 | |
7247 pmaddwd m2, [r3 + 11 * 16] ; [26] | |
7248 paddd m2, [pd_16] | |
7249 psrld m2, 5 | |
7250 mova m1, m0 | |
7251 pmaddwd m1, [r3 + 11 * 16] | |
7252 paddd m1, [pd_16] | |
7253 psrld m1, 5 | |
7254 packusdw m2, m1 | |
7255 | |
7256 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
7257 mova m7, m6 | |
7258 pmaddwd m6, [r3 - 8 * 16] ; [7] | |
7259 paddd m6, [pd_16] | |
7260 psrld m6, 5 | |
7261 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7262 pmaddwd m1, [r3 - 8 * 16] | |
7263 paddd m1, [pd_16] | |
7264 psrld m1, 5 | |
7265 packusdw m6, m1 | |
7266 | |
7267 pmaddwd m7, [r3 + 5 * 16] ; [20] | |
7268 paddd m7, [pd_16] | |
7269 psrld m7, 5 | |
7270 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7271 pmaddwd m1, [r3 + 5 * 16] | |
7272 paddd m1, [pd_16] | |
7273 psrld m1, 5 | |
7274 packusdw m7, m1 | |
7275 | |
7276 mov r5, r0 | |
7277 | |
7278 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
7279 | |
7280 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
7281 mova m6, m4 | |
7282 pmaddwd m4, [r3 - 14 * 16] ; [1] | |
7283 paddd m4, [pd_16] | |
7284 psrld m4, 5 | |
7285 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7286 mova m7, m1 | |
7287 pmaddwd m1, [r3 - 14 * 16] | |
7288 paddd m1, [pd_16] | |
7289 psrld m1, 5 | |
7290 packusdw m4, m1 | |
7291 | |
7292 mova m2, m6 | |
7293 pmaddwd m2, [r3 - 1 * 16] ; [14] | |
7294 paddd m2, [pd_16] | |
7295 psrld m2, 5 | |
7296 mova m1, m7 | |
7297 pmaddwd m1, [r3 - 1 * 16] | |
7298 paddd m1, [pd_16] | |
7299 psrld m1, 5 | |
7300 packusdw m2, m1 | |
7301 | |
7302 pmaddwd m6, [r3 + 12 * 16] ; [27] | |
7303 paddd m6, [pd_16] | |
7304 psrld m6, 5 | |
7305 pmaddwd m7, [r3 + 12 * 16] | |
7306 paddd m7, [pd_16] | |
7307 psrld m7, 5 | |
7308 packusdw m6, m7 | |
7309 | |
7310 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
7311 pmaddwd m7, [r3 - 7 * 16] ; [8] | |
7312 paddd m7, [pd_16] | |
7313 psrld m7, 5 | |
7314 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
7315 pmaddwd m1, [r3 - 7 * 16] | |
7316 paddd m1, [pd_16] | |
7317 psrld m1, 5 | |
7318 packusdw m7, m1 | |
7319 | |
7320 lea r5, [r0 + r1 * 4] | |
7321 | |
7322 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
7323 | |
7324 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
7325 pmaddwd m4, [r3 + 6 * 16] ; [21] | |
7326 paddd m4, [pd_16] | |
7327 psrld m4, 5 | |
7328 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
7329 pmaddwd m2, [r3 + 6 * 16] | |
7330 paddd m2, [pd_16] | |
7331 psrld m2, 5 | |
7332 packusdw m4, m2 | |
7333 | |
7334 mova m2, m0 | |
7335 pmaddwd m2, [r3 - 13 * 16] ; [2] | |
7336 paddd m2, [pd_16] | |
7337 psrld m2, 5 | |
7338 mova m7, m5 | |
7339 pmaddwd m7, [r3 - 13 * 16] | |
7340 paddd m7, [pd_16] | |
7341 psrld m7, 5 | |
7342 packusdw m2, m7 | |
7343 | |
7344 mova m6, m0 | |
7345 pmaddwd m6, [r3] ; [15] | |
7346 paddd m6, [pd_16] | |
7347 psrld m6, 5 | |
7348 mova m1, m5 | |
7349 pmaddwd m1, [r3] | |
7350 paddd m1, [pd_16] | |
7351 psrld m1, 5 | |
7352 packusdw m6, m1 | |
7353 | |
7354 mova m7, m0 | |
7355 pmaddwd m7, [r3 + 13 * 16] ; [28] | |
7356 paddd m7, [pd_16] | |
7357 psrld m7, 5 | |
7358 mova m1, m5 | |
7359 pmaddwd m1, [r3 + 13 * 16] | |
7360 paddd m1, [pd_16] | |
7361 psrld m1, 5 | |
7362 packusdw m7, m1 | |
7363 | |
7364 lea r5, [r5 + r1 * 4] | |
7365 | |
7366 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
7367 | |
; movh loads only the low 8 bytes: raw (non-interleaved) samples 13..16.
; The low two words (13, 14) are exactly the next interleaved pair, so
; they can be shifted into the pair stream directly with palignr.
7368 movh m3, [r2 + 26] ; [16 15 14 13] | |
7369 | |
7370 palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7371 mova m2, m4 | |
7372 pmaddwd m4, [r3 - 6 * 16] ; [9] | |
7373 paddd m4, [pd_16] | |
7374 psrld m4, 5 | |
7375 palignr m1, m3, m5, 4 ; [14 13 13 12 12 11 11 10] | |
7376 mova m6, m1 | |
7377 pmaddwd m1, [r3 - 6 * 16] | |
7378 paddd m1, [pd_16] | |
7379 psrld m1, 5 | |
7380 packusdw m4, m1 | |
7381 | |
7382 pmaddwd m2, [r3 + 7 * 16] ; [22] | |
7383 paddd m2, [pd_16] | |
7384 psrld m2, 5 | |
7385 mova m1, m6 | |
7386 pmaddwd m1, [r3 + 7 * 16] | |
7387 paddd m1, [pd_16] | |
7388 psrld m1, 5 | |
7389 packusdw m2, m1 | |
7390 | |
; drop sample 13 so the low two words of m3 become the pair (14, 15)
7391 psrldq m3, 2 | |
7392 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7393 mova m5, m7 | |
7394 pmaddwd m7, [r3 - 12 * 16] ; [3] | |
7395 paddd m7, [pd_16] | |
7396 psrld m7, 5 | |
7397 palignr m3, m6, 4 ; [15 14 14 13 13 12 12 11] | |
7398 mova m1, m3 | |
7399 pmaddwd m3, [r3 - 12 * 16] | |
7400 paddd m3, [pd_16] | |
7401 psrld m3, 5 | |
7402 packusdw m7, m3 | |
7403 | |
7404 pmaddwd m5, [r3 + 1 * 16] ; [16] | |
7405 paddd m5, [pd_16] | |
7406 psrld m5, 5 | |
7407 pmaddwd m1, [r3 + 1 * 16] | |
7408 paddd m1, [pd_16] | |
7409 psrld m1, 5 | |
7410 packusdw m5, m1 | |
7411 | |
7412 lea r5, [r5 + r1 * 4] | |
7413 | |
7414 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24 | |
7415 | |
7416 ret | |
7417 | |
;-----------------------------------------------------------------------
; ang16_mode_7_29
; Computes 16 interpolated 8-word vectors and hands them to
; TRANSPOSE_STORE in four batches (offset arguments 0, 8, 16, 24).
;
; In:   r2  = pointer to 16-bit reference samples; per the load
;             annotations, sample k sits at r2 + 2*k
;       r3  = pointer into a table of 16-byte pmaddwd weight entries;
;             per the bracketed notes [r3] is entry 17 and
;             [r3 + n*16] is entry 17+n (presumably ang_table biased
;             by the caller - TODO confirm)
;       r0, r1 = only used to form r5 = r0 + k*4*r1 before batch k
;             (presumably destination pointer and stride - confirm)
;       r6d = tested on entry; none of the instructions up to the first
;             TRANSPOSE_STORE modify the flags, so the macro presumably
;             branches on this result - confirm against the macro
; Each vector = ((sample pair . weight pair) + [pd_16]) >> 5, narrowed
; to words with unsigned saturation (packusdw).
; Writes m0-m7, r5 and the flags; the macro may clobber more.
;-----------------------------------------------------------------------
7418 cglobal ang16_mode_7_29 | |
7419 test r6d, r6d | |
7420 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
7421 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
7422 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
7423 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
7424 | |
; interleave each sample with its successor so one pmaddwd blends the pair
7425 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
7426 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
7427 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
7428 | |
7429 mova m4, m3 | |
7430 pmaddwd m4, [r3 - 8 * 16] ; [9] | |
7431 paddd m4, [pd_16] | |
7432 psrld m4, 5 | |
7433 mova m2, m0 | |
7434 pmaddwd m2, [r3 - 8 * 16] | |
7435 paddd m2, [pd_16] | |
7436 psrld m2, 5 | |
7437 packusdw m4, m2 | |
7438 | |
7439 mova m2, m3 | |
7440 pmaddwd m2, [r3 + 1 * 16] ; [18] | |
7441 paddd m2, [pd_16] | |
7442 psrld m2, 5 | |
7443 mova m1, m0 | |
7444 pmaddwd m1, [r3 + 1 * 16] | |
7445 paddd m1, [pd_16] | |
7446 psrld m1, 5 | |
7447 packusdw m2, m1 | |
7448 | |
7449 mova m6, m3 | |
7450 pmaddwd m6, [r3 + 10 * 16] ; [27] | |
7451 paddd m6, [pd_16] | |
7452 psrld m6, 5 | |
7453 mova m1, m0 | |
7454 pmaddwd m1, [r3 + 10 * 16] | |
7455 paddd m1, [pd_16] | |
7456 psrld m1, 5 | |
7457 packusdw m6, m1 | |
7458 | |
7459 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
7460 pmaddwd m7, [r3 - 13 * 16] ; [4] | |
7461 paddd m7, [pd_16] | |
7462 psrld m7, 5 | |
7463 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7464 pmaddwd m1, [r3 - 13 * 16] | |
7465 paddd m1, [pd_16] | |
7466 psrld m1, 5 | |
7467 packusdw m7, m1 | |
7468 | |
7469 mov r5, r0 | |
7470 | |
7471 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
7472 | |
7473 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
7474 mova m6, m4 | |
7475 pmaddwd m4, [r3 - 4 * 16] ; [13] | |
7476 paddd m4, [pd_16] | |
7477 psrld m4, 5 | |
7478 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7479 mova m7, m1 | |
7480 pmaddwd m1, [r3 - 4 * 16] | |
7481 paddd m1, [pd_16] | |
7482 psrld m1, 5 | |
7483 packusdw m4, m1 | |
7484 | |
7485 mova m2, m6 | |
7486 pmaddwd m2, [r3 + 5 * 16] ; [22] | |
7487 paddd m2, [pd_16] | |
7488 psrld m2, 5 | |
7489 mova m1, m7 | |
7490 pmaddwd m1, [r3 + 5 * 16] | |
7491 paddd m1, [pd_16] | |
7492 psrld m1, 5 | |
7493 packusdw m2, m1 | |
7494 | |
7495 pmaddwd m6, [r3 + 14 * 16] ; [31] | |
7496 paddd m6, [pd_16] | |
7497 psrld m6, 5 | |
7498 pmaddwd m7, [r3 + 14 * 16] | |
7499 paddd m7, [pd_16] | |
7500 psrld m7, 5 | |
7501 packusdw m6, m7 | |
7502 | |
7503 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
7504 pmaddwd m7, [r3 - 9 * 16] ; [8] | |
7505 paddd m7, [pd_16] | |
7506 psrld m7, 5 | |
7507 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7508 pmaddwd m1, [r3 - 9 * 16] | |
7509 paddd m1, [pd_16] | |
7510 psrld m1, 5 | |
7511 packusdw m7, m1 | |
7512 | |
7513 lea r5, [r0 + r1 * 4] | |
7514 | |
7515 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
7516 | |
7517 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
7518 mova m2, m4 | |
7519 pmaddwd m4, [r3] ; [17] | |
7520 paddd m4, [pd_16] | |
7521 psrld m4, 5 | |
7522 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7523 mova m7, m1 | |
7524 pmaddwd m1, [r3] | |
7525 paddd m1, [pd_16] | |
7526 psrld m1, 5 | |
7527 packusdw m4, m1 | |
7528 | |
7529 pmaddwd m2, [r3 + 9 * 16] ; [26] | |
7530 paddd m2, [pd_16] | |
7531 psrld m2, 5 | |
7532 pmaddwd m7, [r3 + 9 * 16] | |
7533 paddd m7, [pd_16] | |
7534 psrld m7, 5 | |
7535 packusdw m2, m7 | |
7536 | |
7537 palignr m6, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
7538 pmaddwd m6, [r3 - 14 * 16] ; [3] | |
7539 paddd m6, [pd_16] | |
7540 psrld m6, 5 | |
7541 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
7542 pmaddwd m1, [r3 - 14 * 16] | |
7543 paddd m1, [pd_16] | |
7544 psrld m1, 5 | |
7545 packusdw m6, m1 | |
7546 | |
7547 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
7548 pmaddwd m7, [r3 - 5 * 16] ; [12] | |
7549 paddd m7, [pd_16] | |
7550 psrld m7, 5 | |
7551 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
7552 pmaddwd m1, [r3 - 5 * 16] | |
7553 paddd m1, [pd_16] | |
7554 psrld m1, 5 | |
7555 packusdw m7, m1 | |
7556 | |
7557 lea r5, [r5 + r1 * 4] | |
7558 | |
7559 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
7560 | |
7561 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] | |
7562 mova m2, m4 | |
7563 pmaddwd m4, [r3 + 4 * 16] ; [21] | |
7564 paddd m4, [pd_16] | |
7565 psrld m4, 5 | |
7566 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] | |
; m3 (pairs 1..5) was last read above; it is reused as a plain temporary
7567 mova m3, m1 | |
7568 pmaddwd m1, [r3 + 4 * 16] | |
7569 paddd m1, [pd_16] | |
7570 psrld m1, 5 | |
7571 packusdw m4, m1 | |
7572 | |
7573 pmaddwd m2, [r3 + 13 * 16] ; [30] | |
7574 paddd m2, [pd_16] | |
7575 psrld m2, 5 | |
7576 pmaddwd m3, [r3 + 13 * 16] | |
7577 paddd m3, [pd_16] | |
7578 psrld m3, 5 | |
7579 packusdw m2, m3 | |
7580 | |
7581 mova m7, m0 | |
7582 pmaddwd m7, [r3 - 10 * 16] ; [7] | |
7583 paddd m7, [pd_16] | |
7584 psrld m7, 5 | |
7585 mova m3, m5 | |
7586 pmaddwd m3, [r3 - 10 * 16] | |
7587 paddd m3, [pd_16] | |
7588 psrld m3, 5 | |
7589 packusdw m7, m3 | |
7590 | |
; last vector: m0/m5 are consumed in place, nothing reads them afterwards
7591 pmaddwd m0, [r3 - 1 * 16] ; [16] | |
7592 paddd m0, [pd_16] | |
7593 psrld m0, 5 | |
7594 pmaddwd m5, [r3 - 1 * 16] | |
7595 paddd m5, [pd_16] | |
7596 psrld m5, 5 | |
7597 packusdw m0, m5 | |
7598 | |
7599 lea r5, [r5 + r1 * 4] | |
7600 | |
7601 TRANSPOSE_STORE m4, m2, m7, m0, m3, 24 | |
7602 | |
7603 ret | |
7604 | |
;-----------------------------------------------------------------------
; ang16_mode_8_28
; Computes 16 interpolated 8-word vectors and hands them to
; TRANSPOSE_STORE in four batches (offset arguments 0, 8, 16, 24).
;
; In:   r2  = pointer to 16-bit reference samples; per the load
;             annotations, sample k sits at r2 + 2*k
;       r3  = pointer into a table of 16-byte pmaddwd weight entries;
;             per the bracketed notes [r3] is entry 15 and
;             [r3 + n*16] is entry 15+n (presumably ang_table biased
;             by the caller - TODO confirm)
;       r0, r1 = only used to form r5 = r0 + k*4*r1 before batch k
;             (presumably destination pointer and stride - confirm)
;       r6d = tested on entry; none of the instructions up to the first
;             TRANSPOSE_STORE modify the flags, so the macro presumably
;             branches on this result - confirm against the macro
; Each vector = ((sample pair . weight pair) + [pd_16]) >> 5, narrowed
; to words with unsigned saturation (packusdw).
; Writes m0-m7, r5 and the flags; the macro may clobber more.
;-----------------------------------------------------------------------
7605 cglobal ang16_mode_8_28 | |
7606 test r6d, r6d | |
7607 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
7608 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
7609 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] | |
7610 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] | |
7611 | |
; interleave each sample with its successor so one pmaddwd blends the pair
7612 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] | |
7613 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] | |
7614 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] | |
7615 | |
7616 mova m4, m3 | |
7617 pmaddwd m4, [r3 - 10 * 16] ; [5] | |
7618 paddd m4, [pd_16] | |
7619 psrld m4, 5 | |
7620 mova m2, m0 | |
7621 pmaddwd m2, [r3 - 10 * 16] | |
7622 paddd m2, [pd_16] | |
7623 psrld m2, 5 | |
7624 packusdw m4, m2 | |
7625 | |
7626 mova m2, m3 | |
7627 pmaddwd m2, [r3 - 5 * 16] ; [10] | |
7628 paddd m2, [pd_16] | |
7629 psrld m2, 5 | |
7630 mova m1, m0 | |
7631 pmaddwd m1, [r3 - 5 * 16] | |
7632 paddd m1, [pd_16] | |
7633 psrld m1, 5 | |
7634 packusdw m2, m1 | |
7635 | |
7636 mova m6, m3 | |
7637 pmaddwd m6, [r3] ; [15] | |
7638 paddd m6, [pd_16] | |
7639 psrld m6, 5 | |
7640 mova m1, m0 | |
7641 pmaddwd m1, [r3] | |
7642 paddd m1, [pd_16] | |
7643 psrld m1, 5 | |
7644 packusdw m6, m1 | |
7645 | |
7646 mova m7, m3 | |
7647 pmaddwd m7, [r3 + 5 * 16] ; [20] | |
7648 paddd m7, [pd_16] | |
7649 psrld m7, 5 | |
7650 mova m1, m0 | |
7651 pmaddwd m1, [r3 + 5 * 16] | |
7652 paddd m1, [pd_16] | |
7653 psrld m1, 5 | |
7654 packusdw m7, m1 | |
7655 | |
7656 mov r5, r0 | |
7657 | |
7658 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
7659 | |
7660 mova m4, m3 | |
7661 pmaddwd m4, [r3 + 10 * 16] ; [25] | |
7662 paddd m4, [pd_16] | |
7663 psrld m4, 5 | |
7664 mova m1, m0 | |
7665 pmaddwd m1, [r3 + 10 * 16] | |
7666 paddd m1, [pd_16] | |
7667 psrld m1, 5 | |
7668 packusdw m4, m1 | |
7669 | |
7670 mova m2, m3 | |
7671 pmaddwd m2, [r3 + 15 * 16] ; [30] | |
7672 paddd m2, [pd_16] | |
7673 psrld m2, 5 | |
7674 mova m1, m0 | |
7675 pmaddwd m1, [r3 + 15 * 16] | |
7676 paddd m1, [pd_16] | |
7677 psrld m1, 5 | |
7678 packusdw m2, m1 | |
7679 | |
7680 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
7681 pmaddwd m6, [r3 - 12 * 16] ; [3] | |
7682 paddd m6, [pd_16] | |
7683 psrld m6, 5 | |
7684 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7685 pmaddwd m7, [r3 - 12 * 16] | |
7686 paddd m7, [pd_16] | |
7687 psrld m7, 5 | |
7688 packusdw m6, m7 | |
7689 | |
7690 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
7691 pmaddwd m7, [r3 - 7 * 16] ; [8] | |
7692 paddd m7, [pd_16] | |
7693 psrld m7, 5 | |
7694 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7695 pmaddwd m1, [r3 - 7 * 16] | |
7696 paddd m1, [pd_16] | |
7697 psrld m1, 5 | |
7698 packusdw m7, m1 | |
7699 | |
7700 lea r5, [r0 + r1 * 4] | |
7701 | |
7702 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
7703 | |
7704 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] | |
7705 mova m7, m4 | |
7706 pmaddwd m4, [r3 - 2 *16] ; [13] | |
7707 paddd m4, [pd_16] | |
7708 psrld m4, 5 | |
7709 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7710 mova m1, m6 | |
7711 pmaddwd m6, [r3 - 2 * 16] | |
7712 paddd m6, [pd_16] | |
7713 psrld m6, 5 | |
7714 packusdw m4, m6 | |
7715 | |
7716 mova m2, m7 | |
7717 pmaddwd m2, [r3 + 3 * 16] ; [18] | |
7718 paddd m2, [pd_16] | |
7719 psrld m2, 5 | |
7720 mova m6, m1 | |
7721 pmaddwd m6, [r3 + 3 * 16] | |
7722 paddd m6, [pd_16] | |
7723 psrld m6, 5 | |
7724 packusdw m2, m6 | |
7725 | |
7726 mova m6, m7 | |
7727 pmaddwd m6, [r3 + 8 * 16] ; [23] | |
7728 paddd m6, [pd_16] | |
7729 psrld m6, 5 | |
7730 pmaddwd m1, [r3 + 8 * 16] | |
7731 paddd m1, [pd_16] | |
7732 psrld m1, 5 | |
7733 packusdw m6, m1 | |
7734 | |
7735 pmaddwd m7, [r3 + 13 * 16] ; [28] | |
7736 paddd m7, [pd_16] | |
7737 psrld m7, 5 | |
7738 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] | |
7739 pmaddwd m1, [r3 + 13 * 16] | |
7740 paddd m1, [pd_16] | |
7741 psrld m1, 5 | |
7742 packusdw m7, m1 | |
7743 | |
7744 lea r5, [r5 + r1 * 4] | |
7745 | |
7746 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
7747 | |
; final batch: all four vectors share the same sample pairs (kept in
; m1/m5), so m0 and m3 are free to be reused as temporaries below
7748 palignr m1, m0, m3, 8 ; [7 6 6 5 5 4 4 3] | |
7749 mova m4, m1 | |
7750 pmaddwd m4, [r3 - 14 * 16] ; [1] | |
7751 paddd m4, [pd_16] | |
7752 psrld m4, 5 | |
7753 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] | |
7754 mova m0, m5 | |
7755 pmaddwd m0, [r3 - 14 * 16] | |
7756 paddd m0, [pd_16] | |
7757 psrld m0, 5 | |
7758 packusdw m4, m0 | |
7759 | |
7760 mova m2, m1 | |
7761 pmaddwd m2, [r3 - 9 * 16] ; [6] | |
7762 paddd m2, [pd_16] | |
7763 psrld m2, 5 | |
7764 mova m3, m5 | |
7765 pmaddwd m3, [r3 - 9 * 16] | |
7766 paddd m3, [pd_16] | |
7767 psrld m3, 5 | |
7768 packusdw m2, m3 | |
7769 | |
7770 mova m7, m1 | |
7771 pmaddwd m7, [r3 - 4 * 16] ; [11] | |
7772 paddd m7, [pd_16] | |
7773 psrld m7, 5 | |
7774 mova m3, m5 | |
7775 pmaddwd m3, [r3 - 4 * 16] | |
7776 paddd m3, [pd_16] | |
7777 psrld m3, 5 | |
7778 packusdw m7, m3 | |
7779 | |
7780 pmaddwd m1, [r3 + 1 * 16] ; [16] | |
7781 paddd m1, [pd_16] | |
7782 psrld m1, 5 | |
7783 pmaddwd m5, [r3 + 1 * 16] | |
7784 paddd m5, [pd_16] | |
7785 psrld m5, 5 | |
7786 packusdw m1, m5 | |
7787 | |
7788 lea r5, [r5 + r1 * 4] | |
7789 | |
7790 TRANSPOSE_STORE m4, m2, m7, m1, m3, 24 | |
7791 | |
7792 ret | |
7793 | |
;-----------------------------------------------------------------------
; ang16_mode_9_27
; Computes 15 interpolated 8-word vectors plus one vector of raw
; samples and hands them to TRANSPOSE_STORE in four batches (offset
; arguments 0, 8, 16, 24). Every interpolated vector uses the same
; sample pairs (samples 1..9); only the weight entry changes.
;
; In:   r2  = pointer to 16-bit reference samples; per the load
;             annotations, sample k sits at r2 + 2*k
;       r3  = pointer into a table of 16-byte pmaddwd weight entries;
;             per the bracketed notes [r3] is entry 16 and
;             [r3 + n*16] is entry 16+n (presumably ang_table biased
;             by the caller - TODO confirm)
;       r0, r1 = only used to form r5 = r0 + k*4*r1 before batch k
;             (presumably destination pointer and stride - confirm)
;       r6d = tested on entry; none of the instructions up to the first
;             TRANSPOSE_STORE modify the flags, so the macro presumably
;             branches on this result - confirm against the macro
; Each interpolated vector = ((sample pair . weight pair) + [pd_16]) >> 5,
; narrowed to words with unsigned saturation (packusdw).
; Writes m0-m4, m6, m7, r5 and the flags; the macro may clobber more.
;-----------------------------------------------------------------------
7794 cglobal ang16_mode_9_27 | |
7795 test r6d, r6d | |
7796 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
7797 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] | |
7798 | |
; interleave each sample with its successor so one pmaddwd blends the pair
7799 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] | |
7800 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] | |
7801 | |
7802 mova m4, m3 | |
7803 pmaddwd m4, [r3 - 14 * 16] ; [2] | |
7804 paddd m4, [pd_16] | |
7805 psrld m4, 5 | |
7806 mova m2, m0 | |
7807 pmaddwd m2, [r3 - 14 * 16] | |
7808 paddd m2, [pd_16] | |
7809 psrld m2, 5 | |
7810 packusdw m4, m2 | |
7811 | |
7812 mova m2, m3 | |
7813 pmaddwd m2, [r3 - 12 * 16] ; [4] | |
7814 paddd m2, [pd_16] | |
7815 psrld m2, 5 | |
7816 mova m1, m0 | |
7817 pmaddwd m1, [r3 - 12 * 16] | |
7818 paddd m1, [pd_16] | |
7819 psrld m1, 5 | |
7820 packusdw m2, m1 | |
7821 | |
7822 mova m6, m3 | |
7823 pmaddwd m6, [r3 - 10 *16] ; [6] | |
7824 paddd m6, [pd_16] | |
7825 psrld m6, 5 | |
7826 mova m1, m0 | |
7827 pmaddwd m1, [r3 - 10 * 16] | |
7828 paddd m1, [pd_16] | |
7829 psrld m1, 5 | |
7830 packusdw m6, m1 | |
7831 | |
7832 mova m7, m3 | |
7833 pmaddwd m7, [r3 - 8 * 16] ; [8] | |
7834 paddd m7, [pd_16] | |
7835 psrld m7, 5 | |
7836 mova m1, m0 | |
7837 pmaddwd m1, [r3 - 8 * 16] | |
7838 paddd m1, [pd_16] | |
7839 psrld m1, 5 | |
7840 packusdw m7, m1 | |
7841 | |
7842 mov r5, r0 | |
7843 | |
7844 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
7845 | |
7846 mova m4, m3 | |
7847 pmaddwd m4, [r3 - 6 * 16] ; [10] | |
7848 paddd m4, [pd_16] | |
7849 psrld m4, 5 | |
7850 mova m1, m0 | |
7851 pmaddwd m1, [r3 - 6 * 16] | |
7852 paddd m1, [pd_16] | |
7853 psrld m1, 5 | |
7854 packusdw m4, m1 | |
7855 | |
7856 mova m2, m3 | |
7857 pmaddwd m2, [r3 - 4 * 16] ; [12] | |
7858 paddd m2, [pd_16] | |
7859 psrld m2, 5 | |
7860 mova m1, m0 | |
7861 pmaddwd m1, [r3 - 4 * 16] | |
7862 paddd m1, [pd_16] | |
7863 psrld m1, 5 | |
7864 packusdw m2, m1 | |
7865 | |
7866 mova m6, m3 | |
7867 pmaddwd m6, [r3 - 2 * 16] ; [14] | |
7868 paddd m6, [pd_16] | |
7869 psrld m6, 5 | |
7870 mova m7, m0 | |
7871 pmaddwd m7, [r3 - 2 * 16] | |
7872 paddd m7, [pd_16] | |
7873 psrld m7, 5 | |
7874 packusdw m6, m7 | |
7875 | |
7876 mova m7, m3 | |
7877 pmaddwd m7, [r3] ; [16] | |
7878 paddd m7, [pd_16] | |
7879 psrld m7, 5 | |
7880 mova m1, m0 | |
7881 pmaddwd m1, [r3] | |
7882 paddd m1, [pd_16] | |
7883 psrld m1, 5 | |
7884 packusdw m7, m1 | |
7885 | |
7886 lea r5, [r0 + r1 * 4] | |
7887 | |
7888 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
7889 | |
7890 mova m4, m3 | |
7891 pmaddwd m4, [r3 + 2 *16] ; [18] | |
7892 paddd m4, [pd_16] | |
7893 psrld m4, 5 | |
7894 mova m6, m0 | |
7895 pmaddwd m6, [r3 + 2 * 16] | |
7896 paddd m6, [pd_16] | |
7897 psrld m6, 5 | |
7898 packusdw m4, m6 | |
7899 | |
7900 mova m2, m3 | |
7901 pmaddwd m2, [r3 + 4 * 16] ; [20] | |
7902 paddd m2, [pd_16] | |
7903 psrld m2, 5 | |
7904 mova m6, m0 | |
7905 pmaddwd m6, [r3 + 4 * 16] | |
7906 paddd m6, [pd_16] | |
7907 psrld m6, 5 | |
7908 packusdw m2, m6 | |
7909 | |
7910 mova m6, m3 | |
7911 pmaddwd m6, [r3 + 6 * 16] ; [22] | |
7912 paddd m6, [pd_16] | |
7913 psrld m6, 5 | |
7914 mova m1, m0 | |
7915 pmaddwd m1, [r3 + 6 * 16] | |
7916 paddd m1, [pd_16] | |
7917 psrld m1, 5 | |
7918 packusdw m6, m1 | |
7919 | |
7920 mova m7, m3 | |
7921 pmaddwd m7, [r3 + 8 * 16] ; [24] | |
7922 paddd m7, [pd_16] | |
7923 psrld m7, 5 | |
7924 mova m1, m0 | |
7925 pmaddwd m1, [r3 + 8 * 16] | |
7926 paddd m1, [pd_16] | |
7927 psrld m1, 5 | |
7928 packusdw m7, m1 | |
7929 | |
7930 lea r5, [r5 + r1 * 4] | |
7931 | |
7932 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
7933 | |
7934 mova m4, m3 | |
7935 pmaddwd m4, [r3 + 10 * 16] ; [26] | |
7936 paddd m4, [pd_16] | |
7937 psrld m4, 5 | |
7938 mova m1, m0 | |
7939 pmaddwd m1, [r3 + 10 * 16] | |
7940 paddd m1, [pd_16] | |
7941 psrld m1, 5 | |
7942 packusdw m4, m1 | |
7943 | |
7944 mova m2, m3 | |
7945 pmaddwd m2, [r3 + 12 * 16] ; [28] | |
7946 paddd m2, [pd_16] | |
7947 psrld m2, 5 | |
7948 mova m1, m0 | |
7949 pmaddwd m1, [r3 + 12 * 16] | |
7950 paddd m1, [pd_16] | |
7951 psrld m1, 5 | |
7952 packusdw m2, m1 | |
7953 | |
; m3/m0 are consumed in place here; the sample pairs are not needed again
7954 pmaddwd m3, [r3 + 14 * 16] ; [30] | |
7955 paddd m3, [pd_16] | |
7956 psrld m3, 5 | |
7957 pmaddwd m0, [r3 + 14 * 16] | |
7958 paddd m0, [pd_16] | |
7959 psrld m0, 5 | |
7960 packusdw m3, m0 | |
7961 | |
; the 16th vector is not interpolated: raw samples 2..9 are passed through
7962 movu m7, [r2 + 4] | |
7963 | |
7964 lea r5, [r5 + r1 * 4] | |
7965 | |
7966 TRANSPOSE_STORE m4, m2, m3, m7, m1, 24 | |
7967 | |
7968 ret | |
7969 | |
;-----------------------------------------------------------------------
; ang16_mode_11_25
; Computes 15 interpolated 8-word vectors plus one vector of raw
; samples and hands them to TRANSPOSE_STORE in four batches (offset
; arguments 0, 8, 16, 24). Every interpolated vector uses the same
; sample pairs (samples 0..8); the weight entry steps down from 30 to 2.
;
; In:   r2  = pointer to 16-bit reference samples; per the load
;             annotations, sample k sits at r2 + 2*k
;       r3  = pointer into a table of 16-byte pmaddwd weight entries;
;             per the bracketed notes [r3] is entry 16 and
;             [r3 + n*16] is entry 16+n (presumably ang_table biased
;             by the caller - TODO confirm)
;       r0, r1 = only used to form r5 = r0 + k*4*r1 before batch k
;             (presumably destination pointer and stride - confirm)
;       r6d = tested on entry; none of the instructions up to the first
;             TRANSPOSE_STORE modify the flags, so the macro presumably
;             branches on this result - confirm against the macro
; Each interpolated vector = ((sample pair . weight pair) + [pd_16]) >> 5,
; narrowed to words with unsigned saturation (packusdw).
; Writes m0-m4, m6, m7, r5 and the flags; the macro may clobber more.
;-----------------------------------------------------------------------
7970 cglobal ang16_mode_11_25 | |
7971 test r6d, r6d | |
7972 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
7973 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
7974 | |
; interleave each sample with its successor so one pmaddwd blends the pair
7975 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
7976 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
7977 | |
7978 mova m4, m3 | |
7979 pmaddwd m4, [r3 + 14 * 16] ; [30] | |
7980 paddd m4, [pd_16] | |
7981 psrld m4, 5 | |
7982 mova m2, m0 | |
7983 pmaddwd m2, [r3 + 14 * 16] | |
7984 paddd m2, [pd_16] | |
7985 psrld m2, 5 | |
7986 packusdw m4, m2 | |
7987 | |
7988 mova m2, m3 | |
7989 pmaddwd m2, [r3 + 12 * 16] ; [28] | |
7990 paddd m2, [pd_16] | |
7991 psrld m2, 5 | |
7992 mova m1, m0 | |
7993 pmaddwd m1, [r3 + 12 * 16] | |
7994 paddd m1, [pd_16] | |
7995 psrld m1, 5 | |
7996 packusdw m2, m1 | |
7997 | |
7998 mova m6, m3 | |
7999 pmaddwd m6, [r3 + 10 *16] ; [26] | |
8000 paddd m6, [pd_16] | |
8001 psrld m6, 5 | |
8002 mova m1, m0 | |
8003 pmaddwd m1, [r3 + 10 * 16] | |
8004 paddd m1, [pd_16] | |
8005 psrld m1, 5 | |
8006 packusdw m6, m1 | |
8007 | |
8008 mova m7, m3 | |
8009 pmaddwd m7, [r3 + 8 * 16] ; [24] | |
8010 paddd m7, [pd_16] | |
8011 psrld m7, 5 | |
8012 mova m1, m0 | |
8013 pmaddwd m1, [r3 + 8 * 16] | |
8014 paddd m1, [pd_16] | |
8015 psrld m1, 5 | |
8016 packusdw m7, m1 | |
8017 | |
8018 mov r5, r0 | |
8019 | |
8020 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
8021 | |
8022 mova m4, m3 | |
8023 pmaddwd m4, [r3 + 6 * 16] ; [22] | |
8024 paddd m4, [pd_16] | |
8025 psrld m4, 5 | |
8026 mova m1, m0 | |
8027 pmaddwd m1, [r3 + 6 * 16] | |
8028 paddd m1, [pd_16] | |
8029 psrld m1, 5 | |
8030 packusdw m4, m1 | |
8031 | |
8032 mova m2, m3 | |
8033 pmaddwd m2, [r3 + 4 * 16] ; [20] | |
8034 paddd m2, [pd_16] | |
8035 psrld m2, 5 | |
8036 mova m1, m0 | |
8037 pmaddwd m1, [r3 + 4 * 16] | |
8038 paddd m1, [pd_16] | |
8039 psrld m1, 5 | |
8040 packusdw m2, m1 | |
8041 | |
8042 mova m6, m3 | |
8043 pmaddwd m6, [r3 + 2 * 16] ; [18] | |
8044 paddd m6, [pd_16] | |
8045 psrld m6, 5 | |
8046 mova m7, m0 | |
8047 pmaddwd m7, [r3 + 2 * 16] | |
8048 paddd m7, [pd_16] | |
8049 psrld m7, 5 | |
8050 packusdw m6, m7 | |
8051 | |
8052 mova m7, m3 | |
8053 pmaddwd m7, [r3] ; [16] | |
8054 paddd m7, [pd_16] | |
8055 psrld m7, 5 | |
8056 mova m1, m0 | |
8057 pmaddwd m1, [r3] | |
8058 paddd m1, [pd_16] | |
8059 psrld m1, 5 | |
8060 packusdw m7, m1 | |
8061 | |
8062 lea r5, [r0 + r1 * 4] | |
8063 | |
8064 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
8065 | |
8066 mova m4, m3 | |
8067 pmaddwd m4, [r3 - 2 *16] ; [14] | |
8068 paddd m4, [pd_16] | |
8069 psrld m4, 5 | |
8070 mova m6, m0 | |
8071 pmaddwd m6, [r3 - 2 * 16] | |
8072 paddd m6, [pd_16] | |
8073 psrld m6, 5 | |
8074 packusdw m4, m6 | |
8075 | |
8076 mova m2, m3 | |
8077 pmaddwd m2, [r3 - 4 * 16] ; [12] | |
8078 paddd m2, [pd_16] | |
8079 psrld m2, 5 | |
8080 mova m6, m0 | |
8081 pmaddwd m6, [r3 - 4 * 16] | |
8082 paddd m6, [pd_16] | |
8083 psrld m6, 5 | |
8084 packusdw m2, m6 | |
8085 | |
8086 mova m6, m3 | |
8087 pmaddwd m6, [r3 - 6 * 16] ; [10] | |
8088 paddd m6, [pd_16] | |
8089 psrld m6, 5 | |
8090 mova m1, m0 | |
8091 pmaddwd m1, [r3 - 6 * 16] | |
8092 paddd m1, [pd_16] | |
8093 psrld m1, 5 | |
8094 packusdw m6, m1 | |
8095 | |
8096 mova m7, m3 | |
8097 pmaddwd m7, [r3 - 8 * 16] ; [8] | |
8098 paddd m7, [pd_16] | |
8099 psrld m7, 5 | |
8100 mova m1, m0 | |
8101 pmaddwd m1, [r3 - 8 * 16] | |
8102 paddd m1, [pd_16] | |
8103 psrld m1, 5 | |
8104 packusdw m7, m1 | |
8105 | |
8106 lea r5, [r5 + r1 * 4] | |
8107 | |
8108 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
8109 | |
8110 mova m4, m3 | |
8111 pmaddwd m4, [r3 - 10 * 16] ; [6] | |
8112 paddd m4, [pd_16] | |
8113 psrld m4, 5 | |
8114 mova m1, m0 | |
8115 pmaddwd m1, [r3 - 10 * 16] | |
8116 paddd m1, [pd_16] | |
8117 psrld m1, 5 | |
8118 packusdw m4, m1 | |
8119 | |
8120 mova m2, m3 | |
8121 pmaddwd m2, [r3 - 12 * 16] ; [4] | |
8122 paddd m2, [pd_16] | |
8123 psrld m2, 5 | |
8124 mova m1, m0 | |
8125 pmaddwd m1, [r3 - 12 * 16] | |
8126 paddd m1, [pd_16] | |
8127 psrld m1, 5 | |
8128 packusdw m2, m1 | |
8129 | |
8130 mova m7, m3 | |
8131 pmaddwd m7, [r3 - 14 * 16] ; [2] | |
8132 paddd m7, [pd_16] | |
8133 psrld m7, 5 | |
8134 mova m1, m0 | |
8135 pmaddwd m1, [r3 - 14 * 16] | |
8136 paddd m1, [pd_16] | |
8137 psrld m1, 5 | |
8138 packusdw m7, m1 | |
8139 | |
; the 16th vector is not interpolated: raw samples 0..7 are passed through
; (m3's interleaved pairs are no longer needed, so it is overwritten)
8140 movu m3, [r2] | |
8141 | |
8142 lea r5, [r5 + r1 * 4] | |
8143 | |
8144 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 | |
8145 | |
8146 ret | |
8147 | |
; ang16_mode_12_24
; Internal helper: cglobal is given no argument/register counts, and the block
; reads r0, r1, r2, r3d, r6 and m5 without initialising them, so the caller
; must set them up before entry.
;   r2     - address of the 16-bit samples loaded into m0/m1
;   r6     - base of the 16-byte weight rows fed to pmaddwd; the [n]
;            annotations imply [r6] is row 16 (presumably ang_table --
;            verify against the caller)
;   m5     - extra samples pulled into the window by palignr
;   r0, r1 - only used to form r5 = r0, r0 + r1*4, r5 + r1*4, ... before each
;            TRANSPOSE_STORE (presumably destination and stride -- confirm)
;   r3d    - tested on entry; nothing in this block consumes the flags, so
;            presumably TRANSPOSE_STORE (defined elsewhere) branches on them
;            -- TODO confirm. mov/lea and the SIMD ops used here leave EFLAGS
;            untouched.
; Every result vector is (pmaddwd(pairs, row) + pd_16) >> 5, packed to words
; with unsigned saturation; four vectors are handed to each TRANSPOSE_STORE.
8148 cglobal ang16_mode_12_24 | |
8149 test r3d, r3d | |
8150 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
8151 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
8152 | |
; Interleave neighbouring samples so each dword holds one (s[i], s[i+1]) pair.
8153 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
8154 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
8155 | |
8156 mova m4, m3 | |
8157 pmaddwd m4, [r6 + 11 * 16] ; [27] | |
8158 paddd m4, [pd_16] | |
8159 psrld m4, 5 | |
8160 mova m2, m0 | |
8161 pmaddwd m2, [r6 + 11 * 16] | |
8162 paddd m2, [pd_16] | |
8163 psrld m2, 5 | |
8164 packusdw m4, m2 | |
8165 | |
8166 mova m2, m3 | |
8167 pmaddwd m2, [r6 + 6 * 16] ; [22] | |
8168 paddd m2, [pd_16] | |
8169 psrld m2, 5 | |
8170 mova m1, m0 | |
8171 pmaddwd m1, [r6 + 6 * 16] | |
8172 paddd m1, [pd_16] | |
8173 psrld m1, 5 | |
8174 packusdw m2, m1 | |
8175 | |
8176 mova m6, m3 | |
8177 pmaddwd m6, [r6 + 1 *16] ; [17] | |
8178 paddd m6, [pd_16] | |
8179 psrld m6, 5 | |
8180 mova m1, m0 | |
8181 pmaddwd m1, [r6 + 1 * 16] | |
8182 paddd m1, [pd_16] | |
8183 psrld m1, 5 | |
8184 packusdw m6, m1 | |
8185 | |
8186 mova m7, m3 | |
8187 pmaddwd m7, [r6 - 4 * 16] ; [12] | |
8188 paddd m7, [pd_16] | |
8189 psrld m7, 5 | |
8190 mova m1, m0 | |
8191 pmaddwd m1, [r6 - 4 * 16] | |
8192 paddd m1, [pd_16] | |
8193 psrld m1, 5 | |
8194 packusdw m7, m1 | |
8195 | |
8196 mov r5, r0 | |
8197 | |
8198 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
8199 | |
8200 mova m4, m3 | |
8201 pmaddwd m4, [r6 - 9 * 16] ; [7] | |
8202 paddd m4, [pd_16] | |
8203 psrld m4, 5 | |
8204 mova m1, m0 | |
8205 pmaddwd m1, [r6 - 9 * 16] | |
8206 paddd m1, [pd_16] | |
8207 psrld m1, 5 | |
8208 packusdw m4, m1 | |
8209 | |
8210 mova m2, m3 | |
8211 pmaddwd m2, [r6 - 14 * 16] ; [2] | |
8212 paddd m2, [pd_16] | |
8213 psrld m2, 5 | |
8214 mova m1, m0 | |
8215 pmaddwd m1, [r6 - 14 * 16] | |
8216 paddd m1, [pd_16] | |
8217 psrld m1, 5 | |
8218 packusdw m2, m1 | |
8219 | |
; Slide the pair window by one dword: m0 receives the top dword of m3 in its
; low lane, and m3 receives the top dword of m5.
8220 palignr m0, m3, 12 | |
8221 palignr m3, m5, 12 | |
8222 | |
8223 mova m6, m3 | |
8224 pmaddwd m6, [r6 + 13 * 16] ; [29] | |
8225 paddd m6, [pd_16] | |
8226 psrld m6, 5 | |
8227 mova m7, m0 | |
8228 pmaddwd m7, [r6 + 13 * 16] | |
8229 paddd m7, [pd_16] | |
8230 psrld m7, 5 | |
8231 packusdw m6, m7 | |
8232 | |
8233 mova m7, m3 | |
8234 pmaddwd m7, [r6 + 8 * 16] ; [24] | |
8235 paddd m7, [pd_16] | |
8236 psrld m7, 5 | |
8237 mova m1, m0 | |
8238 pmaddwd m1, [r6 + 8 * 16] | |
8239 paddd m1, [pd_16] | |
8240 psrld m1, 5 | |
8241 packusdw m7, m1 | |
8242 | |
8243 lea r5, [r0 + r1 * 4] | |
8244 | |
8245 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
8246 | |
8247 mova m4, m3 | |
8248 pmaddwd m4, [r6 + 3 *16] ; [19] | |
8249 paddd m4, [pd_16] | |
8250 psrld m4, 5 | |
8251 mova m6, m0 | |
8252 pmaddwd m6, [r6 + 3 * 16] | |
8253 paddd m6, [pd_16] | |
8254 psrld m6, 5 | |
8255 packusdw m4, m6 | |
8256 | |
8257 mova m2, m3 | |
8258 pmaddwd m2, [r6 - 2 * 16] ; [14] | |
8259 paddd m2, [pd_16] | |
8260 psrld m2, 5 | |
8261 mova m6, m0 | |
8262 pmaddwd m6, [r6 - 2 * 16] | |
8263 paddd m6, [pd_16] | |
8264 psrld m6, 5 | |
8265 packusdw m2, m6 | |
8266 | |
8267 mova m6, m3 | |
8268 pmaddwd m6, [r6 - 7 * 16] ; [9] | |
8269 paddd m6, [pd_16] | |
8270 psrld m6, 5 | |
8271 mova m1, m0 | |
8272 pmaddwd m1, [r6 - 7 * 16] | |
8273 paddd m1, [pd_16] | |
8274 psrld m1, 5 | |
8275 packusdw m6, m1 | |
8276 | |
8277 mova m7, m3 | |
8278 pmaddwd m7, [r6 - 12 * 16] ; [4] | |
8279 paddd m7, [pd_16] | |
8280 psrld m7, 5 | |
8281 mova m1, m0 | |
8282 pmaddwd m1, [r6 - 12 * 16] | |
8283 paddd m1, [pd_16] | |
8284 psrld m1, 5 | |
8285 packusdw m7, m1 | |
8286 | |
8287 lea r5, [r5 + r1 * 4] | |
8288 | |
8289 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
8290 | |
; Shift m5 up by one word first so the slide below pulls in a new pair rather
; than the one already consumed.
8291 pslldq m5, 2 | |
8292 palignr m0, m3, 12 | |
8293 palignr m3, m5, 12 | |
8294 | |
8295 mova m4, m3 | |
8296 pmaddwd m4, [r6 + 15 * 16] ; [31] | |
8297 paddd m4, [pd_16] | |
8298 psrld m4, 5 | |
8299 mova m1, m0 | |
8300 pmaddwd m1, [r6 + 15 * 16] | |
8301 paddd m1, [pd_16] | |
8302 psrld m1, 5 | |
8303 packusdw m4, m1 | |
8304 | |
8305 mova m2, m3 | |
8306 pmaddwd m2, [r6 + 10 * 16] ; [26] | |
8307 paddd m2, [pd_16] | |
8308 psrld m2, 5 | |
8309 mova m1, m0 | |
8310 pmaddwd m1, [r6 + 10 * 16] | |
8311 paddd m1, [pd_16] | |
8312 psrld m1, 5 | |
8313 packusdw m2, m1 | |
8314 | |
8315 mova m7, m3 | |
8316 pmaddwd m7, [r6 + 5 * 16] ; [21] | |
8317 paddd m7, [pd_16] | |
8318 psrld m7, 5 | |
8319 mova m1, m0 | |
8320 pmaddwd m1, [r6 + 5 * 16] | |
8321 paddd m1, [pd_16] | |
8322 psrld m1, 5 | |
8323 packusdw m7, m1 | |
8324 | |
; Last row: m3/m0 are not needed afterwards, so they are multiplied in place
; instead of being copied first.
8325 pmaddwd m3, [r6] ; [16] | |
8326 paddd m3, [pd_16] | |
8327 psrld m3, 5 | |
8328 pmaddwd m0, [r6] | |
8329 paddd m0, [pd_16] | |
8330 psrld m0, 5 | |
8331 packusdw m3, m0 | |
8332 | |
8333 lea r5, [r5 + r1 * 4] | |
8334 | |
8335 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 | |
8336 | |
8337 ret | |
8338 | |
; ang16_mode_13_23
; Internal helper: cglobal is given no argument/register counts, and the block
; reads r0, r1, r2, r3d, r6 and m5 without initialising them, so the caller
; must set them up before entry.
;   r2     - address of the 16-bit samples loaded into m0/m1
;   r6     - base of the 16-byte weight rows fed to pmaddwd; the [n]
;            annotations imply [r6] is row 15 (presumably ang_table --
;            verify against the caller)
;   m5     - extra samples pulled into the window by palignr
;   r0, r1 - only used to form r5 = r0, r0 + r1*4, r5 + r1*4, ... before each
;            TRANSPOSE_STORE (presumably destination and stride -- confirm)
;   r3d    - tested on entry; nothing in this block consumes the flags, so
;            presumably TRANSPOSE_STORE (defined elsewhere) branches on them
;            -- TODO confirm
; Every result vector is (pmaddwd(pairs, row) + pd_16) >> 5, packed to words
; with unsigned saturation; four vectors are handed to each TRANSPOSE_STORE.
8339 cglobal ang16_mode_13_23 | |
8340 test r3d, r3d | |
8341 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
8342 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
8343 | |
; Interleave neighbouring samples so each dword holds one (s[i], s[i+1]) pair.
8344 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
8345 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
8346 | |
8347 mova m4, m3 | |
8348 pmaddwd m4, [r6 + 8 * 16] ; [23] | |
8349 paddd m4, [pd_16] | |
8350 psrld m4, 5 | |
8351 mova m2, m0 | |
8352 pmaddwd m2, [r6 + 8 * 16] | |
8353 paddd m2, [pd_16] | |
8354 psrld m2, 5 | |
8355 packusdw m4, m2 | |
8356 | |
8357 mova m2, m3 | |
8358 pmaddwd m2, [r6 - 1 * 16] ; [14] | |
8359 paddd m2, [pd_16] | |
8360 psrld m2, 5 | |
8361 mova m1, m0 | |
8362 pmaddwd m1, [r6 - 1 * 16] | |
8363 paddd m1, [pd_16] | |
8364 psrld m1, 5 | |
8365 packusdw m2, m1 | |
8366 | |
8367 mova m6, m3 | |
8368 pmaddwd m6, [r6 - 10 *16] ; [5] | |
8369 paddd m6, [pd_16] | |
8370 psrld m6, 5 | |
8371 mova m1, m0 | |
8372 pmaddwd m1, [r6 - 10 * 16] | |
8373 paddd m1, [pd_16] | |
8374 psrld m1, 5 | |
8375 packusdw m6, m1 | |
8376 | |
; Slide the pair window by one dword: m0 receives the top dword of m3 in its
; low lane, and m3 receives the top dword of m5.
8377 palignr m0, m3, 12 | |
8378 palignr m3, m5, 12 | |
8379 | |
8380 mova m7, m3 | |
8381 pmaddwd m7, [r6 + 13 * 16] ; [28] | |
8382 paddd m7, [pd_16] | |
8383 psrld m7, 5 | |
8384 mova m1, m0 | |
8385 pmaddwd m1, [r6 + 13 * 16] | |
8386 paddd m1, [pd_16] | |
8387 psrld m1, 5 | |
8388 packusdw m7, m1 | |
8389 | |
8390 mov r5, r0 | |
8391 | |
8392 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
8393 | |
8394 mova m4, m3 | |
8395 pmaddwd m4, [r6 + 4 * 16] ; [19] | |
8396 paddd m4, [pd_16] | |
8397 psrld m4, 5 | |
8398 mova m1, m0 | |
8399 pmaddwd m1, [r6 + 4 * 16] | |
8400 paddd m1, [pd_16] | |
8401 psrld m1, 5 | |
8402 packusdw m4, m1 | |
8403 | |
8404 mova m2, m3 | |
8405 pmaddwd m2, [r6 - 5 * 16] ; [10] | |
8406 paddd m2, [pd_16] | |
8407 psrld m2, 5 | |
8408 mova m1, m0 | |
8409 pmaddwd m1, [r6 - 5 * 16] | |
8410 paddd m1, [pd_16] | |
8411 psrld m1, 5 | |
8412 packusdw m2, m1 | |
8413 | |
8414 mova m6, m3 | |
8415 pmaddwd m6, [r6 - 14 * 16] ; [1] | |
8416 paddd m6, [pd_16] | |
8417 psrld m6, 5 | |
8418 mova m7, m0 | |
8419 pmaddwd m7, [r6 - 14 * 16] | |
8420 paddd m7, [pd_16] | |
8421 psrld m7, 5 | |
8422 packusdw m6, m7 | |
8423 | |
; Shift m5 up by one word first so the slide below pulls in a new pair rather
; than the one already consumed.
8424 pslldq m5, 2 | |
8425 palignr m0, m3, 12 | |
8426 palignr m3, m5, 12 | |
8427 | |
8428 mova m7, m3 | |
8429 pmaddwd m7, [r6 + 9 * 16] ; [24] | |
8430 paddd m7, [pd_16] | |
8431 psrld m7, 5 | |
8432 mova m1, m0 | |
8433 pmaddwd m1, [r6 + 9 * 16] | |
8434 paddd m1, [pd_16] | |
8435 psrld m1, 5 | |
8436 packusdw m7, m1 | |
8437 | |
8438 lea r5, [r0 + r1 * 4] | |
8439 | |
8440 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
8441 | |
8442 mova m4, m3 | |
8443 pmaddwd m4, [r6] ; [15] | |
8444 paddd m4, [pd_16] | |
8445 psrld m4, 5 | |
8446 mova m6, m0 | |
8447 pmaddwd m6, [r6] | |
8448 paddd m6, [pd_16] | |
8449 psrld m6, 5 | |
8450 packusdw m4, m6 | |
8451 | |
8452 mova m2, m3 | |
8453 pmaddwd m2, [r6 - 9 * 16] ; [6] | |
8454 paddd m2, [pd_16] | |
8455 psrld m2, 5 | |
8456 mova m6, m0 | |
8457 pmaddwd m6, [r6 - 9 * 16] | |
8458 paddd m6, [pd_16] | |
8459 psrld m6, 5 | |
8460 packusdw m2, m6 | |
8461 | |
8462 pslldq m5, 2 | |
8463 palignr m0, m3, 12 | |
8464 palignr m3, m5, 12 | |
8465 | |
8466 mova m6, m3 | |
8467 pmaddwd m6, [r6 + 14 * 16] ; [29] | |
8468 paddd m6, [pd_16] | |
8469 psrld m6, 5 | |
8470 mova m1, m0 | |
8471 pmaddwd m1, [r6 + 14 * 16] | |
8472 paddd m1, [pd_16] | |
8473 psrld m1, 5 | |
8474 packusdw m6, m1 | |
8475 | |
8476 mova m7, m3 | |
8477 pmaddwd m7, [r6 + 5 * 16] ; [20] | |
8478 paddd m7, [pd_16] | |
8479 psrld m7, 5 | |
8480 mova m1, m0 | |
8481 pmaddwd m1, [r6 + 5 * 16] | |
8482 paddd m1, [pd_16] | |
8483 psrld m1, 5 | |
8484 packusdw m7, m1 | |
8485 | |
8486 lea r5, [r5 + r1 * 4] | |
8487 | |
8488 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
8489 | |
8490 mova m4, m3 | |
8491 pmaddwd m4, [r6 - 4 * 16] ; [11] | |
8492 paddd m4, [pd_16] | |
8493 psrld m4, 5 | |
8494 mova m1, m0 | |
8495 pmaddwd m1, [r6 - 4 * 16] | |
8496 paddd m1, [pd_16] | |
8497 psrld m1, 5 | |
8498 packusdw m4, m1 | |
8499 | |
8500 mova m2, m3 | |
8501 pmaddwd m2, [r6 - 13 * 16] ; [2] | |
8502 paddd m2, [pd_16] | |
8503 psrld m2, 5 | |
8504 mova m1, m0 | |
8505 pmaddwd m1, [r6 - 13 * 16] | |
8506 paddd m1, [pd_16] | |
8507 psrld m1, 5 | |
8508 packusdw m2, m1 | |
8509 | |
8510 pslldq m5, 2 | |
8511 palignr m0, m3, 12 | |
8512 palignr m3, m5, 12 | |
8513 | |
8514 mova m7, m3 | |
8515 pmaddwd m7, [r6 + 10 * 16] ; [25] | |
8516 paddd m7, [pd_16] | |
8517 psrld m7, 5 | |
8518 mova m1, m0 | |
8519 pmaddwd m1, [r6 + 10 * 16] | |
8520 paddd m1, [pd_16] | |
8521 psrld m1, 5 | |
8522 packusdw m7, m1 | |
8523 | |
; Last row: m3/m0 are not needed afterwards, so they are multiplied in place
; instead of being copied first.
8524 pmaddwd m3, [r6 + 1 * 16] ; [16] | |
8525 paddd m3, [pd_16] | |
8526 psrld m3, 5 | |
8527 pmaddwd m0, [r6 + 1 *16] | |
8528 paddd m0, [pd_16] | |
8529 psrld m0, 5 | |
8530 packusdw m3, m0 | |
8531 | |
8532 lea r5, [r5 + r1 * 4] | |
8533 | |
8534 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 | |
8535 | |
8536 ret | |
8537 | |
; ang16_mode_14_22
; Internal helper: cglobal is given no argument/register counts, and the block
; reads r0, r1, r2, r3d, r6 and m5 without initialising them, so the caller
; must set them up before entry.
;   r2     - address of the 16-bit samples loaded into m0/m1
;   r6     - base of the 16-byte weight rows fed to pmaddwd; the [n]
;            annotations imply [r6] is row 18 (presumably ang_table --
;            verify against the caller)
;   m5     - extra samples pulled into the window by palignr
;   r0, r1 - only used to form r5 = r0, r0 + r1*4, r5 + r1*4, ... before each
;            TRANSPOSE_STORE (presumably destination and stride -- confirm)
;   r3d    - tested on entry; nothing in this block consumes the flags, so
;            presumably TRANSPOSE_STORE (defined elsewhere) branches on them
;            -- TODO confirm
; Every result vector is (pmaddwd(pairs, row) + pd_16) >> 5, packed to words
; with unsigned saturation; four vectors are handed to each TRANSPOSE_STORE.
8538 cglobal ang16_mode_14_22 | |
8539 test r3d, r3d | |
8540 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
8541 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
8542 | |
; Interleave neighbouring samples so each dword holds one (s[i], s[i+1]) pair.
8543 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
8544 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
8545 | |
8546 mova m4, m3 | |
8547 pmaddwd m4, [r6 + 1 * 16] ; [19] | |
8548 paddd m4, [pd_16] | |
8549 psrld m4, 5 | |
8550 mova m2, m0 | |
8551 pmaddwd m2, [r6 + 1 * 16] | |
8552 paddd m2, [pd_16] | |
8553 psrld m2, 5 | |
8554 packusdw m4, m2 | |
8555 | |
8556 mova m2, m3 | |
8557 pmaddwd m2, [r6 - 12 * 16] ; [6] | |
8558 paddd m2, [pd_16] | |
8559 psrld m2, 5 | |
8560 mova m1, m0 | |
8561 pmaddwd m1, [r6 - 12 * 16] | |
8562 paddd m1, [pd_16] | |
8563 psrld m1, 5 | |
8564 packusdw m2, m1 | |
8565 | |
; Slide the pair window by one dword: m0 receives the top dword of m3 in its
; low lane, and m3 receives the top dword of m5.
8566 palignr m0, m3, 12 | |
8567 palignr m3, m5, 12 | |
8568 | |
8569 mova m6, m3 | |
8570 pmaddwd m6, [r6 + 7 * 16] ; [25] | |
8571 paddd m6, [pd_16] | |
8572 psrld m6, 5 | |
8573 mova m1, m0 | |
8574 pmaddwd m1, [r6 + 7 * 16] | |
8575 paddd m1, [pd_16] | |
8576 psrld m1, 5 | |
8577 packusdw m6, m1 | |
8578 | |
8579 mova m7, m3 | |
8580 pmaddwd m7, [r6 - 6 * 16] ; [12] | |
8581 paddd m7, [pd_16] | |
8582 psrld m7, 5 | |
8583 mova m1, m0 | |
8584 pmaddwd m1, [r6 - 6 * 16] | |
8585 paddd m1, [pd_16] | |
8586 psrld m1, 5 | |
8587 packusdw m7, m1 | |
8588 | |
8589 mov r5, r0 | |
8590 | |
8591 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
8592 | |
; Shift m5 up by one word first so the slide below pulls in a new pair rather
; than the one already consumed.
8593 pslldq m5, 2 | |
8594 palignr m0, m3, 12 | |
8595 palignr m3, m5, 12 | |
8596 | |
8597 mova m4, m3 | |
8598 pmaddwd m4, [r6 + 13 * 16] ; [31] | |
8599 paddd m4, [pd_16] | |
8600 psrld m4, 5 | |
8601 mova m1, m0 | |
8602 pmaddwd m1, [r6 + 13 * 16] | |
8603 paddd m1, [pd_16] | |
8604 psrld m1, 5 | |
8605 packusdw m4, m1 | |
8606 | |
8607 mova m2, m3 | |
8608 pmaddwd m2, [r6] ; [18] | |
8609 paddd m2, [pd_16] | |
8610 psrld m2, 5 | |
8611 mova m1, m0 | |
8612 pmaddwd m1, [r6] | |
8613 paddd m1, [pd_16] | |
8614 psrld m1, 5 | |
8615 packusdw m2, m1 | |
8616 | |
8617 mova m6, m3 | |
8618 pmaddwd m6, [r6 - 13 * 16] ; [5] | |
8619 paddd m6, [pd_16] | |
8620 psrld m6, 5 | |
8621 mova m7, m0 | |
8622 pmaddwd m7, [r6 - 13 * 16] | |
8623 paddd m7, [pd_16] | |
8624 psrld m7, 5 | |
8625 packusdw m6, m7 | |
8626 | |
8627 pslldq m5, 2 | |
8628 palignr m0, m3, 12 | |
8629 palignr m3, m5, 12 | |
8630 | |
8631 mova m7, m3 | |
8632 pmaddwd m7, [r6 + 6 * 16] ; [24] | |
8633 paddd m7, [pd_16] | |
8634 psrld m7, 5 | |
8635 mova m1, m0 | |
8636 pmaddwd m1, [r6 + 6 * 16] | |
8637 paddd m1, [pd_16] | |
8638 psrld m1, 5 | |
8639 packusdw m7, m1 | |
8640 | |
8641 lea r5, [r0 + r1 * 4] | |
8642 | |
8643 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
8644 | |
8645 mova m4, m3 | |
8646 pmaddwd m4, [r6 - 7 * 16] ; [11] | |
8647 paddd m4, [pd_16] | |
8648 psrld m4, 5 | |
8649 mova m6, m0 | |
8650 pmaddwd m6, [r6 - 7 * 16] | |
8651 paddd m6, [pd_16] | |
8652 psrld m6, 5 | |
8653 packusdw m4, m6 | |
8654 | |
8655 pslldq m5, 2 | |
8656 palignr m0, m3, 12 | |
8657 palignr m3, m5, 12 | |
8658 | |
8659 mova m2, m3 | |
8660 pmaddwd m2, [r6 + 12 * 16] ; [30] | |
8661 paddd m2, [pd_16] | |
8662 psrld m2, 5 | |
8663 mova m6, m0 | |
8664 pmaddwd m6, [r6 + 12 * 16] | |
8665 paddd m6, [pd_16] | |
8666 psrld m6, 5 | |
8667 packusdw m2, m6 | |
8668 | |
8669 mova m6, m3 | |
8670 pmaddwd m6, [r6 - 1 * 16] ; [17] | |
8671 paddd m6, [pd_16] | |
8672 psrld m6, 5 | |
8673 mova m1, m0 | |
8674 pmaddwd m1, [r6 - 1 * 16] | |
8675 paddd m1, [pd_16] | |
8676 psrld m1, 5 | |
8677 packusdw m6, m1 | |
8678 | |
8679 mova m7, m3 | |
8680 pmaddwd m7, [r6 - 14 * 16] ; [4] | |
8681 paddd m7, [pd_16] | |
8682 psrld m7, 5 | |
8683 mova m1, m0 | |
8684 pmaddwd m1, [r6 - 14 * 16] | |
8685 paddd m1, [pd_16] | |
8686 psrld m1, 5 | |
8687 packusdw m7, m1 | |
8688 | |
8689 lea r5, [r5 + r1 * 4] | |
8690 | |
8691 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
8692 | |
8693 pslldq m5, 2 | |
8694 palignr m0, m3, 12 | |
8695 palignr m3, m5, 12 | |
8696 | |
8697 mova m4, m3 | |
8698 pmaddwd m4, [r6 + 5 * 16] ; [23] | |
8699 paddd m4, [pd_16] | |
8700 psrld m4, 5 | |
8701 mova m1, m0 | |
8702 pmaddwd m1, [r6 + 5 * 16] | |
8703 paddd m1, [pd_16] | |
8704 psrld m1, 5 | |
8705 packusdw m4, m1 | |
8706 | |
8707 mova m2, m3 | |
8708 pmaddwd m2, [r6 - 8 * 16] ; [10] | |
8709 paddd m2, [pd_16] | |
8710 psrld m2, 5 | |
8711 mova m1, m0 | |
8712 pmaddwd m1, [r6 - 8 * 16] | |
8713 paddd m1, [pd_16] | |
8714 psrld m1, 5 | |
8715 packusdw m2, m1 | |
8716 | |
8717 pslldq m5, 2 | |
8718 palignr m0, m3, 12 | |
8719 palignr m3, m5, 12 | |
8720 | |
8721 mova m7, m3 | |
8722 pmaddwd m7, [r6 + 11 * 16] ; [29] | |
8723 paddd m7, [pd_16] | |
8724 psrld m7, 5 | |
8725 mova m1, m0 | |
8726 pmaddwd m1, [r6 + 11 * 16] | |
8727 paddd m1, [pd_16] | |
8728 psrld m1, 5 | |
8729 packusdw m7, m1 | |
8730 | |
; Last row: m3/m0 are not needed afterwards, so they are multiplied in place
; instead of being copied first.
8731 pmaddwd m3, [r6 - 2 * 16] ; [16] | |
8732 paddd m3, [pd_16] | |
8733 psrld m3, 5 | |
8734 pmaddwd m0, [r6 - 2 *16] | |
8735 paddd m0, [pd_16] | |
8736 psrld m0, 5 | |
8737 packusdw m3, m0 | |
8738 | |
8739 lea r5, [r5 + r1 * 4] | |
8740 | |
8741 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 | |
8742 | |
8743 ret | |
8744 | |
; ang16_mode_15_21
; Internal helper: cglobal is given no argument/register counts, and the block
; reads r0, r1, r2, r3d, r6 and m5 without initialising them, so the caller
; must set them up before entry.
;   r2     - address of the 16-bit samples loaded into m0/m1
;   r6     - base of the 16-byte weight rows fed to pmaddwd; the [n]
;            annotations imply [r6] is row 15 (presumably ang_table --
;            verify against the caller)
;   m5     - extra samples pulled into the window by palignr
;   r0, r1 - only used to form r5 = r0, r0 + r1*4, r5 + r1*4, ... before each
;            TRANSPOSE_STORE (presumably destination and stride -- confirm)
;   r3d    - tested on entry; nothing in this block consumes the flags, so
;            presumably TRANSPOSE_STORE (defined elsewhere) branches on them
;            -- TODO confirm
; Every result vector is (pmaddwd(pairs, row) + pd_16) >> 5, packed to words
; with unsigned saturation; four vectors are handed to each TRANSPOSE_STORE.
8745 cglobal ang16_mode_15_21 | |
8746 test r3d, r3d | |
8747 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
8748 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
8749 | |
; m6 = (m0:m5) >> 2 bytes, so its top dword pairs the highest word of m5 with
; the lowest word of m0; it supplies the first pair shifted into the window.
8750 palignr m6, m0, m5, 2 | |
8751 | |
; Interleave neighbouring samples so each dword holds one (s[i], s[i+1]) pair.
8752 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
8753 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
8754 | |
8755 mova m4, m3 | |
8756 pmaddwd m4, [r6] ; [15] | |
8757 paddd m4, [pd_16] | |
8758 psrld m4, 5 | |
8759 mova m2, m0 | |
8760 pmaddwd m2, [r6] | |
8761 paddd m2, [pd_16] | |
8762 psrld m2, 5 | |
8763 packusdw m4, m2 | |
8764 | |
; Slide the pair window by one dword: m0 receives the top dword of m3 in its
; low lane, and m3 receives the top dword of m6 (the bridging pair built above).
8765 palignr m0, m3, 12 | |
8766 palignr m3, m6, 12 | |
8767 | |
8768 mova m2, m3 | |
8769 pmaddwd m2, [r6 + 15 * 16] ; [30] | |
8770 paddd m2, [pd_16] | |
8771 psrld m2, 5 | |
8772 mova m1, m0 | |
8773 pmaddwd m1, [r6 + 15 * 16] | |
8774 paddd m1, [pd_16] | |
8775 psrld m1, 5 | |
8776 packusdw m2, m1 | |
8777 | |
8778 mova m6, m3 | |
8779 pmaddwd m6, [r6 - 2 * 16] ; [13] | |
8780 paddd m6, [pd_16] | |
8781 psrld m6, 5 | |
8782 mova m1, m0 | |
8783 pmaddwd m1, [r6 - 2 * 16] | |
8784 paddd m1, [pd_16] | |
8785 psrld m1, 5 | |
8786 packusdw m6, m1 | |
8787 | |
; From here on the new pair comes straight from the top dword of m5.
8788 palignr m0, m3, 12 | |
8789 palignr m3, m5, 12 | |
8790 | |
8791 mova m7, m3 | |
8792 pmaddwd m7, [r6 + 13 * 16] ; [28] | |
8793 paddd m7, [pd_16] | |
8794 psrld m7, 5 | |
8795 mova m1, m0 | |
8796 pmaddwd m1, [r6 + 13 * 16] | |
8797 paddd m1, [pd_16] | |
8798 psrld m1, 5 | |
8799 packusdw m7, m1 | |
8800 | |
8801 mov r5, r0 | |
8802 | |
8803 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
8804 | |
8805 mova m4, m3 | |
8806 pmaddwd m4, [r6 - 4 * 16] ; [11] | |
8807 paddd m4, [pd_16] | |
8808 psrld m4, 5 | |
8809 mova m1, m0 | |
8810 pmaddwd m1, [r6 - 4 * 16] | |
8811 paddd m1, [pd_16] | |
8812 psrld m1, 5 | |
8813 packusdw m4, m1 | |
8814 | |
; Shift m5 up by one word first so the slide below pulls in a new pair rather
; than the one already consumed.
8815 pslldq m5, 2 | |
8816 palignr m0, m3, 12 | |
8817 palignr m3, m5, 12 | |
8818 | |
8819 mova m2, m3 | |
8820 pmaddwd m2, [r6 + 11 * 16] ; [26] | |
8821 paddd m2, [pd_16] | |
8822 psrld m2, 5 | |
8823 mova m1, m0 | |
8824 pmaddwd m1, [r6 + 11 * 16] | |
8825 paddd m1, [pd_16] | |
8826 psrld m1, 5 | |
8827 packusdw m2, m1 | |
8828 | |
8829 mova m6, m3 | |
8830 pmaddwd m6, [r6 - 6 * 16] ; [9] | |
8831 paddd m6, [pd_16] | |
8832 psrld m6, 5 | |
8833 mova m7, m0 | |
8834 pmaddwd m7, [r6 - 6 * 16] | |
8835 paddd m7, [pd_16] | |
8836 psrld m7, 5 | |
8837 packusdw m6, m7 | |
8838 | |
8839 pslldq m5, 2 | |
8840 palignr m0, m3, 12 | |
8841 palignr m3, m5, 12 | |
8842 | |
8843 mova m7, m3 | |
8844 pmaddwd m7, [r6 + 9 * 16] ; [24] | |
8845 paddd m7, [pd_16] | |
8846 psrld m7, 5 | |
8847 mova m1, m0 | |
8848 pmaddwd m1, [r6 + 9 * 16] | |
8849 paddd m1, [pd_16] | |
8850 psrld m1, 5 | |
8851 packusdw m7, m1 | |
8852 | |
8853 lea r5, [r0 + r1 * 4] | |
8854 | |
8855 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
8856 | |
8857 mova m4, m3 | |
8858 pmaddwd m4, [r6 - 8 * 16] ; [7] | |
8859 paddd m4, [pd_16] | |
8860 psrld m4, 5 | |
8861 mova m6, m0 | |
8862 pmaddwd m6, [r6 - 8 * 16] | |
8863 paddd m6, [pd_16] | |
8864 psrld m6, 5 | |
8865 packusdw m4, m6 | |
8866 | |
8867 pslldq m5, 2 | |
8868 palignr m0, m3, 12 | |
8869 palignr m3, m5, 12 | |
8870 | |
8871 mova m2, m3 | |
8872 pmaddwd m2, [r6 + 7 * 16] ; [22] | |
8873 paddd m2, [pd_16] | |
8874 psrld m2, 5 | |
8875 mova m6, m0 | |
8876 pmaddwd m6, [r6 + 7 * 16] | |
8877 paddd m6, [pd_16] | |
8878 psrld m6, 5 | |
8879 packusdw m2, m6 | |
8880 | |
8881 mova m6, m3 | |
8882 pmaddwd m6, [r6 - 10 * 16] ; [5] | |
8883 paddd m6, [pd_16] | |
8884 psrld m6, 5 | |
8885 mova m1, m0 | |
8886 pmaddwd m1, [r6 - 10 * 16] | |
8887 paddd m1, [pd_16] | |
8888 psrld m1, 5 | |
8889 packusdw m6, m1 | |
8890 | |
8891 pslldq m5, 2 | |
8892 palignr m0, m3, 12 | |
8893 palignr m3, m5, 12 | |
8894 | |
8895 mova m7, m3 | |
8896 pmaddwd m7, [r6 + 5 * 16] ; [20] | |
8897 paddd m7, [pd_16] | |
8898 psrld m7, 5 | |
8899 mova m1, m0 | |
8900 pmaddwd m1, [r6 + 5 * 16] | |
8901 paddd m1, [pd_16] | |
8902 psrld m1, 5 | |
8903 packusdw m7, m1 | |
8904 | |
8905 lea r5, [r5 + r1 * 4] | |
8906 | |
8907 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
8908 | |
8909 mova m4, m3 | |
8910 pmaddwd m4, [r6 - 12 * 16] ; [3] | |
8911 paddd m4, [pd_16] | |
8912 psrld m4, 5 | |
8913 mova m1, m0 | |
8914 pmaddwd m1, [r6 - 12 * 16] | |
8915 paddd m1, [pd_16] | |
8916 psrld m1, 5 | |
8917 packusdw m4, m1 | |
8918 | |
8919 pslldq m5, 2 | |
8920 palignr m0, m3, 12 | |
8921 palignr m3, m5, 12 | |
8922 | |
8923 mova m2, m3 | |
8924 pmaddwd m2, [r6 + 3 * 16] ; [18] | |
8925 paddd m2, [pd_16] | |
8926 psrld m2, 5 | |
8927 mova m1, m0 | |
8928 pmaddwd m1, [r6 + 3 * 16] | |
8929 paddd m1, [pd_16] | |
8930 psrld m1, 5 | |
8931 packusdw m2, m1 | |
8932 | |
8933 mova m7, m3 | |
8934 pmaddwd m7, [r6 - 14 * 16] ; [1] | |
8935 paddd m7, [pd_16] | |
8936 psrld m7, 5 | |
8937 mova m1, m0 | |
8938 pmaddwd m1, [r6 - 14 * 16] | |
8939 paddd m1, [pd_16] | |
8940 psrld m1, 5 | |
8941 packusdw m7, m1 | |
8942 | |
8943 pslldq m5, 2 | |
8944 palignr m0, m3, 12 | |
8945 palignr m3, m5, 12 | |
8946 | |
; Last row: m3/m0 are not needed afterwards, so they are multiplied in place
; instead of being copied first.
8947 pmaddwd m3, [r6 + 1 * 16] ; [16] | |
8948 paddd m3, [pd_16] | |
8949 psrld m3, 5 | |
8950 pmaddwd m0, [r6 + 1 * 16] | |
8951 paddd m0, [pd_16] | |
8952 psrld m0, 5 | |
8953 packusdw m3, m0 | |
8954 | |
8955 lea r5, [r5 + r1 * 4] | |
8956 | |
8957 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 | |
8958 | |
8959 ret | |
8960 | |
; ang16_mode_16_20
; Internal helper: cglobal is given no argument/register counts, and the block
; reads r0, r1, r2, r3, r4d, r6 and m5 without initialising them, so the
; caller must set them up before entry.
;   r2     - address of the 16-bit samples loaded into m0/m1
;   r3     - address of further samples; m5 is reloaded from [r3] (shuffled
;            through pw_ang8_16) before the last TRANSPOSE_STORE group
;   r4d    - tested on entry, then r4 is overwritten with r1 * 3. Nothing in
;            this block consumes the flags or the new r4, so presumably
;            TRANSPOSE_STORE (defined elsewhere) uses both -- TODO confirm.
;            lea leaves EFLAGS untouched, so the test result survives it.
;   r6     - base of the 16-byte weight rows fed to pmaddwd; the [n]
;            annotations imply [r6] is row 13 (presumably ang_table --
;            verify against the caller)
;   m5     - extra samples pulled into the window by palignr
;   r0, r1 - used to form r5 = r0, r0 + r1*4, r5 + r1*4, ... before each
;            TRANSPOSE_STORE (presumably destination and stride -- confirm)
; Every result vector is (pmaddwd(pairs, row) + pd_16) >> 5, packed to words
; with unsigned saturation; four vectors are handed to each TRANSPOSE_STORE.
8961 cglobal ang16_mode_16_20 | |
8962 test r4d, r4d | |
8963 lea r4, [r1 * 3] | |
8964 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
8965 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
8966 | |
; m6 = (m0:m5) >> 2 bytes, so its top dword pairs the highest word of m5 with
; the lowest word of m0; it supplies the first pair shifted into the window.
8967 palignr m6, m0, m5, 2 | |
8968 | |
; Interleave neighbouring samples so each dword holds one (s[i], s[i+1]) pair.
8969 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
8970 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
8971 | |
8972 mova m4, m3 | |
8973 pmaddwd m4, [r6 - 2 * 16] ; [11] | |
8974 paddd m4, [pd_16] | |
8975 psrld m4, 5 | |
8976 mova m2, m0 | |
8977 pmaddwd m2, [r6 - 2 * 16] | |
8978 paddd m2, [pd_16] | |
8979 psrld m2, 5 | |
8980 packusdw m4, m2 | |
8981 | |
; Slide the pair window by one dword: m0 receives the top dword of m3 in its
; low lane, and m3 receives the top dword of m6 (the bridging pair built above).
8982 palignr m0, m3, 12 | |
8983 palignr m3, m6, 12 | |
8984 | |
8985 mova m2, m3 | |
8986 pmaddwd m2, [r6 + 9 * 16] ; [22] | |
8987 paddd m2, [pd_16] | |
8988 psrld m2, 5 | |
8989 mova m1, m0 | |
8990 pmaddwd m1, [r6 + 9 * 16] | |
8991 paddd m1, [pd_16] | |
8992 psrld m1, 5 | |
8993 packusdw m2, m1 | |
8994 | |
8995 mova m6, m3 | |
8996 pmaddwd m6, [r6 - 12 * 16] ; [1] | |
8997 paddd m6, [pd_16] | |
8998 psrld m6, 5 | |
8999 mova m1, m0 | |
9000 pmaddwd m1, [r6 - 12 * 16] | |
9001 paddd m1, [pd_16] | |
9002 psrld m1, 5 | |
9003 packusdw m6, m1 | |
9004 | |
; From here on the new pair comes straight from the top dword of m5.
9005 palignr m0, m3, 12 | |
9006 palignr m3, m5, 12 | |
9007 | |
9008 mova m7, m3 | |
9009 pmaddwd m7, [r6 - 1 * 16] ; [12] | |
9010 paddd m7, [pd_16] | |
9011 psrld m7, 5 | |
9012 mova m1, m0 | |
9013 pmaddwd m1, [r6 - 1 * 16] | |
9014 paddd m1, [pd_16] | |
9015 psrld m1, 5 | |
9016 packusdw m7, m1 | |
9017 | |
9018 mov r5, r0 | |
9019 | |
9020 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
9021 | |
; Shift m5 up by one word first so the slide below pulls in a new pair rather
; than the one already consumed.
9022 pslldq m5, 2 | |
9023 palignr m0, m3, 12 | |
9024 palignr m3, m5, 12 | |
9025 | |
9026 mova m4, m3 | |
9027 pmaddwd m4, [r6 + 10 * 16] ; [23] | |
9028 paddd m4, [pd_16] | |
9029 psrld m4, 5 | |
9030 mova m1, m0 | |
9031 pmaddwd m1, [r6 + 10 * 16] | |
9032 paddd m1, [pd_16] | |
9033 psrld m1, 5 | |
9034 packusdw m4, m1 | |
9035 | |
9036 mova m2, m3 | |
9037 pmaddwd m2, [r6 - 11 * 16] ; [2] | |
9038 paddd m2, [pd_16] | |
9039 psrld m2, 5 | |
9040 mova m1, m0 | |
9041 pmaddwd m1, [r6 - 11 * 16] | |
9042 paddd m1, [pd_16] | |
9043 psrld m1, 5 | |
9044 packusdw m2, m1 | |
9045 | |
9046 pslldq m5, 2 | |
9047 palignr m0, m3, 12 | |
9048 palignr m3, m5, 12 | |
9049 | |
9050 mova m6, m3 | |
9051 pmaddwd m6, [r6] ; [13] | |
9052 paddd m6, [pd_16] | |
9053 psrld m6, 5 | |
9054 mova m7, m0 | |
9055 pmaddwd m7, [r6] | |
9056 paddd m7, [pd_16] | |
9057 psrld m7, 5 | |
9058 packusdw m6, m7 | |
9059 | |
9060 pslldq m5, 2 | |
9061 palignr m0, m3, 12 | |
9062 palignr m3, m5, 12 | |
9063 | |
9064 mova m7, m3 | |
9065 pmaddwd m7, [r6 + 11 * 16] ; [24] | |
9066 paddd m7, [pd_16] | |
9067 psrld m7, 5 | |
9068 mova m1, m0 | |
9069 pmaddwd m1, [r6 + 11 * 16] | |
9070 paddd m1, [pd_16] | |
9071 psrld m1, 5 | |
9072 packusdw m7, m1 | |
9073 | |
9074 lea r5, [r0 + r1 * 4] | |
9075 | |
9076 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
9077 | |
9078 mova m4, m3 | |
9079 pmaddwd m4, [r6 - 10 * 16] ; [3] | |
9080 paddd m4, [pd_16] | |
9081 psrld m4, 5 | |
9082 mova m6, m0 | |
9083 pmaddwd m6, [r6 - 10 * 16] | |
9084 paddd m6, [pd_16] | |
9085 psrld m6, 5 | |
9086 packusdw m4, m6 | |
9087 | |
9088 pslldq m5, 2 | |
9089 palignr m0, m3, 12 | |
9090 palignr m3, m5, 12 | |
9091 | |
9092 mova m2, m3 | |
9093 pmaddwd m2, [r6 + 1 * 16] ; [14] | |
9094 paddd m2, [pd_16] | |
9095 psrld m2, 5 | |
9096 mova m6, m0 | |
9097 pmaddwd m6, [r6 + 1 * 16] | |
9098 paddd m6, [pd_16] | |
9099 psrld m6, 5 | |
9100 packusdw m2, m6 | |
9101 | |
9102 pslldq m5, 2 | |
9103 palignr m0, m3, 12 | |
9104 palignr m3, m5, 12 | |
9105 | |
9106 mova m6, m3 | |
9107 pmaddwd m6, [r6 + 12 * 16] ; [25] | |
9108 paddd m6, [pd_16] | |
9109 psrld m6, 5 | |
9110 mova m1, m0 | |
9111 pmaddwd m1, [r6 + 12 * 16] | |
9112 paddd m1, [pd_16] | |
9113 psrld m1, 5 | |
9114 packusdw m6, m1 | |
9115 | |
9116 mova m7, m3 | |
9117 pmaddwd m7, [r6 - 9 * 16] ; [4] | |
9118 paddd m7, [pd_16] | |
9119 psrld m7, 5 | |
9120 mova m1, m0 | |
9121 pmaddwd m1, [r6 - 9 * 16] | |
9122 paddd m1, [pd_16] | |
9123 psrld m1, 5 | |
9124 packusdw m7, m1 | |
9125 | |
9126 lea r5, [r5 + r1 * 4] | |
9127 | |
9128 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
9129 | |
9130 pslldq m5, 2 | |
9131 palignr m0, m3, 12 | |
9132 palignr m3, m5, 12 | |
9133 | |
9134 mova m4, m3 | |
9135 pmaddwd m4, [r6 + 2 * 16] ; [15] | |
9136 paddd m4, [pd_16] | |
9137 psrld m4, 5 | |
9138 mova m1, m0 | |
9139 pmaddwd m1, [r6 + 2 * 16] | |
9140 paddd m1, [pd_16] | |
9141 psrld m1, 5 | |
9142 packusdw m4, m1 | |
9143 | |
; Refill m5 from [r3], reordered by the pw_ang8_16 shuffle mask (defined
; elsewhere), before sliding in further pairs. No pslldq is needed ahead of
; the next slide because the reloaded top dword has not been consumed yet.
9144 movu m5, [r3] | |
9145 pshufb m5, [pw_ang8_16] | |
9146 | |
9147 palignr m0, m3, 12 | |
9148 palignr m3, m5, 12 | |
9149 | |
9150 mova m2, m3 | |
9151 pmaddwd m2, [r6 + 13 * 16] ; [26] | |
9152 paddd m2, [pd_16] | |
9153 psrld m2, 5 | |
9154 mova m1, m0 | |
9155 pmaddwd m1, [r6 + 13 * 16] | |
9156 paddd m1, [pd_16] | |
9157 psrld m1, 5 | |
9158 packusdw m2, m1 | |
9159 | |
9160 mova m7, m3 | |
9161 pmaddwd m7, [r6 - 8 * 16] ; [5] | |
9162 paddd m7, [pd_16] | |
9163 psrld m7, 5 | |
9164 mova m1, m0 | |
9165 pmaddwd m1, [r6 - 8 * 16] | |
9166 paddd m1, [pd_16] | |
9167 psrld m1, 5 | |
9168 packusdw m7, m1 | |
9169 | |
9170 pslldq m5, 2 | |
9171 palignr m0, m3, 12 | |
9172 palignr m3, m5, 12 | |
9173 | |
; Last row: m3/m0 are not needed afterwards, so they are multiplied in place
; instead of being copied first.
9174 pmaddwd m3, [r6 + 3 * 16] ; [16] | |
9175 paddd m3, [pd_16] | |
9176 psrld m3, 5 | |
9177 pmaddwd m0, [r6 + 3 * 16] | |
9178 paddd m0, [pd_16] | |
9179 psrld m0, 5 | |
9180 packusdw m3, m0 | |
9181 | |
9182 lea r5, [r5 + r1 * 4] | |
9183 | |
9184 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 | |
9185 | |
9186 ret | |
9187 | |
; ang16_mode_17_19
; Internal helper: cglobal is given no argument/register counts, and the block
; reads r0, r1, r2, r3, r4d, r6 and m5 without initialising them, so the
; caller must set them up before entry.
;   r2     - address of the 16-bit samples loaded into m0/m1
;   r3     - address of further samples; m5 is reloaded from [r3] (shuffled
;            through pw_ang8_17) part-way through the third group
;   r4d    - tested on entry, then r4 is overwritten with r1 * 3. Nothing in
;            this block consumes the flags or the new r4, so presumably
;            TRANSPOSE_STORE (defined elsewhere) uses both -- TODO confirm.
;            lea leaves EFLAGS untouched, so the test result survives it.
;   r6     - base of the 16-byte weight rows fed to pmaddwd; the [n]
;            annotations imply [r6] is row 16 (presumably ang_table --
;            verify against the caller)
;   m5     - extra samples pulled into the window by palignr
;   r0, r1 - used to form r5 = r0, r0 + r1*4, r5 + r1*4, ... before each
;            TRANSPOSE_STORE (presumably destination and stride -- confirm)
; Every result vector is (pmaddwd(pairs, row) + pd_16) >> 5, packed to words
; with unsigned saturation; four vectors are handed to each TRANSPOSE_STORE.
9188 cglobal ang16_mode_17_19 | |
9189 test r4d, r4d | |
9190 lea r4, [r1 * 3] | |
9191 movu m0, [r2] ; [7 6 5 4 3 2 1 0] | |
9192 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
9193 | |
; m6 = (m0:m5) >> 2 bytes, so its top dword pairs the highest word of m5 with
; the lowest word of m0; it supplies the first pair shifted into the window.
9194 palignr m6, m0, m5, 2 | |
9195 | |
; Interleave neighbouring samples so each dword holds one (s[i], s[i+1]) pair.
9196 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] | |
9197 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] | |
9198 | |
9199 mova m4, m3 | |
9200 pmaddwd m4, [r6 - 10 * 16] ; [6] | |
9201 paddd m4, [pd_16] | |
9202 psrld m4, 5 | |
9203 mova m2, m0 | |
9204 pmaddwd m2, [r6 - 10 * 16] | |
9205 paddd m2, [pd_16] | |
9206 psrld m2, 5 | |
9207 packusdw m4, m2 | |
9208 | |
; Slide the pair window by one dword: m0 receives the top dword of m3 in its
; low lane, and m3 receives the top dword of m6 (the bridging pair built above).
9209 palignr m0, m3, 12 | |
9210 palignr m3, m6, 12 | |
9211 | |
9212 mova m2, m3 | |
9213 pmaddwd m2, [r6 - 4 * 16] ; [12] | |
9214 paddd m2, [pd_16] | |
9215 psrld m2, 5 | |
9216 mova m1, m0 | |
9217 pmaddwd m1, [r6 - 4 * 16] | |
9218 paddd m1, [pd_16] | |
9219 psrld m1, 5 | |
9220 packusdw m2, m1 | |
9221 | |
; From here on the new pair comes straight from the top dword of m5.
9222 palignr m0, m3, 12 | |
9223 palignr m3, m5, 12 | |
9224 | |
9225 mova m6, m3 | |
9226 pmaddwd m6, [r6 + 2 * 16] ; [18] | |
9227 paddd m6, [pd_16] | |
9228 psrld m6, 5 | |
9229 mova m1, m0 | |
9230 pmaddwd m1, [r6 + 2 * 16] | |
9231 paddd m1, [pd_16] | |
9232 psrld m1, 5 | |
9233 packusdw m6, m1 | |
9234 | |
; Shift m5 up by one word first so the slide below pulls in a new pair rather
; than the one already consumed.
9235 pslldq m5, 2 | |
9236 palignr m0, m3, 12 | |
9237 palignr m3, m5, 12 | |
9238 | |
9239 mova m7, m3 | |
9240 pmaddwd m7, [r6 + 8 * 16] ; [24] | |
9241 paddd m7, [pd_16] | |
9242 psrld m7, 5 | |
9243 mova m1, m0 | |
9244 pmaddwd m1, [r6 + 8 * 16] | |
9245 paddd m1, [pd_16] | |
9246 psrld m1, 5 | |
9247 packusdw m7, m1 | |
9248 | |
9249 mov r5, r0 | |
9250 | |
9251 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 | |
9252 | |
9253 pslldq m5, 2 | |
9254 palignr m0, m3, 12 | |
9255 palignr m3, m5, 12 | |
9256 | |
9257 mova m4, m3 | |
9258 pmaddwd m4, [r6 + 14 * 16] ; [30] | |
9259 paddd m4, [pd_16] | |
9260 psrld m4, 5 | |
9261 mova m1, m0 | |
9262 pmaddwd m1, [r6 + 14 * 16] | |
9263 paddd m1, [pd_16] | |
9264 psrld m1, 5 | |
9265 packusdw m4, m1 | |
9266 | |
9267 mova m2, m3 | |
9268 pmaddwd m2, [r6 - 12 * 16] ; [4] | |
9269 paddd m2, [pd_16] | |
9270 psrld m2, 5 | |
9271 mova m1, m0 | |
9272 pmaddwd m1, [r6 - 12 * 16] | |
9273 paddd m1, [pd_16] | |
9274 psrld m1, 5 | |
9275 packusdw m2, m1 | |
9276 | |
9277 pslldq m5, 2 | |
9278 palignr m0, m3, 12 | |
9279 palignr m3, m5, 12 | |
9280 | |
9281 mova m6, m3 | |
9282 pmaddwd m6, [r6 - 6 * 16] ; [10] | |
9283 paddd m6, [pd_16] | |
9284 psrld m6, 5 | |
9285 mova m7, m0 | |
9286 pmaddwd m7, [r6 - 6 * 16] | |
9287 paddd m7, [pd_16] | |
9288 psrld m7, 5 | |
9289 packusdw m6, m7 | |
9290 | |
9291 pslldq m5, 2 | |
9292 palignr m0, m3, 12 | |
9293 palignr m3, m5, 12 | |
9294 | |
9295 mova m7, m3 | |
9296 pmaddwd m7, [r6] ; [16] | |
9297 paddd m7, [pd_16] | |
9298 psrld m7, 5 | |
9299 mova m1, m0 | |
9300 pmaddwd m1, [r6] | |
9301 paddd m1, [pd_16] | |
9302 psrld m1, 5 | |
9303 packusdw m7, m1 | |
9304 | |
9305 lea r5, [r0 + r1 * 4] | |
9306 | |
9307 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 | |
9308 | |
9309 pslldq m5, 2 | |
9310 palignr m0, m3, 12 | |
9311 palignr m3, m5, 12 | |
9312 | |
9313 mova m4, m3 | |
9314 pmaddwd m4, [r6 + 6 * 16] ; [22] | |
9315 paddd m4, [pd_16] | |
9316 psrld m4, 5 | |
9317 mova m6, m0 | |
9318 pmaddwd m6, [r6 + 6 * 16] | |
9319 paddd m6, [pd_16] | |
9320 psrld m6, 5 | |
9321 packusdw m4, m6 | |
9322 | |
9323 pslldq m5, 2 | |
9324 palignr m0, m3, 12 | |
9325 palignr m3, m5, 12 | |
9326 | |
9327 mova m2, m3 | |
9328 pmaddwd m2, [r6 + 12 * 16] ; [28] | |
9329 paddd m2, [pd_16] | |
9330 psrld m2, 5 | |
9331 mova m6, m0 | |
9332 pmaddwd m6, [r6 + 12 * 16] | |
9333 paddd m6, [pd_16] | |
9334 psrld m6, 5 | |
9335 packusdw m2, m6 | |
9336 | |
9337 mova m6, m3 | |
9338 pmaddwd m6, [r6 - 14 * 16] ; [2] | |
9339 paddd m6, [pd_16] | |
9340 psrld m6, 5 | |
9341 mova m1, m0 | |
9342 pmaddwd m1, [r6 - 14 * 16] | |
9343 paddd m1, [pd_16] | |
9344 psrld m1, 5 | |
9345 packusdw m6, m1 | |
9346 | |
; Refill m5 from [r3], reordered by the pw_ang8_17 shuffle mask (defined
; elsewhere), before sliding in further pairs. No pslldq is needed ahead of
; the next slide because the reloaded top dword has not been consumed yet.
9347 movu m5, [r3] | |
9348 pshufb m5, [pw_ang8_17] | |
9349 | |
9350 palignr m0, m3, 12 | |
9351 palignr m3, m5, 12 | |
9352 | |
9353 mova m7, m3 | |
9354 pmaddwd m7, [r6 - 8 * 16] ; [8] | |
9355 paddd m7, [pd_16] | |
9356 psrld m7, 5 | |
9357 mova m1, m0 | |
9358 pmaddwd m1, [r6 - 8 * 16] | |
9359 paddd m1, [pd_16] | |
9360 psrld m1, 5 | |
9361 packusdw m7, m1 | |
9362 | |
9363 lea r5, [r5 + r1 * 4] | |
9364 | |
9365 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 | |
9366 | |
9367 pslldq m5, 2 | |
9368 palignr m0, m3, 12 | |
9369 palignr m3, m5, 12 | |
9370 | |
9371 mova m4, m3 | |
9372 pmaddwd m4, [r6 - 2 * 16] ; [14] | |
9373 paddd m4, [pd_16] | |
9374 psrld m4, 5 | |
9375 mova m1, m0 | |
9376 pmaddwd m1, [r6 - 2 * 16] | |
9377 paddd m1, [pd_16] | |
9378 psrld m1, 5 | |
9379 packusdw m4, m1 | |
9380 | |
9381 pslldq m5, 2 | |
9382 palignr m0, m3, 12 | |
9383 palignr m3, m5, 12 | |
9384 | |
9385 mova m2, m3 | |
9386 pmaddwd m2, [r6 + 4 * 16] ; [20] | |
9387 paddd m2, [pd_16] | |
9388 psrld m2, 5 | |
9389 mova m1, m0 | |
9390 pmaddwd m1, [r6 + 4 * 16] | |
9391 paddd m1, [pd_16] | |
9392 psrld m1, 5 | |
9393 packusdw m2, m1 | |
9394 | |
9395 pslldq m5, 2 | |
9396 palignr m0, m3, 12 | |
9397 palignr m3, m5, 12 | |
9398 | |
9399 mova m7, m3 | |
9400 pmaddwd m7, [r6 + 10 * 16] ; [26] | |
9401 paddd m7, [pd_16] | |
9402 psrld m7, 5 | |
9403 mova m1, m0 | |
9404 pmaddwd m1, [r6 + 10 * 16] | |
9405 paddd m1, [pd_16] | |
9406 psrld m1, 5 | |
9407 packusdw m7, m1 | |
9408 | |
; Last row, multiplied in place. It carries no [n] annotation in the original;
; by the scheme used above ([r6] = row 16) this is row [0]. Unlike the other
; groups, no window slide precedes it, so it reuses the pairs from row [26].
9409 pmaddwd m3, [r6 - 16 * 16] | |
9410 paddd m3, [pd_16] | |
9411 psrld m3, 5 | |
9412 pmaddwd m0, [r6 - 16 * 16] | |
9413 paddd m0, [pd_16] | |
9414 psrld m0, 5 | |
9415 packusdw m3, m0 | |
9416 | |
9417 lea r5, [r5 + r1 * 4] | |
9418 | |
9419 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 | |
9420 | |
9421 ret | |
9422 | |
9423 ;------------------------------------------------------------------------------------------ | |
9424 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
9425 ;------------------------------------------------------------------------------------------ | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_2 (SSSE3): 16x16 prediction shared by dirMode 2 and dirMode 34.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src, r3m = dirMode
; The neighbour array is src itself when dirMode == 34 and src + 64 bytes otherwise.
; Output row k is the 16 words starting 4 + 2 * k bytes into that array, so every
; row is the previous one advanced by one word.
;------------------------------------------------------------------------------------------
%macro ANG16_2_ROW 5 ; %1 = row address, %2/%3/%4 = low/middle/high source regs, %5 = byte shift
    palignr         m3, %3, %2, %5
    palignr         m4, %4, %3, %5
    movu            [%1], m3
    movu            [%1 + 16], m4
%endmacro

INIT_XMM ssse3
cglobal intra_pred_ang16_2, 3,5,5
    mov             r4, r2                      ; unshifted pointer, used for mode 34
    add             r2, 64
    cmp             r3m, byte 34
    cmove           r2, r4
    add             r1, r1                      ; stride *= 2
    lea             r3, [r1 * 3]
    movu            m0, [r2 + 4]
    movu            m1, [r2 + 20]
    movu            m2, [r2 + 36]

    ; rows 0-3
    movu            [r0], m0
    movu            [r0 + 16], m1
    ANG16_2_ROW     r0 + r1, m0, m1, m2, 2
    ANG16_2_ROW     r0 + r1 * 2, m0, m1, m2, 4
    ANG16_2_ROW     r0 + r3, m0, m1, m2, 6

    ; rows 4-7
    lea             r0, [r0 + r1 * 4]
    ANG16_2_ROW     r0, m0, m1, m2, 8
    ANG16_2_ROW     r0 + r1, m0, m1, m2, 10
    ANG16_2_ROW     r0 + r1 * 2, m0, m1, m2, 12
    ANG16_2_ROW     r0 + r3, m0, m1, m2, 14

    ; rows 8-11: the window has moved on by 16 bytes, m0 now holds the next 8 words
    movu            m0, [r2 + 52]
    lea             r0, [r0 + r1 * 4]
    movu            [r0], m1
    movu            [r0 + 16], m2
    ANG16_2_ROW     r0 + r1, m1, m2, m0, 2
    ANG16_2_ROW     r0 + r1 * 2, m1, m2, m0, 4
    ANG16_2_ROW     r0 + r3, m1, m2, m0, 6

    ; rows 12-15
    lea             r0, [r0 + r1 * 4]
    ANG16_2_ROW     r0, m1, m2, m0, 8
    ANG16_2_ROW     r0 + r1, m1, m2, m0, 10
    ANG16_2_ROW     r0 + r1 * 2, m1, m2, m0, 12
    ANG16_2_ROW     r0 + r3, m1, m2, m0, 14
    RET
9506 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_3 (SSE4): 16x16 angular prediction, mode 3.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_3_33 kernel twice with r6d = 0 on the neighbours at
; src + 64 bytes; the second pass reads 16 bytes further on and writes 8 * r1 bytes
; further into dst.
;------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_3_33

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_3_33
    RET
9522 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_33 (SSE4): 16x16 angular prediction, mode 33.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_3_33 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_33, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_3_33

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_3_33
    RET
9537 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_4 (SSE4): 16x16 angular prediction, mode 4.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_4_32 kernel twice with r6d = 0 on the neighbours at
; src + 64 bytes; the second pass reads 16 bytes further on and writes 8 * r1 bytes
; further into dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_4, 3,7,8
    lea             r3, [ang_table + 18 * 16]   ; table row 18 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_4_32

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_4_32
    RET
9552 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_32 (SSE4): 16x16 angular prediction, mode 32.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_4_32 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_32, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 18 * 16]   ; table row 18 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_4_32

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_4_32
    RET
9567 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_5 (SSE4): 16x16 angular prediction, mode 5.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_5_31 kernel twice with r6d = 0 on the neighbours at
; src + 64 bytes; the second pass reads 16 bytes further on and writes 8 * r1 bytes
; further into dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_5, 3,7,8
    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_5_31

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_5_31
    RET
9582 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_31 (SSE4): 16x16 angular prediction, mode 31.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_5_31 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_31, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_5_31

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_5_31
    RET
9597 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_6 (SSE4): 16x16 angular prediction, mode 6.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_6_30 kernel twice with r6d = 0 on the neighbours at
; src + 64 bytes; the second pass reads 16 bytes further on and writes 8 * r1 bytes
; further into dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_6, 3,7,8
    lea             r3, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_6_30

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_6_30
    RET
9612 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_30 (SSE4): 16x16 angular prediction, mode 30.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_6_30 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_30, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_6_30

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_6_30
    RET
9627 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_7 (SSE4): 16x16 angular prediction, mode 7.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_7_29 kernel twice with r6d = 0 on the neighbours at
; src + 64 bytes; the second pass reads 16 bytes further on and writes 8 * r1 bytes
; further into dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_7, 3,7,8
    lea             r3, [ang_table + 17 * 16]   ; table row 17 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_7_29

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_7_29
    RET
9642 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_29 (SSE4): 16x16 angular prediction, mode 29.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_7_29 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_29, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 17 * 16]   ; table row 17 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_7_29

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_7_29
    RET
9657 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_8 (SSE4): 16x16 angular prediction, mode 8.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_8_28 kernel twice with r6d = 0 on the neighbours at
; src + 64 bytes; the second pass reads 16 bytes further on and writes 8 * r1 bytes
; further into dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_8, 3,7,8
    lea             r3, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_8_28

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_8_28
    RET
9672 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_28 (SSE4): 16x16 angular prediction, mode 28.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_8_28 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_28, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_8_28

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_8_28
    RET
9687 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_9 (SSE4): 16x16 angular prediction, mode 9.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_9_27 kernel twice with r6d = 0 on the neighbours at
; src + 64 bytes; the second pass reads 16 bytes further on and writes 8 * r1 bytes
; further into dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_9, 3,7,8
    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_9_27

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_9_27
    RET
9702 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_27 (SSE4): 16x16 angular prediction, mode 27.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_9_27 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_27, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_9_27

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_9_27
    RET
9717 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_11 (SSE4): 16x16 angular prediction, mode 11.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (the old value is
; parked in the 4-byte stack slot) so the kernel sees it at the head of the src + 64
; array; it is put back before returning. Runs ang16_mode_11_25 twice with r6d = 0.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_11, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r2, 64
    xor             r6d, r6d                    ; kernel flag = 0
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_11_25

    lea             r0, [r0 + r1 * 8]
    lea             r2, [r2 + 16]

    call            ang16_mode_11_25

    ; r2 was advanced by 64 + 16 above, so r2 - 16 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 - 16], r6w
    RET
9740 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_25 (SSE4): 16x16 angular prediction, mode 25.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Runs the shared ang16_mode_11_25 kernel twice with r6d = 1 on the neighbours at src;
; the second pass reads 16 bytes further on and writes 16 bytes further right in dst.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_25, 3,7,8
    mov             r6d, 1                      ; kernel flag = 1
    lea             r3, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    call            ang16_mode_11_25

    lea             r0, [r0 + 16]
    lea             r2, [r2 + 16]

    call            ang16_mode_11_25
    RET
9755 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_12 (SSE4): 16x16 angular prediction, mode 12.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_12_24 kernel: for the first pass they are gathered from the src array
; with pw_ang8_12 plus the word at src + 26, for the second pass they are the 16 bytes
; at src + 64 + 2. The kernel flag is r3d = 0 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_12, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    xor             r3d, r3d                    ; kernel flag = 0
    movu            m5, [r2]
    pshufb          m5, [pw_ang8_12]
    pinsrw          m5, [r2 + 26], 5
    add             r2, 64

    call            ang16_mode_12_24

    movu            m5, [r2 + 2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + r1 * 8]

    call            ang16_mode_12_24

    ; r2 was advanced by 64 + 16 above, so r2 - 16 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 - 16], r6w
    RET
9782 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_24 (SSE4): 16x16 angular prediction, mode 24.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_12_24 kernel: for the first pass they are gathered from the src + 64
; array with pw_ang8_12 plus the word at src + 64 + 26, for the second pass they are
; the 16 bytes at src + 2. The kernel flag is r3d = 1 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_24, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    movu            m5, [r2 + 64]               ; read after the patch above
    pshufb          m5, [pw_ang8_12]
    pinsrw          m5, [r2 + 26 + 64], 5
    xor             r3d, r3d
    inc             r3d                         ; kernel flag = 1

    call            ang16_mode_12_24

    movu            m5, [r2 + 2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + 16]

    call            ang16_mode_12_24

    ; r2 was advanced by 16 above, so r2 + 48 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 + 48], r6w
    RET
9809 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_13 (SSE4): 16x16 angular prediction, mode 13.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_13_23 kernel: for the first pass they are gathered from the src array
; with pw_ang16_13 / pw_ang8_13, for the second pass they are the 16 bytes at
; src + 64 + 2. The kernel flag is r3d = 0 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_13, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    xor             r3d, r3d                    ; kernel flag = 0
    movu            m6, [r2 + 14]
    pshufb          m6, [pw_ang8_13]
    pslldq          m6, 2
    movu            m5, [r2]
    pshufb          m5, [pw_ang16_13]
    palignr         m5, m6, 6
    add             r2, 64

    call            ang16_mode_13_23

    movu            m5, [r2 + 2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + r1 * 8]

    call            ang16_mode_13_23

    ; r2 was advanced by 64 + 16 above, so r2 - 16 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 - 16], r6w
    RET
9839 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_23 (SSE4): 16x16 angular prediction, mode 23.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_13_23 kernel: for the first pass they are gathered from the src + 64
; array with pw_ang16_13 / pw_ang8_13, for the second pass they are the 16 bytes at
; src + 2. The kernel flag is r3d = 1 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_23, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    movu            m6, [r2 + 14 + 64]
    pshufb          m6, [pw_ang8_13]
    pslldq          m6, 2
    movu            m5, [r2 + 64]               ; read after the patch above
    pshufb          m5, [pw_ang16_13]
    palignr         m5, m6, 6
    xor             r3d, r3d
    inc             r3d                         ; kernel flag = 1

    call            ang16_mode_13_23

    movu            m5, [r2 + 2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + 16]

    call            ang16_mode_13_23

    ; r2 was advanced by 16 above, so r2 + 48 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 + 48], r6w
    RET
9869 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_14 (SSE4): 16x16 angular prediction, mode 14.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_14_22 kernel: for the first pass it is built from two pw_ang8_14
; shuffles of the src array (offsets 0 and 20), for the second pass it is the 16
; bytes at src + 64 + 2. The kernel flag is r3d = 0 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_14, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 18 * 16]   ; table row 18 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    xor             r3d, r3d                    ; kernel flag = 0
    movu            m5, [r2 + 20]
    pshufb          m5, [pw_ang8_14]
    movu            m6, [r2]
    pshufb          m6, [pw_ang8_14]
    punpckhqdq      m5, m6
    add             r2, 64

    call            ang16_mode_14_22

    movu            m5, [r2 + 2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + r1 * 8]

    call            ang16_mode_14_22

    ; r2 was advanced by 64 + 16 above, so r2 - 16 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 - 16], r6w
    RET
9898 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_22 (SSE4): 16x16 angular prediction, mode 22.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_14_22 kernel: for the first pass it is built from two pw_ang8_14
; shuffles of the src + 64 array (offsets 0 and 20), for the second pass it is the
; 16 bytes at src + 2. The kernel flag is r3d = 1 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_22, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 18 * 16]   ; table row 18 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    movu            m5, [r2 + 20 + 64]
    pshufb          m5, [pw_ang8_14]
    movu            m6, [r2 + 64]               ; read after the patch above
    pshufb          m6, [pw_ang8_14]
    punpckhqdq      m5, m6
    xor             r3d, r3d
    inc             r3d                         ; kernel flag = 1

    call            ang16_mode_14_22

    movu            m5, [r2 + 2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + 16]

    call            ang16_mode_14_22

    ; r2 was advanced by 16 above, so r2 + 48 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 + 48], r6w
    RET
9927 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_15 (SSE4): 16x16 angular prediction, mode 15.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_15_21 kernel: for the first pass it is built from two pw_ang8_15
; shuffles of the src array (offsets 4 and 18), for the second pass it is the 16
; bytes at src + 64. The kernel flag is r3d = 0 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_15, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    xor             r3d, r3d                    ; kernel flag = 0
    movu            m5, [r2 + 18]
    pshufb          m5, [pw_ang8_15]
    movu            m6, [r2 + 4]
    pshufb          m6, [pw_ang8_15]
    punpckhqdq      m5, m6
    add             r2, 64

    call            ang16_mode_15_21

    movu            m5, [r2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + r1 * 8]

    call            ang16_mode_15_21

    ; r2 was advanced by 64 + 16 above, so r2 - 16 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 - 16], r6w
    RET
9956 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_21 (SSE4): 16x16 angular prediction, mode 21.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; The word at src + 64 is temporarily replaced by the word at src (old value parked
; on the stack, restored before returning). m5 carries extra reference samples into
; the ang16_mode_15_21 kernel: for the first pass it is built from two pw_ang8_15
; shuffles of the src + 64 array (offsets 4 and 18), for the second pass it is the
; 16 bytes at src. The kernel flag is r3d = 1 and the table pointer is r6.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_21, 3,7,8, 0-4
    movzx           r5d, word [r2 + 64]
    mov             [rsp], r5w                  ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 15 * 16]   ; table row 15 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    movu            m5, [r2 + 18 + 64]
    pshufb          m5, [pw_ang8_15]
    movu            m6, [r2 + 4 + 64]
    pshufb          m6, [pw_ang8_15]
    punpckhqdq      m5, m6
    xor             r3d, r3d
    inc             r3d                         ; kernel flag = 1

    call            ang16_mode_15_21

    movu            m5, [r2]
    lea             r2, [r2 + 16]
    lea             r0, [r0 + 16]

    call            ang16_mode_15_21

    ; r2 was advanced by 16 above, so r2 + 48 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp]
    mov             [r2 + 48], r6w
    RET
9985 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_16 (SSE4): 16x16 angular prediction, mode 16.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Stack layout: [rsp] = original src pointer (handed to the kernel in r3 for the
; second pass), [rsp + 16] = saved word from src + 64.
; The word at src + 64 is temporarily replaced by the word at src and restored before
; returning. The ang16_mode_16_20 kernel is run twice with r4 = 0, the table pointer
; in r6, extra reference samples in m5 and a second source pointer in r3.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_16, 3,7,8,0-(1*mmsize+4)
    movzx           r5d, word [r2 + 64]
    mov             [rsp + 16], r5w             ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 13 * 16]   ; table row 13 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    mov             [rsp], r2
    lea             r3, [r2 + 24]
    movu            m5, [r2 + 16]
    pshufb          m5, [pw_ang16_16]
    movu            m6, [r2 + 4]
    pshufb          m6, [pw_ang16_16]
    punpckhqdq      m5, m6
    add             r2, 64
    xor             r4, r4                      ; kernel flag = 0

    call            ang16_mode_16_20

    movu            m5, [r2]
    lea             r2, [r2 + 16]
    mov             r3, [rsp]
    lea             r0, [r0 + r1 * 8]
    xor             r4, r4                      ; kernel flag = 0

    call            ang16_mode_16_20

    ; r2 was advanced by 64 + 16 above, so r2 - 16 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp + 16]
    mov             [r2 - 16], r6w
    RET
10017 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_20 (SSE4): 16x16 angular prediction, mode 20.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Stack layout: [rsp] = src + 64 (handed to the kernel in r3 for the second pass),
; [rsp + 16] = saved word from src + 64.
; The word at src + 64 is temporarily replaced by the word at src and restored before
; returning. The ang16_mode_16_20 kernel is run twice with r4 = 1, the table pointer
; in r6, extra reference samples in m5 and a second source pointer in r3.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_20, 3,7,8,0-(1*mmsize+4)
    movzx           r5d, word [r2 + 64]
    mov             [rsp + 16], r5w             ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    add             r1, r1                      ; stride *= 2
    lea             r6, [ang_table + 13 * 16]   ; table row 13 (each row is 16 bytes)
    lea             r3, [r2 + 64]
    mov             [rsp], r3
    movu            m5, [r3 + 16]
    pshufb          m5, [pw_ang16_16]
    movu            m6, [r3 + 4]
    pshufb          m6, [pw_ang16_16]
    punpckhqdq      m5, m6
    lea             r3, [r3 + 24]
    xor             r4, r4
    inc             r4                          ; kernel flag = 1

    call            ang16_mode_16_20

    movu            m5, [r2]
    lea             r2, [r2 + 16]
    mov             r3, [rsp]
    lea             r0, [r0 + 16]
    xor             r4, r4
    inc             r4                          ; kernel flag = 1

    call            ang16_mode_16_20

    ; r3 was reloaded with src + 64 before the second call; the restore below
    ; relies on the kernel leaving r3 unchanged
    mov             r6d, [rsp + 16]
    mov             [r3], r6w
    RET
10050 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_17 (SSE4): 16x16 angular prediction, mode 17.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Stack layout: [rsp] = original src pointer (handed to the kernel in r3 for the
; second pass), [rsp + 16] = saved word from src + 64.
; The word at src + 64 is temporarily replaced by the word at src and restored before
; returning. The ang16_mode_17_19 kernel is run twice with r4 = 0, the table pointer
; in r6, extra reference samples in m5 and a second source pointer in r3.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_17, 3,7,8,0-(1*mmsize+4)
    movzx           r5d, word [r2 + 64]
    mov             [rsp + 16], r5w             ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    lea             r6, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    add             r1, r1                      ; stride *= 2
    mov             [rsp], r2
    lea             r3, [r2 + 20]
    movu            m5, [r2 + 12]
    pshufb          m5, [pw_ang16_16]
    movu            m6, [r2 + 2]
    pshufb          m6, [pw_ang16_16]
    punpckhqdq      m5, m6
    add             r2, 64
    xor             r4, r4                      ; kernel flag = 0

    call            ang16_mode_17_19

    movu            m5, [r2]
    lea             r2, [r2 + 16]
    mov             r3, [rsp]
    lea             r0, [r0 + r1 * 8]
    xor             r4, r4                      ; kernel flag = 0

    call            ang16_mode_17_19

    ; r2 was advanced by 64 + 16 above, so r2 - 16 is the patched location
    ; (this relies on the kernel leaving r2 unchanged)
    mov             r6d, [rsp + 16]
    mov             [r2 - 16], r6w
    RET
10082 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_19 (SSE4): 16x16 angular prediction, mode 19.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Stack layout: [rsp] = src + 64 (handed to the kernel in r3 for the second pass),
; [rsp + 16] = saved word from src + 64.
; The word at src + 64 is temporarily replaced by the word at src and restored before
; returning. The ang16_mode_17_19 kernel is run twice with r4 = 1, the table pointer
; in r6, extra reference samples in m5 and a second source pointer in r3.
;------------------------------------------------------------------------------------------
cglobal intra_pred_ang16_19, 3,7,8,0-(1*mmsize+4)
    movzx           r5d, word [r2 + 64]
    mov             [rsp + 16], r5w             ; save the word being overwritten
    movzx           r6d, word [r2]
    mov             [r2 + 64], r6w

    add             r1, r1                      ; stride *= 2
    lea             r6, [ang_table + 16 * 16]   ; table row 16 (each row is 16 bytes)
    lea             r3, [r2 + 64]
    mov             [rsp], r3
    movu            m5, [r3 + 12]
    pshufb          m5, [pw_ang16_16]
    movu            m6, [r3 + 2]
    pshufb          m6, [pw_ang16_16]
    punpckhqdq      m5, m6
    lea             r3, [r3 + 20]
    xor             r4, r4
    inc             r4                          ; kernel flag = 1

    call            ang16_mode_17_19

    movu            m5, [r2]
    lea             r2, [r2 + 16]
    mov             r3, [rsp]
    lea             r0, [r0 + 16]
    xor             r4, r4
    inc             r4                          ; kernel flag = 1

    call            ang16_mode_17_19

    ; r3 was reloaded with src + 64 before the second call; the restore below
    ; relies on the kernel leaving r3 unchanged
    mov             r6d, [rsp + 16]
    mov             [r3], r6w
    RET
10116 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_18 (SSE4): 16x16 prediction, mode 18.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src
; Row 0 is the 32 bytes at src. Every following row is the previous one moved one
; word to the right, with the vacated leading word taken from the src + 64 array
; after it has been shuffled with pw_swap16 (presumably a word-order reversal).
;------------------------------------------------------------------------------------------
%macro ANG16_18_ROW 5 ; %1 = row address, %2/%3/%4 = low/middle/high source regs, %5 = byte shift
    palignr         m2, %3, %2, %5
    movu            [%1], m2
    palignr         m2, %4, %3, %5
    movu            [%1 + 16], m2
%endmacro

cglobal intra_pred_ang16_18, 3,5,4
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]
    movu            m0, [r2 + 2 + 64]
    pshufb          m0, [pw_swap16]
    movu            m1, [r2]
    movu            m3, [r2 + 16]

    ; rows 0-3
    movu            [r0], m1
    movu            [r0 + 16], m3
    ANG16_18_ROW    r0 + r1, m0, m1, m3, 14
    ANG16_18_ROW    r0 + r1 * 2, m0, m1, m3, 12
    ANG16_18_ROW    r0 + r4, m0, m1, m3, 10

    ; rows 4-7
    lea             r0, [r0 + r1 * 4]
    ANG16_18_ROW    r0, m0, m1, m3, 8
    ANG16_18_ROW    r0 + r1, m0, m1, m3, 6
    ANG16_18_ROW    r0 + r1 * 2, m0, m1, m3, 4
    palignr         m2, m1, m0, 2
    movu            [r0 + r4], m2
    palignr         m3, m1, 2                   ; m3 is dead after this row, shift it in place
    movu            [r0 + r4 + 16], m3

    ; rows 8-11: m3 is reloaded with the next 8 shuffled words of the src + 64 array
    lea             r0, [r0 + r1 * 4]
    movu            [r0], m0
    movu            [r0 + 16], m1
    movu            m3, [r2 + 18 + 64]
    pshufb          m3, [pw_swap16]
    ANG16_18_ROW    r0 + r1, m3, m0, m1, 14
    ANG16_18_ROW    r0 + r1 * 2, m3, m0, m1, 12
    ANG16_18_ROW    r0 + r4, m3, m0, m1, 10

    ; rows 12-15
    lea             r0, [r0 + r1 * 4]
    ANG16_18_ROW    r0, m3, m0, m1, 8
    ANG16_18_ROW    r0 + r1, m3, m0, m1, 6
    ANG16_18_ROW    r0 + r1 * 2, m3, m0, m1, 4
    palignr         m2, m0, m3, 2
    movu            [r0 + r4], m2
    palignr         m1, m0, 2                   ; m1 is dead after this row, shift it in place
    movu            [r0 + r4 + 16], m1
    RET
10193 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_10 (SSE4): 16x16 prediction, mode 10.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src, r4m = bFilter
; Row k of the output is word k of the array at src + 64 + 2 replicated across the
; whole row (pb_01 broadcasts word 0 of its source).
; When bFilter is non-zero, row 0 is replaced by
;   clip(row0 + ((w[i] - w0) >> 1), 0, pw_pixel_max)
; where w[i] are the 16 words at src + 2 and w0 is the word at src.
;------------------------------------------------------------------------------------------
%macro ANG16_10_ROW 2 ; %1 = source register (moved down one word first), %2 = row address
    psrldq          %1, 2
    pshufb          m2, %1, [pb_01]
    movu            [%2], m2
    movu            [%2 + 16], m2
%endmacro

cglobal intra_pred_ang16_10, 3,6,4
    mov             r5d, r4m                    ; bFilter; fetched before r4 is reused below
    movu            m1, [r2 + 2 + 64]           ; [8 7 6 5 4 3 2 1]
    movu            m3, [r2 + 18 + 64]          ; [16 15 14 13 12 11 10 9]
    pshufb          m0, m1, [pb_01]             ; row 0, stored last at .quit
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    ; rows 1-3
    ANG16_10_ROW    m1, r0 + r1
    ANG16_10_ROW    m1, r0 + r1 * 2
    ANG16_10_ROW    m1, r0 + r4

    ; rows 4-7
    lea             r3, [r0 + r1 * 4]
    ANG16_10_ROW    m1, r3
    ANG16_10_ROW    m1, r3 + r1
    ANG16_10_ROW    m1, r3 + r1 * 2
    ANG16_10_ROW    m1, r3 + r4

    ; rows 8-11: row 8 uses word 0 of m3 as loaded, no shift needed
    lea             r3, [r3 + r1 * 4]
    pshufb          m2, m3, [pb_01]
    movu            [r3], m2
    movu            [r3 + 16], m2
    ANG16_10_ROW    m3, r3 + r1
    ANG16_10_ROW    m3, r3 + r1 * 2
    ANG16_10_ROW    m3, r3 + r4

    ; rows 12-15
    lea             r3, [r3 + r1 * 4]
    ANG16_10_ROW    m3, r3
    ANG16_10_ROW    m3, r3 + r1
    ANG16_10_ROW    m3, r3 + r1 * 2
    ANG16_10_ROW    m3, r3 + r4

    mova            m3, m0                      ; unfiltered row 0: both halves are the broadcast
    test            r5d, r5d
    jz              .quit

    ; filter row 0
    pinsrw          m1, [r2], 0
    pshufb          m2, m1, [pb_01]             ; word at src broadcast to every lane
    movu            m1, [r2 + 2]
    psubw           m1, m2
    psraw           m1, 1
    movu            m3, [r2 + 18]
    psubw           m3, m2
    psraw           m3, 1
    paddw           m3, m0                      ; must use m0 before it is updated below
    paddw           m0, m1
    pxor            m1, m1
    pmaxsw          m0, m1
    pminsw          m0, [pw_pixel_max]
    pmaxsw          m3, m1
    pminsw          m3, [pw_pixel_max]
.quit:
    movu            [r0], m0
    movu            [r0 + 16], m3
    RET
10292 | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_26 (SSE4): 16x16 prediction, mode 26.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src, r4m = bFilter
; Every output row is a copy of the 32 bytes at src + 2.
; When bFilter is non-zero, the first word of row k is replaced by
;   clip(a + ((w[k] - w0) >> 1), 0, pw_pixel_max)
; where a is the word at src + 2, w[k] are the 16 words at src + 64 + 2 and w0 is the
; word at src.
;------------------------------------------------------------------------------------------
%macro ANG16_26_ROWS4 1 ; %1 = address register of the first of four consecutive rows
    movu            [%1], m0
    movu            [%1 + 16], m3
    movu            [%1 + r1], m0
    movu            [%1 + r1 + 16], m3
    movu            [%1 + r1 * 2], m0
    movu            [%1 + r1 * 2 + 16], m3
    movu            [%1 + r4], m0
    movu            [%1 + r4 + 16], m3
%endmacro

cglobal intra_pred_ang16_26, 3,6,4
    mov             r5d, r4m                    ; bFilter; fetched before r4 is reused below
    movu            m0, [r2 + 2]                ; [8 7 6 5 4 3 2 1]
    movu            m3, [r2 + 18]               ; [16 15 14 13 12 11 10 9]
    add             r1, r1                      ; stride *= 2
    lea             r4, [r1 * 3]

    ANG16_26_ROWS4  r0                          ; rows 0-3

    lea             r3, [r0 + r1 * 4]
    ANG16_26_ROWS4  r3                          ; rows 4-7

    lea             r3, [r3 + r1 * 4]
    ANG16_26_ROWS4  r3                          ; rows 8-11

    lea             r3, [r3 + r1 * 4]
    ANG16_26_ROWS4  r3                          ; rows 12-15, r3 keeps pointing at row 12

    test            r5d, r5d
    jz              .quit

    ; filter the first column
    pshufb          m0, [pb_01]                 ; word at src + 2 broadcast to every lane
    pinsrw          m1, [r2], 0
    pshufb          m2, m1, [pb_01]             ; word at src broadcast to every lane
    movu            m1, [r2 + 2 + 64]
    psubw           m1, m2
    psraw           m1, 1
    movu            m3, [r2 + 18 + 64]
    psubw           m3, m2
    psraw           m3, 1
    paddw           m3, m0                      ; must use m0 before it is updated below
    paddw           m0, m1
    pxor            m1, m1
    pmaxsw          m0, m1
    pminsw          m0, [pw_pixel_max]
    pmaxsw          m3, m1
    pminsw          m3, [pw_pixel_max]

    ; scatter one filtered word to the start of each row
    pextrw          [r0], m0, 0
    pextrw          [r0 + r1], m0, 1
    pextrw          [r0 + r1 * 2], m0, 2
    pextrw          [r0 + r4], m0, 3
    lea             r0, [r0 + r1 * 4]
    pextrw          [r0], m0, 4
    pextrw          [r0 + r1], m0, 5
    pextrw          [r0 + r1 * 2], m0, 6
    pextrw          [r0 + r4], m0, 7
    lea             r0, [r0 + r1 * 4]
    pextrw          [r0], m3, 0
    pextrw          [r0 + r1], m3, 1
    pextrw          [r0 + r1 * 2], m3, 2
    pextrw          [r0 + r4], m3, 3
    pextrw          [r3], m3, 4                 ; r3 still addresses row 12
    pextrw          [r3 + r1], m3, 5
    pextrw          [r3 + r1 * 2], m3, 6
    pextrw          [r3 + r4], m3, 7
.quit:
    RET
10380 | |
10381 ;------------------------------------------------------------------------------------------------------- | |
10382 ; avx2 code for intra_pred_ang16 mode 2 to 34 start | |
10383 ;------------------------------------------------------------------------------------------------------- | |
;------------------------------------------------------------------------------------------
; intra_pred_ang16_2 (AVX2): 16x16 prediction shared by dirMode 2 and dirMode 34.
;   r0 = dst, r1 = dstStride (doubled below), r2 = src, r3m = dirMode
; The neighbour array is src itself when dirMode == 34 and src + 64 bytes otherwise.
; Output row k is the 16 words starting 4 + 2 * k bytes into that array.
;
; m1 is loaded only 16 bytes after m0 on purpose: palignr on ymm registers works
; on each 128-bit lane separately, so pairing lane i of m0 with lane i of m1 yields
; one contiguous 32-byte window per shift.
;
; The stride is doubled at full register width. dstStride is an intptr_t, and a
; 32-bit add would zero the upper half of r1 on x86-64 before it is used in the
; address computations below.
;------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang16_2, 3,5,3
    mov             r4, r2                      ; unshifted pointer, used for mode 34
    add             r2, 64
    cmp             r3m, byte 34
    cmove           r2, r4
    add             r1, r1                      ; stride *= 2 (full pointer width)
    lea             r3, [r1 * 3]
    movu            m0, [r2 + 4]
    movu            m1, [r2 + 20]

    ; rows 0-3
    movu            [r0], m0
    palignr         m2, m1, m0, 2
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 4
    movu            [r0 + r1 * 2], m2
    palignr         m2, m1, m0, 6
    movu            [r0 + r3], m2

    ; rows 4-7
    lea             r0, [r0 + r1 * 4]
    palignr         m2, m1, m0, 8
    movu            [r0], m2
    palignr         m2, m1, m0, 10
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 12
    movu            [r0 + r1 * 2], m2
    palignr         m2, m1, m0, 14
    movu            [r0 + r3], m2

    ; rows 8-11: m1 is row 8 as loaded, m0 becomes the window 16 bytes after it
    movu            m0, [r2 + 36]
    lea             r0, [r0 + r1 * 4]
    movu            [r0], m1
    palignr         m2, m0, m1, 2
    movu            [r0 + r1], m2
    palignr         m2, m0, m1, 4
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m1, 6
    movu            [r0 + r3], m2

    ; rows 12-15
    lea             r0, [r0 + r1 * 4]
    palignr         m2, m0, m1, 8
    movu            [r0], m2
    palignr         m2, m0, m1, 10
    movu            [r0 + r1], m2
    palignr         m2, m0, m1, 12
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m1, 14
    movu            [r0 + r3], m2
    RET
10433 | |
;------------------------------------------------------------------------------------------
; TRANSPOSE_STORE_AVX2 v1, v2, v3, v4, v5, v6, v7, v8, tmpA, tmpB, offset
;   %1-%8  : indices of the eight ymm registers holding the vectors to store
;   %9,%10 : indices of two scratch ymm registers (overwritten on the transposed path)
;   %11    : byte offset added to every address on the transposed path; it is also
;            the suffix of the local labels .skip%11 / .end%11, so each use inside
;            one function needs a distinct value
;
; The macro starts with a jnz, so it branches on flags the caller has set earlier.
;   ZF set   : the eight inputs are transposed as 16-bit words and written as 16
;              rows of 16 bytes each; the low 128-bit halves go to rows 0-7 and the
;              high halves to rows 8-15. r0 is left unchanged, r5 is clobbered.
;              All of %1-%8 are overwritten.
;   ZF clear : the eight registers are written unchanged as eight 32-byte rows
;              (without the %11 offset) and r0 is advanced by 8 * r1.
; r1 is the row pitch in bytes and r4 supplies the offset of the fourth row of each
; group of four (the callers in this file set r4 = r1 * 3).
;------------------------------------------------------------------------------------------
10434 %macro TRANSPOSE_STORE_AVX2 11 | |
10435 jnz .skip%11 | |
; stage 1: interleave words of (%1,%2) and (%3,%4), then dwords of the results
10436 punpckhwd m%9, m%1, m%2 | |
10437 punpcklwd m%1, m%2 | |
10438 punpckhwd m%2, m%3, m%4 | |
10439 punpcklwd m%3, m%4 | |
10440 | |
10441 punpckldq m%4, m%1, m%3 | |
10442 punpckhdq m%1, m%3 | |
10443 punpckldq m%3, m%9, m%2 | |
10444 punpckhdq m%9, m%2 | |
10445 | |
; stage 2: the same for (%5,%6) and (%7,%8), using %10 as the scratch register
10446 punpckhwd m%10, m%5, m%6 | |
10447 punpcklwd m%5, m%6 | |
10448 punpckhwd m%6, m%7, m%8 | |
10449 punpcklwd m%7, m%8 | |
10450 | |
10451 punpckldq m%8, m%5, m%7 | |
10452 punpckhdq m%5, m%7 | |
10453 punpckldq m%7, m%10, m%6 | |
10454 punpckhdq m%10, m%6 | |
10455 | |
; stage 3: combine both halves as qwords; output rows end up in the order
; %6, %2, %4, %8, %1, %5, %3, %7
10456 punpcklqdq m%6, m%4, m%8 | |
10457 punpckhqdq m%2, m%4, m%8 | |
10458 punpcklqdq m%4, m%1, m%5 | |
10459 punpckhqdq m%8, m%1, m%5 | |
10460 | |
10461 punpcklqdq m%1, m%3, m%7 | |
10462 punpckhqdq m%5, m%3, m%7 | |
10463 punpcklqdq m%3, m%9, m%10 | |
10464 punpckhqdq m%7, m%9, m%10 | |
10465 | |
; rows 0-3: low 128-bit halves
10466 movu [r0 + r1 * 0 + %11], xm%6 | |
10467 movu [r0 + r1 * 1 + %11], xm%2 | |
10468 movu [r0 + r1 * 2 + %11], xm%4 | |
10469 movu [r0 + r4 * 1 + %11], xm%8 | |
10470 | |
; rows 4-7: low 128-bit halves; r5 walks down so that r0 stays untouched
10471 lea r5, [r0 + r1 * 4] | |
10472 movu [r5 + r1 * 0 + %11], xm%1 | |
10473 movu [r5 + r1 * 1 + %11], xm%5 | |
10474 movu [r5 + r1 * 2 + %11], xm%3 | |
10475 movu [r5 + r4 * 1 + %11], xm%7 | |
10476 | |
; rows 8-11: high 128-bit halves
10477 lea r5, [r5 + r1 * 4] | |
10478 vextracti128 [r5 + r1 * 0 + %11], m%6, 1 | |
10479 vextracti128 [r5 + r1 * 1 + %11], m%2, 1 | |
10480 vextracti128 [r5 + r1 * 2 + %11], m%4, 1 | |
10481 vextracti128 [r5 + r4 * 1 + %11], m%8, 1 | |
10482 | |
; rows 12-15: high 128-bit halves
10483 lea r5, [r5 + r1 * 4] | |
10484 vextracti128 [r5 + r1 * 0 + %11], m%1, 1 | |
10485 vextracti128 [r5 + r1 * 1 + %11], m%5, 1 | |
10486 vextracti128 [r5 + r1 * 2 + %11], m%3, 1 | |
10487 vextracti128 [r5 + r4 * 1 + %11], m%7, 1 | |
10488 jmp .end%11 | |
; direct path: store the eight registers as full 32-byte rows and advance r0
10489 .skip%11: | |
10490 movu [r0 + r1 * 0], m%1 | |
10491 movu [r0 + r1 * 1], m%2 | |
10492 movu [r0 + r1 * 2], m%3 | |
10493 movu [r0 + r4 * 1], m%4 | |
10494 | |
10495 lea r0, [r0 + r1 * 4] | |
10496 movu [r0 + r1 * 0], m%5 | |
10497 movu [r0 + r1 * 1], m%6 | |
10498 movu [r0 + r1 * 2], m%7 | |
10499 movu [r0 + r4 * 1], m%8 | |
10500 lea r0, [r0 + r1 * 4] | |
10501 .end%11: | |
10502 %endmacro | |
10503 | |
10504 ;; angle 16, modes 3 and 33 | |
10505 cglobal ang16_mode_3_33 | |
10506 test r6d, r6d | |
10507 | |
10508 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
10509 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
10510 | |
10511 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
10512 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
10513 | |
10514 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
10515 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
10516 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
10517 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
10518 | |
10519 pmaddwd m4, m3, [r3 + 10 * 32] ; [26] | |
10520 paddd m4, [pd_16] | |
10521 psrld m4, 5 | |
10522 pmaddwd m5, m0, [r3 + 10 * 32] | |
10523 paddd m5, [pd_16] | |
10524 psrld m5, 5 | |
10525 packusdw m4, m5 | |
10526 | |
10527 palignr m5, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] | |
10528 pmaddwd m5, [r3 + 4 * 32] ; [20] | |
10529 paddd m5, [pd_16] | |
10530 psrld m5, 5 | |
10531 palignr m6, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] | |
10532 pmaddwd m6, [r3 + 4 * 32] | |
10533 paddd m6, [pd_16] | |
10534 psrld m6, 5 | |
10535 packusdw m5, m6 | |
10536 | |
10537 palignr m6, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] | |
10538 pmaddwd m6, [r3 - 2 * 32] ; [14] | |
10539 paddd m6, [pd_16] | |
10540 psrld m6, 5 | |
10541 palignr m7, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] | |
10542 pmaddwd m7, [r3 - 2 * 32] | |
10543 paddd m7, [pd_16] | |
10544 psrld m7, 5 | |
10545 packusdw m6, m7 | |
10546 | |
10547 palignr m7, m0, m3, 12 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
10548 pmaddwd m7, [r3 - 8 * 32] ; [8] | |
10549 paddd m7, [pd_16] | |
10550 psrld m7, 5 | |
10551 palignr m8, m2, m0, 12 ; [20 19 19 18 18 17 17 16 12 11 11 10 10 9 9 8] | |
10552 pmaddwd m8, [r3 - 8 * 32] | |
10553 paddd m8, [pd_16] | |
10554 psrld m8, 5 | |
10555 packusdw m7, m8 | |
10556 | |
10557 pmaddwd m8, m0, [r3 - 14 * 32] ; [2] | |
10558 paddd m8, [pd_16] | |
10559 psrld m8, 5 | |
10560 pmaddwd m3, m2, [r3 - 14 * 32] ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
10561 paddd m3, [pd_16] | |
10562 psrld m3, 5 | |
10563 packusdw m8, m3 | |
10564 | |
10565 pmaddwd m9, m0, [r3 + 12 * 32] ; [28] | |
10566 paddd m9, [pd_16] | |
10567 psrld m9, 5 | |
10568 pmaddwd m3, m2, [r3 + 12 * 32] ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
10569 paddd m3, [pd_16] | |
10570 psrld m3, 5 | |
10571 packusdw m9, m3 | |
10572 | |
10573 palignr m10, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] | |
10574 pmaddwd m10, [r3 + 6 * 32] ; [22] | |
10575 paddd m10, [pd_16] | |
10576 psrld m10, 5 | |
10577 palignr m3, m1, m2, 4 ; [22 21 21 20 20 19 19 18 14 13 13 12 12 11 11 10] | |
10578 pmaddwd m3, [r3 + 6 * 32] | |
10579 paddd m3, [pd_16] | |
10580 psrld m3, 5 | |
10581 packusdw m10, m3 | |
10582 | |
10583 palignr m11, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] | |
10584 pmaddwd m11, [r3] ; [16] | |
10585 paddd m11, [pd_16] | |
10586 psrld m11, 5 | |
10587 palignr m3, m1, m2, 8 ; [23 22 22 21 21 20 20 19 15 14 14 13 13 12 12 11] | |
10588 pmaddwd m3, [r3] | |
10589 paddd m3, [pd_16] | |
10590 psrld m3, 5 | |
10591 packusdw m11, m3 | |
10592 | |
10593 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 | |
10594 | |
10595 palignr m4, m2, m0, 12 ; [20 19 19 18 18 17 17 16 12 11 11 10 10 9 9 8] | |
10596 pmaddwd m4, [r3 - 6 * 32] ; [10] | |
10597 paddd m4, [pd_16] | |
10598 psrld m4, 5 | |
10599 palignr m5, m1, m2, 12 ; [24 23 23 22 22 21 21 20 16 15 15 14 14 13 13 12] | |
10600 pmaddwd m5, [r3 - 6 * 32] | |
10601 paddd m5, [pd_16] | |
10602 psrld m5, 5 | |
10603 packusdw m4, m5 | |
10604 | |
10605 pmaddwd m5, m2, [r3 - 12 * 32] ; [4] | |
10606 paddd m5, [pd_16] | |
10607 psrld m5, 5 | |
10608 pmaddwd m6, m1, [r3 - 12 * 32] | |
10609 paddd m6, [pd_16] | |
10610 psrld m6, 5 | |
10611 packusdw m5, m6 | |
10612 | |
10613 movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
10614 pmaddwd m6, m2, [r3 + 14 * 32] ; [30] | |
10615 paddd m6, [pd_16] | |
10616 psrld m6, 5 | |
10617 pmaddwd m7, m1, [r3 + 14 * 32] | |
10618 paddd m7, [pd_16] | |
10619 psrld m7, 5 | |
10620 packusdw m6, m7 | |
10621 | |
10622 palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18] | |
10623 punpcklwd m0, m3 ; [29 28 28 27 27 26 26 25 21 20 20 19 19 18 18 17] | |
10624 | |
10625 palignr m7, m1, m2, 4 | |
10626 pmaddwd m7, [r3 + 8 * 32] ; [24] | |
10627 paddd m7, [pd_16] | |
10628 psrld m7, 5 | |
10629 palignr m8, m0, m1, 4 | |
10630 pmaddwd m8, [r3 + 8 * 32] | |
10631 paddd m8, [pd_16] | |
10632 psrld m8, 5 | |
10633 packusdw m7, m8 | |
10634 | |
10635 palignr m8, m1, m2, 8 | |
10636 pmaddwd m8, [r3 + 2 * 32] ; [18] | |
10637 paddd m8, [pd_16] | |
10638 psrld m8, 5 | |
10639 palignr m9, m0, m1, 8 | |
10640 pmaddwd m9, [r3 + 2 * 32] | |
10641 paddd m9, [pd_16] | |
10642 psrld m9, 5 | |
10643 packusdw m8, m9 | |
10644 | |
10645 palignr m9, m1, m2, 12 | |
10646 pmaddwd m9, [r3 - 4 * 32] ; [12] | |
10647 paddd m9, [pd_16] | |
10648 psrld m9, 5 | |
10649 palignr m3, m0, m1, 12 | |
10650 pmaddwd m3, [r3 - 4 * 32] | |
10651 paddd m3, [pd_16] | |
10652 psrld m3, 5 | |
10653 packusdw m9, m3 | |
10654 | |
10655 pmaddwd m1, [r3 - 10 * 32] ; [6] | |
10656 paddd m1, [pd_16] | |
10657 psrld m1, 5 | |
10658 pmaddwd m0, [r3 - 10 * 32] | |
10659 paddd m0, [pd_16] | |
10660 psrld m0, 5 | |
10661 packusdw m1, m0 | |
10662 | |
10663 movu m2, [r2 + 28] | |
10664 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 2, 0, 3, 16 | |
10665 ret | |
10666 | |
10667 ;; angle 16, modes 4 and 32 | |
; Builds sixteen rows of sixteen 16-bit values from the reference samples at r2
; and hands them to TRANSPOSE_STORE_AVX2 in two batches of eight (last macro
; argument 0, then 16).
; Per row: pmaddwd of interleaved neighbouring samples with one 32-byte weight
; row at r3 + k * 32, then + pd_16 and a logical shift right by 5; the two
; 8-dword halves are packed back to words with unsigned saturation.
; The bracketed number on each row's first pmaddwd is the weight-row label N;
; in this function label N is always addressed as r3 + (N - 18) * 32.
; NOTE(review): r3 presumably points into ang_table_avx2 and is set up by the
; caller, as are r2 and r6d - confirm against the call site.
10668 cglobal ang16_mode_4_32 | |
; The flags produced here are not consumed by any instruction visible in this
; body (the SIMD instructions below do not write EFLAGS); presumably
; TRANSPOSE_STORE_AVX2 branches on them - confirm against the macro.
10669 test r6d, r6d | |
10670 | |
10671 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
10672 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
10673 | |
10674 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
10675 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
10676 | |
10677 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
10678 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
10679 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
10680 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
10681 | |
; m3/m0/m2/m1 now hold consecutive (sample, next sample) pairs; palignr by
; 4/8/12 bytes across neighbouring registers advances the window by 1/2/3 samples.
10682 pmaddwd m4, m3, [r3 + 3 * 32] ; [21] | |
10683 paddd m4, [pd_16] | |
10684 psrld m4, 5 | |
10685 pmaddwd m5, m0, [r3 + 3 * 32] | |
10686 paddd m5, [pd_16] | |
10687 psrld m5, 5 | |
10688 packusdw m4, m5 | |
10689 | |
; The shifted pair registers m6/m7 are kept so that rows [10] and [31] can share them.
10690 palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] | |
10691 pmaddwd m5, m6, [r3 - 8 * 32] ; [10] | |
10692 paddd m5, [pd_16] | |
10693 psrld m5, 5 | |
10694 palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] | |
10695 pmaddwd m8, m7, [r3 - 8 * 32] | |
10696 paddd m8, [pd_16] | |
10697 psrld m8, 5 | |
10698 packusdw m5, m8 | |
10699 | |
10700 pmaddwd m6, [r3 + 13 * 32] ; [31] | |
10701 paddd m6, [pd_16] | |
10702 psrld m6, 5 | |
10703 pmaddwd m7, [r3 + 13 * 32] | |
10704 paddd m7, [pd_16] | |
10705 psrld m7, 5 | |
10706 packusdw m6, m7 | |
10707 | |
10708 palignr m7, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] | |
10709 pmaddwd m7, [r3 + 2 * 32] ; [20] | |
10710 paddd m7, [pd_16] | |
10711 psrld m7, 5 | |
10712 palignr m8, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] | |
10713 pmaddwd m8, [r3 + 2 * 32] | |
10714 paddd m8, [pd_16] | |
10715 psrld m8, 5 | |
10716 packusdw m7, m8 | |
10717 | |
; m3 is overwritten below; the original low pairs (samples 1..5 / 9..13) are
; not read again after this point.
10718 palignr m9, m0, m3, 12 | |
10719 pmaddwd m8, m9, [r3 - 9 * 32] ; [9] | |
10720 paddd m8, [pd_16] | |
10721 psrld m8, 5 | |
10722 palignr m3, m2, m0, 12 | |
10723 pmaddwd m10, m3, [r3 - 9 * 32] | |
10724 paddd m10, [pd_16] | |
10725 psrld m10, 5 | |
10726 packusdw m8, m10 | |
10727 | |
10728 pmaddwd m9, [r3 + 12 * 32] ; [30] | |
10729 paddd m9, [pd_16] | |
10730 psrld m9, 5 | |
10731 pmaddwd m3, [r3 + 12 * 32] | |
10732 paddd m3, [pd_16] | |
10733 psrld m3, 5 | |
10734 packusdw m9, m3 | |
10735 | |
10736 pmaddwd m10, m0, [r3 + 1 * 32] ; [19] | |
10737 paddd m10, [pd_16] | |
10738 psrld m10, 5 | |
10739 pmaddwd m3, m2, [r3 + 1 * 32] | |
10740 paddd m3, [pd_16] | |
10741 psrld m3, 5 | |
10742 packusdw m10, m3 | |
10743 | |
10744 palignr m11, m2, m0, 4 | |
10745 pmaddwd m11, [r3 - 10 * 32] ; [8] | |
10746 paddd m11, [pd_16] | |
10747 psrld m11, 5 | |
10748 palignr m3, m1, m2, 4 | |
10749 pmaddwd m3, [r3 - 10 * 32] | |
10750 paddd m3, [pd_16] | |
10751 psrld m3, 5 | |
10752 packusdw m11, m3 | |
10753 | |
; Rows 1-8 are in m4..m11. m12 is never written in this function and m3 holds
; an already-consumed temporary, so arguments 9 and 10 are presumably scratch
; registers for the macro - confirm against its definition.
10754 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 | |
10755 | |
10756 palignr m4, m2, m0, 4 | |
10757 pmaddwd m4, [r3 + 11 * 32] ; [29] | |
10758 paddd m4, [pd_16] | |
10759 psrld m4, 5 | |
10760 palignr m5, m1, m2, 4 | |
10761 pmaddwd m5, [r3 + 11 * 32] | |
10762 paddd m5, [pd_16] | |
10763 psrld m5, 5 | |
10764 packusdw m4, m5 | |
10765 | |
10766 palignr m5, m2, m0, 8 | |
10767 pmaddwd m5, [r3] ; [18] | |
10768 paddd m5, [pd_16] | |
10769 psrld m5, 5 | |
10770 palignr m6, m1, m2, 8 | |
10771 pmaddwd m6, [r3] | |
10772 paddd m6, [pd_16] | |
10773 psrld m6, 5 | |
10774 packusdw m5, m6 | |
10775 | |
10776 palignr m7, m2, m0, 12 | |
10777 pmaddwd m6, m7, [r3 - 11 * 32] ; [7] | |
10778 paddd m6, [pd_16] | |
10779 psrld m6, 5 | |
10780 palignr m8, m1, m2, 12 | |
10781 pmaddwd m3, m8, [r3 - 11 * 32] | |
10782 paddd m3, [pd_16] | |
10783 psrld m3, 5 | |
10784 packusdw m6, m3 | |
10785 | |
10786 pmaddwd m7, [r3 + 10 * 32] ; [28] | |
10787 paddd m7, [pd_16] | |
10788 psrld m7, 5 | |
10789 pmaddwd m8, [r3 + 10 * 32] | |
10790 paddd m8, [pd_16] | |
10791 psrld m8, 5 | |
10792 packusdw m7, m8 | |
10793 | |
; The old m0 pairs are not needed by any later row, so m0 is reused for
; samples 17..32.
10794 movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
10795 pmaddwd m8, m2, [r3 - 1 * 32] ; [17] | |
10796 paddd m8, [pd_16] | |
10797 psrld m8, 5 | |
10798 pmaddwd m9, m1, [r3 - 1 * 32] | |
10799 paddd m9, [pd_16] | |
10800 psrld m9, 5 | |
10801 packusdw m8, m9 | |
10802 | |
; Rotating each 128-bit lane by one word and interleaving the low halves yields
; the next set of (sample, next sample) pairs; the wrapped-around word (x) lands
; in the upper half of the lane, which punpcklwd does not read.
10803 palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18] | |
10804 punpcklwd m0, m3 ; [29 28 28 27 27 26 26 25 21 20 20 19 19 18 18 17] | |
10805 | |
10806 palignr m10, m1, m2, 4 | |
10807 pmaddwd m9, m10, [r3 - 12 * 32] ; [6] | |
10808 paddd m9, [pd_16] | |
10809 psrld m9, 5 | |
10810 palignr m11, m0, m1, 4 | |
10811 pmaddwd m3, m11, [r3 - 12 * 32] | |
10812 paddd m3, [pd_16] | |
10813 psrld m3, 5 | |
10814 packusdw m9, m3 | |
10815 | |
10816 pmaddwd m10, [r3 + 9 * 32] ; [27] | |
10817 paddd m10, [pd_16] | |
10818 psrld m10, 5 | |
10819 pmaddwd m11, [r3 + 9 * 32] | |
10820 paddd m11, [pd_16] | |
10821 psrld m11, 5 | |
10822 packusdw m10, m11 | |
10823 | |
10824 palignr m3, m1, m2, 8 | |
10825 pmaddwd m3, [r3 - 2 * 32] ; [16] | |
10826 paddd m3, [pd_16] | |
10827 psrld m3, 5 | |
10828 palignr m0, m1, 8 | |
10829 pmaddwd m0, [r3 - 2 * 32] | |
10830 paddd m0, [pd_16] | |
10831 psrld m0, 5 | |
10832 packusdw m3, m0 | |
; Rows 9-16 are in m4..m10 and m3; m0 and m1 are no longer needed.
10833 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16 | |
10834 ret | |
10835 | |
10836 ;; angle 16, modes 5 and 31 | |
; Builds sixteen rows of sixteen 16-bit values from the reference samples at r2
; and hands them to TRANSPOSE_STORE_AVX2 in two batches of eight (last macro
; argument 0, then 16).
; Per row: pmaddwd of interleaved neighbouring samples with one 32-byte weight
; row at r3 + k * 32, then + pd_16 and a logical shift right by 5; the two
; 8-dword halves are packed back to words with unsigned saturation.
; The bracketed number on each row's first pmaddwd is the weight-row label N;
; in this function label N is always addressed as r3 + (N - 16) * 32.
; NOTE(review): r3 presumably points into ang_table_avx2 and is set up by the
; caller, as are r2 and r6d - confirm against the call site.
10837 cglobal ang16_mode_5_31 | |
; The flags produced here are not consumed by any instruction visible in this
; body (the SIMD instructions below do not write EFLAGS); presumably
; TRANSPOSE_STORE_AVX2 branches on them - confirm against the macro.
10838 test r6d, r6d | |
10839 | |
10840 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
10841 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
10842 | |
10843 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
10844 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
10845 | |
10846 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
10847 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
10848 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
10849 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
10850 | |
; m3/m0/m2/m1 now hold consecutive (sample, next sample) pairs; palignr by
; 4/8/12 bytes across neighbouring registers advances the window by 1/2/3 samples.
10851 pmaddwd m4, m3, [r3 + 1 * 32] ; [17] | |
10852 paddd m4, [pd_16] | |
10853 psrld m4, 5 | |
10854 pmaddwd m5, m0, [r3 + 1 * 32] | |
10855 paddd m5, [pd_16] | |
10856 psrld m5, 5 | |
10857 packusdw m4, m5 | |
10858 | |
; From here on rows come in pairs that share one shifted pair of registers:
; the first row of a pair keeps the palignr results, the second consumes them.
10859 palignr m6, m0, m3, 4 | |
10860 pmaddwd m5, m6, [r3 - 14 * 32] ; [2] | |
10861 paddd m5, [pd_16] | |
10862 psrld m5, 5 | |
10863 palignr m7, m2, m0, 4 | |
10864 pmaddwd m8, m7, [r3 - 14 * 32] | |
10865 paddd m8, [pd_16] | |
10866 psrld m8, 5 | |
10867 packusdw m5, m8 | |
10868 | |
10869 pmaddwd m6, [r3 + 3 * 32] ; [19] | |
10870 paddd m6, [pd_16] | |
10871 psrld m6, 5 | |
10872 pmaddwd m7, [r3 + 3 * 32] | |
10873 paddd m7, [pd_16] | |
10874 psrld m7, 5 | |
10875 packusdw m6, m7 | |
10876 | |
10877 palignr m8, m0, m3, 8 | |
10878 pmaddwd m7, m8, [r3 - 12 * 32] ; [4] | |
10879 paddd m7, [pd_16] | |
10880 psrld m7, 5 | |
10881 palignr m9, m2, m0, 8 | |
10882 pmaddwd m10, m9, [r3 - 12 * 32] | |
10883 paddd m10, [pd_16] | |
10884 psrld m10, 5 | |
10885 packusdw m7, m10 | |
10886 | |
10887 pmaddwd m8, [r3 + 5 * 32] ; [21] | |
10888 paddd m8, [pd_16] | |
10889 psrld m8, 5 | |
10890 pmaddwd m9, [r3 + 5 * 32] | |
10891 paddd m9, [pd_16] | |
10892 psrld m9, 5 | |
10893 packusdw m8, m9 | |
10894 | |
; Last read of the original m3 pairs; m3 serves as a temporary afterwards.
10895 palignr m10, m0, m3, 12 | |
10896 pmaddwd m9, m10, [r3 - 10 * 32] ; [6] | |
10897 paddd m9, [pd_16] | |
10898 psrld m9, 5 | |
10899 palignr m11, m2, m0, 12 | |
10900 pmaddwd m3, m11, [r3 - 10 * 32] | |
10901 paddd m3, [pd_16] | |
10902 psrld m3, 5 | |
10903 packusdw m9, m3 | |
10904 | |
10905 pmaddwd m10, [r3 + 7 * 32] ; [23] | |
10906 paddd m10, [pd_16] | |
10907 psrld m10, 5 | |
10908 pmaddwd m11, [r3 + 7 * 32] | |
10909 paddd m11, [pd_16] | |
10910 psrld m11, 5 | |
10911 packusdw m10, m11 | |
10912 | |
10913 pmaddwd m11, m0, [r3 - 8 * 32] ; [8] | |
10914 paddd m11, [pd_16] | |
10915 psrld m11, 5 | |
10916 pmaddwd m3, m2, [r3 - 8 * 32] | |
10917 paddd m3, [pd_16] | |
10918 psrld m3, 5 | |
10919 packusdw m11, m3 | |
10920 | |
; Rows 1-8 are in m4..m11. m12 is never written in this function and m3 holds
; an already-consumed temporary, so arguments 9 and 10 are presumably scratch
; registers for the macro - confirm against its definition.
10921 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 | |
10922 | |
10923 pmaddwd m4, m0, [r3 + 9 * 32] ; [25] | |
10924 paddd m4, [pd_16] | |
10925 psrld m4, 5 | |
10926 pmaddwd m5, m2, [r3 + 9 * 32] | |
10927 paddd m5, [pd_16] | |
10928 psrld m5, 5 | |
10929 packusdw m4, m5 | |
10930 | |
10931 palignr m6, m2, m0, 4 | |
10932 pmaddwd m5, m6, [r3 - 6 * 32] ; [10] | |
10933 paddd m5, [pd_16] | |
10934 psrld m5, 5 | |
10935 palignr m7, m1, m2, 4 | |
10936 pmaddwd m3, m7, [r3 - 6 * 32] | |
10937 paddd m3, [pd_16] | |
10938 psrld m3, 5 | |
10939 packusdw m5, m3 | |
10940 | |
10941 pmaddwd m6, [r3 + 11 * 32] ; [27] | |
10942 paddd m6, [pd_16] | |
10943 psrld m6, 5 | |
10944 pmaddwd m7, [r3 + 11 * 32] | |
10945 paddd m7, [pd_16] | |
10946 psrld m7, 5 | |
10947 packusdw m6, m7 | |
10948 | |
10949 palignr m8, m2, m0, 8 | |
10950 pmaddwd m7, m8, [r3 - 4 * 32] ; [12] | |
10951 paddd m7, [pd_16] | |
10952 psrld m7, 5 | |
10953 palignr m9, m1, m2, 8 | |
10954 pmaddwd m3, m9, [r3 - 4 * 32] | |
10955 paddd m3, [pd_16] | |
10956 psrld m3, 5 | |
10957 packusdw m7, m3 | |
10958 | |
10959 pmaddwd m8, [r3 + 13 * 32] ; [29] | |
10960 paddd m8, [pd_16] | |
10961 psrld m8, 5 | |
10962 pmaddwd m9, [r3 + 13 * 32] | |
10963 paddd m9, [pd_16] | |
10964 psrld m9, 5 | |
10965 packusdw m8, m9 | |
10966 | |
10967 palignr m10, m2, m0, 12 | |
10968 pmaddwd m9, m10, [r3 - 2 * 32] ; [14] | |
10969 paddd m9, [pd_16] | |
10970 psrld m9, 5 | |
10971 palignr m11, m1, m2, 12 | |
10972 pmaddwd m3, m11, [r3 - 2 * 32] | |
10973 paddd m3, [pd_16] | |
10974 psrld m3, 5 | |
10975 packusdw m9, m3 | |
10976 | |
10977 pmaddwd m10, [r3 + 15 * 32] ; [31] | |
10978 paddd m10, [pd_16] | |
10979 psrld m10, 5 | |
10980 pmaddwd m11, [r3 + 15 * 32] | |
10981 paddd m11, [pd_16] | |
10982 psrld m11, 5 | |
10983 packusdw m10, m11 | |
10984 | |
; Final row: m2 and m1 are consumed in place because nothing reads the source
; pairs afterwards.
10985 pmaddwd m2, [r3] ; [16] | |
10986 paddd m2, [pd_16] | |
10987 psrld m2, 5 | |
10988 pmaddwd m1, [r3] | |
10989 paddd m1, [pd_16] | |
10990 psrld m1, 5 | |
10991 packusdw m2, m1 | |
; Rows 9-16 are in m4..m10 and m2; m0 and m1 are no longer needed.
10992 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16 | |
10993 ret | |
10994 | |
10995 ;; angle 16, modes 6 and 30 | |
; Builds sixteen rows of sixteen 16-bit values from the reference samples at r2
; and hands them to TRANSPOSE_STORE_AVX2 in two batches of eight (last macro
; argument 0, then 16).
; Per row: pmaddwd of interleaved neighbouring samples with one 32-byte weight
; row at r3 + k * 32, then + pd_16 and a logical shift right by 5; the two
; 8-dword halves are packed back to words with unsigned saturation.
; The bracketed number on each row's first pmaddwd is the weight-row label N;
; in this function label N is always addressed as r3 + (N - 15) * 32.
; NOTE(review): r3 presumably points into ang_table_avx2 and is set up by the
; caller, as are r2 and r6d - confirm against the call site.
10996 cglobal ang16_mode_6_30 | |
; The flags produced here are not consumed by any instruction visible in this
; body (the SIMD instructions below do not write EFLAGS); presumably
; TRANSPOSE_STORE_AVX2 branches on them - confirm against the macro.
10997 test r6d, r6d | |
10998 | |
10999 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11000 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
11001 | |
11002 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
11003 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
11004 | |
11005 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
11006 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
11007 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
11008 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
11009 | |
; m3/m0/m2/m1 now hold consecutive (sample, next sample) pairs; palignr by
; 4/8/12 bytes across neighbouring registers advances the window by 1/2/3 samples.
; Rows [13] and [26] both read the unshifted pairs m3/m0.
11010 pmaddwd m4, m3, [r3 - 2 * 32] ; [13] | |
11011 paddd m4, [pd_16] | |
11012 psrld m4, 5 | |
11013 pmaddwd m5, m0, [r3 - 2 * 32] | |
11014 paddd m5, [pd_16] | |
11015 psrld m5, 5 | |
11016 packusdw m4, m5 | |
11017 | |
11018 pmaddwd m5, m3, [r3 + 11 * 32] ; [26] | |
11019 paddd m5, [pd_16] | |
11020 psrld m5, 5 | |
11021 pmaddwd m8, m0, [r3 + 11 * 32] | |
11022 paddd m8, [pd_16] | |
11023 psrld m8, 5 | |
11024 packusdw m5, m8 | |
11025 | |
; Rows [7] and [20] share the pairs shifted by one sample (m7/m8).
11026 palignr m7, m0, m3, 4 | |
11027 pmaddwd m6, m7, [r3 - 8 * 32] ; [7] | |
11028 paddd m6, [pd_16] | |
11029 psrld m6, 5 | |
11030 palignr m8, m2, m0, 4 | |
11031 pmaddwd m9, m8, [r3 - 8 * 32] | |
11032 paddd m9, [pd_16] | |
11033 psrld m9, 5 | |
11034 packusdw m6, m9 | |
11035 | |
11036 pmaddwd m7, [r3 + 5 * 32] ; [20] | |
11037 paddd m7, [pd_16] | |
11038 psrld m7, 5 | |
11039 pmaddwd m8, [r3 + 5 * 32] | |
11040 paddd m8, [pd_16] | |
11041 psrld m8, 5 | |
11042 packusdw m7, m8 | |
11043 | |
; Rows [1], [14] and [27] share the pairs shifted by two samples (m10/m11).
11044 palignr m10, m0, m3, 8 | |
11045 pmaddwd m8, m10, [r3 - 14 * 32] ; [1] | |
11046 paddd m8, [pd_16] | |
11047 psrld m8, 5 | |
11048 palignr m11, m2, m0, 8 | |
11049 pmaddwd m9, m11, [r3 - 14 * 32] | |
11050 paddd m9, [pd_16] | |
11051 psrld m9, 5 | |
11052 packusdw m8, m9 | |
11053 | |
11054 pmaddwd m9, m10, [r3 - 1 * 32] ; [14] | |
11055 paddd m9, [pd_16] | |
11056 psrld m9, 5 | |
11057 pmaddwd m12, m11, [r3 - 1 * 32] | |
11058 paddd m12, [pd_16] | |
11059 psrld m12, 5 | |
11060 packusdw m9, m12 | |
11061 | |
11062 pmaddwd m10, [r3 + 12 * 32] ; [27] | |
11063 paddd m10, [pd_16] | |
11064 psrld m10, 5 | |
11065 pmaddwd m11, [r3 + 12 * 32] | |
11066 paddd m11, [pd_16] | |
11067 psrld m11, 5 | |
11068 packusdw m10, m11 | |
11069 | |
11070 palignr m11, m0, m3, 12 | |
11071 pmaddwd m11, [r3 - 7 * 32] ; [8] | |
11072 paddd m11, [pd_16] | |
11073 psrld m11, 5 | |
11074 palignr m12, m2, m0, 12 | |
11075 pmaddwd m12, [r3 - 7 * 32] | |
11076 paddd m12, [pd_16] | |
11077 psrld m12, 5 | |
11078 packusdw m11, m12 | |
11079 | |
; Rows 1-8 are in m4..m11. m12 holds an already-consumed temporary and m13 is
; never written in this function, so arguments 9 and 10 are presumably scratch
; registers for the macro - confirm against its definition.
11080 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
11081 | |
; Same three-sample shift as row [8]; it is recomputed because the previous
; copies were modified in place.
11082 palignr m4, m0, m3, 12 | |
11083 pmaddwd m4, [r3 + 6 * 32] ; [21] | |
11084 paddd m4, [pd_16] | |
11085 psrld m4, 5 | |
11086 palignr m5, m2, m0, 12 | |
11087 pmaddwd m5, [r3 + 6 * 32] | |
11088 paddd m5, [pd_16] | |
11089 psrld m5, 5 | |
11090 packusdw m4, m5 | |
11091 | |
; The original m3 pairs are no longer read; m3 serves as a temporary below.
11092 pmaddwd m5, m0, [r3 - 13 * 32] ; [2] | |
11093 paddd m5, [pd_16] | |
11094 psrld m5, 5 | |
11095 pmaddwd m3, m2, [r3 - 13 * 32] | |
11096 paddd m3, [pd_16] | |
11097 psrld m3, 5 | |
11098 packusdw m5, m3 | |
11099 | |
11100 pmaddwd m6, m0, [r3] ; [15] | |
11101 paddd m6, [pd_16] | |
11102 psrld m6, 5 | |
11103 pmaddwd m7, m2, [r3] | |
11104 paddd m7, [pd_16] | |
11105 psrld m7, 5 | |
11106 packusdw m6, m7 | |
11107 | |
11108 pmaddwd m7, m0, [r3 + 13 * 32] ; [28] | |
11109 paddd m7, [pd_16] | |
11110 psrld m7, 5 | |
11111 pmaddwd m3, m2, [r3 + 13 * 32] | |
11112 paddd m3, [pd_16] | |
11113 psrld m3, 5 | |
11114 packusdw m7, m3 | |
11115 | |
11116 palignr m9, m2, m0, 4 | |
11117 pmaddwd m8, m9, [r3 - 6 * 32] ; [9] | |
11118 paddd m8, [pd_16] | |
11119 psrld m8, 5 | |
11120 palignr m3, m1, m2, 4 | |
11121 pmaddwd m10, m3, [r3 - 6 * 32] | |
11122 paddd m10, [pd_16] | |
11123 psrld m10, 5 | |
11124 packusdw m8, m10 | |
11125 | |
11126 pmaddwd m9, [r3 + 7 * 32] ; [22] | |
11127 paddd m9, [pd_16] | |
11128 psrld m9, 5 | |
11129 pmaddwd m3, [r3 + 7 * 32] | |
11130 paddd m3, [pd_16] | |
11131 psrld m3, 5 | |
11132 packusdw m9, m3 | |
11133 | |
11134 palignr m11, m2, m0, 8 | |
11135 pmaddwd m10, m11, [r3 - 12 * 32] ; [3] | |
11136 paddd m10, [pd_16] | |
11137 psrld m10, 5 | |
11138 palignr m3, m1, m2, 8 | |
11139 pmaddwd m12, m3, [r3 - 12 * 32] | |
11140 paddd m12, [pd_16] | |
11141 psrld m12, 5 | |
11142 packusdw m10, m12 | |
11143 | |
11144 pmaddwd m11, [r3 + 1 * 32] ; [16] | |
11145 paddd m11, [pd_16] | |
11146 psrld m11, 5 | |
11147 pmaddwd m3, [r3 + 1 * 32] | |
11148 paddd m3, [pd_16] | |
11149 psrld m3, 5 | |
11150 packusdw m11, m3 | |
; Rows 9-16 are in m4..m11; m0 and m1 are no longer needed.
11151 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16 | |
11152 ret | |
11153 | |
11154 ;; angle 16, modes 7 and 29 | |
; Builds sixteen rows of sixteen 16-bit values from the reference samples at r2
; and hands them to TRANSPOSE_STORE_AVX2 in two batches of eight (last macro
; argument 0, then 16).
; Per row: pmaddwd of interleaved neighbouring samples with one 32-byte weight
; row at r3 + k * 32, then + pd_16 and a logical shift right by 5; the two
; 8-dword halves are packed back to words with unsigned saturation.
; The bracketed number on each row's first pmaddwd is the weight-row label N;
; in this function label N is always addressed as r3 + (N - 17) * 32.
; NOTE(review): r3 presumably points into ang_table_avx2 and is set up by the
; caller, as are r2 and r6d - confirm against the call site.
11155 cglobal ang16_mode_7_29 | |
; The flags produced here are not consumed by any instruction visible in this
; body (the SIMD instructions below do not write EFLAGS); presumably
; TRANSPOSE_STORE_AVX2 branches on them - confirm against the macro.
11156 test r6d, r6d | |
11157 | |
11158 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11159 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
11160 | |
11161 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
11162 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
11163 | |
; Only the low interleave of samples 9..25 is built here; the rows below never
; shift further than m2 can supply.
11164 movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
11165 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
11166 punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
11167 | |
; Rows [9], [18] and [27] all read the unshifted pairs m3/m0.
11168 pmaddwd m4, m3, [r3 - 8 * 32] ; [9] | |
11169 paddd m4, [pd_16] | |
11170 psrld m4, 5 | |
11171 pmaddwd m5, m0, [r3 - 8 * 32] | |
11172 paddd m5, [pd_16] | |
11173 psrld m5, 5 | |
11174 packusdw m4, m5 | |
11175 | |
11176 pmaddwd m5, m3, [r3 + 1 * 32] ; [18] | |
11177 paddd m5, [pd_16] | |
11178 psrld m5, 5 | |
11179 pmaddwd m8, m0, [r3 + 1 * 32] | |
11180 paddd m8, [pd_16] | |
11181 psrld m8, 5 | |
11182 packusdw m5, m8 | |
11183 | |
11184 pmaddwd m6, m3, [r3 + 10 * 32] ; [27] | |
11185 paddd m6, [pd_16] | |
11186 psrld m6, 5 | |
11187 pmaddwd m9, m0, [r3 + 10 * 32] | |
11188 paddd m9, [pd_16] | |
11189 psrld m9, 5 | |
11190 packusdw m6, m9 | |
11191 | |
; Rows [4], [13], [22] and [31] share the pairs shifted by one sample (m10/m11).
11192 palignr m10, m0, m3, 4 | |
11193 pmaddwd m7, m10, [r3 - 13 * 32] ; [4] | |
11194 paddd m7, [pd_16] | |
11195 psrld m7, 5 | |
11196 palignr m11, m2, m0, 4 | |
11197 pmaddwd m8, m11, [r3 - 13 * 32] | |
11198 paddd m8, [pd_16] | |
11199 psrld m8, 5 | |
11200 packusdw m7, m8 | |
11201 | |
11202 pmaddwd m8, m10, [r3 - 4 * 32] ; [13] | |
11203 paddd m8, [pd_16] | |
11204 psrld m8, 5 | |
11205 pmaddwd m9, m11, [r3 - 4 * 32] | |
11206 paddd m9, [pd_16] | |
11207 psrld m9, 5 | |
11208 packusdw m8, m9 | |
11209 | |
11210 pmaddwd m9, m10, [r3 + 5 * 32] ; [22] | |
11211 paddd m9, [pd_16] | |
11212 psrld m9, 5 | |
11213 pmaddwd m12, m11, [r3 + 5 * 32] | |
11214 paddd m12, [pd_16] | |
11215 psrld m12, 5 | |
11216 packusdw m9, m12 | |
11217 | |
11218 pmaddwd m10, [r3 + 14 * 32] ; [31] | |
11219 paddd m10, [pd_16] | |
11220 psrld m10, 5 | |
11221 pmaddwd m11, [r3 + 14 * 32] | |
11222 paddd m11, [pd_16] | |
11223 psrld m11, 5 | |
11224 packusdw m10, m11 | |
11225 | |
11226 palignr m11, m0, m3, 8 | |
11227 pmaddwd m11, [r3 - 9 * 32] ; [8] | |
11228 paddd m11, [pd_16] | |
11229 psrld m11, 5 | |
11230 palignr m12, m2, m0, 8 | |
11231 pmaddwd m12, [r3 - 9 * 32] | |
11232 paddd m12, [pd_16] | |
11233 psrld m12, 5 | |
11234 packusdw m11, m12 | |
11235 | |
; Rows 1-8 are in m4..m11. m12 holds an already-consumed temporary and m1 is
; not read again in this function, so arguments 9 and 10 are presumably scratch
; registers for the macro - confirm against its definition.
11236 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 | |
11237 | |
; Same two-sample shift as row [8]; it is recomputed because the previous
; copies were modified in place.
11238 palignr m5, m0, m3, 8 | |
11239 pmaddwd m4, m5, [r3] ; [17] | |
11240 paddd m4, [pd_16] | |
11241 psrld m4, 5 | |
11242 palignr m6, m2, m0, 8 | |
11243 pmaddwd m7, m6, [r3] | |
11244 paddd m7, [pd_16] | |
11245 psrld m7, 5 | |
11246 packusdw m4, m7 | |
11247 | |
11248 pmaddwd m5, [r3 + 9 * 32] ; [26] | |
11249 paddd m5, [pd_16] | |
11250 psrld m5, 5 | |
11251 pmaddwd m6, [r3 + 9 * 32] | |
11252 paddd m6, [pd_16] | |
11253 psrld m6, 5 | |
11254 packusdw m5, m6 | |
11255 | |
; Rows [3], [12], [21] and [30] share the pairs shifted by three samples
; (m9/m3); this overwrites the original m3 pairs, which are not read again.
11256 palignr m9, m0, m3, 12 | |
11257 pmaddwd m6, m9, [r3 - 14 * 32] ; [3] | |
11258 paddd m6, [pd_16] | |
11259 psrld m6, 5 | |
11260 palignr m3, m2, m0, 12 | |
11261 pmaddwd m7, m3, [r3 - 14 * 32] | |
11262 paddd m7, [pd_16] | |
11263 psrld m7, 5 | |
11264 packusdw m6, m7 | |
11265 | |
11266 pmaddwd m7, m9, [r3 - 5 * 32] ; [12] | |
11267 paddd m7, [pd_16] | |
11268 psrld m7, 5 | |
11269 pmaddwd m8, m3, [r3 - 5 * 32] | |
11270 paddd m8, [pd_16] | |
11271 psrld m8, 5 | |
11272 packusdw m7, m8 | |
11273 | |
11274 pmaddwd m8, m9, [r3 + 4 * 32] ; [21] | |
11275 paddd m8, [pd_16] | |
11276 psrld m8, 5 | |
11277 pmaddwd m10, m3, [r3 + 4 * 32] | |
11278 paddd m10, [pd_16] | |
11279 psrld m10, 5 | |
11280 packusdw m8, m10 | |
11281 | |
11282 pmaddwd m9, [r3 + 13 * 32] ; [30] | |
11283 paddd m9, [pd_16] | |
11284 psrld m9, 5 | |
11285 pmaddwd m3, [r3 + 13 * 32] | |
11286 paddd m3, [pd_16] | |
11287 psrld m3, 5 | |
11288 packusdw m9, m3 | |
11289 | |
11290 pmaddwd m10, m0, [r3 - 10 * 32] ; [7] | |
11291 paddd m10, [pd_16] | |
11292 psrld m10, 5 | |
11293 pmaddwd m12, m2, [r3 - 10 * 32] | |
11294 paddd m12, [pd_16] | |
11295 psrld m12, 5 | |
11296 packusdw m10, m12 | |
11297 | |
; Final row: m0 and m2 are consumed in place because nothing reads the source
; pairs afterwards.
11298 pmaddwd m0, [r3 - 1 * 32] ; [16] | |
11299 paddd m0, [pd_16] | |
11300 psrld m0, 5 | |
11301 pmaddwd m2, [r3 - 1 * 32] | |
11302 paddd m2, [pd_16] | |
11303 psrld m2, 5 | |
11304 packusdw m0, m2 | |
; Rows 9-16 are in m4..m10 and m0; m1 and m2 are no longer needed.
11305 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 16 | |
11306 ret | |
11307 | |
11308 ;; angle 16, modes 8 and 28 | |
; Builds sixteen rows of sixteen 16-bit values from the reference samples at r2
; and hands them to TRANSPOSE_STORE_AVX2 in two batches of eight (last macro
; argument 0, then 16).
; Per row: pmaddwd of interleaved neighbouring samples with one 32-byte weight
; row at r3 + k * 32, then + pd_16 and a logical shift right by 5; the two
; 8-dword halves are packed back to words with unsigned saturation.
; The bracketed number on each row's first pmaddwd is the weight-row label N;
; in this function label N is always addressed as r3 + (N - 15) * 32.
; NOTE(review): r3 presumably points into ang_table_avx2 and is set up by the
; caller, as are r2 and r6d - confirm against the call site.
11309 cglobal ang16_mode_8_28 | |
; The flags produced here are not consumed by any instruction visible in this
; body (the SIMD instructions below do not write EFLAGS); presumably
; TRANSPOSE_STORE_AVX2 branches on them - confirm against the macro.
11310 test r6d, r6d | |
11311 | |
11312 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11313 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
11314 | |
11315 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
11316 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
11317 | |
; Only the low interleave of samples 9..25 is built here; the rows below never
; shift further than m2 can supply.
11318 movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
11319 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
11320 punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
11321 | |
; Rows [5] through [30] all read the unshifted pairs m3/m0.
11322 pmaddwd m4, m3, [r3 - 10 * 32] ; [5] | |
11323 paddd m4, [pd_16] | |
11324 psrld m4, 5 | |
11325 pmaddwd m5, m0, [r3 - 10 * 32] | |
11326 paddd m5, [pd_16] | |
11327 psrld m5, 5 | |
11328 packusdw m4, m5 | |
11329 | |
11330 pmaddwd m5, m3, [r3 - 5 * 32] ; [10] | |
11331 paddd m5, [pd_16] | |
11332 psrld m5, 5 | |
11333 pmaddwd m8, m0, [r3 - 5 * 32] | |
11334 paddd m8, [pd_16] | |
11335 psrld m8, 5 | |
11336 packusdw m5, m8 | |
11337 | |
11338 pmaddwd m6, m3, [r3] ; [15] | |
11339 paddd m6, [pd_16] | |
11340 psrld m6, 5 | |
11341 pmaddwd m9, m0, [r3] | |
11342 paddd m9, [pd_16] | |
11343 psrld m9, 5 | |
11344 packusdw m6, m9 | |
11345 | |
11346 pmaddwd m7, m3, [r3 + 5 * 32] ; [20] | |
11347 paddd m7, [pd_16] | |
11348 psrld m7, 5 | |
11349 pmaddwd m8, m0, [r3 + 5 * 32] | |
11350 paddd m8, [pd_16] | |
11351 psrld m8, 5 | |
11352 packusdw m7, m8 | |
11353 | |
11354 pmaddwd m8, m3, [r3 + 10 * 32] ; [25] | |
11355 paddd m8, [pd_16] | |
11356 psrld m8, 5 | |
11357 pmaddwd m9, m0, [r3 + 10 * 32] | |
11358 paddd m9, [pd_16] | |
11359 psrld m9, 5 | |
11360 packusdw m8, m9 | |
11361 | |
11362 pmaddwd m9, m3, [r3 + 15 * 32] ; [30] | |
11363 paddd m9, [pd_16] | |
11364 psrld m9, 5 | |
11365 pmaddwd m10, m0, [r3 + 15 * 32] | |
11366 paddd m10, [pd_16] | |
11367 psrld m10, 5 | |
11368 packusdw m9, m10 | |
11369 | |
; Rows [3] and [8] share the pairs shifted by one sample (m11/m1).
11370 palignr m11, m0, m3, 4 | |
11371 pmaddwd m10, m11, [r3 - 12 * 32] ; [3] | |
11372 paddd m10, [pd_16] | |
11373 psrld m10, 5 | |
11374 palignr m1, m2, m0, 4 | |
11375 pmaddwd m12, m1, [r3 - 12 * 32] | |
11376 paddd m12, [pd_16] | |
11377 psrld m12, 5 | |
11378 packusdw m10, m12 | |
11379 | |
11380 pmaddwd m11, [r3 - 7 * 32] ; [8] | |
11381 paddd m11, [pd_16] | |
11382 psrld m11, 5 | |
11383 pmaddwd m1, [r3 - 7 * 32] | |
11384 paddd m1, [pd_16] | |
11385 psrld m1, 5 | |
11386 packusdw m11, m1 | |
11387 | |
; Rows 1-8 are in m4..m11. m12 and m1 hold already-consumed temporaries, so
; arguments 9 and 10 are presumably scratch registers for the macro - confirm
; against its definition.
11388 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 | |
11389 | |
; Same one-sample shift as rows [3]/[8]; it is recomputed because the previous
; copies were modified in place.
11390 palignr m7, m0, m3, 4 | |
11391 pmaddwd m4, m7, [r3 - 2 * 32] ; [13] | |
11392 paddd m4, [pd_16] | |
11393 psrld m4, 5 | |
11394 palignr m1, m2, m0, 4 | |
11395 pmaddwd m5, m1, [r3 - 2 * 32] | |
11396 paddd m5, [pd_16] | |
11397 psrld m5, 5 | |
11398 packusdw m4, m5 | |
11399 | |
11400 pmaddwd m5, m7, [r3 + 3 * 32] ; [18] | |
11401 paddd m5, [pd_16] | |
11402 psrld m5, 5 | |
11403 pmaddwd m6, m1, [r3 + 3 * 32] | |
11404 paddd m6, [pd_16] | |
11405 psrld m6, 5 | |
11406 packusdw m5, m6 | |
11407 | |
11408 pmaddwd m6, m7, [r3 + 8 * 32] ; [23] | |
11409 paddd m6, [pd_16] | |
11410 psrld m6, 5 | |
11411 pmaddwd m8, m1, [r3 + 8 * 32] | |
11412 paddd m8, [pd_16] | |
11413 psrld m8, 5 | |
11414 packusdw m6, m8 | |
11415 | |
11416 pmaddwd m7, [r3 + 13 * 32] ; [28] | |
11417 paddd m7, [pd_16] | |
11418 psrld m7, 5 | |
11419 pmaddwd m1, [r3 + 13 * 32] | |
11420 paddd m1, [pd_16] | |
11421 psrld m1, 5 | |
11422 packusdw m7, m1 | |
11423 | |
; Rows [1], [6], [11] and [16] share the pairs shifted by two samples; m2 is
; shifted in place because its unshifted form is not read again.
11424 palignr m1, m0, m3, 8 | |
11425 pmaddwd m8, m1, [r3 - 14 * 32] ; [1] | |
11426 paddd m8, [pd_16] | |
11427 psrld m8, 5 | |
11428 palignr m2, m0, 8 | |
11429 pmaddwd m9, m2, [r3 - 14 * 32] | |
11430 paddd m9, [pd_16] | |
11431 psrld m9, 5 | |
11432 packusdw m8, m9 | |
11433 | |
11434 pmaddwd m9, m1, [r3 - 9 * 32] ; [6] | |
11435 paddd m9, [pd_16] | |
11436 psrld m9, 5 | |
11437 pmaddwd m3, m2, [r3 - 9 * 32] | |
11438 paddd m3, [pd_16] | |
11439 psrld m3, 5 | |
11440 packusdw m9, m3 | |
11441 | |
11442 pmaddwd m3, m1, [r3 - 4 * 32] ; [11] | |
11443 paddd m3, [pd_16] | |
11444 psrld m3, 5 | |
11445 pmaddwd m0, m2, [r3 - 4 * 32] | |
11446 paddd m0, [pd_16] | |
11447 psrld m0, 5 | |
11448 packusdw m3, m0 | |
11449 | |
11450 pmaddwd m1, [r3 + 1 * 32] ; [16] | |
11451 paddd m1, [pd_16] | |
11452 psrld m1, 5 | |
11453 pmaddwd m2, [r3 + 1 * 32] | |
11454 paddd m2, [pd_16] | |
11455 psrld m2, 5 | |
11456 packusdw m1, m2 | |
; Rows 9-16 are in m4..m9, m3 and m1; m0 and m2 are no longer needed.
11457 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16 | |
11458 ret | |
11459 | |
11460 ;; angle 16, modes 9 and 27 | |
; Builds sixteen rows of sixteen 16-bit values from the reference samples at r2
; and hands them to TRANSPOSE_STORE_AVX2 in two batches of eight (last macro
; argument 0, then 16).
; Rows 1-15: pmaddwd of interleaved neighbouring samples with one 32-byte weight
; row at r3 + k * 32, then + pd_16 and a logical shift right by 5; the two
; 8-dword halves are packed back to words with unsigned saturation.
; Row 16 is an unweighted copy of the sixteen samples starting at r2 + 4.
; The bracketed number on each row's first pmaddwd is the weight-row label N;
; in this function label N is always addressed as r3 + (N - 16) * 32.
; Every weighted row reads the same unshifted pairs (m3/m0), so only samples
; 1..17 are needed. The former loads of [r2 + 18] / [r2 + 20] into m2/m4 were
; removed: m4 was overwritten by the first row before being read and m2 was
; never read by this function.
; NOTE(review): m2 is still passed in the scratch slots of TRANSPOSE_STORE_AVX2
; (arguments 9/10). Sibling functions pass never-written or already-consumed
; registers there, so the macro is assumed not to read them - confirm against
; the macro definition.
; NOTE(review): r3 presumably points into ang_table_avx2 and is set up by the
; caller, as are r2 and r6d - confirm against the call site.
11461 cglobal ang16_mode_9_27 | |
; The flags produced here are not consumed by any instruction visible in this
; body (the SIMD instructions below do not write EFLAGS); presumably
; TRANSPOSE_STORE_AVX2 branches on them - confirm against the macro.
11462 test r6d, r6d | |
11463 | |
11464 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11465 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
11466 | |
11467 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
11468 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
11469 | |
11474 pmaddwd m4, m3, [r3 - 14 * 32] ; [2] | |
11475 paddd m4, [pd_16] | |
11476 psrld m4, 5 | |
11477 pmaddwd m5, m0, [r3 - 14 * 32] | |
11478 paddd m5, [pd_16] | |
11479 psrld m5, 5 | |
11480 packusdw m4, m5 | |
11481 | |
11482 pmaddwd m5, m3, [r3 - 12 * 32] ; [4] | |
11483 paddd m5, [pd_16] | |
11484 psrld m5, 5 | |
11485 pmaddwd m8, m0, [r3 - 12 * 32] | |
11486 paddd m8, [pd_16] | |
11487 psrld m8, 5 | |
11488 packusdw m5, m8 | |
11489 | |
11490 pmaddwd m6, m3, [r3 - 10 * 32] ; [6] | |
11491 paddd m6, [pd_16] | |
11492 psrld m6, 5 | |
11493 pmaddwd m9, m0, [r3 - 10 * 32] | |
11494 paddd m9, [pd_16] | |
11495 psrld m9, 5 | |
11496 packusdw m6, m9 | |
11497 | |
11498 pmaddwd m7, m3, [r3 - 8 * 32] ; [8] | |
11499 paddd m7, [pd_16] | |
11500 psrld m7, 5 | |
11501 pmaddwd m8, m0, [r3 - 8 * 32] | |
11502 paddd m8, [pd_16] | |
11503 psrld m8, 5 | |
11504 packusdw m7, m8 | |
11505 | |
11506 pmaddwd m8, m3, [r3 - 6 * 32] ; [10] | |
11507 paddd m8, [pd_16] | |
11508 psrld m8, 5 | |
11509 pmaddwd m9, m0, [r3 - 6 * 32] | |
11510 paddd m9, [pd_16] | |
11511 psrld m9, 5 | |
11512 packusdw m8, m9 | |
11513 | |
11514 pmaddwd m9, m3, [r3 - 4 * 32] ; [12] | |
11515 paddd m9, [pd_16] | |
11516 psrld m9, 5 | |
11517 pmaddwd m10, m0, [r3 - 4 * 32] | |
11518 paddd m10, [pd_16] | |
11519 psrld m10, 5 | |
11520 packusdw m9, m10 | |
11521 | |
11522 pmaddwd m10, m3, [r3 - 2 * 32] ; [14] | |
11523 paddd m10, [pd_16] | |
11524 psrld m10, 5 | |
11525 pmaddwd m1, m0, [r3 - 2 * 32] | |
11526 paddd m1, [pd_16] | |
11527 psrld m1, 5 | |
11528 packusdw m10, m1 | |
11529 | |
11530 pmaddwd m11, m3, [r3] ; [16] | |
11531 paddd m11, [pd_16] | |
11532 psrld m11, 5 | |
11533 pmaddwd m1, m0, [r3] | |
11534 paddd m1, [pd_16] | |
11535 psrld m1, 5 | |
11536 packusdw m11, m1 | |
11537 | |
; Rows 1-8 are in m4..m11; m2 and m1 hold no live data at this point.
11538 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0 | |
11539 | |
11540 pmaddwd m4, m3, [r3 + 2 * 32] ; [18] | |
11541 paddd m4, [pd_16] | |
11542 psrld m4, 5 | |
11543 pmaddwd m5, m0, [r3 + 2 * 32] | |
11544 paddd m5, [pd_16] | |
11545 psrld m5, 5 | |
11546 packusdw m4, m5 | |
11547 | |
11548 pmaddwd m5, m3, [r3 + 4 * 32] ; [20] | |
11549 paddd m5, [pd_16] | |
11550 psrld m5, 5 | |
11551 pmaddwd m6, m0, [r3 + 4 * 32] | |
11552 paddd m6, [pd_16] | |
11553 psrld m6, 5 | |
11554 packusdw m5, m6 | |
11555 | |
11556 pmaddwd m6, m3, [r3 + 6 * 32] ; [22] | |
11557 paddd m6, [pd_16] | |
11558 psrld m6, 5 | |
11559 pmaddwd m8, m0, [r3 + 6 * 32] | |
11560 paddd m8, [pd_16] | |
11561 psrld m8, 5 | |
11562 packusdw m6, m8 | |
11563 | |
11564 pmaddwd m7, m3, [r3 + 8 * 32] ; [24] | |
11565 paddd m7, [pd_16] | |
11566 psrld m7, 5 | |
11567 pmaddwd m1, m0, [r3 + 8 * 32] | |
11568 paddd m1, [pd_16] | |
11569 psrld m1, 5 | |
11570 packusdw m7, m1 | |
11571 | |
11572 pmaddwd m8, m3, [r3 + 10 * 32] ; [26] | |
11573 paddd m8, [pd_16] | |
11574 psrld m8, 5 | |
11575 pmaddwd m9, m0, [r3 + 10 * 32] | |
11576 paddd m9, [pd_16] | |
11577 psrld m9, 5 | |
11578 packusdw m8, m9 | |
11579 | |
11580 pmaddwd m9, m3, [r3 + 12 * 32] ; [28] | |
11581 paddd m9, [pd_16] | |
11582 psrld m9, 5 | |
11583 pmaddwd m1, m0, [r3 + 12 * 32] | |
11584 paddd m1, [pd_16] | |
11585 psrld m1, 5 | |
11586 packusdw m9, m1 | |
11587 | |
; Last weighted row: m3 and m0 are consumed in place because nothing reads the
; source pairs afterwards.
11588 pmaddwd m3, [r3 + 14 * 32] ; [30] | |
11589 paddd m3, [pd_16] | |
11590 psrld m3, 5 | |
11591 pmaddwd m0, [r3 + 14 * 32] | |
11592 paddd m0, [pd_16] | |
11593 psrld m0, 5 | |
11594 packusdw m3, m0 | |
11595 | |
; Row 16 needs no weighting: it is the reference advanced by one sample.
11596 movu m1, [r2 + 4] | |
; Rows 9-16 are in m4..m9, m3 and m1; m0 and m2 are no longer needed.
11597 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16 | |
11598 ret | |
11599 | |
11600 ;; angle 16, modes 11 and 25 | |
;; Produces sixteen packed result vectors in two batches of eight; each batch
;; is handed to TRANSPOSE_STORE_AVX2 (last macro argument 0, then 16).
;; In:  r2  -> 16-bit reference samples, read unaligned from [r2] and [r2 + 2].
;;      r3  -> coefficient table base, addressed as [r3 + k * 32]. The bracketed
;;             tag on each step equals 16 + k, so r3 presumably points at table
;;             row 16 (set up by the caller -- TODO confirm).
;;      r6d -> tested on entry. None of the instructions written out below
;;             modify EFLAGS. NOTE(review): the macro presumably branches on
;;             this result; its definition is outside this view -- confirm.
;; Per vector: pmaddwd of interleaved neighbouring samples with one table row,
;; add [pd_16], logical shift right by 5, then packusdw of the low/high halves.
;; Every weighted vector here uses the same sample pairs (m3 / m0); only the
;; table row changes, stepping from [30] down to [2].
11601 cglobal ang16_mode_11_25 | |
11602 test r6d, r6d | |
11603 | |
11604 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
11605 movu m1, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11606 | |
11607 punpcklwd m3, m0, m1 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
11608 punpckhwd m0, m1 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
11609 | |
11610 pmaddwd m4, m3, [r3 + 14 * 32] ; [30] | |
11611 paddd m4, [pd_16] | |
11612 psrld m4, 5 | |
11613 pmaddwd m5, m0, [r3 + 14 * 32] | |
11614 paddd m5, [pd_16] | |
11615 psrld m5, 5 | |
11616 packusdw m4, m5 | |
11617 | |
11618 pmaddwd m5, m3, [r3 + 12 * 32] ; [28] | |
11619 paddd m5, [pd_16] | |
11620 psrld m5, 5 | |
11621 pmaddwd m8, m0, [r3 + 12 * 32] | |
11622 paddd m8, [pd_16] | |
11623 psrld m8, 5 | |
11624 packusdw m5, m8 | |
11625 | |
11626 pmaddwd m6, m3, [r3 + 10 * 32] ; [26] | |
11627 paddd m6, [pd_16] | |
11628 psrld m6, 5 | |
11629 pmaddwd m9, m0, [r3 + 10 * 32] | |
11630 paddd m9, [pd_16] | |
11631 psrld m9, 5 | |
11632 packusdw m6, m9 | |
11633 | |
11634 pmaddwd m7, m3, [r3 + 8 * 32] ; [24] | |
11635 paddd m7, [pd_16] | |
11636 psrld m7, 5 | |
11637 pmaddwd m8, m0, [r3 + 8 * 32] | |
11638 paddd m8, [pd_16] | |
11639 psrld m8, 5 | |
11640 packusdw m7, m8 | |
11641 | |
11642 pmaddwd m8, m3, [r3 + 6 * 32] ; [22] | |
11643 paddd m8, [pd_16] | |
11644 psrld m8, 5 | |
11645 pmaddwd m9, m0, [r3 + 6 * 32] | |
11646 paddd m9, [pd_16] | |
11647 psrld m9, 5 | |
11648 packusdw m8, m9 | |
11649 | |
11650 pmaddwd m9, m3, [r3 + 4 * 32] ; [20] | |
11651 paddd m9, [pd_16] | |
11652 psrld m9, 5 | |
11653 pmaddwd m10, m0, [r3 + 4 * 32] | |
11654 paddd m10, [pd_16] | |
11655 psrld m10, 5 | |
11656 packusdw m9, m10 | |
11657 | |
11658 pmaddwd m10, m3, [r3 + 2 * 32] ; [18] | |
11659 paddd m10, [pd_16] | |
11660 psrld m10, 5 | |
11661 pmaddwd m1, m0, [r3 + 2 * 32] | |
11662 paddd m1, [pd_16] | |
11663 psrld m1, 5 | |
11664 packusdw m10, m1 | |
11665 | |
11666 pmaddwd m11, m3, [r3] ; [16] | |
11667 paddd m11, [pd_16] | |
11668 psrld m11, 5 | |
11669 pmaddwd m1, m0, [r3] | |
11670 paddd m1, [pd_16] | |
11671 psrld m1, 5 | |
11672 packusdw m11, m1 | |
11673 | |
;; First batch: m4..m11 hold the eight vectors. m2 and m1 are passed in the
;; two slots after them, so m3 / m0 (the sample pairs) stay available below.
11674 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0 | |
11675 | |
11676 pmaddwd m4, m3, [r3 - 2 * 32] ; [14] | |
11677 paddd m4, [pd_16] | |
11678 psrld m4, 5 | |
11679 pmaddwd m5, m0, [r3 - 2 * 32] | |
11680 paddd m5, [pd_16] | |
11681 psrld m5, 5 | |
11682 packusdw m4, m5 | |
11683 | |
11684 pmaddwd m5, m3, [r3 - 4 * 32] ; [12] | |
11685 paddd m5, [pd_16] | |
11686 psrld m5, 5 | |
11687 pmaddwd m6, m0, [r3 - 4 * 32] | |
11688 paddd m6, [pd_16] | |
11689 psrld m6, 5 | |
11690 packusdw m5, m6 | |
11691 | |
11692 pmaddwd m6, m3, [r3 - 6 * 32] ; [10] | |
11693 paddd m6, [pd_16] | |
11694 psrld m6, 5 | |
11695 pmaddwd m8, m0, [r3 - 6 * 32] | |
11696 paddd m8, [pd_16] | |
11697 psrld m8, 5 | |
11698 packusdw m6, m8 | |
11699 | |
11700 pmaddwd m7, m3, [r3 - 8 * 32] ; [8] | |
11701 paddd m7, [pd_16] | |
11702 psrld m7, 5 | |
11703 pmaddwd m1, m0, [r3 - 8 * 32] | |
11704 paddd m1, [pd_16] | |
11705 psrld m1, 5 | |
11706 packusdw m7, m1 | |
11707 | |
11708 pmaddwd m8, m3, [r3 - 10 * 32] ; [6] | |
11709 paddd m8, [pd_16] | |
11710 psrld m8, 5 | |
11711 pmaddwd m9, m0, [r3 - 10 * 32] | |
11712 paddd m9, [pd_16] | |
11713 psrld m9, 5 | |
11714 packusdw m8, m9 | |
11715 | |
11716 pmaddwd m9, m3, [r3 - 12 * 32] ; [4] | |
11717 paddd m9, [pd_16] | |
11718 psrld m9, 5 | |
11719 pmaddwd m1, m0, [r3 - 12 * 32] | |
11720 paddd m1, [pd_16] | |
11721 psrld m1, 5 | |
11722 packusdw m9, m1 | |
11723 | |
;; m3 and m0 are consumed in place here: this is their last use.
11724 pmaddwd m3, [r3 - 14 * 32] ; [2] | |
11725 paddd m3, [pd_16] | |
11726 psrld m3, 5 | |
11727 pmaddwd m0, [r3 - 14 * 32] | |
11728 paddd m0, [pd_16] | |
11729 psrld m0, 5 | |
11730 packusdw m3, m0 | |
11731 | |
;; The eighth vector of the second batch is the unweighted samples at [r2].
11732 movu m1, [r2] | |
11733 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16 | |
11734 ret | |
11735 | |
11736 ;; angle 16, modes 12 and 24 | |
;; Produces sixteen packed result vectors in two batches of eight; each batch
;; is handed to TRANSPOSE_STORE_AVX2 (last macro argument 0, then 16).
;; In:  r2  -> 16-bit reference samples, read unaligned from [r2] and [r2 + 2].
;;      r3  -> coefficient table base, addressed as [r3 + k * 32]; the bracketed
;;             tag on each step equals 16 + k (r3 presumably points at table
;;             row 16 -- set up by the caller, TODO confirm).
;;      m1  -> low 128-bit lane is read before anything here writes it, so the
;;             caller must preload it (presumably with the extra reference
;;             samples shown in the vinserti128 comment -- confirm).
;;      r6d -> tested on entry; none of the instructions written out below
;;             modify EFLAGS. NOTE(review): presumably consumed inside the
;;             macro, which is defined outside this view -- confirm.
;; Per vector: pmaddwd of sample pairs with one table row, add [pd_16],
;; logical shift right by 5, then packusdw of the low/high halves.
11737 cglobal ang16_mode_12_24 | |
11738 test r6d, r6d | |
11739 | |
11740 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
11741 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11742 | |
11743 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
11744 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
11745 | |
11746 pmaddwd m4, m3, [r3 + 11 * 32] ; [27] | |
11747 paddd m4, [pd_16] | |
11748 psrld m4, 5 | |
11749 pmaddwd m5, m2, [r3 + 11 * 32] | |
11750 paddd m5, [pd_16] | |
11751 psrld m5, 5 | |
11752 packusdw m4, m5 | |
11753 | |
11754 pmaddwd m5, m3, [r3 + 6 * 32] ; [22] | |
11755 paddd m5, [pd_16] | |
11756 psrld m5, 5 | |
11757 pmaddwd m8, m2, [r3 + 6 * 32] | |
11758 paddd m8, [pd_16] | |
11759 psrld m8, 5 | |
11760 packusdw m5, m8 | |
11761 | |
11762 pmaddwd m6, m3, [r3 + 1 * 32] ; [17] | |
11763 paddd m6, [pd_16] | |
11764 psrld m6, 5 | |
11765 pmaddwd m9, m2, [r3 + 1 * 32] | |
11766 paddd m9, [pd_16] | |
11767 psrld m9, 5 | |
11768 packusdw m6, m9 | |
11769 | |
11770 pmaddwd m7, m3, [r3 - 4 * 32] ; [12] | |
11771 paddd m7, [pd_16] | |
11772 psrld m7, 5 | |
11773 pmaddwd m8, m2, [r3 - 4 * 32] | |
11774 paddd m8, [pd_16] | |
11775 psrld m8, 5 | |
11776 packusdw m7, m8 | |
11777 | |
11778 pmaddwd m8, m3, [r3 - 9 * 32] ; [7] | |
11779 paddd m8, [pd_16] | |
11780 psrld m8, 5 | |
11781 pmaddwd m9, m2, [r3 - 9 * 32] | |
11782 paddd m9, [pd_16] | |
11783 psrld m9, 5 | |
11784 packusdw m8, m9 | |
11785 | |
11786 pmaddwd m9, m3, [r3 - 14 * 32] ; [2] | |
11787 paddd m9, [pd_16] | |
11788 psrld m9, 5 | |
11789 pmaddwd m2, [r3 - 14 * 32] | |
11790 paddd m2, [pd_16] | |
11791 psrld m2, 5 | |
11792 packusdw m9, m2 | |
11793 | |
;; From here on m3 / m0 hold each sample duplicated, and m1 supplies the words
;; that palignr shifts in below them. Only the high lane of m1 is written
;; here; its low lane is whatever the caller left in it.
11794 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
11795 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
11796 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 6 6 13 13 x x x x] | |
11797 | |
11798 palignr m2, m3, m1, 14 | |
11799 palignr m13, m0, m3, 14 | |
11800 | |
11801 pmaddwd m10, m2, [r3 + 13 * 32] ; [29] | |
11802 paddd m10, [pd_16] | |
11803 psrld m10, 5 | |
11804 pmaddwd m12, m13, [r3 + 13 * 32] | |
11805 paddd m12, [pd_16] | |
11806 psrld m12, 5 | |
11807 packusdw m10, m12 | |
11808 | |
11809 pmaddwd m11, m2, [r3 + 8 * 32] ; [24] | |
11810 paddd m11, [pd_16] | |
11811 psrld m11, 5 | |
11812 pmaddwd m13, [r3 + 8 * 32] | |
11813 paddd m13, [pd_16] | |
11814 psrld m13, 5 | |
11815 packusdw m11, m13 | |
11816 | |
11817 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
11818 | |
;; m13 was overwritten above (in-place pmaddwd, then passed to the macro),
;; so the shifted high-half window is rebuilt before it is used again.
11819 palignr m13, m0, m3, 14 | |
11820 | |
11821 pmaddwd m4, m2, [r3 + 3 * 32] ; [19] | |
11822 paddd m4, [pd_16] | |
11823 psrld m4, 5 | |
11824 pmaddwd m5, m13, [r3 + 3 * 32] | |
11825 paddd m5, [pd_16] | |
11826 psrld m5, 5 | |
11827 packusdw m4, m5 | |
11828 | |
11829 pmaddwd m5, m2, [r3 - 2 * 32] ; [14] | |
11830 paddd m5, [pd_16] | |
11831 psrld m5, 5 | |
11832 pmaddwd m6, m13, [r3 - 2 * 32] | |
11833 paddd m6, [pd_16] | |
11834 psrld m6, 5 | |
11835 packusdw m5, m6 | |
11836 | |
11837 pmaddwd m6, m2, [r3 - 7 * 32] ; [9] | |
11838 paddd m6, [pd_16] | |
11839 psrld m6, 5 | |
11840 pmaddwd m8, m13, [r3 - 7 * 32] | |
11841 paddd m8, [pd_16] | |
11842 psrld m8, 5 | |
11843 packusdw m6, m8 | |
11844 | |
11845 pmaddwd m7, m2, [r3 - 12 * 32] ; [4] | |
11846 paddd m7, [pd_16] | |
11847 psrld m7, 5 | |
11848 pmaddwd m8, m13, [r3 - 12 * 32] | |
11849 paddd m8, [pd_16] | |
11850 psrld m8, 5 | |
11851 packusdw m7, m8 | |
11852 | |
;; Order matters: m0 is rebuilt from the old m3 before m3 itself is replaced.
11853 palignr m0, m3, 10 | |
11854 palignr m3, m1, 10 | |
11855 | |
11856 pmaddwd m8, m3, [r3 + 15 * 32] ; [31] | |
11857 paddd m8, [pd_16] | |
11858 psrld m8, 5 | |
11859 pmaddwd m9, m0, [r3 + 15 * 32] | |
11860 paddd m9, [pd_16] | |
11861 psrld m9, 5 | |
11862 packusdw m8, m9 | |
11863 | |
11864 pmaddwd m9, m3, [r3 + 10 * 32] ; [26] | |
11865 paddd m9, [pd_16] | |
11866 psrld m9, 5 | |
11867 pmaddwd m1, m0, [r3 + 10 * 32] | |
11868 paddd m1, [pd_16] | |
11869 psrld m1, 5 | |
11870 packusdw m9, m1 | |
11871 | |
11872 pmaddwd m1, m3, [r3 + 5 * 32] ; [21] | |
11873 paddd m1, [pd_16] | |
11874 psrld m1, 5 | |
11875 pmaddwd m2, m0, [r3 + 5 * 32] | |
11876 paddd m2, [pd_16] | |
11877 psrld m2, 5 | |
11878 packusdw m1, m2 | |
11879 | |
11880 pmaddwd m3, [r3] ; [16] | |
11881 paddd m3, [pd_16] | |
11882 psrld m3, 5 | |
11883 pmaddwd m0, [r3] | |
11884 paddd m0, [pd_16] | |
11885 psrld m0, 5 | |
11886 packusdw m3, m0 | |
11887 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 | |
11888 ret | |
11889 | |
11890 ;; angle 16, modes 13 and 23 | |
;; Produces sixteen packed result vectors in two batches of eight; each batch
;; is handed to TRANSPOSE_STORE_AVX2 (last macro argument 0, then 16).
;; In:  r2  -> 16-bit reference samples, read unaligned from [r2] and [r2 + 2].
;;      r3  -> coefficient table base, addressed as [r3 + k * 32]; the bracketed
;;             tag on each step equals 16 + k (r3 presumably points at table
;;             row 16 -- set up by the caller, TODO confirm).
;;      m1  -> low 128-bit lane is read before anything here writes it, so the
;;             caller must preload it (presumably with the extra reference
;;             samples shown in the vinserti128 comment -- confirm).
;;      r6d -> tested on entry; none of the instructions written out below
;;             modify EFLAGS. NOTE(review): presumably consumed inside the
;;             macro, which is defined outside this view -- confirm.
;; Per vector: pmaddwd of sample pairs with one table row, add [pd_16],
;; logical shift right by 5, then packusdw of the low/high halves.
;; The sample window is re-aligned with palignr offsets 14, 10, 6 and 2.
11891 cglobal ang16_mode_13_23 | |
11892 test r6d, r6d | |
11893 | |
11894 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
11895 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11896 | |
11897 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
11898 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
11899 | |
11900 pmaddwd m4, m3, [r3 + 7 * 32] ; [23] | |
11901 paddd m4, [pd_16] | |
11902 psrld m4, 5 | |
11903 pmaddwd m5, m2, [r3 + 7 * 32] | |
11904 paddd m5, [pd_16] | |
11905 psrld m5, 5 | |
11906 packusdw m4, m5 | |
11907 | |
11908 pmaddwd m5, m3, [r3 - 2 * 32] ; [14] | |
11909 paddd m5, [pd_16] | |
11910 psrld m5, 5 | |
11911 pmaddwd m6, m2, [r3 - 2 * 32] | |
11912 paddd m6, [pd_16] | |
11913 psrld m6, 5 | |
11914 packusdw m5, m6 | |
11915 | |
11916 pmaddwd m6, m3, [r3 - 11 * 32] ; [5] | |
11917 paddd m6, [pd_16] | |
11918 psrld m6, 5 | |
11919 pmaddwd m2, [r3 - 11 * 32] | |
11920 paddd m2, [pd_16] | |
11921 psrld m2, 5 | |
11922 packusdw m6, m2 | |
11923 | |
;; From here on m3 / m0 hold each sample duplicated, and m1 supplies the words
;; that palignr shifts in below them. Only the high lane of m1 is written
;; here; its low lane is whatever the caller left in it.
11924 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
11925 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
11926 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14] | |
11927 | |
11928 palignr m2, m3, m1, 14 | |
11929 palignr m13, m0, m3, 14 | |
11930 | |
11931 pmaddwd m7, m2, [r3 + 12 * 32] ; [28] | |
11932 paddd m7, [pd_16] | |
11933 psrld m7, 5 | |
11934 pmaddwd m8, m13, [r3 + 12 * 32] | |
11935 paddd m8, [pd_16] | |
11936 psrld m8, 5 | |
11937 packusdw m7, m8 | |
11938 | |
11939 pmaddwd m8, m2, [r3 + 3 * 32] ; [19] | |
11940 paddd m8, [pd_16] | |
11941 psrld m8, 5 | |
11942 pmaddwd m9, m13, [r3 + 3 * 32] | |
11943 paddd m9, [pd_16] | |
11944 psrld m9, 5 | |
11945 packusdw m8, m9 | |
11946 | |
11947 pmaddwd m9, m2, [r3 - 6 * 32] ; [10] | |
11948 paddd m9, [pd_16] | |
11949 psrld m9, 5 | |
11950 pmaddwd m10, m13, [r3 - 6 * 32] | |
11951 paddd m10, [pd_16] | |
11952 psrld m10, 5 | |
11953 packusdw m9, m10 | |
11954 | |
11955 pmaddwd m10, m2, [r3 - 15 * 32] ; [1] | |
11956 paddd m10, [pd_16] | |
11957 psrld m10, 5 | |
11958 pmaddwd m12, m13, [r3 - 15 * 32] | |
11959 paddd m12, [pd_16] | |
11960 psrld m12, 5 | |
11961 packusdw m10, m12 | |
11962 | |
11963 palignr m2, m3, m1, 10 | |
11964 palignr m13, m0, m3, 10 | |
11965 | |
11966 pmaddwd m11, m2, [r3 + 8 * 32] ; [24] | |
11967 paddd m11, [pd_16] | |
11968 psrld m11, 5 | |
11969 pmaddwd m13, [r3 + 8 * 32] | |
11970 paddd m13, [pd_16] | |
11971 psrld m13, 5 | |
11972 packusdw m11, m13 | |
11973 | |
11974 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
11975 | |
;; m13 was overwritten above (in-place pmaddwd, then passed to the macro),
;; so the shifted high-half window is rebuilt before it is used again.
11976 palignr m13, m0, m3, 10 | |
11977 | |
11978 pmaddwd m4, m2, [r3 - 1 * 32] ; [15] | |
11979 paddd m4, [pd_16] | |
11980 psrld m4, 5 | |
11981 pmaddwd m5, m13, [r3 - 1 * 32] | |
11982 paddd m5, [pd_16] | |
11983 psrld m5, 5 | |
11984 packusdw m4, m5 | |
11985 | |
11986 pmaddwd m5, m2, [r3 - 10 * 32] ; [6] | |
11987 paddd m5, [pd_16] | |
11988 psrld m5, 5 | |
11989 pmaddwd m6, m13, [r3 - 10 * 32] | |
11990 paddd m6, [pd_16] | |
11991 psrld m6, 5 | |
11992 packusdw m5, m6 | |
11993 | |
11994 palignr m2, m3, m1, 6 | |
11995 palignr m13, m0, m3, 6 | |
11996 | |
11997 pmaddwd m6, m2, [r3 + 13 * 32] ; [29] | |
11998 paddd m6, [pd_16] | |
11999 psrld m6, 5 | |
12000 pmaddwd m8, m13, [r3 + 13 * 32] | |
12001 paddd m8, [pd_16] | |
12002 psrld m8, 5 | |
12003 packusdw m6, m8 | |
12004 | |
12005 pmaddwd m7, m2, [r3 + 4 * 32] ; [20] | |
12006 paddd m7, [pd_16] | |
12007 psrld m7, 5 | |
12008 pmaddwd m8, m13, [r3 + 4 * 32] | |
12009 paddd m8, [pd_16] | |
12010 psrld m8, 5 | |
12011 packusdw m7, m8 | |
12012 | |
12013 pmaddwd m8, m2, [r3 - 5 * 32] ; [11] | |
12014 paddd m8, [pd_16] | |
12015 psrld m8, 5 | |
12016 pmaddwd m9, m13, [r3 - 5 * 32] | |
12017 paddd m9, [pd_16] | |
12018 psrld m9, 5 | |
12019 packusdw m8, m9 | |
12020 | |
12021 pmaddwd m9, m2, [r3 - 14 * 32] ; [2] | |
12022 paddd m9, [pd_16] | |
12023 psrld m9, 5 | |
12024 pmaddwd m13, [r3 - 14 * 32] | |
12025 paddd m13, [pd_16] | |
12026 psrld m13, 5 | |
12027 packusdw m9, m13 | |
12028 | |
;; Order matters: m0 is rebuilt from the old m3 before m3 itself is replaced.
12029 palignr m0, m3, 2 | |
12030 palignr m3, m1, 2 | |
12031 | |
12032 pmaddwd m1, m3, [r3 + 9 * 32] ; [25] | |
12033 paddd m1, [pd_16] | |
12034 psrld m1, 5 | |
12035 pmaddwd m2, m0, [r3 + 9 * 32] | |
12036 paddd m2, [pd_16] | |
12037 psrld m2, 5 | |
12038 packusdw m1, m2 | |
12039 | |
12040 pmaddwd m3, [r3] ; [16] | |
12041 paddd m3, [pd_16] | |
12042 psrld m3, 5 | |
12043 pmaddwd m0, [r3] | |
12044 paddd m0, [pd_16] | |
12045 psrld m0, 5 | |
12046 packusdw m3, m0 | |
12047 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 | |
12048 ret | |
12049 | |
12050 ;; angle 16, modes 14 and 22 | |
;; Produces sixteen packed result vectors in two batches of eight; each batch
;; is handed to TRANSPOSE_STORE_AVX2 (last macro argument 0, then 16).
;; In:  r2  -> 16-bit reference samples, read unaligned from [r2] and [r2 + 2].
;;      r3  -> coefficient table base, addressed as [r3 + k * 32]; the bracketed
;;             tag on each step equals 16 + k (r3 presumably points at table
;;             row 16 -- set up by the caller, TODO confirm).
;;      m1, m14 -> low 128-bit lanes are read before anything here writes them,
;;             so the caller must preload both (presumably with the extra
;;             reference samples shown in the vinserti128 comments -- confirm).
;;      r6d -> tested on entry; none of the instructions written out below
;;             modify EFLAGS. NOTE(review): presumably consumed inside the
;;             macro, which is defined outside this view -- confirm.
;; Per vector: pmaddwd of sample pairs with one table row, add [pd_16],
;; logical shift right by 5, then packusdw of the low/high halves.
12051 cglobal ang16_mode_14_22 | |
12052 test r6d, r6d | |
12053 | |
12054 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
12055 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12056 | |
12057 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
12058 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
12059 | |
12060 pmaddwd m4, m3, [r3 + 3 * 32] ; [19] | |
12061 paddd m4, [pd_16] | |
12062 psrld m4, 5 | |
12063 pmaddwd m5, m2, [r3 + 3 * 32] | |
12064 paddd m5, [pd_16] | |
12065 psrld m5, 5 | |
12066 packusdw m4, m5 | |
12067 | |
12068 pmaddwd m5, m3, [r3 - 10 * 32] ; [6] | |
12069 paddd m5, [pd_16] | |
12070 psrld m5, 5 | |
12071 pmaddwd m2, [r3 - 10 * 32] | |
12072 paddd m2, [pd_16] | |
12073 psrld m2, 5 | |
12074 packusdw m5, m2 | |
12075 | |
;; From here on m3 / m0 hold each sample duplicated. m1 and m14 extend the
;; window downwards: only their high lanes are written here, the low lanes
;; are whatever the caller left in them.
12076 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
12077 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
12078 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 2 2 5 5 7 7 10 10] | |
12079 vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 12 12 15 15 x x x x] | |
12080 | |
12081 palignr m2, m3, m1, 14 | |
12082 palignr m13, m0, m3, 14 | |
12083 | |
12084 pmaddwd m6, m2, [r3 + 9 * 32] ; [25] | |
12085 paddd m6, [pd_16] | |
12086 psrld m6, 5 | |
12087 pmaddwd m9, m13, [r3 + 9 * 32] | |
12088 paddd m9, [pd_16] | |
12089 psrld m9, 5 | |
12090 packusdw m6, m9 | |
12091 | |
12092 pmaddwd m7, m2, [r3 - 4 * 32] ; [12] | |
12093 paddd m7, [pd_16] | |
12094 psrld m7, 5 | |
12095 pmaddwd m8, m13, [r3 - 4 * 32] | |
12096 paddd m8, [pd_16] | |
12097 psrld m8, 5 | |
12098 packusdw m7, m8 | |
12099 | |
12100 palignr m2, m3, m1, 10 ; [10 9 9 8 8 7 7 6 2 1 1 0 0 2 2 5] | |
12101 palignr m13, m0, m3, 10 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] | |
12102 | |
12103 pmaddwd m8, m2, [r3 + 15 * 32] ; [31] | |
12104 paddd m8, [pd_16] | |
12105 psrld m8, 5 | |
12106 pmaddwd m9, m13, [r3 + 15 * 32] | |
12107 paddd m9, [pd_16] | |
12108 psrld m9, 5 | |
12109 packusdw m8, m9 | |
12110 | |
12111 pmaddwd m9, m2, [r3 + 2 * 32] ; [18] | |
12112 paddd m9, [pd_16] | |
12113 psrld m9, 5 | |
12114 pmaddwd m10, m13, [r3 + 2 * 32] | |
12115 paddd m10, [pd_16] | |
12116 psrld m10, 5 | |
12117 packusdw m9, m10 | |
12118 | |
12119 pmaddwd m10, m2, [r3 - 11 * 32] ; [5] | |
12120 paddd m10, [pd_16] | |
12121 psrld m10, 5 | |
12122 pmaddwd m12, m13, [r3 - 11 * 32] | |
12123 paddd m12, [pd_16] | |
12124 psrld m12, 5 | |
12125 packusdw m10, m12 | |
12126 | |
12127 palignr m2, m3, m1, 6 ; [ 9 8 8 7 7 6 6 5 1 0 0 2 2 5 5 7] | |
12128 palignr m13, m0, m3, 6 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
12129 | |
12130 pmaddwd m11, m2, [r3 + 8 * 32] ; [24] | |
12131 paddd m11, [pd_16] | |
12132 psrld m11, 5 | |
12133 pmaddwd m13, [r3 + 8 * 32] | |
12134 paddd m13, [pd_16] | |
12135 psrld m13, 5 | |
12136 packusdw m11, m13 | |
12137 | |
12138 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
12139 | |
;; m13 was overwritten above (in-place pmaddwd, then passed to the macro),
;; so the shifted high-half window is rebuilt before it is used again.
12140 palignr m13, m0, m3, 6 | |
12141 | |
12142 pmaddwd m4, m2, [r3 - 5 * 32] ; [11] | |
12143 paddd m4, [pd_16] | |
12144 psrld m4, 5 | |
12145 pmaddwd m5, m13, [r3 - 5 * 32] | |
12146 paddd m5, [pd_16] | |
12147 psrld m5, 5 | |
12148 packusdw m4, m5 | |
12149 | |
;; Register roles swap for the next three steps: m13 now holds the lower
;; window and m2 the upper one, hence pmaddwd m13 first, then m2.
12150 palignr m2, m0, m3, 2 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
12151 palignr m13, m3, m1, 2 ; [ 8 7 7 6 6 5 5 4 0 2 2 5 5 7 7 10] | |
12152 | |
12153 pmaddwd m5, m13, [r3 + 14 * 32] ; [30] | |
12154 paddd m5, [pd_16] | |
12155 psrld m5, 5 | |
12156 pmaddwd m6, m2, [r3 + 14 * 32] | |
12157 paddd m6, [pd_16] | |
12158 psrld m6, 5 | |
12159 packusdw m5, m6 | |
12160 | |
12161 pmaddwd m6, m13, [r3 + 1 * 32] ; [17] | |
12162 paddd m6, [pd_16] | |
12163 psrld m6, 5 | |
12164 pmaddwd m8, m2, [r3 + 1 * 32] | |
12165 paddd m8, [pd_16] | |
12166 psrld m8, 5 | |
12167 packusdw m6, m8 | |
12168 | |
12169 pmaddwd m7, m13, [r3 - 12 * 32] ; [4] | |
12170 paddd m7, [pd_16] | |
12171 psrld m7, 5 | |
12172 pmaddwd m8, m2, [r3 - 12 * 32] | |
12173 paddd m8, [pd_16] | |
12174 psrld m8, 5 | |
12175 packusdw m7, m8 | |
12176 | |
12177 palignr m2, m1, m14, 14 ; [ 7 6 6 5 5 4 4 3 2 5 5 7 7 10 10 12] | |
12178 palignr m0, m3, m1, 14 ; [11 10 10 9 9 8 8 7 3 2 2 1 1 0 0 2] | |
12179 | |
12180 pmaddwd m8, m2, [r3 + 7 * 32] ; [23] | |
12181 paddd m8, [pd_16] | |
12182 psrld m8, 5 | |
12183 pmaddwd m9, m0, [r3 + 7 * 32] | |
12184 paddd m9, [pd_16] | |
12185 psrld m9, 5 | |
12186 packusdw m8, m9 | |
12187 | |
12188 pmaddwd m9, m2, [r3 - 6 * 32] ; [10] | |
12189 paddd m9, [pd_16] | |
12190 psrld m9, 5 | |
12191 pmaddwd m2, m0, [r3 - 6 * 32] | |
12192 paddd m2, [pd_16] | |
12193 psrld m2, 5 | |
12194 packusdw m9, m2 | |
12195 | |
;; Order matters: m3 is rebuilt from the old m1 before m1 itself is replaced.
12196 palignr m3, m1, 10 ; [10 9 9 8 8 7 7 6 2 1 1 0 0 2 2 5] | |
12197 palignr m1, m14, 10 ; [ 6 5 5 4 4 3 3 2 5 7 7 10 10 12 12 15] | |
12198 | |
12199 pmaddwd m2, m1, [r3 + 13 * 32] ; [29] | |
12200 paddd m2, [pd_16] | |
12201 psrld m2, 5 | |
12202 pmaddwd m0, m3, [r3 + 13 * 32] | |
12203 paddd m0, [pd_16] | |
12204 psrld m0, 5 | |
12205 packusdw m2, m0 | |
12206 | |
12207 pmaddwd m1, [r3] ; [16] | |
12208 paddd m1, [pd_16] | |
12209 psrld m1, 5 | |
12210 pmaddwd m3, [r3] | |
12211 paddd m3, [pd_16] | |
12212 psrld m3, 5 | |
12213 packusdw m1, m3 | |
12214 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 2, 1, 0, 3, 16 | |
12215 ret | |
12216 | |
12217 ;; angle 16, modes 15 and 21 | |
;; Produces sixteen packed result vectors in two batches of eight; each batch
;; is handed to TRANSPOSE_STORE_AVX2 (last macro argument 0, then 16).
;; In:  r2  -> 16-bit reference samples, read unaligned from [r2] and [r2 + 2].
;;      r3  -> coefficient table base, addressed as [r3 + k * 32]; the bracketed
;;             tag on each step equals 16 + k (r3 presumably points at table
;;             row 16 -- set up by the caller, TODO confirm).
;;      m1, m14 -> low 128-bit lanes are read before anything here writes them,
;;             so the caller must preload both (presumably with extra
;;             reference samples -- confirm against the caller).
;;      r6d -> tested on entry; none of the instructions written out below
;;             modify EFLAGS. NOTE(review): presumably consumed inside the
;;             macro, which is defined outside this view -- confirm.
;; Per vector: pmaddwd of sample pairs with one table row, add [pd_16],
;; logical shift right by 5, then packusdw of the low/high halves.
12218 cglobal ang16_mode_15_21 | |
12219 test r6d, r6d | |
12220 | |
12221 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
12222 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12223 | |
12224 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
12225 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
12226 | |
12227 pmaddwd m4, m3, [r3 - 1 * 32] ; [15] | |
12228 paddd m4, [pd_16] | |
12229 psrld m4, 5 | |
12230 pmaddwd m5, m2, [r3 - 1 * 32] | |
12231 paddd m5, [pd_16] | |
12232 psrld m5, 5 | |
12233 packusdw m4, m5 | |
12234 | |
;; From here on m3 / m0 hold each sample duplicated. m1 and m14 extend the
;; window downwards: only their high lanes are written here, the low lanes
;; are whatever the caller left in them.
12235 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
12236 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
12237 vinserti128 m1, m1, xm0, 1 | |
12238 vinserti128 m14, m14, xm3, 1 | |
12239 | |
12240 palignr m2, m3, m1, 14 | |
12241 palignr m13, m0, m3, 14 | |
12242 | |
12243 pmaddwd m5, m2, [r3 + 14 * 32] ; [30] | |
12244 paddd m5, [pd_16] | |
12245 psrld m5, 5 | |
12246 pmaddwd m8, m13, [r3 + 14 * 32] | |
12247 paddd m8, [pd_16] | |
12248 psrld m8, 5 | |
12249 packusdw m5, m8 | |
12250 | |
12251 pmaddwd m6, m2, [r3 - 3 * 32] ; [13] | |
12252 paddd m6, [pd_16] | |
12253 psrld m6, 5 | |
12254 pmaddwd m9, m13, [r3 - 3 * 32] | |
12255 paddd m9, [pd_16] | |
12256 psrld m9, 5 | |
12257 packusdw m6, m9 | |
12258 | |
12259 palignr m2, m3, m1, 10 | |
12260 palignr m13, m0, m3, 10 | |
12261 | |
12262 pmaddwd m7, m2, [r3 + 12 * 32] ; [28] | |
12263 paddd m7, [pd_16] | |
12264 psrld m7, 5 | |
12265 pmaddwd m8, m13, [r3 + 12 * 32] | |
12266 paddd m8, [pd_16] | |
12267 psrld m8, 5 | |
12268 packusdw m7, m8 | |
12269 | |
12270 pmaddwd m8, m2, [r3 - 5 * 32] ; [11] | |
12271 paddd m8, [pd_16] | |
12272 psrld m8, 5 | |
12273 pmaddwd m9, m13, [r3 - 5 * 32] | |
12274 paddd m9, [pd_16] | |
12275 psrld m9, 5 | |
12276 packusdw m8, m9 | |
12277 | |
12278 palignr m2, m3, m1, 6 | |
12279 palignr m13, m0, m3, 6 | |
12280 | |
12281 pmaddwd m9, m2, [r3 + 10 * 32] ; [26] | |
12282 paddd m9, [pd_16] | |
12283 psrld m9, 5 | |
12284 pmaddwd m10, m13, [r3 + 10 * 32] | |
12285 paddd m10, [pd_16] | |
12286 psrld m10, 5 | |
12287 packusdw m9, m10 | |
12288 | |
12289 pmaddwd m10, m2, [r3 - 7 * 32] ; [9] | |
12290 paddd m10, [pd_16] | |
12291 psrld m10, 5 | |
12292 pmaddwd m12, m13, [r3 - 7 * 32] | |
12293 paddd m12, [pd_16] | |
12294 psrld m12, 5 | |
12295 packusdw m10, m12 | |
12296 | |
12297 palignr m2, m3, m1, 2 | |
12298 palignr m13, m0, m3, 2 | |
12299 | |
12300 pmaddwd m11, m2, [r3 + 8 * 32] ; [24] | |
12301 paddd m11, [pd_16] | |
12302 psrld m11, 5 | |
12303 pmaddwd m13, [r3 + 8 * 32] | |
12304 paddd m13, [pd_16] | |
12305 psrld m13, 5 | |
12306 packusdw m11, m13 | |
12307 | |
12308 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
12309 | |
;; m13 was overwritten above (in-place pmaddwd, then passed to the macro),
;; so the shifted high-half window is rebuilt before it is used again.
12310 palignr m13, m0, m3, 2 | |
12311 | |
12312 pmaddwd m4, m2, [r3 - 9 * 32] ; [7] | |
12313 paddd m4, [pd_16] | |
12314 psrld m4, 5 | |
12315 pmaddwd m5, m13, [r3 - 9 * 32] | |
12316 paddd m5, [pd_16] | |
12317 psrld m5, 5 | |
12318 packusdw m4, m5 | |
12319 | |
;; The window now moves one register down: (m1:m14) for the lower half and
;; (m3:m1) for the upper half.
12320 palignr m6, m1, m14, 14 | |
12321 palignr m7, m3, m1, 14 | |
12322 | |
12323 pmaddwd m5, m6, [r3 + 6 * 32] ; [22] | |
12324 paddd m5, [pd_16] | |
12325 psrld m5, 5 | |
12326 pmaddwd m8, m7, [r3 + 6 * 32] | |
12327 paddd m8, [pd_16] | |
12328 psrld m8, 5 | |
12329 packusdw m5, m8 | |
12330 | |
12331 pmaddwd m6, [r3 - 11 * 32] ; [5] | |
12332 paddd m6, [pd_16] | |
12333 psrld m6, 5 | |
12334 pmaddwd m7, [r3 - 11 * 32] | |
12335 paddd m7, [pd_16] | |
12336 psrld m7, 5 | |
12337 packusdw m6, m7 | |
12338 | |
12339 palignr m8, m1, m14, 10 | |
12340 palignr m9, m3, m1, 10 | |
12341 | |
12342 pmaddwd m7, m8, [r3 + 4 * 32] ; [20] | |
12343 paddd m7, [pd_16] | |
12344 psrld m7, 5 | |
12345 pmaddwd m10, m9, [r3 + 4 * 32] | |
12346 paddd m10, [pd_16] | |
12347 psrld m10, 5 | |
12348 packusdw m7, m10 | |
12349 | |
12350 pmaddwd m8, [r3 - 13 * 32] ; [3] | |
12351 paddd m8, [pd_16] | |
12352 psrld m8, 5 | |
12353 pmaddwd m9, [r3 - 13 * 32] | |
12354 paddd m9, [pd_16] | |
12355 psrld m9, 5 | |
12356 packusdw m8, m9 | |
12357 | |
12358 palignr m2, m1, m14, 6 | |
12359 palignr m0, m3, m1, 6 | |
12360 | |
12361 pmaddwd m9, m2, [r3 + 2 * 32] ; [18] | |
12362 paddd m9, [pd_16] | |
12363 psrld m9, 5 | |
12364 pmaddwd m13, m0, [r3 + 2 * 32] | |
12365 paddd m13, [pd_16] | |
12366 psrld m13, 5 | |
12367 packusdw m9, m13 | |
12368 | |
12369 pmaddwd m2, [r3 - 15 * 32] ; [1] | |
12370 paddd m2, [pd_16] | |
12371 psrld m2, 5 | |
12372 pmaddwd m0, [r3 - 15 * 32] | |
12373 paddd m0, [pd_16] | |
12374 psrld m0, 5 | |
12375 packusdw m2, m0 | |
12376 | |
;; Order matters: m3 is rebuilt from the old m1 before m1 itself is replaced.
12377 palignr m3, m1, 2 | |
12378 palignr m1, m14, 2 | |
12379 | |
12380 pmaddwd m1, [r3] ; [16] | |
12381 paddd m1, [pd_16] | |
12382 psrld m1, 5 | |
12383 pmaddwd m3, [r3] | |
12384 paddd m3, [pd_16] | |
12385 psrld m3, 5 | |
12386 packusdw m1, m3 | |
12387 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 2, 1, 0, 3, 16 | |
12388 ret | |
12389 | |
12390 ;; angle 16, modes 16 and 20 | |
;; Produces sixteen packed result vectors in two batches of eight; each batch
;; is handed to TRANSPOSE_STORE_AVX2 (last macro argument 0, then 16).
;; In:  r2  -> 16-bit reference samples, read unaligned from [r2] and [r2 + 2].
;;      r3  -> coefficient table base, addressed as [r3 + k * 32]; the bracketed
;;             tag on each step equals 16 + k (r3 presumably points at table
;;             row 16 -- set up by the caller, TODO confirm).
;;      m1, m14, m2 -> low 128-bit lanes are read before anything here writes
;;             them, so the caller must preload all three (presumably with the
;;             extra reference samples shown in the vinserti128 comments --
;;             confirm against the caller).
;;      r6d -> tested on entry; none of the instructions written out below
;;             modify EFLAGS. NOTE(review): presumably consumed inside the
;;             macro, which is defined outside this view -- confirm.
;; Per vector: pmaddwd of sample pairs with one table row, add [pd_16],
;; logical shift right by 5, then packusdw of the low/high halves.
12391 cglobal ang16_mode_16_20 | |
12392 test r6d, r6d | |
12393 | |
12394 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
12395 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12396 | |
12397 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
12398 punpckhwd m12, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
12399 | |
12400 pmaddwd m4, m3, [r3 - 5 * 32] ; [11] | |
12401 paddd m4, [pd_16] | |
12402 psrld m4, 5 | |
12403 pmaddwd m5, m12, [r3 - 5 * 32] | |
12404 paddd m5, [pd_16] | |
12405 psrld m5, 5 | |
12406 packusdw m4, m5 | |
12407 | |
;; From here on m3 / m0 hold each sample duplicated. m1, m14 and m2 extend
;; the window downwards: only their high lanes are written here (m2 receives
;; the caller-provided low lane of m1), the low lanes come from the caller.
12408 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
12409 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
12410 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 2 2 3 3 5 5 6 6] | |
12411 vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 8 8 9 9 11 11 12 12] | |
12412 vinserti128 m2, m2, xm1, 1 ; [ 2 2 3 3 5 5 6 6 14 14 15 15 x x x x] | |
12413 | |
12414 palignr m12, m3, m1, 14 | |
12415 palignr m13, m0, m3, 14 | |
12416 | |
12417 pmaddwd m5, m12, [r3 + 6 * 32] ; [22] | |
12418 paddd m5, [pd_16] | |
12419 psrld m5, 5 | |
12420 pmaddwd m8, m13, [r3 + 6 * 32] | |
12421 paddd m8, [pd_16] | |
12422 psrld m8, 5 | |
12423 packusdw m5, m8 | |
12424 | |
12425 pmaddwd m6, m12, [r3 - 15 * 32] ; [1] | |
12426 paddd m6, [pd_16] | |
12427 psrld m6, 5 | |
12428 pmaddwd m9, m13, [r3 - 15 * 32] | |
12429 paddd m9, [pd_16] | |
12430 psrld m9, 5 | |
12431 packusdw m6, m9 | |
12432 | |
12433 palignr m12, m3, m1, 10 | |
12434 palignr m13, m0, m3, 10 | |
12435 | |
12436 pmaddwd m7, m12, [r3 - 4 * 32] ; [12] | |
12437 paddd m7, [pd_16] | |
12438 psrld m7, 5 | |
12439 pmaddwd m8, m13, [r3 - 4 * 32] | |
12440 paddd m8, [pd_16] | |
12441 psrld m8, 5 | |
12442 packusdw m7, m8 | |
12443 | |
12444 palignr m12, m3, m1, 6 | |
12445 palignr m13, m0, m3, 6 | |
12446 | |
12447 pmaddwd m8, m12, [r3 + 7 * 32] ; [23] | |
12448 paddd m8, [pd_16] | |
12449 psrld m8, 5 | |
12450 pmaddwd m9, m13, [r3 + 7 * 32] | |
12451 paddd m9, [pd_16] | |
12452 psrld m9, 5 | |
12453 packusdw m8, m9 | |
12454 | |
12455 pmaddwd m9, m12, [r3 - 14 * 32] ; [2] | |
12456 paddd m9, [pd_16] | |
12457 psrld m9, 5 | |
12458 pmaddwd m10, m13, [r3 - 14 * 32] | |
12459 paddd m10, [pd_16] | |
12460 psrld m10, 5 | |
12461 packusdw m9, m10 | |
12462 | |
12463 palignr m12, m3, m1, 2 | |
12464 palignr m13, m0, m3, 2 | |
12465 | |
12466 pmaddwd m10, m12, [r3 - 3 * 32] ; [13] | |
12467 paddd m10, [pd_16] | |
12468 psrld m10, 5 | |
12469 pmaddwd m11, m13, [r3 - 3 * 32] | |
12470 paddd m11, [pd_16] | |
12471 psrld m11, 5 | |
12472 packusdw m10, m11 | |
12473 | |
;; The window now moves one register down: (m1:m14) for the lower half and
;; (m3:m1) for the upper half.
12474 palignr m12, m1, m14, 14 | |
12475 palignr m13, m3, m1, 14 | |
12476 | |
12477 pmaddwd m11, m12, [r3 + 8 * 32] ; [24] | |
12478 paddd m11, [pd_16] | |
12479 psrld m11, 5 | |
12480 pmaddwd m13, [r3 + 8 * 32] | |
12481 paddd m13, [pd_16] | |
12482 psrld m13, 5 | |
12483 packusdw m11, m13 | |
12484 | |
;; m0 and m13 are passed in the two slots after the eight vectors. m12 is not
;; in the list; the step below reads it, so it is assumed to be left intact.
;; m0 is not read again before it is overwritten further down.
12485 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 13, 0 | |
12486 | |
12487 palignr m13, m3, m1, 14 | |
12488 | |
12489 pmaddwd m4, m12, [r3 - 13 * 32] ; [3] | |
12490 paddd m4, [pd_16] | |
12491 psrld m4, 5 | |
12492 pmaddwd m5, m13, [r3 - 13 * 32] | |
12493 paddd m5, [pd_16] | |
12494 psrld m5, 5 | |
12495 packusdw m4, m5 | |
12496 | |
12497 palignr m6, m1, m14, 10 | |
12498 palignr m7, m3, m1, 10 | |
12499 | |
12500 pmaddwd m5, m6, [r3 - 2 * 32] ; [14] | |
12501 paddd m5, [pd_16] | |
12502 psrld m5, 5 | |
12503 pmaddwd m8, m7, [r3 - 2 * 32] | |
12504 paddd m8, [pd_16] | |
12505 psrld m8, 5 | |
12506 packusdw m5, m8 | |
12507 | |
12508 palignr m7, m1, m14, 6 | |
12509 palignr m10, m3, m1, 6 | |
12510 | |
12511 pmaddwd m6, m7, [r3 + 9 * 32] ; [25] | |
12512 paddd m6, [pd_16] | |
12513 psrld m6, 5 | |
12514 pmaddwd m8, m10, [r3 + 9 * 32] | |
12515 paddd m8, [pd_16] | |
12516 psrld m8, 5 | |
12517 packusdw m6, m8 | |
12518 | |
12519 pmaddwd m7, [r3 - 12 * 32] ; [4] | |
12520 paddd m7, [pd_16] | |
12521 psrld m7, 5 | |
12522 pmaddwd m10, [r3 - 12 * 32] | |
12523 paddd m10, [pd_16] | |
12524 psrld m10, 5 | |
12525 packusdw m7, m10 | |
12526 | |
12527 palignr m8, m1, m14, 2 ; [ 4 3 3 2 2 1 1 0 6 8 8 9 9 11 11 12] | |
12528 palignr m9, m3, m1, 2 ; [ 8 7 7 6 6 5 5 4 0 2 2 3 3 5 5 6] | |
12529 | |
12530 pmaddwd m8, [r3 - 1 * 32] ; [15] | |
12531 paddd m8, [pd_16] | |
12532 psrld m8, 5 | |
12533 pmaddwd m9, [r3 - 1 * 32] | |
12534 paddd m9, [pd_16] | |
12535 psrld m9, 5 | |
12536 packusdw m8, m9 | |
12537 | |
;; The window moves down once more: (m14:m2) lower half, (m1:m14) upper half.
12538 palignr m12, m14, m2, 14 | |
12539 palignr m0, m1, m14, 14 | |
12540 | |
12541 pmaddwd m9, m12, [r3 + 10 * 32] ; [26] | |
12542 paddd m9, [pd_16] | |
12543 psrld m9, 5 | |
12544 pmaddwd m13, m0, [r3 + 10 * 32] | |
12545 paddd m13, [pd_16] | |
12546 psrld m13, 5 | |
12547 packusdw m9, m13 | |
12548 | |
12549 pmaddwd m12, [r3 - 11 * 32] ; [5] | |
12550 paddd m12, [pd_16] | |
12551 psrld m12, 5 | |
12552 pmaddwd m0, [r3 - 11 * 32] | |
12553 paddd m0, [pd_16] | |
12554 psrld m0, 5 | |
12555 packusdw m12, m0 | |
12556 | |
;; Order matters: m1 is rebuilt from the old m14 before m14 itself is replaced.
12557 palignr m1, m14, 10 | |
12558 palignr m14, m2, 10 | |
12559 | |
12560 pmaddwd m14, [r3] ; [16] | |
12561 paddd m14, [pd_16] | |
12562 psrld m14, 5 | |
12563 pmaddwd m1, [r3] | |
12564 paddd m1, [pd_16] | |
12565 psrld m1, 5 | |
12566 packusdw m14, m1 | |
12567 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 12, 14, 0, 3, 16 | |
12568 ret | |
12569 | |
12570 ;; angle 16, modes 17 and 19 | |
12571 cglobal ang16_mode_17_19 | |
12572 test r6d, r6d | |
12573 | |
12574 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
12575 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12576 | |
12577 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
12578 punpckhwd m12, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
12579 | |
12580 pmaddwd m4, m3, [r3 - 10 * 32] ; [6] | |
12581 paddd m4, [pd_16] | |
12582 psrld m4, 5 | |
12583 pmaddwd m5, m12, [r3 - 10 * 32] | |
12584 paddd m5, [pd_16] | |
12585 psrld m5, 5 | |
12586 packusdw m4, m5 | |
12587 | |
12588 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
12589 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
12590 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 2 2 3 3 5 5 6 6] | |
12591 vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 8 8 9 9 11 11 12 12] | |
12592 vinserti128 m2, m2, xm1, 1 ; [ 2 2 3 3 5 5 6 6 14 14 15 15 x x x x] | |
12593 | |
12594 palignr m12, m3, m1, 14 | |
12595 palignr m13, m0, m3, 14 | |
12596 | |
12597 pmaddwd m5, m12, [r3 - 4 * 32] ; [12] | |
12598 paddd m5, [pd_16] | |
12599 psrld m5, 5 | |
12600 pmaddwd m8, m13, [r3 - 4 * 32] | |
12601 paddd m8, [pd_16] | |
12602 psrld m8, 5 | |
12603 packusdw m5, m8 | |
12604 | |
12605 palignr m12, m3, m1, 10 | |
12606 palignr m13, m0, m3, 10 | |
12607 | |
12608 pmaddwd m6, m12, [r3 + 2 * 32] ; [18] | |
12609 paddd m6, [pd_16] | |
12610 psrld m6, 5 | |
12611 pmaddwd m9, m13, [r3 + 2 * 32] | |
12612 paddd m9, [pd_16] | |
12613 psrld m9, 5 | |
12614 packusdw m6, m9 | |
12615 | |
12616 palignr m12, m3, m1, 6 | |
12617 palignr m13, m0, m3, 6 | |
12618 | |
12619 pmaddwd m7, m12, [r3 + 8 * 32] ; [24] | |
12620 paddd m7, [pd_16] | |
12621 psrld m7, 5 | |
12622 pmaddwd m8, m13, [r3 + 8 * 32] | |
12623 paddd m8, [pd_16] | |
12624 psrld m8, 5 | |
12625 packusdw m7, m8 | |
12626 | |
12627 palignr m12, m3, m1, 2 | |
12628 palignr m13, m0, m3, 2 | |
12629 | |
12630 pmaddwd m8, m12, [r3 + 14 * 32] ; [30] | |
12631 paddd m8, [pd_16] | |
12632 psrld m8, 5 | |
12633 pmaddwd m9, m13, [r3 + 14 * 32] | |
12634 paddd m9, [pd_16] | |
12635 psrld m9, 5 | |
12636 packusdw m8, m9 | |
12637 | |
12638 pmaddwd m9, m12, [r3 - 12 * 32] ; [4] | |
12639 paddd m9, [pd_16] | |
12640 psrld m9, 5 | |
12641 pmaddwd m10, m13, [r3 - 12 * 32] | |
12642 paddd m10, [pd_16] | |
12643 psrld m10, 5 | |
12644 packusdw m9, m10 | |
12645 | |
12646 palignr m12, m1, m14, 14 | |
12647 palignr m13, m3, m1, 14 | |
12648 | |
12649 pmaddwd m10, m12, [r3 - 6 * 32] ; [10] | |
12650 paddd m10, [pd_16] | |
12651 psrld m10, 5 | |
12652 pmaddwd m11, m13, [r3 - 6 * 32] | |
12653 paddd m11, [pd_16] | |
12654 psrld m11, 5 | |
12655 packusdw m10, m11 | |
12656 | |
12657 palignr m12, m1, m14, 10 | |
12658 palignr m13, m3, m1, 10 | |
12659 | |
12660 pmaddwd m11, m12, [r3] ; [16] | |
12661 paddd m11, [pd_16] | |
12662 psrld m11, 5 | |
12663 pmaddwd m13, [r3] | |
12664 paddd m13, [pd_16] | |
12665 psrld m13, 5 | |
12666 packusdw m11, m13 | |
12667 | |
12668 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 13, 0 | |
12669 | |
12670 palignr m12, m1, m14, 6 | |
12671 palignr m13, m3, m1, 6 | |
12672 | |
12673 pmaddwd m4, m12, [r3 + 6 * 32] ; [22] | |
12674 paddd m4, [pd_16] | |
12675 psrld m4, 5 | |
12676 pmaddwd m5, m13, [r3 + 6 * 32] | |
12677 paddd m5, [pd_16] | |
12678 psrld m5, 5 | |
12679 packusdw m4, m5 | |
12680 | |
12681 palignr m12, m1, m14, 2 | |
12682 palignr m13, m3, m1, 2 | |
12683 | |
12684 pmaddwd m5, m12, [r3 + 12 * 32] ; [28] | |
12685 paddd m5, [pd_16] | |
12686 psrld m5, 5 | |
12687 pmaddwd m8, m13, [r3 + 12 * 32] | |
12688 paddd m8, [pd_16] | |
12689 psrld m8, 5 | |
12690 packusdw m5, m8 | |
12691 | |
12692 pmaddwd m6, m12, [r3 - 14 * 32] ; [2] | |
12693 paddd m6, [pd_16] | |
12694 psrld m6, 5 | |
12695 pmaddwd m8, m13, [r3 - 14 * 32] | |
12696 paddd m8, [pd_16] | |
12697 psrld m8, 5 | |
12698 packusdw m6, m8 | |
12699 | |
12700 palignr m7, m14, m2, 14 | |
12701 palignr m0, m1, m14, 14 | |
12702 | |
12703 pmaddwd m7, [r3 - 8 * 32] ; [8] | |
12704 paddd m7, [pd_16] | |
12705 psrld m7, 5 | |
12706 pmaddwd m0, [r3 - 8 * 32] | |
12707 paddd m0, [pd_16] | |
12708 psrld m0, 5 | |
12709 packusdw m7, m0 | |
12710 | |
12711 palignr m8, m14, m2, 10 | |
12712 palignr m9, m1, m14, 10 | |
12713 | |
12714 pmaddwd m8, [r3 - 2 * 32] ; [14] | |
12715 paddd m8, [pd_16] | |
12716 psrld m8, 5 | |
12717 pmaddwd m9, [r3 - 2 * 32] | |
12718 paddd m9, [pd_16] | |
12719 psrld m9, 5 | |
12720 packusdw m8, m9 | |
12721 | |
12722 palignr m9, m14, m2, 6 | |
12723 palignr m13, m1, m14, 6 | |
12724 | |
12725 pmaddwd m9, [r3 + 4 * 32] ; [20] | |
12726 paddd m9, [pd_16] | |
12727 psrld m9, 5 | |
12728 pmaddwd m13, [r3 + 4 * 32] | |
12729 paddd m13, [pd_16] | |
12730 psrld m13, 5 | |
12731 packusdw m9, m13 | |
12732 | |
12733 palignr m1, m14, 2 | |
12734 palignr m14, m2, 2 | |
12735 | |
12736 pmaddwd m12, m14, [r3 + 10 * 32] ; [26] | |
12737 paddd m12, [pd_16] | |
12738 psrld m12, 5 | |
12739 pmaddwd m0, m1, [r3 + 10 * 32] | |
12740 paddd m0, [pd_16] | |
12741 psrld m0, 5 | |
12742 packusdw m12, m0 | |
12743 | |
12744 pmaddwd m14, [r3 - 16 * 32] ; [0] | |
12745 paddd m14, [pd_16] | |
12746 psrld m14, 5 | |
12747 pmaddwd m1, [r3 - 16 * 32] | |
12748 paddd m1, [pd_16] | |
12749 psrld m1, 5 | |
12750 packusdw m14, m1 | |
12751 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 12, 14, 0, 3, 16 | |
12752 ret | |
12753 | |
; intra_pred_ang16_3: entry point that only loads registers and delegates to
; ang16_mode_3_33 (shared with intra_pred_ang16_33).
;   r2 += 64          helper reads the sample set 64 bytes into the buffer
;                     (presumably the left-neighbour column - confirm with caller)
;   r6d = 0           selector for the helper (the mode-33 twin passes 1)
;   r3                ang_table_avx2 + 16*32, table base for the helper
;   r1 *= 2, r4 = 3*r1  stride values, presumably byte strides for 16-bit pixels
12754 cglobal intra_pred_ang16_3, 3,7,13 | |
12755 add r2, 64 | |
12756 xor r6d, r6d | |
12757 lea r3, [ang_table_avx2 + 16 * 32] | |
12758 add r1d, r1d | |
12759 lea r4, [r1 * 3] | |
12760 | |
12761 call ang16_mode_3_33 | |
12762 RET | |
12763 | |
; intra_pred_ang16_33: same register setup as intra_pred_ang16_3, but r2 is
; passed through unchanged and the selector r6d is 1 (xor + inc).
; The actual prediction is done by ang16_mode_3_33.
12764 cglobal intra_pred_ang16_33, 3,7,13 | |
12765 xor r6d, r6d | |
12766 inc r6d | |
12767 lea r3, [ang_table_avx2 + 16 * 32] | |
12768 add r1d, r1d | |
12769 lea r4, [r1 * 3] | |
12770 | |
12771 call ang16_mode_3_33 | |
12772 RET | |
12773 | |
; intra_pred_ang16_4: wrapper for ang16_mode_4_32 with r2 advanced by 64 bytes
; and selector r6d = 0. Note the table base is ang_table_avx2 + 18*32 here
; (other mode pairs use different bases); r1 is doubled and r4 = 3*r1.
12774 cglobal intra_pred_ang16_4, 3,7,13 | |
12775 add r2, 64 | |
12776 xor r6d, r6d | |
12777 lea r3, [ang_table_avx2 + 18 * 32] | |
12778 add r1d, r1d | |
12779 lea r4, [r1 * 3] | |
12780 | |
12781 call ang16_mode_4_32 | |
12782 RET | |
12783 | |
; intra_pred_ang16_32: wrapper for ang16_mode_4_32 with r2 unchanged and
; selector r6d = 1; table base ang_table_avx2 + 18*32, r1 doubled, r4 = 3*r1.
12784 cglobal intra_pred_ang16_32, 3,7,13 | |
12785 xor r6d, r6d | |
12786 inc r6d | |
12787 lea r3, [ang_table_avx2 + 18 * 32] | |
12788 add r1d, r1d | |
12789 lea r4, [r1 * 3] | |
12790 | |
12791 call ang16_mode_4_32 | |
12792 RET | |
12793 | |
; intra_pred_ang16_5: wrapper for ang16_mode_5_31 with r2 advanced by 64 bytes
; and selector r6d = 0; table base ang_table_avx2 + 16*32, r1 doubled, r4 = 3*r1.
12794 cglobal intra_pred_ang16_5, 3,7,13 | |
12795 add r2, 64 | |
12796 xor r6d, r6d | |
12797 lea r3, [ang_table_avx2 + 16 * 32] | |
12798 add r1d, r1d | |
12799 lea r4, [r1 * 3] | |
12800 | |
12801 call ang16_mode_5_31 | |
12802 RET | |
12803 | |
; intra_pred_ang16_31: wrapper for ang16_mode_5_31 with r2 unchanged and
; selector r6d = 1; table base ang_table_avx2 + 16*32, r1 doubled, r4 = 3*r1.
12804 cglobal intra_pred_ang16_31, 3,7,13 | |
12805 xor r6d, r6d | |
12806 inc r6d | |
12807 lea r3, [ang_table_avx2 + 16 * 32] | |
12808 add r1d, r1d | |
12809 lea r4, [r1 * 3] | |
12810 | |
12811 call ang16_mode_5_31 | |
12812 RET | |
12813 | |
; intra_pred_ang16_6: wrapper for ang16_mode_6_30 with r2 advanced by 64 bytes
; and selector r6d = 0; table base ang_table_avx2 + 15*32, r1 doubled, r4 = 3*r1.
; Declares 14 vector registers (one more than the neighbouring wrappers).
12814 cglobal intra_pred_ang16_6, 3,7,14 | |
12815 add r2, 64 | |
12816 xor r6d, r6d | |
12817 lea r3, [ang_table_avx2 + 15 * 32] | |
12818 add r1d, r1d | |
12819 lea r4, [r1 * 3] | |
12820 | |
12821 call ang16_mode_6_30 | |
12822 RET | |
12823 | |
; intra_pred_ang16_30: wrapper for ang16_mode_6_30 with r2 unchanged and
; selector r6d = 1; table base ang_table_avx2 + 15*32, r1 doubled, r4 = 3*r1.
12824 cglobal intra_pred_ang16_30, 3,7,14 | |
12825 xor r6d, r6d | |
12826 inc r6d | |
12827 lea r3, [ang_table_avx2 + 15 * 32] | |
12828 add r1d, r1d | |
12829 lea r4, [r1 * 3] | |
12830 | |
12831 call ang16_mode_6_30 | |
12832 RET | |
12833 | |
; intra_pred_ang16_7: wrapper for ang16_mode_7_29 with r2 advanced by 64 bytes
; and selector r6d = 0; table base ang_table_avx2 + 17*32, r1 doubled, r4 = 3*r1.
12834 cglobal intra_pred_ang16_7, 3,7,13 | |
12835 add r2, 64 | |
12836 xor r6d, r6d | |
12837 lea r3, [ang_table_avx2 + 17 * 32] | |
12838 add r1d, r1d | |
12839 lea r4, [r1 * 3] | |
12840 | |
12841 call ang16_mode_7_29 | |
12842 RET | |
12843 | |
; intra_pred_ang16_29: wrapper for ang16_mode_7_29 with r2 unchanged and
; selector r6d = 1; table base ang_table_avx2 + 17*32, r1 doubled, r4 = 3*r1.
12844 cglobal intra_pred_ang16_29, 3,7,13 | |
12845 xor r6d, r6d | |
12846 inc r6d | |
12847 lea r3, [ang_table_avx2 + 17 * 32] | |
12848 add r1d, r1d | |
12849 lea r4, [r1 * 3] | |
12850 | |
12851 call ang16_mode_7_29 | |
12852 RET | |
12853 | |
; intra_pred_ang16_8: wrapper for ang16_mode_8_28 with r2 advanced by 64 bytes
; and selector r6d = 0; table base ang_table_avx2 + 15*32, r1 doubled, r4 = 3*r1.
12854 cglobal intra_pred_ang16_8, 3,7,13 | |
12855 add r2, 64 | |
12856 xor r6d, r6d | |
12857 lea r3, [ang_table_avx2 + 15 * 32] | |
12858 add r1d, r1d | |
12859 lea r4, [r1 * 3] | |
12860 | |
12861 call ang16_mode_8_28 | |
12862 RET | |
12863 | |
; intra_pred_ang16_28: wrapper for ang16_mode_8_28 with r2 unchanged and
; selector r6d = 1; table base ang_table_avx2 + 15*32, r1 doubled, r4 = 3*r1.
12864 cglobal intra_pred_ang16_28, 3,7,13 | |
12865 xor r6d, r6d | |
12866 inc r6d | |
12867 lea r3, [ang_table_avx2 + 15 * 32] | |
12868 add r1d, r1d | |
12869 lea r4, [r1 * 3] | |
12870 | |
12871 call ang16_mode_8_28 | |
12872 RET | |
12873 | |
; intra_pred_ang16_9: wrapper for ang16_mode_9_27 with r2 advanced by 64 bytes
; and selector r6d = 0; table base ang_table_avx2 + 16*32, r1 doubled, r4 = 3*r1.
; Declares 12 vector registers.
12874 cglobal intra_pred_ang16_9, 3,7,12 | |
12875 add r2, 64 | |
12876 xor r6d, r6d | |
12877 lea r3, [ang_table_avx2 + 16 * 32] | |
12878 add r1d, r1d | |
12879 lea r4, [r1 * 3] | |
12880 | |
12881 call ang16_mode_9_27 | |
12882 RET | |
12883 | |
; intra_pred_ang16_27: wrapper for ang16_mode_9_27 with r2 unchanged and
; selector r6d = 1; table base ang_table_avx2 + 16*32, r1 doubled, r4 = 3*r1.
12884 cglobal intra_pred_ang16_27, 3,7,12 | |
12885 xor r6d, r6d | |
12886 inc r6d | |
12887 lea r3, [ang_table_avx2 + 16 * 32] | |
12888 add r1d, r1d | |
12889 lea r4, [r1 * 3] | |
12890 | |
12891 call ang16_mode_9_27 | |
12892 RET | |
12893 | |
; intra_pred_ang16_10: fills each of the 16 output rows with one broadcast
; sample: row i receives the word at [r2 + 2 + 64 + 2*i] in all 16 positions.
;   r0 = destination, r1 = stride (doubled below, presumably to a byte stride),
;   r2 = neighbour sample buffer, r4m = filter flag (copied to r5d first,
;   because r4 is reused for 3*r1).
; When the flag is non-zero, row 0 is recomputed as
;   clip(row0 + (([r2 + 2 ...] - broadcast([r2])) >> 1), 0, pw_pixel_max)
; and written again at .quit; with the flag clear .quit simply re-stores the
; unfiltered row 0 kept in m0.
12894 cglobal intra_pred_ang16_10, 3,6,3 | |
12895 mov r5d, r4m | |
12896 add r1d, r1d | |
12897 lea r4, [r1 * 3] | |
12898 | |
12899 vpbroadcastw m2, [r2 + 2 + 64] ; [1...] | |
12900 mova m0, m2 | |
12901 movu [r0], m2 | |
12902 vpbroadcastw m1, [r2 + 2 + 64 + 2] ; [2...] | |
12903 movu [r0 + r1], m1 | |
12904 vpbroadcastw m2, [r2 + 2 + 64 + 4] ; [3...] | |
12905 movu [r0 + r1 * 2], m2 | |
12906 vpbroadcastw m1, [r2 + 2 + 64 + 6] ; [4...] | |
12907 movu [r0 + r4], m1 | |
12908 | |
12909 lea r3, [r0 + r1 * 4] | |
12910 vpbroadcastw m2, [r2 + 2 + 64 + 8] ; [5...] | |
12911 movu [r3], m2 | |
12912 vpbroadcastw m1, [r2 + 2 + 64 + 10] ; [6...] | |
12913 movu [r3 + r1], m1 | |
12914 vpbroadcastw m2, [r2 + 2 + 64 + 12] ; [7...] | |
12915 movu [r3 + r1 * 2], m2 | |
12916 vpbroadcastw m1, [r2 + 2 + 64 + 14] ; [8...] | |
12917 movu [r3 + r4], m1 | |
12918 | |
12919 lea r3, [r3 + r1 *4] | |
12920 vpbroadcastw m2, [r2 + 2 + 64 + 16] ; [9...] | |
12921 movu [r3], m2 | |
12922 vpbroadcastw m1, [r2 + 2 + 64 + 18] ; [10...] | |
12923 movu [r3 + r1], m1 | |
12924 vpbroadcastw m2, [r2 + 2 + 64 + 20] ; [11...] | |
12925 movu [r3 + r1 * 2], m2 | |
12926 vpbroadcastw m1, [r2 + 2 + 64 + 22] ; [12...] | |
12927 movu [r3 + r4], m1 | |
12928 | |
12929 lea r3, [r3 + r1 *4] | |
12930 vpbroadcastw m2, [r2 + 2 + 64 + 24] ; [13...] | |
12931 movu [r3], m2 | |
12932 vpbroadcastw m1, [r2 + 2 + 64 + 26] ; [14...] | |
12933 movu [r3 + r1], m1 | |
12934 vpbroadcastw m2, [r2 + 2 + 64 + 28] ; [15...] | |
12935 movu [r3 + r1 * 2], m2 | |
12936 vpbroadcastw m1, [r2 + 2 + 64 + 30] ; [16...] | |
12937 movu [r3 + r4], m1 | |
12938 | |
12939 test r5d, r5d ; filter flag == 0 -> skip the row-0 filter (same ZF as cmp r5d, 0) | |
12940 jz .quit | |
12941 | |
12942 ; filter | |
12943 vpbroadcastw m2, [r2] ; [0 0...] | |
12944 movu m1, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12945 psubw m1, m2 | |
12946 psraw m1, 1 | |
12947 paddw m0, m1 | |
12948 pxor m1, m1 | |
12949 pmaxsw m0, m1 | |
12950 pminsw m0, [pw_pixel_max] | |
12951 .quit: | |
12952 movu [r0], m0 | |
12953 RET | |
12954 | |
; intra_pred_ang16_26: copies the 16 words at [r2 + 2] into every one of the
; 16 output rows.
;   r0 = destination, r1 = stride (doubled below, presumably to a byte stride),
;   r2 = neighbour sample buffer, r4m = filter flag (copied to r5d first,
;   because r4 is reused for 3*r1).
; When the flag is non-zero, the first word of each row is overwritten with
;   clip(broadcast(row[0]) + (([r2 + 2 + 64 ...] - broadcast([r2])) >> 1), 0, pw_pixel_max)
; one pextrw per row. At that point r3 still addresses row 12 (left there by
; the copy loop above), which is why the last four stores go through r3.
12955 cglobal intra_pred_ang16_26, 3,6,4 | |
12956 mov r5d, r4m | |
12957 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12958 add r1d, r1d | |
12959 lea r4, [r1 * 3] | |
12960 | |
12961 movu [r0], m0 | |
12962 movu [r0 + r1], m0 | |
12963 movu [r0 + r1 * 2], m0 | |
12964 movu [r0 + r4], m0 | |
12965 | |
12966 lea r3, [r0 + r1 *4] | |
12967 movu [r3], m0 | |
12968 movu [r3 + r1], m0 | |
12969 movu [r3 + r1 * 2], m0 | |
12970 movu [r3 + r4], m0 | |
12971 | |
12972 lea r3, [r3 + r1 *4] | |
12973 movu [r3], m0 | |
12974 movu [r3 + r1], m0 | |
12975 movu [r3 + r1 * 2], m0 | |
12976 movu [r3 + r4], m0 | |
12977 | |
12978 lea r3, [r3 + r1 *4] | |
12979 movu [r3], m0 | |
12980 movu [r3 + r1], m0 | |
12981 movu [r3 + r1 * 2], m0 | |
12982 movu [r3 + r4], m0 | |
12983 | |
12984 test r5d, r5d ; filter flag == 0 -> done (same ZF as cmp r5d, 0) | |
12985 jz .quit | |
12986 | |
12987 ; filter | |
12988 | |
12989 vpbroadcastw m0, xm0 | |
12990 vpbroadcastw m2, [r2] | |
12991 movu m1, [r2 + 2 + 64] | |
12992 psubw m1, m2 | |
12993 psraw m1, 1 | |
12994 paddw m0, m1 | |
12995 pxor m1, m1 | |
12996 pmaxsw m0, m1 | |
12997 pminsw m0, [pw_pixel_max] | |
12998 pextrw [r0], xm0, 0 | |
12999 pextrw [r0 + r1], xm0, 1 | |
13000 pextrw [r0 + r1 * 2], xm0, 2 | |
13001 pextrw [r0 + r4], xm0, 3 | |
13002 lea r0, [r0 + r1 * 4] | |
13003 pextrw [r0], xm0, 4 | |
13004 pextrw [r0 + r1], xm0, 5 | |
13005 pextrw [r0 + r1 * 2], xm0, 6 | |
13006 pextrw [r0 + r4], xm0, 7 | |
13007 lea r0, [r0 + r1 * 4] | |
13008 vpermq m0, m0, 11101110b ; move words 8..15 into the low 128 bits for pextrw | |
13009 pextrw [r0], xm0, 0 | |
13010 pextrw [r0 + r1], xm0, 1 | |
13011 pextrw [r0 + r1 * 2], xm0, 2 | |
13012 pextrw [r0 + r4], xm0, 3 | |
13013 pextrw [r3], xm0, 4 | |
13014 pextrw [r3 + r1], xm0, 5 | |
13015 pextrw [r3 + r1 * 2], xm0, 6 | |
13016 pextrw [r3 + r4], xm0, 7 | |
13017 .quit: | |
13018 RET | |
13019 | |
; intra_pred_ang16_11: wrapper for ang16_mode_11_25 with selector r6d = 0.
; The word at [r2 + 64] is saved in the 4-byte stack slot and temporarily
; replaced by the word at [r2]; after the call it is written back, so the
; caller's buffer is modified only while the helper runs.
; The restore stores through r2 after `add r2, 64`, i.e. it assumes the helper
; returns with r2 unchanged - confirm against ang16_mode_11_25.
; `mov r6d, [rsp]` reads 4 bytes although only 2 were saved; only r6w is used.
13020 cglobal intra_pred_ang16_11, 3,7,12, 0-4 | |
13021 movzx r5d, word [r2 + 64] | |
13022 movzx r6d, word [r2] | |
13023 mov [rsp], r5w | |
13024 mov [r2 + 64], r6w | |
13025 | |
13026 add r2, 64 | |
13027 xor r6d, r6d | |
13028 lea r3, [ang_table_avx2 + 16 * 32] | |
13029 add r1d, r1d | |
13030 lea r4, [r1 * 3] | |
13031 | |
13032 call ang16_mode_11_25 | |
13033 | |
13034 mov r6d, [rsp] | |
13035 mov [r2], r6w | |
13036 RET | |
13037 | |
; intra_pred_ang16_25: wrapper for ang16_mode_11_25 with r2 unchanged and
; selector r6d = 1; table base ang_table_avx2 + 16*32, r1 doubled, r4 = 3*r1.
; Unlike intra_pred_ang16_11 it does not patch the input buffer.
13038 cglobal intra_pred_ang16_25, 3,7,12 | |
13039 xor r6d, r6d | |
13040 inc r6d | |
13041 lea r3, [ang_table_avx2 + 16 * 32] | |
13042 add r1d, r1d | |
13043 lea r4, [r1 * 3] | |
13044 | |
13045 call ang16_mode_11_25 | |
13046 RET | |
13047 | |
; intra_pred_ang16_12: wrapper for ang16_mode_12_24 with selector r6d = 0.
; Saves the word at [r2 + 64], replaces it with the word at [r2] for the
; duration of the call and restores it afterwards (the restore assumes the
; helper leaves r2 unchanged - confirm).
; xm1 is preloaded with samples 6 and 13 of the first sample set (taken before
; r2 is advanced), arranged by pw_ang16_12_24 for the helper.
13048 cglobal intra_pred_ang16_12, 3,7,14, 0-4 | |
13049 movzx r5d, word [r2 + 64] | |
13050 movzx r6d, word [r2] | |
13051 mov [rsp], r5w | |
13052 mov [r2 + 64], r6w | |
13053 | |
13054 add r1d, r1d | |
13055 lea r4, [r1 * 3] | |
13056 lea r3, [ang_table_avx2 + 16 * 32] | |
13057 movu xm1, [r2 + 12] ; [13 12 11 10 9 8 7 6] | |
13058 pshufb xm1, [pw_ang16_12_24] ; [ 6 6 13 13 x x x x] | |
13059 xor r6d, r6d | |
13060 add r2, 64 | |
13061 | |
13062 call ang16_mode_12_24 | |
13063 | |
13064 mov r6d, [rsp] | |
13065 mov [r2], r6w | |
13066 RET | |
13067 | |
; intra_pred_ang16_24: wrapper for ang16_mode_12_24 with r2 unchanged and
; selector r6d = 1. xm1 is preloaded from [r2 + 76] (= 64 + 12, i.e. samples
; 6..13 of the second sample set) and arranged by pw_ang16_12_24.
; NOTE(review): the `0-4` stack reservation is never touched in this wrapper;
; it looks copied from intra_pred_ang16_12 - confirm the helper does not rely
; on it before removing.
13068 cglobal intra_pred_ang16_24, 3,7,14, 0-4 | |
13069 add r1d, r1d | |
13070 lea r4, [r1 * 3] | |
13071 lea r3, [ang_table_avx2 + 16 * 32] | |
13072 movu xm1, [r2 + 76] ; [13 12 11 10 9 8 7 6] | |
13073 pshufb xm1, [pw_ang16_12_24] ; [ 6 6 13 13 x x x x] | |
13074 xor r6d, r6d | |
13075 inc r6d | |
13076 | |
13077 call ang16_mode_12_24 | |
13078 RET | |
13079 | |
; intra_pred_ang16_13: wrapper for ang16_mode_13_23 with selector r6d = 0.
; Saves the word at [r2 + 64], replaces it with the word at [r2] for the
; duration of the call and restores it afterwards (the restore assumes the
; helper leaves r2 unchanged - confirm).
; xm1 is preloaded with samples 4, 7, 11 and 14 of the first sample set:
; an 8-word load at [r2 + 8], sample 14 inserted into word 1, then arranged
; by pw_ang16_13_23.
13080 cglobal intra_pred_ang16_13, 3,7,14, 0-4 | |
13081 movzx r5d, word [r2 + 64] | |
13082 movzx r6d, word [r2] | |
13083 mov [rsp], r5w | |
13084 mov [r2 + 64], r6w | |
13085 | |
13086 add r1d, r1d | |
13087 lea r4, [r1 * 3] | |
13088 lea r3, [ang_table_avx2 + 16 * 32] | |
13089 movu xm1, [r2 + 8] ; [11 x x x 7 x x 4] | |
13090 pinsrw xm1, [r2 + 28], 1 ; [11 x x x 7 x 14 4] | |
13091 pshufb xm1, [pw_ang16_13_23] ; [ 4 4 7 7 11 11 14 14] | |
13092 xor r6d, r6d | |
13093 add r2, 64 | |
13094 | |
13095 call ang16_mode_13_23 | |
13096 | |
13097 mov r6d, [rsp] | |
13098 mov [r2], r6w | |
13099 RET | |
13100 | |
; intra_pred_ang16_23: wrapper for ang16_mode_13_23 with r2 unchanged and
; selector r6d = 1. xm1 is preloaded with samples 4, 7, 11 and 14 of the
; second sample set (offsets 72 = 64 + 8 and 92 = 64 + 28).
; NOTE(review): the `0-4` stack reservation is never touched in this wrapper;
; confirm the helper does not rely on it before removing.
13101 cglobal intra_pred_ang16_23, 3,7,14, 0-4 | |
13102 add r1d, r1d | |
13103 lea r4, [r1 * 3] | |
13104 lea r3, [ang_table_avx2 + 16 * 32] | |
13105 movu xm1, [r2 + 72] ; [11 10 9 8 7 6 5 4] | |
13106 pinsrw xm1, [r2 + 92], 1 ; [11 x x x 7 x 14 4] | |
13107 pshufb xm1, [pw_ang16_13_23] ; [ 4 4 7 7 11 11 14 14] | |
13108 xor r6d, r6d | |
13109 inc r6d | |
13110 | |
13111 call ang16_mode_13_23 | |
13112 RET | |
13113 | |
; intra_pred_ang16_14: wrapper for ang16_mode_14_22 with selector r6d = 0.
; Saves the word at [r2 + 64], replaces it with the word at [r2] for the
; duration of the call and restores it afterwards (the restore assumes the
; helper leaves r2 unchanged - confirm).
; xm1 / xm14 are preloaded with samples 2, 5, 7, 10 and 12, 15 of the first
; sample set, arranged by pw_ang16_14_22.
13114 cglobal intra_pred_ang16_14, 3,7,15, 0-4 | |
13115 movzx r5d, word [r2 + 64] | |
13116 movzx r6d, word [r2] | |
13117 mov [rsp], r5w | |
13118 mov [r2 + 64], r6w | |
13119 | |
13120 add r1d, r1d | |
13121 lea r4, [r1 * 3] | |
13122 lea r3, [ang_table_avx2 + 16 * 32] | |
13123 movu xm1, [r2 + 4] ; [ x x 7 x 5 x x 2] | |
13124 pinsrw xm1, [r2 + 20], 1 ; [ x x 7 x 5 x 10 2] | |
13125 movu xm14, [r2 + 24] ; [ x x x x 15 x x 12] | |
13126 pshufb xm14, [pw_ang16_14_22] ; [12 12 15 15 x x x x] | |
13127 pshufb xm1, [pw_ang16_14_22] ; [ 2 2 5 5 7 7 10 10] | |
13128 xor r6d, r6d | |
13129 add r2, 64 | |
13130 | |
13131 call ang16_mode_14_22 | |
13132 | |
13133 mov r6d, [rsp] | |
13134 mov [r2], r6w | |
13135 RET | |
13136 | |
; intra_pred_ang16_22: wrapper for ang16_mode_14_22 with r2 unchanged and
; selector r6d = 1. xm1 / xm14 are preloaded with samples 2, 5, 7, 10 and
; 12, 15 of the second sample set (all offsets are the mode-14 ones + 64).
; NOTE(review): the `0-4` stack reservation is never touched in this wrapper;
; confirm the helper does not rely on it before removing.
13137 cglobal intra_pred_ang16_22, 3,7,15, 0-4 | |
13138 add r1d, r1d | |
13139 lea r4, [r1 * 3] | |
13140 lea r3, [ang_table_avx2 + 16 * 32] | |
13141 movu xm1, [r2 + 68] ; [ x x 7 x 5 x x 2] | |
13142 pinsrw xm1, [r2 + 84], 1 ; [ x x 7 x 5 x 10 2] | |
13143 movu xm14, [r2 + 88] ; [ x x x x 15 x x 12] | |
13144 pshufb xm14, [pw_ang16_14_22] ; [12 12 15 15 x x x x] | |
13145 pshufb xm1, [pw_ang16_14_22] ; [ 2 2 5 5 7 7 10 10] | |
13146 xor r6d, r6d | |
13147 inc r6d | |
13148 | |
13149 call ang16_mode_14_22 | |
13150 RET | |
13151 | |
; intra_pred_ang16_15: wrapper for ang16_mode_15_21 with selector r6d = 0.
; Saves the word at [r2 + 64], replaces it with the word at [r2] for the
; duration of the call and restores it afterwards (the restore assumes the
; helper leaves r2 unchanged - confirm).
; xm1 / xm14 are preloaded with samples 2, 4, 6, 8 and 9, 11, 13, 15 of the
; first sample set, arranged by pw_ang16_15_21.
13152 cglobal intra_pred_ang16_15, 3,7,15, 0-4 | |
13153 movzx r5d, word [r2 + 64] | |
13154 movzx r6d, word [r2] | |
13155 mov [rsp], r5w | |
13156 mov [r2 + 64], r6w | |
13157 | |
13158 add r1d, r1d | |
13159 lea r4, [r1 * 3] | |
13160 lea r3, [ang_table_avx2 + 16 * 32] | |
13161 movu xm1, [r2 + 4] ; [ x 8 x 6 x 4 x 2] | |
13162 movu xm14, [r2 + 18] ; [ x 15 x 13 x 11 x 9] | |
13163 pshufb xm14, [pw_ang16_15_21] ; [ 9 9 11 11 13 13 15 15] | |
13164 pshufb xm1, [pw_ang16_15_21] ; [ 2 2 4 4 6 6 8 8] | |
13165 xor r6d, r6d | |
13166 add r2, 64 | |
13167 | |
13168 call ang16_mode_15_21 | |
13169 | |
13170 mov r6d, [rsp] | |
13171 mov [r2], r6w | |
13172 RET | |
13173 | |
; intra_pred_ang16_21: wrapper for ang16_mode_15_21 with r2 unchanged and
; selector r6d = 1. xm1 / xm14 are preloaded with samples 2, 4, 6, 8 and
; 9, 11, 13, 15 of the second sample set (mode-15 offsets + 64).
; NOTE(review): the `0-4` stack reservation is never touched in this wrapper;
; confirm the helper does not rely on it before removing.
13174 cglobal intra_pred_ang16_21, 3,7,15, 0-4 | |
13175 add r1d, r1d | |
13176 lea r4, [r1 * 3] | |
13177 lea r3, [ang_table_avx2 + 16 * 32] | |
13178 movu xm1, [r2 + 68] ; [ x 8 x 6 x 4 x 2] | |
13179 movu xm14, [r2 + 82] ; [ x 15 x 13 x 11 x 9] | |
13180 pshufb xm14, [pw_ang16_15_21] ; [ 9 9 11 11 13 13 15 15] | |
13181 pshufb xm1, [pw_ang16_15_21] ; [ 2 2 4 4 6 6 8 8] | |
13182 xor r6d, r6d | |
13183 inc r6d | |
13184 | |
13185 call ang16_mode_15_21 | |
13186 RET | |
13187 | |
; intra_pred_ang16_16: wrapper for ang16_mode_16_20 with selector r6d = 0.
; Saves the word at [r2 + 64], replaces it with the word at [r2] for the
; duration of the call and restores it afterwards (the restore assumes the
; helper leaves r2 unchanged - confirm).
; xm1 / xm14 / xm2 are preloaded with samples 2, 3, 5, 6 / 8, 9, 11, 12 /
; 14, 15 of the first sample set, all arranged by the same pw_ang16_16_20 mask.
13188 cglobal intra_pred_ang16_16, 3,7,15, 0-4 | |
13189 movzx r5d, word [r2 + 64] | |
13190 movzx r6d, word [r2] | |
13191 mov [rsp], r5w | |
13192 mov [r2 + 64], r6w | |
13193 | |
13194 add r1d, r1d | |
13195 lea r4, [r1 * 3] | |
13196 lea r3, [ang_table_avx2 + 16 * 32] | |
13197 movu xm1, [r2 + 4] ; [ x x x 6 5 x 3 2] | |
13198 movu xm14, [r2 + 16] ; [ x x x 12 11 x 9 8] | |
13199 movu xm2, [r2 + 28] ; [ x x x x x x 15 14] | |
13200 pshufb xm14, [pw_ang16_16_20] ; [ 8 8 9 9 11 11 12 12] | |
13201 pshufb xm1, [pw_ang16_16_20] ; [ 2 2 3 3 5 5 6 6] | |
13202 pshufb xm2, [pw_ang16_16_20] ; [14 14 15 15 x x x x] | |
13203 xor r6d, r6d | |
13204 add r2, 64 | |
13205 | |
13206 call ang16_mode_16_20 | |
13207 | |
13208 mov r6d, [rsp] | |
13209 mov [r2], r6w | |
13210 RET | |
13211 | |
; intra_pred_ang16_20: wrapper for ang16_mode_16_20 with r2 unchanged and
; selector r6d = 1. xm1 / xm14 / xm2 are preloaded with samples 2, 3, 5, 6 /
; 8, 9, 11, 12 / 14, 15 of the second sample set (mode-16 offsets + 64).
; NOTE(review): the `0-4` stack reservation is never touched in this wrapper;
; confirm the helper does not rely on it before removing.
13212 cglobal intra_pred_ang16_20, 3,7,15, 0-4 | |
13213 add r1d, r1d | |
13214 lea r4, [r1 * 3] | |
13215 lea r3, [ang_table_avx2 + 16 * 32] | |
13216 movu xm1, [r2 + 68] ; [ x x x 6 5 x 3 2] | |
13217 movu xm14, [r2 + 80] ; [ x x x 12 11 x 9 8] | |
13218 movu xm2, [r2 + 92] ; [ x x x x x x 15 14] | |
13219 pshufb xm14, [pw_ang16_16_20] ; [ 8 8 9 9 11 11 12 12] | |
13220 pshufb xm1, [pw_ang16_16_20] ; [ 2 2 3 3 5 5 6 6] | |
13221 pshufb xm2, [pw_ang16_16_20] ; [14 14 15 15 x x x x] | |
13222 xor r6d, r6d | |
13223 inc r6d | |
13224 | |
13225 call ang16_mode_16_20 | |
13226 RET | |
13227 | |
; intra_pred_ang16_17: wrapper for ang16_mode_17_19 with selector r6d = 0.
; Saves the word at [r2 + 64], replaces it with the word at [r2] for the
; duration of the call and restores it afterwards (the restore assumes the
; helper leaves r2 unchanged - confirm).
; The preload reuses the mode-16 mask pw_ang16_16_20 but with load offsets
; 2 / 12 / 22 instead of 4 / 16 / 28, so the selected samples are
; 1, 2, 4, 5 / 6, 7, 9, 10 / 11, 12, 14, 15. The lane comments below are
; derived from those offsets (same word positions as in intra_pred_ang16_16).
13228 cglobal intra_pred_ang16_17, 3,7,15, 0-4 | |
13229 movzx r5d, word [r2 + 64] | |
13230 movzx r6d, word [r2] | |
13231 mov [rsp], r5w | |
13232 mov [r2 + 64], r6w | |
13233 | |
13234 add r1d, r1d | |
13235 lea r4, [r1 * 3] | |
13236 lea r3, [ang_table_avx2 + 16 * 32] | |
13237 movu xm1, [r2 + 2] ; [ x x x 5 4 x 2 1] | |
13238 movu xm14, [r2 + 12] ; [ x x x 10 9 x 7 6] | |
13239 movu xm2, [r2 + 22] ; [ x x x 15 14 x 12 11] | |
13240 pshufb xm14, [pw_ang16_16_20] ; [ 6 6 7 7 9 9 10 10] | |
13241 pshufb xm1, [pw_ang16_16_20] ; [ 1 1 2 2 4 4 5 5] | |
13242 pshufb xm2, [pw_ang16_16_20] ; [11 11 12 12 14 14 15 15] | |
13243 xor r6d, r6d | |
13244 add r2, 64 | |
13245 | |
13246 call ang16_mode_17_19 | |
13247 | |
13248 mov r6d, [rsp] | |
13249 mov [r2], r6w | |
13250 RET | |
13251 | |
; intra_pred_ang16_19: wrapper for ang16_mode_17_19 with r2 unchanged and
; selector r6d = 1. Same preload as intra_pred_ang16_17 but from the second
; sample set (offsets 66 / 76 / 86 = 64 + 2 / 12 / 22); the lane comments
; below are derived from those offsets and the shared pw_ang16_16_20 mask.
; NOTE(review): the `0-4` stack reservation is never touched in this wrapper;
; confirm the helper does not rely on it before removing.
13252 cglobal intra_pred_ang16_19, 3,7,15, 0-4 | |
13253 add r1d, r1d | |
13254 lea r4, [r1 * 3] | |
13255 lea r3, [ang_table_avx2 + 16 * 32] | |
13256 movu xm1, [r2 + 66] ; [ x x x 5 4 x 2 1] | |
13257 movu xm14, [r2 + 76] ; [ x x x 10 9 x 7 6] | |
13258 movu xm2, [r2 + 86] ; [ x x x 15 14 x 12 11] | |
13259 pshufb xm14, [pw_ang16_16_20] ; [ 6 6 7 7 9 9 10 10] | |
13260 pshufb xm1, [pw_ang16_16_20] ; [ 1 1 2 2 4 4 5 5] | |
13261 pshufb xm2, [pw_ang16_16_20] ; [11 11 12 12 14 14 15 15] | |
13262 xor r6d, r6d | |
13263 inc r6d | |
13264 | |
13265 call ang16_mode_17_19 | |
13266 RET | |
13267 | |
; intra_pred_ang16_18: writes 16 rows where each row is the previous one
; shifted by one word, with a new word entering at the low end.
;   r0 = destination, r1 = stride (doubled below, presumably to a byte stride),
;   r2 = neighbour sample buffer, r4 = 3*r1.
;   m1 = 16 words at [r2]               -> row 0
;   m0 = 16 words at [r2 + 2 + 64] rearranged by pw_swap16 (presumably a word
;        reversal - the constant is defined outside this chunk), then its high
;        128 bits replaced by the low 128 bits of m1
;   m3 = copy of the rearranged samples with its 128-bit halves swapped
; palignr works on each 128-bit lane independently, which is why the operands
; are pre-arranged lane by lane before the shifts.
; Two palignr instructions whose results were never read (one overwritten
; before use, one immediately before RET) have been dropped.
13268 cglobal intra_pred_ang16_18, 3,5,4 | |
13269 add r1d, r1d | |
13270 lea r4, [r1 * 3] | |
13271 movu m1, [r2] | |
13272 movu m0, [r2 + 2 + 64] | |
13273 pshufb m0, [pw_swap16] | |
13274 mova m3, m0 | |
13275 vinserti128 m0, m0, xm1, 1 | |
13276 movu [r0], m1 | |
13277 palignr m2, m1, m0, 14 | |
13278 movu [r0 + r1], m2 | |
13279 | |
13280 palignr m2, m1, m0, 12 | |
13281 movu [r0 + r1 * 2], m2 | |
13282 palignr m2, m1, m0, 10 | |
13283 movu [r0 + r4], m2 | |
13284 | |
13285 lea r0, [r0 + r1 * 4] | |
13286 palignr m2, m1, m0, 8 | |
13287 movu [r0], m2 | |
13288 palignr m2, m1, m0, 6 | |
13289 movu [r0 + r1], m2 | |
13290 palignr m2, m1, m0, 4 | |
13291 movu [r0 + r1 * 2], m2 | |
13292 palignr m2, m1, m0, 2 | |
13293 movu [r0 + r4], m2 | |
13294 | |
13295 lea r0, [r0 + r1 * 4] | |
13296 movu [r0], m0 | |
13297 vpermq m3, m3, 01001110b | |
13298 palignr m2, m0, m3, 14 | |
13299 movu [r0 + r1], m2 | |
13300 palignr m2, m0, m3, 12 | |
13301 movu [r0 + r1 * 2], m2 | |
13302 palignr m2, m0, m3, 10 | |
13303 movu [r0 + r4], m2 | |
13305 | |
13306 lea r0, [r0 + r1 * 4] | |
13307 palignr m2, m0, m3, 8 | |
13308 movu [r0], m2 | |
13309 palignr m2, m0, m3, 6 | |
13310 movu [r0 + r1], m2 | |
13311 palignr m2, m0, m3, 4 | |
13312 movu [r0 + r1 * 2], m2 | |
13313 palignr m2, m0, m3, 2 | |
13314 movu [r0 + r4], m2 | |
13316 RET | |
13317 | |
13318 ;------------------------------------------------------------------------------------------------------- | |
13319 ; end of avx2 code for intra_pred_ang16 mode 2 to 34 | |
13320 ;------------------------------------------------------------------------------------------------------- | |
13321 | |
13322 ;------------------------------------------------------------------------------------------------------- | |
13323 ; avx2 code for intra_pred_ang32 mode 2 to 34 start | |
13324 ;------------------------------------------------------------------------------------------------------- | |
13325 INIT_YMM avx2 | |
; intra_pred_ang32_2: 32x32 block where every row is the sample run shifted by
; one more word; row i consists of the 32 words starting at [r2 + 4 + 2*i].
;   r0 = destination, r1 = stride (doubled below, presumably to a byte stride),
;   r2 = neighbour sample buffer, r3m = 4th argument compared against 34:
;        if it equals 34 the original r2 is used, otherwise r2 + 128.
;   r3 is then reused for 3*r1.
; Each row is written as two 32-byte halves ([r0 ...] and [r0 ... + 32]).
; palignr operates on each 128-bit lane separately, so the "next" register of
; every pair is loaded 16 bytes (one lane) after the "current" one; a per-lane
; shift of 2*k bytes then yields the full 16-word window advanced by k words.
; After eight rows the window has advanced a whole lane and the register roles
; swap (m1/m4 become "current", m0/m3 are reloaded 32 bytes further on).
13326 cglobal intra_pred_ang32_2, 3,5,6 | |
13327 lea r4, [r2] | |
13328 add r2, 128 | |
13329 cmp r3m, byte 34 | |
13330 cmove r2, r4 | |
13331 add r1d, r1d | |
13332 lea r3, [r1 * 3] | |
13333 movu m0, [r2 + 4] | |
13334 movu m1, [r2 + 20] | |
13335 movu m3, [r2 + 36] | |
13336 movu m4, [r2 + 52] | |
13337 | |
; rows 0..3
13338 movu [r0], m0 | |
13339 movu [r0 + 32], m3 | |
13340 palignr m2, m1, m0, 2 | |
13341 palignr m5, m4, m3, 2 | |
13342 movu [r0 + r1], m2 | |
13343 movu [r0 + r1 + 32], m5 | |
13344 palignr m2, m1, m0, 4 | |
13345 palignr m5, m4, m3, 4 | |
13346 movu [r0 + r1 * 2], m2 | |
13347 movu [r0 + r1 * 2 + 32], m5 | |
13348 palignr m2, m1, m0, 6 | |
13349 palignr m5, m4, m3, 6 | |
13350 movu [r0 + r3], m2 | |
13351 movu [r0 + r3 + 32], m5 | |
13352 | |
; rows 4..7
13353 lea r0, [r0 + r1 * 4] | |
13354 palignr m2, m1, m0, 8 | |
13355 palignr m5, m4, m3, 8 | |
13356 movu [r0], m2 | |
13357 movu [r0 + 32], m5 | |
13358 palignr m2, m1, m0, 10 | |
13359 palignr m5, m4, m3, 10 | |
13360 movu [r0 + r1], m2 | |
13361 movu [r0 + r1 + 32], m5 | |
13362 palignr m2, m1, m0, 12 | |
13363 palignr m5, m4, m3, 12 | |
13364 movu [r0 + r1 * 2], m2 | |
13365 movu [r0 + r1 * 2 + 32], m5 | |
13366 palignr m2, m1, m0, 14 | |
13367 palignr m5, m4, m3, 14 | |
13368 movu [r0 + r3], m2 | |
13369 movu [r0 + r3 + 32], m5 | |
13370 | |
; rows 8..11: m1/m4 are now the current windows, m0/m3 the next ones
13371 movu m0, [r2 + 36] | |
13372 movu m3, [r2 + 68] | |
13373 lea r0, [r0 + r1 * 4] | |
13374 movu [r0], m1 | |
13375 movu [r0 + 32], m4 | |
13376 palignr m2, m0, m1, 2 | |
13377 palignr m5, m3, m4, 2 | |
13378 movu [r0 + r1], m2 | |
13379 movu [r0 + r1 + 32], m5 | |
13380 palignr m2, m0, m1, 4 | |
13381 palignr m5, m3, m4, 4 | |
13382 movu [r0 + r1 * 2], m2 | |
13383 movu [r0 + r1 * 2 + 32], m5 | |
13384 palignr m2, m0, m1, 6 | |
13385 palignr m5, m3, m4, 6 | |
13386 movu [r0 + r3], m2 | |
13387 movu [r0 + r3 + 32], m5 | |
13388 | |
; rows 12..15
13389 lea r0, [r0 + r1 * 4] | |
13390 palignr m2, m0, m1, 8 | |
13391 palignr m5, m3, m4, 8 | |
13392 movu [r0], m2 | |
13393 movu [r0 + 32], m5 | |
13394 palignr m2, m0, m1, 10 | |
13395 palignr m5, m3, m4, 10 | |
13396 movu [r0 + r1], m2 | |
13397 movu [r0 + r1 + 32], m5 | |
13398 palignr m2, m0, m1, 12 | |
13399 palignr m5, m3, m4, 12 | |
13400 movu [r0 + r1 * 2], m2 | |
13401 movu [r0 + r1 * 2 + 32], m5 | |
13402 palignr m2, m0, m1, 14 | |
13403 palignr m5, m3, m4, 14 | |
13404 movu [r0 + r3], m2 | |
13405 movu [r0 + r3 + 32], m5 | |
13406 | |
; rows 16..19: roles swap back (m0/m3 current, m1/m4 reloaded as next)
13407 lea r0, [r0 + r1 * 4] | |
13408 movu m1, [r2 + 52] | |
13409 movu m4, [r2 + 84] | |
13410 | |
13411 movu [r0], m0 | |
13412 movu [r0 + 32], m3 | |
13413 palignr m2, m1, m0, 2 | |
13414 palignr m5, m4, m3, 2 | |
13415 movu [r0 + r1], m2 | |
13416 movu [r0 + r1 + 32], m5 | |
13417 palignr m2, m1, m0, 4 | |
13418 palignr m5, m4, m3, 4 | |
13419 movu [r0 + r1 * 2], m2 | |
13420 movu [r0 + r1 * 2 + 32], m5 | |
13421 palignr m2, m1, m0, 6 | |
13422 palignr m5, m4, m3, 6 | |
13423 movu [r0 + r3], m2 | |
13424 movu [r0 + r3 + 32], m5 | |
13425 | |
; rows 20..23
13426 lea r0, [r0 + r1 * 4] | |
13427 palignr m2, m1, m0, 8 | |
13428 palignr m5, m4, m3, 8 | |
13429 movu [r0], m2 | |
13430 movu [r0 + 32], m5 | |
13431 palignr m2, m1, m0, 10 | |
13432 palignr m5, m4, m3, 10 | |
13433 movu [r0 + r1], m2 | |
13434 movu [r0 + r1 + 32], m5 | |
13435 palignr m2, m1, m0, 12 | |
13436 palignr m5, m4, m3, 12 | |
13437 movu [r0 + r1 * 2], m2 | |
13438 movu [r0 + r1 * 2 + 32], m5 | |
13439 palignr m2, m1, m0, 14 | |
13440 palignr m5, m4, m3, 14 | |
13441 movu [r0 + r3], m2 | |
13442 movu [r0 + r3 + 32], m5 | |
13443 | |
; rows 24..27: m1/m4 current, m0/m3 reloaded as next
13444 movu m0, [r2 + 68] | |
13445 movu m3, [r2 + 100] | |
13446 lea r0, [r0 + r1 * 4] | |
13447 movu [r0], m1 | |
13448 movu [r0 + 32], m4 | |
13449 palignr m2, m0, m1, 2 | |
13450 palignr m5, m3, m4, 2 | |
13451 movu [r0 + r1], m2 | |
13452 movu [r0 + r1 + 32], m5 | |
13453 palignr m2, m0, m1, 4 | |
13454 palignr m5, m3, m4, 4 | |
13455 movu [r0 + r1 * 2], m2 | |
13456 movu [r0 + r1 * 2 + 32], m5 | |
13457 palignr m2, m0, m1, 6 | |
13458 palignr m5, m3, m4, 6 | |
13459 movu [r0 + r3], m2 | |
13460 movu [r0 + r3 + 32], m5 | |
13461 | |
; rows 28..31
13462 lea r0, [r0 + r1 * 4] | |
13463 palignr m2, m0, m1, 8 | |
13464 palignr m5, m3, m4, 8 | |
13465 movu [r0], m2 | |
13466 movu [r0 + 32], m5 | |
13467 palignr m2, m0, m1, 10 | |
13468 palignr m5, m3, m4, 10 | |
13469 movu [r0 + r1], m2 | |
13470 movu [r0 + r1 + 32], m5 | |
13471 palignr m2, m0, m1, 12 | |
13472 palignr m5, m3, m4, 12 | |
13473 movu [r0 + r1 * 2], m2 | |
13474 movu [r0 + r1 * 2 + 32], m5 | |
13475 palignr m2, m0, m1, 14 | |
13476 palignr m5, m3, m4, 14 | |
13477 movu [r0 + r3], m2 | |
13478 movu [r0 + r3 + 32], m5 | |
13479 RET | |
13480 | |
; intra_pred_ang32_3: builds the 32x32 prediction from four calls to the
; 16x16 helper ang16_mode_3_33 (selector r6d = 0, r2 advanced by 128 bytes).
;   r7 = r0 + 8*r1 is kept so that the third call can start at
;        r7 + 8*r1 = original r0 + 16 rows.
;   call 1: r0,            r2
;   call 2: r0 + 32 bytes, r2 + 26
;   call 3: r0 + 16 rows,  r2 + 32
;   call 4: that  + 32,    r2 + 58
; NOTE(review): this sequence assumes the helper returns with r0, r1, r2 and r7
; unchanged when r6d = 0 - the helper is outside this chunk, confirm there.
; Needs 8 GPRs because of r7.
13481 cglobal intra_pred_ang32_3, 3,8,13 | |
13482 add r2, 128 | |
13483 xor r6d, r6d | |
13484 lea r3, [ang_table_avx2 + 16 * 32] | |
13485 add r1d, r1d | |
13486 lea r4, [r1 * 3] | |
13487 lea r7, [r0 + 8 * r1] | |
13488 | |
13489 call ang16_mode_3_33 | |
13490 | |
13491 add r2, 26 | |
13492 lea r0, [r0 + 32] | |
13493 | |
13494 call ang16_mode_3_33 | |
13495 | |
13496 add r2, 6 | |
13497 lea r0, [r7 + 8 * r1] | |
13498 | |
13499 call ang16_mode_3_33 | |
13500 | |
13501 add r2, 26 | |
13502 lea r0, [r0 + 32] | |
13503 | |
13504 call ang16_mode_3_33 | |
13505 RET | |
13506 | |
; intra_pred_ang32_33: builds the 32x32 prediction from four calls to
; ang16_mode_3_33 with selector r6d = 1 and r2 used as passed in.
;   r5 = r0 + 32 bytes is saved for the second pair of calls.
;   Between calls 1 -> 2 and 3 -> 4 only r2 is advanced (by 26); r0 is not
;   touched here, so the helper presumably leaves r0 pointing at the next
;   16 rows when r6d = 1 - confirm in ang16_mode_3_33.
;   Before call 3: r2 += 6 (32 bytes past the start) and r0 = r5.
; NOTE(review): relies on the helper preserving r5 in this mode.
13507 cglobal intra_pred_ang32_33, 3,7,13 | |
13508 xor r6d, r6d | |
13509 inc r6d | |
13510 lea r3, [ang_table_avx2 + 16 * 32] | |
13511 add r1d, r1d | |
13512 lea r4, [r1 * 3] | |
13513 lea r5, [r0 + 32] | |
13514 | |
13515 call ang16_mode_3_33 | |
13516 | |
13517 add r2, 26 | |
13518 | |
13519 call ang16_mode_3_33 | |
13520 | |
13521 add r2, 6 | |
13522 mov r0, r5 | |
13523 | |
13524 call ang16_mode_3_33 | |
13525 | |
13526 add r2, 26 | |
13527 | |
13528 call ang16_mode_3_33 | |
13529 RET | |
13530 | |
13531 ;; angle 32, modes 4 and 32 | |
; ang32_mode_4_32: shared helper called by intra_pred_ang32_4 / _32 for the
; second 16-sample group of each half (the first group uses ang16_mode_4_32).
; Inputs as used below:
;   r2  = sample pointer (words are read from [r2 + 2] up to [r2 + 34 + 31])
;   r3  = coefficient table base; the callers pass ang_table_avx2 + 18*32, so
;         [r3 + k*32] is table row 18 + k, which is the bracketed number noted
;         on each pmaddwd
;   r6d = selector, tested once at entry
; Every output vector is computed as (pmaddwd(pairs, coeffs) + 16) >> 5 on the
; low and high interleaved halves, then packed back to words with packusdw.
; NOTE(review): no instruction between the `test` and the first
; TRANSPOSE_STORE_AVX2 writes the flags, so the macro (defined outside this
; chunk) presumably branches on them - confirm before reordering anything here.
13532 cglobal ang32_mode_4_32 | |
13533 test r6d, r6d | |
13534 | |
13535 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
13536 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
13537 | |
13538 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
13539 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
13540 | |
13541 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
13542 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
13543 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
13544 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
13545 | |
13546 pmaddwd m4, m3, [r3 - 13 * 32] ; [5] | |
13547 paddd m4, [pd_16] | |
13548 psrld m4, 5 | |
13549 pmaddwd m5, m0, [r3 - 13 * 32] | |
13550 paddd m5, [pd_16] | |
13551 psrld m5, 5 | |
13552 packusdw m4, m5 | |
13553 | |
13554 pmaddwd m5, m3, [r3 + 8 * 32] ; [26] | |
13555 paddd m5, [pd_16] | |
13556 psrld m5, 5 | |
13557 pmaddwd m8, m0, [r3 + 8 * 32] | |
13558 paddd m8, [pd_16] | |
13559 psrld m8, 5 | |
13560 packusdw m5, m8 | |
13561 | |
13562 palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] | |
13563 pmaddwd m6, [r3 - 3 * 32] ; [15] | |
13564 paddd m6, [pd_16] | |
13565 psrld m6, 5 | |
13566 palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] | |
13567 pmaddwd m7, [r3 - 3 * 32] | |
13568 paddd m7, [pd_16] | |
13569 psrld m7, 5 | |
13570 packusdw m6, m7 | |
13571 | |
13572 palignr m8, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] | |
13573 pmaddwd m7, m8, [r3 - 14 * 32] ; [4] | |
13574 paddd m7, [pd_16] | |
13575 psrld m7, 5 | |
13576 palignr m9, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] | |
13577 pmaddwd m10, m9, [r3 - 14 * 32] | |
13578 paddd m10, [pd_16] | |
13579 psrld m10, 5 | |
13580 packusdw m7, m10 | |
13581 | |
13582 pmaddwd m8, [r3 + 7 * 32] ; [25] | |
13583 paddd m8, [pd_16] | |
13584 psrld m8, 5 | |
13585 pmaddwd m9, [r3 + 7 * 32] | |
13586 paddd m9, [pd_16] | |
13587 psrld m9, 5 | |
13588 packusdw m8, m9 | |
13589 | |
13590 palignr m9, m0, m3, 12 | |
13591 pmaddwd m9, [r3 - 4 * 32] ; [14] | |
13592 paddd m9, [pd_16] | |
13593 psrld m9, 5 | |
13594 palignr m3, m2, m0, 12 | |
13595 pmaddwd m3, [r3 - 4 * 32] | |
13596 paddd m3, [pd_16] | |
13597 psrld m3, 5 | |
13598 packusdw m9, m3 | |
13599 | |
13600 pmaddwd m10, m0, [r3 - 15 * 32] ; [3] | |
13601 paddd m10, [pd_16] | |
13602 psrld m10, 5 | |
13603 pmaddwd m3, m2, [r3 - 15 * 32] | |
13604 paddd m3, [pd_16] | |
13605 psrld m3, 5 | |
13606 packusdw m10, m3 | |
13607 | |
13608 pmaddwd m11, m0, [r3 + 6 * 32] ; [24] | |
13609 paddd m11, [pd_16] | |
13610 psrld m11, 5 | |
13611 pmaddwd m3, m2, [r3 + 6 * 32] | |
13612 paddd m3, [pd_16] | |
13613 psrld m3, 5 | |
13614 packusdw m11, m3 | |
13615 | |
; first eight result vectors (m4..m11) are handed to the store macro
13616 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 | |
13617 | |
13618 palignr m4, m2, m0, 4 | |
13619 pmaddwd m4, [r3 - 5* 32] ; [13] | |
13620 paddd m4, [pd_16] | |
13621 psrld m4, 5 | |
13622 palignr m5, m1, m2, 4 | |
13623 pmaddwd m5, [r3 - 5 * 32] | |
13624 paddd m5, [pd_16] | |
13625 psrld m5, 5 | |
13626 packusdw m4, m5 | |
13627 | |
13628 palignr m6, m2, m0, 8 | |
13629 pmaddwd m5, m6, [r3 - 16 * 32] ; [2] | |
13630 paddd m5, [pd_16] | |
13631 psrld m5, 5 | |
13632 palignr m7, m1, m2, 8 | |
13633 pmaddwd m8, m7, [r3 - 16 * 32] | |
13634 paddd m8, [pd_16] | |
13635 psrld m8, 5 | |
13636 packusdw m5, m8 | |
13637 | |
13638 pmaddwd m6, [r3 + 5 * 32] ; [23] | |
13639 paddd m6, [pd_16] | |
13640 psrld m6, 5 | |
13641 pmaddwd m7, [r3 + 5 * 32] | |
13642 paddd m7, [pd_16] | |
13643 psrld m7, 5 | |
13644 packusdw m6, m7 | |
13645 | |
13646 palignr m7, m2, m0, 12 | |
13647 pmaddwd m7, [r3 - 6 * 32] ; [12] | |
13648 paddd m7, [pd_16] | |
13649 psrld m7, 5 | |
13650 palignr m8, m1, m2, 12 | |
13651 pmaddwd m8, [r3 - 6 * 32] | |
13652 paddd m8, [pd_16] | |
13653 psrld m8, 5 | |
13654 packusdw m7, m8 | |
13655 | |
13656 movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
13657 pmaddwd m8, m2, [r3 - 17 * 32] ; [1] | |
13658 paddd m8, [pd_16] | |
13659 psrld m8, 5 | |
13660 pmaddwd m9, m1, [r3 - 17 * 32] | |
13661 paddd m9, [pd_16] | |
13662 psrld m9, 5 | |
13663 packusdw m8, m9 | |
13664 | |
13665 palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18] | |
13666 punpcklwd m0, m3 ; [29 28 28 27 27 26 26 25 21 20 20 19 19 18 18 17] | |
13667 | |
13668 pmaddwd m9, m2, [r3 + 4 * 32] ; [22] | |
13669 paddd m9, [pd_16] | |
13670 psrld m9, 5 | |
13671 pmaddwd m3, m1, [r3 + 4 * 32] | |
13672 paddd m3, [pd_16] | |
13673 psrld m3, 5 | |
13674 packusdw m9, m3 | |
13675 | |
13676 palignr m10, m1, m2, 4 | |
13677 pmaddwd m10, [r3 - 7 * 32] ; [11] | |
13678 paddd m10, [pd_16] | |
13679 psrld m10, 5 | |
13680 palignr m11, m0, m1, 4 | |
13681 pmaddwd m11, [r3 - 7 * 32] | |
13682 paddd m11, [pd_16] | |
13683 psrld m11, 5 | |
13684 packusdw m10, m11 | |
13685 | |
13686 palignr m3, m1, m2, 8 | |
13687 pmaddwd m3, [r3 - 18 * 32] ; [0] | |
13688 paddd m3, [pd_16] | |
13689 psrld m3, 5 | |
13690 palignr m0, m1, 8 | |
13691 pmaddwd m0, [r3 - 18 * 32] | |
13692 paddd m0, [pd_16] | |
13693 psrld m0, 5 | |
13694 packusdw m3, m0 | |
; second eight result vectors; note the eighth one lives in m3, not m11
13695 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16 | |
13696 ret | |
13697 | |
; intra_pred_ang32_4: builds the 32x32 prediction from two pairs of helper
; calls (ang16_mode_4_32 followed by ang32_mode_4_32), with selector r6d = 0,
; r2 advanced by 128 bytes and table base ang_table_avx2 + 18*32.
;   r7 = r0 + 8*r1 is kept so the third call can start at
;        r7 + 8*r1 = original r0 + 16 rows.
;   call 1 (ang16): r0,            r2
;   call 2 (ang32): r0 + 32 bytes, r2 + 22
;   call 3 (ang16): r0 + 16 rows,  r2 + 32
;   call 4 (ang32): that  + 32,    r2 + 54
; NOTE(review): assumes both helpers return with r0, r1, r2 and r7 unchanged
; when r6d = 0 - not verifiable from this chunk for ang16_mode_4_32.
13698 cglobal intra_pred_ang32_4, 3,8,13 | |
13699 add r2, 128 | |
13700 xor r6d, r6d | |
13701 lea r3, [ang_table_avx2 + 18 * 32] | |
13702 add r1d, r1d | |
13703 lea r4, [r1 * 3] | |
13704 lea r7, [r0 + 8 * r1] | |
13705 | |
13706 call ang16_mode_4_32 | |
13707 | |
13708 add r2, 22 | |
13709 lea r0, [r0 + 32] | |
13710 | |
13711 call ang32_mode_4_32 | |
13712 | |
13713 add r2, 10 | |
13714 lea r0, [r7 + 8 * r1] | |
13715 | |
13716 call ang16_mode_4_32 | |
13717 | |
13718 add r2, 22 | |
13719 lea r0, [r0 + 32] | |
13720 | |
13721 call ang32_mode_4_32 | |
13722 RET | |
13723 | |
; intra_pred_ang32_32: builds the 32x32 prediction from two pairs of helper
; calls (ang16_mode_4_32 followed by ang32_mode_4_32) with selector r6d = 1,
; r2 used as passed in and table base ang_table_avx2 + 18*32.
;   r5 = r0 + 32 bytes is saved for the second pair.
;   Within a pair only r2 is advanced (by 22); r0 is not touched here, so the
;   helpers presumably leave r0 at the next 16 rows when r6d = 1 - confirm.
;   Before the second pair: r2 += 10 (32 bytes past the start) and r0 = r5.
; NOTE(review): relies on the helpers preserving r5 in this mode.
13724 cglobal intra_pred_ang32_32, 3,7,13 | |
13725 xor r6d, r6d | |
13726 inc r6d | |
13727 lea r3, [ang_table_avx2 + 18 * 32] | |
13728 add r1d, r1d | |
13729 lea r4, [r1 * 3] | |
13730 lea r5, [r0 + 32] | |
13731 | |
13732 call ang16_mode_4_32 | |
13733 | |
13734 add r2, 22 | |
13735 | |
13736 call ang32_mode_4_32 | |
13737 | |
13738 add r2, 10 | |
13739 mov r0, r5 | |
13740 | |
13741 call ang16_mode_4_32 | |
13742 | |
13743 add r2, 22 | |
13744 | |
13745 call ang32_mode_4_32 | |
13746 RET | |
13747 | |
13748 ;; angle 32, modes 5 and 31 | |
; Helper called by intra_pred_ang32_5 and intra_pred_ang32_31; returns with a plain "ret". | |
; Inputs as prepared by those callers: r2 = reference pointer, r3 = ang_table_avx2 + 16 * 32, | |
; r6d = 0 (mode 5) or 1 (mode 31). | |
; Builds 16 result vectors in two groups of eight; each group is passed to | |
; TRANSPOSE_STORE_AVX2 (macro defined outside this chunk). | |
; Every result uses the same recipe: pmaddwd of interleaved neighbouring samples with one | |
; table row, add [pd_16], shift right by 5, then packusdw to merge the two halves. | |
; The bracketed number beside each first pmaddwd is the table row addressed (16 + offset). | |
13749 cglobal ang32_mode_5_31 | |
; The SIMD instructions below do not modify EFLAGS, so the result of this test is still | |
; available at the first TRANSPOSE_STORE_AVX2; presumably the macro branches on it -- TODO confirm. | |
13750 test r6d, r6d | |
13751 | |
; Load two pairs of overlapping 16-word windows and interleave each pair so that every | |
; dword holds two adjacent samples (see the index maps in the comments). | |
13752 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
13753 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
13754 | |
13755 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
13756 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
13757 | |
13758 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
13759 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
13760 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
13761 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
13762 | |
; First group of eight results (m4..m11). palignr by 4/8/12 bytes moves the sample-pair | |
; window on by one/two/three pairs. | |
13763 pmaddwd m4, m3, [r3 - 15 * 32] ; [1] | |
13764 paddd m4, [pd_16] | |
13765 psrld m4, 5 | |
13766 pmaddwd m5, m0, [r3 - 15 * 32] | |
13767 paddd m5, [pd_16] | |
13768 psrld m5, 5 | |
13769 packusdw m4, m5 | |
13770 | |
13771 pmaddwd m5, m3, [r3 + 2 * 32] ; [18] | |
13772 paddd m5, [pd_16] | |
13773 psrld m5, 5 | |
13774 pmaddwd m8, m0, [r3 + 2 * 32] | |
13775 paddd m8, [pd_16] | |
13776 psrld m8, 5 | |
13777 packusdw m5, m8 | |
13778 | |
13779 palignr m7, m0, m3, 4 | |
13780 pmaddwd m6, m7, [r3 - 13 * 32] ; [3] | |
13781 paddd m6, [pd_16] | |
13782 psrld m6, 5 | |
13783 palignr m8, m2, m0, 4 | |
13784 pmaddwd m9, m8, [r3 - 13 * 32] | |
13785 paddd m9, [pd_16] | |
13786 psrld m9, 5 | |
13787 packusdw m6, m9 | |
13788 | |
13789 pmaddwd m7, [r3 + 4 * 32] ; [20] | |
13790 paddd m7, [pd_16] | |
13791 psrld m7, 5 | |
13792 pmaddwd m8, [r3 + 4 * 32] | |
13793 paddd m8, [pd_16] | |
13794 psrld m8, 5 | |
13795 packusdw m7, m8 | |
13796 | |
13797 palignr m9, m0, m3, 8 | |
13798 pmaddwd m8, m9, [r3 - 11 * 32] ; [5] | |
13799 paddd m8, [pd_16] | |
13800 psrld m8, 5 | |
13801 palignr m10, m2, m0, 8 | |
13802 pmaddwd m11, m10, [r3 - 11 * 32] | |
13803 paddd m11, [pd_16] | |
13804 psrld m11, 5 | |
13805 packusdw m8, m11 | |
13806 | |
13807 pmaddwd m9, [r3 + 6 * 32] ; [22] | |
13808 paddd m9, [pd_16] | |
13809 psrld m9, 5 | |
13810 pmaddwd m10, [r3 + 6 * 32] | |
13811 paddd m10, [pd_16] | |
13812 psrld m10, 5 | |
13813 packusdw m9, m10 | |
13814 | |
; m3 is read for the last time by the next palignr; it is reused as a temporary right after. | |
13815 palignr m11, m0, m3, 12 | |
13816 pmaddwd m10, m11, [r3 - 9 * 32] ; [7] | |
13817 paddd m10, [pd_16] | |
13818 psrld m10, 5 | |
13819 palignr m12, m2, m0, 12 | |
13820 pmaddwd m3, m12, [r3 - 9 * 32] | |
13821 paddd m3, [pd_16] | |
13822 psrld m3, 5 | |
13823 packusdw m10, m3 | |
13824 | |
13825 pmaddwd m11, [r3 + 8 * 32] ; [24] | |
13826 paddd m11, [pd_16] | |
13827 psrld m11, 5 | |
13828 pmaddwd m12, [r3 + 8 * 32] | |
13829 paddd m12, [pd_16] | |
13830 psrld m12, 5 | |
13831 packusdw m11, m12 | |
13832 | |
; Hand the first eight results to the store macro. m12 and m3 hold dead values at this point, | |
; so they are presumably scratch registers for the macro; the trailing 0 (16 for the second | |
; group) looks like a destination offset -- macro not visible, TODO confirm. | |
13833 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 | |
13834 | |
; Second group of eight results, built from m0, m2 and m1 only. | |
13835 pmaddwd m4, m0, [r3 - 7 * 32] ; [9] | |
13836 paddd m4, [pd_16] | |
13837 psrld m4, 5 | |
13838 pmaddwd m5, m2, [r3 - 7 * 32] | |
13839 paddd m5, [pd_16] | |
13840 psrld m5, 5 | |
13841 packusdw m4, m5 | |
13842 | |
13843 pmaddwd m5, m0, [r3 + 10 * 32] ; [26] | |
13844 paddd m5, [pd_16] | |
13845 psrld m5, 5 | |
13846 pmaddwd m3, m2, [r3 + 10 * 32] | |
13847 paddd m3, [pd_16] | |
13848 psrld m3, 5 | |
13849 packusdw m5, m3 | |
13850 | |
13851 palignr m7, m2, m0, 4 | |
13852 pmaddwd m6, m7, [r3 - 5 * 32] ; [11] | |
13853 paddd m6, [pd_16] | |
13854 psrld m6, 5 | |
13855 palignr m8, m1, m2, 4 | |
13856 pmaddwd m9, m8, [r3 - 5 * 32] | |
13857 paddd m9, [pd_16] | |
13858 psrld m9, 5 | |
13859 packusdw m6, m9 | |
13860 | |
13861 pmaddwd m7, [r3 + 12 * 32] ; [28] | |
13862 paddd m7, [pd_16] | |
13863 psrld m7, 5 | |
13864 pmaddwd m8, [r3 + 12 * 32] | |
13865 paddd m8, [pd_16] | |
13866 psrld m8, 5 | |
13867 packusdw m7, m8 | |
13868 | |
13869 palignr m9, m2, m0, 8 | |
13870 pmaddwd m8, m9, [r3 - 3 * 32] ; [13] | |
13871 paddd m8, [pd_16] | |
13872 psrld m8, 5 | |
13873 palignr m3, m1, m2, 8 | |
13874 pmaddwd m10, m3, [r3 - 3 * 32] | |
13875 paddd m10, [pd_16] | |
13876 psrld m10, 5 | |
13877 packusdw m8, m10 | |
13878 | |
13879 pmaddwd m9, [r3 + 14 * 32] ; [30] | |
13880 paddd m9, [pd_16] | |
13881 psrld m9, 5 | |
13882 pmaddwd m3, [r3 + 14 * 32] | |
13883 paddd m3, [pd_16] | |
13884 psrld m3, 5 | |
13885 packusdw m9, m3 | |
13886 | |
13887 palignr m10, m2, m0, 12 | |
13888 pmaddwd m10, [r3 - 1 * 32] ; [15] | |
13889 paddd m10, [pd_16] | |
13890 psrld m10, 5 | |
13891 palignr m11, m1, m2, 12 | |
13892 pmaddwd m11, [r3 - 1 * 32] | |
13893 paddd m11, [pd_16] | |
13894 psrld m11, 5 | |
13895 packusdw m10, m11 | |
13896 | |
; Last result is computed in place in m2/m1 with table row 0; the sources are no longer needed. | |
13897 pmaddwd m2, [r3 - 16 * 32] ; [0] | |
13898 paddd m2, [pd_16] | |
13899 psrld m2, 5 | |
13900 pmaddwd m1, [r3 - 16 * 32] | |
13901 paddd m1, [pd_16] | |
13902 psrld m1, 5 | |
13903 packusdw m2, m1 | |
13904 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16 | |
13905 ret | |
13906 | |
; intra_pred_ang32_5: 32x32 angular intra prediction, mode 5, assembled from four helper calls. | |
; cglobal 3,8,13 = 3 arguments, 8 general-purpose and 13 vector registers (x86inc convention). | |
; r0 looks like the destination, r1 its stride and r2 the reference samples -- inferred from | |
; register use; TODO confirm against the C prototype. | |
13907 cglobal intra_pred_ang32_5, 3,8,13 | |
; Set-up: r2 += 128 bytes, r6d = 0 (tested by ang32_mode_5_31 with "test r6d, r6d"), | |
; r3 = row 16 of ang_table_avx2, r1 doubled (presumably pixel-to-byte stride), | |
; r4 = 3 * r1, r7 = r0 + 8 * r1. | |
13908 add r2, 128 | |
13909 xor r6d, r6d | |
13910 lea r3, [ang_table_avx2 + 16 * 32] | |
13911 add r1d, r1d | |
13912 lea r4, [r1 * 3] | |
13913 lea r7, [r0 + 8 * r1] | |
13914 | |
13915 call ang16_mode_5_31 | |
13916 | |
; Second helper: reference pointer +18 bytes, destination +32 bytes. | |
13917 add r2, 18 | |
13918 lea r0, [r0 + 32] | |
13919 | |
13920 call ang32_mode_5_31 | |
13921 | |
; Third helper: r2 is now 32 bytes past its value at the first call; r0 = r7 + 8 * r1, i.e. | |
; the entry r0 + 16 * r1 provided the helpers leave r1 and r7 untouched (TODO confirm). | |
13922 add r2, 14 | |
13923 lea r0, [r7 + 8 * r1] | |
13924 | |
13925 call ang16_mode_5_31 | |
13926 | |
13927 add r2, 18 | |
13928 lea r0, [r0 + 32] | |
13929 | |
13930 call ang32_mode_5_31 | |
13931 RET | |
13932 | |
; intra_pred_ang32_31: 32x32 angular intra prediction, mode 31; same helpers as mode 5 but | |
; with r6d = 1 and without the initial 128-byte offset on r2. | |
; cglobal 3,7,13 = 3 arguments, 7 general-purpose and 13 vector registers (x86inc convention). | |
13933 cglobal intra_pred_ang32_31, 3,7,13 | |
; Set-up: r6d = 1, r3 = row 16 of ang_table_avx2, r1 doubled (presumably pixel-to-byte | |
; stride), r4 = 3 * r1, r5 = r0 + 32 bytes (destination of the third call). | |
13934 xor r6d, r6d | |
13935 inc r6d | |
13936 lea r3, [ang_table_avx2 + 16 * 32] | |
13937 add r1d, r1d | |
13938 lea r4, [r1 * 3] | |
13939 lea r5, [r0 + 32] | |
13940 | |
13941 call ang16_mode_5_31 | |
13942 | |
; r0 is not adjusted between the first two calls; the second helper continues from wherever | |
; the first one left r0 (TODO confirm the helper advances it). | |
13943 add r2, 18 | |
13944 | |
13945 call ang32_mode_5_31 | |
13946 | |
; r2 is now 32 bytes past its entry value; restart the destination at the saved r5. | |
13947 add r2, 14 | |
13948 mov r0, r5 | |
13949 | |
13950 call ang16_mode_5_31 | |
13951 | |
13952 add r2, 18 | |
13953 | |
13954 call ang32_mode_5_31 | |
13955 RET | |
13956 | |
13957 ;; angle 32, modes 6 and 30 | |
; Helper called by intra_pred_ang32_6 and intra_pred_ang32_30; returns with a plain "ret". | |
; Inputs as prepared by those callers: r2 = reference pointer, r3 = ang_table_avx2 + 15 * 32, | |
; r6d = 0 (mode 6) or 1 (mode 30). | |
; Builds 16 result vectors in two groups of eight; each group is passed to | |
; TRANSPOSE_STORE_AVX2 (macro defined outside this chunk). | |
; Every result uses the same recipe: pmaddwd of interleaved neighbouring samples with one | |
; table row, add [pd_16], shift right by 5, then packusdw to merge the two halves. | |
; The bracketed number beside each first pmaddwd is the table row addressed (15 + offset). | |
; Uses m13 as scratch, which is why the callers request 14 vector registers. | |
13958 cglobal ang32_mode_6_30 | |
; The SIMD instructions below do not modify EFLAGS, so the result of this test is still | |
; available at the first TRANSPOSE_STORE_AVX2; presumably the macro branches on it -- TODO confirm. | |
13959 test r6d, r6d | |
13960 | |
; Load two pairs of overlapping 16-word windows and interleave each pair so that every | |
; dword holds two adjacent samples (see the index maps in the comments). | |
13961 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
13962 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
13963 | |
13964 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
13965 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
13966 | |
13967 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
13968 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
13969 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
13970 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
13971 | |
; First group of eight results (m4..m11). palignr by 4/8/12 bytes moves the sample-pair | |
; window on by one/two/three pairs. | |
13972 pmaddwd m4, m3, [r3 + 14 * 32] ; [29] | |
13973 paddd m4, [pd_16] | |
13974 psrld m4, 5 | |
13975 pmaddwd m5, m0, [r3 + 14 * 32] | |
13976 paddd m5, [pd_16] | |
13977 psrld m5, 5 | |
13978 packusdw m4, m5 | |
13979 | |
13980 palignr m6, m0, m3, 4 | |
13981 pmaddwd m5, m6, [r3 - 5 * 32] ; [10] | |
13982 paddd m5, [pd_16] | |
13983 psrld m5, 5 | |
13984 palignr m7, m2, m0, 4 | |
13985 pmaddwd m8, m7, [r3 - 5 * 32] | |
13986 paddd m8, [pd_16] | |
13987 psrld m8, 5 | |
13988 packusdw m5, m8 | |
13989 | |
13990 pmaddwd m6, [r3 + 8 * 32] ; [23] | |
13991 paddd m6, [pd_16] | |
13992 psrld m6, 5 | |
13993 pmaddwd m7, [r3 + 8 * 32] | |
13994 paddd m7, [pd_16] | |
13995 psrld m7, 5 | |
13996 packusdw m6, m7 | |
13997 | |
; The window held in m9/m12 is reused for three table rows: 4, 17 and 30. | |
13998 palignr m9, m0, m3, 8 | |
13999 pmaddwd m7, m9, [r3 - 11 * 32] ; [4] | |
14000 paddd m7, [pd_16] | |
14001 psrld m7, 5 | |
14002 palignr m12, m2, m0, 8 | |
14003 pmaddwd m11, m12, [r3 - 11 * 32] | |
14004 paddd m11, [pd_16] | |
14005 psrld m11, 5 | |
14006 packusdw m7, m11 | |
14007 | |
14008 pmaddwd m8, m9, [r3 + 2 * 32] ; [17] | |
14009 paddd m8, [pd_16] | |
14010 psrld m8, 5 | |
14011 pmaddwd m10, m12, [r3 + 2 * 32] | |
14012 paddd m10, [pd_16] | |
14013 psrld m10, 5 | |
14014 packusdw m8, m10 | |
14015 | |
14016 pmaddwd m9, [r3 + 15 * 32] ; [30] | |
14017 paddd m9, [pd_16] | |
14018 psrld m9, 5 | |
14019 pmaddwd m12, [r3 + 15 * 32] | |
14020 paddd m12, [pd_16] | |
14021 psrld m12, 5 | |
14022 packusdw m9, m12 | |
14023 | |
; m3 is read for the last time by the next palignr; it is reused as a temporary right after. | |
14024 palignr m11, m0, m3, 12 | |
14025 pmaddwd m10, m11, [r3 - 4 * 32] ; [11] | |
14026 paddd m10, [pd_16] | |
14027 psrld m10, 5 | |
14028 palignr m12, m2, m0, 12 | |
14029 pmaddwd m3, m12, [r3 - 4 * 32] | |
14030 paddd m3, [pd_16] | |
14031 psrld m3, 5 | |
14032 packusdw m10, m3 | |
14033 | |
14034 pmaddwd m11, [r3 + 9 * 32] ; [24] | |
14035 paddd m11, [pd_16] | |
14036 psrld m11, 5 | |
14037 pmaddwd m12, [r3 + 9 * 32] | |
14038 paddd m12, [pd_16] | |
14039 psrld m12, 5 | |
14040 packusdw m11, m12 | |
14041 | |
; Hand the first eight results to the store macro. m12 holds a dead value and m13 is unused | |
; so far, so both are presumably scratch for the macro; the trailing 0 (16 for the second | |
; group) looks like a destination offset -- macro not visible, TODO confirm. | |
14042 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
14043 | |
; Second group of eight results, built from m0, m2 and m1 only; m3 serves as a temporary. | |
14044 pmaddwd m4, m0, [r3 - 10 * 32] ; [5] | |
14045 paddd m4, [pd_16] | |
14046 psrld m4, 5 | |
14047 pmaddwd m5, m2, [r3 - 10 * 32] | |
14048 paddd m5, [pd_16] | |
14049 psrld m5, 5 | |
14050 packusdw m4, m5 | |
14051 | |
14052 pmaddwd m5, m0, [r3 + 3 * 32] ; [18] | |
14053 paddd m5, [pd_16] | |
14054 psrld m5, 5 | |
14055 pmaddwd m3, m2, [r3 + 3 * 32] | |
14056 paddd m3, [pd_16] | |
14057 psrld m3, 5 | |
14058 packusdw m5, m3 | |
14059 | |
14060 pmaddwd m6, m0, [r3 + 16 * 32] ; [31] | |
14061 paddd m6, [pd_16] | |
14062 psrld m6, 5 | |
14063 pmaddwd m7, m2, [r3 + 16 * 32] | |
14064 paddd m7, [pd_16] | |
14065 psrld m7, 5 | |
14066 packusdw m6, m7 | |
14067 | |
14068 palignr m8, m2, m0, 4 | |
14069 pmaddwd m7, m8, [r3 - 3 * 32] ; [12] | |
14070 paddd m7, [pd_16] | |
14071 psrld m7, 5 | |
14072 palignr m9, m1, m2, 4 | |
14073 pmaddwd m3, m9, [r3 - 3 * 32] | |
14074 paddd m3, [pd_16] | |
14075 psrld m3, 5 | |
14076 packusdw m7, m3 | |
14077 | |
14078 pmaddwd m8, [r3 + 10 * 32] ; [25] | |
14079 paddd m8, [pd_16] | |
14080 psrld m8, 5 | |
14081 pmaddwd m9, [r3 + 10 * 32] | |
14082 paddd m9, [pd_16] | |
14083 psrld m9, 5 | |
14084 packusdw m8, m9 | |
14085 | |
14086 palignr m10, m2, m0, 8 | |
14087 pmaddwd m9, m10, [r3 - 9 * 32] ; [6] | |
14088 paddd m9, [pd_16] | |
14089 psrld m9, 5 | |
14090 palignr m12, m1, m2, 8 | |
14091 pmaddwd m3, m12, [r3 - 9 * 32] | |
14092 paddd m3, [pd_16] | |
14093 psrld m3, 5 | |
14094 packusdw m9, m3 | |
14095 | |
14096 pmaddwd m10, [r3 + 4 * 32] ; [19] | |
14097 paddd m10, [pd_16] | |
14098 psrld m10, 5 | |
14099 pmaddwd m12, [r3 + 4 * 32] | |
14100 paddd m12, [pd_16] | |
14101 psrld m12, 5 | |
14102 packusdw m10, m12 | |
14103 | |
14104 palignr m11, m2, m0, 12 | |
14105 pmaddwd m11, [r3 - 15 * 32] ; [0] | |
14106 paddd m11, [pd_16] | |
14107 psrld m11, 5 | |
14108 palignr m3, m1, m2, 12 | |
14109 pmaddwd m3, [r3 - 15 * 32] | |
14110 paddd m3, [pd_16] | |
14111 psrld m3, 5 | |
14112 packusdw m11, m3 | |
14113 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16 | |
14114 ret | |
14115 | |
; intra_pred_ang32_6: 32x32 angular intra prediction, mode 6, assembled from four helper calls. | |
; cglobal 3,8,14 = 3 arguments, 8 general-purpose and 14 vector registers (x86inc convention); | |
; ang32_mode_6_30 uses m13. | |
; r0 looks like the destination, r1 its stride and r2 the reference samples -- inferred from | |
; register use; TODO confirm against the C prototype. | |
14116 cglobal intra_pred_ang32_6, 3,8,14 | |
; Set-up: r2 += 128 bytes, r6d = 0 (tested by ang32_mode_6_30 with "test r6d, r6d"), | |
; r3 = row 15 of ang_table_avx2, r1 doubled (presumably pixel-to-byte stride), | |
; r4 = 3 * r1, r7 = r0 + 8 * r1. | |
14117 add r2, 128 | |
14118 xor r6d, r6d | |
14119 lea r3, [ang_table_avx2 + 15 * 32] | |
14120 add r1d, r1d | |
14121 lea r4, [r1 * 3] | |
14122 lea r7, [r0 + 8 * r1] | |
14123 | |
14124 call ang16_mode_6_30 | |
14125 | |
; Second helper: reference pointer +12 bytes, destination +32 bytes. | |
14126 add r2, 12 | |
14127 lea r0, [r0 + 32] | |
14128 | |
14129 call ang32_mode_6_30 | |
14130 | |
; Third helper: r2 is now 32 bytes past its value at the first call; r0 = r7 + 8 * r1, i.e. | |
; the entry r0 + 16 * r1 provided the helpers leave r1 and r7 untouched (TODO confirm). | |
14131 add r2, 20 | |
14132 lea r0, [r7 + 8 * r1] | |
14133 | |
14134 call ang16_mode_6_30 | |
14135 | |
14136 add r2, 12 | |
14137 lea r0, [r0 + 32] | |
14138 | |
14139 call ang32_mode_6_30 | |
14140 RET | |
14141 | |
; intra_pred_ang32_30: 32x32 angular intra prediction, mode 30; same helpers as mode 6 but | |
; with r6d = 1 and without the initial 128-byte offset on r2. | |
; cglobal 3,7,14 = 3 arguments, 7 general-purpose and 14 vector registers (x86inc convention). | |
14142 cglobal intra_pred_ang32_30, 3,7,14 | |
; Set-up: r6d = 1, r3 = row 15 of ang_table_avx2, r1 doubled (presumably pixel-to-byte | |
; stride), r4 = 3 * r1, r5 = r0 + 32 bytes (destination of the third call). | |
14143 xor r6d, r6d | |
14144 inc r6d | |
14145 lea r3, [ang_table_avx2 + 15 * 32] | |
14146 add r1d, r1d | |
14147 lea r4, [r1 * 3] | |
14148 lea r5, [r0 + 32] | |
14149 | |
14150 call ang16_mode_6_30 | |
14151 | |
; r0 is not adjusted between the first two calls; the second helper continues from wherever | |
; the first one left r0 (TODO confirm the helper advances it). | |
14152 add r2, 12 | |
14153 | |
14154 call ang32_mode_6_30 | |
14155 | |
; r2 is now 32 bytes past its entry value; restart the destination at the saved r5. | |
14156 add r2, 20 | |
14157 mov r0, r5 | |
14158 | |
14159 call ang16_mode_6_30 | |
14160 | |
14161 add r2, 12 | |
14162 | |
14163 call ang32_mode_6_30 | |
14164 RET | |
14165 | |
14166 ;; angle 32, modes 7 and 29 | |
; Helper called by intra_pred_ang32_7 and intra_pred_ang32_29; returns with a plain "ret". | |
; Inputs as prepared by those callers: r2 = reference pointer, r3 = ang_table_avx2 + 17 * 32, | |
; r6d = 0 (mode 7) or 1 (mode 29). | |
; Builds 16 result vectors in two groups of eight; each group is passed to | |
; TRANSPOSE_STORE_AVX2 (macro defined outside this chunk). | |
; Every result uses the same recipe: pmaddwd of interleaved neighbouring samples with one | |
; table row, add [pd_16], shift right by 5, then packusdw to merge the two halves. | |
; The bracketed number beside each first pmaddwd is the table row addressed (17 + offset). | |
; Uses m13 as scratch, which is why the callers request 14 vector registers. | |
14167 cglobal ang32_mode_7_29 | |
; The SIMD instructions below do not modify EFLAGS, so the result of this test is still | |
; available at the first TRANSPOSE_STORE_AVX2; presumably the macro branches on it -- TODO confirm. | |
14168 test r6d, r6d | |
14169 | |
; Load two pairs of overlapping 16-word windows and interleave each pair so that every | |
; dword holds two adjacent samples (see the index maps in the comments). | |
14170 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
14171 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
14172 | |
14173 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
14174 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
14175 | |
14176 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
14177 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
14178 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
14179 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] | |
14180 | |
; First group of eight results (m4..m11). | |
14181 pmaddwd m4, m3, [r3 + 8 * 32] ; [25] | |
14182 paddd m4, [pd_16] | |
14183 psrld m4, 5 | |
14184 pmaddwd m5, m0, [r3 + 8 * 32] | |
14185 paddd m5, [pd_16] | |
14186 psrld m5, 5 | |
14187 packusdw m4, m5 | |
14188 | |
; The window advanced by one pair (m8/m9) is reused for table rows 2, 11, 20 and 29. | |
14189 palignr m8, m0, m3, 4 | |
14190 pmaddwd m5, m8, [r3 - 15 * 32] ; [2] | |
14191 paddd m5, [pd_16] | |
14192 psrld m5, 5 | |
14193 palignr m9, m2, m0, 4 | |
14194 pmaddwd m10, m9, [r3 - 15 * 32] | |
14195 paddd m10, [pd_16] | |
14196 psrld m10, 5 | |
14197 packusdw m5, m10 | |
14198 | |
14199 pmaddwd m6, m8, [r3 - 6 * 32] ; [11] | |
14200 paddd m6, [pd_16] | |
14201 psrld m6, 5 | |
14202 pmaddwd m7, m9, [r3 - 6 * 32] | |
14203 paddd m7, [pd_16] | |
14204 psrld m7, 5 | |
14205 packusdw m6, m7 | |
14206 | |
14207 pmaddwd m7, m8, [r3 + 3 * 32] ; [20] | |
14208 paddd m7, [pd_16] | |
14209 psrld m7, 5 | |
14210 pmaddwd m10, m9, [r3 + 3 * 32] | |
14211 paddd m10, [pd_16] | |
14212 psrld m10, 5 | |
14213 packusdw m7, m10 | |
14214 | |
14215 pmaddwd m8, [r3 + 12 * 32] ; [29] | |
14216 paddd m8, [pd_16] | |
14217 psrld m8, 5 | |
14218 pmaddwd m9, [r3 + 12 * 32] | |
14219 paddd m9, [pd_16] | |
14220 psrld m9, 5 | |
14221 packusdw m8, m9 | |
14222 | |
; The window advanced by two pairs (m11/m12) is reused for table rows 6, 15 and 24. | |
14223 palignr m11, m0, m3, 8 | |
14224 pmaddwd m9, m11, [r3 - 11 * 32] ; [6] | |
14225 paddd m9, [pd_16] | |
14226 psrld m9, 5 | |
14227 palignr m12, m2, m0, 8 | |
14228 pmaddwd m10, m12, [r3 - 11 * 32] | |
14229 paddd m10, [pd_16] | |
14230 psrld m10, 5 | |
14231 packusdw m9, m10 | |
14232 | |
14233 pmaddwd m10, m11, [r3 - 2 * 32] ; [15] | |
14234 paddd m10, [pd_16] | |
14235 psrld m10, 5 | |
14236 pmaddwd m13, m12, [r3 - 2 * 32] | |
14237 paddd m13, [pd_16] | |
14238 psrld m13, 5 | |
14239 packusdw m10, m13 | |
14240 | |
14241 pmaddwd m11, [r3 + 7 * 32] ; [24] | |
14242 paddd m11, [pd_16] | |
14243 psrld m11, 5 | |
14244 pmaddwd m12, [r3 + 7 * 32] | |
14245 paddd m12, [pd_16] | |
14246 psrld m12, 5 | |
14247 packusdw m11, m12 | |
14248 | |
; Hand the first eight results to the store macro. m12 and m13 hold dead values at this point, | |
; so they are presumably scratch registers for the macro; the trailing 0 (16 for the second | |
; group) looks like a destination offset -- macro not visible, TODO confirm. | |
14249 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
14250 | |
; Second group. m3 is still the original low interleave here (it has not been written since | |
; the loads), so the window advanced by three pairs can be formed from it. | |
14251 palignr m5, m0, m3, 12 | |
14252 pmaddwd m4, m5, [r3 - 16 * 32] ; [1] | |
14253 paddd m4, [pd_16] | |
14254 psrld m4, 5 | |
14255 palignr m6, m2, m0, 12 | |
14256 pmaddwd m7, m6, [r3 - 16 * 32] | |
14257 paddd m7, [pd_16] | |
14258 psrld m7, 5 | |
14259 packusdw m4, m7 | |
14260 | |
14261 pmaddwd m5, [r3 - 7 * 32] ; [10] | |
14262 paddd m5, [pd_16] | |
14263 psrld m5, 5 | |
14264 pmaddwd m6, [r3 - 7 * 32] | |
14265 paddd m6, [pd_16] | |
14266 psrld m6, 5 | |
14267 packusdw m5, m6 | |
14268 | |
; Same three-pair window rebuilt (m5/m6 were consumed above); m3 becomes its upper half. | |
14269 palignr m9, m0, m3, 12 | |
14270 pmaddwd m6, m9, [r3 + 2 * 32] ; [19] | |
14271 paddd m6, [pd_16] | |
14272 psrld m6, 5 | |
14273 palignr m3, m2, m0, 12 | |
14274 pmaddwd m7, m3, [r3 + 2 * 32] | |
14275 paddd m7, [pd_16] | |
14276 psrld m7, 5 | |
14277 packusdw m6, m7 | |
14278 | |
14279 pmaddwd m7, m9, [r3 + 11 * 32] ; [28] | |
14280 paddd m7, [pd_16] | |
14281 psrld m7, 5 | |
14282 pmaddwd m8, m3, [r3 + 11 * 32] | |
14283 paddd m8, [pd_16] | |
14284 psrld m8, 5 | |
14285 packusdw m7, m8 | |
14286 | |
14287 pmaddwd m8, m0, [r3 - 12 * 32] ; [5] | |
14288 paddd m8, [pd_16] | |
14289 psrld m8, 5 | |
14290 pmaddwd m10, m2, [r3 - 12 * 32] | |
14291 paddd m10, [pd_16] | |
14292 psrld m10, 5 | |
14293 packusdw m8, m10 | |
14294 | |
14295 pmaddwd m9, m0, [r3 - 3 * 32] ; [14] | |
14296 paddd m9, [pd_16] | |
14297 psrld m9, 5 | |
14298 pmaddwd m3, m2, [r3 - 3 * 32] | |
14299 paddd m3, [pd_16] | |
14300 psrld m3, 5 | |
14301 packusdw m9, m3 | |
14302 | |
14303 pmaddwd m10, m0, [r3 + 6 * 32] ; [23] | |
14304 paddd m10, [pd_16] | |
14305 psrld m10, 5 | |
14306 pmaddwd m12, m2, [r3 + 6 * 32] | |
14307 paddd m12, [pd_16] | |
14308 psrld m12, 5 | |
14309 packusdw m10, m12 | |
14310 | |
14311 palignr m11, m2, m0, 4 | |
14312 pmaddwd m11, [r3 - 17 * 32] ; [0] | |
14313 paddd m11, [pd_16] | |
14314 psrld m11, 5 | |
14315 palignr m12, m1, m2, 4 | |
14316 pmaddwd m12, [r3 - 17 * 32] | |
14317 paddd m12, [pd_16] | |
14318 psrld m12, 5 | |
14319 packusdw m11, m12 | |
14320 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 3, 2, 16 | |
14321 ret | |
14322 | |
; intra_pred_ang32_7: 32x32 angular intra prediction, mode 7, assembled from four helper calls. | |
; cglobal 3,8,14 = 3 arguments, 8 general-purpose and 14 vector registers (x86inc convention); | |
; ang32_mode_7_29 uses m13. | |
; r0 looks like the destination, r1 its stride and r2 the reference samples -- inferred from | |
; register use; TODO confirm against the C prototype. | |
14323 cglobal intra_pred_ang32_7, 3,8,14 | |
; Set-up: r2 += 128 bytes, r6d = 0 (tested by ang32_mode_7_29 with "test r6d, r6d"), | |
; r3 = row 17 of ang_table_avx2, r1 doubled (presumably pixel-to-byte stride), | |
; r4 = 3 * r1, r7 = r0 + 8 * r1. | |
14324 add r2, 128 | |
14325 xor r6d, r6d | |
14326 lea r3, [ang_table_avx2 + 17 * 32] | |
14327 add r1d, r1d | |
14328 lea r4, [r1 * 3] | |
14329 lea r7, [r0 + 8 * r1] | |
14330 | |
14331 call ang16_mode_7_29 | |
14332 | |
; Second helper: reference pointer +8 bytes, destination +32 bytes. | |
14333 add r2, 8 | |
14334 lea r0, [r0 + 32] | |
14335 | |
14336 call ang32_mode_7_29 | |
14337 | |
; Third helper: r2 is now 32 bytes past its value at the first call; r0 = r7 + 8 * r1, i.e. | |
; the entry r0 + 16 * r1 provided the helpers leave r1 and r7 untouched (TODO confirm). | |
14338 add r2, 24 | |
14339 lea r0, [r7 + 8 * r1] | |
14340 | |
14341 call ang16_mode_7_29 | |
14342 | |
14343 add r2, 8 | |
14344 lea r0, [r0 + 32] | |
14345 | |
14346 call ang32_mode_7_29 | |
14347 RET | |
14348 | |
; intra_pred_ang32_29: 32x32 angular intra prediction, mode 29; same helpers as mode 7 but | |
; with r6d = 1 and without the initial 128-byte offset on r2. | |
; cglobal 3,7,14 = 3 arguments, 7 general-purpose and 14 vector registers (x86inc convention). | |
14349 cglobal intra_pred_ang32_29, 3,7,14 | |
; Set-up: r6d = 1, r3 = row 17 of ang_table_avx2, r1 doubled (presumably pixel-to-byte | |
; stride), r4 = 3 * r1, r5 = r0 + 32 bytes (destination of the third call). | |
14350 xor r6d, r6d | |
14351 inc r6d | |
14352 lea r3, [ang_table_avx2 + 17 * 32] | |
14353 add r1d, r1d | |
14354 lea r4, [r1 * 3] | |
14355 lea r5, [r0 + 32] | |
14356 | |
14357 call ang16_mode_7_29 | |
14358 | |
; r0 is not adjusted between the first two calls; the second helper continues from wherever | |
; the first one left r0 (TODO confirm the helper advances it). | |
14359 add r2, 8 | |
14360 | |
14361 call ang32_mode_7_29 | |
14362 | |
; r2 is now 32 bytes past its entry value; restart the destination at the saved r5. | |
14363 add r2, 24 | |
14364 mov r0, r5 | |
14365 | |
14366 call ang16_mode_7_29 | |
14367 | |
14368 add r2, 8 | |
14369 | |
14370 call ang32_mode_7_29 | |
14371 RET | |
14372 | |
14373 ;; angle 32, modes 8 and 28 | |
; Helper called by intra_pred_ang32_8 and intra_pred_ang32_28; returns with a plain "ret". | |
; Inputs as prepared by those callers: r2 = reference pointer, r3 = ang_table_avx2 + 15 * 32, | |
; r6d = 0 (mode 8) or 1 (mode 28). | |
; Builds 16 result vectors in two groups of eight; each group is passed to | |
; TRANSPOSE_STORE_AVX2 (macro defined outside this chunk). | |
; Every result uses the same recipe: pmaddwd of interleaved neighbouring samples with one | |
; table row, add [pd_16], shift right by 5, then packusdw to merge the two halves. | |
; The bracketed number beside each first pmaddwd is the table row addressed (15 + offset). | |
14374 cglobal ang32_mode_8_28 | |
; The SIMD instructions below do not modify EFLAGS, so the result of this test is still | |
; available at the first TRANSPOSE_STORE_AVX2; presumably the macro branches on it -- TODO confirm. | |
14375 test r6d, r6d | |
14376 | |
; Load overlapping 16-word windows and interleave them so that every dword holds two | |
; adjacent samples. Unlike the steeper modes, only the low interleave of the second window | |
; pair is kept (in m2); m1 is free for use as a temporary. | |
14377 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
14378 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
14379 | |
14380 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] | |
14381 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] | |
14382 | |
14383 movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] | |
14384 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] | |
14385 punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] | |
14386 | |
; First group of eight results (m4..m11): three rows on the unshifted window, then five on | |
; the window advanced by one pair (m11/m1). | |
14387 pmaddwd m4, m3, [r3 + 6 * 32] ; [21] | |
14388 paddd m4, [pd_16] | |
14389 psrld m4, 5 | |
14390 pmaddwd m5, m0, [r3 + 6 * 32] | |
14391 paddd m5, [pd_16] | |
14392 psrld m5, 5 | |
14393 packusdw m4, m5 | |
14394 | |
14395 pmaddwd m5, m3, [r3 + 11 * 32] ; [26] | |
14396 paddd m5, [pd_16] | |
14397 psrld m5, 5 | |
14398 pmaddwd m8, m0, [r3 + 11 * 32] | |
14399 paddd m8, [pd_16] | |
14400 psrld m8, 5 | |
14401 packusdw m5, m8 | |
14402 | |
14403 pmaddwd m6, m3, [r3 + 16 * 32] ; [31] | |
14404 paddd m6, [pd_16] | |
14405 psrld m6, 5 | |
14406 pmaddwd m9, m0, [r3 + 16 * 32] | |
14407 paddd m9, [pd_16] | |
14408 psrld m9, 5 | |
14409 packusdw m6, m9 | |
14410 | |
14411 palignr m11, m0, m3, 4 | |
14412 pmaddwd m7, m11, [r3 - 11 * 32] ; [4] | |
14413 paddd m7, [pd_16] | |
14414 psrld m7, 5 | |
14415 palignr m1, m2, m0, 4 | |
14416 pmaddwd m8, m1, [r3 - 11 * 32] | |
14417 paddd m8, [pd_16] | |
14418 psrld m8, 5 | |
14419 packusdw m7, m8 | |
14420 | |
14421 pmaddwd m8, m11, [r3 - 6 * 32] ; [9] | |
14422 paddd m8, [pd_16] | |
14423 psrld m8, 5 | |
14424 pmaddwd m9, m1, [r3 - 6 * 32] | |
14425 paddd m9, [pd_16] | |
14426 psrld m9, 5 | |
14427 packusdw m8, m9 | |
14428 | |
14429 pmaddwd m9, m11, [r3 - 1 * 32] ; [14] | |
14430 paddd m9, [pd_16] | |
14431 psrld m9, 5 | |
14432 pmaddwd m10, m1, [r3 - 1 * 32] | |
14433 paddd m10, [pd_16] | |
14434 psrld m10, 5 | |
14435 packusdw m9, m10 | |
14436 | |
14437 pmaddwd m10, m11, [r3 + 4 * 32] ; [19] | |
14438 paddd m10, [pd_16] | |
14439 psrld m10, 5 | |
14440 pmaddwd m12, m1, [r3 + 4 * 32] | |
14441 paddd m12, [pd_16] | |
14442 psrld m12, 5 | |
14443 packusdw m10, m12 | |
14444 | |
14445 pmaddwd m11, [r3 + 9 * 32] ; [24] | |
14446 paddd m11, [pd_16] | |
14447 psrld m11, 5 | |
14448 pmaddwd m1, [r3 + 9 * 32] | |
14449 paddd m1, [pd_16] | |
14450 psrld m1, 5 | |
14451 packusdw m11, m1 | |
14452 | |
; Hand the first eight results to the store macro. m12 and m1 hold dead values at this point, | |
; so they are presumably scratch registers for the macro; the trailing 0 (16 for the second | |
; group) looks like a destination offset -- macro not visible, TODO confirm. | |
14453 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 | |
14454 | |
; Second group. m3, m0 and m2 still hold the original interleaves, so the one-pair window | |
; is rebuilt here for table row 29 before moving on to the two-pair window (m1/m10). | |
14455 palignr m4, m0, m3, 4 | |
14456 pmaddwd m4, [r3 + 14 * 32] ; [29] | |
14457 paddd m4, [pd_16] | |
14458 psrld m4, 5 | |
14459 palignr m5, m2, m0, 4 | |
14460 pmaddwd m5, [r3 + 14 * 32] | |
14461 paddd m5, [pd_16] | |
14462 psrld m5, 5 | |
14463 packusdw m4, m5 | |
14464 | |
14465 palignr m1, m0, m3, 8 | |
14466 pmaddwd m5, m1, [r3 - 13 * 32] ; [2] | |
14467 paddd m5, [pd_16] | |
14468 psrld m5, 5 | |
14469 palignr m10, m2, m0, 8 | |
14470 pmaddwd m6, m10, [r3 - 13 * 32] | |
14471 paddd m6, [pd_16] | |
14472 psrld m6, 5 | |
14473 packusdw m5, m6 | |
14474 | |
14475 pmaddwd m6, m1, [r3 - 8 * 32] ; [7] | |
14476 paddd m6, [pd_16] | |
14477 psrld m6, 5 | |
14478 pmaddwd m8, m10, [r3 - 8 * 32] | |
14479 paddd m8, [pd_16] | |
14480 psrld m8, 5 | |
14481 packusdw m6, m8 | |
14482 | |
14483 pmaddwd m7, m1, [r3 - 3 * 32] ; [12] | |
14484 paddd m7, [pd_16] | |
14485 psrld m7, 5 | |
14486 pmaddwd m8, m10, [r3 - 3 * 32] | |
14487 paddd m8, [pd_16] | |
14488 psrld m8, 5 | |
14489 packusdw m7, m8 | |
14490 | |
14491 pmaddwd m8, m1, [r3 + 2 * 32] ; [17] | |
14492 paddd m8, [pd_16] | |
14493 psrld m8, 5 | |
14494 pmaddwd m9, m10, [r3 + 2 * 32] | |
14495 paddd m9, [pd_16] | |
14496 psrld m9, 5 | |
14497 packusdw m8, m9 | |
14498 | |
14499 pmaddwd m9, m1, [r3 + 7 * 32] ; [22] | |
14500 paddd m9, [pd_16] | |
14501 psrld m9, 5 | |
14502 pmaddwd m11, m10, [r3 + 7 * 32] | |
14503 paddd m11, [pd_16] | |
14504 psrld m11, 5 | |
14505 packusdw m9, m11 | |
14506 | |
; This result lands in m1 (not m10), hence m1 in the seventh slot of the store macro below. | |
14507 pmaddwd m1, [r3 + 12 * 32] ; [27] | |
14508 paddd m1, [pd_16] | |
14509 psrld m1, 5 | |
14510 pmaddwd m10, [r3 + 12 * 32] | |
14511 paddd m10, [pd_16] | |
14512 psrld m10, 5 | |
14513 packusdw m1, m10 | |
14514 | |
14515 palignr m11, m0, m3, 12 | |
14516 pmaddwd m11, [r3 - 15 * 32] ; [0] | |
14517 paddd m11, [pd_16] | |
14518 psrld m11, 5 | |
14519 palignr m2, m0, 12 | |
14520 pmaddwd m2, [r3 - 15 * 32] | |
14521 paddd m2, [pd_16] | |
14522 psrld m2, 5 | |
14523 packusdw m11, m2 | |
14524 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 11, 0, 2, 16 | |
14525 ret | |
14526 | |
; intra_pred_ang32_8: 32x32 angular intra prediction, mode 8, assembled from four helper calls. | |
; cglobal 3,8,13 = 3 arguments, 8 general-purpose and 13 vector registers (x86inc convention). | |
; r0 looks like the destination, r1 its stride and r2 the reference samples -- inferred from | |
; register use; TODO confirm against the C prototype. | |
14527 cglobal intra_pred_ang32_8, 3,8,13 | |
; Set-up: r2 += 128 bytes, r6d = 0 (tested by ang32_mode_8_28 with "test r6d, r6d"), | |
; r3 = row 15 of ang_table_avx2, r1 doubled (presumably pixel-to-byte stride), | |
; r4 = 3 * r1, r7 = r0 + 8 * r1. | |
14528 add r2, 128 | |
14529 xor r6d, r6d | |
14530 lea r3, [ang_table_avx2 + 15 * 32] | |
14531 add r1d, r1d | |
14532 lea r4, [r1 * 3] | |
14533 lea r7, [r0 + 8 * r1] | |
14534 | |
14535 call ang16_mode_8_28 | |
14536 | |
; Second helper: reference pointer +4 bytes, destination +32 bytes. | |
14537 add r2, 4 | |
14538 lea r0, [r0 + 32] | |
14539 | |
14540 call ang32_mode_8_28 | |
14541 | |
; Third helper: r2 is now 32 bytes past its value at the first call; r0 = r7 + 8 * r1, i.e. | |
; the entry r0 + 16 * r1 provided the helpers leave r1 and r7 untouched (TODO confirm). | |
14542 add r2, 28 | |
14543 lea r0, [r7 + 8 * r1] | |
14544 | |
14545 call ang16_mode_8_28 | |
14546 | |
14547 add r2, 4 | |
14548 lea r0, [r0 + 32] | |
14549 | |
14550 call ang32_mode_8_28 | |
14551 RET | |
14552 | |
; intra_pred_ang32_28: 32x32 angular intra prediction, mode 28; same helpers as mode 8 but | |
; with r6d = 1 and without the initial 128-byte offset on r2. | |
; cglobal 3,7,13 = 3 arguments, 7 general-purpose and 13 vector registers (x86inc convention). | |
14553 cglobal intra_pred_ang32_28, 3,7,13 | |
; Set-up: r6d = 1, r3 = row 15 of ang_table_avx2, r1 doubled (presumably pixel-to-byte | |
; stride), r4 = 3 * r1, r5 = r0 + 32 bytes (destination of the third call). | |
14554 xor r6d, r6d | |
14555 inc r6d | |
14556 lea r3, [ang_table_avx2 + 15 * 32] | |
14557 add r1d, r1d | |
14558 lea r4, [r1 * 3] | |
14559 lea r5, [r0 + 32] | |
14560 | |
14561 call ang16_mode_8_28 | |
14562 | |
; r0 is not adjusted between the first two calls; the second helper continues from wherever | |
; the first one left r0 (TODO confirm the helper advances it). | |
14563 add r2, 4 | |
14564 | |
14565 call ang32_mode_8_28 | |
14566 | |
; r2 is now 32 bytes past its entry value; restart the destination at the saved r5. | |
14567 add r2, 28 | |
14568 mov r0, r5 | |
14569 | |
14570 call ang16_mode_8_28 | |
14571 | |
14572 add r2, 4 | |
14573 | |
14574 call ang32_mode_8_28 | |
14575 RET | |
14576 | |
; intra_pred_ang32_9: 32x32 angular intra prediction, mode 9. Unlike modes 4-8 there is no | |
; separate ang32 helper: all four calls go to ang16_mode_9_27 (defined outside this chunk). | |
; cglobal 3,8,13 = 3 arguments, 8 general-purpose and 13 vector registers (x86inc convention). | |
; r0 looks like the destination, r1 its stride and r2 the reference samples -- inferred from | |
; register use; TODO confirm against the C prototype. | |
14577 cglobal intra_pred_ang32_9, 3,8,13 | |
; Set-up: r2 += 128 bytes, r6d = 0 (presumably the direction flag the helper tests), | |
; r3 = row 16 of ang_table_avx2, r1 doubled (presumably pixel-to-byte stride), | |
; r4 = 3 * r1, r7 = r0 + 8 * r1. | |
14578 add r2, 128 | |
14579 xor r6d, r6d | |
14580 lea r3, [ang_table_avx2 + 16 * 32] | |
14581 add r1d, r1d | |
14582 lea r4, [r1 * 3] | |
14583 lea r7, [r0 + 8 * r1] | |
14584 | |
14585 call ang16_mode_9_27 | |
14586 | |
; Second call: reference pointer +2 bytes, destination +32 bytes. | |
14587 add r2, 2 | |
14588 lea r0, [r0 + 32] | |
14589 | |
14590 call ang16_mode_9_27 | |
14591 | |
; Third call: r2 is now 32 bytes past its value at the first call; r0 = r7 + 8 * r1, i.e. | |
; the entry r0 + 16 * r1 provided the helper leaves r1 and r7 untouched (TODO confirm). | |
14592 add r2, 30 | |
14593 lea r0, [r7 + 8 * r1] | |
14594 | |
14595 call ang16_mode_9_27 | |
14596 | |
14597 add r2, 2 | |
14598 lea r0, [r0 + 32] | |
14599 | |
14600 call ang16_mode_9_27 | |
14601 RET | |
14602 | |
; intra_pred_ang32_27: 32x32 angular intra prediction, mode 27; four calls to | |
; ang16_mode_9_27 with r6d = 1 and without the initial 128-byte offset on r2. | |
; cglobal 3,7,13 = 3 arguments, 7 general-purpose and 13 vector registers (x86inc convention). | |
14603 cglobal intra_pred_ang32_27, 3,7,13 | |
; Set-up: r6d = 1, r3 = row 16 of ang_table_avx2, r1 doubled (presumably pixel-to-byte | |
; stride), r4 = 3 * r1, r5 = r0 + 32 bytes (destination of the third call). | |
14604 xor r6d, r6d | |
14605 inc r6d | |
14606 lea r3, [ang_table_avx2 + 16 * 32] | |
14607 add r1d, r1d | |
14608 lea r4, [r1 * 3] | |
14609 lea r5, [r0 + 32] | |
14610 | |
14611 call ang16_mode_9_27 | |
14612 | |
; r0 is not adjusted between the first two calls; the second call continues from wherever | |
; the first one left r0 (helper not visible here -- TODO confirm it advances r0). | |
14613 add r2, 2 | |
14614 | |
14615 call ang16_mode_9_27 | |
14616 | |
; r2 is now 32 bytes past its entry value; restart the destination at the saved r5. | |
14617 add r2, 30 | |
14618 mov r0, r5 | |
14619 | |
14620 call ang16_mode_9_27 | |
14621 | |
14622 add r2, 2 | |
14623 | |
14624 call ang16_mode_9_27 | |
14625 RET | |
14626 | |
; intra_pred_ang32_10: mode 10 for a 32x32 block. Each of the 32 output rows is filled with a | |
; single reference sample: row i receives the 16-bit value at [r2 + 2 + 2 * i], broadcast | |
; across a vector register and stored at byte offsets 0 and 32 of the row. | |
; cglobal 3,4,2 = 3 arguments, 4 general-purpose and 2 vector registers (x86inc convention). | |
; r0 = destination, r1 = stride (doubled below, presumably pixels to bytes), r2 = reference | |
; samples -- argument meaning inferred from use; TODO confirm against the C prototype. | |
14627 cglobal intra_pred_ang32_10, 3,4,2 | |
; r2 += 4 * mmsize (128 bytes if mmsize is 32, which the 32-byte store spacing suggests); | |
; r3 = 3 * r1 so four rows can be addressed from one base before r0 is advanced. | |
14628 add r2, mmsize*4 | |
14629 add r1d, r1d | |
14630 lea r3, [r1 * 3] | |
14631 | |
; Rows 0-3. m0 and m1 alternate as the broadcast target. | |
14632 vpbroadcastw m0, [r2 + 2] ; [1...] | |
14633 movu [r0], m0 | |
14634 movu [r0 + 32], m0 | |
14635 vpbroadcastw m1, [r2 + 2 + 2] ; [2...] | |
14636 movu [r0 + r1], m1 | |
14637 movu [r0 + r1 + 32], m1 | |
14638 vpbroadcastw m0, [r2 + 2 + 4] ; [3...] | |
14639 movu [r0 + r1 * 2], m0 | |
14640 movu [r0 + r1 * 2 + 32], m0 | |
14641 vpbroadcastw m1, [r2 + 2 + 6] ; [4...] | |
14642 movu [r0 + r3], m1 | |
14643 movu [r0 + r3 + 32], m1 | |
14644 | |
; Rows 4-7. | |
14645 lea r0, [r0 + r1 * 4] | |
14646 vpbroadcastw m0, [r2 + 2 + 8] ; [5...] | |
14647 movu [r0], m0 | |
14648 movu [r0 + 32], m0 | |
14649 vpbroadcastw m1, [r2 + 2 + 10] ; [6...] | |
14650 movu [r0 + r1], m1 | |
14651 movu [r0 + r1 + 32], m1 | |
14652 vpbroadcastw m0, [r2 + 2 + 12] ; [7...] | |
14653 movu [r0 + r1 * 2], m0 | |
14654 movu [r0 + r1 * 2 + 32], m0 | |
14655 vpbroadcastw m1, [r2 + 2 + 14] ; [8...] | |
14656 movu [r0 + r3], m1 | |
14657 movu [r0 + r3 + 32], m1 | |
14658 | |
; Rows 8-11. | |
14659 lea r0, [r0 + r1 *4] | |
14660 vpbroadcastw m0, [r2 + 2 + 16] ; [9...] | |
14661 movu [r0], m0 | |
14662 movu [r0 + 32], m0 | |
14663 vpbroadcastw m1, [r2 + 2 + 18] ; [10...] | |
14664 movu [r0 + r1], m1 | |
14665 movu [r0 + r1 + 32], m1 | |
14666 vpbroadcastw m0, [r2 + 2 + 20] ; [11...] | |
14667 movu [r0 + r1 * 2], m0 | |
14668 movu [r0 + r1 * 2 + 32], m0 | |
14669 vpbroadcastw m1, [r2 + 2 + 22] ; [12...] | |
14670 movu [r0 + r3], m1 | |
14671 movu [r0 + r3 + 32], m1 | |
14672 | |
; Rows 12-15. | |
14673 lea r0, [r0 + r1 *4] | |
14674 vpbroadcastw m0, [r2 + 2 + 24] ; [13...] | |
14675 movu [r0], m0 | |
14676 movu [r0 + 32], m0 | |
14677 vpbroadcastw m1, [r2 + 2 + 26] ; [14...] | |
14678 movu [r0 + r1], m1 | |
14679 movu [r0 + r1 + 32], m1 | |
14680 vpbroadcastw m0, [r2 + 2 + 28] ; [15...] | |
14681 movu [r0 + r1 * 2], m0 | |
14682 movu [r0 + r1 * 2 + 32], m0 | |
14683 vpbroadcastw m1, [r2 + 2 + 30] ; [16...] | |
14684 movu [r0 + r3], m1 | |
14685 movu [r0 + r3 + 32], m1 | |
14686 | |
; Rows 16-19. | |
14687 lea r0, [r0 + r1 *4] | |
14688 vpbroadcastw m0, [r2 + 2 + 32] ; [17...] | |
14689 movu [r0], m0 | |
14690 movu [r0 + 32], m0 | |
14691 vpbroadcastw m1, [r2 + 2 + 34] ; [18...] | |
14692 movu [r0 + r1], m1 | |
14693 movu [r0 + r1 + 32], m1 | |
14694 vpbroadcastw m0, [r2 + 2 + 36] ; [19...] | |
14695 movu [r0 + r1 * 2], m0 | |
14696 movu [r0 + r1 * 2 + 32], m0 | |
14697 vpbroadcastw m1, [r2 + 2 + 38] ; [20...] | |
14698 movu [r0 + r3], m1 | |
14699 movu [r0 + r3 + 32], m1 | |
14700 | |
; Rows 20-23. | |
14701 lea r0, [r0 + r1 *4] | |
14702 vpbroadcastw m0, [r2 + 2 + 40] ; [21...] | |
14703 movu [r0], m0 | |
14704 movu [r0 + 32], m0 | |
14705 vpbroadcastw m1, [r2 + 2 + 42] ; [22...] | |
14706 movu [r0 + r1], m1 | |
14707 movu [r0 + r1 + 32], m1 | |
14708 vpbroadcastw m0, [r2 + 2 + 44] ; [23...] | |
14709 movu [r0 + r1 * 2], m0 | |
14710 movu [r0 + r1 * 2 + 32], m0 | |
14711 vpbroadcastw m1, [r2 + 2 + 46] ; [24...] | |
14712 movu [r0 + r3], m1 | |
14713 movu [r0 + r3 + 32], m1 | |
14714 | |
; Rows 24-27. | |
14715 lea r0, [r0 + r1 *4] | |
14716 vpbroadcastw m0, [r2 + 2 + 48] ; [25...] | |
14717 movu [r0], m0 | |
14718 movu [r0 + 32], m0 | |
14719 vpbroadcastw m1, [r2 + 2 + 50] ; [26...] | |
14720 movu [r0 + r1], m1 | |
14721 movu [r0 + r1 + 32], m1 | |
14722 vpbroadcastw m0, [r2 + 2 + 52] ; [27...] | |
14723 movu [r0 + r1 * 2], m0 | |
14724 movu [r0 + r1 * 2 + 32], m0 | |
14725 vpbroadcastw m1, [r2 + 2 + 54] ; [28...] | |
14726 movu [r0 + r3], m1 | |
14727 movu [r0 + r3 + 32], m1 | |
14728 | |
; Rows 28-31. | |
14729 lea r0, [r0 + r1 *4] | |
14730 vpbroadcastw m0, [r2 + 2 + 56] ; [29...] | |
14731 movu [r0], m0 | |
14732 movu [r0 + 32], m0 | |
14733 vpbroadcastw m1, [r2 + 2 + 58] ; [30...] | |
14734 movu [r0 + r1], m1 | |
14735 movu [r0 + r1 + 32], m1 | |
14736 vpbroadcastw m0, [r2 + 2 + 60] ; [31...] | |
14737 movu [r0 + r1 * 2], m0 | |
14738 movu [r0 + r1 * 2 + 32], m0 | |
14739 vpbroadcastw m1, [r2 + 2 + 62] ; [32...] | |
14740 movu [r0 + r3], m1 | |
14741 movu [r0 + r3 + 32], m1 | |
14742 RET | |
14743 | |
; intra_pred_ang32_26: mode 26 for a 32x32 block. The two vectors loaded from [r2 + 2] and | |
; [r2 + 34] are copied unchanged into every one of the 32 output rows (at byte offsets 0 and | |
; 32), written as eight groups of four rows. | |
; cglobal 3,3,2 = 3 arguments, 3 general-purpose and 2 vector registers (x86inc convention). | |
; r0 = destination, r1 = stride (doubled below, presumably pixels to bytes), r2 = reference | |
; samples -- argument meaning inferred from use; TODO confirm against the C prototype. | |
14744 cglobal intra_pred_ang32_26, 3,3,2 | |
; Once both vectors are loaded the reference pointer is no longer needed, so r2 is | |
; recycled to hold 3 * r1. | |
14745 movu m0, [r2 + 2] | |
14746 movu m1, [r2 + 34] | |
14747 add r1d, r1d | |
14748 lea r2, [r1 * 3] | |
14749 | |
; Rows 0-3. | |
14750 movu [r0], m0 | |
14751 movu [r0 + 32], m1 | |
14752 movu [r0 + r1], m0 | |
14753 movu [r0 + r1 + 32], m1 | |
14754 movu [r0 + r1 * 2], m0 | |
14755 movu [r0 + r1 * 2 + 32], m1 | |
14756 movu [r0 + r2], m0 | |
14757 movu [r0 + r2 + 32], m1 | |
14758 | |
; Rows 4-7. | |
14759 lea r0, [r0 + r1 *4] | |
14760 movu [r0], m0 | |
14761 movu [r0 + 32], m1 | |
14762 movu [r0 + r1], m0 | |
14763 movu [r0 + r1 + 32], m1 | |
14764 movu [r0 + r1 * 2], m0 | |
14765 movu [r0 + r1 * 2 + 32], m1 | |
14766 movu [r0 + r2], m0 | |
14767 movu [r0 + r2 + 32], m1 | |
14768 | |
; Rows 8-11. | |
14769 lea r0, [r0 + r1 *4] | |
14770 movu [r0], m0 | |
14771 movu [r0 + 32], m1 | |
14772 movu [r0 + r1], m0 | |
14773 movu [r0 + r1 + 32], m1 | |
14774 movu [r0 + r1 * 2], m0 | |
14775 movu [r0 + r1 * 2 + 32], m1 | |
14776 movu [r0 + r2], m0 | |
14777 movu [r0 + r2 + 32], m1 | |
14778 | |
; Rows 12-15. | |
14779 lea r0, [r0 + r1 *4] | |
14780 movu [r0], m0 | |
14781 movu [r0 + 32], m1 | |
14782 movu [r0 + r1], m0 | |
14783 movu [r0 + r1 + 32], m1 | |
14784 movu [r0 + r1 * 2], m0 | |
14785 movu [r0 + r1 * 2 + 32], m1 | |
14786 movu [r0 + r2], m0 | |
14787 movu [r0 + r2 + 32], m1 | |
14788 | |
; Rows 16-19. | |
14789 lea r0, [r0 + r1 *4] | |
14790 movu [r0], m0 | |
14791 movu [r0 + 32], m1 | |
14792 movu [r0 + r1], m0 | |
14793 movu [r0 + r1 + 32], m1 | |
14794 movu [r0 + r1 * 2], m0 | |
14795 movu [r0 + r1 * 2 + 32], m1 | |
14796 movu [r0 + r2], m0 | |
14797 movu [r0 + r2 + 32], m1 | |
14798 | |
; Rows 20-23. | |
14799 lea r0, [r0 + r1 *4] | |
14800 movu [r0], m0 | |
14801 movu [r0 + 32], m1 | |
14802 movu [r0 + r1], m0 | |
14803 movu [r0 + r1 + 32], m1 | |
14804 movu [r0 + r1 * 2], m0 | |
14805 movu [r0 + r1 * 2 + 32], m1 | |
14806 movu [r0 + r2], m0 | |
14807 movu [r0 + r2 + 32], m1 | |
14808 | |
; Rows 24-27. | |
14809 lea r0, [r0 + r1 *4] | |
14810 movu [r0], m0 | |
14811 movu [r0 + 32], m1 | |
14812 movu [r0 + r1], m0 | |
14813 movu [r0 + r1 + 32], m1 | |
14814 movu [r0 + r1 * 2], m0 | |
14815 movu [r0 + r1 * 2 + 32], m1 | |
14816 movu [r0 + r2], m0 | |
14817 movu [r0 + r2 + 32], m1 | |
14818 | |
; Rows 28-31. | |
14819 lea r0, [r0 + r1 *4] | |
14820 movu [r0], m0 | |
14821 movu [r0 + 32], m1 | |
14822 movu [r0 + r1], m0 | |
14823 movu [r0 + r1 + 32], m1 | |
14824 movu [r0 + r1 * 2], m0 | |
14825 movu [r0 + r1 * 2 + 32], m1 | |
14826 movu [r0 + r2], m0 | |
14827 movu [r0 + r2 + 32], m1 | |
14828 RET | |
14829 | |
; intra_pred_ang32_11: 32x32 angular intra prediction, mode 11, via four calls to | |
; ang16_mode_11_25 (defined outside this chunk). | |
; cglobal 3,8,12, 0-8 = 3 arguments, 8 general-purpose and 12 vector registers, plus a | |
; stack-size argument (x86inc convention); the code keeps two 16-bit values at [rsp] and [rsp + 4]. | |
; Side effect: two words of the buffer at r2 are overwritten for the duration of the | |
; routine and written back before returning, so the buffer must be writable. | |
14830 cglobal intra_pred_ang32_11, 3,8,12, 0-8 | |
; Save word [r2 + 128] on the stack and replace it with word [r2]. | |
14831 movzx r5d, word [r2 + 128] ; [0] | |
14832 movzx r6d, word [r2] | |
14833 mov [rsp], r5w | |
14834 mov [r2 + 128], r6w | |
14835 | |
; Save word [r2 + 126] on the stack and replace it with word [r2 + 32]. | |
14836 movzx r5d, word [r2 + 126] ; [16] | |
14837 movzx r6d, word [r2 + 32] | |
14838 mov [rsp + 4], r5w | |
14839 mov [r2 + 126], r6w | |
14840 | |
; Set-up: r2 += 128 bytes, r6d = 0 (presumably the direction flag the helper tests), | |
; r3 = row 16 of ang_table_avx2, r1 doubled (presumably pixel-to-byte stride), | |
; r4 = 3 * r1, r7 = r0 + 8 * r1. | |
14841 add r2, 128 | |
14842 xor r6d, r6d | |
14843 lea r3, [ang_table_avx2 + 16 * 32] | |
14844 add r1d, r1d | |
14845 lea r4, [r1 * 3] | |
14846 lea r7, [r0 + 8 * r1] | |
14847 | |
14848 call ang16_mode_11_25 | |
14849 | |
; Second call: the reference pointer steps BACK by 2 bytes; destination +32 bytes. | |
14850 sub r2, 2 | |
14851 lea r0, [r0 + 32] | |
14852 | |
14853 call ang16_mode_11_25 | |
14854 | |
; Third call: r2 is now 32 bytes past its value at the first call; r0 = r7 + 8 * r1, i.e. | |
; the entry r0 + 16 * r1 provided the helper leaves r1 and r7 untouched (TODO confirm). | |
14855 add r2, 34 | |
14856 lea r0, [r7 + 8 * r1] | |
14857 | |
14858 call ang16_mode_11_25 | |
14859 | |
14860 sub r2, 2 | |
14861 lea r0, [r0 + 32] | |
14862 | |
14863 call ang16_mode_11_25 | |
14864 | |
; Restore the two patched words. r2 has moved by 128 - 2 + 34 - 2 = 158 bytes since entry | |
; (assuming the helper leaves r2 unchanged), so [r2 - 30] and [r2 - 32] are entry offsets | |
; 128 and 126. The loads are 32 bits wide but only the low 16 bits are stored back. | |
14865 mov r6d, [rsp] | |
14866 mov [r2 - 30], r6w | |
14867 mov r6d, [rsp + 4] | |
14868 mov [r2 - 32], r6w | |
14869 RET | |
14870 | |
; intra_pred_ang32_25: 32x32 angular intra prediction, mode 25, via four calls to | |
; ang16_mode_11_25 (defined outside this chunk) with r6d = 1. | |
; cglobal 3,7,12, 0-4 = 3 arguments, 7 general-purpose and 12 vector registers, plus a | |
; stack-size argument (x86inc convention); one 16-bit value is kept at [rsp]. | |
; Side effect: the word just before the buffer at r2 ([r2 - 2]) is overwritten for the | |
; duration of the routine and written back before returning. | |
14871 cglobal intra_pred_ang32_25, 3,7,12, 0-4 | |
; Set-up: r6d = 1, r3 = row 16 of ang_table_avx2, r1 doubled (presumably pixel-to-byte stride). | |
14872 xor r6d, r6d | |
14873 inc r6d | |
14874 lea r3, [ang_table_avx2 + 16 * 32] | |
14875 add r1d, r1d | |
14876 | |
; Save word [r2 - 2] on the stack and replace it with word [r2 + 160]. r4 and r5 are used | |
; as temporaries here and only receive their real values afterwards. | |
14877 movzx r4d, word [r2 - 2] | |
14878 movzx r5d, word [r2 + 160] ; [16] | |
14879 mov [rsp], r4w | |
14880 mov [r2 - 2], r5w | |
14881 | |
; r4 = 3 * r1, r5 = r0 + 32 bytes (destination of the third call). | |
14882 lea r4, [r1 * 3] | |
14883 lea r5, [r0 + 32] | |
14884 | |
14885 call ang16_mode_11_25 | |
14886 | |
; Second call: the reference pointer steps BACK by 2 bytes; r0 is left as the helper returned it. | |
14887 sub r2, 2 | |
14888 | |
14889 call ang16_mode_11_25 | |
14890 | |
; r2 is now 32 bytes past its entry value; restart the destination at the saved r5. | |
14891 add r2, 34 | |
14892 mov r0, r5 | |
14893 | |
14894 call ang16_mode_11_25 | |
14895 | |
14896 sub r2, 2 | |
14897 | |
14898 call ang16_mode_11_25 | |
14899 | |
; Restore the patched word. r2 has moved by -2 + 34 - 2 = 30 bytes since entry (assuming the | |
; helper leaves r2 unchanged), so [r2 - 32] is the entry [r2 - 2]. The load is 32 bits wide | |
; but only the low 16 bits are stored back. | |
14900 mov r5d, [rsp] | |
14901 mov [r2 - 32], r5w | |
14902 RET | |
14903 | |
14904 ;; angle 32, modes 12 and 24, row 0 to 15 | |
; Shared kernel called by intra_pred_ang32_12 and intra_pred_ang32_24.
; Register contract, as set up by those two callers in this file:
;   r2  = reference samples (16-bit words); the callers store projected
;         side-reference words just below r2 before calling
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] selects table row 16 + k
;   r1  = destination stride doubled by the caller, r4 = 3 * r1
;   r6d = 0 for mode 12, 1 for mode 24
; r0/r1/r4 are not touched here; they are presumably consumed by
; TRANSPOSE_STORE_AVX2, which is defined outside this section.
; Every 7-instruction group produces one line of 16 output words:
;   (pmaddwd(pairs, table row) + pd_16) >> 5, packed back to words.
; The bracketed [n] on each group is the table row it uses.
; On return m1 still holds the extended reference vector built below;
; ang32_mode_12_24_16_31 reads it without reloading.
14905 cglobal ang32_mode_12_24_0_15 | |
; Only sets ZF; no instruction below modifies the flags, so the result is
; still available to the store macro (presumably branched on there -- confirm).
14906 test r6d, r6d | |
14907 | |
14908 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
14909 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
14910 | |
14911 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
14912 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
14913 | |
14914 pmaddwd m4, m3, [r3 + 11 * 32] ; [27] | |
14915 paddd m4, [pd_16] | |
14916 psrld m4, 5 | |
14917 pmaddwd m5, m2, [r3 + 11 * 32] | |
14918 paddd m5, [pd_16] | |
14919 psrld m5, 5 | |
14920 packusdw m4, m5 | |
14921 | |
14922 pmaddwd m5, m3, [r3 + 6 * 32] ; [22] | |
14923 paddd m5, [pd_16] | |
14924 psrld m5, 5 | |
14925 pmaddwd m8, m2, [r3 + 6 * 32] | |
14926 paddd m8, [pd_16] | |
14927 psrld m8, 5 | |
14928 packusdw m5, m8 | |
14929 | |
14930 pmaddwd m6, m3, [r3 + 1 * 32] ; [17] | |
14931 paddd m6, [pd_16] | |
14932 psrld m6, 5 | |
14933 pmaddwd m9, m2, [r3 + 1 * 32] | |
14934 paddd m9, [pd_16] | |
14935 psrld m9, 5 | |
14936 packusdw m6, m9 | |
14937 | |
14938 pmaddwd m7, m3, [r3 - 4 * 32] ; [12] | |
14939 paddd m7, [pd_16] | |
14940 psrld m7, 5 | |
14941 pmaddwd m8, m2, [r3 - 4 * 32] | |
14942 paddd m8, [pd_16] | |
14943 psrld m8, 5 | |
14944 packusdw m7, m8 | |
14945 | |
14946 pmaddwd m8, m3, [r3 - 9 * 32] ; [7] | |
14947 paddd m8, [pd_16] | |
14948 psrld m8, 5 | |
14949 pmaddwd m9, m2, [r3 - 9 * 32] | |
14950 paddd m9, [pd_16] | |
14951 psrld m9, 5 | |
14952 packusdw m8, m9 | |
14953 | |
14954 pmaddwd m9, m3, [r3 - 14 * 32] ; [2] | |
14955 paddd m9, [pd_16] | |
14956 psrld m9, 5 | |
14957 pmaddwd m2, [r3 - 14 * 32] | |
14958 paddd m2, [pd_16] | |
14959 psrld m2, 5 | |
14960 packusdw m9, m2 | |
14961 | |
; Build the extended reference: the four projected words stored below r2 are
; doubled by the pw_ang32_12_24 shuffle (table defined outside this section;
; see the lane comment on vinserti128) and placed in the low lane of m1, with
; the doubled words 4..7 in the high lane.  palignr then slides the
; (left, right) sample pairs one word at a time.
14962 movu xm1, [r2 - 8] | |
14963 pshufb xm1, [pw_ang32_12_24] | |
14964 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
14965 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
14966 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 6 6 13 13 19 19 26 26] | |
14967 | |
14968 palignr m2, m3, m1, 14 ; [11 10 10 9 9 8 8 7 3 2 2 1 1 0 0 6] | |
14969 palignr m13, m0, m3, 14 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] | |
14970 | |
14971 pmaddwd m10, m2, [r3 + 13 * 32] ; [29] | |
14972 paddd m10, [pd_16] | |
14973 psrld m10, 5 | |
14974 pmaddwd m12, m13, [r3 + 13 * 32] | |
14975 paddd m12, [pd_16] | |
14976 psrld m12, 5 | |
14977 packusdw m10, m12 | |
14978 | |
14979 pmaddwd m11, m2, [r3 + 8 * 32] ; [24] | |
14980 paddd m11, [pd_16] | |
14981 psrld m11, 5 | |
14982 pmaddwd m13, [r3 + 8 * 32] | |
14983 paddd m13, [pd_16] | |
14984 psrld m13, 5 | |
14985 packusdw m11, m13 | |
14986 | |
; First eight lines (m4..m11) are stored; m12/m13 are passed as scratch.
14987 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
14988 | |
; m13 was consumed by the [24] group above and handed to the macro as scratch,
; so the high-half pair vector is rebuilt here.
14989 palignr m13, m0, m3, 14 | |
14990 | |
14991 pmaddwd m4, m2, [r3 + 3 * 32] ; [19] | |
14992 paddd m4, [pd_16] | |
14993 psrld m4, 5 | |
14994 pmaddwd m5, m13, [r3 + 3 * 32] | |
14995 paddd m5, [pd_16] | |
14996 psrld m5, 5 | |
14997 packusdw m4, m5 | |
14998 | |
14999 pmaddwd m5, m2, [r3 - 2 * 32] ; [14] | |
15000 paddd m5, [pd_16] | |
15001 psrld m5, 5 | |
15002 pmaddwd m6, m13, [r3 - 2 * 32] | |
15003 paddd m6, [pd_16] | |
15004 psrld m6, 5 | |
15005 packusdw m5, m6 | |
15006 | |
15007 pmaddwd m6, m2, [r3 - 7 * 32] ; [9] | |
15008 paddd m6, [pd_16] | |
15009 psrld m6, 5 | |
15010 pmaddwd m8, m13, [r3 - 7 * 32] | |
15011 paddd m8, [pd_16] | |
15012 psrld m8, 5 | |
15013 packusdw m6, m8 | |
15014 | |
15015 pmaddwd m7, m2, [r3 - 12 * 32] ; [4] | |
15016 paddd m7, [pd_16] | |
15017 psrld m7, 5 | |
15018 pmaddwd m8, m13, [r3 - 12 * 32] | |
15019 paddd m8, [pd_16] | |
15020 psrld m8, 5 | |
15021 packusdw m7, m8 | |
15022 | |
; Slide the pair vectors one more reference word (shift 10 instead of 14).
; m0 and m3 are overwritten from here on; m1 is left intact.
15023 palignr m0, m3, 10 | |
15024 palignr m3, m1, 10 | |
15025 | |
15026 pmaddwd m8, m3, [r3 + 15 * 32] ; [31] | |
15027 paddd m8, [pd_16] | |
15028 psrld m8, 5 | |
15029 pmaddwd m9, m0, [r3 + 15 * 32] | |
15030 paddd m9, [pd_16] | |
15031 psrld m9, 5 | |
15032 packusdw m8, m9 | |
15033 | |
15034 pmaddwd m9, m3, [r3 + 10 * 32] ; [26] | |
15035 paddd m9, [pd_16] | |
15036 psrld m9, 5 | |
15037 pmaddwd m10, m0, [r3 + 10 * 32] | |
15038 paddd m10, [pd_16] | |
15039 psrld m10, 5 | |
15040 packusdw m9, m10 | |
15041 | |
15042 pmaddwd m10, m3, [r3 + 5 * 32] ; [21] | |
15043 paddd m10, [pd_16] | |
15044 psrld m10, 5 | |
15045 pmaddwd m2, m0, [r3 + 5 * 32] | |
15046 paddd m2, [pd_16] | |
15047 psrld m2, 5 | |
15048 packusdw m10, m2 | |
15049 | |
15050 pmaddwd m3, [r3] ; [16] | |
15051 paddd m3, [pd_16] | |
15052 psrld m3, 5 | |
15053 pmaddwd m0, [r3] | |
15054 paddd m0, [pd_16] | |
15055 psrld m0, 5 | |
15056 packusdw m3, m0 | |
; Second eight lines (m4..m10, m3); m0/m2 are passed as scratch.
15057 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 2, 16 | |
15058 ret | |
15059 | |
;; angle 32, modes 12 and 24, row 16 to 31
;-----------------------------------------------------------------------------
; Shared kernel called by intra_pred_ang32_12 and intra_pred_ang32_24 directly
; after ang32_mode_12_24_0_15.
;
; Register contract (as set up by the callers in this file):
;   r2  = reference samples (16-bit words)
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] selects table row 16 + k
;   r1  = destination stride doubled by the caller, r4 = 3 * r1
;   r6d = 0 for mode 12, 1 for mode 24
;   m1  = extended reference vector left behind by ang32_mode_12_24_0_15;
;         it is read here without being reloaded, so the two kernels must be
;         called back to back.
; r0/r1/r4 are presumably consumed by TRANSPOSE_STORE_AVX2 (defined outside
; this section).
;
; Each 7-instruction group yields one line of 16 output words:
;   (pmaddwd(pairs, table row) + pd_16) >> 5, packed to words.
; The bracketed [n] is the table row used by the group.
;
; Compared with the previous revision, the load of [r2 + 2] and the two
; unpacks of m0 with it were dropped: their results (m4, m3, m2) were
; overwritten before ever being read.
;-----------------------------------------------------------------------------
cglobal ang32_mode_12_24_16_31
    ; Only sets ZF for the store macro; nothing below modifies the flags.
    test            r6d, r6d

    movu            m0, [r2]                        ; [15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]

    punpcklwd       m3, m0, m0                      ; [11 11 10 10  9  9  8  8  3  3  2  2  1  1  0  0]
    punpckhwd       m0, m0                          ; [15 15 14 14 13 13 12 12  7  7  6  6  5  5  4  4]

    ; Pair vectors for the first three lines (byte shift 10).
    palignr         m2, m3, m1, 10
    palignr         m13, m0, m3, 10

    pmaddwd         m4, m2, [r3 - 5 * 32]           ; [11]
    paddd           m4, [pd_16]
    psrld           m4, 5
    pmaddwd         m5, m13, [r3 - 5 * 32]
    paddd           m5, [pd_16]
    psrld           m5, 5
    packusdw        m4, m5

    pmaddwd         m5, m2, [r3 - 10 * 32]          ; [6]
    paddd           m5, [pd_16]
    psrld           m5, 5
    pmaddwd         m8, m13, [r3 - 10 * 32]
    paddd           m8, [pd_16]
    psrld           m8, 5
    packusdw        m5, m8

    pmaddwd         m6, m2, [r3 - 15 * 32]          ; [1]
    paddd           m6, [pd_16]
    psrld           m6, 5
    pmaddwd         m9, m13, [r3 - 15 * 32]
    paddd           m9, [pd_16]
    psrld           m9, 5
    packusdw        m6, m9

    ; Next reference position (byte shift 6).
    palignr         m2, m3, m1, 6
    palignr         m13, m0, m3, 6

    pmaddwd         m7, m2, [r3 + 12 * 32]          ; [28]
    paddd           m7, [pd_16]
    psrld           m7, 5
    pmaddwd         m8, m13, [r3 + 12 * 32]
    paddd           m8, [pd_16]
    psrld           m8, 5
    packusdw        m7, m8

    pmaddwd         m8, m2, [r3 + 7 * 32]           ; [23]
    paddd           m8, [pd_16]
    psrld           m8, 5
    pmaddwd         m9, m13, [r3 + 7 * 32]
    paddd           m9, [pd_16]
    psrld           m9, 5
    packusdw        m8, m9

    pmaddwd         m9, m2, [r3 + 2 * 32]           ; [18]
    paddd           m9, [pd_16]
    psrld           m9, 5
    pmaddwd         m10, m13, [r3 + 2 * 32]
    paddd           m10, [pd_16]
    psrld           m10, 5
    packusdw        m9, m10

    pmaddwd         m10, m2, [r3 - 3 * 32]          ; [13]
    paddd           m10, [pd_16]
    psrld           m10, 5
    pmaddwd         m12, m13, [r3 - 3 * 32]
    paddd           m12, [pd_16]
    psrld           m12, 5
    packusdw        m10, m12

    pmaddwd         m11, m2, [r3 - 8 * 32]          ; [8]
    paddd           m11, [pd_16]
    psrld           m11, 5
    pmaddwd         m13, [r3 - 8 * 32]
    paddd           m13, [pd_16]
    psrld           m13, 5
    packusdw        m11, m13

    ; First eight lines (m4..m11); m12/m13 are scratch for the macro.
    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0

    ; m13 was consumed above, rebuild it for the last shift-6 line.
    palignr         m13, m0, m3, 6

    pmaddwd         m4, m2, [r3 - 13 * 32]          ; [3]
    paddd           m4, [pd_16]
    psrld           m4, 5
    pmaddwd         m5, m13, [r3 - 13 * 32]
    paddd           m5, [pd_16]
    psrld           m5, 5
    packusdw        m4, m5

    ; Final reference position (byte shift 2).
    palignr         m2, m3, m1, 2
    palignr         m13, m0, m3, 2

    pmaddwd         m5, m2, [r3 + 14 * 32]          ; [30]
    paddd           m5, [pd_16]
    psrld           m5, 5
    pmaddwd         m6, m13, [r3 + 14 * 32]
    paddd           m6, [pd_16]
    psrld           m6, 5
    packusdw        m5, m6

    pmaddwd         m6, m2, [r3 + 9 * 32]           ; [25]
    paddd           m6, [pd_16]
    psrld           m6, 5
    pmaddwd         m8, m13, [r3 + 9 * 32]
    paddd           m8, [pd_16]
    psrld           m8, 5
    packusdw        m6, m8

    pmaddwd         m7, m2, [r3 + 4 * 32]           ; [20]
    paddd           m7, [pd_16]
    psrld           m7, 5
    pmaddwd         m8, m13, [r3 + 4 * 32]
    paddd           m8, [pd_16]
    psrld           m8, 5
    packusdw        m7, m8

    pmaddwd         m8, m2, [r3 - 1 * 32]           ; [15]
    paddd           m8, [pd_16]
    psrld           m8, 5
    pmaddwd         m9, m13, [r3 - 1 * 32]
    paddd           m9, [pd_16]
    psrld           m9, 5
    packusdw        m8, m9

    pmaddwd         m9, m2, [r3 - 6 * 32]           ; [10]
    paddd           m9, [pd_16]
    psrld           m9, 5
    pmaddwd         m10, m13, [r3 - 6 * 32]
    paddd           m10, [pd_16]
    psrld           m10, 5
    packusdw        m9, m10

    pmaddwd         m10, m2, [r3 - 11 * 32]         ; [5]
    paddd           m10, [pd_16]
    psrld           m10, 5
    pmaddwd         m12, m13, [r3 - 11 * 32]
    paddd           m12, [pd_16]
    psrld           m12, 5
    packusdw        m10, m12

    pmaddwd         m2, [r3 - 16 * 32]              ; [0]
    paddd           m2, [pd_16]
    psrld           m2, 5
    pmaddwd         m13, [r3 - 16 * 32]
    paddd           m13, [pd_16]
    psrld           m13, 5
    packusdw        m2, m13

    ; Second eight lines (m4..m10, m2); m0/m3 are scratch for the macro.
    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 3, 16
    ret
15215 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_12 -- 32x32 angular intra prediction, mode 12, 16-bit samples
;
; cglobal loads three arguments:
;   r0 = destination pointer
;   r1 = destination stride (doubled below, presumably samples -> bytes)
;   r2 = reference sample array (words)
;
; Sixteen bytes of the reference array (offsets 114..129) are replaced by the
; projected words 26, 19, 13, 6 and 0 so that the kernels can read them from
; just below the main reference at r2 + 128.  The original bytes are kept in
; 16 bytes of stack and written back before returning.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_12, 3,8,14, 0-16
    ; Back up the bytes that get overwritten.
    movu            xm0, [r2 + 114]
    mova            [rsp], xm0

    ; Gather the projected words into lanes 3..7 of xm1.  Lanes 0..2 keep
    ; whatever xm1 held; they land in bytes 114..119, which are restored below.
    pinsrw          xm1, [r2 + 52], 3               ; [26]
    pinsrw          xm1, [r2 + 38], 4               ; [19]
    pinsrw          xm1, [r2 + 26], 5               ; [13]
    pinsrw          xm1, [r2 + 12], 6               ; [6]
    pinsrw          xm1, [r2], 7                    ; [0]
    movu            [r2 + 114], xm1

    ; Kernel setup: r6d = 0 selects the mode-12 path.
    add             r1d, r1d
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r4, [r1 * 3]                    ; r4 = 3 * byte stride
    lea             r7, [r0 + 8 * r1]               ; r7 = r0 + 8 rows
    add             r2, 128
    xor             r6d, r6d

    call            ang32_mode_12_24_0_15

    lea             r0, [r0 + 32]
    call            ang32_mode_12_24_16_31

    add             r2, 32
    lea             r0, [r7 + 8 * r1]
    call            ang32_mode_12_24_0_15

    lea             r0, [r0 + 32]
    call            ang32_mode_12_24_16_31

    ; r2 = entry r2 + 160 here, so r2 - 46 is entry r2 + 114.
    mova            xm0, [rsp]
    movu            [r2 - 46], xm0
    RET
15253 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_24 -- 32x32 angular intra prediction, mode 24, 16-bit samples
;
; cglobal loads three arguments:
;   r0 = destination pointer
;   r1 = destination stride (doubled below, presumably samples -> bytes)
;   r2 = reference sample array (words)
;
; The 16 bytes directly below r2 are temporarily replaced by the projected
; words taken from offsets 140, 154, 166 and 180 of the array.  The original
; bytes are kept in 16 bytes of stack and written back before returning.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_24, 3,7,14, 0-16
    ; Back up the bytes that get overwritten.
    movu            xm0, [r2 - 16]
    mova            [rsp], xm0

    ; Gather the projected words into lanes 4..7 of xm1.  Lanes 0..3 keep
    ; whatever xm1 held; they land in bytes -16..-9, which are restored below.
    pinsrw          xm1, [r2 + 180], 4              ; [26]
    pinsrw          xm1, [r2 + 166], 5              ; [19]
    pinsrw          xm1, [r2 + 154], 6              ; [13]
    pinsrw          xm1, [r2 + 140], 7              ; [6]
    movu            [r2 - 16], xm1

    ; Kernel setup: r6d = 1 selects the mode-24 path.
    add             r1d, r1d
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r4, [r1 * 3]                    ; r4 = 3 * byte stride
    lea             r5, [r0 + 32]                   ; start of the second pair of calls
    mov             r6d, 1

    call            ang32_mode_12_24_0_15
    call            ang32_mode_12_24_16_31

    add             r2, 32
    mov             r0, r5
    call            ang32_mode_12_24_0_15
    call            ang32_mode_12_24_16_31

    ; r2 = entry r2 + 32 here, so r2 - 48 is entry r2 - 16.
    mova            xm0, [rsp]
    movu            [r2 - 48], xm0
    RET
15286 | |
15287 ;; angle 32, modes 13 and 23, row 0 to 15 | |
; Shared kernel called by intra_pred_ang32_13 and intra_pred_ang32_23.
; Register contract, as set up by those two callers in this file:
;   r2  = reference samples (16-bit words); the callers store projected
;         side-reference words just below r2 before calling
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] selects table row 16 + k
;   r1  = destination stride doubled by the caller, r4 = 3 * r1
;   r6d = 0 for mode 13, 1 for mode 23
; r0/r1/r4 are not touched here; they are presumably consumed by
; TRANSPOSE_STORE_AVX2, which is defined outside this section.
; Every 7-instruction group produces one line of 16 output words:
;   (pmaddwd(pairs, table row) + pd_16) >> 5, packed back to words.
; The bracketed [n] on each group is the table row it uses.
15288 cglobal ang32_mode_13_23_row_0_15 | |
; Only sets ZF; no instruction below modifies the flags, so the result is
; still available to the store macro (presumably branched on there -- confirm).
15289 test r6d, r6d | |
15290 | |
15291 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
15292 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
15293 | |
15294 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
15295 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] | |
15296 | |
15297 pmaddwd m4, m3, [r3 + 7 * 32] ; [23] | |
15298 paddd m4, [pd_16] | |
15299 psrld m4, 5 | |
15300 pmaddwd m5, m2, [r3 + 7 * 32] | |
15301 paddd m5, [pd_16] | |
15302 psrld m5, 5 | |
15303 packusdw m4, m5 | |
15304 | |
15305 pmaddwd m5, m3, [r3 - 2 * 32] ; [14] | |
15306 paddd m5, [pd_16] | |
15307 psrld m5, 5 | |
15308 pmaddwd m6, m2, [r3 - 2 * 32] | |
15309 paddd m6, [pd_16] | |
15310 psrld m6, 5 | |
15311 packusdw m5, m6 | |
15312 | |
15313 pmaddwd m6, m3, [r3 - 11 * 32] ; [5] | |
15314 paddd m6, [pd_16] | |
15315 psrld m6, 5 | |
15316 pmaddwd m2, [r3 - 11 * 32] | |
15317 paddd m2, [pd_16] | |
15318 psrld m2, 5 | |
15319 packusdw m6, m2 | |
15320 | |
; Build the extended reference: the four words stored below r2 are doubled by
; the pw_ang32_12_24 shuffle (table defined outside this section; see the lane
; comment on vinserti128) and placed in the low lane of m1, with the doubled
; words 4..7 in the high lane.  palignr then slides the sample pairs.
15321 movu xm1, [r2 - 8] | |
15322 pshufb xm1, [pw_ang32_12_24] | |
15323 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
15324 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] | |
15325 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14] | |
15326 | |
15327 palignr m2, m3, m1, 14 | |
15328 palignr m13, m0, m3, 14 | |
15329 | |
15330 pmaddwd m7, m2, [r3 + 12 * 32] ; [28] | |
15331 paddd m7, [pd_16] | |
15332 psrld m7, 5 | |
15333 pmaddwd m8, m13, [r3 + 12 * 32] | |
15334 paddd m8, [pd_16] | |
15335 psrld m8, 5 | |
15336 packusdw m7, m8 | |
15337 | |
15338 pmaddwd m8, m2, [r3 + 3 * 32] ; [19] | |
15339 paddd m8, [pd_16] | |
15340 psrld m8, 5 | |
15341 pmaddwd m9, m13, [r3 + 3 * 32] | |
15342 paddd m9, [pd_16] | |
15343 psrld m9, 5 | |
15344 packusdw m8, m9 | |
15345 | |
15346 pmaddwd m9, m2, [r3 - 6 * 32] ; [10] | |
15347 paddd m9, [pd_16] | |
15348 psrld m9, 5 | |
15349 pmaddwd m10, m13, [r3 - 6 * 32] | |
15350 paddd m10, [pd_16] | |
15351 psrld m10, 5 | |
15352 packusdw m9, m10 | |
15353 | |
15354 pmaddwd m10, m2, [r3 - 15 * 32] ; [1] | |
15355 paddd m10, [pd_16] | |
15356 psrld m10, 5 | |
15357 pmaddwd m12, m13, [r3 - 15 * 32] | |
15358 paddd m12, [pd_16] | |
15359 psrld m12, 5 | |
15360 packusdw m10, m12 | |
15361 | |
; Next reference position (byte shift 10).
15362 palignr m2, m3, m1, 10 | |
15363 palignr m13, m0, m3, 10 | |
15364 | |
15365 pmaddwd m11, m2, [r3 + 8 * 32] ; [24] | |
15366 paddd m11, [pd_16] | |
15367 psrld m11, 5 | |
15368 pmaddwd m13, [r3 + 8 * 32] | |
15369 paddd m13, [pd_16] | |
15370 psrld m13, 5 | |
15371 packusdw m11, m13 | |
15372 | |
; First eight lines (m4..m11) are stored; m12/m13 are passed as scratch.
15373 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
15374 | |
; m13 was consumed by the [24] group above and handed to the macro as scratch,
; so the high-half pair vector is rebuilt here.
15375 palignr m13, m0, m3, 10 | |
15376 | |
15377 pmaddwd m4, m2, [r3 - 1 * 32] ; [15] | |
15378 paddd m4, [pd_16] | |
15379 psrld m4, 5 | |
15380 pmaddwd m5, m13, [r3 - 1 * 32] | |
15381 paddd m5, [pd_16] | |
15382 psrld m5, 5 | |
15383 packusdw m4, m5 | |
15384 | |
15385 pmaddwd m5, m2, [r3 - 10 * 32] ; [6] | |
15386 paddd m5, [pd_16] | |
15387 psrld m5, 5 | |
15388 pmaddwd m6, m13, [r3 - 10 * 32] | |
15389 paddd m6, [pd_16] | |
15390 psrld m6, 5 | |
15391 packusdw m5, m6 | |
15392 | |
; Next reference position (byte shift 6).
15393 palignr m2, m3, m1, 6 | |
15394 palignr m13, m0, m3, 6 | |
15395 | |
15396 pmaddwd m6, m2, [r3 + 13 * 32] ; [29] | |
15397 paddd m6, [pd_16] | |
15398 psrld m6, 5 | |
15399 pmaddwd m8, m13, [r3 + 13 * 32] | |
15400 paddd m8, [pd_16] | |
15401 psrld m8, 5 | |
15402 packusdw m6, m8 | |
15403 | |
15404 pmaddwd m7, m2, [r3 + 4 * 32] ; [20] | |
15405 paddd m7, [pd_16] | |
15406 psrld m7, 5 | |
15407 pmaddwd m8, m13, [r3 + 4 * 32] | |
15408 paddd m8, [pd_16] | |
15409 psrld m8, 5 | |
15410 packusdw m7, m8 | |
15411 | |
15412 pmaddwd m8, m2, [r3 - 5 * 32] ; [11] | |
15413 paddd m8, [pd_16] | |
15414 psrld m8, 5 | |
15415 pmaddwd m9, m13, [r3 - 5 * 32] | |
15416 paddd m9, [pd_16] | |
15417 psrld m9, 5 | |
15418 packusdw m8, m9 | |
15419 | |
15420 pmaddwd m9, m2, [r3 - 14 * 32] ; [2] | |
15421 paddd m9, [pd_16] | |
15422 psrld m9, 5 | |
15423 pmaddwd m13, [r3 - 14 * 32] | |
15424 paddd m13, [pd_16] | |
15425 psrld m13, 5 | |
15426 packusdw m9, m13 | |
15427 | |
; Final reference position (byte shift 2).  m0 and m3 are overwritten here,
; and m1 is reused as an output register by the [25] group below.
15428 palignr m0, m3, 2 | |
15429 palignr m3, m1, 2 | |
15430 | |
15431 pmaddwd m1, m3, [r3 + 9 * 32] ; [25] | |
15432 paddd m1, [pd_16] | |
15433 psrld m1, 5 | |
15434 pmaddwd m2, m0, [r3 + 9 * 32] | |
15435 paddd m2, [pd_16] | |
15436 psrld m2, 5 | |
15437 packusdw m1, m2 | |
15438 | |
15439 pmaddwd m3, [r3] ; [16] | |
15440 paddd m3, [pd_16] | |
15441 psrld m3, 5 | |
15442 pmaddwd m0, [r3] | |
15443 paddd m0, [pd_16] | |
15444 psrld m0, 5 | |
15445 packusdw m3, m0 | |
; Second eight lines (m4..m9, m1, m3); m0/m2 are passed as scratch.
15446 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 | |
15447 ret | |
15448 | |
15449 ;; angle 32, modes 13 and 23, row 16 to 31 | |
; Shared kernel called by intra_pred_ang32_13 and intra_pred_ang32_23 after
; ang32_mode_13_23_row_0_15.  Both callers move r2 back by 8 bytes (4 words)
; first, so the vector loaded from [r2] already starts with four projected
; side-reference words (see the lane comments on the two loads below).
; Register contract, as set up by those two callers in this file:
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] selects table row 16 + k
;   r1  = destination stride doubled by the caller, r4 = 3 * r1
;   r6d = 0 for mode 13, 1 for mode 23
; r0/r1/r4 are not touched here; they are presumably consumed by
; TRANSPOSE_STORE_AVX2, which is defined outside this section.
; Every 7-instruction group produces one line of 16 output words:
;   (pmaddwd(pairs, table row) + pd_16) >> 5, packed back to words.
; The bracketed [n] on each group is the table row it uses.
15450 cglobal ang32_mode_13_23_row_16_31 | |
; Only sets ZF; no instruction below modifies the flags, so the result is
; still available to the store macro (presumably branched on there -- confirm).
15451 test r6d, r6d | |
15452 | |
15453 movu m0, [r2] ; [11 10 9 8 7 6 5 4 3 2 1 0 4 7 11 14] | |
15454 movu m5, [r2 + 2] ; [12 11 10 9 8 7 6 5 4 3 2 1 0 4 7 11] | |
15455 | |
15456 punpcklwd m4, m0, m5 ; [ 8 7 7 6 6 5 5 4 0 4 4 7 7 11 11 14] | |
15457 punpckhwd m2, m0, m5 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] | |
15458 | |
15459 pmaddwd m4, [r3 - 9 * 32] ; [7] | |
15460 paddd m4, [pd_16] | |
15461 psrld m4, 5 | |
15462 pmaddwd m2, [r3 - 9 * 32] | |
15463 paddd m2, [pd_16] | |
15464 psrld m2, 5 | |
15465 packusdw m4, m2 | |
15466 | |
; Build the extended reference: the four words stored below r2 are doubled by
; the pw_ang32_12_24 shuffle (table defined outside this section; see the lane
; comments) and placed in the low lane of m1, with the doubled upper half of
; xm0 in the high lane.  palignr then slides the sample pairs.
15467 movu xm1, [r2 - 8] | |
15468 pshufb xm1, [pw_ang32_12_24] ; [18 18 21 21 25 25 28 28] | |
15469 punpcklwd m3, m0, m0 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14] | |
15470 punpckhwd m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] | |
15471 vinserti128 m1, m1, xm0, 1 ; [ 3 3 2 2 1 1 0 0 18 18 21 21 25 25 28 28] | |
15472 | |
15473 palignr m2, m3, m1, 14 | |
15474 palignr m13, m0, m3, 14 | |
15475 | |
15476 pmaddwd m5, m2, [r3 + 14 * 32] ; [30] | |
15477 paddd m5, [pd_16] | |
15478 psrld m5, 5 | |
15479 pmaddwd m6, m13, [r3 + 14 * 32] | |
15480 paddd m6, [pd_16] | |
15481 psrld m6, 5 | |
15482 packusdw m5, m6 | |
15483 | |
15484 pmaddwd m6, m2, [r3 + 5 * 32] ; [21] | |
15485 paddd m6, [pd_16] | |
15486 psrld m6, 5 | |
15487 pmaddwd m7, m13, [r3 + 5 * 32] | |
15488 paddd m7, [pd_16] | |
15489 psrld m7, 5 | |
15490 packusdw m6, m7 | |
15491 | |
15492 pmaddwd m7, m2, [r3 - 4 * 32] ; [12] | |
15493 paddd m7, [pd_16] | |
15494 psrld m7, 5 | |
15495 pmaddwd m8, m13, [r3 - 4 * 32] | |
15496 paddd m8, [pd_16] | |
15497 psrld m8, 5 | |
15498 packusdw m7, m8 | |
15499 | |
15500 pmaddwd m8, m2, [r3 - 13 * 32] ; [3] | |
15501 paddd m8, [pd_16] | |
15502 psrld m8, 5 | |
15503 pmaddwd m9, m13, [r3 - 13 * 32] | |
15504 paddd m9, [pd_16] | |
15505 psrld m9, 5 | |
15506 packusdw m8, m9 | |
15507 | |
; Next reference position (byte shift 10).
15508 palignr m2, m3, m1, 10 | |
15509 palignr m13, m0, m3, 10 | |
15510 | |
15511 pmaddwd m9, m2, [r3 + 10 * 32] ; [26] | |
15512 paddd m9, [pd_16] | |
15513 psrld m9, 5 | |
15514 pmaddwd m10, m13, [r3 + 10 * 32] | |
15515 paddd m10, [pd_16] | |
15516 psrld m10, 5 | |
15517 packusdw m9, m10 | |
15518 | |
15519 pmaddwd m10, m2, [r3 + 1 * 32] ; [17] | |
15520 paddd m10, [pd_16] | |
15521 psrld m10, 5 | |
15522 pmaddwd m12, m13, [r3 + 1 * 32] | |
15523 paddd m12, [pd_16] | |
15524 psrld m12, 5 | |
15525 packusdw m10, m12 | |
15526 | |
15527 pmaddwd m11, m2, [r3 - 8 * 32] ; [8] | |
15528 paddd m11, [pd_16] | |
15529 psrld m11, 5 | |
15530 pmaddwd m13, [r3 - 8 * 32] | |
15531 paddd m13, [pd_16] | |
15532 psrld m13, 5 | |
15533 packusdw m11, m13 | |
15534 | |
; First eight lines (m4..m11) are stored; m12/m13 are passed as scratch.
15535 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 | |
15536 | |
; Next reference position (byte shift 6); both pair vectors are rebuilt.
15537 palignr m2, m3, m1, 6 | |
15538 palignr m13, m0, m3, 6 | |
15539 | |
15540 pmaddwd m4, m2, [r3 + 15 * 32] ; [31] | |
15541 paddd m4, [pd_16] | |
15542 psrld m4, 5 | |
15543 pmaddwd m5, m13, [r3 + 15 * 32] | |
15544 paddd m5, [pd_16] | |
15545 psrld m5, 5 | |
15546 packusdw m4, m5 | |
15547 | |
15548 pmaddwd m5, m2, [r3 + 6 * 32] ; [22] | |
15549 paddd m5, [pd_16] | |
15550 psrld m5, 5 | |
15551 pmaddwd m6, m13, [r3 + 6 * 32] | |
15552 paddd m6, [pd_16] | |
15553 psrld m6, 5 | |
15554 packusdw m5, m6 | |
15555 | |
15556 pmaddwd m6, m2, [r3 - 3 * 32] ; [13] | |
15557 paddd m6, [pd_16] | |
15558 psrld m6, 5 | |
15559 pmaddwd m8, m13, [r3 - 3 * 32] | |
15560 paddd m8, [pd_16] | |
15561 psrld m8, 5 | |
15562 packusdw m6, m8 | |
15563 | |
15564 pmaddwd m7, m2, [r3 - 12 * 32] ; [4] | |
15565 paddd m7, [pd_16] | |
15566 psrld m7, 5 | |
15567 pmaddwd m8, m13, [r3 - 12 * 32] | |
15568 paddd m8, [pd_16] | |
15569 psrld m8, 5 | |
15570 packusdw m7, m8 | |
15571 | |
; Final reference position (byte shift 2).  m0 and m3 are overwritten here,
; and m1 is reused as an output register by the [9] group below.
15572 palignr m0, m3, 2 | |
15573 palignr m3, m1, 2 | |
15574 | |
15575 pmaddwd m8, m3, [r3 + 11 * 32] ; [27] | |
15576 paddd m8, [pd_16] | |
15577 psrld m8, 5 | |
15578 pmaddwd m9, m0, [r3 + 11 * 32] | |
15579 paddd m9, [pd_16] | |
15580 psrld m9, 5 | |
15581 packusdw m8, m9 | |
15582 | |
15583 pmaddwd m9, m3, [r3 + 2 * 32] ; [18] | |
15584 paddd m9, [pd_16] | |
15585 psrld m9, 5 | |
15586 pmaddwd m10, m0, [r3 + 2 * 32] | |
15587 paddd m10, [pd_16] | |
15588 psrld m10, 5 | |
15589 packusdw m9, m10 | |
15590 | |
15591 pmaddwd m1, m3, [r3 - 7 * 32] ; [9] | |
15592 paddd m1, [pd_16] | |
15593 psrld m1, 5 | |
15594 pmaddwd m2, m0, [r3 - 7 * 32] | |
15595 paddd m2, [pd_16] | |
15596 psrld m2, 5 | |
15597 packusdw m1, m2 | |
15598 | |
15599 pmaddwd m3, [r3 - 16 * 32] ; [0] | |
15600 paddd m3, [pd_16] | |
15601 psrld m3, 5 | |
15602 pmaddwd m0, [r3 - 16 * 32] | |
15603 paddd m0, [pd_16] | |
15604 psrld m0, 5 | |
15605 packusdw m3, m0 | |
; Second eight lines (m4..m9, m1, m3); m0/m2 are passed as scratch.
15606 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 | |
15607 ret | |
15608 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_13 -- 32x32 angular intra prediction, mode 13, 16-bit samples
;
; cglobal loads three arguments:
;   r0 = destination pointer
;   r1 = destination stride (doubled below, presumably samples -> bytes)
;   r2 = reference sample array (words)
;
; Bytes 112..129 of the reference array are temporarily replaced by eight
; projected words plus word 0, so the kernels can read them from just below
; the main reference at r2 + 128.  mmsize bytes of stack keep the original
; contents of offsets 112..143, which are written back before returning.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_13, 3,8,14, 0-mmsize
    ; Back up the region that gets overwritten.
    movu            m0, [r2 + 112]
    mova            [rsp], m0

    ; First group of projected words: picked from words 4..11 by the
    ; pw_ang32_13_23 shuffle (table defined outside this section), then
    ; word 14 is inserted into lane 4.
    movu            xm1, [r2 + 8]
    pshufb          xm1, [pw_ang32_13_23]
    pinsrw          xm1, [r2 + 28], 4

    ; Second group: same shuffle on words 18..25, then word 28 into lane 4.
    movu            xm2, [r2 + 36]
    pshufb          xm2, [pw_ang32_13_23]
    pinsrw          xm2, [r2 + 56], 4

    ; Keep the upper halves only: low qword <- xm2, high qword <- xm1.
    punpckhqdq      xm2, xm1                        ; [ 4  7 11 14 18 21 25 28] (expected; depends on the shuffle table)

    ; Patch the reference array: word 64 <- word 0, words 56..63 <- xm2.
    movzx           r6d, word [r2]
    mov             [r2 + 128], r6w
    movu            [r2 + 112], xm2

    ; Kernel setup: r6d = 0 selects the mode-13 path.
    add             r1d, r1d
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r4, [r1 * 3]                    ; r4 = 3 * byte stride
    lea             r7, [r0 + 8 * r1]               ; r7 = r0 + 8 rows
    add             r2, 128
    xor             r6d, r6d

    call            ang32_mode_13_23_row_0_15

    sub             r2, 8
    lea             r0, [r0 + 32]
    call            ang32_mode_13_23_row_16_31

    add             r2, 40
    lea             r0, [r7 + 8 * r1]
    call            ang32_mode_13_23_row_0_15

    sub             r2, 8
    lea             r0, [r0 + 32]
    call            ang32_mode_13_23_row_16_31

    ; r2 = entry r2 + 152 here, so r2 - 40 is entry r2 + 112.
    mova            m0, [rsp]
    movu            [r2 - 40], m0
    RET
15653 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_23 -- 32x32 angular intra prediction, mode 23, 16-bit samples
;
; cglobal loads three arguments:
;   r0 = destination pointer
;   r1 = destination stride (doubled below, presumably samples -> bytes)
;   r2 = reference sample array (words)
;
; The 16 bytes directly below r2 are temporarily replaced by eight projected
; words gathered from offsets 136..185 of the array.  The original bytes are
; kept in 16 bytes of stack and written back before returning.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_23, 3,7,14, 0-16
    ; Back up the bytes that get overwritten.
    movu            xm0, [r2 - 16]
    mova            [rsp], xm0

    ; First group of projected words: picked from the 8 words at r2 + 136 by
    ; the pw_ang32_13_23 shuffle (table defined outside this section), then
    ; the word at r2 + 156 is inserted into lane 4.
    movu            xm1, [r2 + 136]
    pshufb          xm1, [pw_ang32_13_23]
    pinsrw          xm1, [r2 + 156], 4

    ; Second group: same shuffle on the 8 words at r2 + 164, then the word at
    ; r2 + 184 into lane 4.
    movu            xm2, [r2 + 164]
    pshufb          xm2, [pw_ang32_13_23]
    pinsrw          xm2, [r2 + 184], 4

    ; Keep the upper halves only: low qword <- xm2, high qword <- xm1.
    punpckhqdq      xm2, xm1                        ; [ 4  7 11 14 18 21 25 28] (expected; depends on the shuffle table)

    movu            [r2 - 16], xm2

    ; Kernel setup: r6d = 1 selects the mode-23 path.
    add             r1d, r1d
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r4, [r1 * 3]                    ; r4 = 3 * byte stride
    lea             r5, [r0 + 32]                   ; start of the second pair of calls
    mov             r6d, 1

    call            ang32_mode_13_23_row_0_15

    sub             r2, 8
    call            ang32_mode_13_23_row_16_31

    add             r2, 40
    mov             r0, r5
    call            ang32_mode_13_23_row_0_15

    sub             r2, 8
    call            ang32_mode_13_23_row_16_31

    ; r2 = entry r2 + 24 here, so r2 - 40 is entry r2 - 16.
    mova            xm0, [rsp]
    movu            [r2 - 40], xm0
    RET
15694 | |
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_AVX2_STACK  %1..%8, %9, %10, %11
;
;   %1..%8  ymm register numbers holding eight computed lines of 16 words
;   %9, %10 ymm register numbers used as scratch by the transpose path
;   %11     byte offset added to the transposed stores; also the suffix of the
;           local labels, so two expansions inside one function must use
;           different values (this file uses 16 and 0)
;
; Input state: ZF from an earlier `test r6d, r6d` (no flag-writing instruction
; may sit between that test and this macro), r0 = output pointer,
; r1 = byte stride, r4 = 3 * r1.
;
;   ZF = 1 (r6d == 0): the eight lines are transposed and written as 16 rows
;       of 8 words each at [r0 + row * r1 + %11].  Clobbers r5 and m%1..m%10.
;   ZF = 0 (r6d != 0): the eight lines are written untransposed as 32-byte
;       rows, starting at r0 + 8 * r1 when %11 == 16 and at r0 otherwise.
;       Clobbers r7, so the enclosing function needs r7 to be available
;       (presumably by declaring at least 8 GPRs in cglobal -- confirm at the
;       call sites outside this section).
;-----------------------------------------------------------------------------
%macro TRANSPOSE_STORE_AVX2_STACK 11
    jnz             .skip%11

    ; words -> dwords, first four lines
    punpckhwd       m%9, m%1, m%2
    punpcklwd       m%1, m%2
    punpckhwd       m%2, m%3, m%4
    punpcklwd       m%3, m%4

    punpckldq       m%4, m%1, m%3
    punpckhdq       m%1, m%3
    punpckldq       m%3, m%9, m%2
    punpckhdq       m%9, m%2

    ; words -> dwords, last four lines
    punpckhwd       m%10, m%5, m%6
    punpcklwd       m%5, m%6
    punpckhwd       m%6, m%7, m%8
    punpcklwd       m%7, m%8

    punpckldq       m%8, m%5, m%7
    punpckhdq       m%5, m%7
    punpckldq       m%7, m%10, m%6
    punpckhdq       m%10, m%6

    ; dwords -> qwords: every ymm now holds two output rows (one per lane)
    punpcklqdq      m%6, m%4, m%8
    punpckhqdq      m%2, m%4, m%8
    punpcklqdq      m%4, m%1, m%5
    punpckhqdq      m%8, m%1, m%5

    punpcklqdq      m%1, m%3, m%7
    punpckhqdq      m%5, m%3, m%7
    punpcklqdq      m%3, m%9, m%10
    punpckhqdq      m%7, m%9, m%10

    ; rows 0..7 come from the low lanes
    movu            [r0 + r1 * 0 + %11], xm%6
    movu            [r0 + r1 * 1 + %11], xm%2
    movu            [r0 + r1 * 2 + %11], xm%4
    movu            [r0 + r4 * 1 + %11], xm%8

    lea             r5, [r0 + r1 * 4]
    movu            [r5 + r1 * 0 + %11], xm%1
    movu            [r5 + r1 * 1 + %11], xm%5
    movu            [r5 + r1 * 2 + %11], xm%3
    movu            [r5 + r4 * 1 + %11], xm%7

    ; rows 8..15 come from the high lanes
    lea             r5, [r5 + r1 * 4]
    vextracti128    [r5 + r1 * 0 + %11], m%6, 1
    vextracti128    [r5 + r1 * 1 + %11], m%2, 1
    vextracti128    [r5 + r1 * 2 + %11], m%4, 1
    vextracti128    [r5 + r4 * 1 + %11], m%8, 1

    lea             r5, [r5 + r1 * 4]
    vextracti128    [r5 + r1 * 0 + %11], m%1, 1
    vextracti128    [r5 + r1 * 1 + %11], m%5, 1
    vextracti128    [r5 + r1 * 2 + %11], m%3, 1
    vextracti128    [r5 + r4 * 1 + %11], m%7, 1
    jmp             .end%11

.skip%11:
    ; Untransposed path: pick the first of the eight destination rows.
%if %11 == 16
    lea             r7, [r0 + 8 * r1]
%else
    mov             r7, r0
%endif
    movu            [r7 + r1 * 0], m%1
    movu            [r7 + r1 * 1], m%2
    movu            [r7 + r1 * 2], m%3
    movu            [r7 + r4 * 1], m%4

    ; The advance to the next four rows is the same for every %11.
    lea             r7, [r7 + r1 * 4]
    movu            [r7 + r1 * 0], m%5
    movu            [r7 + r1 * 1], m%6
    movu            [r7 + r1 * 2], m%7
    movu            [r7 + r4 * 1], m%8
.end%11:
%endmacro
15772 | |
15773 ;; angle 32, modes 14 and 22, row 0 to 15 | |
; Kernel for the first 16 lines of the mode 14 / mode 22 predictors.  Its
; callers are outside this section, so the contract below is read off the
; code and TRANSPOSE_STORE_AVX2_STACK only:
;   r2  = reference samples (16-bit words); words from r2 - 12 up to
;         r2 + 37 are read, so the caller presumably stores projected
;         side-reference words below r2 beforehand -- confirm at the callers
;   r3  = weight-table pointer; the bracketed [n] equals 16 + the r3 offset,
;         which matches r3 = ang_table_avx2 + 16 * 32 as used by the other
;         ang32 wrappers in this file (TODO confirm)
;   r0, r1, r4 = output pointer, byte stride, 3 * stride (used by the macro)
;   r6d = path selector; the macro branches on ZF from the test below and
;         clobbers r5 (r6d == 0) or r7 (r6d != 0)
; Every group produces one line of 16 output words:
;   (pmaddwd(pairs, table row) + pd_16) >> 5, packed back to words.
15774 cglobal ang32_mode_14_22_rows_0_15 | |
; Only sets ZF; no instruction before the macro expansions modifies the flags.
15775 test r6d, r6d | |
15776 | |
; m3/m0 and m2/m1 hold interleaved neighbouring words (low / high unpack of
; each lane) for two 16-word windows that start 8 words apart.
15777 movu m0, [r2 - 12] | |
15778 movu m1, [r2 - 10] | |
15779 | |
15780 punpcklwd m3, m0, m1 | |
15781 punpckhwd m0, m1 | |
15782 | |
15783 movu m1, [r2 + 4] | |
15784 movu m4, [r2 + 6] | |
15785 punpcklwd m2, m1, m4 | |
15786 punpckhwd m1, m4 | |
15787 | |
15788 pmaddwd m4, m3, [r3] ; [16] | |
15789 paddd m4, [pd_16] | |
15790 psrld m4, 5 | |
15791 pmaddwd m5, m0, [r3] | |
15792 paddd m5, [pd_16] | |
15793 psrld m5, 5 | |
15794 packusdw m4, m5 | |
15795 | |
15796 pmaddwd m5, m3, [r3 + 13 * 32] ; [29] | |
15797 paddd m5, [pd_16] | |
15798 psrld m5, 5 | |
15799 pmaddwd m8, m0, [r3 + 13 * 32] | |
15800 paddd m8, [pd_16] | |
15801 psrld m8, 5 | |
15802 packusdw m5, m8 | |
15803 | |
; palignr by 4 / 8 / 12 bytes advances the pair window by 1 / 2 / 3 words.
15804 palignr m7, m0, m3, 4 | |
15805 pmaddwd m6, m7, [r3 - 6 * 32] ; [10] | |
15806 paddd m6, [pd_16] | |
15807 psrld m6, 5 | |
15808 palignr m8, m2, m0, 4 | |
15809 pmaddwd m9, m8, [r3 - 6 * 32] | |
15810 paddd m9, [pd_16] | |
15811 psrld m9, 5 | |
15812 packusdw m6, m9 | |
15813 | |
15814 pmaddwd m7, [r3 + 7 * 32] ; [23] | |
15815 paddd m7, [pd_16] | |
15816 psrld m7, 5 | |
15817 pmaddwd m8, [r3 + 7 * 32] | |
15818 paddd m8, [pd_16] | |
15819 psrld m8, 5 | |
15820 packusdw m7, m8 | |
15821 | |
15822 palignr m10, m0, m3, 8 | |
15823 pmaddwd m8, m10, [r3 - 12 * 32] ; [4] | |
15824 paddd m8, [pd_16] | |
15825 psrld m8, 5 | |
15826 palignr m12, m2, m0, 8 | |
15827 pmaddwd m9, m12, [r3 - 12 * 32] | |
15828 paddd m9, [pd_16] | |
15829 psrld m9, 5 | |
15830 packusdw m8, m9 | |
15831 | |
15832 pmaddwd m9, m10, [r3 + 1 * 32] ; [17] | |
15833 paddd m9, [pd_16] | |
15834 psrld m9, 5 | |
15835 pmaddwd m11, m12, [r3 + 1 * 32] | |
15836 paddd m11, [pd_16] | |
15837 psrld m11, 5 | |
15838 packusdw m9, m11 | |
15839 | |
15840 pmaddwd m10, [r3 + 14 * 32] ; [30] | |
15841 paddd m10, [pd_16] | |
15842 psrld m10, 5 | |
15843 pmaddwd m12, [r3 + 14 * 32] | |
15844 paddd m12, [pd_16] | |
15845 psrld m12, 5 | |
15846 packusdw m10, m12 | |
15847 | |
15848 palignr m11, m0, m3, 12 | |
15849 pmaddwd m11, [r3 - 5 * 32] ; [11] | |
15850 paddd m11, [pd_16] | |
15851 psrld m11, 5 | |
15852 palignr m12, m2, m0, 12 | |
15853 pmaddwd m12, [r3 - 5 * 32] | |
15854 paddd m12, [pd_16] | |
15855 psrld m12, 5 | |
15856 packusdw m11, m12 | |
15857 | |
; The eight lines are handed over in reverse register order (m11 first,
; m4 last); m12/m13 are scratch.
15858 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 | |
15859 | |
; Same 3-word shift as the [11] group; recomputed because m11/m12 were
; consumed above.
15860 palignr m4, m0, m3, 12 | |
15861 pmaddwd m4, [r3 + 8 * 32] ; [24] | |
15862 paddd m4, [pd_16] | |
15863 psrld m4, 5 | |
15864 palignr m5, m2, m0, 12 | |
15865 pmaddwd m5, [r3 + 8 * 32] | |
15866 paddd m5, [pd_16] | |
15867 psrld m5, 5 | |
15868 packusdw m4, m5 | |
15869 | |
; m3 is reused as a temporary from here on; the pairs now come from m0 / m2.
15870 pmaddwd m5, m0, [r3 - 11 * 32] ; [5] | |
15871 paddd m5, [pd_16] | |
15872 psrld m5, 5 | |
15873 pmaddwd m3, m2, [r3 - 11 * 32] | |
15874 paddd m3, [pd_16] | |
15875 psrld m3, 5 | |
15876 packusdw m5, m3 | |
15877 | |
15878 pmaddwd m6, m0, [r3 + 2 * 32] ; [18] | |
15879 paddd m6, [pd_16] | |
15880 psrld m6, 5 | |
15881 pmaddwd m7, m2, [r3 + 2 * 32] | |
15882 paddd m7, [pd_16] | |
15883 psrld m7, 5 | |
15884 packusdw m6, m7 | |
15885 | |
15886 pmaddwd m7, m0, [r3 + 15 * 32] ; [31] | |
15887 paddd m7, [pd_16] | |
15888 psrld m7, 5 | |
15889 pmaddwd m3, m2, [r3 + 15 * 32] | |
15890 paddd m3, [pd_16] | |
15891 psrld m3, 5 | |
15892 packusdw m7, m3 | |
15893 | |
15894 palignr m9, m2, m0, 4 | |
15895 palignr m10, m1, m2, 4 | |
15896 pmaddwd m8, m9, [r3 - 4 * 32] ; [12] | |
15897 paddd m8, [pd_16] | |
15898 psrld m8, 5 | |
15899 pmaddwd m11, m10, [r3 - 4 * 32] | |
15900 paddd m11, [pd_16] | |
15901 psrld m11, 5 | |
15902 packusdw m8, m11 | |
15903 | |
15904 pmaddwd m9, [r3 + 9 * 32] ; [25] | |
15905 paddd m9, [pd_16] | |
15906 psrld m9, 5 | |
15907 pmaddwd m10, [r3 + 9 * 32] | |
15908 paddd m10, [pd_16] | |
15909 psrld m10, 5 | |
15910 packusdw m9, m10 | |
15911 | |
; m1 and m2 are overwritten with the 2-word-shifted pair vectors.
15912 palignr m1, m2, 8 | |
15913 palignr m2, m0, 8 | |
15914 | |
15915 pmaddwd m10, m2, [r3 - 10 * 32] ; [6] | |
15916 paddd m10, [pd_16] | |
15917 psrld m10, 5 | |
15918 pmaddwd m12, m1, [r3 - 10 * 32] | |
15919 paddd m12, [pd_16] | |
15920 psrld m12, 5 | |
15921 packusdw m10, m12 | |
15922 | |
15923 pmaddwd m2, [r3 + 3 * 32] ; [19] | |
15924 paddd m2, [pd_16] | |
15925 psrld m2, 5 | |
15926 pmaddwd m1, [r3 + 3 * 32] | |
15927 paddd m1, [pd_16] | |
15928 psrld m1, 5 | |
15929 packusdw m2, m1 | |
; Second eight lines, again in reverse register order (m2 first, m4 last);
; m0/m1 are scratch.
15930 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 | |
15931 ret | |
15932 | |
15933 ;; angle 32, modes 14 and 22, rows 16 to 31 | |
; Helper shared by intra_pred_ang32_14 and intra_pred_ang32_22 (both call it).
; Inputs as prepared by those callers:
;   r2  - points into the reference copy staged on the caller's stack; all
;         loads here use negative offsets (r2 - 24 .. r2 - 6)
;   r3  - ang_table_avx2 + 16 * 32, so [r3 + k * 32] addresses table row 16 + k;
;         the bracketed number after each first pmaddwd is that row index
;   r6d - 0 when called for mode 14, 1 when called for mode 22
; r0/r1/r4 are not referenced directly in this block; presumably the store
; macro TRANSPOSE_STORE_AVX2_STACK (defined elsewhere) uses them - TODO confirm.
; Every output register follows the same recipe: pmaddwd of interleaved sample
; pairs with one table row, add [pd_16], shift right by 5, then packusdw the
; two half results into one register of words.
15934 cglobal ang32_mode_14_22_rows_16_31 | |
; Flags set here are not consumed by any instruction visible in this block;
; presumably the store macro branches on them (the SIMD instructions in
; between do not modify EFLAGS) - TODO confirm against the macro.
15935 test r6d, r6d | |
15936 | |
; m0/m1: samples starting at r2 - 24 and one word later at r2 - 22
15937 movu m0, [r2 - 24] | |
15938 movu m1, [r2 - 22] | |
15939 | |
; interleave into (sample, next sample) word pairs, per 128-bit lane:
; m3 = pairs from the low quadwords, m0 = pairs from the high quadwords
15940 punpcklwd m3, m0, m1 | |
15941 punpckhwd m0, m1 | |
15942 | |
; same pairing for the window 16 bytes further on: m2 = low pairs, m1 = high pairs
15943 movu m1, [r2 - 8] | |
15944 movu m4, [r2 - 6] | |
15945 punpcklwd m2, m1, m4 | |
15946 punpckhwd m1, m4 | |
15947 | |
15948 pmaddwd m4, m3, [r3 - 16 * 32] ; [0] | |
15949 paddd m4, [pd_16] | |
15950 psrld m4, 5 | |
15951 pmaddwd m5, m0, [r3 - 16 * 32] | |
15952 paddd m5, [pd_16] | |
15953 psrld m5, 5 | |
15954 packusdw m4, m5 | |
15955 | |
15956 pmaddwd m5, m3, [r3 - 3 * 32] ; [13] | |
15957 paddd m5, [pd_16] | |
15958 psrld m5, 5 | |
15959 pmaddwd m8, m0, [r3 - 3 * 32] | |
15960 paddd m8, [pd_16] | |
15961 psrld m8, 5 | |
15962 packusdw m5, m8 | |
15963 | |
15964 pmaddwd m6, m3, [r3 + 10 * 32] ; [26] | |
15965 paddd m6, [pd_16] | |
15966 psrld m6, 5 | |
15967 pmaddwd m9, m0, [r3 + 10 * 32] | |
15968 paddd m9, [pd_16] | |
15969 psrld m9, 5 | |
15970 packusdw m6, m9 | |
15971 | |
; slide the pair window by 4 bytes (one word pair) within each 128-bit lane
15972 palignr m8, m0, m3, 4 | |
15973 palignr m9, m2, m0, 4 | |
15974 pmaddwd m7, m8, [r3 - 9 * 32] ; [7] | |
15975 paddd m7, [pd_16] | |
15976 psrld m7, 5 | |
15977 pmaddwd m10, m9, [r3 - 9 * 32] | |
15978 paddd m10, [pd_16] | |
15979 psrld m10, 5 | |
15980 packusdw m7, m10 | |
15981 | |
15982 pmaddwd m8, [r3 + 4 * 32] ; [20] | |
15983 paddd m8, [pd_16] | |
15984 psrld m8, 5 | |
15985 pmaddwd m9, [r3 + 4 * 32] | |
15986 paddd m9, [pd_16] | |
15987 psrld m9, 5 | |
15988 packusdw m8, m9 | |
15989 | |
; window slid by 8 bytes (two word pairs)
15990 palignr m11, m0, m3, 8 | |
15991 palignr m12, m2, m0, 8 | |
15992 pmaddwd m9, m11, [r3 - 15 * 32] ; [1] | |
15993 paddd m9, [pd_16] | |
15994 psrld m9, 5 | |
15995 pmaddwd m10, m12, [r3 - 15 * 32] | |
15996 paddd m10, [pd_16] | |
15997 psrld m10, 5 | |
15998 packusdw m9, m10 | |
15999 | |
16000 pmaddwd m10, m11, [r3 - 2 * 32] ; [14] | |
16001 paddd m10, [pd_16] | |
16002 psrld m10, 5 | |
16003 pmaddwd m13, m12, [r3 - 2 * 32] | |
16004 paddd m13, [pd_16] | |
16005 psrld m13, 5 | |
16006 packusdw m10, m13 | |
16007 | |
16008 pmaddwd m11, [r3 + 11 * 32] ; [27] | |
16009 paddd m11, [pd_16] | |
16010 psrld m11, 5 | |
16011 pmaddwd m12, [r3 + 11 * 32] | |
16012 paddd m12, [pd_16] | |
16013 psrld m12, 5 | |
16014 packusdw m11, m12 | |
16015 | |
; first eight results are in m4..m11; the trailing macro arguments (12, 13, 16)
; look like two scratch registers and a byte offset - TODO confirm against the macro
16016 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 | |
16017 | |
; second group of eight results; m4..m11 are reused from here on
; window slid by 12 bytes (three word pairs)
16018 palignr m5, m0, m3, 12 | |
16019 palignr m6, m2, m0, 12 | |
16020 pmaddwd m4, m5, [r3 - 8 * 32] ; [8] | |
16021 paddd m4, [pd_16] | |
16022 psrld m4, 5 | |
16023 pmaddwd m7, m6, [r3 - 8 * 32] | |
16024 paddd m7, [pd_16] | |
16025 psrld m7, 5 | |
16026 packusdw m4, m7 | |
16027 | |
16028 pmaddwd m5, [r3 + 5 * 32] ; [21] | |
16029 paddd m5, [pd_16] | |
16030 psrld m5, 5 | |
16031 pmaddwd m6, [r3 + 5 * 32] | |
16032 paddd m6, [pd_16] | |
16033 psrld m6, 5 | |
16034 packusdw m5, m6 | |
16035 | |
; from here the (m0, m2) pair plays the role (m3, m0) had above
16036 pmaddwd m6, m0, [r3 - 14 * 32] ; [2] | |
16037 paddd m6, [pd_16] | |
16038 psrld m6, 5 | |
16039 pmaddwd m7, m2, [r3 - 14 * 32] | |
16040 paddd m7, [pd_16] | |
16041 psrld m7, 5 | |
16042 packusdw m6, m7 | |
16043 | |
; m3 is no longer needed as a source and is used as a temporary below
16044 pmaddwd m7, m0, [r3 - 1 * 32] ; [15] | |
16045 paddd m7, [pd_16] | |
16046 psrld m7, 5 | |
16047 pmaddwd m3, m2, [r3 - 1 * 32] | |
16048 paddd m3, [pd_16] | |
16049 psrld m3, 5 | |
16050 packusdw m7, m3 | |
16051 | |
16052 pmaddwd m8, m0, [r3 + 12 * 32] ; [28] | |
16053 paddd m8, [pd_16] | |
16054 psrld m8, 5 | |
16055 pmaddwd m11, m2, [r3 + 12 * 32] | |
16056 paddd m11, [pd_16] | |
16057 psrld m11, 5 | |
16058 packusdw m8, m11 | |
16059 | |
16060 palignr m10, m2, m0, 4 | |
16061 palignr m11, m1, m2, 4 | |
16062 | |
16063 pmaddwd m9, m10, [r3 - 7 * 32] ; [9] | |
16064 paddd m9, [pd_16] | |
16065 psrld m9, 5 | |
16066 pmaddwd m3, m11, [r3 - 7 * 32] | |
16067 paddd m3, [pd_16] | |
16068 psrld m3, 5 | |
16069 packusdw m9, m3 | |
16070 | |
16071 pmaddwd m10, [r3 + 6 * 32] ; [22] | |
16072 paddd m10, [pd_16] | |
16073 psrld m10, 5 | |
16074 pmaddwd m11, [r3 + 6 * 32] | |
16075 paddd m11, [pd_16] | |
16076 psrld m11, 5 | |
16077 packusdw m10, m11 | |
16078 | |
; order matters: m1 is rebuilt from the old m2 before m2 itself is overwritten
16079 palignr m1, m2, 8 | |
16080 palignr m2, m0, 8 | |
16081 | |
16082 pmaddwd m2, [r3 - 13 * 32] ; [3] | |
16083 paddd m2, [pd_16] | |
16084 psrld m2, 5 | |
16085 pmaddwd m1, [r3 - 13 * 32] | |
16086 paddd m1, [pd_16] | |
16087 psrld m1, 5 | |
16088 packusdw m2, m1 | |
; second eight results: m4..m10 plus m2; last macro argument is 0 this time
16089 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 | |
16090 ret | |
16091 | |
; intra_pred_ang32_14: driver for angular intra prediction mode 14 on a block
; of size 32 (per the function name). Declared through x86inc's cglobal with
; 3 arguments, 8 general-purpose registers and 14 vector registers.
; As used below: r0 is handed to the helpers (adjusted between calls), r1d is
; doubled before use, r2 points at the input reference samples.
; The function stages the references on an aligned stack area and then calls
; the two mode-14/22 helpers twice each with r6d = 0.
16092 cglobal intra_pred_ang32_14, 3,8,14 | |
; keep the incoming rsp in r6, reserve four mmsize slots plus one pointer slot,
; align rsp down to 64 bytes and park the old rsp above the four slots
16093 mov r6, rsp | |
16094 sub rsp, 4*mmsize+gprsize | |
16095 and rsp, ~63 | |
16096 mov [rsp+4*mmsize], r6 | |
16097 | |
; copy two full vectors plus one dword starting at r2 + 128 ...
16098 movu m0, [r2 + 128] | |
16099 movu m1, [r2 + 160] | |
16100 movd xm2, [r2 + 192] | |
16101 | |
; ... into the stack slots starting at rsp + 1*mmsize (aligned stores)
16102 mova [rsp + 1*mmsize], m0 | |
16103 mova [rsp + 2*mmsize], m1 | |
16104 movd [rsp + 3*mmsize], xm2 | |
16105 | |
; r1 is doubled (presumably sample stride -> byte stride for 16-bit samples -
; TODO confirm); r4 = 3 * r1 is not read in this block, presumably it is used
; by the helpers' store macro
16106 add r1d, r1d | |
16107 lea r4, [r1 * 3] | |
; r3 points at row 16 of ang_table_avx2, so helpers can reach rows 0..31
; with offsets of -16*32 .. +15*32
16108 lea r3, [ang_table_avx2 + 16 * 32] | |
16109 | |
; gather selected words from the low part of the input buffer
; (r2 + 4 .. r2 + 60) using the pw_ang32_14_22 shuffle mask (defined elsewhere)
16110 movu xm1, [r2 + 4] | |
16111 movu xm2, [r2 + 24] | |
16112 movu xm3, [r2 + 44] | |
16113 pshufb xm1, [pw_ang32_14_22] | |
16114 pshufb xm2, [pw_ang32_14_22] | |
16115 pshufb xm3, [pw_ang32_14_22] | |
; patch word lane 4 of each vector with one extra word from memory
16116 pinsrw xm1, [r2 + 20], 4 | |
16117 pinsrw xm2, [r2 + 40], 4 | |
16118 pinsrw xm3, [r2 + 60], 4 | |
16119 | |
; keep only the high quadwords: xm2 = {xm2.hi, xm1.hi}, xm3 = {xm3.hi, xm3.hi}
16120 punpckhqdq xm2, xm1 ; [ 2 5 7 10 12 15 17 20] | |
16121 punpckhqdq xm3, xm3 ; [22 25 27 30 22 25 27 30] | |
16122 | |
; overwrite word 0 of the staged copy with the word at [r2]
; (r6 is free again: the saved rsp already lives on the stack)
16123 movzx r6d, word [r2] | |
16124 mov [rsp + 1*mmsize], r6w | |
; place the gathered words directly below the staged copy (rsp + 8 .. rsp + 31)
; so the helpers can reach them at negative offsets from r2
16125 movu [rsp + 16], xm2 | |
16126 movq [rsp + 8], xm3 | |
16127 | |
; r6d = 0 is the value the helpers test with "test r6d, r6d"
16128 xor r6d, r6d | |
; r2 = start of the staged copy
16129 lea r2, [rsp + 1*mmsize] | |
; r7 = r0 + 8 * r1, used below to rebuild r0; relies on the helpers leaving r7 intact
16130 lea r7, [r0 + 8 * r1] | |
16131 | |
16132 call ang32_mode_14_22_rows_0_15 | |
16133 | |
; advance the destination by 32 bytes before the second helper
16134 lea r0, [r0 + 32] | |
16135 | |
16136 call ang32_mode_14_22_rows_16_31 | |
16137 | |
; second pass: references 32 bytes further on, r0 = original r0 + 16 * r1
16138 add r2, 32 | |
16139 lea r0, [r7 + 8 * r1] | |
16140 | |
16141 call ang32_mode_14_22_rows_0_15 | |
16142 | |
16143 lea r0, [r0 + 32] | |
16144 | |
16145 call ang32_mode_14_22_rows_16_31 | |
16146 | |
; restore the stack pointer saved in the prologue
16147 mov rsp, [rsp+4*mmsize] | |
16148 RET | |
16149 | |
; intra_pred_ang32_22: driver for angular intra prediction mode 22 on a block
; of size 32 (per the function name). Same structure as intra_pred_ang32_14
; but with the two halves of the input buffer swapped (main copy from r2 + 0,
; gathered words from r2 + 132 ..) and r6d = 1 passed to the helpers.
; Declared with 3 arguments, 8 general-purpose and 14 vector registers.
16150 cglobal intra_pred_ang32_22, 3,8,14 | |
; save rsp in r6, reserve four mmsize slots plus one pointer slot, align rsp
; down to 64 bytes and store the old rsp above the four slots
16151 mov r6, rsp | |
16152 sub rsp, 4*mmsize+gprsize | |
16153 and rsp, ~63 | |
16154 mov [rsp+4*mmsize], r6 | |
16155 | |
; copy two full vectors plus one dword from the start of the input buffer ...
16156 movu m0, [r2] | |
16157 movu m1, [r2 + 32] | |
16158 movd xm2, [r2 + 64] | |
16159 | |
; ... into the stack slots starting at rsp + 1*mmsize
16160 mova [rsp + 1*mmsize], m0 | |
16161 mova [rsp + 2*mmsize], m1 | |
16162 movd [rsp + 3*mmsize], xm2 | |
16163 | |
; r1 doubled (presumably sample stride -> byte stride - TODO confirm);
; r4 = 3 * r1 is not read in this block, presumably used by the store macro
16164 add r1d, r1d | |
16165 lea r4, [r1 * 3] | |
; r3 = row 16 of ang_table_avx2
16166 lea r3, [ang_table_avx2 + 16 * 32] | |
16167 | |
; gather selected words from r2 + 132 .. r2 + 188 via the pw_ang32_14_22 mask
16168 movu xm1, [r2 + 132] | |
16169 movu xm2, [r2 + 152] | |
16170 movu xm3, [r2 + 172] | |
16171 pshufb xm1, [pw_ang32_14_22] | |
16172 pshufb xm2, [pw_ang32_14_22] | |
16173 pshufb xm3, [pw_ang32_14_22] | |
; patch word lane 4 of each vector with one extra word from memory
16174 pinsrw xm1, [r2 + 148], 4 | |
16175 pinsrw xm2, [r2 + 168], 4 | |
16176 pinsrw xm3, [r2 + 188], 4 | |
16177 | |
; keep only the high quadwords: xm2 = {xm2.hi, xm1.hi}, xm3 = {xm3.hi, xm3.hi}
16178 punpckhqdq xm2, xm1 ; [ 2 5 7 10 12 15 17 20] | |
16179 punpckhqdq xm3, xm3 ; [22 25 27 30 22 25 27 30] | |
16180 | |
; gathered words go directly below the staged copy (rsp + 8 .. rsp + 31);
; unlike mode 14, word 0 of the staged copy is left as copied from [r2]
16181 movu [rsp + 16], xm2 | |
16182 movq [rsp + 8], xm3 | |
16183 | |
; r6d = 1 is the value the helpers test with "test r6d, r6d"
16184 xor r6d, r6d | |
16185 inc r6d | |
; r2 = start of the staged copy
16186 lea r2, [rsp + 1*mmsize] | |
; r5 = r0 + 32 for the second pass; relies on the helpers leaving r5 intact
16187 lea r5, [r0 + 32] | |
16188 | |
16189 call ang32_mode_14_22_rows_0_15 | |
16190 | |
; r0 += 16 * r1, done as two steps because an index scale of 16 is not encodable
16191 lea r0, [r0 + 8 * r1] | |
16192 lea r0, [r0 + 8 * r1] | |
16193 | |
16194 call ang32_mode_14_22_rows_16_31 | |
16195 | |
; second pass: references 32 bytes further on, destination = original r0 + 32
16196 add r2, 32 | |
16197 mov r0, r5 | |
16198 | |
16199 call ang32_mode_14_22_rows_0_15 | |
16200 | |
16201 lea r0, [r0 + 8 * r1] | |
16202 lea r0, [r0 + 8 * r1] | |
16203 | |
16204 call ang32_mode_14_22_rows_16_31 | |
16205 | |
; restore the stack pointer saved in the prologue
16206 mov rsp, [rsp+4*mmsize] | |
16207 RET | |
16208 | |
16209 ;; angle 32, modes 15 and 21, row 0 to 15 | |
; Helper shared by intra_pred_ang32_15 and intra_pred_ang32_21 (both call it).
; Inputs as prepared by those callers:
;   r2  - points into the reference copy staged on the caller's stack; loads
;         here span r2 - 16 .. r2 + 2 (plus the vector width)
;   r3  - ang_table_avx2 + 16 * 32, so [r3 + k * 32] addresses table row 16 + k;
;         the bracketed number after each first pmaddwd is that row index
;   r6d - 0 when called for mode 15, 1 when called for mode 21
; r0/r1/r4 are not referenced directly in this block; presumably the store
; macro TRANSPOSE_STORE_AVX2_STACK (defined elsewhere) uses them - TODO confirm.
; Every output register follows the same recipe: pmaddwd of interleaved sample
; pairs with one table row, add [pd_16], shift right by 5, then packusdw the
; two half results into one register of words.
16210 cglobal ang32_mode_15_21_rows_0_15 | |
; Flags set here are not consumed by any instruction visible in this block;
; presumably the store macro branches on them (the SIMD instructions in
; between do not modify EFLAGS) - TODO confirm against the macro.
16211 test r6d, r6d | |
16212 | |
; m0/m1: samples starting at r2 - 16 and one word later at r2 - 14
16213 movu m0, [r2 - 16] | |
16214 movu m1, [r2 - 14] | |
16215 | |
; interleave into (sample, next sample) word pairs, per 128-bit lane:
; m3 = pairs from the low quadwords, m0 = pairs from the high quadwords
16216 punpcklwd m3, m0, m1 | |
16217 punpckhwd m0, m1 | |
16218 | |
; same pairing for the window 16 bytes further on: m2 = low pairs, m1 = high pairs
16219 movu m1, [r2] | |
16220 movu m4, [r2 + 2] | |
16221 punpcklwd m2, m1, m4 | |
16222 punpckhwd m1, m4 | |
16223 | |
16224 pmaddwd m4, m3, [r3] ; [16] | |
16225 paddd m4, [pd_16] | |
16226 psrld m4, 5 | |
16227 pmaddwd m5, m0, [r3] | |
16228 paddd m5, [pd_16] | |
16229 psrld m5, 5 | |
16230 packusdw m4, m5 | |
16231 | |
; slide the pair window by 4 bytes (one word pair) within each 128-bit lane
16232 palignr m6, m0, m3, 4 | |
16233 palignr m7, m2, m0, 4 | |
16234 pmaddwd m5, m6, [r3 - 15 * 32] ; [1] | |
16235 paddd m5, [pd_16] | |
16236 psrld m5, 5 | |
16237 pmaddwd m8, m7, [r3 - 15 * 32] | |
16238 paddd m8, [pd_16] | |
16239 psrld m8, 5 | |
16240 packusdw m5, m8 | |
16241 | |
16242 pmaddwd m6, [r3 + 2 * 32] ; [18] | |
16243 paddd m6, [pd_16] | |
16244 psrld m6, 5 | |
16245 pmaddwd m7, [r3 + 2 * 32] | |
16246 paddd m7, [pd_16] | |
16247 psrld m7, 5 | |
16248 packusdw m6, m7 | |
16249 | |
; window slid by 8 bytes (two word pairs)
16250 palignr m8, m0, m3, 8 | |
16251 palignr m9, m2, m0, 8 | |
16252 pmaddwd m7, m8, [r3 - 13 * 32] ; [3] | |
16253 paddd m7, [pd_16] | |
16254 psrld m7, 5 | |
16255 pmaddwd m10, m9, [r3 - 13 * 32] | |
16256 paddd m10, [pd_16] | |
16257 psrld m10, 5 | |
16258 packusdw m7, m10 | |
16259 | |
16260 pmaddwd m8, [r3 + 4 * 32] ; [20] | |
16261 paddd m8, [pd_16] | |
16262 psrld m8, 5 | |
16263 pmaddwd m9, [r3 + 4 * 32] | |
16264 paddd m9, [pd_16] | |
16265 psrld m9, 5 | |
16266 packusdw m8, m9 | |
16267 | |
; window slid by 12 bytes (three word pairs)
16268 palignr m10, m0, m3, 12 | |
16269 palignr m11, m2, m0, 12 | |
16270 pmaddwd m9, m10, [r3 - 11 * 32] ; [5] | |
16271 paddd m9, [pd_16] | |
16272 psrld m9, 5 | |
16273 pmaddwd m12, m11, [r3 - 11 * 32] | |
16274 paddd m12, [pd_16] | |
16275 psrld m12, 5 | |
16276 packusdw m9, m12 | |
16277 | |
16278 pmaddwd m10, [r3 + 6 * 32] ; [22] | |
16279 paddd m10, [pd_16] | |
16280 psrld m10, 5 | |
16281 pmaddwd m11, [r3 + 6 * 32] | |
16282 paddd m11, [pd_16] | |
16283 psrld m11, 5 | |
16284 packusdw m10, m11 | |
16285 | |
; from here the (m0, m2) pair plays the role (m3, m0) had above
16286 pmaddwd m11, m0, [r3 - 9 * 32] ; [7] | |
16287 paddd m11, [pd_16] | |
16288 psrld m11, 5 | |
16289 pmaddwd m12, m2, [r3 - 9 * 32] | |
16290 paddd m12, [pd_16] | |
16291 psrld m12, 5 | |
16292 packusdw m11, m12 | |
16293 | |
; first eight results are in m4..m11; the trailing macro arguments (12, 13, 16)
; look like two scratch registers and a byte offset - TODO confirm against the macro
16294 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 | |
16295 | |
; second group of eight results; m4..m10 are reused, m3 serves as a temporary
16296 pmaddwd m4, m0, [r3 + 8 * 32] ; [24] | |
16297 paddd m4, [pd_16] | |
16298 psrld m4, 5 | |
16299 pmaddwd m5, m2, [r3 + 8 * 32] | |
16300 paddd m5, [pd_16] | |
16301 psrld m5, 5 | |
16302 packusdw m4, m5 | |
16303 | |
16304 palignr m6, m2, m0, 4 | |
16305 palignr m7, m1, m2, 4 | |
16306 pmaddwd m5, m6, [r3 - 7 * 32] ; [9] | |
16307 paddd m5, [pd_16] | |
16308 psrld m5, 5 | |
16309 pmaddwd m3, m7, [r3 - 7 * 32] | |
16310 paddd m3, [pd_16] | |
16311 psrld m3, 5 | |
16312 packusdw m5, m3 | |
16313 | |
16314 pmaddwd m6, [r3 + 10 * 32] ; [26] | |
16315 paddd m6, [pd_16] | |
16316 psrld m6, 5 | |
16317 pmaddwd m7, [r3 + 10 * 32] | |
16318 paddd m7, [pd_16] | |
16319 psrld m7, 5 | |
16320 packusdw m6, m7 | |
16321 | |
16322 palignr m8, m2, m0, 8 | |
16323 palignr m9, m1, m2, 8 | |
16324 pmaddwd m7, m8, [r3 - 5 * 32] ; [11] | |
16325 paddd m7, [pd_16] | |
16326 psrld m7, 5 | |
16327 pmaddwd m3, m9, [r3 - 5 * 32] | |
16328 paddd m3, [pd_16] | |
16329 psrld m3, 5 | |
16330 packusdw m7, m3 | |
16331 | |
16332 pmaddwd m8, [r3 + 12 * 32] ; [28] | |
16333 paddd m8, [pd_16] | |
16334 psrld m8, 5 | |
16335 pmaddwd m9, [r3 + 12 * 32] | |
16336 paddd m9, [pd_16] | |
16337 psrld m9, 5 | |
16338 packusdw m8, m9 | |
16339 | |
16340 palignr m10, m2, m0, 12 | |
16341 palignr m11, m1, m2, 12 | |
16342 pmaddwd m9, m10, [r3 - 3 * 32] ; [13] | |
16343 paddd m9, [pd_16] | |
16344 psrld m9, 5 | |
16345 pmaddwd m3, m11, [r3 - 3 * 32] | |
16346 paddd m3, [pd_16] | |
16347 psrld m3, 5 | |
16348 packusdw m9, m3 | |
16349 | |
16350 pmaddwd m10, [r3 + 14 * 32] ; [30] | |
16351 paddd m10, [pd_16] | |
16352 psrld m10, 5 | |
16353 pmaddwd m11, [r3 + 14 * 32] | |
16354 paddd m11, [pd_16] | |
16355 psrld m11, 5 | |
16356 packusdw m10, m11 | |
16357 | |
; last result is computed in place from the unshifted (m2, m1) pair
16358 pmaddwd m2, [r3 - 1 * 32] ; [15] | |
16359 paddd m2, [pd_16] | |
16360 psrld m2, 5 | |
16361 pmaddwd m1, [r3 - 1 * 32] | |
16362 paddd m1, [pd_16] | |
16363 psrld m1, 5 | |
16364 packusdw m2, m1 | |
; second eight results: m4..m10 plus m2; last macro argument is 0 this time
16365 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 | |
16366 ret | |
16367 | |
16368 ;; angle 32, modes 15 and 21, rows 16 to 31 | |
; Helper shared by intra_pred_ang32_15 and intra_pred_ang32_21 (both call it).
; Inputs as prepared by those callers:
;   r2  - points into the reference copy staged on the caller's stack; all
;         loads here use negative offsets (r2 - 32 .. r2 - 14)
;   r3  - ang_table_avx2 + 16 * 32, so [r3 + k * 32] addresses table row 16 + k;
;         the bracketed number after each first pmaddwd is that row index
;   r6d - 0 when called for mode 15, 1 when called for mode 21
; r0/r1/r4 are not referenced directly in this block; presumably the store
; macro TRANSPOSE_STORE_AVX2_STACK (defined elsewhere) uses them - TODO confirm.
; Every output register follows the same recipe: pmaddwd of interleaved sample
; pairs with one table row, add [pd_16], shift right by 5, then packusdw the
; two half results into one register of words.
16369 cglobal ang32_mode_15_21_rows_16_31 | |
; Flags set here are not consumed by any instruction visible in this block;
; presumably the store macro branches on them (the SIMD instructions in
; between do not modify EFLAGS) - TODO confirm against the macro.
16370 test r6d, r6d | |
16371 | |
; m0/m1: samples starting at r2 - 32 and one word later at r2 - 30
16372 movu m0, [r2 - 32] | |
16373 movu m1, [r2 - 30] | |
16374 | |
; interleave into (sample, next sample) word pairs, per 128-bit lane:
; m3 = pairs from the low quadwords, m0 = pairs from the high quadwords
16375 punpcklwd m3, m0, m1 | |
16376 punpckhwd m0, m1 | |
16377 | |
; same pairing for the window 16 bytes further on: m2 = low pairs, m1 = high pairs
16378 movu m1, [r2 - 16] | |
16379 movu m4, [r2 - 14] | |
16380 punpcklwd m2, m1, m4 | |
16381 punpckhwd m1, m4 | |
16382 | |
16383 pmaddwd m4, m3, [r3 - 16 * 32] ; [0] | |
16384 paddd m4, [pd_16] | |
16385 psrld m4, 5 | |
16386 pmaddwd m5, m0, [r3 - 16 * 32] | |
16387 paddd m5, [pd_16] | |
16388 psrld m5, 5 | |
16389 packusdw m4, m5 | |
16390 | |
16391 pmaddwd m5, m3, [r3 + 1 * 32] ; [17] | |
16392 paddd m5, [pd_16] | |
16393 psrld m5, 5 | |
16394 pmaddwd m8, m0, [r3 + 1 * 32] | |
16395 paddd m8, [pd_16] | |
16396 psrld m8, 5 | |
16397 packusdw m5, m8 | |
16398 | |
; slide the pair window by 4 bytes (one word pair) within each 128-bit lane
16399 palignr m7, m0, m3, 4 | |
16400 palignr m8, m2, m0, 4 | |
16401 pmaddwd m6, m7, [r3 - 14 * 32] ; [2] | |
16402 paddd m6, [pd_16] | |
16403 psrld m6, 5 | |
16404 pmaddwd m9, m8, [r3 - 14 * 32] | |
16405 paddd m9, [pd_16] | |
16406 psrld m9, 5 | |
16407 packusdw m6, m9 | |
16408 | |
16409 pmaddwd m7, [r3 + 3 * 32] ; [19] | |
16410 paddd m7, [pd_16] | |
16411 psrld m7, 5 | |
16412 pmaddwd m8, [r3 + 3 * 32] | |
16413 paddd m8, [pd_16] | |
16414 psrld m8, 5 | |
16415 packusdw m7, m8 | |
16416 | |
; window slid by 8 bytes (two word pairs)
16417 palignr m9, m0, m3, 8 | |
16418 palignr m10, m2, m0, 8 | |
16419 pmaddwd m8, m9, [r3 - 12 * 32] ; [4] | |
16420 paddd m8, [pd_16] | |
16421 psrld m8, 5 | |
16422 pmaddwd m11, m10, [r3 - 12 * 32] | |
16423 paddd m11, [pd_16] | |
16424 psrld m11, 5 | |
16425 packusdw m8, m11 | |
16426 | |
16427 pmaddwd m9, [r3 + 5 * 32] ; [21] | |
16428 paddd m9, [pd_16] | |
16429 psrld m9, 5 | |
16430 pmaddwd m10, [r3 + 5 * 32] | |
16431 paddd m10, [pd_16] | |
16432 psrld m10, 5 | |
16433 packusdw m9, m10 | |
16434 | |
; window slid by 12 bytes (three word pairs)
16435 palignr m11, m0, m3, 12 | |
16436 palignr m12, m2, m0, 12 | |
16437 pmaddwd m10, m11, [r3 - 10 * 32] ; [6] | |
16438 paddd m10, [pd_16] | |
16439 psrld m10, 5 | |
16440 pmaddwd m13, m12, [r3 - 10 * 32] | |
16441 paddd m13, [pd_16] | |
16442 psrld m13, 5 | |
16443 packusdw m10, m13 | |
16444 | |
16445 pmaddwd m11, [r3 + 7 * 32] ; [23] | |
16446 paddd m11, [pd_16] | |
16447 psrld m11, 5 | |
16448 pmaddwd m12, [r3 + 7 * 32] | |
16449 paddd m12, [pd_16] | |
16450 psrld m12, 5 | |
16451 packusdw m11, m12 | |
16452 | |
; first eight results are in m4..m11; the trailing macro arguments (12, 13, 16)
; look like two scratch registers and a byte offset - TODO confirm against the macro
16453 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 | |
16454 | |
; second group of eight results; the (m0, m2) pair now plays the role (m3, m0)
; had above, and m3 becomes a temporary
16455 pmaddwd m4, m0, [r3 - 8 * 32] ; [8] | |
16456 paddd m4, [pd_16] | |
16457 psrld m4, 5 | |
16458 pmaddwd m7, m2, [r3 - 8 * 32] | |
16459 paddd m7, [pd_16] | |
16460 psrld m7, 5 | |
16461 packusdw m4, m7 | |
16462 | |
16463 pmaddwd m5, m0, [r3 + 9 * 32] ; [25] | |
16464 paddd m5, [pd_16] | |
16465 psrld m5, 5 | |
16466 pmaddwd m6, m2, [r3 + 9 * 32] | |
16467 paddd m6, [pd_16] | |
16468 psrld m6, 5 | |
16469 packusdw m5, m6 | |
16470 | |
16471 palignr m7, m2, m0, 4 | |
16472 palignr m8, m1, m2, 4 | |
16473 pmaddwd m6, m7, [r3 - 6 * 32] ; [10] | |
16474 paddd m6, [pd_16] | |
16475 psrld m6, 5 | |
16476 pmaddwd m3, m8, [r3 - 6 * 32] | |
16477 paddd m3, [pd_16] | |
16478 psrld m3, 5 | |
16479 packusdw m6, m3 | |
16480 | |
16481 pmaddwd m7, [r3 + 11 * 32] ; [27] | |
16482 paddd m7, [pd_16] | |
16483 psrld m7, 5 | |
16484 pmaddwd m8, [r3 + 11 * 32] | |
16485 paddd m8, [pd_16] | |
16486 psrld m8, 5 | |
16487 packusdw m7, m8 | |
16488 | |
16489 palignr m9, m2, m0, 8 | |
16490 palignr m3, m1, m2, 8 | |
16491 pmaddwd m8, m9, [r3 - 4 * 32] ; [12] | |
16492 paddd m8, [pd_16] | |
16493 psrld m8, 5 | |
16494 pmaddwd m11, m3, [r3 - 4 * 32] | |
16495 paddd m11, [pd_16] | |
16496 psrld m11, 5 | |
16497 packusdw m8, m11 | |
16498 | |
16499 pmaddwd m9, [r3 + 13 * 32] ; [29] | |
16500 paddd m9, [pd_16] | |
16501 psrld m9, 5 | |
16502 pmaddwd m3, [r3 + 13 * 32] | |
16503 paddd m3, [pd_16] | |
16504 psrld m3, 5 | |
16505 packusdw m9, m3 | |
16506 | |
; order matters: m1 is rebuilt from the old m2 before m2 itself is overwritten
16507 palignr m1, m2, 12 | |
16508 palignr m2, m0, 12 | |
16509 pmaddwd m10, m2, [r3 - 2 * 32] ; [14] | |
16510 paddd m10, [pd_16] | |
16511 psrld m10, 5 | |
16512 pmaddwd m11, m1, [r3 - 2 * 32] | |
16513 paddd m11, [pd_16] | |
16514 psrld m11, 5 | |
16515 packusdw m10, m11 | |
16516 | |
16517 pmaddwd m2, [r3 + 15 * 32] ; [31] | |
16518 paddd m2, [pd_16] | |
16519 psrld m2, 5 | |
16520 pmaddwd m1, [r3 + 15 * 32] | |
16521 paddd m1, [pd_16] | |
16522 psrld m1, 5 | |
16523 packusdw m2, m1 | |
; second eight results: m4..m10 plus m2; last macro argument is 0 this time
16524 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 | |
16525 ret | |
16526 | |
; intra_pred_ang32_15: driver for angular intra prediction mode 15 on a block
; of size 32 (per the function name). Declared through x86inc's cglobal with
; 3 arguments, 8 general-purpose registers and 14 vector registers.
; As used below: r0 is handed to the helpers (adjusted between calls), r1d is
; doubled before use, r2 points at the input reference samples.
; The function stages the references on an aligned stack area and then calls
; the two mode-15/21 helpers twice each with r6d = 0.
16527 cglobal intra_pred_ang32_15, 3,8,14 | |
; keep the incoming rsp in r6, reserve four mmsize slots plus one pointer slot,
; align rsp down to 64 bytes and park the old rsp above the four slots
16528 mov r6, rsp | |
16529 sub rsp, 4*mmsize+gprsize | |
16530 and rsp, ~63 | |
16531 mov [rsp+4*mmsize], r6 | |
16532 | |
; copy two full vectors plus one dword starting at r2 + 128 ...
16533 movu m0, [r2 + 128] | |
16534 movu m1, [r2 + 160] | |
16535 movd xm2, [r2 + 192] | |
16536 | |
; ... into the stack slots starting at rsp + 1*mmsize (aligned stores)
16537 mova [rsp + 1*mmsize], m0 | |
16538 mova [rsp + 2*mmsize], m1 | |
16539 movd [rsp + 3*mmsize], xm2 | |
16540 | |
; r1 is doubled (presumably sample stride -> byte stride for 16-bit samples -
; TODO confirm); r4 = 3 * r1 is not read in this block, presumably it is used
; by the helpers' store macro
16541 add r1d, r1d | |
16542 lea r4, [r1 * 3] | |
; r3 points at row 16 of ang_table_avx2, so helpers can reach rows 0..31
; with offsets of -16*32 .. +15*32
16543 lea r3, [ang_table_avx2 + 16 * 32] | |
16544 | |
; gather selected words from the low part of the input buffer
; (four 16-byte windows at r2 + 4/18/34/48) using the pw_ang32_15_21
; shuffle mask (defined elsewhere)
16545 movu xm1, [r2 + 4] | |
16546 movu xm2, [r2 + 18] | |
16547 movu xm3, [r2 + 34] | |
16548 movu xm4, [r2 + 48] | |
16549 pshufb xm1, [pw_ang32_15_21] | |
16550 pshufb xm2, [pw_ang32_15_21] | |
16551 pshufb xm3, [pw_ang32_15_21] | |
16552 pshufb xm4, [pw_ang32_15_21] | |
16553 | |
; keep only the high quadwords: xm2 = {xm2.hi, xm1.hi}, xm4 = {xm4.hi, xm3.hi}
16554 punpckhqdq xm2, xm1 | |
16555 punpckhqdq xm4, xm3 | |
16556 | |
; overwrite word 0 of the staged copy with the word at [r2]
; (r6 is free again: the saved rsp already lives on the stack)
16557 movzx r6d, word [r2] | |
16558 mov [rsp + 1*mmsize], r6w | |
; place the gathered words directly below the staged copy (rsp .. rsp + 31)
; so the helpers can reach them at negative offsets from r2
16559 movu [rsp + 16], xm2 | |
16560 movu [rsp], xm4 | |
16561 | |
; r6d = 0 is the value the helpers test with "test r6d, r6d"
16562 xor r6d, r6d | |
; r2 = start of the staged copy
16563 lea r2, [rsp + 1*mmsize] | |
; r7 = r0 + 8 * r1, used below to rebuild r0; relies on the helpers leaving r7 intact
16564 lea r7, [r0 + 8 * r1] | |
16565 | |
16566 call ang32_mode_15_21_rows_0_15 | |
16567 | |
; advance the destination by 32 bytes before the second helper
16568 lea r0, [r0 + 32] | |
16569 | |
16570 call ang32_mode_15_21_rows_16_31 | |
16571 | |
; second pass: references 32 bytes further on, r0 = original r0 + 16 * r1
16572 add r2, 32 | |
16573 lea r0, [r7 + 8 * r1] | |
16574 | |
16575 call ang32_mode_15_21_rows_0_15 | |
16576 | |
16577 lea r0, [r0 + 32] | |
16578 | |
16579 call ang32_mode_15_21_rows_16_31 | |
16580 | |
; restore the stack pointer saved in the prologue
16581 mov rsp, [rsp+4*mmsize] | |
16582 RET | |
16583 | |
; intra_pred_ang32_21: driver for angular intra prediction mode 21 on a block
; of size 32 (per the function name). Same structure as intra_pred_ang32_15
; but with the two halves of the input buffer swapped (main copy from r2 + 0,
; gathered words from r2 + 132 ..) and r6d = 1 passed to the helpers.
; Declared with 3 arguments, 8 general-purpose and 14 vector registers.
16584 cglobal intra_pred_ang32_21, 3,8,14 | |
; save rsp in r6, reserve four mmsize slots plus one pointer slot, align rsp
; down to 64 bytes and store the old rsp above the four slots
16585 mov r6, rsp | |
16586 sub rsp, 4*mmsize+gprsize | |
16587 and rsp, ~63 | |
16588 mov [rsp+4*mmsize], r6 | |
16589 | |
; copy two full vectors plus one dword from the start of the input buffer ...
16590 movu m0, [r2] | |
16591 movu m1, [r2 + 32] | |
16592 movd xm2, [r2 + 64] | |
16593 | |
; ... into the stack slots starting at rsp + 1*mmsize
16594 mova [rsp + 1*mmsize], m0 | |
16595 mova [rsp + 2*mmsize], m1 | |
16596 movd [rsp + 3*mmsize], xm2 | |
16597 | |
; r1 doubled (presumably sample stride -> byte stride - TODO confirm);
; r4 = 3 * r1 is not read in this block, presumably used by the store macro
16598 add r1d, r1d | |
16599 lea r4, [r1 * 3] | |
; r3 = row 16 of ang_table_avx2
16600 lea r3, [ang_table_avx2 + 16 * 32] | |
16601 | |
; gather selected words from four 16-byte windows at r2 + 132/146/162/176
; via the pw_ang32_15_21 shuffle mask (defined elsewhere)
16602 movu xm1, [r2 + 132] | |
16603 movu xm2, [r2 + 146] | |
16604 movu xm3, [r2 + 162] | |
16605 movu xm4, [r2 + 176] | |
16606 pshufb xm1, [pw_ang32_15_21] | |
16607 pshufb xm2, [pw_ang32_15_21] | |
16608 pshufb xm3, [pw_ang32_15_21] | |
16609 pshufb xm4, [pw_ang32_15_21] | |
16610 | |
; keep only the high quadwords: xm2 = {xm2.hi, xm1.hi}, xm4 = {xm4.hi, xm3.hi}
16611 punpckhqdq xm2, xm1 | |
16612 punpckhqdq xm4, xm3 | |
16613 | |
; gathered words go directly below the staged copy (rsp .. rsp + 31);
; unlike mode 15, word 0 of the staged copy is left as copied from [r2]
16614 movu [rsp + 16], xm2 | |
16615 movu [rsp], xm4 | |
16616 | |
; r6d = 1 is the value the helpers test with "test r6d, r6d"
16617 xor r6d, r6d | |
16618 inc r6d | |
; r2 = start of the staged copy
16619 lea r2, [rsp + 1*mmsize] | |
; r5 = r0 + 32 for the second pass; relies on the helpers leaving r5 intact
16620 lea r5, [r0 + 32] | |
16621 | |
16622 call ang32_mode_15_21_rows_0_15 | |
16623 | |
; r0 += 16 * r1, done as two steps because an index scale of 16 is not encodable
16624 lea r0, [r0 + 8 * r1] | |
16625 lea r0, [r0 + 8 * r1] | |
16626 | |
16627 call ang32_mode_15_21_rows_16_31 | |
16628 | |
; second pass: references 32 bytes further on, destination = original r0 + 32
16629 add r2, 32 | |
16630 mov r0, r5 | |
16631 | |
16632 call ang32_mode_15_21_rows_0_15 | |
16633 | |
16634 lea r0, [r0 + 8 * r1] | |
16635 lea r0, [r0 + 8 * r1] | |
16636 | |
16637 call ang32_mode_15_21_rows_16_31 | |
16638 | |
; restore the stack pointer saved in the prologue
16639 mov rsp, [rsp+4*mmsize] | |
16640 RET | |
16641 | |
16642 ;; angle 32, modes 16 and 20, row 0 to 15 | |
; Helper shared by intra_pred_ang32_16 and intra_pred_ang32_20 (both call it).
; Inputs as prepared by those callers:
;   r2  - points into the reference copy staged on the caller's stack; loads
;         here span r2 - 20 .. r2 + 16 (plus the vector width)
;   r3  - ang_table_avx2 + 16 * 32, so [r3 + k * 32] addresses table row 16 + k;
;         the bracketed number after each first pmaddwd is that row index
;   r6d - 0 when called for mode 16, 1 when called for mode 20
; r0/r1/r4 are not referenced directly in this block; presumably the store
; macro TRANSPOSE_STORE_AVX2_STACK (defined elsewhere) uses them - TODO confirm.
; Every output register follows the same recipe: pmaddwd of interleaved sample
; pairs with one table row, add [pd_16], shift right by 5, then packusdw the
; two half results into one register of words.
16643 cglobal ang32_mode_16_20_rows_0_15 | |
; Flags set here are not consumed by any instruction visible in this block;
; presumably the store macro branches on them (the SIMD instructions in
; between do not modify EFLAGS) - TODO confirm against the macro.
16644 test r6d, r6d | |
16645 | |
; m0/m1: samples starting at r2 - 20 and one word later at r2 - 18
16646 movu m0, [r2 - 20] | |
16647 movu m1, [r2 - 18] | |
16648 | |
; interleave into (sample, next sample) word pairs, per 128-bit lane:
; m3 = pairs from the low quadwords, m0 = pairs from the high quadwords
16649 punpcklwd m3, m0, m1 | |
16650 punpckhwd m0, m1 | |
16651 | |
; same pairing for the window 16 bytes further on: m2 = low pairs, m1 = high pairs
16652 movu m1, [r2 - 4] ; [ 3 2 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13] | |
16653 movu m4, [r2 - 2] ; [ 2 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14] | |
16654 punpcklwd m2, m1, m4 ; [-3 -2 -4 -3 -5 -4 -6 -5 -11 -10 -12 -11 -13 -12 -14 -13] | |
16655 punpckhwd m1, m4 ; [ 2 3 2 0 -1 0 -2 -1 -7 -6 -8 -7 -9 -8 -10 -9] | |
16656 | |
16657 pmaddwd m4, m3, [r3] ; [16] | |
16658 paddd m4, [pd_16] | |
16659 psrld m4, 5 | |
16660 pmaddwd m5, m0, [r3] | |
16661 paddd m5, [pd_16] | |
16662 psrld m5, 5 | |
16663 packusdw m4, m5 | |
16664 | |
; slide the pair window by 4 bytes (one word pair) within each 128-bit lane
16665 palignr m6, m0, m3, 4 | |
16666 palignr m7, m2, m0, 4 | |
16667 pmaddwd m5, m6, [r3 - 11 * 32] ; [5] | |
16668 paddd m5, [pd_16] | |
16669 psrld m5, 5 | |
16670 pmaddwd m8, m7, [r3 - 11 * 32] | |
16671 paddd m8, [pd_16] | |
16672 psrld m8, 5 | |
16673 packusdw m5, m8 | |
16674 | |
16675 pmaddwd m6, [r3 + 10 * 32] ; [26] | |
16676 paddd m6, [pd_16] | |
16677 psrld m6, 5 | |
16678 pmaddwd m7, [r3 + 10 * 32] | |
16679 paddd m7, [pd_16] | |
16680 psrld m7, 5 | |
16681 packusdw m6, m7 | |
16682 | |
; window slid by 8 bytes (two word pairs)
16683 palignr m8, m0, m3, 8 | |
16684 palignr m9, m2, m0, 8 | |
16685 pmaddwd m7, m8, [r3 - 1 * 32] ; [15] | |
16686 paddd m7, [pd_16] | |
16687 psrld m7, 5 | |
16688 pmaddwd m10, m9, [r3 - 1 * 32] | |
16689 paddd m10, [pd_16] | |
16690 psrld m10, 5 | |
16691 packusdw m7, m10 | |
16692 | |
; window slid by 12 bytes (three word pairs)
16693 palignr m9, m0, m3, 12 | |
16694 palignr m12, m2, m0, 12 | |
16695 pmaddwd m8, m9, [r3 - 12 * 32] ; [4] | |
16696 paddd m8, [pd_16] | |
16697 psrld m8, 5 | |
16698 pmaddwd m10, m12, [r3 - 12 * 32] | |
16699 paddd m10, [pd_16] | |
16700 psrld m10, 5 | |
16701 packusdw m8, m10 | |
16702 | |
16703 pmaddwd m9, [r3 + 9 * 32] ; [25] | |
16704 paddd m9, [pd_16] | |
16705 psrld m9, 5 | |
16706 pmaddwd m12, [r3 + 9 * 32] | |
16707 paddd m12, [pd_16] | |
16708 psrld m12, 5 | |
16709 packusdw m9, m12 | |
16710 | |
; from here the (m0, m2) pair plays the role (m3, m0) had above
16711 pmaddwd m10, m0, [r3 - 2 * 32] ; [14] | |
16712 paddd m10, [pd_16] | |
16713 psrld m10, 5 | |
16714 pmaddwd m11, m2, [r3 - 2 * 32] | |
16715 paddd m11, [pd_16] | |
16716 psrld m11, 5 | |
16717 packusdw m10, m11 | |
16718 | |
16719 palignr m11, m2, m0, 4 | |
16720 palignr m12, m1, m2, 4 | |
16721 pmaddwd m11, [r3 - 13 * 32] ; [3] | |
16722 paddd m11, [pd_16] | |
16723 psrld m11, 5 | |
16724 pmaddwd m12, [r3 - 13 * 32] | |
16725 paddd m12, [pd_16] | |
16726 psrld m12, 5 | |
16727 packusdw m11, m12 | |
16728 | |
; first eight results are in m4..m11; the trailing macro arguments (12, 13, 16)
; look like two scratch registers and a byte offset - TODO confirm against the macro
16729 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 | |
16730 | |
; second group of eight results; the 4-byte-shifted window is rebuilt because
; the macro call above was given m11/m12 among its registers
16731 palignr m4, m2, m0, 4 | |
16732 palignr m5, m1, m2, 4 | |
16733 pmaddwd m4, [r3 + 8 * 32] ; [24] | |
16734 paddd m4, [pd_16] | |
16735 psrld m4, 5 | |
16736 pmaddwd m5, [r3 + 8 * 32] | |
16737 paddd m5, [pd_16] | |
16738 psrld m5, 5 | |
16739 packusdw m4, m5 | |
16740 | |
; m3 is no longer needed as a source and is used as a temporary below
16741 palignr m5, m2, m0, 8 | |
16742 palignr m3, m1, m2, 8 | |
16743 pmaddwd m5, [r3 - 3 * 32] ; [13] | |
16744 paddd m5, [pd_16] | |
16745 psrld m5, 5 | |
16746 pmaddwd m3, [r3 - 3 * 32] | |
16747 paddd m3, [pd_16] | |
16748 psrld m3, 5 | |
16749 packusdw m5, m3 | |
16750 | |
16751 palignr m7, m2, m0, 12 | |
16752 palignr m3, m1, m2, 12 | |
16753 pmaddwd m6, m7, [r3 - 14 * 32] ; [2] | |
16754 paddd m6, [pd_16] | |
16755 psrld m6, 5 | |
16756 pmaddwd m8, m3, [r3 - 14 * 32] | |
16757 paddd m8, [pd_16] | |
16758 psrld m8, 5 | |
16759 packusdw m6, m8 | |
16760 | |
16761 pmaddwd m7, [r3 + 7 * 32] ; [23] | |
16762 paddd m7, [pd_16] | |
16763 psrld m7, 5 | |
16764 pmaddwd m3, [r3 + 7 * 32] | |
16765 paddd m3, [pd_16] | |
16766 psrld m3, 5 | |
16767 packusdw m7, m3 | |
16768 | |
16769 pmaddwd m8, m2, [r3 - 4 * 32] ; [12] | |
16770 paddd m8, [pd_16] | |
16771 psrld m8, 5 | |
16772 pmaddwd m9, m1, [r3 - 4 * 32] | |
16773 paddd m9, [pd_16] | |
16774 psrld m9, 5 | |
16775 packusdw m8, m9 | |
16776 | |
; the remaining rows need samples beyond the windows loaded so far:
; reload and re-pair starting at r2 - 2 (m3 = low pairs, m0 = high pairs)
16777 movu m0, [r2 - 2] | |
16778 movu m1, [r2] | |
16779 | |
16780 punpcklwd m3, m0, m1 | |
16781 punpckhwd m0, m1 | |
16782 | |
; only the low pairs of the window at r2 + 14 are kept (in m2)
16783 movu m2, [r2 + 14] | |
16784 movu m1, [r2 + 16] | |
16785 punpcklwd m2, m1 | |
16786 | |
16787 pmaddwd m9, m3, [r3 - 15 * 32] ; [1] | |
16788 paddd m9, [pd_16] | |
16789 psrld m9, 5 | |
16790 pmaddwd m10, m0, [r3 - 15 * 32] | |
16791 paddd m10, [pd_16] | |
16792 psrld m10, 5 | |
16793 packusdw m9, m10 | |
16794 | |
16795 pmaddwd m10, m3, [r3 + 6 * 32] ; [22] | |
16796 paddd m10, [pd_16] | |
16797 psrld m10, 5 | |
16798 pmaddwd m11, m0, [r3 + 6 * 32] | |
16799 paddd m11, [pd_16] | |
16800 psrld m11, 5 | |
16801 packusdw m10, m11 | |
16802 | |
; order matters: m2 is rebuilt from the old m0 before m0 itself is overwritten
16803 palignr m2, m0, 4 | |
16804 palignr m0, m3, 4 | |
16805 pmaddwd m0, [r3 - 5 * 32] ; [11] | |
16806 paddd m0, [pd_16] | |
16807 psrld m0, 5 | |
16808 pmaddwd m2, [r3 - 5 * 32] | |
16809 paddd m2, [pd_16] | |
16810 psrld m2, 5 | |
16811 packusdw m0, m2 | |
; second eight results: m4..m10 plus m0; last macro argument is 0 this time
16812 TRANSPOSE_STORE_AVX2_STACK 0, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0 | |
16813 ret | |
16814 | |
16815 ;; angle 32, modes 16 and 20, rows 16 to 31 | |
; Helper shared by intra_pred_ang32_16 and intra_pred_ang32_20 (both call it).
; Inputs as prepared by those callers:
;   r2  - points into the reference copy staged on the caller's stack; all
;         loads here use negative offsets (r2 - 40 .. r2 - 20)
;   r3  - ang_table_avx2 + 16 * 32, so [r3 + k * 32] addresses table row 16 + k;
;         the bracketed number after each first pmaddwd is that row index
;   r6d - 0 when called for mode 16, 1 when called for mode 20
; r0/r1/r4 are not referenced directly in this block; presumably the store
; macro TRANSPOSE_STORE_AVX2_STACK (defined elsewhere) uses them - TODO confirm.
; Every output register follows the same recipe: pmaddwd of interleaved sample
; pairs with one table row, add [pd_16], shift right by 5, then packusdw the
; two half results into one register of words.
16816 cglobal ang32_mode_16_20_rows_16_31 | |
; Flags set here are not consumed by any instruction visible in this block;
; presumably the store macro branches on them (the SIMD instructions in
; between do not modify EFLAGS) - TODO confirm against the macro.
16817 test r6d, r6d | |
16818 | |
; m0/m1: samples starting at r2 - 40 and one word later at r2 - 38
16819 movu m0, [r2 - 40] | |
16820 movu m1, [r2 - 38] | |
16821 | |
; interleave into (sample, next sample) word pairs, per 128-bit lane:
; m3 = pairs from the low quadwords, m0 = pairs from the high quadwords
16822 punpcklwd m3, m0, m1 | |
16823 punpckhwd m0, m1 | |
16824 | |
; same pairing for the window 16 bytes further on: m2 = low pairs, m1 = high pairs
16825 movu m1, [r2 - 24] | |
16826 movu m4, [r2 - 22] | |
16827 punpcklwd m2, m1, m4 | |
16828 punpckhwd m1, m4 | |
16829 | |
16830 pmaddwd m4, m3, [r3 - 16 * 32] ; [0] | |
16831 paddd m4, [pd_16] | |
16832 psrld m4, 5 | |
16833 pmaddwd m5, m0, [r3 - 16 * 32] | |
16834 paddd m5, [pd_16] | |
16835 psrld m5, 5 | |
16836 packusdw m4, m5 | |
16837 | |
16838 pmaddwd m5, m3, [r3 + 5 * 32] ; [21] | |
16839 paddd m5, [pd_16] | |
16840 psrld m5, 5 | |
16841 pmaddwd m8, m0, [r3 + 5 * 32] | |
16842 paddd m8, [pd_16] | |
16843 psrld m8, 5 | |
16844 packusdw m5, m8 | |
16845 | |
; slide the pair window by 4 bytes (one word pair) within each 128-bit lane
16846 palignr m7, m0, m3, 4 | |
16847 palignr m8, m2, m0, 4 | |
16848 pmaddwd m6, m7, [r3 - 6 * 32] ; [10] | |
16849 paddd m6, [pd_16] | |
16850 psrld m6, 5 | |
16851 pmaddwd m9, m8, [r3 - 6 * 32] | |
16852 paddd m9, [pd_16] | |
16853 psrld m9, 5 | |
16854 packusdw m6, m9 | |
16855 | |
16856 pmaddwd m7, [r3 + 15 * 32] ; [31] | |
16857 paddd m7, [pd_16] | |
16858 psrld m7, 5 | |
16859 pmaddwd m8, [r3 + 15 * 32] | |
16860 paddd m8, [pd_16] | |
16861 psrld m8, 5 | |
16862 packusdw m7, m8 | |
16863 | |
; window slid by 8 bytes (two word pairs)
16864 palignr m8, m0, m3, 8 | |
16865 palignr m9, m2, m0, 8 | |
16866 pmaddwd m8, [r3 + 4 * 32] ; [20] | |
16867 paddd m8, [pd_16] | |
16868 psrld m8, 5 | |
16869 pmaddwd m9, [r3 + 4 * 32] | |
16870 paddd m9, [pd_16] | |
16871 psrld m9, 5 | |
16872 packusdw m8, m9 | |
16873 | |
; window slid by 12 bytes (three word pairs)
16874 palignr m10, m0, m3, 12 | |
16875 palignr m11, m2, m0, 12 | |
16876 pmaddwd m9, m10, [r3 - 7 * 32] ; [9] | |
16877 paddd m9, [pd_16] | |
16878 psrld m9, 5 | |
16879 pmaddwd m12, m11, [r3 - 7 * 32] | |
16880 paddd m12, [pd_16] | |
16881 psrld m12, 5 | |
16882 packusdw m9, m12 | |
16883 | |
16884 pmaddwd m10, [r3 + 14 * 32] ; [30] | |
16885 paddd m10, [pd_16] | |
16886 psrld m10, 5 | |
16887 pmaddwd m11, [r3 + 14 * 32] | |
16888 paddd m11, [pd_16] | |
16889 psrld m11, 5 | |
16890 packusdw m10, m11 | |
16891 | |
; from here the (m0, m2) pair plays the role (m3, m0) had above
16892 pmaddwd m11, m0, [r3 + 3 * 32] ; [19] | |
16893 paddd m11, [pd_16] | |
16894 psrld m11, 5 | |
16895 pmaddwd m12, m2, [r3 + 3 * 32] | |
16896 paddd m12, [pd_16] | |
16897 psrld m12, 5 | |
16898 packusdw m11, m12 | |
16899 | |
; first eight results are in m4..m11; the trailing macro arguments (12, 13, 16)
; look like two scratch registers and a byte offset - TODO confirm against the macro
16900 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 | |
16901 | |
; second group of eight results; m4..m10 are reused, m3 serves as a temporary
16902 palignr m5, m2, m0, 4 | |
16903 palignr m6, m1, m2, 4 | |
16904 pmaddwd m4, m5, [r3 - 8 * 32] ; [8] | |
16905 paddd m4, [pd_16] | |
16906 psrld m4, 5 | |
16907 pmaddwd m7, m6, [r3 - 8 * 32] | |
16908 paddd m7, [pd_16] | |
16909 psrld m7, 5 | |
16910 packusdw m4, m7 | |
16911 | |
16912 pmaddwd m5, [r3 + 13 * 32] ; [29] | |
16913 paddd m5, [pd_16] | |
16914 psrld m5, 5 | |
16915 pmaddwd m6, [r3 + 13 * 32] | |
16916 paddd m6, [pd_16] | |
16917 psrld m6, 5 | |
16918 packusdw m5, m6 | |
16919 | |
16920 palignr m6, m2, m0, 8 | |
16921 palignr m3, m1, m2, 8 | |
16922 pmaddwd m6, [r3 + 2 * 32] ; [18] | |
16923 paddd m6, [pd_16] | |
16924 psrld m6, 5 | |
16925 pmaddwd m3, [r3 + 2 * 32] | |
16926 paddd m3, [pd_16] | |
16927 psrld m3, 5 | |
16928 packusdw m6, m3 | |
16929 | |
16930 palignr m8, m2, m0, 12 | |
16931 palignr m9, m1, m2, 12 | |
16932 pmaddwd m7, m8, [r3 - 9 * 32] ; [7] | |
16933 paddd m7, [pd_16] | |
16934 psrld m7, 5 | |
16935 pmaddwd m10, m9, [r3 - 9 * 32] | |
16936 paddd m10, [pd_16] | |
16937 psrld m10, 5 | |
16938 packusdw m7, m10 | |
16939 | |
16940 pmaddwd m8, [r3 + 12 * 32] ; [28] | |
16941 paddd m8, [pd_16] | |
16942 psrld m8, 5 | |
16943 pmaddwd m9, [r3 + 12 * 32] | |
16944 paddd m9, [pd_16] | |
16945 psrld m9, 5 | |
16946 packusdw m8, m9 | |
16947 | |
16948 pmaddwd m9, m2, [r3 + 1 * 32] ; [17] | |
16949 paddd m9, [pd_16] | |
16950 psrld m9, 5 | |
16951 pmaddwd m3, m1, [r3 + 1 * 32] | |
16952 paddd m3, [pd_16] | |
16953 psrld m3, 5 | |
16954 packusdw m9, m3 | |
16955 | |
; the last two rows need samples beyond the windows loaded so far:
; reload and re-pair starting at r2 - 22 (m3 = low pairs, m0 = high pairs)
16956 movu m0, [r2 - 22] | |
16957 movu m1, [r2 - 20] | |
16958 punpcklwd m3, m0, m1 | |
16959 punpckhwd m0, m1 | |
16960 | |
16961 pmaddwd m10, m3, [r3 - 10 * 32] ; [6] | |
16962 paddd m10, [pd_16] | |
16963 psrld m10, 5 | |
16964 pmaddwd m11, m0, [r3 - 10 * 32] | |
16965 paddd m11, [pd_16] | |
16966 psrld m11, 5 | |
16967 packusdw m10, m11 | |
16968 | |
16969 pmaddwd m3, [r3 + 11 * 32] ; [27] | |
16970 paddd m3, [pd_16] | |
16971 psrld m3, 5 | |
16972 pmaddwd m0, [r3 + 11 * 32] | |
16973 paddd m0, [pd_16] | |
16974 psrld m0, 5 | |
16975 packusdw m3, m0 | |
; second eight results: m4..m10 plus m3; last macro argument is 0 this time
16976 TRANSPOSE_STORE_AVX2_STACK 3, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 | |
16977 ret | |
16978 | |
; intra_pred_ang32_16: driver for angular intra prediction mode 16 on a block
; of size 32 (per the function name). Declared through x86inc's cglobal with
; 3 arguments, 8 general-purpose registers and 14 vector registers.
; As used below: r0 is handed to the helpers (adjusted between calls), r1d is
; doubled before use, r2 points at the input reference samples.
; Compared with the mode 14/15 drivers this one reserves five mmsize slots:
; the helpers read up to 40 bytes below r2, so the staged copy starts at
; rsp + 2*mmsize instead of rsp + 1*mmsize.
16979 cglobal intra_pred_ang32_16, 3,8,14 | |
; keep the incoming rsp in r6, reserve five mmsize slots plus one pointer slot,
; align rsp down to 64 bytes and park the old rsp above the five slots
16980 mov r6, rsp | |
16981 sub rsp, 5*mmsize+gprsize | |
16982 and rsp, ~63 | |
16983 mov [rsp+5*mmsize], r6 | |
16984 | |
; copy two full vectors plus one dword starting at r2 + 128 ...
16985 movu m0, [r2 + 128] | |
16986 movu m1, [r2 + 160] | |
16987 movd xm2, [r2 + 192] | |
16988 | |
; ... into the stack slots starting at rsp + 2*mmsize (aligned stores)
16989 mova [rsp + 2*mmsize], m0 | |
16990 mova [rsp + 3*mmsize], m1 | |
16991 movd [rsp + 4*mmsize], xm2 | |
16992 | |
; r1 is doubled (presumably sample stride -> byte stride for 16-bit samples -
; TODO confirm); r4 = 3 * r1 is not read in this block, presumably it is used
; by the helpers' store macro
16993 add r1d, r1d | |
16994 lea r4, [r1 * 3] | |
; r3 points at row 16 of ang_table_avx2, so helpers can reach rows 0..31
; with offsets of -16*32 .. +15*32
16995 lea r3, [ang_table_avx2 + 16 * 32] | |
16996 | |
; gather selected words from the low part of the input buffer
; (five 16-byte windows, 12 bytes apart, from r2 + 4) using the
; pw_ang32_16_20 shuffle mask (defined elsewhere)
16997 movu xm1, [r2 + 4] | |
16998 movu xm2, [r2 + 16] | |
16999 movu xm3, [r2 + 28] | |
17000 movu xm4, [r2 + 40] | |
17001 movu xm5, [r2 + 52] | |
17002 pshufb xm1, [pw_ang32_16_20] | |
17003 pshufb xm2, [pw_ang32_16_20] | |
17004 pshufb xm3, [pw_ang32_16_20] | |
17005 pshufb xm4, [pw_ang32_16_20] | |
17006 pshufb xm5, [pw_ang32_16_20] | |
17007 | |
; keep only the high quadwords:
; xm2 = {xm2.hi, xm1.hi}, xm4 = {xm4.hi, xm3.hi}, xm5 = {xm5.hi, xm5.hi}
17008 punpckhqdq xm2, xm1 | |
17009 punpckhqdq xm4, xm3 | |
17010 punpckhqdq xm5, xm5 | |
17011 | |
; overwrite word 0 of the staged copy with the word at [r2]
; (r6 is free again: the saved rsp already lives on the stack)
17012 movzx r6d, word [r2] | |
17013 mov [rsp + 2*mmsize], r6w | |
; place the gathered words directly below the staged copy (rsp + 24 .. rsp + 63)
; so the helpers can reach them at negative offsets from r2
17014 movu [rsp + 48], xm2 | |
17015 movu [rsp + 32], xm4 | |
17016 movq [rsp + 24], xm5 | |
17017 | |
; r6d = 0 is the value the helpers test with "test r6d, r6d"
17018 xor r6d, r6d | |
; r2 = start of the staged copy
17019 lea r2, [rsp + 2*mmsize] | |
; r7 = r0 + 8 * r1, used below to rebuild r0; relies on the helpers leaving r7 intact
17020 lea r7, [r0 + 8 * r1] | |
17021 | |
17022 call ang32_mode_16_20_rows_0_15 | |
17023 | |
; advance the destination by 32 bytes before the second helper
17024 lea r0, [r0 + 32] | |
17025 | |
17026 call ang32_mode_16_20_rows_16_31 | |
17027 | |
; second pass: references 32 bytes further on, r0 = original r0 + 16 * r1
17028 add r2, 32 | |
17029 lea r0, [r7 + 8 * r1] | |
17030 | |
17031 call ang32_mode_16_20_rows_0_15 | |
17032 | |
17033 lea r0, [r0 + 32] | |
17034 | |
17035 call ang32_mode_16_20_rows_16_31 | |
17036 | |
; restore the stack pointer saved in the prologue
17037 mov rsp, [rsp+5*mmsize] | |
17038 RET | |
17039 | |
; intra_pred_ang32_20: driver for angular intra prediction mode 20 on a block
; of size 32 (per the function name). Same structure as intra_pred_ang32_16
; but with the two halves of the input buffer swapped (main copy from r2 + 0,
; gathered words from r2 + 132 ..) and r6d = 1 passed to the helpers.
; Declared with 3 arguments, 8 general-purpose and 14 vector registers.
17040 cglobal intra_pred_ang32_20, 3,8,14 | |
; save rsp in r6, reserve five mmsize slots plus one pointer slot, align rsp
; down to 64 bytes and store the old rsp above the five slots
17041 mov r6, rsp | |
17042 sub rsp, 5*mmsize+gprsize | |
17043 and rsp, ~63 | |
17044 mov [rsp+5*mmsize], r6 | |
17045 | |
; copy two full vectors plus one dword from the start of the input buffer ...
17046 movu m0, [r2] | |
17047 movu m1, [r2 + 32] | |
17048 movd xm2, [r2 + 64] | |
17049 | |
; ... into the stack slots starting at rsp + 2*mmsize
17050 mova [rsp + 2*mmsize], m0 | |
17051 mova [rsp + 3*mmsize], m1 | |
17052 movd [rsp + 4*mmsize], xm2 | |
17053 | |
; r1 doubled (presumably sample stride -> byte stride - TODO confirm);
; r4 = 3 * r1 is not read in this block, presumably used by the store macro
17054 add r1d, r1d | |
17055 lea r4, [r1 * 3] | |
; r3 = row 16 of ang_table_avx2
17056 lea r3, [ang_table_avx2 + 16 * 32] | |
17057 | |
; gather selected words from five 16-byte windows, 12 bytes apart, starting
; at r2 + 132, via the pw_ang32_16_20 shuffle mask (defined elsewhere)
17058 movu xm1, [r2 + 132] | |
17059 movu xm2, [r2 + 144] | |
17060 movu xm3, [r2 + 156] | |
17061 movu xm4, [r2 + 168] | |
17062 movu xm5, [r2 + 180] | |
17063 pshufb xm1, [pw_ang32_16_20] | |
17064 pshufb xm2, [pw_ang32_16_20] | |
17065 pshufb xm3, [pw_ang32_16_20] | |
17066 pshufb xm4, [pw_ang32_16_20] | |
17067 pshufb xm5, [pw_ang32_16_20] | |
17068 | |
; keep only the high quadwords:
; xm2 = {xm2.hi, xm1.hi}, xm4 = {xm4.hi, xm3.hi}, xm5 = {xm5.hi, xm5.hi}
17069 punpckhqdq xm2, xm1 | |
17070 punpckhqdq xm4, xm3 | |
17071 punpckhqdq xm5, xm5 | |
17072 | |
; gathered words go directly below the staged copy (rsp + 24 .. rsp + 63);
; unlike mode 16, word 0 of the staged copy is left as copied from [r2]
17073 movu [rsp + 48], xm2 | |
17074 movu [rsp + 32], xm4 | |
17075 movq [rsp + 24], xm5 | |
17076 | |
; r6d = 1 is the value the helpers test with "test r6d, r6d"
17077 xor r6d, r6d | |
17078 inc r6d | |
; r2 = start of the staged copy
17079 lea r2, [rsp + 2*mmsize] | |
; r5 = r0 + 32 for the second pass; relies on the helpers leaving r5 intact
17080 lea r5, [r0 + 32] | |
17081 | |
17082 call ang32_mode_16_20_rows_0_15 | |
17083 | |
; r0 += 16 * r1, done as two steps because an index scale of 16 is not encodable
17084 lea r0, [r0 + 8 * r1] | |
17085 lea r0, [r0 + 8 * r1] | |
17086 | |
17087 call ang32_mode_16_20_rows_16_31 | |
17088 | |
; second pass: references 32 bytes further on, destination = original r0 + 32
17089 add r2, 32 | |
17090 mov r0, r5 | |
17091 | |
17092 call ang32_mode_16_20_rows_0_15 | |
17093 | |
17094 lea r0, [r0 + 8 * r1] | |
17095 lea r0, [r0 + 8 * r1] | |
17096 | |
17097 call ang32_mode_16_20_rows_16_31 | |
17098 | |
; restore the stack pointer saved in the prologue
17099 mov rsp, [rsp+5*mmsize] | |
17100 RET | |
17101 | |
17102 ;; angle 32, modes 17 and 19, row 0 to 15 | |
; Shared helper: intra_pred_ang32_17 and intra_pred_ang32_19 each call it four times.
; Inputs as used in this block:
;   r2  - pointer into the caller's stack copy of the reference samples; vectors are
;         read from [r2 - 24] up to [r2 + 12].
;   r3  - weight-table pointer; both callers pass ang_table_avx2 + 16 * 32, so
;         [r3 + k * 32] is table entry 16 + k (the bracketed number in each comment).
;   r6d - caller flag (the mode-17 caller passes 0, the mode-19 caller passes 1); only tested here.
; Every result vector is built the same way for a low and a high half:
;   pmaddwd(interleaved sample pairs, table entry) + [pd_16], logical shift right by 5,
;   then packusdw of the two halves.
; The output stores are done by TRANSPOSE_STORE_AVX2_STACK, which is defined outside
; this view; presumably it is what uses r0/r1/r4 set up by the callers - TODO confirm.
; Vector registers m0-m12 are written here; m13 is additionally handed to the macro.
17103 cglobal ang32_mode_17_19_rows_0_15 | |
; The flags produced by this test are not consumed inside this block, and nothing
; between here and the macro invocations is a flag-writing integer instruction;
; presumably TRANSPOSE_STORE_AVX2_STACK branches on ZF - TODO confirm in the macro.
17104 test r6d, r6d | |
17105 | |
17106 movu m0, [r2 - 24] | |
17107 movu m1, [r2 - 22] | |
17108 | |
; interleave each sample with its next neighbour (the load 2 bytes further on)
17109 punpcklwd m3, m0, m1 | |
17110 punpckhwd m0, m1 | |
17111 | |
17112 movu m1, [r2 - 8] | |
17113 movu m4, [r2 - 6] | |
17114 punpcklwd m2, m1, m4 | |
17115 punpckhwd m1, m4 | |
17116 | |
17117 pmaddwd m4, m3, [r3 - 16 * 32] ; [0] | |
17118 paddd m4, [pd_16] | |
17119 psrld m4, 5 | |
17120 pmaddwd m5, m0, [r3 - 16 * 32] | |
17121 paddd m5, [pd_16] | |
17122 psrld m5, 5 | |
17123 packusdw m4, m5 | |
17124 | |
17125 pmaddwd m5, m3, [r3 + 10 * 32] ; [26] | |
17126 paddd m5, [pd_16] | |
17127 psrld m5, 5 | |
17128 pmaddwd m8, m0, [r3 + 10 * 32] | |
17129 paddd m8, [pd_16] | |
17130 psrld m8, 5 | |
17131 packusdw m5, m8 | |
17132 | |
17133 palignr m6, m0, m3, 4 | |
17134 palignr m8, m2, m0, 4 | |
17135 pmaddwd m6, [r3 + 4 * 32] ; [20] | |
17136 paddd m6, [pd_16] | |
17137 psrld m6, 5 | |
17138 pmaddwd m8, [r3 + 4 * 32] | |
17139 paddd m8, [pd_16] | |
17140 psrld m8, 5 | |
17141 packusdw m6, m8 | |
17142 | |
17143 palignr m7, m0, m3, 8 | |
17144 palignr m9, m2, m0, 8 | |
17145 pmaddwd m7, [r3 - 2 * 32] ; [14] | |
17146 paddd m7, [pd_16] | |
17147 psrld m7, 5 | |
17148 pmaddwd m9, [r3 - 2 * 32] | |
17149 paddd m9, [pd_16] | |
17150 psrld m9, 5 | |
17151 packusdw m7, m9 | |
17152 | |
17153 palignr m8, m0, m3, 12 | |
17154 palignr m10, m2, m0, 12 | |
17155 pmaddwd m8, [r3 - 8 * 32] ; [8] | |
17156 paddd m8, [pd_16] | |
17157 psrld m8, 5 | |
17158 pmaddwd m10, [r3 - 8 * 32] | |
17159 paddd m10, [pd_16] | |
17160 psrld m10, 5 | |
17161 packusdw m8, m10 | |
17162 | |
17163 pmaddwd m9, m0, [r3 - 14 * 32] ; [2] | |
17164 paddd m9, [pd_16] | |
17165 psrld m9, 5 | |
17166 pmaddwd m12, m2, [r3 - 14 * 32] | |
17167 paddd m12, [pd_16] | |
17168 psrld m12, 5 | |
17169 packusdw m9, m12 | |
17170 | |
17171 pmaddwd m10, m0, [r3 + 12 * 32] ; [28] | |
17172 paddd m10, [pd_16] | |
17173 psrld m10, 5 | |
17174 pmaddwd m11, m2, [r3 + 12 * 32] | |
17175 paddd m11, [pd_16] | |
17176 psrld m11, 5 | |
17177 packusdw m10, m11 | |
17178 | |
17179 palignr m11, m2, m0, 4 | |
17180 palignr m12, m1, m2, 4 | |
17181 pmaddwd m11, [r3 + 6 * 32] ; [22] | |
17182 paddd m11, [pd_16] | |
17183 psrld m11, 5 | |
17184 pmaddwd m12, [r3 + 6 * 32] | |
17185 paddd m12, [pd_16] | |
17186 psrld m12, 5 | |
17187 packusdw m11, m12 | |
17188 | |
; first eight result vectors (m4..m11) handed to the store macro, listed m11 first;
; the meaning of the trailing arguments (12, 13, 16) is defined by the macro, not here
17189 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 | |
17190 | |
17191 palignr m4, m2, m0, 8 | |
17192 palignr m5, m1, m2, 8 | |
17193 pmaddwd m4, [r3] ; [16] | |
17194 paddd m4, [pd_16] | |
17195 psrld m4, 5 | |
17196 pmaddwd m5, [r3] | |
17197 paddd m5, [pd_16] | |
17198 psrld m5, 5 | |
17199 packusdw m4, m5 | |
17200 | |
17201 palignr m5, m2, m0, 12 | |
17202 palignr m3, m1, m2, 12 | |
17203 pmaddwd m5, [r3 - 6 * 32] ; [10] | |
17204 paddd m5, [pd_16] | |
17205 psrld m5, 5 | |
17206 pmaddwd m3, [r3 - 6 * 32] | |
17207 paddd m3, [pd_16] | |
17208 psrld m3, 5 | |
17209 packusdw m5, m3 | |
17210 | |
17211 pmaddwd m6, m2, [r3 - 12 * 32] ; [4] | |
17212 paddd m6, [pd_16] | |
17213 psrld m6, 5 | |
17214 pmaddwd m8, m1, [r3 - 12 * 32] | |
17215 paddd m8, [pd_16] | |
17216 psrld m8, 5 | |
17217 packusdw m6, m8 | |
17218 | |
17219 pmaddwd m7, m2, [r3 + 14 * 32] ; [30] | |
17220 paddd m7, [pd_16] | |
17221 psrld m7, 5 | |
17222 pmaddwd m3, m1, [r3 + 14 * 32] | |
17223 paddd m3, [pd_16] | |
17224 psrld m3, 5 | |
17225 packusdw m7, m3 | |
17226 | |
; reload the sample window 18 bytes further along the buffer (r2 - 6 instead of r2 - 24)
17227 movu m0, [r2 - 6] | |
17228 movu m1, [r2 - 4] | |
17229 | |
17230 punpcklwd m3, m0, m1 | |
17231 punpckhwd m0, m1 | |
17232 | |
; only the low interleave of the following window is built; it is used solely as the
; upper source of the palignr operations below
17233 movu m2, [r2 + 10] | |
17234 movu m1, [r2 + 12] | |
17235 punpcklwd m2, m1 | |
17236 | |
17237 pmaddwd m8, m3, [r3 + 8 * 32] ; [24] | |
17238 paddd m8, [pd_16] | |
17239 psrld m8, 5 | |
17240 pmaddwd m9, m0, [r3 + 8 * 32] | |
17241 paddd m9, [pd_16] | |
17242 psrld m9, 5 | |
17243 packusdw m8, m9 | |
17244 | |
17245 palignr m9, m0, m3, 4 | |
17246 palignr m10, m2, m0, 4 | |
17247 pmaddwd m9, [r3 + 2 * 32] ; [18] | |
17248 paddd m9, [pd_16] | |
17249 psrld m9, 5 | |
17250 pmaddwd m10, [r3 + 2 * 32] | |
17251 paddd m10, [pd_16] | |
17252 psrld m10, 5 | |
17253 packusdw m9, m10 | |
17254 | |
17255 palignr m10, m0, m3, 8 | |
17256 palignr m11, m2, m0, 8 | |
17257 pmaddwd m10, [r3 - 4 * 32] ; [12] | |
17258 paddd m10, [pd_16] | |
17259 psrld m10, 5 | |
17260 pmaddwd m11, [r3 - 4 * 32] | |
17261 paddd m11, [pd_16] | |
17262 psrld m11, 5 | |
17263 packusdw m10, m11 | |
17264 | |
; m2 must be shifted before m0 is overwritten, because the first palignr reads m0
17265 palignr m2, m0, 12 | |
17266 palignr m0, m3, 12 | |
17267 pmaddwd m0, [r3 - 10 * 32] ; [6] | |
17268 paddd m0, [pd_16] | |
17269 psrld m0, 5 | |
17270 pmaddwd m2, [r3 - 10 * 32] | |
17271 paddd m2, [pd_16] | |
17272 psrld m2, 5 | |
17273 packusdw m0, m2 | |
; second group of eight result vectors (m4..m10 and m0), listed m0 first
17274 TRANSPOSE_STORE_AVX2_STACK 0, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0 | |
17275 ret | |
17276 | |
; intra_pred_ang32_17: 32x32 angular intra prediction, mode 17 (per the function name).
; Declared through x86inc's cglobal with 3 arguments, 8 GPRs and 14 vector registers.
; As used below: r0 = destination pointer, r1 = destination stride (doubled before use),
; r2 = source reference-sample buffer.
; The routine builds a working copy of the reference samples on an aligned stack frame
; and then calls ang32_mode_17_19_rows_0_15 four times with r6d = 0.
17277 cglobal intra_pred_ang32_17, 3,8,14 | |
; carve out a 64-byte aligned scratch frame; the caller's rsp is kept at [rsp + 5*mmsize]
17278 mov r6, rsp | |
17279 sub rsp, 5*mmsize+gprsize | |
17280 and rsp, ~63 | |
17281 mov [rsp+5*mmsize], r6 | |
17282 | |
; copy two full vectors plus 4 bytes from [r2 + 128] to the frame at offset 2*mmsize
17283 movu m0, [r2 + 128] | |
17284 movu m1, [r2 + 160] | |
17285 movd xm2, [r2 + 192] | |
17286 | |
17287 mova [rsp + 2*mmsize], m0 | |
17288 mova [rsp + 3*mmsize], m1 | |
17289 movd [rsp + 4*mmsize], xm2 | |
17290 | |
; r1 = 2 * stride (presumably converting a stride in 16-bit samples to bytes - confirm
; against the C prototype); r4 = 3 * r1; r3 = middle of the weight table
17291 add r1d, r1d | |
17292 lea r4, [r1 * 3] | |
17293 lea r3, [ang_table_avx2 + 16 * 32] | |
17294 | |
; pick samples from the other reference edge ([r2 + 2] onward) with byte-shuffle masks
; that are defined outside this view
17295 movu xm1, [r2 + 2] | |
17296 movu xm2, [r2 + 18] | |
17297 movu xm3, [r2 + 34] | |
17298 movu xm4, [r2 + 50] | |
17299 pshufb xm1, [pw_ang32_17_19_0] | |
17300 pshufb xm2, [shuf_mode_17_19] | |
17301 pshufb xm3, [pw_ang32_17_19_0] | |
17302 pshufb xm4, [shuf_mode_17_19] | |
17303 | |
; the first word of the copied row is replaced by the word at [r2]
17304 movzx r6d, word [r2] | |
17305 mov [rsp + 2*mmsize], r6w | |
; these four 16-byte stores overlap: each later store overwrites the low bytes of the
; one before it, so their order is significant
17306 movu [rsp + 48], xm1 | |
17307 movu [rsp + 36], xm2 | |
17308 movu [rsp + 22], xm3 | |
17309 movu [rsp + 10], xm4 | |
17310 | |
; r6d = 0 is the flag tested by the helper; r2 now points at the stack copy;
; r7 = r0 + 8 * r1 is captured before the first call
17311 xor r6d, r6d | |
17312 lea r2, [rsp + 2*mmsize] | |
17313 lea r7, [r0 + 8 * r1] | |
17314 | |
17315 call ang32_mode_17_19_rows_0_15 | |
17316 | |
; second call: source window 26 bytes earlier, destination 32 bytes further on
17317 sub r2, 26 | |
17318 lea r0, [r0 + 32] | |
17319 | |
17320 call ang32_mode_17_19_rows_0_15 | |
17321 | |
; third call: source 32 bytes past the first call's r2; destination r7 + 8 * r1
17322 add r2, 58 | |
17323 lea r0, [r7 + 8 * r1] | |
17324 | |
17325 call ang32_mode_17_19_rows_0_15 | |
17326 | |
; fourth call: same -26 / +32 step as between the first and second calls
17327 sub r2, 26 | |
17328 lea r0, [r0 + 32] | |
17329 | |
17330 call ang32_mode_17_19_rows_0_15 | |
17331 | |
; restore the caller's stack pointer saved in the prologue
17332 mov rsp, [rsp+5*mmsize] | |
17333 RET | |
17334 | |
; intra_pred_ang32_19: 32x32 angular intra prediction, mode 19 (per the function name).
; Declared through x86inc's cglobal with 3 arguments, 8 GPRs and 14 vector registers.
; Same structure as intra_pred_ang32_17, with the two source regions swapped
; ([r2] is copied, [r2 + 130] onward is shuffled) and r6d = 1 passed to the helper.
17335 cglobal intra_pred_ang32_19, 3,8,14 | |
; carve out a 64-byte aligned scratch frame; the caller's rsp is kept at [rsp + 5*mmsize]
17336 mov r6, rsp | |
17337 sub rsp, 5*mmsize+gprsize | |
17338 and rsp, ~63 | |
17339 mov [rsp+5*mmsize], r6 | |
17340 | |
; copy two full vectors plus 4 bytes from [r2] to the frame at offset 2*mmsize
17341 movu m0, [r2] | |
17342 movu m1, [r2 + 32] | |
17343 movd xm2, [r2 + 64] | |
17344 | |
17345 mova [rsp + 2*mmsize], m0 | |
17346 mova [rsp + 3*mmsize], m1 | |
17347 movd [rsp + 4*mmsize], xm2 | |
17348 | |
; r1 = 2 * stride (presumably samples -> bytes, confirm against the C prototype);
; r4 = 3 * r1; r3 = middle of the weight table
17349 add r1d, r1d | |
17350 lea r4, [r1 * 3] | |
17351 lea r3, [ang_table_avx2 + 16 * 32] | |
17352 | |
; pick samples from the other reference edge ([r2 + 130] onward) with byte-shuffle
; masks that are defined outside this view
17353 movu xm1, [r2 + 130] | |
17354 movu xm2, [r2 + 146] | |
17355 movu xm3, [r2 + 162] | |
17356 movu xm4, [r2 + 178] | |
17357 pshufb xm1, [pw_ang32_17_19_0] | |
17358 pshufb xm2, [shuf_mode_17_19] | |
17359 pshufb xm3, [pw_ang32_17_19_0] | |
17360 pshufb xm4, [shuf_mode_17_19] | |
17361 | |
; these four 16-byte stores overlap: each later store overwrites the low bytes of the
; one before it, so their order is significant
17362 movu [rsp + 48], xm1 | |
17363 movu [rsp + 36], xm2 | |
17364 movu [rsp + 22], xm3 | |
17365 movu [rsp + 10], xm4 | |
17366 | |
; r6d = 1 is the flag tested by the helper; r2 now points at the stack copy;
; r5 = r0 + 32 is captured before the first call
17367 xor r6d, r6d | |
17368 inc r6d | |
17369 lea r2, [rsp + 2*mmsize] | |
17370 lea r5, [r0 + 32] | |
17371 | |
17372 call ang32_mode_17_19_rows_0_15 | |
17373 | |
; second call: source window 26 bytes earlier, destination advanced by 16 * r1
17374 sub r2, 26 | |
17375 lea r0, [r0 + 8 * r1] | |
17376 lea r0, [r0 + 8 * r1] | |
17377 | |
17378 call ang32_mode_17_19_rows_0_15 | |
17379 | |
; third call: source 32 bytes past the first call's r2; destination r5
17380 add r2, 58 | |
17381 mov r0, r5 | |
17382 | |
17383 call ang32_mode_17_19_rows_0_15 | |
17384 | |
; fourth call: same -26 / +16 * r1 step as between the first and second calls
17385 sub r2, 26 | |
17386 lea r0, [r0 + 8 * r1] | |
17387 lea r0, [r0 + 8 * r1] | |
17388 | |
17389 call ang32_mode_17_19_rows_0_15 | |
17390 | |
; restore the caller's stack pointer saved in the prologue
17391 mov rsp, [rsp+5*mmsize] | |
17392 RET | |
17393 | |
; intra_pred_ang32_18: 32x32 angular intra prediction, mode 18 (per the function name).
; Declared through x86inc's cglobal with 3 arguments, 6 GPRs and 6 vector registers.
; As used below: r0 = destination pointer, r1 = destination stride (doubled before use),
; r2 = source reference-sample buffer.
; No arithmetic is done on the samples. A contiguous buffer is assembled on the stack
; and each of the 32 output rows is a 64-byte window of it, starting 2 bytes earlier
; than the previous row: row n is the 64 bytes at (rsp + 2*mmsize) - 2*n.
; Byte counts in these comments assume mmsize = 32, as the +32 store offsets and the
; use of vpermq imply.
17394 cglobal intra_pred_ang32_18, 3,6,6 | |
; carve out a 64-byte aligned scratch frame; the caller's rsp is kept at [rsp + 4*mmsize]
17395 mov r4, rsp | |
17396 sub rsp, 4*mmsize+gprsize | |
17397 and rsp, ~63 | |
17398 mov [rsp+4*mmsize], r4 | |
17399 | |
; upper half of the buffer: a straight copy of the two vectors at [r2]
17400 movu m0, [r2] | |
17401 movu m1, [r2 + 32] | |
17402 mova [rsp + 2*mmsize], m0 | |
17403 mova [rsp + 3*mmsize], m1 | |
17404 | |
; lower half of the buffer: the two vectors at [r2 + 130], each passed through the
; pw_swap16 byte shuffle and a 128-bit half swap (vpermq 01001110b), then stored with
; the two vectors exchanged; presumably this reverses the sample order (pw_swap16 is
; defined outside this view - confirm)
17405 movu m2, [r2 + 130] | |
17406 movu m3, [r2 + 162] | |
17407 pshufb m2, [pw_swap16] | |
17408 pshufb m3, [pw_swap16] | |
17409 vpermq m2, m2, 01001110b | |
17410 vpermq m3, m3, 01001110b | |
17411 mova [rsp + 1*mmsize], m2 | |
17412 mova [rsp + 0*mmsize], m3 | |
17413 | |
; r1 = 2 * stride; r2 = middle of the stack buffer; r4/r3/r5 = 2x/3x/4x the doubled stride
17414 add r1d, r1d | |
17415 lea r2, [rsp+2*mmsize] | |
17416 lea r4, [r1 * 2] | |
17417 lea r3, [r1 * 3] | |
17418 lea r5, [r1 * 4] | |
17419 | |
; m2/m3 are loaded 16 bytes below m0/m1 because palignr works on each 128-bit lane
; separately; with that offset, palignr by (16 - 2k) yields the window at r2 - 2k
17420 movu m0, [r2] | |
17421 movu m1, [r2 + 32] | |
17422 movu m2, [r2 - 16] | |
17423 movu m3, [r2 + 16] | |
17424 | |
; row 0
17425 movu [r0], m0 | |
17426 movu [r0 + 32], m1 | |
17427 | |
; rows 1-3
17428 palignr m4, m0, m2, 14 | |
17429 palignr m5, m1, m3, 14 | |
17430 movu [r0 + r1], m4 | |
17431 movu [r0 + r1 + 32], m5 | |
17432 | |
17433 palignr m4, m0, m2, 12 | |
17434 palignr m5, m1, m3, 12 | |
17435 movu [r0 + r4], m4 | |
17436 movu [r0 + r4 + 32], m5 | |
17437 | |
17438 palignr m4, m0, m2, 10 | |
17439 palignr m5, m1, m3, 10 | |
17440 movu [r0 + r3], m4 | |
17441 movu [r0 + r3 + 32], m5 | |
17442 | |
17443 add r0, r5 | |
17444 | |
; rows 4-7
17445 palignr m4, m0, m2, 8 | |
17446 palignr m5, m1, m3, 8 | |
17447 movu [r0], m4 | |
17448 movu [r0 + 32], m5 | |
17449 | |
17450 palignr m4, m0, m2, 6 | |
17451 palignr m5, m1, m3, 6 | |
17452 movu [r0 + r1], m4 | |
17453 movu [r0 + r1 + 32], m5 | |
17454 | |
17455 palignr m4, m0, m2, 4 | |
17456 palignr m5, m1, m3, 4 | |
17457 movu [r0 + r4], m4 | |
17458 movu [r0 + r4 + 32], m5 | |
17459 | |
17460 palignr m4, m0, m2, 2 | |
17461 palignr m5, m1, m3, 2 | |
17462 movu [r0 + r3], m4 | |
17463 movu [r0 + r3 + 32], m5 | |
17464 | |
17465 add r0, r5 | |
17466 | |
; row 8 is the window at r2 - 16, which m2/m3 already hold
17467 movu [r0], m2 | |
17468 movu [r0 + 32], m3 | |
17469 | |
; m2/m3 become the upper sources; m0/m1 are reloaded 16 bytes below them
17470 movu m0, [r2 - 32] | |
17471 movu m1, [r2] | |
17472 | |
; rows 9-11
17473 palignr m4, m2, m0, 14 | |
17474 palignr m5, m3, m1, 14 | |
17475 movu [r0 + r1], m4 | |
17476 movu [r0 + r1 + 32], m5 | |
17477 | |
17478 palignr m4, m2, m0, 12 | |
17479 palignr m5, m3, m1, 12 | |
17480 movu [r0 + r4], m4 | |
17481 movu [r0 + r4 + 32], m5 | |
17482 | |
17483 palignr m4, m2, m0, 10 | |
17484 palignr m5, m3, m1, 10 | |
17485 movu [r0 + r3], m4 | |
17486 movu [r0 + r3 + 32], m5 | |
17487 | |
17488 add r0, r5 | |
17489 | |
; rows 12-15
17490 palignr m4, m2, m0, 8 | |
17491 palignr m5, m3, m1, 8 | |
17492 movu [r0], m4 | |
17493 movu [r0 + 32], m5 | |
17494 | |
17495 palignr m4, m2, m0, 6 | |
17496 palignr m5, m3, m1, 6 | |
17497 movu [r0 + r1], m4 | |
17498 movu [r0 + r1 + 32], m5 | |
17499 | |
17500 palignr m4, m2, m0, 4 | |
17501 palignr m5, m3, m1, 4 | |
17502 movu [r0 + r4], m4 | |
17503 movu [r0 + r4 + 32], m5 | |
17504 | |
17505 palignr m4, m2, m0, 2 | |
17506 palignr m5, m3, m1, 2 | |
17507 movu [r0 + r3], m4 | |
17508 movu [r0 + r3 + 32], m5 | |
17509 | |
17510 add r0, r5 | |
17511 | |
; row 16 is the window at r2 - 32, which m0/m1 already hold
17512 movu [r0], m0 | |
17513 movu [r0 + 32], m1 | |
17514 | |
17515 movu m2, [r2 - 48] | |
17516 movu m3, [r2 - 16] | |
17517 | |
; rows 17-19
17518 palignr m4, m0, m2, 14 | |
17519 palignr m5, m1, m3, 14 | |
17520 movu [r0 + r1], m4 | |
17521 movu [r0 + r1 + 32], m5 | |
17522 | |
17523 palignr m4, m0, m2, 12 | |
17524 palignr m5, m1, m3, 12 | |
17525 movu [r0 + r4], m4 | |
17526 movu [r0 + r4 + 32], m5 | |
17527 | |
17528 palignr m4, m0, m2, 10 | |
17529 palignr m5, m1, m3, 10 | |
17530 movu [r0 + r3], m4 | |
17531 movu [r0 + r3 + 32], m5 | |
17532 | |
17533 add r0, r5 | |
17534 | |
; rows 20-23
17535 palignr m4, m0, m2, 8 | |
17536 palignr m5, m1, m3, 8 | |
17537 movu [r0], m4 | |
17538 movu [r0 + 32], m5 | |
17539 | |
17540 palignr m4, m0, m2, 6 | |
17541 palignr m5, m1, m3, 6 | |
17542 movu [r0 + r1], m4 | |
17543 movu [r0 + r1 + 32], m5 | |
17544 | |
17545 palignr m4, m0, m2, 4 | |
17546 palignr m5, m1, m3, 4 | |
17547 movu [r0 + r4], m4 | |
17548 movu [r0 + r4 + 32], m5 | |
17549 | |
17550 palignr m4, m0, m2, 2 | |
17551 palignr m5, m1, m3, 2 | |
17552 movu [r0 + r3], m4 | |
17553 movu [r0 + r3 + 32], m5 | |
17554 | |
17555 add r0, r5 | |
17556 | |
; row 24 is the window at r2 - 48, which m2/m3 already hold
17557 movu [r0], m2 | |
17558 movu [r0 + 32], m3 | |
17559 | |
; lowest loads: [r2 - 64] is the start of the stack buffer
17560 movu m0, [r2 - 64] | |
17561 movu m1, [r2 - 32] | |
17562 | |
; rows 25-27
17563 palignr m4, m2, m0, 14 | |
17564 palignr m5, m3, m1, 14 | |
17565 movu [r0 + r1], m4 | |
17566 movu [r0 + r1 + 32], m5 | |
17567 | |
17568 palignr m4, m2, m0, 12 | |
17569 palignr m5, m3, m1, 12 | |
17570 movu [r0 + r4], m4 | |
17571 movu [r0 + r4 + 32], m5 | |
17572 | |
17573 palignr m4, m2, m0, 10 | |
17574 palignr m5, m3, m1, 10 | |
17575 movu [r0 + r3], m4 | |
17576 movu [r0 + r3 + 32], m5 | |
17577 | |
17578 add r0, r5 | |
17579 | |
; rows 28-31
17580 palignr m4, m2, m0, 8 | |
17581 palignr m5, m3, m1, 8 | |
17582 movu [r0], m4 | |
17583 movu [r0 + 32], m5 | |
17584 | |
17585 palignr m4, m2, m0, 6 | |
17586 palignr m5, m3, m1, 6 | |
17587 movu [r0 + r1], m4 | |
17588 movu [r0 + r1 + 32], m5 | |
17589 | |
17590 palignr m4, m2, m0, 4 | |
17591 palignr m5, m3, m1, 4 | |
17592 movu [r0 + r4], m4 | |
17593 movu [r0 + r4 + 32], m5 | |
17594 | |
17595 palignr m4, m2, m0, 2 | |
17596 palignr m5, m3, m1, 2 | |
17597 movu [r0 + r3], m4 | |
17598 movu [r0 + r3 + 32], m5 | |
17599 | |
; restore the caller's stack pointer saved in the prologue
17600 mov rsp, [rsp+4*mmsize] | |
17601 RET | |
17602 ;------------------------------------------------------------------------------------------------------- | |
17603 ; end of avx2 code for intra_pred_ang32 mode 2 to 34 | |
17604 ;------------------------------------------------------------------------------------------------------- | |
17605 | |
; MODE_2_34: takes no macro arguments; writes 16 rows of 64 bytes each.
; No arithmetic is done on the samples: row n (n = 0..15) is the 64 bytes starting at
; r2 + 4 + 2*n, assembled from 16-byte loads with palignr.
; Register use as written here:
;   r2 - source pointer (not modified by the macro)
;   r0 - destination pointer; advanced by 4 * r1 four times (16 * r1 in total)
;   r1 - row pitch; r3 and r4 address the third and fourth row of each group of four,
;        so the invoker presumably sets r3 = 2 * r1 and r4 = 3 * r1 (not visible here)
;   m0-m5 are clobbered.
; The 16-byte load and store spacing implies 16-byte vector registers (mmsize = 16).
17606 %macro MODE_2_34 0 | |
17607 movu m0, [r2 + 4] | |
17608 movu m1, [r2 + 20] | |
17609 movu m2, [r2 + 36] | |
17610 movu m3, [r2 + 52] | |
17611 movu m4, [r2 + 68] | |
; row 0
17612 movu [r0], m0 | |
17613 movu [r0 + 16], m1 | |
17614 movu [r0 + 32], m2 | |
17615 movu [r0 + 48], m3 | |
; row 1
17616 palignr m5, m1, m0, 2 | |
17617 movu [r0 + r1], m5 | |
17618 palignr m5, m2, m1, 2 | |
17619 movu [r0 + r1 + 16], m5 | |
17620 palignr m5, m3, m2, 2 | |
17621 movu [r0 + r1 + 32], m5 | |
17622 palignr m5, m4, m3, 2 | |
17623 movu [r0 + r1 + 48], m5 | |
; row 2
17624 palignr m5, m1, m0, 4 | |
17625 movu [r0 + r3], m5 | |
17626 palignr m5, m2, m1, 4 | |
17627 movu [r0 + r3 + 16], m5 | |
17628 palignr m5, m3, m2, 4 | |
17629 movu [r0 + r3 + 32], m5 | |
17630 palignr m5, m4, m3, 4 | |
17631 movu [r0 + r3 + 48], m5 | |
; row 3
17632 palignr m5, m1, m0, 6 | |
17633 movu [r0 + r4], m5 | |
17634 palignr m5, m2, m1, 6 | |
17635 movu [r0 + r4 + 16], m5 | |
17636 palignr m5, m3, m2, 6 | |
17637 movu [r0 + r4 + 32], m5 | |
17638 palignr m5, m4, m3, 6 | |
17639 movu [r0 + r4 + 48], m5 | |
17640 lea r0, [r0 + r1 * 4] | |
; row 4
17641 palignr m5, m1, m0, 8 | |
17642 movu [r0], m5 | |
17643 palignr m5, m2, m1, 8 | |
17644 movu [r0 + 16], m5 | |
17645 palignr m5, m3, m2, 8 | |
17646 movu [r0 + 32], m5 | |
17647 palignr m5, m4, m3, 8 | |
17648 movu [r0 + 48], m5 | |
; row 5
17649 palignr m5, m1, m0, 10 | |
17650 movu [r0 + r1], m5 | |
17651 palignr m5, m2, m1, 10 | |
17652 movu [r0 + r1 + 16], m5 | |
17653 palignr m5, m3, m2, 10 | |
17654 movu [r0 + r1 + 32], m5 | |
17655 palignr m5, m4, m3, 10 | |
17656 movu [r0 + r1 + 48], m5 | |
; row 6
17657 palignr m5, m1, m0, 12 | |
17658 movu [r0 + r3], m5 | |
17659 palignr m5, m2, m1, 12 | |
17660 movu [r0 + r3 + 16], m5 | |
17661 palignr m5, m3, m2, 12 | |
17662 movu [r0 + r3 + 32], m5 | |
17663 palignr m5, m4, m3, 12 | |
17664 movu [r0 + r3 + 48], m5 | |
; row 7
17665 palignr m5, m1, m0, 14 | |
17666 movu [r0 + r4], m5 | |
17667 palignr m5, m2, m1, 14 | |
17668 movu [r0 + r4 + 16], m5 | |
17669 palignr m5, m3, m2, 14 | |
17670 movu [r0 + r4 + 32], m5 | |
17671 palignr m5, m4, m3, 14 | |
17672 movu [r0 + r4 + 48], m5 | |
17673 lea r0, [r0 + r1 * 4] | |
; m0 is no longer needed as the lowest vector; reuse it for the next 16 source bytes
; so that m1, m2, m3, m4, m0 form the window for rows 8-15
17674 movu m0, [r2 + 84] | |
; row 8
17675 movu [r0], m1 | |
17676 movu [r0 + 16], m2 | |
17677 movu [r0 + 32], m3 | |
17678 movu [r0 + 48], m4 | |
; row 9
17679 palignr m5, m2, m1, 2 | |
17680 movu [r0 + r1], m5 | |
17681 palignr m5, m3, m2, 2 | |
17682 movu [r0 + r1 + 16], m5 | |
17683 palignr m5, m4, m3, 2 | |
17684 movu [r0 + r1 + 32], m5 | |
17685 palignr m5, m0, m4, 2 | |
17686 movu [r0 + r1 + 48], m5 | |
; row 10
17687 palignr m5, m2, m1, 4 | |
17688 movu [r0 + r3], m5 | |
17689 palignr m5, m3, m2, 4 | |
17690 movu [r0 + r3 + 16], m5 | |
17691 palignr m5, m4, m3, 4 | |
17692 movu [r0 + r3 + 32], m5 | |
17693 palignr m5, m0, m4, 4 | |
17694 movu [r0 + r3 + 48], m5 | |
; row 11
17695 palignr m5, m2, m1, 6 | |
17696 movu [r0 + r4], m5 | |
17697 palignr m5, m3, m2, 6 | |
17698 movu [r0 + r4 + 16], m5 | |
17699 palignr m5, m4, m3, 6 | |
17700 movu [r0 + r4 + 32], m5 | |
17701 palignr m5, m0, m4, 6 | |
17702 movu [r0 + r4 + 48], m5 | |
17703 lea r0, [r0 + r1 * 4] | |
; row 12
17704 palignr m5, m2, m1, 8 | |
17705 movu [r0], m5 | |
17706 palignr m5, m3, m2, 8 | |
17707 movu [r0 + 16], m5 | |
17708 palignr m5, m4, m3, 8 | |
17709 movu [r0 + 32], m5 | |
17710 palignr m5, m0, m4, 8 | |
17711 movu [r0 + 48], m5 | |
; row 13
17712 palignr m5, m2, m1, 10 | |
17713 movu [r0 + r1], m5 | |
17714 palignr m5, m3, m2, 10 | |
17715 movu [r0 + r1 + 16], m5 | |
17716 palignr m5, m4, m3, 10 | |
17717 movu [r0 + r1 + 32], m5 | |
17718 palignr m5, m0, m4, 10 | |
17719 movu [r0 + r1 + 48], m5 | |
; row 14
17720 palignr m5, m2, m1, 12 | |
17721 movu [r0 + r3], m5 | |
17722 palignr m5, m3, m2, 12 | |
17723 movu [r0 + r3 + 16], m5 | |
17724 palignr m5, m4, m3, 12 | |
17725 movu [r0 + r3 + 32], m5 | |
17726 palignr m5, m0, m4, 12 | |
17727 movu [r0 + r3 + 48], m5 | |
; row 15
17728 palignr m5, m2, m1, 14 | |
17729 movu [r0 + r4], m5 | |
17730 palignr m5, m3, m2, 14 | |
17731 movu [r0 + r4 + 16], m5 | |
17732 palignr m5, m4, m3, 14 | |
17733 movu [r0 + r4 + 32], m5 | |
17734 palignr m5, m0, m4, 14 | |
17735 movu [r0 + r4 + 48], m5 | |
17736 lea r0, [r0 + r1 * 4] | |
17737 %endmacro | |
17738 | |
; TRANSPOSE_STORE_8x8 offset, transpose_flag, v0, v1, v2, v3
;   %1      - byte offset added to every store address (transpose path only)
;   %2      - 1 selects the transpose-and-store path; any other value selects the
;             plain store path
;   %3..%6  - four vector registers, each holding eight 16-bit results
; Transpose path (%2 == 1): interleaves the four inputs with word and then qword
;   unpacks and writes four 16-byte rows at r0, r0 + r1, r0 + 2*r1 and r0 + r5, each
;   plus %1. m0 is used as scratch, %3..%6 are overwritten, and r0 is left unchanged.
; Store path: writes the low and high 8-byte halves of %3..%6 to eight consecutive
;   rows and advances r0 by 8 * r1; %1 is not used in this path.
; r5 addresses the fourth row of each group, so the invoker presumably sets
; r5 = 3 * r1 (not visible here).
17739 %macro TRANSPOSE_STORE_8x8 6 | |
17740 %if %2 == 1 | |
17741 ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32 | |
17742 punpckhwd m0, %3, %4 | |
17743 punpcklwd %3, %4 | |
17744 punpckhwd %4, %3, m0 | |
17745 punpcklwd %3, m0 | |
17746 | |
17747 punpckhwd m0, %5, %6 | |
17748 punpcklwd %5, %6 | |
17749 punpckhwd %6, %5, m0 | |
17750 punpcklwd %5, m0 | |
17751 | |
17752 punpckhqdq m0, %3, %5 | |
17753 punpcklqdq %3, %5 | |
17754 punpcklqdq %5, %4, %6 | |
17755 punpckhqdq %4, %6 | |
17756 | |
17757 movu [r0 + %1], %3 | |
17758 movu [r0 + r1 + %1], m0 | |
17759 movu [r0 + r1 * 2 + %1], %5 | |
17760 movu [r0 + r5 + %1], %4 | |
17761 %else | |
17762 ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32 | |
17763 movh [r0], %3 | |
17764 movhps [r0 + r1], %3 | |
17765 movh [r0 + r1 * 2], %4 | |
17766 movhps [r0 + r5], %4 | |
17767 lea r0, [r0 + r1 * 4] | |
17768 movh [r0], %5 | |
17769 movhps [r0 + r1], %5 | |
17770 movh [r0 + r1 * 2], %6 | |
17771 movhps [r0 + r5], %6 | |
17772 lea r0, [r0 + r1 * 4] | |
17773 %endif | |
17774 %endmacro | |
17775 | |
; MODE_3_33 transpose_flag
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8.
; Register use as written here:
;   r2 - source reference samples (16-bit words; reads from [r2 + 2] up to [r2 + 61])
;   r3 - weight-table pointer with 16-byte entries; going by the bracketed numbers in
;        the comments, [r3 + k * 16] is entry 16 + k, so the invoker presumably passes
;        the table base + 16 * 16 (not visible here)
;   m0-m7 are clobbered; r0/r1/r5 are used inside TRANSPOSE_STORE_8x8.
; Each weighted result is pmaddwd(sample pairs, table entry) + [pd_16], shifted right
; by 5; two such results are packed into one register with packusdw. Four groups of
; eight results are emitted, at store offsets 0, 16, 32 and 48.
; The lane-content comments list words from the highest lane to the lowest.
17776 %macro MODE_3_33 1 | |
17777 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
17778 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
17779 mova m7, m0 | |
17780 | |
17781 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] | |
17782 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2 | |
17783 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0 | |
17784 | |
17785 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1 | |
17786 pmaddwd m4, m0, [r3 + 10 * 16] ; [26] | |
17787 paddd m4, [pd_16] | |
17788 psrld m4, 5 | |
17789 | |
17790 pmaddwd m5, m1, [r3 + 4 * 16] ; [20] | |
17791 paddd m5, [pd_16] | |
17792 psrld m5, 5 | |
17793 packusdw m4, m5 | |
17794 | |
17795 palignr m5, m2, m0, 8 | |
17796 pmaddwd m5, [r3 - 2 * 16] ; [14] | |
17797 paddd m5, [pd_16] | |
17798 psrld m5, 5 | |
17799 | |
17800 palignr m6, m2, m0, 12 | |
17801 pmaddwd m6, [r3 - 8 * 16] ; [ 8] | |
17802 paddd m6, [pd_16] | |
17803 psrld m6, 5 | |
17804 packusdw m5, m6 | |
17805 | |
17806 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2] | |
17807 paddd m6, [pd_16] | |
17808 psrld m6, 5 | |
17809 | |
17810 pmaddwd m1, m2, [r3 + 12 * 16] ; [28] | |
17811 paddd m1, [pd_16] | |
17812 psrld m1, 5 | |
17813 packusdw m6, m1 | |
17814 | |
17815 palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6] | |
17816 pmaddwd m1, m0, [r3 + 6 * 16] ; [22] | |
17817 paddd m1, [pd_16] | |
17818 psrld m1, 5 | |
17819 | |
17820 psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10] | |
17821 palignr m2, m0, 4 ;[11 10 10 9 9 8 8 7] | |
17822 | |
17823 pmaddwd m2, [r3] ; [16] | |
17824 paddd m2, [pd_16] | |
17825 psrld m2, 5 | |
17826 packusdw m1, m2 | |
17827 | |
17828 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
17829 | |
17830 palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8] | |
17831 movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16] | |
17832 palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9] | |
17833 punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12] | |
17834 punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8] | |
17835 | |
17836 palignr m5, m7, m0, 4 ; [13 12 12 11 11 10 10 9] | |
17837 pmaddwd m4, m0, [r3 - 6 * 16] ; [10] | |
17838 paddd m4, [pd_16] | |
17839 psrld m4, 5 | |
17840 | |
17841 pmaddwd m1, m5, [r3 - 12 * 16] ; [04] | |
17842 paddd m1, [pd_16] | |
17843 psrld m1, 5 | |
17844 packusdw m4, m1 | |
17845 | |
17846 pmaddwd m5, [r3 + 14 * 16] ; [30] | |
17847 paddd m5, [pd_16] | |
17848 psrld m5, 5 | |
17849 | |
17850 palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10] | |
17851 pmaddwd m6, [r3 + 8 * 16] ; [24] | |
17852 paddd m6, [pd_16] | |
17853 psrld m6, 5 | |
17854 packusdw m5, m6 | |
17855 | |
17856 palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11] | |
17857 pmaddwd m6, m1, [r3 + 2 * 16] ; [18] | |
17858 paddd m6, [pd_16] | |
17859 psrld m6, 5 | |
17860 | |
17861 pmaddwd m1, m7, [r3 - 4 * 16] ; [12] | |
17862 paddd m1, [pd_16] | |
17863 psrld m1, 5 | |
17864 packusdw m6, m1 | |
17865 | |
17866 palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13] | |
17867 pmaddwd m1, m2, [r3 - 10 * 16] ; [6] | |
17868 paddd m1, [pd_16] | |
17869 psrld m1, 5 | |
17870 | |
; the last result of this group has weight entry [00]: the high half of m1 is loaded
; with four unmodified source words instead of a pmaddwd result
17871 packusdw m1, m1 | |
17872 movhps m1, [r2 + 28] ; [00] | |
17873 | |
17874 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
17875 | |
; from here on the lane labels restart at 28 for the word at [r2 + 28]
17876 movu m0, [r2 + 28] ; [35 34 33 32 31 30 29 28] | |
17877 palignr m1, m0, 2 ; [ x 35 34 33 32 31 30 29] | |
17878 punpckhwd m2, m0, m1 ; [ x 35 35 34 34 33 33 32] | |
17879 punpcklwd m0, m1 ; [32 31 31 30 30 29 29 28] | |
17880 | |
17881 pmaddwd m4, m0, [r3 + 10 * 16] ; [26] | |
17882 paddd m4, [pd_16] | |
17883 psrld m4, 5 | |
17884 | |
17885 palignr m1, m2, m0, 4 ; [33 32 32 31 31 30 30 29] | |
17886 pmaddwd m1, [r3 + 4 * 16] ; [20] | |
17887 paddd m1, [pd_16] | |
17888 psrld m1, 5 | |
17889 packusdw m4, m1 | |
17890 | |
17891 palignr m5, m2, m0, 8 ; [34 33 33 32 32 31 31 30] | |
17892 pmaddwd m5, [r3 - 2 * 16] ; [14] | |
17893 paddd m5, [pd_16] | |
17894 psrld m5, 5 | |
17895 | |
17896 palignr m6, m2, m0, 12 ; [35 34 34 33 33 32 32 31] | |
17897 pmaddwd m6, [r3 - 8 * 16] ; [ 8] | |
17898 paddd m6, [pd_16] | |
17899 psrld m6, 5 | |
17900 packusdw m5, m6 | |
17901 | |
17902 pinsrw m2, [r2 + 44], 7 ; [36 35 35 34 34 33 33 32] (fills the undefined top word) | |
17903 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2] | |
17904 paddd m6, [pd_16] | |
17905 psrld m6, 5 | |
17906 | |
17907 pmaddwd m2, [r3 + 12 * 16] ; [28] | |
17908 paddd m2, [pd_16] | |
17909 psrld m2, 5 | |
17910 packusdw m6, m2 | |
17911 | |
17912 movu m3, [r2 + 38] ; [45 44 43 42 41 40 39 38] | |
17913 palignr m1, m3, 2 ; [ x 45 44 43 42 41 40 39] | |
17914 punpckhwd m2, m3, m1 ; [ x 45 45 44 44 43 43 42] | |
17915 punpcklwd m3, m1 ; [42 41 41 40 40 39 39 38] | |
17916 | |
17917 pmaddwd m1, m3, [r3 + 6 * 16] ; [22] | |
17918 paddd m1, [pd_16] | |
17919 psrld m1, 5 | |
17920 | |
17921 palignr m0, m2, m3, 4 | |
17922 pmaddwd m0, [r3] ; [16] | |
17923 paddd m0, [pd_16] | |
17924 psrld m0, 5 | |
17925 packusdw m1, m0 | |
17926 | |
17927 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
17928 | |
17929 palignr m5, m2, m3, 8 | |
17930 pmaddwd m4, m5, [r3 - 6 * 16] ; [10] | |
17931 paddd m4, [pd_16] | |
17932 psrld m4, 5 | |
17933 | |
17934 palignr m5, m2, m3, 12 | |
17935 pmaddwd m1, m5, [r3 - 12 * 16] ; [04] | |
17936 paddd m1, [pd_16] | |
17937 psrld m1, 5 | |
17938 packusdw m4, m1 | |
17939 | |
17940 pmaddwd m5, [r3 + 14 * 16] ; [30] | |
17941 paddd m5, [pd_16] | |
17942 psrld m5, 5 | |
17943 | |
17944 movu m3, [r2 + 46] | |
17945 palignr m1, m3, 2 | |
17946 punpckhwd m2, m3, m1 | |
17947 punpcklwd m3, m1 | |
17948 | |
17949 pmaddwd m6, m3, [r3 + 8 * 16] ; [24] | |
17950 paddd m6, [pd_16] | |
17951 psrld m6, 5 | |
17952 packusdw m5, m6 | |
17953 | |
17954 palignr m6, m2, m3, 4 | |
17955 pmaddwd m6, [r3 + 2 * 16] ; [18] | |
17956 paddd m6, [pd_16] | |
17957 psrld m6, 5 | |
17958 | |
17959 palignr m1, m2, m3, 8 | |
17960 pmaddwd m1, [r3 - 4 * 16] ; [12] | |
17961 paddd m1, [pd_16] | |
17962 psrld m1, 5 | |
17963 packusdw m6, m1 | |
17964 | |
17965 palignr m1, m2, m3, 12 | |
17966 pmaddwd m1, [r3 - 10 * 16] ; [06] | |
17967 paddd m1, [pd_16] | |
17968 psrld m1, 5 | |
17969 | |
; as in the second group, the final [00] result is a direct copy of four source words
17970 packusdw m1, m1 | |
17971 movhps m1, [r2 + 54] ; [00] | |
17972 | |
17973 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
17974 %endmacro | |
17975 | |
; MODE_4_32 transpose_flag
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8.
; Register use as written here:
;   r2 - source reference samples (16-bit words; reads from [r2 + 2] up to [r2 + 65])
;   r3 - weight-table pointer with 16-byte entries; going by the bracketed numbers in
;        the comments, [r3 + k * 16] is entry 16 + k, so the invoker presumably passes
;        the table base + 16 * 16 (not visible here)
;   m0-m7 are clobbered; r0/r1/r5 are used inside TRANSPOSE_STORE_8x8.
; Each weighted result is pmaddwd(sample pairs, table entry) + [pd_16], shifted right
; by 5; two such results are packed into one register with packusdw. Four groups of
; eight results are emitted, at store offsets 0, 16, 32 and 48.
; The lane-content comments list words from the highest lane to the lowest.
17976 %macro MODE_4_32 1 | |
17977 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
17978 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
17979 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] | |
17980 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] | |
17981 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] | |
17982 | |
17983 pmaddwd m4, m0, [r3 + 5 * 16] ; [21] | |
17984 paddd m4, [pd_16] | |
17985 psrld m4, 5 | |
17986 | |
17987 palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2] | |
17988 pmaddwd m1, m5, [r3 - 6 * 16] ; [10] | |
17989 paddd m1, [pd_16] | |
17990 psrld m1, 5 | |
17991 packusdw m4, m1 | |
17992 | |
17993 pmaddwd m5, [r3 + 15 * 16] ; [31] | |
17994 paddd m5, [pd_16] | |
17995 psrld m5, 5 | |
17996 | |
17997 palignr m6, m2, m0, 8 | |
17998 pmaddwd m6, [r3 + 4 * 16] ; [ 20] | |
17999 paddd m6, [pd_16] | |
18000 psrld m6, 5 | |
18001 packusdw m5, m6 | |
18002 | |
18003 palignr m1, m2, m0, 12 | |
18004 pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9] | |
18005 paddd m6, [pd_16] | |
18006 psrld m6, 5 | |
18007 | |
18008 pmaddwd m1, [r3 + 14 * 16] ; [30] | |
18009 paddd m1, [pd_16] | |
18010 psrld m1, 5 | |
18011 packusdw m6, m1 | |
18012 | |
18013 pmaddwd m1, m2, [r3 + 3 * 16] ; [19] | |
18014 paddd m1, [pd_16] | |
18015 psrld m1, 5 | |
18016 | |
18017 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6] | |
18018 pmaddwd m0, m7, [r3 - 8 * 16] ; [8] | |
18019 paddd m0, [pd_16] | |
18020 psrld m0, 5 | |
18021 packusdw m1, m0 | |
18022 | |
18023 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
18024 | |
18025 pmaddwd m4, m7, [r3 + 13 * 16] ; [29] | |
18026 paddd m4, [pd_16] | |
18027 psrld m4, 5 | |
18028 | |
18029 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17] | |
18030 | |
18031 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10] | |
18032 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11] | |
18033 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14] | |
18034 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10] | |
18035 | |
18036 palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 8 7] | |
18037 pmaddwd m1, [r3 + 2 * 16] ; [18] | |
18038 paddd m1, [pd_16] | |
18039 psrld m1, 5 | |
18040 packusdw m4, m1 | |
18041 | |
18042 palignr m5, m2, m7, 8 | |
18043 mova m6, m5 | |
18044 pmaddwd m5, [r3 - 9 * 16] ; [07] | |
18045 paddd m5, [pd_16] | |
18046 psrld m5, 5 | |
18047 | |
18048 pmaddwd m6, [r3 + 12 * 16] ; [28] | |
18049 paddd m6, [pd_16] | |
18050 psrld m6, 5 | |
18051 packusdw m5, m6 | |
18052 | |
18053 palignr m6, m2, m7, 12 | |
18054 pmaddwd m6, [r3 + 16] ; [17] | |
18055 paddd m6, [pd_16] | |
18056 psrld m6, 5 | |
18057 | |
18058 pmaddwd m1, m2, [r3 - 10 * 16] ; [06] | |
18059 paddd m1, [pd_16] | |
18060 psrld m1, 5 | |
18061 packusdw m6, m1 | |
18062 | |
18063 pmaddwd m1, m2, [r3 + 11 * 16] ; [27] | |
18064 paddd m1, [pd_16] | |
18065 psrld m1, 5 | |
18066 | |
18067 palignr m7, m3, m2, 4 | |
18068 pmaddwd m7, [r3] ; [16] | |
18069 paddd m7, [pd_16] | |
18070 psrld m7, 5 | |
18071 packusdw m1, m7 | |
; keep the raw words loaded from [r2 + 34] in m7: m0 is scratch inside the store
; macro's transpose path and is overwritten right after it, and m7 is the lower
; source of the palignr pair further down
18072 mova m7, m0 | |
18073 | |
18074 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
18075 | |
18076 palignr m0, m3, m2, 8 | |
18077 pmaddwd m4, m0, [r3 - 11 * 16] ; [5] | |
18078 paddd m4, [pd_16] | |
18079 psrld m4, 5 | |
18080 | |
18081 pmaddwd m1, m0, [r3 + 10 * 16] ; [26] | |
18082 paddd m1, [pd_16] | |
18083 psrld m1, 5 | |
18084 packusdw m4, m1 | |
18085 | |
18086 palignr m5, m3, m2, 12 | |
18087 pmaddwd m5, [r3 - 16] ; [15] | |
18088 paddd m5, [pd_16] | |
18089 psrld m5, 5 | |
18090 | |
18091 pmaddwd m1, m3, [r3 - 12 * 16] ; [4] | |
18092 paddd m1, [pd_16] | |
18093 psrld m1, 5 | |
18094 packusdw m5, m1 | |
18095 | |
18096 pmaddwd m6, m3, [r3 + 9 * 16] ; [25] | |
18097 paddd m6, [pd_16] | |
18098 psrld m6, 5 | |
18099 | |
18100 movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25] | |
18101 palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18] | |
18102 palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19] | |
18103 punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22] | |
18104 punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18] | |
18105 | |
18106 palignr m1, m2, m3, 4 | |
18107 pmaddwd m1, [r3 - 2 * 16] ; [14] | |
18108 paddd m1, [pd_16] | |
18109 psrld m1, 5 | |
18110 packusdw m6, m1 | |
18111 | |
18112 palignr m1, m2, m3, 8 | |
18113 mova m0, m1 | |
18114 pmaddwd m1, [r3 - 13 * 16] ; [3] | |
18115 paddd m1, [pd_16] | |
18116 psrld m1, 5 | |
18117 | |
18118 pmaddwd m0, [r3 + 8 * 16] ; [24] | |
18119 paddd m0, [pd_16] | |
18120 psrld m0, 5 | |
18121 packusdw m1, m0 | |
18122 | |
18123 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
18124 | |
18125 palignr m4, m2, m3, 12 | |
18126 pmaddwd m4, [r3 - 3 * 16] ; [13] | |
18127 paddd m4, [pd_16] | |
18128 psrld m4, 5 | |
18129 | |
18130 pmaddwd m1, m2, [r3 - 14 * 16] ; [2] | |
18131 paddd m1, [pd_16] | |
18132 psrld m1, 5 | |
18133 packusdw m4, m1 | |
18134 | |
18135 pmaddwd m5, m2, [r3 + 7 * 16] ; [23] | |
18136 paddd m5, [pd_16] | |
18137 psrld m5, 5 | |
18138 | |
18139 palignr m6, m7, m2, 4 | |
18140 pmaddwd m6, [r3 - 4 * 16] ; [12] | |
18141 paddd m6, [pd_16] | |
18142 psrld m6, 5 | |
18143 packusdw m5, m6 | |
18144 | |
18145 palignr m1, m7, m2, 8 | |
18146 pmaddwd m6, m1, [r3 - 15 * 16] ; [1] | |
18147 paddd m6, [pd_16] | |
18148 psrld m6, 5 | |
18149 | |
18150 pmaddwd m1, [r3 + 6 * 16] ; [22] | |
18151 paddd m1, [pd_16] | |
18152 psrld m1, 5 | |
18153 packusdw m6, m1 | |
18154 | |
18155 palignr m1, m7, m2, 12 | |
18156 pmaddwd m1, [r3 - 5 * 16] ; [11] | |
18157 paddd m1, [pd_16] | |
18158 psrld m1, 5 | |
; the final result has weight entry [00]: the high half of m1 is loaded with four
; unmodified source words instead of a pmaddwd result
18159 packusdw m1, m1 | |
18160 movhps m1, [r2 + 44] ; [00] | |
18161 | |
18162 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
18163 %endmacro | |
18164 | |
18165 %macro MODE_5_31 1 | |
18166 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
18167 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
18168 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] | |
18169 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] | |
18170 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] | |
18171 | |
18172 pmaddwd m4, m0, [r3 + 16] ; [17] | |
18173 paddd m4, [pd_16] | |
18174 psrld m4, 5 | |
18175 | |
18176 palignr m1, m2, m0, 4 | |
18177 mova m5, m1 | |
18178 pmaddwd m1, [r3 - 14 * 16] ; [2] | |
18179 paddd m1, [pd_16] | |
18180 psrld m1, 5 | |
18181 packusdw m4, m1 | |
18182 | |
18183 pmaddwd m5, [r3 + 3 * 16] ; [19] | |
18184 paddd m5, [pd_16] | |
18185 psrld m5, 5 | |
18186 | |
18187 palignr m6, m2, m0, 8 | |
18188 mova m1, m6 | |
18189 pmaddwd m6, [r3 - 12 * 16] ; [4] | |
18190 paddd m6, [pd_16] | |
18191 psrld m6, 5 | |
18192 packusdw m5, m6 | |
18193 | |
18194 pmaddwd m6, m1, [r3 + 5 * 16] ; [21] | |
18195 paddd m6, [pd_16] | |
18196 psrld m6, 5 | |
18197 | |
18198 palignr m1, m2, m0, 12 | |
18199 mova m7, m1 | |
18200 pmaddwd m7, [r3 - 10 * 16] ; [6] | |
18201 paddd m7, [pd_16] | |
18202 psrld m7, 5 | |
18203 packusdw m6, m7 | |
18204 | |
18205 pmaddwd m1, [r3 + 7 * 16] ; [23] | |
18206 paddd m1, [pd_16] | |
18207 psrld m1, 5 | |
18208 | |
18209 pmaddwd m7, m2, [r3 - 8 * 16] ; [8] | |
18210 paddd m7, [pd_16] | |
18211 psrld m7, 5 | |
18212 packusdw m1, m7 | |
18213 | |
18214 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
18215 | |
18216 pmaddwd m4, m2, [r3 + 9 * 16] ; [25] | |
18217 paddd m4, [pd_16] | |
18218 psrld m4, 5 | |
18219 | |
18220 palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4] | |
18221 pmaddwd m1, m7, [r3 - 6 * 16] ; [10] | |
18222 paddd m1, [pd_16] | |
18223 psrld m1, 5 | |
18224 packusdw m4, m1 | |
18225 | |
18226 pmaddwd m5, m7, [r3 + 11 * 16] ; [27] | |
18227 paddd m5, [pd_16] | |
18228 psrld m5, 5 | |
18229 | |
18230 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17] | |
18231 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10] | |
18232 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11] | |
18233 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14] | |
18234 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10] | |
18235 | |
18236 palignr m6, m2, m7, 4 | |
18237 pmaddwd m1, m6, [r3 - 4 * 16] ; [12] | |
18238 paddd m1, [pd_16] | |
18239 psrld m1, 5 | |
18240 packusdw m5, m1 | |
18241 | |
18242 pmaddwd m6, [r3 + 13 * 16] ; [29] | |
18243 paddd m6, [pd_16] | |
18244 psrld m6, 5 | |
18245 | |
18246 palignr m1, m2, m7, 8 | |
18247 mova m0, m1 | |
18248 pmaddwd m1, [r3 - 2 * 16] ; [14] | |
18249 paddd m1, [pd_16] | |
18250 psrld m1, 5 | |
18251 packusdw m6, m1 | |
18252 | |
18253 pmaddwd m1, m0, [r3 + 15 * 16] ; [31] | |
18254 paddd m1, [pd_16] | |
18255 psrld m1, 5 | |
18256 | |
18257 palignr m0, m2, m7, 12 | |
18258 pmaddwd m0, [r3] ; [16] | |
18259 paddd m0, [pd_16] | |
18260 psrld m0, 5 | |
18261 packusdw m1, m0 | |
18262 | |
18263 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
18264 | |
18265 pmaddwd m4, m2, [r3 - 15 * 16] ; [1] | |
18266 paddd m4, [pd_16] | |
18267 psrld m4, 5 | |
18268 | |
18269 pmaddwd m1, m2, [r3 + 2 * 16] ; [18] | |
18270 paddd m1, [pd_16] | |
18271 psrld m1, 5 | |
18272 packusdw m4, m1 | |
18273 | |
18274 palignr m1, m3, m2, 4 | |
18275 pmaddwd m5, m1, [r3 - 13 * 16] ; [3] | |
18276 paddd m5, [pd_16] | |
18277 psrld m5, 5 | |
18278 | |
18279 pmaddwd m1, [r3 + 4 * 16] ; [20] | |
18280 paddd m1, [pd_16] | |
18281 psrld m1, 5 | |
18282 packusdw m5, m1 | |
18283 | |
18284 palignr m1, m3, m2, 8 | |
18285 pmaddwd m6, m1, [r3 - 11 * 16] ; [5] | |
18286 paddd m6, [pd_16] | |
18287 psrld m6, 5 | |
18288 | |
18289 pmaddwd m1, [r3 + 6 * 16] ; [22] | |
18290 paddd m1, [pd_16] | |
18291 psrld m1, 5 | |
18292 packusdw m6, m1 | |
18293 | |
18294 palignr m7, m3, m2, 12 | |
18295 pmaddwd m1, m7, [r3 - 9 * 16] ; [7] | |
18296 paddd m1, [pd_16] | |
18297 psrld m1, 5 | |
18298 | |
18299 pmaddwd m7, [r3 + 8 * 16] ; [24] | |
18300 paddd m7, [pd_16] | |
18301 psrld m7, 5 | |
18302 packusdw m1, m7 | |
18303 | |
18304 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
18305 | |
18306 pmaddwd m4, m3, [r3 - 7 * 16] ; [9] | |
18307 paddd m4, [pd_16] | |
18308 psrld m4, 5 | |
18309 | |
18310 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
18311 paddd m1, [pd_16] | |
18312 psrld m1, 5 | |
18313 packusdw m4, m1 | |
18314 | |
18315 movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18] | |
18316 palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19] | |
18317 punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18] | |
18318 | |
18319 palignr m1, m0, m3, 4 | |
18320 pmaddwd m5, m1, [r3 - 5 * 16] ; [11] | |
18321 paddd m5, [pd_16] | |
18322 psrld m5, 5 | |
18323 | |
18324 pmaddwd m1, [r3 + 12 * 16] ; [28] | |
18325 paddd m1, [pd_16] | |
18326 psrld m1, 5 | |
18327 packusdw m5, m1 | |
18328 | |
18329 palignr m1, m0, m3, 8 | |
18330 pmaddwd m6, m1, [r3 - 3 * 16] ; [13] | |
18331 paddd m6, [pd_16] | |
18332 psrld m6, 5 | |
18333 | |
18334 pmaddwd m1, [r3 + 14 * 16] ; [30] | |
18335 paddd m1, [pd_16] | |
18336 psrld m1, 5 | |
18337 packusdw m6, m1 | |
18338 | |
18339 palignr m1, m0, m3, 12 | |
18340 pmaddwd m1, [r3 - 16] ; [15] | |
18341 paddd m1, [pd_16] | |
18342 psrld m1, 5 | |
18343 packusdw m1, m1 | |
18344 movhps m1, [r2 + 36] ; [00] | |
18345 | |
18346 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
18347 %endmacro | |
18348 | |
;-----------------------------------------------------------------------------
; MODE_6_30 <arg>
; Straight-line code producing 32 rows of 4 interpolated 16-bit samples,
; handed to TRANSPOSE_STORE_8x8 in four groups of 8 rows.
; %1 is not interpreted here; it is only forwarded as the second argument of
; every TRANSPOSE_STORE_8x8 invocation.
; In:     r2 = reference samples, 16-bit words (bytes r2+2 .. r2+49 are read)
;         r3 = weight-table pointer, biased so that [r3 + k*16] is the entry
;              for fraction 16+k (the bracketed number on each pmaddwd)
; Uses:   m0-m7 are all overwritten
; Per row: pmaddwd of neighbouring-sample pairs with the weights, + [pd_16]
;          (presumably the rounding term - confirm at its definition), >> 5.
; The fraction advances by 13 modulo 32 per row (13, 26, 7, 20, ...); each
; time it wraps, the sample pairs are advanced by one sample (4 bytes).
;-----------------------------------------------------------------------------
18349 %macro MODE_6_30 1 | |
18350 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
18351 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
18352 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] | |
18353 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] | |
18354 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] | |
18355 | |
18356 pmaddwd m4, m0, [r3 - 3 * 16] ; [13] | |
18357 paddd m4, [pd_16] | |
18358 psrld m4, 5 | |
18359 | |
18360 pmaddwd m1, m0, [r3 + 10 * 16] ; [26] | |
18361 paddd m1, [pd_16] | |
18362 psrld m1, 5 | |
18363 packusdw m4, m1 | |
18364 | |
18365 palignr m1, m2, m0, 4 | |
18366 pmaddwd m5, m1, [r3 - 9 * 16] ; [7] | |
18367 paddd m5, [pd_16] | |
18368 psrld m5, 5 | |
18369 | |
18370 pmaddwd m1, [r3 + 4 * 16] ; [20] | |
18371 paddd m1, [pd_16] | |
18372 psrld m1, 5 | |
18373 packusdw m5, m1 | |
18374 | |
18375 palignr m1, m2, m0, 8 | |
18376 pmaddwd m6, m1, [r3 - 15 * 16] ; [1] | |
18377 paddd m6, [pd_16] | |
18378 psrld m6, 5 | |
18379 | |
18380 pmaddwd m7, m1, [r3 - 2 * 16] ; [14] | |
18381 paddd m7, [pd_16] | |
18382 psrld m7, 5 | |
18383 packusdw m6, m7 | |
18384 | |
18385 pmaddwd m1, [r3 + 11 * 16] ; [27] | |
18386 paddd m1, [pd_16] | |
18387 psrld m1, 5 | |
18388 | |
; m7 keeps the pairs shifted by three samples; it is reused for row [21]
; after the first store.
18389 palignr m7, m2, m0, 12 | |
18390 pmaddwd m0, m7, [r3 - 8 * 16] ; [8] | |
18391 paddd m0, [pd_16] | |
18392 psrld m0, 5 | |
18393 packusdw m1, m0 | |
18394 | |
; rows 0-7 complete in m4, m5, m6, m1 (two rows per register)
18395 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
18396 | |
18397 pmaddwd m4, m7, [r3 + 5 * 16] ; [21] | |
18398 paddd m4, [pd_16] | |
18399 psrld m4, 5 | |
18400 | |
18401 pmaddwd m1, m2, [r3 - 14 * 16] ; [2] | |
18402 paddd m1, [pd_16] | |
18403 psrld m1, 5 | |
18404 packusdw m4, m1 | |
18405 | |
18406 pmaddwd m5, m2, [r3 - 16] ; [15] | |
18407 paddd m5, [pd_16] | |
18408 psrld m5, 5 | |
18409 | |
18410 pmaddwd m6, m2, [r3 + 12 * 16] ; [28] | |
18411 paddd m6, [pd_16] | |
18412 psrld m6, 5 | |
18413 packusdw m5, m6 | |
18414 | |
; m3 is still the raw (non-interleaved) load [16..9]: its two lowest words
; (9, 10) supply the last pair of the shifted result.
18415 palignr m7, m3, m2, 4 | |
18416 pmaddwd m6, m7, [r3 - 7 * 16] ; [9] | |
18417 paddd m6, [pd_16] | |
18418 psrld m6, 5 | |
18419 | |
18420 pmaddwd m1, m7, [r3 + 6 * 16] ; [22] | |
18421 paddd m1, [pd_16] | |
18422 psrld m1, 5 | |
18423 packusdw m6, m1 | |
18424 | |
; rebuild m2/m3 as interleaved neighbour pairs for samples 10..18
18425 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17] | |
18426 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10] | |
18427 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11] | |
18428 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14] | |
18429 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10] | |
18430 | |
18431 palignr m0, m2, m7, 4 | |
18432 pmaddwd m1, m0, [r3 - 13 * 16] ; [3] | |
18433 paddd m1, [pd_16] | |
18434 psrld m1, 5 | |
18435 | |
18436 pmaddwd m0, [r3] ; [16] | |
18437 paddd m0, [pd_16] | |
18438 psrld m0, 5 | |
18439 packusdw m1, m0 | |
18440 | |
; rows 8-15
18441 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
18442 | |
18443 palignr m4, m2, m7, 4 | |
18444 pmaddwd m4, [r3 + 13 * 16] ; [29] | |
18445 paddd m4, [pd_16] | |
18446 psrld m4, 5 | |
18447 | |
18448 palignr m5, m2, m7, 8 | |
18449 pmaddwd m1, m5, [r3 - 6 * 16] ; [10] | |
18450 paddd m1, [pd_16] | |
18451 psrld m1, 5 | |
18452 packusdw m4, m1 | |
18453 | |
18454 pmaddwd m5, [r3 + 7 * 16] ; [23] | |
18455 paddd m5, [pd_16] | |
18456 psrld m5, 5 | |
18457 | |
18458 palignr m1, m2, m7, 12 | |
18459 pmaddwd m6, m1, [r3 - 12 * 16] ; [4] | |
18460 paddd m6, [pd_16] | |
18461 psrld m6, 5 | |
18462 packusdw m5, m6 | |
18463 | |
18464 pmaddwd m6, m1, [r3 + 16] ; [17] | |
18465 paddd m6, [pd_16] | |
18466 psrld m6, 5 | |
18467 | |
18468 pmaddwd m1, [r3 + 14 * 16] ; [30] | |
18469 paddd m1, [pd_16] | |
18470 psrld m1, 5 | |
18471 packusdw m6, m1 | |
18472 | |
18473 pmaddwd m1, m2, [r3 - 5 * 16] ; [11] | |
18474 paddd m1, [pd_16] | |
18475 psrld m1, 5 | |
18476 | |
18477 pmaddwd m0, m2, [r3 + 8 * 16] ; [24] | |
18478 paddd m0, [pd_16] | |
18479 psrld m0, 5 | |
18480 packusdw m1, m0 | |
18481 | |
; rows 16-23
18482 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
18483 | |
18484 palignr m5, m3, m2, 4 | |
18485 pmaddwd m4, m5, [r3 - 11 * 16] ; [5] | |
18486 paddd m4, [pd_16] | |
18487 psrld m4, 5 | |
18488 | |
18489 pmaddwd m1, m5, [r3 + 2 * 16] ; [18] | |
18490 paddd m1, [pd_16] | |
18491 psrld m1, 5 | |
18492 packusdw m4, m1 | |
18493 | |
18494 pmaddwd m5, [r3 + 15 * 16] ; [31] | |
18495 paddd m5, [pd_16] | |
18496 psrld m5, 5 | |
18497 | |
18498 palignr m6, m3, m2, 8 | |
18499 pmaddwd m1, m6, [r3 - 4 * 16] ; [12] | |
18500 paddd m1, [pd_16] | |
18501 psrld m1, 5 | |
18502 packusdw m5, m1 | |
18503 | |
18504 pmaddwd m6, [r3 + 9 * 16] ; [25] | |
18505 paddd m6, [pd_16] | |
18506 psrld m6, 5 | |
18507 | |
18508 palignr m1, m3, m2, 12 | |
18509 pmaddwd m0, m1, [r3 - 10 * 16] ; [6] | |
18510 paddd m0, [pd_16] | |
18511 psrld m0, 5 | |
18512 packusdw m6, m0 | |
18513 | |
18514 pmaddwd m1, [r3 + 3 * 16] ; [19] | |
18515 paddd m1, [pd_16] | |
18516 psrld m1, 5 | |
; Last row has fraction 0, so no interpolation is needed: pack row [19] into
; the low half and load 4 reference words directly into the high half.
18517 packusdw m1, m1 | |
18518 movhps m1, [r2 + 28] ; [00] | |
18519 | |
; rows 24-31
18520 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
18521 %endmacro | |
18522 | |
;-----------------------------------------------------------------------------
; MODE_7_29 <arg>
; Straight-line code producing 32 rows of 4 interpolated 16-bit samples,
; handed to TRANSPOSE_STORE_8x8 in four groups of 8 rows.
; %1 is not interpreted here; it is only forwarded as the second argument of
; every TRANSPOSE_STORE_8x8 invocation.
; In:     r2 = reference samples, 16-bit words (bytes r2+2 .. r2+33 are read)
;         r3 = weight-table pointer, biased so that [r3 + k*16] is the entry
;              for fraction 16+k (the bracketed number on each pmaddwd)
; Uses:   m0-m7 are all overwritten
; Per row: pmaddwd of neighbouring-sample pairs with the weights, + [pd_16]
;          (presumably the rounding term - confirm at its definition), >> 5.
; The fraction advances by 9 modulo 32 per row (9, 18, 27, 4, ...); each
; time it wraps, the sample pairs are advanced by one sample (4 bytes).
;-----------------------------------------------------------------------------
18523 %macro MODE_7_29 1 | |
18524 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
18525 movd m3, [r2 + 18] ; [0 0 0 0 0 0 10 9] - movd loads only 32 bits; only word 9 is consumed below | |
18526 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] | |
18527 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] | |
18528 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] | |
18529 | |
18530 pmaddwd m4, m0, [r3 - 7 * 16] ; [9] | |
18531 paddd m4, [pd_16] | |
18532 psrld m4, 5 | |
18533 | |
18534 pmaddwd m1, m0, [r3 + 2 * 16] ; [18] | |
18535 paddd m1, [pd_16] | |
18536 psrld m1, 5 | |
18537 packusdw m4, m1 | |
18538 | |
18539 pmaddwd m5, m0, [r3 + 11 * 16] ; [27] | |
18540 paddd m5, [pd_16] | |
18541 psrld m5, 5 | |
18542 | |
18543 palignr m1, m2, m0, 4 | |
18544 pmaddwd m6, m1, [r3 - 12 * 16] ; [4] | |
18545 paddd m6, [pd_16] | |
18546 psrld m6, 5 | |
18547 packusdw m5, m6 | |
18548 | |
18549 pmaddwd m6, m1, [r3 - 3 * 16] ; [13] | |
18550 paddd m6, [pd_16] | |
18551 psrld m6, 5 | |
18552 | |
18553 pmaddwd m7, m1, [r3 + 6 * 16] ; [22] | |
18554 paddd m7, [pd_16] | |
18555 psrld m7, 5 | |
18556 packusdw m6, m7 | |
18557 | |
18558 pmaddwd m1, [r3 + 15 * 16] ; [31] | |
18559 paddd m1, [pd_16] | |
18560 psrld m1, 5 | |
18561 | |
; keep the low pairs in m3: m0 is about to be reused as a result register,
; but the palignr by 12 after the first store still needs them
18562 mova m3, m0 | |
18563 palignr m7, m2, m0, 8 | |
18564 pmaddwd m0, m7, [r3 - 8 * 16] ; [8] | |
18565 paddd m0, [pd_16] | |
18566 psrld m0, 5 | |
18567 packusdw m1, m0 | |
18568 | |
; rows 0-7 complete in m4, m5, m6, m1 (two rows per register)
18569 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
18570 | |
18571 pmaddwd m4, m7, [r3 + 16] ; [17] | |
18572 paddd m4, [pd_16] | |
18573 psrld m4, 5 | |
18574 | |
18575 pmaddwd m1, m7, [r3 + 10 * 16] ; [26] | |
18576 paddd m1, [pd_16] | |
18577 psrld m1, 5 | |
18578 packusdw m4, m1 | |
18579 | |
18580 palignr m1, m2, m3, 12 | |
18581 pmaddwd m5, m1, [r3 - 13 * 16] ; [3] | |
18582 paddd m5, [pd_16] | |
18583 psrld m5, 5 | |
18584 | |
18585 pmaddwd m6, m1, [r3 - 4 * 16] ; [12] | |
18586 paddd m6, [pd_16] | |
18587 psrld m6, 5 | |
18588 packusdw m5, m6 | |
18589 | |
18590 pmaddwd m6, m1, [r3 + 5 * 16] ; [21] | |
18591 paddd m6, [pd_16] | |
18592 psrld m6, 5 | |
18593 | |
18594 pmaddwd m1, [r3 + 14 * 16] ; [30] | |
18595 paddd m1, [pd_16] | |
18596 psrld m1, 5 | |
18597 packusdw m6, m1 | |
18598 | |
18599 pmaddwd m1, m2, [r3 - 9 * 16] ; [7] | |
18600 paddd m1, [pd_16] | |
18601 psrld m1, 5 | |
18602 | |
18603 pmaddwd m0, m2, [r3] ; [16] | |
18604 paddd m0, [pd_16] | |
18605 psrld m0, 5 | |
18606 packusdw m1, m0 | |
18607 | |
; rows 8-15
18608 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
18609 | |
18610 pmaddwd m4, m2, [r3 + 9 * 16] ; [25] | |
18611 paddd m4, [pd_16] | |
18612 psrld m4, 5 | |
18613 | |
; build the next four neighbour pairs (samples 9..13) in m7; the word marked
; 'x' comes from the previous contents of m1 and lands only in the high half,
; which punpcklwd does not read
18614 movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
18615 palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10] | |
18616 punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9] | |
18617 | |
18618 palignr m6, m7, m2, 4 | |
18619 pmaddwd m1, m6, [r3 - 14 * 16] ; [2] | |
18620 paddd m1, [pd_16] | |
18621 psrld m1, 5 | |
18622 packusdw m4, m1 | |
18623 | |
18624 pmaddwd m5, m6, [r3 - 5 * 16] ; [11] | |
18625 paddd m5, [pd_16] | |
18626 psrld m5, 5 | |
18627 | |
18628 pmaddwd m0, m6, [r3 + 4 * 16] ; [20] | |
18629 paddd m0, [pd_16] | |
18630 psrld m0, 5 | |
18631 packusdw m5, m0 | |
18632 | |
18633 pmaddwd m6, [r3 + 13 * 16] ; [29] | |
18634 paddd m6, [pd_16] | |
18635 psrld m6, 5 | |
18636 | |
18637 palignr m0, m7, m2, 8 | |
18638 pmaddwd m1, m0, [r3 - 10 * 16] ; [6] | |
18639 paddd m1, [pd_16] | |
18640 psrld m1, 5 | |
18641 packusdw m6, m1 | |
18642 | |
18643 pmaddwd m1, m0, [r3 - 16] ; [15] | |
18644 paddd m1, [pd_16] | |
18645 psrld m1, 5 | |
18646 | |
18647 pmaddwd m0, [r3 + 8 * 16] ; [24] | |
18648 paddd m0, [pd_16] | |
18649 psrld m0, 5 | |
18650 packusdw m1, m0 | |
18651 | |
; rows 16-23
18652 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
18653 | |
18654 palignr m0, m7, m2, 12 | |
18655 pmaddwd m4, m0, [r3 - 15 * 16] ; [1] | |
18656 paddd m4, [pd_16] | |
18657 psrld m4, 5 | |
18658 | |
18659 pmaddwd m1, m0, [r3 - 6 * 16] ; [10] | |
18660 paddd m1, [pd_16] | |
18661 psrld m1, 5 | |
18662 packusdw m4, m1 | |
18663 | |
18664 pmaddwd m5, m0, [r3 + 3 * 16] ; [19] | |
18665 paddd m5, [pd_16] | |
18666 psrld m5, 5 | |
18667 | |
18668 pmaddwd m0, [r3 + 12 * 16] ; [28] | |
18669 paddd m0, [pd_16] | |
18670 psrld m0, 5 | |
18671 packusdw m5, m0 | |
18672 | |
18673 pmaddwd m6, m7, [r3 - 11 * 16] ; [5] | |
18674 paddd m6, [pd_16] | |
18675 psrld m6, 5 | |
18676 | |
18677 pmaddwd m0, m7, [r3 - 2 * 16] ; [14] | |
18678 paddd m0, [pd_16] | |
18679 psrld m0, 5 | |
18680 packusdw m6, m0 | |
18681 | |
18682 pmaddwd m1, m7, [r3 + 7 * 16] ; [23] | |
18683 paddd m1, [pd_16] | |
18684 psrld m1, 5 | |
; Last row has fraction 0, so no interpolation is needed: pack row [23] into
; the low half and load 4 reference words directly into the high half.
18685 packusdw m1, m1 | |
18686 movhps m1, [r2 + 20] ; [00] | |
18687 | |
; rows 24-31
18688 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
18689 %endmacro | |
18690 | |
;-----------------------------------------------------------------------------
; MODE_8_28 <arg>
; Straight-line code producing 32 rows of 4 interpolated 16-bit samples,
; handed to TRANSPOSE_STORE_8x8 in four groups of 8 rows.
; %1 is not interpreted here; it is only forwarded as the second argument of
; every TRANSPOSE_STORE_8x8 invocation.
; In:     r2 = reference samples, 16-bit words (bytes r2+2 .. r2+21 are read)
;         r3 = weight-table pointer, biased so that [r3 + k*16] is the entry
;              for fraction 16+k (the bracketed number on each pmaddwd)
; Uses:   m0-m7 are all overwritten
; Per row: pmaddwd of neighbouring-sample pairs with the weights, + [pd_16]
;          (presumably the rounding term - confirm at its definition), >> 5.
; The fraction advances by 5 modulo 32 per row (5, 10, 15, ...); each time
; it wraps, the sample pairs are advanced by one sample (4 bytes).
;-----------------------------------------------------------------------------
18691 %macro MODE_8_28 1 | |
18692 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
18693 movd m3, [r2 + 18] ; [0 0 0 0 0 0 10 9] - movd loads only 32 bits; only word 9 is consumed below | |
18694 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] | |
18695 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] | |
18696 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] | |
18697 | |
18698 pmaddwd m4, m0, [r3 - 11 * 16] ; [5] | |
18699 paddd m4, [pd_16] | |
18700 psrld m4, 5 | |
18701 | |
18702 pmaddwd m1, m0, [r3 - 6 * 16] ; [10] | |
18703 paddd m1, [pd_16] | |
18704 psrld m1, 5 | |
18705 packusdw m4, m1 | |
18706 | |
18707 pmaddwd m5, m0, [r3 - 16] ; [15] | |
18708 paddd m5, [pd_16] | |
18709 psrld m5, 5 | |
18710 | |
18711 pmaddwd m6, m0, [r3 + 4 * 16] ; [20] | |
18712 paddd m6, [pd_16] | |
18713 psrld m6, 5 | |
18714 packusdw m5, m6 | |
18715 | |
18716 pmaddwd m6, m0, [r3 + 9 * 16] ; [25] | |
18717 paddd m6, [pd_16] | |
18718 psrld m6, 5 | |
18719 | |
18720 pmaddwd m1, m0, [r3 + 14 * 16] ; [30] | |
18721 paddd m1, [pd_16] | |
18722 psrld m1, 5 | |
18723 packusdw m6, m1 | |
18724 | |
18725 palignr m7, m2, m0, 4 | |
18726 pmaddwd m1, m7, [r3 - 13 * 16] ; [3] | |
18727 paddd m1, [pd_16] | |
18728 psrld m1, 5 | |
18729 | |
; keep the low pairs in m3: m0 is about to be reused as a result register,
; but the later palignr by 8 and by 12 still need them
18730 mova m3, m0 | |
18731 pmaddwd m0, m7, [r3 - 8 * 16] ; [8] | |
18732 paddd m0, [pd_16] | |
18733 psrld m0, 5 | |
18734 packusdw m1, m0 | |
18735 | |
; rows 0-7 complete in m4, m5, m6, m1 (two rows per register)
18736 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
18737 | |
18738 pmaddwd m4, m7, [r3 - 3 * 16] ; [13] | |
18739 paddd m4, [pd_16] | |
18740 psrld m4, 5 | |
18741 | |
18742 pmaddwd m1, m7, [r3 + 2 * 16] ; [18] | |
18743 paddd m1, [pd_16] | |
18744 psrld m1, 5 | |
18745 packusdw m4, m1 | |
18746 | |
18747 pmaddwd m5, m7, [r3 + 7 * 16] ; [23] | |
18748 paddd m5, [pd_16] | |
18749 psrld m5, 5 | |
18750 | |
18751 pmaddwd m6, m7, [r3 + 12 * 16] ; [28] | |
18752 paddd m6, [pd_16] | |
18753 psrld m6, 5 | |
18754 packusdw m5, m6 | |
18755 | |
18756 palignr m7, m2, m3, 8 | |
18757 pmaddwd m6, m7, [r3 - 15 * 16] ; [1] | |
18758 paddd m6, [pd_16] | |
18759 psrld m6, 5 | |
18760 | |
18761 pmaddwd m1, m7, [r3 - 10 * 16] ; [6] | |
18762 paddd m1, [pd_16] | |
18763 psrld m1, 5 | |
18764 packusdw m6, m1 | |
18765 | |
18766 pmaddwd m1, m7, [r3 - 5 * 16] ; [11] | |
18767 paddd m1, [pd_16] | |
18768 psrld m1, 5 | |
18769 | |
18770 pmaddwd m0, m7, [r3] ; [16] | |
18771 paddd m0, [pd_16] | |
18772 psrld m0, 5 | |
18773 packusdw m1, m0 | |
18774 | |
; rows 8-15
18775 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
18776 | |
18777 pmaddwd m4, m7, [r3 + 5 * 16] ; [21] | |
18778 paddd m4, [pd_16] | |
18779 psrld m4, 5 | |
18780 | |
18781 pmaddwd m1, m7, [r3 + 10 * 16] ; [26] | |
18782 paddd m1, [pd_16] | |
18783 psrld m1, 5 | |
18784 packusdw m4, m1 | |
18785 | |
18786 pmaddwd m5, m7, [r3 + 15 * 16] ; [31] | |
18787 paddd m5, [pd_16] | |
18788 psrld m5, 5 | |
18789 | |
18790 palignr m7, m2, m3, 12 | |
18791 pmaddwd m0, m7, [r3 - 12 * 16] ; [4] | |
18792 paddd m0, [pd_16] | |
18793 psrld m0, 5 | |
18794 packusdw m5, m0 | |
18795 | |
18796 pmaddwd m6, m7, [r3 - 7 * 16] ; [9] | |
18797 paddd m6, [pd_16] | |
18798 psrld m6, 5 | |
18799 | |
18800 pmaddwd m1, m7, [r3 - 2 * 16] ; [14] | |
18801 paddd m1, [pd_16] | |
18802 psrld m1, 5 | |
18803 packusdw m6, m1 | |
18804 | |
18805 pmaddwd m1, m7, [r3 + 3 * 16] ; [19] | |
18806 paddd m1, [pd_16] | |
18807 psrld m1, 5 | |
18808 | |
18809 pmaddwd m0, m7, [r3 + 8 * 16] ; [24] | |
18810 paddd m0, [pd_16] | |
18811 psrld m0, 5 | |
18812 packusdw m1, m0 | |
18813 | |
; rows 16-23
18814 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
18815 | |
18816 pmaddwd m4, m7, [r3 + 13 * 16] ; [29] | |
18817 paddd m4, [pd_16] | |
18818 psrld m4, 5 | |
18819 | |
18820 pmaddwd m1, m2, [r3 - 14 * 16] ; [2] | |
18821 paddd m1, [pd_16] | |
18822 psrld m1, 5 | |
18823 packusdw m4, m1 | |
18824 | |
18825 pmaddwd m5, m2, [r3 - 9 * 16] ; [7] | |
18826 paddd m5, [pd_16] | |
18827 psrld m5, 5 | |
18828 | |
18829 pmaddwd m0, m2, [r3 - 4 * 16] ; [12] | |
18830 paddd m0, [pd_16] | |
18831 psrld m0, 5 | |
18832 packusdw m5, m0 | |
18833 | |
18834 pmaddwd m6, m2, [r3 + 16] ; [17] | |
18835 paddd m6, [pd_16] | |
18836 psrld m6, 5 | |
18837 | |
18838 pmaddwd m0, m2, [r3 + 6 * 16] ; [22] | |
18839 paddd m0, [pd_16] | |
18840 psrld m0, 5 | |
18841 packusdw m6, m0 | |
18842 | |
18843 pmaddwd m1, m2, [r3 + 11 * 16] ; [27] | |
18844 paddd m1, [pd_16] | |
18845 psrld m1, 5 | |
; Last row has fraction 0, so no interpolation is needed: pack row [27] into
; the low half and load 4 reference words directly into the high half.
18846 packusdw m1, m1 | |
18847 movhps m1, [r2 + 12] ; [00] | |
18848 | |
; rows 24-31
18849 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
18850 %endmacro | |
18851 | |
;-----------------------------------------------------------------------------
; MODE_9_27 <arg>
; Straight-line code producing 32 rows of 4 interpolated 16-bit samples,
; handed to TRANSPOSE_STORE_8x8 in four groups of 8 rows.
; %1 is not interpreted here; it is only forwarded as the second argument of
; every TRANSPOSE_STORE_8x8 invocation.
; In:     r2 = reference samples, 16-bit words (bytes r2+2 .. r2+17 are read)
;         r3 = weight-table pointer, biased so that [r3 + k*16] is the entry
;              for fraction 16+k (the bracketed number on each pmaddwd)
; Uses:   m0-m7 are all overwritten
; Per row: pmaddwd of neighbouring-sample pairs with the weights, + [pd_16]
;          (presumably the rounding term - confirm at its definition), >> 5.
; The fraction advances by 2 per row (2, 4, ..., 30, 0), so the same sample
; pairs serve 16 consecutive rows: m3 for rows 0-15, m7 (one sample further)
; for rows 16-31.
;-----------------------------------------------------------------------------
18852 %macro MODE_9_27 1 | |
18853 movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
; NOTE(review): this is the 2-operand palignr form, so the top word is shifted
; in from the previous contents of m1 rather than being sample 9 (compare the
; '[x ...]' annotation on the same pattern in MODE_7_29). It looks harmless:
; m2 is only read by the palignr-by-4 further down, which takes its two
; lowest words.
18854 palignr m1, m3, 2 ; [x 8 7 6 5 4 3 2] | |
18855 punpckhwd m2, m3, m1 ; [x 8 8 7 7 6 6 5] | |
18856 punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1] | |
18857 | |
18858 pmaddwd m4, m3, [r3 - 14 * 16] ; [2] | |
18859 paddd m4, [pd_16] | |
18860 psrld m4, 5 | |
18861 | |
18862 pmaddwd m1, m3, [r3 - 12 * 16] ; [4] | |
18863 paddd m1, [pd_16] | |
18864 psrld m1, 5 | |
18865 packusdw m4, m1 | |
18866 | |
18867 pmaddwd m5, m3, [r3 - 10 * 16] ; [6] | |
18868 paddd m5, [pd_16] | |
18869 psrld m5, 5 | |
18870 | |
18871 pmaddwd m6, m3, [r3 - 8 * 16] ; [8] | |
18872 paddd m6, [pd_16] | |
18873 psrld m6, 5 | |
18874 packusdw m5, m6 | |
18875 | |
18876 pmaddwd m6, m3, [r3 - 6 * 16] ; [10] | |
18877 paddd m6, [pd_16] | |
18878 psrld m6, 5 | |
18879 | |
18880 pmaddwd m1, m3, [r3 - 4 * 16] ; [12] | |
18881 paddd m1, [pd_16] | |
18882 psrld m1, 5 | |
18883 packusdw m6, m1 | |
18884 | |
18885 pmaddwd m1, m3, [r3 - 2 * 16] ; [14] | |
18886 paddd m1, [pd_16] | |
18887 psrld m1, 5 | |
18888 | |
18889 pmaddwd m0, m3, [r3] ; [16] | |
18890 paddd m0, [pd_16] | |
18891 psrld m0, 5 | |
18892 packusdw m1, m0 | |
18893 | |
; rows 0-7 complete in m4, m5, m6, m1 (two rows per register)
18894 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
18895 | |
18896 pmaddwd m4, m3, [r3 + 2 * 16] ; [18] | |
18897 paddd m4, [pd_16] | |
18898 psrld m4, 5 | |
18899 | |
18900 pmaddwd m1, m3, [r3 + 4 * 16] ; [20] | |
18901 paddd m1, [pd_16] | |
18902 psrld m1, 5 | |
18903 packusdw m4, m1 | |
18904 | |
18905 pmaddwd m5, m3, [r3 + 6 * 16] ; [22] | |
18906 paddd m5, [pd_16] | |
18907 psrld m5, 5 | |
18908 | |
18909 pmaddwd m6, m3, [r3 + 8 * 16] ; [24] | |
18910 paddd m6, [pd_16] | |
18911 psrld m6, 5 | |
18912 packusdw m5, m6 | |
18913 | |
18914 pmaddwd m6, m3, [r3 + 10 * 16] ; [26] | |
18915 paddd m6, [pd_16] | |
18916 psrld m6, 5 | |
18917 | |
18918 pmaddwd m1, m3, [r3 + 12 * 16] ; [28] | |
18919 paddd m1, [pd_16] | |
18920 psrld m1, 5 | |
18921 packusdw m6, m1 | |
18922 | |
18923 pmaddwd m1, m3, [r3 + 14 * 16] ; [30] | |
18924 paddd m1, [pd_16] | |
18925 psrld m1, 5 | |
18926 | |
; Row 15 has fraction 0, so no interpolation is needed: pack row [30] into
; the low half and load 4 reference words directly into the high half.
18927 packusdw m1, m1 | |
18928 movhps m1, [r2 + 4] ; [00] | |
18929 | |
; rows 8-15
18930 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
18931 | |
; advance the pairs by one sample for the second half of the rows
18932 palignr m7, m2, m3, 4 | |
18933 pmaddwd m4, m7, [r3 - 14 * 16] ; [2] | |
18934 paddd m4, [pd_16] | |
18935 psrld m4, 5 | |
18936 | |
18937 pmaddwd m1, m7, [r3 - 12 * 16] ; [4] | |
18938 paddd m1, [pd_16] | |
18939 psrld m1, 5 | |
18940 packusdw m4, m1 | |
18941 | |
18942 pmaddwd m5, m7, [r3 - 10 * 16] ; [6] | |
18943 paddd m5, [pd_16] | |
18944 psrld m5, 5 | |
18945 | |
18946 pmaddwd m0, m7, [r3 - 8 * 16] ; [8] | |
18947 paddd m0, [pd_16] | |
18948 psrld m0, 5 | |
18949 packusdw m5, m0 | |
18950 | |
18951 pmaddwd m6, m7, [r3 - 6 * 16] ; [10] | |
18952 paddd m6, [pd_16] | |
18953 psrld m6, 5 | |
18954 | |
18955 pmaddwd m1, m7, [r3 - 4 * 16] ; [12] | |
18956 paddd m1, [pd_16] | |
18957 psrld m1, 5 | |
18958 packusdw m6, m1 | |
18959 | |
18960 pmaddwd m1, m7, [r3 - 2 * 16] ; [14] | |
18961 paddd m1, [pd_16] | |
18962 psrld m1, 5 | |
18963 | |
18964 pmaddwd m0, m7, [r3] ; [16] | |
18965 paddd m0, [pd_16] | |
18966 psrld m0, 5 | |
18967 packusdw m1, m0 | |
18968 | |
; rows 16-23
18969 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
18970 | |
18971 pmaddwd m4, m7, [r3 + 2 * 16] ; [18] | |
18972 paddd m4, [pd_16] | |
18973 psrld m4, 5 | |
18974 | |
18975 pmaddwd m1, m7, [r3 + 4 * 16] ; [20] | |
18976 paddd m1, [pd_16] | |
18977 psrld m1, 5 | |
18978 packusdw m4, m1 | |
18979 | |
18980 pmaddwd m5, m7, [r3 + 6 * 16] ; [22] | |
18981 paddd m5, [pd_16] | |
18982 psrld m5, 5 | |
18983 | |
18984 pmaddwd m0, m7, [r3 + 8 * 16] ; [24] | |
18985 paddd m0, [pd_16] | |
18986 psrld m0, 5 | |
18987 packusdw m5, m0 | |
18988 | |
18989 pmaddwd m6, m7, [r3 + 10 * 16] ; [26] | |
18990 paddd m6, [pd_16] | |
18991 psrld m6, 5 | |
18992 | |
18993 pmaddwd m0, m7, [r3 + 12 * 16] ; [28] | |
18994 paddd m0, [pd_16] | |
18995 psrld m0, 5 | |
18996 packusdw m6, m0 | |
18997 | |
18998 pmaddwd m7, [r3 + 14 * 16] ; [30] | |
18999 paddd m7, [pd_16] | |
19000 psrld m7, 5 | |
; Row 31 has fraction 0 again: direct copy of 4 reference words, one sample
; further along than the copy used for row 15.
19001 packusdw m7, m7 | |
19002 movhps m7, [r2 + 6] ; [00] | |
19003 | |
; rows 24-31 (note the fourth data register is m7 here, not m1)
19004 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7 | |
19005 %endmacro | |
19006 | |
;-----------------------------------------------------------------------------
; MODE_11_25 <arg>
; Straight-line code producing 32 rows of 4 interpolated 16-bit samples,
; handed to TRANSPOSE_STORE_8x8 in four groups of 8 rows.
; %1 is not interpreted here; it is only forwarded as the second argument of
; every TRANSPOSE_STORE_8x8 invocation.
; In:     r2 = reference samples, 16-bit words (bytes r2+0 .. r2+17 are read)
;         r3 = weight-table pointer, biased so that [r3 + k*16] is the entry
;              for fraction 16+k (the bracketed number on each pmaddwd)
; Uses:   m0, m1, m3, m4, m5, m6 are overwritten; the [pw_punpcklwd] constant
;         is the pshufb control that turns a load into neighbour pairs
; Per row: pmaddwd of neighbouring-sample pairs with the weights, + [pd_16]
;          (presumably the rounding term - confirm at its definition), >> 5.
; The fraction decreases by 2 per row (30, 28, ..., 2, 0), so the same sample
; pairs serve 16 consecutive rows: the load at r2+2 for rows 0-15, the load
; one sample earlier (r2) for rows 16-31.
;-----------------------------------------------------------------------------
19007 %macro MODE_11_25 1 | |
19008 movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0] | |
19009 pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0] | |
19010 | |
19011 pmaddwd m4, m3, [r3 + 14 * 16] ; [30] | |
19012 paddd m4, [pd_16] | |
19013 psrld m4, 5 | |
19014 | |
19015 pmaddwd m1, m3, [r3 + 12 * 16] ; [28] | |
19016 paddd m1, [pd_16] | |
19017 psrld m1, 5 | |
19018 packusdw m4, m1 | |
19019 | |
19020 pmaddwd m5, m3, [r3 + 10 * 16] ; [26] | |
19021 paddd m5, [pd_16] | |
19022 psrld m5, 5 | |
19023 | |
19024 pmaddwd m6, m3, [r3 + 8 * 16] ; [24] | |
19025 paddd m6, [pd_16] | |
19026 psrld m6, 5 | |
19027 packusdw m5, m6 | |
19028 | |
19029 pmaddwd m6, m3, [r3 + 6 * 16] ; [22] | |
19030 paddd m6, [pd_16] | |
19031 psrld m6, 5 | |
19032 | |
19033 pmaddwd m1, m3, [r3 + 4 * 16] ; [20] | |
19034 paddd m1, [pd_16] | |
19035 psrld m1, 5 | |
19036 packusdw m6, m1 | |
19037 | |
19038 pmaddwd m1, m3, [r3 + 2 * 16] ; [18] | |
19039 paddd m1, [pd_16] | |
19040 psrld m1, 5 | |
19041 | |
19042 pmaddwd m0, m3, [r3] ; [16] | |
19043 paddd m0, [pd_16] | |
19044 psrld m0, 5 | |
19045 packusdw m1, m0 | |
19046 | |
; rows 0-7 complete in m4, m5, m6, m1 (two rows per register)
19047 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
19048 | |
19049 pmaddwd m4, m3, [r3 - 2 * 16] ; [14] | |
19050 paddd m4, [pd_16] | |
19051 psrld m4, 5 | |
19052 | |
19053 pmaddwd m1, m3, [r3 - 4 * 16] ; [12] | |
19054 paddd m1, [pd_16] | |
19055 psrld m1, 5 | |
19056 packusdw m4, m1 | |
19057 | |
19058 pmaddwd m5, m3, [r3 - 6 * 16] ; [10] | |
19059 paddd m5, [pd_16] | |
19060 psrld m5, 5 | |
19061 | |
19062 pmaddwd m6, m3, [r3 - 8 * 16] ; [8] | |
19063 paddd m6, [pd_16] | |
19064 psrld m6, 5 | |
19065 packusdw m5, m6 | |
19066 | |
19067 pmaddwd m6, m3, [r3 - 10 * 16] ; [6] | |
19068 paddd m6, [pd_16] | |
19069 psrld m6, 5 | |
19070 | |
19071 pmaddwd m1, m3, [r3 - 12 * 16] ; [4] | |
19072 paddd m1, [pd_16] | |
19073 psrld m1, 5 | |
19074 packusdw m6, m1 | |
19075 | |
19076 pmaddwd m1, m3, [r3 - 14 * 16] ; [2] | |
19077 paddd m1, [pd_16] | |
19078 psrld m1, 5 | |
19079 | |
; Row 15 has fraction 0, so no interpolation is needed: pack row [2] into
; the low half and load 4 reference words directly into the high half.
19080 packusdw m1, m1 | |
19081 movhps m1, [r2 + 2] ; [00] | |
19082 | |
; rows 8-15
19083 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
19084 | |
; second half: same fraction sequence on pairs that start one sample earlier
19085 movu m3, [r2] ; [6 5 4 3 2 1 0 16] | |
19086 pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16] | |
19087 | |
19088 pmaddwd m4, m3, [r3 + 14 * 16] ; [30] | |
19089 paddd m4, [pd_16] | |
19090 psrld m4, 5 | |
19091 | |
19092 pmaddwd m1, m3, [r3 + 12 * 16] ; [28] | |
19093 paddd m1, [pd_16] | |
19094 psrld m1, 5 | |
19095 packusdw m4, m1 | |
19096 | |
19097 pmaddwd m5, m3, [r3 + 10 * 16] ; [26] | |
19098 paddd m5, [pd_16] | |
19099 psrld m5, 5 | |
19100 | |
19101 pmaddwd m0, m3, [r3 + 8 * 16] ; [24] | |
19102 paddd m0, [pd_16] | |
19103 psrld m0, 5 | |
19104 packusdw m5, m0 | |
19105 | |
19106 pmaddwd m6, m3, [r3 + 6 * 16] ; [22] | |
19107 paddd m6, [pd_16] | |
19108 psrld m6, 5 | |
19109 | |
19110 pmaddwd m1, m3, [r3 + 4 * 16] ; [20] | |
19111 paddd m1, [pd_16] | |
19112 psrld m1, 5 | |
19113 packusdw m6, m1 | |
19114 | |
19115 pmaddwd m1, m3, [r3 + 2 * 16] ; [18] | |
19116 paddd m1, [pd_16] | |
19117 psrld m1, 5 | |
19118 | |
19119 pmaddwd m0, m3, [r3] ; [16] | |
19120 paddd m0, [pd_16] | |
19121 psrld m0, 5 | |
19122 packusdw m1, m0 | |
19123 | |
; rows 16-23
19124 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
19125 | |
19126 pmaddwd m4, m3, [r3 - 2 * 16] ; [14] | |
19127 paddd m4, [pd_16] | |
19128 psrld m4, 5 | |
19129 | |
19130 pmaddwd m1, m3, [r3 - 4 * 16] ; [12] | |
19131 paddd m1, [pd_16] | |
19132 psrld m1, 5 | |
19133 packusdw m4, m1 | |
19134 | |
19135 pmaddwd m5, m3, [r3 - 6 * 16] ; [10] | |
19136 paddd m5, [pd_16] | |
19137 psrld m5, 5 | |
19138 | |
19139 pmaddwd m6, m3, [r3 - 8 * 16] ; [8] | |
19140 paddd m6, [pd_16] | |
19141 psrld m6, 5 | |
19142 packusdw m5, m6 | |
19143 | |
19144 pmaddwd m6, m3, [r3 - 10 * 16] ; [6] | |
19145 paddd m6, [pd_16] | |
19146 psrld m6, 5 | |
19147 | |
19148 pmaddwd m1, m3, [r3 - 12 * 16] ; [4] | |
19149 paddd m1, [pd_16] | |
19150 psrld m1, 5 | |
19151 packusdw m6, m1 | |
19152 | |
19153 pmaddwd m1, m3, [r3 - 14 * 16] ; [2] | |
19154 paddd m1, [pd_16] | |
19155 psrld m1, 5 | |
19156 | |
; Row 31 has fraction 0 again: direct copy of the 4 reference words at r2.
19157 packusdw m1, m1 | |
19158 movhps m1, [r2] ; [00] | |
19159 | |
; rows 24-31
19160 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
19161 %endmacro | |
19162 | |
;-----------------------------------------------------------------------------
; MODE_12_24 <arg>
; Straight-line code producing 32 rows of 4 interpolated 16-bit samples,
; handed to TRANSPOSE_STORE_8x8 in four groups of 8 rows.
; %1 is not interpreted here; it is only forwarded as the second argument of
; every TRANSPOSE_STORE_8x8 invocation.
; In:     r2 = reference samples, 16-bit words (bytes r2+0 .. r2+23 are read)
;         r3 = weight-table pointer, biased so that [r3 + k*16] is the entry
;              for fraction 16+k (the bracketed number on each pmaddwd)
;         m2 = pshufb control mask; it is only read here, never written, so
;              the invoking code must have loaded it (presumably the same
;              pair-interleave pattern as pw_punpcklwd - confirm at the caller)
; Uses:   m0, m1, m3, m4, m5, m6 are overwritten
; Per row: pmaddwd of neighbouring-sample pairs with the weights, + [pd_16]
;          (presumably the rounding term - confirm at its definition), >> 5.
; The fraction decreases by 5 modulo 32 per row (27, 22, 17, ...); each time
; it wraps, the pairs are reloaded from 2 bytes (one sample) lower in r2:
; r2+8, r2+6, r2+4, r2+2, r2.
;-----------------------------------------------------------------------------
19163 %macro MODE_12_24 1 | |
19164 movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0] | |
19165 pshufb m3, m2 ; [4 3 3 2 2 1 1 0] | |
19166 | |
19167 pmaddwd m4, m3, [r3 + 11 * 16] ; [27] | |
19168 paddd m4, [pd_16] | |
19169 psrld m4, 5 | |
19170 | |
19171 pmaddwd m1, m3, [r3 + 6 * 16] ; [22] | |
19172 paddd m1, [pd_16] | |
19173 psrld m1, 5 | |
19174 packusdw m4, m1 | |
19175 | |
19176 pmaddwd m5, m3, [r3 + 16] ; [17] | |
19177 paddd m5, [pd_16] | |
19178 psrld m5, 5 | |
19179 | |
19180 pmaddwd m6, m3, [r3 - 4 * 16] ; [12] | |
19181 paddd m6, [pd_16] | |
19182 psrld m6, 5 | |
19183 packusdw m5, m6 | |
19184 | |
19185 pmaddwd m6, m3, [r3 - 9 * 16] ; [7] | |
19186 paddd m6, [pd_16] | |
19187 psrld m6, 5 | |
19188 | |
19189 pmaddwd m1, m3, [r3 - 14 * 16] ; [2] | |
19190 paddd m1, [pd_16] | |
19191 psrld m1, 5 | |
19192 packusdw m6, m1 | |
19193 | |
; fraction wrapped (2 -> 29): step the source back by one sample
19194 movu m3, [r2 + 6] | |
19195 pshufb m3, m2 | |
19196 | |
19197 pmaddwd m1, m3, [r3 + 13 * 16] ; [29] | |
19198 paddd m1, [pd_16] | |
19199 psrld m1, 5 | |
19200 | |
19201 pmaddwd m0, m3, [r3 + 8 * 16] ; [24] | |
19202 paddd m0, [pd_16] | |
19203 psrld m0, 5 | |
19204 packusdw m1, m0 | |
19205 | |
; rows 0-7 complete in m4, m5, m6, m1 (two rows per register)
19206 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
19207 | |
19208 pmaddwd m4, m3, [r3 + 3 * 16] ; [19] | |
19209 paddd m4, [pd_16] | |
19210 psrld m4, 5 | |
19211 | |
19212 pmaddwd m1, m3, [r3 - 2 * 16] ; [14] | |
19213 paddd m1, [pd_16] | |
19214 psrld m1, 5 | |
19215 packusdw m4, m1 | |
19216 | |
19217 pmaddwd m5, m3, [r3 - 7 * 16] ; [9] | |
19218 paddd m5, [pd_16] | |
19219 psrld m5, 5 | |
19220 | |
19221 pmaddwd m6, m3, [r3 - 12 * 16] ; [4] | |
19222 paddd m6, [pd_16] | |
19223 psrld m6, 5 | |
19224 packusdw m5, m6 | |
19225 | |
; fraction wrapped (4 -> 31): step the source back by one sample
19226 movu m3, [r2 + 4] | |
19227 pshufb m3, m2 | |
19228 | |
19229 pmaddwd m6, m3, [r3 + 15 * 16] ; [31] | |
19230 paddd m6, [pd_16] | |
19231 psrld m6, 5 | |
19232 | |
19233 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
19234 paddd m1, [pd_16] | |
19235 psrld m1, 5 | |
19236 packusdw m6, m1 | |
19237 | |
19238 pmaddwd m1, m3, [r3 + 5 * 16] ; [21] | |
19239 paddd m1, [pd_16] | |
19240 psrld m1, 5 | |
19241 | |
19242 pmaddwd m0, m3, [r3] ; [16] | |
19243 paddd m0, [pd_16] | |
19244 psrld m0, 5 | |
19245 packusdw m1, m0 | |
19246 | |
; rows 8-15
19247 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
19248 | |
19249 pmaddwd m4, m3, [r3 - 5 * 16] ; [11] | |
19250 paddd m4, [pd_16] | |
19251 psrld m4, 5 | |
19252 | |
19253 pmaddwd m1, m3, [r3 - 10 * 16] ; [6] | |
19254 paddd m1, [pd_16] | |
19255 psrld m1, 5 | |
19256 packusdw m4, m1 | |
19257 | |
19258 pmaddwd m5, m3, [r3 - 15 * 16] ; [1] | |
19259 paddd m5, [pd_16] | |
19260 psrld m5, 5 | |
19261 | |
; fraction wrapped (1 -> 28): step the source back by one sample
19262 movu m3, [r2 + 2] | |
19263 pshufb m3, m2 | |
19264 | |
19265 pmaddwd m0, m3, [r3 + 12 * 16] ; [28] | |
19266 paddd m0, [pd_16] | |
19267 psrld m0, 5 | |
19268 packusdw m5, m0 | |
19269 | |
19270 pmaddwd m6, m3, [r3 + 7 * 16] ; [23] | |
19271 paddd m6, [pd_16] | |
19272 psrld m6, 5 | |
19273 | |
19274 pmaddwd m1, m3, [r3 + 2 * 16] ; [18] | |
19275 paddd m1, [pd_16] | |
19276 psrld m1, 5 | |
19277 packusdw m6, m1 | |
19278 | |
19279 pmaddwd m1, m3, [r3 - 3 * 16] ; [13] | |
19280 paddd m1, [pd_16] | |
19281 psrld m1, 5 | |
19282 | |
19283 pmaddwd m0, m3, [r3 - 8 * 16] ; [8] | |
19284 paddd m0, [pd_16] | |
19285 psrld m0, 5 | |
19286 packusdw m1, m0 | |
19287 | |
; rows 16-23
19288 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
19289 | |
19290 pmaddwd m4, m3, [r3 - 13 * 16] ; [3] | |
19291 paddd m4, [pd_16] | |
19292 psrld m4, 5 | |
19293 | |
; fraction wrapped (3 -> 30): step the source back by one sample
19294 movu m3, [r2] | |
19295 pshufb m3, m2 | |
19296 | |
19297 pmaddwd m1, m3, [r3 + 14 * 16] ; [30] | |
19298 paddd m1, [pd_16] | |
19299 psrld m1, 5 | |
19300 packusdw m4, m1 | |
19301 | |
19302 pmaddwd m5, m3, [r3 + 9 * 16] ; [25] | |
19303 paddd m5, [pd_16] | |
19304 psrld m5, 5 | |
19305 | |
19306 pmaddwd m6, m3, [r3 + 4 * 16] ; [20] | |
19307 paddd m6, [pd_16] | |
19308 psrld m6, 5 | |
19309 packusdw m5, m6 | |
19310 | |
19311 pmaddwd m6, m3, [r3 - 16] ; [15] | |
19312 paddd m6, [pd_16] | |
19313 psrld m6, 5 | |
19314 | |
19315 pmaddwd m1, m3, [r3 - 6 * 16] ; [10] | |
19316 paddd m1, [pd_16] | |
19317 psrld m1, 5 | |
19318 packusdw m6, m1 | |
19319 | |
19320 pmaddwd m1, m3, [r3 - 11 * 16] ; [5] | |
19321 paddd m1, [pd_16] | |
19322 psrld m1, 5 | |
19323 | |
; Last row has fraction 0, so no interpolation is needed: pack row [5] into
; the low half and load 4 reference words directly into the high half.
19324 packusdw m1, m1 | |
19325 movhps m1, [r2] ; [00] | |
19326 | |
; rows 24-31
19327 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
19328 %endmacro | |
19329 | |
19330 %macro MODE_13_23 1 | |
19331 movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0] | |
19332 pshufb m3, m2 ; [4 3 3 2 2 1 1 0] | |
19333 | |
19334 pmaddwd m4, m3, [r3 + 7 * 16] ; [23] | |
19335 paddd m4, [pd_16] | |
19336 psrld m4, 5 | |
19337 | |
19338 pmaddwd m1, m3, [r3 - 2 * 16] ; [14] | |
19339 paddd m1, [pd_16] | |
19340 psrld m1, 5 | |
19341 packusdw m4, m1 | |
19342 | |
19343 pmaddwd m5, m3, [r3 - 11 * 16] ; [05] | |
19344 paddd m5, [pd_16] | |
19345 psrld m5, 5 | |
19346 | |
19347 movu m3, [r2 + 14] | |
19348 pshufb m3, m2 | |
19349 | |
19350 pmaddwd m6, m3, [r3 + 12 * 16] ; [28] | |
19351 paddd m6, [pd_16] | |
19352 psrld m6, 5 | |
19353 packusdw m5, m6 | |
19354 | |
19355 pmaddwd m6, m3, [r3 + 3 * 16] ; [19] | |
19356 paddd m6, [pd_16] | |
19357 psrld m6, 5 | |
19358 | |
19359 pmaddwd m1, m3, [r3 - 6 * 16] ; [10] | |
19360 paddd m1, [pd_16] | |
19361 psrld m1, 5 | |
19362 packusdw m6, m1 | |
19363 | |
19364 pmaddwd m1, m3, [r3 - 15 * 16] ; [01] | |
19365 paddd m1, [pd_16] | |
19366 psrld m1, 5 | |
19367 | |
19368 movu m3, [r2 + 12] | |
19369 pshufb m3, m2 | |
19370 | |
19371 pmaddwd m0, m3, [r3 + 8 * 16] ; [24] | |
19372 paddd m0, [pd_16] | |
19373 psrld m0, 5 | |
19374 packusdw m1, m0 | |
19375 | |
19376 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
19377 | |
19378 pmaddwd m4, m3, [r3 - 16] ; [15] | |
19379 paddd m4, [pd_16] | |
19380 psrld m4, 5 | |
19381 | |
19382 pmaddwd m1, m3, [r3 - 10 * 16] ; [06] | |
19383 paddd m1, [pd_16] | |
19384 psrld m1, 5 | |
19385 packusdw m4, m1 | |
19386 | |
19387 movu m3, [r2 + 10] | |
19388 pshufb m3, m2 | |
19389 | |
19390 pmaddwd m5, m3, [r3 + 13 * 16] ; [29] | |
19391 paddd m5, [pd_16] | |
19392 psrld m5, 5 | |
19393 | |
19394 pmaddwd m6, m3, [r3 + 4 * 16] ; [20] | |
19395 paddd m6, [pd_16] | |
19396 psrld m6, 5 | |
19397 packusdw m5, m6 | |
19398 | |
19399 pmaddwd m6, m3, [r3 - 5 * 16] ; [11] | |
19400 paddd m6, [pd_16] | |
19401 psrld m6, 5 | |
19402 | |
19403 pmaddwd m1, m3, [r3 - 14 * 16] ; [02] | |
19404 paddd m1, [pd_16] | |
19405 psrld m1, 5 | |
19406 packusdw m6, m1 | |
19407 | |
19408 movu m3, [r2 + 8] | |
19409 pshufb m3, m2 | |
19410 | |
19411 pmaddwd m1, m3, [r3 + 9 * 16] ; [25] | |
19412 paddd m1, [pd_16] | |
19413 psrld m1, 5 | |
19414 | |
19415 pmaddwd m0, m3, [r3] ; [16] | |
19416 paddd m0, [pd_16] | |
19417 psrld m0, 5 | |
19418 packusdw m1, m0 | |
19419 | |
19420 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
19421 | |
19422 pmaddwd m4, m3, [r3 - 9 * 16] ; [07] | |
19423 paddd m4, [pd_16] | |
19424 psrld m4, 5 | |
19425 | |
19426 movu m3, [r2 + 6] | |
19427 pshufb m3, m2 | |
19428 | |
19429 pmaddwd m1, m3, [r3 + 14 * 16] ; [30] | |
19430 paddd m1, [pd_16] | |
19431 psrld m1, 5 | |
19432 packusdw m4, m1 | |
19433 | |
19434 pmaddwd m5, m3, [r3 + 5 * 16] ; [21] | |
19435 paddd m5, [pd_16] | |
19436 psrld m5, 5 | |
19437 | |
19438 pmaddwd m0, m3, [r3 - 4 * 16] ; [12] | |
19439 paddd m0, [pd_16] | |
19440 psrld m0, 5 | |
19441 packusdw m5, m0 | |
19442 | |
19443 pmaddwd m6, m3, [r3 - 13 * 16] ; [03] | |
19444 paddd m6, [pd_16] | |
19445 psrld m6, 5 | |
19446 | |
19447 movu m3, [r2 + 4] | |
19448 pshufb m3, m2 | |
19449 | |
19450 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
19451 paddd m1, [pd_16] | |
19452 psrld m1, 5 | |
19453 packusdw m6, m1 | |
19454 | |
19455 pmaddwd m1, m3, [r3 + 16] ; [17] | |
19456 paddd m1, [pd_16] | |
19457 psrld m1, 5 | |
19458 | |
19459 pmaddwd m0, m3, [r3 - 8 * 16] ; [08] | |
19460 paddd m0, [pd_16] | |
19461 psrld m0, 5 | |
19462 packusdw m1, m0 | |
19463 | |
19464 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
19465 | |
19466 movu m3, [r2 + 2] | |
19467 pshufb m3, m2 | |
19468 | |
19469 pmaddwd m4, m3, [r3 + 15 * 16] ; [31] | |
19470 paddd m4, [pd_16] | |
19471 psrld m4, 5 | |
19472 | |
19473 pmaddwd m1, m3, [r3 + 6 * 16] ; [22] | |
19474 paddd m1, [pd_16] | |
19475 psrld m1, 5 | |
19476 packusdw m4, m1 | |
19477 | |
19478 pmaddwd m5, m3, [r3 - 3 * 16] ; [13] | |
19479 paddd m5, [pd_16] | |
19480 psrld m5, 5 | |
19481 | |
19482 pmaddwd m6, m3, [r3 - 12 * 16] ; [04] | |
19483 paddd m6, [pd_16] | |
19484 psrld m6, 5 | |
19485 packusdw m5, m6 | |
19486 | |
19487 movu m3, [r2] | |
19488 pshufb m3, m2 | |
19489 | |
19490 pmaddwd m6, m3, [r3 + 11 * 16] ; [27] | |
19491 paddd m6, [pd_16] | |
19492 psrld m6, 5 | |
19493 | |
19494 pmaddwd m1, m3, [r3 + 2 * 16] ; [18] | |
19495 paddd m1, [pd_16] | |
19496 psrld m1, 5 | |
19497 packusdw m6, m1 | |
19498 | |
19499 pmaddwd m1, m3, [r3 - 7 * 16] ; [09] | |
19500 paddd m1, [pd_16] | |
19501 psrld m1, 5 | |
19502 | |
19503 packusdw m1, m1 | |
19504 movhps m1, [r2] ; [00] | |
19505 | |
19506 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
19507 %endmacro | |
19508 | |
; MODE_14_22 %1
; Produces 32 two-tap weighted results (fraction annotations [19] ... [00]) and
; passes them to TRANSPOSE_STORE_8x8 in four batches of eight (first macro
; argument 0, 16, 32, 48). %1 is forwarded unchanged as the second argument.
;
; Per result: load 8 words from [r2 + offset], shuffle them with m2 into
; adjacent pairs, pmaddwd against the 16-byte table entry at [r3 + k*16],
; add [pd_16] and shift right by 5. packusdw merges two results per register.
; The final result ([00]) needs no multiply: movhps copies 8 bytes from [r2].
;
; Expects : r2 = sample pointer, r3 = table pointer such that [r3 + k*16] is the
;           entry annotated [16 + k], m2 = pair shuffle mask (never written here).
; Clobbers: m0, m1, m3, m4, m5, m6.
; NOTE(review): pd_16 and TRANSPOSE_STORE_8x8 are defined outside this excerpt;
; pd_16 is presumably the per-dword rounding constant 16 - confirm.
19509 %macro MODE_14_22 1 | |
19510 movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0] | |
19511 pshufb m3, m2 ; [4 3 3 2 2 1 1 0] | |
19512 | |
19513 pmaddwd m4, m3, [r3 + 3 * 16] ; [19] | |
19514 paddd m4, [pd_16] | |
19515 psrld m4, 5 | |
19516 | |
19517 pmaddwd m1, m3, [r3 - 10 * 16] ; [06] | |
19518 paddd m1, [pd_16] | |
19519 psrld m1, 5 | |
19520 packusdw m4, m1 | |
19521 | |
19522 movu m3, [r2 + 22] | |
19523 pshufb m3, m2 | |
19524 | |
19525 pmaddwd m5, m3, [r3 + 9 * 16] ; [25] | |
19526 paddd m5, [pd_16] | |
19527 psrld m5, 5 | |
19528 | |
19529 pmaddwd m6, m3, [r3 - 4 * 16] ; [12] | |
19530 paddd m6, [pd_16] | |
19531 psrld m6, 5 | |
19532 packusdw m5, m6 | |
19533 | |
19534 movu m3, [r2 + 20] | |
19535 pshufb m3, m2 | |
19536 | |
19537 pmaddwd m6, m3, [r3 + 15 * 16] ; [31] | |
19538 paddd m6, [pd_16] | |
19539 psrld m6, 5 | |
19540 | |
19541 pmaddwd m1, m3, [r3 + 2 * 16] ; [18] | |
19542 paddd m1, [pd_16] | |
19543 psrld m1, 5 | |
19544 packusdw m6, m1 | |
19545 | |
19546 pmaddwd m1, m3, [r3 - 11 * 16] ; [05] | |
19547 paddd m1, [pd_16] | |
19548 psrld m1, 5 | |
19549 | |
19550 movu m3, [r2 + 18] | |
19551 pshufb m3, m2 | |
19552 | |
19553 pmaddwd m0, m3, [r3 + 8 * 16] ; [24] | |
19554 paddd m0, [pd_16] | |
19555 psrld m0, 5 | |
19556 packusdw m1, m0 | |
19557 | |
; batch 1 of 4 (results 1-8)
19558 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
19559 | |
; m3 still holds the shuffled samples from [r2 + 18]; it is reused for [11]
19560 pmaddwd m4, m3, [r3 - 5 * 16] ; [11] | |
19561 paddd m4, [pd_16] | |
19562 psrld m4, 5 | |
19563 | |
19564 movu m3, [r2 + 16] | |
19565 pshufb m3, m2 | |
19566 | |
19567 pmaddwd m1, m3, [r3 + 14 * 16] ; [30] | |
19568 paddd m1, [pd_16] | |
19569 psrld m1, 5 | |
19570 packusdw m4, m1 | |
19571 | |
19572 pmaddwd m5, m3, [r3 + 16] ; [17] | |
19573 paddd m5, [pd_16] | |
19574 psrld m5, 5 | |
19575 | |
19576 pmaddwd m6, m3, [r3 - 12 * 16] ; [04] | |
19577 paddd m6, [pd_16] | |
19578 psrld m6, 5 | |
19579 packusdw m5, m6 | |
19580 | |
19581 movu m3, [r2 + 14] | |
19582 pshufb m3, m2 | |
19583 | |
19584 pmaddwd m6, m3, [r3 + 7 * 16] ; [23] | |
19585 paddd m6, [pd_16] | |
19586 psrld m6, 5 | |
19587 | |
19588 pmaddwd m1, m3, [r3 - 6 * 16] ; [10] | |
19589 paddd m1, [pd_16] | |
19590 psrld m1, 5 | |
19591 packusdw m6, m1 | |
19592 | |
19593 movu m3, [r2 + 12] | |
19594 pshufb m3, m2 | |
19595 | |
19596 pmaddwd m1, m3, [r3 + 13 * 16] ; [29] | |
19597 paddd m1, [pd_16] | |
19598 psrld m1, 5 | |
19599 | |
19600 pmaddwd m0, m3, [r3] ; [16] | |
19601 paddd m0, [pd_16] | |
19602 psrld m0, 5 | |
19603 packusdw m1, m0 | |
19604 | |
; batch 2 of 4 (results 9-16)
19605 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
19606 | |
19607 pmaddwd m4, m3, [r3 - 13 * 16] ; [03] | |
19608 paddd m4, [pd_16] | |
19609 psrld m4, 5 | |
19610 | |
19611 movu m3, [r2 + 10] | |
19612 pshufb m3, m2 | |
19613 | |
19614 pmaddwd m1, m3, [r3 + 6 * 16] ; [22] | |
19615 paddd m1, [pd_16] | |
19616 psrld m1, 5 | |
19617 packusdw m4, m1 | |
19618 | |
19619 pmaddwd m5, m3, [r3 - 7 * 16] ; [09] | |
19620 paddd m5, [pd_16] | |
19621 psrld m5, 5 | |
19622 | |
19623 movu m3, [r2 + 8] | |
19624 pshufb m3, m2 | |
19625 | |
19626 pmaddwd m0, m3, [r3 + 12 * 16] ; [28] | |
19627 paddd m0, [pd_16] | |
19628 psrld m0, 5 | |
19629 packusdw m5, m0 | |
19630 | |
19631 pmaddwd m6, m3, [r3 - 16] ; [15] | |
19632 paddd m6, [pd_16] | |
19633 psrld m6, 5 | |
19634 | |
19635 pmaddwd m1, m3, [r3 - 14 * 16] ; [02] | |
19636 paddd m1, [pd_16] | |
19637 psrld m1, 5 | |
19638 packusdw m6, m1 | |
19639 | |
19640 movu m3, [r2 + 6] | |
19641 pshufb m3, m2 | |
19642 | |
19643 pmaddwd m1, m3, [r3 + 5 * 16] ; [21] | |
19644 paddd m1, [pd_16] | |
19645 psrld m1, 5 | |
19646 | |
19647 pmaddwd m0, m3, [r3 - 8 * 16] ; [08] | |
19648 paddd m0, [pd_16] | |
19649 psrld m0, 5 | |
19650 packusdw m1, m0 | |
19651 | |
; batch 3 of 4 (results 17-24)
19652 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
19653 | |
19654 movu m3, [r2 + 4] | |
19655 pshufb m3, m2 | |
19656 | |
19657 pmaddwd m4, m3, [r3 + 11 * 16] ; [27] | |
19658 paddd m4, [pd_16] | |
19659 psrld m4, 5 | |
19660 | |
19661 pmaddwd m1, m3, [r3 - 2 * 16] ; [14] | |
19662 paddd m1, [pd_16] | |
19663 psrld m1, 5 | |
19664 packusdw m4, m1 | |
19665 | |
19666 pmaddwd m5, m3, [r3 - 15 * 16] ; [01] | |
19667 paddd m5, [pd_16] | |
19668 psrld m5, 5 | |
19669 | |
19670 movu m3, [r2 + 2] | |
19671 pshufb m3, m2 | |
19672 | |
19673 pmaddwd m6, m3, [r3 + 4 * 16] ; [20] | |
19674 paddd m6, [pd_16] | |
19675 psrld m6, 5 | |
19676 packusdw m5, m6 | |
19677 | |
19678 pmaddwd m6, m3, [r3 - 9 * 16] ; [07] | |
19679 paddd m6, [pd_16] | |
19680 psrld m6, 5 | |
19681 | |
19682 movu m3, [r2] | |
19683 pshufb m3, m2 | |
19684 | |
19685 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
19686 paddd m1, [pd_16] | |
19687 psrld m1, 5 | |
19688 packusdw m6, m1 | |
19689 | |
19690 pmaddwd m1, m3, [r3 - 3 * 16] ; [13] | |
19691 paddd m1, [pd_16] | |
19692 psrld m1, 5 | |
19693 | |
; low half of m1 = packed [13] result, high half = 8 bytes copied straight from [r2]
19694 packusdw m1, m1 | |
19695 movhps m1, [r2] ; [00] | |
19696 | |
; batch 4 of 4 (results 25-32)
19697 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
19698 %endmacro | |
19699 | |
; MODE_15_21 %1
; Produces 32 two-tap weighted results (fraction annotations [15] ... [00]) and
; passes them to TRANSPOSE_STORE_8x8 in four batches of eight (first macro
; argument 0, 16, 32, 48). %1 is forwarded unchanged as the second argument.
;
; Per result: load 8 words from [r2 + offset] (offsets step down from 32 to 0),
; shuffle them with m2 into adjacent pairs, pmaddwd against the 16-byte table
; entry at [r3 + k*16], add [pd_16] and shift right by 5. packusdw merges two
; results per register. The final result ([00]) is a plain 8-byte copy from [r2].
;
; Expects : r2 = sample pointer, r3 = table pointer such that [r3 + k*16] is the
;           entry annotated [16 + k], m2 = pair shuffle mask (never written here).
; Clobbers: m0, m1, m3, m4, m5, m6.
; NOTE(review): pd_16 and TRANSPOSE_STORE_8x8 are defined outside this excerpt;
; pd_16 is presumably the per-dword rounding constant 16 - confirm.
19700 %macro MODE_15_21 1 | |
19701 movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0] | |
19702 pshufb m3, m2 ; [4 3 3 2 2 1 1 0] | |
19703 | |
19704 pmaddwd m4, m3, [r3 - 16] ; [15] | |
19705 paddd m4, [pd_16] | |
19706 psrld m4, 5 | |
19707 | |
19708 movu m3, [r2 + 30] | |
19709 pshufb m3, m2 | |
19710 | |
19711 pmaddwd m1, m3, [r3 + 14 * 16] ; [30] | |
19712 paddd m1, [pd_16] | |
19713 psrld m1, 5 | |
19714 packusdw m4, m1 | |
19715 | |
19716 pmaddwd m5, m3, [r3 - 3 * 16] ; [13] | |
19717 paddd m5, [pd_16] | |
19718 psrld m5, 5 | |
19719 | |
19720 movu m3, [r2 + 28] | |
19721 pshufb m3, m2 | |
19722 | |
19723 pmaddwd m6, m3, [r3 + 12 * 16] ; [28] | |
19724 paddd m6, [pd_16] | |
19725 psrld m6, 5 | |
19726 packusdw m5, m6 | |
19727 | |
19728 pmaddwd m6, m3, [r3 - 5 * 16] ; [11] | |
19729 paddd m6, [pd_16] | |
19730 psrld m6, 5 | |
19731 | |
19732 movu m3, [r2 + 26] | |
19733 pshufb m3, m2 | |
19734 | |
19735 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
19736 paddd m1, [pd_16] | |
19737 psrld m1, 5 | |
19738 packusdw m6, m1 | |
19739 | |
19740 pmaddwd m1, m3, [r3 - 7 * 16] ; [09] | |
19741 paddd m1, [pd_16] | |
19742 psrld m1, 5 | |
19743 | |
19744 movu m3, [r2 + 24] | |
19745 pshufb m3, m2 | |
19746 | |
19747 pmaddwd m0, m3, [r3 + 8 * 16] ; [24] | |
19748 paddd m0, [pd_16] | |
19749 psrld m0, 5 | |
19750 packusdw m1, m0 | |
19751 | |
; batch 1 of 4 (results 1-8)
19752 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
19753 | |
; m3 still holds the shuffled samples from [r2 + 24]; it is reused for [07]
19754 pmaddwd m4, m3, [r3 - 9 * 16] ; [07] | |
19755 paddd m4, [pd_16] | |
19756 psrld m4, 5 | |
19757 | |
19758 movu m3, [r2 + 22] | |
19759 pshufb m3, m2 | |
19760 | |
19761 pmaddwd m1, m3, [r3 + 6 * 16] ; [22] | |
19762 paddd m1, [pd_16] | |
19763 psrld m1, 5 | |
19764 packusdw m4, m1 | |
19765 | |
19766 pmaddwd m5, m3, [r3 - 11 * 16] ; [05] | |
19767 paddd m5, [pd_16] | |
19768 psrld m5, 5 | |
19769 | |
19770 movu m3, [r2 + 20] | |
19771 pshufb m3, m2 | |
19772 | |
19773 pmaddwd m6, m3, [r3 + 4 * 16] ; [20] | |
19774 paddd m6, [pd_16] | |
19775 psrld m6, 5 | |
19776 packusdw m5, m6 | |
19777 | |
19778 pmaddwd m6, m3, [r3 - 13 * 16] ; [03] | |
19779 paddd m6, [pd_16] | |
19780 psrld m6, 5 | |
19781 | |
19782 movu m3, [r2 + 18] | |
19783 pshufb m3, m2 | |
19784 | |
19785 pmaddwd m1, m3, [r3 + 2 * 16] ; [18] | |
19786 paddd m1, [pd_16] | |
19787 psrld m1, 5 | |
19788 packusdw m6, m1 | |
19789 | |
19790 pmaddwd m1, m3, [r3 - 15 * 16] ; [01] | |
19791 paddd m1, [pd_16] | |
19792 psrld m1, 5 | |
19793 | |
19794 movu m3, [r2 + 16] | |
19795 pshufb m3, m2 | |
19796 | |
19797 pmaddwd m0, m3, [r3] ; [16] | |
19798 paddd m0, [pd_16] | |
19799 psrld m0, 5 | |
19800 packusdw m1, m0 | |
19801 | |
; batch 2 of 4 (results 9-16)
19802 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
19803 | |
19804 movu m3, [r2 + 14] | |
19805 pshufb m3, m2 | |
19806 | |
19807 pmaddwd m4, m3, [r3 + 15 * 16] ; [31] | |
19808 paddd m4, [pd_16] | |
19809 psrld m4, 5 | |
19810 | |
19811 pmaddwd m1, m3, [r3 - 2 * 16] ; [14] | |
19812 paddd m1, [pd_16] | |
19813 psrld m1, 5 | |
19814 packusdw m4, m1 | |
19815 | |
19816 movu m3, [r2 + 12] | |
19817 pshufb m3, m2 | |
19818 | |
19819 pmaddwd m5, m3, [r3 + 13 * 16] ; [29] | |
19820 paddd m5, [pd_16] | |
19821 psrld m5, 5 | |
19822 | |
19823 pmaddwd m0, m3, [r3 - 4 * 16] ; [12] | |
19824 paddd m0, [pd_16] | |
19825 psrld m0, 5 | |
19826 packusdw m5, m0 | |
19827 | |
19828 movu m3, [r2 + 10] | |
19829 pshufb m3, m2 | |
19830 | |
19831 pmaddwd m6, m3, [r3 + 11 * 16] ; [27] | |
19832 paddd m6, [pd_16] | |
19833 psrld m6, 5 | |
19834 | |
19835 pmaddwd m1, m3, [r3 - 6 * 16] ; [10] | |
19836 paddd m1, [pd_16] | |
19837 psrld m1, 5 | |
19838 packusdw m6, m1 | |
19839 | |
19840 movu m3, [r2 + 8] | |
19841 pshufb m3, m2 | |
19842 | |
19843 pmaddwd m1, m3, [r3 + 9 * 16] ; [25] | |
19844 paddd m1, [pd_16] | |
19845 psrld m1, 5 | |
19846 | |
19847 pmaddwd m0, m3, [r3 - 8 * 16] ; [08] | |
19848 paddd m0, [pd_16] | |
19849 psrld m0, 5 | |
19850 packusdw m1, m0 | |
19851 | |
; batch 3 of 4 (results 17-24)
19852 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
19853 | |
19854 movu m3, [r2 + 6] | |
19855 pshufb m3, m2 | |
19856 | |
19857 pmaddwd m4, m3, [r3 + 7 * 16] ; [23] | |
19858 paddd m4, [pd_16] | |
19859 psrld m4, 5 | |
19860 | |
19861 pmaddwd m1, m3, [r3 - 10 * 16] ; [06] | |
19862 paddd m1, [pd_16] | |
19863 psrld m1, 5 | |
19864 packusdw m4, m1 | |
19865 | |
19866 movu m3, [r2 + 4] | |
19867 pshufb m3, m2 | |
19868 | |
19869 pmaddwd m5, m3, [r3 + 5 * 16] ; [21] | |
19870 paddd m5, [pd_16] | |
19871 psrld m5, 5 | |
19872 | |
19873 pmaddwd m6, m3, [r3 - 12 * 16] ; [04] | |
19874 paddd m6, [pd_16] | |
19875 psrld m6, 5 | |
19876 packusdw m5, m6 | |
19877 | |
19878 movu m3, [r2 + 2] | |
19879 pshufb m3, m2 | |
19880 | |
19881 pmaddwd m6, m3, [r3 + 3 * 16] ; [19] | |
19882 paddd m6, [pd_16] | |
19883 psrld m6, 5 | |
19884 | |
19885 pmaddwd m1, m3, [r3 - 14 * 16] ; [02] | |
19886 paddd m1, [pd_16] | |
19887 psrld m1, 5 | |
19888 packusdw m6, m1 | |
19889 | |
19890 movu m3, [r2] | |
19891 pshufb m3, m2 | |
19892 | |
19893 pmaddwd m1, m3, [r3 + 16] ; [17] | |
19894 paddd m1, [pd_16] | |
19895 psrld m1, 5 | |
19896 | |
; low half of m1 = packed [17] result, high half = 8 bytes copied straight from [r2]
19897 packusdw m1, m1 | |
19898 movhps m1, [r2] ; [00] | |
19899 | |
; batch 4 of 4 (results 25-32)
19900 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
19901 %endmacro | |
19902 | |
; MODE_16_20 %1
; Produces 32 two-tap weighted results (fraction annotations [11] ... [00]) and
; passes them to TRANSPOSE_STORE_8x8 in four batches of eight (first macro
; argument 0, 16, 32, 48). %1 is forwarded unchanged as the second argument.
;
; Per result: load 8 words from [r2 + offset] (offsets step down from 40 to 0),
; shuffle them with m2 into adjacent pairs, pmaddwd against the 16-byte table
; entry at [r3 + k*16], add [pd_16] and shift right by 5. packusdw merges two
; results per register. The final result ([00]) is a plain 8-byte copy from [r2].
;
; Expects : r2 = sample pointer, r3 = table pointer such that [r3 + k*16] is the
;           entry annotated [16 + k], m2 = pair shuffle mask (never written here).
; Clobbers: m0, m1, m3, m4, m5, m6.
; NOTE(review): pd_16 and TRANSPOSE_STORE_8x8 are defined outside this excerpt;
; pd_16 is presumably the per-dword rounding constant 16 - confirm.
19903 %macro MODE_16_20 1 | |
19904 movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0] | |
19905 pshufb m3, m2 ; [4 3 3 2 2 1 1 0] | |
19906 | |
19907 pmaddwd m4, m3, [r3 - 5 * 16] ; [11] | |
19908 paddd m4, [pd_16] | |
19909 psrld m4, 5 | |
19910 | |
19911 movu m3, [r2 + 38] | |
19912 pshufb m3, m2 | |
19913 | |
19914 pmaddwd m1, m3, [r3 + 6 * 16] ; [22] | |
19915 paddd m1, [pd_16] | |
19916 psrld m1, 5 | |
19917 packusdw m4, m1 | |
19918 | |
19919 pmaddwd m5, m3, [r3 - 15 * 16] ; [01] | |
19920 paddd m5, [pd_16] | |
19921 psrld m5, 5 | |
19922 | |
19923 movu m3, [r2 + 36] | |
19924 pshufb m3, m2 | |
19925 | |
19926 pmaddwd m6, m3, [r3 - 4 * 16] ; [12] | |
19927 paddd m6, [pd_16] | |
19928 psrld m6, 5 | |
19929 packusdw m5, m6 | |
19930 | |
19931 movu m3, [r2 + 34] | |
19932 pshufb m3, m2 | |
19933 | |
19934 pmaddwd m6, m3, [r3 + 7 * 16] ; [23] | |
19935 paddd m6, [pd_16] | |
19936 psrld m6, 5 | |
19937 | |
19938 pmaddwd m1, m3, [r3 - 14 * 16] ; [02] | |
19939 paddd m1, [pd_16] | |
19940 psrld m1, 5 | |
19941 packusdw m6, m1 | |
19942 | |
19943 movu m3, [r2 + 32] | |
19944 pshufb m3, m2 | |
19945 | |
19946 pmaddwd m1, m3, [r3 - 3 * 16] ; [13] | |
19947 paddd m1, [pd_16] | |
19948 psrld m1, 5 | |
19949 | |
19950 movu m3, [r2 + 30] | |
19951 pshufb m3, m2 | |
19952 | |
19953 pmaddwd m0, m3, [r3 + 8 * 16] ; [24] | |
19954 paddd m0, [pd_16] | |
19955 psrld m0, 5 | |
19956 packusdw m1, m0 | |
19957 | |
; batch 1 of 4 (results 1-8)
19958 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
19959 | |
; m3 still holds the shuffled samples from [r2 + 30]; it is reused for [03]
19960 pmaddwd m4, m3, [r3 - 13 * 16] ; [03] | |
19961 paddd m4, [pd_16] | |
19962 psrld m4, 5 | |
19963 | |
19964 movu m3, [r2 + 28] | |
19965 pshufb m3, m2 | |
19966 | |
19967 pmaddwd m1, m3, [r3 - 2 * 16] ; [14] | |
19968 paddd m1, [pd_16] | |
19969 psrld m1, 5 | |
19970 packusdw m4, m1 | |
19971 | |
19972 movu m3, [r2 + 26] | |
19973 pshufb m3, m2 | |
19974 | |
19975 pmaddwd m5, m3, [r3 + 9 * 16] ; [25] | |
19976 paddd m5, [pd_16] | |
19977 psrld m5, 5 | |
19978 | |
19979 pmaddwd m6, m3, [r3 - 12 * 16] ; [04] | |
19980 paddd m6, [pd_16] | |
19981 psrld m6, 5 | |
19982 packusdw m5, m6 | |
19983 | |
19984 movu m3, [r2 + 24] | |
19985 pshufb m3, m2 | |
19986 | |
19987 pmaddwd m6, m3, [r3 - 16] ; [15] | |
19988 paddd m6, [pd_16] | |
19989 psrld m6, 5 | |
19990 | |
19991 movu m3, [r2 + 22] | |
19992 pshufb m3, m2 | |
19993 | |
19994 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
19995 paddd m1, [pd_16] | |
19996 psrld m1, 5 | |
19997 packusdw m6, m1 | |
19998 | |
19999 pmaddwd m1, m3, [r3 - 11 * 16] ; [05] | |
20000 paddd m1, [pd_16] | |
20001 psrld m1, 5 | |
20002 | |
20003 movu m3, [r2 + 20] | |
20004 pshufb m3, m2 | |
20005 | |
20006 pmaddwd m0, m3, [r3] ; [16] | |
20007 paddd m0, [pd_16] | |
20008 psrld m0, 5 | |
20009 packusdw m1, m0 | |
20010 | |
; batch 2 of 4 (results 9-16)
20011 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
20012 | |
20013 movu m3, [r2 + 18] | |
20014 pshufb m3, m2 | |
20015 | |
20016 pmaddwd m4, m3, [r3 + 11 * 16] ; [27] | |
20017 paddd m4, [pd_16] | |
20018 psrld m4, 5 | |
20019 | |
20020 pmaddwd m1, m3, [r3 - 10 * 16] ; [06] | |
20021 paddd m1, [pd_16] | |
20022 psrld m1, 5 | |
20023 packusdw m4, m1 | |
20024 | |
20025 movu m3, [r2 + 16] | |
20026 pshufb m3, m2 | |
20027 | |
20028 pmaddwd m5, m3, [r3 + 16] ; [17] | |
20029 paddd m5, [pd_16] | |
20030 psrld m5, 5 | |
20031 | |
20032 movu m3, [r2 + 14] | |
20033 pshufb m3, m2 | |
20034 | |
20035 pmaddwd m0, m3, [r3 + 12 * 16] ; [28] | |
20036 paddd m0, [pd_16] | |
20037 psrld m0, 5 | |
20038 packusdw m5, m0 | |
20039 | |
20040 pmaddwd m6, m3, [r3 - 9 * 16] ; [07] | |
20041 paddd m6, [pd_16] | |
20042 psrld m6, 5 | |
20043 | |
20044 movu m3, [r2 + 12] | |
20045 pshufb m3, m2 | |
20046 | |
20047 pmaddwd m1, m3, [r3 + 2 * 16] ; [18] | |
20048 paddd m1, [pd_16] | |
20049 psrld m1, 5 | |
20050 packusdw m6, m1 | |
20051 | |
20052 movu m3, [r2 + 10] | |
20053 pshufb m3, m2 | |
20054 | |
20055 pmaddwd m1, m3, [r3 + 13 * 16] ; [29] | |
20056 paddd m1, [pd_16] | |
20057 psrld m1, 5 | |
20058 | |
20059 pmaddwd m0, m3, [r3 - 8 * 16] ; [08] | |
20060 paddd m0, [pd_16] | |
20061 psrld m0, 5 | |
20062 packusdw m1, m0 | |
20063 | |
; batch 3 of 4 (results 17-24)
20064 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
20065 | |
20066 movu m3, [r2 + 8] | |
20067 pshufb m3, m2 | |
20068 | |
20069 pmaddwd m4, m3, [r3 + 3 * 16] ; [19] | |
20070 paddd m4, [pd_16] | |
20071 psrld m4, 5 | |
20072 | |
20073 movu m3, [r2 + 6] | |
20074 pshufb m3, m2 | |
20075 | |
20076 pmaddwd m1, m3, [r3 + 14 * 16] ; [30] | |
20077 paddd m1, [pd_16] | |
20078 psrld m1, 5 | |
20079 packusdw m4, m1 | |
20080 | |
20081 pmaddwd m5, m3, [r3 - 7 * 16] ; [09] | |
20082 paddd m5, [pd_16] | |
20083 psrld m5, 5 | |
20084 | |
20085 movu m3, [r2 + 4] | |
20086 pshufb m3, m2 | |
20087 | |
20088 pmaddwd m6, m3, [r3 + 4 * 16] ; [20] | |
20089 paddd m6, [pd_16] | |
20090 psrld m6, 5 | |
20091 packusdw m5, m6 | |
20092 | |
20093 movu m3, [r2 + 2] | |
20094 pshufb m3, m2 | |
20095 | |
20096 pmaddwd m6, m3, [r3 + 15 * 16] ; [31] | |
20097 paddd m6, [pd_16] | |
20098 psrld m6, 5 | |
20099 | |
20100 pmaddwd m1, m3, [r3 - 6 * 16] ; [10] | |
20101 paddd m1, [pd_16] | |
20102 psrld m1, 5 | |
20103 packusdw m6, m1 | |
20104 | |
20105 movu m3, [r2] | |
20106 pshufb m3, m2 | |
20107 | |
20108 pmaddwd m1, m3, [r3 + 5 * 16] ; [21] | |
20109 paddd m1, [pd_16] | |
20110 psrld m1, 5 | |
20111 | |
; low half of m1 = packed [21] result, high half = 8 bytes copied straight from [r2]
20112 packusdw m1, m1 | |
20113 movhps m1, [r2] ; [00] | |
20114 | |
; batch 4 of 4 (results 25-32)
20115 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
20116 %endmacro | |
20117 | |
; MODE_17_19 %1
; Produces 32 two-tap weighted results and passes them to TRANSPOSE_STORE_8x8 in
; four batches of eight (first macro argument 0, 16, 32, 48). %1 is forwarded
; unchanged as the second argument.
;
; The fraction annotations repeat after 16 results ([06] [12] ... [26] [00]), so
; the sequence appears twice and there are two multiply-free [00] results: an
; 8-byte copy from [r2 + 26] closing batch 2 and one from [r2] closing batch 4.
;
; Per weighted result: load 8 words from [r2 + offset] (offsets step down from
; 50 to 0), shuffle them with m2 into adjacent pairs, pmaddwd against the
; 16-byte table entry at [r3 + k*16], add [pd_16] and shift right by 5.
; packusdw merges two results per register.
;
; Expects : r2 = sample pointer, r3 = table pointer such that [r3 + k*16] is the
;           entry annotated [16 + k], m2 = pair shuffle mask (never written here).
; Clobbers: m0, m1, m3, m4, m5, m6.
; NOTE(review): pd_16 and TRANSPOSE_STORE_8x8 are defined outside this excerpt;
; pd_16 is presumably the per-dword rounding constant 16 - confirm.
20118 %macro MODE_17_19 1 | |
20119 movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0] | |
20120 pshufb m3, m2 ; [4 3 3 2 2 1 1 0] | |
20121 | |
20122 pmaddwd m4, m3, [r3 - 10 * 16] ; [06] | |
20123 paddd m4, [pd_16] | |
20124 psrld m4, 5 | |
20125 | |
20126 movu m3, [r2 + 48] | |
20127 pshufb m3, m2 | |
20128 | |
20129 pmaddwd m1, m3, [r3 - 4 * 16] ; [12] | |
20130 paddd m1, [pd_16] | |
20131 psrld m1, 5 | |
20132 packusdw m4, m1 | |
20133 | |
20134 movu m3, [r2 + 46] | |
20135 pshufb m3, m2 | |
20136 | |
20137 pmaddwd m5, m3, [r3 + 2 * 16] ; [18] | |
20138 paddd m5, [pd_16] | |
20139 psrld m5, 5 | |
20140 | |
20141 movu m3, [r2 + 44] | |
20142 pshufb m3, m2 | |
20143 | |
20144 pmaddwd m6, m3, [r3 + 8 * 16] ; [24] | |
20145 paddd m6, [pd_16] | |
20146 psrld m6, 5 | |
20147 packusdw m5, m6 | |
20148 | |
20149 movu m3, [r2 + 42] | |
20150 pshufb m3, m2 | |
20151 | |
20152 pmaddwd m6, m3, [r3 + 14 * 16] ; [30] | |
20153 paddd m6, [pd_16] | |
20154 psrld m6, 5 | |
20155 | |
20156 pmaddwd m1, m3, [r3 - 12 * 16] ; [04] | |
20157 paddd m1, [pd_16] | |
20158 psrld m1, 5 | |
20159 packusdw m6, m1 | |
20160 | |
20161 movu m3, [r2 + 40] | |
20162 pshufb m3, m2 | |
20163 | |
20164 pmaddwd m1, m3, [r3 - 6 * 16] ; [10] | |
20165 paddd m1, [pd_16] | |
20166 psrld m1, 5 | |
20167 | |
20168 movu m3, [r2 + 38] | |
20169 pshufb m3, m2 | |
20170 | |
20171 pmaddwd m0, m3, [r3] ; [16] | |
20172 paddd m0, [pd_16] | |
20173 psrld m0, 5 | |
20174 packusdw m1, m0 | |
20175 | |
; batch 1 of 4 (results 1-8)
20176 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
20177 | |
20178 movu m3, [r2 + 36] | |
20179 pshufb m3, m2 | |
20180 | |
20181 pmaddwd m4, m3, [r3 + 6 * 16] ; [22] | |
20182 paddd m4, [pd_16] | |
20183 psrld m4, 5 | |
20184 | |
20185 movu m3, [r2 + 34] | |
20186 pshufb m3, m2 | |
20187 | |
20188 pmaddwd m1, m3, [r3 + 12 * 16] ; [28] | |
20189 paddd m1, [pd_16] | |
20190 psrld m1, 5 | |
20191 packusdw m4, m1 | |
20192 | |
20193 pmaddwd m5, m3, [r3 - 14 * 16] ; [02] | |
20194 paddd m5, [pd_16] | |
20195 psrld m5, 5 | |
20196 | |
20197 movu m3, [r2 + 32] | |
20198 pshufb m3, m2 | |
20199 | |
20200 pmaddwd m6, m3, [r3 - 8 * 16] ; [08] | |
20201 paddd m6, [pd_16] | |
20202 psrld m6, 5 | |
20203 packusdw m5, m6 | |
20204 | |
20205 movu m3, [r2 + 30] | |
20206 pshufb m3, m2 | |
20207 | |
20208 pmaddwd m6, m3, [r3 - 2 * 16] ; [14] | |
20209 paddd m6, [pd_16] | |
20210 psrld m6, 5 | |
20211 | |
20212 movu m3, [r2 + 28] | |
20213 pshufb m3, m2 | |
20214 | |
20215 pmaddwd m1, m3, [r3 + 4 * 16] ; [20] | |
20216 paddd m1, [pd_16] | |
20217 psrld m1, 5 | |
20218 packusdw m6, m1 | |
20219 | |
20220 movu m3, [r2 + 26] | |
20221 pshufb m3, m2 | |
20222 | |
20223 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
20224 paddd m1, [pd_16] | |
20225 psrld m1, 5 | |
20226 | |
; first [00] result: low half of m1 = packed [26], high half = 8 bytes from [r2 + 26]
20227 packusdw m1, m1 | |
20228 movhps m1, [r2 + 26] ; [00] | |
20229 | |
; batch 2 of 4 (results 9-16)
20230 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 | |
20231 | |
; the fraction sequence restarts at [06] from here
20232 movu m3, [r2 + 24] | |
20233 pshufb m3, m2 | |
20234 | |
20235 pmaddwd m4, m3, [r3 - 10 * 16] ; [06] | |
20236 paddd m4, [pd_16] | |
20237 psrld m4, 5 | |
20238 | |
20239 movu m3, [r2 + 22] | |
20240 pshufb m3, m2 | |
20241 | |
20242 pmaddwd m1, m3, [r3 - 4 * 16] ; [12] | |
20243 paddd m1, [pd_16] | |
20244 psrld m1, 5 | |
20245 packusdw m4, m1 | |
20246 | |
20247 movu m3, [r2 + 20] | |
20248 pshufb m3, m2 | |
20249 | |
20250 pmaddwd m5, m3, [r3 + 2 * 16] ; [18] | |
20251 paddd m5, [pd_16] | |
20252 psrld m5, 5 | |
20253 | |
20254 movu m3, [r2 + 18] | |
20255 pshufb m3, m2 | |
20256 | |
20257 pmaddwd m0, m3, [r3 + 8 * 16] ; [24] | |
20258 paddd m0, [pd_16] | |
20259 psrld m0, 5 | |
20260 packusdw m5, m0 | |
20261 | |
20262 movu m3, [r2 + 16] | |
20263 pshufb m3, m2 | |
20264 | |
20265 pmaddwd m6, m3, [r3 + 14 * 16] ; [30] | |
20266 paddd m6, [pd_16] | |
20267 psrld m6, 5 | |
20268 | |
20269 pmaddwd m1, m3, [r3 - 12 * 16] ; [04] | |
20270 paddd m1, [pd_16] | |
20271 psrld m1, 5 | |
20272 packusdw m6, m1 | |
20273 | |
20274 movu m3, [r2 + 14] | |
20275 pshufb m3, m2 | |
20276 | |
20277 pmaddwd m1, m3, [r3 - 6 * 16] ; [10] | |
20278 paddd m1, [pd_16] | |
20279 psrld m1, 5 | |
20280 | |
20281 movu m3, [r2 + 12] | |
20282 pshufb m3, m2 | |
20283 | |
20284 pmaddwd m0, m3, [r3] ; [16] | |
20285 paddd m0, [pd_16] | |
20286 psrld m0, 5 | |
20287 packusdw m1, m0 | |
20288 | |
; batch 3 of 4 (results 17-24)
20289 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 | |
20290 | |
20291 movu m3, [r2 + 10] | |
20292 pshufb m3, m2 | |
20293 | |
20294 pmaddwd m4, m3, [r3 + 6 * 16] ; [22] | |
20295 paddd m4, [pd_16] | |
20296 psrld m4, 5 | |
20297 | |
20298 movu m3, [r2 + 8] | |
20299 pshufb m3, m2 | |
20300 | |
20301 pmaddwd m1, m3, [r3 + 12 * 16] ; [28] | |
20302 paddd m1, [pd_16] | |
20303 psrld m1, 5 | |
20304 packusdw m4, m1 | |
20305 | |
20306 pmaddwd m5, m3, [r3 - 14 * 16] ; [02] | |
20307 paddd m5, [pd_16] | |
20308 psrld m5, 5 | |
20309 | |
20310 movu m3, [r2 + 6] | |
20311 pshufb m3, m2 | |
20312 | |
20313 pmaddwd m6, m3, [r3 - 8 * 16] ; [08] | |
20314 paddd m6, [pd_16] | |
20315 psrld m6, 5 | |
20316 packusdw m5, m6 | |
20317 | |
20318 movu m3, [r2 + 4] | |
20319 pshufb m3, m2 | |
20320 | |
20321 pmaddwd m6, m3, [r3 - 2 * 16] ; [14] | |
20322 paddd m6, [pd_16] | |
20323 psrld m6, 5 | |
20324 | |
20325 movu m3, [r2 + 2] | |
20326 pshufb m3, m2 | |
20327 | |
20328 pmaddwd m1, m3, [r3 + 4 * 16] ; [20] | |
20329 paddd m1, [pd_16] | |
20330 psrld m1, 5 | |
20331 packusdw m6, m1 | |
20332 | |
20333 movu m3, [r2] | |
20334 pshufb m3, m2 | |
20335 | |
20336 pmaddwd m1, m3, [r3 + 10 * 16] ; [26] | |
20337 paddd m1, [pd_16] | |
20338 psrld m1, 5 | |
20339 | |
; second [00] result: low half of m1 = packed [26], high half = 8 bytes from [r2]
20340 packusdw m1, m1 | |
20341 movhps m1, [r2] ; [00] | |
20342 | |
; batch 4 of 4 (results 25-32)
20343 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 | |
20344 %endmacro | |
20345 | |
20346 ;------------------------------------------------------------------------------------------ | |
20347 ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
20348 ;------------------------------------------------------------------------------------------ | |
; intra_pred_ang32_2: shared entry for dirMode 2 and 34 (SSSE3).
; cglobal loads three arguments: r0 = dst, r1 = dstStride, r2 = src.
; dirMode is read from its argument slot (r3m) without being loaded.
; The body of MODE_2_34 is defined outside this excerpt.
20349 INIT_XMM ssse3 | |
20350 cglobal intra_pred_ang32_2, 3,6,6 | |
; r4 = original src; r2 = src + 128 bytes unless dirMode == 34, in which case
; cmove restores the original pointer
20351 lea r4, [r2] | |
20352 add r2, 128 | |
20353 cmp r3m, byte 34 | |
20354 cmove r2, r4 | |
20355 | |
; r1 is doubled (presumably sample stride -> byte stride for 16-bit pixels);
; r3 = 2 * r1, r4 = 3 * r1, r5 = iteration count (2)
20356 add r1, r1 | |
20357 lea r3, [r1 * 2] | |
20358 lea r4, [r1 * 3] | |
20359 mov r5, 2 | |
20360 | |
; two passes of MODE_2_34, advancing the source pointer by 32 bytes per pass
20361 .loop: | |
20362 MODE_2_34 | |
20363 add r2, 32 | |
20364 dec r5 | |
20365 jnz .loop | |
20366 RET | |
20367 | |
; intra_pred_ang32_3 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Runs MODE_3_33 (defined outside this excerpt) eight times.
20368 INIT_XMM sse4 | |
20369 cglobal intra_pred_ang32_3, 3,6,8 | |
; r2 = src + 128 bytes; r3 = ang_table + 16 entries of 16 bytes, so
; [r3 + k*16] addresses table entry 16 + k; r4d = iteration count (8)
20370 add r2, 128 | |
20371 lea r3, [ang_table + 16 * 16] | |
20372 mov r4d, 8 | |
; r1 is doubled (presumably sample stride -> byte stride); r5 = 3 * r1
20373 add r1, r1 | |
20374 lea r5, [r1 * 3] | |
20375 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20376 .loop: | |
20377 MODE_3_33 1 | |
20378 lea r0, [r0 + r1 * 4 ] | |
20379 add r2, 8 | |
20380 dec r4 | |
20381 jnz .loop | |
20382 RET | |
20383 | |
; intra_pred_ang32_4 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Runs MODE_4_32 (defined outside this excerpt) eight times.
20384 INIT_XMM sse4 | |
20385 cglobal intra_pred_ang32_4, 3,6,8 | |
; r2 = src + 128 bytes; r3 = ang_table + 16 entries of 16 bytes, so
; [r3 + k*16] addresses table entry 16 + k; r4d = iteration count (8)
20386 add r2, 128 | |
20387 lea r3, [ang_table + 16 * 16] | |
20388 mov r4d, 8 | |
; r1 is doubled (presumably sample stride -> byte stride); r5 = 3 * r1
20389 add r1, r1 | |
20390 lea r5, [r1 * 3] | |
20391 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20392 .loop: | |
20393 MODE_4_32 1 | |
20394 lea r0, [r0 + r1 * 4 ] | |
20395 add r2, 8 | |
20396 dec r4 | |
20397 jnz .loop | |
20398 RET | |
20399 | |
; intra_pred_ang32_5 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Runs MODE_5_31 (defined outside this excerpt) eight times.
20400 INIT_XMM sse4 | |
20401 cglobal intra_pred_ang32_5, 3,6,8 | |
; r2 = src + 128 bytes; r3 = ang_table + 16 entries of 16 bytes, so
; [r3 + k*16] addresses table entry 16 + k; r4d = iteration count (8)
20402 add r2, 128 | |
20403 lea r3, [ang_table + 16 * 16] | |
20404 mov r4d, 8 | |
; r1 is doubled (presumably sample stride -> byte stride); r5 = 3 * r1
20405 add r1, r1 | |
20406 lea r5, [r1 * 3] | |
20407 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20408 .loop: | |
20409 MODE_5_31 1 | |
20410 lea r0, [r0 + r1 * 4 ] | |
20411 add r2, 8 | |
20412 dec r4 | |
20413 jnz .loop | |
20414 RET | |
20415 | |
; intra_pred_ang32_6 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Runs MODE_6_30 (defined outside this excerpt) eight times.
20416 INIT_XMM sse4 | |
20417 cglobal intra_pred_ang32_6, 3,6,8 | |
; r2 = src + 128 bytes; r3 = ang_table + 16 entries of 16 bytes, so
; [r3 + k*16] addresses table entry 16 + k; r4d = iteration count (8)
20418 add r2, 128 | |
20419 lea r3, [ang_table + 16 * 16] | |
20420 mov r4d, 8 | |
; r1 is doubled (presumably sample stride -> byte stride); r5 = 3 * r1
20421 add r1, r1 | |
20422 lea r5, [r1 * 3] | |
20423 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20424 .loop: | |
20425 MODE_6_30 1 | |
20426 lea r0, [r0 + r1 * 4 ] | |
20427 add r2, 8 | |
20428 dec r4 | |
20429 jnz .loop | |
20430 RET | |
20431 | |
; intra_pred_ang32_7 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Runs MODE_7_29 (defined outside this excerpt) eight times.
20432 INIT_XMM sse4 | |
20433 cglobal intra_pred_ang32_7, 3,6,8 | |
; r2 = src + 128 bytes; r3 = ang_table + 16 entries of 16 bytes, so
; [r3 + k*16] addresses table entry 16 + k; r4d = iteration count (8)
20434 add r2, 128 | |
20435 lea r3, [ang_table + 16 * 16] | |
20436 mov r4d, 8 | |
; r1 is doubled (presumably sample stride -> byte stride); r5 = 3 * r1
20437 add r1, r1 | |
20438 lea r5, [r1 * 3] | |
20439 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20440 .loop: | |
20441 MODE_7_29 1 | |
20442 lea r0, [r0 + r1 * 4 ] | |
20443 add r2, 8 | |
20444 dec r4 | |
20445 jnz .loop | |
20446 RET | |
20447 | |
; intra_pred_ang32_8 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Runs MODE_8_28 (defined outside this excerpt) eight times.
20448 INIT_XMM sse4 | |
20449 cglobal intra_pred_ang32_8, 3,6,8 | |
; r2 = src + 128 bytes; r3 = ang_table + 16 entries of 16 bytes, so
; [r3 + k*16] addresses table entry 16 + k; r4d = iteration count (8)
20450 add r2, 128 | |
20451 lea r3, [ang_table + 16 * 16] | |
20452 mov r4d, 8 | |
; r1 is doubled (presumably sample stride -> byte stride); r5 = 3 * r1
20453 add r1, r1 | |
20454 lea r5, [r1 * 3] | |
20455 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20456 .loop: | |
20457 MODE_8_28 1 | |
20458 lea r0, [r0 + r1 * 4 ] | |
20459 add r2, 8 | |
20460 dec r4 | |
20461 jnz .loop | |
20462 RET | |
20463 | |
; intra_pred_ang32_9 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Runs MODE_9_27 (defined outside this excerpt) eight times.
20464 INIT_XMM sse4 | |
20465 cglobal intra_pred_ang32_9, 3,6,8 | |
; r2 = src + 128 bytes; r3 = ang_table + 16 entries of 16 bytes, so
; [r3 + k*16] addresses table entry 16 + k; r4d = iteration count (8)
20466 add r2, 128 | |
20467 lea r3, [ang_table + 16 * 16] | |
20468 mov r4d, 8 | |
; r1 is doubled (presumably sample stride -> byte stride); r5 = 3 * r1
20469 add r1, r1 | |
20470 lea r5, [r1 * 3] | |
20471 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20472 .loop: | |
20473 MODE_9_27 1 | |
20474 lea r0, [r0 + r1 * 4 ] | |
20475 add r2, 8 | |
20476 dec r4 | |
20477 jnz .loop | |
20478 RET | |
20479 | |
; intra_pred_ang32_10 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Each loop iteration loads 8 words from [r2 + 2] and writes 8 output rows.
; Row i comes from the vector shifted right by 2*i bytes (palignr) and then
; shuffled with m7; the same register is stored four times to cover 64 bytes.
; Four iterations give 32 rows.
; NOTE(review): c_mode32_10_0 is defined outside this excerpt; presumably it
; replicates word 0 into every lane, so each row is one repeated sample - confirm.
20480 INIT_XMM sse4 | |
20481 cglobal intra_pred_ang32_10, 3,7,8 | |
; r2 = src + 128 bytes; r6d = iteration count (4)
20482 add r2, 128 | |
20483 mov r6d, 4 | |
; r1 is doubled (presumably sample stride -> byte stride);
; r4 = 2 * r1, r5 = 3 * r1, r3 = 4 * r1 (row-group advance)
20484 add r1, r1 | |
20485 lea r5, [r1 * 3] | |
20486 lea r4, [r1 * 2] | |
20487 lea r3, [r1 * 4] | |
20488 mova m7, [c_mode32_10_0] | |
20489 | |
20490 .loop: | |
; row 0: word 0 of the loaded vector
20491 movu m0, [r2 + 2] | |
20492 pshufb m1, m0, m7 | |
20493 movu [r0], m1 | |
20494 movu [r0 + 16], m1 | |
20495 movu [r0 + 32], m1 | |
20496 movu [r0 + 48], m1 | |
20497 | |
; rows 1-3: palignr moves word i of m0 into lane 0 before the shuffle
20498 palignr m1, m0, 2 | |
20499 pshufb m1, m7 | |
20500 movu [r0 + r1], m1 | |
20501 movu [r0 + r1 + 16], m1 | |
20502 movu [r0 + r1 + 32], m1 | |
20503 movu [r0 + r1 + 48], m1 | |
20504 | |
20505 palignr m1, m0, 4 | |
20506 pshufb m1, m7 | |
20507 movu [r0 + r4], m1 | |
20508 movu [r0 + r4 + 16], m1 | |
20509 movu [r0 + r4 + 32], m1 | |
20510 movu [r0 + r4 + 48], m1 | |
20511 | |
20512 palignr m1, m0, 6 | |
20513 pshufb m1, m7 | |
20514 movu [r0 + r5], m1 | |
20515 movu [r0 + r5 + 16], m1 | |
20516 movu [r0 + r5 + 32], m1 | |
20517 movu [r0 + r5 + 48], m1 | |
20518 | |
; advance dst by four rows, then emit rows 4-7 the same way
20519 add r0, r3 | |
20520 | |
20521 palignr m1, m0, 8 | |
20522 pshufb m1, m7 | |
20523 movu [r0], m1 | |
20524 movu [r0 + 16], m1 | |
20525 movu [r0 + 32], m1 | |
20526 movu [r0 + 48], m1 | |
20527 | |
20528 palignr m1, m0, 10 | |
20529 pshufb m1, m7 | |
20530 movu [r0 + r1], m1 | |
20531 movu [r0 + r1 + 16], m1 | |
20532 movu [r0 + r1 + 32], m1 | |
20533 movu [r0 + r1 + 48], m1 | |
20534 | |
20535 palignr m1, m0, 12 | |
20536 pshufb m1, m7 | |
20537 movu [r0 + r4], m1 | |
20538 movu [r0 + r4 + 16], m1 | |
20539 movu [r0 + r4 + 32], m1 | |
20540 movu [r0 + r4 + 48], m1 | |
20541 | |
20542 palignr m1, m0, 14 | |
20543 pshufb m1, m7 | |
20544 movu [r0 + r5], m1 | |
20545 movu [r0 + r5 + 16], m1 | |
20546 movu [r0 + r5 + 32], m1 | |
20547 movu [r0 + r5 + 48], m1 | |
20548 | |
; next group: dst += 4 rows, source += 16 bytes (the 8 words just consumed)
20549 add r0, r3 | |
20550 add r2, 16 | |
20551 dec r6d | |
20552 jnz .loop | |
20553 RET | |
20554 | |
; intra_pred_ang32_11 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Builds a 68-byte (4*mmsize + 4) working copy of the reference samples on the
; stack, then runs MODE_11_25 (defined outside this excerpt) eight times on it.
;
; Stack layout after setup:
;   [rsp +  0]            word copied from [src + 32]
;   [rsp +  2 .. rsp + 65] 64 bytes copied from [src + 128], with word 0
;                          replaced by the word at [src]
;   [rsp + 66]            word copied from [src + 128 + 64]
20555 INIT_XMM sse4 | |
20556 cglobal intra_pred_ang32_11, 3,6,7,0-(4*mmsize+4) | |
; r3 keeps the incoming src pointer; r2 moves 128 bytes into the buffer
20557 mov r3, r2mp | |
20558 add r2, 128 | |
20559 movu m0, [r2 + 0*mmsize] | |
20560 pinsrw m0, [r3], 0 | |
20561 movu m1, [r2 + 1*mmsize] | |
20562 movu m2, [r2 + 2*mmsize] | |
20563 movu m3, [r2 + 3*mmsize] | |
20564 movu [rsp + 0*mmsize + 2], m0 | |
20565 movu [rsp + 1*mmsize + 2], m1 | |
20566 movu [rsp + 2*mmsize + 2], m2 | |
20567 movu [rsp + 3*mmsize + 2], m3 | |
; one extra word on each side of the copied run
20568 mov r4w, [r3+32] | |
20569 mov [rsp], r4w | |
20570 mov r4w, [r2+64] | |
20571 mov [rsp+66], r4w | |
20572 | |
; r3 = ang_table + 16 entries (so [r3 + k*16] is entry 16 + k), r4d = 8
; iterations, r2 = start of the stack copy; r1 is doubled, r5 = 3 * r1
20573 lea r3, [ang_table + 16 * 16] | |
20574 mov r4d, 8 | |
20575 mov r2, rsp | |
20576 add r1, r1 | |
20577 lea r5, [r1 * 3] | |
20578 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20579 .loop: | |
20580 MODE_11_25 1 | |
20581 lea r0, [r0 + r1 * 4 ] | |
20582 add r2, 8 | |
20583 dec r4 | |
20584 jnz .loop | |
20585 RET | |
20586 | |
; intra_pred_ang32_12 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Builds a 74-byte (4*mmsize + 10) working copy of the reference samples on the
; stack, then runs MODE_12_24 (defined outside this excerpt) eight times on it.
;
; Stack layout after setup:
;   [rsp + 0], [rsp + 2], [rsp + 4], [rsp + 6]
;                          words copied from [src + 52], [src + 38],
;                          [src + 26], [src + 12] respectively
;   [rsp +  8 .. rsp + 71] 64 bytes copied from [src + 128], with word 0
;                          replaced by the word at [src]
;   [rsp + 72]            word copied from [src + 128 + 64]
20587 INIT_XMM sse4 | |
20588 cglobal intra_pred_ang32_12, 3,6,7,0-(4*mmsize+10) | |
; r3 keeps the incoming src pointer; r2 moves 128 bytes into the buffer
20589 mov r3, r2mp | |
20590 add r2, 128 | |
20591 movu m0, [r2 + 0*mmsize] | |
20592 pinsrw m0, [r3], 0 | |
20593 movu m1, [r2 + 1*mmsize] | |
20594 movu m2, [r2 + 2*mmsize] | |
20595 movu m3, [r2 + 3*mmsize] | |
20596 movu [rsp + 0*mmsize + 8], m0 | |
20597 movu [rsp + 1*mmsize + 8], m1 | |
20598 movu [rsp + 2*mmsize + 8], m2 | |
20599 movu [rsp + 3*mmsize + 8], m3 | |
20600 | |
; trailing word, then four individually selected words placed in front
20601 mov r4w, [r2+64] | |
20602 mov [rsp+72], r4w | |
20603 mov r4w, [r3+12] | |
20604 mov [rsp+6], r4w | |
20605 mov r4w, [r3+26] | |
20606 mov [rsp+4], r4w | |
20607 mov r4w, [r3+38] | |
20608 mov [rsp+2], r4w | |
20609 mov r4w, [r3+52] | |
20610 mov [rsp], r4w | |
20611 | |
; r3 = ang_table + 16 entries (so [r3 + k*16] is entry 16 + k), r4d = 8
; iterations, r2 = start of the stack copy; r1 is doubled, r5 = 3 * r1;
; m2 = pw_punpcklwd shuffle mask (defined outside this excerpt)
20612 lea r3, [ang_table + 16 * 16] | |
20613 mov r4d, 8 | |
20614 mov r2, rsp | |
20615 add r1, r1 | |
20616 lea r5, [r1 * 3] | |
20617 mova m2, [pw_punpcklwd] | |
20618 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20619 .loop: | |
20620 MODE_12_24 1 | |
20621 lea r0, [r0 + r1 * 4 ] | |
20622 add r2, 8 | |
20623 dec r4 | |
20624 jnz .loop | |
20625 RET | |
20626 | |
; intra_pred_ang32_13 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Builds an 82-byte (5*mmsize + 2) working copy of the reference samples on the
; stack, then runs MODE_13_23 eight times on it.
;
; Stack layout after setup:
;   [rsp +  0 .. rsp +  7] low 8 bytes of [src + 36] shuffled by shuf_mode_13_23,
;                          first word then overwritten with the word at [src + 56]
;   [rsp +  8 .. rsp + 15] low 8 bytes of [src + 8] shuffled by shuf_mode_13_23,
;                          first word then overwritten with the word at [src + 28]
;   [rsp + 16 .. rsp + 79] 64 bytes copied from [src + 128], with word 0
;                          replaced by the word at [src]
;   [rsp + 80]            word copied from [src + 128 + 64]
; NOTE(review): shuf_mode_13_23 and pw_punpcklwd are defined outside this excerpt.
20627 INIT_XMM sse4 | |
20628 cglobal intra_pred_ang32_13, 3,6,7,0-(5*mmsize+2) | |
; r3 keeps the incoming src pointer; r2 moves 128 bytes into the buffer
20629 mov r3, r2mp | |
20630 add r2, 128 | |
20631 movu m0, [r2 + 0*mmsize] | |
20632 pinsrw m0, [r3], 0 | |
20633 movu m1, [r2 + 1*mmsize] | |
20634 movu m2, [r2 + 2*mmsize] | |
20635 movu m3, [r2 + 3*mmsize] | |
20636 movu [rsp + 1*mmsize], m0 | |
20637 movu [rsp + 2*mmsize], m1 | |
20638 movu [rsp + 3*mmsize], m2 | |
20639 movu [rsp + 4*mmsize], m3 | |
20640 | |
20641 mov r4w, [r2+64] | |
20642 mov [rsp+80], r4w | |
; two shuffled 4-word gathers fill the 16 bytes in front of the copied run
20643 movu m0, [r3 + 8] | |
20644 movu m1, [r3 + 36] | |
20645 pshufb m0, [shuf_mode_13_23] | |
20646 pshufb m1, [shuf_mode_13_23] | |
20647 movh [rsp + 8], m0 | |
20648 movh [rsp], m1 | |
; the first word of each gather is replaced by a separately loaded word
20649 mov r4w, [r3+28] | |
20650 mov [rsp+8], r4w | |
20651 mov r4w, [r3+56] | |
20652 mov [rsp], r4w | |
20653 | |
; r3 = ang_table + 16 entries (so [r3 + k*16] is entry 16 + k), r4d = 8
; iterations, r2 = start of the stack copy; r1 is doubled, r5 = 3 * r1
20654 lea r3, [ang_table + 16 * 16] | |
20655 mov r4d, 8 | |
20656 mov r2, rsp | |
20657 add r1, r1 | |
20658 lea r5, [r1 * 3] | |
20659 mova m2, [pw_punpcklwd] | |
20660 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20661 .loop: | |
20662 MODE_13_23 1 | |
20663 lea r0, [r0 + r1 * 4 ] | |
20664 add r2, 8 | |
20665 dec r4 | |
20666 jnz .loop | |
20667 RET | |
20668 | |
; intra_pred_ang32_14 (SSE4): r0 = dst, r1 = dstStride, r2 = src.
; Builds a 90-byte (5*mmsize + 10) working copy of the reference samples on the
; stack, then runs MODE_14_22 eight times on it.
;
; Stack layout after setup:
;   [rsp +  0 .. rsp +  5] words 1-3 of the shuffled gather from [src + 50]
;   [rsp +  6 .. rsp + 13] low 8 bytes of the shuffled gather from [src + 30]
;   [rsp + 14 .. rsp + 21] low 8 bytes of the shuffled gather from [src + 10]
;   [rsp + 22]            word copied from [src + 4]
;   [rsp + 24 .. rsp + 87] 64 bytes copied from [src + 128], with word 0
;                          replaced by the word at [src]
;   [rsp + 88]            word copied from [src + 128 + 64]
; NOTE(review): shuf_mode_14_22 and pw_punpcklwd are defined outside this excerpt.
20669 INIT_XMM sse4 | |
20670 cglobal intra_pred_ang32_14, 3,6,7,0-(5*mmsize+10) | |
; r3 keeps the incoming src pointer; r2 moves 128 bytes into the buffer
20671 mov r3, r2mp | |
20672 add r2, 128 | |
20673 movu m0, [r2 + 0*mmsize] | |
20674 pinsrw m0, [r3], 0 | |
20675 movu m1, [r2 + 1*mmsize] | |
20676 movu m2, [r2 + 2*mmsize] | |
20677 movu m3, [r2 + 3*mmsize] | |
20678 movu [rsp + 1*mmsize + 8], m0 | |
20679 movu [rsp + 2*mmsize + 8], m1 | |
20680 movu [rsp + 3*mmsize + 8], m2 | |
20681 movu [rsp + 4*mmsize + 8], m3 | |
20682 | |
20683 mov r4w, [r2 + 64] | |
20684 mov [rsp + 88], r4w | |
20685 mov r4w, [r3+4] | |
20686 mov [rsp+22], r4w | |
; three shuffled 4-word gathers fill the space in front of the copied run
20687 movu m0, [r3 + 10] | |
20688 movu m1, [r3 + 30] | |
20689 movu m2, [r3 + 50] | |
20690 pshufb m0, [shuf_mode_14_22] | |
20691 pshufb m1, [shuf_mode_14_22] | |
20692 pshufb m2, [shuf_mode_14_22] | |
20693 movh [rsp + 14], m0 | |
20694 movh [rsp + 6], m1 | |
; Only words 1-3 of the last gather fit inside the frame. The previous
; `movh [rsp - 2], m2` also stored word 0 two bytes BELOW the stack pointer,
; outside the allocated area (not protected on targets without a red zone).
; MODE_14_22 never loads below [r2] (= rsp), so that word is simply dropped.
    pextrw r4d, m2, 3              ; word 3 -> [rsp + 4]
    mov [rsp + 4], r4w
    psrldq m2, 2                   ; words 1,2 -> low dword
    movd [rsp], m2                 ; -> [rsp], [rsp + 2]
20696 | |
; r3 = ang_table + 16 entries (so [r3 + k*16] is entry 16 + k), r4d = 8
; iterations, r2 = start of the stack copy; r1 is doubled, r5 = 3 * r1;
; m2 is reloaded with the pair shuffle mask that MODE_14_22 expects
20697 lea r3, [ang_table + 16 * 16] | |
20698 mov r4d, 8 | |
20699 mov r2, rsp | |
20700 add r1, r1 | |
20701 lea r5, [r1 * 3] | |
20702 mova m2, [pw_punpcklwd] | |
20703 | |
; per iteration: dst advances by 4 * r1 bytes, the source pointer by 8 bytes
20704 .loop: | |
20705 MODE_14_22 1 | |
20706 lea r0, [r0 + r1 * 4 ] | |
20707 add r2, 8 | |
20708 dec r4 | |
20709 jnz .loop | |
20710 RET | |
20711 | |
20712 INIT_XMM sse4 | |
; intra_pred_ang32_15
; Stages a contiguous reference array on the stack, then invokes the MODE_15_21
; macro (defined outside this excerpt) 8 times; r0 advances by 4*r1 bytes and
; r2 by 8 bytes after each invocation.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   32..97 : 33 words copied from [base+128 ..]; word 0 is replaced by the word at [base]
;    0..31 : four 8-byte groups gathered from [base+4 ..], [base+18 ..], [base+34 ..],
;            [base+48 ..] through shuf_mode_15_21
20713 cglobal intra_pred_ang32_15, 3,6,7,0-(6*mmsize+2) | |
; r3 = copy of argument 2 (original base); r2 is moved to base+128 right after
20714 mov r3, r2mp | |
20715 add r2, 128 | |
20716 movu m0, [r2 + 0*mmsize] | |
; lane 0 of the first vector comes from the word at the original base
20717 pinsrw m0, [r3], 0 | |
20718 movu m1, [r2 + 1*mmsize] | |
20719 movu m2, [r2 + 2*mmsize] | |
20720 movu m3, [r2 + 3*mmsize] | |
20721 movu [rsp + 2*mmsize], m0 | |
20722 movu [rsp + 3*mmsize], m1 | |
20723 movu [rsp + 4*mmsize], m2 | |
20724 movu [rsp + 5*mmsize], m3 | |
20725 | |
; 33rd word of the main run
20726 mov r4w, [r2 + 64] | |
20727 mov [rsp + 96], r4w | |
; gather the extension words that sit in front of the main run
20728 movu m0, [r3 + 4] | |
20729 movu m1, [r3 + 18] | |
20730 movu m2, [r3 + 34] | |
20731 movu m3, [r3 + 48] | |
20732 pshufb m0, [shuf_mode_15_21] | |
20733 pshufb m1, [shuf_mode_15_21] | |
20734 pshufb m2, [shuf_mode_15_21] | |
20735 pshufb m3, [shuf_mode_15_21] | |
20736 movh [rsp + 24], m0 | |
20737 movh [rsp + 16], m1 | |
20738 movh [rsp + 8], m2 | |
20739 movh [rsp], m3 | |
20740 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
20741 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
20742 mov r4d, 8 | |
20743 mov r2, rsp | |
20744 add r1, r1 | |
20745 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
20746 mova m2, [pw_punpcklwd] | |
20747 | |
20748 .loop: | |
20749 MODE_15_21 1 | |
20750 lea r0, [r0 + r1 * 4 ] | |
20751 add r2, 8 | |
20752 dec r4 | |
20753 jnz .loop | |
20754 RET | |
20755 | |
20756 INIT_XMM sse4 | |
; intra_pred_ang32_16
; Stages a contiguous reference array on the stack, then invokes the MODE_16_20
; macro (defined outside this excerpt) 8 times; r0 advances by 4*r1 bytes and
; r2 by 8 bytes after each invocation.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   40..105 : 33 words copied from [base+128 ..]; word 0 is replaced by the word at [base]
;    0..39  : words gathered from [base+4 ..], [base+22 ..], [base+40 ..], [base+58 ..]
;             through shuf_mode_16_20
20757 cglobal intra_pred_ang32_16, 3,6,7,0-(6*mmsize+10) | |
; r3 = copy of argument 2 (original base); r2 is moved to base+128 right after
20758 mov r3, r2mp | |
20759 add r2, 128 | |
20760 movu m0, [r2 + 0*mmsize] | |
; lane 0 of the first vector comes from the word at the original base
20761 pinsrw m0, [r3], 0 | |
20762 movu m1, [r2 + 1*mmsize] | |
20763 movu m2, [r2 + 2*mmsize] | |
20764 movu m3, [r2 + 3*mmsize] | |
20765 movu [rsp + 2*mmsize + 8], m0 | |
20766 movu [rsp + 3*mmsize + 8], m1 | |
20767 movu [rsp + 4*mmsize + 8], m2 | |
20768 movu [rsp + 5*mmsize + 8], m3 | |
20769 | |
; 33rd word of the main run
20770 mov r4w, [r2 + 64] | |
20771 mov [rsp + 104], r4w | |
; gather the extension words that sit in front of the main run
20772 movu m0, [r3 + 4] | |
20773 movu m1, [r3 + 22] | |
20774 movu m2, [r3 + 40] | |
20775 movd m3, [r3 + 58] | |
20776 pshufb m0, [shuf_mode_16_20] | |
20777 pshufb m1, [shuf_mode_16_20] | |
20778 pshufb m2, [shuf_mode_16_20] | |
20779 pshufb m3, [shuf_mode_16_20] | |
; store order matters: these 16-byte stores are 12 bytes apart, so each later
; store overwrites the lowest 4 bytes of the previous one, and the final movd
; overwrites the lowest 4 bytes at rsp
20780 movu [rsp + 24], m0 | |
20781 movu [rsp + 12], m1 | |
20782 movu [rsp], m2 | |
20783 movd [rsp], m3 | |
20784 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
20785 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
20786 mov r4d, 8 | |
20787 mov r2, rsp | |
20788 add r1, r1 | |
20789 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
20790 mova m2, [pw_punpcklwd] | |
20791 | |
20792 .loop: | |
20793 MODE_16_20 1 | |
20794 lea r0, [r0 + r1 * 4 ] | |
20795 add r2, 8 | |
20796 dec r4 | |
20797 jnz .loop | |
20798 RET | |
20799 | |
20800 INIT_XMM sse4 | |
; intra_pred_ang32_17
; Stages a contiguous reference array on the stack, then invokes the MODE_17_19
; macro (defined outside this excerpt) 8 times; r0 advances by 4*r1 bytes and
; r2 by 8 bytes after each invocation.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   50..115 : 33 words copied from [base+128 ..]; word 0 is replaced by the word at [base]
;   -4..49  : words gathered from [base+2 ..], [base+8 ..], [base+30 ..], [base+50 ..],
;             then three individual words patched in at rsp+30, rsp+28 and rsp+12
; NOTE(review): the last movu starts 4 bytes below rsp, i.e. outside the area
; reserved by cglobal -- confirm this is acceptable on every target ABI.
20801 cglobal intra_pred_ang32_17, 3,6,7,0-(7*mmsize+4) | |
; r3 = copy of argument 2 (original base); r2 is moved to base+128 right after
20802 mov r3, r2mp | |
20803 add r2, 128 | |
20804 movu m0, [r2 + 0*mmsize] | |
; lane 0 of the first vector comes from the word at the original base
20805 pinsrw m0, [r3], 0 | |
20806 movu m1, [r2 + 1*mmsize] | |
20807 movu m2, [r2 + 2*mmsize] | |
20808 movu m3, [r2 + 3*mmsize] | |
20809 movu [rsp + 3*mmsize + 2], m0 | |
20810 movu [rsp + 4*mmsize + 2], m1 | |
20811 movu [rsp + 5*mmsize + 2], m2 | |
20812 movu [rsp + 6*mmsize + 2], m3 | |
20813 | |
; 33rd word of the main run
20814 mov r4w, [r2 + 64] | |
20815 mov [rsp + 114], r4w | |
; gather the extension words that sit in front of the main run
20816 movu m0, [r3 + 8] | |
20817 movu m1, [r3 + 30] | |
20818 movu m2, [r3 + 50] | |
20819 movd m3, [r3 + 2] | |
20820 pshufb m0, [shuf_mode_17_19] | |
20821 pshufb m1, [shuf_mode_17_19] | |
20822 pshufb m2, [shuf_mode_17_19] | |
; m3 is shuffled with the mode 16/20 table, unlike m0..m2
20823 pshufb m3, [shuf_mode_16_20] | |
20824 movd [rsp + 46], m3 | |
20825 movu [rsp + 30], m0 | |
20826 movu [rsp + 12], m1 | |
20827 movu [rsp - 4], m2 | |
; individual words written over parts of the vector stores above
20828 mov r4w, [r3 + 24] | |
20829 mov [rsp + 30], r4w | |
20830 mov r4w, [r3 + 28] | |
20831 mov [rsp + 28], r4w | |
20832 mov r4w, [r3 + 46] | |
20833 mov [rsp + 12], r4w | |
20834 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
20835 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
20836 mov r4d, 8 | |
20837 mov r2, rsp | |
20838 add r1, r1 | |
20839 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
20840 mova m2, [pw_punpcklwd] | |
20841 | |
20842 .loop: | |
20843 MODE_17_19 1 | |
20844 lea r0, [r0 + r1 * 4 ] | |
20845 add r2, 8 | |
20846 dec r4 | |
20847 jnz .loop | |
20848 RET | |
20849 | |
20850 INIT_XMM sse4 | |
; intra_pred_ang32_18
; Writes 32 rows of 32 words (64 bytes) each to r0. Row 0 is the 32 words at
; [base]; every following row is the previous row shifted up by one word, with
; the vacated lowest lane filled from the words at [base+130 ..] (order reversed
; by shuf_mode32_18, per the existing lane comments). Fully unrolled: 8 groups
; of 4 rows, r0 advancing by 4 strides between groups.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Register roles after setup: r1 = doubled stride, r6 = 2*r1, r3 = 3*r1, r4 = 4*r1.
20851 cglobal intra_pred_ang32_18, 3,7,8 | |
; r3 = copy of argument 2 (original base); r2 is moved to base+128 right after
20852 mov r3, r2mp | |
20853 add r2, 128 | |
20854 movu m0, [r3] ; [7 6 5 4 3 2 1 0] | |
20855 movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8] | |
20856 movu m2, [r3 + 32] ; [23 22 21 20 19 18 17 16] | |
20857 movu m3, [r3 + 48] ; [31 30 29 28 27 26 25 24] | |
20858 movu m4, [r2 + 2] ; [8 7 6 5 4 3 2 1] | |
20859 movu m5, [r2 + 18] ; [16 15 14 13 12 11 10 9] | |
20860 | |
; r3 is reused from here on as 3*r1
20861 add r1, r1 | |
20862 lea r6, [r1 * 2] | |
20863 lea r3, [r1 * 3] | |
20864 lea r4, [r1 * 4] | |
20865 | |
; rows 0..3
20866 movu [r0], m0 | |
20867 movu [r0 + 16], m1 | |
20868 movu [r0 + 32], m2 | |
20869 movu [r0 + 48], m3 | |
20870 | |
20871 pshufb m4, [shuf_mode32_18] ; [1 2 3 4 5 6 7 8] | |
20872 pshufb m5, [shuf_mode32_18] ; [9 10 11 12 13 14 15 16] | |
20873 | |
20874 palignr m6, m0, m4, 14 | |
20875 movu [r0 + r1], m6 | |
20876 palignr m6, m1, m0, 14 | |
20877 movu [r0 + r1 + 16], m6 | |
20878 palignr m6, m2, m1, 14 | |
20879 movu [r0 + r1 + 32], m6 | |
20880 palignr m6, m3, m2, 14 | |
20881 movu [r0 + r1 + 48], m6 | |
20882 | |
20883 palignr m6, m0, m4, 12 | |
20884 movu [r0 + r6], m6 | |
20885 palignr m6, m1, m0, 12 | |
20886 movu [r0 + r6 + 16], m6 | |
20887 palignr m6, m2, m1, 12 | |
20888 movu [r0 + r6 + 32], m6 | |
20889 palignr m6, m3, m2, 12 | |
20890 movu [r0 + r6 + 48], m6 | |
20891 | |
20892 palignr m6, m0, m4, 10 | |
20893 movu [r0 + r3], m6 | |
20894 palignr m6, m1, m0, 10 | |
20895 movu [r0 + r3 + 16], m6 | |
20896 palignr m6, m2, m1, 10 | |
20897 movu [r0 + r3 + 32], m6 | |
20898 palignr m6, m3, m2, 10 | |
20899 movu [r0 + r3 + 48], m6 | |
20900 | |
; rows 4..7
20901 add r0, r4 | |
20902 | |
20903 palignr m6, m0, m4, 8 | |
20904 movu [r0], m6 | |
20905 palignr m6, m1, m0, 8 | |
20906 movu [r0 + 16], m6 | |
20907 palignr m6, m2, m1, 8 | |
20908 movu [r0 + 32], m6 | |
20909 palignr m6, m3, m2, 8 | |
20910 movu [r0 + 48], m6 | |
20911 | |
20912 palignr m6, m0, m4, 6 | |
20913 movu [r0 + r1], m6 | |
20914 palignr m6, m1, m0, 6 | |
20915 movu [r0 + r1 + 16], m6 | |
20916 palignr m6, m2, m1, 6 | |
20917 movu [r0 + r1 + 32], m6 | |
20918 palignr m6, m3, m2, 6 | |
20919 movu [r0 + r1 + 48], m6 | |
20920 | |
20921 palignr m6, m0, m4, 4 | |
20922 movu [r0 + r6], m6 | |
20923 palignr m6, m1, m0, 4 | |
20924 movu [r0 + r6 + 16], m6 | |
20925 palignr m6, m2, m1, 4 | |
20926 movu [r0 + r6 + 32], m6 | |
20927 palignr m6, m3, m2, 4 | |
20928 movu [r0 + r6 + 48], m6 | |
20929 | |
20930 palignr m6, m0, m4, 2 | |
20931 movu [r0 + r3], m6 | |
20932 palignr m6, m1, m0, 2 | |
20933 movu [r0 + r3 + 16], m6 | |
20934 palignr m6, m2, m1, 2 | |
20935 movu [r0 + r3 + 32], m6 | |
20936 palignr m6, m3, m2, 2 | |
20937 movu [r0 + r3 + 48], m6 | |
20938 | |
; rows 8..11: the row has shifted by a whole vector, so m4 now leads and m3 is no longer needed
20939 add r0, r4 | |
20940 | |
20941 movu [r0], m4 | |
20942 movu [r0 + 16], m0 | |
20943 movu [r0 + 32], m1 | |
20944 movu [r0 + 48], m2 | |
20945 | |
20946 palignr m6, m4, m5, 14 | |
20947 movu [r0 + r1], m6 | |
20948 palignr m6, m0, m4, 14 | |
20949 movu [r0 + r1 + 16], m6 | |
20950 palignr m6, m1, m0, 14 | |
20951 movu [r0 + r1 + 32], m6 | |
20952 palignr m6, m2, m1, 14 | |
20953 movu [r0 + r1 + 48], m6 | |
20954 | |
20955 palignr m6, m4, m5, 12 | |
20956 movu [r0 + r6], m6 | |
20957 palignr m6, m0, m4, 12 | |
20958 movu [r0 + r6 + 16], m6 | |
20959 palignr m6, m1, m0, 12 | |
20960 movu [r0 + r6 + 32], m6 | |
20961 palignr m6, m2, m1, 12 | |
20962 movu [r0 + r6 + 48], m6 | |
20963 | |
20964 palignr m6, m4, m5, 10 | |
20965 movu [r0 + r3], m6 | |
20966 palignr m6, m0, m4, 10 | |
20967 movu [r0 + r3 + 16], m6 | |
20968 palignr m6, m1, m0, 10 | |
20969 movu [r0 + r3 + 32], m6 | |
20970 palignr m6, m2, m1, 10 | |
20971 movu [r0 + r3 + 48], m6 | |
20972 | |
; rows 12..15
20973 add r0, r4 | |
20974 | |
20975 palignr m6, m4, m5, 8 | |
20976 movu [r0], m6 | |
20977 palignr m6, m0, m4, 8 | |
20978 movu [r0 + 16], m6 | |
20979 palignr m6, m1, m0, 8 | |
20980 movu [r0 + 32], m6 | |
20981 palignr m6, m2, m1, 8 | |
20982 movu [r0 + 48], m6 | |
20983 | |
20984 palignr m6, m4, m5, 6 | |
20985 movu [r0 + r1], m6 | |
20986 palignr m6, m0, m4, 6 | |
20987 movu [r0 + r1 + 16], m6 | |
20988 palignr m6, m1, m0, 6 | |
20989 movu [r0 + r1 + 32], m6 | |
20990 palignr m6, m2, m1, 6 | |
20991 movu [r0 + r1 + 48], m6 | |
20992 | |
20993 palignr m6, m4, m5, 4 | |
20994 movu [r0 + r6], m6 | |
20995 palignr m6, m0, m4, 4 | |
20996 movu [r0 + r6 + 16], m6 | |
20997 palignr m6, m1, m0, 4 | |
20998 movu [r0 + r6 + 32], m6 | |
20999 palignr m6, m2, m1, 4 | |
21000 movu [r0 + r6 + 48], m6 | |
21001 | |
21002 palignr m6, m4, m5, 2 | |
21003 movu [r0 + r3], m6 | |
21004 palignr m6, m0, m4, 2 | |
21005 movu [r0 + r3 + 16], m6 | |
21006 palignr m6, m1, m0, 2 | |
21007 movu [r0 + r3 + 32], m6 | |
21008 palignr m6, m2, m1, 2 | |
21009 movu [r0 + r3 + 48], m6 | |
21010 | |
; rows 16..19: m2/m3 are reloaded with the next 16 words of the second run (reversed)
21011 add r0, r4 | |
21012 | |
21013 movu m2, [r2 + 34] | |
21014 movu m3, [r2 + 50] | |
21015 pshufb m2, [shuf_mode32_18] | |
21016 pshufb m3, [shuf_mode32_18] | |
21017 | |
21018 movu [r0], m5 | |
21019 movu [r0 + 16], m4 | |
21020 movu [r0 + 32], m0 | |
21021 movu [r0 + 48], m1 | |
21022 | |
21023 palignr m6, m5, m2, 14 | |
21024 movu [r0 + r1], m6 | |
21025 palignr m6, m4, m5, 14 | |
21026 movu [r0 + r1 + 16], m6 | |
21027 palignr m6, m0, m4, 14 | |
21028 movu [r0 + r1 + 32], m6 | |
21029 palignr m6, m1, m0, 14 | |
21030 movu [r0 + r1 + 48], m6 | |
21031 | |
21032 palignr m6, m5, m2, 12 | |
21033 movu [r0 + r6], m6 | |
21034 palignr m6, m4, m5, 12 | |
21035 movu [r0 + r6 + 16], m6 | |
21036 palignr m6, m0, m4, 12 | |
21037 movu [r0 + r6 + 32], m6 | |
21038 palignr m6, m1, m0, 12 | |
21039 movu [r0 + r6 + 48], m6 | |
21040 | |
21041 palignr m6, m5, m2, 10 | |
21042 movu [r0 + r3], m6 | |
21043 palignr m6, m4, m5, 10 | |
21044 movu [r0 + r3 + 16], m6 | |
21045 palignr m6, m0, m4, 10 | |
21046 movu [r0 + r3 + 32], m6 | |
21047 palignr m6, m1, m0, 10 | |
21048 movu [r0 + r3 + 48], m6 | |
21049 | |
; rows 20..23
21050 add r0, r4 | |
21051 | |
21052 palignr m6, m5, m2, 8 | |
21053 movu [r0], m6 | |
21054 palignr m6, m4, m5, 8 | |
21055 movu [r0 + 16], m6 | |
21056 palignr m6, m0, m4, 8 | |
21057 movu [r0 + 32], m6 | |
21058 palignr m6, m1, m0, 8 | |
21059 movu [r0 + 48], m6 | |
21060 | |
21061 palignr m6, m5, m2, 6 | |
21062 movu [r0 + r1], m6 | |
21063 palignr m6, m4, m5, 6 | |
21064 movu [r0 + r1 + 16], m6 | |
21065 palignr m6, m0, m4, 6 | |
21066 movu [r0 + r1 + 32], m6 | |
21067 palignr m6, m1, m0, 6 | |
21068 movu [r0 + r1 + 48], m6 | |
21069 | |
21070 palignr m6, m5, m2, 4 | |
21071 movu [r0 + r6], m6 | |
21072 palignr m6, m4, m5, 4 | |
21073 movu [r0 + r6 + 16], m6 | |
21074 palignr m6, m0, m4, 4 | |
21075 movu [r0 + r6 + 32], m6 | |
21076 palignr m6, m1, m0, 4 | |
21077 movu [r0 + r6 + 48], m6 | |
21078 | |
21079 palignr m6, m5, m2, 2 | |
21080 movu [r0 + r3], m6 | |
21081 palignr m6, m4, m5, 2 | |
21082 movu [r0 + r3 + 16], m6 | |
21083 palignr m6, m0, m4, 2 | |
21084 movu [r0 + r3 + 32], m6 | |
21085 palignr m6, m1, m0, 2 | |
21086 movu [r0 + r3 + 48], m6 | |
21087 | |
; rows 24..27
21088 add r0, r4 | |
21089 | |
21090 movu [r0], m2 | |
21091 movu [r0 + 16], m5 | |
21092 movu [r0 + 32], m4 | |
21093 movu [r0 + 48], m0 | |
21094 | |
21095 palignr m6, m2, m3, 14 | |
21096 movu [r0 + r1], m6 | |
21097 palignr m6, m5, m2, 14 | |
21098 movu [r0 + r1 + 16], m6 | |
21099 palignr m6, m4, m5, 14 | |
21100 movu [r0 + r1 + 32], m6 | |
21101 palignr m6, m0, m4, 14 | |
21102 movu [r0 + r1 + 48], m6 | |
21103 | |
21104 palignr m6, m2, m3, 12 | |
21105 movu [r0 + r6], m6 | |
21106 palignr m6, m5, m2, 12 | |
21107 movu [r0 + r6 + 16], m6 | |
21108 palignr m6, m4, m5, 12 | |
21109 movu [r0 + r6 + 32], m6 | |
21110 palignr m6, m0, m4, 12 | |
21111 movu [r0 + r6 + 48], m6 | |
21112 | |
21113 palignr m6, m2, m3, 10 | |
21114 movu [r0 + r3], m6 | |
21115 palignr m6, m5, m2, 10 | |
21116 movu [r0 + r3 + 16], m6 | |
21117 palignr m6, m4, m5, 10 | |
21118 movu [r0 + r3 + 32], m6 | |
21119 palignr m6, m0, m4, 10 | |
21120 movu [r0 + r3 + 48], m6 | |
21121 | |
; rows 28..31
21122 add r0, r4 | |
21123 | |
21124 palignr m6, m2, m3, 8 | |
21125 movu [r0], m6 | |
21126 palignr m6, m5, m2, 8 | |
21127 movu [r0 + 16], m6 | |
21128 palignr m6, m4, m5, 8 | |
21129 movu [r0 + 32], m6 | |
21130 palignr m6, m0, m4, 8 | |
21131 movu [r0 + 48], m6 | |
21132 | |
21133 palignr m6, m2, m3, 6 | |
21134 movu [r0 + r1], m6 | |
21135 palignr m6, m5, m2, 6 | |
21136 movu [r0 + r1 + 16], m6 | |
21137 palignr m6, m4, m5, 6 | |
21138 movu [r0 + r1 + 32], m6 | |
21139 palignr m6, m0, m4, 6 | |
21140 movu [r0 + r1 + 48], m6 | |
21141 | |
21142 palignr m6, m2, m3, 4 | |
21143 movu [r0 + r6], m6 | |
21144 palignr m6, m5, m2, 4 | |
21145 movu [r0 + r6 + 16], m6 | |
21146 palignr m6, m4, m5, 4 | |
21147 movu [r0 + r6 + 32], m6 | |
21148 palignr m6, m0, m4, 4 | |
21149 movu [r0 + r6 + 48], m6 | |
21150 | |
21151 palignr m6, m2, m3, 2 | |
21152 movu [r0 + r3], m6 | |
21153 palignr m6, m5, m2, 2 | |
21154 movu [r0 + r3 + 16], m6 | |
21155 palignr m6, m4, m5, 2 | |
21156 movu [r0 + r3 + 32], m6 | |
21157 palignr m6, m0, m4, 2 | |
21158 movu [r0 + r3 + 48], m6 | |
21159 RET | |
21160 | |
21161 INIT_XMM sse4 | |
; intra_pred_ang32_19
; Same staging as intra_pred_ang32_17 with the two reference runs swapped: the
; main run is copied from [base ..] and the extension words are gathered from
; [base+128 ..]. MODE_17_19 (defined outside this excerpt) is then invoked 8
; times with argument 0; the output pointer moves 8 bytes to the right (not
; down) after each invocation, and r2 advances by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   50..115 : 33 words copied from [base ..]
;   -4..49  : words gathered from [base+128+2 ..], [+8 ..], [+30 ..], [+50 ..],
;             then three individual words patched in at rsp+30, rsp+28 and rsp+12
; NOTE(review): the last movu starts 4 bytes below rsp, i.e. outside the area
; reserved by cglobal -- confirm this is acceptable on every target ABI.
21162 cglobal intra_pred_ang32_19, 3,7,7,0-(7*mmsize+4) | |
; r3 -> second reference run (base+128)
21163 lea r3, [r2 + 128] | |
21164 movu m0, [r2 + 0*mmsize] | |
21165 movu m1, [r2 + 1*mmsize] | |
21166 movu m2, [r2 + 2*mmsize] | |
21167 movu m3, [r2 + 3*mmsize] | |
21168 movu [rsp + 3*mmsize + 2], m0 | |
21169 movu [rsp + 4*mmsize + 2], m1 | |
21170 movu [rsp + 5*mmsize + 2], m2 | |
21171 movu [rsp + 6*mmsize + 2], m3 | |
21172 | |
; 33rd word of the main run
21173 mov r4w, [r2 + 64] | |
21174 mov [rsp + 114], r4w | |
; gather the extension words that sit in front of the main run
21175 movu m0, [r3 + 8] | |
21176 movu m1, [r3 + 30] | |
21177 movu m2, [r3 + 50] | |
21178 movd m3, [r3 + 2] | |
21179 pshufb m0, [shuf_mode_17_19] | |
21180 pshufb m1, [shuf_mode_17_19] | |
21181 pshufb m2, [shuf_mode_17_19] | |
; m3 is shuffled with the mode 16/20 table, unlike m0..m2
21182 pshufb m3, [shuf_mode_16_20] | |
21183 movd [rsp + 46], m3 | |
21184 movu [rsp + 30], m0 | |
21185 movu [rsp + 12], m1 | |
21186 movu [rsp - 4], m2 | |
; individual words written over parts of the vector stores above
21187 mov r4w, [r3 + 24] | |
21188 mov [rsp + 30], r4w | |
21189 mov r4w, [r3 + 28] | |
21190 mov [rsp + 28], r4w | |
21191 mov r4w, [r3 + 46] | |
21192 mov [rsp + 12], r4w | |
21193 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21194 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
21195 mov r4d, 8 | |
21196 mov r2, rsp | |
21197 add r1, r1 | |
21198 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
21199 mova m2, [pw_punpcklwd] | |
; r6 keeps the output position for the current iteration
21200 mov r6, r0 | |
21201 | |
21202 .loop: | |
21203 MODE_17_19 0 | |
; step 8 bytes to the right and restart r0 from there
21204 add r6, 8 | |
21205 mov r0, r6 | |
21206 add r2, 8 | |
21207 dec r4 | |
21208 jnz .loop | |
21209 RET | |
21210 | |
21211 INIT_XMM sse4 | |
; intra_pred_ang32_20
; Same staging as intra_pred_ang32_16 with the two reference runs swapped: the
; main run is copied from [base ..] and the extension words are gathered from
; [base+128 ..]. MODE_16_20 (defined outside this excerpt) is then invoked 8
; times with argument 0; the output pointer moves 8 bytes to the right (not
; down) after each invocation, and r2 advances by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   40..105 : 33 words copied from [base ..]
;    0..39  : words gathered from [base+128+4 ..], [+22 ..], [+40 ..], [+58 ..]
;             through shuf_mode_16_20
21212 cglobal intra_pred_ang32_20, 3,7,7,0-(6*mmsize+10) | |
; r3 -> second reference run (base+128)
21213 lea r3, [r2 + 128] | |
21214 movu m0, [r2 + 0*mmsize] | |
21215 movu m1, [r2 + 1*mmsize] | |
21216 movu m2, [r2 + 2*mmsize] | |
21217 movu m3, [r2 + 3*mmsize] | |
21218 movu [rsp + 2*mmsize + 8], m0 | |
21219 movu [rsp + 3*mmsize + 8], m1 | |
21220 movu [rsp + 4*mmsize + 8], m2 | |
21221 movu [rsp + 5*mmsize + 8], m3 | |
21222 | |
; 33rd word of the main run
21223 mov r4w, [r2 + 64] | |
21224 mov [rsp + 104], r4w | |
; gather the extension words that sit in front of the main run
21225 movu m0, [r3 + 4] | |
21226 movu m1, [r3 + 22] | |
21227 movu m2, [r3 + 40] | |
21228 movd m3, [r3 + 58] | |
21229 pshufb m0, [shuf_mode_16_20] | |
21230 pshufb m1, [shuf_mode_16_20] | |
21231 pshufb m2, [shuf_mode_16_20] | |
21232 pshufb m3, [shuf_mode_16_20] | |
; store order matters: these 16-byte stores are 12 bytes apart, so each later
; store overwrites the lowest 4 bytes of the previous one, and the final movd
; overwrites the lowest 4 bytes at rsp
21233 movu [rsp + 24], m0 | |
21234 movu [rsp + 12], m1 | |
21235 movu [rsp], m2 | |
21236 movd [rsp], m3 | |
21237 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21238 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
21239 mov r4d, 8 | |
21240 mov r2, rsp | |
21241 add r1, r1 | |
21242 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
21243 mova m2, [pw_punpcklwd] | |
; r6 keeps the output position for the current iteration
21244 mov r6, r0 | |
21245 | |
21246 .loop: | |
21247 MODE_16_20 0 | |
; step 8 bytes to the right and restart r0 from there
21248 add r6, 8 | |
21249 mov r0, r6 | |
21250 add r2, 8 | |
21251 dec r4 | |
21252 jnz .loop | |
21253 RET | |
21254 | |
21255 INIT_XMM sse4 | |
; intra_pred_ang32_21
; Same staging as intra_pred_ang32_15 with the two reference runs swapped: the
; main run is copied from [base ..] and the extension words are gathered from
; [base+128 ..]. MODE_15_21 (defined outside this excerpt) is then invoked 8
; times with argument 0; the output pointer moves 8 bytes to the right (not
; down) after each invocation, and r2 advances by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   32..97 : 33 words copied from [base ..]
;    0..31 : four 8-byte groups gathered from [base+128+4 ..], [+18 ..], [+34 ..],
;            [+48 ..] through shuf_mode_15_21
21256 cglobal intra_pred_ang32_21, 3,7,7,0-(6*mmsize+2) | |
; r3 -> second reference run (base+128)
21257 lea r3, [r2 + 128] | |
21258 movu m0, [r2 + 0*mmsize] | |
21259 movu m1, [r2 + 1*mmsize] | |
21260 movu m2, [r2 + 2*mmsize] | |
21261 movu m3, [r2 + 3*mmsize] | |
21262 movu [rsp + 2*mmsize], m0 | |
21263 movu [rsp + 3*mmsize], m1 | |
21264 movu [rsp + 4*mmsize], m2 | |
21265 movu [rsp + 5*mmsize], m3 | |
21266 | |
; 33rd word of the main run
21267 mov r4w, [r2 + 64] | |
21268 mov [rsp + 96], r4w | |
; gather the extension words that sit in front of the main run
21269 movu m0, [r3 + 4] | |
21270 movu m1, [r3 + 18] | |
21271 movu m2, [r3 + 34] | |
21272 movu m3, [r3 + 48] | |
21273 pshufb m0, [shuf_mode_15_21] | |
21274 pshufb m1, [shuf_mode_15_21] | |
21275 pshufb m2, [shuf_mode_15_21] | |
21276 pshufb m3, [shuf_mode_15_21] | |
21277 movh [rsp + 24], m0 | |
21278 movh [rsp + 16], m1 | |
21279 movh [rsp + 8], m2 | |
21280 movh [rsp], m3 | |
21281 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21282 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
21283 mov r4d, 8 | |
21284 mov r2, rsp | |
21285 add r1, r1 | |
21286 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
21287 mova m2, [pw_punpcklwd] | |
; r6 keeps the output position for the current iteration
21288 mov r6, r0 | |
21289 | |
21290 .loop: | |
21291 MODE_15_21 0 | |
; step 8 bytes to the right and restart r0 from there
21292 add r6, 8 | |
21293 mov r0, r6 | |
21294 add r2, 8 | |
21295 dec r4 | |
21296 jnz .loop | |
21297 RET | |
21298 | |
21299 INIT_XMM sse4 | |
; intra_pred_ang32_22
; Same staging as intra_pred_ang32_14 with the two reference runs swapped: the
; main run is copied from [base ..] and the extension words are gathered from
; [base+128 ..]. MODE_14_22 (defined outside this excerpt) is then invoked 8
; times with argument 0; the output pointer moves 8 bytes to the right (not
; down) after each invocation, and r2 advances by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   24..89 : 33 words copied from [base ..]
;   22     : word from [base+128+4]
;   -2..21 : three 8-byte groups gathered from [base+128+10 ..], [+30 ..], [+50 ..]
;            through shuf_mode_14_22
; NOTE(review): the last movh starts 2 bytes below rsp, i.e. outside the area
; reserved by cglobal -- confirm this is acceptable on every target ABI.
21300 cglobal intra_pred_ang32_22, 3,7,7,0-(5*mmsize+10) | |
; r3 -> second reference run (base+128)
21301 lea r3, [r2 + 128] | |
21302 movu m0, [r2 + 0*mmsize] | |
21303 movu m1, [r2 + 1*mmsize] | |
21304 movu m2, [r2 + 2*mmsize] | |
21305 movu m3, [r2 + 3*mmsize] | |
21306 movu [rsp + 1*mmsize + 8], m0 | |
21307 movu [rsp + 2*mmsize + 8], m1 | |
21308 movu [rsp + 3*mmsize + 8], m2 | |
21309 movu [rsp + 4*mmsize + 8], m3 | |
21310 | |
; 33rd word of the main run
21311 mov r4w, [r2 + 64] | |
21312 mov [rsp + 88], r4w | |
; single word placed directly in front of the main run
21313 mov r4w, [r3+4] | |
21314 mov [rsp+22], r4w | |
; gather the remaining extension words
21315 movu m0, [r3 + 10] | |
21316 movu m1, [r3 + 30] | |
21317 movu m2, [r3 + 50] | |
21318 pshufb m0, [shuf_mode_14_22] | |
21319 pshufb m1, [shuf_mode_14_22] | |
21320 pshufb m2, [shuf_mode_14_22] | |
21321 movh [rsp + 14], m0 | |
21322 movh [rsp + 6], m1 | |
21323 movh [rsp - 2], m2 | |
21324 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21325 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
21326 mov r4d, 8 | |
21327 mov r2, rsp | |
21328 add r1, r1 | |
21329 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
21330 mova m2, [pw_punpcklwd] | |
; r6 keeps the output position for the current iteration
21331 mov r6, r0 | |
21332 | |
21333 .loop: | |
21334 MODE_14_22 0 | |
; step 8 bytes to the right and restart r0 from there
21335 add r6, 8 | |
21336 mov r0, r6 | |
21337 add r2, 8 | |
21338 dec r4 | |
21339 jnz .loop | |
21340 RET | |
21341 | |
21342 INIT_XMM sse4 | |
; intra_pred_ang32_23
; Same staging as intra_pred_ang32_13 with the two reference runs swapped: the
; main run is copied from [base ..] and the extension words are gathered from
; [base+128 ..]. MODE_13_23 (defined outside this excerpt) is then invoked 8
; times with argument 0; the output pointer moves 8 bytes to the right (not
; down) after each invocation, and r2 advances by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   16..81 : 33 words copied from [base ..]
;    0..15 : words gathered from [base+128+8 ..] and [base+128+36 ..] through
;            shuf_mode_13_23; the words at rsp+8 and rsp+0 are then overwritten
;            from [base+128+28] and [base+128+56]
21343 cglobal intra_pred_ang32_23, 3,7,7,0-(5*mmsize+2) | |
; r3 -> second reference run (base+128)
21344 lea r3, [r2 + 128] | |
21345 movu m0, [r2 + 0*mmsize] | |
21346 movu m1, [r2 + 1*mmsize] | |
21347 movu m2, [r2 + 2*mmsize] | |
21348 movu m3, [r2 + 3*mmsize] | |
21349 movu [rsp + 1*mmsize], m0 | |
21350 movu [rsp + 2*mmsize], m1 | |
21351 movu [rsp + 3*mmsize], m2 | |
21352 movu [rsp + 4*mmsize], m3 | |
21353 | |
; 33rd word of the main run
21354 mov r4w, [r2+64] | |
21355 mov [rsp+80], r4w | |
; gather the extension words that sit in front of the main run
21356 movu m0, [r3 + 8] | |
21357 movu m1, [r3 + 36] | |
21358 pshufb m0, [shuf_mode_13_23] | |
21359 pshufb m1, [shuf_mode_13_23] | |
21360 movh [rsp + 8], m0 | |
21361 movh [rsp], m1 | |
; patch the lowest word of each 8-byte group with a separately loaded word
21362 mov r4w, [r3+28] | |
21363 mov [rsp+8], r4w | |
21364 mov r4w, [r3+56] | |
21365 mov [rsp], r4w | |
21366 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21367 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
21368 mov r4d, 8 | |
21369 mov r2, rsp | |
21370 add r1, r1 | |
21371 lea r5, [r1 * 3] | |
; presumably consumed by the macro below -- confirm in the macro definition
21372 mova m2, [pw_punpcklwd] | |
; r6 keeps the output position for the current iteration
21373 mov r6, r0 | |
21374 | |
21375 .loop: | |
21376 MODE_13_23 0 | |
; step 8 bytes to the right and restart r0 from there
21377 add r6, 8 | |
21378 mov r0, r6 | |
21379 add r2, 8 | |
21380 dec r4 | |
21381 jnz .loop | |
21382 RET | |
21383 | |
21384 INIT_XMM sse4 | |
; intra_pred_ang32_24
; Stages a contiguous reference array on the stack, then invokes the MODE_12_24
; macro (defined outside this excerpt) 8 times with argument 0; the output
; pointer moves 8 bytes to the right after each invocation and r2 advances by
; 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   8..73 : 33 words copied from [base ..]
;   0..7  : four individual words from [base+128+52], [+38], [+26], [+12]
;           (lowest address first)
21385 cglobal intra_pred_ang32_24, 3,7,7,0-(4*mmsize+10) | |
; r3 -> second reference run (base+128)
21386 lea r3, [r2 + 128] | |
21387 movu m0, [r2 + 0*mmsize] | |
21388 movu m1, [r2 + 1*mmsize] | |
21389 movu m2, [r2 + 2*mmsize] | |
21390 movu m3, [r2 + 3*mmsize] | |
21391 | |
21392 movu [rsp + 0*mmsize + 8], m0 | |
21393 movu [rsp + 1*mmsize + 8], m1 | |
21394 movu [rsp + 2*mmsize + 8], m2 | |
21395 movu [rsp + 3*mmsize + 8], m3 | |
21396 | |
; 33rd word of the main run
21397 mov r4w, [r2+64] | |
21398 mov [rsp+72], r4w | |
; extension words, copied one at a time
21399 mov r4w, [r3+12] | |
21400 mov [rsp+6], r4w | |
21401 mov r4w, [r3+26] | |
21402 mov [rsp+4], r4w | |
21403 mov r4w, [r3+38] | |
21404 mov [rsp+2], r4w | |
21405 mov r4w, [r3+52] | |
21406 mov [rsp], r4w | |
21407 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21408 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
21409 mov r4d, 8 | |
21410 mov r2, rsp | |
21411 add r1, r1 | |
21412 lea r5, [r1 * 3] | |
; r6 keeps the output position for the current iteration
21413 mov r6, r0 | |
; presumably consumed by the macro below -- confirm in the macro definition
21414 mova m2, [pw_punpcklwd] | |
21415 | |
21416 .loop: | |
21417 MODE_12_24 0 | |
; step 8 bytes to the right and restart r0 from there
21418 add r6, 8 | |
21419 mov r0, r6 | |
21420 add r2, 8 | |
21421 dec r4 | |
21422 jnz .loop | |
21423 RET | |
21424 | |
21425 INIT_XMM sse4 | |
; intra_pred_ang32_25
; Stages a contiguous reference array on the stack, then invokes the MODE_11_25
; macro (defined outside this excerpt) 8 times with argument 0; the output
; pointer moves 8 bytes to the right after each invocation and r2 advances by
; 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Stack layout built below (byte offsets from rsp, mmsize = 16):
;   2..67 : 33 words copied from [base ..]
;   0     : one word from [base+128+32]
21426 cglobal intra_pred_ang32_25, 3,7,7,0-(4*mmsize+4) | |
; r3 -> second reference run (base+128)
21427 lea r3, [r2 + 128] | |
21428 movu m0, [r2 + 0*mmsize] | |
21429 movu m1, [r2 + 1*mmsize] | |
21430 movu m2, [r2 + 2*mmsize] | |
21431 movu m3, [r2 + 3*mmsize] | |
21432 movu [rsp + 0*mmsize + 2], m0 | |
21433 movu [rsp + 1*mmsize + 2], m1 | |
21434 movu [rsp + 2*mmsize + 2], m2 | |
21435 movu [rsp + 3*mmsize + 2], m3 | |
; the single extension word in front of the main run
21436 mov r4w, [r3+32] | |
21437 mov [rsp], r4w | |
; 33rd word of the main run
21438 mov r4w, [r2+64] | |
21439 mov [rsp+66], r4w | |
21440 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21441 lea r3, [ang_table + 16 * 16] | |
; r4 = iteration count, r2 -> staged array, r1 doubled, r5 = 3*r1
21442 mov r4d, 8 | |
21443 mov r2, rsp | |
21444 add r1, r1 | |
21445 lea r5, [r1 * 3] | |
; r6 keeps the output position for the current iteration
21446 mov r6, r0 | |
21447 | |
21448 .loop: | |
21449 MODE_11_25 0 | |
; step 8 bytes to the right and restart r0 from there
21450 add r6, 8 | |
21451 mov r0, r6 | |
21452 add r2, 8 | |
21453 dec r4 | |
21454 jnz .loop | |
21455 RET | |
21456 | |
21457 INIT_XMM sse4 | |
; intra_pred_ang32_26
; Replicates the 32 words at [r2+2 .. r2+65] into 32 consecutive output rows
; starting at r0. The loop runs 4 times and writes 8 rows per pass.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
; Register roles: r1 = doubled stride, r3 = 2*r1, r4 = 3*r1, r5 = 4*r1,
; r6d = remaining passes, m0..m3 = the row being replicated.
; Only m0..m3 are used, so 4 XMM registers are declared; the former load of
; c_mode32_10_0 into m4 was never read and has been dropped.
21458 cglobal intra_pred_ang32_26, 3,7,4 | |
21459 mov r6d, 4 | |
21460 add r1, r1 | |
21461 lea r3, [r1 * 2] | |
21462 lea r4, [r1 * 3] | |
21463 lea r5, [r1 * 4] | |
21465 | |
; load the source row once; it is loop-invariant
21466 movu m0, [r2 + 2 ] | |
21467 movu m1, [r2 + 18] | |
21468 movu m2, [r2 + 34] | |
21469 movu m3, [r2 + 50] | |
21470 | |
21471 .loop: | |
; first 4 rows of this pass
21472 movu [r0], m0 | |
21473 movu [r0 + 16], m1 | |
21474 movu [r0 + 32], m2 | |
21475 movu [r0 + 48], m3 | |
21476 | |
21477 movu [r0 + r1], m0 | |
21478 movu [r0 + r1 + 16], m1 | |
21479 movu [r0 + r1 + 32], m2 | |
21480 movu [r0 + r1 + 48], m3 | |
21481 | |
21482 movu [r0 + r3], m0 | |
21483 movu [r0 + r3 + 16], m1 | |
21484 movu [r0 + r3 + 32], m2 | |
21485 movu [r0 + r3 + 48], m3 | |
21486 | |
21487 movu [r0 + r4], m0 | |
21488 movu [r0 + r4 + 16], m1 | |
21489 movu [r0 + r4 + 32], m2 | |
21490 movu [r0 + r4 + 48], m3 | |
21491 | |
; next 4 rows of this pass
21492 add r0, r5 | |
21493 | |
21494 movu [r0], m0 | |
21495 movu [r0 + 16], m1 | |
21496 movu [r0 + 32], m2 | |
21497 movu [r0 + 48], m3 | |
21498 | |
21499 movu [r0 + r1], m0 | |
21500 movu [r0 + r1 + 16], m1 | |
21501 movu [r0 + r1 + 32], m2 | |
21502 movu [r0 + r1 + 48], m3 | |
21503 | |
21504 movu [r0 + r3], m0 | |
21505 movu [r0 + r3 + 16], m1 | |
21506 movu [r0 + r3 + 32], m2 | |
21507 movu [r0 + r3 + 48], m3 | |
21508 | |
21509 movu [r0 + r4], m0 | |
21510 movu [r0 + r4 + 16], m1 | |
21511 movu [r0 + r4 + 32], m2 | |
21512 movu [r0 + r4 + 48], m3 | |
21513 | |
21514 add r0, r5 | |
21515 dec r6d | |
21516 jnz .loop | |
21517 RET | |
21518 | |
21519 INIT_XMM sse4 | |
; intra_pred_ang32_27
; Invokes the MODE_9_27 macro (defined outside this excerpt) 8 times with
; argument 0, reading directly from the buffer at r2 (no stack staging). After
; each invocation the output pointer moves 8 bytes to the right and r2 advances
; by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
21520 cglobal intra_pred_ang32_27, 3,7,8 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21521 lea r3, [ang_table + 16 * 16] | |
; r1 doubled, r5 = 3*r1, r6 = output position for the current iteration, r4 = iteration count
21522 add r1, r1 | |
21523 lea r5, [r1 * 3] | |
21524 mov r6, r0 | |
21525 mov r4d, 8 | |
21526 | |
21527 .loop: | |
21528 MODE_9_27 0 | |
; step 8 bytes to the right and restart r0 from there
21529 add r6, 8 | |
21530 mov r0, r6 | |
21531 add r2, 8 | |
21532 dec r4 | |
21533 jnz .loop | |
21534 RET | |
21535 | |
21536 INIT_XMM sse4 | |
; intra_pred_ang32_28
; Invokes the MODE_8_28 macro (defined outside this excerpt) 8 times with
; argument 0, reading directly from the buffer at r2 (no stack staging). After
; each invocation the output pointer moves 8 bytes to the right and r2 advances
; by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
21537 cglobal intra_pred_ang32_28, 3,7,8 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21538 lea r3, [ang_table + 16 * 16] | |
; r1 doubled, r5 = 3*r1, r6 = output position for the current iteration, r4 = iteration count
21539 add r1, r1 | |
21540 lea r5, [r1 * 3] | |
21541 mov r6, r0 | |
21542 mov r4d, 8 | |
21543 | |
21544 .loop: | |
21545 MODE_8_28 0 | |
; step 8 bytes to the right and restart r0 from there
21546 add r6, 8 | |
21547 mov r0, r6 | |
21548 add r2, 8 | |
21549 dec r4 | |
21550 jnz .loop | |
21551 RET | |
21552 | |
21553 INIT_XMM sse4 | |
; intra_pred_ang32_29
; Invokes the MODE_7_29 macro (defined outside this excerpt) 8 times with
; argument 0, reading directly from the buffer at r2 (no stack staging). After
; each invocation the output pointer moves 8 bytes to the right and r2 advances
; by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
21554 cglobal intra_pred_ang32_29, 3,7,8 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21555 lea r3, [ang_table + 16 * 16] | |
; r1 doubled, r5 = 3*r1, r6 = output position for the current iteration, r4 = iteration count
21556 add r1, r1 | |
21557 lea r5, [r1 * 3] | |
21558 mov r6, r0 | |
21559 mov r4d, 8 | |
21560 | |
21561 .loop: | |
21562 MODE_7_29 0 | |
; step 8 bytes to the right and restart r0 from there
21563 add r6, 8 | |
21564 mov r0, r6 | |
21565 add r2, 8 | |
21566 dec r4 | |
21567 jnz .loop | |
21568 RET | |
21569 | |
21570 INIT_XMM sse4 | |
; intra_pred_ang32_30
; Invokes the MODE_6_30 macro (defined outside this excerpt) 8 times with
; argument 0, reading directly from the buffer at r2 (no stack staging). After
; each invocation the output pointer moves 8 bytes to the right and r2 advances
; by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
21571 cglobal intra_pred_ang32_30, 3,7,8 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21572 lea r3, [ang_table + 16 * 16] | |
; r1 doubled, r5 = 3*r1, r6 = output position for the current iteration, r4 = iteration count
21573 add r1, r1 | |
21574 lea r5, [r1 * 3] | |
21575 mov r6, r0 | |
21576 mov r4d, 8 | |
21577 | |
21578 .loop: | |
21579 MODE_6_30 0 | |
; step 8 bytes to the right and restart r0 from there
21580 add r6, 8 | |
21581 mov r0, r6 | |
21582 add r2, 8 | |
21583 dec r4 | |
21584 jnz .loop | |
21585 RET | |
21586 | |
21587 INIT_XMM sse4 | |
; intra_pred_ang32_31
; Invokes the MODE_5_31 macro (defined outside this excerpt) 8 times with
; argument 0, reading directly from the buffer at r2 (no stack staging). After
; each invocation the output pointer moves 8 bytes to the right and r2 advances
; by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
21588 cglobal intra_pred_ang32_31, 3,7,8 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21589 lea r3, [ang_table + 16 * 16] | |
; r1 doubled, r5 = 3*r1, r6 = output position for the current iteration, r4 = iteration count
21590 add r1, r1 | |
21591 lea r5, [r1 * 3] | |
21592 mov r6, r0 | |
21593 mov r4d, 8 | |
21594 | |
21595 .loop: | |
21596 MODE_5_31 0 | |
; step 8 bytes to the right and restart r0 from there
21597 add r6, 8 | |
21598 mov r0, r6 | |
21599 add r2, 8 | |
21600 dec r4 | |
21601 jnz .loop | |
21602 RET | |
21603 | |
21604 INIT_XMM sse4 | |
; intra_pred_ang32_32
; Invokes the MODE_4_32 macro (defined outside this excerpt) 8 times with
; argument 0, reading directly from the buffer at r2 (no stack staging). After
; each invocation the output pointer moves 8 bytes to the right and r2 advances
; by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
21605 cglobal intra_pred_ang32_32, 3,7,8 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21606 lea r3, [ang_table + 16 * 16] | |
; r1 doubled, r5 = 3*r1, r6 = output position for the current iteration, r4 = iteration count
21607 add r1, r1 | |
21608 lea r5, [r1 * 3] | |
21609 mov r6, r0 | |
21610 mov r4d, 8 | |
21611 | |
21612 .loop: | |
21613 MODE_4_32 0 | |
; step 8 bytes to the right and restart r0 from there
21614 add r6, 8 | |
21615 mov r0, r6 | |
21616 add r2, 8 | |
21617 dec r4 | |
21618 jnz .loop | |
21619 RET | |
21620 | |
21621 INIT_XMM sse4 | |
; intra_pred_ang32_33
; Invokes the MODE_3_33 macro (defined outside this excerpt) 8 times with
; argument 0, reading directly from the buffer at r2 (no stack staging). After
; each invocation the output pointer moves 8 bytes to the right and r2 advances
; by 8 bytes.
; NOTE(review): presumably r0 = dst, r1 = stride in samples, r2 = reference
; sample buffer -- confirm against the C prototype used by the caller.
21622 cglobal intra_pred_ang32_33, 3,7,8 | |
; r3 -> ang_table row for x = 16 (rows are 16 bytes each)
21623 lea r3, [ang_table + 16 * 16] | |
; r1 doubled, r5 = 3*r1, r6 = output position for the current iteration, r4 = iteration count
21624 add r1, r1 | |
21625 lea r5, [r1 * 3] | |
21626 mov r6, r0 | |
21627 mov r4d, 8 | |
21628 .loop: | |
21629 MODE_3_33 0 | |
; step 8 bytes to the right and restart r0 from there
21630 add r6, 8 | |
21631 mov r0, r6 | |
21632 add r2, 8 | |
21633 dec r4 | |
21634 jnz .loop | |
21635 RET | |
21636 | |
21637 ;----------------------------------------------------------------------------------- | |
21638 ; void intra_filter_NxN(const pixel* references, pixel* filtered) | |
21639 ;----------------------------------------------------------------------------------- | |
21640 INIT_XMM sse4 | |
; intra_filter_4x4
; r0 = references, r1 = filtered (see the prototype comment above this group).
; Smooths reference words 0..15 with a 3-tap filter,
;   out[i] = (in[i-1] + 2*in[i] + in[i+1] + [pw_2]) >> 2,
; then restores the two end words ("topLast" at byte offset 16, "LeftLast" at
; byte offset 32) unfiltered. pw_2 is defined outside this excerpt; by its name
; it is presumably a vector of words equal to 2 -- confirm.
21641 cglobal intra_filter_4x4, 2,4,5 | |
; keep the unfiltered end words for the final restore
21642 mov r2w, word [r0 + 16] ; topLast | |
21643 mov r3w, word [r0 + 32] ; LeftLast | |
21644 | |
21645 ; filtering top | |
21646 movu m0, [r0 + 0] | |
21647 movu m1, [r0 + 16] | |
21648 movu m2, [r0 + 32] | |
21649 | |
21650 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] | |
21651 palignr m3, m1, m0, 4 | |
21652 pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1] | |
21653 | |
; m0 = (2*m0 + m4 + m3 + pw_2) >> 2
21654 psllw m0, 1 | |
21655 paddw m4, m3 | |
21656 paddw m0, m4 | |
21657 paddw m0, [pw_2] | |
21658 psrlw m0, 2 | |
21659 | |
21660 ; filtering left | |
; m4 = m1 rotated up by one word, with lane 1 replaced by reference word 0
21661 palignr m4, m1, m1, 14 | |
21662 pinsrw m4, [r0], 1 | |
21663 palignr m3, m2, m1, 4 | |
21664 pshufb m3, [intra_filter4_shuf1] | |
21665 | |
; m1 = (2*m1 + m4 + m3 + pw_2) >> 2
21666 psllw m1, 1 | |
21667 paddw m4, m3 | |
21668 paddw m1, m4 | |
21669 paddw m1, [pw_2] | |
21670 psrlw m1, 2 | |
21671 | |
21672 movu [r1], m0 | |
21673 movu [r1 + 16], m1 | |
; overwrite the end positions with the saved unfiltered words
21674 mov [r1 + 16], r2w ; topLast | |
21675 mov [r1 + 32], r3w ; LeftLast | |
21676 RET | |
21677 | |
21678 INIT_XMM sse4 | |
; intra_filter_8x8
; r0 = references, r1 = filtered (see the prototype comment above this group).
; Smooths reference words 0..31 with a 3-tap filter,
;   out[i] = (in[i-1] + 2*in[i] + in[i+1] + [pw_2]) >> 2,
; then restores the two end words ("topLast" at byte offset 32, "LeftLast" at
; byte offset 64) unfiltered. pw_2 is defined outside this excerpt; by its name
; it is presumably a vector of words equal to 2 -- confirm.
21679 cglobal intra_filter_8x8, 2,4,6 | |
; keep the unfiltered end words for the final restore
21680 mov r2w, word [r0 + 32] ; topLast | |
21681 mov r3w, word [r0 + 64] ; LeftLast | |
21682 | |
21683 ; filtering top | |
21684 movu m0, [r0] | |
21685 movu m1, [r0 + 16] | |
21686 movu m2, [r0 + 32] | |
21687 | |
; m4 / m5 = neighbours of words 0..7; lane 0 of m5 is taken from word 17
21688 pshufb m4, m0, [intra_filter4_shuf0] | |
21689 palignr m5, m1, m0, 2 | |
21690 pinsrw m5, [r0 + 34], 0 | |
21691 | |
; m3 = words 7..14 (previous neighbours of words 8..15); taken before m0 is modified
21692 palignr m3, m1, m0, 14 | |
21693 psllw m0, 1 | |
21694 paddw m4, m5 | |
21695 paddw m0, m4 | |
21696 paddw m0, [pw_2] | |
21697 psrlw m0, 2 | |
21698 | |
; m4 = words 9..16 (next neighbours of words 8..15)
21699 palignr m4, m2, m1, 2 | |
21700 psllw m1, 1 | |
21701 paddw m4, m3 | |
21702 paddw m1, m4 | |
21703 paddw m1, [pw_2] | |
21704 psrlw m1, 2 | |
21705 movu [r1], m0 | |
21706 movu [r1 + 16], m1 | |
21707 | |
21708 ; filtering left | |
21709 movu m1, [r0 + 48] | |
21710 movu m0, [r0 + 64] | |
21711 | |
; m4 = m2 rotated up by one word, with lane 1 replaced by reference word 0
21712 palignr m4, m2, m2, 14 | |
21713 pinsrw m4, [r0], 1 | |
21714 palignr m5, m1, m2, 2 | |
21715 | |
; m3 = words 23..30, m0 = words 25..32: neighbours of words 24..31
21716 palignr m3, m1, m2, 14 | |
21717 palignr m0, m1, 2 | |
21718 | |
21719 psllw m2, 1 | |
21720 paddw m4, m5 | |
21721 paddw m2, m4 | |
21722 paddw m2, [pw_2] | |
21723 psrlw m2, 2 | |
21724 | |
21725 psllw m1, 1 | |
21726 paddw m0, m3 | |
21727 paddw m1, m0 | |
21728 paddw m1, [pw_2] | |
21729 psrlw m1, 2 | |
21730 | |
21731 movu [r1 + 32], m2 | |
21732 movu [r1 + 48], m1 | |
; overwrite the end positions with the saved unfiltered words
21733 mov [r1 + 32], r2w ; topLast | |
21734 mov [r1 + 64], r3w ; LeftLast | |
21735 RET | |
21736 | |
21737 INIT_XMM sse4 | |
; intra_filter_16x16
; r0 = references, r1 = filtered (see the prototype comment above this group).
; Smooths reference words 0..63 with a 3-tap filter,
;   out[i] = (in[i-1] + 2*in[i] + in[i+1] + [pw_2]) >> 2,
; eight words at a time, then restores the two end words ("topLast" at byte
; offset 64, "LeftLast" at byte offset 128) unfiltered. pw_2 is defined outside
; this excerpt; by its name it is presumably a vector of words equal to 2 --
; confirm.
; Throughout, palignr ..., 14 forms the previous-neighbour vector and
; palignr ..., 2 the next-neighbour vector from two adjacent source vectors.
21738 cglobal intra_filter_16x16, 2,4,6 | |
; keep the unfiltered end words for the final restore
21739 mov r2w, word [r0 + 64] ; topLast | |
21740 mov r3w, word [r0 + 128] ; LeftLast | |
21741 | |
21742 ; filtering top | |
21743 movu m0, [r0] | |
21744 movu m1, [r0 + 16] | |
21745 movu m2, [r0 + 32] | |
21746 | |
; words 0..7: lane 0 of the next-neighbour vector is taken from word 33
21747 pshufb m4, m0, [intra_filter4_shuf0] | |
21748 palignr m5, m1, m0, 2 | |
21749 pinsrw m5, [r0 + 66], 0 | |
21750 | |
21751 palignr m3, m1, m0, 14 | |
21752 psllw m0, 1 | |
21753 paddw m4, m5 | |
21754 paddw m0, m4 | |
21755 paddw m0, [pw_2] | |
21756 psrlw m0, 2 | |
21757 | |
; words 8..15 (result in m5; m1 is kept unmodified for the next step)
21758 palignr m4, m2, m1, 2 | |
21759 psllw m5, m1, 1 | |
21760 paddw m4, m3 | |
21761 paddw m5, m4 | |
21762 paddw m5, [pw_2] | |
21763 psrlw m5, 2 | |
21764 movu [r1], m0 | |
21765 movu [r1 + 16], m5 | |
21766 | |
21767 movu m0, [r0 + 48] | |
21768 movu m5, [r0 + 64] | |
21769 | |
; words 16..23
21770 palignr m3, m2, m1, 14 | |
21771 palignr m4, m0, m2, 2 | |
21772 | |
21773 psllw m1, m2, 1 | |
21774 paddw m3, m4 | |
21775 paddw m1, m3 | |
21776 paddw m1, [pw_2] | |
21777 psrlw m1, 2 | |
21778 | |
; words 24..31
21779 palignr m3, m0, m2, 14 | |
21780 palignr m4, m5, m0, 2 | |
21781 | |
21782 psllw m0, 1 | |
21783 paddw m4, m3 | |
21784 paddw m0, m4 | |
21785 paddw m0, [pw_2] | |
21786 psrlw m0, 2 | |
21787 movu [r1 + 32], m1 | |
21788 movu [r1 + 48], m0 | |
21789 | |
21790 ; filtering left | |
21791 movu m1, [r0 + 80] | |
21792 movu m2, [r0 + 96] | |
21793 | |
; words 32..39: m4 = m5 rotated up by one word, with lane 1 replaced by reference word 0
21794 palignr m4, m5, m5, 14 | |
21795 pinsrw m4, [r0], 1 | |
21796 palignr m0, m1, m5, 2 | |
21797 | |
21798 psllw m3, m5, 1 | |
21799 paddw m4, m0 | |
21800 paddw m3, m4 | |
21801 paddw m3, [pw_2] | |
21802 psrlw m3, 2 | |
21803 | |
; words 40..47
21804 palignr m0, m1, m5, 14 | |
21805 palignr m4, m2, m1, 2 | |
21806 | |
21807 psllw m5, m1, 1 | |
21808 paddw m4, m0 | |
21809 paddw m5, m4 | |
21810 paddw m5, [pw_2] | |
21811 psrlw m5, 2 | |
21812 movu [r1 + 64], m3 | |
21813 movu [r1 + 80], m5 | |
21814 | |
21815 movu m5, [r0 + 112] | |
21816 movu m0, [r0 + 128] | |
21817 | |
; words 48..55
21818 palignr m3, m2, m1, 14 | |
21819 palignr m4, m5, m2, 2 | |
21820 | |
21821 psllw m1, m2, 1 | |
21822 paddw m3, m4 | |
21823 paddw m1, m3 | |
21824 paddw m1, [pw_2] | |
21825 psrlw m1, 2 | |
21826 | |
; words 56..63
21827 palignr m3, m5, m2, 14 | |
21828 palignr m4, m0, m5, 2 | |
21829 | |
21830 psllw m5, 1 | |
21831 paddw m4, m3 | |
21832 paddw m5, m4 | |
21833 paddw m5, [pw_2] | |
21834 psrlw m5, 2 | |
21835 movu [r1 + 96], m1 | |
21836 movu [r1 + 112], m5 | |
21837 | |
; overwrite the end positions with the saved unfiltered words
21838 mov [r1 + 64], r2w ; topLast | |
21839 mov [r1 + 128], r3w ; LeftLast | |
21840 RET | |
21841 | |
; intra_filter_32x32 (SSE4)
; Writes to r1 a smoothed copy of the 129 16-bit words found at r0:
;     out[i] = (in[i - 1] + 2 * in[i] + in[i + 1] + 2) >> 2
; r0 = source (only read), r1 = destination (only written).
; Layout as used by the code below: words 1..64 are the "top" run and words
; 65..128 the "left" run; word 0 is used as the neighbour of both word 1 and
; word 65. Words 64 (topLast) and 128 (LeftLast) are copied through unfiltered.
; The "N to M" comments name the output word indices stored by each section.
; Every section follows the same pattern: palignr ..., 14 builds in[i - 1],
; palignr ..., 2 builds in[i + 1], psllw builds 2 * in[i].
; cglobal arguments 2,4,6: presumably 2 parameters, 4 GPRs, 6 vector registers
; (x86inc convention; the macro is defined outside this chunk).
21842 INIT_XMM sse4 | |
21843 cglobal intra_filter_32x32, 2,4,6 | |
; Save the two end words before filtering; they are written back at the end.
21844 mov r2w, word [r0 + 128] ; topLast | |
21845 mov r3w, word [r0 + 256] ; LeftLast | |
21846 | |
21847 ; filtering top | |
21848 ; 0 to 15 | |
21849 movu m0, [r0 + 0] | |
21850 movu m1, [r0 + 16] | |
21851 movu m2, [r0 + 32] | |
21852 | |
; NOTE(review): intra_filter4_shuf0 is defined outside this chunk; it presumably
; yields in[i - 1] for lanes 1..7 and in[1] for lane 0 - confirm against the table.
21853 pshufb m4, m0, [intra_filter4_shuf0] | |
21854 palignr m5, m1, m0, 2 | |
; Lane 0 of the in[i + 1] vector is replaced by word 65 (first word of the left
; run), so out[0] mixes word 0 with word 65 and whatever the shuffle put in lane 0.
21855 pinsrw m5, [r0 + 130], 0 | |
21856 | |
; m3 = in[i - 1] for words 8..15; taken now because m0 is overwritten next.
21857 palignr m3, m1, m0, 14 | |
21858 psllw m0, 1 | |
21859 paddw m4, m5 | |
21860 paddw m0, m4 | |
21861 paddw m0, [pw_2] | |
21862 psrlw m0, 2 | |
21863 | |
21864 palignr m4, m2, m1, 2 | |
21865 psllw m5, m1, 1 | |
21866 paddw m4, m3 | |
21867 paddw m5, m4 | |
21868 paddw m5, [pw_2] | |
21869 psrlw m5, 2 | |
21870 movu [r1], m0 | |
21871 movu [r1 + 16], m5 | |
21872 | |
21873 ; 16 to 31 | |
21874 movu m0, [r0 + 48] | |
21875 movu m5, [r0 + 64] | |
21876 | |
21877 palignr m3, m2, m1, 14 | |
21878 palignr m4, m0, m2, 2 | |
21879 | |
21880 psllw m1, m2, 1 | |
21881 paddw m3, m4 | |
21882 paddw m1, m3 | |
21883 paddw m1, [pw_2] | |
21884 psrlw m1, 2 | |
21885 | |
21886 palignr m3, m0, m2, 14 | |
21887 palignr m4, m5, m0, 2 | |
21888 | |
21889 psllw m2, m0, 1 | |
21890 paddw m4, m3 | |
21891 paddw m2, m4 | |
21892 paddw m2, [pw_2] | |
21893 psrlw m2, 2 | |
21894 movu [r1 + 32], m1 | |
21895 movu [r1 + 48], m2 | |
21896 | |
21897 ; 32 to 47 | |
21898 movu m1, [r0 + 80] | |
21899 movu m2, [r0 + 96] | |
21900 | |
21901 palignr m3, m5, m0, 14 | |
21902 palignr m4, m1, m5, 2 | |
21903 | |
21904 psllw m0, m5, 1 | |
21905 paddw m3, m4 | |
21906 paddw m0, m3 | |
21907 paddw m0, [pw_2] | |
21908 psrlw m0, 2 | |
21909 | |
21910 palignr m3, m1, m5, 14 | |
21911 palignr m4, m2, m1, 2 | |
21912 | |
21913 psllw m5, m1, 1 | |
21914 paddw m4, m3 | |
21915 paddw m5, m4 | |
21916 paddw m5, [pw_2] | |
21917 psrlw m5, 2 | |
21918 movu [r1 + 64], m0 | |
21919 movu [r1 + 80], m5 | |
21920 | |
21921 ; 48 to 63 | |
21922 movu m0, [r0 + 112] | |
; m5 = words 64..71; it supplies in[i + 1] for word 63 here and is reused as
; the first vector of the left section below.
21923 movu m5, [r0 + 128] | |
21924 | |
21925 palignr m3, m2, m1, 14 | |
21926 palignr m4, m0, m2, 2 | |
21927 | |
21928 psllw m1, m2, 1 | |
21929 paddw m3, m4 | |
21930 paddw m1, m3 | |
21931 paddw m1, [pw_2] | |
21932 psrlw m1, 2 | |
21933 | |
21934 palignr m3, m0, m2, 14 | |
21935 palignr m4, m5, m0, 2 | |
21936 | |
21937 psllw m0, 1 | |
21938 paddw m4, m3 | |
21939 paddw m0, m4 | |
21940 paddw m0, [pw_2] | |
21941 psrlw m0, 2 | |
21942 movu [r1 + 96], m1 | |
21943 movu [r1 + 112], m0 | |
21944 | |
21945 ; filtering left | |
21946 ; 64 to 79 | |
21947 movu m1, [r0 + 144] | |
21948 movu m2, [r0 + 160] | |
21949 | |
; Rotate m5 (words 64..71) up by one word to form in[i - 1], then patch lane 1
; with word 0: the predecessor of word 65 is word 0, not word 64.
; Lane 0 of this result (word 64) is replaced by topLast at the end.
21950 palignr m4, m5, m5, 14 | |
21951 pinsrw m4, [r0], 1 | |
21952 palignr m0, m1, m5, 2 | |
21953 | |
21954 psllw m3, m5, 1 | |
21955 paddw m4, m0 | |
21956 paddw m3, m4 | |
21957 paddw m3, [pw_2] | |
21958 psrlw m3, 2 | |
21959 | |
21960 palignr m0, m1, m5, 14 | |
21961 palignr m4, m2, m1, 2 | |
21962 | |
21963 psllw m5, m1, 1 | |
21964 paddw m4, m0 | |
21965 paddw m5, m4 | |
21966 paddw m5, [pw_2] | |
21967 psrlw m5, 2 | |
21968 movu [r1 + 128], m3 | |
21969 movu [r1 + 144], m5 | |
21970 | |
21971 ; 80 to 95 | |
21972 movu m5, [r0 + 176] | |
21973 movu m0, [r0 + 192] | |
21974 | |
21975 palignr m3, m2, m1, 14 | |
21976 palignr m4, m5, m2, 2 | |
21977 | |
21978 psllw m1, m2, 1 | |
21979 paddw m3, m4 | |
21980 paddw m1, m3 | |
21981 paddw m1, [pw_2] | |
21982 psrlw m1, 2 | |
21983 | |
21984 palignr m3, m5, m2, 14 | |
21985 palignr m4, m0, m5, 2 | |
21986 | |
21987 psllw m2, m5, 1 | |
21988 paddw m4, m3 | |
21989 paddw m2, m4 | |
21990 paddw m2, [pw_2] | |
21991 psrlw m2, 2 | |
21992 movu [r1 + 160], m1 | |
21993 movu [r1 + 176], m2 | |
21994 | |
21995 ; 96 to 111 | |
21996 movu m1, [r0 + 208] | |
21997 movu m2, [r0 + 224] | |
21998 | |
21999 palignr m3, m0, m5, 14 | |
22000 palignr m4, m1, m0, 2 | |
22001 | |
22002 psllw m5, m0, 1 | |
22003 paddw m3, m4 | |
22004 paddw m5, m3 | |
22005 paddw m5, [pw_2] | |
22006 psrlw m5, 2 | |
22007 | |
22008 palignr m3, m1, m0, 14 | |
22009 palignr m4, m2, m1, 2 | |
22010 | |
22011 psllw m0, m1, 1 | |
22012 paddw m4, m3 | |
22013 paddw m0, m4 | |
22014 paddw m0, [pw_2] | |
22015 psrlw m0, 2 | |
22016 movu [r1 + 192], m5 | |
22017 movu [r1 + 208], m0 | |
22018 | |
22019 ; 112 to 127 | |
22020 movu m5, [r0 + 240] | |
; Only the low word of this load (word 128) is consumed, as in[i + 1] of word 127.
; NOTE(review): the 16-byte load touches source bytes 256..271 - confirm the
; caller's buffer extends that far.
22021 movu m0, [r0 + 256] | |
22022 | |
22023 palignr m3, m2, m1, 14 | |
22024 palignr m4, m5, m2, 2 | |
22025 | |
22026 psllw m1, m2, 1 | |
22027 paddw m3, m4 | |
22028 paddw m1, m3 | |
22029 paddw m1, [pw_2] | |
22030 psrlw m1, 2 | |
22031 | |
22032 palignr m3, m5, m2, 14 | |
22033 palignr m4, m0, m5, 2 | |
22034 | |
22035 psllw m5, 1 | |
22036 paddw m4, m3 | |
22037 paddw m5, m4 | |
22038 paddw m5, [pw_2] | |
22039 psrlw m5, 2 | |
22040 movu [r1 + 224], m1 | |
22041 movu [r1 + 240], m5 | |
22042 | |
; Word 64 was stored filtered by the [r1 + 128] vector store and word 128 was
; never stored; both now receive the original values saved on entry.
22043 mov [r1 + 128], r2w ; topLast | |
22044 mov [r1 + 256], r3w ; LeftLast | |
22045 RET | |
22046 | |
; intra_filter_4x4 (AVX2)
; Writes to r1 a smoothed copy of the 17 16-bit words found at r0:
;     out[i] = (in[i - 1] + 2 * in[i] + in[i + 1] + 2) >> 2
; r0 = source (only read), r1 = destination (only written).
; Words 0..15 are filtered in a single pass and stored with one vector store;
; words 8 (topLast) and 16 (LeftLast) are then written back unfiltered.
; Lane diagrams list the low 128-bit lane first, each lane written from its
; highest word down to its lowest; palignr/pshufb operate on each lane separately.
22047 INIT_YMM avx2 | |
22048 cglobal intra_filter_4x4, 2,4,4 | |
; Save the two end words before filtering; they are written back at the end.
22049 mov r2w, word [r0 + 16] ; topLast | |
22050 mov r3w, word [r0 + 32] ; LeftLast | |
22051 | |
22052 ; filtering top | |
22053 movu m0, [r0] | |
; m2 = word 0 replicated to every lane; it supplies the predecessor word that
; palignr shifts into the bottom of each 128-bit lane below.
22054 vpbroadcastw m2, xm0 | |
; NOTE(review): this load covers source bytes 16..47, although only words
; 8, 9, 16 and 17 of it are consumed - confirm the caller's buffer extends that far.
22055 movu m1, [r0 + 16] | |
22056 | |
22057 palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0] | |
22058 pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1] | |
22059 palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10] | |
22060 palignr m1, m1, 14 ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1] (each lane rotated up by one word) | |
22061 | |
; Lowest word of the low lane (word 0) sums samples 1, 0 (doubled) and 9.
; Lowest word of the high lane (word 8) is not meaningful; it is replaced by
; topLast after the store.
22062 psllw m0, 1 | |
22063 paddw m3, m1 | |
22064 paddw m0, m3 | |
22065 paddw m0, [pw_2] | |
22066 psrlw m0, 2 | |
22067 | |
22068 movu [r1], m0 | |
22069 mov [r1 + 16], r2w ; topLast | |
22070 mov [r1 + 32], r3w ; LeftLast | |
22071 RET | |