comparison x265/source/common/x86/intrapred16.asm @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:772086c29cc7
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
5 ;* Yuvaraj Venkatesh <yuvaraj@multicorewareinc.com>
6 ;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
7 ;*
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
12 ;*
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;*
22 ;* This program is also available under a commercial proprietary license.
23 ;* For more information, contact us at license @ x265.com.
24 ;*****************************************************************************/
25
26 %include "x86inc.asm"
27 %include "x86util.asm"
28
29 SECTION_RODATA 32
30
; ang_table: 32 rows of 16 bytes. Row x is the word pair (32 - x, x) repeated
; four times; CALC_4x4 indexes it as [ang_table + frac * 16] for pmaddwd.
const ang_table
%assign x 0
%rep 32
    times 4 dw (32 - x), x
    %assign x x + 1
%endrep

; ang_table_avx2: same weights, eight pairs (32 bytes) per row.
const ang_table_avx2
%assign x 0
%rep 32
    times 8 dw (32 - x), x
    %assign x x + 1
%endrep
44
; Byte-index tables (16 entries each unless noted). Entries come in
; (2k, 2k + 1) pairs, i.e. each pair names one 16-bit lane.
const pw_ang16_12_24,   db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 14, 15,  0,  1,  0,  1
const pw_ang16_13_23,   db  2,  3,  2,  3, 14, 15, 14, 15,  6,  7,  6,  7,  0,  1,  0,  1
const pw_ang16_14_22,   db  2,  3,  2,  3, 10, 11, 10, 11,  6,  7,  6,  7,  0,  1,  0,  1
const pw_ang16_15_21,   db 12, 13, 12, 13,  8,  9,  8,  9,  4,  5,  4,  5,  0,  1,  0,  1
const pw_ang16_16_20,   db  8,  9,  8,  9,  6,  7,  6,  7,  2,  3,  2,  3,  0,  1,  0,  1

const pw_ang32_12_24,   db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
const pw_ang32_13_23,   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  6,  7,  0,  1
const pw_ang32_14_22,   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 11,  6,  7,  0,  1
const pw_ang32_15_21,   db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
const pw_ang32_16_20,   db  0,  0,  0,  0,  0,  0,  0,  0,  8,  9,  6,  7,  2,  3,  0,  1
const pw_ang32_17_19_0, db  0,  0,  0,  0, 12, 13, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1

const shuf_mode_13_23,  db  0,  0, 14, 15,  6,  7,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_14_22,  db 14, 15, 10, 11,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_15_21,  db 12, 13,  8,  9,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
const shuf_mode_16_20,  db  2,  3,  0,  1, 14, 15, 12, 13,  8,  9,  6,  7,  2,  3,  0,  1
const shuf_mode_17_19,  db  0,  1, 14, 15, 12, 13, 10, 11,  6,  7,  4,  5,  2,  3,  0,  1
const shuf_mode32_18,   db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
const pw_punpcklwd,     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
const c_mode32_10_0,    times 8 db 0, 1             ; lane 0 replicated into all eight lanes

const pw_ang8_12,       db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  0,  1
const pw_ang8_13,       db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  8,  9,  0,  1
const pw_ang8_14,       db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 10, 11,  4,  5,  0,  1
const pw_ang8_15,       db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
const pw_ang8_16,       db  0,  0,  0,  0,  0,  0, 12, 13, 10, 11,  6,  7,  4,  5,  0,  1
const pw_ang8_17,       db  0,  0, 14, 15, 12, 13, 10, 11,  8,  9,  4,  5,  2,  3,  0,  1
; 32 bytes: the 16-byte lane-reversal pattern emitted twice
const pw_swap16,        times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1

const pw_ang16_13,      db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
const pw_ang16_16,      db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
77
; File-local byte-index tables (not exported through `const`).
intra_filter4_shuf0:    db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
intra_filter4_shuf1:    db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
intra_filter4_shuf2:    times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

;; (blkSize - 1 - x) for blkSize = 4, stored twice
pw_planar4_0:           times 2 dw 3, 2, 1, 0

; dwords 31, 30, ..., 0
const planar32_table
%assign x 31
%rep 32
    dd x
    %assign x x - 1
%endrep

; dwords 1, 2, ..., 32
const planar32_table1
%assign x 1
%rep 32
    dd x
    %assign x x + 1
%endrep
98
99 SECTION .text
100
101 cextern pb_01
102 cextern pw_1
103 cextern pw_2
104 cextern pw_3
105 cextern pw_7
106 cextern pw_4
107 cextern pw_8
108 cextern pw_15
109 cextern pw_16
110 cextern pw_31
111 cextern pw_32
112 cextern pd_16
113 cextern pd_31
114 cextern pd_32
115 cextern pw_4096
116 cextern pw_pixel_max
117 cextern multiL
118 cextern multiH
119 cextern multiH2
120 cextern multiH3
121 cextern multi_2Row
122 cextern pw_swap
123 cextern pb_unpackwq1
124 cextern pb_unpackwq2
125 cextern pw_planar16_mul
126 cextern pw_planar32_mul
127
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;
; 4x4 DC prediction, 16-bit pixels.
;   r0 = dst, r1 = dstStride (consecutive rows are r1 * 2 bytes apart)
;   r2 = srcPix; the smoothing code below uses [r2 + 2] as the top neighbours
;        and [r2 + 18] as the left neighbours
;   r4 = filter flag (non-zero: smooth the first row and first column)
;-----------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc4, 5,6,2
    movh        m1, [r2 + 2]            ; four top neighbours
    movh        m0, [r2 + 18]           ; four left neighbours
    lea         r5, [r0 + r1 * 4]       ; r5 -> row 2

    ; horizontal add: after these steps each of the low four words = sum of all 8
    paddw       m0, m1
    pshuflw     m1, m0, 0x4E
    paddw       m0, m1
    pshuflw     m1, m0, 0xB1
    paddw       m0, m1

    paddw       m0, [pw_4]
    psrlw       m0, 3                   ; dc = (sum + 4) >> 3

    ; plain DC fill of the 4x4 block
    movh        [r0], m0
    movh        [r0 + r1 * 2], m0
    movh        [r5], m0
    movh        [r5 + r1 * 2], m0

    test        r4d, r4d
    jz          .end

    ; smoothing: r3w = 2 * dc + 2, m0 = 3 * dc + 2
    movh        m1, m0
    psllw       m1, 1
    paddw       m1, [pw_2]
    movd        r3d, m1
    paddw       m0, m1

    ; first row: (top[x] + 3 * dc + 2) >> 2
    movh        m1, [r2 + 2]
    paddw       m1, m0
    psrlw       m1, 2
    movh        [r0], m1                ; pixel (0,0) is rewritten just below

    ; corner: (top[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r3d, r3w
    movzx       r4d, word [r2 + 18]
    add         r3d, r4d
    movzx       r4d, word [r2 + 2]
    add         r4d, r3d
    shr         r4d, 2
    mov         [r0], r4w

    ; first column, rows 1..3: (left[y] + 3 * dc + 2) >> 2
    movu        m1, [r2 + 20]
    paddw       m1, m0
    psrlw       m1, 2
    movd        r3d, m1
    mov         [r0 + r1 * 2], r3w
    shr         r3d, 16
    mov         [r0 + r1 * 4], r3w
    pextrw      r3d, m1, 2
    mov         [r5 + r1 * 2], r3w
.end:
    RET
188
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;
; 8x8 DC prediction, 16-bit pixels.
;   r0 = dst, r1 = dstStride (rows are r1 * 2 bytes apart)
;   r2 = srcPix; [r2 + 2] is used as the top run, [r2 + 34] as the left run
;   r4 = filter flag
; Uses r6/r7 and 64-bit movh to a GPR, so this body needs a 64-bit build.
;-----------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc8, 5, 8, 2
    movu        m1, [r2 + 2]            ; eight top neighbours
    movu        m0, [r2 + 34]           ; eight left neighbours

    ; reduce 16 samples to one dword
    paddw       m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    pshufd      m1, m0, 1
    paddw       m0, m1
    pmaddwd     m0, [pw_1]

    paddw       m0, [pw_8]
    psrlw       m0, 4                   ; dc = (sum + 8) >> 4
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0               ; dc in all eight words

    ; byte offsets of rows 3, 5 and 7 (row n lives at n * 2 * r1)
    lea         r6, [r1 + r1 * 4]
    lea         r6, [r6 + r1]           ; r6 = 6 * r1  -> row 3
    lea         r5, [r6 + r1 * 4]       ; r5 = 10 * r1 -> row 5
    lea         r7, [r6 + r1 * 8]       ; r7 = 14 * r1 -> row 7

    ; plain DC fill, rows 0..7
    movu        [r0], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 4], m0
    movu        [r0 + r6], m0
    movu        [r0 + r1 * 8], m0
    movu        [r0 + r5], m0
    movu        [r0 + r6 * 2], m0
    movu        [r0 + r7], m0

    test        r4d, r4d
    jz          .end

    ; smoothing constants
    mova        m1, [pw_2]
    pmullw      m1, m0
    paddw       m1, [pw_2]
    movd        r4d, m1                 ; r4w = 2 * dc + 2
    paddw       m1, m0                  ; 3 * dc + 2
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0

    ; first row
    movu        m0, [r2 + 2]
    paddw       m0, m1
    psrlw       m0, 2
    movu        [r0], m0

    ; corner: (top[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r4d, r4w
    movzx       r3d, word [r2 + 34]
    add         r4d, r3d
    movzx       r3d, word [r2 + 2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3w

    ; first column, rows 1..7 (loads left[1..8]; the last lane is unused)
    movu        m0, [r2 + 36]
    paddw       m0, m1
    psrlw       m0, 2
    movh        r3, m0                  ; lanes 0..3 -> rows 1..4
    mov         [r0 + r1 * 2], r3w
    shr         r3, 16
    mov         [r0 + r1 * 4], r3w
    shr         r3, 16
    mov         [r0 + r6], r3w
    shr         r3, 16
    mov         [r0 + r1 * 8], r3w
    pshufd      m0, m0, 0x6E            ; bring lanes 4..7 into the low qword
    movh        r3, m0                  ; lanes 4..6 -> rows 5..7
    mov         [r0 + r5], r3w
    shr         r3, 16
    mov         [r0 + r6 * 2], r3w
    shr         r3, 16
    mov         [r0 + r7], r3w
.end:
    RET
271
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 16x16 DC prediction, 16-bit pixels (SSE2, needs r6-r9 so 64-bit only).
;   r0 = dst, r1 = dstStride in pixels (doubled to bytes on entry)
;   r2 = srcPix; [r2 + 2] is used as the top run, [r2 + 66] as the left run
;   r4 = bFilter (non-zero: smooth the first row and first column)
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc16, 5, 10, 4
    lea         r3, [r2 + 66]           ; r3 -> left run
    add         r1, r1                  ; stride in bytes
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r2 + 2]
    movu        m3, [r2 + 18]

    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2
    HADDUW      m0, m1
    paddd       m0, [pd_16]
    psrld       m0, 5                   ; dc = (sum + 16) >> 5

    pshuflw     m0, m0, 0               ; m0 = word [dc_val ...]
    pshufd      m0, m0, 0

    test        r4d, r4d                ; flags are consumed by the jz after the fill

    ; store DC 16x16 (none of the lea/movu below touch flags)
    lea         r6, [r1 + r1 * 2]       ; row 3 offset
    lea         r7, [r1 + r1 * 4]       ; row 5 offset
    lea         r8, [r6 + r1 * 4]       ; row 7 offset
    lea         r9, [r0 + r8]           ; pointer to row 7
    movu        [r0], m0
    movu        [r0 + 16], m0
    movu        [r0 + r1], m0
    movu        [r0 + 16 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 2 + 16], m0
    movu        [r0 + r6], m0
    movu        [r0 + r6 + 16], m0
    movu        [r0 + r1 * 4], m0
    movu        [r0 + r1 * 4 + 16], m0
    movu        [r0 + r7], m0
    movu        [r0 + r7 + 16], m0
    movu        [r0 + r6 * 2], m0
    movu        [r0 + r6 * 2 + 16], m0
    movu        [r9], m0
    movu        [r9 + 16], m0
    movu        [r0 + r1 * 8], m0
    movu        [r0 + r1 * 8 + 16], m0
    movu        [r9 + r1 * 2], m0
    movu        [r9 + r1 * 2 + 16], m0
    movu        [r0 + r7 * 2], m0
    movu        [r0 + r7 * 2 + 16], m0
    movu        [r9 + r1 * 4], m0
    movu        [r9 + r1 * 4 + 16], m0
    movu        [r0 + r6 * 4], m0
    movu        [r0 + r6 * 4 + 16], m0
    movu        [r9 + r6 * 2], m0
    movu        [r9 + r6 * 2 + 16], m0
    movu        [r9 + r8], m0
    movu        [r9 + r8 + 16], m0
    movu        [r9 + r1 * 8], m0
    movu        [r9 + r1 * 8 + 16], m0

    ; Do DC Filter
    jz          .end
    mova        m1, [pw_2]
    pmullw      m1, m0
    paddw       m1, [pw_2]
    movd        r4d, m1                 ; r4w = 2 * dc + 2
    paddw       m1, m0                  ; m1 = 3 * dc + 2 in every word

    ; filter top: (top[x] + 3 * dc + 2) >> 2
    movu        m2, [r2 + 2]
    paddw       m2, m1
    psrlw       m2, 2
    movu        [r0], m2
    movu        m3, [r2 + 18]
    paddw       m3, m1
    psrlw       m3, 2
    movu        [r0 + 16], m3

    ; filter top-left: (top[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r4d, r4w
    movzx       r5d, word [r3]
    add         r4d, r5d
    movzx       r5d, word [r2 + 2]
    add         r5d, r4d
    shr         r5d, 2
    mov         [r0], r5w

    ; filter left, rows 1..8 (r2 is free from here on and reused as scratch)
    movu        m2, [r3 + 2]
    paddw       m2, m1
    psrlw       m2, 2

    movq        r2, m2
    pshufd      m2, m2, 0xEE            ; upper qword -> lower qword
    mov         [r0 + r1], r2w
    shr         r2, 16
    mov         [r0 + r1 * 2], r2w
    shr         r2, 16
    mov         [r0 + r6], r2w
    shr         r2, 16
    mov         [r0 + r1 * 4], r2w
    movq        r2, m2
    mov         [r0 + r7], r2w
    shr         r2, 16
    mov         [r0 + r6 * 2], r2w
    shr         r2, 16
    mov         [r9], r2w
    shr         r2, 16
    mov         [r0 + r1 * 8], r2w

    ; filter left, rows 9..15 (the eighth lane loaded here is not stored)
    movu        m3, [r3 + 18]
    paddw       m3, m1
    psrlw       m3, 2

    movq        r3, m3
    pshufd      m3, m3, 0xEE
    mov         [r9 + r1 * 2], r3w
    shr         r3, 16
    mov         [r0 + r7 * 2], r3w
    shr         r3, 16
    mov         [r9 + r1 * 4], r3w
    shr         r3, 16
    mov         [r0 + r6 * 4], r3w
    movq        r3, m3
    mov         [r9 + r6 * 2], r3w
    shr         r3, 16
    mov         [r9 + r8], r3w
    shr         r3, 16
    mov         [r9 + r1 * 8], r3w
.end:
    RET
406
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 32x32 DC prediction, 16-bit pixels (SSE2). Only the first three arguments
; are read; there is no smoothing pass.
;   r0 = dst, r1 = dstStride in pixels (doubled to bytes on entry), r2 = srcPix
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc32, 3, 4, 6
    lea         r3, [r2 + 130]          ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)
    add         r1, r1                  ; stride in bytes

    ; 32 samples starting at srcPix + 130
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r3 + 32]
    movu        m3, [r3 + 48]
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2
    HADDUWD     m0, m1

    ; 32 samples starting at srcPix + 2
    movu        m1, [r2 + 2]
    movu        m2, [r2 + 18]
    movu        m3, [r2 + 34]
    movu        m4, [r2 + 50]
    paddw       m1, m2
    paddw       m3, m4
    paddw       m1, m3
    HADDUWD     m1, m2

    paddd       m0, m1
    HADDD       m0, m1
    paddd       m0, [pd_32]             ; sum = sum + 32
    psrld       m0, 6                   ; sum = sum / 64
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0               ; dc in all eight words

    lea         r2, [r1 * 3]            ; r2 = byte offset of row 3

    ; store DC 32x32: eight groups of four rows, 64 bytes per row
%assign x 1
%rep 8
    %assign dc32_col 0
    %rep 4
    movu        [r0 + dc32_col], m0
    movu        [r0 + r1 + dc32_col], m0
    movu        [r0 + r1 * 2 + dc32_col], m0
    movu        [r0 + r2 + dc32_col], m0
    %assign dc32_col dc32_col + 16
    %endrep
%if x < 8
    lea         r0, [r0 + r1 * 4]
%endif
%assign x x + 1
%endrep
    RET
466
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 16x16 DC prediction, 16-bit pixels (AVX2, needs r6-r8 so 64-bit only).
;   r0 = dst, r1 = dstStride in pixels (doubled to bytes on entry)
;   r2 = srcPix; [r2 + 2] is used as the top run, [r2 + 66] as the left run
;   bFilter is fetched from r4m into r3d; r4 itself is reused as a row pointer.
;---------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_dc16, 3, 9, 4
    mov             r3d, r4m                ; r3d = bFilter
    add             r1, r1                  ; stride in bytes; full width, dstStride is intptr_t
    movu            m0, [r2 + 66]
    movu            m2, [r2 + 2]
    paddw           m0, m2                  ; dynamic range 13 bits

    vextracti128    xm1, m0, 1
    paddw           xm0, xm1                ; dynamic range 14 bits
    movhlps         xm1, xm0
    paddw           xm0, xm1                ; dynamic range 15 bits
    pmaddwd         xm0, [pw_1]
    phaddd          xm0, xm0
    paddd           xm0, [pd_16]
    psrld           xm0, 5                  ; dc = (sum + 16) >> 5
    vpbroadcastw    m0, xm0

    test            r3d, r3d                ; flags are consumed by the jz after the fill

    ; store DC 16x16 (lea/movu leave the flags alone)
    lea             r6, [r1 + r1 * 2]       ; row 3 offset
    lea             r7, [r1 + r1 * 4]       ; row 5 offset
    lea             r8, [r6 + r1 * 4]       ; row 7 offset
    lea             r4, [r0 + r8 * 1]       ; pointer to row 7

    movu            [r0], m0
    movu            [r0 + r1], m0
    movu            [r0 + r1 * 2], m0
    movu            [r0 + r6], m0
    movu            [r0 + r1 * 4], m0
    movu            [r0 + r7], m0
    movu            [r0 + r6 * 2], m0
    movu            [r4], m0
    movu            [r0 + r1 * 8], m0
    movu            [r4 + r1 * 2], m0
    movu            [r0 + r7 * 2], m0
    movu            [r4 + r1 * 4], m0
    movu            [r0 + r6 * 4], m0
    movu            [r4 + r6 * 2], m0
    movu            [r4 + r8], m0
    movu            [r4 + r1 * 8], m0

    ; Do DC Filter
    jz              .end
    mova            m1, [pw_2]
    pmullw          m1, m0
    paddw           m1, [pw_2]
    movd            r3d, xm1                ; r3w = 2 * dc + 2
    paddw           m1, m0                  ; m1 = 3 * dc + 2 in every word

    ; filter top: (top[x] + 3 * dc + 2) >> 2
    movu            m2, [r2 + 2]
    paddw           m2, m1
    psrlw           m2, 2
    movu            [r0], m2

    ; filter top-left: (top[0] + left[0] + 2 * dc + 2) >> 2
    movzx           r3d, r3w
    movzx           r5d, word [r2 + 66]
    add             r3d, r5d
    movzx           r5d, word [r2 + 2]
    add             r5d, r3d
    shr             r5d, 2
    mov             [r0], r5w

    ; filter left: lanes 0..14 of left[1..16] go to rows 1..15
    movu            m2, [r2 + 68]
    paddw           m2, m1
    psrlw           m2, 2
    vextracti128    xm3, m2, 1

    movq            r3, xm2
    pshufd          xm2, xm2, 0xEE          ; upper qword -> lower qword
    mov             [r0 + r1], r3w
    shr             r3, 16
    mov             [r0 + r1 * 2], r3w
    shr             r3, 16
    mov             [r0 + r6], r3w
    shr             r3, 16
    mov             [r0 + r1 * 4], r3w
    movq            r3, xm2
    mov             [r0 + r7], r3w
    shr             r3, 16
    mov             [r0 + r6 * 2], r3w
    shr             r3, 16
    mov             [r4], r3w
    shr             r3, 16
    mov             [r0 + r1 * 8], r3w

    movq            r3, xm3
    pshufd          xm3, xm3, 0xEE
    mov             [r4 + r1 * 2], r3w
    shr             r3, 16
    mov             [r0 + r7 * 2], r3w
    shr             r3, 16
    mov             [r4 + r1 * 4], r3w
    shr             r3, 16
    mov             [r0 + r6 * 4], r3w
    movq            r3, xm3
    mov             [r4 + r6 * 2], r3w
    shr             r3, 16
    mov             [r4 + r8], r3w
    shr             r3, 16
    mov             [r4 + r1 * 8], r3w
.end:
    RET
578
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 32x32 DC prediction, 16-bit pixels (AVX2). Only dst, dstStride and srcPix
; are read; there is no smoothing pass.
;---------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_dc32, 3,3,3
    add             r2, 2                   ; skip srcPix[0]
    add             r1, r1                  ; stride in bytes; full width, dstStride is intptr_t
    movu            m0, [r2]
    movu            m1, [r2 + 32]
    add             r2, mmsize*4            ; r2 += 128 -> second neighbour run
    paddw           m0, m1                  ; dynamic range 13 bits
    movu            m1, [r2]
    movu            m2, [r2 + 32]
    paddw           m1, m2                  ; dynamic range 13 bits
    paddw           m0, m1                  ; dynamic range 14 bits
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1                ; dynamic range 15 bits
    pmaddwd         xm0, [pw_1]
    movhlps         xm1, xm0
    paddd           xm0, xm1
    phaddd          xm0, xm0
    paddd           xm0, [pd_32]            ; sum = sum + 32
    psrld           xm0, 6                  ; sum = sum / 64
    vpbroadcastw    m0, xm0

    lea             r2, [r1 * 3]            ; byte offset of row 3

    ; store DC 32x32: eight groups of four rows, two ymm stores per row;
    ; dst advances four rows between groups (not after the last one)
%assign dc32_grp 0
%rep 8
    movu            [r0 + r1 * 0 + 0], m0
    movu            [r0 + r1 * 0 + mmsize], m0
    movu            [r0 + r1 * 1 + 0], m0
    movu            [r0 + r1 * 1 + mmsize], m0
    movu            [r0 + r1 * 2 + 0], m0
    movu            [r0 + r1 * 2 + mmsize], m0
    movu            [r0 + r2 * 1 + 0], m0
    movu            [r0 + r2 * 1 + mmsize], m0
%if dc32_grp < 7
    lea             r0, [r0 + r1 * 4]
%endif
%assign dc32_grp dc32_grp + 1
%endrep
    RET
678
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 8x8 planar prediction, 16-bit pixels (SSE2).
;   m3 carries the per-row running term, m4 the per-row step (bottomLeft - above[x]);
;   each row adds (blkSize - 1 - x) * left[y] and shifts right by 4.
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar8, 3,3,5
    movu        m1, [r2 + 2]            ; above[0..7]
    movu        m2, [r2 + 34]           ; left[0..7]

    movd        m3, [r2 + 18]           ; topRight = above[8];
    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0               ; v_topRight

    movd        m4, [r2 + 50]           ; bottomLeft = left[8];
    pshuflw     m4, m4, 0
    pshufd      m4, m4, 0               ; v_bottomLeft

    pmullw      m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x], y = 0
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    paddw       m3, [pw_8]              ; rounding
    paddw       m3, m4                  ; (y + 1) * bottomLeft, y = 0
    paddw       m3, m0
    psubw       m4, m1                  ; row step: bottomLeft - above[x]

; %1 = row index (0..7): broadcast left[%1], finish the row, store, advance
%macro INTRA_PRED_PLANAR_8 1
%if (%1 < 4)
    pshuflw     m1, m2, 0x55 * %1
    pshufd      m1, m1, 0
%else
    pshufhw     m1, m2, 0x55 * (%1 - 4)
    pshufd      m1, m1, 0xAA
%endif
    pmullw      m1, [pw_planar16_mul + mmsize]
    paddw       m1, m3
    psraw       m1, 4
    movu        [r0], m1
%if (%1 < 7)
    paddw       m3, m4
    lea         r0, [r0 + r1 * 2]
%endif
%endmacro

%assign planar8_row 0
%rep 8
    INTRA_PRED_PLANAR_8 planar8_row
    %assign planar8_row planar8_row + 1
%endrep
    RET
729
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 16x16 planar prediction, 16-bit pixels (SSE2).
;   m3 / m4 = running terms for columns 0..7 / 8..15
;   m6 / m1 = per-row steps (bottomLeft - above[x]) for columns 0..7 / 8..15
;   m2 / m7 = left[0..7] / left[8..15]
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar16, 3,3,8
    movu        m2, [r2 + 2]            ; above[0..7]
    movu        m7, [r2 + 18]           ; above[8..15]

    movd        m3, [r2 + 34]           ; topRight = above[16]
    pshuflw     m3, m3, 0
    pshufd      m3, m3, 0               ; v_topRight

    movd        m6, [r2 + 98]           ; bottomLeft = left[16]
    pshuflw     m6, m6, 0
    pshufd      m6, m6, 0               ; v_bottomLeft

    ; columns 8..15
    pmullw      m4, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m5, m7, [pw_15]         ; (blkSize - 1 - y) * above[x]
    paddw       m4, [pw_16]
    paddw       m4, m6
    paddw       m4, m5

    ; columns 0..7
    pmullw      m3, [multiL]            ; (x + 1) * topRight
    pmullw      m1, m2, [pw_15]         ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_16]
    paddw       m3, m6
    paddw       m3, m1

    ; per-row steps
    psubw       m1, m6, m7
    psubw       m6, m2

    movu        m2, [r2 + 66]           ; left[0..7]
    movu        m7, [r2 + 82]           ; left[8..15]

; %1 = row index (0..15): broadcast left[%1] into m5, advance the running
; terms and dst (rows > 0), then produce and store both halves of the row
%macro INTRA_PRED_PLANAR_16 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%elif (%1 < 8)
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%elif (%1 < 12)
    pshuflw     m5, m7, 0x55 * (%1 - 8)
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m7, 0x55 * (%1 - 12)
    pshufd      m5, m5, 0xAA
%endif
%if (%1 > 0)
    paddw       m3, m6
    paddw       m4, m1
    lea         r0, [r0 + r1 * 2]
%endif
    pmullw      m0, m5, [pw_planar16_mul + mmsize]
    pmullw      m5, [pw_planar16_mul]
    paddw       m0, m4
    paddw       m5, m3
    psraw       m5, 5
    psraw       m0, 5
    movu        [r0], m5
    movu        [r0 + 16], m0
%endmacro

%assign planar16_row 0
%rep 16
    INTRA_PRED_PLANAR_16 planar16_row
    %assign planar16_row planar16_row + 1
%endrep
    RET
812
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 planar prediction, 16-bit pixels (SSE2, uses m8-m15).
;   m0..m3   running terms for columns 0-7, 8-15, 16-23, 24-31
;   m8,m9,m10,m6  matching per-row steps (bottomLeft - above[x])
;   m12..m15 column multiplier tables
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar32, 3,3,16
    movd        m3, [r2 + 66]           ; topRight = above[32]
    pshuflw     m3, m3, 0x00
    pshufd      m3, m3, 0x44            ; low qword copied to both halves

    pmullw      m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw      m1, m3, [multiH]        ; (x + 1) * topRight
    pmullw      m2, m3, [multiH2]       ; (x + 1) * topRight
    pmullw      m3, [multiH3]           ; (x + 1) * topRight

    movd        m6, [r2 + 194]          ; bottomLeft = left[32]
    pshuflw     m6, m6, 0x00
    pshufd      m6, m6, 0x44
    paddw       m5, m6, [pw_32]         ; bottomLeft + rounding

    paddw       m0, m5
    paddw       m1, m5
    paddw       m2, m5
    paddw       m3, m5

    mova        m12, [pw_31]

    movu        m4, [r2 + 2]            ; above[0..7]
    psubw       m8, m6, m4              ; step = bottomLeft - above[x]
    pmullw      m4, m12                 ; (blkSize - 1) * above[x]
    paddw       m0, m4

    movu        m5, [r2 + 18]           ; above[8..15]
    psubw       m9, m6, m5
    pmullw      m5, m12
    paddw       m1, m5

    movu        m4, [r2 + 34]           ; above[16..23]
    psubw       m10, m6, m4
    pmullw      m4, m12
    paddw       m2, m4

    movu        m5, [r2 + 50]           ; above[24..31]
    psubw       m6, m5                  ; last user of bottomLeft: m6 becomes its step
    pmullw      m5, m12
    paddw       m3, m5

    mova        m12, [pw_planar32_mul]
    mova        m13, [pw_planar32_mul + mmsize]
    mova        m14, [pw_planar16_mul]
    mova        m15, [pw_planar16_mul + mmsize]
    add         r1, r1                  ; stride in bytes

; %1 = register holding left[y] in every word; it is clobbered.
; Emits one 32-pixel row at [r0].
%macro PROCESS 1
    pmullw      m5, %1, m12
    pmullw      m11, %1, m13
    paddw       m5, m0
    paddw       m11, m1
    psrlw       m5, 6
    psrlw       m11, 6
    movu        [r0], m5
    movu        [r0 + 16], m11

    pmullw      m5, %1, m14
    pmullw      %1, m15
    paddw       m5, m2
    paddw       %1, m3
    psrlw       m5, 6
    psrlw       %1, 6
    movu        [r0 + 32], m5
    movu        [r0 + 48], %1
%endmacro

; step every running term one row down and advance dst
%macro INCREMENT 0
    paddw       m2, m10
    paddw       m3, m6
    paddw       m0, m8
    paddw       m1, m9
    add         r0, r1
%endmacro

    add         r2, 130                 ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)
%assign x 0
%rep 4
    movu        m4, [r2]                ; next eight left[] samples
    add         r2, 16
%assign y 0
%rep 8
%if y < 4
    pshuflw     m7, m4, 0x55 * y
    pshufd      m7, m7, 0x44
%else
    pshufhw     m7, m4, 0x55 * (y - 4)
    pshufd      m7, m7, 0xEE
%endif
    PROCESS     m7
%if x + y < 10                          ; false only for x = 3, y = 7 (the last row)
    INCREMENT
%endif
%assign y y+1
%endrep
%assign x x+1
%endrep
    RET
920
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 32x32 planar prediction, 16-bit pixels (AVX2).
;   m0 / m2 = running terms for columns 0-15 / 16-31
;   m3 / m5 = matching per-row steps (bottomLeft - above[x])
;   m6 / m4 = column multiplier tables
;---------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_planar32, 3,3,8
    movu            m1, [r2 + 2]            ; above[0..15]
    movu            m4, [r2 + 34]           ; above[16..31]
    add             r2, 66
    vpbroadcastw    m3, [r2]                ; topRight = above[32]
    pmullw          m0, m3, [multiL]        ; (x + 1) * topRight
    pmullw          m2, m3, [multiH2]       ; (x + 1) * topRight
    vpbroadcastw    m6, [r2 + 128]          ; bottomLeft = left[32]
    paddw           m5, m6, [pw_32]         ; bottomLeft + rounding

    paddw           m0, m5
    paddw           m2, m5

    psubw           m3, m6, m1              ; step for columns 0..15
    psubw           m5, m6, m4              ; step for columns 16..31
    pmullw          m1, [pw_31]
    pmullw          m4, [pw_31]
    paddw           m0, m1
    paddw           m2, m4

    mova            m6, [pw_planar32_mul]
    mova            m4, [pw_planar16_mul]
    add             r1, r1                  ; stride in bytes

; %1 = sample index relative to r2: broadcast it and emit one 32-pixel row
%macro PROCESS_AVX2 1
    vpbroadcastw    m7, [r2 + %1 * 2]
    pmullw          m1, m7, m6
    pmullw          m7, m4
    paddw           m1, m0
    paddw           m7, m2
    psrlw           m1, 6
    psrlw           m7, 6
    movu            [r0], m1
    movu            [r0 + mmsize], m7
%endmacro

; step both running terms one row down and advance dst
%macro INCREMENT_AVX2 0
    paddw           m2, m5
    paddw           m0, m3
    add             r0, r1
%endmacro

    add             r2, mmsize*2            ; r2 = srcPix + 130
%assign x 0
%rep 4
%assign y 0
%rep 8
    PROCESS_AVX2 y
%if x + y < 10                              ; false only for the last row
    INCREMENT_AVX2
%endif
%assign y y+1
%endrep
    add             r2, 16                  ; next eight samples
%assign x x+1
%endrep
    RET
983
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;
; 16x16 planar prediction, 16-bit pixels (AVX2).
;   m0 = column multipliers, m3 = running term, m4 = per-row step
;   (bottomLeft - above[x]); m1/m2 are per-row scratch.
; Five vector registers (m0-m4) are live, so cglobal declares 5.
;---------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_planar16, 3,3,5
    add             r1, r1                  ; stride in bytes; full width, dstStride is intptr_t
    vpbroadcastw    m3, [r2 + 34]           ; topRight
    vpbroadcastw    m4, [r2 + 98]           ; bottomLeft
    mova            m0, [pw_planar16_mul]
    movu            m2, [r2 + 2]            ; above[0..15]

    pmullw          m3, [multiL]            ; (x + 1) * topRight
    pmullw          m1, m2, [pw_15]         ; (blkSize - 1 - y) * above[x]
    paddw           m3, [pw_16]
    paddw           m3, m4
    paddw           m3, m1
    psubw           m4, m2                  ; row step: bottomLeft - above[x]
    add             r2, 66                  ; r2 -> samples broadcast per row

; %1 = byte offset of the first of two consecutive samples; emits two rows
; and advances dst by two rows except after the final pair (%1 = 28)
%macro INTRA_PRED_PLANAR16_AVX2 1
    vpbroadcastw    m1, [r2 + %1]
    vpbroadcastw    m2, [r2 + %1 + 2]

    pmullw          m1, m0
    pmullw          m2, m0
    paddw           m1, m3
    paddw           m3, m4
    psraw           m1, 5
    paddw           m2, m3
    psraw           m2, 5
    paddw           m3, m4
    movu            [r0], m1
    movu            [r0 + r1], m2
%if %1 <= 24
    lea             r0, [r0 + r1 * 2]
%endif
%endmacro
    INTRA_PRED_PLANAR16_AVX2 0
    INTRA_PRED_PLANAR16_AVX2 4
    INTRA_PRED_PLANAR16_AVX2 8
    INTRA_PRED_PLANAR16_AVX2 12
    INTRA_PRED_PLANAR16_AVX2 16
    INTRA_PRED_PLANAR16_AVX2 20
    INTRA_PRED_PLANAR16_AVX2 24
    INTRA_PRED_PLANAR16_AVX2 28
%undef INTRA_PRED_PLANAR16_AVX2
    RET
1031
; Transpose a 4x4 block of words.
;   in : m1 = rows 0|1, m3 = rows 2|3 (four words per row)
;   out: m1 = columns 0|1, m3 = columns 2|3; m0 is clobbered
%macro TRANSPOSE_4x4 0
    mova        m0, m1
    punpcklwd   m1, m3                  ; rows 0/2 interleaved
    punpckhwd   m0, m3                  ; rows 1/3 interleaved
    mova        m3, m1
    punpcklwd   m1, m0
    punpckhwd   m3, m0
%endmacro
1038
; Store four 4-pixel rows held in m1 (rows 0|1) and m3 (rows 2|3) at r0.
; r1 = stride in pixels on entry; it is clobbered (left at 6 * stride).
%macro STORE_4x4 0
    movh        [r0], m1
    add         r1, r1                  ; stride in bytes
    movhps      [r0 + r1], m1
    movh        [r0 + r1 * 2], m3
    lea         r1, [r1 * 3]
    movhps      [r0 + r1], m3
%endmacro
1047
; Weighted interpolation of four lines.
;   in : m1..m4 = interleaved sample pairs, one register per line
;        %1..%4 = ang_table row (weight index) for m1..m4
;   out: m1 = lines 0|1, m3 = lines 2|3, each (dot product + 16) >> 5, packed
;        to words; m0, m2 and m4 are clobbered
%macro CALC_4x4 4
    mova        m0, [pd_16]
    pmaddwd     m1, [ang_table + %1 * 16]
    pmaddwd     m2, [ang_table + %2 * 16]
    pmaddwd     m3, [ang_table + %3 * 16]
    pmaddwd     m4, [ang_table + %4 * 16]

    paddd       m1, m0
    paddd       m2, m0
    paddd       m3, m0
    paddd       m4, m0

    psrld       m1, 5
    psrld       m2, 5
    psrld       m3, 5
    psrld       m4, 5

    packssdw    m1, m2
    packssdw    m3, m4
%endmacro
1068
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;
; Handles dirMode 34 (source = src + 4) and any other mode routed here
; (source = src + 20). Row y of the 4x4 output is source[y .. y + 3].
;-----------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_ang4_2, 3,5,4
    lea         r4, [r2 + 20]
    add         r2, 4
    cmp         r3m, byte 34
    cmovne      r2, r4                  ; keep src + 4 only for mode 34

    movu        m0, [r2]
    add         r1, r1                  ; stride in bytes
    movh        [r0], m0
    psrldq      m0, 2                   ; slide by one pixel per row
    movh        [r0 + r1], m0
    psrldq      m0, 2
    movh        [r0 + r1 * 2], m0
    psrldq      m0, 2
    lea         r1, [r1 * 3]
    movh        [r0 + r1], m0
    RET
1090
; 4x4 angular prediction from the samples at [r2 + 18], weights 26/20/14/8.
; Line k interpolates between samples shifted by k and k + 1 positions.
cglobal intra_pred_ang4_3, 3,3,5
    movu        m1, [r2 + 18]           ;[8 7 6 5 4 3 2 1]
    mova        m2, m1
    psrldq      m2, 2                   ;[x 8 7 6 5 4 3 2]
    mova        m3, m1
    psrldq      m3, 4
    mova        m4, m1
    psrldq      m4, 6
    mova        m0, m1
    psrldq      m0, 8
    punpcklwd   m1, m2                  ;[5 4 4 3 3 2 2 1]
    punpcklwd   m2, m3                  ;[6 5 5 4 4 3 3 2]
    punpcklwd   m3, m4                  ;[7 6 6 5 5 4 4 3]
    punpcklwd   m4, m0                  ;[8 7 7 6 6 5 5 4]

    CALC_4x4    26, 20, 14, 8

    TRANSPOSE_4x4

    STORE_4x4
    RET
1112
; Same interpolation as intra_pred_ang4_3 but sourced from [r2 + 2] and
; stored without the transpose.
cglobal intra_pred_ang4_33, 3,3,5
    movu        m1, [r2 + 2]            ;[8 7 6 5 4 3 2 1]
    mova        m2, m1
    psrldq      m2, 2
    mova        m3, m1
    psrldq      m3, 4
    mova        m4, m1
    psrldq      m4, 6
    mova        m0, m1
    psrldq      m0, 8
    punpcklwd   m1, m2                  ;[5 4 4 3 3 2 2 1]
    punpcklwd   m2, m3                  ;[6 5 5 4 4 3 3 2]
    punpcklwd   m3, m4                  ;[7 6 6 5 5 4 4 3]
    punpcklwd   m4, m0                  ;[8 7 7 6 6 5 5 4]

    CALC_4x4    26, 20, 14, 8

    STORE_4x4
    RET
1132
; 4x4 angular prediction from [r2 + 18], weights 21/10/31/20.
; Lines 1 and 2 share the same sample pairs.
cglobal intra_pred_ang4_4, 3,3,5
    movu        m1, [r2 + 18]           ;[8 7 6 5 4 3 2 1]
    mova        m2, m1
    psrldq      m2, 2
    mova        m4, m1
    psrldq      m4, 4
    mova        m0, m1
    psrldq      m0, 6
    punpcklwd   m1, m2                  ;[5 4 4 3 3 2 2 1]
    punpcklwd   m2, m4                  ;[6 5 5 4 4 3 3 2]
    punpcklwd   m4, m0                  ;[7 6 6 5 5 4 4 3]
    mova        m3, m2

    CALC_4x4    21, 10, 31, 20

    TRANSPOSE_4x4

    STORE_4x4
    RET
1152
; 4x4 angular prediction from [r2 + 18], weights 13/26/7/20.
; Lines 0/1 share one set of sample pairs, lines 2/3 the next.
cglobal intra_pred_ang4_6, 3,3,5
    movu        m1, [r2 + 18]           ;[8 7 6 5 4 3 2 1]
    mova        m3, m1
    psrldq      m3, 2
    mova        m0, m1
    psrldq      m0, 4
    punpcklwd   m1, m3                  ;[5 4 4 3 3 2 2 1]
    punpcklwd   m3, m0                  ;[6 5 5 4 4 3 3 2]
    mova        m2, m1
    mova        m4, m3

    CALC_4x4    13, 26, 7, 20

    TRANSPOSE_4x4

    STORE_4x4
    RET
1170
; 4x4 angular prediction from [r2 + 18], weights 9/18/27/4.
; Lines 0..2 share one set of sample pairs, line 3 uses the next.
cglobal intra_pred_ang4_7, 3,3,5
    movu        m1, [r2 + 18]           ;[8 7 6 5 4 3 2 1]
    mova        m4, m1
    psrldq      m4, 2
    mova        m0, m1
    psrldq      m0, 4
    punpcklwd   m1, m4                  ;[5 4 4 3 3 2 2 1]
    punpcklwd   m4, m0                  ;[6 5 5 4 4 3 3 2]
    mova        m2, m1
    mova        m3, m1

    CALC_4x4    9, 18, 27, 4

    TRANSPOSE_4x4

    STORE_4x4
    RET
1188
; 4x4 angular prediction from [r2 + 18], weights 5/10/15/20.
; All four lines use the same sample pairs.
cglobal intra_pred_ang4_8, 3,3,5
    movu        m1, [r2 + 18]           ;[8 7 6 5 4 3 2 1]
    mova        m0, m1
    psrldq      m0, 2
    punpcklwd   m1, m0                  ;[5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    5, 10, 15, 20

    TRANSPOSE_4x4

    STORE_4x4
    RET
1204
; 4x4 angular prediction from [r2 + 18], weights 2/4/6/8.
; All four lines use the same sample pairs.
cglobal intra_pred_ang4_9, 3,3,5
    movu        m1, [r2 + 18]           ;[8 7 6 5 4 3 2 1]
    mova        m0, m1
    psrldq      m0, 2
    punpcklwd   m1, m0                  ;[5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    2, 4, 6, 8

    TRANSPOSE_4x4

    STORE_4x4
    RET
1220
; 4x4 prediction that fills row y with the y-th sample at [r2 + 18].
; When bFilter (r4m) is non-zero, row 0 additionally gets
; ((sample at [r2 + 2 + 2x]) - (sample at [r2])) >> 1 added per column and is
; clamped to [0, pw_pixel_max].
; The stride is scaled with full-width arithmetic because dstStride is intptr_t.
cglobal intra_pred_ang4_10, 3,3,3
    movh        m0, [r2 + 18]           ;[4 3 2 1]

    punpcklwd   m0, m0                  ;[4 4 3 3 2 2 1 1]
    pshufd      m1, m0, 0xFA            ;[4 4 4 4 3 3 3 3]
    add         r1, r1                  ; stride in bytes
    pshufd      m0, m0, 0x50            ;[2 2 2 2 1 1 1 1]
    movhps      [r0 + r1], m0           ; row 1
    movh        [r0 + r1 * 2], m1       ; row 2
    lea         r1, [r1 * 3]
    movhps      [r0 + r1], m1           ; row 3

    cmp         r4m, byte 0
    jz          .quit

    ; filter
    movd        m2, [r2]                ; low word = sample at [r2]
    pshuflw     m2, m2, 0x00            ; replicate it across the low four words
    movh        m1, [r2 + 2]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1
    pminsw      m0, [pw_pixel_max]
.quit:
    movh        [r0], m0                ; row 0 (filtered or not)
    RET
1249
; 4x4 angular prediction, weights 30/28/26/24, all lines on the same pairs.
; Sample 0 is the word at [r2]; samples 1..4 are the words at [r2 + 18].
; Sample 0 is inserted with pinsrw so nothing below r2 is ever read.
cglobal intra_pred_ang4_11, 3,3,5
    movh        m0, [r2 + 18]           ;[x x x x 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                   ;[x x x 4 3 2 1 -]
    pinsrw      m1, [r2], 0             ;[x x x 4 3 2 1 0]
    punpcklwd   m1, m0                  ;[4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    30, 28, 26, 24

    TRANSPOSE_4x4

    STORE_4x4
    RET
1266
; 4x4 angular intra prediction, mode 12 (16-bit samples).
; Builds the pair vector [4 3 3 2 2 1 1 0] from the top-left sample at
; srcPix[0] (sample 0) and the four samples at srcPix + 18 (samples 1..4)
; and replicates it into m1..m4.
; The top-left sample is inserted with pinsrw so that only memory at or
; above srcPix is read (an 8-byte load ending at the top-left sample would
; start 6 bytes below the buffer).
; CALC_4x4, TRANSPOSE_4x4 and STORE_4x4 are macros defined earlier in this
; file; the CALC_4x4 arguments are presumably the per-line ang_table
; fractions (confirm against the macro).
cglobal intra_pred_ang4_12, 3,3,5
    movh        m0, [r2 + 18]           ; [- - - - 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                   ; [- - - 4 3 2 1 -]
    pinsrw      m1, [r2], 0             ; [- - - 4 3 2 1 0]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    27, 22, 17, 12

    TRANSPOSE_4x4

    STORE_4x4
    RET
1283
; 4x4 angular intra prediction, mode 13 (16-bit samples).
; Main reference: top-left sample srcPix[0] (sample 0) followed by the four
; samples at srcPix + 18 (samples 1..4).  One extra sample "x", taken from
; srcPix + 8, is placed in front of sample 0 for the line that reaches
; beyond the top-left corner.
; Both single samples are inserted with pinsrw so that nothing below srcPix
; is read (a 4-byte load at srcPix - 2 would touch the two bytes in front of
; the buffer).
; CALC_4x4, TRANSPOSE_4x4 and STORE_4x4 are macros defined earlier in this
; file; the CALC_4x4 arguments are presumably the per-line ang_table
; fractions (confirm against the macro).
cglobal intra_pred_ang4_13, 3,3,5
    movh        m0, [r2 + 18]           ; [- - - - 4 3 2 1]
    mova        m4, m0
    pslldq      m4, 4                   ; [- - 4 3 2 1 - -]
    pinsrw      m4, [r2 + 8], 0         ; [- - 4 3 2 1 - x]
    pinsrw      m4, [r2], 1             ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                   ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1

    CALC_4x4    23, 14, 5, 28

    TRANSPOSE_4x4

    STORE_4x4
    RET
1304
; 4x4 angular intra prediction, mode 14 (16-bit samples).
; Main reference: top-left sample srcPix[0] (sample 0) followed by the four
; samples at srcPix + 18 (samples 1..4).  One extra sample "x", taken from
; srcPix + 4, is placed in front of sample 0 for the lines that reach
; beyond the top-left corner.
; Both single samples are inserted with pinsrw so that nothing below srcPix
; is read (a 4-byte load at srcPix - 2 would touch the two bytes in front of
; the buffer).
; CALC_4x4, TRANSPOSE_4x4 and STORE_4x4 are macros defined earlier in this
; file; the CALC_4x4 arguments are presumably the per-line ang_table
; fractions (confirm against the macro).
cglobal intra_pred_ang4_14, 3,3,5
    movh        m0, [r2 + 18]           ; [- - - - 4 3 2 1]
    mova        m4, m0
    pslldq      m4, 4                   ; [- - 4 3 2 1 - -]
    pinsrw      m4, [r2 + 4], 0         ; [- - 4 3 2 1 - x]
    pinsrw      m4, [r2], 1             ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                   ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m4

    CALC_4x4    19, 6, 25, 12

    TRANSPOSE_4x4

    STORE_4x4
    RET
1325
; 4x4 angular intra prediction, mode 15 (16-bit samples).
; Main reference: top-left sample A = srcPix[0] followed by the four samples
; at srcPix + 18 (1..4); the samples B (srcPix + 4) and C (srcPix + 8) are
; placed in front of A as the extension needed by the lower lines.
; On exit from the setup: m4 = pairs starting at C, m2 = m3 = pairs starting
; at B, m1 = pairs starting at A.
; CALC_4x4, TRANSPOSE_4x4 and STORE_4x4 are macros defined earlier in this
; file; the CALC_4x4 arguments are presumably the per-line ang_table
; fractions (confirm against the macro).
cglobal intra_pred_ang4_15, 3,3,5
    movh        m4, [r2 + 4]            ; [x C x B]
    pshuflw     m4, m4, 0x22            ; [B C B C]
    movd        m3, [r2]                ; [x x x A]
    punpcklqdq  m4, m3                  ; [x x x A B C B C]
    psrldq      m4, 2                   ; [x x x x A B C B]
    movh        m0, [r2 + 18]           ; [4 3 2 1]
    punpcklqdq  m4, m0                  ; [4 3 2 1 A B C B]
    psrldq      m4, 2                   ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2                   ; [- - 4 3 2 1 A B]
    mova        m1, m2
    psrldq      m1, 2                   ; [- - - 4 3 2 1 A]
    punpcklwd   m4, m2                  ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    15, 30, 13, 28

    TRANSPOSE_4x4

    STORE_4x4
    RET
1350
; 4x4 angular intra prediction, mode 16 (16-bit samples).
; Main reference: top-left sample A = srcPix[0] followed by the four samples
; at srcPix + 18 (1..4); the samples B (srcPix + 4) and C (srcPix + 6) are
; placed in front of A as the extension needed by the lower lines.
; On exit from the setup: m4 = pairs starting at C, m2 = m3 = pairs starting
; at B, m1 = pairs starting at A.
; CALC_4x4, TRANSPOSE_4x4 and STORE_4x4 are macros defined earlier in this
; file; the CALC_4x4 arguments are presumably the per-line ang_table
; fractions (confirm against the macro).
cglobal intra_pred_ang4_16, 3,3,5
    movd        m4, [r2 + 4]            ; [x x C B]
    movd        m3, [r2]                ; [x x x A]
    punpcklwd   m4, m3                  ; [x C A B]
    pshuflw     m4, m4, 0x4A            ; [A B C C]
    movh        m0, [r2 + 18]           ; [4 3 2 1]
    punpcklqdq  m4, m0                  ; [4 3 2 1 A B C C]
    psrldq      m4, 2                   ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2                   ; [- - 4 3 2 1 A B]
    mova        m1, m2
    psrldq      m1, 2                   ; [- - - 4 3 2 1 A]
    punpcklwd   m4, m2                  ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    11, 22, 1, 12

    TRANSPOSE_4x4

    STORE_4x4
    RET
1374
; 4x4 angular intra prediction, mode 17 (16-bit samples).
; Assembles the eight-sample reference [4 3 2 1 A B C D] in m4: D, C, B are
; picked from the samples at srcPix + 2, A is the top-left sample srcPix[0]
; and 1..4 are the four samples at srcPix + 18.  Each of m4, m3, m2, m1 then
; receives the neighbouring-sample pairs starting one sample further along.
; CALC_4x4, TRANSPOSE_4x4 and STORE_4x4 are macros defined earlier in this
; file; the CALC_4x4 arguments are presumably the per-line ang_table
; fractions (confirm against the macro).
cglobal intra_pred_ang4_17, 3,3,5
    movh        m4, [r2 + 2]            ; [D x C B]
    pshuflw     m4, m4, 0x1F            ; [B C D D]
    movd        m3, [r2]                ; [x x x A]
    punpcklqdq  m4, m3                  ; [x x x A B C D D]
    psrldq      m4, 2                   ; [x x x x A B C D]
    movhps      m4, [r2 + 18]           ; [4 3 2 1 A B C D]

    ; shifted copies first, interleave afterwards
    mova        m3, m4
    psrldq      m3, 2
    mova        m2, m4
    psrldq      m2, 4
    mova        m1, m4
    psrldq      m1, 6
    mova        m0, m4
    psrldq      m0, 8
    punpcklwd   m4, m3
    punpcklwd   m3, m2
    punpcklwd   m2, m1
    punpcklwd   m1, m0

    CALC_4x4    6, 12, 18, 24

    TRANSPOSE_4x4

    STORE_4x4
    RET
1402
; 4x4 angular intra prediction, mode 18 (exact diagonal), 16-bit samples.
; r0 = dst, r1 = dstStride (in samples), r2 = srcPix.
; m0 is assembled as [a3 a2 a1 a0 TL l0 l1 l2] (high .. low), where TL is
; srcPix[0], a* are the samples at srcPix + 2 and l* the samples at
; srcPix + 18.  Row 3 is the low half of m0; every row above it is the same
; vector shifted down by one more sample.
cglobal intra_pred_ang4_18, 3,3,1
    add         r1, r1                  ; stride in bytes

    movh        m0, [r2 + 16]           ; low word is replaced below
    pinsrw      m0, [r2], 0             ; [l2 l1 l0 TL]
    pshuflw     m0, m0, q0123           ; [TL l0 l1 l2]
    movhps      m0, [r2 + 2]            ; [a3 a2 a1 a0 TL l0 l1 l2]

    lea         r2, [r1 * 3]            ; srcPix no longer needed
    movh        [r0 + r2], m0           ; row 3
    psrldq      m0, 2
    movh        [r0 + r1 * 2], m0       ; row 2
    psrldq      m0, 2
    movh        [r0 + r1], m0           ; row 1
    psrldq      m0, 2
    movh        [r0], m0                ; row 0
    RET
1418
; 4x4 angular intra prediction, mode 19 (16-bit samples).
; Assembles the eight-sample reference [4 3 2 1 A B C D] in m4: D, C, B are
; picked from the samples at srcPix + 18, A is the top-left sample srcPix[0]
; and 1..4 are the four samples at srcPix + 2.  Each of m4, m3, m2, m1 then
; receives the neighbouring-sample pairs starting one sample further along.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_19, 3,3,5
    movh        m4, [r2 + 18]           ; [D x C B]
    pshuflw     m4, m4, 0x1F            ; [B C D D]
    movd        m3, [r2]                ; [x x x A]
    punpcklqdq  m4, m3                  ; [x x x A B C D D]
    psrldq      m4, 2                   ; [x x x x A B C D]
    movhps      m4, [r2 + 2]            ; [4 3 2 1 A B C D]

    ; shifted copies first, interleave afterwards
    mova        m3, m4
    psrldq      m3, 2
    mova        m2, m4
    psrldq      m2, 4
    mova        m1, m4
    psrldq      m1, 6
    mova        m0, m4
    psrldq      m0, 8
    punpcklwd   m4, m3
    punpcklwd   m3, m2
    punpcklwd   m2, m1
    punpcklwd   m1, m0

    CALC_4x4    6, 12, 18, 24

    STORE_4x4
    RET
1444
; 4x4 angular intra prediction, mode 20 (16-bit samples).
; Main reference: top-left sample A = srcPix[0] followed by the four samples
; at srcPix + 2 (1..4); the samples B (srcPix + 20) and C (srcPix + 22) are
; placed in front of A as the extension needed by the lower lines.
; On exit from the setup: m4 = pairs starting at C, m2 = m3 = pairs starting
; at B, m1 = pairs starting at A.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_20, 3,3,5
    movd        m4, [r2 + 20]           ; [x x C B]
    movd        m3, [r2]                ; [x x x A]
    punpcklwd   m4, m3                  ; [x C A B]
    pshuflw     m4, m4, 0x4A            ; [A B C C]
    movh        m0, [r2 + 2]            ; [4 3 2 1]
    punpcklqdq  m4, m0                  ; [4 3 2 1 A B C C]
    psrldq      m4, 2                   ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2                   ; [- - 4 3 2 1 A B]
    mova        m1, m2
    psrldq      m1, 2                   ; [- - - 4 3 2 1 A]
    punpcklwd   m4, m2                  ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    11, 22, 1, 12

    STORE_4x4
    RET
1466
; 4x4 angular intra prediction, mode 21 (16-bit samples).
; Main reference: top-left sample A = srcPix[0] followed by the four samples
; at srcPix + 2 (1..4); the samples B (srcPix + 20) and C (srcPix + 24) are
; placed in front of A as the extension needed by the lower lines.
; On exit from the setup: m4 = pairs starting at C, m2 = m3 = pairs starting
; at B, m1 = pairs starting at A.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_21, 3,3,5
    movh        m4, [r2 + 20]           ; [x C x B]
    pshuflw     m4, m4, 0x22            ; [B C B C]
    movd        m3, [r2]                ; [x x x A]
    punpcklqdq  m4, m3                  ; [x x x A B C B C]
    psrldq      m4, 2                   ; [x x x x A B C B]
    movh        m0, [r2 + 2]            ; [4 3 2 1]
    punpcklqdq  m4, m0                  ; [4 3 2 1 A B C B]
    psrldq      m4, 2                   ; [- 4 3 2 1 A B C]
    mova        m2, m4
    psrldq      m2, 2                   ; [- - 4 3 2 1 A B]
    mova        m1, m2
    psrldq      m1, 2                   ; [- - - 4 3 2 1 A]
    punpcklwd   m4, m2                  ; [2 1 1 0 0 x x y]
    punpcklwd   m2, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m3, m2

    CALC_4x4    15, 30, 13, 28

    STORE_4x4
    RET
1489
; 4x4 angular intra prediction, mode 22 (16-bit samples).
; Main reference: top-left sample srcPix[0] (sample 0) followed by the four
; samples at srcPix + 2 (samples 1..4).  One extra sample "x", taken from
; srcPix + 20, is placed in front of sample 0 for the lines that reach
; beyond the top-left corner.
; Both single samples are inserted with pinsrw so that nothing below srcPix
; is read (a 4-byte load at srcPix - 2 would touch the two bytes in front of
; the buffer).
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_22, 3,3,5
    movh        m0, [r2 + 2]            ; [- - - - 4 3 2 1]
    mova        m4, m0
    pslldq      m4, 4                   ; [- - 4 3 2 1 - -]
    pinsrw      m4, [r2 + 20], 0        ; [- - 4 3 2 1 - x]
    pinsrw      m4, [r2], 1             ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                   ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m4

    CALC_4x4    19, 6, 25, 12

    STORE_4x4
    RET
1508
; 4x4 angular intra prediction, mode 23 (16-bit samples).
; Main reference: top-left sample srcPix[0] (sample 0) followed by the four
; samples at srcPix + 2 (samples 1..4).  One extra sample "x", taken from
; srcPix + 24, is placed in front of sample 0 for the line that reaches
; beyond the top-left corner.
; Both single samples are inserted with pinsrw so that nothing below srcPix
; is read (a 4-byte load at srcPix - 2 would touch the two bytes in front of
; the buffer).
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_23, 3,3,5
    movh        m0, [r2 + 2]            ; [- - - - 4 3 2 1]
    mova        m4, m0
    pslldq      m4, 4                   ; [- - 4 3 2 1 - -]
    pinsrw      m4, [r2 + 24], 0        ; [- - 4 3 2 1 - x]
    pinsrw      m4, [r2], 1             ; [- - 4 3 2 1 0 x]
    mova        m1, m4
    psrldq      m1, 2                   ; [- - - 4 3 2 1 0]
    punpcklwd   m4, m1                  ; [3 2 2 1 1 0 0 x]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1

    CALC_4x4    23, 14, 5, 28

    STORE_4x4
    RET
1527
; 4x4 angular intra prediction, mode 24 (16-bit samples).
; Builds the pair vector [4 3 3 2 2 1 1 0] from the top-left sample at
; srcPix[0] (sample 0) and the four samples at srcPix + 2 (samples 1..4)
; and replicates it into m1..m4.
; The top-left sample is inserted with pinsrw so that only memory at or
; above srcPix is read (an 8-byte load ending at the top-left sample would
; start 6 bytes below the buffer).
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_24, 3,3,5
    movh        m0, [r2 + 2]            ; [- - - - 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                   ; [- - - 4 3 2 1 -]
    pinsrw      m1, [r2], 0             ; [- - - 4 3 2 1 0]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    27, 22, 17, 12

    STORE_4x4
    RET
1542
; 4x4 angular intra prediction, mode 25 (16-bit samples).
; Builds the pair vector [4 3 3 2 2 1 1 0] from the top-left sample at
; srcPix[0] (sample 0) and the four samples at srcPix + 2 (samples 1..4)
; and replicates it into m1..m4.
; The top-left sample is inserted with pinsrw so that only memory at or
; above srcPix is read (an 8-byte load ending at the top-left sample would
; start 6 bytes below the buffer).
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_25, 3,3,5
    movh        m0, [r2 + 2]            ; [- - - - 4 3 2 1]
    mova        m1, m0
    pslldq      m1, 2                   ; [- - - 4 3 2 1 -]
    pinsrw      m1, [r2], 0             ; [- - - 4 3 2 1 0]
    punpcklwd   m1, m0                  ; [4 3 3 2 2 1 1 0]
    mova        m2, m1
    mova        m3, m1
    mova        m4, m1

    CALC_4x4    30, 28, 26, 24

    STORE_4x4
    RET
1557
; 4x4 angular intra prediction, mode 26 (pure vertical), 16-bit samples.
; r0 = dst, r1 = dstStride (in samples), r2 = srcPix, r4m = bFilter.
; Every row is a copy of the four samples at srcPix + 2.  When bFilter is
; non-zero, the first column is replaced by
;   srcPix[1] + ((sample at srcPix + 18 + 2 * y) - srcPix[0]) >> 1
; clamped between 0 and the pw_pixel_max constant.
; Four general-purpose registers are declared because r3 holds 3 * stride,
; and the stride is scaled with full-width registers so the upper half of
; the intptr_t stride is not discarded.
; NOTE(review): the filter path moves 64 bits from m0 into r2, so it
; assumes a 64-bit build.
cglobal intra_pred_ang4_26, 3,4,3
    movh        m0, [r2 + 2]            ; [4 3 2 1]
    add         r1, r1                  ; stride in bytes
    lea         r3, [r1 * 3]

    ; store the unfiltered block
    movh        [r0], m0
    movh        [r0 + r1], m0
    movh        [r0 + r1 * 2], m0
    movh        [r0 + r3], m0

    ; filter
    cmp         r4m, byte 0
    jz          .quit

    pshuflw     m0, m0, 0x00            ; first copied sample in the four low words
    movd        m2, [r2]
    pshuflw     m2, m2, 0x00            ; top-left sample in the four low words
    movh        m1, [r2 + 18]           ; one side sample per row
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1                  ; clamp below at 0
    pminsw      m0, [pw_pixel_max]      ; clamp above

    ; scatter the four filtered words down the first column
    movh        r2, m0
    mov         [r0], r2w
    shr         r2, 16
    mov         [r0 + r1], r2w
    shr         r2, 16
    mov         [r0 + r1 * 2], r2w
    shr         r2, 16
    mov         [r0 + r3], r2w
.quit:
    RET
1593
; 4x4 angular intra prediction, mode 27 (16-bit samples).
; All four lines use the same sample pairs, built from the eight reference
; samples at srcPix + 2 bytes and replicated into m1..m4.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_27, 3,3,5
    movu        m1, [r2 + 2]            ; [8 7 6 5 4 3 2 1]
    mova        m0, m1
    psrldq      m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m1, m0                  ; [5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    2, 4, 6, 8

    STORE_4x4
    RET
1607
; 4x4 angular intra prediction, mode 28 (16-bit samples).
; All four lines use the same sample pairs, built from the eight reference
; samples at srcPix + 2 bytes and replicated into m1..m4.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_28, 3,3,5
    movu        m1, [r2 + 2]            ; [8 7 6 5 4 3 2 1]
    mova        m0, m1
    psrldq      m0, 2                   ; [x 8 7 6 5 4 3 2]
    punpcklwd   m1, m0                  ; [5 4 4 3 3 2 2 1]
    mova        m4, m1
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    5, 10, 15, 20

    STORE_4x4
    RET
1622
; 4x4 angular intra prediction, mode 29 (16-bit samples).
; Loads eight reference samples from srcPix + 2 bytes; m1/m2/m3 receive the
; sample pairs starting at sample 1 and m4 the pairs starting at sample 2.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_29, 3,3,5
    movu        m1, [r2 + 2]            ; [8 7 6 5 4 3 2 1]
    mova        m4, m1
    psrldq      m4, 2                   ; [x 8 7 6 5 4 3 2]
    mova        m0, m4
    psrldq      m0, 2                   ; [x x 8 7 6 5 4 3]
    punpcklwd   m1, m4                  ; [5 4 4 3 3 2 2 1]
    punpcklwd   m4, m0                  ; [6 5 5 4 4 3 3 2]
    mova        m3, m1
    mova        m2, m1

    CALC_4x4    9, 18, 27, 4

    STORE_4x4
    RET
1638
; 4x4 angular intra prediction, mode 30 (16-bit samples).
; Loads eight reference samples from srcPix + 2 bytes and builds vectors of
; neighbouring sample pairs: m1/m2 start at sample 1, m3/m4 at sample 2.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_30, 3,3,5
    movu        m1, [r2 + 2]            ; [8 7 6 5 4 3 2 1]
    mova        m3, m1
    psrldq      m3, 2                   ; [x 8 7 6 5 4 3 2]
    mova        m0, m3
    psrldq      m0, 2                   ; [x x 8 7 6 5 4 3]
    punpcklwd   m1, m3                  ; [5 4 4 3 3 2 2 1]
    punpcklwd   m3, m0                  ; [6 5 5 4 4 3 3 2]
    mova        m2, m1
    mova        m4, m3

    CALC_4x4    13, 26, 7, 20

    STORE_4x4
    RET
1654
; 4x4 angular intra prediction, mode 5 (16-bit samples).
; Loads eight reference samples from srcPix + 18 bytes and builds vectors of
; neighbouring sample pairs: m1 starts at sample 1, m2/m3 at sample 2 and
; m4 at sample 3.
; CALC_4x4, TRANSPOSE_4x4 and STORE_4x4 are macros defined earlier in this
; file; the CALC_4x4 arguments are presumably the per-line ang_table
; fractions (confirm against the macro).
cglobal intra_pred_ang4_5, 3,3,5
    movu        m1, [r2 + 18]           ; [8 7 6 5 4 3 2 1]
    mova        m2, m1
    psrldq      m2, 2                   ; [x 8 7 6 5 4 3 2]
    mova        m4, m2
    psrldq      m4, 2                   ; [x x 8 7 6 5 4 3]
    mova        m0, m4
    psrldq      m0, 2                   ; [x x x 8 7 6 5 4]
    punpcklwd   m1, m2                  ; [5 4 4 3 3 2 2 1]
    punpcklwd   m2, m4                  ; [6 5 5 4 4 3 3 2]
    punpcklwd   m4, m0                  ; [7 6 6 5 5 4 4 3]
    mova        m3, m2

    CALC_4x4    17, 2, 19, 4

    TRANSPOSE_4x4

    STORE_4x4
    RET
1674
; 4x4 angular intra prediction, mode 31 (16-bit samples).
; Loads eight reference samples from srcPix + 2 bytes and builds vectors of
; neighbouring sample pairs: m1 starts at sample 1, m2/m3 at sample 2 and
; m4 at sample 3.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_31, 3,3,5
    movu        m1, [r2 + 2]            ; [8 7 6 5 4 3 2 1]
    mova        m2, m1
    psrldq      m2, 2                   ; [x 8 7 6 5 4 3 2]
    mova        m4, m2
    psrldq      m4, 2                   ; [x x 8 7 6 5 4 3]
    mova        m0, m4
    psrldq      m0, 2                   ; [x x x 8 7 6 5 4]
    punpcklwd   m1, m2                  ; [5 4 4 3 3 2 2 1]
    punpcklwd   m2, m4                  ; [6 5 5 4 4 3 3 2]
    punpcklwd   m4, m0                  ; [7 6 6 5 5 4 4 3]
    mova        m3, m2

    CALC_4x4    17, 2, 19, 4

    STORE_4x4
    RET
1692
; 4x4 angular intra prediction, mode 32 (16-bit samples).
; Loads eight reference samples from srcPix + 2 bytes and builds vectors of
; neighbouring sample pairs: m1 starts at sample 1, m2/m3 at sample 2 and
; m4 at sample 3.
; CALC_4x4 and STORE_4x4 are macros defined earlier in this file; the
; CALC_4x4 arguments are presumably the per-line ang_table fractions
; (confirm against the macro).
cglobal intra_pred_ang4_32, 3,3,5
    movu        m1, [r2 + 2]            ; [8 7 6 5 4 3 2 1]
    mova        m2, m1
    psrldq      m2, 2                   ; [x 8 7 6 5 4 3 2]
    mova        m4, m2
    psrldq      m4, 2                   ; [x x 8 7 6 5 4 3]
    mova        m0, m4
    psrldq      m0, 2                   ; [x x x 8 7 6 5 4]
    punpcklwd   m1, m2                  ; [5 4 4 3 3 2 2 1]
    punpcklwd   m2, m4                  ; [6 5 5 4 4 3 3 2]
    punpcklwd   m4, m0                  ; [7 6 6 5 5 4 4 3]
    mova        m3, m2

    CALC_4x4    21, 10, 31, 20

    STORE_4x4
    RET
1710
1711 ;-----------------------------------------------------------------------------------
1712 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
1713 ;-----------------------------------------------------------------------------------
; 4x4 DC intra prediction on 16-bit samples.
; r0 = dst, r1 = dstStride in samples (rows are addressed as r1 * 2 bytes),
; r2 = reference samples, r4 = filter flag.
; The four samples at r2 + 2 (used below as the top row) and the four at
; r2 + 18 (used below as the left column) are averaged with rounding,
; (sum + 4) >> 3, and the block is filled with that value.  When filter is
; non-zero the first row and first column are re-blended with their
; neighbouring reference samples.
1714 INIT_XMM sse4
1715 cglobal intra_pred_dc4, 5,6,2
1716 lea r3, [r2 + 18]
1717 add r2, 2
1718
1719 movh m0, [r3] ; left[0..3] (the same pointer feeds "filter left" below)
1720 movh m1, [r2] ; above[0..3] (the same pointer feeds "filter top" below)
1721
1722 paddw m0, m1
1723 pshufd m1, m0, 1
1724 paddw m0, m1
1725 phaddw m0, m0 ; m0 = sum
1726
; ZF is consumed by "jz .end" further down; the SIMD, mov, movzx and lea
; instructions in between do not modify the flags.
1727 test r4d, r4d
1728
1729 pmulhrsw m0, [pw_4096] ; m0 = (sum + 4) / 8
1730 movd r4d, m0 ; r4d = dc_val
1731 movzx r4d, r4w
1732 pshuflw m0, m0, 0 ; m0 = word [dc_val ...]
1733
1734 ; store DC 4x4
1735 movh [r0], m0
1736 movh [r0 + r1 * 2], m0
1737 movh [r0 + r1 * 4], m0
1738 lea r5, [r0 + r1 * 4]
1739 movh [r5 + r1 * 2], m0
1740
1741 ; do DC filter (skipped when the filter flag tested above was zero)
1742 jz .end
1743 lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
1744 add r4d, r5d ; r4d = DC * 3 + 2
1745 movd m0, r4d
1746 pshuflw m0, m0, 0 ; m0 = pixDCx3
1747
1748 ; filter top: (above[x] + 3 * DC + 2) >> 2
1749 movu m1, [r2]
1750 paddw m1, m0
1751 psrlw m1, 2
1752 movh [r0], m1 ; overwrite top-left pixel, we will update it later
1753
1754 ; filter top-left: (left[0] + above[0] + 2 * DC + 2) >> 2
1755 movzx r4d, word [r3]
1756 add r5d, r4d
1757 movzx r4d, word [r2]
1758 add r4d, r5d
1759 shr r4d, 2
1760 mov [r0], r4w
1761
1762 ; filter left: rows 1..3 of the first column, (left[y] + 3 * DC + 2) >> 2
1763 lea r0, [r0 + r1 * 2]
1764 movu m1, [r3 + 2]
1765 paddw m1, m0
1766 psrlw m1, 2
1767 movd r3d, m1
1768 mov [r0], r3w
1769 shr r3d, 16
1770 mov [r0 + r1 * 2], r3w
1771 pextrw [r0 + r1 * 4], m1, 2
1772 .end:
1773 RET
1774
1775 ;---------------------------------------------------------------------------------------
1776 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
1777 ;---------------------------------------------------------------------------------------
; 4x4 planar intra prediction on 16-bit samples (SSE2 build).
; r0 = dst, r1 = dstStride in samples (rows are addressed as r1 * 2 bytes),
; r2 = srcPix with above[] at r2 + 2 and left[] at r2 + 18.
; m3 carries the running per-column term
;   (x + 1) * topRight + 4 + bottomLeft + 3 * above[x]
; and m4 = bottomLeft - above[x] is added to it after every row; each row
; adds left[y] * pw_planar4_0 and shifts the result right by 3.
INIT_XMM sse2
cglobal intra_pred_planar4, 3,3,5
    movu        m2, [r2 + 18]           ; left[]
    movu        m1, [r2 + 2]            ; above[]
    pshufhw     m4, m2, 0
    pshufd      m4, m4, 0xAA            ; bottomLeft in every word
    pshufhw     m3, m1, 0
    pshufd      m3, m3, 0xAA            ; topRight in every word

    pmullw      m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]
    pmullw      m3, [multi_2Row]        ; (x + 1) * topRight
    paddw       m3, [pw_4]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                  ; per-row increment
    mova        m0, [pw_planar4_0]      ; per-column weights for left[y]

    ; row 0
    pshuflw     m1, m2, 0
    pmullw      m1, m0
    paddw       m1, m3
    psraw       m1, 3
    movh        [r0], m1
    paddw       m3, m4

    ; row 1
    pshuflw     m1, m2, 01010101b
    pmullw      m1, m0
    paddw       m1, m3
    psraw       m1, 3
    movh        [r0 + r1 * 2], m1
    paddw       m3, m4
    lea         r0, [r0 + 4 * r1]

    ; row 2
    pshuflw     m1, m2, 10101010b
    pmullw      m1, m0
    paddw       m1, m3
    psraw       m1, 3
    movh        [r0], m1
    paddw       m3, m4

    ; row 3
    pshuflw     m1, m2, 11111111b
    pmullw      m1, m0
    paddw       m1, m3
    psraw       m1, 3
    movh        [r0 + r1 * 2], m1
    RET
1823
1824 ;-----------------------------------------------------------------------------------
1825 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
1826 ;-----------------------------------------------------------------------------------
; 8x8 DC intra prediction on 16-bit samples.
; r0 = dst, r1 = dstStride (doubled below into a byte stride), r2 = reference
; samples, r4 = filter flag.
; The eight samples at r2 + 2 (top row) and the eight at r2 + 34 (left
; column) are averaged with rounding, (sum + 8) >> 4, and the block is filled
; with that value.  When filter is non-zero the first row and first column
; are re-blended with their neighbouring reference samples.
1827 INIT_XMM sse4
1828 cglobal intra_pred_dc8, 5, 7, 2
1829 lea r3, [r2 + 34]
1830 add r2, 2
1831 add r1, r1
1832 movu m0, [r3]
1833 movu m1, [r2]
1834
1835 paddw m0, m1
1836 movhlps m1, m0
1837 paddw m0, m1
1838 phaddw m0, m0
1839 pmaddwd m0, [pw_1]
1840
1841 movd r5d, m0
1842 add r5d, 8
1843 shr r5d, 4 ; sum = sum / 16
1844 movd m1, r5d
1845 pshuflw m1, m1, 0 ; m1 = word [dc_val ...]
1846 pshufd m1, m1, 0
1847
; ZF is consumed by "jz .end" after the stores; mov, movu and lea leave the
; flags untouched.
1848 test r4d, r4d
1849
1850 ; store DC 8x8 (r6 keeps the address of row 0 for the filter stage)
1851 mov r6, r0
1852 movu [r0], m1
1853 movu [r0 + r1], m1
1854 movu [r0 + r1 * 2], m1
1855 lea r0, [r0 + r1 * 2]
1856 movu [r0 + r1], m1
1857 movu [r0 + r1 * 2], m1
1858 lea r0, [r0 + r1 * 2]
1859 movu [r0 + r1], m1
1860 movu [r0 + r1 * 2], m1
1861 lea r0, [r0 + r1 * 2]
1862 movu [r0 + r1], m1
1863
1864 ; Do DC Filter (skipped when the filter flag tested above was zero)
1865 jz .end
1866 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
1867 add r5d, r4d ; r5d = DC * 3 + 2
1868 movd m1, r5d
1869 pshuflw m1, m1, 0 ; m1 = pixDCx3
1870 pshufd m1, m1, 0
1871
1872 ; filter top: (above[x] + 3 * DC + 2) >> 2
1873 movu m0, [r2]
1874 paddw m0, m1
1875 psrlw m0, 2
1876 movu [r6], m0
1877
1878 ; filter top-left: (left[0] + above[0] + 2 * DC + 2) >> 2
1879 movzx r5d, word [r3]
1880 add r4d, r5d
1881 movzx r5d, word [r2]
1882 add r5d, r4d
1883 shr r5d, 2
1884 mov [r6], r5w
1885
1886 ; filter left: rows 1..7 of the first column, (left[y] + 3 * DC + 2) >> 2
1887 add r6, r1
1888 movu m0, [r3 + 2]
1889 paddw m0, m1
1890 psrlw m0, 2
1891 pextrw [r6], m0, 0
1892 pextrw [r6 + r1], m0, 1
1893 pextrw [r6 + r1 * 2], m0, 2
1894 lea r6, [r6 + r1 * 2]
1895 pextrw [r6 + r1], m0, 3
1896 pextrw [r6 + r1 * 2], m0, 4
1897 lea r6, [r6 + r1 * 2]
1898 pextrw [r6 + r1], m0, 5
1899 pextrw [r6 + r1 * 2], m0, 6
1900 .end:
1901 RET
1902
1903 ;-------------------------------------------------------------------------------------------------------
1904 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int filter)
1905 ;-------------------------------------------------------------------------------------------------------
; 16x16 DC intra prediction on 16-bit samples.
; r0 = dst, r1 = dstStride (doubled below into a byte stride), r2 = reference
; samples, r4 = filter flag.
; The sixteen samples at r2 + 2 (top row) and the sixteen at r2 + 66 (left
; column) are averaged with rounding, (sum + 16) >> 5, and the block is
; filled with that value.  When filter is non-zero the first row and first
; column are re-blended with their neighbouring reference samples.
1906 INIT_XMM sse4
1907 cglobal intra_pred_dc16, 5, 7, 4
1908 lea r3, [r2 + 66]
1909 add r2, 2
1910 add r1, r1
1911 movu m0, [r3]
1912 movu m1, [r3 + 16]
1913 movu m2, [r2]
1914 movu m3, [r2 + 16]
1915
; NOTE(review): the bit counts below assume at most 12-bit samples, so the
; word sums stay below 2^15 before pmaddwd treats them as signed - confirm.
1916 paddw m0, m1 ; dynamic range 13 bits
1917 paddw m2, m3
1918 paddw m0, m2 ; dynamic range 14 bits
1919 movhlps m1, m0
1920 paddw m0, m1 ; dynamic range 15 bits
1921 pmaddwd m0, [pw_1]
1922 phaddd m0, m0
1923
1924 movd r5d, m0
1925 add r5d, 16
1926 shr r5d, 5 ; sum = sum / 32
1927 movd m1, r5d
1928 pshuflw m1, m1, 0 ; m1 = word [dc_val ...]
1929 pshufd m1, m1, 0
1930
; ZF is consumed by "jz .end" after the stores; mov, movu and lea leave the
; flags untouched.
1931 test r4d, r4d
1932
1933 ; store DC 16x16 (r6 keeps the address of row 0 for the filter stage)
1934 mov r6, r0
1935 movu [r0], m1
1936 movu [r0 + 16], m1
1937 movu [r0 + r1], m1
1938 movu [r0 + 16 + r1], m1
1939 lea r0, [r0 + r1 * 2]
1940 movu [r0], m1
1941 movu [r0 + 16], m1
1942 movu [r0 + r1], m1
1943 movu [r0 + 16 + r1], m1
1944 lea r0, [r0 + r1 * 2]
1945 movu [r0], m1
1946 movu [r0 + 16], m1
1947 movu [r0 + r1], m1
1948 movu [r0 + 16 + r1], m1
1949 lea r0, [r0 + r1 * 2]
1950 movu [r0], m1
1951 movu [r0 + 16], m1
1952 movu [r0 + r1], m1
1953 movu [r0 + 16 + r1], m1
1954 lea r0, [r0 + r1 * 2]
1955 movu [r0], m1
1956 movu [r0 + 16], m1
1957 movu [r0 + r1], m1
1958 movu [r0 + 16 + r1], m1
1959 lea r0, [r0 + r1 * 2]
1960 movu [r0], m1
1961 movu [r0 + 16], m1
1962 movu [r0 + r1], m1
1963 movu [r0 + 16 + r1], m1
1964 lea r0, [r0 + r1 * 2]
1965 movu [r0], m1
1966 movu [r0 + 16], m1
1967 movu [r0 + r1], m1
1968 movu [r0 + 16 + r1], m1
1969 lea r0, [r0 + r1 * 2]
1970 movu [r0], m1
1971 movu [r0 + 16], m1
1972 movu [r0 + r1], m1
1973 movu [r0 + 16 + r1], m1
1974
1975 ; Do DC Filter (skipped when the filter flag tested above was zero)
1976 jz .end
1977 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
1978 add r5d, r4d ; r5d = DC * 3 + 2
1979 movd m1, r5d
1980 pshuflw m1, m1, 0 ; m1 = pixDCx3
1981 pshufd m1, m1, 0
1982
1983 ; filter top: (above[x] + 3 * DC + 2) >> 2
1984 movu m2, [r2]
1985 paddw m2, m1
1986 psrlw m2, 2
1987 movu [r6], m2
1988 movu m3, [r2 + 16]
1989 paddw m3, m1
1990 psrlw m3, 2
1991 movu [r6 + 16], m3
1992
1993 ; filter top-left: (left[0] + above[0] + 2 * DC + 2) >> 2
1994 movzx r5d, word [r3]
1995 add r4d, r5d
1996 movzx r5d, word [r2]
1997 add r5d, r4d
1998 shr r5d, 2
1999 mov [r6], r5w
2000
2001 ; filter left: rows 1..8 of the first column, (left[y] + 3 * DC + 2) >> 2
2002 add r6, r1
2003 movu m2, [r3 + 2]
2004 paddw m2, m1
2005 psrlw m2, 2
2006
2007 pextrw [r6], m2, 0
2008 pextrw [r6 + r1], m2, 1
2009 lea r6, [r6 + r1 * 2]
2010 pextrw [r6], m2, 2
2011 pextrw [r6 + r1], m2, 3
2012 lea r6, [r6 + r1 * 2]
2013 pextrw [r6], m2, 4
2014 pextrw [r6 + r1], m2, 5
2015 lea r6, [r6 + r1 * 2]
2016 pextrw [r6], m2, 6
2017 pextrw [r6 + r1], m2, 7
2018
; rows 9..15 of the first column
2019 lea r6, [r6 + r1 * 2]
2020 movu m3, [r3 + 18]
2021 paddw m3, m1
2022 psrlw m3, 2
2023
2024 pextrw [r6], m3, 0
2025 pextrw [r6 + r1], m3, 1
2026 lea r6, [r6 + r1 * 2]
2027 pextrw [r6], m3, 2
2028 pextrw [r6 + r1], m3, 3
2029 lea r6, [r6 + r1 * 2]
2030 pextrw [r6], m3, 4
2031 pextrw [r6 + r1], m3, 5
2032 lea r6, [r6 + r1 * 2]
2033 pextrw [r6], m3, 6
2034 .end:
2035 RET
2036
2037 ;-------------------------------------------------------------------------------------------
2038 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int filter)
2039 ;-------------------------------------------------------------------------------------------
; 32x32 DC intra prediction on 16-bit samples.
; r0 = dst, r1 = dstStride in samples (doubled into a byte stride),
; r2 = reference samples.
; The 32 samples at r2 + 2 and the 32 samples at r2 + 130 are averaged with
; rounding, (sum + 32) >> 6, and the whole block is filled with that value.
; No edge filtering is performed for this block size.
INIT_XMM sse4
cglobal intra_pred_dc32, 3, 5, 6
    lea         r3, [r2 + 130]          ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)
    add         r2, 2
    add         r1, r1                  ; stride in bytes

    ; sum of the 32 samples at r3
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    movu        m2, [r3 + 32]
    movu        m3, [r3 + 48]
    paddw       m0, m1                  ; dynamic range 13 bits
    paddw       m2, m3
    paddw       m0, m2                  ; dynamic range 14 bits

    ; sum of the 32 samples at r2
    movu        m1, [r2]
    movu        m3, [r2 + 16]
    movu        m4, [r2 + 32]
    movu        m5, [r2 + 48]
    paddw       m1, m3                  ; dynamic range 13 bits
    paddw       m4, m5
    paddw       m1, m4                  ; dynamic range 14 bits

    ; horizontal reduction to a single dword
    paddw       m0, m1                  ; dynamic range 15 bits
    pmaddwd     m0, [pw_1]
    movhlps     m1, m0
    paddd       m0, m1
    phaddd      m0, m0

    paddd       m0, [pd_32]             ; sum = sum + 32
    psrld       m0, 6                   ; sum = sum / 64
    pshuflw     m0, m0, 0
    pshufd      m0, m0, 0               ; dc value in every word

    lea         r2, [r1 * 3]
    mov         r3d, 8                  ; 8 passes of 4 rows each
.loop:
    ; store DC, four rows of 64 bytes per pass
    movu        [r0 + 0], m0
    movu        [r0 + 16], m0
    movu        [r0 + 32], m0
    movu        [r0 + 48], m0
    movu        [r0 + r1 + 0], m0
    movu        [r0 + r1 + 16], m0
    movu        [r0 + r1 + 32], m0
    movu        [r0 + r1 + 48], m0
    movu        [r0 + r1 * 2 + 0], m0
    movu        [r0 + r1 * 2 + 16], m0
    movu        [r0 + r1 * 2 + 32], m0
    movu        [r0 + r1 * 2 + 48], m0
    movu        [r0 + r2 + 0], m0
    movu        [r0 + r2 + 16], m0
    movu        [r0 + r2 + 32], m0
    movu        [r0 + r2 + 48], m0
    lea         r0, [r0 + r1 * 4]
    dec         r3d
    jnz         .loop
    RET
2111
2112 ;---------------------------------------------------------------------------------------
2113 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2114 ;---------------------------------------------------------------------------------------
; 4x4 planar intra prediction on 16-bit samples (SSE4 build).
; r0 = dst, r1 = dstStride in samples (doubled into a byte stride),
; r2 = srcPix with above[] at r2 + 2 and left[] at r2 + 18.
; m3 carries the running per-column term
;   (x + 1) * topRight + 4 + bottomLeft + 3 * above[x]
; and m4 = bottomLeft - above[x] is added to it after each of the first
; three rows (the update after the last row would never be read, so it is
; not emitted).  Each row adds left[y] * pw_planar4_0 and shifts right by 3.
INIT_XMM sse4
cglobal intra_pred_planar4, 3,3,5
    add         r1, r1                  ; stride in bytes
    movu        m1, [r2 + 2]            ; above[]
    movu        m2, [r2 + 18]           ; left[]
    pshufhw     m3, m1, 0               ; topRight
    pshufd      m3, m3, 0xAA
    pshufhw     m4, m2, 0               ; bottomLeft
    pshufd      m4, m4, 0xAA

    pmullw      m3, [multi_2Row]        ; (x + 1) * topRight
    pmullw      m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]

    paddw       m3, [pw_4]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1                  ; per-row increment
    mova        m0, [pw_planar4_0]      ; per-column weights for left[y]

    ; row 0
    pshuflw     m1, m2, 0
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0], m1

    ; row 1
    pshuflw     m1, m2, 01010101b
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]

    ; row 2
    pshuflw     m1, m2, 10101010b
    pmullw      m1, m0
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    movh        [r0], m1

    ; row 3
    pshuflw     m1, m2, 11111111b
    pmullw      m1, m0
    paddw       m1, m3
    psraw       m1, 3
    movh        [r0 + r1], m1
    RET
2163
2164 ;---------------------------------------------------------------------------------------
2165 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2166 ;---------------------------------------------------------------------------------------
; 8x8 planar intra prediction on 16-bit samples.
; r0 = dst, r1 = dstStride (doubled below into a byte stride), r2 = srcPix
; with above[] at r2 + 2 and left[] at r2 + 34.
; m3 carries the running per-column term
;   (x + 1) * topRight + 8 + bottomLeft + 7 * above[x]
; and m4 = bottomLeft - above[x] is added to it after every row.
2167 INIT_XMM sse4
2168 cglobal intra_pred_planar8, 3,3,5
2169 add r1, r1
2170 movu m1, [r2 + 2]
2171 movu m2, [r2 + 34]
2172
2173 movd m3, [r2 + 18] ; topRight = above[8];
2174 movd m4, [r2 + 50] ; bottomLeft = left[8];
2175
2176 pshuflw m3, m3, 0
2177 pshuflw m4, m4, 0
2178 pshufd m3, m3, 0 ; v_topRight
2179 pshufd m4, m4, 0 ; v_bottomLeft
2180
2181 pmullw m3, [multiL] ; (x + 1) * topRight
2182 pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x]
2183 paddw m3, [pw_8]
2184 paddw m3, m4
2185 paddw m3, m0
2186 psubw m4, m1
; NOTE(review): the second half of pw_planar16_mul presumably holds the
; per-column weights 7..0 applied to left[y] - confirm against the table.
2187 mova m0, [pw_planar16_mul + mmsize]
2188
; INTRA_PRED_PLANAR8 y: broadcast left[y] (word y of m2), multiply by the
; weights in m0, add the running term m3, advance m3 by m4, shift right by 4,
; store one 8-sample row and step r0 to the next row.
2189 %macro INTRA_PRED_PLANAR8 1
2190 %if (%1 < 4)
2191 pshuflw m1, m2, 0x55 * %1
2192 pshufd m1, m1, 0
2193 %else
2194 pshufhw m1, m2, 0x55 * (%1 - 4)
2195 pshufd m1, m1, 0xAA
2196 %endif
2197 pmullw m1, m0
2198 paddw m1, m3
2199 paddw m3, m4
2200 psraw m1, 4
2201 movu [r0], m1
2202 lea r0, [r0 + r1]
2203 %endmacro
2204
2205 INTRA_PRED_PLANAR8 0
2206 INTRA_PRED_PLANAR8 1
2207 INTRA_PRED_PLANAR8 2
2208 INTRA_PRED_PLANAR8 3
2209 INTRA_PRED_PLANAR8 4
2210 INTRA_PRED_PLANAR8 5
2211 INTRA_PRED_PLANAR8 6
2212 INTRA_PRED_PLANAR8 7
2213 RET
2214
2215 ;---------------------------------------------------------------------------------------
2216 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2217 ;---------------------------------------------------------------------------------------
; 16x16 planar intra prediction on 16-bit samples.
; r0 = dst, r1 = dstStride (doubled below into a byte stride), r2 = srcPix
; with above[] at r2 + 2 and left[] at r2 + 66.
; m3 / m4 carry the running per-column terms for columns 0..7 / 8..15:
;   (x + 1) * topRight + 16 + bottomLeft + 15 * above[x]
; m6 / m1 hold bottomLeft - above[x] for the same column groups and are added
; to the running terms after every row.
2218 INIT_XMM sse4
2219 cglobal intra_pred_planar16, 3,3,8
2220 add r1, r1
2221 movu m2, [r2 + 2]
2222 movu m7, [r2 + 18]
2223
2224 movd m3, [r2 + 34] ; topRight = above[16]
2225 movd m6, [r2 + 98] ; bottomLeft = left[16]
2226
2227 pshuflw m3, m3, 0
2228 pshuflw m6, m6, 0
2229 pshufd m3, m3, 0 ; v_topRight
2230 pshufd m6, m6, 0 ; v_bottomLeft
2231
2232 pmullw m4, m3, [multiH] ; (x + 1) * topRight
2233 pmullw m3, [multiL] ; (x + 1) * topRight
2234 pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
2235 pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x]
2236 paddw m4, [pw_16]
2237 paddw m3, [pw_16]
2238 paddw m4, m6
2239 paddw m3, m6
2240 paddw m4, m5
2241 paddw m3, m1
2242 psubw m1, m6, m7
2243 psubw m6, m2
2244
; from here on m2 / m7 hold left[0..7] / left[8..15]
2245 movu m2, [r2 + 66]
2246 movu m7, [r2 + 82]
2247
; INTRA_PRED_PLANAR16 y: broadcast left[y] (taken from m2 for y < 8, from m7
; otherwise), multiply by the two halves of pw_planar16_mul, add the running
; terms, advance them, shift right by 5, store one 16-sample row and step r0
; to the next row.
2248 %macro INTRA_PRED_PLANAR16 1
2249 %if (%1 < 4)
2250 pshuflw m5, m2, 0x55 * %1
2251 pshufd m5, m5, 0
2252 %else
2253 %if (%1 < 8)
2254 pshufhw m5, m2, 0x55 * (%1 - 4)
2255 pshufd m5, m5, 0xAA
2256 %else
2257 %if (%1 < 12)
2258 pshuflw m5, m7, 0x55 * (%1 - 8)
2259 pshufd m5, m5, 0
2260 %else
2261 pshufhw m5, m7, 0x55 * (%1 - 12)
2262 pshufd m5, m5, 0xAA
2263 %endif
2264 %endif
2265 %endif
2266 pmullw m0, m5, [pw_planar16_mul + mmsize]
2267 pmullw m5, [pw_planar16_mul]
2268 paddw m0, m4
2269 paddw m5, m3
2270 paddw m3, m6
2271 paddw m4, m1
2272 psraw m5, 5
2273 psraw m0, 5
2274 movu [r0], m5
2275 movu [r0 + 16], m0
2276 lea r0, [r0 + r1]
2277 %endmacro
2278
2279 INTRA_PRED_PLANAR16 0
2280 INTRA_PRED_PLANAR16 1
2281 INTRA_PRED_PLANAR16 2
2282 INTRA_PRED_PLANAR16 3
2283 INTRA_PRED_PLANAR16 4
2284 INTRA_PRED_PLANAR16 5
2285 INTRA_PRED_PLANAR16 6
2286 INTRA_PRED_PLANAR16 7
2287 INTRA_PRED_PLANAR16 8
2288 INTRA_PRED_PLANAR16 9
2289 INTRA_PRED_PLANAR16 10
2290 INTRA_PRED_PLANAR16 11
2291 INTRA_PRED_PLANAR16 12
2292 INTRA_PRED_PLANAR16 13
2293 INTRA_PRED_PLANAR16 14
2294 INTRA_PRED_PLANAR16 15
2295 RET
2296
2297 ;---------------------------------------------------------------------------------------
2298 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2299 ;---------------------------------------------------------------------------------------
; 32x32 planar intra prediction on 16-bit samples, computed in 32-bit lanes.
; r0 = dst, r1 = dstStride (doubled below into a byte stride), r2 = srcPix
; with above[] at r2 + 2 and left[] at r2 + 130.
; Register plan (four columns per vector):
;   m0..m3, m12..m15  running terms for columns 0..15 / 16..31:
;                     (x + 1) * topRight + bottomLeft + 32 + 31 * above[x]
;   m8..m11, m16..m19 per-row increments, bottomLeft - above[x]
; "Registers" beyond the hardware set are %defined as slots in a 64-byte
; aligned stack area (m16..m19 on x86-64, m8..m19 otherwise); the original
; stack pointer is kept in r6 and restored before RET.
2300 INIT_XMM sse4
2301 %if ARCH_X86_64 == 1
2302 cglobal intra_pred_planar32, 3,7,16
2303 ; NOTE: align stack to 64 bytes, so all of local data in same cache line
2304 mov r6, rsp
2305 sub rsp, 4*mmsize
2306 and rsp, ~63
2307 %define m16 [rsp + 0 * mmsize]
2308 %define m17 [rsp + 1 * mmsize]
2309 %define m18 [rsp + 2 * mmsize]
2310 %define m19 [rsp + 3 * mmsize]
2311 %else
2312 cglobal intra_pred_planar32, 3,7,8
2313 ; NOTE: align stack to 64 bytes, so all of local data in same cache line
2314 mov r6, rsp
2315 sub rsp, 12*mmsize
2316 and rsp, ~63
2317 %define m8 [rsp + 0 * mmsize]
2318 %define m9 [rsp + 1 * mmsize]
2319 %define m10 [rsp + 2 * mmsize]
2320 %define m11 [rsp + 3 * mmsize]
2321 %define m12 [rsp + 4 * mmsize]
2322 %define m13 [rsp + 5 * mmsize]
2323 %define m14 [rsp + 6 * mmsize]
2324 %define m15 [rsp + 7 * mmsize]
2325 %define m16 [rsp + 8 * mmsize]
2326 %define m17 [rsp + 9 * mmsize]
2327 %define m18 [rsp + 10 * mmsize]
2328 %define m19 [rsp + 11 * mmsize]
2329 %endif
2330 add r1, r1
; NOTE(review): planar32_table1 presumably holds the dword factors 1..32
; for the (x + 1) * topRight term - confirm against the table definition.
2331 lea r5, [planar32_table1]
2332
2333 movzx r3d, word [r2 + 66] ; topRight = above[32]
2334 movd m7, r3d
2335 pshufd m7, m7, 0 ; v_topRight
2336
2337 pmulld m0, m7, [r5 + 0 ] ; (x + 1) * topRight
2338 pmulld m1, m7, [r5 + 16 ]
2339 pmulld m2, m7, [r5 + 32 ]
2340 pmulld m3, m7, [r5 + 48 ]
2341 pmulld m4, m7, [r5 + 64 ]
2342 pmulld m5, m7, [r5 + 80 ]
2343 pmulld m6, m7, [r5 + 96 ]
2344 pmulld m7, m7, [r5 + 112]
2345
; park the terms for columns 16..31 so m4..m7 can be reused as scratch
2346 mova m12, m4
2347 mova m13, m5
2348 mova m14, m6
2349 mova m15, m7
2350
2351 movzx r3d, word [r2 + 194] ; bottomLeft = left[32]
2352 movd m6, r3d
2353 pshufd m6, m6, 0 ; v_bottomLeft
2354
; add bottomLeft and the rounding constant 32 to every running term
2355 paddd m0, m6
2356 paddd m1, m6
2357 paddd m2, m6
2358 paddd m3, m6
2359 paddd m0, [pd_32]
2360 paddd m1, [pd_32]
2361 paddd m2, [pd_32]
2362 paddd m3, [pd_32]
2363
2364 mova m4, m12
2365 mova m5, m13
2366 paddd m4, m6
2367 paddd m5, m6
2368 paddd m4, [pd_32]
2369 paddd m5, [pd_32]
2370 mova m12, m4
2371 mova m13, m5
2372
2373 mova m4, m14
2374 mova m5, m15
2375 paddd m4, m6
2376 paddd m5, m6
2377 paddd m4, [pd_32]
2378 paddd m5, [pd_32]
2379 mova m14, m4
2380 mova m15, m5
2381
2382 ; above[0-3] * (blkSize - 1 - y)
2383 pmovzxwd m4, [r2 + 2]
2384 pmulld m5, m4, [pd_31]
2385 paddd m0, m5
2386 psubd m5, m6, m4
2387 mova m8, m5
2388
2389 ; above[4-7] * (blkSize - 1 - y)
2390 pmovzxwd m4, [r2 + 10]
2391 pmulld m5, m4, [pd_31]
2392 paddd m1, m5
2393 psubd m5, m6, m4
2394 mova m9, m5
2395
2396 ; above[8-11] * (blkSize - 1 - y)
2397 pmovzxwd m4, [r2 + 18]
2398 pmulld m5, m4, [pd_31]
2399 paddd m2, m5
2400 psubd m5, m6, m4
2401 mova m10, m5
2402
2403 ; above[12-15] * (blkSize - 1 - y)
2404 pmovzxwd m4, [r2 + 26]
2405 pmulld m5, m4, [pd_31]
2406 paddd m3, m5
2407 psubd m5, m6, m4
2408 mova m11, m5
2409
2410 ; above[16-19] * (blkSize - 1 - y)
2411 pmovzxwd m4, [r2 + 34]
2412 mova m7, m12
2413 pmulld m5, m4, [pd_31]
2414 paddd m7, m5
2415 mova m12, m7
2416 psubd m5, m6, m4
2417 mova m16, m5
2418
2419 ; above[20-23] * (blkSize - 1 - y)
2420 pmovzxwd m4, [r2 + 42]
2421 mova m7, m13
2422 pmulld m5, m4, [pd_31]
2423 paddd m7, m5
2424 mova m13, m7
2425 psubd m5, m6, m4
2426 mova m17, m5
2427
2428 ; above[24-27] * (blkSize - 1 - y)
2429 pmovzxwd m4, [r2 + 50]
2430 mova m7, m14
2431 pmulld m5, m4, [pd_31]
2432 paddd m7, m5
2433 mova m14, m7
2434 psubd m5, m6, m4
2435 mova m18, m5
2436
2437 ; above[28-31] * (blkSize - 1 - y)
2438 pmovzxwd m4, [r2 + 58]
2439 mova m7, m15
2440 pmulld m5, m4, [pd_31]
2441 paddd m7, m5
2442 mova m15, m7
2443 psubd m5, m6, m4
2444 mova m19, m5
2445
; r2 now walks left[]; r5 points at the per-column weights applied to left[y]
; NOTE(review): planar32_table presumably holds the dword factors 31..0 -
; confirm against the table definition.
2446 add r2, 130 ; (2 * blkSize + 1)
2447 lea r5, [planar32_table]
2448
; INTRA_PRED_PLANAR32: produce one 32-sample row.  Broadcasts left[y],
; multiplies it by the eight weight vectors at r5, adds the running terms,
; advances the running terms by their increments, shifts right by 6, packs
; to words and stores; then steps r0 to the next row and r2 to left[y + 1].
2449 %macro INTRA_PRED_PLANAR32 0
2450 movzx r3d, word [r2]
2451 movd m4, r3d
2452 pshufd m4, m4, 0
2453
2454 pmulld m5, m4, [r5]
2455 pmulld m6, m4, [r5 + 16]
2456 paddd m5, m0
2457 paddd m6, m1
2458 paddd m0, m8
2459 paddd m1, m9
2460 psrad m5, 6
2461 psrad m6, 6
2462 packusdw m5, m6
2463 movu [r0], m5
2464
2465 pmulld m5, m4, [r5 + 32]
2466 pmulld m6, m4, [r5 + 48]
2467 paddd m5, m2
2468 paddd m6, m3
2469 paddd m2, m10
2470 paddd m3, m11
2471 psrad m5, 6
2472 psrad m6, 6
2473 packusdw m5, m6
2474 movu [r0 + 16], m5
2475
2476 pmulld m5, m4, [r5 + 64]
2477 pmulld m6, m4, [r5 + 80]
2478 paddd m5, m12
2479 paddd m6, m13
2480 psrad m5, 6
2481 psrad m6, 6
2482 packusdw m5, m6
2483 movu [r0 + 32], m5
2484 mova m5, m12
2485 mova m6, m13
2486 paddd m5, m16
2487 paddd m6, m17
2488 mova m12, m5
2489 mova m13, m6
2490
2491 pmulld m5, m4, [r5 + 96]
2492 pmulld m4, [r5 + 112]
2493 paddd m5, m14
2494 paddd m4, m15
2495 psrad m5, 6
2496 psrad m4, 6
2497 packusdw m5, m4
2498 movu [r0 + 48], m5
2499 mova m4, m14
2500 mova m5, m15
2501 paddd m4, m18
2502 paddd m5, m19
2503 mova m14, m4
2504 mova m15, m5
2505
2506 lea r0, [r0 + r1]
2507 add r2, 2
2508 %endmacro
2509
; 8 iterations of 4 rows each = 32 rows
2510 mov r4, 8
2511 .loop:
2512 INTRA_PRED_PLANAR32
2513 INTRA_PRED_PLANAR32
2514 INTRA_PRED_PLANAR32
2515 INTRA_PRED_PLANAR32
2516 dec r4
2517 jnz .loop
; restore the stack pointer saved before the manual alignment
2518 mov rsp, r6
2519 RET
2520
2521 ;-----------------------------------------------------------------------------------------
2522 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
2523 ;-----------------------------------------------------------------------------------------
; 4x4 angular intra prediction for the two exact-diagonal modes 2 and 34
; (16-bit samples).
; r0 = dst, r1 = dstStride in samples, r2 = srcPix, r3m = dirMode.
; The source is srcPix + 4 when dirMode == 34 and srcPix + 20 otherwise;
; row y is the four samples starting y samples into that source.
INIT_XMM ssse3
cglobal intra_pred_ang4_2, 3,5,4
    add         r1, r1                  ; stride in bytes (done before the compare: add alters flags)
    lea         r4, [r2 + 4]            ; source for mode 34
    add         r2, 20                  ; source for every other mode
    cmp         r3m, byte 34
    cmove       r2, r4

    movu        m0, [r2]
    palignr     m1, m0, 2               ; low half = source shifted by one sample
    palignr     m2, m0, 4               ; low half = source shifted by two samples

    movh        [r0], m0                ; row 0
    movh        [r0 + r1], m1           ; row 1
    movh        [r0 + r1 * 2], m2       ; row 2
    psrldq      m0, 6
    lea         r1, [r1 * 3]
    movh        [r0 + r1], m0           ; row 3
    RET
2542
; 4x4 angular intra prediction for modes 3 and 33 (16-bit samples), plus the
; shared tail .do_filter4x4 that the following ang4 functions jump into.
; r0 = dst, r1 = dstStride in samples, r2 = srcPix, r3m = dirMode.
; The reference row is read at srcPix + 2 when dirMode == 33 and at
; srcPix + 18 otherwise.
; Contract of .do_filter4x4 as used here:
;   m2..m5  sample-pair vectors for output lines 0..3
;   m0, m1, m6, m7  ang_table weight rows for lines 0..3
;   ZF      still holds the result of the dirMode compare; when it is set
;           the lines are stored as rows, otherwise they are transposed first
2543 INIT_XMM sse4
2544 cglobal intra_pred_ang4_3, 3,5,8
2545 mov r4, 2
; NOTE(review): the compare must precede "mov r3, 18" - with register-passed
; arguments r3m may be the same register as r3 (confirm in x86inc.asm).
2546 cmp r3m, byte 33
2547 mov r3, 18
2548 cmove r3, r4
2549
2550 movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
2551 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
2552 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
2553 palignr m5, m0, 4 ; [x x 8 7 6 5 4 3]
2554 punpcklwd m3, m1, m5 ; [6 5 5 4 4 3 3 2]
2555 palignr m1, m0, 6 ; [x x x 8 7 6 5 4]
2556 punpcklwd m4, m5 ,m1 ; [7 6 6 5 5 4 4 3]
2557 movhlps m0, m0 ; [x x x x 8 7 6 5]
2558 punpcklwd m5, m1, m0 ; [8 7 7 6 6 5 5 4]
2559
; weight rows are addressed relative to ang_table entry 20
2560 lea r3, [ang_table + 20 * 16]
2561 mova m0, [r3 + 6 * 16] ; [26]
2562 mova m1, [r3] ; [20]
2563 mova m6, [r3 - 6 * 16] ; [14]
2564 mova m7, [r3 - 12 * 16] ; [ 8]
2565 jmp .do_filter4x4
2566
2567 ALIGN 16
2568 .do_filter4x4:
; each line: (pairs * weights + 16) >> 5, then pack two lines per register
2569 pmaddwd m2, m0
2570 paddd m2, [pd_16]
2571 psrld m2, 5
2572
2573 pmaddwd m3, m1
2574 paddd m3, [pd_16]
2575 psrld m3, 5
2576 packusdw m2, m3
2577
2578 pmaddwd m4, m6
2579 paddd m4, [pd_16]
2580 psrld m4, 5
2581
2582 pmaddwd m5, m7
2583 paddd m5, [pd_16]
2584 psrld m5, 5
2585 packusdw m4, m5
2586
; ZF comes from the dirMode compare in the entry code; none of the
; instructions executed since then modify the flags
2587 jz .store
2588
2589 ; transpose 4x4
2590 punpckhwd m0, m2, m4
2591 punpcklwd m2, m4
2592 punpckhwd m4, m2, m0
2593 punpcklwd m2, m0
2594
2595 .store:
; r1 becomes the byte stride; m2 holds rows 0/1 and m4 rows 2/3
2596 add r1, r1
2597 movh [r0], m2
2598 movhps [r0 + r1], m2
2599 movh [r0 + r1 * 2], m4
2600 lea r1, [r1 * 3]
2601 movhps [r0 + r1], m4
2602 RET
2603
; 4x4 angular intra prediction for modes 4 and 32 (16-bit samples).
; r2 = srcPix, r3m = dirMode.  The reference row is read at srcPix + 2 when
; dirMode == 32 and at srcPix + 18 otherwise.
; Prepares the sample-pair vectors m2..m5 and the ang_table weight rows
; m0, m1, m6, m7 for lines 0..3, then tail-jumps into the .do_filter4x4 code
; of intra_pred_ang4_3 with the flags of the dirMode compare still intact
; (nothing after the compare may modify the flags).
cglobal intra_pred_ang4_4, 3,5,8
    mov         r4, 2
    cmp         r3m, byte 32            ; must come before r3 is overwritten
    mov         r3, 18
    cmove       r3, r4

    movu        m0, [r2 + r3]           ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 2               ; [x 8 7 6 5 4 3 2]
    palignr     m6, m0, 4               ; [x x 8 7 6 5 4 3]
    palignr     m7, m0, 6               ; [x x x 8 7 6 5 4]
    punpcklwd   m2, m0, m1              ; [5 4 4 3 3 2 2 1]
    punpcklwd   m3, m1, m6              ; [6 5 5 4 4 3 3 2]
    punpcklwd   m5, m6, m7              ; [7 6 6 5 5 4 4 3]
    mova        m4, m3

    lea         r3, [ang_table + 18 * 16]
    mova        m7, [r3 +  2 * 16]      ; [20]
    mova        m6, [r3 + 13 * 16]      ; [31]
    mova        m1, [r3 -  8 * 16]      ; [10]
    mova        m0, [r3 +  3 * 16]      ; [21]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2625
; 4x4 angular prediction with per-row ang_table weights 17, 2, 19, 4.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 31 reads the reference from src + 2, otherwise from src + 18.
; Only the operands are prepared here; the arithmetic, optional transpose and
; store happen in intra_pred_ang4_3's .do_filter4x4, which also consumes the
; ZF produced by the cmp below (nothing in between writes the flags).
; Rows 1 and 2 use the same reference pairs (m4 is a copy of m3).
2626 cglobal intra_pred_ang4_5, 3,5,8
2627 mov r4, 2
2628 cmp r3m, byte 31
2629 mov r3, 18
2630 cmove r3, r4
2631
2632 movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
2633 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
2634 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
2635 palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
2636 punpcklwd m3, m1, m6 ; [6 5 5 4 4 3 3 2]
2637 mova m4, m3
2638 palignr m7, m0, 6 ; [x x x 8 7 6 5 4]
2639 punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3]
2640
; weights for rows 0..3; the bracketed number is the ang_table entry index
2641 lea r3, [ang_table + 10 * 16]
2642 mova m0, [r3 + 7 * 16] ; [17]
2643 mova m1, [r3 - 8 * 16] ; [ 2]
2644 mova m6, [r3 + 9 * 16] ; [19]
2645 mova m7, [r3 - 6 * 16] ; [ 4]
2646 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2647
; 4x4 angular prediction with per-row ang_table weights 13, 26, 7, 20.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 30 reads the reference from src + 2, otherwise from src + 18.
; Only the operands are prepared here; the arithmetic, optional transpose and
; store happen in intra_pred_ang4_3's .do_filter4x4, which also consumes the
; ZF produced by the cmp below (nothing in between writes the flags).
; Rows 0/1 share one set of reference pairs and rows 2/3 share the next one.
2648 cglobal intra_pred_ang4_6, 3,5,8
2649 mov r4, 2
2650 cmp r3m, byte 30
2651 mov r3, 18
2652 cmove r3, r4
2653
2654 movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
2655 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
2656 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
2657 mova m3, m2
2658 palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
2659 punpcklwd m4, m1, m6 ; [6 5 5 4 4 3 3 2]
2660 mova m5, m4
2661
; weights for rows 0..3; the bracketed number is the ang_table entry index
2662 lea r3, [ang_table + 19 * 16]
2663 mova m0, [r3 - 6 * 16] ; [13]
2664 mova m1, [r3 + 7 * 16] ; [26]
2665 mova m6, [r3 - 12 * 16] ; [ 7]
2666 mova m7, [r3 + 1 * 16] ; [20]
2667 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2668
; 4x4 angular prediction with per-row ang_table weights 9, 18, 27, 4.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 29 reads the reference from src + 2, otherwise from src + 18.
; Only the operands are prepared here; the arithmetic, optional transpose and
; store happen in intra_pred_ang4_3's .do_filter4x4, which also consumes the
; ZF produced by the cmp below (nothing in between writes the flags).
; Rows 0..2 share one set of reference pairs; row 3 uses the next one.
2669 cglobal intra_pred_ang4_7, 3,5,8
2670 mov r4, 2
2671 cmp r3m, byte 29
2672 mov r3, 18
2673 cmove r3, r4
2674
2675 movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
2676 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
2677 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
2678 mova m3, m2
2679 mova m4, m2
2680 palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
2681 punpcklwd m5, m1, m6 ; [6 5 5 4 4 3 3 2]
2682
; weights for rows 0..3; the bracketed number is the ang_table entry index
2683 lea r3, [ang_table + 20 * 16]
2684 mova m0, [r3 - 11 * 16] ; [ 9]
2685 mova m1, [r3 - 2 * 16] ; [18]
2686 mova m6, [r3 + 7 * 16] ; [27]
2687 mova m7, [r3 - 16 * 16] ; [ 4]
2688 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2689
; 4x4 angular prediction with per-row ang_table weights 5, 10, 15, 20.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 28 reads the reference from src + 2, otherwise from src + 18.
; Only the operands are prepared here; the arithmetic, optional transpose and
; store happen in intra_pred_ang4_3's .do_filter4x4, which also consumes the
; ZF produced by the cmp below (nothing in between writes the flags).
; All four rows use the same reference pairs; only the weights differ.
2690 cglobal intra_pred_ang4_8, 3,5,8
2691 mov r4, 2
2692 cmp r3m, byte 28
2693 mov r3, 18
2694 cmove r3, r4
2695
2696 movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
2697 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
2698 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
2699 mova m3, m2
2700 mova m4, m2
2701 mova m5, m2
2702
; weights for rows 0..3; the bracketed number is the ang_table entry index
2703 lea r3, [ang_table + 13 * 16]
2704 mova m0, [r3 - 8 * 16] ; [ 5]
2705 mova m1, [r3 - 3 * 16] ; [10]
2706 mova m6, [r3 + 2 * 16] ; [15]
2707 mova m7, [r3 + 7 * 16] ; [20]
2708 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2709
; 4x4 angular prediction with per-row ang_table weights 2, 4, 6, 8.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 27 reads the reference from src + 2, otherwise from src + 18.
; Only the operands are prepared here; the arithmetic, optional transpose and
; store happen in intra_pred_ang4_3's .do_filter4x4, which also consumes the
; ZF produced by the cmp below (nothing in between writes the flags).
; All four rows use the same reference pairs; only the weights differ.
2710 cglobal intra_pred_ang4_9, 3,5,8
2711 mov r4, 2
2712 cmp r3m, byte 27
2713 mov r3, 18
2714 cmove r3, r4
2715
2716 movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
2717 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
2718 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
2719 mova m3, m2
2720 mova m4, m2
2721 mova m5, m2
2722
; weights for rows 0..3; the bracketed number is the ang_table entry index
2723 lea r3, [ang_table + 4 * 16]
2724 mova m0, [r3 - 2 * 16] ; [ 2]
2725 mova m1, [r3 - 0 * 16] ; [ 4]
2726 mova m6, [r3 + 2 * 16] ; [ 6]
2727 mova m7, [r3 + 4 * 16] ; [ 8]
2728 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2729
; 4x4 prediction that fills row k with sample k of the four words at src + 18.
; r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter; dirMode is not read.
; Rows 1..3 are stored immediately.  Row 0 is stored last at .quit so that,
; when bFilter is non-zero, it can first be adjusted per column i by
;   (src[1 + i] - src[0]) >> 1   (arithmetic shift)
; and clamped to the range [0, pw_pixel_max].
; pb_unpackwq1 / pb_unpackwq2 are shuffle masks defined elsewhere; their
; effect is taken from the existing lane comments.
2730 cglobal intra_pred_ang4_10, 3,3,4
2731 movh m0, [r2 + 18] ; [4 3 2 1]
2732 pshufb m2, m0, [pb_unpackwq2] ; [4 4 4 4 3 3 3 3]
2733 pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
2734 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
2735 movhlps m1, m0 ; [2 2 2 2]
2736 movhlps m3, m2 ; [4 4 4 4]
2737 movh [r0 + r1], m1
2738 movh [r0 + r1 * 2], m2
2739 lea r1, [r1 * 3]
2740 movh [r0 + r1], m3
2741
2742 cmp r4m, byte 0
2743 jz .quit
2744
2745 ; filter
2746 movu m1, [r2] ; [7 6 5 4 3 2 1 0]
2747 pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
2748 palignr m1, m1, 2 ; [4 3 2 1]
2749 psubw m1, m2
2750 psraw m1, 1
2751 paddw m0, m1
2752 pxor m1, m1
2753 pmaxsw m0, m1 ; clamp below at 0
2754 pminsw m0, [pw_pixel_max] ; clamp above at pw_pixel_max
2755 .quit:
2756 movh [r0], m0 ; row 0 (filtered or not)
2757 RET
2758
; 4x4 prediction that copies the four words at src + 2 into every row.
; r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter; dirMode is not read.
; When bFilter is non-zero, column 0 is then overwritten: for row k it becomes
;   src[1] + ((w[k] - src[0]) >> 1)   (arithmetic shift)
; clamped to [0, pw_pixel_max], where w[k] are the four words at src + 18.
; pb_unpackwq1 is a shuffle mask defined elsewhere; its effect is taken from
; the existing lane comments.
2759 cglobal intra_pred_ang4_26, 3,4,3
2760 movh m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2761 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
2762 ; store
2763 movh [r0], m0
2764 movh [r0 + r1], m0
2765 movh [r0 + r1 * 2], m0
2766 lea r3, [r1 * 3]
2767 movh [r0 + r3], m0
2768
2769 ; filter
2770 cmp r4m, byte 0
2771 jz .quit
2772
2773 pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
2774 movu m1, [r2 + 16]
2775 pinsrw m1, [r2], 0 ; [7 6 5 4 3 2 1 0]
2776 pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
2777 palignr m1, m1, 2 ; [4 3 2 1]
2778 psubw m1, m2
2779 psraw m1, 1
2780 paddw m0, m1
2781 pxor m1, m1
2782 pmaxsw m0, m1 ; clamp below at 0
2783 pminsw m0, [pw_pixel_max] ; clamp above at pw_pixel_max
2784
; write word k of m0 to column 0 of row k
2785 pextrw [r0], m0, 0
2786 pextrw [r0 + r1], m0, 1
2787 pextrw [r0 + r1 * 2], m0, 2
2788 pextrw [r0 + r3], m0, 3
2789 .quit:
2790 RET
2791
; 4x4 angular prediction with per-row ang_table weights 30, 28, 26, 24.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 25 reads the reference from src + 0, otherwise from src + 16;
; in both cases word 0 of the loaded vector is replaced by the word at src.
; Only the operands are prepared here; the arithmetic, optional transpose and
; store happen in intra_pred_ang4_3's .do_filter4x4, which also consumes the
; ZF produced by the cmp below (nothing in between writes the flags).
; All four rows use the same reference pairs; only the weights differ.
2792 cglobal intra_pred_ang4_11, 3,5,8
2793 xor r4, r4
2794 cmp r3m, byte 25
2795 mov r3, 16
2796 cmove r3, r4
2797
2798 movu m2, [r2 + r3] ; [x x x 4 3 2 1 0]
2799 pinsrw m2, [r2], 0
2800 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
2801 punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
2802 mova m3, m2
2803 mova m4, m2
2804 mova m5, m2
2805
; weights for rows 0..3; the bracketed number is the ang_table entry index
; (base entry 24 plus the offset in the operand)
2806 lea r3, [ang_table + 24 * 16]
2807 mova m0, [r3 + 6 * 16] ; [30]
2808 mova m1, [r3 + 4 * 16] ; [28]
2809 mova m6, [r3 + 2 * 16] ; [26]
2810 mova m7, [r3 + 0 * 16] ; [24]
2811 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2812
; 4x4 angular prediction with per-row ang_table weights 27, 22, 17, 12.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 24 reads the reference from src + 0, otherwise from src + 16;
; in both cases word 0 of the loaded vector is replaced by the word at src.
; Only the operands are prepared here; the arithmetic, optional transpose and
; store happen in intra_pred_ang4_3's .do_filter4x4, which also consumes the
; ZF produced by the cmp below (nothing in between writes the flags).
; All four rows use the same reference pairs; only the weights differ.
2813 cglobal intra_pred_ang4_12, 3,5,8
2814 xor r4, r4
2815 cmp r3m, byte 24
2816 mov r3, 16
2817 cmove r3, r4
2818
2819 movu m2, [r2 + r3] ; [x x x 4 3 2 1 0]
2820 pinsrw m2, [r2], 0
2821 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
2822 punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
2823 mova m3, m2
2824 mova m4, m2
2825 mova m5, m2
2826
; weights for rows 0..3; the bracketed number is the ang_table entry index
2827 lea r3, [ang_table + 20 * 16]
2828 mova m0, [r3 + 7 * 16] ; [27]
2829 mova m1, [r3 + 2 * 16] ; [22]
2830 mova m6, [r3 - 3 * 16] ; [17]
2831 mova m7, [r3 - 8 * 16] ; [12]
2832 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2833
; 4x4 angular prediction with per-row ang_table weights 23, 14, 5, 28.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 23: r4 = 0, r3 = 16; any other dirMode: r4 = 16, r3 = 0 (xchg).
; The main reference is read from src + r4 and one extra sample 'x' from
; src + r3 + 8 is placed in front of it.  Rows 0..2 share the unshifted pairs;
; row 3 uses the pairs shifted towards 'x'.
; The arithmetic, optional transpose and store happen in intra_pred_ang4_3's
; .do_filter4x4, which also consumes the ZF produced by the cmp below (xchg
; and the SIMD instructions in between do not write the flags).
2834 cglobal intra_pred_ang4_13, 3,5,8
2835 xor r4, r4
2836 cmp r3m, byte 23
2837 mov r3, 16
2838 jz .next
2839 xchg r3, r4
2840 .next:
; NOTE(review): with r4 == 0 this load starts 2 bytes before src.  Word 0 is
; replaced by the pinsrw from [r2 + r3 + 8] before m5 is consumed, but the
; read itself assumes those preceding bytes are accessible - confirm with caller.
2841 movu m5, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
2842 pinsrw m5, [r2], 1
2843 palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
2844 palignr m0, m5, 4 ; [x x x x 4 3 2 1]
2845 pinsrw m5, [r2 + r3 + 8], 0
2846 punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
2847 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
2848 mova m3, m2
2849 mova m4, m2
2850
; weights for rows 0..3; the bracketed number is the ang_table entry index
2851 lea r3, [ang_table + 21 * 16]
2852 mova m0, [r3 + 2 * 16] ; [23]
2853 mova m1, [r3 - 7 * 16] ; [14]
2854 mova m6, [r3 - 16 * 16] ; [ 5]
2855 mova m7, [r3 + 7 * 16] ; [28]
2856 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2857
; 4x4 angular prediction with per-row ang_table weights 19, 6, 25, 12.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 22: r4 = 0, r3 = 16; any other dirMode: r4 = 16, r3 = 0 (xchg).
; The main reference is read from src + r4 and one extra sample 'x' from
; src + r3 + 4 is placed in front of it.  Rows 0/1 share the unshifted pairs;
; rows 2/3 share the pairs shifted towards 'x'.
; The arithmetic, optional transpose and store happen in intra_pred_ang4_3's
; .do_filter4x4, which also consumes the ZF produced by the cmp below (xchg
; and the SIMD instructions in between do not write the flags).
2858 cglobal intra_pred_ang4_14, 3,5,8
2859 xor r4, r4
2860 cmp r3m, byte 22
2861 mov r3, 16
2862 jz .next
2863 xchg r3, r4
2864 .next:
; NOTE(review): with r4 == 0 this load starts 2 bytes before src.  Word 0 is
; replaced by the pinsrw from [r2 + r3 + 4] before m5 is consumed, but the
; read itself assumes those preceding bytes are accessible - confirm with caller.
2865 movu m5, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
2866 pinsrw m5, [r2], 1
2867 palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
2868 palignr m0, m5, 4 ; [x x x x 4 3 2 1]
2869 pinsrw m5, [r2 + r3 + 4], 0
2870 punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
2871 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
2872 mova m3, m2
2873 mova m4, m5
2874
; weights for rows 0..3; the bracketed number is the ang_table entry index
2875 lea r3, [ang_table + 19 * 16]
2876 mova m0, [r3 + 0 * 16] ; [19]
2877 mova m1, [r3 - 13 * 16] ; [ 6]
2878 mova m6, [r3 + 6 * 16] ; [25]
2879 mova m7, [r3 - 7 * 16] ; [12]
2880 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2881
; 4x4 angular prediction with per-row ang_table weights 15, 30, 13, 28.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 21: r4 = 0, r3 = 16; any other dirMode: r4 = 16, r3 = 0 (xchg).
; The main reference is read from src + r4; two extra samples, 'x' from
; src + r3 + 4 and 'y' from src + r3 + 8, are placed in front of it.
; Row 0 uses the unshifted pairs, rows 1/2 the pairs shifted by one sample,
; row 3 the pairs shifted by two.
; The arithmetic, optional transpose and store happen in intra_pred_ang4_3's
; .do_filter4x4, which also consumes the ZF produced by the cmp below (xchg
; and the SIMD instructions in between do not write the flags).
2882 cglobal intra_pred_ang4_15, 3,5,8
2883 xor r4, r4
2884 cmp r3m, byte 21
2885 mov r3, 16
2886 jz .next
2887 xchg r3, r4
2888 .next:
; NOTE(review): with r4 == 0 this load starts 2 bytes before src.  Word 0 is
; replaced by the pinsrw from [r2 + r3 + 4] before m3 is consumed, but the
; read itself assumes those preceding bytes are accessible - confirm with caller.
2889 movu m3, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
2890 pinsrw m3, [r2], 1
2891 palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
2892 palignr m0, m3, 4 ; [x x x x 4 3 2 1]
2893 pinsrw m3, [r2 + r3 + 4], 0
2894 pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
2895 pinsrw m5, [r2 + r3 + 8], 0
2896 punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
2897 punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
2898 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
2899 mova m4, m3
2900
; weights for rows 0..3; the bracketed number is the ang_table entry index
2901 lea r3, [ang_table + 23 * 16]
2902 mova m0, [r3 - 8 * 16] ; [15]
2903 mova m1, [r3 + 7 * 16] ; [30]
2904 mova m6, [r3 - 10 * 16] ; [13]
2905 mova m7, [r3 + 5 * 16] ; [28]
2906 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2907
; 4x4 angular prediction with per-row ang_table weights 11, 22, 1, 12.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 20: r4 = 0, r3 = 16; any other dirMode: r4 = 16, r3 = 0 (xchg).
; The main reference is read from src + r4; two extra samples, 'x' from
; src + r3 + 4 and 'y' from src + r3 + 6, are placed in front of it.
; Row 0 uses the unshifted pairs, rows 1/2 the pairs shifted by one sample,
; row 3 the pairs shifted by two.
; The arithmetic, optional transpose and store happen in intra_pred_ang4_3's
; .do_filter4x4, which also consumes the ZF produced by the cmp below (xchg
; and the SIMD instructions in between do not write the flags).
2908 cglobal intra_pred_ang4_16, 3,5,8
2909 xor r4, r4
2910 cmp r3m, byte 20
2911 mov r3, 16
2912 jz .next
2913 xchg r3, r4
2914 .next:
; NOTE(review): with r4 == 0 this load starts 2 bytes before src.  Word 0 is
; replaced by the pinsrw from [r2 + r3 + 4] before m3 is consumed, but the
; read itself assumes those preceding bytes are accessible - confirm with caller.
2915 movu m3, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x]
2916 pinsrw m3, [r2], 1
2917 palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
2918 palignr m0, m3, 4 ; [x x x x 4 3 2 1]
2919 pinsrw m3, [r2 + r3 + 4], 0
2920 pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
2921 pinsrw m5, [r2 + r3 + 6], 0
2922 punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
2923 punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
2924 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
2925 mova m4, m3
2926
; weights for rows 0..3; the bracketed number is the ang_table entry index
2927 lea r3, [ang_table + 19 * 16]
2928 mova m0, [r3 - 8 * 16] ; [11]
2929 mova m1, [r3 + 3 * 16] ; [22]
2930 mova m6, [r3 - 18 * 16] ; [ 1]
2931 mova m7, [r3 - 7 * 16] ; [12]
2932 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2933
; 4x4 angular prediction with per-row ang_table weights 6, 12, 18, 24.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; dirMode == 19: r4 = 0, r3 = 16; any other dirMode: r4 = 16, r3 = 0 (xchg).
; The main reference is read from src + r4; three extra samples, 'x' from
; src + r3 + 2, 'y' from src + r3 + 4 and 'z' from src + r3 + 8, are placed
; in front of it.  Each row uses pairs shifted by one more sample than the
; row before (m2, m3, m4, m5 for rows 0..3).
; The arithmetic, optional transpose and store happen in intra_pred_ang4_3's
; .do_filter4x4, which also consumes the ZF produced by the cmp below (xchg
; and the SIMD instructions in between do not write the flags).
2934 cglobal intra_pred_ang4_17, 3,5,8
2935 xor r4, r4
2936 cmp r3m, byte 19
2937 mov r3, 16
2938 jz .next
2939 xchg r3, r4
2940 .next:
; NOTE(review): with r4 == 0 this load starts 2 bytes before src.  Word 0 is
; replaced by the pinsrw from [r2 + r3 + 2] before m6 is consumed, but the
; read itself assumes those preceding bytes are accessible - confirm with caller.
2941 movu m6, [r2 + r4 - 2] ; [- - 4 3 2 1 0 x]
2942 pinsrw m6, [r2], 1
2943 palignr m2, m6, 2 ; [- - - 4 3 2 1 0]
2944 palignr m1, m6, 4 ; [- - - - 4 3 2 1]
2945 mova m4, m2
2946 punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
2947
2948 pinsrw m6, [r2 + r3 + 2], 0
2949 punpcklwd m3, m6, m4 ; [3 2 2 1 1 0 0 x]
2950
2951 pslldq m4, m6, 2 ; [- 4 3 2 1 0 x y]
2952 pinsrw m4, [r2 + r3 + 4], 0
2953 pslldq m5, m4, 2 ; [4 3 2 1 0 x y z]
2954 pinsrw m5, [r2 + r3 + 8], 0
2955 punpcklwd m5, m4 ; [1 0 0 x x y y z]
2956 punpcklwd m4, m6 ; [2 1 1 0 0 x x y]
2957
; weights for rows 0..3; the bracketed number is the ang_table entry index
2958 lea r3, [ang_table + 14 * 16]
2959 mova m0, [r3 - 8 * 16] ; [ 6]
2960 mova m1, [r3 - 2 * 16] ; [12]
2961 mova m6, [r3 + 4 * 16] ; [18]
2962 mova m7, [r3 + 10 * 16] ; [24]
2963 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
2964
; 4x4 prediction without interpolation, built from one 8-word vector:
;   low half  = the four words at src + 16 with word 0 replaced by the word
;               at src, then shuffled with pw_swap (mask defined elsewhere;
;               presumably reverses the word order - confirm)
;   high half = the four words at src + 2
; Rows are written bottom-up: row 3 takes words 0..3, and each row above it
; takes the window moved up by one word (row 0 = words 3..6).
; r0 = dst, r1 = dstStride, r2 = src; dirMode and bFilter are not read.
2965 cglobal intra_pred_ang4_18, 3,3,1
2966 movh m0, [r2 + 16]
2967 pinsrw m0, [r2], 0
2968 pshufb m0, [pw_swap]
2969 movhps m0, [r2 + 2]
2970 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
2971 lea r2, [r1 * 3] ; src pointer no longer needed; r2 = 3 * stride
2972 movh [r0 + r2], m0
2973 psrldq m0, 2
2974 movh [r0 + r1 * 2], m0
2975 psrldq m0, 2
2976 movh [r0 + r1], m0
2977 psrldq m0, 2
2978 movh [r0], m0
2979 RET
2980
2981 ;-----------------------------------------------------------------------------------------
2982 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
2983 ;-----------------------------------------------------------------------------------------
; 8x8 prediction without interpolation: every output row is the 16-word
; reference window advanced by one more 16-bit sample.
; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode (argument order of the
; intraPredAng8 prototype comment directly above).
; dirMode == 34 reads the reference starting at src + 4 bytes, any other
; dirMode reads it starting at src + 32 + 4 bytes.
2984 INIT_XMM ssse3
2985 cglobal intra_pred_ang8_2, 3,5,3
; plain register copy (was "lea r4, [r2]"); like lea, mov leaves the flags alone
2986 mov r4, r2
2987 add r2, 32
2988 cmp r3m, byte 34
2989 cmove r2, r4 ; dirMode == 34: fall back to the original src
2990 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
2991 lea r3, [r1 * 3] ; dirMode already consumed; r3 = 3 * stride
2992 movu m0, [r2 + 4] ; reference words 0..7 of the window
2993 movu m1, [r2 + 20] ; reference words 8..15 of the window
2994 movu [r0], m0 ; row 0
2995 palignr m2, m1, m0, 2
2996 movu [r0 + r1], m2 ; row 1
2997 palignr m2, m1, m0, 4
2998 movu [r0 + r1 * 2], m2 ; row 2
2999 palignr m2, m1, m0, 6
3000 movu [r0 + r3], m2 ; row 3
3001 lea r0, [r0 + r1 * 4]
3002 palignr m2, m1, m0, 8
3003 movu [r0], m2 ; row 4
3004 palignr m2, m1, m0, 10
3005 movu [r0 + r1], m2 ; row 5
3006 palignr m2, m1, m0, 12
3007 movu [r0 + r1 * 2], m2 ; row 6
3008 palignr m1, m0, 14
3009 movu [r0 + r3], m1 ; row 7
3010 RET
3011
; 8x8 angular prediction with ang_table weights 26, 20, 14, 8, 2, 28, 22, 16.
; r0 = dst, r1 = dstStride, r2 = src (see the intraPredAng8 prototype comment
; earlier in this file); dirMode is not read, r3 is reused as table pointer.
; Sixteen reference words are read starting at src + 32 + 2.  Eight vectors of
; eight samples are interpolated, each as a * (32 - f) + b * f plus the pd_16
; constant (defined elsewhere, presumably dwords of 16 for rounding), shifted
; right by 5.  They are produced in two groups of four; each group is
; transposed so that every vector becomes one column of dst: the first group
; fills columns 0..3 of rows 0..7, the second group columns 4..7.
3012 INIT_XMM sse4
3013 cglobal intra_pred_ang8_3, 3,5,8
3014 add r2, 32
3015 lea r3, [ang_table + 14 * 16]
3016 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
3017
3018 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3019 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
3020 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
3021 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
3022
3023 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
3024 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
3025 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
3026 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
3027
; first group: vectors for columns 0..3 end up in m4, m2, m6, m7
3028 mova m4, m3
3029 pmaddwd m4, [r3 + 12 * 16] ; [26]
3030 paddd m4, [pd_16]
3031 psrld m4, 5
3032 mova m2, m0
3033 pmaddwd m2, [r3 + 12 * 16]
3034 paddd m2, [pd_16]
3035 psrld m2, 5
3036 packusdw m4, m2
3037
3038 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
3039 pmaddwd m2, [r3 + 6 * 16] ; [20]
3040 paddd m2, [pd_16]
3041 psrld m2, 5
3042 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3043 pmaddwd m6, [r3 + 6 * 16]
3044 paddd m6, [pd_16]
3045 psrld m6, 5
3046 packusdw m2, m6
3047
3048 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
3049 pmaddwd m6, [r3] ; [14]
3050 paddd m6, [pd_16]
3051 psrld m6, 5
3052 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
3053 pmaddwd m7, [r3]
3054 paddd m7, [pd_16]
3055 psrld m7, 5
3056 packusdw m6, m7
3057
3058 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
3059 pmaddwd m7, [r3 - 6 * 16] ; [ 8]
3060 paddd m7, [pd_16]
3061 psrld m7, 5
3062 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
3063 pmaddwd m3, [r3 - 6 * 16]
3064 paddd m3, [pd_16]
3065 psrld m3, 5
3066 packusdw m7, m3
3067
; transpose the four 8-sample vectors into eight 4-sample row pieces
3068 punpckhwd m3, m4, m2
3069 punpcklwd m4, m2
3070 punpckhwd m2, m6, m7
3071 punpcklwd m6, m7
3072
3073 punpckldq m7, m4, m6
3074 punpckhdq m4, m6
3075 punpckldq m6, m3, m2
3076 punpckhdq m3, m2
3077
; store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 = address of row 4)
3078 lea r4, [r1 * 3]
3079 movh [r0], m7
3080 movhps [r0 + r1], m7
3081 movh [r0 + r1 * 2], m4
3082 movhps [r0 + r4], m4
3083 lea r2, [r0 + r1 * 4]
3084 movh [r2], m6
3085 movhps [r2 + r1], m6
3086 movh [r2 + r1 * 2], m3
3087 movhps [r2 + r4], m3
3088
; second group: vectors for columns 4..7 end up in m4, m2, m6, m7
3089 mova m4, m0
3090 pmaddwd m4, [r3 - 12 * 16] ; [ 2]
3091 paddd m4, [pd_16]
3092 psrld m4, 5
3093 mova m2, m5
3094 pmaddwd m2, [r3 - 12 * 16]
3095 paddd m2, [pd_16]
3096 psrld m2, 5
3097 packusdw m4, m2
3098
3099 mova m2, m0
3100 pmaddwd m2, [r3 + 14 * 16] ; [28]
3101 paddd m2, [pd_16]
3102 psrld m2, 5
3103 mova m6, m5
3104 pmaddwd m6, [r3 + 14 * 16]
3105 paddd m6, [pd_16]
3106 psrld m6, 5
3107 packusdw m2, m6
3108
3109 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3110 pmaddwd m6, [r3 + 8 * 16] ; [22]
3111 paddd m6, [pd_16]
3112 psrld m6, 5
3113 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
3114 pmaddwd m7, [r3 + 8 * 16]
3115 paddd m7, [pd_16]
3116 psrld m7, 5
3117 packusdw m6, m7
3118
3119 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
3120 pmaddwd m7, [r3 + 2 * 16] ; [16]
3121 paddd m7, [pd_16]
3122 psrld m7, 5
3123 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
3124 pmaddwd m1, [r3 + 2 * 16]
3125 paddd m1, [pd_16]
3126 psrld m1, 5
3127 packusdw m7, m1
3128
; transpose as above
3129 punpckhwd m3, m4, m2
3130 punpcklwd m4, m2
3131 punpckhwd m2, m6, m7
3132 punpcklwd m6, m7
3133
3134 punpckldq m7, m4, m6
3135 punpckhdq m4, m6
3136 punpckldq m6, m3, m2
3137 punpckhdq m3, m2
3138
; store columns 4..7 of rows 0..7 (+8 bytes = 4 words into each row)
3139 movh [r0 + 8], m7
3140 movhps [r0 + r1 + 8], m7
3141 movh [r0 + r1 * 2 + 8], m4
3142 movhps [r0 + r4 + 8], m4
3143 lea r0, [r0 + r1 * 4]
3144 movh [r0 + 8], m6
3145 movhps [r0 + r1 + 8], m6
3146 movh [r0 + r1 * 2 + 8], m3
3147 movhps [r0 + r4 + 8], m3
3148 RET
3149
; 8x8 angular prediction with ang_table weights 21, 10, 31, 20, 9, 30, 19, 8.
; r0 = dst, r1 = dstStride, r2 = src (see the intraPredAng8 prototype comment
; earlier in this file); dirMode is not read, r3 is reused as table pointer.
; Reference words are read starting at src + 32 + 2.  Eight vectors of eight
; samples are interpolated, each as a * (32 - f) + b * f plus the pd_16
; constant (defined elsewhere, presumably dwords of 16 for rounding), shifted
; right by 5.  They are produced in two groups of four; each group is
; transposed so that every vector becomes one column of dst: the first group
; fills columns 0..3 of rows 0..7, the second group columns 4..7.
; Consecutive vectors that share reference pairs reuse a saved copy.
3150 cglobal intra_pred_ang8_4, 3,6,8
3151 add r2, 32
3152 lea r3, [ang_table + 19 * 16]
3153 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
3154
3155 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3156 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
3157 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
3158 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
3159
3160 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
3161 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
3162 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
3163
; first group: vectors for columns 0..3 end up in m4, m2, m6, m7
3164 mova m4, m3
3165 pmaddwd m4, [r3 + 2 * 16] ; [21]
3166 paddd m4, [pd_16]
3167 psrld m4, 5
3168 mova m2, m0
3169 pmaddwd m2, [r3 + 2 * 16]
3170 paddd m2, [pd_16]
3171 psrld m2, 5
3172 packusdw m4, m2
3173
3174 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
3175 mova m6, m2
3176 pmaddwd m2, [r3 - 9 * 16] ; [10]
3177 paddd m2, [pd_16]
3178 psrld m2, 5
3179 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3180 mova m7, m1
3181 pmaddwd m1, [r3 - 9 * 16]
3182 paddd m1, [pd_16]
3183 psrld m1, 5
3184 packusdw m2, m1
3185
3186 pmaddwd m6, [r3 + 12 * 16] ; [31]
3187 paddd m6, [pd_16]
3188 psrld m6, 5
3189 pmaddwd m7, [r3 + 12 * 16]
3190 paddd m7, [pd_16]
3191 psrld m7, 5
3192 packusdw m6, m7
3193
3194 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
3195 pmaddwd m7, [r3 + 1 * 16] ; [20]
3196 paddd m7, [pd_16]
3197 psrld m7, 5
3198 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
3199 pmaddwd m1, [r3 + 1 * 16]
3200 paddd m1, [pd_16]
3201 psrld m1, 5
3202 packusdw m7, m1
3203
; transpose the four 8-sample vectors into eight 4-sample row pieces
3204 punpckhwd m1, m4, m2
3205 punpcklwd m4, m2
3206 punpckhwd m2, m6, m7
3207 punpcklwd m6, m7
3208
3209 punpckldq m7, m4, m6
3210 punpckhdq m4, m6
3211 punpckldq m6, m1, m2
3212 punpckhdq m1, m2
3213
; store columns 0..3 of rows 0..7 (r4 = 3 * stride, r5 = address of row 4;
; r2 must stay intact because it is read again below)
3214 lea r4, [r1 * 3]
3215 movh [r0], m7
3216 movhps [r0 + r1], m7
3217 movh [r0 + r1 * 2], m4
3218 movhps [r0 + r4], m4
3219 lea r5, [r0 + r1 * 4]
3220 movh [r5], m6
3221 movhps [r5 + r1], m6
3222 movh [r5 + r1 * 2], m1
3223 movhps [r5 + r4], m1
3224
; second group: vectors for columns 4..7 end up in m4, m2, m6, m7
3225 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
3226 mova m2, m4
3227 pmaddwd m4, [r3 - 10 * 16] ; [ 9]
3228 paddd m4, [pd_16]
3229 psrld m4, 5
3230 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
3231 mova m6, m3
3232 pmaddwd m3, [r3 - 10 * 16]
3233 paddd m3, [pd_16]
3234 psrld m3, 5
3235 packusdw m4, m3
3236
3237 pmaddwd m2, [r3 + 11 * 16] ; [30]
3238 paddd m2, [pd_16]
3239 psrld m2, 5
3240 pmaddwd m6, [r3 + 11 * 16]
3241 paddd m6, [pd_16]
3242 psrld m6, 5
3243 packusdw m2, m6
3244
3245 mova m6, m0
3246 pmaddwd m6, [r3] ; [19]
3247 paddd m6, [pd_16]
3248 psrld m6, 5
3249 mova m7, m5
3250 pmaddwd m7, [r3]
3251 paddd m7, [pd_16]
3252 psrld m7, 5
3253 packusdw m6, m7
3254
3255 movh m1, [r2 + 26] ; [16 15 14 13]
3256 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3257 pmaddwd m7, [r3 - 11 * 16] ; [8]
3258 paddd m7, [pd_16]
3259 psrld m7, 5
3260 palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10]
3261 pmaddwd m1, [r3 - 11 * 16]
3262 paddd m1, [pd_16]
3263 psrld m1, 5
3264 packusdw m7, m1
3265
; transpose as above
3266 punpckhwd m3, m4, m2
3267 punpcklwd m4, m2
3268 punpckhwd m2, m6, m7
3269 punpcklwd m6, m7
3270
3271 punpckldq m7, m4, m6
3272 punpckhdq m4, m6
3273 punpckldq m6, m3, m2
3274 punpckhdq m3, m2
3275
; store columns 4..7 of rows 0..7 (+8 bytes = 4 words into each row)
3276 movh [r0 + 8], m7
3277 movhps [r0 + r1 + 8], m7
3278 movh [r0 + r1 * 2 + 8], m4
3279 movhps [r0 + r4 + 8], m4
3280 lea r0, [r0 + r1 * 4]
3281 movh [r0 + 8], m6
3282 movhps [r0 + r1 + 8], m6
3283 movh [r0 + r1 * 2 + 8], m3
3284 movhps [r0 + r4 + 8], m3
3285 RET
3286
; 8x8 angular prediction with ang_table weights 17, 2, 19, 4, 21, 6, 23, 8.
; r0 = dst, r1 = dstStride, r2 = src (see the intraPredAng8 prototype comment
; earlier in this file); dirMode is not read, r3 is reused as table pointer.
; Sixteen reference words are read starting at src + 32 + 2.  Eight vectors of
; eight samples are interpolated, each as a * (32 - f) + b * f plus the pd_16
; constant (defined elsewhere, presumably dwords of 16 for rounding), shifted
; right by 5.  They are produced in two groups of four; each group is
; transposed so that every vector becomes one column of dst: the first group
; fills columns 0..3 of rows 0..7, the second group columns 4..7.
; Consecutive vectors that share reference pairs reuse a saved copy.
3287 cglobal intra_pred_ang8_5, 3,5,8
3288 add r2, 32
3289 lea r3, [ang_table + 13 * 16]
3290 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
3291
3292 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3293 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
3294 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
3295 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
3296
3297 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
3298 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
3299 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
3300
; first group: vectors for columns 0..3 end up in m4, m2, m6, m7
3301 mova m4, m3
3302 pmaddwd m4, [r3 + 4 * 16] ; [17]
3303 paddd m4, [pd_16]
3304 psrld m4, 5
3305 mova m2, m0
3306 pmaddwd m2, [r3 + 4 * 16]
3307 paddd m2, [pd_16]
3308 psrld m2, 5
3309 packusdw m4, m2
3310
3311 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
3312 mova m6, m2
3313 pmaddwd m2, [r3 - 11 * 16] ; [2]
3314 paddd m2, [pd_16]
3315 psrld m2, 5
3316 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3317 mova m7, m1
3318 pmaddwd m1, [r3 - 11 * 16]
3319 paddd m1, [pd_16]
3320 psrld m1, 5
3321 packusdw m2, m1
3322
3323 pmaddwd m6, [r3 + 6 * 16] ; [19]
3324 paddd m6, [pd_16]
3325 psrld m6, 5
3326 pmaddwd m7, [r3 + 6 * 16]
3327 paddd m7, [pd_16]
3328 psrld m7, 5
3329 packusdw m6, m7
3330
3331 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
3332 pmaddwd m7, [r3 - 9 * 16] ; [4]
3333 paddd m7, [pd_16]
3334 psrld m7, 5
3335 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
3336 pmaddwd m1, [r3 - 9 * 16]
3337 paddd m1, [pd_16]
3338 psrld m1, 5
3339 packusdw m7, m1
3340
; transpose the four 8-sample vectors into eight 4-sample row pieces
3341 punpckhwd m1, m4, m2
3342 punpcklwd m4, m2
3343 punpckhwd m2, m6, m7
3344 punpcklwd m6, m7
3345
3346 punpckldq m7, m4, m6
3347 punpckhdq m4, m6
3348 punpckldq m6, m1, m2
3349 punpckhdq m1, m2
3350
; store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 = address of row 4)
3351 lea r4, [r1 * 3]
3352 movh [r0], m7
3353 movhps [r0 + r1], m7
3354 movh [r0 + r1 * 2], m4
3355 movhps [r0 + r4], m4
3356 lea r2, [r0 + r1 * 4]
3357 movh [r2], m6
3358 movhps [r2 + r1], m6
3359 movh [r2 + r1 * 2], m1
3360 movhps [r2 + r4], m1
3361
; second group: vectors for columns 4..7 end up in m4, m2, m6, m7
3362 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
3363 pmaddwd m4, [r3 + 8 * 16] ; [21]
3364 paddd m4, [pd_16]
3365 psrld m4, 5
3366 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
3367 pmaddwd m2, [r3 + 8 * 16]
3368 paddd m2, [pd_16]
3369 psrld m2, 5
3370 packusdw m4, m2
3371
3372 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
3373 mova m6, m2
3374 pmaddwd m2, [r3 - 7 * 16] ; [6]
3375 paddd m2, [pd_16]
3376 psrld m2, 5
3377 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
3378 mova m7, m1
3379 pmaddwd m1, [r3 - 7 * 16]
3380 paddd m1, [pd_16]
3381 psrld m1, 5
3382 packusdw m2, m1
3383
3384 pmaddwd m6, [r3 + 10 * 16] ; [23]
3385 paddd m6, [pd_16]
3386 psrld m6, 5
3387 pmaddwd m7, [r3 + 10 * 16]
3388 paddd m7, [pd_16]
3389 psrld m7, 5
3390 packusdw m6, m7
3391
3392 mova m7, m0
3393 pmaddwd m7, [r3 - 5 * 16] ; [8]
3394 paddd m7, [pd_16]
3395 psrld m7, 5
3396 mova m1, m5
3397 pmaddwd m1, [r3 - 5 * 16]
3398 paddd m1, [pd_16]
3399 psrld m1, 5
3400 packusdw m7, m1
3401
; transpose as above
3402 punpckhwd m3, m4, m2
3403 punpcklwd m4, m2
3404 punpckhwd m2, m6, m7
3405 punpcklwd m6, m7
3406
3407 punpckldq m7, m4, m6
3408 punpckhdq m4, m6
3409 punpckldq m6, m3, m2
3410 punpckhdq m3, m2
3411
; store columns 4..7 of rows 0..7 (+8 bytes = 4 words into each row)
3412 movh [r0 + 8], m7
3413 movhps [r0 + r1 + 8], m7
3414 movh [r0 + r1 * 2 + 8], m4
3415 movhps [r0 + r4 + 8], m4
3416 lea r0, [r0 + r1 * 4]
3417 movh [r0 + 8], m6
3418 movhps [r0 + r1 + 8], m6
3419 movh [r0 + r1 * 2 + 8], m3
3420 movhps [r0 + r4 + 8], m3
3421 RET
3422
; 8x8 angular prediction with ang_table weights 13, 26, 7, 20, 1, 14, 27, 8.
; r0 = dst, r1 = dstStride, r2 = src (see the intraPredAng8 prototype comment
; earlier in this file); dirMode is not read, r3 is reused as table pointer.
; Sixteen reference words are read starting at src + 32 + 2.  Eight vectors of
; eight samples are interpolated, each as a * (32 - f) + b * f plus the pd_16
; constant (defined elsewhere, presumably dwords of 16 for rounding), shifted
; right by 5.  They are produced in two groups of four; each group is
; transposed so that every vector becomes one column of dst: the first group
; fills columns 0..3 of rows 0..7, the second group columns 4..7.
3423 cglobal intra_pred_ang8_6, 3,5,8
3424 add r2, 32
3425 lea r3, [ang_table + 14 * 16]
3426 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
3427
3428 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3429 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
3430 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
3431 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
3432
3433 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
3434 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
3435 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
3436
; first group: vectors for columns 0..3 end up in m4, m2, m6, m7
3437 mova m4, m3
3438 pmaddwd m4, [r3 - 1 * 16] ; [13]
3439 paddd m4, [pd_16]
3440 psrld m4, 5
3441 mova m2, m0
3442 pmaddwd m2, [r3 - 1 * 16]
3443 paddd m2, [pd_16]
3444 psrld m2, 5
3445 packusdw m4, m2
3446
3447 mova m2, m3
3448 pmaddwd m2, [r3 + 12 * 16] ; [26]
3449 paddd m2, [pd_16]
3450 psrld m2, 5
3451 mova m1, m0
3452 pmaddwd m1, [r3 + 12 * 16]
3453 paddd m1, [pd_16]
3454 psrld m1, 5
3455 packusdw m2, m1
3456
3457 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
3458 mova m7, m6
3459 pmaddwd m6, [r3 - 7 * 16] ; [7]
3460 paddd m6, [pd_16]
3461 psrld m6, 5
3462 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3463 pmaddwd m1, [r3 - 7 * 16]
3464 paddd m1, [pd_16]
3465 psrld m1, 5
3466 packusdw m6, m1
3467
3468 pmaddwd m7, [r3 + 6 * 16] ; [20]
3469 paddd m7, [pd_16]
3470 psrld m7, 5
; the upper-half pairs are rebuilt here; no spare register held a copy
3471 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3472 pmaddwd m1, [r3 + 6 * 16]
3473 paddd m1, [pd_16]
3474 psrld m1, 5
3475 packusdw m7, m1
3476
; transpose the four 8-sample vectors into eight 4-sample row pieces
3477 punpckhwd m1, m4, m2
3478 punpcklwd m4, m2
3479 punpckhwd m2, m6, m7
3480 punpcklwd m6, m7
3481
3482 punpckldq m7, m4, m6
3483 punpckhdq m4, m6
3484 punpckldq m6, m1, m2
3485 punpckhdq m1, m2
3486
; store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 = address of row 4)
3487 lea r4, [r1 * 3]
3488 movh [r0], m7
3489 movhps [r0 + r1], m7
3490 movh [r0 + r1 * 2], m4
3491 movhps [r0 + r4], m4
3492 lea r2, [r0 + r1 * 4]
3493 movh [r2], m6
3494 movhps [r2 + r1], m6
3495 movh [r2 + r1 * 2], m1
3496 movhps [r2 + r4], m1
3497
; second group: vectors for columns 4..7 end up in m4, m2, m6, m7
3498 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
3499 mova m6, m4
3500 pmaddwd m4, [r3 - 13 * 16] ; [1]
3501 paddd m4, [pd_16]
3502 psrld m4, 5
3503 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
3504 mova m7, m2
3505 pmaddwd m2, [r3 - 13 * 16]
3506 paddd m2, [pd_16]
3507 psrld m2, 5
3508 packusdw m4, m2
3509
3510 pmaddwd m2, m6, [r3] ; [14]
3511 paddd m2, [pd_16]
3512 psrld m2, 5
3513 pmaddwd m1, m7, [r3]
3514 paddd m1, [pd_16]
3515 psrld m1, 5
3516 packusdw m2, m1
3517
3518 pmaddwd m6, [r3 + 13 * 16] ; [27]
3519 paddd m6, [pd_16]
3520 psrld m6, 5
3521 pmaddwd m7, [r3 + 13 * 16]
3522 paddd m7, [pd_16]
3523 psrld m7, 5
3524 packusdw m6, m7
3525
3526 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
3527 pmaddwd m7, [r3 - 6 * 16] ; [8]
3528 paddd m7, [pd_16]
3529 psrld m7, 5
3530 palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8]
3531 pmaddwd m5, [r3 - 6 * 16]
3532 paddd m5, [pd_16]
3533 psrld m5, 5
3534 packusdw m7, m5
3535
; transpose as above
3536 punpckhwd m3, m4, m2
3537 punpcklwd m4, m2
3538 punpckhwd m2, m6, m7
3539 punpcklwd m6, m7
3540
3541 punpckldq m7, m4, m6
3542 punpckhdq m4, m6
3543 punpckldq m6, m3, m2
3544 punpckhdq m3, m2
3545
; store columns 4..7 of rows 0..7 (+8 bytes = 4 words into each row)
3546 movh [r0 + 8], m7
3547 movhps [r0 + r1 + 8], m7
3548 movh [r0 + r1 * 2 + 8], m4
3549 movhps [r0 + r4 + 8], m4
3550 lea r0, [r0 + r1 * 4]
3551 movh [r0 + 8], m6
3552 movhps [r0 + r1 + 8], m6
3553 movh [r0 + r1 * 2 + 8], m3
3554 movhps [r0 + r4 + 8], m3
3555 RET
3556
; 8x8 angular prediction with ang_table weights 9, 18, 27, 4, 13, 22, 31, 8.
; r0 = dst, r1 = dstStride, r2 = src (see the intraPredAng8 prototype comment
; earlier in this file); dirMode is not read, r3 is reused as table pointer.
; Sixteen reference words are read starting at src + 32 + 2.  Eight vectors of
; eight samples are interpolated, each as a * (32 - f) + b * f plus the pd_16
; constant (defined elsewhere, presumably dwords of 16 for rounding), shifted
; right by 5.  They are produced in two groups of four; each group is
; transposed so that every vector becomes one column of dst: the first group
; fills columns 0..3 of rows 0..7, the second group columns 4..7.
3557 cglobal intra_pred_ang8_7, 3,5,8
3558 add r2, 32
3559 lea r3, [ang_table + 18 * 16]
3560 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
3561
3562 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3563 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
3564 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
3565 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
3566
3567 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
3568 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
3569 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
3570
; first group: vectors for columns 0..3 end up in m4, m2, m6, m7
3571 mova m4, m3
3572 pmaddwd m4, [r3 - 9 * 16] ; [9]
3573 paddd m4, [pd_16]
3574 psrld m4, 5
3575 mova m2, m0
3576 pmaddwd m2, [r3 - 9 * 16]
3577 paddd m2, [pd_16]
3578 psrld m2, 5
3579 packusdw m4, m2
3580
3581 mova m2, m3
3582 pmaddwd m2, [r3] ; [18]
3583 paddd m2, [pd_16]
3584 psrld m2, 5
3585 mova m1, m0
3586 pmaddwd m1, [r3]
3587 paddd m1, [pd_16]
3588 psrld m1, 5
3589 packusdw m2, m1
3590
3591 mova m6, m3
3592 pmaddwd m6, [r3 + 9 * 16] ; [27]
3593 paddd m6, [pd_16]
3594 psrld m6, 5
3595 mova m1, m0
3596 pmaddwd m1, [r3 + 9 * 16]
3597 paddd m1, [pd_16]
3598 psrld m1, 5
3599 packusdw m6, m1
3600
3601 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
3602 pmaddwd m7, [r3 - 14 * 16] ; [4]
3603 paddd m7, [pd_16]
3604 psrld m7, 5
3605 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3606 pmaddwd m1, [r3 - 14 * 16]
3607 paddd m1, [pd_16]
3608 psrld m1, 5
3609 packusdw m7, m1
3610
; transpose the four 8-sample vectors into eight 4-sample row pieces
3611 punpckhwd m1, m4, m2
3612 punpcklwd m4, m2
3613 punpckhwd m2, m6, m7
3614 punpcklwd m6, m7
3615
3616 punpckldq m7, m4, m6
3617 punpckhdq m4, m6
3618 punpckldq m6, m1, m2
3619 punpckhdq m1, m2
3620
; store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 = address of row 4)
3621 lea r4, [r1 * 3]
3622 movh [r0], m7
3623 movhps [r0 + r1], m7
3624 movh [r0 + r1 * 2], m4
3625 movhps [r0 + r4], m4
3626 lea r2, [r0 + r1 * 4]
3627 movh [r2], m6
3628 movhps [r2 + r1], m6
3629 movh [r2 + r1 * 2], m1
3630 movhps [r2 + r4], m1
3631
; second group: vectors for columns 4..7 end up in m4, m2, m6, m7
3632 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
3633 mova m6, m4
3634 pmaddwd m4, [r3 - 5 * 16] ; [13]
3635 paddd m4, [pd_16]
3636 psrld m4, 5
3637 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
3638 mova m7, m2
3639 pmaddwd m2, [r3 - 5 * 16]
3640 paddd m2, [pd_16]
3641 psrld m2, 5
3642 packusdw m4, m2
3643
3644 pmaddwd m2, m6, [r3 + 4 * 16] ; [22]
3645 paddd m2, [pd_16]
3646 psrld m2, 5
3647 pmaddwd m1, m7, [r3 + 4 * 16]
3648 paddd m1, [pd_16]
3649 psrld m1, 5
3650 packusdw m2, m1
3651
3652 pmaddwd m6, [r3 + 13 * 16] ; [31]
3653 paddd m6, [pd_16]
3654 psrld m6, 5
3655 pmaddwd m7, [r3 + 13 * 16]
3656 paddd m7, [pd_16]
3657 psrld m7, 5
3658 packusdw m6, m7
3659
3660 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
3661 pmaddwd m7, [r3 - 10 * 16] ; [8]
3662 paddd m7, [pd_16]
3663 psrld m7, 5
3664 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
3665 pmaddwd m5, [r3 - 10 * 16]
3666 paddd m5, [pd_16]
3667 psrld m5, 5
3668 packusdw m7, m5
3669
; transpose as above
3670 punpckhwd m3, m4, m2
3671 punpcklwd m4, m2
3672 punpckhwd m2, m6, m7
3673 punpcklwd m6, m7
3674
3675 punpckldq m7, m4, m6
3676 punpckhdq m4, m6
3677 punpckldq m6, m3, m2
3678 punpckhdq m3, m2
3679
; store columns 4..7 of rows 0..7 (+8 bytes = 4 words into each row)
3680 movh [r0 + 8], m7
3681 movhps [r0 + r1 + 8], m7
3682 movh [r0 + r1 * 2 + 8], m4
3683 movhps [r0 + r4 + 8], m4
3684 lea r0, [r0 + r1 * 4]
3685 movh [r0 + 8], m6
3686 movhps [r0 + r1 + 8], m6
3687 movh [r0 + r1 * 2 + 8], m3
3688 movhps [r0 + r4 + 8], m3
3689 RET
3690
; 8x8 angular prediction with ang_table weights 5, 10, 15, 20, 25, 30, 3, 8.
; r0 = dst, r1 = dstStride, r2 = src (see the intraPredAng8 prototype comment
; earlier in this file); dirMode is not read, r3 is reused as table pointer.
; Reference words are read starting at src + 32 + 2.  Eight vectors of eight
; samples are interpolated, each as a * (32 - f) + b * f plus the pd_16
; constant (defined elsewhere, presumably dwords of 16 for rounding), shifted
; right by 5.  They are produced in two groups of four; each group is
; transposed so that every vector becomes one column of dst: the first group
; fills columns 0..3 of rows 0..7, the second group columns 4..7.
; The first six vectors share the same reference pairs (m3 / m0); only the
; last two use the pairs advanced by one sample.
3691 cglobal intra_pred_ang8_8, 3,6,7
3692 add r2, 32
3693 lea r3, [ang_table + 17 * 16]
3694 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
3695
3696 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3697 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
3698
3699 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
3700 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
3701
; first group: vectors for columns 0..3 end up in m4, m2, m6, m5
3702 mova m4, m3
3703 pmaddwd m4, [r3 - 12 * 16] ; [5]
3704 paddd m4, [pd_16]
3705 psrld m4, 5
3706 mova m2, m0
3707 pmaddwd m2, [r3 - 12 * 16]
3708 paddd m2, [pd_16]
3709 psrld m2, 5
3710 packusdw m4, m2
3711
3712 mova m2, m3
3713 pmaddwd m2, [r3 - 7 * 16] ; [10]
3714 paddd m2, [pd_16]
3715 psrld m2, 5
3716 mova m1, m0
3717 pmaddwd m1, [r3 - 7 * 16]
3718 paddd m1, [pd_16]
3719 psrld m1, 5
3720 packusdw m2, m1
3721
3722 mova m6, m3
3723 pmaddwd m6, [r3 - 2 * 16] ; [15]
3724 paddd m6, [pd_16]
3725 psrld m6, 5
3726 mova m1, m0
3727 pmaddwd m1, [r3 - 2 * 16]
3728 paddd m1, [pd_16]
3729 psrld m1, 5
3730 packusdw m6, m1
3731
3732 mova m5, m3
3733 pmaddwd m5, [r3 + 3 * 16] ; [20]
3734 paddd m5, [pd_16]
3735 psrld m5, 5
3736 mova m1, m0
3737 pmaddwd m1, [r3 + 3 * 16]
3738 paddd m1, [pd_16]
3739 psrld m1, 5
3740 packusdw m5, m1
3741
; transpose the four 8-sample vectors into eight 4-sample row pieces
3742 punpckhwd m1, m4, m2
3743 punpcklwd m4, m2
3744 punpckhwd m2, m6, m5
3745 punpcklwd m6, m5
3746
3747 punpckldq m5, m4, m6
3748 punpckhdq m4, m6
3749 punpckldq m6, m1, m2
3750 punpckhdq m1, m2
3751
; store columns 0..3 of rows 0..7 (r4 = 3 * stride, r5 = address of row 4;
; r2 must stay intact because it is read again below)
3752 lea r4, [r1 * 3]
3753 movh [r0], m5
3754 movhps [r0 + r1], m5
3755 movh [r0 + r1 * 2], m4
3756 movhps [r0 + r4], m4
3757 lea r5, [r0 + r1 * 4]
3758 movh [r5], m6
3759 movhps [r5 + r1], m6
3760 movh [r5 + r1 * 2], m1
3761 movhps [r5 + r4], m1
3762
; second group: vectors for columns 4..7 end up in m4, m2, m6, m5
3763 mova m4, m3
3764 pmaddwd m4, [r3 + 8 * 16] ; [25]
3765 paddd m4, [pd_16]
3766 psrld m4, 5
3767 mova m2, m0
3768 pmaddwd m2, [r3 + 8 * 16]
3769 paddd m2, [pd_16]
3770 psrld m2, 5
3771 packusdw m4, m2
3772
3773 mova m2, m3
3774 pmaddwd m2, [r3 + 13 * 16] ; [30]
3775 paddd m2, [pd_16]
3776 psrld m2, 5
3777 mova m1, m0
3778 pmaddwd m1, [r3 + 13 * 16]
3779 paddd m1, [pd_16]
3780 psrld m1, 5
3781 packusdw m2, m1
3782
3783 movh m1, [r2 + 18] ; [12 11 10 9]
3784
3785 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
3786 mova m5, m6
3787 pmaddwd m6, [r3 - 14 * 16] ; [3]
3788 paddd m6, [pd_16]
3789 psrld m6, 5
3790 palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6]
3791 mova m3, m1
3792 pmaddwd m1, [r3 - 14 * 16]
3793 paddd m1, [pd_16]
3794 psrld m1, 5
3795 packusdw m6, m1
3796
3797 pmaddwd m5, [r3 - 9 * 16] ; [8]
3798 paddd m5, [pd_16]
3799 psrld m5, 5
3800 pmaddwd m3, [r3 - 9 * 16]
3801 paddd m3, [pd_16]
3802 psrld m3, 5
3803 packusdw m5, m3
3804
; transpose as above
3805 punpckhwd m3, m4, m2
3806 punpcklwd m4, m2
3807 punpckhwd m2, m6, m5
3808 punpcklwd m6, m5
3809
3810 punpckldq m5, m4, m6
3811 punpckhdq m4, m6
3812 punpckldq m6, m3, m2
3813 punpckhdq m3, m2
3814
; store columns 4..7 of rows 0..7 (+8 bytes = 4 words into each row)
3815 movh [r0 + 8], m5
3816 movhps [r0 + r1 + 8], m5
3817 movh [r0 + r1 * 2 + 8], m4
3818 movhps [r0 + r4 + 8], m4
3819 lea r0, [r0 + r1 * 4]
3820 movh [r0 + 8], m6
3821 movhps [r0 + r1 + 8], m6
3822 movh [r0 + r1 * 2 + 8], m3
3823 movhps [r0 + r4 + 8], m3
3824 RET
3825
; 8x8 angular prediction with ang_table weights 2, 4, 6, 8, 10, 12, 14, 16.
; r0 = dst, r1 = dstStride, r2 = src (see the intraPredAng8 prototype comment
; earlier in this file); dirMode is not read, r3 is reused as table pointer.
; Nine reference words are read starting at src + 32 + 2.  Eight vectors of
; eight samples are interpolated, each as a * (32 - f) + b * f plus the pd_16
; constant (defined elsewhere, presumably dwords of 16 for rounding), shifted
; right by 5.  They are produced in two groups of four; each group is
; transposed so that every vector becomes one column of dst: the first group
; fills columns 0..3 of rows 0..7, the second group columns 4..7.
; All eight vectors use the same reference pairs (m3 / m0); only the weights
; differ.  The last vector consumes m3 and m0 in place.
3826 cglobal intra_pred_ang8_9, 3,5,7
3827 add r2, 32
3828 lea r3, [ang_table + 9 * 16]
3829 add r1, r1 ; stride doubled: rows are addressed in bytes, samples are words
3830
3831 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3832 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
3833
3834 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
3835 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
3836
; first group: vectors for columns 0..3 end up in m4, m2, m6, m5
3837 mova m4, m3
3838 pmaddwd m4, [r3 - 7 * 16] ; [2]
3839 paddd m4, [pd_16]
3840 psrld m4, 5
3841 mova m2, m0
3842 pmaddwd m2, [r3 - 7 * 16]
3843 paddd m2, [pd_16]
3844 psrld m2, 5
3845 packusdw m4, m2
3846
3847 mova m2, m3
3848 pmaddwd m2, [r3 - 5 * 16] ; [4]
3849 paddd m2, [pd_16]
3850 psrld m2, 5
3851 mova m1, m0
3852 pmaddwd m1, [r3 - 5 * 16]
3853 paddd m1, [pd_16]
3854 psrld m1, 5
3855 packusdw m2, m1
3856
3857 mova m6, m3
3858 pmaddwd m6, [r3 - 3 * 16] ; [6]
3859 paddd m6, [pd_16]
3860 psrld m6, 5
3861 mova m1, m0
3862 pmaddwd m1, [r3 - 3 * 16]
3863 paddd m1, [pd_16]
3864 psrld m1, 5
3865 packusdw m6, m1
3866
3867 mova m5, m3
3868 pmaddwd m5, [r3 - 1 * 16] ; [8]
3869 paddd m5, [pd_16]
3870 psrld m5, 5
3871 mova m1, m0
3872 pmaddwd m1, [r3 - 1 * 16]
3873 paddd m1, [pd_16]
3874 psrld m1, 5
3875 packusdw m5, m1
3876
; transpose the four 8-sample vectors into eight 4-sample row pieces
3877 punpckhwd m1, m4, m2
3878 punpcklwd m4, m2
3879 punpckhwd m2, m6, m5
3880 punpcklwd m6, m5
3881
3882 punpckldq m5, m4, m6
3883 punpckhdq m4, m6
3884 punpckldq m6, m1, m2
3885 punpckhdq m1, m2
3886
; store columns 0..3 of rows 0..7 (r4 = 3 * stride, r2 = address of row 4)
3887 lea r4, [r1 * 3]
3888 movh [r0], m5
3889 movhps [r0 + r1], m5
3890 movh [r0 + r1 * 2], m4
3891 movhps [r0 + r4], m4
3892 lea r2, [r0 + r1 * 4]
3893 movh [r2], m6
3894 movhps [r2 + r1], m6
3895 movh [r2 + r1 * 2], m1
3896 movhps [r2 + r4], m1
3897
; second group: vectors for columns 4..7 end up in m4, m2, m6, m3
3898 mova m4, m3
3899 pmaddwd m4, [r3 + 1 * 16] ; [10]
3900 paddd m4, [pd_16]
3901 psrld m4, 5
3902 mova m2, m0
3903 pmaddwd m2, [r3 + 1 * 16]
3904 paddd m2, [pd_16]
3905 psrld m2, 5
3906 packusdw m4, m2
3907
3908 mova m2, m3
3909 pmaddwd m2, [r3 + 3 * 16] ; [12]
3910 paddd m2, [pd_16]
3911 psrld m2, 5
3912 mova m1, m0
3913 pmaddwd m1, [r3 + 3 * 16]
3914 paddd m1, [pd_16]
3915 psrld m1, 5
3916 packusdw m2, m1
3917
3918 mova m6, m3
3919 pmaddwd m6, [r3 + 5 * 16] ; [14]
3920 paddd m6, [pd_16]
3921 psrld m6, 5
3922 mova m5, m0
3923 pmaddwd m5, [r3 + 5 * 16]
3924 paddd m5, [pd_16]
3925 psrld m5, 5
3926 packusdw m6, m5
3927
3928 pmaddwd m3, [r3 + 7 * 16] ; [16]
3929 paddd m3, [pd_16]
3930 psrld m3, 5
3931 pmaddwd m0, [r3 + 7 * 16]
3932 paddd m0, [pd_16]
3933 psrld m0, 5
3934 packusdw m3, m0
3935
; transpose as above
3936 punpckhwd m5, m4, m2
3937 punpcklwd m4, m2
3938 punpckhwd m2, m6, m3
3939 punpcklwd m6, m3
3940
3941 punpckldq m3, m4, m6
3942 punpckhdq m4, m6
3943 punpckldq m6, m5, m2
3944 punpckhdq m5, m2
3945
; store columns 4..7 of rows 0..7 (+8 bytes = 4 words into each row)
3946 movh [r0 + 8], m3
3947 movhps [r0 + r1 + 8], m3
3948 movh [r0 + r1 * 2 + 8], m4
3949 movhps [r0 + r4 + 8], m4
3950 lea r0, [r0 + r1 * 4]
3951 movh [r0 + 8], m6
3952 movhps [r0 + r1 + 8], m6
3953 movh [r0 + r1 * 2 + 8], m5
3954 movhps [r0 + r4 + 8], m5
3955 RET
3956
;-----------------------------------------------------------------------------
; intra_pred_ang8_10: 8x8 prediction where every row is one reference
; sample replicated across the row (16-bit samples).
; cglobal 3,6,3: three arguments loaded (r0..r2), six GPRs, m0..m2 used.
;   r0  = base address the rows are stored to
;   r1  = row step; doubled on entry (presumably a stride given in 16-bit
;         samples -- confirm against the C prototype)
;   r2  = reference sample array; the replicated samples are the eight words
;         at [r2 + 34], the filter inputs are the words at [r2] .. [r2 + 16]
;   r4m = fifth argument, read from its argument slot; zero skips the filter
; Row k (k = 1..7) is word k of the load broadcast through the [pb_01]
; shuffle (presumably the byte pattern 0,1 repeated -- defined elsewhere).
; Row 0 is kept in m0 and stored last: unfiltered it is the broadcast of
; word 0; filtered it becomes
;   clamp(m0 + ((words at [r2 + 2] - word at [r2]) >> 1), 0, [pw_pixel_max])
;-----------------------------------------------------------------------------
3957 cglobal intra_pred_ang8_10, 3,6,3
3958 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
3959 pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1]
3960 add r1, r1 ; row step *= 2
3961 lea r3, [r1 * 3] ; r3 = 3 * row step
3962
3963 psrldq m1, 2 ; drop the lowest word so the next sample sits in word 0
3964 pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2]
3965 movu [r0 + r1], m2
3966 psrldq m1, 2
3967 pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3]
3968 movu [r0 + r1 * 2], m2
3969 psrldq m1, 2
3970 pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4]
3971 movu [r0 + r3], m2
3972
3973 lea r5, [r0 + r1 *4] ; r5 = address of row 4
3974 psrldq m1, 2
3975 pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5]
3976 movu [r5], m2
3977 psrldq m1, 2
3978 pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6]
3979 movu [r5 + r1], m2
3980 psrldq m1, 2
3981 pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7]
3982 movu [r5 + r1 * 2], m2
3983 psrldq m1, 2
3984 pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8]
3985 movu [r5 + r3], m2
3986
3987 cmp r4m, byte 0 ; filter flag clear -> store row 0 unmodified
3988 jz .quit
3989
3990 ; filter
3991
3992 movh m1, [r2] ; [3 2 1 0]
3993 pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
3994 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3995 psubw m1, m2 ; per-sample difference from sample 0
3996 psraw m1, 1 ; arithmetic halving keeps the sign
3997 paddw m0, m1
3998 pxor m1, m1
3999 pmaxsw m0, m1 ; clamp below at 0
4000 pminsw m0, [pw_pixel_max] ; clamp above at [pw_pixel_max]
4001 .quit:
4002 movu [r0], m0 ; row 0, filtered or not
4003 RET
4004
;-----------------------------------------------------------------------------
; intra_pred_ang8_11: 8x8 angular prediction on 16-bit samples.
; cglobal 3,5,7: three arguments loaded (r0..r2), five GPRs, m0..m6 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: eight words are taken from [r2 + 32] with
;        word 0 replaced by the word at [r2], paired with the eight words at
;        [r2 + 34]
; Every output column uses the same reference pairs (a, b); only the
; ang_table row f changes (30, 28, ..., 16).  Each sample is
;   (a * (32 - f) + b * f + [pd_16]) >> 5
; packed with unsigned saturation.  Each packed vector is one output column,
; so groups of four vectors are transposed and written as 8-byte pieces:
; columns 0-3 first, then columns 4-7.
;-----------------------------------------------------------------------------
4005 cglobal intra_pred_ang8_11, 3,5,7
4006 lea r3, [ang_table + 23 * 16] ; r3 -> table row 23; rows are 16 bytes apart
4007 add r1, r1 ; row step *= 2
4008
4009 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
4010 pinsrw m0, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
4011 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
4012
4013 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4014 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4015
4016 mova m4, m3
4017 pmaddwd m4, [r3 + 7 * 16] ; [30]
4018 paddd m4, [pd_16]
4019 psrld m4, 5
4020 mova m2, m0
4021 pmaddwd m2, [r3 + 7 * 16]
4022 paddd m2, [pd_16]
4023 psrld m2, 5
4024 packusdw m4, m2
4025
4026 mova m2, m3
4027 pmaddwd m2, [r3 + 5 * 16] ; [28]
4028 paddd m2, [pd_16]
4029 psrld m2, 5
4030 mova m1, m0
4031 pmaddwd m1, [r3 + 5 * 16]
4032 paddd m1, [pd_16]
4033 psrld m1, 5
4034 packusdw m2, m1
4035
4036 mova m6, m3
4037 pmaddwd m6, [r3 + 3 * 16] ; [26]
4038 paddd m6, [pd_16]
4039 psrld m6, 5
4040 mova m1, m0
4041 pmaddwd m1, [r3 + 3 * 16]
4042 paddd m1, [pd_16]
4043 psrld m1, 5
4044 packusdw m6, m1
4045
4046 mova m5, m3
4047 pmaddwd m5, [r3 + 1 * 16] ; [24]
4048 paddd m5, [pd_16]
4049 psrld m5, 5
4050 mova m1, m0
4051 pmaddwd m1, [r3 + 1 * 16]
4052 paddd m1, [pd_16]
4053 psrld m1, 5
4054 packusdw m5, m1
4055
; transpose the four column vectors (m4, m2, m6, m5) into row pieces
4056 punpckhwd m1, m4, m2
4057 punpcklwd m4, m2
4058 punpckhwd m2, m6, m5
4059 punpcklwd m6, m5
4060
4061 punpckldq m5, m4, m6
4062 punpckhdq m4, m6
4063 punpckldq m6, m1, m2
4064 punpckhdq m1, m2
4065
4066 lea r4, [r1 * 3] ; r4 = 3 * row step
4067 movh [r0], m5
4068 movhps [r0 + r1], m5
4069 movh [r0 + r1 * 2], m4
4070 movhps [r0 + r4], m4
4071 lea r2, [r0 + r1 * 4] ; r2 reused as the row-4 address; no loads from the reference follow
4072 movh [r2], m6
4073 movhps [r2 + r1], m6
4074 movh [r2 + r1 * 2], m1
4075 movhps [r2 + r4], m1
4076
4077 mova m4, m3
4078 pmaddwd m4, [r3 - 1 * 16] ; [22]
4079 paddd m4, [pd_16]
4080 psrld m4, 5
4081 mova m2, m0
4082 pmaddwd m2, [r3 - 1 * 16]
4083 paddd m2, [pd_16]
4084 psrld m2, 5
4085 packusdw m4, m2
4086
4087 mova m2, m3
4088 pmaddwd m2, [r3 - 3 * 16] ; [20]
4089 paddd m2, [pd_16]
4090 psrld m2, 5
4091 mova m1, m0
4092 pmaddwd m1, [r3 - 3 * 16]
4093 paddd m1, [pd_16]
4094 psrld m1, 5
4095 packusdw m2, m1
4096
4097 mova m6, m3
4098 pmaddwd m6, [r3 - 5 * 16] ; [18]
4099 paddd m6, [pd_16]
4100 psrld m6, 5
4101 mova m5, m0
4102 pmaddwd m5, [r3 - 5 * 16]
4103 paddd m5, [pd_16]
4104 psrld m5, 5
4105 packusdw m6, m5
4106
4107 pmaddwd m3, [r3 - 7 * 16] ; [16]
4108 paddd m3, [pd_16]
4109 psrld m3, 5
4110 pmaddwd m0, [r3 - 7 * 16]
4111 paddd m0, [pd_16]
4112 psrld m0, 5
4113 packusdw m3, m0
4114
; transpose columns 4-7 (m4, m2, m6, m3)
4115 punpckhwd m5, m4, m2
4116 punpcklwd m4, m2
4117 punpckhwd m2, m6, m3
4118 punpcklwd m6, m3
4119
4120 punpckldq m3, m4, m6
4121 punpckhdq m4, m6
4122 punpckldq m6, m5, m2
4123 punpckhdq m5, m2
4124
; right half of each row: 8 bytes past the left half
4125 movh [r0 + 8], m3
4126 movhps [r0 + r1 + 8], m3
4127 movh [r0 + r1 * 2 + 8], m4
4128 movhps [r0 + r4 + 8], m4
4129 lea r0, [r0 + r1 * 4]
4130 movh [r0 + 8], m6
4131 movhps [r0 + r1 + 8], m6
4132 movh [r0 + r1 * 2 + 8], m5
4133 movhps [r0 + r4 + 8], m5
4134 RET
4135
;-----------------------------------------------------------------------------
; intra_pred_ang8_12: 8x8 angular prediction on 16-bit samples.
; cglobal 3,6,7: three arguments loaded (r0..r2), six GPRs, m0..m6 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2 + 32] (word 0
;        replaced by the word at [r2]) and [r2 + 34]; the extra sample needed
;        after the window slide comes from [r2] through the [pw_ang8_12]
;        shuffle (mask defined elsewhere)
; Columns 0-5 use the initial pairs with ang_table rows 27, 22, 17, 12, 7, 2;
; the pair window is then slid down by one sample for rows 29 and 24.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation; each packed vector is one output column, so groups of four are
; transposed and stored as 8-byte pieces (columns 0-3, then 4-7).
;-----------------------------------------------------------------------------
4136 cglobal intra_pred_ang8_12, 3,6,7
4137 lea r5, [ang_table + 16 * 16] ; r5 -> table row 16; rows are 16 bytes apart
4138 add r1, r1 ; row step *= 2
4139
4140 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
4141 pinsrw m0, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
4142 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
4143
4144 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4145 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4146
4147 mova m4, m3
4148 pmaddwd m4, [r5 + 11 * 16] ; [27]
4149 paddd m4, [pd_16]
4150 psrld m4, 5
4151 mova m2, m0
4152 pmaddwd m2, [r5 + 11 * 16]
4153 paddd m2, [pd_16]
4154 psrld m2, 5
4155 packusdw m4, m2
4156
4157 mova m2, m3
4158 pmaddwd m2, [r5 + 6 * 16] ; [22]
4159 paddd m2, [pd_16]
4160 psrld m2, 5
4161 mova m1, m0
4162 pmaddwd m1, [r5 + 6 * 16]
4163 paddd m1, [pd_16]
4164 psrld m1, 5
4165 packusdw m2, m1
4166
4167 mova m6, m3
4168 pmaddwd m6, [r5 + 1 * 16] ; [17]
4169 paddd m6, [pd_16]
4170 psrld m6, 5
4171 mova m1, m0
4172 pmaddwd m1, [r5 + 1 * 16]
4173 paddd m1, [pd_16]
4174 psrld m1, 5
4175 packusdw m6, m1
4176
4177 mova m5, m3
4178 pmaddwd m5, [r5 - 4 * 16] ; [12]
4179 paddd m5, [pd_16]
4180 psrld m5, 5
4181 mova m1, m0
4182 pmaddwd m1, [r5 - 4 * 16]
4183 paddd m1, [pd_16]
4184 psrld m1, 5
4185 packusdw m5, m1
4186
; transpose the four column vectors (m4, m2, m6, m5) into row pieces
4187 punpckhwd m1, m4, m2
4188 punpcklwd m4, m2
4189 punpckhwd m2, m6, m5
4190 punpcklwd m6, m5
4191
4192 punpckldq m5, m4, m6
4193 punpckhdq m4, m6
4194 punpckldq m6, m1, m2
4195 punpckhdq m1, m2
4196
4197 lea r4, [r1 * 3] ; r4 = 3 * row step
4198 movh [r0], m5
4199 movhps [r0 + r1], m5
4200 movh [r0 + r1 * 2], m4
4201 movhps [r0 + r4], m4
4202 lea r3, [r0 + r1 * 4] ; row-4 address kept in r3: r2 is still loaded from below
4203 movh [r3], m6
4204 movhps [r3 + r1], m6
4205 movh [r3 + r1 * 2], m1
4206 movhps [r3 + r4], m1
4207
4208 mova m4, m3
4209 pmaddwd m4, [r5 - 9 * 16] ; [7]
4210 paddd m4, [pd_16]
4211 psrld m4, 5
4212 mova m2, m0
4213 pmaddwd m2, [r5 - 9 * 16]
4214 paddd m2, [pd_16]
4215 psrld m2, 5
4216 packusdw m4, m2
4217
4218 mova m2, m3
4219 pmaddwd m2, [r5 - 14 * 16] ; [2]
4220 paddd m2, [pd_16]
4221 psrld m2, 5
4222 mova m1, m0
4223 pmaddwd m1, [r5 - 14 * 16]
4224 paddd m1, [pd_16]
4225 psrld m1, 5
4226 packusdw m2, m1
4227
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
4228 palignr m0, m3, 12
4229 movu m1, [r2]
4230 pshufb m1, [pw_ang8_12]
4231 palignr m3, m1, 12
4232
4233 mova m6, m3
4234 pmaddwd m6, [r5 + 13 * 16] ; [29]
4235 paddd m6, [pd_16]
4236 psrld m6, 5
4237 mova m5, m0
4238 pmaddwd m5, [r5 + 13 * 16]
4239 paddd m5, [pd_16]
4240 psrld m5, 5
4241 packusdw m6, m5
4242
4243 pmaddwd m3, [r5 + 8 * 16] ; [24]
4244 paddd m3, [pd_16]
4245 psrld m3, 5
4246 pmaddwd m0, [r5 + 8 * 16]
4247 paddd m0, [pd_16]
4248 psrld m0, 5
4249 packusdw m3, m0
4250
; transpose columns 4-7 (m4, m2, m6, m3)
4251 punpckhwd m5, m4, m2
4252 punpcklwd m4, m2
4253 punpckhwd m2, m6, m3
4254 punpcklwd m6, m3
4255
4256 punpckldq m3, m4, m6
4257 punpckhdq m4, m6
4258 punpckldq m6, m5, m2
4259 punpckhdq m5, m2
4260
; right half of each row: 8 bytes past the left half
4261 movh [r0 + 8], m3
4262 movhps [r0 + r1 + 8], m3
4263 movh [r0 + r1 * 2 + 8], m4
4264 movhps [r0 + r4 + 8], m4
4265 lea r0, [r0 + r1 * 4]
4266 movh [r0 + 8], m6
4267 movhps [r0 + r1 + 8], m6
4268 movh [r0 + r1 * 2 + 8], m5
4269 movhps [r0 + r4 + 8], m5
4270 RET
4271
;-----------------------------------------------------------------------------
; intra_pred_ang8_13: 8x8 angular prediction on 16-bit samples.
; cglobal 3,6,8: three arguments loaded (r0..r2), six GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2 + 32] (word 0
;        replaced by the word at [r2]) and [r2 + 34]; the extra samples come
;        from [r2] through the [pw_ang8_13] shuffle (mask defined elsewhere)
; ang_table rows per column: 23, 14, 5 | slide | 28, 19, 10, 1 | slide | 24,
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation; each packed vector is one output column, so groups of four are
; transposed and stored as 8-byte pieces (columns 0-3, then 4-7).
;-----------------------------------------------------------------------------
4272 cglobal intra_pred_ang8_13, 3,6,8
4273 lea r5, [ang_table + 14 * 16] ; r5 -> table row 14; rows are 16 bytes apart
4274 add r1, r1 ; row step *= 2
4275
4276 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
4277 pinsrw m0, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
4278 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
4279
4280 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4281 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4282
4283 mova m4, m3
4284 pmaddwd m4, [r5 + 9 * 16] ; [23]
4285 paddd m4, [pd_16]
4286 psrld m4, 5
4287 mova m2, m0
4288 pmaddwd m2, [r5 + 9 * 16]
4289 paddd m2, [pd_16]
4290 psrld m2, 5
4291 packusdw m4, m2
4292
4293 mova m2, m3
4294 pmaddwd m2, [r5] ; [14]
4295 paddd m2, [pd_16]
4296 psrld m2, 5
4297 mova m1, m0
4298 pmaddwd m1, [r5]
4299 paddd m1, [pd_16]
4300 psrld m1, 5
4301 packusdw m2, m1
4302
4303 mova m6, m3
4304 pmaddwd m6, [r5 - 9 * 16] ; [5]
4305 paddd m6, [pd_16]
4306 psrld m6, 5
4307 mova m1, m0
4308 pmaddwd m1, [r5 - 9 * 16]
4309 paddd m1, [pd_16]
4310 psrld m1, 5
4311 packusdw m6, m1
4312
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
4313 palignr m0, m3, 12
4314 movu m1, [r2]
4315 pshufb m1, [pw_ang8_13]
4316 palignr m3, m1, 12
4317
4318 mova m5, m3
4319 pmaddwd m5, [r5 + 14 * 16] ; [28]
4320 paddd m5, [pd_16]
4321 psrld m5, 5
4322 mova m7, m0
4323 pmaddwd m7, [r5 + 14 * 16]
4324 paddd m7, [pd_16]
4325 psrld m7, 5
4326 packusdw m5, m7
4327
; transpose the four column vectors (m4, m2, m6, m5) into row pieces
4328 punpckhwd m7, m4, m2
4329 punpcklwd m4, m2
4330 punpckhwd m2, m6, m5
4331 punpcklwd m6, m5
4332
4333 punpckldq m5, m4, m6
4334 punpckhdq m4, m6
4335 punpckldq m6, m7, m2
4336 punpckhdq m7, m2
4337
4338 lea r4, [r1 * 3] ; r4 = 3 * row step
4339 movh [r0], m5
4340 movhps [r0 + r1], m5
4341 movh [r0 + r1 * 2], m4
4342 movhps [r0 + r4], m4
4343 lea r2, [r0 + r1 * 4] ; r2 reused as the row-4 address; no loads from the reference follow
4344 movh [r2], m6
4345 movhps [r2 + r1], m6
4346 movh [r2 + r1 * 2], m7
4347 movhps [r2 + r4], m7
4348
4349 mova m4, m3
4350 pmaddwd m4, [r5 + 5 * 16] ; [19]
4351 paddd m4, [pd_16]
4352 psrld m4, 5
4353 mova m2, m0
4354 pmaddwd m2, [r5 + 5 * 16]
4355 paddd m2, [pd_16]
4356 psrld m2, 5
4357 packusdw m4, m2
4358
4359 mova m2, m3
4360 pmaddwd m2, [r5 - 4 * 16] ; [10]
4361 paddd m2, [pd_16]
4362 psrld m2, 5
4363 mova m5, m0
4364 pmaddwd m5, [r5 - 4 * 16]
4365 paddd m5, [pd_16]
4366 psrld m5, 5
4367 packusdw m2, m5
4368
4369 mova m6, m3
4370 pmaddwd m6, [r5 - 13 * 16] ; [1]
4371 paddd m6, [pd_16]
4372 psrld m6, 5
4373 mova m5, m0
4374 pmaddwd m5, [r5 - 13 * 16]
4375 paddd m5, [pd_16]
4376 psrld m5, 5
4377 packusdw m6, m5
4378
4379 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
4380 palignr m0, m3, 12 ; slide the window down by one more sample
4381 palignr m3, m1, 12
4382
4383 pmaddwd m3, [r5 + 10 * 16] ; [24]
4384 paddd m3, [pd_16]
4385 psrld m3, 5
4386 pmaddwd m0, [r5 + 10 * 16]
4387 paddd m0, [pd_16]
4388 psrld m0, 5
4389 packusdw m3, m0
4390
; transpose columns 4-7 (m4, m2, m6, m3)
4391 punpckhwd m5, m4, m2
4392 punpcklwd m4, m2
4393 punpckhwd m2, m6, m3
4394 punpcklwd m6, m3
4395
4396 punpckldq m3, m4, m6
4397 punpckhdq m4, m6
4398 punpckldq m6, m5, m2
4399 punpckhdq m5, m2
4400
; right half of each row: 8 bytes past the left half
4401 movh [r0 + 8], m3
4402 movhps [r0 + r1 + 8], m3
4403 movh [r0 + r1 * 2 + 8], m4
4404 movhps [r0 + r4 + 8], m4
4405 lea r0, [r0 + r1 * 4]
4406 movh [r0 + 8], m6
4407 movhps [r0 + r1 + 8], m6
4408 movh [r0 + r1 * 2 + 8], m5
4409 movhps [r0 + r4 + 8], m5
4410 RET
4411
;-----------------------------------------------------------------------------
; intra_pred_ang8_14: 8x8 angular prediction on 16-bit samples.
; cglobal 3,6,8: three arguments loaded (r0..r2), six GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2 + 32] (word 0
;        replaced by the word at [r2]) and [r2 + 34]; the extra samples come
;        from [r2] through the [pw_ang8_14] shuffle (mask defined elsewhere)
; ang_table rows per column:
;   19, 6 | slide | 25, 12 | slide | 31, 18, 5 | slide | 24
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation; each packed vector is one output column, so groups of four are
; transposed and stored as 8-byte pieces (columns 0-3, then 4-7).
;-----------------------------------------------------------------------------
4412 cglobal intra_pred_ang8_14, 3,6,8
4413 lea r5, [ang_table + 18 * 16] ; r5 -> table row 18; rows are 16 bytes apart
4414 add r1, r1 ; row step *= 2
4415
4416 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
4417 pinsrw m0, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
4418 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
4419
4420 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4421 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4422
4423 mova m4, m3
4424 pmaddwd m4, [r5 + 1 * 16] ; [19]
4425 paddd m4, [pd_16]
4426 psrld m4, 5
4427 mova m2, m0
4428 pmaddwd m2, [r5 + 1 * 16]
4429 paddd m2, [pd_16]
4430 psrld m2, 5
4431 packusdw m4, m2
4432
4433 mova m2, m3
4434 pmaddwd m2, [r5 - 12 * 16] ; [6]
4435 paddd m2, [pd_16]
4436 psrld m2, 5
4437 mova m1, m0
4438 pmaddwd m1, [r5 - 12 * 16]
4439 paddd m1, [pd_16]
4440 psrld m1, 5
4441 packusdw m2, m1
4442
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
4443 palignr m0, m3, 12
4444 movu m1, [r2]
4445 pshufb m1, [pw_ang8_14]
4446 palignr m3, m1, 12
4447
4448 mova m6, m3
4449 pmaddwd m6, [r5 + 7 * 16] ; [25]
4450 paddd m6, [pd_16]
4451 psrld m6, 5
4452 mova m5, m0
4453 pmaddwd m5, [r5 + 7 * 16]
4454 paddd m5, [pd_16]
4455 psrld m5, 5
4456 packusdw m6, m5
4457
4458 mova m5, m3
4459 pmaddwd m5, [r5 - 6 * 16] ; [12]
4460 paddd m5, [pd_16]
4461 psrld m5, 5
4462 mova m7, m0
4463 pmaddwd m7, [r5 - 6 * 16]
4464 paddd m7, [pd_16]
4465 psrld m7, 5
4466 packusdw m5, m7
4467
; transpose the four column vectors (m4, m2, m6, m5) into row pieces
4468 punpckhwd m7, m4, m2
4469 punpcklwd m4, m2
4470 punpckhwd m2, m6, m5
4471 punpcklwd m6, m5
4472
4473 punpckldq m5, m4, m6
4474 punpckhdq m4, m6
4475 punpckldq m6, m7, m2
4476 punpckhdq m7, m2
4477
4478 lea r4, [r1 * 3] ; r4 = 3 * row step
4479 movh [r0], m5
4480 movhps [r0 + r1], m5
4481 movh [r0 + r1 * 2], m4
4482 movhps [r0 + r4], m4
4483 lea r2, [r0 + r1 * 4] ; r2 reused as the row-4 address; no loads from the reference follow
4484 movh [r2], m6
4485 movhps [r2 + r1], m6
4486 movh [r2 + r1 * 2], m7
4487 movhps [r2 + r4], m7
4488
4489 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
4490 palignr m0, m3, 12 ; slide the window down by one more sample
4491 palignr m3, m1, 12
4492
4493 mova m4, m3
4494 pmaddwd m4, [r5 + 13 * 16] ; [31]
4495 paddd m4, [pd_16]
4496 psrld m4, 5
4497 mova m2, m0
4498 pmaddwd m2, [r5 + 13 * 16]
4499 paddd m2, [pd_16]
4500 psrld m2, 5
4501 packusdw m4, m2
4502
4503 mova m2, m3
4504 pmaddwd m2, [r5] ; [18]
4505 paddd m2, [pd_16]
4506 psrld m2, 5
4507 mova m5, m0
4508 pmaddwd m5, [r5]
4509 paddd m5, [pd_16]
4510 psrld m5, 5
4511 packusdw m2, m5
4512
4513 mova m6, m3
4514 pmaddwd m6, [r5 - 13 * 16] ; [5]
4515 paddd m6, [pd_16]
4516 psrld m6, 5
4517 mova m5, m0
4518 pmaddwd m5, [r5 - 13 * 16]
4519 paddd m5, [pd_16]
4520 psrld m5, 5
4521 packusdw m6, m5
4522
4523 pslldq m1, 2 ; next shuffled pair
4524 palignr m0, m3, 12 ; slide the window again
4525 palignr m3, m1, 12
4526
4527 pmaddwd m3, [r5 + 6 * 16] ; [24]
4528 paddd m3, [pd_16]
4529 psrld m3, 5
4530 pmaddwd m0, [r5 + 6 * 16]
4531 paddd m0, [pd_16]
4532 psrld m0, 5
4533 packusdw m3, m0
4534
; transpose columns 4-7 (m4, m2, m6, m3)
4535 punpckhwd m5, m4, m2
4536 punpcklwd m4, m2
4537 punpckhwd m2, m6, m3
4538 punpcklwd m6, m3
4539
4540 punpckldq m3, m4, m6
4541 punpckhdq m4, m6
4542 punpckldq m6, m5, m2
4543 punpckhdq m5, m2
4544
; right half of each row: 8 bytes past the left half
4545 movh [r0 + 8], m3
4546 movhps [r0 + r1 + 8], m3
4547 movh [r0 + r1 * 2 + 8], m4
4548 movhps [r0 + r4 + 8], m4
4549 lea r0, [r0 + r1 * 4]
4550 movh [r0 + 8], m6
4551 movhps [r0 + r1 + 8], m6
4552 movh [r0 + r1 * 2 + 8], m5
4553 movhps [r0 + r4 + 8], m5
4554 RET
4555
;-----------------------------------------------------------------------------
; intra_pred_ang8_15: 8x8 angular prediction on 16-bit samples.
; cglobal 3,6,8: three arguments loaded (r0..r2), six GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2 + 32] (word 0
;        replaced by the word at [r2]) and [r2 + 34]; the extra samples come
;        from [r2] through the [pw_ang8_15] shuffle (mask defined elsewhere),
;        and the last one directly from the word at [r2 + 16]
; ang_table rows per column:
;   15 | slide | 30, 13 | slide | 28, 11 | slide | 26, 9 | slide | 24
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation; each packed vector is one output column, so groups of four are
; transposed and stored as 8-byte pieces (columns 0-3, then 4-7).
;-----------------------------------------------------------------------------
4556 cglobal intra_pred_ang8_15, 3,6,8
4557 lea r5, [ang_table + 20 * 16] ; r5 -> table row 20; rows are 16 bytes apart
4558 add r1, r1 ; row step *= 2
4559
4560 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
4561 pinsrw m0, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
4562 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
4563
4564 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4565 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4566
4567 mova m4, m3
4568 pmaddwd m4, [r5 - 5 * 16] ; [15]
4569 paddd m4, [pd_16]
4570 psrld m4, 5
4571 mova m2, m0
4572 pmaddwd m2, [r5 - 5 * 16]
4573 paddd m2, [pd_16]
4574 psrld m2, 5
4575 packusdw m4, m2
4576
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
4577 palignr m0, m3, 12
4578 movu m1, [r2]
4579 pshufb m1, [pw_ang8_15]
4580 palignr m3, m1, 12
4581
4582 mova m2, m3
4583 pmaddwd m2, [r5 + 10 * 16] ; [30]
4584 paddd m2, [pd_16]
4585 psrld m2, 5
4586 mova m5, m0
4587 pmaddwd m5, [r5 + 10 * 16]
4588 paddd m5, [pd_16]
4589 psrld m5, 5
4590 packusdw m2, m5
4591
4592 mova m6, m3
4593 pmaddwd m6, [r5 - 7 * 16] ; [13]
4594 paddd m6, [pd_16]
4595 psrld m6, 5
4596 mova m5, m0
4597 pmaddwd m5, [r5 - 7 * 16]
4598 paddd m5, [pd_16]
4599 psrld m5, 5
4600 packusdw m6, m5
4601
4602 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
4603 palignr m0, m3, 12 ; slide the window down by one more sample
4604 palignr m3, m1, 12
4605
4606 mova m5, m3
4607 pmaddwd m5, [r5 + 8 * 16] ; [28]
4608 paddd m5, [pd_16]
4609 psrld m5, 5
4610 mova m7, m0
4611 pmaddwd m7, [r5 + 8 * 16]
4612 paddd m7, [pd_16]
4613 psrld m7, 5
4614 packusdw m5, m7
4615
; transpose the four column vectors (m4, m2, m6, m5) into row pieces
4616 punpckhwd m7, m4, m2
4617 punpcklwd m4, m2
4618 punpckhwd m2, m6, m5
4619 punpcklwd m6, m5
4620
4621 punpckldq m5, m4, m6
4622 punpckhdq m4, m6
4623 punpckldq m6, m7, m2
4624 punpckhdq m7, m2
4625
4626 lea r4, [r1 * 3] ; r4 = 3 * row step
4627 movh [r0], m5
4628 movhps [r0 + r1], m5
4629 movh [r0 + r1 * 2], m4
4630 movhps [r0 + r4], m4
4631 lea r3, [r0 + r1 * 4] ; row-4 address kept in r3: r2 is still loaded from below
4632 movh [r3], m6
4633 movhps [r3 + r1], m6
4634 movh [r3 + r1 * 2], m7
4635 movhps [r3 + r4], m7
4636
4637 mova m4, m3
4638 pmaddwd m4, [r5 - 9 * 16] ; [11]
4639 paddd m4, [pd_16]
4640 psrld m4, 5
4641 mova m2, m0
4642 pmaddwd m2, [r5 - 9 * 16]
4643 paddd m2, [pd_16]
4644 psrld m2, 5
4645 packusdw m4, m2
4646
4647 pslldq m1, 2 ; next shuffled pair
4648 palignr m0, m3, 12 ; slide the window again
4649 palignr m3, m1, 12
4650
4651 mova m2, m3
4652 pmaddwd m2, [r5 + 6 * 16] ; [26]
4653 paddd m2, [pd_16]
4654 psrld m2, 5
4655 mova m5, m0
4656 pmaddwd m5, [r5 + 6 * 16]
4657 paddd m5, [pd_16]
4658 psrld m5, 5
4659 packusdw m2, m5
4660
4661 mova m6, m3
4662 pmaddwd m6, [r5 - 11 * 16] ; [9]
4663 paddd m6, [pd_16]
4664 psrld m6, 5
4665 mova m5, m0
4666 pmaddwd m5, [r5 - 11 * 16]
4667 paddd m5, [pd_16]
4668 psrld m5, 5
4669 packusdw m6, m5
4670
4671 pslldq m1, 2
4672 palignr m0, m3, 12
4673 palignr m3, m1, 12
4674 pinsrw m3, [r2 + 16], 0 ; lowest sample of the final window is read directly from [r2 + 16]
4675
4676 pmaddwd m3, [r5 + 4 * 16] ; [24]
4677 paddd m3, [pd_16]
4678 psrld m3, 5
4679 pmaddwd m0, [r5 + 4 * 16]
4680 paddd m0, [pd_16]
4681 psrld m0, 5
4682 packusdw m3, m0
4683
; transpose columns 4-7 (m4, m2, m6, m3)
4684 punpckhwd m5, m4, m2
4685 punpcklwd m4, m2
4686 punpckhwd m2, m6, m3
4687 punpcklwd m6, m3
4688
4689 punpckldq m3, m4, m6
4690 punpckhdq m4, m6
4691 punpckldq m6, m5, m2
4692 punpckhdq m5, m2
4693
; right half of each row: 8 bytes past the left half
4694 movh [r0 + 8], m3
4695 movhps [r0 + r1 + 8], m3
4696 movh [r0 + r1 * 2 + 8], m4
4697 movhps [r0 + r4 + 8], m4
4698 lea r0, [r0 + r1 * 4]
4699 movh [r0 + 8], m6
4700 movhps [r0 + r1 + 8], m6
4701 movh [r0 + r1 * 2 + 8], m5
4702 movhps [r0 + r4 + 8], m5
4703 RET
4704
;-----------------------------------------------------------------------------
; intra_pred_ang8_16: 8x8 angular prediction on 16-bit samples.
; cglobal 3,6,8: three arguments loaded (r0..r2), six GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2 + 32] (word 0
;        replaced by the word at [r2]) and [r2 + 34]; the extra samples come
;        from [r2] through the [pw_ang8_16] shuffle (mask defined elsewhere),
;        and the last one directly from the word at [r2 + 16]
; ang_table rows per column:
;   11 | slide | 22, 1 | slide | 12 | slide | 23, 2 | slide | 13 | slide | 24
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation; each packed vector is one output column, so groups of four are
; transposed and stored as 8-byte pieces (columns 0-3, then 4-7).
;-----------------------------------------------------------------------------
4705 cglobal intra_pred_ang8_16, 3,6,8
4706 lea r5, [ang_table + 13 * 16] ; r5 -> table row 13; rows are 16 bytes apart
4707 add r1, r1 ; row step *= 2
4708
4709 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
4710 pinsrw m0, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
4711 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
4712
4713 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4714 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4715
4716 mova m4, m3
4717 pmaddwd m4, [r5 - 2 * 16] ; [11]
4718 paddd m4, [pd_16]
4719 psrld m4, 5
4720 mova m2, m0
4721 pmaddwd m2, [r5 - 2 * 16]
4722 paddd m2, [pd_16]
4723 psrld m2, 5
4724 packusdw m4, m2
4725
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
4726 palignr m0, m3, 12
4727 movu m1, [r2]
4728 pshufb m1, [pw_ang8_16]
4729 palignr m3, m1, 12
4730
4731 mova m2, m3
4732 pmaddwd m2, [r5 + 9 * 16] ; [22]
4733 paddd m2, [pd_16]
4734 psrld m2, 5
4735 mova m5, m0
4736 pmaddwd m5, [r5 + 9 * 16]
4737 paddd m5, [pd_16]
4738 psrld m5, 5
4739 packusdw m2, m5
4740
4741 mova m6, m3
4742 pmaddwd m6, [r5 - 12 * 16] ; [1]
4743 paddd m6, [pd_16]
4744 psrld m6, 5
4745 mova m5, m0
4746 pmaddwd m5, [r5 - 12 * 16]
4747 paddd m5, [pd_16]
4748 psrld m5, 5
4749 packusdw m6, m5
4750
4751 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
4752 palignr m0, m3, 12 ; slide the window down by one more sample
4753 palignr m3, m1, 12
4754
4755 mova m5, m3
4756 pmaddwd m5, [r5 - 1 * 16] ; [12]
4757 paddd m5, [pd_16]
4758 psrld m5, 5
4759 mova m7, m0
4760 pmaddwd m7, [r5 - 1 * 16]
4761 paddd m7, [pd_16]
4762 psrld m7, 5
4763 packusdw m5, m7
4764
; transpose the four column vectors (m4, m2, m6, m5) into row pieces
4765 punpckhwd m7, m4, m2
4766 punpcklwd m4, m2
4767 punpckhwd m2, m6, m5
4768 punpcklwd m6, m5
4769
4770 punpckldq m5, m4, m6
4771 punpckhdq m4, m6
4772 punpckldq m6, m7, m2
4773 punpckhdq m7, m2
4774
4775 lea r4, [r1 * 3] ; r4 = 3 * row step
4776 movh [r0], m5
4777 movhps [r0 + r1], m5
4778 movh [r0 + r1 * 2], m4
4779 movhps [r0 + r4], m4
4780 lea r3, [r0 + r1 * 4] ; row-4 address kept in r3: r2 is still loaded from below
4781 movh [r3], m6
4782 movhps [r3 + r1], m6
4783 movh [r3 + r1 * 2], m7
4784 movhps [r3 + r4], m7
4785
4786 pslldq m1, 2 ; next shuffled pair
4787 palignr m0, m3, 12 ; slide the window again
4788 palignr m3, m1, 12
4789
4790 mova m4, m3
4791 pmaddwd m4, [r5 + 10 * 16] ; [23]
4792 paddd m4, [pd_16]
4793 psrld m4, 5
4794 mova m2, m0
4795 pmaddwd m2, [r5 + 10 * 16]
4796 paddd m2, [pd_16]
4797 psrld m2, 5
4798 packusdw m4, m2
4799
4800 mova m2, m3
4801 pmaddwd m2, [r5 - 11 * 16] ; [2]
4802 paddd m2, [pd_16]
4803 psrld m2, 5
4804 mova m5, m0
4805 pmaddwd m5, [r5 - 11 * 16]
4806 paddd m5, [pd_16]
4807 psrld m5, 5
4808 packusdw m2, m5
4809
4810 pslldq m1, 2
4811 palignr m0, m3, 12
4812 palignr m3, m1, 12
4813
4814 mova m6, m3
4815 pmaddwd m6, [r5] ; [13]
4816 paddd m6, [pd_16]
4817 psrld m6, 5
4818 mova m5, m0
4819 pmaddwd m5, [r5]
4820 paddd m5, [pd_16]
4821 psrld m5, 5
4822 packusdw m6, m5
4823
4824 pslldq m1, 2
4825 palignr m0, m3, 12
4826 palignr m3, m1, 12
4827 pinsrw m3, [r2 + 16], 0 ; lowest sample of the final window is read directly from [r2 + 16]
4828
4829 pmaddwd m3, [r5 + 11 * 16] ; [24]
4830 paddd m3, [pd_16]
4831 psrld m3, 5
4832 pmaddwd m0, [r5 + 11 * 16]
4833 paddd m0, [pd_16]
4834 psrld m0, 5
4835 packusdw m3, m0
4836
; transpose columns 4-7 (m4, m2, m6, m3)
4837 punpckhwd m5, m4, m2
4838 punpcklwd m4, m2
4839 punpckhwd m2, m6, m3
4840 punpcklwd m6, m3
4841
4842 punpckldq m3, m4, m6
4843 punpckhdq m4, m6
4844 punpckldq m6, m5, m2
4845 punpckhdq m5, m2
4846
; right half of each row: 8 bytes past the left half
4847 movh [r0 + 8], m3
4848 movhps [r0 + r1 + 8], m3
4849 movh [r0 + r1 * 2 + 8], m4
4850 movhps [r0 + r4 + 8], m4
4851 lea r0, [r0 + r1 * 4]
4852 movh [r0 + 8], m6
4853 movhps [r0 + r1 + 8], m6
4854 movh [r0 + r1 * 2 + 8], m5
4855 movhps [r0 + r4 + 8], m5
4856 RET
4857
;-----------------------------------------------------------------------------
; intra_pred_ang8_17: 8x8 angular prediction on 16-bit samples.
; cglobal 3,6,8: three arguments loaded (r0..r2), six GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2 + 32] (word 0
;        replaced by the word at [r2]) and [r2 + 34]; the extra samples come
;        from [r2] through the [pw_ang8_17] shuffle (mask defined elsewhere)
; ang_table rows per column:
;   6 | slide | 12 | slide | 18 | slide | 24 | slide | 30, 4 | slide | 10 |
;   slide | 16
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation; each packed vector is one output column, so groups of four are
; transposed and stored as 8-byte pieces (columns 0-3, then 4-7).
;-----------------------------------------------------------------------------
4858 cglobal intra_pred_ang8_17, 3,6,8
4859 lea r5, [ang_table + 17 * 16] ; r5 -> table row 17; rows are 16 bytes apart
4860 add r1, r1 ; row step *= 2
4861
4862 movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0]
4863 pinsrw m0, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
4864 movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
4865
4866 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4867 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4868
4869 mova m4, m3
4870 pmaddwd m4, [r5 - 11 * 16] ; [6]
4871 paddd m4, [pd_16]
4872 psrld m4, 5
4873 mova m2, m0
4874 pmaddwd m2, [r5 - 11 * 16]
4875 paddd m2, [pd_16]
4876 psrld m2, 5
4877 packusdw m4, m2
4878
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
4879 palignr m0, m3, 12
4880 movu m1, [r2]
4881 pshufb m1, [pw_ang8_17]
4882 palignr m3, m1, 12
4883
4884 mova m2, m3
4885 pmaddwd m2, [r5 - 5 * 16] ; [12]
4886 paddd m2, [pd_16]
4887 psrld m2, 5
4888 mova m5, m0
4889 pmaddwd m5, [r5 - 5 * 16]
4890 paddd m5, [pd_16]
4891 psrld m5, 5
4892 packusdw m2, m5
4893
4894 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
4895 palignr m0, m3, 12 ; slide the window down by one more sample
4896 palignr m3, m1, 12
4897
4898 mova m6, m3
4899 pmaddwd m6, [r5 + 1 * 16] ; [18]
4900 paddd m6, [pd_16]
4901 psrld m6, 5
4902 mova m5, m0
4903 pmaddwd m5, [r5 + 1 * 16]
4904 paddd m5, [pd_16]
4905 psrld m5, 5
4906 packusdw m6, m5
4907
4908 pslldq m1, 2
4909 palignr m0, m3, 12
4910 palignr m3, m1, 12
4911
4912 mova m5, m3
4913 pmaddwd m5, [r5 + 7 * 16] ; [24]
4914 paddd m5, [pd_16]
4915 psrld m5, 5
4916 mova m7, m0
4917 pmaddwd m7, [r5 + 7 * 16]
4918 paddd m7, [pd_16]
4919 psrld m7, 5
4920 packusdw m5, m7
4921
; transpose the four column vectors (m4, m2, m6, m5) into row pieces
4922 punpckhwd m7, m4, m2
4923 punpcklwd m4, m2
4924 punpckhwd m2, m6, m5
4925 punpcklwd m6, m5
4926
4927 punpckldq m5, m4, m6
4928 punpckhdq m4, m6
4929 punpckldq m6, m7, m2
4930 punpckhdq m7, m2
4931
4932 lea r4, [r1 * 3] ; r4 = 3 * row step
4933 movh [r0], m5
4934 movhps [r0 + r1], m5
4935 movh [r0 + r1 * 2], m4
4936 movhps [r0 + r4], m4
4937 lea r3, [r0 + r1 * 4] ; r3 = address of row 4
4938 movh [r3], m6
4939 movhps [r3 + r1], m6
4940 movh [r3 + r1 * 2], m7
4941 movhps [r3 + r4], m7
4942
4943 pslldq m1, 2 ; next shuffled pair
4944 palignr m0, m3, 12 ; slide the window again
4945 palignr m3, m1, 12
4946
4947 mova m4, m3
4948 pmaddwd m4, [r5 + 13 * 16] ; [30]
4949 paddd m4, [pd_16]
4950 psrld m4, 5
4951 mova m2, m0
4952 pmaddwd m2, [r5 + 13 * 16]
4953 paddd m2, [pd_16]
4954 psrld m2, 5
4955 packusdw m4, m2
4956
4957 mova m2, m3
4958 pmaddwd m2, [r5 - 13 * 16] ; [4]
4959 paddd m2, [pd_16]
4960 psrld m2, 5
4961 mova m5, m0
4962 pmaddwd m5, [r5 - 13 * 16]
4963 paddd m5, [pd_16]
4964 psrld m5, 5
4965 packusdw m2, m5
4966
4967 pslldq m1, 2
4968 palignr m0, m3, 12
4969 palignr m3, m1, 12
4970
4971 mova m6, m3
4972 pmaddwd m6, [r5 - 7 * 16] ; [10]
4973 paddd m6, [pd_16]
4974 psrld m6, 5
4975 mova m5, m0
4976 pmaddwd m5, [r5 - 7 * 16]
4977 paddd m5, [pd_16]
4978 psrld m5, 5
4979 packusdw m6, m5
4980
4981 pslldq m1, 2
4982 palignr m0, m3, 12
4983 palignr m3, m1, 12
4984
4985 pmaddwd m3, [r5 - 1 * 16] ; [16]
4986 paddd m3, [pd_16]
4987 psrld m3, 5
4988 pmaddwd m0, [r5 - 1 * 16]
4989 paddd m0, [pd_16]
4990 psrld m0, 5
4991 packusdw m3, m0
4992
; transpose columns 4-7 (m4, m2, m6, m3)
4993 punpckhwd m5, m4, m2
4994 punpcklwd m4, m2
4995 punpckhwd m2, m6, m3
4996 punpcklwd m6, m3
4997
4998 punpckldq m3, m4, m6
4999 punpckhdq m4, m6
5000 punpckldq m6, m5, m2
5001 punpckhdq m5, m2
5002
; right half of each row: 8 bytes past the left half
5003 movh [r0 + 8], m3
5004 movhps [r0 + r1 + 8], m3
5005 movh [r0 + r1 * 2 + 8], m4
5006 movhps [r0 + r4 + 8], m4
5007 lea r0, [r0 + r1 * 4]
5008 movh [r0 + 8], m6
5009 movhps [r0 + r1 + 8], m6
5010 movh [r0 + r1 * 2 + 8], m5
5011 movhps [r0 + r4 + 8], m5
5012 RET
5013
;-----------------------------------------------------------------------------
; intra_pred_ang8_18: 8x8 prediction on 16-bit samples where each row is the
; previous row shifted by one sample (no interpolation).
; cglobal 3,4,3: three arguments loaded (r0..r2), four GPRs, m0..m2 used.
;   r0 = base address the rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: eight words at [r2] form row 0; eight words
;        at [r2 + 34], reordered by the [pw_swap16] shuffle (presumably a
;        word-order reversal -- mask defined elsewhere), feed the later rows
; Row k is the 16-byte window taken (16 - 2k) bytes into the m1:m0
; concatenation, so each row drops one sample from the top of m1 and gains
; one from the top of m0.
;-----------------------------------------------------------------------------
5014 cglobal intra_pred_ang8_18, 3,4,3
5015 add r1, r1 ; row step *= 2
5016 lea r3, [r1 * 3] ; r3 = 3 * row step
5017 movu m1, [r2]
5018 movu m0, [r2 + 34]
5019 pshufb m0, [pw_swap16]
5020 movu [r0], m1 ; row 0: unshifted
5021 palignr m2, m1, m0, 14 ; row 1
5022 movu [r0 + r1], m2
5023 palignr m2, m1, m0, 12 ; row 2
5024 movu [r0 + r1 * 2], m2
5025 palignr m2, m1, m0, 10 ; row 3
5026 movu [r0 + r3], m2
5027 lea r0, [r0 + r1 * 4] ; advance to row 4
5028 palignr m2, m1, m0, 8 ; row 4
5029 movu [r0], m2
5030 palignr m2, m1, m0, 6 ; row 5
5031 movu [r0 + r1], m2
5032 palignr m2, m1, m0, 4 ; row 6
5033 movu [r0 + r1 * 2], m2
5034 palignr m1, m0, 2 ; row 7; m1 is no longer needed, so it is overwritten
5035 movu [r0 + r3], m1
5036 RET
5037
;-----------------------------------------------------------------------------
; intra_pred_ang8_19: 8x8 angular prediction on 16-bit samples.
; cglobal 3,5,8: three arguments loaded (r0..r2), five GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2] and [r2 + 2];
;        the extra samples come from [r2 + 32] (word 0 replaced by the word
;        at [r2]) through the [pw_ang8_17] shuffle (mask defined elsewhere)
; ang_table rows per output row:
;   6 | slide | 12 | slide | 18 | slide | 24 | slide | 30, 4 | slide | 10 |
;   slide | 16
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation.  Each packed vector is a complete output row and is stored
; directly with a 16-byte movu (no transpose).
;-----------------------------------------------------------------------------
5038 cglobal intra_pred_ang8_19, 3,5,8
5039 lea r3, [ang_table + 17 * 16] ; r3 -> table row 17; rows are 16 bytes apart
5040 add r1, r1 ; row step *= 2
5041
5042 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
5043 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5044
5045 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
5046 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
5047
5048 mova m4, m3
5049 pmaddwd m4, [r3 - 11 * 16] ; [6]
5050 paddd m4, [pd_16]
5051 psrld m4, 5
5052 mova m2, m0
5053 pmaddwd m2, [r3 - 11 * 16]
5054 paddd m2, [pd_16]
5055 psrld m2, 5
5056 packusdw m4, m2
5057
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
5058 palignr m0, m3, 12
5059 movu m1, [r2 + 32]
5060 pinsrw m1, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
5061 pshufb m1, [pw_ang8_17]
5062 palignr m3, m1, 12
5063
5064 mova m2, m3
5065 pmaddwd m2, [r3 - 5 * 16] ; [12]
5066 paddd m2, [pd_16]
5067 psrld m2, 5
5068 mova m5, m0
5069 pmaddwd m5, [r3 - 5 * 16]
5070 paddd m5, [pd_16]
5071 psrld m5, 5
5072 packusdw m2, m5
5073
5074 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
5075 palignr m0, m3, 12 ; slide the window down by one more sample
5076 palignr m3, m1, 12
5077
5078 mova m6, m3
5079 pmaddwd m6, [r3 + 1 * 16] ; [18]
5080 paddd m6, [pd_16]
5081 psrld m6, 5
5082 mova m5, m0
5083 pmaddwd m5, [r3 + 1 * 16]
5084 paddd m5, [pd_16]
5085 psrld m5, 5
5086 packusdw m6, m5
5087
5088 pslldq m1, 2
5089 palignr m0, m3, 12
5090 palignr m3, m1, 12
5091
5092 mova m5, m3
5093 pmaddwd m5, [r3 + 7 * 16] ; [24]
5094 paddd m5, [pd_16]
5095 psrld m5, 5
5096 mova m7, m0
5097 pmaddwd m7, [r3 + 7 * 16]
5098 paddd m7, [pd_16]
5099 psrld m7, 5
5100 packusdw m5, m7
5101
5102 lea r4, [r1 * 3] ; r4 = 3 * row step
; rows 0-3
5103 movu [r0], m4
5104 movu [r0 + r1], m2
5105 movu [r0 + r1 * 2], m6
5106 movu [r0 + r4], m5
5107
5108 pslldq m1, 2
5109 palignr m0, m3, 12
5110 palignr m3, m1, 12
5111
5112 mova m4, m3
5113 pmaddwd m4, [r3 + 13 * 16] ; [30]
5114 paddd m4, [pd_16]
5115 psrld m4, 5
5116 mova m2, m0
5117 pmaddwd m2, [r3 + 13 * 16]
5118 paddd m2, [pd_16]
5119 psrld m2, 5
5120 packusdw m4, m2
5121
5122 mova m2, m3
5123 pmaddwd m2, [r3 - 13 * 16] ; [4]
5124 paddd m2, [pd_16]
5125 psrld m2, 5
5126 mova m5, m0
5127 pmaddwd m5, [r3 - 13 * 16]
5128 paddd m5, [pd_16]
5129 psrld m5, 5
5130 packusdw m2, m5
5131
5132 pslldq m1, 2
5133 palignr m0, m3, 12
5134 palignr m3, m1, 12
5135
5136 mova m6, m3
5137 pmaddwd m6, [r3 - 7 * 16] ; [10]
5138 paddd m6, [pd_16]
5139 psrld m6, 5
5140 mova m5, m0
5141 pmaddwd m5, [r3 - 7 * 16]
5142 paddd m5, [pd_16]
5143 psrld m5, 5
5144 packusdw m6, m5
5145
5146 pslldq m1, 2
5147 palignr m0, m3, 12
5148 palignr m3, m1, 12
5149
5150 pmaddwd m3, [r3 - 1 * 16] ; [16]
5151 paddd m3, [pd_16]
5152 psrld m3, 5
5153 pmaddwd m0, [r3 - 1 * 16]
5154 paddd m0, [pd_16]
5155 psrld m0, 5
5156 packusdw m3, m0
5157
5158 lea r0, [r0 + r1 * 4] ; advance to row 4
; rows 4-7
5159 movu [r0], m4
5160 movu [r0 + r1], m2
5161 movu [r0 + r1 * 2], m6
5162 movu [r0 + r4], m3
5163 RET
5164
;-----------------------------------------------------------------------------
; intra_pred_ang8_20: 8x8 angular prediction on 16-bit samples.
; cglobal 3,5,8: three arguments loaded (r0..r2), five GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2] and [r2 + 2];
;        the extra samples come from [r2 + 32] (word 0 replaced by the word
;        at [r2]) through the [pw_ang8_16] shuffle (mask defined elsewhere),
;        and the last one directly from the word at [r2 + 16 + 32]
; ang_table rows per output row:
;   11 | slide | 22, 1 | slide | 12 | slide | 23, 2 | slide | 13 | slide | 24
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation.  Each packed vector is a complete output row and is stored
; directly with a 16-byte movu (no transpose).
;-----------------------------------------------------------------------------
5165 cglobal intra_pred_ang8_20, 3,5,8
5166 lea r3, [ang_table + 13 * 16] ; r3 -> table row 13; rows are 16 bytes apart
5167 add r1, r1 ; row step *= 2
5168
5169 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
5170 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5171
5172 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
5173 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
5174
5175 mova m4, m3
5176 pmaddwd m4, [r3 - 2 * 16] ; [11]
5177 paddd m4, [pd_16]
5178 psrld m4, 5
5179 mova m2, m0
5180 pmaddwd m2, [r3 - 2 * 16]
5181 paddd m2, [pd_16]
5182 psrld m2, 5
5183 packusdw m4, m2
5184
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
5185 palignr m0, m3, 12
5186 movu m1, [r2 + 32]
5187 pinsrw m1, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
5188 pshufb m1, [pw_ang8_16]
5189 palignr m3, m1, 12
5190
5191 mova m2, m3
5192 pmaddwd m2, [r3 + 9 * 16] ; [22]
5193 paddd m2, [pd_16]
5194 psrld m2, 5
5195 mova m5, m0
5196 pmaddwd m5, [r3 + 9 * 16]
5197 paddd m5, [pd_16]
5198 psrld m5, 5
5199 packusdw m2, m5
5200
5201 mova m6, m3
5202 pmaddwd m6, [r3 - 12 * 16] ; [1]
5203 paddd m6, [pd_16]
5204 psrld m6, 5
5205 mova m5, m0
5206 pmaddwd m5, [r3 - 12 * 16]
5207 paddd m5, [pd_16]
5208 psrld m5, 5
5209 packusdw m6, m5
5210
5211 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
5212 palignr m0, m3, 12 ; slide the window down by one more sample
5213 palignr m3, m1, 12
5214
5215 mova m5, m3
5216 pmaddwd m5, [r3 - 1 * 16] ; [12]
5217 paddd m5, [pd_16]
5218 psrld m5, 5
5219 mova m7, m0
5220 pmaddwd m7, [r3 - 1 * 16]
5221 paddd m7, [pd_16]
5222 psrld m7, 5
5223 packusdw m5, m7
5224
5225 lea r4, [r1 * 3] ; r4 = 3 * row step
; rows 0-3
5226 movu [r0], m4
5227 movu [r0 + r1], m2
5228 movu [r0 + r1 * 2], m6
5229 movu [r0 + r4], m5
5230
5231 pslldq m1, 2
5232 palignr m0, m3, 12
5233 palignr m3, m1, 12
5234
5235 mova m4, m3
5236 pmaddwd m4, [r3 + 10 * 16] ; [23]
5237 paddd m4, [pd_16]
5238 psrld m4, 5
5239 mova m2, m0
5240 pmaddwd m2, [r3 + 10 * 16]
5241 paddd m2, [pd_16]
5242 psrld m2, 5
5243 packusdw m4, m2
5244
5245 mova m2, m3
5246 pmaddwd m2, [r3 - 11 * 16] ; [2]
5247 paddd m2, [pd_16]
5248 psrld m2, 5
5249 mova m5, m0
5250 pmaddwd m5, [r3 - 11 * 16]
5251 paddd m5, [pd_16]
5252 psrld m5, 5
5253 packusdw m2, m5
5254
5255 pslldq m1, 2
5256 palignr m0, m3, 12
5257 palignr m3, m1, 12
5258
5259 mova m6, m3
5260 pmaddwd m6, [r3] ; [13]
5261 paddd m6, [pd_16]
5262 psrld m6, 5
5263 mova m5, m0
5264 pmaddwd m5, [r3]
5265 paddd m5, [pd_16]
5266 psrld m5, 5
5267 packusdw m6, m5
5268
5269 pslldq m1, 2
5270 palignr m0, m3, 12
5271 palignr m3, m1, 12
5272 pinsrw m3, [r2 + 16 + 32], 0 ; lowest sample of the final window is read directly from [r2 + 48]
5273
5274 pmaddwd m3, [r3 + 11 * 16] ; [24]
5275 paddd m3, [pd_16]
5276 psrld m3, 5
5277 pmaddwd m0, [r3 + 11 * 16]
5278 paddd m0, [pd_16]
5279 psrld m0, 5
5280 packusdw m3, m0
5281
5282 lea r0, [r0 + r1 * 4] ; advance to row 4
; rows 4-7
5283 movu [r0], m4
5284 movu [r0 + r1], m2
5285 movu [r0 + r1 * 2], m6
5286 movu [r0 + r4], m3
5287 RET
5288
;-----------------------------------------------------------------------------
; intra_pred_ang8_21: 8x8 angular prediction on 16-bit samples.
; cglobal 3,5,8: three arguments loaded (r0..r2), five GPRs, m0..m7 used.
;   r0 = base address the predicted rows are stored to
;   r1 = row step; doubled on entry (presumably a stride given in 16-bit
;        samples -- confirm against the C prototype)
;   r2 = reference sample array: the main pairs come from [r2] and [r2 + 2];
;        the extra samples come from [r2 + 32] (word 0 replaced by the word
;        at [r2]) through the [pw_ang8_15] shuffle (mask defined elsewhere),
;        and the last one directly from the word at [r2 + 16 + 32]
; ang_table rows per output row:
;   15 | slide | 30, 13 | slide | 28, 11 | slide | 26, 9 | slide | 24
; where "slide" moves the pair window down by one sample.
; Each sample is (a * (32 - f) + b * f + [pd_16]) >> 5, packed with unsigned
; saturation.  Each packed vector is a complete output row and is stored
; directly with a 16-byte movu (no transpose).
;-----------------------------------------------------------------------------
5289 cglobal intra_pred_ang8_21, 3,5,8
5290 lea r3, [ang_table + 20 * 16] ; r3 -> table row 20; rows are 16 bytes apart
5291 add r1, r1 ; row step *= 2
5292
5293 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
5294 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5295
5296 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
5297 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
5298
5299 mova m4, m3
5300 pmaddwd m4, [r3 - 5 * 16] ; [15]
5301 paddd m4, [pd_16]
5302 psrld m4, 5
5303 mova m2, m0
5304 pmaddwd m2, [r3 - 5 * 16]
5305 paddd m2, [pd_16]
5306 psrld m2, 5
5307 packusdw m4, m2
5308
; slide the pair window down by one sample: m0 takes m3's top pair as its
; lowest pair, m3 takes the top pair of the shuffled m1
5309 palignr m0, m3, 12
5310 movu m1, [r2 + 32]
5311 pinsrw m1, [r2], 0 ; word 0 comes from [r2] instead of [r2 + 32]
5312 pshufb m1, [pw_ang8_15]
5313 palignr m3, m1, 12
5314
5315 mova m2, m3
5316 pmaddwd m2, [r3 + 10 * 16] ; [30]
5317 paddd m2, [pd_16]
5318 psrld m2, 5
5319 mova m5, m0
5320 pmaddwd m5, [r3 + 10 * 16]
5321 paddd m5, [pd_16]
5322 psrld m5, 5
5323 packusdw m2, m5
5324
5325 mova m6, m3
5326 pmaddwd m6, [r3 - 7 * 16] ; [13]
5327 paddd m6, [pd_16]
5328 psrld m6, 5
5329 mova m5, m0
5330 pmaddwd m5, [r3 - 7 * 16]
5331 paddd m5, [pd_16]
5332 psrld m5, 5
5333 packusdw m6, m5
5334
5335 pslldq m1, 2 ; bring the next shuffled pair to the top of m1
5336 palignr m0, m3, 12 ; slide the window down by one more sample
5337 palignr m3, m1, 12
5338
5339 mova m5, m3
5340 pmaddwd m5, [r3 + 8 * 16] ; [28]
5341 paddd m5, [pd_16]
5342 psrld m5, 5
5343 mova m7, m0
5344 pmaddwd m7, [r3 + 8 * 16]
5345 paddd m7, [pd_16]
5346 psrld m7, 5
5347 packusdw m5, m7
5348
5349 lea r4, [r1 * 3] ; r4 = 3 * row step
; rows 0-3
5350 movu [r0], m4
5351 movu [r0 + r1], m2
5352 movu [r0 + r1 * 2], m6
5353 movu [r0 + r4], m5
5354
5355 mova m4, m3
5356 pmaddwd m4, [r3 - 9 * 16] ; [11]
5357 paddd m4, [pd_16]
5358 psrld m4, 5
5359 mova m2, m0
5360 pmaddwd m2, [r3 - 9 * 16]
5361 paddd m2, [pd_16]
5362 psrld m2, 5
5363 packusdw m4, m2
5364
5365 pslldq m1, 2
5366 palignr m0, m3, 12
5367 palignr m3, m1, 12
5368
5369 mova m2, m3
5370 pmaddwd m2, [r3 + 6 * 16] ; [26]
5371 paddd m2, [pd_16]
5372 psrld m2, 5
5373 mova m5, m0
5374 pmaddwd m5, [r3 + 6 * 16]
5375 paddd m5, [pd_16]
5376 psrld m5, 5
5377 packusdw m2, m5
5378
5379 mova m6, m3
5380 pmaddwd m6, [r3 - 11 * 16] ; [9]
5381 paddd m6, [pd_16]
5382 psrld m6, 5
5383 mova m5, m0
5384 pmaddwd m5, [r3 - 11 * 16]
5385 paddd m5, [pd_16]
5386 psrld m5, 5
5387 packusdw m6, m5
5388
5389 pslldq m1, 2
5390 palignr m0, m3, 12
5391 palignr m3, m1, 12
5392 pinsrw m3, [r2 + 16 + 32], 0 ; lowest sample of the final window is read directly from [r2 + 48]
5393
5394 pmaddwd m3, [r3 + 4 * 16] ; [24]
5395 paddd m3, [pd_16]
5396 psrld m3, 5
5397 pmaddwd m0, [r3 + 4 * 16]
5398 paddd m0, [pd_16]
5399 psrld m0, 5
5400 packusdw m3, m0
5401
5402 lea r0, [r0 + r1 * 4] ; advance to row 4
; rows 4-7
5403 movu [r0], m4
5404 movu [r0 + r1], m2
5405 movu [r0 + r1 * 2], m6
5406 movu [r0 + r4], m3
5407 RET
5408
;-----------------------------------------------------------------------------
; intra_pred_ang8_22 -- 8x8 angular intra prediction, mode 22, 16-bit samples.
; cglobal 3,5,8: 3 arguments, 5 GPRs, 8 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words); a second edge is read from byte offset 32
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; Each row is (a*(32-f) + b*f + pd_16) >> 5 over interleaved (a, b) sample
; pairs held in m3 (low half of the row) and m0 (high half). The bracketed
; number next to each pmaddwd is the fraction f selected from ang_table.
; pd_16 and pw_ang8_14 are defined outside this view.
;-----------------------------------------------------------------------------
5409 cglobal intra_pred_ang8_22, 3,5,8
5410 lea r3, [ang_table + 18 * 16] ; r3 -> weight entry for fraction 18
5411 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
5412
5413 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
5414 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5415
5416 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
5417 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
5418
5419 mova m4, m3
5420 pmaddwd m4, [r3 + 1 * 16] ; [19] row 0
5421 paddd m4, [pd_16]
5422 psrld m4, 5
5423 mova m2, m0
5424 pmaddwd m2, [r3 + 1 * 16]
5425 paddd m2, [pd_16]
5426 psrld m2, 5
5427 packusdw m4, m2
5428
5429 mova m2, m3
5430 pmaddwd m2, [r3 - 12 * 16] ; [6] row 1
5431 paddd m2, [pd_16]
5432 psrld m2, 5
5433 mova m1, m0
5434 pmaddwd m1, [r3 - 12 * 16]
5435 paddd m1, [pd_16]
5436 psrld m1, 5
5437 packusdw m2, m1
5438
5439 palignr m0, m3, 12 ; slide window one pair: m0 takes the top pair of m3
5440 movu m1, [r2 + 32] ; second reference edge (byte offset 32)
5441 pinsrw m1, [r2], 0 ; word 0 <- first reference sample
5442 pshufb m1, [pw_ang8_14] ; reorder edge samples (mask defined outside this view)
5443 palignr m3, m1, 12 ; m3 takes the top pair of the shuffled edge
5444
5445 mova m6, m3
5446 pmaddwd m6, [r3 + 7 * 16] ; [25] row 2
5447 paddd m6, [pd_16]
5448 psrld m6, 5
5449 mova m5, m0
5450 pmaddwd m5, [r3 + 7 * 16]
5451 paddd m5, [pd_16]
5452 psrld m5, 5
5453 packusdw m6, m5
5454
5455 mova m5, m3
5456 pmaddwd m5, [r3 - 6 * 16] ; [12] row 3
5457 paddd m5, [pd_16]
5458 psrld m5, 5
5459 mova m7, m0
5460 pmaddwd m7, [r3 - 6 * 16]
5461 paddd m7, [pd_16]
5462 psrld m7, 5
5463 packusdw m5, m7
5464
5465 lea r4, [r1 * 3] ; r4 = 3 * stride
5466 movu [r0], m4 ; store rows 0-3
5467 movu [r0 + r1], m2
5468 movu [r0 + r1 * 2], m6
5469 movu [r0 + r4], m5
5470
5471 pslldq m1, 2 ; move the next shuffled sample into the top pair of m1
5472 palignr m0, m3, 12 ; slide window by one more pair
5473 palignr m3, m1, 12
5474
5475 mova m4, m3
5476 pmaddwd m4, [r3 + 13 * 16] ; [31] row 4
5477 paddd m4, [pd_16]
5478 psrld m4, 5
5479 mova m2, m0
5480 pmaddwd m2, [r3 + 13 * 16]
5481 paddd m2, [pd_16]
5482 psrld m2, 5
5483 packusdw m4, m2
5484
5485 mova m2, m3
5486 pmaddwd m2, [r3] ; [18] row 5
5487 paddd m2, [pd_16]
5488 psrld m2, 5
5489 mova m5, m0
5490 pmaddwd m5, [r3]
5491 paddd m5, [pd_16]
5492 psrld m5, 5
5493 packusdw m2, m5
5494
5495 mova m6, m3
5496 pmaddwd m6, [r3 - 13 * 16] ; [5] row 6
5497 paddd m6, [pd_16]
5498 psrld m6, 5
5499 mova m5, m0
5500 pmaddwd m5, [r3 - 13 * 16]
5501 paddd m5, [pd_16]
5502 psrld m5, 5
5503 packusdw m6, m5
5504
5505 pslldq m1, 2
5506 palignr m0, m3, 12 ; slide window by one more pair
5507 palignr m3, m1, 12
5508
5509 pmaddwd m3, [r3 + 6 * 16] ; [24] row 7 (window registers consumed in place)
5510 paddd m3, [pd_16]
5511 psrld m3, 5
5512 pmaddwd m0, [r3 + 6 * 16]
5513 paddd m0, [pd_16]
5514 psrld m0, 5
5515 packusdw m3, m0
5516
5517 lea r0, [r0 + r1 * 4] ; advance destination to row 4
5518 movu [r0], m4 ; store rows 4-7
5519 movu [r0 + r1], m2
5520 movu [r0 + r1 * 2], m6
5521 movu [r0 + r4], m3
5522 RET
5523
;-----------------------------------------------------------------------------
; intra_pred_ang8_23 -- 8x8 angular intra prediction, mode 23, 16-bit samples.
; cglobal 3,5,8: 3 arguments, 5 GPRs, 8 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words); a second edge is read from byte offset 32
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; Each row is (a*(32-f) + b*f + pd_16) >> 5 over interleaved (a, b) sample
; pairs held in m3 (low half of the row) and m0 (high half). The bracketed
; number next to each pmaddwd is the fraction f selected from ang_table.
; pd_16 and pw_ang8_13 are defined outside this view.
;-----------------------------------------------------------------------------
5524 cglobal intra_pred_ang8_23, 3,5,8
5525 lea r3, [ang_table + 14 * 16] ; r3 -> weight entry for fraction 14
5526 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
5527
5528 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
5529 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5530
5531 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
5532 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
5533
5534 mova m4, m3
5535 pmaddwd m4, [r3 + 9 * 16] ; [23] row 0
5536 paddd m4, [pd_16]
5537 psrld m4, 5
5538 mova m2, m0
5539 pmaddwd m2, [r3 + 9 * 16]
5540 paddd m2, [pd_16]
5541 psrld m2, 5
5542 packusdw m4, m2
5543
5544 mova m2, m3
5545 pmaddwd m2, [r3] ; [14] row 1
5546 paddd m2, [pd_16]
5547 psrld m2, 5
5548 mova m1, m0
5549 pmaddwd m1, [r3]
5550 paddd m1, [pd_16]
5551 psrld m1, 5
5552 packusdw m2, m1
5553
5554 mova m6, m3
5555 pmaddwd m6, [r3 - 9 * 16] ; [5] row 2
5556 paddd m6, [pd_16]
5557 psrld m6, 5
5558 mova m1, m0
5559 pmaddwd m1, [r3 - 9 * 16]
5560 paddd m1, [pd_16]
5561 psrld m1, 5
5562 packusdw m6, m1
5563
5564 palignr m0, m3, 12 ; slide window one pair: m0 takes the top pair of m3
5565 movu m1, [r2 + 32] ; second reference edge (byte offset 32)
5566 pinsrw m1, [r2], 0 ; word 0 <- first reference sample
5567 pshufb m1, [pw_ang8_13] ; reorder edge samples (mask defined outside this view)
5568 palignr m3, m1, 12 ; m3 takes the top pair of the shuffled edge
5569
5570 mova m5, m3
5571 pmaddwd m5, [r3 + 14 * 16] ; [28] row 3
5572 paddd m5, [pd_16]
5573 psrld m5, 5
5574 mova m7, m0
5575 pmaddwd m7, [r3 + 14 * 16]
5576 paddd m7, [pd_16]
5577 psrld m7, 5
5578 packusdw m5, m7
5579
5580 lea r4, [r1 * 3] ; r4 = 3 * stride
5581 movu [r0], m4 ; store rows 0-3
5582 movu [r0 + r1], m2
5583 movu [r0 + r1 * 2], m6
5584 movu [r0 + r4], m5
5585
5586 mova m4, m3
5587 pmaddwd m4, [r3 + 5 * 16] ; [19] row 4
5588 paddd m4, [pd_16]
5589 psrld m4, 5
5590 mova m2, m0
5591 pmaddwd m2, [r3 + 5 * 16]
5592 paddd m2, [pd_16]
5593 psrld m2, 5
5594 packusdw m4, m2
5595
5596 mova m2, m3
5597 pmaddwd m2, [r3 - 4 * 16] ; [10] row 5
5598 paddd m2, [pd_16]
5599 psrld m2, 5
5600 mova m5, m0
5601 pmaddwd m5, [r3 - 4 * 16]
5602 paddd m5, [pd_16]
5603 psrld m5, 5
5604 packusdw m2, m5
5605
5606 mova m6, m3
5607 pmaddwd m6, [r3 - 13 * 16] ; [1] row 6
5608 paddd m6, [pd_16]
5609 psrld m6, 5
5610 mova m5, m0
5611 pmaddwd m5, [r3 - 13 * 16]
5612 paddd m5, [pd_16]
5613 psrld m5, 5
5614 packusdw m6, m5
5615
5616 pslldq m1, 2 ; move the next shuffled sample into the top pair of m1
5617 palignr m0, m3, 12 ; slide window by one more pair
5618 palignr m3, m1, 12
5619
5620 pmaddwd m3, [r3 + 10 * 16] ; [24] row 7 (window registers consumed in place)
5621 paddd m3, [pd_16]
5622 psrld m3, 5
5623 pmaddwd m0, [r3 + 10 * 16]
5624 paddd m0, [pd_16]
5625 psrld m0, 5
5626 packusdw m3, m0
5627
5628 lea r0, [r0 + r1 * 4] ; advance destination to row 4
5629 movu [r0], m4 ; store rows 4-7
5630 movu [r0 + r1], m2
5631 movu [r0 + r1 * 2], m6
5632 movu [r0 + r4], m3
5633 RET
5634
;-----------------------------------------------------------------------------
; intra_pred_ang8_24 -- 8x8 angular intra prediction, mode 24, 16-bit samples.
; cglobal 3,5,7: 3 arguments, 5 GPRs, 7 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words); a second edge is read from byte offset 32
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; Each row is (a*(32-f) + b*f + pd_16) >> 5 over interleaved (a, b) sample
; pairs held in m3 (low half of the row) and m0 (high half). Rows 0-5 share
; the initial window; rows 6-7 use the window slid by one pair.
; pd_16 and pw_ang8_12 are defined outside this view.
;-----------------------------------------------------------------------------
5635 cglobal intra_pred_ang8_24, 3,5,7
5636 lea r3, [ang_table + 16 * 16] ; r3 -> weight entry for fraction 16
5637 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
5638
5639 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
5640 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5641
5642 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
5643 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
5644
5645 mova m4, m3
5646 pmaddwd m4, [r3 + 11 * 16] ; [27] row 0
5647 paddd m4, [pd_16]
5648 psrld m4, 5
5649 mova m2, m0
5650 pmaddwd m2, [r3 + 11 * 16]
5651 paddd m2, [pd_16]
5652 psrld m2, 5
5653 packusdw m4, m2
5654
5655 mova m2, m3
5656 pmaddwd m2, [r3 + 6 * 16] ; [22] row 1
5657 paddd m2, [pd_16]
5658 psrld m2, 5
5659 mova m1, m0
5660 pmaddwd m1, [r3 + 6 * 16]
5661 paddd m1, [pd_16]
5662 psrld m1, 5
5663 packusdw m2, m1
5664
5665 mova m6, m3
5666 pmaddwd m6, [r3 + 1 * 16] ; [17] row 2
5667 paddd m6, [pd_16]
5668 psrld m6, 5
5669 mova m1, m0
5670 pmaddwd m1, [r3 + 1 * 16]
5671 paddd m1, [pd_16]
5672 psrld m1, 5
5673 packusdw m6, m1
5674
5675 mova m5, m3
5676 pmaddwd m5, [r3 - 4 * 16] ; [12] row 3
5677 paddd m5, [pd_16]
5678 psrld m5, 5
5679 mova m1, m0
5680 pmaddwd m1, [r3 - 4 * 16]
5681 paddd m1, [pd_16]
5682 psrld m1, 5
5683 packusdw m5, m1
5684
5685 lea r4, [r1 * 3] ; r4 = 3 * stride
5686 movu [r0], m4 ; store rows 0-3
5687 movu [r0 + r1], m2
5688 movu [r0 + r1 * 2], m6
5689 movu [r0 + r4], m5
5690
5691 mova m4, m3
5692 pmaddwd m4, [r3 - 9 * 16] ; [7] row 4
5693 paddd m4, [pd_16]
5694 psrld m4, 5
5695 mova m2, m0
5696 pmaddwd m2, [r3 - 9 * 16]
5697 paddd m2, [pd_16]
5698 psrld m2, 5
5699 packusdw m4, m2
5700
5701 mova m2, m3
5702 pmaddwd m2, [r3 - 14 * 16] ; [2] row 5
5703 paddd m2, [pd_16]
5704 psrld m2, 5
5705 mova m1, m0
5706 pmaddwd m1, [r3 - 14 * 16]
5707 paddd m1, [pd_16]
5708 psrld m1, 5
5709 packusdw m2, m1
5710
5711 palignr m0, m3, 12 ; slide window one pair: m0 takes the top pair of m3
5712 movu m1, [r2 + 32] ; second reference edge (byte offset 32)
5713 pinsrw m1, [r2], 0 ; word 0 <- first reference sample
5714 pshufb m1, [pw_ang8_12] ; reorder edge samples (mask defined outside this view)
5715 palignr m3, m1, 12 ; m3 takes the top pair of the shuffled edge
5716
5717 mova m6, m3
5718 pmaddwd m6, [r3 + 13 * 16] ; [29] row 6
5719 paddd m6, [pd_16]
5720 psrld m6, 5
5721 mova m5, m0
5722 pmaddwd m5, [r3 + 13 * 16]
5723 paddd m5, [pd_16]
5724 psrld m5, 5
5725 packusdw m6, m5
5726
5727 pmaddwd m3, [r3 + 8 * 16] ; [24] row 7 (window registers consumed in place)
5728 paddd m3, [pd_16]
5729 psrld m3, 5
5730 pmaddwd m0, [r3 + 8 * 16]
5731 paddd m0, [pd_16]
5732 psrld m0, 5
5733 packusdw m3, m0
5734
5735 lea r0, [r0 + r1 * 4] ; advance destination to row 4
5736 movu [r0], m4 ; store rows 4-7
5737 movu [r0 + r1], m2
5738 movu [r0 + r1 * 2], m6
5739 movu [r0 + r4], m3
5740 RET
5741
;-----------------------------------------------------------------------------
; intra_pred_ang8_25 -- 8x8 angular intra prediction, mode 25, 16-bit samples.
; cglobal 3,5,7: 3 arguments, 5 GPRs, 7 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words)
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; All eight rows use the same interleaved sample window (m3 = low half,
; m0 = high half); only the fraction f changes: 30, 28, ... , 16.
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
5742 cglobal intra_pred_ang8_25, 3,5,7
5743 lea r3, [ang_table + 23 * 16] ; r3 -> weight entry for fraction 23
5744 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
5745
5746 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
5747 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5748
5749 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
5750 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
5751
5752 mova m4, m3
5753 pmaddwd m4, [r3 + 7 * 16] ; [30] row 0
5754 paddd m4, [pd_16]
5755 psrld m4, 5
5756 mova m2, m0
5757 pmaddwd m2, [r3 + 7 * 16]
5758 paddd m2, [pd_16]
5759 psrld m2, 5
5760 packusdw m4, m2
5761
5762 mova m2, m3
5763 pmaddwd m2, [r3 + 5 * 16] ; [28] row 1
5764 paddd m2, [pd_16]
5765 psrld m2, 5
5766 mova m1, m0
5767 pmaddwd m1, [r3 + 5 * 16]
5768 paddd m1, [pd_16]
5769 psrld m1, 5
5770 packusdw m2, m1
5771
5772 mova m6, m3
5773 pmaddwd m6, [r3 + 3 * 16] ; [26] row 2
5774 paddd m6, [pd_16]
5775 psrld m6, 5
5776 mova m1, m0
5777 pmaddwd m1, [r3 + 3 * 16]
5778 paddd m1, [pd_16]
5779 psrld m1, 5
5780 packusdw m6, m1
5781
5782 mova m5, m3
5783 pmaddwd m5, [r3 + 1 * 16] ; [24] row 3
5784 paddd m5, [pd_16]
5785 psrld m5, 5
5786 mova m1, m0
5787 pmaddwd m1, [r3 + 1 * 16]
5788 paddd m1, [pd_16]
5789 psrld m1, 5
5790 packusdw m5, m1
5791
5792 lea r4, [r1 * 3] ; r4 = 3 * stride
5793 movu [r0], m4 ; store rows 0-3
5794 movu [r0 + r1], m2
5795 movu [r0 + r1 * 2], m6
5796 movu [r0 + r4], m5
5797
5798 mova m4, m3
5799 pmaddwd m4, [r3 - 1 * 16] ; [22] row 4
5800 paddd m4, [pd_16]
5801 psrld m4, 5
5802 mova m2, m0
5803 pmaddwd m2, [r3 - 1 * 16]
5804 paddd m2, [pd_16]
5805 psrld m2, 5
5806 packusdw m4, m2
5807
5808 mova m2, m3
5809 pmaddwd m2, [r3 - 3 * 16] ; [20] row 5
5810 paddd m2, [pd_16]
5811 psrld m2, 5
5812 mova m1, m0
5813 pmaddwd m1, [r3 - 3 * 16]
5814 paddd m1, [pd_16]
5815 psrld m1, 5
5816 packusdw m2, m1
5817
5818 mova m6, m3
5819 pmaddwd m6, [r3 - 5 * 16] ; [18] row 6
5820 paddd m6, [pd_16]
5821 psrld m6, 5
5822 mova m5, m0
5823 pmaddwd m5, [r3 - 5 * 16]
5824 paddd m5, [pd_16]
5825 psrld m5, 5
5826 packusdw m6, m5
5827
5828 pmaddwd m3, [r3 - 7 * 16] ; [16] row 7 (window registers consumed in place)
5829 paddd m3, [pd_16]
5830 psrld m3, 5
5831 pmaddwd m0, [r3 - 7 * 16]
5832 paddd m0, [pd_16]
5833 psrld m0, 5
5834 packusdw m3, m0
5835
5836 lea r0, [r0 + r1 * 4] ; advance destination to row 4
5837 movu [r0], m4 ; store rows 4-7
5838 movu [r0 + r1], m2
5839 movu [r0 + r1 * 2], m6
5840 movu [r0 + r4], m3
5841 RET
5842
;-----------------------------------------------------------------------------
; intra_pred_ang8_26 -- 8x8 intra prediction, mode 26, 16-bit samples.
; cglobal 3,6,3: 3 arguments, 6 GPRs, 3 vector registers (x86inc convention).
;   r0 = destination, r1 = row stride (doubled on entry, presumably
;        samples -> bytes, TODO confirm), r2 = reference samples (words)
;   r3 = address of row 4, r5 = 3 * stride
;   r4m = fifth argument read from its home location; non-zero enables the
;         edge filter below
; The eight samples at [r2 + 2] are copied unchanged into all eight rows.
; When the filter flag is set, the first sample of every row i is replaced by
;   clamp(ref[1] + ((side[i] - ref[0]) >> 1), 0, pw_pixel_max)
; where side[] is the word vector at [r2 + 2 + 32]. pb_01 and pw_pixel_max are
; defined outside this view; pb_01 presumably broadcasts word 0.
;-----------------------------------------------------------------------------
5843 cglobal intra_pred_ang8_26, 3,6,3
5844 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5845 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
5846 lea r5, [r1 * 3] ; r5 = 3 * stride
5847
5848 movu [r0], m0 ; rows 0-3: plain copy of the reference row
5849 movu [r0 + r1], m0
5850 movu [r0 + r1 * 2], m0
5851 movu [r0 + r5], m0
5852
5853 lea r3, [r0 + r1 *4] ; r3 -> row 4
5854 movu [r3], m0 ; rows 4-7: plain copy of the reference row
5855 movu [r3 + r1], m0
5856 movu [r3 + r1 * 2], m0
5857 movu [r3 + r5], m0
5858
5859 cmp r4m, byte 0 ; filter flag (fifth argument)
5860 jz .quit ; zero -> unfiltered block is already complete
5861
5862 ; filter
5863 pshufb m0, [pb_01] ; presumably broadcasts word 0 (sample 1) to all lanes
5864 pinsrw m1, [r2], 0 ; word 0 <- sample 0; other lanes of m1 are not used
5865 pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
5866 movu m1, [r2 + 2 + 32] ; [8 7 6 5 4 3 2 1]
5867 psubw m1, m2 ; side[i] - sample 0
5868 psraw m1, 1 ; arithmetic halving keeps the sign of the difference
5869 paddw m0, m1
5870 pxor m1, m1
5871 pmaxsw m0, m1 ; clamp below at 0
5872 pminsw m0, [pw_pixel_max] ; clamp above at pw_pixel_max
5873 pextrw [r0], m0, 0 ; overwrite the first sample of each row
5874 pextrw [r0 + r1], m0, 1
5875 pextrw [r0 + r1 * 2], m0, 2
5876 pextrw [r0 + r5], m0, 3
5877 pextrw [r3], m0, 4
5878 pextrw [r3 + r1], m0, 5
5879 pextrw [r3 + r1 * 2], m0, 6
5880 pextrw [r3 + r5], m0, 7
5881 .quit:
5882 RET
5883
;-----------------------------------------------------------------------------
; intra_pred_ang8_27 -- 8x8 angular intra prediction, mode 27, 16-bit samples.
; cglobal 3,5,7: 3 arguments, 5 GPRs, 7 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words), read starting at byte offset 2
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; All eight rows use the same interleaved sample window (m3 = low half,
; m0 = high half); only the fraction f changes: 2, 4, ... , 16.
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
5884 cglobal intra_pred_ang8_27, 3,5,7
5885 lea r3, [ang_table + 9 * 16] ; r3 -> weight entry for fraction 9
5886 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
5887
5888 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5889 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
5890
5891 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
5892 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
5893
5894 mova m4, m3
5895 pmaddwd m4, [r3 - 7 * 16] ; [2] row 0
5896 paddd m4, [pd_16]
5897 psrld m4, 5
5898 mova m2, m0
5899 pmaddwd m2, [r3 - 7 * 16]
5900 paddd m2, [pd_16]
5901 psrld m2, 5
5902 packusdw m4, m2
5903
5904 mova m2, m3
5905 pmaddwd m2, [r3 - 5 * 16] ; [4] row 1
5906 paddd m2, [pd_16]
5907 psrld m2, 5
5908 mova m1, m0
5909 pmaddwd m1, [r3 - 5 * 16]
5910 paddd m1, [pd_16]
5911 psrld m1, 5
5912 packusdw m2, m1
5913
5914 mova m6, m3
5915 pmaddwd m6, [r3 - 3 * 16] ; [6] row 2
5916 paddd m6, [pd_16]
5917 psrld m6, 5
5918 mova m1, m0
5919 pmaddwd m1, [r3 - 3 * 16]
5920 paddd m1, [pd_16]
5921 psrld m1, 5
5922 packusdw m6, m1
5923
5924 mova m5, m3
5925 pmaddwd m5, [r3 - 1 * 16] ; [8] row 3
5926 paddd m5, [pd_16]
5927 psrld m5, 5
5928 mova m1, m0
5929 pmaddwd m1, [r3 - 1 * 16]
5930 paddd m1, [pd_16]
5931 psrld m1, 5
5932 packusdw m5, m1
5933
5934 lea r4, [r1 * 3] ; r4 = 3 * stride
5935 movu [r0], m4 ; store rows 0-3
5936 movu [r0 + r1], m2
5937 movu [r0 + r1 * 2], m6
5938 movu [r0 + r4], m5
5939
5940 mova m4, m3
5941 pmaddwd m4, [r3 + 1 * 16] ; [10] row 4
5942 paddd m4, [pd_16]
5943 psrld m4, 5
5944 mova m2, m0
5945 pmaddwd m2, [r3 + 1 * 16]
5946 paddd m2, [pd_16]
5947 psrld m2, 5
5948 packusdw m4, m2
5949
5950 mova m2, m3
5951 pmaddwd m2, [r3 + 3 * 16] ; [12] row 5
5952 paddd m2, [pd_16]
5953 psrld m2, 5
5954 mova m1, m0
5955 pmaddwd m1, [r3 + 3 * 16]
5956 paddd m1, [pd_16]
5957 psrld m1, 5
5958 packusdw m2, m1
5959
5960 mova m6, m3
5961 pmaddwd m6, [r3 + 5 * 16] ; [14] row 6
5962 paddd m6, [pd_16]
5963 psrld m6, 5
5964 mova m5, m0
5965 pmaddwd m5, [r3 + 5 * 16]
5966 paddd m5, [pd_16]
5967 psrld m5, 5
5968 packusdw m6, m5
5969
5970 pmaddwd m3, [r3 + 7 * 16] ; [16] row 7 (window registers consumed in place)
5971 paddd m3, [pd_16]
5972 psrld m3, 5
5973 pmaddwd m0, [r3 + 7 * 16]
5974 paddd m0, [pd_16]
5975 psrld m0, 5
5976 packusdw m3, m0
5977
5978 lea r0, [r0 + r1 * 4] ; advance destination to row 4
5979 movu [r0], m4 ; store rows 4-7
5980 movu [r0 + r1], m2
5981 movu [r0 + r1 * 2], m6
5982 movu [r0 + r4], m3
5983 RET
5984
;-----------------------------------------------------------------------------
; intra_pred_ang8_28 -- 8x8 angular intra prediction, mode 28, 16-bit samples.
; cglobal 3,5,7: 3 arguments, 5 GPRs, 7 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words), read starting at byte offset 2
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; Rows 0-5 use the initial interleaved window (m3 = low half, m0 = high half);
; rows 6-7 use the window advanced by one sample (palignr by 4 bytes).
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
5985 cglobal intra_pred_ang8_28, 3,5,7
5986 lea r3, [ang_table + 17 * 16] ; r3 -> weight entry for fraction 17
5987 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
5988
5989 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5990 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
5991
5992 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
5993 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
5994
5995 mova m4, m3
5996 pmaddwd m4, [r3 - 12 * 16] ; [5] row 0
5997 paddd m4, [pd_16]
5998 psrld m4, 5
5999 mova m2, m0
6000 pmaddwd m2, [r3 - 12 * 16]
6001 paddd m2, [pd_16]
6002 psrld m2, 5
6003 packusdw m4, m2
6004
6005 mova m2, m3
6006 pmaddwd m2, [r3 - 7 * 16] ; [10] row 1
6007 paddd m2, [pd_16]
6008 psrld m2, 5
6009 mova m1, m0
6010 pmaddwd m1, [r3 - 7 * 16]
6011 paddd m1, [pd_16]
6012 psrld m1, 5
6013 packusdw m2, m1
6014
6015 mova m6, m3
6016 pmaddwd m6, [r3 - 2 * 16] ; [15] row 2
6017 paddd m6, [pd_16]
6018 psrld m6, 5
6019 mova m1, m0
6020 pmaddwd m1, [r3 - 2 * 16]
6021 paddd m1, [pd_16]
6022 psrld m1, 5
6023 packusdw m6, m1
6024
6025 mova m5, m3
6026 pmaddwd m5, [r3 + 3 * 16] ; [20] row 3
6027 paddd m5, [pd_16]
6028 psrld m5, 5
6029 mova m1, m0
6030 pmaddwd m1, [r3 + 3 * 16]
6031 paddd m1, [pd_16]
6032 psrld m1, 5
6033 packusdw m5, m1
6034
6035 lea r4, [r1 * 3] ; r4 = 3 * stride
6036 movu [r0], m4 ; store rows 0-3
6037 movu [r0 + r1], m2
6038 movu [r0 + r1 * 2], m6
6039 movu [r0 + r4], m5
6040
6041 mova m4, m3
6042 pmaddwd m4, [r3 + 8 * 16] ; [25] row 4
6043 paddd m4, [pd_16]
6044 psrld m4, 5
6045 mova m2, m0
6046 pmaddwd m2, [r3 + 8 * 16]
6047 paddd m2, [pd_16]
6048 psrld m2, 5
6049 packusdw m4, m2
6050
6051 mova m2, m3
6052 pmaddwd m2, [r3 + 13 * 16] ; [30] row 5
6053 paddd m2, [pd_16]
6054 psrld m2, 5
6055 mova m1, m0
6056 pmaddwd m1, [r3 + 13 * 16]
6057 paddd m1, [pd_16]
6058 psrld m1, 5
6059 packusdw m2, m1
6060
6061 movh m1, [r2 + 18] ; [12 11 10 9] (movh loads only the low 64 bits)
6062
6063 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6064 mova m5, m6 ; keep the shifted low half for row 7
6065 pmaddwd m6, [r3 - 14 * 16] ; [3] row 6
6066 paddd m6, [pd_16]
6067 psrld m6, 5
6068 palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6]
6069 mova m3, m1 ; keep the shifted high half for row 7
6070 pmaddwd m1, [r3 - 14 * 16]
6071 paddd m1, [pd_16]
6072 psrld m1, 5
6073 packusdw m6, m1
6074
6075 pmaddwd m5, [r3 - 9 * 16] ; [8] row 7
6076 paddd m5, [pd_16]
6077 psrld m5, 5
6078 pmaddwd m3, [r3 - 9 * 16]
6079 paddd m3, [pd_16]
6080 psrld m3, 5
6081 packusdw m5, m3
6082
6083 lea r0, [r0 + r1 * 4] ; advance destination to row 4
6084 movu [r0], m4 ; store rows 4-7
6085 movu [r0 + r1], m2
6086 movu [r0 + r1 * 2], m6
6087 movu [r0 + r4], m5
6088 RET
6089
;-----------------------------------------------------------------------------
; intra_pred_ang8_29 -- 8x8 angular intra prediction, mode 29, 16-bit samples.
; cglobal 3,5,8: 3 arguments, 5 GPRs, 8 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words), read starting at byte offset 2
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; m3, m0 and m5 hold consecutive interleaved sample pairs; palignr over them
; builds the window for each row, advancing by one sample per 4 bytes.
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
6090 cglobal intra_pred_ang8_29, 3,5,8
6091 lea r3, [ang_table + 18 * 16] ; r3 -> weight entry for fraction 18
6092 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
6093
6094 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6095 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6096 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6097 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6098
6099 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6100 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6101 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6102
6103 mova m4, m3
6104 pmaddwd m4, [r3 - 9 * 16] ; [9] row 0
6105 paddd m4, [pd_16]
6106 psrld m4, 5
6107 mova m2, m0
6108 pmaddwd m2, [r3 - 9 * 16]
6109 paddd m2, [pd_16]
6110 psrld m2, 5
6111 packusdw m4, m2
6112
6113 mova m2, m3
6114 pmaddwd m2, [r3] ; [18] row 1
6115 paddd m2, [pd_16]
6116 psrld m2, 5
6117 mova m1, m0
6118 pmaddwd m1, [r3]
6119 paddd m1, [pd_16]
6120 psrld m1, 5
6121 packusdw m2, m1
6122
6123 mova m6, m3
6124 pmaddwd m6, [r3 + 9 * 16] ; [27] row 2
6125 paddd m6, [pd_16]
6126 psrld m6, 5
6127 mova m1, m0
6128 pmaddwd m1, [r3 + 9 * 16]
6129 paddd m1, [pd_16]
6130 psrld m1, 5
6131 packusdw m6, m1
6132
6133 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6134 pmaddwd m7, [r3 - 14 * 16] ; [4] row 3
6135 paddd m7, [pd_16]
6136 psrld m7, 5
6137 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6138 pmaddwd m1, [r3 - 14 * 16]
6139 paddd m1, [pd_16]
6140 psrld m1, 5
6141 packusdw m7, m1
6142
6143 lea r4, [r1 * 3] ; r4 = 3 * stride
6144 movu [r0], m4 ; store rows 0-3
6145 movu [r0 + r1], m2
6146 movu [r0 + r1 * 2], m6
6147 movu [r0 + r4], m7
6148
6149 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6150 mova m6, m4 ; shared low half for rows 5 and 6
6151 pmaddwd m4, [r3 - 5 * 16] ; [13] row 4
6152 paddd m4, [pd_16]
6153 psrld m4, 5
6154 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6155 mova m7, m2 ; shared high half for rows 5 and 6
6156 pmaddwd m2, [r3 - 5 * 16]
6157 paddd m2, [pd_16]
6158 psrld m2, 5
6159 packusdw m4, m2
6160
6161 pmaddwd m2, m6, [r3 + 4 * 16] ; [22] row 5
6162 paddd m2, [pd_16]
6163 psrld m2, 5
6164 pmaddwd m1, m7, [r3 + 4 * 16]
6165 paddd m1, [pd_16]
6166 psrld m1, 5
6167 packusdw m2, m1
6168
6169 pmaddwd m6, [r3 + 13 * 16] ; [31] row 6
6170 paddd m6, [pd_16]
6171 psrld m6, 5
6172 pmaddwd m7, [r3 + 13 * 16]
6173 paddd m7, [pd_16]
6174 psrld m7, 5
6175 packusdw m6, m7
6176
6177 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6178 pmaddwd m7, [r3 - 10 * 16] ; [8] row 7
6179 paddd m7, [pd_16]
6180 psrld m7, 5
6181 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6182 pmaddwd m5, [r3 - 10 * 16]
6183 paddd m5, [pd_16]
6184 psrld m5, 5
6185 packusdw m7, m5
6186
6187 lea r0, [r0 + r1 * 4] ; advance destination to row 4
6188 movu [r0], m4 ; store rows 4-7
6189 movu [r0 + r1], m2
6190 movu [r0 + r1 * 2], m6
6191 movu [r0 + r4], m7
6192 RET
6193
;-----------------------------------------------------------------------------
; intra_pred_ang8_30 -- 8x8 angular intra prediction, mode 30, 16-bit samples.
; cglobal 3,5,8: 3 arguments, 5 GPRs, 8 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words), read starting at byte offset 2
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; m3, m0 and m5 hold consecutive interleaved sample pairs; palignr over them
; builds the window for each row, advancing by one sample per 4 bytes.
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
6194 cglobal intra_pred_ang8_30, 3,5,8
6195 lea r3, [ang_table + 14 * 16] ; r3 -> weight entry for fraction 14
6196 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
6197
6198 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6199 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6200 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6201 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6202
6203 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6204 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6205 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6206
6207 mova m4, m3
6208 pmaddwd m4, [r3 - 1 * 16] ; [13] row 0
6209 paddd m4, [pd_16]
6210 psrld m4, 5
6211 mova m2, m0
6212 pmaddwd m2, [r3 - 1 * 16]
6213 paddd m2, [pd_16]
6214 psrld m2, 5
6215 packusdw m4, m2
6216
6217 mova m2, m3
6218 pmaddwd m2, [r3 + 12 * 16] ; [26] row 1
6219 paddd m2, [pd_16]
6220 psrld m2, 5
6221 mova m1, m0
6222 pmaddwd m1, [r3 + 12 * 16]
6223 paddd m1, [pd_16]
6224 psrld m1, 5
6225 packusdw m2, m1
6226
6227 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6228 mova m7, m6 ; same low half reused for row 3
6229 pmaddwd m6, [r3 - 7 * 16] ; [7] row 2
6230 paddd m6, [pd_16]
6231 psrld m6, 5
6232 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6233 pmaddwd m1, [r3 - 7 * 16]
6234 paddd m1, [pd_16]
6235 psrld m1, 5
6236 packusdw m6, m1
6237
6238 pmaddwd m7, [r3 + 6 * 16] ; [20] row 3
6239 paddd m7, [pd_16]
6240 psrld m7, 5
6241 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6242 pmaddwd m1, [r3 + 6 * 16]
6243 paddd m1, [pd_16]
6244 psrld m1, 5
6245 packusdw m7, m1
6246
6247 lea r4, [r1 * 3] ; r4 = 3 * stride
6248 movu [r0], m4 ; store rows 0-3
6249 movu [r0 + r1], m2
6250 movu [r0 + r1 * 2], m6
6251 movu [r0 + r4], m7
6252
6253 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6254 mova m6, m4 ; shared low half for rows 5 and 6
6255 pmaddwd m4, [r3 - 13 * 16] ; [1] row 4
6256 paddd m4, [pd_16]
6257 psrld m4, 5
6258 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6259 mova m7, m2 ; shared high half for rows 5 and 6
6260 pmaddwd m2, [r3 - 13 * 16]
6261 paddd m2, [pd_16]
6262 psrld m2, 5
6263 packusdw m4, m2
6264
6265 pmaddwd m2, m6, [r3] ; [14] row 5
6266 paddd m2, [pd_16]
6267 psrld m2, 5
6268 pmaddwd m1, m7, [r3]
6269 paddd m1, [pd_16]
6270 psrld m1, 5
6271 packusdw m2, m1
6272
6273 pmaddwd m6, [r3 + 13 * 16] ; [27] row 6
6274 paddd m6, [pd_16]
6275 psrld m6, 5
6276 pmaddwd m7, [r3 + 13 * 16]
6277 paddd m7, [pd_16]
6278 psrld m7, 5
6279 packusdw m6, m7
6280
6281 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6282 pmaddwd m7, [r3 - 6 * 16] ; [8] row 7
6283 paddd m7, [pd_16]
6284 psrld m7, 5
6285 palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6286 pmaddwd m5, [r3 - 6 * 16]
6287 paddd m5, [pd_16]
6288 psrld m5, 5
6289 packusdw m7, m5
6290
6291 lea r0, [r0 + r1 * 4] ; advance destination to row 4
6292 movu [r0], m4 ; store rows 4-7
6293 movu [r0 + r1], m2
6294 movu [r0 + r1 * 2], m6
6295 movu [r0 + r4], m7
6296 RET
6297
;-----------------------------------------------------------------------------
; intra_pred_ang8_31 -- 8x8 angular intra prediction, mode 31, 16-bit samples.
; cglobal 3,5,8: 3 arguments, 5 GPRs, 8 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words), read starting at byte offset 2
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; m3, m0 and m5 hold consecutive interleaved sample pairs; palignr over them
; builds the window for each row, advancing by one sample per 4 bytes.
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
6298 cglobal intra_pred_ang8_31, 3,5,8
6299 lea r3, [ang_table + 13 * 16] ; r3 -> weight entry for fraction 13
6300 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
6301
6302 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6303 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6304 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6305 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6306
6307 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6308 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6309 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6310
6311 mova m4, m3
6312 pmaddwd m4, [r3 + 4 * 16] ; [17] row 0
6313 paddd m4, [pd_16]
6314 psrld m4, 5
6315 mova m2, m0
6316 pmaddwd m2, [r3 + 4 * 16]
6317 paddd m2, [pd_16]
6318 psrld m2, 5
6319 packusdw m4, m2
6320
6321 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6322 mova m6, m2 ; same low half reused for row 2
6323 pmaddwd m2, [r3 - 11 * 16] ; [2] row 1
6324 paddd m2, [pd_16]
6325 psrld m2, 5
6326 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6327 mova m7, m1 ; same high half reused for row 2
6328 pmaddwd m1, [r3 - 11 * 16]
6329 paddd m1, [pd_16]
6330 psrld m1, 5
6331 packusdw m2, m1
6332
6333 pmaddwd m6, [r3 + 6 * 16] ; [19] row 2
6334 paddd m6, [pd_16]
6335 psrld m6, 5
6336 pmaddwd m7, [r3 + 6 * 16]
6337 paddd m7, [pd_16]
6338 psrld m7, 5
6339 packusdw m6, m7
6340
6341 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6342 pmaddwd m7, [r3 - 9 * 16] ; [4] row 3
6343 paddd m7, [pd_16]
6344 psrld m7, 5
6345 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6346 pmaddwd m1, [r3 - 9 * 16]
6347 paddd m1, [pd_16]
6348 psrld m1, 5
6349 packusdw m7, m1
6350
6351 lea r4, [r1 * 3] ; r4 = 3 * stride
6352 movu [r0], m4 ; store rows 0-3
6353 movu [r0 + r1], m2
6354 movu [r0 + r1 * 2], m6
6355 movu [r0 + r4], m7
6356
6357 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6358 pmaddwd m4, [r3 + 8 * 16] ; [21] row 4
6359 paddd m4, [pd_16]
6360 psrld m4, 5
6361 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6362 pmaddwd m2, [r3 + 8 * 16]
6363 paddd m2, [pd_16]
6364 psrld m2, 5
6365 packusdw m4, m2
6366
6367 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6368 mova m6, m2 ; same low half reused for row 6
6369 pmaddwd m2, [r3 - 7 * 16] ; [6] row 5
6370 paddd m2, [pd_16]
6371 psrld m2, 5
6372 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6373 mova m7, m1 ; same high half reused for row 6
6374 pmaddwd m1, [r3 - 7 * 16]
6375 paddd m1, [pd_16]
6376 psrld m1, 5
6377 packusdw m2, m1
6378
6379 pmaddwd m6, [r3 + 10 * 16] ; [23] row 6
6380 paddd m6, [pd_16]
6381 psrld m6, 5
6382 pmaddwd m7, [r3 + 10 * 16]
6383 paddd m7, [pd_16]
6384 psrld m7, 5
6385 packusdw m6, m7
6386
6387 mova m7, m0
6388 pmaddwd m7, [r3 - 5 * 16] ; [8] row 7
6389 paddd m7, [pd_16]
6390 psrld m7, 5
6391 mova m1, m5
6392 pmaddwd m1, [r3 - 5 * 16]
6393 paddd m1, [pd_16]
6394 psrld m1, 5
6395 packusdw m7, m1
6396
6397 lea r0, [r0 + r1 * 4] ; advance destination to row 4
6398 movu [r0], m4 ; store rows 4-7
6399 movu [r0 + r1], m2
6400 movu [r0 + r1 * 2], m6
6401 movu [r0 + r4], m7
6402 RET
6403
;-----------------------------------------------------------------------------
; intra_pred_ang8_32 -- 8x8 angular intra prediction, mode 32, 16-bit samples.
; cglobal 3,5,8: 3 arguments, 5 GPRs, 8 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words), read starting at byte offset 2
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; m3, m0 and m5 hold consecutive interleaved sample pairs; palignr over them
; builds the window for each row, advancing by one sample per 4 bytes. The
; last row needs samples beyond m5, fetched with a 64-bit load from r2 + 26.
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
6404 cglobal intra_pred_ang8_32, 3,5,8
6405 lea r3, [ang_table + 19 * 16] ; r3 -> weight entry for fraction 19
6406 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
6407
6408 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6409 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6410 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6411 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6412
6413 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6414 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6415 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6416
6417 mova m4, m3
6418 pmaddwd m4, [r3 + 2 * 16] ; [21] row 0
6419 paddd m4, [pd_16]
6420 psrld m4, 5
6421 mova m2, m0
6422 pmaddwd m2, [r3 + 2 * 16]
6423 paddd m2, [pd_16]
6424 psrld m2, 5
6425 packusdw m4, m2
6426
6427 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6428 mova m6, m2 ; same low half reused for row 2
6429 pmaddwd m2, [r3 - 9 * 16] ; [10] row 1
6430 paddd m2, [pd_16]
6431 psrld m2, 5
6432 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6433 mova m7, m1 ; same high half reused for row 2
6434 pmaddwd m1, [r3 - 9 * 16]
6435 paddd m1, [pd_16]
6436 psrld m1, 5
6437 packusdw m2, m1
6438
6439 pmaddwd m6, [r3 + 12 * 16] ; [31] row 2
6440 paddd m6, [pd_16]
6441 psrld m6, 5
6442 pmaddwd m7, [r3 + 12 * 16]
6443 paddd m7, [pd_16]
6444 psrld m7, 5
6445 packusdw m6, m7
6446
6447 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6448 pmaddwd m7, [r3 + 1 * 16] ; [20] row 3
6449 paddd m7, [pd_16]
6450 psrld m7, 5
6451 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6452 pmaddwd m1, [r3 + 1 * 16]
6453 paddd m1, [pd_16]
6454 psrld m1, 5
6455 packusdw m7, m1
6456
6457 lea r4, [r1 * 3] ; r4 = 3 * stride
6458 movu [r0], m4 ; store rows 0-3
6459 movu [r0 + r1], m2
6460 movu [r0 + r1 * 2], m6
6461 movu [r0 + r4], m7
6462
6463 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6464 mova m2, m4 ; same low half reused for row 5
6465 pmaddwd m4, [r3 - 10 * 16] ; [ 9] row 4
6466 paddd m4, [pd_16]
6467 psrld m4, 5
6468 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6469 mova m6, m3 ; same high half reused for row 5
6470 pmaddwd m3, [r3 - 10 * 16]
6471 paddd m3, [pd_16]
6472 psrld m3, 5
6473 packusdw m4, m3
6474
6475 pmaddwd m2, [r3 + 11 * 16] ; [30] row 5
6476 paddd m2, [pd_16]
6477 psrld m2, 5
6478 pmaddwd m6, [r3 + 11 * 16]
6479 paddd m6, [pd_16]
6480 psrld m6, 5
6481 packusdw m2, m6
6482
6483 mova m6, m0
6484 pmaddwd m6, [r3] ; [19] row 6
6485 paddd m6, [pd_16]
6486 psrld m6, 5
6487 mova m7, m5
6488 pmaddwd m7, [r3]
6489 paddd m7, [pd_16]
6490 psrld m7, 5
6491 packusdw m6, m7
6492
6493 movh m1, [r2 + 26] ; [16 15 14 13]
6494 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6495 pmaddwd m7, [r3 - 11 * 16] ; [8] row 7
6496 paddd m7, [pd_16]
6497 psrld m7, 5
6498 palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10]
6499 pmaddwd m1, [r3 - 11 * 16]
6500 paddd m1, [pd_16]
6501 psrld m1, 5
6502 packusdw m7, m1
6503
6504 lea r0, [r0 + r1 * 4] ; advance destination to row 4
6505 movu [r0], m4 ; store rows 4-7
6506 movu [r0 + r1], m2
6507 movu [r0 + r1 * 2], m6
6508 movu [r0 + r4], m7
6509 RET
6510
;-----------------------------------------------------------------------------
; intra_pred_ang8_33 -- 8x8 angular intra prediction, mode 33, 16-bit samples.
; cglobal 3,5,8: 3 arguments, 5 GPRs, 8 vector registers (x86inc convention).
;   r0 = destination; one 16-byte row is stored per output line
;   r1 = row stride; doubled on entry (presumably samples -> bytes, TODO confirm)
;   r2 = reference samples (words), read starting at byte offset 2
;   r3 = pointer into ang_table; 16-byte entry f holds (32-f, f) word pairs
;   r4 = 3 * stride
; m3, m0, m5 and m1 hold consecutive interleaved sample pairs; palignr over
; them builds the window for each row, advancing by one sample per 4 bytes.
; Each row is (a*(32-f) + b*f + pd_16) >> 5; pd_16 is defined outside this view.
;-----------------------------------------------------------------------------
6511 cglobal intra_pred_ang8_33, 3,5,8
6512 lea r3, [ang_table + 14 * 16] ; r3 -> weight entry for fraction 14
6513 add r1, r1 ; stride *= 2 (presumably samples -> bytes)
6514
6515 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6516 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6517 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6518 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6519
6520 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6521 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6522 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6523 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
6524
6525 mova m4, m3
6526 pmaddwd m4, [r3 + 12 * 16] ; [26] row 0
6527 paddd m4, [pd_16]
6528 psrld m4, 5
6529 mova m2, m0
6530 pmaddwd m2, [r3 + 12 * 16]
6531 paddd m2, [pd_16]
6532 psrld m2, 5
6533 packusdw m4, m2
6534
6535 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6536 pmaddwd m2, [r3 + 6 * 16] ; [20] row 1
6537 paddd m2, [pd_16]
6538 psrld m2, 5
6539 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6540 pmaddwd m6, [r3 + 6 * 16]
6541 paddd m6, [pd_16]
6542 psrld m6, 5
6543 packusdw m2, m6
6544
6545 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6546 pmaddwd m6, [r3] ; [14] row 2
6547 paddd m6, [pd_16]
6548 psrld m6, 5
6549 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6550 pmaddwd m7, [r3]
6551 paddd m7, [pd_16]
6552 psrld m7, 5
6553 packusdw m6, m7
6554
6555 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6556 pmaddwd m7, [r3 - 6 * 16] ; [ 8] row 3
6557 paddd m7, [pd_16]
6558 psrld m7, 5
6559 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6560 pmaddwd m3, [r3 - 6 * 16]
6561 paddd m3, [pd_16]
6562 psrld m3, 5
6563 packusdw m7, m3
6564
6565 lea r4, [r1 * 3] ; r4 = 3 * stride
6566 movu [r0], m4 ; store rows 0-3
6567 movu [r0 + r1], m2
6568 movu [r0 + r1 * 2], m6
6569 movu [r0 + r4], m7
6570
6571 mova m4, m0
6572 pmaddwd m4, [r3 - 12 * 16] ; [ 2] row 4
6573 paddd m4, [pd_16]
6574 psrld m4, 5
6575 mova m2, m5
6576 pmaddwd m2, [r3 - 12 * 16]
6577 paddd m2, [pd_16]
6578 psrld m2, 5
6579 packusdw m4, m2
6580
6581 mova m2, m0
6582 pmaddwd m2, [r3 + 14 * 16] ; [28] row 5
6583 paddd m2, [pd_16]
6584 psrld m2, 5
6585 mova m6, m5
6586 pmaddwd m6, [r3 + 14 * 16]
6587 paddd m6, [pd_16]
6588 psrld m6, 5
6589 packusdw m2, m6
6590
6591 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6592 pmaddwd m6, [r3 + 8 * 16] ; [22] row 6
6593 paddd m6, [pd_16]
6594 psrld m6, 5
6595 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
6596 pmaddwd m7, [r3 + 8 * 16]
6597 paddd m7, [pd_16]
6598 psrld m7, 5
6599 packusdw m6, m7
6600
6601 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6602 pmaddwd m7, [r3 + 2 * 16] ; [16] row 7
6603 paddd m7, [pd_16]
6604 psrld m7, 5
6605 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
6606 pmaddwd m1, [r3 + 2 * 16]
6607 paddd m1, [pd_16]
6608 psrld m1, 5
6609 packusdw m7, m1
6610
6611 lea r0, [r0 + r1 * 4] ; advance destination to row 4
6612 movu [r0], m4 ; store rows 4-7
6613 movu [r0 + r1], m2
6614 movu [r0 + r1 * 2], m6
6615 movu [r0 + r4], m7
6616 RET
6617
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE row0, row1, row2, row3, tmp, offset
;   %1-%4 = four vector registers, each holding one 8-word row
;   %5    = scratch vector register (clobbered)
;   %6    = byte offset added to the destination in the transposed path; it is
;           also pasted into the local label names, so every expansion inside
;           one function must pass a distinct value
; The macro begins with a conditional jump and sets no flags itself: the
; caller must have established ZF beforehand and kept it intact.
;   ZF = 1: transpose the 4x8 word tile into 8 rows of 4 words and store each
;           as 8 bytes at [r0 + row * r1 + %6]. r4 is used as the row-3 offset
;           (presumably 3 * r1, TODO confirm at the call sites). This path
;           overwrites r5 and %1-%5.
;   ZF = 0: store %1-%4 untransposed as four full 16-byte rows at r5,
;           r5 + r1, r5 + r1 * 2 and r5 + r4; r5 must already point at the
;           first of those rows.
;-----------------------------------------------------------------------------
6618 %macro TRANSPOSE_STORE 6
6619 jnz .skip%6 ; flags come from the caller
6620 punpckhwd %5, %1, %2 ; interleave words of rows 0/1 (high half)
6621 punpcklwd %1, %2 ; interleave words of rows 0/1 (low half)
6622 punpckhwd %2, %3, %4 ; interleave words of rows 2/3 (high half)
6623 punpcklwd %3, %4 ; interleave words of rows 2/3 (low half)
6624
6625 punpckldq %4, %1, %3 ; columns 0-1, four words each
6626 punpckhdq %1, %3 ; columns 2-3
6627 punpckldq %3, %5, %2 ; columns 4-5
6628 punpckhdq %5, %2 ; columns 6-7
6629
6630 movh [r0 + %6], %4 ; low 8 bytes -> output row 0
6631 movhps [r0 + r1 + %6], %4 ; high 8 bytes -> output row 1
6632 movh [r0 + r1 * 2 + %6], %1
6633 movhps [r0 + r4 + %6], %1
6634 lea r5, [r0 + r1 * 4] ; r5 -> output row 4 (clobbers caller's r5)
6635 movh [r5 + %6], %3
6636 movhps [r5 + r1 + %6], %3
6637 movh [r5 + r1 * 2 + %6], %5
6638 movhps [r5 + r4 + %6], %5
6639 jmp .end%6
6640
6641 .skip%6:
6642 movu [r5], %1 ; untransposed path: four full rows at r5
6643 movu [r5 + r1], %2
6644 movu [r5 + r1 * 2], %3
6645 movu [r5 + r4], %4
6646 .end%6:
6647 %endmacro
6648
6649 INIT_XMM sse4
6650 cglobal ang16_mode_3_33
6651 test r6d, r6d
6652 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6653 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6654 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6655 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6656
6657 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6658 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6659 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6660 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
6661
6662 mova m4, m3
6663 pmaddwd m4, [r3 + 10 * 16] ; [26]
6664 paddd m4, [pd_16]
6665 psrld m4, 5
6666 mova m2, m0
6667 pmaddwd m2, [r3 + 10 * 16]
6668 paddd m2, [pd_16]
6669 psrld m2, 5
6670 packusdw m4, m2
6671
6672 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6673 pmaddwd m2, [r3 + 4 * 16] ; [20]
6674 paddd m2, [pd_16]
6675 psrld m2, 5
6676 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6677 pmaddwd m6, [r3 + 4 * 16]
6678 paddd m6, [pd_16]
6679 psrld m6, 5
6680 packusdw m2, m6
6681
6682 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6683 pmaddwd m6, [r3 - 2 * 16] ; [14]
6684 paddd m6, [pd_16]
6685 psrld m6, 5
6686 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6687 pmaddwd m7, [r3 - 2 * 16]
6688 paddd m7, [pd_16]
6689 psrld m7, 5
6690 packusdw m6, m7
6691
6692 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6693 pmaddwd m7, [r3 - 8 * 16] ; [ 8]
6694 paddd m7, [pd_16]
6695 psrld m7, 5
6696 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6697 pmaddwd m3, [r3 - 8 * 16]
6698 paddd m3, [pd_16]
6699 psrld m3, 5
6700 packusdw m7, m3
6701
6702 mov r5, r0
6703
6704 TRANSPOSE_STORE m4, m2, m6, m7, m3, 0
6705
6706 mova m4, m0
6707 pmaddwd m4, [r3 - 14 * 16] ; [ 2]
6708 paddd m4, [pd_16]
6709 psrld m4, 5
6710 mova m2, m5
6711 pmaddwd m2, [r3 - 14 * 16]
6712 paddd m2, [pd_16]
6713 psrld m2, 5
6714 packusdw m4, m2
6715
6716 mova m2, m0
6717 pmaddwd m2, [r3 + 12 * 16] ; [28]
6718 paddd m2, [pd_16]
6719 psrld m2, 5
6720 mova m6, m5
6721 pmaddwd m6, [r3 + 12 * 16]
6722 paddd m6, [pd_16]
6723 psrld m6, 5
6724 packusdw m2, m6
6725
6726 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6727 pmaddwd m6, [r3 + 6 * 16] ; [22]
6728 paddd m6, [pd_16]
6729 psrld m6, 5
6730 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
6731 pmaddwd m7, [r3 + 6 * 16]
6732 paddd m7, [pd_16]
6733 psrld m7, 5
6734 packusdw m6, m7
6735
6736 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6737 pmaddwd m7, [r3] ; [16]
6738 paddd m7, [pd_16]
6739 psrld m7, 5
6740 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
6741 pmaddwd m1, [r3]
6742 paddd m1, [pd_16]
6743 psrld m1, 5
6744 packusdw m7, m1
6745
6746 lea r5, [r0 + r1 * 4]
6747
6748 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8
6749
6750 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13]
6751 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14]
6752
6753 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13]
6754 punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17]
6755
6756 palignr m4, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6757 pmaddwd m4, [r3 - 6 * 16] ; [10]
6758 paddd m4, [pd_16]
6759 psrld m4, 5
6760 palignr m2, m3, m5, 12 ; [15 16 15 14 14 13 13 12]
6761 pmaddwd m2, [r3 - 6 * 16]
6762 paddd m2, [pd_16]
6763 psrld m2, 5
6764 packusdw m4, m2
6765
6766 mova m2, m5
6767 pmaddwd m2, [r3 - 12 * 16] ; [4]
6768 paddd m2, [pd_16]
6769 psrld m2, 5
6770 mova m6, m3
6771 pmaddwd m6, [r3 - 12 * 16]
6772 paddd m6, [pd_16]
6773 psrld m6, 5
6774 packusdw m2, m6
6775
6776 mova m6, m5
6777 pmaddwd m6, [r3 + 14 * 16] ; [30]
6778 paddd m6, [pd_16]
6779 psrld m6, 5
6780 mova m7, m3
6781 pmaddwd m7, [r3 + 14 * 16]
6782 paddd m7, [pd_16]
6783 psrld m7, 5
6784 packusdw m6, m7
6785
6786 palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
6787 pmaddwd m7, [r3 + 8 * 16] ; [24]
6788 paddd m7, [pd_16]
6789 psrld m7, 5
6790 palignr m0, m1, m3, 4 ; [18 17 17 16 16 15 15 14]
6791 pmaddwd m0, [r3 + 8 * 16]
6792 paddd m0, [pd_16]
6793 psrld m0, 5
6794 packusdw m7, m0
6795
6796 lea r5, [r5 + r1 * 4]
6797
6798 TRANSPOSE_STORE m4, m2, m6, m7, m0, 16
6799
6800 palignr m4, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
6801 pmaddwd m4, [r3 + 2 * 16] ; [18]
6802 paddd m4, [pd_16]
6803 psrld m4, 5
6804 palignr m2, m1, m3, 8 ; [19 18 18 17 17 16 16 15]
6805 pmaddwd m2, [r3 + 2 * 16]
6806 paddd m2, [pd_16]
6807 psrld m2, 5
6808 packusdw m4, m2
6809
6810 palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
6811 pmaddwd m2, [r3 - 4 * 16] ; [12]
6812 paddd m2, [pd_16]
6813 psrld m2, 5
6814 palignr m6, m1, m3, 12 ; [20 19 19 18 18 17 17 16]
6815 pmaddwd m6, [r3 - 4 * 16]
6816 paddd m6, [pd_16]
6817 psrld m6, 5
6818 packusdw m2, m6
6819
6820 pinsrw m1, [r2 + 42], 7
6821 pmaddwd m3, [r3 - 10 * 16] ; [6]
6822 paddd m3, [pd_16]
6823 psrld m3, 5
6824 pmaddwd m1, [r3 - 10 * 16]
6825 paddd m1, [pd_16]
6826 psrld m1, 5
6827 packusdw m3, m1
6828
6829 movu m7, [r2 + 28]
6830
6831 lea r5, [r5 + r1 * 4]
6832
6833 TRANSPOSE_STORE m4, m2, m3, m7, m0, 24
6834
6835 ret
6836
; ang16_mode_4_32: 16-bit-sample angular intra prediction helper (modes 4/32 per the name).
; Reads reference samples through r2, builds registers of adjacent-sample pairs,
; and for every output vector computes
;     packusdw( (pmaddwd(pairs_lo, row) + pd_16) >> 5, (pmaddwd(pairs_hi, row) + pd_16) >> 5 )
; where `row` is a 16-byte coefficient row addressed relative to r3.
; Four such vectors at a time are handed to TRANSPOSE_STORE, with r5 as the output pointer.
; Trailing "[n]" comments are the row index as annotated by the original authors
; (so [r3] itself is row 18 here); "[a b ...]" comments list the sample indices held in
; the register, highest word first.
; NOTE(review): r0/r1 look like destination pointer and stride, and r3 like a pointer into
; ang_table; all are set up by the caller, which is not visible here - confirm.
6837 cglobal ang16_mode_4_32
; Sets ZF from r6d. Nothing in this body reads the flags directly; presumably
; TRANSPOSE_STORE (macro defined elsewhere) branches on them - confirm.
6838 test r6d, r6d
6839 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6840 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6841 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6842 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6843
; Interleave each sample with its successor so one pmaddwd lane blends one neighbour pair.
6844 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6845 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6846 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6847
; ---- first group of four vectors (m4, m2, m6, m7) ----
6848 mova m4, m3
6849 pmaddwd m4, [r3 + 3 * 16] ; [21]
6850 paddd m4, [pd_16]
6851 psrld m4, 5
6852 mova m2, m0
6853 pmaddwd m2, [r3 + 3 * 16]
6854 paddd m2, [pd_16]
6855 psrld m2, 5
6856 packusdw m4, m2
6857
6858 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6859 mova m6, m2 ; keep the same pairs for row [31] below
6860 pmaddwd m2, [r3 - 8 * 16] ; [10]
6861 paddd m2, [pd_16]
6862 psrld m2, 5
6863 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6864 mova m7, m1 ; keep the same pairs for row [31] below
6865 pmaddwd m1, [r3 - 8 * 16]
6866 paddd m1, [pd_16]
6867 psrld m1, 5
6868 packusdw m2, m1
6869
6870 pmaddwd m6, [r3 + 13 * 16] ; [31]
6871 paddd m6, [pd_16]
6872 psrld m6, 5
6873 pmaddwd m7, [r3 + 13 * 16]
6874 paddd m7, [pd_16]
6875 psrld m7, 5
6876 packusdw m6, m7
6877
6878 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6879 pmaddwd m7, [r3 + 2 * 16] ; [20]
6880 paddd m7, [pd_16]
6881 psrld m7, 5
6882 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6883 pmaddwd m1, [r3 + 2 * 16]
6884 paddd m1, [pd_16]
6885 psrld m1, 5
6886 packusdw m7, m1
6887
6888 mov r5, r0 ; output pointer for the first group
6889
6890 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6891
; ---- second group of four vectors (m4, m2, m6, m7) ----
6892 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6893 mova m2, m4 ; same pairs reused for row [30]
6894 pmaddwd m4, [r3 - 9 * 16] ; [9]
6895 paddd m4, [pd_16]
6896 psrld m4, 5
6897 palignr m7, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6898 mova m6, m7 ; same pairs reused for row [30]
6899 pmaddwd m7, [r3 - 9 * 16]
6900 paddd m7, [pd_16]
6901 psrld m7, 5
6902 packusdw m4, m7
6903
6904 pmaddwd m2, [r3 + 12 * 16] ; [30]
6905 paddd m2, [pd_16]
6906 psrld m2, 5
6907 pmaddwd m6, [r3 + 12 * 16]
6908 paddd m6, [pd_16]
6909 psrld m6, 5
6910 packusdw m2, m6
6911
6912 mova m6, m0
6913 pmaddwd m6, [r3 + 1 * 16] ; [19]
6914 paddd m6, [pd_16]
6915 psrld m6, 5
6916 mova m7, m5
6917 pmaddwd m7, [r3 + 1 * 16]
6918 paddd m7, [pd_16]
6919 psrld m7, 5
6920 packusdw m6, m7
6921
6922 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13]
6923
6924 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6925 pmaddwd m7, [r3 - 10 * 16] ; [8]
6926 paddd m7, [pd_16]
6927 psrld m7, 5
; m1 still holds raw (non-interleaved) samples here; the 4-byte palignr only pulls in its
; two lowest words (13, 14), which coincide with the interleaved pair that is needed.
6928 palignr m3, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
6929 pmaddwd m3, [r3 - 10 * 16]
6930 paddd m3, [pd_16]
6931 psrld m3, 5
6932 packusdw m7, m3
6933
6934 lea r5, [r0 + r1 * 4] ; output pointer for the second group
6935
6936 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8
6937
; ---- third group of four vectors (m4, m2, m6, m7) ----
6938 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14]
6939
6940 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13]
6941 punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17]
6942
6943 palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6944 pmaddwd m4, [r3 + 11 * 16] ; [29]
6945 paddd m4, [pd_16]
6946 psrld m4, 5
6947 palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
6948 pmaddwd m2, [r3 + 11 * 16]
6949 paddd m2, [pd_16]
6950 psrld m2, 5
6951 packusdw m4, m2
6952
6953 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6954 pmaddwd m2, [r3] ; [18]
6955 paddd m2, [pd_16]
6956 psrld m2, 5
6957 palignr m6, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
6958 pmaddwd m6, [r3]
6959 paddd m6, [pd_16]
6960 psrld m6, 5
6961 packusdw m2, m6
6962
6963 palignr m6, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6964 mova m7, m6 ; same pairs reused for row [28]
6965 pmaddwd m6, [r3 - 11 * 16] ; [7]
6966 paddd m6, [pd_16]
6967 psrld m6, 5
6968 palignr m0, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
6969 pmaddwd m0, [r3 - 11 * 16]
6970 paddd m0, [pd_16]
6971 psrld m0, 5
6972 packusdw m6, m0
6973
6974 pmaddwd m7, [r3 + 10 * 16] ; [28]
6975 paddd m7, [pd_16]
6976 psrld m7, 5
6977 palignr m0, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
6978 pmaddwd m0, [r3 + 10 * 16]
6979 paddd m0, [pd_16]
6980 psrld m0, 5
6981 packusdw m7, m0
6982
6983 lea r5, [r5 + r1 * 4] ; advance output pointer for the third group
6984
6985 TRANSPOSE_STORE m4, m2, m6, m7, m0, 16
6986
; ---- fourth group of four vectors (m4, m2, m7, m0) ----
6987 mova m4, m5
6988 pmaddwd m4, [r3 - 1 * 16] ; [17]
6989 paddd m4, [pd_16]
6990 psrld m4, 5
6991 mova m2, m3
6992 pmaddwd m2, [r3 - 1 * 16]
6993 paddd m2, [pd_16]
6994 psrld m2, 5
6995 packusdw m4, m2
6996
6997 palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
6998 mova m7, m2 ; same pairs reused for row [27]
6999 pmaddwd m2, [r3 - 12 * 16] ; [6]
7000 paddd m2, [pd_16]
7001 psrld m2, 5
7002 palignr m6, m1, m3, 4 ; [18 17 17 16 16 15 15 14]
7003 mova m0, m6 ; same pairs reused for row [27]
7004 pmaddwd m6, [r3 - 12 * 16]
7005 paddd m6, [pd_16]
7006 psrld m6, 5
7007 packusdw m2, m6
7008
7009 pmaddwd m7, [r3 + 9 * 16] ; [27]
7010 paddd m7, [pd_16]
7011 psrld m7, 5
7012 pmaddwd m0, [r3 + 9 * 16]
7013 paddd m0, [pd_16]
7014 psrld m0, 5
7015 packusdw m7, m0
7016
7017 palignr m0, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
7018 pmaddwd m0, [r3 - 2 * 16] ; [16]
7019 paddd m0, [pd_16]
7020 psrld m0, 5
7021 palignr m1, m3, 8 ; [19 18 18 17 17 16 16 15]
7022 pmaddwd m1, [r3 - 2 * 16]
7023 paddd m1, [pd_16]
7024 psrld m1, 5
7025 packusdw m0, m1
7026
7027 lea r5, [r5 + r1 * 4] ; advance output pointer for the fourth group
7028
7029 TRANSPOSE_STORE m4, m2, m7, m0, m3, 24
7030
7031 ret
7032
; ang16_mode_5_31: 16-bit-sample angular intra prediction helper (modes 5/31 per the name).
; Reads reference samples through r2, builds registers of adjacent-sample pairs,
; and for every output vector computes
;     packusdw( (pmaddwd(pairs_lo, row) + pd_16) >> 5, (pmaddwd(pairs_hi, row) + pd_16) >> 5 )
; where `row` is a 16-byte coefficient row addressed relative to r3.
; Four such vectors at a time are handed to TRANSPOSE_STORE, with r5 as the output pointer.
; Trailing "[n]" comments are the row index as annotated by the original authors
; (so [r3] itself is row 16 here); "[a b ...]" comments list the sample indices held in
; the register, highest word first.
; NOTE(review): r0/r1 look like destination pointer and stride, and r3 like a pointer into
; ang_table; all are set up by the caller, which is not visible here - confirm.
7033 cglobal ang16_mode_5_31
; Sets ZF from r6d. Nothing in this body reads the flags directly; presumably
; TRANSPOSE_STORE (macro defined elsewhere) branches on them - confirm.
7034 test r6d, r6d
7035 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7036 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
7037 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
7038 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
7039
; Interleave each sample with its successor so one pmaddwd lane blends one neighbour pair.
7040 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
7041 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
7042 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
7043
; ---- first group of four vectors (m4, m2, m6, m7) ----
7044 mova m4, m3
7045 pmaddwd m4, [r3 + 1 * 16] ; [17]
7046 paddd m4, [pd_16]
7047 psrld m4, 5
7048 mova m2, m0
7049 pmaddwd m2, [r3 + 1 * 16]
7050 paddd m2, [pd_16]
7051 psrld m2, 5
7052 packusdw m4, m2
7053
7054 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
7055 mova m6, m2 ; same pairs reused for row [19]
7056 pmaddwd m2, [r3 - 14 * 16] ; [2]
7057 paddd m2, [pd_16]
7058 psrld m2, 5
7059 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7060 mova m7, m1 ; same pairs reused for row [19]
7061 pmaddwd m1, [r3 - 14 * 16]
7062 paddd m1, [pd_16]
7063 psrld m1, 5
7064 packusdw m2, m1
7065
7066 pmaddwd m6, [r3 + 3 * 16] ; [19]
7067 paddd m6, [pd_16]
7068 psrld m6, 5
7069 pmaddwd m7, [r3 + 3 * 16]
7070 paddd m7, [pd_16]
7071 psrld m7, 5
7072 packusdw m6, m7
7073
7074 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
7075 pmaddwd m7, [r3 - 12 * 16] ; [4]
7076 paddd m7, [pd_16]
7077 psrld m7, 5
7078 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7079 pmaddwd m1, [r3 - 12 * 16]
7080 paddd m1, [pd_16]
7081 psrld m1, 5
7082 packusdw m7, m1
7083
7084 mov r5, r0 ; output pointer for the first group
7085
7086 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7087
; ---- second group of four vectors (m4, m2, m6, m7) ----
7088 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
7089 pmaddwd m4, [r3 + 5 * 16] ; [21]
7090 paddd m4, [pd_16]
7091 psrld m4, 5
7092 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7093 pmaddwd m7, [r3 + 5 * 16]
7094 paddd m7, [pd_16]
7095 psrld m7, 5
7096 packusdw m4, m7
7097
7098 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
7099 mova m6, m2 ; same pairs reused for row [23]
7100 pmaddwd m2, [r3 - 10 * 16] ; [6]
7101 paddd m2, [pd_16]
7102 psrld m2, 5
7103 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
7104 mova m7, m1 ; same pairs reused for row [23]
7105 pmaddwd m1, [r3 - 10 * 16]
7106 paddd m1, [pd_16]
7107 psrld m1, 5
7108 packusdw m2, m1
7109
7110 pmaddwd m6, [r3 + 7 * 16] ; [23]
7111 paddd m6, [pd_16]
7112 psrld m6, 5
7113 pmaddwd m7, [r3 + 7 * 16]
7114 paddd m7, [pd_16]
7115 psrld m7, 5
7116 packusdw m6, m7
7117
7118 mova m7, m0
7119 pmaddwd m7, [r3 - 8 * 16] ; [8]
7120 paddd m7, [pd_16]
7121 psrld m7, 5
7122 mova m3, m5
7123 pmaddwd m3, [r3 - 8 * 16]
7124 paddd m3, [pd_16]
7125 psrld m3, 5
7126 packusdw m7, m3
7127
7128 lea r5, [r0 + r1 * 4] ; output pointer for the second group
7129
7130 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8
7131
; ---- third group of four vectors (m4, m2, m6, m7) ----
7132 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13]
7133 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14]
7134
7135 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13]
7136
7137 mova m4, m0
7138 pmaddwd m4, [r3 + 9 * 16] ; [25]
7139 paddd m4, [pd_16]
7140 psrld m4, 5
7141 mova m2, m5
7142 pmaddwd m2, [r3 + 9 * 16]
7143 paddd m2, [pd_16]
7144 psrld m2, 5
7145 packusdw m4, m2
7146
7147 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7148 mova m6, m2 ; same pairs reused for row [27]
7149 pmaddwd m2, [r3 - 6 * 16] ; [10]
7150 paddd m2, [pd_16]
7151 psrld m2, 5
7152 palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
7153 mova m1, m7 ; same pairs reused for row [27]
7154 pmaddwd m7, [r3 - 6 * 16]
7155 paddd m7, [pd_16]
7156 psrld m7, 5
7157 packusdw m2, m7
7158
7159 pmaddwd m6, [r3 + 11 * 16] ; [27]
7160 paddd m6, [pd_16]
7161 psrld m6, 5
7162 pmaddwd m1, [r3 + 11 * 16]
7163 paddd m1, [pd_16]
7164 psrld m1, 5
7165 packusdw m6, m1
7166
7167 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7168 pmaddwd m7, [r3 - 4 * 16] ; [12]
7169 paddd m7, [pd_16]
7170 psrld m7, 5
7171 palignr m1, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
7172 pmaddwd m1, [r3 - 4 * 16]
7173 paddd m1, [pd_16]
7174 psrld m1, 5
7175 packusdw m7, m1
7176
7177 lea r5, [r5 + r1 * 4] ; advance output pointer for the third group
7178
7179 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7180
; ---- fourth group of four vectors (m4, m2, m7, m5) ----
7181 palignr m4, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7182 pmaddwd m4, [r3 + 13 * 16] ; [29]
7183 paddd m4, [pd_16]
7184 psrld m4, 5
7185 palignr m2, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
7186 pmaddwd m2, [r3 + 13 * 16]
7187 paddd m2, [pd_16]
7188 psrld m2, 5
7189 packusdw m4, m2
7190
7191 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
7192 mova m7, m2 ; same pairs reused for row [31]
7193 pmaddwd m2, [r3 - 2 * 16] ; [14]
7194 paddd m2, [pd_16]
7195 psrld m2, 5
7196 palignr m6, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
7197 mova m0, m6 ; same pairs reused for row [31]
7198 pmaddwd m6, [r3 - 2 * 16]
7199 paddd m6, [pd_16]
7200 psrld m6, 5
7201 packusdw m2, m6
7202
7203 pmaddwd m7, [r3 + 15 * 16] ; [31]
7204 paddd m7, [pd_16]
7205 psrld m7, 5
7206 pmaddwd m0, [r3 + 15 * 16]
7207 paddd m0, [pd_16]
7208 psrld m0, 5
7209 packusdw m7, m0
7210
; m5 / m3 are consumed in place here: they are not needed after this vector.
7211 pmaddwd m5, [r3] ; [16]
7212 paddd m5, [pd_16]
7213 psrld m5, 5
7214 pmaddwd m3, [r3]
7215 paddd m3, [pd_16]
7216 psrld m3, 5
7217 packusdw m5, m3
7218
7219 lea r5, [r5 + r1 * 4] ; advance output pointer for the fourth group
7220
7221 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24
7222
7223 ret
7224
; ang16_mode_6_30: 16-bit-sample angular intra prediction helper (modes 6/30 per the name).
; Reads reference samples through r2, builds registers of adjacent-sample pairs,
; and for every output vector computes
;     packusdw( (pmaddwd(pairs_lo, row) + pd_16) >> 5, (pmaddwd(pairs_hi, row) + pd_16) >> 5 )
; where `row` is a 16-byte coefficient row addressed relative to r3.
; Four such vectors at a time are handed to TRANSPOSE_STORE, with r5 as the output pointer.
; Trailing "[n]" comments are the row index as annotated by the original authors
; (so [r3] itself is row 15 here); "[a b ...]" comments list the sample indices held in
; the register, highest word first.
; NOTE(review): r0/r1 look like destination pointer and stride, and r3 like a pointer into
; ang_table; all are set up by the caller, which is not visible here - confirm.
7225 cglobal ang16_mode_6_30
; Sets ZF from r6d. Nothing in this body reads the flags directly; presumably
; TRANSPOSE_STORE (macro defined elsewhere) branches on them - confirm.
7226 test r6d, r6d
7227 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7228 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
7229 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
7230 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
7231
; Interleave each sample with its successor so one pmaddwd lane blends one neighbour pair.
7232 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
7233 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
7234 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
7235
; ---- first group of four vectors (m4, m2, m6, m7) ----
7236 mova m4, m3
7237 pmaddwd m4, [r3 - 2 * 16] ; [13]
7238 paddd m4, [pd_16]
7239 psrld m4, 5
7240 mova m2, m0
7241 pmaddwd m2, [r3 - 2 * 16]
7242 paddd m2, [pd_16]
7243 psrld m2, 5
7244 packusdw m4, m2
7245
7246 mova m2, m3
7247 pmaddwd m2, [r3 + 11 * 16] ; [26]
7248 paddd m2, [pd_16]
7249 psrld m2, 5
7250 mova m1, m0
7251 pmaddwd m1, [r3 + 11 * 16]
7252 paddd m1, [pd_16]
7253 psrld m1, 5
7254 packusdw m2, m1
7255
7256 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
7257 mova m7, m6 ; same pairs reused for row [20]
7258 pmaddwd m6, [r3 - 8 * 16] ; [7]
7259 paddd m6, [pd_16]
7260 psrld m6, 5
7261 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7262 pmaddwd m1, [r3 - 8 * 16]
7263 paddd m1, [pd_16]
7264 psrld m1, 5
7265 packusdw m6, m1
7266
7267 pmaddwd m7, [r3 + 5 * 16] ; [20]
7268 paddd m7, [pd_16]
7269 psrld m7, 5
; The high-half pairs are rebuilt here because m1 was overwritten by the previous vector.
7270 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7271 pmaddwd m1, [r3 + 5 * 16]
7272 paddd m1, [pd_16]
7273 psrld m1, 5
7274 packusdw m7, m1
7275
7276 mov r5, r0 ; output pointer for the first group
7277
7278 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7279
; ---- second group of four vectors (m4, m2, m6, m7) ----
7280 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
7281 mova m6, m4 ; same pairs reused for rows [14] and [27]
7282 pmaddwd m4, [r3 - 14 * 16] ; [1]
7283 paddd m4, [pd_16]
7284 psrld m4, 5
7285 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7286 mova m7, m1 ; same pairs reused for rows [14] and [27]
7287 pmaddwd m1, [r3 - 14 * 16]
7288 paddd m1, [pd_16]
7289 psrld m1, 5
7290 packusdw m4, m1
7291
7292 mova m2, m6
7293 pmaddwd m2, [r3 - 1 * 16] ; [14]
7294 paddd m2, [pd_16]
7295 psrld m2, 5
7296 mova m1, m7
7297 pmaddwd m1, [r3 - 1 * 16]
7298 paddd m1, [pd_16]
7299 psrld m1, 5
7300 packusdw m2, m1
7301
7302 pmaddwd m6, [r3 + 12 * 16] ; [27]
7303 paddd m6, [pd_16]
7304 psrld m6, 5
7305 pmaddwd m7, [r3 + 12 * 16]
7306 paddd m7, [pd_16]
7307 psrld m7, 5
7308 packusdw m6, m7
7309
7310 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
7311 pmaddwd m7, [r3 - 7 * 16] ; [8]
7312 paddd m7, [pd_16]
7313 psrld m7, 5
7314 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
7315 pmaddwd m1, [r3 - 7 * 16]
7316 paddd m1, [pd_16]
7317 psrld m1, 5
7318 packusdw m7, m1
7319
7320 lea r5, [r0 + r1 * 4] ; output pointer for the second group
7321
7322 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7323
; ---- third group of four vectors (m4, m2, m6, m7) ----
7324 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
7325 pmaddwd m4, [r3 + 6 * 16] ; [21]
7326 paddd m4, [pd_16]
7327 psrld m4, 5
7328 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
7329 pmaddwd m2, [r3 + 6 * 16]
7330 paddd m2, [pd_16]
7331 psrld m2, 5
7332 packusdw m4, m2
7333
7334 mova m2, m0
7335 pmaddwd m2, [r3 - 13 * 16] ; [2]
7336 paddd m2, [pd_16]
7337 psrld m2, 5
7338 mova m7, m5
7339 pmaddwd m7, [r3 - 13 * 16]
7340 paddd m7, [pd_16]
7341 psrld m7, 5
7342 packusdw m2, m7
7343
7344 mova m6, m0
7345 pmaddwd m6, [r3] ; [15]
7346 paddd m6, [pd_16]
7347 psrld m6, 5
7348 mova m1, m5
7349 pmaddwd m1, [r3]
7350 paddd m1, [pd_16]
7351 psrld m1, 5
7352 packusdw m6, m1
7353
7354 mova m7, m0
7355 pmaddwd m7, [r3 + 13 * 16] ; [28]
7356 paddd m7, [pd_16]
7357 psrld m7, 5
7358 mova m1, m5
7359 pmaddwd m1, [r3 + 13 * 16]
7360 paddd m1, [pd_16]
7361 psrld m1, 5
7362 packusdw m7, m1
7363
7364 lea r5, [r5 + r1 * 4] ; advance output pointer for the third group
7365
7366 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7367
; ---- fourth group of four vectors (m4, m2, m7, m5) ----
; Only the low half of m3 is loaded; the palignr steps below take just its lowest
; two words at a time, which line up with the interleaved pairs that are needed.
7368 movh m3, [r2 + 26] ; [16 15 14 13]
7369
7370 palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7371 mova m2, m4 ; same pairs reused for row [22]
7372 pmaddwd m4, [r3 - 6 * 16] ; [9]
7373 paddd m4, [pd_16]
7374 psrld m4, 5
7375 palignr m1, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
7376 mova m6, m1 ; kept for row [22] and for building the next pair set
7377 pmaddwd m1, [r3 - 6 * 16]
7378 paddd m1, [pd_16]
7379 psrld m1, 5
7380 packusdw m4, m1
7381
7382 pmaddwd m2, [r3 + 7 * 16] ; [22]
7383 paddd m2, [pd_16]
7384 psrld m2, 5
7385 mova m1, m6
7386 pmaddwd m1, [r3 + 7 * 16]
7387 paddd m1, [pd_16]
7388 psrld m1, 5
7389 packusdw m2, m1
7390
; Shift m3 down by one word so its two lowest words become samples 14 and 15.
7391 psrldq m3, 2
7392 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7393 mova m5, m7 ; same pairs reused for row [16]
7394 pmaddwd m7, [r3 - 12 * 16] ; [3]
7395 paddd m7, [pd_16]
7396 psrld m7, 5
7397 palignr m3, m6, 4 ; [15 14 14 13 13 12 12 11]
7398 mova m1, m3 ; same pairs reused for row [16]
7399 pmaddwd m3, [r3 - 12 * 16]
7400 paddd m3, [pd_16]
7401 psrld m3, 5
7402 packusdw m7, m3
7403
7404 pmaddwd m5, [r3 + 1 * 16] ; [16]
7405 paddd m5, [pd_16]
7406 psrld m5, 5
7407 pmaddwd m1, [r3 + 1 * 16]
7408 paddd m1, [pd_16]
7409 psrld m1, 5
7410 packusdw m5, m1
7411
7412 lea r5, [r5 + r1 * 4] ; advance output pointer for the fourth group
7413
7414 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24
7415
7416 ret
7417
; ang16_mode_7_29: 16-bit-sample angular intra prediction helper (modes 7/29 per the name).
; Reads reference samples through r2, builds registers of adjacent-sample pairs,
; and for every output vector computes
;     packusdw( (pmaddwd(pairs_lo, row) + pd_16) >> 5, (pmaddwd(pairs_hi, row) + pd_16) >> 5 )
; where `row` is a 16-byte coefficient row addressed relative to r3.
; Four such vectors at a time are handed to TRANSPOSE_STORE, with r5 as the output pointer.
; Trailing "[n]" comments are the row index as annotated by the original authors
; (so [r3] itself is row 17 here); "[a b ...]" comments list the sample indices held in
; the register, highest word first.
; NOTE(review): r0/r1 look like destination pointer and stride, and r3 like a pointer into
; ang_table; all are set up by the caller, which is not visible here - confirm.
7418 cglobal ang16_mode_7_29
; Sets ZF from r6d. Nothing in this body reads the flags directly; presumably
; TRANSPOSE_STORE (macro defined elsewhere) branches on them - confirm.
7419 test r6d, r6d
7420 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7421 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
7422 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
7423 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
7424
; Interleave each sample with its successor so one pmaddwd lane blends one neighbour pair.
7425 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
7426 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
7427 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
7428
; ---- first group of four vectors (m4, m2, m6, m7) ----
7429 mova m4, m3
7430 pmaddwd m4, [r3 - 8 * 16] ; [9]
7431 paddd m4, [pd_16]
7432 psrld m4, 5
7433 mova m2, m0
7434 pmaddwd m2, [r3 - 8 * 16]
7435 paddd m2, [pd_16]
7436 psrld m2, 5
7437 packusdw m4, m2
7438
7439 mova m2, m3
7440 pmaddwd m2, [r3 + 1 * 16] ; [18]
7441 paddd m2, [pd_16]
7442 psrld m2, 5
7443 mova m1, m0
7444 pmaddwd m1, [r3 + 1 * 16]
7445 paddd m1, [pd_16]
7446 psrld m1, 5
7447 packusdw m2, m1
7448
7449 mova m6, m3
7450 pmaddwd m6, [r3 + 10 * 16] ; [27]
7451 paddd m6, [pd_16]
7452 psrld m6, 5
7453 mova m1, m0
7454 pmaddwd m1, [r3 + 10 * 16]
7455 paddd m1, [pd_16]
7456 psrld m1, 5
7457 packusdw m6, m1
7458
7459 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
7460 pmaddwd m7, [r3 - 13 * 16] ; [4]
7461 paddd m7, [pd_16]
7462 psrld m7, 5
7463 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7464 pmaddwd m1, [r3 - 13 * 16]
7465 paddd m1, [pd_16]
7466 psrld m1, 5
7467 packusdw m7, m1
7468
7469 mov r5, r0 ; output pointer for the first group
7470
7471 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7472
; ---- second group of four vectors (m4, m2, m6, m7) ----
7473 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
7474 mova m6, m4 ; same pairs reused for rows [22] and [31]
7475 pmaddwd m4, [r3 - 4 * 16] ; [13]
7476 paddd m4, [pd_16]
7477 psrld m4, 5
7478 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7479 mova m7, m1 ; same pairs reused for rows [22] and [31]
7480 pmaddwd m1, [r3 - 4 * 16]
7481 paddd m1, [pd_16]
7482 psrld m1, 5
7483 packusdw m4, m1
7484
7485 mova m2, m6
7486 pmaddwd m2, [r3 + 5 * 16] ; [22]
7487 paddd m2, [pd_16]
7488 psrld m2, 5
7489 mova m1, m7
7490 pmaddwd m1, [r3 + 5 * 16]
7491 paddd m1, [pd_16]
7492 psrld m1, 5
7493 packusdw m2, m1
7494
7495 pmaddwd m6, [r3 + 14 * 16] ; [31]
7496 paddd m6, [pd_16]
7497 psrld m6, 5
7498 pmaddwd m7, [r3 + 14 * 16]
7499 paddd m7, [pd_16]
7500 psrld m7, 5
7501 packusdw m6, m7
7502
7503 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
7504 pmaddwd m7, [r3 - 9 * 16] ; [8]
7505 paddd m7, [pd_16]
7506 psrld m7, 5
7507 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7508 pmaddwd m1, [r3 - 9 * 16]
7509 paddd m1, [pd_16]
7510 psrld m1, 5
7511 packusdw m7, m1
7512
7513 lea r5, [r0 + r1 * 4] ; output pointer for the second group
7514
7515 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7516
; ---- third group of four vectors (m4, m2, m6, m7) ----
7517 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
7518 mova m2, m4 ; same pairs reused for row [26]
7519 pmaddwd m4, [r3] ; [17]
7520 paddd m4, [pd_16]
7521 psrld m4, 5
7522 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7523 mova m7, m1 ; same pairs reused for row [26]
7524 pmaddwd m1, [r3]
7525 paddd m1, [pd_16]
7526 psrld m1, 5
7527 packusdw m4, m1
7528
7529 pmaddwd m2, [r3 + 9 * 16] ; [26]
7530 paddd m2, [pd_16]
7531 psrld m2, 5
7532 pmaddwd m7, [r3 + 9 * 16]
7533 paddd m7, [pd_16]
7534 psrld m7, 5
7535 packusdw m2, m7
7536
7537 palignr m6, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
7538 pmaddwd m6, [r3 - 14 * 16] ; [3]
7539 paddd m6, [pd_16]
7540 psrld m6, 5
7541 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
7542 pmaddwd m1, [r3 - 14 * 16]
7543 paddd m1, [pd_16]
7544 psrld m1, 5
7545 packusdw m6, m1
7546
7547 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
7548 pmaddwd m7, [r3 - 5 * 16] ; [12]
7549 paddd m7, [pd_16]
7550 psrld m7, 5
7551 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
7552 pmaddwd m1, [r3 - 5 * 16]
7553 paddd m1, [pd_16]
7554 psrld m1, 5
7555 packusdw m7, m1
7556
7557 lea r5, [r5 + r1 * 4] ; advance output pointer for the third group
7558
7559 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7560
; ---- fourth group of four vectors (m4, m2, m7, m0) ----
7561 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
7562 mova m2, m4 ; same pairs reused for row [30]
7563 pmaddwd m4, [r3 + 4 * 16] ; [21]
7564 paddd m4, [pd_16]
7565 psrld m4, 5
7566 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
7567 mova m3, m1 ; same pairs reused for row [30]; the old m3 pairs are no longer needed
7568 pmaddwd m1, [r3 + 4 * 16]
7569 paddd m1, [pd_16]
7570 psrld m1, 5
7571 packusdw m4, m1
7572
7573 pmaddwd m2, [r3 + 13 * 16] ; [30]
7574 paddd m2, [pd_16]
7575 psrld m2, 5
7576 pmaddwd m3, [r3 + 13 * 16]
7577 paddd m3, [pd_16]
7578 psrld m3, 5
7579 packusdw m2, m3
7580
7581 mova m7, m0
7582 pmaddwd m7, [r3 - 10 * 16] ; [7]
7583 paddd m7, [pd_16]
7584 psrld m7, 5
7585 mova m3, m5
7586 pmaddwd m3, [r3 - 10 * 16]
7587 paddd m3, [pd_16]
7588 psrld m3, 5
7589 packusdw m7, m3
7590
; m0 / m5 are consumed in place here: they are not needed after this vector.
7591 pmaddwd m0, [r3 - 1 * 16] ; [16]
7592 paddd m0, [pd_16]
7593 psrld m0, 5
7594 pmaddwd m5, [r3 - 1 * 16]
7595 paddd m5, [pd_16]
7596 psrld m5, 5
7597 packusdw m0, m5
7598
7599 lea r5, [r5 + r1 * 4] ; advance output pointer for the fourth group
7600
7601 TRANSPOSE_STORE m4, m2, m7, m0, m3, 24
7602
7603 ret
7604
; ang16_mode_8_28: 16-bit-sample angular intra prediction helper (modes 8/28 per the name).
; Reads reference samples through r2, builds registers of adjacent-sample pairs,
; and for every output vector computes
;     packusdw( (pmaddwd(pairs_lo, row) + pd_16) >> 5, (pmaddwd(pairs_hi, row) + pd_16) >> 5 )
; where `row` is a 16-byte coefficient row addressed relative to r3.
; Four such vectors at a time are handed to TRANSPOSE_STORE, with r5 as the output pointer.
; Trailing "[n]" comments are the row index as annotated by the original authors
; (so [r3] itself is row 15 here); "[a b ...]" comments list the sample indices held in
; the register, highest word first.
; NOTE(review): r0/r1 look like destination pointer and stride, and r3 like a pointer into
; ang_table; all are set up by the caller, which is not visible here - confirm.
7605 cglobal ang16_mode_8_28
; Sets ZF from r6d. Nothing in this body reads the flags directly; presumably
; TRANSPOSE_STORE (macro defined elsewhere) branches on them - confirm.
7606 test r6d, r6d
7607 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7608 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
7609 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
7610 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
7611
; Interleave each sample with its successor so one pmaddwd lane blends one neighbour pair.
7612 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
7613 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
7614 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
7615
; ---- first group of four vectors (m4, m2, m6, m7): all use the unshifted pairs m3/m0 ----
7616 mova m4, m3
7617 pmaddwd m4, [r3 - 10 * 16] ; [5]
7618 paddd m4, [pd_16]
7619 psrld m4, 5
7620 mova m2, m0
7621 pmaddwd m2, [r3 - 10 * 16]
7622 paddd m2, [pd_16]
7623 psrld m2, 5
7624 packusdw m4, m2
7625
7626 mova m2, m3
7627 pmaddwd m2, [r3 - 5 * 16] ; [10]
7628 paddd m2, [pd_16]
7629 psrld m2, 5
7630 mova m1, m0
7631 pmaddwd m1, [r3 - 5 * 16]
7632 paddd m1, [pd_16]
7633 psrld m1, 5
7634 packusdw m2, m1
7635
7636 mova m6, m3
7637 pmaddwd m6, [r3] ; [15]
7638 paddd m6, [pd_16]
7639 psrld m6, 5
7640 mova m1, m0
7641 pmaddwd m1, [r3]
7642 paddd m1, [pd_16]
7643 psrld m1, 5
7644 packusdw m6, m1
7645
7646 mova m7, m3
7647 pmaddwd m7, [r3 + 5 * 16] ; [20]
7648 paddd m7, [pd_16]
7649 psrld m7, 5
7650 mova m1, m0
7651 pmaddwd m1, [r3 + 5 * 16]
7652 paddd m1, [pd_16]
7653 psrld m1, 5
7654 packusdw m7, m1
7655
7656 mov r5, r0 ; output pointer for the first group
7657
7658 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7659
; ---- second group of four vectors (m4, m2, m6, m7) ----
7660 mova m4, m3
7661 pmaddwd m4, [r3 + 10 * 16] ; [25]
7662 paddd m4, [pd_16]
7663 psrld m4, 5
7664 mova m1, m0
7665 pmaddwd m1, [r3 + 10 * 16]
7666 paddd m1, [pd_16]
7667 psrld m1, 5
7668 packusdw m4, m1
7669
7670 mova m2, m3
7671 pmaddwd m2, [r3 + 15 * 16] ; [30]
7672 paddd m2, [pd_16]
7673 psrld m2, 5
7674 mova m1, m0
7675 pmaddwd m1, [r3 + 15 * 16]
7676 paddd m1, [pd_16]
7677 psrld m1, 5
7678 packusdw m2, m1
7679
7680 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
7681 pmaddwd m6, [r3 - 12 * 16] ; [3]
7682 paddd m6, [pd_16]
7683 psrld m6, 5
7684 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7685 pmaddwd m7, [r3 - 12 * 16]
7686 paddd m7, [pd_16]
7687 psrld m7, 5
7688 packusdw m6, m7
7689
7690 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
7691 pmaddwd m7, [r3 - 7 * 16] ; [8]
7692 paddd m7, [pd_16]
7693 psrld m7, 5
7694 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7695 pmaddwd m1, [r3 - 7 * 16]
7696 paddd m1, [pd_16]
7697 psrld m1, 5
7698 packusdw m7, m1
7699
7700 lea r5, [r0 + r1 * 4] ; output pointer for the second group
7701
7702 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7703
; ---- third group of four vectors (m4, m2, m6, m7): all use the pairs shifted by one sample ----
7704 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
7705 mova m7, m4 ; low-half pairs kept for rows [18], [23], [28]
7706 pmaddwd m4, [r3 - 2 *16] ; [13]
7707 paddd m4, [pd_16]
7708 psrld m4, 5
7709 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7710 mova m1, m6 ; high-half pairs kept for rows [18], [23]
7711 pmaddwd m6, [r3 - 2 * 16]
7712 paddd m6, [pd_16]
7713 psrld m6, 5
7714 packusdw m4, m6
7715
7716 mova m2, m7
7717 pmaddwd m2, [r3 + 3 * 16] ; [18]
7718 paddd m2, [pd_16]
7719 psrld m2, 5
7720 mova m6, m1
7721 pmaddwd m6, [r3 + 3 * 16]
7722 paddd m6, [pd_16]
7723 psrld m6, 5
7724 packusdw m2, m6
7725
7726 mova m6, m7
7727 pmaddwd m6, [r3 + 8 * 16] ; [23]
7728 paddd m6, [pd_16]
7729 psrld m6, 5
7730 pmaddwd m1, [r3 + 8 * 16]
7731 paddd m1, [pd_16]
7732 psrld m1, 5
7733 packusdw m6, m1
7734
7735 pmaddwd m7, [r3 + 13 * 16] ; [28]
7736 paddd m7, [pd_16]
7737 psrld m7, 5
; The high-half pairs are rebuilt here because m1 was consumed by row [23] above.
7738 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
7739 pmaddwd m1, [r3 + 13 * 16]
7740 paddd m1, [pd_16]
7741 psrld m1, 5
7742 packusdw m7, m1
7743
7744 lea r5, [r5 + r1 * 4] ; advance output pointer for the third group
7745
7746 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7747
; ---- fourth group of four vectors (m4, m2, m7, m1): all use the pairs shifted by two samples ----
7748 palignr m1, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
7749 mova m4, m1
7750 pmaddwd m4, [r3 - 14 * 16] ; [1]
7751 paddd m4, [pd_16]
7752 psrld m4, 5
7753 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
7754 mova m0, m5 ; m0's old pairs are no longer needed; reuse it as scratch
7755 pmaddwd m0, [r3 - 14 * 16]
7756 paddd m0, [pd_16]
7757 psrld m0, 5
7758 packusdw m4, m0
7759
7760 mova m2, m1
7761 pmaddwd m2, [r3 - 9 * 16] ; [6]
7762 paddd m2, [pd_16]
7763 psrld m2, 5
7764 mova m3, m5
7765 pmaddwd m3, [r3 - 9 * 16]
7766 paddd m3, [pd_16]
7767 psrld m3, 5
7768 packusdw m2, m3
7769
7770 mova m7, m1
7771 pmaddwd m7, [r3 - 4 * 16] ; [11]
7772 paddd m7, [pd_16]
7773 psrld m7, 5
7774 mova m3, m5
7775 pmaddwd m3, [r3 - 4 * 16]
7776 paddd m3, [pd_16]
7777 psrld m3, 5
7778 packusdw m7, m3
7779
; m1 / m5 are consumed in place here: they are not needed after this vector.
7780 pmaddwd m1, [r3 + 1 * 16] ; [16]
7781 paddd m1, [pd_16]
7782 psrld m1, 5
7783 pmaddwd m5, [r3 + 1 * 16]
7784 paddd m5, [pd_16]
7785 psrld m5, 5
7786 packusdw m1, m5
7787
7788 lea r5, [r5 + r1 * 4] ; advance output pointer for the fourth group
7789
7790 TRANSPOSE_STORE m4, m2, m7, m1, m3, 24
7791
7792 ret
7793
; ang16_mode_9_27: 16-bit-sample angular intra prediction helper (modes 9/27 per the name).
; Reads reference samples through r2 and builds one fixed set of adjacent-sample pairs
; (m3 = low half, m0 = high half). Every filtered output vector is
;     packusdw( (pmaddwd(m3, row) + pd_16) >> 5, (pmaddwd(m0, row) + pd_16) >> 5 )
; with `row` stepping through the 16-byte coefficient rows at r3 - 14*16 ... r3 + 14*16
; in steps of two rows. The sixteenth vector is an unfiltered load from [r2 + 4].
; Four vectors at a time are handed to TRANSPOSE_STORE, with r5 as the output pointer.
; Trailing "[n]" comments are the row index as annotated by the original authors
; (so [r3] itself is row 16 here); "[a b ...]" comments list the sample indices held in
; the register, highest word first.
; NOTE(review): r0/r1 look like destination pointer and stride, and r3 like a pointer into
; ang_table; all are set up by the caller, which is not visible here - confirm.
7794 cglobal ang16_mode_9_27
; Sets ZF from r6d. Nothing in this body reads the flags directly; presumably
; TRANSPOSE_STORE (macro defined elsewhere) branches on them - confirm.
7795 test r6d, r6d
7796 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7797 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
7798
; Interleave each sample with its successor so one pmaddwd lane blends one neighbour pair.
7799 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
7800 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
7801
; ---- first group of four vectors (m4, m2, m6, m7) ----
7802 mova m4, m3
7803 pmaddwd m4, [r3 - 14 * 16] ; [2]
7804 paddd m4, [pd_16]
7805 psrld m4, 5
7806 mova m2, m0
7807 pmaddwd m2, [r3 - 14 * 16]
7808 paddd m2, [pd_16]
7809 psrld m2, 5
7810 packusdw m4, m2
7811
7812 mova m2, m3
7813 pmaddwd m2, [r3 - 12 * 16] ; [4]
7814 paddd m2, [pd_16]
7815 psrld m2, 5
7816 mova m1, m0
7817 pmaddwd m1, [r3 - 12 * 16]
7818 paddd m1, [pd_16]
7819 psrld m1, 5
7820 packusdw m2, m1
7821
7822 mova m6, m3
7823 pmaddwd m6, [r3 - 10 *16] ; [6]
7824 paddd m6, [pd_16]
7825 psrld m6, 5
7826 mova m1, m0
7827 pmaddwd m1, [r3 - 10 * 16]
7828 paddd m1, [pd_16]
7829 psrld m1, 5
7830 packusdw m6, m1
7831
7832 mova m7, m3
7833 pmaddwd m7, [r3 - 8 * 16] ; [8]
7834 paddd m7, [pd_16]
7835 psrld m7, 5
7836 mova m1, m0
7837 pmaddwd m1, [r3 - 8 * 16]
7838 paddd m1, [pd_16]
7839 psrld m1, 5
7840 packusdw m7, m1
7841
7842 mov r5, r0 ; output pointer for the first group
7843
7844 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7845
; ---- second group of four vectors (m4, m2, m6, m7) ----
7846 mova m4, m3
7847 pmaddwd m4, [r3 - 6 * 16] ; [10]
7848 paddd m4, [pd_16]
7849 psrld m4, 5
7850 mova m1, m0
7851 pmaddwd m1, [r3 - 6 * 16]
7852 paddd m1, [pd_16]
7853 psrld m1, 5
7854 packusdw m4, m1
7855
7856 mova m2, m3
7857 pmaddwd m2, [r3 - 4 * 16] ; [12]
7858 paddd m2, [pd_16]
7859 psrld m2, 5
7860 mova m1, m0
7861 pmaddwd m1, [r3 - 4 * 16]
7862 paddd m1, [pd_16]
7863 psrld m1, 5
7864 packusdw m2, m1
7865
7866 mova m6, m3
7867 pmaddwd m6, [r3 - 2 * 16] ; [14]
7868 paddd m6, [pd_16]
7869 psrld m6, 5
7870 mova m7, m0
7871 pmaddwd m7, [r3 - 2 * 16]
7872 paddd m7, [pd_16]
7873 psrld m7, 5
7874 packusdw m6, m7
7875
7876 mova m7, m3
7877 pmaddwd m7, [r3] ; [16]
7878 paddd m7, [pd_16]
7879 psrld m7, 5
7880 mova m1, m0
7881 pmaddwd m1, [r3]
7882 paddd m1, [pd_16]
7883 psrld m1, 5
7884 packusdw m7, m1
7885
7886 lea r5, [r0 + r1 * 4] ; output pointer for the second group
7887
7888 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7889
; ---- third group of four vectors (m4, m2, m6, m7) ----
7890 mova m4, m3
7891 pmaddwd m4, [r3 + 2 *16] ; [18]
7892 paddd m4, [pd_16]
7893 psrld m4, 5
7894 mova m6, m0
7895 pmaddwd m6, [r3 + 2 * 16]
7896 paddd m6, [pd_16]
7897 psrld m6, 5
7898 packusdw m4, m6
7899
7900 mova m2, m3
7901 pmaddwd m2, [r3 + 4 * 16] ; [20]
7902 paddd m2, [pd_16]
7903 psrld m2, 5
7904 mova m6, m0
7905 pmaddwd m6, [r3 + 4 * 16]
7906 paddd m6, [pd_16]
7907 psrld m6, 5
7908 packusdw m2, m6
7909
7910 mova m6, m3
7911 pmaddwd m6, [r3 + 6 * 16] ; [22]
7912 paddd m6, [pd_16]
7913 psrld m6, 5
7914 mova m1, m0
7915 pmaddwd m1, [r3 + 6 * 16]
7916 paddd m1, [pd_16]
7917 psrld m1, 5
7918 packusdw m6, m1
7919
7920 mova m7, m3
7921 pmaddwd m7, [r3 + 8 * 16] ; [24]
7922 paddd m7, [pd_16]
7923 psrld m7, 5
7924 mova m1, m0
7925 pmaddwd m1, [r3 + 8 * 16]
7926 paddd m1, [pd_16]
7927 psrld m1, 5
7928 packusdw m7, m1
7929
7930 lea r5, [r5 + r1 * 4] ; advance output pointer for the third group
7931
7932 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7933
; ---- fourth group of four vectors (m4, m2, m3, m7) ----
7934 mova m4, m3
7935 pmaddwd m4, [r3 + 10 * 16] ; [26]
7936 paddd m4, [pd_16]
7937 psrld m4, 5
7938 mova m1, m0
7939 pmaddwd m1, [r3 + 10 * 16]
7940 paddd m1, [pd_16]
7941 psrld m1, 5
7942 packusdw m4, m1
7943
7944 mova m2, m3
7945 pmaddwd m2, [r3 + 12 * 16] ; [28]
7946 paddd m2, [pd_16]
7947 psrld m2, 5
7948 mova m1, m0
7949 pmaddwd m1, [r3 + 12 * 16]
7950 paddd m1, [pd_16]
7951 psrld m1, 5
7952 packusdw m2, m1
7953
; m3 / m0 are consumed in place here: this is their last use as pair registers.
7954 pmaddwd m3, [r3 + 14 * 16] ; [30]
7955 paddd m3, [pd_16]
7956 psrld m3, 5
7957 pmaddwd m0, [r3 + 14 * 16]
7958 paddd m0, [pd_16]
7959 psrld m0, 5
7960 packusdw m3, m0
7961
; Last vector of the group is loaded directly, with no filtering: same address as the
; [9 8 7 6 5 4 3 2] load at the top of the routine.
7962 movu m7, [r2 + 4]
7963
7964 lea r5, [r5 + r1 * 4] ; advance output pointer for the fourth group
7965
7966 TRANSPOSE_STORE m4, m2, m3, m7, m1, 24
7967
7968 ret
7969
; ang16_mode_11_25: 16-bit-sample angular intra prediction helper (modes 11/25 per the name).
; Reads reference samples through r2 (starting at sample 0) and builds one fixed set of
; adjacent-sample pairs (m3 = low half, m0 = high half). Every filtered output vector is
;     packusdw( (pmaddwd(m3, row) + pd_16) >> 5, (pmaddwd(m0, row) + pd_16) >> 5 )
; with `row` stepping through the 16-byte coefficient rows at r3 + 14*16 ... r3 - 14*16
; in steps of two rows (descending). The sixteenth vector is an unfiltered load from [r2].
; Four vectors at a time are handed to TRANSPOSE_STORE, with r5 as the output pointer.
; Trailing "[n]" comments are the row index as annotated by the original authors
; (so [r3] itself is row 16 here); "[a b ...]" comments list the sample indices held in
; the register, highest word first.
; NOTE(review): r0/r1 look like destination pointer and stride, and r3 like a pointer into
; ang_table; all are set up by the caller, which is not visible here - confirm.
7970 cglobal ang16_mode_11_25
; Sets ZF from r6d. Nothing in this body reads the flags directly; presumably
; TRANSPOSE_STORE (macro defined elsewhere) branches on them - confirm.
7971 test r6d, r6d
7972 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7973 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7974
; Interleave each sample with its successor so one pmaddwd lane blends one neighbour pair.
7975 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7976 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7977
; ---- first group of four vectors (m4, m2, m6, m7) ----
7978 mova m4, m3
7979 pmaddwd m4, [r3 + 14 * 16] ; [30]
7980 paddd m4, [pd_16]
7981 psrld m4, 5
7982 mova m2, m0
7983 pmaddwd m2, [r3 + 14 * 16]
7984 paddd m2, [pd_16]
7985 psrld m2, 5
7986 packusdw m4, m2
7987
7988 mova m2, m3
7989 pmaddwd m2, [r3 + 12 * 16] ; [28]
7990 paddd m2, [pd_16]
7991 psrld m2, 5
7992 mova m1, m0
7993 pmaddwd m1, [r3 + 12 * 16]
7994 paddd m1, [pd_16]
7995 psrld m1, 5
7996 packusdw m2, m1
7997
7998 mova m6, m3
7999 pmaddwd m6, [r3 + 10 *16] ; [26]
8000 paddd m6, [pd_16]
8001 psrld m6, 5
8002 mova m1, m0
8003 pmaddwd m1, [r3 + 10 * 16]
8004 paddd m1, [pd_16]
8005 psrld m1, 5
8006 packusdw m6, m1
8007
8008 mova m7, m3
8009 pmaddwd m7, [r3 + 8 * 16] ; [24]
8010 paddd m7, [pd_16]
8011 psrld m7, 5
8012 mova m1, m0
8013 pmaddwd m1, [r3 + 8 * 16]
8014 paddd m1, [pd_16]
8015 psrld m1, 5
8016 packusdw m7, m1
8017
8018 mov r5, r0 ; output pointer for the first group
8019
8020 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
8021
; ---- second group of four vectors (m4, m2, m6, m7) ----
8022 mova m4, m3
8023 pmaddwd m4, [r3 + 6 * 16] ; [22]
8024 paddd m4, [pd_16]
8025 psrld m4, 5
8026 mova m1, m0
8027 pmaddwd m1, [r3 + 6 * 16]
8028 paddd m1, [pd_16]
8029 psrld m1, 5
8030 packusdw m4, m1
8031
8032 mova m2, m3
8033 pmaddwd m2, [r3 + 4 * 16] ; [20]
8034 paddd m2, [pd_16]
8035 psrld m2, 5
8036 mova m1, m0
8037 pmaddwd m1, [r3 + 4 * 16]
8038 paddd m1, [pd_16]
8039 psrld m1, 5
8040 packusdw m2, m1
8041
8042 mova m6, m3
8043 pmaddwd m6, [r3 + 2 * 16] ; [18]
8044 paddd m6, [pd_16]
8045 psrld m6, 5
8046 mova m7, m0
8047 pmaddwd m7, [r3 + 2 * 16]
8048 paddd m7, [pd_16]
8049 psrld m7, 5
8050 packusdw m6, m7
8051
8052 mova m7, m3
8053 pmaddwd m7, [r3] ; [16]
8054 paddd m7, [pd_16]
8055 psrld m7, 5
8056 mova m1, m0
8057 pmaddwd m1, [r3]
8058 paddd m1, [pd_16]
8059 psrld m1, 5
8060 packusdw m7, m1
8061
8062 lea r5, [r0 + r1 * 4] ; output pointer for the second group
8063
8064 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
8065
; ---- third group of four vectors (m4, m2, m6, m7) ----
8066 mova m4, m3
8067 pmaddwd m4, [r3 - 2 *16] ; [14]
8068 paddd m4, [pd_16]
8069 psrld m4, 5
8070 mova m6, m0
8071 pmaddwd m6, [r3 - 2 * 16]
8072 paddd m6, [pd_16]
8073 psrld m6, 5
8074 packusdw m4, m6
8075
8076 mova m2, m3
8077 pmaddwd m2, [r3 - 4 * 16] ; [12]
8078 paddd m2, [pd_16]
8079 psrld m2, 5
8080 mova m6, m0
8081 pmaddwd m6, [r3 - 4 * 16]
8082 paddd m6, [pd_16]
8083 psrld m6, 5
8084 packusdw m2, m6
8085
8086 mova m6, m3
8087 pmaddwd m6, [r3 - 6 * 16] ; [10]
8088 paddd m6, [pd_16]
8089 psrld m6, 5
8090 mova m1, m0
8091 pmaddwd m1, [r3 - 6 * 16]
8092 paddd m1, [pd_16]
8093 psrld m1, 5
8094 packusdw m6, m1
8095
8096 mova m7, m3
8097 pmaddwd m7, [r3 - 8 * 16] ; [8]
8098 paddd m7, [pd_16]
8099 psrld m7, 5
8100 mova m1, m0
8101 pmaddwd m1, [r3 - 8 * 16]
8102 paddd m1, [pd_16]
8103 psrld m1, 5
8104 packusdw m7, m1
8105
8106 lea r5, [r5 + r1 * 4] ; advance output pointer for the third group
8107
8108 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
8109
; ---- fourth group of four vectors (m4, m2, m7, m3) ----
8110 mova m4, m3
8111 pmaddwd m4, [r3 - 10 * 16] ; [6]
8112 paddd m4, [pd_16]
8113 psrld m4, 5
8114 mova m1, m0
8115 pmaddwd m1, [r3 - 10 * 16]
8116 paddd m1, [pd_16]
8117 psrld m1, 5
8118 packusdw m4, m1
8119
8120 mova m2, m3
8121 pmaddwd m2, [r3 - 12 * 16] ; [4]
8122 paddd m2, [pd_16]
8123 psrld m2, 5
8124 mova m1, m0
8125 pmaddwd m1, [r3 - 12 * 16]
8126 paddd m1, [pd_16]
8127 psrld m1, 5
8128 packusdw m2, m1
8129
8130 mova m7, m3
8131 pmaddwd m7, [r3 - 14 * 16] ; [2]
8132 paddd m7, [pd_16]
8133 psrld m7, 5
8134 mova m1, m0
8135 pmaddwd m1, [r3 - 14 * 16]
8136 paddd m1, [pd_16]
8137 psrld m1, 5
8138 packusdw m7, m1
8139
; Last vector of the group is loaded directly, with no filtering: same address as the
; [7 6 5 4 3 2 1 0] load at the top of the routine. The pair data in m3 is no longer needed.
8140 movu m3, [r2]
8141
8142 lea r5, [r5 + r1 * 4] ; advance output pointer for the fourth group
8143
8144 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
8145
8146 ret
8147
;------------------------------------------------------------------------------
; ang16_mode_12_24 -- register-argument helper (declared without arg counts).
;
; Registers as used by the code below:
;   r0, r1 : only used to form r5 before each TRANSPOSE_STORE
;   r2     : nine consecutive words are loaded from here at entry
;   r3d    : tested at entry. No instruction visible in this routine consumes
;            the flags, and none modifies them afterwards.
;            NOTE(review): presumably TRANSPOSE_STORE branches on them --
;            confirm before inserting any flag-writing instruction.
;   r6     : base of 16-byte coefficient rows (addressed as r6 + k * 16)
;   m5     : read but never loaded here, so the caller must provide it
;
; Bracketed numbers in the row comments are the original author's annotations
; (presumably the ang_table row each operand selects -- TODO confirm r6 base).
;------------------------------------------------------------------------------

; ANG16_12_24_ROW dst, tmp, coef
;   dst = packusdw((pmaddwd(m3, coef) + pd_16) >> 5,
;                  (pmaddwd(m0, coef) + pd_16) >> 5);  tmp is clobbered
%macro ANG16_12_24_ROW 3
    mova        %1,         m3
    pmaddwd     %1,         %3
    paddd       %1,         [pd_16]
    psrld       %1,         5
    mova        %2,         m0
    pmaddwd     %2,         %3
    paddd       %2,         [pd_16]
    psrld       %2,         5
    packusdw    %1,         %2
%endmacro

; ANG16_12_24_SLIDE src
;   Move the m0:m3 window up by one dword; the top dword of src enters m3.
%macro ANG16_12_24_SLIDE 1
    palignr     m0,         m3, 12
    palignr     m3,         %1, 12
%endmacro

cglobal ang16_mode_12_24
    test        r3d,        r3d
    movu        m0,         [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1,         [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    punpcklwd   m3,         m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0,         m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- rows stored with TRANSPOSE_STORE index 0 ----
    ANG16_12_24_ROW m4, m2, [r6 + 11 * 16]              ; [27]
    ANG16_12_24_ROW m2, m1, [r6 + 6 * 16]               ; [22]
    ANG16_12_24_ROW m6, m1, [r6 + 1 * 16]               ; [17]
    ANG16_12_24_ROW m7, m1, [r6 - 4 * 16]               ; [12]

    mov         r5,         r0

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 0

    ; ---- rows stored with TRANSPOSE_STORE index 8 ----
    ANG16_12_24_ROW m4, m1, [r6 - 9 * 16]               ; [7]
    ANG16_12_24_ROW m2, m1, [r6 - 14 * 16]              ; [2]

    ANG16_12_24_SLIDE m5

    ANG16_12_24_ROW m6, m7, [r6 + 13 * 16]              ; [29]
    ANG16_12_24_ROW m7, m1, [r6 + 8 * 16]               ; [24]

    lea         r5,         [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 8

    ; ---- rows stored with TRANSPOSE_STORE index 16 ----
    ANG16_12_24_ROW m4, m6, [r6 + 3 * 16]               ; [19]
    ANG16_12_24_ROW m2, m6, [r6 - 2 * 16]               ; [14]
    ANG16_12_24_ROW m6, m1, [r6 - 7 * 16]               ; [9]
    ANG16_12_24_ROW m7, m1, [r6 - 12 * 16]              ; [4]

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 16

    ; ---- rows stored with TRANSPOSE_STORE index 24 ----
    pslldq      m5,         2                           ; expose the next word pair at the top of m5
    ANG16_12_24_SLIDE m5

    ANG16_12_24_ROW m4, m1, [r6 + 15 * 16]              ; [31]
    ANG16_12_24_ROW m2, m1, [r6 + 10 * 16]              ; [26]
    ANG16_12_24_ROW m7, m1, [r6 + 5 * 16]               ; [21]

    ; last row is computed in place: m3/m0 are not needed afterwards
    pmaddwd     m3,         [r6]                        ; [16]
    paddd       m3,         [pd_16]
    psrld       m3,         5
    pmaddwd     m0,         [r6]
    paddd       m0,         [pd_16]
    psrld       m0,         5
    packusdw    m3,         m0

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m7, m3, m1, 24

    ret
8338
;------------------------------------------------------------------------------
; ang16_mode_13_23 -- register-argument helper (declared without arg counts).
;
; Registers as used by the code below:
;   r0, r1 : only used to form r5 before each TRANSPOSE_STORE
;   r2     : nine consecutive words are loaded from here at entry
;   r3d    : tested at entry; nothing after the test modifies the flags.
;            NOTE(review): presumably TRANSPOSE_STORE branches on them --
;            confirm before inserting any flag-writing instruction.
;   r6     : base of 16-byte coefficient rows (addressed as r6 + k * 16)
;   m5     : read and shifted here but never loaded; the caller provides it
;
; Bracketed numbers in the row comments are the original author's annotations
; (presumably the ang_table row each operand selects -- TODO confirm r6 base).
;------------------------------------------------------------------------------

; ANG16_13_23_ROW dst, tmp, coef
;   dst = packusdw((pmaddwd(m3, coef) + pd_16) >> 5,
;                  (pmaddwd(m0, coef) + pd_16) >> 5);  tmp is clobbered
%macro ANG16_13_23_ROW 3
    mova        %1,         m3
    pmaddwd     %1,         %3
    paddd       %1,         [pd_16]
    psrld       %1,         5
    mova        %2,         m0
    pmaddwd     %2,         %3
    paddd       %2,         [pd_16]
    psrld       %2,         5
    packusdw    %1,         %2
%endmacro

; ANG16_13_23_SLIDE src
;   Move the m0:m3 window up by one dword; the top dword of src enters m3.
%macro ANG16_13_23_SLIDE 1
    palignr     m0,         m3, 12
    palignr     m3,         %1, 12
%endmacro

cglobal ang16_mode_13_23
    test        r3d,        r3d
    movu        m0,         [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1,         [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    punpcklwd   m3,         m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0,         m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- rows stored with TRANSPOSE_STORE index 0 ----
    ANG16_13_23_ROW m4, m2, [r6 + 8 * 16]               ; [23]
    ANG16_13_23_ROW m2, m1, [r6 - 1 * 16]               ; [14]
    ANG16_13_23_ROW m6, m1, [r6 - 10 * 16]              ; [5]

    ANG16_13_23_SLIDE m5

    ANG16_13_23_ROW m7, m1, [r6 + 13 * 16]              ; [28]

    mov         r5,         r0

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 0

    ; ---- rows stored with TRANSPOSE_STORE index 8 ----
    ANG16_13_23_ROW m4, m1, [r6 + 4 * 16]               ; [19]
    ANG16_13_23_ROW m2, m1, [r6 - 5 * 16]               ; [10]
    ANG16_13_23_ROW m6, m7, [r6 - 14 * 16]              ; [1]

    pslldq      m5,         2
    ANG16_13_23_SLIDE m5

    ANG16_13_23_ROW m7, m1, [r6 + 9 * 16]               ; [24]

    lea         r5,         [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 8

    ; ---- rows stored with TRANSPOSE_STORE index 16 ----
    ANG16_13_23_ROW m4, m6, [r6]                        ; [15]
    ANG16_13_23_ROW m2, m6, [r6 - 9 * 16]               ; [6]

    pslldq      m5,         2
    ANG16_13_23_SLIDE m5

    ANG16_13_23_ROW m6, m1, [r6 + 14 * 16]              ; [29]
    ANG16_13_23_ROW m7, m1, [r6 + 5 * 16]               ; [20]

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 16

    ; ---- rows stored with TRANSPOSE_STORE index 24 ----
    ANG16_13_23_ROW m4, m1, [r6 - 4 * 16]               ; [11]
    ANG16_13_23_ROW m2, m1, [r6 - 13 * 16]              ; [2]

    pslldq      m5,         2
    ANG16_13_23_SLIDE m5

    ANG16_13_23_ROW m7, m1, [r6 + 10 * 16]              ; [25]

    ; last row is computed in place: m3/m0 are not needed afterwards
    pmaddwd     m3,         [r6 + 1 * 16]               ; [16]
    paddd       m3,         [pd_16]
    psrld       m3,         5
    pmaddwd     m0,         [r6 + 1 * 16]
    paddd       m0,         [pd_16]
    psrld       m0,         5
    packusdw    m3,         m0

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m7, m3, m1, 24

    ret
8537
;------------------------------------------------------------------------------
; ang16_mode_14_22 -- register-argument helper (declared without arg counts).
;
; Registers as used by the code below:
;   r0, r1 : only used to form r5 before each TRANSPOSE_STORE
;   r2     : nine consecutive words are loaded from here at entry
;   r3d    : tested at entry; nothing after the test modifies the flags.
;            NOTE(review): presumably TRANSPOSE_STORE branches on them --
;            confirm before inserting any flag-writing instruction.
;   r6     : base of 16-byte coefficient rows (addressed as r6 + k * 16)
;   m5     : read and shifted here but never loaded; the caller provides it
;
; Bracketed numbers in the row comments are the original author's annotations
; (presumably the ang_table row each operand selects -- TODO confirm r6 base).
;------------------------------------------------------------------------------

; ANG16_14_22_ROW dst, tmp, coef
;   dst = packusdw((pmaddwd(m3, coef) + pd_16) >> 5,
;                  (pmaddwd(m0, coef) + pd_16) >> 5);  tmp is clobbered
%macro ANG16_14_22_ROW 3
    mova        %1,         m3
    pmaddwd     %1,         %3
    paddd       %1,         [pd_16]
    psrld       %1,         5
    mova        %2,         m0
    pmaddwd     %2,         %3
    paddd       %2,         [pd_16]
    psrld       %2,         5
    packusdw    %1,         %2
%endmacro

; ANG16_14_22_SLIDE src
;   Move the m0:m3 window up by one dword; the top dword of src enters m3.
%macro ANG16_14_22_SLIDE 1
    palignr     m0,         m3, 12
    palignr     m3,         %1, 12
%endmacro

cglobal ang16_mode_14_22
    test        r3d,        r3d
    movu        m0,         [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1,         [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    punpcklwd   m3,         m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0,         m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- rows stored with TRANSPOSE_STORE index 0 ----
    ANG16_14_22_ROW m4, m2, [r6 + 1 * 16]               ; [19]
    ANG16_14_22_ROW m2, m1, [r6 - 12 * 16]              ; [6]

    ANG16_14_22_SLIDE m5

    ANG16_14_22_ROW m6, m1, [r6 + 7 * 16]               ; [25]
    ANG16_14_22_ROW m7, m1, [r6 - 6 * 16]               ; [12]

    mov         r5,         r0

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 0

    ; ---- rows stored with TRANSPOSE_STORE index 8 ----
    pslldq      m5,         2
    ANG16_14_22_SLIDE m5

    ANG16_14_22_ROW m4, m1, [r6 + 13 * 16]              ; [31]
    ANG16_14_22_ROW m2, m1, [r6]                        ; [18]
    ANG16_14_22_ROW m6, m7, [r6 - 13 * 16]              ; [5]

    pslldq      m5,         2
    ANG16_14_22_SLIDE m5

    ANG16_14_22_ROW m7, m1, [r6 + 6 * 16]               ; [24]

    lea         r5,         [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 8

    ; ---- rows stored with TRANSPOSE_STORE index 16 ----
    ANG16_14_22_ROW m4, m6, [r6 - 7 * 16]               ; [11]

    pslldq      m5,         2
    ANG16_14_22_SLIDE m5

    ANG16_14_22_ROW m2, m6, [r6 + 12 * 16]              ; [30]
    ANG16_14_22_ROW m6, m1, [r6 - 1 * 16]               ; [17]
    ANG16_14_22_ROW m7, m1, [r6 - 14 * 16]              ; [4]

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 16

    ; ---- rows stored with TRANSPOSE_STORE index 24 ----
    pslldq      m5,         2
    ANG16_14_22_SLIDE m5

    ANG16_14_22_ROW m4, m1, [r6 + 5 * 16]               ; [23]
    ANG16_14_22_ROW m2, m1, [r6 - 8 * 16]               ; [10]

    pslldq      m5,         2
    ANG16_14_22_SLIDE m5

    ANG16_14_22_ROW m7, m1, [r6 + 11 * 16]              ; [29]

    ; last row is computed in place: m3/m0 are not needed afterwards
    pmaddwd     m3,         [r6 - 2 * 16]               ; [16]
    paddd       m3,         [pd_16]
    psrld       m3,         5
    pmaddwd     m0,         [r6 - 2 * 16]
    paddd       m0,         [pd_16]
    psrld       m0,         5
    packusdw    m3,         m0

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m7, m3, m1, 24

    ret
8744
;------------------------------------------------------------------------------
; ang16_mode_15_21 -- register-argument helper (declared without arg counts).
;
; Registers as used by the code below:
;   r0, r1 : only used to form r5 before each TRANSPOSE_STORE
;   r2     : nine consecutive words are loaded from here at entry
;   r3d    : tested at entry; nothing after the test modifies the flags.
;            NOTE(review): presumably TRANSPOSE_STORE branches on them --
;            confirm before inserting any flag-writing instruction.
;   r6     : base of 16-byte coefficient rows (addressed as r6 + k * 16)
;   m5     : read and shifted here but never loaded; the caller provides it
;
; Bracketed numbers in the row comments are the original author's annotations
; (presumably the ang_table row each operand selects -- TODO confirm r6 base).
;------------------------------------------------------------------------------

; ANG16_15_21_ROW dst, tmp, coef
;   dst = packusdw((pmaddwd(m3, coef) + pd_16) >> 5,
;                  (pmaddwd(m0, coef) + pd_16) >> 5);  tmp is clobbered
%macro ANG16_15_21_ROW 3
    mova        %1,         m3
    pmaddwd     %1,         %3
    paddd       %1,         [pd_16]
    psrld       %1,         5
    mova        %2,         m0
    pmaddwd     %2,         %3
    paddd       %2,         [pd_16]
    psrld       %2,         5
    packusdw    %1,         %2
%endmacro

; ANG16_15_21_SLIDE src
;   Move the m0:m3 window up by one dword; the top dword of src enters m3.
%macro ANG16_15_21_SLIDE 1
    palignr     m0,         m3, 12
    palignr     m3,         %1, 12
%endmacro

cglobal ang16_mode_15_21
    test        r3d,        r3d
    movu        m0,         [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1,         [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    ; m6 = m5 moved down one word with word 0 of m0 on top; its top dword
    ; (last word of m5, first word of m0) feeds the first window slide.
    palignr     m6,         m0, m5, 2

    punpcklwd   m3,         m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0,         m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- rows stored with TRANSPOSE_STORE index 0 ----
    ANG16_15_21_ROW m4, m2, [r6]                        ; [15]

    ANG16_15_21_SLIDE m6

    ANG16_15_21_ROW m2, m1, [r6 + 15 * 16]              ; [30]
    ANG16_15_21_ROW m6, m1, [r6 - 2 * 16]               ; [13]

    ANG16_15_21_SLIDE m5

    ANG16_15_21_ROW m7, m1, [r6 + 13 * 16]              ; [28]

    mov         r5,         r0

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 0

    ; ---- rows stored with TRANSPOSE_STORE index 8 ----
    ANG16_15_21_ROW m4, m1, [r6 - 4 * 16]               ; [11]

    pslldq      m5,         2
    ANG16_15_21_SLIDE m5

    ANG16_15_21_ROW m2, m1, [r6 + 11 * 16]              ; [26]
    ANG16_15_21_ROW m6, m7, [r6 - 6 * 16]               ; [9]

    pslldq      m5,         2
    ANG16_15_21_SLIDE m5

    ANG16_15_21_ROW m7, m1, [r6 + 9 * 16]               ; [24]

    lea         r5,         [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 8

    ; ---- rows stored with TRANSPOSE_STORE index 16 ----
    ANG16_15_21_ROW m4, m6, [r6 - 8 * 16]               ; [7]

    pslldq      m5,         2
    ANG16_15_21_SLIDE m5

    ANG16_15_21_ROW m2, m6, [r6 + 7 * 16]               ; [22]
    ANG16_15_21_ROW m6, m1, [r6 - 10 * 16]              ; [5]

    pslldq      m5,         2
    ANG16_15_21_SLIDE m5

    ANG16_15_21_ROW m7, m1, [r6 + 5 * 16]               ; [20]

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 16

    ; ---- rows stored with TRANSPOSE_STORE index 24 ----
    ANG16_15_21_ROW m4, m1, [r6 - 12 * 16]              ; [3]

    pslldq      m5,         2
    ANG16_15_21_SLIDE m5

    ANG16_15_21_ROW m2, m1, [r6 + 3 * 16]               ; [18]
    ANG16_15_21_ROW m7, m1, [r6 - 14 * 16]              ; [1]

    pslldq      m5,         2
    ANG16_15_21_SLIDE m5

    ; last row is computed in place: m3/m0 are not needed afterwards
    pmaddwd     m3,         [r6 + 1 * 16]               ; [16]
    paddd       m3,         [pd_16]
    psrld       m3,         5
    pmaddwd     m0,         [r6 + 1 * 16]
    paddd       m0,         [pd_16]
    psrld       m0,         5
    packusdw    m3,         m0

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m7, m3, m1, 24

    ret
8960
;------------------------------------------------------------------------------
; ang16_mode_16_20 -- register-argument helper (declared without arg counts).
;
; Registers as used by the code below:
;   r0, r1 : used to form r5 before each TRANSPOSE_STORE; r1 also seeds r4
;   r2     : nine consecutive words are loaded from here at entry
;   r3     : address of 16 bytes loaded into m5 part-way through
;   r4d    : tested at entry, then r4 is overwritten with r1 * 3 using lea,
;            which leaves the flags untouched; nothing later modifies them.
;            NOTE(review): presumably TRANSPOSE_STORE branches on them --
;            confirm before inserting any flag-writing instruction.
;   r6     : base of 16-byte coefficient rows (addressed as r6 + k * 16)
;   m5     : consumed from the caller first, reloaded from [r3] later
;
; Bracketed numbers in the row comments are the original author's annotations
; (presumably the ang_table row each operand selects -- TODO confirm r6 base).
;------------------------------------------------------------------------------

; ANG16_16_20_ROW dst, tmp, coef
;   dst = packusdw((pmaddwd(m3, coef) + pd_16) >> 5,
;                  (pmaddwd(m0, coef) + pd_16) >> 5);  tmp is clobbered
%macro ANG16_16_20_ROW 3
    mova        %1,         m3
    pmaddwd     %1,         %3
    paddd       %1,         [pd_16]
    psrld       %1,         5
    mova        %2,         m0
    pmaddwd     %2,         %3
    paddd       %2,         [pd_16]
    psrld       %2,         5
    packusdw    %1,         %2
%endmacro

; ANG16_16_20_SLIDE src
;   Move the m0:m3 window up by one dword; the top dword of src enters m3.
%macro ANG16_16_20_SLIDE 1
    palignr     m0,         m3, 12
    palignr     m3,         %1, 12
%endmacro

cglobal ang16_mode_16_20
    test        r4d,        r4d
    lea         r4,         [r1 * 3]                    ; must stay an lea: flags are still live
    movu        m0,         [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1,         [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    ; m6 = m5 moved down one word with word 0 of m0 on top; its top dword
    ; (last word of m5, first word of m0) feeds the first window slide.
    palignr     m6,         m0, m5, 2

    punpcklwd   m3,         m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0,         m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- rows stored with TRANSPOSE_STORE index 0 ----
    ANG16_16_20_ROW m4, m2, [r6 - 2 * 16]               ; [11]

    ANG16_16_20_SLIDE m6

    ANG16_16_20_ROW m2, m1, [r6 + 9 * 16]               ; [22]
    ANG16_16_20_ROW m6, m1, [r6 - 12 * 16]              ; [1]

    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m7, m1, [r6 - 1 * 16]               ; [12]

    mov         r5,         r0

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 0

    ; ---- rows stored with TRANSPOSE_STORE index 8 ----
    pslldq      m5,         2
    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m4, m1, [r6 + 10 * 16]              ; [23]
    ANG16_16_20_ROW m2, m1, [r6 - 11 * 16]              ; [2]

    pslldq      m5,         2
    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m6, m7, [r6]                        ; [13]

    pslldq      m5,         2
    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m7, m1, [r6 + 11 * 16]              ; [24]

    lea         r5,         [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 8

    ; ---- rows stored with TRANSPOSE_STORE index 16 ----
    ANG16_16_20_ROW m4, m6, [r6 - 10 * 16]              ; [3]

    pslldq      m5,         2
    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m2, m6, [r6 + 1 * 16]               ; [14]

    pslldq      m5,         2
    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m6, m1, [r6 + 12 * 16]              ; [25]
    ANG16_16_20_ROW m7, m1, [r6 - 9 * 16]               ; [4]

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 16

    ; ---- rows stored with TRANSPOSE_STORE index 24 ----
    pslldq      m5,         2
    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m4, m1, [r6 + 2 * 16]               ; [15]

    ; refill m5 from [r3], reordered through the pw_ang8_16 shuffle mask
    movu        m5,         [r3]
    pshufb      m5,         [pw_ang8_16]

    ANG16_16_20_SLIDE m5

    ANG16_16_20_ROW m2, m1, [r6 + 13 * 16]              ; [26]
    ANG16_16_20_ROW m7, m1, [r6 - 8 * 16]               ; [5]

    pslldq      m5,         2
    ANG16_16_20_SLIDE m5

    ; last row is computed in place: m3/m0 are not needed afterwards
    pmaddwd     m3,         [r6 + 3 * 16]               ; [16]
    paddd       m3,         [pd_16]
    psrld       m3,         5
    pmaddwd     m0,         [r6 + 3 * 16]
    paddd       m0,         [pd_16]
    psrld       m0,         5
    packusdw    m3,         m0

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m7, m3, m1, 24

    ret
9187
;------------------------------------------------------------------------------
; ang16_mode_17_19 -- register-argument helper (declared without arg counts).
;
; Registers as used by the code below:
;   r0, r1 : used to form r5 before each TRANSPOSE_STORE; r1 also seeds r4
;   r2     : nine consecutive words are loaded from here at entry
;   r3     : address of 16 bytes loaded into m5 part-way through
;   r4d    : tested at entry, then r4 is overwritten with r1 * 3 using lea,
;            which leaves the flags untouched; nothing later modifies them.
;            NOTE(review): presumably TRANSPOSE_STORE branches on them --
;            confirm before inserting any flag-writing instruction.
;   r6     : base of 16-byte coefficient rows (addressed as r6 + k * 16)
;   m5     : consumed from the caller first, reloaded from [r3] later
;
; Bracketed numbers in the row comments are the original author's annotations
; (presumably the ang_table row each operand selects -- TODO confirm r6 base).
;------------------------------------------------------------------------------

; ANG16_17_19_ROW dst, tmp, coef
;   dst = packusdw((pmaddwd(m3, coef) + pd_16) >> 5,
;                  (pmaddwd(m0, coef) + pd_16) >> 5);  tmp is clobbered
%macro ANG16_17_19_ROW 3
    mova        %1,         m3
    pmaddwd     %1,         %3
    paddd       %1,         [pd_16]
    psrld       %1,         5
    mova        %2,         m0
    pmaddwd     %2,         %3
    paddd       %2,         [pd_16]
    psrld       %2,         5
    packusdw    %1,         %2
%endmacro

; ANG16_17_19_SLIDE src
;   Move the m0:m3 window up by one dword; the top dword of src enters m3.
%macro ANG16_17_19_SLIDE 1
    palignr     m0,         m3, 12
    palignr     m3,         %1, 12
%endmacro

cglobal ang16_mode_17_19
    test        r4d,        r4d
    lea         r4,         [r1 * 3]                    ; must stay an lea: flags are still live
    movu        m0,         [r2]                        ; [7 6 5 4 3 2 1 0]
    movu        m1,         [r2 + 2]                    ; [8 7 6 5 4 3 2 1]

    ; m6 = m5 moved down one word with word 0 of m0 on top; its top dword
    ; (last word of m5, first word of m0) feeds the first window slide.
    palignr     m6,         m0, m5, 2

    punpcklwd   m3,         m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpckhwd   m0,         m1                          ; [8 7 7 6 6 5 5 4]

    ; ---- rows stored with TRANSPOSE_STORE index 0 ----
    ANG16_17_19_ROW m4, m2, [r6 - 10 * 16]              ; [6]

    ANG16_17_19_SLIDE m6

    ANG16_17_19_ROW m2, m1, [r6 - 4 * 16]               ; [12]

    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m6, m1, [r6 + 2 * 16]               ; [18]

    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m7, m1, [r6 + 8 * 16]               ; [24]

    mov         r5,         r0

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 0

    ; ---- rows stored with TRANSPOSE_STORE index 8 ----
    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m4, m1, [r6 + 14 * 16]              ; [30]
    ANG16_17_19_ROW m2, m1, [r6 - 12 * 16]              ; [4]

    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m6, m7, [r6 - 6 * 16]               ; [10]

    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m7, m1, [r6]                        ; [16]

    lea         r5,         [r0 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 8

    ; ---- rows stored with TRANSPOSE_STORE index 16 ----
    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m4, m6, [r6 + 6 * 16]               ; [22]

    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m2, m6, [r6 + 12 * 16]              ; [28]
    ANG16_17_19_ROW m6, m1, [r6 - 14 * 16]              ; [2]

    ; refill m5 from [r3], reordered through the pw_ang8_17 shuffle mask
    movu        m5,         [r3]
    pshufb      m5,         [pw_ang8_17]

    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m7, m1, [r6 - 8 * 16]               ; [8]

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m6, m7, m1, 16

    ; ---- rows stored with TRANSPOSE_STORE index 24 ----
    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m4, m1, [r6 - 2 * 16]               ; [14]

    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m2, m1, [r6 + 4 * 16]               ; [20]

    pslldq      m5,         2
    ANG16_17_19_SLIDE m5

    ANG16_17_19_ROW m7, m1, [r6 + 10 * 16]              ; [26]

    ; last row is computed in place: m3/m0 are not needed afterwards
    ; (the original carries no bracketed annotation for this row)
    pmaddwd     m3,         [r6 - 16 * 16]
    paddd       m3,         [pd_16]
    psrld       m3,         5
    pmaddwd     m0,         [r6 - 16 * 16]
    paddd       m0,         [pd_16]
    psrld       m0,         5
    packusdw    m3,         m0

    lea         r5,         [r5 + r1 * 4]

    TRANSPOSE_STORE m4, m2, m7, m3, m1, 24

    ret
9422
9423 ;------------------------------------------------------------------------------------------
9424 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
9425 ;------------------------------------------------------------------------------------------
INIT_XMM ssse3

; ANG16_2_ROW dst, lo, mid, hi, shift
;   Write one 32-byte row at [dst]: the 48-byte run hi:mid:lo read starting
;   `shift` bytes in.  m3 and m4 are scratch.
%macro ANG16_2_ROW 5
    palignr     m3,         %3, %2, %5
    palignr     m4,         %4, %3, %5
    movu        [%1],       m3
    movu        [%1 + 16],  m4
%endmacro

;------------------------------------------------------------------------------
; intra_pred_ang16_2(dst, dstStride, src, dirMode, ...)
;
; Pure copy, no filtering: output row y (y = 0..15) receives the 32 bytes found
; at base + 4 + 2 * y, where base = src when dirMode (r3m) is 34 and
; base = src + 64 otherwise.  Rows are 2 * dstStride bytes apart.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_2, 3,5,5
    lea         r4,         [r2 + 64]
    cmp         r3m,        byte 34
    cmovne      r2,         r4                          ; keep src for mode 34, else src + 64
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r3,         [r1 * 3]                    ; r3 = 3 * r1 (dirMode already consumed)
    movu        m0,         [r2 + 4]
    movu        m1,         [r2 + 20]
    movu        m2,         [r2 + 36]

    ; rows 0-3
    movu        [r0],       m0
    movu        [r0 + 16],  m1
    ANG16_2_ROW r0 + r1,     m0, m1, m2, 2
    ANG16_2_ROW r0 + r1 * 2, m0, m1, m2, 4
    ANG16_2_ROW r0 + r3,     m0, m1, m2, 6

    ; rows 4-7
    lea         r0,         [r0 + r1 * 4]
    ANG16_2_ROW r0,          m0, m1, m2, 8
    ANG16_2_ROW r0 + r1,     m0, m1, m2, 10
    ANG16_2_ROW r0 + r1 * 2, m0, m1, m2, 12
    ANG16_2_ROW r0 + r3,     m0, m1, m2, 14

    ; rows 8-11: the run advances by 16 bytes, m0 is recycled as its new top part
    movu        m0,         [r2 + 52]
    lea         r0,         [r0 + r1 * 4]
    movu        [r0],       m1
    movu        [r0 + 16],  m2
    ANG16_2_ROW r0 + r1,     m1, m2, m0, 2
    ANG16_2_ROW r0 + r1 * 2, m1, m2, m0, 4
    ANG16_2_ROW r0 + r3,     m1, m2, m0, 6

    ; rows 12-15
    lea         r0,         [r0 + r1 * 4]
    ANG16_2_ROW r0,          m1, m2, m0, 8
    ANG16_2_ROW r0 + r1,     m1, m2, m0, 10
    ANG16_2_ROW r0 + r1 * 2, m1, m2, m0, 12
    ANG16_2_ROW r0 + r3,     m1, m2, m0, 14
    RET
9506
INIT_XMM sse4
;------------------------------------------------------------------------------
; intra_pred_ang16_3(dst, dstStride, src)
; Calls ang16_mode_3_33 twice with r6d = 0, reading from src + 64 bytes.
; For the second call the source moves on 16 bytes and dst moves down
; 8 * (2 * dstStride) bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_3, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 16 * 16]       ; r3 -> 16-byte row 16 of ang_table
    add         r2,         64
    xor         r6d,        r6d                         ; r6d = 0

    call        ang16_mode_3_33

    add         r2,         16
    lea         r0,         [r0 + r1 * 8]

    call        ang16_mode_3_33
    RET
9522
;------------------------------------------------------------------------------
; intra_pred_ang16_33(dst, dstStride, src)
; Calls ang16_mode_3_33 twice with r6d = 1, reading from src itself.
; For the second call both the source and dst move on by 16 bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_33, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 16 * 16]       ; r3 -> 16-byte row 16 of ang_table
    mov         r6d,        1                           ; r6d = 1

    call        ang16_mode_3_33

    add         r2,         16
    add         r0,         16

    call        ang16_mode_3_33
    RET
9537
;------------------------------------------------------------------------------
; intra_pred_ang16_4(dst, dstStride, src)
; Calls ang16_mode_4_32 twice with r6d = 0, reading from src + 64 bytes.
; For the second call the source moves on 16 bytes and dst moves down
; 8 * (2 * dstStride) bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_4, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 18 * 16]       ; r3 -> 16-byte row 18 of ang_table
    add         r2,         64
    xor         r6d,        r6d                         ; r6d = 0

    call        ang16_mode_4_32

    add         r2,         16
    lea         r0,         [r0 + r1 * 8]

    call        ang16_mode_4_32
    RET
9552
;------------------------------------------------------------------------------
; intra_pred_ang16_32(dst, dstStride, src)
; Calls ang16_mode_4_32 twice with r6d = 1, reading from src itself.
; For the second call both the source and dst move on by 16 bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_32, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 18 * 16]       ; r3 -> 16-byte row 18 of ang_table
    mov         r6d,        1                           ; r6d = 1

    call        ang16_mode_4_32

    add         r2,         16
    add         r0,         16

    call        ang16_mode_4_32
    RET
9567
;------------------------------------------------------------------------------
; intra_pred_ang16_5(dst, dstStride, src)
; Calls ang16_mode_5_31 twice with r6d = 0, reading from src + 64 bytes.
; For the second call the source moves on 16 bytes and dst moves down
; 8 * (2 * dstStride) bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_5, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 16 * 16]       ; r3 -> 16-byte row 16 of ang_table
    add         r2,         64
    xor         r6d,        r6d                         ; r6d = 0

    call        ang16_mode_5_31

    add         r2,         16
    lea         r0,         [r0 + r1 * 8]

    call        ang16_mode_5_31
    RET
9582
;------------------------------------------------------------------------------
; intra_pred_ang16_31(dst, dstStride, src)
; Calls ang16_mode_5_31 twice with r6d = 1, reading from src itself.
; For the second call both the source and dst move on by 16 bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_31, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 16 * 16]       ; r3 -> 16-byte row 16 of ang_table
    mov         r6d,        1                           ; r6d = 1

    call        ang16_mode_5_31

    add         r2,         16
    add         r0,         16

    call        ang16_mode_5_31
    RET
9597
;------------------------------------------------------------------------------
; intra_pred_ang16_6(dst, dstStride, src)
; Calls ang16_mode_6_30 twice with r6d = 0, reading from src + 64 bytes.
; For the second call the source moves on 16 bytes and dst moves down
; 8 * (2 * dstStride) bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_6, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 15 * 16]       ; r3 -> 16-byte row 15 of ang_table
    add         r2,         64
    xor         r6d,        r6d                         ; r6d = 0

    call        ang16_mode_6_30

    add         r2,         16
    lea         r0,         [r0 + r1 * 8]

    call        ang16_mode_6_30
    RET
9612
;------------------------------------------------------------------------------
; intra_pred_ang16_30(dst, dstStride, src)
; Calls ang16_mode_6_30 twice with r6d = 1, reading from src itself.
; For the second call both the source and dst move on by 16 bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_30, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 15 * 16]       ; r3 -> 16-byte row 15 of ang_table
    mov         r6d,        1                           ; r6d = 1

    call        ang16_mode_6_30

    add         r2,         16
    add         r0,         16

    call        ang16_mode_6_30
    RET
9627
;------------------------------------------------------------------------------
; intra_pred_ang16_7(dst, dstStride, src)
; Calls ang16_mode_7_29 twice with r6d = 0, reading from src + 64 bytes.
; For the second call the source moves on 16 bytes and dst moves down
; 8 * (2 * dstStride) bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_7, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 17 * 16]       ; r3 -> 16-byte row 17 of ang_table
    add         r2,         64
    xor         r6d,        r6d                         ; r6d = 0

    call        ang16_mode_7_29

    add         r2,         16
    lea         r0,         [r0 + r1 * 8]

    call        ang16_mode_7_29
    RET
9642
;------------------------------------------------------------------------------
; intra_pred_ang16_29(dst, dstStride, src)
; Calls ang16_mode_7_29 twice with r6d = 1, reading from src itself.
; For the second call both the source and dst move on by 16 bytes.
;------------------------------------------------------------------------------
cglobal intra_pred_ang16_29, 3,7,8
    add         r1,         r1                          ; r1 = 2 * dstStride
    lea         r4,         [r1 * 3]                    ; r4 = 3 * r1
    lea         r3,         [ang_table + 17 * 16]       ; r3 -> 16-byte row 17 of ang_table
    mov         r6d,        1                           ; r6d = 1

    call        ang16_mode_7_29

    add         r2,         16
    add         r0,         16

    call        ang16_mode_7_29
    RET
9657
; intra_pred_ang16_8: entry point that runs the shared ang16_mode_8_28 helper twice
; with r6d = 0 and r2 moved 64 bytes into the source buffer.
; The helper is defined outside this chunk; its register contract is not visible here.
9658 cglobal intra_pred_ang16_8, 3,7,8
9659 add r2, 64 ; read from the part of the source buffer starting 64 bytes in
9660 xor r6d, r6d ; r6d = 0: selector handed to the helper
9661 lea r3, [ang_table + 15 * 16] ; r3 -> entry 15 of ang_table (entries are 16 bytes each)
9662 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9663 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9664
9665 call ang16_mode_8_28
9666
9667 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9668 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
9669
9670 call ang16_mode_8_28
9671 RET
9672
; intra_pred_ang16_28: entry point that runs the shared ang16_mode_8_28 helper twice
; with r6d = 1 and r2 left at the start of the source buffer.
; The helper is defined outside this chunk; its register contract is not visible here.
9673 cglobal intra_pred_ang16_28, 3,7,8
9674 xor r6d, r6d
9675 inc r6d ; r6d = 1: selector handed to the helper
9676 lea r3, [ang_table + 15 * 16] ; r3 -> entry 15 of ang_table (entries are 16 bytes each)
9677 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9678 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9679
9680 call ang16_mode_8_28
9681
9682 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9683 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
9684
9685 call ang16_mode_8_28
9686 RET
9687
; intra_pred_ang16_9: entry point that runs the shared ang16_mode_9_27 helper twice
; with r6d = 0 and r2 moved 64 bytes into the source buffer.
; The helper is defined outside this chunk; its register contract is not visible here.
9688 cglobal intra_pred_ang16_9, 3,7,8
9689 add r2, 64 ; read from the part of the source buffer starting 64 bytes in
9690 xor r6d, r6d ; r6d = 0: selector handed to the helper
9691 lea r3, [ang_table + 16 * 16] ; r3 -> entry 16 of ang_table (entries are 16 bytes each)
9692 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9693 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9694
9695 call ang16_mode_9_27
9696
9697 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9698 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
9699
9700 call ang16_mode_9_27
9701 RET
9702
; intra_pred_ang16_27: entry point that runs the shared ang16_mode_9_27 helper twice
; with r6d = 1 and r2 left at the start of the source buffer.
; The helper is defined outside this chunk; its register contract is not visible here.
9703 cglobal intra_pred_ang16_27, 3,7,8
9704 xor r6d, r6d
9705 inc r6d ; r6d = 1: selector handed to the helper
9706 lea r3, [ang_table + 16 * 16] ; r3 -> entry 16 of ang_table (entries are 16 bytes each)
9707 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9708 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9709
9710 call ang16_mode_9_27
9711
9712 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9713 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
9714
9715 call ang16_mode_9_27
9716 RET
9717
; intra_pred_ang16_11: entry point that runs the shared ang16_mode_11_25 helper twice
; with r6d = 0 on the part of the source buffer that starts 64 bytes in.
; Before the calls, the word at [r2 + 64] is saved on the stack and overwritten with the
; word at [r2]; it is put back before returning. The caller's buffer is therefore
; modified for the duration of the call.
; NOTE(review): a concurrent reader of the same buffer would see the patched word.
; The trailing "0-4" is the x86inc stack-size argument (evaluates to -4; see x86inc's
; cglobal documentation for the meaning of a negative size).
; The restore through [r2 - 16] assumes the helper (outside this chunk) leaves r2 unchanged.
9718 cglobal intra_pred_ang16_11, 3,7,8, 0-4
9719 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9720 movzx r6d, word [r2] ; r6w = first word of the buffer
9721 mov [rsp], r5w ; keep the original word on the stack
9722 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9723
9724 add r2, 64 ; r2 now points at the patched word
9725 xor r6d, r6d ; r6d = 0: selector handed to the helper
9726 lea r3, [ang_table + 16 * 16] ; r3 -> entry 16 of ang_table (entries are 16 bytes each)
9727 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9728 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9729
9730 call ang16_mode_11_25
9731
9732 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9733 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
9734
9735 call ang16_mode_11_25
9736
9737 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9738 mov [r2 - 16], r6w ; r2 - 16 is the patched location (original r2 + 64)
9739 RET
9740
; intra_pred_ang16_25: entry point that runs the shared ang16_mode_11_25 helper twice
; with r6d = 1 and r2 left at the start of the source buffer. Unlike
; intra_pred_ang16_11 it does not patch any word of the source buffer.
; The helper is defined outside this chunk; its register contract is not visible here.
9741 cglobal intra_pred_ang16_25, 3,7,8
9742 xor r6d, r6d
9743 inc r6d ; r6d = 1: selector handed to the helper
9744 lea r3, [ang_table + 16 * 16] ; r3 -> entry 16 of ang_table (entries are 16 bytes each)
9745 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9746 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9747
9748 call ang16_mode_11_25
9749
9750 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9751 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
9752
9753 call ang16_mode_11_25
9754 RET
9755
; intra_pred_ang16_12: entry point that runs the shared ang16_mode_12_24 helper twice
; with r3d = 0 on the part of the source buffer that starts 64 bytes in.
; Register roles differ from the mode 3-11 wrappers: the table pointer goes in r6,
; the selector in r3d, and m5 carries extra samples assembled here from the first
; part of the buffer (shuffle masks pw_ang8_12 are defined elsewhere in the file).
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 - 16] assumes the helper (outside this chunk) leaves r2 unchanged.
9756 cglobal intra_pred_ang16_12, 3,7,8, 0-4
9757 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9758 movzx r6d, word [r2] ; r6w = first word of the buffer
9759 mov [rsp], r5w ; keep the original word on the stack
9760 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9761
9762 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9763 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9764 lea r6, [ang_table + 16 * 16] ; r6 -> entry 16 of ang_table (entries are 16 bytes each)
9765 movu m5, [r2] ; 16 bytes from the start of the buffer
9766 pshufb m5, [pw_ang8_12] ; rearrange them with the pw_ang8_12 mask
9767 pinsrw m5, [r2 + 26], 5 ; word lane 5 <- word at byte offset 26
9768 xor r3d, r3d ; r3d = 0: selector handed to the helper
9769 add r2, 64 ; r2 now points at the patched word
9770
9771 call ang16_mode_12_24
9772
9773 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
9774 movu m5, [r2 + 2] ; second pass: m5 reloaded from the main source run
9775 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9776
9777 call ang16_mode_12_24
9778
9779 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9780 mov [r2 - 16], r6w ; r2 - 16 is the patched location (original r2 + 64)
9781 RET
9782
; intra_pred_ang16_24: entry point that runs the shared ang16_mode_12_24 helper twice
; with r3d = 1, reading the main samples from the start of the buffer and building
; m5 from the part that starts 64 bytes in (mirror image of intra_pred_ang16_12).
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 + 48] assumes the helper (outside this chunk) leaves r2 unchanged.
9783 cglobal intra_pred_ang16_24, 3,7,8, 0-4
9784 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9785 movzx r6d, word [r2] ; r6w = first word of the buffer
9786 mov [rsp], r5w ; keep the original word on the stack
9787 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9788
9789 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9790 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9791 lea r6, [ang_table + 16 * 16] ; r6 -> entry 16 of ang_table (entries are 16 bytes each)
9792 movu m5, [r2 + 64] ; 16 bytes starting at the patched word
9793 pshufb m5, [pw_ang8_12] ; rearrange them with the pw_ang8_12 mask
9794 pinsrw m5, [r2 + 26 + 64], 5 ; word lane 5 <- word at byte offset 64 + 26
9795 xor r3d, r3d
9796 inc r3d ; r3d = 1: selector handed to the helper
9797
9798 call ang16_mode_12_24
9799
9800 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
9801 movu m5, [r2 + 2] ; second pass: m5 reloaded from the main source run
9802 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9803
9804 call ang16_mode_12_24
9805
9806 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9807 mov [r2 + 48], r6w ; r2 + 48 is the patched location (original r2 + 64)
9808 RET
9809
; intra_pred_ang16_13: entry point that runs the shared ang16_mode_13_23 helper twice
; with r3d = 0 on the part of the source buffer that starts 64 bytes in.
; m5 is assembled from two shuffled loads of the first part of the buffer
; (masks pw_ang16_13 / pw_ang8_13 are defined elsewhere in the file).
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 - 16] assumes the helper (outside this chunk) leaves r2 unchanged.
9810 cglobal intra_pred_ang16_13, 3,7,8, 0-4
9811 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9812 movzx r6d, word [r2] ; r6w = first word of the buffer
9813 mov [rsp], r5w ; keep the original word on the stack
9814 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9815
9816 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9817 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9818 lea r6, [ang_table + 15 * 16] ; r6 -> entry 15 of ang_table (entries are 16 bytes each)
9819 movu m5, [r2]
9820 pshufb m5, [pw_ang16_13] ; first group of selected samples
9821 movu m6, [r2 + 14]
9822 pshufb m6, [pw_ang8_13] ; second group, taken 14 bytes further in
9823 pslldq m6, 2 ; move the second group up by one word
9824 palignr m5, m6, 6 ; m5 = bytes 6..21 of the 32-byte pair m5:m6
9825 xor r3d, r3d ; r3d = 0: selector handed to the helper
9826 add r2, 64 ; r2 now points at the patched word
9827
9828 call ang16_mode_13_23
9829
9830 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
9831 movu m5, [r2 + 2] ; second pass: m5 reloaded from the main source run
9832 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9833
9834 call ang16_mode_13_23
9835
9836 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9837 mov [r2 - 16], r6w ; r2 - 16 is the patched location (original r2 + 64)
9838 RET
9839
; intra_pred_ang16_23: entry point that runs the shared ang16_mode_13_23 helper twice
; with r3d = 1, reading the main samples from the start of the buffer and building
; m5 from the part that starts 64 bytes in (mirror image of intra_pred_ang16_13).
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 + 48] assumes the helper (outside this chunk) leaves r2 unchanged.
9840 cglobal intra_pred_ang16_23, 3,7,8, 0-4
9841 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9842 movzx r6d, word [r2] ; r6w = first word of the buffer
9843 mov [rsp], r5w ; keep the original word on the stack
9844 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9845
9846 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9847 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9848 lea r6, [ang_table + 15 * 16] ; r6 -> entry 15 of ang_table (entries are 16 bytes each)
9849 movu m5, [r2 + 64]
9850 pshufb m5, [pw_ang16_13] ; first group of selected samples
9851 movu m6, [r2 + 14 + 64]
9852 pshufb m6, [pw_ang8_13] ; second group, taken 14 bytes further in
9853 pslldq m6, 2 ; move the second group up by one word
9854 palignr m5, m6, 6 ; m5 = bytes 6..21 of the 32-byte pair m5:m6
9855 xor r3d, r3d
9856 inc r3d ; r3d = 1: selector handed to the helper
9857
9858 call ang16_mode_13_23
9859
9860 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
9861 movu m5, [r2 + 2] ; second pass: m5 reloaded from the main source run
9862 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9863
9864 call ang16_mode_13_23
9865
9866 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9867 mov [r2 + 48], r6w ; r2 + 48 is the patched location (original r2 + 64)
9868 RET
9869
; intra_pred_ang16_14: entry point that runs the shared ang16_mode_14_22 helper twice
; with r3d = 0 on the part of the source buffer that starts 64 bytes in.
; m5 is assembled from the high halves of two loads shuffled with pw_ang8_14
; (mask defined elsewhere in the file).
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 - 16] assumes the helper (outside this chunk) leaves r2 unchanged.
9870 cglobal intra_pred_ang16_14, 3,7,8, 0-4
9871 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9872 movzx r6d, word [r2] ; r6w = first word of the buffer
9873 mov [rsp], r5w ; keep the original word on the stack
9874 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9875
9876 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9877 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9878 lea r6, [ang_table + 18 * 16] ; r6 -> entry 18 of ang_table (entries are 16 bytes each)
9879 movu m6, [r2]
9880 pshufb m6, [pw_ang8_14]
9881 movu m5, [r2 + 20]
9882 pshufb m5, [pw_ang8_14]
9883 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
9884 xor r3d, r3d ; r3d = 0: selector handed to the helper
9885 add r2, 64 ; r2 now points at the patched word
9886
9887 call ang16_mode_14_22
9888
9889 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
9890 movu m5, [r2 + 2] ; second pass: m5 reloaded from the main source run
9891 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9892
9893 call ang16_mode_14_22
9894
9895 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9896 mov [r2 - 16], r6w ; r2 - 16 is the patched location (original r2 + 64)
9897 RET
9898
; intra_pred_ang16_22: entry point that runs the shared ang16_mode_14_22 helper twice
; with r3d = 1, reading the main samples from the start of the buffer and building
; m5 from the part that starts 64 bytes in (mirror image of intra_pred_ang16_14).
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 + 48] assumes the helper (outside this chunk) leaves r2 unchanged.
9899 cglobal intra_pred_ang16_22, 3,7,8, 0-4
9900 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9901 movzx r6d, word [r2] ; r6w = first word of the buffer
9902 mov [rsp], r5w ; keep the original word on the stack
9903 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9904
9905 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9906 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9907 lea r6, [ang_table + 18 * 16] ; r6 -> entry 18 of ang_table (entries are 16 bytes each)
9908 movu m6, [r2 + 64]
9909 pshufb m6, [pw_ang8_14]
9910 movu m5, [r2 + 20 + 64]
9911 pshufb m5, [pw_ang8_14]
9912 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
9913 xor r3d, r3d
9914 inc r3d ; r3d = 1: selector handed to the helper
9915
9916 call ang16_mode_14_22
9917
9918 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
9919 movu m5, [r2 + 2] ; second pass: m5 reloaded from the main source run
9920 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9921
9922 call ang16_mode_14_22
9923
9924 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9925 mov [r2 + 48], r6w ; r2 + 48 is the patched location (original r2 + 64)
9926 RET
9927
; intra_pred_ang16_15: entry point that runs the shared ang16_mode_15_21 helper twice
; with r3d = 0 on the part of the source buffer that starts 64 bytes in.
; m5 is assembled from the high halves of two loads shuffled with pw_ang8_15
; (mask defined elsewhere in the file). Note the second-pass reload of m5 uses
; [r2] here, not [r2 + 2] as in the mode 12-14 wrappers.
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 - 16] assumes the helper (outside this chunk) leaves r2 unchanged.
9928 cglobal intra_pred_ang16_15, 3,7,8, 0-4
9929 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9930 movzx r6d, word [r2] ; r6w = first word of the buffer
9931 mov [rsp], r5w ; keep the original word on the stack
9932 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9933
9934 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9935 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9936 lea r6, [ang_table + 15 * 16] ; r6 -> entry 15 of ang_table (entries are 16 bytes each)
9937 movu m6, [r2 + 4]
9938 pshufb m6, [pw_ang8_15]
9939 movu m5, [r2 + 18]
9940 pshufb m5, [pw_ang8_15]
9941 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
9942 xor r3d, r3d ; r3d = 0: selector handed to the helper
9943 add r2, 64 ; r2 now points at the patched word
9944
9945 call ang16_mode_15_21
9946
9947 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
9948 movu m5, [r2] ; second pass: m5 reloaded starting at the patched word
9949 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9950
9951 call ang16_mode_15_21
9952
9953 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9954 mov [r2 - 16], r6w ; r2 - 16 is the patched location (original r2 + 64)
9955 RET
9956
; intra_pred_ang16_21: entry point that runs the shared ang16_mode_15_21 helper twice
; with r3d = 1, reading the main samples from the start of the buffer and building
; m5 from the part that starts 64 bytes in (mirror image of intra_pred_ang16_15).
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 + 48] assumes the helper (outside this chunk) leaves r2 unchanged.
9957 cglobal intra_pred_ang16_21, 3,7,8, 0-4
9958 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9959 movzx r6d, word [r2] ; r6w = first word of the buffer
9960 mov [rsp], r5w ; keep the original word on the stack
9961 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9962
9963 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9964 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
9965 lea r6, [ang_table + 15 * 16] ; r6 -> entry 15 of ang_table (entries are 16 bytes each)
9966 movu m6, [r2 + 4 + 64]
9967 pshufb m6, [pw_ang8_15]
9968 movu m5, [r2 + 18 + 64]
9969 pshufb m5, [pw_ang8_15]
9970 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
9971 xor r3d, r3d
9972 inc r3d ; r3d = 1: selector handed to the helper
9973
9974 call ang16_mode_15_21
9975
9976 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
9977 movu m5, [r2] ; second pass: m5 reloaded from the start of the main source run
9978 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
9979
9980 call ang16_mode_15_21
9981
9982 mov r6d, [rsp] ; 4-byte load; only the low word (the saved value) is used
9983 mov [r2 + 48], r6w ; r2 + 48 is the patched location (original r2 + 64)
9984 RET
9985
; intra_pred_ang16_16: entry point that runs the shared ang16_mode_16_20 helper twice
; with r4 = 0 on the part of the source buffer that starts 64 bytes in.
; Register roles for this helper family: r6 = table pointer, r4 = selector,
; r3 = pointer into the other part of the buffer, m5 = pre-shuffled samples.
; Stack layout used here: [rsp] holds the original r2, [rsp + 16] holds the
; saved word; the reservation is requested as 0-(1*mmsize+4) through x86inc.
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 - 16] assumes the helper (outside this chunk) leaves r2 unchanged.
9986 cglobal intra_pred_ang16_16, 3,7,8,0-(1*mmsize+4)
9987 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
9988 movzx r6d, word [r2] ; r6w = first word of the buffer
9989 mov [rsp + 16], r5w ; keep the original word on the stack
9990 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
9991
9992 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
9993 lea r6, [ang_table + 13 * 16] ; r6 -> entry 13 of ang_table (entries are 16 bytes each)
9994 movu m6, [r2 + 4]
9995 pshufb m6, [pw_ang16_16]
9996 movu m5, [r2 + 16]
9997 pshufb m5, [pw_ang16_16]
9998 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
9999 mov [rsp], r2 ; remember the buffer start for the second pass
10000 lea r3, [r2 + 24] ; first pass: r3 -> 24 bytes into the first part of the buffer
10001 add r2, 64 ; r2 now points at the patched word
10002 xor r4, r4 ; r4 = 0: selector handed to the helper
10003
10004 call ang16_mode_16_20
10005
10006 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
10007 mov r3, [rsp] ; second pass: r3 -> start of the buffer
10008 movu m5, [r2] ; second pass: m5 reloaded starting at the patched word
10009 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
10010 xor r4, r4 ; selector set again before the second call
10011
10012 call ang16_mode_16_20
10013
10014 mov r6d, [rsp + 16] ; 4-byte load; only the low word (the saved value) is used
10015 mov [r2 - 16], r6w ; r2 - 16 is the patched location (original r2 + 64)
10016 RET
10017
; intra_pred_ang16_20: entry point that runs the shared ang16_mode_16_20 helper twice
; with r4 = 1, reading the main samples from the start of the buffer and using the
; part that starts 64 bytes in for r3 / m5 (mirror image of intra_pred_ang16_16).
; Stack layout used here: [rsp] holds original r2 + 64, [rsp + 16] holds the saved word.
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore goes through r3, which was reloaded with original r2 + 64 before the
; second call; this assumes the helper (outside this chunk) leaves r3 unchanged.
10018 cglobal intra_pred_ang16_20, 3,7,8,0-(1*mmsize+4)
10019 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
10020 movzx r6d, word [r2] ; r6w = first word of the buffer
10021 mov [rsp + 16], r5w ; keep the original word on the stack
10022 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
10023
10024 lea r3, [r2 + 64] ; r3 -> patched word
10025 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
10026 lea r6, [ang_table + 13 * 16] ; r6 -> entry 13 of ang_table (entries are 16 bytes each)
10027 movu m6, [r3 + 4]
10028 pshufb m6, [pw_ang16_16]
10029 movu m5, [r3 + 16]
10030 pshufb m5, [pw_ang16_16]
10031 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
10032 mov [rsp], r3 ; remember original r2 + 64 for the second pass and the restore
10033 lea r3, [r3 + 24] ; first pass: r3 -> 24 bytes past the patched word
10034 xor r4, r4
10035 inc r4 ; r4 = 1: selector handed to the helper
10036
10037 call ang16_mode_16_20
10038
10039 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
10040 mov r3, [rsp] ; second pass: r3 -> patched word
10041 movu m5, [r2] ; second pass: m5 reloaded from the start of the main source run
10042 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
10043 xor r4, r4
10044 inc r4 ; selector set again before the second call
10045
10046 call ang16_mode_16_20
10047 mov r6d, [rsp + 16] ; 4-byte load; only the low word (the saved value) is used
10048 mov [r3], r6w ; put the saved word back at original r2 + 64
10049 RET
10050
; intra_pred_ang16_17: entry point that runs the shared ang16_mode_17_19 helper twice
; with r4 = 0 on the part of the source buffer that starts 64 bytes in.
; Same structure as intra_pred_ang16_16 with different load offsets (2 / 12 / 20)
; and table entry 16. Stack layout: [rsp] = original r2, [rsp + 16] = saved word.
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore through [r2 - 16] assumes the helper (outside this chunk) leaves r2 unchanged.
10051 cglobal intra_pred_ang16_17, 3,7,8,0-(1*mmsize+4)
10052 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
10053 movzx r6d, word [r2] ; r6w = first word of the buffer
10054 mov [rsp + 16], r5w ; keep the original word on the stack
10055 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
10056
10057 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
10058 lea r6, [ang_table + 16 * 16] ; r6 -> entry 16 of ang_table (entries are 16 bytes each)
10059 movu m6, [r2 + 2]
10060 pshufb m6, [pw_ang16_16]
10061 movu m5, [r2 + 12]
10062 pshufb m5, [pw_ang16_16]
10063 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
10064 mov [rsp], r2 ; remember the buffer start for the second pass
10065 lea r3, [r2 + 20] ; first pass: r3 -> 20 bytes into the first part of the buffer
10066 add r2, 64 ; r2 now points at the patched word
10067 xor r4, r4 ; r4 = 0: selector handed to the helper
10068
10069 call ang16_mode_17_19
10070
10071 lea r0, [r0 + r1 * 8] ; second pass: output pointer advanced by 8 * r1
10072 mov r3, [rsp] ; second pass: r3 -> start of the buffer
10073 movu m5, [r2] ; second pass: m5 reloaded starting at the patched word
10074 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
10075 xor r4, r4 ; selector set again before the second call
10076
10077 call ang16_mode_17_19
10078
10079 mov r6d, [rsp + 16] ; 4-byte load; only the low word (the saved value) is used
10080 mov [r2 - 16], r6w ; r2 - 16 is the patched location (original r2 + 64)
10081 RET
10082
; intra_pred_ang16_19: entry point that runs the shared ang16_mode_17_19 helper twice
; with r4 = 1, reading the main samples from the start of the buffer and using the
; part that starts 64 bytes in for r3 / m5 (mirror image of intra_pred_ang16_17).
; Stack layout used here: [rsp] holds original r2 + 64, [rsp + 16] holds the saved word.
; The word at [r2 + 64] is temporarily replaced by the word at [r2] and restored
; before returning, so the caller's buffer is modified during the call.
; The restore goes through r3, which was reloaded with original r2 + 64 before the
; second call; this assumes the helper (outside this chunk) leaves r3 unchanged.
10083 cglobal intra_pred_ang16_19, 3,7,8,0-(1*mmsize+4)
10084 movzx r5d, word [r2 + 64] ; r5w = word about to be overwritten
10085 movzx r6d, word [r2] ; r6w = first word of the buffer
10086 mov [rsp + 16], r5w ; keep the original word on the stack
10087 mov [r2 + 64], r6w ; patch [r2 + 64] with the first word
10088
10089 lea r3, [r2 + 64] ; r3 -> patched word
10090 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
10091 lea r6, [ang_table + 16 * 16] ; r6 -> entry 16 of ang_table (entries are 16 bytes each)
10092 movu m6, [r3 + 2]
10093 pshufb m6, [pw_ang16_16]
10094 movu m5, [r3 + 12]
10095 pshufb m5, [pw_ang16_16]
10096 punpckhqdq m5, m6 ; m5 = high qword of m5 (low half) : high qword of m6 (high half)
10097 mov [rsp], r3 ; remember original r2 + 64 for the second pass and the restore
10098 lea r3, [r3 + 20] ; first pass: r3 -> 20 bytes past the patched word
10099 xor r4, r4
10100 inc r4 ; r4 = 1: selector handed to the helper
10101
10102 call ang16_mode_17_19
10103
10104 lea r0, [r0 + 16] ; second pass: output pointer advanced by 16 bytes, same rows
10105 mov r3, [rsp] ; second pass: r3 -> patched word
10106 movu m5, [r2] ; second pass: m5 reloaded from the start of the main source run
10107 lea r2, [r2 + 16] ; second pass: source pointer advanced by 16 bytes
10108 xor r4, r4
10109 inc r4 ; selector set again before the second call
10110
10111 call ang16_mode_17_19
10112
10113 mov r6d, [rsp + 16] ; 4-byte load; only the low word (the saved value) is used
10114 mov [r3], r6w ; put the saved word back at original r2 + 64
10115 RET
10116
; intra_pred_ang16_18: writes 16 rows of 32 bytes (two 16-byte stores per row) where
; each row is the previous one shifted up by one word, with the vacated low word
; filled from a second sample run.
;   r0 - output base, r1 - row step (doubled on entry), r4 = 3 * r1
;   m1:m3 - the 32 bytes at [r2] and [r2 + 16]; they form row 0 unchanged
;   m0    - 16 bytes at [r2 + 2 + 64] rearranged by pw_swap16 (mask defined elsewhere;
;           presumably reverses word order so palignr can feed them in - confirm)
;   m3 (rows 9-15) - 16 bytes at [r2 + 18 + 64], rearranged the same way
; palignr with count 16 - 2k yields the row that is k words behind the previous block.
10117 cglobal intra_pred_ang16_18, 3,5,4
10118 add r1, r1 ; double r1 (presumably sample stride -> byte stride - confirm)
10119 lea r4, [r1 * 3] ; r4 = 3 * doubled r1
10120 movu m1, [r2]
10121 movu m3, [r2 + 16]
10122 movu m0, [r2 + 2 + 64]
10123 pshufb m0, [pw_swap16]
10124 movu [r0], m1 ; row 0, low half
10125 movu [r0 + 16], m3 ; row 0, high half
10126 palignr m2, m1, m0, 14 ; row 1: one word of m0 enters at the bottom
10127 movu [r0 + r1], m2
10128 palignr m2, m3, m1, 14
10129 movu [r0 + r1 + 16], m2
10130 palignr m2, m1, m0, 12 ; row 2
10131 movu [r0 + r1 * 2], m2
10132 palignr m2, m3, m1, 12
10133 movu [r0 + r1 * 2 + 16], m2
10134 palignr m2, m1, m0, 10 ; row 3
10135 movu [r0 + r4], m2
10136 palignr m2, m3, m1, 10
10137 movu [r0 + r4 + 16], m2
10138
10139 lea r0, [r0 + r1 * 4] ; rows 4-7
10140 palignr m2, m1, m0, 8
10141 movu [r0], m2
10142 palignr m2, m3, m1, 8
10143 movu [r0 + 16], m2
10144 palignr m2, m1, m0, 6
10145 movu [r0 + r1], m2
10146 palignr m2, m3, m1, 6
10147 movu [r0 + r1 + 16], m2
10148 palignr m2, m1, m0, 4
10149 movu [r0 + r1 * 2], m2
10150 palignr m2, m3, m1, 4
10151 movu [r0 + r1 * 2 + 16], m2
10152 palignr m2, m1, m0, 2
10153 movu [r0 + r4], m2
10154 palignr m3, m1, 2 ; last use of the old m3, so it is overwritten in place
10155 movu [r0 + r4 + 16], m3
10156
10157 lea r0, [r0 + r1 * 4] ; rows 8-11: m0:m1 now play the role m1:m3 had for rows 0-3
10158 movu [r0], m0 ; row 8, low half
10159 movu [r0 + 16], m1 ; row 8, high half
10160 movu m3, [r2 + 18 + 64] ; next 16 bytes of the second sample run
10161 pshufb m3, [pw_swap16]
10162 palignr m2, m0, m3, 14
10163 movu [r0 + r1], m2
10164 palignr m2, m1, m0, 14
10165 movu [r0 + r1 + 16], m2
10166 palignr m2, m0, m3, 12
10167 movu [r0 + r1 * 2], m2
10168 palignr m2, m1, m0, 12
10169 movu [r0 + r1 * 2 + 16], m2
10170 palignr m2, m0, m3, 10
10171 movu [r0 + r4], m2
10172 palignr m2, m1, m0, 10
10173 movu [r0 + r4 + 16], m2
10174
10175 lea r0, [r0 + r1 * 4] ; rows 12-15
10176 palignr m2, m0, m3, 8
10177 movu [r0], m2
10178 palignr m2, m1, m0, 8
10179 movu [r0 + 16], m2
10180 palignr m2, m0, m3, 6
10181 movu [r0 + r1], m2
10182 palignr m2, m1, m0, 6
10183 movu [r0 + r1 + 16], m2
10184 palignr m2, m0, m3, 4
10185 movu [r0 + r1 * 2], m2
10186 palignr m2, m1, m0, 4
10187 movu [r0 + r1 * 2 + 16], m2
10188 palignr m2, m0, m3, 2
10189 movu [r0 + r4], m2
10190 palignr m1, m0, 2 ; last use of m1, so it is overwritten in place
10191 movu [r0 + r4 + 16], m1
10192 RET
10193
; intra_pred_ang16_10: fills rows 1-15 of a 16-row block, each with one sample
; replicated across the row (32 bytes written as two 16-byte stores), then writes row 0
; either as a plain replicated sample or, when the fifth argument (r4m) is non-zero,
; as that sample plus half the difference between the first-run samples and the word
; at [r2], clamped to [0, pw_pixel_max].
;   r0 - output base, r1 - row step (doubled on entry), r4 = 3 * r1, r3 - running row pointer
;   r5d - fifth argument, read from r4m before r4 is reused
;   m1 / m3 - the 16 replicated-sample sources at [r2 + 2 + 64] and [r2 + 18 + 64]
;   m0 - sample 1 replicated (row 0 before optional filtering)
; pb_01 and pw_pixel_max are defined elsewhere in the file; pb_01 is used here to
; broadcast word 0 of the source register to all lanes.
; Review change: the zero check uses "test r5d, r5d" instead of "cmp r5d, byte 0";
; ZF is identical and jz is the only consumer of the flags.
10194 cglobal intra_pred_ang16_10, 3,6,4
10195 mov r5d, r4m
10196 movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1]
10197 movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9]
10198 pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1]
10199 add r1, r1
10200 lea r4, [r1 * 3]
10201
10202 psrldq m1, 2
10203 pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2]
10204 movu [r0 + r1], m2
10205 movu [r0 + r1 + 16], m2
10206 psrldq m1, 2
10207 pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3]
10208 movu [r0 + r1 * 2], m2
10209 movu [r0 + r1 * 2 + 16], m2
10210 psrldq m1, 2
10211 pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4]
10212 movu [r0 + r4], m2
10213 movu [r0 + r4 + 16], m2
10214
10215 lea r3, [r0 + r1 *4]
10216 psrldq m1, 2
10217 pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5]
10218 movu [r3], m2
10219 movu [r3 + 16], m2
10220 psrldq m1, 2
10221 pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6]
10222 movu [r3 + r1], m2
10223 movu [r3 + r1 + 16], m2
10224 psrldq m1, 2
10225 pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7]
10226 movu [r3 + r1 * 2], m2
10227 movu [r3 + r1 * 2 + 16], m2
10228 psrldq m1, 2
10229 pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8]
10230 movu [r3 + r4], m2
10231 movu [r3 + r4 + 16], m2
10232
10233 lea r3, [r3 + r1 *4]
10234 pshufb m2, m3, [pb_01] ; [9 9 9 9 9 9 9 9]
10235 movu [r3], m2
10236 movu [r3 + 16], m2
10237 psrldq m3, 2
10238 pshufb m2, m3, [pb_01] ; [10 10 10 10 10 10 10 10]
10239 movu [r3 + r1], m2
10240 movu [r3 + r1 + 16], m2
10241 psrldq m3, 2
10242 pshufb m2, m3, [pb_01] ; [11 11 11 11 11 11 11 11]
10243 movu [r3 + r1 * 2], m2
10244 movu [r3 + r1 * 2 + 16], m2
10245 psrldq m3, 2
10246 pshufb m2, m3, [pb_01] ; [12 12 12 12 12 12 12 12]
10247 movu [r3 + r4], m2
10248 movu [r3 + r4 + 16], m2
10249
10250 lea r3, [r3 + r1 *4]
10251 psrldq m3, 2
10252 pshufb m2, m3, [pb_01] ; [13 13 13 13 13 13 13 13]
10253 movu [r3], m2
10254 movu [r3 + 16], m2
10255 psrldq m3, 2
10256 pshufb m2, m3, [pb_01] ; [14 14 14 14 14 14 14 14]
10257 movu [r3 + r1], m2
10258 movu [r3 + r1 + 16], m2
10259 psrldq m3, 2
10260 pshufb m2, m3, [pb_01] ; [15 15 15 15 15 15 15 15]
10261 movu [r3 + r1 * 2], m2
10262 movu [r3 + r1 * 2 + 16], m2
10263 psrldq m3, 2
10264 pshufb m2, m3, [pb_01] ; [16 16 16 16 16 16 16 16]
10265 movu [r3 + r4], m2
10266 movu [r3 + r4 + 16], m2
10267 mova m3, m0 ; unfiltered row 0: both halves are the replicated sample 1
10268
10269 test r5d, r5d ; fifth argument zero -> skip the row-0 filter
10270 jz .quit
10271
10272 ; filter
10273 pinsrw m1, [r2], 0 ; [3 2 1 0]
10274 pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
10275 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
10276 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
10277 psubw m1, m2 ; first-run samples minus the word at [r2]
10278 psubw m3, m2
10279 psraw m1, 1 ; halve the difference (arithmetic shift keeps the sign)
10280 psraw m3, 1
10281 paddw m3, m0 ; add the replicated sample (m0 still unmodified here)
10282 paddw m0, m1
10283 pxor m1, m1
10284 pmaxsw m0, m1 ; clamp below at 0
10285 pminsw m0, [pw_pixel_max] ; clamp above at pw_pixel_max
10286 pmaxsw m3, m1
10287 pminsw m3, [pw_pixel_max]
10288 .quit:
10289 movu [r0], m0 ; row 0, low half
10290 movu [r0 + 16], m3 ; row 0, high half
10291 RET
10292
; intra_pred_ang16_26: copies the 32 bytes at [r2 + 2] into each of 16 rows, then, when
; the fifth argument (r4m) is non-zero, rewrites the first word of every row with
; sample 1 plus half the difference between the samples at [r2 + 2 + 64 ...] and the
; word at [r2], clamped to [0, pw_pixel_max].
;   r0 - output base, r1 - row step (doubled on entry), r4 = 3 * r1
;   r3 - running row pointer; after the copy it points at row 12 and is reused by
;        the filter for rows 12-15
;   r5d - fifth argument, read from r4m before r4 is reused
; pb_01 and pw_pixel_max are defined elsewhere in the file; pb_01 is used here to
; broadcast word 0 of the source register to all lanes.
; Review change: the zero check uses "test r5d, r5d" instead of "cmp r5d, byte 0";
; ZF is identical and jz is the only consumer of the flags.
10293 cglobal intra_pred_ang16_26, 3,6,4
10294 mov r5d, r4m
10295 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
10296 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
10297 add r1, r1
10298 lea r4, [r1 * 3]
10299
10300 movu [r0], m0
10301 movu [r0 + 16], m3
10302 movu [r0 + r1], m0
10303 movu [r0 + r1 + 16], m3
10304 movu [r0 + r1 * 2], m0
10305 movu [r0 + r1 * 2 + 16], m3
10306 movu [r0 + r4], m0
10307 movu [r0 + r4 + 16], m3
10308
10309 lea r3, [r0 + r1 *4]
10310 movu [r3], m0
10311 movu [r3 + 16], m3
10312 movu [r3 + r1], m0
10313 movu [r3 + r1 + 16], m3
10314 movu [r3 + r1 * 2], m0
10315 movu [r3 + r1 * 2 + 16], m3
10316 movu [r3 + r4], m0
10317 movu [r3 + r4 + 16], m3
10318
10319 lea r3, [r3 + r1 *4]
10320 movu [r3], m0
10321 movu [r3 + 16], m3
10322 movu [r3 + r1], m0
10323 movu [r3 + r1 + 16], m3
10324 movu [r3 + r1 * 2], m0
10325 movu [r3 + r1 * 2 + 16], m3
10326 movu [r3 + r4], m0
10327 movu [r3 + r4 + 16], m3
10328
10329 lea r3, [r3 + r1 *4]
10330 movu [r3], m0
10331 movu [r3 + 16], m3
10332 movu [r3 + r1], m0
10333 movu [r3 + r1 + 16], m3
10334 movu [r3 + r1 * 2], m0
10335 movu [r3 + r1 * 2 + 16], m3
10336 movu [r3 + r4], m0
10337 movu [r3 + r4 + 16], m3
10338
10339 test r5d, r5d ; fifth argument zero -> leave the plain copy in place
10340 jz .quit
10341
10342 ; filter
10343
10344 pshufb m0, [pb_01] ; sample 1 replicated to every word
10345 pinsrw m1, [r2], 0 ; [3 2 1 0]
10346 pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
10347 movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1]
10348 movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9]
10349 psubw m1, m2 ; second-run samples minus the word at [r2]
10350 psubw m3, m2
10351 psraw m1, 1 ; halve the difference (arithmetic shift keeps the sign)
10352 psraw m3, 1
10353 paddw m3, m0 ; add the replicated sample (m0 still unmodified here)
10354 paddw m0, m1
10355 pxor m1, m1
10356 pmaxsw m0, m1 ; clamp below at 0
10357 pminsw m0, [pw_pixel_max] ; clamp above at pw_pixel_max
10358 pmaxsw m3, m1
10359 pminsw m3, [pw_pixel_max]
10360 pextrw [r0], m0, 0 ; rows 0-3: first word of each row <- filtered value
10361 pextrw [r0 + r1], m0, 1
10362 pextrw [r0 + r1 * 2], m0, 2
10363 pextrw [r0 + r4], m0, 3
10364 lea r0, [r0 + r1 * 4]
10365 pextrw [r0], m0, 4 ; rows 4-7
10366 pextrw [r0 + r1], m0, 5
10367 pextrw [r0 + r1 * 2], m0, 6
10368 pextrw [r0 + r4], m0, 7
10369 lea r0, [r0 + r1 * 4]
10370 pextrw [r0], m3, 0 ; rows 8-11
10371 pextrw [r0 + r1], m3, 1
10372 pextrw [r0 + r1 * 2], m3, 2
10373 pextrw [r0 + r4], m3, 3
10374 pextrw [r3], m3, 4 ; rows 12-15 via r3, left at row 12 by the copy above
10375 pextrw [r3 + r1], m3, 5
10376 pextrw [r3 + r1 * 2], m3, 6
10377 pextrw [r3 + r4], m3, 7
10378 .quit:
10379 RET
10380
10381 ;-------------------------------------------------------------------------------------------------------
10382 ; avx2 code for intra_pred_ang16 mode 2 to 34 start
10383 ;-------------------------------------------------------------------------------------------------------
10384 INIT_YMM avx2
; intra_pred_ang16_2 (AVX2): writes 16 rows of 32 bytes where row k is the 32-byte
; window of the source run starting at byte offset 4 + 2k. The function name says
; mode 2, but the fourth argument (r3m) is compared against 34: when it equals 34 the
; run is read from the original r2, otherwise from r2 + 64.
;   r0 - output base, r1 - row step (doubled on entry), r3 = 3 * r1
;   r4 - copy of the original r2 used by the cmove
; palignr on 32-byte registers works on each 16-byte lane separately; because m1 is
; loaded exactly 16 bytes after m0, each lane pair still concatenates consecutive
; samples, so palignr(m1, m0, 2k) is the window shifted by k words.
; Review changes: the stride is doubled with the full-width "add r1, r1" (the original
; "add r1d, r1d" zero-extended a 32-bit result while r1 is used as a 64-bit index and
; every other entry point here doubles the full register), and the pointer copy is a
; plain mov instead of "lea r4, [r2]".
10385 cglobal intra_pred_ang16_2, 3,5,3
10386 mov r4, r2 ; keep the original source pointer for the mode-34 case
10387 add r2, 64
10388 cmp r3m, byte 34
10389 cmove r2, r4 ; fourth argument == 34 -> read from the original pointer
10390 add r1, r1
10391 lea r3, [r1 * 3]
10392 movu m0, [r2 + 4]
10393 movu m1, [r2 + 20]
10394
10395 movu [r0], m0 ; row 0
10396 palignr m2, m1, m0, 2
10397 movu [r0 + r1], m2
10398 palignr m2, m1, m0, 4
10399 movu [r0 + r1 * 2], m2
10400 palignr m2, m1, m0, 6
10401 movu [r0 + r3], m2
10402
10403 lea r0, [r0 + r1 * 4] ; rows 4-7
10404 palignr m2, m1, m0, 8
10405 movu [r0], m2
10406 palignr m2, m1, m0, 10
10407 movu [r0 + r1], m2
10408 palignr m2, m1, m0, 12
10409 movu [r0 + r1 * 2], m2
10410 palignr m2, m1, m0, 14
10411 movu [r0 + r3], m2
10412
10413 movu m0, [r2 + 36] ; next 32 bytes; m1:m0 now form the sliding pair
10414 lea r0, [r0 + r1 * 4] ; rows 8-11
10415 movu [r0], m1 ; row 8 is the window at offset 20
10416 palignr m2, m0, m1, 2
10417 movu [r0 + r1], m2
10418 palignr m2, m0, m1, 4
10419 movu [r0 + r1 * 2], m2
10420 palignr m2, m0, m1, 6
10421 movu [r0 + r3], m2
10422
10423 lea r0, [r0 + r1 * 4] ; rows 12-15
10424 palignr m2, m0, m1, 8
10425 movu [r0], m2
10426 palignr m2, m0, m1, 10
10427 movu [r0 + r1], m2
10428 palignr m2, m0, m1, 12
10429 movu [r0 + r1 * 2], m2
10430 palignr m2, m0, m1, 14
10431 movu [r0 + r3], m2
10432 RET
10433
; TRANSPOSE_STORE_AVX2 r0..r7, t0, t1, off
;   %1-%8  - eight 32-byte registers holding computed rows (16 words each)
;   %9,%10 - scratch registers used only on the transposing path
;   %11    - byte offset added to every store on the transposing path; also used as
;            the suffix of the local labels, so each use inside one function needs a
;            distinct %11
; The macro starts with jnz and does not set flags itself: the caller must have ZF
; prepared (the kernels below do "test r6d, r6d" on entry and execute no
; flag-writing instruction before the macro).
;   ZF = 1 (transposing path): the 8x16 word matrix is transposed per 128-bit lane and
;          written as 16 stores of 16 bytes - rows 0-7 from the low lanes, rows 8-15
;          from the high lanes - at [r0 + row * r1 + %11]. r5 is clobbered, r0 is kept.
;   ZF = 0 (.skip path): %1-%8 are written unchanged as 8 rows of 32 bytes starting at
;          r0 (no %11 offset), and r0 is advanced by 8 * r1.
; r4 is used as the offset of the fourth row of each group, so callers presumably pass
; r4 = 3 * r1 (the callers of the AVX2 kernels are outside this chunk).
; lea is used for all pointer arithmetic, so flags are still intact after the macro.
10434 %macro TRANSPOSE_STORE_AVX2 11
10435 jnz .skip%11
10436 punpckhwd m%9, m%1, m%2 ; stage 1: interleave words of row pairs
10437 punpcklwd m%1, m%2
10438 punpckhwd m%2, m%3, m%4
10439 punpcklwd m%3, m%4
10440
10441 punpckldq m%4, m%1, m%3 ; stage 2: interleave dwords (rows 0-3)
10442 punpckhdq m%1, m%3
10443 punpckldq m%3, m%9, m%2
10444 punpckhdq m%9, m%2
10445
10446 punpckhwd m%10, m%5, m%6 ; stage 1 again for rows 4-7
10447 punpcklwd m%5, m%6
10448 punpckhwd m%6, m%7, m%8
10449 punpcklwd m%7, m%8
10450
10451 punpckldq m%8, m%5, m%7 ; stage 2 for rows 4-7
10452 punpckhdq m%5, m%7
10453 punpckldq m%7, m%10, m%6
10454 punpckhdq m%10, m%6
10455
10456 punpcklqdq m%6, m%4, m%8 ; stage 3: join qwords -> transposed rows, in store order %6 %2 %4 %8
10457 punpckhqdq m%2, m%4, m%8
10458 punpcklqdq m%4, m%1, m%5
10459 punpckhqdq m%8, m%1, m%5
10460
10461 punpcklqdq m%1, m%3, m%7 ; ... and %1 %5 %3 %7
10462 punpckhqdq m%5, m%3, m%7
10463 punpcklqdq m%3, m%9, m%10
10464 punpckhqdq m%7, m%9, m%10
10465
10466 movu [r0 + r1 * 0 + %11], xm%6 ; output rows 0-3: low lanes
10467 movu [r0 + r1 * 1 + %11], xm%2
10468 movu [r0 + r1 * 2 + %11], xm%4
10469 movu [r0 + r4 * 1 + %11], xm%8
10470
10471 lea r5, [r0 + r1 * 4] ; output rows 4-7: low lanes
10472 movu [r5 + r1 * 0 + %11], xm%1
10473 movu [r5 + r1 * 1 + %11], xm%5
10474 movu [r5 + r1 * 2 + %11], xm%3
10475 movu [r5 + r4 * 1 + %11], xm%7
10476
10477 lea r5, [r5 + r1 * 4] ; output rows 8-11: high lanes
10478 vextracti128 [r5 + r1 * 0 + %11], m%6, 1
10479 vextracti128 [r5 + r1 * 1 + %11], m%2, 1
10480 vextracti128 [r5 + r1 * 2 + %11], m%4, 1
10481 vextracti128 [r5 + r4 * 1 + %11], m%8, 1
10482
10483 lea r5, [r5 + r1 * 4] ; output rows 12-15: high lanes
10484 vextracti128 [r5 + r1 * 0 + %11], m%1, 1
10485 vextracti128 [r5 + r1 * 1 + %11], m%5, 1
10486 vextracti128 [r5 + r1 * 2 + %11], m%3, 1
10487 vextracti128 [r5 + r4 * 1 + %11], m%7, 1
10488 jmp .end%11
10489 .skip%11:
10490 movu [r0 + r1 * 0], m%1 ; direct path: registers are already output rows
10491 movu [r0 + r1 * 1], m%2
10492 movu [r0 + r1 * 2], m%3
10493 movu [r0 + r4 * 1], m%4
10494
10495 lea r0, [r0 + r1 * 4]
10496 movu [r0 + r1 * 0], m%5
10497 movu [r0 + r1 * 1], m%6
10498 movu [r0 + r1 * 2], m%7
10499 movu [r0 + r4 * 1], m%8
10500 lea r0, [r0 + r1 * 4] ; leave r0 at the row after the eight just written
10501 .end%11:
10502 %endmacro
10503
10504 ;; angle 16, modes 3 and 33
; Shared AVX2 kernel that produces sixteen 16-word vectors and stores them through
; TRANSPOSE_STORE_AVX2 in two groups of eight.
; Inputs, as used below (callers are outside this chunk):
;   r2  - sample run, read as 16-bit words at [r2 + 2], [r2 + 4], [r2 + 18], [r2 + 20],
;         [r2 + 28] and [r2 + 34] (32 bytes each)
;   r3  - weight table base, indexed in 32-byte entries as [r3 + k * 32]; the bracketed
;         number after each first pmaddwd is the original authors' label for that entry
;   r6d - store-path selector: tested once on entry, consumed by the jnz inside the macro
;   r0, r1, r4 - output base, row step and fourth-row offset used by the macro
; Each stanza computes one vector: pmaddwd of interleaved neighbouring samples with a
; weight pair, plus pd_16, logical shift right by 5, then packusdw of the two halves.
; (pd_16 is defined elsewhere; by its name it holds dwords of 16 - confirm.)
; None of the SIMD instructions or the macro's lea write EFLAGS, which is what lets
; both macro invocations reuse the single test below.
; Uses m0-m12, i.e. registers above m7, although cglobal is given no register counts.
10505 cglobal ang16_mode_3_33
10506 test r6d, r6d
10507
10508 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
10509 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
10510
10511 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
10512 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
10513
10514 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
10515 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
10516 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
10517 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
10518
10519 pmaddwd m4, m3, [r3 + 10 * 32] ; [26]
10520 paddd m4, [pd_16]
10521 psrld m4, 5
10522 pmaddwd m5, m0, [r3 + 10 * 32]
10523 paddd m5, [pd_16]
10524 psrld m5, 5
10525 packusdw m4, m5
10526
10527 palignr m5, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2]
10528 pmaddwd m5, [r3 + 4 * 32] ; [20]
10529 paddd m5, [pd_16]
10530 psrld m5, 5
10531 palignr m6, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6]
10532 pmaddwd m6, [r3 + 4 * 32]
10533 paddd m6, [pd_16]
10534 psrld m6, 5
10535 packusdw m5, m6
10536
10537 palignr m6, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3]
10538 pmaddwd m6, [r3 - 2 * 32] ; [14]
10539 paddd m6, [pd_16]
10540 psrld m6, 5
10541 palignr m7, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7]
10542 pmaddwd m7, [r3 - 2 * 32]
10543 paddd m7, [pd_16]
10544 psrld m7, 5
10545 packusdw m6, m7
10546
10547 palignr m7, m0, m3, 12 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
10548 pmaddwd m7, [r3 - 8 * 32] ; [8]
10549 paddd m7, [pd_16]
10550 psrld m7, 5
10551 palignr m8, m2, m0, 12 ; [20 19 19 18 18 17 17 16 12 11 11 10 10 9 9 8]
10552 pmaddwd m8, [r3 - 8 * 32]
10553 paddd m8, [pd_16]
10554 psrld m8, 5
10555 packusdw m7, m8
10556
10557 pmaddwd m8, m0, [r3 - 14 * 32] ; [2]
10558 paddd m8, [pd_16]
10559 psrld m8, 5
10560 pmaddwd m3, m2, [r3 - 14 * 32] ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
10561 paddd m3, [pd_16]
10562 psrld m3, 5
10563 packusdw m8, m3
10564
10565 pmaddwd m9, m0, [r3 + 12 * 32] ; [28]
10566 paddd m9, [pd_16]
10567 psrld m9, 5
10568 pmaddwd m3, m2, [r3 + 12 * 32] ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
10569 paddd m3, [pd_16]
10570 psrld m3, 5
10571 packusdw m9, m3
10572
10573 palignr m10, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6]
10574 pmaddwd m10, [r3 + 6 * 32] ; [22]
10575 paddd m10, [pd_16]
10576 psrld m10, 5
10577 palignr m3, m1, m2, 4 ; [22 21 21 20 20 19 19 18 14 13 13 12 12 11 11 10]
10578 pmaddwd m3, [r3 + 6 * 32]
10579 paddd m3, [pd_16]
10580 psrld m3, 5
10581 packusdw m10, m3
10582
10583 palignr m11, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7]
10584 pmaddwd m11, [r3] ; [16]
10585 paddd m11, [pd_16]
10586 psrld m11, 5
10587 palignr m3, m1, m2, 8 ; [23 22 22 21 21 20 20 19 15 14 14 13 13 12 12 11]
10588 pmaddwd m3, [r3]
10589 paddd m3, [pd_16]
10590 psrld m3, 5
10591 packusdw m11, m3
10592
; first eight vectors (m4-m11); m12 and m3 are scratch for the transposing path
10593 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
10594
10595 palignr m4, m2, m0, 12 ; [20 19 19 18 18 17 17 16 12 11 11 10 10 9 9 8]
10596 pmaddwd m4, [r3 - 6 * 32] ; [10]
10597 paddd m4, [pd_16]
10598 psrld m4, 5
10599 palignr m5, m1, m2, 12 ; [24 23 23 22 22 21 21 20 16 15 15 14 14 13 13 12]
10600 pmaddwd m5, [r3 - 6 * 32]
10601 paddd m5, [pd_16]
10602 psrld m5, 5
10603 packusdw m4, m5
10604
10605 pmaddwd m5, m2, [r3 - 12 * 32] ; [4]
10606 paddd m5, [pd_16]
10607 psrld m5, 5
10608 pmaddwd m6, m1, [r3 - 12 * 32]
10609 paddd m6, [pd_16]
10610 psrld m6, 5
10611 packusdw m5, m6
10612
10613 movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
10614 pmaddwd m6, m2, [r3 + 14 * 32] ; [30]
10615 paddd m6, [pd_16]
10616 psrld m6, 5
10617 pmaddwd m7, m1, [r3 + 14 * 32]
10618 paddd m7, [pd_16]
10619 psrld m7, 5
10620 packusdw m6, m7
10621
10622 palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18]
10623 punpcklwd m0, m3 ; [29 28 28 27 27 26 26 25 21 20 20 19 19 18 18 17]
10624
10625 palignr m7, m1, m2, 4
10626 pmaddwd m7, [r3 + 8 * 32] ; [24]
10627 paddd m7, [pd_16]
10628 psrld m7, 5
10629 palignr m8, m0, m1, 4
10630 pmaddwd m8, [r3 + 8 * 32]
10631 paddd m8, [pd_16]
10632 psrld m8, 5
10633 packusdw m7, m8
10634
10635 palignr m8, m1, m2, 8
10636 pmaddwd m8, [r3 + 2 * 32] ; [18]
10637 paddd m8, [pd_16]
10638 psrld m8, 5
10639 palignr m9, m0, m1, 8
10640 pmaddwd m9, [r3 + 2 * 32]
10641 paddd m9, [pd_16]
10642 psrld m9, 5
10643 packusdw m8, m9
10644
10645 palignr m9, m1, m2, 12
10646 pmaddwd m9, [r3 - 4 * 32] ; [12]
10647 paddd m9, [pd_16]
10648 psrld m9, 5
10649 palignr m3, m0, m1, 12
10650 pmaddwd m3, [r3 - 4 * 32]
10651 paddd m3, [pd_16]
10652 psrld m3, 5
10653 packusdw m9, m3
10654
10655 pmaddwd m1, [r3 - 10 * 32] ; [6]
10656 paddd m1, [pd_16]
10657 psrld m1, 5
10658 pmaddwd m0, [r3 - 10 * 32]
10659 paddd m0, [pd_16]
10660 psrld m0, 5
10661 packusdw m1, m0
10662
10663 movu m2, [r2 + 28] ; last vector is a plain copy of 16 samples, no weighting
; second eight vectors (m4-m9, m1, m2); m0 and m3 are scratch for the transposing path
10664 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 2, 0, 3, 16
10665 ret
10666
10667 ;; angle 16, modes 4 and 32
; Shared AVX2 kernel that produces sixteen 16-word vectors and stores them through
; TRANSPOSE_STORE_AVX2 in two groups of eight.
; Inputs, as used below (callers are outside this chunk):
;   r2  - sample run, read as 16-bit words at [r2 + 2], [r2 + 4], [r2 + 18], [r2 + 20]
;         and [r2 + 34] (32 bytes each)
;   r3  - weight table base, indexed in 32-byte entries as [r3 + k * 32]; the bracketed
;         number after each first pmaddwd is the original authors' label for that entry
;   r6d - store-path selector: tested once on entry, consumed by the jnz inside the macro
;   r0, r1, r4 - output base, row step and fourth-row offset used by the macro
; Each stanza computes one vector: pmaddwd of interleaved neighbouring samples with a
; weight pair, plus pd_16, logical shift right by 5, then packusdw of the two halves.
; Where two consecutive vectors use the same sample alignment, the palignr result is
; kept in a register and multiplied by two different weight entries.
; None of the SIMD instructions or the macro's lea write EFLAGS, which is what lets
; both macro invocations reuse the single test below.
; Uses m0-m12, i.e. registers above m7, although cglobal is given no register counts.
10668 cglobal ang16_mode_4_32
10669 test r6d, r6d
10670
10671 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
10672 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
10673
10674 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
10675 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
10676
10677 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
10678 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
10679 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
10680 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
10681
10682 pmaddwd m4, m3, [r3 + 3 * 32] ; [21]
10683 paddd m4, [pd_16]
10684 psrld m4, 5
10685 pmaddwd m5, m0, [r3 + 3 * 32]
10686 paddd m5, [pd_16]
10687 psrld m5, 5
10688 packusdw m4, m5
10689
10690 palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2]
10691 pmaddwd m5, m6, [r3 - 8 * 32] ; [10]
10692 paddd m5, [pd_16]
10693 psrld m5, 5
10694 palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6]
10695 pmaddwd m8, m7, [r3 - 8 * 32]
10696 paddd m8, [pd_16]
10697 psrld m8, 5
10698 packusdw m5, m8
10699
10700 pmaddwd m6, [r3 + 13 * 32] ; [31]
10701 paddd m6, [pd_16]
10702 psrld m6, 5
10703 pmaddwd m7, [r3 + 13 * 32]
10704 paddd m7, [pd_16]
10705 psrld m7, 5
10706 packusdw m6, m7
10707
10708 palignr m7, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3]
10709 pmaddwd m7, [r3 + 2 * 32] ; [20]
10710 paddd m7, [pd_16]
10711 psrld m7, 5
10712 palignr m8, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7]
10713 pmaddwd m8, [r3 + 2 * 32]
10714 paddd m8, [pd_16]
10715 psrld m8, 5
10716 packusdw m7, m8
10717
10718 palignr m9, m0, m3, 12
10719 pmaddwd m8, m9, [r3 - 9 * 32] ; [9]
10720 paddd m8, [pd_16]
10721 psrld m8, 5
10722 palignr m3, m2, m0, 12
10723 pmaddwd m10, m3, [r3 - 9 * 32]
10724 paddd m10, [pd_16]
10725 psrld m10, 5
10726 packusdw m8, m10
10727
10728 pmaddwd m9, [r3 + 12 * 32] ; [30]
10729 paddd m9, [pd_16]
10730 psrld m9, 5
10731 pmaddwd m3, [r3 + 12 * 32]
10732 paddd m3, [pd_16]
10733 psrld m3, 5
10734 packusdw m9, m3
10735
10736 pmaddwd m10, m0, [r3 + 1 * 32] ; [19]
10737 paddd m10, [pd_16]
10738 psrld m10, 5
10739 pmaddwd m3, m2, [r3 + 1 * 32]
10740 paddd m3, [pd_16]
10741 psrld m3, 5
10742 packusdw m10, m3
10743
10744 palignr m11, m2, m0, 4
10745 pmaddwd m11, [r3 - 10 * 32] ; [8]
10746 paddd m11, [pd_16]
10747 psrld m11, 5
10748 palignr m3, m1, m2, 4
10749 pmaddwd m3, [r3 - 10 * 32]
10750 paddd m3, [pd_16]
10751 psrld m3, 5
10752 packusdw m11, m3
10753
; first eight vectors (m4-m11); m12 and m3 are scratch for the transposing path
10754 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
10755
10756 palignr m4, m2, m0, 4
10757 pmaddwd m4, [r3 + 11 * 32] ; [29]
10758 paddd m4, [pd_16]
10759 psrld m4, 5
10760 palignr m5, m1, m2, 4
10761 pmaddwd m5, [r3 + 11 * 32]
10762 paddd m5, [pd_16]
10763 psrld m5, 5
10764 packusdw m4, m5
10765
10766 palignr m5, m2, m0, 8
10767 pmaddwd m5, [r3] ; [18]
10768 paddd m5, [pd_16]
10769 psrld m5, 5
10770 palignr m6, m1, m2, 8
10771 pmaddwd m6, [r3]
10772 paddd m6, [pd_16]
10773 psrld m6, 5
10774 packusdw m5, m6
10775
10776 palignr m7, m2, m0, 12
10777 pmaddwd m6, m7, [r3 - 11 * 32] ; [7]
10778 paddd m6, [pd_16]
10779 psrld m6, 5
10780 palignr m8, m1, m2, 12
10781 pmaddwd m3, m8, [r3 - 11 * 32]
10782 paddd m3, [pd_16]
10783 psrld m3, 5
10784 packusdw m6, m3
10785
10786 pmaddwd m7, [r3 + 10 * 32] ; [28]
10787 paddd m7, [pd_16]
10788 psrld m7, 5
10789 pmaddwd m8, [r3 + 10 * 32]
10790 paddd m8, [pd_16]
10791 psrld m8, 5
10792 packusdw m7, m8
10793
10794 movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
10795 pmaddwd m8, m2, [r3 - 1 * 32] ; [17]
10796 paddd m8, [pd_16]
10797 psrld m8, 5
10798 pmaddwd m9, m1, [r3 - 1 * 32]
10799 paddd m9, [pd_16]
10800 psrld m9, 5
10801 packusdw m8, m9
10802
10803 palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18]
10804 punpcklwd m0, m3 ; [29 28 28 27 27 26 26 25 21 20 20 19 19 18 18 17]
10805
10806 palignr m10, m1, m2, 4
10807 pmaddwd m9, m10, [r3 - 12 * 32] ; [6]
10808 paddd m9, [pd_16]
10809 psrld m9, 5
10810 palignr m11, m0, m1, 4
10811 pmaddwd m3, m11, [r3 - 12 * 32]
10812 paddd m3, [pd_16]
10813 psrld m3, 5
10814 packusdw m9, m3
10815
10816 pmaddwd m10, [r3 + 9 * 32] ; [27]
10817 paddd m10, [pd_16]
10818 psrld m10, 5
10819 pmaddwd m11, [r3 + 9 * 32]
10820 paddd m11, [pd_16]
10821 psrld m11, 5
10822 packusdw m10, m11
10823
10824 palignr m3, m1, m2, 8
10825 pmaddwd m3, [r3 - 2 * 32] ; [16]
10826 paddd m3, [pd_16]
10827 psrld m3, 5
10828 palignr m0, m1, 8
10829 pmaddwd m0, [r3 - 2 * 32]
10830 paddd m0, [pd_16]
10831 psrld m0, 5
10832 packusdw m3, m0
; second eight vectors (m4-m10, m3); m0 and m1 are scratch for the transposing path
10833 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16
10834 ret
10835
10836 ;; angle 16, modes 5 and 31
; Shared AVX2 kernel that produces sixteen 16-word vectors and stores them through
; TRANSPOSE_STORE_AVX2 in two groups of eight.
; Inputs, as used below (callers are outside this chunk):
;   r2  - sample run, read as 16-bit words at [r2 + 2], [r2 + 4], [r2 + 18] and
;         [r2 + 20] (32 bytes each)
;   r3  - weight table base, indexed in 32-byte entries as [r3 + k * 32]; the bracketed
;         number after each first pmaddwd is the original authors' label for that entry
;   r6d - store-path selector: tested once on entry, consumed by the jnz inside the macro
;   r0, r1, r4 - output base, row step and fourth-row offset used by the macro
; Each stanza computes one vector: pmaddwd of interleaved neighbouring samples with a
; weight pair, plus pd_16, logical shift right by 5, then packusdw of the two halves.
; Here every sample alignment is used for two consecutive vectors, so each palignr
; result is kept in a register and multiplied by two different weight entries.
; None of the SIMD instructions or the macro's lea write EFLAGS, which is what lets
; both macro invocations reuse the single test below.
; Uses m0-m12, i.e. registers above m7, although cglobal is given no register counts.
10837 cglobal ang16_mode_5_31
10838 test r6d, r6d
10839
10840 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
10841 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
10842
10843 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
10844 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
10845
10846 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
10847 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
10848 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
10849 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
10850
10851 pmaddwd m4, m3, [r3 + 1 * 32] ; [17]
10852 paddd m4, [pd_16]
10853 psrld m4, 5
10854 pmaddwd m5, m0, [r3 + 1 * 32]
10855 paddd m5, [pd_16]
10856 psrld m5, 5
10857 packusdw m4, m5
10858
10859 palignr m6, m0, m3, 4
10860 pmaddwd m5, m6, [r3 - 14 * 32] ; [2]
10861 paddd m5, [pd_16]
10862 psrld m5, 5
10863 palignr m7, m2, m0, 4
10864 pmaddwd m8, m7, [r3 - 14 * 32]
10865 paddd m8, [pd_16]
10866 psrld m8, 5
10867 packusdw m5, m8
10868
10869 pmaddwd m6, [r3 + 3 * 32] ; [19]
10870 paddd m6, [pd_16]
10871 psrld m6, 5
10872 pmaddwd m7, [r3 + 3 * 32]
10873 paddd m7, [pd_16]
10874 psrld m7, 5
10875 packusdw m6, m7
10876
10877 palignr m8, m0, m3, 8
10878 pmaddwd m7, m8, [r3 - 12 * 32] ; [4]
10879 paddd m7, [pd_16]
10880 psrld m7, 5
10881 palignr m9, m2, m0, 8
10882 pmaddwd m10, m9, [r3 - 12 * 32]
10883 paddd m10, [pd_16]
10884 psrld m10, 5
10885 packusdw m7, m10
10886
10887 pmaddwd m8, [r3 + 5 * 32] ; [21]
10888 paddd m8, [pd_16]
10889 psrld m8, 5
10890 pmaddwd m9, [r3 + 5 * 32]
10891 paddd m9, [pd_16]
10892 psrld m9, 5
10893 packusdw m8, m9
10894
10895 palignr m10, m0, m3, 12
10896 pmaddwd m9, m10, [r3 - 10 * 32] ; [6]
10897 paddd m9, [pd_16]
10898 psrld m9, 5
10899 palignr m11, m2, m0, 12
10900 pmaddwd m3, m11, [r3 - 10 * 32]
10901 paddd m3, [pd_16]
10902 psrld m3, 5
10903 packusdw m9, m3
10904
10905 pmaddwd m10, [r3 + 7 * 32] ; [23]
10906 paddd m10, [pd_16]
10907 psrld m10, 5
10908 pmaddwd m11, [r3 + 7 * 32]
10909 paddd m11, [pd_16]
10910 psrld m11, 5
10911 packusdw m10, m11
10912
10913 pmaddwd m11, m0, [r3 - 8 * 32] ; [8]
10914 paddd m11, [pd_16]
10915 psrld m11, 5
10916 pmaddwd m3, m2, [r3 - 8 * 32]
10917 paddd m3, [pd_16]
10918 psrld m3, 5
10919 packusdw m11, m3
10920
; first eight vectors (m4-m11); m12 and m3 are scratch for the transposing path
10921 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
10922
10923 pmaddwd m4, m0, [r3 + 9 * 32] ; [25]
10924 paddd m4, [pd_16]
10925 psrld m4, 5
10926 pmaddwd m5, m2, [r3 + 9 * 32]
10927 paddd m5, [pd_16]
10928 psrld m5, 5
10929 packusdw m4, m5
10930
10931 palignr m6, m2, m0, 4
10932 pmaddwd m5, m6, [r3 - 6 * 32] ; [10]
10933 paddd m5, [pd_16]
10934 psrld m5, 5
10935 palignr m7, m1, m2, 4
10936 pmaddwd m3, m7, [r3 - 6 * 32]
10937 paddd m3, [pd_16]
10938 psrld m3, 5
10939 packusdw m5, m3
10940
10941 pmaddwd m6, [r3 + 11 * 32] ; [27]
10942 paddd m6, [pd_16]
10943 psrld m6, 5
10944 pmaddwd m7, [r3 + 11 * 32]
10945 paddd m7, [pd_16]
10946 psrld m7, 5
10947 packusdw m6, m7
10948
10949 palignr m8, m2, m0, 8
10950 pmaddwd m7, m8, [r3 - 4 * 32] ; [12]
10951 paddd m7, [pd_16]
10952 psrld m7, 5
10953 palignr m9, m1, m2, 8
10954 pmaddwd m3, m9, [r3 - 4 * 32]
10955 paddd m3, [pd_16]
10956 psrld m3, 5
10957 packusdw m7, m3
10958
10959 pmaddwd m8, [r3 + 13 * 32] ; [29]
10960 paddd m8, [pd_16]
10961 psrld m8, 5
10962 pmaddwd m9, [r3 + 13 * 32]
10963 paddd m9, [pd_16]
10964 psrld m9, 5
10965 packusdw m8, m9
10966
10967 palignr m10, m2, m0, 12
10968 pmaddwd m9, m10, [r3 - 2 * 32] ; [14]
10969 paddd m9, [pd_16]
10970 psrld m9, 5
10971 palignr m11, m1, m2, 12
10972 pmaddwd m3, m11, [r3 - 2 * 32]
10973 paddd m3, [pd_16]
10974 psrld m3, 5
10975 packusdw m9, m3
10976
10977 pmaddwd m10, [r3 + 15 * 32] ; [31]
10978 paddd m10, [pd_16]
10979 psrld m10, 5
10980 pmaddwd m11, [r3 + 15 * 32]
10981 paddd m11, [pd_16]
10982 psrld m11, 5
10983 packusdw m10, m11
10984
10985 pmaddwd m2, [r3] ; [16]
10986 paddd m2, [pd_16]
10987 psrld m2, 5
10988 pmaddwd m1, [r3]
10989 paddd m1, [pd_16]
10990 psrld m1, 5
10991 packusdw m2, m1
; second eight vectors (m4-m10, m2); m0 and m1 are scratch for the transposing path
10992 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16
10993 ret
10994
10995 ;; angle 16, modes 6 and 30
; Helper kernel shared by the 16-wide angular predictors for modes 6 and 30.
; It is declared with a bare cglobal (no argument or register counts), so all
; inputs are whatever the caller left in registers (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +2, +4, +18, +20)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
; Sixteen result rows are built in two batches of eight; each batch is passed
; to TRANSPOSE_STORE_AVX2, a macro defined elsewhere in this file.
; Registers written directly: m0-m12.  m13 is only handed to the macro.
; The bracketed number after each pmaddwd is the original author's label for
; the weight row selected by that r3 offset.
10996 cglobal ang16_mode_6_30
10997 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
10998
10999 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11000 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
11001
11002 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
11003 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
11004
11005 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
11006 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
11007 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
11008 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
11009
; Recurring pattern for every row below: pmaddwd multiplies each interleaved
; pair of neighbouring samples by the word pair stored in the weight row and
; sums the two products; pd_16 (defined elsewhere) is added and the sum is
; shifted right by 5; packusdw then merges the low/high dword results into
; one register of 16 words.
11010 pmaddwd m4, m3, [r3 - 2 * 32] ; [13]
11011 paddd m4, [pd_16]
11012 psrld m4, 5
11013 pmaddwd m5, m0, [r3 - 2 * 32]
11014 paddd m5, [pd_16]
11015 psrld m5, 5
11016 packusdw m4, m5
11017
11018 pmaddwd m5, m3, [r3 + 11 * 32] ; [26]
11019 paddd m5, [pd_16]
11020 psrld m5, 5
11021 pmaddwd m8, m0, [r3 + 11 * 32]
11022 paddd m8, [pd_16]
11023 psrld m8, 5
11024 packusdw m5, m8
11025
; palignr by 4 bytes advances the interleaved pairs by one sample per lane;
; the shifted registers m7/m8 serve rows [7] and [20].
11026 palignr m7, m0, m3, 4
11027 pmaddwd m6, m7, [r3 - 8 * 32] ; [7]
11028 paddd m6, [pd_16]
11029 psrld m6, 5
11030 palignr m8, m2, m0, 4
11031 pmaddwd m9, m8, [r3 - 8 * 32]
11032 paddd m9, [pd_16]
11033 psrld m9, 5
11034 packusdw m6, m9
11035
11036 pmaddwd m7, [r3 + 5 * 32] ; [20]
11037 paddd m7, [pd_16]
11038 psrld m7, 5
11039 pmaddwd m8, [r3 + 5 * 32]
11040 paddd m8, [pd_16]
11041 psrld m8, 5
11042 packusdw m7, m8
11043
; Shift by two samples (8 bytes); m10/m11 are shared by rows [1], [14], [27].
11044 palignr m10, m0, m3, 8
11045 pmaddwd m8, m10, [r3 - 14 * 32] ; [1]
11046 paddd m8, [pd_16]
11047 psrld m8, 5
11048 palignr m11, m2, m0, 8
11049 pmaddwd m9, m11, [r3 - 14 * 32]
11050 paddd m9, [pd_16]
11051 psrld m9, 5
11052 packusdw m8, m9
11053
11054 pmaddwd m9, m10, [r3 - 1 * 32] ; [14]
11055 paddd m9, [pd_16]
11056 psrld m9, 5
11057 pmaddwd m12, m11, [r3 - 1 * 32]
11058 paddd m12, [pd_16]
11059 psrld m12, 5
11060 packusdw m9, m12
11061
11062 pmaddwd m10, [r3 + 12 * 32] ; [27]
11063 paddd m10, [pd_16]
11064 psrld m10, 5
11065 pmaddwd m11, [r3 + 12 * 32]
11066 paddd m11, [pd_16]
11067 psrld m11, 5
11068 packusdw m10, m11
11069
; Shift by three samples (12 bytes) for row [8].
11070 palignr m11, m0, m3, 12
11071 pmaddwd m11, [r3 - 7 * 32] ; [8]
11072 paddd m11, [pd_16]
11073 psrld m11, 5
11074 palignr m12, m2, m0, 12
11075 pmaddwd m12, [r3 - 7 * 32]
11076 paddd m12, [pd_16]
11077 psrld m12, 5
11078 packusdw m11, m12
11079
; First batch: rows in m4-m11.  m12/m13 and the trailing 0 are further macro
; operands; their exact role is defined by the macro (not visible here).
11080 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
11081
; Second batch.  The 12-byte shifted pairs are rebuilt because m11/m12 were
; consumed by row [8] above.
11082 palignr m4, m0, m3, 12
11083 pmaddwd m4, [r3 + 6 * 32] ; [21]
11084 paddd m4, [pd_16]
11085 psrld m4, 5
11086 palignr m5, m2, m0, 12
11087 pmaddwd m5, [r3 + 6 * 32]
11088 paddd m5, [pd_16]
11089 psrld m5, 5
11090 packusdw m4, m5
11091
; From here m3 (the lowest interleave) is no longer read as a source and is
; reused as a temporary; the window moves on to m0 / m2 / m1.
11092 pmaddwd m5, m0, [r3 - 13 * 32] ; [2]
11093 paddd m5, [pd_16]
11094 psrld m5, 5
11095 pmaddwd m3, m2, [r3 - 13 * 32]
11096 paddd m3, [pd_16]
11097 psrld m3, 5
11098 packusdw m5, m3
11099
11100 pmaddwd m6, m0, [r3] ; [15]
11101 paddd m6, [pd_16]
11102 psrld m6, 5
11103 pmaddwd m7, m2, [r3]
11104 paddd m7, [pd_16]
11105 psrld m7, 5
11106 packusdw m6, m7
11107
11108 pmaddwd m7, m0, [r3 + 13 * 32] ; [28]
11109 paddd m7, [pd_16]
11110 psrld m7, 5
11111 pmaddwd m3, m2, [r3 + 13 * 32]
11112 paddd m3, [pd_16]
11113 psrld m3, 5
11114 packusdw m7, m3
11115
11116 palignr m9, m2, m0, 4
11117 pmaddwd m8, m9, [r3 - 6 * 32] ; [9]
11118 paddd m8, [pd_16]
11119 psrld m8, 5
11120 palignr m3, m1, m2, 4
11121 pmaddwd m10, m3, [r3 - 6 * 32]
11122 paddd m10, [pd_16]
11123 psrld m10, 5
11124 packusdw m8, m10
11125
11126 pmaddwd m9, [r3 + 7 * 32] ; [22]
11127 paddd m9, [pd_16]
11128 psrld m9, 5
11129 pmaddwd m3, [r3 + 7 * 32]
11130 paddd m3, [pd_16]
11131 psrld m3, 5
11132 packusdw m9, m3
11133
11134 palignr m11, m2, m0, 8
11135 pmaddwd m10, m11, [r3 - 12 * 32] ; [3]
11136 paddd m10, [pd_16]
11137 psrld m10, 5
11138 palignr m3, m1, m2, 8
11139 pmaddwd m12, m3, [r3 - 12 * 32]
11140 paddd m12, [pd_16]
11141 psrld m12, 5
11142 packusdw m10, m12
11143
11144 pmaddwd m11, [r3 + 1 * 32] ; [16]
11145 paddd m11, [pd_16]
11146 psrld m11, 5
11147 pmaddwd m3, [r3 + 1 * 32]
11148 paddd m3, [pd_16]
11149 psrld m3, 5
11150 packusdw m11, m3
; Second batch: rows in m4-m11, with m0/m1 and 16 as the remaining operands.
11151 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
11152 ret
11153
11154 ;; angle 16, modes 7 and 29
; Helper kernel shared by the 16-wide angular predictors for modes 7 and 29.
; Declared with a bare cglobal, so inputs are taken from registers prepared by
; the caller (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +2, +4, +18, +20)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
; Sixteen rows are produced in two batches of eight, each passed to
; TRANSPOSE_STORE_AVX2 (macro defined elsewhere in this file).
; Registers written directly: m0-m12.
11155 cglobal ang16_mode_7_29
11156 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
11157
11158 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11159 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
11160
11161 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
11162 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
11163
; Only the low interleave of the second load pair is kept here (m2).
11164 movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
11165 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
11166 punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
11167
; Recurring pattern for every row: pmaddwd applies the weight row to each
; interleaved sample pair and sums the products, pd_16 (defined elsewhere) is
; added, the sum is shifted right by 5, and packusdw merges both dword halves
; into 16 words.
11168 pmaddwd m4, m3, [r3 - 8 * 32] ; [9]
11169 paddd m4, [pd_16]
11170 psrld m4, 5
11171 pmaddwd m5, m0, [r3 - 8 * 32]
11172 paddd m5, [pd_16]
11173 psrld m5, 5
11174 packusdw m4, m5
11175
11176 pmaddwd m5, m3, [r3 + 1 * 32] ; [18]
11177 paddd m5, [pd_16]
11178 psrld m5, 5
11179 pmaddwd m8, m0, [r3 + 1 * 32]
11180 paddd m8, [pd_16]
11181 psrld m8, 5
11182 packusdw m5, m8
11183
11184 pmaddwd m6, m3, [r3 + 10 * 32] ; [27]
11185 paddd m6, [pd_16]
11186 psrld m6, 5
11187 pmaddwd m9, m0, [r3 + 10 * 32]
11188 paddd m9, [pd_16]
11189 psrld m9, 5
11190 packusdw m6, m9
11191
; Shift by one sample (4 bytes); m10/m11 feed rows [4], [13], [22], [31].
11192 palignr m10, m0, m3, 4
11193 pmaddwd m7, m10, [r3 - 13 * 32] ; [4]
11194 paddd m7, [pd_16]
11195 psrld m7, 5
11196 palignr m11, m2, m0, 4
11197 pmaddwd m8, m11, [r3 - 13 * 32]
11198 paddd m8, [pd_16]
11199 psrld m8, 5
11200 packusdw m7, m8
11201
11202 pmaddwd m8, m10, [r3 - 4 * 32] ; [13]
11203 paddd m8, [pd_16]
11204 psrld m8, 5
11205 pmaddwd m9, m11, [r3 - 4 * 32]
11206 paddd m9, [pd_16]
11207 psrld m9, 5
11208 packusdw m8, m9
11209
11210 pmaddwd m9, m10, [r3 + 5 * 32] ; [22]
11211 paddd m9, [pd_16]
11212 psrld m9, 5
11213 pmaddwd m12, m11, [r3 + 5 * 32]
11214 paddd m12, [pd_16]
11215 psrld m12, 5
11216 packusdw m9, m12
11217
11218 pmaddwd m10, [r3 + 14 * 32] ; [31]
11219 paddd m10, [pd_16]
11220 psrld m10, 5
11221 pmaddwd m11, [r3 + 14 * 32]
11222 paddd m11, [pd_16]
11223 psrld m11, 5
11224 packusdw m10, m11
11225
; Shift by two samples (8 bytes) for row [8].
11226 palignr m11, m0, m3, 8
11227 pmaddwd m11, [r3 - 9 * 32] ; [8]
11228 paddd m11, [pd_16]
11229 psrld m11, 5
11230 palignr m12, m2, m0, 8
11231 pmaddwd m12, [r3 - 9 * 32]
11232 paddd m12, [pd_16]
11233 psrld m12, 5
11234 packusdw m11, m12
11235
; First batch: rows in m4-m11.  m12/m1 and the trailing 0 are further macro
; operands whose role is defined by the macro (not visible here).
11236 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
11237
; Second batch.  The 8-byte shifted pairs are rebuilt into m5/m6 because
; m11/m12 were consumed by row [8] above.
11238 palignr m5, m0, m3, 8
11239 pmaddwd m4, m5, [r3] ; [17]
11240 paddd m4, [pd_16]
11241 psrld m4, 5
11242 palignr m6, m2, m0, 8
11243 pmaddwd m7, m6, [r3]
11244 paddd m7, [pd_16]
11245 psrld m7, 5
11246 packusdw m4, m7
11247
11248 pmaddwd m5, [r3 + 9 * 32] ; [26]
11249 paddd m5, [pd_16]
11250 psrld m5, 5
11251 pmaddwd m6, [r3 + 9 * 32]
11252 paddd m6, [pd_16]
11253 psrld m6, 5
11254 packusdw m5, m6
11255
; Shift by three samples (12 bytes).  m3 is overwritten with the high-half
; shifted pairs; its original contents are not read again after this point.
11256 palignr m9, m0, m3, 12
11257 pmaddwd m6, m9, [r3 - 14 * 32] ; [3]
11258 paddd m6, [pd_16]
11259 psrld m6, 5
11260 palignr m3, m2, m0, 12
11261 pmaddwd m7, m3, [r3 - 14 * 32]
11262 paddd m7, [pd_16]
11263 psrld m7, 5
11264 packusdw m6, m7
11265
11266 pmaddwd m7, m9, [r3 - 5 * 32] ; [12]
11267 paddd m7, [pd_16]
11268 psrld m7, 5
11269 pmaddwd m8, m3, [r3 - 5 * 32]
11270 paddd m8, [pd_16]
11271 psrld m8, 5
11272 packusdw m7, m8
11273
11274 pmaddwd m8, m9, [r3 + 4 * 32] ; [21]
11275 paddd m8, [pd_16]
11276 psrld m8, 5
11277 pmaddwd m10, m3, [r3 + 4 * 32]
11278 paddd m10, [pd_16]
11279 psrld m10, 5
11280 packusdw m8, m10
11281
11282 pmaddwd m9, [r3 + 13 * 32] ; [30]
11283 paddd m9, [pd_16]
11284 psrld m9, 5
11285 pmaddwd m3, [r3 + 13 * 32]
11286 paddd m3, [pd_16]
11287 psrld m3, 5
11288 packusdw m9, m3
11289
11290 pmaddwd m10, m0, [r3 - 10 * 32] ; [7]
11291 paddd m10, [pd_16]
11292 psrld m10, 5
11293 pmaddwd m12, m2, [r3 - 10 * 32]
11294 paddd m12, [pd_16]
11295 psrld m12, 5
11296 packusdw m10, m12
11297
; Last row is computed in place in m0/m2, which are not needed afterwards.
11298 pmaddwd m0, [r3 - 1 * 32] ; [16]
11299 paddd m0, [pd_16]
11300 psrld m0, 5
11301 pmaddwd m2, [r3 - 1 * 32]
11302 paddd m2, [pd_16]
11303 psrld m2, 5
11304 packusdw m0, m2
; Second batch: rows in m4-m10 plus m0, with m1/m2 and 16 as the remaining
; macro operands.
11305 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 16
11306 ret
11307
11308 ;; angle 16, modes 8 and 28
; Helper kernel shared by the 16-wide angular predictors for modes 8 and 28.
; Declared with a bare cglobal, so inputs are taken from registers prepared by
; the caller (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +2, +4, +18, +20)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
; Sixteen rows are produced in two batches of eight, each passed to
; TRANSPOSE_STORE_AVX2 (macro defined elsewhere in this file).
; Registers written directly: m0-m12.
11309 cglobal ang16_mode_8_28
11310 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
11311
11312 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11313 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
11314
11315 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
11316 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
11317
11318 movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
11319 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
11320 punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
11321
; Recurring pattern for every row: pmaddwd applies the weight row to each
; interleaved sample pair and sums the products, pd_16 (defined elsewhere) is
; added, the sum is shifted right by 5, and packusdw merges both dword halves
; into 16 words.  The first six rows all use the unshifted pairs m3/m0.
11322 pmaddwd m4, m3, [r3 - 10 * 32] ; [5]
11323 paddd m4, [pd_16]
11324 psrld m4, 5
11325 pmaddwd m5, m0, [r3 - 10 * 32]
11326 paddd m5, [pd_16]
11327 psrld m5, 5
11328 packusdw m4, m5
11329
11330 pmaddwd m5, m3, [r3 - 5 * 32] ; [10]
11331 paddd m5, [pd_16]
11332 psrld m5, 5
11333 pmaddwd m8, m0, [r3 - 5 * 32]
11334 paddd m8, [pd_16]
11335 psrld m8, 5
11336 packusdw m5, m8
11337
11338 pmaddwd m6, m3, [r3] ; [15]
11339 paddd m6, [pd_16]
11340 psrld m6, 5
11341 pmaddwd m9, m0, [r3]
11342 paddd m9, [pd_16]
11343 psrld m9, 5
11344 packusdw m6, m9
11345
11346 pmaddwd m7, m3, [r3 + 5 * 32] ; [20]
11347 paddd m7, [pd_16]
11348 psrld m7, 5
11349 pmaddwd m8, m0, [r3 + 5 * 32]
11350 paddd m8, [pd_16]
11351 psrld m8, 5
11352 packusdw m7, m8
11353
11354 pmaddwd m8, m3, [r3 + 10 * 32] ; [25]
11355 paddd m8, [pd_16]
11356 psrld m8, 5
11357 pmaddwd m9, m0, [r3 + 10 * 32]
11358 paddd m9, [pd_16]
11359 psrld m9, 5
11360 packusdw m8, m9
11361
11362 pmaddwd m9, m3, [r3 + 15 * 32] ; [30]
11363 paddd m9, [pd_16]
11364 psrld m9, 5
11365 pmaddwd m10, m0, [r3 + 15 * 32]
11366 paddd m10, [pd_16]
11367 psrld m10, 5
11368 packusdw m9, m10
11369
; Shift by one sample (4 bytes); m11/m1 feed rows [3] and [8].
11370 palignr m11, m0, m3, 4
11371 pmaddwd m10, m11, [r3 - 12 * 32] ; [3]
11372 paddd m10, [pd_16]
11373 psrld m10, 5
11374 palignr m1, m2, m0, 4
11375 pmaddwd m12, m1, [r3 - 12 * 32]
11376 paddd m12, [pd_16]
11377 psrld m12, 5
11378 packusdw m10, m12
11379
11380 pmaddwd m11, [r3 - 7 * 32] ; [8]
11381 paddd m11, [pd_16]
11382 psrld m11, 5
11383 pmaddwd m1, [r3 - 7 * 32]
11384 paddd m1, [pd_16]
11385 psrld m1, 5
11386 packusdw m11, m1
11387
; First batch: rows in m4-m11.  m12/m1 and the trailing 0 are further macro
; operands whose role is defined by the macro (not visible here).
11388 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
11389
; Second batch.  The 4-byte shifted pairs are rebuilt into m7/m1 because
; m11/m1 were consumed by row [8] above.
11390 palignr m7, m0, m3, 4
11391 pmaddwd m4, m7, [r3 - 2 * 32] ; [13]
11392 paddd m4, [pd_16]
11393 psrld m4, 5
11394 palignr m1, m2, m0, 4
11395 pmaddwd m5, m1, [r3 - 2 * 32]
11396 paddd m5, [pd_16]
11397 psrld m5, 5
11398 packusdw m4, m5
11399
11400 pmaddwd m5, m7, [r3 + 3 * 32] ; [18]
11401 paddd m5, [pd_16]
11402 psrld m5, 5
11403 pmaddwd m6, m1, [r3 + 3 * 32]
11404 paddd m6, [pd_16]
11405 psrld m6, 5
11406 packusdw m5, m6
11407
11408 pmaddwd m6, m7, [r3 + 8 * 32] ; [23]
11409 paddd m6, [pd_16]
11410 psrld m6, 5
11411 pmaddwd m8, m1, [r3 + 8 * 32]
11412 paddd m8, [pd_16]
11413 psrld m8, 5
11414 packusdw m6, m8
11415
11416 pmaddwd m7, [r3 + 13 * 32] ; [28]
11417 paddd m7, [pd_16]
11418 psrld m7, 5
11419 pmaddwd m1, [r3 + 13 * 32]
11420 paddd m1, [pd_16]
11421 psrld m1, 5
11422 packusdw m7, m1
11423
; Shift by two samples (8 bytes).  m2 is shifted in place; its unshifted
; value is not read again in this routine.
11424 palignr m1, m0, m3, 8
11425 pmaddwd m8, m1, [r3 - 14 * 32] ; [1]
11426 paddd m8, [pd_16]
11427 psrld m8, 5
11428 palignr m2, m0, 8
11429 pmaddwd m9, m2, [r3 - 14 * 32]
11430 paddd m9, [pd_16]
11431 psrld m9, 5
11432 packusdw m8, m9
11433
; m3 and then m0 are reused as temporaries below; the remaining rows only
; read the shifted pairs in m1/m2.
11434 pmaddwd m9, m1, [r3 - 9 * 32] ; [6]
11435 paddd m9, [pd_16]
11436 psrld m9, 5
11437 pmaddwd m3, m2, [r3 - 9 * 32]
11438 paddd m3, [pd_16]
11439 psrld m3, 5
11440 packusdw m9, m3
11441
11442 pmaddwd m3, m1, [r3 - 4 * 32] ; [11]
11443 paddd m3, [pd_16]
11444 psrld m3, 5
11445 pmaddwd m0, m2, [r3 - 4 * 32]
11446 paddd m0, [pd_16]
11447 psrld m0, 5
11448 packusdw m3, m0
11449
11450 pmaddwd m1, [r3 + 1 * 32] ; [16]
11451 paddd m1, [pd_16]
11452 psrld m1, 5
11453 pmaddwd m2, [r3 + 1 * 32]
11454 paddd m2, [pd_16]
11455 psrld m2, 5
11456 packusdw m1, m2
; Second batch: rows in m4-m9, m3, m1, with m0/m2 and 16 as the remaining
; macro operands.
11457 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
11458 ret
11459
11460 ;; angle 16, modes 9 and 27
; Helper kernel shared by the 16-wide angular predictors for modes 9 and 27.
; Declared with a bare cglobal, so inputs are taken from registers prepared by
; the caller (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +2, +4, +18, +20)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
; Every weighted row uses the same unshifted sample pairs (m3/m0); only the
; weight row changes, stepping by two table rows each time.  The final row of
; the second batch is an unweighted reload of [r2 + 4].
; Registers written directly: m0-m11.
11461 cglobal ang16_mode_9_27
11462 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
11463
11464 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11465 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
11466
11467 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
11468 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
11469
; NOTE(review): m2 built here is not read by any instruction in this routine;
; it is only passed to TRANSPOSE_STORE_AVX2 as an operand.  It looks unused --
; confirm against the macro definition before touching it.
11470 movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
11471 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
11472 punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
11473
; Recurring pattern for every row: pmaddwd applies the weight row to each
; interleaved sample pair and sums the products, pd_16 (defined elsewhere) is
; added, the sum is shifted right by 5, and packusdw merges both dword halves
; into 16 words.
11474 pmaddwd m4, m3, [r3 - 14 * 32] ; [2]
11475 paddd m4, [pd_16]
11476 psrld m4, 5
11477 pmaddwd m5, m0, [r3 - 14 * 32]
11478 paddd m5, [pd_16]
11479 psrld m5, 5
11480 packusdw m4, m5
11481
11482 pmaddwd m5, m3, [r3 - 12 * 32] ; [4]
11483 paddd m5, [pd_16]
11484 psrld m5, 5
11485 pmaddwd m8, m0, [r3 - 12 * 32]
11486 paddd m8, [pd_16]
11487 psrld m8, 5
11488 packusdw m5, m8
11489
11490 pmaddwd m6, m3, [r3 - 10 * 32] ; [6]
11491 paddd m6, [pd_16]
11492 psrld m6, 5
11493 pmaddwd m9, m0, [r3 - 10 * 32]
11494 paddd m9, [pd_16]
11495 psrld m9, 5
11496 packusdw m6, m9
11497
11498 pmaddwd m7, m3, [r3 - 8 * 32] ; [8]
11499 paddd m7, [pd_16]
11500 psrld m7, 5
11501 pmaddwd m8, m0, [r3 - 8 * 32]
11502 paddd m8, [pd_16]
11503 psrld m8, 5
11504 packusdw m7, m8
11505
11506 pmaddwd m8, m3, [r3 - 6 * 32] ; [10]
11507 paddd m8, [pd_16]
11508 psrld m8, 5
11509 pmaddwd m9, m0, [r3 - 6 * 32]
11510 paddd m9, [pd_16]
11511 psrld m9, 5
11512 packusdw m8, m9
11513
11514 pmaddwd m9, m3, [r3 - 4 * 32] ; [12]
11515 paddd m9, [pd_16]
11516 psrld m9, 5
11517 pmaddwd m10, m0, [r3 - 4 * 32]
11518 paddd m10, [pd_16]
11519 psrld m10, 5
11520 packusdw m9, m10
11521
11522 pmaddwd m10, m3, [r3 - 2 * 32] ; [14]
11523 paddd m10, [pd_16]
11524 psrld m10, 5
11525 pmaddwd m1, m0, [r3 - 2 * 32]
11526 paddd m1, [pd_16]
11527 psrld m1, 5
11528 packusdw m10, m1
11529
11530 pmaddwd m11, m3, [r3] ; [16]
11531 paddd m11, [pd_16]
11532 psrld m11, 5
11533 pmaddwd m1, m0, [r3]
11534 paddd m1, [pd_16]
11535 psrld m1, 5
11536 packusdw m11, m1
11537
; First batch: rows in m4-m11.  m2/m1 and the trailing 0 are further macro
; operands whose role is defined by the macro (not visible here).
11538 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
11539
; Second batch: same sample pairs, weight rows [18] .. [30].
11540 pmaddwd m4, m3, [r3 + 2 * 32] ; [18]
11541 paddd m4, [pd_16]
11542 psrld m4, 5
11543 pmaddwd m5, m0, [r3 + 2 * 32]
11544 paddd m5, [pd_16]
11545 psrld m5, 5
11546 packusdw m4, m5
11547
11548 pmaddwd m5, m3, [r3 + 4 * 32] ; [20]
11549 paddd m5, [pd_16]
11550 psrld m5, 5
11551 pmaddwd m6, m0, [r3 + 4 * 32]
11552 paddd m6, [pd_16]
11553 psrld m6, 5
11554 packusdw m5, m6
11555
11556 pmaddwd m6, m3, [r3 + 6 * 32] ; [22]
11557 paddd m6, [pd_16]
11558 psrld m6, 5
11559 pmaddwd m8, m0, [r3 + 6 * 32]
11560 paddd m8, [pd_16]
11561 psrld m8, 5
11562 packusdw m6, m8
11563
11564 pmaddwd m7, m3, [r3 + 8 * 32] ; [24]
11565 paddd m7, [pd_16]
11566 psrld m7, 5
11567 pmaddwd m1, m0, [r3 + 8 * 32]
11568 paddd m1, [pd_16]
11569 psrld m1, 5
11570 packusdw m7, m1
11571
11572 pmaddwd m8, m3, [r3 + 10 * 32] ; [26]
11573 paddd m8, [pd_16]
11574 psrld m8, 5
11575 pmaddwd m9, m0, [r3 + 10 * 32]
11576 paddd m9, [pd_16]
11577 psrld m9, 5
11578 packusdw m8, m9
11579
11580 pmaddwd m9, m3, [r3 + 12 * 32] ; [28]
11581 paddd m9, [pd_16]
11582 psrld m9, 5
11583 pmaddwd m1, m0, [r3 + 12 * 32]
11584 paddd m1, [pd_16]
11585 psrld m1, 5
11586 packusdw m9, m1
11587
; Row [30] is computed in place in m3/m0, which are not needed afterwards.
11588 pmaddwd m3, [r3 + 14 * 32] ; [30]
11589 paddd m3, [pd_16]
11590 psrld m3, 5
11591 pmaddwd m0, [r3 + 14 * 32]
11592 paddd m0, [pd_16]
11593 psrld m0, 5
11594 packusdw m3, m0
11595
; The eighth row of this batch is the samples at [r2 + 4] taken as-is, with
; no weighting applied.
11596 movu m1, [r2 + 4]
; Second batch: rows in m4-m9, m3, m1, with m0/m2 and 16 as the remaining
; macro operands.
11597 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
11598 ret
11599
11600 ;; angle 16, modes 11 and 25
; Helper kernel shared by the 16-wide angular predictors for modes 11 and 25.
; Declared with a bare cglobal, so inputs are taken from registers prepared by
; the caller (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +0 and +2)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
; Every weighted row uses the same unshifted sample pairs (m3/m0); only the
; weight row changes, stepping down by two table rows each time.  The final
; row of the second batch is an unweighted reload of [r2].
; Registers written directly: m0, m1, m3-m11.  m2 is only handed to the macro.
11601 cglobal ang16_mode_11_25
11602 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
11603
11604 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
11605 movu m1, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11606
11607 punpcklwd m3, m0, m1 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
11608 punpckhwd m0, m1 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
11609
; Recurring pattern for every row: pmaddwd applies the weight row to each
; interleaved sample pair and sums the products, pd_16 (defined elsewhere) is
; added, the sum is shifted right by 5, and packusdw merges both dword halves
; into 16 words.
11610 pmaddwd m4, m3, [r3 + 14 * 32] ; [30]
11611 paddd m4, [pd_16]
11612 psrld m4, 5
11613 pmaddwd m5, m0, [r3 + 14 * 32]
11614 paddd m5, [pd_16]
11615 psrld m5, 5
11616 packusdw m4, m5
11617
11618 pmaddwd m5, m3, [r3 + 12 * 32] ; [28]
11619 paddd m5, [pd_16]
11620 psrld m5, 5
11621 pmaddwd m8, m0, [r3 + 12 * 32]
11622 paddd m8, [pd_16]
11623 psrld m8, 5
11624 packusdw m5, m8
11625
11626 pmaddwd m6, m3, [r3 + 10 * 32] ; [26]
11627 paddd m6, [pd_16]
11628 psrld m6, 5
11629 pmaddwd m9, m0, [r3 + 10 * 32]
11630 paddd m9, [pd_16]
11631 psrld m9, 5
11632 packusdw m6, m9
11633
11634 pmaddwd m7, m3, [r3 + 8 * 32] ; [24]
11635 paddd m7, [pd_16]
11636 psrld m7, 5
11637 pmaddwd m8, m0, [r3 + 8 * 32]
11638 paddd m8, [pd_16]
11639 psrld m8, 5
11640 packusdw m7, m8
11641
11642 pmaddwd m8, m3, [r3 + 6 * 32] ; [22]
11643 paddd m8, [pd_16]
11644 psrld m8, 5
11645 pmaddwd m9, m0, [r3 + 6 * 32]
11646 paddd m9, [pd_16]
11647 psrld m9, 5
11648 packusdw m8, m9
11649
11650 pmaddwd m9, m3, [r3 + 4 * 32] ; [20]
11651 paddd m9, [pd_16]
11652 psrld m9, 5
11653 pmaddwd m10, m0, [r3 + 4 * 32]
11654 paddd m10, [pd_16]
11655 psrld m10, 5
11656 packusdw m9, m10
11657
11658 pmaddwd m10, m3, [r3 + 2 * 32] ; [18]
11659 paddd m10, [pd_16]
11660 psrld m10, 5
11661 pmaddwd m1, m0, [r3 + 2 * 32]
11662 paddd m1, [pd_16]
11663 psrld m1, 5
11664 packusdw m10, m1
11665
11666 pmaddwd m11, m3, [r3] ; [16]
11667 paddd m11, [pd_16]
11668 psrld m11, 5
11669 pmaddwd m1, m0, [r3]
11670 paddd m1, [pd_16]
11671 psrld m1, 5
11672 packusdw m11, m1
11673
; First batch: rows in m4-m11.  m2/m1 and the trailing 0 are further macro
; operands whose role is defined by the macro (not visible here).
11674 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
11675
; Second batch: same sample pairs, weight rows [14] down to [2].
11676 pmaddwd m4, m3, [r3 - 2 * 32] ; [14]
11677 paddd m4, [pd_16]
11678 psrld m4, 5
11679 pmaddwd m5, m0, [r3 - 2 * 32]
11680 paddd m5, [pd_16]
11681 psrld m5, 5
11682 packusdw m4, m5
11683
11684 pmaddwd m5, m3, [r3 - 4 * 32] ; [12]
11685 paddd m5, [pd_16]
11686 psrld m5, 5
11687 pmaddwd m6, m0, [r3 - 4 * 32]
11688 paddd m6, [pd_16]
11689 psrld m6, 5
11690 packusdw m5, m6
11691
11692 pmaddwd m6, m3, [r3 - 6 * 32] ; [10]
11693 paddd m6, [pd_16]
11694 psrld m6, 5
11695 pmaddwd m8, m0, [r3 - 6 * 32]
11696 paddd m8, [pd_16]
11697 psrld m8, 5
11698 packusdw m6, m8
11699
11700 pmaddwd m7, m3, [r3 - 8 * 32] ; [8]
11701 paddd m7, [pd_16]
11702 psrld m7, 5
11703 pmaddwd m1, m0, [r3 - 8 * 32]
11704 paddd m1, [pd_16]
11705 psrld m1, 5
11706 packusdw m7, m1
11707
11708 pmaddwd m8, m3, [r3 - 10 * 32] ; [6]
11709 paddd m8, [pd_16]
11710 psrld m8, 5
11711 pmaddwd m9, m0, [r3 - 10 * 32]
11712 paddd m9, [pd_16]
11713 psrld m9, 5
11714 packusdw m8, m9
11715
11716 pmaddwd m9, m3, [r3 - 12 * 32] ; [4]
11717 paddd m9, [pd_16]
11718 psrld m9, 5
11719 pmaddwd m1, m0, [r3 - 12 * 32]
11720 paddd m1, [pd_16]
11721 psrld m1, 5
11722 packusdw m9, m1
11723
; Row [2] is computed in place in m3/m0, which are not needed afterwards.
11724 pmaddwd m3, [r3 - 14 * 32] ; [2]
11725 paddd m3, [pd_16]
11726 psrld m3, 5
11727 pmaddwd m0, [r3 - 14 * 32]
11728 paddd m0, [pd_16]
11729 psrld m0, 5
11730 packusdw m3, m0
11731
; The eighth row of this batch is the samples at [r2] taken as-is, with no
; weighting applied.
11732 movu m1, [r2]
; Second batch: rows in m4-m9, m3, m1, with m0/m2 and 16 as the remaining
; macro operands.
11733 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
11734 ret
11735
11736 ;; angle 16, modes 12 and 24
; Helper kernel shared by the 16-wide angular predictors for modes 12 and 24.
; Declared with a bare cglobal, so inputs are taken from registers prepared by
; the caller (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +0 and +2)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
;   m1  - low 128-bit lane is read before this routine ever writes m1, so it
;         must arrive pre-loaded (per the lane comment at the vinserti128, with
;         duplicated extra reference samples)
; Sixteen rows are produced in two batches of eight, each passed to
; TRANSPOSE_STORE_AVX2 (macro defined elsewhere in this file).
; Registers written directly: m0-m13.
11737 cglobal ang16_mode_12_24
11738 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
11739
11740 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
11741 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11742
11743 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
11744 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
11745
; Recurring pattern for every row: pmaddwd applies the weight row to each
; interleaved sample pair and sums the products, pd_16 (defined elsewhere) is
; added, the sum is shifted right by 5, and packusdw merges both dword halves
; into 16 words.
11746 pmaddwd m4, m3, [r3 + 11 * 32] ; [27]
11747 paddd m4, [pd_16]
11748 psrld m4, 5
11749 pmaddwd m5, m2, [r3 + 11 * 32]
11750 paddd m5, [pd_16]
11751 psrld m5, 5
11752 packusdw m4, m5
11753
11754 pmaddwd m5, m3, [r3 + 6 * 32] ; [22]
11755 paddd m5, [pd_16]
11756 psrld m5, 5
11757 pmaddwd m8, m2, [r3 + 6 * 32]
11758 paddd m8, [pd_16]
11759 psrld m8, 5
11760 packusdw m5, m8
11761
11762 pmaddwd m6, m3, [r3 + 1 * 32] ; [17]
11763 paddd m6, [pd_16]
11764 psrld m6, 5
11765 pmaddwd m9, m2, [r3 + 1 * 32]
11766 paddd m9, [pd_16]
11767 psrld m9, 5
11768 packusdw m6, m9
11769
11770 pmaddwd m7, m3, [r3 - 4 * 32] ; [12]
11771 paddd m7, [pd_16]
11772 psrld m7, 5
11773 pmaddwd m8, m2, [r3 - 4 * 32]
11774 paddd m8, [pd_16]
11775 psrld m8, 5
11776 packusdw m7, m8
11777
11778 pmaddwd m8, m3, [r3 - 9 * 32] ; [7]
11779 paddd m8, [pd_16]
11780 psrld m8, 5
11781 pmaddwd m9, m2, [r3 - 9 * 32]
11782 paddd m9, [pd_16]
11783 psrld m9, 5
11784 packusdw m8, m9
11785
11786 pmaddwd m9, m3, [r3 - 14 * 32] ; [2]
11787 paddd m9, [pd_16]
11788 psrld m9, 5
11789 pmaddwd m2, [r3 - 14 * 32]
11790 paddd m2, [pd_16]
11791 psrld m2, 5
11792 packusdw m9, m2
11793
; Switch from interleaved neighbour pairs to duplicated samples so that the
; palignr shifts below can slide the window in one-word steps.
; vinserti128 keeps the caller-supplied low lane of m1 and replaces only its
; high lane with the low lane of m0.
11794 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
11795 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
11796 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 6 6 13 13 x x x x]
11797
; A 14-byte palignr pulls one word from the preceding register into the
; bottom of each lane.
11798 palignr m2, m3, m1, 14
11799 palignr m13, m0, m3, 14
11800
11801 pmaddwd m10, m2, [r3 + 13 * 32] ; [29]
11802 paddd m10, [pd_16]
11803 psrld m10, 5
11804 pmaddwd m12, m13, [r3 + 13 * 32]
11805 paddd m12, [pd_16]
11806 psrld m12, 5
11807 packusdw m10, m12
11808
11809 pmaddwd m11, m2, [r3 + 8 * 32] ; [24]
11810 paddd m11, [pd_16]
11811 psrld m11, 5
11812 pmaddwd m13, [r3 + 8 * 32]
11813 paddd m13, [pd_16]
11814 psrld m13, 5
11815 packusdw m11, m13
11816
; First batch: rows in m4-m11.  m12/m13 and the trailing 0 are further macro
; operands whose role is defined by the macro (not visible here).
11817 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
11818
; m13 was overwritten while computing row [24] (and was handed to the macro),
; so the 14-byte shifted high half is rebuilt; m2 still holds the low half.
11819 palignr m13, m0, m3, 14
11820
11821 pmaddwd m4, m2, [r3 + 3 * 32] ; [19]
11822 paddd m4, [pd_16]
11823 psrld m4, 5
11824 pmaddwd m5, m13, [r3 + 3 * 32]
11825 paddd m5, [pd_16]
11826 psrld m5, 5
11827 packusdw m4, m5
11828
11829 pmaddwd m5, m2, [r3 - 2 * 32] ; [14]
11830 paddd m5, [pd_16]
11831 psrld m5, 5
11832 pmaddwd m6, m13, [r3 - 2 * 32]
11833 paddd m6, [pd_16]
11834 psrld m6, 5
11835 packusdw m5, m6
11836
11837 pmaddwd m6, m2, [r3 - 7 * 32] ; [9]
11838 paddd m6, [pd_16]
11839 psrld m6, 5
11840 pmaddwd m8, m13, [r3 - 7 * 32]
11841 paddd m8, [pd_16]
11842 psrld m8, 5
11843 packusdw m6, m8
11844
11845 pmaddwd m7, m2, [r3 - 12 * 32] ; [4]
11846 paddd m7, [pd_16]
11847 psrld m7, 5
11848 pmaddwd m8, m13, [r3 - 12 * 32]
11849 paddd m8, [pd_16]
11850 psrld m8, 5
11851 packusdw m7, m8
11852
; Slide the window further (10-byte shift), in place.  m0 must be shifted
; before m3 because the first palignr reads the unshifted m3.
11853 palignr m0, m3, 10
11854 palignr m3, m1, 10
11855
11856 pmaddwd m8, m3, [r3 + 15 * 32] ; [31]
11857 paddd m8, [pd_16]
11858 psrld m8, 5
11859 pmaddwd m9, m0, [r3 + 15 * 32]
11860 paddd m9, [pd_16]
11861 psrld m9, 5
11862 packusdw m8, m9
11863
; m1 is free after the shift above and is reused as a temporary / row holder.
11864 pmaddwd m9, m3, [r3 + 10 * 32] ; [26]
11865 paddd m9, [pd_16]
11866 psrld m9, 5
11867 pmaddwd m1, m0, [r3 + 10 * 32]
11868 paddd m1, [pd_16]
11869 psrld m1, 5
11870 packusdw m9, m1
11871
11872 pmaddwd m1, m3, [r3 + 5 * 32] ; [21]
11873 paddd m1, [pd_16]
11874 psrld m1, 5
11875 pmaddwd m2, m0, [r3 + 5 * 32]
11876 paddd m2, [pd_16]
11877 psrld m2, 5
11878 packusdw m1, m2
11879
11880 pmaddwd m3, [r3] ; [16]
11881 paddd m3, [pd_16]
11882 psrld m3, 5
11883 pmaddwd m0, [r3]
11884 paddd m0, [pd_16]
11885 psrld m0, 5
11886 packusdw m3, m0
; Second batch: rows in m4-m9, m1, m3, with m0/m2 and 16 as the remaining
; macro operands.
11887 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16
11888 ret
11889
11890 ;; angle 16, modes 13 and 23
; Helper kernel shared by the 16-wide angular predictors for modes 13 and 23.
; Declared with a bare cglobal, so inputs are taken from registers prepared by
; the caller (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +0 and +2)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
;   m1  - low 128-bit lane is read before this routine ever writes m1, so it
;         must arrive pre-loaded (per the lane comment at the vinserti128, with
;         duplicated extra reference samples)
; Sixteen rows are produced in two batches of eight, each passed to
; TRANSPOSE_STORE_AVX2 (macro defined elsewhere in this file).
; Registers written directly: m0-m13.
11891 cglobal ang16_mode_13_23
11892 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
11893
11894 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
11895 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11896
11897 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
11898 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
11899
; Recurring pattern for every row: pmaddwd applies the weight row to each
; interleaved sample pair and sums the products, pd_16 (defined elsewhere) is
; added, the sum is shifted right by 5, and packusdw merges both dword halves
; into 16 words.
11900 pmaddwd m4, m3, [r3 + 7 * 32] ; [23]
11901 paddd m4, [pd_16]
11902 psrld m4, 5
11903 pmaddwd m5, m2, [r3 + 7 * 32]
11904 paddd m5, [pd_16]
11905 psrld m5, 5
11906 packusdw m4, m5
11907
11908 pmaddwd m5, m3, [r3 - 2 * 32] ; [14]
11909 paddd m5, [pd_16]
11910 psrld m5, 5
11911 pmaddwd m6, m2, [r3 - 2 * 32]
11912 paddd m6, [pd_16]
11913 psrld m6, 5
11914 packusdw m5, m6
11915
11916 pmaddwd m6, m3, [r3 - 11 * 32] ; [5]
11917 paddd m6, [pd_16]
11918 psrld m6, 5
11919 pmaddwd m2, [r3 - 11 * 32]
11920 paddd m2, [pd_16]
11921 psrld m2, 5
11922 packusdw m6, m2
11923
; Switch from interleaved neighbour pairs to duplicated samples so that the
; palignr shifts below can slide the window in one-word steps.
; vinserti128 keeps the caller-supplied low lane of m1 and replaces only its
; high lane with the low lane of m0.
11924 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
11925 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
11926 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14]
11927
; Window position 1: 14-byte shift, used for rows [28], [19], [10], [1].
11928 palignr m2, m3, m1, 14
11929 palignr m13, m0, m3, 14
11930
11931 pmaddwd m7, m2, [r3 + 12 * 32] ; [28]
11932 paddd m7, [pd_16]
11933 psrld m7, 5
11934 pmaddwd m8, m13, [r3 + 12 * 32]
11935 paddd m8, [pd_16]
11936 psrld m8, 5
11937 packusdw m7, m8
11938
11939 pmaddwd m8, m2, [r3 + 3 * 32] ; [19]
11940 paddd m8, [pd_16]
11941 psrld m8, 5
11942 pmaddwd m9, m13, [r3 + 3 * 32]
11943 paddd m9, [pd_16]
11944 psrld m9, 5
11945 packusdw m8, m9
11946
11947 pmaddwd m9, m2, [r3 - 6 * 32] ; [10]
11948 paddd m9, [pd_16]
11949 psrld m9, 5
11950 pmaddwd m10, m13, [r3 - 6 * 32]
11951 paddd m10, [pd_16]
11952 psrld m10, 5
11953 packusdw m9, m10
11954
11955 pmaddwd m10, m2, [r3 - 15 * 32] ; [1]
11956 paddd m10, [pd_16]
11957 psrld m10, 5
11958 pmaddwd m12, m13, [r3 - 15 * 32]
11959 paddd m12, [pd_16]
11960 psrld m12, 5
11961 packusdw m10, m12
11962
; Window position 2: 10-byte shift, used for rows [24], [15], [6].
11963 palignr m2, m3, m1, 10
11964 palignr m13, m0, m3, 10
11965
11966 pmaddwd m11, m2, [r3 + 8 * 32] ; [24]
11967 paddd m11, [pd_16]
11968 psrld m11, 5
11969 pmaddwd m13, [r3 + 8 * 32]
11970 paddd m13, [pd_16]
11971 psrld m13, 5
11972 packusdw m11, m13
11973
; First batch: rows in m4-m11.  m12/m13 and the trailing 0 are further macro
; operands whose role is defined by the macro (not visible here).
11974 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
11975
; m13 was overwritten while computing row [24] (and was handed to the macro),
; so the 10-byte shifted high half is rebuilt; m2 still holds the low half.
11976 palignr m13, m0, m3, 10
11977
11978 pmaddwd m4, m2, [r3 - 1 * 32] ; [15]
11979 paddd m4, [pd_16]
11980 psrld m4, 5
11981 pmaddwd m5, m13, [r3 - 1 * 32]
11982 paddd m5, [pd_16]
11983 psrld m5, 5
11984 packusdw m4, m5
11985
11986 pmaddwd m5, m2, [r3 - 10 * 32] ; [6]
11987 paddd m5, [pd_16]
11988 psrld m5, 5
11989 pmaddwd m6, m13, [r3 - 10 * 32]
11990 paddd m6, [pd_16]
11991 psrld m6, 5
11992 packusdw m5, m6
11993
; Window position 3: 6-byte shift, used for rows [29], [20], [11], [2].
11994 palignr m2, m3, m1, 6
11995 palignr m13, m0, m3, 6
11996
11997 pmaddwd m6, m2, [r3 + 13 * 32] ; [29]
11998 paddd m6, [pd_16]
11999 psrld m6, 5
12000 pmaddwd m8, m13, [r3 + 13 * 32]
12001 paddd m8, [pd_16]
12002 psrld m8, 5
12003 packusdw m6, m8
12004
12005 pmaddwd m7, m2, [r3 + 4 * 32] ; [20]
12006 paddd m7, [pd_16]
12007 psrld m7, 5
12008 pmaddwd m8, m13, [r3 + 4 * 32]
12009 paddd m8, [pd_16]
12010 psrld m8, 5
12011 packusdw m7, m8
12012
12013 pmaddwd m8, m2, [r3 - 5 * 32] ; [11]
12014 paddd m8, [pd_16]
12015 psrld m8, 5
12016 pmaddwd m9, m13, [r3 - 5 * 32]
12017 paddd m9, [pd_16]
12018 psrld m9, 5
12019 packusdw m8, m9
12020
12021 pmaddwd m9, m2, [r3 - 14 * 32] ; [2]
12022 paddd m9, [pd_16]
12023 psrld m9, 5
12024 pmaddwd m13, [r3 - 14 * 32]
12025 paddd m13, [pd_16]
12026 psrld m13, 5
12027 packusdw m9, m13
12028
; Window position 4: 2-byte shift, done in place.  m0 must be shifted before
; m3 because the first palignr reads the unshifted m3.
12029 palignr m0, m3, 2
12030 palignr m3, m1, 2
12031
; m1 is free after the shift above and is reused to hold row [25].
12032 pmaddwd m1, m3, [r3 + 9 * 32] ; [25]
12033 paddd m1, [pd_16]
12034 psrld m1, 5
12035 pmaddwd m2, m0, [r3 + 9 * 32]
12036 paddd m2, [pd_16]
12037 psrld m2, 5
12038 packusdw m1, m2
12039
12040 pmaddwd m3, [r3] ; [16]
12041 paddd m3, [pd_16]
12042 psrld m3, 5
12043 pmaddwd m0, [r3]
12044 paddd m0, [pd_16]
12045 psrld m0, 5
12046 packusdw m3, m0
; Second batch: rows in m4-m9, m1, m3, with m0/m2 and 16 as the remaining
; macro operands.
12047 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16
12048 ret
12049
12050 ;; angle 16, modes 14 and 22
; Helper kernel shared by the 16-wide angular predictors for modes 14 and 22.
; Declared with a bare cglobal, so inputs are taken from registers prepared by
; the caller (caller not visible here):
;   r2  - base of the 16-bit reference samples (read at +0 and +2)
;   r3  - base for 32-byte weight rows addressed as [r3 +/- k * 32];
;         presumably points into ang_table_avx2 -- confirm at the call site
;   r6d - only tested for zero (see the note on the test instruction)
;   m1, m14 - the low 128-bit lanes are read before this routine ever writes
;         these registers, so they must arrive pre-loaded (per the lane
;         comments at the vinserti128 lines, with duplicated extra reference
;         samples)
; Sixteen rows are produced in two batches of eight, each passed to
; TRANSPOSE_STORE_AVX2 (macro defined elsewhere in this file).
; Registers written directly: m0-m14.
12051 cglobal ang16_mode_14_22
12052 test r6d, r6d ; sets flags from r6d; no branch in this routine reads them -- presumably TRANSPOSE_STORE_AVX2 does (confirm)
12053
12054 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
12055 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12056
12057 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
12058 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
12059
; Recurring pattern for every row: pmaddwd applies the weight row to each
; interleaved sample pair and sums the products, pd_16 (defined elsewhere) is
; added, the sum is shifted right by 5, and packusdw merges both dword halves
; into 16 words.
12060 pmaddwd m4, m3, [r3 + 3 * 32] ; [19]
12061 paddd m4, [pd_16]
12062 psrld m4, 5
12063 pmaddwd m5, m2, [r3 + 3 * 32]
12064 paddd m5, [pd_16]
12065 psrld m5, 5
12066 packusdw m4, m5
12067
12068 pmaddwd m5, m3, [r3 - 10 * 32] ; [6]
12069 paddd m5, [pd_16]
12070 psrld m5, 5
12071 pmaddwd m2, [r3 - 10 * 32]
12072 paddd m2, [pd_16]
12073 psrld m2, 5
12074 packusdw m5, m2
12075
; Switch from interleaved neighbour pairs to duplicated samples so that the
; palignr shifts below can slide the window in one-word steps.
; Each vinserti128 keeps the caller-supplied low lane and replaces only the
; high lane (m1 gets the low lane of m0, m14 gets the low lane of m3).
12076 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
12077 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
12078 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 2 2 5 5 7 7 10 10]
12079 vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 12 12 15 15 x x x x]
12080
; Window position 1: 14-byte shift, used for rows [25] and [12].
12081 palignr m2, m3, m1, 14
12082 palignr m13, m0, m3, 14
12083
12084 pmaddwd m6, m2, [r3 + 9 * 32] ; [25]
12085 paddd m6, [pd_16]
12086 psrld m6, 5
12087 pmaddwd m9, m13, [r3 + 9 * 32]
12088 paddd m9, [pd_16]
12089 psrld m9, 5
12090 packusdw m6, m9
12091
12092 pmaddwd m7, m2, [r3 - 4 * 32] ; [12]
12093 paddd m7, [pd_16]
12094 psrld m7, 5
12095 pmaddwd m8, m13, [r3 - 4 * 32]
12096 paddd m8, [pd_16]
12097 psrld m8, 5
12098 packusdw m7, m8
12099
; Window position 2: 10-byte shift, used for rows [31], [18], [5].
12100 palignr m2, m3, m1, 10 ; [10 9 9 8 8 7 7 6 2 1 1 0 0 2 2 5]
12101 palignr m13, m0, m3, 10 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2]
12102
12103 pmaddwd m8, m2, [r3 + 15 * 32] ; [31]
12104 paddd m8, [pd_16]
12105 psrld m8, 5
12106 pmaddwd m9, m13, [r3 + 15 * 32]
12107 paddd m9, [pd_16]
12108 psrld m9, 5
12109 packusdw m8, m9
12110
12111 pmaddwd m9, m2, [r3 + 2 * 32] ; [18]
12112 paddd m9, [pd_16]
12113 psrld m9, 5
12114 pmaddwd m10, m13, [r3 + 2 * 32]
12115 paddd m10, [pd_16]
12116 psrld m10, 5
12117 packusdw m9, m10
12118
12119 pmaddwd m10, m2, [r3 - 11 * 32] ; [5]
12120 paddd m10, [pd_16]
12121 psrld m10, 5
12122 pmaddwd m12, m13, [r3 - 11 * 32]
12123 paddd m12, [pd_16]
12124 psrld m12, 5
12125 packusdw m10, m12
12126
; Window position 3: 6-byte shift, used for rows [24] and [11].
12127 palignr m2, m3, m1, 6 ; [ 9 8 8 7 7 6 6 5 1 0 0 2 2 5 5 7]
12128 palignr m13, m0, m3, 6 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
12129
12130 pmaddwd m11, m2, [r3 + 8 * 32] ; [24]
12131 paddd m11, [pd_16]
12132 psrld m11, 5
12133 pmaddwd m13, [r3 + 8 * 32]
12134 paddd m13, [pd_16]
12135 psrld m13, 5
12136 packusdw m11, m13
12137
; First batch: rows in m4-m11.  m12/m13 and the trailing 0 are further macro
; operands whose role is defined by the macro (not visible here).
12138 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
12139
; m13 was overwritten while computing row [24] (and was handed to the macro),
; so the 6-byte shifted high half is rebuilt; m2 still holds the low half.
12140 palignr m13, m0, m3, 6
12141
12142 pmaddwd m4, m2, [r3 - 5 * 32] ; [11]
12143 paddd m4, [pd_16]
12144 psrld m4, 5
12145 pmaddwd m5, m13, [r3 - 5 * 32]
12146 paddd m5, [pd_16]
12147 psrld m5, 5
12148 packusdw m4, m5
12149
; Window position 4: 2-byte shift.  Note the roles swap here: m13 now holds
; the low half and m2 the high half.
12150 palignr m2, m0, m3, 2 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
12151 palignr m13, m3, m1, 2 ; [ 8 7 7 6 6 5 5 4 0 2 2 5 5 7 7 10]
12152
12153 pmaddwd m5, m13, [r3 + 14 * 32] ; [30]
12154 paddd m5, [pd_16]
12155 psrld m5, 5
12156 pmaddwd m6, m2, [r3 + 14 * 32]
12157 paddd m6, [pd_16]
12158 psrld m6, 5
12159 packusdw m5, m6
12160
12161 pmaddwd m6, m13, [r3 + 1 * 32] ; [17]
12162 paddd m6, [pd_16]
12163 psrld m6, 5
12164 pmaddwd m8, m2, [r3 + 1 * 32]
12165 paddd m8, [pd_16]
12166 psrld m8, 5
12167 packusdw m6, m8
12168
12169 pmaddwd m7, m13, [r3 - 12 * 32] ; [4]
12170 paddd m7, [pd_16]
12171 psrld m7, 5
12172 pmaddwd m8, m2, [r3 - 12 * 32]
12173 paddd m8, [pd_16]
12174 psrld m8, 5
12175 packusdw m7, m8
12176
; Window position 5: the window has moved past m3, so the low half is now
; taken from m1/m14 and the high half from m3/m1 (14-byte shift).
12177 palignr m2, m1, m14, 14 ; [ 7 6 6 5 5 4 4 3 2 5 5 7 7 10 10 12]
12178 palignr m0, m3, m1, 14 ; [11 10 10 9 9 8 8 7 3 2 2 1 1 0 0 2]
12179
12180 pmaddwd m8, m2, [r3 + 7 * 32] ; [23]
12181 paddd m8, [pd_16]
12182 psrld m8, 5
12183 pmaddwd m9, m0, [r3 + 7 * 32]
12184 paddd m9, [pd_16]
12185 psrld m9, 5
12186 packusdw m8, m9
12187
12188 pmaddwd m9, m2, [r3 - 6 * 32] ; [10]
12189 paddd m9, [pd_16]
12190 psrld m9, 5
12191 pmaddwd m2, m0, [r3 - 6 * 32]
12192 paddd m2, [pd_16]
12193 psrld m2, 5
12194 packusdw m9, m2
12195
; Window position 6: 10-byte shift, done in place.  m3 must be shifted
; before m1 because the first palignr reads the unshifted m1.
12196 palignr m3, m1, 10 ; [10 9 9 8 8 7 7 6 2 1 1 0 0 2 2 5]
12197 palignr m1, m14, 10 ; [ 6 5 5 4 4 3 3 2 5 7 7 10 10 12 12 15]
12198
12199 pmaddwd m2, m1, [r3 + 13 * 32] ; [29]
12200 paddd m2, [pd_16]
12201 psrld m2, 5
12202 pmaddwd m0, m3, [r3 + 13 * 32]
12203 paddd m0, [pd_16]
12204 psrld m0, 5
12205 packusdw m2, m0
12206
12207 pmaddwd m1, [r3] ; [16]
12208 paddd m1, [pd_16]
12209 psrld m1, 5
12210 pmaddwd m3, [r3]
12211 paddd m3, [pd_16]
12212 psrld m3, 5
12213 packusdw m1, m3
; Second batch: rows in m4-m9, m2, m1, with m0/m3 and 16 as the remaining
; macro operands.
12214 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 2, 1, 0, 3, 16
12215 ret
12216
12217 ;; angle 16, modes 15 and 21
; Shared body of the 16x16 predictors intra_pred_ang16_15 and intra_pred_ang16_21.
; Register state on entry, as prepared by those two wrappers:
;   r2        - reference samples (16-bit words); words 0..16 are read here
;   r3        - ang_table_avx2 + 16 * 32, so [r3 + k * 32] holds the weight pair
;               (32 - f, f) for f = 16 + k; f is the [n] shown in the comments
;   r6d       - 0 when called for mode 15, 1 when called for mode 21
;   xm1, xm14 - duplicated projected reference words (per the wrappers' comments
;               [2 2 4 4 6 6 8 8] and [9 9 11 11 13 13 15 15])
;   r0, r1, r4 - never referenced directly below; presumably consumed by
;               TRANSPOSE_STORE_AVX2 (defined elsewhere) - confirm
; Every result vector is pmaddwd(sample pairs, weights) + [pd_16], >> 5, and two
; such halves are packed to words with unsigned saturation (packusdw).
; Registers m0-m14 are referenced.
12218 cglobal ang16_mode_15_21
12219 test r6d, r6d ; flags from the mode selector; nothing below reads them, presumably the store macro does - confirm
12220
12221 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
12222 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12223
12224 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
12225 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
12226
12227 pmaddwd m4, m3, [r3 - 1 * 32] ; [15]
12228 paddd m4, [pd_16]
12229 psrld m4, 5
12230 pmaddwd m5, m2, [r3 - 1 * 32]
12231 paddd m5, [pd_16]
12232 psrld m5, 5
12233 packusdw m4, m5
12234
; From here on the samples are kept duplicated (4 bytes per sample), and the
; caller-provided projected samples are placed in the lane below them so that
; palignr can slide the pair window backwards past sample 0.
12235 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
12236 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
12237 vinserti128 m1, m1, xm0, 1 ; upper lane = xm0 [7 7 6 6 5 5 4 4]; lower lane keeps the caller's xm1
12238 vinserti128 m14, m14, xm3, 1 ; upper lane = xm3 [3 3 2 2 1 1 0 0]; lower lane keeps the caller's xm14
12239
; Byte shifts 14, 10, 6, 2 move the window back by one duplicated sample each.
12240 palignr m2, m3, m1, 14
12241 palignr m13, m0, m3, 14
12242
12243 pmaddwd m5, m2, [r3 + 14 * 32] ; [30]
12244 paddd m5, [pd_16]
12245 psrld m5, 5
12246 pmaddwd m8, m13, [r3 + 14 * 32]
12247 paddd m8, [pd_16]
12248 psrld m8, 5
12249 packusdw m5, m8
12250
12251 pmaddwd m6, m2, [r3 - 3 * 32] ; [13]
12252 paddd m6, [pd_16]
12253 psrld m6, 5
12254 pmaddwd m9, m13, [r3 - 3 * 32]
12255 paddd m9, [pd_16]
12256 psrld m9, 5
12257 packusdw m6, m9
12258
12259 palignr m2, m3, m1, 10
12260 palignr m13, m0, m3, 10
12261
12262 pmaddwd m7, m2, [r3 + 12 * 32] ; [28]
12263 paddd m7, [pd_16]
12264 psrld m7, 5
12265 pmaddwd m8, m13, [r3 + 12 * 32]
12266 paddd m8, [pd_16]
12267 psrld m8, 5
12268 packusdw m7, m8
12269
12270 pmaddwd m8, m2, [r3 - 5 * 32] ; [11]
12271 paddd m8, [pd_16]
12272 psrld m8, 5
12273 pmaddwd m9, m13, [r3 - 5 * 32]
12274 paddd m9, [pd_16]
12275 psrld m9, 5
12276 packusdw m8, m9
12277
12278 palignr m2, m3, m1, 6
12279 palignr m13, m0, m3, 6
12280
12281 pmaddwd m9, m2, [r3 + 10 * 32] ; [26]
12282 paddd m9, [pd_16]
12283 psrld m9, 5
12284 pmaddwd m10, m13, [r3 + 10 * 32]
12285 paddd m10, [pd_16]
12286 psrld m10, 5
12287 packusdw m9, m10
12288
12289 pmaddwd m10, m2, [r3 - 7 * 32] ; [9]
12290 paddd m10, [pd_16]
12291 psrld m10, 5
12292 pmaddwd m12, m13, [r3 - 7 * 32]
12293 paddd m12, [pd_16]
12294 psrld m12, 5
12295 packusdw m10, m12
12296
12297 palignr m2, m3, m1, 2
12298 palignr m13, m0, m3, 2
12299
12300 pmaddwd m11, m2, [r3 + 8 * 32] ; [24]
12301 paddd m11, [pd_16]
12302 psrld m11, 5
12303 pmaddwd m13, [r3 + 8 * 32] ; in place: the m13 window is destroyed here
12304 paddd m13, [pd_16]
12305 psrld m13, 5
12306 packusdw m11, m13
12307
; First eight result vectors (m4-m11) go out. m12/m13 are presumably scratch
; for the macro and the trailing 0 an output offset - confirm against the macro.
12308 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
12309
12310 palignr m13, m0, m3, 2 ; rebuild the upper window consumed above; m2 still holds the lower one
12311
12312 pmaddwd m4, m2, [r3 - 9 * 32] ; [7]
12313 paddd m4, [pd_16]
12314 psrld m4, 5
12315 pmaddwd m5, m13, [r3 - 9 * 32]
12316 paddd m5, [pd_16]
12317 psrld m5, 5
12318 packusdw m4, m5
12319
; The window now moves one register down: lower half from m1:m14, upper half
; from m3:m1.
12320 palignr m6, m1, m14, 14
12321 palignr m7, m3, m1, 14
12322
12323 pmaddwd m5, m6, [r3 + 6 * 32] ; [22]
12324 paddd m5, [pd_16]
12325 psrld m5, 5
12326 pmaddwd m8, m7, [r3 + 6 * 32]
12327 paddd m8, [pd_16]
12328 psrld m8, 5
12329 packusdw m5, m8
12330
12331 pmaddwd m6, [r3 - 11 * 32] ; [5]
12332 paddd m6, [pd_16]
12333 psrld m6, 5
12334 pmaddwd m7, [r3 - 11 * 32]
12335 paddd m7, [pd_16]
12336 psrld m7, 5
12337 packusdw m6, m7
12338
12339 palignr m8, m1, m14, 10
12340 palignr m9, m3, m1, 10
12341
12342 pmaddwd m7, m8, [r3 + 4 * 32] ; [20]
12343 paddd m7, [pd_16]
12344 psrld m7, 5
12345 pmaddwd m10, m9, [r3 + 4 * 32]
12346 paddd m10, [pd_16]
12347 psrld m10, 5
12348 packusdw m7, m10
12349
12350 pmaddwd m8, [r3 - 13 * 32] ; [3]
12351 paddd m8, [pd_16]
12352 psrld m8, 5
12353 pmaddwd m9, [r3 - 13 * 32]
12354 paddd m9, [pd_16]
12355 psrld m9, 5
12356 packusdw m8, m9
12357
12358 palignr m2, m1, m14, 6
12359 palignr m0, m3, m1, 6 ; m0's original contents are no longer needed
12360
12361 pmaddwd m9, m2, [r3 + 2 * 32] ; [18]
12362 paddd m9, [pd_16]
12363 psrld m9, 5
12364 pmaddwd m13, m0, [r3 + 2 * 32]
12365 paddd m13, [pd_16]
12366 psrld m13, 5
12367 packusdw m9, m13
12368
12369 pmaddwd m2, [r3 - 15 * 32] ; [1]
12370 paddd m2, [pd_16]
12371 psrld m2, 5
12372 pmaddwd m0, [r3 - 15 * 32]
12373 paddd m0, [pd_16]
12374 psrld m0, 5
12375 packusdw m2, m0
12376
; Last window, built in place. m3 must be shifted first because it still needs
; the unshifted m1.
12377 palignr m3, m1, 2
12378 palignr m1, m14, 2
12379
12380 pmaddwd m1, [r3] ; [16]
12381 paddd m1, [pd_16]
12382 psrld m1, 5
12383 pmaddwd m3, [r3]
12384 paddd m3, [pd_16]
12385 psrld m3, 5
12386 packusdw m1, m3
; Second eight result vectors, in the order m4..m9, m2, m1. The trailing 16
; (0 in the first call) presumably selects the second half of the block - confirm.
12387 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 2, 1, 0, 3, 16
12388 ret
12389
12390 ;; angle 16, modes 16 and 20
; Shared body of the 16x16 predictors intra_pred_ang16_16 and intra_pred_ang16_20.
; Register state on entry, as prepared by those two wrappers:
;   r2             - reference samples (16-bit words); words 0..16 are read here
;   r3             - ang_table_avx2 + 16 * 32, so [r3 + k * 32] holds the weight
;                    pair (32 - f, f) for f = 16 + k; f is the [n] in the comments
;   r6d            - 0 when called for mode 16, 1 when called for mode 20
;   xm1, xm14, xm2 - duplicated projected reference words (per the wrappers'
;                    comments [2 2 3 3 5 5 6 6], [8 8 9 9 11 11 12 12] and
;                    [14 14 15 15 x x x x])
;   r0, r1, r4     - never referenced directly below; presumably consumed by
;                    TRANSPOSE_STORE_AVX2 (defined elsewhere) - confirm
; Every result vector is pmaddwd(sample pairs, weights) + [pd_16], >> 5, and two
; such halves are packed to words with unsigned saturation (packusdw).
; Registers m0-m14 are referenced.
12391 cglobal ang16_mode_16_20
12392 test r6d, r6d ; flags from the mode selector; nothing below reads them, presumably the store macro does - confirm
12393
12394 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
12395 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12396
12397 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
12398 punpckhwd m12, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
12399
12400 pmaddwd m4, m3, [r3 - 5 * 32] ; [11]
12401 paddd m4, [pd_16]
12402 psrld m4, 5
12403 pmaddwd m5, m12, [r3 - 5 * 32]
12404 paddd m5, [pd_16]
12405 psrld m5, 5
12406 packusdw m4, m5
12407
; Build a chain m0 : m3 : m1 : m14 : m2 of duplicated samples (4 bytes each) so
; palignr can slide the pair window backwards into the projected samples.
12408 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
12409 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
12410 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 2 2 3 3 5 5 6 6]
12411 vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 8 8 9 9 11 11 12 12]
12412 vinserti128 m2, m2, xm1, 1 ; [ 2 2 3 3 5 5 6 6 14 14 15 15 x x x x]
12413
; Byte shifts 14, 10, 6, 2 move the window back by one duplicated sample each.
12414 palignr m12, m3, m1, 14
12415 palignr m13, m0, m3, 14
12416
12417 pmaddwd m5, m12, [r3 + 6 * 32] ; [22]
12418 paddd m5, [pd_16]
12419 psrld m5, 5
12420 pmaddwd m8, m13, [r3 + 6 * 32]
12421 paddd m8, [pd_16]
12422 psrld m8, 5
12423 packusdw m5, m8
12424
12425 pmaddwd m6, m12, [r3 - 15 * 32] ; [1]
12426 paddd m6, [pd_16]
12427 psrld m6, 5
12428 pmaddwd m9, m13, [r3 - 15 * 32]
12429 paddd m9, [pd_16]
12430 psrld m9, 5
12431 packusdw m6, m9
12432
12433 palignr m12, m3, m1, 10
12434 palignr m13, m0, m3, 10
12435
12436 pmaddwd m7, m12, [r3 - 4 * 32] ; [12]
12437 paddd m7, [pd_16]
12438 psrld m7, 5
12439 pmaddwd m8, m13, [r3 - 4 * 32]
12440 paddd m8, [pd_16]
12441 psrld m8, 5
12442 packusdw m7, m8
12443
12444 palignr m12, m3, m1, 6
12445 palignr m13, m0, m3, 6
12446
12447 pmaddwd m8, m12, [r3 + 7 * 32] ; [23]
12448 paddd m8, [pd_16]
12449 psrld m8, 5
12450 pmaddwd m9, m13, [r3 + 7 * 32]
12451 paddd m9, [pd_16]
12452 psrld m9, 5
12453 packusdw m8, m9
12454
12455 pmaddwd m9, m12, [r3 - 14 * 32] ; [2]
12456 paddd m9, [pd_16]
12457 psrld m9, 5
12458 pmaddwd m10, m13, [r3 - 14 * 32]
12459 paddd m10, [pd_16]
12460 psrld m10, 5
12461 packusdw m9, m10
12462
12463 palignr m12, m3, m1, 2
12464 palignr m13, m0, m3, 2
12465
12466 pmaddwd m10, m12, [r3 - 3 * 32] ; [13]
12467 paddd m10, [pd_16]
12468 psrld m10, 5
12469 pmaddwd m11, m13, [r3 - 3 * 32]
12470 paddd m11, [pd_16]
12471 psrld m11, 5
12472 packusdw m10, m11
12473
; The window moves one register down the chain: m1:m14 and m3:m1.
12474 palignr m12, m1, m14, 14
12475 palignr m13, m3, m1, 14
12476
12477 pmaddwd m11, m12, [r3 + 8 * 32] ; [24]
12478 paddd m11, [pd_16]
12479 psrld m11, 5
12480 pmaddwd m13, [r3 + 8 * 32] ; in place: the m13 window is destroyed here
12481 paddd m13, [pd_16]
12482 psrld m13, 5
12483 packusdw m11, m13
12484
; First eight result vectors (m4-m11) go out. m12 is still needed below, so m0
; (not read again before it is rewritten) and m13 are passed instead -
; presumably as macro scratch; confirm against the macro.
12485 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 13, 0
12486
12487 palignr m13, m3, m1, 14 ; rebuild the upper window consumed above; m12 still holds the lower one
12488
12489 pmaddwd m4, m12, [r3 - 13 * 32] ; [3]
12490 paddd m4, [pd_16]
12491 psrld m4, 5
12492 pmaddwd m5, m13, [r3 - 13 * 32]
12493 paddd m5, [pd_16]
12494 psrld m5, 5
12495 packusdw m4, m5
12496
12497 palignr m6, m1, m14, 10
12498 palignr m7, m3, m1, 10
12499
12500 pmaddwd m5, m6, [r3 - 2 * 32] ; [14]
12501 paddd m5, [pd_16]
12502 psrld m5, 5
12503 pmaddwd m8, m7, [r3 - 2 * 32]
12504 paddd m8, [pd_16]
12505 psrld m8, 5
12506 packusdw m5, m8
12507
12508 palignr m7, m1, m14, 6
12509 palignr m10, m3, m1, 6
12510
12511 pmaddwd m6, m7, [r3 + 9 * 32] ; [25]
12512 paddd m6, [pd_16]
12513 psrld m6, 5
12514 pmaddwd m8, m10, [r3 + 9 * 32]
12515 paddd m8, [pd_16]
12516 psrld m8, 5
12517 packusdw m6, m8
12518
12519 pmaddwd m7, [r3 - 12 * 32] ; [4]
12520 paddd m7, [pd_16]
12521 psrld m7, 5
12522 pmaddwd m10, [r3 - 12 * 32]
12523 paddd m10, [pd_16]
12524 psrld m10, 5
12525 packusdw m7, m10
12526
12527 palignr m8, m1, m14, 2 ; [ 4 3 3 2 2 1 1 0 6 8 8 9 9 11 11 12]
12528 palignr m9, m3, m1, 2 ; [ 8 7 7 6 6 5 5 4 0 2 2 3 3 5 5 6]
12529
12530 pmaddwd m8, [r3 - 1 * 32] ; [15]
12531 paddd m8, [pd_16]
12532 psrld m8, 5
12533 pmaddwd m9, [r3 - 1 * 32]
12534 paddd m9, [pd_16]
12535 psrld m9, 5
12536 packusdw m8, m9
12537
; One more step down the chain: m14:m2 and m1:m14.
12538 palignr m12, m14, m2, 14
12539 palignr m0, m1, m14, 14
12540
12541 pmaddwd m9, m12, [r3 + 10 * 32] ; [26]
12542 paddd m9, [pd_16]
12543 psrld m9, 5
12544 pmaddwd m13, m0, [r3 + 10 * 32]
12545 paddd m13, [pd_16]
12546 psrld m13, 5
12547 packusdw m9, m13
12548
12549 pmaddwd m12, [r3 - 11 * 32] ; [5]
12550 paddd m12, [pd_16]
12551 psrld m12, 5
12552 pmaddwd m0, [r3 - 11 * 32]
12553 paddd m0, [pd_16]
12554 psrld m0, 5
12555 packusdw m12, m0
12556
; Last window, built in place. m1 must be shifted first because it still needs
; the unshifted m14.
12557 palignr m1, m14, 10
12558 palignr m14, m2, 10
12559
12560 pmaddwd m14, [r3] ; [16]
12561 paddd m14, [pd_16]
12562 psrld m14, 5
12563 pmaddwd m1, [r3]
12564 paddd m1, [pd_16]
12565 psrld m1, 5
12566 packusdw m14, m1
; Second eight result vectors, in the order m4..m9, m12, m14. The trailing 16
; (0 in the first call) presumably selects the second half of the block - confirm.
12567 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 12, 14, 0, 3, 16
12568 ret
12569
12570 ;; angle 16, modes 17 and 19
; Shared body of the 16x16 predictors intra_pred_ang16_17 and intra_pred_ang16_19.
; Register state on entry, as prepared by those two wrappers:
;   r2             - reference samples (16-bit words); words 0..16 are read here
;   r3             - ang_table_avx2 + 16 * 32, so [r3 + k * 32] holds the weight
;                    pair (32 - f, f) for f = 16 + k; f is the [n] in the comments
;   r6d            - 0 when called for mode 17, 1 when called for mode 19
;   xm1, xm14, xm2 - projected reference words, shuffled by the wrappers with
;                    pw_ang16_16_20
;   r0, r1, r4     - never referenced directly below; presumably consumed by
;                    TRANSPOSE_STORE_AVX2 (defined elsewhere) - confirm
; Every result vector is pmaddwd(sample pairs, weights) + [pd_16], >> 5, and two
; such halves are packed to words with unsigned saturation (packusdw).
; Registers m0-m14 are referenced.
12571 cglobal ang16_mode_17_19
12572 test r6d, r6d ; flags from the mode selector; nothing below reads them, presumably the store macro does - confirm
12573
12574 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
12575 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12576
12577 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
12578 punpckhwd m12, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
12579
12580 pmaddwd m4, m3, [r3 - 10 * 32] ; [6]
12581 paddd m4, [pd_16]
12582 psrld m4, 5
12583 pmaddwd m5, m12, [r3 - 10 * 32]
12584 paddd m5, [pd_16]
12585 psrld m5, 5
12586 packusdw m4, m5
12587
; Build a chain m0 : m3 : m1 : m14 : m2 of duplicated samples (4 bytes each) so
; palignr can slide the pair window backwards into the projected samples.
; NOTE(review): the lane annotations that used to sit on the three vinserti128
; lines were copied from ang16_mode_16_20. The mode 17/19 wrappers load xm1,
; xm14 and xm2 from different offsets (1, 2 and 3 words earlier), so the lower
; lanes presumably hold [1 1 2 2 4 4 5 5], [6 6 7 7 9 9 10 10] and
; [11 11 12 12 14 14 15 15] - confirm against pw_ang16_16_20.
12588 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
12589 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
12590 vinserti128 m1, m1, xm0, 1 ; upper lane = xm0 [7 7 6 6 5 5 4 4]; lower lane keeps the caller's xm1
12591 vinserti128 m14, m14, xm3, 1 ; upper lane = xm3 [3 3 2 2 1 1 0 0]; lower lane keeps the caller's xm14
12592 vinserti128 m2, m2, xm1, 1 ; upper lane = the caller's xm1 (low lane of m1); lower lane keeps the caller's xm2
12593
; Byte shifts 14, 10, 6, 2 move the window back by one duplicated sample each.
12594 palignr m12, m3, m1, 14
12595 palignr m13, m0, m3, 14
12596
12597 pmaddwd m5, m12, [r3 - 4 * 32] ; [12]
12598 paddd m5, [pd_16]
12599 psrld m5, 5
12600 pmaddwd m8, m13, [r3 - 4 * 32]
12601 paddd m8, [pd_16]
12602 psrld m8, 5
12603 packusdw m5, m8
12604
12605 palignr m12, m3, m1, 10
12606 palignr m13, m0, m3, 10
12607
12608 pmaddwd m6, m12, [r3 + 2 * 32] ; [18]
12609 paddd m6, [pd_16]
12610 psrld m6, 5
12611 pmaddwd m9, m13, [r3 + 2 * 32]
12612 paddd m9, [pd_16]
12613 psrld m9, 5
12614 packusdw m6, m9
12615
12616 palignr m12, m3, m1, 6
12617 palignr m13, m0, m3, 6
12618
12619 pmaddwd m7, m12, [r3 + 8 * 32] ; [24]
12620 paddd m7, [pd_16]
12621 psrld m7, 5
12622 pmaddwd m8, m13, [r3 + 8 * 32]
12623 paddd m8, [pd_16]
12624 psrld m8, 5
12625 packusdw m7, m8
12626
12627 palignr m12, m3, m1, 2
12628 palignr m13, m0, m3, 2
12629
12630 pmaddwd m8, m12, [r3 + 14 * 32] ; [30]
12631 paddd m8, [pd_16]
12632 psrld m8, 5
12633 pmaddwd m9, m13, [r3 + 14 * 32]
12634 paddd m9, [pd_16]
12635 psrld m9, 5
12636 packusdw m8, m9
12637
12638 pmaddwd m9, m12, [r3 - 12 * 32] ; [4]
12639 paddd m9, [pd_16]
12640 psrld m9, 5
12641 pmaddwd m10, m13, [r3 - 12 * 32]
12642 paddd m10, [pd_16]
12643 psrld m10, 5
12644 packusdw m9, m10
12645
; The window moves one register down the chain: m1:m14 and m3:m1.
12646 palignr m12, m1, m14, 14
12647 palignr m13, m3, m1, 14
12648
12649 pmaddwd m10, m12, [r3 - 6 * 32] ; [10]
12650 paddd m10, [pd_16]
12651 psrld m10, 5
12652 pmaddwd m11, m13, [r3 - 6 * 32]
12653 paddd m11, [pd_16]
12654 psrld m11, 5
12655 packusdw m10, m11
12656
12657 palignr m12, m1, m14, 10
12658 palignr m13, m3, m1, 10
12659
12660 pmaddwd m11, m12, [r3] ; [16]
12661 paddd m11, [pd_16]
12662 psrld m11, 5
12663 pmaddwd m13, [r3] ; in place: the m13 window is destroyed here
12664 paddd m13, [pd_16]
12665 psrld m13, 5
12666 packusdw m11, m13
12667
; First eight result vectors (m4-m11) go out. m0 (not read again before it is
; rewritten) and m13 are passed as the next two arguments - presumably macro
; scratch; confirm against the macro.
12668 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 13, 0
12669
12670 palignr m12, m1, m14, 6
12671 palignr m13, m3, m1, 6
12672
12673 pmaddwd m4, m12, [r3 + 6 * 32] ; [22]
12674 paddd m4, [pd_16]
12675 psrld m4, 5
12676 pmaddwd m5, m13, [r3 + 6 * 32]
12677 paddd m5, [pd_16]
12678 psrld m5, 5
12679 packusdw m4, m5
12680
12681 palignr m12, m1, m14, 2
12682 palignr m13, m3, m1, 2
12683
12684 pmaddwd m5, m12, [r3 + 12 * 32] ; [28]
12685 paddd m5, [pd_16]
12686 psrld m5, 5
12687 pmaddwd m8, m13, [r3 + 12 * 32]
12688 paddd m8, [pd_16]
12689 psrld m8, 5
12690 packusdw m5, m8
12691
12692 pmaddwd m6, m12, [r3 - 14 * 32] ; [2]
12693 paddd m6, [pd_16]
12694 psrld m6, 5
12695 pmaddwd m8, m13, [r3 - 14 * 32]
12696 paddd m8, [pd_16]
12697 psrld m8, 5
12698 packusdw m6, m8
12699
; One more step down the chain: m14:m2 and m1:m14.
12700 palignr m7, m14, m2, 14
12701 palignr m0, m1, m14, 14
12702
12703 pmaddwd m7, [r3 - 8 * 32] ; [8]
12704 paddd m7, [pd_16]
12705 psrld m7, 5
12706 pmaddwd m0, [r3 - 8 * 32]
12707 paddd m0, [pd_16]
12708 psrld m0, 5
12709 packusdw m7, m0
12710
12711 palignr m8, m14, m2, 10
12712 palignr m9, m1, m14, 10
12713
12714 pmaddwd m8, [r3 - 2 * 32] ; [14]
12715 paddd m8, [pd_16]
12716 psrld m8, 5
12717 pmaddwd m9, [r3 - 2 * 32]
12718 paddd m9, [pd_16]
12719 psrld m9, 5
12720 packusdw m8, m9
12721
12722 palignr m9, m14, m2, 6
12723 palignr m13, m1, m14, 6
12724
12725 pmaddwd m9, [r3 + 4 * 32] ; [20]
12726 paddd m9, [pd_16]
12727 psrld m9, 5
12728 pmaddwd m13, [r3 + 4 * 32]
12729 paddd m13, [pd_16]
12730 psrld m13, 5
12731 packusdw m9, m13
12732
; Last window, built in place. m1 must be shifted first because it still needs
; the unshifted m14.
12733 palignr m1, m14, 2
12734 palignr m14, m2, 2
12735
12736 pmaddwd m12, m14, [r3 + 10 * 32] ; [26]
12737 paddd m12, [pd_16]
12738 psrld m12, 5
12739 pmaddwd m0, m1, [r3 + 10 * 32]
12740 paddd m0, [pd_16]
12741 psrld m0, 5
12742 packusdw m12, m0
12743
12744 pmaddwd m14, [r3 - 16 * 32] ; [0]
12745 paddd m14, [pd_16]
12746 psrld m14, 5
12747 pmaddwd m1, [r3 - 16 * 32]
12748 paddd m1, [pd_16]
12749 psrld m1, 5
12750 packusdw m14, m1
; Second eight result vectors, in the order m4..m9, m12, m14. The trailing 16
; (0 in the first call) presumably selects the second half of the block - confirm.
12751 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 12, 14, 0, 3, 16
12752 ret
12753
; 16x16 angular mode 3 front end: sets up the registers ang16_mode_3_33 reads
; and hands over to it.
;   r3  = &ang_table_avx2[16]      (entries are 32 bytes)
;   r2 += 64                       (skip 32 words of the reference buffer)
;   r6d = 0                        (intra_pred_ang16_33 passes 1)
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_3, 3,7,13
    lea         r3, [ang_table_avx2 + 16 * 32]
    add         r2, 64
    xor         r6d, r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_3_33
    RET
12763
; 16x16 angular mode 33 front end: sets up the registers ang16_mode_3_33 reads
; and hands over to it. Unlike intra_pred_ang16_3, r2 is passed through
; unmodified and the selector is 1.
;   r3  = &ang_table_avx2[16]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_33, 3,7,13
    lea         r3, [ang_table_avx2 + 16 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_3_33
    RET
12773
; 16x16 angular mode 4 front end: sets up the registers ang16_mode_4_32 reads
; and hands over to it.
;   r3  = &ang_table_avx2[18]      (entries are 32 bytes)
;   r2 += 64                       (skip 32 words of the reference buffer)
;   r6d = 0                        (intra_pred_ang16_32 passes 1)
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_4, 3,7,13
    lea         r3, [ang_table_avx2 + 18 * 32]
    add         r2, 64
    xor         r6d, r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_4_32
    RET
12783
; 16x16 angular mode 32 front end: sets up the registers ang16_mode_4_32 reads
; and hands over to it. Unlike intra_pred_ang16_4, r2 is passed through
; unmodified and the selector is 1.
;   r3  = &ang_table_avx2[18]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_32, 3,7,13
    lea         r3, [ang_table_avx2 + 18 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_4_32
    RET
12793
; 16x16 angular mode 5 front end: sets up the registers ang16_mode_5_31 reads
; and hands over to it.
;   r3  = &ang_table_avx2[16]      (entries are 32 bytes)
;   r2 += 64                       (skip 32 words of the reference buffer)
;   r6d = 0                        (intra_pred_ang16_31 passes 1)
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_5, 3,7,13
    lea         r3, [ang_table_avx2 + 16 * 32]
    add         r2, 64
    xor         r6d, r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_5_31
    RET
12803
; 16x16 angular mode 31 front end: sets up the registers ang16_mode_5_31 reads
; and hands over to it. Unlike intra_pred_ang16_5, r2 is passed through
; unmodified and the selector is 1.
;   r3  = &ang_table_avx2[16]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_31, 3,7,13
    lea         r3, [ang_table_avx2 + 16 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_5_31
    RET
12813
; 16x16 angular mode 6 front end: sets up the registers ang16_mode_6_30 reads
; and hands over to it.
;   r3  = &ang_table_avx2[15]      (entries are 32 bytes)
;   r2 += 64                       (skip 32 words of the reference buffer)
;   r6d = 0                        (intra_pred_ang16_30 passes 1)
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_6, 3,7,14
    lea         r3, [ang_table_avx2 + 15 * 32]
    add         r2, 64
    xor         r6d, r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_6_30
    RET
12823
; 16x16 angular mode 30 front end: sets up the registers ang16_mode_6_30 reads
; and hands over to it. Unlike intra_pred_ang16_6, r2 is passed through
; unmodified and the selector is 1.
;   r3  = &ang_table_avx2[15]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_30, 3,7,14
    lea         r3, [ang_table_avx2 + 15 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_6_30
    RET
12833
; 16x16 angular mode 7 front end: sets up the registers ang16_mode_7_29 reads
; and hands over to it.
;   r3  = &ang_table_avx2[17]      (entries are 32 bytes)
;   r2 += 64                       (skip 32 words of the reference buffer)
;   r6d = 0                        (intra_pred_ang16_29 passes 1)
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_7, 3,7,13
    lea         r3, [ang_table_avx2 + 17 * 32]
    add         r2, 64
    xor         r6d, r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_7_29
    RET
12843
; 16x16 angular mode 29 front end: sets up the registers ang16_mode_7_29 reads
; and hands over to it. Unlike intra_pred_ang16_7, r2 is passed through
; unmodified and the selector is 1.
;   r3  = &ang_table_avx2[17]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_29, 3,7,13
    lea         r3, [ang_table_avx2 + 17 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_7_29
    RET
12853
; 16x16 angular mode 8 front end: sets up the registers ang16_mode_8_28 reads
; and hands over to it.
;   r3  = &ang_table_avx2[15]      (entries are 32 bytes)
;   r2 += 64                       (skip 32 words of the reference buffer)
;   r6d = 0                        (intra_pred_ang16_28 passes 1)
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_8, 3,7,13
    lea         r3, [ang_table_avx2 + 15 * 32]
    add         r2, 64
    xor         r6d, r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_8_28
    RET
12863
; 16x16 angular mode 28 front end: sets up the registers ang16_mode_8_28 reads
; and hands over to it. Unlike intra_pred_ang16_8, r2 is passed through
; unmodified and the selector is 1.
;   r3  = &ang_table_avx2[15]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_28, 3,7,13
    lea         r3, [ang_table_avx2 + 15 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_8_28
    RET
12873
; 16x16 angular mode 9 front end: sets up the registers ang16_mode_9_27 reads
; and hands over to it.
;   r3  = &ang_table_avx2[16]      (entries are 32 bytes)
;   r2 += 64                       (skip 32 words of the reference buffer)
;   r6d = 0                        (intra_pred_ang16_27 passes 1)
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_9, 3,7,12
    lea         r3, [ang_table_avx2 + 16 * 32]
    add         r2, 64
    xor         r6d, r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_9_27
    RET
12883
; 16x16 angular mode 27 front end: sets up the registers ang16_mode_9_27 reads
; and hands over to it. Unlike intra_pred_ang16_9, r2 is passed through
; unmodified and the selector is 1.
;   r3  = &ang_table_avx2[16]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_27, 3,7,12
    lea         r3, [ang_table_avx2 + 16 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_9_27
    RET
12893
;-----------------------------------------------------------------------------
; intra_pred_ang16_10: 16x16 prediction where every output row is one reference
; word replicated across the row.
;   r0  - destination (16 rows of 16 words are written)
;   r1  - stride; doubled below (presumably pixels -> bytes for 16-bit samples)
;   r2  - reference buffer; words 1..16 counted from byte offset 64 supply the
;         rows, words 0..16 at the start of the buffer feed the edge filter
;   r4m - filter enable: zero skips the row-0 edge filter
; Row 0 is written once up front and rewritten only when the filter runs.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_10, 3,6,3
    mov         r5d, r4m                        ; fetch the flag before r4 is reused as 3 * stride
    add         r1d, r1d
    lea         r4, [r1 * 3]

    vpbroadcastw m0, [r2 + 2 + 64]              ; [1...]  kept in m0 for the filter
    movu        [r0], m0
    vpbroadcastw m1, [r2 + 2 + 64 + 2]          ; [2...]
    movu        [r0 + r1], m1
    vpbroadcastw m2, [r2 + 2 + 64 + 4]          ; [3...]
    movu        [r0 + r1 * 2], m2
    vpbroadcastw m1, [r2 + 2 + 64 + 6]          ; [4...]
    movu        [r0 + r4], m1

    lea         r3, [r0 + r1 * 4]               ; r3 walks the rows; r0 stays on row 0
    vpbroadcastw m2, [r2 + 2 + 64 + 8]          ; [5...]
    movu        [r3], m2
    vpbroadcastw m1, [r2 + 2 + 64 + 10]         ; [6...]
    movu        [r3 + r1], m1
    vpbroadcastw m2, [r2 + 2 + 64 + 12]         ; [7...]
    movu        [r3 + r1 * 2], m2
    vpbroadcastw m1, [r2 + 2 + 64 + 14]         ; [8...]
    movu        [r3 + r4], m1

    lea         r3, [r3 + r1 * 4]
    vpbroadcastw m2, [r2 + 2 + 64 + 16]         ; [9...]
    movu        [r3], m2
    vpbroadcastw m1, [r2 + 2 + 64 + 18]         ; [10...]
    movu        [r3 + r1], m1
    vpbroadcastw m2, [r2 + 2 + 64 + 20]         ; [11...]
    movu        [r3 + r1 * 2], m2
    vpbroadcastw m1, [r2 + 2 + 64 + 22]         ; [12...]
    movu        [r3 + r4], m1

    lea         r3, [r3 + r1 * 4]
    vpbroadcastw m2, [r2 + 2 + 64 + 24]         ; [13...]
    movu        [r3], m2
    vpbroadcastw m1, [r2 + 2 + 64 + 26]         ; [14...]
    movu        [r3 + r1], m1
    vpbroadcastw m2, [r2 + 2 + 64 + 28]         ; [15...]
    movu        [r3 + r1 * 2], m2
    vpbroadcastw m1, [r2 + 2 + 64 + 30]         ; [16...]
    movu        [r3 + r4], m1

    test        r5d, r5d
    jz         .quit

    ; filter: row0[x] = clip(row0[x] + ((ref[1 + x] - ref[0]) >> 1)), clipped to
    ; the range [0, pw_pixel_max]
    vpbroadcastw m2, [r2]                       ; [0 0...]
    movu        m1, [r2 + 2]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1
    pminsw      m0, [pw_pixel_max]
    movu        [r0], m0                        ; replace row 0 with the filtered row
.quit:
    RET
12954
;-----------------------------------------------------------------------------
; intra_pred_ang16_26: 16x16 prediction where every output row is a copy of
; reference words 1..16.
;   r0  - destination (16 rows of 16 words are written)
;   r1  - stride; doubled below (presumably pixels -> bytes for 16-bit samples)
;   r2  - reference buffer; words 1..16 supply the rows, word 0 and words 1..16
;         counted from byte offset 64 feed the edge filter
;   r4m - filter enable: zero skips the column-0 edge filter
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_26, 3,6,4
    mov         r5d, r4m                        ; fetch the flag before r4 is reused as 3 * stride
    movu        m0, [r2 + 2]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    add         r1d, r1d
    lea         r4, [r1 * 3]

    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0

    lea         r3, [r0 + r1 * 4]
    movu        [r3], m0
    movu        [r3 + r1], m0
    movu        [r3 + r1 * 2], m0
    movu        [r3 + r4], m0

    lea         r3, [r3 + r1 * 4]
    movu        [r3], m0
    movu        [r3 + r1], m0
    movu        [r3 + r1 * 2], m0
    movu        [r3 + r4], m0

    lea         r3, [r3 + r1 * 4]               ; r3 = row 12; reused by the filter below
    movu        [r3], m0
    movu        [r3 + r1], m0
    movu        [r3 + r1 * 2], m0
    movu        [r3 + r4], m0

    test        r5d, r5d
    jz         .quit

    ; filter: col0[y] = clip(ref[1] + ((side[1 + y] - ref[0]) >> 1)), clipped to
    ; the range [0, pw_pixel_max]; side = the words at byte offset 64
    vpbroadcastw m0, xm0                        ; ref[1] in every word
    vpbroadcastw m2, [r2]                       ; ref[0] in every word
    movu        m1, [r2 + 2 + 64]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    pxor        m1, m1
    pmaxsw      m0, m1
    pminsw      m0, [pw_pixel_max]

    ; scatter the 16 filtered words down column 0, one word per row
    pextrw      [r0], xm0, 0
    pextrw      [r0 + r1], xm0, 1
    pextrw      [r0 + r1 * 2], xm0, 2
    pextrw      [r0 + r4], xm0, 3
    lea         r0, [r0 + r1 * 4]
    pextrw      [r0], xm0, 4
    pextrw      [r0 + r1], xm0, 5
    pextrw      [r0 + r1 * 2], xm0, 6
    pextrw      [r0 + r4], xm0, 7
    lea         r0, [r0 + r1 * 4]
    vpermq      m0, m0, 11101110b               ; bring the upper 128 bits down into xm0
    pextrw      [r0], xm0, 0
    pextrw      [r0 + r1], xm0, 1
    pextrw      [r0 + r1 * 2], xm0, 2
    pextrw      [r0 + r4], xm0, 3
    pextrw      [r3], xm0, 4                    ; rows 12..15 via r3
    pextrw      [r3 + r1], xm0, 5
    pextrw      [r3 + r1 * 2], xm0, 6
    pextrw      [r3 + r4], xm0, 7
.quit:
    RET
13019
; 16x16 angular mode 11 front end for ang16_mode_11_25.
; The word at [r2 + 64] is overwritten with the word at [r2] for the duration of
; the call and written back afterwards; the original is parked in the 4-byte
; stack slot requested by the trailing "0-4".
13020 cglobal intra_pred_ang16_11, 3,7,12, 0-4
13021 movzx r5d, word [r2 + 64] ; word about to be overwritten
13022 movzx r6d, word [r2] ; replacement word
13023 mov [rsp], r5w ; keep the original for the restore below
13024 mov [r2 + 64], r6w
13025
13026 add r2, 64 ; r2 now addresses the patched word
13027 xor r6d, r6d ; selector = 0 (intra_pred_ang16_25 passes 1)
13028 lea r3, [ang_table_avx2 + 16 * 32] ; table entry 16 (32-byte entries)
13029 add r1d, r1d ; presumably pixel stride -> byte stride
13030 lea r4, [r1 * 3]
13031
13032 call ang16_mode_11_25
13033
13034 mov r6d, [rsp] ; dword load, but only the low word (the saved sample) is used
13035 mov [r2], r6w ; restore; relies on the helper leaving r2 untouched (helper not visible here - confirm)
13036 RET
13037
; 16x16 angular mode 25 front end: sets up the registers ang16_mode_11_25 reads
; and hands over to it. Unlike intra_pred_ang16_11, the reference buffer is
; neither patched nor offset, and the selector is 1.
;   r3  = &ang_table_avx2[16]      (entries are 32 bytes)
;   r6d = 1
;   r1  = 2 * r1d, r4 = 3 * r1     (presumably byte stride and 3x byte stride)
cglobal intra_pred_ang16_25, 3,7,12
    lea         r3, [ang_table_avx2 + 16 * 32]
    xor         r6d, r6d
    inc         r6d
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    call        ang16_mode_11_25
    RET
13047
; 16x16 angular mode 12 front end for ang16_mode_12_24.
; The word at [r2 + 64] is overwritten with the word at [r2] for the duration of
; the call and written back afterwards; the original is parked in the 4-byte
; stack slot requested by the trailing "0-4". xm1 is preloaded with the
; projected reference words the helper expects.
13048 cglobal intra_pred_ang16_12, 3,7,14, 0-4
13049 movzx r5d, word [r2 + 64] ; word about to be overwritten
13050 movzx r6d, word [r2] ; replacement word
13051 mov [rsp], r5w ; keep the original for the restore below
13052 mov [r2 + 64], r6w
13053
13054 add r1d, r1d ; presumably pixel stride -> byte stride
13055 lea r4, [r1 * 3]
13056 lea r3, [ang_table_avx2 + 16 * 32] ; table entry 16 (32-byte entries)
13057 movu xm1, [r2 + 12] ; [13 12 11 10 9 8 7 6]
13058 pshufb xm1, [pw_ang16_12_24] ; [ 6 6 13 13 x x x x]
13059 xor r6d, r6d ; selector = 0 (intra_pred_ang16_24 passes 1)
13060 add r2, 64 ; r2 now addresses the patched word
13061
13062 call ang16_mode_12_24
13063
13064 mov r6d, [rsp] ; dword load, but only the low word (the saved sample) is used
13065 mov [r2], r6w ; restore; relies on the helper leaving r2 untouched (helper not visible here - confirm)
13066 RET
13067
; 16x16 angular mode 24 front end for ang16_mode_12_24 (selector r6d = 1).
; Preloads xm1 with the projected reference words, taken 64 bytes further into
; the buffer than intra_pred_ang16_12 takes them.
; NOTE(review): the 4-byte stack slot requested by "0-4" is never referenced in
; this wrapper.
cglobal intra_pred_ang16_24, 3,7,14, 0-4
    movu        xm1, [r2 + 76]                  ; [13 12 11 10 9 8 7 6]
    pshufb      xm1, [pw_ang16_12_24]           ; [ 6 6 13 13 x x x x]
    lea         r3, [ang_table_avx2 + 16 * 32]  ; table entry 16 (32-byte entries)
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    xor         r6d, r6d
    inc         r6d
    call        ang16_mode_12_24
    RET
13079
; 16x16 angular mode 13 front end for ang16_mode_13_23.
; The word at [r2 + 64] is overwritten with the word at [r2] for the duration of
; the call and written back afterwards; the original is parked in the 4-byte
; stack slot requested by the trailing "0-4". xm1 is preloaded with the
; projected reference words the helper expects.
13080 cglobal intra_pred_ang16_13, 3,7,14, 0-4
13081 movzx r5d, word [r2 + 64] ; word about to be overwritten
13082 movzx r6d, word [r2] ; replacement word
13083 mov [rsp], r5w ; keep the original for the restore below
13084 mov [r2 + 64], r6w
13085
13086 add r1d, r1d ; presumably pixel stride -> byte stride
13087 lea r4, [r1 * 3]
13088 lea r3, [ang_table_avx2 + 16 * 32] ; table entry 16 (32-byte entries)
13089 movu xm1, [r2 + 8] ; [11 x x x 7 x x 4]
13090 pinsrw xm1, [r2 + 28], 1 ; [11 x x x 7 x 14 4]
13091 pshufb xm1, [pw_ang16_13_23] ; [ 4 4 7 7 11 11 14 14]
13092 xor r6d, r6d ; selector = 0 (intra_pred_ang16_23 passes 1)
13093 add r2, 64 ; r2 now addresses the patched word
13094
13095 call ang16_mode_13_23
13096
13097 mov r6d, [rsp] ; dword load, but only the low word (the saved sample) is used
13098 mov [r2], r6w ; restore; relies on the helper leaving r2 untouched (helper not visible here - confirm)
13099 RET
13100
; 16x16 angular mode 23 front end for ang16_mode_13_23 (selector r6d = 1).
; Preloads xm1 with the projected reference words, taken 64 bytes further into
; the buffer than intra_pred_ang16_13 takes them.
; NOTE(review): the 4-byte stack slot requested by "0-4" is never referenced in
; this wrapper.
cglobal intra_pred_ang16_23, 3,7,14, 0-4
    movu        xm1, [r2 + 72]                  ; [11 10 9 8 7 6 5 4]
    pinsrw      xm1, [r2 + 92], 1               ; [11 x x x 7 x 14 4]
    pshufb      xm1, [pw_ang16_13_23]           ; [ 4 4 7 7 11 11 14 14]
    lea         r3, [ang_table_avx2 + 16 * 32]  ; table entry 16 (32-byte entries)
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    xor         r6d, r6d
    inc         r6d
    call        ang16_mode_13_23
    RET
13113
; 16x16 angular mode 14 front end for ang16_mode_14_22.
; The word at [r2 + 64] is overwritten with the word at [r2] for the duration of
; the call and written back afterwards; the original is parked in the 4-byte
; stack slot requested by the trailing "0-4". xm1 and xm14 are preloaded with
; the projected reference words the helper expects.
13114 cglobal intra_pred_ang16_14, 3,7,15, 0-4
13115 movzx r5d, word [r2 + 64] ; word about to be overwritten
13116 movzx r6d, word [r2] ; replacement word
13117 mov [rsp], r5w ; keep the original for the restore below
13118 mov [r2 + 64], r6w
13119
13120 add r1d, r1d ; presumably pixel stride -> byte stride
13121 lea r4, [r1 * 3]
13122 lea r3, [ang_table_avx2 + 16 * 32] ; table entry 16 (32-byte entries)
13123 movu xm1, [r2 + 4] ; [ x x 7 x 5 x x 2]
13124 pinsrw xm1, [r2 + 20], 1 ; [ x x 7 x 5 x 10 2]
13125 movu xm14, [r2 + 24] ; [ x x x x 15 x x 12]
13126 pshufb xm14, [pw_ang16_14_22] ; [12 12 15 15 x x x x]
13127 pshufb xm1, [pw_ang16_14_22] ; [ 2 2 5 5 7 7 10 10]
13128 xor r6d, r6d ; selector = 0 (intra_pred_ang16_22 passes 1)
13129 add r2, 64 ; r2 now addresses the patched word
13130
13131 call ang16_mode_14_22
13132
13133 mov r6d, [rsp] ; dword load, but only the low word (the saved sample) is used
13134 mov [r2], r6w ; restore; relies on the helper leaving r2 untouched (helper not visible here - confirm)
13135 RET
13136
; 16x16 angular mode 22 front end for ang16_mode_14_22 (selector r6d = 1).
; Preloads xm1 and xm14 with the projected reference words, taken 64 bytes
; further into the buffer than intra_pred_ang16_14 takes them.
; NOTE(review): the 4-byte stack slot requested by "0-4" is never referenced in
; this wrapper.
cglobal intra_pred_ang16_22, 3,7,15, 0-4
    movu        xm14, [r2 + 88]                 ; [ x x x x 15 x x 12]
    pshufb      xm14, [pw_ang16_14_22]          ; [12 12 15 15 x x x x]
    movu        xm1, [r2 + 68]                  ; [ x x 7 x 5 x x 2]
    pinsrw      xm1, [r2 + 84], 1               ; [ x x 7 x 5 x 10 2]
    pshufb      xm1, [pw_ang16_14_22]           ; [ 2 2 5 5 7 7 10 10]
    lea         r3, [ang_table_avx2 + 16 * 32]  ; table entry 16 (32-byte entries)
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    xor         r6d, r6d
    inc         r6d
    call        ang16_mode_14_22
    RET
13151
; 16x16 angular mode 15 front end for ang16_mode_15_21.
; The word at [r2 + 64] is overwritten with the word at [r2] for the duration of
; the call and written back afterwards; the original is parked in the 4-byte
; stack slot requested by the trailing "0-4". xm1 and xm14 are preloaded with
; the projected reference words the helper inserts below its own samples.
13152 cglobal intra_pred_ang16_15, 3,7,15, 0-4
13153 movzx r5d, word [r2 + 64] ; word about to be overwritten
13154 movzx r6d, word [r2] ; replacement word
13155 mov [rsp], r5w ; keep the original for the restore below
13156 mov [r2 + 64], r6w
13157
13158 add r1d, r1d ; presumably pixel stride -> byte stride
13159 lea r4, [r1 * 3]
13160 lea r3, [ang_table_avx2 + 16 * 32] ; table entry 16 (32-byte entries)
13161 movu xm1, [r2 + 4] ; [ x 8 x 6 x 4 x 2]
13162 movu xm14, [r2 + 18] ; [ x 15 x 13 x 11 x 9]
13163 pshufb xm14, [pw_ang16_15_21] ; [ 9 9 11 11 13 13 15 15]
13164 pshufb xm1, [pw_ang16_15_21] ; [ 2 2 4 4 6 6 8 8]
13165 xor r6d, r6d ; selector = 0 (intra_pred_ang16_21 passes 1)
13166 add r2, 64 ; r2 now addresses the patched word
13167
13168 call ang16_mode_15_21
13169
13170 mov r6d, [rsp] ; dword load, but only the low word (the saved sample) is used
13171 mov [r2], r6w ; restore; relies on r2 being unchanged by the call (the helper body never writes r2; its store macro is not visible - confirm)
13172 RET
13173
; 16x16 angular mode 21 front end for ang16_mode_15_21 (selector r6d = 1).
; Preloads xm1 and xm14 with the projected reference words, taken 64 bytes
; further into the buffer than intra_pred_ang16_15 takes them.
; NOTE(review): the 4-byte stack slot requested by "0-4" is never referenced in
; this wrapper.
cglobal intra_pred_ang16_21, 3,7,15, 0-4
    movu        xm14, [r2 + 82]                 ; [ x 15 x 13 x 11 x 9]
    pshufb      xm14, [pw_ang16_15_21]          ; [ 9 9 11 11 13 13 15 15]
    movu        xm1, [r2 + 68]                  ; [ x 8 x 6 x 4 x 2]
    pshufb      xm1, [pw_ang16_15_21]           ; [ 2 2 4 4 6 6 8 8]
    lea         r3, [ang_table_avx2 + 16 * 32]  ; table entry 16 (32-byte entries)
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    xor         r6d, r6d
    inc         r6d
    call        ang16_mode_15_21
    RET
13187
; 16x16 angular mode 16 front end for ang16_mode_16_20.
; The word at [r2 + 64] is overwritten with the word at [r2] for the duration of
; the call and written back afterwards; the original is parked in the 4-byte
; stack slot requested by the trailing "0-4". xm1, xm14 and xm2 are preloaded
; with the projected reference words the helper inserts below its own samples.
13188 cglobal intra_pred_ang16_16, 3,7,15, 0-4
13189 movzx r5d, word [r2 + 64] ; word about to be overwritten
13190 movzx r6d, word [r2] ; replacement word
13191 mov [rsp], r5w ; keep the original for the restore below
13192 mov [r2 + 64], r6w
13193
13194 add r1d, r1d ; presumably pixel stride -> byte stride
13195 lea r4, [r1 * 3]
13196 lea r3, [ang_table_avx2 + 16 * 32] ; table entry 16 (32-byte entries)
13197 movu xm1, [r2 + 4] ; [ x x x 6 5 x 3 2]
13198 movu xm14, [r2 + 16] ; [ x x x 12 11 x 9 8]
13199 movu xm2, [r2 + 28] ; [ x x x x x x 15 14]
13200 pshufb xm14, [pw_ang16_16_20] ; [ 8 8 9 9 11 11 12 12]
13201 pshufb xm1, [pw_ang16_16_20] ; [ 2 2 3 3 5 5 6 6]
13202 pshufb xm2, [pw_ang16_16_20] ; [14 14 15 15 x x x x]
13203 xor r6d, r6d ; selector = 0 (intra_pred_ang16_20 passes 1)
13204 add r2, 64 ; r2 now addresses the patched word
13205
13206 call ang16_mode_16_20
13207
13208 mov r6d, [rsp] ; dword load, but only the low word (the saved sample) is used
13209 mov [r2], r6w ; restore; relies on r2 being unchanged by the call (the helper body never writes r2; its store macro is not visible - confirm)
13210 RET
13211
; 16x16 angular mode 20 front end for ang16_mode_16_20 (selector r6d = 1).
; Preloads xm1, xm14 and xm2 with the projected reference words, taken 64 bytes
; further into the buffer than intra_pred_ang16_16 takes them.
; NOTE(review): the 4-byte stack slot requested by "0-4" is never referenced in
; this wrapper.
cglobal intra_pred_ang16_20, 3,7,15, 0-4
    movu        xm2, [r2 + 92]                  ; [ x x x x x x 15 14]
    pshufb      xm2, [pw_ang16_16_20]           ; [14 14 15 15 x x x x]
    movu        xm14, [r2 + 80]                 ; [ x x x 12 11 x 9 8]
    pshufb      xm14, [pw_ang16_16_20]          ; [ 8 8 9 9 11 11 12 12]
    movu        xm1, [r2 + 68]                  ; [ x x x 6 5 x 3 2]
    pshufb      xm1, [pw_ang16_16_20]           ; [ 2 2 3 3 5 5 6 6]
    lea         r3, [ang_table_avx2 + 16 * 32]  ; table entry 16 (32-byte entries)
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    xor         r6d, r6d
    inc         r6d
    call        ang16_mode_16_20
    RET
13227
; 16x16 angular mode 17 front end for ang16_mode_17_19.
; The word at [r2 + 64] is overwritten with the word at [r2] for the duration of
; the call and written back afterwards; the original is parked in the 4-byte
; stack slot requested by the trailing "0-4". xm1, xm14 and xm2 are preloaded
; with the projected reference words the helper inserts below its own samples.
; NOTE(review): the element lists that used to annotate the three loads and
; shuffles were copied from intra_pred_ang16_16, whose loads start at bytes
; 4, 16 and 28. With the offsets used here (2, 12, 22) the shuffled results are
; presumably [1 1 2 2 4 4 5 5], [6 6 7 7 9 9 10 10] and
; [11 11 12 12 14 14 15 15] - confirm against pw_ang16_16_20.
13228 cglobal intra_pred_ang16_17, 3,7,15, 0-4
13229 movzx r5d, word [r2 + 64] ; word about to be overwritten
13230 movzx r6d, word [r2] ; replacement word
13231 mov [rsp], r5w ; keep the original for the restore below
13232 mov [r2 + 64], r6w
13233
13234 add r1d, r1d ; presumably pixel stride -> byte stride
13235 lea r4, [r1 * 3]
13236 lea r3, [ang_table_avx2 + 16 * 32] ; table entry 16 (32-byte entries)
13237 movu xm1, [r2 + 2] ; words 1..8
13238 movu xm14, [r2 + 12] ; words 6..13
13239 movu xm2, [r2 + 22] ; words 11..18
13240 pshufb xm14, [pw_ang16_16_20] ; same word selection as modes 16/20, applied to words 6..13
13241 pshufb xm1, [pw_ang16_16_20] ; same word selection, applied to words 1..8
13242 pshufb xm2, [pw_ang16_16_20] ; same word selection, applied to words 11..18
13243 xor r6d, r6d ; selector = 0 (intra_pred_ang16_19 passes 1)
13244 add r2, 64 ; r2 now addresses the patched word
13245
13246 call ang16_mode_17_19
13247
13248 mov r6d, [rsp] ; dword load, but only the low word (the saved sample) is used
13249 mov [r2], r6w ; restore; relies on r2 being unchanged by the call (the helper body never writes r2; its store macro is not visible - confirm)
13250 RET
13251
; 16x16 angular mode 19 front end for ang16_mode_17_19 (selector r6d = 1).
; Preloads xm1, xm14 and xm2 with the projected reference words, taken 64 bytes
; further into the buffer than intra_pred_ang16_17 takes them. Word numbers
; below are counted from byte offset 64.
; NOTE(review): which words pw_ang16_16_20 keeps is defined elsewhere; judging
; by the mode 16/20 annotations it duplicates words 0, 1, 3 and 4 of each load,
; i.e. presumably 1, 2, 4, 5 / 6, 7, 9, 10 / 11, 12, 14, 15 here - confirm.
; NOTE(review): the 4-byte stack slot requested by "0-4" is never referenced in
; this wrapper.
cglobal intra_pred_ang16_19, 3,7,15, 0-4
    movu        xm2, [r2 + 86]                  ; words 11..18
    pshufb      xm2, [pw_ang16_16_20]
    movu        xm14, [r2 + 76]                 ; words 6..13
    pshufb      xm14, [pw_ang16_16_20]
    movu        xm1, [r2 + 66]                  ; words 1..8
    pshufb      xm1, [pw_ang16_16_20]
    lea         r3, [ang_table_avx2 + 16 * 32]  ; table entry 16 (32-byte entries)
    add         r1d, r1d
    lea         r4, [r1 + r1 * 2]
    xor         r6d, r6d
    inc         r6d
    call        ang16_mode_17_19
    RET
13267
;-----------------------------------------------------------------------------
; intra_pred_ang16_18: 16x16 prediction in which each row is the previous row
; shifted by one word, with a new word entering at the low end.
;   r0 - destination (16 rows of 16 words are written)
;   r1 - stride; doubled below (presumably pixels -> bytes for 16-bit samples)
;   r2 - reference buffer; words 0..15 at the start and words 1..16 counted
;        from byte offset 64 are read
; Rows 0-7 slide a window over m1:m0, rows 8-15 over m0:m3, two bytes at a time.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_18, 3,5,4
    add         r1d, r1d
    lea         r4, [r1 * 3]
    movu        m1, [r2]
    movu        m0, [r2 + 2 + 64]
    pshufb      m0, [pw_swap16]                 ; presumably reverses the word order - mask defined elsewhere, confirm
    mova        m3, m0                          ; keep the shuffled words for rows 8-15
    vinserti128 m0, m0, xm1, 1                  ; upper lane = low lane of m1
    movu        [r0], m1
    palignr     m2, m1, m0, 14
    movu        [r0 + r1], m2

    palignr     m2, m1, m0, 12
    movu        [r0 + r1 * 2], m2
    palignr     m2, m1, m0, 10
    movu        [r0 + r4], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m1, m0, 8
    movu        [r0], m2
    palignr     m2, m1, m0, 6
    movu        [r0 + r1], m2
    palignr     m2, m1, m0, 4
    movu        [r0 + r1 * 2], m2
    palignr     m2, m1, m0, 2
    movu        [r0 + r4], m2

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m0
    vpermq      m3, m3, 01001110b               ; swap the two 128-bit halves
    palignr     m2, m0, m3, 14
    movu        [r0 + r1], m2
    palignr     m2, m0, m3, 12
    movu        [r0 + r1 * 2], m2
    palignr     m2, m0, m3, 10
    movu        [r0 + r4], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m0, m3, 8
    movu        [r0], m2
    palignr     m2, m0, m3, 6
    movu        [r0 + r1], m2
    palignr     m2, m0, m3, 4
    movu        [r0 + r1 * 2], m2
    palignr     m2, m0, m3, 2
    movu        [r0 + r4], m2
    RET
13317
13318 ;-------------------------------------------------------------------------------------------------------
13319 ; end of avx2 code for intra_pred_ang16 mode 2 to 34
13320 ;-------------------------------------------------------------------------------------------------------
13321
13322 ;-------------------------------------------------------------------------------------------------------
13323 ; avx2 code for intra_pred_ang32 mode 2 to 34 start
13324 ;-------------------------------------------------------------------------------------------------------
13325 INIT_YMM avx2 ; x86inc: from here on mN aliases a ymm register (32 bytes)
; intra_pred_ang32_2: copies a sliding window of the reference buffer into 32 rows
; of 64 bytes each.  Row k receives the 64 bytes starting at [r2 + 4 + 2 * k].
;   r0  = destination, r1 = stride (doubled below before it is used as a byte pitch)
;   r2  = reference buffer, r3m = fourth argument, compared against 34
; When r3m == 34 the window is taken from r2 as passed, otherwise from r2 + 128.
; NOTE(review): presumably this serves both mode 2 and mode 34 -- confirm with the
; dispatch table that installs this symbol.
; Each pair (m0, m1) / (m3, m4) holds loads that are 16 bytes apart; palignr works
; per 128-bit lane, so "palignr dst, hi, lo, n" yields the 32 contiguous bytes that
; start n bytes after the address lo was loaded from.
13326 cglobal intra_pred_ang32_2, 3,5,6
13327 lea r4, [r2] ; r4 = r2 as passed in
13328 add r2, 128
13329 cmp r3m, byte 34
13330 cmove r2, r4 ; r3m == 34: undo the +128
13331 add r1d, r1d ; r1 *= 2
13332 lea r3, [r1 * 3] ; r3 = 3 * r1 (r3m is no longer needed)
13333 movu m0, [r2 + 4]
13334 movu m1, [r2 + 20]
13335 movu m3, [r2 + 36]
13336 movu m4, [r2 + 52]
13337
13338 movu [r0], m0 ; rows 0..7: windows starting at r2 + 4 .. r2 + 18
13339 movu [r0 + 32], m3
13340 palignr m2, m1, m0, 2
13341 palignr m5, m4, m3, 2
13342 movu [r0 + r1], m2
13343 movu [r0 + r1 + 32], m5
13344 palignr m2, m1, m0, 4
13345 palignr m5, m4, m3, 4
13346 movu [r0 + r1 * 2], m2
13347 movu [r0 + r1 * 2 + 32], m5
13348 palignr m2, m1, m0, 6
13349 palignr m5, m4, m3, 6
13350 movu [r0 + r3], m2
13351 movu [r0 + r3 + 32], m5
13352
13353 lea r0, [r0 + r1 * 4]
13354 palignr m2, m1, m0, 8
13355 palignr m5, m4, m3, 8
13356 movu [r0], m2
13357 movu [r0 + 32], m5
13358 palignr m2, m1, m0, 10
13359 palignr m5, m4, m3, 10
13360 movu [r0 + r1], m2
13361 movu [r0 + r1 + 32], m5
13362 palignr m2, m1, m0, 12
13363 palignr m5, m4, m3, 12
13364 movu [r0 + r1 * 2], m2
13365 movu [r0 + r1 * 2 + 32], m5
13366 palignr m2, m1, m0, 14
13367 palignr m5, m4, m3, 14
13368 movu [r0 + r3], m2
13369 movu [r0 + r3 + 32], m5
13370
13371 movu m0, [r2 + 36] ; rows 8..15: (m1, m0) and (m4, m3) now act as the (lo, hi) pairs
13372 movu m3, [r2 + 68]
13373 lea r0, [r0 + r1 * 4]
13374 movu [r0], m1
13375 movu [r0 + 32], m4
13376 palignr m2, m0, m1, 2
13377 palignr m5, m3, m4, 2
13378 movu [r0 + r1], m2
13379 movu [r0 + r1 + 32], m5
13380 palignr m2, m0, m1, 4
13381 palignr m5, m3, m4, 4
13382 movu [r0 + r1 * 2], m2
13383 movu [r0 + r1 * 2 + 32], m5
13384 palignr m2, m0, m1, 6
13385 palignr m5, m3, m4, 6
13386 movu [r0 + r3], m2
13387 movu [r0 + r3 + 32], m5
13388
13389 lea r0, [r0 + r1 * 4]
13390 palignr m2, m0, m1, 8
13391 palignr m5, m3, m4, 8
13392 movu [r0], m2
13393 movu [r0 + 32], m5
13394 palignr m2, m0, m1, 10
13395 palignr m5, m3, m4, 10
13396 movu [r0 + r1], m2
13397 movu [r0 + r1 + 32], m5
13398 palignr m2, m0, m1, 12
13399 palignr m5, m3, m4, 12
13400 movu [r0 + r1 * 2], m2
13401 movu [r0 + r1 * 2 + 32], m5
13402 palignr m2, m0, m1, 14
13403 palignr m5, m3, m4, 14
13404 movu [r0 + r3], m2
13405 movu [r0 + r3 + 32], m5
13406
13407 lea r0, [r0 + r1 * 4]
13408 movu m1, [r2 + 52] ; rows 16..23: roles swap back to (m0, m1) / (m3, m4)
13409 movu m4, [r2 + 84]
13410
13411 movu [r0], m0
13412 movu [r0 + 32], m3
13413 palignr m2, m1, m0, 2
13414 palignr m5, m4, m3, 2
13415 movu [r0 + r1], m2
13416 movu [r0 + r1 + 32], m5
13417 palignr m2, m1, m0, 4
13418 palignr m5, m4, m3, 4
13419 movu [r0 + r1 * 2], m2
13420 movu [r0 + r1 * 2 + 32], m5
13421 palignr m2, m1, m0, 6
13422 palignr m5, m4, m3, 6
13423 movu [r0 + r3], m2
13424 movu [r0 + r3 + 32], m5
13425
13426 lea r0, [r0 + r1 * 4]
13427 palignr m2, m1, m0, 8
13428 palignr m5, m4, m3, 8
13429 movu [r0], m2
13430 movu [r0 + 32], m5
13431 palignr m2, m1, m0, 10
13432 palignr m5, m4, m3, 10
13433 movu [r0 + r1], m2
13434 movu [r0 + r1 + 32], m5
13435 palignr m2, m1, m0, 12
13436 palignr m5, m4, m3, 12
13437 movu [r0 + r1 * 2], m2
13438 movu [r0 + r1 * 2 + 32], m5
13439 palignr m2, m1, m0, 14
13440 palignr m5, m4, m3, 14
13441 movu [r0 + r3], m2
13442 movu [r0 + r3 + 32], m5
13443
13444 movu m0, [r2 + 68] ; rows 24..31; the last load touches bytes up to r2 + 131
13445 movu m3, [r2 + 100]
13446 lea r0, [r0 + r1 * 4]
13447 movu [r0], m1
13448 movu [r0 + 32], m4
13449 palignr m2, m0, m1, 2
13450 palignr m5, m3, m4, 2
13451 movu [r0 + r1], m2
13452 movu [r0 + r1 + 32], m5
13453 palignr m2, m0, m1, 4
13454 palignr m5, m3, m4, 4
13455 movu [r0 + r1 * 2], m2
13456 movu [r0 + r1 * 2 + 32], m5
13457 palignr m2, m0, m1, 6
13458 palignr m5, m3, m4, 6
13459 movu [r0 + r3], m2
13460 movu [r0 + r3 + 32], m5
13461
13462 lea r0, [r0 + r1 * 4]
13463 palignr m2, m0, m1, 8
13464 palignr m5, m3, m4, 8
13465 movu [r0], m2
13466 movu [r0 + 32], m5
13467 palignr m2, m0, m1, 10
13468 palignr m5, m3, m4, 10
13469 movu [r0 + r1], m2
13470 movu [r0 + r1 + 32], m5
13471 palignr m2, m0, m1, 12
13472 palignr m5, m3, m4, 12
13473 movu [r0 + r1 * 2], m2
13474 movu [r0 + r1 * 2 + 32], m5
13475 palignr m2, m0, m1, 14
13476 palignr m5, m3, m4, 14
13477 movu [r0 + r3], m2
13478 movu [r0 + r3 + 32], m5
13479 RET
13480
; intra_pred_ang32_3: builds a 32x32 result out of four calls to ang16_mode_3_33
; (defined outside this chunk), entered with r6d = 0.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer (+128 here)
; Call order: upper-left, upper-right (+32 bytes), lower-left, lower-right.
; NOTE(review): relies on the helper preserving r1, r2, r3, r4, r6 and r7; the
; helper is not visible here -- confirm.
13481 cglobal intra_pred_ang32_3, 3,8,13
13482 add r2, 128
13483 xor r6d, r6d ; r6d = 0
13484 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> row 16 of the weight table (32 bytes per row)
13485 add r1d, r1d ; r1 *= 2
13486 lea r4, [r1 * 3]
13487 lea r7, [r0 + 8 * r1] ; remembered so the lower half can be located later
13488
13489 call ang16_mode_3_33
13490
13491 add r2, 26
13492 lea r0, [r0 + 32]
13493
13494 call ang16_mode_3_33
13495
13496 add r2, 6 ; 26 + 6: reference is now 32 bytes past its position at the first call
13497 lea r0, [r7 + 8 * r1] ; = original r0 + 16 * r1
13498
13499 call ang16_mode_3_33
13500
13501 add r2, 26
13502 lea r0, [r0 + 32]
13503
13504 call ang16_mode_3_33
13505 RET
13506
; intra_pred_ang32_33: same four-call structure as the mode 3 entry point, but with
; r6d = 1 and without the +128 adjustment of r2.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; r0 is only reloaded once (from r5 = original r0 + 32); between the 1st/2nd and
; 3rd/4th calls it is used as the helper left it.
; NOTE(review): presumably ang16_mode_3_33 advances r0 past the rows it wrote when
; r6d != 0 and leaves r5 untouched -- helper not visible here, confirm.
13507 cglobal intra_pred_ang32_33, 3,7,13
13508 xor r6d, r6d
13509 inc r6d ; r6d = 1
13510 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> row 16 of the weight table (32 bytes per row)
13511 add r1d, r1d ; r1 *= 2
13512 lea r4, [r1 * 3]
13513 lea r5, [r0 + 32] ; start address for the third call
13514
13515 call ang16_mode_3_33
13516
13517 add r2, 26
13518
13519 call ang16_mode_3_33
13520
13521 add r2, 6 ; 26 + 6: reference is now 32 bytes past its position at the first call
13522 mov r0, r5
13523
13524 call ang16_mode_3_33
13525
13526 add r2, 26
13527
13528 call ang16_mode_3_33
13529 RET
13530
13531 ;; angle 32, modes 4 and 32
; ang32_mode_4_32: internal helper reached by `call` from intra_pred_ang32_4 and
; intra_pred_ang32_32.
;   r2  -> 16-bit reference samples; loads cover bytes [r2 + 2, r2 + 66)
;   r3  -> ang_table_avx2 + 18 * 32 (set by both callers); the bracketed number
;          after each first pmaddwd is the resulting table row, i.e. the weight f
;   r6d -> tested once below; no later instruction in this routine writes EFLAGS,
;          so the result is still live at both TRANSPOSE_STORE_AVX2 sites
; Produces 16 vectors of 16 words, each lane being
;   ((32 - f) * ref[i] + f * ref[i + 1] + pd_16) >> 5
; and hands them, eight at a time, to TRANSPOSE_STORE_AVX2.
; NOTE(review): TRANSPOSE_STORE_AVX2 and pd_16 are defined outside this chunk;
; presumably the macro branches on ZF and pd_16 is a packed dword 16 -- confirm.
13532 cglobal ang32_mode_4_32
13533 test r6d, r6d
13534
13535 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
13536 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
13537
13538 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
13539 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
13540
13541 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
13542 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
13543 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
13544 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
13545
13546 pmaddwd m4, m3, [r3 - 13 * 32] ; [5]
13547 paddd m4, [pd_16]
13548 psrld m4, 5
13549 pmaddwd m5, m0, [r3 - 13 * 32]
13550 paddd m5, [pd_16]
13551 psrld m5, 5
13552 packusdw m4, m5
13553
13554 pmaddwd m5, m3, [r3 + 8 * 32] ; [26]
13555 paddd m5, [pd_16]
13556 psrld m5, 5
13557 pmaddwd m8, m0, [r3 + 8 * 32]
13558 paddd m8, [pd_16]
13559 psrld m8, 5
13560 packusdw m5, m8
13561
13562 palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2]
13563 pmaddwd m6, [r3 - 3 * 32] ; [15]
13564 paddd m6, [pd_16]
13565 psrld m6, 5
13566 palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6]
13567 pmaddwd m7, [r3 - 3 * 32]
13568 paddd m7, [pd_16]
13569 psrld m7, 5
13570 packusdw m6, m7
13571
13572 palignr m8, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3]
13573 pmaddwd m7, m8, [r3 - 14 * 32] ; [4]
13574 paddd m7, [pd_16]
13575 psrld m7, 5
13576 palignr m9, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7]
13577 pmaddwd m10, m9, [r3 - 14 * 32]
13578 paddd m10, [pd_16]
13579 psrld m10, 5
13580 packusdw m7, m10
13581
13582 pmaddwd m8, [r3 + 7 * 32] ; [25]
13583 paddd m8, [pd_16]
13584 psrld m8, 5
13585 pmaddwd m9, [r3 + 7 * 32]
13586 paddd m9, [pd_16]
13587 psrld m9, 5
13588 packusdw m8, m9
13589
13590 palignr m9, m0, m3, 12
13591 pmaddwd m9, [r3 - 4 * 32] ; [14]
13592 paddd m9, [pd_16]
13593 psrld m9, 5
13594 palignr m3, m2, m0, 12
13595 pmaddwd m3, [r3 - 4 * 32]
13596 paddd m3, [pd_16]
13597 psrld m3, 5
13598 packusdw m9, m3
13599
13600 pmaddwd m10, m0, [r3 - 15 * 32] ; [3]
13601 paddd m10, [pd_16]
13602 psrld m10, 5
13603 pmaddwd m3, m2, [r3 - 15 * 32]
13604 paddd m3, [pd_16]
13605 psrld m3, 5
13606 packusdw m10, m3
13607
13608 pmaddwd m11, m0, [r3 + 6 * 32] ; [24]
13609 paddd m11, [pd_16]
13610 psrld m11, 5
13611 pmaddwd m3, m2, [r3 + 6 * 32]
13612 paddd m3, [pd_16]
13613 psrld m3, 5
13614 packusdw m11, m3
13615
13616 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
13617
13618 palignr m4, m2, m0, 4
13619 pmaddwd m4, [r3 - 5* 32] ; [13]
13620 paddd m4, [pd_16]
13621 psrld m4, 5
13622 palignr m5, m1, m2, 4
13623 pmaddwd m5, [r3 - 5 * 32]
13624 paddd m5, [pd_16]
13625 psrld m5, 5
13626 packusdw m4, m5
13627
13628 palignr m6, m2, m0, 8
13629 pmaddwd m5, m6, [r3 - 16 * 32] ; [2]
13630 paddd m5, [pd_16]
13631 psrld m5, 5
13632 palignr m7, m1, m2, 8
13633 pmaddwd m8, m7, [r3 - 16 * 32]
13634 paddd m8, [pd_16]
13635 psrld m8, 5
13636 packusdw m5, m8
13637
13638 pmaddwd m6, [r3 + 5 * 32] ; [23]
13639 paddd m6, [pd_16]
13640 psrld m6, 5
13641 pmaddwd m7, [r3 + 5 * 32]
13642 paddd m7, [pd_16]
13643 psrld m7, 5
13644 packusdw m6, m7
13645
13646 palignr m7, m2, m0, 12
13647 pmaddwd m7, [r3 - 6 * 32] ; [12]
13648 paddd m7, [pd_16]
13649 psrld m7, 5
13650 palignr m8, m1, m2, 12
13651 pmaddwd m8, [r3 - 6 * 32]
13652 paddd m8, [pd_16]
13653 psrld m8, 5
13654 packusdw m7, m8
13655
13656 movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
13657 pmaddwd m8, m2, [r3 - 17 * 32] ; [1]
13658 paddd m8, [pd_16]
13659 psrld m8, 5
13660 pmaddwd m9, m1, [r3 - 17 * 32]
13661 paddd m9, [pd_16]
13662 psrld m9, 5
13663 packusdw m8, m9
13664
13665 palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18]
13666 punpcklwd m0, m3 ; [29 28 28 27 27 26 26 25 21 20 20 19 19 18 18 17]
13667
13668 pmaddwd m9, m2, [r3 + 4 * 32] ; [22]
13669 paddd m9, [pd_16]
13670 psrld m9, 5
13671 pmaddwd m3, m1, [r3 + 4 * 32]
13672 paddd m3, [pd_16]
13673 psrld m3, 5
13674 packusdw m9, m3
13675
13676 palignr m10, m1, m2, 4
13677 pmaddwd m10, [r3 - 7 * 32] ; [11]
13678 paddd m10, [pd_16]
13679 psrld m10, 5
13680 palignr m11, m0, m1, 4
13681 pmaddwd m11, [r3 - 7 * 32]
13682 paddd m11, [pd_16]
13683 psrld m11, 5
13684 packusdw m10, m11
13685
13686 palignr m3, m1, m2, 8
13687 pmaddwd m3, [r3 - 18 * 32] ; [0]
13688 paddd m3, [pd_16]
13689 psrld m3, 5
13690 palignr m0, m1, 8
13691 pmaddwd m0, [r3 - 18 * 32]
13692 paddd m0, [pd_16]
13693 psrld m0, 5
13694 packusdw m3, m0
13695 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16
13696 ret
13697
; intra_pred_ang32_4: 32x32 result from two 16-wide strips per half: ang16_mode_4_32
; (defined outside this chunk) for the first strip, ang32_mode_4_32 for the strip
; 32 bytes to the right.  Entered with r6d = 0.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer (+128 here)
; NOTE(review): relies on both helpers preserving r1, r2, r3, r4, r6 and r7 --
; ang16_mode_4_32 and TRANSPOSE_STORE_AVX2 are not visible here, confirm.
13698 cglobal intra_pred_ang32_4, 3,8,13
13699 add r2, 128
13700 xor r6d, r6d ; r6d = 0
13701 lea r3, [ang_table_avx2 + 18 * 32] ; r3 -> row 18 of the weight table (32 bytes per row)
13702 add r1d, r1d ; r1 *= 2
13703 lea r4, [r1 * 3]
13704 lea r7, [r0 + 8 * r1] ; remembered so the lower half can be located later
13705
13706 call ang16_mode_4_32
13707
13708 add r2, 22
13709 lea r0, [r0 + 32]
13710
13711 call ang32_mode_4_32
13712
13713 add r2, 10 ; 22 + 10: reference is now 32 bytes past its position at the first call
13714 lea r0, [r7 + 8 * r1] ; = original r0 + 16 * r1
13715
13716 call ang16_mode_4_32
13717
13718 add r2, 22
13719 lea r0, [r0 + 32]
13720
13721 call ang32_mode_4_32
13722 RET
13723
; intra_pred_ang32_32: same helper sequence as the mode 4 entry point, but with
; r6d = 1 and without the +128 adjustment of r2.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; r0 is only reloaded once (from r5 = original r0 + 32); for the 2nd and 4th call
; it is used as the preceding helper left it.
; NOTE(review): presumably the helpers advance r0 past the rows they wrote when
; r6d != 0 and leave r5 untouched -- not visible here, confirm.
13724 cglobal intra_pred_ang32_32, 3,7,13
13725 xor r6d, r6d
13726 inc r6d ; r6d = 1
13727 lea r3, [ang_table_avx2 + 18 * 32] ; r3 -> row 18 of the weight table (32 bytes per row)
13728 add r1d, r1d ; r1 *= 2
13729 lea r4, [r1 * 3]
13730 lea r5, [r0 + 32] ; start address for the third call
13731
13732 call ang16_mode_4_32
13733
13734 add r2, 22
13735
13736 call ang32_mode_4_32
13737
13738 add r2, 10 ; 22 + 10: reference is now 32 bytes past its position at the first call
13739 mov r0, r5
13740
13741 call ang16_mode_4_32
13742
13743 add r2, 22
13744
13745 call ang32_mode_4_32
13746 RET
13747
13748 ;; angle 32, modes 5 and 31
; ang32_mode_5_31: internal helper reached by `call` from intra_pred_ang32_5 and
; intra_pred_ang32_31.
;   r2  -> 16-bit reference samples; loads cover bytes [r2 + 2, r2 + 52)
;   r3  -> ang_table_avx2 + 16 * 32 (set by both callers); the bracketed number
;          after each first pmaddwd is the resulting table row, i.e. the weight f
;   r6d -> tested once below; no later instruction in this routine writes EFLAGS,
;          so the result is still live at both TRANSPOSE_STORE_AVX2 sites
; Produces 16 vectors of 16 words, each lane being
;   ((32 - f) * ref[i] + f * ref[i + 1] + pd_16) >> 5
; and hands them, eight at a time, to TRANSPOSE_STORE_AVX2.
; NOTE(review): TRANSPOSE_STORE_AVX2 and pd_16 are defined outside this chunk;
; presumably the macro branches on ZF and pd_16 is a packed dword 16 -- confirm.
13749 cglobal ang32_mode_5_31
13750 test r6d, r6d
13751
13752 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
13753 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
13754
13755 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
13756 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
13757
13758 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
13759 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
13760 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
13761 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
13762
13763 pmaddwd m4, m3, [r3 - 15 * 32] ; [1]
13764 paddd m4, [pd_16]
13765 psrld m4, 5
13766 pmaddwd m5, m0, [r3 - 15 * 32]
13767 paddd m5, [pd_16]
13768 psrld m5, 5
13769 packusdw m4, m5
13770
13771 pmaddwd m5, m3, [r3 + 2 * 32] ; [18]
13772 paddd m5, [pd_16]
13773 psrld m5, 5
13774 pmaddwd m8, m0, [r3 + 2 * 32]
13775 paddd m8, [pd_16]
13776 psrld m8, 5
13777 packusdw m5, m8
13778
13779 palignr m7, m0, m3, 4 ; window advanced by one sample; reused for [3] and [20]
13780 pmaddwd m6, m7, [r3 - 13 * 32] ; [3]
13781 paddd m6, [pd_16]
13782 psrld m6, 5
13783 palignr m8, m2, m0, 4
13784 pmaddwd m9, m8, [r3 - 13 * 32]
13785 paddd m9, [pd_16]
13786 psrld m9, 5
13787 packusdw m6, m9
13788
13789 pmaddwd m7, [r3 + 4 * 32] ; [20]
13790 paddd m7, [pd_16]
13791 psrld m7, 5
13792 pmaddwd m8, [r3 + 4 * 32]
13793 paddd m8, [pd_16]
13794 psrld m8, 5
13795 packusdw m7, m8
13796
13797 palignr m9, m0, m3, 8 ; window advanced by two samples; reused for [5] and [22]
13798 pmaddwd m8, m9, [r3 - 11 * 32] ; [5]
13799 paddd m8, [pd_16]
13800 psrld m8, 5
13801 palignr m10, m2, m0, 8
13802 pmaddwd m11, m10, [r3 - 11 * 32]
13803 paddd m11, [pd_16]
13804 psrld m11, 5
13805 packusdw m8, m11
13806
13807 pmaddwd m9, [r3 + 6 * 32] ; [22]
13808 paddd m9, [pd_16]
13809 psrld m9, 5
13810 pmaddwd m10, [r3 + 6 * 32]
13811 paddd m10, [pd_16]
13812 psrld m10, 5
13813 packusdw m9, m10
13814
13815 palignr m11, m0, m3, 12 ; window advanced by three samples; reused for [7] and [24]
13816 pmaddwd m10, m11, [r3 - 9 * 32] ; [7]
13817 paddd m10, [pd_16]
13818 psrld m10, 5
13819 palignr m12, m2, m0, 12
13820 pmaddwd m3, m12, [r3 - 9 * 32]
13821 paddd m3, [pd_16]
13822 psrld m3, 5
13823 packusdw m10, m3
13824
13825 pmaddwd m11, [r3 + 8 * 32] ; [24]
13826 paddd m11, [pd_16]
13827 psrld m11, 5
13828 pmaddwd m12, [r3 + 8 * 32]
13829 paddd m12, [pd_16]
13830 psrld m12, 5
13831 packusdw m11, m12
13832
13833 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
13834
13835 pmaddwd m4, m0, [r3 - 7 * 32] ; [9]
13836 paddd m4, [pd_16]
13837 psrld m4, 5
13838 pmaddwd m5, m2, [r3 - 7 * 32]
13839 paddd m5, [pd_16]
13840 psrld m5, 5
13841 packusdw m4, m5
13842
13843 pmaddwd m5, m0, [r3 + 10 * 32] ; [26]
13844 paddd m5, [pd_16]
13845 psrld m5, 5
13846 pmaddwd m3, m2, [r3 + 10 * 32]
13847 paddd m3, [pd_16]
13848 psrld m3, 5
13849 packusdw m5, m3
13850
13851 palignr m7, m2, m0, 4
13852 pmaddwd m6, m7, [r3 - 5 * 32] ; [11]
13853 paddd m6, [pd_16]
13854 psrld m6, 5
13855 palignr m8, m1, m2, 4
13856 pmaddwd m9, m8, [r3 - 5 * 32]
13857 paddd m9, [pd_16]
13858 psrld m9, 5
13859 packusdw m6, m9
13860
13861 pmaddwd m7, [r3 + 12 * 32] ; [28]
13862 paddd m7, [pd_16]
13863 psrld m7, 5
13864 pmaddwd m8, [r3 + 12 * 32]
13865 paddd m8, [pd_16]
13866 psrld m8, 5
13867 packusdw m7, m8
13868
13869 palignr m9, m2, m0, 8
13870 pmaddwd m8, m9, [r3 - 3 * 32] ; [13]
13871 paddd m8, [pd_16]
13872 psrld m8, 5
13873 palignr m3, m1, m2, 8
13874 pmaddwd m10, m3, [r3 - 3 * 32]
13875 paddd m10, [pd_16]
13876 psrld m10, 5
13877 packusdw m8, m10
13878
13879 pmaddwd m9, [r3 + 14 * 32] ; [30]
13880 paddd m9, [pd_16]
13881 psrld m9, 5
13882 pmaddwd m3, [r3 + 14 * 32]
13883 paddd m3, [pd_16]
13884 psrld m3, 5
13885 packusdw m9, m3
13886
13887 palignr m10, m2, m0, 12
13888 pmaddwd m10, [r3 - 1 * 32] ; [15]
13889 paddd m10, [pd_16]
13890 psrld m10, 5
13891 palignr m11, m1, m2, 12
13892 pmaddwd m11, [r3 - 1 * 32]
13893 paddd m11, [pd_16]
13894 psrld m11, 5
13895 packusdw m10, m11
13896
13897 pmaddwd m2, [r3 - 16 * 32] ; [0]
13898 paddd m2, [pd_16]
13899 psrld m2, 5
13900 pmaddwd m1, [r3 - 16 * 32]
13901 paddd m1, [pd_16]
13902 psrld m1, 5
13903 packusdw m2, m1
13904 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16
13905 ret
13906
; intra_pred_ang32_5: 32x32 result from two 16-wide strips per half: ang16_mode_5_31
; (defined outside this chunk) for the first strip, ang32_mode_5_31 for the strip
; 32 bytes to the right.  Entered with r6d = 0.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer (+128 here)
; NOTE(review): relies on both helpers preserving r1, r2, r3, r4, r6 and r7 --
; ang16_mode_5_31 and TRANSPOSE_STORE_AVX2 are not visible here, confirm.
13907 cglobal intra_pred_ang32_5, 3,8,13
13908 add r2, 128
13909 xor r6d, r6d ; r6d = 0
13910 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> row 16 of the weight table (32 bytes per row)
13911 add r1d, r1d ; r1 *= 2
13912 lea r4, [r1 * 3]
13913 lea r7, [r0 + 8 * r1] ; remembered so the lower half can be located later
13914
13915 call ang16_mode_5_31
13916
13917 add r2, 18
13918 lea r0, [r0 + 32]
13919
13920 call ang32_mode_5_31
13921
13922 add r2, 14 ; 18 + 14: reference is now 32 bytes past its position at the first call
13923 lea r0, [r7 + 8 * r1] ; = original r0 + 16 * r1
13924
13925 call ang16_mode_5_31
13926
13927 add r2, 18
13928 lea r0, [r0 + 32]
13929
13930 call ang32_mode_5_31
13931 RET
13932
; intra_pred_ang32_31: same helper sequence as the mode 5 entry point, but with
; r6d = 1 and without the +128 adjustment of r2.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; r0 is only reloaded once (from r5 = original r0 + 32); for the 2nd and 4th call
; it is used as the preceding helper left it.
; NOTE(review): presumably the helpers advance r0 past the rows they wrote when
; r6d != 0 and leave r5 untouched -- not visible here, confirm.
13933 cglobal intra_pred_ang32_31, 3,7,13
13934 xor r6d, r6d
13935 inc r6d ; r6d = 1
13936 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> row 16 of the weight table (32 bytes per row)
13937 add r1d, r1d ; r1 *= 2
13938 lea r4, [r1 * 3]
13939 lea r5, [r0 + 32] ; start address for the third call
13940
13941 call ang16_mode_5_31
13942
13943 add r2, 18
13944
13945 call ang32_mode_5_31
13946
13947 add r2, 14 ; 18 + 14: reference is now 32 bytes past its position at the first call
13948 mov r0, r5
13949
13950 call ang16_mode_5_31
13951
13952 add r2, 18
13953
13954 call ang32_mode_5_31
13955 RET
13956
13957 ;; angle 32, modes 6 and 30
; ang32_mode_6_30: internal helper reached by `call` from intra_pred_ang32_6 and
; intra_pred_ang32_30.
;   r2  -> 16-bit reference samples; loads cover bytes [r2 + 2, r2 + 52)
;   r3  -> ang_table_avx2 + 15 * 32 (set by both callers); the bracketed number
;          after each first pmaddwd is the resulting table row, i.e. the weight f
;   r6d -> tested once below; no later instruction in this routine writes EFLAGS,
;          so the result is still live at both TRANSPOSE_STORE_AVX2 sites
; Produces 16 vectors of 16 words, each lane being
;   ((32 - f) * ref[i] + f * ref[i + 1] + pd_16) >> 5
; and hands them, eight at a time, to TRANSPOSE_STORE_AVX2.
; NOTE(review): TRANSPOSE_STORE_AVX2 and pd_16 are defined outside this chunk;
; presumably the macro branches on ZF and pd_16 is a packed dword 16 -- confirm.
13958 cglobal ang32_mode_6_30
13959 test r6d, r6d
13960
13961 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
13962 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
13963
13964 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
13965 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
13966
13967 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
13968 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
13969 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
13970 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
13971
13972 pmaddwd m4, m3, [r3 + 14 * 32] ; [29]
13973 paddd m4, [pd_16]
13974 psrld m4, 5
13975 pmaddwd m5, m0, [r3 + 14 * 32]
13976 paddd m5, [pd_16]
13977 psrld m5, 5
13978 packusdw m4, m5
13979
13980 palignr m6, m0, m3, 4 ; window advanced by one sample; reused for [10] and [23]
13981 pmaddwd m5, m6, [r3 - 5 * 32] ; [10]
13982 paddd m5, [pd_16]
13983 psrld m5, 5
13984 palignr m7, m2, m0, 4
13985 pmaddwd m8, m7, [r3 - 5 * 32]
13986 paddd m8, [pd_16]
13987 psrld m8, 5
13988 packusdw m5, m8
13989
13990 pmaddwd m6, [r3 + 8 * 32] ; [23]
13991 paddd m6, [pd_16]
13992 psrld m6, 5
13993 pmaddwd m7, [r3 + 8 * 32]
13994 paddd m7, [pd_16]
13995 psrld m7, 5
13996 packusdw m6, m7
13997
13998 palignr m9, m0, m3, 8 ; window advanced by two samples; reused for [4], [17] and [30]
13999 pmaddwd m7, m9, [r3 - 11 * 32] ; [4]
14000 paddd m7, [pd_16]
14001 psrld m7, 5
14002 palignr m12, m2, m0, 8
14003 pmaddwd m11, m12, [r3 - 11 * 32]
14004 paddd m11, [pd_16]
14005 psrld m11, 5
14006 packusdw m7, m11
14007
14008 pmaddwd m8, m9, [r3 + 2 * 32] ; [17]
14009 paddd m8, [pd_16]
14010 psrld m8, 5
14011 pmaddwd m10, m12, [r3 + 2 * 32]
14012 paddd m10, [pd_16]
14013 psrld m10, 5
14014 packusdw m8, m10
14015
14016 pmaddwd m9, [r3 + 15 * 32] ; [30]
14017 paddd m9, [pd_16]
14018 psrld m9, 5
14019 pmaddwd m12, [r3 + 15 * 32]
14020 paddd m12, [pd_16]
14021 psrld m12, 5
14022 packusdw m9, m12
14023
14024 palignr m11, m0, m3, 12 ; window advanced by three samples; reused for [11] and [24]
14025 pmaddwd m10, m11, [r3 - 4 * 32] ; [11]
14026 paddd m10, [pd_16]
14027 psrld m10, 5
14028 palignr m12, m2, m0, 12
14029 pmaddwd m3, m12, [r3 - 4 * 32]
14030 paddd m3, [pd_16]
14031 psrld m3, 5
14032 packusdw m10, m3
14033
14034 pmaddwd m11, [r3 + 9 * 32] ; [24]
14035 paddd m11, [pd_16]
14036 psrld m11, 5
14037 pmaddwd m12, [r3 + 9 * 32]
14038 paddd m12, [pd_16]
14039 psrld m12, 5
14040 packusdw m11, m12
14041
14042 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
14043
14044 pmaddwd m4, m0, [r3 - 10 * 32] ; [5]
14045 paddd m4, [pd_16]
14046 psrld m4, 5
14047 pmaddwd m5, m2, [r3 - 10 * 32]
14048 paddd m5, [pd_16]
14049 psrld m5, 5
14050 packusdw m4, m5
14051
14052 pmaddwd m5, m0, [r3 + 3 * 32] ; [18]
14053 paddd m5, [pd_16]
14054 psrld m5, 5
14055 pmaddwd m3, m2, [r3 + 3 * 32]
14056 paddd m3, [pd_16]
14057 psrld m3, 5
14058 packusdw m5, m3
14059
14060 pmaddwd m6, m0, [r3 + 16 * 32] ; [31]
14061 paddd m6, [pd_16]
14062 psrld m6, 5
14063 pmaddwd m7, m2, [r3 + 16 * 32]
14064 paddd m7, [pd_16]
14065 psrld m7, 5
14066 packusdw m6, m7
14067
14068 palignr m8, m2, m0, 4
14069 pmaddwd m7, m8, [r3 - 3 * 32] ; [12]
14070 paddd m7, [pd_16]
14071 psrld m7, 5
14072 palignr m9, m1, m2, 4
14073 pmaddwd m3, m9, [r3 - 3 * 32]
14074 paddd m3, [pd_16]
14075 psrld m3, 5
14076 packusdw m7, m3
14077
14078 pmaddwd m8, [r3 + 10 * 32] ; [25]
14079 paddd m8, [pd_16]
14080 psrld m8, 5
14081 pmaddwd m9, [r3 + 10 * 32]
14082 paddd m9, [pd_16]
14083 psrld m9, 5
14084 packusdw m8, m9
14085
14086 palignr m10, m2, m0, 8
14087 pmaddwd m9, m10, [r3 - 9 * 32] ; [6]
14088 paddd m9, [pd_16]
14089 psrld m9, 5
14090 palignr m12, m1, m2, 8
14091 pmaddwd m3, m12, [r3 - 9 * 32]
14092 paddd m3, [pd_16]
14093 psrld m3, 5
14094 packusdw m9, m3
14095
14096 pmaddwd m10, [r3 + 4 * 32] ; [19]
14097 paddd m10, [pd_16]
14098 psrld m10, 5
14099 pmaddwd m12, [r3 + 4 * 32]
14100 paddd m12, [pd_16]
14101 psrld m12, 5
14102 packusdw m10, m12
14103
14104 palignr m11, m2, m0, 12
14105 pmaddwd m11, [r3 - 15 * 32] ; [0]
14106 paddd m11, [pd_16]
14107 psrld m11, 5
14108 palignr m3, m1, m2, 12
14109 pmaddwd m3, [r3 - 15 * 32]
14110 paddd m3, [pd_16]
14111 psrld m3, 5
14112 packusdw m11, m3
14113 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
14114 ret
14115
; intra_pred_ang32_6: 32x32 result from two 16-wide strips per half: ang16_mode_6_30
; (defined outside this chunk) for the first strip, ang32_mode_6_30 for the strip
; 32 bytes to the right.  Entered with r6d = 0.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer (+128 here)
; NOTE(review): relies on both helpers preserving r1, r2, r3, r4, r6 and r7 --
; ang16_mode_6_30 and TRANSPOSE_STORE_AVX2 are not visible here, confirm.
14116 cglobal intra_pred_ang32_6, 3,8,14
14117 add r2, 128
14118 xor r6d, r6d ; r6d = 0
14119 lea r3, [ang_table_avx2 + 15 * 32] ; r3 -> row 15 of the weight table (32 bytes per row)
14120 add r1d, r1d ; r1 *= 2
14121 lea r4, [r1 * 3]
14122 lea r7, [r0 + 8 * r1] ; remembered so the lower half can be located later
14123
14124 call ang16_mode_6_30
14125
14126 add r2, 12
14127 lea r0, [r0 + 32]
14128
14129 call ang32_mode_6_30
14130
14131 add r2, 20 ; 12 + 20: reference is now 32 bytes past its position at the first call
14132 lea r0, [r7 + 8 * r1] ; = original r0 + 16 * r1
14133
14134 call ang16_mode_6_30
14135
14136 add r2, 12
14137 lea r0, [r0 + 32]
14138
14139 call ang32_mode_6_30
14140 RET
14141
; intra_pred_ang32_30: same helper sequence as the mode 6 entry point, but with
; r6d = 1 and without the +128 adjustment of r2.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; r0 is only reloaded once (from r5 = original r0 + 32); for the 2nd and 4th call
; it is used as the preceding helper left it.
; NOTE(review): presumably the helpers advance r0 past the rows they wrote when
; r6d != 0 and leave r5 untouched -- not visible here, confirm.
14142 cglobal intra_pred_ang32_30, 3,7,14
14143 xor r6d, r6d
14144 inc r6d ; r6d = 1
14145 lea r3, [ang_table_avx2 + 15 * 32] ; r3 -> row 15 of the weight table (32 bytes per row)
14146 add r1d, r1d ; r1 *= 2
14147 lea r4, [r1 * 3]
14148 lea r5, [r0 + 32] ; start address for the third call
14149
14150 call ang16_mode_6_30
14151
14152 add r2, 12
14153
14154 call ang32_mode_6_30
14155
14156 add r2, 20 ; 12 + 20: reference is now 32 bytes past its position at the first call
14157 mov r0, r5
14158
14159 call ang16_mode_6_30
14160
14161 add r2, 12
14162
14163 call ang32_mode_6_30
14164 RET
14165
14166 ;; angle 32, modes 7 and 29
; ang32_mode_7_29: internal helper reached by `call` from intra_pred_ang32_7 and
; intra_pred_ang32_29.
;   r2  -> 16-bit reference samples; loads cover bytes [r2 + 2, r2 + 52)
;   r3  -> ang_table_avx2 + 17 * 32 (set by both callers); the bracketed number
;          after each first pmaddwd is the resulting table row, i.e. the weight f
;   r6d -> tested once below; no later instruction in this routine writes EFLAGS,
;          so the result is still live at both TRANSPOSE_STORE_AVX2 sites
; Produces 16 vectors of 16 words, each lane being
;   ((32 - f) * ref[i] + f * ref[i + 1] + pd_16) >> 5
; and hands them, eight at a time, to TRANSPOSE_STORE_AVX2.
; m3 (first interleaved window) is still read after the first macro invocation,
; which is given m12/m13 as its two extra registers.
; NOTE(review): TRANSPOSE_STORE_AVX2 and pd_16 are defined outside this chunk;
; presumably the macro branches on ZF and pd_16 is a packed dword 16 -- confirm.
14167 cglobal ang32_mode_7_29
14168 test r6d, r6d
14169
14170 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
14171 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
14172
14173 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
14174 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
14175
14176 movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
14177 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
14178 punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
14179 punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
14180
14181 pmaddwd m4, m3, [r3 + 8 * 32] ; [25]
14182 paddd m4, [pd_16]
14183 psrld m4, 5
14184 pmaddwd m5, m0, [r3 + 8 * 32]
14185 paddd m5, [pd_16]
14186 psrld m5, 5
14187 packusdw m4, m5
14188
14189 palignr m8, m0, m3, 4 ; window advanced by one sample; reused for [2], [11], [20], [29]
14190 pmaddwd m5, m8, [r3 - 15 * 32] ; [2]
14191 paddd m5, [pd_16]
14192 psrld m5, 5
14193 palignr m9, m2, m0, 4
14194 pmaddwd m10, m9, [r3 - 15 * 32]
14195 paddd m10, [pd_16]
14196 psrld m10, 5
14197 packusdw m5, m10
14198
14199 pmaddwd m6, m8, [r3 - 6 * 32] ; [11]
14200 paddd m6, [pd_16]
14201 psrld m6, 5
14202 pmaddwd m7, m9, [r3 - 6 * 32]
14203 paddd m7, [pd_16]
14204 psrld m7, 5
14205 packusdw m6, m7
14206
14207 pmaddwd m7, m8, [r3 + 3 * 32] ; [20]
14208 paddd m7, [pd_16]
14209 psrld m7, 5
14210 pmaddwd m10, m9, [r3 + 3 * 32]
14211 paddd m10, [pd_16]
14212 psrld m10, 5
14213 packusdw m7, m10
14214
14215 pmaddwd m8, [r3 + 12 * 32] ; [29]
14216 paddd m8, [pd_16]
14217 psrld m8, 5
14218 pmaddwd m9, [r3 + 12 * 32]
14219 paddd m9, [pd_16]
14220 psrld m9, 5
14221 packusdw m8, m9
14222
14223 palignr m11, m0, m3, 8 ; window advanced by two samples; reused for [6], [15], [24]
14224 pmaddwd m9, m11, [r3 - 11 * 32] ; [6]
14225 paddd m9, [pd_16]
14226 psrld m9, 5
14227 palignr m12, m2, m0, 8
14228 pmaddwd m10, m12, [r3 - 11 * 32]
14229 paddd m10, [pd_16]
14230 psrld m10, 5
14231 packusdw m9, m10
14232
14233 pmaddwd m10, m11, [r3 - 2 * 32] ; [15]
14234 paddd m10, [pd_16]
14235 psrld m10, 5
14236 pmaddwd m13, m12, [r3 - 2 * 32]
14237 paddd m13, [pd_16]
14238 psrld m13, 5
14239 packusdw m10, m13
14240
14241 pmaddwd m11, [r3 + 7 * 32] ; [24]
14242 paddd m11, [pd_16]
14243 psrld m11, 5
14244 pmaddwd m12, [r3 + 7 * 32]
14245 paddd m12, [pd_16]
14246 psrld m12, 5
14247 packusdw m11, m12
14248
14249 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
14250
14251 palignr m5, m0, m3, 12 ; window advanced by three samples; used for [1] and [10]
14252 pmaddwd m4, m5, [r3 - 16 * 32] ; [1]
14253 paddd m4, [pd_16]
14254 psrld m4, 5
14255 palignr m6, m2, m0, 12
14256 pmaddwd m7, m6, [r3 - 16 * 32]
14257 paddd m7, [pd_16]
14258 psrld m7, 5
14259 packusdw m4, m7
14260
14261 pmaddwd m5, [r3 - 7 * 32] ; [10]
14262 paddd m5, [pd_16]
14263 psrld m5, 5
14264 pmaddwd m6, [r3 - 7 * 32]
14265 paddd m6, [pd_16]
14266 psrld m6, 5
14267 packusdw m5, m6
14268
14269 palignr m9, m0, m3, 12 ; same window rebuilt (m5/m6 were overwritten); used for [19] and [28]
14270 pmaddwd m6, m9, [r3 + 2 * 32] ; [19]
14271 paddd m6, [pd_16]
14272 psrld m6, 5
14273 palignr m3, m2, m0, 12 ; last read of the original m3; m3 now holds the upper window
14274 pmaddwd m7, m3, [r3 + 2 * 32]
14275 paddd m7, [pd_16]
14276 psrld m7, 5
14277 packusdw m6, m7
14278
14279 pmaddwd m7, m9, [r3 + 11 * 32] ; [28]
14280 paddd m7, [pd_16]
14281 psrld m7, 5
14282 pmaddwd m8, m3, [r3 + 11 * 32]
14283 paddd m8, [pd_16]
14284 psrld m8, 5
14285 packusdw m7, m8
14286
14287 pmaddwd m8, m0, [r3 - 12 * 32] ; [5]
14288 paddd m8, [pd_16]
14289 psrld m8, 5
14290 pmaddwd m10, m2, [r3 - 12 * 32]
14291 paddd m10, [pd_16]
14292 psrld m10, 5
14293 packusdw m8, m10
14294
14295 pmaddwd m9, m0, [r3 - 3 * 32] ; [14]
14296 paddd m9, [pd_16]
14297 psrld m9, 5
14298 pmaddwd m3, m2, [r3 - 3 * 32]
14299 paddd m3, [pd_16]
14300 psrld m3, 5
14301 packusdw m9, m3
14302
14303 pmaddwd m10, m0, [r3 + 6 * 32] ; [23]
14304 paddd m10, [pd_16]
14305 psrld m10, 5
14306 pmaddwd m12, m2, [r3 + 6 * 32]
14307 paddd m12, [pd_16]
14308 psrld m12, 5
14309 packusdw m10, m12
14310
14311 palignr m11, m2, m0, 4
14312 pmaddwd m11, [r3 - 17 * 32] ; [0]
14313 paddd m11, [pd_16]
14314 psrld m11, 5
14315 palignr m12, m1, m2, 4
14316 pmaddwd m12, [r3 - 17 * 32]
14317 paddd m12, [pd_16]
14318 psrld m12, 5
14319 packusdw m11, m12
14320 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 3, 2, 16
14321 ret
14322
; intra_pred_ang32_7: 32x32 result from two 16-wide strips per half: ang16_mode_7_29
; (defined outside this chunk) for the first strip, ang32_mode_7_29 for the strip
; 32 bytes to the right.  Entered with r6d = 0.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer (+128 here)
; NOTE(review): relies on both helpers preserving r1, r2, r3, r4, r6 and r7 --
; ang16_mode_7_29 and TRANSPOSE_STORE_AVX2 are not visible here, confirm.
14323 cglobal intra_pred_ang32_7, 3,8,14
14324 add r2, 128
14325 xor r6d, r6d ; r6d = 0
14326 lea r3, [ang_table_avx2 + 17 * 32] ; r3 -> row 17 of the weight table (32 bytes per row)
14327 add r1d, r1d ; r1 *= 2
14328 lea r4, [r1 * 3]
14329 lea r7, [r0 + 8 * r1] ; remembered so the lower half can be located later
14330
14331 call ang16_mode_7_29
14332
14333 add r2, 8
14334 lea r0, [r0 + 32]
14335
14336 call ang32_mode_7_29
14337
14338 add r2, 24 ; 8 + 24: reference is now 32 bytes past its position at the first call
14339 lea r0, [r7 + 8 * r1] ; = original r0 + 16 * r1
14340
14341 call ang16_mode_7_29
14342
14343 add r2, 8
14344 lea r0, [r0 + 32]
14345
14346 call ang32_mode_7_29
14347 RET
14348
; intra_pred_ang32_29: same helper sequence as the mode 7 entry point, but with
; r6d = 1 and without the +128 adjustment of r2.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; r0 is only reloaded once (from r5 = original r0 + 32); for the 2nd and 4th call
; it is used as the preceding helper left it.
; NOTE(review): presumably the helpers advance r0 past the rows they wrote when
; r6d != 0 and leave r5 untouched -- not visible here, confirm.
14349 cglobal intra_pred_ang32_29, 3,7,14
14350 xor r6d, r6d
14351 inc r6d ; r6d = 1
14352 lea r3, [ang_table_avx2 + 17 * 32] ; r3 -> row 17 of the weight table (32 bytes per row)
14353 add r1d, r1d ; r1 *= 2
14354 lea r4, [r1 * 3]
14355 lea r5, [r0 + 32] ; start address for the third call
14356
14357 call ang16_mode_7_29
14358
14359 add r2, 8
14360
14361 call ang32_mode_7_29
14362
14363 add r2, 24 ; 8 + 24: reference is now 32 bytes past its position at the first call
14364 mov r0, r5
14365
14366 call ang16_mode_7_29
14367
14368 add r2, 8
14369
14370 call ang32_mode_7_29
14371 RET
14372
14373 ;; angle 32, modes 8 and 28
; ang32_mode_8_28: internal helper reached by `call` from intra_pred_ang32_8 and
; intra_pred_ang32_28.
;   r2  -> 16-bit reference samples; loads cover bytes [r2 + 2, r2 + 52)
;   r3  -> ang_table_avx2 + 15 * 32 (set by both callers); the bracketed number
;          after each first pmaddwd is the resulting table row, i.e. the weight f
;   r6d -> tested once below; no later instruction in this routine writes EFLAGS,
;          so the result is still live at both TRANSPOSE_STORE_AVX2 sites
; Produces 16 vectors of 16 words, each lane being
;   ((32 - f) * ref[i] + f * ref[i + 1] + pd_16) >> 5
; and hands them, eight at a time, to TRANSPOSE_STORE_AVX2.
; Only the low interleave of the second load pair is needed here, so m1 is free
; to serve as a temporary.
; NOTE(review): TRANSPOSE_STORE_AVX2 and pd_16 are defined outside this chunk;
; presumably the macro branches on ZF and pd_16 is a packed dword 16 -- confirm.
14374 cglobal ang32_mode_8_28
14375 test r6d, r6d
14376
14377 movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
14378 movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
14379
14380 punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
14381 punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
14382
14383 movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
14384 movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
14385 punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
14386
14387 pmaddwd m4, m3, [r3 + 6 * 32] ; [21]
14388 paddd m4, [pd_16]
14389 psrld m4, 5
14390 pmaddwd m5, m0, [r3 + 6 * 32]
14391 paddd m5, [pd_16]
14392 psrld m5, 5
14393 packusdw m4, m5
14394
14395 pmaddwd m5, m3, [r3 + 11 * 32] ; [26]
14396 paddd m5, [pd_16]
14397 psrld m5, 5
14398 pmaddwd m8, m0, [r3 + 11 * 32]
14399 paddd m8, [pd_16]
14400 psrld m8, 5
14401 packusdw m5, m8
14402
14403 pmaddwd m6, m3, [r3 + 16 * 32] ; [31]
14404 paddd m6, [pd_16]
14405 psrld m6, 5
14406 pmaddwd m9, m0, [r3 + 16 * 32]
14407 paddd m9, [pd_16]
14408 psrld m9, 5
14409 packusdw m6, m9
14410
14411 palignr m11, m0, m3, 4 ; window advanced by one sample; reused for [4] .. [24]
14412 pmaddwd m7, m11, [r3 - 11 * 32] ; [4]
14413 paddd m7, [pd_16]
14414 psrld m7, 5
14415 palignr m1, m2, m0, 4
14416 pmaddwd m8, m1, [r3 - 11 * 32]
14417 paddd m8, [pd_16]
14418 psrld m8, 5
14419 packusdw m7, m8
14420
14421 pmaddwd m8, m11, [r3 - 6 * 32] ; [9]
14422 paddd m8, [pd_16]
14423 psrld m8, 5
14424 pmaddwd m9, m1, [r3 - 6 * 32]
14425 paddd m9, [pd_16]
14426 psrld m9, 5
14427 packusdw m8, m9
14428
14429 pmaddwd m9, m11, [r3 - 1 * 32] ; [14]
14430 paddd m9, [pd_16]
14431 psrld m9, 5
14432 pmaddwd m10, m1, [r3 - 1 * 32]
14433 paddd m10, [pd_16]
14434 psrld m10, 5
14435 packusdw m9, m10
14436
14437 pmaddwd m10, m11, [r3 + 4 * 32] ; [19]
14438 paddd m10, [pd_16]
14439 psrld m10, 5
14440 pmaddwd m12, m1, [r3 + 4 * 32]
14441 paddd m12, [pd_16]
14442 psrld m12, 5
14443 packusdw m10, m12
14444
14445 pmaddwd m11, [r3 + 9 * 32] ; [24]
14446 paddd m11, [pd_16]
14447 psrld m11, 5
14448 pmaddwd m1, [r3 + 9 * 32]
14449 paddd m1, [pd_16]
14450 psrld m1, 5
14451 packusdw m11, m1
14452
14453 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
14454
14455 palignr m4, m0, m3, 4 ; one-sample window rebuilt for the last weight that uses it
14456 pmaddwd m4, [r3 + 14 * 32] ; [29]
14457 paddd m4, [pd_16]
14458 psrld m4, 5
14459 palignr m5, m2, m0, 4
14460 pmaddwd m5, [r3 + 14 * 32]
14461 paddd m5, [pd_16]
14462 psrld m5, 5
14463 packusdw m4, m5
14464
14465 palignr m1, m0, m3, 8 ; window advanced by two samples; reused for [2] .. [27]
14466 pmaddwd m5, m1, [r3 - 13 * 32] ; [2]
14467 paddd m5, [pd_16]
14468 psrld m5, 5
14469 palignr m10, m2, m0, 8
14470 pmaddwd m6, m10, [r3 - 13 * 32]
14471 paddd m6, [pd_16]
14472 psrld m6, 5
14473 packusdw m5, m6
14474
14475 pmaddwd m6, m1, [r3 - 8 * 32] ; [7]
14476 paddd m6, [pd_16]
14477 psrld m6, 5
14478 pmaddwd m8, m10, [r3 - 8 * 32]
14479 paddd m8, [pd_16]
14480 psrld m8, 5
14481 packusdw m6, m8
14482
14483 pmaddwd m7, m1, [r3 - 3 * 32] ; [12]
14484 paddd m7, [pd_16]
14485 psrld m7, 5
14486 pmaddwd m8, m10, [r3 - 3 * 32]
14487 paddd m8, [pd_16]
14488 psrld m8, 5
14489 packusdw m7, m8
14490
14491 pmaddwd m8, m1, [r3 + 2 * 32] ; [17]
14492 paddd m8, [pd_16]
14493 psrld m8, 5
14494 pmaddwd m9, m10, [r3 + 2 * 32]
14495 paddd m9, [pd_16]
14496 psrld m9, 5
14497 packusdw m8, m9
14498
14499 pmaddwd m9, m1, [r3 + 7 * 32] ; [22]
14500 paddd m9, [pd_16]
14501 psrld m9, 5
14502 pmaddwd m11, m10, [r3 + 7 * 32]
14503 paddd m11, [pd_16]
14504 psrld m11, 5
14505 packusdw m9, m11
14506
14507 pmaddwd m1, [r3 + 12 * 32] ; [27]
14508 paddd m1, [pd_16]
14509 psrld m1, 5
14510 pmaddwd m10, [r3 + 12 * 32]
14511 paddd m10, [pd_16]
14512 psrld m10, 5
14513 packusdw m1, m10
14514
14515 palignr m11, m0, m3, 12
14516 pmaddwd m11, [r3 - 15 * 32] ; [0]
14517 paddd m11, [pd_16]
14518 psrld m11, 5
14519 palignr m2, m0, 12
14520 pmaddwd m2, [r3 - 15 * 32]
14521 paddd m2, [pd_16]
14522 psrld m2, 5
14523 packusdw m11, m2
14524 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 11, 0, 2, 16
14525 ret
14526
; intra_pred_ang32_8: 32x32 result from two 16-wide strips per half: ang16_mode_8_28
; (defined outside this chunk) for the first strip, ang32_mode_8_28 for the strip
; 32 bytes to the right.  Entered with r6d = 0.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer (+128 here)
; NOTE(review): relies on both helpers preserving r1, r2, r3, r4, r6 and r7 --
; ang16_mode_8_28 and TRANSPOSE_STORE_AVX2 are not visible here, confirm.
14527 cglobal intra_pred_ang32_8, 3,8,13
14528 add r2, 128
14529 xor r6d, r6d ; r6d = 0
14530 lea r3, [ang_table_avx2 + 15 * 32] ; r3 -> row 15 of the weight table (32 bytes per row)
14531 add r1d, r1d ; r1 *= 2
14532 lea r4, [r1 * 3]
14533 lea r7, [r0 + 8 * r1] ; remembered so the lower half can be located later
14534
14535 call ang16_mode_8_28
14536
14537 add r2, 4
14538 lea r0, [r0 + 32]
14539
14540 call ang32_mode_8_28
14541
14542 add r2, 28 ; 4 + 28: reference is now 32 bytes past its position at the first call
14543 lea r0, [r7 + 8 * r1] ; = original r0 + 16 * r1
14544
14545 call ang16_mode_8_28
14546
14547 add r2, 4
14548 lea r0, [r0 + 32]
14549
14550 call ang32_mode_8_28
14551 RET
14552
; intra_pred_ang32_28: same helper sequence as the mode 8 entry point, but with
; r6d = 1 and without the +128 adjustment of r2.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; r0 is only reloaded once (from r5 = original r0 + 32); for the 2nd and 4th call
; it is used as the preceding helper left it.
; NOTE(review): presumably the helpers advance r0 past the rows they wrote when
; r6d != 0 and leave r5 untouched -- not visible here, confirm.
14553 cglobal intra_pred_ang32_28, 3,7,13
14554 xor r6d, r6d
14555 inc r6d ; r6d = 1
14556 lea r3, [ang_table_avx2 + 15 * 32] ; r3 -> row 15 of the weight table (32 bytes per row)
14557 add r1d, r1d ; r1 *= 2
14558 lea r4, [r1 * 3]
14559 lea r5, [r0 + 32] ; start address for the third call
14560
14561 call ang16_mode_8_28
14562
14563 add r2, 4
14564
14565 call ang32_mode_8_28
14566
14567 add r2, 28 ; 4 + 28: reference is now 32 bytes past its position at the first call
14568 mov r0, r5
14569
14570 call ang16_mode_8_28
14571
14572 add r2, 4
14573
14574 call ang32_mode_8_28
14575 RET
14576
; intra_pred_ang32_9: builds a 32x32 result out of four calls to ang16_mode_9_27
; (defined outside this chunk), entered with r6d = 0.  Unlike modes 4..8 there is
; no separate ang32 helper: the same routine is reused with r2 moved by 2 bytes.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer (+128 here)
; NOTE(review): relies on the helper preserving r1, r2, r3, r4, r6 and r7; the
; helper is not visible here -- confirm.
14577 cglobal intra_pred_ang32_9, 3,8,13
14578 add r2, 128
14579 xor r6d, r6d ; r6d = 0
14580 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> row 16 of the weight table (32 bytes per row)
14581 add r1d, r1d ; r1 *= 2
14582 lea r4, [r1 * 3]
14583 lea r7, [r0 + 8 * r1] ; remembered so the lower half can be located later
14584
14585 call ang16_mode_9_27
14586
14587 add r2, 2
14588 lea r0, [r0 + 32]
14589
14590 call ang16_mode_9_27
14591
14592 add r2, 30 ; 2 + 30: reference is now 32 bytes past its position at the first call
14593 lea r0, [r7 + 8 * r1] ; = original r0 + 16 * r1
14594
14595 call ang16_mode_9_27
14596
14597 add r2, 2
14598 lea r0, [r0 + 32]
14599
14600 call ang16_mode_9_27
14601 RET
14602
; intra_pred_ang32_27: same four-call structure as the mode 9 entry point, but with
; r6d = 1 and without the +128 adjustment of r2.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; r0 is only reloaded once (from r5 = original r0 + 32); between the 1st/2nd and
; 3rd/4th calls it is used as the helper left it.
; NOTE(review): presumably ang16_mode_9_27 advances r0 past the rows it wrote when
; r6d != 0 and leaves r5 untouched -- helper not visible here, confirm.
14603 cglobal intra_pred_ang32_27, 3,7,13
14604 xor r6d, r6d
14605 inc r6d ; r6d = 1
14606 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> row 16 of the weight table (32 bytes per row)
14607 add r1d, r1d ; r1 *= 2
14608 lea r4, [r1 * 3]
14609 lea r5, [r0 + 32] ; start address for the third call
14610
14611 call ang16_mode_9_27
14612
14613 add r2, 2
14614
14615 call ang16_mode_9_27
14616
14617 add r2, 30 ; 2 + 30: reference is now 32 bytes past its position at the first call
14618 mov r0, r5
14619
14620 call ang16_mode_9_27
14621
14622 add r2, 2
14623
14624 call ang16_mode_9_27
14625 RET
14626
; intra_pred_ang32_10: fills 32 rows of 64 bytes; row k is the single 16-bit value
; at [r2 + mmsize*4 + 2 + 2 * k] replicated across the whole row.
;   r0 = destination, r1 = stride (doubled below), r2 = reference buffer
; m0 and m1 alternate so consecutive broadcasts do not wait on the previous stores.
; The bracketed numbers in the existing comments are the 1-based index of the
; reference sample being broadcast.
14627 cglobal intra_pred_ang32_10, 3,4,2
14628 add r2, mmsize*4 ; mmsize is 32 under INIT_YMM, i.e. +128 bytes
14629 add r1d, r1d ; r1 *= 2
14630 lea r3, [r1 * 3]
14631
14632 vpbroadcastw m0, [r2 + 2] ; [1...]
14633 movu [r0], m0
14634 movu [r0 + 32], m0
14635 vpbroadcastw m1, [r2 + 2 + 2] ; [2...]
14636 movu [r0 + r1], m1
14637 movu [r0 + r1 + 32], m1
14638 vpbroadcastw m0, [r2 + 2 + 4] ; [3...]
14639 movu [r0 + r1 * 2], m0
14640 movu [r0 + r1 * 2 + 32], m0
14641 vpbroadcastw m1, [r2 + 2 + 6] ; [4...]
14642 movu [r0 + r3], m1
14643 movu [r0 + r3 + 32], m1
14644
14645 lea r0, [r0 + r1 * 4]
14646 vpbroadcastw m0, [r2 + 2 + 8] ; [5...]
14647 movu [r0], m0
14648 movu [r0 + 32], m0
14649 vpbroadcastw m1, [r2 + 2 + 10] ; [6...]
14650 movu [r0 + r1], m1
14651 movu [r0 + r1 + 32], m1
14652 vpbroadcastw m0, [r2 + 2 + 12] ; [7...]
14653 movu [r0 + r1 * 2], m0
14654 movu [r0 + r1 * 2 + 32], m0
14655 vpbroadcastw m1, [r2 + 2 + 14] ; [8...]
14656 movu [r0 + r3], m1
14657 movu [r0 + r3 + 32], m1
14658
14659 lea r0, [r0 + r1 *4]
14660 vpbroadcastw m0, [r2 + 2 + 16] ; [9...]
14661 movu [r0], m0
14662 movu [r0 + 32], m0
14663 vpbroadcastw m1, [r2 + 2 + 18] ; [10...]
14664 movu [r0 + r1], m1
14665 movu [r0 + r1 + 32], m1
14666 vpbroadcastw m0, [r2 + 2 + 20] ; [11...]
14667 movu [r0 + r1 * 2], m0
14668 movu [r0 + r1 * 2 + 32], m0
14669 vpbroadcastw m1, [r2 + 2 + 22] ; [12...]
14670 movu [r0 + r3], m1
14671 movu [r0 + r3 + 32], m1
14672
14673 lea r0, [r0 + r1 *4]
14674 vpbroadcastw m0, [r2 + 2 + 24] ; [13...]
14675 movu [r0], m0
14676 movu [r0 + 32], m0
14677 vpbroadcastw m1, [r2 + 2 + 26] ; [14...]
14678 movu [r0 + r1], m1
14679 movu [r0 + r1 + 32], m1
14680 vpbroadcastw m0, [r2 + 2 + 28] ; [15...]
14681 movu [r0 + r1 * 2], m0
14682 movu [r0 + r1 * 2 + 32], m0
14683 vpbroadcastw m1, [r2 + 2 + 30] ; [16...]
14684 movu [r0 + r3], m1
14685 movu [r0 + r3 + 32], m1
14686
14687 lea r0, [r0 + r1 *4]
14688 vpbroadcastw m0, [r2 + 2 + 32] ; [17...]
14689 movu [r0], m0
14690 movu [r0 + 32], m0
14691 vpbroadcastw m1, [r2 + 2 + 34] ; [18...]
14692 movu [r0 + r1], m1
14693 movu [r0 + r1 + 32], m1
14694 vpbroadcastw m0, [r2 + 2 + 36] ; [19...]
14695 movu [r0 + r1 * 2], m0
14696 movu [r0 + r1 * 2 + 32], m0
14697 vpbroadcastw m1, [r2 + 2 + 38] ; [20...]
14698 movu [r0 + r3], m1
14699 movu [r0 + r3 + 32], m1
14700
14701 lea r0, [r0 + r1 *4]
14702 vpbroadcastw m0, [r2 + 2 + 40] ; [21...]
14703 movu [r0], m0
14704 movu [r0 + 32], m0
14705 vpbroadcastw m1, [r2 + 2 + 42] ; [22...]
14706 movu [r0 + r1], m1
14707 movu [r0 + r1 + 32], m1
14708 vpbroadcastw m0, [r2 + 2 + 44] ; [23...]
14709 movu [r0 + r1 * 2], m0
14710 movu [r0 + r1 * 2 + 32], m0
14711 vpbroadcastw m1, [r2 + 2 + 46] ; [24...]
14712 movu [r0 + r3], m1
14713 movu [r0 + r3 + 32], m1
14714
14715 lea r0, [r0 + r1 *4]
14716 vpbroadcastw m0, [r2 + 2 + 48] ; [25...]
14717 movu [r0], m0
14718 movu [r0 + 32], m0
14719 vpbroadcastw m1, [r2 + 2 + 50] ; [26...]
14720 movu [r0 + r1], m1
14721 movu [r0 + r1 + 32], m1
14722 vpbroadcastw m0, [r2 + 2 + 52] ; [27...]
14723 movu [r0 + r1 * 2], m0
14724 movu [r0 + r1 * 2 + 32], m0
14725 vpbroadcastw m1, [r2 + 2 + 54] ; [28...]
14726 movu [r0 + r3], m1
14727 movu [r0 + r3 + 32], m1
14728
14729 lea r0, [r0 + r1 *4]
14730 vpbroadcastw m0, [r2 + 2 + 56] ; [29...]
14731 movu [r0], m0
14732 movu [r0 + 32], m0
14733 vpbroadcastw m1, [r2 + 2 + 58] ; [30...]
14734 movu [r0 + r1], m1
14735 movu [r0 + r1 + 32], m1
14736 vpbroadcastw m0, [r2 + 2 + 60] ; [31...]
14737 movu [r0 + r1 * 2], m0
14738 movu [r0 + r1 * 2 + 32], m0
14739 vpbroadcastw m1, [r2 + 2 + 62] ; [32...]
14740 movu [r0 + r3], m1
14741 movu [r0 + r3 + 32], m1
14742 RET
14743
;-----------------------------------------------------------------------------
; intra_pred_ang32_26
;   Reads two vector registers' worth of samples from [r2 + 2] / [r2 + 34]
;   and replicates that single row into 32 consecutive output rows at r0.
;   r1 is doubled on entry (presumably a stride in 16-bit samples being turned
;   into a byte stride -- confirm against the C prototype). After the loads,
;   r2 is recycled to hold 3 * stride.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_26, 3,3,2
    add         r1d, r1d                    ; stride *= 2
    movu        m0, [r2 + 2]                ; first half of the row
    movu        m1, [r2 + 34]               ; second half of the row
    lea         r2, [r1 * 3]                ; r2 = 3 * stride (pointer no longer needed)

    ; rows 0..3
    movu        [r0], m0
    movu        [r0 + 32], m1
    movu        [r0 + r1], m0
    movu        [r0 + r1 + 32], m1
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 2 + 32], m1
    movu        [r0 + r2], m0
    movu        [r0 + r2 + 32], m1

    ; rows 4..31: seven more groups of four rows, advancing r0 each time
%rep 7
    lea         r0, [r0 + r1 * 4]
    movu        [r0], m0
    movu        [r0 + 32], m1
    movu        [r0 + r1], m0
    movu        [r0 + r1 + 32], m1
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r1 * 2 + 32], m1
    movu        [r0 + r2], m0
    movu        [r0 + r2 + 32], m1
%endrep
    RET
14829
; intra_pred_ang32_11 -- 32x32 angular prediction, mode 11 (per the name),
; assembled from four calls to the shared helper ang16_mode_11_25 (defined
; outside this section).
;   r0 = output pointer, r1 = row stride (doubled below), r2 = reference buffer.
; Two words of the reference buffer (+126 and +128) are overwritten before the
; helper calls; their original values are parked on the stack and written back
; before returning, so the buffer is left as it was found.
; NOTE(review): the trailing restore offsets assume the helper leaves r1, r2
; and r7 unchanged -- confirm against ang16_mode_11_25.
14830 cglobal intra_pred_ang32_11, 3,8,12, 0-8
14831 movzx r5d, word [r2 + 128] ; [0] original word at +128 (saved below)
14832 movzx r6d, word [r2] ; word at +0, copied over +128
14833 mov [rsp], r5w ; stash original [r2 + 128]
14834 mov [r2 + 128], r6w
14835
14836 movzx r5d, word [r2 + 126] ; [16] original word at +126 (saved below)
14837 movzx r6d, word [r2 + 32] ; word index 16, copied over +126
14838 mov [rsp + 4], r5w ; stash original [r2 + 126]
14839 mov [r2 + 126], r6w
14840
14841 add r2, 128 ; r2 = original + 128
14842 xor r6d, r6d ; r6d = 0 (mode selector handed to the helper)
14843 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so [r3 + k * 32] is entry 16 + k
14844 add r1d, r1d ; stride *= 2
14845 lea r4, [r1 * 3] ; r4 = 3 * stride
14846 lea r7, [r0 + 8 * r1] ; remember output + 8 rows for the third call
14847
14848 call ang16_mode_11_25
14849
14850 sub r2, 2 ; r2 = original + 126
14851 lea r0, [r0 + 32] ; move 32 bytes to the right in the output
14852
14853 call ang16_mode_11_25
14854
14855 add r2, 34 ; r2 = original + 160
14856 lea r0, [r7 + 8 * r1] ; r7 + 8 rows
14857
14858 call ang16_mode_11_25
14859
14860 sub r2, 2 ; r2 = original + 158
14861 lea r0, [r0 + 32]
14862
14863 call ang16_mode_11_25
14864
; Restore the two patched words. With r2 = original + 158, [r2 - 30] is
; original + 128 and [r2 - 32] is original + 126. Only the low 16 bits of each
; 32-bit stack load are written back.
14865 mov r6d, [rsp]
14866 mov [r2 - 30], r6w
14867 mov r6d, [rsp + 4]
14868 mov [r2 - 32], r6w
14869 RET
14870
; intra_pred_ang32_25 -- 32x32 angular prediction, mode 25 (per the name),
; assembled from four calls to ang16_mode_11_25 (defined outside this section)
; with r6d = 1.
;   r0 = output pointer, r1 = row stride (doubled below), r2 = reference buffer.
; The word at [r2 - 2] is temporarily replaced by the word at [r2 + 160]; the
; original value is kept on the stack and written back before returning.
; NOTE(review): the restore offset assumes the helper leaves r2 unchanged, and
; r5 is expected to survive the first two calls -- confirm against the helper.
14871 cglobal intra_pred_ang32_25, 3,7,12, 0-4
14872 xor r6d, r6d
14873 inc r6d ; r6d = 1 (mode selector handed to the helper)
14874 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so [r3 + k * 32] is entry 16 + k
14875 add r1d, r1d ; stride *= 2
14876
14877 movzx r4d, word [r2 - 2] ; original word just below r2 (saved below)
14878 movzx r5d, word [r2 + 160] ; [16]
14879 mov [rsp], r4w ; stash original [r2 - 2]
14880 mov [r2 - 2], r5w
14881
14882 lea r4, [r1 * 3] ; r4 = 3 * stride
14883 lea r5, [r0 + 32] ; output position used for the last two calls
14884
14885 call ang16_mode_11_25
14886
14887 sub r2, 2 ; r2 = original - 2
14888
14889 call ang16_mode_11_25
14890
14891 add r2, 34 ; r2 = original + 32
14892 mov r0, r5 ; switch to output + 32 bytes
14893
14894 call ang16_mode_11_25
14895
14896 sub r2, 2 ; r2 = original + 30
14897
14898 call ang16_mode_11_25
14899
; With r2 = original + 30, [r2 - 32] is original - 2: put the saved word back.
14900 mov r5d, [rsp]
14901 mov [r2 - 32], r5w
14902 RET
14903
14904 ;; angle 32, modes 12 and 24, row 0 to 15
; Shared helper reached with `call` from intra_pred_ang32_12 / _24.
; Registers set up by the visible callers:
;   r0 = output pointer, r1 = row stride (already doubled), r4 = 3 * r1,
;   r2 = reference pointer, r3 = ang_table_avx2 + 16 * 32, r6d = 0 or 1.
; Each 7-instruction group below produces one predicted line:
;   pmaddwd with table entry [r3 + k * 32] (entry 16 + k, i.e. word pairs
;   (32 - f, f) with f given in the trailing [..] comment) on interleaved
;   neighbouring samples, add pd_16, shift right by 5, then pack both halves.
;   pd_16 is defined elsewhere (presumably packed dword 16 for rounding).
; The flags produced by `test r6d, r6d` are not consumed here; presumably the
; TRANSPOSE_STORE_AVX2 macro (defined elsewhere) branches on them, as the
; TRANSPOSE_STORE_AVX2_STACK macro in this file does. No instruction between
; the test and the macro invocations writes the flags.
; m1 is built here and is read, without being reloaded, by
; ang32_mode_12_24_16_31, which the callers invoke right after this helper.
14905 cglobal ang32_mode_12_24_0_15
14906 test r6d, r6d
14907
14908 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
14909 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
14910
14911 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
14912 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
14913
14914 pmaddwd m4, m3, [r3 + 11 * 32] ; [27]
14915 paddd m4, [pd_16]
14916 psrld m4, 5
14917 pmaddwd m5, m2, [r3 + 11 * 32]
14918 paddd m5, [pd_16]
14919 psrld m5, 5
14920 packusdw m4, m5
14921
14922 pmaddwd m5, m3, [r3 + 6 * 32] ; [22]
14923 paddd m5, [pd_16]
14924 psrld m5, 5
14925 pmaddwd m8, m2, [r3 + 6 * 32]
14926 paddd m8, [pd_16]
14927 psrld m8, 5
14928 packusdw m5, m8
14929
14930 pmaddwd m6, m3, [r3 + 1 * 32] ; [17]
14931 paddd m6, [pd_16]
14932 psrld m6, 5
14933 pmaddwd m9, m2, [r3 + 1 * 32]
14934 paddd m9, [pd_16]
14935 psrld m9, 5
14936 packusdw m6, m9
14937
14938 pmaddwd m7, m3, [r3 - 4 * 32] ; [12]
14939 paddd m7, [pd_16]
14940 psrld m7, 5
14941 pmaddwd m8, m2, [r3 - 4 * 32]
14942 paddd m8, [pd_16]
14943 psrld m8, 5
14944 packusdw m7, m8
14945
14946 pmaddwd m8, m3, [r3 - 9 * 32] ; [7]
14947 paddd m8, [pd_16]
14948 psrld m8, 5
14949 pmaddwd m9, m2, [r3 - 9 * 32]
14950 paddd m9, [pd_16]
14951 psrld m9, 5
14952 packusdw m8, m9
14953
14954 pmaddwd m9, m3, [r3 - 14 * 32] ; [2]
14955 paddd m9, [pd_16]
14956 psrld m9, 5
14957 pmaddwd m2, [r3 - 14 * 32]
14958 paddd m2, [pd_16]
14959 psrld m2, 5
14960 packusdw m9, m2
14961
; Build the sample-duplicated row (m3 / m0) and m1, whose low half comes from
; the 8 words just below r2 (patched by the callers) after a pshufb with
; pw_ang32_12_24 (table defined elsewhere). palignr then slides this data to
; obtain the shifted sample pairs used by the remaining lines.
14962 movu xm1, [r2 - 8]
14963 pshufb xm1, [pw_ang32_12_24]
14964 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
14965 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
14966 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 6 6 13 13 19 19 26 26]
14967
14968 palignr m2, m3, m1, 14 ; [11 10 10 9 9 8 8 7 3 2 2 1 1 0 0 6]
14969 palignr m13, m0, m3, 14 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3]
14970
14971 pmaddwd m10, m2, [r3 + 13 * 32] ; [29]
14972 paddd m10, [pd_16]
14973 psrld m10, 5
14974 pmaddwd m12, m13, [r3 + 13 * 32]
14975 paddd m12, [pd_16]
14976 psrld m12, 5
14977 packusdw m10, m12
14978
14979 pmaddwd m11, m2, [r3 + 8 * 32] ; [24]
14980 paddd m11, [pd_16]
14981 psrld m11, 5
14982 pmaddwd m13, [r3 + 8 * 32]
14983 paddd m13, [pd_16]
14984 psrld m13, 5
14985 packusdw m11, m13
14986
; First eight lines are in m4..m11; m12 / m13 are handed over as scratch.
14987 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
14988
; m13 was given to the macro as scratch, so rebuild it before reuse.
14989 palignr m13, m0, m3, 14
14990
14991 pmaddwd m4, m2, [r3 + 3 * 32] ; [19]
14992 paddd m4, [pd_16]
14993 psrld m4, 5
14994 pmaddwd m5, m13, [r3 + 3 * 32]
14995 paddd m5, [pd_16]
14996 psrld m5, 5
14997 packusdw m4, m5
14998
14999 pmaddwd m5, m2, [r3 - 2 * 32] ; [14]
15000 paddd m5, [pd_16]
15001 psrld m5, 5
15002 pmaddwd m6, m13, [r3 - 2 * 32]
15003 paddd m6, [pd_16]
15004 psrld m6, 5
15005 packusdw m5, m6
15006
15007 pmaddwd m6, m2, [r3 - 7 * 32] ; [9]
15008 paddd m6, [pd_16]
15009 psrld m6, 5
15010 pmaddwd m8, m13, [r3 - 7 * 32]
15011 paddd m8, [pd_16]
15012 psrld m8, 5
15013 packusdw m6, m8
15014
15015 pmaddwd m7, m2, [r3 - 12 * 32] ; [4]
15016 paddd m7, [pd_16]
15017 psrld m7, 5
15018 pmaddwd m8, m13, [r3 - 12 * 32]
15019 paddd m8, [pd_16]
15020 psrld m8, 5
15021 packusdw m7, m8
15022
; Slide the window further: m0 must be updated before m3 because it reads the
; old m3. m1 itself is left untouched.
15023 palignr m0, m3, 10
15024 palignr m3, m1, 10
15025
15026 pmaddwd m8, m3, [r3 + 15 * 32] ; [31]
15027 paddd m8, [pd_16]
15028 psrld m8, 5
15029 pmaddwd m9, m0, [r3 + 15 * 32]
15030 paddd m9, [pd_16]
15031 psrld m9, 5
15032 packusdw m8, m9
15033
15034 pmaddwd m9, m3, [r3 + 10 * 32] ; [26]
15035 paddd m9, [pd_16]
15036 psrld m9, 5
15037 pmaddwd m10, m0, [r3 + 10 * 32]
15038 paddd m10, [pd_16]
15039 psrld m10, 5
15040 packusdw m9, m10
15041
15042 pmaddwd m10, m3, [r3 + 5 * 32] ; [21]
15043 paddd m10, [pd_16]
15044 psrld m10, 5
15045 pmaddwd m2, m0, [r3 + 5 * 32]
15046 paddd m2, [pd_16]
15047 psrld m2, 5
15048 packusdw m10, m2
15049
15050 pmaddwd m3, [r3] ; [16]
15051 paddd m3, [pd_16]
15052 psrld m3, 5
15053 pmaddwd m0, [r3]
15054 paddd m0, [pd_16]
15055 psrld m0, 5
15056 packusdw m3, m0
; Second eight lines are in m4..m10 and m3; m0 / m2 are scratch; offset 16.
15057 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 2, 16
15058 ret
15059
;; angle 32, modes 12 and 24, row 16 to 31
; Shared helper reached with `call` from intra_pred_ang32_12 / _24, always
; directly after ang32_mode_12_24_0_15.
; Inputs (set up by the visible callers):
;   r0 = output pointer, r1 = row stride (already doubled), r4 = 3 * r1,
;   r2 = reference pointer, r3 = ang_table_avx2 + 16 * 32, r6d = 0 or 1,
;   m1 = extended reference row left behind by ang32_mode_12_24_0_15
;        (this routine reads m1 but never loads it).
; Each 7-instruction group produces one predicted line: pmaddwd with table
; entry [r3 + k * 32] (entry 16 + k; fraction shown in the [..] comment),
; add pd_16, shift right by 5, pack. pd_16 is defined elsewhere (presumably
; packed dword 16 for rounding).
; The flags from `test r6d, r6d` are not consumed here; presumably the
; TRANSPOSE_STORE_AVX2 macro (defined elsewhere) branches on them. Nothing
; between the test and the macro invocations writes the flags.
; Change from the previous revision: the load of [r2 + 2] and the two unpacks
; that interleaved it with m0 were removed. Their results (m4, m3, m2) were
; all overwritten before being read, so the generated output is unchanged.
cglobal ang32_mode_12_24_16_31
    test        r6d, r6d

    movu        m0, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]

    punpcklwd   m3, m0, m0                  ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
    punpckhwd   m0, m0                      ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]

    palignr     m2, m3, m1, 10
    palignr     m13, m0, m3, 10

    pmaddwd     m4, m2, [r3 - 5 * 32]       ; [11]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m5, m13, [r3 - 5 * 32]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m4, m5

    pmaddwd     m5, m2, [r3 - 10 * 32]      ; [6]
    paddd       m5, [pd_16]
    psrld       m5, 5
    pmaddwd     m8, m13, [r3 - 10 * 32]
    paddd       m8, [pd_16]
    psrld       m8, 5
    packusdw    m5, m8

    pmaddwd     m6, m2, [r3 - 15 * 32]      ; [1]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m9, m13, [r3 - 15 * 32]
    paddd       m9, [pd_16]
    psrld       m9, 5
    packusdw    m6, m9

    palignr     m2, m3, m1, 6
    palignr     m13, m0, m3, 6

    pmaddwd     m7, m2, [r3 + 12 * 32]      ; [28]
    paddd       m7, [pd_16]
    psrld       m7, 5
    pmaddwd     m8, m13, [r3 + 12 * 32]
    paddd       m8, [pd_16]
    psrld       m8, 5
    packusdw    m7, m8

    pmaddwd     m8, m2, [r3 + 7 * 32]       ; [23]
    paddd       m8, [pd_16]
    psrld       m8, 5
    pmaddwd     m9, m13, [r3 + 7 * 32]
    paddd       m9, [pd_16]
    psrld       m9, 5
    packusdw    m8, m9

    pmaddwd     m9, m2, [r3 + 2 * 32]       ; [18]
    paddd       m9, [pd_16]
    psrld       m9, 5
    pmaddwd     m10, m13, [r3 + 2 * 32]
    paddd       m10, [pd_16]
    psrld       m10, 5
    packusdw    m9, m10

    pmaddwd     m10, m2, [r3 - 3 * 32]      ; [13]
    paddd       m10, [pd_16]
    psrld       m10, 5
    pmaddwd     m12, m13, [r3 - 3 * 32]
    paddd       m12, [pd_16]
    psrld       m12, 5
    packusdw    m10, m12

    pmaddwd     m11, m2, [r3 - 8 * 32]      ; [8]
    paddd       m11, [pd_16]
    psrld       m11, 5
    pmaddwd     m13, [r3 - 8 * 32]
    paddd       m13, [pd_16]
    psrld       m13, 5
    packusdw    m11, m13

    ; first eight lines are in m4..m11; m12 / m13 are handed over as scratch
    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0

    ; m13 was scratch for the macro, so rebuild it (m2 is still valid)
    palignr     m13, m0, m3, 6

    pmaddwd     m4, m2, [r3 - 13 * 32]      ; [3]
    paddd       m4, [pd_16]
    psrld       m4, 5
    pmaddwd     m5, m13, [r3 - 13 * 32]
    paddd       m5, [pd_16]
    psrld       m5, 5
    packusdw    m4, m5

    palignr     m2, m3, m1, 2
    palignr     m13, m0, m3, 2

    pmaddwd     m5, m2, [r3 + 14 * 32]      ; [30]
    paddd       m5, [pd_16]
    psrld       m5, 5
    pmaddwd     m6, m13, [r3 + 14 * 32]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m2, [r3 + 9 * 32]       ; [25]
    paddd       m6, [pd_16]
    psrld       m6, 5
    pmaddwd     m8, m13, [r3 + 9 * 32]
    paddd       m8, [pd_16]
    psrld       m8, 5
    packusdw    m6, m8

    pmaddwd     m7, m2, [r3 + 4 * 32]       ; [20]
    paddd       m7, [pd_16]
    psrld       m7, 5
    pmaddwd     m8, m13, [r3 + 4 * 32]
    paddd       m8, [pd_16]
    psrld       m8, 5
    packusdw    m7, m8

    pmaddwd     m8, m2, [r3 - 1 * 32]       ; [15]
    paddd       m8, [pd_16]
    psrld       m8, 5
    pmaddwd     m9, m13, [r3 - 1 * 32]
    paddd       m9, [pd_16]
    psrld       m9, 5
    packusdw    m8, m9

    pmaddwd     m9, m2, [r3 - 6 * 32]       ; [10]
    paddd       m9, [pd_16]
    psrld       m9, 5
    pmaddwd     m10, m13, [r3 - 6 * 32]
    paddd       m10, [pd_16]
    psrld       m10, 5
    packusdw    m9, m10

    pmaddwd     m10, m2, [r3 - 11 * 32]     ; [5]
    paddd       m10, [pd_16]
    psrld       m10, 5
    pmaddwd     m12, m13, [r3 - 11 * 32]
    paddd       m12, [pd_16]
    psrld       m12, 5
    packusdw    m10, m12

    pmaddwd     m2, [r3 - 16 * 32]          ; [0]
    paddd       m2, [pd_16]
    psrld       m2, 5
    pmaddwd     m13, [r3 - 16 * 32]
    paddd       m13, [pd_16]
    psrld       m13, 5
    packusdw    m2, m13
    ; second eight lines are in m4..m10 and m2; m0 / m3 are scratch; offset 16
    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 3, 16
    ret
15215
; intra_pred_ang32_12 -- 32x32 angular prediction, mode 12 (per the name),
; built from two passes over ang32_mode_12_24_0_15 / ang32_mode_12_24_16_31.
;   r0 = output pointer, r1 = row stride (doubled below), r2 = reference buffer.
; 16 bytes of the reference buffer at +114 are overwritten with selected
; reference words before the helper calls; the original bytes are kept on the
; stack and written back before returning.
; NOTE(review): the final restore offset and the use of r7 assume the helpers
; (and the TRANSPOSE_STORE_AVX2 macro they use) leave r1, r2 and r7 unchanged.
15216 cglobal intra_pred_ang32_12, 3,8,14, 0-16
15217 movu xm0, [r2 + 114]
15218 mova [rsp], xm0 ; save the 16 bytes that are about to be patched
15219
15220 add r1d, r1d ; stride *= 2
15221 lea r4, [r1 * 3] ; r4 = 3 * stride
15222 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so [r3 + k * 32] is entry 16 + k
15223
; Gather reference words 0, 6, 13, 19 and 26 into lanes 7..3 of xm1.
; Lanes 0..2 are not written here, so the words stored at +114..+118 are
; whatever xm1 held on entry; the visible helpers load this area starting at
; [r2 - 8] after r2 += 128, i.e. from +120 upward.
15224 pinsrw xm1, [r2], 7 ; [0]
15225 pinsrw xm1, [r2 + 12], 6 ; [6]
15226 pinsrw xm1, [r2 + 26], 5 ; [13]
15227 pinsrw xm1, [r2 + 38], 4 ; [19]
15228 pinsrw xm1, [r2 + 52], 3 ; [26]
15229 movu [r2 + 114], xm1
15230
15231 xor r6d, r6d ; r6d = 0 (helpers do `test r6d, r6d`)
15232 add r2, 128 ; r2 = original + 128
15233 lea r7, [r0 + 8 * r1] ; remember output + 8 rows for the second pass
15234
15235 call ang32_mode_12_24_0_15
15236
15237 lea r0, [r0 + 32] ; move 32 bytes to the right in the output
15238
15239 call ang32_mode_12_24_16_31
15240
15241 add r2, 32 ; r2 = original + 160
15242 lea r0, [r7 + 8 * r1] ; r7 + 8 rows
15243
15244 call ang32_mode_12_24_0_15
15245
15246 lea r0, [r0 + 32]
15247
15248 call ang32_mode_12_24_16_31
15249
; With r2 = original + 160, [r2 - 46] is original + 114: undo the patch.
15250 mova xm0, [rsp]
15251 movu [r2 - 46], xm0
15252 RET
15253
; intra_pred_ang32_24 -- 32x32 angular prediction, mode 24 (per the name),
; built from two passes over ang32_mode_12_24_0_15 / ang32_mode_12_24_16_31
; with r6d = 1.
;   r0 = output pointer, r1 = row stride (doubled below), r2 = reference buffer.
; The 16 bytes immediately below r2 are overwritten with selected words from
; further up the buffer; the original bytes are kept on the stack and written
; back before returning.
; NOTE(review): the restore offset and the use of r5 assume the helpers (and
; the TRANSPOSE_STORE_AVX2 macro they use) leave r2 and r5 unchanged on the
; r6d != 0 path -- confirm.
15254 cglobal intra_pred_ang32_24, 3,7,14, 0-16
15255 movu xm0, [r2 - 16]
15256 mova [rsp], xm0 ; save the 16 bytes that are about to be patched
15257
15258 add r1d, r1d ; stride *= 2
15259 lea r4, [r1 * 3] ; r4 = 3 * stride
15260 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so [r3 + k * 32] is entry 16 + k
15261
; Gather four words (byte offsets 140, 154, 166, 180) into lanes 7..4 of xm1.
; Lanes 0..3 are not written here; the helpers load this area from [r2 - 8],
; which corresponds to lanes 4..7.
15262 pinsrw xm1, [r2 + 140], 7 ; [6]
15263 pinsrw xm1, [r2 + 154], 6 ; [13]
15264 pinsrw xm1, [r2 + 166], 5 ; [19]
15265 pinsrw xm1, [r2 + 180], 4 ; [26]
15266 movu [r2 - 16], xm1
15267
15268 xor r6d, r6d
15269 inc r6d ; r6d = 1 (helpers do `test r6d, r6d`)
15270 lea r5, [r0 + 32] ; output position used for the second pass
15271
15272 call ang32_mode_12_24_0_15
15273
15274 call ang32_mode_12_24_16_31
15275
15276 add r2, 32 ; r2 = original + 32
15277 mov r0, r5 ; switch to output + 32 bytes
15278
15279 call ang32_mode_12_24_0_15
15280
15281 call ang32_mode_12_24_16_31
15282
; With r2 = original + 32, [r2 - 48] is original - 16: undo the patch.
15283 mova xm0, [rsp]
15284 movu [r2 - 48], xm0
15285 RET
15286
15287 ;; angle 32, modes 13 and 23, row 0 to 15
; Shared helper reached with `call` from intra_pred_ang32_13 / _23.
; Registers set up by the visible callers:
;   r0 = output pointer, r1 = row stride (already doubled), r4 = 3 * r1,
;   r2 = reference pointer, r3 = ang_table_avx2 + 16 * 32, r6d = 0 or 1.
; Each 7-instruction group produces one predicted line: pmaddwd with table
; entry [r3 + k * 32] (entry 16 + k, i.e. word pairs (32 - f, f) with f given
; in the trailing [..] comment), add pd_16, shift right by 5, pack.
; pd_16 is defined elsewhere (presumably packed dword 16 for rounding).
; The flags from `test r6d, r6d` are not consumed here; presumably the
; TRANSPOSE_STORE_AVX2 macro (defined elsewhere) branches on them. Nothing
; between the test and the macro invocations writes the flags.
15288 cglobal ang32_mode_13_23_row_0_15
15289 test r6d, r6d
15290
15291 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
15292 movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
15293
15294 punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
15295 punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
15296
15297 pmaddwd m4, m3, [r3 + 7 * 32] ; [23]
15298 paddd m4, [pd_16]
15299 psrld m4, 5
15300 pmaddwd m5, m2, [r3 + 7 * 32]
15301 paddd m5, [pd_16]
15302 psrld m5, 5
15303 packusdw m4, m5
15304
15305 pmaddwd m5, m3, [r3 - 2 * 32] ; [14]
15306 paddd m5, [pd_16]
15307 psrld m5, 5
15308 pmaddwd m6, m2, [r3 - 2 * 32]
15309 paddd m6, [pd_16]
15310 psrld m6, 5
15311 packusdw m5, m6
15312
15313 pmaddwd m6, m3, [r3 - 11 * 32] ; [5]
15314 paddd m6, [pd_16]
15315 psrld m6, 5
15316 pmaddwd m2, [r3 - 11 * 32]
15317 paddd m2, [pd_16]
15318 psrld m2, 5
15319 packusdw m6, m2
15320
; Build the sample-duplicated row (m3 / m0) and m1, whose low half comes from
; the 8 words just below r2 (patched by the callers) after a pshufb with
; pw_ang32_12_24 (table defined elsewhere; the 12/24 table is reused here).
15321 movu xm1, [r2 - 8]
15322 pshufb xm1, [pw_ang32_12_24]
15323 punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
15324 punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4]
15325 vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14]
15326
15327 palignr m2, m3, m1, 14
15328 palignr m13, m0, m3, 14
15329
15330 pmaddwd m7, m2, [r3 + 12 * 32] ; [28]
15331 paddd m7, [pd_16]
15332 psrld m7, 5
15333 pmaddwd m8, m13, [r3 + 12 * 32]
15334 paddd m8, [pd_16]
15335 psrld m8, 5
15336 packusdw m7, m8
15337
15338 pmaddwd m8, m2, [r3 + 3 * 32] ; [19]
15339 paddd m8, [pd_16]
15340 psrld m8, 5
15341 pmaddwd m9, m13, [r3 + 3 * 32]
15342 paddd m9, [pd_16]
15343 psrld m9, 5
15344 packusdw m8, m9
15345
15346 pmaddwd m9, m2, [r3 - 6 * 32] ; [10]
15347 paddd m9, [pd_16]
15348 psrld m9, 5
15349 pmaddwd m10, m13, [r3 - 6 * 32]
15350 paddd m10, [pd_16]
15351 psrld m10, 5
15352 packusdw m9, m10
15353
15354 pmaddwd m10, m2, [r3 - 15 * 32] ; [1]
15355 paddd m10, [pd_16]
15356 psrld m10, 5
15357 pmaddwd m12, m13, [r3 - 15 * 32]
15358 paddd m12, [pd_16]
15359 psrld m12, 5
15360 packusdw m10, m12
15361
15362 palignr m2, m3, m1, 10
15363 palignr m13, m0, m3, 10
15364
15365 pmaddwd m11, m2, [r3 + 8 * 32] ; [24]
15366 paddd m11, [pd_16]
15367 psrld m11, 5
15368 pmaddwd m13, [r3 + 8 * 32]
15369 paddd m13, [pd_16]
15370 psrld m13, 5
15371 packusdw m11, m13
15372
; First eight lines are in m4..m11; m12 / m13 are handed over as scratch.
15373 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
15374
; m13 was given to the macro as scratch, so rebuild it before reuse.
15375 palignr m13, m0, m3, 10
15376
15377 pmaddwd m4, m2, [r3 - 1 * 32] ; [15]
15378 paddd m4, [pd_16]
15379 psrld m4, 5
15380 pmaddwd m5, m13, [r3 - 1 * 32]
15381 paddd m5, [pd_16]
15382 psrld m5, 5
15383 packusdw m4, m5
15384
15385 pmaddwd m5, m2, [r3 - 10 * 32] ; [6]
15386 paddd m5, [pd_16]
15387 psrld m5, 5
15388 pmaddwd m6, m13, [r3 - 10 * 32]
15389 paddd m6, [pd_16]
15390 psrld m6, 5
15391 packusdw m5, m6
15392
15393 palignr m2, m3, m1, 6
15394 palignr m13, m0, m3, 6
15395
15396 pmaddwd m6, m2, [r3 + 13 * 32] ; [29]
15397 paddd m6, [pd_16]
15398 psrld m6, 5
15399 pmaddwd m8, m13, [r3 + 13 * 32]
15400 paddd m8, [pd_16]
15401 psrld m8, 5
15402 packusdw m6, m8
15403
15404 pmaddwd m7, m2, [r3 + 4 * 32] ; [20]
15405 paddd m7, [pd_16]
15406 psrld m7, 5
15407 pmaddwd m8, m13, [r3 + 4 * 32]
15408 paddd m8, [pd_16]
15409 psrld m8, 5
15410 packusdw m7, m8
15411
15412 pmaddwd m8, m2, [r3 - 5 * 32] ; [11]
15413 paddd m8, [pd_16]
15414 psrld m8, 5
15415 pmaddwd m9, m13, [r3 - 5 * 32]
15416 paddd m9, [pd_16]
15417 psrld m9, 5
15418 packusdw m8, m9
15419
15420 pmaddwd m9, m2, [r3 - 14 * 32] ; [2]
15421 paddd m9, [pd_16]
15422 psrld m9, 5
15423 pmaddwd m13, [r3 - 14 * 32]
15424 paddd m13, [pd_16]
15425 psrld m13, 5
15426 packusdw m9, m13
15427
; m0 must be updated before m3 because it reads the old m3.
15428 palignr m0, m3, 2
15429 palignr m3, m1, 2
15430
; From here on m1 is reused as a result register.
15431 pmaddwd m1, m3, [r3 + 9 * 32] ; [25]
15432 paddd m1, [pd_16]
15433 psrld m1, 5
15434 pmaddwd m2, m0, [r3 + 9 * 32]
15435 paddd m2, [pd_16]
15436 psrld m2, 5
15437 packusdw m1, m2
15438
15439 pmaddwd m3, [r3] ; [16]
15440 paddd m3, [pd_16]
15441 psrld m3, 5
15442 pmaddwd m0, [r3]
15443 paddd m0, [pd_16]
15444 psrld m0, 5
15445 packusdw m3, m0
; Second eight lines are in m4..m9, m1 and m3; m0 / m2 are scratch; offset 16.
15446 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16
15447 ret
15448
15449 ;; angle 32, modes 13 and 23, row 16 to 31
; Shared helper reached with `call` from intra_pred_ang32_13 / _23. The
; visible callers lower r2 by 8 bytes before calling, so the load at [r2]
; already starts inside the patched reference words (see the lane comments).
; Registers set up by the visible callers:
;   r0 = output pointer, r1 = row stride (already doubled), r4 = 3 * r1,
;   r2 = reference pointer, r3 = ang_table_avx2 + 16 * 32, r6d = 0 or 1.
; Each 7-instruction group produces one predicted line: pmaddwd with table
; entry [r3 + k * 32] (entry 16 + k; fraction shown in the [..] comment),
; add pd_16, shift right by 5, pack. pd_16 is defined elsewhere (presumably
; packed dword 16 for rounding).
; The flags from `test r6d, r6d` are not consumed here; presumably the
; TRANSPOSE_STORE_AVX2 macro (defined elsewhere) branches on them. Nothing
; between the test and the macro invocations writes the flags.
15450 cglobal ang32_mode_13_23_row_16_31
15451 test r6d, r6d
15452
15453 movu m0, [r2] ; [11 10 9 8 7 6 5 4 3 2 1 0 4 7 11 14]
15454 movu m5, [r2 + 2] ; [12 11 10 9 8 7 6 5 4 3 2 1 0 4 7 11]
15455
15456 punpcklwd m4, m0, m5 ; [ 8 7 7 6 6 5 5 4 0 4 4 7 7 11 11 14]
15457 punpckhwd m2, m0, m5 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
15458
15459 pmaddwd m4, [r3 - 9 * 32] ; [7]
15460 paddd m4, [pd_16]
15461 psrld m4, 5
15462 pmaddwd m2, [r3 - 9 * 32]
15463 paddd m2, [pd_16]
15464 psrld m2, 5
15465 packusdw m4, m2
15466
; Build the sample-duplicated row (m3 / m0) and m1, whose low half comes from
; the 8 words just below r2 after a pshufb with pw_ang32_12_24 (table defined
; elsewhere; the 12/24 table is reused here).
15467 movu xm1, [r2 - 8]
15468 pshufb xm1, [pw_ang32_12_24] ; [18 18 21 21 25 25 28 28]
15469 punpcklwd m3, m0, m0 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14]
15470 punpckhwd m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0]
15471 vinserti128 m1, m1, xm0, 1 ; [ 3 3 2 2 1 1 0 0 18 18 21 21 25 25 28 28]
15472
15473 palignr m2, m3, m1, 14
15474 palignr m13, m0, m3, 14
15475
15476 pmaddwd m5, m2, [r3 + 14 * 32] ; [30]
15477 paddd m5, [pd_16]
15478 psrld m5, 5
15479 pmaddwd m6, m13, [r3 + 14 * 32]
15480 paddd m6, [pd_16]
15481 psrld m6, 5
15482 packusdw m5, m6
15483
15484 pmaddwd m6, m2, [r3 + 5 * 32] ; [21]
15485 paddd m6, [pd_16]
15486 psrld m6, 5
15487 pmaddwd m7, m13, [r3 + 5 * 32]
15488 paddd m7, [pd_16]
15489 psrld m7, 5
15490 packusdw m6, m7
15491
15492 pmaddwd m7, m2, [r3 - 4 * 32] ; [12]
15493 paddd m7, [pd_16]
15494 psrld m7, 5
15495 pmaddwd m8, m13, [r3 - 4 * 32]
15496 paddd m8, [pd_16]
15497 psrld m8, 5
15498 packusdw m7, m8
15499
15500 pmaddwd m8, m2, [r3 - 13 * 32] ; [3]
15501 paddd m8, [pd_16]
15502 psrld m8, 5
15503 pmaddwd m9, m13, [r3 - 13 * 32]
15504 paddd m9, [pd_16]
15505 psrld m9, 5
15506 packusdw m8, m9
15507
15508 palignr m2, m3, m1, 10
15509 palignr m13, m0, m3, 10
15510
15511 pmaddwd m9, m2, [r3 + 10 * 32] ; [26]
15512 paddd m9, [pd_16]
15513 psrld m9, 5
15514 pmaddwd m10, m13, [r3 + 10 * 32]
15515 paddd m10, [pd_16]
15516 psrld m10, 5
15517 packusdw m9, m10
15518
15519 pmaddwd m10, m2, [r3 + 1 * 32] ; [17]
15520 paddd m10, [pd_16]
15521 psrld m10, 5
15522 pmaddwd m12, m13, [r3 + 1 * 32]
15523 paddd m12, [pd_16]
15524 psrld m12, 5
15525 packusdw m10, m12
15526
15527 pmaddwd m11, m2, [r3 - 8 * 32] ; [8]
15528 paddd m11, [pd_16]
15529 psrld m11, 5
15530 pmaddwd m13, [r3 - 8 * 32]
15531 paddd m13, [pd_16]
15532 psrld m13, 5
15533 packusdw m11, m13
15534
; First eight lines are in m4..m11; m12 / m13 are handed over as scratch.
15535 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
15536
15537 palignr m2, m3, m1, 6
15538 palignr m13, m0, m3, 6
15539
15540 pmaddwd m4, m2, [r3 + 15 * 32] ; [31]
15541 paddd m4, [pd_16]
15542 psrld m4, 5
15543 pmaddwd m5, m13, [r3 + 15 * 32]
15544 paddd m5, [pd_16]
15545 psrld m5, 5
15546 packusdw m4, m5
15547
15548 pmaddwd m5, m2, [r3 + 6 * 32] ; [22]
15549 paddd m5, [pd_16]
15550 psrld m5, 5
15551 pmaddwd m6, m13, [r3 + 6 * 32]
15552 paddd m6, [pd_16]
15553 psrld m6, 5
15554 packusdw m5, m6
15555
15556 pmaddwd m6, m2, [r3 - 3 * 32] ; [13]
15557 paddd m6, [pd_16]
15558 psrld m6, 5
15559 pmaddwd m8, m13, [r3 - 3 * 32]
15560 paddd m8, [pd_16]
15561 psrld m8, 5
15562 packusdw m6, m8
15563
15564 pmaddwd m7, m2, [r3 - 12 * 32] ; [4]
15565 paddd m7, [pd_16]
15566 psrld m7, 5
15567 pmaddwd m8, m13, [r3 - 12 * 32]
15568 paddd m8, [pd_16]
15569 psrld m8, 5
15570 packusdw m7, m8
15571
; m0 must be updated before m3 because it reads the old m3.
15572 palignr m0, m3, 2
15573 palignr m3, m1, 2
15574
15575 pmaddwd m8, m3, [r3 + 11 * 32] ; [27]
15576 paddd m8, [pd_16]
15577 psrld m8, 5
15578 pmaddwd m9, m0, [r3 + 11 * 32]
15579 paddd m9, [pd_16]
15580 psrld m9, 5
15581 packusdw m8, m9
15582
15583 pmaddwd m9, m3, [r3 + 2 * 32] ; [18]
15584 paddd m9, [pd_16]
15585 psrld m9, 5
15586 pmaddwd m10, m0, [r3 + 2 * 32]
15587 paddd m10, [pd_16]
15588 psrld m10, 5
15589 packusdw m9, m10
15590
; From here on m1 is reused as a result register.
15591 pmaddwd m1, m3, [r3 - 7 * 32] ; [9]
15592 paddd m1, [pd_16]
15593 psrld m1, 5
15594 pmaddwd m2, m0, [r3 - 7 * 32]
15595 paddd m2, [pd_16]
15596 psrld m2, 5
15597 packusdw m1, m2
15598
15599 pmaddwd m3, [r3 - 16 * 32] ; [0]
15600 paddd m3, [pd_16]
15601 psrld m3, 5
15602 pmaddwd m0, [r3 - 16 * 32]
15603 paddd m0, [pd_16]
15604 psrld m0, 5
15605 packusdw m3, m0
; Second eight lines are in m4..m9, m1 and m3; m0 / m2 are scratch; offset 16.
15606 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16
15607 ret
15608
; intra_pred_ang32_13 -- 32x32 angular prediction, mode 13 (per the name),
; built from two passes over ang32_mode_13_23_row_0_15 / _row_16_31.
;   r0 = output pointer, r1 = row stride (doubled below), r2 = reference buffer.
; One full vector register starting at byte +112 of the reference buffer is
; saved on the stack; 16 bytes at +112 and the word at +128 are then
; overwritten with gathered reference words, and the saved register is
; written back before returning.
; NOTE(review): the final restore offset and the use of r7 assume the helpers
; (and the TRANSPOSE_STORE_AVX2 macro they use) leave r1, r2 and r7 unchanged.
15609 cglobal intra_pred_ang32_13, 3,8,14, 0-mmsize
15610 movu m0, [r2 + 112]
15611 mova [rsp], m0 ; save the region that is about to be patched
15612
15613 add r1d, r1d ; stride *= 2
15614 lea r4, [r1 * 3] ; r4 = 3 * stride
15615 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so [r3 + k * 32] is entry 16 + k
15616
; Gather the extra reference words: shuffle two 8-word windows with
; pw_ang32_13_23 (table defined elsewhere), insert the words at byte offsets
; 28 and 56 (word indices 14 and 28) into lane 4, then combine the two high
; halves.
15617 movu xm1, [r2 + 8]
15618 movu xm2, [r2 + 36]
15619 pshufb xm1, [pw_ang32_13_23]
15620 pshufb xm2, [pw_ang32_13_23]
15621 pinsrw xm1, [r2 + 28], 4
15622 pinsrw xm2, [r2 + 56], 4
15623 punpckhqdq xm2, xm1 ; NOTE(review): presumably words 4 7 11 14 and 18 21 25 28 (per the helpers' lane comments); exact lane order depends on pw_ang32_13_23
15624
15625 movzx r6d, word [r2]
15626 mov [r2 + 128], r6w ; copy word 0 over +128
15627 movu [r2 + 112], xm2 ; gathered words go directly below it
15628
15629 xor r6d, r6d ; r6d = 0 (helpers do `test r6d, r6d`)
15630 add r2, 128 ; r2 = original + 128
15631 lea r7, [r0 + 8 * r1] ; remember output + 8 rows for the second pass
15632
15633 call ang32_mode_13_23_row_0_15
15634
15635 sub r2, 8 ; r2 = original + 120
15636 lea r0, [r0 + 32] ; move 32 bytes to the right in the output
15637
15638 call ang32_mode_13_23_row_16_31
15639
15640 add r2, 40 ; r2 = original + 160
15641 lea r0, [r7 + 8 * r1] ; r7 + 8 rows
15642
15643 call ang32_mode_13_23_row_0_15
15644
15645 sub r2, 8 ; r2 = original + 152
15646 lea r0, [r0 + 32]
15647
15648 call ang32_mode_13_23_row_16_31
15649
; With r2 = original + 152, [r2 - 40] is original + 112: undo the patch.
15650 mova m0, [rsp]
15651 movu [r2 - 40], m0
15652 RET
15653
; intra_pred_ang32_23 -- 32x32 angular prediction, mode 23 (per the name),
; built from two passes over ang32_mode_13_23_row_0_15 / _row_16_31 with
; r6d = 1.
;   r0 = output pointer, r1 = row stride (doubled below), r2 = reference buffer.
; The 16 bytes immediately below r2 are overwritten with gathered reference
; words; the original bytes are kept on the stack and written back before
; returning.
; NOTE(review): the restore offset and the use of r5 assume the helpers (and
; the TRANSPOSE_STORE_AVX2 macro they use) leave r2 and r5 unchanged on the
; r6d != 0 path -- confirm.
15654 cglobal intra_pred_ang32_23, 3,7,14, 0-16
15655 movu xm0, [r2 - 16]
15656 mova [rsp], xm0 ; save the 16 bytes that are about to be patched
15657
15658 add r1d, r1d ; stride *= 2
15659 lea r4, [r1 * 3] ; r4 = 3 * stride
15660 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so [r3 + k * 32] is entry 16 + k
15661
; Same gather as intra_pred_ang32_13, but every source offset is 128 bytes
; higher (136/164 instead of 8/36, 156/184 instead of 28/56).
15662 movu xm1, [r2 + 136]
15663 movu xm2, [r2 + 164]
15664 pshufb xm1, [pw_ang32_13_23]
15665 pshufb xm2, [pw_ang32_13_23]
15666 pinsrw xm1, [r2 + 156], 4
15667 pinsrw xm2, [r2 + 184], 4
15668 punpckhqdq xm2, xm1 ; NOTE(review): presumably words 4 7 11 14 and 18 21 25 28 (per the helpers' lane comments); exact lane order depends on pw_ang32_13_23
15669
15670 movu [r2 - 16], xm2
15671
15672 xor r6d, r6d
15673 inc r6d ; r6d = 1 (helpers do `test r6d, r6d`)
15674 lea r5, [r0 + 32] ; output position used for the second pass
15675
15676 call ang32_mode_13_23_row_0_15
15677
15678 sub r2, 8 ; r2 = original - 8
15679
15680 call ang32_mode_13_23_row_16_31
15681
15682 add r2, 40 ; r2 = original + 32
15683 mov r0, r5 ; switch to output + 32 bytes
15684
15685 call ang32_mode_13_23_row_0_15
15686
15687 sub r2, 8 ; r2 = original + 24
15688
15689 call ang32_mode_13_23_row_16_31
15690
; With r2 = original + 24, [r2 - 40] is original - 16: undo the patch.
15691 mova xm0, [rsp]
15692 movu [r2 - 40], xm0
15693 RET
15694
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_AVX2_STACK  a, b, c, d, e, f, g, h, tmp1, tmp2, offset
;   %1..%8  : numbers of the eight vector registers holding the computed lines
;   %9, %10 : numbers of two scratch vector registers (clobbered)
;   %11     : byte offset added to every store on the transposed path; it is
;             also used as the suffix of the local labels, so each enclosing
;             function may expand the macro only once per offset value
;
; The macro starts with `jnz`, so the caller must have the flags set (the
; visible helpers use `test r6d, r6d`) and must not modify them in between.
;   ZF = 1 : 8x8 word transpose inside each 128-bit lane, then sixteen
;            16-byte stores at [row + %11]: the low lanes go to rows 0..7 and
;            the high lanes to rows 8..15 relative to r0. Clobbers r5 and
;            registers %1..%10.
;   ZF = 0 : the eight registers are stored untransposed as eight full rows,
;            starting at r0 + 8 * r1 when %11 == 16 and at r0 otherwise.
;            Clobbers r7.
; r1 is the row stride and r4 must hold 3 * r1. Neither path writes the
; flags (only SIMD ops, mov-type stores, lea and jmp are used), so a second
; expansion in the same function can branch on the same `test`.
;
; Change from the previous revision: the second `%if %11 == 16 / %else` on
; the ZF = 0 path had two identical branches (`lea r7, [r7 + r1 * 4]`) and has
; been collapsed into a single instruction; the expansion is unchanged.
;-----------------------------------------------------------------------------
%macro TRANSPOSE_STORE_AVX2_STACK 11
    jnz         .skip%11

    ; --- 16-bit interleave of line pairs, first four lines ---
    punpckhwd   m%9, m%1, m%2
    punpcklwd   m%1, m%2
    punpckhwd   m%2, m%3, m%4
    punpcklwd   m%3, m%4

    punpckldq   m%4, m%1, m%3
    punpckhdq   m%1, m%3
    punpckldq   m%3, m%9, m%2
    punpckhdq   m%9, m%2

    ; --- same for the last four lines ---
    punpckhwd   m%10, m%5, m%6
    punpcklwd   m%5, m%6
    punpckhwd   m%6, m%7, m%8
    punpcklwd   m%7, m%8

    punpckldq   m%8, m%5, m%7
    punpckhdq   m%5, m%7
    punpckldq   m%7, m%10, m%6
    punpckhdq   m%10, m%6

    ; --- 64-bit merge: each result register now holds one output row per lane ---
    punpcklqdq  m%6, m%4, m%8
    punpckhqdq  m%2, m%4, m%8
    punpcklqdq  m%4, m%1, m%5
    punpckhqdq  m%8, m%1, m%5

    punpcklqdq  m%1, m%3, m%7
    punpckhqdq  m%5, m%3, m%7
    punpcklqdq  m%3, m%9, m%10
    punpckhqdq  m%7, m%9, m%10

    ; rows 0..3 (low lanes)
    movu        [r0 + r1 * 0 + %11], xm%6
    movu        [r0 + r1 * 1 + %11], xm%2
    movu        [r0 + r1 * 2 + %11], xm%4
    movu        [r0 + r4 * 1 + %11], xm%8

    ; rows 4..7 (low lanes)
    lea         r5, [r0 + r1 * 4]
    movu        [r5 + r1 * 0 + %11], xm%1
    movu        [r5 + r1 * 1 + %11], xm%5
    movu        [r5 + r1 * 2 + %11], xm%3
    movu        [r5 + r4 * 1 + %11], xm%7

    ; rows 8..11 (high lanes)
    lea         r5, [r5 + r1 * 4]
    vextracti128 [r5 + r1 * 0 + %11], m%6, 1
    vextracti128 [r5 + r1 * 1 + %11], m%2, 1
    vextracti128 [r5 + r1 * 2 + %11], m%4, 1
    vextracti128 [r5 + r4 * 1 + %11], m%8, 1

    ; rows 12..15 (high lanes)
    lea         r5, [r5 + r1 * 4]
    vextracti128 [r5 + r1 * 0 + %11], m%1, 1
    vextracti128 [r5 + r1 * 1 + %11], m%5, 1
    vextracti128 [r5 + r1 * 2 + %11], m%3, 1
    vextracti128 [r5 + r4 * 1 + %11], m%7, 1
    jmp         .end%11

.skip%11:
    ; untransposed path: eight full-width rows
%if %11 == 16
    lea         r7, [r0 + 8 * r1]
%else
    lea         r7, [r0]
%endif
    movu        [r7 + r1 * 0], m%1
    movu        [r7 + r1 * 1], m%2
    movu        [r7 + r1 * 2], m%3
    movu        [r7 + r4 * 1], m%4

    lea         r7, [r7 + r1 * 4]
    movu        [r7 + r1 * 0], m%5
    movu        [r7 + r1 * 1], m%6
    movu        [r7 + r1 * 2], m%7
    movu        [r7 + r4 * 1], m%8
.end%11:
%endmacro
15772
15773 ;; angle 32, modes 14 and 22, row 0 to 15
; Shared helper; its callers are outside this section.
; Registers expected on entry (as used below and by
; TRANSPOSE_STORE_AVX2_STACK):
;   r0 = output pointer, r1 = row stride, r4 = 3 * r1, r2 = reference pointer,
;   r3 = base into ang_table_avx2 (presumably ang_table_avx2 + 16 * 32, as in
;        the other helpers, so that [r3 + k * 32] is entry 16 + k),
;   r6d = path selector; `test r6d, r6d` sets the flags that both macro
;        expansions branch on (no instruction in between writes the flags).
; Reads 32 bytes at each of [r2 - 12], [r2 - 10], [r2 + 4] and [r2 + 6].
; Each 7-instruction group produces one predicted line: pmaddwd with a table
; entry (fraction shown in the [..] comment), add pd_16, shift right by 5,
; pack. pd_16 is defined elsewhere (presumably packed dword 16 for rounding).
; The lines are handed to the macro in reverse register order (m11 first),
; with offset 16 for the first batch and offset 0 for the second.
; Clobbers r5 or r7 (via the macro) and m0..m13.
15774 cglobal ang32_mode_14_22_rows_0_15
15775 test r6d, r6d
15776
15777 movu m0, [r2 - 12]
15778 movu m1, [r2 - 10]
15779
; m3 / m0 = interleaved neighbouring sample pairs from the first window
15780 punpcklwd m3, m0, m1
15781 punpckhwd m0, m1
15782
; m2 / m1 = the same for the window 16 bytes further on
15783 movu m1, [r2 + 4]
15784 movu m4, [r2 + 6]
15785 punpcklwd m2, m1, m4
15786 punpckhwd m1, m4
15787
15788 pmaddwd m4, m3, [r3] ; [16]
15789 paddd m4, [pd_16]
15790 psrld m4, 5
15791 pmaddwd m5, m0, [r3]
15792 paddd m5, [pd_16]
15793 psrld m5, 5
15794 packusdw m4, m5
15795
15796 pmaddwd m5, m3, [r3 + 13 * 32] ; [29]
15797 paddd m5, [pd_16]
15798 psrld m5, 5
15799 pmaddwd m8, m0, [r3 + 13 * 32]
15800 paddd m8, [pd_16]
15801 psrld m8, 5
15802 packusdw m5, m8
15803
; Shift the pair window by one pair (4 bytes) for the next two lines.
15804 palignr m7, m0, m3, 4
15805 pmaddwd m6, m7, [r3 - 6 * 32] ; [10]
15806 paddd m6, [pd_16]
15807 psrld m6, 5
15808 palignr m8, m2, m0, 4
15809 pmaddwd m9, m8, [r3 - 6 * 32]
15810 paddd m9, [pd_16]
15811 psrld m9, 5
15812 packusdw m6, m9
15813
15814 pmaddwd m7, [r3 + 7 * 32] ; [23]
15815 paddd m7, [pd_16]
15816 psrld m7, 5
15817 pmaddwd m8, [r3 + 7 * 32]
15818 paddd m8, [pd_16]
15819 psrld m8, 5
15820 packusdw m7, m8
15821
; Window shifted by two pairs (8 bytes) for the next three lines.
15822 palignr m10, m0, m3, 8
15823 pmaddwd m8, m10, [r3 - 12 * 32] ; [4]
15824 paddd m8, [pd_16]
15825 psrld m8, 5
15826 palignr m12, m2, m0, 8
15827 pmaddwd m9, m12, [r3 - 12 * 32]
15828 paddd m9, [pd_16]
15829 psrld m9, 5
15830 packusdw m8, m9
15831
15832 pmaddwd m9, m10, [r3 + 1 * 32] ; [17]
15833 paddd m9, [pd_16]
15834 psrld m9, 5
15835 pmaddwd m11, m12, [r3 + 1 * 32]
15836 paddd m11, [pd_16]
15837 psrld m11, 5
15838 packusdw m9, m11
15839
15840 pmaddwd m10, [r3 + 14 * 32] ; [30]
15841 paddd m10, [pd_16]
15842 psrld m10, 5
15843 pmaddwd m12, [r3 + 14 * 32]
15844 paddd m12, [pd_16]
15845 psrld m12, 5
15846 packusdw m10, m12
15847
; Window shifted by three pairs (12 bytes).
15848 palignr m11, m0, m3, 12
15849 pmaddwd m11, [r3 - 5 * 32] ; [11]
15850 paddd m11, [pd_16]
15851 psrld m11, 5
15852 palignr m12, m2, m0, 12
15853 pmaddwd m12, [r3 - 5 * 32]
15854 paddd m12, [pd_16]
15855 psrld m12, 5
15856 packusdw m11, m12
15857
; First eight lines, passed in reverse order (m11 .. m4); m12 / m13 scratch.
15858 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16
15859
; m0..m3 are not in the macro's register list and are still valid here.
15860 palignr m4, m0, m3, 12
15861 pmaddwd m4, [r3 + 8 * 32] ; [24]
15862 paddd m4, [pd_16]
15863 psrld m4, 5
15864 palignr m5, m2, m0, 12
15865 pmaddwd m5, [r3 + 8 * 32]
15866 paddd m5, [pd_16]
15867 psrld m5, 5
15868 packusdw m4, m5
15869
; m3 is no longer needed as input from here on and is reused as scratch.
15870 pmaddwd m5, m0, [r3 - 11 * 32] ; [5]
15871 paddd m5, [pd_16]
15872 psrld m5, 5
15873 pmaddwd m3, m2, [r3 - 11 * 32]
15874 paddd m3, [pd_16]
15875 psrld m3, 5
15876 packusdw m5, m3
15877
15878 pmaddwd m6, m0, [r3 + 2 * 32] ; [18]
15879 paddd m6, [pd_16]
15880 psrld m6, 5
15881 pmaddwd m7, m2, [r3 + 2 * 32]
15882 paddd m7, [pd_16]
15883 psrld m7, 5
15884 packusdw m6, m7
15885
15886 pmaddwd m7, m0, [r3 + 15 * 32] ; [31]
15887 paddd m7, [pd_16]
15888 psrld m7, 5
15889 pmaddwd m3, m2, [r3 + 15 * 32]
15890 paddd m3, [pd_16]
15891 psrld m3, 5
15892 packusdw m7, m3
15893
15894 palignr m9, m2, m0, 4
15895 palignr m10, m1, m2, 4
15896 pmaddwd m8, m9, [r3 - 4 * 32] ; [12]
15897 paddd m8, [pd_16]
15898 psrld m8, 5
15899 pmaddwd m11, m10, [r3 - 4 * 32]
15900 paddd m11, [pd_16]
15901 psrld m11, 5
15902 packusdw m8, m11
15903
15904 pmaddwd m9, [r3 + 9 * 32] ; [25]
15905 paddd m9, [pd_16]
15906 psrld m9, 5
15907 pmaddwd m10, [r3 + 9 * 32]
15908 paddd m10, [pd_16]
15909 psrld m10, 5
15910 packusdw m9, m10
15911
; m1 must be updated before m2 because it reads the old m2.
15912 palignr m1, m2, 8
15913 palignr m2, m0, 8
15914
15915 pmaddwd m10, m2, [r3 - 10 * 32] ; [6]
15916 paddd m10, [pd_16]
15917 psrld m10, 5
15918 pmaddwd m12, m1, [r3 - 10 * 32]
15919 paddd m12, [pd_16]
15920 psrld m12, 5
15921 packusdw m10, m12
15922
15923 pmaddwd m2, [r3 + 3 * 32] ; [19]
15924 paddd m2, [pd_16]
15925 psrld m2, 5
15926 pmaddwd m1, [r3 + 3 * 32]
15927 paddd m1, [pd_16]
15928 psrld m1, 5
15929 packusdw m2, m1
; Second eight lines, again in reverse order (m2, m10 .. m4); m0 / m1 scratch.
15930 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0
15931 ret
15932
15933 ;; angle 32, modes 14 and 22, rows 16 to 31
; Internal helper called by intra_pred_ang32_14 and intra_pred_ang32_22.
; It computes 16 interpolated vectors and hands them to
; TRANSPOSE_STORE_AVX2_STACK in two groups of eight.
;
; Register state on entry, as set up by the callers in this file:
;   r2  - pointer into the caller's stack copy of the reference samples;
;         this routine reads 32-byte vectors at r2-24, r2-22, r2-8 and r2-6
;   r3  - ang_table_avx2 + 16*32, so [r3 + k*32] is the table row holding
;         the word pairs (32-f, f) for f = 16 + k; the trailing "; [f]"
;         comments below give that f
;   r6d - 0 when called from mode 14, 1 when called from mode 22
;   r0, r1, r4 - set by the callers (r1 doubled, r4 = 3*r1); presumably the
;         destination/stride operands of TRANSPOSE_STORE_AVX2_STACK, which
;         is defined outside this chunk -- confirm there
;
; Every result follows the same recipe:
;   pmaddwd  pair, (32-f, f)   ; (32-f)*a + f*b per dword
;   paddd    [pd_16]           ; presumably the +16 rounding term
;   psrld    5                 ; divide by 32
;   packusdw lo, hi            ; merge both halves back to 16-bit words
15934 cglobal ang32_mode_14_22_rows_16_31
; Sets ZF from r6d. None of the SIMD instructions in the body write EFLAGS,
; so the result is still live at the store macros; presumably they branch on
; it (macro not visible here).
15935 test r6d, r6d
15936
; Interleave each sample with its right-hand neighbour (loads 2 bytes apart)
; so every dword holds one (a, b) pair ready for pmaddwd.
15937 movu m0, [r2 - 24]
15938 movu m1, [r2 - 22]
15939
15940 punpcklwd m3, m0, m1
15941 punpckhwd m0, m1
15942
15943 movu m1, [r2 - 8]
15944 movu m4, [r2 - 6]
15945 punpcklwd m2, m1, m4
15946 punpckhwd m1, m4
15947
; Three results share the unshifted pair window (m3 / m0).
15948 pmaddwd m4, m3, [r3 - 16 * 32] ; [0]
15949 paddd m4, [pd_16]
15950 psrld m4, 5
15951 pmaddwd m5, m0, [r3 - 16 * 32]
15952 paddd m5, [pd_16]
15953 psrld m5, 5
15954 packusdw m4, m5
15955
15956 pmaddwd m5, m3, [r3 - 3 * 32] ; [13]
15957 paddd m5, [pd_16]
15958 psrld m5, 5
15959 pmaddwd m8, m0, [r3 - 3 * 32]
15960 paddd m8, [pd_16]
15961 psrld m8, 5
15962 packusdw m5, m8
15963
15964 pmaddwd m6, m3, [r3 + 10 * 32] ; [26]
15965 paddd m6, [pd_16]
15966 psrld m6, 5
15967 pmaddwd m9, m0, [r3 + 10 * 32]
15968 paddd m9, [pd_16]
15969 psrld m9, 5
15970 packusdw m6, m9
15971
; palignr by 4 bytes moves the pair window on by one dword (one pair).
15972 palignr m8, m0, m3, 4
15973 palignr m9, m2, m0, 4
15974 pmaddwd m7, m8, [r3 - 9 * 32] ; [7]
15975 paddd m7, [pd_16]
15976 psrld m7, 5
15977 pmaddwd m10, m9, [r3 - 9 * 32]
15978 paddd m10, [pd_16]
15979 psrld m10, 5
15980 packusdw m7, m10
15981
15982 pmaddwd m8, [r3 + 4 * 32] ; [20]
15983 paddd m8, [pd_16]
15984 psrld m8, 5
15985 pmaddwd m9, [r3 + 4 * 32]
15986 paddd m9, [pd_16]
15987 psrld m9, 5
15988 packusdw m8, m9
15989
; Window shifted by 8 bytes (two pairs).
15990 palignr m11, m0, m3, 8
15991 palignr m12, m2, m0, 8
15992 pmaddwd m9, m11, [r3 - 15 * 32] ; [1]
15993 paddd m9, [pd_16]
15994 psrld m9, 5
15995 pmaddwd m10, m12, [r3 - 15 * 32]
15996 paddd m10, [pd_16]
15997 psrld m10, 5
15998 packusdw m9, m10
15999
16000 pmaddwd m10, m11, [r3 - 2 * 32] ; [14]
16001 paddd m10, [pd_16]
16002 psrld m10, 5
16003 pmaddwd m13, m12, [r3 - 2 * 32]
16004 paddd m13, [pd_16]
16005 psrld m13, 5
16006 packusdw m10, m13
16007
16008 pmaddwd m11, [r3 + 11 * 32] ; [27]
16009 paddd m11, [pd_16]
16010 psrld m11, 5
16011 pmaddwd m12, [r3 + 11 * 32]
16012 paddd m12, [pd_16]
16013 psrld m12, 5
16014 packusdw m11, m12
16015
; First eight results are in m4..m11. m12/m13 are free at this point and are
; passed in the next two slots (presumably scratch); the final argument (16
; here, 0 for the second group) is presumably a destination offset.
16016 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16
16017
; Second group: m4..m10 are reused for the next results. m0/m2/m1 still hold
; the source pairs; m3 is read one last time by the palignr just below and
; then serves as a temporary.
16018 palignr m5, m0, m3, 12
16019 palignr m6, m2, m0, 12
16020 pmaddwd m4, m5, [r3 - 8 * 32] ; [8]
16021 paddd m4, [pd_16]
16022 psrld m4, 5
16023 pmaddwd m7, m6, [r3 - 8 * 32]
16024 paddd m7, [pd_16]
16025 psrld m7, 5
16026 packusdw m4, m7
16027
16028 pmaddwd m5, [r3 + 5 * 32] ; [21]
16029 paddd m5, [pd_16]
16030 psrld m5, 5
16031 pmaddwd m6, [r3 + 5 * 32]
16032 paddd m6, [pd_16]
16033 psrld m6, 5
16034 packusdw m5, m6
16035
16036 pmaddwd m6, m0, [r3 - 14 * 32] ; [2]
16037 paddd m6, [pd_16]
16038 psrld m6, 5
16039 pmaddwd m7, m2, [r3 - 14 * 32]
16040 paddd m7, [pd_16]
16041 psrld m7, 5
16042 packusdw m6, m7
16043
16044 pmaddwd m7, m0, [r3 - 1 * 32] ; [15]
16045 paddd m7, [pd_16]
16046 psrld m7, 5
16047 pmaddwd m3, m2, [r3 - 1 * 32]
16048 paddd m3, [pd_16]
16049 psrld m3, 5
16050 packusdw m7, m3
16051
16052 pmaddwd m8, m0, [r3 + 12 * 32] ; [28]
16053 paddd m8, [pd_16]
16054 psrld m8, 5
16055 pmaddwd m11, m2, [r3 + 12 * 32]
16056 paddd m11, [pd_16]
16057 psrld m11, 5
16058 packusdw m8, m11
16059
16060 palignr m10, m2, m0, 4
16061 palignr m11, m1, m2, 4
16062
16063 pmaddwd m9, m10, [r3 - 7 * 32] ; [9]
16064 paddd m9, [pd_16]
16065 psrld m9, 5
16066 pmaddwd m3, m11, [r3 - 7 * 32]
16067 paddd m3, [pd_16]
16068 psrld m3, 5
16069 packusdw m9, m3
16070
16071 pmaddwd m10, [r3 + 6 * 32] ; [22]
16072 paddd m10, [pd_16]
16073 psrld m10, 5
16074 pmaddwd m11, [r3 + 6 * 32]
16075 paddd m11, [pd_16]
16076 psrld m11, 5
16077 packusdw m10, m11
16078
; Last window: shift in place. m1 must be updated before m2 because the
; first palignr still needs the unshifted m2.
16079 palignr m1, m2, 8
16080 palignr m2, m0, 8
16081
16082 pmaddwd m2, [r3 - 13 * 32] ; [3]
16083 paddd m2, [pd_16]
16084 psrld m2, 5
16085 pmaddwd m1, [r3 - 13 * 32]
16086 paddd m1, [pd_16]
16087 psrld m1, 5
16088 packusdw m2, m1
; Second eight results are in m4..m10 plus m2; m0/m1 are no longer needed.
16089 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0
16090 ret
16091
; 32x32 angular intra prediction, mode 14 (16-bit samples).
; cglobal operands (x86inc): 3 arguments, 8 GPRs, 14 vector registers.
; Presumably r0 = dst, r1 = dst stride in samples (doubled to bytes below),
; r2 = reference-sample array -- confirm against the C prototype.
;
; Builds a contiguous reference line on an aligned stack buffer and then
; runs the two shared mode-14/22 helpers twice each:
;   [rsp + 8  .. rsp + 31]  samples gathered from r2+4.. via pw_ang32_14_22
;   [rsp + 32 .. rsp + 99]  68 bytes copied from r2+128, with the first word
;                           replaced by word [r2]
;   [rsp + 4*mmsize]        caller's rsp
; (offsets assume mmsize = 32, which the 32-byte spacing of the loads
; suggests)
16092 cglobal intra_pred_ang32_14, 3,8,14
; Hand-rolled frame: keep the incoming rsp in r6, reserve the buffer plus one
; GPR slot, align down to 64 bytes so the mova stores below are legal, and
; park the old rsp above the buffer (r6 is overwritten further down).
16093 mov r6, rsp
16094 sub rsp, 4*mmsize+gprsize
16095 and rsp, ~63
16096 mov [rsp+4*mmsize], r6
16097
16098 movu m0, [r2 + 128]
16099 movu m1, [r2 + 160]
16100 movd xm2, [r2 + 192]
16101
16102 mova [rsp + 1*mmsize], m0
16103 mova [rsp + 2*mmsize], m1
16104 movd [rsp + 3*mmsize], xm2
16105
; r1 *= 2, r4 = 3*r1, r3 = middle row (index 16) of ang_table_avx2 so the
; helpers can address rows 0..31 as [r3 + k*32] with k in -16..15.
16106 add r1d, r1d
16107 lea r4, [r1 * 3]
16108 lea r3, [ang_table_avx2 + 16 * 32]
16109
; Gather the extension samples from the other half of the reference array:
; shuffle three 16-byte windows with pw_ang32_14_22 (table defined outside
; this chunk) and patch word slot 4 of each from memory.
16110 movu xm1, [r2 + 4]
16111 movu xm2, [r2 + 24]
16112 movu xm3, [r2 + 44]
16113 pshufb xm1, [pw_ang32_14_22]
16114 pshufb xm2, [pw_ang32_14_22]
16115 pshufb xm3, [pw_ang32_14_22]
16116 pinsrw xm1, [r2 + 20], 4
16117 pinsrw xm2, [r2 + 40], 4
16118 pinsrw xm3, [r2 + 60], 4
16119
16120 punpckhqdq xm2, xm1 ; [ 2 5 7 10 12 15 17 20]
16121 punpckhqdq xm3, xm3 ; [22 25 27 30 22 25 27 30]
16122
; Overwrite the first word of the copied line with word [r2], then place the
; gathered samples directly below it.
16123 movzx r6d, word [r2]
16124 mov [rsp + 1*mmsize], r6w
16125 movu [rsp + 16], xm2
16126 movq [rsp + 8], xm3
16127
; r6d = 0 is the flag the helpers test on entry. r2 now points at the stack
; line; r7 = r0 + 8*r1 is kept to rebuild the second destination pointer.
16128 xor r6d, r6d
16129 lea r2, [rsp + 1*mmsize]
16130 lea r7, [r0 + 8 * r1]
16131
16132 call ang32_mode_14_22_rows_0_15
16133
16134 lea r0, [r0 + 32]
16135
16136 call ang32_mode_14_22_rows_16_31
16137
; Second pass: reference pointer advanced by 32 bytes, destination set to
; r7 + 8*r1 (= entry r0 + 16*r1, provided the helpers leave r1/r7 alone).
16138 add r2, 32
16139 lea r0, [r7 + 8 * r1]
16140
16141 call ang32_mode_14_22_rows_0_15
16142
16143 lea r0, [r0 + 32]
16144
16145 call ang32_mode_14_22_rows_16_31
16146
; Restore the caller's stack pointer saved in the prologue.
16147 mov rsp, [rsp+4*mmsize]
16148 RET
16149
; 32x32 angular intra prediction, mode 22 (16-bit samples).
; cglobal operands (x86inc): 3 arguments, 8 GPRs, 14 vector registers.
; Presumably r0 = dst, r1 = dst stride in samples (doubled to bytes below),
; r2 = reference-sample array -- confirm against the C prototype.
;
; Same structure as intra_pred_ang32_14 with the two halves of the reference
; array swapped: the main line is copied from r2+0 and the extension samples
; are gathered from r2+132.. . r6d is set to 1 instead of 0, and the
; destination pointer is stepped in a different order between the four
; helper calls.
16150 cglobal intra_pred_ang32_22, 3,8,14
; Save rsp in r6, reserve the buffer plus one GPR slot, align down to 64
; bytes and store the old rsp above the buffer.
16151 mov r6, rsp
16152 sub rsp, 4*mmsize+gprsize
16153 and rsp, ~63
16154 mov [rsp+4*mmsize], r6
16155
; Copy 68 bytes (32 + 32 + 4) of the main reference line to rsp + 1*mmsize.
16156 movu m0, [r2]
16157 movu m1, [r2 + 32]
16158 movd xm2, [r2 + 64]
16159
16160 mova [rsp + 1*mmsize], m0
16161 mova [rsp + 2*mmsize], m1
16162 movd [rsp + 3*mmsize], xm2
16163
; r1 *= 2, r4 = 3*r1, r3 = row 16 of ang_table_avx2.
16164 add r1d, r1d
16165 lea r4, [r1 * 3]
16166 lea r3, [ang_table_avx2 + 16 * 32]
16167
; Gather the extension samples; offsets are those of mode 14 plus 128.
16168 movu xm1, [r2 + 132]
16169 movu xm2, [r2 + 152]
16170 movu xm3, [r2 + 172]
16171 pshufb xm1, [pw_ang32_14_22]
16172 pshufb xm2, [pw_ang32_14_22]
16173 pshufb xm3, [pw_ang32_14_22]
16174 pinsrw xm1, [r2 + 148], 4
16175 pinsrw xm2, [r2 + 168], 4
16176 pinsrw xm3, [r2 + 188], 4
16177
16178 punpckhqdq xm2, xm1 ; [ 2 5 7 10 12 15 17 20]
16179 punpckhqdq xm3, xm3 ; [22 25 27 30 22 25 27 30]
16180
; Place the gathered samples directly below the copied line. Unlike mode 14,
; no separate word copy is done here: the line was copied from r2+0, so its
; first word already is word [r2].
16181 movu [rsp + 16], xm2
16182 movq [rsp + 8], xm3
16183
; r6d = 1 is the flag the helpers test on entry. r5 keeps r0 + 32 for the
; second pass.
16184 xor r6d, r6d
16185 inc r6d
16186 lea r2, [rsp + 1*mmsize]
16187 lea r5, [r0 + 32]
16188
16189 call ang32_mode_14_22_rows_0_15
16190
; Two steps of 8*r1 = advance r0 by 16*r1 (lea scale factors stop at 8).
16191 lea r0, [r0 + 8 * r1]
16192 lea r0, [r0 + 8 * r1]
16193
16194 call ang32_mode_14_22_rows_16_31
16195
; Second pass: reference pointer advanced by 32 bytes, destination restarted
; at the saved r0 + 32.
16196 add r2, 32
16197 mov r0, r5
16198
16199 call ang32_mode_14_22_rows_0_15
16200
16201 lea r0, [r0 + 8 * r1]
16202 lea r0, [r0 + 8 * r1]
16203
16204 call ang32_mode_14_22_rows_16_31
16205
; Restore the caller's stack pointer saved in the prologue.
16206 mov rsp, [rsp+4*mmsize]
16207 RET
16208
16209 ;; angle 32, modes 15 and 21, rows 0 to 15
; Internal helper called by intra_pred_ang32_15 and intra_pred_ang32_21.
; It computes 16 interpolated vectors and hands them to
; TRANSPOSE_STORE_AVX2_STACK in two groups of eight.
;
; Register state on entry, as set up by the callers in this file:
;   r2  - pointer into the caller's stack copy of the reference samples;
;         32-byte vectors are read at r2-16, r2-14, r2 and r2+2
;   r3  - ang_table_avx2 + 16*32; [r3 + k*32] holds the word pairs
;         (32-f, f) for f = 16 + k (f is shown in the "; [f]" comments)
;   r6d - 0 when called from mode 15, 1 when called from mode 21
;   r0, r1, r4 - set by the callers; presumably used by
;         TRANSPOSE_STORE_AVX2_STACK (defined outside this chunk)
;
; Per result: pmaddwd with the weight row, add [pd_16] (presumably the +16
; rounding term), shift right by 5, packusdw the two halves to words.
16210 cglobal ang32_mode_15_21_rows_0_15
; Sets ZF from r6d; nothing in the body writes EFLAGS afterwards, so the
; store macros presumably branch on it.
16211 test r6d, r6d
16212
; Build (sample, next sample) pairs: the two loads are 2 bytes apart.
16213 movu m0, [r2 - 16]
16214 movu m1, [r2 - 14]
16215
16216 punpcklwd m3, m0, m1
16217 punpckhwd m0, m1
16218
16219 movu m1, [r2]
16220 movu m4, [r2 + 2]
16221 punpcklwd m2, m1, m4
16222 punpckhwd m1, m4
16223
16224 pmaddwd m4, m3, [r3] ; [16]
16225 paddd m4, [pd_16]
16226 psrld m4, 5
16227 pmaddwd m5, m0, [r3]
16228 paddd m5, [pd_16]
16229 psrld m5, 5
16230 packusdw m4, m5
16231
; Each palignr pair below slides the pair window by 4, 8 or 12 bytes; two
; results are produced per window position.
16232 palignr m6, m0, m3, 4
16233 palignr m7, m2, m0, 4
16234 pmaddwd m5, m6, [r3 - 15 * 32] ; [1]
16235 paddd m5, [pd_16]
16236 psrld m5, 5
16237 pmaddwd m8, m7, [r3 - 15 * 32]
16238 paddd m8, [pd_16]
16239 psrld m8, 5
16240 packusdw m5, m8
16241
16242 pmaddwd m6, [r3 + 2 * 32] ; [18]
16243 paddd m6, [pd_16]
16244 psrld m6, 5
16245 pmaddwd m7, [r3 + 2 * 32]
16246 paddd m7, [pd_16]
16247 psrld m7, 5
16248 packusdw m6, m7
16249
16250 palignr m8, m0, m3, 8
16251 palignr m9, m2, m0, 8
16252 pmaddwd m7, m8, [r3 - 13 * 32] ; [3]
16253 paddd m7, [pd_16]
16254 psrld m7, 5
16255 pmaddwd m10, m9, [r3 - 13 * 32]
16256 paddd m10, [pd_16]
16257 psrld m10, 5
16258 packusdw m7, m10
16259
16260 pmaddwd m8, [r3 + 4 * 32] ; [20]
16261 paddd m8, [pd_16]
16262 psrld m8, 5
16263 pmaddwd m9, [r3 + 4 * 32]
16264 paddd m9, [pd_16]
16265 psrld m9, 5
16266 packusdw m8, m9
16267
16268 palignr m10, m0, m3, 12
16269 palignr m11, m2, m0, 12
16270 pmaddwd m9, m10, [r3 - 11 * 32] ; [5]
16271 paddd m9, [pd_16]
16272 psrld m9, 5
16273 pmaddwd m12, m11, [r3 - 11 * 32]
16274 paddd m12, [pd_16]
16275 psrld m12, 5
16276 packusdw m9, m12
16277
16278 pmaddwd m10, [r3 + 6 * 32] ; [22]
16279 paddd m10, [pd_16]
16280 psrld m10, 5
16281 pmaddwd m11, [r3 + 6 * 32]
16282 paddd m11, [pd_16]
16283 psrld m11, 5
16284 packusdw m10, m11
16285
16286 pmaddwd m11, m0, [r3 - 9 * 32] ; [7]
16287 paddd m11, [pd_16]
16288 psrld m11, 5
16289 pmaddwd m12, m2, [r3 - 9 * 32]
16290 paddd m12, [pd_16]
16291 psrld m12, 5
16292 packusdw m11, m12
16293
; First eight results are in m4..m11; m12/m13 are free here (presumably
; passed as scratch), and the final argument is presumably a destination
; offset (16 here, 0 for the second group).
16294 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16
16295
; Second group: continue from the m0/m2/m1 pair vectors; m3 is now a
; temporary for the high-half products.
16296 pmaddwd m4, m0, [r3 + 8 * 32] ; [24]
16297 paddd m4, [pd_16]
16298 psrld m4, 5
16299 pmaddwd m5, m2, [r3 + 8 * 32]
16300 paddd m5, [pd_16]
16301 psrld m5, 5
16302 packusdw m4, m5
16303
16304 palignr m6, m2, m0, 4
16305 palignr m7, m1, m2, 4
16306 pmaddwd m5, m6, [r3 - 7 * 32] ; [9]
16307 paddd m5, [pd_16]
16308 psrld m5, 5
16309 pmaddwd m3, m7, [r3 - 7 * 32]
16310 paddd m3, [pd_16]
16311 psrld m3, 5
16312 packusdw m5, m3
16313
16314 pmaddwd m6, [r3 + 10 * 32] ; [26]
16315 paddd m6, [pd_16]
16316 psrld m6, 5
16317 pmaddwd m7, [r3 + 10 * 32]
16318 paddd m7, [pd_16]
16319 psrld m7, 5
16320 packusdw m6, m7
16321
16322 palignr m8, m2, m0, 8
16323 palignr m9, m1, m2, 8
16324 pmaddwd m7, m8, [r3 - 5 * 32] ; [11]
16325 paddd m7, [pd_16]
16326 psrld m7, 5
16327 pmaddwd m3, m9, [r3 - 5 * 32]
16328 paddd m3, [pd_16]
16329 psrld m3, 5
16330 packusdw m7, m3
16331
16332 pmaddwd m8, [r3 + 12 * 32] ; [28]
16333 paddd m8, [pd_16]
16334 psrld m8, 5
16335 pmaddwd m9, [r3 + 12 * 32]
16336 paddd m9, [pd_16]
16337 psrld m9, 5
16338 packusdw m8, m9
16339
16340 palignr m10, m2, m0, 12
16341 palignr m11, m1, m2, 12
16342 pmaddwd m9, m10, [r3 - 3 * 32] ; [13]
16343 paddd m9, [pd_16]
16344 psrld m9, 5
16345 pmaddwd m3, m11, [r3 - 3 * 32]
16346 paddd m3, [pd_16]
16347 psrld m3, 5
16348 packusdw m9, m3
16349
16350 pmaddwd m10, [r3 + 14 * 32] ; [30]
16351 paddd m10, [pd_16]
16352 psrld m10, 5
16353 pmaddwd m11, [r3 + 14 * 32]
16354 paddd m11, [pd_16]
16355 psrld m11, 5
16356 packusdw m10, m11
16357
; Last result consumes m2/m1 in place; they are not needed afterwards.
16358 pmaddwd m2, [r3 - 1 * 32] ; [15]
16359 paddd m2, [pd_16]
16360 psrld m2, 5
16361 pmaddwd m1, [r3 - 1 * 32]
16362 paddd m1, [pd_16]
16363 psrld m1, 5
16364 packusdw m2, m1
16365 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0
16366 ret
16367
16368 ;; angle 32, modes 15 and 21, rows 16 to 31
; Internal helper called by intra_pred_ang32_15 and intra_pred_ang32_21,
; always right after ang32_mode_15_21_rows_0_15 with the same r2.
; It computes 16 interpolated vectors and hands them to
; TRANSPOSE_STORE_AVX2_STACK in two groups of eight.
;
; Register state on entry, as set up by the callers in this file:
;   r2  - pointer into the caller's stack copy of the reference samples;
;         32-byte vectors are read at r2-32, r2-30, r2-16 and r2-14
;   r3  - ang_table_avx2 + 16*32; [r3 + k*32] holds the word pairs
;         (32-f, f) for f = 16 + k (f is shown in the "; [f]" comments)
;   r6d - 0 when called from mode 15, 1 when called from mode 21
;   r0, r1, r4 - set by the callers; presumably used by
;         TRANSPOSE_STORE_AVX2_STACK (defined outside this chunk)
;
; Per result: pmaddwd with the weight row, add [pd_16] (presumably the +16
; rounding term), shift right by 5, packusdw the two halves to words.
16369 cglobal ang32_mode_15_21_rows_16_31
; Sets ZF from r6d; nothing in the body writes EFLAGS afterwards, so the
; store macros presumably branch on it.
16370 test r6d, r6d
16371
; Build (sample, next sample) pairs: the two loads are 2 bytes apart.
16372 movu m0, [r2 - 32]
16373 movu m1, [r2 - 30]
16374
16375 punpcklwd m3, m0, m1
16376 punpckhwd m0, m1
16377
16378 movu m1, [r2 - 16]
16379 movu m4, [r2 - 14]
16380 punpcklwd m2, m1, m4
16381 punpckhwd m1, m4
16382
; Two results per window position; palignr slides the window by 4, 8 and
; then 12 bytes.
16383 pmaddwd m4, m3, [r3 - 16 * 32] ; [0]
16384 paddd m4, [pd_16]
16385 psrld m4, 5
16386 pmaddwd m5, m0, [r3 - 16 * 32]
16387 paddd m5, [pd_16]
16388 psrld m5, 5
16389 packusdw m4, m5
16390
16391 pmaddwd m5, m3, [r3 + 1 * 32] ; [17]
16392 paddd m5, [pd_16]
16393 psrld m5, 5
16394 pmaddwd m8, m0, [r3 + 1 * 32]
16395 paddd m8, [pd_16]
16396 psrld m8, 5
16397 packusdw m5, m8
16398
16399 palignr m7, m0, m3, 4
16400 palignr m8, m2, m0, 4
16401 pmaddwd m6, m7, [r3 - 14 * 32] ; [2]
16402 paddd m6, [pd_16]
16403 psrld m6, 5
16404 pmaddwd m9, m8, [r3 - 14 * 32]
16405 paddd m9, [pd_16]
16406 psrld m9, 5
16407 packusdw m6, m9
16408
16409 pmaddwd m7, [r3 + 3 * 32] ; [19]
16410 paddd m7, [pd_16]
16411 psrld m7, 5
16412 pmaddwd m8, [r3 + 3 * 32]
16413 paddd m8, [pd_16]
16414 psrld m8, 5
16415 packusdw m7, m8
16416
16417 palignr m9, m0, m3, 8
16418 palignr m10, m2, m0, 8
16419 pmaddwd m8, m9, [r3 - 12 * 32] ; [4]
16420 paddd m8, [pd_16]
16421 psrld m8, 5
16422 pmaddwd m11, m10, [r3 - 12 * 32]
16423 paddd m11, [pd_16]
16424 psrld m11, 5
16425 packusdw m8, m11
16426
16427 pmaddwd m9, [r3 + 5 * 32] ; [21]
16428 paddd m9, [pd_16]
16429 psrld m9, 5
16430 pmaddwd m10, [r3 + 5 * 32]
16431 paddd m10, [pd_16]
16432 psrld m10, 5
16433 packusdw m9, m10
16434
16435 palignr m11, m0, m3, 12
16436 palignr m12, m2, m0, 12
16437 pmaddwd m10, m11, [r3 - 10 * 32] ; [6]
16438 paddd m10, [pd_16]
16439 psrld m10, 5
16440 pmaddwd m13, m12, [r3 - 10 * 32]
16441 paddd m13, [pd_16]
16442 psrld m13, 5
16443 packusdw m10, m13
16444
16445 pmaddwd m11, [r3 + 7 * 32] ; [23]
16446 paddd m11, [pd_16]
16447 psrld m11, 5
16448 pmaddwd m12, [r3 + 7 * 32]
16449 paddd m12, [pd_16]
16450 psrld m12, 5
16451 packusdw m11, m12
16452
; First eight results are in m4..m11; m12/m13 are free here (presumably
; passed as scratch), and the final argument is presumably a destination
; offset (16 here, 0 for the second group).
16453 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16
16454
; Second group: continue from the m0/m2/m1 pair vectors; m3 is now a
; temporary.
16455 pmaddwd m4, m0, [r3 - 8 * 32] ; [8]
16456 paddd m4, [pd_16]
16457 psrld m4, 5
16458 pmaddwd m7, m2, [r3 - 8 * 32]
16459 paddd m7, [pd_16]
16460 psrld m7, 5
16461 packusdw m4, m7
16462
16463 pmaddwd m5, m0, [r3 + 9 * 32] ; [25]
16464 paddd m5, [pd_16]
16465 psrld m5, 5
16466 pmaddwd m6, m2, [r3 + 9 * 32]
16467 paddd m6, [pd_16]
16468 psrld m6, 5
16469 packusdw m5, m6
16470
16471 palignr m7, m2, m0, 4
16472 palignr m8, m1, m2, 4
16473 pmaddwd m6, m7, [r3 - 6 * 32] ; [10]
16474 paddd m6, [pd_16]
16475 psrld m6, 5
16476 pmaddwd m3, m8, [r3 - 6 * 32]
16477 paddd m3, [pd_16]
16478 psrld m3, 5
16479 packusdw m6, m3
16480
16481 pmaddwd m7, [r3 + 11 * 32] ; [27]
16482 paddd m7, [pd_16]
16483 psrld m7, 5
16484 pmaddwd m8, [r3 + 11 * 32]
16485 paddd m8, [pd_16]
16486 psrld m8, 5
16487 packusdw m7, m8
16488
16489 palignr m9, m2, m0, 8
16490 palignr m3, m1, m2, 8
16491 pmaddwd m8, m9, [r3 - 4 * 32] ; [12]
16492 paddd m8, [pd_16]
16493 psrld m8, 5
16494 pmaddwd m11, m3, [r3 - 4 * 32]
16495 paddd m11, [pd_16]
16496 psrld m11, 5
16497 packusdw m8, m11
16498
16499 pmaddwd m9, [r3 + 13 * 32] ; [29]
16500 paddd m9, [pd_16]
16501 psrld m9, 5
16502 pmaddwd m3, [r3 + 13 * 32]
16503 paddd m3, [pd_16]
16504 psrld m3, 5
16505 packusdw m9, m3
16506
; Last window: shift in place. m1 must be updated before m2 because the
; first palignr still needs the unshifted m2.
16507 palignr m1, m2, 12
16508 palignr m2, m0, 12
16509 pmaddwd m10, m2, [r3 - 2 * 32] ; [14]
16510 paddd m10, [pd_16]
16511 psrld m10, 5
16512 pmaddwd m11, m1, [r3 - 2 * 32]
16513 paddd m11, [pd_16]
16514 psrld m11, 5
16515 packusdw m10, m11
16516
16517 pmaddwd m2, [r3 + 15 * 32] ; [31]
16518 paddd m2, [pd_16]
16519 psrld m2, 5
16520 pmaddwd m1, [r3 + 15 * 32]
16521 paddd m1, [pd_16]
16522 psrld m1, 5
16523 packusdw m2, m1
16524 TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0
16525 ret
16526
; 32x32 angular intra prediction, mode 15 (16-bit samples).
; cglobal operands (x86inc): 3 arguments, 8 GPRs, 14 vector registers.
; Presumably r0 = dst, r1 = dst stride in samples (doubled to bytes below),
; r2 = reference-sample array -- confirm against the C prototype.
;
; Stack buffer layout after setup (assuming mmsize = 32):
;   [rsp + 0  .. rsp + 31]  samples gathered from r2+4.. via pw_ang32_15_21
;   [rsp + 32 .. rsp + 99]  68 bytes copied from r2+128, with the first word
;                           replaced by word [r2]
;   [rsp + 4*mmsize]        caller's rsp
16527 cglobal intra_pred_ang32_15, 3,8,14
; Save rsp in r6, reserve the buffer plus one GPR slot, align down to 64
; bytes and store the old rsp above the buffer (r6 is reused below).
16528 mov r6, rsp
16529 sub rsp, 4*mmsize+gprsize
16530 and rsp, ~63
16531 mov [rsp+4*mmsize], r6
16532
16533 movu m0, [r2 + 128]
16534 movu m1, [r2 + 160]
16535 movd xm2, [r2 + 192]
16536
16537 mova [rsp + 1*mmsize], m0
16538 mova [rsp + 2*mmsize], m1
16539 movd [rsp + 3*mmsize], xm2
16540
; r1 *= 2, r4 = 3*r1, r3 = row 16 of ang_table_avx2.
16541 add r1d, r1d
16542 lea r4, [r1 * 3]
16543 lea r3, [ang_table_avx2 + 16 * 32]
16544
; Gather the extension samples: four 16-byte windows shuffled with
; pw_ang32_15_21 (table defined outside this chunk); the high qwords of
; each pair of results are then merged into one register.
16545 movu xm1, [r2 + 4]
16546 movu xm2, [r2 + 18]
16547 movu xm3, [r2 + 34]
16548 movu xm4, [r2 + 48]
16549 pshufb xm1, [pw_ang32_15_21]
16550 pshufb xm2, [pw_ang32_15_21]
16551 pshufb xm3, [pw_ang32_15_21]
16552 pshufb xm4, [pw_ang32_15_21]
16553
16554 punpckhqdq xm2, xm1
16555 punpckhqdq xm4, xm3
16556
; Overwrite the first word of the copied line with word [r2], then place the
; gathered samples in the 32 bytes directly below it.
16557 movzx r6d, word [r2]
16558 mov [rsp + 1*mmsize], r6w
16559 movu [rsp + 16], xm2
16560 movu [rsp], xm4
16561
; r6d = 0 is the flag the helpers test on entry. r7 = r0 + 8*r1 is kept to
; rebuild the second destination pointer.
16562 xor r6d, r6d
16563 lea r2, [rsp + 1*mmsize]
16564 lea r7, [r0 + 8 * r1]
16565
16566 call ang32_mode_15_21_rows_0_15
16567
16568 lea r0, [r0 + 32]
16569
16570 call ang32_mode_15_21_rows_16_31
16571
; Second pass: reference pointer advanced by 32 bytes, destination set to
; r7 + 8*r1 (= entry r0 + 16*r1, provided the helpers leave r1/r7 alone).
16572 add r2, 32
16573 lea r0, [r7 + 8 * r1]
16574
16575 call ang32_mode_15_21_rows_0_15
16576
16577 lea r0, [r0 + 32]
16578
16579 call ang32_mode_15_21_rows_16_31
16580
; Restore the caller's stack pointer saved in the prologue.
16581 mov rsp, [rsp+4*mmsize]
16582 RET
16583
; 32x32 angular intra prediction, mode 21 (16-bit samples).
; cglobal operands (x86inc): 3 arguments, 8 GPRs, 14 vector registers.
; Presumably r0 = dst, r1 = dst stride in samples (doubled to bytes below),
; r2 = reference-sample array -- confirm against the C prototype.
;
; Same structure as intra_pred_ang32_15 with the two halves of the reference
; array swapped: the main line is copied from r2+0 and the extension samples
; are gathered from r2+132.. . r6d is set to 1 instead of 0, and the
; destination pointer is stepped in a different order between the four
; helper calls.
16584 cglobal intra_pred_ang32_21, 3,8,14
; Save rsp in r6, reserve the buffer plus one GPR slot, align down to 64
; bytes and store the old rsp above the buffer.
16585 mov r6, rsp
16586 sub rsp, 4*mmsize+gprsize
16587 and rsp, ~63
16588 mov [rsp+4*mmsize], r6
16589
; Copy 68 bytes (32 + 32 + 4) of the main reference line to rsp + 1*mmsize.
16590 movu m0, [r2]
16591 movu m1, [r2 + 32]
16592 movd xm2, [r2 + 64]
16593
16594 mova [rsp + 1*mmsize], m0
16595 mova [rsp + 2*mmsize], m1
16596 movd [rsp + 3*mmsize], xm2
16597
; r1 *= 2, r4 = 3*r1, r3 = row 16 of ang_table_avx2.
16598 add r1d, r1d
16599 lea r4, [r1 * 3]
16600 lea r3, [ang_table_avx2 + 16 * 32]
16601
; Gather the extension samples; offsets are those of mode 15 plus 128.
16602 movu xm1, [r2 + 132]
16603 movu xm2, [r2 + 146]
16604 movu xm3, [r2 + 162]
16605 movu xm4, [r2 + 176]
16606 pshufb xm1, [pw_ang32_15_21]
16607 pshufb xm2, [pw_ang32_15_21]
16608 pshufb xm3, [pw_ang32_15_21]
16609 pshufb xm4, [pw_ang32_15_21]
16610
16611 punpckhqdq xm2, xm1
16612 punpckhqdq xm4, xm3
16613
; Place the gathered samples in the 32 bytes directly below the copied line.
16614 movu [rsp + 16], xm2
16615 movu [rsp], xm4
16616
; r6d = 1 is the flag the helpers test on entry. r5 keeps r0 + 32 for the
; second pass.
16617 xor r6d, r6d
16618 inc r6d
16619 lea r2, [rsp + 1*mmsize]
16620 lea r5, [r0 + 32]
16621
16622 call ang32_mode_15_21_rows_0_15
16623
; Two steps of 8*r1 = advance r0 by 16*r1 (lea scale factors stop at 8).
16624 lea r0, [r0 + 8 * r1]
16625 lea r0, [r0 + 8 * r1]
16626
16627 call ang32_mode_15_21_rows_16_31
16628
; Second pass: reference pointer advanced by 32 bytes, destination restarted
; at the saved r0 + 32.
16629 add r2, 32
16630 mov r0, r5
16631
16632 call ang32_mode_15_21_rows_0_15
16633
16634 lea r0, [r0 + 8 * r1]
16635 lea r0, [r0 + 8 * r1]
16636
16637 call ang32_mode_15_21_rows_16_31
16638
; Restore the caller's stack pointer saved in the prologue.
16639 mov rsp, [rsp+4*mmsize]
16640 RET
16641
16642 ;; angle 32, modes 16 and 20, rows 0 to 15
; Internal helper called by intra_pred_ang32_16 and intra_pred_ang32_20.
; It computes 16 interpolated vectors and hands them to
; TRANSPOSE_STORE_AVX2_STACK in two groups of eight.
;
; Register state on entry, as set up by the callers in this file:
;   r2  - pointer into the caller's stack copy of the reference samples;
;         the first 13 results use vectors read at r2-20, r2-18, r2-4 and
;         r2-2, the last three use a second set read at r2-2, r2, r2+14 and
;         r2+16
;   r3  - ang_table_avx2 + 16*32; [r3 + k*32] holds the word pairs
;         (32-f, f) for f = 16 + k (f is shown in the "; [f]" comments)
;   r6d - 0 when called from mode 16, 1 when called from mode 20
;   r0, r1, r4 - set by the callers; presumably used by
;         TRANSPOSE_STORE_AVX2_STACK (defined outside this chunk)
;
; Per result: pmaddwd with the weight row, add [pd_16] (presumably the +16
; rounding term), shift right by 5, packusdw the two halves to words.
16643 cglobal ang32_mode_16_20_rows_0_15
; Sets ZF from r6d; nothing in the body writes EFLAGS afterwards, so the
; store macros presumably branch on it.
16644 test r6d, r6d
16645
; Build (sample, next sample) pairs: the two loads are 2 bytes apart.
16646 movu m0, [r2 - 20]
16647 movu m1, [r2 - 18]
16648
16649 punpcklwd m3, m0, m1
16650 punpckhwd m0, m1
16651
16652 movu m1, [r2 - 4] ; [ 3 2 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13]
16653 movu m4, [r2 - 2] ; [ 2 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14]
16654 punpcklwd m2, m1, m4 ; [-3 -2 -4 -3 -5 -4 -6 -5 -11 -10 -12 -11 -13 -12 -14 -13]
16655 punpckhwd m1, m4 ; [ 2 3 2 0 -1 0 -2 -1 -7 -6 -8 -7 -9 -8 -10 -9]
16656
16657 pmaddwd m4, m3, [r3] ; [16]
16658 paddd m4, [pd_16]
16659 psrld m4, 5
16660 pmaddwd m5, m0, [r3]
16661 paddd m5, [pd_16]
16662 psrld m5, 5
16663 packusdw m4, m5
16664
; palignr slides the pair window by 4 bytes (one dword pair) per step.
16665 palignr m6, m0, m3, 4
16666 palignr m7, m2, m0, 4
16667 pmaddwd m5, m6, [r3 - 11 * 32] ; [5]
16668 paddd m5, [pd_16]
16669 psrld m5, 5
16670 pmaddwd m8, m7, [r3 - 11 * 32]
16671 paddd m8, [pd_16]
16672 psrld m8, 5
16673 packusdw m5, m8
16674
16675 pmaddwd m6, [r3 + 10 * 32] ; [26]
16676 paddd m6, [pd_16]
16677 psrld m6, 5
16678 pmaddwd m7, [r3 + 10 * 32]
16679 paddd m7, [pd_16]
16680 psrld m7, 5
16681 packusdw m6, m7
16682
16683 palignr m8, m0, m3, 8
16684 palignr m9, m2, m0, 8
16685 pmaddwd m7, m8, [r3 - 1 * 32] ; [15]
16686 paddd m7, [pd_16]
16687 psrld m7, 5
16688 pmaddwd m10, m9, [r3 - 1 * 32]
16689 paddd m10, [pd_16]
16690 psrld m10, 5
16691 packusdw m7, m10
16692
16693 palignr m9, m0, m3, 12
16694 palignr m12, m2, m0, 12
16695 pmaddwd m8, m9, [r3 - 12 * 32] ; [4]
16696 paddd m8, [pd_16]
16697 psrld m8, 5
16698 pmaddwd m10, m12, [r3 - 12 * 32]
16699 paddd m10, [pd_16]
16700 psrld m10, 5
16701 packusdw m8, m10
16702
16703 pmaddwd m9, [r3 + 9 * 32] ; [25]
16704 paddd m9, [pd_16]
16705 psrld m9, 5
16706 pmaddwd m12, [r3 + 9 * 32]
16707 paddd m12, [pd_16]
16708 psrld m12, 5
16709 packusdw m9, m12
16710
16711 pmaddwd m10, m0, [r3 - 2 * 32] ; [14]
16712 paddd m10, [pd_16]
16713 psrld m10, 5
16714 pmaddwd m11, m2, [r3 - 2 * 32]
16715 paddd m11, [pd_16]
16716 psrld m11, 5
16717 packusdw m10, m11
16718
16719 palignr m11, m2, m0, 4
16720 palignr m12, m1, m2, 4
16721 pmaddwd m11, [r3 - 13 * 32] ; [3]
16722 paddd m11, [pd_16]
16723 psrld m11, 5
16724 pmaddwd m12, [r3 - 13 * 32]
16725 paddd m12, [pd_16]
16726 psrld m12, 5
16727 packusdw m11, m12
16728
; First eight results are in m4..m11; m12/m13 are passed in the next two
; slots (presumably scratch), and the final argument is presumably a
; destination offset (16 here, 0 for the second group).
16729 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16
16730
; Second group. The same 4-byte window as result [3] is rebuilt here
; because m11/m12 were consumed in place above.
16731 palignr m4, m2, m0, 4
16732 palignr m5, m1, m2, 4
16733 pmaddwd m4, [r3 + 8 * 32] ; [24]
16734 paddd m4, [pd_16]
16735 psrld m4, 5
16736 pmaddwd m5, [r3 + 8 * 32]
16737 paddd m5, [pd_16]
16738 psrld m5, 5
16739 packusdw m4, m5
16740
16741 palignr m5, m2, m0, 8
16742 palignr m3, m1, m2, 8
16743 pmaddwd m5, [r3 - 3 * 32] ; [13]
16744 paddd m5, [pd_16]
16745 psrld m5, 5
16746 pmaddwd m3, [r3 - 3 * 32]
16747 paddd m3, [pd_16]
16748 psrld m3, 5
16749 packusdw m5, m3
16750
16751 palignr m7, m2, m0, 12
16752 palignr m3, m1, m2, 12
16753 pmaddwd m6, m7, [r3 - 14 * 32] ; [2]
16754 paddd m6, [pd_16]
16755 psrld m6, 5
16756 pmaddwd m8, m3, [r3 - 14 * 32]
16757 paddd m8, [pd_16]
16758 psrld m8, 5
16759 packusdw m6, m8
16760
16761 pmaddwd m7, [r3 + 7 * 32] ; [23]
16762 paddd m7, [pd_16]
16763 psrld m7, 5
16764 pmaddwd m3, [r3 + 7 * 32]
16765 paddd m3, [pd_16]
16766 psrld m3, 5
16767 packusdw m7, m3
16768
16769 pmaddwd m8, m2, [r3 - 4 * 32] ; [12]
16770 paddd m8, [pd_16]
16771 psrld m8, 5
16772 pmaddwd m9, m1, [r3 - 4 * 32]
16773 paddd m9, [pd_16]
16774 psrld m9, 5
16775 packusdw m8, m9
16776
; The first set of pair vectors is finished; reload from further along the
; reference line (r2-2 .. r2+16) for the last three results. Only the low
; interleave of the second load pair is kept (m2).
16777 movu m0, [r2 - 2]
16778 movu m1, [r2]
16779
16780 punpcklwd m3, m0, m1
16781 punpckhwd m0, m1
16782
16783 movu m2, [r2 + 14]
16784 movu m1, [r2 + 16]
16785 punpcklwd m2, m1
16786
16787 pmaddwd m9, m3, [r3 - 15 * 32] ; [1]
16788 paddd m9, [pd_16]
16789 psrld m9, 5
16790 pmaddwd m10, m0, [r3 - 15 * 32]
16791 paddd m10, [pd_16]
16792 psrld m10, 5
16793 packusdw m9, m10
16794
16795 pmaddwd m10, m3, [r3 + 6 * 32] ; [22]
16796 paddd m10, [pd_16]
16797 psrld m10, 5
16798 pmaddwd m11, m0, [r3 + 6 * 32]
16799 paddd m11, [pd_16]
16800 psrld m11, 5
16801 packusdw m10, m11
16802
; Shift in place: m2 first, while m0 is still unshifted.
16803 palignr m2, m0, 4
16804 palignr m0, m3, 4
16805 pmaddwd m0, [r3 - 5 * 32] ; [11]
16806 paddd m0, [pd_16]
16807 psrld m0, 5
16808 pmaddwd m2, [r3 - 5 * 32]
16809 paddd m2, [pd_16]
16810 psrld m2, 5
16811 packusdw m0, m2
; Second eight results are in m4..m10 plus m0; m2/m1 are no longer needed.
16812 TRANSPOSE_STORE_AVX2_STACK 0, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0
16813 ret
16814
16815 ;; angle 32, modes 16 and 20, rows 16 to 31
; Internal helper called by intra_pred_ang32_16 and intra_pred_ang32_20,
; always right after ang32_mode_16_20_rows_0_15 with the same r2.
; It computes 16 interpolated vectors and hands them to
; TRANSPOSE_STORE_AVX2_STACK in two groups of eight.
;
; Register state on entry, as set up by the callers in this file:
;   r2  - pointer into the caller's stack copy of the reference samples;
;         the first 14 results use vectors read at r2-40, r2-38, r2-24 and
;         r2-22, the last two use a second pair read at r2-22 and r2-20
;   r3  - ang_table_avx2 + 16*32; [r3 + k*32] holds the word pairs
;         (32-f, f) for f = 16 + k (f is shown in the "; [f]" comments)
;   r6d - 0 when called from mode 16, 1 when called from mode 20
;   r0, r1, r4 - set by the callers; presumably used by
;         TRANSPOSE_STORE_AVX2_STACK (defined outside this chunk)
;
; Per result: pmaddwd with the weight row, add [pd_16] (presumably the +16
; rounding term), shift right by 5, packusdw the two halves to words.
16816 cglobal ang32_mode_16_20_rows_16_31
; Sets ZF from r6d; nothing in the body writes EFLAGS afterwards, so the
; store macros presumably branch on it.
16817 test r6d, r6d
16818
; Build (sample, next sample) pairs: the two loads are 2 bytes apart.
16819 movu m0, [r2 - 40]
16820 movu m1, [r2 - 38]
16821
16822 punpcklwd m3, m0, m1
16823 punpckhwd m0, m1
16824
16825 movu m1, [r2 - 24]
16826 movu m4, [r2 - 22]
16827 punpcklwd m2, m1, m4
16828 punpckhwd m1, m4
16829
16830 pmaddwd m4, m3, [r3 - 16 * 32] ; [0]
16831 paddd m4, [pd_16]
16832 psrld m4, 5
16833 pmaddwd m5, m0, [r3 - 16 * 32]
16834 paddd m5, [pd_16]
16835 psrld m5, 5
16836 packusdw m4, m5
16837
16838 pmaddwd m5, m3, [r3 + 5 * 32] ; [21]
16839 paddd m5, [pd_16]
16840 psrld m5, 5
16841 pmaddwd m8, m0, [r3 + 5 * 32]
16842 paddd m8, [pd_16]
16843 psrld m8, 5
16844 packusdw m5, m8
16845
; palignr slides the pair window by 4 bytes (one dword pair) per step.
16846 palignr m7, m0, m3, 4
16847 palignr m8, m2, m0, 4
16848 pmaddwd m6, m7, [r3 - 6 * 32] ; [10]
16849 paddd m6, [pd_16]
16850 psrld m6, 5
16851 pmaddwd m9, m8, [r3 - 6 * 32]
16852 paddd m9, [pd_16]
16853 psrld m9, 5
16854 packusdw m6, m9
16855
16856 pmaddwd m7, [r3 + 15 * 32] ; [31]
16857 paddd m7, [pd_16]
16858 psrld m7, 5
16859 pmaddwd m8, [r3 + 15 * 32]
16860 paddd m8, [pd_16]
16861 psrld m8, 5
16862 packusdw m7, m8
16863
16864 palignr m8, m0, m3, 8
16865 palignr m9, m2, m0, 8
16866 pmaddwd m8, [r3 + 4 * 32] ; [20]
16867 paddd m8, [pd_16]
16868 psrld m8, 5
16869 pmaddwd m9, [r3 + 4 * 32]
16870 paddd m9, [pd_16]
16871 psrld m9, 5
16872 packusdw m8, m9
16873
16874 palignr m10, m0, m3, 12
16875 palignr m11, m2, m0, 12
16876 pmaddwd m9, m10, [r3 - 7 * 32] ; [9]
16877 paddd m9, [pd_16]
16878 psrld m9, 5
16879 pmaddwd m12, m11, [r3 - 7 * 32]
16880 paddd m12, [pd_16]
16881 psrld m12, 5
16882 packusdw m9, m12
16883
16884 pmaddwd m10, [r3 + 14 * 32] ; [30]
16885 paddd m10, [pd_16]
16886 psrld m10, 5
16887 pmaddwd m11, [r3 + 14 * 32]
16888 paddd m11, [pd_16]
16889 psrld m11, 5
16890 packusdw m10, m11
16891
16892 pmaddwd m11, m0, [r3 + 3 * 32] ; [19]
16893 paddd m11, [pd_16]
16894 psrld m11, 5
16895 pmaddwd m12, m2, [r3 + 3 * 32]
16896 paddd m12, [pd_16]
16897 psrld m12, 5
16898 packusdw m11, m12
16899
; First eight results are in m4..m11; m12/m13 are passed in the next two
; slots (presumably scratch), and the final argument is presumably a
; destination offset (16 here, 0 for the second group).
16900 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16
16901
; Second group: continue from the m0/m2/m1 pair vectors; m3 is now a
; temporary.
16902 palignr m5, m2, m0, 4
16903 palignr m6, m1, m2, 4
16904 pmaddwd m4, m5, [r3 - 8 * 32] ; [8]
16905 paddd m4, [pd_16]
16906 psrld m4, 5
16907 pmaddwd m7, m6, [r3 - 8 * 32]
16908 paddd m7, [pd_16]
16909 psrld m7, 5
16910 packusdw m4, m7
16911
16912 pmaddwd m5, [r3 + 13 * 32] ; [29]
16913 paddd m5, [pd_16]
16914 psrld m5, 5
16915 pmaddwd m6, [r3 + 13 * 32]
16916 paddd m6, [pd_16]
16917 psrld m6, 5
16918 packusdw m5, m6
16919
16920 palignr m6, m2, m0, 8
16921 palignr m3, m1, m2, 8
16922 pmaddwd m6, [r3 + 2 * 32] ; [18]
16923 paddd m6, [pd_16]
16924 psrld m6, 5
16925 pmaddwd m3, [r3 + 2 * 32]
16926 paddd m3, [pd_16]
16927 psrld m3, 5
16928 packusdw m6, m3
16929
16930 palignr m8, m2, m0, 12
16931 palignr m9, m1, m2, 12
16932 pmaddwd m7, m8, [r3 - 9 * 32] ; [7]
16933 paddd m7, [pd_16]
16934 psrld m7, 5
16935 pmaddwd m10, m9, [r3 - 9 * 32]
16936 paddd m10, [pd_16]
16937 psrld m10, 5
16938 packusdw m7, m10
16939
16940 pmaddwd m8, [r3 + 12 * 32] ; [28]
16941 paddd m8, [pd_16]
16942 psrld m8, 5
16943 pmaddwd m9, [r3 + 12 * 32]
16944 paddd m9, [pd_16]
16945 psrld m9, 5
16946 packusdw m8, m9
16947
16948 pmaddwd m9, m2, [r3 + 1 * 32] ; [17]
16949 paddd m9, [pd_16]
16950 psrld m9, 5
16951 pmaddwd m3, m1, [r3 + 1 * 32]
16952 paddd m3, [pd_16]
16953 psrld m3, 5
16954 packusdw m9, m3
16955
; Reload a fresh pair set (r2-22 / r2-20) for the last two results; the
; earlier m0/m1 contents are no longer needed.
16956 movu m0, [r2 - 22]
16957 movu m1, [r2 - 20]
16958 punpcklwd m3, m0, m1
16959 punpckhwd m0, m1
16960
16961 pmaddwd m10, m3, [r3 - 10 * 32] ; [6]
16962 paddd m10, [pd_16]
16963 psrld m10, 5
16964 pmaddwd m11, m0, [r3 - 10 * 32]
16965 paddd m11, [pd_16]
16966 psrld m11, 5
16967 packusdw m10, m11
16968
16969 pmaddwd m3, [r3 + 11 * 32] ; [27]
16970 paddd m3, [pd_16]
16971 psrld m3, 5
16972 pmaddwd m0, [r3 + 11 * 32]
16973 paddd m0, [pd_16]
16974 psrld m0, 5
16975 packusdw m3, m0
; Second eight results are in m4..m10 plus m3; m0/m1 are no longer needed.
16976 TRANSPOSE_STORE_AVX2_STACK 3, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0
16977 ret
16978
; 32x32 angular intra prediction, mode 16 (16-bit samples).
; cglobal operands (x86inc): 3 arguments, 8 GPRs, 14 vector registers.
; Presumably r0 = dst, r1 = dst stride in samples (doubled to bytes below),
; r2 = reference-sample array -- confirm against the C prototype.
;
; Uses a 5*mmsize buffer (one vector more than modes 14/15) because more
; extension samples are gathered. Layout after setup (assuming mmsize = 32):
;   [rsp + 24 .. rsp + 63]   samples gathered from r2+4.. via pw_ang32_16_20
;   [rsp + 64 .. rsp + 131]  68 bytes copied from r2+128, with the first
;                            word replaced by word [r2]
;   [rsp + 5*mmsize]         caller's rsp
16979 cglobal intra_pred_ang32_16, 3,8,14
; Save rsp in r6, reserve the buffer plus one GPR slot, align down to 64
; bytes and store the old rsp above the buffer (r6 is reused below).
16980 mov r6, rsp
16981 sub rsp, 5*mmsize+gprsize
16982 and rsp, ~63
16983 mov [rsp+5*mmsize], r6
16984
16985 movu m0, [r2 + 128]
16986 movu m1, [r2 + 160]
16987 movd xm2, [r2 + 192]
16988
16989 mova [rsp + 2*mmsize], m0
16990 mova [rsp + 3*mmsize], m1
16991 movd [rsp + 4*mmsize], xm2
16992
; r1 *= 2, r4 = 3*r1, r3 = row 16 of ang_table_avx2.
16993 add r1d, r1d
16994 lea r4, [r1 * 3]
16995 lea r3, [ang_table_avx2 + 16 * 32]
16996
; Gather the extension samples: five 16-byte windows, 12 bytes apart,
; shuffled with pw_ang32_16_20 (table defined outside this chunk); the high
; qwords are then merged pairwise.
16997 movu xm1, [r2 + 4]
16998 movu xm2, [r2 + 16]
16999 movu xm3, [r2 + 28]
17000 movu xm4, [r2 + 40]
17001 movu xm5, [r2 + 52]
17002 pshufb xm1, [pw_ang32_16_20]
17003 pshufb xm2, [pw_ang32_16_20]
17004 pshufb xm3, [pw_ang32_16_20]
17005 pshufb xm4, [pw_ang32_16_20]
17006 pshufb xm5, [pw_ang32_16_20]
17007
17008 punpckhqdq xm2, xm1
17009 punpckhqdq xm4, xm3
17010 punpckhqdq xm5, xm5
17011
; Overwrite the first word of the copied line with word [r2], then place the
; gathered samples (16 + 16 + 8 bytes) directly below it.
17012 movzx r6d, word [r2]
17013 mov [rsp + 2*mmsize], r6w
17014 movu [rsp + 48], xm2
17015 movu [rsp + 32], xm4
17016 movq [rsp + 24], xm5
17017
; r6d = 0 is the flag the helpers test on entry. r7 = r0 + 8*r1 is kept to
; rebuild the second destination pointer.
17018 xor r6d, r6d
17019 lea r2, [rsp + 2*mmsize]
17020 lea r7, [r0 + 8 * r1]
17021
17022 call ang32_mode_16_20_rows_0_15
17023
17024 lea r0, [r0 + 32]
17025
17026 call ang32_mode_16_20_rows_16_31
17027
; Second pass: reference pointer advanced by 32 bytes, destination set to
; r7 + 8*r1 (= entry r0 + 16*r1, provided the helpers leave r1/r7 alone).
17028 add r2, 32
17029 lea r0, [r7 + 8 * r1]
17030
17031 call ang32_mode_16_20_rows_0_15
17032
17033 lea r0, [r0 + 32]
17034
17035 call ang32_mode_16_20_rows_16_31
17036
; Restore the caller's stack pointer saved in the prologue.
17037 mov rsp, [rsp+5*mmsize]
17038 RET
17039
; 32x32 angular intra prediction, mode 20 (16-bit samples).
; cglobal operands (x86inc): 3 arguments, 8 GPRs, 14 vector registers.
; Presumably r0 = dst, r1 = dst stride in samples (doubled to bytes below),
; r2 = reference-sample array -- confirm against the C prototype.
;
; Same structure as intra_pred_ang32_16 with the two halves of the reference
; array swapped: the main line is copied from r2+0 and the extension samples
; are gathered from r2+132.. . r6d is set to 1 instead of 0, and the
; destination pointer is stepped in a different order between the four
; helper calls.
17040 cglobal intra_pred_ang32_20, 3,8,14
; Save rsp in r6, reserve the buffer plus one GPR slot, align down to 64
; bytes and store the old rsp above the buffer.
17041 mov r6, rsp
17042 sub rsp, 5*mmsize+gprsize
17043 and rsp, ~63
17044 mov [rsp+5*mmsize], r6
17045
; Copy 68 bytes (32 + 32 + 4) of the main reference line to rsp + 2*mmsize.
17046 movu m0, [r2]
17047 movu m1, [r2 + 32]
17048 movd xm2, [r2 + 64]
17049
17050 mova [rsp + 2*mmsize], m0
17051 mova [rsp + 3*mmsize], m1
17052 movd [rsp + 4*mmsize], xm2
17053
; r1 *= 2, r4 = 3*r1, r3 = row 16 of ang_table_avx2.
17054 add r1d, r1d
17055 lea r4, [r1 * 3]
17056 lea r3, [ang_table_avx2 + 16 * 32]
17057
; Gather the extension samples; offsets are those of mode 16 plus 128.
17058 movu xm1, [r2 + 132]
17059 movu xm2, [r2 + 144]
17060 movu xm3, [r2 + 156]
17061 movu xm4, [r2 + 168]
17062 movu xm5, [r2 + 180]
17063 pshufb xm1, [pw_ang32_16_20]
17064 pshufb xm2, [pw_ang32_16_20]
17065 pshufb xm3, [pw_ang32_16_20]
17066 pshufb xm4, [pw_ang32_16_20]
17067 pshufb xm5, [pw_ang32_16_20]
17068
17069 punpckhqdq xm2, xm1
17070 punpckhqdq xm4, xm3
17071 punpckhqdq xm5, xm5
17072
; Place the gathered samples (16 + 16 + 8 bytes) directly below the copied
; line.
17073 movu [rsp + 48], xm2
17074 movu [rsp + 32], xm4
17075 movq [rsp + 24], xm5
17076
; r6d = 1 is the flag the helpers test on entry. r5 keeps r0 + 32 for the
; second pass.
17077 xor r6d, r6d
17078 inc r6d
17079 lea r2, [rsp + 2*mmsize]
17080 lea r5, [r0 + 32]
17081
17082 call ang32_mode_16_20_rows_0_15
17083
; Two steps of 8*r1 = advance r0 by 16*r1 (lea scale factors stop at 8).
17084 lea r0, [r0 + 8 * r1]
17085 lea r0, [r0 + 8 * r1]
17086
17087 call ang32_mode_16_20_rows_16_31
17088
; Second pass: reference pointer advanced by 32 bytes, destination restarted
; at the saved r0 + 32.
17089 add r2, 32
17090 mov r0, r5
17091
17092 call ang32_mode_16_20_rows_0_15
17093
17094 lea r0, [r0 + 8 * r1]
17095 lea r0, [r0 + 8 * r1]
17096
17097 call ang32_mode_16_20_rows_16_31
17098
; Restore the caller's stack pointer saved in the prologue.
17099 mov rsp, [rsp+5*mmsize]
17100 RET
17101
17102 ;; angle 32, modes 17 and 19, 16 rows per call (the callers invoke it four times with r2 adjusted)
; Internal helper called by intra_pred_ang32_17 and intra_pred_ang32_19.
; Despite the "rows_0_15" name it is the only kernel for these modes: each
; wrapper calls it four times, moving r2 (and r0) between calls.
; It computes 16 interpolated vectors and hands them to
; TRANSPOSE_STORE_AVX2_STACK in two groups of eight.
;
; Register state on entry, as set up by the callers in this file:
;   r2  - pointer into the caller's stack copy of the reference samples;
;         the first 12 results use vectors read at r2-24, r2-22, r2-8 and
;         r2-6, the last four use a second set read at r2-6, r2-4, r2+10
;         and r2+12
;   r3  - ang_table_avx2 + 16*32; [r3 + k*32] holds the word pairs
;         (32-f, f) for f = 16 + k (f is shown in the "; [f]" comments)
;   r6d - 0 when called from mode 17, 1 when called from mode 19
;   r0, r1, r4 - set by the callers; presumably used by
;         TRANSPOSE_STORE_AVX2_STACK (defined outside this chunk)
;
; Per result: pmaddwd with the weight row, add [pd_16] (presumably the +16
; rounding term), shift right by 5, packusdw the two halves to words.
17103 cglobal ang32_mode_17_19_rows_0_15
; Sets ZF from r6d; nothing in the body writes EFLAGS afterwards, so the
; store macros presumably branch on it.
17104 test r6d, r6d
17105
; Build (sample, next sample) pairs: the two loads are 2 bytes apart.
17106 movu m0, [r2 - 24]
17107 movu m1, [r2 - 22]
17108
17109 punpcklwd m3, m0, m1
17110 punpckhwd m0, m1
17111
17112 movu m1, [r2 - 8]
17113 movu m4, [r2 - 6]
17114 punpcklwd m2, m1, m4
17115 punpckhwd m1, m4
17116
17117 pmaddwd m4, m3, [r3 - 16 * 32] ; [0]
17118 paddd m4, [pd_16]
17119 psrld m4, 5
17120 pmaddwd m5, m0, [r3 - 16 * 32]
17121 paddd m5, [pd_16]
17122 psrld m5, 5
17123 packusdw m4, m5
17124
17125 pmaddwd m5, m3, [r3 + 10 * 32] ; [26]
17126 paddd m5, [pd_16]
17127 psrld m5, 5
17128 pmaddwd m8, m0, [r3 + 10 * 32]
17129 paddd m8, [pd_16]
17130 psrld m8, 5
17131 packusdw m5, m8
17132
; From here on most results use their own window position: palignr slides
; the pair window by 4, 8 or 12 bytes and the product is taken in place.
17133 palignr m6, m0, m3, 4
17134 palignr m8, m2, m0, 4
17135 pmaddwd m6, [r3 + 4 * 32] ; [20]
17136 paddd m6, [pd_16]
17137 psrld m6, 5
17138 pmaddwd m8, [r3 + 4 * 32]
17139 paddd m8, [pd_16]
17140 psrld m8, 5
17141 packusdw m6, m8
17142
17143 palignr m7, m0, m3, 8
17144 palignr m9, m2, m0, 8
17145 pmaddwd m7, [r3 - 2 * 32] ; [14]
17146 paddd m7, [pd_16]
17147 psrld m7, 5
17148 pmaddwd m9, [r3 - 2 * 32]
17149 paddd m9, [pd_16]
17150 psrld m9, 5
17151 packusdw m7, m9
17152
17153 palignr m8, m0, m3, 12
17154 palignr m10, m2, m0, 12
17155 pmaddwd m8, [r3 - 8 * 32] ; [8]
17156 paddd m8, [pd_16]
17157 psrld m8, 5
17158 pmaddwd m10, [r3 - 8 * 32]
17159 paddd m10, [pd_16]
17160 psrld m10, 5
17161 packusdw m8, m10
17162
17163 pmaddwd m9, m0, [r3 - 14 * 32] ; [2]
17164 paddd m9, [pd_16]
17165 psrld m9, 5
17166 pmaddwd m12, m2, [r3 - 14 * 32]
17167 paddd m12, [pd_16]
17168 psrld m12, 5
17169 packusdw m9, m12
17170
17171 pmaddwd m10, m0, [r3 + 12 * 32] ; [28]
17172 paddd m10, [pd_16]
17173 psrld m10, 5
17174 pmaddwd m11, m2, [r3 + 12 * 32]
17175 paddd m11, [pd_16]
17176 psrld m11, 5
17177 packusdw m10, m11
17178
17179 palignr m11, m2, m0, 4
17180 palignr m12, m1, m2, 4
17181 pmaddwd m11, [r3 + 6 * 32] ; [22]
17182 paddd m11, [pd_16]
17183 psrld m11, 5
17184 pmaddwd m12, [r3 + 6 * 32]
17185 paddd m12, [pd_16]
17186 psrld m12, 5
17187 packusdw m11, m12
17188
; First eight results are in m4..m11; m12/m13 are passed in the next two
; slots (presumably scratch), and the final argument is presumably a
; destination offset (16 here, 0 for the second group).
17189 TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16
17190
; Second group: continue from the m0/m2/m1 pair vectors; m3 is now a
; temporary.
17191 palignr m4, m2, m0, 8
17192 palignr m5, m1, m2, 8
17193 pmaddwd m4, [r3] ; [16]
17194 paddd m4, [pd_16]
17195 psrld m4, 5
17196 pmaddwd m5, [r3]
17197 paddd m5, [pd_16]
17198 psrld m5, 5
17199 packusdw m4, m5
17200
17201 palignr m5, m2, m0, 12
17202 palignr m3, m1, m2, 12
17203 pmaddwd m5, [r3 - 6 * 32] ; [10]
17204 paddd m5, [pd_16]
17205 psrld m5, 5
17206 pmaddwd m3, [r3 - 6 * 32]
17207 paddd m3, [pd_16]
17208 psrld m3, 5
17209 packusdw m5, m3
17210
17211 pmaddwd m6, m2, [r3 - 12 * 32] ; [4]
17212 paddd m6, [pd_16]
17213 psrld m6, 5
17214 pmaddwd m8, m1, [r3 - 12 * 32]
17215 paddd m8, [pd_16]
17216 psrld m8, 5
17217 packusdw m6, m8
17218
17219 pmaddwd m7, m2, [r3 + 14 * 32] ; [30]
17220 paddd m7, [pd_16]
17221 psrld m7, 5
17222 pmaddwd m3, m1, [r3 + 14 * 32]
17223 paddd m3, [pd_16]
17224 psrld m3, 5
17225 packusdw m7, m3
17226
; The first set of pair vectors is finished; reload from further along the
; reference line (r2-6 .. r2+12) for the last four results. Only the low
; interleave of the second load pair is kept (m2).
17227 movu m0, [r2 - 6]
17228 movu m1, [r2 - 4]
17229
17230 punpcklwd m3, m0, m1
17231 punpckhwd m0, m1
17232
17233 movu m2, [r2 + 10]
17234 movu m1, [r2 + 12]
17235 punpcklwd m2, m1
17236
17237 pmaddwd m8, m3, [r3 + 8 * 32] ; [24]
17238 paddd m8, [pd_16]
17239 psrld m8, 5
17240 pmaddwd m9, m0, [r3 + 8 * 32]
17241 paddd m9, [pd_16]
17242 psrld m9, 5
17243 packusdw m8, m9
17244
17245 palignr m9, m0, m3, 4
17246 palignr m10, m2, m0, 4
17247 pmaddwd m9, [r3 + 2 * 32] ; [18]
17248 paddd m9, [pd_16]
17249 psrld m9, 5
17250 pmaddwd m10, [r3 + 2 * 32]
17251 paddd m10, [pd_16]
17252 psrld m10, 5
17253 packusdw m9, m10
17254
17255 palignr m10, m0, m3, 8
17256 palignr m11, m2, m0, 8
17257 pmaddwd m10, [r3 - 4 * 32] ; [12]
17258 paddd m10, [pd_16]
17259 psrld m10, 5
17260 pmaddwd m11, [r3 - 4 * 32]
17261 paddd m11, [pd_16]
17262 psrld m11, 5
17263 packusdw m10, m11
17264
; Shift in place: m2 first, while m0 is still unshifted.
17265 palignr m2, m0, 12
17266 palignr m0, m3, 12
17267 pmaddwd m0, [r3 - 10 * 32] ; [6]
17268 paddd m0, [pd_16]
17269 psrld m0, 5
17270 pmaddwd m2, [r3 - 10 * 32]
17271 paddd m2, [pd_16]
17272 psrld m2, 5
17273 packusdw m0, m2
; Second eight results are in m4..m10 plus m0; m2/m1 are no longer needed.
17274 TRANSPOSE_STORE_AVX2_STACK 0, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0
17275 ret
17276
; 32x32 angular intra prediction, mode 17 (16-bit samples).
; cglobal operands (x86inc): 3 arguments, 8 GPRs, 14 vector registers.
; Presumably r0 = dst, r1 = dst stride in samples (doubled to bytes below),
; r2 = reference-sample array -- confirm against the C prototype.
;
; Unlike modes 14-16 there is a single helper, ang32_mode_17_19_rows_0_15,
; called four times; between calls r2 is moved by -26 / +58 / -26 bytes
; instead of switching to a "rows_16_31" variant.
;
; Stack buffer layout after setup (assuming mmsize = 32):
;   [rsp + 10 .. rsp + 63]   samples gathered from r2+2.. (four overlapping
;                            16-byte stores)
;   [rsp + 64 .. rsp + 131]  68 bytes copied from r2+128, with the first
;                            word replaced by word [r2]
;   [rsp + 5*mmsize]         caller's rsp
17277 cglobal intra_pred_ang32_17, 3,8,14
; Save rsp in r6, reserve the buffer plus one GPR slot, align down to 64
; bytes and store the old rsp above the buffer (r6 is reused below).
17278 mov r6, rsp
17279 sub rsp, 5*mmsize+gprsize
17280 and rsp, ~63
17281 mov [rsp+5*mmsize], r6
17282
17283 movu m0, [r2 + 128]
17284 movu m1, [r2 + 160]
17285 movd xm2, [r2 + 192]
17286
17287 mova [rsp + 2*mmsize], m0
17288 mova [rsp + 3*mmsize], m1
17289 movd [rsp + 4*mmsize], xm2
17290
; r1 *= 2, r4 = 3*r1, r3 = row 16 of ang_table_avx2.
17291 add r1d, r1d
17292 lea r4, [r1 * 3]
17293 lea r3, [ang_table_avx2 + 16 * 32]
17294
; Gather the extension samples: four consecutive 16-byte windows, shuffled
; alternately with pw_ang32_17_19_0 and shuf_mode_17_19 (both tables are
; defined outside this chunk).
17295 movu xm1, [r2 + 2]
17296 movu xm2, [r2 + 18]
17297 movu xm3, [r2 + 34]
17298 movu xm4, [r2 + 50]
17299 pshufb xm1, [pw_ang32_17_19_0]
17300 pshufb xm2, [shuf_mode_17_19]
17301 pshufb xm3, [pw_ang32_17_19_0]
17302 pshufb xm4, [shuf_mode_17_19]
17303
; The four 16-byte stores below overlap (they are 12 or 14 bytes apart), so
; each later store overwrites the low bytes of the previous one; their
; order must not be changed.
17304 movzx r6d, word [r2]
17305 mov [rsp + 2*mmsize], r6w
17306 movu [rsp + 48], xm1
17307 movu [rsp + 36], xm2
17308 movu [rsp + 22], xm3
17309 movu [rsp + 10], xm4
17310
; r6d = 0 is the flag the helper tests on entry. r7 = r0 + 8*r1 is kept to
; rebuild the destination pointer for the third call.
17311 xor r6d, r6d
17312 lea r2, [rsp + 2*mmsize]
17313 lea r7, [r0 + 8 * r1]
17314
17315 call ang32_mode_17_19_rows_0_15
17316
; Second call: step the reference pointer back by 26 bytes (13 samples).
17317 sub r2, 26
17318 lea r0, [r0 + 32]
17319
17320 call ang32_mode_17_19_rows_0_15
17321
; Third call: +58 after the earlier -26 leaves r2 32 bytes past its initial
; value; destination = r7 + 8*r1 (= entry r0 + 16*r1, provided the helper
; leaves r1/r7 alone).
17322 add r2, 58
17323 lea r0, [r7 + 8 * r1]
17324
17325 call ang32_mode_17_19_rows_0_15
17326
17327 sub r2, 26
17328 lea r0, [r0 + 32]
17329
17330 call ang32_mode_17_19_rows_0_15
17331
; Restore the caller's stack pointer saved in the prologue.
17332 mov rsp, [rsp+5*mmsize]
17333 RET
17334
;-----------------------------------------------------------------------------
; intra_pred_ang32_19 -- 32x32 angular intra prediction, mode 19 (per the symbol name).
; Same frame layout and helper calls as intra_pred_ang32_17, with the two source
; regions exchanged: the block copied verbatim comes from r2 + 0 and the shuffled
; groups come from r2 + 130. r6 is set to 1 instead of 0.
;   r0 = output pointer (handed to the helper), r1 = stride (doubled before use),
;   r2 = source reference array.
; NOTE(review): ang32_mode_17_19_rows_0_15 is defined outside this block; how it
; uses r3/r4/r6 and what it does to r0 should be confirmed there.
;-----------------------------------------------------------------------------
17335 cglobal intra_pred_ang32_19, 3,8,14
17336 mov r6, rsp ; remember the incoming rsp; restored before RET
17337 sub rsp, 5*mmsize+gprsize
17338 and rsp, ~63 ; align the scratch frame down to a 64-byte boundary
17339 mov [rsp+5*mmsize], r6 ; saved rsp sits directly above the 5*mmsize scratch area
17340
; copy 2 full vectors + 4 bytes starting at r2 to rsp + 2*mmsize
17341 movu m0, [r2]
17342 movu m1, [r2 + 32]
17343 movd xm2, [r2 + 64]
17344
17345 mova [rsp + 2*mmsize], m0
17346 mova [rsp + 3*mmsize], m1
17347 movd [rsp + 4*mmsize], xm2
17348
17349 add r1d, r1d ; r1 *= 2 (presumably pixel stride -> byte stride for 16-bit samples)
17350 lea r4, [r1 * 3]
17351 lea r3, [ang_table_avx2 + 16 * 32] ; entry 16 of ang_table_avx2 (32 bytes per entry)
17352
; four 16-byte groups from r2 + 130 are shuffled (shuffle masks are defined elsewhere in the file)
17353 movu xm1, [r2 + 130]
17354 movu xm2, [r2 + 146]
17355 movu xm3, [r2 + 162]
17356 movu xm4, [r2 + 178]
17357 pshufb xm1, [pw_ang32_17_19_0]
17358 pshufb xm2, [shuf_mode_17_19]
17359 pshufb xm3, [pw_ang32_17_19_0]
17360 pshufb xm4, [shuf_mode_17_19]
17361
; overlapping 16-byte stores: each later store overwrites the first bytes of the
; previous one, so their order must not change
17362 movu [rsp + 48], xm1
17363 movu [rsp + 36], xm2
17364 movu [rsp + 22], xm3
17365 movu [rsp + 10], xm4
17366
17367 xor r6d, r6d
17368 inc r6d ; r6 = 1 for this mode (intra_pred_ang32_17 passes 0)
17369 lea r2, [rsp + 2*mmsize] ; r2 now points into the stack copy
17370 lea r5, [r0 + 32] ; keep r0 + 32 bytes for the third call
17371
17372 call ang32_mode_17_19_rows_0_15
17373
17374 sub r2, 26
; r0 += 16 * r1, done as two steps of 8 * r1 (8 is the largest address scale factor)
17375 lea r0, [r0 + 8 * r1]
17376 lea r0, [r0 + 8 * r1]
17377
17378 call ang32_mode_17_19_rows_0_15
17379
17380 add r2, 58
17381 mov r0, r5 ; original r0 + 32
17382
17383 call ang32_mode_17_19_rows_0_15
17384
17385 sub r2, 26
17386 lea r0, [r0 + 8 * r1]
17387 lea r0, [r0 + 8 * r1]
17388
17389 call ang32_mode_17_19_rows_0_15
17390
17391 mov rsp, [rsp+5*mmsize] ; restore the caller's rsp
17392 RET
17393
;-----------------------------------------------------------------------------
; intra_pred_ang32_18 -- 32x32 angular intra prediction, mode 18 (per the symbol name).
;   r0 = destination, r1 = destination stride (doubled below and used as a byte offset),
;   r2 = source reference array.
; No interpolation is done: a 128-byte buffer is built on the stack and each
; output row is a 64-byte window of it. With r2 re-pointed at rsp + 2*mmsize,
; row k (k = 0..31) is the 64 bytes starting at r2 - 2*k.
; Stack layout (offsets assume mmsize == 32):
;   [rsp +  0, rsp +  64) : data from source r2 + 130 / r2 + 162 after pshufb with
;                           pw_swap16 and a swap of the 128-bit halves
;                           (presumably a full word-order reversal -- pw_swap16 is
;                           defined elsewhere, confirm)
;   [rsp + 64, rsp + 128) : 64 bytes copied from source r2 + 0
;   [rsp + 128]           : caller's rsp
; palignr on these registers works per 128-bit lane, which is why the second
; operand of each pair is loaded 16 bytes below the first.
;-----------------------------------------------------------------------------
17394 cglobal intra_pred_ang32_18, 3,6,6
17395 mov r4, rsp ; remember the incoming rsp; restored before RET
17396 sub rsp, 4*mmsize+gprsize
17397 and rsp, ~63 ; align the scratch frame down to a 64-byte boundary
17398 mov [rsp+4*mmsize], r4
17399
17400 movu m0, [r2]
17401 movu m1, [r2 + 32]
17402 mova [rsp + 2*mmsize], m0
17403 mova [rsp + 3*mmsize], m1
17404
17405 movu m2, [r2 + 130]
17406 movu m3, [r2 + 162]
17407 pshufb m2, [pw_swap16]
17408 pshufb m3, [pw_swap16]
17409 vpermq m2, m2, 01001110b ; swap the two 128-bit halves
17410 vpermq m3, m3, 01001110b
17411 mova [rsp + 1*mmsize], m2 ; data from r2 + 130 goes directly below the copied block
17412 mova [rsp + 0*mmsize], m3 ; data from r2 + 162 goes below that
17413
17414 add r1d, r1d ; r1 *= 2
17415 lea r2, [rsp+2*mmsize] ; r2 -> start of the copied block inside the stack buffer
17416 lea r4, [r1 * 2]
17417 lea r3, [r1 * 3]
17418 lea r5, [r1 * 4]
17419
17420 movu m0, [r2]
17421 movu m1, [r2 + 32]
17422 movu m2, [r2 - 16]
17423 movu m3, [r2 + 16]
17424
17425 movu [r0], m0 ; row 0
17426 movu [r0 + 32], m1
17427
17428 palignr m4, m0, m2, 14
17429 palignr m5, m1, m3, 14
17430 movu [r0 + r1], m4 ; row 1
17431 movu [r0 + r1 + 32], m5
17432
17433 palignr m4, m0, m2, 12
17434 palignr m5, m1, m3, 12
17435 movu [r0 + r4], m4 ; row 2
17436 movu [r0 + r4 + 32], m5
17437
17438 palignr m4, m0, m2, 10
17439 palignr m5, m1, m3, 10
17440 movu [r0 + r3], m4 ; row 3
17441 movu [r0 + r3 + 32], m5
17442
17443 add r0, r5 ; advance 4 rows
17444
17445 palignr m4, m0, m2, 8
17446 palignr m5, m1, m3, 8
17447 movu [r0], m4 ; row 4
17448 movu [r0 + 32], m5
17449
17450 palignr m4, m0, m2, 6
17451 palignr m5, m1, m3, 6
17452 movu [r0 + r1], m4 ; row 5
17453 movu [r0 + r1 + 32], m5
17454
17455 palignr m4, m0, m2, 4
17456 palignr m5, m1, m3, 4
17457 movu [r0 + r4], m4 ; row 6
17458 movu [r0 + r4 + 32], m5
17459
17460 palignr m4, m0, m2, 2
17461 palignr m5, m1, m3, 2
17462 movu [r0 + r3], m4 ; row 7
17463 movu [r0 + r3 + 32], m5
17464
17465 add r0, r5 ; advance 4 rows
17466
17467 movu [r0], m2 ; row 8 = 64 bytes at r2 - 16
17468 movu [r0 + 32], m3
17469
17470 movu m0, [r2 - 32]
17471 movu m1, [r2]
17472
17473 palignr m4, m2, m0, 14
17474 palignr m5, m3, m1, 14
17475 movu [r0 + r1], m4 ; row 9
17476 movu [r0 + r1 + 32], m5
17477
17478 palignr m4, m2, m0, 12
17479 palignr m5, m3, m1, 12
17480 movu [r0 + r4], m4 ; row 10
17481 movu [r0 + r4 + 32], m5
17482
17483 palignr m4, m2, m0, 10
17484 palignr m5, m3, m1, 10
17485 movu [r0 + r3], m4 ; row 11
17486 movu [r0 + r3 + 32], m5
17487
17488 add r0, r5 ; advance 4 rows
17489
17490 palignr m4, m2, m0, 8
17491 palignr m5, m3, m1, 8
17492 movu [r0], m4 ; row 12
17493 movu [r0 + 32], m5
17494
17495 palignr m4, m2, m0, 6
17496 palignr m5, m3, m1, 6
17497 movu [r0 + r1], m4 ; row 13
17498 movu [r0 + r1 + 32], m5
17499
17500 palignr m4, m2, m0, 4
17501 palignr m5, m3, m1, 4
17502 movu [r0 + r4], m4 ; row 14
17503 movu [r0 + r4 + 32], m5
17504
17505 palignr m4, m2, m0, 2
17506 palignr m5, m3, m1, 2
17507 movu [r0 + r3], m4 ; row 15
17508 movu [r0 + r3 + 32], m5
17509
17510 add r0, r5 ; advance 4 rows
17511
17512 movu [r0], m0 ; row 16 = 64 bytes at r2 - 32
17513 movu [r0 + 32], m1
17514
17515 movu m2, [r2 - 48]
17516 movu m3, [r2 - 16]
17517
17518 palignr m4, m0, m2, 14
17519 palignr m5, m1, m3, 14
17520 movu [r0 + r1], m4 ; row 17
17521 movu [r0 + r1 + 32], m5
17522
17523 palignr m4, m0, m2, 12
17524 palignr m5, m1, m3, 12
17525 movu [r0 + r4], m4 ; row 18
17526 movu [r0 + r4 + 32], m5
17527
17528 palignr m4, m0, m2, 10
17529 palignr m5, m1, m3, 10
17530 movu [r0 + r3], m4 ; row 19
17531 movu [r0 + r3 + 32], m5
17532
17533 add r0, r5 ; advance 4 rows
17534
17535 palignr m4, m0, m2, 8
17536 palignr m5, m1, m3, 8
17537 movu [r0], m4 ; row 20
17538 movu [r0 + 32], m5
17539
17540 palignr m4, m0, m2, 6
17541 palignr m5, m1, m3, 6
17542 movu [r0 + r1], m4 ; row 21
17543 movu [r0 + r1 + 32], m5
17544
17545 palignr m4, m0, m2, 4
17546 palignr m5, m1, m3, 4
17547 movu [r0 + r4], m4 ; row 22
17548 movu [r0 + r4 + 32], m5
17549
17550 palignr m4, m0, m2, 2
17551 palignr m5, m1, m3, 2
17552 movu [r0 + r3], m4 ; row 23
17553 movu [r0 + r3 + 32], m5
17554
17555 add r0, r5 ; advance 4 rows
17556
17557 movu [r0], m2 ; row 24 = 64 bytes at r2 - 48
17558 movu [r0 + 32], m3
17559
17560 movu m0, [r2 - 64] ; lowest address read: r2 - 64 == rsp
17561 movu m1, [r2 - 32]
17562
17563 palignr m4, m2, m0, 14
17564 palignr m5, m3, m1, 14
17565 movu [r0 + r1], m4 ; row 25
17566 movu [r0 + r1 + 32], m5
17567
17568 palignr m4, m2, m0, 12
17569 palignr m5, m3, m1, 12
17570 movu [r0 + r4], m4 ; row 26
17571 movu [r0 + r4 + 32], m5
17572
17573 palignr m4, m2, m0, 10
17574 palignr m5, m3, m1, 10
17575 movu [r0 + r3], m4 ; row 27
17576 movu [r0 + r3 + 32], m5
17577
17578 add r0, r5 ; advance 4 rows
17579
17580 palignr m4, m2, m0, 8
17581 palignr m5, m3, m1, 8
17582 movu [r0], m4 ; row 28
17583 movu [r0 + 32], m5
17584
17585 palignr m4, m2, m0, 6
17586 palignr m5, m3, m1, 6
17587 movu [r0 + r1], m4 ; row 29
17588 movu [r0 + r1 + 32], m5
17589
17590 palignr m4, m2, m0, 4
17591 palignr m5, m3, m1, 4
17592 movu [r0 + r4], m4 ; row 30
17593 movu [r0 + r4 + 32], m5
17594
17595 palignr m4, m2, m0, 2
17596 palignr m5, m3, m1, 2
17597 movu [r0 + r3], m4 ; row 31
17598 movu [r0 + r3 + 32], m5
17599
17600 mov rsp, [rsp+4*mmsize] ; restore the caller's rsp
17601 RET
17602 ;-------------------------------------------------------------------------------------------------------
17603 ; end of avx2 code for intra_pred_ang32 mode 2 to 34
17604 ;-------------------------------------------------------------------------------------------------------
17605
;-----------------------------------------------------------------------------
; MODE_2_34 (no parameters): emit 16 rows of 64 bytes each.
; Row k (k = 0..15) is the 64 bytes starting at r2 + 4 + 2*k, built with palignr
; from five (later six) consecutive 16-byte loads; the 16-byte steps mean the
; macro is written for 16-byte vector registers.
; In:  r0 = destination, r1 = row stride in bytes, r2 = source.
;      r3 and r4 address the 3rd and 4th row of each group of four, so the caller
;      is expected to have set r3 = 2*r1 and r4 = 3*r1 (set outside this macro --
;      confirm at the expansion sites).
; Out: r0 advanced by 16 rows (four steps of r1 * 4); r2 is not modified.
; Clobbers: m0-m5.
;-----------------------------------------------------------------------------
17606 %macro MODE_2_34 0
17607 movu m0, [r2 + 4]
17608 movu m1, [r2 + 20]
17609 movu m2, [r2 + 36]
17610 movu m3, [r2 + 52]
17611 movu m4, [r2 + 68]
; row 0
17612 movu [r0], m0
17613 movu [r0 + 16], m1
17614 movu [r0 + 32], m2
17615 movu [r0 + 48], m3
; row 1
17616 palignr m5, m1, m0, 2
17617 movu [r0 + r1], m5
17618 palignr m5, m2, m1, 2
17619 movu [r0 + r1 + 16], m5
17620 palignr m5, m3, m2, 2
17621 movu [r0 + r1 + 32], m5
17622 palignr m5, m4, m3, 2
17623 movu [r0 + r1 + 48], m5
; row 2
17624 palignr m5, m1, m0, 4
17625 movu [r0 + r3], m5
17626 palignr m5, m2, m1, 4
17627 movu [r0 + r3 + 16], m5
17628 palignr m5, m3, m2, 4
17629 movu [r0 + r3 + 32], m5
17630 palignr m5, m4, m3, 4
17631 movu [r0 + r3 + 48], m5
; row 3
17632 palignr m5, m1, m0, 6
17633 movu [r0 + r4], m5
17634 palignr m5, m2, m1, 6
17635 movu [r0 + r4 + 16], m5
17636 palignr m5, m3, m2, 6
17637 movu [r0 + r4 + 32], m5
17638 palignr m5, m4, m3, 6
17639 movu [r0 + r4 + 48], m5
17640 lea r0, [r0 + r1 * 4]
; row 4
17641 palignr m5, m1, m0, 8
17642 movu [r0], m5
17643 palignr m5, m2, m1, 8
17644 movu [r0 + 16], m5
17645 palignr m5, m3, m2, 8
17646 movu [r0 + 32], m5
17647 palignr m5, m4, m3, 8
17648 movu [r0 + 48], m5
; row 5
17649 palignr m5, m1, m0, 10
17650 movu [r0 + r1], m5
17651 palignr m5, m2, m1, 10
17652 movu [r0 + r1 + 16], m5
17653 palignr m5, m3, m2, 10
17654 movu [r0 + r1 + 32], m5
17655 palignr m5, m4, m3, 10
17656 movu [r0 + r1 + 48], m5
; row 6
17657 palignr m5, m1, m0, 12
17658 movu [r0 + r3], m5
17659 palignr m5, m2, m1, 12
17660 movu [r0 + r3 + 16], m5
17661 palignr m5, m3, m2, 12
17662 movu [r0 + r3 + 32], m5
17663 palignr m5, m4, m3, 12
17664 movu [r0 + r3 + 48], m5
; row 7
17665 palignr m5, m1, m0, 14
17666 movu [r0 + r4], m5
17667 palignr m5, m2, m1, 14
17668 movu [r0 + r4 + 16], m5
17669 palignr m5, m3, m2, 14
17670 movu [r0 + r4 + 32], m5
17671 palignr m5, m4, m3, 14
17672 movu [r0 + r4 + 48], m5
17673 lea r0, [r0 + r1 * 4]
; rows 8-15: the window has moved on by 16 bytes, so m0 is reloaded with the
; next 16 bytes and the register sequence becomes m1, m2, m3, m4, m0
17674 movu m0, [r2 + 84]
; row 8
17675 movu [r0], m1
17676 movu [r0 + 16], m2
17677 movu [r0 + 32], m3
17678 movu [r0 + 48], m4
; row 9
17679 palignr m5, m2, m1, 2
17680 movu [r0 + r1], m5
17681 palignr m5, m3, m2, 2
17682 movu [r0 + r1 + 16], m5
17683 palignr m5, m4, m3, 2
17684 movu [r0 + r1 + 32], m5
17685 palignr m5, m0, m4, 2
17686 movu [r0 + r1 + 48], m5
; row 10
17687 palignr m5, m2, m1, 4
17688 movu [r0 + r3], m5
17689 palignr m5, m3, m2, 4
17690 movu [r0 + r3 + 16], m5
17691 palignr m5, m4, m3, 4
17692 movu [r0 + r3 + 32], m5
17693 palignr m5, m0, m4, 4
17694 movu [r0 + r3 + 48], m5
; row 11
17695 palignr m5, m2, m1, 6
17696 movu [r0 + r4], m5
17697 palignr m5, m3, m2, 6
17698 movu [r0 + r4 + 16], m5
17699 palignr m5, m4, m3, 6
17700 movu [r0 + r4 + 32], m5
17701 palignr m5, m0, m4, 6
17702 movu [r0 + r4 + 48], m5
17703 lea r0, [r0 + r1 * 4]
; row 12
17704 palignr m5, m2, m1, 8
17705 movu [r0], m5
17706 palignr m5, m3, m2, 8
17707 movu [r0 + 16], m5
17708 palignr m5, m4, m3, 8
17709 movu [r0 + 32], m5
17710 palignr m5, m0, m4, 8
17711 movu [r0 + 48], m5
; row 13
17712 palignr m5, m2, m1, 10
17713 movu [r0 + r1], m5
17714 palignr m5, m3, m2, 10
17715 movu [r0 + r1 + 16], m5
17716 palignr m5, m4, m3, 10
17717 movu [r0 + r1 + 32], m5
17718 palignr m5, m0, m4, 10
17719 movu [r0 + r1 + 48], m5
; row 14
17720 palignr m5, m2, m1, 12
17721 movu [r0 + r3], m5
17722 palignr m5, m3, m2, 12
17723 movu [r0 + r3 + 16], m5
17724 palignr m5, m4, m3, 12
17725 movu [r0 + r3 + 32], m5
17726 palignr m5, m0, m4, 12
17727 movu [r0 + r3 + 48], m5
; row 15
17728 palignr m5, m2, m1, 14
17729 movu [r0 + r4], m5
17730 palignr m5, m3, m2, 14
17731 movu [r0 + r4 + 16], m5
17732 palignr m5, m4, m3, 14
17733 movu [r0 + r4 + 32], m5
17734 palignr m5, m0, m4, 14
17735 movu [r0 + r4 + 48], m5
17736 lea r0, [r0 + r1 * 4]
17737 %endmacro
17738
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_8x8 offset, transpose, ra, rb, rc, rd
;   %1      byte offset added to every destination address (used by the
;           transposing path only)
;   %2      1 = transpose before storing, anything else = store as-is
;   %3-%6   four 16-byte registers, each holding two groups of four words
;           (low half = one predicted line, high half = the next), i.e. 8 lines
;           of 4 samples in total
; Transposing path (%2 == 1): the 8x4 words are transposed into four rows of
;   eight words, written to [r0 + %1], [r0 + r1 + %1], [r0 + r1*2 + %1] and
;   [r0 + r5 + %1]; r0 is left unchanged. m0 is used as scratch and %3-%6 are
;   overwritten.
; Direct path: each 8-byte half is written to its own row (eight rows of four
;   words) and r0 is advanced by 8 * r1; %1 is ignored.
; r1 = row stride in bytes; r5 addresses the 4th row of each group, so the caller
; is expected to keep r5 = 3 * r1 (set outside this macro -- confirm).
;-----------------------------------------------------------------------------
17739 %macro TRANSPOSE_STORE_8x8 6
17740 %if %2 == 1
17741 ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
; step 1: interleave words of %3/%4 -> %3 = samples 0,1 of lines 0-3, %4 = samples 2,3
17742 punpckhwd m0, %3, %4
17743 punpcklwd %3, %4
17744 punpckhwd %4, %3, m0
17745 punpcklwd %3, m0
17746
; step 2: same for %5/%6 (lines 4-7)
17747 punpckhwd m0, %5, %6
17748 punpcklwd %5, %6
17749 punpckhwd %6, %5, m0
17750 punpcklwd %5, m0
17751
; step 3: join the 64-bit halves -> %3 = row 0, m0 = row 1, %5 = row 2, %4 = row 3
17752 punpckhqdq m0, %3, %5
17753 punpcklqdq %3, %5
17754 punpcklqdq %5, %4, %6
17755 punpckhqdq %4, %6
17756
17757 movu [r0 + %1], %3
17758 movu [r0 + r1 + %1], m0
17759 movu [r0 + r1 * 2 + %1], %5
17760 movu [r0 + r5 + %1], %4
17761 %else
17762 ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32
17763 movh [r0], %3
17764 movhps [r0 + r1], %3
17765 movh [r0 + r1 * 2], %4
17766 movhps [r0 + r5], %4
17767 lea r0, [r0 + r1 * 4]
17768 movh [r0], %5
17769 movhps [r0 + r1], %5
17770 movh [r0 + r1 * 2], %6
17771 movhps [r0 + r5], %6
17772 lea r0, [r0 + r1 * 4]
17773 %endif
17774 %endmacro
17775
;-----------------------------------------------------------------------------
; MODE_3_33 transpose
;   %1 is forwarded as the "transpose" flag of TRANSPOSE_STORE_8x8.
; Computes 32 predicted lines of 4 samples each. Every line is
;   (pmaddwd(interleaved neighbour pairs, table entry) + [pd_16]) >> 5
; where a table entry holds the word pairs (32 - f, f). The bracketed number
; after each pmaddwd is the fraction f selected for that line; the sequence is
; 26, 20, 14, 8, 2, 28, ... i.e. (26 * (line + 1)) mod 32. Lines with f = 0 copy
; the source samples directly (movhps ... ; [00]).
; In:  r0 = destination, r1 = stride in bytes, r2 = source,
;      r3 = address of the f = 16 table entry (16 bytes per entry), so entry f is
;      at r3 + (f - 16) * 16 -- set by the caller, confirm at the expansion sites;
;      r5 = used by TRANSPOSE_STORE_8x8 (presumably 3 * r1).
; Results are handed over 8 lines at a time with byte offsets 0, 16, 32, 48.
; Clobbers: m0-m7. r0 is advanced only by the non-transposing store path.
; The element lists in the lane comments are written highest lane first.
;-----------------------------------------------------------------------------
17776 %macro MODE_3_33 1
17777 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
17778 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
17779 mova m7, m0
17780
17781 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
17782 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2
17783 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0
17784
17785 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1
17786 pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
17787 paddd m4, [pd_16]
17788 psrld m4, 5
17789
17790 pmaddwd m5, m1, [r3 + 4 * 16] ; [20]
17791 paddd m5, [pd_16]
17792 psrld m5, 5
17793 packusdw m4, m5
17794
17795 palignr m5, m2, m0, 8
17796 pmaddwd m5, [r3 - 2 * 16] ; [14]
17797 paddd m5, [pd_16]
17798 psrld m5, 5
17799
17800 palignr m6, m2, m0, 12
17801 pmaddwd m6, [r3 - 8 * 16] ; [ 8]
17802 paddd m6, [pd_16]
17803 psrld m6, 5
17804 packusdw m5, m6
17805
17806 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
17807 paddd m6, [pd_16]
17808 psrld m6, 5
17809
17810 pmaddwd m1, m2, [r3 + 12 * 16] ; [28]
17811 paddd m1, [pd_16]
17812 psrld m1, 5
17813 packusdw m6, m1
17814
17815 palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
17816 pmaddwd m1, m0, [r3 + 6 * 16] ; [22]
17817 paddd m1, [pd_16]
17818 psrld m1, 5
17819
17820 psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10]
17821 palignr m2, m0, 4 ;[11 10 10 9 9 8 8 7]
17822
17823 pmaddwd m2, [r3] ; [16]
17824 paddd m2, [pd_16]
17825 psrld m2, 5
17826 packusdw m1, m2
17827
; lines 0-7
17828 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
17829
17830 palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8]
17831 movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16]
17832 palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9]
17833 punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12]
17834 punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8]
17835
17836 palignr m5, m7, m0, 4 ; [13 12 12 11 11 10 10 9]
17837 pmaddwd m4, m0, [r3 - 6 * 16] ; [10]
17838 paddd m4, [pd_16]
17839 psrld m4, 5
17840
17841 pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
17842 paddd m1, [pd_16]
17843 psrld m1, 5
17844 packusdw m4, m1
17845
17846 pmaddwd m5, [r3 + 14 * 16] ; [30]
17847 paddd m5, [pd_16]
17848 psrld m5, 5
17849
17850 palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10]
17851 pmaddwd m6, [r3 + 8 * 16] ; [24]
17852 paddd m6, [pd_16]
17853 psrld m6, 5
17854 packusdw m5, m6
17855
17856 palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11]
17857 pmaddwd m6, m1, [r3 + 2 * 16] ; [18]
17858 paddd m6, [pd_16]
17859 psrld m6, 5
17860
17861 pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
17862 paddd m1, [pd_16]
17863 psrld m1, 5
17864 packusdw m6, m1
17865
17866 palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13]
17867 pmaddwd m1, m2, [r3 - 10 * 16] ; [6]
17868 paddd m1, [pd_16]
17869 psrld m1, 5
17870
17871 packusdw m1, m1
17872 movhps m1, [r2 + 28] ; [00]
17873
; lines 8-15
17874 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
17875
17876 movu m0, [r2 + 28] ; [35 34 33 32 31 30 29 28]
17877 palignr m1, m0, 2 ; [ x 35 34 33 32 31 30 29]
17878 punpckhwd m2, m0, m1 ; [ x 35 35 34 34 33 33 32]
17879 punpcklwd m0, m1 ; [32 31 31 30 30 29 29 28]
17880
17881 pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
17882 paddd m4, [pd_16]
17883 psrld m4, 5
17884
17885 palignr m1, m2, m0, 4 ; [33 32 32 31 31 30 30 29]
17886 pmaddwd m1, [r3 + 4 * 16] ; [20]
17887 paddd m1, [pd_16]
17888 psrld m1, 5
17889 packusdw m4, m1
17890
17891 palignr m5, m2, m0, 8 ; [34 33 33 32 32 31 31 30]
17892 pmaddwd m5, [r3 - 2 * 16] ; [14]
17893 paddd m5, [pd_16]
17894 psrld m5, 5
17895
17896 palignr m6, m2, m0, 12 ; [35 34 34 33 33 32 32 31]
17897 pmaddwd m6, [r3 - 8 * 16] ; [ 8]
17898 paddd m6, [pd_16]
17899 psrld m6, 5
17900 packusdw m5, m6
17901
17902 pinsrw m2, [r2 + 44], 7 ; [36 35 35 34 34 33 33 32] - fills the undefined top word (x) with the next sample
17903 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
17904 paddd m6, [pd_16]
17905 psrld m6, 5
17906
17907 pmaddwd m2, [r3 + 12 * 16] ; [28]
17908 paddd m2, [pd_16]
17909 psrld m2, 5
17910 packusdw m6, m2
17911
17912 movu m3, [r2 + 38] ; [45 44 43 42 41 40 39 38]
17913 palignr m1, m3, 2 ; [ x 45 44 43 42 41 40 39]
17914 punpckhwd m2, m3, m1 ; [ x 45 45 44 44 43 43 42]
17915 punpcklwd m3, m1 ; [42 41 41 40 40 39 39 38]
17916
17917 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
17918 paddd m1, [pd_16]
17919 psrld m1, 5
17920
17921 palignr m0, m2, m3, 4
17922 pmaddwd m0, [r3] ; [16]
17923 paddd m0, [pd_16]
17924 psrld m0, 5
17925 packusdw m1, m0
17926
; lines 16-23
17927 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
17928
17929 palignr m5, m2, m3, 8
17930 pmaddwd m4, m5, [r3 - 6 * 16] ; [10]
17931 paddd m4, [pd_16]
17932 psrld m4, 5
17933
17934 palignr m5, m2, m3, 12
17935 pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
17936 paddd m1, [pd_16]
17937 psrld m1, 5
17938 packusdw m4, m1
17939
17940 pmaddwd m5, [r3 + 14 * 16] ; [30]
17941 paddd m5, [pd_16]
17942 psrld m5, 5
17943
17944 movu m3, [r2 + 46]
17945 palignr m1, m3, 2
17946 punpckhwd m2, m3, m1
17947 punpcklwd m3, m1
17948
17949 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
17950 paddd m6, [pd_16]
17951 psrld m6, 5
17952 packusdw m5, m6
17953
17954 palignr m6, m2, m3, 4
17955 pmaddwd m6, [r3 + 2 * 16] ; [18]
17956 paddd m6, [pd_16]
17957 psrld m6, 5
17958
17959 palignr m1, m2, m3, 8
17960 pmaddwd m1, [r3 - 4 * 16] ; [12]
17961 paddd m1, [pd_16]
17962 psrld m1, 5
17963 packusdw m6, m1
17964
17965 palignr m1, m2, m3, 12
17966 pmaddwd m1, [r3 - 10 * 16] ; [06]
17967 paddd m1, [pd_16]
17968 psrld m1, 5
17969
17970 packusdw m1, m1
17971 movhps m1, [r2 + 54] ; [00]
17972
; lines 24-31
17973 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
17974 %endmacro
17975
;-----------------------------------------------------------------------------
; MODE_4_32 transpose
;   %1 is forwarded as the "transpose" flag of TRANSPOSE_STORE_8x8.
; Computes 32 predicted lines of 4 samples each. Every line is
;   (pmaddwd(interleaved neighbour pairs, table entry) + [pd_16]) >> 5
; The bracketed number after each pmaddwd is the fraction f selected for that
; line; the sequence is 21, 10, 31, 20, 9, ... i.e. (21 * (line + 1)) mod 32.
; The line with f = 0 copies the source samples directly (movhps ... ; [00]).
; In:  r0 = destination, r1 = stride in bytes, r2 = source,
;      r3 = address of the f = 16 table entry (16 bytes per entry) -- set by the
;      caller, confirm at the expansion sites;
;      r5 = used by TRANSPOSE_STORE_8x8 (presumably 3 * r1).
; Results are handed over 8 lines at a time with byte offsets 0, 16, 32, 48.
; Clobbers: m0-m7. r0 is advanced only by the non-transposing store path.
; The element lists in the lane comments are written highest lane first.
;-----------------------------------------------------------------------------
17976 %macro MODE_4_32 1
17977 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
17978 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
17979 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
17980 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
17981 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
17982
17983 pmaddwd m4, m0, [r3 + 5 * 16] ; [21]
17984 paddd m4, [pd_16]
17985 psrld m4, 5
17986
17987 palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
17988 pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
17989 paddd m1, [pd_16]
17990 psrld m1, 5
17991 packusdw m4, m1
17992
17993 pmaddwd m5, [r3 + 15 * 16] ; [31]
17994 paddd m5, [pd_16]
17995 psrld m5, 5
17996
17997 palignr m6, m2, m0, 8
17998 pmaddwd m6, [r3 + 4 * 16] ; [ 20]
17999 paddd m6, [pd_16]
18000 psrld m6, 5
18001 packusdw m5, m6
18002
18003 palignr m1, m2, m0, 12
18004 pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9]
18005 paddd m6, [pd_16]
18006 psrld m6, 5
18007
18008 pmaddwd m1, [r3 + 14 * 16] ; [30]
18009 paddd m1, [pd_16]
18010 psrld m1, 5
18011 packusdw m6, m1
18012
18013 pmaddwd m1, m2, [r3 + 3 * 16] ; [19]
18014 paddd m1, [pd_16]
18015 psrld m1, 5
18016
18017 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
18018 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
18019 paddd m0, [pd_16]
18020 psrld m0, 5
18021 packusdw m1, m0
18022
; lines 0-7
18023 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
18024
18025 pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
18026 paddd m4, [pd_16]
18027 psrld m4, 5
18028
18029 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
18030
18031 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
18032 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
18033 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
18034 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
18035
18036 palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 8 7]
18037 pmaddwd m1, [r3 + 2 * 16] ; [18]
18038 paddd m1, [pd_16]
18039 psrld m1, 5
18040 packusdw m4, m1
18041
18042 palignr m5, m2, m7, 8
18043 mova m6, m5
18044 pmaddwd m5, [r3 - 9 * 16] ; [07]
18045 paddd m5, [pd_16]
18046 psrld m5, 5
18047
18048 pmaddwd m6, [r3 + 12 * 16] ; [28]
18049 paddd m6, [pd_16]
18050 psrld m6, 5
18051 packusdw m5, m6
18052
18053 palignr m6, m2, m7, 12
18054 pmaddwd m6, [r3 + 16] ; [17]
18055 paddd m6, [pd_16]
18056 psrld m6, 5
18057
18058 pmaddwd m1, m2, [r3 - 10 * 16] ; [06]
18059 paddd m1, [pd_16]
18060 psrld m1, 5
18061 packusdw m6, m1
18062
18063 pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
18064 paddd m1, [pd_16]
18065 psrld m1, 5
18066
18067 palignr m7, m3, m2, 4
18068 pmaddwd m7, [r3] ; [16]
18069 paddd m7, [pd_16]
18070 psrld m7, 5
18071 packusdw m1, m7
18072 mova m7, m0 ; keep the samples loaded from r2 + 34 for the next reload step
18073
; lines 8-15
18074 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
18075
18076 palignr m0, m3, m2, 8
18077 pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
18078 paddd m4, [pd_16]
18079 psrld m4, 5
18080
18081 pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
18082 paddd m1, [pd_16]
18083 psrld m1, 5
18084 packusdw m4, m1
18085
18086 palignr m5, m3, m2, 12
18087 pmaddwd m5, [r3 - 16] ; [15]
18088 paddd m5, [pd_16]
18089 psrld m5, 5
18090
18091 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
18092 paddd m1, [pd_16]
18093 psrld m1, 5
18094 packusdw m5, m1
18095
18096 pmaddwd m6, m3, [r3 + 9 * 16] ; [25]
18097 paddd m6, [pd_16]
18098 psrld m6, 5
18099
18100 movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25]
18101 palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18]
18102 palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19]
18103 punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22]
18104 punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18]
18105
18106 palignr m1, m2, m3, 4
18107 pmaddwd m1, [r3 - 2 * 16] ; [14]
18108 paddd m1, [pd_16]
18109 psrld m1, 5
18110 packusdw m6, m1
18111
18112 palignr m1, m2, m3, 8
18113 mova m0, m1
18114 pmaddwd m1, [r3 - 13 * 16] ; [3]
18115 paddd m1, [pd_16]
18116 psrld m1, 5
18117
18118 pmaddwd m0, [r3 + 8 * 16] ; [24]
18119 paddd m0, [pd_16]
18120 psrld m0, 5
18121 packusdw m1, m0
18122
; lines 16-23
18123 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
18124
18125 palignr m4, m2, m3, 12
18126 pmaddwd m4, [r3 - 3 * 16] ; [13]
18127 paddd m4, [pd_16]
18128 psrld m4, 5
18129
18130 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
18131 paddd m1, [pd_16]
18132 psrld m1, 5
18133 packusdw m4, m1
18134
18135 pmaddwd m5, m2, [r3 + 7 * 16] ; [23]
18136 paddd m5, [pd_16]
18137 psrld m5, 5
18138
18139 palignr m6, m7, m2, 4
18140 pmaddwd m6, [r3 - 4 * 16] ; [12]
18141 paddd m6, [pd_16]
18142 psrld m6, 5
18143 packusdw m5, m6
18144
18145 palignr m1, m7, m2, 8
18146 pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
18147 paddd m6, [pd_16]
18148 psrld m6, 5
18149
18150 pmaddwd m1, [r3 + 6 * 16] ; [22]
18151 paddd m1, [pd_16]
18152 psrld m1, 5
18153 packusdw m6, m1
18154
18155 palignr m1, m7, m2, 12
18156 pmaddwd m1, [r3 - 5 * 16] ; [11]
18157 paddd m1, [pd_16]
18158 psrld m1, 5
18159 packusdw m1, m1
18160 movhps m1, [r2 + 44] ; [00]
18161
; lines 24-31
18162 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
18163 %endmacro
18164
;-----------------------------------------------------------------------------
; MODE_5_31 transpose
;   %1 is forwarded as the "transpose" flag of TRANSPOSE_STORE_8x8.
; Computes 32 predicted lines of 4 samples each. Every line is
;   (pmaddwd(interleaved neighbour pairs, table entry) + [pd_16]) >> 5
; The bracketed number after each pmaddwd is the fraction f selected for that
; line; the sequence is 17, 2, 19, 4, 21, ... i.e. (17 * (line + 1)) mod 32.
; The line with f = 0 copies the source samples directly (movhps ... ; [00]).
; In:  r0 = destination, r1 = stride in bytes, r2 = source,
;      r3 = address of the f = 16 table entry (16 bytes per entry) -- set by the
;      caller, confirm at the expansion sites;
;      r5 = used by TRANSPOSE_STORE_8x8 (presumably 3 * r1).
; Results are handed over 8 lines at a time with byte offsets 0, 16, 32, 48.
; Clobbers: m0-m7. r0 is advanced only by the non-transposing store path.
; The element lists in the lane comments are written highest lane first.
;-----------------------------------------------------------------------------
18165 %macro MODE_5_31 1
18166 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
18167 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
18168 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
18169 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
18170 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
18171
18172 pmaddwd m4, m0, [r3 + 16] ; [17]
18173 paddd m4, [pd_16]
18174 psrld m4, 5
18175
18176 palignr m1, m2, m0, 4
18177 mova m5, m1
18178 pmaddwd m1, [r3 - 14 * 16] ; [2]
18179 paddd m1, [pd_16]
18180 psrld m1, 5
18181 packusdw m4, m1
18182
18183 pmaddwd m5, [r3 + 3 * 16] ; [19]
18184 paddd m5, [pd_16]
18185 psrld m5, 5
18186
18187 palignr m6, m2, m0, 8
18188 mova m1, m6
18189 pmaddwd m6, [r3 - 12 * 16] ; [4]
18190 paddd m6, [pd_16]
18191 psrld m6, 5
18192 packusdw m5, m6
18193
18194 pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
18195 paddd m6, [pd_16]
18196 psrld m6, 5
18197
18198 palignr m1, m2, m0, 12
18199 mova m7, m1
18200 pmaddwd m7, [r3 - 10 * 16] ; [6]
18201 paddd m7, [pd_16]
18202 psrld m7, 5
18203 packusdw m6, m7
18204
18205 pmaddwd m1, [r3 + 7 * 16] ; [23]
18206 paddd m1, [pd_16]
18207 psrld m1, 5
18208
18209 pmaddwd m7, m2, [r3 - 8 * 16] ; [8]
18210 paddd m7, [pd_16]
18211 psrld m7, 5
18212 packusdw m1, m7
18213
; lines 0-7
18214 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
18215
18216 pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
18217 paddd m4, [pd_16]
18218 psrld m4, 5
18219
18220 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
18221 pmaddwd m1, m7, [r3 - 6 * 16] ; [10]
18222 paddd m1, [pd_16]
18223 psrld m1, 5
18224 packusdw m4, m1
18225
18226 pmaddwd m5, m7, [r3 + 11 * 16] ; [27]
18227 paddd m5, [pd_16]
18228 psrld m5, 5
18229
18230 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
18231 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
18232 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
18233 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
18234 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
18235
18236 palignr m6, m2, m7, 4
18237 pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
18238 paddd m1, [pd_16]
18239 psrld m1, 5
18240 packusdw m5, m1
18241
18242 pmaddwd m6, [r3 + 13 * 16] ; [29]
18243 paddd m6, [pd_16]
18244 psrld m6, 5
18245
18246 palignr m1, m2, m7, 8
18247 mova m0, m1
18248 pmaddwd m1, [r3 - 2 * 16] ; [14]
18249 paddd m1, [pd_16]
18250 psrld m1, 5
18251 packusdw m6, m1
18252
18253 pmaddwd m1, m0, [r3 + 15 * 16] ; [31]
18254 paddd m1, [pd_16]
18255 psrld m1, 5
18256
18257 palignr m0, m2, m7, 12
18258 pmaddwd m0, [r3] ; [16]
18259 paddd m0, [pd_16]
18260 psrld m0, 5
18261 packusdw m1, m0
18262
; lines 8-15
18263 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
18264
18265 pmaddwd m4, m2, [r3 - 15 * 16] ; [1]
18266 paddd m4, [pd_16]
18267 psrld m4, 5
18268
18269 pmaddwd m1, m2, [r3 + 2 * 16] ; [18]
18270 paddd m1, [pd_16]
18271 psrld m1, 5
18272 packusdw m4, m1
18273
18274 palignr m1, m3, m2, 4
18275 pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
18276 paddd m5, [pd_16]
18277 psrld m5, 5
18278
18279 pmaddwd m1, [r3 + 4 * 16] ; [20]
18280 paddd m1, [pd_16]
18281 psrld m1, 5
18282 packusdw m5, m1
18283
18284 palignr m1, m3, m2, 8
18285 pmaddwd m6, m1, [r3 - 11 * 16] ; [5]
18286 paddd m6, [pd_16]
18287 psrld m6, 5
18288
18289 pmaddwd m1, [r3 + 6 * 16] ; [22]
18290 paddd m1, [pd_16]
18291 psrld m1, 5
18292 packusdw m6, m1
18293
18294 palignr m7, m3, m2, 12
18295 pmaddwd m1, m7, [r3 - 9 * 16] ; [7]
18296 paddd m1, [pd_16]
18297 psrld m1, 5
18298
18299 pmaddwd m7, [r3 + 8 * 16] ; [24]
18300 paddd m7, [pd_16]
18301 psrld m7, 5
18302 packusdw m1, m7
18303
; lines 16-23
18304 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
18305
18306 pmaddwd m4, m3, [r3 - 7 * 16] ; [9]
18307 paddd m4, [pd_16]
18308 psrld m4, 5
18309
18310 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
18311 paddd m1, [pd_16]
18312 psrld m1, 5
18313 packusdw m4, m1
18314
18315 movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18]
18316 palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19]
18317 punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18]
18318
18319 palignr m1, m0, m3, 4
18320 pmaddwd m5, m1, [r3 - 5 * 16] ; [11]
18321 paddd m5, [pd_16]
18322 psrld m5, 5
18323
18324 pmaddwd m1, [r3 + 12 * 16] ; [28]
18325 paddd m1, [pd_16]
18326 psrld m1, 5
18327 packusdw m5, m1
18328
18329 palignr m1, m0, m3, 8
18330 pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
18331 paddd m6, [pd_16]
18332 psrld m6, 5
18333
18334 pmaddwd m1, [r3 + 14 * 16] ; [30]
18335 paddd m1, [pd_16]
18336 psrld m1, 5
18337 packusdw m6, m1
18338
18339 palignr m1, m0, m3, 12
18340 pmaddwd m1, [r3 - 16] ; [15]
18341 paddd m1, [pd_16]
18342 psrld m1, 5
18343 packusdw m1, m1
18344 movhps m1, [r2 + 36] ; [00]
18345
; lines 24-31
18346 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
18347 %endmacro
18348
;-----------------------------------------------------------------------------
; MODE_6_30 transpose
;   %1 is forwarded as the "transpose" flag of TRANSPOSE_STORE_8x8.
; Computes 32 predicted lines of 4 samples each. Every line is
;   (pmaddwd(interleaved neighbour pairs, table entry) + [pd_16]) >> 5
; The bracketed number after each pmaddwd is the fraction f selected for that
; line; the sequence is 13, 26, 7, 20, 1, ... i.e. (13 * (line + 1)) mod 32.
; The line with f = 0 copies the source samples directly (movhps ... ; [00]).
; In:  r0 = destination, r1 = stride in bytes, r2 = source,
;      r3 = address of the f = 16 table entry (16 bytes per entry) -- set by the
;      caller, confirm at the expansion sites;
;      r5 = used by TRANSPOSE_STORE_8x8 (presumably 3 * r1).
; Results are handed over 8 lines at a time with byte offsets 0, 16, 32, 48.
; Clobbers: m0-m7. r0 is advanced only by the non-transposing store path.
; The element lists in the lane comments are written highest lane first.
;-----------------------------------------------------------------------------
18349 %macro MODE_6_30 1
18350 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
18351 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
18352 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
18353 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
18354 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
18355
18356 pmaddwd m4, m0, [r3 - 3 * 16] ; [13]
18357 paddd m4, [pd_16]
18358 psrld m4, 5
18359
18360 pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
18361 paddd m1, [pd_16]
18362 psrld m1, 5
18363 packusdw m4, m1
18364
18365 palignr m1, m2, m0, 4
18366 pmaddwd m5, m1, [r3 - 9 * 16] ; [7]
18367 paddd m5, [pd_16]
18368 psrld m5, 5
18369
18370 pmaddwd m1, [r3 + 4 * 16] ; [20]
18371 paddd m1, [pd_16]
18372 psrld m1, 5
18373 packusdw m5, m1
18374
18375 palignr m1, m2, m0, 8
18376 pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
18377 paddd m6, [pd_16]
18378 psrld m6, 5
18379
18380 pmaddwd m7, m1, [r3 - 2 * 16] ; [14]
18381 paddd m7, [pd_16]
18382 psrld m7, 5
18383 packusdw m6, m7
18384
18385 pmaddwd m1, [r3 + 11 * 16] ; [27]
18386 paddd m1, [pd_16]
18387 psrld m1, 5
18388
18389 palignr m7, m2, m0, 12
18390 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
18391 paddd m0, [pd_16]
18392 psrld m0, 5
18393 packusdw m1, m0
18394
; lines 0-7
18395 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
18396
18397 pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
18398 paddd m4, [pd_16]
18399 psrld m4, 5
18400
18401 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
18402 paddd m1, [pd_16]
18403 psrld m1, 5
18404 packusdw m4, m1
18405
18406 pmaddwd m5, m2, [r3 - 16] ; [15]
18407 paddd m5, [pd_16]
18408 psrld m5, 5
18409
18410 pmaddwd m6, m2, [r3 + 12 * 16] ; [28]
18411 paddd m6, [pd_16]
18412 psrld m6, 5
18413 packusdw m5, m6
18414
18415 palignr m7, m3, m2, 4
18416 pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
18417 paddd m6, [pd_16]
18418 psrld m6, 5
18419
18420 pmaddwd m1, m7, [r3 + 6 * 16] ; [22]
18421 paddd m1, [pd_16]
18422 psrld m1, 5
18423 packusdw m6, m1
18424
18425 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
18426 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
18427 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
18428 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
18429 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
18430
18431 palignr m0, m2, m7, 4
18432 pmaddwd m1, m0, [r3 - 13 * 16] ; [3]
18433 paddd m1, [pd_16]
18434 psrld m1, 5
18435
18436 pmaddwd m0, [r3] ; [16]
18437 paddd m0, [pd_16]
18438 psrld m0, 5
18439 packusdw m1, m0
18440
; lines 8-15
18441 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
18442
18443 palignr m4, m2, m7, 4
18444 pmaddwd m4, [r3 + 13 * 16] ; [29]
18445 paddd m4, [pd_16]
18446 psrld m4, 5
18447
18448 palignr m5, m2, m7, 8
18449 pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
18450 paddd m1, [pd_16]
18451 psrld m1, 5
18452 packusdw m4, m1
18453
18454 pmaddwd m5, [r3 + 7 * 16] ; [23]
18455 paddd m5, [pd_16]
18456 psrld m5, 5
18457
18458 palignr m1, m2, m7, 12
18459 pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
18460 paddd m6, [pd_16]
18461 psrld m6, 5
18462 packusdw m5, m6
18463
18464 pmaddwd m6, m1, [r3 + 16] ; [17]
18465 paddd m6, [pd_16]
18466 psrld m6, 5
18467
18468 pmaddwd m1, [r3 + 14 * 16] ; [30]
18469 paddd m1, [pd_16]
18470 psrld m1, 5
18471 packusdw m6, m1
18472
18473 pmaddwd m1, m2, [r3 - 5 * 16] ; [11]
18474 paddd m1, [pd_16]
18475 psrld m1, 5
18476
18477 pmaddwd m0, m2, [r3 + 8 * 16] ; [24]
18478 paddd m0, [pd_16]
18479 psrld m0, 5
18480 packusdw m1, m0
18481
; lines 16-23
18482 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
18483
18484 palignr m5, m3, m2, 4
18485 pmaddwd m4, m5, [r3 - 11 * 16] ; [5]
18486 paddd m4, [pd_16]
18487 psrld m4, 5
18488
18489 pmaddwd m1, m5, [r3 + 2 * 16] ; [18]
18490 paddd m1, [pd_16]
18491 psrld m1, 5
18492 packusdw m4, m1
18493
18494 pmaddwd m5, [r3 + 15 * 16] ; [31]
18495 paddd m5, [pd_16]
18496 psrld m5, 5
18497
18498 palignr m6, m3, m2, 8
18499 pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
18500 paddd m1, [pd_16]
18501 psrld m1, 5
18502 packusdw m5, m1
18503
18504 pmaddwd m6, [r3 + 9 * 16] ; [25]
18505 paddd m6, [pd_16]
18506 psrld m6, 5
18507
18508 palignr m1, m3, m2, 12
18509 pmaddwd m0, m1, [r3 - 10 * 16] ; [6]
18510 paddd m0, [pd_16]
18511 psrld m0, 5
18512 packusdw m6, m0
18513
18514 pmaddwd m1, [r3 + 3 * 16] ; [19]
18515 paddd m1, [pd_16]
18516 psrld m1, 5
18517 packusdw m1, m1
18518 movhps m1, [r2 + 28] ; [00]
18519
; lines 24-31
18520 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
18521 %endmacro
18522
;-----------------------------------------------------------------------------
; MODE_7_29 transpose
;   %1 is forwarded as the "transpose" flag of TRANSPOSE_STORE_8x8.
; Computes 32 predicted lines of 4 samples each. Every line is
;   (pmaddwd(interleaved neighbour pairs, table entry) + [pd_16]) >> 5
; The bracketed number after each pmaddwd is the fraction f selected for that
; line; the sequence is 9, 18, 27, 4, 13, ... i.e. (9 * (line + 1)) mod 32.
; The line with f = 0 copies the source samples directly (movhps ... ; [00]).
; In:  r0 = destination, r1 = stride in bytes, r2 = source,
;      r3 = address of the f = 16 table entry (16 bytes per entry) -- set by the
;      caller, confirm at the expansion sites;
;      r5 = used by TRANSPOSE_STORE_8x8 (presumably 3 * r1).
; Results are handed over 8 lines at a time with byte offsets 0, 16, 32, 48.
; Clobbers: m0-m7. r0 is advanced only by the non-transposing store path.
; The element lists in the lane comments are written highest lane first.
;-----------------------------------------------------------------------------
18523 %macro MODE_7_29 1
18524 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
18525 movd m3, [r2 + 18] ; [0 0 0 0 0 0 10 9] - movd loads only 4 bytes; just sample 9 is consumed below
18526 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
18527 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
18528 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
18529
18530 pmaddwd m4, m0, [r3 - 7 * 16] ; [9]
18531 paddd m4, [pd_16]
18532 psrld m4, 5
18533
18534 pmaddwd m1, m0, [r3 + 2 * 16] ; [18]
18535 paddd m1, [pd_16]
18536 psrld m1, 5
18537 packusdw m4, m1
18538
18539 pmaddwd m5, m0, [r3 + 11 * 16] ; [27]
18540 paddd m5, [pd_16]
18541 psrld m5, 5
18542
18543 palignr m1, m2, m0, 4
18544 pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
18545 paddd m6, [pd_16]
18546 psrld m6, 5
18547 packusdw m5, m6
18548
18549 pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
18550 paddd m6, [pd_16]
18551 psrld m6, 5
18552
18553 pmaddwd m7, m1, [r3 + 6 * 16] ; [22]
18554 paddd m7, [pd_16]
18555 psrld m7, 5
18556 packusdw m6, m7
18557
18558 pmaddwd m1, [r3 + 15 * 16] ; [31]
18559 paddd m1, [pd_16]
18560 psrld m1, 5
18561
18562 mova m3, m0 ; keep [5 4 4 3 3 2 2 1]: m0 is overwritten below but still needed for the palignr ..., 12
18563 palignr m7, m2, m0, 8
18564 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
18565 paddd m0, [pd_16]
18566 psrld m0, 5
18567 packusdw m1, m0
18568
; lines 0-7
18569 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
18570
18571 pmaddwd m4, m7, [r3 + 16] ; [17]
18572 paddd m4, [pd_16]
18573 psrld m4, 5
18574
18575 pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
18576 paddd m1, [pd_16]
18577 psrld m1, 5
18578 packusdw m4, m1
18579
18580 palignr m1, m2, m3, 12
18581 pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
18582 paddd m5, [pd_16]
18583 psrld m5, 5
18584
18585 pmaddwd m6, m1, [r3 - 4 * 16] ; [12]
18586 paddd m6, [pd_16]
18587 psrld m6, 5
18588 packusdw m5, m6
18589
18590 pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
18591 paddd m6, [pd_16]
18592 psrld m6, 5
18593
18594 pmaddwd m1, [r3 + 14 * 16] ; [30]
18595 paddd m1, [pd_16]
18596 psrld m1, 5
18597 packusdw m6, m1
18598
18599 pmaddwd m1, m2, [r3 - 9 * 16] ; [7]
18600 paddd m1, [pd_16]
18601 psrld m1, 5
18602
18603 pmaddwd m0, m2, [r3] ; [16]
18604 paddd m0, [pd_16]
18605 psrld m0, 5
18606 packusdw m1, m0
18607
; lines 8-15
18608 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
18609
18610 pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
18611 paddd m4, [pd_16]
18612 psrld m4, 5
18613
18614 movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9]
18615 palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10]
18616 punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9]
18617
18618 palignr m6, m7, m2, 4
18619 pmaddwd m1, m6, [r3 - 14 * 16] ; [2]
18620 paddd m1, [pd_16]
18621 psrld m1, 5
18622 packusdw m4, m1
18623
18624 pmaddwd m5, m6, [r3 - 5 * 16] ; [11]
18625 paddd m5, [pd_16]
18626 psrld m5, 5
18627
18628 pmaddwd m0, m6, [r3 + 4 * 16] ; [20]
18629 paddd m0, [pd_16]
18630 psrld m0, 5
18631 packusdw m5, m0
18632
18633 pmaddwd m6, [r3 + 13 * 16] ; [29]
18634 paddd m6, [pd_16]
18635 psrld m6, 5
18636
18637 palignr m0, m7, m2, 8
18638 pmaddwd m1, m0, [r3 - 10 * 16] ; [6]
18639 paddd m1, [pd_16]
18640 psrld m1, 5
18641 packusdw m6, m1
18642
18643 pmaddwd m1, m0, [r3 - 16] ; [15]
18644 paddd m1, [pd_16]
18645 psrld m1, 5
18646
18647 pmaddwd m0, [r3 + 8 * 16] ; [24]
18648 paddd m0, [pd_16]
18649 psrld m0, 5
18650 packusdw m1, m0
18651
; lines 16-23
18652 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
18653
18654 palignr m0, m7, m2, 12
18655 pmaddwd m4, m0, [r3 - 15 * 16] ; [1]
18656 paddd m4, [pd_16]
18657 psrld m4, 5
18658
18659 pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
18660 paddd m1, [pd_16]
18661 psrld m1, 5
18662 packusdw m4, m1
18663
18664 pmaddwd m5, m0, [r3 + 3 * 16] ; [19]
18665 paddd m5, [pd_16]
18666 psrld m5, 5
18667
18668 pmaddwd m0, [r3 + 12 * 16] ; [28]
18669 paddd m0, [pd_16]
18670 psrld m0, 5
18671 packusdw m5, m0
18672
18673 pmaddwd m6, m7, [r3 - 11 * 16] ; [5]
18674 paddd m6, [pd_16]
18675 psrld m6, 5
18676
18677 pmaddwd m0, m7, [r3 - 2 * 16] ; [14]
18678 paddd m0, [pd_16]
18679 psrld m0, 5
18680 packusdw m6, m0
18681
18682 pmaddwd m1, m7, [r3 + 7 * 16] ; [23]
18683 paddd m1, [pd_16]
18684 psrld m1, 5
18685 packusdw m1, m1
18686 movhps m1, [r2 + 20] ; [00]
18687
; lines 24-31
18688 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
18689 %endmacro
18690
;-----------------------------------------------------------------------------
; MODE_8_28 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48) in m4, m5, m6, m1.
; %1 is forwarded unchanged as that macro's second argument.
; One result = pmaddwd(interleaved word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index advances by 5 (mod 32) per result; the last result
; (index 0, marked [00]) is loaded directly from [r2 + 12] without a multiply.
; In:    r2 = base of the source words (presumably the reference samples --
;             confirm against the caller), r3 = weight table pointer
; Clobb: m0-m7
;-----------------------------------------------------------------------------
18691 %macro MODE_8_28 1
18692 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
18693 movd m3, [r2 + 18] ; movd loads only the low dword: words [10 9]; only word 9 is consumed below
18694 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
18695 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
18696 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
18697
18698 pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
18699 paddd m4, [pd_16]
18700 psrld m4, 5
18701
18702 pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
18703 paddd m1, [pd_16]
18704 psrld m1, 5
18705 packusdw m4, m1
18706
18707 pmaddwd m5, m0, [r3 - 16] ; [15]
18708 paddd m5, [pd_16]
18709 psrld m5, 5
18710
18711 pmaddwd m6, m0, [r3 + 4 * 16] ; [20]
18712 paddd m6, [pd_16]
18713 psrld m6, 5
18714 packusdw m5, m6
18715
18716 pmaddwd m6, m0, [r3 + 9 * 16] ; [25]
18717 paddd m6, [pd_16]
18718 psrld m6, 5
18719
18720 pmaddwd m1, m0, [r3 + 14 * 16] ; [30]
18721 paddd m1, [pd_16]
18722 psrld m1, 5
18723 packusdw m6, m1
18724
18725 palignr m7, m2, m0, 4 ; window advanced by one word pair: [6 5 5 4 4 3 3 2]
18726 pmaddwd m1, m7, [r3 - 13 * 16] ; [3]
18727 paddd m1, [pd_16]
18728 psrld m1, 5
18729
18730 mova m3, m0 ; keep [5 4 4 3 3 2 2 1]: m0 is reused as scratch from here on
18731 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
18732 paddd m0, [pd_16]
18733 psrld m0, 5
18734 packusdw m1, m0
18735
18736 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
18737
18738 pmaddwd m4, m7, [r3 - 3 * 16] ; [13]
18739 paddd m4, [pd_16]
18740 psrld m4, 5
18741
18742 pmaddwd m1, m7, [r3 + 2 * 16] ; [18]
18743 paddd m1, [pd_16]
18744 psrld m1, 5
18745 packusdw m4, m1
18746
18747 pmaddwd m5, m7, [r3 + 7 * 16] ; [23]
18748 paddd m5, [pd_16]
18749 psrld m5, 5
18750
18751 pmaddwd m6, m7, [r3 + 12 * 16] ; [28]
18752 paddd m6, [pd_16]
18753 psrld m6, 5
18754 packusdw m5, m6
18755
18756 palignr m7, m2, m3, 8 ; [7 6 6 5 5 4 4 3]
18757 pmaddwd m6, m7, [r3 - 15 * 16] ; [1]
18758 paddd m6, [pd_16]
18759 psrld m6, 5
18760
18761 pmaddwd m1, m7, [r3 - 10 * 16] ; [6]
18762 paddd m1, [pd_16]
18763 psrld m1, 5
18764 packusdw m6, m1
18765
18766 pmaddwd m1, m7, [r3 - 5 * 16] ; [11]
18767 paddd m1, [pd_16]
18768 psrld m1, 5
18769
18770 pmaddwd m0, m7, [r3] ; [16]
18771 paddd m0, [pd_16]
18772 psrld m0, 5
18773 packusdw m1, m0
18774
18775 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
18776
18777 pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
18778 paddd m4, [pd_16]
18779 psrld m4, 5
18780
18781 pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
18782 paddd m1, [pd_16]
18783 psrld m1, 5
18784 packusdw m4, m1
18785
18786 pmaddwd m5, m7, [r3 + 15 * 16] ; [31]
18787 paddd m5, [pd_16]
18788 psrld m5, 5
18789
18790 palignr m7, m2, m3, 12 ; [8 7 7 6 6 5 5 4]
18791 pmaddwd m0, m7, [r3 - 12 * 16] ; [4]
18792 paddd m0, [pd_16]
18793 psrld m0, 5
18794 packusdw m5, m0
18795
18796 pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
18797 paddd m6, [pd_16]
18798 psrld m6, 5
18799
18800 pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
18801 paddd m1, [pd_16]
18802 psrld m1, 5
18803 packusdw m6, m1
18804
18805 pmaddwd m1, m7, [r3 + 3 * 16] ; [19]
18806 paddd m1, [pd_16]
18807 psrld m1, 5
18808
18809 pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
18810 paddd m0, [pd_16]
18811 psrld m0, 5
18812 packusdw m1, m0
18813
18814 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
18815
18816 pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
18817 paddd m4, [pd_16]
18818 psrld m4, 5
18819
18820 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
18821 paddd m1, [pd_16]
18822 psrld m1, 5
18823 packusdw m4, m1
18824
18825 pmaddwd m5, m2, [r3 - 9 * 16] ; [7]
18826 paddd m5, [pd_16]
18827 psrld m5, 5
18828
18829 pmaddwd m0, m2, [r3 - 4 * 16] ; [12]
18830 paddd m0, [pd_16]
18831 psrld m0, 5
18832 packusdw m5, m0
18833
18834 pmaddwd m6, m2, [r3 + 16] ; [17]
18835 paddd m6, [pd_16]
18836 psrld m6, 5
18837
18838 pmaddwd m0, m2, [r3 + 6 * 16] ; [22]
18839 paddd m0, [pd_16]
18840 psrld m0, 5
18841 packusdw m6, m0
18842
18843 pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
18844 paddd m1, [pd_16]
18845 psrld m1, 5
18846 packusdw m1, m1
18847 movhps m1, [r2 + 12] ; [00] index 0: the high half takes 4 source words unmodified
18848
18849 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
18850 %endmacro
18851
;-----------------------------------------------------------------------------
; MODE_9_27 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48). %1 is forwarded
; unchanged as that macro's second argument.
; One result = pmaddwd(interleaved word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index advances by 2 per result: 2..30 on the first window (m3),
; then 2..30 again on the window advanced by one word pair (m7). The two
; index-0 results ([00]) are loaded directly from [r2 + 4] and [r2 + 6].
; In:    r2 = base of the source words (presumably the reference samples --
;             confirm against the caller), r3 = weight table pointer
; Clobb: m0-m7
;-----------------------------------------------------------------------------
18852 %macro MODE_9_27 1
18853 movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1]
18854 palignr m1, m3, 2 ; [x 8 7 6 5 4 3 2] top word comes from the stale low word of m1
18855 punpckhwd m2, m3, m1 ; [x 8 8 7 7 6 6 5] only the low dword [6 5] is consumed (palignr below)
18856 punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1]
18857
18858 pmaddwd m4, m3, [r3 - 14 * 16] ; [2]
18859 paddd m4, [pd_16]
18860 psrld m4, 5
18861
18862 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
18863 paddd m1, [pd_16]
18864 psrld m1, 5
18865 packusdw m4, m1
18866
18867 pmaddwd m5, m3, [r3 - 10 * 16] ; [6]
18868 paddd m5, [pd_16]
18869 psrld m5, 5
18870
18871 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
18872 paddd m6, [pd_16]
18873 psrld m6, 5
18874 packusdw m5, m6
18875
18876 pmaddwd m6, m3, [r3 - 6 * 16] ; [10]
18877 paddd m6, [pd_16]
18878 psrld m6, 5
18879
18880 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
18881 paddd m1, [pd_16]
18882 psrld m1, 5
18883 packusdw m6, m1
18884
18885 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
18886 paddd m1, [pd_16]
18887 psrld m1, 5
18888
18889 pmaddwd m0, m3, [r3] ; [16]
18890 paddd m0, [pd_16]
18891 psrld m0, 5
18892 packusdw m1, m0
18893
18894 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
18895
18896 pmaddwd m4, m3, [r3 + 2 * 16] ; [18]
18897 paddd m4, [pd_16]
18898 psrld m4, 5
18899
18900 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
18901 paddd m1, [pd_16]
18902 psrld m1, 5
18903 packusdw m4, m1
18904
18905 pmaddwd m5, m3, [r3 + 6 * 16] ; [22]
18906 paddd m5, [pd_16]
18907 psrld m5, 5
18908
18909 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
18910 paddd m6, [pd_16]
18911 psrld m6, 5
18912 packusdw m5, m6
18913
18914 pmaddwd m6, m3, [r3 + 10 * 16] ; [26]
18915 paddd m6, [pd_16]
18916 psrld m6, 5
18917
18918 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
18919 paddd m1, [pd_16]
18920 psrld m1, 5
18921 packusdw m6, m1
18922
18923 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
18924 paddd m1, [pd_16]
18925 psrld m1, 5
18926
18927 packusdw m1, m1
18928 movhps m1, [r2 + 4] ; [00] index 0: the high half takes 4 source words unmodified
18929
18930 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
18931
18932 palignr m7, m2, m3, 4 ; window advanced by one word pair: [6 5 5 4 4 3 3 2]
18933 pmaddwd m4, m7, [r3 - 14 * 16] ; [2]
18934 paddd m4, [pd_16]
18935 psrld m4, 5
18936
18937 pmaddwd m1, m7, [r3 - 12 * 16] ; [4]
18938 paddd m1, [pd_16]
18939 psrld m1, 5
18940 packusdw m4, m1
18941
18942 pmaddwd m5, m7, [r3 - 10 * 16] ; [6]
18943 paddd m5, [pd_16]
18944 psrld m5, 5
18945
18946 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
18947 paddd m0, [pd_16]
18948 psrld m0, 5
18949 packusdw m5, m0
18950
18951 pmaddwd m6, m7, [r3 - 6 * 16] ; [10]
18952 paddd m6, [pd_16]
18953 psrld m6, 5
18954
18955 pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
18956 paddd m1, [pd_16]
18957 psrld m1, 5
18958 packusdw m6, m1
18959
18960 pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
18961 paddd m1, [pd_16]
18962 psrld m1, 5
18963
18964 pmaddwd m0, m7, [r3] ; [16]
18965 paddd m0, [pd_16]
18966 psrld m0, 5
18967 packusdw m1, m0
18968
18969 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
18970
18971 pmaddwd m4, m7, [r3 + 2 * 16] ; [18]
18972 paddd m4, [pd_16]
18973 psrld m4, 5
18974
18975 pmaddwd m1, m7, [r3 + 4 * 16] ; [20]
18976 paddd m1, [pd_16]
18977 psrld m1, 5
18978 packusdw m4, m1
18979
18980 pmaddwd m5, m7, [r3 + 6 * 16] ; [22]
18981 paddd m5, [pd_16]
18982 psrld m5, 5
18983
18984 pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
18985 paddd m0, [pd_16]
18986 psrld m0, 5
18987 packusdw m5, m0
18988
18989 pmaddwd m6, m7, [r3 + 10 * 16] ; [26]
18990 paddd m6, [pd_16]
18991 psrld m6, 5
18992
18993 pmaddwd m0, m7, [r3 + 12 * 16] ; [28]
18994 paddd m0, [pd_16]
18995 psrld m0, 5
18996 packusdw m6, m0
18997
18998 pmaddwd m7, [r3 + 14 * 16] ; [30]
18999 paddd m7, [pd_16]
19000 psrld m7, 5
19001 packusdw m7, m7
19002 movhps m7, [r2 + 6] ; [00] index 0: the high half takes 4 source words unmodified
19003
19004 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
19005 %endmacro
19006
;-----------------------------------------------------------------------------
; MODE_11_25 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48) in m4, m5, m6, m1.
; %1 is forwarded unchanged as that macro's second argument.
; One result = pmaddwd(interleaved word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index decreases by 2 per result: 30..2 on the window loaded from
; [r2 + 2], then 30..2 again on the window loaded from [r2]. The two index-0
; results ([00]) are loaded directly from [r2 + 2] and [r2].
; In:    r2 = base of the source words (presumably the reference samples --
;             confirm against the caller), r3 = weight table pointer
; Clobb: m0, m1, m3-m6 (m2 and m7 are not touched by this macro)
;-----------------------------------------------------------------------------
19007 %macro MODE_11_25 1
19008 movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0]
19009 pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0]
19010
19011 pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
19012 paddd m4, [pd_16]
19013 psrld m4, 5
19014
19015 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
19016 paddd m1, [pd_16]
19017 psrld m1, 5
19018 packusdw m4, m1
19019
19020 pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
19021 paddd m5, [pd_16]
19022 psrld m5, 5
19023
19024 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
19025 paddd m6, [pd_16]
19026 psrld m6, 5
19027 packusdw m5, m6
19028
19029 pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
19030 paddd m6, [pd_16]
19031 psrld m6, 5
19032
19033 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
19034 paddd m1, [pd_16]
19035 psrld m1, 5
19036 packusdw m6, m1
19037
19038 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
19039 paddd m1, [pd_16]
19040 psrld m1, 5
19041
19042 pmaddwd m0, m3, [r3] ; [16]
19043 paddd m0, [pd_16]
19044 psrld m0, 5
19045 packusdw m1, m0
19046
19047 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
19048
19049 pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
19050 paddd m4, [pd_16]
19051 psrld m4, 5
19052
19053 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
19054 paddd m1, [pd_16]
19055 psrld m1, 5
19056 packusdw m4, m1
19057
19058 pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
19059 paddd m5, [pd_16]
19060 psrld m5, 5
19061
19062 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
19063 paddd m6, [pd_16]
19064 psrld m6, 5
19065 packusdw m5, m6
19066
19067 pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
19068 paddd m6, [pd_16]
19069 psrld m6, 5
19070
19071 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
19072 paddd m1, [pd_16]
19073 psrld m1, 5
19074 packusdw m6, m1
19075
19076 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
19077 paddd m1, [pd_16]
19078 psrld m1, 5
19079
19080 packusdw m1, m1
19081 movhps m1, [r2 + 2] ; [00] index 0: the high half takes 4 source words unmodified
19082
19083 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
19084
19085 movu m3, [r2] ; [6 5 4 3 2 1 0 16]
19086 pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16]
19087
19088 pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
19089 paddd m4, [pd_16]
19090 psrld m4, 5
19091
19092 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
19093 paddd m1, [pd_16]
19094 psrld m1, 5
19095 packusdw m4, m1
19096
19097 pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
19098 paddd m5, [pd_16]
19099 psrld m5, 5
19100
19101 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
19102 paddd m0, [pd_16]
19103 psrld m0, 5
19104 packusdw m5, m0
19105
19106 pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
19107 paddd m6, [pd_16]
19108 psrld m6, 5
19109
19110 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
19111 paddd m1, [pd_16]
19112 psrld m1, 5
19113 packusdw m6, m1
19114
19115 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
19116 paddd m1, [pd_16]
19117 psrld m1, 5
19118
19119 pmaddwd m0, m3, [r3] ; [16]
19120 paddd m0, [pd_16]
19121 psrld m0, 5
19122 packusdw m1, m0
19123
19124 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
19125
19126 pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
19127 paddd m4, [pd_16]
19128 psrld m4, 5
19129
19130 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
19131 paddd m1, [pd_16]
19132 psrld m1, 5
19133 packusdw m4, m1
19134
19135 pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
19136 paddd m5, [pd_16]
19137 psrld m5, 5
19138
19139 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
19140 paddd m6, [pd_16]
19141 psrld m6, 5
19142 packusdw m5, m6
19143
19144 pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
19145 paddd m6, [pd_16]
19146 psrld m6, 5
19147
19148 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
19149 paddd m1, [pd_16]
19150 psrld m1, 5
19151 packusdw m6, m1
19152
19153 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
19154 paddd m1, [pd_16]
19155 psrld m1, 5
19156
19157 packusdw m1, m1
19158 movhps m1, [r2] ; [00] index 0: the high half takes 4 source words unmodified
19159
19160 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
19161 %endmacro
19162
;-----------------------------------------------------------------------------
; MODE_12_24 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48) in m4, m5, m6, m1.
; %1 is forwarded unchanged as that macro's second argument.
; One result = pmaddwd(shuffled word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index decreases by 5 (mod 32) per result, starting at 27. Each
; time it wraps around, the 16-byte source window is reloaded 2 bytes lower
; ([r2 + 8] down to [r2]). The last result (index 0, [00]) is loaded
; directly from [r2].
; In:    r2 = base of the source words (presumably a prepared reference
;             buffer -- confirm against the caller)
;        r3 = weight table pointer
;        m2 = pshufb mask, read but never written here; must be set by the
;             caller (presumably a word-pair interleave mask -- confirm)
; Clobb: m0, m1, m3-m6
;-----------------------------------------------------------------------------
19163 %macro MODE_12_24 1
19164 movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0]
19165 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
19166
19167 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
19168 paddd m4, [pd_16]
19169 psrld m4, 5
19170
19171 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
19172 paddd m1, [pd_16]
19173 psrld m1, 5
19174 packusdw m4, m1
19175
19176 pmaddwd m5, m3, [r3 + 16] ; [17]
19177 paddd m5, [pd_16]
19178 psrld m5, 5
19179
19180 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
19181 paddd m6, [pd_16]
19182 psrld m6, 5
19183 packusdw m5, m6
19184
19185 pmaddwd m6, m3, [r3 - 9 * 16] ; [7]
19186 paddd m6, [pd_16]
19187 psrld m6, 5
19188
19189 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
19190 paddd m1, [pd_16]
19191 psrld m1, 5
19192 packusdw m6, m1
19193
19194 movu m3, [r2 + 6] ; index wraps (2 -> 29): slide the source window down one word
19195 pshufb m3, m2
19196
19197 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
19198 paddd m1, [pd_16]
19199 psrld m1, 5
19200
19201 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
19202 paddd m0, [pd_16]
19203 psrld m0, 5
19204 packusdw m1, m0
19205
19206 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
19207
19208 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
19209 paddd m4, [pd_16]
19210 psrld m4, 5
19211
19212 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
19213 paddd m1, [pd_16]
19214 psrld m1, 5
19215 packusdw m4, m1
19216
19217 pmaddwd m5, m3, [r3 - 7 * 16] ; [9]
19218 paddd m5, [pd_16]
19219 psrld m5, 5
19220
19221 pmaddwd m6, m3, [r3 - 12 * 16] ; [4]
19222 paddd m6, [pd_16]
19223 psrld m6, 5
19224 packusdw m5, m6
19225
19226 movu m3, [r2 + 4] ; index wraps (4 -> 31): slide the source window down one word
19227 pshufb m3, m2
19228
19229 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
19230 paddd m6, [pd_16]
19231 psrld m6, 5
19232
19233 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
19234 paddd m1, [pd_16]
19235 psrld m1, 5
19236 packusdw m6, m1
19237
19238 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
19239 paddd m1, [pd_16]
19240 psrld m1, 5
19241
19242 pmaddwd m0, m3, [r3] ; [16]
19243 paddd m0, [pd_16]
19244 psrld m0, 5
19245 packusdw m1, m0
19246
19247 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
19248
19249 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
19250 paddd m4, [pd_16]
19251 psrld m4, 5
19252
19253 pmaddwd m1, m3, [r3 - 10 * 16] ; [6]
19254 paddd m1, [pd_16]
19255 psrld m1, 5
19256 packusdw m4, m1
19257
19258 pmaddwd m5, m3, [r3 - 15 * 16] ; [1]
19259 paddd m5, [pd_16]
19260 psrld m5, 5
19261
19262 movu m3, [r2 + 2] ; index wraps (1 -> 28): slide the source window down one word
19263 pshufb m3, m2
19264
19265 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
19266 paddd m0, [pd_16]
19267 psrld m0, 5
19268 packusdw m5, m0
19269
19270 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
19271 paddd m6, [pd_16]
19272 psrld m6, 5
19273
19274 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
19275 paddd m1, [pd_16]
19276 psrld m1, 5
19277 packusdw m6, m1
19278
19279 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
19280 paddd m1, [pd_16]
19281 psrld m1, 5
19282
19283 pmaddwd m0, m3, [r3 - 8 * 16] ; [8]
19284 paddd m0, [pd_16]
19285 psrld m0, 5
19286 packusdw m1, m0
19287
19288 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
19289
19290 pmaddwd m4, m3, [r3 - 13 * 16] ; [3]
19291 paddd m4, [pd_16]
19292 psrld m4, 5
19293
19294 movu m3, [r2] ; index wraps (3 -> 30): slide the source window down one word
19295 pshufb m3, m2
19296
19297 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
19298 paddd m1, [pd_16]
19299 psrld m1, 5
19300 packusdw m4, m1
19301
19302 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
19303 paddd m5, [pd_16]
19304 psrld m5, 5
19305
19306 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
19307 paddd m6, [pd_16]
19308 psrld m6, 5
19309 packusdw m5, m6
19310
19311 pmaddwd m6, m3, [r3 - 16] ; [15]
19312 paddd m6, [pd_16]
19313 psrld m6, 5
19314
19315 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
19316 paddd m1, [pd_16]
19317 psrld m1, 5
19318 packusdw m6, m1
19319
19320 pmaddwd m1, m3, [r3 - 11 * 16] ; [5]
19321 paddd m1, [pd_16]
19322 psrld m1, 5
19323
19324 packusdw m1, m1
19325 movhps m1, [r2] ; [00] index 0: the high half takes 4 source words unmodified
19326
19327 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
19328 %endmacro
19329
;-----------------------------------------------------------------------------
; MODE_13_23 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48) in m4, m5, m6, m1.
; %1 is forwarded unchanged as that macro's second argument.
; One result = pmaddwd(shuffled word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index decreases by 9 (mod 32) per result, starting at 23. Each
; time it wraps around, the 16-byte source window is reloaded 2 bytes lower
; ([r2 + 16] down to [r2]). The last result (index 0, [00]) is loaded
; directly from [r2].
; In:    r2 = base of the source words (presumably a prepared reference
;             buffer -- confirm against the caller)
;        r3 = weight table pointer
;        m2 = pshufb mask, read but never written here; must be set by the
;             caller (presumably a word-pair interleave mask -- confirm)
; Clobb: m0, m1, m3-m6
;-----------------------------------------------------------------------------
19330 %macro MODE_13_23 1
19331 movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0]
19332 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
19333
19334 pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
19335 paddd m4, [pd_16]
19336 psrld m4, 5
19337
19338 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
19339 paddd m1, [pd_16]
19340 psrld m1, 5
19341 packusdw m4, m1
19342
19343 pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
19344 paddd m5, [pd_16]
19345 psrld m5, 5
19346
19347 movu m3, [r2 + 14] ; index wraps (5 -> 28): slide the source window down one word
19348 pshufb m3, m2
19349
19350 pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
19351 paddd m6, [pd_16]
19352 psrld m6, 5
19353 packusdw m5, m6
19354
19355 pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
19356 paddd m6, [pd_16]
19357 psrld m6, 5
19358
19359 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
19360 paddd m1, [pd_16]
19361 psrld m1, 5
19362 packusdw m6, m1
19363
19364 pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
19365 paddd m1, [pd_16]
19366 psrld m1, 5
19367
19368 movu m3, [r2 + 12]
19369 pshufb m3, m2
19370
19371 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
19372 paddd m0, [pd_16]
19373 psrld m0, 5
19374 packusdw m1, m0
19375
19376 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
19377
19378 pmaddwd m4, m3, [r3 - 16] ; [15]
19379 paddd m4, [pd_16]
19380 psrld m4, 5
19381
19382 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
19383 paddd m1, [pd_16]
19384 psrld m1, 5
19385 packusdw m4, m1
19386
19387 movu m3, [r2 + 10]
19388 pshufb m3, m2
19389
19390 pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
19391 paddd m5, [pd_16]
19392 psrld m5, 5
19393
19394 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
19395 paddd m6, [pd_16]
19396 psrld m6, 5
19397 packusdw m5, m6
19398
19399 pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
19400 paddd m6, [pd_16]
19401 psrld m6, 5
19402
19403 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
19404 paddd m1, [pd_16]
19405 psrld m1, 5
19406 packusdw m6, m1
19407
19408 movu m3, [r2 + 8]
19409 pshufb m3, m2
19410
19411 pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
19412 paddd m1, [pd_16]
19413 psrld m1, 5
19414
19415 pmaddwd m0, m3, [r3] ; [16]
19416 paddd m0, [pd_16]
19417 psrld m0, 5
19418 packusdw m1, m0
19419
19420 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
19421
19422 pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
19423 paddd m4, [pd_16]
19424 psrld m4, 5
19425
19426 movu m3, [r2 + 6]
19427 pshufb m3, m2
19428
19429 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
19430 paddd m1, [pd_16]
19431 psrld m1, 5
19432 packusdw m4, m1
19433
19434 pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
19435 paddd m5, [pd_16]
19436 psrld m5, 5
19437
19438 pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
19439 paddd m0, [pd_16]
19440 psrld m0, 5
19441 packusdw m5, m0
19442
19443 pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
19444 paddd m6, [pd_16]
19445 psrld m6, 5
19446
19447 movu m3, [r2 + 4]
19448 pshufb m3, m2
19449
19450 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
19451 paddd m1, [pd_16]
19452 psrld m1, 5
19453 packusdw m6, m1
19454
19455 pmaddwd m1, m3, [r3 + 16] ; [17]
19456 paddd m1, [pd_16]
19457 psrld m1, 5
19458
19459 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
19460 paddd m0, [pd_16]
19461 psrld m0, 5
19462 packusdw m1, m0
19463
19464 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
19465
19466 movu m3, [r2 + 2]
19467 pshufb m3, m2
19468
19469 pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
19470 paddd m4, [pd_16]
19471 psrld m4, 5
19472
19473 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
19474 paddd m1, [pd_16]
19475 psrld m1, 5
19476 packusdw m4, m1
19477
19478 pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
19479 paddd m5, [pd_16]
19480 psrld m5, 5
19481
19482 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
19483 paddd m6, [pd_16]
19484 psrld m6, 5
19485 packusdw m5, m6
19486
19487 movu m3, [r2]
19488 pshufb m3, m2
19489
19490 pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
19491 paddd m6, [pd_16]
19492 psrld m6, 5
19493
19494 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
19495 paddd m1, [pd_16]
19496 psrld m1, 5
19497 packusdw m6, m1
19498
19499 pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
19500 paddd m1, [pd_16]
19501 psrld m1, 5
19502
19503 packusdw m1, m1
19504 movhps m1, [r2] ; [00] index 0: the high half takes 4 source words unmodified
19505
19506 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
19507 %endmacro
19508
;-----------------------------------------------------------------------------
; MODE_14_22 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48) in m4, m5, m6, m1.
; %1 is forwarded unchanged as that macro's second argument.
; One result = pmaddwd(shuffled word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index decreases by 13 (mod 32) per result, starting at 19. Each
; time it wraps around, the 16-byte source window is reloaded 2 bytes lower
; ([r2 + 24] down to [r2]). The last result (index 0, [00]) is loaded
; directly from [r2].
; In:    r2 = base of the source words (presumably a prepared reference
;             buffer -- confirm against the caller)
;        r3 = weight table pointer
;        m2 = pshufb mask, read but never written here; must be set by the
;             caller (presumably a word-pair interleave mask -- confirm)
; Clobb: m0, m1, m3-m6
;-----------------------------------------------------------------------------
19509 %macro MODE_14_22 1
19510 movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0]
19511 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
19512
19513 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
19514 paddd m4, [pd_16]
19515 psrld m4, 5
19516
19517 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
19518 paddd m1, [pd_16]
19519 psrld m1, 5
19520 packusdw m4, m1
19521
19522 movu m3, [r2 + 22] ; index wraps (6 -> 25): slide the source window down one word
19523 pshufb m3, m2
19524
19525 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
19526 paddd m5, [pd_16]
19527 psrld m5, 5
19528
19529 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
19530 paddd m6, [pd_16]
19531 psrld m6, 5
19532 packusdw m5, m6
19533
19534 movu m3, [r2 + 20]
19535 pshufb m3, m2
19536
19537 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
19538 paddd m6, [pd_16]
19539 psrld m6, 5
19540
19541 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
19542 paddd m1, [pd_16]
19543 psrld m1, 5
19544 packusdw m6, m1
19545
19546 pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
19547 paddd m1, [pd_16]
19548 psrld m1, 5
19549
19550 movu m3, [r2 + 18]
19551 pshufb m3, m2
19552
19553 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
19554 paddd m0, [pd_16]
19555 psrld m0, 5
19556 packusdw m1, m0
19557
19558 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
19559
19560 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
19561 paddd m4, [pd_16]
19562 psrld m4, 5
19563
19564 movu m3, [r2 + 16]
19565 pshufb m3, m2
19566
19567 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
19568 paddd m1, [pd_16]
19569 psrld m1, 5
19570 packusdw m4, m1
19571
19572 pmaddwd m5, m3, [r3 + 16] ; [17]
19573 paddd m5, [pd_16]
19574 psrld m5, 5
19575
19576 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
19577 paddd m6, [pd_16]
19578 psrld m6, 5
19579 packusdw m5, m6
19580
19581 movu m3, [r2 + 14]
19582 pshufb m3, m2
19583
19584 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
19585 paddd m6, [pd_16]
19586 psrld m6, 5
19587
19588 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
19589 paddd m1, [pd_16]
19590 psrld m1, 5
19591 packusdw m6, m1
19592
19593 movu m3, [r2 + 12]
19594 pshufb m3, m2
19595
19596 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
19597 paddd m1, [pd_16]
19598 psrld m1, 5
19599
19600 pmaddwd m0, m3, [r3] ; [16]
19601 paddd m0, [pd_16]
19602 psrld m0, 5
19603 packusdw m1, m0
19604
19605 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
19606
19607 pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
19608 paddd m4, [pd_16]
19609 psrld m4, 5
19610
19611 movu m3, [r2 + 10]
19612 pshufb m3, m2
19613
19614 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
19615 paddd m1, [pd_16]
19616 psrld m1, 5
19617 packusdw m4, m1
19618
19619 pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
19620 paddd m5, [pd_16]
19621 psrld m5, 5
19622
19623 movu m3, [r2 + 8]
19624 pshufb m3, m2
19625
19626 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
19627 paddd m0, [pd_16]
19628 psrld m0, 5
19629 packusdw m5, m0
19630
19631 pmaddwd m6, m3, [r3 - 16] ; [15]
19632 paddd m6, [pd_16]
19633 psrld m6, 5
19634
19635 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
19636 paddd m1, [pd_16]
19637 psrld m1, 5
19638 packusdw m6, m1
19639
19640 movu m3, [r2 + 6]
19641 pshufb m3, m2
19642
19643 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
19644 paddd m1, [pd_16]
19645 psrld m1, 5
19646
19647 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
19648 paddd m0, [pd_16]
19649 psrld m0, 5
19650 packusdw m1, m0
19651
19652 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
19653
19654 movu m3, [r2 + 4]
19655 pshufb m3, m2
19656
19657 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
19658 paddd m4, [pd_16]
19659 psrld m4, 5
19660
19661 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
19662 paddd m1, [pd_16]
19663 psrld m1, 5
19664 packusdw m4, m1
19665
19666 pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
19667 paddd m5, [pd_16]
19668 psrld m5, 5
19669
19670 movu m3, [r2 + 2]
19671 pshufb m3, m2
19672
19673 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
19674 paddd m6, [pd_16]
19675 psrld m6, 5
19676 packusdw m5, m6
19677
19678 pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
19679 paddd m6, [pd_16]
19680 psrld m6, 5
19681
19682 movu m3, [r2]
19683 pshufb m3, m2
19684
19685 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
19686 paddd m1, [pd_16]
19687 psrld m1, 5
19688 packusdw m6, m1
19689
19690 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
19691 paddd m1, [pd_16]
19692 psrld m1, 5
19693
19694 packusdw m1, m1
19695 movhps m1, [r2] ; [00] index 0: the high half takes 4 source words unmodified
19696
19697 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
19698 %endmacro
19699
;-----------------------------------------------------------------------------
; MODE_15_21 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48) in m4, m5, m6, m1.
; %1 is forwarded unchanged as that macro's second argument.
; One result = pmaddwd(shuffled word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index decreases by 17 (mod 32) per result, starting at 15. Each
; time it wraps around, the 16-byte source window is reloaded 2 bytes lower
; ([r2 + 32] down to [r2]). The last result (index 0, [00]) is loaded
; directly from [r2].
; In:    r2 = base of the source words (presumably a prepared reference
;             buffer -- confirm against the caller)
;        r3 = weight table pointer
;        m2 = pshufb mask, read but never written here; must be set by the
;             caller (presumably a word-pair interleave mask -- confirm)
; Clobb: m0, m1, m3-m6
;-----------------------------------------------------------------------------
19700 %macro MODE_15_21 1
19701 movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0]
19702 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
19703
19704 pmaddwd m4, m3, [r3 - 16] ; [15]
19705 paddd m4, [pd_16]
19706 psrld m4, 5
19707
19708 movu m3, [r2 + 30] ; index wraps (15 -> 30): slide the source window down one word
19709 pshufb m3, m2
19710
19711 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
19712 paddd m1, [pd_16]
19713 psrld m1, 5
19714 packusdw m4, m1
19715
19716 pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
19717 paddd m5, [pd_16]
19718 psrld m5, 5
19719
19720 movu m3, [r2 + 28]
19721 pshufb m3, m2
19722
19723 pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
19724 paddd m6, [pd_16]
19725 psrld m6, 5
19726 packusdw m5, m6
19727
19728 pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
19729 paddd m6, [pd_16]
19730 psrld m6, 5
19731
19732 movu m3, [r2 + 26]
19733 pshufb m3, m2
19734
19735 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
19736 paddd m1, [pd_16]
19737 psrld m1, 5
19738 packusdw m6, m1
19739
19740 pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
19741 paddd m1, [pd_16]
19742 psrld m1, 5
19743
19744 movu m3, [r2 + 24]
19745 pshufb m3, m2
19746
19747 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
19748 paddd m0, [pd_16]
19749 psrld m0, 5
19750 packusdw m1, m0
19751
19752 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
19753
19754 pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
19755 paddd m4, [pd_16]
19756 psrld m4, 5
19757
19758 movu m3, [r2 + 22]
19759 pshufb m3, m2
19760
19761 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
19762 paddd m1, [pd_16]
19763 psrld m1, 5
19764 packusdw m4, m1
19765
19766 pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
19767 paddd m5, [pd_16]
19768 psrld m5, 5
19769
19770 movu m3, [r2 + 20]
19771 pshufb m3, m2
19772
19773 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
19774 paddd m6, [pd_16]
19775 psrld m6, 5
19776 packusdw m5, m6
19777
19778 pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
19779 paddd m6, [pd_16]
19780 psrld m6, 5
19781
19782 movu m3, [r2 + 18]
19783 pshufb m3, m2
19784
19785 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
19786 paddd m1, [pd_16]
19787 psrld m1, 5
19788 packusdw m6, m1
19789
19790 pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
19791 paddd m1, [pd_16]
19792 psrld m1, 5
19793
19794 movu m3, [r2 + 16]
19795 pshufb m3, m2
19796
19797 pmaddwd m0, m3, [r3] ; [16]
19798 paddd m0, [pd_16]
19799 psrld m0, 5
19800 packusdw m1, m0
19801
19802 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
19803
19804 movu m3, [r2 + 14]
19805 pshufb m3, m2
19806
19807 pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
19808 paddd m4, [pd_16]
19809 psrld m4, 5
19810
19811 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
19812 paddd m1, [pd_16]
19813 psrld m1, 5
19814 packusdw m4, m1
19815
19816 movu m3, [r2 + 12]
19817 pshufb m3, m2
19818
19819 pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
19820 paddd m5, [pd_16]
19821 psrld m5, 5
19822
19823 pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
19824 paddd m0, [pd_16]
19825 psrld m0, 5
19826 packusdw m5, m0
19827
19828 movu m3, [r2 + 10]
19829 pshufb m3, m2
19830
19831 pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
19832 paddd m6, [pd_16]
19833 psrld m6, 5
19834
19835 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
19836 paddd m1, [pd_16]
19837 psrld m1, 5
19838 packusdw m6, m1
19839
19840 movu m3, [r2 + 8]
19841 pshufb m3, m2
19842
19843 pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
19844 paddd m1, [pd_16]
19845 psrld m1, 5
19846
19847 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
19848 paddd m0, [pd_16]
19849 psrld m0, 5
19850 packusdw m1, m0
19851
19852 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
19853
19854 movu m3, [r2 + 6]
19855 pshufb m3, m2
19856
19857 pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
19858 paddd m4, [pd_16]
19859 psrld m4, 5
19860
19861 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
19862 paddd m1, [pd_16]
19863 psrld m1, 5
19864 packusdw m4, m1
19865
19866 movu m3, [r2 + 4]
19867 pshufb m3, m2
19868
19869 pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
19870 paddd m5, [pd_16]
19871 psrld m5, 5
19872
19873 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
19874 paddd m6, [pd_16]
19875 psrld m6, 5
19876 packusdw m5, m6
19877
19878 movu m3, [r2 + 2]
19879 pshufb m3, m2
19880
19881 pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
19882 paddd m6, [pd_16]
19883 psrld m6, 5
19884
19885 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
19886 paddd m1, [pd_16]
19887 psrld m1, 5
19888 packusdw m6, m1
19889
19890 movu m3, [r2]
19891 pshufb m3, m2
19892
19893 pmaddwd m1, m3, [r3 + 16] ; [17]
19894 paddd m1, [pd_16]
19895 psrld m1, 5
19896
19897 packusdw m1, m1
19898 movhps m1, [r2] ; [00] index 0: the high half takes 4 source words unmodified
19899
19900 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
19901 %endmacro
19902
;-----------------------------------------------------------------------------
; MODE_16_20 <%1>
; Produces 32 results in four groups of eight; each group is handed to
; TRANSPOSE_STORE_8x8 (first argument 0, 16, 32, 48) in m4, m5, m6, m1.
; %1 is forwarded unchanged as that macro's second argument.
; One result = pmaddwd(shuffled word pairs, 16-byte weight row at
; r3 + k*16) + [pd_16], shifted right by 5; two results are packed into one
; register with packusdw.
; The bracketed number on each pmaddwd line equals 16 + k, so r3 presumably
; points at row 16 of the weight table -- TODO confirm at the call site.
; The weight index decreases by 21 (mod 32) per result, starting at 11. Each
; time it wraps around, the 16-byte source window is reloaded 2 bytes lower
; ([r2 + 40] down to [r2]). The last result (index 0, [00]) is loaded
; directly from [r2].
; In:    r2 = base of the source words (presumably a prepared reference
;             buffer -- confirm against the caller)
;        r3 = weight table pointer
;        m2 = pshufb mask, read but never written here; must be set by the
;             caller (presumably a word-pair interleave mask -- confirm)
; Clobb: m0, m1, m3-m6
;-----------------------------------------------------------------------------
19903 %macro MODE_16_20 1
19904 movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0]
19905 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
19906
19907 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
19908 paddd m4, [pd_16]
19909 psrld m4, 5
19910
19911 movu m3, [r2 + 38] ; index wraps (11 -> 22): slide the source window down one word
19912 pshufb m3, m2
19913
19914 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
19915 paddd m1, [pd_16]
19916 psrld m1, 5
19917 packusdw m4, m1
19918
19919 pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
19920 paddd m5, [pd_16]
19921 psrld m5, 5
19922
19923 movu m3, [r2 + 36]
19924 pshufb m3, m2
19925
19926 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
19927 paddd m6, [pd_16]
19928 psrld m6, 5
19929 packusdw m5, m6
19930
19931 movu m3, [r2 + 34]
19932 pshufb m3, m2
19933
19934 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
19935 paddd m6, [pd_16]
19936 psrld m6, 5
19937
19938 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
19939 paddd m1, [pd_16]
19940 psrld m1, 5
19941 packusdw m6, m1
19942
19943 movu m3, [r2 + 32]
19944 pshufb m3, m2
19945
19946 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
19947 paddd m1, [pd_16]
19948 psrld m1, 5
19949
19950 movu m3, [r2 + 30]
19951 pshufb m3, m2
19952
19953 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
19954 paddd m0, [pd_16]
19955 psrld m0, 5
19956 packusdw m1, m0
19957
19958 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
19959
19960 pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
19961 paddd m4, [pd_16]
19962 psrld m4, 5
19963
19964 movu m3, [r2 + 28]
19965 pshufb m3, m2
19966
19967 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
19968 paddd m1, [pd_16]
19969 psrld m1, 5
19970 packusdw m4, m1
19971
19972 movu m3, [r2 + 26]
19973 pshufb m3, m2
19974
19975 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
19976 paddd m5, [pd_16]
19977 psrld m5, 5
19978
19979 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
19980 paddd m6, [pd_16]
19981 psrld m6, 5
19982 packusdw m5, m6
19983
19984 movu m3, [r2 + 24]
19985 pshufb m3, m2
19986
19987 pmaddwd m6, m3, [r3 - 16] ; [15]
19988 paddd m6, [pd_16]
19989 psrld m6, 5
19990
19991 movu m3, [r2 + 22]
19992 pshufb m3, m2
19993
19994 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
19995 paddd m1, [pd_16]
19996 psrld m1, 5
19997 packusdw m6, m1
19998
19999 pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
20000 paddd m1, [pd_16]
20001 psrld m1, 5
20002
20003 movu m3, [r2 + 20]
20004 pshufb m3, m2
20005
20006 pmaddwd m0, m3, [r3] ; [16]
20007 paddd m0, [pd_16]
20008 psrld m0, 5
20009 packusdw m1, m0
20010
20011 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
20012
20013 movu m3, [r2 + 18]
20014 pshufb m3, m2
20015
20016 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
20017 paddd m4, [pd_16]
20018 psrld m4, 5
20019
20020 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
20021 paddd m1, [pd_16]
20022 psrld m1, 5
20023 packusdw m4, m1
20024
20025 movu m3, [r2 + 16]
20026 pshufb m3, m2
20027
20028 pmaddwd m5, m3, [r3 + 16] ; [17]
20029 paddd m5, [pd_16]
20030 psrld m5, 5
20031
20032 movu m3, [r2 + 14]
20033 pshufb m3, m2
20034
20035 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
20036 paddd m0, [pd_16]
20037 psrld m0, 5
20038 packusdw m5, m0
20039
20040 pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
20041 paddd m6, [pd_16]
20042 psrld m6, 5
20043
20044 movu m3, [r2 + 12]
20045 pshufb m3, m2
20046
20047 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
20048 paddd m1, [pd_16]
20049 psrld m1, 5
20050 packusdw m6, m1
20051
20052 movu m3, [r2 + 10]
20053 pshufb m3, m2
20054
20055 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
20056 paddd m1, [pd_16]
20057 psrld m1, 5
20058
20059 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
20060 paddd m0, [pd_16]
20061 psrld m0, 5
20062 packusdw m1, m0
20063
20064 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
20065
20066 movu m3, [r2 + 8]
20067 pshufb m3, m2
20068
20069 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
20070 paddd m4, [pd_16]
20071 psrld m4, 5
20072
20073 movu m3, [r2 + 6]
20074 pshufb m3, m2
20075
20076 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
20077 paddd m1, [pd_16]
20078 psrld m1, 5
20079 packusdw m4, m1
20080
20081 pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
20082 paddd m5, [pd_16]
20083 psrld m5, 5
20084
20085 movu m3, [r2 + 4]
20086 pshufb m3, m2
20087
20088 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
20089 paddd m6, [pd_16]
20090 psrld m6, 5
20091 packusdw m5, m6
20092
20093 movu m3, [r2 + 2]
20094 pshufb m3, m2
20095
20096 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
20097 paddd m6, [pd_16]
20098 psrld m6, 5
20099
20100 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
20101 paddd m1, [pd_16]
20102 psrld m1, 5
20103 packusdw m6, m1
20104
20105 movu m3, [r2]
20106 pshufb m3, m2
20107
20108 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
20109 paddd m1, [pd_16]
20110 psrld m1, 5
20111
20112 packusdw m1, m1
20113 movhps m1, [r2] ; [00] index 0: the high half takes 4 source words unmodified
20114
20115 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
20116 %endmacro
20117
; Building blocks used only by MODE_17_19 below.
;   MODE_17_19_REF  off      m3 = 8 words loaded from r2 + off, rearranged by the
;                            shuffle mask the caller keeps in m2
;   MODE_17_19_FILT dst, f   dst = (pmaddwd(m3, ang_table row f) + pd_16) >> 5
;                            (table rows are 16 bytes each; callers point r3 at row 16)
%macro MODE_17_19_REF 1
    movu        m3, [r2 + %1]
    pshufb      m3, m2
%endmacro

%macro MODE_17_19_FILT 2
    pmaddwd     %1, m3, [r3 + (%2 - 16) * 16]
    paddd       %1, [pd_16]
    psrld       %1, 5
%endmacro

; MODE_17_19 flag
;   Produces four groups of eight filtered vectors from the reference data at r2
;   and hands each group (packed pairwise into m4, m5, m6, m1) to
;   TRANSPOSE_STORE_8x8 at byte offsets 0, 16, 32 and 48.  The single macro
;   argument is forwarded unchanged to TRANSPOSE_STORE_8x8.
;   Reads r2, r3, m2; uses m0, m1, m3-m6 as scratch.
%macro MODE_17_19 1
    ; ---- group 0: fractions 6, 12, 18, 24, 30, 4, 10, 16 ----
    MODE_17_19_REF  50
    MODE_17_19_FILT m4, 6

    MODE_17_19_REF  48
    MODE_17_19_FILT m1, 12
    packusdw    m4, m1

    MODE_17_19_REF  46
    MODE_17_19_FILT m5, 18

    MODE_17_19_REF  44
    MODE_17_19_FILT m6, 24
    packusdw    m5, m6

    MODE_17_19_REF  42
    MODE_17_19_FILT m6, 30
    MODE_17_19_FILT m1, 4
    packusdw    m6, m1

    MODE_17_19_REF  40
    MODE_17_19_FILT m1, 10

    MODE_17_19_REF  38
    MODE_17_19_FILT m0, 16
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1: fractions 22, 28, 2, 8, 14, 20, 26, then an unfiltered copy ----
    MODE_17_19_REF  36
    MODE_17_19_FILT m4, 22

    MODE_17_19_REF  34
    MODE_17_19_FILT m1, 28
    packusdw    m4, m1
    MODE_17_19_FILT m5, 2

    MODE_17_19_REF  32
    MODE_17_19_FILT m6, 8
    packusdw    m5, m6

    MODE_17_19_REF  30
    MODE_17_19_FILT m6, 14

    MODE_17_19_REF  28
    MODE_17_19_FILT m1, 20
    packusdw    m6, m1

    MODE_17_19_REF  26
    MODE_17_19_FILT m1, 26
    packusdw    m1, m1
    movhps      m1, [r2 + 26]           ; fraction 0: upper half is the raw reference words

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    ; ---- group 2: same fractions as group 0, reference 26 bytes lower ----
    MODE_17_19_REF  24
    MODE_17_19_FILT m4, 6

    MODE_17_19_REF  22
    MODE_17_19_FILT m1, 12
    packusdw    m4, m1

    MODE_17_19_REF  20
    MODE_17_19_FILT m5, 18

    MODE_17_19_REF  18
    MODE_17_19_FILT m0, 24
    packusdw    m5, m0

    MODE_17_19_REF  16
    MODE_17_19_FILT m6, 30
    MODE_17_19_FILT m1, 4
    packusdw    m6, m1

    MODE_17_19_REF  14
    MODE_17_19_FILT m1, 10

    MODE_17_19_REF  12
    MODE_17_19_FILT m0, 16
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    ; ---- group 3: same fractions as group 1, reference 26 bytes lower ----
    MODE_17_19_REF  10
    MODE_17_19_FILT m4, 22

    MODE_17_19_REF  8
    MODE_17_19_FILT m1, 28
    packusdw    m4, m1
    MODE_17_19_FILT m5, 2

    MODE_17_19_REF  6
    MODE_17_19_FILT m6, 8
    packusdw    m5, m6

    MODE_17_19_REF  4
    MODE_17_19_FILT m6, 14

    MODE_17_19_REF  2
    MODE_17_19_FILT m1, 20
    packusdw    m6, m1

    MODE_17_19_REF  0
    MODE_17_19_FILT m1, 26
    packusdw    m1, m1
    movhps      m1, [r2]                ; fraction 0: upper half is the raw reference words

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
20345
20346 ;------------------------------------------------------------------------------------------
20347 ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
20348 ;------------------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,6,6
    ; Pick the reference half: src itself when dirMode == 34,
    ; src + 128 bytes for any other mode dispatched here.
    lea         r4, [r2 + 128]
    cmp         r3m, byte 34
    cmovne      r2, r4

    add         r1, r1                  ; stride is doubled: samples are stored as words
    lea         r3, [r1 * 2]
    lea         r4, [r1 * 3]
    mov         r5d, 2                  ; MODE_2_34 is expanded for two passes

.half:
    MODE_2_34
    add         r2, 32                  ; move 16 samples further along the reference
    dec         r5d
    jnz         .half
    RET
20367
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,6,8
    ; r1 = stride in bytes, r5 = 3 * stride, r2 = src + 128 bytes,
    ; r3 = row 16 of ang_table, r4 = pass counter
    add         r1, r1
    lea         r5, [r1 * 3]
    add         r2, 128
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8

.loop:
    MODE_3_33 1
    add         r2, 8                   ; next four reference samples
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20383
INIT_XMM sse4
cglobal intra_pred_ang32_4, 3,6,8
    ; r1 = stride in bytes, r5 = 3 * stride, r2 = src + 128 bytes,
    ; r3 = row 16 of ang_table, r4 = pass counter
    add         r1, r1
    lea         r5, [r1 * 3]
    add         r2, 128
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8

.loop:
    MODE_4_32 1
    add         r2, 8                   ; next four reference samples
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20399
INIT_XMM sse4
cglobal intra_pred_ang32_5, 3,6,8
    ; r1 = stride in bytes, r5 = 3 * stride, r2 = src + 128 bytes,
    ; r3 = row 16 of ang_table, r4 = pass counter
    add         r1, r1
    lea         r5, [r1 * 3]
    add         r2, 128
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8

.loop:
    MODE_5_31 1
    add         r2, 8                   ; next four reference samples
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20415
INIT_XMM sse4
cglobal intra_pred_ang32_6, 3,6,8
    ; r1 = stride in bytes, r5 = 3 * stride, r2 = src + 128 bytes,
    ; r3 = row 16 of ang_table, r4 = pass counter
    add         r1, r1
    lea         r5, [r1 * 3]
    add         r2, 128
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8

.loop:
    MODE_6_30 1
    add         r2, 8                   ; next four reference samples
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20431
INIT_XMM sse4
cglobal intra_pred_ang32_7, 3,6,8
    ; r1 = stride in bytes, r5 = 3 * stride, r2 = src + 128 bytes,
    ; r3 = row 16 of ang_table, r4 = pass counter
    add         r1, r1
    lea         r5, [r1 * 3]
    add         r2, 128
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8

.loop:
    MODE_7_29 1
    add         r2, 8                   ; next four reference samples
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20447
INIT_XMM sse4
cglobal intra_pred_ang32_8, 3,6,8
    ; r1 = stride in bytes, r5 = 3 * stride, r2 = src + 128 bytes,
    ; r3 = row 16 of ang_table, r4 = pass counter
    add         r1, r1
    lea         r5, [r1 * 3]
    add         r2, 128
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8

.loop:
    MODE_8_28 1
    add         r2, 8                   ; next four reference samples
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20463
INIT_XMM sse4
cglobal intra_pred_ang32_9, 3,6,8
    ; r1 = stride in bytes, r5 = 3 * stride, r2 = src + 128 bytes,
    ; r3 = row 16 of ang_table, r4 = pass counter
    add         r1, r1
    lea         r5, [r1 * 3]
    add         r2, 128
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8

.loop:
    MODE_9_27 1
    add         r2, 8                   ; next four reference samples
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20479
; Helper for intra_pred_ang32_10: write m1 to the four 16-byte columns of the
; 64-byte output row whose address expression is given as the argument.
%macro ANG32_10_ROW 1
    movu        [%1], m1
    movu        [%1 + 16], m1
    movu        [%1 + 32], m1
    movu        [%1 + 48], m1
%endmacro

INIT_XMM sse4
cglobal intra_pred_ang32_10, 3,7,8
    ; Each output row is one reference word replicated across the row by the
    ; c_mode32_10_0 shuffle.  Every loop pass consumes 8 reference words
    ; (starting at src + 128 + 2 bytes) and emits 8 rows; 4 passes in total.
    add         r2, 128
    add         r1, r1                  ; stride in bytes
    lea         r4, [r1 * 2]
    lea         r5, [r1 * 3]
    lea         r3, [r1 * 4]
    mova        m7, [c_mode32_10_0]
    mov         r6d, 4

.loop:
    movu        m0, [r2 + 2]

    pshufb      m1, m0, m7              ; word 0
    ANG32_10_ROW r0

    palignr     m1, m0, 2               ; word 1 moved to the bottom of m1
    pshufb      m1, m7
    ANG32_10_ROW r0 + r1

    palignr     m1, m0, 4
    pshufb      m1, m7
    ANG32_10_ROW r0 + r4

    palignr     m1, m0, 6
    pshufb      m1, m7
    ANG32_10_ROW r0 + r5

    add         r0, r3                  ; four rows down

    palignr     m1, m0, 8
    pshufb      m1, m7
    ANG32_10_ROW r0

    palignr     m1, m0, 10
    pshufb      m1, m7
    ANG32_10_ROW r0 + r1

    palignr     m1, m0, 12
    pshufb      m1, m7
    ANG32_10_ROW r0 + r4

    palignr     m1, m0, 14
    pshufb      m1, m7
    ANG32_10_ROW r0 + r5

    add         r0, r3                  ; four more rows down
    add         r2, 16                  ; next eight reference words
    dec         r6d
    jnz         .loop
    RET
20554
INIT_XMM sse4
cglobal intra_pred_ang32_11, 3,6,7,0-(4*mmsize+4)
    ; Assemble the working reference array on the stack:
    ;   [rsp]          one word taken from src + 32 bytes
    ;   [rsp+2 .. 65]  64 bytes from src + 128, first word replaced by the word at src
    ;   [rsp+66]       the word that follows them (src + 192)
    movu        m0, [r2 + 128 + 0*mmsize]
    movu        m1, [r2 + 128 + 1*mmsize]
    movu        m2, [r2 + 128 + 2*mmsize]
    movu        m3, [r2 + 128 + 3*mmsize]
    pinsrw      m0, [r2], 0
    movu        [rsp + 0*mmsize + 2], m0
    movu        [rsp + 1*mmsize + 2], m1
    movu        [rsp + 2*mmsize + 2], m2
    movu        [rsp + 3*mmsize + 2], m3
    movzx       r4d, word [r2 + 32]
    mov         [rsp], r4w
    movzx       r4d, word [r2 + 128 + 64]
    mov         [rsp + 66], r4w

    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r2, rsp                 ; the macro reads its reference through r2
    mov         r4d, 8

.loop:
    MODE_11_25 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20586
INIT_XMM sse4
cglobal intra_pred_ang32_12, 3,6,7,0-(4*mmsize+10)
    ; Working reference array on the stack:
    ;   [rsp .. 7]      four single words picked from the first half of src
    ;   [rsp+8 .. 71]   64 bytes from src + 128, first word replaced by the word at src
    ;   [rsp+72]        the word that follows them (src + 192)
    movu        m0, [r2 + 128 + 0*mmsize]
    movu        m1, [r2 + 128 + 1*mmsize]
    movu        m2, [r2 + 128 + 2*mmsize]
    movu        m3, [r2 + 128 + 3*mmsize]
    pinsrw      m0, [r2], 0
    movu        [rsp + 0*mmsize + 8], m0
    movu        [rsp + 1*mmsize + 8], m1
    movu        [rsp + 2*mmsize + 8], m2
    movu        [rsp + 3*mmsize + 8], m3
    movzx       r4d, word [r2 + 128 + 64]
    mov         [rsp + 72], r4w

    ; source byte offsets 12, 26, 38, 52 land at stack offsets 6, 4, 2, 0
    movzx       r4d, word [r2 + 12]
    mov         [rsp + 6], r4w
    movzx       r4d, word [r2 + 26]
    mov         [rsp + 4], r4w
    movzx       r4d, word [r2 + 38]
    mov         [rsp + 2], r4w
    movzx       r4d, word [r2 + 52]
    mov         [rsp], r4w

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_12_24 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20626
INIT_XMM sse4
cglobal intra_pred_ang32_13, 3,6,7,0-(5*mmsize+2)
    ; Working reference array on the stack:
    ;   [rsp .. 15]     words gathered from the first half of src
    ;   [rsp+16 .. 79]  64 bytes from src + 128, first word replaced by the word at src
    ;   [rsp+80]        the word that follows them (src + 192)
    movu        m0, [r2 + 128 + 0*mmsize]
    movu        m1, [r2 + 128 + 1*mmsize]
    movu        m2, [r2 + 128 + 2*mmsize]
    movu        m3, [r2 + 128 + 3*mmsize]
    pinsrw      m0, [r2], 0
    movu        [rsp + 1*mmsize], m0
    movu        [rsp + 2*mmsize], m1
    movu        [rsp + 3*mmsize], m2
    movu        [rsp + 4*mmsize], m3
    movzx       r4d, word [r2 + 128 + 64]
    mov         [rsp + 80], r4w

    ; two shuffled 8-byte groups, then the first word of each is patched
    ; (the patch must come after the movh that covers the same bytes)
    movu        m0, [r2 + 8]
    movu        m1, [r2 + 36]
    pshufb      m0, [shuf_mode_13_23]
    pshufb      m1, [shuf_mode_13_23]
    movh        [rsp + 8], m0
    movh        [rsp], m1
    movzx       r4d, word [r2 + 28]
    mov         [rsp + 8], r4w
    movzx       r4d, word [r2 + 56]
    mov         [rsp], r4w

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_13_23 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20668
INIT_XMM sse4
cglobal intra_pred_ang32_14, 3,6,7,0-(5*mmsize+10)
    ; Working reference array on the stack (90 bytes reserved):
    ;   [rsp .. 23]     words gathered from the first half of src
    ;   [rsp+24 .. 87]  64 bytes from src + 128, first word replaced by the word at src
    ;   [rsp+88]        the word that follows them (src + 192)
    mov         r3, r2mp
    add         r2, 128
    movu        m0, [r2 + 0*mmsize]
    pinsrw      m0, [r3], 0
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 1*mmsize + 8], m0
    movu        [rsp + 2*mmsize + 8], m1
    movu        [rsp + 3*mmsize + 8], m2
    movu        [rsp + 4*mmsize + 8], m3

    mov         r4w, [r2 + 64]
    mov         [rsp + 88], r4w
    mov         r4w, [r3 + 4]
    mov         [rsp + 22], r4w
    movu        m0, [r3 + 10]
    movu        m1, [r3 + 30]
    movu        m2, [r3 + 50]
    pshufb      m0, [shuf_mode_14_22]
    pshufb      m1, [shuf_mode_14_22]
    pshufb      m2, [shuf_mode_14_22]

    ; The lowest group used to be stored with "movh [rsp - 2], m2", which put
    ; two bytes below the reserved stack area.  Only bytes 2..7 of m2 are ever
    ; read back (as [rsp+0 .. rsp+5]), so shift them down and store at rsp
    ; instead.  That store also covers [rsp+6 .. rsp+7]; the m1 store that
    ; follows rewrites those two bytes, hence the m2 -> m1 -> m0 order.
    psrldq      m2, 2
    movh        [rsp], m2
    movh        [rsp + 6], m1
    movh        [rsp + 14], m0

    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8
    mov         r2, rsp
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro

.loop:
    MODE_14_22 1
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
20711
INIT_XMM sse4
cglobal intra_pred_ang32_15, 3,6,7,0-(6*mmsize+2)
    ; Working reference array on the stack:
    ;   [rsp .. 31]     four shuffled 8-byte groups gathered from the first half of src
    ;   [rsp+32 .. 95]  64 bytes from src + 128, first word replaced by the word at src
    ;   [rsp+96]        the word that follows them (src + 192)
    movu        m0, [r2 + 128 + 0*mmsize]
    movu        m1, [r2 + 128 + 1*mmsize]
    movu        m2, [r2 + 128 + 2*mmsize]
    movu        m3, [r2 + 128 + 3*mmsize]
    pinsrw      m0, [r2], 0
    movu        [rsp + 2*mmsize], m0
    movu        [rsp + 3*mmsize], m1
    movu        [rsp + 4*mmsize], m2
    movu        [rsp + 5*mmsize], m3
    movzx       r4d, word [r2 + 128 + 64]
    mov         [rsp + 96], r4w

    movu        m0, [r2 + 4]
    movu        m1, [r2 + 18]
    movu        m2, [r2 + 34]
    movu        m3, [r2 + 48]
    pshufb      m0, [shuf_mode_15_21]
    pshufb      m1, [shuf_mode_15_21]
    pshufb      m2, [shuf_mode_15_21]
    pshufb      m3, [shuf_mode_15_21]
    movh        [rsp], m3
    movh        [rsp + 8], m2
    movh        [rsp + 16], m1
    movh        [rsp + 24], m0

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_15_21 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20755
INIT_XMM sse4
cglobal intra_pred_ang32_16, 3,6,7,0-(6*mmsize+10)
    ; Working reference array on the stack:
    ;   [rsp .. 39]      words gathered from the first half of src
    ;   [rsp+40 .. 103]  64 bytes from src + 128, first word replaced by the word at src
    ;   [rsp+104]        the word that follows them (src + 192)
    movu        m0, [r2 + 128 + 0*mmsize]
    movu        m1, [r2 + 128 + 1*mmsize]
    movu        m2, [r2 + 128 + 2*mmsize]
    movu        m3, [r2 + 128 + 3*mmsize]
    pinsrw      m0, [r2], 0
    movu        [rsp + 2*mmsize + 8], m0
    movu        [rsp + 3*mmsize + 8], m1
    movu        [rsp + 4*mmsize + 8], m2
    movu        [rsp + 5*mmsize + 8], m3
    movzx       r4d, word [r2 + 128 + 64]
    mov         [rsp + 104], r4w

    movu        m0, [r2 + 4]
    movu        m1, [r2 + 22]
    movu        m2, [r2 + 40]
    movd        m3, [r2 + 58]
    pshufb      m0, [shuf_mode_16_20]
    pshufb      m1, [shuf_mode_16_20]
    pshufb      m2, [shuf_mode_16_20]
    pshufb      m3, [shuf_mode_16_20]
    ; These four stores overlap on purpose: each later store overwrites the
    ; bottom of the region written just before it, so their order is fixed.
    movu        [rsp + 24], m0
    movu        [rsp + 12], m1
    movu        [rsp], m2
    movd        [rsp], m3

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_16_20 1
    add         r2, 8
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    dec         r4d
    jnz         .loop
    RET
20799
INIT_XMM sse4
cglobal intra_pred_ang32_17, 3,6,7,0-(7*mmsize+4)
    ; Working reference array on the stack (116 bytes reserved):
    ;   [rsp .. 49]      words gathered from the first half of src
    ;   [rsp+50 .. 113]  64 bytes from src + 128, first word replaced by the word at src
    ;   [rsp+114]        the word that follows them (src + 192)
    mov         r3, r2mp
    add         r2, 128
    movu        m0, [r2 + 0*mmsize]
    pinsrw      m0, [r3], 0
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 3*mmsize + 2], m0
    movu        [rsp + 4*mmsize + 2], m1
    movu        [rsp + 5*mmsize + 2], m2
    movu        [rsp + 6*mmsize + 2], m3

    mov         r4w, [r2 + 64]
    mov         [rsp + 114], r4w
    movu        m0, [r3 + 8]
    movu        m1, [r3 + 30]
    movu        m2, [r3 + 50]
    movd        m3, [r3 + 2]
    pshufb      m0, [shuf_mode_17_19]
    pshufb      m1, [shuf_mode_17_19]
    pshufb      m2, [shuf_mode_17_19]
    pshufb      m3, [shuf_mode_16_20]
    movd        [rsp + 46], m3
    movu        [rsp + 30], m0

    ; The lowest group used to be stored with "movu [rsp - 4], m2", which put
    ; four bytes below the reserved stack area.  Only bytes 4..15 of m2 are
    ; ever read back (as [rsp+0 .. rsp+11]), so shift them down and store at
    ; rsp instead.  The four zero bytes this leaves at [rsp+12 .. rsp+15] are
    ; rewritten by the m1 store, which therefore has to come second.
    psrldq      m2, 4
    movu        [rsp], m2
    movu        [rsp + 12], m1

    ; single-word patches on top of the vectors stored above
    mov         r4w, [r3 + 24]
    mov         [rsp + 30], r4w
    mov         r4w, [r3 + 28]
    mov         [rsp + 28], r4w
    mov         r4w, [r3 + 46]
    mov         [rsp + 12], r4w

    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8
    mov         r2, rsp
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro

.loop:
    MODE_17_19 1
    lea         r0, [r0 + r1 * 4]       ; dst moves down four rows
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
20849
; Helpers for intra_pred_ang32_18.  An output row is 64 bytes, written as four
; 16-byte columns.
;
; ANG32_18_COPY_ROW  addr, c0, c1, c2, c3
;   Stores the four registers to columns 0..3 of the row at addr.
%macro ANG32_18_COPY_ROW 5
    movu        [%1], %2
    movu        [%1 + 16], %3
    movu        [%1 + 32], %4
    movu        [%1 + 48], %5
%endmacro

; ANG32_18_SHIFT_ROW addr, x0, x1, x2, x3, x4, shift
;   x0..x4 are five consecutive 16-byte pieces of one long sample vector, lowest
;   first.  Column k receives palignr(x[k+1], x[k], shift); m6 is the scratch.
%macro ANG32_18_SHIFT_ROW 7
    palignr     m6, %3, %2, %7
    movu        [%1], m6
    palignr     m6, %4, %3, %7
    movu        [%1 + 16], m6
    palignr     m6, %5, %4, %7
    movu        [%1 + 32], m6
    palignr     m6, %6, %5, %7
    movu        [%1 + 48], m6
%endmacro

; ANG32_18_OCTET x0, x1, x2, x3, x4
;   Emits eight rows starting at r0: the first is a plain copy of x1..x4, each
;   following row is shifted by one more word towards x0.  r0 is advanced by r4
;   (four rows) after the fourth row and is left pointing at the fifth.
%macro ANG32_18_OCTET 5
    ANG32_18_COPY_ROW  r0,      %2, %3, %4, %5
    ANG32_18_SHIFT_ROW r0 + r1, %1, %2, %3, %4, %5, 14
    ANG32_18_SHIFT_ROW r0 + r6, %1, %2, %3, %4, %5, 12
    ANG32_18_SHIFT_ROW r0 + r3, %1, %2, %3, %4, %5, 10
    add         r0, r4
    ANG32_18_SHIFT_ROW r0,      %1, %2, %3, %4, %5, 8
    ANG32_18_SHIFT_ROW r0 + r1, %1, %2, %3, %4, %5, 6
    ANG32_18_SHIFT_ROW r0 + r6, %1, %2, %3, %4, %5, 4
    ANG32_18_SHIFT_ROW r0 + r3, %1, %2, %3, %4, %5, 2
%endmacro

INIT_XMM sse4
cglobal intra_pred_ang32_18, 3,7,8
    ; m0..m3 = the first 32 words of src, in order
    movu        m0, [r2]                ; [7 6 5 4 3 2 1 0]
    movu        m1, [r2 + 16]           ; [15 14 13 12 11 10 9 8]
    movu        m2, [r2 + 32]           ; [23 22 21 20 19 18 17 16]
    movu        m3, [r2 + 48]           ; [31 30 29 28 27 26 25 24]
    ; m4, m5 = words 1..8 and 9..16 of the half at src + 128, run through the
    ; shuf_mode32_18 mask so they continue the vector below m0
    movu        m4, [r2 + 128 + 2]      ; [8 7 6 5 4 3 2 1]
    movu        m5, [r2 + 128 + 18]     ; [16 15 14 13 12 11 10 9]
    pshufb      m4, [shuf_mode32_18]    ; [1 2 3 4 5 6 7 8]
    pshufb      m5, [shuf_mode32_18]    ; [9 10 11 12 13 14 15 16]

    add         r1, r1                  ; stride in bytes
    lea         r6, [r1 * 2]
    lea         r3, [r1 * 3]
    lea         r4, [r1 * 4]

    ; rows 0-7
    ANG32_18_OCTET m4, m0, m1, m2, m3
    add         r0, r4

    ; rows 8-15
    ANG32_18_OCTET m5, m4, m0, m1, m2
    add         r0, r4

    ; m2/m3 are no longer needed as the top of the vector; reuse them for the
    ; next two shuffled pieces from the src + 128 half
    movu        m2, [r2 + 128 + 34]
    movu        m3, [r2 + 128 + 50]
    pshufb      m2, [shuf_mode32_18]
    pshufb      m3, [shuf_mode32_18]

    ; rows 16-23
    ANG32_18_OCTET m2, m5, m4, m0, m1
    add         r0, r4

    ; rows 24-31
    ANG32_18_OCTET m3, m2, m5, m4, m0
    RET
21160
INIT_XMM sse4
cglobal intra_pred_ang32_19, 3,7,7,0-(7*mmsize+4)
    ; Working reference array on the stack (116 bytes reserved):
    ;   [rsp .. 49]      words gathered from the half at src + 128
    ;   [rsp+50 .. 113]  the first 64 bytes of src
    ;   [rsp+114]        the word that follows them (src + 64)
    lea         r3, [r2 + 128]
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 3*mmsize + 2], m0
    movu        [rsp + 4*mmsize + 2], m1
    movu        [rsp + 5*mmsize + 2], m2
    movu        [rsp + 6*mmsize + 2], m3

    mov         r4w, [r2 + 64]
    mov         [rsp + 114], r4w
    movu        m0, [r3 + 8]
    movu        m1, [r3 + 30]
    movu        m2, [r3 + 50]
    movd        m3, [r3 + 2]
    pshufb      m0, [shuf_mode_17_19]
    pshufb      m1, [shuf_mode_17_19]
    pshufb      m2, [shuf_mode_17_19]
    pshufb      m3, [shuf_mode_16_20]
    movd        [rsp + 46], m3
    movu        [rsp + 30], m0

    ; The lowest group used to be stored with "movu [rsp - 4], m2", which put
    ; four bytes below the reserved stack area.  Only bytes 4..15 of m2 are
    ; ever read back (as [rsp+0 .. rsp+11]), so shift them down and store at
    ; rsp instead.  The four zero bytes this leaves at [rsp+12 .. rsp+15] are
    ; rewritten by the m1 store, which therefore has to come second.
    psrldq      m2, 4
    movu        [rsp], m2
    movu        [rsp + 12], m1

    ; single-word patches on top of the vectors stored above
    mov         r4w, [r3 + 24]
    mov         [rsp + 30], r4w
    mov         r4w, [r3 + 28]
    mov         [rsp + 28], r4w
    mov         r4w, [r3 + 46]
    mov         [rsp + 12], r4w

    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8
    mov         r2, rsp
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    mov         r6, r0                  ; r6 tracks the dst position across passes

.loop:
    MODE_17_19 0
    add         r6, 8                   ; 8 bytes (four samples) to the right
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
21210
INIT_XMM sse4
cglobal intra_pred_ang32_20, 3,7,7,0-(6*mmsize+10)
    ; Working reference array on the stack:
    ;   [rsp .. 39]      words gathered from the half at src + 128
    ;   [rsp+40 .. 103]  the first 64 bytes of src
    ;   [rsp+104]        the word that follows them (src + 64)
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 2*mmsize + 8], m0
    movu        [rsp + 3*mmsize + 8], m1
    movu        [rsp + 4*mmsize + 8], m2
    movu        [rsp + 5*mmsize + 8], m3
    movzx       r4d, word [r2 + 64]
    mov         [rsp + 104], r4w

    movu        m0, [r2 + 128 + 4]
    movu        m1, [r2 + 128 + 22]
    movu        m2, [r2 + 128 + 40]
    movd        m3, [r2 + 128 + 58]
    pshufb      m0, [shuf_mode_16_20]
    pshufb      m1, [shuf_mode_16_20]
    pshufb      m2, [shuf_mode_16_20]
    pshufb      m3, [shuf_mode_16_20]
    ; These four stores overlap on purpose: each later store overwrites the
    ; bottom of the region written just before it, so their order is fixed.
    movu        [rsp + 24], m0
    movu        [rsp + 12], m1
    movu        [rsp], m2
    movd        [rsp], m3

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r6, r0                  ; r6 tracks the dst position across passes
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_16_20 0
    add         r2, 8
    add         r6, 8                   ; 8 bytes (four samples) to the right
    mov         r0, r6
    dec         r4d
    jnz         .loop
    RET
21254
INIT_XMM sse4
cglobal intra_pred_ang32_21, 3,7,7,0-(6*mmsize+2)
    ; Working reference array on the stack:
    ;   [rsp .. 31]     four shuffled 8-byte groups gathered from the half at src + 128
    ;   [rsp+32 .. 95]  the first 64 bytes of src
    ;   [rsp+96]        the word that follows them (src + 64)
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 2*mmsize], m0
    movu        [rsp + 3*mmsize], m1
    movu        [rsp + 4*mmsize], m2
    movu        [rsp + 5*mmsize], m3
    movzx       r4d, word [r2 + 64]
    mov         [rsp + 96], r4w

    movu        m0, [r2 + 128 + 4]
    movu        m1, [r2 + 128 + 18]
    movu        m2, [r2 + 128 + 34]
    movu        m3, [r2 + 128 + 48]
    pshufb      m0, [shuf_mode_15_21]
    pshufb      m1, [shuf_mode_15_21]
    pshufb      m2, [shuf_mode_15_21]
    pshufb      m3, [shuf_mode_15_21]
    movh        [rsp], m3
    movh        [rsp + 8], m2
    movh        [rsp + 16], m1
    movh        [rsp + 24], m0

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r6, r0                  ; r6 tracks the dst position across passes
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_15_21 0
    add         r2, 8
    add         r6, 8                   ; 8 bytes (four samples) to the right
    mov         r0, r6
    dec         r4d
    jnz         .loop
    RET
21298
INIT_XMM sse4
cglobal intra_pred_ang32_22, 3,7,7,0-(5*mmsize+10)
    ; Working reference array on the stack (90 bytes reserved):
    ;   [rsp .. 23]     words gathered from the half at src + 128
    ;   [rsp+24 .. 87]  the first 64 bytes of src
    ;   [rsp+88]        the word that follows them (src + 64)
    lea         r3, [r2 + 128]
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 1*mmsize + 8], m0
    movu        [rsp + 2*mmsize + 8], m1
    movu        [rsp + 3*mmsize + 8], m2
    movu        [rsp + 4*mmsize + 8], m3

    mov         r4w, [r2 + 64]
    mov         [rsp + 88], r4w
    mov         r4w, [r3 + 4]
    mov         [rsp + 22], r4w
    movu        m0, [r3 + 10]
    movu        m1, [r3 + 30]
    movu        m2, [r3 + 50]
    pshufb      m0, [shuf_mode_14_22]
    pshufb      m1, [shuf_mode_14_22]
    pshufb      m2, [shuf_mode_14_22]

    ; The lowest group used to be stored with "movh [rsp - 2], m2", which put
    ; two bytes below the reserved stack area.  Only bytes 2..7 of m2 are ever
    ; read back (as [rsp+0 .. rsp+5]), so shift them down and store at rsp
    ; instead.  That store also covers [rsp+6 .. rsp+7]; the m1 store that
    ; follows rewrites those two bytes, hence the m2 -> m1 -> m0 order.
    psrldq      m2, 2
    movh        [rsp], m2
    movh        [rsp + 6], m1
    movh        [rsp + 14], m0

    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 8
    mov         r2, rsp
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    mov         r6, r0                  ; r6 tracks the dst position across passes

.loop:
    MODE_14_22 0
    add         r6, 8                   ; 8 bytes (four samples) to the right
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
21341
INIT_XMM sse4
cglobal intra_pred_ang32_23, 3,7,7,0-(5*mmsize+2)
    ; Working reference array on the stack:
    ;   [rsp .. 15]     words gathered from the half at src + 128
    ;   [rsp+16 .. 79]  the first 64 bytes of src
    ;   [rsp+80]        the word that follows them (src + 64)
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 1*mmsize], m0
    movu        [rsp + 2*mmsize], m1
    movu        [rsp + 3*mmsize], m2
    movu        [rsp + 4*mmsize], m3
    movzx       r4d, word [r2 + 64]
    mov         [rsp + 80], r4w

    ; two shuffled 8-byte groups, then the first word of each is patched
    ; (the patch must come after the movh that covers the same bytes)
    movu        m0, [r2 + 128 + 8]
    movu        m1, [r2 + 128 + 36]
    pshufb      m0, [shuf_mode_13_23]
    pshufb      m1, [shuf_mode_13_23]
    movh        [rsp + 8], m0
    movh        [rsp], m1
    movzx       r4d, word [r2 + 128 + 28]
    mov         [rsp + 8], r4w
    movzx       r4d, word [r2 + 128 + 56]
    mov         [rsp], r4w

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r6, r0                  ; r6 tracks the dst position across passes
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_13_23 0
    add         r2, 8
    add         r6, 8                   ; 8 bytes (four samples) to the right
    mov         r0, r6
    dec         r4d
    jnz         .loop
    RET
21383
INIT_XMM sse4
cglobal intra_pred_ang32_24, 3,7,7,0-(4*mmsize+10)
    ; Working reference array on the stack:
    ;   [rsp .. 7]      four single words picked from the half at src + 128
    ;   [rsp+8 .. 71]   the first 64 bytes of src
    ;   [rsp+72]        the word that follows them (src + 64)
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 0*mmsize + 8], m0
    movu        [rsp + 1*mmsize + 8], m1
    movu        [rsp + 2*mmsize + 8], m2
    movu        [rsp + 3*mmsize + 8], m3
    movzx       r4d, word [r2 + 64]
    mov         [rsp + 72], r4w

    ; byte offsets 12, 26, 38, 52 of the second half land at stack offsets 6, 4, 2, 0
    movzx       r4d, word [r2 + 128 + 12]
    mov         [rsp + 6], r4w
    movzx       r4d, word [r2 + 128 + 26]
    mov         [rsp + 4], r4w
    movzx       r4d, word [r2 + 128 + 38]
    mov         [rsp + 2], r4w
    movzx       r4d, word [r2 + 128 + 52]
    mov         [rsp], r4w

    mova        m2, [pw_punpcklwd]      ; shuffle mask expected in m2 by the macro
    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r6, r0                  ; r6 tracks the dst position across passes
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_12_24 0
    add         r2, 8
    add         r6, 8                   ; 8 bytes (four samples) to the right
    mov         r0, r6
    dec         r4d
    jnz         .loop
    RET
21424
INIT_XMM sse4
cglobal intra_pred_ang32_25, 3,7,7,0-(4*mmsize+4)
    ; Working reference array on the stack:
    ;   [rsp]          one word taken from src + 128 + 32 bytes
    ;   [rsp+2 .. 65]  the first 64 bytes of src
    ;   [rsp+66]       the word that follows them (src + 64)
    movu        m0, [r2 + 0*mmsize]
    movu        m1, [r2 + 1*mmsize]
    movu        m2, [r2 + 2*mmsize]
    movu        m3, [r2 + 3*mmsize]
    movu        [rsp + 0*mmsize + 2], m0
    movu        [rsp + 1*mmsize + 2], m1
    movu        [rsp + 2*mmsize + 2], m2
    movu        [rsp + 3*mmsize + 2], m3
    movzx       r4d, word [r2 + 128 + 32]
    mov         [rsp], r4w
    movzx       r4d, word [r2 + 64]
    mov         [rsp + 66], r4w

    add         r1, r1                  ; stride in bytes
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mov         r6, r0                  ; r6 tracks the dst position across passes
    mov         r2, rsp
    mov         r4d, 8

.loop:
    MODE_11_25 0
    add         r2, 8
    add         r6, 8                   ; 8 bytes (four samples) to the right
    mov         r0, r6
    dec         r4d
    jnz         .loop
    RET
21456
21457 INIT_XMM sse4
; intra_pred_ang32_26: replicates the 64 bytes found at r2 + 2 into 32 output rows.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
; The loop body writes 8 rows and runs 4 times. Only m0-m3 are used: the former load of
; c_mode32_10_0 into m4 was never read, so it and the fifth XMM register are dropped.
21458 cglobal intra_pred_ang32_26, 3,7,4
21459 mov r6d, 4 ; 4 passes x 8 rows = 32 rows
21460 add r1, r1 ; r1 = 2 * r1 (row pitch used below)
21461 lea r3, [r1 * 2] ; r3 = offset of row 2
21462 lea r4, [r1 * 3] ; r4 = offset of row 3
21463 lea r5, [r1 * 4] ; r5 = offset of the next 4-row group
21465
21466 movu m0, [r2 + 2 ] ; the 64 source bytes, loaded once
21467 movu m1, [r2 + 18]
21468 movu m2, [r2 + 34]
21469 movu m3, [r2 + 50]
21470
21471 .loop:
21472 movu [r0], m0 ; rows 0..3 of this pass
21473 movu [r0 + 16], m1
21474 movu [r0 + 32], m2
21475 movu [r0 + 48], m3
21476
21477 movu [r0 + r1], m0
21478 movu [r0 + r1 + 16], m1
21479 movu [r0 + r1 + 32], m2
21480 movu [r0 + r1 + 48], m3
21481
21482 movu [r0 + r3], m0
21483 movu [r0 + r3 + 16], m1
21484 movu [r0 + r3 + 32], m2
21485 movu [r0 + r3 + 48], m3
21486
21487 movu [r0 + r4], m0
21488 movu [r0 + r4 + 16], m1
21489 movu [r0 + r4 + 32], m2
21490 movu [r0 + r4 + 48], m3
21491
21492 add r0, r5 ; step to rows 4..7 of this pass
21493
21494 movu [r0], m0
21495 movu [r0 + 16], m1
21496 movu [r0 + 32], m2
21497 movu [r0 + 48], m3
21498
21499 movu [r0 + r1], m0
21500 movu [r0 + r1 + 16], m1
21501 movu [r0 + r1 + 32], m2
21502 movu [r0 + r1 + 48], m3
21503
21504 movu [r0 + r3], m0
21505 movu [r0 + r3 + 16], m1
21506 movu [r0 + r3 + 32], m2
21507 movu [r0 + r3 + 48], m3
21508
21509 movu [r0 + r4], m0
21510 movu [r0 + r4 + 16], m1
21511 movu [r0 + r4 + 32], m2
21512 movu [r0 + r4 + 48], m3
21513
21514 add r0, r5 ; step to the next pass
21515 dec r6d
21516 jnz .loop
21517 RET
21518
21519 INIT_XMM sse4
; intra_pred_ang32_27: runs the MODE_9_27 macro (defined elsewhere in this file) eight
; times, moving the output base and the reference pointer r2 by 8 bytes per pass.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
21520 cglobal intra_pred_ang32_27, 3,7,8
21521 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table entry 16 (entries are 16 bytes each)
21522 add r1, r1 ; r1 = 2 * r1
21523 lea r5, [r1 * 3] ; r5 = 3 * (doubled) r1
21524 mov r6, r0 ; r6 = running output base
21525 mov r4d, 8 ; number of passes
21526
21527 .loop:
21528 MODE_9_27 0
21529 add r6, 8 ; output base moves 8 bytes per pass
21530 mov r0, r6 ; r0 is re-derived from r6 for the next pass
21531 add r2, 8 ; reference moves 8 bytes per pass
21532 dec r4
21533 jnz .loop
21534 RET
21535
21536 INIT_XMM sse4
; intra_pred_ang32_28: runs the MODE_8_28 macro (defined elsewhere in this file) eight
; times, moving the output base and the reference pointer r2 by 8 bytes per pass.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
21537 cglobal intra_pred_ang32_28, 3,7,8
21538 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table entry 16 (entries are 16 bytes each)
21539 add r1, r1 ; r1 = 2 * r1
21540 lea r5, [r1 * 3] ; r5 = 3 * (doubled) r1
21541 mov r6, r0 ; r6 = running output base
21542 mov r4d, 8 ; number of passes
21543
21544 .loop:
21545 MODE_8_28 0
21546 add r6, 8 ; output base moves 8 bytes per pass
21547 mov r0, r6 ; r0 is re-derived from r6 for the next pass
21548 add r2, 8 ; reference moves 8 bytes per pass
21549 dec r4
21550 jnz .loop
21551 RET
21552
21553 INIT_XMM sse4
; intra_pred_ang32_29: runs the MODE_7_29 macro (defined elsewhere in this file) eight
; times, moving the output base and the reference pointer r2 by 8 bytes per pass.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
21554 cglobal intra_pred_ang32_29, 3,7,8
21555 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table entry 16 (entries are 16 bytes each)
21556 add r1, r1 ; r1 = 2 * r1
21557 lea r5, [r1 * 3] ; r5 = 3 * (doubled) r1
21558 mov r6, r0 ; r6 = running output base
21559 mov r4d, 8 ; number of passes
21560
21561 .loop:
21562 MODE_7_29 0
21563 add r6, 8 ; output base moves 8 bytes per pass
21564 mov r0, r6 ; r0 is re-derived from r6 for the next pass
21565 add r2, 8 ; reference moves 8 bytes per pass
21566 dec r4
21567 jnz .loop
21568 RET
21569
21570 INIT_XMM sse4
; intra_pred_ang32_30: runs the MODE_6_30 macro (defined elsewhere in this file) eight
; times, moving the output base and the reference pointer r2 by 8 bytes per pass.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
21571 cglobal intra_pred_ang32_30, 3,7,8
21572 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table entry 16 (entries are 16 bytes each)
21573 add r1, r1 ; r1 = 2 * r1
21574 lea r5, [r1 * 3] ; r5 = 3 * (doubled) r1
21575 mov r6, r0 ; r6 = running output base
21576 mov r4d, 8 ; number of passes
21577
21578 .loop:
21579 MODE_6_30 0
21580 add r6, 8 ; output base moves 8 bytes per pass
21581 mov r0, r6 ; r0 is re-derived from r6 for the next pass
21582 add r2, 8 ; reference moves 8 bytes per pass
21583 dec r4
21584 jnz .loop
21585 RET
21586
21587 INIT_XMM sse4
; intra_pred_ang32_31: runs the MODE_5_31 macro (defined elsewhere in this file) eight
; times, moving the output base and the reference pointer r2 by 8 bytes per pass.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
21588 cglobal intra_pred_ang32_31, 3,7,8
21589 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table entry 16 (entries are 16 bytes each)
21590 add r1, r1 ; r1 = 2 * r1
21591 lea r5, [r1 * 3] ; r5 = 3 * (doubled) r1
21592 mov r6, r0 ; r6 = running output base
21593 mov r4d, 8 ; number of passes
21594
21595 .loop:
21596 MODE_5_31 0
21597 add r6, 8 ; output base moves 8 bytes per pass
21598 mov r0, r6 ; r0 is re-derived from r6 for the next pass
21599 add r2, 8 ; reference moves 8 bytes per pass
21600 dec r4
21601 jnz .loop
21602 RET
21603
21604 INIT_XMM sse4
; intra_pred_ang32_32: runs the MODE_4_32 macro (defined elsewhere in this file) eight
; times, moving the output base and the reference pointer r2 by 8 bytes per pass.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
21605 cglobal intra_pred_ang32_32, 3,7,8
21606 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table entry 16 (entries are 16 bytes each)
21607 add r1, r1 ; r1 = 2 * r1
21608 lea r5, [r1 * 3] ; r5 = 3 * (doubled) r1
21609 mov r6, r0 ; r6 = running output base
21610 mov r4d, 8 ; number of passes
21611
21612 .loop:
21613 MODE_4_32 0
21614 add r6, 8 ; output base moves 8 bytes per pass
21615 mov r0, r6 ; r0 is re-derived from r6 for the next pass
21616 add r2, 8 ; reference moves 8 bytes per pass
21617 dec r4
21618 jnz .loop
21619 RET
21620
21621 INIT_XMM sse4
; intra_pred_ang32_33: runs the MODE_3_33 macro (defined elsewhere in this file) eight
; times, moving the output base and the reference pointer r2 by 8 bytes per pass.
; In:  r0 = output pointer, r1 = stride (doubled below; presumably passed in 16-bit
;      samples -- confirm), r2 = reference buffer.
21622 cglobal intra_pred_ang32_33, 3,7,8
21623 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table entry 16 (entries are 16 bytes each)
21624 add r1, r1 ; r1 = 2 * r1
21625 lea r5, [r1 * 3] ; r5 = 3 * (doubled) r1
21626 mov r6, r0 ; r6 = running output base
21627 mov r4d, 8 ; number of passes
21628 .loop:
21629 MODE_3_33 0
21630 add r6, 8 ; output base moves 8 bytes per pass
21631 mov r0, r6 ; r0 is re-derived from r6 for the next pass
21632 add r2, 8 ; reference moves 8 bytes per pass
21633 dec r4
21634 jnz .loop
21635 RET
21636
21637 ;-----------------------------------------------------------------------------------
21638 ; void intra_filter_NxN(const pixel* references, pixel* filtered)
21639 ;-----------------------------------------------------------------------------------
21640 INIT_XMM sse4
; intra_filter_4x4 (SSE4): smooths the 16-bit reference words ref[0..16] at r0 into r1 with
;   out[i] = (ref[i - 1] + 2 * ref[i] + ref[i + 1] + 2) >> 2
; Special cases visible below:
;   out[0]  uses ref[1] and ref[9] as its two neighbours,
;   out[9]  uses ref[0] as its previous sample,
;   out[8]  (topLast) and out[16] (LeftLast) are copied unfiltered.
; Lane lists in the bracketed comments are written highest lane first.
21641 cglobal intra_filter_4x4, 2,4,5
21642 mov r2w, word [r0 + 16] ; topLast
21643 mov r3w, word [r0 + 32] ; LeftLast
21644
21645 ; filtering top
21646 movu m0, [r0 + 0] ; ref[0..7]
21647 movu m1, [r0 + 16] ; ref[8..15]
21648 movu m2, [r0 + 32] ; ref[16..23]
21649
21650 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
21651 palignr m3, m1, m0, 4 ; ref[2..9]
21652 pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1]
21653
21654 psllw m0, 1 ; 2 * ref[i]
21655 paddw m4, m3
21656 paddw m0, m4
21657 paddw m0, [pw_2] ; rounding term
21658 psrlw m0, 2 ; out[0..7]
21659
21660 ; filtering left
21661 palignr m4, m1, m1, 14 ; rotate m1 up one word: [14 13 12 11 10 9 8 15]
21662 pinsrw m4, [r0], 1 ; lane 1 = ref[0], the previous sample for out[9]
21663 palignr m3, m2, m1, 4 ; ref[10..17]
21664 pshufb m3, [intra_filter4_shuf1] ; same mask as above: [16 15 14 13 12 11 10 17] samples[i + 1]
21665
21666 psllw m1, 1 ; 2 * ref[i]
21667 paddw m4, m3
21668 paddw m1, m4
21669 paddw m1, [pw_2] ; rounding term
21670 psrlw m1, 2 ; out[8..15]; lane 0 is replaced by topLast below
21671
21672 movu [r1], m0
21673 movu [r1 + 16], m1
21674 mov [r1 + 16], r2w ; topLast
21675 mov [r1 + 32], r3w ; LeftLast
21676 RET
21677
21678 INIT_XMM sse4
; intra_filter_8x8 (SSE4): smooths the 16-bit reference words ref[0..32] at r0 into r1 with
;   out[i] = (ref[i - 1] + 2 * ref[i] + ref[i + 1] + 2) >> 2
; Special cases visible below:
;   out[0]  uses ref[1] and ref[17] as its two neighbours,
;   out[17] uses ref[0] as its previous sample,
;   out[16] (topLast) and out[32] (LeftLast) are copied unfiltered.
21679 cglobal intra_filter_8x8, 2,4,6
21680 mov r2w, word [r0 + 32] ; topLast
21681 mov r3w, word [r0 + 64] ; LeftLast
21682
21683 ; filtering top
21684 movu m0, [r0] ; ref[0..7]
21685 movu m1, [r0 + 16] ; ref[8..15]
21686 movu m2, [r0 + 32] ; ref[16..23]
21687
21688 pshufb m4, m0, [intra_filter4_shuf0] ; samples[i - 1]; lane 0 holds ref[1]
21689 palignr m5, m1, m0, 2 ; ref[1..8] = samples[i + 1]
21690 pinsrw m5, [r0 + 34], 0 ; lane 0 = ref[17], the second neighbour of ref[0]
21691
21692 palignr m3, m1, m0, 14 ; ref[7..14] = samples[i - 1] for the second vector
21693 psllw m0, 1
21694 paddw m4, m5
21695 paddw m0, m4
21696 paddw m0, [pw_2]
21697 psrlw m0, 2 ; out[0..7]
21698
21699 palignr m4, m2, m1, 2 ; ref[9..16]
21700 psllw m1, 1
21701 paddw m4, m3
21702 paddw m1, m4
21703 paddw m1, [pw_2]
21704 psrlw m1, 2 ; out[8..15]
21705 movu [r1], m0
21706 movu [r1 + 16], m1
21707
21708 ; filtering left
21709 movu m1, [r0 + 48] ; ref[24..31]
21710 movu m0, [r0 + 64] ; ref[32..39]
21711
21712 palignr m4, m2, m2, 14 ; rotate m2 up one word: lanes = ref[23], ref[16..22]
21713 pinsrw m4, [r0], 1 ; lane 1 = ref[0], the previous sample for out[17]
21714 palignr m5, m1, m2, 2 ; ref[17..24]
21715
21716 palignr m3, m1, m2, 14 ; ref[23..30]
21717 palignr m0, m1, 2 ; ref[25..32]
21718
21719 psllw m2, 1
21720 paddw m4, m5
21721 paddw m2, m4
21722 paddw m2, [pw_2]
21723 psrlw m2, 2 ; out[16..23]; lane 0 is replaced by topLast below
21724
21725 psllw m1, 1
21726 paddw m0, m3
21727 paddw m1, m0
21728 paddw m1, [pw_2]
21729 psrlw m1, 2 ; out[24..31]
21730
21731 movu [r1 + 32], m2
21732 movu [r1 + 48], m1
21733 mov [r1 + 32], r2w ; topLast
21734 mov [r1 + 64], r3w ; LeftLast
21735 RET
21736
21737 INIT_XMM sse4
; intra_filter_16x16 (SSE4): smooths the 16-bit reference words ref[0..64] at r0 into r1 with
;   out[i] = (ref[i - 1] + 2 * ref[i] + ref[i + 1] + 2) >> 2
; Special cases visible below:
;   out[0]  uses ref[1] and ref[33] as its two neighbours,
;   out[33] uses ref[0] as its previous sample,
;   out[32] (topLast) and out[64] (LeftLast) are copied unfiltered.
; Eight words are produced per step; registers are recycled as a sliding window.
21738 cglobal intra_filter_16x16, 2,4,6
21739 mov r2w, word [r0 + 64] ; topLast
21740 mov r3w, word [r0 + 128] ; LeftLast
21741
21742 ; filtering top
21743 movu m0, [r0] ; ref[0..7]
21744 movu m1, [r0 + 16] ; ref[8..15]
21745 movu m2, [r0 + 32] ; ref[16..23]
21746
21747 pshufb m4, m0, [intra_filter4_shuf0] ; samples[i - 1]; lane 0 holds ref[1]
21748 palignr m5, m1, m0, 2 ; ref[1..8] = samples[i + 1]
21749 pinsrw m5, [r0 + 66], 0 ; lane 0 = ref[33], the second neighbour of ref[0]
21750
21751 palignr m3, m1, m0, 14 ; ref[7..14]
21752 psllw m0, 1
21753 paddw m4, m5
21754 paddw m0, m4
21755 paddw m0, [pw_2]
21756 psrlw m0, 2 ; out[0..7]
21757
21758 palignr m4, m2, m1, 2 ; ref[9..16]
21759 psllw m5, m1, 1
21760 paddw m4, m3
21761 paddw m5, m4
21762 paddw m5, [pw_2]
21763 psrlw m5, 2 ; out[8..15]
21764 movu [r1], m0
21765 movu [r1 + 16], m5
21766
21767 movu m0, [r0 + 48] ; ref[24..31]
21768 movu m5, [r0 + 64] ; ref[32..39]
21769
21770 palignr m3, m2, m1, 14 ; ref[15..22]
21771 palignr m4, m0, m2, 2 ; ref[17..24]
21772
21773 psllw m1, m2, 1
21774 paddw m3, m4
21775 paddw m1, m3
21776 paddw m1, [pw_2]
21777 psrlw m1, 2 ; out[16..23]
21778
21779 palignr m3, m0, m2, 14 ; ref[23..30]
21780 palignr m4, m5, m0, 2 ; ref[25..32]
21781
21782 psllw m0, 1
21783 paddw m4, m3
21784 paddw m0, m4
21785 paddw m0, [pw_2]
21786 psrlw m0, 2 ; out[24..31]
21787 movu [r1 + 32], m1
21788 movu [r1 + 48], m0
21789
21790 ; filtering left
21791 movu m1, [r0 + 80] ; ref[40..47]
21792 movu m2, [r0 + 96] ; ref[48..55]
21793
21794 palignr m4, m5, m5, 14 ; rotate m5 up one word: lanes = ref[39], ref[32..38]
21795 pinsrw m4, [r0], 1 ; lane 1 = ref[0], the previous sample for out[33]
21796 palignr m0, m1, m5, 2 ; ref[33..40]
21797
21798 psllw m3, m5, 1
21799 paddw m4, m0
21800 paddw m3, m4
21801 paddw m3, [pw_2]
21802 psrlw m3, 2 ; out[32..39]; lane 0 is replaced by topLast below
21803
21804 palignr m0, m1, m5, 14 ; ref[39..46]
21805 palignr m4, m2, m1, 2 ; ref[41..48]
21806
21807 psllw m5, m1, 1
21808 paddw m4, m0
21809 paddw m5, m4
21810 paddw m5, [pw_2]
21811 psrlw m5, 2 ; out[40..47]
21812 movu [r1 + 64], m3
21813 movu [r1 + 80], m5
21814
21815 movu m5, [r0 + 112] ; ref[56..63]
21816 movu m0, [r0 + 128] ; ref[64..71]
21817
21818 palignr m3, m2, m1, 14 ; ref[47..54]
21819 palignr m4, m5, m2, 2 ; ref[49..56]
21820
21821 psllw m1, m2, 1
21822 paddw m3, m4
21823 paddw m1, m3
21824 paddw m1, [pw_2]
21825 psrlw m1, 2 ; out[48..55]
21826
21827 palignr m3, m5, m2, 14 ; ref[55..62]
21828 palignr m4, m0, m5, 2 ; ref[57..64]
21829
21830 psllw m5, 1
21831 paddw m4, m3
21832 paddw m5, m4
21833 paddw m5, [pw_2]
21834 psrlw m5, 2 ; out[56..63]
21835 movu [r1 + 96], m1
21836 movu [r1 + 112], m5
21837
21838 mov [r1 + 64], r2w ; topLast
21839 mov [r1 + 128], r3w ; LeftLast
21840 RET
21841
21842 INIT_XMM sse4
; intra_filter_32x32 (SSE4): smooths the 16-bit reference words ref[0..128] at r0 into r1 with
;   out[i] = (ref[i - 1] + 2 * ref[i] + ref[i + 1] + 2) >> 2
; Special cases visible below:
;   out[0]   uses ref[1] and ref[65] as its two neighbours,
;   out[65]  uses ref[0] as its previous sample,
;   out[64]  (topLast) and out[128] (LeftLast) are copied unfiltered.
; Eight words are produced per step; registers are recycled as a sliding window.
; The "a to b" section comments give the output word range of each 16-word stage.
21843 cglobal intra_filter_32x32, 2,4,6
21844 mov r2w, word [r0 + 128] ; topLast
21845 mov r3w, word [r0 + 256] ; LeftLast
21846
21847 ; filtering top
21848 ; 0 to 15
21849 movu m0, [r0 + 0] ; ref[0..7]
21850 movu m1, [r0 + 16] ; ref[8..15]
21851 movu m2, [r0 + 32] ; ref[16..23]
21852
21853 pshufb m4, m0, [intra_filter4_shuf0] ; samples[i - 1]; lane 0 holds ref[1]
21854 palignr m5, m1, m0, 2 ; ref[1..8] = samples[i + 1]
21855 pinsrw m5, [r0 + 130], 0 ; lane 0 = ref[65], the second neighbour of ref[0]
21856
21857 palignr m3, m1, m0, 14 ; ref[7..14]
21858 psllw m0, 1
21859 paddw m4, m5
21860 paddw m0, m4
21861 paddw m0, [pw_2]
21862 psrlw m0, 2 ; out[0..7]
21863
21864 palignr m4, m2, m1, 2 ; ref[9..16]
21865 psllw m5, m1, 1
21866 paddw m4, m3
21867 paddw m5, m4
21868 paddw m5, [pw_2]
21869 psrlw m5, 2 ; out[8..15]
21870 movu [r1], m0
21871 movu [r1 + 16], m5
21872
21873 ; 16 to 31
21874 movu m0, [r0 + 48] ; ref[24..31]
21875 movu m5, [r0 + 64] ; ref[32..39]
21876
21877 palignr m3, m2, m1, 14 ; ref[15..22]
21878 palignr m4, m0, m2, 2 ; ref[17..24]
21879
21880 psllw m1, m2, 1
21881 paddw m3, m4
21882 paddw m1, m3
21883 paddw m1, [pw_2]
21884 psrlw m1, 2 ; out[16..23]
21885
21886 palignr m3, m0, m2, 14 ; ref[23..30]
21887 palignr m4, m5, m0, 2 ; ref[25..32]
21888
21889 psllw m2, m0, 1
21890 paddw m4, m3
21891 paddw m2, m4
21892 paddw m2, [pw_2]
21893 psrlw m2, 2 ; out[24..31]
21894 movu [r1 + 32], m1
21895 movu [r1 + 48], m2
21896
21897 ; 32 to 47
21898 movu m1, [r0 + 80] ; ref[40..47]
21899 movu m2, [r0 + 96] ; ref[48..55]
21900
21901 palignr m3, m5, m0, 14 ; ref[31..38]
21902 palignr m4, m1, m5, 2 ; ref[33..40]
21903
21904 psllw m0, m5, 1
21905 paddw m3, m4
21906 paddw m0, m3
21907 paddw m0, [pw_2]
21908 psrlw m0, 2 ; out[32..39]
21909
21910 palignr m3, m1, m5, 14 ; ref[39..46]
21911 palignr m4, m2, m1, 2 ; ref[41..48]
21912
21913 psllw m5, m1, 1
21914 paddw m4, m3
21915 paddw m5, m4
21916 paddw m5, [pw_2]
21917 psrlw m5, 2 ; out[40..47]
21918 movu [r1 + 64], m0
21919 movu [r1 + 80], m5
21920
21921 ; 48 to 63
21922 movu m0, [r0 + 112] ; ref[56..63]
21923 movu m5, [r0 + 128] ; ref[64..71]
21924
21925 palignr m3, m2, m1, 14 ; ref[47..54]
21926 palignr m4, m0, m2, 2 ; ref[49..56]
21927
21928 psllw m1, m2, 1
21929 paddw m3, m4
21930 paddw m1, m3
21931 paddw m1, [pw_2]
21932 psrlw m1, 2 ; out[48..55]
21933
21934 palignr m3, m0, m2, 14 ; ref[55..62]
21935 palignr m4, m5, m0, 2 ; ref[57..64]
21936
21937 psllw m0, 1
21938 paddw m4, m3
21939 paddw m0, m4
21940 paddw m0, [pw_2]
21941 psrlw m0, 2 ; out[56..63]
21942 movu [r1 + 96], m1
21943 movu [r1 + 112], m0
21944
21945 ; filtering left
21946 ; 64 to 79
21947 movu m1, [r0 + 144] ; ref[72..79]
21948 movu m2, [r0 + 160] ; ref[80..87]
21949
21950 palignr m4, m5, m5, 14 ; rotate m5 up one word: lanes = ref[71], ref[64..70]
21951 pinsrw m4, [r0], 1 ; lane 1 = ref[0], the previous sample for out[65]
21952 palignr m0, m1, m5, 2 ; ref[65..72]
21953
21954 psllw m3, m5, 1
21955 paddw m4, m0
21956 paddw m3, m4
21957 paddw m3, [pw_2]
21958 psrlw m3, 2 ; out[64..71]; lane 0 is replaced by topLast at the end
21959
21960 palignr m0, m1, m5, 14 ; ref[71..78]
21961 palignr m4, m2, m1, 2 ; ref[73..80]
21962
21963 psllw m5, m1, 1
21964 paddw m4, m0
21965 paddw m5, m4
21966 paddw m5, [pw_2]
21967 psrlw m5, 2 ; out[72..79]
21968 movu [r1 + 128], m3
21969 movu [r1 + 144], m5
21970
21971 ; 80 to 95
21972 movu m5, [r0 + 176] ; ref[88..95]
21973 movu m0, [r0 + 192] ; ref[96..103]
21974
21975 palignr m3, m2, m1, 14 ; ref[79..86]
21976 palignr m4, m5, m2, 2 ; ref[81..88]
21977
21978 psllw m1, m2, 1
21979 paddw m3, m4
21980 paddw m1, m3
21981 paddw m1, [pw_2]
21982 psrlw m1, 2 ; out[80..87]
21983
21984 palignr m3, m5, m2, 14 ; ref[87..94]
21985 palignr m4, m0, m5, 2 ; ref[89..96]
21986
21987 psllw m2, m5, 1
21988 paddw m4, m3
21989 paddw m2, m4
21990 paddw m2, [pw_2]
21991 psrlw m2, 2 ; out[88..95]
21992 movu [r1 + 160], m1
21993 movu [r1 + 176], m2
21994
21995 ; 96 to 111
21996 movu m1, [r0 + 208] ; ref[104..111]
21997 movu m2, [r0 + 224] ; ref[112..119]
21998
21999 palignr m3, m0, m5, 14 ; ref[95..102]
22000 palignr m4, m1, m0, 2 ; ref[97..104]
22001
22002 psllw m5, m0, 1
22003 paddw m3, m4
22004 paddw m5, m3
22005 paddw m5, [pw_2]
22006 psrlw m5, 2 ; out[96..103]
22007
22008 palignr m3, m1, m0, 14 ; ref[103..110]
22009 palignr m4, m2, m1, 2 ; ref[105..112]
22010
22011 psllw m0, m1, 1
22012 paddw m4, m3
22013 paddw m0, m4
22014 paddw m0, [pw_2]
22015 psrlw m0, 2 ; out[104..111]
22016 movu [r1 + 192], m5
22017 movu [r1 + 208], m0
22018
22019 ; 112 to 127
22020 movu m5, [r0 + 240] ; ref[120..127]
22021 movu m0, [r0 + 256] ; ref[128..135]
22022
22023 palignr m3, m2, m1, 14 ; ref[111..118]
22024 palignr m4, m5, m2, 2 ; ref[113..120]
22025
22026 psllw m1, m2, 1
22027 paddw m3, m4
22028 paddw m1, m3
22029 paddw m1, [pw_2]
22030 psrlw m1, 2 ; out[112..119]
22031
22032 palignr m3, m5, m2, 14 ; ref[119..126]
22033 palignr m4, m0, m5, 2 ; ref[121..128]
22034
22035 psllw m5, 1
22036 paddw m4, m3
22037 paddw m5, m4
22038 paddw m5, [pw_2]
22039 psrlw m5, 2 ; out[120..127]
22040 movu [r1 + 224], m1
22041 movu [r1 + 240], m5
22042
22043 mov [r1 + 128], r2w ; topLast
22044 mov [r1 + 256], r3w ; LeftLast
22045 RET
22046
22047 INIT_YMM avx2
; intra_filter_4x4 (AVX2): same smoothing as the SSE4 version, done in one 256-bit vector:
;   out[i] = (ref[i - 1] + 2 * ref[i] + ref[i + 1] + 2) >> 2  for the 16-bit words ref[0..15],
; with out[8] (topLast) and out[16] (LeftLast) copied unfiltered afterwards.
; palignr/pshufb act on each 128-bit lane separately; the bracketed comments list the
; low lane first, then the high lane, each written highest word first.
22048 cglobal intra_filter_4x4, 2,4,4
22049 mov r2w, word [r0 + 16] ; topLast
22050 mov r3w, word [r0 + 32] ; LeftLast
22051
22052 ; filtering top
22053 movu m0, [r0] ; ref[0..15]
22054 vpbroadcastw m2, xm0 ; ref[0] in every word
22055 movu m1, [r0 + 16] ; ref[8..23]
22056
22057 palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
22058 pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
22059 palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10]
22060 palignr m1, m1, 14 ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1]
22061
22062 psllw m0, 1 ; 2 * ref[i]
22063 paddw m3, m1
22064 paddw m0, m3
22065 paddw m0, [pw_2] ; rounding term
22066 psrlw m0, 2 ; out[0..15]; word 8 is replaced by topLast below
22067
22068 movu [r1], m0
22069 mov [r1 + 16], r2w ; topLast
22070 mov [r1 + 32], r3w ; LeftLast
22071 RET