Mercurial > hg > forks > libbpg
diff x265/source/common/x86/intrapred16.asm @ 0:772086c29cc7
Initial import.
author: Matti Hamalainen <ccr@tnsp.org>
date: Wed, 16 Nov 2016 11:16:33 +0200
parents: (none)
children: (none)
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x265/source/common/x86/intrapred16.asm Wed Nov 16 11:16:33 2016 +0200 @@ -0,0 +1,22071 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com> +;* Yuvaraj Venkatesh <yuvaraj@multicorewareinc.com> +;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +const ang_table +%assign x 0 +%rep 32 + times 4 dw (32-x), x +%assign x x+1 +%endrep + +const ang_table_avx2 +%assign x 0 +%rep 32 + times 8 dw (32-x), x +%assign x x+1 +%endrep + +const pw_ang16_12_24, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 14, 15, 0, 1, 0, 1 +const pw_ang16_13_23, db 2, 3, 2, 3, 14, 15, 14, 15, 6, 7, 6, 7, 0, 1, 0, 1 +const pw_ang16_14_22, db 2, 3, 2, 3, 10, 11, 10, 11, 6, 7, 6, 7, 0, 1, 0, 1 +const pw_ang16_15_21, db 12, 13, 12, 13, 8, 9, 8, 9, 4, 5, 4, 5, 0, 1, 0, 1 +const pw_ang16_16_20, db 8, 9, 8, 9, 6, 7, 6, 7, 2, 3, 2, 3, 0, 1, 0, 1 + +const pw_ang32_12_24, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 +const pw_ang32_13_23, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 6, 7, 0, 1 +const pw_ang32_14_22, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 11, 6, 7, 0, 1 +const pw_ang32_15_21, db 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 8, 9, 4, 5, 0, 1 +const pw_ang32_16_20, db 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 6, 7, 2, 3, 0, 1 +const pw_ang32_17_19_0, db 0, 0, 0, 0, 12, 13, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1 + +const shuf_mode_13_23, db 0, 0, 14, 15, 6, 7, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 +const shuf_mode_14_22, db 14, 15, 10, 11, 4, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 +const shuf_mode_15_21, db 12, 13, 8, 9, 4, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 +const shuf_mode_16_20, db 2, 3, 0, 1, 14, 15, 12, 13, 8, 9, 6, 7, 2, 3, 0, 1 +const shuf_mode_17_19, db 0, 1, 14, 15, 12, 13, 10, 11, 6, 7, 4, 5, 2, 3, 0, 1 +const shuf_mode32_18, db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +const pw_punpcklwd, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +const c_mode32_10_0, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + +const pw_ang8_12, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1 +const pw_ang8_13, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1 +const pw_ang8_14, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1 
+const pw_ang8_15, db 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 8, 9, 4, 5, 0, 1 +const pw_ang8_16, db 0, 0, 0, 0, 0, 0, 12, 13, 10, 11, 6, 7, 4, 5, 0, 1 +const pw_ang8_17, db 0, 0, 14, 15, 12, 13, 10, 11, 8, 9, 4, 5, 2, 3, 0, 1 +const pw_swap16, times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 + +const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1 + +intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 +intra_filter4_shuf1: db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 +intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +;; (blkSize - 1 - x) +pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0 + +const planar32_table +%assign x 31 +%rep 8 + dd x, x-1, x-2, x-3 +%assign x x-4 +%endrep + +const planar32_table1 +%assign x 1 +%rep 8 + dd x, x+1, x+2, x+3 +%assign x x+4 +%endrep + +SECTION .text + +cextern pb_01 +cextern pw_1 +cextern pw_2 +cextern pw_3 +cextern pw_7 +cextern pw_4 +cextern pw_8 +cextern pw_15 +cextern pw_16 +cextern pw_31 +cextern pw_32 +cextern pd_16 +cextern pd_31 +cextern pd_32 +cextern pw_4096 +cextern pw_pixel_max +cextern multiL +cextern multiH +cextern multiH2 +cextern multiH3 +cextern multi_2Row +cextern pw_swap +cextern pb_unpackwq1 +cextern pb_unpackwq2 +cextern pw_planar16_mul +cextern pw_planar32_mul + +;----------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) +;----------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc4, 5,6,2 + movh m0, [r2 + 18] ; sumAbove + movh m1, [r2 + 2] ; sumLeft + + paddw m0, m1 + pshuflw m1, m0, 0x4E + paddw m0, m1 + pshuflw m1, m0, 0xB1 + paddw m0, m1 + + test r4d, r4d + + paddw m0, [pw_4] + psrlw m0, 3 + + ; store DC 4x4 + movh [r0], m0 + movh [r0 + r1 * 2], m0 + movh [r0 + r1 * 4], m0 + 
lea r5, [r0 + r1 * 4] + movh [r5 + r1 * 2], m0 + + ; do DC filter + jz .end + movh m1, m0 + psllw m1, 1 + paddw m1, [pw_2] + movd r3d, m1 + paddw m0, m1 + ; filter top + movh m1, [r2 + 2] + paddw m1, m0 + psrlw m1, 2 + movh [r0], m1 ; overwrite top-left pixel, we will update it later + + ; filter top-left + movzx r3d, r3w + movzx r4d, word [r2 + 18] + add r3d, r4d + movzx r4d, word [r2 + 2] + add r4d, r3d + shr r4d, 2 + mov [r0], r4w + + ; filter left + movu m1, [r2 + 20] + paddw m1, m0 + psrlw m1, 2 + movd r3d, m1 + mov [r0 + r1 * 2], r3w + shr r3d, 16 + mov [r0 + r1 * 4], r3w + pextrw r3d, m1, 2 + mov [r5 + r1 * 2], r3w +.end: + RET + +;----------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) +;----------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc8, 5, 8, 2 + movu m0, [r2 + 34] + movu m1, [r2 + 2] + + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + pshufd m1, m0, 1 + paddw m0, m1 + pmaddwd m0, [pw_1] + + paddw m0, [pw_8] + psrlw m0, 4 ; sum = sum / 16 + pshuflw m0, m0, 0 + pshufd m0, m0, 0 ; m0 = word [dc_val ...] 
+ + test r4d, r4d + + ; store DC 8x8 + lea r6, [r1 + r1 * 4] + lea r6, [r6 + r1] + lea r5, [r6 + r1 * 4] + lea r7, [r6 + r1 * 8] + movu [r0], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 4], m0 + movu [r0 + r6], m0 + movu [r0 + r1 * 8], m0 + movu [r0 + r5], m0 + movu [r0 + r6 * 2], m0 + movu [r0 + r7], m0 + + ; Do DC Filter + jz .end + mova m1, [pw_2] + pmullw m1, m0 + paddw m1, [pw_2] + movd r4d, m1 ; r4d = DC * 2 + 2 + paddw m1, m0 ; m1 = DC * 3 + 2 + pshuflw m1, m1, 0 + pshufd m1, m1, 0 ; m1 = pixDCx3 + + ; filter top + movu m0, [r2 + 2] + paddw m0, m1 + psrlw m0, 2 + movu [r0], m0 + + ; filter top-left + movzx r4d, r4w + movzx r3d, word [r2 + 34] + add r4d, r3d + movzx r3d, word [r2 + 2] + add r3d, r4d + shr r3d, 2 + mov [r0], r3w + + ; filter left + movu m0, [r2 + 36] + paddw m0, m1 + psrlw m0, 2 + movh r3, m0 + mov [r0 + r1 * 2], r3w + shr r3, 16 + mov [r0 + r1 * 4], r3w + shr r3, 16 + mov [r0 + r6], r3w + shr r3, 16 + mov [r0 + r1 * 8], r3w + pshufd m0, m0, 0x6E + movh r3, m0 + mov [r0 + r5], r3w + shr r3, 16 + mov [r0 + r6 * 2], r3w + shr r3, 16 + mov [r0 + r7], r3w +.end: + RET + +;------------------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;------------------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc16, 5, 10, 4 + lea r3, [r2 + 66] + add r1, r1 + movu m0, [r3] + movu m1, [r3 + 16] + movu m2, [r2 + 2] + movu m3, [r2 + 18] + + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + HADDUW m0, m1 + paddd m0, [pd_16] + psrld m0, 5 + + movd r5d, m0 + pshuflw m0, m0, 0 ; m0 = word [dc_val ...] 
+ pshufd m0, m0, 0 + + test r4d, r4d + + ; store DC 16x16 + lea r6, [r1 + r1 * 2] ;index 3 + lea r7, [r1 + r1 * 4] ;index 5 + lea r8, [r6 + r1 * 4] ;index 7 + lea r9, [r0 + r8] ;base + 7 + movu [r0], m0 + movu [r0 + 16], m0 + movu [r0 + r1], m0 + movu [r0 + 16 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 16], m0 + movu [r0 + r6], m0 + movu [r0 + r6 + 16], m0 + movu [r0 + r1 * 4], m0 + movu [r0 + r1 * 4 + 16], m0 + movu [r0 + r7], m0 + movu [r0 + r7 + 16], m0 + movu [r0 + r6 * 2], m0 + movu [r0 + r6 * 2 + 16], m0 + movu [r9], m0 + movu [r9 + 16], m0 + movu [r0 + r1 * 8], m0 + movu [r0 + r1 * 8 + 16], m0 + movu [r9 + r1 * 2], m0 + movu [r9 + r1 * 2 + 16], m0 + movu [r0 + r7 * 2], m0 + movu [r0 + r7 * 2 + 16], m0 + movu [r9 + r1 * 4], m0 + movu [r9 + r1 * 4 + 16], m0 + movu [r0 + r6 * 4], m0 + movu [r0 + r6 * 4 + 16], m0 + movu [r9 + r6 * 2], m0 + movu [r9 + r6 * 2 + 16], m0 + movu [r9 + r8], m0 + movu [r9 + r8 + 16], m0 + movu [r9 + r1 * 8], m0 + movu [r9 + r1 * 8 + 16], m0 + + ; Do DC Filter + jz .end + mova m1, [pw_2] + pmullw m1, m0 + paddw m1, [pw_2] + movd r4d, m1 + paddw m1, m0 + + ; filter top + movu m2, [r2 + 2] + paddw m2, m1 + psrlw m2, 2 + movu [r0], m2 + movu m3, [r2 + 18] + paddw m3, m1 + psrlw m3, 2 + movu [r0 + 16], m3 + + ; filter top-left + movzx r4d, r4w + movzx r5d, word [r3] + add r4d, r5d + movzx r5d, word [r2 + 2] + add r5d, r4d + shr r5d, 2 + mov [r0], r5w + + ; filter left + movu m2, [r3 + 2] + paddw m2, m1 + psrlw m2, 2 + + movq r2, m2 + pshufd m2, m2, 0xEE + mov [r0 + r1], r2w + shr r2, 16 + mov [r0 + r1 * 2], r2w + shr r2, 16 + mov [r0 + r6], r2w + shr r2, 16 + mov [r0 + r1 * 4], r2w + movq r2, m2 + mov [r0 + r7], r2w + shr r2, 16 + mov [r0 + r6 * 2], r2w + shr r2, 16 + mov [r9], r2w + shr r2, 16 + mov [r0 + r1 * 8], r2w + + movu m3, [r3 + 18] + paddw m3, m1 + psrlw m3, 2 + + movq r3, m3 + pshufd m3, m3, 0xEE + mov [r9 + r1 * 2], r3w + shr r3, 16 + mov [r0 + r7 * 2], r3w + shr r3, 16 + mov [r9 + r1 * 4], r3w + shr r3, 16 + mov 
[r0 + r6 * 4], r3w + movq r3, m3 + mov [r9 + r6 * 2], r3w + shr r3, 16 + mov [r9 + r8], r3w + shr r3, 16 + mov [r9 + r1 * 8], r3w +.end: + RET + +;------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter) +;------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc32, 3, 4, 6 + lea r3, [r2 + 130] ;130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel) + add r2, 2 + add r1, r1 + movu m0, [r3] + movu m1, [r3 + 16] + movu m2, [r3 + 32] + movu m3, [r3 + 48] + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + HADDUWD m0, m1 + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r2 + 32] + movu m4, [r2 + 48] + paddw m1, m2 + paddw m3, m4 + paddw m1, m3 + HADDUWD m1, m2 + + paddd m0, m1 + HADDD m0, m1 + paddd m0, [pd_32] ; sum = sum + 32 + psrld m0, 6 ; sum = sum / 64 + pshuflw m0, m0, 0 + pshufd m0, m0, 0 + + lea r2, [r1 * 3] + ; store DC 32x32 +%assign x 1 +%rep 8 + movu [r0 + 0], m0 + movu [r0 + 16], m0 + movu [r0 + 32], m0 + movu [r0 + 48], m0 + movu [r0 + r1 + 0], m0 + movu [r0 + r1 + 16], m0 + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + 16], m0 + movu [r0 + r1 * 2 + 32], m0 + movu [r0 + r1 * 2 + 48], m0 + movu [r0 + r2 + 0], m0 + movu [r0 + r2 + 16], m0 + movu [r0 + r2 + 32], m0 + movu [r0 + r2 + 48], m0 + %if x < 8 + lea r0, [r0 + r1 * 4] + %endif +%assign x x + 1 +%endrep + RET + +;------------------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal intra_pred_dc16, 3, 9, 4 + mov r3d, r4m + add r1d, r1d + movu m0, [r2 + 66] + movu m2, [r2 + 2] + paddw m0, m2 ; dynamic 
range 13 bits + + vextracti128 xm1, m0, 1 + paddw xm0, xm1 ; dynamic range 14 bits + movhlps xm1, xm0 + paddw xm0, xm1 ; dynamic range 15 bits + pmaddwd xm0, [pw_1] + phaddd xm0, xm0 + paddd xm0, [pd_16] + psrld xm0, 5 + movd r5d, xm0 + vpbroadcastw m0, xm0 + + test r3d, r3d + + ; store DC 16x16 + lea r6, [r1 + r1 * 2] ; index 3 + lea r7, [r1 + r1 * 4] ; index 5 + lea r8, [r6 + r1 * 4] ; index 7 + lea r4, [r0 + r8 * 1] ; base + 7 + + movu [r0], m0 + movu [r0 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r6], m0 + movu [r0 + r1 * 4], m0 + movu [r0 + r7], m0 + movu [r0 + r6 * 2], m0 + movu [r4], m0 + movu [r0 + r1 * 8], m0 + movu [r4 + r1 * 2], m0 + movu [r0 + r7 * 2], m0 + movu [r4 + r1 * 4], m0 + movu [r0 + r6 * 4], m0 + movu [r4 + r6 * 2], m0 + movu [r4 + r8], m0 + movu [r4 + r1 * 8], m0 + + ; Do DC Filter + jz .end + mova m1, [pw_2] + pmullw m1, m0 + paddw m1, [pw_2] + movd r3d, xm1 + paddw m1, m0 + + ; filter top + movu m2, [r2 + 2] + paddw m2, m1 + psrlw m2, 2 + movu [r0], m2 + + ; filter top-left + movzx r3d, r3w + movzx r5d, word [r2 + 66] + add r3d, r5d + movzx r5d, word [r2 + 2] + add r5d, r3d + shr r5d, 2 + mov [r0], r5w + + ; filter left + movu m2, [r2 + 68] + paddw m2, m1 + psrlw m2, 2 + vextracti128 xm3, m2, 1 + + movq r3, xm2 + pshufd xm2, xm2, 0xEE + mov [r0 + r1], r3w + shr r3, 16 + mov [r0 + r1 * 2], r3w + shr r3, 16 + mov [r0 + r6], r3w + shr r3, 16 + mov [r0 + r1 * 4], r3w + movq r3, xm2 + mov [r0 + r7], r3w + shr r3, 16 + mov [r0 + r6 * 2], r3w + shr r3, 16 + mov [r4], r3w + shr r3, 16 + mov [r0 + r1 * 8], r3w + + movq r3, xm3 + pshufd xm3, xm3, 0xEE + mov [r4 + r1 * 2], r3w + shr r3, 16 + mov [r0 + r7 * 2], r3w + shr r3, 16 + mov [r4 + r1 * 4], r3w + shr r3, 16 + mov [r0 + r6 * 4], r3w + movq r3, xm3 + mov [r4 + r6 * 2], r3w + shr r3, 16 + mov [r4 + r8], r3w + shr r3, 16 + mov [r4 + r1 * 8], r3w +.end: + RET + +;--------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, 
intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) +;--------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal intra_pred_dc32, 3,3,3 + add r2, 2 + add r1d, r1d + movu m0, [r2] + movu m1, [r2 + 32] + add r2, mmsize*4 ; r2 += 128 + paddw m0, m1 ; dynamic range 13 bits + movu m1, [r2] + movu m2, [r2 + 32] + paddw m1, m2 ; dynamic range 13 bits + paddw m0, m1 ; dynamic range 14 bits + vextracti128 xm1, m0, 1 + paddw xm0, xm1 ; dynamic range 15 bits + pmaddwd xm0, [pw_1] + movhlps xm1, xm0 + paddd xm0, xm1 + phaddd xm0, xm0 + paddd xm0, [pd_32] ; sum = sum + 32 + psrld xm0, 6 ; sum = sum / 64 + vpbroadcastw m0, xm0 + + lea r2, [r1 * 3] + ; store DC 32x32 + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 
+ r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0 + 0], m0 + movu [r0 + r1 * 0 + mmsize], m0 + movu [r0 + r1 * 1 + 0], m0 + movu [r0 + r1 * 1 + mmsize], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + mmsize], m0 + movu [r0 + r2 * 1 + 0], m0 + movu [r0 + r2 * 1 + mmsize], m0 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar8, 3,3,5 + movu m1, [r2 + 2] + movu m2, [r2 + 34] + + movd m3, [r2 + 18] ; topRight = above[8]; + movd m4, [r2 + 50] ; bottomLeft = left[8]; + + pshuflw m3, m3, 0 + pshuflw m4, m4, 0 + pshufd m3, m3, 0 ; v_topRight + pshufd m4, m4, 0 ; v_bottomLeft + + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x] + paddw m3, [pw_8] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 + +%macro INTRA_PRED_PLANAR_8 1 +%if (%1 < 4) + pshuflw m1, m2, 0x55 * %1 + pshufd m1, m1, 0 +%else + pshufhw m1, m2, 0x55 * (%1 - 4) + pshufd m1, m1, 0xAA +%endif + pmullw m1, [pw_planar16_mul + mmsize] + paddw m1, m3 + psraw m1, 4 + movu [r0], m1 +%if (%1 < 7) + paddw m3, m4 + lea r0, [r0 + r1 * 2] +%endif +%endmacro + + INTRA_PRED_PLANAR_8 0 + 
INTRA_PRED_PLANAR_8 1 + INTRA_PRED_PLANAR_8 2 + INTRA_PRED_PLANAR_8 3 + INTRA_PRED_PLANAR_8 4 + INTRA_PRED_PLANAR_8 5 + INTRA_PRED_PLANAR_8 6 + INTRA_PRED_PLANAR_8 7 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar16, 3,3,8 + movu m2, [r2 + 2] + movu m7, [r2 + 18] + + movd m3, [r2 + 34] ; topRight = above[16] + movd m6, [r2 + 98] ; bottomLeft = left[16] + + pshuflw m3, m3, 0 + pshuflw m6, m6, 0 + pshufd m3, m3, 0 ; v_topRight + pshufd m6, m6, 0 ; v_bottomLeft + + pmullw m4, m3, [multiH] ; (x + 1) * topRight + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x] + pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x] + paddw m4, [pw_16] + paddw m3, [pw_16] + paddw m4, m6 + paddw m3, m6 + paddw m4, m5 + paddw m3, m1 + psubw m1, m6, m7 + psubw m6, m2 + + movu m2, [r2 + 66] + movu m7, [r2 + 82] + +%macro INTRA_PRED_PLANAR_16 1 +%if (%1 < 4) + pshuflw m5, m2, 0x55 * %1 + pshufd m5, m5, 0 +%else +%if (%1 < 8) + pshufhw m5, m2, 0x55 * (%1 - 4) + pshufd m5, m5, 0xAA +%else +%if (%1 < 12) + pshuflw m5, m7, 0x55 * (%1 - 8) + pshufd m5, m5, 0 +%else + pshufhw m5, m7, 0x55 * (%1 - 12) + pshufd m5, m5, 0xAA +%endif +%endif +%endif +%if (%1 > 0) + paddw m3, m6 + paddw m4, m1 + lea r0, [r0 + r1 * 2] +%endif + pmullw m0, m5, [pw_planar16_mul + mmsize] + pmullw m5, [pw_planar16_mul] + paddw m0, m4 + paddw m5, m3 + psraw m5, 5 + psraw m0, 5 + movu [r0], m5 + movu [r0 + 16], m0 +%endmacro + + INTRA_PRED_PLANAR_16 0 + INTRA_PRED_PLANAR_16 1 + INTRA_PRED_PLANAR_16 2 + INTRA_PRED_PLANAR_16 3 + INTRA_PRED_PLANAR_16 4 + INTRA_PRED_PLANAR_16 5 + INTRA_PRED_PLANAR_16 6 + INTRA_PRED_PLANAR_16 7 + INTRA_PRED_PLANAR_16 8 + INTRA_PRED_PLANAR_16 9 + INTRA_PRED_PLANAR_16 10 + 
INTRA_PRED_PLANAR_16 11 + INTRA_PRED_PLANAR_16 12 + INTRA_PRED_PLANAR_16 13 + INTRA_PRED_PLANAR_16 14 + INTRA_PRED_PLANAR_16 15 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar32, 3,3,16 + movd m3, [r2 + 66] ; topRight = above[32] + + pshuflw m3, m3, 0x00 + pshufd m3, m3, 0x44 + + pmullw m0, m3, [multiL] ; (x + 1) * topRight + pmullw m1, m3, [multiH] ; (x + 1) * topRight + pmullw m2, m3, [multiH2] ; (x + 1) * topRight + pmullw m3, [multiH3] ; (x + 1) * topRight + + movd m6, [r2 + 194] ; bottomLeft = left[32] + pshuflw m6, m6, 0x00 + pshufd m6, m6, 0x44 + mova m5, m6 + paddw m5, [pw_32] + + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + mova m8, m6 + mova m9, m6 + mova m10, m6 + + mova m12, [pw_31] + movu m4, [r2 + 2] + psubw m8, m4 + pmullw m4, m12 + paddw m0, m4 + + movu m5, [r2 + 18] + psubw m9, m5 + pmullw m5, m12 + paddw m1, m5 + + movu m4, [r2 + 34] + psubw m10, m4 + pmullw m4, m12 + paddw m2, m4 + + movu m5, [r2 + 50] + psubw m6, m5 + pmullw m5, m12 + paddw m3, m5 + + mova m12, [pw_planar32_mul] + mova m13, [pw_planar32_mul + mmsize] + mova m14, [pw_planar16_mul] + mova m15, [pw_planar16_mul + mmsize] + add r1, r1 + +%macro PROCESS 1 + pmullw m5, %1, m12 + pmullw m11, %1, m13 + paddw m5, m0 + paddw m11, m1 + psrlw m5, 6 + psrlw m11, 6 + movu [r0], m5 + movu [r0 + 16], m11 + + pmullw m5, %1, m14 + pmullw %1, m15 + paddw m5, m2 + paddw %1, m3 + psrlw m5, 6 + psrlw %1, 6 + movu [r0 + 32], m5 + movu [r0 + 48], %1 +%endmacro + +%macro INCREMENT 0 + paddw m2, m10 + paddw m3, m6 + paddw m0, m8 + paddw m1, m9 + add r0, r1 +%endmacro + + add r2, 130 ;130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel) +%assign x 0 +%rep 4 + movu m4, [r2] + add r2, 16 +%assign y 0 +%rep 8 + %if y < 4 + pshuflw m7, 
m4, 0x55 * y + pshufd m7, m7, 0x44 + %else + pshufhw m7, m4, 0x55 * (y - 4) + pshufd m7, m7, 0xEE + %endif + PROCESS m7 + %if x + y < 10 + INCREMENT + %endif +%assign y y+1 +%endrep +%assign x x+1 +%endrep + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal intra_pred_planar32, 3,3,8 + movu m1, [r2 + 2] + movu m4, [r2 + 34] + lea r2, [r2 + 66] + vpbroadcastw m3, [r2] ; topRight = above[32] + pmullw m0, m3, [multiL] ; (x + 1) * topRight + pmullw m2, m3, [multiH2] ; (x + 1) * topRight + vpbroadcastw m6, [r2 + 128] ; bottomLeft = left[32] + mova m5, m6 + paddw m5, [pw_32] + + paddw m0, m5 + paddw m2, m5 + mova m5, m6 + psubw m3, m6, m1 + pmullw m1, [pw_31] + paddw m0, m1 + psubw m5, m4 + pmullw m4, [pw_31] + paddw m2, m4 + + mova m6, [pw_planar32_mul] + mova m4, [pw_planar16_mul] + add r1, r1 + +%macro PROCESS_AVX2 1 + vpbroadcastw m7, [r2 + %1 * 2] + pmullw m1, m7, m6 + pmullw m7, m4 + paddw m1, m0 + paddw m7, m2 + psrlw m1, 6 + psrlw m7, 6 + movu [r0], m1 + movu [r0 + mmsize], m7 +%endmacro + +%macro INCREMENT_AVX2 0 + paddw m2, m5 + paddw m0, m3 + add r0, r1 +%endmacro + + add r2, mmsize*2 +%assign x 0 +%rep 4 +%assign y 0 +%rep 8 + PROCESS_AVX2 y +%if x + y < 10 + INCREMENT_AVX2 +%endif +%assign y y+1 +%endrep +lea r2, [r2 + 16] +%assign x x+1 +%endrep + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal intra_pred_planar16, 3,3,4 + add r1d, r1d + vpbroadcastw m3, [r2 + 34] + vpbroadcastw m4, [r2 + 98] + mova m0, [pw_planar16_mul] + movu m2, [r2 + 2] + + pmullw m3, [multiL] ; 
(x + 1) * topRight + pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x] + paddw m3, [pw_16] + paddw m3, m4 + paddw m3, m1 + psubw m4, m2 + add r2, 66 + +%macro INTRA_PRED_PLANAR16_AVX2 1 + vpbroadcastw m1, [r2 + %1] + vpbroadcastw m2, [r2 + %1 + 2] + + pmullw m1, m0 + pmullw m2, m0 + paddw m1, m3 + paddw m3, m4 + psraw m1, 5 + paddw m2, m3 + psraw m2, 5 + paddw m3, m4 + movu [r0], m1 + movu [r0 + r1], m2 +%if %1 <= 24 + lea r0, [r0 + r1 * 2] +%endif +%endmacro + INTRA_PRED_PLANAR16_AVX2 0 + INTRA_PRED_PLANAR16_AVX2 4 + INTRA_PRED_PLANAR16_AVX2 8 + INTRA_PRED_PLANAR16_AVX2 12 + INTRA_PRED_PLANAR16_AVX2 16 + INTRA_PRED_PLANAR16_AVX2 20 + INTRA_PRED_PLANAR16_AVX2 24 + INTRA_PRED_PLANAR16_AVX2 28 +%undef INTRA_PRED_PLANAR16_AVX2 + RET + +%macro TRANSPOSE_4x4 0 + punpckhwd m0, m1, m3 + punpcklwd m1, m3 + punpckhwd m3, m1, m0 + punpcklwd m1, m0 +%endmacro + +%macro STORE_4x4 0 + add r1, r1 + movh [r0], m1 + movhps [r0 + r1], m1 + movh [r0 + r1 * 2], m3 + lea r1, [r1 * 3] + movhps [r0 + r1], m3 +%endmacro + +%macro CALC_4x4 4 + mova m0, [pd_16] + pmaddwd m1, [ang_table + %1 * 16] + paddd m1, m0 + psrld m1, 5 + + pmaddwd m2, [ang_table + %2 * 16] + paddd m2, m0 + psrld m2, 5 + packssdw m1, m2 + + pmaddwd m3, [ang_table + %3 * 16] + paddd m3, m0 + psrld m3, 5 + + pmaddwd m4, [ang_table + %4 * 16] + paddd m4, m0 + psrld m4, 5 + packssdw m3, m4 +%endmacro + +;----------------------------------------------------------------------------------------- +; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_ang4_2, 3,5,4 + lea r4, [r2 + 4] + add r2, 20 + cmp r3m, byte 34 + cmove r2, r4 + + add r1, r1 + movu m0, [r2] + movh [r0], m0 + psrldq m0, 2 + movh [r0 + r1], m0 + psrldq m0, 2 + movh [r0 + r1 * 2], m0 + lea r1, [r1 * 3] + psrldq m0, 2 + movh [r0 + r1], m0 + RET + +cglobal intra_pred_ang4_3, 3,3,5 + movu m0, [r2 + 
18] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m0 + psrldq m0, 2 + punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2] + mova m3, m0 + psrldq m0, 2 + punpcklwd m3, m0 ;[7 6 6 5 5 4 4 3] + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[8 7 7 6 6 5 5 4] + + CALC_4x4 26, 20, 14, 8 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_33, 3,3,5 + movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m0 + psrldq m0, 2 + punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2] + mova m3, m0 + psrldq m0, 2 + punpcklwd m3, m0 ;[7 6 6 5 5 4 4 3] + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[8 7 7 6 6 5 5 4] + + CALC_4x4 26, 20, 14, 8 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_4, 3,3,5 + movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m0 + psrldq m0, 2 + punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2] + mova m3, m2 + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3] + + CALC_4x4 21, 10, 31, 20 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_6, 3,3,5 + movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m0 + psrldq m0, 2 + punpcklwd m3, m0 ;[6 5 5 4 4 3 3 2] + mova m4, m3 + + CALC_4x4 13, 26, 7, 20 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_7, 3,3,5 + movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m1 + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[6 5 5 4 4 3 3 2] + + CALC_4x4 9, 18, 27, 4 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_8, 3,3,5 + movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 5, 10, 15, 20 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_9, 3,3,5 + 
movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 2, 4, 6, 8 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_10, 3,3,3 + movh m0, [r2 + 18] ;[4 3 2 1] + + punpcklwd m0, m0 ;[4 4 3 3 2 2 1 1] + pshufd m1, m0, 0xFA + add r1d, r1d + pshufd m0, m0, 0x50 + movhps [r0 + r1], m0 + movh [r0 + r1 * 2], m1 + lea r1d, [r1 * 3] + movhps [r0 + r1], m1 + + cmp r4m, byte 0 + jz .quit + + ; filter + movd m2, [r2] ;[7 6 5 4 3 2 1 0] + pshuflw m2, m2, 0x00 + movh m1, [r2 + 2] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] +.quit: + movh [r0], m0 + RET + +cglobal intra_pred_ang4_11, 3,3,5 + movh m0, [r2 + 18] ;[x x x 4 3 2 1 0] + movh m1, [r2 - 6] + punpcklqdq m1, m0 + psrldq m1, 6 + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 30, 28, 26, 24 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_12, 3,3,5 + movh m0, [r2 + 18] + movh m1, [r2 - 6] + punpcklqdq m1, m0 + psrldq m1, 6 + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 27, 22, 17, 12 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_13, 3,3,5 + movd m4, [r2 + 6] + movd m1, [r2 - 2] + movh m0, [r2 + 18] + punpcklwd m4, m1 + punpcklqdq m4, m0 + psrldq m4, 4 + mova m1, m4 + psrldq m1, 2 + punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m2, m1 + mova m3, m1 + + CALC_4x4 23, 14, 5, 28 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_14, 3,3,5 + movd m4, [r2 + 2] + movd m1, [r2 - 2] + movh m0, [r2 + 18] + punpcklwd m4, m1 + punpcklqdq m4, m0 + psrldq m4, 4 + mova m1, m4 + psrldq m1, 2 + punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m2, m1 + mova m3, m4 + + CALC_4x4 19, 6, 25, 12 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_15, 3,3,5 + 
movd m3, [r2] ;[x x x A] + movh m4, [r2 + 4] ;[x C x B] + movh m0, [r2 + 18] ;[4 3 2 1] + pshuflw m4, m4, 0x22 ;[B C B C] + punpcklqdq m4, m3 ;[x x x A B C B C] + psrldq m4, 2 ;[x x x x A B C B] + punpcklqdq m4, m0 + psrldq m4, 2 + mova m1, m4 + mova m2, m4 + psrldq m1, 4 + psrldq m2, 2 + punpcklwd m4, m2 ;[2 1 1 0 0 x x y] + punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m3, m2 + + CALC_4x4 15, 30, 13, 28 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_16, 3,3,5 + movd m3, [r2] ;[x x x A] + movd m4, [r2 + 4] ;[x x C B] + movh m0, [r2 + 18] ;[4 3 2 1] + punpcklwd m4, m3 ;[x C A B] + pshuflw m4, m4, 0x4A ;[A B C C] + punpcklqdq m4, m0 ;[4 3 2 1 A B C C] + psrldq m4, 2 + mova m1, m4 + mova m2, m4 + psrldq m1, 4 + psrldq m2, 2 + punpcklwd m4, m2 ;[2 1 1 0 0 x x y] + punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m3, m2 + + CALC_4x4 11, 22, 1, 12 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_17, 3,3,5 + movd m3, [r2] + movh m4, [r2 + 2] ;[D x C B] + pshuflw m4, m4, 0x1F ;[B C D D] + punpcklqdq m4, m3 ;[x x x A B C D D] + psrldq m4, 2 ;[x x x x A B C D] + movhps m4, [r2 + 18] + + mova m3, m4 + psrldq m3, 2 + punpcklwd m4, m3 + mova m2, m3 + psrldq m2, 2 + punpcklwd m3, m2 + mova m1, m2 + psrldq m1, 2 + punpcklwd m2, m1 + mova m0, m1 + psrldq m0, 2 + punpcklwd m1, m0 + + CALC_4x4 6, 12, 18, 24 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_18, 3,3,1 + movh m0, [r2 + 16] + pinsrw m0, [r2], 0 + pshuflw m0, m0, q0123 + movhps m0, [r2 + 2] + add r1, r1 + lea r2, [r1 * 3] + movh [r0 + r2], m0 + psrldq m0, 2 + movh [r0 + r1 * 2], m0 + psrldq m0, 2 + movh [r0 + r1], m0 + psrldq m0, 2 + movh [r0], m0 + RET + + cglobal intra_pred_ang4_19, 3,3,5 + movd m3, [r2] + movh m4, [r2 + 18] ;[D x C B] + pshuflw m4, m4, 0x1F ;[B C D D] + punpcklqdq m4, m3 ;[x x x A B C D D] + psrldq m4, 2 ;[x x x x A B C D] + movhps m4, [r2 + 2] + + mova m3, m4 + psrldq m3, 2 + punpcklwd 
m4, m3 + mova m2, m3 + psrldq m2, 2 + punpcklwd m3, m2 + mova m1, m2 + psrldq m1, 2 + punpcklwd m2, m1 + mova m0, m1 + psrldq m0, 2 + punpcklwd m1, m0 + + CALC_4x4 6, 12, 18, 24 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_20, 3,3,5 + movd m3, [r2] ;[x x x A] + movd m4, [r2 + 20] ;[x x C B] + movh m0, [r2 + 2] ;[4 3 2 1] + punpcklwd m4, m3 ;[x C A B] + pshuflw m4, m4, 0x4A ;[A B C C] + punpcklqdq m4, m0 ;[4 3 2 1 A B C C] + psrldq m4, 2 + mova m1, m4 + mova m2, m4 + psrldq m1, 4 + psrldq m2, 2 + punpcklwd m4, m2 ;[2 1 1 0 0 x x y] + punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m3, m2 + + CALC_4x4 11, 22, 1, 12 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_21, 3,3,5 + movd m3, [r2] ;[x x x A] + movh m4, [r2 + 20] ;[x C x B] + movh m0, [r2 + 2] ;[4 3 2 1] + pshuflw m4, m4, 0x22 ;[B C B C] + punpcklqdq m4, m3 ;[x x x A B C B C] + psrldq m4, 2 ;[x x x x A B C B] + punpcklqdq m4, m0 + psrldq m4, 2 + mova m1, m4 + mova m2, m4 + psrldq m1, 4 + psrldq m2, 2 + punpcklwd m4, m2 ;[2 1 1 0 0 x x y] + punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m3, m2 + + CALC_4x4 15, 30, 13, 28 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_22, 3,3,5 + movd m4, [r2 + 18] + movd m1, [r2 - 2] + movh m0, [r2 + 2] + punpcklwd m4, m1 + punpcklqdq m4, m0 + psrldq m4, 4 + mova m1, m4 + psrldq m1, 2 + punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m2, m1 + mova m3, m4 + + CALC_4x4 19, 6, 25, 12 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_23, 3,3,5 + movd m4, [r2 + 22] + movd m1, [r2 - 2] + movh m0, [r2 + 2] + punpcklwd m4, m1 + punpcklqdq m4, m0 + psrldq m4, 4 + mova m1, m4 + psrldq m1, 2 + punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x] + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m2, m1 + mova m3, m1 + + CALC_4x4 23, 14, 5, 28 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_24, 3,3,5 + movh m0, [r2 + 2] + movh m1, [r2 - 6] + punpcklqdq m1, m0 + psrldq m1, 6 + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova 
m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 27, 22, 17, 12 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_25, 3,3,5 + movh m0, [r2 + 2] ;[x x x 4 3 2 1 0] + movh m1, [r2 - 6] + punpcklqdq m1, m0 + psrldq m1, 6 + punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0] + mova m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 30, 28, 26, 24 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_26, 3,3,3 + movh m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + add r1d, r1d + ; store + movh [r0], m0 + movh [r0 + r1], m0 + movh [r0 + r1 * 2], m0 + lea r3, [r1 * 3] + movh [r0 + r3], m0 + + ; filter + cmp r4m, byte 0 + jz .quit + + pshuflw m0, m0, 0x00 + movd m2, [r2] + pshuflw m2, m2, 0x00 + movh m1, [r2 + 18] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] + + movh r2, m0 + mov [r0], r2w + shr r2, 16 + mov [r0 + r1], r2w + shr r2, 16 + mov [r0 + r1 * 2], r2w + shr r2, 16 + mov [r0 + r3], r2w +.quit: + RET + +cglobal intra_pred_ang4_27, 3,3,5 + movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 2, 4, 6, 8 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_28, 3,3,5 + + movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m1 + mova m4, m1 + + CALC_4x4 5, 10, 15, 20 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_29, 3,3,5 + movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m1 + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[6 5 5 4 4 3 3 2] + + CALC_4x4 9, 18, 27, 4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_30, 3,3,5 + movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m1 + mova m3, m0 + psrldq m0, 2 + punpcklwd m3, m0 ;[6 5 5 4 4 3 3 2] + mova m4, m3 + + CALC_4x4 13, 26, 7, 20 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_5, 3,3,5 + 
movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m0 + psrldq m0, 2 + punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2] + mova m3, m2 + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3] + + CALC_4x4 17, 2, 19, 4 + + TRANSPOSE_4x4 + + STORE_4x4 + RET + +cglobal intra_pred_ang4_31, 3,3,5 + movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m0 + psrldq m0, 2 + punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2] + mova m3, m2 + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3] + + CALC_4x4 17, 2, 19, 4 + + STORE_4x4 + RET + + cglobal intra_pred_ang4_32, 3,3,5 + movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1] + mova m1, m0 + psrldq m0, 2 + punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1] + mova m2, m0 + psrldq m0, 2 + punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2] + mova m3, m2 + mova m4, m0 + psrldq m0, 2 + punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3] + + CALC_4x4 21, 10, 31, 20 + + STORE_4x4 + RET + +;----------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) +;----------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc4, 5,6,2 + lea r3, [r2 + 18] + add r2, 2 + + movh m0, [r3] ; sumAbove + movh m1, [r2] ; sumLeft + + paddw m0, m1 + pshufd m1, m0, 1 + paddw m0, m1 + phaddw m0, m0 ; m0 = sum + + test r4d, r4d + + pmulhrsw m0, [pw_4096] ; m0 = (sum + 4) / 8 + movd r4d, m0 ; r4d = dc_val + movzx r4d, r4w + pshuflw m0, m0, 0 ; m0 = word [dc_val ...] 
+ + ; store DC 4x4 + movh [r0], m0 + movh [r0 + r1 * 2], m0 + movh [r0 + r1 * 4], m0 + lea r5, [r0 + r1 * 4] + movh [r5 + r1 * 2], m0 + + ; do DC filter + jz .end + lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2 + add r4d, r5d ; r4d = DC * 3 + 2 + movd m0, r4d + pshuflw m0, m0, 0 ; m0 = pixDCx3 + + ; filter top + movu m1, [r2] + paddw m1, m0 + psrlw m1, 2 + movh [r0], m1 ; overwrite top-left pixel, we will update it later + + ; filter top-left + movzx r4d, word [r3] + add r5d, r4d + movzx r4d, word [r2] + add r4d, r5d + shr r4d, 2 + mov [r0], r4w + + ; filter left + lea r0, [r0 + r1 * 2] + movu m1, [r3 + 2] + paddw m1, m0 + psrlw m1, 2 + movd r3d, m1 + mov [r0], r3w + shr r3d, 16 + mov [r0 + r1 * 2], r3w + pextrw [r0 + r1 * 4], m1, 2 +.end: + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar4, 3,3,5 + movu m1, [r2 + 2] + movu m2, [r2 + 18] + pshufhw m3, m1, 0 ; topRight + pshufd m3, m3, 0xAA + pshufhw m4, m2, 0 ; bottomLeft + pshufd m4, m4, 0xAA + + pmullw m3, [multi_2Row] ; (x + 1) * topRight + pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x] + + paddw m3, [pw_4] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 + + pshuflw m1, m2, 0 + pmullw m1, [pw_planar4_0] + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0], m1 + + pshuflw m1, m2, 01010101b + pmullw m1, [pw_planar4_0] + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0 + r1 * 2], m1 + lea r0, [r0 + 4 * r1] + + pshuflw m1, m2, 10101010b + pmullw m1, [pw_planar4_0] + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0], m1 + + pshuflw m1, m2, 11111111b + pmullw m1, [pw_planar4_0] + paddw m1, m3 + psraw m1, 3 + movh [r0 + r1 * 2], m1 + RET + +;----------------------------------------------------------------------------------- +; void 
; intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
;-----------------------------------------------------------------------------------
; 8x8 DC intra prediction, 16-bit pixels.
; r0 = dst, r1 = dstStride in pixels (doubled below to a byte stride),
; r2 = reference-pixel buffer, r4d = filter flag (non-zero -> smooth block edges).
; NOTE(review): r2+2 is read as the 8 "above" samples and r2+34 as the 8 "left"
; samples ((2*8+1)*sizeof(pixel) in) — confirm against the srcPix layout of callers.
INIT_XMM sse4
cglobal intra_pred_dc8, 5, 7, 2
    lea             r3, [r2 + 34]           ; r3 -> left-neighbour run
    add             r2, 2                   ; r2 -> above-neighbour run
    add             r1, r1                  ; pixel stride -> byte stride
    movu            m0, [r3]                ; 8 left samples
    movu            m1, [r2]                ; 8 above samples

    ; horizontal reduction: sum all 16 neighbour words into one dword
    paddw           m0, m1
    movhlps         m1, m0
    paddw           m0, m1
    phaddw          m0, m0
    pmaddwd         m0, [pw_1]

    movd            r5d, m0
    add             r5d, 8                  ; rounding bias
    shr             r5d, 4                  ; sum = (sum + 8) / 16
    movd            m1, r5d
    pshuflw         m1, m1, 0               ; m1 = word [dc_val ...]
    pshufd          m1, m1, 0

    test            r4d, r4d                ; ZF consumed by "jz .end" below

    ; store DC 8x8
    mov             r6, r0                  ; keep dst: the filter pass rewrites row/col 0
    movu            [r0], m1
    movu            [r0 + r1], m1
    movu            [r0 + r1 * 2], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0 + r1], m1
    movu            [r0 + r1 * 2], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0 + r1], m1
    movu            [r0 + r1 * 2], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0 + r1], m1

    ; Do DC Filter (blend first row/column with the reference samples)
    jz              .end
    lea             r4d, [r5d * 2 + 2]      ; r4d = DC * 2 + 2
    add             r5d, r4d                ; r5d = DC * 3 + 2
    movd            m1, r5d
    pshuflw         m1, m1, 0               ; m1 = pixDCx3
    pshufd          m1, m1, 0

    ; filter top row: (above[x] + 3*DC + 2) >> 2
    movu            m0, [r2]
    paddw           m0, m1
    psrlw           m0, 2
    movu            [r6], m0

    ; filter top-left: (left[0] + above[0] + 2*DC + 2) >> 2
    movzx           r5d, word [r3]
    add             r4d, r5d
    movzx           r5d, word [r2]
    add             r5d, r4d
    shr             r5d, 2
    mov             [r6], r5w

    ; filter left column, rows 1..7: (left[y] + 3*DC + 2) >> 2
    add             r6, r1
    movu            m0, [r3 + 2]
    paddw           m0, m1
    psrlw           m0, 2
    pextrw          [r6], m0, 0
    pextrw          [r6 + r1], m0, 1
    pextrw          [r6 + r1 * 2], m0, 2
    lea             r6, [r6 + r1 * 2]
    pextrw          [r6 + r1], m0, 3
    pextrw          [r6 + r1 * 2], m0, 4
    lea             r6, [r6 + r1 * 2]
    pextrw          [r6 + r1], m0, 5
    pextrw          [r6 + r1 * 2], m0, 6
.end:
    RET

;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;-------------------------------------------------------------------------------------------------------
; 16x16 DC intra prediction, 16-bit pixels; same scheme as dc8 but 16+16 samples.
INIT_XMM sse4
cglobal intra_pred_dc16, 5, 7, 4
    lea             r3, [r2 + 66]           ; r3 -> left run ((2*16+1)*sizeof(pixel) in)
    add             r2, 2                   ; r2 -> above run
    add             r1, r1                  ; pixel stride -> byte stride
    movu            m0, [r3]
    movu            m1, [r3 + 16]
    movu            m2, [r2]
    movu            m3,                     ; (operand continues on the next source line)
[r2 + 16]                               ; (completes "movu m3, ..." begun on the previous source line)

    ; horizontal reduction of all 32 neighbour samples (16 above + 16 left)
    paddw           m0, m1                  ; dynamic range 13 bits
    paddw           m2, m3
    paddw           m0, m2                  ; dynamic range 14 bits
    movhlps         m1, m0                  ; dynamic range 15 bits
    paddw           m0, m1                  ; dynamic range 16 bits
    pmaddwd         m0, [pw_1]
    phaddd          m0, m0

    movd            r5d, m0
    add             r5d, 16                 ; rounding bias
    shr             r5d, 5                  ; sum = (sum + 16) / 32  (32 samples; old comment wrongly said /16)
    movd            m1, r5d
    pshuflw         m1, m1, 0               ; m1 = word [dc_val ...]
    pshufd          m1, m1, 0

    test            r4d, r4d                ; ZF consumed by "jz .end" below

    ; store DC 16x16: two 8-word halves per row, two rows per group
    mov             r6, r0                  ; keep dst: the filter pass rewrites row/col 0
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1
    lea             r0, [r0 + r1 * 2]
    movu            [r0], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16 + r1], m1

    ; Do DC Filter (blend first row/column with the reference samples)
    jz              .end
    lea             r4d, [r5d * 2 + 2]      ; r4d = DC * 2 + 2
    add             r5d, r4d                ; r5d = DC * 3 + 2
    movd            m1, r5d
    pshuflw         m1, m1, 0               ; m1 = pixDCx3
    pshufd          m1, m1, 0

    ; filter top row: (above[x] + 3*DC + 2) >> 2, two 8-word halves
    movu            m2, [r2]
    paddw           m2, m1
    psrlw           m2, 2
    movu            [r6], m2
    movu            m3, [r2 + 16]
    paddw           m3, m1
    psrlw           m3, 2
    movu            [r6 + 16], m3

    ; filter top-left: (left[0] + above[0] + 2*DC + 2) >> 2
    movzx           r5d, word [r3]
    add             r4d, r5d
    movzx           r5d, word [r2]
    add             r5d, r4d
    shr             r5d, 2
    mov             [r6], r5w

    ; filter left column, rows 1..7: (left[y] + 3*DC + 2) >> 2
    add             r6, r1
    movu            m2, [r3 + 2]
    paddw           m2, m1
    psrlw           m2, 2

    pextrw          [r6], m2, 0
    pextrw          [r6 + r1], m2, 1
    lea             r6, [r6 + r1 * 2]
    pextrw          [r6], m2, 2
    pextrw          [r6 + r1], m2, 3
    lea             r6, [r6 + r1 * 2]
    pextrw          [r6], m2, 4
    pextrw          [r6 + r1], m2, 5
    lea             r6, [r6 + r1 *      ; (operand continues on the next source line)
2] + pextrw [r6], m2, 6 + pextrw [r6 + r1], m2, 7 + + lea r6, [r6 + r1 * 2] + movu m3, [r3 + 18] + paddw m3, m1 + psrlw m3, 2 + + pextrw [r6], m3, 0 + pextrw [r6 + r1], m3, 1 + lea r6, [r6 + r1 * 2] + pextrw [r6], m3, 2 + pextrw [r6 + r1], m3, 3 + lea r6, [r6 + r1 * 2] + pextrw [r6], m3, 4 + pextrw [r6 + r1], m3, 5 + lea r6, [r6 + r1 * 2] + pextrw [r6], m3, 6 +.end: + RET + +;------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter) +;------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc32, 3, 5, 6 + lea r3, [r2 + 130] ;130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel) + add r2, 2 + add r1, r1 + movu m0, [r3] + movu m1, [r3 + 16] + movu m2, [r3 + 32] + movu m3, [r3 + 48] + paddw m0, m1 ; dynamic range 13 bits + paddw m2, m3 + paddw m0, m2 ; dynamic range 14 bits + movu m1, [r2] + movu m3, [r2 + 16] + movu m4, [r2 + 32] + movu m5, [r2 + 48] + paddw m1, m3 ; dynamic range 13 bits + paddw m4, m5 + paddw m1, m4 ; dynamic range 14 bits + paddw m0, m1 ; dynamic range 15 bits + pmaddwd m0, [pw_1] + movhlps m1, m0 + paddd m0, m1 + phaddd m0, m0 + + paddd m0, [pd_32] ; sum = sum + 32 + psrld m0, 6 ; sum = sum / 64 + pshuflw m0, m0, 0 + pshufd m0, m0, 0 + + lea r2, [r1 * 3] + mov r3d, 4 +.loop: + ; store DC 32x32 + movu [r0 + 0], m0 + movu [r0 + 16], m0 + movu [r0 + 32], m0 + movu [r0 + 48], m0 + movu [r0 + r1 + 0], m0 + movu [r0 + r1 + 16], m0 + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + 16], m0 + movu [r0 + r1 * 2 + 32], m0 + movu [r0 + r1 * 2 + 48], m0 + movu [r0 + r2 + 0], m0 + movu [r0 + r2 + 16], m0 + movu [r0 + r2 + 32], m0 + movu [r0 + r2 + 48], m0 + lea r0, [r0 + r1 * 4] + movu [r0 + 0], m0 + movu [r0 + 16], m0 + movu [r0 + 32], m0 + movu [r0 + 48], m0 + movu [r0 + r1 + 0], m0 + movu [r0 + r1 + 16], m0 + movu [r0 + 
r1 + 32], m0 + movu [r0 + r1 + 48], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + 16], m0 + movu [r0 + r1 * 2 + 32], m0 + movu [r0 + r1 * 2 + 48], m0 + movu [r0 + r2 + 0], m0 + movu [r0 + r2 + 16], m0 + movu [r0 + r2 + 32], m0 + movu [r0 + r2 + 48], m0 + lea r0, [r0 + r1 * 4] + dec r3d + jnz .loop + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar4, 3,3,5 + add r1, r1 + movu m1, [r2 + 2] + movu m2, [r2 + 18] + pshufhw m3, m1, 0 ; topRight + pshufd m3, m3, 0xAA + pshufhw m4, m2, 0 ; bottomLeft + pshufd m4, m4, 0xAA + + pmullw m3, [multi_2Row] ; (x + 1) * topRight + pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x] + + paddw m3, [pw_4] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 + mova m0, [pw_planar4_0] + + pshuflw m1, m2, 0 + pmullw m1, m0 + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0], m1 + + pshuflw m1, m2, 01010101b + pmullw m1, m0 + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + + pshuflw m1, m2, 10101010b + pmullw m1, m0 + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0], m1 + + pshuflw m1, m2, 11111111b + pmullw m1, m0 + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0 + r1], m1 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar8, 3,3,5 + add r1, r1 + movu m1, [r2 + 2] + movu m2, [r2 + 34] + + movd m3, [r2 + 18] ; topRight = above[8]; + movd m4, [r2 + 50] ; bottomLeft = left[8]; + + pshuflw m3, m3, 0 + pshuflw m4, m4, 0 + pshufd m3, m3, 0 ; v_topRight + pshufd 
m4, m4, 0 ; v_bottomLeft + + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x] + paddw m3, [pw_8] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 + mova m0, [pw_planar16_mul + mmsize] + +%macro INTRA_PRED_PLANAR8 1 +%if (%1 < 4) + pshuflw m1, m2, 0x55 * %1 + pshufd m1, m1, 0 +%else + pshufhw m1, m2, 0x55 * (%1 - 4) + pshufd m1, m1, 0xAA +%endif + pmullw m1, m0 + paddw m1, m3 + paddw m3, m4 + psraw m1, 4 + movu [r0], m1 + lea r0, [r0 + r1] +%endmacro + + INTRA_PRED_PLANAR8 0 + INTRA_PRED_PLANAR8 1 + INTRA_PRED_PLANAR8 2 + INTRA_PRED_PLANAR8 3 + INTRA_PRED_PLANAR8 4 + INTRA_PRED_PLANAR8 5 + INTRA_PRED_PLANAR8 6 + INTRA_PRED_PLANAR8 7 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar16, 3,3,8 + add r1, r1 + movu m2, [r2 + 2] + movu m7, [r2 + 18] + + movd m3, [r2 + 34] ; topRight = above[16] + movd m6, [r2 + 98] ; bottomLeft = left[16] + + pshuflw m3, m3, 0 + pshuflw m6, m6, 0 + pshufd m3, m3, 0 ; v_topRight + pshufd m6, m6, 0 ; v_bottomLeft + + pmullw m4, m3, [multiH] ; (x + 1) * topRight + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x] + pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x] + paddw m4, [pw_16] + paddw m3, [pw_16] + paddw m4, m6 + paddw m3, m6 + paddw m4, m5 + paddw m3, m1 + psubw m1, m6, m7 + psubw m6, m2 + + movu m2, [r2 + 66] + movu m7, [r2 + 82] + +%macro INTRA_PRED_PLANAR16 1 +%if (%1 < 4) + pshuflw m5, m2, 0x55 * %1 + pshufd m5, m5, 0 +%else +%if (%1 < 8) + pshufhw m5, m2, 0x55 * (%1 - 4) + pshufd m5, m5, 0xAA +%else +%if (%1 < 12) + pshuflw m5, m7, 0x55 * (%1 - 8) + pshufd m5, m5, 0 +%else + pshufhw m5, m7, 0x55 * (%1 - 12) + pshufd m5, m5, 0xAA +%endif +%endif +%endif + pmullw m0, m5, 
[pw_planar16_mul + mmsize] + pmullw m5, [pw_planar16_mul] + paddw m0, m4 + paddw m5, m3 + paddw m3, m6 + paddw m4, m1 + psraw m5, 5 + psraw m0, 5 + movu [r0], m5 + movu [r0 + 16], m0 + lea r0, [r0 + r1] +%endmacro + + INTRA_PRED_PLANAR16 0 + INTRA_PRED_PLANAR16 1 + INTRA_PRED_PLANAR16 2 + INTRA_PRED_PLANAR16 3 + INTRA_PRED_PLANAR16 4 + INTRA_PRED_PLANAR16 5 + INTRA_PRED_PLANAR16 6 + INTRA_PRED_PLANAR16 7 + INTRA_PRED_PLANAR16 8 + INTRA_PRED_PLANAR16 9 + INTRA_PRED_PLANAR16 10 + INTRA_PRED_PLANAR16 11 + INTRA_PRED_PLANAR16 12 + INTRA_PRED_PLANAR16 13 + INTRA_PRED_PLANAR16 14 + INTRA_PRED_PLANAR16 15 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal intra_pred_planar32, 3,7,16 + ; NOTE: align stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 4*mmsize + and rsp, ~63 + %define m16 [rsp + 0 * mmsize] + %define m17 [rsp + 1 * mmsize] + %define m18 [rsp + 2 * mmsize] + %define m19 [rsp + 3 * mmsize] +%else +cglobal intra_pred_planar32, 3,7,8 + ; NOTE: align stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 12*mmsize + and rsp, ~63 + %define m8 [rsp + 0 * mmsize] + %define m9 [rsp + 1 * mmsize] + %define m10 [rsp + 2 * mmsize] + %define m11 [rsp + 3 * mmsize] + %define m12 [rsp + 4 * mmsize] + %define m13 [rsp + 5 * mmsize] + %define m14 [rsp + 6 * mmsize] + %define m15 [rsp + 7 * mmsize] + %define m16 [rsp + 8 * mmsize] + %define m17 [rsp + 9 * mmsize] + %define m18 [rsp + 10 * mmsize] + %define m19 [rsp + 11 * mmsize] +%endif + add r1, r1 + lea r5, [planar32_table1] + + movzx r3d, word [r2 + 66] ; topRight = above[32] + movd m7, r3d + pshufd m7, m7, 0 ; v_topRight + + pmulld m0, m7, [r5 + 0 ] ; (x + 1) * topRight + pmulld m1, m7, [r5 + 
16 ] + pmulld m2, m7, [r5 + 32 ] + pmulld m3, m7, [r5 + 48 ] + pmulld m4, m7, [r5 + 64 ] + pmulld m5, m7, [r5 + 80 ] + pmulld m6, m7, [r5 + 96 ] + pmulld m7, m7, [r5 + 112] + + mova m12, m4 + mova m13, m5 + mova m14, m6 + mova m15, m7 + + movzx r3d, word [r2 + 194] ; bottomLeft = left[32] + movd m6, r3d + pshufd m6, m6, 0 ; v_bottomLeft + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + paddd m0, [pd_32] + paddd m1, [pd_32] + paddd m2, [pd_32] + paddd m3, [pd_32] + + mova m4, m12 + mova m5, m13 + paddd m4, m6 + paddd m5, m6 + paddd m4, [pd_32] + paddd m5, [pd_32] + mova m12, m4 + mova m13, m5 + + mova m4, m14 + mova m5, m15 + paddd m4, m6 + paddd m5, m6 + paddd m4, [pd_32] + paddd m5, [pd_32] + mova m14, m4 + mova m15, m5 + + ; above[0-3] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 2] + pmulld m5, m4, [pd_31] + paddd m0, m5 + psubd m5, m6, m4 + mova m8, m5 + + ; above[4-7] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 10] + pmulld m5, m4, [pd_31] + paddd m1, m5 + psubd m5, m6, m4 + mova m9, m5 + + ; above[8-11] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 18] + pmulld m5, m4, [pd_31] + paddd m2, m5 + psubd m5, m6, m4 + mova m10, m5 + + ; above[12-15] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 26] + pmulld m5, m4, [pd_31] + paddd m3, m5 + psubd m5, m6, m4 + mova m11, m5 + + ; above[16-19] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 34] + mova m7, m12 + pmulld m5, m4, [pd_31] + paddd m7, m5 + mova m12, m7 + psubd m5, m6, m4 + mova m16, m5 + + ; above[20-23] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 42] + mova m7, m13 + pmulld m5, m4, [pd_31] + paddd m7, m5 + mova m13, m7 + psubd m5, m6, m4 + mova m17, m5 + + ; above[24-27] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 50] + mova m7, m14 + pmulld m5, m4, [pd_31] + paddd m7, m5 + mova m14, m7 + psubd m5, m6, m4 + mova m18, m5 + + ; above[28-31] * (blkSize - 1 - y) + pmovzxwd m4, [r2 + 58] + mova m7, m15 + pmulld m5, m4, [pd_31] + paddd m7, m5 + mova m15, m7 + psubd m5, m6, m4 + mova m19, m5 + + add r2, 130 ; (2 * blkSize + 1) + 
lea r5, [planar32_table] + +%macro INTRA_PRED_PLANAR32 0 + movzx r3d, word [r2] + movd m4, r3d + pshufd m4, m4, 0 + + pmulld m5, m4, [r5] + pmulld m6, m4, [r5 + 16] + paddd m5, m0 + paddd m6, m1 + paddd m0, m8 + paddd m1, m9 + psrad m5, 6 + psrad m6, 6 + packusdw m5, m6 + movu [r0], m5 + + pmulld m5, m4, [r5 + 32] + pmulld m6, m4, [r5 + 48] + paddd m5, m2 + paddd m6, m3 + paddd m2, m10 + paddd m3, m11 + psrad m5, 6 + psrad m6, 6 + packusdw m5, m6 + movu [r0 + 16], m5 + + pmulld m5, m4, [r5 + 64] + pmulld m6, m4, [r5 + 80] + paddd m5, m12 + paddd m6, m13 + psrad m5, 6 + psrad m6, 6 + packusdw m5, m6 + movu [r0 + 32], m5 + mova m5, m12 + mova m6, m13 + paddd m5, m16 + paddd m6, m17 + mova m12, m5 + mova m13, m6 + + pmulld m5, m4, [r5 + 96] + pmulld m4, [r5 + 112] + paddd m5, m14 + paddd m4, m15 + psrad m5, 6 + psrad m4, 6 + packusdw m5, m4 + movu [r0 + 48], m5 + mova m4, m14 + mova m5, m15 + paddd m4, m18 + paddd m5, m19 + mova m14, m4 + mova m15, m5 + + lea r0, [r0 + r1] + add r2, 2 +%endmacro + + mov r4, 8 +.loop: + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + dec r4 + jnz .loop + mov rsp, r6 + RET + +;----------------------------------------------------------------------------------------- +; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang4_2, 3,5,4 + lea r4, [r2 + 4] + add r2, 20 + cmp r3m, byte 34 + cmove r2, r4 + + add r1, r1 + movu m0, [r2] + movh [r0], m0 + palignr m1, m0, 2 + movh [r0 + r1], m1 + palignr m2, m0, 4 + movh [r0 + r1 * 2], m2 + lea r1, [r1 * 3] + psrldq m0, 6 + movh [r0 + r1], m0 + RET + +INIT_XMM sse4 +cglobal intra_pred_ang4_3, 3,5,8 + mov r4, 2 + cmp r3m, byte 33 + mov r3, 18 + cmove r3, r4 + + movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2] + punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1] + palignr m5, 
m0, 4 ; [x x 8 7 6 5 4 3] + punpcklwd m3, m1, m5 ; [6 5 5 4 4 3 3 2] + palignr m1, m0, 6 ; [x x x 8 7 6 5 4] + punpcklwd m4, m5 ,m1 ; [7 6 6 5 5 4 4 3] + movhlps m0, m0 ; [x x x x 8 7 6 5] + punpcklwd m5, m1, m0 ; [8 7 7 6 6 5 5 4] + + lea r3, [ang_table + 20 * 16] + mova m0, [r3 + 6 * 16] ; [26] + mova m1, [r3] ; [20] + mova m6, [r3 - 6 * 16] ; [14] + mova m7, [r3 - 12 * 16] ; [ 8] + jmp .do_filter4x4 + +ALIGN 16 +.do_filter4x4: + pmaddwd m2, m0 + paddd m2, [pd_16] + psrld m2, 5 + + pmaddwd m3, m1 + paddd m3, [pd_16] + psrld m3, 5 + packusdw m2, m3 + + pmaddwd m4, m6 + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m5, m7 + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + jz .store + + ; transpose 4x4 + punpckhwd m0, m2, m4 + punpcklwd m2, m4 + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + +.store: + add r1, r1 + movh [r0], m2 + movhps [r0 + r1], m2 + movh [r0 + r1 * 2], m4 + lea r1, [r1 * 3] + movhps [r0 + r1], m4 + RET + +cglobal intra_pred_ang4_4, 3,5,8 + mov r4, 2 + cmp r3m, byte 32 + mov r3, 18 + cmove r3, r4 + + movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2] + punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1] + palignr m6, m0, 4 ; [x x 8 7 6 5 4 3] + punpcklwd m3, m1, m6 ; [6 5 5 4 4 3 3 2] + mova m4, m3 + palignr m7, m0, 6 ; [x x x 8 7 6 5 4] + punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3] + + lea r3, [ang_table + 18 * 16] + mova m0, [r3 + 3 * 16] ; [21] + mova m1, [r3 - 8 * 16] ; [10] + mova m6, [r3 + 13 * 16] ; [31] + mova m7, [r3 + 2 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_5, 3,5,8 + mov r4, 2 + cmp r3m, byte 31 + mov r3, 18 + cmove r3, r4 + + movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2] + punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1] + palignr m6, m0, 4 ; [x x 8 7 6 5 4 3] + punpcklwd m3, m1, m6 ; [6 5 5 4 4 3 3 2] + mova m4, m3 + palignr m7, m0, 6 ; [x x x 8 7 6 5 4] + punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3] + + lea r3, 
[ang_table + 10 * 16] + mova m0, [r3 + 7 * 16] ; [17] + mova m1, [r3 - 8 * 16] ; [ 2] + mova m6, [r3 + 9 * 16] ; [19] + mova m7, [r3 - 6 * 16] ; [ 4] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_6, 3,5,8 + mov r4, 2 + cmp r3m, byte 30 + mov r3, 18 + cmove r3, r4 + + movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2] + punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1] + mova m3, m2 + palignr m6, m0, 4 ; [x x 8 7 6 5 4 3] + punpcklwd m4, m1, m6 ; [6 5 5 4 4 3 3 2] + mova m5, m4 + + lea r3, [ang_table + 19 * 16] + mova m0, [r3 - 6 * 16] ; [13] + mova m1, [r3 + 7 * 16] ; [26] + mova m6, [r3 - 12 * 16] ; [ 7] + mova m7, [r3 + 1 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_7, 3,5,8 + mov r4, 2 + cmp r3m, byte 29 + mov r3, 18 + cmove r3, r4 + + movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2] + punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1] + mova m3, m2 + mova m4, m2 + palignr m6, m0, 4 ; [x x 8 7 6 5 4 3] + punpcklwd m5, m1, m6 ; [6 5 5 4 4 3 3 2] + + lea r3, [ang_table + 20 * 16] + mova m0, [r3 - 11 * 16] ; [ 9] + mova m1, [r3 - 2 * 16] ; [18] + mova m6, [r3 + 7 * 16] ; [27] + mova m7, [r3 - 16 * 16] ; [ 4] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_8, 3,5,8 + mov r4, 2 + cmp r3m, byte 28 + mov r3, 18 + cmove r3, r4 + + movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2] + punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1] + mova m3, m2 + mova m4, m2 + mova m5, m2 + + lea r3, [ang_table + 13 * 16] + mova m0, [r3 - 8 * 16] ; [ 5] + mova m1, [r3 - 3 * 16] ; [10] + mova m6, [r3 + 2 * 16] ; [15] + mova m7, [r3 + 7 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_9, 3,5,8 + mov r4, 2 + cmp r3m, byte 27 + mov r3, 18 + cmove r3, r4 + + movu 
m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2] + punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1] + mova m3, m2 + mova m4, m2 + mova m5, m2 + + lea r3, [ang_table + 4 * 16] + mova m0, [r3 - 2 * 16] ; [ 2] + mova m1, [r3 - 0 * 16] ; [ 4] + mova m6, [r3 + 2 * 16] ; [ 6] + mova m7, [r3 + 4 * 16] ; [ 8] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_10, 3,3,4 + movh m0, [r2 + 18] ; [4 3 2 1] + pshufb m2, m0, [pb_unpackwq2] ; [4 4 4 4 3 3 3 3] + pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1] + add r1, r1 + movhlps m1, m0 ; [2 2 2 2] + movhlps m3, m2 ; [4 4 4 4] + movh [r0 + r1], m1 + movh [r0 + r1 * 2], m2 + lea r1, [r1 * 3] + movh [r0 + r1], m3 + + cmp r4m, byte 0 + jz .quit + + ; filter + movu m1, [r2] ; [7 6 5 4 3 2 1 0] + pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0] + palignr m1, m1, 2 ; [4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] +.quit: + movh [r0], m0 + RET + +cglobal intra_pred_ang4_26, 3,4,3 + movh m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + add r1, r1 + ; store + movh [r0], m0 + movh [r0 + r1], m0 + movh [r0 + r1 * 2], m0 + lea r3, [r1 * 3] + movh [r0 + r3], m0 + + ; filter + cmp r4m, byte 0 + jz .quit + + pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1] + movu m1, [r2 + 16] + pinsrw m1, [r2], 0 ; [7 6 5 4 3 2 1 0] + pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0] + palignr m1, m1, 2 ; [4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] + + pextrw [r0], m0, 0 + pextrw [r0 + r1], m0, 1 + pextrw [r0 + r1 * 2], m0, 2 + pextrw [r0 + r3], m0, 3 +.quit: + RET + +cglobal intra_pred_ang4_11, 3,5,8 + xor r4, r4 + cmp r3m, byte 25 + mov r3, 16 + cmove r3, r4 + + movu m2, [r2 + r3] ; [x x x 4 3 2 1 0] + pinsrw m2, [r2], 0 + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0] + mova m3, m2 + mova m4, m2 + mova m5, m2 + + lea r3, [ang_table + 24 * 16] + mova m0, 
[r3 + 6 * 16] ; [24] + mova m1, [r3 + 4 * 16] ; [26] + mova m6, [r3 + 2 * 16] ; [28] + mova m7, [r3 + 0 * 16] ; [30] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_12, 3,5,8 + xor r4, r4 + cmp r3m, byte 24 + mov r3, 16 + cmove r3, r4 + + movu m2, [r2 + r3] ; [x x x 4 3 2 1 0] + pinsrw m2, [r2], 0 + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0] + mova m3, m2 + mova m4, m2 + mova m5, m2 + + lea r3, [ang_table + 20 * 16] + mova m0, [r3 + 7 * 16] ; [27] + mova m1, [r3 + 2 * 16] ; [22] + mova m6, [r3 - 3 * 16] ; [17] + mova m7, [r3 - 8 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_13, 3,5,8 + xor r4, r4 + cmp r3m, byte 23 + mov r3, 16 + jz .next + xchg r3, r4 +.next: + movu m5, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x] + pinsrw m5, [r2], 1 + palignr m2, m5, 2 ; [x x x 4 3 2 1 0] + palignr m0, m5, 4 ; [x x x x 4 3 2 1] + pinsrw m5, [r2 + r3 + 8], 0 + punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x] + punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0] + mova m3, m2 + mova m4, m2 + + lea r3, [ang_table + 21 * 16] + mova m0, [r3 + 2 * 16] ; [23] + mova m1, [r3 - 7 * 16] ; [14] + mova m6, [r3 - 16 * 16] ; [ 5] + mova m7, [r3 + 7 * 16] ; [28] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_14, 3,5,8 + xor r4, r4 + cmp r3m, byte 22 + mov r3, 16 + jz .next + xchg r3, r4 +.next: + movu m5, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x] + pinsrw m5, [r2], 1 + palignr m2, m5, 2 ; [x x x 4 3 2 1 0] + palignr m0, m5, 4 ; [x x x x 4 3 2 1] + pinsrw m5, [r2 + r3 + 4], 0 + punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x] + punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0] + mova m3, m2 + mova m4, m5 + + lea r3, [ang_table + 19 * 16] + mova m0, [r3 + 0 * 16] ; [19] + mova m1, [r3 - 13 * 16] ; [ 6] + mova m6, [r3 + 6 * 16] ; [25] + mova m7, [r3 - 7 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ 
.do_filter4x4) + +cglobal intra_pred_ang4_15, 3,5,8 + xor r4, r4 + cmp r3m, byte 21 + mov r3, 16 + jz .next + xchg r3, r4 +.next: + movu m3, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x] + pinsrw m3, [r2], 1 + palignr m2, m3, 2 ; [x x x 4 3 2 1 0] + palignr m0, m3, 4 ; [x x x x 4 3 2 1] + pinsrw m3, [r2 + r3 + 4], 0 + pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y] + pinsrw m5, [r2 + r3 + 8], 0 + punpcklwd m5, m3 ; [2 1 1 0 0 x x y] + punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x] + punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0] + mova m4, m3 + + lea r3, [ang_table + 23 * 16] + mova m0, [r3 - 8 * 16] ; [15] + mova m1, [r3 + 7 * 16] ; [30] + mova m6, [r3 - 10 * 16] ; [13] + mova m7, [r3 + 5 * 16] ; [28] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_16, 3,5,8 + xor r4, r4 + cmp r3m, byte 20 + mov r3, 16 + jz .next + xchg r3, r4 +.next: + movu m3, [r2 + r4 - 2] ; [x x 4 3 2 1 0 x] + pinsrw m3, [r2], 1 + palignr m2, m3, 2 ; [x x x 4 3 2 1 0] + palignr m0, m3, 4 ; [x x x x 4 3 2 1] + pinsrw m3, [r2 + r3 + 4], 0 + pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y] + pinsrw m5, [r2 + r3 + 6], 0 + punpcklwd m5, m3 ; [2 1 1 0 0 x x y] + punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x] + punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0] + mova m4, m3 + + lea r3, [ang_table + 19 * 16] + mova m0, [r3 - 8 * 16] ; [11] + mova m1, [r3 + 3 * 16] ; [22] + mova m6, [r3 - 18 * 16] ; [ 1] + mova m7, [r3 - 7 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_17, 3,5,8 + xor r4, r4 + cmp r3m, byte 19 + mov r3, 16 + jz .next + xchg r3, r4 +.next: + movu m6, [r2 + r4 - 2] ; [- - 4 3 2 1 0 x] + pinsrw m6, [r2], 1 + palignr m2, m6, 2 ; [- - - 4 3 2 1 0] + palignr m1, m6, 4 ; [- - - - 4 3 2 1] + mova m4, m2 + punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0] + + pinsrw m6, [r2 + r3 + 2], 0 + punpcklwd m3, m6, m4 ; [3 2 2 1 1 0 0 x] + + pslldq m4, m6, 2 ; [- 4 3 2 1 0 x y] + pinsrw m4, [r2 + r3 + 4], 0 + pslldq m5, m4, 2 ; [4 3 2 1 0 x y z] + pinsrw m5, 
[r2 + r3 + 8], 0 + punpcklwd m5, m4 ; [1 0 0 x x y y z] + punpcklwd m4, m6 ; [2 1 1 0 0 x x y] + + lea r3, [ang_table + 14 * 16] + mova m0, [r3 - 8 * 16] ; [ 6] + mova m1, [r3 - 2 * 16] ; [12] + mova m6, [r3 + 4 * 16] ; [18] + mova m7, [r3 + 10 * 16] ; [24] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + +cglobal intra_pred_ang4_18, 3,3,1 + movh m0, [r2 + 16] + pinsrw m0, [r2], 0 + pshufb m0, [pw_swap] + movhps m0, [r2 + 2] + add r1, r1 + lea r2, [r1 * 3] + movh [r0 + r2], m0 + psrldq m0, 2 + movh [r0 + r1 * 2], m0 + psrldq m0, 2 + movh [r0 + r1], m0 + psrldq m0, 2 + movh [r0], m0 + RET + +;----------------------------------------------------------------------------------------- +; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang8_2, 3,5,3 + lea r4, [r2] + add r2, 32 + cmp r3m, byte 34 + cmove r2, r4 + add r1, r1 + lea r3, [r1 * 3] + movu m0, [r2 + 4] + movu m1, [r2 + 20] + movu [r0], m0 + palignr m2, m1, m0, 2 + movu [r0 + r1], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 6 + movu [r0 + r3], m2 + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m1, m0, 10 + movu [r0 + r1], m2 + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2], m2 + palignr m1, m0, 14 + movu [r0 + r3], m1 + RET + +INIT_XMM sse4 +cglobal intra_pred_ang8_3, 3,5,8 + add r2, 32 + lea r3, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13] + + mova m4, m3 + pmaddwd m4, [r3 + 12 * 16] ; [26] + paddd m4, [pd_16] + 
psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m2, [r3 + 6 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 6 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m6, [r3] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 6 * 16] ; [ 8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m3, [r3 - 6 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + lea r4, [r1 * 3] + movh [r0], m7 + movhps [r0 + r1], m7 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m3 + movhps [r2 + r4], m3 + + mova m4, m0 + pmaddwd m4, [r3 - 12 * 16] ; [ 2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m5 + pmaddwd m2, [r3 - 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m0 + pmaddwd m2, [r3 + 14 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m5 + pmaddwd m6, [r3 + 14 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 8 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m7, [r3 + 8 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 + 2 * 16] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m1, 
[r3 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m7 + movhps [r0 + r1 + 8], m7 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + RET + +cglobal intra_pred_ang8_4, 3,6,8 + add r2, 32 + lea r3, [ang_table + 19 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 2 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 9 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 12 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 12 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 + 1 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m7 + movhps [r0 + r1], m7 + movh [r0 + r1 * 2], m4 + movhps [r0 + 
r4], m4 + lea r5, [r0 + r1 * 4] + movh [r5], m6 + movhps [r5 + r1], m6 + movh [r5 + r1 * 2], m1 + movhps [r5 + r4], m1 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m2, m4 + pmaddwd m4, [r3 - 10 * 16] ; [ 9] + paddd m4, [pd_16] + psrld m4, 5 + palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m6, m3 + pmaddwd m3, [r3 - 10 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m4, m3 + + pmaddwd m2, [r3 + 11 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m6, [r3 + 11 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m0 + pmaddwd m6, [r3] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m5 + pmaddwd m7, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + movh m1, [r2 + 26] ; [16 15 14 13] + palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 11 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m1, [r3 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m7 + movhps [r0 + r1 + 8], m7 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + RET + +cglobal intra_pred_ang8_5, 3,5,8 + add r2, 32 + lea r3, [ang_table + 13 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 4 * 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 4 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr 
m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 6 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 9 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m7 + movhps [r0 + r1], m7 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m4, [r3 + 8 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m2, [r3 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m6, m2 + pmaddwd m2, [r3 - 7 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m1 + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 10 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 10 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m0 + pmaddwd m7, [r3 - 5 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m5 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + 
punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m7 + movhps [r0 + r1 + 8], m7 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + RET + +cglobal intra_pred_ang8_6, 3,5,8 + add r2, 32 + lea r3, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 1 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 12 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m7, m6 + pmaddwd m6, [r3 - 7 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m7, [r3 + 6 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m7 + movhps [r0 + r1], m7 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m6, m4 + pmaddwd m4, [r3 - 13 * 16] ; [1] 
+ paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m2 + pmaddwd m2, [r3 - 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 6 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m5, [r3 - 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m7 + movhps [r0 + r1 + 8], m7 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + RET + +cglobal intra_pred_ang8_7, 3,5,8 + add r2, 32 + lea r3, [ang_table + 18 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 9 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 9 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 9 * 16] + 
paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 14 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m7 + movhps [r0 + r1], m7 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m4 + pmaddwd m4, [r3 - 5 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m2 + pmaddwd m2, [r3 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3 + 4 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 10 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m5, [r3 - 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m7 + punpcklwd m6, m7 + + punpckldq m7, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m7 + movhps [r0 + r1 + 8], m7 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + RET + +cglobal intra_pred_ang8_8, 3,6,7 + add r2, 32 + lea r3, [ang_table + 17 * 16] + add 
r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 12 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 7 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 2 * 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 + 3 * 16] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r5, [r0 + r1 * 4] + movh [r5], m6 + movhps [r5 + r1], m6 + movh [r5 + r1 * 2], m1 + movhps [r5 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r3 + 8 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 13 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + movh m1, [r2 + 18] ; [12 11 10 9] + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m5, m6 + pmaddwd m6, [r3 - 14 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m3, m1 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m5, [r3 - 9 * 16] ; [8] + paddd m5, [pd_16] 
+ psrld m5, 5 + pmaddwd m3, [r3 - 9 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + punpckhwd m3, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m3, m2 + punpckhdq m3, m2 + + movh [r0 + 8], m5 + movhps [r0 + r1 + 8], m5 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m3 + movhps [r0 + r4 + 8], m3 + RET + +cglobal intra_pred_ang8_9, 3,5,7 + add r2, 32 + lea r3, [ang_table + 9 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 7 * 16] ; [2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 7 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 5 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 3 * 16] ; [6] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 - 1 * 16] ; [8] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r3 + 1 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 1 * 16] + 
paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 3 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 5 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 + 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 + 7 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 7 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + RET + +cglobal intra_pred_ang8_10, 3,6,3 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1] + add r1, r1 + lea r3, [r1 * 3] + + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2] + movu [r0 + r1], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3] + movu [r0 + r1 * 2], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4] + movu [r0 + r3], m2 + + lea r5, [r0 + r1 *4] + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5] + movu [r5], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6] + movu [r5 + r1], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7] + movu [r5 + r1 * 2], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8] + movu [r5 + r3], m2 + + cmp r4m, byte 0 + jz .quit + + ; filter + + movh m1, [r2] ; [3 2 1 0] + pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] +.quit: + movu 
[r0], m0 + RET + +cglobal intra_pred_ang8_11, 3,5,7 + lea r3, [ang_table + 23 * 16] + add r1, r1 + + movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pinsrw m0, [r2], 0 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 7 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 7 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 5 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 3 * 16] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 + 1 * 16] ; [24] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m1 + movhps [r2 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r3 - 1 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 3 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 5 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 - 7 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 
5 + pmaddwd m0, [r3 - 7 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + RET + +cglobal intra_pred_ang8_12, 3,6,7 + lea r5, [ang_table + 16 * 16] + add r1, r1 + + movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pinsrw m0, [r2], 0 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 + 6 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r5 + 1 * 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r5 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r5 - 4 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r5 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + punpckhwd m1, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m1, m2 + punpckhdq m1, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r3, [r0 + r1 * 4] + movh [r3], m6 + movhps [r3 + r1], m6 + movh [r3 + r1 * 2], m1 + movhps [r3 + r4], m1 + + mova m4, m3 + pmaddwd m4, [r5 - 9 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 9 * 16] + 
paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_12] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r5 + 8 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 8 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + RET + +cglobal intra_pred_ang8_13, 3,6,8 + lea r5, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pinsrw m0, [r2], 0 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 9 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r5 - 9 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r5 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_13] + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 14 * 16] ; [28] + paddd m5, 
[pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m7 + movhps [r2 + r4], m7 + + mova m4, m3 + pmaddwd m4, [r5 + 5 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 4 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 4 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 13 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 + 10 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 10 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + RET + +cglobal intra_pred_ang8_14, 3,6,8 + lea r5, [ang_table + 18 * 16] + add r1, r1 + + movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pinsrw m0, [r2], 0 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 + 1 * 16] ; [19] + paddd 
m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r5 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_14] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 7 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + mova m5, m3 + pmaddwd m5, [r5 - 6 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r2, [r0 + r1 * 4] + movh [r2], m6 + movhps [r2 + r1], m6 + movh [r2 + r1 * 2], m7 + movhps [r2 + r4], m7 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 13 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 13 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r5 + 6 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 6 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + 
punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + RET + +cglobal intra_pred_ang8_15, 3,6,8 + lea r5, [ang_table + 20 * 16] + add r1, r1 + + movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pinsrw m0, [r2], 0 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 5 * 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_15] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 10 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 7 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 8 * 16] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 8 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r3, [r0 + r1 * 4] + movh [r3], m6 + movhps [r3 + r1], m6 + movh [r3 + r1 * 2], m7 + movhps [r3 + r4], m7 + + mova m4, m3 + pmaddwd m4, [r5 - 9 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 9 * 16] + 
paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 6 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 11 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r2 + 16], 0 + + pmaddwd m3, [r5 + 4 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 4 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + RET + +cglobal intra_pred_ang8_16, 3,6,8 + lea r5, [ang_table + 13 * 16] + add r1, r1 + + movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pinsrw m0, [r2], 0 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 2 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_16] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 + 9 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 + 9 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r5 - 12 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 - 12 * 16] + paddd m5, [pd_16] + psrld m5, 5 + 
packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 - 1 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 - 1 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r3, [r0 + r1 * 4] + movh [r3], m6 + movhps [r3 + r1], m6 + movh [r3 + r1 * 2], m7 + movhps [r3 + r4], m7 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 10 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r2 + 16], 0 + + pmaddwd m3, [r5 + 11 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r5 + 11 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m3 + punpcklwd m6, m3 + + punpckldq m3, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m5, m2 + punpckhdq m5, m2 + + movh [r0 + 8], m3 + movhps [r0 + r1 + 8], m3 + movh [r0 + r1 * 2 + 8], m4 + movhps [r0 + r4 + 8], m4 + lea r0, [r0 + r1 * 4] + movh [r0 + 8], m6 + movhps [r0 + r1 + 8], m6 + movh [r0 + r1 * 2 + 8], m5 + movhps [r0 + r4 + 8], m5 + RET + +cglobal intra_pred_ang8_17, 3,6,8 + lea r5, [ang_table + 17 * 
16] + add r1, r1 + + movu m0, [r2 + 32] ; [7 6 5 4 3 2 1 0] + pinsrw m0, [r2], 0 + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r5 - 11 * 16] ; [6] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 - 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r2] + pshufb m1, [pw_ang8_17] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r5 - 5 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r5 + 1 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r5 + 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r5 + 7 * 16] ; [24] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r5 + 7 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + punpckhwd m7, m4, m2 + punpcklwd m4, m2 + punpckhwd m2, m6, m5 + punpcklwd m6, m5 + + punpckldq m5, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + + lea r4, [r1 * 3] + movh [r0], m5 + movhps [r0 + r1], m5 + movh [r0 + r1 * 2], m4 + movhps [r0 + r4], m4 + lea r3, [r0 + r1 * 4] + movh [r3], m6 + movhps [r3 + r1], m6 + movh [r3 + r1 * 2], m7 + movhps [r3 + r4], m7 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r5 + 13 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r5 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r5 - 13 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r5 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + 
+    ; --- tail of intra_pred_ang8_17 (r5 = ang_table + 17*16): rows 4-7 of
+    ; the second 8x4 half.  Each sample = ((32-f)*a + f*b + 16) >> 5, with
+    ; the fraction f baked into the ang_table entry addressed off r5.
+    mova        m6, m3
+    pmaddwd     m6, [r5 - 7 * 16]          ; [10]  f=10 -> weight pair (22,10)
+    paddd       m6, [pd_16]                ; +16 rounding bias
+    psrld       m6, 5                      ; >> 5
+    mova        m5, m0
+    pmaddwd     m5, [r5 - 7 * 16]
+    paddd       m5, [pd_16]
+    psrld       m5, 5
+    packusdw    m6, m5                     ; saturate dwords back to 8 words
+
+    pslldq      m1, 2                      ; feed the next reference sample in
+    palignr     m0, m3, 12
+    palignr     m3, m1, 12
+
+    pmaddwd     m3, [r5 - 1 * 16]          ; [16]  mid-point weights (16,16)
+    paddd       m3, [pd_16]
+    psrld       m3, 5
+    pmaddwd     m0, [r5 - 1 * 16]
+    paddd       m0, [pd_16]
+    psrld       m0, 5
+    packusdw    m3, m0
+
+    ; interleave the four row vectors (word/dword unpack ladder) so the
+    ; horizontal-mode result can be stored row by row below
+    punpckhwd   m5, m4, m2
+    punpcklwd   m4, m2
+    punpckhwd   m2, m6, m3
+    punpcklwd   m6, m3
+
+    punpckldq   m3, m4, m6
+    punpckhdq   m4, m6
+    punpckldq   m6, m5, m2
+    punpckhdq   m5, m2
+
+    ; store the high half (byte offset +8 = columns 4-7) of each of 8 rows
+    movh        [r0 + 8], m3
+    movhps      [r0 + r1 + 8], m3
+    movh        [r0 + r1 * 2 + 8], m4
+    movhps      [r0 + r4 + 8], m4
+    lea         r0, [r0 + r1 * 4]
+    movh        [r0 + 8], m6
+    movhps      [r0 + r1 + 8], m6
+    movh        [r0 + r1 * 2 + 8], m5
+    movhps      [r0 + r4 + 8], m5
+    RET
+
+;-----------------------------------------------------------------------
+; intra_pred_ang8_18: 8x8 angular mode 18, 16-bit pixels.
+; r0 = dst, r1 = stride (in pixels on entry), r2 = reference array.
+; Mode 18 is the pure diagonal: no fractional interpolation is needed,
+; each output row is the previous row shifted one sample toward the
+; second reference row, so the whole block is built from palignr shifts.
+;-----------------------------------------------------------------------
+cglobal intra_pred_ang8_18, 3,4,3
+    add         r1, r1                     ; stride: pixels -> bytes (16-bit samples)
+    lea         r3, [r1 * 3]
+    movu        m1, [r2]                   ; first reference row
+    movu        m0, [r2 + 34]              ; second reference row (offset 34 bytes)
+    pshufb      m0, [pw_swap16]            ; reverse word order (per table name) -- TODO confirm table
+    movu        [r0], m1                   ; row 0 = reference row unshifted
+    palignr     m2, m1, m0, 14             ; each row shifts 2 more bytes (1 word) in
+    movu        [r0 + r1], m2
+    palignr     m2, m1, m0, 12
+    movu        [r0 + r1 * 2], m2
+    palignr     m2, m1, m0, 10
+    movu        [r0 + r3], m2
+    lea         r0, [r0 + r1 * 4]
+    palignr     m2, m1, m0, 8
+    movu        [r0], m2
+    palignr     m2, m1, m0, 6
+    movu        [r0 + r1], m2
+    palignr     m2, m1, m0, 4
+    movu        [r0 + r1 * 2], m2
+    palignr     m1, m0, 2
+    movu        [r0 + r3], m1
+    RET
+
+;-----------------------------------------------------------------------
+; intra_pred_ang8_19: 8x8 angular mode 19, 16-bit pixels.
+; Same per-row fractions as mode 17 ([6],[12],[18],[24],...) but the main
+; references come from [r2] instead of [r2+32], and rows are stored with
+; full-row movu (no transpose), matching the other vertical-side modes.
+;-----------------------------------------------------------------------
+cglobal intra_pred_ang8_19, 3,5,8
+    lea         r3, [ang_table + 17 * 16]  ; weight-table base for this mode
+    add         r1, r1                     ; stride: pixels -> bytes
+
+    movu        m0, [r2]                   ; [7 6 5 4 3 2 1 0]
+    movu        m1, [r2 + 2]               ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3, m0, m1                 ; [4 3 3 2 2 1 1 0]  adjacent-sample pairs
+    punpckhwd   m0, m1                     ; [8 7 7 6 6 5 5 4]
+
+    mova        m4, m3
+    pmaddwd     m4, [r3 - 11 * 16]         ; [6]  row 0: weights (26,6)
+    paddd       m4, [pd_16]
+    psrld       m4, 5
+    mova        m2, m0
+    pmaddwd     m2, [r3 - 11 * 16]
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+    packusdw    m4, m2
+
+    palignr     m0, m3, 12
+    movu        m1, [r2 + 32]              ; side reference row (projected samples)
+    pinsrw      m1, [r2], 0                ; corner sample into lane 0
+    pshufb      m1, [pw_ang8_17]           ; pre-order projected refs for this angle
+    palignr     m3, m1, 12
+
+    mova        m2, m3
+    pmaddwd     m2, [r3 - 5 * 16]          ; [12]
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+    mova        m5, m0
+    pmaddwd     m5, [r3 - 5 * 16]
+    paddd       m5, [pd_16]
+    psrld       m5, 5
+    packusdw
m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r3 + 1 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 + 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r3 + 7 * 16] ; [24] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r3 + 7 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r3 + 13 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 13 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r3 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r3 - 7 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r3 - 1 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 - 1 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + RET + +cglobal intra_pred_ang8_20, 3,5,8 + lea r3, [ang_table + 13 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 - 2 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + 
palignr m0, m3, 12 + movu m1, [r2 + 32] + pinsrw m1, [r2], 0 + pshufb m1, [pw_ang8_16] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r3 + 9 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r3 + 9 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r3 - 12 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 12 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r3 - 1 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r3 - 1 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r3 + 10 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r3 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r3] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r2 + 16 + 32], 0 + + pmaddwd m3, [r3 + 11 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 11 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + RET + +cglobal intra_pred_ang8_21, 3,5,8 + lea r3, [ang_table + 20 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 
3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 - 5 * 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + movu m1, [r2 + 32] + pinsrw m1, [r2], 0 + pshufb m1, [pw_ang8_15] + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r3 + 10 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r3 + 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r3 - 7 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r3 + 8 * 16] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r3 + 8 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r3 - 9 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m2, m3 + pmaddwd m2, [r3 + 6 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r3 + 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r3 - 11 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 11 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + pinsrw m3, [r2 + 16 + 32], 0 + + pmaddwd m3, [r3 + 4 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 4 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + RET + 
+cglobal intra_pred_ang8_22, 3,5,8 + lea r3, [ang_table + 18 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 1 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r2 + 32] + pinsrw m1, [r2], 0 + pshufb m1, [pw_ang8_14] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r3 + 7 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 + 7 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + mova m5, m3 + pmaddwd m5, [r3 - 6 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r3 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + mova m4, m3 + pmaddwd m4, [r3 + 13 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r3] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r3 - 13 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r3 + 6 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 6 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu 
[r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + RET + +cglobal intra_pred_ang8_23, 3,5,8 + lea r3, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 9 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 9 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + movu m1, [r2 + 32] + pinsrw m1, [r2], 0 + pshufb m1, [pw_ang8_13] + palignr m3, m1, 12 + + mova m5, m3 + pmaddwd m5, [r3 + 14 * 16] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + mova m7, m0 + pmaddwd m7, [r3 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m5, m7 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r3 + 5 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 4 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m5, m0 + pmaddwd m5, [r3 - 4 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m2, m5 + + mova m6, m3 + pmaddwd m6, [r3 - 13 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 - 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pslldq m1, 2 + palignr m0, m3, 12 + palignr m3, m1, 12 + + pmaddwd m3, [r3 + 10 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 10 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea 
r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + RET + +cglobal intra_pred_ang8_24, 3,5,7 + lea r3, [ang_table + 16 * 16] + add r1, r1 + + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 6 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 1 * 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 - 4 * 16] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r3 - 9 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + movu m1, [r2 + 32] + pinsrw m1, [r2], 0 + pshufb m1, [pw_ang8_12] + palignr m3, m1, 12 + + mova m6, m3 + pmaddwd m6, [r3 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 + 13 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 + 8 * 16] ; [24] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 8 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 
4]
+    movu        [r0], m4                   ; tail of intra_pred_ang8_24: store rows 4-7
+    movu        [r0 + r1], m2
+    movu        [r0 + r1 * 2], m6
+    movu        [r0 + r4], m3
+    RET
+
+;-----------------------------------------------------------------------
+; intra_pred_ang8_25: 8x8 angular mode 25, 16-bit pixels.
+; r0 = dst, r1 = stride (in pixels on entry), r2 = reference array.
+; Every row interpolates the SAME adjacent-sample pairs (no reference
+; shuffling): out = ((32-f)*ref[i] + f*ref[i+1] + 16) >> 5, with f
+; stepping 30,28,26,...,16 -- one ang_table entry per row off r3.
+;-----------------------------------------------------------------------
+cglobal intra_pred_ang8_25, 3,5,7
+    lea         r3, [ang_table + 23 * 16]  ; table base; +/- offsets below pick f
+    add         r1, r1                     ; stride: pixels -> bytes (16-bit samples)
+
+    movu        m0, [r2]                   ; [7 6 5 4 3 2 1 0]
+    movu        m1, [r2 + 2]               ; [8 7 6 5 4 3 2 1]
+
+    punpcklwd   m3, m0, m1                 ; [4 3 3 2 2 1 1 0]  adjacent-sample pairs
+    punpckhwd   m0, m1                     ; [8 7 7 6 6 5 5 4]
+
+    mova        m4, m3
+    pmaddwd     m4, [r3 + 7 * 16]          ; [30]  row 0: weights (2,30)
+    paddd       m4, [pd_16]                ; +16 rounding bias
+    psrld       m4, 5                      ; >> 5
+    mova        m2, m0
+    pmaddwd     m2, [r3 + 7 * 16]
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+    packusdw    m4, m2                     ; repack dwords to the 8 words of row 0
+
+    mova        m2, m3
+    pmaddwd     m2, [r3 + 5 * 16]          ; [28]  row 1
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+    mova        m1, m0
+    pmaddwd     m1, [r3 + 5 * 16]
+    paddd       m1, [pd_16]
+    psrld       m1, 5
+    packusdw    m2, m1
+
+    mova        m6, m3
+    pmaddwd     m6, [r3 + 3 * 16]          ; [26]  row 2
+    paddd       m6, [pd_16]
+    psrld       m6, 5
+    mova        m1, m0
+    pmaddwd     m1, [r3 + 3 * 16]
+    paddd       m1, [pd_16]
+    psrld       m1, 5
+    packusdw    m6, m1
+
+    mova        m5, m3
+    pmaddwd     m5, [r3 + 1 * 16]          ; [24]  row 3
+    paddd       m5, [pd_16]
+    psrld       m5, 5
+    mova        m1, m0
+    pmaddwd     m1, [r3 + 1 * 16]
+    paddd       m1, [pd_16]
+    psrld       m1, 5
+    packusdw    m5, m1
+
+    lea         r4, [r1 * 3]
+    movu        [r0], m4                   ; store rows 0-3
+    movu        [r0 + r1], m2
+    movu        [r0 + r1 * 2], m6
+    movu        [r0 + r4], m5
+
+    mova        m4, m3
+    pmaddwd     m4, [r3 - 1 * 16]          ; [22]  row 4
+    paddd       m4, [pd_16]
+    psrld       m4, 5
+    mova        m2, m0
+    pmaddwd     m2, [r3 - 1 * 16]
+    paddd       m2, [pd_16]
+    psrld        m2, 5
+    packusdw    m4, m2
+
+    mova        m2, m3
+    pmaddwd     m2, [r3 - 3 * 16]          ; [20]  row 5
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+    mova        m1, m0
+    pmaddwd     m1, [r3 - 3 * 16]
+    paddd       m1, [pd_16]
+    psrld       m1, 5
+    packusdw    m2, m1
+
+    mova        m6, m3
+    pmaddwd     m6, [r3 - 5 * 16]          ; [18]  row 6
+    paddd       m6, [pd_16]
+    psrld       m6, 5
+    mova        m5, m0
+    pmaddwd     m5, [r3 - 5 * 16]
+    paddd       m5, [pd_16]
+    psrld       m5, 5
+    packusdw    m6, m5
+
+    pmaddwd     m3, [r3 - 7 * 16]          ; [16]  row 7 (sources consumed in place)
+    paddd       m3, [pd_16]
+    psrld       m3, 5
+    pmaddwd     m0, [r3 - 7 * 16]
+    paddd       m0, [pd_16]
+    psrld       m0, 5
+    packusdw    m3, m0
+
+    lea         r0, [r0 + r1 * 4]
+    movu        [r0], m4                   ; store rows 4-7
+    movu        [r0 + r1], m2
+    movu        [r0 + r1 * 2], m6
+    movu        [r0 + r4], m3
+    RET
+
+;-----------------------------------------------------------------------
+; intra_pred_ang8_26: 8x8 angular mode 26 (pure vertical), 16-bit pixels.
+; Copies the reference row [r2+2] into all 8 output rows.  When the 4th
+; argument (r4m, the filter flag) is non-zero, the first column is
+; additionally filtered: col0[y] = ref[0] + ((side[y] - corner) >> 1),
+; clamped to [0, pw_pixel_max].
+;-----------------------------------------------------------------------
+cglobal intra_pred_ang8_26, 3,6,3
+
+    movu        m0, [r2 + 2]               ; [8 7 6 5 4 3 2 1]
+    add         r1, r1                     ; stride: pixels -> bytes
+    lea         r5, [r1 * 3]
+
+    movu        [r0], m0                   ; replicate the row 8 times
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 * 2], m0
+    movu        [r0 + r5], m0
+
+    lea         r3, [r0 + r1 *4]
+    movu        [r3], m0
+    movu        [r3 + r1], m0
+    movu        [r3 + r1 * 2], m0
+    movu        [r3 + r5], m0
+
+    cmp         r4m, byte 0                ; filter flag (4th arg, on stack)
+    jz          .quit
+
+    ; filter
+    pshufb      m0, [pb_01]                ; broadcast ref[0] to all 8 lanes
+    pinsrw      m1, [r2], 0                ; [3 2 1 0]  corner sample
+    pshufb      m2, m1, [pb_01]            ; [0 0 0 0 0 0 0 0]  broadcast corner
+    movu        m1, [r2 + 2 + 32]          ; [8 7 6 5 4 3 2 1]  side reference samples
+    psubw       m1, m2                     ; side[y] - corner
+    psraw       m1, 1                      ; arithmetic >> 1 (signed delta)
+    paddw       m0, m1
+    pxor        m1, m1
+    pmaxsw      m0, m1                     ; clamp low to 0
+    pminsw      m0, [pw_pixel_max]         ; clamp high to pixel max
+    pextrw      [r0], m0, 0                ; rewrite column 0 of each row
+    pextrw      [r0 + r1], m0, 1
+    pextrw      [r0 + r1 * 2], m0, 2
+    pextrw      [r0 + r5], m0, 3
+    pextrw      [r3], m0, 4
+    pextrw      [r3 + r1], m0, 5
+    pextrw      [r3 + r1 * 2], m0, 6
+    pextrw      [r3 + r5], m0, 7
+.quit:
+    RET
+
+;-----------------------------------------------------------------------
+; intra_pred_ang8_27: 8x8 angular mode 27, 16-bit pixels.
+; Like mode 25 but on the other side of vertical: the fraction f steps
+; UP by 2 per row (2,4,6,...,16), again reusing the same sample pairs.
+;-----------------------------------------------------------------------
+cglobal intra_pred_ang8_27, 3,5,7
+    lea         r3, [ang_table + 9 * 16]   ; weight-table base for this mode
+    add         r1, r1                     ; stride: pixels -> bytes
+
+    movu        m0, [r2 + 2]               ; [8 7 6 5 4 3 2 1]
+    movu        m1, [r2 + 4]               ; [9 8 7 6 5 4 3 2]
+
+    punpcklwd   m3, m0, m1                 ; [5 4 4 3 3 2 2 1]  adjacent-sample pairs
+    punpckhwd   m0, m1                     ; [9 8 8 7 7 6 6 5]
+
+    mova        m4, m3
+    pmaddwd     m4, [r3 - 7 * 16]          ; [2]  row 0: weights (30,2)
+    paddd       m4, [pd_16]
+    psrld       m4, 5
+    mova        m2, m0
+    pmaddwd     m2, [r3 - 7 * 16]
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+    packusdw    m4, m2
+
+    mova        m2, m3
+    pmaddwd     m2, [r3 - 5 * 16]          ; [4]  row 1
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+    mova        m1, m0
+    pmaddwd     m1, [r3 - 5 * 16]
+    paddd       m1, [pd_16]
+    psrld       m1, 5
+    packusdw    m2, m1
+
+    mova        m6, m3
+    pmaddwd     m6, [r3 - 3 * 16]          ; [6]  row 2
+    paddd       m6, [pd_16]
+    psrld       m6, 5
+    mova        m1, m0
+    pmaddwd     m1, [r3 - 3 * 16]
+    paddd       m1, [pd_16]
+    psrld       m1, 5
+    packusdw    m6, m1
+
+    mova        m5, m3
+    pmaddwd     m5, [r3 - 1 * 16]          ; [8]  row 3
+    paddd       m5, [pd_16]
+    psrld       m5, 5
+    mova        m1, m0
+    pmaddwd     m1, [r3 - 1 * 16]
+    paddd       m1, [pd_16]
+    psrld       m1, 5
+    packusdw    m5, m1
+
+    lea         r4, [r1 * 3]
+    movu        [r0], m4                   ; store rows 0-3
+    movu        [r0 + r1], m2
+    movu        [r0 + r1 * 2], m6
+    movu        [r0 + r4], m5
+
+    mova        m4, m3
+    pmaddwd     m4, [r3 + 1 * 16]          ; [10]  row 4
+    paddd       m4, [pd_16]
+    psrld       m4, 5
+    mova        m2, m0
+    pmaddwd     m2, [r3 + 1 * 16]
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+
packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 3 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 5 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + mova m5, m0 + pmaddwd m5, [r3 + 5 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m6, m5 + + pmaddwd m3, [r3 + 7 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 7 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m3 + RET + +cglobal intra_pred_ang8_28, 3,5,7 + lea r3, [ang_table + 17 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 12 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 7 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 2 * 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m5, m3 + pmaddwd m5, [r3 + 3 * 16] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + mova m1, m0 + pmaddwd m1, [r3 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + + mova m4, m3 + pmaddwd m4, [r3 + 8 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 13 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd 
m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + movh m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m5, m6 + pmaddwd m6, [r3 - 14 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m3, m1 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m5, [r3 - 9 * 16] ; [8] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, [r3 - 9 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m5 + RET + +cglobal intra_pred_ang8_29, 3,5,8 + lea r3, [ang_table + 18 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 9 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 9 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 14 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m4 + pmaddwd m4, [r3 - 5 * 16] ; 
[13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m2 + pmaddwd m2, [r3 - 5 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3 + 4 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 10 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m5, [r3 - 10 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + RET + +cglobal intra_pred_ang8_30, 3,5,8 + lea r3, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 1 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 12 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m7, m6 + pmaddwd m6, [r3 - 7 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m7, [r3 + 6 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 + 6 * 16] + 
paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m6, m4 + pmaddwd m4, [r3 - 13 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m2 + pmaddwd m2, [r3 - 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + pmaddwd m2, m6, [r3] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, m7, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 6 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m5, [r3 - 6 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m7, m5 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + RET + +cglobal intra_pred_ang8_31, 3,5,8 + lea r3, [ang_table + 13 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 4 * 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 4 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 6 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd 
+; NOTE(review): the lines immediately below are the tail of the preceding
+; intra_pred_ang8_3x kernel (its entry is outside this hunk). Common pattern
+; throughout this file: each predicted row is pmaddwd of interleaved neighbour
+; pairs with (32-frac, frac) weights from ang_table, biased by +16 (pd_16),
+; shifted right by 5, then packusdw folds the two dword halves back to words.
m7, [r3 + 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 9 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m4, [r3 + 8 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m2, [r3 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m6, m2 + pmaddwd m2, [r3 - 7 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m1 + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 10 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 10 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m0 + pmaddwd m7, [r3 - 5 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m5 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + RET + 
+; intra_pred_ang8_32: 8x8 angular intra prediction, mode 32, 16-bit samples.
+; r0 = dst, r1 = stride in pixels (doubled to a byte stride by 'add r1, r1'),
+; r2 = reference sample row. r3 points into ang_table pre-biased to entry 19,
+; so [r3 + k * 16] selects the (32-frac, frac) weight row for each output row.
+cglobal intra_pred_ang8_32, 3,5,8 + lea r3, [ang_table + 19 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 2 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + 
 mova m6, m2 + pmaddwd m2, [r3 - 9 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 12 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 12 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 + 1 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m2, m4 + pmaddwd m4, [r3 - 10 * 16] ; [ 9] + paddd m4, [pd_16] + psrld m4, 5 + palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m6, m3 + pmaddwd m3, [r3 - 10 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m4, m3 + + pmaddwd m2, [r3 + 11 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m6, [r3 + 11 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m0 + pmaddwd m6, [r3] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m5 + pmaddwd m7, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + movh m1, [r2 + 26] ; [16 15 14 13] + palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 11 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m1, [r3 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r0, [r0 + r1 * 4] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + RET + 
+; intra_pred_ang8_33: 8x8 angular intra prediction, mode 33, 16-bit samples.
+; Same argument/register layout as intra_pred_ang8_32; ang_table bias is 14.
+cglobal intra_pred_ang8_33, 3,5,8 + lea r3, [ang_table + 14 * 16] + add r1, r1 + + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 
 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13] + + mova m4, m3 + pmaddwd m4, [r3 + 12 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m2, [r3 + 6 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 6 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m6, [r3] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 6 * 16] ; [ 8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m3, [r3 - 6 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + lea r4, [r1 * 3] + movu [r0], m4 + movu [r0 + r1], m2 + movu [r0 + r1 * 2], m6 + movu [r0 + r4], m7 + + mova m4, m0 + pmaddwd m4, [r3 - 12 * 16] ; [ 2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m5 + pmaddwd m2, [r3 - 12 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m0 + pmaddwd m2, [r3 + 14 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m5 + pmaddwd m6, [r3 + 14 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 8 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m7, [r3 + 8 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 + 2 * 16] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m1, [r3 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 
5 + packusdw m7, m3 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 0 + + mova m4, m0 + pmaddwd m4, [r3 - 14 * 16] ; [ 2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m5 + pmaddwd m2, [r3 - 14 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m0 + pmaddwd m2, [r3 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m5 + pmaddwd m6, [r3 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 6 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m7, [r3 + 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 + + movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] + psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] + + punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] + punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17] + + palignr m4, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m4, [r3 - 6 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12] + pmaddwd m2, [r3 - 6 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m5 + pmaddwd m2, [r3 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m3 + pmaddwd m6, [r3 - 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m5 + pmaddwd m6, [r3 + 14 * 16] ; [30] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m3 + pmaddwd m7, [r3 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m7, [r3 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + palignr m0, m1, m3, 4 ; [18 17 17 16 16 15 15 14] + pmaddwd m0, [r3 + 8 * 16] + 
 paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m0, 16 + + palignr m4, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m4, [r3 + 2 * 16] ; [18] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m1, m3, 8 ; [19 18 18 17 17 16 16 15] + pmaddwd m2, [r3 + 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12] + pmaddwd m2, [r3 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m1, m3, 12 ; [20 19 19 18 18 17 17 16] + pmaddwd m6, [r3 - 4 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pinsrw m1, [r2 + 42], 7 + pmaddwd m3, [r3 - 10 * 16] ; [6] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m3, m1 + + movu m7, [r2 + 28] + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m3, m7, m0, 24 + + ret + 
+; ang16_mode_4_32: shared 16x16 kernel for angular modes 4 and 32 (16-bit).
+; r2 = reference samples, r3 = biased ang_table pointer set up by the caller,
+; r6d = mode-pair selector consumed by TRANSPOSE_STORE (see macro header);
+; the 'test r6d, r6d' below sets ZF once — only SSE ops, mov and lea follow
+; before each TRANSPOSE_STORE jnz, none of which write EFLAGS.
+cglobal ang16_mode_4_32 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 3 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 3 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 8 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 13 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 + 2 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, 
 m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m2, m4 + pmaddwd m4, [r3 - 9 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + palignr m7, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m6, m7 + pmaddwd m7, [r3 - 9 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + pmaddwd m2, [r3 + 12 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m6, [r3 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m0 + pmaddwd m6, [r3 + 1 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m5 + pmaddwd m7, [r3 + 1 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] + + palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 10 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m1, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m3, [r3 - 10 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 + + psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] + + punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] + punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17] + + palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m4, [r3 + 11 * 16] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + pmaddwd m2, [r3 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m2, [r3] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m6, [r3] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + palignr m6, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m6 + pmaddwd m6, [r3 - 11 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m0, m3, m5, 12 ; [16 15 15 14 14 13 13 12] + pmaddwd m0, [r3 - 11 * 16] + paddd m0, 
 [pd_16] + psrld m0, 5 + packusdw m6, m0 + + pmaddwd m7, [r3 + 10 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + palignr m0, m3, m5, 12 ; [16 15 15 14 14 13 13 12] + pmaddwd m0, [r3 + 10 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m0, 16 + + mova m4, m5 + pmaddwd m4, [r3 - 1 * 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m3 + pmaddwd m2, [r3 - 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + mova m7, m2 + pmaddwd m2, [r3 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m1, m3, 4 ; [18 17 17 16 16 15 15 14] + mova m0, m6 + pmaddwd m6, [r3 - 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pmaddwd m7, [r3 + 9 * 16] ; [27] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m0, [r3 + 9 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + palignr m0, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m0, [r3 - 2 * 16] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + palignr m1, m3, 8 ; [19 18 18 17 17 16 16 15] + pmaddwd m1, [r3 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m0, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m0, m3, 24 + + ret + 
+; ang16_mode_5_31: shared 16x16 kernel for angular modes 5 and 31 (16-bit).
+; Same register protocol as ang16_mode_4_32 above.
+cglobal ang16_mode_5_31 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 + 1 * 16] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m2 + pmaddwd m2, [r3 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 
14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 3 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 3 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 12 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m4, [r3 + 5 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 + 5 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m6, m2 + pmaddwd m2, [r3 - 10 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m1 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 7 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 7 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m0 + pmaddwd m7, [r3 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m3, m5 + pmaddwd m3, [r3 - 8 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m3, 8 + + movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13] + psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14] + + punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13] + + mova m4, m0 + pmaddwd m4, [r3 + 9 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m5 + pmaddwd m2, [r3 + 9 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m6, m2 + pmaddwd m2, [r3 - 6 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + mova m1, m7 + pmaddwd m7, [r3 - 6 * 16] + paddd m7, [pd_16] 
 + psrld m7, 5 + packusdw m2, m7 + + pmaddwd m6, [r3 + 11 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m1, [r3 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m7, [r3 - 4 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m1, [r3 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + palignr m4, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m4, [r3 + 13 * 16] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m3, m5, 8 ; [15 14 14 13 13 12 12 11] + pmaddwd m2, [r3 + 13 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m7, m2 + pmaddwd m2, [r3 - 2 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + palignr m6, m3, m5, 12 ; [16 15 15 14 14 13 13 12] + mova m0, m6 + pmaddwd m6, [r3 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pmaddwd m7, [r3 + 15 * 16] ; [31] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m0, [r3 + 15 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + pmaddwd m5, [r3] ; [16] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, [r3] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m5, m3, 24 + + ret + 
+; ang16_mode_6_30: shared 16x16 kernel for angular modes 6 and 30 (16-bit).
+; Same register protocol as ang16_mode_4_32 above.
+cglobal ang16_mode_6_30 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 2 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 11 * 16] ; [26] + paddd m2, 
 [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m7, m6 + pmaddwd m6, [r3 - 8 * 16] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m7, [r3 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m6, m4 + pmaddwd m4, [r3 - 14 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m1 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m6 + pmaddwd m2, [r3 - 1 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m7 + pmaddwd m1, [r3 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 12 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 12 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 7 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m4, [r3 + 6 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m2, [r3 + 6 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m0 + pmaddwd m2, [r3 - 13 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m7, m5 + pmaddwd m7, [r3 - 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m2, m7 + + mova m6, m0 + pmaddwd m6, [r3] 
 ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m5 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m0 + pmaddwd m7, [r3 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m5 + pmaddwd m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + movh m3, [r2 + 26] ; [16 15 14 13] + + palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m2, m4 + pmaddwd m4, [r3 - 6 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m3, m5, 4 ; [14 13 13 12 12 11 11 10] + mova m6, m1 + pmaddwd m1, [r3 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m2, [r3 + 7 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m6 + pmaddwd m1, [r3 + 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + psrldq m3, 2 + palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m5, m7 + pmaddwd m7, [r3 - 12 * 16] ; [3] + paddd m7, [pd_16] + psrld m7, 5 + palignr m3, m6, 4 ; [15 14 14 13 13 12 12 11] + mova m1, m3 + pmaddwd m3, [r3 - 12 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m5, [r3 + 1 * 16] ; [16] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m5, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m5, m3, 24 + + ret + 
+; ang16_mode_7_29: shared 16x16 kernel for angular modes 7 and 29 (16-bit).
+; Same register protocol as ang16_mode_4_32 above.
+cglobal ang16_mode_7_29 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 8 * 16] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 1 * 16] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova 
m1, m0 + pmaddwd m1, [r3 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 10 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 13 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m6, m4 + pmaddwd m4, [r3 - 4 * 16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m7, m1 + pmaddwd m1, [r3 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m6 + pmaddwd m2, [r3 + 5 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m7 + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m6, [r3 + 14 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 - 9 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + pmaddwd m1, [r3 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m2, m4 + pmaddwd m4, [r3] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m7, m1 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m2, [r3 + 9 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m7, [r3 + 9 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m2, m7 + + palignr m6, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m6, [r3 - 14 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + 
 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 5 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4] + mova m2, m4 + pmaddwd m4, [r3 + 4 * 16] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8] + mova m3, m1 + pmaddwd m1, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m2, [r3 + 13 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m3, [r3 + 13 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m2, m3 + + mova m7, m0 + pmaddwd m7, [r3 - 10 * 16] ; [7] + paddd m7, [pd_16] + psrld m7, 5 + mova m3, m5 + pmaddwd m3, [r3 - 10 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m0, [r3 - 1 * 16] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + pmaddwd m5, [r3 - 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m0, m5 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m0, m3, 24 + + ret + 
+; ang16_mode_8_28: shared 16x16 kernel for angular modes 8 and 28 (16-bit).
+; Same register protocol as ang16_mode_4_32 above.
+cglobal ang16_mode_8_28 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9] + palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2] + psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10] + + punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5] + punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9] + + mova m4, m3 + pmaddwd m4, [r3 - 10 * 16] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 10 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 5 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + 
 mova m6, m3 + pmaddwd m6, [r3] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r3 + 10 * 16] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 + 15 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 15 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m6, [r3 - 12 * 16] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 12 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + pmaddwd m7, [r3 - 7 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2] + mova m7, m4 + pmaddwd m4, [r3 - 2 *16] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6] + mova m1, m6 + pmaddwd m6, [r3 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m7 + pmaddwd m2, [r3 + 3 * 16] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m1 + pmaddwd m6, [r3 + 3 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m7 + pmaddwd m6, [r3 + 8 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m1, [r3 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m7, [r3 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, 
 m5, m0, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, [r3 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + palignr m1, m0, m3, 8 ; [7 6 6 5 5 4 4 3] + mova m4, m1 + pmaddwd m4, [r3 - 14 * 16] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7] + mova m0, m5 + pmaddwd m0, [r3 - 14 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m4, m0 + + mova m2, m1 + pmaddwd m2, [r3 - 9 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m3, m5 + pmaddwd m3, [r3 - 9 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m2, m3 + + mova m7, m1 + pmaddwd m7, [r3 - 4 * 16] ; [11] + paddd m7, [pd_16] + psrld m7, 5 + mova m3, m5 + pmaddwd m3, [r3 - 4 * 16] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m1, [r3 + 1 * 16] ; [16] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m5, [r3 + 1 * 16] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m1, m5 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m1, m3, 24 + + ret + 
+; ang16_mode_9_27: shared 16x16 kernel for angular modes 9 and 27 (16-bit).
+; Fraction step is 2/32 per row, so only one neighbour pair load is needed;
+; same register protocol as ang16_mode_4_32 above.
+cglobal ang16_mode_9_27 + test r6d, r6d + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5] + + mova m4, m3 + pmaddwd m4, [r3 - 14 * 16] ; [2] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 - 14 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 10 *16] ; [6] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, 
m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r3 - 6 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 - 2 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r3 - 2 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m3 + pmaddwd m7, [r3] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r3 + 2 *16] ; [18] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r3 + 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r3 + 4 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r3 + 4 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r3 + 6 * 16] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r3 + 10 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pmaddwd m3, [r3 + 14 * 16] ; [30] + paddd m3, [pd_16] + psrld m3, 5 + 
 pmaddwd m0, [r3 + 14 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + movu m7, [r2 + 4] + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m3, m7, m1, 24 + + ret + 
+; ang16_mode_11_25: shared 16x16 kernel for angular modes 11 and 25 (16-bit).
+; Fraction decreases 2/32 per row starting at 30; last row (weight 32) is a
+; plain copy of [r2]. Same register protocol as ang16_mode_4_32 above.
+cglobal ang16_mode_11_25 + test r6d, r6d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r3 + 14 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r3 + 14 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r3 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 10 *16] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r3 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 + 4 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r3 + 2 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r3 + 2 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m3 + pmaddwd m7, [r3] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r3 - 2 *16] ; [14] + paddd m4, [pd_16] + 
 psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r3 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r3 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r3 - 4 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r3 - 6 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r3 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r3 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r3 - 10 * 16] ; [6] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r3 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r3 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r3 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r3 - 14 * 16] ; [2] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r3 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + movu m3, [r2] + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + 
+; ang16_mode_12_24: shared 16x16 kernel for angular modes 12 and 24 (16-bit).
+; NOTE(review): register roles are swapped relative to the helpers above —
+; the mode selector is tested in r3d and the biased ang_table pointer is r6.
+; m5 is read below ('palignr m3, m5, 12') without being written in this
+; function first; presumably the caller preloads it with wrapped reference
+; samples — TODO confirm against the dispatch code (outside this hunk).
+cglobal ang16_mode_12_24 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 + 11 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r6 + 6 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 + 1 *16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + 
 mova m1, m0 + pmaddwd m1, [r6 + 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 4 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r6 - 9 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 14 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 13 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 + 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + mova m7, m3 + pmaddwd m7, [r6 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 + 3 *16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 + 3 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r6 - 2 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 - 2 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 7 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 12 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 
12 + + mova m4, m3 + pmaddwd m4, [r6 + 15 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 15 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 + 10 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r6 + 5 * 16] ; [21] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal ang16_mode_13_23 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 + 8 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 + 8 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r6 - 1 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 10 *16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r6 + 4 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 5 * 16] 
; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 14 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 14 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 9 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + mova m2, m3 + pmaddwd m2, [r6 - 9 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 - 9 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 14 * 16] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r6 - 4 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 13 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 10 * 16] ; [25] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 10 * 16] + 
paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6 + 1 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 + 1 *16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal ang16_mode_14_22 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 + 1 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 + 1 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + mova m2, m3 + pmaddwd m2, [r6 - 12 * 16] ; [6] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 7 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 7 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 6 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 13 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 13 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 13 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 6 * 16] ; [24] + paddd 
m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 6 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 - 7 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 - 7 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 12 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 1 * 16] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 14 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 5 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 8 * 16] ; [10] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 11 * 16] ; [29] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6 - 2 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 - 2 *16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal ang16_mode_15_21 + test r3d, r3d + movu m0, [r2] ; [7 6 5 4 3 2 1 0] 
+ movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + palignr m6, m0, m5, 2 + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + palignr m3, m6, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 15 * 16] ; [30] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 15 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 2 * 16] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 13 * 16] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + mova m4, m3 + pmaddwd m4, [r6 - 4 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 11 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 6 * 16] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 9 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 - 8 * 16] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + 
pmaddwd m6, [r6 - 8 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 7 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 7 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 10 * 16] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 5 * 16] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 5 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + mova m4, m3 + pmaddwd m4, [r6 - 12 * 16] ; [3] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 3 * 16] ; [18] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 3 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 14 * 16] ; [1] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + pmaddwd m3, [r6 + 1 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 + 1 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal ang16_mode_16_20 + test r4d, r4d + lea r4, [r1 * 3] + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + palignr m6, m0, m5, 2 + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 - 2 * 16] ; [11] + paddd m4, 
[pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 - 2 * 16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + palignr m3, m6, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 9 * 16] ; [22] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m6, m3 + pmaddwd m6, [r6 - 12 * 16] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 - 1 * 16] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 1 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 10 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 11 * 16] ; [2] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 11 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 11 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + mova m4, m3 + pmaddwd m4, [r6 - 10 * 16] ; [3] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 - 10 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, 
m3 + pmaddwd m2, [r6 + 1 * 16] ; [14] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 1 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 12 * 16] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 9 * 16] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 9 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 2 * 16] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m5, [r3] + pshufb m5, [pw_ang8_16] + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 13 * 16] ; [26] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 13 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + mova m7, m3 + pmaddwd m7, [r6 - 8 * 16] ; [5] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + pmaddwd m3, [r6 + 3 * 16] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 + 3 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +cglobal ang16_mode_17_19 + test r4d, r4d + lea r4, [r1 * 3] + movu m0, [r2] ; [7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + + palignr m6, m0, m5, 2 + + punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4] + + mova m4, m3 + pmaddwd m4, [r6 - 10 * 16] ; [6] + paddd m4, [pd_16] + psrld m4, 5 + mova m2, m0 + pmaddwd m2, [r6 - 10 * 
16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + palignr m0, m3, 12 + palignr m3, m6, 12 + + mova m2, m3 + pmaddwd m2, [r6 - 4 * 16] ; [12] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 + 2 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 + 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 8 * 16] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + mov r5, r0 + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 0 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 14 * 16] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 + 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + mova m2, m3 + pmaddwd m2, [r6 - 12 * 16] ; [4] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 - 12 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m6, m3 + pmaddwd m6, [r6 - 6 * 16] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + mova m7, m0 + pmaddwd m7, [r6 - 6 * 16] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6] ; [16] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r0 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 8 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + mova m6, m0 + pmaddwd m6, [r6 + 6 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m4, m6 + + pslldq m5, 2 + 
palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 12 * 16] ; [28] + paddd m2, [pd_16] + psrld m2, 5 + mova m6, m0 + pmaddwd m6, [r6 + 12 * 16] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m2, m6 + + mova m6, m3 + pmaddwd m6, [r6 - 14 * 16] ; [2] + paddd m6, [pd_16] + psrld m6, 5 + mova m1, m0 + pmaddwd m1, [r6 - 14 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m5, [r3] + pshufb m5, [pw_ang8_17] + + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 - 8 * 16] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 - 8 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m6, m7, m1, 16 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m4, m3 + pmaddwd m4, [r6 - 2 * 16] ; [14] + paddd m4, [pd_16] + psrld m4, 5 + mova m1, m0 + pmaddwd m1, [r6 - 2 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m2, m3 + pmaddwd m2, [r6 + 4 * 16] ; [20] + paddd m2, [pd_16] + psrld m2, 5 + mova m1, m0 + pmaddwd m1, [r6 + 4 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + + pslldq m5, 2 + palignr m0, m3, 12 + palignr m3, m5, 12 + + mova m7, m3 + pmaddwd m7, [r6 + 10 * 16] ; [26] + paddd m7, [pd_16] + psrld m7, 5 + mova m1, m0 + pmaddwd m1, [r6 + 10 * 16] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m3, [r6 - 16 * 16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r6 - 16 * 16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + lea r5, [r5 + r1 * 4] + + TRANSPOSE_STORE m4, m2, m7, m3, m1, 24 + + ret + +;------------------------------------------------------------------------------------------ +; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------ +INIT_XMM ssse3 +cglobal 
intra_pred_ang16_2, 3,5,5 + lea r4, [r2] + add r2, 64 + cmp r3m, byte 34 + cmove r2, r4 + add r1, r1 + lea r3, [r1 * 3] + movu m0, [r2 + 4] + movu m1, [r2 + 20] + movu m2, [r2 + 36] + + movu [r0], m0 + movu [r0 + 16], m1 + palignr m3, m1, m0, 2 + palignr m4, m2, m1, 2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m1, m0, 4 + palignr m4, m2, m1, 4 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 * 2 + 16], m4 + palignr m3, m1, m0, 6 + palignr m4, m2, m1, 6 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r1 * 4] + palignr m3, m1, m0, 8 + palignr m4, m2, m1, 8 + movu [r0], m3 + movu [r0 + 16], m4 + palignr m3, m1, m0, 10 + palignr m4, m2, m1, 10 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m1, m0, 12 + palignr m4, m2, m1, 12 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 * 2 + 16], m4 + palignr m3, m1, m0, 14 + palignr m4, m2, m1, 14 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + + movu m0, [r2 + 52] + lea r0, [r0 + r1 * 4] + movu [r0], m1 + movu [r0 + 16], m2 + palignr m3, m2, m1, 2 + palignr m4, m0, m2, 2 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m2, m1, 4 + palignr m4, m0, m2, 4 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 * 2 + 16], m4 + palignr m3, m2, m1, 6 + palignr m4, m0, m2, 6 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + + lea r0, [r0 + r1 * 4] + palignr m3, m2, m1, 8 + palignr m4, m0, m2, 8 + movu [r0], m3 + movu [r0 + 16], m4 + palignr m3, m2, m1, 10 + palignr m4, m0, m2, 10 + movu [r0 + r1], m3 + movu [r0 + r1 + 16], m4 + palignr m3, m2, m1, 12 + palignr m4, m0, m2, 12 + movu [r0 + r1 * 2], m3 + movu [r0 + r1 * 2 + 16], m4 + palignr m3, m2, m1, 14 + palignr m4, m0, m2, 14 + movu [r0 + r3], m3 + movu [r0 + r3 + 16], m4 + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_3, 3,7,8 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_3_33 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_3_33 + RET + +cglobal intra_pred_ang16_33, 
3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_3_33 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_3_33 + RET + +cglobal intra_pred_ang16_4, 3,7,8 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 18 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_4_32 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_4_32 + RET + +cglobal intra_pred_ang16_32, 3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 18 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_4_32 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_4_32 + RET + +cglobal intra_pred_ang16_5, 3,7,8 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_5_31 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_5_31 + RET + +cglobal intra_pred_ang16_31, 3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_5_31 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_5_31 + RET + +cglobal intra_pred_ang16_6, 3,7,8 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_6_30 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_6_30 + RET + +cglobal intra_pred_ang16_30, 3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_6_30 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_6_30 + RET + +cglobal intra_pred_ang16_7, 3,7,8 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 17 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_7_29 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_7_29 + RET + +cglobal intra_pred_ang16_29, 3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 17 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_7_29 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call 
ang16_mode_7_29 + RET + +cglobal intra_pred_ang16_8, 3,7,8 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_8_28 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_8_28 + RET + +cglobal intra_pred_ang16_28, 3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 15 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_8_28 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_8_28 + RET + +cglobal intra_pred_ang16_9, 3,7,8 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_9_27 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_9_27 + RET + +cglobal intra_pred_ang16_27, 3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_9_27 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_9_27 + RET + +cglobal intra_pred_ang16_11, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r2, 64 + xor r6d, r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_11_25 + + lea r2, [r2 + 16] + lea r0, [r0 + r1 * 8] + + call ang16_mode_11_25 + + mov r6d, [rsp] + mov [r2 - 16], r6w + RET + +cglobal intra_pred_ang16_25, 3,7,8 + xor r6d, r6d + inc r6d + lea r3, [ang_table + 16 * 16] + add r1, r1 + lea r4, [r1 * 3] + + call ang16_mode_11_25 + + lea r2, [r2 + 16] + lea r0, [r0 + 16] + + call ang16_mode_11_25 + RET + +cglobal intra_pred_ang16_12, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 16 * 16] + movu m5, [r2] + pshufb m5, [pw_ang8_12] + pinsrw m5, [r2 + 26], 5 + xor r3d, r3d + add r2, 64 + + call ang16_mode_12_24 + + lea r0, [r0 + r1 * 8] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_12_24 + + mov r6d, [rsp] + mov [r2 - 16], r6w + RET + +cglobal 
intra_pred_ang16_24, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 16 * 16] + movu m5, [r2 + 64] + pshufb m5, [pw_ang8_12] + pinsrw m5, [r2 + 26 + 64], 5 + xor r3d, r3d + inc r3d + + call ang16_mode_12_24 + + lea r0, [r0 + 16] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_12_24 + + mov r6d, [rsp] + mov [r2 + 48], r6w + RET + +cglobal intra_pred_ang16_13, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu m5, [r2] + pshufb m5, [pw_ang16_13] + movu m6, [r2 + 14] + pshufb m6, [pw_ang8_13] + pslldq m6, 2 + palignr m5, m6, 6 + xor r3d, r3d + add r2, 64 + + call ang16_mode_13_23 + + lea r0, [r0 + r1 * 8] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_13_23 + + mov r6d, [rsp] + mov [r2 - 16], r6w + RET + +cglobal intra_pred_ang16_23, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu m5, [r2 + 64] + pshufb m5, [pw_ang16_13] + movu m6, [r2 + 14 + 64] + pshufb m6, [pw_ang8_13] + pslldq m6, 2 + palignr m5, m6, 6 + xor r3d, r3d + inc r3d + + call ang16_mode_13_23 + + lea r0, [r0 + 16] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_13_23 + + mov r6d, [rsp] + mov [r2 + 48], r6w + RET + +cglobal intra_pred_ang16_14, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 18 * 16] + movu m6, [r2] + pshufb m6, [pw_ang8_14] + movu m5, [r2 + 20] + pshufb m5, [pw_ang8_14] + punpckhqdq m5, m6 + xor r3d, r3d + add r2, 64 + + call ang16_mode_14_22 + + lea r0, [r0 + r1 * 8] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_14_22 + + mov r6d, [rsp] + mov [r2 - 16], r6w + RET + +cglobal 
intra_pred_ang16_22, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 18 * 16] + movu m6, [r2 + 64] + pshufb m6, [pw_ang8_14] + movu m5, [r2 + 20 + 64] + pshufb m5, [pw_ang8_14] + punpckhqdq m5, m6 + xor r3d, r3d + inc r3d + + call ang16_mode_14_22 + + lea r0, [r0 + 16] + movu m5, [r2 + 2] + lea r2, [r2 + 16] + + call ang16_mode_14_22 + + mov r6d, [rsp] + mov [r2 + 48], r6w + RET + +cglobal intra_pred_ang16_15, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu m6, [r2 + 4] + pshufb m6, [pw_ang8_15] + movu m5, [r2 + 18] + pshufb m5, [pw_ang8_15] + punpckhqdq m5, m6 + xor r3d, r3d + add r2, 64 + + call ang16_mode_15_21 + + lea r0, [r0 + r1 * 8] + movu m5, [r2] + lea r2, [r2 + 16] + + call ang16_mode_15_21 + + mov r6d, [rsp] + mov [r2 - 16], r6w + RET + +cglobal intra_pred_ang16_21, 3,7,8, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r4, [r1 * 3] + lea r6, [ang_table + 15 * 16] + movu m6, [r2 + 4 + 64] + pshufb m6, [pw_ang8_15] + movu m5, [r2 + 18 + 64] + pshufb m5, [pw_ang8_15] + punpckhqdq m5, m6 + xor r3d, r3d + inc r3d + + call ang16_mode_15_21 + + lea r0, [r0 + 16] + movu m5, [r2] + lea r2, [r2 + 16] + + call ang16_mode_15_21 + + mov r6d, [rsp] + mov [r2 + 48], r6w + RET + +cglobal intra_pred_ang16_16, 3,7,8,0-(1*mmsize+4) + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp + 16], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r6, [ang_table + 13 * 16] + movu m6, [r2 + 4] + pshufb m6, [pw_ang16_16] + movu m5, [r2 + 16] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r2 + lea r3, [r2 + 24] + add r2, 64 + xor r4, r4 + + call ang16_mode_16_20 + + lea r0, [r0 + r1 * 8] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + + call 
ang16_mode_16_20 + + mov r6d, [rsp + 16] + mov [r2 - 16], r6w + RET + +cglobal intra_pred_ang16_20, 3,7,8,0-(1*mmsize+4) + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp + 16], r5w + mov [r2 + 64], r6w + + lea r3, [r2 + 64] + add r1, r1 + lea r6, [ang_table + 13 * 16] + movu m6, [r3 + 4] + pshufb m6, [pw_ang16_16] + movu m5, [r3 + 16] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r3 + lea r3, [r3 + 24] + xor r4, r4 + inc r4 + + call ang16_mode_16_20 + + lea r0, [r0 + 16] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + inc r4 + + call ang16_mode_16_20 + mov r6d, [rsp + 16] + mov [r3], r6w + RET + +cglobal intra_pred_ang16_17, 3,7,8,0-(1*mmsize+4) + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp + 16], r5w + mov [r2 + 64], r6w + + add r1, r1 + lea r6, [ang_table + 16 * 16] + movu m6, [r2 + 2] + pshufb m6, [pw_ang16_16] + movu m5, [r2 + 12] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r2 + lea r3, [r2 + 20] + add r2, 64 + xor r4, r4 + + call ang16_mode_17_19 + + lea r0, [r0 + r1 * 8] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + + call ang16_mode_17_19 + + mov r6d, [rsp + 16] + mov [r2 - 16], r6w + RET + +cglobal intra_pred_ang16_19, 3,7,8,0-(1*mmsize+4) + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp + 16], r5w + mov [r2 + 64], r6w + + lea r3, [r2 + 64] + add r1, r1 + lea r6, [ang_table + 16 * 16] + movu m6, [r3 + 2] + pshufb m6, [pw_ang16_16] + movu m5, [r3 + 12] + pshufb m5, [pw_ang16_16] + punpckhqdq m5, m6 + mov [rsp], r3 + lea r3, [r3 + 20] + xor r4, r4 + inc r4 + + call ang16_mode_17_19 + + lea r0, [r0 + 16] + mov r3, [rsp] + movu m5, [r2] + lea r2, [r2 + 16] + xor r4, r4 + inc r4 + + call ang16_mode_17_19 + + mov r6d, [rsp + 16] + mov [r3], r6w + RET + +cglobal intra_pred_ang16_18, 3,5,4 + add r1, r1 + lea r4, [r1 * 3] + movu m1, [r2] + movu m3, [r2 + 16] + movu m0, [r2 + 2 + 64] + pshufb m0, [pw_swap16] + movu [r0], m1 + movu [r0 + 16], m3 + palignr 
m2, m1, m0, 14 + movu [r0 + r1], m2 + palignr m2, m3, m1, 14 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 12 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 10 + movu [r0 + r4], m2 + palignr m2, m3, m1, 10 + movu [r0 + r4 + 16], m2 + + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m3, m1, 8 + movu [r0 + 16], m2 + palignr m2, m1, m0, 6 + movu [r0 + r1], m2 + palignr m2, m3, m1, 6 + movu [r0 + r1 + 16], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m3, m1, 4 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m1, m0, 2 + movu [r0 + r4], m2 + palignr m3, m1, 2 + movu [r0 + r4 + 16], m3 + + lea r0, [r0 + r1 * 4] + movu [r0], m0 + movu [r0 + 16], m1 + movu m3, [r2 + 18 + 64] + pshufb m3, [pw_swap16] + palignr m2, m0, m3, 14 + movu [r0 + r1], m2 + palignr m2, m1, m0, 14 + movu [r0 + r1 + 16], m2 + palignr m2, m0, m3, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m0, m3, 10 + movu [r0 + r4], m2 + palignr m2, m1, m0, 10 + movu [r0 + r4 + 16], m2 + + lea r0, [r0 + r1 * 4] + palignr m2, m0, m3, 8 + movu [r0], m2 + palignr m2, m1, m0, 8 + movu [r0 + 16], m2 + palignr m2, m0, m3, 6 + movu [r0 + r1], m2 + palignr m2, m1, m0, 6 + movu [r0 + r1 + 16], m2 + palignr m2, m0, m3, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2 + 16], m2 + palignr m2, m0, m3, 2 + movu [r0 + r4], m2 + palignr m1, m0, 2 + movu [r0 + r4 + 16], m1 + RET + +cglobal intra_pred_ang16_10, 3,6,4 + mov r5d, r4m + movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9] + pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1] + add r1, r1 + lea r4, [r1 * 3] + + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2] + movu [r0 + r1], m2 + movu [r0 + r1 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3] + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] 
; [4 4 4 4 4 4 4 4] + movu [r0 + r4], m2 + movu [r0 + r4 + 16], m2 + + lea r3, [r0 + r1 *4] + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5] + movu [r3], m2 + movu [r3 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6] + movu [r3 + r1], m2 + movu [r3 + r1 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7] + movu [r3 + r1 * 2], m2 + movu [r3 + r1 * 2 + 16], m2 + psrldq m1, 2 + pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8] + movu [r3 + r4], m2 + movu [r3 + r4 + 16], m2 + + lea r3, [r3 + r1 *4] + pshufb m2, m3, [pb_01] ; [9 9 9 9 9 9 9 9] + movu [r3], m2 + movu [r3 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pb_01] ; [10 10 10 10 10 10 10 10] + movu [r3 + r1], m2 + movu [r3 + r1 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pb_01] ; [11 11 11 11 11 11 11 11] + movu [r3 + r1 * 2], m2 + movu [r3 + r1 * 2 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pb_01] ; [12 12 12 12 12 12 12 12] + movu [r3 + r4], m2 + movu [r3 + r4 + 16], m2 + + lea r3, [r3 + r1 *4] + psrldq m3, 2 + pshufb m2, m3, [pb_01] ; [13 13 13 13 13 13 13 13] + movu [r3], m2 + movu [r3 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pb_01] ; [14 14 14 14 14 14 14 14] + movu [r3 + r1], m2 + movu [r3 + r1 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pb_01] ; [15 15 15 15 15 15 15 15] + movu [r3 + r1 * 2], m2 + movu [r3 + r1 * 2 + 16], m2 + psrldq m3, 2 + pshufb m2, m3, [pb_01] ; [16 16 16 16 16 16 16 16] + movu [r3 + r4], m2 + movu [r3 + r4 + 16], m2 + mova m3, m0 + + cmp r5d, byte 0 + jz .quit + + ; filter + pinsrw m1, [r2], 0 ; [3 2 1 0] + pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0] + movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + psubw m1, m2 + psubw m3, m2 + psraw m1, 1 + psraw m3, 1 + paddw m3, m0 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] + pmaxsw m3, m1 + pminsw m3, [pw_pixel_max] +.quit: + movu [r0], m0 + movu [r0 + 16], m3 + RET + +cglobal intra_pred_ang16_26, 3,6,4 + mov r5d, r4m + movu m0, [r2 + 2] 
; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + add r1, r1 + lea r4, [r1 * 3] + + movu [r0], m0 + movu [r0 + 16], m3 + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m3 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 16], m3 + movu [r0 + r4], m0 + movu [r0 + r4 + 16], m3 + + lea r3, [r0 + r1 *4] + movu [r3], m0 + movu [r3 + 16], m3 + movu [r3 + r1], m0 + movu [r3 + r1 + 16], m3 + movu [r3 + r1 * 2], m0 + movu [r3 + r1 * 2 + 16], m3 + movu [r3 + r4], m0 + movu [r3 + r4 + 16], m3 + + lea r3, [r3 + r1 *4] + movu [r3], m0 + movu [r3 + 16], m3 + movu [r3 + r1], m0 + movu [r3 + r1 + 16], m3 + movu [r3 + r1 * 2], m0 + movu [r3 + r1 * 2 + 16], m3 + movu [r3 + r4], m0 + movu [r3 + r4 + 16], m3 + + lea r3, [r3 + r1 *4] + movu [r3], m0 + movu [r3 + 16], m3 + movu [r3 + r1], m0 + movu [r3 + r1 + 16], m3 + movu [r3 + r1 * 2], m0 + movu [r3 + r1 * 2 + 16], m3 + movu [r3 + r4], m0 + movu [r3 + r4 + 16], m3 + + cmp r5d, byte 0 + jz .quit + + ; filter + + pshufb m0, [pb_01] + pinsrw m1, [r2], 0 ; [3 2 1 0] + pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0] + movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9] + psubw m1, m2 + psubw m3, m2 + psraw m1, 1 + psraw m3, 1 + paddw m3, m0 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] + pmaxsw m3, m1 + pminsw m3, [pw_pixel_max] + pextrw [r0], m0, 0 + pextrw [r0 + r1], m0, 1 + pextrw [r0 + r1 * 2], m0, 2 + pextrw [r0 + r4], m0, 3 + lea r0, [r0 + r1 * 4] + pextrw [r0], m0, 4 + pextrw [r0 + r1], m0, 5 + pextrw [r0 + r1 * 2], m0, 6 + pextrw [r0 + r4], m0, 7 + lea r0, [r0 + r1 * 4] + pextrw [r0], m3, 0 + pextrw [r0 + r1], m3, 1 + pextrw [r0 + r1 * 2], m3, 2 + pextrw [r0 + r4], m3, 3 + pextrw [r3], m3, 4 + pextrw [r3 + r1], m3, 5 + pextrw [r3 + r1 * 2], m3, 6 + pextrw [r3 + r4], m3, 7 +.quit: + RET + +;------------------------------------------------------------------------------------------------------- +; avx2 code for intra_pred_ang16 mode 2 to 34 start 
+;------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal intra_pred_ang16_2, 3,5,3 + lea r4, [r2] + add r2, 64 + cmp r3m, byte 34 + cmove r2, r4 + add r1d, r1d + lea r3, [r1 * 3] + movu m0, [r2 + 4] + movu m1, [r2 + 20] + + movu [r0], m0 + palignr m2, m1, m0, 2 + movu [r0 + r1], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 6 + movu [r0 + r3], m2 + + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m1, m0, 10 + movu [r0 + r1], m2 + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 14 + movu [r0 + r3], m2 + + movu m0, [r2 + 36] + lea r0, [r0 + r1 * 4] + movu [r0], m1 + palignr m2, m0, m1, 2 + movu [r0 + r1], m2 + palignr m2, m0, m1, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m1, 6 + movu [r0 + r3], m2 + + lea r0, [r0 + r1 * 4] + palignr m2, m0, m1, 8 + movu [r0], m2 + palignr m2, m0, m1, 10 + movu [r0 + r1], m2 + palignr m2, m0, m1, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m1, 14 + movu [r0 + r3], m2 + RET + +%macro TRANSPOSE_STORE_AVX2 11 + jnz .skip%11 + punpckhwd m%9, m%1, m%2 + punpcklwd m%1, m%2 + punpckhwd m%2, m%3, m%4 + punpcklwd m%3, m%4 + + punpckldq m%4, m%1, m%3 + punpckhdq m%1, m%3 + punpckldq m%3, m%9, m%2 + punpckhdq m%9, m%2 + + punpckhwd m%10, m%5, m%6 + punpcklwd m%5, m%6 + punpckhwd m%6, m%7, m%8 + punpcklwd m%7, m%8 + + punpckldq m%8, m%5, m%7 + punpckhdq m%5, m%7 + punpckldq m%7, m%10, m%6 + punpckhdq m%10, m%6 + + punpcklqdq m%6, m%4, m%8 + punpckhqdq m%2, m%4, m%8 + punpcklqdq m%4, m%1, m%5 + punpckhqdq m%8, m%1, m%5 + + punpcklqdq m%1, m%3, m%7 + punpckhqdq m%5, m%3, m%7 + punpcklqdq m%3, m%9, m%10 + punpckhqdq m%7, m%9, m%10 + + movu [r0 + r1 * 0 + %11], xm%6 + movu [r0 + r1 * 1 + %11], xm%2 + movu [r0 + r1 * 2 + %11], xm%4 + movu [r0 + r4 * 1 + %11], xm%8 + + lea r5, [r0 + r1 * 4] + movu [r5 + r1 * 0 + %11], xm%1 + movu [r5 + r1 * 1 + %11], xm%5 + movu [r5 + r1 * 2 + %11], xm%3 + movu [r5 
+ r4 * 1 + %11], xm%7 + + lea r5, [r5 + r1 * 4] + vextracti128 [r5 + r1 * 0 + %11], m%6, 1 + vextracti128 [r5 + r1 * 1 + %11], m%2, 1 + vextracti128 [r5 + r1 * 2 + %11], m%4, 1 + vextracti128 [r5 + r4 * 1 + %11], m%8, 1 + + lea r5, [r5 + r1 * 4] + vextracti128 [r5 + r1 * 0 + %11], m%1, 1 + vextracti128 [r5 + r1 * 1 + %11], m%5, 1 + vextracti128 [r5 + r1 * 2 + %11], m%3, 1 + vextracti128 [r5 + r4 * 1 + %11], m%7, 1 + jmp .end%11 +.skip%11: + movu [r0 + r1 * 0], m%1 + movu [r0 + r1 * 1], m%2 + movu [r0 + r1 * 2], m%3 + movu [r0 + r4 * 1], m%4 + + lea r0, [r0 + r1 * 4] + movu [r0 + r1 * 0], m%5 + movu [r0 + r1 * 1], m%6 + movu [r0 + r1 * 2], m%7 + movu [r0 + r4 * 1], m%8 + lea r0, [r0 + r1 * 4] +.end%11: +%endmacro + +;; angle 16, modes 3 and 33 +cglobal ang16_mode_3_33 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 + 10 * 32] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 10 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m5, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] + pmaddwd m5, [r3 + 4 * 32] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + palignr m6, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] + pmaddwd m6, [r3 + 4 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m6, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] + pmaddwd m6, [r3 - 2 * 32] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 
8 7] + pmaddwd m7, [r3 - 2 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 12 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + pmaddwd m7, [r3 - 8 * 32] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + palignr m8, m2, m0, 12 ; [20 19 19 18 18 17 17 16 12 11 11 10 10 9 9 8] + pmaddwd m8, [r3 - 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m0, [r3 - 14 * 32] ; [2] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m3, m2, [r3 - 14 * 32] ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m8, m3 + + pmaddwd m9, m0, [r3 + 12 * 32] ; [28] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, m2, [r3 + 12 * 32] ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + palignr m10, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] + pmaddwd m10, [r3 + 6 * 32] ; [22] + paddd m10, [pd_16] + psrld m10, 5 + palignr m3, m1, m2, 4 ; [22 21 21 20 20 19 19 18 14 13 13 12 12 11 11 10] + pmaddwd m3, [r3 + 6 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m10, m3 + + palignr m11, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] + pmaddwd m11, [r3] ; [16] + paddd m11, [pd_16] + psrld m11, 5 + palignr m3, m1, m2, 8 ; [23 22 22 21 21 20 20 19 15 14 14 13 13 12 12 11] + pmaddwd m3, [r3] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m11, m3 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 + + palignr m4, m2, m0, 12 ; [20 19 19 18 18 17 17 16 12 11 11 10 10 9 9 8] + pmaddwd m4, [r3 - 6 * 32] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m1, m2, 12 ; [24 23 23 22 22 21 21 20 15 16 15 14 14 13 13 12] + pmaddwd m5, [r3 - 6 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m2, [r3 - 12 * 32] ; [4] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m1, [r3 - 12 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] + pmaddwd m6, 
m2, [r3 + 14 * 32] ; [30] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, m1, [r3 + 14 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18] + punpcklwd m0, m3 ; [29 29 28 28 27 27 26 22 21 20 20 19 19 18 18 17] + + palignr m7, m1, m2, 4 + pmaddwd m7, [r3 + 8 * 32] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + palignr m8, m0, m1, 4 + pmaddwd m8, [r3 + 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m8, m1, m2, 8 + pmaddwd m8, [r3 + 2 * 32] ; [18] + paddd m8, [pd_16] + psrld m8, 5 + palignr m9, m0, m1, 8 + pmaddwd m9, [r3 + 2 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m9, m1, m2, 12 + pmaddwd m9, [r3 - 4 * 32] ; [12] + paddd m9, [pd_16] + psrld m9, 5 + palignr m3, m0, m1, 12 + pmaddwd m3, [r3 - 4 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m1, [r3 - 10 * 32] ; [6] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m0, [r3 - 10 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + movu m2, [r2 + 28] + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 2, 0, 3, 16 + ret + +;; angle 16, modes 4 and 32 +cglobal ang16_mode_4_32 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 + 3 * 32] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 3 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] + pmaddwd m5, m6, 
[r3 - 8 * 32] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] + pmaddwd m8, m7, [r3 - 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, [r3 + 13 * 32] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 13 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] + pmaddwd m7, [r3 + 2 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + palignr m8, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] + pmaddwd m8, [r3 + 2 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m9, m0, m3, 12 + pmaddwd m8, m9, [r3 - 9 * 32] ; [9] + paddd m8, [pd_16] + psrld m8, 5 + palignr m3, m2, m0, 12 + pmaddwd m10, m3, [r3 - 9 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, [r3 + 12 * 32] ; [30] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, [r3 + 12 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, m0, [r3 + 1 * 32] ; [19] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m3, m2, [r3 + 1 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m10, m3 + + palignr m11, m2, m0, 4 + pmaddwd m11, [r3 - 10 * 32] ; [8] + paddd m11, [pd_16] + psrld m11, 5 + palignr m3, m1, m2, 4 + pmaddwd m3, [r3 - 10 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m11, m3 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 + + palignr m4, m2, m0, 4 + pmaddwd m4, [r3 + 11 * 32] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m1, m2, 4 + pmaddwd m5, [r3 + 11 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m5, m2, m0, 8 + pmaddwd m5, [r3] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + palignr m6, m1, m2, 8 + pmaddwd m6, [r3] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m7, m2, m0, 12 + pmaddwd m6, m7, [r3 - 11 * 32] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m8, m1, m2, 12 + pmaddwd m3, m8, [r3 - 11 * 32] + paddd m3, 
[pd_16] + psrld m3, 5 + packusdw m6, m3 + + pmaddwd m7, [r3 + 10 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, [r3 + 10 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] + pmaddwd m8, m2, [r3 - 1 * 32] ; [17] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m1, [r3 - 1 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18] + punpcklwd m0, m3 ; [29 29 28 28 27 27 26 22 21 20 20 19 19 18 18 17] + + palignr m10, m1, m2, 4 + pmaddwd m9, m10, [r3 - 12 * 32] ; [6] + paddd m9, [pd_16] + psrld m9, 5 + palignr m11, m0, m1, 4 + pmaddwd m3, m11, [r3 - 12 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, [r3 + 9 * 32] ; [27] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 9 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m3, m1, m2, 8 + pmaddwd m3, [r3 - 2 * 32] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + palignr m0, m1, 8 + pmaddwd m0, [r3 - 2 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16 + ret + +;; angle 16, modes 5 and 31 +cglobal ang16_mode_5_31 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 + 1 * 32] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 1 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr 
m6, m0, m3, 4 + pmaddwd m5, m6, [r3 - 14 * 32] ; [2] + paddd m5, [pd_16] + psrld m5, 5 + palignr m7, m2, m0, 4 + pmaddwd m8, m7, [r3 - 14 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, [r3 + 3 * 32] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 3 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m0, m3, 8 + pmaddwd m7, m8, [r3 - 12 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m9, m2, m0, 8 + pmaddwd m10, m9, [r3 - 12 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + pmaddwd m8, [r3 + 5 * 32] ; [21] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 5 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m10, m0, m3, 12 + pmaddwd m9, m10, [r3 - 10 * 32] ; [6] + paddd m9, [pd_16] + psrld m9, 5 + palignr m11, m2, m0, 12 + pmaddwd m3, m11, [r3 - 10 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, [r3 + 7 * 32] ; [23] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 7 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m11, m0, [r3 - 8 * 32] ; [8] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m3, m2, [r3 - 8 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m11, m3 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 + + pmaddwd m4, m0, [r3 + 9 * 32] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 + 9 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m2, m0, 4 + pmaddwd m5, m6, [r3 - 6 * 32] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + palignr m7, m1, m2, 4 + pmaddwd m3, m7, [r3 - 6 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + pmaddwd m6, [r3 + 11 * 32] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 11 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m2, m0, 8 + pmaddwd m7, m8, [r3 - 4 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + palignr m9, m1, m2, 8 + pmaddwd m3, m9, [r3 - 4 * 32] + paddd m3, [pd_16] 
+ psrld m3, 5 + packusdw m7, m3 + + pmaddwd m8, [r3 + 13 * 32] ; [29] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 13 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m10, m2, m0, 12 + pmaddwd m9, m10, [r3 - 2 * 32] ; [14] + paddd m9, [pd_16] + psrld m9, 5 + palignr m11, m1, m2, 12 + pmaddwd m3, m11, [r3 - 2 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, [r3 + 15 * 32] ; [31] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 15 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m2, [r3] ; [16] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16 + ret + +;; angle 16, modes 6 and 30 +cglobal ang16_mode_6_30 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 - 2 * 32] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 2 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 11 * 32] ; [26] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 11 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m7, m0, m3, 4 + pmaddwd m6, m7, [r3 - 8 * 32] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + palignr m8, m2, m0, 4 + pmaddwd m9, m8, [r3 - 8 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, [r3 + 5 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, [r3 + 5 * 
32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m10, m0, m3, 8 + pmaddwd m8, m10, [r3 - 14 * 32] ; [1] + paddd m8, [pd_16] + psrld m8, 5 + palignr m11, m2, m0, 8 + pmaddwd m9, m11, [r3 - 14 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m10, [r3 - 1 * 32] ; [14] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m12, m11, [r3 - 1 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m9, m12 + + pmaddwd m10, [r3 + 12 * 32] ; [27] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 12 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m11, m0, m3, 12 + pmaddwd m11, [r3 - 7 * 32] ; [8] + paddd m11, [pd_16] + psrld m11, 5 + palignr m12, m2, m0, 12 + pmaddwd m12, [r3 - 7 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m4, m0, m3, 12 + pmaddwd m4, [r3 + 6 * 32] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m2, m0, 12 + pmaddwd m5, [r3 + 6 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m0, [r3 - 13 * 32] ; [2] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, m2, [r3 - 13 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + pmaddwd m6, m0, [r3] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, m2, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m7, m0, [r3 + 13 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m3, m2, [r3 + 13 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + palignr m9, m2, m0, 4 + pmaddwd m8, m9, [r3 - 6 * 32] ; [9] + paddd m8, [pd_16] + psrld m8, 5 + palignr m3, m1, m2, 4 + pmaddwd m10, m3, [r3 - 6 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, [r3 + 7 * 32] ; [22] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, [r3 + 7 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + palignr m11, m2, m0, 8 + pmaddwd m10, m11, [r3 - 12 * 32] ; [3] + paddd m10, [pd_16] + psrld m10, 5 + 
palignr m3, m1, m2, 8 + pmaddwd m12, m3, [r3 - 12 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m11, [r3 + 1 * 32] ; [16] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m3, [r3 + 1 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m11, m3 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16 + ret + +;; angle 16, modes 7 and 29 +cglobal ang16_mode_7_29 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + + pmaddwd m4, m3, [r3 - 8 * 32] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 8 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 1 * 32] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 1 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3 + 10 * 32] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m0, [r3 + 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + palignr m10, m0, m3, 4 + pmaddwd m7, m10, [r3 - 13 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m11, m2, m0, 4 + pmaddwd m8, m11, [r3 - 13 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m10, [r3 - 4 * 32] ; [13] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m11, [r3 - 4 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m10, [r3 + 5 * 32] ; [22] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m12, m11, [r3 + 5 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m9, m12 + + pmaddwd m10, [r3 + 14 * 32] ; [31] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 14 * 
32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m11, m0, m3, 8 + pmaddwd m11, [r3 - 9 * 32] ; [8] + paddd m11, [pd_16] + psrld m11, 5 + palignr m12, m2, m0, 8 + pmaddwd m12, [r3 - 9 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 + + palignr m5, m0, m3, 8 + pmaddwd m4, m5, [r3] ; [17] + paddd m4, [pd_16] + psrld m4, 5 + palignr m6, m2, m0, 8 + pmaddwd m7, m6, [r3] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + pmaddwd m5, [r3 + 9 * 32] ; [26] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, [r3 + 9 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m9, m0, m3, 12 + pmaddwd m6, m9, [r3 - 14 * 32] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m3, m2, m0, 12 + pmaddwd m7, m3, [r3 - 14 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m7, m9, [r3 - 5 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m3, [r3 - 5 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m9, [r3 + 4 * 32] ; [21] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m10, m3, [r3 + 4 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, [r3 + 13 * 32] ; [30] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, [r3 + 13 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, m0, [r3 - 10 * 32] ; [7] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m2, [r3 - 10 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m0, [r3 - 1 * 32] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + pmaddwd m2, [r3 - 1 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m0, m2 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 16 + ret + +;; angle 16, modes 8 and 28 +cglobal ang16_mode_8_28 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 
2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + + pmaddwd m4, m3, [r3 - 10 * 32] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 10 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 - 5 * 32] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 - 5 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m0, [r3] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, m3, [r3 + 5 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m0, [r3 + 5 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m3, [r3 + 10 * 32] ; [25] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 + 15 * 32] ; [30] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m0, [r3 + 15 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + palignr m11, m0, m3, 4 + pmaddwd m10, m11, [r3 - 12 * 32] ; [3] + paddd m10, [pd_16] + psrld m10, 5 + palignr m1, m2, m0, 4 + pmaddwd m12, m1, [r3 - 12 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m11, [r3 - 7 * 32] ; [8] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m1, [r3 - 7 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m11, m1 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 + + palignr m7, m0, m3, 4 + pmaddwd m4, m7, [r3 - 2 * 32] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m1, m2, m0, 4 + pmaddwd m5, m1, [r3 - 2 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m7, [r3 + 3 * 32] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m1, [r3 + 3 * 32] + paddd m6, [pd_16] + 
psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m7, [r3 + 8 * 32] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m1, [r3 + 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, [r3 + 13 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m1, [r3 + 13 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + palignr m1, m0, m3, 8 + pmaddwd m8, m1, [r3 - 14 * 32] ; [1] + paddd m8, [pd_16] + psrld m8, 5 + palignr m2, m0, 8 + pmaddwd m9, m2, [r3 - 14 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m1, [r3 - 9 * 32] ; [6] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, m2, [r3 - 9 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m3, m1, [r3 - 4 * 32] ; [11] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, m2, [r3 - 4 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + pmaddwd m1, [r3 + 1 * 32] ; [16] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m2, [r3 + 1 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m1, m2 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16 + ret + +;; angle 16, modes 9 and 27 +cglobal ang16_mode_9_27 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + + pmaddwd m4, m3, [r3 - 14 * 32] ; [2] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 14 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 - 12 * 32] ; [4] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 - 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3 - 10 * 32] ; [6] + paddd m6, 
[pd_16] + psrld m6, 5 + pmaddwd m9, m0, [r3 - 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, m3, [r3 - 8 * 32] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m0, [r3 - 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m3, [r3 - 6 * 32] ; [10] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 - 6 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 - 4 * 32] ; [12] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m0, [r3 - 4 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m3, [r3 - 2 * 32] ; [14] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m1, m0, [r3 - 2 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m10, m1 + + pmaddwd m11, m3, [r3] ; [16] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m1, m0, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m11, m1 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0 + + pmaddwd m4, m3, [r3 + 2 * 32] ; [18] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 2 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 4 * 32] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m0, [r3 + 4 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 + 6 * 32] ; [22] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m0, [r3 + 6 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m3, [r3 + 8 * 32] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m1, m0, [r3 + 8 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m8, m3, [r3 + 10 * 32] ; [26] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 + 12 * 32] ; [28] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m1, m0, [r3 + 12 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m9, m1 + + pmaddwd m3, [r3 + 14 * 32] ; [30] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, 
[r3 + 14 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + movu m1, [r2 + 4] + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16 + ret + +;; angle 16, modes 11 and 25 +cglobal ang16_mode_11_25 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m1, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m1 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m0, m1 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 + 14 * 32] ; [30] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 14 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 12 * 32] ; [28] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3 + 10 * 32] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m0, [r3 + 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, m3, [r3 + 8 * 32] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m0, [r3 + 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m3, [r3 + 6 * 32] ; [22] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 6 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 + 4 * 32] ; [20] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m0, [r3 + 4 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m3, [r3 + 2 * 32] ; [18] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m1, m0, [r3 + 2 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m10, m1 + + pmaddwd m11, m3, [r3] ; [16] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m1, m0, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m11, m1 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0 + + pmaddwd m4, m3, [r3 - 2 * 32] ; [14] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 2 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 - 4 * 
32] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m0, [r3 - 4 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 6 * 32] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m0, [r3 - 6 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m3, [r3 - 8 * 32] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m1, m0, [r3 - 8 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m7, m1 + + pmaddwd m8, m3, [r3 - 10 * 32] ; [6] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 - 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 - 12 * 32] ; [4] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m1, m0, [r3 - 12 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m9, m1 + + pmaddwd m3, [r3 - 14 * 32] ; [2] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 - 14 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + + movu m1, [r2] + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16 + ret + +;; angle 16, modes 12 and 24 +cglobal ang16_mode_12_24 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 + 11 * 32] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 + 11 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 6 * 32] ; [22] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m2, [r3 + 6 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3 + 1 * 32] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m2, [r3 + 1 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, m3, [r3 - 4 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m2, [r3 - 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m3, [r3 - 9 * 32] 
; [7] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m2, [r3 - 9 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 - 14 * 32] ; [2] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m2, [r3 - 14 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m9, m2 + + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 6 6 13 13 x x x x] + + palignr m2, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m10, m2, [r3 + 13 * 32] ; [29] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 + 13 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m11, m2, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 + 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m13, m0, m3, 14 + + pmaddwd m4, m2, [r3 + 3 * 32] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 + 3 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m2, [r3 - 2 * 32] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m13, [r3 - 2 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m2, [r3 - 7 * 32] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m13, [r3 - 7 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m2, [r3 - 12 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 - 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m0, m3, 10 + palignr m3, m1, 10 + + pmaddwd m8, m3, [r3 + 15 * 32] ; [31] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 15 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 + 10 * 32] ; [26] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m1, m0, [r3 + 10 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m9, m1 + + pmaddwd m1, m3, [r3 + 5 * 32] ; [21] + 
paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m2, m0, [r3 + 5 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m1, m2 + + pmaddwd m3, [r3] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 + ret + +;; angle 16, modes 13 and 23 +cglobal ang16_mode_13_23 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 + 7 * 32] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 + 7 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 - 2 * 32] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m2, [r3 - 2 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 11 * 32] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m2, [r3 - 11 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m6, m2 + + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14] + + palignr m2, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m7, m2, [r3 + 12 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 + 3 * 32] ; [19] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 + 3 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 - 6 * 32] ; [10] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 - 6 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m2, [r3 - 15 * 32] ; [1] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 - 15 * 32] + paddd m12, [pd_16] + psrld m12, 
5 + packusdw m10, m12 + + palignr m2, m3, m1, 10 + palignr m13, m0, m3, 10 + + pmaddwd m11, m2, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 + 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m13, m0, m3, 10 + + pmaddwd m4, m2, [r3 - 1 * 32] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 - 1 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m2, [r3 - 10 * 32] ; [6] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m13, [r3 - 10 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m2, m3, m1, 6 + palignr m13, m0, m3, 6 + + pmaddwd m6, m2, [r3 + 13 * 32] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m13, [r3 + 13 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m2, [r3 + 4 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 - 5 * 32] ; [11] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 - 5 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 - 14 * 32] ; [2] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m13, [r3 - 14 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m9, m13 + + palignr m0, m3, 2 + palignr m3, m1, 2 + + pmaddwd m1, m3, [r3 + 9 * 32] ; [25] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m2, m0, [r3 + 9 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m1, m2 + + pmaddwd m3, [r3] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 + ret + +;; angle 16, modes 14 and 22 +cglobal ang16_mode_14_22 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] 
+ punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 + 3 * 32] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 + 3 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 - 10 * 32] ; [6] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m2, [r3 - 10 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m5, m2 + + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 2 2 5 5 7 7 10 10] + vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 12 12 15 15 x x x x] + + palignr m2, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m6, m2, [r3 + 9 * 32] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m13, [r3 + 9 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, m2, [r3 - 4 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 - 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m2, m3, m1, 10 ; [10 9 9 8 8 7 7 6 2 1 1 0 0 2 2 5] + palignr m13, m0, m3, 10 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] + + pmaddwd m8, m2, [r3 + 15 * 32] ; [31] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 + 15 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 + 2 * 32] ; [18] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 + 2 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m2, [r3 - 11 * 32] ; [5] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 - 11 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + palignr m2, m3, m1, 6 ; [ 9 8 8 7 7 6 6 5 1 0 0 2 2 5 5 7] + palignr m13, m0, m3, 6 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + + pmaddwd m11, m2, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 + 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 0 + + palignr m13, m0, m3, 6 + + pmaddwd m4, m2, [r3 - 5 * 32] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 - 5 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m2, m0, m3, 2 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + palignr m13, m3, m1, 2 ; [ 8 7 7 6 6 5 5 4 0 2 2 5 5 7 7 10] + + pmaddwd m5, m13, [r3 + 14 * 32] ; [30] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m2, [r3 + 14 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m13, [r3 + 1 * 32] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m2, [r3 + 1 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m13, [r3 - 12 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m2, [r3 - 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m2, m1, m14, 14 ; [ 7 6 6 5 5 4 4 3 2 5 5 7 7 10 10 12] + palignr m0, m3, m1, 14 ; [11 10 10 9 9 8 8 7 3 2 2 1 1 0 0 2] + + pmaddwd m8, m2, [r3 + 7 * 32] ; [23] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 7 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 - 6 * 32] ; [10] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m2, m0, [r3 - 6 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m9, m2 + + palignr m3, m1, 10 ; [10 9 9 8 8 7 7 6 2 1 1 0 0 2 2 5] + palignr m1, m14, 10 ; [ 6 5 5 4 4 3 3 2 5 7 7 10 10 12 12 15] + + pmaddwd m2, m1, [r3 + 13 * 32] ; [29] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m0, m3, [r3 + 13 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m2, m0 + + pmaddwd m1, [r3] ; [16] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m3, [r3] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m1, m3 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 2, 1, 0, 3, 16 + ret + +;; angle 16, modes 15 and 21 +cglobal ang16_mode_15_21 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 
0] + punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 - 1 * 32] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 - 1 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 + vinserti128 m14, m14, xm3, 1 + + palignr m2, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m5, m2, [r3 + 14 * 32] ; [30] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m13, [r3 + 14 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m2, [r3 - 3 * 32] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m13, [r3 - 3 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + palignr m2, m3, m1, 10 + palignr m13, m0, m3, 10 + + pmaddwd m7, m2, [r3 + 12 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 - 5 * 32] ; [11] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 - 5 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m2, m3, m1, 6 + palignr m13, m0, m3, 6 + + pmaddwd m9, m2, [r3 + 10 * 32] ; [26] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 + 10 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m2, [r3 - 7 * 32] ; [9] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 - 7 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + palignr m2, m3, m1, 2 + palignr m13, m0, m3, 2 + + pmaddwd m11, m2, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 + 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m13, m0, m3, 2 + + pmaddwd m4, m2, [r3 - 9 * 32] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 - 9 * 32] + paddd m5, [pd_16] + psrld m5, 5 + 
packusdw m4, m5 + + palignr m6, m1, m14, 14 + palignr m7, m3, m1, 14 + + pmaddwd m5, m6, [r3 + 6 * 32] ; [22] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m7, [r3 + 6 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, [r3 - 11 * 32] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 - 11 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m1, m14, 10 + palignr m9, m3, m1, 10 + + pmaddwd m7, m8, [r3 + 4 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m10, m9, [r3 + 4 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + pmaddwd m8, [r3 - 13 * 32] ; [3] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 - 13 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m2, m1, m14, 6 + palignr m0, m3, m1, 6 + + pmaddwd m9, m2, [r3 + 2 * 32] ; [18] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m13, m0, [r3 + 2 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m9, m13 + + pmaddwd m2, [r3 - 15 * 32] ; [1] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m0, [r3 - 15 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m2, m0 + + palignr m3, m1, 2 + palignr m1, m14, 2 + + pmaddwd m1, [r3] ; [16] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m3, [r3] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m1, m3 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 2, 1, 0, 3, 16 + ret + +;; angle 16, modes 16 and 20 +cglobal ang16_mode_16_20 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m12, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 - 5 * 32] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m12, [r3 - 5 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 
; [ 7 7 6 6 5 5 4 4 2 2 3 3 5 5 6 6] + vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 8 8 9 9 11 11 12 12] + vinserti128 m2, m2, xm1, 1 ; [ 2 2 3 3 5 5 6 6 14 14 15 15 x x x x] + + palignr m12, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m5, m12, [r3 + 6 * 32] ; [22] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m13, [r3 + 6 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m12, [r3 - 15 * 32] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m13, [r3 - 15 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + palignr m12, m3, m1, 10 + palignr m13, m0, m3, 10 + + pmaddwd m7, m12, [r3 - 4 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 - 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m12, m3, m1, 6 + palignr m13, m0, m3, 6 + + pmaddwd m8, m12, [r3 + 7 * 32] ; [23] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 + 7 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m12, [r3 - 14 * 32] ; [2] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 - 14 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + palignr m12, m3, m1, 2 + palignr m13, m0, m3, 2 + + pmaddwd m10, m12, [r3 - 3 * 32] ; [13] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, m13, [r3 - 3 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m12, m1, m14, 14 + palignr m13, m3, m1, 14 + + pmaddwd m11, m12, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 + 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 13, 0 + + palignr m13, m3, m1, 14 + + pmaddwd m4, m12, [r3 - 13 * 32] ; [3] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 - 13 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m1, m14, 10 + palignr m7, m3, m1, 10 + + pmaddwd m5, m6, [r3 - 2 * 32] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m7, [r3 - 2 * 32] 
+ paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m7, m1, m14, 6 + palignr m10, m3, m1, 6 + + pmaddwd m6, m7, [r3 + 9 * 32] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m10, [r3 + 9 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, [r3 - 12 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m10, [r3 - 12 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + palignr m8, m1, m14, 2 ; [ 4 3 3 2 2 1 1 0 6 8 8 9 9 11 11 12] + palignr m9, m3, m1, 2 ; [ 8 7 7 6 6 5 5 4 0 2 2 3 3 5 5 6] + + pmaddwd m8, [r3 - 1 * 32] ; [15] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 - 1 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m12, m14, m2, 14 + palignr m0, m1, m14, 14 + + pmaddwd m9, m12, [r3 + 10 * 32] ; [26] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m13, m0, [r3 + 10 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m9, m13 + + pmaddwd m12, [r3 - 11 * 32] ; [5] + paddd m12, [pd_16] + psrld m12, 5 + pmaddwd m0, [r3 - 11 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m12, m0 + + palignr m1, m14, 10 + palignr m14, m2, 10 + + pmaddwd m14, [r3] ; [16] + paddd m14, [pd_16] + psrld m14, 5 + pmaddwd m1, [r3] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m14, m1 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 12, 14, 0, 3, 16 + ret + +;; angle 16, modes 17 and 19 +cglobal ang16_mode_17_19 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m12, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 - 10 * 32] ; [6] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m12, [r3 - 10 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 2 2 3 3 
5 5 6 6] + vinserti128 m14, m14, xm3, 1 ; [ 3 3 2 2 1 1 0 0 8 8 9 9 11 11 12 12] + vinserti128 m2, m2, xm1, 1 ; [ 2 2 3 3 5 5 6 6 14 14 15 15 x x x x] + + palignr m12, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m5, m12, [r3 - 4 * 32] ; [12] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m13, [r3 - 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m12, m3, m1, 10 + palignr m13, m0, m3, 10 + + pmaddwd m6, m12, [r3 + 2 * 32] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m13, [r3 + 2 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + palignr m12, m3, m1, 6 + palignr m13, m0, m3, 6 + + pmaddwd m7, m12, [r3 + 8 * 32] ; [24] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m12, m3, m1, 2 + palignr m13, m0, m3, 2 + + pmaddwd m8, m12, [r3 + 14 * 32] ; [30] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 + 14 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m12, [r3 - 12 * 32] ; [4] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 - 12 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + palignr m12, m1, m14, 14 + palignr m13, m3, m1, 14 + + pmaddwd m10, m12, [r3 - 6 * 32] ; [10] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, m13, [r3 - 6 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m12, m1, m14, 10 + palignr m13, m3, m1, 10 + + pmaddwd m11, m12, [r3] ; [16] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 13, 0 + + palignr m12, m1, m14, 6 + palignr m13, m3, m1, 6 + + pmaddwd m4, m12, [r3 + 6 * 32] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 + 6 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m12, m1, m14, 2 + palignr m13, m3, m1, 2 + + pmaddwd m5, m12, [r3 + 12 * 32] ; [28] + paddd m5, [pd_16] + psrld 
m5, 5 + pmaddwd m8, m13, [r3 + 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m12, [r3 - 14 * 32] ; [2] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m13, [r3 - 14 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + palignr m7, m14, m2, 14 + palignr m0, m1, m14, 14 + + pmaddwd m7, [r3 - 8 * 32] ; [8] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m0, [r3 - 8 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m7, m0 + + palignr m8, m14, m2, 10 + palignr m9, m1, m14, 10 + + pmaddwd m8, [r3 - 2 * 32] ; [14] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 - 2 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m9, m14, m2, 6 + palignr m13, m1, m14, 6 + + pmaddwd m9, [r3 + 4 * 32] ; [20] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m13, [r3 + 4 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m9, m13 + + palignr m1, m14, 2 + palignr m14, m2, 2 + + pmaddwd m12, m14, [r3 + 10 * 32] ; [26] + paddd m12, [pd_16] + psrld m12, 5 + pmaddwd m0, m1, [r3 + 10 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m12, m0 + + pmaddwd m14, [r3 - 16 * 32] ; [0] + paddd m14, [pd_16] + psrld m14, 5 + pmaddwd m1, [r3 - 16 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m14, m1 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 12, 14, 0, 3, 16 + ret + +cglobal intra_pred_ang16_3, 3,7,13 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_3_33 + RET + +cglobal intra_pred_ang16_33, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_3_33 + RET + +cglobal intra_pred_ang16_4, 3,7,13 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 18 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_4_32 + RET + +cglobal intra_pred_ang16_32, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 18 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_4_32 + RET + +cglobal intra_pred_ang16_5, 
3,7,13 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_5_31 + RET + +cglobal intra_pred_ang16_31, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_5_31 + RET + +cglobal intra_pred_ang16_6, 3,7,14 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_6_30 + RET + +cglobal intra_pred_ang16_30, 3,7,14 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_6_30 + RET + +cglobal intra_pred_ang16_7, 3,7,13 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 17 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_7_29 + RET + +cglobal intra_pred_ang16_29, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 17 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_7_29 + RET + +cglobal intra_pred_ang16_8, 3,7,13 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_8_28 + RET + +cglobal intra_pred_ang16_28, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_8_28 + RET + +cglobal intra_pred_ang16_9, 3,7,12 + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_9_27 + RET + +cglobal intra_pred_ang16_27, 3,7,12 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_9_27 + RET + +cglobal intra_pred_ang16_10, 3,6,3 + mov r5d, r4m + add r1d, r1d + lea r4, [r1 * 3] + + vpbroadcastw m2, [r2 + 2 + 64] ; [1...] + mova m0, m2 + movu [r0], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 2] ; [2...] + movu [r0 + r1], m1 + vpbroadcastw m2, [r2 + 2 + 64 + 4] ; [3...] + movu [r0 + r1 * 2], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 6] ; [4...] 
+ movu [r0 + r4], m1 + + lea r3, [r0 + r1 * 4] + vpbroadcastw m2, [r2 + 2 + 64 + 8] ; [5...] + movu [r3], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 10] ; [6...] + movu [r3 + r1], m1 + vpbroadcastw m2, [r2 + 2 + 64 + 12] ; [7...] + movu [r3 + r1 * 2], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 14] ; [8...] + movu [r3 + r4], m1 + + lea r3, [r3 + r1 *4] + vpbroadcastw m2, [r2 + 2 + 64 + 16] ; [9...] + movu [r3], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 18] ; [10...] + movu [r3 + r1], m1 + vpbroadcastw m2, [r2 + 2 + 64 + 20] ; [11...] + movu [r3 + r1 * 2], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 22] ; [12...] + movu [r3 + r4], m1 + + lea r3, [r3 + r1 *4] + vpbroadcastw m2, [r2 + 2 + 64 + 24] ; [13...] + movu [r3], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 26] ; [14...] + movu [r3 + r1], m1 + vpbroadcastw m2, [r2 + 2 + 64 + 28] ; [15...] + movu [r3 + r1 * 2], m2 + vpbroadcastw m1, [r2 + 2 + 64 + 30] ; [16...] + movu [r3 + r4], m1 + + cmp r5d, byte 0 + jz .quit + + ; filter + vpbroadcastw m2, [r2] ; [0 0...] + movu m1, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] +.quit: + movu [r0], m0 + RET + +cglobal intra_pred_ang16_26, 3,6,4 + mov r5d, r4m + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + add r1d, r1d + lea r4, [r1 * 3] + + movu [r0], m0 + movu [r0 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r4], m0 + + lea r3, [r0 + r1 *4] + movu [r3], m0 + movu [r3 + r1], m0 + movu [r3 + r1 * 2], m0 + movu [r3 + r4], m0 + + lea r3, [r3 + r1 *4] + movu [r3], m0 + movu [r3 + r1], m0 + movu [r3 + r1 * 2], m0 + movu [r3 + r4], m0 + + lea r3, [r3 + r1 *4] + movu [r3], m0 + movu [r3 + r1], m0 + movu [r3 + r1 * 2], m0 + movu [r3 + r4], m0 + + cmp r5d, byte 0 + jz .quit + + ; filter + + vpbroadcastw m0, xm0 + vpbroadcastw m2, [r2] + movu m1, [r2 + 2 + 64] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + pxor m1, m1 + pmaxsw m0, m1 + pminsw m0, [pw_pixel_max] + pextrw [r0], xm0, 0 + pextrw 
[r0 + r1], xm0, 1 + pextrw [r0 + r1 * 2], xm0, 2 + pextrw [r0 + r4], xm0, 3 + lea r0, [r0 + r1 * 4] + pextrw [r0], xm0, 4 + pextrw [r0 + r1], xm0, 5 + pextrw [r0 + r1 * 2], xm0, 6 + pextrw [r0 + r4], xm0, 7 + lea r0, [r0 + r1 * 4] + vpermq m0, m0, 11101110b + pextrw [r0], xm0, 0 + pextrw [r0 + r1], xm0, 1 + pextrw [r0 + r1 * 2], xm0, 2 + pextrw [r0 + r4], xm0, 3 + pextrw [r3], xm0, 4 + pextrw [r3 + r1], xm0, 5 + pextrw [r3 + r1 * 2], xm0, 6 + pextrw [r3 + r4], xm0, 7 +.quit: + RET + +cglobal intra_pred_ang16_11, 3,7,12, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r2, 64 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_11_25 + + mov r6d, [rsp] + mov [r2], r6w + RET + +cglobal intra_pred_ang16_25, 3,7,12 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + + call ang16_mode_11_25 + RET + +cglobal intra_pred_ang16_12, 3,7,14, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 12] ; [13 12 11 10 9 8 7 6] + pshufb xm1, [pw_ang16_12_24] ; [ 6 6 13 13 x x x x] + xor r6d, r6d + add r2, 64 + + call ang16_mode_12_24 + + mov r6d, [rsp] + mov [r2], r6w + RET + +cglobal intra_pred_ang16_24, 3,7,14, 0-4 + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 76] ; [13 12 11 10 9 8 7 6] + pshufb xm1, [pw_ang16_12_24] ; [ 6 6 13 13 x x x x] + xor r6d, r6d + inc r6d + + call ang16_mode_12_24 + RET + +cglobal intra_pred_ang16_13, 3,7,14, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 8] ; [11 x x x 7 x x 4] + pinsrw xm1, [r2 + 28], 1 ; [11 x x x 7 x 14 4] + pshufb xm1, [pw_ang16_13_23] ; [ 4 4 7 7 11 11 14 14] + xor r6d, r6d + 
add r2, 64 + + call ang16_mode_13_23 + + mov r6d, [rsp] + mov [r2], r6w + RET + +cglobal intra_pred_ang16_23, 3,7,14, 0-4 + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 72] ; [11 10 9 8 7 6 5 4] + pinsrw xm1, [r2 + 92], 1 ; [11 x x x 7 x 14 4] + pshufb xm1, [pw_ang16_13_23] ; [ 4 4 7 7 11 11 14 14] + xor r6d, r6d + inc r6d + + call ang16_mode_13_23 + RET + +cglobal intra_pred_ang16_14, 3,7,15, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 4] ; [ x x 7 x 5 x x 2] + pinsrw xm1, [r2 + 20], 1 ; [ x x 7 x 5 x 10 2] + movu xm14, [r2 + 24] ; [ x x x x 15 x x 12] + pshufb xm14, [pw_ang16_14_22] ; [12 12 15 15 x x x x] + pshufb xm1, [pw_ang16_14_22] ; [ 2 2 5 5 7 7 10 10] + xor r6d, r6d + add r2, 64 + + call ang16_mode_14_22 + + mov r6d, [rsp] + mov [r2], r6w + RET + +cglobal intra_pred_ang16_22, 3,7,15, 0-4 + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 68] ; [ x x 7 x 5 x x 2] + pinsrw xm1, [r2 + 84], 1 ; [ x x 7 x 5 x 10 2] + movu xm14, [r2 + 88] ; [ x x x x 15 x x 12] + pshufb xm14, [pw_ang16_14_22] ; [12 12 15 15 x x x x] + pshufb xm1, [pw_ang16_14_22] ; [ 2 2 5 5 7 7 10 10] + xor r6d, r6d + inc r6d + + call ang16_mode_14_22 + RET + +cglobal intra_pred_ang16_15, 3,7,15, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 4] ; [ x 8 x 6 x 4 x 2] + movu xm14, [r2 + 18] ; [ x 15 x 13 x 11 x 9] + pshufb xm14, [pw_ang16_15_21] ; [ 9 9 11 11 13 13 15 15] + pshufb xm1, [pw_ang16_15_21] ; [ 2 2 4 4 6 6 8 8] + xor r6d, r6d + add r2, 64 + + call ang16_mode_15_21 + + mov r6d, [rsp] + mov [r2], r6w + RET + +cglobal intra_pred_ang16_21, 3,7,15, 0-4 + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 
+ 68] ; [ x 8 x 6 x 4 x 2] + movu xm14, [r2 + 82] ; [ x 15 x 13 x 11 x 9] + pshufb xm14, [pw_ang16_15_21] ; [ 9 9 11 11 13 13 15 15] + pshufb xm1, [pw_ang16_15_21] ; [ 2 2 4 4 6 6 8 8] + xor r6d, r6d + inc r6d + + call ang16_mode_15_21 + RET + +cglobal intra_pred_ang16_16, 3,7,15, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 4] ; [ x x x 6 5 x 3 2] + movu xm14, [r2 + 16] ; [ x x x 12 11 x 9 8] + movu xm2, [r2 + 28] ; [ x x x x x x 15 14] + pshufb xm14, [pw_ang16_16_20] ; [ 8 8 9 9 11 11 12 12] + pshufb xm1, [pw_ang16_16_20] ; [ 2 2 3 3 5 5 6 6] + pshufb xm2, [pw_ang16_16_20] ; [14 14 15 15 x x x x] + xor r6d, r6d + add r2, 64 + + call ang16_mode_16_20 + + mov r6d, [rsp] + mov [r2], r6w + RET + +cglobal intra_pred_ang16_20, 3,7,15, 0-4 + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 68] ; [ x x x 6 5 x 3 2] + movu xm14, [r2 + 80] ; [ x x x 12 11 x 9 8] + movu xm2, [r2 + 92] ; [ x x x x x x 15 14] + pshufb xm14, [pw_ang16_16_20] ; [ 8 8 9 9 11 11 12 12] + pshufb xm1, [pw_ang16_16_20] ; [ 2 2 3 3 5 5 6 6] + pshufb xm2, [pw_ang16_16_20] ; [14 14 15 15 x x x x] + xor r6d, r6d + inc r6d + + call ang16_mode_16_20 + RET + +cglobal intra_pred_ang16_17, 3,7,15, 0-4 + movzx r5d, word [r2 + 64] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 64], r6w + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 2] ; [ x x x 6 5 x 3 2] + movu xm14, [r2 + 12] ; [ x x x 12 11 x 9 8] + movu xm2, [r2 + 22] ; [ x x x x x x 15 14] + pshufb xm14, [pw_ang16_16_20] ; [ 8 8 9 9 11 11 12 12] + pshufb xm1, [pw_ang16_16_20] ; [ 2 2 3 3 5 5 6 6] + pshufb xm2, [pw_ang16_16_20] ; [14 14 15 15 x x x x] + xor r6d, r6d + add r2, 64 + + call ang16_mode_17_19 + + mov r6d, [rsp] + mov [r2], r6w + RET + +cglobal intra_pred_ang16_19, 3,7,15, 0-4 + add r1d, r1d + lea r4, [r1 * 3] + lea r3, 
[ang_table_avx2 + 16 * 32] + movu xm1, [r2 + 66] ; [ x x x 6 5 x 3 2] + movu xm14, [r2 + 76] ; [ x x x 12 11 x 9 8] + movu xm2, [r2 + 86] ; [ x x x x x x 15 14] + pshufb xm14, [pw_ang16_16_20] ; [ 8 8 9 9 11 11 12 12] + pshufb xm1, [pw_ang16_16_20] ; [ 2 2 3 3 5 5 6 6] + pshufb xm2, [pw_ang16_16_20] ; [14 14 15 15 x x x x] + xor r6d, r6d + inc r6d + + call ang16_mode_17_19 + RET + +cglobal intra_pred_ang16_18, 3,5,4 + add r1d, r1d + lea r4, [r1 * 3] + movu m1, [r2] + movu m0, [r2 + 2 + 64] + pshufb m0, [pw_swap16] + mova m3, m0 + vinserti128 m0, m0, xm1, 1 + movu [r0], m1 + palignr m2, m1, m0, 14 + movu [r0 + r1], m2 + + palignr m2, m1, m0, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 10 + movu [r0 + r4], m2 + + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m1, m0, 6 + movu [r0 + r1], m2 + palignr m2, m1, m0, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m1, m0, 2 + movu [r0 + r4], m2 + + lea r0, [r0 + r1 * 4] + movu [r0], m0 + vpermq m3, m3, 01001110b + palignr m2, m0, m3, 14 + movu [r0 + r1], m2 + palignr m2, m0, m3, 12 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m3, 10 + movu [r0 + r4], m2 + palignr m2, m1, m0, 10 + + lea r0, [r0 + r1 * 4] + palignr m2, m0, m3, 8 + movu [r0], m2 + palignr m2, m0, m3, 6 + movu [r0 + r1], m2 + palignr m2, m0, m3, 4 + movu [r0 + r1 * 2], m2 + palignr m2, m0, m3, 2 + movu [r0 + r4], m2 + palignr m1, m0, 2 + RET + +;------------------------------------------------------------------------------------------------------- +; end of avx2 code for intra_pred_ang16 mode 2 to 34 +;------------------------------------------------------------------------------------------------------- + +;------------------------------------------------------------------------------------------------------- +; avx2 code for intra_pred_ang32 mode 2 to 34 start +;------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal intra_pred_ang32_2, 3,5,6 + lea r4, [r2] + 
add r2, 128 + cmp r3m, byte 34 + cmove r2, r4 + add r1d, r1d + lea r3, [r1 * 3] + movu m0, [r2 + 4] + movu m1, [r2 + 20] + movu m3, [r2 + 36] + movu m4, [r2 + 52] + + movu [r0], m0 + movu [r0 + 32], m3 + palignr m2, m1, m0, 2 + palignr m5, m4, m3, 2 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m1, m0, 4 + palignr m5, m4, m3, 4 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 32], m5 + palignr m2, m1, m0, 6 + palignr m5, m4, m3, 6 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + palignr m5, m4, m3, 8 + movu [r0], m2 + movu [r0 + 32], m5 + palignr m2, m1, m0, 10 + palignr m5, m4, m3, 10 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m1, m0, 12 + palignr m5, m4, m3, 12 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 32], m5 + palignr m2, m1, m0, 14 + palignr m5, m4, m3, 14 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + + movu m0, [r2 + 36] + movu m3, [r2 + 68] + lea r0, [r0 + r1 * 4] + movu [r0], m1 + movu [r0 + 32], m4 + palignr m2, m0, m1, 2 + palignr m5, m3, m4, 2 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m0, m1, 4 + palignr m5, m3, m4, 4 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 32], m5 + palignr m2, m0, m1, 6 + palignr m5, m3, m4, 6 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + + lea r0, [r0 + r1 * 4] + palignr m2, m0, m1, 8 + palignr m5, m3, m4, 8 + movu [r0], m2 + movu [r0 + 32], m5 + palignr m2, m0, m1, 10 + palignr m5, m3, m4, 10 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m0, m1, 12 + palignr m5, m3, m4, 12 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 32], m5 + palignr m2, m0, m1, 14 + palignr m5, m3, m4, 14 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + + lea r0, [r0 + r1 * 4] + movu m1, [r2 + 52] + movu m4, [r2 + 84] + + movu [r0], m0 + movu [r0 + 32], m3 + palignr m2, m1, m0, 2 + palignr m5, m4, m3, 2 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m1, m0, 4 + palignr m5, m4, m3, 4 + movu [r0 + r1 * 2], m2 + movu [r0 
+ r1 * 2 + 32], m5 + palignr m2, m1, m0, 6 + palignr m5, m4, m3, 6 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + + lea r0, [r0 + r1 * 4] + palignr m2, m1, m0, 8 + palignr m5, m4, m3, 8 + movu [r0], m2 + movu [r0 + 32], m5 + palignr m2, m1, m0, 10 + palignr m5, m4, m3, 10 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m1, m0, 12 + palignr m5, m4, m3, 12 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 32], m5 + palignr m2, m1, m0, 14 + palignr m5, m4, m3, 14 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + + movu m0, [r2 + 68] + movu m3, [r2 + 100] + lea r0, [r0 + r1 * 4] + movu [r0], m1 + movu [r0 + 32], m4 + palignr m2, m0, m1, 2 + palignr m5, m3, m4, 2 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m0, m1, 4 + palignr m5, m3, m4, 4 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 32], m5 + palignr m2, m0, m1, 6 + palignr m5, m3, m4, 6 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + + lea r0, [r0 + r1 * 4] + palignr m2, m0, m1, 8 + palignr m5, m3, m4, 8 + movu [r0], m2 + movu [r0 + 32], m5 + palignr m2, m0, m1, 10 + palignr m5, m3, m4, 10 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m5 + palignr m2, m0, m1, 12 + palignr m5, m3, m4, 12 + movu [r0 + r1 * 2], m2 + movu [r0 + r1 * 2 + 32], m5 + palignr m2, m0, m1, 14 + palignr m5, m3, m4, 14 + movu [r0 + r3], m2 + movu [r0 + r3 + 32], m5 + RET + +cglobal intra_pred_ang32_3, 3,8,13 + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_3_33 + + add r2, 26 + lea r0, [r0 + 32] + + call ang16_mode_3_33 + + add r2, 6 + lea r0, [r7 + 8 * r1] + + call ang16_mode_3_33 + + add r2, 26 + lea r0, [r0 + 32] + + call ang16_mode_3_33 + RET + +cglobal intra_pred_ang32_33, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_3_33 + + add r2, 26 + + call ang16_mode_3_33 + + add r2, 6 + mov r0, r5 + + call ang16_mode_3_33 + + add r2, 
26 + + call ang16_mode_3_33 + RET + +;; angle 32, modes 4 and 32 +cglobal ang32_mode_4_32 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 - 13 * 32] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 13 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 8 * 32] ; [26] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] + pmaddwd m6, [r3 - 3 * 32] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] + pmaddwd m7, [r3 - 3 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] + pmaddwd m7, m8, [r3 - 14 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m9, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] + pmaddwd m10, m9, [r3 - 14 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + pmaddwd m8, [r3 + 7 * 32] ; [25] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 7 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m9, m0, m3, 12 + pmaddwd m9, [r3 - 4 * 32] ; [14] + paddd m9, [pd_16] + psrld m9, 5 + palignr m3, m2, m0, 12 + pmaddwd m3, [r3 - 4 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, m0, [r3 - 15 * 32] ; [3] + paddd m10, [pd_16] + psrld m10, 5 + 
pmaddwd m3, m2, [r3 - 15 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m10, m3 + + pmaddwd m11, m0, [r3 + 6 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m3, m2, [r3 + 6 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m11, m3 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 + + palignr m4, m2, m0, 4 + pmaddwd m4, [r3 - 5* 32] ; [13] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m1, m2, 4 + pmaddwd m5, [r3 - 5 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m2, m0, 8 + pmaddwd m5, m6, [r3 - 16 * 32] ; [2] + paddd m5, [pd_16] + psrld m5, 5 + palignr m7, m1, m2, 8 + pmaddwd m8, m7, [r3 - 16 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, [r3 + 5 * 32] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 5 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m7, m2, m0, 12 + pmaddwd m7, [r3 - 6 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + palignr m8, m1, m2, 12 + pmaddwd m8, [r3 - 6 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] + pmaddwd m8, m2, [r3 - 17 * 32] ; [1] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m1, [r3 - 17 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18] + punpcklwd m0, m3 ; [29 29 28 28 27 27 26 22 21 20 20 19 19 18 18 17] + + pmaddwd m9, m2, [r3 + 4 * 32] ; [22] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, m1, [r3 + 4 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + palignr m10, m1, m2, 4 + pmaddwd m10, [r3 - 7 * 32] ; [11] + paddd m10, [pd_16] + psrld m10, 5 + palignr m11, m0, m1, 4 + pmaddwd m11, [r3 - 7 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m3, m1, m2, 8 + pmaddwd m3, [r3 - 18 * 32] ; [0] + paddd m3, [pd_16] + psrld m3, 5 + palignr m0, m1, 8 + pmaddwd m0, [r3 - 18 * 32] + paddd m0, [pd_16] + psrld m0, 5 + 
packusdw m3, m0 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16 + ret + +cglobal intra_pred_ang32_4, 3,8,13 + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 18 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_4_32 + + add r2, 22 + lea r0, [r0 + 32] + + call ang32_mode_4_32 + + add r2, 10 + lea r0, [r7 + 8 * r1] + + call ang16_mode_4_32 + + add r2, 22 + lea r0, [r0 + 32] + + call ang32_mode_4_32 + RET + +cglobal intra_pred_ang32_32, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 18 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_4_32 + + add r2, 22 + + call ang32_mode_4_32 + + add r2, 10 + mov r0, r5 + + call ang16_mode_4_32 + + add r2, 22 + + call ang32_mode_4_32 + RET + +;; angle 32, modes 5 and 31 +cglobal ang32_mode_5_31 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 - 15 * 32] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 15 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 2 * 32] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 2 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m7, m0, m3, 4 + pmaddwd m6, m7, [r3 - 13 * 32] ; [3] + paddd m6, [pd_16] + psrld m6, 5 + palignr m8, m2, m0, 4 + pmaddwd m9, m8, [r3 - 13 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, [r3 + 4 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + 
pmaddwd m8, [r3 + 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m9, m0, m3, 8 + pmaddwd m8, m9, [r3 - 11 * 32] ; [5] + paddd m8, [pd_16] + psrld m8, 5 + palignr m10, m2, m0, 8 + pmaddwd m11, m10, [r3 - 11 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m8, m11 + + pmaddwd m9, [r3 + 6 * 32] ; [22] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, [r3 + 6 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + palignr m11, m0, m3, 12 + pmaddwd m10, m11, [r3 - 9 * 32] ; [7] + paddd m10, [pd_16] + psrld m10, 5 + palignr m12, m2, m0, 12 + pmaddwd m3, m12, [r3 - 9 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m10, m3 + + pmaddwd m11, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, [r3 + 8 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 + + pmaddwd m4, m0, [r3 - 7 * 32] ; [9] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 - 7 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m0, [r3 + 10 * 32] ; [26] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, m2, [r3 + 10 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + palignr m7, m2, m0, 4 + pmaddwd m6, m7, [r3 - 5 * 32] ; [11] + paddd m6, [pd_16] + psrld m6, 5 + palignr m8, m1, m2, 4 + pmaddwd m9, m8, [r3 - 5 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, [r3 + 12 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, [r3 + 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m9, m2, m0, 8 + pmaddwd m8, m9, [r3 - 3 * 32] ; [13] + paddd m8, [pd_16] + psrld m8, 5 + palignr m3, m1, m2, 8 + pmaddwd m10, m3, [r3 - 3 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, [r3 + 14 * 32] ; [30] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, [r3 + 14 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + palignr m10, m2, m0, 12 + pmaddwd m10, [r3 - 1 * 32] ; [15] + paddd 
m10, [pd_16] + psrld m10, 5 + palignr m11, m1, m2, 12 + pmaddwd m11, [r3 - 1 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m2, [r3 - 16 * 32] ; [0] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, [r3 - 16 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16 + ret + +cglobal intra_pred_ang32_5, 3,8,13 + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_5_31 + + add r2, 18 + lea r0, [r0 + 32] + + call ang32_mode_5_31 + + add r2, 14 + lea r0, [r7 + 8 * r1] + + call ang16_mode_5_31 + + add r2, 18 + lea r0, [r0 + 32] + + call ang32_mode_5_31 + RET + +cglobal intra_pred_ang32_31, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_5_31 + + add r2, 18 + + call ang32_mode_5_31 + + add r2, 14 + mov r0, r5 + + call ang16_mode_5_31 + + add r2, 18 + + call ang32_mode_5_31 + RET + +;; angle 32, modes 6 and 30 +cglobal ang32_mode_6_30 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 + 14 * 32] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 14 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m0, m3, 4 + pmaddwd m5, m6, [r3 - 5 * 32] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + palignr m7, m2, m0, 4 + pmaddwd m8, m7, [r3 - 5 * 32] + paddd 
m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, [r3 + 8 * 32] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 8 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m9, m0, m3, 8 + pmaddwd m7, m9, [r3 - 11 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m12, m2, m0, 8 + pmaddwd m11, m12, [r3 - 11 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m7, m11 + + pmaddwd m8, m9, [r3 + 2 * 32] ; [17] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m10, m12, [r3 + 2 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, [r3 + 15 * 32] ; [30] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m12, [r3 + 15 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m9, m12 + + palignr m11, m0, m3, 12 + pmaddwd m10, m11, [r3 - 4 * 32] ; [11] + paddd m10, [pd_16] + psrld m10, 5 + palignr m12, m2, m0, 12 + pmaddwd m3, m12, [r3 - 4 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m10, m3 + + pmaddwd m11, [r3 + 9 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, [r3 + 9 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + pmaddwd m4, m0, [r3 - 10 * 32] ; [5] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 - 10 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m0, [r3 + 3 * 32] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, m2, [r3 + 3 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + pmaddwd m6, m0, [r3 + 16 * 32] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, m2, [r3 + 16 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m2, m0, 4 + pmaddwd m7, m8, [r3 - 3 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + palignr m9, m1, m2, 4 + pmaddwd m3, m9, [r3 - 3 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m8, [r3 + 10 * 32] ; [25] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw 
m8, m9 + + palignr m10, m2, m0, 8 + pmaddwd m9, m10, [r3 - 9 * 32] ; [6] + paddd m9, [pd_16] + psrld m9, 5 + palignr m12, m1, m2, 8 + pmaddwd m3, m12, [r3 - 9 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, [r3 + 4 * 32] ; [19] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, [r3 + 4 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + palignr m11, m2, m0, 12 + pmaddwd m11, [r3 - 15 * 32] ; [0] + paddd m11, [pd_16] + psrld m11, 5 + palignr m3, m1, m2, 12 + pmaddwd m3, [r3 - 15 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m11, m3 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16 + ret + +cglobal intra_pred_ang32_6, 3,8,14 + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_6_30 + + add r2, 12 + lea r0, [r0 + 32] + + call ang32_mode_6_30 + + add r2, 20 + lea r0, [r7 + 8 * r1] + + call ang16_mode_6_30 + + add r2, 12 + lea r0, [r0 + 32] + + call ang32_mode_6_30 + RET + +cglobal intra_pred_ang32_30, 3,7,14 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_6_30 + + add r2, 12 + + call ang32_mode_6_30 + + add r2, 20 + mov r0, r5 + + call ang16_mode_6_30 + + add r2, 12 + + call ang32_mode_6_30 + RET + +;; angle 32, modes 7 and 29 +cglobal ang32_mode_7_29 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + + pmaddwd m4, m3, [r3 + 8 * 
32] ; [25] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 8 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m8, m0, m3, 4 + pmaddwd m5, m8, [r3 - 15 * 32] ; [2] + paddd m5, [pd_16] + psrld m5, 5 + palignr m9, m2, m0, 4 + pmaddwd m10, m9, [r3 - 15 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m5, m10 + + pmaddwd m6, m8, [r3 - 6 * 32] ; [11] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, m9, [r3 - 6 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m7, m8, [r3 + 3 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m10, m9, [r3 + 3 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + pmaddwd m8, [r3 + 12 * 32] ; [29] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 12 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m11, m0, m3, 8 + pmaddwd m9, m11, [r3 - 11 * 32] ; [6] + paddd m9, [pd_16] + psrld m9, 5 + palignr m12, m2, m0, 8 + pmaddwd m10, m12, [r3 - 11 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m11, [r3 - 2 * 32] ; [15] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m13, m12, [r3 - 2 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m10, m13 + + pmaddwd m11, [r3 + 7 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, [r3 + 7 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m5, m0, m3, 12 + pmaddwd m4, m5, [r3 - 16 * 32] ; [1] + paddd m4, [pd_16] + psrld m4, 5 + palignr m6, m2, m0, 12 + pmaddwd m7, m6, [r3 - 16 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + pmaddwd m5, [r3 - 7 * 32] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, [r3 - 7 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m9, m0, m3, 12 + pmaddwd m6, m9, [r3 + 2 * 32] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + palignr m3, m2, m0, 12 + pmaddwd m7, m3, [r3 + 2 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + 
pmaddwd m7, m9, [r3 + 11 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m3, [r3 + 11 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m0, [r3 - 12 * 32] ; [5] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m10, m2, [r3 - 12 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, m0, [r3 - 3 * 32] ; [14] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, m2, [r3 - 3 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, m0, [r3 + 6 * 32] ; [23] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m2, [r3 + 6 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + palignr m11, m2, m0, 4 + pmaddwd m11, [r3 - 17 * 32] ; [0] + paddd m11, [pd_16] + psrld m11, 5 + palignr m12, m1, m2, 4 + pmaddwd m12, [r3 - 17 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 3, 2, 16 + ret + +cglobal intra_pred_ang32_7, 3,8,14 + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 17 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_7_29 + + add r2, 8 + lea r0, [r0 + 32] + + call ang32_mode_7_29 + + add r2, 24 + lea r0, [r7 + 8 * r1] + + call ang16_mode_7_29 + + add r2, 8 + lea r0, [r0 + 32] + + call ang32_mode_7_29 + RET + +cglobal intra_pred_ang32_29, 3,7,14 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 17 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_7_29 + + add r2, 8 + + call ang32_mode_7_29 + + add r2, 24 + mov r0, r5 + + call ang16_mode_7_29 + + add r2, 8 + + call ang32_mode_7_29 + RET + +;; angle 32, modes 8 and 28 +cglobal ang32_mode_8_28 + test r6d, r6d + + movu m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + + punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] + punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + + movu m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 
15 14 13 12 11 10 9] + movu m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] + punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + + pmaddwd m4, m3, [r3 + 6 * 32] ; [21] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 + 6 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 11 * 32] ; [26] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 11 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3 + 16 * 32] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m0, [r3 + 16 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + palignr m11, m0, m3, 4 + pmaddwd m7, m11, [r3 - 11 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + palignr m1, m2, m0, 4 + pmaddwd m8, m1, [r3 - 11 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m11, [r3 - 6 * 32] ; [9] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m1, [r3 - 6 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m11, [r3 - 1 * 32] ; [14] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m1, [r3 - 1 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m11, [r3 + 4 * 32] ; [19] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m1, [r3 + 4 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m11, [r3 + 9 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m1, [r3 + 9 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m11, m1 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 + + palignr m4, m0, m3, 4 + pmaddwd m4, [r3 + 14 * 32] ; [29] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m2, m0, 4 + pmaddwd m5, [r3 + 14 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m1, m0, m3, 8 + pmaddwd m5, m1, [r3 - 13 * 32] ; [2] + paddd m5, [pd_16] + psrld m5, 5 + palignr m10, m2, m0, 8 + pmaddwd m6, m10, [r3 - 13 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m1, [r3 - 8 
* 32] ; [7] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m10, [r3 - 8 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m1, [r3 - 3 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m10, [r3 - 3 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m1, [r3 + 2 * 32] ; [17] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m10, [r3 + 2 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m1, [r3 + 7 * 32] ; [22] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m11, m10, [r3 + 7 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m9, m11 + + pmaddwd m1, [r3 + 12 * 32] ; [27] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m10, [r3 + 12 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m1, m10 + + palignr m11, m0, m3, 12 + pmaddwd m11, [r3 - 15 * 32] ; [0] + paddd m11, [pd_16] + psrld m11, 5 + palignr m2, m0, 12 + pmaddwd m2, [r3 - 15 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m11, m2 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 11, 0, 2, 16 + ret + +cglobal intra_pred_ang32_8, 3,8,13 + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_8_28 + + add r2, 4 + lea r0, [r0 + 32] + + call ang32_mode_8_28 + + add r2, 28 + lea r0, [r7 + 8 * r1] + + call ang16_mode_8_28 + + add r2, 4 + lea r0, [r0 + 32] + + call ang32_mode_8_28 + RET + +cglobal intra_pred_ang32_28, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 15 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_8_28 + + add r2, 4 + + call ang32_mode_8_28 + + add r2, 28 + mov r0, r5 + + call ang16_mode_8_28 + + add r2, 4 + + call ang32_mode_8_28 + RET + +cglobal intra_pred_ang32_9, 3,8,13 + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_9_27 + + add r2, 2 + lea r0, [r0 + 32] + + call ang16_mode_9_27 + + add r2, 30 + lea 
r0, [r7 + 8 * r1] + + call ang16_mode_9_27 + + add r2, 2 + lea r0, [r0 + 32] + + call ang16_mode_9_27 + RET + +cglobal intra_pred_ang32_27, 3,7,13 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_9_27 + + add r2, 2 + + call ang16_mode_9_27 + + add r2, 30 + mov r0, r5 + + call ang16_mode_9_27 + + add r2, 2 + + call ang16_mode_9_27 + RET + +cglobal intra_pred_ang32_10, 3,4,2 + add r2, mmsize*4 + add r1d, r1d + lea r3, [r1 * 3] + + vpbroadcastw m0, [r2 + 2] ; [1...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 2] ; [2...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 4] ; [3...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 6] ; [4...] + movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + + lea r0, [r0 + r1 * 4] + vpbroadcastw m0, [r2 + 2 + 8] ; [5...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 10] ; [6...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 12] ; [7...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 14] ; [8...] + movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + + lea r0, [r0 + r1 *4] + vpbroadcastw m0, [r2 + 2 + 16] ; [9...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 18] ; [10...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 20] ; [11...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 22] ; [12...] + movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + + lea r0, [r0 + r1 *4] + vpbroadcastw m0, [r2 + 2 + 24] ; [13...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 26] ; [14...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 28] ; [15...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 30] ; [16...] 
+ movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + + lea r0, [r0 + r1 *4] + vpbroadcastw m0, [r2 + 2 + 32] ; [17...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 34] ; [18...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 36] ; [19...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 38] ; [20...] + movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + + lea r0, [r0 + r1 *4] + vpbroadcastw m0, [r2 + 2 + 40] ; [21...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 42] ; [22...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 44] ; [23...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 46] ; [24...] + movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + + lea r0, [r0 + r1 *4] + vpbroadcastw m0, [r2 + 2 + 48] ; [25...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 50] ; [26...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 52] ; [27...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 54] ; [28...] + movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + + lea r0, [r0 + r1 *4] + vpbroadcastw m0, [r2 + 2 + 56] ; [29...] + movu [r0], m0 + movu [r0 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 58] ; [30...] + movu [r0 + r1], m1 + movu [r0 + r1 + 32], m1 + vpbroadcastw m0, [r2 + 2 + 60] ; [31...] + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m0 + vpbroadcastw m1, [r2 + 2 + 62] ; [32...] 
+ movu [r0 + r3], m1 + movu [r0 + r3 + 32], m1 + RET + +cglobal intra_pred_ang32_26, 3,3,2 + movu m0, [r2 + 2] + movu m1, [r2 + 34] + add r1d, r1d + lea r2, [r1 * 3] + + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + + lea r0, [r0 + r1 *4] + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + + lea r0, [r0 + r1 *4] + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + + lea r0, [r0 + r1 *4] + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + + lea r0, [r0 + r1 *4] + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + + lea r0, [r0 + r1 *4] + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + + lea r0, [r0 + r1 *4] + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + + lea r0, [r0 + r1 *4] + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 32], m1 + movu [r0 + r2], m0 + movu [r0 + r2 + 32], m1 + RET + +cglobal intra_pred_ang32_11, 3,8,12, 0-8 + movzx r5d, word [r2 + 128] ; [0] + movzx r6d, word [r2] + mov [rsp], r5w + mov [r2 + 128], r6w + + movzx r5d, word 
[r2 + 126] ; [16] + movzx r6d, word [r2 + 32] + mov [rsp + 4], r5w + mov [r2 + 126], r6w + + add r2, 128 + xor r6d, r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + lea r4, [r1 * 3] + lea r7, [r0 + 8 * r1] + + call ang16_mode_11_25 + + sub r2, 2 + lea r0, [r0 + 32] + + call ang16_mode_11_25 + + add r2, 34 + lea r0, [r7 + 8 * r1] + + call ang16_mode_11_25 + + sub r2, 2 + lea r0, [r0 + 32] + + call ang16_mode_11_25 + + mov r6d, [rsp] + mov [r2 - 30], r6w + mov r6d, [rsp + 4] + mov [r2 - 32], r6w + RET + +cglobal intra_pred_ang32_25, 3,7,12, 0-4 + xor r6d, r6d + inc r6d + lea r3, [ang_table_avx2 + 16 * 32] + add r1d, r1d + + movzx r4d, word [r2 - 2] + movzx r5d, word [r2 + 160] ; [16] + mov [rsp], r4w + mov [r2 - 2], r5w + + lea r4, [r1 * 3] + lea r5, [r0 + 32] + + call ang16_mode_11_25 + + sub r2, 2 + + call ang16_mode_11_25 + + add r2, 34 + mov r0, r5 + + call ang16_mode_11_25 + + sub r2, 2 + + call ang16_mode_11_25 + + mov r5d, [rsp] + mov [r2 - 32], r5w + RET + +;; angle 32, modes 12 and 24, row 0 to 15 +cglobal ang32_mode_12_24_0_15 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 + 11 * 32] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 + 11 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 6 * 32] ; [22] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m2, [r3 + 6 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3 + 1 * 32] ; [17] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m2, [r3 + 1 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, m3, [r3 - 4 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m2, [r3 - 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m3, [r3 - 9 * 
32] ; [7] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m2, [r3 - 9 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 - 14 * 32] ; [2] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m2, [r3 - 14 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m9, m2 + + movu xm1, [r2 - 8] + pshufb xm1, [pw_ang32_12_24] + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 6 6 13 13 19 19 26 26] + + palignr m2, m3, m1, 14 ; [11 10 10 9 9 8 8 7 3 2 2 1 1 0 0 6] + palignr m13, m0, m3, 14 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] + + pmaddwd m10, m2, [r3 + 13 * 32] ; [29] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 + 13 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m11, m2, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 + 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m13, m0, m3, 14 + + pmaddwd m4, m2, [r3 + 3 * 32] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 + 3 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m2, [r3 - 2 * 32] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m13, [r3 - 2 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m2, [r3 - 7 * 32] ; [9] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m13, [r3 - 7 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m2, [r3 - 12 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 - 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m0, m3, 10 + palignr m3, m1, 10 + + pmaddwd m8, m3, [r3 + 15 * 32] ; [31] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 15 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 + 10 * 32] ; [26] + paddd m9, 
[pd_16] + psrld m9, 5 + pmaddwd m10, m0, [r3 + 10 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m3, [r3 + 5 * 32] ; [21] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m2, m0, [r3 + 5 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m10, m2 + + pmaddwd m3, [r3] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 2, 16 + ret + +;; angle 32, modes 12 and 24, row 16 to 31 +cglobal ang32_mode_12_24_16_31 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + + palignr m2, m3, m1, 10 + palignr m13, m0, m3, 10 + + pmaddwd m4, m2, [r3 - 5 * 32] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 - 5 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m2, [r3 - 10 * 32] ; [6] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m13, [r3 - 10 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m2, [r3 - 15 * 32] ; [1] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m13, [r3 - 15 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + palignr m2, m3, m1, 6 + palignr m13, m0, m3, 6 + + pmaddwd m7, m2, [r3 + 12 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 + 7 * 32] ; [23] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 + 7 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 + 2 * 32] ; [18] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 + 2 * 32] + paddd m10, [pd_16] + psrld m10, 
5 + packusdw m9, m10 + + pmaddwd m10, m2, [r3 - 3 * 32] ; [13] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 - 3 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m11, m2, [r3 - 8 * 32] ; [8] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 - 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m13, m0, m3, 6 + + pmaddwd m4, m2, [r3 - 13 * 32] ; [3] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 - 13 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m2, m3, m1, 2 + palignr m13, m0, m3, 2 + + pmaddwd m5, m2, [r3 + 14 * 32] ; [30] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m13, [r3 + 14 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m2, [r3 + 9 * 32] ; [25] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m13, [r3 + 9 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m2, [r3 + 4 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 - 1 * 32] ; [15] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 - 1 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 - 6 * 32] ; [10] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 - 6 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m2, [r3 - 11 * 32] ; [5] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 - 11 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m2, [r3 - 16 * 32] ; [0] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m13, [r3 - 16 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m2, m13 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 3, 16 + ret + +cglobal intra_pred_ang32_12, 3,8,14, 0-16 + movu xm0, [r2 + 114] + mova [rsp], xm0 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 
32] + + pinsrw xm1, [r2], 7 ; [0] + pinsrw xm1, [r2 + 12], 6 ; [6] + pinsrw xm1, [r2 + 26], 5 ; [13] + pinsrw xm1, [r2 + 38], 4 ; [19] + pinsrw xm1, [r2 + 52], 3 ; [26] + movu [r2 + 114], xm1 + + xor r6d, r6d + add r2, 128 + lea r7, [r0 + 8 * r1] + + call ang32_mode_12_24_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_12_24_16_31 + + add r2, 32 + lea r0, [r7 + 8 * r1] + + call ang32_mode_12_24_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_12_24_16_31 + + mova xm0, [rsp] + movu [r2 - 46], xm0 + RET + +cglobal intra_pred_ang32_24, 3,7,14, 0-16 + movu xm0, [r2 - 16] + mova [rsp], xm0 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + pinsrw xm1, [r2 + 140], 7 ; [6] + pinsrw xm1, [r2 + 154], 6 ; [13] + pinsrw xm1, [r2 + 166], 5 ; [19] + pinsrw xm1, [r2 + 180], 4 ; [26] + movu [r2 - 16], xm1 + + xor r6d, r6d + inc r6d + lea r5, [r0 + 32] + + call ang32_mode_12_24_0_15 + + call ang32_mode_12_24_16_31 + + add r2, 32 + mov r0, r5 + + call ang32_mode_12_24_0_15 + + call ang32_mode_12_24_16_31 + + mova xm0, [rsp] + movu [r2 - 48], xm0 + RET + +;; angle 32, modes 13 and 23, row 0 to 15 +cglobal ang32_mode_13_23_row_0_15 + test r6d, r6d + + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m4, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + + punpcklwd m3, m0, m4 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + punpckhwd m2, m0, m4 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + + pmaddwd m4, m3, [r3 + 7 * 32] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 + 7 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 - 2 * 32] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m2, [r3 - 2 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 11 * 32] ; [5] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m2, [r3 - 11 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m6, m2 + + movu xm1, [r2 - 8] + pshufb xm1, [pw_ang32_12_24] + punpcklwd m3, m0, m0 ; [11 11 10 10 9 9 8 8 
3 3 2 2 1 1 0 0] + punpckhwd m0, m0 ; [15 15 14 14 13 13 12 12 7 7 6 6 5 5 4 4] + vinserti128 m1, m1, xm0, 1 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14] + + palignr m2, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m7, m2, [r3 + 12 * 32] ; [28] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 + 3 * 32] ; [19] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 + 3 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 - 6 * 32] ; [10] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 - 6 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m2, [r3 - 15 * 32] ; [1] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 - 15 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + palignr m2, m3, m1, 10 + palignr m13, m0, m3, 10 + + pmaddwd m11, m2, [r3 + 8 * 32] ; [24] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 + 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m13, m0, m3, 10 + + pmaddwd m4, m2, [r3 - 1 * 32] ; [15] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 - 1 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m2, [r3 - 10 * 32] ; [6] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m13, [r3 - 10 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m2, m3, m1, 6 + palignr m13, m0, m3, 6 + + pmaddwd m6, m2, [r3 + 13 * 32] ; [29] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m13, [r3 + 13 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m2, [r3 + 4 * 32] ; [20] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 + 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 - 5 * 32] ; [11] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 - 5 * 32] + paddd m9, [pd_16] + psrld m9, 5 
+ packusdw m8, m9 + + pmaddwd m9, m2, [r3 - 14 * 32] ; [2] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m13, [r3 - 14 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m9, m13 + + palignr m0, m3, 2 + palignr m3, m1, 2 + + pmaddwd m1, m3, [r3 + 9 * 32] ; [25] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m2, m0, [r3 + 9 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m1, m2 + + pmaddwd m3, [r3] ; [16] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 + ret + +;; angle 32, modes 13 and 23, row 16 to 31 +cglobal ang32_mode_13_23_row_16_31 + test r6d, r6d + + movu m0, [r2] ; [11 10 9 8 7 6 5 4 3 2 1 0 4 7 11 14] + movu m5, [r2 + 2] ; [12 11 10 9 8 7 6 5 4 3 2 1 0 4 7 11] + + punpcklwd m4, m0, m5 ; [ 8 7 7 6 6 5 5 4 0 4 4 7 7 11 11 14] + punpckhwd m2, m0, m5 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] + + pmaddwd m4, [r3 - 9 * 32] ; [7] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m2, [r3 - 9 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m4, m2 + + movu xm1, [r2 - 8] + pshufb xm1, [pw_ang32_12_24] ; [18 18 21 21 25 25 28 28] + punpcklwd m3, m0, m0 ; [ 7 7 6 6 5 5 4 4 4 4 7 7 11 11 14 14] + punpckhwd m0, m0 ; [11 11 10 10 9 9 8 8 3 3 2 2 1 1 0 0] + vinserti128 m1, m1, xm0, 1 ; [ 3 3 2 2 1 1 0 0 18 18 21 21 25 25 28 28] + + palignr m2, m3, m1, 14 + palignr m13, m0, m3, 14 + + pmaddwd m5, m2, [r3 + 14 * 32] ; [30] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m13, [r3 + 14 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m2, [r3 + 5 * 32] ; [21] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, m13, [r3 + 5 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m7, m2, [r3 - 4 * 32] ; [12] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 - 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + pmaddwd m8, m2, [r3 - 13 * 32] ; [3] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m13, [r3 - 13 * 32] + paddd 
m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m2, m3, m1, 10 + palignr m13, m0, m3, 10 + + pmaddwd m9, m2, [r3 + 10 * 32] ; [26] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m13, [r3 + 10 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m2, [r3 + 1 * 32] ; [17] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m13, [r3 + 1 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m11, m2, [r3 - 8 * 32] ; [8] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m13, [r3 - 8 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m11, m13 + + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0 + + palignr m2, m3, m1, 6 + palignr m13, m0, m3, 6 + + pmaddwd m4, m2, [r3 + 15 * 32] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m13, [r3 + 15 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m2, [r3 + 6 * 32] ; [22] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m13, [r3 + 6 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m2, [r3 - 3 * 32] ; [13] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m13, [r3 - 3 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m2, [r3 - 12 * 32] ; [4] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, m13, [r3 - 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m0, m3, 2 + palignr m3, m1, 2 + + pmaddwd m8, m3, [r3 + 11 * 32] ; [27] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 11 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m3, [r3 + 2 * 32] ; [18] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m0, [r3 + 2 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m1, m3, [r3 - 7 * 32] ; [9] + paddd m1, [pd_16] + psrld m1, 5 + pmaddwd m2, m0, [r3 - 7 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m1, m2 + + pmaddwd m3, [r3 - 16 * 32] ; [0] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 - 16 * 32] + paddd m0, [pd_16] + psrld 
m0, 5 + packusdw m3, m0 + TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 3, 0, 2, 16 + ret + +cglobal intra_pred_ang32_13, 3,8,14, 0-mmsize + movu m0, [r2 + 112] + mova [rsp], m0 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 8] + movu xm2, [r2 + 36] + pshufb xm1, [pw_ang32_13_23] + pshufb xm2, [pw_ang32_13_23] + pinsrw xm1, [r2 + 28], 4 + pinsrw xm2, [r2 + 56], 4 + punpckhqdq xm2, xm1 ; [ 4 7 8 11 18 21 25 28] + + movzx r6d, word [r2] + mov [r2 + 128], r6w + movu [r2 + 112], xm2 + + xor r6d, r6d + add r2, 128 + lea r7, [r0 + 8 * r1] + + call ang32_mode_13_23_row_0_15 + + sub r2, 8 + lea r0, [r0 + 32] + + call ang32_mode_13_23_row_16_31 + + add r2, 40 + lea r0, [r7 + 8 * r1] + + call ang32_mode_13_23_row_0_15 + + sub r2, 8 + lea r0, [r0 + 32] + + call ang32_mode_13_23_row_16_31 + + mova m0, [rsp] + movu [r2 - 40], m0 + RET + +cglobal intra_pred_ang32_23, 3,7,14, 0-16 + movu xm0, [r2 - 16] + mova [rsp], xm0 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 136] + movu xm2, [r2 + 164] + pshufb xm1, [pw_ang32_13_23] + pshufb xm2, [pw_ang32_13_23] + pinsrw xm1, [r2 + 156], 4 + pinsrw xm2, [r2 + 184], 4 + punpckhqdq xm2, xm1 ; [ 4 7 8 11 18 21 25 28] + + movu [r2 - 16], xm2 + + xor r6d, r6d + inc r6d + lea r5, [r0 + 32] + + call ang32_mode_13_23_row_0_15 + + sub r2, 8 + + call ang32_mode_13_23_row_16_31 + + add r2, 40 + mov r0, r5 + + call ang32_mode_13_23_row_0_15 + + sub r2, 8 + + call ang32_mode_13_23_row_16_31 + + mova xm0, [rsp] + movu [r2 - 40], xm0 + RET + +%macro TRANSPOSE_STORE_AVX2_STACK 11 + jnz .skip%11 + punpckhwd m%9, m%1, m%2 + punpcklwd m%1, m%2 + punpckhwd m%2, m%3, m%4 + punpcklwd m%3, m%4 + + punpckldq m%4, m%1, m%3 + punpckhdq m%1, m%3 + punpckldq m%3, m%9, m%2 + punpckhdq m%9, m%2 + + punpckhwd m%10, m%5, m%6 + punpcklwd m%5, m%6 + punpckhwd m%6, m%7, m%8 + punpcklwd m%7, m%8 + + punpckldq m%8, m%5, m%7 + punpckhdq m%5, m%7 + punpckldq m%7, m%10, m%6 + punpckhdq 
m%10, m%6 + + punpcklqdq m%6, m%4, m%8 + punpckhqdq m%2, m%4, m%8 + punpcklqdq m%4, m%1, m%5 + punpckhqdq m%8, m%1, m%5 + + punpcklqdq m%1, m%3, m%7 + punpckhqdq m%5, m%3, m%7 + punpcklqdq m%3, m%9, m%10 + punpckhqdq m%7, m%9, m%10 + + movu [r0 + r1 * 0 + %11], xm%6 + movu [r0 + r1 * 1 + %11], xm%2 + movu [r0 + r1 * 2 + %11], xm%4 + movu [r0 + r4 * 1 + %11], xm%8 + + lea r5, [r0 + r1 * 4] + movu [r5 + r1 * 0 + %11], xm%1 + movu [r5 + r1 * 1 + %11], xm%5 + movu [r5 + r1 * 2 + %11], xm%3 + movu [r5 + r4 * 1 + %11], xm%7 + + lea r5, [r5 + r1 * 4] + vextracti128 [r5 + r1 * 0 + %11], m%6, 1 + vextracti128 [r5 + r1 * 1 + %11], m%2, 1 + vextracti128 [r5 + r1 * 2 + %11], m%4, 1 + vextracti128 [r5 + r4 * 1 + %11], m%8, 1 + + lea r5, [r5 + r1 * 4] + vextracti128 [r5 + r1 * 0 + %11], m%1, 1 + vextracti128 [r5 + r1 * 1 + %11], m%5, 1 + vextracti128 [r5 + r1 * 2 + %11], m%3, 1 + vextracti128 [r5 + r4 * 1 + %11], m%7, 1 + jmp .end%11 +.skip%11: +%if %11 == 16 + lea r7, [r0 + 8 * r1] +%else + lea r7, [r0] +%endif + movu [r7 + r1 * 0], m%1 + movu [r7 + r1 * 1], m%2 + movu [r7 + r1 * 2], m%3 + movu [r7 + r4 * 1], m%4 + +%if %11 == 16 + lea r7, [r7 + r1 * 4] +%else + lea r7, [r7 + r1 * 4] +%endif + movu [r7 + r1 * 0], m%5 + movu [r7 + r1 * 1], m%6 + movu [r7 + r1 * 2], m%7 + movu [r7 + r4 * 1], m%8 +.end%11: +%endmacro + +;; angle 32, modes 14 and 22, row 0 to 15 +cglobal ang32_mode_14_22_rows_0_15 + test r6d, r6d + + movu m0, [r2 - 12] + movu m1, [r2 - 10] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m1, [r2 + 4] + movu m4, [r2 + 6] + punpcklwd m2, m1, m4 + punpckhwd m1, m4 + + pmaddwd m4, m3, [r3] ; [16] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 13 * 32] ; [29] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 13 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m7, m0, m3, 4 + pmaddwd m6, m7, [r3 - 6 * 32] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + 
palignr m8, m2, m0, 4 + pmaddwd m9, m8, [r3 - 6 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, [r3 + 7 * 32] ; [23] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, [r3 + 7 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m10, m0, m3, 8 + pmaddwd m8, m10, [r3 - 12 * 32] ; [4] + paddd m8, [pd_16] + psrld m8, 5 + palignr m12, m2, m0, 8 + pmaddwd m9, m12, [r3 - 12 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m10, [r3 + 1 * 32] ; [17] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m11, m12, [r3 + 1 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m9, m11 + + pmaddwd m10, [r3 + 14 * 32] ; [30] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, [r3 + 14 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + palignr m11, m0, m3, 12 + pmaddwd m11, [r3 - 5 * 32] ; [11] + paddd m11, [pd_16] + psrld m11, 5 + palignr m12, m2, m0, 12 + pmaddwd m12, [r3 - 5 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 + + palignr m4, m0, m3, 12 + pmaddwd m4, [r3 + 8 * 32] ; [24] + paddd m4, [pd_16] + psrld m4, 5 + palignr m5, m2, m0, 12 + pmaddwd m5, [r3 + 8 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m0, [r3 - 11 * 32] ; [5] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, m2, [r3 - 11 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + pmaddwd m6, m0, [r3 + 2 * 32] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, m2, [r3 + 2 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m7, m0, [r3 + 15 * 32] ; [31] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m3, m2, [r3 + 15 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + palignr m9, m2, m0, 4 + palignr m10, m1, m2, 4 + pmaddwd m8, m9, [r3 - 4 * 32] ; [12] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m11, m10, [r3 - 4 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m8, m11 + + pmaddwd m9, [r3 + 9 * 32] ; 
[25] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, [r3 + 9 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + palignr m1, m2, 8 + palignr m2, m0, 8 + + pmaddwd m10, m2, [r3 - 10 * 32] ; [6] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m12, m1, [r3 - 10 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m10, m12 + + pmaddwd m2, [r3 + 3 * 32] ; [19] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, [r3 + 3 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 + ret + +;; angle 32, modes 14 and 22, rows 16 to 31 +cglobal ang32_mode_14_22_rows_16_31 + test r6d, r6d + + movu m0, [r2 - 24] + movu m1, [r2 - 22] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m1, [r2 - 8] + movu m4, [r2 - 6] + punpcklwd m2, m1, m4 + punpckhwd m1, m4 + + pmaddwd m4, m3, [r3 - 16 * 32] ; [0] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 16 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 - 3 * 32] ; [13] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 - 3 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, m3, [r3 + 10 * 32] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m0, [r3 + 10 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + palignr m8, m0, m3, 4 + palignr m9, m2, m0, 4 + pmaddwd m7, m8, [r3 - 9 * 32] ; [7] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m10, m9, [r3 - 9 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + pmaddwd m8, [r3 + 4 * 32] ; [20] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 4 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m11, m0, m3, 8 + palignr m12, m2, m0, 8 + pmaddwd m9, m11, [r3 - 15 * 32] ; [1] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m12, [r3 - 15 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m11, [r3 - 2 * 32] ; [14] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m13, m12, [r3 - 2 * 32] + 
paddd m13, [pd_16] + psrld m13, 5 + packusdw m10, m13 + + pmaddwd m11, [r3 + 11 * 32] ; [27] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, [r3 + 11 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 + + palignr m5, m0, m3, 12 + palignr m6, m2, m0, 12 + pmaddwd m4, m5, [r3 - 8 * 32] ; [8] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m7, m6, [r3 - 8 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + pmaddwd m5, [r3 + 5 * 32] ; [21] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, [r3 + 5 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m0, [r3 - 14 * 32] ; [2] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, m2, [r3 - 14 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + pmaddwd m7, m0, [r3 - 1 * 32] ; [15] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m3, m2, [r3 - 1 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m8, m0, [r3 + 12 * 32] ; [28] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m11, m2, [r3 + 12 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m8, m11 + + palignr m10, m2, m0, 4 + palignr m11, m1, m2, 4 + + pmaddwd m9, m10, [r3 - 7 * 32] ; [9] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, m11, [r3 - 7 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, [r3 + 6 * 32] ; [22] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 6 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m1, m2, 8 + palignr m2, m0, 8 + + pmaddwd m2, [r3 - 13 * 32] ; [3] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, [r3 - 13 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 + ret + +cglobal intra_pred_ang32_14, 3,8,14 + mov r6, rsp + sub rsp, 4*mmsize+gprsize + and rsp, ~63 + mov [rsp+4*mmsize], r6 + + movu m0, [r2 + 128] + movu m1, [r2 + 160] + movd xm2, [r2 + 192] + + mova [rsp + 1*mmsize], m0 + mova [rsp + 
2*mmsize], m1 + movd [rsp + 3*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 4] + movu xm2, [r2 + 24] + movu xm3, [r2 + 44] + pshufb xm1, [pw_ang32_14_22] + pshufb xm2, [pw_ang32_14_22] + pshufb xm3, [pw_ang32_14_22] + pinsrw xm1, [r2 + 20], 4 + pinsrw xm2, [r2 + 40], 4 + pinsrw xm3, [r2 + 60], 4 + + punpckhqdq xm2, xm1 ; [ 2 5 7 10 12 15 17 20] + punpckhqdq xm3, xm3 ; [22 25 27 30 22 25 27 30] + + movzx r6d, word [r2] + mov [rsp + 1*mmsize], r6w + movu [rsp + 16], xm2 + movq [rsp + 8], xm3 + + xor r6d, r6d + lea r2, [rsp + 1*mmsize] + lea r7, [r0 + 8 * r1] + + call ang32_mode_14_22_rows_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_14_22_rows_16_31 + + add r2, 32 + lea r0, [r7 + 8 * r1] + + call ang32_mode_14_22_rows_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_14_22_rows_16_31 + + mov rsp, [rsp+4*mmsize] + RET + +cglobal intra_pred_ang32_22, 3,8,14 + mov r6, rsp + sub rsp, 4*mmsize+gprsize + and rsp, ~63 + mov [rsp+4*mmsize], r6 + + movu m0, [r2] + movu m1, [r2 + 32] + movd xm2, [r2 + 64] + + mova [rsp + 1*mmsize], m0 + mova [rsp + 2*mmsize], m1 + movd [rsp + 3*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 132] + movu xm2, [r2 + 152] + movu xm3, [r2 + 172] + pshufb xm1, [pw_ang32_14_22] + pshufb xm2, [pw_ang32_14_22] + pshufb xm3, [pw_ang32_14_22] + pinsrw xm1, [r2 + 148], 4 + pinsrw xm2, [r2 + 168], 4 + pinsrw xm3, [r2 + 188], 4 + + punpckhqdq xm2, xm1 ; [ 2 5 7 10 12 15 17 20] + punpckhqdq xm3, xm3 ; [22 25 27 30 22 25 27 30] + + movu [rsp + 16], xm2 + movq [rsp + 8], xm3 + + xor r6d, r6d + inc r6d + lea r2, [rsp + 1*mmsize] + lea r5, [r0 + 32] + + call ang32_mode_14_22_rows_0_15 + + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_14_22_rows_16_31 + + add r2, 32 + mov r0, r5 + + call ang32_mode_14_22_rows_0_15 + + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_14_22_rows_16_31 + + mov rsp, [rsp+4*mmsize] 
+ RET + +;; angle 32, modes 15 and 21, row 0 to 15 +cglobal ang32_mode_15_21_rows_0_15 + test r6d, r6d + + movu m0, [r2 - 16] + movu m1, [r2 - 14] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m1, [r2] + movu m4, [r2 + 2] + punpcklwd m2, m1, m4 + punpckhwd m1, m4 + + pmaddwd m4, m3, [r3] ; [16] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m0, m3, 4 + palignr m7, m2, m0, 4 + pmaddwd m5, m6, [r3 - 15 * 32] ; [1] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m7, [r3 - 15 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, [r3 + 2 * 32] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 2 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m0, m3, 8 + palignr m9, m2, m0, 8 + pmaddwd m7, m8, [r3 - 13 * 32] ; [3] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m10, m9, [r3 - 13 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + pmaddwd m8, [r3 + 4 * 32] ; [20] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 4 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m10, m0, m3, 12 + palignr m11, m2, m0, 12 + pmaddwd m9, m10, [r3 - 11 * 32] ; [5] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m12, m11, [r3 - 11 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m9, m12 + + pmaddwd m10, [r3 + 6 * 32] ; [22] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 6 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m11, m0, [r3 - 9 * 32] ; [7] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, m2, [r3 - 9 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 + + pmaddwd m4, m0, [r3 + 8 * 32] ; [24] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m2, [r3 + 8 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m2, m0, 4 + palignr m7, m1, m2, 4 + pmaddwd m5, m6, [r3 - 7 * 32] ; [9] + paddd 
m5, [pd_16] + psrld m5, 5 + pmaddwd m3, m7, [r3 - 7 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + pmaddwd m6, [r3 + 10 * 32] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 10 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m2, m0, 8 + palignr m9, m1, m2, 8 + pmaddwd m7, m8, [r3 - 5 * 32] ; [11] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m3, m9, [r3 - 5 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m8, [r3 + 12 * 32] ; [28] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 12 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m10, m2, m0, 12 + palignr m11, m1, m2, 12 + pmaddwd m9, m10, [r3 - 3 * 32] ; [13] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, m11, [r3 - 3 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + pmaddwd m10, [r3 + 14 * 32] ; [30] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 14 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m2, [r3 - 1 * 32] ; [15] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, [r3 - 1 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 + ret + +;; angle 32, modes 15 and 21, rows 16 to 31 +cglobal ang32_mode_15_21_rows_16_31 + test r6d, r6d + + movu m0, [r2 - 32] + movu m1, [r2 - 30] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m1, [r2 - 16] + movu m4, [r2 - 14] + punpcklwd m2, m1, m4 + punpckhwd m1, m4 + + pmaddwd m4, m3, [r3 - 16 * 32] ; [0] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 16 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 1 * 32] ; [17] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 1 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m7, m0, m3, 4 + palignr m8, m2, m0, 4 + pmaddwd m6, m7, [r3 - 14 * 32] ; [2] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m8, [r3 - 14 * 32] + paddd m9, [pd_16] + psrld m9, 5 
+ packusdw m6, m9 + + pmaddwd m7, [r3 + 3 * 32] ; [19] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, [r3 + 3 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m9, m0, m3, 8 + palignr m10, m2, m0, 8 + pmaddwd m8, m9, [r3 - 12 * 32] ; [4] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m11, m10, [r3 - 12 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m8, m11 + + pmaddwd m9, [r3 + 5 * 32] ; [21] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, [r3 + 5 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + palignr m11, m0, m3, 12 + palignr m12, m2, m0, 12 + pmaddwd m10, m11, [r3 - 10 * 32] ; [6] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m13, m12, [r3 - 10 * 32] + paddd m13, [pd_16] + psrld m13, 5 + packusdw m10, m13 + + pmaddwd m11, [r3 + 7 * 32] ; [23] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, [r3 + 7 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 + + pmaddwd m4, m0, [r3 - 8 * 32] ; [8] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m7, m2, [r3 - 8 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + pmaddwd m5, m0, [r3 + 9 * 32] ; [25] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, m2, [r3 + 9 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m7, m2, m0, 4 + palignr m8, m1, m2, 4 + pmaddwd m6, m7, [r3 - 6 * 32] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m3, m8, [r3 - 6 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m6, m3 + + pmaddwd m7, [r3 + 11 * 32] ; [27] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, [r3 + 11 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m9, m2, m0, 8 + palignr m3, m1, m2, 8 + pmaddwd m8, m9, [r3 - 4 * 32] ; [12] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m11, m3, [r3 - 4 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m8, m11 + + pmaddwd m9, [r3 + 13 * 32] ; [29] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, [r3 + 13 * 32] + paddd m3, [pd_16] + 
psrld m3, 5 + packusdw m9, m3 + + palignr m1, m2, 12 + palignr m2, m0, 12 + pmaddwd m10, m2, [r3 - 2 * 32] ; [14] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, m1, [r3 - 2 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m2, [r3 + 15 * 32] ; [31] + paddd m2, [pd_16] + psrld m2, 5 + pmaddwd m1, [r3 + 15 * 32] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m2, m1 + TRANSPOSE_STORE_AVX2_STACK 2, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 + ret + +cglobal intra_pred_ang32_15, 3,8,14 + mov r6, rsp + sub rsp, 4*mmsize+gprsize + and rsp, ~63 + mov [rsp+4*mmsize], r6 + + movu m0, [r2 + 128] + movu m1, [r2 + 160] + movd xm2, [r2 + 192] + + mova [rsp + 1*mmsize], m0 + mova [rsp + 2*mmsize], m1 + movd [rsp + 3*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 4] + movu xm2, [r2 + 18] + movu xm3, [r2 + 34] + movu xm4, [r2 + 48] + pshufb xm1, [pw_ang32_15_21] + pshufb xm2, [pw_ang32_15_21] + pshufb xm3, [pw_ang32_15_21] + pshufb xm4, [pw_ang32_15_21] + + punpckhqdq xm2, xm1 + punpckhqdq xm4, xm3 + + movzx r6d, word [r2] + mov [rsp + 1*mmsize], r6w + movu [rsp + 16], xm2 + movu [rsp], xm4 + + xor r6d, r6d + lea r2, [rsp + 1*mmsize] + lea r7, [r0 + 8 * r1] + + call ang32_mode_15_21_rows_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_15_21_rows_16_31 + + add r2, 32 + lea r0, [r7 + 8 * r1] + + call ang32_mode_15_21_rows_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_15_21_rows_16_31 + + mov rsp, [rsp+4*mmsize] + RET + +cglobal intra_pred_ang32_21, 3,8,14 + mov r6, rsp + sub rsp, 4*mmsize+gprsize + and rsp, ~63 + mov [rsp+4*mmsize], r6 + + movu m0, [r2] + movu m1, [r2 + 32] + movd xm2, [r2 + 64] + + mova [rsp + 1*mmsize], m0 + mova [rsp + 2*mmsize], m1 + movd [rsp + 3*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 132] + movu xm2, [r2 + 146] + movu xm3, [r2 + 162] + movu xm4, [r2 + 176] + pshufb xm1, [pw_ang32_15_21] + pshufb xm2, [pw_ang32_15_21] + 
pshufb xm3, [pw_ang32_15_21] + pshufb xm4, [pw_ang32_15_21] + + punpckhqdq xm2, xm1 + punpckhqdq xm4, xm3 + + movu [rsp + 16], xm2 + movu [rsp], xm4 + + xor r6d, r6d + inc r6d + lea r2, [rsp + 1*mmsize] + lea r5, [r0 + 32] + + call ang32_mode_15_21_rows_0_15 + + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_15_21_rows_16_31 + + add r2, 32 + mov r0, r5 + + call ang32_mode_15_21_rows_0_15 + + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_15_21_rows_16_31 + + mov rsp, [rsp+4*mmsize] + RET + +;; angle 32, modes 16 and 20, row 0 to 15 +cglobal ang32_mode_16_20_rows_0_15 + test r6d, r6d + + movu m0, [r2 - 20] + movu m1, [r2 - 18] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m1, [r2 - 4] ; [ 3 2 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13] + movu m4, [r2 - 2] ; [ 2 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14] + punpcklwd m2, m1, m4 ; [-3 -2 -4 -3 -5 -4 -6 -5 -11 -10 -12 -11 -13 -12 -14 -13] + punpckhwd m1, m4 ; [ 2 3 2 0 -1 0 -2 -1 -7 -6 -8 -7 -9 -8 -10 -9] + + pmaddwd m4, m3, [r3] ; [16] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m6, m0, m3, 4 + palignr m7, m2, m0, 4 + pmaddwd m5, m6, [r3 - 11 * 32] ; [5] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m7, [r3 - 11 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + pmaddwd m6, [r3 + 10 * 32] ; [26] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m7, [r3 + 10 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m6, m7 + + palignr m8, m0, m3, 8 + palignr m9, m2, m0, 8 + pmaddwd m7, m8, [r3 - 1 * 32] ; [15] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m10, m9, [r3 - 1 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + palignr m9, m0, m3, 12 + palignr m12, m2, m0, 12 + pmaddwd m8, m9, [r3 - 12 * 32] ; [4] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m10, m12, [r3 - 12 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, [r3 + 9 * 32] ; [25] + paddd m9, 
[pd_16] + psrld m9, 5 + pmaddwd m12, [r3 + 9 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m9, m12 + + pmaddwd m10, m0, [r3 - 2 * 32] ; [14] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, m2, [r3 - 2 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m11, m2, m0, 4 + palignr m12, m1, m2, 4 + pmaddwd m11, [r3 - 13 * 32] ; [3] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, [r3 - 13 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 + + palignr m4, m2, m0, 4 + palignr m5, m1, m2, 4 + pmaddwd m4, [r3 + 8 * 32] ; [24] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, [r3 + 8 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m5, m2, m0, 8 + palignr m3, m1, m2, 8 + pmaddwd m5, [r3 - 3 * 32] ; [13] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, [r3 - 3 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + palignr m7, m2, m0, 12 + palignr m3, m1, m2, 12 + pmaddwd m6, m7, [r3 - 14 * 32] ; [2] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m3, [r3 - 14 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, [r3 + 7 * 32] ; [23] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m3, [r3 + 7 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + pmaddwd m8, m2, [r3 - 4 * 32] ; [12] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m1, [r3 - 4 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + movu m0, [r2 - 2] + movu m1, [r2] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m2, [r2 + 14] + movu m1, [r2 + 16] + punpcklwd m2, m1 + + pmaddwd m9, m3, [r3 - 15 * 32] ; [1] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, m0, [r3 - 15 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + pmaddwd m10, m3, [r3 + 6 * 32] ; [22] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, m0, [r3 + 6 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m2, m0, 4 + palignr m0, m3, 4 + pmaddwd 
m0, [r3 - 5 * 32] ; [11] + paddd m0, [pd_16] + psrld m0, 5 + pmaddwd m2, [r3 - 5 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m0, m2 + TRANSPOSE_STORE_AVX2_STACK 0, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0 + ret + +;; angle 32, modes 16 and 20, rows 16 to 31 +cglobal ang32_mode_16_20_rows_16_31 + test r6d, r6d + + movu m0, [r2 - 40] + movu m1, [r2 - 38] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m1, [r2 - 24] + movu m4, [r2 - 22] + punpcklwd m2, m1, m4 + punpckhwd m1, m4 + + pmaddwd m4, m3, [r3 - 16 * 32] ; [0] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 16 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 5 * 32] ; [21] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 5 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m7, m0, m3, 4 + palignr m8, m2, m0, 4 + pmaddwd m6, m7, [r3 - 6 * 32] ; [10] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m9, m8, [r3 - 6 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m6, m9 + + pmaddwd m7, [r3 + 15 * 32] ; [31] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m8, [r3 + 15 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m7, m8 + + palignr m8, m0, m3, 8 + palignr m9, m2, m0, 8 + pmaddwd m8, [r3 + 4 * 32] ; [20] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 4 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m10, m0, m3, 12 + palignr m11, m2, m0, 12 + pmaddwd m9, m10, [r3 - 7 * 32] ; [9] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m12, m11, [r3 - 7 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m9, m12 + + pmaddwd m10, [r3 + 14 * 32] ; [30] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 + 14 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m11, m0, [r3 + 3 * 32] ; [19] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, m2, [r3 + 3 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 + + palignr m5, m2, m0, 4 + 
palignr m6, m1, m2, 4 + pmaddwd m4, m5, [r3 - 8 * 32] ; [8] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m7, m6, [r3 - 8 * 32] + paddd m7, [pd_16] + psrld m7, 5 + packusdw m4, m7 + + pmaddwd m5, [r3 + 13 * 32] ; [29] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m6, [r3 + 13 * 32] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m6, m2, m0, 8 + palignr m3, m1, m2, 8 + pmaddwd m6, [r3 + 2 * 32] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m3, [r3 + 2 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m6, m3 + + palignr m8, m2, m0, 12 + palignr m9, m1, m2, 12 + pmaddwd m7, m8, [r3 - 9 * 32] ; [7] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m10, m9, [r3 - 9 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m7, m10 + + pmaddwd m8, [r3 + 12 * 32] ; [28] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, [r3 + 12 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + pmaddwd m9, m2, [r3 + 1 * 32] ; [17] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m3, m1, [r3 + 1 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m9, m3 + + movu m0, [r2 - 22] + movu m1, [r2 - 20] + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + pmaddwd m10, m3, [r3 - 10 * 32] ; [6] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, m0, [r3 - 10 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + pmaddwd m3, [r3 + 11 * 32] ; [27] + paddd m3, [pd_16] + psrld m3, 5 + pmaddwd m0, [r3 + 11 * 32] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m3, m0 + TRANSPOSE_STORE_AVX2_STACK 3, 10, 9, 8, 7, 6, 5, 4, 0, 1, 0 + ret + +cglobal intra_pred_ang32_16, 3,8,14 + mov r6, rsp + sub rsp, 5*mmsize+gprsize + and rsp, ~63 + mov [rsp+5*mmsize], r6 + + movu m0, [r2 + 128] + movu m1, [r2 + 160] + movd xm2, [r2 + 192] + + mova [rsp + 2*mmsize], m0 + mova [rsp + 3*mmsize], m1 + movd [rsp + 4*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 4] + movu xm2, [r2 + 16] + movu xm3, [r2 + 28] + movu xm4, [r2 + 40] + movu xm5, [r2 + 52] + pshufb 
xm1, [pw_ang32_16_20] + pshufb xm2, [pw_ang32_16_20] + pshufb xm3, [pw_ang32_16_20] + pshufb xm4, [pw_ang32_16_20] + pshufb xm5, [pw_ang32_16_20] + + punpckhqdq xm2, xm1 + punpckhqdq xm4, xm3 + punpckhqdq xm5, xm5 + + movzx r6d, word [r2] + mov [rsp + 2*mmsize], r6w + movu [rsp + 48], xm2 + movu [rsp + 32], xm4 + movq [rsp + 24], xm5 + + xor r6d, r6d + lea r2, [rsp + 2*mmsize] + lea r7, [r0 + 8 * r1] + + call ang32_mode_16_20_rows_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_16_20_rows_16_31 + + add r2, 32 + lea r0, [r7 + 8 * r1] + + call ang32_mode_16_20_rows_0_15 + + lea r0, [r0 + 32] + + call ang32_mode_16_20_rows_16_31 + + mov rsp, [rsp+5*mmsize] + RET + +cglobal intra_pred_ang32_20, 3,8,14 + mov r6, rsp + sub rsp, 5*mmsize+gprsize + and rsp, ~63 + mov [rsp+5*mmsize], r6 + + movu m0, [r2] + movu m1, [r2 + 32] + movd xm2, [r2 + 64] + + mova [rsp + 2*mmsize], m0 + mova [rsp + 3*mmsize], m1 + movd [rsp + 4*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 132] + movu xm2, [r2 + 144] + movu xm3, [r2 + 156] + movu xm4, [r2 + 168] + movu xm5, [r2 + 180] + pshufb xm1, [pw_ang32_16_20] + pshufb xm2, [pw_ang32_16_20] + pshufb xm3, [pw_ang32_16_20] + pshufb xm4, [pw_ang32_16_20] + pshufb xm5, [pw_ang32_16_20] + + punpckhqdq xm2, xm1 + punpckhqdq xm4, xm3 + punpckhqdq xm5, xm5 + + movu [rsp + 48], xm2 + movu [rsp + 32], xm4 + movq [rsp + 24], xm5 + + xor r6d, r6d + inc r6d + lea r2, [rsp + 2*mmsize] + lea r5, [r0 + 32] + + call ang32_mode_16_20_rows_0_15 + + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_16_20_rows_16_31 + + add r2, 32 + mov r0, r5 + + call ang32_mode_16_20_rows_0_15 + + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_16_20_rows_16_31 + + mov rsp, [rsp+5*mmsize] + RET + +;; angle 32, modes 17 and 19, row 0 to 15 +cglobal ang32_mode_17_19_rows_0_15 + test r6d, r6d + + movu m0, [r2 - 24] + movu m1, [r2 - 22] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m1, 
[r2 - 8] + movu m4, [r2 - 6] + punpcklwd m2, m1, m4 + punpckhwd m1, m4 + + pmaddwd m4, m3, [r3 - 16 * 32] ; [0] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, m0, [r3 - 16 * 32] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + pmaddwd m5, m3, [r3 + 10 * 32] ; [26] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m8, m0, [r3 + 10 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m5, m8 + + palignr m6, m0, m3, 4 + palignr m8, m2, m0, 4 + pmaddwd m6, [r3 + 4 * 32] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, [r3 + 4 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + palignr m7, m0, m3, 8 + palignr m9, m2, m0, 8 + pmaddwd m7, [r3 - 2 * 32] ; [14] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m9, [r3 - 2 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m7, m9 + + palignr m8, m0, m3, 12 + palignr m10, m2, m0, 12 + pmaddwd m8, [r3 - 8 * 32] ; [8] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m10, [r3 - 8 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m8, m10 + + pmaddwd m9, m0, [r3 - 14 * 32] ; [2] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m12, m2, [r3 - 14 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m9, m12 + + pmaddwd m10, m0, [r3 + 12 * 32] ; [28] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, m2, [r3 + 12 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m11, m2, m0, 4 + palignr m12, m1, m2, 4 + pmaddwd m11, [r3 + 6 * 32] ; [22] + paddd m11, [pd_16] + psrld m11, 5 + pmaddwd m12, [r3 + 6 * 32] + paddd m12, [pd_16] + psrld m12, 5 + packusdw m11, m12 + + TRANSPOSE_STORE_AVX2_STACK 11, 10, 9, 8, 7, 6, 5, 4, 12, 13, 16 + + palignr m4, m2, m0, 8 + palignr m5, m1, m2, 8 + pmaddwd m4, [r3] ; [16] + paddd m4, [pd_16] + psrld m4, 5 + pmaddwd m5, [r3] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m5, m2, m0, 12 + palignr m3, m1, m2, 12 + pmaddwd m5, [r3 - 6 * 32] ; [10] + paddd m5, [pd_16] + psrld m5, 5 + pmaddwd m3, [r3 - 6 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m5, m3 + + pmaddwd 
m6, m2, [r3 - 12 * 32] ; [4] + paddd m6, [pd_16] + psrld m6, 5 + pmaddwd m8, m1, [r3 - 12 * 32] + paddd m8, [pd_16] + psrld m8, 5 + packusdw m6, m8 + + pmaddwd m7, m2, [r3 + 14 * 32] ; [30] + paddd m7, [pd_16] + psrld m7, 5 + pmaddwd m3, m1, [r3 + 14 * 32] + paddd m3, [pd_16] + psrld m3, 5 + packusdw m7, m3 + + movu m0, [r2 - 6] + movu m1, [r2 - 4] + + punpcklwd m3, m0, m1 + punpckhwd m0, m1 + + movu m2, [r2 + 10] + movu m1, [r2 + 12] + punpcklwd m2, m1 + + pmaddwd m8, m3, [r3 + 8 * 32] ; [24] + paddd m8, [pd_16] + psrld m8, 5 + pmaddwd m9, m0, [r3 + 8 * 32] + paddd m9, [pd_16] + psrld m9, 5 + packusdw m8, m9 + + palignr m9, m0, m3, 4 + palignr m10, m2, m0, 4 + pmaddwd m9, [r3 + 2 * 32] ; [18] + paddd m9, [pd_16] + psrld m9, 5 + pmaddwd m10, [r3 + 2 * 32] + paddd m10, [pd_16] + psrld m10, 5 + packusdw m9, m10 + + palignr m10, m0, m3, 8 + palignr m11, m2, m0, 8 + pmaddwd m10, [r3 - 4 * 32] ; [12] + paddd m10, [pd_16] + psrld m10, 5 + pmaddwd m11, [r3 - 4 * 32] + paddd m11, [pd_16] + psrld m11, 5 + packusdw m10, m11 + + palignr m2, m0, 12 + palignr m0, m3, 12 + pmaddwd m0, [r3 - 10 * 32] ; [6] + paddd m0, [pd_16] + psrld m0, 5 + pmaddwd m2, [r3 - 10 * 32] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m0, m2 + TRANSPOSE_STORE_AVX2_STACK 0, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0 + ret + +cglobal intra_pred_ang32_17, 3,8,14 + mov r6, rsp + sub rsp, 5*mmsize+gprsize + and rsp, ~63 + mov [rsp+5*mmsize], r6 + + movu m0, [r2 + 128] + movu m1, [r2 + 160] + movd xm2, [r2 + 192] + + mova [rsp + 2*mmsize], m0 + mova [rsp + 3*mmsize], m1 + movd [rsp + 4*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 2] + movu xm2, [r2 + 18] + movu xm3, [r2 + 34] + movu xm4, [r2 + 50] + pshufb xm1, [pw_ang32_17_19_0] + pshufb xm2, [shuf_mode_17_19] + pshufb xm3, [pw_ang32_17_19_0] + pshufb xm4, [shuf_mode_17_19] + + movzx r6d, word [r2] + mov [rsp + 2*mmsize], r6w + movu [rsp + 48], xm1 + movu [rsp + 36], xm2 + movu [rsp + 22], xm3 + movu [rsp + 
10], xm4 + + xor r6d, r6d + lea r2, [rsp + 2*mmsize] + lea r7, [r0 + 8 * r1] + + call ang32_mode_17_19_rows_0_15 + + sub r2, 26 + lea r0, [r0 + 32] + + call ang32_mode_17_19_rows_0_15 + + add r2, 58 + lea r0, [r7 + 8 * r1] + + call ang32_mode_17_19_rows_0_15 + + sub r2, 26 + lea r0, [r0 + 32] + + call ang32_mode_17_19_rows_0_15 + + mov rsp, [rsp+5*mmsize] + RET + +cglobal intra_pred_ang32_19, 3,8,14 + mov r6, rsp + sub rsp, 5*mmsize+gprsize + and rsp, ~63 + mov [rsp+5*mmsize], r6 + + movu m0, [r2] + movu m1, [r2 + 32] + movd xm2, [r2 + 64] + + mova [rsp + 2*mmsize], m0 + mova [rsp + 3*mmsize], m1 + movd [rsp + 4*mmsize], xm2 + + add r1d, r1d + lea r4, [r1 * 3] + lea r3, [ang_table_avx2 + 16 * 32] + + movu xm1, [r2 + 130] + movu xm2, [r2 + 146] + movu xm3, [r2 + 162] + movu xm4, [r2 + 178] + pshufb xm1, [pw_ang32_17_19_0] + pshufb xm2, [shuf_mode_17_19] + pshufb xm3, [pw_ang32_17_19_0] + pshufb xm4, [shuf_mode_17_19] + + movu [rsp + 48], xm1 + movu [rsp + 36], xm2 + movu [rsp + 22], xm3 + movu [rsp + 10], xm4 + + xor r6d, r6d + inc r6d + lea r2, [rsp + 2*mmsize] + lea r5, [r0 + 32] + + call ang32_mode_17_19_rows_0_15 + + sub r2, 26 + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_17_19_rows_0_15 + + add r2, 58 + mov r0, r5 + + call ang32_mode_17_19_rows_0_15 + + sub r2, 26 + lea r0, [r0 + 8 * r1] + lea r0, [r0 + 8 * r1] + + call ang32_mode_17_19_rows_0_15 + + mov rsp, [rsp+5*mmsize] + RET + +cglobal intra_pred_ang32_18, 3,6,6 + mov r4, rsp + sub rsp, 4*mmsize+gprsize + and rsp, ~63 + mov [rsp+4*mmsize], r4 + + movu m0, [r2] + movu m1, [r2 + 32] + mova [rsp + 2*mmsize], m0 + mova [rsp + 3*mmsize], m1 + + movu m2, [r2 + 130] + movu m3, [r2 + 162] + pshufb m2, [pw_swap16] + pshufb m3, [pw_swap16] + vpermq m2, m2, 01001110b + vpermq m3, m3, 01001110b + mova [rsp + 1*mmsize], m2 + mova [rsp + 0*mmsize], m3 + + add r1d, r1d + lea r2, [rsp+2*mmsize] + lea r4, [r1 * 2] + lea r3, [r1 * 3] + lea r5, [r1 * 4] + + movu m0, [r2] + movu m1, [r2 + 32] + movu m2, 
[r2 - 16] + movu m3, [r2 + 16] + + movu [r0], m0 + movu [r0 + 32], m1 + + palignr m4, m0, m2, 14 + palignr m5, m1, m3, 14 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m0, m2, 12 + palignr m5, m1, m3, 12 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m0, m2, 10 + palignr m5, m1, m3, 10 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + add r0, r5 + + palignr m4, m0, m2, 8 + palignr m5, m1, m3, 8 + movu [r0], m4 + movu [r0 + 32], m5 + + palignr m4, m0, m2, 6 + palignr m5, m1, m3, 6 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m0, m2, 4 + palignr m5, m1, m3, 4 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m0, m2, 2 + palignr m5, m1, m3, 2 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + add r0, r5 + + movu [r0], m2 + movu [r0 + 32], m3 + + movu m0, [r2 - 32] + movu m1, [r2] + + palignr m4, m2, m0, 14 + palignr m5, m3, m1, 14 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m2, m0, 12 + palignr m5, m3, m1, 12 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m2, m0, 10 + palignr m5, m3, m1, 10 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + add r0, r5 + + palignr m4, m2, m0, 8 + palignr m5, m3, m1, 8 + movu [r0], m4 + movu [r0 + 32], m5 + + palignr m4, m2, m0, 6 + palignr m5, m3, m1, 6 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m2, m0, 4 + palignr m5, m3, m1, 4 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m2, m0, 2 + palignr m5, m3, m1, 2 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + add r0, r5 + + movu [r0], m0 + movu [r0 + 32], m1 + + movu m2, [r2 - 48] + movu m3, [r2 - 16] + + palignr m4, m0, m2, 14 + palignr m5, m1, m3, 14 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m0, m2, 12 + palignr m5, m1, m3, 12 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m0, m2, 10 + palignr m5, m1, m3, 10 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + add r0, r5 + + palignr m4, m0, m2, 8 + palignr m5, m1, 
m3, 8 + movu [r0], m4 + movu [r0 + 32], m5 + + palignr m4, m0, m2, 6 + palignr m5, m1, m3, 6 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m0, m2, 4 + palignr m5, m1, m3, 4 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m0, m2, 2 + palignr m5, m1, m3, 2 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + add r0, r5 + + movu [r0], m2 + movu [r0 + 32], m3 + + movu m0, [r2 - 64] + movu m1, [r2 - 32] + + palignr m4, m2, m0, 14 + palignr m5, m3, m1, 14 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m2, m0, 12 + palignr m5, m3, m1, 12 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m2, m0, 10 + palignr m5, m3, m1, 10 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + add r0, r5 + + palignr m4, m2, m0, 8 + palignr m5, m3, m1, 8 + movu [r0], m4 + movu [r0 + 32], m5 + + palignr m4, m2, m0, 6 + palignr m5, m3, m1, 6 + movu [r0 + r1], m4 + movu [r0 + r1 + 32], m5 + + palignr m4, m2, m0, 4 + palignr m5, m3, m1, 4 + movu [r0 + r4], m4 + movu [r0 + r4 + 32], m5 + + palignr m4, m2, m0, 2 + palignr m5, m3, m1, 2 + movu [r0 + r3], m4 + movu [r0 + r3 + 32], m5 + + mov rsp, [rsp+4*mmsize] + RET +;------------------------------------------------------------------------------------------------------- +; end of avx2 code for intra_pred_ang32 mode 2 to 34 +;------------------------------------------------------------------------------------------------------- + +%macro MODE_2_34 0 + movu m0, [r2 + 4] + movu m1, [r2 + 20] + movu m2, [r2 + 36] + movu m3, [r2 + 52] + movu m4, [r2 + 68] + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + palignr m5, m1, m0, 2 + movu [r0 + r1], m5 + palignr m5, m2, m1, 2 + movu [r0 + r1 + 16], m5 + palignr m5, m3, m2, 2 + movu [r0 + r1 + 32], m5 + palignr m5, m4, m3, 2 + movu [r0 + r1 + 48], m5 + palignr m5, m1, m0, 4 + movu [r0 + r3], m5 + palignr m5, m2, m1, 4 + movu [r0 + r3 + 16], m5 + palignr m5, m3, m2, 4 + movu [r0 + r3 + 32], m5 + palignr m5, m4, m3, 4 + movu 
[r0 + r3 + 48], m5 + palignr m5, m1, m0, 6 + movu [r0 + r4], m5 + palignr m5, m2, m1, 6 + movu [r0 + r4 + 16], m5 + palignr m5, m3, m2, 6 + movu [r0 + r4 + 32], m5 + palignr m5, m4, m3, 6 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] + palignr m5, m1, m0, 8 + movu [r0], m5 + palignr m5, m2, m1, 8 + movu [r0 + 16], m5 + palignr m5, m3, m2, 8 + movu [r0 + 32], m5 + palignr m5, m4, m3, 8 + movu [r0 + 48], m5 + palignr m5, m1, m0, 10 + movu [r0 + r1], m5 + palignr m5, m2, m1, 10 + movu [r0 + r1 + 16], m5 + palignr m5, m3, m2, 10 + movu [r0 + r1 + 32], m5 + palignr m5, m4, m3, 10 + movu [r0 + r1 + 48], m5 + palignr m5, m1, m0, 12 + movu [r0 + r3], m5 + palignr m5, m2, m1, 12 + movu [r0 + r3 + 16], m5 + palignr m5, m3, m2, 12 + movu [r0 + r3 + 32], m5 + palignr m5, m4, m3, 12 + movu [r0 + r3 + 48], m5 + palignr m5, m1, m0, 14 + movu [r0 + r4], m5 + palignr m5, m2, m1, 14 + movu [r0 + r4 + 16], m5 + palignr m5, m3, m2, 14 + movu [r0 + r4 + 32], m5 + palignr m5, m4, m3, 14 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] + movu m0, [r2 + 84] + movu [r0], m1 + movu [r0 + 16], m2 + movu [r0 + 32], m3 + movu [r0 + 48], m4 + palignr m5, m2, m1, 2 + movu [r0 + r1], m5 + palignr m5, m3, m2, 2 + movu [r0 + r1 + 16], m5 + palignr m5, m4, m3, 2 + movu [r0 + r1 + 32], m5 + palignr m5, m0, m4, 2 + movu [r0 + r1 + 48], m5 + palignr m5, m2, m1, 4 + movu [r0 + r3], m5 + palignr m5, m3, m2, 4 + movu [r0 + r3 + 16], m5 + palignr m5, m4, m3, 4 + movu [r0 + r3 + 32], m5 + palignr m5, m0, m4, 4 + movu [r0 + r3 + 48], m5 + palignr m5, m2, m1, 6 + movu [r0 + r4], m5 + palignr m5, m3, m2, 6 + movu [r0 + r4 + 16], m5 + palignr m5, m4, m3, 6 + movu [r0 + r4 + 32], m5 + palignr m5, m0, m4, 6 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] + palignr m5, m2, m1, 8 + movu [r0], m5 + palignr m5, m3, m2, 8 + movu [r0 + 16], m5 + palignr m5, m4, m3, 8 + movu [r0 + 32], m5 + palignr m5, m0, m4, 8 + movu [r0 + 48], m5 + palignr m5, m2, m1, 10 + movu [r0 + r1], m5 + palignr m5, m3, m2, 10 + movu [r0 
+ r1 + 16], m5 + palignr m5, m4, m3, 10 + movu [r0 + r1 + 32], m5 + palignr m5, m0, m4, 10 + movu [r0 + r1 + 48], m5 + palignr m5, m2, m1, 12 + movu [r0 + r3], m5 + palignr m5, m3, m2, 12 + movu [r0 + r3 + 16], m5 + palignr m5, m4, m3, 12 + movu [r0 + r3 + 32], m5 + palignr m5, m0, m4, 12 + movu [r0 + r3 + 48], m5 + palignr m5, m2, m1, 14 + movu [r0 + r4], m5 + palignr m5, m3, m2, 14 + movu [r0 + r4 + 16], m5 + palignr m5, m4, m3, 14 + movu [r0 + r4 + 32], m5 + palignr m5, m0, m4, 14 + movu [r0 + r4 + 48], m5 + lea r0, [r0 + r1 * 4] +%endmacro + +%macro TRANSPOSE_STORE_8x8 6 + %if %2 == 1 + ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32 + punpckhwd m0, %3, %4 + punpcklwd %3, %4 + punpckhwd %4, %3, m0 + punpcklwd %3, m0 + + punpckhwd m0, %5, %6 + punpcklwd %5, %6 + punpckhwd %6, %5, m0 + punpcklwd %5, m0 + + punpckhqdq m0, %3, %5 + punpcklqdq %3, %5 + punpcklqdq %5, %4, %6 + punpckhqdq %4, %6 + + movu [r0 + %1], %3 + movu [r0 + r1 + %1], m0 + movu [r0 + r1 * 2 + %1], %5 + movu [r0 + r5 + %1], %4 + %else + ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32 + movh [r0], %3 + movhps [r0 + r1], %3 + movh [r0 + r1 * 2], %4 + movhps [r0 + r5], %4 + lea r0, [r0 + r1 * 4] + movh [r0], %5 + movhps [r0 + r1], %5 + movh [r0 + r1 * 2], %6 + movhps [r0 + r5], %6 + lea r0, [r0 + r1 * 4] + %endif +%endmacro + +%macro MODE_3_33 1 + movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9] + mova m7, m0 + + palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2] + punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2 + punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0 + + palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1 + pmaddwd m4, m0, [r3 + 10 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m5, m1, [r3 + 4 * 16] ; [20] + paddd m5, [pd_16] + psrld m5, 5 + packusdw m4, m5 + + palignr m5, m2, m0, 8 + pmaddwd m5, [r3 - 2 * 16] ; [14] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m2, m0, 12 + pmaddwd m6, [r3 - 8 * 16] ; [ 8] + paddd m6, 
[pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m2, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6] + pmaddwd m1, m0, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + + psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10] + palignr m2, m0, 4 ;[11 10 10 9 9 8 8 7] + + pmaddwd m2, [r3] ; [16] + paddd m2, [pd_16] + psrld m2, 5 + packusdw m1, m2 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8] + movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16] + palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9] + punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12] + punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8] + + palignr m5, m7, m0, 4 ; [13 12 12 11 11 10 10 9] + pmaddwd m4, m0, [r3 - 6 * 16] ; [10] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m5, [r3 - 12 * 16] ; [04] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, [r3 + 14 * 16] ; [30] + paddd m5, [pd_16] + psrld m5, 5 + + palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10] + pmaddwd m6, [r3 + 8 * 16] ; [24] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11] + pmaddwd m6, m1, [r3 + 2 * 16] ; [18] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m7, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13] + pmaddwd m1, m2, [r3 - 10 * 16] ; [6] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2 + 28] ; [00] + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m0, [r2 + 28] ; [35 34 33 32 31 30 29 28] + palignr m1, m0, 2 ; [ x 35 34 33 32 31 30 29] + punpckhwd m2, m0, m1 ; [ x 35 35 34 34 33 33 32] + punpcklwd m0, m1 ; [32 31 31 30 30 29 29 28] + + pmaddwd m4, m0, [r3 + 10 * 16] ; [26] + paddd m4, [pd_16] + psrld m4, 5 + + palignr m1, m2, m0, 4 ; [33 32 32 31 31 30 30 29] 
;; (continuation of the preceding angular-prediction macro body; its opening
;;  lines are outside this chunk, so it is reproduced verbatim and unannotated
;;  beyond what the code itself shows. Pattern throughout this file:
;;  pmaddwd with an ang_table entry [r3 +/- k*16] = weighted interpolation of
;;  two neighbouring reference pixels, then paddd [pd_16] / psrld 5 = rounding,
;;  packusdw = pack two dword results back to words. The ";[nn]" comments give
;;  the fractional weight used. NOTE(review): r2 = reference-pixel array and
;;  r3 = biased ang_table pointer is inferred from usage here — confirm in the
;;  surrounding function bodies.)
    pmaddwd     m1, [r3 + 4 * 16]               ; [20]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    palignr     m5, m2, m0, 8                   ; [34 33 33 32 32 31 31 30]
    pmaddwd     m5, [r3 - 2 * 16]               ; [14]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m6, m2, m0, 12                  ; [35 34 34 33 33 32 32 31]
    pmaddwd     m6, [r3 - 8 * 16]               ; [ 8]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pinsrw      m2, [r2 + 44], 7                ; [35 34 34 33 33 32 32 31]
    pmaddwd     m6, m2, [r3 - 14 * 16]          ; [ 2]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m2, [r3 + 12 * 16]              ; [28]
    paddd       m2, [pd_16]
    psrld       m2, 5
    packusdw    m6, m2

    ; refresh the sliding reference window and re-pair adjacent pixels
    movu        m3, [r2 + 38]                   ; [45 44 43 42 41 40 39 38]
    palignr     m1, m3, 2                       ; [ x 45 44 43 42 41 40 39]
    punpckhwd   m2, m3, m1                      ; [ x 35 35 34 34 33 33 32]
    punpcklwd   m3, m1                          ; [32 31 31 30 30 29 29 28]

    pmaddwd     m1, m3, [r3 + 6 * 16]           ; [22]
    paddd       m1, [pd_16]
    psrld       m1, 5

    palignr     m0, m2, m3, 4
    pmaddwd     m0, [r3]                        ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    palignr     m5, m2, m3, 8
    pmaddwd     m4, m5, [r3 - 6 * 16]           ; [10]
    paddd       m4, [pd_16]
    psrld       m4, 5

    palignr     m5, m2, m3, 12
    pmaddwd     m1, m5, [r3 - 12 * 16]          ; [04]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, [r3 + 14 * 16]              ; [30]
    paddd       m5, [pd_16]
    psrld       m5, 5

    movu        m3, [r2 + 46]
    palignr     m1, m3, 2
    punpckhwd   m2, m3, m1
    punpcklwd   m3, m1

    pmaddwd     m6, m3, [r3 + 8 * 16]           ; [24]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    palignr     m6, m2, m3, 4
    pmaddwd     m6, [r3 + 2 * 16]               ; [18]
    paddd       m6, [pd_16]
    psrld       m6, 5

    palignr     m1, m2, m3, 8
    pmaddwd     m1, [r3 - 4 * 16]               ; [12]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    palignr     m1, m2, m3, 12
    pmaddwd     m1, [r3 - 10 * 16]              ; [06]
    paddd       m1, [pd_16]
    psrld       m1, 5

    packusdw    m1, m1
    movhps      m1, [r2 + 54]                   ; [00] fraction 0: copy pixels unfiltered
    
    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_4_32 %1
; 32x32 intra angular prediction for HEVC modes 4 and 32, 16-bit pixels.
; Produces four 8x8 sub-blocks per invocation via TRANSPOSE_STORE_8x8
; (%1 selects the transposed/non-transposed store variant).
; NOTE(review): r2 (reference pixels) and r3 (ang_table base, biased so that
; [r3 +/- k*16] addresses the per-fraction coefficient rows) are set up by the
; caller outside this chunk — confirm against the mode dispatch code.
;-----------------------------------------------------------------------------
%macro MODE_4_32 1
    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m3, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m1, m3, m0, 2                   ; [9 8 7 6 5 4 3 2]
    punpckhwd   m2, m0, m1                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m0, m1                          ; [5 4 4 3 3 2 2 1]

    pmaddwd     m4, m0, [r3 + 5 * 16]           ; [21]
    paddd       m4, [pd_16]
    psrld       m4, 5

    palignr     m5, m2, m0, 4                   ; [6 5 5 4 4 3 3 2]
    pmaddwd     m1, m5, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, [r3 + 15 * 16]              ; [31]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m6, m2, m0, 8
    pmaddwd     m6, [r3 + 4 * 16]               ; [ 20]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    palignr     m1, m2, m0, 12
    pmaddwd     m6, m1, [r3 - 7 * 16]           ; [ 9]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, [r3 + 14 * 16]              ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m2, [r3 + 3 * 16]           ; [19]
    paddd       m1, [pd_16]
    psrld       m1, 5

    palignr     m7, m3, m2, 4                   ; [10 9 9 8 7 6 5 4]
    pmaddwd     m0, m7, [r3 - 8 * 16]           ; [8]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m7, [r3 + 13 * 16]          ; [29]
    paddd       m4, [pd_16]
    psrld       m4, 5

    movu        m0, [r2 + 34]                   ; [24 23 22 21 20 19 18 17]

    palignr     m2, m0, m3, 2                   ; [17 16 15 14 13 12 11 10]
    palignr     m1, m0, m3, 4                   ; [18 17 16 15 14 13 12 11]
    punpckhwd   m3, m2, m1                      ; [18 17 17 16 16 15 15 14]
    punpcklwd   m2, m1                          ; [14 13 13 12 12 11 11 10]

    palignr     m1, m2, m7, 4                   ; [11 10 10 9 9 8 7 6]
    pmaddwd     m1, [r3 + 2 * 16]               ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    palignr     m5, m2, m7, 8
    mova        m6, m5
    pmaddwd     m5, [r3 - 9 * 16]               ; [07]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, [r3 + 12 * 16]              ; [28]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    palignr     m6, m2, m7, 12
    pmaddwd     m6, [r3 + 16]                   ; [17]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m2, [r3 - 10 * 16]          ; [06]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m2, [r3 + 11 * 16]          ; [27]
    paddd       m1, [pd_16]
    psrld       m1, 5

    palignr     m7, m3, m2, 4
    pmaddwd     m7, [r3]                        ; [16]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m1, m7
    mova        m7, m0                          ; stash [24..17] window before m0 is reused

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    palignr     m0, m3, m2, 8
    pmaddwd     m4, m0, [r3 - 11 * 16]          ; [5]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m0, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    palignr     m5, m3, m2, 12
    pmaddwd     m5, [r3 - 16]                   ; [15]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m1, m3, [r3 - 12 * 16]          ; [4]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    pmaddwd     m6, m3, [r3 + 9 * 16]           ; [25]
    paddd       m6, [pd_16]
    psrld       m6, 5

    movu        m0, [r2 + 50]                   ; [32 31 30 29 28 27 26 25]
    palignr     m2, m0, m7, 2                   ; [25 24 23 22 21 20 19 18]
    palignr     m1, m0, m7, 4                   ; [26 25 24 23 22 21 20 19]
    punpckhwd   m7, m2, m1                      ; [26 25 25 24 24 23 23 22]
    punpcklwd   m2, m1                          ; [22 21 21 20 20 19 19 18]

    palignr     m1, m2, m3, 4
    pmaddwd     m1, [r3 - 2 * 16]               ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    palignr     m1, m2, m3, 8
    mova        m0, m1
    pmaddwd     m1, [r3 - 13 * 16]              ; [3]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, [r3 + 8 * 16]               ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    palignr     m4, m2, m3, 12
    pmaddwd     m4, [r3 - 3 * 16]               ; [13]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m2, [r3 - 14 * 16]          ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m2, [r3 + 7 * 16]           ; [23]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m6, m7, m2, 4
    pmaddwd     m6, [r3 - 4 * 16]               ; [12]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    palignr     m1, m7, m2, 8
    pmaddwd     m6, m1, [r3 - 15 * 16]          ; [1]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, [r3 + 6 * 16]               ; [22]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    palignr     m1, m7, m2, 12
    pmaddwd     m1, [r3 - 5 * 16]               ; [11]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m1, m1
    movhps      m1, [r2 + 44]                   ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_5_31 %1 — 32x32 intra angular prediction, modes 5 and 31 (16-bit).
; (Body continues in the next chunk of the file.)
;-----------------------------------------------------------------------------
%macro MODE_5_31 1
    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m3, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
;; (continuation of MODE_5_31: interpolate with ang_table weights, round by
;;  +16 >> 5, pack, and emit 8x8 sub-blocks via TRANSPOSE_STORE_8x8)
    palignr     m1, m3, m0, 2                   ; [9 8 7 6 5 4 3 2]
    punpckhwd   m2, m0, m1                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m0, m1                          ; [5 4 4 3 3 2 2 1]

    pmaddwd     m4, m0, [r3 + 16]               ; [17]
    paddd       m4, [pd_16]
    psrld       m4, 5

    palignr     m1, m2, m0, 4
    mova        m5, m1
    pmaddwd     m1, [r3 - 14 * 16]              ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, [r3 + 3 * 16]               ; [19]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m6, m2, m0, 8
    mova        m1, m6
    pmaddwd     m6, [r3 - 12 * 16]              ; [4]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m1, [r3 + 5 * 16]           ; [21]
    paddd       m6, [pd_16]
    psrld       m6, 5

    palignr     m1, m2, m0, 12
    mova        m7, m1
    pmaddwd     m7, [r3 - 10 * 16]              ; [6]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    pmaddwd     m1, [r3 + 7 * 16]               ; [23]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m7, m2, [r3 - 8 * 16]           ; [8]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m1, m7

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m2, [r3 + 9 * 16]           ; [25]
    paddd       m4, [pd_16]
    psrld       m4, 5

    palignr     m7, m3, m2, 4                   ; [10 9 9 8 7 6 5 4]
    pmaddwd     m1, m7, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m7, [r3 + 11 * 16]          ; [27]
    paddd       m5, [pd_16]
    psrld       m5, 5

    movu        m0, [r2 + 34]                   ; [24 23 22 21 20 19 18 17]
    palignr     m2, m0, m3, 2                   ; [17 16 15 14 13 12 11 10]
    palignr     m1, m0, m3, 4                   ; [18 17 16 15 14 13 12 11]
    punpckhwd   m3, m2, m1                      ; [18 17 17 16 16 15 15 14]
    punpcklwd   m2, m1                          ; [14 13 13 12 12 11 11 10]

    palignr     m6, m2, m7, 4
    pmaddwd     m1, m6, [r3 - 4 * 16]           ; [12]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    pmaddwd     m6, [r3 + 13 * 16]              ; [29]
    paddd       m6, [pd_16]
    psrld       m6, 5

    palignr     m1, m2, m7, 8
    mova        m0, m1
    pmaddwd     m1, [r3 - 2 * 16]               ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m0, [r3 + 15 * 16]          ; [31]
    paddd       m1, [pd_16]
    psrld       m1, 5

    palignr     m0, m2, m7, 12
    pmaddwd     m0, [r3]                        ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    pmaddwd     m4, m2, [r3 - 15 * 16]          ; [1]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m2, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    palignr     m1, m3, m2, 4
    pmaddwd     m5, m1, [r3 - 13 * 16]          ; [3]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m1, [r3 + 4 * 16]               ; [20]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    palignr     m1, m3, m2, 8
    pmaddwd     m6, m1, [r3 - 11 * 16]          ; [5]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, [r3 + 6 * 16]               ; [22]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    palignr     m7, m3, m2, 12
    pmaddwd     m1, m7, [r3 - 9 * 16]           ; [7]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m7, [r3 + 8 * 16]               ; [24]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m1, m7

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 7 * 16]           ; [9]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    movu        m0, [r2 + 36]                   ; [25 24 23 22 21 20 19 18]
    palignr     m1, m0, 2                       ; [x 25 24 23 22 21 20 19]
    punpcklwd   m0, m1                          ; [22 21 21 20 20 19 19 18]

    palignr     m1, m0, m3, 4
    pmaddwd     m5, m1, [r3 - 5 * 16]           ; [11]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m1, [r3 + 12 * 16]              ; [28]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    palignr     m1, m0, m3, 8
    pmaddwd     m6, m1, [r3 - 3 * 16]           ; [13]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, [r3 + 14 * 16]              ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    palignr     m1, m0, m3, 12
    pmaddwd     m1, [r3 - 16]                   ; [15]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m1, m1
    movhps      m1, [r2 + 36]                   ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_6_30 %1 — 32x32 intra angular prediction, modes 6 and 30 (16-bit).
; (Body continues in the next chunk of the file.)
;-----------------------------------------------------------------------------
%macro MODE_6_30 1
    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movu        m3, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m1, m3, m0, 2                   ; [9 8 7 6 5 4 3 2]
    punpckhwd   m2, m0, m1                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m0, m1                          ; [5 4 4 3 3 2 2 1]

    pmaddwd     m4, m0, [r3 - 3 * 16]           ; [13]
    paddd       m4, [pd_16]
    psrld       m4, 5
;; (continuation of MODE_6_30: same interpolate/round/pack pipeline)
    pmaddwd     m1, m0, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    palignr     m1, m2, m0, 4
    pmaddwd     m5, m1, [r3 - 9 * 16]           ; [7]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m1, [r3 + 4 * 16]               ; [20]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    palignr     m1, m2, m0, 8
    pmaddwd     m6, m1, [r3 - 15 * 16]          ; [1]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m7, m1, [r3 - 2 * 16]           ; [14]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    pmaddwd     m1, [r3 + 11 * 16]              ; [27]
    paddd       m1, [pd_16]
    psrld       m1, 5

    palignr     m7, m2, m0, 12
    pmaddwd     m0, m7, [r3 - 8 * 16]           ; [8]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m7, [r3 + 5 * 16]           ; [21]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m2, [r3 - 14 * 16]          ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m2, [r3 - 16]               ; [15]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m2, [r3 + 12 * 16]          ; [28]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    palignr     m7, m3, m2, 4
    pmaddwd     m6, m7, [r3 - 7 * 16]           ; [9]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m7, [r3 + 6 * 16]           ; [22]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    movu        m0, [r2 + 34]                   ; [24 23 22 21 20 19 18 17]
    palignr     m2, m0, m3, 2                   ; [17 16 15 14 13 12 11 10]
    palignr     m1, m0, m3, 4                   ; [18 17 16 15 14 13 12 11]
    punpckhwd   m3, m2, m1                      ; [18 17 17 16 16 15 15 14]
    punpcklwd   m2, m1                          ; [14 13 13 12 12 11 11 10]

    palignr     m0, m2, m7, 4
    pmaddwd     m1, m0, [r3 - 13 * 16]          ; [3]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, [r3]                        ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    palignr     m4, m2, m7, 4
    pmaddwd     m4, [r3 + 13 * 16]              ; [29]
    paddd       m4, [pd_16]
    psrld       m4, 5

    palignr     m5, m2, m7, 8
    pmaddwd     m1, m5, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, [r3 + 7 * 16]               ; [23]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m1, m2, m7, 12
    pmaddwd     m6, m1, [r3 - 12 * 16]          ; [4]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m1, [r3 + 16]               ; [17]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, [r3 + 14 * 16]              ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m2, [r3 - 5 * 16]           ; [11]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m2, [r3 + 8 * 16]           ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    palignr     m5, m3, m2, 4
    pmaddwd     m4, m5, [r3 - 11 * 16]          ; [5]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m5, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, [r3 + 15 * 16]              ; [31]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m6, m3, m2, 8
    pmaddwd     m1, m6, [r3 - 4 * 16]           ; [12]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m5, m1

    pmaddwd     m6, [r3 + 9 * 16]               ; [25]
    paddd       m6, [pd_16]
    psrld       m6, 5

    palignr     m1, m3, m2, 12
    pmaddwd     m0, m1, [r3 - 10 * 16]          ; [6]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m6, m0

    pmaddwd     m1, [r3 + 3 * 16]               ; [19]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m1, m1
    movhps      m1, [r2 + 28]                   ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_7_29 %1 — 32x32 intra angular prediction, modes 7 and 29 (16-bit).
; NOTE(review): the second reference load is movd (4 bytes), not movu, even
; though the comment shows 8 words — only the low words feed the following
; palignr/punpck; confirm against upstream x265 before "fixing".
;-----------------------------------------------------------------------------
%macro MODE_7_29 1
    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movd        m3, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m1, m3, m0, 2                   ; [9 8 7 6 5 4 3 2]
    punpckhwd   m2, m0, m1                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m0, m1                          ; [5 4 4 3 3 2 2 1]

    pmaddwd     m4, m0, [r3 - 7 * 16]           ; [9]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m0, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m0, [r3 + 11 * 16]          ; [27]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m1, m2, m0, 4
    pmaddwd     m6, m1, [r3 - 12 * 16]          ; [4]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m1, [r3 - 3 * 16]           ; [13]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m7, m1, [r3 + 6 * 16]           ; [22]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m6, m7

    pmaddwd     m1, [r3 + 15 * 16]              ; [31]
    paddd       m1, [pd_16]
    psrld       m1, 5

    mova        m3, m0                          ; keep [5..1] pairs; m0 is reused below
    palignr     m7, m2, m0, 8
    pmaddwd     m0, m7, [r3 - 8 * 16]           ; [8]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m7, [r3 + 16]               ; [17]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m7, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    palignr     m1, m2, m3, 12
    pmaddwd     m5, m1, [r3 - 13 * 16]          ; [3]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m1, [r3 - 4 * 16]           ; [12]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m1, [r3 + 5 * 16]           ; [21]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, [r3 + 14 * 16]              ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m2, [r3 - 9 * 16]           ; [7]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m2, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    pmaddwd     m4, m2, [r3 + 9 * 16]           ; [25]
    paddd       m4, [pd_16]
    psrld       m4, 5

    movu        m7, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m1, m7, 2                       ; [x 16 15 14 13 12 11 10]
    punpcklwd   m7, m1                          ; [13 12 12 11 11 10 10 9]

    palignr     m6, m7, m2, 4
    pmaddwd     m1, m6, [r3 - 14 * 16]          ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m6, [r3 - 5 * 16]           ; [11]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m0, m6, [r3 + 4 * 16]           ; [20]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, [r3 + 13 * 16]              ; [29]
    paddd       m6, [pd_16]
    psrld       m6, 5

    palignr     m0, m7, m2, 8
    pmaddwd     m1, m0, [r3 - 10 * 16]          ; [6]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m0, [r3 - 16]               ; [15]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, [r3 + 8 * 16]               ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    palignr     m0, m7, m2, 12
    pmaddwd     m4, m0, [r3 - 15 * 16]          ; [1]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m0, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m0, [r3 + 3 * 16]           ; [19]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m0, [r3 + 12 * 16]              ; [28]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m7, [r3 - 11 * 16]          ; [5]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m0, m7, [r3 - 2 * 16]           ; [14]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m6, m0

    pmaddwd     m1, m7, [r3 + 7 * 16]           ; [23]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m1, m1
    movhps      m1, [r2 + 20]                   ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_8_28 %1 — 32x32 intra angular prediction, modes 8 and 28 (16-bit).
; Same movd-vs-comment caveat as MODE_7_29 above.
; (Body continues in the next chunk of the file.)
;-----------------------------------------------------------------------------
%macro MODE_8_28 1
    movu        m0, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    movd        m3, [r2 + 18]                   ; [16 15 14 13 12 11 10 9]
    palignr     m1, m3, m0, 2                   ; [9 8 7 6 5 4 3 2]
    punpckhwd   m2, m0, m1                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m0, m1                          ; [5 4 4 3 3 2 2 1]

    pmaddwd     m4, m0, [r3 - 11 * 16]          ; [5]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m0, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m0, [r3 - 16]               ; [15]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m0, [r3 + 4 * 16]           ; [20]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m0, [r3 + 9 * 16]           ; [25]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m0, [r3 + 14 * 16]          ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    palignr     m7, m2, m0, 4
    pmaddwd     m1, m7, [r3 - 13 * 16]          ; [3]
    paddd       m1, [pd_16]
    psrld       m1, 5

    mova        m3, m0                          ; preserve base pairs before m0 reuse
    pmaddwd     m0, m7, [r3 - 8 * 16]           ; [8]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m7, [r3 - 3 * 16]           ; [13]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m7, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m7, [r3 + 7 * 16]           ; [23]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m7, [r3 + 12 * 16]          ; [28]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    palignr     m7, m2, m3, 8
    pmaddwd     m6, m7, [r3 - 15 * 16]          ; [1]
    paddd       m6, [pd_16]
;; (continuation of MODE_8_28: same interpolate/round/pack pipeline)
    psrld       m6, 5

    pmaddwd     m1, m7, [r3 - 10 * 16]          ; [6]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m7, [r3 - 5 * 16]           ; [11]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m7, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    pmaddwd     m4, m7, [r3 + 5 * 16]           ; [21]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m7, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m7, [r3 + 15 * 16]          ; [31]
    paddd       m5, [pd_16]
    psrld       m5, 5

    palignr     m7, m2, m3, 12
    pmaddwd     m0, m7, [r3 - 12 * 16]          ; [4]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m7, [r3 - 7 * 16]           ; [9]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m7, [r3 - 2 * 16]           ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m7, [r3 + 3 * 16]           ; [19]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m7, [r3 + 8 * 16]           ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    pmaddwd     m4, m7, [r3 + 13 * 16]          ; [29]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m2, [r3 - 14 * 16]          ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m2, [r3 - 9 * 16]           ; [7]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m0, m2, [r3 - 4 * 16]           ; [12]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m2, [r3 + 16]               ; [17]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m0, m2, [r3 + 6 * 16]           ; [22]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m6, m0

    pmaddwd     m1, m2, [r3 + 11 * 16]          ; [27]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m1, m1
    movhps      m1, [r2 + 12]                   ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_9_27 %1 — 32x32 intra angular prediction, modes 9 and 27 (16-bit).
; Fractions advance uniformly by 2 per row, so the same pixel pairs (m3, then
; m7) are reused across all eight weights of each 8x8 sub-block.
;-----------------------------------------------------------------------------
%macro MODE_9_27 1
    movu        m3, [r2 + 2]                    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m3, 2                       ; [9 8 7 6 5 4 3 2]
    punpckhwd   m2, m3, m1                      ; [9 8 8 7 7 6 6 5]
    punpcklwd   m3, m1                          ; [5 4 4 3 3 2 2 1]

    pmaddwd     m4, m3, [r3 - 14 * 16]          ; [2]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 12 * 16]          ; [4]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 10 * 16]          ; [6]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 8 * 16]           ; [8]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 6 * 16]           ; [10]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 4 * 16]           ; [12]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 2 * 16]           ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 + 2 * 16]           ; [18]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 + 4 * 16]           ; [20]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 + 6 * 16]           ; [22]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 + 8 * 16]           ; [24]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 + 10 * 16]          ; [26]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 + 12 * 16]          ; [28]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 + 14 * 16]          ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5

    packusdw    m1, m1
    movhps      m1, [r2 + 4]                    ; [00]

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    palignr     m7, m2, m3, 4
    pmaddwd     m4, m7, [r3 - 14 * 16]          ; [2]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m7, [r3 - 12 * 16]          ; [4]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m7, [r3 - 10 * 16]          ; [6]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m0, m7, [r3 - 8 * 16]           ; [8]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m7, [r3 - 6 * 16]           ; [10]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m7, [r3 - 4 * 16]           ; [12]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m7, [r3 - 2 * 16]           ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m7, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    pmaddwd     m4, m7, [r3 + 2 * 16]           ; [18]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m7, [r3 + 4 * 16]           ; [20]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m7, [r3 + 6 * 16]           ; [22]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m0, m7, [r3 + 8 * 16]           ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m7, [r3 + 10 * 16]          ; [26]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m0, m7, [r3 + 12 * 16]          ; [28]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m6, m0

    pmaddwd     m7, [r3 + 14 * 16]              ; [30]
    paddd       m7, [pd_16]
    psrld       m7, 5
    packusdw    m7, m7
    movhps      m7, [r2 + 6]                    ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
%endmacro

;-----------------------------------------------------------------------------
; MODE_11_25 %1 — 32x32 intra angular prediction, modes 11 and 25 (16-bit).
; pw_punpcklwd duplicates adjacent reference words into (p, p+1) pairs so one
; pmaddwd computes the two-tap filter. (Body continues in the next chunk.)
;-----------------------------------------------------------------------------
%macro MODE_11_25 1
    movu        m3, [r2 + 2]                    ; [7 6 5 4 3 2 1 0]
    pshufb      m3, [pw_punpcklwd]              ; [4 3 3 2 2 1 1 0]

    pmaddwd     m4, m3, [r3 + 14 * 16]          ; [30]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 + 12 * 16]          ; [28]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 + 10 * 16]          ; [26]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 + 8 * 16]           ; [24]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 + 6 * 16]           ; [22]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 + 4 * 16]           ; [20]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 2 * 16]           ; [14]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 4 * 16]           ; [12]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 6 * 16]           ; [10]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 8 * 16]           ; [8]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 10 * 16]          ; [6]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 12 * 16]          ; [4]
;; (continuation of MODE_11_25: finish the [4]/[2]/[0] rows, then repeat the
;;  whole weight sweep on the window shifted one reference pixel back)
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 14 * 16]          ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5

    packusdw    m1, m1
    movhps      m1, [r2 + 2]                    ; [00]

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    movu        m3, [r2]                        ; [6 5 4 3 2 1 0 16]
    pshufb      m3, [pw_punpcklwd]              ; [3 2 2 1 1 0 0 16]

    pmaddwd     m4, m3, [r3 + 14 * 16]          ; [30]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 + 12 * 16]          ; [28]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 + 10 * 16]          ; [26]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m0, m3, [r3 + 8 * 16]           ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m3, [r3 + 6 * 16]           ; [22]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 + 4 * 16]           ; [20]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 2 * 16]           ; [14]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 4 * 16]           ; [12]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 6 * 16]           ; [10]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 8 * 16]           ; [8]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 10 * 16]          ; [6]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 12 * 16]          ; [4]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 14 * 16]          ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5

    packusdw    m1, m1
    movhps      m1, [r2]                        ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_12_24 %1 — 32x32 intra angular prediction, modes 12 and 24 (16-bit).
; NOTE(review): m2 holds a word-pairing shuffle mask prepared by the caller
; (cf. the pw_ang*_12_24 tables in the rodata section) — confirm at call site.
; (Body continues in the next chunk of the file.)
;-----------------------------------------------------------------------------
%macro MODE_12_24 1
    movu        m3, [r2 + 8]                    ; [7 6 5 4 3 2 1 0]
    pshufb      m3, m2                          ; [4 3 3 2 2 1 1 0]

    pmaddwd     m4, m3, [r3 + 11 * 16]          ; [27]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 + 6 * 16]           ; [22]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 + 16]               ; [17]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 4 * 16]           ; [12]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 9 * 16]           ; [7]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 14 * 16]          ; [2]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    movu        m3, [r2 + 6]                    ; step the window back one pixel
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 13 * 16]          ; [29]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3 + 8 * 16]           ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 + 3 * 16]           ; [19]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 2 * 16]           ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 7 * 16]           ; [9]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 12 * 16]          ; [4]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    movu        m3, [r2 + 4]
    pshufb      m3, m2

    pmaddwd     m6, m3, [r3 + 15 * 16]          ; [31]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 + 5 * 16]           ; [21]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 5 * 16]           ; [11]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 10 * 16]          ; [6]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 15 * 16]          ; [1]
    paddd       m5, [pd_16]
    psrld       m5, 5

    movu        m3, [r2 + 2]
    pshufb      m3, m2

    pmaddwd     m0, m3, [r3 + 12 * 16]          ; [28]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m3, [r3 + 7 * 16]           ; [23]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 3 * 16]           ; [13]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3 - 8 * 16]           ; [8]
    paddd       m0, [pd_16]
    psrld       m0, 5
;; (continuation of MODE_12_24: final 8x8 sub-block)
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 13 * 16]          ; [3]
    paddd       m4, [pd_16]
    psrld       m4, 5

    movu        m3, [r2]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 14 * 16]          ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 + 9 * 16]           ; [25]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 + 4 * 16]           ; [20]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 16]               ; [15]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 11 * 16]          ; [5]
    paddd       m1, [pd_16]
    psrld       m1, 5

    packusdw    m1, m1
    movhps      m1, [r2]                        ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_13_23 %1 — 32x32 intra angular prediction, modes 13 and 23 (16-bit).
; The reference window steps back by one pixel ([r2+16] down to [r2]) as the
; fraction wraps; m2 holds the caller-prepared pairing shuffle mask.
;-----------------------------------------------------------------------------
%macro MODE_13_23 1
    movu        m3, [r2 + 16]                   ; [7 6 5 4 3 2 1 0]
    pshufb      m3, m2                          ; [4 3 3 2 2 1 1 0]

    pmaddwd     m4, m3, [r3 + 7 * 16]           ; [23]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 2 * 16]           ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 11 * 16]          ; [05]
    paddd       m5, [pd_16]
    psrld       m5, 5

    movu        m3, [r2 + 14]
    pshufb      m3, m2

    pmaddwd     m6, m3, [r3 + 12 * 16]          ; [28]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 + 3 * 16]           ; [19]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 15 * 16]          ; [01]
    paddd       m1, [pd_16]
    psrld       m1, 5

    movu        m3, [r2 + 12]
    pshufb      m3, m2

    pmaddwd     m0, m3, [r3 + 8 * 16]           ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 16]               ; [15]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 10 * 16]          ; [06]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    movu        m3, [r2 + 10]
    pshufb      m3, m2

    pmaddwd     m5, m3, [r3 + 13 * 16]          ; [29]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 + 4 * 16]           ; [20]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 5 * 16]           ; [11]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 14 * 16]          ; [02]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    movu        m3, [r2 + 8]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 9 * 16]           ; [25]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 9 * 16]           ; [07]
    paddd       m4, [pd_16]
    psrld       m4, 5

    movu        m3, [r2 + 6]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 14 * 16]          ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 + 5 * 16]           ; [21]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m0, m3, [r3 - 4 * 16]           ; [12]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m3, [r3 - 13 * 16]          ; [03]
    paddd       m6, [pd_16]
    psrld       m6, 5

    movu        m3, [r2 + 4]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 + 16]               ; [17]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3 - 8 * 16]           ; [08]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    movu        m3, [r2 + 2]
    pshufb      m3, m2

    pmaddwd     m4, m3, [r3 + 15 * 16]          ; [31]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 + 6 * 16]           ; [22]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 3 * 16]           ; [13]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 12 * 16]          ; [04]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    movu        m3, [r2]
    pshufb      m3, m2

    pmaddwd     m6, m3, [r3 + 11 * 16]          ; [27]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 7 * 16]           ; [09]
    paddd       m1, [pd_16]
    psrld       m1, 5

    packusdw    m1, m1
    movhps      m1, [r2]                        ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_14_22 %1 — 32x32 intra angular prediction, modes 14 and 22 (16-bit).
; Steeper angle: the window starts further out ([r2+24]) and steps back one
; pixel roughly every two weights; m2 is the caller-prepared shuffle mask.
;-----------------------------------------------------------------------------
%macro MODE_14_22 1
    movu        m3, [r2 + 24]                   ; [7 6 5 4 3 2 1 0]
    pshufb      m3, m2                          ; [4 3 3 2 2 1 1 0]

    pmaddwd     m4, m3, [r3 + 3 * 16]           ; [19]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 10 * 16]          ; [06]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    movu        m3, [r2 + 22]
    pshufb      m3, m2

    pmaddwd     m5, m3, [r3 + 9 * 16]           ; [25]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 4 * 16]           ; [12]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    movu        m3, [r2 + 20]
    pshufb      m3, m2

    pmaddwd     m6, m3, [r3 + 15 * 16]          ; [31]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 + 2 * 16]           ; [18]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 11 * 16]          ; [05]
    paddd       m1, [pd_16]
    psrld       m1, 5

    movu        m3, [r2 + 18]
    pshufb      m3, m2

    pmaddwd     m0, m3, [r3 + 8 * 16]           ; [24]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 5 * 16]           ; [11]
    paddd       m4, [pd_16]
    psrld       m4, 5

    movu        m3, [r2 + 16]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 14 * 16]          ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 + 16]               ; [17]
    paddd       m5, [pd_16]
    psrld       m5, 5

    pmaddwd     m6, m3, [r3 - 12 * 16]          ; [04]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    movu        m3, [r2 + 14]
    pshufb      m3, m2

    pmaddwd     m6, m3, [r3 + 7 * 16]           ; [23]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 6 * 16]           ; [10]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    movu        m3, [r2 + 12]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 13 * 16]          ; [29]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3]                    ; [16]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1

    pmaddwd     m4, m3, [r3 - 13 * 16]          ; [03]
    paddd       m4, [pd_16]
    psrld       m4, 5

    movu        m3, [r2 + 10]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 6 * 16]           ; [22]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 7 * 16]           ; [09]
    paddd       m5, [pd_16]
    psrld       m5, 5

    movu        m3, [r2 + 8]
    pshufb      m3, m2

    pmaddwd     m0, m3, [r3 + 12 * 16]          ; [28]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m5, m0

    pmaddwd     m6, m3, [r3 - 16]               ; [15]
    paddd       m6, [pd_16]
    psrld       m6, 5

    pmaddwd     m1, m3, [r3 - 14 * 16]          ; [02]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    movu        m3, [r2 + 6]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 5 * 16]           ; [21]
    paddd       m1, [pd_16]
    psrld       m1, 5

    pmaddwd     m0, m3, [r3 - 8 * 16]           ; [08]
    paddd       m0, [pd_16]
    psrld       m0, 5
    packusdw    m1, m0

    TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1

    movu        m3, [r2 + 4]
    pshufb      m3, m2

    pmaddwd     m4, m3, [r3 + 11 * 16]          ; [27]
    paddd       m4, [pd_16]
    psrld       m4, 5

    pmaddwd     m1, m3, [r3 - 2 * 16]           ; [14]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 15 * 16]          ; [01]
    paddd       m5, [pd_16]
    psrld       m5, 5

    movu        m3, [r2 + 2]
    pshufb      m3, m2

    pmaddwd     m6, m3, [r3 + 4 * 16]           ; [20]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 9 * 16]           ; [07]
    paddd       m6, [pd_16]
    psrld       m6, 5

    movu        m3, [r2]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 10 * 16]          ; [26]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m6, m1

    pmaddwd     m1, m3, [r3 - 3 * 16]           ; [13]
    paddd       m1, [pd_16]
    psrld       m1, 5

    packusdw    m1, m1
    movhps      m1, [r2]                        ; [00]

    TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro

;-----------------------------------------------------------------------------
; MODE_15_21 %1 — 32x32 intra angular prediction, modes 15 and 21 (16-bit).
; (Body continues past this chunk of the file.)
;-----------------------------------------------------------------------------
%macro MODE_15_21 1
    movu        m3, [r2 + 32]                   ; [7 6 5 4 3 2 1 0]
    pshufb      m3, m2                          ; [4 3 3 2 2 1 1 0]

    pmaddwd     m4, m3, [r3 - 16]               ; [15]
    paddd       m4, [pd_16]
    psrld       m4, 5

    movu        m3, [r2 + 30]
    pshufb      m3, m2

    pmaddwd     m1, m3, [r3 + 14 * 16]          ; [30]
    paddd       m1, [pd_16]
    psrld       m1, 5
    packusdw    m4, m1

    pmaddwd     m5, m3, [r3 - 3 * 16]           ; [13]
    paddd       m5, [pd_16]
    psrld       m5, 5

    movu        m3, [r2 + 28]
    pshufb      m3, m2

    pmaddwd     m6, m3, [r3 + 12 * 16]          ; [28]
    paddd       m6, [pd_16]
    psrld       m6, 5
    packusdw    m5, m6

    pmaddwd     m6, m3, [r3 - 5 * 16]           ; [11]
    paddd       m6, [pd_16]
    psrld       m6, 5
movu m3, [r2 + 26] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 7 * 16] ; [09] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 24] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 9 * 16] ; [07] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 22] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 11 * 16] ; [05] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 20] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + pmaddwd m6, m3, [r3 - 13 * 16] ; [03] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 18] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 15 * 16] ; [01] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 16] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 15 * 16] ; [31] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 13 * 16] ; [29] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m0, m3, [r3 - 4 * 16] ; [12] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 11 * 16] ; [27] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 9 * 16] ; [25] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd 
m0, m3, [r3 - 8 * 16] ; [08] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 7 * 16] ; [23] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 5 * 16] ; [21] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 12 * 16] ; [04] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 3 * 16] ; [19] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [02] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 16] ; [17] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro + +%macro MODE_16_20 1 + movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 - 5 * 16] ; [11] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 38] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 6 * 16] ; [22] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 15 * 16] ; [01] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 36] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 4 * 16] ; [12] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 34] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 7 * 16] ; [23] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 14 * 16] ; [02] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 32] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 3 * 16] ; [13] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 30] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + pmaddwd m4, m3, [r3 - 
13 * 16] ; [03] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 28] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 2 * 16] ; [14] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 26] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 9 * 16] ; [25] + paddd m5, [pd_16] + psrld m5, 5 + + pmaddwd m6, m3, [r3 - 12 * 16] ; [04] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 24] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 16] ; [15] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 22] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + pmaddwd m1, m3, [r3 - 11 * 16] ; [05] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 20] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m3, [r2 + 18] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 11 * 16] ; [27] + paddd m4, [pd_16] + psrld m4, 5 + + pmaddwd m1, m3, [r3 - 10 * 16] ; [06] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 16] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 16] ; [17] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 12 * 16] ; [28] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + pmaddwd m6, m3, [r3 - 9 * 16] ; [07] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 2 * 16] ; [18] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 13 * 16] ; [29] + paddd m1, [pd_16] + psrld m1, 5 + + pmaddwd m0, m3, [r3 - 8 * 16] ; [08] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 3 * 16] ; [19] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 14 * 16] ; [30] + paddd m1, 
[pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 7 * 16] ; [09] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 4 * 16] ; [20] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 15 * 16] ; [31] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 5 * 16] ; [21] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro + +%macro MODE_17_19 1 + movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0] + pshufb m3, m2 ; [4 3 3 2 2 1 1 0] + + pmaddwd m4, m3, [r3 - 10 * 16] ; [06] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 48] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 46] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 2 * 16] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 44] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 8 * 16] ; [24] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 42] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 14 * 16] ; [30] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [04] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 40] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 38] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + + movu m3, [r2 + 36] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 34] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 14 * 16] ; [02] 
+ paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 32] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 8 * 16] ; [08] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 30] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 2 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 28] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 26] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2 + 26] ; [00] + + TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1 + + movu m3, [r2 + 24] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 - 10 * 16] ; [06] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 22] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 4 * 16] ; [12] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + movu m3, [r2 + 20] + pshufb m3, m2 + + pmaddwd m5, m3, [r3 + 2 * 16] ; [18] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 18] + pshufb m3, m2 + + pmaddwd m0, m3, [r3 + 8 * 16] ; [24] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m5, m0 + + movu m3, [r2 + 16] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 + 14 * 16] ; [30] + paddd m6, [pd_16] + psrld m6, 5 + + pmaddwd m1, m3, [r3 - 12 * 16] ; [04] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2 + 14] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 - 6 * 16] ; [10] + paddd m1, [pd_16] + psrld m1, 5 + + movu m3, [r2 + 12] + pshufb m3, m2 + + pmaddwd m0, m3, [r3] ; [16] + paddd m0, [pd_16] + psrld m0, 5 + packusdw m1, m0 + + TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1 + + movu m3, [r2 + 10] + pshufb m3, m2 + + pmaddwd m4, m3, [r3 + 6 * 16] ; [22] + paddd m4, [pd_16] + psrld m4, 5 + + movu m3, [r2 + 8] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 12 * 16] ; [28] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m4, m1 + + pmaddwd m5, m3, [r3 - 14 * 16] ; [02] + paddd m5, [pd_16] + psrld m5, 5 + + movu m3, [r2 + 6] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 8 * 
16] ; [08] + paddd m6, [pd_16] + psrld m6, 5 + packusdw m5, m6 + + movu m3, [r2 + 4] + pshufb m3, m2 + + pmaddwd m6, m3, [r3 - 2 * 16] ; [14] + paddd m6, [pd_16] + psrld m6, 5 + + movu m3, [r2 + 2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 4 * 16] ; [20] + paddd m1, [pd_16] + psrld m1, 5 + packusdw m6, m1 + + movu m3, [r2] + pshufb m3, m2 + + pmaddwd m1, m3, [r3 + 10 * 16] ; [26] + paddd m1, [pd_16] + psrld m1, 5 + + packusdw m1, m1 + movhps m1, [r2] ; [00] + + TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1 +%endmacro + +;------------------------------------------------------------------------------------------ +; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------ +INIT_XMM ssse3 +cglobal intra_pred_ang32_2, 3,6,6 + lea r4, [r2] + add r2, 128 + cmp r3m, byte 34 + cmove r2, r4 + + add r1, r1 + lea r3, [r1 * 2] + lea r4, [r1 * 3] + mov r5, 2 + +.loop: + MODE_2_34 + add r2, 32 + dec r5 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_3, 3,6,8 + add r2, 128 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_3_33 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_4, 3,6,8 + add r2, 128 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_4_32 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_5, 3,6,8 + add r2, 128 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_5_31 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_6, 3,6,8 + add r2, 128 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_6_30 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal 
intra_pred_ang32_7, 3,6,8 + add r2, 128 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_7_29 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_8, 3,6,8 + add r2, 128 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_8_28 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_9, 3,6,8 + add r2, 128 + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_9_27 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_10, 3,7,8 + add r2, 128 + mov r6d, 4 + add r1, r1 + lea r5, [r1 * 3] + lea r4, [r1 * 2] + lea r3, [r1 * 4] + mova m7, [c_mode32_10_0] + +.loop: + movu m0, [r2 + 2] + pshufb m1, m0, m7 + movu [r0], m1 + movu [r0 + 16], m1 + movu [r0 + 32], m1 + movu [r0 + 48], m1 + + palignr m1, m0, 2 + pshufb m1, m7 + movu [r0 + r1], m1 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 + 48], m1 + + palignr m1, m0, 4 + pshufb m1, m7 + movu [r0 + r4], m1 + movu [r0 + r4 + 16], m1 + movu [r0 + r4 + 32], m1 + movu [r0 + r4 + 48], m1 + + palignr m1, m0, 6 + pshufb m1, m7 + movu [r0 + r5], m1 + movu [r0 + r5 + 16], m1 + movu [r0 + r5 + 32], m1 + movu [r0 + r5 + 48], m1 + + add r0, r3 + + palignr m1, m0, 8 + pshufb m1, m7 + movu [r0], m1 + movu [r0 + 16], m1 + movu [r0 + 32], m1 + movu [r0 + 48], m1 + + palignr m1, m0, 10 + pshufb m1, m7 + movu [r0 + r1], m1 + movu [r0 + r1 + 16], m1 + movu [r0 + r1 + 32], m1 + movu [r0 + r1 + 48], m1 + + palignr m1, m0, 12 + pshufb m1, m7 + movu [r0 + r4], m1 + movu [r0 + r4 + 16], m1 + movu [r0 + r4 + 32], m1 + movu [r0 + r4 + 48], m1 + + palignr m1, m0, 14 + pshufb m1, m7 + movu [r0 + r5], m1 + movu [r0 + r5 + 16], m1 + movu [r0 + r5 + 32], m1 + movu [r0 + r5 + 48], m1 + + add r0, r3 + add r2, 16 + dec r6d + jnz .loop + RET + +INIT_XMM 
sse4 +cglobal intra_pred_ang32_11, 3,6,7,0-(4*mmsize+4) + mov r3, r2mp + add r2, 128 + movu m0, [r2 + 0*mmsize] + pinsrw m0, [r3], 0 + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 0*mmsize + 2], m0 + movu [rsp + 1*mmsize + 2], m1 + movu [rsp + 2*mmsize + 2], m2 + movu [rsp + 3*mmsize + 2], m3 + mov r4w, [r3+32] + mov [rsp], r4w + mov r4w, [r2+64] + mov [rsp+66], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + +.loop: + MODE_11_25 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_12, 3,6,7,0-(4*mmsize+10) + mov r3, r2mp + add r2, 128 + movu m0, [r2 + 0*mmsize] + pinsrw m0, [r3], 0 + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 0*mmsize + 8], m0 + movu [rsp + 1*mmsize + 8], m1 + movu [rsp + 2*mmsize + 8], m2 + movu [rsp + 3*mmsize + 8], m3 + + mov r4w, [r2+64] + mov [rsp+72], r4w + mov r4w, [r3+12] + mov [rsp+6], r4w + mov r4w, [r3+26] + mov [rsp+4], r4w + mov r4w, [r3+38] + mov [rsp+2], r4w + mov r4w, [r3+52] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_12_24 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_13, 3,6,7,0-(5*mmsize+2) + mov r3, r2mp + add r2, 128 + movu m0, [r2 + 0*mmsize] + pinsrw m0, [r3], 0 + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize], m0 + movu [rsp + 2*mmsize], m1 + movu [rsp + 3*mmsize], m2 + movu [rsp + 4*mmsize], m3 + + mov r4w, [r2+64] + mov [rsp+80], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 36] + pshufb m0, [shuf_mode_13_23] + pshufb m1, [shuf_mode_13_23] + movh [rsp + 8], m0 + movh [rsp], m1 + mov r4w, [r3+28] + mov [rsp+8], r4w + mov r4w, [r3+56] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov 
r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_13_23 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_14, 3,6,7,0-(5*mmsize+10) + mov r3, r2mp + add r2, 128 + movu m0, [r2 + 0*mmsize] + pinsrw m0, [r3], 0 + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize + 8], m0 + movu [rsp + 2*mmsize + 8], m1 + movu [rsp + 3*mmsize + 8], m2 + movu [rsp + 4*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 88], r4w + mov r4w, [r3+4] + mov [rsp+22], r4w + movu m0, [r3 + 10] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + pshufb m0, [shuf_mode_14_22] + pshufb m1, [shuf_mode_14_22] + pshufb m2, [shuf_mode_14_22] + movh [rsp + 14], m0 + movh [rsp + 6], m1 + movh [rsp - 2], m2 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_14_22 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_15, 3,6,7,0-(6*mmsize+2) + mov r3, r2mp + add r2, 128 + movu m0, [r2 + 0*mmsize] + pinsrw m0, [r3], 0 + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize], m0 + movu [rsp + 3*mmsize], m1 + movu [rsp + 4*mmsize], m2 + movu [rsp + 5*mmsize], m3 + + mov r4w, [r2 + 64] + mov [rsp + 96], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 18] + movu m2, [r3 + 34] + movu m3, [r3 + 48] + pshufb m0, [shuf_mode_15_21] + pshufb m1, [shuf_mode_15_21] + pshufb m2, [shuf_mode_15_21] + pshufb m3, [shuf_mode_15_21] + movh [rsp + 24], m0 + movh [rsp + 16], m1 + movh [rsp + 8], m2 + movh [rsp], m3 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_15_21 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_16, 3,6,7,0-(6*mmsize+10) + mov r3, r2mp + add r2, 128 
+ movu m0, [r2 + 0*mmsize] + pinsrw m0, [r3], 0 + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize + 8], m0 + movu [rsp + 3*mmsize + 8], m1 + movu [rsp + 4*mmsize + 8], m2 + movu [rsp + 5*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 104], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 22] + movu m2, [r3 + 40] + movd m3, [r3 + 58] + pshufb m0, [shuf_mode_16_20] + pshufb m1, [shuf_mode_16_20] + pshufb m2, [shuf_mode_16_20] + pshufb m3, [shuf_mode_16_20] + movu [rsp + 24], m0 + movu [rsp + 12], m1 + movu [rsp], m2 + movd [rsp], m3 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_16_20 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_17, 3,6,7,0-(7*mmsize+4) + mov r3, r2mp + add r2, 128 + movu m0, [r2 + 0*mmsize] + pinsrw m0, [r3], 0 + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 3*mmsize + 2], m0 + movu [rsp + 4*mmsize + 2], m1 + movu [rsp + 5*mmsize + 2], m2 + movu [rsp + 6*mmsize + 2], m3 + + mov r4w, [r2 + 64] + mov [rsp + 114], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + movd m3, [r3 + 2] + pshufb m0, [shuf_mode_17_19] + pshufb m1, [shuf_mode_17_19] + pshufb m2, [shuf_mode_17_19] + pshufb m3, [shuf_mode_16_20] + movd [rsp + 46], m3 + movu [rsp + 30], m0 + movu [rsp + 12], m1 + movu [rsp - 4], m2 + mov r4w, [r3 + 24] + mov [rsp + 30], r4w + mov r4w, [r3 + 28] + mov [rsp + 28], r4w + mov r4w, [r3 + 46] + mov [rsp + 12], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + +.loop: + MODE_17_19 1 + lea r0, [r0 + r1 * 4 ] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_18, 3,7,8 + mov r3, r2mp + add r2, 128 + movu m0, [r3] ; [7 6 5 4 3 2 1 0] + movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8] + movu 
m2, [r3 + 32] ; [23 22 21 20 19 18 17 16] + movu m3, [r3 + 48] ; [31 30 29 28 27 26 25 24] + movu m4, [r2 + 2] ; [8 7 6 5 4 3 2 1] + movu m5, [r2 + 18] ; [16 15 14 13 12 11 10 9] + + add r1, r1 + lea r6, [r1 * 2] + lea r3, [r1 * 3] + lea r4, [r1 * 4] + + movu [r0], m0 + movu [r0 + 16], m1 + movu [r0 + 32], m2 + movu [r0 + 48], m3 + + pshufb m4, [shuf_mode32_18] ; [1 2 3 4 5 6 7 8] + pshufb m5, [shuf_mode32_18] ; [9 10 11 12 13 14 15 16] + + palignr m6, m0, m4, 14 + movu [r0 + r1], m6 + palignr m6, m1, m0, 14 + movu [r0 + r1 + 16], m6 + palignr m6, m2, m1, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m3, m2, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m0, m4, 12 + movu [r0 + r6], m6 + palignr m6, m1, m0, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m2, m1, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m3, m2, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m0, m4, 10 + movu [r0 + r3], m6 + palignr m6, m1, m0, 10 + movu [r0 + r3 + 16], m6 + palignr m6, m2, m1, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m3, m2, 10 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m0, m4, 8 + movu [r0], m6 + palignr m6, m1, m0, 8 + movu [r0 + 16], m6 + palignr m6, m2, m1, 8 + movu [r0 + 32], m6 + palignr m6, m3, m2, 8 + movu [r0 + 48], m6 + + palignr m6, m0, m4, 6 + movu [r0 + r1], m6 + palignr m6, m1, m0, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m2, m1, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m3, m2, 6 + movu [r0 + r1 + 48], m6 + + palignr m6, m0, m4, 4 + movu [r0 + r6], m6 + palignr m6, m1, m0, 4 + movu [r0 + r6 + 16], m6 + palignr m6, m2, m1, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m3, m2, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m0, m4, 2 + movu [r0 + r3], m6 + palignr m6, m1, m0, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m2, m1, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m3, m2, 2 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + movu [r0], m4 + movu [r0 + 16], m0 + movu [r0 + 32], m1 + movu [r0 + 48], m2 + + palignr m6, m4, m5, 14 + movu [r0 + r1], m6 + palignr m6, m0, m4, 14 + 
movu [r0 + r1 + 16], m6 + palignr m6, m1, m0, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m2, m1, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m4, m5, 12 + movu [r0 + r6], m6 + palignr m6, m0, m4, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m1, m0, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m2, m1, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m4, m5, 10 + movu [r0 + r3], m6 + palignr m6, m0, m4, 10 + movu [r0 + r3 + 16], m6 + palignr m6, m1, m0, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m2, m1, 10 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m4, m5, 8 + movu [r0], m6 + palignr m6, m0, m4, 8 + movu [r0 + 16], m6 + palignr m6, m1, m0, 8 + movu [r0 + 32], m6 + palignr m6, m2, m1, 8 + movu [r0 + 48], m6 + + palignr m6, m4, m5, 6 + movu [r0 + r1], m6 + palignr m6, m0, m4, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m1, m0, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m2, m1, 6 + movu [r0 + r1 + 48], m6 + + palignr m6, m4, m5, 4 + movu [r0 + r6], m6 + palignr m6, m0, m4, 4 + movu [r0 + r6 + 16], m6 + palignr m6, m1, m0, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m2, m1, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m4, m5, 2 + movu [r0 + r3], m6 + palignr m6, m0, m4, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m1, m0, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m2, m1, 2 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + movu m2, [r2 + 34] + movu m3, [r2 + 50] + pshufb m2, [shuf_mode32_18] + pshufb m3, [shuf_mode32_18] + + movu [r0], m5 + movu [r0 + 16], m4 + movu [r0 + 32], m0 + movu [r0 + 48], m1 + + palignr m6, m5, m2, 14 + movu [r0 + r1], m6 + palignr m6, m4, m5, 14 + movu [r0 + r1 + 16], m6 + palignr m6, m0, m4, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m1, m0, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m5, m2, 12 + movu [r0 + r6], m6 + palignr m6, m4, m5, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m0, m4, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m1, m0, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m5, m2, 10 + movu [r0 + r3], m6 + palignr m6, m4, m5, 10 + movu [r0 + 
r3 + 16], m6 + palignr m6, m0, m4, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m1, m0, 10 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m5, m2, 8 + movu [r0], m6 + palignr m6, m4, m5, 8 + movu [r0 + 16], m6 + palignr m6, m0, m4, 8 + movu [r0 + 32], m6 + palignr m6, m1, m0, 8 + movu [r0 + 48], m6 + + palignr m6, m5, m2, 6 + movu [r0 + r1], m6 + palignr m6, m4, m5, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m0, m4, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m1, m0, 6 + movu [r0 + r1 + 48], m6 + + palignr m6, m5, m2, 4 + movu [r0 + r6], m6 + palignr m6, m4, m5, 4 + movu [r0 + r6 + 16], m6 + palignr m6, m0, m4, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m1, m0, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m5, m2, 2 + movu [r0 + r3], m6 + palignr m6, m4, m5, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m0, m4, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m1, m0, 2 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + movu [r0], m2 + movu [r0 + 16], m5 + movu [r0 + 32], m4 + movu [r0 + 48], m0 + + palignr m6, m2, m3, 14 + movu [r0 + r1], m6 + palignr m6, m5, m2, 14 + movu [r0 + r1 + 16], m6 + palignr m6, m4, m5, 14 + movu [r0 + r1 + 32], m6 + palignr m6, m0, m4, 14 + movu [r0 + r1 + 48], m6 + + palignr m6, m2, m3, 12 + movu [r0 + r6], m6 + palignr m6, m5, m2, 12 + movu [r0 + r6 + 16], m6 + palignr m6, m4, m5, 12 + movu [r0 + r6 + 32], m6 + palignr m6, m0, m4, 12 + movu [r0 + r6 + 48], m6 + + palignr m6, m2, m3, 10 + movu [r0 + r3], m6 + palignr m6, m5, m2, 10 + movu [r0 + r3 + 16], m6 + palignr m6, m4, m5, 10 + movu [r0 + r3 + 32], m6 + palignr m6, m0, m4, 10 + movu [r0 + r3 + 48], m6 + + add r0, r4 + + palignr m6, m2, m3, 8 + movu [r0], m6 + palignr m6, m5, m2, 8 + movu [r0 + 16], m6 + palignr m6, m4, m5, 8 + movu [r0 + 32], m6 + palignr m6, m0, m4, 8 + movu [r0 + 48], m6 + + palignr m6, m2, m3, 6 + movu [r0 + r1], m6 + palignr m6, m5, m2, 6 + movu [r0 + r1 + 16], m6 + palignr m6, m4, m5, 6 + movu [r0 + r1 + 32], m6 + palignr m6, m0, m4, 6 + movu [r0 + r1 + 48], m6 + + palignr 
m6, m2, m3, 4 + movu [r0 + r6], m6 + palignr m6, m5, m2, 4 + movu [r0 + r6 + 16], m6 + palignr m6, m4, m5, 4 + movu [r0 + r6 + 32], m6 + palignr m6, m0, m4, 4 + movu [r0 + r6 + 48], m6 + + palignr m6, m2, m3, 2 + movu [r0 + r3], m6 + palignr m6, m5, m2, 2 + movu [r0 + r3 + 16], m6 + palignr m6, m4, m5, 2 + movu [r0 + r3 + 32], m6 + palignr m6, m0, m4, 2 + movu [r0 + r3 + 48], m6 + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_19, 3,7,7,0-(7*mmsize+4) + lea r3, [r2 + 128] + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 3*mmsize + 2], m0 + movu [rsp + 4*mmsize + 2], m1 + movu [rsp + 5*mmsize + 2], m2 + movu [rsp + 6*mmsize + 2], m3 + + mov r4w, [r2 + 64] + mov [rsp + 114], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + movd m3, [r3 + 2] + pshufb m0, [shuf_mode_17_19] + pshufb m1, [shuf_mode_17_19] + pshufb m2, [shuf_mode_17_19] + pshufb m3, [shuf_mode_16_20] + movd [rsp + 46], m3 + movu [rsp + 30], m0 + movu [rsp + 12], m1 + movu [rsp - 4], m2 + mov r4w, [r3 + 24] + mov [rsp + 30], r4w + mov r4w, [r3 + 28] + mov [rsp + 28], r4w + mov r4w, [r3 + 46] + mov [rsp + 12], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_17_19 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_20, 3,7,7,0-(6*mmsize+10) + lea r3, [r2 + 128] + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize + 8], m0 + movu [rsp + 3*mmsize + 8], m1 + movu [rsp + 4*mmsize + 8], m2 + movu [rsp + 5*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 104], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 22] + movu m2, [r3 + 40] + movd m3, [r3 + 58] + pshufb m0, [shuf_mode_16_20] + pshufb m1, [shuf_mode_16_20] + pshufb m2, [shuf_mode_16_20] + pshufb m3, [shuf_mode_16_20] + movu [rsp + 24], 
m0 + movu [rsp + 12], m1 + movu [rsp], m2 + movd [rsp], m3 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_16_20 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_21, 3,7,7,0-(6*mmsize+2) + lea r3, [r2 + 128] + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 2*mmsize], m0 + movu [rsp + 3*mmsize], m1 + movu [rsp + 4*mmsize], m2 + movu [rsp + 5*mmsize], m3 + + mov r4w, [r2 + 64] + mov [rsp + 96], r4w + movu m0, [r3 + 4] + movu m1, [r3 + 18] + movu m2, [r3 + 34] + movu m3, [r3 + 48] + pshufb m0, [shuf_mode_15_21] + pshufb m1, [shuf_mode_15_21] + pshufb m2, [shuf_mode_15_21] + pshufb m3, [shuf_mode_15_21] + movh [rsp + 24], m0 + movh [rsp + 16], m1 + movh [rsp + 8], m2 + movh [rsp], m3 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_15_21 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_22, 3,7,7,0-(5*mmsize+10) + lea r3, [r2 + 128] + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize + 8], m0 + movu [rsp + 2*mmsize + 8], m1 + movu [rsp + 3*mmsize + 8], m2 + movu [rsp + 4*mmsize + 8], m3 + + mov r4w, [r2 + 64] + mov [rsp + 88], r4w + mov r4w, [r3+4] + mov [rsp+22], r4w + movu m0, [r3 + 10] + movu m1, [r3 + 30] + movu m2, [r3 + 50] + pshufb m0, [shuf_mode_14_22] + pshufb m1, [shuf_mode_14_22] + pshufb m2, [shuf_mode_14_22] + movh [rsp + 14], m0 + movh [rsp + 6], m1 + movh [rsp - 2], m2 + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_14_22 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + 
+INIT_XMM sse4 +cglobal intra_pred_ang32_23, 3,7,7,0-(5*mmsize+2) + lea r3, [r2 + 128] + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 1*mmsize], m0 + movu [rsp + 2*mmsize], m1 + movu [rsp + 3*mmsize], m2 + movu [rsp + 4*mmsize], m3 + + mov r4w, [r2+64] + mov [rsp+80], r4w + movu m0, [r3 + 8] + movu m1, [r3 + 36] + pshufb m0, [shuf_mode_13_23] + pshufb m1, [shuf_mode_13_23] + movh [rsp + 8], m0 + movh [rsp], m1 + mov r4w, [r3+28] + mov [rsp+8], r4w + mov r4w, [r3+56] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mova m2, [pw_punpcklwd] + mov r6, r0 + +.loop: + MODE_13_23 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_24, 3,7,7,0-(4*mmsize+10) + lea r3, [r2 + 128] + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + + movu [rsp + 0*mmsize + 8], m0 + movu [rsp + 1*mmsize + 8], m1 + movu [rsp + 2*mmsize + 8], m2 + movu [rsp + 3*mmsize + 8], m3 + + mov r4w, [r2+64] + mov [rsp+72], r4w + mov r4w, [r3+12] + mov [rsp+6], r4w + mov r4w, [r3+26] + mov [rsp+4], r4w + mov r4w, [r3+38] + mov [rsp+2], r4w + mov r4w, [r3+52] + mov [rsp], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + lea r5, [r1 * 3] + mov r6, r0 + mova m2, [pw_punpcklwd] + +.loop: + MODE_12_24 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang32_25, 3,7,7,0-(4*mmsize+4) + lea r3, [r2 + 128] + movu m0, [r2 + 0*mmsize] + movu m1, [r2 + 1*mmsize] + movu m2, [r2 + 2*mmsize] + movu m3, [r2 + 3*mmsize] + movu [rsp + 0*mmsize + 2], m0 + movu [rsp + 1*mmsize + 2], m1 + movu [rsp + 2*mmsize + 2], m2 + movu [rsp + 3*mmsize + 2], m3 + mov r4w, [r3+32] + mov [rsp], r4w + mov r4w, [r2+64] + mov [rsp+66], r4w + + lea r3, [ang_table + 16 * 16] + mov r4d, 8 + mov r2, rsp + add r1, r1 + 
;-----------------------------------------------------------------------------------
; NOTE(review): this region begins mid-function.  The lines up to the first RET
; are the tail of intra_pred_ang32_25 (its cglobal header lies above this view):
; an 8-pass driver loop in which each MODE_11_25 expansion (macro defined
; earlier in this file) renders one 4-pixel-wide column slice, after which the
; dst cursor (r6/r0) and the reference pointer (r2) advance by 8 bytes
; (4 x 16-bit pixels).
;-----------------------------------------------------------------------------------
    lea             r5, [r1 * 3]            ; r5 = 3 * byte stride (used inside macro)
    mov             r6, r0                  ; r6 = base of current column slice

.loop:
    MODE_11_25 0
    add             r6, 8                   ; next 4-column slice of dst
    mov             r0, r6
    add             r2, 8                   ; advance reference samples likewise
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_26 -- 32x32 angular prediction, mode 26 (vertical), 16-bit.
; Presumed x265 convention -- confirm against intrapred.h: r0 = dst,
; r1 = dstStride in pixels, r2 = reference samples.
; The 32 above-row samples (words at srcPix + 2 .. srcPix + 65) are replicated
; into all 32 output rows: 4 loop iterations writing 8 rows each.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_26, 3,7,5
    mov             r6d, 4                  ; 4 iterations x 8 rows = 32 rows
    add             r1, r1                  ; pixel stride -> byte stride (2-byte pixels)
    lea             r3, [r1 * 2]            ; r3 = 2 * stride
    lea             r4, [r1 * 3]            ; r4 = 3 * stride
    lea             r5, [r1 * 4]            ; r5 = 4 * stride (row-group step)
    mova            m4, [c_mode32_10_0]     ; NOTE(review): m4 is never referenced
                                            ; below -- looks like dead code

    movu            m0, [r2 + 2]            ; above-row samples 0..7
    movu            m1, [r2 + 18]           ; samples 8..15
    movu            m2, [r2 + 34]           ; samples 16..23
    movu            m3, [r2 + 50]           ; samples 24..31

.loop:
    ; rows +0 .. +3 of this 8-row group
    movu            [r0], m0
    movu            [r0 + 16], m1
    movu            [r0 + 32], m2
    movu            [r0 + 48], m3

    movu            [r0 + r1], m0
    movu            [r0 + r1 + 16], m1
    movu            [r0 + r1 + 32], m2
    movu            [r0 + r1 + 48], m3

    movu            [r0 + r3], m0
    movu            [r0 + r3 + 16], m1
    movu            [r0 + r3 + 32], m2
    movu            [r0 + r3 + 48], m3

    movu            [r0 + r4], m0
    movu            [r0 + r4 + 16], m1
    movu            [r0 + r4 + 32], m2
    movu            [r0 + r4 + 48], m3

    add             r0, r5                  ; step down 4 rows

    ; rows +4 .. +7
    movu            [r0], m0
    movu            [r0 + 16], m1
    movu            [r0 + 32], m2
    movu            [r0 + 48], m3

    movu            [r0 + r1], m0
    movu            [r0 + r1 + 16], m1
    movu            [r0 + r1 + 32], m2
    movu            [r0 + r1 + 48], m3

    movu            [r0 + r3], m0
    movu            [r0 + r3 + 16], m1
    movu            [r0 + r3 + 32], m2
    movu            [r0 + r3 + 48], m3

    movu            [r0 + r4], m0
    movu            [r0 + r4 + 16], m1
    movu            [r0 + r4 + 32], m2
    movu            [r0 + r4 + 48], m3

    add             r0, r5
    dec             r6d
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_27 -- 32x32 angular prediction, mode 27 (SSE4, 16-bit).
; Slice driver shared by modes 27..33 below: 8 passes, each MODE_9_27
; expansion fills one 4-pixel-wide column slice, then dst/src advance 8 bytes.
; r3 is biased to entry 16 of ang_table (16-byte entries, see SECTION_RODATA)
; so the slice macro can index the fractional-weight table relative to center.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
    lea             r3, [ang_table + 16 * 16]   ; r3 = &ang_table[16]
    add             r1, r1                      ; pixel stride -> byte stride
    lea             r5, [r1 * 3]
    mov             r6, r0
    mov             r4d, 8                      ; 8 slices x 4 columns = 32

.loop:
    MODE_9_27 0
    add             r6, 8
    mov             r0, r6
    add             r2, 8
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_28 -- mode 28; same driver as mode 27, kernel MODE_8_28.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    lea             r3, [ang_table + 16 * 16]
    add             r1, r1
    lea             r5, [r1 * 3]
    mov             r6, r0
    mov             r4d, 8

.loop:
    MODE_8_28 0
    add             r6, 8
    mov             r0, r6
    add             r2, 8
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_29 -- mode 29; same driver, kernel MODE_7_29.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    lea             r3, [ang_table + 16 * 16]
    add             r1, r1
    lea             r5, [r1 * 3]
    mov             r6, r0
    mov             r4d, 8

.loop:
    MODE_7_29 0
    add             r6, 8
    mov             r0, r6
    add             r2, 8
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_30 -- mode 30; same driver, kernel MODE_6_30.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    lea             r3, [ang_table + 16 * 16]
    add             r1, r1
    lea             r5, [r1 * 3]
    mov             r6, r0
    mov             r4d, 8

.loop:
    MODE_6_30 0
    add             r6, 8
    mov             r0, r6
    add             r2, 8
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_31 -- mode 31; same driver, kernel MODE_5_31.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    lea             r3, [ang_table + 16 * 16]
    add             r1, r1
    lea             r5, [r1 * 3]
    mov             r6, r0
    mov             r4d, 8

.loop:
    MODE_5_31 0
    add             r6, 8
    mov             r0, r6
    add             r2, 8
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_32 -- mode 32; same driver, kernel MODE_4_32.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    lea             r3, [ang_table + 16 * 16]
    add             r1, r1
    lea             r5, [r1 * 3]
    mov             r6, r0
    mov             r4d, 8

.loop:
    MODE_4_32 0
    add             r6, 8
    mov             r0, r6
    add             r2, 8
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; intra_pred_ang32_33 -- mode 33; same driver, kernel MODE_3_33.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    lea             r3, [ang_table + 16 * 16]
    add             r1, r1
    lea             r5, [r1 * 3]
    mov             r6, r0
    mov             r4d, 8
.loop:
    MODE_3_33 0
    add             r6, 8
    mov             r0, r6
    add             r2, 8
    dec             r4
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------
; void intra_filter_NxN(const pixel* references, pixel* filtered)
;
; [1 2 1]/4 smoothing of the intra reference-sample array, laid out as the top
; run followed by the left run of 16-bit samples.  Common scheme in all four
; sizes below: for each 8-word vector of samples c, build the previous-sample
; vector l and the next-sample vector r (palignr across adjacent loads, with
; pshufb/pinsrw patches at the array edges), then compute
;     out = (l + 2*c + r + 2) >> 2          (psllw/paddw/paddw pw_2/psrlw)
; The corner sample references[0] is smoothed with the first top and first
; left samples.  The last top sample and last left sample are saved in
; r2w/r3w up front and stored back unfiltered at the end (their filtered
; values are overwritten by the final word-sized movs).
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_filter_4x4, 2,4,5
    mov             r2w, word [r0 + 16]     ; topLast
    mov             r3w, word [r0 + 32]     ; LeftLast

    ; filtering top
    movu            m0, [r0 + 0]
    movu            m1, [r0 + 16]
    movu            m2, [r0 + 32]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m3, m1, m0, 4
    pshufb          m3, [intra_filter4_shuf1]       ; [8 7 6 5 4 3 2 9] samples[i + 1]
                                                    ; word 0 = first left sample,
                                                    ; i.e. the corner's 2nd neighbour
    psllw           m0, 1                   ; 2 * c
    paddw           m4, m3                  ; l + r
    paddw           m0, m4
    paddw           m0, [pw_2]              ; rounding bias
    psrlw           m0, 2                   ; (l + 2c + r + 2) >> 2

    ; filtering left
    palignr         m4, m1, m1, 14          ; rotated copy -> previous-sample vector
    pinsrw          m4, [r0], 1             ; patch corner in as predecessor
    palignr         m3, m2, m1, 4
    pshufb          m3, [intra_filter4_shuf1]   ; next-sample vector

    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2

    movu            [r1], m0
    movu            [r1 + 16], m1
    mov             [r1 + 16], r2w          ; topLast kept unfiltered
    mov             [r1 + 32], r3w          ; LeftLast kept unfiltered
    RET

;-----------------------------------------------------------------------------------
; intra_filter_8x8 -- same smoothing for the 8x8 reference array
; (topLast at word offset 16, LeftLast at word offset 32).
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_filter_8x8, 2,4,6
    mov             r2w, word [r0 + 32]     ; topLast
    mov             r3w, word [r0 + 64]     ; LeftLast

    ; filtering top (words 0..15)
    movu            m0, [r0]
    movu            m1, [r0 + 16]
    movu            m2, [r0 + 32]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; samples[i - 1] for words 0..7
    palignr         m5, m1, m0, 2                   ; samples[i + 1]
    pinsrw          m5, [r0 + 34], 0        ; corner's 2nd neighbour = first left sample

    palignr         m3, m1, m0, 14          ; samples[i - 1] for words 8..15
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2           ; samples[i + 1] for words 8..15
    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2
    movu            [r1], m0
    movu            [r1 + 16], m1

    ; filtering left (words 16..31); m2 still holds words 16..23
    movu            m1, [r0 + 48]
    movu            m0, [r0 + 64]

    palignr         m4, m2, m2, 14          ; previous-sample vector, rotated
    pinsrw          m4, [r0], 1             ; patch corner in as predecessor
    palignr         m5, m1, m2, 2           ; next-sample vector

    palignr         m3, m1, m2, 14          ; previous samples for words 24..31
    palignr         m0, m1, 2               ; next samples for words 24..31

    psllw           m2, 1
    paddw           m4, m5
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2

    psllw           m1, 1
    paddw           m0, m3
    paddw           m1, m0
    paddw           m1, [pw_2]
    psrlw           m1, 2

    movu            [r1 + 32], m2
    movu            [r1 + 48], m1
    mov             [r1 + 32], r2w          ; topLast kept unfiltered
    mov             [r1 + 64], r3w          ; LeftLast kept unfiltered
    RET

;-----------------------------------------------------------------------------------
; intra_filter_16x16 -- same smoothing for the 16x16 reference array
; (topLast at word offset 32, LeftLast at word offset 64).  Registers are
; recycled aggressively; each paragraph below handles one 8-word vector.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_filter_16x16, 2,4,6
    mov             r2w, word [r0 + 64]     ; topLast
    mov             r3w, word [r0 + 128]    ; LeftLast

    ; filtering top: words 0..15
    movu            m0, [r0]
    movu            m1, [r0 + 16]
    movu            m2, [r0 + 32]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; samples[i - 1]
    palignr         m5, m1, m0, 2                   ; samples[i + 1]
    pinsrw          m5, [r0 + 66], 0        ; corner's 2nd neighbour = first left sample

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    movu            [r1], m0
    movu            [r1 + 16], m5

    ; words 16..31
    movu            m0, [r0 + 48]
    movu            m5, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    movu            [r1 + 32], m1
    movu            [r1 + 48], m0

    ; filtering left: words 32..47 (m5 still holds words 32..39)
    movu            m1, [r0 + 80]
    movu            m2, [r0 + 96]

    palignr         m4, m5, m5, 14          ; previous-sample vector, rotated
    pinsrw          m4, [r0], 1             ; patch corner in as predecessor
    palignr         m0, m1, m5, 2           ; next-sample vector

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    movu            [r1 + 64], m3
    movu            [r1 + 80], m5

    ; words 48..63
    movu            m5, [r0 + 112]
    movu            m0, [r0 + 128]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    movu            [r1 + 96], m1
    movu            [r1 + 112], m5

    mov             [r1 + 64], r2w          ; topLast kept unfiltered
    mov             [r1 + 128], r3w         ; LeftLast kept unfiltered
    RET

;-----------------------------------------------------------------------------------
; intra_filter_32x32 -- same smoothing for the 32x32 reference array
; (topLast at word offset 64, LeftLast at word offset 128).  Each commented
; range ("0 to 15" etc.) covers 16 filtered words = two 8-word vectors.
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_filter_32x32, 2,4,6
    mov             r2w, word [r0 + 128]    ; topLast
    mov             r3w, word [r0 + 256]    ; LeftLast

    ; filtering top
    ; 0 to 15
    movu            m0, [r0 + 0]
    movu            m1, [r0 + 16]
    movu            m2, [r0 + 32]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; samples[i - 1]
    palignr         m5, m1, m0, 2                   ; samples[i + 1]
    pinsrw          m5, [r0 + 130], 0       ; corner's 2nd neighbour = first left sample

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    movu            [r1], m0
    movu            [r1 + 16], m5

    ; 16 to 31
    movu            m0, [r0 + 48]
    movu            m5, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m2, m0, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    movu            [r1 + 32], m1
    movu            [r1 + 48], m2

    ; 32 to 47
    movu            m1, [r0 + 80]
    movu            m2, [r0 + 96]

    palignr         m3, m5, m0, 14
    palignr         m4, m1, m5, 2

    psllw           m0, m5, 1
    paddw           m3, m4
    paddw           m0, m3
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m3, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    movu            [r1 + 64], m0
    movu            [r1 + 80], m5

    ; 48 to 63
    movu            m0, [r0 + 112]
    movu            m5, [r0 + 128]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    movu            [r1 + 96], m1
    movu            [r1 + 112], m0

    ; filtering left (m5 holds words 64..71)
    ; 64 to 79
    movu            m1, [r0 + 144]
    movu            m2, [r0 + 160]

    palignr         m4, m5, m5, 14          ; previous-sample vector, rotated
    pinsrw          m4, [r0], 1             ; patch corner in as predecessor
    palignr         m0, m1, m5, 2           ; next-sample vector

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    movu            [r1 + 128], m3
    movu            [r1 + 144], m5

    ; 80 to 95
    movu            m5, [r0 + 176]
    movu            m0, [r0 + 192]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m2, m5, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    movu            [r1 + 160], m1
    movu            [r1 + 176], m2

    ; 96 to 111
    movu            m1, [r0 + 208]
    movu            m2, [r0 + 224]

    palignr         m3, m0, m5, 14
    palignr         m4, m1, m0, 2

    psllw           m5, m0, 1
    paddw           m3, m4
    paddw           m5, m3
    paddw           m5, [pw_2]
    psrlw           m5, 2

    palignr         m3, m1, m0, 14
    palignr         m4, m2, m1, 2

    psllw           m0, m1, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    movu            [r1 + 192], m5
    movu            [r1 + 208], m0

    ; 112 to 127
    movu            m5, [r0 + 240]
    movu            m0, [r0 + 256]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    movu            [r1 + 224], m1
    movu            [r1 + 240], m5

    mov             [r1 + 128], r2w         ; topLast kept unfiltered
    mov             [r1 + 256], r3w         ; LeftLast kept unfiltered
    RET

;-----------------------------------------------------------------------------------
; intra_filter_4x4 (AVX2) -- same [1 2 1]/4 smoothing as the SSE4 4x4 variant,
; but top and left runs are processed together in one 256-bit register
; (low lane = top words 0..7, high lane = left words 8..15).
;-----------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_filter_4x4, 2,4,4
    mov             r2w, word [r0 + 16]     ; topLast
    mov             r3w, word [r0 + 32]     ; LeftLast

    ; filtering top
    movu            m0, [r0]                ; all 16 reference words
    vpbroadcastw    m2, xm0                 ; corner sample in every word
    movu            m1, [r0 + 16]

    palignr         m3, m0, m2, 14          ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
    pshufb          m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
    palignr         m1, m0, 4               ; [9 8 7 6 5 4 3 2]
    palignr         m1, m1, 14              ; [9 8 7 6 5 4 3 2]  samples[i + 1]

    psllw           m0, 1                   ; 2 * c
    paddw           m3, m1                  ; l + r
    paddw           m0, m3
    paddw           m0, [pw_2]              ; rounding bias
    psrlw           m0, 2                   ; (l + 2c + r + 2) >> 2

    movu            [r1], m0
    mov             [r1 + 16], r2w          ; topLast kept unfiltered
    mov             [r1 + 32], r3w          ; LeftLast kept unfiltered
    RET