;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32
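
; The c_ang* tables below store interleaved (32 - fract), fract byte pairs so
; that a single pmaddubsw evaluates the HEVC angular interpolation
;     pred[x] = (ref[idx] * (32 - fract) + ref[idx + 1] * fract + 16) >> 5
; where fract = (deltaPos & 31) for the row being predicted. The *_shuf*
; tables are pshufb masks that gather the (ref[idx], ref[idx + 1]) byte pairs
; those multiplies consume.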

const intra_pred_shuff_0_8,     times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
                                        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9

intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
intra_filter4_shuf1:  times 2 db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
intra_filter4_shuf2:  times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

pb_0_8        times 8 db  0,  8
pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
pb_swap8:     times 2 db  7,  6,  5,  4,  3,  2,  1,  0
c_trans_4x4           db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
const tab_S1,         db 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0
const tab_S2,         db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
const tab_Si,         db  0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
pb_fact0:             db  0,  2,  4,  6,  8, 10, 12, 14,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_12_0:        db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13,  7,  0
c_mode32_13_0:        db  3,  6, 10, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_13_shuf:     db  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  3,  2,  1,  0
c_mode32_14_shuf:     db 15, 14, 13,  0,  2,  3,  4,  5,  6,  7, 10, 11, 12, 13, 14, 15
c_mode32_14_0:        db 15, 12, 10,  7,  5,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_15_0:        db 15, 13, 11,  9,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_16_0:        db 15, 14, 12, 11,  9,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0
c_mode32_17_0:        db 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0
c_mode32_18_0:        db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
c_mode16_12:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
c_mode16_13:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
c_mode16_14:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
c_mode16_15:          db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2
c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
c_mode16_18:          db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1

ALIGN 32
c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_26_20:         db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src3_11_4_12:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_14_8:          db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src5_13_5_13:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_2_28:          db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
c_ang8_src6_14_7_15:  db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
c_ang8_22_16:         db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

c_ang8_21_10:         db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
c_ang8_src2_10_3_11:  db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_20:         db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src4_12_4_12:  times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_9_30:          db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
c_ang8_src5_13_6_14:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
c_ang8_19_8:          db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8

c_ang8_17_2:          db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
c_ang8_19_4:          db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
c_ang8_21_6:          db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
c_ang8_23_8:          db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src4_12_5_13:  db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12

c_ang8_13_26:         db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
c_ang8_7_20:          db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_1_14:          db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
c_ang8_27_8:          db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src2_10_2_10:  db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_src3_11_3_11:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

c_ang8_31_8:          db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_13_22:         db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
c_ang8_27_4:          db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
c_ang8_9_18:          db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18

c_ang8_5_10:          db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
c_ang8_15_20:         db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_25_30:         db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
c_ang8_3_8:           db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8

c_ang8_mode_27:       db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

c_ang8_mode_25:       db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

c_ang8_mode_24:       db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
c_ang16_mode_25:      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0

ALIGN 32
c_ang16_mode_11:      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                      db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                      db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                      db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0


ALIGN 32
c_ang16_mode_12:      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
                      db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
                      db  8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16


ALIGN 32
c_ang16_mode_13:      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                      db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
                      db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
                      db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_28:      db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                      db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                      db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                      db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                      db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_9:       db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                      db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                      db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                      db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0

ALIGN 32
c_ang16_mode_27:      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                      db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0

ALIGN 32
intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15

ALIGN 32
c_ang16_mode_29:     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9,  14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
                     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_30:      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
                      db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                      db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_31:      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
                      db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6,  9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8,  7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
                      db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
                      db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_24:     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                     db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                     db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_23:     db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
                     db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                     db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_22:     db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
                     db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
intra_pred_shuff_0_4:    times 4 db 0, 1, 1, 2, 2, 3, 3, 4
intra_pred4_shuff1:      db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff2:      db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff31:     db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
intra_pred4_shuff33:     db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
intra_pred4_shuff3:      db 8, 9, 9, 10, 10, 11, 11, 12, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff4:      db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff5:      db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff6:      db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff7:      db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff9:      db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13
intra_pred4_shuff12:     db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
intra_pred4_shuff13:     db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff14:     db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff15:     db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff16:     db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff17:     db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
intra_pred4_shuff19:     db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
intra_pred4_shuff20:     db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff21:     db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff22:     db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
intra_pred4_shuff23:     db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3

c_ang4_mode_27:          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_28:          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_29:          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_30:          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_31:          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_32:          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_33:          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_5:           db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_6:           db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_7:           db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_8:           db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_9:           db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_11:          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_12:          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_13:          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_14:          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_15:          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_16:          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_17:          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_19:          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_20:          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_21:          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_22:          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_23:          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_24:          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_25:          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
;; (blkSize - 1 - x)
pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
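
; For reference, HEVC planar prediction for an NxN block is
;     pred[y][x] = ((N-1-x)*left[y] + (x+1)*topRight
;                   + (N-1-y)*above[x] + (y+1)*bottomLeft + N) >> (log2(N)+1)
; pw_planar4_0 is the (N-1-x) horizontal weight row for N = 4; the (x+1)
; multipliers for the larger blocks come from pw_planar16_mul and
; pw_planar32_mul (cextern'd below).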
ALIGN 32
c_ang8_mode_13:       db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                      db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
c_ang8_mode_14:       db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                      db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                      db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                      db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
c_ang8_mode_15:       db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                      db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

const c_ang8_mode_16,       db 8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 10, 12, 13, 15, 0, 0

const intra_pred8_shuff16,  db 0, 1, 1, 2, 3, 3, 4, 5
                            db 1, 2, 2, 3, 4, 4, 5, 6
                            db 2, 3, 3, 4, 5, 5, 6, 7
                            db 3, 4, 4, 5, 6, 6, 7, 8
                            db 4, 5, 5, 6, 7, 7, 8, 9

const angHor8_tab_16,       db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24

const c_ang8_mode_20,       db 15, 13, 12, 10, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0

; NOTE: this big table improves speed by ~10%; if a broadcast instruction that works on the high 128 bits becomes available in the future, we can remove the table
const angHor8_tab_20,       times 8 db (32-24), 24
                            times 8 db (32-13), 13
                            times 8 db (32- 2),  2
                            times 8 db (32-23), 23
                            times 8 db (32-12), 12
                            times 8 db (32- 1),  1
                            times 8 db (32-22), 22
                            times 8 db (32-11), 11
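
; A minimal usage sketch of these weight tables (illustrative only; the
; register choice and row offset here are hypothetical, not the exact kernel):
;     pshufb    m0, [intra_pred_shuff_0_8]  ; m0 = (ref[i], ref[i+1]) byte pairs
;     pmaddubsw m0, [angHor8_tab_20 + r3]   ; r3 = row * 16 -> (32-f, f) weights
;     pmulhrsw  m0, [pw_1024]               ; (x*1024 + 2^14) >> 15 == (x+16) >> 5
;     packuswb  m0, m0                      ; clamp back to 8-bit pixels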

const ang16_shuf_mode9,    times 8 db 0, 1
                           times 8 db 1, 2

const angHor_tab_9,  db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
                     db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32

const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
                     db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0

const ang16_shuf_mode12,   db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
                           db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2

const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
                     db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16

const ang16_shuf_mode13,   db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
                           db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0

const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
                     db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16

const ang16_shuf_mode14,   db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
                           db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0

const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
                     db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16

const ang16_shuf_mode15,   db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
                           db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
                           db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0

const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
                     db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32-16), 16

const ang16_shuf_mode16,   db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
                           db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
                           db 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0

const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
                     db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16

const ang16_shuf_mode17,   db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
                           db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
                           db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0

const angHor_tab_17, db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
                     db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0

; Constants for the 32x32 angular intra predictors, modes 1 to 33
const ang32_shuf_mode9,         times 8 db 0, 1
                                times 8 db 1, 2

const ang32_shuf_mode11,        times 8 db 1, 2
                                times 8 db 0, 1

const ang32_fact_mode12,        db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7),  7, (32- 2),  2, (32-29), 29, (32-24), 24
                                db (32-11), 11, (32- 6),  6, (32- 1),  1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8),  8
                                db (32-19), 19, (32-14), 14, (32- 9),  9, (32- 4),  4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
                                db (32- 3),  3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15, (32-10), 10, (32- 5),  5, (32- 0),  0
const ang32_shuf_mode12,        db  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  3,  4,  3,  4,  2,  3,  2,  3,  2,  3,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2
                                db  3,  4,  3,  4,  3,  4,  3,  4,  2,  3,  2,  3,  2,  3,  2,  3,  1,  2,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
const ang32_shuf_mode24,        db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13, 13,  6,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 10,  3,  3
                                dd  0,  0,  7,  3,  0,  0,  7,  3

const ang32_fact_mode13,        db (32-23), 23, (32-14), 14, (32- 5),  5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1),  1, (32-24), 24
                                db (32- 7),  7, (32-30), 30, (32-21), 21, (32-12), 12, (32- 3),  3, (32-26), 26, (32-17), 17, (32- 8),  8
                                db (32-15), 15, (32- 6),  6, (32-29), 29, (32-20), 20, (32-11), 11, (32- 2),  2, (32-25), 25, (32-16), 16
                                db (32-31), 31, (32-22), 22, (32-13), 13, (32- 4),  4, (32-27), 27, (32-18), 18, (32- 9),  9, (32- 0),  0
const ang32_shuf_mode13,        db 14, 15, 14, 15, 14, 15, 13, 14, 13, 14, 13, 14, 13, 14, 12, 13, 10, 11,  9, 10,  9, 10,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9
                                db 12, 13, 12, 13, 11, 12, 11, 12, 11, 12, 11, 12, 10, 11, 10, 11,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  6,  7,  6,  7,  6,  7
                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 11,  7,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12,  9,  5,  2
const ang32_shuf_mode23,        db  0,  0,  0,  0,  0,  0,  0,  0, 14, 14, 11, 11,  7,  7,  4,  4,  0,  0,  0,  0,  0,  0,  0,  0, 12, 12,  9,  9,  5,  5,  2,  2

const ang32_fact_mode14,        db (32-19), 19, (32- 6),  6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5),  5, (32-24), 24
                                db (32- 3),  3, (32-22), 22, (32- 9),  9, (32-28), 28, (32-15), 15, (32- 2),  2, (32-21), 21, (32- 8),  8
                                db (32-11), 11, (32-30), 30, (32-17), 17, (32- 4),  4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
                                db (32-27), 27, (32-14), 14, (32- 1),  1, (32-20), 20, (32- 7),  7, (32-26), 26, (32-13), 13, (32- 0),  0
const ang32_shuf_mode14,        db 14, 15, 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 12, 13, 11, 12,  8,  9,  7,  8,  7,  8,  6,  7,  6,  7,  6,  7,  5,  6,  5,  6
                                db 11, 12, 10, 11, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  4,  5,  4,  5,  4,  5,  3,  4,  3,  4,  2,  3,  2,  3,  2,  3
                                db  0,  0,  0,  0,  0,  0,  0,  0, 15, 12, 10,  7,  5,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 11,  9,  6,  4,  1
const ang32_shuf_mode22,        db  0,  0, 15, 15, 13, 13, 10, 10,  8,  8,  5,  5,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 12,  9,  9,  7,  7,  4,  4,  2

const ang32_fact_mode15,        db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9),  9, (32-24), 24
                                db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8),  8
                                db (32- 7),  7, (32-22), 22, (32- 5),  5, (32-20), 20, (32- 3),  3, (32-18), 18, (32- 1),  1, (32-16), 16
                                db (32-23), 23, (32- 6),  6, (32-21), 21, (32- 4),  4, (32-19), 19, (32- 2),  2, (32-17), 17, (32- 0),  0
const ang32_shuf_mode15,        db 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 11, 12, 11, 12, 10, 11,  5,  6,  5,  6,  4,  5,  4,  5,  3,  4,  3,  4,  2,  3,  2,  3
                                db 12, 13, 11, 12, 11, 12, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  3,  4,  3,  4,  2,  3,  2,  3,  1,  2,  1,  2,  0,  1,  0,  1
                                db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0, 14, 12, 10,  8,  7,  5,  3,  1
const ang32_shuf_mode21,        db 15, 15, 13, 13, 11, 11,  9,  9,  8,  8,  6,  6,  4,  4,  2,  2, 14, 14, 12, 12, 10, 10,  8,  8,  7,  7,  5,  5,  3,  3,  1,  1

const ang32_fact_mode16,        db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
                                db (32- 3),  3, (32-14), 14, (32-25), 25, (32- 4),  4, (32-15), 15, (32-26), 26, (32- 5),  5, (32-16), 16
                                db (32-27), 27, (32- 6),  6, (32-17), 17, (32-28), 28, (32- 7),  7, (32-18), 18, (32-29), 29, (32- 8),  8
                                db (32-19), 19, (32-30), 30, (32- 9),  9, (32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0),  0
const ang32_shuf_mode16,        db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8,  7,  8,  6,  7,  5,  6,  5,  6,  4,  5
                                db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7,  6,  7,  5,  6,  5,  6
                                db  0,  0,  0,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0, 14, 13, 11, 10,  8,  7,  5,  4,  2,  1
                                dd  7,  1,  2,  3,  7,  1,  2,  3
const ang32_shuf_mode20,        db 12, 11,  9,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0, 14, 15,  8,  7,  5,  4,  2,  1,  0,  0, 14, 13, 13, 11, 11, 10, 10,  8
                                db  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  1,  1,  0,  0

const ang32_fact_mode17,        db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
                                db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
const ang32_shuf_mode17,        db 14, 15, 13, 14, 12, 13, 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  7,  8,  6,  7,  6,  7,  5,  6,  4,  5,  3,  4,  2,  3,  2,  3
                                db  0,  0,  0,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0
const ang32_shuf_mode19,        db  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15
                                dd  0,  0,  2,  3,  0,  0,  7,  1
                                dd  0,  0,  5,  6,  0,  0,  0,  0

const ang_table
%assign x 0
%rep 32
    times 8 db (32-x), x
%assign x x+1
%endrep

const ang_table_avx2
%assign x 0
%rep 32
    times 16 db (32-x), x
%assign x x+1
%endrep

const pw_ang_table
%assign x 0
%rep 32
    times 4 dw (32-x), x
%assign x x+1
%endrep
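
; All three tables are generated with %rep: entry x holds the pair (32-x, x)
; broadcast across one register width, so a kernel can index an arbitrary
; fractional position directly, e.g. [ang_table + x * 16],
; [ang_table_avx2 + x * 32] or [pw_ang_table + x * 16] for the word variant.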

SECTION .text
cextern pb_1
cextern pw_2
cextern pw_3
cextern pw_4
cextern pw_7
cextern pw_8
cextern pw_16
cextern pw_15
cextern pw_31
cextern pw_32
cextern pw_257
cextern pw_512
cextern pw_1024
cextern pw_4096
cextern pw_00ff
cextern pb_unpackbd1
cextern multiL
cextern multiH
cextern multiH2
cextern multiH3
cextern multi_2Row
cextern trans8_shuf
cextern pw_planar16_mul
cextern pw_planar32_mul

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
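; In scalar terms these kernels implement (a sketch of the HEVC rule):
;     dcVal     = (sum(above[0..N-1]) + sum(left[0..N-1]) + N) >> (log2(N)+1)
;     dst[y][x] = dcVal                                  ; whole NxN block
; and, when bFilter is set (luma, N <= 16):
;     dst[0][0] = (above[0] + 2*dcVal + left[0] + 2) >> 2
;     dst[0][x] = (above[x] + 3*dcVal + 2) >> 2          ; x = 1..N-1
;     dst[y][0] = (left[y]  + 3*dcVal + 2) >> 2          ; y = 1..N-1
; srcPix layout: [0] = top-left, [1..2N] = above and above-right,
; [2N+1..4N] = left and below-left, hence the source offsets 1, 9, 17, 33
; used below.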
INIT_XMM sse2
cglobal intra_pred_dc4, 5,5,3
    inc         r2
    pxor        m0, m0
    movu        m1, [r2]
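    ; 0xF8 selects dwords {0, 2, 3, 3}: the low qword now holds the four
    ; above pixels (bytes 0-3) and the four left pixels (bytes 8-11 of srcPix+1)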
    pshufd      m1, m1, 0xF8
    psadbw      m1, m0              ; m1 = sum

    test        r4d, r4d

    paddw       m1, [pw_4]
    psraw       m1, 3
    movd        r4d, m1             ; r4d = dc_val
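    ; dc_val * 257 = (dc_val << 8) | dc_val: replicate the byte in each word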
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00

    ; store DC 4x4
    lea         r3, [r1 * 3]
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    movd        [r0 + r3], m1

    ; do DC filter
    jz         .end
    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
    add         r4d, r3d            ; r4d = DC * 3 + 2
    movd        m1, r4d
    pshuflw     m1, m1, 0           ; m1 = pixDCx3

    ; filter top
    movd        m2, [r2]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2            ; overwrite top-left pixel, we will update it later

    ; filter top-left
    movzx       r4d, byte [r2 + 8]
    add         r3d, r4d
    movzx       r4d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left
    add         r0, r1
    movq        m2, [r2 + 9]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
%if ARCH_X86_64
    movq        r4, m2
    mov         [r0], r4b
    shr         r4, 8
    mov         [r0 + r1], r4b
    shr         r4, 8
    mov         [r0 + r1 * 2], r4b
%else
    movd        r2d, m2
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
%endif
.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc8, 5, 7, 3
    pxor            m0,            m0
    movh            m1,            [r2 + 1]
    movh            m2,            [r2 + 17]
    punpcklqdq      m1,            m2
    psadbw          m1,            m0
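    ; psadbw leaves one partial sum per qword; fold the high sum into the low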
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    paddw           m1,            [pw_8]
    psraw           m1,            4
    pmullw          m1,            [pw_257]
    pshuflw         m1,            m1, 0x00       ; m1 = byte [dc_val ...]

    test            r4d,           r4d

    ; store DC 8x8
    lea             r6,            [r1 + r1 * 2]
    lea             r5,            [r6 + r1 * 2]
    movh            [r0],          m1
    movh            [r0 + r1],     m1
    movh            [r0 + r1 * 2], m1
    movh            [r0 + r6],     m1
    movh            [r0 + r1 * 4], m1
    movh            [r0 + r5],     m1
    movh            [r0 + r6 * 2], m1
    lea             r5,            [r5 + r1 * 2]
    movh            [r0 + r5],     m1

    ; Do DC Filter
    jz              .end
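    ; m1 words hold dc*257 = (dc << 8) | dc; shifting right by 8 recovers dc_val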
    psrlw           m1,            8
    movq            m2,            [pw_2]
    pmullw          m2,            m1
    paddw           m2,            [pw_2]
    movd            r4d,           m2             ; r4d = DC * 2 + 2
    paddw           m1,            m2             ; m1 = DC * 3 + 2
    pshufd          m1,            m1, 0

    ; filter top
    movq            m2,            [r2 + 1]
    punpcklbw       m2,            m0
    paddw           m2,            m1
    psraw           m2,            2              ; (pix + 3*DC + 2) >> 2
    packuswb        m2,            m2
    movh            [r0],          m2

    ; filter top-left
    movzx           r3d, byte      [r2 + 17]
    add             r4d,           r3d
    movzx           r3d, byte      [r2 + 1]
    add             r3d,           r4d
    shr             r3d,           2
    mov             [r0],          r3b

    ; filter left
    movq            m2,            [r2 + 18]
    punpcklbw       m2,            m0
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    movd            r2d,           m2
    lea             r0,            [r0 + r1]
    lea             r5,            [r6 + r1 * 2]
    mov             [r0],          r2b
    shr             r2,            8
    mov             [r0 + r1],     r2b
    shr             r2,            8
    mov             [r0 + r1 * 2], r2b
    shr             r2,            8
    mov             [r0 + r6],     r2b
    pshufd          m2,            m2, 0x01
    movd            r2d,           m2
    mov             [r0 + r1 * 4], r2b
    shr             r2,            8
    mov             [r0 + r5],     r2b
    shr             r2,            8
    mov             [r0 + r6 * 2], r2b

.end:
    RET

;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;--------------------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal intra_pred_dc16, 5, 10, 4
%else
cglobal intra_pred_dc16, 5, 7, 4
%endif
    pxor            m0,            m0
    movu            m1,            [r2 + 1]
    movu            m2,            [r2 + 33]
    psadbw          m1,            m0
    psadbw          m2,            m0
    paddw           m1,            m2
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    paddw           m1,            [pw_16]
    psraw           m1,            5
    pmullw          m1,            [pw_257]
    pshuflw         m1,            m1, 0x00       ; m1 = byte [dc_val ...]
    pshufd          m1,            m1, 0x00


    test            r4d,           r4d

    ; store DC 16x16
%if ARCH_X86_64
    lea             r6,            [r1 + r1 * 2]        ;index 3
    lea             r7,            [r1 + r1 * 4]        ;index 5
    lea             r8,            [r6 + r1 * 4]        ;index 7
    lea             r9,            [r0 + r8]            ;base + 7
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r6],     m1
    movu            [r0 + r1 * 4], m1
    movu            [r0 + r7],     m1
    movu            [r0 + r6 * 2], m1
    movu            [r0 + r8],     m1
    movu            [r0 + r1 * 8], m1
    movu            [r9 + r1 * 2], m1
    movu            [r0 + r7 * 2], m1
    movu            [r9 + r1 * 4], m1
    movu            [r0 + r6 * 4], m1
    movu            [r9 + r6 * 2], m1
    movu            [r0 + r8 * 2], m1
    movu            [r9 + r1 * 8], m1
%else ;32 bit
    mov             r6,            r0
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
%endif
    ; Do DC Filter
    jz              .end
    psrlw           m1,            8
    mova            m2,            [pw_2]
    pmullw          m2,            m1
    paddw           m2,            [pw_2]
    movd            r4d,           m2
    paddw           m1,            m2

    ; filter top
    movh            m2,            [r2 + 1]
    punpcklbw       m2,            m0
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    movh            m3,            [r2 + 9]
    punpcklbw       m3,            m0
    paddw           m3,            m1
    psraw           m3,            2
    packuswb        m3,            m3

    ; filter top-left
    movzx           r5d, byte      [r2 + 33]
    add             r4d,           r5d
    movzx           r3d, byte      [r2 + 1]
    add             r3d,           r4d
    shr             r3d,           2

%if ARCH_X86_64
    movh            [r0],          m2
    movh            [r0 + 8],      m3
    mov             [r0],          r3b
%else ;32 bit
    movh            [r6],          m2
    movh            [r6 + 8],      m3
    mov             [r6],          r3b
    add             r6,            r1
%endif

    ; filter left
    movh            m2,            [r2 + 34]
    punpcklbw       m2,            m0
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2

    movh            m3,            [r2 + 42]
    punpcklbw       m3,            m0
    paddw           m3,            m1
    psraw           m3,            2
    packuswb        m3,            m3
%if ARCH_X86_64
    movh            r3,            m2
    mov             [r0 + r1],     r3b
    shr             r3,            8
    mov             [r0 + r1 * 2], r3b
    shr             r3,            8
    mov             [r0 + r6],     r3b
    shr             r3,            8
    mov             [r0 + r1 * 4], r3b
    shr             r3,            8
    mov             [r0 + r7],     r3b
    shr             r3,            8
    mov             [r0 + r6 * 2], r3b
    shr             r3,            8
    mov             [r0 + r8],     r3b
    shr             r3,            8
    mov             [r0 + r1 * 8], r3b
    movh            r3,            m3
    mov             [r9 + r1 * 2], r3b
    shr             r3,            8
    mov             [r0 + r7 * 2], r3b
    shr             r3,            8
    mov             [r9 + r1 * 4], r3b
    shr             r3,            8
    mov             [r0 + r6 * 4], r3b
    shr             r3,            8
    mov             [r9 + r6 * 2], r3b
    shr             r3,            8
    mov             [r0 + r8 * 2], r3b
    shr             r3,            8
    mov             [r9 + r1 * 8], r3b
%else ;32 bit
    movd            r2d,            m2
    pshufd          m2,            m2, 0x01
    mov             [r6],          r2b
    shr             r2,            8
    mov             [r6 + r1],     r2b
    shr             r2,            8
    mov             [r6 + r1 * 2], r2b
    lea             r6,            [r6 + r1 * 2]
    shr             r2,            8
    mov             [r6 + r1],     r2b
    movd            r2d,           m2
    mov             [r6 + r1 * 2], r2b
    lea             r6,            [r6 + r1 * 2]
    shr             r2,            8
    mov             [r6 + r1],     r2b
    shr             r2,            8
    mov             [r6 + r1 * 2], r2b
    lea             r6,            [r6 + r1 * 2]
    shr             r2,            8
    mov             [r6 + r1],     r2b
    movd            r2d,            m3
    pshufd          m3,             m3, 0x01
    mov             [r6 + r1 * 2], r2b
    lea             r6,            [r6 + r1 * 2]
    shr             r2,            8
    mov             [r6 + r1],     r2b
    shr             r2,            8
    mov             [r6 + r1 * 2], r2b
    lea             r6,            [r6 + r1 * 2]
    shr             r2,            8
    mov             [r6 + r1],     r2b
    movd            r2d,           m3
    mov             [r6 + r1 * 2], r2b
    lea             r6,            [r6 + r1 * 2]
    shr             r2,            8
    mov             [r6 + r1],     r2b
    shr             r2,            8
    mov             [r6 + r1 * 2], r2b
%endif
.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
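; N = 32: same dcVal computation over 64 neighbours; there is no bFilter path
; because the DC edge filter only applies to 16x16 and smaller blocks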
INIT_XMM sse2
cglobal intra_pred_dc32, 3, 3, 5
    pxor            m0,            m0
    movu            m1,            [r2 + 1]
    movu            m2,            [r2 + 17]
    movu            m3,            [r2 + 65]
    movu            m4,            [r2 + 81]
    psadbw          m1,            m0
    psadbw          m2,            m0
    psadbw          m3,            m0
    psadbw          m4,            m0
    paddw           m1,            m2
    paddw           m3,            m4
    paddw           m1,            m3
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    paddw           m1,            [pw_32]
    psraw           m1,            6
    pmullw          m1,            [pw_257]
    pshuflw         m1,            m1, 0x00       ; m1 = byte [dc_val ...]
    pshufd          m1,            m1, 0x00

%assign x 0
%rep 16
    ; store DC 32x32 (two 32-byte rows per iteration)
    movu            [r0],               m1
    movu            [r0 + r1],          m1
    movu            [r0 + 16],          m1
    movu            [r0 + r1 + 16],     m1
%if x < 15              ; no pointer advance needed after the last row pair
    lea             r0,            [r0 + 2 * r1]
%endif
%assign x x+1
%endrep
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
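; Scalar sketch of the planar kernels below (illustrative, following the HEVC
; reference; N = blkSize, topRight = above[N], bottomLeft = left[N]):
;   dst[x, y] = ( (N - 1 - x) * left[y]  + (x + 1) * topRight
;               + (N - 1 - y) * above[x] + (y + 1) * bottomLeft
;               + N ) >> (log2(N) + 1)
; Everything except the left[y] term lives in a row accumulator that is
; stepped by (bottomLeft - above[x]) as y advances.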
INIT_XMM sse2
cglobal intra_pred_planar4, 3,3,5
    pxor            m0, m0
    movh            m1, [r2 + 1]
    punpcklbw       m1, m0
    movh            m2, [r2 + 9]
    punpcklbw       m2, m0
    pshufhw         m3, m1, 0               ; topRight
    pshufd          m3, m3, 0xAA
    pshufhw         m4, m2, 0               ; bottomLeft
    pshufd          m4, m4, 0xAA
    pmullw          m3, [multi_2Row]        ; (x + 1) * topRight
    pmullw          m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]
    paddw           m3, [pw_4]
    paddw           m3, m4
    paddw           m3, m0
    psubw           m4, m1

    pshuflw         m1, m2, 0
    pmullw          m1, [pw_planar4_0]
    paddw           m1, m3
    paddw           m3, m4
    psraw           m1, 3
    packuswb        m1, m1
    movd            [r0], m1

    pshuflw         m1, m2, 01010101b
    pmullw          m1, [pw_planar4_0]
    paddw           m1, m3
    paddw           m3, m4
    psraw           m1, 3
    packuswb        m1, m1
    movd            [r0 + r1], m1
    lea             r0, [r0 + 2 * r1]

    pshuflw         m1, m2, 10101010b
    pmullw          m1, [pw_planar4_0]
    paddw           m1, m3
    paddw           m3, m4
    psraw           m1, 3
    packuswb        m1, m1
    movd            [r0], m1

    pshuflw         m1, m2, 11111111b
    pmullw          m1, [pw_planar4_0]
    paddw           m1, m3
    psraw           m1, 3
    packuswb        m1, m1
    movd            [r0 + r1], m1
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
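; N = 8: shift is 4; the (7 - x) left weights are taken from the second half
; of the pw_planar16_mul table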
INIT_XMM sse2
cglobal intra_pred_planar8, 3,3,6
    pxor            m0, m0
    movh            m1, [r2 + 1]
    punpcklbw       m1, m0
    movh            m2, [r2 + 17]
    punpcklbw       m2, m0

    movd            m3, [r2 + 9]            ; topRight   = above[8];
    movd            m4, [r2 + 25]           ; bottomLeft = left[8];

    pand            m3, [pw_00ff]
    pand            m4, [pw_00ff]
    pshuflw         m3, m3, 0x00
    pshuflw         m4, m4, 0x00
    pshufd          m3, m3, 0x44
    pshufd          m4, m4, 0x44
    pmullw          m3, [multiL]            ; (x + 1) * topRight
    pmullw          m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x]
    paddw           m3, [pw_8]
    paddw           m3, m4
    paddw           m3, m0
    psubw           m4, m1

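; INTRA_PRED_PLANAR_8 y: emit row y - broadcast left[y] from m2, weight it by
; (7 - x), add the row accumulator m3, then step m3 by the per-row delta m4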
%macro INTRA_PRED_PLANAR_8 1
%if (%1 < 4)
    pshuflw         m5, m2, 0x55 * %1
    pshufd          m5, m5, 0
%else
    pshufhw         m5, m2, 0x55 * (%1 - 4)
    pshufd          m5, m5, 0xAA
%endif
    pmullw          m5, [pw_planar16_mul + mmsize]  ; * (7 - x) left weights
    paddw           m5, m3
    psraw           m5, 4
    packuswb        m5, m5
    movh            [r0], m5
%if (%1 < 7)
    paddw           m3, m4
    lea             r0, [r0 + r1]
%endif
%endmacro

    INTRA_PRED_PLANAR_8 0
    INTRA_PRED_PLANAR_8 1
    INTRA_PRED_PLANAR_8 2
    INTRA_PRED_PLANAR_8 3
    INTRA_PRED_PLANAR_8 4
    INTRA_PRED_PLANAR_8 5
    INTRA_PRED_PLANAR_8 6
    INTRA_PRED_PLANAR_8 7
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
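; N = 16: shift is 5; each row is produced as two 8-pixel halves with separate
; accumulators (m3 for x = 0..7, m4 for x = 8..15)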
INIT_XMM sse2
cglobal intra_pred_planar16, 3,5,8
    pxor            m0, m0
    movh            m2, [r2 + 1]
    punpcklbw       m2, m0
    movh            m7, [r2 + 9]
    punpcklbw       m7, m0

    movd            m3, [r2 + 17]               ; topRight   = above[16]
    movd            m6, [r2 + 49]               ; bottomLeft = left[16]
    pand            m3, [pw_00ff]
    pand            m6, [pw_00ff]
    pshuflw         m3, m3, 0x00
    pshuflw         m6, m6, 0x00
    pshufd          m3, m3, 0x44                ; v_topRight
    pshufd          m6, m6, 0x44                ; v_bottomLeft
    pmullw          m4, m3, [multiH]            ; (x + 1) * topRight
    pmullw          m3, [multiL]                ; (x + 1) * topRight
    pmullw          m1, m2, [pw_15]             ; (blkSize - 1 - y) * above[x]
    pmullw          m5, m7, [pw_15]             ; (blkSize - 1 - y) * above[x]
    paddw           m4, [pw_16]
    paddw           m3, [pw_16]
    paddw           m4, m6
    paddw           m3, m6
    paddw           m4, m5
    paddw           m3, m1
    psubw           m1, m6, m7
    psubw           m6, m2

    movh            m2, [r2 + 33]
    punpcklbw       m2, m0
    movh            m7, [r2 + 41]
    punpcklbw       m7, m0

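; INTRA_PRED_PLANAR_16 y: emit row y - left[y] comes from m2 (y < 8) or m7
; (y >= 8); m6 and m1 hold the per-row deltas of the low/high accumulators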
%macro INTRA_PRED_PLANAR_16 1
%if (%1 < 4)
    pshuflw         m5, m2, 0x55 * %1
    pshufd          m5, m5, 0
%else
%if (%1 < 8)
    pshufhw         m5, m2, 0x55 * (%1 - 4)
    pshufd          m5, m5, 0xAA
%else
%if (%1 < 12)
    pshuflw         m5, m7, 0x55 * (%1 - 8)
    pshufd          m5, m5, 0
%else
    pshufhw         m5, m7, 0x55 * (%1 - 12)
    pshufd          m5, m5, 0xAA
%endif
%endif
%endif
%if (%1 > 0)
    paddw           m3, m6
    paddw           m4, m1
    lea             r0, [r0 + r1]
%endif
    pmullw          m0, m5, [pw_planar16_mul + mmsize] ; left[y] * (15 - x), x = 8..15
    pmullw          m5, [pw_planar16_mul]              ; left[y] * (15 - x), x = 0..7
    paddw           m0, m4
    paddw           m5, m3
    psraw           m5, 5
    psraw           m0, 5
    packuswb        m5, m0
    movu            [r0], m5
%endmacro

    INTRA_PRED_PLANAR_16 0
    INTRA_PRED_PLANAR_16 1
    INTRA_PRED_PLANAR_16 2
    INTRA_PRED_PLANAR_16 3
    INTRA_PRED_PLANAR_16 4
    INTRA_PRED_PLANAR_16 5
    INTRA_PRED_PLANAR_16 6
    INTRA_PRED_PLANAR_16 7
    INTRA_PRED_PLANAR_16 8
    INTRA_PRED_PLANAR_16 9
    INTRA_PRED_PLANAR_16 10
    INTRA_PRED_PLANAR_16 11
    INTRA_PRED_PLANAR_16 12
    INTRA_PRED_PLANAR_16 13
    INTRA_PRED_PLANAR_16 14
    INTRA_PRED_PLANAR_16 15
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
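; N = 32: shift is 6; the x86_64 build keeps all four row accumulators and
; their deltas in xmm registers, the x86_32 build spills the deltas to stack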
INIT_XMM sse2
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,3,16
    movd            m3, [r2 + 33]               ; topRight   = above[32]

    pxor            m7, m7
    pand            m3, [pw_00ff]
    pshuflw         m3, m3, 0x00
    pshufd          m3, m3, 0x44

    pmullw          m0, m3, [multiL]            ; (x + 1) * topRight
    pmullw          m1, m3, [multiH]            ; (x + 1) * topRight
    pmullw          m2, m3, [multiH2]           ; (x + 1) * topRight
    pmullw          m3, [multiH3]               ; (x + 1) * topRight

    movd            m11, [r2 + 97]               ; bottomLeft = left[32]
    pand            m11, [pw_00ff]
    pshuflw         m11, m11, 0x00
    pshufd          m11, m11, 0x44
    mova            m5,  m11
    paddw           m5,  [pw_32]

    paddw           m0, m5
    paddw           m1, m5
    paddw           m2, m5
    paddw           m3, m5
    mova            m8, m11
    mova            m9, m11
    mova            m10, m11
    mova            m12, [pw_31]
    movh            m4, [r2 + 1]
    punpcklbw       m4, m7
    psubw           m8, m4
    pmullw          m4, m12
    paddw           m0, m4

    movh            m4, [r2 + 9]
    punpcklbw       m4, m7
    psubw           m9, m4
    pmullw          m4, m12
    paddw           m1, m4

    movh            m4, [r2 + 17]
    punpcklbw       m4, m7
    psubw           m10, m4
    pmullw          m4, m12
    paddw           m2, m4

    movh            m4, [r2 + 25]
    punpcklbw       m4, m7
    psubw           m11, m4
    pmullw          m4, m12
    paddw           m3, m4
    mova            m12, [pw_planar32_mul]
    mova            m13, [pw_planar32_mul + mmsize]
    mova            m14, [pw_planar16_mul]
    mova            m15, [pw_planar16_mul + mmsize]
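; PROCESS reg: emit one 32-pixel row - reg holds left[y] broadcast to words,
; m0-m3 are the four 8-wide row accumulators, m12-m15 the (31 - x) weights;
; INCREMENT then steps the accumulators to the next row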
%macro PROCESS 1
    pmullw          m5, %1, m12
    pmullw          m6, %1, m13
    paddw           m5, m0
    paddw           m6, m1
    psraw           m5, 6
    psraw           m6, 6
    packuswb        m5, m6
    movu            [r0], m5

    pmullw          m5, %1, m14
    pmullw          %1, m15
    paddw           m5, m2
    paddw           %1, m3
    psraw           m5, 6
    psraw           %1, 6
    packuswb        m5, %1
    movu            [r0 + 16], m5
%endmacro

%macro INCREMENT 0
    paddw           m2, m10
    paddw           m3, m11
    paddw           m0, m8
    paddw           m1, m9
    add             r0, r1
%endmacro

%assign x 0
%rep 4
    pxor            m7, m7
    movq            m4, [r2 + 65 + x * 8]
    punpcklbw       m4, m7
%assign y 0
%rep 8
    %if y < 4
    pshuflw         m7, m4, 0x55 * y
    pshufd          m7, m7, 0x44
    %else
    pshufhw         m7, m4, 0x55 * (y - 4)
    pshufd          m7, m7, 0xEE
    %endif
    PROCESS m7
    %if x + y < 10
    INCREMENT
    %endif
%assign y y+1
%endrep
%assign x x+1
%endrep
    RET

%else ;end ARCH_X86_64, start ARCH_X86_32
cglobal intra_pred_planar32, 3,3,8,0-(4*mmsize)
    movd            m3, [r2 + 33]               ; topRight   = above[32]

    pxor            m7, m7
    pand            m3, [pw_00ff]
    pshuflw         m3, m3, 0x00
    pshufd          m3, m3, 0x44

    pmullw          m0, m3, [multiL]            ; (x + 1) * topRight
    pmullw          m1, m3, [multiH]            ; (x + 1) * topRight
    pmullw          m2, m3, [multiH2]           ; (x + 1) * topRight
    pmullw          m3, [multiH3]               ; (x + 1) * topRight

    movd            m6, [r2 + 97]               ; bottomLeft = left[32]
    pand            m6, [pw_00ff]
    pshuflw         m6, m6, 0x00
    pshufd          m6, m6, 0x44
    mova            m5, m6
    paddw           m5, [pw_32]

    paddw           m0, m5
    paddw           m1, m5
    paddw           m2, m5
    paddw           m3, m5

    movh            m4, [r2 + 1]
    punpcklbw       m4, m7
    psubw           m5, m6, m4
    mova            [rsp + 0 * mmsize], m5
    pmullw          m4, [pw_31]
    paddw           m0, m4
    movh            m4, [r2 + 9]
    punpcklbw       m4, m7
    psubw           m5, m6, m4
    mova            [rsp + 1 * mmsize], m5
    pmullw          m4, [pw_31]
    paddw           m1, m4
    movh            m4, [r2 + 17]
    punpcklbw       m4, m7
    psubw           m5, m6, m4
    mova            [rsp + 2 * mmsize], m5
    pmullw          m4, [pw_31]
    paddw           m2, m4
    movh            m4, [r2 + 25]
    punpcklbw       m4, m7
    psubw           m5, m6, m4
    mova            [rsp + 3 * mmsize], m5
    pmullw          m4, [pw_31]
    paddw           m3, m4
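; same PROCESS/INCREMENT scheme as the x86_64 path above, except the per-row
; deltas are kept on the stack instead of in m8-m11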
%macro PROCESS 1
    pmullw          m5, %1, [pw_planar32_mul]
    pmullw          m6, %1, [pw_planar32_mul + mmsize]
    paddw           m5, m0
    paddw           m6, m1
    psraw           m5, 6
    psraw           m6, 6
    packuswb        m5, m6
    movu            [r0], m5
    pmullw          m5, %1, [pw_planar16_mul]
    pmullw          %1, [pw_planar16_mul + mmsize]
    paddw           m5, m2
    paddw           %1, m3
    psraw           m5, 6
    psraw           %1, 6
    packuswb        m5, %1
    movu            [r0 + 16], m5
%endmacro

%macro INCREMENT 0
    paddw           m0, [rsp + 0 * mmsize]
    paddw           m1, [rsp + 1 * mmsize]
    paddw           m2, [rsp + 2 * mmsize]
    paddw           m3, [rsp + 3 * mmsize]
    add             r0, r1
%endmacro

%assign y 0
%rep 4
    pxor            m7, m7
    movq            m4, [r2 + 65 + y * 8]
    punpcklbw       m4, m7
%assign x 0
%rep 8
    %if x < 4
    pshuflw         m7, m4, 0x55 * x
    pshufd          m7, m7, 0x44
    %else
    pshufhw         m7, m4, 0x55 * (x - 4)
    pshufd          m7, m7, 0xEE
    %endif

    PROCESS m7
    %if x + y < 10
    INCREMENT
    %endif
%assign x x+1
%endrep
%assign y y+1
%endrep
    RET

%endif ; end ARCH_X86_32

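; STORE_4x4: scatter the four packed rows in m0 to dst; clobbers r1 (advanced
; to 3 * stride), so it must be the last step before RET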
%macro STORE_4x4 0
    movd        [r0], m0
    psrldq      m0, 4
    movd        [r0 + r1], m0
    psrldq      m0, 4
    movd        [r0 + r1 * 2], m0
    lea         r1, [r1 * 3]
    psrldq      m0, 4
    movd        [r0 + r1], m0
%endmacro

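; TRANSPOSE_4x4: transpose the 4x4 word matrix held in m0 (rows 0-1) and m2
; (rows 2-3), packing the result to 16 bytes in m0 ready for STORE_4x4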
%macro TRANSPOSE_4x4 0
    pshufd      m0, m0, 0xD8
    pshufd      m1, m2, 0xD8
    pshuflw     m0, m0, 0xD8
    pshuflw     m1, m1, 0xD8
    pshufhw     m0, m0, 0xD8
    pshufhw     m1, m1, 0xD8
    mova        m2, m0
    punpckldq   m0, m1
    punpckhdq   m2, m1
    packuswb    m0, m2
%endmacro

;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
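; Scalar sketch of the angular kernels below (illustrative, following the HEVC
; reference; ref[] is the projected neighbour array - roughly, modes 2-17 read
; the left column and store transposed, modes 18-34 read the top row):
;   frac      = ((y + 1) * intraPredAngle) & 31
;   dst[x, y] = ((32 - frac) * ref[i] + frac * ref[i + 1] + 16) >> 5
; punpcklbw/psrldq arrange (ref[i], ref[i+1]) in adjacent words, and each
; 16-byte pw_ang_table entry k is assumed to hold the word pair (32 - k, k)
; repeated, so a single pmaddwd yields the four numerators of one row.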
INIT_XMM sse2
cglobal intra_pred_ang4_2, 3,5,1
    lea         r4, [r2 + 2]
    add         r2, 10
    cmp         r3m, byte 34
    cmove       r2, r4

    movh        m0, [r2]
    movd        [r0], m0
    psrldq      m0, 1
    movd        [r0 + r1], m0
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0
    lea         r1, [r1 * 3]
    psrldq      m0, 1
    movd        [r0 + r1], m0
    RET

INIT_XMM sse2
cglobal intra_pred_ang4_3, 3,3,5
    movh        m3, [r2 + 9]   ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    movh        m1, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2
    movh        m2, m3                  ;[x x x x x x x x 7 6 6 5 5 4 4 3]
    psrldq      m3, 2                   ;[x x x x x x x x 8 7 7 6 6 5 5 4]

    pxor        m4, m4
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 26 * 16]
    packssdw    m0, m1
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 8 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 14 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_4, 3,3,5
    movh        m1, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
    psrldq      m1, 1
    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 21 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 31 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_5, 3,3,5
    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m1, m1
    punpcklbw   m2, m1
    mova        m4, m2
    pmaddwd     m4, [pw_ang_table + 2 * 16]
    punpcklbw   m0, m1
    pmaddwd     m0, [pw_ang_table + 17 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m2, [pw_ang_table + 19 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_6, 3,3,4
    movh        m2, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 26 * 16]
    pmaddwd     m0, [pw_ang_table + 13 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 7 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_7, 3,3,5
    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ;[x x x x x x x x 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 18 * 16]
    pmaddwd     m0, [pw_ang_table + 9 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m2, [pw_ang_table + 27 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_8, 3,3,5
    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    pmaddwd     m0, [pw_ang_table + 5 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 15 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_9, 3,3,5
    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m0, [pw_ang_table + 2 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 8 * 16]
    pmaddwd     m2, [pw_ang_table + 6 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_10, 3,5,4
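    ; mode 10 (pure horizontal): replicate left[y] across row y; with bFilter,
    ; row 0 becomes left[0] + ((above[x] - topLeft) >> 1), clipped by packuswb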
    movd        m0, [r2 + 9]            ;[4 3 2 1]
    punpcklbw   m0, m0
    punpcklwd   m0, m0
    pshufd      m1, m0, 1
    movhlps     m2, m0
    pshufd      m3, m0, 3
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m3
    cmp         r4m, byte 0
    jz          .quit

    ; filter
    pxor        m3, m3
    punpcklbw   m0, m3
    movh        m1, [r2]                ;[4 3 2 1 0]
    punpcklbw   m1, m3
    pshuflw     m2, m1, 0x00
    psrldq      m1, 2
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

.quit:
    movd        [r0], m0
    RET

cglobal intra_pred_ang4_11, 3,3,5
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m0, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 24 * 16]
    pmaddwd     m2, [pw_ang_table + 26 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_12, 3,3,5
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    pmaddwd     m0, [pw_ang_table + 27 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 17 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_24, 3,3,5
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    pmaddwd     m0, [pw_ang_table + 27 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 17 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_13, 3,3,5
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m2, [r2 + 9]           ;[4 3 2 1]
    movd        m0, [r2 + 3]            ;[x x B x]
    punpcklbw   m0, m1                  ;[x x x x A B x x]
    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 14 * 16]
    pmaddwd     m0, [pw_ang_table + 23 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 5 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_14, 3,3,4
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m0, [r2 + 1]            ;[x x B x]
    punpcklbw   m0, m1                  ;[A B x x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 6 * 16]
    pmaddwd     m0, [pw_ang_table + 19 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 25 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_15, 3,3,5
    movd        m0, [r2]                ;[x x x A]
    movd        m1, [r2 + 2]            ;[x x x B]
    punpcklbw   m1, m0                  ;[x x A B]
    movd        m0, [r2 + 3]            ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 30 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 15 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 13 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_16, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m1, [r2 + 2]            ;[x x x B]
    punpcklbw   m1, m2                  ;[x x A B]
    movd        m0, [r2 + 2]            ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 11 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 1 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_17, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m3, [r2 + 1]            ;[x x x B]
    movd        m4, [r2 + 2]            ;[x x x C]
    movd        m0, [r2 + 4]            ;[x x x D]
    punpcklbw   m3, m2                  ;[x x A B]
    punpcklbw   m0, m4                  ;[x x C D]
    punpcklwd   m0, m3                  ;[A B C D]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 24 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 18 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_18, 3,4,2
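    ; mode 18 (pure diagonal): build the reference [left[2] left[1] left[0]
    ; topLeft above[0..3]] and slide the 4-byte window by one per row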
    mov         r3d, [r2 + 8]
    mov         r3b, byte [r2]
    bswap       r3d
    movd        m0, r3d

    movd        m1, [r2 + 1]
    punpckldq   m0, m1
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0
    psrldq      m0, 1
    movd        [r0 + r1], m0
    psrldq      m0, 1
    movd        [r0], m0
    RET

cglobal intra_pred_ang4_19, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m3, [r2 + 9]            ;[x x x B]
    movd        m4, [r2 + 10]           ;[x x x C]
    movd        m0, [r2 + 12]           ;[x x x D]
    punpcklbw   m3, m2                  ;[x x A B]
    punpcklbw   m0, m4                  ;[x x C D]
    punpcklwd   m0, m3                  ;[A B C D]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 24 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 18 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_20, 3,3,5
    movd        m2, [r2]                ;[x x x A]
    movd        m1, [r2 + 10]           ;[x x x B]
    punpcklbw   m1, m2                  ;[x x A B]
    movd        m0, [r2 + 10]           ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 11 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 1 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_21, 3,3,5
    movd        m0, [r2]                ;[x x x A]
    movd        m1, [r2 + 10]           ;[x x x B]
    punpcklbw   m1, m0                  ;[x x A B]
    movd        m0, [r2 + 11]           ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 30 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 15 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 13 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_22, 3,3,4
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m0, [r2 + 9]            ;[x x B x]
    punpcklbw   m0, m1                  ;[A B x x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 6 * 16]
    pmaddwd     m0, [pw_ang_table + 19 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 25 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_23, 3,3,5
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m2, [r2 + 1]            ;[4 3 2 1]
    movd        m0, [r2 + 11]           ;[x x B x]
    punpcklbw   m0, m1                  ;[x x x x A B x x]
    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    mova        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 14 * 16]
    pmaddwd     m0, [pw_ang_table + 23 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m2, [pw_ang_table + 5 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_25, 3,3,5
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m0, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 24 * 16]
    pmaddwd     m2, [pw_ang_table + 26 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_26, 3,4,4
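    ; mode 26 (pure vertical): copy above[0..3] into every row; with bFilter,
    ; column 0 becomes above[0] + ((left[y] - topLeft) >> 1), clipped by packuswb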
    movd        m0, [r2 + 1]            ;[4 3 2 1]

    ; store
    movd        [r0], m0
    movd        [r0 + r1], m0
    movd        [r0 + r1 * 2], m0
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0

    ; filter
    cmp         r4m, byte 0
    jz         .quit

    pxor        m3, m3
    punpcklbw   m0, m3
    pshuflw     m0, m0, 0x00
    movd        m2, [r2]
    punpcklbw   m2, m3
    pshuflw     m2, m2, 0x00
    movd        m1, [r2 + 9]
    punpcklbw   m1, m3
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

    movd        r2, m0
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
    shr         r2, 8
    mov         [r0 + r3], r2b

.quit:
    RET

cglobal intra_pred_ang4_27, 3,3,5
    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m0, [pw_ang_table + 2 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 8 * 16]
    pmaddwd     m2, [pw_ang_table + 6 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_28, 3,3,5
    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    pmaddwd     m0, [pw_ang_table + 5 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    pmaddwd     m4, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 15 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_29, 3,3,5
    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ;[x x x x x x x x 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m4, m0
    mova        m2, m0
    pmaddwd     m4, [pw_ang_table + 18 * 16]
    pmaddwd     m0, [pw_ang_table + 9 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m2, [pw_ang_table + 27 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_30, 3,3,4
    movh        m2, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pxor        m1, m1
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m3, [pw_ang_table + 26 * 16]
    pmaddwd     m0, [pw_ang_table + 13 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 7 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_31, 3,3,5
    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m1, m1
    punpcklbw   m2, m1
    mova        m4, m2
    pmaddwd     m4, [pw_ang_table + 2 * 16]
    punpcklbw   m0, m1
    pmaddwd     m0, [pw_ang_table + 17 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    pmaddwd     m2, [pw_ang_table + 19 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_32, 3,3,5
    movh        m1, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
    psrldq      m1, 1
    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor        m4, m4
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 21 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 31 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_33, 3,3,5
    movh        m3, [r2 + 1]   ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    movh        m1, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2
    movh        m2, m3                  ;[x x x x x x x x 7 6 6 5 5 4 4 3]
    psrldq      m3, 2                   ;[x x x x x x x x 8 7 7 6 6 5 5 4]

    pxor        m4, m4
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 26 * 16]
    packssdw    m0, m1
    paddw       m0, [pw_16]
    psraw       m0, 5
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 8 * 16]
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 14 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc4, 5,5,3
    inc         r2
    pxor        m0, m0
    movd        m1, [r2]
    movd        m2, [r2 + 8]
    punpckldq   m1, m2
    psadbw      m1, m0              ; m1 = sum

    test        r4d, r4d

    pmulhrsw    m1, [pw_4096]       ; m1 = (sum + 4) / 8
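    ; pmulhrsw computes (a * b + 0x4000) >> 15; with b = 4096 this is exactly
    ; (sum + 4) >> 3, the rounded DC average of the 8 neighbours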
    movd        r4d, m1             ; r4d = dc_val
    pshufb      m1, m0              ; m1 = byte [dc_val ...]

    ; store DC 4x4
    lea         r3, [r1 * 3]
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    movd        [r0 + r3], m1

    ; do DC filter
    jz         .end
    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
    add         r4d, r3d            ; r4d = DC * 3 + 2
    movd        m1, r4d
    pshuflw     m1, m1, 0           ; m1 = pixDCx3
    pshufd      m1, m1, 0

    ; filter top
    movd        m2, [r2]
    movd        m0, [r2 + 9]
    punpckldq   m2, m0
    pmovzxbw    m2, m2
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2            ; overwrite top-left pixel, we will update it later

    ; filter top-left
    movzx       r4d, byte [r2 + 8]
    add         r3d, r4d
    movzx       r4d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left
    add         r0, r1
    pextrb      [r0], m2, 4
    pextrb      [r0 + r1], m2, 5
    pextrb      [r0 + r1 * 2], m2, 6

.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc8, 5, 7, 3
    lea             r3, [r2 + 17]
    inc             r2
    pxor            m0,            m0
    movh            m1,            [r2]
    movh            m2,            [r3]
    punpcklqdq      m1,            m2
    psadbw          m1,            m0
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    movd            r5d,           m1
    add             r5d,           8
    shr             r5d,           4     ; sum = sum / 16
    movd            m1,            r5d
    pshufb          m1,            m0    ; m1 = byte [dc_val ...]

    test            r4d,           r4d

    ; store DC 8x8
    mov             r6,            r0
    movh            [r0],          m1
    movh            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movh            [r0],          m1
    movh            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movh            [r0],          m1
    movh            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movh            [r0],          m1
    movh            [r0 + r1],     m1

    ; Do DC Filter
    jz              .end
    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
    add             r5d,           r4d            ; r5d = DC * 3 + 2
    movd            m1,            r5d
    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
    pshufd          m1,            m1, 0

    ; filter top
    pmovzxbw        m2,            [r2]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    movh            [r6],          m2

    ; filter top-left
    movzx           r5d, byte      [r3]
    add             r4d,           r5d
    movzx           r3d, byte      [r2]
    add             r3d,           r4d
    shr             r3d,           2
    mov             [r6],          r3b

    ; filter left
    add             r6,            r1
    pmovzxbw        m2,            [r2 + 17]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    pextrb          [r6],          m2, 0
    pextrb          [r6 + r1],     m2, 1
    pextrb          [r6 + 2 * r1], m2, 2
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 3
    pextrb          [r6 + r1 * 2], m2, 4
    pextrb          [r6 + r1 * 4], m2, 6
    lea             r1,            [r1 * 3]
    pextrb          [r6 + r1],     m2, 5

.end:
    RET

;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;--------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 5, 7, 4
    lea             r3, [r2 + 33]
    inc             r2
    pxor            m0,            m0
    movu            m1,            [r2]
    movu            m2,            [r3]
    psadbw          m1,            m0
    psadbw          m2,            m0
    paddw           m1,            m2
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    movd            r5d,           m1
    add             r5d,           16
    shr             r5d,           5     ; sum = sum / 32
    movd            m1,            r5d
    pshufb          m1,            m0    ; m1 = byte [dc_val ...]

    test            r4d,           r4d

    ; store DC 16x16
    mov             r6,            r0
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1

    ; Do DC Filter
    jz              .end
    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
    add             r5d,           r4d            ; r5d = DC * 3 + 2
    movd            m1,            r5d
    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
    pshufd          m1,            m1, 0

    ; filter top
    pmovzxbw        m2,            [r2]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    movh            [r6],          m2
    pmovzxbw        m3,            [r2 + 8]
    paddw           m3,            m1
    psraw           m3,            2
    packuswb        m3,            m3
    movh            [r6 + 8],      m3

    ; filter top-left
    movzx           r5d, byte      [r3]
    add             r4d,           r5d
    movzx           r3d, byte      [r2]
    add             r3d,           r4d
    shr             r3d,           2
    mov             [r6],          r3b

    ; filter left
    add             r6,            r1
    pmovzxbw        m2,            [r2 + 33]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    pextrb          [r6],          m2, 0
    pextrb          [r6 + r1],     m2, 1
    pextrb          [r6 + r1 * 2], m2, 2
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 3
    pextrb          [r6 + r1 * 2], m2, 4
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 5
    pextrb          [r6 + r1 * 2], m2, 6
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 7

    pmovzxbw        m3,            [r2 + 41]
    paddw           m3,            m1
    psraw           m3,            2
    packuswb        m3,            m3
    pextrb          [r6 + r1 * 2], m3, 0
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m3, 1
    pextrb          [r6 + r1 * 2], m3, 2
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m3, 3
    pextrb          [r6 + r1 * 2], m3, 4
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m3, 5
    pextrb          [r6 + r1 * 2], m3, 6

.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc32, 3, 5, 5
    lea             r3, [r2 + 65]
    inc             r2
    pxor            m0,            m0
    movu            m1,            [r2]
    movu            m2,            [r2 + 16]
    movu            m3,            [r3]
    movu            m4,            [r3 + 16]
    psadbw          m1,            m0
    psadbw          m2,            m0
    psadbw          m3,            m0
    psadbw          m4,            m0
    paddw           m1,            m2
    paddw           m3,            m4
    paddw           m1,            m3
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    movd            r4d,           m1
    add             r4d,           32
    shr             r4d,           6     ; sum = sum / 64
    movd            m1,            r4d
    pshufb          m1,            m0    ; m1 = byte [dc_val ...]

%rep 2
    ; store one 32x16 half of the DC block (the %rep covers all 32 rows)
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
%endrep

    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal intra_pred_dc32, 3, 4, 3
    lea             r3, [r1 * 3]
    pxor            m0, m0
    movu            m1, [r2 + 1]
    movu            m2, [r2 + 65]
    psadbw          m1, m0
    psadbw          m2, m0
    paddw           m1, m2
    vextracti128    xm2, m1, 1
    paddw           m1, m2
    pshufd          m2, m1, 2
    paddw           m1, m2

    pmulhrsw        m1, [pw_512]    ; sum = (sum + 32) / 64
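    ; pmulhrsw computes (x * 512 + 2^14) >> 15 == (x + 32) >> 6, exact for
    ; the sums possible here (at most 64 * 255)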
    vpbroadcastb    m1, xm1         ; m1 = byte [dc_val ...]

    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    lea             r0, [r0 + 4 * r1]
    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    lea             r0, [r0 + 4 * r1]
    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    lea             r0, [r0 + 4 * r1]
    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    lea             r0, [r0 + 4 * r1]
    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    lea             r0, [r0 + 4 * r1]
    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    lea             r0, [r0 + 4 * r1]
    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    lea             r0, [r0 + 4 * r1]
    movu            [r0 + r1 * 0], m1
    movu            [r0 + r1 * 1], m1
    movu            [r0 + r1 * 2], m1
    movu            [r0 + r3 * 1], m1
    RET
%endif ;; ARCH_X86_64 == 1

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
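; HEVC planar prediction, with N = blkSize:
;   P(x,y) = ((N-1-x)*left[y] + (x+1)*topRight + (N-1-y)*above[x]
;             + (y+1)*bottomLeft + N) >> (log2(N) + 1)
; all row-invariant terms are folded into one accumulator up front; stepping
; to the next row then costs a single add of (bottomLeft - above[x])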
INIT_XMM sse4
cglobal intra_pred_planar4, 3,3,7
    pmovzxbw        m1, [r2 + 1]
    pmovzxbw        m2, [r2 + 9]
    pshufhw         m3, m1, 0               ; topRight
    pshufd          m3, m3, 0xAA
    pshufhw         m4, m2, 0               ; bottomLeft
    pshufd          m4, m4, 0xAA
    pmullw          m3, [multi_2Row]        ; (x + 1) * topRight
    pmullw          m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]
    mova            m6, [pw_planar4_0]
    paddw           m3, [pw_4]
    paddw           m3, m4
    paddw           m3, m0
    psubw           m4, m1

    pshuflw         m5, m2, 0
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0], m5

    pshuflw         m5, m2, 01010101b
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0 + r1], m5
    lea             r0, [r0 + 2 * r1]

    pshuflw         m5, m2, 10101010b
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0], m5

    pshuflw         m5, m2, 11111111b
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0 + r1], m5
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar8, 3,3,7
    pmovzxbw        m1, [r2 + 1]
    pmovzxbw        m2, [r2 + 17]

    movd            m3, [r2 + 9]            ; topRight   = above[8];
    movd            m4, [r2 + 25]           ; bottomLeft = left[8];

    pxor            m0, m0
    pshufb          m3, m0
    pshufb          m4, m0
    punpcklbw       m3, m0                  ; v_topRight
    punpcklbw       m4, m0                  ; v_bottomLeft
    pmullw          m3, [multiL]            ; (x + 1) * topRight
    pmullw          m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x]
    mova            m6, [pw_planar16_mul + mmsize]
    paddw           m3, [pw_8]
    paddw           m3, m4
    paddw           m3, m0
    psubw           m4, m1

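; %1 = row y: broadcast left[y] (word y of m2), multiply by the (N-1-x)
; weights in m6 and add the row accumulator m3, then scale down by >> 4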
%macro INTRA_PRED_PLANAR8 1
%if (%1 < 4)
    pshuflw         m5, m2, 0x55 * %1
    pshufd          m5, m5, 0
%else
    pshufhw         m5, m2, 0x55 * (%1 - 4)
    pshufd          m5, m5, 0xAA
%endif
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 4
    packuswb        m5, m5
    movh            [r0], m5
    lea             r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR8 0
    INTRA_PRED_PLANAR8 1
    INTRA_PRED_PLANAR8 2
    INTRA_PRED_PLANAR8 3
    INTRA_PRED_PLANAR8 4
    INTRA_PRED_PLANAR8 5
    INTRA_PRED_PLANAR8 6
    INTRA_PRED_PLANAR8 7
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar16, 3,3,8
    pmovzxbw        m2, [r2 + 1]
    pmovzxbw        m7, [r2 + 9]

    movd            m3, [r2 + 17]               ; topRight   = above[16]
    movd            m6, [r2 + 49]               ; bottomLeft = left[16]

    pxor            m0, m0
    pshufb          m3, m0
    pshufb          m6, m0
    punpcklbw       m3, m0                      ; v_topRight
    punpcklbw       m6, m0                      ; v_bottomLeft
    pmullw          m4, m3, [multiH]            ; (x + 1) * topRight
    pmullw          m3, [multiL]                ; (x + 1) * topRight
    pmullw          m1, m2, [pw_15]             ; (blkSize - 1 - y) * above[x]
    pmullw          m5, m7, [pw_15]             ; (blkSize - 1 - y) * above[x]
    paddw           m4, [pw_16]
    paddw           m3, [pw_16]
    paddw           m4, m6
    paddw           m3, m6
    paddw           m4, m5
    paddw           m3, m1
    psubw           m1, m6, m7
    psubw           m6, m2

    pmovzxbw        m2, [r2 + 33]
    pmovzxbw        m7, [r2 + 41]

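; %1 = row y: left[0..7] sit in m2 and left[8..15] in m7, so the word to
; broadcast is selected with pshuflw/pshufhw + pshufd on the right register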
%macro INTRA_PRED_PLANAR16 1
%if (%1 < 4)
    pshuflw         m5, m2, 0x55 * %1
    pshufd          m5, m5, 0
%elif (%1 < 8)
    pshufhw         m5, m2, 0x55 * (%1 - 4)
    pshufd          m5, m5, 0xAA
%elif (%1 < 12)
    pshuflw         m5, m7, 0x55 * (%1 - 8)
    pshufd          m5, m5, 0
%else
    pshufhw         m5, m7, 0x55 * (%1 - 12)
    pshufd          m5, m5, 0xAA
%endif
    pmullw          m0, m5, [pw_planar16_mul + mmsize]
    pmullw          m5, [pw_planar16_mul]
    paddw           m0, m4
    paddw           m5, m3
    paddw           m3, m6
    paddw           m4, m1
    psraw           m5, 5
    psraw           m0, 5
    packuswb        m5, m0
    movu            [r0], m5
    lea             r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR16 0
    INTRA_PRED_PLANAR16 1
    INTRA_PRED_PLANAR16 2
    INTRA_PRED_PLANAR16 3
    INTRA_PRED_PLANAR16 4
    INTRA_PRED_PLANAR16 5
    INTRA_PRED_PLANAR16 6
    INTRA_PRED_PLANAR16 7
    INTRA_PRED_PLANAR16 8
    INTRA_PRED_PLANAR16 9
    INTRA_PRED_PLANAR16 10
    INTRA_PRED_PLANAR16 11
    INTRA_PRED_PLANAR16 12
    INTRA_PRED_PLANAR16 13
    INTRA_PRED_PLANAR16 14
    INTRA_PRED_PLANAR16 15
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_planar16, 3,3,6
    vpbroadcastw    m3, [r2 + 17]
    mova            m5, [pw_00ff]
    vpbroadcastw    m4, [r2 + 49]
    mova            m0, [pw_planar16_mul]
    pmovzxbw        m2, [r2 + 1]
    pand            m3, m5                      ; v_topRight
    pand            m4, m5                      ; v_bottomLeft

    pmullw          m3, [multiL]                ; (x + 1) * topRight
    pmullw          m1, m2, [pw_15]             ; (blkSize - 1 - y) * above[x]
    paddw           m3, [pw_16]
    paddw           m3, m4
    paddw           m3, m1
    psubw           m4, m2
    add             r2, 33

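; %1 = row y: vpbroadcastw replicates the byte pair left[y]/left[y+1]; the
; pand/vpsrlw split then lets each expansion emit two rows at once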
%macro INTRA_PRED_PLANAR16_AVX2 1
    vpbroadcastw    m1, [r2 + %1]
    vpsrlw          m2, m1, 8
    pand            m1, m5

    pmullw          m1, m0
    pmullw          m2, m0
    paddw           m1, m3
    paddw           m3, m4
    psraw           m1, 5
    paddw           m2, m3
    psraw           m2, 5
    paddw           m3, m4
    packuswb        m1, m2
    vpermq          m1, m1, 11011000b
    movu            [r0], xm1
    vextracti128    [r0 + r1], m1, 1
    lea             r0, [r0 + r1 * 2]
%endmacro
    INTRA_PRED_PLANAR16_AVX2 0
    INTRA_PRED_PLANAR16_AVX2 2
    INTRA_PRED_PLANAR16_AVX2 4
    INTRA_PRED_PLANAR16_AVX2 6
    INTRA_PRED_PLANAR16_AVX2 8
    INTRA_PRED_PLANAR16_AVX2 10
    INTRA_PRED_PLANAR16_AVX2 12
    INTRA_PRED_PLANAR16_AVX2 14
%undef INTRA_PRED_PLANAR16_AVX2
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,4,12
%else
cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize)
  %define           m8  [rsp + 0 * mmsize]
  %define           m9  [rsp + 1 * mmsize]
  %define           m10 [rsp + 2 * mmsize]
  %define           m11 [rsp + 3 * mmsize]
%endif
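    ; x86-32 exposes only xmm0-xmm7, so the four per-column row-step vectors
    ; (bottomLeft - above[x]) live in the stack slots aliased as m8-m11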
    movd            m3, [r2 + 33]               ; topRight   = above[32]

    pxor            m7, m7
    pshufb          m3, m7
    punpcklbw       m3, m7                      ; v_topRight

    pmullw          m0, m3, [multiL]            ; (x + 1) * topRight
    pmullw          m1, m3, [multiH]            ; (x + 1) * topRight
    pmullw          m2, m3, [multiH2]           ; (x + 1) * topRight
    pmullw          m3, [multiH3]               ; (x + 1) * topRight

    movd            m6, [r2 + 97]               ; bottomLeft = left[32]
    pshufb          m6, m7
    punpcklbw       m6, m7                      ; v_bottomLeft

    paddw           m0, m6
    paddw           m1, m6
    paddw           m2, m6
    paddw           m3, m6
    paddw           m0, [pw_32]
    paddw           m1, [pw_32]
    paddw           m2, [pw_32]
    paddw           m3, [pw_32]
    pmovzxbw        m4, [r2 + 1]
    pmullw          m5, m4, [pw_31]
    paddw           m0, m5
    psubw           m5, m6, m4
    mova            m8, m5
    pmovzxbw        m4, [r2 + 9]
    pmullw          m5, m4, [pw_31]
    paddw           m1, m5
    psubw           m5, m6, m4
    mova            m9, m5
    pmovzxbw        m4, [r2 + 17]
    pmullw          m5, m4, [pw_31]
    paddw           m2, m5
    psubw           m5, m6, m4
    mova            m10, m5
    pmovzxbw        m4, [r2 + 25]
    pmullw          m5, m4, [pw_31]
    paddw           m3, m5
    psubw           m5, m6, m4
    mova            m11, m5
    add             r2, 65                      ; (2 * blkSize + 1)

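; one 32-pixel row per expansion: two 16-pixel halves are interpolated,
; scaled by >> 6 (log2(32) + 1) and packed; r2 walks down the left column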
%macro INTRA_PRED_PLANAR32 0
    movd            m4, [r2]
    pshufb          m4, m7
    punpcklbw       m4, m7
    pmullw          m5, m4, [pw_planar32_mul]
    pmullw          m6, m4, [pw_planar32_mul + mmsize]
    paddw           m5, m0
    paddw           m6, m1
    paddw           m0, m8
    paddw           m1, m9
    psraw           m5, 6
    psraw           m6, 6
    packuswb        m5, m6
    movu            [r0], m5
    pmullw          m5, m4, [pw_planar16_mul]
    pmullw          m4, [pw_planar16_mul + mmsize]
    paddw           m5, m2
    paddw           m4, m3
    paddw           m2, m10
    paddw           m3, m11
    psraw           m5, 6
    psraw           m4, 6
    packuswb        m5, m4
    movu            [r0 + 16], m5

    lea             r0, [r0 + r1]
    inc             r2
%endmacro

    mov             r3, 4
.loop:
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    dec             r3
    jnz             .loop
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal intra_pred_planar32, 3,4,11
    mova            m6, [pw_00ff]
    vpbroadcastw    m3, [r2 + 33]               ; topRight   = above[32]
    vpbroadcastw    m2, [r2 + 97]               ; bottomLeft = left[32]
    pand            m3, m6
    pand            m2, m6

    pmullw          m0, m3, [multiL]            ; (x + 1) * topRight
    pmullw          m3, [multiH2]               ; (x + 1) * topRight

    paddw           m0, m2
    paddw           m3, m2
    paddw           m0, [pw_32]
    paddw           m3, [pw_32]

    pmovzxbw        m4, [r2 + 1]
    pmovzxbw        m1, [r2 + 17]
    pmullw          m5, m4, [pw_31]
    paddw           m0, m5
    psubw           m5, m2, m4
    psubw           m2, m1
    pmullw          m1, [pw_31]
    paddw           m3, m1
    mova            m1, m5

    add             r2, 65                      ; (2 * blkSize + 1)
    mova            m9, [pw_planar32_mul]
    mova            m10, [pw_planar16_mul]

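; same row recurrence as the SSE4 version, but one ymm covers all 32 columns
; and each expansion writes two rows (left[y] and left[y + 1])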
%macro INTRA_PRED_PLANAR32_AVX2 0
    vpbroadcastw    m4, [r2]
    vpsrlw          m7, m4, 8
    pand            m4, m6

    pmullw          m5, m4, m9
    pmullw          m4, m4, m10
    paddw           m5, m0
    paddw           m4, m3
    paddw           m0, m1
    paddw           m3, m2
    psraw           m5, 6
    psraw           m4, 6
    packuswb        m5, m4
    pmullw          m8, m7, m9
    pmullw          m7, m7, m10
    vpermq          m5, m5, 11011000b
    paddw           m8, m0
    paddw           m7, m3
    paddw           m0, m1
    paddw           m3, m2
    psraw           m8, 6
    psraw           m7, 6
    packuswb        m8, m7
    add             r2, 2
    vpermq          m8, m8, 11011000b

    movu            [r0], m5
    movu            [r0 + r1], m8
    lea             r0, [r0 + r1 * 2]
%endmacro
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
%undef INTRA_PRED_PLANAR32_AVX2
    RET
%endif ;; ARCH_X86_64 == 1

;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
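; all interpolating angular modes compute
;   dst = ((32 - frac) * ref[i] + frac * ref[i + 1] + 16) >> 5
; ang_table + k*16 holds the byte pairs (32-k, k) consumed by pmaddubsw, and
; pmulhrsw with pw_1024 gives (x * 1024 + 2^14) >> 15 == (x + 16) >> 5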
INIT_XMM ssse3
cglobal intra_pred_ang4_2, 3,5,3
    lea         r4, [r2 + 2]
    add         r2, 10
    cmp         r3m, byte 34
    cmove       r2, r4

    movh        m0, [r2]
    movd        [r0], m0
    palignr     m1, m0, 1
    movd        [r0 + r1], m1
    palignr     m2, m0, 2
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    psrldq      m0, 3
    movd        [r0 + r1], m0
    RET

INIT_XMM sse4
cglobal intra_pred_ang4_3, 3,5,5
    mov         r4, 1
    cmp         r3m, byte 33
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m2, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    palignr     m3, m0, 6       ; [x x x x x x x x 8 7 7 6 6 5 5 4]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m3

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 + 6 * 16]   ; [26]
    movhps      m3, [r3]            ; [20]
    movh        m4, [r3 - 6 * 16]   ; [14]
    movhps      m4, [r3 - 12 * 16]  ; [ 8]
    jmp        .do_filter4x4

    ; NOTE: shared path; input: m0 = rows [1 0], m2 = rows [3 2], m3/m4 = coefficients, ZF set = no transpose
ALIGN 16
.do_filter4x4:
    mova        m1, [pw_1024]

    pmaddubsw   m0, m3
    pmulhrsw    m0, m1
    pmaddubsw   m2, m4
    pmulhrsw    m2, m1
    packuswb    m0, m2

    ; NOTE: mode 33 stores without the transpose (ZF still set by the cmp above).
    ; UNSAFE: this relies on no intervening instruction modifying EFLAGS
    jz         .store

    ; transpose 4x4
    pshufb      m0, [c_trans_4x4]

.store:
    ; TODO: use pextrd here once the SSSE3 intrinsic path is removed
    movd        [r0], m0
    pextrd      [r0 + r1], m0, 1
    pextrd      [r0 + r1 * 2], m0, 2
    lea         r1, [r1 * 3]
    pextrd      [r0 + r1], m0, 3
    RET

cglobal intra_pred_ang4_4, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 32
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m3, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m1, m3

    lea         r3, [ang_table + 18 * 16]
    movh        m3, [r3 +  3 * 16]  ; [21]
    movhps      m3, [r3 -  8 * 16]  ; [10]
    movh        m4, [r3 + 13 * 16]  ; [31]
    movhps      m4, [r3 +  2 * 16]  ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_5, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 31
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m3, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m1, m3

    lea         r3, [ang_table + 10 * 16]
    movh        m3, [r3 +  7 * 16]  ; [17]
    movhps      m3, [r3 -  8 * 16]  ; [ 2]
    movh        m4, [r3 +  9 * 16]  ; [19]
    movhps      m4, [r3 -  6 * 16]  ; [ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_6, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 30
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m2, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq  m0, m0
    punpcklqdq  m2, m2

    lea         r3, [ang_table + 19 * 16]
    movh        m3, [r3 -  6 * 16]  ; [13]
    movhps      m3, [r3 +  7 * 16]  ; [26]
    movh        m4, [r3 - 12 * 16]  ; [ 7]
    movhps      m4, [r3 +  1 * 16]  ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_7, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 29
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m3, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq  m2, m0, m3
    punpcklqdq  m0, m0

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 - 11 * 16]  ; [ 9]
    movhps      m3, [r3 -  2 * 16]  ; [18]
    movh        m4, [r3 +  7 * 16]  ; [27]
    movhps      m4, [r3 - 16 * 16]  ; [ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_8, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 28
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 13 * 16]
    movh        m3, [r3 -  8 * 16]  ; [ 5]
    movhps      m3, [r3 -  3 * 16]  ; [10]
    movh        m4, [r3 +  2 * 16]  ; [15]
    movhps      m4, [r3 +  7 * 16]  ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_9, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 27
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 4 * 16]
    movh        m3, [r3 -  2 * 16]  ; [ 2]
    movhps      m3, [r3 -  0 * 16]  ; [ 4]
    movh        m4, [r3 +  2 * 16]  ; [ 6]
    movhps      m4, [r3 +  4 * 16]  ; [ 8]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_10, 3,3,4
    movd        m0, [r2 + 9]            ; [4 3 2 1]
    pshufb      m0, [pb_unpackbd1]
    pshufd      m1, m0, 1
    movhlps     m2, m0
    pshufd      m3, m0, 3
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m3
    cmp         r4m, byte 0
    jz          .quit

    ; filter
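    ; dst[x][0] = clip(left[0] + ((above[x] - topLeft) >> 1))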
    pmovzxbw    m0, m0                  ; [-1 -1 -1 -1]
    movh        m1, [r2]                ; [4 3 2 1 0]
    pshufb      m2, m1, [pb_0_8]        ; [0 0 0 0]
    pshufb      m1, [pb_unpackbw1]      ; [4 3 2 1]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0
.quit:
    movd        [r0], m0
    RET

INIT_XMM sse4
cglobal intra_pred_ang4_26, 3,4,3
    movd        m0, [r2 + 1]            ; [4 3 2 1]

    ; store
    movd        [r0], m0
    movd        [r0 + r1], m0
    movd        [r0 + r1 * 2], m0
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0

    ; filter
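    ; dst[0][y] = clip(above[0] + ((left[y] - topLeft) >> 1))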
    cmp         r4m, byte 0
    jz         .quit

    pshufb      m0, [pb_0_8]            ; [ 1  1  1  1]
    movh        m1, [r2 + 8]            ; [-4 -3 -2 -1 0]
    pinsrb      m1, [r2], 0
    pshufb      m2, m1, [pb_0_8]        ; [0 0 0 0]
    pshufb      m1, [pb_unpackbw1]      ; [-4 -3 -2 -1]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r3], m0, 3
.quit:
    RET

cglobal intra_pred_ang4_11, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 25
    mov         r3, 8
    cmove       r3, r4

    movh        m0, [r2 + r3]        ; [x x x 4 3 2 1 0]
    pinsrb      m0, [r2], 0
    palignr     m1, m0, 1       ; [x x x x 4 3 2 1]
    punpcklbw   m0, m1          ; [x x x x x x x x 4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 24 * 16]

    movh        m3, [r3 +  6 * 16]  ; [24]
    movhps      m3, [r3 +  4 * 16]  ; [26]
    movh        m4, [r3 +  2 * 16]  ; [28]
    movhps      m4, [r3 +  0 * 16]  ; [30]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_12, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 24
    mov         r3, 8
    cmove       r3, r4

    movh        m0, [r2 + r3]        ; [x x x 4 3 2 1 0]
    pinsrb      m0, [r2], 0
    palignr     m1, m0, 1       ; [x x x x 4 3 2 1]
    punpcklbw   m0, m1          ; [x x x x x x x x 4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 +  7 * 16]  ; [27]
    movhps      m3, [r3 +  2 * 16]  ; [22]
    movh        m4, [r3 -  3 * 16]  ; [17]
    movhps      m4, [r3 -  8 * 16]  ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_13, 4,5,5
    xor         r4, r4
    cmp         r3m, byte 23
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m1, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m1, [r2], 1
    palignr     m0, m1, 1       ; [x x x 4 3 2 1 0]
    palignr     m2, m1, 2       ; [x x x x 4 3 2 1]
    pinsrb      m1, [r2 + r3 + 4], 0
    punpcklbw   m1, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m2          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m2, m0, m1
    punpcklqdq  m0, m0

    lea         r3, [ang_table + 21 * 16]
    movh        m3, [r3 +  2 * 16]  ; [23]
    movhps      m3, [r3 -  7 * 16]  ; [14]
    movh        m4, [r3 - 16 * 16]  ; [ 5]
    movhps      m4, [r3 +  7 * 16]  ; [28]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_14, 4,5,5
    xor         r4, r4
    cmp         r3m, byte 22
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m2, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m2, [r2], 1
    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
    pinsrb      m2, [r2 + r3 + 2], 0
    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m0
    punpcklqdq  m2, m2

    lea         r3, [ang_table + 19 * 16]
    movh        m3, [r3 +  0 * 16]  ; [19]
    movhps      m3, [r3 - 13 * 16]  ; [ 6]
    movh        m4, [r3 +  6 * 16]  ; [25]
    movhps      m4, [r3 -  7 * 16]  ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_15, 4,5,5
    xor         r4, r4
    cmp         r3m, byte 21
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m2, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m2, [r2], 1
    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
    pinsrb      m2, [r2 + r3 + 2], 0
    pslldq      m3, m2, 1       ; [x 4 3 2 1 0 x y]
    pinsrb      m3, [r2 + r3 + 4], 0
    punpcklbw   m4, m3, m2      ; [2 1 1 0 0 x x y]
    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m2
    punpcklqdq  m2, m4

    lea         r3, [ang_table + 23 * 16]
    movh        m3, [r3 -  8 * 16]  ; [15]
    movhps      m3, [r3 +  7 * 16]  ; [30]
    movh        m4, [r3 - 10 * 16]  ; [13]
    movhps      m4, [r3 +  5 * 16]  ; [28]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_16, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 20
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m2, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m2, [r2], 1
    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
    pinsrb      m2, [r2 + r3 + 2], 0
    pslldq      m3, m2, 1       ; [x 4 3 2 1 0 x y]
    pinsrb      m3, [r2 + r3 + 3], 0
    punpcklbw   m4, m3, m2      ; [2 1 1 0 0 x x y]
    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m2
    punpcklqdq  m2, m4

    lea         r3, [ang_table + 19 * 16]
    movh        m3, [r3 -  8 * 16]  ; [11]
    movhps      m3, [r3 +  3 * 16]  ; [22]
    movh        m4, [r3 - 18 * 16]  ; [ 1]
    movhps      m4, [r3 -  7 * 16]  ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_17, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 19
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m3, [r2 + r4 - 1]    ; [- - 4 3 2 1 0 x]
    pinsrb      m3, [r2], 1
    palignr     m0, m3, 1       ; [- - - 4 3 2 1 0]
    palignr     m1, m3, 2       ; [- - - - 4 3 2 1]
    mova        m4, m0
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    pinsrb      m3, [r2 + r3 + 1], 0
    punpcklbw   m1, m3, m4      ; [3 2 2 1 1 0 0 x]
    punpcklqdq  m0, m1

    pslldq      m2, m3, 1       ; [- 4 3 2 1 0 x y]
    pinsrb      m2, [r2 + r3 + 2], 0
    pslldq      m1, m2, 1       ; [4 3 2 1 0 x y z]
    pinsrb      m1, [r2 + r3 + 4], 0
    punpcklbw   m1, m2          ; [1 0 0 x x y y z]
    punpcklbw   m2, m3          ; [2 1 1 0 0 x x y]
    punpcklqdq  m2, m1

    lea         r3, [ang_table + 14 * 16]
    movh        m3, [r3 -  8 * 16]  ; [ 6]
    movhps      m3, [r3 -  2 * 16]  ; [12]
    movh        m4, [r3 +  4 * 16]  ; [18]
    movhps      m4, [r3 + 10 * 16]  ; [24]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_18, 3,5,1
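    ; mode 18 runs the 45-degree diagonal through both neighbour arrays:
    ; build [above3..above0, topLeft, left0..left2] contiguously by
    ; temporarily overwriting srcPix[8], then slide a 4-byte window up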
    mov         r4d, [r2 + 8]
    mov         r3b, byte [r2]
    mov         [r2 + 8], r3b
    mov         r3d, [r2 + 8]
    bswap       r3d
    movd        m0, r3d

    pinsrd      m0, [r2 + 1], 1     ; [- 3 2 1 0 -1 -2 -3]
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0
    psrldq      m0, 1
    movd        [r0 + r1], m0
    psrldq      m0, 1
    movd        [r0], m0
    mov         [r2 + 8], r4w
    RET

;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
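; most 8x8 angular modes compute eight rows into m4/m5/m6/m1 and share the
; .transpose8x8 tail in intra_pred_ang8_3; as in the 4x4 path, ZF from the
; mode compare decides whether the transpose runs before the store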
INIT_XMM ssse3
cglobal intra_pred_ang8_2, 3,5,2
    lea         r4,             [r2 + 2]
    add         r2,             18
    cmp         r3m,            byte 34
    cmove       r2,             r4
    movu        m0,             [r2]
    lea         r4,             [r1 * 3]

    movh        [r0],           m0
    palignr     m1,             m0, 1
    movh        [r0 + r1],      m1
    palignr     m1,             m0, 2
    movh        [r0 + r1 * 2],  m1
    palignr     m1,             m0, 3
    movh        [r0 + r4],      m1
    palignr     m1,             m0, 4
    lea         r0,             [r0 + r1 * 4]
    movh        [r0],           m1
    palignr     m1,             m0, 5
    movh        [r0 + r1],      m1
    palignr     m1,             m0, 6
    movh        [r0 + r1 * 2],  m1
    palignr     m1,             m0, 7
    movh        [r0 + r4],      m1
    RET

INIT_XMM sse4
cglobal intra_pred_ang8_3, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 33
    cmove       r2,        r4
    lea         r3,        [ang_table + 22 * 16]
    lea         r4,        [ang_table +  8 * 16]
    mova        m3,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m4,        m0, [r3 + 4 * 16]          ; [26]
    pmulhrsw    m4,        m3
    pmaddubsw   m1,        [r3 - 2 * 16]              ; [20]
    pmulhrsw    m1,        m3
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m5,        [r3 - 8 * 16]              ; [14]
    pmulhrsw    m5,        m3

    palignr     m6,        m2, m0, 6                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    pmaddubsw   m6,        [r4]                       ; [ 8]
    pmulhrsw    m6,        m3
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 8                  ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

    pmaddubsw   m6,        m1, [r4 - 6 * 16]          ; [ 2]
    pmulhrsw    m6,        m3

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [28]
    pmulhrsw    m1,        m3
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10                 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]

    pmaddubsw   m1,        [r3]                       ; [22]
    pmulhrsw    m1,        m3

    palignr     m2,        m0, 12                     ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]

    pmaddubsw   m2,        [r3 - 6 * 16]              ; [16]
    pmulhrsw    m2,        m3
    packuswb    m1,        m2
    jmp         .transpose8x8

ALIGN 16
.transpose8x8:
    jz         .store

    ; transpose 8x8
    punpckhbw   m0,        m4, m5
    punpcklbw   m4,        m5
    punpckhbw   m2,        m4, m0
    punpcklbw   m4,        m0

    punpckhbw   m0,        m6, m1
    punpcklbw   m6,        m1
    punpckhbw   m1,        m6, m0
    punpcklbw   m6,        m0

    punpckhdq   m5,        m4, m6
    punpckldq   m4,        m6
    punpckldq   m6,        m2, m1
    punpckhdq   m2,        m1
    mova        m1,        m2

.store:
    lea         r4,              [r1 * 3]
    movh        [r0],            m4
    movhps      [r0 + r1],       m4
    movh        [r0 + r1 * 2],   m5
    movhps      [r0 + r4],       m5
    add         r0,              r4
    movh        [r0 + r1],       m6
    movhps      [r0 + r1 * 2],   m6
    movh        [r0 + r4],       m1
    movhps      [r0 + r1 * 4],   m1
    RET

cglobal intra_pred_ang8_4, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 32
    cmove       r2,        r4
    lea         r3,        [ang_table + 24 * 16]
    lea         r4,        [ang_table + 10 * 16]
    mova        m3,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3 - 3 * 16]          ; [21]
    pmulhrsw    m4,        m3
    pmaddubsw   m1,        [r4]                       ; [10]
    pmulhrsw    m1,        m3
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 7 * 16]              ; [31]
    pmulhrsw    m5,        m3

    palignr     m6,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m6,        [r3 - 4 * 16]              ; [ 20]
    pmulhrsw    m6,        m3
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    pmaddubsw   m6,        m1, [r4 - 1 * 16]          ; [ 9]
    pmulhrsw    m6,        m3

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [30]
    pmulhrsw    m1,        m3
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 8                  ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

    pmaddubsw   m1,        [r3 - 5 * 16]              ; [19]
    pmulhrsw    m1,        m3

    palignr     m2,        m0, 10                     ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]

    pmaddubsw   m2,        [r4 - 2 * 16]              ; [8]
    pmulhrsw    m2,        m3
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_5, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 31
    cmove       r2,        r4
    lea         r3,        [ang_table + 17 * 16]
    lea         r4,        [ang_table +  2 * 16]
    mova        m3,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3]                   ; [17]
    pmulhrsw    m4,        m3
    pmaddubsw   m1,        [r4]                       ; [2]
    pmulhrsw    m1,        m3
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 2 * 16]              ; [19]
    pmulhrsw    m5,        m3

    palignr     m6,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    mova        m1,        m6

    pmaddubsw   m1,        [r4 + 2 * 16]              ; [4]
    pmulhrsw    m1,        m3
    packuswb    m5,        m1

    pmaddubsw   m6,        [r3 + 4 * 16]              ; [21]
    pmulhrsw    m6,        m3

    palignr     m1,        m2, m0, 6                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    mova        m7,        m1
    pmaddubsw   m7,        [r4 + 4 * 16]              ; [6]
    pmulhrsw    m7,        m3
    packuswb    m6,        m7

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [23]
    pmulhrsw    m1,        m3

    palignr     m2,        m0, 8                      ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

    pmaddubsw   m2,        [r4 + 6 * 16]              ; [8]
    pmulhrsw    m2,        m3
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_6, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 30
    cmove       r2,        r4
    lea         r3,        [ang_table + 20 * 16]
    lea         r4,        [ang_table +  8 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    mova        m1,        m0

    pmaddubsw   m4,        m0, [r3 - 7 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m5,        m6, [r4 - 1 * 16]          ; [7]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        [r3]                       ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m6,        m1, [r4 - 7 * 16]          ; [1]
    pmulhrsw    m6,        m7

    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 6 * 16]              ; [14]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [27]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 6                      ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    pmaddubsw   m2,        [r4]                       ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_7, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 29
    cmove       r2,        r4
    lea         r3,        [ang_table + 24 * 16]
    lea         r4,        [ang_table +  6 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m0, [r4 + 3 * 16]          ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 + 3 * 16]          ; [27]
    pmulhrsw    m5,        m7

    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m6,        m1, [r4 - 2 * 16]          ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r4 + 7 * 16]          ; [13]
    pmulhrsw    m6,        m7

    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 2 * 16]              ; [22]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [31]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 4                      ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m2,        [r4 + 2 * 16]              ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_8, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 28
    cmove       r2,        r4
    lea         r3,        [ang_table + 23 * 16]
    lea         r4,        [ang_table +  8 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m2,        m0, 2                      ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m4,        m0, [r4 - 3 * 16]          ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r4 + 2 * 16]          ; [10]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 - 8 * 16]          ; [15]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        m0, [r3 - 3 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 + 2 * 16]          ; [25]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        [r3 + 7 * 16]              ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m2, [r4 - 5 * 16]          ; [3]
    pmulhrsw    m1,        m7

    pmaddubsw   m2,        [r4]                       ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_9, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 27
    cmove       r2,        r4
    lea         r3,        [ang_table + 10 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m0, [r3 - 8 * 16]          ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]          ; [4]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 - 4 * 16]          ; [6]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        m0, [r3 - 2 * 16]          ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3]                   ; [10]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        m0, [r3 + 2 * 16]          ; [12]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        m0, [r3 + 4 * 16]          ; [14]
    pmulhrsw    m1,        m7

    pmaddubsw   m0,        [r3 + 6 * 16]              ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_10, 3,6,5
    movh        m0,        [r2 + 17]
    mova        m4,        [pb_unpackbq]
    palignr     m1,        m0, 2
    pshufb      m1,        m4
    palignr     m2,        m0, 4
    pshufb      m2,        m4
    palignr     m3,        m0, 6
    pshufb      m3,        m4
    pshufb      m0,        m4

    lea         r5,             [r1 * 3]
    movhps      [r0 + r1],      m0
    movh        [r0 + r1 * 2],  m1
    movhps      [r0 + r5],      m1
    lea         r3,             [r0 + r1 * 4]
    movh        [r3],           m2
    movhps      [r3 + r1],      m2
    movh        [r3 + r1 * 2],  m3
    movhps      [r3 + r5],      m3

; filter
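; dst[x][0] = clip(left[0] + ((above[x] - topLeft) >> 1))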
    cmp         r4m, byte 0
    jz         .quit

    pmovzxbw    m0,        m0
    movu        m1,        [r2]
    palignr     m2,        m1, 1
    pshufb      m1,        m4
    pmovzxbw    m1,        m1
    pmovzxbw    m2,        m2
    psubw       m2,        m1
    psraw       m2,        1
    paddw       m0,        m2
    packuswb    m0,        m0

.quit:
    movh        [r0],      m0
    RET

cglobal intra_pred_ang8_26, 3,6,3
    movu        m2,             [r2]
    palignr     m0,             m2, 1
    lea         r5,             [r1 * 3]
    movh        [r0],           m0
    movh        [r0 + r1],      m0
    movh        [r0 + r1 * 2],  m0
    movh        [r0 + r5],      m0
    lea         r3,             [r0 + r1 * 4]
    movh        [r3],           m0
    movh        [r3 + r1],      m0
    movh        [r3 + r1 * 2],  m0
    movh        [r3 + r5],      m0

; filter
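; dst[0][y] = clip(above[0] + ((left[y] - topLeft) >> 1))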
    cmp         r4m, byte 0
    jz         .quit

    pshufb      m2,        [pb_unpackbq]
    movhlps     m1,        m2
    pmovzxbw    m2,        m2
    movu        m0,        [r2 + 17]
    pmovzxbw    m1,        m1
    pmovzxbw    m0,        m0
    psubw       m0,        m2
    psraw       m0,        1
    paddw       m1,        m0
    packuswb    m1,        m1
    pextrb      [r0],          m1, 0
    pextrb      [r0 + r1],     m1, 1
    pextrb      [r0 + r1 * 2], m1, 2
    pextrb      [r0 + r5],     m1, 3
    pextrb      [r3],          m1, 4
    pextrb      [r3 + r1],     m1, 5
    pextrb      [r3 + r1 * 2], m1, 6
    pextrb      [r3 + r5],     m1, 7
.quit:
    RET

cglobal intra_pred_ang8_11, 3,5,8
    xor         r4,        r4
    cmp         r3m,       byte 25
    mov         r3,        16
    cmove       r3,        r4

    movu        m0,        [r2 + r3]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m0,        [r2], 0
    palignr     m1,        m0, 1                      ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]

    punpcklbw   m0,        m1                         ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r3,        [ang_table + 23 * 16]
    mova        m7,        [pw_1024]

    pmaddubsw   m4,        m0, [r3 + 7 * 16]          ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 + 5 * 16]          ; [28]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 + 3 * 16]          ; [26]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        m0, [r3 + 1 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 - 1 * 16]          ; [22]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        m0, [r3 - 3 * 16]          ; [20]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        m0, [r3 - 5 * 16]          ; [18]
    pmulhrsw    m1,        m7

    pmaddubsw   m0,        [r3 - 7 * 16]              ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_12, 3,5,8
    xor         r4,        r4
    cmp         r3m,       byte 24
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    pslldq      m0,        m1, 1                      ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    pinsrb      m0,        [r2 + r3 + 6], 0

    lea         r4,        [ang_table + 22 * 16]
    mova        m7,        [pw_1024]

    punpckhbw   m2,        m0, m1                     ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
    punpcklbw   m0,        m1                         ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m2,        m0, 2                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m2, [r4 + 5 * 16]          ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4]                   ; [22]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m1,        m0, [r4 + 7 * 16]          ; [29]
    pmulhrsw    m1,        m7

    pmaddubsw   m0,        [r4 + 2 * 16]              ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    pmaddubsw   m5,        m2, [r4 - 5 * 16]          ; [17]
    pmulhrsw    m5,        m7

    lea         r4,        [ang_table + 7 * 16]
    pmaddubsw   m6,        m2, [r4 + 5 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m2, [r4]                   ; [7]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        [r4 - 5 * 16]              ; [2]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_13, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 23
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 +  r4]                 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    pslldq      m1,        1                          ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    pinsrb      m1,        [r2 + r3 + 4], 0
    pslldq      m0,        m1, 1                      ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
    pinsrb      m0,        [r2 + r3 + 7], 0
    punpckhbw   m5,        m0, m1                     ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
    punpcklbw   m0,        m1                         ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m1,        m5, m0, 2                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m5,        m0, 4                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 24 * 16]
    mova        m7,        [pw_1024]

    pmaddubsw   m4,        m5, [r4 - 1 * 16]          ; [23]
    pmulhrsw    m4,        m7

    pmaddubsw   m6,        m1, [r4 + 4 * 16]          ; [28]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        [r4]                       ; [24]
    pmulhrsw    m0,        m7

    lea         r4,        [ang_table + 13 * 16]
    pmaddubsw   m3,        m5, [r4 + 1 * 16]          ; [14]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        [r4 - 8 * 16]              ; [5]
    pmulhrsw    m5,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r4 + 6 * 16]          ; [19]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        m1, [r4 - 3 * 16]          ; [10]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 - 12 * 16]             ; [1]
    pmulhrsw    m1,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_14, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 22
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4 - 2]              ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
    pinsrb      m1,        [r2], 2
    pinsrb      m1,        [r2 + r3 + 2], 1
    pinsrb      m1,        [r2 + r3 + 5], 0
    pslldq      m0,        m1, 1                      ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
    pinsrb      m0,        [r2 + r3 + 7], 0
    punpckhbw   m2,        m0, m1                     ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
    punpcklbw   m0,        m1                         ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m1,        m2, m0, 2                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m6,        m2, m0, 4                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m2,        m0, 6                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 24 * 16]
    mova        m3,        [pw_1024]

    pmaddubsw   m4,        m2, [r4 - 5 * 16]          ; [19]
    pmulhrsw    m4,        m3

    pmaddubsw   m0,        [r4]                       ; [24]
    pmulhrsw    m0,        m3

    pmaddubsw   m5,        m6, [r4 + 1 * 16]          ; [25]
    pmulhrsw    m5,        m3

    lea         r4,        [ang_table + 12 * 16]
    pmaddubsw   m6,        [r4]                       ; [12]
    pmulhrsw    m6,        m3
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r4 + 19 * 16]         ; [31]
    pmulhrsw    m6,        m3

    pmaddubsw   m2,        [r4 - 6 * 16]              ; [6]
    pmulhrsw    m2,        m3
    packuswb    m4,        m2

    pmaddubsw   m2,        m1, [r4 + 6 * 16]          ; [18]
    pmulhrsw    m2,        m3
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 - 7 * 16]              ; [5]
    pmulhrsw    m1,        m3
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_15, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 21
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    movu        m2,        [r2 + r3]
    pshufb      m2,        [c_mode16_15]
    palignr     m1,        m2, 13                     ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
    pslldq      m0,        m1, 1                      ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
    pinsrb      m0,        [r2 + r3 + 8], 0
    punpckhbw   m4,        m0, m1                     ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
    punpcklbw   m0,        m1                         ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    palignr     m1,        m4, m0, 2                  ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m6,        m4, m0, 4                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m5,        m4, m0, 6                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4,        m0, 8                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 23 * 16]
    mova        m3,        [pw_1024]

    pmaddubsw   m4,        [r4 - 8 * 16]              ; [15]
    pmulhrsw    m4,        m3

    pmaddubsw   m2,        m5, [r4 + 7 * 16]          ; [30]
    pmulhrsw    m2,        m3
    packuswb    m4,        m2

    pmaddubsw   m5,        [r4 - 10 * 16]             ; [13]
    pmulhrsw    m5,        m3

    pmaddubsw   m2,        m6, [r4 + 5 * 16]          ; [28]
    pmulhrsw    m2,        m3
    packuswb    m5,        m2

    pmaddubsw   m2,        m1, [r4 + 3 * 16]          ; [26]
    pmulhrsw    m2,        m3

    pmaddubsw   m0,        [r4 + 1 * 16]              ; [24]
    pmulhrsw    m0,        m3

    lea         r4,        [ang_table + 11 * 16]
    pmaddubsw   m6,        [r4]                       ; [11]
    pmulhrsw    m6,        m3
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 - 2 * 16]              ; [9]
    pmulhrsw    m1,        m3
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_16, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 20
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    movu        m2,        [r2 + r3]
    pshufb      m2,        [c_mode16_16]
    palignr     m1,        m2, 12                     ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
    pslldq      m0,        m1, 1                      ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
    pinsrb      m0,        [r2 + r3 + 8], 0
    punpckhbw   m4,        m0, m1                     ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    punpcklbw   m0,        m1                         ; [3 2 2 1 1 0 0 a a b b c c d d e]
    palignr     m1,        m4, m0, 2                  ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    palignr     m6,        m4, m0, 4                  ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m2,        m4, m0, 6                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m5,        m4, m0, 8                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4,        m0, 10                     ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 22 * 16]
    mova        m7,        [pw_1024]

    pmaddubsw   m3,        m5, [r4]                   ; [22]
    pmulhrsw    m3,        m7

    pmaddubsw   m0,        [r4 + 2 * 16]              ; [24]
    pmulhrsw    m0,        m7

    lea         r4,        [ang_table + 9 * 16]

    pmaddubsw   m4,        [r4 + 2 * 16]              ; [11]
    pmulhrsw    m4,        m7
    packuswb    m4,        m3

    pmaddubsw   m2,        [r4 + 3 * 16]              ; [12]
    pmulhrsw    m2,        m7

    pmaddubsw   m5,        [r4 - 8 * 16]              ; [1]
    pmulhrsw    m5,        m7
    packuswb    m5,        m2

    mova        m2,        m6
    pmaddubsw   m6,        [r4 + 14 * 16]             ; [23]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        [r4 -  7 * 16]             ; [2]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 + 4 * 16]              ; [13]
    pmulhrsw    m1,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_17, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 19
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m2,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m2,        [r2], 0
    movu        m1,        [r2 + r3]
    pshufb      m1,        [c_mode16_17]
    palignr     m2,        m1, 11                     ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
    pslldq      m0,        m2, 1                      ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
    pinsrb      m0,        [r2 + r3 + 7], 0
    punpckhbw   m1,        m0, m2                     ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    punpcklbw   m0,        m2                         ; [2 1 1 0 0 a a b b c c d d e e f]

    palignr     m5,        m1, m0, 8                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m2,        m1, m0, 10                 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4,        m1, m0, 12                 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 17 * 16]
    mova        m3,        [pw_1024]

    pmaddubsw   m2,        [r4 - 5 * 16]              ; [12]
    pmulhrsw    m2,        m3

    pmaddubsw   m4,        [r4 - 11 * 16]             ; [6]
    pmulhrsw    m4,        m3
    packuswb    m4,        m2

    pmaddubsw   m5,        [r4 + 1 * 16]              ; [18]
    pmulhrsw    m5,        m3

    palignr     m2,        m1, m0, 6                  ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    pmaddubsw   m2,        [r4 + 7 * 16]              ; [24]
    pmulhrsw    m2,        m3
    packuswb    m5,        m2

    palignr     m6,        m1, m0, 4                  ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    mova        m2,        m6
    pmaddubsw   m6,        [r4 + 13 * 16]             ; [30]
    pmulhrsw    m6,        m3

    pmaddubsw   m2,        [r4 - 13 * 16]             ; [4]
    pmulhrsw    m2,        m3
    packuswb    m6,        m2

    palignr     m1,        m0, 2                      ; [3 2 2 1 1 0 0 a a b b c c d d e]
    pmaddubsw   m1,        [r4 - 7 * 16]              ; [10]
    pmulhrsw    m1,        m3

    pmaddubsw   m0,        [r4 - 1 * 16]              ; [16]
    pmulhrsw    m0,        m3
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_18, 4,4,1
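    ; mode 18 is the pure 45-degree diagonal, so no weighting is needed: build
    ; the 16-byte reference [left reversed | corner | above] and slide an
    ; 8-byte window one sample per row, bottom row first.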
    movu        m0, [r2 + 16]
    pinsrb      m0, [r2], 0
    pshufb      m0, [pb_swap8]
    movhps      m0, [r2 + 1]
    lea         r2, [r0 + r1 * 4]
    lea         r3, [r1 * 3]
    movh        [r2 + r3], m0
    psrldq      m0, 1
    movh        [r2 + r1 * 2], m0
    psrldq      m0, 1
    movh        [r2 + r1], m0
    psrldq      m0, 1
    movh        [r2], m0
    psrldq      m0, 1
    movh        [r0 + r3], m0
    psrldq      m0, 1
    movh        [r0 + r1 * 2], m0
    psrldq      m0, 1
    movh        [r0 + r1], m0
    psrldq      m0, 1
    movh        [r0], m0
    RET

%macro TRANSPOSE_STORE_8x8 6 ; %1: 8-pixel column index, %2: transpose flag, %3-%6: packed rows
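    ; Horizontal-family angles predict column-wise: each of %3-%6 holds two
    ; packed 8-sample prediction lines that are really destination columns,
    ; so the %2 == 1 path transposes the 8x8 byte block into raster order
    ; before storing. Vertical-family callers pass %2 == 0 and store directly.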
  %if %2 == 1
    ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
    punpckhbw   m0,        %3, %4
    punpcklbw   %3,        %4
    punpckhbw   %4,        %3, m0
    punpcklbw   %3,        m0

    punpckhbw   m0,        %5, %6
    punpcklbw   %5,        %6
    punpckhbw   %6,        %5, m0
    punpcklbw   %5,        m0

    punpckhdq   m0,        %3, %5
    punpckldq   %3,        %5
    punpckldq   %5,        %4, %6
    punpckhdq   %4,        %6

    movh        [r0         + %1 * 8], %3
    movhps      [r0 +  r1   + %1 * 8], %3
    movh        [r0 +  r1*2 + %1 * 8], m0
    movhps      [r0 +  r5   + %1 * 8], m0
    movh        [r6         + %1 * 8], %5
    movhps      [r6 +  r1   + %1 * 8], %5
    movh        [r6 +  r1*2 + %1 * 8], %4
    movhps      [r6 +  r5   + %1 * 8], %4
  %else
    ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
    movh        [r0         ], %3
    movhps      [r0 + r1    ], %3
    movh        [r0 + r1 * 2], %4
    movhps      [r0 + r5    ], %4
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], %5
    movhps      [r0 + r1    ], %5
    movh        [r0 + r1 * 2], %6
    movhps      [r0 + r5    ], %6
    lea         r0, [r0 + r1 * 4]
  %endif
%endmacro

;------------------------------------------------------------------------------------------
; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
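; All weighted kernels below share one interpolation idiom: an ang_table row
; holds interleaved (32 - f, f) byte pairs, pmaddubsw forms the 16-bit sum
; (32 - f) * ref[x] + f * ref[x + 1], and pmulhrsw against pw_1024 (2^10)
; computes (sum + 16) >> 5. A scalar sketch of what each such pair evaluates
; (the standard HEVC angular rounding; index bookkeeping omitted):
;     pred[x] = ((32 - f) * ref[i] + f * ref[i + 1] + 16) >> 5
; The bracketed per-line comments ([26], [20], ...) give the fraction f.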
INIT_XMM ssse3
cglobal intra_pred_ang16_2, 3,5,3
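    ; modes 2 and 34 are the unweighted 45-degree diagonals: select the
    ; reference (src + 34 for mode 2, src + 2 for mode 34), then each
    ; successive row is the same 16-byte window advanced one sample by palignr.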
    lea             r4, [r2 + 2]
    add             r2, 34
    cmp             r3m, byte 34
    cmove           r2, r4
    movu            m0, [r2]
    movu            m1, [r2 + 16]
    movu            [r0], m0
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 2
    movu            [r0], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 4
    movu            [r0], m2
    palignr         m2, m1, m0, 5
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 6
    movu            [r0], m2
    palignr         m2, m1, m0, 7
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 8
    movu            [r0], m2
    palignr         m2, m1, m0, 9
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 10
    movu            [r0], m2
    palignr         m2, m1, m0, 11
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 12
    movu            [r0], m2
    palignr         m2, m1, m0, 13
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 14
    movu            [r0], m2
    palignr         m2, m1, m0, 15
    movu            [r0 + r1], m2
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
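    ; mode 3, horizontal family: add r2, 32 points at the left-neighbour half
    ; of the reference array. Each loop pass computes sixteen 8-sample columns
    ; and TRANSPOSE_STORE_8x8 flips them out as two 8x8 raster blocks.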
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4

    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 8

    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 12

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movhps      m1,        [r2 + 14]                  ; [00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
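    ; mode 33 mirrors mode 3: identical weight schedule, but as a vertical
    ; mode its rows are already in raster order, so no transpose is needed
    ; (the final [00] row is a plain copy of the reference).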
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4

    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 8

    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 12

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movh        m2,        [r2 + 14]                  ; [00]

    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
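    ; mode 4, horizontal family: same structure as mode 3 with this angle's
    ; weight schedule (bracketed per line).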
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 4 * 16]              ; [ 20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 10

    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
    pmulhrsw    m4,        m7

    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4
    mova        m6,        m5

    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 +      16]             ; [17]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8
    palignr     m2,        m0, 10

    pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
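    ; mode 32 mirrors mode 4: identical weights, vertical family, direct store.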
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 4 * 16]              ; [ 20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 10

    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
    pmulhrsw    m4,        m7

    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4
    mova        m6,        m5

    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 +      16]             ; [17]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8
    palignr     m2,        m0, 10

    pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
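    ; mode 5, horizontal family: palignr supplies the whole-sample offsets,
    ; ang_table the fractional weights.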
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1,        [r2 + 2]                   ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    palignr     m5,        m2, m3, 2

    pmaddubsw   m4,        m3, [r3 +      16]         ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 4

    pmaddubsw   m5,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 6

    pmaddubsw   m6,        [r3 + 5 * 16]              ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 8

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    palignr     m4,        m2, m3, 8
    palignr     m5,        m2, m3, 10

    pmaddubsw   m4,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 12

    pmaddubsw   m5,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 14

    pmaddubsw   m6,        [r3 + 13 * 16]             ; [29]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
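    ; mode 31 mirrors mode 5: identical weights, stored without transposing.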
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1,        [r2 + 2]                   ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    palignr     m5,        m2, m3, 2

    pmaddubsw   m4,        m3, [r3 +      16]         ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 4

    pmaddubsw   m5,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 6

    pmaddubsw   m6,        [r3 + 5 * 16]              ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 8

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m4,        m2, m3, 8
    palignr     m5,        m2, m3, 10

    pmaddubsw   m4,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 12

    pmaddubsw   m5,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 14

    pmaddubsw   m6,        [r3 + 13 * 16]             ; [29]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
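    ; mode 6, horizontal family.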
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 2

    pmaddubsw   m5,        m6, [r3 - 9 * 16]          ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m3, 4

    pmaddubsw   m6,        m1, [r3 - 15 * 16]         ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 6

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    palignr     m4,        m2, m3, 6
    palignr     m6,        m2, m3, 8

    pmaddubsw   m4,        [r3 +  5 * 16]             ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m6, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m6, [r3 - 16]              ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m0,        m2, m3, 10

    pmaddubsw   m6,        m0, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 12

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
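    ; mode 30 mirrors mode 6: vertical family, direct store.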
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 2

    pmaddubsw   m5,        m6, [r3 - 9 * 16]          ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m3, 4

    pmaddubsw   m6,        m1, [r3 - 15 * 16]         ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 6

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m4,        m2, m3, 6
    palignr     m6,        m2, m3, 8

    pmaddubsw   m4,        [r3 +  5 * 16]             ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m6, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m6, [r3 - 16]              ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m0,        m2, m3, 10

    pmaddubsw   m6,        m0, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 12

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
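    ; mode 7, horizontal family.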
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    palignr     m1,        m2, m3, 2

    pmaddubsw   m5,        m3, [r3 + 11 * 16]         ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 4

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    palignr     m1,        m2, m3, 4

    pmaddubsw   m4,        m1, [r3 + 16]              ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m0,        m2, m3, 6

    pmaddubsw   m5,        m0, [r3 - 13 * 16]         ; [03]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 8

    pmaddubsw   m1,        m2, [r3 - 9 * 16]          ; [07]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
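    ; mode 29 mirrors mode 7: vertical family, direct store.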
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    palignr     m1,        m2, m3, 2

    pmaddubsw   m5,        m3, [r3 + 11 * 16]         ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 4

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m1,        m2, m3, 4

    pmaddubsw   m4,        m1, [r3 + 16]              ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m0,        m2, m3, 6

    pmaddubsw   m5,        m0, [r3 - 13 * 16]         ; [03]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 8

    pmaddubsw   m1,        m2, [r3 - 9 * 16]          ; [07]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8
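    ; mode 8, horizontal family: the angle is shallow, so several fractional
    ; weights are applied to one source pair before palignr advances it.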
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m1,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3,        m1, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m0,        m1, m3                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m1,        m3                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m1, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m2,        m1, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m2,        m7
    packuswb    m4,        m2

    pmaddubsw   m5,        m1, [r3 - 1 * 16]          ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 + 9 * 16]          ; [25]
    pmulhrsw    m6,        m7
    pmaddubsw   m2,        m1, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    palignr     m2,        m0, m1, 2
    palignr     m3,        m0, m1, 4

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m2, [r3 + 7 * 16]          ; [23]
    pmulhrsw    m5,        m7
    pmaddubsw   m2,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2

    pmaddubsw   m6,        m3, [r3 - 15 * 16]         ; [01]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 5 * 16]          ; [11]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r3]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_28, 3,7,8
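    ; mode 28 mirrors mode 8: vertical family, direct store.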
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m1,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3,        m1, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m0,        m1, m3                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m1,        m3                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m1, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m2,        m1, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m2,        m7
    packuswb    m4,        m2

    pmaddubsw   m5,        m1, [r3 - 1 * 16]          ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 + 9 * 16]          ; [25]
    pmulhrsw    m6,        m7
    pmaddubsw   m2,        m1, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    palignr     m2,        m0, m1, 2
    palignr     m3,        m0, m1, 4

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m2, [r3 + 7 * 16]          ; [23]
    pmulhrsw    m5,        m7
    pmaddubsw   m2,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2

    pmaddubsw   m6,        m3, [r3 - 15 * 16]         ; [01]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 5 * 16]          ; [11]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r3]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_9, 3,7,8
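    ; mode 9, horizontal family: the weights step by 2 per line, so all rows
    ; draw on one two-byte span; the final [00] line is an unweighted copy.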
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m2,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3,        m2, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m2,        m3                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m2, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m2, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m2, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m2, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m2, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m2, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m2, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m2, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    punpcklqdq  m1,        m3                         ; [00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_27, 3,7,8
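    ; mode 27 mirrors mode 9: vertical family; the last row is again a plain
    ; shifted copy of the reference.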
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m2,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m3,        m2                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2              ; [00]

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_10, 5,6,8
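    ; mode 10, pure horizontal: broadcast each left-neighbour byte across its
    ; row. With the bFilter flag (r4) set, the first row gets the HEVC-style
    ; gradient filter: left[0] + ((above[x] - topLeft) >> 1).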
    lea         r5,        [r1 * 3]
    pxor        m7,        m7

    movu        m0,        [r2 + 1 + 32]
    palignr     m1,        m0, 1
    pshufb      m1,        m7
    palignr     m2,        m0, 2
    pshufb      m2,        m7
    palignr     m3,        m0, 3
    pshufb      m3,        m7
    palignr     m4,        m0, 4
    pshufb      m4,        m7
    palignr     m5,        m0, 5
    pshufb      m5,        m7
    palignr     m6,        m0, 6
    pshufb      m6,        m7

    movu        [r0 + r1],      m1
    movu        [r0 + r1 * 2],  m2
    movu        [r0 + r5],      m3
    lea         r3,             [r0 + r1 * 4]
    movu        [r3],           m4
    movu        [r3 + r1],      m5
    movu        [r3 + r1 * 2],  m6

    palignr     m1,        m0, 7
    pshufb      m1,        m7
    movhlps     m2,        m0
    pshufb      m2,        m7
    palignr     m3,        m0, 9
    pshufb      m3,        m7
    palignr     m4,        m0, 10
    pshufb      m4,        m7
    palignr     m5,        m0, 11
    pshufb      m5,        m7
    palignr     m6,        m0, 12
    pshufb      m6,        m7

    movu        [r3 + r5],      m1
    lea         r3,             [r3 + r1 * 4]
    movu        [r3],           m2
    movu        [r3 + r1],      m3
    movu        [r3 + r1 * 2],  m4
    movu        [r3 + r5],      m5
    lea         r3,             [r3 + r1 * 4]
    movu        [r3],           m6

    palignr     m1,        m0, 13
    pshufb      m1,        m7
    palignr     m2,        m0, 14
    pshufb      m2,        m7
    palignr     m3,        m0, 15
    pshufb      m3,        m7
    pshufb      m0,        m7

    movu        [r3 + r1],      m1
    movu        [r3 + r1 * 2],  m2
    movu        [r3 + r5],      m3

; filter
    cmp         r4w, byte 0
    jz         .quit
    pmovzxbw    m0,        m0
    mova        m1,        m0
    movu        m2,        [r2]
    movu        m3,        [r2 + 1]

    pshufb      m2,        m7
    pmovzxbw    m2,        m2
    movhlps     m4,        m3
    pmovzxbw    m3,        m3
    pmovzxbw    m4,        m4
    psubw       m3,        m2
    psubw       m4,        m2
    psraw       m3,        1
    psraw       m4,        1
    paddw       m0,        m3
    paddw       m1,        m4
    packuswb    m0,        m1
.quit:
    movu        [r0],      m0
    RET

INIT_XMM sse4
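; mode 26, pure vertical: replicate the above-row sixteen times. With bFilter
; set, the first column is gradient-filtered from the left neighbours and the
; corner, then patched in byte-by-byte with pextrb.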
%if ARCH_X86_64 == 1
cglobal intra_pred_ang16_26, 3,8,5
    mov     r7, r4mp
    %define bfilter r7w
%else
cglobal intra_pred_ang16_26, 5,7,5,0-4
    %define bfilter dword[rsp]
    mov     bfilter, r4
%endif
    movu        m0,             [r2 + 1]

    lea         r4,             [r1 * 3]
    lea         r3,             [r0 + r1 * 4]
    lea         r5,             [r3 + r1 * 4]
    lea         r6,             [r5 + r1 * 4]

    movu        [r0],           m0
    movu        [r0 + r1],      m0
    movu        [r0 + r1 * 2],  m0
    movu        [r0 + r4],      m0
    movu        [r3],           m0
    movu        [r3 + r1],      m0
    movu        [r3 + r1 * 2],  m0
    movu        [r3 + r4],      m0
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0

    movu        [r6],           m0
    movu        [r6 + r1],      m0
    movu        [r6 + r1 * 2],  m0
    movu        [r6 + r4],      m0

; filter
    cmp         bfilter, byte 0
    jz         .quit

    pxor        m4,        m4
    pshufb      m0,        m4
    pmovzxbw    m0,        m0
    mova        m1,        m0
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    movu        m3,        [r2 + 1 + 32]

    pshufb      m2,        m4
    pmovzxbw    m2,        m2
    movhlps     m4,        m3
    pmovzxbw    m3,        m3
    pmovzxbw    m4,        m4
    psubw       m3,        m2
    psubw       m4,        m2
    psraw       m3,        1
    psraw       m4,        1
    paddw       m0,        m3
    paddw       m1,        m4
    packuswb    m0,        m1

    pextrb      [r0],           m0, 0
    pextrb      [r0 + r1],      m0, 1
    pextrb      [r0 + r1 * 2],  m0, 2
    pextrb      [r0 + r4],      m0, 3
    pextrb      [r3],           m0, 4
    pextrb      [r3 + r1],      m0, 5
    pextrb      [r3 + r1 * 2],  m0, 6
    pextrb      [r3 + r4],      m0, 7
    pextrb      [r5],           m0, 8
    pextrb      [r5 + r1],      m0, 9
    pextrb      [r5 + r1 * 2],  m0, 10
    pextrb      [r5 + r4],      m0, 11
    pextrb      [r6],           m0, 12
    pextrb      [r6 + r1],      m0, 13
    pextrb      [r6 + r1 * 2],  m0, 14
    pextrb      [r6 + r4],      m0, 15
.quit:
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_11, 3,7,8
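    ; mode 11, horizontal family, weights stepping down by 2: the reference
    ; needs one projected sample (the corner pinned into lane 0), and both
    ; 8-row halves are fully unrolled rather than looped.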
    lea         r3,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]              ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    mova        m2,        m3
    palignr     m1,        m3, 1                  ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3,        m1                     ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    punpcklqdq  m1,        m2                         ;[00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m3,        [r2 + 40]              ;[23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8]
    mova        m2,        m3
    palignr     m1,        m3, 1                  ;[23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
    punpcklbw   m3,        m1                     ;[16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]

    pmaddubsw   m4,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    punpcklqdq  m1,        m2                         ;[00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

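; Mode 25 (near-vertical, angle -2), the mirror of mode 11: the same
; fraction sequence 30,28,...,2,0 is applied to the above reference at
; [r2], but rows are stored directly with no transpose.  The final [00]
; row is an untouched copy of the reference kept in m2, and the loop makes
; two passes of 8 columns each.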
INIT_XMM sse4
cglobal intra_pred_ang16_25, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2]                   ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    mova        m2,        m3
    palignr     m1,        m3, 1                  ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3,        m1                     ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

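; Mode 12 (angle -5).  With a negative angle some rows index reference
; pixels to the left of sample 0, so c_mode16_12 gathers the projected
; second-edge pixels into m2, and pslldq/palignr feed them in front of the
; main reference one byte at a time as the fraction wraps
; (27,22,17,12,7,2,29,24,...).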
INIT_XMM sse4
cglobal intra_pred_ang16_12, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m0,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_12]

    palignr     m0,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m0, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m0, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m0, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r4 - 14 * 16]             ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 15

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

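; Mode 24 (angle -5), vertical counterpart of mode 12: identical fraction
; sequence, but the main reference is the above row at [r2], the projected
; pixels come from the left buffer at [r2 + 32], and the 8x8 tiles are
; stored without transposition.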
INIT_XMM sse4
cglobal intra_pred_ang16_24, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m0,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pshufb      m2,        [c_mode16_12]

    palignr     m0,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m0, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m0, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m0, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r4 - 14 * 16]             ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 15

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

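; Mode 13 (angle -9): the same 2-tap scheme with fractions
; 23,14,5,28,19,10,1,..., using the denser projection table c_mode16_13
; because the steeper angle pulls in a projected pixel roughly every three
; rows.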
INIT_XMM sse4
cglobal intra_pred_ang16_13, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_13]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m5, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        [r4 - 11 * 16]             ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

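; Mode 23 (angle -9), vertical twin of mode 13: same weights and
; projection steps, with the c_mode16_13 gather applied to the left buffer
; at [r2 + 32] and direct (untransposed) stores.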
INIT_XMM sse4
cglobal intra_pred_ang16_23, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pshufb      m2,        [c_mode16_13]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m5, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        [r4 - 11 * 16]             ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

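; Mode 14 (angle -13): fractions 19,6,25,12,31,18,... wrap every two to
; three rows, so a c_mode16_14-projected pixel is shifted in ahead of the
; reference at about that rate.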
INIT_XMM sse4
cglobal intra_pred_ang16_14, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_14]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

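; Mode 22 (angle -13), vertical mirror of mode 14: same coefficients and
; c_mode16_14 projection, sourced from [r2 + 32] and stored without the
; transpose.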
INIT_XMM sse4
cglobal intra_pred_ang16_22, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pshufb      m2,        [c_mode16_14]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

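; Mode 15 (angle -17): the fraction alternates 15,30,13,28,11,26,..., so
; it wraps on every second row and a c_mode16_15-projected pixel is
; shifted in at the same rate.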
INIT_XMM sse4
cglobal intra_pred_ang16_15, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_15]

    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 16]                  ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 -  5 * 16]         ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7  * 16]         ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 +  4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

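; Mode 21 (angle -17), vertical twin of mode 15: identical weights, with
; the c_mode16_15 projection taken from the left buffer and rows stored
; directly.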
INIT_XMM sse4
cglobal intra_pred_ang16_21, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    pshufb      m2,        [c_mode16_15]

    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 16]                  ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 -  5 * 16]         ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7  * 16]         ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 +  4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

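; Mode 16 (angle -21): fractions 11,22,1,12,23,2,... advance the integer
; reference offset on two of every three rows, hence the long run of
; pslldq/palignr pairs feeding c_mode16_16-projected pixels in front of
; the reference.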
INIT_XMM sse4
cglobal intra_pred_ang16_16, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_16]              ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 5  * 16]             ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1                           ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 9  * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]

    pmaddubsw   m4,        m3, [r4 - 5  * 16]         ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 +  9 * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

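; Mode 20 (angle -21), vertical mirror of mode 16: same weights and
; c_mode16_16 projection (gathered from the left buffer at [r2 + 32]),
; stored without the transpose.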
INIT_XMM sse4
cglobal intra_pred_ang16_20, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    pshufb      m2,        [c_mode16_16]              ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 5  * 16]             ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1                           ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 9  * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]

    pmaddubsw   m4,        m3, [r4 - 5  * 16]         ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 +  9 * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET
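
;-----------------------------------------------------------------------------
; intra_pred_ang16_17: 16x16 angular prediction for HEVC mode 17
; (intraPredAngle -26). Mode 17 belongs to the horizontal family, so each
; finished 8x8 block is transposed on store (second TRANSPOSE_STORE_8x8
; argument = 1); c_mode16_17 gathers the projected samples the negative
; angle needs from the other reference row.
;-----------------------------------------------------------------------------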

INIT_XMM sse4
cglobal intra_pred_ang16_17, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_17]              ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 -  4 * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
    pinsrb      m2,       [r2 + 5], 0                 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pslldq      m2,       1                           ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1                           ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]

    pmaddubsw   m4,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  2 * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET
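
;-----------------------------------------------------------------------------
; intra_pred_ang16_19: vertical mirror of mode 17 above (same intraPredAngle
; -26 and the same per-row weights). Only the roles of [r2] and [r2 + 32]
; swap, and the 8x8 blocks are stored without transposition.
;-----------------------------------------------------------------------------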

INIT_XMM sse4
cglobal intra_pred_ang16_19, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    pshufb      m2,        [c_mode16_17]              ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 -  4 * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
    pinsrb      m2,       [r2 + 5 + 32], 0            ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pslldq      m2,       1                           ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1                           ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]

    pmaddubsw   m4,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  2 * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET
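
;-----------------------------------------------------------------------------
; intra_pred_ang16_18: the pure 45-degree diagonal (intraPredAngle -32), so
; no fractional interpolation is needed: each row is the previous one shifted
; one sample toward the reversed second reference, one palignr + movu per row.
;-----------------------------------------------------------------------------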

INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3
    movu        m0,         [r2]
    movu        m1,         [r2 + 32]
    mova        m2,         [c_mode16_18]
    pshufb      m1,         m2

    lea         r2,         [r1 * 2]
    lea         r3,         [r1 * 3]
    lea         r4,         [r1 * 4]
    movu        [r0],       m0
    palignr     m2,         m0, m1, 15
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 14
    movu        [r0 + r2],  m2
    palignr     m2,         m0, m1, 13
    movu        [r0 + r3],  m2
    lea         r0,         [r0 + r4]
    palignr     m2,         m0, m1, 12
    movu        [r0],       m2
    palignr     m2,         m0, m1, 11
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 10
    movu        [r0 + r2],  m2
    palignr     m2,         m0, m1, 9
    movu        [r0 + r3],  m2
    lea         r0,         [r0 + r4]
    palignr     m2,         m0, m1, 8
    movu        [r0],       m2
    palignr     m2,         m0, m1, 7
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 6
    movu        [r0 + r2],  m2
    palignr     m2,         m0, m1, 5
    movu        [r0 + r3],  m2
    lea         r0,         [r0 + r4]
    palignr     m2,         m0, m1, 4
    movu        [r0],       m2
    palignr     m2,         m0, m1, 3
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 2
    movu        [r0 + r2],  m2
    palignr     m0,         m1, 1
    movu        [r0 + r3],  m0
    RET

; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
; %1: 8-wide column index (0-3), %2: transpose flag [0/1], %3-%10: per-row
; coefficient offsets c0-c7 into ang_table; a 0 offset marks a full-pel row
; that is passed through unfiltered (only zero-extended and repacked).
%macro PROC32_8x8 10  ; col, transpose[0/1], c0, c1, c2, c3, c4, c5, c6, c7
  %if %3 != 0
    pshufb      m0, [r3]
    pmaddubsw   m0, [r4 + %3 * 16]
    pmulhrsw    m0, [pw_1024]
  %endif
  %if %4 == 0
    pmovzxbw    m1, m1
  %else
    pshufb      m1, [r3]
    pmaddubsw   m1, [r4 + %4 * 16]
    pmulhrsw    m1, [pw_1024]
  %endif
  %if %3 == 0
    packuswb    m1, m1
    movlhps     m0, m1
  %else
    packuswb    m0, m1
  %endif
    mova        m1, [pw_1024]
  %if %5 != 0
    pshufb      m2, [r3]
    pmaddubsw   m2, [r4 + %5 * 16]
    pmulhrsw    m2, m1
  %endif
  %if %6 == 0
    pmovzxbw    m3, m3
  %else
    pshufb      m3, [r3]
    pmaddubsw   m3, [r4 + %6 * 16]
    pmulhrsw    m3, m1
  %endif
  %if %5 == 0
    packuswb    m3, m3
    movlhps     m2, m3
  %else
    packuswb    m2, m3
  %endif
  %if %7 != 0
    pshufb      m4, [r3]
    pmaddubsw   m4, [r4 + %7 * 16]
    pmulhrsw    m4, m1
  %endif
  %if %8 == 0
    pmovzxbw    m5, m5
  %else
    pshufb      m5, [r3]
    pmaddubsw   m5, [r4 + %8 * 16]
    pmulhrsw    m5, m1
  %endif
  %if %7 == 0
    packuswb    m5, m5
    movlhps     m4, m5
  %else
    packuswb    m4, m5
  %endif
  %if %9 != 0
    pshufb      m6, [r3]
    pmaddubsw   m6, [r4 + %9 * 16]
    pmulhrsw    m6, m1
  %endif
  %if %10 == 0
    pmovzxbw    m7, m7
  %else
    pshufb      m7, [r3]
    pmaddubsw   m7, [r4 + %10 * 16]
    pmulhrsw    m7, m1
  %endif
  %if %9 == 0
    packuswb    m7, m7
    movlhps     m6, m7
  %else
    packuswb    m6, m7
  %endif

  %if %2 == 1
    ; transpose
    punpckhbw   m1,        m0, m2
    punpcklbw   m0,        m2
    punpckhbw   m3,        m0, m1
    punpcklbw   m0,        m1

    punpckhbw   m1,        m4, m6
    punpcklbw   m4,        m6
    punpckhbw   m6,        m4, m1
    punpcklbw   m4,        m1

    punpckhdq   m2,        m0, m4
    punpckldq   m0,        m4
    punpckldq   m4,        m3, m6
    punpckhdq   m3,        m6

    movh        [r0         + %1 * 8], m0
    movhps      [r0 +  r1   + %1 * 8], m0
    movh        [r0 +  r1*2 + %1 * 8], m2
    movhps      [r0 +  r5   + %1 * 8], m2
    movh        [r6         + %1 * 8], m4
    movhps      [r6 +  r1   + %1 * 8], m4
    movh        [r6 +  r1*2 + %1 * 8], m3
    movhps      [r6 +  r5   + %1 * 8], m3
  %else
    movh        [r0         ], m0
    movhps      [r0 + r1    ], m0
    movh        [r0 + r1 * 2], m2
    movhps      [r0 + r5    ], m2
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m6
    movhps      [r0 + r5    ], m6
  %endif
%endmacro
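
; Each MODE_x_y macro below is a body shared by the two 32x32 angular modes
; with the same |intraPredAngle| (3/33, 4/32, ...); %1 is forwarded to
; TRANSPOSE_STORE_8x8 as the transpose flag, so the horizontal mode of each
; pair stores transposed and the vertical one directly. The bracketed
; constants are the per-row fractional offsets f = ((y + 1) * angle) & 31;
; the reference window is reloaded whenever the integer part has advanced
; far enough. A scalar sketch of one unrolled row (illustrative names, not
; x265's API; assumes <stdint.h>):
;
;   /* one 32-wide predicted row at a given intraPredAngle */
;   static void predict_row(uint8_t *dst, const uint8_t *ref, int y, int angle)
;   {
;       int pos = (y + 1) * angle;           /* 27.5 fixed-point position  */
;       int i = pos >> 5, f = pos & 31;      /* integer / fractional parts */
;       for (int x = 0; x < 32; x++)
;           dst[x] = (uint8_t)((ref[i + x]     * (32 - f) +
;                               ref[i + x + 1] * f + 16) >> 5);
;   }
;
; MODE_3_33: modes 3 and 33, intraPredAngle 26 (f steps by 26 mod 32).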

%macro MODE_3_33 1
    movu        m0,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9  8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 4
    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 6
    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 8
    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 12
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 6
    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 14]                  ; [00]

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 14]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 4
    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 6
    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 8
    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 12
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 21]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 6
    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 27]                  ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
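
; MODE_4_32: modes 4 and 32, intraPredAngle 21 (f steps by 21 mod 32).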

%macro MODE_4_32 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1
    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 6
    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 10
    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 4
    mova        m6,        m5
    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m6,        m2, m0, 6
    pmaddubsw   m6,        [r3 +      16]             ; [17]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 8
    pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 10
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 12]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m5,        [r3 - 16]                  ; [15]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    mova        m1,        m6
    pmaddubsw   m1,        [r3 - 12 * 16]             ; [4]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    pmaddubsw   m6,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 6
    pmaddubsw   m1,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 8
    mova        m2,        m1
    pmaddubsw   m1,        [r3 - 13 * 16]             ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 17]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m4,        m0, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    palignr     m5,        m2, m0, 2
    pmaddubsw   m1,        m5, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m6,        m2, m0, 6
    mova        m1,        m6
    pmaddubsw   m6,        [r3 - 15 * 16]             ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 - 5 * 16]              ; [11]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 22]                  ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
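
; MODE_5_31: modes 5 and 31, intraPredAngle 17 (f steps by 17 mod 32).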

%macro MODE_5_31 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1
    pmaddubsw   m4,        m0, [r3 +      16]          ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 14 * 16]              ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 3 * 16]               ; [19]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    mova        m1,        m6
    pmaddubsw   m6,        [r3 - 12 * 16]              ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 + 5 * 16]           ; [21]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 6
    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 10 * 16]              ; [6]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 7 * 16]               ; [23]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 8
    pmaddubsw   m2,        [r3 - 8 * 16]               ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 5]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1
    pmaddubsw   m4,        m0, [r3 + 9 * 16]           ; [25]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]               ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 11 * 16]              ; [27]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    mova        m1,        m6
    pmaddubsw   m6,        [r3 - 4 * 16]               ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 + 13 * 16]          ; [29]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 6
    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 2 * 16]               ; [14]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 15 * 16]              ; [31]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 8
    pmaddubsw   m2,        [r3]                        ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 10]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 15 * 16]          ; [1]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 2 * 16]               ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 2
    mova        m1,        m5
    pmaddubsw   m5,        [r3 - 13 * 16]              ; [3]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]               ; [20]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 11 * 16]          ; [5]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 6 * 16]               ; [22]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m2,        m0, 6
    pmaddubsw   m1,        m2, [r3 - 9 * 16]           ; [7]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3 + 8 * 16]               ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 14]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 7 * 16]           ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]              ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 2
    mova        m1,        m5
    pmaddubsw   m5,        [r3 - 5 * 16]               ; [11]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        [r3 + 12 * 16]              ; [28]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 3 * 16]           ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 14 * 16]              ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m2,        m0, 6
    pmaddubsw   m1,        m2, [r3 - 16]               ; [15]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 18]                   ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
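
; MODE_6_30: modes 6 and 30, intraPredAngle 13 (f steps by 13 mod 32).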

%macro MODE_6_30 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m6,        m2, m0, 2
    pmaddubsw   m5,        m6, [r3 - 9 * 16]          ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 15 * 16]         ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 6
    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 +  5 * 16]         ; [21]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 5]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m6,        m0
    pmaddubsw   m1,        m6, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        m6, [r3 - 16]              ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m3,        m2, m0, 2
    pmaddubsw   m6,        m3, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    palignr     m2,        m0, 4
    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r3]                   ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 +  13 * 16]        ; [29]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 7]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m1,        m5, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m5,        m7
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 + 16]              ; [17]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m2,        m0, 6
    pmaddubsw   m1,        m2, [r3 - 5 * 16]          ; [11]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 11]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m5,        m0
    pmaddubsw   m4,        m0, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m5, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 2
    pmaddubsw   m1,        m6, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    pmaddubsw   m6,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 4
    pmaddubsw   m2,        m1, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2
    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 14]                  ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
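
; MODE_7_29: modes 7 and 29, intraPredAngle 9 (f steps by 9 mod 32).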

%macro MODE_7_29 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m5,        m0
    pmaddubsw   m4,        m0, [r3 - 7 * 16]         ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m5, [r3 + 2 * 16]         ; [18]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        [r3 + 11 * 16]            ; [27]
    pmulhrsw    m5,        m7
    palignr     m1,        m2, m0, 2
    palignr     m2,        m0, 4
    pmaddubsw   m6,        m1, [r3 - 12 * 16]        ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 - 3 * 16]         ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 + 6 * 16]         ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        [r3 + 15 * 16]            ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]         ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 16]             ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m2,        [r3 + 10 * 16]            ; [26]
    pmulhrsw    m2,        m7
    packuswb    m4,        m2
    movu        m0,        [r2 + 4]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m5,        m0, [r3 - 13 * 16]        ; [03]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 5 * 16]         ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]            ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 - 9 * 16]         ; [07]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r3]                  ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 9 * 16]         ; [25]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m1,        m0, [r3 - 14 * 16]        ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        m0, [r3 - 5 * 16]         ; [11]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 + 4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 13 * 16]        ; [29]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 - 10 * 16]        ; [6]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r3 - 16]             ; [15]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        m2, [r3 + 8 * 16]         ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m4,        m0, [r3 - 15 * 16]        ; [1]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]         ; [10]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r3 + 3 * 16]         ; [19]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 + 12 * 16]        ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m2,        m0, 2
    pmaddubsw   m6,        m2, [r3 - 11 * 16]        ; [5]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r3 - 2 * 16]         ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 + 7 * 16]         ; [23]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 10]                 ; [0]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
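
; MODE_8_28: modes 8 and 28, intraPredAngle 5 (f steps by 5 mod 32).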

%macro MODE_8_28 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m4,        m0, [r3 - 11 * 16]     ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]      ; [10]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r3 - 1 * 16]      ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 + 4 * 16]      ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 9 * 16]      ; [25]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]         ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 - 13 * 16]     ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]      ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 - 3 * 16]      ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 2 * 16]      ; [18]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 7 * 16]      ; [23]
    pmulhrsw    m5,        m7
    pmaddubsw   m2,        [r3 + 12 * 16]         ; [28]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2
    movu        m0,        [r2 + 3]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m6,        m0, [r3 - 15 * 16]     ; [01]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r3 - 10 * 16]     ; [06]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r3 - 5 * 16]      ; [11]
    pmulhrsw    m1,        m7
    mova        m2,        m0
    pmaddubsw   m0,        [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 5 * 16]      ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 10 * 16]     ; [26]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 15 * 16]     ; [31]
    pmulhrsw    m5,        m7
    movu        m0,        [r2 + 4]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m2,        m0, [r3 - 12 * 16]     ; [4]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2
    pmaddubsw   m6,        m0, [r3 - 7 * 16]      ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r3 - 2 * 16]      ; [14]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r3 + 3 * 16]      ; [19]
    pmulhrsw    m1,        m7
    mova        m2,        m0
    pmaddubsw   m0,        [r3 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]     ; [29]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 5]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m1,        m0, [r3 - 14 * 16]     ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        m0, [r3 - 9 * 16]      ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]      ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 16]          ; [17]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r3 + 6 * 16]      ; [22]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r3 + 11 * 16]     ; [27]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 6]               ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
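
; MODE_9_27: modes 9 and 27, intraPredAngle 2; the integer offset advances
; only once per 16 rows, so the second half simply reloads the reference one
; sample later ([r2 + 2]).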

%macro MODE_9_27 1
    movu        m2,        [r2 + 1]
    palignr     m1,        m2, 1
    punpckhbw   m0,        m2, m1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r3 - 14 * 16]   ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r3 - 12 * 16]   ; [4]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r3 - 10 * 16]   ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3]             ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 + 10 * 16]   ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 + 12 * 16]   ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r3 + 14 * 16]   ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 2]             ; [00]

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m2,        [r2 + 2]
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r3 - 14 * 16]   ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r3 - 12 * 16]   ; [4]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r3 - 10 * 16]   ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3]             ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m2,        [r2 + 2]
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 + 10 * 16]   ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 + 12 * 16]   ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r3 + 14 * 16]   ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 3]             ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
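
; MODE_12_24: modes 12 and 24, intraPredAngle -5; the negative step walks the
; reference window backwards ([r2], [r2 - 2], [r2 - 3], [r2 - 4]), and the
; coefficient base lives in r4 here rather than r3.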

%macro MODE_12_24 1
    movu        m2,        [r2]
    palignr     m1,        m2, 1
    punpckhbw   m0,        m2, m1
    punpcklbw   m2,        m1
    palignr     m0,        m2, 2
    pmaddubsw   m4,        m0, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m0, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    movu        m0,        [r2 - 2]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m6,        m2, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4]                   ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 - 15 * 16]         ; [1]
    pmulhrsw    m5,        m7
    movu        m0,        [r2 - 3]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m6,        m2, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 13 * 16]         ; [3]
    pmulhrsw    m4,        m7
    movu        m2,        [r2 - 4]
    palignr     m1,        m2, 1
    punpckhbw   m0,        m2, m1
    punpcklbw   m2,        m1
    palignr     m0,        m2, 2
    pmaddubsw   m5,        m0, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m0, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7
    movu        m2,        [pb_fact0]
    pshufb      m0,        m2
    pmovzxbw    m0,        m0
    packuswb    m1,        m0
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

;------------------------------------------------------------------------------------------
; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
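; Modes 2 and 34 need no interpolation: every row is the reference shifted
; one more pixel along the 45-degree diagonal, so the kernel is pure
; palignr/movu copies. With this buffer layout, mode 2 (the default path)
; reads the left neighbours at src + 64 and mode 34 the above neighbours
; at src + 0; the cmove below picks between them.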
INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,5,4
    lea             r4, [r2]
    add             r2, 64
    cmp             r3m, byte 34
    cmove           r2, r4
    movu            m0, [r2 + 2]
    movu            m1, [r2 + 18]
    movu            m3, [r2 + 34]

    lea             r3, [r1 * 3]

    movu            [r0], m0
    movu            [r0 + 16], m1
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 1
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 2
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 3
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m1, m0, 4
    movu            [r0], m2
    palignr         m2, m3, m1, 4
    movu            [r0 + 16], m2
    palignr         m2, m1, m0, 5
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 5
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 6
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 6
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 7
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 7
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m1, m0, 8
    movu            [r0], m2
    palignr         m2, m3, m1, 8
    movu            [r0 + 16], m2
    palignr         m2, m1, m0, 9
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 9
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 10
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 10
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 11
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 11
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m1, m0, 12
    movu            [r0], m2
    palignr         m2, m3, m1, 12
    movu            [r0 + 16], m2
    palignr         m2, m1, m0, 13
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 13
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 14
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 14
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 15
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 15
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    movu            [r0], m1
    movu            m0, [r2 + 50]
    movu            [r0 + 16], m3
    palignr         m2, m3, m1, 1
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 1
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 2
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 2
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 3
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 3
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m3, m1, 4
    movu            [r0], m2
    palignr         m2, m0, m3, 4
    movu            [r0 + 16], m2
    palignr         m2, m3, m1, 5
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 5
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 6
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 6
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 7
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 7
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m3, m1, 8
    movu            [r0], m2
    palignr         m2, m0, m3, 8
    movu            [r0 + 16], m2
    palignr         m2, m3, m1, 9
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 9
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 10
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 10
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 11
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 11
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m3, m1, 12
    movu            [r0], m2
    palignr         m2, m0, m3, 12
    movu            [r0 + 16], m2
    palignr         m2, m3, m1, 13
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 13
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 14
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 14
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 15
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 15
    movu            [r0 + r3 + 16], m2
    RET

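; Modes 3 through 9 share one driver shape: r3 points at the middle of
; ang_table so the MODE_N macros can index fractions as signed offsets,
; m7 holds pw_1024 for the rounding multiply, and each of the four loop
; iterations produces an 8x32 strip before sliding the source window by
; 8 pixels.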
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_3_33 1
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_4, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                    ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]               ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_4_32 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_5, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_5_31 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_6, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                  ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]             ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_6_30 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_7, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]               ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]          ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_7_29 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_8, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]       ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_8_28 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_9, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]         ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_9_27 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
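    ; Mode 10 is plain horizontal prediction: each output row repeats a
    ; single left neighbour, so pshufb against the zeroed m7 broadcasts
    ; byte 0 of every shifted window across the row. m8/m9 spill copies of
    ; the corner row and the above row for the optional edge filter below.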
    pxor        m7, m7
    mov         r6, 2
    movu        m0, [r2]
    movu        m1, [r2 + 1]
    mova        m8, m0
    mova        m9, m1
    mov         r3d, r4d
    lea         r4, [r1 * 3]

.loop:
    movu        m0, [r2 + 1 + 64]
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7

    movu        [r0 + r1], m1
    movu        [r0 + r1 + 16], m1
    movu        [r0 + r1 * 2], m2
    movu        [r0 + r1 * 2 + 16], m2
    movu        [r0 + r4], m3
    movu        [r0 + r4 + 16], m3
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m4
    movu        [r5 + 16], m4
    movu        [r5 + r1], m5
    movu        [r5 + r1 + 16], m5
    movu        [r5 + r1 * 2], m6
    movu        [r5 + r1 * 2 + 16], m6

    palignr     m1, m0, 7
    pshufb      m1, m7
    movhlps     m2, m0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7

    movu        [r5 + r4], m1
    movu        [r5 + r4 + 16], m1
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m2
    movu        [r5 + 16], m2
    movu        [r5 + r1], m3
    movu        [r5 + r1 + 16], m3
    movu        [r5 + r1 * 2], m4
    movu        [r5 + r1 * 2 + 16], m4
    movu        [r5 + r4], m5
    movu        [r5 + r4 + 16], m5
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m6
    movu        [r5 + 16], m6

    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7

    movu        [r5 + r1], m1
    movu        [r5 + r1 + 16], m1
    movu        [r5 + r1 * 2], m2
    movu        [r5 + r1 * 2 + 16], m2
    movu        [r5 + r4], m3
    movu        [r5 + r4 + 16], m3

; filter
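; when bFilter is set, the stored row also gets the HEVC horizontal edge
; filter, p[x] = Clip(left + ((above[x] - topLeft) >> 1)), computed below
; on the two 8-pixel halves in word precision (packuswb supplies the clip).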
    cmp         r3d, byte 0
    jz         .quit
    movhlps     m1, m0
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m7
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1

.quit:
    movu        [r0], m0
    movu        [r0 + 16], m0
    dec         r6
    lea         r0, [r5 + r1 * 4]
    lea         r2, [r2 + 16]
    jnz         .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_11, 4,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6
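    ; frame layout: [rsp] keeps a broadcast of src[16] as left padding (the
    ; one above pixel mode 11's shallow angle projects past the corner),
    ; [rsp + 1 .. 48] the corner plus left reference, [rsp + 63] the strip
    ; counter, [rsp + 64] the saved stack pointer.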

    ; collect reference pixel
    movu        m0, [r2 + 16]
    pxor        m1, m1
    pshufb      m0, m1                   ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    mova        [rsp], m0
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 16 + 64]
    movu        m2, [r2 + 32 + 64]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 1]            ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 1, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  1, 1, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  2, 1, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 1, 14,12,10,8,6,4,2,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

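; MODE_12_24_ROW0 covers the first strip of modes 12/24, where the
; reference window still crosses the top-left corner: the c_mode32_12_0
; shuffle (plus the pinsrb) gathers the above pixels at offsets 6, 13, 19
; and 26 that the negative angle projects onto the extended left reference,
; and parks them in `above` so the pslldq/palignr steps can slide them in
; one byte at a time.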
%macro MODE_12_24_ROW0 1
    movu        m0,        [r3 + 6]
    pshufb      m0,        [c_mode32_12_0]
    pinsrb      m0,        [r3 + 26], 12
    mova        above,     m0
    movu        m2,        [r2]
  %if %1 == 1
    pinsrb      m2,        [r3], 0
  %endif
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    movu        m1,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
  %if %1 == 1
    pinsrb      m1,        [r3], 0
  %endif
    palignr     m2,        m1, above, 15              ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    punpcklbw   m2,        m1                         ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    pmaddubsw   m1,        m2, [r4 + 13 * 16]             ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m2,        above, 14                  ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    pmaddubsw   m6,        m2, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4]                   ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 - 15 * 16]         ; [1]
    pmulhrsw    m5,        m7
    pslldq      m1,        above, 1
    palignr     m2,        m1, 14
    pmaddubsw   m6,        m2, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 13 * 16]         ; [3]
    pmulhrsw    m4,        m7
    pslldq      m1,        above, 2
    palignr     m2,        m1, 14
    pmaddubsw   m5,        m2, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7
    movu        m0,        [pb_fact0]
    pshufb      m2,        m0
    pmovzxbw    m2,        m2
    packuswb    m1,        m2
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

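; mode 12 driver: MODE_12_24_ROW0 runs once for the corner-crossing strip;
; the first window advance is then 7 (not 8) to re-centre after the corner
; handling, and three plain MODE_12_24 strips finish the block.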
INIT_XMM sse4
cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize)
  %define above    [rsp + 0 * mmsize]
    mov         r3,        r2
    add         r2,        64
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    MODE_12_24_ROW0 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        7
    mov         r3,        3
.loop:
    MODE_12_24 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

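; MODE_13_23_ROW0 is the corner-crossing first strip for modes 13/23: the
; two c_mode32_13_0 gathers pick the projected above pixels out of each
; half of the top row, and c_mode32_13_shuf packs them into the byte order
; the later palignr windows expect.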
%macro MODE_13_23_ROW0 1
    movu        m0,        [r3 + 1]
    movu        m1,        [r3 + 15]
    pshufb      m0,        [c_mode32_13_0]
    pshufb      m1,        [c_mode32_13_0]
    punpckldq   m0,        m1
    pshufb      m0,        [c_mode32_13_shuf]
    mova        above,     m0
    movu        m2,        [r2]
  %if (%1 == 1)
    pinsrb      m2,        [r3], 0
  %endif
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r4 + 7 * 16]         ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 - 2 * 16]         ; [14]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 - 11 * 16]        ; [5]
    pmulhrsw    m5,        m7
    movu        m1,        [r2]                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
  %if (%1 == 1)
    pinsrb      m1,        [r3], 0
  %endif
    palignr     m2,        m1, above, 15             ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    punpcklbw   m2,        m1                        ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    pmaddubsw   m6,        m2, [r4 + 12 * 16]        ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 3 * 16]         ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r4 - 6 * 16]         ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r4 - 15 * 16]        ; [1]
    pmulhrsw    m1,        m7
    palignr     m2,        above, 14
    pmaddubsw   m3,        m2, [r4 + 8 * 16]         ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 16]             ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 - 10 * 16]        ; [6]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pslldq      m0,        above, 1
    palignr     m2,        m0, 14
    pmaddubsw   m5,        m2, [r4 + 13 * 16]        ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 5 * 16]         ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 - 14 * 16]        ; [2]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pslldq      m0,        1
    palignr     m2,        m0, 14
    pmaddubsw   m1,        m2, [r4 + 9 * 16]         ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r4]                  ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 9 * 16]         ; [7]
    pmulhrsw    m4,        m7
    pslldq      m0,        above, 3
    palignr     m2,        m0, 14
    pmaddubsw   m3,        m2, [r4 + 14 * 16]        ; [30]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 + 5 * 16]         ; [21]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 4 * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 13 * 16]        ; [3]
    pmulhrsw    m6,        m7
    pslldq      m0,        1
    palignr     m2,        m0, 14
    pmaddubsw   m0,        m2, [r4 + 10 * 16]        ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r4 + 16]             ; [17]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r4 - 8 * 16]         ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pslldq      m0,        above, 5
    palignr     m2,        m0, 14
    pmaddubsw   m4,        m2, [r4 + 15 * 16]        ; [31]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 + 6 * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 3 * 16]         ; [13]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]        ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pslldq      m0,        1
    palignr     m2,        m0, 14
    pmaddubsw   m6,        m2, [r4 + 11 * 16]        ; [27]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 2 * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 - 7 * 16]         ; [09]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 - 16 * 16]        ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

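; MODE_13_23 is the steady-state strip for modes 13/23: the source is
; re-read at a decreasing offset ([r2], [r2 - 2], [r2 - 4], ...) as the
; negative angle walks backwards through the reference; the second macro
; argument controls whether the corner pixel still has to be inserted at
; byte 0.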
%macro MODE_13_23 2
    movu        m2,        [r2]                      ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr     m1,        m2, 1                     ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw   m0,        m2, m1                    ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr     m0,        m2, 2                     ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw   m4,        m0, [r4 + 7 * 16]         ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r4 - 2 * 16]         ; [14]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r4 - 11 * 16]        ; [05]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 12 * 16]        ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 3 * 16]         ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 - 6 * 16]         ; [10]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 - 15 * 16]        ; [1]
    pmulhrsw    m1,        m7
    movu        m2,        [r2 - 2]                  ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
    palignr     m3,        m2, 1                     ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    punpckhbw   m0,        m2, m3
    punpcklbw   m2,        m3
    palignr     m0,        m2, 2
    pmaddubsw   m3,        m0, [r4 + 8 * 16]         ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    mova        m3,        m0
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m3, [r4 - 16]             ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]        ; [6]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 + 13 * 16]        ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 5 * 16]         ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 - 14 * 16]        ; [2]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    movu        m2,        [r2 - 4]                  ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr     m1,        m2, 1                     ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw   m0,        m2, m1                    ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr     m0,        m2, 2                     ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw   m1,        m0, [r4 + 9 * 16]         ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m0, [r4]                  ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    mova        m3,        m0
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m3, [r4 - 9 * 16]         ; [7]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 + 14 * 16]        ; [30]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 + 5 * 16]         ; [21]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 4 * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 13 * 16]        ; [3]
    pmulhrsw    m6,        m7
    movu        m2,        [r2 - 6]                  ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr     m1,        m2, 1                     ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw   m0,        m2, m1                    ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr     m0,        m2, 2                     ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw   m3,        m0, [r4 + 10 * 16]        ; [26]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m0, [r4 + 16]             ; [17]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m0, [r4 - 8 * 16]         ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 + 15 * 16]        ; [31]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 + 6 * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 3 * 16]         ; [13]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]        ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    movu        m2,        [r2 - 7]                  ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
  %if ((%1 & %2) == 1)
    pinsrb      m2,        [r3], 0
  %endif
    palignr     m1,        m2, 1                     ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    pmaddubsw   m6,        m2, [r4 + 11 * 16]        ; [27]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 2 * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 - 7 * 16]         ; [09]
    pmulhrsw    m1,        m7
    movu        m0,        [pb_fact0]
    pshufb      m2,        m0
    pmovzxbw    m2,        m2
    packuswb    m1,        m2
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

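; mode 13 driver: ROW0 for the corner-crossing strip, one transitional
; MODE_13_23 call that still needs the corner pixel (second argument 1),
; then two plain strips.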
INIT_XMM sse4
cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    mov         r3,        r2
    add         r2,        64
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                  ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]             ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    MODE_13_23_ROW0 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        7

    MODE_13_23 1, 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    mov         r3,        2
.loop:
    MODE_13_23 1, 0
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

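; Modes 14 through 17 follow one template: gather the projected above
; pixels into a 64-byte aligned stack buffer via the matching c_mode32_*_0
; shuffle, append the regular left reference behind them, then let
; PROC32_8x8 interpolate eight rows per call from palignr-shifted windows,
; with the per-row fractions passed as immediates.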
INIT_XMM sse4
cglobal intra_pred_ang32_14, 3,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 15]
    pshufb      m0, [c_mode32_14_0]      ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]      ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                   ; [17 20 22 25 27 30 x x x x x x x x x x]
    palignr     m0, m1, 10               ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 13], m0
    movu        [rsp + 13 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 13]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 1, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  1, 1, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    PROC32_8x8  2, 1, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 1, 27,14,1,20,7,26,13,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_15, 4,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 15]
    pshufb      m0, [c_mode32_15_0]      ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
    pshufb      m1, [c_mode32_15_0]      ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 17], m0
    movu        [rsp + 17 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 17]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  0, 1, 15,30,13,28,11,26,9,24

    ; Row[8 - 15]
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  1, 1, 7,22,5,20,3,18,1,16

    ; Row[16 - 23]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  2, 1, 31,14,29,12,27,10,25,8

    ; Row[24 - 31]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 1, 23,6,21,4,19,2,17,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_16, 4,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 15]
    pshufb      m0, [c_mode32_16_0]      ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]      ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 21], m0
    movu        [rsp + 21 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 21]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 1, 11,22,1,12,23,2,13,24

    ; Row[8 - 15]
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    mova        m3, m2
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  1, 1, 3,14,25,4,15,26,5,16

    ; Row[16 - 23]
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    mova        m1, m0
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  2, 1, 27,6,17,28,7,18,29,8

    ; Row[24 - 31]
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 1, 19,30,9,20,31,10,21,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_17, 4,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp     ], m1
    movu        [rsp + 13], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 25]          ; r2 -> [0]
    lea         r3, [c_shuf8_0]         ; r3 -> shuffle8
    lea         r4, [ang_table]         ; r4 -> ang_table
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]       ; r6 -> 4 * stride
    mova        m5, [pw_1024]           ; m5 -> 1024
    mova        m6, [c_deinterval8]     ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 1, 6,12,18,24,30,4,10,16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  1, 1, 22,28,2,8,14,20,26,0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  2, 1, 6,12,18,24,30,4,10,16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  3, 1, 22,28,2,8,14,20,26,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

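; Mode 18 is the exact 45-degree diagonal: row y is the reference shifted
; y pixels toward the left neighbours, so both implementations below are
; shift-and-copy only. The left reference is byte-reversed first
; (intra_pred_shuff_15_0 / c_mode32_18_0) so a single palignr per row can
; splice it in front of the top row.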
INIT_YMM avx2
cglobal intra_pred_ang32_18, 4, 4, 3
    movu           m0, [r2]
    movu           xm1, [r2 + 1 + 64]
    pshufb         xm1, [intra_pred_shuff_15_0]
    mova           xm2, xm0
    vinserti128    m1, m1, xm2, 1

    lea            r3, [r1 * 3]

    movu           [r0], m0
    palignr        m2, m0, m1, 15
    movu           [r0 + r1], m2
    palignr        m2, m0, m1, 14
    movu           [r0 + r1 * 2], m2
    palignr        m2, m0, m1, 13
    movu           [r0 + r3], m2

    lea            r0, [r0 + r1 * 4]
    palignr        m2, m0, m1, 12
    movu           [r0], m2
    palignr        m2, m0, m1, 11
    movu           [r0 + r1], m2
    palignr        m2, m0, m1, 10
    movu           [r0 + r1 * 2], m2
    palignr        m2, m0, m1, 9
    movu           [r0 + r3], m2

    lea            r0, [r0 + r1 * 4]
    palignr        m2, m0, m1, 8
    movu           [r0], m2
    palignr        m2, m0, m1, 7
    movu           [r0 + r1], m2
    palignr        m2, m0, m1, 6
    movu           [r0 + r1 * 2], m2
    palignr        m2, m0, m1, 5
    movu           [r0 + r3], m2

    lea            r0, [r0 + r1 * 4]
    palignr        m2, m0, m1, 4
    movu           [r0], m2
    palignr        m2, m0, m1, 3
    movu           [r0 + r1], m2
    palignr        m2, m0, m1, 2
    movu           [r0 + r1 * 2], m2
    palignr        m2, m0, m1, 1
    movu           [r0 + r3], m2

    lea            r0, [r0 + r1 * 4]
    movu           [r0], m1

    movu           xm0, [r2 + 64 + 17]
    pshufb         xm0, [intra_pred_shuff_15_0]
    vinserti128    m0, m0, xm1, 1

    palignr        m2, m1, m0, 15
    movu           [r0 + r1], m2
    palignr        m2, m1, m0, 14
    movu           [r0 + r1 * 2], m2
    palignr        m2, m1, m0, 13
    movu           [r0 + r3], m2

    lea            r0, [r0 + r1 * 4]
    palignr        m2, m1, m0, 12
    movu           [r0], m2
    palignr        m2, m1, m0, 11
    movu           [r0 + r1], m2
    palignr        m2, m1, m0, 10
    movu           [r0 + r1 * 2], m2
    palignr        m2, m1, m0, 9
    movu           [r0 + r3], m2

    lea            r0, [r0 + r1 * 4]
    palignr        m2, m1, m0, 8
    movu           [r0], m2
    palignr        m2, m1, m0, 7
    movu           [r0 + r1], m2
    palignr        m2, m1, m0, 6
    movu           [r0 + r1 * 2], m2
    palignr        m2, m1, m0, 5
    movu           [r0 + r3], m2

    lea            r0, [r0 + r1 * 4]
    palignr        m2, m1, m0, 4
    movu           [r0], m2
    palignr        m2, m1, m0, 3
    movu           [r0 + r1], m2
    palignr        m2, m1, m0, 2
    movu           [r0 + r1 * 2], m2
    palignr        m2, m1, m0, 1
    movu           [r0 + r3], m2
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
    movu        m0, [r2]               ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    movu        m1, [r2 + 16]          ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
    movu        m2, [r2 + 1 + 64]      ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m3, [r2 + 17 + 64]     ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]

    lea         r2, [r1 * 2]
    lea         r3, [r1 * 3]
    lea         r4, [r1 * 4]

    movu        [r0], m0
    movu        [r0 + 16], m1

    pshufb      m2, [c_mode32_18_0]    ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    pshufb      m3, [c_mode32_18_0]    ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]

    palignr     m4, m0, m2, 15
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 15
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 14
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 14
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 13
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 13
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 12
    movu        [r0], m4
    palignr     m4, m1, m0, 12
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 11
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 11
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 10
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 10
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 9
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 9
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 8
    movu        [r0], m4
    palignr     m4, m1, m0, 8
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 7
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 7
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 6
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 6
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 5
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 5
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 4
    movu        [r0], m4
    palignr     m4, m1, m0, 4
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 3
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 3
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 2
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 2
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 1
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 1
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    movu        [r0], m2
    movu        [r0 + 16], m0
    palignr     m4, m2, m3, 15
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 15
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 14
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 14
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 13
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 13
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 12
    movu        [r0], m4
    palignr     m4, m0, m2, 12
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 11
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 11
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 10
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 10
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 9
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 9
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 8
    movu        [r0], m4
    palignr     m4, m0, m2, 8
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 7
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 7
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 6
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 6
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 5
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 5
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 4
    movu        [r0], m4
    palignr     m4, m0, m2, 4
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 3
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 3
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 2
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 2
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 1
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 1
    movu        [r0 + r3 + 16], m4
    RET

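; From mode 19 on the block is written in vertical order: the projected
; pixels now come from the left reference while the above row is the one
; being interpolated, PROC32_8x8 is called with its second flag 0 rather
; than 1 (no transposed store), and the outer loop advances the destination
; eight columns at a time (add r6, 8) instead of eight rows.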
INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 16 + 64]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp     ], m1
    movu        [rsp + 13], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 25]          ; r2 -> [0]
    lea         r3, [c_shuf8_0]         ; r3 -> shuffle8
    lea         r4, [ang_table]         ; r4 -> ang_table
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r0]                ; r6 -> r0
    mova        m5, [pw_1024]           ; m5 -> 1024
    mova        m6, [c_deinterval8]     ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 6,12,18,24,30,4,10,16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 22,28,2,8,14,20,26,0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 6,12,18,24,30,4,10,16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 22,28,2,8,14,20,26,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_20, 4,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_16_0]      ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]      ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 21], m0
    movu        [rsp + 21 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 21]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 11,22,1,12,23,2,13,24

    ; Row[8 - 15]
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    mova        m3, m2
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 3,14,25,4,15,26,5,16

    ; Row[16 - 23]
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    mova        m1, m0
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 27,6,17,28,7,18,29,8

    ; Row[24 - 31]
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 19,30,9,20,31,10,21,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_21, 4,7,8
    ; NOTE: align the stack to 64 bytes so all local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixels
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_15_0]      ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
    pshufb      m1, [c_mode32_15_0]      ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 17], m0
    movu        [rsp + 17 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 17]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  0, 0, 15,30,13,28,11,26,9,24

    ; Row[8 - 15]
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 7,22,5,20,3,18,1,16

    ; Row[16 - 23]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 31,14,29,12,27,10,25,8

    ; Row[24 - 31]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 23,6,21,4,19,2,17,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,8
    ; NOTE: align the stack to 64 bytes so that all local data sits in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixels
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_14_0]      ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]      ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                   ; [17 20 22 25 27 30 x x x x x x x x x x]
    palignr     m0, m1, 10               ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 13], m0
    movu        [rsp + 13 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 13]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 0, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 27,14,1,20,7,26,13,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    lea         r3,        [r2 + 64]
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

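    ; the ROW0 variant emits the first 8-column strip and sets up the
    ; projected reference; the plain kernel then covers the remaining
    ; 24 columns in 3 passes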
    MODE_13_23_ROW0 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        7
    mov         r3,        3
.loop:
    MODE_13_23 0, 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    lea         r3,        [r2 + 64]
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    MODE_12_24_ROW0 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        7
    mov         r3,        3
.loop:
    MODE_12_24 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
    ; NOTE: align the stack to 64 bytes so that all local data sits in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixels
    movu        m0, [r2 + 16 + 64]
    pxor        m1, m1
    pshufb      m0, m1                   ; broadcast byte 0 to all 16 lanes
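    ; only byte 0 of this splat survives; [rsp + 1] onward is overwritten
    ; with the main reference samples just below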
    mova        [rsp], m0
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 1]            ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 0, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 14,12,10,8,6,4,2,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    mov         r6,             2
    movu        m0,             [r2 + 64]
    pinsrb      m0,             [r2], 0
    movu        m1,             [r2 + 1 + 64]
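    ; spill the corner and left neighbour samples to m8/m9 for the optional
    ; left-column filter below (the xmm registers get recycled in the store loop)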
    mova        m8,             m0
    mova        m9,             m1
    mov         r3d,            r4d
    lea         r4,             [r1 * 3]

.loop:
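    ; pure vertical prediction: replicate the 16 above samples into all
    ; 32 rows of this 16-column half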
    movu        m0,             [r2 + 1]

    movu        [r0],           m0
    movu        [r0 + r1],      m0
    movu        [r0 + r1 * 2],  m0
    movu        [r0 + r4],      m0
    lea         r5,             [r0 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0

; filter the left column when bFilter (r3d, copied from the 5th argument)
; is set: pred(0, y) = above[0] + ((left[y] - corner) >> 1), clipped
    cmp         r3d, byte 0
    jz         .quit

    pxor        m4,        m4
    pshufb      m0,        m4
    pmovzxbw    m0,        m0
    mova        m1,        m0
    movu        m2,        m8
    movu        m3,        m9

    pshufb      m2,        m4
    pmovzxbw    m2,        m2
    movhlps     m4,        m3
    pmovzxbw    m3,        m3
    pmovzxbw    m4,        m4
    psubw       m3,        m2
    psubw       m4,        m2
    psraw       m3,        1
    psraw       m4,        1
    paddw       m0,        m3
    paddw       m1,        m4
    packuswb    m0,        m1

    pextrb      [r0],           m0, 0
    pextrb      [r0 + r1],      m0, 1
    pextrb      [r0 + r1 * 2],  m0, 2
    pextrb      [r0 + r4],      m0, 3
    lea         r5,             [r0 + r1 * 4]
    pextrb      [r5],           m0, 4
    pextrb      [r5 + r1],      m0, 5
    pextrb      [r5 + r1 * 2],  m0, 6
    pextrb      [r5 + r4],      m0, 7
    lea         r5,             [r5 + r1 * 4]
    pextrb      [r5],           m0, 8
    pextrb      [r5 + r1],      m0, 9
    pextrb      [r5 + r1 * 2],  m0, 10
    pextrb      [r5 + r4],      m0, 11
    lea         r5,             [r5 + r1 * 4]
    pextrb      [r5],           m0, 12
    pextrb      [r5 + r1],      m0, 13
    pextrb      [r5 + r1 * 2],  m0, 14
    pextrb      [r5 + r4],      m0, 15

.quit:
    lea         r2, [r2 + 16]
    add         r0, 16
    dec         r6d
    jnz         .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
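    ; positive-angle mode, so no reference projection is needed: each of
    ; the 4 passes emits an 8-column strip via the MODE_9_27 kernel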
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_9_27 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_8_28 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_7_29 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_6_30 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_5_31 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_4_32 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_3_33 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

;-----------------------------------------------------------------------------------------
; start of intra_pred_ang32 angular modes avx2 asm
;-----------------------------------------------------------------------------------------

%if ARCH_X86_64 == 1
INIT_YMM avx2

; register mapping:
; %1-%8 - the eight 32-pixel prediction rows to store
; %9    - temp register
; %10   - unique suffix for label naming
%macro TRANSPOSE_32x8_AVX2 10
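    ; ZF still holds the 'test r7d, r7d' result from the top of the
    ; helper that invokes this macro (none of the SIMD, store or lea
    ; instructions in between touch EFLAGS): r7d != 0 (modes 29-33)
    ; jumps to the plain row stores, r7d == 0 (modes 3-7) falls through
    ; to the 8x32 -> 32x8 transpose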
    jnz         .skip%10

    ; transpose 8x32 to 32x8 and then store
    punpcklbw   m%9, m%1, m%2
    punpckhbw   m%1, m%2
    punpcklbw   m%2, m%3, m%4
    punpckhbw   m%3, m%4
    punpcklbw   m%4, m%5, m%6
    punpckhbw   m%5, m%6
    punpcklbw   m%6, m%7, m%8
    punpckhbw   m%7, m%8

    punpcklwd   m%8, m%9, m%2
    punpckhwd   m%9, m%2
    punpcklwd   m%2, m%4, m%6
    punpckhwd   m%4, m%6
    punpcklwd   m%6, m%1, m%3
    punpckhwd   m%1, m%3
    punpcklwd   m%3, m%5, m%7
    punpckhwd   m%5, m%7

    punpckldq   m%7, m%8, m%2
    punpckhdq   m%8, m%2
    punpckldq   m%2, m%6, m%3
    punpckhdq   m%6, m%3
    punpckldq   m%3, m%9, m%4
    punpckhdq   m%9, m%4
    punpckldq   m%4, m%1, m%5
    punpckhdq   m%1, m%5

    movq        [r0 + r1 * 0], xm%7
    movhps      [r0 + r1 * 1], xm%7
    movq        [r0 + r1 * 2], xm%8
    movhps      [r0 + r5 * 1], xm%8

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%3
    movhps      [r0 + r1 * 1], xm%3
    movq        [r0 + r1 * 2], xm%9
    movhps      [r0 + r5 * 1], xm%9

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%2
    movhps      [r0 + r1 * 1], xm%2
    movq        [r0 + r1 * 2], xm%6
    movhps      [r0 + r5 * 1], xm%6

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%4
    movhps      [r0 + r1 * 1], xm%4
    movq        [r0 + r1 * 2], xm%1
    movhps      [r0 + r5 * 1], xm%1

    lea         r0, [r0 + r6]

    vpermq      m%8, m%8, 00001110b
    vpermq      m%7, m%7, 00001110b
    vpermq      m%6, m%6, 00001110b
    vpermq      m%3, m%3, 00001110b
    vpermq      m%9, m%9, 00001110b
    vpermq      m%2, m%2, 00001110b
    vpermq      m%4, m%4, 00001110b
    vpermq      m%1, m%1, 00001110b

    movq        [r0 + r1 * 0], xm%7
    movhps      [r0 + r1 * 1], xm%7
    movq        [r0 + r1 * 2], xm%8
    movhps      [r0 + r5 * 1], xm%8

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%3
    movhps      [r0 + r1 * 1], xm%3
    movq        [r0 + r1 * 2], xm%9
    movhps      [r0 + r5 * 1], xm%9

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%2
    movhps      [r0 + r1 * 1], xm%2
    movq        [r0 + r1 * 2], xm%6
    movhps      [r0 + r5 * 1], xm%6

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%4
    movhps      [r0 + r1 * 1], xm%4
    movq        [r0 + r1 * 2], xm%1
    movhps      [r0 + r5 * 1], xm%1

    lea         r0, [r4 + 8]
    jmp         .end%10
.skip%10:
    movu        [r0 + r1 * 0], m%1
    movu        [r0 + r1 * 1], m%2
    movu        [r0 + r1 * 2], m%3
    movu        [r0 + r5 * 1], m%4

    lea         r0, [r0 + r6]

    movu        [r0 + r1 * 0], m%5
    movu        [r0 + r1 * 1], m%6
    movu        [r0 + r1 * 2], m%7
    movu        [r0 + r5 * 1], m%8

    lea         r0, [r0 + r6]
.end%10:
%endmacro

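; helper shared by mode 3 (horizontal) and mode 33 (vertical): computes
; prediction rows 0-15 of a 32-wide block. Expects r2 = reference,
; r3 = ang_table_avx2 + 32 * 16, r5 = 3 * stride, r6 = 4 * stride,
; m7 = pw_1024 and r7d = 0 (mode 3: transpose) or 1 (mode 33: store
; rows directly); see TRANSPOSE_32x8_AVX2 above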
cglobal ang32_mode_3_33_row_0_15
    test        r7d,        r7d
    ; rows 0 to 7
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

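    ; every step below evaluates the HEVC angular interpolation
    ;   pred = ((32 - f) * ref[i] + f * ref[i + 1] + 16) >> 5
    ; pmaddubsw applies the packed (32 - f, f) weights from the angle
    ; table (the bracketed number is f) and pmulhrsw with pw_1024 is the
    ; rounding shift: (x * 1024 + 16384) >> 15 == (x + 16) >> 5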
    pmaddubsw   m4,         m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 + 10 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m5,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m5,         [r3 + 4 * 32]       ; [20]
    pmulhrsw    m5,         m7
    pmaddubsw   m1,         [r3 + 4 * 32]
    pmulhrsw    m1,         m7
    packuswb    m5,         m1

    palignr     m6,         m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m6,         [r3 - 2 * 32]       ; [14]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 - 2 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m8,         m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m8,         [r3 - 8 * 32]       ; [8]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 - 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    palignr     m10,        m2, m0, 8
    palignr     m11,        m3, m2, 8
    pmaddubsw   m9,         m10, [r3 - 14 * 32] ; [2]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         m11, [r3 - 14 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    pmaddubsw   m10,        [r3 + 12 * 32]      ; [28]
    pmulhrsw    m10,        m7
    pmaddubsw   m11,        [r3 + 12 * 32]
    pmulhrsw    m11,        m7
    packuswb    m10,        m11

    palignr     m11,        m2, m0, 10
    palignr     m1,         m3, m2, 10
    pmaddubsw   m11,        [r3 + 6 * 32]       ; [22]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 + 6 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m12,        m2, m0, 12
    palignr     m1,         m3, m2, 12
    pmaddubsw   m12,        [r3]                ; [16]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4,         m2, m0, 14
    palignr     m1,         m3, m2, 14
    pmaddubsw   m4,         [r3 - 6 * 32]       ; [10]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         [r3 - 6 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    pmaddubsw   m5,         m2, [r3 - 12 * 32]  ; [4]
    pmulhrsw    m5,         m7
    pmaddubsw   m1,         m3, [r3 - 12 * 32]
    pmulhrsw    m1,         m7
    packuswb    m5,         m1

    pmaddubsw   m6,         m2, [r3 + 14 * 32]  ; [30]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         m3, [r3 + 14 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    movu        m0,         [r2 + 25]
    movu        m1,         [r2 + 26]
    punpcklbw   m0,         m1

    palignr     m8,         m3, m2, 2
    palignr     m1,         m0, m3, 2
    pmaddubsw   m8,         [r3 + 8 * 32]       ; [24]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 + 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    palignr     m9,         m3, m2, 4
    palignr     m1,         m0, m3, 4
    pmaddubsw   m9,         [r3 + 2 * 32]       ; [18]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         [r3 + 2 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    palignr     m10,        m3, m2, 6
    palignr     m1,         m0, m3, 6
    pmaddubsw   m10,        [r3 - 4 * 32]       ; [12]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         [r3 - 4 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    palignr     m11,        m3, m2, 8
    palignr     m1,         m0, m3, 8
    pmaddubsw   m11,        [r3 - 10 * 32]      ; [6]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 10 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

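    ; row 15 has fraction 0, so it is a plain copy of the reference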
    movu        m12,        [r2 + 14]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_3, 3,8,13
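    ; mode 3 is a horizontal mode, so the left neighbours stored at
    ; r2 + 64 are the main reference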
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_3_33_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 13

    call ang32_mode_3_33_row_0_15
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_33, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_3_33_row_0_15

    add         r2, 13

    call ang32_mode_3_33_row_0_15
    RET

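; helper for mode 4 / mode 32, rows 0-15; same register contract as
; ang32_mode_3_33_row_0_15 above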
cglobal ang32_mode_4_32_row_0_15
    test        r7d,        r7d
    ; rows 0 to 7
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 + 5 * 32]   ; [21]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 + 5 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m6,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m5,         m6, [r3 - 6 * 32]   ; [10]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m1, [r3 - 6 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         [r3 + 15 * 32]      ; [31]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 + 15 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m8,         m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m8,         [r3 + 4 * 32]       ; [20]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 + 4 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    palignr     m10,        m2, m0, 6
    palignr     m11,        m3, m2, 6
    pmaddubsw   m9,         m10, [r3 - 7 * 32]  ; [9]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         m11, [r3 - 7 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    pmaddubsw   m10,        [r3 + 14 * 32]      ; [30]
    pmulhrsw    m10,        m7
    pmaddubsw   m11,        [r3 + 14 * 32]
    pmulhrsw    m11,        m7
    packuswb    m10,        m11

    palignr     m11,        m2, m0, 8
    palignr     m1,         m3, m2, 8
    pmaddubsw   m11,        [r3 + 3 * 32]       ; [19]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 + 3 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m12,        m2, m0, 10
    palignr     m1,         m3, m2, 10
    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 - 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4,         m2, m0, 10
    palignr     m1,         m3, m2, 10
    pmaddubsw   m4,         [r3 + 13 * 32]      ; [29]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         [r3 + 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m5,         m2, m0, 12
    palignr     m1,         m3, m2, 12
    pmaddubsw   m5,         [r3 + 2 * 32]       ; [18]
    pmulhrsw    m5,         m7
    pmaddubsw   m1,         [r3 + 2 * 32]
    pmulhrsw    m1,         m7
    packuswb    m5,         m1

    palignr     m8,         m2, m0, 14
    palignr     m1,         m3, m2, 14
    pmaddubsw   m6,         m8, [r3 - 9 * 32]   ; [7]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m1, [r3 - 9 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 + 12 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    pmaddubsw   m9,         m2, [r3 + 1 * 32]   ; [17]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         m3, [r3 + 1 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    movu        m0,         [r2 + 25]
    movu        m1,         [r2 + 26]
    punpcklbw   m0,         m1

    palignr     m11,        m3, m2, 2
    palignr     m1,         m0, m3, 2
    pmaddubsw   m10,        m11, [r3 - 10 * 32] ; [6]
    pmulhrsw    m10,        m7
    pmaddubsw   m12,        m1, [r3 - 10 * 32]
    pmulhrsw    m12,        m7
    packuswb    m10,        m12

    pmaddubsw   m11,        [r3 + 11 * 32]      ; [27]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 + 11 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m0,         m3, 4
    palignr     m3,         m2, 4
    pmaddubsw   m3,         [r3]                ; [16]
    pmulhrsw    m3,         m7
    pmaddubsw   m0,         [r3]
    pmulhrsw    m0,         m7
    packuswb    m3,         m0

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 3, 0, 8
    ret

cglobal ang32_mode_4_32_row_16_31
    test        r7d,        r7d
    ; rows 16 to 23
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 - 11 * 32]  ; [5]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 - 11 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    pmaddubsw   m5,         m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw    m5,         m7
    pmaddubsw   m1,         m2, [r3 + 10 * 32]
    pmulhrsw    m1,         m7
    packuswb    m5,         m1

    palignr     m6,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m6,         [r3 - 1 * 32]       ; [15]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 - 1 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m9,         m2, m0, 4
    palignr     m10,        m3, m2, 4
    pmaddubsw   m8,         m9, [r3 - 12 * 32]  ; [4]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         m10, [r3 - 12 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    pmaddubsw   m9,         [r3 + 9 * 32]       ; [25]
    pmulhrsw    m9,         m7
    pmaddubsw   m10,        [r3 + 9 * 32]
    pmulhrsw    m10,        m7
    packuswb    m9,         m10

    palignr     m10,        m2, m0, 6
    palignr     m11,        m3, m2, 6
    pmaddubsw   m10,        [r3 - 2 * 32]       ; [14]
    pmulhrsw    m10,        m7
    pmaddubsw   m11,        [r3 - 2 * 32]
    pmulhrsw    m11,        m7
    packuswb    m10,        m11

    palignr     m12,        m2, m0, 8
    palignr     m1,         m3, m2, 8
    pmaddubsw   m11,        m12, [r3 - 13 * 32] ; [3]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m1,         m3, m2, 8
    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 + 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 24 to 31
    palignr     m4,         m2, m0, 10
    palignr     m1,         m3, m2, 10
    pmaddubsw   m4,         [r3 - 3 * 32]       ; [13]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         [r3 - 3 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m6,         m2, m0, 12
    palignr     m8,         m3, m2, 12
    pmaddubsw   m5,         m6, [r3 - 14 * 32]  ; [2]
    pmulhrsw    m5,         m7
    pmaddubsw   m1,         m8, [r3 - 14 * 32]
    pmulhrsw    m1,         m7
    packuswb    m5,         m1

    pmaddubsw   m6,         [r3 + 7 * 32]       ; [23]
    pmulhrsw    m6,         m7
    pmaddubsw   m8,         [r3 + 7 * 32]
    pmulhrsw    m8,         m7
    packuswb    m6,         m8

    palignr     m8,         m2, m0, 14
    palignr     m1,         m3, m2, 14
    pmaddubsw   m8,         [r3 - 4 * 32]       ; [12]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 - 4 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    pmaddubsw   m9,         m2, [r3 - 15 * 32]  ; [1]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         m3, [r3 - 15 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    pmaddubsw   m10,        m2, [r3 + 6 * 32]   ; [22]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         m3, [r3 + 6 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    movu        m0,         [r2 + 25]
    movu        m1,         [r2 + 26]
    punpcklbw   m0,         m1

    palignr     m11,        m3, m2, 2
    palignr     m1,         m0, m3, 2
    pmaddubsw   m11,        [r3 - 5 * 32]       ; [11]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 5 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    movu        m12,        [r2 + 11]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_4, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_4_32_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 11

    call ang32_mode_4_32_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_32, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_4_32_row_0_15

    add         r2, 11

    call ang32_mode_4_32_row_16_31
    RET

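; helper for mode 5 / mode 31, rows 0-15; same register contract as
; ang32_mode_3_33_row_0_15 above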
cglobal ang32_mode_5_31_row_0_15
    test        r7d,        r7d
    ; rows 0 to 7
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 + 1 * 32]   ; [17]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 + 1 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m6,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m5,         m6, [r3 - 14 * 32]  ; [2]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m1, [r3 - 14 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         [r3 + 3 * 32]       ; [19]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 + 3 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m9,         m2, m0, 4
    palignr     m10,        m3, m2, 4
    pmaddubsw   m8,         m9, [r3 - 12 * 32]  ; [4]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         m10, [r3 - 12 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    pmaddubsw   m9,         [r3 + 5 * 32]       ; [21]
    pmulhrsw    m9,         m7
    pmaddubsw   m10,        [r3 + 5 * 32]
    pmulhrsw    m10,        m7
    packuswb    m9,         m10

    palignr     m11,        m2, m0, 6
    palignr     m12,        m3, m2, 6
    pmaddubsw   m10,        m11, [r3 - 10 * 32] ; [6]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         m12, [r3 - 10 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    pmaddubsw   m11,        [r3 + 7 * 32]       ; [23]
    pmulhrsw    m11,        m7
    pmaddubsw   m12,        [r3 + 7 * 32]
    pmulhrsw    m12,        m7
    packuswb    m11,        m12

    palignr     m12,        m2, m0, 8
    palignr     m1,         m3, m2, 8
    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 - 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4,         m2, m0, 8
    palignr     m1,         m3, m2, 8
    pmaddubsw   m4,         [r3 + 9 * 32]       ; [25]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         [r3 + 9 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m6,         m2, m0, 10
    palignr     m1,         m3, m2, 10
    pmaddubsw   m5,         m6, [r3 - 6 * 32]   ; [10]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m1, [r3 - 6 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         [r3 + 11 * 32]      ; [27]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 + 11 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m9,         m2, m0, 12
    palignr     m1,         m3, m2, 12
    pmaddubsw   m8,         m9, [r3 - 4 * 32]   ; [12]
    pmulhrsw    m8,         m7
    pmaddubsw   m10,        m1, [r3 - 4 * 32]
    pmulhrsw    m10,        m7
    packuswb    m8,         m10

    pmaddubsw   m9,         [r3 + 13 * 32]      ; [29]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         [r3 + 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    palignr     m11,        m2, m0, 14
    palignr     m1,         m3, m2, 14
    pmaddubsw   m10,        m11, [r3 - 2 * 32]  ; [14]
    pmulhrsw    m10,        m7
    pmaddubsw   m12,        m1, [r3 - 2 * 32]
    pmulhrsw    m12,        m7
    packuswb    m10,        m12

    pmaddubsw   m11,        [r3 + 15 * 32]      ; [31]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 + 15 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    pmaddubsw   m2,         [r3]                ; [16]
    pmulhrsw    m2,         m7
    pmaddubsw   m3,         [r3]
    pmulhrsw    m3,         m7
    packuswb    m2,         m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

cglobal ang32_mode_5_31_row_16_31
    test        r7d,        r7d
    ; rows 16 to 23
    movu        m0,         [r2 +  1]               ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]               ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 - 15 * 32]      ; [1]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 - 15 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    pmaddubsw   m5,         m0, [r3 + 2 * 32]       ; [18]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m2, [r3 + 2 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    palignr     m8,         m2, m0, 2
    palignr     m9,         m3, m2, 2
    pmaddubsw   m6,         m8, [r3 - 13 * 32]      ; [3]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         m9, [r3 - 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    pmaddubsw   m8,         [r3 + 4 * 32]           ; [20]
    pmulhrsw    m8,         m7
    pmaddubsw   m9,         [r3 + 4 * 32]
    pmulhrsw    m9,         m7
    packuswb    m8,         m9

    palignr     m10,        m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m9,         m10, [r3 - 11 * 32]     ; [5]
    pmulhrsw    m9,         m7
    pmaddubsw   m11,        m1, [r3 - 11 * 32]
    pmulhrsw    m11,        m7
    packuswb    m9,         m11

    pmaddubsw   m10,        [r3 + 6 * 32] ; [22]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         [r3 + 6 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    palignr     m12,        m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m11,        m12, [r3 - 9 * 32]      ; [7]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 9 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m1,         m3, m2, 6
    pmaddubsw   m12,        [r3 + 8 * 32]           ; [24]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 + 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 24 to 31
    palignr     m5,         m2, m0, 8
    palignr     m8,         m3, m2, 8
    pmaddubsw   m4,         m5, [r3 - 7 * 32]       ; [9]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m8, [r3 - 7 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    pmaddubsw   m5,         [r3 + 10 * 32]          ; [26]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         [r3 + 10 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    palignr     m8,         m2, m0, 10
    palignr     m9,         m3, m2, 10
    pmaddubsw   m6,         m8, [r3 - 5 * 32]       ; [11]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         m9, [r3 - 5 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    pmaddubsw   m8,         [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8,         m7
    pmaddubsw   m9,         [r3 + 12 * 32]
    pmulhrsw    m9,         m7
    packuswb    m8,         m9

    palignr     m10,        m2, m0, 12
    palignr     m11,        m3, m2, 12
    pmaddubsw   m9,         m10, [r3 - 3 * 32]      ; [13]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         m11, [r3 - 3 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    pmaddubsw   m10,        [r3 + 14 * 32]          ; [30]
    pmulhrsw    m10,        m7
    pmaddubsw   m11,        [r3 + 14 * 32]
    pmulhrsw    m11,        m7
    packuswb    m10,        m11

    palignr     m11,        m2, m0, 14
    palignr     m1,         m3, m2, 14
    pmaddubsw   m11,        [r3 - 1 * 32]           ; [15]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 1 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    movu        m2,         [r2 + 9]
    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_5, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_5_31_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 9

    call ang32_mode_5_31_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_31, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_5_31_row_0_15

    add         r2, 9

    call ang32_mode_5_31_row_16_31
    RET

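; helper for mode 6 / mode 30, rows 0-15; same register contract as
; ang32_mode_3_33_row_0_15 above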
cglobal ang32_mode_6_30_row_0_15
    test        r7d,        r7d
    ; rows 0 to 7
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 - 3 * 32]   ; [13]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 - 3 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    pmaddubsw   m5,         m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m2, [r3 + 10 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    palignr     m8,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m6,         m8, [r3 - 9 * 32]   ; [7]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m1, [r3 - 9 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         [r3 + 4 * 32]       ; [20]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 + 4 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    palignr     m11,        m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m9,         m11, [r3 - 15 * 32] ; [1]
    pmulhrsw    m9,         m7
    pmaddubsw   m12,        m1, [r3 - 15 * 32]
    pmulhrsw    m12,        m7
    packuswb    m9,         m12

    pmaddubsw   m10,        m11, [r3 - 2 * 32]  ; [14]
    pmulhrsw    m10,        m7
    pmaddubsw   m12,        m1, [r3 - 2 * 32]
    pmulhrsw    m12,        m7
    packuswb    m10,        m12

    pmaddubsw   m11,        [r3 + 11 * 32]      ; [27]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 + 11 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m12,        m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 - 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4,         m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m4,         [r3 + 5 * 32]       ; [21]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         [r3 + 5 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m8,         m2, m0, 8
    palignr     m1,         m3, m2, 8
    pmaddubsw   m5,         m8, [r3 - 14 * 32]  ; [2]
    pmulhrsw    m5,         m7
    pmaddubsw   m9,         m1, [r3 - 14 * 32]
    pmulhrsw    m9,         m7
    packuswb    m5,         m9

    pmaddubsw   m6,         m8, [r3 - 1 * 32]   ; [15]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m1, [r3 - 1 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 + 12 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    palignr     m10,        m2, m0, 10
    palignr     m1,         m3, m2, 10
    pmaddubsw   m9,         m10, [r3 - 7 * 32]  ; [9]
    pmulhrsw    m9,         m7
    pmaddubsw   m11,        m1, [r3 - 7 * 32]
    pmulhrsw    m11,        m7
    packuswb    m9,         m11

    pmaddubsw   m10,        [r3 + 6 * 32]       ; [22]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         [r3 + 6 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    palignr     m3,         m2, 12
    palignr     m2,         m0, 12
    pmaddubsw   m11,        m2, [r3 - 13 * 32]  ; [3]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         m3, [r3 - 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    pmaddubsw   m2,         [r3]                ; [16]
    pmulhrsw    m2,         m7
    pmaddubsw   m3,         [r3]
    pmulhrsw    m3,         m7
    packuswb    m2,         m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

cglobal ang32_mode_6_30_row_16_31
    test        r7d,        r7d
    ; rows 16 to 23
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 + 13 * 32]  ; [29]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 + 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m6,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m5,         m6, [r3 - 6 * 32]   ; [10]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m1, [r3 - 6 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         [r3 + 7 * 32]       ; [23]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 + 7 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m10,        m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m8,         m10, [r3 - 12 * 32] ; [4]
    pmulhrsw    m8,         m7
    pmaddubsw   m11,        m1, [r3 - 12 * 32]
    pmulhrsw    m11,        m7
    packuswb    m8,         m11

    pmaddubsw   m9,         m10, [r3 + 1 * 32]  ; [17]
    pmulhrsw    m9,         m7
    pmaddubsw   m11,        m1, [r3 + 1 * 32]
    pmulhrsw    m11,        m7
    packuswb    m9,         m11

    pmaddubsw   m10,        [r3 + 14 * 32]      ; [30]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         [r3 + 14 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    palignr     m12,        m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m11,        m12, [r3 - 5 * 32]  ; [11]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 5 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m1,         m3, m2, 6
    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 + 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 24 to 31
    palignr     m6,         m2, m0, 8
    palignr     m1,         m3, m2, 8
    pmaddubsw   m4,         m6, [r3 - 11 * 32]  ; [5]
    pmulhrsw    m4,         m7
    pmaddubsw   m8,         m1, [r3 - 11 * 32]
    pmulhrsw    m8,         m7
    packuswb    m4,         m8

    pmaddubsw   m5,         m6, [r3 + 2 * 32]   ; [18]
    pmulhrsw    m5,         m7
    pmaddubsw   m9,         m1, [r3 + 2 * 32]
    pmulhrsw    m9,         m7
    packuswb    m5,         m9

    pmaddubsw   m6,         [r3 + 15 * 32]      ; [31]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 + 15 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m9,         m2, m0, 10
    palignr     m1,         m3, m2, 10
    pmaddubsw   m8,         m9, [r3 - 4 * 32]   ; [12]
    pmulhrsw    m8,         m7
    pmaddubsw   m10,        m1, [r3 - 4 * 32]
    pmulhrsw    m10,        m7
    packuswb    m8,         m10

    pmaddubsw   m9,         [r3 + 9 * 32]       ; [25]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         [r3 + 9 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    palignr     m3,         m2, 12
    palignr     m2,         m0, 12
    pmaddubsw   m10,        m2, [r3 - 10 * 32]  ; [6]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         m3, [r3 - 10 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    pmaddubsw   m2,         [r3 + 3 * 32]        ; [19]
    pmulhrsw    m2,         m7
    pmaddubsw   m3,         [r3 + 3 * 32]
    pmulhrsw    m3,         m7
    packuswb    m2,         m3

    movu        m3,         [r2 + 8]             ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_6, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_6_30_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 6

    call ang32_mode_6_30_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_30, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_6_30_row_0_15

    add         r2, 6

    call ang32_mode_6_30_row_16_31
    RET

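; helper for mode 7 / mode 29, rows 0-15; same register contract as
; ang32_mode_3_33_row_0_15 above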
cglobal ang32_mode_7_29_row_0_15
    test        r7d,        r7d
    ; rows 0 to 7
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 - 7 * 32]   ; [9]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 - 7 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    pmaddubsw   m5,         m0, [r3 + 2 * 32]   ; [18]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m2, [r3 + 2 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         m0, [r3 + 11 * 32]  ; [27]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m2, [r3 + 11 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    palignr     m11,        m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m8,         m11, [r3 - 12 * 32] ; [4]
    pmulhrsw    m8,         m7
    pmaddubsw   m12,        m1, [r3 - 12 * 32]
    pmulhrsw    m12,        m7
    packuswb    m8,         m12

    pmaddubsw   m9,         m11, [r3 - 3 * 32]  ; [13]
    pmulhrsw    m9,         m7
    pmaddubsw   m12,        m1, [r3 - 3 * 32]
    pmulhrsw    m12,        m7
    packuswb    m9,         m12

    pmaddubsw   m10,        m11, [r3 + 6 * 32]  ; [22]
    pmulhrsw    m10,        m7
    pmaddubsw   m12,        m1, [r3 + 6 * 32]
    pmulhrsw    m12,        m7
    packuswb    m10,        m12

    pmaddubsw   m11,        [r3 + 15 * 32]      ; [31]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 + 15 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m12,        m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 - 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m5,         m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m4,         m5, [r3 + 1 * 32]   ; [17]
    pmulhrsw    m4,         m7
    pmaddubsw   m8,         m1, [r3 + 1 * 32]
    pmulhrsw    m8,         m7
    packuswb    m4,         m8

    pmaddubsw   m5,         [r3 + 10 * 32]      ; [26]
    pmulhrsw    m5,         m7
    pmaddubsw   m1,         [r3 + 10 * 32]
    pmulhrsw    m1,         m7
    packuswb    m5,         m1

    palignr     m10,        m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m6,         m10, [r3 - 13 * 32] ; [3]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m1, [r3 - 13 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         m10, [r3 - 4 * 32]  ; [12]
    pmulhrsw    m8,         m7
    pmaddubsw   m11,        m1, [r3 - 4 * 32]
    pmulhrsw    m11,        m7
    packuswb    m8,         m11

    pmaddubsw   m9,         m10, [r3 + 5 * 32]  ; [21]
    pmulhrsw    m9,         m7
    pmaddubsw   m11,        m1, [r3 + 5 * 32]
    pmulhrsw    m11,        m7
    packuswb    m9,         m11

    pmaddubsw   m10,        [r3 + 14 * 32]      ; [30]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         [r3 + 14 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    palignr     m3,         m2, 8
    palignr     m2,         m0, 8
    pmaddubsw   m11,        m2, [r3 - 9 * 32]   ; [7]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         m3, [r3 - 9 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    pmaddubsw   m2,         [r3]                ; [16]
    pmulhrsw    m2,         m7
    pmaddubsw   m3,         [r3]
    pmulhrsw    m3,         m7
    packuswb    m2,         m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

cglobal ang32_mode_7_29_row_16_31
    test        r7d,        r7d
    ; rows 16 to 23
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 + 9 * 32]   ; [25]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 + 9 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m9,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m5,         m9, [r3 - 14 * 32]  ; [2]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m1, [r3 - 14 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         m9, [r3 - 5 * 32]   ; [11]
    pmulhrsw    m6,         m7
    pmaddubsw   m10,        m1, [r3 - 5 * 32]
    pmulhrsw    m10,        m7
    packuswb    m6,         m10

    pmaddubsw   m8,         m9, [r3 + 4 * 32]   ; [20]
    pmulhrsw    m8,         m7
    pmaddubsw   m10,        m1, [r3 + 4 * 32]
    pmulhrsw    m10,        m7
    packuswb    m8,         m10

    pmaddubsw   m9,         [r3 + 13 * 32]      ; [29]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         [r3 + 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    palignr     m12,        m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m10,        m12, [r3 - 10 * 32] ; [6]
    pmulhrsw    m10,        m7
    pmaddubsw   m11,        m1, [r3 - 10 * 32]
    pmulhrsw    m11,        m7
    packuswb    m10,        m11

    pmaddubsw   m11,        m12, [r3 - 1 * 32]  ; [15]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 1 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m1,         m3, m2, 4
    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 + 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 24 to 31
    palignr     m8,         m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m4,         m8, [r3 - 15 * 32]  ; [1]
    pmulhrsw    m4,         m7
    pmaddubsw   m9,         m1, [r3 - 15 * 32]
    pmulhrsw    m9,         m7
    packuswb    m4,         m9

    pmaddubsw   m5,         m8, [r3 - 6 * 32]   ; [10]
    pmulhrsw    m5,         m7
    pmaddubsw   m9,         m1, [r3 - 6 * 32]
    pmulhrsw    m9,         m7
    packuswb    m5,         m9

    pmaddubsw   m6,         m8, [r3 + 3 * 32]   ; [19]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m1, [r3 + 3 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 + 12 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    palignr     m3,         m2, 8
    palignr     m2,         m0, 8
    pmaddubsw   m9,         m2, [r3 - 11 * 32]  ; [5]
    pmulhrsw    m9,         m7
    pmaddubsw   m1,         m3, [r3 - 11 * 32]
    pmulhrsw    m1,         m7
    packuswb    m9,         m1

    pmaddubsw   m10,        m2, [r3 - 2 * 32]   ; [14]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         m3, [r3 - 2 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    pmaddubsw   m2,        [r3 + 7 * 32]        ; [23]
    pmulhrsw    m2,        m7
    pmaddubsw   m3,        [r3 + 7 * 32]
    pmulhrsw    m3,        m7
    packuswb    m2,        m3

    movu        m1,         [r2 + 6]            ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 1, 0, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_7, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_7_29_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 4

    call ang32_mode_7_29_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_29, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_7_29_row_0_15

    add         r2, 4

    call ang32_mode_7_29_row_16_31
    RET
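
; Modes 8 (horizontal) and 28 (vertical) share one helper covering all 32
; rows: with an angle of +5 the reference window only advances 5 samples over
; the whole block, so everything stays in registers. r7d follows the same
; flag convention as the mode 7/29 helpers above.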

cglobal ang32_mode_8_28_avx2
    test        r7d,        r7d
    ; rows 0 to 7
    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4,         m0, [r3 - 11 * 32]  ; [5]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         m2, [r3 - 11 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    pmaddubsw   m5,         m0, [r3 - 6 * 32]   ; [10]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m2, [r3 - 6 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         m0, [r3 - 1 * 32]   ; [15]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m2, [r3 - 1 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         m0, [r3 + 4 * 32]   ; [20]
    pmulhrsw    m8,         m7
    pmaddubsw   m12,        m2, [r3 + 4 * 32]
    pmulhrsw    m12,        m7
    packuswb    m8,         m12

    pmaddubsw   m9,         m0, [r3 + 9 * 32]   ; [25]
    pmulhrsw    m9,         m7
    pmaddubsw   m12,        m2, [r3 + 9 * 32]
    pmulhrsw    m12,        m7
    packuswb    m9,         m12

    pmaddubsw   m10,        m0, [r3 + 14 * 32]  ; [30]
    pmulhrsw    m10,        m7
    pmaddubsw   m12,        m2, [r3 + 14 * 32]
    pmulhrsw    m12,        m7
    packuswb    m10,        m12

    palignr     m12,        m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m11,        m12, [r3 - 13 * 32] ; [3]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m1,         m3, m2, 2
    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 - 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15

    palignr     m8,         m2, m0, 2
    palignr     m1,         m3, m2, 2
    pmaddubsw   m4,         m8, [r3 - 3 * 32]   ; [13]
    pmulhrsw    m4,         m7
    pmaddubsw   m9,         m1, [r3 - 3 * 32]
    pmulhrsw    m9,         m7
    packuswb    m4,         m9

    pmaddubsw   m5,         m8, [r3 + 2 * 32]   ; [18]
    pmulhrsw    m5,         m7
    pmaddubsw   m9,         m1, [r3 + 2 * 32]
    pmulhrsw    m9,         m7
    packuswb    m5,         m9

    pmaddubsw   m6,         m8, [r3 + 7 * 32]   ; [23]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m1, [r3 + 7 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         [r3 + 12 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    palignr     m12,        m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m9,         m12, [r3 - 15 * 32] ; [1]
    pmulhrsw    m9,         m7
    pmaddubsw   m11,        m1, [r3 - 15 * 32]
    pmulhrsw    m11,        m7
    packuswb    m9,         m11

    pmaddubsw   m10,        m12, [r3 - 10 * 32] ; [6]
    pmulhrsw    m10,        m7
    pmaddubsw   m11,        m1, [r3 - 10 * 32]
    pmulhrsw    m11,        m7
    packuswb    m10,        m11

    pmaddubsw   m11,        m12, [r3 - 5 * 32]  ; [11]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 - 5 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m1,         m3, m2, 4
    pmaddubsw   m12,        [r3]                ; [16]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8

    ; rows 16 to 23
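    ; ZF from the "test r7d, r7d" at entry is still usable here, since the
    ; vector instructions above do not write EFLAGS. On the transposing
    ; (horizontal, r7d == 0) path, generated rows 16 to 31 become output
    ; columns 16 to 31, so the destination is re-based mmsize/2 == 16 bytes
    ; right of the saved start; the vertical path skips the fixup.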

    jnz         .doNotAdjustBufferPtr
    lea         r4,         [r4 + mmsize/2]
    mov         r0,         r4
.doNotAdjustBufferPtr:

    palignr     m6,         m2, m0, 4
    palignr     m1,         m3, m2, 4
    pmaddubsw   m4,         m6, [r3 + 5 * 32]   ; [21]
    pmulhrsw    m4,         m7
    pmaddubsw   m8,         m1, [r3 + 5 * 32]
    pmulhrsw    m8,         m7
    packuswb    m4,         m8

    pmaddubsw   m5,         m6, [r3 + 10 * 32]  ; [26]
    pmulhrsw    m5,         m7
    pmaddubsw   m8,         m1, [r3 + 10 * 32]
    pmulhrsw    m8,         m7
    packuswb    m5,         m8

    pmaddubsw   m6,         [r3 + 15 * 32]      ; [31]
    pmulhrsw    m6,         m7
    pmaddubsw   m1,         [r3 + 15 * 32]
    pmulhrsw    m1,         m7
    packuswb    m6,         m1

    palignr     m12,        m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m8,         m12, [r3 - 12 * 32] ; [4]
    pmulhrsw    m8,         m7
    pmaddubsw   m11,        m1, [r3 - 12 * 32]
    pmulhrsw    m11,        m7
    packuswb    m8,         m11

    pmaddubsw   m9,         m12, [r3 - 7 * 32]  ; [9]
    pmulhrsw    m9,         m7
    pmaddubsw   m11,        m1, [r3 - 7 * 32]
    pmulhrsw    m11,        m7
    packuswb    m9,         m11

    pmaddubsw   m10,        m12, [r3 - 2 * 32]  ; [14]
    pmulhrsw    m10,        m7
    pmaddubsw   m11,        m1, [r3 - 2 * 32]
    pmulhrsw    m11,        m7
    packuswb    m10,        m11

    pmaddubsw   m11,        m12, [r3 + 3 * 32]  ; [19]
    pmulhrsw    m11,        m7
    pmaddubsw   m1,         [r3 + 3 * 32]
    pmulhrsw    m1,         m7
    packuswb    m11,        m1

    palignr     m1,         m3, m2, 6
    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
    pmulhrsw    m12,        m7
    pmaddubsw   m1,         [r3 + 8 * 32]
    pmulhrsw    m1,         m7
    packuswb    m12,        m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 16

    ; rows 24 to 31
    palignr     m4,         m2, m0, 6
    palignr     m1,         m3, m2, 6
    pmaddubsw   m4,         [r3 + 13 * 32]      ; [29]
    pmulhrsw    m4,         m7
    pmaddubsw   m1,         [r3 + 13 * 32]
    pmulhrsw    m1,         m7
    packuswb    m4,         m1

    palignr     m3,         m2, 8
    palignr     m2,         m0, 8
    pmaddubsw   m5,         m2, [r3 - 14 * 32]  ; [2]
    pmulhrsw    m5,         m7
    pmaddubsw   m9,         m3, [r3 - 14 * 32]
    pmulhrsw    m9,         m7
    packuswb    m5,         m9

    pmaddubsw   m6,         m2, [r3 - 9 * 32]   ; [7]
    pmulhrsw    m6,         m7
    pmaddubsw   m9,         m3, [r3 - 9 * 32]
    pmulhrsw    m9,         m7
    packuswb    m6,         m9

    pmaddubsw   m8,         m2, [r3 - 4 * 32]   ; [12]
    pmulhrsw    m8,         m7
    pmaddubsw   m1,         m3, [r3 - 4 * 32]
    pmulhrsw    m1,         m7
    packuswb    m8,         m1

    pmaddubsw   m9,         m2, [r3 + 1 * 32]   ; [17]
    pmulhrsw    m9,         m7
    pmaddubsw   m11,        m3, [r3 + 1 * 32]
    pmulhrsw    m11,        m7
    packuswb    m9,         m11

    pmaddubsw   m10,        m2, [r3 + 6 * 32]   ; [22]
    pmulhrsw    m10,        m7
    pmaddubsw   m1,         m3, [r3 + 6 * 32]
    pmulhrsw    m1,         m7
    packuswb    m10,        m1

    pmaddubsw   m2,         [r3 + 11 * 32]      ; [27]
    pmulhrsw    m2,         m7
    pmaddubsw   m3,         [r3 + 11 * 32]
    pmulhrsw    m3,         m7
    packuswb    m2,         m3

    movu        m3,         [r2 + 6]            ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 24
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_8, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call ang32_mode_8_28_avx2
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_28, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call ang32_mode_8_28_avx2
    RET
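
;-----------------------------------------------------------------------------
; The horizontal modes 9, 11 and 12 below avoid the row transpose entirely:
; for a horizontal mode the fraction and integer offset depend only on the
; column, so a vbroadcasti128 of angHor_tab_N supplies per-column
; {32 - frac, frac} weights, a pshufb mask builds the matching reference byte
; pairs, and stepping the window with palignr by one byte per iteration
; produces successive output rows directly.
;-----------------------------------------------------------------------------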

INIT_YMM avx2
cglobal intra_pred_ang32_9, 3,5,8
    vbroadcasti128      m0, [angHor_tab_9]
    vbroadcasti128      m1, [angHor_tab_9 + mmsize/2]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode9]
    lea                 r3, [r1 * 3]

    vbroadcasti128      m3, [r2 + mmsize*2 +  1]
    vbroadcasti128      m6, [r2 + mmsize*2 + 17]

    pshufb              m5, m3, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 1
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 2
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m6, m3, 3
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 4
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 5
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 6
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m6, m3, 7
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 8
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 9
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 10
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m6, m3, 11
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 12
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 13
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 14
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m6, m3, 15
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    vbroadcasti128      m3, [r2 + mmsize*2 + 33]

    pshufb              m5, m6, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 1
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 2
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m3, m6, 3
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 4
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 5
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 6
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m3, m6, 7
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 8
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 9
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 10
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m3, m6, 11
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 12
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 13
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 14
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1*2], m4

    palignr             m5, m3, m6, 15
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4
    RET
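
; Mode 27 (vertical, angle +2): rows are written in display order with no
; transpose. The fraction advances by 2 per row and wraps to 0 at rows 15
; and 31, which are stored as plain copies of the reference at [r2 + 2] and
; [r2 + 3].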

cglobal intra_pred_ang32_27, 3,5,6
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]            ; r4 -> 3 * stride
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu                m1, [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
    movu                m3, [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu                m4, [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw           m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw           m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    punpcklbw           m3, m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw           m4, m0, [r3 - 14 * 32]  ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 12 * 32]  ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 - 10 * 32]  ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3 - 8 * 32]   ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m0, [r3 - 6 * 32]   ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 4 * 32]   ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 - 2 * 32]   ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3]            ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw           m4, m0, [r3 + 2 * 32]   ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 + 4 * 32]   ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 + 6 * 32]   ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3 + 8 * 32]   ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 + 12 * 32]  ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 + 14 * 32]  ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m3, m2, 2
    palignr             m2, m0, 2
    movu                m1, [r2 + 2]            ; [0]
    movu                [r0 + r4], m1

    lea                 r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw           m4, m2, [r3 - 14 * 32]  ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m2, [r3 - 12 * 32]  ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m2, [r3 - 10 * 32]  ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m2, [r3 - 8 * 32]   ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m2, [r3 - 6 * 32]   ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m2, [r3 - 4 * 32]   ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1],  m4

    pmaddubsw           m4, m2, [r3 - 2 * 32]   ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m2, [r3]            ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    pmaddubsw           m4, m2, [r3 + 2 * 32]   ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m2, [r3 + 4 * 32]   ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1],  m4

    pmaddubsw           m4, m2, [r3 + 6 * 32]   ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m2, [r3 + 8 * 32]   ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4],  m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m2, [r3 + 10 * 32]  ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m2, [r3 + 12 * 32]  ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1],  m4

    pmaddubsw           m2, [r3 + 14 * 32]      ; [30]
    pmulhrsw            m2, m5
    pmaddubsw           m3, [r3 + 14 * 32]
    pmulhrsw            m3, m5
    packuswb            m2, m3
    movu                [r0 + r1*2], m2

    movu                m1, [r2 + 3]            ; [0]
    movu                [r0 + r4], m1
    RET
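
; Mode 10 (pure horizontal): every output row replicates one left-neighbour
; sample across all 32 columns. A zeroed shuffle index is broadcast against
; 16 reference bytes at a time and incremented with pb_1 between rows.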

cglobal intra_pred_ang32_10, 5,5,4
    pxor                m0, m0
    mova                m1, [pb_1]
    lea                 r4, [r1 * 3]

    vbroadcasti128      m2, [r2 + mmsize*2 + 1]

    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3

    lea                 r0, [r0 + r1 * 4]

    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3

    lea                 r0, [r0 + r1 * 4]

    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3

    lea                 r0, [r0 + r1 * 4]

    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3

    lea                 r0, [r0 + r1 * 4]
    pxor                m0, m0
    vbroadcasti128      m2, [r2 + mmsize*2 + mmsize/2 + 1]

    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3

    lea                 r0, [r0 + r1 * 4]

    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3

    lea                 r0, [r0 + r1 * 4]

    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3

    lea                 r0, [r0 + r1 * 4]

    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r1 * 2], m3
    paddb               m0, m1
    pshufb              m3, m2, m0
    movu                [r0 + r4], m3
    RET
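
; Mode 11 (horizontal, angle -2): one sample projected from the above row
; (above[16], per the inverse angle) plus the top-left pixel are spliced in
; front of the left references by the pinsrb pair below; after that the
; per-column weight scheme matches mode 9.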

cglobal intra_pred_ang32_11, 3,4,8
    vbroadcasti128      m0, [angHor_tab_11]
    vbroadcasti128      m1, [angHor_tab_11 + mmsize/2]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode11]
    lea                 r3, [r1 * 3]

    ; prepare for [16 0 -1 -2 ...]
    movu               xm3, [r2 + mmsize*2 -  1]
    vbroadcasti128      m6, [r2 + mmsize*2 + 15]

    pinsrb             xm3, [r2 +  0], 1
    pinsrb             xm3, [r2 + 16], 0
    vinserti128         m3, m3, xm3, 1          ; [16  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 16  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]

    pshufb              m5, m3, m7              ; [ 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1 16  0 16  0 16  0 16  0 16  0 16  0 16  0 16  0]
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 1
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 2
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 3
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 4
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 5
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 6
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 7
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 8
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 9
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 10
    pshufb              m5, m7

    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 11
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 12
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 13
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 14
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 15
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    mova                m3, m6
    vbroadcasti128      m6, [r2 + mmsize*2 + 15 + 16]
    pshufb              m5, m3, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 1
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 2
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 3
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 4
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 5
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 6
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 7
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 8
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 9
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 10
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 11
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 12
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m6, m3, 13
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m6, m3, 14
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m6, m3, 15
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4
    RET
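
; Mode 25 (vertical, angle -2): the twin of mode 11. The top-left pixel and
; left[16] (the inverse-angle projection) are placed in the top bytes of xm3
; so that rows 16 to 31 can palignr the interleaved reference window back by
; one sample; rows 0 to 15 step the fractions 30, 28, ..., 2 and finish with
; a plain copy of the reference.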

cglobal intra_pred_ang32_25, 3,5,7
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 +  0]               ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]
    movu                m1, [r2 +  1]               ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]

    pinsrb              xm3, [r2], 15
    pinsrb              xm3, [r2 + mmsize*2 + 16], 14

    punpckhbw           m2, m0, m1                  ; [32 31 31 30 30 29 29 28 28 27 27 26 26 25 25 24 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8]
    punpcklbw           m0, m1                      ; [24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16  8  7  7  6  6  5  5  4  4  3  3  2  2  1  1  0]
    vinserti128         m3, m3, xm2, 1              ; [16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  0 16  x  x  x  x  x  x  x  x  x  x  x  x  x  x]

    pmaddubsw           m4, m0, [r3 + 14 * 32]      ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 + 12 * 32]      ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3 + 8 * 32]       ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m0, [r3 + 6 * 32]       ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 + 4 * 32]       ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 + 2 * 32]       ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3]                ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw           m4, m0, [r3 - 2 * 32]       ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 4 * 32]       ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 - 6 * 32]       ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3 - 8 * 32]       ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m0, [r3 - 10 * 32]      ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 12 * 32]      ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 - 14 * 32]      ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    movu                m1, [r2]                    ; [0]
    movu                [r0 + r4],  m1

    lea                 r0, [r0 + r1 * 4]
    palignr             m2, m0, 14
    palignr             m0, m3, 14

    ; rows 16 to 23
    pmaddubsw           m4, m0, [r3 + 14 * 32]      ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 + 12 * 32]      ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3 + 8 * 32]       ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m0, [r3 + 6 * 32]       ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 + 4 * 32]       ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 + 2 * 32]       ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3]                ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    pmaddubsw           m4, m0, [r3 - 2 * 32]       ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 4 * 32]       ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 - 6 * 32]       ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m0, [r3 - 8 * 32]       ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m0, [r3 - 10 * 32]      ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 12 * 32]      ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m0, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m0, m5
    pmaddubsw           m2, [r3 - 14 * 32]
    pmulhrsw            m2, m5
    packuswb            m0, m2
    movu                [r0 + r1*2], m0

    movu                m1, [r2 + 1]                ; [0]
    palignr             m1, m3, 14
    movu                [r0 + r4], m1
    RET

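;-----------------------------------------------------------------------------
; 32x32 angular prediction for HEVC mode 12 (intraPredAngle = -5,
; invAngle = -1638); r0 = dst, r1 = dstStride, r2 = reference samples.
; Per-pixel weights come from ang32_fact_mode12 rather than ang_table_avx2.
;-----------------------------------------------------------------------------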
cglobal intra_pred_ang32_12, 3,4,9
    movu                m0, [ang32_fact_mode12]
    movu                m1, [ang32_fact_mode12 + mmsize]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode12]
    mova                m8, [ang32_shuf_mode12 + mmsize]
    lea                 r3, [r1 * 3]

    ; prepare for [26, 19, 13,  6,  0, -1, -2....]
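    ; (HEVC inverse-angle projection: offset(k) = (k * 1638 + 128) >> 8,
    ;  giving 6, 13, 19, 26 for k = 1..4 -- exactly the bytes gathered by
    ;  the pinsrb sequence below)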

    movu               xm4, [r2 + mmsize*2 - 4]
    vbroadcasti128      m6, [r2 + mmsize*2 + 12]

    pinsrb             xm4, [r2 +  0], 4
    pinsrb             xm4, [r2 +  6], 3
    pinsrb             xm4, [r2 + 13], 2
    pinsrb             xm4, [r2 + 19], 1
    pinsrb             xm4, [r2 + 26], 0
    vinserti128         m3, m4, xm4, 1      ; [26, 19, 13,  6,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 26, 19, 13,  6,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]

    pshufb              m4, m3, m7          ; [ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  6,  0,  6,  0, 13,  6, 13,  6, 13,  6, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13]
    pshufb              m5, m3, m8          ; [ 6,  0,  6,  0,  6,  0,  6,  0, 13,  6, 13,  6, 13,  6, 13,  6, 19, 13, 26, 19, 26, 19, 26, 19, 26, 19, 26, 19, 26, 19, 26, 19]
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 1
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 2
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 3
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m6, m3, 4
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 5
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 6
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 7
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m6, m3, 8
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 9
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 10
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 11
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m6, m3, 12
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 13
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 14
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 15
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]
    mova                m3, m6
    vbroadcasti128      m6, [r2 + mmsize*2 + 12 + 16]

    pshufb              m4, m3, m7
    pshufb              m5, m3, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 1
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 2
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 3
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m6, m3, 4
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 5
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 6
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 7
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m6, m3, 8
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 9
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 10
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 11
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m6, m3, 12
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m6, m3, 13
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m6, m3, 14
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m3, 15
    pshufb              m5, m4, m8
    pshufb              m4, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4
    RET

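;-----------------------------------------------------------------------------
; 32x32 angular prediction for HEVC mode 24 (intraPredAngle = -5), the
; vertical counterpart of mode 12; r0 = dst, r1 = dstStride, r2 = reference
; samples, r3 = centre entry (fract 16) of ang_table_avx2, so [r3 + k * 32]
; selects the weights for fract 16 + k.
;-----------------------------------------------------------------------------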
cglobal intra_pred_ang32_24, 3,5,8
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 + 0]
    movu                m1, [r2 + 1]
    punpckhbw           m2, m0, m1
    punpcklbw           m0, m1

    movu                m4, [r2 + mmsize*2]
    pshufb              m4, [ang32_shuf_mode24]
    mova                m3, [ang32_shuf_mode24 + mmsize]
    vpermd              m4, m3, m4                      ; [ 6,  6, 13, 13, 19, 19, 26, 26, x, x, x, ...]
    palignr             m3, m0, m4, 1
    vinserti128         m3, m3, xm2, 1
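    ; m0/m2 now hold interleaved (ref[i], ref[i+1]) byte pairs for row 0,
    ; and m3 extends that pair window to the left with the projected
    ; left-column samples (offsets 6/13/19/26, duplicated into pair form by
    ; ang32_shuf_mode24) so that later rows can palignr the window leftward.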

    pmaddubsw           m4, m0, [r3 + 11 * 32]          ; [27]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 + 6 * 32]           ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 + 1 * 32]           ; [17]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m0, [r3 - 4 * 32]           ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m0, [r3 - 9 * 32]           ; [7]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 14
    palignr             m7, m2, m0, 14

    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw           m4, m6, [r3 + 3 * 32]           ; [19]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m0, m3, 12
    palignr             m7, m2, m0, 12

    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3]                    ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m0, m3, 10
    palignr             m7, m2, m0, 10

    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 8
    palignr             m7, m2, m0, 8

    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 - 1 * 32]           ; [15]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

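    ; fract == 0 row: the prediction equals the reference samples, so keep
    ; only the even (ref[i]) byte of each interleaved pair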
    pand                m6, [pw_00ff]
    pand                m7, [pw_00ff]
    packuswb            m6, m7
    movu                [r0 + r4], m6
    RET

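;-----------------------------------------------------------------------------
; 32x32 angular prediction for HEVC mode 13 (intraPredAngle = -9,
; invAngle = -910); r0 = dst, r1 = dstStride, r2 = reference samples, with
; per-pixel weights from ang32_fact_mode13.
;-----------------------------------------------------------------------------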
cglobal intra_pred_ang32_13, 3,4,9
    movu                m0, [ang32_fact_mode13]
    movu                m1, [ang32_fact_mode13 + mmsize]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode13]
    mova                m8, [ang32_shuf_mode13 + mmsize]
    lea                 r3, [r1 * 3]

    ; prepare for [28, 25, 21, 18, 14, 11,  7,  4,  0, -1, -2....]
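    ; (HEVC inverse-angle projection: offset(k) = (k * 910 + 128) >> 8,
    ;  giving 4, 7, 11, 14, 18, 21, 25, 28 for k = 1..8, matching the list
    ;  above)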

    movu                m6, [r2]
    pshufb              m6, [ang32_shuf_mode13 + mmsize*2]
    mova                m3, [ang32_shuf_mode24 + mmsize*1]
    vpermd              m6, m3, m6
    palignr             m6, m6, 1
    vbroadcasti128      m3, [r2 + mmsize*2 + 1]

    palignr             m5, m3, m6, 1
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 2
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 3
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 4
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 5
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 6
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 7
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 8
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 9
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 10
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 11
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 12
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 13
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 14
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 15
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    pshufb              m4, m3, m7
    pshufb              m5, m3, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    mova                m6, m3
    vbroadcasti128      m3, [r2 + mmsize*2 + 17]
    palignr             m5, m3, m6, 1
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 2
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 3
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 4
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 5
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 6
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 7
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 8
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 9
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 10
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 11
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 12
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 13
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 14
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 15
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    pshufb              m4, m3, m7
    pshufb              m5, m3, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4
    RET

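;-----------------------------------------------------------------------------
; 32x32 angular prediction for HEVC mode 23 (intraPredAngle = -9), the
; vertical counterpart of mode 13; per-row weights come from ang_table_avx2
; via r3.
;-----------------------------------------------------------------------------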
cglobal intra_pred_ang32_23, 3,5,8
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 + 0]
    movu                m1, [r2 + 1]
    punpckhbw           m2, m0, m1
    punpcklbw           m0, m1

    movu                m4, [r2 + mmsize*2]
    pshufb              m4, [ang32_shuf_mode23]
    vpermq              m4, m4, q1313
    palignr             m3, m0, m4, 1
    vinserti128         m3, m3, xm2, 1

    pmaddubsw           m4, m0, [r3 + 7 * 32]           ; [23]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 2 * 32]           ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m0, [r3 - 11 * 32]          ; [5]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m0, m3, 14
    palignr             m7, m2, m0, 14

    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 + 3 * 32]           ; [19]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m0, m3, 12
    palignr             m7, m2, m0, 12

    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw           m4, m6, [r3 - 1 * 32]           ; [15]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 10
    palignr             m7, m2, m0, 10

    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 8
    palignr             m7, m2, m0, 8

    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3]                    ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 6
    palignr             m7, m2, m0, 6

    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 4
    palignr             m7, m2, m0, 4
    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 + 1 * 32]           ; [17]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr             m6, m0, m3, 2
    palignr             m7, m2, m0, 2
    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m3, [r3 + 11 * 32]          ; [27]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 + 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m3, [r3 + 2 * 32]           ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m3, [r3 - 7 * 32]           ; [9]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 - 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

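    ; fract == 0 row: keep only the even (ref[i]) byte of each pair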
    pand                m3, [pw_00ff]
    pand                m0, [pw_00ff]
    packuswb            m3, m0
    movu                [r0 + r4], m3
    RET

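;-----------------------------------------------------------------------------
; 32x32 angular prediction for HEVC mode 14 (intraPredAngle = -13,
; invAngle = -630); r0 = dst, r1 = dstStride, r2 = reference samples, with
; per-pixel weights from ang32_fact_mode14.
;-----------------------------------------------------------------------------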
cglobal intra_pred_ang32_14, 3,4,9
    movu                m0, [ang32_fact_mode14]
    movu                m1, [ang32_fact_mode14 + mmsize]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode14]
    mova                m8, [ang32_shuf_mode14 + mmsize]
    lea                 r3, [r1 * 3]

    ; prepare for [30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2...]
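    ; (HEVC inverse-angle projection: offset(k) = (k * 630 + 128) >> 8,
    ;  giving 2, 5, 7, 10, 12, 15, 17, 20, 22, 25, 27, 30 for k = 1..12,
    ;  matching the list above)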

    movu                m6, [r2]
    pshufb              m6, [ang32_shuf_mode14 + mmsize*2]
    vpermq              m6, m6, 01110111b
    pslldq              m6, m6, 1
    vbroadcasti128      m3, [r2 + mmsize*2 + 1]

    palignr             m5, m3, m6, 1
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 2
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 3
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 4
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 5
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 6
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 7
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 8
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 9
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 10
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 11
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 12
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 13
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 14
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 15
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    pshufb              m4, m3, m7
    pshufb              m5, m3, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    mova                m6, m3
    vbroadcasti128      m3, [r2 + mmsize*2 + 17]
    palignr             m5, m3, m6, 1
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 2
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 3
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 4
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 5
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 6
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 7
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 8
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 9
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 10
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 11
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 12
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m3, m6, 13
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m5, m3, m6, 14
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 15
    pshufb              m4, m5, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    pshufb              m4, m3, m7
    pshufb              m5, m3, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4
    RET

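;-----------------------------------------------------------------------------
; 32x32 angular prediction for HEVC mode 22 (intraPredAngle = -13), the
; vertical counterpart of mode 14; per-row weights come from ang_table_avx2
; via r3.
;-----------------------------------------------------------------------------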
cglobal intra_pred_ang32_22, 3,5,9
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 + 0]
    movu                m1, [r2 + 1]
    punpckhbw           m2, m0, m1
    punpcklbw           m0, m1

    movu                m4, [r2 + mmsize*2 + 2]
    pshufb              m4, [ang32_shuf_mode22]
    vextracti128        xm8, m4, 1

    palignr             m3, m0, m4, 2
    palignr             m3, m8, 15
    vinserti128         m3, m3, xm2, 1
    vinserti128         m8, m8, xm0, 1
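    ; as in mode 24: m0/m2 are (ref[i], ref[i+1]) pairs for row 0, while
    ; m3/m8 extend the pair window leftward with the projected left-column
    ; samples selected by ang32_shuf_mode22, feeding the palignr steps below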

    pmaddubsw           m4, m0, [r3 + 3 * 32]           ; [19]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 + 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m0, [r3 - 10 * 32]          ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 14
    palignr             m7, m2, m0, 14

    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m0, m3, 12
    palignr             m7, m2, m0, 12

    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m0, m3, 10
    palignr             m7, m2, m0, 10

    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 8
    palignr             m7, m2, m0, 8

    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 + 1 * 32]           ; [17]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m0, m3, 6
    palignr             m7, m2, m0, 6

    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 4
    palignr             m7, m2, m0, 4

    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3]                    ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 2
    palignr             m7, m2, m0, 2

    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m3, [r3 + 12 * 32]          ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m3, [r3 - 1 * 32]           ; [15]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 - 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m3, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 14
    palignr             m7, m0, m3, 14

    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr             m6, m3, m8, 12
    palignr             m7, m0, m3, 12
    pmaddubsw           m4, m6, [r3 + 11 * 32]          ; [27]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m3, m8, 10
    palignr             m7, m0, m3, 10
    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m0, m3, 8
    palignr             m3, m8, 8
    pmaddubsw           m4, m3, [r3 + 10 * 32]          ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m3, [r3 - 3 * 32]           ; [13]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 - 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pand                m3, [pw_00ff]
    pand                m0, [pw_00ff]
    packuswb            m3, m0
    movu                [r0 + r4], m3
    RET
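
; All the ang32 kernels here share one per-pixel filter:
;     p' = ((32 - f) * p[i] + f * p[i+1] + 16) >> 5
; where f is the fractional weight named in the "[f]" comments.  In the
; table-based kernels, [r3 + n * 32] selects the 32-byte ang_table_avx2 row
; holding the byte-pair weights (32 - f, f) for f = 16 + n (r3 points at the
; f = 16 row); pmaddubsw forms the weighted pair sums, and pmulhrsw against
; pw_1024 is the rounding shift, since (x * 1024 * 2 + 0x8000) >> 16 equals
; (x + 16) >> 5.  Annotated sketch of the recurring row pattern (f is
; symbolic; the real code spells out the constant):
;     pmaddubsw   m4, m6, [r3 + (f - 16) * 32] ; (32-f)*p[i] + f*p[i+1] per pair
;     pmulhrsw    m4, m5                       ; m5 = pw_1024: (x + 16) >> 5
;     pmaddubsw   m1, m7, [r3 + (f - 16) * 32] ; same for the remaining pairs
;     pmulhrsw    m1, m5
;     packuswb    m4, m1                       ; clamp words back to bytes
;     movu        [r0], m4                     ; one 32-pixel row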

cglobal intra_pred_ang32_15, 3,4,9
    movu                m0, [ang32_fact_mode15]
    movu                m1, [ang32_fact_mode15 + mmsize]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode15]
    mova                m8, [ang32_shuf_mode15 + mmsize]
    lea                 r3, [r1 * 3]

    ; prepare for [30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11,  9,  8,  6,  4,  2,  0, -1, -2...]
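
; As read from the code: m6 holds the shuffled low reference window and m3
; its continuation from r2 + mmsize*2 (corner sample pinned in from [r2]).
; Each row advances the low-half window (m4) one byte via palignr while the
; high-half window (m5) trails it by two bytes; ang32_shuf_mode15 (+mmsize
; for the high half) expands each window into adjacent byte pairs for
; pmaddubsw against the fixed weights in m0/m1.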

    movu                m6, [r2]
    pshufb              m6, [ang32_shuf_mode15 + mmsize*2]
    vpermq              m6, m6, 01110111b

    movu                xm3, [r2 + mmsize*2]
    pinsrb              xm3, [r2], 0
    vpermq              m3, m3, 01000100b

    palignr             m4, m3, m6, 2
    pshufb              m4, m7
    pshufb              m5, m6, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 3
    pshufb              m4, m7
    palignr             m5, m3, m6, 1
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 4
    pshufb              m4, m7
    palignr             m5, m3, m6, 2
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 5
    pshufb              m4, m7
    palignr             m5, m3, m6, 3
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 6
    pshufb              m4, m7
    palignr             m5, m3, m6, 4
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 7
    pshufb              m4, m7
    palignr             m5, m3, m6, 5
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 8
    pshufb              m4, m7
    palignr             m5, m3, m6, 6
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 9
    pshufb              m4, m7
    palignr             m5, m3, m6, 7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 10
    pshufb              m4, m7
    palignr             m5, m3, m6, 8
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 11
    pshufb              m4, m7
    palignr             m5, m3, m6, 9
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 12
    pshufb              m4, m7
    palignr             m5, m3, m6, 10
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 13
    pshufb              m4, m7
    palignr             m5, m3, m6, 11
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 14
    pshufb              m4, m7
    palignr             m5, m3, m6, 12
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 15
    pshufb              m4, m7
    palignr             m5, m3, m6, 13
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    pshufb              m4, m3, m7
    palignr             m5, m3, m6, 14
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 15
    mova                m6, m3
    vbroadcasti128      m3, [r2 + mmsize*2 + 16]

    palignr             m4, m3, m6, 1
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 2
    pshufb              m4, m7
    pshufb              m5, m6, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 3
    pshufb              m4, m7
    palignr             m5, m3, m6, 1
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 4
    pshufb              m4, m7
    palignr             m5, m3, m6, 2
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 5
    pshufb              m4, m7
    palignr             m5, m3, m6, 3
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 6
    pshufb              m4, m7
    palignr             m5, m3, m6, 4
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 7
    pshufb              m4, m7
    palignr             m5, m3, m6, 5
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 8
    pshufb              m4, m7
    palignr             m5, m3, m6, 6
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 9
    pshufb              m4, m7
    palignr             m5, m3, m6, 7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 10
    pshufb              m4, m7
    palignr             m5, m3, m6, 8
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 11
    pshufb              m4, m7
    palignr             m5, m3, m6, 9
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 12
    pshufb              m4, m7
    palignr             m5, m3, m6, 10
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 13
    pshufb              m4, m7
    palignr             m5, m3, m6, 11
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 14
    pshufb              m4, m7
    palignr             m5, m3, m6, 12
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], m4

    palignr             m4, m3, m6, 15
    pshufb              m4, m7
    palignr             m5, m3, m6, 13
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1], m4

    pshufb              m4, m3, m7
    palignr             m5, m3, m6, 14
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 15
    vbroadcasti128      m6, [r2 + mmsize*2 + 32]

    palignr             m4, m6, m3, 1
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r3], m4
    RET
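
; Mode 21 is the vertical twin of mode 15 (both use angle -17 in HEVC), so
; the fraction sequence below (15, 30, 13, 28, ...) is (row + 1) * 15 mod 32.
; Unlike ang32_15, this kernel takes its weights from the ang_table_avx2
; rows selected through r3, as sketched above.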

cglobal intra_pred_ang32_21, 3,5,9
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 + 0]
    movu                m1, [r2 + 1]
    punpckhbw           m2, m0, m1
    punpcklbw           m0, m1
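
; The unpacks interleave [r2] with [r2 + 1], so m0 holds the adjacent byte
; pairs (p[i], p[i+1]) for pixels 0-7 and 16-23 and m2 the pairs for pixels
; 8-15 and 24-31.  Because the unpacks split per 128-bit lane, the lane-wise
; packuswb at the end of each row lands all 32 pixels back in order with no
; cross-lane permute needed.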

    movu                m4, [r2 + mmsize*2]
    pshufb              m4, [ang32_shuf_mode21]
    vextracti128        xm6, m4, 1

    palignr             m3, m0, m4, 1
    palignr             m8, m3, m6, 1
    vinserti128         m3, m3, xm2, 1
    vinserti128         m8, m8, xm0, 1

    pmaddubsw           m4, m0, [r3 - 1 * 32]           ; [15]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 14
    palignr             m7, m2, m0, 14
    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m0, m3, 12
    palignr             m7, m2, m0, 12
    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 10
    palignr             m7, m2, m0, 10
    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m0, m3, 8
    palignr             m7, m2, m0, 8

    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 6
    palignr             m7, m2, m0, 6
    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m0, m3, 4
    palignr             m7, m2, m0, 4
    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 2
    palignr             m7, m2, m0, 2
    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m3, [r3]                    ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 16 to 23
    palignr             m6, m3, m8, 14
    palignr             m7, m0, m3, 14
    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 12
    palignr             m7, m0, m3, 12
    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m3, m8, 10
    palignr             m7, m0, m3, 10
    pmaddubsw           m4, m6, [r3 + 11 * 32]          ; [27]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 8
    palignr             m7, m0, m3, 8
    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr             m6, m3, m8, 6
    palignr             m7, m0, m3, 6
    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 4
    palignr             m7, m0, m3, 4
    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m3, m8, 2
    palignr             m7, m0, m3, 2
    pmaddubsw           m4, m6, [r3 + 3 * 32]           ; [19]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m8, [r3 + 1 * 32]           ; [17]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pand                m8, [pw_00ff]
    pand                m3, [pw_00ff]
    packuswb            m8, m3
    movu                [r0 + r4], m8
    RET
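
; Mode 16 (angle -21) follows the same gather/shuffle scheme as ang32_15,
; but its high-half window (m5) trails the low-half one (m4) by eleven
; bytes instead of two, so the extra staging register m9 keeps those
; earlier reference bytes live; see the note after the first row below for
; the vpermq fix-up this kernel also needs.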

cglobal intra_pred_ang32_16, 3,4,10
    movu                m0, [ang32_fact_mode16]
    movu                m1, [ang32_fact_mode16 + mmsize]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode16]
    mova                m8, [ang32_shuf_mode16 + mmsize]
    lea                 r3, [r1 * 3]

    ; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2,  0, -1, -2...]

    movu                m6, [r2]
    pshufb              m6, [ang32_shuf_mode16 + mmsize*2]
    mova                m9, m6
    mova                m3, [ang32_shuf_mode16 + mmsize*3]
    vpermd              m6, m3, m6
    vpermq              m9, m9, q3232
    pslldq              m9, 4
    palignr             m6, m9, 15
    pslldq              m9, 1

    vbroadcasti128      m3, [r2 + mmsize*2 + 1]

    palignr             m4, m3, m6, 1
    palignr             m5, m6, m9, 6
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4
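
; On ymm registers packuswb packs within each 128-bit lane, leaving the row
; as the quadwords [m4.lo, m5.lo, m4.hi, m5.hi]; the vpermq with q3120
; swaps the middle quadwords so the 16 pixels from m4 precede the 16 from
; m5, restoring row order.  The same fix-up follows every row below.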

    palignr             m4, m3, m6, 2
    palignr             m5, m6, m9, 7
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 3
    palignr             m5, m6, m9, 8
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 4
    palignr             m5, m6, m9, 9
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 5
    palignr             m5, m6, m9, 10
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m3, m6, 6
    palignr             m5, m6, m9, 11
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 7
    palignr             m5, m6, m9, 12
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 8
    palignr             m5, m6, m9, 13
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 9
    palignr             m5, m6, m9, 14
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m3, m6, 10
    palignr             m5, m6, m9, 15
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 11
    pshufb              m4, m7
    pshufb              m5, m6, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 12
    palignr             m5, m3, m6, 1
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 13
    palignr             m5, m3, m6, 2
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m3, m6, 14
    palignr             m5, m3, m6, 3
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 15
    palignr             m5, m3, m6, 4
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m5, m3, m6, 5
    pshufb              m4, m3, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    vbroadcasti128      m9, [r2 + mmsize*2 + 17]

    palignr             m4, m9, m3, 1
    palignr             m5, m3, m6, 6
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m9, m3, 2
    palignr             m5, m3, m6, 7
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m9, m3, 3
    palignr             m5, m3, m6, 8
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m9, m3, 4
    palignr             m5, m3, m6, 9
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m9, m3, 5
    palignr             m5, m3, m6, 10
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m9, m3, 6
    palignr             m5, m3, m6, 11
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m9, m3, 7
    palignr             m5, m3, m6, 12
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m9, m3, 8
    palignr             m5, m3, m6, 13
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m9, m3, 9
    palignr             m5, m3, m6, 14
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m9, m3, 10
    palignr             m5, m3, m6, 15
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m9, m3, 11
    pshufb              m4, m7
    pshufb              m5, m3, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m9, m3, 12
    palignr             m5, m9, m3, 1
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m9, m3, 13
    palignr             m5, m9, m3, 2
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m9, m3, 14
    palignr             m5, m9, m3, 3
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m9, m3, 15
    palignr             m5, m9, m3, 4
    pshufb              m4, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m5, m9, m3, 5
    pshufb              m4, m9, m7
    pshufb              m5, m8
    pmaddubsw           m4, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4
    RET
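
; Mode 20 shares angle -21 with mode 16 but belongs to the vertical family,
; so it is laid out like ang32_21: the prologue shuffles the second
; reference array into place (m3/m8/m9 holding progressively earlier
; windows), and each row takes its (32 - f, f) weights from ang_table_avx2.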

cglobal intra_pred_ang32_20, 3,5,10
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 + 0]
    movu                m1, [r2 + 1]
    punpckhbw           m2, m0, m1
    punpcklbw           m0, m1

    movu                m4, [r2 + mmsize*2]
    pshufb              m4, [ang32_shuf_mode20]
    mova                m9, m4
    vpermq              m9, m9, q3333
    mova                m7, m4
    vpermq              m7, m7, q1111
    palignr             m4, m7, 14
    pshufb              m4, [ang32_shuf_mode20 + mmsize]

    vextracti128        xm6, m4, 1
    palignr             m3, m0, m4, 1
    palignr             m8, m3, m6, 1
    vinserti128         m3, m3, xm2, 1
    vinserti128         m8, m8, xm0, 1
    vinserti128         m9, m9, xm3, 1

    pmaddubsw           m4, m0, [r3 - 5 * 32]           ; [11]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 14
    palignr             m7, m2, m0, 14
    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m0, m3, 12
    palignr             m7, m2, m0, 12
    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m0, m3, 10
    palignr             m7, m2, m0, 10
    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 8
    palignr             m7, m2, m0, 8
    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m0, m3, 6
    palignr             m7, m2, m0, 6
    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 4
    palignr             m7, m2, m0, 4
    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 2
    palignr             m7, m2, m0, 2
    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m3, [r3 - 1 * 32]           ; [15]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 - 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m3, m8, 14
    palignr             m7, m0, m3, 14
    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m3, m8, 12
    palignr             m7, m0, m3, 12
    pmaddubsw           m4, m6, [r3]                    ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 16 to 23
    palignr             m6, m3, m8, 10
    palignr             m7, m0, m3, 10
    pmaddubsw           m4, m6, [r3 + 11 * 32]          ; [27]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 11 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 8
    palignr             m7, m0, m3, 8
    pmaddubsw           m4, m6, [r3 + 1 * 32]           ; [17]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 1 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m3, m8, 6
    palignr             m7, m0, m3, 6
    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 9 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m3, m8, 4
    palignr             m7, m0, m3, 4
    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 2
    palignr             m7, m0, m3, 2
    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 13 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    pmaddubsw           m4, m8, [r3 + 3 * 32]           ; [19]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 3 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m8, m9, 14
    palignr             m7, m3, m8, 14
    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 7 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    palignr             m6, m8, m9, 12
    palignr             m7, m3, m8, 12
    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m8, m9, 10
    palignr             m7, m3, m8, 10
    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 15 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m8, m9, 8
    palignr             m7, m3, m8, 8
    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 5 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

    pand                m6, [pw_00ff]
    pand                m7, [pw_00ff]
    packuswb            m6, m7
    movu                [r0 + r4], m6
    RET

cglobal intra_pred_ang32_17, 3,4,8
    movu                m0, [ang32_fact_mode17]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode17]
    lea                 r3, [r1 * 3]

    ; prepare for [31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0, -1, -2...]
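
; Unlike modes 15 and 16, mode 17 gets by with a single weight vector (m0)
; and a single pair shuffle (m7) for both halves of every row; only the
; palignr windows differ, the high half (m5) trailing the low half (m4) by
; thirteen bytes.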

    movu                m6, [r2]
    pshufb              m6, [ang32_shuf_mode17 + mmsize]
    mova                m1, m6
    mova                m3, [ang32_shuf_mode16 + mmsize*3]
    vpermd              m6, m3, m6
    vpermq              m1, m1, q3232
    pslldq              m1, 4

    movu               xm4, [r2 + mmsize*2]
    pinsrb             xm4, [r2], 0
    vinserti128         m3, m4, xm4, 1

    palignr             m4, m3, m6, 2
    palignr             m5, m6, m1, 5
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m3, m6, 3
    palignr             m5, m6, m1, 6
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 4
    palignr             m5, m6, m1, 7
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 5
    palignr             m5, m6, m1, 8
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 6
    palignr             m5, m6, m1, 9
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m3, m6, 7
    palignr             m5, m6, m1, 10
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 8
    palignr             m5, m6, m1, 11
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 9
    palignr             m5, m6, m1, 12
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 10
    palignr             m5, m6, m1, 13
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m3, m6, 11
    palignr             m5, m6, m1, 14
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m3, m6, 12
    palignr             m5, m6, m1, 15
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m3, m6, 13
    pshufb              m4, m7
    pshufb              m5, m6, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m3, m6, 14
    palignr             m5, m3, m6, 1
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m3, m6, 15
    palignr             m5, m3, m6, 2
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m5, m3, m6, 3
    pshufb              m4, m3, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    vbroadcasti128      m1, [r2 + mmsize*2 + 16]
    palignr             m4, m1, m3, 1
    palignr             m5, m3, m6, 4
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m1, m3, 2
    palignr             m5, m3, m6, 5
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m1, m3, 3
    palignr             m5, m3, m6, 6
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m1, m3, 4
    palignr             m5, m3, m6, 7
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m1, m3, 5
    palignr             m5, m3, m6, 8
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m1, m3, 6
    palignr             m5, m3, m6, 9
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m1, m3, 7
    palignr             m5, m3, m6, 10
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m1, m3, 8
    palignr             m5, m3, m6, 11
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m1, m3, 9
    palignr             m5, m3, m6, 12
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m1, m3, 10
    palignr             m5, m3, m6, 13
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m1, m3, 11
    palignr             m5, m3, m6, 14
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    palignr             m4, m1, m3, 12
    palignr             m5, m3, m6, 15
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m1, m3, 13
    pshufb              m4, m7
    pshufb              m5, m3, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m4, m1, m3, 14
    palignr             m5, m1, m3, 1
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0], m4

    palignr             m4, m1, m3, 15
    palignr             m5, m1, m3, 2
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1], m4

    vbroadcasti128      m6, [r2 + mmsize*2 + 32]
    palignr             m5, m1, m3, 3
    pshufb              m4, m1, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r1 * 2], m4

    palignr             m4, m6, m1, 1
    palignr             m5, m1, m3, 4
    pshufb              m4, m7
    pshufb              m5, m7
    pmaddubsw           m4, m0
    pmaddubsw           m5, m0
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    vpermq              m4, m4, q3120
    movu                [r0 + r3], m4
    RET

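; Mode 19 (vertical family, angle -26, invAngle -315): the left neighbours must
; be projected onto the main (top) reference. The vpermd/pshufb pass with the
; ang32_shuf_mode19 tables builds that extended reference in m4/m9; each row
; then selects its window with palignr while the fraction cycles through
; (-26 * row) & 31 = 6, 12, 18, 24, 30, 4, ... as annotated per row below.
; Uses m8/m9, hence the ARCH_X86_64 guard closed after this kernel.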
cglobal intra_pred_ang32_19, 3,5,10
    lea                 r3, [ang_table_avx2 + 32 * 16]
    lea                 r4, [r1 * 3]
    mova                m5, [pw_1024]

    ; rows 0 to 7
    movu                m0, [r2 + 0]
    movu                m1, [r2 + 1]
    punpckhbw           m2, m0, m1
    punpcklbw           m0, m1

    movu                m4, [r2 + mmsize*2]
    pshufb              m4, [ang32_shuf_mode17 + mmsize*1]
    mova                m3, [ang32_shuf_mode19 + mmsize*1]
    mova                m6, [ang32_shuf_mode19 + mmsize*2]
    mova                m9, m4
    vpermd              m4, m3, m4
    vpermd              m9, m6, m9
    pshufb              m4, [ang32_shuf_mode19]
    pshufb              m9, [ang32_shuf_mode19]

    vextracti128       xm6, m4, 1
    palignr             m3, m0, m4, 1
    palignr             m8, m3, m6, 1
    palignr             m7, m8, m9, 1
    vinserti128         m3, m3, xm2, 1
    vinserti128         m8, m8, xm0, 1
    vinserti128         m9, m7, xm3, 1

    pmaddubsw           m4, m0, [r3 - 10 * 32]          ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m2, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m0, m3, 14
    palignr             m7, m2, m0, 14
    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 12
    palignr             m7, m2, m0, 12
    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m0, m3, 10
    palignr             m7, m2, m0, 10
    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m0, m3, 8
    palignr             m7, m2, m0, 8
    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m0, m3, 6
    palignr             m7, m2, m0, 6
    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m0, m3, 4
    palignr             m7, m2, m0, 4
    pmaddubsw           m4, m6, [r3]                    ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 8 to 15
    palignr             m6, m0, m3, 2
    palignr             m7, m2, m0, 2
    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m3, [r3 + 12 * 32]          ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m3, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m0, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m3, m8, 14
    palignr             m7, m0, m3, 14
    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m3, m8, 12
    palignr             m7, m0, m3, 12
    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m3, m8, 10
    palignr             m7, m0, m3, 10
    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 8
    palignr             m7, m0, m3, 8
    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

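    ; fract == 0 for this row: the prediction is the integer reference sample
    ; itself, so keep the low byte of each (ref[i], ref[i+1]) word and repack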
    pand                m6, [pw_00ff]
    pand                m7, [pw_00ff]
    packuswb            m6, m7
    movu                [r0 + r4], m6

    lea                 r0, [r0 + r1 * 4]

    ; rows 16 to 23
    palignr             m6, m3, m8, 6
    palignr             m7, m0, m3, 6
    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m3, m8, 4
    palignr             m7, m0, m3, 4
    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m3, m8, 2
    palignr             m7, m0, m3, 2
    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    pmaddubsw           m4, m8, [r3 + 8 * 32]           ; [24]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m3, [r3 + 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    palignr             m6, m8, m9, 14
    palignr             m7, m3, m8, 14
    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m8, m9, 12
    palignr             m7, m3, m8, 12
    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m8, m9, 10
    palignr             m7, m3, m8, 10
    pmaddubsw           m4, m6, [r3]                    ; [16]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr             m6, m8, m9, 8
    palignr             m7, m3, m8, 8
    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 6 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    palignr             m6, m8, m9, 6
    palignr             m7, m3, m8, 6
    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 12 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 14 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1*2], m4

    palignr             m6, m8, m9, 4
    palignr             m7, m3, m8, 4
    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 8 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r4], m4

    lea                 r0, [r0 + r1 * 4]

    vpbroadcastb        m0, [r2 + mmsize*2 + 31]
    palignr             m1, m9, m0, 1
    vinserti128         m0, m1, xm8, 1

    palignr             m6, m8, m9, 2
    palignr             m7, m3, m8, 2
    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 - 2 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0], m4

    pmaddubsw           m4, m9, [r3 + 4 * 32]           ; [20]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m8, [r3 + 4 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1], m4

    palignr             m6, m9, m0, 14
    palignr             m7, m8, m9, 14
    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
    pmulhrsw            m4, m5
    pmaddubsw           m1, m7, [r3 + 10 * 32]
    pmulhrsw            m1, m5
    packuswb            m4, m1
    movu                [r0 + r1 * 2], m4

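    ; again a fract == 0 row: copy the integer reference samples directly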
    pand                m6, [pw_00ff]
    pand                m7, [pw_00ff]
    packuswb            m6, m7
    movu                [r0 + r4], m6
    RET

%endif  ; ARCH_X86_64
;-----------------------------------------------------------------------------------------
; end of intra_pred_ang32 angular modes avx2 asm
;-----------------------------------------------------------------------------------------

;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
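; All kernels below evaluate the standard HEVC angular interpolation per pixel:
;     pred[x] = (ref[idx + x] * (32 - fract) + ref[idx + x + 1] * fract + 16) >> 5
; pmaddubsw with byte-pair weights (32 - fract, fract) forms the weighted sum,
; and pmulhrsw with pw_1024 performs the rounded shift, since
;     (a * 1024 + 0x4000) >> 15  ==  (a + 16) >> 5
; e.g. ref pair (100, 104) with fract 6: 26*100 + 6*104 = 3224, (3224+16)>>5 = 101.
;
; A minimal C sketch of what each kernel computes (for reference only; names
; are illustrative, not part of the x265 API):
;     for (int y = 0; y < size; y++) {
;         int pos = (y + 1) * angle, idx = pos >> 5, fract = pos & 31;
;         for (int x = 0; x < size; x++)
;             dst[y * dstStride + x] = (uint8_t)((ref[idx + x] * (32 - fract)
;                                      + ref[idx + x + 1] * fract + 16) >> 5);
;     }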
INIT_YMM avx2
cglobal intra_pred_ang8_3, 3,4,5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 17]

    pshufb            m1, m0, [c_ang8_src1_9_2_10]
    pshufb            m2, m0, [c_ang8_src3_11_4_12]
    pshufb            m4, m0, [c_ang8_src5_13_5_13]
    pshufb            m0,     [c_ang8_src6_14_7_15]

    pmaddubsw         m1, [c_ang8_26_20]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_14_8]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_2_28]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_22_16]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

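; Mode 33 is the vertical mirror of mode 3 (the pairs k and 36-k share the same
; per-row coefficients): it reads the top references at r2 + 1 instead of the
; left ones at r2 + 17 and can store its rows directly, so the punpck*/trans8_shuf
; transpose used by mode 3 is not needed. The same pairing repeats for the
; 4/32, 5/31, 6/30, 7/29, 8/28 and 9/27 kernels below.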
INIT_YMM avx2
cglobal intra_pred_ang8_33, 3,4,5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 1]

    pshufb            m1, m0, [c_ang8_src1_9_2_10]
    pshufb            m2, m0, [c_ang8_src3_11_4_12]
    pshufb            m4, m0, [c_ang8_src5_13_5_13]
    pshufb            m0,     [c_ang8_src6_14_7_15]

    pmaddubsw         m1, [c_ang8_26_20]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_14_8]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_2_28]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_22_16]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_4, 3,4,5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 17]

    pshufb            m1, m0, [c_ang8_src1_9_2_10]
    pshufb            m2, m0, [c_ang8_src2_10_3_11]
    pshufb            m4, m0, [c_ang8_src4_12_4_12]
    pshufb            m0,     [c_ang8_src5_13_6_14]

    pmaddubsw         m1, [c_ang8_21_10]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_31_20]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_9_30]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_19_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_32, 3,4,5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 1]

    pshufb            m1, m0, [c_ang8_src1_9_2_10]
    pshufb            m2, m0, [c_ang8_src2_10_3_11]
    pshufb            m4, m0, [c_ang8_src4_12_4_12]
    pshufb            m0,     [c_ang8_src5_13_6_14]

    pmaddubsw         m1, [c_ang8_21_10]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_31_20]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_9_30]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_19_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET


INIT_YMM avx2
cglobal intra_pred_ang8_5, 3, 4, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 17]

    pshufb            m1, m0, [c_ang8_src1_9_2_10]
    pshufb            m2, m0, [c_ang8_src2_10_3_11]
    pshufb            m4, m0, [c_ang8_src3_11_4_12]
    pshufb            m0,     [c_ang8_src4_12_5_13]

    pmaddubsw         m1, [c_ang8_17_2]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_19_4]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_21_6]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_23_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_31, 3, 4, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 1]

    pshufb            m1, m0, [c_ang8_src1_9_2_10]
    pshufb            m2, m0, [c_ang8_src2_10_3_11]
    pshufb            m4, m0, [c_ang8_src3_11_4_12]
    pshufb            m0,     [c_ang8_src4_12_5_13]

    pmaddubsw         m1, [c_ang8_17_2]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_19_4]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_21_6]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_23_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET


INIT_YMM avx2
cglobal intra_pred_ang8_6, 3, 4, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 17]

    pshufb            m1, m0, [intra_pred_shuff_0_8]
    pshufb            m2, m0, [c_ang8_src2_10_2_10]
    pshufb            m4, m0, [c_ang8_src3_11_3_11]
    pshufb            m0,     [c_ang8_src3_11_4_12]

    pmaddubsw         m1, [c_ang8_13_26]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_7_20]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_1_14]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_27_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_30, 3, 4, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 1]

    pshufb            m1, m0, [intra_pred_shuff_0_8]
    pshufb            m2, m0, [c_ang8_src2_10_2_10]
    pshufb            m4, m0, [c_ang8_src3_11_3_11]
    pshufb            m0,     [c_ang8_src3_11_4_12]

    pmaddubsw         m1, [c_ang8_13_26]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_7_20]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_1_14]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_27_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET


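; Modes 9/27 (angle -/+2): the integer reference offset never advances within
; eight rows (8 * 2 = 16 < 32), so one source row shuffled by
; intra_pred_shuff_0_8 feeds all four coefficient blocks of c_ang8_mode_27;
; only the per-row fractions (2, 4, ..., 16) change.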
INIT_YMM avx2
cglobal intra_pred_ang8_9, 3, 5, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 17]

    pshufb            m0, [intra_pred_shuff_0_8]

    lea               r4, [c_ang8_mode_27]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_27, 3, 5, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 1]

    pshufb            m0, [intra_pred_shuff_0_8]

    lea               r4, [c_ang8_mode_27]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_25, 3, 5, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2]

    pshufb            m0, [intra_pred_shuff_0_8]

    lea               r4, [c_ang8_mode_25]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET


INIT_YMM avx2
cglobal intra_pred_ang8_7, 3, 4, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 17]

    pshufb            m1, m0, [intra_pred_shuff_0_8]
    pshufb            m2, m0, [c_ang8_src1_9_2_10]
    pshufb            m4, m0, [c_ang8_src2_10_2_10]
    pshufb            m0,     [c_ang8_src2_10_3_11]

    pmaddubsw         m1, [c_ang8_9_18]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_27_4]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_13_22]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_31_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_29, 3, 4, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 1]

    pshufb            m1, m0, [intra_pred_shuff_0_8]
    pshufb            m2, m0, [c_ang8_src1_9_2_10]
    pshufb            m4, m0, [c_ang8_src2_10_2_10]
    pshufb            m0,     [c_ang8_src2_10_3_11]

    pmaddubsw         m1, [c_ang8_9_18]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_27_4]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_13_22]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_31_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET


INIT_YMM avx2
cglobal intra_pred_ang8_8, 3, 4, 6
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 17]
    mova              m5, [intra_pred_shuff_0_8]

    pshufb            m1, m0, m5
    pshufb            m2, m0, m5
    pshufb            m4, m0, m5
    pshufb            m0,     [c_ang8_src2_10_2_10]

    pmaddubsw         m1, [c_ang8_5_10]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_15_20]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_25_30]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_3_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_28, 3, 4, 6
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2 + 1]
    mova              m5, [intra_pred_shuff_0_8]

    pshufb            m1, m0, m5
    pshufb            m2, m0, m5
    pshufb            m4, m0, m5
    pshufb            m0,     [c_ang8_src2_10_2_10]

    pmaddubsw         m1, [c_ang8_5_10]
    pmulhrsw          m1, m3
    pmaddubsw         m2, [c_ang8_15_20]
    pmulhrsw          m2, m3
    pmaddubsw         m4, [c_ang8_25_30]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [c_ang8_3_8]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET


INIT_YMM avx2
cglobal intra_pred_ang8_11, 3, 5, 5
    mova              m3, [pw_1024]
    movu              xm1, [r2 + 16]
    pinsrb            xm1, [r2], 0
    pshufb            xm1, [intra_pred_shuff_0_8]
    vinserti128       m0, m1, xm1, 1

    lea               r4, [c_ang8_mode_25]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

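; Mode 15 (angle -17, invAngle -482) needs the above row projected onto the
; left reference: (i * 482 + 128) >> 8 gives 2, 4, 6, 8 for i = 1..4, so the
; pslldq/pinsrb chain below prepends src[2], src[4], src[6] and src[8] one at
; a time, rebuilding the shifted reference in both ymm lanes every two rows.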
INIT_YMM avx2
cglobal intra_pred_ang8_15, 3, 6, 6
    mova              m3, [pw_1024]
    movu              xm5, [r2 + 16]
    pinsrb            xm5, [r2], 0
    lea               r5, [intra_pred_shuff_0_8]
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 2], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]

    lea               r4, [c_ang8_mode_15]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 4], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 6], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 8], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_16, 3,4,7
    lea                 r0, [r0 + r1 * 8]
    sub                 r0, r1
    neg                 r1
    lea                 r3, [r1 * 3]
    vbroadcasti128      m0, [angHor8_tab_16]            ; m0 = factor
    mova                m1, [intra_pred8_shuff16]       ; m1 = row shuffle masks (4 rows)
    movu                m2, [intra_pred8_shuff16 + 8]   ; m2 = row shuffle masks (4 rows)

    ; prepare reference pixel
    movq                xm3, [r2 + 16 + 1]              ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x]
    movhps              xm3, [r2 + 2]                   ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x]
    pslldq              xm3, 1
    pinsrb              xm3, [r2], 0                    ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8]
    pshufb              xm3, [c_ang8_mode_16]
    vinserti128         m3, m3, xm3, 1                  ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1  0 2 3 5 6 8]

    ; process 4 rows
    pshufb              m4, m3, m1
    pshufb              m5, m3, m2
    psrldq              m3, 4
    punpcklbw           m6, m5, m4
    punpckhbw           m5, m4
    pmaddubsw           m6, m0
    pmulhrsw            m6, [pw_1024]
    pmaddubsw           m5, m0
    pmulhrsw            m5, [pw_1024]
    packuswb            m6, m5
    vextracti128        xm5, m6, 1
    movq                [r0], xm6
    movhps              [r0 + r1], xm6
    movq                [r0 + r1 * 2], xm5
    movhps              [r0 + r3], xm5

    ; process 4 rows
    lea                 r0, [r0 + r1 * 4]
    pshufb              m4, m3, m1
    pshufb              m5, m3, m2
    punpcklbw           m6, m5, m4
    punpckhbw           m5, m4
    pmaddubsw           m6, m0
    pmulhrsw            m6, [pw_1024]
    pmaddubsw           m5, m0
    pmulhrsw            m5, [pw_1024]
    packuswb            m6, m5
    vextracti128        xm5, m6, 1
    movq                [r0], xm6
    movhps              [r0 + r1], xm6
    movq                [r0 + r1 * 2], xm5
    movhps              [r0 + r3], xm5
    RET

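; Two variants of ang8_20 follow; with "%if 1" only the first, table-driven
; version is assembled, and the %else branch is excluded from the build.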
%if 1
INIT_YMM avx2
cglobal intra_pred_ang8_20, 3,5,6
    lea                 r0, [r0 + r1 * 8]
    sub                 r0, r1
    neg                 r1
    lea                 r3, [angHor8_tab_20]
    lea                 r4, [r1 * 3]
    movu                m5, [intra_pred_shuff_0_8 + 16]

    ; prepare reference pixel
    movq                xm1, [r2 + 1]                   ; m1 = [ 1  2  3  4  5  6  7  8  x  x x  x  x  x  x  x]
    movhps              xm1, [r2 + 16 + 2]              ; m1 = [ 1  2  3  4  5  6  7  8 -2 -3 x -5 -6  x -8  x]
    palignr             xm1, xm1, [r2 - 15], 15         ; m1 = [ 0  1  2  3  4  5  6  7  8 -2 -3 x -5 -6  x -8]
    pshufb              xm1, [c_ang8_mode_20]
    vinserti128         m1, m1, xm1, 1

    ; process 4 rows
    pshufb              m3, m1, m5
    psrldq              m1, 2
    pmaddubsw           m3, [r3 + 0 * 16]
    pmulhrsw            m3, [pw_1024]

    pshufb              m4, m1, [intra_pred_shuff_0_8]
    psrldq              m1, 1
    pmaddubsw           m4, [r3 + 2 * 16]
    pmulhrsw            m4, [pw_1024]

    packuswb            m3, m4
    vextracti128        xm4, m3, 1
    movq                [r0], xm3
    movq                [r0 + r1], xm4
    movhps              [r0 + r1 * 2], xm3
    movhps              [r0 + r4], xm4

    ; process 4 rows
    lea                 r0, [r0 + r1 * 4]
    pshufb              m3, m1, m5
    psrldq              m1, 1
    pmaddubsw           m3, [r3 + 4 * 16]
    pmulhrsw            m3, [pw_1024]

    pshufb              m4, m1, m5
    pmaddubsw           m4, [r3 + 6 * 16]
    pmulhrsw            m4, [pw_1024]

    packuswb            m3, m4
    vextracti128        xm4, m3, 1
    movq                [r0], xm3
    movq                [r0 + r1], xm4
    movhps              [r0 + r1 * 2], xm3
    movhps              [r0 + r4], xm4
    RET

%else
INIT_YMM avx2
cglobal intra_pred_ang8_20, 3, 6, 6
    mova              m3, [pw_1024]
    movu              xm5, [r2]
    lea               r5, [intra_pred_shuff_0_8]
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 2 + 16], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]

    lea               r4, [c_ang8_mode_20]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 3 + 16], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 5 + 16], 0
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 6 + 16], 0
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 8 + 16], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3

    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET
%endif

INIT_YMM avx2
cglobal intra_pred_ang8_21, 3, 6, 6
    mova              m3, [pw_1024]
    movu              xm5, [r2]
    lea               r5, [intra_pred_shuff_0_8]
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 2 + 16], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]

    lea               r4, [c_ang8_mode_15]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 4 + 16], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 6 + 16], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    mova              xm0, xm5
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 8 + 16], 0
    vinserti128       m0, m0, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_22, 3, 6, 6
    mova              m3, [pw_1024]
    movu              xm5, [r2]
    lea               r5, [intra_pred_shuff_0_8]
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]

    lea               r4, [c_ang8_mode_14]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 2 + 16], 0
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 5 + 16], 0
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 7 + 16], 0
    pshufb            xm5, [r5]
    vinserti128       m0, m0, xm5, 1
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_14, 3, 6, 6
    mova              m3, [pw_1024]
    movu              xm5, [r2 + 16]
    pinsrb            xm5, [r2], 0
    lea               r5, [intra_pred_shuff_0_8]
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]

    lea               r4, [c_ang8_mode_14]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 2], 0
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 5], 0
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 7], 0
    pshufb            xm5, [r5]
    vinserti128       m0, m0, xm5, 1
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_13, 3, 6, 6
    mova              m3, [pw_1024]
    movu              xm5, [r2 + 16]
    pinsrb            xm5, [r2], 0
    lea               r5, [intra_pred_shuff_0_8]
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]

    lea               r4, [c_ang8_mode_13]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 4], 0
    pshufb            xm4, xm5, [r5]
    vinserti128       m0, m0, xm4, 1
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    vinserti128       m0, m0, xm4, 0
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 7], 0
    pshufb            xm5, [r5]
    vinserti128       m0, m0, xm5, 1
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET


INIT_YMM avx2
cglobal intra_pred_ang8_23, 3, 6, 6
    mova              m3, [pw_1024]
    movu              xm5, [r2]
    lea               r5, [intra_pred_shuff_0_8]
    vinserti128       m0, m5, xm5, 1
    pshufb            m0, [r5]

    lea               r4, [c_ang8_mode_13]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 4 + 16], 0
    pshufb            xm4, xm5, [r5]
    vinserti128       m0, m0, xm4, 1
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    vinserti128       m0, m0, xm4, 0
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pslldq            xm5, 1
    pinsrb            xm5, [r2 + 7 + 16], 0
    pshufb            xm5, [r5]
    vinserti128       m0, m0, xm5, 1
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3

    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_12, 3, 5, 5
    mova              m3, [pw_1024]
    movu              xm1, [r2 + 16]
    pinsrb            xm1, [r2], 0
    pshufb            xm1, [intra_pred_shuff_0_8]
    vinserti128       m0, m1, xm1, 1

    lea               r4, [c_ang8_mode_24]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pslldq            xm0, 2
    pinsrb            xm0, [r2 + 6], 0
    pinsrb            xm0, [r2 + 0], 1
    vinserti128       m0, m0, xm0, 1
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    vperm2i128        m2, m1, m4, 00100000b
    vperm2i128        m1, m1, m4, 00110001b
    punpcklbw         m4, m2, m1
    punpckhbw         m2, m1
    punpcklwd         m1, m4, m2
    punpckhwd         m4, m2
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4

    lea               r3, [3 * r1]
    movq              [r0], xm1
    movhps            [r0 + r1], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    movhps            [r0 + r1], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + 2 * r1], xm2
    movhps            [r0 + r3], xm2
    RET

INIT_YMM avx2
cglobal intra_pred_ang8_24, 3, 5, 5
    mova              m3, [pw_1024]
    vbroadcasti128    m0, [r2]

    pshufb            m0, [intra_pred_shuff_0_8]

    lea               r4, [c_ang8_mode_24]
    pmaddubsw         m1, m0, [r4]
    pmulhrsw          m1, m3
    pmaddubsw         m2, m0, [r4 + mmsize]
    pmulhrsw          m2, m3
    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
    pmulhrsw          m4, m3
    pslldq            xm0, 2
    pinsrb            xm0, [r2 + 16 + 6], 0
    pinsrb            xm0, [r2 + 0], 1
    vinserti128       m0, m0, xm0, 1
    pmaddubsw         m0, [r4 + 3 * mmsize]
    pmulhrsw          m0, m3
    packuswb          m1, m2
    packuswb          m4, m0

    lea               r3, [3 * r1]
    movq              [r0], xm1
    vextracti128      xm2, m1, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm1
    movhps            [r0 + r3], xm2
    lea               r0, [r0 + 4 * r1]
    movq              [r0], xm4
    vextracti128      xm2, m4, 1
    movq              [r0 + r1], xm2
    movhps            [r0 + 2 * r1], xm4
    movhps            [r0 + r3], xm2
    RET

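; Helper macros for the 16-wide angular kernels below:
;   INTRA_PRED_ANG16_MC0 dstA, dstB, n - weight m1/m2 with coefficient row n,
;                                        round, pack and store 16 bytes to each
;   INTRA_PRED_ANG16_MC1 n             - two MC0 calls (coeff rows n, n+1)
;                                        writing four rows at r0 .. r0+r3
;   INTRA_PRED_ANG16_MC2 off           - load and pair-shuffle (via m5) the two
;                                        16-byte references at r2+off, r2+off+8
;   INTRA_PRED_ANG16_MC3/MC4           - single-row and carry-over variants that
;                                        merge the lanes with vperm2i128 first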
%macro INTRA_PRED_ANG16_MC0 3
    pmaddubsw         m3, m1, [r4 + %3 * mmsize]
    pmulhrsw          m3, m0
    pmaddubsw         m4, m2, [r4 + %3 * mmsize]
    pmulhrsw          m4, m0
    packuswb          m3, m4
    movu              [%1], xm3
    vextracti128      xm4, m3, 1
    movu              [%2], xm4
%endmacro

%macro INTRA_PRED_ANG16_MC1 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, %1
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, (%1 + 1)
%endmacro

%macro INTRA_PRED_ANG16_MC2 1
    vbroadcasti128    m1, [r2 + %1]
    pshufb            m1, m5
    vbroadcasti128    m2, [r2 + (%1 + 8)]
    pshufb            m2, m5
%endmacro

%macro INTRA_PRED_ANG16_MC3 2
    vperm2i128        m1, m1, m2, 00100000b
    pmaddubsw         m3, m1, [r4 + (%2 * mmsize)]
    pmulhrsw          m3, m0
    packuswb          m3, m3
    vpermq            m3, m3, 11011000b
    movu              [%1], xm3
%endmacro

%macro INTRA_PRED_ANG16_MC4 3
    vperm2i128        m1, m1, m2, 00100000b
    pmaddubsw         m4, m1, [r4 + (%3 * mmsize)]
    pmulhrsw          m4, m0
    packuswb          m3, m4
    vpermq            m3, m3, 11011000b
    movu              [%1], xm3
    vextracti128      xm3, m3, 1
    movu              [%2], xm3
%endmacro

%if ARCH_X86_64 == 1
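; Transpose-and-store for the horizontal 16x16 kernels: the sixteen result rows
; arrive two per register in m0-m7; three unpack stages (bytes -> words ->
; dwords) plus vpermq 0xD8 complete the 16x16 byte transpose (m8 is scratch),
; and the rows are written 16 bytes at a time with strides r1 and r3 = 3*r1.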
%macro INTRA_PRED_TRANS_STORE_16x16 0
    punpcklbw    m8, m0, m1
    punpckhbw    m0, m1

    punpcklbw    m1, m2, m3
    punpckhbw    m2, m3

    punpcklbw    m3, m4, m5
    punpckhbw    m4, m5

    punpcklbw    m5, m6, m7
    punpckhbw    m6, m7

    punpcklwd    m7, m8, m1
    punpckhwd    m8, m1

    punpcklwd    m1, m3, m5
    punpckhwd    m3, m5

    punpcklwd    m5, m0, m2
    punpckhwd    m0, m2

    punpcklwd    m2, m4, m6
    punpckhwd    m4, m6

    punpckldq    m6, m7, m1
    punpckhdq    m7, m1

    punpckldq    m1, m8, m3
    punpckhdq    m8, m3

    punpckldq    m3, m5, m2
    punpckhdq    m5, m2

    punpckldq    m2, m0, m4
    punpckhdq    m0, m4

    vpermq       m6, m6, 0xD8
    vpermq       m7, m7, 0xD8
    vpermq       m1, m1, 0xD8
    vpermq       m8, m8, 0xD8
    vpermq       m3, m3, 0xD8
    vpermq       m5, m5, 0xD8
    vpermq       m2, m2, 0xD8
    vpermq       m0, m0, 0xD8

    movu            [r0], xm6
    vextracti128    xm4, m6, 1
    movu            [r0 + r1], xm4

    movu            [r0 + 2 * r1], xm7
    vextracti128    xm4, m7, 1
    movu            [r0 + r3], xm4

    lea             r0, [r0 + 4 * r1]

    movu            [r0], xm1
    vextracti128    xm4, m1, 1
    movu            [r0 + r1], xm4

    movu            [r0 + 2 * r1], xm8
    vextracti128    xm4, m8, 1
    movu            [r0 + r3], xm4

    lea             r0, [r0 + 4 * r1]

    movu            [r0], xm3
    vextracti128    xm4, m3, 1
    movu            [r0 + r1], xm4

    movu            [r0 + 2 * r1], xm5
    vextracti128    xm4, m5, 1
    movu            [r0 + r3], xm4

    lea             r0, [r0 + 4 * r1]

    movu            [r0], xm2
    vextracti128    xm4, m2, 1
    movu            [r0 + r1], xm4

    movu            [r0 + 2 * r1], xm0
    vextracti128    xm4, m0, 1
    movu            [r0 + r3], xm4
%endmacro

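; Filter the reference pair m9/m10 with coefficient row %3 and pack both
; 16-bit halves into %1 (%2 is scratch); m11 holds pw_1024.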
%macro INTRA_PRED_ANG16_CAL_ROW 3
    pmaddubsw         %1, m9, [r4 + (%3 * mmsize)]
    pmulhrsw          %1, m11
    pmaddubsw         %2, m10, [r4 + (%3 * mmsize)]
    pmulhrsw          %2, m11
    packuswb          %1, %2
%endmacro


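; 16x16 kernels for the near-horizontal modes. The reference row is
; assembled in m3 with the extra negative-index samples each mode needs
; projected in front of it (mode 12 front-fills the bytes at r2 + 13,
; r2 + 6 and r2 + 0); m6 holds the continuation bytes so that every later
; row pair is just a two-byte palignr slide plus the same shuffle/multiply
; sequence.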
INIT_YMM avx2
cglobal intra_pred_ang16_12, 3,4,9
    vbroadcasti128    m0, [angHor_tab_12]
    vbroadcasti128    m1, [angHor_tab_12 + mmsize/2]
    mova              m2, [pw_1024]
    mova              m7, [ang16_shuf_mode12]
    mova              m8, [ang16_shuf_mode12 + mmsize]
    lea               r3, [r1 * 3]

    movu              xm4, [r2 + mmsize - 2]
    pinsrb            xm4, [r2 +  0], 2
    pinsrb            xm4, [r2 +  6], 1
    pinsrb            xm4, [r2 + 13], 0
    vbroadcasti128    m6, [r2 + mmsize + 14]
    vinserti128       m3, m4, xm4, 1

    pshufb            m4, m3, m7
    pshufb            m5, m3, m8
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 2
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 4
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 6
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 8
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 10
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 12
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 14
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    RET

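; Modes 13-17 below repeat the mode 12 scheme; only the weight tables
; (angHor_tab_NN), the shuffle masks (ang16_shuf_modeNN) and the projection
; of the extra reference bytes differ.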
INIT_YMM avx2
cglobal intra_pred_ang16_13, 3,4,9
    vbroadcasti128    m0, [angHor_tab_13]
    vbroadcasti128    m1, [angHor_tab_13 + mmsize/2]
    mova              m2, [pw_1024]
    mova              m7, [ang16_shuf_mode13]
    mova              m8, [ang16_shuf_mode13 + mmsize]
    lea               r3, [r1 * 3]

    vbroadcasti128    m3, [r2 + mmsize + 1]
    vbroadcasti128    m4, [r2]
    pshufb            m4, [ang16_shuf_mode13 + mmsize * 2]

    palignr           m3, m4, 11
    vbroadcasti128    m6, [r2 + mmsize + 12]

    pshufb            m4, m3, m7
    pshufb            m5, m3, m8
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 2
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 4
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 6
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 8
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 10
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 12
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 14
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_14, 3,4,9
    vbroadcasti128    m0, [angHor_tab_14]
    vbroadcasti128    m1, [angHor_tab_14 + mmsize/2]
    mova              m2, [pw_1024]
    mova              m7, [ang16_shuf_mode14]
    mova              m8, [ang16_shuf_mode14 + mmsize]
    lea               r3, [r1 * 3]

    vbroadcasti128    m3, [r2 + mmsize + 1]
    vbroadcasti128    m4, [r2]
    pshufb            m4, [ang16_shuf_mode14 + mmsize * 2]
    palignr           m3, m4, 9
    vbroadcasti128    m6, [r2 + mmsize + 10]

    pshufb            m4, m3, m7
    pshufb            m5, m3, m8
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 2
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 4
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 6
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 8
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 10
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 12
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 14
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_15, 3,4,9
    vbroadcasti128    m0, [angHor_tab_15]
    vbroadcasti128    m1, [angHor_tab_15 + mmsize/2]
    mova              m2, [pw_1024]
    mova              m7, [ang16_shuf_mode15]
    mova              m8, [ang16_shuf_mode15 + mmsize]
    lea               r3, [r1 * 3]

    vbroadcasti128    m3, [r2 + mmsize + 1]
    vbroadcasti128    m4, [r2]
    pshufb            m4, [ang16_shuf_mode15 + mmsize * 2]
    palignr           m3, m4, 7
    vbroadcasti128    m6, [r2 + mmsize + 8]

    pshufb            m4, m3, m7
    pshufb            m5, m3, m8
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 2
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 4
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 6
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 8
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 10
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 12
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 14
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_16, 3,4,9
    vbroadcasti128    m0, [angHor_tab_16]
    vbroadcasti128    m1, [angHor_tab_16 + mmsize/2]
    mova              m2, [pw_1024]
    mova              m7, [ang16_shuf_mode16]
    mova              m8, [ang16_shuf_mode16 + mmsize]
    lea               r3, [r1 * 3]

    vbroadcasti128    m3, [r2 + mmsize + 1]
    vbroadcasti128    m4, [r2]
    pshufb            m4, [ang16_shuf_mode16 + mmsize * 2]
    palignr           m3, m4, 5
    vbroadcasti128    m6, [r2 + mmsize + 6]

    pshufb            m4, m3, m7
    pshufb            m5, m3, m8
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 2
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 4
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 6
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 8
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 10
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 12
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 14
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_17, 3,4,9
    vbroadcasti128    m0, [angHor_tab_17]
    vbroadcasti128    m1, [angHor_tab_17 + mmsize/2]
    mova              m2, [pw_1024]
    mova              m7, [ang16_shuf_mode17]
    mova              m8, [ang16_shuf_mode17 + mmsize]
    lea               r3, [r1 * 3]

    vbroadcasti128    m3, [r2 + mmsize + 1]
    vbroadcasti128    m4, [r2]
    pshufb            m4, [ang16_shuf_mode17 + mmsize * 2]
    palignr           m3, m4, 3
    vbroadcasti128    m6, [r2 + mmsize + 4]

    pshufb            m4, m3, m7
    pshufb            m5, m3, m8
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 2
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 4
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 6
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 8
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 10
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    lea               r0, [r0 + r1 * 4]

    palignr           m5, m6, m3, 12
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0], xm4
    vextracti128      [r0 + r1], m4, 1

    palignr           m5, m6, m3, 14
    pshufb            m4, m5, m7
    pshufb            m5, m8

    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    pmulhrsw          m4, m2
    pmulhrsw          m5, m2
    packuswb          m4, m5
    movu              [r0 + r1 * 2], xm4
    vextracti128      [r0 + r3], m4, 1
    RET

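; Mode 11 projects a single byte: [r2] is inserted at position 0 of the row
; loaded from r2 + mmsize; the ang32_shuf_mode9 mask is reused for the
; byte-pair expansion.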
INIT_YMM avx2
cglobal intra_pred_ang16_11, 3,4,8
    vbroadcasti128      m0, [angHor_tab_11]
    vbroadcasti128      m1, [angHor_tab_11 + mmsize/2]
    mova                m2, [pw_1024]
    mova                m7, [ang32_shuf_mode9]
    lea                 r3, [r1 * 3]

    ; build the reference row for indices [0 -1 -2 ...]: 16 bytes loaded
    ; from r2 + mmsize, with the byte at [r2] inserted at position 0

    movu               xm3, [r2 + mmsize]
    pinsrb             xm3, [r2], 0
    vbroadcasti128      m6, [r2 + mmsize + 16]
    vinserti128         m3, m3, xm3, 1

    pshufb              m5, m3, m7              ; [ 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1  1  2  1  2  1  2  1  2  1  2  1  2  1  2  1  2]
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], xm4
    vextracti128        [r0 + r1], m4, 1

    palignr             m5, m6, m3, 2
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], xm4
    vextracti128        [r0 + r3], m4, 1

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 4
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], xm4
    vextracti128        [r0 + r1], m4, 1

    palignr             m5, m6, m3, 6
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], xm4
    vextracti128        [r0 + r3], m4, 1

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 8
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], xm4
    vextracti128        [r0 + r1], m4, 1

    palignr             m5, m6, m3, 10
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], xm4
    vextracti128        [r0 + r3], m4, 1

    lea                 r0, [r0 + r1 * 4]

    palignr             m5, m6, m3, 12
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0], xm4
    vextracti128        [r0 + r1], m4, 1

    palignr             m5, m6, m3, 14
    pshufb              m5, m7
    pmaddubsw           m4, m5, m0
    pmaddubsw           m5, m1
    pmulhrsw            m4, m2
    pmulhrsw            m5, m2
    packuswb            m4, m5
    movu                [r0 + r1 * 2], xm4
    vextracti128        [r0 + r3], m4, 1
    RET


; transpose an 8x32 register block to 16x16 pixels, used by the intra_pred_ang16 avx2 kernels below
%if ARCH_X86_64 == 1
INIT_YMM avx2
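; The carry flag picks the store path: the clc callers (modes 3-8, which
; compute the block column-wise) run the transpose ladder, while the stc
; callers (modes 32/33) jump to .skip and store the rows directly.
; r5 = 3 * stride and r6 = 4 * stride throughout.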
%macro TRANSPOSE_STORE_8x32 12
    jc             .skip

    punpcklbw       m%9, m%1, m%2
    punpckhbw       m%1, m%2
    punpcklbw       m%10, m%3, m%4
    punpckhbw       m%3, m%4

    punpcklwd       m%11, m%9, m%10
    punpckhwd       m%9, m%10
    punpcklwd       m%10, m%1, m%3
    punpckhwd       m%1, m%3

    punpckldq       m%12, m%11, m%10
    punpckhdq       m%11, m%10
    punpckldq       m%10, m%9, m%1
    punpckhdq       m%9, m%1

    punpcklbw       m%1, m%5, m%6
    punpckhbw       m%5, m%6
    punpcklbw       m%2, m%7, m%8
    punpckhbw       m%7, m%8

    punpcklwd       m%3, m%1, m%2
    punpckhwd       m%1, m%2
    punpcklwd       m%4, m%5, m%7
    punpckhwd       m%5, m%7

    punpckldq       m%2, m%3, m%4
    punpckhdq       m%3, m%4
    punpckldq       m%4, m%1, m%5
    punpckhdq       m%1, m%5

    punpckldq       m%5, m%12, m%2
    punpckhdq       m%6, m%12, m%2
    punpckldq       m%7, m%10, m%4
    punpckhdq       m%8, m%10, m%4

    punpckldq       m%2, m%11, m%3
    punpckhdq       m%11, m%11, m%3
    punpckldq       m%4, m%9, m%1
    punpckhdq       m%9, m%9, m%1

    movu            [r0 + r1 * 0], xm%5
    movu            [r0 + r1 * 1], xm%6
    movu            [r0 + r1 * 2], xm%2
    movu            [r0 + r5 * 1], xm%11

    add             r0, r6

    movu            [r0 + r1 * 0], xm%7
    movu            [r0 + r1 * 1], xm%8
    movu            [r0 + r1 * 2], xm%4
    movu            [r0 + r5 * 1], xm%9

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%5, 1
    vextracti128    [r0 + r1 * 1], m%6, 1
    vextracti128    [r0 + r1 * 2], m%2, 1
    vextracti128    [r0 + r5 * 1], m%11, 1

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%7, 1
    vextracti128    [r0 + r1 * 1], m%8, 1
    vextracti128    [r0 + r1 * 2], m%4, 1
    vextracti128    [r0 + r5 * 1], m%9, 1
    jmp             .end

.skip:
    vpermq          m%1, m%1, q3120
    vpermq          m%2, m%2, q3120
    vpermq          m%3, m%3, q3120
    vpermq          m%4, m%4, q3120
    vpermq          m%5, m%5, q3120
    vpermq          m%6, m%6, q3120
    vpermq          m%7, m%7, q3120
    vpermq          m%8, m%8, q3120

    movu            [r0 + r1 * 0], xm%1
    movu            [r0 + r1 * 1], xm%2
    movu            [r0 + r1 * 2], xm%3
    movu            [r0 + r5 * 1], xm%4

    add             r0, r6

    movu            [r0 + r1 * 0], xm%5
    movu            [r0 + r1 * 1], xm%6
    movu            [r0 + r1 * 2], xm%7
    movu            [r0 + r5 * 1], xm%8

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%1, 1
    vextracti128    [r0 + r1 * 1], m%2, 1
    vextracti128    [r0 + r1 * 2], m%3, 1
    vextracti128    [r0 + r5 * 1], m%4, 1

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%5, 1
    vextracti128    [r0 + r1 * 1], m%6, 1
    vextracti128    [r0 + r1 * 2], m%7, 1
    vextracti128    [r0 + r5 * 1], m%8, 1
.end:
%endmacro

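; Shared body for modes 3 and 33. On entry r2 points at the reference row,
; r3 at ang_table_avx2 + 16 * 32 (so [r3 + k * 32] is the row for fraction
; 16 + k; the bracketed numbers below are these fractions), m7 = pw_1024,
; and the carry flag selects the store path as described above.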
cglobal ang16_mode_3_33
    ; rows 0 to 7
    movu            m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu            m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]

    punpckhbw       m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw       m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vextracti128    xm1,        m0, 1
    vperm2i128      m0,         m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vperm2i128      m2,         m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]

    pmaddubsw       m4,         m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m4,         m7

    palignr         m5,         m2, m0, 2
    pmaddubsw       m5,         [r3 + 4 * 32]       ; [20]
    pmulhrsw        m5,         m7

    palignr         m6,         m2, m0, 4
    palignr         m8,         m2, m0, 6
    pmaddubsw       m6,         [r3 - 2 * 32]       ; [14]
    pmulhrsw        m6,         m7
    pmaddubsw       m8,         [r3 - 8 * 32]       ; [8]
    pmulhrsw        m8,         m7

    palignr         m10,        m2, m0, 8
    pmaddubsw       m9,         m10, [r3 - 14 * 32] ; [2]
    pmulhrsw        m9,         m7
    pmaddubsw       m10,        [r3 + 12 * 32]      ; [28]
    pmulhrsw        m10,        m7

    palignr         m11,        m2, m0, 10
    palignr         m12,        m2, m0, 12
    pmaddubsw       m11,        [r3 + 6 * 32]       ; [22]
    pmulhrsw        m11,        m7
    pmaddubsw       m12,        [r3]                ; [16]
    pmulhrsw        m12,        m7

    ; rows 8 to 15
    palignr         m3,         m2, m0, 14
    palignr         m1,         m1, m2, 14
    pmaddubsw       m3,         [r3 - 6 * 32]       ; [10]
    pmulhrsw        m3,         m7
    packuswb        m4,         m3

    pmaddubsw       m3,         m2, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m3,         m7
    packuswb        m5,         m3

    pmaddubsw       m3,         m2, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m3,         m7
    packuswb        m6,         m3

    movu            xm0,        [r2 + 25]
    movu            xm1,        [r2 + 26]
    punpcklbw       m0,         m1
    mova            m1,         m2
    vinserti128     m1,         m1, xm0, 0
    vpermq          m1,         m1, 01001110b

    palignr         m3,         m1, m2, 2
    pmaddubsw       m3,         [r3 + 8 * 32]       ; [24]
    pmulhrsw        m3,         m7
    packuswb        m8,         m3

    palignr         m3,         m1, m2, 4
    pmaddubsw       m3,         [r3 + 2 * 32]       ; [18]
    pmulhrsw        m3,         m7
    packuswb        m9,         m3

    palignr         m3,         m1, m2, 6
    pmaddubsw       m3,         [r3 - 4 * 32]       ; [12]
    pmulhrsw        m3,         m7
    packuswb        m10,        m3

    palignr         m3,         m1, m2, 8
    pmaddubsw       m3,         [r3 - 10 * 32]      ; [6]
    pmulhrsw        m3,         m7
    packuswb        m11,        m3

    pmovzxbw        m1,         [r2 + 14]
    packuswb        m12,        m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_3, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call ang16_mode_3_33
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_33, 3, 7, 13
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    stc

    call ang16_mode_3_33
    RET

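; ang16_mode_4_32 through ang16_mode_8 below follow the same convention as
; ang16_mode_3_33; only the per-row fraction sequence differs.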
cglobal ang16_mode_4_32
    ; rows 0 to 7
    movu            m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu            m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]

    punpckhbw       m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw       m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vextracti128    xm1,        m0, 1
    vperm2i128      m0,         m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vperm2i128      m2,         m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]

    pmaddubsw       m4,         m0, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m4,         m7

    palignr         m1,         m2, m0, 2
    pmaddubsw       m5,         m1, [r3 - 6 * 32]   ; [10]
    pmulhrsw        m5,         m7

    palignr         m8,         m2, m0, 4
    pmaddubsw       m6,         m1, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m6,         m7
    pmaddubsw       m8,         [r3 + 4 * 32]       ; [20]
    pmulhrsw        m8,         m7

    palignr         m10,        m2, m0, 6
    pmaddubsw       m9,         m10, [r3 - 7 * 32]  ; [9]
    pmulhrsw        m9,         m7
    pmaddubsw       m10,        [r3 + 14 * 32]      ; [30]
    pmulhrsw        m10,        m7

    palignr         m11,        m2, m0, 8
    palignr         m1,         m2, m0, 10
    pmaddubsw       m11,        [r3 + 3 * 32]       ; [19]
    pmulhrsw        m11,        m7
    pmaddubsw       m12,        m1, [r3 - 8 * 32]   ; [8]
    pmulhrsw        m12,        m7

    ; rows 8 to 15
    pmaddubsw       m3,         m1, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m3,         m7
    packuswb        m4,         m3

    palignr         m3,         m2, m0, 12
    pmaddubsw       m3,         m3, [r3 + 2 * 32]   ; [18]
    pmulhrsw        m3,         m7
    packuswb        m5,         m3

    palignr         m1,         m2, m0, 14
    pmaddubsw       m3,         m1, [r3 - 9 * 32]   ; [7]
    pmulhrsw        m3,         m7
    packuswb        m6,         m3

    pmaddubsw       m3,         m1, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m3,         m7
    packuswb        m8,         m3

    palignr         m3,         m2, m0, 16
    pmaddubsw       m3,         [r3 + 1 * 32]       ; [17]
    pmulhrsw        m3,         m7
    packuswb        m9,         m3

    movu            xm0,        [r2 + 25]
    movu            xm1,        [r2 + 26]
    punpcklbw       m0,         m1
    mova            m1,         m2
    vinserti128     m1,         m1, xm0, 0
    vpermq          m1,         m1, 01001110b

    palignr         m0,         m1, m2, 2
    pmaddubsw       m3,         m0, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m3,         m7
    packuswb        m10,        m3

    pmaddubsw       m3,         m0, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m3,         m7
    packuswb        m11,        m3

    palignr         m1,         m1, m2, 4
    pmaddubsw       m1,         [r3]                ; [16]
    pmulhrsw        m1,         m7
    packuswb        m12,        m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_4, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call ang16_mode_4_32
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_32, 3, 7, 13
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    stc

    call ang16_mode_4_32
    RET

cglobal ang16_mode_5
    ; rows 0 to 7
    movu            m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu            m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]

    punpckhbw       m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw       m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vextracti128    xm1,        m0, 1
    vperm2i128      m0,         m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vperm2i128      m2,         m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]

    pmaddubsw       m4,         m0, [r3 + 1 * 32]   ; [17]
    pmulhrsw        m4,         m7

    palignr         m1,         m2, m0, 2
    pmaddubsw       m5,         m1, [r3 - 14 * 32]  ; [2]
    pmulhrsw        m5,         m7

    palignr         m3,         m2, m0, 4
    pmaddubsw       m6,         m1, [r3 + 3 * 32]   ; [19]
    pmulhrsw        m6,         m7
    pmaddubsw       m8,         m3, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m8,         m7
    pmaddubsw       m9,         m3, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m9,         m7

    palignr         m3,         m2, m0, 6
    pmaddubsw       m10,        m3, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m10,        m7

    palignr         m1,         m2, m0, 8
    pmaddubsw       m11,        m3, [r3 + 7 * 32]   ; [23]
    pmulhrsw        m11,        m7
    pmaddubsw       m12,        m1, [r3 - 8 * 32]   ; [8]
    pmulhrsw        m12,        m7

    ; rows 8 to 15
    pmaddubsw       m3,         m1, [r3 + 9 * 32]   ; [25]
    pmulhrsw        m3,         m7
    packuswb        m4,         m3

    palignr         m1,         m2, m0, 10
    pmaddubsw       m3,         m1, [r3 - 6 * 32]   ; [10]
    pmulhrsw        m3,         m7
    packuswb        m5,         m3

    pmaddubsw       m3,         m1, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m3,         m7
    packuswb        m6,         m3

    palignr         m1,         m2, m0, 12
    pmaddubsw       m3,         m1, [r3 - 4 * 32]   ; [12]
    pmulhrsw        m3,         m7
    packuswb        m8,         m3

    pmaddubsw       m3,         m1, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m3,         m7
    packuswb        m9,         m3

    palignr         m1,         m2, m0, 14
    pmaddubsw       m3,         m1, [r3 - 2 * 32]   ; [14]
    pmulhrsw        m3,         m7
    packuswb        m10,        m3

    pmaddubsw       m3,         m1, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m3,         m7
    packuswb        m11,        m3

    palignr         m1,         m2, m0, 16
    pmaddubsw       m1,         [r3]                ; [16]
    pmulhrsw        m1,         m7
    packuswb        m12,        m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_5, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call ang16_mode_5
    RET

cglobal ang16_mode_6
    ; rows 0 to 7
    movu            m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu            m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]

    punpckhbw       m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw       m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vextracti128    xm1,        m0, 1
    vperm2i128      m0,         m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vperm2i128      m2,         m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]

    pmaddubsw       m4,         m0, [r3 - 3 * 32]   ; [13]
    pmulhrsw        m4,         m7

    pmaddubsw       m5,         m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m5,         m7

    palignr         m3,         m2, m0, 2
    pmaddubsw       m6,         m3, [r3 - 9 * 32]   ; [7]
    pmulhrsw        m6,         m7
    pmaddubsw       m8,         m3, [r3 + 4 * 32]   ; [20]
    pmulhrsw        m8,         m7

    palignr         m3,         m2, m0, 4
    pmaddubsw       m9,         m3, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m9,         m7

    pmaddubsw       m10,        m3, [r3 - 2 * 32]   ; [14]
    pmulhrsw        m10,        m7

    pmaddubsw       m11,        m3, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m11,        m7

    palignr         m1,         m2, m0, 6
    pmaddubsw       m12,        m1, [r3 - 8 * 32]   ; [8]
    pmulhrsw        m12,        m7

    ; rows 8 to 15
    pmaddubsw       m3,         m1, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m3,         m7
    packuswb        m4,         m3

    palignr         m1,         m2, m0, 8
    pmaddubsw       m3,         m1, [r3 - 14 * 32]  ; [2]
    pmulhrsw        m3,         m7
    packuswb        m5,         m3

    pmaddubsw       m3,         m1, [r3 - 1 * 32]   ; [15]
    pmulhrsw        m3,         m7
    packuswb        m6,         m3

    pmaddubsw       m3,         m1, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m3,         m7
    packuswb        m8,         m3

    palignr         m1,         m2, m0, 10
    pmaddubsw       m3,         m1, [r3 - 7 * 32]   ; [9]
    pmulhrsw        m3,         m7
    packuswb        m9,         m3

    pmaddubsw       m3,         m1, [r3 + 6 * 32]   ; [22]
    pmulhrsw        m3,         m7
    packuswb        m10,        m3

    palignr         m1,         m2, m0, 12
    pmaddubsw       m3,         m1, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m3,         m7
    packuswb        m11,        m3

    pmaddubsw       m1,         [r3]                ; [16]
    pmulhrsw        m1,         m7
    packuswb        m12,        m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_6, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call ang16_mode_6
    RET

cglobal ang16_mode_7
    ; rows 0 to 7
    movu            m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu            m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]

    punpckhbw       m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw       m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vextracti128    xm1,        m0, 1
    vperm2i128      m0,         m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vperm2i128      m2,         m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]

    pmaddubsw       m4,         m0, [r3 - 7 * 32]   ; [9]
    pmulhrsw        m4,         m7

    pmaddubsw       m5,         m0, [r3 + 2 * 32]   ; [18]
    pmulhrsw        m5,         m7
    pmaddubsw       m6,         m0, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m6,         m7

    palignr         m3,         m2, m0, 2
    pmaddubsw       m8,         m3, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m8,         m7

    pmaddubsw       m9,         m3, [r3 - 3 * 32]   ; [13]
    pmulhrsw        m9,         m7

    pmaddubsw       m10,        m3, [r3 + 6 * 32]   ; [22]
    pmulhrsw        m10,        m7

    pmaddubsw       m11,        m3, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m11,        m7

    palignr         m1,         m2, m0, 4
    pmaddubsw       m12,        m1, [r3 - 8 * 32]   ; [8]
    pmulhrsw        m12,        m7

    ; rows 8 to 15
    pmaddubsw       m3,         m1, [r3 + 1 * 32]   ; [17]
    pmulhrsw        m3,         m7
    packuswb        m4,         m3

    pmaddubsw       m3,         m1, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m3,         m7
    packuswb        m5,         m3

    palignr         m1,         m2, m0, 6
    pmaddubsw       m3,         m1, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m3,         m7
    packuswb        m6,         m3

    pmaddubsw       m3,         m1, [r3 - 4 * 32]   ; [12]
    pmulhrsw        m3,         m7
    packuswb        m8,         m3

    pmaddubsw       m3,         m1, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m3,         m7
    packuswb        m9,         m3

    pmaddubsw       m3,         m1, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m3,         m7
    packuswb        m10,        m3

    palignr         m1,         m2, m0, 8
    pmaddubsw       m3,         m1, [r3 - 9 * 32]   ; [7]
    pmulhrsw        m3,         m7
    packuswb        m11,        m3

    pmaddubsw       m1,         [r3]                ; [16]
    pmulhrsw        m1,         m7
    packuswb        m12,        m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_7, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call ang16_mode_7
    RET

cglobal ang16_mode_8
    ; rows 0 to 7
    movu            m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu            m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]

    punpckhbw       m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
    punpcklbw       m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vextracti128    xm1,        m0, 1
    vperm2i128      m0,         m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
    vperm2i128      m2,         m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]

    pmaddubsw       m4,         m0, [r3 - 11 * 32]  ; [5]
    pmulhrsw        m4,         m7
    pmaddubsw       m5,         m0, [r3 - 6 * 32]   ; [10]
    pmulhrsw        m5,         m7

    pmaddubsw       m6,         m0, [r3 - 1 * 32]   ; [15]
    pmulhrsw        m6,         m7
    pmaddubsw       m8,         m0, [r3 + 4 * 32]   ; [20]
    pmulhrsw        m8,         m7
    pmaddubsw       m9,         m0, [r3 + 9 * 32]   ; [25]
    pmulhrsw        m9,         m7

    pmaddubsw       m10,        m0, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m10,        m7
    palignr         m1,         m2, m0, 2
    pmaddubsw       m11,        m1, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m11,        m7
    pmaddubsw       m12,        m1, [r3 - 8 * 32]   ; [8]
    pmulhrsw        m12,        m7

    ; rows 8 to 15
    pmaddubsw       m3,         m1, [r3 - 3 * 32]   ; [13]
    pmulhrsw        m3,         m7
    packuswb        m4,         m3
    pmaddubsw       m3,         m1, [r3 + 2 * 32]   ; [18]
    pmulhrsw        m3,         m7
    packuswb        m5,         m3

    pmaddubsw       m3,         m1, [r3 + 7 * 32]   ; [23]
    pmulhrsw        m3,         m7
    packuswb        m6,         m3
    pmaddubsw       m3,         m1, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m3,         m7
    packuswb        m8,         m3

    palignr         m1,         m2, m0, 4
    pmaddubsw       m3,         m1, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m3,         m7
    packuswb        m9,         m3
    pmaddubsw       m3,         m1, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m3,         m7
    packuswb        m10,        m3

    pmaddubsw       m3,         m1, [r3 - 5 * 32]   ; [11]
    pmulhrsw        m3,         m7
    packuswb        m11,        m3
    pmaddubsw       m1,         [r3]                ; [16]
    pmulhrsw        m1,         m7
    packuswb        m12,        m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_8, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]       ; r5 -> 3 * stride
    lea             r6, [r1 * 4]       ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call ang16_mode_8
    RET
%endif  ; ARCH_X86_64

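; Mode 9: the reference starts at r2 + mmsize + 1 and every row pair slides
; it two bytes with palignr before the ang16_shuf_mode9 byte-pair expansion.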
INIT_YMM avx2
cglobal intra_pred_ang16_9, 3,4,8
    vbroadcasti128  m0, [angHor_tab_9]
    vbroadcasti128  m1, [angHor_tab_9 + mmsize/2]
    mova            m2, [pw_1024]
    lea             r3, [r1 * 3]
    mova            m7, [ang16_shuf_mode9]

    vbroadcasti128  m6, [r2 + mmsize + 17]
    vbroadcasti128  m3, [r2 + mmsize + 1]

    pshufb          m5, m3, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET
%endif  ; ARCH_X86_64 == 1

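; Near-vertical 16x16 kernels built from the INTRA_PRED_ANG16_MC* helpers
; above; coefficient rows live in the c_ang16_mode_NN tables, and r4 is
; advanced by 4 * mmsize once the first four rows of a table are consumed.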
INIT_YMM avx2
cglobal intra_pred_ang16_25, 3, 5, 5
    mova              m0, [pw_1024]

    vbroadcasti128    m1, [r2]
    pshufb            m1, [intra_pred_shuff_0_8]
    vbroadcasti128    m2, [r2 + 8]
    pshufb            m2, [intra_pred_shuff_0_8]

    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_25]

    INTRA_PRED_ANG16_MC1 0

    lea    r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2

    add           r4, 4 * mmsize

    lea    r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 0

    lea    r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_28, 3, 5, 6
    mova              m0, [pw_1024]
    mova              m5, [intra_pred_shuff_0_8]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_28]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC1 0

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3

    lea               r0, [r0 + 4 * r1]
    add               r4, 4 * mmsize

    INTRA_PRED_ANG16_MC1 0
    INTRA_PRED_ANG16_MC2 3

    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_27, 3, 5, 5
    mova              m0, [pw_1024]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_27]

    vbroadcasti128    m1, [r2 + 1]
    pshufb            m1, [intra_pred_shuff_0_8]
    vbroadcasti128    m2, [r2 + 9]
    pshufb            m2, [intra_pred_shuff_0_8]

    INTRA_PRED_ANG16_MC1 0

    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2

    lea               r0, [r0 + 4 * r1]
    add               r4, 4 * mmsize
    INTRA_PRED_ANG16_MC1 0

    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    vperm2i128        m1, m1, m2, 00100000b
    pmaddubsw         m3, m1, [r4 + 3 * mmsize]
    pmulhrsw          m3, m0
    vbroadcasti128    m2, [r2 + 2]
    pshufb            m2, [intra_pred_shuff_0_15]
    pmaddubsw         m2, [r4 + 4 * mmsize]
    pmulhrsw          m2, m0
    packuswb          m3, m2
    vpermq            m3, m3, 11011000b
    movu              [r0 + 2 * r1], xm3
    vextracti128      xm4, m3, 1
    movu              [r0 + r3], xm4
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_29, 3, 5, 5
    mova              m0, [pw_1024]
    mova              m5, [intra_pred_shuff_0_8]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_29]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    lea               r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 3
    add               r4, 4 * mmsize
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
    lea               r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC3 r0 + r1, 1

    INTRA_PRED_ANG16_MC2 4
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2
    lea               r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    add               r4, 4 * mmsize

    INTRA_PRED_ANG16_MC2 5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_30, 3, 5, 6
    mova              m0, [pw_1024]
    mova              m5, [intra_pred_shuff_0_8]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_30]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1

    INTRA_PRED_ANG16_MC2 3
    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 4
    add               r4, 4 * mmsize
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    INTRA_PRED_ANG16_MC2 5
    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
    INTRA_PRED_ANG16_MC3 r0 + r3, 2

    INTRA_PRED_ANG16_MC2 6
    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    INTRA_PRED_ANG16_MC2 7
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_31, 3, 5, 6
    mova              m0, [pw_1024]
    mova              m5, [intra_pred_shuff_0_8]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_31]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC3 r0, 0

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 3
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    INTRA_PRED_ANG16_MC2 4
    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 5
    add               r4, 4 * mmsize
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    INTRA_PRED_ANG16_MC2 6
    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 7
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    INTRA_PRED_ANG16_MC2 8
    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 9
    INTRA_PRED_ANG16_MC3 r0 + r3, 4
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_24, 3, 5, 6
    mova              m0, [pw_1024]
    mova              m5, [intra_pred_shuff_0_8]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_24]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC1 0

    lea               r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    movu              xm1, [r2 - 1]
    pinsrb            xm1, [r2 + 38], 0
    vinserti128       m1, m1, xm1, 1
    pshufb            m1, m5
    vbroadcasti128    m2, [r2 + 7]
    pshufb            m2, m5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3

    lea               r0, [r0 + 4 * r1]
    add               r4, 4 * mmsize

    INTRA_PRED_ANG16_MC1 0

    ; same trick one step further back: two projected left-reference
    ; samples ([r2 + 45], [r2 + 38]) now lead the source row
    movu              xm1, [r2 - 2]
    pinsrb            xm1, [r2 + 45], 0
    pinsrb            xm1, [r2 + 38], 1
    vinserti128       m1, m1, xm1, 1
    pshufb            m1, m5
    vbroadcasti128    m2, [r2 + 6]
    pshufb            m2, m5

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC1 2
    RET

%macro INTRA_PRED_ANG16_MC5 2
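    ; shift one more projected left-reference sample (byte [r2 + %1]) into
    ; the running source register xm6, then rebuild the two shuffled
    ; interpolation sources from it and from the top row at [r2 + %2]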
    pslldq            xm6, xm6, 1
    pinsrb            xm6, [r2 + %1], 0
    vinserti128       m1, m6, xm6, 1
    pshufb            m1, m5
    vbroadcasti128    m2, [r2 + %2]
    pshufb            m2, m5
%endmacro

INIT_YMM avx2
cglobal intra_pred_ang16_23, 3, 5, 7
    mova              m0, [pw_1024]
    mova              m5, [intra_pred_shuff_0_8]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_23]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1

    ; seed xm6 with the row source (one byte before the top reference,
    ; projected left sample [r2 + 36] at byte 0); INTRA_PRED_ANG16_MC5
    ; keeps shifting further left samples into it below
    movu              xm6, [r2 - 1]
    pinsrb            xm6, [r2 + 36], 0
    vinserti128       m1, m6, xm6, 1
    pshufb            m1, m5
    vbroadcasti128    m2, [r2 + 7]
    pshufb            m2, m5
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    add               r4, 4 * mmsize

    INTRA_PRED_ANG16_MC5 39, 6
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC3 r0 + r1, 1
    INTRA_PRED_ANG16_MC5 43, 5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    add               r4, 4 * mmsize

    INTRA_PRED_ANG16_MC5 46, 4
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_22, 3, 5, 7
    mova              m0, [pw_1024]
    mova              m5, [intra_pred_shuff_0_8]
    lea               r3, [3 * r1]
    lea               r4, [c_ang16_mode_22]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0

    ; seed xm6 for the INTRA_PRED_ANG16_MC5 shifts below, starting with
    ; projected left sample [r2 + 34] at byte 0
    movu              xm6, [r2 - 1]
    pinsrb            xm6, [r2 + 34], 0
    vinserti128       m1, m6, xm6, 1
    pshufb            m1, m5
    vbroadcasti128    m2, [r2 + 7]
    pshufb            m2, m5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 37, 6
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3

    add               r4, 4 * mmsize

    INTRA_PRED_ANG16_MC5 39, 5
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 42, 4
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
    INTRA_PRED_ANG16_MC3 r0 + r3, 2

    lea               r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 44, 3
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
    INTRA_PRED_ANG16_MC5 47, 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
    RET

%macro INTRA_PRED_ANG32_ALIGNR_STORE 1
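    ; advance the destination by four rows and store them; for the pure
    ; diagonal angles each row is the reference window shifted one more
    ; byte, extracted from the m1:m0 pair with palignr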
    lea     r0, [r0 + 4 * r1]
    palignr m2, m1, m0, %1
    movu    [r0], m2
    palignr m2, m1, m0, (%1 + 1)
    movu    [r0 + r1], m2
    palignr m2, m1, m0, (%1 + 2)
    movu    [r0 + 2 * r1], m2
    palignr m2, m1, m0, (%1 + 3)
    movu    [r0 + r3], m2
%endmacro

INIT_YMM avx2
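; mode 34 copies the top references diagonally: row n is the reference
; stream starting at ref[2 + n], produced with palignr byte shifts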
cglobal intra_pred_ang32_34, 3, 4, 3
    lea     r3, [3 * r1]

    movu    m0, [r2 + 2]
    movu    m1, [r2 + 18]
    movu    [r0], m0
    palignr m2, m1, m0, 1
    movu    [r0 + r1], m2
    palignr m2, m1, m0, 2
    movu    [r0 + 2 * r1], m2
    palignr m2, m1, m0, 3
    movu    [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE  4
    INTRA_PRED_ANG32_ALIGNR_STORE  8
    INTRA_PRED_ANG32_ALIGNR_STORE 12

    lea     r0, [r0 + 4 * r1]
    palignr m2, m1, m0, 16
    movu    [r0], m2
    movu    m0, [r2 + 19]
    movu    [r0 + r1], m0
    movu    m1, [r2 + 35]
    palignr m2, m1, m0, 1
    movu    [r0 + 2 * r1], m2
    palignr m2, m1, m0, 2
    movu    [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE  3
    INTRA_PRED_ANG32_ALIGNR_STORE  7
    INTRA_PRED_ANG32_ALIGNR_STORE 11
    RET

INIT_YMM avx2
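; mode 2 is the same diagonal copy as mode 34, but sourced from the left
; references stored in the second half of the buffer (r2 + 64)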
cglobal intra_pred_ang32_2, 3, 4, 3
    lea     r3, [3 * r1]

    movu    m0, [r2 + 64 + 2]
    movu    m1, [r2 + 64 + 18]
    movu    [r0], m0
    palignr m2, m1, m0, 1
    movu    [r0 + r1], m2
    palignr m2, m1, m0, 2
    movu    [r0 + 2 * r1], m2
    palignr m2, m1, m0, 3
    movu    [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE  4
    INTRA_PRED_ANG32_ALIGNR_STORE  8
    INTRA_PRED_ANG32_ALIGNR_STORE 12

    lea     r0, [r0 + 4 * r1]
    palignr m2, m1, m0, 16
    movu    [r0], m2
    movu    m0, [r2 + 64 + 19]
    movu    [r0 + r1], m0
    movu    m1, [r2 + 64 + 35]
    palignr m2, m1, m0, 1
    movu    [r0 + 2 * r1], m2
    palignr m2, m1, m0, 2
    movu    [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE  3
    INTRA_PRED_ANG32_ALIGNR_STORE  7
    INTRA_PRED_ANG32_ALIGNR_STORE 11
    RET

%macro INTRA_PRED_ANG32_STORE 0
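    ; pure vertical: every row repeats the 32 reference samples held in
    ; m0, so just advance r0 and store m0 four more times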
    lea    r0, [r0 + 4 * r1]
    movu   [r0],          m0
    movu   [r0 + r1],     m0
    movu   [r0 + r1 * 2], m0
    movu   [r0 + r3],     m0
%endmacro

INIT_YMM avx2
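; mode 26 is pure vertical: all 32 rows are a copy of top references
; ref[1..32]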
cglobal intra_pred_ang32_26, 3, 4, 1
    lea    r3,            [3 * r1]
    movu    m0,           [r2 + 1]
    movu   [r0],          m0
    movu   [r0 + r1],     m0
    movu   [r0 + r1 * 2], m0
    movu   [r0 + r3],     m0

    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    RET

%macro INTRA_PRED_STORE_4x4 0
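    ; store a 4x4 block held as one dword per row: rows 0/1 come from the
    ; low 128 bits of m0, rows 2/3 from the high 128 bits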
    movd              [r0], xm0
    pextrd            [r0 + r1], xm0, 1
    vextracti128      xm0, m0, 1
    lea               r0, [r0 + 2 * r1]
    movd              [r0], xm0
    pextrd            [r0 + r1], xm0, 1
%endmacro

%macro INTRA_PRED_TRANS_STORE_4x4 0
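    ; horizontal-class modes compute the block column-wise, so gather the
    ; four row dwords with vpermq and transpose via c_trans_4x4 before
    ; storing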
    vpermq            m0, m0, 00001000b
    pshufb            m0, [c_trans_4x4]

    ; store
    movd              [r0], xm0
    pextrd            [r0 + r1], xm0, 1
    lea               r0, [r0 + 2 * r1]
    pextrd            [r0], xm0, 2
    pextrd            [r0 + r1], xm0, 3
%endmacro

INIT_YMM avx2
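; 4x4 angular modes: one 16-byte reference load is broadcast to both
; lanes, a mode-specific shuffle pairs ref[i] with ref[i + 1] for all 16
; output pixels, and a single pmaddubsw/pmulhrsw against the mode's
; coefficient table produces the whole block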
cglobal intra_pred_ang4_27, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred_shuff_0_4]
    pmaddubsw         m0, [c_ang4_mode_27]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_28, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred_shuff_0_4]
    pmaddubsw         m0, [c_ang4_mode_28]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_29, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred4_shuff1]
    pmaddubsw         m0, [c_ang4_mode_29]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_30, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred4_shuff2]
    pmaddubsw         m0, [c_ang4_mode_30]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_31, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred4_shuff31]
    pmaddubsw         m0, [c_ang4_mode_31]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_32, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred4_shuff31]
    pmaddubsw         m0, [c_ang4_mode_32]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_33, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred4_shuff33]
    pmaddubsw         m0, [c_ang4_mode_33]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_3, 3, 3, 1
    vbroadcasti128    m0, [r2 + 1]
    pshufb            m0, [intra_pred4_shuff3]
    pmaddubsw         m0, [c_ang4_mode_33]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_4, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff5]
    pmaddubsw         m0, [c_ang4_mode_32]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_5, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff5]
    pmaddubsw         m0, [c_ang4_mode_5]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_6, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff6]
    pmaddubsw         m0, [c_ang4_mode_6]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_7, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff7]
    pmaddubsw         m0, [c_ang4_mode_7]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_8, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff9]
    pmaddubsw         m0, [c_ang4_mode_8]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_9, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff9]
    pmaddubsw         m0, [c_ang4_mode_9]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_11, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff12]
    pmaddubsw         m0, [c_ang4_mode_11]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_12, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff12]
    pmaddubsw         m0, [c_ang4_mode_12]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_13, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff13]
    pmaddubsw         m0, [c_ang4_mode_13]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_14, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff14]
    pmaddubsw         m0, [c_ang4_mode_14]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_15, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff15]
    pmaddubsw         m0, [c_ang4_mode_15]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_16, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff16]
    pmaddubsw         m0, [c_ang4_mode_16]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_17, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff17]
    pmaddubsw         m0, [c_ang4_mode_17]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_19, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff19]
    pmaddubsw         m0, [c_ang4_mode_19]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_20, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff20]
    pmaddubsw         m0, [c_ang4_mode_20]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_21, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff21]
    pmaddubsw         m0, [c_ang4_mode_21]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_22, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff22]
    pmaddubsw         m0, [c_ang4_mode_22]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_23, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred4_shuff23]
    pmaddubsw         m0, [c_ang4_mode_23]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_24, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred_shuff_0_4]
    pmaddubsw         m0, [c_ang4_mode_24]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_25, 3, 3, 1
    vbroadcasti128    m0, [r2]
    pshufb            m0, [intra_pred_shuff_0_4]
    pmaddubsw         m0, [c_ang4_mode_25]
    pmulhrsw          m0, [pw_1024]
    packuswb          m0, m0

    INTRA_PRED_STORE_4x4
    RET

;-----------------------------------------------------------------------------------
; void intra_filter_NxN(const pixel* references, pixel* filtered)
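;
; The references array holds topLeft, the top row, then the left column.
; Scalar equivalent of the [1 2 1] smoothing performed below:
;     filtered[0] = (left[0] + 2 * topLeft + top[0] + 2) >> 2
;     filtered[i] = (ref[i - 1] + 2 * ref[i] + ref[i + 1] + 2) >> 2
; The last top and last left samples are copied through unfiltered.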
;-----------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_filter_4x4, 2,4,5
    mov             r2b, byte [r0 +  8]             ; topLast
    mov             r3b, byte [r0 + 16]             ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 +  0]
    pmovzxbw        m1, [r0 +  8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m3, m1, m0, 4
    pshufb          m3, [intra_filter4_shuf1]       ; [8 7 6 5 4 3 2 9] samples[i + 1]

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    ; filtering left
    palignr         m4, m1, m1, 14                  ; [14 13 12 11 10 9 8 15] samples[i - 1]
    pinsrb          m4, [r0], 2                     ; [14 13 12 11 10 9 0 15] samples[i - 1], topLeft for left[0]
    palignr         m3, m2, m1, 4
    pshufb          m3, [intra_filter4_shuf1]

    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2
    packuswb        m0, m1

    movu            [r1], m0
    mov             [r1 +  8], r2b                  ; topLast
    mov             [r1 + 16], r3b                  ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_8x8, 2,4,6
    mov             r2b, byte [r0 + 16]             ; topLast
    mov             r3b, byte [r0 + 32]             ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 +  0]
    pmovzxbw        m1, [r0 +  8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 17], 0                ; [8 7 6 5 4 3 2 17] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2

    packuswb        m0, m1
    movu            [r1], m0

    ; filtering left
    pmovzxbw        m1, [r0 + 24]
    pmovzxbw        m0, [r0 + 32]

    palignr         m4, m2, m2, 14
    pinsrb          m4, [r0], 2
    palignr         m5, m1, m2, 2

    palignr         m3, m1, m2, 14
    palignr         m0, m1, 2

    psllw           m2, 1
    paddw           m4, m5
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2

    psllw           m1, 1
    paddw           m0, m3
    paddw           m1, m0
    paddw           m1, [pw_2]
    psrlw           m1, 2

    packuswb        m2, m1
    movu            [r1 + 16], m2
    mov             [r1 + 16], r2b                  ; topLast
    mov             [r1 + 32], r3b                  ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_16x16, 2,4,6
    mov             r2b, byte [r0 + 32]             ; topLast
    mov             r3b, byte [r0 + 64]             ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 +  0]
    pmovzxbw        m1, [r0 +  8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 33], 0                ; [8 7 6 5 4 3 2 33] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1], m0

    pmovzxbw        m0, [r0 + 24]
    pmovzxbw        m5, [r0 + 32]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m1, m0
    movu            [r1 + 16], m1

    ; filtering left
    pmovzxbw        m1, [r0 + 40]
    pmovzxbw        m2, [r0 + 48]

    palignr         m4, m5, m5, 14
    pinsrb          m4, [r0], 2
    palignr         m0, m1, m5, 2

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m3, m5
    movu            [r1 + 32], m3

    pmovzxbw        m5, [r0 + 56]
    pmovzxbw        m0, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m1, m5
    movu            [r1 + 48], m1

    mov             [r1 + 32], r2b                  ; topLast
    mov             [r1 + 64], r3b                  ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_32x32, 2,4,6
    mov             r2b, byte [r0 +  64]            ; topLast
    mov             r3b, byte [r0 + 128]            ; LeftLast

    ; filtering top
    ; 0 to 15
    pmovzxbw        m0, [r0 +  0]
    pmovzxbw        m1, [r0 +  8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 65], 0                ; [8 7 6 5 4 3 2 65] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1], m0

    ; 16 to 31
    pmovzxbw        m0, [r0 + 24]
    pmovzxbw        m5, [r0 + 32]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m2, m0, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    packuswb        m1, m2
    movu            [r1 + 16], m1

    ; 32 to 47
    pmovzxbw        m1, [r0 + 40]
    pmovzxbw        m2, [r0 + 48]

    palignr         m3, m5, m0, 14
    palignr         m4, m1, m5, 2

    psllw           m0, m5, 1
    paddw           m3, m4
    paddw           m0, m3
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m3, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1 + 32], m0

    ; 48 to 63
    pmovzxbw        m0, [r0 + 56]
    pmovzxbw        m5, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m1, m0
    movu            [r1 + 48], m1

    ; filtering left
    ; 64 to 79
    pmovzxbw        m1, [r0 + 72]
    pmovzxbw        m2, [r0 + 80]

    palignr         m4, m5, m5, 14
    pinsrb          m4, [r0], 2
    palignr         m0, m1, m5, 2

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m3, m5
    movu            [r1 + 64], m3

    ; 80 to 95
    pmovzxbw        m5, [r0 + 88]
    pmovzxbw        m0, [r0 + 96]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m2, m5, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    packuswb        m1, m2
    movu            [r1 + 80], m1

    ; 96 to 111
    pmovzxbw        m1, [r0 + 104]
    pmovzxbw        m2, [r0 + 112]

    palignr         m3, m0, m5, 14
    palignr         m4, m1, m0, 2

    psllw           m5, m0, 1
    paddw           m3, m4
    paddw           m5, m3
    paddw           m5, [pw_2]
    psrlw           m5, 2

    palignr         m3, m1, m0, 14
    palignr         m4, m2, m1, 2

    psllw           m0, m1, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m5, m0
    movu            [r1 + 96], m5

    ; 112 to 127
    pmovzxbw        m5, [r0 + 120]
    pmovzxbw        m0, [r0 + 128]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m1, m5
    movu            [r1 + 112], m1

    mov             [r1 +  64], r2b                 ; topLast
    mov             [r1 + 128], r3b                 ; LeftLast
    RET

INIT_YMM avx2
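; AVX2 variant: the 16 reference bytes expand into a single YMM register,
; so the top samples (low lane) and left samples (high lane) are filtered
; in one pass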
cglobal intra_filter_4x4, 2,4,4
    mov             r2b, byte [r0 +  8]         ; topLast
    mov             r3b, byte [r0 + 16]         ; LeftLast

    ; filtering top (low lane) and left (high lane) together
    pmovzxbw        m0, [r0]
    vpbroadcastw    m2, xm0
    pmovzxbw        m1, [r0 + 8]

    palignr         m3, m0, m2, 14              ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
    pshufb          m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
    palignr         m1, m0, 4                   ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10]
    palignr         m1, m1, 14                  ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1]

    psllw           m0, 1
    paddw           m3, m1
    paddw           m0, m3
    paddw           m0, [pw_2]
    psrlw           m0, 2

    packuswb        m0, m0
    vpermq          m0, m0, 10001000b

    movu            [r1], xm0
    mov             [r1 +  8], r2b              ; topLast
    mov             [r1 + 16], r3b              ; LeftLast
    RET