comparison x265/source/common/x86/intrapred8.asm @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:772086c29cc7
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5 ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6 ;*
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
11 ;*
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 ;*
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
24
25 %include "x86inc.asm"
26 %include "x86util.asm"
27
28 SECTION_RODATA 32
29
30 const intra_pred_shuff_0_8, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
31 db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
32
33 intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
34
35 intra_filter4_shuf0: times 2 db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
36 intra_filter4_shuf1: times 2 db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
37 intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
38
39 pb_0_8 times 8 db 0, 8
40 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
41 pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0
42 c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
43 const tab_S1, db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
44 const tab_S2, db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
45 const tab_Si, db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
46 pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
47 c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
48 c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
49 c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
50 c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
51 c_mode32_14_0: db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
52 c_mode32_15_0: db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
53 c_mode32_16_0: db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
54 c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
55 c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
56 c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
57 c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
58 pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
59 c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
60 c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
61 c_mode16_14: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
62 c_mode16_15: db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
63 c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
64 c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
65 c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
66
67 ALIGN 32
68 c_ang8_src1_9_2_10: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
69 c_ang8_26_20: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
70 c_ang8_src3_11_4_12: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
71 c_ang8_14_8: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
72 c_ang8_src5_13_5_13: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
73 c_ang8_2_28: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
74 c_ang8_src6_14_7_15: db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
75 c_ang8_22_16: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
76
77 c_ang8_21_10 : db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
78 c_ang8_src2_10_3_11: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
79 c_ang8_31_20: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
80 c_ang8_src4_12_4_12: times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
81 c_ang8_9_30: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
82 c_ang8_src5_13_6_14: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
83 c_ang8_19_8: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
84
85 c_ang8_17_2: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
86 c_ang8_19_4: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
87 c_ang8_21_6: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
88 c_ang8_23_8: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 ; 8 x (9, 23) pairs, then 8 x (24, 8) pairs = 32 bytes
89 c_ang8_src4_12_5_13: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
90
91 c_ang8_13_26: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
92 c_ang8_7_20: db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
93 c_ang8_1_14: db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
94 c_ang8_27_8: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
95 c_ang8_src2_10_2_10: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
96 c_ang8_src3_11_3_11: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
97
98 c_ang8_31_8: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
99 c_ang8_13_22: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
100 c_ang8_27_4: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
101 c_ang8_9_18: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
102
103 c_ang8_5_10: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
104 c_ang8_15_20: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
105 c_ang8_25_30: db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
106 c_ang8_3_8: db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
107
108 c_ang8_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
109 db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
110 db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
111 db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
112
113 c_ang8_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
114 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
115 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
116 db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
117
118 c_ang8_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
119 db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
120 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
121 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
122
123 ALIGN 32
124 c_ang16_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
125 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
126 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
127 db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
128 db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
129 db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
130 db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
131 db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
132
133 ALIGN 32
134 c_ang16_mode_11: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
135 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
136 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
137 db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
138 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
139 db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
140 db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
141 db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
142
143
144 ALIGN 32
145 c_ang16_mode_12: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
146 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
147 db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
148 db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
149 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
150 db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
151 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
152 db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
153
154
155 ALIGN 32
156 c_ang16_mode_13: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
157 db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
158 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
159 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
160 db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
161 db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
162 db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
163 db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
164
165 ALIGN 32
166 c_ang16_mode_28: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
167 db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
168 db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
169 db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
170 db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
171 db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
172 db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
173 db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
174
175 ALIGN 32
176 c_ang16_mode_9: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
177 db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
178 db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
179 db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
180 db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
181 db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
182 db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
183 db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
184
185 ALIGN 32
186 c_ang16_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
187 db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
188 db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
189 db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
190 db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
191 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
192 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
193 db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
194 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
195
196 ALIGN 32
197 intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15
198
199 ALIGN 32
200 c_ang16_mode_29: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
201 db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
202 db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
203 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
204 db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
205 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
206 db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
207 db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
208 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
209
210 ALIGN 32
211 c_ang16_mode_30: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
212 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
213 db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
214 db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
215 db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
216 db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
217 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
218 db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
219 db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
220
221 ALIGN 32
222 c_ang16_mode_31: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
223 db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
224 db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
225 db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
226 db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
227 db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
228 db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
229 db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
230 db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
231
232 ALIGN 32
233 c_ang16_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
234 db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
235 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
236 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
237 db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
238 db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
239 db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
240 db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
241
242 ALIGN 32
243 c_ang16_mode_23: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
244 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
245 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
246 db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
247 db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
248 db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
249 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
250 db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
251 db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
252
253 ALIGN 32
254 c_ang16_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
255 db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
256 db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
257 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
258 db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
259 db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
260 db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
261 db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
262 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
263
264 ALIGN 32
265 intra_pred_shuff_0_4: times 4 db 0, 1, 1, 2, 2, 3, 3, 4
266 intra_pred4_shuff1: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
267 intra_pred4_shuff2: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
268 intra_pred4_shuff31: db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
269 intra_pred4_shuff33: db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
270 intra_pred4_shuff3: db 8, 9, 9, 10, 10, 11, 11, 12, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
271 intra_pred4_shuff4: db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
272 intra_pred4_shuff5: db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
273 intra_pred4_shuff6: db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14
274 intra_pred4_shuff7: db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14
275 intra_pred4_shuff9: db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13
276 intra_pred4_shuff12: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12,0, 9, 9, 10, 10, 11, 11, 12
277 intra_pred4_shuff13: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
278 intra_pred4_shuff14: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
279 intra_pred4_shuff15: db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
280 intra_pred4_shuff16: db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
281 intra_pred4_shuff17: db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
282 intra_pred4_shuff19: db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
283 intra_pred4_shuff20: db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
284 intra_pred4_shuff21: db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
285 intra_pred4_shuff22: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
286 intra_pred4_shuff23: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
287
288 c_ang4_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
289 c_ang4_mode_28: db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
290 c_ang4_mode_29: db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
291 c_ang4_mode_30: db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
292 c_ang4_mode_31: db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
293 c_ang4_mode_32: db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
294 c_ang4_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
295 c_ang4_mode_5: db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
296 c_ang4_mode_6: db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
297 c_ang4_mode_7: db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
298 c_ang4_mode_8: db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
299 c_ang4_mode_9: db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
300 c_ang4_mode_11: db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
301 c_ang4_mode_12: db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
302 c_ang4_mode_13: db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
303 c_ang4_mode_14: db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
304 c_ang4_mode_15: db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28 ; four 8-byte groups (32 bytes), each byte pair sums to 32; same values as c_ang4_mode_21
305 c_ang4_mode_16: db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
306 c_ang4_mode_17: db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
307 c_ang4_mode_19: db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
308 c_ang4_mode_20: db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
309 c_ang4_mode_21: db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
310 c_ang4_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
311 c_ang4_mode_23: db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
312 c_ang4_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
313 c_ang4_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
314
315 ALIGN 32
316 ;; (blkSize - 1 - x)
317 pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
318 ALIGN 32
319 c_ang8_mode_13: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
320 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
321 db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
322 db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
323
324 ALIGN 32
325 c_ang8_mode_14: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
326 db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
327 db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
328 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
329
330 ALIGN 32
331 c_ang8_mode_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
332 db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
333 db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
334 db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
335
336 const c_ang8_mode_16, db 8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 10, 12, 13, 15, 0, 0
337
338 const intra_pred8_shuff16, db 0, 1, 1, 2, 3, 3, 4, 5
339 db 1, 2, 2, 3, 4, 4, 5, 6
340 db 2, 3, 3, 4, 5, 5, 6, 7
341 db 3, 4, 4, 5, 6, 6, 7, 8
342 db 4, 5, 5, 6, 7, 7, 8, 9
343
344 const angHor8_tab_16, db (32-11), 11, (32-22), 22, (32-1 ), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24
345
346 const c_ang8_mode_20, db 15, 13, 12, 10, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0
347
348 ; NOTE: this big table improves speed by ~10%; if a future broadcast instruction works on the high 128 bits, we can remove the table
349 const angHor8_tab_20, times 8 db (32-24), 24
350 times 8 db (32-13), 13
351 times 8 db (32- 2), 2
352 times 8 db (32-23), 23
353 times 8 db (32-12), 12
354 times 8 db (32- 1), 1
355 times 8 db (32-22), 22
356 times 8 db (32-11), 11
357
358 const ang16_shuf_mode9, times 8 db 0, 1
359 times 8 db 1, 2
360
361 const angHor_tab_9, db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
362 db (32-18), 18, (32-20), 20, (32-22), 22, (32-24), 24, (32-26), 26, (32-28), 28, (32-30), 30, (32-32), 32
363
364 const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
365 db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0
366
367 const ang16_shuf_mode12, db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
368 db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
369
370 const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
371 db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
372
373 const ang16_shuf_mode13, db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
374 db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
375 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
376
377 const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
378 db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
379
380 const ang16_shuf_mode14, db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
381 db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
382 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
383
384 const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
385 db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
386
387 const ang16_shuf_mode15, db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
388 db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
389 db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
390
391 const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
392 db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
393
394 const ang16_shuf_mode16, db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
395 db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
396 db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
397
398 const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
399 db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
400
401 const ang16_shuf_mode17, db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
402 db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
403 db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
404
405 const angHor_tab_17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16
406 db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0
407
408 ; Intrapred_angle32x32, modes 1 to 33 constants
409 const ang32_shuf_mode9, times 8 db 0, 1
410 times 8 db 1, 2
411
412 const ang32_shuf_mode11, times 8 db 1, 2
413 times 8 db 0, 1
414
415 const ang32_fact_mode12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7), 7, (32- 2), 2, (32-29), 29, (32-24), 24
416 db (32-11), 11, (32- 6), 6, (32- 1), 1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8), 8
417 db (32-19), 19, (32-14), 14, (32- 9), 9, (32- 4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
418 db (32- 3), 3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15, (32-10), 10, (32- 5), 5, (32- 0), 0
419 const ang32_shuf_mode12, db 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
420 db 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
421 const ang32_shuf_mode24, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 3, 3
422 dd 0, 0, 7, 3, 0, 0, 7, 3
423
424 const ang32_fact_mode13, db (32-23), 23, (32-14), 14, (32- 5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1), 1, (32-24), 24
425 db (32- 7), 7, (32-30), 30, (32-21), 21, (32-12), 12, (32- 3), 3, (32-26), 26, (32-17), 17, (32- 8), 8
426 db (32-15), 15, (32- 6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32- 2), 2, (32-25), 25, (32-16), 16
427 db (32-31), 31, (32-22), 22, (32-13), 13, (32- 4), 4, (32-27), 27, (32-18), 18, (32- 9), 9, (32- 0), 0
428 const ang32_shuf_mode13, db 14, 15, 14, 15, 14, 15, 13, 14, 13, 14, 13, 14, 13, 14, 12, 13, 10, 11, 9, 10, 9, 10, 9, 10, 9, 10, 8, 9, 8, 9, 8, 9
429 db 12, 13, 12, 13, 11, 12, 11, 12, 11, 12, 11, 12, 10, 11, 10, 11, 7, 8, 7, 8, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 6, 7
430 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 9, 5, 2
431 const ang32_shuf_mode23, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 11, 11, 7, 7, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 5, 5, 2, 2
432
433 const ang32_fact_mode14, db (32-19), 19, (32- 6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5), 5, (32-24), 24
434 db (32- 3), 3, (32-22), 22, (32- 9), 9, (32-28), 28, (32-15), 15, (32- 2), 2, (32-21), 21, (32- 8), 8
435 db (32-11), 11, (32-30), 30, (32-17), 17, (32- 4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
436 db (32-27), 27, (32-14), 14, (32- 1), 1, (32-20), 20, (32- 7), 7, (32-26), 26, (32-13), 13, (32- 0), 0
437 const ang32_shuf_mode14, db 14, 15, 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 12, 13, 11, 12, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 5, 6, 5, 6
438 db 11, 12, 10, 11, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3
439 db 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 9, 6, 4, 1
440 const ang32_shuf_mode22, db 0, 0, 15, 15, 13, 13, 10, 10, 8, 8, 5, 5, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 7, 7, 4, 4, 2
441
442 const ang32_fact_mode15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9), 9, (32-24), 24
443 db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8), 8
444 db (32- 7), 7, (32-22), 22, (32- 5), 5, (32-20), 20, (32- 3), 3, (32-18), 18, (32- 1), 1, (32-16), 16
445 db (32-23), 23, (32- 6), 6, (32-21), 21, (32- 4), 4, (32-19), 19, (32- 2), 2, (32-17), 17, (32- 0), 0
446 const ang32_shuf_mode15, db 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 11, 12, 11, 12, 10, 11, 5, 6, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3
447 db 12, 13, 11, 12, 11, 12, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1
448 db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 7, 5, 3, 1
449 const ang32_shuf_mode21, db 15, 15, 13, 13, 11, 11, 9, 9, 8, 8, 6, 6, 4, 4, 2, 2, 14, 14, 12, 12, 10, 10, 8, 8, 7, 7, 5, 5, 3, 3, 1, 1
450
451 const ang32_fact_mode16, db (32-11), 11, (32-22), 22, (32- 1), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24
452 db (32- 3), 3, (32-14), 14, (32-25), 25, (32- 4), 4, (32-15), 15, (32-26), 26, (32- 5), 5, (32-16), 16
453 db (32-27), 27, (32- 6), 6, (32-17), 17, (32-28), 28, (32- 7), 7, (32-18), 18, (32-29), 29, (32- 8), 8
454 db (32-19), 19, (32-30), 30, (32- 9), 9, (32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0), 0
455 const ang32_shuf_mode16, db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 5, 6, 4, 5
456 db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6
457 db 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 13, 11, 10, 8, 7, 5, 4, 2, 1
458 dd 7, 1, 2, 3, 7, 1, 2, 3
459 const ang32_shuf_mode20, db 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 14, 15, 8, 7, 5, 4, 2, 1, 0, 0, 14, 13, 13, 11, 11, 10, 10, 8
460 db 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 1, 1, 0, 0
461
462 const ang32_fact_mode17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16
463 db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0
464 const ang32_shuf_mode17, db 14, 15, 13, 14, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 4, 5, 3, 4, 2, 3, 2, 3
465 db 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
466 const ang32_shuf_mode19, db 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15
467 dd 0, 0, 2, 3, 0, 0, 7, 1
468 dd 0, 0, 5, 6, 0, 0, 0, 0
469
470 const ang_table
471 %assign x 0
472 %rep 32
473 times 8 db (32-x), x
474 %assign x x+1
475 %endrep
476
477 const ang_table_avx2
478 %assign x 0
479 %rep 32
480 times 16 db (32-x), x
481 %assign x x+1
482 %endrep
483
484 const pw_ang_table
485 %assign x 0
486 %rep 32
487 times 4 dw (32-x), x
488 %assign x x+1
489 %endrep
490
491 SECTION .text
492 cextern pb_1
493 cextern pw_2
494 cextern pw_3
495 cextern pw_4
496 cextern pw_7
497 cextern pw_8
498 cextern pw_16
499 cextern pw_15
500 cextern pw_31
501 cextern pw_32
502 cextern pw_257
503 cextern pw_512
504 cextern pw_1024
505 cextern pw_4096
506 cextern pw_00ff
507 cextern pb_unpackbd1
508 cextern multiL
509 cextern multiH
510 cextern multiH2
511 cextern multiH3
512 cextern multi_2Row
513 cextern trans8_shuf
514 cextern pw_planar16_mul
515 cextern pw_planar32_mul
516
517 ;---------------------------------------------------------------------------------------------
518 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
519 ;---------------------------------------------------------------------------------------------
520 INIT_XMM sse2
521 cglobal intra_pred_dc4, 5,5,3 ; 4x4 DC prediction; r0 = dst, r1 = dstStride, r2 = srcPix, r4d = bFilter
522 inc r2 ; r2 = srcPix + 1; every offset below is relative to this
523 pxor m0, m0 ; m0 = 0 (psadbw operand and zero-extension source)
524 movu m1, [r2]
525 pshufd m1, m1, 0xF8 ; low qword = dword 0 and dword 2 of the load (srcPix[1..4], srcPix[9..12])
526 psadbw m1, m0 ; m1 = sum
527
528 test r4d, r4d ; ZF = (bFilter == 0); consumed by 'jz .end' below (only SSE/mov/lea in between)
529
530 paddw m1, [pw_4]
531 psraw m1, 3 ; (sum + 4) >> 3
532 movd r4d, m1 ; r4d = dc_val
533 pmullw m1, [pw_257] ; dc * 257 puts dc in both bytes of the word
534 pshuflw m1, m1, 0x00 ; broadcast to the low four words (8 bytes of dc)
535
536 ; store DC 4x4
537 lea r3, [r1 * 3]
538 movd [r0], m1
539 movd [r0 + r1], m1
540 movd [r0 + r1 * 2], m1
541 movd [r0 + r3], m1
542
543 ; do DC filter
544 jz .end ; skip edge filtering when bFilter == 0
545 lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
546 add r4d, r3d ; r4d = DC * 3 + 2
547 movd m1, r4d
548 pshuflw m1, m1, 0 ; m1 = pixDCx3
549
550 ; filter top
551 movd m2, [r2] ; srcPix[1..4]
552 punpcklbw m2, m0
553 paddw m2, m1
554 psraw m2, 2 ; (pixel + DC * 3 + 2) >> 2
555 packuswb m2, m2
556 movd [r0], m2 ; overwrite top-left pixel, we will update it later
557
558 ; filter top-left
559 movzx r4d, byte [r2 + 8] ; srcPix[9]
560 add r3d, r4d
561 movzx r4d, byte [r2] ; srcPix[1]
562 add r3d, r4d
563 shr r3d, 2 ; (srcPix[1] + srcPix[9] + DC * 2 + 2) >> 2
564 mov [r0], r3b
565
566 ; filter left
567 add r0, r1 ; r0 = second row
568 movq m2, [r2 + 9] ; loads srcPix[10..17]; only the first three bytes are stored
569 punpcklbw m2, m0
570 paddw m2, m1
571 psraw m2, 2
572 packuswb m2, m2
573 %if ARCH_X86_64
574 movq r4, m2
575 mov [r0], r4b ; first column of rows 1..3, one byte per row
576 shr r4, 8
577 mov [r0 + r1], r4b
578 shr r4, 8
579 mov [r0 + r1 * 2], r4b
580 %else
581 movd r2d, m2 ; srcPix pointer is no longer needed, r2 is reused as scratch
582 mov [r0], r2b
583 shr r2, 8
584 mov [r0 + r1], r2b
585 shr r2, 8
586 mov [r0 + r1 * 2], r2b
587 %endif
588 .end:
589 RET
590
591 ;---------------------------------------------------------------------------------------------
592 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
593 ;---------------------------------------------------------------------------------------------
594 INIT_XMM sse2
595 cglobal intra_pred_dc8, 5, 7, 3 ; 8x8 DC prediction; r0 = dst, r1 = dstStride, r2 = srcPix, r4d = bFilter
596 pxor m0, m0
597 movh m1, [r2 + 1] ; srcPix[1..8]
598 movh m2, [r2 + 17] ; srcPix[17..24]
599 punpcklqdq m1, m2
600 psadbw m1, m0 ; one byte sum per qword
601 pshufd m2, m1, 2 ; m2 dword 0 = high-qword sum
602 paddw m1, m2 ; word 0 = sum of all 16 bytes
603
604 paddw m1, [pw_8]
605 psraw m1, 4 ; (sum + 8) >> 4
606 pmullw m1, [pw_257]
607 pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
608
609 test r4d, r4d ; ZF = (bFilter == 0); consumed by 'jz .end' below (only SSE/mov/lea in between)
610
611 ; store DC 8x8
612 lea r6, [r1 + r1 * 2] ; r6 = 3 * stride
613 lea r5, [r6 + r1 * 2] ; r5 = 5 * stride
614 movh [r0], m1
615 movh [r0 + r1], m1
616 movh [r0 + r1 * 2], m1
617 movh [r0 + r6], m1
618 movh [r0 + r1 * 4], m1
619 movh [r0 + r5], m1
620 movh [r0 + r6 * 2], m1
621 lea r5, [r5 + r1 * 2] ; r5 = 7 * stride
622 movh [r0 + r5], m1
623
624 ; Do DC Filter
625 jz .end ; skip edge filtering when bFilter == 0
626 psrlw m1, 8 ; low four words = DC
627 movq m2, [pw_2]
628 pmullw m2, m1
629 paddw m2, [pw_2]
630 movd r4d, m2 ; low word of r4d = DC * 2 + 2 (movd also copies word 1; only the low byte of the final result is stored)
631 paddw m1, m2 ; m1 = DC * 3 + 2
632 pshufd m1, m1, 0
633
634 ; filter top
635 movq m2, [r2 + 1]
636 punpcklbw m2, m0
637 paddw m2, m1
638 psraw m2, 2 ; (pixel + DC * 3 + 2) >> 2
639 packuswb m2, m2
640 movh [r0], m2
641
642 ; filter top-left
643 movzx r3d, byte [r2 + 17]
644 add r4d, r3d
645 movzx r3d, byte [r2 + 1]
646 add r3d, r4d
647 shr r3d, 2 ; low byte = (srcPix[1] + srcPix[17] + DC * 2 + 2) >> 2
648 mov [r0], r3b
649
650 ; filter left
651 movq m2, [r2 + 18] ; srcPix[18..25]; the first seven bytes go to rows 1..7
652 punpcklbw m2, m0
653 paddw m2, m1
654 psraw m2, 2
655 packuswb m2, m2
656 movd r2d, m2 ; r2 reused as scratch: filtered bytes for rows 1..4
657 lea r0, [r0 + r1] ; r0 = second row
658 lea r5, [r6 + r1 * 2] ; r5 = 5 * stride again
659 mov [r0], r2b
660 shr r2, 8
661 mov [r0 + r1], r2b
662 shr r2, 8
663 mov [r0 + r1 * 2], r2b
664 shr r2, 8
665 mov [r0 + r6], r2b
666 pshufd m2, m2, 0x01 ; bring dword 1 (next four filtered bytes) to dword 0
667 movd r2d, m2
668 mov [r0 + r1 * 4], r2b
669 shr r2, 8
670 mov [r0 + r5], r2b
671 shr r2, 8
672 mov [r0 + r6 * 2], r2b
673
674 .end:
675 RET
676
677 ;--------------------------------------------------------------------------------------------
678 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
679 ;--------------------------------------------------------------------------------------------
680 INIT_XMM sse2
681 %if ARCH_X86_64
682 cglobal intra_pred_dc16, 5, 10, 4 ; 16x16 DC prediction; r0 = dst, r1 = dstStride, r2 = srcPix, r4d = bFilter
683 %else
684 cglobal intra_pred_dc16, 5, 7, 4 ; 32-bit build: only seven GPRs, rows are addressed by stepping r0/r6
685 %endif
686 pxor m0, m0
687 movu m1, [r2 + 1] ; srcPix[1..16]
688 movu m2, [r2 + 33] ; srcPix[33..48]
689 psadbw m1, m0
690 psadbw m2, m0
691 paddw m1, m2
692 pshufd m2, m1, 2 ; m2 dword 0 = high-qword partial sum
693 paddw m1, m2 ; word 0 = sum of all 32 bytes
694
695 paddw m1, [pw_16]
696 psraw m1, 5 ; (sum + 16) >> 5
697 pmullw m1, [pw_257]
698 pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
699 pshufd m1, m1, 0x00
700
701
702 test r4d, r4d ; ZF = (bFilter == 0); consumed by 'jz .end' below (only SSE/mov/lea in between)
703
704 ; store DC 16x16
705 %if ARCH_X86_64
706 lea r6, [r1 + r1 * 2] ;index 3
707 lea r7, [r1 + r1 * 4] ;index 5
708 lea r8, [r6 + r1 * 4] ;index 7
709 lea r9, [r0 + r8] ;base + 7
710 movu [r0], m1
711 movu [r0 + r1], m1
712 movu [r0 + r1 * 2], m1
713 movu [r0 + r6], m1
714 movu [r0 + r1 * 4], m1
715 movu [r0 + r7], m1
716 movu [r0 + r6 * 2], m1
717 movu [r0 + r8], m1
718 movu [r0 + r1 * 8], m1
719 movu [r9 + r1 * 2], m1
720 movu [r0 + r7 * 2], m1
721 movu [r9 + r1 * 4], m1
722 movu [r0 + r6 * 4], m1
723 movu [r9 + r6 * 2], m1
724 movu [r0 + r8 * 2], m1
725 movu [r9 + r1 * 8], m1
726 %else ;32 bit
727 mov r6, r0 ; keep the original dst; r0 is advanced two rows at a time below
728 movu [r0], m1
729 movu [r0 + r1], m1
730 lea r0, [r0 + r1 * 2]
731 movu [r0], m1
732 movu [r0 + r1], m1
733 lea r0, [r0 + r1 * 2]
734 movu [r0], m1
735 movu [r0 + r1], m1
736 lea r0, [r0 + r1 * 2]
737 movu [r0], m1
738 movu [r0 + r1], m1
739 lea r0, [r0 + r1 * 2]
740 movu [r0], m1
741 movu [r0 + r1], m1
742 lea r0, [r0 + r1 * 2]
743 movu [r0], m1
744 movu [r0 + r1], m1
745 lea r0, [r0 + r1 * 2]
746 movu [r0], m1
747 movu [r0 + r1], m1
748 lea r0, [r0 + r1 * 2]
749 movu [r0], m1
750 movu [r0 + r1], m1
751 %endif
752 ; Do DC Filter
753 jz .end ; skip edge filtering when bFilter == 0
754 psrlw m1, 8 ; every word = DC
755 mova m2, [pw_2]
756 pmullw m2, m1
757 paddw m2, [pw_2]
758 movd r4d, m2 ; low word of r4d = DC * 2 + 2 (movd also copies word 1; only the low byte of the final result is stored)
759 paddw m1, m2 ; m1 = DC * 3 + 2
760
761 ; filter top
762 movh m2, [r2 + 1] ; srcPix[1..8]
763 punpcklbw m2, m0
764 paddw m2, m1
765 psraw m2, 2 ; (pixel + DC * 3 + 2) >> 2
766 packuswb m2, m2
767 movh m3, [r2 + 9] ; srcPix[9..16]
768 punpcklbw m3, m0
769 paddw m3, m1
770 psraw m3, 2
771 packuswb m3, m3
772
773 ; filter top-left
774 movzx r5d, byte [r2 + 33]
775 add r4d, r5d
776 movzx r3d, byte [r2 + 1]
777 add r3d, r4d
778 shr r3d, 2 ; low byte = (srcPix[1] + srcPix[33] + DC * 2 + 2) >> 2
779
780 %if ARCH_X86_64
781 movh [r0], m2
782 movh [r0 + 8], m3
783 mov [r0], r3b ; top-left pixel overwrites the first byte of the filtered top row
784 %else ;32 bit
785 movh [r6], m2
786 movh [r6 + 8], m3
787 mov [r6], r3b ; top-left pixel overwrites the first byte of the filtered top row
788 add r6, r1 ; r6 = second row
789 %endif
790
791 ; filter left
792 movh m2, [r2 + 34] ; srcPix[34..41] -> rows 1..8
793 punpcklbw m2, m0
794 paddw m2, m1
795 psraw m2, 2
796 packuswb m2, m2
797
798 movh m3, [r2 + 42] ; srcPix[42..49]; the first seven bytes go to rows 9..15
799 punpcklbw m3, m0
800 paddw m3, m1
801 psraw m3, 2
802 packuswb m3, m3
803 %if ARCH_X86_64
804 movh r3, m2 ; eight filtered bytes; one is written to the first column of each row
805 mov [r0 + r1], r3b
806 shr r3, 8
807 mov [r0 + r1 * 2], r3b
808 shr r3, 8
809 mov [r0 + r6], r3b
810 shr r3, 8
811 mov [r0 + r1 * 4], r3b
812 shr r3, 8
813 mov [r0 + r7], r3b
814 shr r3, 8
815 mov [r0 + r6 * 2], r3b
816 shr r3, 8
817 mov [r0 + r8], r3b
818 shr r3, 8
819 mov [r0 + r1 * 8], r3b
820 movh r3, m3
821 mov [r9 + r1 * 2], r3b ; r9 = dst + 7 * stride, so this is row 9
822 shr r3, 8
823 mov [r0 + r7 * 2], r3b
824 shr r3, 8
825 mov [r9 + r1 * 4], r3b
826 shr r3, 8
827 mov [r0 + r6 * 4], r3b
828 shr r3, 8
829 mov [r9 + r6 * 2], r3b
830 shr r3, 8
831 mov [r0 + r8 * 2], r3b
832 shr r3, 8
833 mov [r9 + r1 * 8], r3b
834 %else ;32 bit
835 movd r2d, m2 ; r2 reused as scratch: four filtered bytes at a time
836 pshufd m2, m2, 0x01 ; queue the next four bytes in dword 0
837 mov [r6], r2b
838 shr r2, 8
839 mov [r6 + r1], r2b
840 shr r2, 8
841 mov [r6 + r1 * 2], r2b
842 lea r6, [r6 + r1 * 2]
843 shr r2, 8
844 mov [r6 + r1], r2b
845 movd r2d, m2
846 mov [r6 + r1 * 2], r2b
847 lea r6, [r6 + r1 * 2]
848 shr r2, 8
849 mov [r6 + r1], r2b
850 shr r2, 8
851 mov [r6 + r1 * 2], r2b
852 lea r6, [r6 + r1 * 2]
853 shr r2, 8
854 mov [r6 + r1], r2b
855 movd r2d, m3
856 pshufd m3, m3, 0x01
857 mov [r6 + r1 * 2], r2b
858 lea r6, [r6 + r1 * 2]
859 shr r2, 8
860 mov [r6 + r1], r2b
861 shr r2, 8
862 mov [r6 + r1 * 2], r2b
863 lea r6, [r6 + r1 * 2]
864 shr r2, 8
865 mov [r6 + r1], r2b
866 movd r2d, m3
867 mov [r6 + r1 * 2], r2b
868 lea r6, [r6 + r1 * 2]
869 shr r2, 8
870 mov [r6 + r1], r2b
871 shr r2, 8
872 mov [r6 + r1 * 2], r2b
873 %endif
874 .end:
875 RET
876
877 ;---------------------------------------------------------------------------------------------
878 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
879 ;---------------------------------------------------------------------------------------------
880 INIT_XMM sse2
881 cglobal intra_pred_dc32, 3, 3, 5 ; 32x32 DC prediction (no edge filter); r0 = dst, r1 = dstStride, r2 = srcPix
882 pxor m0, m0
883 movu m1, [r2 + 1] ; srcPix[1..16]
884 movu m2, [r2 + 17] ; srcPix[17..32]
885 movu m3, [r2 + 65] ; srcPix[65..80]
886 movu m4, [r2 + 81] ; srcPix[81..96]
887 psadbw m1, m0
888 psadbw m2, m0
889 psadbw m3, m0
890 psadbw m4, m0
891 paddw m1, m2
892 paddw m3, m4
893 paddw m1, m3
894 pshufd m2, m1, 2 ; m2 dword 0 = high-qword partial sum
895 paddw m1, m2 ; word 0 = sum of all 64 bytes
896
897 paddw m1, [pw_32]
898 psraw m1, 6 ; (sum + 32) >> 6
899 pmullw m1, [pw_257]
900 pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
901 pshufd m1, m1, 0x00
902
903 %assign x 0
904 %rep 16
905 ; store DC 32x32, two 32-byte rows per iteration
906 movu [r0], m1
907 movu [r0 + r1], m1
908 movu [r0 + 16], m1
909 movu [r0 + r1 + 16], m1
910 %if x < 15 ; no pointer advance after the final pair of rows
911 lea r0, [r0 + 2 * r1]
912 %endif
913 %assign x x+1
914 %endrep
915 RET
916
917 ;---------------------------------------------------------------------------------------
918 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
919 ;---------------------------------------------------------------------------------------
920 INIT_XMM sse2
921 cglobal intra_pred_planar4, 3,3,5 ; 4x4 planar prediction; r0 = dst, r1 = dstStride, r2 = srcPix
922 pxor m0, m0
923 movh m1, [r2 + 1]
924 punpcklbw m1, m0 ; m1 = srcPix[1..8] as words (above row)
925 movh m2, [r2 + 9]
926 punpcklbw m2, m0 ; m2 = srcPix[9..16] as words (left column)
927 pshufhw m3, m1, 0 ; topRight
928 pshufd m3, m3, 0xAA ; broadcast word 4 (srcPix[5]) to all words
929 pshufhw m4, m2, 0 ; bottomLeft
930 pshufd m4, m4, 0xAA ; broadcast word 4 (srcPix[13]) to all words
931 pmullw m3, [multi_2Row] ; (x + 1) * topRight
932 pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x]
933 paddw m3, [pw_4]
934 paddw m3, m4
935 paddw m3, m0 ; m3 = row-0 accumulator (everything except the left[y] term)
936 psubw m4, m1 ; m4 = bottomLeft - above[x], added to m3 after each row
937
938 pshuflw m1, m2, 0 ; left[0] in the low four words
939 pmullw m1, [pw_planar4_0] ; weights 3, 2, 1, 0
940 paddw m1, m3
941 paddw m3, m4
942 psraw m1, 3
943 packuswb m1, m1
944 movd [r0], m1
945
946 pshuflw m1, m2, 01010101b ; left[1]
947 pmullw m1, [pw_planar4_0]
948 paddw m1, m3
949 paddw m3, m4
950 psraw m1, 3
951 packuswb m1, m1
952 movd [r0 + r1], m1
953 lea r0, [r0 + 2 * r1]
954
955 pshuflw m1, m2, 10101010b ; left[2]
956 pmullw m1, [pw_planar4_0]
957 paddw m1, m3
958 paddw m3, m4
959 psraw m1, 3
960 packuswb m1, m1
961 movd [r0], m1
962
963 pshuflw m1, m2, 11111111b ; left[3]
964 pmullw m1, [pw_planar4_0]
965 paddw m1, m3
966 psraw m1, 3
967 packuswb m1, m1
968 movd [r0 + r1], m1
969 RET
970
971 ;---------------------------------------------------------------------------------------
972 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
973 ;---------------------------------------------------------------------------------------
974 INIT_XMM sse2
975 cglobal intra_pred_planar8, 3,3,6 ; 8x8 planar prediction; r0 = dst, r1 = dstStride, r2 = srcPix
976 pxor m0, m0
977 movh m1, [r2 + 1]
978 punpcklbw m1, m0 ; m1 = srcPix[1..8] as words (above row)
979 movh m2, [r2 + 17]
980 punpcklbw m2, m0 ; m2 = srcPix[17..24] as words (left column)
981
982 movd m3, [r2 + 9] ; topRight = above[8];
983 movd m4, [r2 + 25] ; bottomLeft = left[8];
984
985 pand m3, [pw_00ff] ; presumably keeps only the low byte of each word - verify against pw_00ff
986 pand m4, [pw_00ff]
987 pshuflw m3, m3, 0x00
988 pshuflw m4, m4, 0x00
989 pshufd m3, m3, 0x44 ; word 0 now fills all eight words
990 pshufd m4, m4, 0x44
991 pmullw m3, [multiL] ; (x + 1) * topRight
992 pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x]
993 paddw m3, [pw_8]
994 paddw m3, m4
995 paddw m3, m0 ; m3 = row-0 accumulator (everything except the left[y] term)
996 psubw m4, m1 ; m4 = bottomLeft - above[x], added to m3 after each row
997
998 %macro INTRA_PRED_PLANAR_8 1 ; %1 = row index 0..7; emits one row, clobbers m5, advances r0 and m3
999 %if (%1 < 4)
1000 pshuflw m5, m2, 0x55 * %1 ; imm 0x00/0x55/0xAA/0xFF replicates word %1
1001 pshufd m5, m5, 0
1002 %else
1003 pshufhw m5, m2, 0x55 * (%1 - 4)
1004 pshufd m5, m5, 0xAA
1005 %endif
1006 pmullw m5, [pw_planar16_mul + mmsize] ; left[y] times per-column weights from the second 16 bytes of pw_planar16_mul
1007 paddw m5, m3
1008 psraw m5, 4
1009 packuswb m5, m5
1010 movh [r0], m5
1011 %if (%1 < 7)
1012 paddw m3, m4
1013 lea r0, [r0 + r1]
1014 %endif
1015 %endmacro
1016
1017 INTRA_PRED_PLANAR_8 0
1018 INTRA_PRED_PLANAR_8 1
1019 INTRA_PRED_PLANAR_8 2
1020 INTRA_PRED_PLANAR_8 3
1021 INTRA_PRED_PLANAR_8 4
1022 INTRA_PRED_PLANAR_8 5
1023 INTRA_PRED_PLANAR_8 6
1024 INTRA_PRED_PLANAR_8 7
1025 RET
1026
1027 ;---------------------------------------------------------------------------------------
1028 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
1029 ;---------------------------------------------------------------------------------------
1030 INIT_XMM sse2
1031 cglobal intra_pred_planar16, 3,5,8 ; 16x16 planar prediction; r0 = dst, r1 = dstStride, r2 = srcPix
1032 pxor m0, m0
1033 movh m2, [r2 + 1]
1034 punpcklbw m2, m0 ; m2 = srcPix[1..8] as words
1035 movh m7, [r2 + 9]
1036 punpcklbw m7, m0 ; m7 = srcPix[9..16] as words
1037
1038 movd m3, [r2 + 17] ; topRight = above[16]
1039 movd m6, [r2 + 49] ; bottomLeft = left[16]
1040 pand m3, [pw_00ff]
1041 pand m6, [pw_00ff]
1042 pshuflw m3, m3, 0x00
1043 pshuflw m6, m6, 0x00
1044 pshufd m3, m3, 0x44 ; v_topRight
1045 pshufd m6, m6, 0x44 ; v_bottomLeft
1046 pmullw m4, m3, [multiH] ; (x + 1) * topRight
1047 pmullw m3, [multiL] ; (x + 1) * topRight
1048 pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
1049 pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x]
1050 paddw m4, [pw_16]
1051 paddw m3, [pw_16]
1052 paddw m4, m6
1053 paddw m3, m6
1054 paddw m4, m5 ; m4 = row-0 accumulator for the second eight columns
1055 paddw m3, m1 ; m3 = row-0 accumulator for the first eight columns
1056 psubw m1, m6, m7 ; m1 = per-row step for m4 (bottomLeft - above, second eight columns)
1057 psubw m6, m2 ; m6 = per-row step for m3 (bottomLeft - above, first eight columns)
1058
1059 movh m2, [r2 + 33]
1060 punpcklbw m2, m0 ; m2 = srcPix[33..40] as words (left column, rows 0..7)
1061 movh m7, [r2 + 41]
1062 punpcklbw m7, m0 ; m7 = srcPix[41..48] as words (left column, rows 8..15)
1063
1064 %macro INTRA_PRED_PLANAR_16 1 ; %1 = row index 0..15; emits one row, clobbers m0 and m5
1065 %if (%1 < 4)
1066 pshuflw m5, m2, 0x55 * %1 ; imm 0x00/0x55/0xAA/0xFF replicates the selected word
1067 pshufd m5, m5, 0
1068 %else
1069 %if (%1 < 8)
1070 pshufhw m5, m2, 0x55 * (%1 - 4)
1071 pshufd m5, m5, 0xAA
1072 %else
1073 %if (%1 < 12)
1074 pshuflw m5, m7, 0x55 * (%1 - 8)
1075 pshufd m5, m5, 0
1076 %else
1077 pshufhw m5, m7, 0x55 * (%1 - 12)
1078 pshufd m5, m5, 0xAA
1079 %endif
1080 %endif
1081 %endif
1082 %if (%1 > 0) ; every row after the first steps the accumulators and dst before computing
1083 paddw m3, m6
1084 paddw m4, m1
1085 lea r0, [r0 + r1]
1086 %endif
1087 pmullw m0, m5, [pw_planar16_mul + mmsize] ; m0 is scratch from here on (the zero value is no longer needed)
1088 pmullw m5, [pw_planar16_mul]
1089 paddw m0, m4
1090 paddw m5, m3
1091 psraw m5, 5
1092 psraw m0, 5
1093 packuswb m5, m0 ; 16 output pixels: first eight columns from m5, second eight from m0
1094 movu [r0], m5
1095 %endmacro
1096
1097 INTRA_PRED_PLANAR_16 0
1098 INTRA_PRED_PLANAR_16 1
1099 INTRA_PRED_PLANAR_16 2
1100 INTRA_PRED_PLANAR_16 3
1101 INTRA_PRED_PLANAR_16 4
1102 INTRA_PRED_PLANAR_16 5
1103 INTRA_PRED_PLANAR_16 6
1104 INTRA_PRED_PLANAR_16 7
1105 INTRA_PRED_PLANAR_16 8
1106 INTRA_PRED_PLANAR_16 9
1107 INTRA_PRED_PLANAR_16 10
1108 INTRA_PRED_PLANAR_16 11
1109 INTRA_PRED_PLANAR_16 12
1110 INTRA_PRED_PLANAR_16 13
1111 INTRA_PRED_PLANAR_16 14
1112 INTRA_PRED_PLANAR_16 15
1113 RET
1114
1115 ;---------------------------------------------------------------------------------------
1116 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
1117 ;---------------------------------------------------------------------------------------
1118 INIT_XMM sse2
1119 %if ARCH_X86_64 == 1
1120 cglobal intra_pred_planar32, 3,3,16 ; 32x32 planar prediction, 16-register variant; r0 = dst, r1 = dstStride, r2 = srcPix
1121 movd m3, [r2 + 33] ; topRight = above[32]
1122
1123 pxor m7, m7
1124 pand m3, [pw_00ff]
1125 pshuflw m3, m3, 0x00
1126 pshufd m3, m3, 0x44
1127
1128 pmullw m0, m3, [multiL] ; (x + 1) * topRight
1129 pmullw m1, m3, [multiH] ; (x + 1) * topRight
1130 pmullw m2, m3, [multiH2] ; (x + 1) * topRight
1131 pmullw m3, [multiH3] ; (x + 1) * topRight
1132
1133 movd m11, [r2 + 97] ; bottomLeft = left[32]
1134 pand m11, [pw_00ff]
1135 pshuflw m11, m11, 0x00
1136 pshufd m11, m11, 0x44
1137 mova m5, m11
1138 paddw m5, [pw_32] ; m5 = bottomLeft + rounding term
1139
1140 paddw m0, m5
1141 paddw m1, m5
1142 paddw m2, m5
1143 paddw m3, m5
1144 mova m8, m11
1145 mova m9, m11
1146 mova m10, m11
1147 mova m12, [pw_31]
1148 movh m4, [r2 + 1]
1149 punpcklbw m4, m7
1150 psubw m8, m4 ; m8 = bottomLeft - above, per-row step for m0
1151 pmullw m4, m12
1152 paddw m0, m4 ; m0 = row-0 accumulator for srcPix[1..8]
1153
1154 movh m4, [r2 + 9]
1155 punpcklbw m4, m7
1156 psubw m9, m4 ; m9 = per-row step for m1
1157 pmullw m4, m12
1158 paddw m1, m4
1159
1160 movh m4, [r2 + 17]
1161 punpcklbw m4, m7
1162 psubw m10, m4 ; m10 = per-row step for m2
1163 pmullw m4, m12
1164 paddw m2, m4
1165
1166 movh m4, [r2 + 25]
1167 punpcklbw m4, m7
1168 psubw m11, m4 ; m11 = per-row step for m3
1169 pmullw m4, m12
1170 paddw m3, m4
1171 mova m12, [pw_planar32_mul] ; m12..m15 = per-column weights for the left[y] term, kept in registers
1172 mova m13, [pw_planar32_mul + mmsize]
1173 mova m14, [pw_planar16_mul]
1174 mova m15, [pw_planar16_mul + mmsize]
1175 %macro PROCESS 1 ; %1 = register holding left[y] in every word; writes one 32-pixel row, clobbers m5, m6 and %1
1176 pmullw m5, %1, m12
1177 pmullw m6, %1, m13
1178 paddw m5, m0
1179 paddw m6, m1
1180 psraw m5, 6
1181 psraw m6, 6
1182 packuswb m5, m6
1183 movu [r0], m5
1184
1185 pmullw m5, %1, m14
1186 pmullw %1, m15
1187 paddw m5, m2
1188 paddw %1, m3
1189 psraw m5, 6
1190 psraw %1, 6
1191 packuswb m5, %1
1192 movu [r0 + 16], m5
1193 %endmacro
1194
1195 %macro INCREMENT 0 ; step the four accumulators and dst to the next row
1196 paddw m2, m10
1197 paddw m3, m11
1198 paddw m0, m8
1199 paddw m1, m9
1200 add r0, r1
1201 %endmacro
1202
1203 %assign x 0
1204 %rep 4
1205 pxor m7, m7 ; PROCESS clobbers m7, so re-zero it for the unpack
1206 movq m4, [r2 + 65 + x * 8] ; eight left-column pixels for rows 8x .. 8x+7
1207 punpcklbw m4, m7
1208 %assign y 0
1209 %rep 8
1210 %if y < 4
1211 pshuflw m7, m4, 0x55 * y
1212 pshufd m7, m7, 0x44
1213 %else
1214 pshufhw m7, m4, 0x55 * (y - 4)
1215 pshufd m7, m7, 0xEE
1216 %endif
1217 PROCESS m7
1218 %if x + y < 10 ; false only for x = 3, y = 7 (the last row), so no step after the final store
1219 INCREMENT
1220 %endif
1221 %assign y y+1
1222 %endrep
1223 %assign x x+1
1224 %endrep
1225 RET
1226
1227 %else ;end ARCH_X86_64, start ARCH_X86_32
1228 cglobal intra_pred_planar32, 3,3,8,0-(4*mmsize) ; 8-register variant: per-row steps are kept in four stack slots
1229 movd m3, [r2 + 33] ; topRight = above[32]
1230
1231 pxor m7, m7
1232 pand m3, [pw_00ff]
1233 pshuflw m3, m3, 0x00
1234 pshufd m3, m3, 0x44
1235
1236 pmullw m0, m3, [multiL] ; (x + 1) * topRight
1237 pmullw m1, m3, [multiH] ; (x + 1) * topRight
1238 pmullw m2, m3, [multiH2] ; (x + 1) * topRight
1239 pmullw m3, [multiH3] ; (x + 1) * topRight
1240
1241 movd m6, [r2 + 97] ; bottomLeft = left[32]
1242 pand m6, [pw_00ff]
1243 pshuflw m6, m6, 0x00
1244 pshufd m6, m6, 0x44
1245 mova m5, m6
1246 paddw m5, [pw_32] ; m5 = bottomLeft + rounding term
1247
1248 paddw m0, m5
1249 paddw m1, m5
1250 paddw m2, m5
1251 paddw m3, m5
1252
1253 movh m4, [r2 + 1]
1254 punpcklbw m4, m7
1255 psubw m5, m6, m4
1256 mova [rsp + 0 * mmsize], m5 ; per-row step for m0 (bottomLeft - above)
1257 pmullw m4, [pw_31]
1258 paddw m0, m4
1259 movh m4, [r2 + 9]
1260 punpcklbw m4, m7
1261 psubw m5, m6, m4
1262 mova [rsp + 1 * mmsize], m5 ; per-row step for m1
1263 pmullw m4, [pw_31]
1264 paddw m1, m4
1265 movh m4, [r2 + 17]
1266 punpcklbw m4, m7
1267 psubw m5, m6, m4
1268 mova [rsp + 2 * mmsize], m5 ; per-row step for m2
1269 pmullw m4, [pw_31]
1270 paddw m2, m4
1271 movh m4, [r2 + 25]
1272 punpcklbw m4, m7
1273 psubw m5, m6, m4
1274 mova [rsp + 3 * mmsize], m5 ; per-row step for m3
1275 pmullw m4, [pw_31]
1276 paddw m3, m4
1277 %macro PROCESS 1 ; %1 = register holding left[y] in every word; writes one 32-pixel row, clobbers m5, m6 and %1
1278 pmullw m5, %1, [pw_planar32_mul]
1279 pmullw m6, %1, [pw_planar32_mul + mmsize]
1280 paddw m5, m0
1281 paddw m6, m1
1282 psraw m5, 6
1283 psraw m6, 6
1284 packuswb m5, m6
1285 movu [r0], m5
1286 pmullw m5, %1, [pw_planar16_mul]
1287 pmullw %1, [pw_planar16_mul + mmsize]
1288 paddw m5, m2
1289 paddw %1, m3
1290 psraw m5, 6
1291 psraw %1, 6
1292 packuswb m5, %1
1293 movu [r0 + 16], m5
1294 %endmacro
1295
1296 %macro INCREMENT 0 ; step the four accumulators (from the stack slots) and dst to the next row
1297 paddw m0, [rsp + 0 * mmsize]
1298 paddw m1, [rsp + 1 * mmsize]
1299 paddw m2, [rsp + 2 * mmsize]
1300 paddw m3, [rsp + 3 * mmsize]
1301 add r0, r1
1302 %endmacro
1303
1304 %assign y 0
1305 %rep 4
1306 pxor m7, m7 ; PROCESS clobbers m7, so re-zero it for the unpack
1307 movq m4, [r2 + 65 + y * 8] ; eight left-column pixels for rows 8y .. 8y+7
1308 punpcklbw m4, m7
1309 %assign x 0
1310 %rep 8
1311 %if x < 4
1312 pshuflw m7, m4, 0x55 * x
1313 pshufd m7, m7, 0x44
1314 %else
1315 pshufhw m7, m4, 0x55 * (x - 4)
1316 pshufd m7, m7, 0xEE
1317 %endif
1318
1319 PROCESS m7
1320 %if x + y < 10 ; false only for y = 3, x = 7 (the last row), so no step after the final store
1321 INCREMENT
1322 %endif
1323 %assign x x+1
1324 %endrep
1325 %assign y y+1
1326 %endrep
1327 RET
1328
1329 %endif ; end ARCH_X86_32
1330
1331 %macro STORE_4x4 0 ; write the four dwords of m0 to four consecutive rows at r0; clobbers m0 and r1
1332 movd [r0], m0
1333 psrldq m0, 4
1334 movd [r0 + r1], m0
1335 psrldq m0, 4
1336 movd [r0 + r1 * 2], m0
1337 lea r1, [r1 * 3] ; r1 is now 3 * stride, not the stride
1338 psrldq m0, 4
1339 movd [r0 + r1], m0
1340 %endmacro
1341
1342 %macro TRANSPOSE_4x4 0 ; in: m0, m2 = four groups of four words; out: m0 = 16 bytes, 4x4 transposed; clobbers m1, m2
1343 pshufd m0, m0, 0xD8 ; dword order 0, 2, 1, 3
1344 pshufd m1, m2, 0xD8
1345 pshuflw m0, m0, 0xD8 ; word order 0, 2, 1, 3 within each half -> the two groups are interleaved
1346 pshuflw m1, m1, 0xD8
1347 pshufhw m0, m0, 0xD8
1348 pshufhw m1, m1, 0xD8
1349 mova m2, m0
1350 punpckldq m0, m1 ; output rows 0 and 1 as words
1351 punpckhdq m2, m1 ; output rows 2 and 3 as words
1352 packuswb m0, m2 ; saturate to bytes: one output row per dword
1353 %endmacro
1354
1355 ;-----------------------------------------------------------------------------------------
1356 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
1357 ;-----------------------------------------------------------------------------------------
1358 INIT_XMM sse2
1359 cglobal intra_pred_ang4_2, 3,5,1 ; 4x4 copy-only angular case; r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode
1360 lea r4, [r2 + 2] ; candidate source: src + 2
1361 add r2, 10 ; default source: src + 10
1362 cmp r3m, byte 34
1363 cmove r2, r4 ; dirMode == 34 reads from src + 2 instead
1364
1365 movh m0, [r2] ; eight bytes; each row below is the next 4-byte window, shifted by one byte
1366 movd [r0], m0
1367 psrldq m0, 1
1368 movd [r0 + r1], m0
1369 psrldq m0, 1
1370 movd [r0 + r1 * 2], m0
1371 lea r1, [r1 * 3] ; r1 = 3 * stride
1372 psrldq m0, 1
1373 movd [r0 + r1], m0
1374 RET
1375
1376 INIT_XMM sse2
1377 cglobal intra_pred_ang4_3, 3,3,5 ; 4x4 angular prediction from src[9..16]; r0 = dst, r1 = dstStride, r2 = src
1378 movh m3, [r2 + 9] ; [8 7 6 5 4 3 2 1]
1379 punpcklbw m3, m3 ; duplicate every byte ...
1380 psrldq m3, 1 ; ... then drop one: adjacent pixel pairs (1,2) (2,3) (3,4) ...
1381 movh m0, m3 ;[x x x x x x x x 5 4 4 3 3 2 2 1]
1382 psrldq m3, 2
1383 movh m1, m3 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
1384 psrldq m3, 2
1385 movh m2, m3 ;[x x x x x x x x 7 6 6 5 5 4 4 3]
1386 psrldq m3, 2 ;[x x x x x x x x 8 7 7 6 6 5 5 4]
1387
1388 pxor m4, m4
1389 punpcklbw m1, m4
1390 pmaddwd m1, [pw_ang_table + 20 * 16] ; 16-byte row k of pw_ang_table holds (32 - k, k) pairs: (32 - 20) * a + 20 * b
1391 punpcklbw m0, m4
1392 pmaddwd m0, [pw_ang_table + 26 * 16]
1393 packssdw m0, m1
1394 paddw m0, [pw_16]
1395 psraw m0, 5 ; (weighted sum + 16) >> 5
1396 punpcklbw m3, m4
1397 pmaddwd m3, [pw_ang_table + 8 * 16]
1398 punpcklbw m2, m4
1399 pmaddwd m2, [pw_ang_table + 14 * 16]
1400 packssdw m2, m3
1401 paddw m2, [pw_16]
1402 psraw m2, 5
1403
1404 TRANSPOSE_4x4
1405
1406 STORE_4x4
1407 RET
1408
; 4x4 angular prediction, mode 4.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 9)
; Weight rows used per line: 21, 10, 31, 20 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
cglobal intra_pred_ang4_4, 3,3,5
    ; adjacent-pair vectors (bytes listed high .. low)
    movh        m1, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
    psrldq      m1, 1
    movh        m0, m1                  ; [5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ; [6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ; [7 6 6 5 5 4 4 3]

    pxor        m4, m4

    ; lines 0 and 1
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 21 * 16]
    punpcklbw   m2, m4
    mova        m3, m2                  ; lines 1 and 2 share the same pairs
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3
    pmaddwd     m2, [pw_ang_table + 31 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1438
; 4x4 angular prediction, mode 5.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 9)
; Weight rows used per line: 17, 2, 19, 4 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
cglobal intra_pred_ang4_5, 3,3,5
    ; adjacent-pair vectors (bytes listed high .. low; only the low 8 bytes
    ; of each register are consumed below)
    movh        m3, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ; low half: [5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ; low half: [6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ; low half: [7 6 6 5 5 4 4 3]

    pxor        m1, m1

    ; lines 0 and 1
    punpcklbw   m0, m1
    pmaddwd     m0, [pw_ang_table + 17 * 16]
    punpcklbw   m2, m1
    mova        m4, m2                  ; lines 1 and 2 share the same pairs
    pmaddwd     m4, [pw_ang_table + 2 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3
    pmaddwd     m2, [pw_ang_table + 19 * 16]
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1468
; 4x4 angular prediction, mode 6.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 9)
; Weight rows used per line: 13, 26, 7, 20 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
cglobal intra_pred_ang4_6, 3,3,4
    ; adjacent-pair vectors (bytes listed high .. low)
    movh        m2, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ; [5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ; low half: [6 5 5 4 4 3 3 2]

    pxor        m1, m1

    ; lines 0 and 1 (same pairs, different weights)
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m0, [pw_ang_table + 13 * 16]
    pmaddwd     m3, [pw_ang_table + 26 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3 (same pairs, different weights)
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m2, [pw_ang_table + 7 * 16]
    pmaddwd     m3, [pw_ang_table + 20 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1496
; 4x4 angular prediction, mode 7.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 9)
; Weight rows used per line: 9, 18, 27, 4 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
cglobal intra_pred_ang4_7, 3,3,5
    ; adjacent-pair vectors (bytes listed high .. low)
    movh        m3, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ; [5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ; low half: [6 5 5 4 4 3 3 2]

    pxor        m1, m1

    ; lines 0..2 all start from the same pairs
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m4, m0
    pmaddwd     m0, [pw_ang_table + 9 * 16]
    pmaddwd     m4, [pw_ang_table + 18 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; line 2 (first pairs) and line 3 (pairs advanced by one sample)
    pmaddwd     m2, [pw_ang_table + 27 * 16]
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1524
; 4x4 angular prediction, mode 8.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 9)
; All four lines use the same adjacent pairs with weight rows 5, 10, 15, 20
; of pw_ang_table; result is rounded with pw_16, shifted by 5, transposed
; and stored.
cglobal intra_pred_ang4_8, 3,3,5
    movh        m0, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ; low half: [5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; widen the pairs to words
    mova        m2, m0
    mova        m3, m0
    mova        m4, m0

    ; lines 0 and 1
    pmaddwd     m0, [pw_ang_table + 5 * 16]
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3
    pmaddwd     m2, [pw_ang_table + 15 * 16]
    pmaddwd     m4, [pw_ang_table + 20 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1550
; 4x4 angular prediction, mode 9.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 9)
; All four lines use the same adjacent pairs with weight rows 2, 4, 6, 8
; of pw_ang_table; result is rounded with pw_16, shifted by 5, transposed
; and stored.
cglobal intra_pred_ang4_9, 3,3,5
    movh        m0, [r2 + 9]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ; low half: [5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; widen the pairs to words
    mova        m2, m0
    mova        m3, m0
    mova        m4, m0

    ; lines 0 and 1
    pmaddwd     m0, [pw_ang_table + 2 * 16]
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3
    pmaddwd     m2, [pw_ang_table + 6 * 16]
    pmaddwd     m4, [pw_ang_table + 8 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1576
; 4x4 prediction, mode 10: every row r is filled with the single reference
; byte src[9 + r]. When bFilter (r4m) is non-zero, row 0 is additionally
; adjusted by ((src[1 + x] - src[0]) >> 1) per column x and clamped to a byte.
;   r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter
cglobal intra_pred_ang4_10, 3,5,4
    movd        m0, [r2 + 9]            ; [4 3 2 1] (four bytes loaded)
    punpcklbw   m0, m0
    punpcklwd   m0, m0                  ; each byte replicated four times
    pshufd      m1, m0, 1               ; row 1 pattern in the low dword
    movhlps     m2, m0                  ; row 2 pattern in the low dword
    pshufd      m3, m0, 3               ; row 3 pattern in the low dword
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m3
    cmp         r4m, byte 0
    jz          .quit                   ; unfiltered: row 0 is m0 as is

    ; edge filter for row 0
    pxor        m3, m3
    punpcklbw   m0, m3                  ; low 4 words = row-0 value
    movh        m1, [r2]                ; src[0..7]
    punpcklbw   m1, m3
    pshuflw     m2, m1, 0x00            ; low 4 words = src[0]
    psrldq      m1, 2                   ; words = src[1], src[2], ...
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

.quit:
    movd        [r0], m0                ; row 0
    RET
1606
; 4x4 angular prediction, mode 11.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run is A = src[0] followed by src[9..12]. All four lines use the
; same adjacent pairs with weight rows 30, 28, 26, 24 of pw_ang_table; the
; result is rounded with pw_16, shifted by 5, transposed and stored.
;
; A is fetched with an in-bounds 4-byte load from [r2] and isolated by a
; shift pair; no byte in front of src is touched.
cglobal intra_pred_ang4_11, 3,3,5
    movd        m0, [r2]                ; [x x x A]
    movd        m1, [r2 + 9]            ; [4 3 2 1]
    pslldq      m0, 15                  ; push A to the top byte, drop the rest
    punpcklbw   m1, m1                  ; [4 4 3 3 2 2 1 1]
    psrldq      m0, 15                  ; [0 ... 0 A]
    pslldq      m1, 1                   ; [4 4 3 3 2 2 1 1 0]
    por         m0, m1                  ; [4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; words of [4 3 3 2 2 1 1 A]
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2

    ; lines 0 and 1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m0, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3
    pmaddwd     m4, [pw_ang_table + 24 * 16]
    pmaddwd     m2, [pw_ang_table + 26 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1634
; 4x4 angular prediction, mode 12.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run is A = src[0] followed by src[9..12]. All four lines use the
; same adjacent pairs with weight rows 27, 22, 17, 12 of pw_ang_table; the
; result is rounded with pw_16, shifted by 5, transposed and stored.
;
; A is fetched with an in-bounds 4-byte load from [r2] and isolated by a
; shift pair; no byte in front of src is touched.
cglobal intra_pred_ang4_12, 3,3,5
    movd        m0, [r2]                ; [x x x A]
    movd        m1, [r2 + 9]            ; [4 3 2 1]
    pslldq      m0, 15                  ; push A to the top byte, drop the rest
    punpcklbw   m1, m1                  ; [4 4 3 3 2 2 1 1]
    psrldq      m0, 15                  ; [0 ... 0 A]
    pslldq      m1, 1                   ; [4 4 3 3 2 2 1 1 0]
    por         m0, m1                  ; [4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; words of [4 3 3 2 2 1 1 A]
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2

    ; lines 0 and 1
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    pmaddwd     m0, [pw_ang_table + 27 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3
    pmaddwd     m4, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 17 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1662
; 4x4 angular prediction, mode 24.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run is A = src[0] followed by src[1..4]. All four rows use the
; same adjacent pairs with weight rows 27, 22, 17, 12 of pw_ang_table; the
; result is rounded with pw_16, shifted by 5, packed and stored row by row
; (no transpose).
;
; A is fetched with an in-bounds 4-byte load from [r2] and isolated by a
; shift pair; no byte in front of src is touched.
cglobal intra_pred_ang4_24, 3,3,5
    movd        m0, [r2]                ; [x x x A]
    movd        m1, [r2 + 1]            ; [4 3 2 1]
    pslldq      m0, 15                  ; push A to the top byte, drop the rest
    punpcklbw   m1, m1                  ; [4 4 3 3 2 2 1 1]
    psrldq      m0, 15                  ; [0 ... 0 A]
    pslldq      m1, 1                   ; [4 4 3 3 2 2 1 1 0]
    por         m0, m1                  ; [4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; words of [4 3 3 2 2 1 1 A]
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2

    ; rows 0 and 1
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    pmaddwd     m0, [pw_ang_table + 27 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3
    pmaddwd     m4, [pw_ang_table + 12 * 16]
    pmaddwd     m2, [pw_ang_table + 17 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
1689
; 4x4 angular prediction, mode 13.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): B = src[4], A = src[0], then src[9..12].
; Weight rows used per line: 23, 14, 5, 28 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
; NOTE(review): the first load starts at [r2 - 1], i.e. one byte before src;
; only its second byte (src[0]) is used.
cglobal intra_pred_ang4_13, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m1, [r2 - 1]            ; [x x A x]
    movd        m0, [r2 + 3]            ; [x x B x]
    movd        m2, [r2 + 9]            ; [4 3 2 1]
    punpcklbw   m0, m1                  ; [x x x x A B x x]
    punpckldq   m0, m2                  ; [4 3 2 1 A B x x]
    psrldq      m0, 2                   ; [x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    movh        m3, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m1, m1

    ; lines 0 and 1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m4, m0
    pmaddwd     m0, [pw_ang_table + 23 * 16]
    pmaddwd     m4, [pw_ang_table + 14 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; line 2 (pairs from A) and line 3 (pairs from B)
    pmaddwd     m2, [pw_ang_table + 5 * 16]
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1722
; 4x4 angular prediction, mode 14.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): B = src[2], A = src[0], then src[9..12].
; Weight rows used per line: 19, 6, 25, 12 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
; NOTE(review): the first load starts at [r2 - 1], i.e. one byte before src;
; only its second byte (src[0]) is used.
cglobal intra_pred_ang4_14, 3,3,4
    ; assemble the reference bytes (listed high .. low)
    movd        m1, [r2 - 1]            ; [x x A x]
    movd        m0, [r2 + 1]            ; [x x B x]
    punpcklbw   m0, m1                  ; [A B x x]
    movd        m1, [r2 + 9]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B x x]
    psrldq      m0, 2                   ; [x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ; [x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m1, m1

    ; lines 0 and 1 (pairs starting at A)
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m0, [pw_ang_table + 19 * 16]
    pmaddwd     m3, [pw_ang_table + 6 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; lines 2 and 3 (pairs starting at B)
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m2, [pw_ang_table + 25 * 16]
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1755
; 4x4 angular prediction, mode 15.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): C = src[4], B = src[2], A = src[0],
; then src[9..12].
; Weight rows used per line: 15, 30, 13, 28 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
cglobal intra_pred_ang4_15, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m0, [r2]                ; [x x x A]
    movd        m1, [r2 + 2]            ; [x x x B]
    punpcklbw   m1, m0                  ; [x x A B]
    movd        m0, [r2 + 3]            ; [x x C x]
    punpcklwd   m0, m1                  ; [A B C x]
    movd        m1, [r2 + 9]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B C x]
    psrldq      m0, 1                   ; [x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ; [x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ; [2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m4, m4

    ; line 0 (pairs from A) and line 1 (pairs from B)
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 15 * 16]
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; line 2 (pairs from B) and line 3 (pairs from C)
    pmaddwd     m2, [pw_ang_table + 13 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 28 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1792
; 4x4 angular prediction, mode 16.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): C = src[3], B = src[2], A = src[0],
; then src[9..12].
; Weight rows used per line: 11, 22, 1, 12 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
cglobal intra_pred_ang4_16, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m2, [r2]                ; [x x x A]
    movd        m1, [r2 + 2]            ; [x x x B]
    punpcklbw   m1, m2                  ; [x x A B]
    movd        m0, [r2 + 2]            ; [x x C x]
    punpcklwd   m0, m1                  ; [A B C x]
    movd        m1, [r2 + 9]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B C x]
    psrldq      m0, 1                   ; [x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ; [x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ; [2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m4, m4

    ; line 0 (pairs from A) and line 1 (pairs from B)
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 11 * 16]
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; line 2 (pairs from B) and line 3 (pairs from C)
    pmaddwd     m2, [pw_ang_table + 1 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 12 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1829
; 4x4 angular prediction, mode 17.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): D = src[4], C = src[2], B = src[1],
; A = src[0], then src[9..12].
; Weight rows used per line: 6, 12, 18, 24 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, transposed and stored.
cglobal intra_pred_ang4_17, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m2, [r2]                ; [x x x A]
    movd        m3, [r2 + 1]            ; [x x x B]
    movd        m4, [r2 + 2]            ; [x x x C]
    movd        m0, [r2 + 4]            ; [x x x D]
    punpcklbw   m3, m2                  ; [x x A B]
    punpcklbw   m0, m4                  ; [x x C D]
    punpcklwd   m0, m3                  ; [A B C D]
    movd        m1, [r2 + 9]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ; [4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ; [1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ; [2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m3, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m4, m4

    ; line 0 (pairs from A) and line 1 (pairs from B)
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; line 2 (pairs from C) and line 3 (pairs from D)
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 18 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 24 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1869
; 4x4 prediction, mode 18 (pure copy along the diagonal, no interpolation).
;   r0 = dst, r1 = dstStride, r2 = src
; An 8-byte run [src[11] src[10] src[9] src[0] src[1] src[2] src[3] src[4]]
; (low .. high) is built; row 3 takes bytes 0..3 and each row above starts
; one byte later, so row 0 = src[0..3].
cglobal intra_pred_ang4_18, 3,4,2
    ; r3d = src[11], src[10], src[9], src[0] from low byte to high byte
    mov         r3d, [r2 + 8]
    mov         r3b, byte [r2]          ; replace src[8] with src[0]
    bswap       r3d
    movd        m0, r3d

    movd        m1, [r2 + 1]            ; src[1..4]
    punpckldq   m0, m1                  ; append them above the reversed bytes

    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0           ; row 3
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0       ; row 2
    psrldq      m0, 1
    movd        [r0 + r1], m0           ; row 1
    psrldq      m0, 1
    movd        [r0], m0                ; row 0
    RET
1887
; 4x4 angular prediction, mode 19.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): D = src[12], C = src[10], B = src[9],
; A = src[0], then src[1..4].
; Weight rows used per row: 6, 12, 18, 24 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_19, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m2, [r2]                ; [x x x A]
    movd        m3, [r2 + 9]            ; [x x x B]
    movd        m4, [r2 + 10]           ; [x x x C]
    movd        m0, [r2 + 12]           ; [x x x D]
    punpcklbw   m3, m2                  ; [x x A B]
    punpcklbw   m0, m4                  ; [x x C D]
    punpcklwd   m0, m3                  ; [A B C D]
    movd        m1, [r2 + 1]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ; [4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ; [1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ; [2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m3, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m4, m4

    ; row 0 (pairs from A) and row 1 (pairs from B)
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; row 2 (pairs from C) and row 3 (pairs from D)
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 18 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 24 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
1926
; 4x4 angular prediction, mode 20.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): C = src[11], B = src[10], A = src[0],
; then src[1..4].
; Weight rows used per row: 11, 22, 1, 12 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_20, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m2, [r2]                ; [x x x A]
    movd        m1, [r2 + 10]           ; [x x x B]
    punpcklbw   m1, m2                  ; [x x A B]
    movd        m0, [r2 + 10]           ; [x x C x]
    punpcklwd   m0, m1                  ; [A B C x]
    movd        m1, [r2 + 1]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B C x]
    psrldq      m0, 1                   ; [x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ; [x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ; [2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m4, m4

    ; row 0 (pairs from A) and row 1 (pairs from B)
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 11 * 16]
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 22 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; row 2 (pairs from B) and row 3 (pairs from C)
    pmaddwd     m2, [pw_ang_table + 1 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 12 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
1962
; 4x4 angular prediction, mode 21.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): C = src[12], B = src[10], A = src[0],
; then src[1..4].
; Weight rows used per row: 15, 30, 13, 28 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_21, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m0, [r2]                ; [x x x A]
    movd        m1, [r2 + 10]           ; [x x x B]
    punpcklbw   m1, m0                  ; [x x A B]
    movd        m0, [r2 + 11]           ; [x x C x]
    punpcklwd   m0, m1                  ; [A B C x]
    movd        m1, [r2 + 1]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B C x]
    psrldq      m0, 1                   ; [x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ; [x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ; [2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m4, m4

    ; row 0 (pairs from A) and row 1 (pairs from B)
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 15 * 16]
    punpcklbw   m2, m4
    mova        m3, m2
    pmaddwd     m3, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; row 2 (pairs from B) and row 3 (pairs from C)
    pmaddwd     m2, [pw_ang_table + 13 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 28 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
1998
; 4x4 angular prediction, mode 22.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): B = src[10], A = src[0], then src[1..4].
; Weight rows used per row: 19, 6, 25, 12 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
; NOTE(review): the first load starts at [r2 - 1], i.e. one byte before src;
; only its second byte (src[0]) is used.
cglobal intra_pred_ang4_22, 3,3,4
    ; assemble the reference bytes (listed high .. low)
    movd        m1, [r2 - 1]            ; [x x A x]
    movd        m0, [r2 + 9]            ; [x x B x]
    punpcklbw   m0, m1                  ; [A B x x]
    movd        m1, [r2 + 1]            ; [4 3 2 1]
    punpckldq   m0, m1                  ; [4 3 2 1 A B x x]
    psrldq      m0, 2                   ; [x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ; [x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ; [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m1, m1

    ; rows 0 and 1 (pairs starting at A)
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m0, [pw_ang_table + 19 * 16]
    pmaddwd     m3, [pw_ang_table + 6 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3 (pairs starting at B)
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m2, [pw_ang_table + 25 * 16]
    pmaddwd     m3, [pw_ang_table + 12 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2030
; 4x4 angular prediction, mode 23.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run (low .. high): B = src[12], A = src[0], then src[1..4].
; Weight rows used per row: 23, 14, 5, 28 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
; NOTE(review): the first load starts at [r2 - 1], i.e. one byte before src;
; only its second byte (src[0]) is used.
cglobal intra_pred_ang4_23, 3,3,5
    ; assemble the reference bytes (listed high .. low)
    movd        m1, [r2 - 1]            ; [x x A x]
    movd        m0, [r2 + 11]           ; [x x B x]
    movd        m2, [r2 + 1]            ; [4 3 2 1]
    punpcklbw   m0, m1                  ; [x x x x A B x x]
    punpckldq   m0, m2                  ; [4 3 2 1 A B x x]
    psrldq      m0, 2                   ; [x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    mova        m3, m0                  ; low half: [3 2 2 1 1 A A B]
    psrldq      m0, 2                   ; low half: [4 3 3 2 2 1 1 A]

    pxor        m1, m1

    ; rows 0 and 1
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m4, m0
    pmaddwd     m0, [pw_ang_table + 23 * 16]
    pmaddwd     m4, [pw_ang_table + 14 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; row 2 (pairs from A) and row 3 (pairs from B)
    pmaddwd     m2, [pw_ang_table + 5 * 16]
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2062
; 4x4 angular prediction, mode 25.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference run is A = src[0] followed by src[1..4]. All four rows use the
; same adjacent pairs with weight rows 30, 28, 26, 24 of pw_ang_table; the
; result is rounded with pw_16, shifted by 5, packed and stored row by row
; (no transpose).
;
; A is fetched with an in-bounds 4-byte load from [r2] and isolated by a
; shift pair; no byte in front of src is touched.
cglobal intra_pred_ang4_25, 3,3,5
    movd        m0, [r2]                ; [x x x A]
    movd        m1, [r2 + 1]            ; [4 3 2 1]
    pslldq      m0, 15                  ; push A to the top byte, drop the rest
    punpcklbw   m1, m1                  ; [4 4 3 3 2 2 1 1]
    psrldq      m0, 15                  ; [0 ... 0 A]
    pslldq      m1, 1                   ; [4 4 3 3 2 2 1 1 0]
    por         m0, m1                  ; [4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; words of [4 3 3 2 2 1 1 A]
    mova        m2, m0
    mova        m3, m0
    mova        m4, m2

    ; rows 0 and 1
    pmaddwd     m3, [pw_ang_table + 28 * 16]
    pmaddwd     m0, [pw_ang_table + 30 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3
    pmaddwd     m4, [pw_ang_table + 24 * 16]
    pmaddwd     m2, [pw_ang_table + 26 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2089
; 4x4 prediction, mode 26: every row is a copy of src[1..4]. When bFilter
; (r4m) is non-zero, column 0 is then rewritten per row r as
; clamp(src[1] + ((src[9 + r] - src[0]) >> 1)).
;   r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter
cglobal intra_pred_ang4_26, 3,4,4
    movd        m0, [r2 + 1]            ; [4 3 2 1] (four bytes loaded)

    ; plain copy into all four rows
    lea         r3, [r1 * 3]
    movd        [r0], m0
    movd        [r0 + r1], m0
    movd        [r0 + r1 * 2], m0
    movd        [r0 + r3], m0

    ; optional edge filter on the first column
    cmp         r4m, byte 0
    jz          .quit

    pxor        m3, m3
    punpcklbw   m0, m3
    pshuflw     m0, m0, 0x00            ; low 4 words = src[1]
    movd        m2, [r2]
    punpcklbw   m2, m3
    pshuflw     m2, m2, 0x00            ; low 4 words = src[0]
    movd        m1, [r2 + 9]
    punpcklbw   m1, m3                  ; words = src[9..12]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0                  ; four filtered bytes, one per row

    ; scatter the filtered bytes down column 0 (r2 is reused as scratch)
    movd        r2, m0
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
    shr         r2, 8
    mov         [r0 + r3], r2b

.quit:
    RET
2128
; 4x4 angular prediction, mode 27.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 1)
; All four rows use the same adjacent pairs with weight rows 2, 4, 6, 8 of
; pw_ang_table; result is rounded with pw_16, shifted by 5, packed and
; stored row by row.
cglobal intra_pred_ang4_27, 3,3,5
    movh        m0, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ; low half: [5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; widen the pairs to words
    mova        m2, m0
    mova        m3, m0
    mova        m4, m0

    ; rows 0 and 1
    pmaddwd     m0, [pw_ang_table + 2 * 16]
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3
    pmaddwd     m2, [pw_ang_table + 6 * 16]
    pmaddwd     m4, [pw_ang_table + 8 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2153
; 4x4 angular prediction, mode 28.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 1)
; All four rows use the same adjacent pairs with weight rows 5, 10, 15, 20
; of pw_ang_table; result is rounded with pw_16, shifted by 5, packed and
; stored row by row.
cglobal intra_pred_ang4_28, 3,3,5
    movh        m0, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ; low half: [5 4 4 3 3 2 2 1]

    pxor        m1, m1
    punpcklbw   m0, m1                  ; widen the pairs to words
    mova        m2, m0
    mova        m3, m0
    mova        m4, m0

    ; rows 0 and 1
    pmaddwd     m0, [pw_ang_table + 5 * 16]
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3
    pmaddwd     m2, [pw_ang_table + 15 * 16]
    pmaddwd     m4, [pw_ang_table + 20 * 16]
    packssdw    m2, m4
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2178
; 4x4 angular prediction, mode 29.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 1)
; Weight rows used per row: 9, 18, 27, 4 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_29, 3,3,5
    ; adjacent-pair vectors (bytes listed high .. low)
    movh        m3, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ; [5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ; low half: [6 5 5 4 4 3 3 2]

    pxor        m1, m1

    ; rows 0..2 all start from the same pairs
    punpcklbw   m0, m1
    mova        m2, m0
    mova        m4, m0
    pmaddwd     m0, [pw_ang_table + 9 * 16]
    pmaddwd     m4, [pw_ang_table + 18 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; row 2 (first pairs) and row 3 (pairs advanced by one sample)
    pmaddwd     m2, [pw_ang_table + 27 * 16]
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2205
; 4x4 angular prediction, mode 30.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 1)
; Weight rows used per row: 13, 26, 7, 20 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_30, 3,3,4
    ; adjacent-pair vectors (bytes listed high .. low)
    movh        m2, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ; [5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ; low half: [6 5 5 4 4 3 3 2]

    pxor        m1, m1

    ; rows 0 and 1 (same pairs, different weights)
    punpcklbw   m0, m1
    mova        m3, m0
    pmaddwd     m0, [pw_ang_table + 13 * 16]
    pmaddwd     m3, [pw_ang_table + 26 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3 (same pairs, different weights)
    punpcklbw   m2, m1
    mova        m3, m2
    pmaddwd     m2, [pw_ang_table + 7 * 16]
    pmaddwd     m3, [pw_ang_table + 20 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2232
; 4x4 angular prediction, mode 31.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 1)
; Weight rows used per row: 17, 2, 19, 4 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_31, 3,3,5
    ; adjacent-pair vectors (bytes listed high .. low; only the low 8 bytes
    ; of each register are consumed below)
    movh        m3, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ; low half: [5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ; low half: [6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ; low half: [7 6 6 5 5 4 4 3]

    pxor        m1, m1

    ; rows 0 and 1
    punpcklbw   m0, m1
    pmaddwd     m0, [pw_ang_table + 17 * 16]
    punpcklbw   m2, m1
    mova        m4, m2                  ; rows 1 and 2 share the same pairs
    pmaddwd     m4, [pw_ang_table + 2 * 16]
    packssdw    m0, m4
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3
    pmaddwd     m2, [pw_ang_table + 19 * 16]
    punpcklbw   m3, m1
    pmaddwd     m3, [pw_ang_table + 4 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2261
; 4x4 angular prediction, mode 32.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 1)
; Weight rows used per row: 21, 10, 31, 20 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_32, 3,3,5
    ; adjacent-pair vectors (bytes listed high .. low)
    movh        m1, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
    psrldq      m1, 1
    movh        m0, m1                  ; [5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ; [6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ; low half: [7 6 6 5 5 4 4 3]

    pxor        m4, m4

    ; rows 0 and 1
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 21 * 16]
    punpcklbw   m2, m4
    mova        m3, m2                  ; rows 1 and 2 share the same pairs
    pmaddwd     m3, [pw_ang_table + 10 * 16]
    packssdw    m0, m3
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3
    pmaddwd     m2, [pw_ang_table + 31 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    packssdw    m2, m1
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2290
; 4x4 angular prediction, mode 33.
;   r0 = dst, r1 = dstStride, r2 = src (reference bytes read from src + 1)
; Weight rows used per row: 26, 20, 14, 8 of pw_ang_table; result is
; rounded with pw_16, shifted by 5, packed and stored row by row.
cglobal intra_pred_ang4_33, 3,3,5
    ; adjacent-pair vectors (bytes listed high .. low)
    movh        m3, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ; [5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    movh        m1, m3                  ; [6 5 5 4 4 3 3 2]
    psrldq      m3, 2
    movh        m2, m3                  ; [7 6 6 5 5 4 4 3]
    psrldq      m3, 2                   ; low half: [8 7 7 6 6 5 5 4]

    pxor        m4, m4                  ; zero for byte -> word widening

    ; rows 0 and 1
    punpcklbw   m0, m4
    pmaddwd     m0, [pw_ang_table + 26 * 16]
    punpcklbw   m1, m4
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    packssdw    m0, m1
    paddw       m0, [pw_16]
    psraw       m0, 5

    ; rows 2 and 3
    punpcklbw   m2, m4
    pmaddwd     m2, [pw_ang_table + 14 * 16]
    punpcklbw   m3, m4
    pmaddwd     m3, [pw_ang_table + 8 * 16]
    packssdw    m2, m3
    paddw       m2, [pw_16]
    psraw       m2, 5
    packuswb    m0, m2                  ; 16 result bytes, row-major

    STORE_4x4
    RET
2321
2322 ;---------------------------------------------------------------------------------------------
2323 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
2324 ;---------------------------------------------------------------------------------------------
INIT_XMM sse4
; 4x4 DC prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter
; dc = rounded mean of srcPix[1..4] and srcPix[9..12]; the block is filled
; with dc. When bFilter is non-zero the first row and first column are
; blended with their neighbouring reference samples.
cglobal intra_pred_dc4, 5,5,3
    inc         r2                      ; r2 -> srcPix[1]
    pxor        m0, m0
    movd        m1, [r2]                ; srcPix[1..4]
    movd        m2, [r2 + 8]            ; srcPix[9..12]
    punpckldq   m1, m2
    psadbw      m1, m0                  ; m1 = sum of the eight samples

    ; flags are consumed by the jz below; nothing in between alters them
    test        r4d, r4d

    pmulhrsw    m1, [pw_4096]           ; (sum + 4) >> 3, given pw_4096 = 4096
    movd        r4d, m1                 ; r4d = dc
    pshufb      m1, m0                  ; broadcast the dc byte

    ; fill the 4x4 block with dc
    lea         r3, [r1 * 3]
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    movd        [r0 + r3], m1

    jz          .end                    ; bFilter == 0: done

    ; edge filter
    lea         r3d, [r4d * 2 + 2]      ; r3d = 2 * dc + 2
    add         r4d, r3d                ; r4d = 3 * dc + 2
    movd        m1, r4d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0               ; every word = 3 * dc + 2

    ; first row and (in the upper four words) the first column
    movd        m2, [r2]                ; srcPix[1..4]
    movd        m0, [r2 + 9]            ; srcPix[10..13]
    punpckldq   m2, m0
    pmovzxbw    m2, m2
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2                ; byte 0 is replaced just below

    ; corner: (srcPix[9] + srcPix[1] + 2 * dc + 2) >> 2
    movzx       r4d, byte [r2 + 8]
    add         r3d, r4d
    movzx       r4d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; first column, rows 1..3
    add         r0, r1
    pextrb      [r0], m2, 4
    pextrb      [r0 + r1], m2, 5
    pextrb      [r0 + r1 * 2], m2, 6

.end:
    RET
2381
2382 ;---------------------------------------------------------------------------------------------
2383 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
2384 ;---------------------------------------------------------------------------------------------
INIT_XMM sse4
; 8x8 DC prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter
; dc = (sum(srcPix[1..8]) + sum(srcPix[17..24]) + 8) >> 4; the block is
; filled with dc. When bFilter is non-zero the first row and first column
; are blended with their neighbouring reference samples.
cglobal intra_pred_dc8, 5, 7, 3
    lea         r3, [r2 + 17]           ; r3 -> srcPix[17]
    inc         r2                      ; r2 -> srcPix[1]
    pxor        m0, m0
    movh        m1, [r2]
    movh        m2, [r3]
    punpcklqdq  m1, m2
    psadbw      m1, m0                  ; two partial sums
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; total in the low word

    movd        r5d, m1
    add         r5d, 8
    shr         r5d, 4                  ; r5d = dc
    movd        m1, r5d
    pshufb      m1, m0                  ; broadcast the dc byte

    ; flags are consumed by the jz below; only mov/lea/stores follow
    test        r4d, r4d

    ; fill the 8x8 block with dc, two rows per step (r6 keeps the origin)
    mov         r6, r0
%rep 3
    movh        [r0], m1
    movh        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
%endrep
    movh        [r0], m1
    movh        [r0 + r1], m1

    jz          .end                    ; bFilter == 0: done

    ; edge filter
    lea         r4d, [r5d * 2 + 2]      ; r4d = 2 * dc + 2
    add         r5d, r4d                ; r5d = 3 * dc + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0               ; every word = 3 * dc + 2

    ; first row: (srcPix[1 + x] + 3 * dc + 2) >> 2
    pmovzxbw    m2, [r2]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movh        [r6], m2

    ; corner: (srcPix[17] + srcPix[1] + 2 * dc + 2) >> 2
    movzx       r5d, byte [r3]
    add         r4d, r5d
    movzx       r3d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r6], r3b

    ; first column, rows 1..7, from srcPix[18..24]
    add         r6, r1                  ; r6 -> row 1
    pmovzxbw    m2, [r2 + 17]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    pextrb      [r6], m2, 0             ; row 1
    pextrb      [r6 + r1], m2, 1        ; row 2
    pextrb      [r6 + 2 * r1], m2, 2    ; row 3
    lea         r6, [r6 + r1 * 2]       ; r6 -> row 3
    pextrb      [r6 + r1], m2, 3        ; row 4
    pextrb      [r6 + r1 * 2], m2, 4    ; row 5
    pextrb      [r6 + r1 * 4], m2, 6    ; row 7
    lea         r1, [r1 * 3]
    pextrb      [r6 + r1], m2, 5        ; row 6

.end:
    RET
2460
2461 ;--------------------------------------------------------------------------------------------
2462 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
2463 ;--------------------------------------------------------------------------------------------
INIT_XMM sse4
; 16x16 DC prediction.
;   r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter
; dc = (sum(srcPix[1..16]) + sum(srcPix[33..48]) + 16) >> 5; the block is
; filled with dc. When bFilter is non-zero the first row and first column
; are blended with their neighbouring reference samples.
cglobal intra_pred_dc16, 5, 7, 4
    lea         r3, [r2 + 33]           ; r3 -> srcPix[33]
    inc         r2                      ; r2 -> srcPix[1]
    pxor        m0, m0
    movu        m1, [r2]
    movu        m2, [r3]
    psadbw      m1, m0
    psadbw      m2, m0
    paddw       m1, m2
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; total in the low word

    movd        r5d, m1
    add         r5d, 16
    shr         r5d, 5                  ; r5d = dc
    movd        m1, r5d
    pshufb      m1, m0                  ; broadcast the dc byte

    ; flags are consumed by the jz below; only mov/lea/stores follow
    test        r4d, r4d

    ; fill the 16x16 block with dc, two rows per step (r6 keeps the origin)
    mov         r6, r0
%rep 7
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
%endrep
    movu        [r0], m1
    movu        [r0 + r1], m1

    jz          .end                    ; bFilter == 0: done

    ; edge filter
    lea         r4d, [r5d * 2 + 2]      ; r4d = 2 * dc + 2
    add         r5d, r4d                ; r5d = 3 * dc + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    pshufd      m1, m1, 0               ; every word = 3 * dc + 2

    ; first row, columns 0..7 then 8..15
    pmovzxbw    m2, [r2]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movh        [r6], m2
    pmovzxbw    m3, [r2 + 8]
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3
    movh        [r6 + 8], m3

    ; corner: (srcPix[33] + srcPix[1] + 2 * dc + 2) >> 2
    movzx       r5d, byte [r3]
    add         r4d, r5d
    movzx       r3d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r6], r3b

    ; first column, rows 1..8, from srcPix[34..41]
    add         r6, r1                  ; r6 -> row 1
    pmovzxbw    m2, [r2 + 33]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    pextrb      [r6], m2, 0             ; row 1
    pextrb      [r6 + r1], m2, 1        ; row 2
    pextrb      [r6 + r1 * 2], m2, 2    ; row 3
    lea         r6, [r6 + r1 * 2]       ; r6 -> row 3
    pextrb      [r6 + r1], m2, 3        ; row 4
    pextrb      [r6 + r1 * 2], m2, 4    ; row 5
    lea         r6, [r6 + r1 * 2]       ; r6 -> row 5
    pextrb      [r6 + r1], m2, 5        ; row 6
    pextrb      [r6 + r1 * 2], m2, 6    ; row 7
    lea         r6, [r6 + r1 * 2]       ; r6 -> row 7
    pextrb      [r6 + r1], m2, 7        ; row 8

    ; first column, rows 9..15, from srcPix[42..48]
    pmovzxbw    m3, [r2 + 41]
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3
    pextrb      [r6 + r1 * 2], m3, 0    ; row 9
    lea         r6, [r6 + r1 * 2]       ; r6 -> row 9
    pextrb      [r6 + r1], m3, 1        ; row 10
    pextrb      [r6 + r1 * 2], m3, 2    ; row 11
    lea         r6, [r6 + r1 * 2]       ; r6 -> row 11
    pextrb      [r6 + r1], m3, 3        ; row 12
    pextrb      [r6 + r1 * 2], m3, 4    ; row 13
    lea         r6, [r6 + r1 * 2]       ; r6 -> row 13
    pextrb      [r6 + r1], m3, 5        ; row 14
    pextrb      [r6 + r1 * 2], m3, 6    ; row 15

.end:
    RET
2574
2575 ;---------------------------------------------------------------------------------------------
2576 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
2577 ;---------------------------------------------------------------------------------------------
INIT_XMM sse4
; 32x32 DC prediction (no edge filter in this routine).
;   r0 = dst, r1 = dstStride, r2 = srcPix
; dc = (sum(srcPix[1..32]) + sum(srcPix[65..96]) + 32) >> 6; the block is
; filled with dc.
cglobal intra_pred_dc32, 3, 5, 5
    lea         r3, [r2 + 65]           ; r3 -> srcPix[65]
    inc         r2                      ; r2 -> srcPix[1]
    pxor        m0, m0
    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r3]
    movu        m4, [r3 + 16]
    psadbw      m1, m0
    psadbw      m2, m0
    psadbw      m3, m0
    psadbw      m4, m0
    paddw       m1, m2
    paddw       m3, m4
    paddw       m1, m3
    pshufd      m2, m1, 2
    paddw       m1, m2                  ; total in the low word

    movd        r4d, m1
    add         r4d, 32
    shr         r4d, 6                  ; r4d = dc
    movd        m1, r4d
    pshufb      m1, m0                  ; broadcast the dc byte

    ; fill 32 rows of 32 bytes, two rows per step
%rep 16
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16], m1
    lea         r0, [r0 + 2 * r1]
%endrep

    RET
2648
2649 ;---------------------------------------------------------------------------------------------
2650 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
2651 ;---------------------------------------------------------------------------------------------
; 32x32 DC prediction, AVX2 version (only assembled when ARCH_X86_64 == 1).
;   r0 = dst, r1 = dstStride, r2 = srcPix (argument order of the prototype above)
; Sums the 32 bytes at srcPix + 1 and the 32 bytes at srcPix + 65, rounds with
; pmulhrsw against [pw_512] and stores the broadcast byte to 32 rows of 32 bytes.
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal intra_pred_dc32, 3, 4, 3
    lea         r3, [r1 * 3]            ; r3 = 3 * dstStride
    pxor        m0, m0
    movu        m1, [r2 + 1]
    movu        m2, [r2 + 65]
    psadbw      m1, m0                  ; byte sums, one partial sum per 64-bit lane
    psadbw      m2, m0
    paddw       m1, m2
    vextracti128 xm2, m1, 1             ; fold the upper 128 bits onto the lower
    paddw       m1, m2
    pshufd      m2, m1, 2               ; fold the upper 64 bits onto the lower
    paddw       m1, m2

    pmulhrsw    m1, [pw_512]            ; sum = (sum + 32) / 64
    vpbroadcastb m1, xm1                ; m1 = byte [dc_val ...]

    ; rows 0..27: four rows per group, then step dst down by four rows
%rep 7
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    lea         r0, [r0 + 4 * r1]
%endrep
    ; rows 28..31: last group, no pointer advance needed
    movu        [r0 + r1 * 0], m1
    movu        [r0 + r1 * 1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r3 * 1], m1
    RET
%endif ;; ARCH_X86_64 == 1
2711
2712 ;---------------------------------------------------------------------------------------
2713 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2714 ;---------------------------------------------------------------------------------------
; 4x4 planar prediction. r0 = dst, r1 = dstStride, r2 = srcPix (argument order of the prototype above).
; m3 holds the running row accumulator and is advanced by m4 (bottomLeft - above[x]) after each row.
; Each row adds left[y] * [pw_planar4_0], shifts right by 3, packs to bytes and stores 4 bytes.
2715 INIT_XMM sse4
2716 cglobal intra_pred_planar4, 3,3,7
2717 pmovzxbw m1, [r2 + 1] ; m1 = 8 bytes at srcPix + 1 widened to words (above[])
2718 pmovzxbw m2, [r2 + 9] ; m2 = 8 bytes at srcPix + 9 widened to words (left[])
2719 pshufhw m3, m1, 0 ; topRight
2720 pshufd m3, m3, 0xAA ; broadcast word 4 of m1 to every lane
2721 pshufhw m4, m2, 0 ; bottomLeft
2722 pshufd m4, m4, 0xAA ; broadcast word 4 of m2 to every lane
2723 pmullw m3, [multi_2Row] ; (x + 1) * topRight
2724 pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x]
2725 mova m6, [pw_planar4_0] ; per-column multiplier applied to left[y]
2726 paddw m3, [pw_4] ; rounding term
2727 paddw m3, m4
2728 paddw m3, m0
2729 psubw m4, m1 ; m4 = bottomLeft - above[x], added to m3 once per row
2730
; row 0
2731 pshuflw m5, m2, 0
2732 pmullw m5, m6
2733 paddw m5, m3
2734 paddw m3, m4
2735 psraw m5, 3
2736 packuswb m5, m5
2737 movd [r0], m5
2738
; row 1
2739 pshuflw m5, m2, 01010101b
2740 pmullw m5, m6
2741 paddw m5, m3
2742 paddw m3, m4
2743 psraw m5, 3
2744 packuswb m5, m5
2745 movd [r0 + r1], m5
2746 lea r0, [r0 + 2 * r1]
2747
; row 2
2748 pshuflw m5, m2, 10101010b
2749 pmullw m5, m6
2750 paddw m5, m3
2751 paddw m3, m4
2752 psraw m5, 3
2753 packuswb m5, m5
2754 movd [r0], m5
2755
; row 3
2756 pshuflw m5, m2, 11111111b
2757 pmullw m5, m6
2758 paddw m5, m3
2759 paddw m3, m4
2760 psraw m5, 3
2761 packuswb m5, m5
2762 movd [r0 + r1], m5
2763 RET
2764
2765 ;---------------------------------------------------------------------------------------
2766 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2767 ;---------------------------------------------------------------------------------------
; 8x8 planar prediction. r0 = dst, r1 = dstStride, r2 = srcPix (argument order of the prototype above).
; m3 = running row accumulator, m4 = per-row increment (bottomLeft - above[x]),
; m2 = the eight left[] samples as words, m6 = per-column multiplier for left[y].
2768 INIT_XMM sse4
2769 cglobal intra_pred_planar8, 3,3,7
2770 pmovzxbw m1, [r2 + 1] ; above[0..7] as words
2771 pmovzxbw m2, [r2 + 17] ; left[0..7] as words
2772
2773 movd m3, [r2 + 9] ; topRight = above[8];
2774 movd m4, [r2 + 25] ; bottomLeft = left[8];
2775
2776 pxor m0, m0
2777 pshufb m3, m0 ; broadcast byte 0 (all-zero shuffle mask)
2778 pshufb m4, m0
2779 punpcklbw m3, m0 ; v_topRight
2780 punpcklbw m4, m0 ; v_bottomLeft
2781 pmullw m3, [multiL] ; (x + 1) * topRight
2782 pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x]
2783 mova m6, [pw_planar16_mul + mmsize]
2784 paddw m3, [pw_8] ; rounding term
2785 paddw m3, m4
2786 paddw m3, m0
2787 psubw m4, m1 ; m4 = bottomLeft - above[x]
2788
; INTRA_PRED_PLANAR8 y: emit row y (0..7) and advance r0 by one row.
; Broadcasts word y of m2 into m5, multiplies by m6, adds the accumulator m3,
; then steps m3 by m4 for the next row.
2789 %macro INTRA_PRED_PLANAR8 1
2790 %if (%1 < 4)
2791 pshuflw m5, m2, 0x55 * %1
2792 pshufd m5, m5, 0
2793 %else
2794 pshufhw m5, m2, 0x55 * (%1 - 4)
2795 pshufd m5, m5, 0xAA
2796 %endif
2797 pmullw m5, m6
2798 paddw m5, m3
2799 paddw m3, m4
2800 psraw m5, 4
2801 packuswb m5, m5
2802 movh [r0], m5
2803 lea r0, [r0 + r1]
2804 %endmacro
2805
2806 INTRA_PRED_PLANAR8 0
2807 INTRA_PRED_PLANAR8 1
2808 INTRA_PRED_PLANAR8 2
2809 INTRA_PRED_PLANAR8 3
2810 INTRA_PRED_PLANAR8 4
2811 INTRA_PRED_PLANAR8 5
2812 INTRA_PRED_PLANAR8 6
2813 INTRA_PRED_PLANAR8 7
2814 RET
2815
2816 ;---------------------------------------------------------------------------------------
2817 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2818 ;---------------------------------------------------------------------------------------
; 16x16 planar prediction, SSE4 version. r0 = dst, r1 = dstStride, r2 = srcPix.
; Columns 0..7 use accumulator m3 with per-row increment m6; columns 8..15 use
; accumulator m4 with per-row increment m1. After setup m2/m7 are reloaded with
; left[0..7] / left[8..15] as words.
2819 INIT_XMM sse4
2820 cglobal intra_pred_planar16, 3,3,8
2821 pmovzxbw m2, [r2 + 1] ; above[0..7] as words
2822 pmovzxbw m7, [r2 + 9] ; above[8..15] as words
2823
2824 movd m3, [r2 + 17] ; topRight = above[16]
2825 movd m6, [r2 + 49] ; bottomLeft = left[16]
2826
2827 pxor m0, m0
2828 pshufb m3, m0 ; broadcast byte 0 (all-zero shuffle mask)
2829 pshufb m6, m0
2830 punpcklbw m3, m0 ; v_topRight
2831 punpcklbw m6, m0 ; v_bottomLeft
2832 pmullw m4, m3, [multiH] ; (x + 1) * topRight
2833 pmullw m3, [multiL] ; (x + 1) * topRight
2834 pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
2835 pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x]
2836 paddw m4, [pw_16] ; rounding term
2837 paddw m3, [pw_16]
2838 paddw m4, m6
2839 paddw m3, m6
2840 paddw m4, m5
2841 paddw m3, m1
2842 psubw m1, m6, m7 ; m1 = bottomLeft - above[8..15]
2843 psubw m6, m2 ; m6 = bottomLeft - above[0..7]
2844
2845 pmovzxbw m2, [r2 + 33] ; left[0..7] as words
2846 pmovzxbw m7, [r2 + 41] ; left[8..15] as words
2847
; INTRA_PRED_PLANAR16 y: emit row y (0..15) and advance r0 by one row.
; The %if ladder broadcasts word (y mod 8) of m2 (y < 8) or m7 (y >= 8) into m5.
2848 %macro INTRA_PRED_PLANAR16 1
2849 %if (%1 < 4)
2850 pshuflw m5, m2, 0x55 * %1
2851 pshufd m5, m5, 0
2852 %else
2853 %if (%1 < 8)
2854 pshufhw m5, m2, 0x55 * (%1 - 4)
2855 pshufd m5, m5, 0xAA
2856 %else
2857 %if (%1 < 12)
2858 pshuflw m5, m7, 0x55 * (%1 - 8)
2859 pshufd m5, m5, 0
2860 %else
2861 pshufhw m5, m7, 0x55 * (%1 - 12)
2862 pshufd m5, m5, 0xAA
2863 %endif
2864 %endif
2865 %endif
2866 pmullw m0, m5, [pw_planar16_mul + mmsize]
2867 pmullw m5, [pw_planar16_mul]
2868 paddw m0, m4
2869 paddw m5, m3
2870 paddw m3, m6
2871 paddw m4, m1
2872 psraw m5, 5
2873 psraw m0, 5
2874 packuswb m5, m0
2875 movu [r0], m5
2876 lea r0, [r0 + r1]
2877 %endmacro
2878
2879 INTRA_PRED_PLANAR16 0
2880 INTRA_PRED_PLANAR16 1
2881 INTRA_PRED_PLANAR16 2
2882 INTRA_PRED_PLANAR16 3
2883 INTRA_PRED_PLANAR16 4
2884 INTRA_PRED_PLANAR16 5
2885 INTRA_PRED_PLANAR16 6
2886 INTRA_PRED_PLANAR16 7
2887 INTRA_PRED_PLANAR16 8
2888 INTRA_PRED_PLANAR16 9
2889 INTRA_PRED_PLANAR16 10
2890 INTRA_PRED_PLANAR16 11
2891 INTRA_PRED_PLANAR16 12
2892 INTRA_PRED_PLANAR16 13
2893 INTRA_PRED_PLANAR16 14
2894 INTRA_PRED_PLANAR16 15
2895 RET
2896
2897 ;---------------------------------------------------------------------------------------
2898 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2899 ;---------------------------------------------------------------------------------------
; 16x16 planar prediction, AVX2 version. r0 = dst, r1 = dstStride, r2 = srcPix.
; One ymm register holds all 16 columns as words. Each macro invocation reads two
; consecutive left[] bytes and produces two output rows.
2900 INIT_YMM avx2
2901 cglobal intra_pred_planar16, 3,3,6
2902 vpbroadcastw m3, [r2 + 17] ; word load: low byte is above[16]
2903 mova m5, [pw_00ff] ; mask used to keep the low byte of each word
2904 vpbroadcastw m4, [r2 + 49] ; word load: low byte is left[16]
2905 mova m0, [pw_planar16_mul]
2906 pmovzxbw m2, [r2 + 1] ; above[0..15] as words
2907 pand m3, m5 ; v_topRight
2908 pand m4, m5 ; v_bottomLeft
2909
2910 pmullw m3, [multiL] ; (x + 1) * topRight
2911 pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
2912 paddw m3, [pw_16] ; rounding term
2913 paddw m3, m4
2914 paddw m3, m1
2915 psubw m4, m2 ; m4 = bottomLeft - above[x], per-row increment
2916 add r2, 33 ; r2 -> left[0]
2917
; INTRA_PRED_PLANAR16_AVX2 y: emit rows y and y + 1, advance r0 by two rows.
; m1 = left[y] (low byte of the broadcast word), m2 = left[y + 1] (high byte).
2918 %macro INTRA_PRED_PLANAR16_AVX2 1
2919 vpbroadcastw m1, [r2 + %1]
2920 vpsrlw m2, m1, 8
2921 pand m1, m5
2922
2923 pmullw m1, m0
2924 pmullw m2, m0
2925 paddw m1, m3
2926 paddw m3, m4
2927 psraw m1, 5
2928 paddw m2, m3
2929 psraw m2, 5
2930 paddw m3, m4
2931 packuswb m1, m2 ; packs per 128-bit lane, so the two rows come out interleaved
2932 vpermq m1, m1, 11011000b ; reorder qwords: row y in the low half, row y + 1 in the high half
2933 movu [r0], xm1
2934 vextracti128 [r0 + r1], m1, 1
2935 lea r0, [r0 + r1 * 2]
2936 %endmacro
2937 INTRA_PRED_PLANAR16_AVX2 0
2938 INTRA_PRED_PLANAR16_AVX2 2
2939 INTRA_PRED_PLANAR16_AVX2 4
2940 INTRA_PRED_PLANAR16_AVX2 6
2941 INTRA_PRED_PLANAR16_AVX2 8
2942 INTRA_PRED_PLANAR16_AVX2 10
2943 INTRA_PRED_PLANAR16_AVX2 12
2944 INTRA_PRED_PLANAR16_AVX2 14
2945 %undef INTRA_PRED_PLANAR16_AVX2
2946 RET
2947
2948 ;---------------------------------------------------------------------------------------
2949 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
2950 ;---------------------------------------------------------------------------------------
; 32x32 planar prediction, SSE4 version. r0 = dst, r1 = dstStride, r2 = srcPix.
; Four accumulators m0..m3 cover columns 0-7, 8-15, 16-23, 24-31; their per-row
; increments (bottomLeft - above[x]) live in m8..m11. On builds without
; ARCH_X86_64, m8..m11 are %define'd as four stack slots instead of registers.
2951 INIT_XMM sse4
2952 %if ARCH_X86_64 == 1
2953 cglobal intra_pred_planar32, 3,4,12
2954 %else
2955 cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize)
2956 %define m8 [rsp + 0 * mmsize]
2957 %define m9 [rsp + 1 * mmsize]
2958 %define m10 [rsp + 2 * mmsize]
2959 %define m11 [rsp + 3 * mmsize]
2960 %endif
2961 movd m3, [r2 + 33] ; topRight = above[32]
2962
2963 pxor m7, m7 ; m7 stays zero: broadcast mask and unpack filler
2964 pshufb m3, m7
2965 punpcklbw m3, m7 ; v_topRight
2966
2967 pmullw m0, m3, [multiL] ; (x + 1) * topRight
2968 pmullw m1, m3, [multiH] ; (x + 1) * topRight
2969 pmullw m2, m3, [multiH2] ; (x + 1) * topRight
2970 pmullw m3, [multiH3] ; (x + 1) * topRight
2971
2972 movd m6, [r2 + 97] ; bottomLeft = left[32]
2973 pshufb m6, m7
2974 punpcklbw m6, m7 ; v_bottomLeft
2975
2976 paddw m0, m6
2977 paddw m1, m6
2978 paddw m2, m6
2979 paddw m3, m6
2980 paddw m0, [pw_32] ; rounding term
2981 paddw m1, [pw_32]
2982 paddw m2, [pw_32]
2983 paddw m3, [pw_32]
; for each group of 8 columns: add 31 * above[x] to the accumulator and
; save bottomLeft - above[x] as that group's per-row increment
2984 pmovzxbw m4, [r2 + 1]
2985 pmullw m5, m4, [pw_31]
2986 paddw m0, m5
2987 psubw m5, m6, m4
2988 mova m8, m5
2989 pmovzxbw m4, [r2 + 9]
2990 pmullw m5, m4, [pw_31]
2991 paddw m1, m5
2992 psubw m5, m6, m4
2993 mova m9, m5
2994 pmovzxbw m4, [r2 + 17]
2995 pmullw m5, m4, [pw_31]
2996 paddw m2, m5
2997 psubw m5, m6, m4
2998 mova m10, m5
2999 pmovzxbw m4, [r2 + 25]
3000 pmullw m5, m4, [pw_31]
3001 paddw m3, m5
3002 psubw m5, m6, m4
3003 mova m11, m5
3004 add r2, 65 ; (2 * blkSize + 1)
3005
; INTRA_PRED_PLANAR32: emit one 32-byte row from the left[] byte at [r2],
; then advance r0 by one row and r2 by one byte.
3006 %macro INTRA_PRED_PLANAR32 0
3007 movd m4, [r2]
3008 pshufb m4, m7
3009 punpcklbw m4, m7 ; m4 = left[y] in every word
3010 pmullw m5, m4, [pw_planar32_mul]
3011 pmullw m6, m4, [pw_planar32_mul + mmsize]
3012 paddw m5, m0
3013 paddw m6, m1
3014 paddw m0, m8
3015 paddw m1, m9
3016 psraw m5, 6
3017 psraw m6, 6
3018 packuswb m5, m6
3019 movu [r0], m5
3020 pmullw m5, m4, [pw_planar16_mul]
3021 pmullw m4, [pw_planar16_mul + mmsize]
3022 paddw m5, m2
3023 paddw m4, m3
3024 paddw m2, m10
3025 paddw m3, m11
3026 psraw m5, 6
3027 psraw m4, 6
3028 packuswb m5, m4
3029 movu [r0 + 16], m5
3030
3031 lea r0, [r0 + r1]
3032 inc r2
3033 %endmacro
3034
3035 mov r3, 4 ; 4 iterations x 8 unrolled rows = 32 rows
3036 .loop:
3037 INTRA_PRED_PLANAR32
3038 INTRA_PRED_PLANAR32
3039 INTRA_PRED_PLANAR32
3040 INTRA_PRED_PLANAR32
3041 INTRA_PRED_PLANAR32
3042 INTRA_PRED_PLANAR32
3043 INTRA_PRED_PLANAR32
3044 INTRA_PRED_PLANAR32
3045 dec r3
3046 jnz .loop
3047 RET
3048
3049 ;---------------------------------------------------------------------------------------
3050 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
3051 ;---------------------------------------------------------------------------------------
; 32x32 planar prediction, AVX2 version (only assembled when ARCH_X86_64 == 1).
; r0 = dst, r1 = dstStride, r2 = srcPix.
; m0 / m3 are the accumulators for columns 0-15 / 16-31, m1 / m2 their per-row
; increments, m9 / m10 the per-column multipliers, m6 the low-byte mask.
3052 %if ARCH_X86_64 == 1
3053 INIT_YMM avx2
3054 cglobal intra_pred_planar32, 3,4,11
3055 mova m6, [pw_00ff]
3056 vpbroadcastw m3, [r2 + 33] ; topRight = above[32]
3057 vpbroadcastw m2, [r2 + 97] ; bottomLeft = left[32]
3058 pand m3, m6 ; keep only the low byte of each broadcast word
3059 pand m2, m6
3060
3061 pmullw m0, m3, [multiL] ; (x + 1) * topRight
3062 pmullw m3, [multiH2] ; (x + 1) * topRight
3063
3064 paddw m0, m2
3065 paddw m3, m2
3066 paddw m0, [pw_32] ; rounding term
3067 paddw m3, [pw_32]
3068
3069 pmovzxbw m4, [r2 + 1] ; above[0..15] as words
3070 pmovzxbw m1, [r2 + 17] ; above[16..31] as words
3071 pmullw m5, m4, [pw_31]
3072 paddw m0, m5
3073 psubw m5, m2, m4 ; bottomLeft - above[0..15]
3074 psubw m2, m1 ; m2 = bottomLeft - above[16..31]
3075 pmullw m1, [pw_31]
3076 paddw m3, m1
3077 mova m1, m5 ; m1 = bottomLeft - above[0..15]
3078
3079 add r2, 65 ; (2 * blkSize + 1)
3080 mova m9, [pw_planar32_mul]
3081 mova m10, [pw_planar16_mul]
3082
; INTRA_PRED_PLANAR32_AVX2: emit two rows from the two left[] bytes at [r2]
; (low byte -> first row, high byte -> second row), advance r0 by two rows
; and r2 by two bytes.
3083 %macro INTRA_PRED_PLANAR32_AVX2 0
3084 vpbroadcastw m4, [r2]
3085 vpsrlw m7, m4, 8
3086 pand m4, m6
3087
3088 pmullw m5, m4, m9
3089 pmullw m4, m4, m10
3090 paddw m5, m0
3091 paddw m4, m3
3092 paddw m0, m1
3093 paddw m3, m2
3094 psraw m5, 6
3095 psraw m4, 6
3096 packuswb m5, m4
3097 pmullw m8, m7, m9
3098 pmullw m7, m7, m10
3099 vpermq m5, m5, 11011000b ; undo the per-lane interleave introduced by packuswb
3100 paddw m8, m0
3101 paddw m7, m3
3102 paddw m0, m1
3103 paddw m3, m2
3104 psraw m8, 6
3105 psraw m7, 6
3106 packuswb m8, m7
3107 add r2, 2
3108 vpermq m8, m8, 11011000b
3109
3110 movu [r0], m5
3111 movu [r0 + r1], m8
3112 lea r0, [r0 + r1 * 2]
3113 %endmacro
; 16 invocations x 2 rows = 32 rows
3114 INTRA_PRED_PLANAR32_AVX2
3115 INTRA_PRED_PLANAR32_AVX2
3116 INTRA_PRED_PLANAR32_AVX2
3117 INTRA_PRED_PLANAR32_AVX2
3118 INTRA_PRED_PLANAR32_AVX2
3119 INTRA_PRED_PLANAR32_AVX2
3120 INTRA_PRED_PLANAR32_AVX2
3121 INTRA_PRED_PLANAR32_AVX2
3122 INTRA_PRED_PLANAR32_AVX2
3123 INTRA_PRED_PLANAR32_AVX2
3124 INTRA_PRED_PLANAR32_AVX2
3125 INTRA_PRED_PLANAR32_AVX2
3126 INTRA_PRED_PLANAR32_AVX2
3127 INTRA_PRED_PLANAR32_AVX2
3128 INTRA_PRED_PLANAR32_AVX2
3129 INTRA_PRED_PLANAR32_AVX2
3130 %undef INTRA_PRED_PLANAR32_AVX2
3131 RET
3132 %endif ;; ARCH_X86_64 == 1
3133
3134 ;-----------------------------------------------------------------------------------------
3135 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
3136 ;-----------------------------------------------------------------------------------------
; 4x4 angular prediction shared by dirMode 2 and 34: pure copy, no interpolation.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Row y is the 4 bytes starting at base + y, where base is srcPix + 2 when
; dirMode == 34 and srcPix + 10 otherwise.
3137 INIT_XMM ssse3
3138 cglobal intra_pred_ang4_2, 3,5,3
3139 lea r4, [r2 + 2] ; candidate base for dirMode == 34
3140 add r2, 10 ; default base
3141 cmp r3m, byte 34
3142 cmove r2, r4
3143
3144 movh m0, [r2] ; 8 reference bytes
3145 movd [r0], m0 ; row 0
3146 palignr m1, m0, 1 ; low bytes of m1 = m0 shifted down by 1 byte
3147 movd [r0 + r1], m1 ; row 1
3148 palignr m2, m0, 2
3149 movd [r0 + r1 * 2], m2 ; row 2
3150 lea r1, [r1 * 3]
3151 psrldq m0, 3
3152 movd [r0 + r1], m0 ; row 3
3153 RET
3154
; 4x4 angular prediction shared by dirMode 3 and 33.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; The reference run starts at srcPix + 1 when dirMode == 33 and at srcPix + 9 otherwise.
; This function also hosts .do_filter4x4, the common filter/transpose/store tail
; that the other intra_pred_ang4_* entry points jump into.
3155 INIT_XMM sse4
3156 cglobal intra_pred_ang4_3, 3,5,5
3157 mov r4, 1
3158 cmp r3m, byte 33 ; ZF from this compare is consumed much later by the jz in .do_filter4x4
3159 mov r3, 9
3160 cmove r3, r4
3161
3162 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
3163 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
3164 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3165 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
3166 palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
3167 palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4]
3168 punpcklqdq m0, m1
3169 punpcklqdq m2, m3
3170
3171 lea r3, [ang_table + 20 * 16]
3172 movh m3, [r3 + 6 * 16] ; [26]
3173 movhps m3, [r3] ; [20]
3174 movh m4, [r3 - 6 * 16] ; [14]
3175 movhps m4, [r3 - 12 * 16] ; [ 8]
3176 jmp .do_filter4x4
3177
3178 ; NOTE: shared path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
3179 ALIGN 16
3180 .do_filter4x4:
3181 mova m1, [pw_1024] ; pmulhrsw by 1024 computes (x + 16) >> 5
3182
3183 pmaddubsw m0, m3
3184 pmulhrsw m0, m1
3185 pmaddubsw m2, m4
3186 pmulhrsw m2, m1
3187 packuswb m0, m2
3188
3189 ; NOTE: when the entry point's cmp matched (e.g. mode 33) the block is not reordered. UNSAFE: this relies on ZF from that cmp, so no instruction in between may modify EFLAGS
3190 jz .store
3191
3192 ; transpose 4x4
3193 pshufb m0, [c_trans_4x4]
3194
3195 .store:
3196 ; write the four 4-byte rows (movd for row 0, pextrd for rows 1-3)
3197 movd [r0], m0
3198 pextrd [r0 + r1], m0, 1
3199 pextrd [r0 + r1 * 2], m0, 2
3200 lea r1, [r1 * 3]
3201 pextrd [r0 + r1], m0, 3
3202 RET
3203
; 4x4 angular prediction shared by dirMode 4 and 32.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 32, otherwise at srcPix + 9.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4 with m0/m2 = interleaved pixel
; pairs and m3/m4 = ang_table rows; ZF from the cmp must still be intact there.
3204 cglobal intra_pred_ang4_4, 3,5,5
3205 xor r4, r4
3206 inc r4 ; r4 = 1
3207 cmp r3m, byte 32
3208 mov r3, 9
3209 cmove r3, r4
3210
3211 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
3212 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
3213 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3214 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
3215 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
3216 punpcklqdq m0, m1
3217 punpcklqdq m2, m1, m3
3218
3219 lea r3, [ang_table + 18 * 16]
3220 movh m3, [r3 + 3 * 16] ; [21]
3221 movhps m3, [r3 - 8 * 16] ; [10]
3222 movh m4, [r3 + 13 * 16] ; [31]
3223 movhps m4, [r3 + 2 * 16] ; [20]
3224 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3225
; 4x4 angular prediction shared by dirMode 5 and 31.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 31, otherwise at srcPix + 9.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3226 cglobal intra_pred_ang4_5, 3,5,5
3227 xor r4, r4
3228 inc r4 ; r4 = 1
3229 cmp r3m, byte 31
3230 mov r3, 9
3231 cmove r3, r4
3232
3233 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
3234 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
3235 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3236 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
3237 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
3238 punpcklqdq m0, m1
3239 punpcklqdq m2, m1, m3
3240
3241 lea r3, [ang_table + 10 * 16]
3242 movh m3, [r3 + 7 * 16] ; [17]
3243 movhps m3, [r3 - 8 * 16] ; [ 2]
3244 movh m4, [r3 + 9 * 16] ; [19]
3245 movhps m4, [r3 - 6 * 16] ; [ 4]
3246 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3247
; 4x4 angular prediction shared by dirMode 6 and 30.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 30, otherwise at srcPix + 9.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3248 cglobal intra_pred_ang4_6, 3,5,5
3249 xor r4, r4
3250 inc r4 ; r4 = 1
3251 cmp r3m, byte 30
3252 mov r3, 9
3253 cmove r3, r4
3254
3255 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
3256 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
3257 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3258 palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
3259 punpcklqdq m0, m0 ; rows 0 and 1 read the same pixel pairs
3260 punpcklqdq m2, m2 ; rows 2 and 3 read the same pixel pairs
3261
3262 lea r3, [ang_table + 19 * 16]
3263 movh m3, [r3 - 6 * 16] ; [13]
3264 movhps m3, [r3 + 7 * 16] ; [26]
3265 movh m4, [r3 - 12 * 16] ; [ 7]
3266 movhps m4, [r3 + 1 * 16] ; [20]
3267 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3268
; 4x4 angular prediction shared by dirMode 7 and 29.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 29, otherwise at srcPix + 9.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3269 cglobal intra_pred_ang4_7, 3,5,5
3270 xor r4, r4
3271 inc r4 ; r4 = 1
3272 cmp r3m, byte 29
3273 mov r3, 9
3274 cmove r3, r4
3275
3276 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
3277 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
3278 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3279 palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
3280 punpcklqdq m2, m0, m3 ; row 2 uses the unshifted pairs, row 3 the pairs shifted by one pixel
3281 punpcklqdq m0, m0 ; rows 0 and 1 read the same pixel pairs
3282
3283 lea r3, [ang_table + 20 * 16]
3284 movh m3, [r3 - 11 * 16] ; [ 9]
3285 movhps m3, [r3 - 2 * 16] ; [18]
3286 movh m4, [r3 + 7 * 16] ; [27]
3287 movhps m4, [r3 - 16 * 16] ; [ 4]
3288 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3289
; 4x4 angular prediction shared by dirMode 8 and 28.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 28, otherwise at srcPix + 9.
; All four rows read the same pixel pairs; only the ang_table row differs.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3290 cglobal intra_pred_ang4_8, 3,5,5
3291 xor r4, r4
3292 inc r4 ; r4 = 1
3293 cmp r3m, byte 28
3294 mov r3, 9
3295 cmove r3, r4
3296
3297 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
3298 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
3299 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3300 punpcklqdq m0, m0
3301 mova m2, m0
3302
3303 lea r3, [ang_table + 13 * 16]
3304 movh m3, [r3 - 8 * 16] ; [ 5]
3305 movhps m3, [r3 - 3 * 16] ; [10]
3306 movh m4, [r3 + 2 * 16] ; [15]
3307 movhps m4, [r3 + 7 * 16] ; [20]
3308 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3309
; 4x4 angular prediction shared by dirMode 9 and 27.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 27, otherwise at srcPix + 9.
; All four rows read the same pixel pairs; only the ang_table row differs.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3310 cglobal intra_pred_ang4_9, 3,5,5
3311 xor r4, r4
3312 inc r4 ; r4 = 1
3313 cmp r3m, byte 27
3314 mov r3, 9
3315 cmove r3, r4
3316
3317 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
3318 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
3319 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3320 punpcklqdq m0, m0
3321 mova m2, m0
3322
3323 lea r3, [ang_table + 4 * 16]
3324 movh m3, [r3 - 2 * 16] ; [ 2]
3325 movhps m3, [r3 - 0 * 16] ; [ 4]
3326 movh m4, [r3 + 2 * 16] ; [ 6]
3327 movhps m4, [r3 + 4 * 16] ; [ 8]
3328 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3329
; 4x4 prediction for dirMode 10.
; r0 = dst, r1 = dstStride, r2 = srcPix, r4m = bFilter.
; Rows 1-3 are stored first from dwords 1-3 of the shuffled vector; row 0 is
; stored last at .quit, either unmodified or after the optional edge filter.
; NOTE(review): pb_unpackbd1, pb_0_8 and pb_unpackbw1 are defined outside this view;
; the lane comments below describe what they appear to do — confirm against the tables.
3330 cglobal intra_pred_ang4_10, 3,3,4
3331 movd m0, [r2 + 9] ; 4 bytes at srcPix + 9 (movd loads only 4 bytes)
3332 pshufb m0, [pb_unpackbd1] ; presumably replicates each source byte across one dword
3333 pshufd m1, m0, 1 ; low dword of m1 = dword 1 of m0
3334 movhlps m2, m0 ; low dword of m2 = dword 2 of m0
3335 pshufd m3, m0, 3 ; low dword of m3 = dword 3 of m0
3336 movd [r0 + r1], m1
3337 movd [r0 + r1 * 2], m2
3338 lea r1, [r1 * 3]
3339 movd [r0 + r1], m3
3340 cmp r4m, byte 0
3341 jz .quit ; bFilter == 0: store row 0 as is
3342
3343 ; filter
3344 pmovzxbw m0, m0 ; [-1 -1 -1 -1]
3345 movh m1, [r2] ; [4 3 2 1 0]
3346 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
3347 pshufb m1, [pb_unpackbw1] ; [4 3 2 1]
3348 psubw m1, m2 ; m1 = m1 - m2 per word
3349 psraw m1, 1 ; halve the difference (arithmetic shift)
3350 paddw m0, m1
3351 packuswb m0, m0 ; saturate back to bytes
3352 .quit:
3353 movd [r0], m0 ; row 0
3354 RET
3355
; 4x4 prediction for dirMode 26.
; r0 = dst, r1 = dstStride, r2 = srcPix, r4m = bFilter.
; Every row is a copy of the 4 bytes at srcPix + 1. When bFilter != 0 the first
; byte of each row is then overwritten with a filtered value.
; NOTE(review): pb_0_8 and pb_unpackbw1 are defined outside this view; the lane
; comments below describe what they appear to do — confirm against the tables.
3356 INIT_XMM sse4
3357 cglobal intra_pred_ang4_26, 3,4,3
3358 movd m0, [r2 + 1] ; 4 bytes at srcPix + 1 (movd loads only 4 bytes)
3359
3360 ; store
3361 movd [r0], m0
3362 movd [r0 + r1], m0
3363 movd [r0 + r1 * 2], m0
3364 lea r3, [r1 * 3]
3365 movd [r0 + r3], m0
3366
3367 ; filter
3368 cmp r4m, byte 0
3369 jz .quit ; bFilter == 0: nothing more to do
3370
3371 pshufb m0, [pb_0_8] ; [ 1 1 1 1]
3372 movh m1, [r2 + 8] ; [-4 -3 -2 -1 0]
3373 pinsrb m1, [r2], 0 ; byte 0 replaced with srcPix[0]
3374 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
3375 pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
3376 psubw m1, m2 ; m1 = m1 - m2 per word
3377 psraw m1, 1 ; halve the difference (arithmetic shift)
3378 paddw m0, m1
3379 packuswb m0, m0 ; saturate back to bytes
3380
; overwrite column 0 of each row with the filtered bytes
3381 pextrb [r0], m0, 0
3382 pextrb [r0 + r1], m0, 1
3383 pextrb [r0 + r1 * 2], m0, 2
3384 pextrb [r0 + r3], m0, 3
3385 .quit:
3386 RET
3387
; 4x4 angular prediction shared by dirMode 11 and 25.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 0 when dirMode == 25, otherwise at srcPix + 8;
; byte 0 is then forced to srcPix[0] with pinsrb.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3388 cglobal intra_pred_ang4_11, 3,5,5
3389 xor r4, r4 ; r4 = 0
3390 cmp r3m, byte 25
3391 mov r3, 8
3392 cmove r3, r4
3393
3394 movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
3395 pinsrb m0, [r2], 0 ; byte 0 = srcPix[0]
3396 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
3397 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
3398 punpcklqdq m0, m0
3399 mova m2, m0
3400
3401 lea r3, [ang_table + 24 * 16]
3402
3403 movh m3, [r3 + 6 * 16] ; [30]
3404 movhps m3, [r3 + 4 * 16] ; [28]
3405 movh m4, [r3 + 2 * 16] ; [26]
3406 movhps m4, [r3 + 0 * 16] ; [24]
3407 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3408
; 4x4 angular prediction shared by dirMode 12 and 24.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 0 when dirMode == 24, otherwise at srcPix + 8;
; byte 0 is then forced to srcPix[0] with pinsrb.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3409 cglobal intra_pred_ang4_12, 3,5,5
3410 xor r4, r4 ; r4 = 0
3411 cmp r3m, byte 24
3412 mov r3, 8
3413 cmove r3, r4
3414
3415 movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
3416 pinsrb m0, [r2], 0 ; byte 0 = srcPix[0]
3417 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
3418 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
3419 punpcklqdq m0, m0
3420 mova m2, m0
3421
3422 lea r3, [ang_table + 20 * 16]
3423 movh m3, [r3 + 7 * 16] ; [27]
3424 movhps m3, [r3 + 2 * 16] ; [22]
3425 movh m4, [r3 - 3 * 16] ; [17]
3426 movhps m4, [r3 - 8 * 16] ; [12]
3427 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3428
; 4x4 angular prediction shared by dirMode 13 and 23.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; After .next: dirMode == 23 gives r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; r4 selects the main reference run and r3 the run the extra projected sample is taken from.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive
; until then (mov, jz and xchg leave EFLAGS untouched).
3429 cglobal intra_pred_ang4_13, 4,5,5
3430 xor r4, r4
3431 cmp r3m, byte 23
3432 mov r3, 8
3433 jz .next
3434 xchg r3, r4
3435 .next:
; NOTE(review): with r4 == 0 this load starts at srcPix - 1. That byte is never used
; (it is replaced by the pinsrb into lane 0 below), but the read itself touches memory
; before srcPix — confirm the caller's buffer permits it.
3436 movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
3437 pinsrb m1, [r2], 1 ; lane 1 = srcPix[0]
3438 palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
3439 palignr m2, m1, 2 ; [x x x x 4 3 2 1]
3440 pinsrb m1, [r2 + r3 + 4], 0 ; lane 0 = extra sample from the other reference run
3441 punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
3442 punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
3443 punpcklqdq m2, m0, m1
3444 punpcklqdq m0, m0
3445
3446 lea r3, [ang_table + 21 * 16]
3447 movh m3, [r3 + 2 * 16] ; [23]
3448 movhps m3, [r3 - 7 * 16] ; [14]
3449 movh m4, [r3 - 16 * 16] ; [ 5]
3450 movhps m4, [r3 + 7 * 16] ; [28]
3451 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3452
; 4x4 angular prediction shared by dirMode 14 and 22.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; After .next: dirMode == 22 gives r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3453 cglobal intra_pred_ang4_14, 4,5,5
3454 xor r4, r4
3455 cmp r3m, byte 22
3456 mov r3, 8
3457 jz .next
3458 xchg r3, r4
3459 .next:
; NOTE(review): with r4 == 0 this load starts at srcPix - 1; the byte is unused but the
; read touches memory before srcPix — confirm the caller's buffer permits it.
3460 movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
3461 pinsrb m2, [r2], 1 ; lane 1 = srcPix[0]
3462 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
3463 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
3464 pinsrb m2, [r2 + r3 + 2], 0 ; lane 0 = extra sample from the other reference run
3465 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
3466 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
3467 punpcklqdq m0, m0
3468 punpcklqdq m2, m2
3469
3470 lea r3, [ang_table + 19 * 16]
3471 movh m3, [r3 + 0 * 16] ; [19]
3472 movhps m3, [r3 - 13 * 16] ; [ 6]
3473 movh m4, [r3 + 6 * 16] ; [25]
3474 movhps m4, [r3 - 7 * 16] ; [12]
3475 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3476
; 4x4 angular prediction shared by dirMode 15 and 21.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; After .next: dirMode == 21 gives r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; Two extra samples (x, y) are pulled from the other reference run via pinsrb.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3477 cglobal intra_pred_ang4_15, 4,5,5
3478 xor r4, r4
3479 cmp r3m, byte 21
3480 mov r3, 8
3481 jz .next
3482 xchg r3, r4
3483 .next:
; NOTE(review): with r4 == 0 this load starts at srcPix - 1; the byte is unused but the
; read touches memory before srcPix — confirm the caller's buffer permits it.
3484 movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
3485 pinsrb m2, [r2], 1 ; lane 1 = srcPix[0]
3486 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
3487 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
3488 pinsrb m2, [r2 + r3 + 2], 0 ; lane 0 = first extra sample (x)
3489 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
3490 pinsrb m3, [r2 + r3 + 4], 0 ; lane 0 = second extra sample (y)
3491 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
3492 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
3493 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
3494 punpcklqdq m0, m2
3495 punpcklqdq m2, m4
3496
3497 lea r3, [ang_table + 23 * 16]
3498 movh m3, [r3 - 8 * 16] ; [15]
3499 movhps m3, [r3 + 7 * 16] ; [30]
3500 movh m4, [r3 - 10 * 16] ; [13]
3501 movhps m4, [r3 + 5 * 16] ; [28]
3502 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3503
; 4x4 angular prediction shared by dirMode 16 and 20.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; After .next: dirMode == 20 gives r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; Two extra samples (x, y) are pulled from the other reference run via pinsrb.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3504 cglobal intra_pred_ang4_16, 3,5,5
3505 xor r4, r4
3506 cmp r3m, byte 20
3507 mov r3, 8
3508 jz .next
3509 xchg r3, r4
3510 .next:
; NOTE(review): with r4 == 0 this load starts at srcPix - 1; the byte is unused but the
; read touches memory before srcPix — confirm the caller's buffer permits it.
3511 movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
3512 pinsrb m2, [r2], 1 ; lane 1 = srcPix[0]
3513 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
3514 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
3515 pinsrb m2, [r2 + r3 + 2], 0 ; lane 0 = first extra sample (x)
3516 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
3517 pinsrb m3, [r2 + r3 + 3], 0 ; lane 0 = second extra sample (y)
3518 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
3519 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
3520 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
3521 punpcklqdq m0, m2
3522 punpcklqdq m2, m4
3523
3524 lea r3, [ang_table + 19 * 16]
3525 movh m3, [r3 - 8 * 16] ; [11]
3526 movhps m3, [r3 + 3 * 16] ; [22]
3527 movh m4, [r3 - 18 * 16] ; [ 1]
3528 movhps m4, [r3 - 7 * 16] ; [12]
3529 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3530
; 4x4 angular prediction shared by dirMode 17 and 19.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; After .next: dirMode == 19 gives r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; Three extra samples (x, y, z) are pulled from the other reference run via pinsrb.
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4; ZF from the cmp must survive until then.
3531 cglobal intra_pred_ang4_17, 3,5,5
3532 xor r4, r4
3533 cmp r3m, byte 19
3534 mov r3, 8
3535 jz .next
3536 xchg r3, r4
3537 .next:
; NOTE(review): with r4 == 0 this load starts at srcPix - 1; the byte is unused but the
; read touches memory before srcPix — confirm the caller's buffer permits it.
3538 movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x]
3539 pinsrb m3, [r2], 1 ; lane 1 = srcPix[0]
3540 palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
3541 palignr m1, m3, 2 ; [- - - - 4 3 2 1]
3542 mova m4, m0
3543 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
3544 pinsrb m3, [r2 + r3 + 1], 0 ; lane 0 = first extra sample (x)
3545 punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
3546 punpcklqdq m0, m1
3547
3548 pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
3549 pinsrb m2, [r2 + r3 + 2], 0 ; lane 0 = second extra sample (y)
3550 pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
3551 pinsrb m1, [r2 + r3 + 4], 0 ; lane 0 = third extra sample (z)
3552 punpcklbw m1, m2 ; [1 0 0 x x y y z]
3553 punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
3554 punpcklqdq m2, m1
3555
3556 lea r3, [ang_table + 14 * 16]
3557 movh m3, [r3 - 8 * 16] ; [ 6]
3558 movhps m3, [r3 - 2 * 16] ; [12]
3559 movh m4, [r3 + 4 * 16] ; [18]
3560 movhps m4, [r3 + 10 * 16] ; [24]
3561 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
3562
; 4x4 prediction for dirMode 18.
;   r0 = dst, r1 = dstStride, r2 = srcPix
; Builds the 8-byte vector [srcPix[11], srcPix[10], srcPix[9], srcPix[0],
; srcPix[1], srcPix[2], srcPix[3], srcPix[4]] (lowest byte first) and stores a
; 4-byte window of it per row: row 3 starts at byte 0, row 0 at byte 3.
;
; The previous version produced the low dword by temporarily writing srcPix[0]
; into srcPix[8] and restoring it afterwards. This version assembles the same
; dword in a register, so srcPix is only ever read.
cglobal intra_pred_ang4_18, 3,4,1
    mov         r3d, [r2 + 8]           ; r3d = srcPix[8..11]; byte 0 is replaced next
    mov         r3b, byte [r2]          ; r3d = [srcPix[0] srcPix[9] srcPix[10] srcPix[11]] in memory order
    bswap       r3d                     ; reverse to [srcPix[11] srcPix[10] srcPix[9] srcPix[0]]
    movd        m0, r3d

    pinsrd      m0, [r2 + 1], 1         ; [- 3 2 1 0 -1 -2 -3]
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0           ; row 3
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0       ; row 2
    psrldq      m0, 1
    movd        [r0 + r1], m0           ; row 1
    psrldq      m0, 1
    movd        [r0], m0                ; row 0
    RET
3582
3583 ;-----------------------------------------------------------------------------------------
3584 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
3585 ;-----------------------------------------------------------------------------------------
; 8x8 angular prediction shared by dirMode 2 and 34: pure copy, no interpolation.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Row y is the 8 bytes starting at base + y, where base is srcPix + 2 when
; dirMode == 34 and srcPix + 18 otherwise.
3586 INIT_XMM ssse3
3587 cglobal intra_pred_ang8_2, 3,5,2
3588 lea r4, [r2 + 2] ; candidate base for dirMode == 34
3589 add r2, 18 ; default base
3590 cmp r3m, byte 34
3591 cmove r2, r4
3592 movu m0, [r2] ; 16 reference bytes
3593 lea r4, [r1 * 3] ; r4 = 3 * dstStride
3594
3595 movh [r0], m0 ; row 0
3596 palignr m1, m0, 1 ; low 8 bytes of m1 = m0 shifted down by 1 byte
3597 movh [r0 + r1], m1 ; row 1
3598 palignr m1, m0, 2
3599 movh [r0 + r1 * 2], m1 ; row 2
3600 palignr m1, m0, 3
3601 movh [r0 + r4], m1 ; row 3
3602 palignr m1, m0, 4
3603 lea r0, [r0 + r1 * 4]
3604 movh [r0], m1 ; row 4
3605 palignr m1, m0, 5
3606 movh [r0 + r1], m1 ; row 5
3607 palignr m1, m0, 6
3608 movh [r0 + r1 * 2], m1 ; row 6
3609 palignr m1, m0, 7
3610 movh [r0 + r4], m1 ; row 7
3611 RET
3612
; 8x8 angular prediction shared by dirMode 3 and 33.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 33, otherwise at srcPix + 17.
; Produces eight 8-byte lines packed as m4 = [1 0], m5 = [3 2], m6 = [5 4], m1 = [7 6].
; This function also hosts .transpose8x8, the common transpose/store tail that the
; other intra_pred_ang8_* entry points jump into.
3613 INIT_XMM sse4
3614 cglobal intra_pred_ang8_3, 3,5,8
3615 lea r4, [r2 + 1]
3616 add r2, 17
3617 cmp r3m, byte 33 ; ZF from this compare is consumed by the jz at .transpose8x8
3618 cmove r2, r4
3619 lea r3, [ang_table + 22 * 16]
3620 lea r4, [ang_table + 8 * 16]
3621 mova m3, [pw_1024] ; pmulhrsw by 1024 computes (x + 16) >> 5
3622
3623 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3624 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3625
3626 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3627 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3628 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
3629
3630 pmaddubsw m4, m0, [r3 + 4 * 16] ; [26]
3631 pmulhrsw m4, m3
3632 pmaddubsw m1, [r3 - 2 * 16] ; [20]
3633 pmulhrsw m1, m3
3634 packuswb m4, m1
3635
3636 palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
3637
3638 pmaddubsw m5, [r3 - 8 * 16] ; [14]
3639 pmulhrsw m5, m3
3640
3641 palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
3642
3643 pmaddubsw m6, [r4] ; [ 8]
3644 pmulhrsw m6, m3
3645 packuswb m5, m6
3646
3647 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
3648
3649 pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2]
3650 pmulhrsw m6, m3
3651
3652 pmaddubsw m1, [r3 + 6 * 16] ; [28]
3653 pmulhrsw m1, m3
3654 packuswb m6, m1
3655
3656 palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
3657
3658 pmaddubsw m1, [r3] ; [22]
3659 pmulhrsw m1, m3
3660
3661 palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
3662
3663 pmaddubsw m2, [r3 - 6 * 16] ; [16]
3664 pmulhrsw m2, m3
3665 packuswb m1, m2
3666 jmp .transpose8x8
3667
; Shared tail. Input: m4, m5, m6, m1 = four registers of two 8-byte lines each.
; ZF must still hold the result of the entry point's cmp on r3m: when it is set
; the lines are stored as they are, otherwise the 8x8 block is transposed first.
3668 ALIGN 16
3669 .transpose8x8:
3670 jz .store
3671
3672 ; transpose 8x8
3673 punpckhbw m0, m4, m5
3674 punpcklbw m4, m5
3675 punpckhbw m2, m4, m0
3676 punpcklbw m4, m0
3677
3678 punpckhbw m0, m6, m1
3679 punpcklbw m6, m1
3680 punpckhbw m1, m6, m0
3681 punpcklbw m6, m0
3682
3683 punpckhdq m5, m4, m6
3684 punpckldq m4, m6
3685 punpckldq m6, m2, m1
3686 punpckhdq m2, m1
3687 mova m1, m2
3688
3689 .store:
3690 lea r4, [r1 * 3] ; r4 = 3 * dstStride
3691 movh [r0], m4 ; row 0
3692 movhps [r0 + r1], m4 ; row 1
3693 movh [r0 + r1 * 2], m5 ; row 2
3694 movhps [r0 + r4], m5 ; row 3
3695 add r0, r4 ; r0 now points at row 3
3696 movh [r0 + r1], m6 ; row 4
3697 movhps [r0 + r1 * 2], m6 ; row 5
3698 movh [r0 + r4], m1 ; row 6
3699 movhps [r0 + r1 * 4], m1 ; row 7
3700 RET
3701
; 8x8 angular prediction shared by dirMode 4 and 32.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 32, otherwise at srcPix + 17.
; Leaves the eight lines in m4, m5, m6, m1 and tail-jumps into intra_pred_ang8_3's
; .transpose8x8; ZF from the cmp must survive until the jz there.
3702 cglobal intra_pred_ang8_4, 3,5,8
3703 lea r4, [r2 + 1]
3704 add r2, 17
3705 cmp r3m, byte 32
3706 cmove r2, r4
3707 lea r3, [ang_table + 24 * 16]
3708 lea r4, [ang_table + 10 * 16]
3709 mova m3, [pw_1024] ; pmulhrsw by 1024 computes (x + 16) >> 5
3710
3711 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3712 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3713
3714 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3715 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3716 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
3717 mova m5, m1
3718
3719 pmaddubsw m4, m0, [r3 - 3 * 16] ; [21]
3720 pmulhrsw m4, m3
3721 pmaddubsw m1, [r4] ; [10]
3722 pmulhrsw m1, m3
3723 packuswb m4, m1
3724
3725 pmaddubsw m5, [r3 + 7 * 16] ; [31]
3726 pmulhrsw m5, m3
3727
3728 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
3729
3730 pmaddubsw m6, [r3 - 4 * 16] ; [ 20]
3731 pmulhrsw m6, m3
3732 packuswb m5, m6
3733
3734 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
3735
3736 pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9]
3737 pmulhrsw m6, m3
3738
3739 pmaddubsw m1, [r3 + 6 * 16] ; [30]
3740 pmulhrsw m1, m3
3741 packuswb m6, m1
3742
3743 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
3744
3745 pmaddubsw m1, [r3 - 5 * 16] ; [19]
3746 pmulhrsw m1, m3
3747
3748 palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
3749
3750 pmaddubsw m2, [r4 - 2 * 16] ; [8]
3751 pmulhrsw m2, m3
3752 packuswb m1, m2
3753 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
3754
; 8x8 angular prediction shared by dirMode 5 and 31.
; r0 = dst, r1 = dstStride, r2 = srcPix, r3m = dirMode.
; Reference run starts at srcPix + 1 when dirMode == 31, otherwise at srcPix + 17.
; Leaves the eight lines in m4, m5, m6, m1 and tail-jumps into intra_pred_ang8_3's
; .transpose8x8; ZF from the cmp must survive until the jz there.
3755 cglobal intra_pred_ang8_5, 3,5,8
3756 lea r4, [r2 + 1]
3757 add r2, 17
3758 cmp r3m, byte 31
3759 cmove r2, r4
3760 lea r3, [ang_table + 17 * 16]
3761 lea r4, [ang_table + 2 * 16]
3762 mova m3, [pw_1024] ; pmulhrsw by 1024 computes (x + 16) >> 5
3763
3764 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3765 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3766
3767 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3768 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3769 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
3770 mova m5, m1
3771
3772 pmaddubsw m4, m0, [r3] ; [17]
3773 pmulhrsw m4, m3
3774 pmaddubsw m1, [r4] ; [2]
3775 pmulhrsw m1, m3
3776 packuswb m4, m1
3777
3778 pmaddubsw m5, [r3 + 2 * 16] ; [19]
3779 pmulhrsw m5, m3
3780
3781 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
3782 mova m1, m6
3783
3784 pmaddubsw m1, [r4 + 2 * 16] ; [4]
3785 pmulhrsw m1, m3
3786 packuswb m5, m1
3787
3788 pmaddubsw m6, [r3 + 4 * 16] ; [21]
3789 pmulhrsw m6, m3
3790
3791 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
3792
3793 mova m7, m1
3794 pmaddubsw m7, [r4 + 4 * 16] ; [6]
3795 pmulhrsw m7, m3
3796 packuswb m6, m7
3797
3798 pmaddubsw m1, [r3 + 6 * 16] ; [23]
3799 pmulhrsw m1, m3
3800
3801 palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
3802
3803 pmaddubsw m2, [r4 + 6 * 16] ; [8]
3804 pmulhrsw m2, m3
3805 packuswb m1, m2
3806 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
3807
; intra_pred_ang8_6: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) selects the reference base: src + 1 when it equals 30,
; otherwise src + 17.
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
3808 cglobal intra_pred_ang8_6, 3,5,8
3809 lea r4, [r2 + 1]
3810 add r2, 17
3811 cmp r3m, byte 30
3812 cmove r2, r4 ; r2 = src + 1 if r3m == 30, else src + 17
3813 lea r3, [ang_table + 20 * 16]
3814 lea r4, [ang_table + 8 * 16]
3815 mova m7, [pw_1024]
3816
3817 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3818 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3819
3820 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3821 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3822 mova m1, m0
3823
; lines 0/1 -> m4 (same reference window, two different weights)
3824 pmaddubsw m4, m0, [r3 - 7 * 16] ; [13]
3825 pmulhrsw m4, m7
3826 pmaddubsw m1, [r3 + 6 * 16] ; [26]
3827 pmulhrsw m1, m7
3828 packuswb m4, m1
3829
3830 palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
3831
; lines 2/3 -> m5
3832 pmaddubsw m5, m6, [r4 - 1 * 16] ; [7]
3833 pmulhrsw m5, m7
3834
3835 pmaddubsw m6, [r3] ; [20]
3836 pmulhrsw m6, m7
3837 packuswb m5, m6
3838
3839 palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
3840
; lines 4/5 -> m6
3841 pmaddubsw m6, m1, [r4 - 7 * 16] ; [1]
3842 pmulhrsw m6, m7
3843
3844 mova m3, m1
3845 pmaddubsw m3, [r3 - 6 * 16] ; [14]
3846 pmulhrsw m3, m7
3847 packuswb m6, m3
3848
; lines 6/7 -> m1
3849 pmaddubsw m1, [r3 + 7 * 16] ; [27]
3850 pmulhrsw m1, m7
3851
3852 palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
3853
3854 pmaddubsw m2, [r4] ; [8]
3855 pmulhrsw m2, m7
3856 packuswb m1, m2
3857 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
3858
; intra_pred_ang8_7: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) selects the reference base: src + 1 when it equals 29,
; otherwise src + 17.
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
3859 cglobal intra_pred_ang8_7, 3,5,8
3860 lea r4, [r2 + 1]
3861 add r2, 17
3862 cmp r3m, byte 29
3863 cmove r2, r4 ; r2 = src + 1 if r3m == 29, else src + 17
3864 lea r3, [ang_table + 24 * 16]
3865 lea r4, [ang_table + 6 * 16]
3866 mova m7, [pw_1024]
3867
3868 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3869 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3870
3871 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3872 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3873
; lines 0/1 -> m4
3874 pmaddubsw m4, m0, [r4 + 3 * 16] ; [9]
3875 pmulhrsw m4, m7
3876 pmaddubsw m3, m0, [r3 - 6 * 16] ; [18]
3877 pmulhrsw m3, m7
3878 packuswb m4, m3
3879
; lines 2/3 -> m5
3880 pmaddubsw m5, m0, [r3 + 3 * 16] ; [27]
3881 pmulhrsw m5, m7
3882
3883 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
3884
3885 pmaddubsw m6, m1, [r4 - 2 * 16] ; [4]
3886 pmulhrsw m6, m7
3887 packuswb m5, m6
3888
; lines 4/5 -> m6
3889 pmaddubsw m6, m1, [r4 + 7 * 16] ; [13]
3890 pmulhrsw m6, m7
3891
3892 mova m3, m1
3893 pmaddubsw m3, [r3 - 2 * 16] ; [22]
3894 pmulhrsw m3, m7
3895 packuswb m6, m3
3896
; lines 6/7 -> m1
3897 pmaddubsw m1, [r3 + 7 * 16] ; [31]
3898 pmulhrsw m1, m7
3899
3900 palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
3901
3902 pmaddubsw m2, [r4 + 2 * 16] ; [8]
3903 pmulhrsw m2, m7
3904 packuswb m1, m2
3905 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
3906
; intra_pred_ang8_8: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) selects the reference base: src + 1 when it equals 28,
; otherwise src + 17.
; The first six lines all use the unshifted interleaved window in m0; only the
; last two use the window advanced by one sample (m2).
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
3907 cglobal intra_pred_ang8_8, 3,5,8
3908 lea r4, [r2 + 1]
3909 add r2, 17
3910 cmp r3m, byte 28
3911 cmove r2, r4 ; r2 = src + 1 if r3m == 28, else src + 17
3912 lea r3, [ang_table + 23 * 16]
3913 lea r4, [ang_table + 8 * 16]
3914 mova m7, [pw_1024]
3915
3916 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3917 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3918
3919 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3920 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3921 palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
3922
; lines 0/1 -> m4
3923 pmaddubsw m4, m0, [r4 - 3 * 16] ; [5]
3924 pmulhrsw m4, m7
3925 pmaddubsw m3, m0, [r4 + 2 * 16] ; [10]
3926 pmulhrsw m3, m7
3927 packuswb m4, m3
3928
; lines 2/3 -> m5
3929 pmaddubsw m5, m0, [r3 - 8 * 16] ; [15]
3930 pmulhrsw m5, m7
3931
3932 pmaddubsw m6, m0, [r3 - 3 * 16] ; [20]
3933 pmulhrsw m6, m7
3934 packuswb m5, m6
3935
; lines 4/5 -> m6
3936 pmaddubsw m6, m0, [r3 + 2 * 16] ; [25]
3937 pmulhrsw m6, m7
3938
3939 pmaddubsw m0, [r3 + 7 * 16] ; [30]
3940 pmulhrsw m0, m7
3941 packuswb m6, m0
3942
; lines 6/7 -> m1 (window advanced by one sample)
3943 pmaddubsw m1, m2, [r4 - 5 * 16] ; [3]
3944 pmulhrsw m1, m7
3945
3946 pmaddubsw m2, [r4] ; [8]
3947 pmulhrsw m2, m7
3948 packuswb m1, m2
3949 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
3950
; intra_pred_ang8_9: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) selects the reference base: src + 1 when it equals 27,
; otherwise src + 17.
; All eight lines use the same interleaved window (m0); only the ang_table
; weight row changes, stepping by two rows per line ([2] .. [16]).
; pmulhrsw by pw_1024 does the rounded >> 5 (assuming pw_1024 holds words of
; 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
3951 cglobal intra_pred_ang8_9, 3,5,8
3952 lea r4, [r2 + 1]
3953 add r2, 17
3954 cmp r3m, byte 27
3955 cmove r2, r4 ; r2 = src + 1 if r3m == 27, else src + 17
3956 lea r3, [ang_table + 10 * 16]
3957 mova m7, [pw_1024]
3958
3959 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3960 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3961
3962 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3963
; lines 0/1 -> m4
3964 pmaddubsw m4, m0, [r3 - 8 * 16] ; [2]
3965 pmulhrsw m4, m7
3966 pmaddubsw m3, m0, [r3 - 6 * 16] ; [4]
3967 pmulhrsw m3, m7
3968 packuswb m4, m3
3969
; lines 2/3 -> m5
3970 pmaddubsw m5, m0, [r3 - 4 * 16] ; [6]
3971 pmulhrsw m5, m7
3972
3973 pmaddubsw m6, m0, [r3 - 2 * 16] ; [8]
3974 pmulhrsw m6, m7
3975 packuswb m5, m6
3976
; lines 4/5 -> m6
3977 pmaddubsw m6, m0, [r3] ; [10]
3978 pmulhrsw m6, m7
3979
3980 pmaddubsw m2, m0, [r3 + 2 * 16] ; [12]
3981 pmulhrsw m2, m7
3982 packuswb m6, m2
3983
; lines 6/7 -> m1
3984 pmaddubsw m1, m0, [r3 + 4 * 16] ; [14]
3985 pmulhrsw m1, m7
3986
3987 pmaddubsw m0, [r3 + 6 * 16] ; [16]
3988 pmulhrsw m0, m7
3989 packuswb m1, m0
3990 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
3991
; intra_pred_ang8_10: 8x8 predictor that fills every row with a single byte
; taken from src + 17 + row (row i is src[17 + i] repeated eight times).
; pb_unpackbq is defined elsewhere; it is used here as a pshufb mask that
; leaves source byte 0 repeated in the low qword and byte 1 repeated in the
; high qword -- TODO confirm against the table definition.
; Rows 1..7 are stored first. Row 0 is stored last, at .quit, so that the
; optional edge filter (enabled when r4m, the 5th argument, is non-zero) can
; replace it with
;   row0[x] = clip8(src[17] + ((src[1 + x] - src[0]) >> 1))
; r3 and r5 are used as scratch (row pointer and 3 * stride).
3992 cglobal intra_pred_ang8_10, 3,6,5
3993 movh m0, [r2 + 17]
3994 mova m4, [pb_unpackbq]
3995 palignr m1, m0, 2 ; bytes 2,3 of the column moved to positions 0,1
3996 pshufb m1, m4 ; rows 2 / 3
3997 palignr m2, m0, 4
3998 pshufb m2, m4 ; rows 4 / 5
3999 palignr m3, m0, 6
4000 pshufb m3, m4 ; rows 6 / 7
4001 pshufb m0, m4 ; rows 0 / 1
4002
4003 lea r5, [r1 * 3]
4004 movhps [r0 + r1], m0
4005 movh [r0 + r1 * 2], m1
4006 movhps [r0 + r5], m1
4007 lea r3, [r0 + r1 * 4]
4008 movh [r3], m2
4009 movhps [r3 + r1], m2
4010 movh [r3 + r1 * 2], m3
4011 movhps [r3 + r5], m3
4012
4013 ; filter
4014 cmp r4m, byte 0
4015 jz .quit
4016
4017 pmovzxbw m0, m0 ; words of src[17]
4018 movu m1, [r2]
4019 palignr m2, m1, 1 ; low bytes = src[1..]
4020 pshufb m1, m4 ; low qword = src[0] repeated
4021 pmovzxbw m1, m1
4022 pmovzxbw m2, m2
4023 psubw m2, m1 ; src[1 + x] - src[0]
4024 psraw m2, 1
4025 paddw m0, m2
4026 packuswb m0, m0 ; saturate back to bytes
4027
4028 .quit:
4029 movh [r0], m0 ; row 0 (filtered or plain)
4030 RET
4031
; intra_pred_ang8_26: 8x8 predictor that copies the eight bytes src[1..8]
; into every row of the block.
; When r4m (5th argument) is non-zero, column 0 is then overwritten with
;   dst[y][0] = clip8(src[1] + ((src[17 + y] - src[0]) >> 1))
; one byte per row via pextrb.
; pb_unpackbq is defined elsewhere; it is used here as a pshufb mask that
; leaves source byte 0 repeated in the low qword and byte 1 repeated in the
; high qword -- TODO confirm against the table definition.
4032 cglobal intra_pred_ang8_26, 3,6,3
4033 movu m2, [r2]
4034 palignr m0, m2, 1 ; low 8 bytes = src[1..8]
4035 lea r5, [r1 * 3]
4036 movh [r0], m0
4037 movh [r0 + r1], m0
4038 movh [r0 + r1 * 2], m0
4039 movh [r0 + r5], m0
4040 lea r3, [r0 + r1 * 4]
4041 movh [r3], m0
4042 movh [r3 + r1], m0
4043 movh [r3 + r1 * 2], m0
4044 movh [r3 + r5], m0
4045
4046 ; filter
4047 cmp r4m, byte 0
4048 jz .quit
4049
4050 pshufb m2, [pb_unpackbq]
4051 movhlps m1, m2 ; m1 low qword = src[1] repeated
4052 pmovzxbw m2, m2 ; words of src[0]
4053 movu m0, [r2 + 17]
4054 pmovzxbw m1, m1 ; words of src[1]
4055 pmovzxbw m0, m0 ; words of src[17 + y]
4056 psubw m0, m2
4057 psraw m0, 1
4058 paddw m1, m0
4059 packuswb m1, m1 ; saturate back to bytes
4060 pextrb [r0], m1, 0
4061 pextrb [r0 + r1], m1, 1
4062 pextrb [r0 + r1 * 2], m1, 2
4063 pextrb [r0 + r5], m1, 3
4064 pextrb [r3], m1, 4
4065 pextrb [r3 + r1], m1, 5
4066 pextrb [r3 + r1 * 2], m1, 6
4067 pextrb [r3 + r5], m1, 7
4068 .quit:
4069 RET
4070
; intra_pred_ang8_11: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) selects the reference base: src + 0 when it equals 25,
; otherwise src + 16. In both cases byte 0 of the loaded vector is replaced
; with src[0], so the window starts at the shared first sample.
; All eight lines use the same interleaved window (m0); the ang_table weight
; row steps down by two per line ([30] .. [16]).
; pmulhrsw by pw_1024 does the rounded >> 5 (assuming pw_1024 holds words of
; 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
4071 cglobal intra_pred_ang8_11, 3,5,8
4072 xor r4, r4
4073 cmp r3m, byte 25
4074 mov r3, 16 ; mov leaves the flags from cmp intact
4075 cmove r3, r4 ; r3 = 0 if r3m == 25, else 16
4076
4077 movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4078 pinsrb m0, [r2], 0
4079 palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4080
4081 punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4082
4083 lea r3, [ang_table + 23 * 16]
4084 mova m7, [pw_1024]
4085
; lines 0/1 -> m4
4086 pmaddubsw m4, m0, [r3 + 7 * 16] ; [30]
4087 pmulhrsw m4, m7
4088 pmaddubsw m3, m0, [r3 + 5 * 16] ; [28]
4089 pmulhrsw m3, m7
4090 packuswb m4, m3
4091
; lines 2/3 -> m5
4092 pmaddubsw m5, m0, [r3 + 3 * 16] ; [26]
4093 pmulhrsw m5, m7
4094
4095 pmaddubsw m6, m0, [r3 + 1 * 16] ; [24]
4096 pmulhrsw m6, m7
4097 packuswb m5, m6
4098
; lines 4/5 -> m6
4099 pmaddubsw m6, m0, [r3 - 1 * 16] ; [22]
4100 pmulhrsw m6, m7
4101
4102 pmaddubsw m2, m0, [r3 - 3 * 16] ; [20]
4103 pmulhrsw m2, m7
4104 packuswb m6, m2
4105
; lines 6/7 -> m1
4106 pmaddubsw m1, m0, [r3 - 5 * 16] ; [18]
4107 pmulhrsw m1, m7
4108
4109 pmaddubsw m0, [r3 - 7 * 16] ; [16]
4110 pmulhrsw m0, m7
4111 packuswb m1, m0
4112 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
4113
; intra_pred_ang8_12: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) picks which half of the src buffer is the main reference
; and which is the side reference:
;   r3m == 24 : main = src + 0  (r4 = 0),  side = src + 16 (r3 = 16)
;   otherwise : main = src + 16 (r4 = 16), side = src + 0  (r3 = 0)
; Byte 0 of the main vector is replaced with src[0]; the window is then
; extended one sample to the left with the side byte at offset 6 (lane "a" in
; the lane comments).
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
4114 cglobal intra_pred_ang8_12, 3,5,8
4115 xor r4, r4
4116 cmp r3m, byte 24
4117 mov r3, 16 ; mov leaves the flags from cmp intact
4118 jz .next
4119 xchg r3, r4
4120 .next:
4121
4122 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4123 pinsrb m1, [r2], 0
4124 pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
4125 pinsrb m0, [r2 + r3 + 6], 0
4126
4127 lea r4, [ang_table + 22 * 16]
4128 mova m7, [pw_1024]
4129
4130 punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
4131 punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
4132 palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4133
; lines 0/1 -> m4
4134 pmaddubsw m4, m2, [r4 + 5 * 16] ; [27]
4135 pmulhrsw m4, m7
4136 pmaddubsw m3, m2, [r4] ; [22]
4137 pmulhrsw m3, m7
4138 packuswb m4, m3
4139
; lines 6/7 -> m1 (these use the window that includes lane "a")
4140 pmaddubsw m1, m0, [r4 + 7 * 16] ; [29]
4141 pmulhrsw m1, m7
4142
4143 pmaddubsw m0, [r4 + 2 * 16] ; [24]
4144 pmulhrsw m0, m7
4145 packuswb m1, m0
4146
; lines 2/3 -> m5
4147 pmaddubsw m5, m2, [r4 - 5 * 16] ; [17]
4148 pmulhrsw m5, m7
4149
4150 lea r4, [ang_table + 7 * 16]
4151 pmaddubsw m6, m2, [r4 + 5 * 16] ; [12]
4152 pmulhrsw m6, m7
4153 packuswb m5, m6
4154
; lines 4/5 -> m6
4155 pmaddubsw m6, m2, [r4] ; [7]
4156 pmulhrsw m6, m7
4157
4158 pmaddubsw m2, [r4 - 5 * 16] ; [2]
4159 pmulhrsw m2, m7
4160 packuswb m6, m2
4161 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
4162
; intra_pred_ang8_13: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) picks the main/side halves of the src buffer:
;   r3m == 23 : main = src + 0  (r4 = 0),  side = src + 16 (r3 = 16)
;   otherwise : main = src + 16 (r4 = 16), side = src + 0  (r3 = 0)
; Byte 0 of the main vector is replaced with src[0]; the window is extended to
; the left with two side bytes, offsets 4 and 7 (lanes "a" and "b").
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
4163 cglobal intra_pred_ang8_13, 4,5,8
4164 xor r4, r4
4165 cmp r3m, byte 23
4166 mov r3, 16 ; mov leaves the flags from cmp intact
4167 jz .next
4168 xchg r3, r4
4169 .next:
4170
4171 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4172 pinsrb m1, [r2], 0
4173 pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
4174 pinsrb m1, [r2 + r3 + 4], 0
4175 pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
4176 pinsrb m0, [r2 + r3 + 7], 0
4177 punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
4178 punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
4179 palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
4180 palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4181
4182 lea r4, [ang_table + 24 * 16]
4183 mova m7, [pw_1024]
4184
; low half of m4 (line 0)
4185 pmaddubsw m4, m5, [r4 - 1 * 16] ; [23]
4186 pmulhrsw m4, m7
4187
; high half of m5 (line 3), kept in m6 until packed below
4188 pmaddubsw m6, m1, [r4 + 4 * 16] ; [28]
4189 pmulhrsw m6, m7
4190
; high half of m1 (line 7)
4191 pmaddubsw m0, [r4] ; [24]
4192 pmulhrsw m0, m7
4193
4194 lea r4, [ang_table + 13 * 16]
4195 pmaddubsw m3, m5, [r4 + 1 * 16] ; [14]
4196 pmulhrsw m3, m7
4197 packuswb m4, m3
4198
4199 pmaddubsw m5, [r4 - 8 * 16] ; [5]
4200 pmulhrsw m5, m7
4201 packuswb m5, m6
4202
4203 pmaddubsw m6, m1, [r4 + 6 * 16] ; [19]
4204 pmulhrsw m6, m7
4205
4206 pmaddubsw m2, m1, [r4 - 3 * 16] ; [10]
4207 pmulhrsw m2, m7
4208 packuswb m6, m2
4209
4210 pmaddubsw m1, [r4 - 12 * 16] ; [1]
4211 pmulhrsw m1, m7
4212 packuswb m1, m0
4213 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
4214
; intra_pred_ang8_14: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) picks the main/side halves of the src buffer:
;   r3m == 22 : main = src + 0  (r4 = 0),  side = src + 16 (r3 = 16)
;   otherwise : main = src + 16 (r4 = 16), side = src + 0  (r3 = 0)
; The main vector is loaded two bytes early so the three low lanes can be
; patched in place: lane 2 = src[0], lanes 1 and 0 = side bytes at offsets 2
; and 5; a fourth side byte (offset 7) is inserted after the shift.
; NOTE(review): with r4 == 0 the 16-byte load starts at src - 2, i.e. two
; bytes before the pointer passed in. Those two lanes are overwritten before
; use, but the read itself relies on the caller's buffer having readable
; bytes in front of src -- confirm against the allocation.
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
4215 cglobal intra_pred_ang8_14, 4,5,8
4216 xor r4, r4
4217 cmp r3m, byte 22
4218 mov r3, 16 ; mov leaves the flags from cmp intact
4219 jz .next
4220 xchg r3, r4
4221 .next:
4222
4223 movu m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
4224 pinsrb m1, [r2], 2
4225 pinsrb m1, [r2 + r3 + 2], 1
4226 pinsrb m1, [r2 + r3 + 5], 0
4227 pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
4228 pinsrb m0, [r2 + r3 + 7], 0
4229 punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
4230 punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
4231 palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
4232 palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
4233 palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4234
4235 lea r4, [ang_table + 24 * 16]
4236 mova m3, [pw_1024]
4237
4238 pmaddubsw m4, m2, [r4 - 5 * 16] ; [19]
4239 pmulhrsw m4, m3
4240
4241 pmaddubsw m0, [r4] ; [24]
4242 pmulhrsw m0, m3
4243
4244 pmaddubsw m5, m6, [r4 + 1 * 16] ; [25]
4245 pmulhrsw m5, m3
4246
4247 lea r4, [ang_table + 12 * 16]
4248 pmaddubsw m6, [r4] ; [12]
4249 pmulhrsw m6, m3
4250 packuswb m5, m6
4251
4252 pmaddubsw m6, m1, [r4 + 19 * 16] ; [31]
4253 pmulhrsw m6, m3
4254
4255 pmaddubsw m2, [r4 - 6 * 16] ; [6]
4256 pmulhrsw m2, m3
4257 packuswb m4, m2
4258
4259 pmaddubsw m2, m1, [r4 + 6 * 16] ; [18]
4260 pmulhrsw m2, m3
4261 packuswb m6, m2
4262
4263 pmaddubsw m1, [r4 - 7 * 16] ; [5]
4264 pmulhrsw m1, m3
4265 packuswb m1, m0
4266 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
4267
; intra_pred_ang8_15: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) picks the main/side halves of the src buffer:
;   r3m == 21 : main = src + 0  (r4 = 0),  side = src + 16 (r3 = 16)
;   otherwise : main = src + 16 (r4 = 16), side = src + 0  (r3 = 0)
; Byte 0 of the main vector is replaced with src[0]. The side vector is
; rearranged with the c_mode16_15 shuffle mask (defined elsewhere) and its top
; three bytes are prepended to the main vector (lanes a, b, c); one more side
; byte at offset 8 becomes lane d.
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
4268 cglobal intra_pred_ang8_15, 4,5,8
4269 xor r4, r4
4270 cmp r3m, byte 21
4271 mov r3, 16 ; mov leaves the flags from cmp intact
4272 jz .next
4273 xchg r3, r4
4274 .next:
4275
4276 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4277 pinsrb m1, [r2], 0
4278 movu m2, [r2 + r3]
4279 pshufb m2, [c_mode16_15]
4280 palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
4281 pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
4282 pinsrb m0, [r2 + r3 + 8], 0
4283 punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
4284 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
4285 palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
4286 palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
4287 palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
4288 palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4289
4290 lea r4, [ang_table + 23 * 16]
4291 mova m3, [pw_1024]
4292
; lines 0/1 -> m4
4293 pmaddubsw m4, [r4 - 8 * 16] ; [15]
4294 pmulhrsw m4, m3
4295
4296 pmaddubsw m2, m5, [r4 + 7 * 16] ; [30]
4297 pmulhrsw m2, m3
4298 packuswb m4, m2
4299
; lines 2/3 -> m5
4300 pmaddubsw m5, [r4 - 10 * 16] ; [13]
4301 pmulhrsw m5, m3
4302
4303 pmaddubsw m2, m6, [r4 + 5 * 16] ; [28]
4304 pmulhrsw m2, m3
4305 packuswb m5, m2
4306
; high half of m6 (line 5), kept in m2 until packed below
4307 pmaddubsw m2, m1, [r4 + 3 * 16] ; [26]
4308 pmulhrsw m2, m3
4309
; high half of m1 (line 7)
4310 pmaddubsw m0, [r4 + 1 * 16] ; [24]
4311 pmulhrsw m0, m3
4312
4313 lea r4, [ang_table + 11 * 16]
4314 pmaddubsw m6, [r4] ; [11]
4315 pmulhrsw m6, m3
4316 packuswb m6, m2
4317
4318 pmaddubsw m1, [r4 - 2 * 16] ; [9]
4319 pmulhrsw m1, m3
4320 packuswb m1, m0
4321 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
4322
; intra_pred_ang8_16: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) picks the main/side halves of the src buffer:
;   r3m == 20 : main = src + 0  (r4 = 0),  side = src + 16 (r3 = 16)
;   otherwise : main = src + 16 (r4 = 16), side = src + 0  (r3 = 0)
; Byte 0 of the main vector is replaced with src[0]. The side vector is
; rearranged with the c_mode16_16 shuffle mask (defined elsewhere) and its top
; four bytes are prepended to the main vector (lanes a..d); one more side byte
; at offset 8 becomes lane e.
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
4323 cglobal intra_pred_ang8_16, 4,5,8
4324 xor r4, r4
4325 cmp r3m, byte 20
4326 mov r3, 16 ; mov leaves the flags from cmp intact
4327 jz .next
4328 xchg r3, r4
4329 .next:
4330
4331 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4332 pinsrb m1, [r2], 0
4333 movu m2, [r2 + r3]
4334 pshufb m2, [c_mode16_16]
4335 palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
4336 pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
4337 pinsrb m0, [r2 + r3 + 8], 0
4338 punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
4339 punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e]
4340 palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
4341 palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
4342 palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
4343 palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
4344 palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4345
4346 lea r4, [ang_table + 22 * 16]
4347 mova m7, [pw_1024]
4348
; high half of m4 (line 1), kept in m3 until packed below
4349 pmaddubsw m3, m5, [r4] ; [22]
4350 pmulhrsw m3, m7
4351
; high half of m1 (line 7)
4352 pmaddubsw m0, [r4 + 2 * 16] ; [24]
4353 pmulhrsw m0, m7
4354
4355 lea r4, [ang_table + 9 * 16]
4356
4357 pmaddubsw m4, [r4 + 2 * 16] ; [11]
4358 pmulhrsw m4, m7
4359 packuswb m4, m3
4360
4361 pmaddubsw m2, [r4 + 3 * 16] ; [12]
4362 pmulhrsw m2, m7
4363
4364 pmaddubsw m5, [r4 - 8 * 16] ; [1]
4365 pmulhrsw m5, m7
4366 packuswb m5, m2
4367
4368 mova m2, m6
4369 pmaddubsw m6, [r4 + 14 * 16] ; [23]
4370 pmulhrsw m6, m7
4371
4372 pmaddubsw m2, [r4 - 7 * 16] ; [2]
4373 pmulhrsw m2, m7
4374 packuswb m6, m2
4375
4376 pmaddubsw m1, [r4 + 4 * 16] ; [13]
4377 pmulhrsw m1, m7
4378 packuswb m1, m0
4379 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
4380
; intra_pred_ang8_17: 8x8 angular predictor, one body for two direction modes.
; r3m (4th argument) picks the main/side halves of the src buffer:
;   r3m == 19 : main = src + 0  (r4 = 0),  side = src + 16 (r3 = 16)
;   otherwise : main = src + 16 (r4 = 16), side = src + 0  (r3 = 0)
; Byte 0 of the main vector is replaced with src[0]. The side vector is
; rearranged with the c_mode16_17 shuffle mask (defined elsewhere) and its top
; five bytes are prepended to the main vector (lanes a..e); one more side byte
; at offset 7 becomes lane f.
; Every line uses its own window position here, so seven palignr shifts of the
; interleaved pair (m1:m0) are taken.
; Weight rows come from ang_table ([N] = row index); pmulhrsw by pw_1024 does
; the rounded >> 5 (assuming pw_1024 holds words of 1024).
; Output: two 8-pixel lines per register in m4, m5, m6, m1, then a tail-jump
; to .transpose8x8 in intra_pred_ang8_3 (outside this chunk).
4381 cglobal intra_pred_ang8_17, 4,5,8
4382 xor r4, r4
4383 cmp r3m, byte 19
4384 mov r3, 16 ; mov leaves the flags from cmp intact
4385 jz .next
4386 xchg r3, r4
4387 .next:
4388
4389 movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4390 pinsrb m2, [r2], 0
4391 movu m1, [r2 + r3]
4392 pshufb m1, [c_mode16_17]
4393 palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
4394 pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
4395 pinsrb m0, [r2 + r3 + 7], 0
4396 punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
4397 punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f]
4398
4399 palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
4400 palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
4401 palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4402
4403 lea r4, [ang_table + 17 * 16]
4404 mova m3, [pw_1024]
4405
; lines 0/1 -> m4
4406 pmaddubsw m2, [r4 - 5 * 16] ; [12]
4407 pmulhrsw m2, m3
4408
4409 pmaddubsw m4, [r4 - 11 * 16] ; [6]
4410 pmulhrsw m4, m3
4411 packuswb m4, m2
4412
; lines 2/3 -> m5
4413 pmaddubsw m5, [r4 + 1 * 16] ; [18]
4414 pmulhrsw m5, m3
4415
4416 palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
4417 pmaddubsw m2, [r4 + 7 * 16] ; [24]
4418 pmulhrsw m2, m3
4419 packuswb m5, m2
4420
; lines 4/5 -> m6 (same window, two different weights)
4421 palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
4422 mova m2, m6
4423 pmaddubsw m6, [r4 + 13 * 16] ; [30]
4424 pmulhrsw m6, m3
4425
4426 pmaddubsw m2, [r4 - 13 * 16] ; [4]
4427 pmulhrsw m2, m3
4428 packuswb m6, m2
4429
; lines 6/7 -> m1
4430 palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e]
4431 pmaddubsw m1, [r4 - 7 * 16] ; [10]
4432 pmulhrsw m1, m3
4433
4434 pmaddubsw m0, [r4 - 1 * 16] ; [16]
4435 pmulhrsw m0, m3
4436 packuswb m1, m0
4437 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
4438
; intra_pred_ang8_18: 8x8 predictor that needs no arithmetic: each row is an
; 8-byte window into one 16-byte vector, sliding by one byte per row.
; The vector is built as:
;   low qword  = bytes from src + 16 with byte 0 replaced by src[0], then
;                permuted by pb_swap8 (defined elsewhere; presumably reverses
;                the low 8 bytes -- TODO confirm against the table),
;   high qword = src[1..8].
; The bottom row (row 7) is stored first from the unshifted vector; each
; psrldq by one byte then yields the row above it, ending with row 0.
; r2 and r3 are reused as scratch (dst + 4 * stride and 3 * stride).
4439 cglobal intra_pred_ang8_18, 4,4,1
4440 movu m0, [r2 + 16]
4441 pinsrb m0, [r2], 0
4442 pshufb m0, [pb_swap8]
4443 movhps m0, [r2 + 1]
4444 lea r2, [r0 + r1 * 4]
4445 lea r3, [r1 * 3]
4446 movh [r2 + r3], m0 ; row 7
4447 psrldq m0, 1
4448 movh [r2 + r1 * 2], m0 ; row 6
4449 psrldq m0, 1
4450 movh [r2 + r1], m0 ; row 5
4451 psrldq m0, 1
4452 movh [r2], m0 ; row 4
4453 psrldq m0, 1
4454 movh [r0 + r3], m0 ; row 3
4455 psrldq m0, 1
4456 movh [r0 + r1 * 2], m0 ; row 2
4457 psrldq m0, 1
4458 movh [r0 + r1], m0 ; row 1
4459 psrldq m0, 1
4460 movh [r0], m0 ; row 0
4461 RET
4462
; TRANSPOSE_STORE_8x8 col, transpose, a, b, c, d
;   %1 col       : 8-byte column index; used only by the transpose path, where
;                  every store address gets "+ %1 * 8".
;   %2 transpose : 1 = transpose the 8x8 byte tile before storing,
;                  anything else = store the lines as they are.
;   %3..%6       : four XMM registers, each holding two 8-byte lines
;                  (low qword = even line, high qword = odd line).
;
; Transpose path (%2 == 1):
;   requires r1 = stride, r5 = 3 * stride, r0 = address of row 0 and
;   r6 = address of row 4 of the destination tile. Uses m0 as scratch and
;   overwrites %3..%6, so none of the four arguments may be m0.
;   r0 / r6 are left unchanged.
; Plain path:
;   requires r1 = stride and r5 = 3 * stride; stores eight rows starting at
;   r0 and leaves r0 advanced by 8 * stride. %1 is ignored.
4463 %macro TRANSPOSE_STORE_8x8 6
4464 %if %2 == 1
4465 ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
4466 punpckhbw m0, %3, %4
4467 punpcklbw %3, %4
4468 punpckhbw %4, %3, m0
4469 punpcklbw %3, m0
4470
; use the macro argument %6 here, not a hard-coded m1: the original only
; worked because every caller happens to pass m1 as the sixth argument
4471 punpckhbw m0, %5, %6
4472 punpcklbw %5, %6
4473 punpckhbw %6, %5, m0
4474 punpcklbw %5, m0
4475
4476 punpckhdq m0, %3, %5
4477 punpckldq %3, %5
4478 punpckldq %5, %4, %6
4479 punpckhdq %4, %6
4480
4481 movh [r0 + %1 * 8], %3
4482 movhps [r0 + r1 + %1 * 8], %3
4483 movh [r0 + r1*2 + %1 * 8], m0
4484 movhps [r0 + r5 + %1 * 8], m0
4485 movh [r6 + %1 * 8], %5
4486 movhps [r6 + r1 + %1 * 8], %5
4487 movh [r6 + r1*2 + %1 * 8], %4
4488 movhps [r6 + r5 + %1 * 8], %4
4489 %else
4490 ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
4491 movh [r0 ], %3
4492 movhps [r0 + r1 ], %3
4493 movh [r0 + r1 * 2], %4
4494 movhps [r0 + r5 ], %4
4495 lea r0, [r0 + r1 * 4]
4496 movh [r0 ], %5
4497 movhps [r0 + r1 ], %5
4498 movh [r0 + r1 * 2], %6
4499 movhps [r0 + r5 ], %6
4500 lea r0, [r0 + r1 * 4]
4501 %endif
4502 %endmacro
4503
4504 ;------------------------------------------------------------------------------------------
4505 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
4506 ;------------------------------------------------------------------------------------------
; intra_pred_ang16_2: 16x16 predictor that needs no arithmetic. Row k of the
; block is the 16 bytes starting at base + k, where
;   base = src + 2  when r3m (dirMode) == 34,
;   base = src + 34 otherwise.
; Two consecutive 16-byte vectors are loaded once and each row is produced by
; a palignr with an increasing byte shift (0..15). r0 is advanced by two rows
; after every pair of stores.
4507 INIT_XMM ssse3
4508 cglobal intra_pred_ang16_2, 3,5,3
4509 lea r4, [r2 + 2]
4510 add r2, 34
4511 cmp r3m, byte 34
4512 cmove r2, r4 ; r2 = src + 2 if dirMode == 34, else src + 34
4513 movu m0, [r2]
4514 movu m1, [r2 + 16]
4515 movu [r0], m0 ; row 0
4516 palignr m2, m1, m0, 1
4517 movu [r0 + r1], m2 ; row 1
4518 lea r0, [r0 + r1 * 2]
4519 palignr m2, m1, m0, 2
4520 movu [r0], m2
4521 palignr m2, m1, m0, 3
4522 movu [r0 + r1], m2
4523 lea r0, [r0 + r1 * 2]
4524 palignr m2, m1, m0, 4
4525 movu [r0], m2
4526 palignr m2, m1, m0, 5
4527 movu [r0 + r1], m2
4528 lea r0, [r0 + r1 * 2]
4529 palignr m2, m1, m0, 6
4530 movu [r0], m2
4531 palignr m2, m1, m0, 7
4532 movu [r0 + r1], m2
4533 lea r0, [r0 + r1 * 2]
4534 palignr m2, m1, m0, 8
4535 movu [r0], m2
4536 palignr m2, m1, m0, 9
4537 movu [r0 + r1], m2
4538 lea r0, [r0 + r1 * 2]
4539 palignr m2, m1, m0, 10
4540 movu [r0], m2
4541 palignr m2, m1, m0, 11
4542 movu [r0 + r1], m2
4543 lea r0, [r0 + r1 * 2]
4544 palignr m2, m1, m0, 12
4545 movu [r0], m2
4546 palignr m2, m1, m0, 13
4547 movu [r0 + r1], m2
4548 lea r0, [r0 + r1 * 2]
4549 palignr m2, m1, m0, 14
4550 movu [r0], m2
4551 palignr m2, m1, m0, 15
4552 movu [r0 + r1], m2 ; row 15
4553 RET
4554
; intra_pred_ang16_3: 16x16 angular predictor reading its reference from
; src + 32 and storing through the transposing path of TRANSPOSE_STORE_8x8.
; The loop runs twice (r4d = 2). Each pass computes sixteen 8-pixel lines
; (two groups of eight) and writes them as two transposed 8x8 tiles, at
; column offsets 0 and 8, into eight destination rows; then r0 / r6 move down
; eight rows and r2 advances by eight reference samples.
; Register roles inside the loop:
;   r3 = ang_table + 16 * 16 (weight rows addressed relative to row 16)
;   r5 = 3 * stride, r0 = row 0 of the current tile, r6 = row 4
;   m7 = pw_1024 (rounded >> 5 via pmulhrsw, assuming words of 1024)
; The [N] comments give the ang_table row used for each line; the last line of
; the second group has weight 0 and is copied straight from the reference.
4555 INIT_XMM sse4
4556 cglobal intra_pred_ang16_3, 3,7,8
4557 add r2, 32
4558 lea r3, [ang_table + 16 * 16]
4559 mov r4d, 2
4560 lea r5, [r1 * 3] ; r5 -> 3 * stride
4561 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
4562 mova m7, [pw_1024]
4563
4564 .loop:
4565 movu m0, [r2 + 1]
4566 palignr m1, m0, 1
4567
4568 punpckhbw m2, m0, m1
4569 punpcklbw m0, m1
4570 palignr m1, m2, m0, 2
4571
4572 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
4573 pmulhrsw m4, m7
4574 pmaddubsw m1, [r3 + 4 * 16] ; [20]
4575 pmulhrsw m1, m7
4576 packuswb m4, m1
4577
4578 palignr m5, m2, m0, 4
4579
4580 pmaddubsw m5, [r3 - 2 * 16] ; [14]
4581 pmulhrsw m5, m7
4582
4583 palignr m6, m2, m0, 6
4584
4585 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
4586 pmulhrsw m6, m7
4587 packuswb m5, m6
4588
4589 palignr m1, m2, m0, 8
4590
4591 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
4592 pmulhrsw m6, m7
4593
4594 pmaddubsw m1, [r3 + 12 * 16] ; [28]
4595 pmulhrsw m1, m7
4596 packuswb m6, m1
4597
4598 palignr m1, m2, m0, 10
4599
4600 pmaddubsw m1, [r3 + 6 * 16] ; [22]
4601 pmulhrsw m1, m7
4602
4603 palignr m2, m0, 12
4604
4605 pmaddubsw m2, [r3] ; [16]
4606 pmulhrsw m2, m7
4607 packuswb m1, m2
4608
4609 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4610
; second group of eight lines: reference window restarts at r2 + 8
4611 movu m0, [r2 + 8]
4612 palignr m1, m0, 1
4613
4614 punpckhbw m2, m0, m1
4615 punpcklbw m0, m1
4616 palignr m5, m2, m0, 2
4617
4618 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
4619 pmulhrsw m4, m7
4620 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
4621 pmulhrsw m1, m7
4622 packuswb m4, m1
4623
4624 pmaddubsw m5, [r3 + 14 * 16] ; [30]
4625 pmulhrsw m5, m7
4626
4627 palignr m6, m2, m0, 4
4628
4629 pmaddubsw m6, [r3 + 8 * 16] ; [24]
4630 pmulhrsw m6, m7
4631 packuswb m5, m6
4632
4633 palignr m1, m2, m0, 6
4634
4635 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
4636 pmulhrsw m6, m7
4637
4638 palignr m1, m2, m0, 8
4639
4640 pmaddubsw m1, [r3 - 4 * 16] ; [12]
4641 pmulhrsw m1, m7
4642 packuswb m6, m1
4643
4644 palignr m1, m2, m0, 10
4645
4646 pmaddubsw m1, [r3 - 10 * 16] ; [06]
4647 pmulhrsw m1, m7
4648 packuswb m1, m1
4649
4650 movhps m1, [r2 + 14] ; [00]
4651
4652 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4653
; step down eight destination rows and eight reference samples
4654 lea r0, [r6 + r1 * 4]
4655 lea r6, [r6 + r1 * 8]
4656 add r2, 8
4657 dec r4
4658 jnz .loop
4659 RET
4660
; intra_pred_ang16_33: 16x16 angular predictor using the same weights as
; intra_pred_ang16_3 but reading its reference directly from src and storing
; lines without transposition.
; The loop runs twice (r4d = 2). Each pass fills an 8-pixel-wide, 16-row
; column strip: the first eight rows through the plain path of
; TRANSPOSE_STORE_8x8 (which advances r0 by eight rows), the next eight with
; explicit stores. r6 keeps the original dst so the second pass can restart at
; dst + 8; r2 advances by eight reference samples.
; Register roles inside the loop:
;   r3 = ang_table + 16 * 16, r5 = 3 * stride
;   m7 = pw_1024 (rounded >> 5 via pmulhrsw, assuming words of 1024)
; The final row has weight 0 and is copied straight from the reference (m2).
4661 INIT_XMM sse4
4662 cglobal intra_pred_ang16_33, 3,7,8
4663 lea r3, [ang_table + 16 * 16]
4664 mov r4d, 2
4665 lea r5, [r1 * 3]
4666 mov r6, r0
4667 mova m7, [pw_1024]
4668
4669 .loop:
4670 movu m0, [r2 + 1]
4671 palignr m1, m0, 1
4672
4673 punpckhbw m2, m0, m1
4674 punpcklbw m0, m1
4675 palignr m1, m2, m0, 2
4676
4677 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
4678 pmulhrsw m4, m7
4679 pmaddubsw m1, [r3 + 4 * 16] ; [20]
4680 pmulhrsw m1, m7
4681 packuswb m4, m1
4682
4683 palignr m5, m2, m0, 4
4684
4685 pmaddubsw m5, [r3 - 2 * 16] ; [14]
4686 pmulhrsw m5, m7
4687
4688 palignr m6, m2, m0, 6
4689
4690 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
4691 pmulhrsw m6, m7
4692 packuswb m5, m6
4693
4694 palignr m1, m2, m0, 8
4695
4696 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
4697 pmulhrsw m6, m7
4698
4699 pmaddubsw m1, [r3 + 12 * 16] ; [28]
4700 pmulhrsw m1, m7
4701 packuswb m6, m1
4702
4703 palignr m1, m2, m0, 10
4704
4705 pmaddubsw m1, [r3 + 6 * 16] ; [22]
4706 pmulhrsw m1, m7
4707
4708 palignr m2, m0, 12
4709
4710 pmaddubsw m2, [r3] ; [16]
4711 pmulhrsw m2, m7
4712 packuswb m1, m2
4713
4714 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4715
; rows 8..15 of this strip: reference window restarts at r2 + 8
4716 movu m0, [r2 + 8]
4717 palignr m1, m0, 1
4718
4719 punpckhbw m2, m0, m1
4720 punpcklbw m0, m1
4721 palignr m5, m2, m0, 2
4722
4723 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
4724 pmulhrsw m4, m7
4725 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
4726 pmulhrsw m1, m7
4727 packuswb m4, m1
4728
4729 pmaddubsw m5, [r3 + 14 * 16] ; [30]
4730 pmulhrsw m5, m7
4731
4732 palignr m6, m2, m0, 4
4733
4734 pmaddubsw m6, [r3 + 8 * 16] ; [24]
4735 pmulhrsw m6, m7
4736 packuswb m5, m6
4737
4738 palignr m1, m2, m0, 6
4739
4740 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
4741 pmulhrsw m6, m7
4742
4743 palignr m1, m2, m0, 8
4744
4745 pmaddubsw m1, [r3 - 4 * 16] ; [12]
4746 pmulhrsw m1, m7
4747 packuswb m6, m1
4748
4749 palignr m1, m2, m0, 10
4750
4751 pmaddubsw m1, [r3 - 10 * 16] ; [06]
4752 pmulhrsw m1, m7
4753 packuswb m1, m1
4754
4755 movh m2, [r2 + 14] ; [00]
4756
; r0 was already advanced to row 8 by the macro above
4757 movh [r0 ], m4
4758 movhps [r0 + r1 ], m4
4759 movh [r0 + r1 * 2], m5
4760 movhps [r0 + r5 ], m5
4761 lea r0, [r0 + r1 * 4]
4762 movh [r0 ], m6
4763 movhps [r0 + r1 ], m6
4764 movh [r0 + r1 * 2], m1
4765 movh [r0 + r5 ], m2
4766
; next pass: right-hand 8 columns, reference advanced by 8
4767 lea r0, [r6 + 8]
4768 add r2, 8
4769 dec r4
4770 jnz .loop
4771 RET
4772
; intra_pred_ang16_4: 16x16 angular predictor reading its reference from
; src + 32 and storing through the transposing path of TRANSPOSE_STORE_8x8.
; The loop runs twice (r4d = 2). Each pass computes sixteen 8-pixel lines and
; writes them as two transposed 8x8 tiles (column offsets 0 and 8) into eight
; destination rows; then r0 / r6 move down eight rows and r2 advances by
; eight reference samples.
; Register roles inside the loop:
;   r3 = ang_table + 16 * 16, r5 = 3 * stride
;   r0 = row 0 of the current tile, r6 = row 4
;   m7 = pw_1024 (rounded >> 5 via pmulhrsw, assuming words of 1024)
; m2 still holds the last window of the first group when the macro is invoked
; (the macro only touches m0 and its four arguments), so the first line of the
; second group reuses it before the reference is reloaded.
4773 INIT_XMM sse4
4774 cglobal intra_pred_ang16_4, 3,7,8
4775 add r2, 32
4776 lea r3, [ang_table + 16 * 16]
4777 mov r4d, 2
4778 lea r5, [r1 * 3] ; r5 -> 3 * stride
4779 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
4780 mova m7, [pw_1024]
4781
4782 .loop:
4783 movu m0, [r2 + 1]
4784 palignr m1, m0, 1
4785
4786 punpckhbw m2, m0, m1
4787 punpcklbw m0, m1
4788 palignr m1, m2, m0, 2
4789 mova m5, m1
4790
4791 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
4792 pmulhrsw m4, m7
4793 pmaddubsw m1, [r3 - 6 * 16] ; [10]
4794 pmulhrsw m1, m7
4795 packuswb m4, m1
4796
4797 pmaddubsw m5, [r3 + 15 * 16] ; [31]
4798 pmulhrsw m5, m7
4799
4800 palignr m6, m2, m0, 4
4801
4802 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
4803 pmulhrsw m6, m7
4804 packuswb m5, m6
4805
4806 palignr m1, m2, m0, 6
4807
4808 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
4809 pmulhrsw m6, m7
4810
4811 pmaddubsw m1, [r3 + 14 * 16] ; [30]
4812 pmulhrsw m1, m7
4813 packuswb m6, m1
4814
4815 palignr m1, m2, m0, 8
4816
4817 pmaddubsw m1, [r3 + 3 * 16] ; [19]
4818 pmulhrsw m1, m7
4819
4820 palignr m2, m0, 10
4821
4822 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
4823 pmulhrsw m3, m7
4824 packuswb m1, m3
4825
4826 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4827
; second group: first line reuses the window left in m2
4828 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
4829 pmulhrsw m4, m7
4830
4831 movu m0, [r2 + 6]
4832 palignr m1, m0, 1
4833
4834 punpckhbw m2, m0, m1
4835 punpcklbw m0, m1
4836 palignr m1, m2, m0, 2
4837
4838 pmaddubsw m1, [r3 + 2 * 16] ; [18]
4839 pmulhrsw m1, m7
4840 packuswb m4, m1
4841
4842 palignr m5, m2, m0, 4
4843 mova m6, m5
4844
4845 pmaddubsw m5, [r3 - 9 * 16] ; [07]
4846 pmulhrsw m5, m7
4847
4848 pmaddubsw m6, [r3 + 12 * 16] ; [28]
4849 pmulhrsw m6, m7
4850 packuswb m5, m6
4851
4852 palignr m6, m2, m0, 6
4853
4854 pmaddubsw m6, [r3 + 16] ; [17]
4855 pmulhrsw m6, m7
4856
4857 palignr m1, m2, m0, 8
4858 palignr m2, m0, 10
4859
4860 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
4861 pmulhrsw m3, m7
4862 packuswb m6, m3
4863
4864 pmaddubsw m1, [r3 + 11 * 16] ; [27]
4865 pmulhrsw m1, m7
4866
4867 pmaddubsw m2, [r3] ; [16]
4868 pmulhrsw m2, m7
4869 packuswb m1, m2
4870
4871 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
4872
; step down eight destination rows and eight reference samples
4873 lea r0, [r6 + r1 * 4]
4874 lea r6, [r6 + r1 * 8]
4875 add r2, 8
4876 dec r4
4877 jnz .loop
4878 RET
4879
; intra_pred_ang16_32: 16x16 angular predictor using the same weights as
; intra_pred_ang16_4 but reading its reference directly from src and storing
; lines without transposition.
; The loop runs twice (r4d = 2). Each pass fills an 8-pixel-wide, 16-row
; column strip with two plain-path TRANSPOSE_STORE_8x8 invocations (each one
; advances r0 by eight rows). r6 keeps the original dst so the second pass
; can restart at dst + 8; r2 advances by eight reference samples.
; Register roles inside the loop:
;   r3 = ang_table + 16 * 16, r5 = 3 * stride
;   m7 = pw_1024 (rounded >> 5 via pmulhrsw, assuming words of 1024)
; m2 survives the first macro invocation and supplies the window for the
; first line of the second group.
4880 INIT_XMM sse4
4881 cglobal intra_pred_ang16_32, 3,7,8
4882 lea r3, [ang_table + 16 * 16]
4883 mov r4d, 2
4884 lea r5, [r1 * 3] ; r5 -> 3 * stride
4885 mov r6, r0
4886 mova m7, [pw_1024]
4887
4888 .loop:
4889 movu m0, [r2 + 1]
4890 palignr m1, m0, 1
4891
4892 punpckhbw m2, m0, m1
4893 punpcklbw m0, m1
4894 palignr m1, m2, m0, 2
4895 mova m5, m1
4896
4897
4898 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
4899 pmulhrsw m4, m7
4900 pmaddubsw m1, [r3 - 6 * 16] ; [10]
4901 pmulhrsw m1, m7
4902 packuswb m4, m1
4903
4904 pmaddubsw m5, [r3 + 15 * 16] ; [31]
4905 pmulhrsw m5, m7
4906
4907 palignr m6, m2, m0, 4
4908
4909 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
4910 pmulhrsw m6, m7
4911 packuswb m5, m6
4912
4913 palignr m1, m2, m0, 6
4914
4915 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
4916 pmulhrsw m6, m7
4917
4918 pmaddubsw m1, [r3 + 14 * 16] ; [30]
4919 pmulhrsw m1, m7
4920 packuswb m6, m1
4921
4922 palignr m1, m2, m0, 8
4923
4924 pmaddubsw m1, [r3 + 3 * 16] ; [19]
4925 pmulhrsw m1, m7
4926
4927 palignr m2, m0, 10
4928
4929 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
4930 pmulhrsw m3, m7
4931 packuswb m1, m3
4932
4933 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4934
; second group: first line reuses the window left in m2
4935 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
4936 pmulhrsw m4, m7
4937
4938 movu m0, [r2 + 6]
4939 palignr m1, m0, 1
4940
4941 punpckhbw m2, m0, m1
4942 punpcklbw m0, m1
4943 palignr m1, m2, m0, 2
4944
4945 pmaddubsw m1, [r3 + 2 * 16] ; [18]
4946 pmulhrsw m1, m7
4947 packuswb m4, m1
4948
4949 palignr m5, m2, m0, 4
4950 mova m6, m5
4951
4952 pmaddubsw m5, [r3 - 9 * 16] ; [07]
4953 pmulhrsw m5, m7
4954
4955 pmaddubsw m6, [r3 + 12 * 16] ; [28]
4956 pmulhrsw m6, m7
4957 packuswb m5, m6
4958
4959 palignr m6, m2, m0, 6
4960
4961 pmaddubsw m6, [r3 + 16] ; [17]
4962 pmulhrsw m6, m7
4963
4964 palignr m1, m2, m0, 8
4965 palignr m2, m0, 10
4966
4967 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
4968 pmulhrsw m3, m7
4969 packuswb m6, m3
4970
4971 pmaddubsw m1, [r3 + 11 * 16] ; [27]
4972 pmulhrsw m1, m7
4973
4974 pmaddubsw m2, [r3] ; [16]
4975 pmulhrsw m2, m7
4976 packuswb m1, m2
4977
4978 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
4979
; next pass: right-hand 8 columns, reference advanced by 8
4980 lea r0, [r6 + 8]
4981 add r2, 8
4982 dec r4
4983 jnz .loop
4984 RET
4985
; intra_pred_ang16_5: 16x16 angular predictor reading its reference from
; src + 32 and storing through the transposing path of TRANSPOSE_STORE_8x8.
; The loop runs twice (r4d = 2). Each pass computes sixteen 8-pixel lines and
; writes them as two transposed 8x8 tiles (column offsets 0 and 8) into eight
; destination rows; then r0 / r6 move down eight rows and r2 advances by
; eight reference samples.
; Register roles inside the loop:
;   r3 = ang_table + 16 * 16, r5 = 3 * stride
;   r0 = row 0 of the current tile, r6 = row 4
;   m3 / m2 = low / high interleaved reference pairs; both survive the first
;             macro invocation (it only touches m0 and its four arguments), so
;             the second group needs no reload
;   m7 = pw_1024 (rounded >> 5 via pmulhrsw, assuming words of 1024)
4986 INIT_XMM sse4
4987 cglobal intra_pred_ang16_5, 3,7,8
4988 add r2, 32
4989 lea r3, [ang_table + 16 * 16]
4990 mov r4d, 2
4991 lea r5, [r1 * 3] ; r5 -> 3 * stride
4992 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
4993 mova m7, [pw_1024]
4994
4995 .loop:
4996 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4997 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
4998 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
4999 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5000
5001 palignr m5, m2, m3, 2
5002
5003 pmaddubsw m4, m3, [r3 + 16] ; [17]
5004 pmulhrsw m4, m7
5005 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
5006 pmulhrsw m1, m7
5007 packuswb m4, m1
5008
5009 palignr m6, m2, m3, 4
5010
5011 pmaddubsw m5, [r3 + 3 * 16] ; [19]
5012 pmulhrsw m5, m7
5013 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
5014 pmulhrsw m1, m7
5015 packuswb m5, m1
5016
5017 palignr m1, m2, m3, 6
5018
5019 pmaddubsw m6, [r3 + 5 * 16] ; [21]
5020 pmulhrsw m6, m7
5021 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
5022 pmulhrsw m0, m7
5023 packuswb m6, m0
5024
5025 palignr m0, m2, m3, 8
5026
5027 pmaddubsw m1, [r3 + 7 * 16] ; [23]
5028 pmulhrsw m1, m7
5029 pmaddubsw m0, [r3 - 8 * 16] ; [8]
5030 pmulhrsw m0, m7
5031 packuswb m1, m0
5032
5033 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5034
; second group: windows continue from the same m2:m3 pair
5035 palignr m4, m2, m3, 8
5036 palignr m5, m2, m3, 10
5037
5038 pmaddubsw m4, [r3 + 9 * 16] ; [25]
5039 pmulhrsw m4, m7
5040 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
5041 pmulhrsw m1, m7
5042 packuswb m4, m1
5043
5044 palignr m6, m2, m3, 12
5045
5046 pmaddubsw m5, [r3 + 11 * 16] ; [27]
5047 pmulhrsw m5, m7
5048 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
5049 pmulhrsw m1, m7
5050 packuswb m5, m1
5051
5052 palignr m1, m2, m3, 14
5053
5054 pmaddubsw m6, [r3 + 13 * 16] ; [29]
5055 pmulhrsw m6, m7
5056 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
5057 pmulhrsw m0, m7
5058 packuswb m6, m0
5059
5060 pmaddubsw m1, [r3 + 15 * 16] ; [31]
5061 pmulhrsw m1, m7
5062 pmaddubsw m2, [r3] ; [16]
5063 pmulhrsw m2, m7
5064 packuswb m1, m2
5065
5066 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5067
; step down eight destination rows and eight reference samples
5068 lea r0, [r6 + r1 * 4]
5069 lea r6, [r6 + r1 * 8]
5070 add r2, 8
5071 dec r4
5072 jnz .loop
5073 RET
5074
; intra_pred_ang16_31: 16x16 angular predictor using the same weights as
; intra_pred_ang16_5 but reading its reference directly from src and storing
; lines without transposition.
; The loop runs twice (r4d = 2). Each pass fills an 8-pixel-wide, 16-row
; column strip with two plain-path TRANSPOSE_STORE_8x8 invocations (each one
; advances r0 by eight rows). r6 keeps the original dst so the second pass
; can restart at dst + 8; r2 advances by eight reference samples.
; Register roles inside the loop:
;   r3 = ang_table + 16 * 16, r5 = 3 * stride
;   m3 / m2 = low / high interleaved reference pairs, kept across the first
;             macro invocation
;   m7 = pw_1024 (rounded >> 5 via pmulhrsw, assuming words of 1024)
5075 INIT_XMM sse4
5076 cglobal intra_pred_ang16_31, 3,7,8
5077 lea r3, [ang_table + 16 * 16]
5078 mov r4d, 2
5079 lea r5, [r1 * 3] ; r5 -> 3 * stride
5080 mov r6, r0
5081 mova m7, [pw_1024]
5082
5083 .loop:
5084 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5085 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5086 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
5087 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5088
5089 palignr m5, m2, m3, 2
5090
5091 pmaddubsw m4, m3, [r3 + 16] ; [17]
5092 pmulhrsw m4, m7
5093 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
5094 pmulhrsw m1, m7
5095 packuswb m4, m1
5096
5097 palignr m6, m2, m3, 4
5098
5099 pmaddubsw m5, [r3 + 3 * 16] ; [19]
5100 pmulhrsw m5, m7
5101 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
5102 pmulhrsw m1, m7
5103 packuswb m5, m1
5104
5105 palignr m1, m2, m3, 6
5106
5107 pmaddubsw m6, [r3 + 5 * 16] ; [21]
5108 pmulhrsw m6, m7
5109 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
5110 pmulhrsw m0, m7
5111 packuswb m6, m0
5112
5113 palignr m0, m2, m3, 8
5114
5115 pmaddubsw m1, [r3 + 7 * 16] ; [23]
5116 pmulhrsw m1, m7
5117 pmaddubsw m0, [r3 - 8 * 16] ; [8]
5118 pmulhrsw m0, m7
5119 packuswb m1, m0
5120
5121 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5122
; second group: windows continue from the same m2:m3 pair
5123 palignr m4, m2, m3, 8
5124 palignr m5, m2, m3, 10
5125
5126 pmaddubsw m4, [r3 + 9 * 16] ; [25]
5127 pmulhrsw m4, m7
5128 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
5129 pmulhrsw m1, m7
5130 packuswb m4, m1
5131
5132 palignr m6, m2, m3, 12
5133
5134 pmaddubsw m5, [r3 + 11 * 16] ; [27]
5135 pmulhrsw m5, m7
5136 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
5137 pmulhrsw m1, m7
5138 packuswb m5, m1
5139
5140 palignr m1, m2, m3, 14
5141
5142 pmaddubsw m6, [r3 + 13 * 16] ; [29]
5143 pmulhrsw m6, m7
5144 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
5145 pmulhrsw m0, m7
5146 packuswb m6, m0
5147
5148 pmaddubsw m1, [r3 + 15 * 16] ; [31]
5149 pmulhrsw m1, m7
5150 pmaddubsw m2, [r3] ; [16]
5151 pmulhrsw m2, m7
5152 packuswb m1, m2
5153
5154 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5155
; next pass: right-hand 8 columns, reference advanced by 8
5156 lea r0, [r6 + 8]
5157 add r2, 8
5158 dec r4
5159 jnz .loop
5160 RET
5161
; intra_pred_ang16_6 -- 16x16 angular intra prediction for mode 6 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r2 is advanced by 32 bytes before use, so all loads come from the part of
; the reference buffer starting at offset 32 (presumably the second neighbour
; array -- TODO confirm against the caller's buffer layout).
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference and moves r0 / r6 down by 8 rows.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5162 INIT_XMM sse4
5163 cglobal intra_pred_ang16_6, 3,7,8
5164 add r2, 32
5165 lea r3, [ang_table + 16 * 16]
5166 mov r4d, 2
5167 lea r5, [r1 * 3] ; r5 -> 3 * stride
5168 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
5169 mova m7, [pw_1024]
5170
5171 .loop:
; m3/m2 = low/high halves of the byte-interleave of the 16 loaded samples with
; themselves shifted by one; the top byte pair of m2 contains an undefined byte (x).
5172 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5173 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5174 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
5175 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5176
5177 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
5178 pmulhrsw m4, m7
5179 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
5180 pmulhrsw m1, m7
5181 packuswb m4, m1
5182
5183 palignr m6, m2, m3, 2
5184
5185 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
5186 pmulhrsw m5, m7
5187 pmaddubsw m6, [r3 + 4 * 16] ; [20]
5188 pmulhrsw m6, m7
5189 packuswb m5, m6
5190
5191 palignr m1, m2, m3, 4
5192
5193 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
5194 pmulhrsw m6, m7
5195 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
5196 pmulhrsw m0, m7
5197 packuswb m6, m0
5198
5199 palignr m0, m2, m3, 6
5200
5201 pmaddubsw m1, [r3 + 11 * 16] ; [27]
5202 pmulhrsw m1, m7
5203 pmaddubsw m0, [r3 - 8 * 16] ; [8]
5204 pmulhrsw m0, m7
5205 packuswb m1, m0
5206
5207 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5208
5209 palignr m4, m2, m3, 6
5210 palignr m6, m2, m3, 8
5211
5212 pmaddubsw m4, [r3 + 5 * 16] ; [21]
5213 pmulhrsw m4, m7
5214 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
5215 pmulhrsw m1, m7
5216 packuswb m4, m1
5217
5218 pmaddubsw m5, m6, [r3 - 16] ; [15]
5219 pmulhrsw m5, m7
5220 pmaddubsw m6, [r3 + 12 * 16] ; [28]
5221 pmulhrsw m6, m7
5222 packuswb m5, m6
5223
5224 palignr m0, m2, m3, 10
5225
5226 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
5227 pmulhrsw m6, m7
5228 pmaddubsw m0, [r3 + 6 * 16] ; [22]
5229 pmulhrsw m0, m7
5230 packuswb m6, m0
5231
5232 palignr m2, m3, 12
5233
5234 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
5235 pmulhrsw m1, m7
5236 pmaddubsw m2, [r3] ; [16]
5237 pmulhrsw m2, m7
5238 packuswb m1, m2
5239
5240 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5241
; Second pass: r0 = original dst + 8 rows, r6 = original dst + 12 rows.
5242 lea r0, [r6 + r1 * 4]
5243 lea r6, [r6 + r1 * 8]
5244 add r2, 8
5245 dec r4
5246 jnz .loop
5247 RET
5248
; intra_pred_ang16_30 -- 16x16 angular intra prediction for mode 30 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference (add r2, 8) and restarts the destination at r6 + 8.
; The arithmetic is the same instruction sequence as intra_pred_ang16_6; only
; the setup, the second TRANSPOSE_STORE_8x8 argument and the loop tail differ.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5249 INIT_XMM sse4
5250 cglobal intra_pred_ang16_30, 3,7,8
5251 lea r3, [ang_table + 16 * 16]
5252 mov r4d, 2
5253 lea r5, [r1 * 3] ; r5 -> 3 * stride
5254 mov r6, r0
5255 mova m7, [pw_1024]
5256
5257 .loop:
; m3/m2 = low/high halves of the byte-interleave of the 16 loaded samples with
; themselves shifted by one; the top byte pair of m2 contains an undefined byte (x).
5258 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5259 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5260 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
5261 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5262
5263 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
5264 pmulhrsw m4, m7
5265 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
5266 pmulhrsw m1, m7
5267 packuswb m4, m1
5268
5269 palignr m6, m2, m3, 2
5270
5271 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
5272 pmulhrsw m5, m7
5273 pmaddubsw m6, [r3 + 4 * 16] ; [20]
5274 pmulhrsw m6, m7
5275 packuswb m5, m6
5276
5277 palignr m1, m2, m3, 4
5278
5279 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
5280 pmulhrsw m6, m7
5281 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
5282 pmulhrsw m0, m7
5283 packuswb m6, m0
5284
5285 palignr m0, m2, m3, 6
5286
5287 pmaddubsw m1, [r3 + 11 * 16] ; [27]
5288 pmulhrsw m1, m7
5289 pmaddubsw m0, [r3 - 8 * 16] ; [8]
5290 pmulhrsw m0, m7
5291 packuswb m1, m0
5292
5293 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5294
5295 palignr m4, m2, m3, 6
5296 palignr m6, m2, m3, 8
5297
5298 pmaddubsw m4, [r3 + 5 * 16] ; [21]
5299 pmulhrsw m4, m7
5300 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
5301 pmulhrsw m1, m7
5302 packuswb m4, m1
5303
5304 pmaddubsw m5, m6, [r3 - 16] ; [15]
5305 pmulhrsw m5, m7
5306 pmaddubsw m6, [r3 + 12 * 16] ; [28]
5307 pmulhrsw m6, m7
5308 packuswb m5, m6
5309
5310 palignr m0, m2, m3, 10
5311
5312 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
5313 pmulhrsw m6, m7
5314 pmaddubsw m0, [r3 + 6 * 16] ; [22]
5315 pmulhrsw m0, m7
5316 packuswb m6, m0
5317
5318 palignr m2, m3, 12
5319
5320 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
5321 pmulhrsw m1, m7
5322 pmaddubsw m2, [r3] ; [16]
5323 pmulhrsw m2, m7
5324 packuswb m1, m2
5325
5326 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5327
; Second pass: destination restarts 8 bytes right of the original r0 (saved in r6).
5328 lea r0, [r6 + 8]
5329 add r2, 8
5330 dec r4
5331 jnz .loop
5332 RET
5333
; intra_pred_ang16_7 -- 16x16 angular intra prediction for mode 7 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r2 is advanced by 32 bytes before use, so all loads come from the part of
; the reference buffer starting at offset 32 (presumably the second neighbour
; array -- TODO confirm against the caller's buffer layout).
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference and moves r0 / r6 down by 8 rows.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5334 INIT_XMM sse4
5335 cglobal intra_pred_ang16_7, 3,7,8
5336 add r2, 32
5337 lea r3, [ang_table + 16 * 16]
5338 mov r4d, 2
5339 lea r5, [r1 * 3] ; r5 -> 3 * stride
5340 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
5341 mova m7, [pw_1024]
5342
5343 .loop:
; m3/m2 = low/high halves of the byte-interleave of the 16 loaded samples with
; themselves shifted by one; the top byte pair of m2 contains an undefined byte (x).
5344 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5345 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5346 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
5347 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5348
5349 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
5350 pmulhrsw m4, m7
5351 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
5352 pmulhrsw m0, m7
5353 packuswb m4, m0
5354
5355 palignr m1, m2, m3, 2
5356
5357 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
5358 pmulhrsw m5, m7
5359 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
5360 pmulhrsw m6, m7
5361 packuswb m5, m6
5362
5363 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
5364 pmulhrsw m6, m7
5365 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
5366 pmulhrsw m0, m7
5367 packuswb m6, m0
5368
5369 palignr m0, m2, m3, 4
5370
5371 pmaddubsw m1, [r3 + 15 * 16] ; [31]
5372 pmulhrsw m1, m7
5373 pmaddubsw m0, [r3 - 8 * 16] ; [8]
5374 pmulhrsw m0, m7
5375 packuswb m1, m0
5376
5377 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5378
5379 palignr m1, m2, m3, 4
5380
5381 pmaddubsw m4, m1, [r3 + 16] ; [17]
5382 pmulhrsw m4, m7
5383 pmaddubsw m1, [r3 + 10 * 16] ; [26]
5384 pmulhrsw m1, m7
5385 packuswb m4, m1
5386
5387 palignr m0, m2, m3, 6
5388
5389 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
5390 pmulhrsw m5, m7
5391 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
5392 pmulhrsw m6, m7
5393 packuswb m5, m6
5394
5395 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
5396 pmulhrsw m6, m7
5397 pmaddubsw m0, [r3 + 14 * 16] ; [30]
5398 pmulhrsw m0, m7
5399 packuswb m6, m0
5400
5401 palignr m2, m3, 8
5402
5403 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
5404 pmulhrsw m1, m7
5405 pmaddubsw m2, [r3] ; [16]
5406 pmulhrsw m2, m7
5407 packuswb m1, m2
5408
5409 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5410
; Second pass: r0 = original dst + 8 rows, r6 = original dst + 12 rows.
5411 lea r0, [r6 + r1 * 4]
5412 lea r6, [r6 + r1 * 8]
5413 add r2, 8
5414 dec r4
5415 jnz .loop
5416 RET
5417
; intra_pred_ang16_29 -- 16x16 angular intra prediction for mode 29 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference (add r2, 8) and restarts the destination at r6 + 8.
; The arithmetic is the same instruction sequence as intra_pred_ang16_7; only
; the setup, the second TRANSPOSE_STORE_8x8 argument and the loop tail differ.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5418 INIT_XMM sse4
5419 cglobal intra_pred_ang16_29, 3,7,8
5420 lea r3, [ang_table + 16 * 16]
5421 mov r4d, 2
5422 lea r5, [r1 * 3] ; r5 -> 3 * stride
5423 mov r6, r0
5424 mova m7, [pw_1024]
5425
5426 .loop:
; m3/m2 = low/high halves of the byte-interleave of the 16 loaded samples with
; themselves shifted by one; the top byte pair of m2 contains an undefined byte (x).
5427 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5428 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5429 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
5430 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5431
5432 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
5433 pmulhrsw m4, m7
5434 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
5435 pmulhrsw m0, m7
5436 packuswb m4, m0
5437
5438 palignr m1, m2, m3, 2
5439
5440 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
5441 pmulhrsw m5, m7
5442 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
5443 pmulhrsw m6, m7
5444 packuswb m5, m6
5445
5446 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
5447 pmulhrsw m6, m7
5448 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
5449 pmulhrsw m0, m7
5450 packuswb m6, m0
5451
5452 palignr m0, m2, m3, 4
5453
5454 pmaddubsw m1, [r3 + 15 * 16] ; [31]
5455 pmulhrsw m1, m7
5456 pmaddubsw m0, [r3 - 8 * 16] ; [8]
5457 pmulhrsw m0, m7
5458 packuswb m1, m0
5459
5460 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5461
5462 palignr m1, m2, m3, 4
5463
5464 pmaddubsw m4, m1, [r3 + 16] ; [17]
5465 pmulhrsw m4, m7
5466 pmaddubsw m1, [r3 + 10 * 16] ; [26]
5467 pmulhrsw m1, m7
5468 packuswb m4, m1
5469
5470 palignr m0, m2, m3, 6
5471
5472 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
5473 pmulhrsw m5, m7
5474 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
5475 pmulhrsw m6, m7
5476 packuswb m5, m6
5477
5478 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
5479 pmulhrsw m6, m7
5480 pmaddubsw m0, [r3 + 14 * 16] ; [30]
5481 pmulhrsw m0, m7
5482 packuswb m6, m0
5483
5484 palignr m2, m3, 8
5485
5486 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
5487 pmulhrsw m1, m7
5488 pmaddubsw m2, [r3] ; [16]
5489 pmulhrsw m2, m7
5490 packuswb m1, m2
5491
5492 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5493
; Second pass: destination restarts 8 bytes right of the original r0 (saved in r6).
5494 lea r0, [r6 + 8]
5495 add r2, 8
5496 dec r4
5497 jnz .loop
5498 RET
5499
; intra_pred_ang16_8 -- 16x16 angular intra prediction for mode 8 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r2 is advanced by 32 bytes before use, so all loads come from the part of
; the reference buffer starting at offset 32 (presumably the second neighbour
; array -- TODO confirm against the caller's buffer layout).
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference and moves r0 / r6 down by 8 rows.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5500 INIT_XMM sse4
5501 cglobal intra_pred_ang16_8, 3,7,8
5502 add r2, 32
5503 lea r3, [ang_table + 16 * 16]
5504 mov r4d, 2
5505 lea r5, [r1 * 3] ; r5 -> 3 * stride
5506 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
5507 mova m7, [pw_1024]
5508
5509 .loop:
; m1/m0 = low/high halves of the byte-interleave of the 16 loaded samples with
; themselves shifted by one; the top byte pair of m0 contains an undefined byte (x).
5510 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5511 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5512 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
5513 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5514
5515 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
5516 pmulhrsw m4, m7
5517 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
5518 pmulhrsw m2, m7
5519 packuswb m4, m2
5520
5521 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
5522 pmulhrsw m5, m7
5523 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
5524 pmulhrsw m6, m7
5525 packuswb m5, m6
5526
5527 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
5528 pmulhrsw m6, m7
5529 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
5530 pmulhrsw m2, m7
5531 packuswb m6, m2
5532
; Both shifted pair windows are built here because m0 and m1 are overwritten below.
5533 palignr m2, m0, m1, 2
5534 palignr m3, m0, m1, 4
5535
5536 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
5537 pmulhrsw m1, m7
5538 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
5539 pmulhrsw m0, m7
5540 packuswb m1, m0
5541
5542 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5543
5544 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
5545 pmulhrsw m4, m7
5546 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
5547 pmulhrsw m5, m7
5548 packuswb m4, m5
5549
5550 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
5551 pmulhrsw m5, m7
5552 pmaddubsw m2, [r3 + 12 * 16] ; [28]
5553 pmulhrsw m2, m7
5554 packuswb m5, m2
5555
5556 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
5557 pmulhrsw m6, m7
5558 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
5559 pmulhrsw m1, m7
5560 packuswb m6, m1
5561
5562 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
5563 pmulhrsw m1, m7
5564 pmaddubsw m3, [r3] ; [16]
5565 pmulhrsw m3, m7
5566 packuswb m1, m3
5567
5568 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5569
; Second pass: r0 = original dst + 8 rows, r6 = original dst + 12 rows.
5570 lea r0, [r6 + r1 * 4]
5571 lea r6, [r6 + r1 * 8]
5572 add r2, 8
5573 dec r4
5574 jnz .loop
5575 RET
5576
; intra_pred_ang16_28 -- 16x16 angular intra prediction for mode 28 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference (add r2, 8) and restarts the destination at r6 + 8.
; The arithmetic is the same instruction sequence as intra_pred_ang16_8; only
; the setup, the second TRANSPOSE_STORE_8x8 argument and the loop tail differ.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5577 INIT_XMM sse4
5578 cglobal intra_pred_ang16_28, 3,7,8
5579 lea r3, [ang_table + 16 * 16]
5580 mov r4d, 2
5581 lea r5, [r1 * 3] ; r5 -> 3 * stride
5582 mov r6, r0
5583 mova m7, [pw_1024]
5584
5585 .loop:
; m1/m0 = low/high halves of the byte-interleave of the 16 loaded samples with
; themselves shifted by one; the top byte pair of m0 contains an undefined byte (x).
5586 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5587 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5588 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
5589 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5590
5591 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
5592 pmulhrsw m4, m7
5593 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
5594 pmulhrsw m2, m7
5595 packuswb m4, m2
5596
5597 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
5598 pmulhrsw m5, m7
5599 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
5600 pmulhrsw m6, m7
5601 packuswb m5, m6
5602
5603 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
5604 pmulhrsw m6, m7
5605 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
5606 pmulhrsw m2, m7
5607 packuswb m6, m2
5608
; Both shifted pair windows are built here because m0 and m1 are overwritten below.
5609 palignr m2, m0, m1, 2
5610 palignr m3, m0, m1, 4
5611
5612 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
5613 pmulhrsw m1, m7
5614 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
5615 pmulhrsw m0, m7
5616 packuswb m1, m0
5617
5618 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5619
5620 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
5621 pmulhrsw m4, m7
5622 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
5623 pmulhrsw m5, m7
5624 packuswb m4, m5
5625
5626 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
5627 pmulhrsw m5, m7
5628 pmaddubsw m2, [r3 + 12 * 16] ; [28]
5629 pmulhrsw m2, m7
5630 packuswb m5, m2
5631
5632 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
5633 pmulhrsw m6, m7
5634 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
5635 pmulhrsw m1, m7
5636 packuswb m6, m1
5637
5638 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
5639 pmulhrsw m1, m7
5640 pmaddubsw m3, [r3] ; [16]
5641 pmulhrsw m3, m7
5642 packuswb m1, m3
5643
5644 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
5645
; Second pass: destination restarts 8 bytes right of the original r0 (saved in r6).
5646 lea r0, [r6 + 8]
5647 add r2, 8
5648 dec r4
5649 jnz .loop
5650 RET
5651
; intra_pred_ang16_9 -- 16x16 angular intra prediction for mode 9 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r2 is advanced by 32 bytes before use, so all loads come from the part of
; the reference buffer starting at offset 32 (presumably the second neighbour
; array -- TODO confirm against the caller's buffer layout).
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; Every weighted line uses the same sample-pair register (m2) with even table
; entries 2..30; the 16th line is the unweighted shifted samples kept in m3.
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference and moves r0 / r6 down by 8 rows.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5652 INIT_XMM sse4
5653 cglobal intra_pred_ang16_9, 3,7,8
5654 add r2, 32
5655 lea r3, [ang_table + 16 * 16]
5656 mov r4d, 2
5657 lea r5, [r1 * 3] ; r5 -> 3 * stride
5658 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
5659 mova m7, [pw_1024]
5660
5661 .loop:
5662 movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5663 palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5664 punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5665
5666 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
5667 pmulhrsw m4, m7
5668 pmaddubsw m0, m2, [r3 - 12 * 16] ; [4]
5669 pmulhrsw m0, m7
5670 packuswb m4, m0
5671
5672 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
5673 pmulhrsw m5, m7
5674 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
5675 pmulhrsw m6, m7
5676 packuswb m5, m6
5677
5678 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
5679 pmulhrsw m6, m7
5680 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
5681 pmulhrsw m0, m7
5682 packuswb m6, m0
5683
5684 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
5685 pmulhrsw m1, m7
5686 pmaddubsw m0, m2, [r3] ; [16]
5687 pmulhrsw m0, m7
5688 packuswb m1, m0
5689
5690 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5691
5692 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
5693 pmulhrsw m4, m7
5694 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
5695 pmulhrsw m5, m7
5696 packuswb m4, m5
5697
5698 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
5699 pmulhrsw m5, m7
5700 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
5701 pmulhrsw m6, m7
5702 packuswb m5, m6
5703
5704 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
5705 pmulhrsw m6, m7
5706 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
5707 pmulhrsw m1, m7
5708 packuswb m6, m1
5709
5710 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
5711 pmulhrsw m1, m7
5712 packuswb m1, m1
5713
; High half of m1 = low 8 bytes of m3 (samples 2..9, copied without weighting).
5714 punpcklqdq m1, m3 ; [00]
5715
5716 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
5717
; Second pass: r0 = original dst + 8 rows, r6 = original dst + 12 rows.
5718 lea r0, [r6 + r1 * 4]
5719 lea r6, [r6 + r1 * 8]
5720 add r2, 8
5721 dec r4
5722 jnz .loop
5723 RET
5724
; intra_pred_ang16_27 -- 16x16 angular intra prediction for mode 27 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; Every weighted row uses the same sample-pair register (m3) with even table
; entries 2..30; the final row is the unweighted shifted samples kept in m2.
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference (add r2, 8) and restarts the destination at r6 + 8.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; the manual
; stores below continue from whatever r0 value that macro leaves behind.
5725 INIT_XMM sse4
5726 cglobal intra_pred_ang16_27, 3,7,8
5727 lea r3, [ang_table + 16 * 16]
5728 mov r4d, 2
5729 lea r5, [r1 * 3] ; r5 -> 3 * stride
5730 mov r6, r0
5731 mova m7, [pw_1024]
5732
5733 .loop:
5734 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5735 palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
5736 punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
5737
5738 pmaddubsw m4, m3, [r3 - 14 * 16] ; [2]
5739 pmulhrsw m4, m7
5740 pmaddubsw m0, m3, [r3 - 12 * 16] ; [4]
5741 pmulhrsw m0, m7
5742 packuswb m4, m0
5743
5744 pmaddubsw m5, m3, [r3 - 10 * 16] ; [6]
5745 pmulhrsw m5, m7
5746 pmaddubsw m6, m3, [r3 - 8 * 16] ; [8]
5747 pmulhrsw m6, m7
5748 packuswb m5, m6
5749
5750 pmaddubsw m6, m3, [r3 - 6 * 16] ; [10]
5751 pmulhrsw m6, m7
5752 pmaddubsw m0, m3, [r3 - 4 * 16] ; [12]
5753 pmulhrsw m0, m7
5754 packuswb m6, m0
5755
5756 pmaddubsw m1, m3, [r3 - 2 * 16] ; [14]
5757 pmulhrsw m1, m7
5758 pmaddubsw m0, m3, [r3] ; [16]
5759 pmulhrsw m0, m7
5760 packuswb m1, m0
5761
5762 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5763
5764 pmaddubsw m4, m3, [r3 + 2 * 16] ; [18]
5765 pmulhrsw m4, m7
5766 pmaddubsw m5, m3, [r3 + 4 * 16] ; [20]
5767 pmulhrsw m5, m7
5768 packuswb m4, m5
5769
5770 pmaddubsw m5, m3, [r3 + 6 * 16] ; [22]
5771 pmulhrsw m5, m7
5772 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
5773 pmulhrsw m6, m7
5774 packuswb m5, m6
5775
5776 pmaddubsw m6, m3, [r3 + 10 * 16] ; [26]
5777 pmulhrsw m6, m7
5778 pmaddubsw m1, m3, [r3 + 12 * 16] ; [28]
5779 pmulhrsw m1, m7
5780 packuswb m6, m1
5781
5782 pmaddubsw m1, m3, [r3 + 14 * 16] ; [30]
5783 pmulhrsw m1, m7
5784 packuswb m1, m1
5785
; Store eight 8-byte rows directly: low/high halves of m4, m5, m6, then the
; low half of m1, then the unweighted samples in m2 (low 8 bytes = samples 2..9).
5786 movh [r0 ], m4
5787 movhps [r0 + r1 ], m4
5788 movh [r0 + r1 * 2], m5
5789 movhps [r0 + r5 ], m5
5790 lea r0, [r0 + r1 * 4]
5791 movh [r0 ], m6
5792 movhps [r0 + r1 ], m6
5793 movh [r0 + r1 * 2], m1
5794 movh [r0 + r5 ], m2
5795
; Second pass: destination restarts 8 bytes right of the original r0 (saved in r6).
5796 lea r0, [r6 + 8]
5797 add r2, 8
5798 dec r4
5799 jnz .loop
5800 RET
5801
; intra_pred_ang16_10 -- 16x16 intra prediction for mode 10 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples,
; r4 = filter flag (non-zero enables the first-row filter). r3 is overwritten
; and used as a moving row pointer; r5 = 3 * stride.
; Row k of the block is filled with byte k of the 16 samples at [r2 + 1 + 32]:
; palignr by k moves byte k to position 0 and pshufb with the all-zero mask in
; m7 broadcasts it to all 16 lanes.
; Row 0 is stored last. Without filtering it is byte 0 broadcast; with
; filtering each lane becomes sat_u8(byte0 + (([r2 + 1 + x] - [r2]) >> 1)).
; The flag is tested as a full 32-bit value (r4d) so that any non-zero int
; enables the filter, not only values with non-zero low 16 bits.
5802 INIT_XMM sse4
5803 cglobal intra_pred_ang16_10, 5,6,8
5804 lea r5, [r1 * 3]
5805 pxor m7, m7
5806
5807 movu m0, [r2 + 1 + 32]
5808 palignr m1, m0, 1
5809 pshufb m1, m7
5810 palignr m2, m0, 2
5811 pshufb m2, m7
5812 palignr m3, m0, 3
5813 pshufb m3, m7
5814 palignr m4, m0, 4
5815 pshufb m4, m7
5816 palignr m5, m0, 5
5817 pshufb m5, m7
5818 palignr m6, m0, 6
5819 pshufb m6, m7
5820
; rows 1..6
5821 movu [r0 + r1], m1
5822 movu [r0 + r1 * 2], m2
5823 movu [r0 + r5], m3
5824 lea r3, [r0 + r1 * 4]
5825 movu [r3], m4
5826 movu [r3 + r1], m5
5827 movu [r3 + r1 * 2], m6
5828
5829 palignr m1, m0, 7
5830 pshufb m1, m7
; movhlps puts byte 8 of m0 at position 0 (same effect as a shift by 8 here,
; since only byte 0 survives the broadcast).
5831 movhlps m2, m0
5832 pshufb m2, m7
5833 palignr m3, m0, 9
5834 pshufb m3, m7
5835 palignr m4, m0, 10
5836 pshufb m4, m7
5837 palignr m5, m0, 11
5838 pshufb m5, m7
5839 palignr m6, m0, 12
5840 pshufb m6, m7
5841
; rows 7..12
5842 movu [r3 + r5], m1
5843 lea r3, [r3 + r1 * 4]
5844 movu [r3], m2
5845 movu [r3 + r1], m3
5846 movu [r3 + r1 * 2], m4
5847 movu [r3 + r5], m5
5848 lea r3, [r3 + r1 * 4]
5849 movu [r3], m6
5850
5851 palignr m1, m0, 13
5852 pshufb m1, m7
5853 palignr m2, m0, 14
5854 pshufb m2, m7
5855 palignr m3, m0, 15
5856 pshufb m3, m7
; m0 is no longer needed as the shift source: turn it into row 0 (byte 0 broadcast).
5857 pshufb m0, m7
5858
; rows 13..15
5859 movu [r3 + r1], m1
5860 movu [r3 + r1 * 2], m2
5861 movu [r3 + r5], m3
5862
5863 ; filter
; Test all 32 bits of the int flag (the original compared only r4w).
5864 cmp r4d, byte 0
5865 jz .quit
5866 pmovzxbw m0, m0
5867 mova m1, m0
5868 movu m2, [r2]
5869 movu m3, [r2 + 1]
5870
; m2 = byte [r2] broadcast and widened; m3/m4 = low/high 8 bytes of [r2 + 1] widened.
5871 pshufb m2, m7
5872 pmovzxbw m2, m2
5873 movhlps m4, m3
5874 pmovzxbw m3, m3
5875 pmovzxbw m4, m4
5876 psubw m3, m2
5877 psubw m4, m2
5878 psraw m3, 1
5879 psraw m4, 1
5880 paddw m0, m3
5881 paddw m1, m4
; packuswb saturates the 16-bit sums back to unsigned bytes.
5882 packuswb m0, m1
5883 .quit:
5884 movu [r0], m0
5885 RET
5886
; intra_pred_ang16_26 -- 16x16 intra prediction for mode 26 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples;
; the fifth argument is the filter flag (non-zero enables the first-column filter).
; All 16 rows are filled with the 16 samples at [r2 + 1].
; With filtering, byte 0 of row y is then replaced by
; sat_u8([r2 + 1] + (([r2 + 33 + y] - [r2]) >> 1)).
; r4 is reused as 3 * stride, so the flag is kept elsewhere: in r7 on x86-64,
; in a 4-byte stack slot on x86-32. Both paths now test it as a 32-bit value
; (the x86-64 path previously looked only at the low 16 bits via r7w).
5887 INIT_XMM sse4
5888 %if ARCH_X86_64 == 1
5889 cglobal intra_pred_ang16_26, 3,8,5
5890 mov r7, r4mp
5891 %define bfilter r7d
5892 %else
5893 cglobal intra_pred_ang16_26, 5,7,5,0-4
5894 %define bfilter dword[rsp]
5895 mov bfilter, r4
5896 %endif
5897 movu m0, [r2 + 1]
5898
; r4 = 3 * stride; r3, r5, r6 = destination rows 4, 8 and 12.
5899 lea r4, [r1 * 3]
5900 lea r3, [r0 + r1 * 4]
5901 lea r5, [r3 + r1 * 4]
5902 lea r6, [r5 + r1 * 4]
5903
5904 movu [r0], m0
5905 movu [r0 + r1], m0
5906 movu [r0 + r1 * 2], m0
5907 movu [r0 + r4], m0
5908 movu [r3], m0
5909 movu [r3 + r1], m0
5910 movu [r3 + r1 * 2], m0
5911 movu [r3 + r4], m0
5912 movu [r5], m0
5913 movu [r5 + r1], m0
5914 movu [r5 + r1 * 2], m0
5915 movu [r5 + r4], m0
5916
5917 movu [r6], m0
5918 movu [r6 + r1], m0
5919 movu [r6 + r1 * 2], m0
5920 movu [r6 + r4], m0
5921
5922 ; filter
5923 cmp bfilter, byte 0
5924 jz .quit
5925
; m0/m1 = byte 0 of [r2 + 1] broadcast and widened to words.
5926 pxor m4, m4
5927 pshufb m0, m4
5928 pmovzxbw m0, m0
5929 mova m1, m0
; Only byte 0 of m2 (taken from [r2] by pinsrb) survives the broadcast below.
5930 movu m2, [r2 + 32]
5931 pinsrb m2, [r2], 0
5932 movu m3, [r2 + 1 + 32]
5933
5934 pshufb m2, m4
5935 pmovzxbw m2, m2
5936 movhlps m4, m3
5937 pmovzxbw m3, m3
5938 pmovzxbw m4, m4
5939 psubw m3, m2
5940 psubw m4, m2
5941 psraw m3, 1
5942 psraw m4, 1
5943 paddw m0, m3
5944 paddw m1, m4
; packuswb saturates the 16-bit sums back to unsigned bytes.
5945 packuswb m0, m1
5946
; Byte y of m0 overwrites the first pixel of row y.
5947 pextrb [r0], m0, 0
5948 pextrb [r0 + r1], m0, 1
5949 pextrb [r0 + r1 * 2], m0, 2
5950 pextrb [r0 + r4], m0, 3
5951 pextrb [r3], m0, 4
5952 pextrb [r3 + r1], m0, 5
5953 pextrb [r3 + r1 * 2], m0, 6
5954 pextrb [r3 + r4], m0, 7
5955 pextrb [r5], m0, 8
5956 pextrb [r5 + r1], m0, 9
5957 pextrb [r5 + r1 * 2], m0, 10
5958 pextrb [r5 + r4], m0, 11
5959 pextrb [r6], m0, 12
5960 pextrb [r6 + r1], m0, 13
5961 pextrb [r6 + r1 * 2], m0, 14
5962 pextrb [r6 + r4], m0, 15
5963 .quit:
5964 RET
5965
; intra_pred_ang16_11 -- 16x16 angular intra prediction for mode 11 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; The function is fully unrolled into two halves. The first half reads the
; samples at [r2 + 32] with byte 0 replaced by the byte at [r2]; the second
; half reads [r2 + 40] and works with r0 / r6 moved down by 8 rows.
; In each half the weighted lines use even table entries 30 down to 2 and the
; 16th line is the unweighted samples kept in m2.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
5966 INIT_XMM sse4
5967 cglobal intra_pred_ang16_11, 3,7,8
5968 lea r3, [ang_table + 16 * 16]
5969 lea r5, [r1 * 3] ; r5 -> 3 * stride
5970 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
5971 mova m7, [pw_1024]
5972
5973 movu m3, [r2 + 32] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5974 pinsrb m3, [r2], 0
5975 mova m2, m3
5976 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5977 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5978
5979 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
5980 pmulhrsw m4, m7
5981 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
5982 pmulhrsw m0, m7
5983 packuswb m4, m0
5984
5985 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
5986 pmulhrsw m5, m7
5987 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
5988 pmulhrsw m6, m7
5989 packuswb m5, m6
5990
5991 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
5992 pmulhrsw m6, m7
5993 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
5994 pmulhrsw m0, m7
5995 packuswb m6, m0
5996
5997 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
5998 pmulhrsw m1, m7
5999 pmaddubsw m0, m3, [r3] ; [16]
6000 pmulhrsw m0, m7
6001 packuswb m1, m0
6002
6003 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6004
6005 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
6006 pmulhrsw m4, m7
6007 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
6008 pmulhrsw m5, m7
6009 packuswb m4, m5
6010
6011 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
6012 pmulhrsw m5, m7
6013 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
6014 pmulhrsw m6, m7
6015 packuswb m5, m6
6016
6017 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
6018 pmulhrsw m6, m7
6019 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
6020 pmulhrsw m1, m7
6021 packuswb m6, m1
6022
6023 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
6024 pmulhrsw m1, m7
6025 packuswb m1, m1
; High half of m1 = low 8 bytes of m2 (the unweighted samples).
6026 punpcklqdq m1, m2 ;[00]
6027
6028 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6029
; Second half: r0 = original dst + 8 rows, r6 = original dst + 12 rows.
6030 lea r0, [r6 + r1 * 4]
6031 lea r6, [r6 + r1 * 8]
6032
6033 movu m3, [r2 + 40] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6034 mova m2, m3
6035 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6036 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6037
6038 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
6039 pmulhrsw m4, m7
6040 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
6041 pmulhrsw m0, m7
6042 packuswb m4, m0
6043
6044 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
6045 pmulhrsw m5, m7
6046 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
6047 pmulhrsw m6, m7
6048 packuswb m5, m6
6049
6050 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
6051 pmulhrsw m6, m7
6052 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
6053 pmulhrsw m0, m7
6054 packuswb m6, m0
6055
6056 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
6057 pmulhrsw m1, m7
6058 pmaddubsw m0, m3, [r3] ; [16]
6059 pmulhrsw m0, m7
6060 packuswb m1, m0
6061
6062 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6063
6064 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
6065 pmulhrsw m4, m7
6066 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
6067 pmulhrsw m5, m7
6068 packuswb m4, m5
6069
6070 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
6071 pmulhrsw m5, m7
6072 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
6073 pmulhrsw m6, m7
6074 packuswb m5, m6
6075
6076 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
6077 pmulhrsw m6, m7
6078 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
6079 pmulhrsw m1, m7
6080 packuswb m6, m1
6081
6082 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
6083 pmulhrsw m1, m7
6084 packuswb m1, m1
6085 punpcklqdq m1, m2 ;[00]
6086
6087 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6088 RET
6089
; intra_pred_ang16_25 -- 16x16 angular intra prediction for mode 25 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r3 = ang_table + 16 * 16, so [r3 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; Every weighted row uses the same sample-pair register (m3) with even table
; entries 30 down to 2; the final row is the unweighted samples kept in m2.
; The loop runs twice (r4d = 2); each pass reads 8 bytes further into the
; reference (add r2, 8) and restarts the destination at r6 + 8.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; the manual
; stores below continue from whatever r0 value that macro leaves behind.
6090 INIT_XMM sse4
6091 cglobal intra_pred_ang16_25, 3,7,8
6092 lea r3, [ang_table + 16 * 16]
6093 mov r4d, 2
6094 lea r5, [r1 * 3] ; r5 -> 3 * stride
6095 mov r6, r0
6096 mova m7, [pw_1024]
6097
6098 .loop:
6099 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6100 mova m2, m3
6101 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6102 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6103
6104 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
6105 pmulhrsw m4, m7
6106 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
6107 pmulhrsw m0, m7
6108 packuswb m4, m0
6109
6110 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
6111 pmulhrsw m5, m7
6112 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
6113 pmulhrsw m6, m7
6114 packuswb m5, m6
6115
6116 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
6117 pmulhrsw m6, m7
6118 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
6119 pmulhrsw m0, m7
6120 packuswb m6, m0
6121
6122 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
6123 pmulhrsw m1, m7
6124 pmaddubsw m0, m3, [r3] ; [16]
6125 pmulhrsw m0, m7
6126 packuswb m1, m0
6127
6128 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
6129
6130 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
6131 pmulhrsw m4, m7
6132 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
6133 pmulhrsw m5, m7
6134 packuswb m4, m5
6135
6136 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
6137 pmulhrsw m5, m7
6138 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
6139 pmulhrsw m6, m7
6140 packuswb m5, m6
6141
6142 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
6143 pmulhrsw m6, m7
6144 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
6145 pmulhrsw m1, m7
6146 packuswb m6, m1
6147
6148 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
6149 pmulhrsw m1, m7
6150 packuswb m1, m1
6151
; Store eight 8-byte rows directly: low/high halves of m4, m5, m6, then the
; low half of m1, then the unweighted samples in m2 (low 8 bytes = samples 0..7).
6152 movh [r0 ], m4
6153 movhps [r0 + r1 ], m4
6154 movh [r0 + r1 * 2], m5
6155 movhps [r0 + r5 ], m5
6156 lea r0, [r0 + r1 * 4]
6157 movh [r0 ], m6
6158 movhps [r0 + r1 ], m6
6159 movh [r0 + r1 * 2], m1
6160 movh [r0 + r5 ], m2
6161
; Second pass: destination restarts 8 bytes right of the original r0 (saved in r6).
6162 lea r0, [r6 + 8]
6163 add r2, 8
6164 dec r4
6165 jnz .loop
6166 RET
6167
; intra_pred_ang16_12 -- 16x16 angular intra prediction for mode 12 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r4 = ang_table + 16 * 16, so [r4 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; m2 holds the bytes of [r2] reordered by the c_mode16_12 shuffle table (table
; contents are outside this chunk; presumably the projected reference samples
; this mode needs -- TODO confirm). palignr m3, m2, 15 / 14 shifts bytes from
; the top of m2 into the bottom of the sample-pair register m3.
; The function is fully unrolled into two halves; the second half works with
; r0 / r6 moved down by 8 rows and reads the samples at [r2 + 1 + 32].
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
6168 INIT_XMM sse4
6169 cglobal intra_pred_ang16_12, 4,7,8
6170 lea r4, [ang_table + 16 * 16]
6171 lea r5, [r1 * 3] ; r5 -> 3 * stride
6172 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
6173 mova m7, [pw_1024]
6174
6175 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6176 pinsrb m3, [r2], 0
6177 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
6178 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
6179 movu m2, [r2]
6180 pshufb m2, [c_mode16_12]
6181
6182 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6183
6184 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
6185 pmulhrsw m4, m7
6186 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
6187 pmulhrsw m1, m7
6188 packuswb m4, m1
6189
6190 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
6191 pmulhrsw m5, m7
6192 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
6193 pmulhrsw m6, m7
6194 packuswb m5, m6
6195
6196 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
6197 pmulhrsw m6, m7
6198 pmaddubsw m0, [r4 - 14 * 16] ; [2]
6199 pmulhrsw m0, m7
6200 packuswb m6, m0
6201
; m3 = (m3 : m2) >> 15 bytes: the top byte of m2 enters at the bottom of m3.
6202 palignr m3, m2, 15
6203
6204 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
6205 pmulhrsw m1, m7
6206 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6207 pmulhrsw m0, m7
6208 packuswb m1, m0
6209
6210 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6211
6212 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
6213 pmulhrsw m4, m7
6214 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
6215 pmulhrsw m5, m7
6216 packuswb m4, m5
6217
6218 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
6219 pmulhrsw m5, m7
6220 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
6221 pmulhrsw m6, m7
6222 packuswb m5, m6
6223
6224 palignr m3, m2, 14
6225
6226 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
6227 pmulhrsw m6, m7
6228 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
6229 pmulhrsw m1, m7
6230 packuswb m6, m1
6231
6232 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
6233 pmulhrsw m1, m7
6234 pmaddubsw m3, [r4] ; [16]
6235 pmulhrsw m3, m7
6236 packuswb m1, m3
6237
6238 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6239
; Second half: r0 = original dst + 8 rows, r6 = original dst + 12 rows.
6240 lea r0, [r6 + r1 * 4]
6241 lea r6, [r6 + r1 * 8]
6242
6243 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6244 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
6245 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
6246 movlhps m2, m1 ; high half = [8 7 6 5 4 3 2 1]; the low half of m2 is kept
6247
6248 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
6249 pmulhrsw m4, m7
6250 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
6251 pmulhrsw m5, m7
6252 packuswb m4, m5
6253
6254 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
6255 pmulhrsw m5, m7
6256 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
6257 pmulhrsw m6, m7
6258 packuswb m5, m6
6259
6260 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
6261 pmulhrsw m6, m7
6262 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
6263 pmulhrsw m0, m7
6264 packuswb m6, m0
6265
6266 palignr m3, m2, 14
6267
6268 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
6269 pmulhrsw m1, m7
6270 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6271 pmulhrsw m0, m7
6272 packuswb m1, m0
6273
6274 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6275
6276 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
6277 pmulhrsw m4, m7
6278 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
6279 pmulhrsw m5, m7
6280 packuswb m5, m5
6281
6282 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
6283 pmulhrsw m5, m7
6284 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
6285 pmulhrsw m6, m7
6286 packuswb m5, m6
6287
; Shift m2 up one byte so that the next palignr by 14 picks up the following two bytes.
6288 pslldq m2, 1
6289 palignr m3, m2, 14
6290
6291 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
6292 pmulhrsw m6, m7
6293 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
6294 pmulhrsw m1, m7
6295 packuswb m6, m1
6296
6297 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
6298 pmulhrsw m1, m7
6299 pmaddubsw m3, [r4] ; [16]
6300 pmulhrsw m3, m7
6301 packuswb m1, m3
6302
6303 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6304 RET
6305
; intra_pred_ang16_24 -- 16x16 angular intra prediction for mode 24 (SSE4).
; Registers: r0 = destination, r1 = destination stride, r2 = reference samples.
; r4 = ang_table + 16 * 16, so [r4 + k * 16] selects table entry 16 + k; the
; bracketed number at the end of each pmaddubsw line is that entry index.
; m7 = [pw_1024]; pmulhrsw by 1024 is the rounded shift (x + 16) >> 5
; (assumes pw_1024 holds words of 1024, as its name suggests -- defined elsewhere).
; m2 holds the bytes of [r2 + 32] reordered by the c_mode16_12 shuffle table
; (the same table intra_pred_ang16_12 uses; contents are outside this chunk,
; presumably the projected reference samples -- TODO confirm). palignr m3, m2,
; 15 / 14 shifts bytes from the top of m2 into the bottom of m3.
; The function is fully unrolled into two halves; the second half restarts the
; destination at r6 + 8 and reads the samples at [r2 + 1].
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its register
; use and the meaning of its first two arguments are not visible here.
6306 INIT_XMM sse4
6307 cglobal intra_pred_ang16_24, 4,7,8
6308 lea r4, [ang_table + 16 * 16]
6309 lea r5, [r1 * 3] ; r5 -> 3 * stride
6310 mov r6, r0
6311 mova m7, [pw_1024]
6312
6313 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6314 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
6315 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
6316 movu m2, [r2 + 32]
6317 pshufb m2, [c_mode16_12]
6318
6319 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6320
6321 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
6322 pmulhrsw m4, m7
6323 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
6324 pmulhrsw m1, m7
6325 packuswb m4, m1
6326
6327 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
6328 pmulhrsw m5, m7
6329 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
6330 pmulhrsw m6, m7
6331 packuswb m5, m6
6332
6333 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
6334 pmulhrsw m6, m7
6335 pmaddubsw m0, [r4 - 14 * 16] ; [2]
6336 pmulhrsw m0, m7
6337 packuswb m6, m0
6338
; m3 = (m3 : m2) >> 15 bytes: the top byte of m2 enters at the bottom of m3.
6339 palignr m3, m2, 15
6340
6341 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
6342 pmulhrsw m1, m7
6343 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6344 pmulhrsw m0, m7
6345 packuswb m1, m0
6346
6347 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
6348
6349 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
6350 pmulhrsw m4, m7
6351 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
6352 pmulhrsw m5, m7
6353 packuswb m4, m5
6354
6355 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
6356 pmulhrsw m5, m7
6357 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
6358 pmulhrsw m6, m7
6359 packuswb m5, m6
6360
6361 palignr m3, m2, 14
6362
6363 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
6364 pmulhrsw m6, m7
6365 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
6366 pmulhrsw m1, m7
6367 packuswb m6, m1
6368
6369 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
6370 pmulhrsw m1, m7
6371 pmaddubsw m3, [r4] ; [16]
6372 pmulhrsw m3, m7
6373 packuswb m1, m3
6374
6375 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
6376
; Second half: destination restarts 8 bytes right of the original r0 (saved in r6).
6377 lea r0, [r6 + 8]
6378
6379 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6380 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
6381 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
6382 movlhps m2, m1 ; high half = [8 7 6 5 4 3 2 1]; the low half of m2 is kept
6383
6384 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
6385 pmulhrsw m4, m7
6386 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
6387 pmulhrsw m5, m7
6388 packuswb m4, m5
6389
6390 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
6391 pmulhrsw m5, m7
6392 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
6393 pmulhrsw m6, m7
6394 packuswb m5, m6
6395
6396 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
6397 pmulhrsw m6, m7
6398 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
6399 pmulhrsw m0, m7
6400 packuswb m6, m0
6401
6402 palignr m3, m2, 14
6403
6404 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
6405 pmulhrsw m1, m7
6406 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6407 pmulhrsw m0, m7
6408 packuswb m1, m0
6409
6410 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
6411
6412 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
6413 pmulhrsw m4, m7
6414 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
6415 pmulhrsw m5, m7
6416 packuswb m4, m5
6417
6418 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
6419 pmulhrsw m5, m7
6420 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
6421 pmulhrsw m6, m7
6422 packuswb m5, m6
6423
; Shift m2 up one byte so that the next palignr by 14 picks up the following two bytes.
6424 pslldq m2, 1
6425 palignr m3, m2, 14
6426
6427 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
6428 pmulhrsw m6, m7
6429 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
6430 pmulhrsw m1, m7
6431 packuswb m6, m1
6432
6433 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
6434 pmulhrsw m1, m7
6435 pmaddubsw m3, [r4] ; [16]
6436 pmulhrsw m3, m7
6437 packuswb m1, m3
6438
6439 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
6440 RET
6441
;-----------------------------------------------------------------------------
; intra_pred_ang16_13 (SSE4; cglobal declares 4 args, 7 GPRs, 8 XMM regs)
; Going by the name: 16x16 angular intra prediction, mode 13.
;
; Inputs as used below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference samples; reads 16 bytes at [r2], [r2 + 32] and [r2 + 33]
;        NOTE(review): presumably top-left + above row at [r2] and the left
;        column at [r2 + 32] - confirm against the caller
;   r3 = fourth declared argument, never referenced in this routine
;
; Working registers:
;   r4 = ang_table + 16 * 16; rows are addressed in 16-byte steps and the
;        [NN] comment on each pmaddubsw is the row index (16 + offset)
;   r5 = 3 * stride (not used directly here; presumably consumed by the
;        TRANSPOSE_STORE_8x8 macro, which is defined elsewhere)
;   r6 = dst + 4 * stride
;   m7 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming the
;        constant is words of 1024 as its name suggests
;   m3 = window of adjacent sample pairs fed to pmaddubsw
;   m2 = samples selected by the c_mode16_13 shuffle; palignr moves them
;        into the low end of m3 whenever the window has to advance
;
; The same schedule runs twice: first on main-reference samples 0..8, then
; on samples 8..16 after r0 and r6 have been moved 8 rows further down.
;-----------------------------------------------------------------------------
6442 INIT_XMM sse4
6443 cglobal intra_pred_ang16_13, 4,7,8
6444 lea r4, [ang_table + 16 * 16]
6445 lea r5, [r1 * 3] ; r5 -> 3 * stride
6446 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
6447 mova m7, [pw_1024]
6448
; ---- first half: pairs built from bytes 0..8 of [r2 + 32] ----
6449 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6450 pinsrb m3, [r2], 0 ; lane 0 <- byte at [r2]
6451 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
6452 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
6453 movu m2, [r2]
6454 pshufb m2, [c_mode16_13] ; pick the samples that extend the window (table defined elsewhere)
6455
6456 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6457
6458 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
6459 pmulhrsw m4, m7
6460 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
6461 pmulhrsw m0, m7
6462 packuswb m4, m0
6463
6464 pmaddubsw m5, [r4 - 11 * 16] ; [05]
6465 pmulhrsw m5, m7
6466
6467 palignr m3, m2, 15 ; low byte <- m2[15], rest <- old m3[0..14]: window moves by one sample
6468
6469 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
6470 pmulhrsw m6, m7
6471 packuswb m5, m6
6472
6473 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
6474 pmulhrsw m6, m7
6475 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
6476 pmulhrsw m0, m7
6477 packuswb m6, m0
6478
6479 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
6480 pmulhrsw m1, m7
6481
6482 palignr m3, m2, 14 ; low pair <- m2[14..15], rest <- old m3[0..13]: one more sample
6483
6484 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6485 pmulhrsw m0, m7
6486 packuswb m1, m0
6487
6488 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6489
6490 pmaddubsw m4, m3, [r4 - 16] ; [15]
6491 pmulhrsw m4, m7
6492 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
6493 pmulhrsw m5, m7
6494 packuswb m4, m5
6495
6496 pslldq m2, 1 ; bring the next sample pair into m2[14..15]
6497 palignr m3, m2, 14
6498
6499 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
6500 pmulhrsw m5, m7
6501 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
6502 pmulhrsw m6, m7
6503 packuswb m5, m6
6504
6505 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
6506 pmulhrsw m6, m7
6507 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
6508 pmulhrsw m1, m7
6509 packuswb m6, m1
6510
6511 pslldq m2, 1
6512 palignr m3, m2, 14
6513
6514 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
6515 pmulhrsw m1, m7
6516 pmaddubsw m3, [r4] ; [16]
6517 pmulhrsw m3, m7
6518 packuswb m1, m3
6519
6520 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6521
; ---- second half: pairs built from bytes 8..16 of [r2 + 32] ----
6522 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
6523 lea r6, [r6 + r1 * 8] ; r6 advances by 8 * stride
6524
6525 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6526 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
6527 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
6528 movlhps m2, m1 ; high qword = [8 7 6 5 4 3 2 1], low qword of m2 is kept
6529
6530 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
6531 pmulhrsw m4, m7
6532 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
6533 pmulhrsw m5, m7
6534 packuswb m4, m5
6535
6536 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
6537 pmulhrsw m5, m7
6538
6539 palignr m3, m2, 14 ; low pair <- samples [8 7]: window moves by one sample
6540
6541 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
6542 pmulhrsw m6, m7
6543 packuswb m5, m6
6544
6545 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
6546 pmulhrsw m6, m7
6547 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
6548 pmulhrsw m0, m7
6549 packuswb m6, m0
6550
6551 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
6552 pmulhrsw m1, m7
6553
6554 pslldq m2, 1
6555 palignr m3, m2, 14
6556
6557 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6558 pmulhrsw m0, m7
6559 packuswb m1, m0
6560
6561 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6562
6563 pmaddubsw m4, m3, [r4 - 16] ; [15]
6564 pmulhrsw m4, m7
6565 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
6566 pmulhrsw m5, m7
6567 packuswb m4, m5
6568
6569 pslldq m2, 1
6570 palignr m3, m2, 14
6571
6572 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
6573 pmulhrsw m5, m7
6574 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
6575 pmulhrsw m6, m7
6576 packuswb m5, m6
6577
6578 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
6579 pmulhrsw m6, m7
6580 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
6581 pmulhrsw m1, m7
6582 packuswb m6, m1
6583
6584 pslldq m2, 1
6585 palignr m3, m2, 14
6586
6587 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
6588 pmulhrsw m1, m7
6589 pmaddubsw m3, [r4] ; [16]
6590 pmulhrsw m3, m7
6591 packuswb m1, m3
6592
6593 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6594 RET
6595
;-----------------------------------------------------------------------------
; intra_pred_ang16_23 (SSE4; cglobal declares 4 args, 7 GPRs, 8 XMM regs)
; Going by the name: 16x16 angular intra prediction, mode 23.
; Uses the same coefficient schedule and the same c_mode16_13 shuffle as
; intra_pred_ang16_13, but swaps the two reference loads and passes 0 as the
; second TRANSPOSE_STORE_8x8 argument.
;
; Inputs as used below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference samples; reads 16 bytes at [r2], [r2 + 1] and [r2 + 32]
;        NOTE(review): presumably top-left + above row at [r2] and the left
;        column at [r2 + 32] - confirm against the caller
;   r3 = fourth declared argument, never referenced in this routine
;
; Working registers:
;   r4 = ang_table + 16 * 16; the [NN] comment on each pmaddubsw is the
;        16-byte row index (16 + offset)
;   r5 = 3 * stride (not used directly here; presumably consumed by the
;        TRANSPOSE_STORE_8x8 macro, which is defined elsewhere)
;   r6 = copy of the incoming r0, used to place the second half at dst + 8
;   m7 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming the
;        constant is words of 1024 as its name suggests
;   m3 = window of adjacent sample pairs, m2 = samples that palignr moves
;        into the low end of m3 whenever the window has to advance
;-----------------------------------------------------------------------------
6596 INIT_XMM sse4
6597 cglobal intra_pred_ang16_23, 4,7,8
6598 lea r4, [ang_table + 16 * 16]
6599 lea r5, [r1 * 3] ; r5 -> 3 * stride
6600 mov r6, r0 ; keep the incoming dst for the second half
6601 mova m7, [pw_1024]
6602
; ---- first half: pairs built from bytes 0..8 of [r2] ----
6603 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6604 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
6605 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
6606 movu m2, [r2 + 32]
6607 pshufb m2, [c_mode16_13] ; pick the samples that extend the window (table defined elsewhere)
6608
6609 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6610
6611 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
6612 pmulhrsw m4, m7
6613 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
6614 pmulhrsw m0, m7
6615 packuswb m4, m0
6616
6617 pmaddubsw m5, [r4 - 11 * 16] ; [05]
6618 pmulhrsw m5, m7
6619
6620 palignr m3, m2, 15 ; low byte <- m2[15], rest <- old m3[0..14]: window moves by one sample
6621
6622 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
6623 pmulhrsw m6, m7
6624 packuswb m5, m6
6625
6626 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
6627 pmulhrsw m6, m7
6628 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
6629 pmulhrsw m0, m7
6630 packuswb m6, m0
6631
6632 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
6633 pmulhrsw m1, m7
6634
6635 palignr m3, m2, 14 ; low pair <- m2[14..15], rest <- old m3[0..13]: one more sample
6636
6637 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6638 pmulhrsw m0, m7
6639 packuswb m1, m0
6640
6641 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
6642
6643 pmaddubsw m4, m3, [r4 - 16] ; [15]
6644 pmulhrsw m4, m7
6645 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
6646 pmulhrsw m5, m7
6647 packuswb m4, m5
6648
6649 pslldq m2, 1 ; bring the next sample pair into m2[14..15]
6650 palignr m3, m2, 14
6651
6652 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
6653 pmulhrsw m5, m7
6654 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
6655 pmulhrsw m6, m7
6656 packuswb m5, m6
6657
6658 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
6659 pmulhrsw m6, m7
6660 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
6661 pmulhrsw m1, m7
6662 packuswb m6, m1
6663
6664 pslldq m2, 1
6665 palignr m3, m2, 14
6666
6667 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
6668 pmulhrsw m1, m7
6669 pmaddubsw m3, [r4] ; [16]
6670 pmulhrsw m3, m7
6671 packuswb m1, m3
6672
6673 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
6674
; ---- second half: pairs built from bytes 8..16 of [r2] ----
6675 lea r0, [r6 + 8] ; restart 8 bytes to the right of the incoming dst
6676
6677 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6678 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
6679 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
6680 movlhps m2, m1 ; high qword = [8 7 6 5 4 3 2 1], low qword of m2 is kept
6681
6682 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
6683 pmulhrsw m4, m7
6684 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
6685 pmulhrsw m5, m7
6686 packuswb m4, m5
6687
6688 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
6689 pmulhrsw m5, m7
6690
6691 palignr m3, m2, 14 ; low pair <- samples [8 7]: window moves by one sample
6692
6693 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
6694 pmulhrsw m6, m7
6695 packuswb m5, m6
6696
6697 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
6698 pmulhrsw m6, m7
6699 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
6700 pmulhrsw m0, m7
6701 packuswb m6, m0
6702
6703 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
6704 pmulhrsw m1, m7
6705
6706 pslldq m2, 1
6707 palignr m3, m2, 14
6708
6709 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6710 pmulhrsw m0, m7
6711 packuswb m1, m0
6712
6713 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
6714
6715 pmaddubsw m4, m3, [r4 - 16] ; [15]
6716 pmulhrsw m4, m7
6717 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
6718 pmulhrsw m5, m7
6719 packuswb m4, m5
6720
6721 pslldq m2, 1
6722 palignr m3, m2, 14
6723
6724 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
6725 pmulhrsw m5, m7
6726 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
6727 pmulhrsw m6, m7
6728 packuswb m5, m6
6729
6730 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
6731 pmulhrsw m6, m7
6732 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
6733 pmulhrsw m1, m7
6734 packuswb m6, m1
6735
6736 pslldq m2, 1
6737 palignr m3, m2, 14
6738
6739 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
6740 pmulhrsw m1, m7
6741 pmaddubsw m3, [r4] ; [16]
6742 pmulhrsw m3, m7
6743 packuswb m1, m3
6744
6745 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
6746 RET
6747
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 (SSE4; cglobal declares 4 args, 7 GPRs, 8 XMM regs)
; Going by the name: 16x16 angular intra prediction, mode 14.
;
; Inputs as used below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference samples; reads 16 bytes at [r2], [r2 + 32] and [r2 + 33]
;        NOTE(review): presumably top-left + above row at [r2] and the left
;        column at [r2 + 32] - confirm against the caller
;   r3 = fourth declared argument, never referenced in this routine
;
; Working registers:
;   r4 = ang_table + 16 * 16; the [NN] comment on each pmaddubsw is the
;        16-byte row index (16 + offset)
;   r5 = 3 * stride (not used directly here; presumably consumed by the
;        TRANSPOSE_STORE_8x8 macro, which is defined elsewhere)
;   r6 = dst + 4 * stride
;   m7 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming the
;        constant is words of 1024 as its name suggests
;   m3 = window of adjacent sample pairs fed to pmaddubsw
;   m2 = samples selected by the c_mode16_14 shuffle; palignr moves them
;        into the low end of m3 whenever the window has to advance
;
; The same schedule runs twice: first on main-reference samples 0..8, then
; on samples 8..16 after r0 and r6 have been moved 8 rows further down.
;-----------------------------------------------------------------------------
6748 INIT_XMM sse4
6749 cglobal intra_pred_ang16_14, 4,7,8
6750 lea r4, [ang_table + 16 * 16]
6751 lea r5, [r1 * 3] ; r5 -> 3 * stride
6752 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
6753 mova m7, [pw_1024]
6754
; ---- first half: pairs built from bytes 0..8 of [r2 + 32] ----
6755 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6756 pinsrb m3, [r2], 0 ; lane 0 <- byte at [r2]
6757 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
6758 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
6759 movu m2, [r2]
6760 pshufb m2, [c_mode16_14] ; pick the samples that extend the window (table defined elsewhere)
6761
6762 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6763
6764 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
6765 pmulhrsw m4, m7
6766 pmaddubsw m5, [r4 - 10 * 16] ; [06]
6767 pmulhrsw m5, m7
6768 packuswb m4, m5
6769
6770 palignr m3, m2, 15 ; low byte <- m2[15], rest <- old m3[0..14]: window moves by one sample
6771
6772 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
6773 pmulhrsw m5, m7
6774 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
6775 pmulhrsw m6, m7
6776 packuswb m5, m6
6777
6778 palignr m3, m2, 14 ; low pair <- m2[14..15], rest <- old m3[0..13]: one more sample
6779
6780 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
6781 pmulhrsw m6, m7
6782 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
6783 pmulhrsw m0, m7
6784 packuswb m6, m0
6785
6786 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
6787 pmulhrsw m1, m7
6788
6789 pslldq m2, 1 ; bring the next sample pair into m2[14..15]
6790 palignr m3, m2, 14
6791
6792 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6793 pmulhrsw m0, m7
6794 packuswb m1, m0
6795
6796 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6797
6798 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
6799 pmulhrsw m4, m7
6800
6801 pslldq m2, 1
6802 palignr m3, m2, 14
6803
6804 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
6805 pmulhrsw m5, m7
6806 packuswb m4, m5
6807
6808 pmaddubsw m5, m3, [r4 + 16] ; [17]
6809 pmulhrsw m5, m7
6810 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
6811 pmulhrsw m6, m7
6812 packuswb m5, m6
6813
6814 pslldq m2, 1
6815 palignr m3, m2, 14
6816
6817 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
6818 pmulhrsw m6, m7
6819 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
6820 pmulhrsw m1, m7
6821 packuswb m6, m1
6822
6823 pslldq m2, 1
6824 palignr m3, m2, 14
6825
6826 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
6827 pmulhrsw m1, m7
6828 pmaddubsw m3, [r4] ; [16]
6829 pmulhrsw m3, m7
6830 packuswb m1, m3
6831
6832 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6833
; ---- second half: pairs built from bytes 8..16 of [r2 + 32] ----
6834 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
6835 lea r6, [r6 + r1 * 8] ; r6 advances by 8 * stride
6836
6837 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6838 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
6839 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
6840 movlhps m2, m1 ; high qword = [8 7 6 5 4 3 2 1], low qword of m2 is kept
6841
6842 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
6843 pmulhrsw m4, m7
6844 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
6845 pmulhrsw m5, m7
6846 packuswb m4, m5
6847
6848 palignr m3, m2, 14 ; low pair <- samples [8 7]: window moves by one sample
6849
6850 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
6851 pmulhrsw m5, m7
6852 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
6853 pmulhrsw m6, m7
6854 packuswb m5, m6
6855
6856 pslldq m2, 1
6857 palignr m3, m2, 14
6858
6859 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
6860 pmulhrsw m6, m7
6861 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
6862 pmulhrsw m0, m7
6863 packuswb m6, m0
6864
6865 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
6866 pmulhrsw m1, m7
6867
6868 pslldq m2, 1
6869 palignr m3, m2, 14
6870
6871 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6872 pmulhrsw m0, m7
6873 packuswb m1, m0
6874
6875 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
6876
6877 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
6878 pmulhrsw m4, m7
6879
6880 pslldq m2, 1
6881 palignr m3, m2, 14
6882
6883 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
6884 pmulhrsw m5, m7
6885 packuswb m4, m5
6886
6887 pmaddubsw m5, m3, [r4 + 16] ; [17]
6888 pmulhrsw m5, m7
6889 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
6890 pmulhrsw m6, m7
6891 packuswb m5, m6
6892
6893 pslldq m2, 1
6894 palignr m3, m2, 14
6895
6896 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
6897 pmulhrsw m6, m7
6898 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
6899 pmulhrsw m1, m7
6900 packuswb m6, m1
6901
6902 pslldq m2, 1
6903 palignr m3, m2, 14
6904
6905 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
6906 pmulhrsw m1, m7
6907 pmaddubsw m3, [r4] ; [16]
6908 pmulhrsw m3, m7
6909 packuswb m1, m3
6910
6911 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
6912 RET
6913
;-----------------------------------------------------------------------------
; intra_pred_ang16_22 (SSE4; cglobal declares 4 args, 7 GPRs, 8 XMM regs)
; Going by the name: 16x16 angular intra prediction, mode 22.
; Uses the same coefficient schedule and the same c_mode16_14 shuffle as
; intra_pred_ang16_14, but swaps the two reference loads and passes 0 as the
; second TRANSPOSE_STORE_8x8 argument.
;
; Inputs as used below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference samples; reads 16 bytes at [r2], [r2 + 1] and [r2 + 32]
;        NOTE(review): presumably top-left + above row at [r2] and the left
;        column at [r2 + 32] - confirm against the caller
;   r3 = fourth declared argument, never referenced in this routine
;
; Working registers:
;   r4 = ang_table + 16 * 16; the [NN] comment on each pmaddubsw is the
;        16-byte row index (16 + offset)
;   r5 = 3 * stride (not used directly here; presumably consumed by the
;        TRANSPOSE_STORE_8x8 macro, which is defined elsewhere)
;   r6 = copy of the incoming r0, used to place the second half at dst + 8
;   m7 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming the
;        constant is words of 1024 as its name suggests
;   m3 = window of adjacent sample pairs, m2 = samples that palignr moves
;        into the low end of m3 whenever the window has to advance
;-----------------------------------------------------------------------------
6914 INIT_XMM sse4
6915 cglobal intra_pred_ang16_22, 4,7,8
6916 lea r4, [ang_table + 16 * 16]
6917 lea r5, [r1 * 3] ; r5 -> 3 * stride
6918 mov r6, r0 ; keep the incoming dst for the second half
6919 mova m7, [pw_1024]
6920
; ---- first half: pairs built from bytes 0..8 of [r2] ----
6921 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
6922 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
6923 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
6924 movu m2, [r2 + 32]
6925 pshufb m2, [c_mode16_14] ; pick the samples that extend the window (table defined elsewhere)
6926
6927 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
6928
6929 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
6930 pmulhrsw m4, m7
6931 pmaddubsw m5, [r4 - 10 * 16] ; [06]
6932 pmulhrsw m5, m7
6933 packuswb m4, m5
6934
6935 palignr m3, m2, 15 ; low byte <- m2[15], rest <- old m3[0..14]: window moves by one sample
6936
6937 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
6938 pmulhrsw m5, m7
6939 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
6940 pmulhrsw m6, m7
6941 packuswb m5, m6
6942
6943 palignr m3, m2, 14 ; low pair <- m2[14..15], rest <- old m3[0..13]: one more sample
6944
6945 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
6946 pmulhrsw m6, m7
6947 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
6948 pmulhrsw m0, m7
6949 packuswb m6, m0
6950
6951 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
6952 pmulhrsw m1, m7
6953
6954 pslldq m2, 1 ; bring the next sample pair into m2[14..15]
6955 palignr m3, m2, 14
6956
6957 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
6958 pmulhrsw m0, m7
6959 packuswb m1, m0
6960
6961 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
6962
6963 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
6964 pmulhrsw m4, m7
6965
6966 pslldq m2, 1
6967 palignr m3, m2, 14
6968
6969 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
6970 pmulhrsw m5, m7
6971 packuswb m4, m5
6972
6973 pmaddubsw m5, m3, [r4 + 16] ; [17]
6974 pmulhrsw m5, m7
6975 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
6976 pmulhrsw m6, m7
6977 packuswb m5, m6
6978
6979 pslldq m2, 1
6980 palignr m3, m2, 14
6981
6982 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
6983 pmulhrsw m6, m7
6984 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
6985 pmulhrsw m1, m7
6986 packuswb m6, m1
6987
6988 pslldq m2, 1
6989 palignr m3, m2, 14
6990
6991 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
6992 pmulhrsw m1, m7
6993 pmaddubsw m3, [r4] ; [16]
6994 pmulhrsw m3, m7
6995 packuswb m1, m3
6996
6997 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
6998
; ---- second half: pairs built from bytes 8..16 of [r2] ----
6999 lea r0, [r6 + 8] ; restart 8 bytes to the right of the incoming dst
7000
7001 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
7002 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
7003 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
7004 movlhps m2, m1 ; high qword = [8 7 6 5 4 3 2 1], low qword of m2 is kept
7005
7006 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
7007 pmulhrsw m4, m7
7008 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
7009 pmulhrsw m5, m7
7010 packuswb m4, m5
7011
7012 palignr m3, m2, 14 ; low pair <- samples [8 7]: window moves by one sample
7013
7014 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
7015 pmulhrsw m5, m7
7016 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
7017 pmulhrsw m6, m7
7018 packuswb m5, m6
7019
7020 pslldq m2, 1
7021 palignr m3, m2, 14
7022
7023 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
7024 pmulhrsw m6, m7
7025 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
7026 pmulhrsw m0, m7
7027 packuswb m6, m0
7028
7029 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
7030 pmulhrsw m1, m7
7031
7032 pslldq m2, 1
7033 palignr m3, m2, 14
7034
7035 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7036 pmulhrsw m0, m7
7037 packuswb m1, m0
7038
7039 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
7040
7041 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
7042 pmulhrsw m4, m7
7043
7044 pslldq m2, 1
7045 palignr m3, m2, 14
7046
7047 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
7048 pmulhrsw m5, m7
7049 packuswb m4, m5
7050
7051 pmaddubsw m5, m3, [r4 + 16] ; [17]
7052 pmulhrsw m5, m7
7053 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
7054 pmulhrsw m6, m7
7055 packuswb m5, m6
7056
7057 pslldq m2, 1
7058 palignr m3, m2, 14
7059
7060 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
7061 pmulhrsw m6, m7
7062 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
7063 pmulhrsw m1, m7
7064 packuswb m6, m1
7065
7066 pslldq m2, 1
7067 palignr m3, m2, 14
7068
7069 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
7070 pmulhrsw m1, m7
7071 pmaddubsw m3, [r4] ; [16]
7072 pmulhrsw m3, m7
7073 packuswb m1, m3
7074
7075 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
7076 RET
7077
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 (SSE4; cglobal declares 4 args, 7 GPRs, 8 XMM regs)
; Going by the name: 16x16 angular intra prediction, mode 15.
;
; Inputs as used below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference samples; reads 16 bytes at [r2], [r2 + 32] and [r2 + 33]
;        NOTE(review): presumably top-left + above row at [r2] and the left
;        column at [r2 + 32] - confirm against the caller
;   r3 = fourth declared argument, never referenced in this routine
;
; Working registers:
;   r4 = ang_table + 16 * 16; the [NN] comment on each pmaddubsw is the
;        16-byte row index (16 + offset)
;   r5 = 3 * stride (not used directly here; presumably consumed by the
;        TRANSPOSE_STORE_8x8 macro, which is defined elsewhere)
;   r6 = dst + 4 * stride
;   m7 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming the
;        constant is words of 1024 as its name suggests
;   m3 = window of adjacent sample pairs fed to pmaddubsw
;   m2 = samples selected by the c_mode16_15 shuffle; palignr moves them
;        into the low end of m3 whenever the window has to advance
;
; The same schedule runs twice: first on main-reference samples 0..8, then
; on samples 8..16 after r0 and r6 have been moved 8 rows further down.
; The second half advances the window nine times, so after the seven samples
; taken from the high qword loaded by movlhps it also consumes two bytes
; left in the low qword of m2 by the first half.
;-----------------------------------------------------------------------------
7078 INIT_XMM sse4
7079 cglobal intra_pred_ang16_15, 4,7,8
7080 lea r4, [ang_table + 16 * 16]
7081 lea r5, [r1 * 3] ; r5 -> 3 * stride
7082 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
7083 mova m7, [pw_1024]
7084
; ---- first half: pairs built from bytes 0..8 of [r2 + 32] ----
7085 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7086 pinsrb m3, [r2], 0 ; lane 0 <- byte at [r2]
7087 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
7088 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
7089 movu m2, [r2]
7090 pshufb m2, [c_mode16_15] ; pick the samples that extend the window (table defined elsewhere)
7091
7092 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
7093
7094 pmaddubsw m4, [r4 - 16] ; [15]
7095 pmulhrsw m4, m7
7096
7097 palignr m3, m2, 15 ; low byte <- m2[15], rest <- old m3[0..14]: window moves by one sample
7098
7099 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
7100 pmulhrsw m5, m7
7101 packuswb m4, m5
7102
7103 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
7104 pmulhrsw m5, m7
7105
7106 palignr m3, m2, 14 ; low pair <- m2[14..15], rest <- old m3[0..13]: one more sample
7107
7108 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
7109 pmulhrsw m6, m7
7110 packuswb m5, m6
7111
7112 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
7113 pmulhrsw m6, m7
7114
7115 pslldq m2, 1 ; bring the next sample pair into m2[14..15]
7116 palignr m3, m2, 14
7117
7118 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
7119 pmulhrsw m0, m7
7120 packuswb m6, m0
7121
7122 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
7123 pmulhrsw m1, m7
7124
7125 pslldq m2, 1
7126 palignr m3, m2, 14
7127
7128 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7129 pmulhrsw m0, m7
7130 packuswb m1, m0
7131
7132 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
7133
7134 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
7135 pmulhrsw m4, m7
7136
7137 pslldq m2, 1
7138 palignr m3, m2, 14
7139
7140 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7141 pmulhrsw m5, m7
7142 packuswb m4, m5
7143
7144 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
7145 pmulhrsw m5, m7
7146
7147 pslldq m2, 1
7148 palignr m3, m2, 14
7149
7150 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
7151 pmulhrsw m6, m7
7152 packuswb m5, m6
7153
7154 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
7155 pmulhrsw m6, m7
7156
7157 pslldq m2, 1
7158 palignr m3, m2, 14
7159
7160 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
7161 pmulhrsw m1, m7
7162 packuswb m6, m1
7163
7164 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
7165 pmulhrsw m1, m7
7166
7167 pslldq m2, 1
7168 palignr m3, m2, 14
7169
7170 pmaddubsw m3, [r4] ; [16]
7171 pmulhrsw m3, m7
7172 packuswb m1, m3
7173
7174 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
7175
; ---- second half: pairs built from bytes 8..16 of [r2 + 32] ----
7176 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
7177 lea r6, [r6 + r1 * 8] ; r6 advances by 8 * stride
7178
7179 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
7180 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
7181 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
7182 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
7183
7184 pmaddubsw m4, m3, [r4 - 16] ; [15]
7185 pmulhrsw m4, m7
7186
7187 palignr m3, m2, 14 ; low pair <- samples [8 7]: window moves by one sample
7188
7189 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
7190 pmulhrsw m5, m7
7191 packuswb m4, m5
7192
7193 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
7194 pmulhrsw m5, m7
7195
7196 pslldq m2, 1
7197 palignr m3, m2, 14
7198
7199 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
7200 pmulhrsw m6, m7
7201 packuswb m5, m6
7202
7203 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
7204 pmulhrsw m6, m7
7205
7206 pslldq m2, 1
7207 palignr m3, m2, 14
7208
7209 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
7210 pmulhrsw m0, m7
7211 packuswb m6, m0
7212
7213 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
7214 pmulhrsw m1, m7
7215
7216 pslldq m2, 1
7217 palignr m3, m2, 14
7218
7219 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7220 pmulhrsw m0, m7
7221 packuswb m1, m0
7222
7223 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
7224
7225 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
7226 pmulhrsw m4, m7
7227
7228 pslldq m2, 1
7229 palignr m3, m2, 14
7230
7231 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7232 pmulhrsw m5, m7
7233 packuswb m4, m5
7234
7235 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
7236 pmulhrsw m5, m7
7237
7238 pslldq m2, 1
7239 palignr m3, m2, 14
7240
7241 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
7242 pmulhrsw m6, m7
7243 packuswb m5, m6
7244
7245 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
7246 pmulhrsw m6, m7
7247
7248 pslldq m2, 1
7249 palignr m3, m2, 14
7250
7251 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
7252 pmulhrsw m1, m7
7253 packuswb m6, m1
7254
7255 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
7256 pmulhrsw m1, m7
7257
7258 pslldq m2, 1
7259 palignr m3, m2, 14
7260
7261 pmaddubsw m3, [r4] ; [16]
7262 pmulhrsw m3, m7
7263 packuswb m1, m3
7264
7265 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
7266 RET
7267
;-----------------------------------------------------------------------------
; intra_pred_ang16_21 (SSE4; cglobal declares 4 args, 7 GPRs, 8 XMM regs)
; Going by the name: 16x16 angular intra prediction, mode 21.
; Uses the same coefficient schedule and the same c_mode16_15 shuffle as
; intra_pred_ang16_15, but swaps the two reference loads and passes 0 as the
; second TRANSPOSE_STORE_8x8 argument.
;
; Inputs as used below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference samples; reads 16 bytes at [r2], [r2 + 1] and [r2 + 32]
;        NOTE(review): presumably top-left + above row at [r2] and the left
;        column at [r2 + 32] - confirm against the caller
;   r3 = fourth declared argument, never referenced in this routine
;
; Working registers:
;   r4 = ang_table + 16 * 16; the [NN] comment on each pmaddubsw is the
;        16-byte row index (16 + offset)
;   r5 = 3 * stride (not used directly here; presumably consumed by the
;        TRANSPOSE_STORE_8x8 macro, which is defined elsewhere)
;   r6 = copy of the incoming r0, used to place the second half at dst + 8
;   m7 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming the
;        constant is words of 1024 as its name suggests
;   m3 = window of adjacent sample pairs, m2 = samples that palignr moves
;        into the low end of m3 whenever the window has to advance
;
; The second half advances the window nine times, so after the seven samples
; taken from the high qword loaded by movlhps it also consumes two bytes
; left in the low qword of m2 by the first half.
;-----------------------------------------------------------------------------
7268 INIT_XMM sse4
7269 cglobal intra_pred_ang16_21, 4,7,8
7270 lea r4, [ang_table + 16 * 16]
7271 lea r5, [r1 * 3] ; r5 -> 3 * stride
7272 mov r6, r0 ; keep the incoming dst for the second half
7273 mova m7, [pw_1024]
7274
; ---- first half: pairs built from bytes 0..8 of [r2] ----
7275 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7276 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
7277 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
7278 movu m2, [r2 + 32]
7279 pinsrb m2, [r2], 0 ; lane 0 <- byte at [r2] before the shuffle
7280 pshufb m2, [c_mode16_15] ; pick the samples that extend the window (table defined elsewhere)
7281
7282 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
7283
7284 pmaddubsw m4, [r4 - 16] ; [15]
7285 pmulhrsw m4, m7
7286
7287 palignr m3, m2, 15 ; low byte <- m2[15], rest <- old m3[0..14]: window moves by one sample
7288
7289 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
7290 pmulhrsw m5, m7
7291 packuswb m4, m5
7292
7293 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
7294 pmulhrsw m5, m7
7295
7296 palignr m3, m2, 14 ; low pair <- m2[14..15], rest <- old m3[0..13]: one more sample
7297
7298 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
7299 pmulhrsw m6, m7
7300 packuswb m5, m6
7301
7302 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
7303 pmulhrsw m6, m7
7304
7305 pslldq m2, 1 ; bring the next sample pair into m2[14..15]
7306 palignr m3, m2, 14
7307
7308 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
7309 pmulhrsw m0, m7
7310 packuswb m6, m0
7311
7312 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
7313 pmulhrsw m1, m7
7314
7315 pslldq m2, 1
7316 palignr m3, m2, 14
7317
7318 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7319 pmulhrsw m0, m7
7320 packuswb m1, m0
7321
7322 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
7323
7324 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
7325 pmulhrsw m4, m7
7326
7327 pslldq m2, 1
7328 palignr m3, m2, 14
7329
7330 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7331 pmulhrsw m5, m7
7332 packuswb m4, m5
7333
7334 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
7335 pmulhrsw m5, m7
7336
7337 pslldq m2, 1
7338 palignr m3, m2, 14
7339
7340 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
7341 pmulhrsw m6, m7
7342 packuswb m5, m6
7343
7344 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
7345 pmulhrsw m6, m7
7346
7347 pslldq m2, 1
7348 palignr m3, m2, 14
7349
7350 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
7351 pmulhrsw m1, m7
7352 packuswb m6, m1
7353
7354 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
7355 pmulhrsw m1, m7
7356
7357 pslldq m2, 1
7358 palignr m3, m2, 14
7359
7360 pmaddubsw m3, [r4] ; [16]
7361 pmulhrsw m3, m7
7362 packuswb m1, m3
7363
7364 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
7365
; ---- second half: pairs built from bytes 8..16 of [r2] ----
7366 lea r0, [r6 + 8] ; restart 8 bytes to the right of the incoming dst
7367
7368 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
7369 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
7370 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
7371 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
7372
7373 pmaddubsw m4, m3, [r4 - 16] ; [15]
7374 pmulhrsw m4, m7
7375
7376 palignr m3, m2, 14 ; low pair <- samples [8 7]: window moves by one sample
7377
7378 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
7379 pmulhrsw m5, m7
7380 packuswb m4, m5
7381
7382 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
7383 pmulhrsw m5, m7
7384
7385 pslldq m2, 1
7386 palignr m3, m2, 14
7387
7388 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
7389 pmulhrsw m6, m7
7390 packuswb m5, m6
7391
7392 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
7393 pmulhrsw m6, m7
7394
7395 pslldq m2, 1
7396 palignr m3, m2, 14
7397
7398 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
7399 pmulhrsw m0, m7
7400 packuswb m6, m0
7401
7402 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
7403 pmulhrsw m1, m7
7404
7405 pslldq m2, 1
7406 palignr m3, m2, 14
7407
7408 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7409 pmulhrsw m0, m7
7410 packuswb m1, m0
7411
7412 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
7413
7414 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
7415 pmulhrsw m4, m7
7416
7417 pslldq m2, 1
7418 palignr m3, m2, 14
7419
7420 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7421 pmulhrsw m5, m7
7422 packuswb m4, m5
7423
7424 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
7425 pmulhrsw m5, m7
7426
7427 pslldq m2, 1
7428 palignr m3, m2, 14
7429
7430 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
7431 pmulhrsw m6, m7
7432 packuswb m5, m6
7433
7434 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
7435 pmulhrsw m6, m7
7436
7437 pslldq m2, 1
7438 palignr m3, m2, 14
7439
7440 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
7441 pmulhrsw m1, m7
7442 packuswb m6, m1
7443
7444 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
7445 pmulhrsw m1, m7
7446
7447 pslldq m2, 1
7448 palignr m3, m2, 14
7449
7450 pmaddubsw m3, [r4] ; [16]
7451 pmulhrsw m3, m7
7452 packuswb m1, m3
7453
7454 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
7455 RET
7456
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 (SSE4; cglobal declares 4 args, 7 GPRs, 8 XMM regs)
; Going by the name: 16x16 angular intra prediction, mode 16.
;
; Inputs as used below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference samples; reads 16 bytes at [r2], [r2 + 32] and [r2 + 33]
;        NOTE(review): presumably top-left + above row at [r2] and the left
;        column at [r2 + 32] - confirm against the caller
;   r3 = fourth declared argument, never referenced in this routine
;
; Working registers:
;   r4 = ang_table + 16 * 16; the [NN] comment on each pmaddubsw is the
;        16-byte row index (16 + offset)
;   r5 = 3 * stride (not used directly here; presumably consumed by the
;        TRANSPOSE_STORE_8x8 macro, which is defined elsewhere)
;   r6 = dst + 4 * stride
;   m7 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming the
;        constant is words of 1024 as its name suggests
;   m3 = window of adjacent sample pairs fed to pmaddubsw
;   m2 = samples selected by the c_mode16_16 shuffle; palignr moves them
;        into the low end of m3 whenever the window has to advance
;
; The same schedule runs twice: first on main-reference samples 0..8, then
; on samples 8..16 after r0 and r6 have been moved 8 rows further down.
; The second half advances the window eleven times, which is more than the
; eight bytes loaded by movlhps can supply, so m2 is first rotated with
; palignr m2, m2, 6 to park the still-needed shuffled bytes in its low qword.
;-----------------------------------------------------------------------------
7457 INIT_XMM sse4
7458 cglobal intra_pred_ang16_16, 4,7,8
7459 lea r4, [ang_table + 16 * 16]
7460 lea r5, [r1 * 3] ; r5 -> 3 * stride
7461 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
7462 mova m7, [pw_1024]
7463
; ---- first half: pairs built from bytes 0..8 of [r2 + 32] ----
7464 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7465 pinsrb m3, [r2], 0 ; lane 0 <- byte at [r2]
7466 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
7467 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
7468 movu m2, [r2]
7469 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
7470 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
7471
7472 pmaddubsw m4, [r4 - 5 * 16] ; [11]
7473 pmulhrsw m4, m7
7474
7475 palignr m3, m2, 15 ; low byte <- m2[15], rest <- old m3[0..14]: window moves by one sample
7476
7477 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7478 pmulhrsw m5, m7
7479 packuswb m4, m5
7480
7481 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
7482 pmulhrsw m5, m7
7483
7484 palignr m3, m2, 14 ; low pair <- m2[14..15], rest <- old m3[0..13]: one more sample
7485
7486 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
7487 pmulhrsw m6, m7
7488 packuswb m5, m6
7489
7490 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
7491 palignr m3, m2, 14
7492
7493 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
7494 pmulhrsw m6, m7
7495 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
7496 pmulhrsw m0, m7
7497 packuswb m6, m0
7498
7499 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
7500 palignr m3, m2, 14
7501
7502 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
7503 pmulhrsw m1, m7
7504
7505 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
7506 palignr m3, m2, 14
7507
7508 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7509 pmulhrsw m0, m7
7510 packuswb m1, m0
7511
7512 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
7513
7514 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
7515 pmulhrsw m4, m7
7516
7517 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
7518 palignr m3, m2, 14
7519
7520 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
7521 pmulhrsw m5, m7
7522 packuswb m4, m5
7523
7524 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
7525 palignr m3, m2, 14
7526
7527 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
7528 pmulhrsw m5, m7
7529 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
7530 pmulhrsw m6, m7
7531 packuswb m5, m6
7532
7533 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
7534 palignr m3, m2, 14
7535
7536 pmaddubsw m6, m3, [r4 - 16] ; [15]
7537 pmulhrsw m6, m7
7538
7539 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
7540 palignr m3, m2, 14
7541
7542 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
7543 pmulhrsw m1, m7
7544 packuswb m6, m1
7545
7546 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
7547 pmulhrsw m1, m7
7548
7549 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
7550 palignr m3, m2, 14
7551
7552 pmaddubsw m3, [r4] ; [16]
7553 pmulhrsw m3, m7
7554 packuswb m1, m3
7555
7556 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
7557
; ---- second half: pairs built from bytes 8..16 of [r2 + 32] ----
7558 lea r0, [r6 + r1 * 4] ; r0 = r6 + 4 * stride
7559 lea r6, [r6 + r1 * 8] ; r6 advances by 8 * stride
7560
7561 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
7562 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
7563 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
7564 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
7565 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
7566
7567 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
7568 pmulhrsw m4, m7
7569
7570 palignr m3, m2, 14 ; low pair <- samples [8 7]: window moves by one sample
7571
7572 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7573 pmulhrsw m5, m7
7574 packuswb m4, m5
7575
7576 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
7577 pmulhrsw m5, m7
7578
7579 pslldq m2, 1
7580 palignr m3, m2, 14
7581
7582 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
7583 pmulhrsw m6, m7
7584 packuswb m5, m6
7585
7586 pslldq m2, 1
7587 palignr m3, m2, 14
7588
7589 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
7590 pmulhrsw m6, m7
7591
7592 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
7593 pmulhrsw m0, m7
7594 packuswb m6, m0
7595
7596 pslldq m2, 1
7597 palignr m3, m2, 14
7598
7599 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
7600 pmulhrsw m1, m7
7601
7602 pslldq m2, 1
7603 palignr m3, m2, 14
7604
7605 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7606 pmulhrsw m0, m7
7607 packuswb m1, m0
7608
7609 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
7610
7611 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
7612 pmulhrsw m4, m7
7613
7614 pslldq m2, 1
7615 palignr m3, m2, 14
7616
7617 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
7618 pmulhrsw m5, m7
7619 packuswb m4, m5
7620
7621 pslldq m2, 1
7622 palignr m3, m2, 14
7623
7624 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
7625 pmulhrsw m5, m7
7626 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
7627 pmulhrsw m6, m7
7628 packuswb m5, m6
7629
7630 pslldq m2, 1
7631 palignr m3, m2, 14
7632
7633 pmaddubsw m6, m3, [r4 - 16] ; [15]
7634 pmulhrsw m6, m7
7635
7636 pslldq m2, 1
7637 palignr m3, m2, 14
7638
7639 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
7640 pmulhrsw m1, m7
7641 packuswb m6, m1
7642
7643 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
7644 pmulhrsw m1, m7
7645
7646 pslldq m2, 1
7647 palignr m3, m2, 14
7648
7649 pmaddubsw m3, [r4] ; [16]
7650 pmulhrsw m3, m7
7651 packuswb m1, m3
7652
7653 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
7654 RET
7655
; intra_pred_ang16_20 (SSE4): angular intra prediction kernel for a 16-pixel-wide block
; (mode 20 going by the function name).
; Register roles as used below:
;   r0 - destination pointer; r6 keeps the original r0 so the second half can use r6 + 8
;   r1 - destination stride, r5 = 3 * stride
;   r2 - reference pixel buffer; read at [r2], [r2 + 1] and [r2 + 32]
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] addresses table row 16 + k; the bracketed
;        number in each comment is that row index
;   m7 - pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;   m3 - sliding window of interleaved neighbouring reference bytes
;   m2 - shuffled bytes that are shifted into m3 two bytes at a time
; Each pass fills two groups of eight result vectors which are handed to
; TRANSPOSE_STORE_8x8 (defined elsewhere in the file; presumably stores an 8x8 tile,
; with the second argument selecting transposition - confirm against the macro).
7656 INIT_XMM sse4
7657 cglobal intra_pred_ang16_20, 4,7,8
7658 lea r4, [ang_table + 16 * 16] ; r4 -> table row 16, rows 0..31 reachable as [r4 +/- k * 16]
7659 lea r5, [r1 * 3] ; r5 -> 3 * stride
7660 mov r6, r0 ; remember the original destination for the second pass
7661 mova m7, [pw_1024]
7662
7663 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7664 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
7665 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
7666 movu m2, [r2 + 32]
7667 pinsrb m2, [r2], 0 ; byte 0 of the second array is replaced by [r2]
7668 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
7669 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
7670
7671 pmaddubsw m4, [r4 - 5 * 16] ; [11]
7672 pmulhrsw m4, m7
7673
7674 palignr m3, m2, 15 ; pull the top byte of m2 into the bottom of the window
7675
7676 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7677 pmulhrsw m5, m7
7678 packuswb m4, m5
7679
7680 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
7681 pmulhrsw m5, m7
7682
7683 palignr m3, m2, 14 ; from here on the window advances by one byte pair per step
7684
7685 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
7686 pmulhrsw m6, m7
7687 packuswb m5, m6
7688
7689 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
7690 palignr m3, m2, 14
7691
7692 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
7693 pmulhrsw m6, m7
7694 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
7695 pmulhrsw m0, m7
7696 packuswb m6, m0
7697
7698 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
7699 palignr m3, m2, 14
7700
7701 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
7702 pmulhrsw m1, m7
7703
7704 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
7705 palignr m3, m2, 14
7706
7707 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7708 pmulhrsw m0, m7
7709 packuswb m1, m0
7710
7711 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
7712
7713 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
7714 pmulhrsw m4, m7
7715
7716 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
7717 palignr m3, m2, 14
7718
7719 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
7720 pmulhrsw m5, m7
7721 packuswb m4, m5
7722
7723 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
7724 palignr m3, m2, 14
7725
7726 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
7727 pmulhrsw m5, m7
7728 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
7729 pmulhrsw m6, m7
7730 packuswb m5, m6
7731
7732 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
7733 palignr m3, m2, 14
7734
7735 pmaddubsw m6, m3, [r4 - 16] ; [15]
7736 pmulhrsw m6, m7
7737
7738 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
7739 palignr m3, m2, 14
7740
7741 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
7742 pmulhrsw m1, m7
7743 packuswb m6, m1
7744
7745 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
7746 pmulhrsw m1, m7
7747
7748 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
7749 palignr m3, m2, 14
7750
7751 pmaddubsw m3, [r4] ; [16]
7752 pmulhrsw m3, m7
7753 packuswb m1, m3
7754
7755 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
7756
; Second pass: same weight sequence, destination moved 8 bytes to the right.
7757 lea r0, [r6 + 8]
7758
7759 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
7760 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
7761 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
7762 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
7763 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
7764
7765 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
7766 pmulhrsw m4, m7
7767
7768 palignr m3, m2, 14
7769
7770 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
7771 pmulhrsw m5, m7
7772 packuswb m4, m5
7773
7774 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
7775 pmulhrsw m5, m7
7776
7777 pslldq m2, 1
7778 palignr m3, m2, 14
7779
7780 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
7781 pmulhrsw m6, m7
7782 packuswb m5, m6
7783
7784 pslldq m2, 1
7785 palignr m3, m2, 14
7786
7787 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
7788 pmulhrsw m6, m7
7789
7790 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
7791 pmulhrsw m0, m7
7792 packuswb m6, m0
7793
7794 pslldq m2, 1
7795 palignr m3, m2, 14
7796
7797 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
7798 pmulhrsw m1, m7
7799
7800 pslldq m2, 1
7801 palignr m3, m2, 14
7802
7803 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
7804 pmulhrsw m0, m7
7805 packuswb m1, m0
7806
7807 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
7808
7809 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
7810 pmulhrsw m4, m7
7811
7812 pslldq m2, 1
7813 palignr m3, m2, 14
7814
7815 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
7816 pmulhrsw m5, m7
7817 packuswb m4, m5
7818
7819 pslldq m2, 1
7820 palignr m3, m2, 14
7821
7822 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
7823 pmulhrsw m5, m7
7824 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
7825 pmulhrsw m6, m7
7826 packuswb m5, m6
7827
7828 pslldq m2, 1
7829 palignr m3, m2, 14
7830
7831 pmaddubsw m6, m3, [r4 - 16] ; [15]
7832 pmulhrsw m6, m7
7833
7834 pslldq m2, 1
7835 palignr m3, m2, 14
7836
7837 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
7838 pmulhrsw m1, m7
7839 packuswb m6, m1
7840
7841 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
7842 pmulhrsw m1, m7
7843
7844 pslldq m2, 1
7845 palignr m3, m2, 14
7846
7847 pmaddubsw m3, [r4] ; [16]
7848 pmulhrsw m3, m7
7849 packuswb m1, m3
7850
7851 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
7852 RET
7853
; intra_pred_ang16_17 (SSE4): angular intra prediction kernel for a 16-pixel-wide block
; (mode 17 going by the function name).
; Register roles as used below:
;   r0 - destination pointer; r6 = r0 + 4 * stride (second row group for the store macro)
;   r1 - destination stride, r5 = 3 * stride
;   r2 - reference pixel buffer; the main samples come from [r2 + 32] (with byte 0 taken
;        from [r2]) and the shuffled side samples from [r2]
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] addresses table row 16 + k; the bracketed
;        number in each comment is that row index
;   m7 - pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
; The weight sequence is 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0 and is
; run twice; between the passes r0 and r6 are both advanced by 8 * stride.
; TRANSPOSE_STORE_8x8 is defined elsewhere in the file (presumably its second argument
; selects transposition - confirm against the macro).
7854 INIT_XMM sse4
7855 cglobal intra_pred_ang16_17, 4,7,8
7856 lea r4, [ang_table + 16 * 16] ; r4 -> table row 16, rows 0..31 reachable as [r4 +/- k * 16]
7857 lea r5, [r1 * 3] ; r5 -> 3 * stride
7858 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
7859 mova m7, [pw_1024]
7860
7861 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7862 pinsrb m3, [r2], 0 ; byte 0 of the second array is replaced by [r2]
7863 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
7864 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
7865 movu m2, [r2]
7866 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
7867 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
7868
7869 pmaddubsw m4, [r4 - 10 * 16] ; [06]
7870 pmulhrsw m4, m7
7871
7872 palignr m3, m2, 15 ; pull the top byte of m2 into the bottom of the window
7873
7874 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
7875 pmulhrsw m5, m7
7876 packuswb m4, m5
7877
7878 palignr m3, m2, 14 ; from here on the window advances by one byte pair per step
7879
7880 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
7881 pmulhrsw m5, m7
7882
7883 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
7884 pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
7885 palignr m3, m2, 14
7886
7887 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
7888 pmulhrsw m6, m7
7889 packuswb m5, m6
7890
7891 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
7892 palignr m3, m2, 14
7893
7894 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
7895 pmulhrsw m6, m7
7896 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
7897 pmulhrsw m0, m7
7898 packuswb m6, m0
7899
7900 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
7901 palignr m3, m2, 14
7902
7903 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
7904 pmulhrsw m1, m7
7905
7906 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
7907 palignr m3, m2, 14
7908
7909 pmaddubsw m0, m3, [r4] ; [16]
7910 pmulhrsw m0, m7
7911 packuswb m1, m0
7912
7913 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
7914
7915 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
7916 palignr m3, m2, 14
7917
7918 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
7919 pmulhrsw m4, m7
7920
7921 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
7922 palignr m3, m2, 14
7923
7924 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
7925 pmulhrsw m5, m7
7926 packuswb m4, m5
7927
7928 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
7929 pmulhrsw m5, m7
7930
7931 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
7932 palignr m3, m2, 14
7933
7934 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
7935 pmulhrsw m6, m7
7936 packuswb m5, m6
7937
7938 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
7939 palignr m3, m2, 14
7940
7941 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
7942 pmulhrsw m6, m7
7943
7944 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
7945 palignr m3, m2, 14
7946
7947 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
7948 pmulhrsw m1, m7
7949 packuswb m6, m1
7950
7951 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
7952 palignr m3, m2, 14
7953
7954 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
7955 pmulhrsw m1, m7
7956 pmaddubsw m3, [r4 - 16 * 16] ; [00]
7957 pmulhrsw m3, m7
7958 packuswb m1, m3
7959
7960 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
7961
; Second pass: both destination pointers move down by 8 rows
; (r6 currently equals the original r0 + 4 * stride).
7962 lea r0, [r6 + r1 * 4]
7963 lea r6, [r6 + r1 * 8]
7964
7965 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
7966 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
7967 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
7968 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
7969 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
7970
7971 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
7972 pmulhrsw m4, m7
7973
7974 palignr m3, m2, 14
7975
7976 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
7977 pmulhrsw m5, m7
7978 packuswb m4, m5
7979
7980 pslldq m2, 1
7981 palignr m3, m2, 14
7982
7983 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
7984 pmulhrsw m5, m7
7985
7986 pslldq m2, 1
7987 palignr m3, m2, 14
7988
7989 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
7990 pmulhrsw m6, m7
7991 packuswb m5, m6
7992
7993 pslldq m2, 1
7994 palignr m3, m2, 14
7995
7996 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
7997 pmulhrsw m6, m7
7998 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
7999 pmulhrsw m0, m7
8000 packuswb m6, m0
8001
8002 pslldq m2, 1
8003 palignr m3, m2, 14
8004
8005 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
8006 pmulhrsw m1, m7
8007
8008 pslldq m2, 1
8009 palignr m3, m2, 14
8010
8011 pmaddubsw m0, m3, [r4] ; [16]
8012 pmulhrsw m0, m7
8013 packuswb m1, m0
8014
8015 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
8016
8017 pslldq m2, 1
8018 palignr m3, m2, 14
8019
8020 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
8021 pmulhrsw m4, m7
8022
8023 pslldq m2, 1
8024 palignr m3, m2, 14
8025
8026 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
8027 pmulhrsw m5, m7
8028 packuswb m4, m5
8029
8030 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
8031 pmulhrsw m5, m7
8032
8033 pslldq m2, 1
8034 palignr m3, m2, 14
8035
8036 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
8037 pmulhrsw m6, m7
8038 packuswb m5, m6
8039
8040 pslldq m2, 1
8041 palignr m3, m2, 14
8042
8043 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
8044 pmulhrsw m6, m7
8045
8046 pslldq m2, 1
8047 palignr m3, m2, 14
8048
8049 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
8050 pmulhrsw m1, m7
8051 packuswb m6, m1
8052
8053 pslldq m2, 1
8054 palignr m3, m2, 14
8055
8056 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
8057 pmulhrsw m1, m7
8058 pmaddubsw m3, [r4 - 16 * 16] ; [00]
8059 pmulhrsw m3, m7
8060 packuswb m1, m3
8061
8062 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
8063 RET
8064
; intra_pred_ang16_19 (SSE4): angular intra prediction kernel for a 16-pixel-wide block
; (mode 19 going by the function name).
; It uses the same weight sequence and the same c_mode16_17 shuffle as
; intra_pred_ang16_17, but with the two reference arrays swapped: the main samples come
; from [r2] and the shuffled side samples from [r2 + 32]. The store macro is invoked
; with 0 as its second argument, and the second pass moves the destination 8 bytes to
; the right instead of 8 rows down.
; Register roles as used below:
;   r0 - destination pointer; r6 keeps the original r0 so the second half can use r6 + 8
;   r1 - destination stride, r5 = 3 * stride
;   r2 - reference pixel buffer
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] addresses table row 16 + k; the bracketed
;        number in each comment is that row index
;   m7 - pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
8065 INIT_XMM sse4
8066 cglobal intra_pred_ang16_19, 4,7,8
8067 lea r4, [ang_table + 16 * 16] ; r4 -> table row 16, rows 0..31 reachable as [r4 +/- k * 16]
8068 lea r5, [r1 * 3] ; r5 -> 3 * stride
8069 mov r6, r0 ; remember the original destination for the second pass
8070 mova m7, [pw_1024]
8071
8072 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
8073 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
8074 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
8075 movu m2, [r2 + 32]
8076 pinsrb m2, [r2], 0 ; byte 0 of the second array is replaced by [r2]
8077 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
8078 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
8079
8080 pmaddubsw m4, [r4 - 10 * 16] ; [06]
8081 pmulhrsw m4, m7
8082
8083 palignr m3, m2, 15 ; pull the top byte of m2 into the bottom of the window
8084
8085 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
8086 pmulhrsw m5, m7
8087 packuswb m4, m5
8088
8089 palignr m3, m2, 14 ; from here on the window advances by one byte pair per step
8090
8091 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
8092 pmulhrsw m5, m7
8093
8094 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
8095 pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
8096 palignr m3, m2, 14
8097
8098 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
8099 pmulhrsw m6, m7
8100 packuswb m5, m6
8101
8102 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
8103 palignr m3, m2, 14
8104
8105 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
8106 pmulhrsw m6, m7
8107 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
8108 pmulhrsw m0, m7
8109 packuswb m6, m0
8110
8111 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
8112 palignr m3, m2, 14
8113
8114 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
8115 pmulhrsw m1, m7
8116
8117 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
8118 palignr m3, m2, 14
8119
8120 pmaddubsw m0, m3, [r4] ; [16]
8121 pmulhrsw m0, m7
8122 packuswb m1, m0
8123
8124 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
8125
8126 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
8127 palignr m3, m2, 14
8128
8129 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
8130 pmulhrsw m4, m7
8131
8132 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
8133 palignr m3, m2, 14
8134
8135 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
8136 pmulhrsw m5, m7
8137 packuswb m4, m5
8138
8139 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
8140 pmulhrsw m5, m7
8141
8142 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
8143 palignr m3, m2, 14
8144
8145 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
8146 pmulhrsw m6, m7
8147 packuswb m5, m6
8148
8149 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
8150 palignr m3, m2, 14
8151
8152 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
8153 pmulhrsw m6, m7
8154
8155 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
8156 palignr m3, m2, 14
8157
8158 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
8159 pmulhrsw m1, m7
8160 packuswb m6, m1
8161
8162 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
8163 palignr m3, m2, 14
8164
8165 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
8166 pmulhrsw m1, m7
8167 pmaddubsw m3, [r4 - 16 * 16] ; [00]
8168 pmulhrsw m3, m7
8169 packuswb m1, m3
8170
8171 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
8172
; Second pass: same weight sequence, destination moved 8 bytes to the right.
8173 lea r0, [r6 + 8]
8174
8175 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
8176 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
8177 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
8178 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
8179 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
8180
8181 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
8182 pmulhrsw m4, m7
8183
8184 palignr m3, m2, 14
8185
8186 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
8187 pmulhrsw m5, m7
8188 packuswb m4, m5
8189
8190 pslldq m2, 1
8191 palignr m3, m2, 14
8192
8193 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
8194 pmulhrsw m5, m7
8195
8196 pslldq m2, 1
8197 palignr m3, m2, 14
8198
8199 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
8200 pmulhrsw m6, m7
8201 packuswb m5, m6
8202
8203 pslldq m2, 1
8204 palignr m3, m2, 14
8205
8206 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
8207 pmulhrsw m6, m7
8208 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
8209 pmulhrsw m0, m7
8210 packuswb m6, m0
8211
8212 pslldq m2, 1
8213 palignr m3, m2, 14
8214
8215 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
8216 pmulhrsw m1, m7
8217
8218 pslldq m2, 1
8219 palignr m3, m2, 14
8220
8221 pmaddubsw m0, m3, [r4] ; [16]
8222 pmulhrsw m0, m7
8223 packuswb m1, m0
8224
8225 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
8226
8227 pslldq m2, 1
8228 palignr m3, m2, 14
8229
8230 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
8231 pmulhrsw m4, m7
8232
8233 pslldq m2, 1
8234 palignr m3, m2, 14
8235
8236 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
8237 pmulhrsw m5, m7
8238 packuswb m4, m5
8239
8240 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
8241 pmulhrsw m5, m7
8242
8243 pslldq m2, 1
8244 palignr m3, m2, 14
8245
8246 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
8247 pmulhrsw m6, m7
8248 packuswb m5, m6
8249
8250 pslldq m2, 1
8251 palignr m3, m2, 14
8252
8253 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
8254 pmulhrsw m6, m7
8255
8256 pslldq m2, 1
8257 palignr m3, m2, 14
8258
8259 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
8260 pmulhrsw m1, m7
8261 packuswb m6, m1
8262
8263 pslldq m2, 1
8264 palignr m3, m2, 14
8265
8266 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
8267 pmulhrsw m1, m7
8268 pmaddubsw m3, [r4 - 16 * 16] ; [00]
8269 pmulhrsw m3, m7
8270 packuswb m1, m3
8271
8272 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
8273 RET
8274
; intra_pred_ang16_18 (SSE4): 16-row prediction that needs no filtering
; (mode 18 going by the function name).
; No multiply is performed: every output row is a 16-byte window over the 32-byte
; concatenation m0:m1, where m0 = [r2] and m1 = [r2 + 32] permuted by c_mode16_18.
; Row 0 is m0 itself; row k (1..15) is palignr(m0, m1, 16 - k), i.e. the window moves
; one byte further into m1 for each row.
; Registers: r0 = destination, r1 = stride. After the two loads r2 is no longer needed
; as a source pointer and is reused, together with r3 and r4, for 2/3/4 * stride.
8275 INIT_XMM sse4
8276 cglobal intra_pred_ang16_18, 4,5,3
8277 movu m0, [r2]
8278 movu m1, [r2 + 32]
8279 mova m2, [c_mode16_18] ; m2 is only a temporary for the shuffle mask here
8280 pshufb m1, m2
8281
8282 lea r2, [r1 * 2] ; r2 = 2 * stride (source pointer is dead from here on)
8283 lea r3, [r1 * 3] ; r3 = 3 * stride
8284 lea r4, [r1 * 4] ; r4 = 4 * stride, used to step r0 to the next group of four rows
8285 movu [r0], m0 ; row 0
8286 palignr m2, m0, m1, 15
8287 movu [r0 + r1], m2 ; row 1
8288 palignr m2, m0, m1, 14
8289 movu [r0 + r2], m2 ; row 2
8290 palignr m2, m0, m1, 13
8291 movu [r0 + r3], m2 ; row 3
8292 lea r0, [r0 + r4]
8293 palignr m2, m0, m1, 12
8294 movu [r0], m2 ; row 4
8295 palignr m2, m0, m1, 11
8296 movu [r0 + r1], m2 ; row 5
8297 palignr m2, m0, m1, 10
8298 movu [r0 + r2], m2 ; row 6
8299 palignr m2, m0, m1, 9
8300 movu [r0 + r3], m2 ; row 7
8301 lea r0, [r0 + r4]
8302 palignr m2, m0, m1, 8
8303 movu [r0], m2 ; row 8
8304 palignr m2, m0, m1, 7
8305 movu [r0 + r1], m2 ; row 9
8306 palignr m2, m0, m1, 6
8307 movu [r0 + r2], m2 ; row 10
8308 palignr m2, m0, m1, 5
8309 movu [r0 + r3], m2 ; row 11
8310 lea r0, [r0 + r4]
8311 palignr m2, m0, m1, 4
8312 movu [r0], m2 ; row 12
8313 palignr m2, m0, m1, 3
8314 movu [r0 + r1], m2 ; row 13
8315 palignr m2, m0, m1, 2
8316 movu [r0 + r2], m2 ; row 14
8317 palignr m0, m1, 1 ; last row may overwrite m0 directly
8318 movu [r0 + r3], m0 ; row 15
8319 RET
8320
8321 ; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
; Parameters:
;   %1       column-group index; only used by the transposed store, as a byte offset %1 * 8
;   %2       1 = transpose the 8x8 tile before storing, anything else = store rows as-is
;   %3..%10  table row selectors c0..c7 for m0..m7; the weights are read from
;            [r4 + cN * 16]
; For a non-zero selector the register is shuffled with the mask at [r3], multiplied
; with pmaddubsw and rounded with pmulhrsw against pw_1024. A selector of 0 means the
; low 8 bytes of that register are passed through unfiltered: an even register
; (m0/m2/m4/m6) is left untouched and its partner is merged in with movlhps, an odd
; register (m1/m3/m5/m7) is zero-extended so that the following packuswb restores it.
; After the first pair m1 is free and is reloaded with pw_1024 for the remaining pairs.
; Stores: the transposed path writes 4 rows at r0 and 4 rows at r6 (r5 = 3 * stride is
; expected from the caller); the direct path writes 8 rows at r0 and leaves r0 advanced
; by 4 * r1.
8322 %macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
8323 %if %3 == 0
8324 %else
8325 pshufb m0, [r3]
8326 pmaddubsw m0, [r4 + %3 * 16]
8327 pmulhrsw m0, [pw_1024]
8328 %endif
8329 %if %4 == 0
8330 pmovzxbw m1, m1 ; widen so packuswb below reproduces the raw bytes
8331 %else
8332 pshufb m1, [r3]
8333 pmaddubsw m1, [r4 + %4 * 16]
8334 pmulhrsw m1, [pw_1024]
8335 %endif
8336 %if %3 == 0
8337 packuswb m1, m1
8338 movlhps m0, m1 ; keep raw low half of m0, append packed m1
8339 %else
8340 packuswb m0, m1
8341 %endif
8342 mova m1, [pw_1024] ; m1 is free now: use it as the rounding multiplier
8343 %if %5 == 0
8344 %else
8345 pshufb m2, [r3]
8346 pmaddubsw m2, [r4 + %5 * 16]
8347 pmulhrsw m2, m1
8348 %endif
8349 %if %6 == 0
8350 pmovzxbw m3, m3
8351 %else
8352 pshufb m3, [r3]
8353 pmaddubsw m3, [r4 + %6 * 16]
8354 pmulhrsw m3, m1
8355 %endif
8356 %if %5 == 0
8357 packuswb m3, m3
8358 movlhps m2, m3
8359 %else
8360 packuswb m2, m3
8361 %endif
8362 %if %7 == 0
8363 %else
8364 pshufb m4, [r3]
8365 pmaddubsw m4, [r4 + %7 * 16]
8366 pmulhrsw m4, m1
8367 %endif
8368 %if %8 == 0
8369 pmovzxbw m5, m5
8370 %else
8371 pshufb m5, [r3]
8372 pmaddubsw m5, [r4 + %8 * 16]
8373 pmulhrsw m5, m1
8374 %endif
8375 %if %7 == 0
8376 packuswb m5, m5
8377 movlhps m4, m5
8378 %else
8379 packuswb m4, m5
8380 %endif
8381 %if %9 == 0
8382 %else
8383 pshufb m6, [r3]
8384 pmaddubsw m6, [r4 + %9 * 16]
8385 pmulhrsw m6, m1
8386 %endif
8387 %if %10 == 0
8388 pmovzxbw m7, m7
8389 %else
8390 pshufb m7, [r3]
8391 pmaddubsw m7, [r4 + %10 * 16]
8392 pmulhrsw m7, m1
8393 %endif
8394 %if %9 == 0
8395 packuswb m7, m7
8396 movlhps m6, m7
8397 %else
8398 packuswb m6, m7
8399 %endif
8400
8401 %if %2 == 1
8402 ; transpose
8403 punpckhbw m1, m0, m2
8404 punpcklbw m0, m2
8405 punpckhbw m3, m0, m1
8406 punpcklbw m0, m1
8407
8408 punpckhbw m1, m4, m6
8409 punpcklbw m4, m6
8410 punpckhbw m6, m4, m1
8411 punpcklbw m4, m1
8412
8413 punpckhdq m2, m0, m4
8414 punpckldq m0, m4
8415 punpckldq m4, m3, m6
8416 punpckhdq m3, m6
8417
; transposed rows 0-3 go to r0, rows 4-7 to r6, all at byte offset %1 * 8
8418 movh [r0 + + %1 * 8], m0
8419 movhps [r0 + r1 + %1 * 8], m0
8420 movh [r0 + r1*2 + %1 * 8], m2
8421 movhps [r0 + r5 + %1 * 8], m2
8422 movh [r6 + %1 * 8], m4
8423 movhps [r6 + r1 + %1 * 8], m4
8424 movh [r6 + r1*2 + %1 * 8], m3
8425 movhps [r6 + r5 + %1 * 8], m3
8426 %else
; direct path: each register holds two rows (low half, high half); %1 is not used
8427 movh [r0 ], m0
8428 movhps [r0 + r1 ], m0
8429 movh [r0 + r1 * 2], m2
8430 movhps [r0 + r5 ], m2
8431 lea r0, [r0 + r1 * 4] ; r0 stays advanced by 4 rows after the macro
8432 movh [r0 ], m4
8433 movhps [r0 + r1 ], m4
8434 movh [r0 + r1 * 2], m6
8435 movhps [r0 + r5 ], m6
8436 %endif
8437 %endmacro
8438
; MODE_3_33: body shared by two 32-wide prediction modes (3 and 33 going by the name).
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;      (defined elsewhere; presumably the transpose switch - confirm against the macro).
; Expected from the invoking code (not visible here): r2 = reference pixels,
; r3 = weight table base such that [r3 + k * 16] is row 16 + k (the bracketed number
; in each comment), m7 = pmulhrsw rounding multiplier.
; Four groups of eight result vectors are produced from reference windows starting at
; r2 + 1, r2 + 8, r2 + 14 and r2 + 21. A row annotated [00] is not filtered: its
; 8 bytes are loaded straight from the reference with movhps.
8439 %macro MODE_3_33 1
8440 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
8441 palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
8442 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
8443 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
8444 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
8445 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
8446 pmulhrsw m4, m7
8447 pmaddubsw m1, [r3 + 4 * 16] ; [20]
8448 pmulhrsw m1, m7
8449 packuswb m4, m1
8450 palignr m5, m2, m0, 4
8451 pmaddubsw m5, [r3 - 2 * 16] ; [14]
8452 pmulhrsw m5, m7
8453 palignr m6, m2, m0, 6
8454 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
8455 pmulhrsw m6, m7
8456 packuswb m5, m6
8457 palignr m1, m2, m0, 8
8458 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
8459 pmulhrsw m6, m7
8460 pmaddubsw m1, [r3 + 12 * 16] ; [28]
8461 pmulhrsw m1, m7
8462 packuswb m6, m1
8463 palignr m1, m2, m0, 10
8464 pmaddubsw m1, [r3 + 6 * 16] ; [22]
8465 pmulhrsw m1, m7
8466 palignr m2, m0, 12
8467 pmaddubsw m2, [r3] ; [16]
8468 pmulhrsw m2, m7
8469 packuswb m1, m2
8470
8471 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
8472
; second group: window restarts at r2 + 8
8473 movu m0, [r2 + 8]
8474 palignr m1, m0, 1
8475 punpckhbw m2, m0, m1
8476 punpcklbw m0, m1
8477 palignr m5, m2, m0, 2
8478 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
8479 pmulhrsw m4, m7
8480 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
8481 pmulhrsw m1, m7
8482 packuswb m4, m1
8483 pmaddubsw m5, [r3 + 14 * 16] ; [30]
8484 pmulhrsw m5, m7
8485 palignr m6, m2, m0, 4
8486 pmaddubsw m6, [r3 + 8 * 16] ; [24]
8487 pmulhrsw m6, m7
8488 packuswb m5, m6
8489 palignr m1, m2, m0, 6
8490 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
8491 pmulhrsw m6, m7
8492 palignr m1, m2, m0, 8
8493 pmaddubsw m1, [r3 - 4 * 16] ; [12]
8494 pmulhrsw m1, m7
8495 packuswb m6, m1
8496 palignr m1, m2, m0, 10
8497 pmaddubsw m1, [r3 - 10 * 16] ; [06]
8498 pmulhrsw m1, m7
8499 packuswb m1, m1
8500 movhps m1, [r2 + 14] ; [00]
8501
8502 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
8503
; third group: window restarts at r2 + 14, same weights as the first group
8504 movu m0, [r2 + 14]
8505 palignr m1, m0, 1
8506 punpckhbw m2, m0, m1
8507 punpcklbw m0, m1
8508 palignr m1, m2, m0, 2
8509 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
8510 pmulhrsw m4, m7
8511 pmaddubsw m1, [r3 + 4 * 16] ; [20]
8512 pmulhrsw m1, m7
8513 packuswb m4, m1
8514 palignr m5, m2, m0, 4
8515 pmaddubsw m5, [r3 - 2 * 16] ; [14]
8516 pmulhrsw m5, m7
8517 palignr m6, m2, m0, 6
8518 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
8519 pmulhrsw m6, m7
8520 packuswb m5, m6
8521 palignr m1, m2, m0, 8
8522 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
8523 pmulhrsw m6, m7
8524 pmaddubsw m1, [r3 + 12 * 16] ; [28]
8525 pmulhrsw m1, m7
8526 packuswb m6, m1
8527 palignr m1, m2, m0, 10
8528 pmaddubsw m1, [r3 + 6 * 16] ; [22]
8529 pmulhrsw m1, m7
8530 palignr m2, m0, 12
8531 pmaddubsw m2, [r3] ; [16]
8532 pmulhrsw m2, m7
8533 packuswb m1, m2
8534
8535 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
8536
; fourth group: window restarts at r2 + 21, same weights as the second group
8537 movu m0, [r2 + 21]
8538 palignr m1, m0, 1
8539 punpckhbw m2, m0, m1
8540 punpcklbw m0, m1
8541 palignr m5, m2, m0, 2
8542 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
8543 pmulhrsw m4, m7
8544 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
8545 pmulhrsw m1, m7
8546 packuswb m4, m1
8547 pmaddubsw m5, [r3 + 14 * 16] ; [30]
8548 pmulhrsw m5, m7
8549 palignr m6, m2, m0, 4
8550 pmaddubsw m6, [r3 + 8 * 16] ; [24]
8551 pmulhrsw m6, m7
8552 packuswb m5, m6
8553 palignr m1, m2, m0, 6
8554 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
8555 pmulhrsw m6, m7
8556 palignr m1, m2, m0, 8
8557 pmaddubsw m1, [r3 - 4 * 16] ; [12]
8558 pmulhrsw m1, m7
8559 packuswb m6, m1
8560 palignr m1, m2, m0, 10
8561 pmaddubsw m1, [r3 - 10 * 16] ; [06]
8562 pmulhrsw m1, m7
8563 packuswb m1, m1
8564 movhps m1, [r2 + 27] ; [00]
8565
8566 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
8567 %endmacro
8568
; MODE_4_32: body shared by two 32-wide prediction modes (4 and 32 going by the name).
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;      (defined elsewhere; presumably the transpose switch - confirm against the macro).
; Expected from the invoking code (not visible here): r2 = reference pixels,
; r3 = weight table base such that [r3 + k * 16] is row 16 + k (the bracketed number
; in each comment), m7 = pmulhrsw rounding multiplier.
; Consecutive weights differ by 21 modulo 32. Reference windows are reloaded at
; r2 + 1, r2 + 6, r2 + 12 and r2 + 17; m2 from the first group is still live when the
; second group starts (weight 29). The final [00] row is loaded unfiltered.
8569 %macro MODE_4_32 1
8570 movu m0, [r2 + 1]
8571 palignr m1, m0, 1
8572 punpckhbw m2, m0, m1
8573 punpcklbw m0, m1 ; m2:m0 = interleaved neighbouring byte pairs
8574 palignr m1, m2, m0, 2
8575 mova m5, m1 ; same window is needed for weights 10 and 31
8576 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
8577 pmulhrsw m4, m7
8578 pmaddubsw m1, [r3 - 6 * 16] ; [10]
8579 pmulhrsw m1, m7
8580 packuswb m4, m1
8581 pmaddubsw m5, [r3 + 15 * 16] ; [31]
8582 pmulhrsw m5, m7
8583 palignr m6, m2, m0, 4
8584 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
8585 pmulhrsw m6, m7
8586 packuswb m5, m6
8587 palignr m1, m2, m0, 6
8588 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
8589 pmulhrsw m6, m7
8590 pmaddubsw m1, [r3 + 14 * 16] ; [30]
8591 pmulhrsw m1, m7
8592 packuswb m6, m1
8593 palignr m1, m2, m0, 8
8594 pmaddubsw m1, [r3 + 3 * 16] ; [19]
8595 pmulhrsw m1, m7
8596 palignr m2, m0, 10 ; m2 is kept: it is reused for weight 29 below
8597 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
8598 pmulhrsw m3, m7
8599 packuswb m1, m3
8600
8601 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
8602
8603 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
8604 pmulhrsw m4, m7
8605 movu m0, [r2 + 6]
8606 palignr m1, m0, 1
8607 punpckhbw m2, m0, m1
8608 punpcklbw m0, m1
8609 palignr m1, m2, m0, 2
8610 pmaddubsw m1, [r3 + 2 * 16] ; [18]
8611 pmulhrsw m1, m7
8612 packuswb m4, m1
8613 palignr m5, m2, m0, 4
8614 mova m6, m5
8615 pmaddubsw m5, [r3 - 9 * 16] ; [07]
8616 pmulhrsw m5, m7
8617 pmaddubsw m6, [r3 + 12 * 16] ; [28]
8618 pmulhrsw m6, m7
8619 packuswb m5, m6
8620 palignr m6, m2, m0, 6
8621 pmaddubsw m6, [r3 + 16] ; [17]
8622 pmulhrsw m6, m7
8623 palignr m1, m2, m0, 8
8624 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
8625 pmulhrsw m3, m7
8626 packuswb m6, m3
8627 pmaddubsw m1, [r3 + 11 * 16] ; [27]
8628 pmulhrsw m1, m7
8629 palignr m2, m0, 10
8630 pmaddubsw m2, [r3] ; [16]
8631 pmulhrsw m2, m7
8632 packuswb m1, m2
8633
8634 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
8635
8636 movu m0, [r2 + 12]
8637 palignr m1, m0, 1
8638 punpckhbw m2, m0, m1
8639 punpcklbw m0, m1
8640 mova m1, m0
8641 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
8642 pmulhrsw m4, m7
8643 pmaddubsw m1, [r3 + 10 * 16] ; [26]
8644 pmulhrsw m1, m7
8645 packuswb m4, m1
8646 palignr m5, m2, m0, 2
8647 pmaddubsw m5, [r3 - 16] ; [15]
8648 pmulhrsw m5, m7
8649 palignr m6, m2, m0, 4
8650 mova m1, m6
8651 pmaddubsw m1, [r3 - 12 * 16] ; [4]
8652 pmulhrsw m1, m7
8653 packuswb m5, m1
8654 pmaddubsw m6, [r3 + 9 * 16] ; [25]
8655 pmulhrsw m6, m7
8656 palignr m1, m2, m0, 6
8657 pmaddubsw m1, [r3 - 2 * 16] ; [14]
8658 pmulhrsw m1, m7
8659 packuswb m6, m1
8660 palignr m1, m2, m0, 8
8661 mova m2, m1
8662 pmaddubsw m1, [r3 - 13 * 16] ; [3]
8663 pmulhrsw m1, m7
8664 pmaddubsw m2, [r3 + 8 * 16] ; [24]
8665 pmulhrsw m2, m7
8666 packuswb m1, m2
8667
8668 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
8669
8670 movu m0, [r2 + 17]
8671 palignr m1, m0, 1
8672 punpckhbw m2, m0, m1
8673 punpcklbw m0, m1
8674 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
8675 pmulhrsw m4, m7
8676 palignr m5, m2, m0, 2
8677 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
8678 pmulhrsw m1, m7
8679 packuswb m4, m1
8680 pmaddubsw m5, [r3 + 7 * 16] ; [23]
8681 pmulhrsw m5, m7
8682 palignr m6, m2, m0, 4
8683 pmaddubsw m6, [r3 - 4 * 16] ; [12]
8684 pmulhrsw m6, m7
8685 packuswb m5, m6
8686 palignr m6, m2, m0, 6
8687 mova m1, m6
8688 pmaddubsw m6, [r3 - 15 * 16] ; [1]
8689 pmulhrsw m6, m7
8690 pmaddubsw m1, [r3 + 6 * 16] ; [22]
8691 pmulhrsw m1, m7
8692 packuswb m6, m1
8693 palignr m1, m2, m0, 8
8694 pmaddubsw m1, [r3 - 5 * 16] ; [11]
8695 pmulhrsw m1, m7
8696 packuswb m1, m1
8697 movhps m1, [r2 + 22] ; [00]
8698
8699 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
8700 %endmacro
8701
; MODE_5_31: body shared by two 32-wide prediction modes (5 and 31 going by the name).
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;      (defined elsewhere; presumably the transpose switch - confirm against the macro).
; Expected from the invoking code (not visible here): r2 = reference pixels,
; r3 = weight table base such that [r3 + k * 16] is row 16 + k (the bracketed number
; in each comment), m7 = pmulhrsw rounding multiplier.
; Consecutive weights differ by 17 modulo 32. Reference windows are reloaded at
; r2 + 1, r2 + 5, r2 + 10 and r2 + 14. The final [00] row is loaded unfiltered.
8702 %macro MODE_5_31 1
8703 movu m0, [r2 + 1]
8704 palignr m1, m0, 1
8705 punpckhbw m2, m0, m1
8706 punpcklbw m0, m1 ; m2:m0 = interleaved neighbouring byte pairs
8707 palignr m1, m2, m0, 2
8708 mova m5, m1 ; same window is needed for weights 2 and 19
8709 pmaddubsw m4, m0, [r3 + 16] ; [17]
8710 pmulhrsw m4, m7
8711 pmaddubsw m1, [r3 - 14 * 16] ; [2]
8712 pmulhrsw m1, m7
8713 packuswb m4, m1
8714 pmaddubsw m5, [r3 + 3 * 16] ; [19]
8715 pmulhrsw m5, m7
8716 palignr m6, m2, m0, 4
8717 mova m1, m6
8718 pmaddubsw m6, [r3 - 12 * 16] ; [4]
8719 pmulhrsw m6, m7
8720 packuswb m5, m6
8721 pmaddubsw m6, m1, [r3 + 5 * 16] ; [21]
8722 pmulhrsw m6, m7
8723 palignr m1, m2, m0, 6
8724 mova m3, m1
8725 pmaddubsw m3, [r3 - 10 * 16] ; [6]
8726 pmulhrsw m3, m7
8727 packuswb m6, m3
8728 pmaddubsw m1, [r3 + 7 * 16] ; [23]
8729 pmulhrsw m1, m7
8730 palignr m2, m0, 8
8731 pmaddubsw m2, [r3 - 8 * 16] ; [8]
8732 pmulhrsw m2, m7
8733 packuswb m1, m2
8734
8735 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
8736
8737 movu m0, [r2 + 5]
8738 palignr m1, m0, 1
8739 punpckhbw m2, m0, m1
8740 punpcklbw m0, m1
8741 palignr m1, m2, m0, 2
8742 mova m5, m1
8743 pmaddubsw m4, m0, [r3 + 9 * 16] ; [25]
8744 pmulhrsw m4, m7
8745 pmaddubsw m1, [r3 - 6 * 16] ; [10]
8746 pmulhrsw m1, m7
8747 packuswb m4, m1
8748 pmaddubsw m5, [r3 + 11 * 16] ; [27]
8749 pmulhrsw m5, m7
8750 palignr m6, m2, m0, 4
8751 mova m1, m6
8752 pmaddubsw m6, [r3 - 4 * 16] ; [12]
8753 pmulhrsw m6, m7
8754 packuswb m5, m6
8755 pmaddubsw m6, m1, [r3 + 13 * 16] ; [29]
8756 pmulhrsw m6, m7
8757 palignr m1, m2, m0, 6
8758 mova m3, m1
8759 pmaddubsw m3, [r3 - 2 * 16] ; [14]
8760 pmulhrsw m3, m7
8761 packuswb m6, m3
8762 pmaddubsw m1, [r3 + 15 * 16] ; [31]
8763 pmulhrsw m1, m7
8764 palignr m2, m0, 8
8765 pmaddubsw m2, [r3] ; [16]
8766 pmulhrsw m2, m7
8767 packuswb m1, m2
8768
8769 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
8770
8771 movu m0, [r2 + 10]
8772 palignr m1, m0, 1
8773 punpckhbw m2, m0, m1
8774 punpcklbw m0, m1
8775 mova m1, m0
8776 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
8777 pmulhrsw m4, m7
8778 pmaddubsw m1, [r3 + 2 * 16] ; [18]
8779 pmulhrsw m1, m7
8780 packuswb m4, m1
8781 palignr m5, m2, m0, 2
8782 mova m1, m5
8783 pmaddubsw m5, [r3 - 13 * 16] ; [3]
8784 pmulhrsw m5, m7
8785 pmaddubsw m1, [r3 + 4 * 16] ; [20]
8786 pmulhrsw m1, m7
8787 packuswb m5, m1
8788 palignr m1, m2, m0, 4
8789 pmaddubsw m6, m1, [r3 - 11 * 16] ; [5]
8790 pmulhrsw m6, m7
8791 pmaddubsw m1, [r3 + 6 * 16] ; [22]
8792 pmulhrsw m1, m7
8793 packuswb m6, m1
8794 palignr m2, m0, 6
8795 pmaddubsw m1, m2, [r3 - 9 * 16] ; [7]
8796 pmulhrsw m1, m7
8797 pmaddubsw m2, [r3 + 8 * 16] ; [24]
8798 pmulhrsw m2, m7
8799 packuswb m1, m2
8800
8801 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
8802
8803 movu m0, [r2 + 14]
8804 palignr m1, m0, 1
8805 punpckhbw m2, m0, m1
8806 punpcklbw m0, m1
8807 mova m1, m0
8808 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
8809 pmulhrsw m4, m7
8810 pmaddubsw m1, [r3 + 10 * 16] ; [26]
8811 pmulhrsw m1, m7
8812 packuswb m4, m1
8813 palignr m5, m2, m0, 2
8814 mova m1, m5
8815 pmaddubsw m5, [r3 - 5 * 16] ; [11]
8816 pmulhrsw m5, m7
8817 pmaddubsw m1, [r3 + 12 * 16] ; [28]
8818 pmulhrsw m1, m7
8819 packuswb m5, m1
8820 palignr m1, m2, m0, 4
8821 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
8822 pmulhrsw m6, m7
8823 pmaddubsw m1, [r3 + 14 * 16] ; [30]
8824 pmulhrsw m1, m7
8825 packuswb m6, m1
8826 palignr m2, m0, 6
8827 pmaddubsw m1, m2, [r3 - 16] ; [15]
8828 pmulhrsw m1, m7
8829 packuswb m1, m1
8830 movhps m1, [r2 + 18] ; [00]
8831
8832 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
8833 %endmacro
8834
; MODE_6_30: body shared by two 32-wide prediction modes (6 and 30 going by the name).
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;      (defined elsewhere; presumably the transpose switch - confirm against the macro).
; Expected from the invoking code (not visible here): r2 = reference pixels,
; r3 = weight table base such that [r3 + k * 16] is row 16 + k (the bracketed number
; in each comment), m7 = pmulhrsw rounding multiplier.
; Consecutive weights differ by 13 modulo 32. Reference windows are reloaded at
; r2 + 1, r2 + 5, r2 + 7 and r2 + 11; m2 is carried across the first two
; TRANSPOSE_STORE_8x8 calls (weights 21 and 29 reuse the previous window). The final
; [00] row is loaded unfiltered.
8835 %macro MODE_6_30 1
8836 movu m0, [r2 + 1]
8837 palignr m1, m0, 1
8838 punpckhbw m2, m0, m1
8839 punpcklbw m0, m1 ; m2:m0 = interleaved neighbouring byte pairs
8840 mova m1, m0
8841 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
8842 pmulhrsw m4, m7
8843 pmaddubsw m1, [r3 + 10 * 16] ; [26]
8844 pmulhrsw m1, m7
8845 packuswb m4, m1
8846 palignr m6, m2, m0, 2
8847 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
8848 pmulhrsw m5, m7
8849 pmaddubsw m6, [r3 + 4 * 16] ; [20]
8850 pmulhrsw m6, m7
8851 packuswb m5, m6
8852 palignr m1, m2, m0, 4
8853 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
8854 pmulhrsw m6, m7
8855 pmaddubsw m3, m1, [r3 - 2 * 16] ; [14]
8856 pmulhrsw m3, m7
8857 packuswb m6, m3
8858 pmaddubsw m1, [r3 + 11 * 16] ; [27]
8859 pmulhrsw m1, m7
8860 palignr m2, m0, 6 ; m2 is kept: it is reused for weight 21 below
8861 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
8862 pmulhrsw m3, m7
8863 packuswb m1, m3
8864
8865 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
8866
8867 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
8868 pmulhrsw m4, m7
8869 movu m0, [r2 + 5]
8870 palignr m1, m0, 1
8871 punpckhbw m2, m0, m1
8872 punpcklbw m0, m1
8873 mova m6, m0
8874 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
8875 pmulhrsw m1, m7
8876 packuswb m4, m1
8877 pmaddubsw m5, m6, [r3 - 16] ; [15]
8878 pmulhrsw m5, m7
8879 pmaddubsw m6, [r3 + 12 * 16] ; [28]
8880 pmulhrsw m6, m7
8881 packuswb m5, m6
8882 palignr m3, m2, m0, 2
8883 pmaddubsw m6, m3, [r3 - 7 * 16] ; [9]
8884 pmulhrsw m6, m7
8885 pmaddubsw m3, [r3 + 6 * 16] ; [22]
8886 pmulhrsw m3, m7
8887 packuswb m6, m3
8888 palignr m2, m0, 4 ; m2 is kept: it is reused for weight 29 below
8889 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
8890 pmulhrsw m1, m7
8891 pmaddubsw m3, m2, [r3] ; [16]
8892 pmulhrsw m3, m7
8893 packuswb m1, m3
8894
8895 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
8896
8897 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
8898 pmulhrsw m4, m7
8899 movu m0, [r2 + 7]
8900 palignr m1, m0, 1
8901 punpckhbw m2, m0, m1
8902 punpcklbw m0, m1
8903 palignr m5, m2, m0, 2
8904 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
8905 pmulhrsw m1, m7
8906 packuswb m4, m1
8907 pmaddubsw m5, [r3 + 7 * 16] ; [23]
8908 pmulhrsw m5, m7
8909 palignr m1, m2, m0, 4
8910 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
8911 pmulhrsw m6, m7
8912 packuswb m5, m6
8913 pmaddubsw m6, m1, [r3 + 16] ; [17]
8914 pmulhrsw m6, m7
8915 pmaddubsw m1, [r3 + 14 * 16] ; [30]
8916 pmulhrsw m1, m7
8917 packuswb m6, m1
8918 palignr m2, m2, m0, 6
8919 pmaddubsw m1, m2, [r3 - 5 * 16] ; [11]
8920 pmulhrsw m1, m7
8921 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
8922 pmulhrsw m2, m7
8923 packuswb m1, m2
8924
8925 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
8926
8927 movu m0, [r2 + 11]
8928 palignr m1, m0, 1
8929 punpckhbw m2, m0, m1
8930 punpcklbw m0, m1
8931 mova m5, m0
8932 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
8933 pmulhrsw m4, m7
8934 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
8935 pmulhrsw m3, m7
8936 packuswb m4, m3
8937 pmaddubsw m5, [r3 + 15 * 16] ; [31]
8938 pmulhrsw m5, m7
8939 palignr m6, m2, m0, 2
8940 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
8941 pmulhrsw m1, m7
8942 packuswb m5, m1
8943 pmaddubsw m6, [r3 + 9 * 16] ; [25]
8944 pmulhrsw m6, m7
8945 palignr m1, m2, m0, 4
8946 pmaddubsw m2, m1, [r3 - 10 * 16] ; [6]
8947 pmulhrsw m2, m7
8948 packuswb m6, m2
8949 pmaddubsw m1, [r3 + 3 * 16] ; [19]
8950 pmulhrsw m1, m7
8951 packuswb m1, m1
8952 movhps m1, [r2 + 14] ; [00]
8953
8954 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
8955 %endmacro
8956
;-----------------------------------------------------------------------------
; MODE_7_29 %1
; Interpolates 32 lines of 8 pixels each and hands them to TRANSPOSE_STORE_8x8
; in four groups (index 0..3) of eight lines; every xmm result register holds
; two lines (packuswb of two 8-word results).
; In:   r2 = reference pixel pointer
;       r3 = weight table base; the callers in this file pass
;            ang_table + 16 * 16, so [r3 + k * 16] addresses table row 16 + k
;       m7 = pmulhrsw multiplier (callers load pw_1024, i.e. a rounded >> 5
;            if pw_1024 is eight words of 1024 -- defined outside this chunk)
; %1 is forwarded unchanged to TRANSPOSE_STORE_8x8 (defined outside this chunk).
; The bracketed number after each pmaddubsw is the table row it selects; the
; rows advance by 9 (mod 32) from one line to the next.
; Writes m0-m6 (TRANSPOSE_STORE_8x8 may clobber more -- not visible here).
;-----------------------------------------------------------------------------
8957 %macro MODE_7_29 1
; m0 = byte pairs (p[i], p[i+1]) for i = 0..7 of the 16 bytes at r2 + 1,
; m2 = the same pairs for i = 8..15 (last pair's second byte is undefined)
8958 movu m0, [r2 + 1]
8959 palignr m1, m0, 1
8960 punpckhbw m2, m0, m1
8961 punpcklbw m0, m1
8962 mova m5, m0
8963 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
8964 pmulhrsw m4, m7
8965 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
8966 pmulhrsw m3, m7
8967 packuswb m4, m3
8968 pmaddubsw m5, [r3 + 11 * 16] ; [27]
8969 pmulhrsw m5, m7
; m1 = pairs advanced by one pixel, m2 = pairs advanced by two pixels
8970 palignr m1, m2, m0, 2
8971 palignr m2, m0, 4
8972 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
8973 pmulhrsw m6, m7
8974 packuswb m5, m6
8975 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
8976 pmulhrsw m6, m7
8977 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
8978 pmulhrsw m0, m7
8979 packuswb m6, m0
8980 pmaddubsw m1, [r3 + 15 * 16] ; [31]
8981 pmulhrsw m1, m7
8982 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
8983 pmulhrsw m0, m7
8984 packuswb m1, m0
8985
; group 0: lines 0-7
8986 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
8987
; m2 (pairs advanced by two pixels) is still live from the group above
8988 pmaddubsw m4, m2, [r3 + 16] ; [17]
8989 pmulhrsw m4, m7
8990 pmaddubsw m2, [r3 + 10 * 16] ; [26]
8991 pmulhrsw m2, m7
8992 packuswb m4, m2
; rebuild the pairs three bytes further on (r2 + 4)
8993 movu m0, [r2 + 4]
8994 palignr m1, m0, 1
8995 punpckhbw m2, m0, m1
8996 punpcklbw m0, m1
8997 palignr m2, m0, 2
8998 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
8999 pmulhrsw m5, m7
9000 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
9001 pmulhrsw m6, m7
9002 packuswb m5, m6
9003 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
9004 pmulhrsw m6, m7
9005 pmaddubsw m0, [r3 + 14 * 16] ; [30]
9006 pmulhrsw m0, m7
9007 packuswb m6, m0
9008 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
9009 pmulhrsw m1, m7
9010 pmaddubsw m3, m2, [r3] ; [16]
9011 pmulhrsw m3, m7
9012 packuswb m1, m3
9013
; group 1: lines 8-15
9014 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
9015
9016 pmaddubsw m4, m2, [r3 + 9 * 16] ; [25]
9017 pmulhrsw m4, m7
; rebuild the pairs from r2 + 6
9018 movu m0, [r2 + 6]
9019 palignr m1, m0, 1
9020 punpckhbw m2, m0, m1
9021 punpcklbw m0, m1
9022 palignr m2, m0, 2
9023 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
9024 pmulhrsw m1, m7
9025 packuswb m4, m1
9026 pmaddubsw m5, m0, [r3 - 5 * 16] ; [11]
9027 pmulhrsw m5, m7
9028 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
9029 pmulhrsw m6, m7
9030 packuswb m5, m6
9031 pmaddubsw m6, m0, [r3 + 13 * 16] ; [29]
9032 pmulhrsw m6, m7
9033 pmaddubsw m1, m2, [r3 - 10 * 16] ; [6]
9034 pmulhrsw m1, m7
9035 packuswb m6, m1
9036 pmaddubsw m1, m2, [r3 - 16] ; [15]
9037 pmulhrsw m1, m7
9038 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
9039 pmulhrsw m2, m7
9040 packuswb m1, m2
9041
; group 2: lines 16-23
9042 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
9043
; rebuild the pairs from r2 + 8
9044 movu m0, [r2 + 8]
9045 palignr m1, m0, 1
9046 punpckhbw m2, m0, m1
9047 punpcklbw m0, m1
9048 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
9049 pmulhrsw m4, m7
9050 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
9051 pmulhrsw m3, m7
9052 packuswb m4, m3
9053 pmaddubsw m5, m0, [r3 + 3 * 16] ; [19]
9054 pmulhrsw m5, m7
9055 pmaddubsw m6, m0, [r3 + 12 * 16] ; [28]
9056 pmulhrsw m6, m7
9057 packuswb m5, m6
9058 palignr m2, m0, 2
9059 pmaddubsw m6, m2, [r3 - 11 * 16] ; [5]
9060 pmulhrsw m6, m7
9061 pmaddubsw m0, m2, [r3 - 2 * 16] ; [14]
9062 pmulhrsw m0, m7
9063 packuswb m6, m0
9064 pmaddubsw m1, m2, [r3 + 7 * 16] ; [23]
9065 pmulhrsw m1, m7
9066 packuswb m1, m1
; last line uses row 0: the 8 reference bytes at r2 + 10 are loaded straight
; into the high half (presumably row 0 is a plain copy -- table not visible here)
9067 movhps m1, [r2 + 10] ; [0]
9068
; group 3: lines 24-31
9069 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
9070 %endmacro
9071
;-----------------------------------------------------------------------------
; MODE_8_28 %1
; Interpolates 32 lines of 8 pixels each and hands them to TRANSPOSE_STORE_8x8
; in four groups (index 0..3) of eight lines; every xmm result register holds
; two lines (packuswb of two 8-word results).
; In:   r2 = reference pixel pointer
;       r3 = weight table base; the callers in this file pass
;            ang_table + 16 * 16, so [r3 + k * 16] addresses table row 16 + k
;       m7 = pmulhrsw multiplier (callers load pw_1024)
; %1 is forwarded unchanged to TRANSPOSE_STORE_8x8 (defined outside this chunk).
; The bracketed number after each pmaddubsw is the table row it selects; the
; rows advance by 5 (mod 32) from one line to the next.
; Writes m0-m6 (TRANSPOSE_STORE_8x8 may clobber more -- not visible here).
;-----------------------------------------------------------------------------
9072 %macro MODE_8_28 1
; m0 = byte pairs (p[i], p[i+1]) for i = 0..7 of the 16 bytes at r2 + 1,
; m2 = the same pairs advanced by one pixel
9073 movu m0, [r2 + 1]
9074 palignr m1, m0, 1
9075 punpckhbw m2, m0, m1
9076 punpcklbw m0, m1
9077 palignr m2, m0, 2
9078 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
9079 pmulhrsw m4, m7
9080 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
9081 pmulhrsw m3, m7
9082 packuswb m4, m3
9083 pmaddubsw m5, m0, [r3 - 1 * 16] ; [15]
9084 pmulhrsw m5, m7
9085 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
9086 pmulhrsw m6, m7
9087 packuswb m5, m6
9088 pmaddubsw m6, m0, [r3 + 9 * 16] ; [25]
9089 pmulhrsw m6, m7
9090 pmaddubsw m0, [r3 + 14 * 16] ; [30]
9091 pmulhrsw m0, m7
9092 packuswb m6, m0
9093 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
9094 pmulhrsw m1, m7
9095 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
9096 pmulhrsw m0, m7
9097 packuswb m1, m0
9098
; group 0: lines 0-7
9099 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9100
; m2 (pairs advanced by one pixel) is still live from the group above
9101 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
9102 pmulhrsw m4, m7
9103 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
9104 pmulhrsw m5, m7
9105 packuswb m4, m5
9106 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
9107 pmulhrsw m5, m7
9108 pmaddubsw m2, [r3 + 12 * 16] ; [28]
9109 pmulhrsw m2, m7
9110 packuswb m5, m2
; rebuild the pairs from r2 + 3
9111 movu m0, [r2 + 3]
9112 palignr m1, m0, 1
; NOTE(review): this m2 is replaced by "mova m2, m0" below before being read
; inside the macro -- looks removable; confirm.
9113 punpckhbw m2, m0, m1
9114 punpcklbw m0, m1
9115 pmaddubsw m6, m0, [r3 - 15 * 16] ; [01]
9116 pmulhrsw m6, m7
9117 pmaddubsw m1, m0, [r3 - 10 * 16] ; [06]
9118 pmulhrsw m1, m7
9119 packuswb m6, m1
9120 pmaddubsw m1, m0, [r3 - 5 * 16] ; [11]
9121 pmulhrsw m1, m7
; keep a copy of the pairs in m2 for the next group; m0 is consumed here
9122 mova m2, m0
9123 pmaddubsw m0, [r3] ; [16]
9124 pmulhrsw m0, m7
9125 packuswb m1, m0
9126
; group 1: lines 8-15
9127 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
9128
9129 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
9130 pmulhrsw m4, m7
9131 pmaddubsw m5, m2, [r3 + 10 * 16] ; [26]
9132 pmulhrsw m5, m7
9133 packuswb m4, m5
9134 pmaddubsw m5, m2, [r3 + 15 * 16] ; [31]
9135 pmulhrsw m5, m7
; rebuild the pairs from r2 + 4
9136 movu m0, [r2 + 4]
9137 palignr m1, m0, 1
; NOTE(review): this m2 is overwritten by the [4] pmaddubsw just below before
; being read -- looks removable; confirm.
9138 punpckhbw m2, m0, m1
9139 punpcklbw m0, m1
9140 pmaddubsw m2, m0, [r3 - 12 * 16] ; [4]
9141 pmulhrsw m2, m7
9142 packuswb m5, m2
9143 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
9144 pmulhrsw m6, m7
9145 pmaddubsw m1, m0, [r3 - 2 * 16] ; [14]
9146 pmulhrsw m1, m7
9147 packuswb m6, m1
9148 pmaddubsw m1, m0, [r3 + 3 * 16] ; [19]
9149 pmulhrsw m1, m7
; keep a copy of the pairs in m2 for the next group; m0 is consumed here
9150 mova m2, m0
9151 pmaddubsw m0, [r3 + 8 * 16] ; [24]
9152 pmulhrsw m0, m7
9153 packuswb m1, m0
9154
; group 2: lines 16-23
9155 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
9156
9157 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
9158 pmulhrsw m4, m7
; rebuild the pairs from r2 + 5
9159 movu m0, [r2 + 5]
9160 palignr m1, m0, 1
; NOTE(review): m2 written here is not read again inside this macro -- confirm.
9161 punpckhbw m2, m0, m1
9162 punpcklbw m0, m1
9163 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
9164 pmulhrsw m1, m7
9165 packuswb m4, m1
9166 pmaddubsw m5, m0, [r3 - 9 * 16] ; [7]
9167 pmulhrsw m5, m7
9168 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
9169 pmulhrsw m6, m7
9170 packuswb m5, m6
9171 pmaddubsw m6, m0, [r3 + 16] ; [17]
9172 pmulhrsw m6, m7
9173 pmaddubsw m1, m0, [r3 + 6 * 16] ; [22]
9174 pmulhrsw m1, m7
9175 packuswb m6, m1
9176 pmaddubsw m1, m0, [r3 + 11 * 16] ; [27]
9177 pmulhrsw m1, m7
9178 packuswb m1, m1
; last line uses row 0: the 8 reference bytes at r2 + 6 are loaded straight
; into the high half (presumably row 0 is a plain copy -- table not visible here)
9179 movhps m1, [r2 + 6] ; [00]
9180
; group 3: lines 24-31
9181 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
9182 %endmacro
9183
;-----------------------------------------------------------------------------
; MODE_9_27 %1
; Interpolates 32 lines of 8 pixels each and hands them to TRANSPOSE_STORE_8x8
; in four groups (index 0..3) of eight lines; every xmm result register holds
; two lines (packuswb of two 8-word results).
; In:   r2 = reference pixel pointer
;       r3 = weight table base; the callers in this file pass
;            ang_table + 16 * 16, so [r3 + k * 16] addresses table row 16 + k
;       m7 = pmulhrsw multiplier (callers load pw_1024)
; %1 is forwarded unchanged to TRANSPOSE_STORE_8x8 (defined outside this chunk).
; The bracketed number after each pmaddubsw is the table row it selects; the
; rows advance by 2 (mod 32) per line, so one set of byte pairs serves 15 lines
; before a row-0 line copies the next reference bytes directly.
; Writes m0-m6 (TRANSPOSE_STORE_8x8 may clobber more -- not visible here).
;-----------------------------------------------------------------------------
9184 %macro MODE_9_27 1
; m2 = byte pairs (p[i], p[i+1]) for i = 0..7 of the 16 bytes at r2 + 1
9185 movu m2, [r2 + 1]
9186 palignr m1, m2, 1
; NOTE(review): m0 written here is overwritten by the [16] pmaddubsw below
; before being read inside the macro -- looks removable; confirm.
9187 punpckhbw m0, m2, m1
9188 punpcklbw m2, m1
9189 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
9190 pmulhrsw m4, m7
9191 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
9192 pmulhrsw m3, m7
9193 packuswb m4, m3
9194 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
9195 pmulhrsw m5, m7
9196 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
9197 pmulhrsw m6, m7
9198 packuswb m5, m6
9199 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
9200 pmulhrsw m6, m7
9201 pmaddubsw m3, m2, [r3 - 4 * 16] ; [12]
9202 pmulhrsw m3, m7
9203 packuswb m6, m3
9204 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
9205 pmulhrsw m1, m7
9206 pmaddubsw m0, m2, [r3] ; [16]
9207 pmulhrsw m0, m7
9208 packuswb m1, m0
9209
; group 0: lines 0-7
9210 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9211
; m2 is reused here, so it must survive TRANSPOSE_STORE_8x8
9212 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
9213 pmulhrsw m4, m7
9214 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
9215 pmulhrsw m5, m7
9216 packuswb m4, m5
9217 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
9218 pmulhrsw m5, m7
9219 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
9220 pmulhrsw m6, m7
9221 packuswb m5, m6
9222 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
9223 pmulhrsw m6, m7
9224 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
9225 pmulhrsw m1, m7
9226 packuswb m6, m1
9227 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
9228 pmulhrsw m1, m7
9229 packuswb m1, m1
; row-0 line: 8 reference bytes at r2 + 2 go straight into the high half
9230 movhps m1, [r2 + 2] ; [00]
9231
; group 1: lines 8-15
9232 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
9233
; rebuild the pairs one byte further on (r2 + 2); only the low half is needed
9234 movu m2, [r2 + 2]
9235 palignr m1, m2, 1
9236 punpcklbw m2, m1
9237 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
9238 pmulhrsw m4, m7
9239 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
9240 pmulhrsw m3, m7
9241 packuswb m4, m3
9242 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
9243 pmulhrsw m5, m7
9244 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
9245 pmulhrsw m6, m7
9246 packuswb m5, m6
9247 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
9248 pmulhrsw m6, m7
9249 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
9250 pmulhrsw m0, m7
9251 packuswb m6, m0
9252 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
9253 pmulhrsw m1, m7
9254 pmaddubsw m0, m2, [r3] ; [16]
9255 pmulhrsw m0, m7
9256 packuswb m1, m0
9257
; group 2: lines 16-23
9258 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
9259
; Same pairs as group 2 are rebuilt here. NOTE(review): the reload is only
; needed if TRANSPOSE_STORE_8x8 clobbers m2, yet group 1 above reuses m2
; across that macro without reloading -- confirm whether this is redundant.
9260 movu m2, [r2 + 2]
9261 palignr m1, m2, 1
9262 punpcklbw m2, m1
9263 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
9264 pmulhrsw m4, m7
9265 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
9266 pmulhrsw m5, m7
9267 packuswb m4, m5
9268 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
9269 pmulhrsw m5, m7
9270 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
9271 pmulhrsw m6, m7
9272 packuswb m5, m6
9273 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
9274 pmulhrsw m6, m7
9275 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
9276 pmulhrsw m1, m7
9277 packuswb m6, m1
9278 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
9279 pmulhrsw m1, m7
9280 packuswb m1, m1
; row-0 line: 8 reference bytes at r2 + 3 go straight into the high half
9281 movhps m1, [r2 + 3] ; [00]
9282
; group 3: lines 24-31
9283 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
9284 %endmacro
9285
;-----------------------------------------------------------------------------
; MODE_12_24 %1
; Interpolates 32 lines of 8 pixels each and hands them to TRANSPOSE_STORE_8x8
; in four groups (index 0..3) of eight lines; every xmm result register holds
; two lines (packuswb of two 8-word results).
; In:   r2 = reference pixel pointer; bytes at r2 - 4 and upwards are read, so
;            the caller must make the bytes below r2 valid
;       r4 = weight table base; intra_pred_ang32_12 passes
;            ang_table + 16 * 16, so [r4 + k * 16] addresses table row 16 + k
;       m7 = pmulhrsw multiplier (intra_pred_ang32_12 loads pw_1024)
; %1 is forwarded unchanged to TRANSPOSE_STORE_8x8 (defined outside this chunk).
; The bracketed number after each pmaddubsw is the table row it selects; the
; rows step down by 5 (mod 32) from one line to the next, and the reference
; window moves one byte towards lower addresses each time the row wraps.
; Writes m0-m6 (TRANSPOSE_STORE_8x8 may clobber more -- not visible here).
;-----------------------------------------------------------------------------
9286 %macro MODE_12_24 1
; m2 = byte pairs (p[i], p[i+1]) for i = 0..7 of the 16 bytes at r2,
; m0 = the same pairs advanced by one pixel (i = 1..8)
9287 movu m2, [r2]
9288 palignr m1, m2, 1
9289 punpckhbw m0, m2, m1
9290 punpcklbw m2, m1
9291 palignr m0, m2, 2
9292 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
9293 pmulhrsw m4, m7
9294 pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
9295 pmulhrsw m3, m7
9296 packuswb m4, m3
9297 pmaddubsw m5, m0, [r4 + 16] ; [17]
9298 pmulhrsw m5, m7
9299 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
9300 pmulhrsw m6, m7
9301 packuswb m5, m6
9302 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
9303 pmulhrsw m6, m7
9304 pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
9305 pmulhrsw m3, m7
9306 packuswb m6, m3
9307 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
9308 pmulhrsw m1, m7
9309 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
9310 pmulhrsw m3, m7
9311 packuswb m1, m3
; group 0: lines 0-7
9312 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
; m2 is reused here, so it must survive TRANSPOSE_STORE_8x8
9313 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
9314 pmulhrsw m4, m7
9315 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
9316 pmulhrsw m5, m7
9317 packuswb m4, m5
9318 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
9319 pmulhrsw m5, m7
9320 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
9321 pmulhrsw m6, m7
9322 packuswb m5, m6
; rebuild the pairs from r2 - 2; m2 = pairs for i = 1..8 of that load
9323 movu m0, [r2 - 2]
9324 palignr m1, m0, 1
9325 punpckhbw m2, m0, m1
9326 punpcklbw m0, m1
9327 palignr m2, m0, 2
9328 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
9329 pmulhrsw m6, m7
9330 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
9331 pmulhrsw m1, m7
9332 packuswb m6, m1
9333 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
9334 pmulhrsw m1, m7
9335 pmaddubsw m3, m2, [r4] ; [16]
9336 pmulhrsw m3, m7
9337 packuswb m1, m3
; group 1: lines 8-15
9338 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
9339 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
9340 pmulhrsw m4, m7
9341 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
9342 pmulhrsw m3, m7
9343 packuswb m4, m3
9344 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
9345 pmulhrsw m5, m7
; rebuild the pairs from r2 - 3; m2 = pairs for i = 1..8 of that load
9346 movu m0, [r2 - 3]
9347 palignr m1, m0, 1
9348 punpckhbw m2, m0, m1
9349 punpcklbw m0, m1
9350 palignr m2, m0, 2
9351 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
9352 pmulhrsw m6, m7
9353 packuswb m5, m6
9354 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
9355 pmulhrsw m6, m7
9356 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
9357 pmulhrsw m3, m7
9358 packuswb m6, m3
9359 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
9360 pmulhrsw m1, m7
9361 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
9362 pmulhrsw m3, m7
9363 packuswb m1, m3
; group 2: lines 16-23
9364 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
9365 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
9366 pmulhrsw m4, m7
; rebuild the pairs from r2 - 4; m0 = pairs for i = 1..8 of that load
9367 movu m2, [r2 - 4]
9368 palignr m1, m2, 1
9369 punpckhbw m0, m2, m1
9370 punpcklbw m2, m1
9371 palignr m0, m2, 2
9372 pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
9373 pmulhrsw m5, m7
9374 packuswb m4, m5
9375 pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
9376 pmulhrsw m5, m7
9377 pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
9378 pmulhrsw m6, m7
9379 packuswb m5, m6
9380 pmaddubsw m6, m0, [r4 - 16] ; [15]
9381 pmulhrsw m6, m7
9382 pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
9383 pmulhrsw m1, m7
9384 packuswb m6, m1
9385 pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
9386 pmulhrsw m1, m7
; row-0 line: bytes of the pair register are picked with the pb_fact0 shuffle
; (table outside this chunk; presumably the first byte of each pair -- confirm),
; widened to words and packed into the high half of m1 without weighting
9387 movu m2, [pb_fact0]
9388 pshufb m0, m2
9389 pmovzxbw m0, m0
9390 packuswb m1, m0
; group 3: lines 24-31
9391 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
9392 %endmacro
9393
9394 ;------------------------------------------------------------------------------------------
9395 ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
9396 ;------------------------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; ANG32_2_ROW addr, lo, mid, hi, shift
; Helper for intra_pred_ang32_2: writes one 32-byte row at [addr].
; The left 16 bytes are (mid:lo) shifted right by 'shift' bytes, the right
; 16 bytes are (hi:mid) shifted by the same amount.  Clobbers m2.
;-----------------------------------------------------------------------------
%macro ANG32_2_ROW 5
    palignr     m2, %3, %2, %5
    movu        [%1], m2
    palignr     m2, %4, %3, %5
    movu        [%1 + 16], m2
%endmacro

;-----------------------------------------------------------------------------
; intra_pred_ang32_2: 32x32 prediction where row i is a straight copy of the
; 32 reference bytes starting at ref[2 + i].
; ref is src + 64, except when dirMode (4th argument) equals 34, in which case
; ref is src itself.
; r0 = dst, r1 = dstStride, r2 = src; r3 and r4 are used as scratch.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,5,4
    ; pick the reference base: src for mode 34, src + 64 otherwise
    mov         r4, r2
    add         r2, 64
    cmp         r3m, byte 34
    cmove       r2, r4

    movu        m0, [r2 + 2]                ; ref[ 2..17]
    movu        m1, [r2 + 18]               ; ref[18..33]
    movu        m3, [r2 + 34]               ; ref[34..49]

    lea         r3, [r1 * 3]                ; r3 = 3 * stride (dirMode already tested)

    ; rows 0-3
    movu        [r0], m0
    movu        [r0 + 16], m1
    ANG32_2_ROW r0 + r1,     m0, m1, m3, 1
    ANG32_2_ROW r0 + r1 * 2, m0, m1, m3, 2
    ANG32_2_ROW r0 + r3,     m0, m1, m3, 3

    lea         r0, [r0 + r1 * 4]

    ; rows 4-7
    ANG32_2_ROW r0,          m0, m1, m3, 4
    ANG32_2_ROW r0 + r1,     m0, m1, m3, 5
    ANG32_2_ROW r0 + r1 * 2, m0, m1, m3, 6
    ANG32_2_ROW r0 + r3,     m0, m1, m3, 7

    lea         r0, [r0 + r1 * 4]

    ; rows 8-11
    ANG32_2_ROW r0,          m0, m1, m3, 8
    ANG32_2_ROW r0 + r1,     m0, m1, m3, 9
    ANG32_2_ROW r0 + r1 * 2, m0, m1, m3, 10
    ANG32_2_ROW r0 + r3,     m0, m1, m3, 11

    lea         r0, [r0 + r1 * 4]

    ; rows 12-15
    ANG32_2_ROW r0,          m0, m1, m3, 12
    ANG32_2_ROW r0 + r1,     m0, m1, m3, 13
    ANG32_2_ROW r0 + r1 * 2, m0, m1, m3, 14
    ANG32_2_ROW r0 + r3,     m0, m1, m3, 15

    lea         r0, [r0 + r1 * 4]

    ; rows 16-19: the window has moved a full register, so m1/m3 become the
    ; low/middle parts and m0 is refilled with ref[50..65]
    movu        [r0], m1
    movu        m0, [r2 + 50]
    movu        [r0 + 16], m3
    ANG32_2_ROW r0 + r1,     m1, m3, m0, 1
    ANG32_2_ROW r0 + r1 * 2, m1, m3, m0, 2
    ANG32_2_ROW r0 + r3,     m1, m3, m0, 3

    lea         r0, [r0 + r1 * 4]

    ; rows 20-23
    ANG32_2_ROW r0,          m1, m3, m0, 4
    ANG32_2_ROW r0 + r1,     m1, m3, m0, 5
    ANG32_2_ROW r0 + r1 * 2, m1, m3, m0, 6
    ANG32_2_ROW r0 + r3,     m1, m3, m0, 7

    lea         r0, [r0 + r1 * 4]

    ; rows 24-27
    ANG32_2_ROW r0,          m1, m3, m0, 8
    ANG32_2_ROW r0 + r1,     m1, m3, m0, 9
    ANG32_2_ROW r0 + r1 * 2, m1, m3, m0, 10
    ANG32_2_ROW r0 + r3,     m1, m3, m0, 11

    lea         r0, [r0 + r1 * 4]

    ; rows 28-31
    ANG32_2_ROW r0,          m1, m3, m0, 12
    ANG32_2_ROW r0 + r1,     m1, m3, m0, 13
    ANG32_2_ROW r0 + r1 * 2, m1, m3, m0, 14
    ANG32_2_ROW r0 + r3,     m1, m3, m0, 15
    RET
9556
;-----------------------------------------------------------------------------
; intra_pred_ang32_3: expands MODE_3_33 (with %1 = 1) four times.
; After each pass r2 advances by 8 bytes and both destination pointers
; (r0, r6) advance by 8 * stride.
; r0 = dst, r1 = dstStride, r2 = src.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    add         r2, 64                      ; reference starts 64 bytes into src
    mov         r4d, 4                      ; pass counter

.loop:
    MODE_3_33   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]           ; r0 = old r0 + 8 * stride
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .loop
    RET
9573
;-----------------------------------------------------------------------------
; intra_pred_ang32_4: expands MODE_4_32 (with %1 = 1) four times.
; After each pass r2 advances by 8 bytes and both destination pointers
; (r0, r6) advance by 8 * stride.
; r0 = dst, r1 = dstStride, r2 = src.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_4, 3,7,8
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    add         r2, 64                      ; reference starts 64 bytes into src
    mov         r4d, 4                      ; pass counter

.loop:
    MODE_4_32   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]           ; r0 = old r0 + 8 * stride
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .loop
    RET
9590
;-----------------------------------------------------------------------------
; intra_pred_ang32_5: expands MODE_5_31 (with %1 = 1) four times.
; After each pass r2 advances by 8 bytes and both destination pointers
; (r0, r6) advance by 8 * stride.
; r0 = dst, r1 = dstStride, r2 = src.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_5, 3,7,8
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    add         r2, 64                      ; reference starts 64 bytes into src
    mov         r4d, 4                      ; pass counter

.loop:
    MODE_5_31   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]           ; r0 = old r0 + 8 * stride
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .loop
    RET
9607
;-----------------------------------------------------------------------------
; intra_pred_ang32_6: expands MODE_6_30 (with %1 = 1) four times.
; After each pass r2 advances by 8 bytes and both destination pointers
; (r0, r6) advance by 8 * stride.
; r0 = dst, r1 = dstStride, r2 = src.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_6, 3,7,8
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    add         r2, 64                      ; reference starts 64 bytes into src
    mov         r4d, 4                      ; pass counter

.loop:
    MODE_6_30   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]           ; r0 = old r0 + 8 * stride
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .loop
    RET
9624
;-----------------------------------------------------------------------------
; intra_pred_ang32_7: expands MODE_7_29 (with %1 = 1) four times.
; After each pass r2 advances by 8 bytes and both destination pointers
; (r0, r6) advance by 8 * stride.
; r0 = dst, r1 = dstStride, r2 = src.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_7, 3,7,8
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    add         r2, 64                      ; reference starts 64 bytes into src
    mov         r4d, 4                      ; pass counter

.loop:
    MODE_7_29   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]           ; r0 = old r0 + 8 * stride
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .loop
    RET
9641
;-----------------------------------------------------------------------------
; intra_pred_ang32_8: expands MODE_8_28 (with %1 = 1) four times.
; After each pass r2 advances by 8 bytes and both destination pointers
; (r0, r6) advance by 8 * stride.
; r0 = dst, r1 = dstStride, r2 = src.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_8, 3,7,8
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    add         r2, 64                      ; reference starts 64 bytes into src
    mov         r4d, 4                      ; pass counter

.loop:
    MODE_8_28   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]           ; r0 = old r0 + 8 * stride
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .loop
    RET
9658
;-----------------------------------------------------------------------------
; intra_pred_ang32_9: expands MODE_9_27 (with %1 = 1) four times.
; After each pass r2 advances by 8 bytes and both destination pointers
; (r0, r6) advance by 8 * stride.
; r0 = dst, r1 = dstStride, r2 = src.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_9, 3,7,8
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macro
    lea         r3, [ang_table + 16 * 16]   ; r3 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    add         r2, 64                      ; reference starts 64 bytes into src
    mov         r4d, 4                      ; pass counter

.loop:
    MODE_9_27   1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]           ; r0 = old r0 + 8 * stride
    lea         r6, [r6 + r1 * 8]
    dec         r4
    jnz         .loop
    RET
9675
;-----------------------------------------------------------------------------
; intra_pred_ang32_10(dst, dstStride, src, dirMode, bFilter)
; Row i of the 32x32 block is the single byte src[65 + i] replicated across all
; 32 pixels.  The block is produced in two passes of 16 rows.
; r0 = dst, r1 = dstStride, r2 = src, r4 = bFilter on entry (moved to r3d);
; r4 is then reused for 3 * stride, r5 is a row pointer, r6 the pass counter.
; m8/m9 are stack slots holding src[0..15] and src[1..16] for the filter.
;
; When bFilter is non-zero the first row of each pass becomes
;   clip8(pixel + ((src[1 + x] - src[0]) >> 1)),  x = 0..15
; and those 16 bytes are written to both halves of the row.
; NOTE(review): the filter therefore covers only 16 distinct columns and is
; applied on both passes (rows 0 and 16).  That looks unintended for a 32-wide
; block -- confirm that callers never pass bFilter != 0 here.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    pxor        m7, m7                      ; zero pshufb mask: broadcast byte 0
    mov         r6, 2                       ; two passes of 16 rows
    movu        m0, [r2]
    movu        m1, [r2 + 1]
    mova        m8, m0                      ; src[0..15], kept for the filter
    mova        m9, m1                      ; src[1..16], kept for the filter
    mov         r3d, r4d                    ; r3d = bFilter, frees r4
    lea         r4, [r1 * 3]                ; r4 = 3 * stride

.loop:
    movu        m0, [r2 + 1 + 64]           ; 16 source bytes, one per row of this pass
    ; each palignr moves byte k to position 0, pshufb then broadcasts it
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7

    ; rows 1-6 (row 0 is written last, after the optional filter)
    movu        [r0 + r1], m1
    movu        [r0 + r1 + 16], m1
    movu        [r0 + r1 * 2], m2
    movu        [r0 + r1 * 2 + 16], m2
    movu        [r0 + r4], m3
    movu        [r0 + r4 + 16], m3
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m4
    movu        [r5 + 16], m4
    movu        [r5 + r1], m5
    movu        [r5 + r1 + 16], m5
    movu        [r5 + r1 * 2], m6
    movu        [r5 + r1 * 2 + 16], m6

    palignr     m1, m0, 7
    pshufb      m1, m7
    movhlps     m2, m0                      ; byte 8 -> position 0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7

    ; rows 7-12
    movu        [r5 + r4], m1
    movu        [r5 + r4 + 16], m1
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m2
    movu        [r5 + 16], m2
    movu        [r5 + r1], m3
    movu        [r5 + r1 + 16], m3
    movu        [r5 + r1 * 2], m4
    movu        [r5 + r1 * 2 + 16], m4
    movu        [r5 + r4], m5
    movu        [r5 + r4 + 16], m5
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m6
    movu        [r5 + 16], m6

    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7                      ; m0 = byte 0 broadcast (first row of the pass)

    ; rows 13-15
    movu        [r5 + r1], m1
    movu        [r5 + r1 + 16], m1
    movu        [r5 + r1 * 2], m2
    movu        [r5 + r1 * 2 + 16], m2
    movu        [r5 + r4], m3
    movu        [r5 + r4 + 16], m3

    ; filter
    test        r3d, r3d
    jz          .quit
    pmovzxbw    m0, m0                      ; broadcast pixel as 8 words
    mova        m1, m0
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m7                      ; src[0] broadcast
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3                      ; src[1..8]  as words
    pmovzxbw    m4, m4                      ; src[9..16] as words
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                      ; saturate back to bytes

.quit:
    movu        [r0], m0
    movu        [r0 + 16], m0
    dec         r6                          ; sets ZF; the two lea below leave flags alone
    lea         r0, [r5 + r1 * 4]           ; next pass starts 16 rows further down
    lea         r2, [r2 + 16]
    jnz         .loop
    RET
9791
;-----------------------------------------------------------------------------
; intra_pred_ang32_11: builds a contiguous reference line in a 64-byte aligned
; stack buffer and runs four passes of four PROC32_8x8 expansions over it.
; r0 = dst, r1 = dstStride, r2 = src on entry.
; Stack buffer layout (relative to the aligned rsp):
;   [rsp +  0 .. 15]  src[16] replicated (its byte 0 is the one read via r2 - 1)
;   [rsp +  1 .. 48]  src[0] followed by src[65 .. 111]
;   [rsp + 63]        pass counter (4)
;   [rsp + 64]        saved original rsp
; PROC32_8x8 is defined outside this chunk; r3 (c_shuf8_0), r4 (ang_table),
; r5 (3 * stride) and r6 (dst + 4 * stride) are set up for it.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_11, 4,7,8
    ; NOTE: the stack is aligned to 64 bytes so that all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixels
    movu        m0, [r2 + 16]
    pxor        m1, m1
    pshufb      m0, m1                      ; broadcast src[16]
    mova        [rsp], m0
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0                 ; byte 0 comes from src[0]
    movu        m1, [r2 + 16 + 64]
    movu        m2, [r2 + 32 + 64]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4          ; four passes

    ; filter
    lea         r2, [rsp + 1]               ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 -> 4 * stride
    ; m5/m6 are not preloaded: every group below copies m7 into them before
    ; PROC32_8x8 runs, so a preload would be dead.

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 1, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  1, 1, 14,12,10,8,6,4,2,0

    ; Row[16 - 23] -- the window starts one byte earlier
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  2, 1, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 1, 14,12,10,8,6,4,2,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp, [rsp+64]               ; restore the caller's stack pointer
    RET
9875
;-----------------------------------------------------------------------------
; MODE_12_24_ROW0 %1
; First strip of the mode 12/24 prediction: interpolates 32 lines of 8 pixels
; and hands them to TRANSPOSE_STORE_8x8 in four groups (index 0..3).  Unlike
; MODE_12_24 it takes the bytes that precede the reference window from the
; 'above' buffer, which it fills itself.
; In:   r2 = reference pixel pointer
;       r3 = second pixel pointer (intra_pred_ang32_12 passes the original src
;            while r2 = src + 64); bytes r3 + 6 .. r3 + 21 and r3 + 26 are read
;       r4 = weight table base (caller passes ang_table + 16 * 16, so
;            [r4 + k * 16] addresses table row 16 + k)
;       m7 = pmulhrsw multiplier (caller loads pw_1024)
;       above = caller-defined 16-byte memory operand (a stack slot in
;            intra_pred_ang32_12)
; %1:   forwarded to TRANSPOSE_STORE_8x8; when it is 1, byte 0 of the [r2]
;       load is replaced by the byte at [r3].
; c_mode32_12_0 and pb_fact0 are shuffle tables defined outside this chunk.
; The bracketed number after each pmaddubsw is the table row it selects; the
; rows step down by 5 (mod 32) from one line to the next.
; Writes m0-m6 and 'above'.
;-----------------------------------------------------------------------------
9876 %macro MODE_12_24_ROW0 1
; build 'above': shuffled bytes from r3 + 6 (presumably the projected
; neighbours -- table not visible here) plus the byte at r3 + 26 in lane 12
9877 movu m0, [r3 + 6]
9878 pshufb m0, [c_mode32_12_0]
9879 pinsrb m0, [r3 + 26], 12
9880 mova above, m0
9881 movu m2, [r2]
9882 %if %1 == 1
9883 pinsrb m2, [r3], 0
9884 %endif
; m2 = byte pairs (p[i], p[i+1]) for i = 0..7
9885 palignr m1, m2, 1
9886 punpcklbw m2, m1
9887 pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
9888 pmulhrsw m4, m7
9889 pmaddubsw m3, m2, [r4 + 6 * 16] ; [22]
9890 pmulhrsw m3, m7
9891 packuswb m4, m3
9892 pmaddubsw m5, m2, [r4 + 16] ; [17]
9893 pmulhrsw m5, m7
9894 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
9895 pmulhrsw m6, m7
9896 packuswb m5, m6
9897 pmaddubsw m6, m2, [r4 - 9 * 16] ; [7]
9898 pmulhrsw m6, m7
9899 pmaddubsw m3, m2, [r4 - 14 * 16] ; [2]
9900 pmulhrsw m3, m7
9901 packuswb m6, m3
; reload the window and shift in the top byte of 'above' ("a") from the left
9902 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
9903 %if %1 == 1
9904 pinsrb m1, [r3], 0
9905 %endif
9906 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
9907 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
9908 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
9909 pmulhrsw m1, m7
9910 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
9911 pmulhrsw m3, m7
9912 packuswb m1, m3
; group 0: lines 0-7
9913 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
; m2 is reused here, so it must survive TRANSPOSE_STORE_8x8
9914 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
9915 pmulhrsw m4, m7
9916 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
9917 pmulhrsw m5, m7
9918 packuswb m4, m5
9919 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
9920 pmulhrsw m5, m7
9921 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
9922 pmulhrsw m6, m7
9923 packuswb m5, m6
; shift the pair register up by one pair, pulling two more bytes from 'above'
9924 palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
9925 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
9926 pmulhrsw m6, m7
9927 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
9928 pmulhrsw m1, m7
9929 packuswb m6, m1
9930 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
9931 pmulhrsw m1, m7
9932 pmaddubsw m3, m2, [r4] ; [16]
9933 pmulhrsw m3, m7
9934 packuswb m1, m3
; group 1: lines 8-15
9935 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
9936 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
9937 pmulhrsw m4, m7
9938 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
9939 pmulhrsw m3, m7
9940 packuswb m4, m3
9941 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
9942 pmulhrsw m5, m7
; 'above' moved up one byte, then the next pair is shifted into m2
9943 pslldq m1, above, 1
9944 palignr m2, m1, 14
9945 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
9946 pmulhrsw m6, m7
9947 packuswb m5, m6
9948 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
9949 pmulhrsw m6, m7
9950 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
9951 pmulhrsw m3, m7
9952 packuswb m6, m3
9953 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
9954 pmulhrsw m1, m7
9955 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
9956 pmulhrsw m3, m7
9957 packuswb m1, m3
; group 2: lines 16-23
9958 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
9959 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
9960 pmulhrsw m4, m7
; 'above' moved up two bytes, then the next pair is shifted into m2
9961 pslldq m1, above, 2
9962 palignr m2, m1, 14
9963 pmaddubsw m5, m2, [r4 + 14 * 16] ; [30]
9964 pmulhrsw m5, m7
9965 packuswb m4, m5
9966 pmaddubsw m5, m2, [r4 + 9 * 16] ; [25]
9967 pmulhrsw m5, m7
9968 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
9969 pmulhrsw m6, m7
9970 packuswb m5, m6
9971 pmaddubsw m6, m2, [r4 - 16] ; [15]
9972 pmulhrsw m6, m7
9973 pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
9974 pmulhrsw m1, m7
9975 packuswb m6, m1
9976 pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
9977 pmulhrsw m1, m7
; row-0 line: bytes of the pair register are picked with the pb_fact0 shuffle
; (presumably the first byte of each pair -- confirm), widened to words and
; packed into the high half of m1 without weighting
9978 movu m0, [pb_fact0]
9979 pshufb m2, m0
9980 pmovzxbw m2, m2
9981 packuswb m1, m2
; group 3: lines 24-31
9982 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
9983 %endmacro
9984
;-----------------------------------------------------------------------------
; intra_pred_ang32_12: one MODE_12_24_ROW0 expansion followed by three
; MODE_12_24 expansions (all with %1 = 1).
; r0 = dst, r1 = dstStride, r2 = src.
; 'above' is a 16-byte stack slot used by MODE_12_24_ROW0.
; r3 first holds the original src for MODE_12_24_ROW0 and is afterwards reused
; as the loop counter.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize)
  %define above    [rsp + 0 * mmsize]
    mova        m7, [pw_1024]               ; pmulhrsw multiplier used by the MODE macros
    lea         r4, [ang_table + 16 * 16]   ; r4 -> ang_table row 16
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    mov         r3, r2                      ; r3 = src
    add         r2, 64                      ; r2 = src + 64

    ; first strip
    MODE_12_24_ROW0 1
    add         r2, 7                       ; 7 here, 8 per later pass
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    mov         r3, 3                       ; three remaining strips

.loop:
    MODE_12_24  1
    add         r2, 8
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    dec         r3
    jnz         .loop
    RET
10008
;-----------------------------------------------------------------------------
; MODE_13_23_ROW0 %1
; First strip of the mode 13/23 prediction: interpolates 32 lines of 8 pixels
; and hands them to TRANSPOSE_STORE_8x8 in four groups (index 0..3).  The bytes
; that precede the reference window come from the 'above' buffer, which the
; macro fills itself.
; In:   r2 = reference pixel pointer
;       r3 = second pixel pointer; bytes r3 + 1 .. r3 + 30 are read
;            (NOTE(review): the caller is outside this chunk -- presumably the
;            original src, as for MODE_12_24_ROW0; confirm)
;       r4 = weight table base ([r4 + k * 16] is table row 16 + k if the caller
;            passes ang_table + 16 * 16 as the mode-12 caller does -- confirm)
;       m7 = pmulhrsw multiplier
;       above = caller-defined 16-byte memory operand
; %1:   forwarded to TRANSPOSE_STORE_8x8; when it is 1, byte 0 of the [r2]
;       load is replaced by the byte at [r3].
; c_mode32_13_0 and c_mode32_13_shuf are shuffle tables defined outside this
; chunk.
; The bracketed number after each pmaddubsw is the table row it selects; the
; rows step down by 9 (mod 32) from one line to the next.
; Writes m0-m6 and 'above'.
;-----------------------------------------------------------------------------
10009 %macro MODE_13_23_ROW0 1
; build 'above' from two shuffled 16-byte loads of the r3 side
10010 movu m0, [r3 + 1]
10011 movu m1, [r3 + 15]
10012 pshufb m0, [c_mode32_13_0]
10013 pshufb m1, [c_mode32_13_0]
10014 punpckldq m0, m1
10015 pshufb m0, [c_mode32_13_shuf]
10016 mova above, m0
10017 movu m2, [r2]
10018 %if (%1 == 1)
10019 pinsrb m2, [r3], 0
10020 %endif
; m2 = byte pairs (p[i], p[i+1]) for i = 0..7
10021 palignr m1, m2, 1
10022 punpcklbw m2, m1
10023 pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
10024 pmulhrsw m4, m7
10025 pmaddubsw m3, m2, [r4 - 2 * 16] ; [14]
10026 pmulhrsw m3, m7
10027 packuswb m4, m3
10028 pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
10029 pmulhrsw m5, m7
; reload the window and shift in the top byte of 'above' ("a") from the left
10030 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
10031 %if (%1 == 1)
10032 pinsrb m1, [r3], 0
10033 %endif
10034 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
10035 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
10036 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
10037 pmulhrsw m6, m7
10038 packuswb m5, m6
10039 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
10040 pmulhrsw m6, m7
10041 pmaddubsw m0, m2, [r4 - 6 * 16] ; [10]
10042 pmulhrsw m0, m7
10043 packuswb m6, m0
10044 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
10045 pmulhrsw m1, m7
; shift the pair register up by one pair, pulling two more bytes from 'above'
10046 palignr m2, above, 14
10047 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
10048 pmulhrsw m3, m7
10049 packuswb m1, m3
; group 0: lines 0-7
10050 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
; m2 is reused here, so it must survive TRANSPOSE_STORE_8x8
10051 pmaddubsw m4, m2, [r4 - 16] ; [15]
10052 pmulhrsw m4, m7
10053 pmaddubsw m5, m2, [r4 - 10 * 16] ; [6]
10054 pmulhrsw m5, m7
10055 packuswb m4, m5
; m0 = 'above' moved up one byte; each later "pslldq m0, 1" moves it one more
10056 pslldq m0, above, 1
10057 palignr m2, m0, 14
10058 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
10059 pmulhrsw m5, m7
10060 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
10061 pmulhrsw m6, m7
10062 packuswb m5, m6
10063 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
10064 pmulhrsw m6, m7
10065 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
10066 pmulhrsw m1, m7
10067 packuswb m6, m1
10068 pslldq m0, 1
10069 palignr m2, m0, 14
10070 pmaddubsw m1, m2, [r4 + 9 * 16] ; [25]
10071 pmulhrsw m1, m7
10072 pmaddubsw m0, m2, [r4] ; [16]
10073 pmulhrsw m0, m7
10074 packuswb m1, m0
; group 1: lines 8-15
10075 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
10076 pmaddubsw m4, m2, [r4 - 9 * 16] ; [7]
10077 pmulhrsw m4, m7
; m0 was consumed above, so the shifted 'above' is rebuilt from memory
10078 pslldq m0, above, 3
10079 palignr m2, m0, 14
10080 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
10081 pmulhrsw m3, m7
10082 packuswb m4, m3
10083 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
10084 pmulhrsw m5, m7
10085 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
10086 pmulhrsw m6, m7
10087 packuswb m5, m6
10088 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
10089 pmulhrsw m6, m7
10090 pslldq m0, 1
10091 palignr m2, m0, 14
10092 pmaddubsw m0, m2, [r4 + 10 * 16] ; [26]
10093 pmulhrsw m0, m7
10094 packuswb m6, m0
10095 pmaddubsw m1, m2, [r4 + 16] ; [17]
10096 pmulhrsw m1, m7
10097 pmaddubsw m0, m2, [r4 - 8 * 16] ; [8]
10098 pmulhrsw m0, m7
10099 packuswb m1, m0
; group 2: lines 16-23
10100 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
10101 pslldq m0, above, 5
10102 palignr m2, m0, 14
10103 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
10104 pmulhrsw m4, m7
10105 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
10106 pmulhrsw m5, m7
10107 packuswb m4, m5
10108 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
10109 pmulhrsw m5, m7
10110 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
10111 pmulhrsw m6, m7
10112 packuswb m5, m6
10113 pslldq m0, 1
10114 palignr m2, m0, 14
10115 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
10116 pmulhrsw m6, m7
10117 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
10118 pmulhrsw m1, m7
10119 packuswb m6, m1
10120 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
10121 pmulhrsw m1, m7
; here the row-0 line is computed through the table (row 16 - 16 = 0)
10122 pmaddubsw m3, m2, [r4 - 16 * 16] ; [00]
10123 pmulhrsw m3, m7
10124 packuswb m1, m3
; group 3: lines 24-31
10125 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
10126 %endmacro
10127
; MODE_13_23 %1, %2
; Emits the code for one 8-pixel-wide slice of a 32-sample angular prediction:
; 32 weighted results, handed to TRANSPOSE_STORE_8x8 in four groups (index 0..3).
;   %1 - forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8.
;   %2 - only when (%1 & %2) == 1, byte 0 of the final reference load is
;        replaced with the byte at [r3].
; Registers expected on entry (set up by the visible callers):
;   r2 = current position in the reference pixels, r4 = ang_table + 16 * 16,
;   m7 = [pw_1024] (multiplier for the pmulhrsw rounding step).
; The bracketed number after each pmaddubsw is the ang_table row it reads,
; i.e. 16 + (offset / 16) given the r4 bias above.
; Clobbers m0-m6.
10128 %macro MODE_13_23 2
10129 movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
10130 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
10131 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
10132 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
10133 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
10134 pmaddubsw m4, m0, [r4 + 7 * 16] ; [23]
10135 pmulhrsw m4, m7
10136 pmaddubsw m3, m0, [r4 - 2 * 16] ; [14]
10137 pmulhrsw m3, m7
10138 packuswb m4, m3
10139 pmaddubsw m5, m0, [r4 - 11 * 16] ; [05]
10140 pmulhrsw m5, m7
10141 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
10142 pmulhrsw m6, m7
10143 packuswb m5, m6
10144 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
10145 pmulhrsw m6, m7
10146 pmaddubsw m3, m2, [r4 - 6 * 16] ; [10]
10147 pmulhrsw m3, m7
10148 packuswb m6, m3
10149 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
10150 pmulhrsw m1, m7
; reload the reference window two bytes lower and rebuild the interleaved pairs
10151 movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
10152 palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
10153 punpckhbw m0, m2, m3
10154 punpcklbw m2, m3
10155 palignr m0, m2, 2
10156 pmaddubsw m3, m0, [r4 + 8 * 16] ; [24]
10157 pmulhrsw m3, m7
10158 packuswb m1, m3
; keep a copy of m0 in m3: it is still needed after the store macro below
10159 mova m3, m0
10160 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
10161 pmaddubsw m4, m3, [r4 - 16] ; [15]
10162 pmulhrsw m4, m7
10163 pmaddubsw m5, m3, [r4 - 10 * 16] ; [6]
10164 pmulhrsw m5, m7
10165 packuswb m4, m5
10166 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
10167 pmulhrsw m5, m7
10168 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
10169 pmulhrsw m6, m7
10170 packuswb m5, m6
10171 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
10172 pmulhrsw m6, m7
10173 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
10174 pmulhrsw m1, m7
10175 packuswb m6, m1
; reference window four bytes below r2
10176 movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
10177 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
10178 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
10179 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
10180 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
10181 pmaddubsw m1, m0, [r4 + 9 * 16] ; [25]
10182 pmulhrsw m1, m7
10183 pmaddubsw m3, m0, [r4] ; [16]
10184 pmulhrsw m3, m7
10185 packuswb m1, m3
10186 mova m3, m0
10187 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
10188 pmaddubsw m4, m3, [r4 - 9 * 16] ; [7]
10189 pmulhrsw m4, m7
10190 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
10191 pmulhrsw m3, m7
10192 packuswb m4, m3
10193 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
10194 pmulhrsw m5, m7
10195 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
10196 pmulhrsw m6, m7
10197 packuswb m5, m6
10198 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
10199 pmulhrsw m6, m7
; reference window six bytes below r2
10200 movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
10201 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
10202 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
10203 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
10204 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
10205 pmaddubsw m3, m0, [r4 + 10 * 16] ; [26]
10206 pmulhrsw m3, m7
10207 packuswb m6, m3
10208 pmaddubsw m1, m0, [r4 + 16] ; [17]
10209 pmulhrsw m1, m7
10210 pmaddubsw m3, m0, [r4 - 8 * 16] ; [8]
10211 pmulhrsw m3, m7
10212 packuswb m1, m3
10213 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
10214 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
10215 pmulhrsw m4, m7
10216 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
10217 pmulhrsw m5, m7
10218 packuswb m4, m5
10219 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
10220 pmulhrsw m5, m7
10221 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
10222 pmulhrsw m6, m7
10223 packuswb m5, m6
; last window, seven bytes below r2; only the low half is interleaved here
10224 movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
; both macro arguments set: byte 0 of the window comes from [r3] instead
10225 %if ((%1 & %2) == 1)
10226 pinsrb m2, [r3], 0
10227 %endif
10228 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
10229 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
10230 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
10231 pmulhrsw m6, m7
10232 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
10233 pmulhrsw m1, m7
10234 packuswb m6, m1
10235 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
10236 pmulhrsw m1, m7
; final result is produced by a byte shuffle rather than a multiply;
; NOTE(review): presumably pb_fact0 picks the bytes for the weight-0 row - confirm
10237 movu m0, [pb_fact0]
10238 pshufb m2, m0
10239 pmovzxbw m2, m2
10240 packuswb m1, m2
10241 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
10242 %endmacro
10243
; intra_pred_ang32_13 (SSE4)
; 32-row angular prediction built from MODE_13_23_ROW0 followed by three
; MODE_13_23 passes, all with the first macro argument set to 1.
; r0 = destination, r1 = stride, r2 = reference pixels (as used below).
; One mmsize stack slot is reserved and named `above` for the ROW0 macro.
; After each pass the destination pointers move down by 8 rows.
10244 INIT_XMM sse4
10245 cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize)
10246 %define above [rsp + 0 * mmsize]
; r3 keeps the original reference pointer; r2 moves to the second 64-byte half
10247 mov r3, r2
10248 add r2, 64
10249 lea r4, [ang_table + 16 * 16]
10250 lea r5, [r1 * 3] ; r5 -> 3 * stride
10251 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
10252 mova m7, [pw_1024]
10253
10254 MODE_13_23_ROW0 1
10255 lea r0, [r6 + r1 * 4]
10256 lea r6, [r6 + r1 * 8]
10257 add r2, 7
10258
; second argument 1: this pass patches byte 0 of its last window from [r3]
10259 MODE_13_23 1, 1
10260 lea r0, [r6 + r1 * 4]
10261 lea r6, [r6 + r1 * 8]
10262 add r2, 8
; r3 is no longer needed as a pointer and becomes the loop counter
10263 mov r3, 2
10264 .loop:
10265 MODE_13_23 1, 0
10266 lea r0, [r6 + r1 * 4]
10267 lea r6, [r6 + r1 * 8]
10268 add r2, 8
10269 dec r3
10270 jnz .loop
10271 RET
10272
; intra_pred_ang32_14 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Builds a 64-byte-aligned scratch buffer on the stack: 13 bytes selected by
; c_mode32_14_0 from [r2 .. r2 + 30], followed by 32 bytes copied from
; [r2 + 65]. Then runs 4 loop passes; each pass issues four PROC32_8x8 groups
; (second macro argument 1) and moves the destination down by 8 rows.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
10273 INIT_XMM sse4
10274 cglobal intra_pred_ang32_14, 3,7,8
10275 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
10276 mov r6, rsp
10277 sub rsp, 64+gprsize
10278 and rsp, ~63
10279 mov [rsp+64], r6
10280
10281 ; collect reference pixel
10282 movu m0, [r2]
10283 movu m1, [r2 + 15]
10284 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
10285 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
10286 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
10287 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
10288 mova [rsp], m0
10289 movu m0, [r2 + 1 + 64]
10290 movu m1, [r2 + 1 + 16 + 64]
10291 movu [rsp + 13], m0
10292 movu [rsp + 13 + 16], m1
10293 mov [rsp + 63], byte 4
10294
10295 ; filter
10296 lea r2, [rsp + 13] ; r2 -> [0]
10297 lea r3, [c_shuf8_0] ; r3 -> shuffle8
10298 lea r4, [ang_table] ; r4 -> ang_table
10299 lea r5, [r1 * 3] ; r5 -> 3 * stride
10300 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
10301 mova m5, [pw_1024] ; m5 -> 1024
10302 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
10303
10304 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
10305 ; Row[0 - 7]
10306 movu m7, [r2 - 4]
10307 palignr m0, m7, 3
10308 mova m1, m0
10309 palignr m2, m7, 2
10310 mova m3, m2
10311 palignr m4, m7, 1
10312 mova m5, m4
10313 mova m6, m4
10314 PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24
10315
10316 ; Row[8 - 15]
10317 movu m7, [r2 - 7]
10318 palignr m0, m7, 3
10319 palignr m1, m7, 2
10320 mova m2, m1
10321 mova m3, m1
10322 palignr m4, m7, 1
10323 mova m5, m4
10324 mova m6, m7
10325 PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16
10326
10327 ; Row[16 - 23]
10328 movu m7, [r2 - 10]
10329 palignr m0, m7, 3
10330 palignr m1, m7, 2
10331 mova m2, m1
10332 palignr m3, m7, 1
10333 mova m4, m3
10334 mova m5, m3
10335 mova m6, m7
10336 PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8
10337
10338 ; Row[24 - 31]
10339 movu m7, [r2 - 13]
10340 palignr m0, m7, 2
10341 mova m1, m0
10342 mova m2, m0
10343 palignr m3, m7, 1
10344 mova m4, m3
10345 mova m5, m7
10346 mova m6, m7
10347 PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0
10348
; next pass: destination 8 rows further down, reference 8 bytes further on
10349 lea r0, [r6 + r1 * 4]
10350 lea r6, [r6 + r1 * 8]
10351 add r2, 8
10352 dec byte [rsp + 63]
10353 jnz .loop
; restore the caller's stack pointer saved in the prologue
10354 mov rsp, [rsp+64]
10355 RET
10356
; intra_pred_ang32_15 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Builds a 64-byte-aligned scratch buffer on the stack: bytes selected by
; c_mode32_15_0 from [r2 .. r2 + 30], followed by 32 bytes copied from
; [r2 + 65]. Then runs 4 loop passes; each pass issues four PROC32_8x8 groups
; (second macro argument 1) and moves the destination down by 8 rows.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
10357 INIT_XMM sse4
10358 cglobal intra_pred_ang32_15, 4,7,8
10359 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
10360 mov r6, rsp
10361 sub rsp, 64+gprsize
10362 and rsp, ~63
10363 mov [rsp+64], r6
10364
10365 ; collect reference pixel
10366 movu m0, [r2]
10367 movu m1, [r2 + 15]
10368 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
10369 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
; the second store overlaps the first so the selected bytes end up contiguous
10370 mova [rsp], m1
10371 movu [rsp + 8], m0
10372 movu m0, [r2 + 1 + 64]
10373 movu m1, [r2 + 1 + 16 + 64]
10374 movu [rsp + 17], m0
10375 movu [rsp + 17 + 16], m1
10376 mov [rsp + 63], byte 4
10377
10378 ; filter
10379 lea r2, [rsp + 17] ; r2 -> [0]
10380 lea r3, [c_shuf8_0] ; r3 -> shuffle8
10381 lea r4, [ang_table] ; r4 -> ang_table
10382 lea r5, [r1 * 3] ; r5 -> 3 * stride
10383 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
10384 mova m5, [pw_1024] ; m5 -> 1024
10385 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
10386
10387 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
10388 ; Row[0 - 7]
10389 movu m7, [r2 - 5]
10390 palignr m0, m7, 4
10391 palignr m1, m7, 3
10392 mova m2, m1
10393 palignr m3, m7, 2
10394 mova m4, m3
10395 palignr m5, m7, 1
10396 mova m6, m5
10397 PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24
10398
10399 ; Row[8 - 15]
10400 movu m7, [r2 - 9]
10401 palignr m0, m7, 4
10402 palignr m1, m7, 3
10403 mova m2, m1
10404 palignr m3, m7, 2
10405 mova m4, m3
10406 palignr m5, m7, 1
10407 mova m6, m5
10408 PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16
10409
10410 ; Row[16 - 23]
10411 movu m7, [r2 - 13]
10412 palignr m0, m7, 3
10413 mova m1, m0
10414 palignr m2, m7, 2
10415 mova m3, m2
10416 palignr m4, m7, 1
10417 mova m5, m4
10418 mova m6, m7
10419 PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8
10420
10421 ; Row[24 - 31]
10422 movu m7, [r2 - 17]
10423 palignr m0, m7, 3
10424 mova m1, m0
10425 palignr m2, m7, 2
10426 mova m3, m2
10427 palignr m4, m7, 1
10428 mova m5, m4
10429 mova m6, m7
10430 PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0
10431
; next pass: destination 8 rows further down, reference 8 bytes further on
10432 lea r0, [r6 + r1 * 4]
10433 lea r6, [r6 + r1 * 8]
10434 add r2, 8
10435 dec byte [rsp + 63]
10436 jnz .loop
; restore the caller's stack pointer saved in the prologue
10437 mov rsp, [rsp+64]
10438 RET
10439
; intra_pred_ang32_16 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Builds a 64-byte-aligned scratch buffer on the stack: bytes selected by
; c_mode32_16_0 from [r2 .. r2 + 30], followed by 32 bytes copied from
; [r2 + 65]. Then runs 4 loop passes; each pass issues four PROC32_8x8 groups
; (second macro argument 1) and moves the destination down by 8 rows.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
10440 INIT_XMM sse4
10441 cglobal intra_pred_ang32_16, 4,7,8
10442 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
10443 mov r6, rsp
10444 sub rsp, 64+gprsize
10445 and rsp, ~63
10446 mov [rsp+64], r6
10447
10448 ; collect reference pixel
10449 movu m0, [r2]
10450 movu m1, [r2 + 15]
10451 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
10452 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
; the second store overlaps the first so the selected bytes end up contiguous
10453 mova [rsp], m1
10454 movu [rsp + 10], m0
10455 movu m0, [r2 + 1 + 64]
10456 movu m1, [r2 + 1 + 16 + 64]
10457 movu [rsp + 21], m0
10458 movu [rsp + 21 + 16], m1
10459 mov [rsp + 63], byte 4
10460
10461 ; filter
10462 lea r2, [rsp + 21] ; r2 -> [0]
10463 lea r3, [c_shuf8_0] ; r3 -> shuffle8
10464 lea r4, [ang_table] ; r4 -> ang_table
10465 lea r5, [r1 * 3] ; r5 -> 3 * stride
10466 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
10467 mova m5, [pw_1024] ; m5 -> 1024
10468 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
10469
10470 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
10471 ; Row[0 - 7]
10472 movu m7, [r2 - 6]
10473 palignr m0, m7, 5
10474 palignr m1, m7, 4
10475 mova m2, m1
10476 palignr m3, m7, 3
10477 palignr m4, m7, 2
10478 mova m5, m4
10479 palignr m6, m7, 1
10480 PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24
10481
10482 ; Row[8 - 15]
10483 movu m7, [r2 - 11]
10484 palignr m0, m7, 5
10485 palignr m1, m7, 4
10486 palignr m2, m7, 3
10487 mova m3, m2
10488 palignr m4, m7, 2
10489 palignr m5, m7, 1
10490 mova m6, m5
10491 PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16
10492
10493 ; Row[16 - 23]
10494 movu m7, [r2 - 16]
10495 palignr m0, m7, 4
10496 mova m1, m0
10497 palignr m2, m7, 3
10498 palignr m3, m7, 2
10499 mova m4, m3
10500 palignr m5, m7, 1
10501 mova m6, m7
10502 PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8
10503
10504 ; Row[24 - 31]
10505 movu m7, [r2 - 21]
10506 palignr m0, m7, 4
10507 palignr m1, m7, 3
10508 mova m2, m1
10509 palignr m3, m7, 2
10510 palignr m4, m7, 1
10511 mova m5, m4
10512 mova m6, m7
10513 PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0
10514
; next pass: destination 8 rows further down, reference 8 bytes further on
10515 lea r0, [r6 + r1 * 4]
10516 lea r6, [r6 + r1 * 8]
10517 add r2, 8
10518 dec byte [rsp + 63]
10519 jnz .loop
; restore the caller's stack pointer saved in the prologue
10520 mov rsp, [rsp+64]
10521 RET
10522
; intra_pred_ang32_17 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Builds a 64-byte-aligned scratch buffer on the stack: bytes selected by
; c_mode32_17_0 from [r2 .. r2 + 31], followed by 32 bytes copied from
; [r2 + 65]. Then runs 4 loop passes; each pass issues four PROC32_8x8 groups
; (second macro argument 1) and moves the destination down by 8 rows.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
10523 INIT_XMM sse4
10524 cglobal intra_pred_ang32_17, 4,7,8
10525 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
10526 mov r6, rsp
10527 sub rsp, 64+gprsize
10528 and rsp, ~63
10529 mov [rsp+64], r6
10530
10531 ; collect reference pixel
10532 movu m0, [r2]
10533 movu m1, [r2 + 16]
10534 pshufb m0, [c_mode32_17_0]
10535 pshufb m1, [c_mode32_17_0]
; the second store overlaps the first so the selected bytes end up contiguous
10536 mova [rsp ], m1
10537 movu [rsp + 13], m0
10538 movu m0, [r2 + 1 + 64]
10539 movu m1, [r2 + 1 + 16 + 64]
10540 movu [rsp + 26], m0
10541 movu [rsp + 26 + 16], m1
10542 mov [rsp + 63], byte 4
10543
10544 ; filter
; r2 is set one byte before the copied block that starts at rsp + 26
10545 lea r2, [rsp + 25] ; r2 -> [0]
10546 lea r3, [c_shuf8_0] ; r3 -> shuffle8
10547 lea r4, [ang_table] ; r4 -> ang_table
10548 lea r5, [r1 * 3] ; r5 -> 3 * stride
10549 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
10550 mova m5, [pw_1024] ; m5 -> 1024
10551 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
10552
10553 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
10554 ; Row[0 - 7]
10555 movu m7, [r2 - 6]
10556 palignr m0, m7, 6
10557 palignr m1, m7, 5
10558 palignr m2, m7, 4
10559 palignr m3, m7, 3
10560 palignr m4, m7, 2
10561 mova m5, m4
10562 palignr m6, m7, 1
10563 PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16
10564
10565 ; Row[8 - 15]
10566 movu m7, [r2 - 12]
10567 palignr m0, m7, 5
10568 palignr m1, m7, 4
10569 mova m2, m1
10570 palignr m3, m7, 3
10571 palignr m4, m7, 2
10572 palignr m5, m7, 1
10573 mova m6, m7
10574 PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0
10575
10576 ; Row[16 - 23]
10577 movu m7, [r2 - 19]
10578 palignr m0, m7, 6
10579 palignr m1, m7, 5
10580 palignr m2, m7, 4
10581 palignr m3, m7, 3
10582 palignr m4, m7, 2
10583 mova m5, m4
10584 palignr m6, m7, 1
10585 PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16
10586
10587 ; Row[24 - 31]
10588 movu m7, [r2 - 25]
10589 palignr m0, m7, 5
10590 palignr m1, m7, 4
10591 mova m2, m1
10592 palignr m3, m7, 3
10593 palignr m4, m7, 2
10594 palignr m5, m7, 1
10595 mova m6, m7
10596 PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0
10597
; next pass: destination 8 rows further down, reference 8 bytes further on
10598 lea r0, [r6 + r1 * 4]
10599 lea r6, [r6 + r1 * 8]
10600 add r2, 8
10601 dec byte [rsp + 63]
10602 jnz .loop
; restore the caller's stack pointer saved in the prologue
10603 mov rsp, [rsp+64]
10604
10605 RET
10606
; intra_pred_ang32_18 (AVX2)
; r0 = destination, r1 = stride, r2 = reference pixels.
; No arithmetic: row 0 is the 32 bytes at [r2]; every following row is the
; previous row moved up by one byte, with the vacated low byte filled from the
; byte-reversed pixels loaded at [r2 + 65] (rows 1-16) and [r2 + 81] (rows 17-31).
; palignr on ymm registers works within each 128-bit lane, which is why the
; source registers are assembled with vinserti128 so that each lane pair
; (high operand lane : low operand lane) holds 32 consecutive bytes.
10607 INIT_YMM avx2
10608 cglobal intra_pred_ang32_18, 4, 4, 3
10609 movu m0, [r2]
10610 movu xm1, [r2 + 1 + 64]
; reverse the 16 bytes, then build m1 = { low lane: reversed, high lane: low half of m0 }
10611 pshufb xm1, [intra_pred_shuff_15_0]
10612 mova xm2, xm0
10613 vinserti128 m1, m1, xm2, 1
10614
10615 lea r3, [r1 * 3]
10616
; rows 0-3
10617 movu [r0], m0
10618 palignr m2, m0, m1, 15
10619 movu [r0 + r1], m2
10620 palignr m2, m0, m1, 14
10621 movu [r0 + r1 * 2], m2
10622 palignr m2, m0, m1, 13
10623 movu [r0 + r3], m2
10624
; rows 4-7
10625 lea r0, [r0 + r1 * 4]
10626 palignr m2, m0, m1, 12
10627 movu [r0], m2
10628 palignr m2, m0, m1, 11
10629 movu [r0 + r1], m2
10630 palignr m2, m0, m1, 10
10631 movu [r0 + r1 * 2], m2
10632 palignr m2, m0, m1, 9
10633 movu [r0 + r3], m2
10634
; rows 8-11
10635 lea r0, [r0 + r1 * 4]
10636 palignr m2, m0, m1, 8
10637 movu [r0], m2
10638 palignr m2, m0, m1, 7
10639 movu [r0 + r1], m2
10640 palignr m2, m0, m1, 6
10641 movu [r0 + r1 * 2], m2
10642 palignr m2, m0, m1, 5
10643 movu [r0 + r3], m2
10644
; rows 12-15
10645 lea r0, [r0 + r1 * 4]
10646 palignr m2, m0, m1, 4
10647 movu [r0], m2
10648 palignr m2, m0, m1, 3
10649 movu [r0 + r1], m2
10650 palignr m2, m0, m1, 2
10651 movu [r0 + r1 * 2], m2
10652 palignr m2, m0, m1, 1
10653 movu [r0 + r3], m2
10654
; row 16 is m1 itself
10655 lea r0, [r0 + r1 * 4]
10656 movu [r0], m1
10657
; second half: m0 = { low lane: reversed bytes from [r2 + 81], high lane: low lane of m1 }
10658 movu xm0, [r2 + 64 + 17]
10659 pshufb xm0, [intra_pred_shuff_15_0]
10660 vinserti128 m0, m0, xm1, 1
10661
; rows 17-19
10662 palignr m2, m1, m0, 15
10663 movu [r0 + r1], m2
10664 palignr m2, m1, m0, 14
10665 movu [r0 + r1 * 2], m2
10666 palignr m2, m1, m0, 13
10667 movu [r0 + r3], m2
10668
; rows 20-23
10669 lea r0, [r0 + r1 * 4]
10670 palignr m2, m1, m0, 12
10671 movu [r0], m2
10672 palignr m2, m1, m0, 11
10673 movu [r0 + r1], m2
10674 palignr m2, m1, m0, 10
10675 movu [r0 + r1 * 2], m2
10676 palignr m2, m1, m0, 9
10677 movu [r0 + r3], m2
10678
; rows 24-27
10679 lea r0, [r0 + r1 * 4]
10680 palignr m2, m1, m0, 8
10681 movu [r0], m2
10682 palignr m2, m1, m0, 7
10683 movu [r0 + r1], m2
10684 palignr m2, m1, m0,6
10685 movu [r0 + r1 * 2], m2
10686 palignr m2, m1, m0, 5
10687 movu [r0 + r3], m2
10688
; rows 28-31
10689 lea r0, [r0 + r1 * 4]
10690 palignr m2, m1, m0, 4
10691 movu [r0], m2
10692 palignr m2, m1, m0, 3
10693 movu [r0 + r1], m2
10694 palignr m2, m1, m0,2
10695 movu [r0 + r1 * 2], m2
10696 palignr m2, m1, m0, 1
10697 movu [r0 + r3], m2
10698 RET
10699
; intra_pred_ang32_18 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels (r2 is reused as
; 2 * stride once the four input vectors are loaded).
; No arithmetic: row 0 is the 32 bytes at [r2]; every following row is the
; previous row moved up by one byte, with the low byte filled from the
; byte-reversed pixels loaded at [r2 + 65 .. r2 + 96]. Each row is written as
; two 16-byte halves.
10700 INIT_XMM sse4
10701 cglobal intra_pred_ang32_18, 4,5,5
10702 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
10703 movu m1, [r2 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
10704 movu m2, [r2 + 1 + 64] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
10705 movu m3, [r2 + 17 + 64] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
10706
; r2 = 2 * stride, r3 = 3 * stride, r4 = 4 * stride
10707 lea r2, [r1 * 2]
10708 lea r3, [r1 * 3]
10709 lea r4, [r1 * 4]
10710
; row 0
10711 movu [r0], m0
10712 movu [r0 + 16], m1
10713
10714 pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
10715 pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]
10716
; rows 1-3
10717 palignr m4, m0, m2, 15
10718 movu [r0 + r1], m4
10719 palignr m4, m1, m0, 15
10720 movu [r0 + r1 + 16], m4
10721 palignr m4, m0, m2, 14
10722 movu [r0 + r2], m4
10723 palignr m4, m1, m0, 14
10724 movu [r0 + r2 + 16], m4
10725 palignr m4, m0, m2, 13
10726 movu [r0 + r3], m4
10727 palignr m4, m1, m0, 13
10728 movu [r0 + r3 + 16], m4
10729
; rows 4-7
10730 lea r0, [r0 + r4]
10731
10732 palignr m4, m0, m2, 12
10733 movu [r0], m4
10734 palignr m4, m1, m0, 12
10735 movu [r0 + 16], m4
10736 palignr m4, m0, m2, 11
10737 movu [r0 + r1], m4
10738 palignr m4, m1, m0, 11
10739 movu [r0 + r1 + 16], m4
10740 palignr m4, m0, m2, 10
10741 movu [r0 + r2], m4
10742 palignr m4, m1, m0, 10
10743 movu [r0 + r2 + 16], m4
10744 palignr m4, m0, m2, 9
10745 movu [r0 + r3], m4
10746 palignr m4, m1, m0, 9
10747 movu [r0 + r3 + 16], m4
10748
; rows 8-11
10749 lea r0, [r0 + r4]
10750
10751 palignr m4, m0, m2, 8
10752 movu [r0], m4
10753 palignr m4, m1, m0, 8
10754 movu [r0 + 16], m4
10755 palignr m4, m0, m2, 7
10756 movu [r0 + r1], m4
10757 palignr m4, m1, m0, 7
10758 movu [r0 + r1 + 16], m4
10759 palignr m4, m0, m2, 6
10760 movu [r0 + r2], m4
10761 palignr m4, m1, m0, 6
10762 movu [r0 + r2 + 16], m4
10763 palignr m4, m0, m2, 5
10764 movu [r0 + r3], m4
10765 palignr m4, m1, m0, 5
10766 movu [r0 + r3 + 16], m4
10767
; rows 12-15
10768 lea r0, [r0 + r4]
10769
10770 palignr m4, m0, m2, 4
10771 movu [r0], m4
10772 palignr m4, m1, m0, 4
10773 movu [r0 + 16], m4
10774 palignr m4, m0, m2, 3
10775 movu [r0 + r1], m4
10776 palignr m4, m1, m0, 3
10777 movu [r0 + r1 + 16], m4
10778 palignr m4, m0, m2, 2
10779 movu [r0 + r2], m4
10780 palignr m4, m1, m0, 2
10781 movu [r0 + r2 + 16], m4
10782 palignr m4, m0, m2, 1
10783 movu [r0 + r3], m4
10784 palignr m4, m1, m0, 1
10785 movu [r0 + r3 + 16], m4
10786
; rows 16-19: the window has moved a full 16 bytes, so the pairs are now m2:m3 and m0:m2
10787 lea r0, [r0 + r4]
10788
10789 movu [r0], m2
10790 movu [r0 + 16], m0
10791 palignr m4, m2, m3, 15
10792 movu [r0 + r1], m4
10793 palignr m4, m0, m2, 15
10794 movu [r0 + r1 + 16], m4
10795 palignr m4, m2, m3, 14
10796 movu [r0 + r2], m4
10797 palignr m4, m0, m2, 14
10798 movu [r0 + r2 + 16], m4
10799 palignr m4, m2, m3, 13
10800 movu [r0 + r3], m4
10801 palignr m4, m0, m2, 13
10802 movu [r0 + r3 + 16], m4
10803
; rows 20-23
10804 lea r0, [r0 + r4]
10805
10806 palignr m4, m2, m3, 12
10807 movu [r0], m4
10808 palignr m4, m0, m2, 12
10809 movu [r0 + 16], m4
10810 palignr m4, m2, m3, 11
10811 movu [r0 + r1], m4
10812 palignr m4, m0, m2, 11
10813 movu [r0 + r1 + 16], m4
10814 palignr m4, m2, m3, 10
10815 movu [r0 + r2], m4
10816 palignr m4, m0, m2, 10
10817 movu [r0 + r2 + 16], m4
10818 palignr m4, m2, m3, 9
10819 movu [r0 + r3], m4
10820 palignr m4, m0, m2, 9
10821 movu [r0 + r3 + 16], m4
10822
; rows 24-27
10823 lea r0, [r0 + r4]
10824
10825 palignr m4, m2, m3, 8
10826 movu [r0], m4
10827 palignr m4, m0, m2, 8
10828 movu [r0 + 16], m4
10829 palignr m4, m2, m3, 7
10830 movu [r0 + r1], m4
10831 palignr m4, m0, m2, 7
10832 movu [r0 + r1 + 16], m4
10833 palignr m4, m2, m3, 6
10834 movu [r0 + r2], m4
10835 palignr m4, m0, m2, 6
10836 movu [r0 + r2 + 16], m4
10837 palignr m4, m2, m3, 5
10838 movu [r0 + r3], m4
10839 palignr m4, m0, m2, 5
10840 movu [r0 + r3 + 16], m4
10841
; rows 28-31
10842 lea r0, [r0 + r4]
10843
10844 palignr m4, m2, m3, 4
10845 movu [r0], m4
10846 palignr m4, m0, m2, 4
10847 movu [r0 + 16], m4
10848 palignr m4, m2, m3, 3
10849 movu [r0 + r1], m4
10850 palignr m4, m0, m2, 3
10851 movu [r0 + r1 + 16], m4
10852 palignr m4, m2, m3, 2
10853 movu [r0 + r2], m4
10854 palignr m4, m0, m2, 2
10855 movu [r0 + r2 + 16], m4
10856 palignr m4, m2, m3, 1
10857 movu [r0 + r3], m4
10858 palignr m4, m0, m2, 1
10859 movu [r0 + r3 + 16], m4
10860 RET
10861
; intra_pred_ang32_19 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Same structure as the mode-17 routine with the two reference halves swapped:
; the shuffled bytes (c_mode32_17_0) come from [r2 + 64 ..] with byte 0 taken
; from [r2], and the 32 straight bytes come from [r2 + 1].
; PROC32_8x8 is invoked with second argument 0; within a pass r0 moves down
; 4 rows between groups, and each of the 4 passes moves 8 bytes to the right.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
10862 INIT_XMM sse4
10863 cglobal intra_pred_ang32_19, 4,7,8
10864 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
10865 mov r6, rsp
10866 sub rsp, 64+gprsize
10867 and rsp, ~63
10868 mov [rsp+64], r6
10869
10870 ; collect reference pixel
10871 movu m0, [r2 + 64]
10872 pinsrb m0, [r2], 0
10873 movu m1, [r2 + 16 + 64]
10874 pshufb m0, [c_mode32_17_0]
10875 pshufb m1, [c_mode32_17_0]
; the second store overlaps the first so the selected bytes end up contiguous
10876 mova [rsp ], m1
10877 movu [rsp + 13], m0
10878 movu m0, [r2 + 1]
10879 movu m1, [r2 + 1 + 16]
10880 movu [rsp + 26], m0
10881 movu [rsp + 26 + 16], m1
10882 mov [rsp + 63], byte 4
10883
10884 ; filter
; r2 is set one byte before the copied block that starts at rsp + 26
10885 lea r2, [rsp + 25] ; r2 -> [0]
10886 lea r3, [c_shuf8_0] ; r3 -> shuffle8
10887 lea r4, [ang_table] ; r4 -> ang_table
10888 lea r5, [r1 * 3] ; r5 -> 3 * stride
10889 lea r6, [r0] ; r6 -> r0
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
10890 mova m5, [pw_1024] ; m5 -> 1024
10891 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
10892
10893 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
10894 ; Row[0 - 7]
10895 movu m7, [r2 - 6]
10896 palignr m0, m7, 6
10897 palignr m1, m7, 5
10898 palignr m2, m7, 4
10899 palignr m3, m7, 3
10900 palignr m4, m7, 2
10901 mova m5, m4
10902 palignr m6, m7, 1
10903 PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16
10904
10905 ; Row[8 - 15]
10906 movu m7, [r2 - 12]
10907 palignr m0, m7, 5
10908 palignr m1, m7, 4
10909 mova m2, m1
10910 palignr m3, m7, 3
10911 palignr m4, m7, 2
10912 palignr m5, m7, 1
10913 mova m6, m7
10914 lea r0, [r0 + r1 * 4]
10915 PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0
10916
10917 ; Row[16 - 23]
10918 movu m7, [r2 - 19]
10919 palignr m0, m7, 6
10920 palignr m1, m7, 5
10921 palignr m2, m7, 4
10922 palignr m3, m7, 3
10923 palignr m4, m7, 2
10924 mova m5, m4
10925 palignr m6, m7, 1
10926 lea r0, [r0 + r1 * 4]
10927 PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16
10928
10929 ; Row[24 - 31]
10930 movu m7, [r2 - 25]
10931 palignr m0, m7, 5
10932 palignr m1, m7, 4
10933 mova m2, m1
10934 palignr m3, m7, 3
10935 palignr m4, m7, 2
10936 palignr m5, m7, 1
10937 mova m6, m7
10938 lea r0, [r0 + r1 * 4]
10939 PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0
10940
; next pass: 8 bytes to the right in the destination, 8 bytes on in the reference
10941 add r6, 8
10942 mov r0, r6
10943 add r2, 8
10944 dec byte [rsp + 63]
10945 jnz .loop
; restore the caller's stack pointer saved in the prologue
10946 mov rsp, [rsp+64]
10947 RET
10948
; intra_pred_ang32_20 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Same structure as the mode-16 routine with the two reference halves swapped:
; the shuffled bytes (c_mode32_16_0) come from [r2 + 64 ..] with byte 0 taken
; from [r2], and the 32 straight bytes come from [r2 + 1].
; PROC32_8x8 is invoked with second argument 0; within a pass r0 moves down
; 4 rows between groups, and each of the 4 passes moves 8 bytes to the right.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
10949 INIT_XMM sse4
10950 cglobal intra_pred_ang32_20, 4,7,8
10951 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
10952 mov r6, rsp
10953 sub rsp, 64+gprsize
10954 and rsp, ~63
10955 mov [rsp+64], r6
10956
10957 ; collect reference pixel
10958 movu m0, [r2 + 64]
10959 pinsrb m0, [r2], 0
10960 movu m1, [r2 + 15 + 64]
10961 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
10962 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
; the second store overlaps the first so the selected bytes end up contiguous
10963 mova [rsp], m1
10964 movu [rsp + 10], m0
10965 movu m0, [r2 + 1]
10966 movu m1, [r2 + 1 + 16]
10967 movu [rsp + 21], m0
10968 movu [rsp + 21 + 16], m1
10969 mov [rsp + 63], byte 4
10970
10971 ; filter
10972 lea r2, [rsp + 21] ; r2 -> [0]
10973 lea r3, [c_shuf8_0] ; r3 -> shuffle8
10974 lea r4, [ang_table] ; r4 -> ang_table
10975 lea r5, [r1 * 3] ; r5 -> 3 * stride
10976 lea r6, [r0] ; r6 -> r0
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
10977 mova m5, [pw_1024] ; m5 -> 1024
10978 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
10979
10980 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
10981 ; Row[0 - 7]
10982 movu m7, [r2 - 6]
10983 palignr m0, m7, 5
10984 palignr m1, m7, 4
10985 mova m2, m1
10986 palignr m3, m7, 3
10987 palignr m4, m7, 2
10988 mova m5, m4
10989 palignr m6, m7, 1
10990 PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24
10991
10992 ; Row[8 - 15]
10993 movu m7, [r2 - 11]
10994 palignr m0, m7, 5
10995 palignr m1, m7, 4
10996 palignr m2, m7, 3
10997 mova m3, m2
10998 palignr m4, m7, 2
10999 palignr m5, m7, 1
11000 mova m6, m5
11001 lea r0, [r0 + r1 * 4]
11002 PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16
11003
11004 ; Row[16 - 23]
11005 movu m7, [r2 - 16]
11006 palignr m0, m7, 4
11007 mova m1, m0
11008 palignr m2, m7, 3
11009 palignr m3, m7, 2
11010 mova m4, m3
11011 palignr m5, m7, 1
11012 mova m6, m7
11013 lea r0, [r0 + r1 * 4]
11014 PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8
11015
11016 ; Row[24 - 31]
11017 movu m7, [r2 - 21]
11018 palignr m0, m7, 4
11019 palignr m1, m7, 3
11020 mova m2, m1
11021 palignr m3, m7, 2
11022 palignr m4, m7, 1
11023 mova m5, m4
11024 mova m6, m7
11025 lea r0, [r0 + r1 * 4]
11026 PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0
11027
; next pass: 8 bytes to the right in the destination, 8 bytes on in the reference
11028 add r6, 8
11029 mov r0, r6
11030 add r2, 8
11031 dec byte [rsp + 63]
11032 jnz .loop
; restore the caller's stack pointer saved in the prologue
11033 mov rsp, [rsp+64]
11034 RET
11035
; intra_pred_ang32_21 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Same structure as the mode-15 routine with the two reference halves swapped:
; the shuffled bytes (c_mode32_15_0) come from [r2 + 64 ..] with byte 0 taken
; from [r2], and the 32 straight bytes come from [r2 + 1].
; PROC32_8x8 is invoked with second argument 0; within a pass r0 moves down
; 4 rows between groups, and each of the 4 passes moves 8 bytes to the right.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
11036 INIT_XMM sse4
11037 cglobal intra_pred_ang32_21, 4,7,8
11038 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
11039 mov r6, rsp
11040 sub rsp, 64+gprsize
11041 and rsp, ~63
11042 mov [rsp+64], r6
11043
11044 ; collect reference pixel
11045 movu m0, [r2 + 64]
11046 pinsrb m0, [r2], 0
11047 movu m1, [r2 + 15 + 64]
11048 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
11049 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
; the second store overlaps the first so the selected bytes end up contiguous
11050 mova [rsp], m1
11051 movu [rsp + 8], m0
11052 movu m0, [r2 + 1]
11053 movu m1, [r2 + 1 + 16]
11054 movu [rsp + 17], m0
11055 movu [rsp + 17 + 16], m1
11056 mov [rsp + 63], byte 4
11057
11058 ; filter
11059 lea r2, [rsp + 17] ; r2 -> [0]
11060 lea r3, [c_shuf8_0] ; r3 -> shuffle8
11061 lea r4, [ang_table] ; r4 -> ang_table
11062 lea r5, [r1 * 3] ; r5 -> 3 * stride
11063 lea r6, [r0] ; r6 -> r0
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
11064 mova m5, [pw_1024] ; m5 -> 1024
11065 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
11066
11067 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
11068 ; Row[0 - 7]
11069 movu m7, [r2 - 5]
11070 palignr m0, m7, 4
11071 palignr m1, m7, 3
11072 mova m2, m1
11073 palignr m3, m7, 2
11074 mova m4, m3
11075 palignr m5, m7, 1
11076 mova m6, m5
11077 PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24
11078
11079 ; Row[8 - 15]
11080 movu m7, [r2 - 9]
11081 palignr m0, m7, 4
11082 palignr m1, m7, 3
11083 mova m2, m1
11084 palignr m3, m7, 2
11085 mova m4, m3
11086 palignr m5, m7, 1
11087 mova m6, m5
11088 lea r0, [r0 + r1 * 4]
11089 PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16
11090
11091 ; Row[16 - 23]
11092 movu m7, [r2 - 13]
11093 palignr m0, m7, 3
11094 mova m1, m0
11095 palignr m2, m7, 2
11096 mova m3, m2
11097 palignr m4, m7, 1
11098 mova m5, m4
11099 mova m6, m7
11100 lea r0, [r0 + r1 * 4]
11101 PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8
11102
11103 ; Row[24 - 31]
11104 movu m7, [r2 - 17]
11105 palignr m0, m7, 3
11106 mova m1, m0
11107 palignr m2, m7, 2
11108 mova m3, m2
11109 palignr m4, m7, 1
11110 mova m5, m4
11111 mova m6, m7
11112 lea r0, [r0 + r1 * 4]
11113 PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0
11114
; next pass: 8 bytes to the right in the destination, 8 bytes on in the reference
11115 add r6, 8
11116 mov r0, r6
11117 add r2, 8
11118 dec byte [rsp + 63]
11119 jnz .loop
; restore the caller's stack pointer saved in the prologue
11120 mov rsp, [rsp+64]
11121 RET
11122
; intra_pred_ang32_22 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Same structure as the mode-14 routine with the two reference halves swapped:
; the shuffled bytes (c_mode32_14_0) come from [r2 + 64 ..] with byte 0 taken
; from [r2], and the 32 straight bytes come from [r2 + 1].
; PROC32_8x8 is invoked with second argument 0; within a pass r0 moves down
; 4 rows between groups, and each of the 4 passes moves 8 bytes to the right.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
11123 INIT_XMM sse4
11124 cglobal intra_pred_ang32_22, 4,7,8
11125 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
11126 mov r6, rsp
11127 sub rsp, 64+gprsize
11128 and rsp, ~63
11129 mov [rsp+64], r6
11130
11131 ; collect reference pixel
11132 movu m0, [r2 + 64]
11133 pinsrb m0, [r2], 0
11134 movu m1, [r2 + 15 + 64]
11135 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
11136 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
11137 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
11138 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
11139 mova [rsp], m0
11140 movu m0, [r2 + 1]
11141 movu m1, [r2 + 1 + 16]
11142 movu [rsp + 13], m0
11143 movu [rsp + 13 + 16], m1
11144 mov [rsp + 63], byte 4
11145
11146 ; filter
11147 lea r2, [rsp + 13] ; r2 -> [0]
11148 lea r3, [c_shuf8_0] ; r3 -> shuffle8
11149 lea r4, [ang_table] ; r4 -> ang_table
11150 lea r5, [r1 * 3] ; r5 -> 3 * stride
11151 lea r6, [r0] ; r6 -> r0
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
11152 mova m5, [pw_1024] ; m5 -> 1024
11153 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
11154
11155 .loop:
; m0..m7 hold byte-shifted views of one 16-byte window for PROC32_8x8
11156 ; Row[0 - 7]
11157 movu m7, [r2 - 4]
11158 palignr m0, m7, 3
11159 mova m1, m0
11160 palignr m2, m7, 2
11161 mova m3, m2
11162 palignr m4, m7, 1
11163 mova m5, m4
11164 mova m6, m4
11165 PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24
11166
11167 ; Row[8 - 15]
11168 movu m7, [r2 - 7]
11169 palignr m0, m7, 3
11170 palignr m1, m7, 2
11171 mova m2, m1
11172 mova m3, m1
11173 palignr m4, m7, 1
11174 mova m5, m4
11175 mova m6, m7
11176 lea r0, [r0 + r1 * 4]
11177 PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16
11178
11179 ; Row[16 - 23]
11180 movu m7, [r2 - 10]
11181 palignr m0, m7, 3
11182 palignr m1, m7, 2
11183 mova m2, m1
11184 palignr m3, m7, 1
11185 mova m4, m3
11186 mova m5, m3
11187 mova m6, m7
11188 lea r0, [r0 + r1 * 4]
11189 PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8
11190
11191 ; Row[24 - 31]
11192 movu m7, [r2 - 13]
11193 palignr m0, m7, 2
11194 mova m1, m0
11195 mova m2, m0
11196 palignr m3, m7, 1
11197 mova m4, m3
11198 mova m5, m7
11199 mova m6, m7
11200 lea r0, [r0 + r1 * 4]
11201 PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0
11202
; next pass: 8 bytes to the right in the destination, 8 bytes on in the reference
11203 add r6, 8
11204 mov r0, r6
11205 add r2, 8
11206 dec byte [rsp + 63]
11207 jnz .loop
; restore the caller's stack pointer saved in the prologue
11208 mov rsp, [rsp+64]
11209 RET
11210
; intra_pred_ang32_23 (SSE4)
; Built from MODE_13_23_ROW0 followed by three MODE_13_23 passes, all with the
; first macro argument set to 0.
; r0 = destination, r1 = stride, r2 = reference pixels (read directly, no
; +64 offset applied to r2 here).
; One mmsize stack slot is reserved and named `above` for the ROW0 macro.
; After each pass the destination moves 8 bytes to the right (via r6).
11211 INIT_XMM sse4
11212 cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
11213 %define above [rsp + 0 * mmsize]
; r3 -> second 64-byte half of the reference buffer, live only during the ROW0 macro
11214 lea r3, [r2 + 64]
11215 lea r4, [ang_table + 16 * 16]
11216 lea r5, [r1 * 3] ; r5 -> 3 * stride
11217 mov r6, r0
11218 mova m7, [pw_1024]
11219
11220 MODE_13_23_ROW0 0
11221 add r6, 8
11222 mov r0, r6
11223 add r2, 7
; r3 is reused as the loop counter; MODE_13_23 0, 0 never reads [r3]
11224 mov r3, 3
11225 .loop:
11226 MODE_13_23 0, 0
11227 add r6, 8
11228 mov r0, r6
11229 add r2, 8
11230 dec r3
11231 jnz .loop
11232 RET
11233
; intra_pred_ang32_24 (SSE4)
; Built from MODE_12_24_ROW0 followed by three MODE_12_24 passes, all with the
; macro argument set to 0.
; r0 = destination, r1 = stride, r2 = reference pixels.
; One mmsize stack slot is reserved and named `above`
; (NOTE(review): presumably consumed by MODE_12_24_ROW0 - confirm).
; After each pass the destination moves 8 bytes to the right (via r6).
11234 INIT_XMM sse4
11235 cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
11236 %define above [rsp + 0 * mmsize]
; r3 -> second 64-byte half of the reference buffer until it becomes the loop counter
11237 lea r3, [r2 + 64]
11238 lea r4, [ang_table + 16 * 16]
11239 lea r5, [r1 * 3] ; r5 -> 3 * stride
11240 mov r6, r0
11241 mova m7, [pw_1024]
11242
11243 MODE_12_24_ROW0 0
11244 add r6, 8
11245 mov r0, r6
11246 add r2, 7
11247 mov r3, 3
11248 .loop:
11249 MODE_12_24 0
11250 add r6, 8
11251 mov r0, r6
11252 add r2, 8
11253 dec r3
11254 jnz .loop
11255 RET
11256
; intra_pred_ang32_25 (SSE4)
; r0 = destination, r1 = stride, r2 = reference pixels.
; Scratch buffer on the 64-byte-aligned stack: 16 copies of the byte at
; [r2 + 80] (only the first stays, the rest is overwritten), followed from
; offset 1 by 48 bytes copied from [r2].
; Each of the 4 passes feeds the same window to all eight registers: the
; window at r2 for the first two groups and the one at r2 - 1 for the last two.
; PROC32_8x8 is invoked with second argument 0; each pass moves 8 bytes right.
; Stack layout: [rsp + 63] = loop counter byte, [rsp + 64] = caller's rsp.
11257 INIT_XMM sse4
11258 cglobal intra_pred_ang32_25, 4,7,8
11259 ; NOTE: align the stack to 64 bytes so that all local data lies in one cache line
11260 mov r6, rsp
11261 sub rsp, 64+gprsize
11262 and rsp, ~63
11263 mov [rsp+64], r6
11264
11265 ; collect reference pixel
11266 movu m0, [r2 + 16 + 64]
; pshufb with an all-zero mask broadcasts byte 0 of m0 to every byte
11267 pxor m1, m1
11268 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
11269 mova [rsp], m0
11270 movu m0, [r2]
11271 movu m1, [r2 + 16]
11272 movu m2, [r2 + 32]
11273 movu [rsp + 1], m0
11274 movu [rsp + 1 + 16], m1
11275 movu [rsp + 1 + 32], m2
11276 mov [rsp + 63], byte 4
11277
11278 ; filter
11279 lea r2, [rsp + 1] ; r2 -> [0]
11280 lea r3, [c_shuf8_0] ; r3 -> shuffle8
11281 lea r4, [ang_table] ; r4 -> ang_table
11282 lea r5, [r1 * 3] ; r5 -> 3 * stride
11283 lea r6, [r0] ; r6 -> r0
; NOTE(review): m5 and m6 are written again at the top of .loop before the
; first PROC32_8x8, so these two preloads may be unnecessary - confirm
11284 mova m5, [pw_1024] ; m5 -> 1024
11285 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
11286
11287 .loop:
11288 ; Row[0 - 7]
11289 movu m7, [r2]
11290 mova m0, m7
11291 mova m1, m7
11292 mova m2, m7
11293 mova m3, m7
11294 mova m4, m7
11295 mova m5, m7
11296 mova m6, m7
11297 PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16
11298
11299 ; Row[8 - 15]
11300 movu m7, [r2]
11301 mova m0, m7
11302 mova m1, m7
11303 mova m2, m7
11304 mova m3, m7
11305 mova m4, m7
11306 mova m5, m7
11307 mova m6, m7
11308 lea r0, [r0 + r1 * 4]
11309 PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0
11310
11311 ; Row[16 - 23]
11312 movu m7, [r2 - 1]
11313 mova m0, m7
11314 mova m1, m7
11315 mova m2, m7
11316 mova m3, m7
11317 mova m4, m7
11318 mova m5, m7
11319 mova m6, m7
11320 lea r0, [r0 + r1 * 4]
11321 PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16
11322
11323 ; Row[24 - 31]
11324 movu m7, [r2 - 1]
11325 mova m0, m7
11326 mova m1, m7
11327 mova m2, m7
11328 mova m3, m7
11329 mova m4, m7
11330 mova m5, m7
11331 mova m6, m7
11332 lea r0, [r0 + r1 * 4]
11333 PROC32_8x8 3, 0, 14,12,10,8,6,4,2,0
11334
; next pass: 8 bytes to the right in the destination, 8 bytes on in the reference
11335 add r6, 8
11336 mov r0, r6
11337 add r2, 8
11338 dec byte [rsp + 63]
11339 jnz .loop
; restore the caller's stack pointer saved in the prologue
11340 mov rsp, [rsp+64]
11341 RET
11342
; intra_pred_ang32_26 (SSE4)
; Vertical copy: every one of the 32 rows receives the 32 bytes at [r2 + 1].
; r0 = destination, r1 = stride, r2 = reference pixels, r4 (5th argument) =
; flag selecting the optional first-column filter; it is moved to r3d so that
; r4 can hold 3 * stride.
; The block is produced in two passes of 16 columns (r6 counts the passes).
; Two mmsize stack slots, named m8 and m9, hold:
;   m8 = 16 bytes from [r2 + 64] with byte 0 replaced by the byte at [r2]
;   m9 = 16 bytes from [r2 + 65]
; Optional filter, per pass: for rows 0..15 the byte at column 0 of the current
; 16-column slice becomes sat8(a + ((m9[row] - m8[0]) >> 1)), where a is
; byte 0 of the row data.
; NOTE(review): m8/m9 are not reloaded on the second pass although r0 and r2
; have advanced by 16 - confirm the filter path is meant to run on both passes.
11343 INIT_XMM sse4
11344 cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize)
11345 %define m8 [rsp + 0 * mmsize]
11346 %define m9 [rsp + 1 * mmsize]
11347 mov r6, 2
11348 movu m0, [r2 + 64]
11349 pinsrb m0, [r2], 0
11350 movu m1, [r2 + 1 + 64]
11351 mova m8, m0
11352 mova m9, m1
11353 mov r3d, r4d
11354 lea r4, [r1 * 3]
11355
11356 .loop:
11357 movu m0, [r2 + 1]
11358
; rows 0-3
11359 movu [r0], m0
11360 movu [r0 + r1], m0
11361 movu [r0 + r1 * 2], m0
11362 movu [r0 + r4], m0
; rows 4-7
11363 lea r5, [r0 + r1 * 4]
11364 movu [r5], m0
11365 movu [r5 + r1], m0
11366 movu [r5 + r1 * 2], m0
11367 movu [r5 + r4], m0
; rows 8-11
11368 lea r5, [r5 + r1 * 4]
11369 movu [r5], m0
11370 movu [r5 + r1], m0
11371 movu [r5 + r1 * 2], m0
11372 movu [r5 + r4], m0
; rows 12-15
11373 lea r5, [r5 + r1 * 4]
11374 movu [r5], m0
11375 movu [r5 + r1], m0
11376 movu [r5 + r1 * 2], m0
11377 movu [r5 + r4], m0
; rows 16-19 (continue from r5; the former reset to r0 + 4 * stride only
; re-stored rows 4-15 with the same data)
11393 lea r5, [r5 + r1 * 4]
11394 movu [r5], m0
11395 movu [r5 + r1], m0
11396 movu [r5 + r1 * 2], m0
11397 movu [r5 + r4], m0
; rows 20-23
11398 lea r5, [r5 + r1 * 4]
11399 movu [r5], m0
11400 movu [r5 + r1], m0
11401 movu [r5 + r1 * 2], m0
11402 movu [r5 + r4], m0
; rows 24-27
11403 lea r5, [r5 + r1 * 4]
11404 movu [r5], m0
11405 movu [r5 + r1], m0
11406 movu [r5 + r1 * 2], m0
11407 movu [r5 + r4], m0
; rows 28-31
11408 lea r5, [r5 + r1 * 4]
11409 movu [r5], m0
11410 movu [r5 + r1], m0
11411 movu [r5 + r1 * 2], m0
11412 movu [r5 + r4], m0
11413
11414 ; filter
11415 cmp r3d, byte 0
11416 jz .quit
11417
; m0/m1 = byte 0 of the row data broadcast and widened to words
11418 pxor m4, m4
11419 pshufb m0, m4
11420 pmovzxbw m0, m0
11421 mova m1, m0
11422 movu m2, m8
11423 movu m3, m9
11424
; m2 = byte 0 of m8 broadcast to words; m3/m4 = low/high 8 bytes of m9 as words
11425 pshufb m2, m4
11426 pmovzxbw m2, m2
11427 movhlps m4, m3
11428 pmovzxbw m3, m3
11429 pmovzxbw m4, m4
; result = a + ((m9[i] - m8[0]) >> 1), packed back to bytes with unsigned saturation
11430 psubw m3, m2
11431 psubw m4, m2
11432 psraw m3, 1
11433 psraw m4, 1
11434 paddw m0, m3
11435 paddw m1, m4
11436 packuswb m0, m1
11437
; write one filtered byte per row into column 0 of rows 0..15
11438 pextrb [r0], m0, 0
11439 pextrb [r0 + r1], m0, 1
11440 pextrb [r0 + r1 * 2], m0, 2
11441 pextrb [r0 + r4], m0, 3
11442 lea r5, [r0 + r1 * 4]
11443 pextrb [r5], m0, 4
11444 pextrb [r5 + r1], m0, 5
11445 pextrb [r5 + r1 * 2], m0, 6
11446 pextrb [r5 + r4], m0, 7
11447 lea r5, [r5 + r1 * 4]
11448 pextrb [r5], m0, 8
11449 pextrb [r5 + r1], m0, 9
11450 pextrb [r5 + r1 * 2], m0, 10
11451 pextrb [r5 + r4], m0, 11
11452 lea r5, [r5 + r1 * 4]
11453 pextrb [r5], m0, 12
11454 pextrb [r5 + r1], m0, 13
11455 pextrb [r5 + r1 * 2], m0, 14
11456 pextrb [r5 + r4], m0, 15
11457
11458 .quit:
; advance to the next 16 columns of both the reference and the destination
11459 lea r2, [r2 + 16]
11460 add r0, 16
11461 dec r6d
11462 jnz .loop
11463 RET
11464
; intra_pred_ang32_27 (SSE4): runs MODE_9_27 four times, stepping the
; destination (r0) and the source (r2) by 8 bytes between runs.
; In: r0 = destination, r1 = stride, r2 = source.
; MODE_9_27 is defined elsewhere in this file; presumably it fills an 8-byte-wide
; strip per run and consumes r3/r5/m7 as set up here - verify against the macro.
11465 INIT_XMM sse4
11466 cglobal intra_pred_ang32_27, 3,7,8
11467 mov r6, r0 ; r6 remembers the strip base, r0 is reloaded from it
11468 lea r5, [r1 * 3] ; r5 = 3 * stride
11469 mova m7, [pw_1024]
11470 lea r3, [ang_table + 16 * 16]
11471 mov r4d, 4 ; four runs
11472 .loop:
11473 MODE_9_27 0
11474 add r2, 8 ; advance source
11475 add r6, 8 ; advance strip base ...
11476 mov r0, r6 ; ... and restart r0 from it
11477 dec r4
11478 jnz .loop
11479 RET
11480
; intra_pred_ang32_28 (SSE4): runs MODE_8_28 four times, stepping the
; destination (r0) and the source (r2) by 8 bytes between runs.
; In: r0 = destination, r1 = stride, r2 = source.
; MODE_8_28 is defined elsewhere in this file; presumably it fills an 8-byte-wide
; strip per run and consumes r3/r5/m7 as set up here - verify against the macro.
11481 INIT_XMM sse4
11482 cglobal intra_pred_ang32_28, 3,7,8
11483 mov r6, r0 ; r6 remembers the strip base, r0 is reloaded from it
11484 lea r5, [r1 * 3] ; r5 = 3 * stride
11485 mova m7, [pw_1024]
11486 lea r3, [ang_table + 16 * 16]
11487 mov r4d, 4 ; four runs
11488 .loop:
11489 MODE_8_28 0
11490 add r2, 8 ; advance source
11491 add r6, 8 ; advance strip base ...
11492 mov r0, r6 ; ... and restart r0 from it
11493 dec r4
11494 jnz .loop
11495 RET
11496
; intra_pred_ang32_29 (SSE4): runs MODE_7_29 four times, stepping the
; destination (r0) and the source (r2) by 8 bytes between runs.
; In: r0 = destination, r1 = stride, r2 = source.
; MODE_7_29 is defined elsewhere in this file; presumably it fills an 8-byte-wide
; strip per run and consumes r3/r5/m7 as set up here - verify against the macro.
11497 INIT_XMM sse4
11498 cglobal intra_pred_ang32_29, 3,7,8
11499 mov r6, r0 ; r6 remembers the strip base, r0 is reloaded from it
11500 lea r5, [r1 * 3] ; r5 = 3 * stride
11501 mova m7, [pw_1024]
11502 lea r3, [ang_table + 16 * 16]
11503 mov r4d, 4 ; four runs
11504 .loop:
11505 MODE_7_29 0
11506 add r2, 8 ; advance source
11507 add r6, 8 ; advance strip base ...
11508 mov r0, r6 ; ... and restart r0 from it
11509 dec r4
11510 jnz .loop
11511 RET
11512
; intra_pred_ang32_30 (SSE4): runs MODE_6_30 four times, stepping the
; destination (r0) and the source (r2) by 8 bytes between runs.
; In: r0 = destination, r1 = stride, r2 = source.
; MODE_6_30 is defined elsewhere in this file; presumably it fills an 8-byte-wide
; strip per run and consumes r3/r5/m7 as set up here - verify against the macro.
11513 INIT_XMM sse4
11514 cglobal intra_pred_ang32_30, 3,7,8
11515 mov r6, r0 ; r6 remembers the strip base, r0 is reloaded from it
11516 lea r5, [r1 * 3] ; r5 = 3 * stride
11517 mova m7, [pw_1024]
11518 lea r3, [ang_table + 16 * 16]
11519 mov r4d, 4 ; four runs
11520 .loop:
11521 MODE_6_30 0
11522 add r2, 8 ; advance source
11523 add r6, 8 ; advance strip base ...
11524 mov r0, r6 ; ... and restart r0 from it
11525 dec r4
11526 jnz .loop
11527 RET
11528
; intra_pred_ang32_31 (SSE4): runs MODE_5_31 four times, stepping the
; destination (r0) and the source (r2) by 8 bytes between runs.
; In: r0 = destination, r1 = stride, r2 = source.
; MODE_5_31 is defined elsewhere in this file; presumably it fills an 8-byte-wide
; strip per run and consumes r3/r5/m7 as set up here - verify against the macro.
11529 INIT_XMM sse4
11530 cglobal intra_pred_ang32_31, 3,7,8
11531 mov r6, r0 ; r6 remembers the strip base, r0 is reloaded from it
11532 lea r5, [r1 * 3] ; r5 = 3 * stride
11533 mova m7, [pw_1024]
11534 lea r3, [ang_table + 16 * 16]
11535 mov r4d, 4 ; four runs
11536 .loop:
11537 MODE_5_31 0
11538 add r2, 8 ; advance source
11539 add r6, 8 ; advance strip base ...
11540 mov r0, r6 ; ... and restart r0 from it
11541 dec r4
11542 jnz .loop
11543 RET
11544
; intra_pred_ang32_32 (SSE4): runs MODE_4_32 four times, stepping the
; destination (r0) and the source (r2) by 8 bytes between runs.
; In: r0 = destination, r1 = stride, r2 = source.
; MODE_4_32 is defined elsewhere in this file; presumably it fills an 8-byte-wide
; strip per run and consumes r3/r5/m7 as set up here - verify against the macro.
11545 INIT_XMM sse4
11546 cglobal intra_pred_ang32_32, 3,7,8
11547 mov r6, r0 ; r6 remembers the strip base, r0 is reloaded from it
11548 lea r5, [r1 * 3] ; r5 = 3 * stride
11549 mova m7, [pw_1024]
11550 lea r3, [ang_table + 16 * 16]
11551 mov r4d, 4 ; four runs
11552 .loop:
11553 MODE_4_32 0
11554 add r2, 8 ; advance source
11555 add r6, 8 ; advance strip base ...
11556 mov r0, r6 ; ... and restart r0 from it
11557 dec r4
11558 jnz .loop
11559 RET
11560
; intra_pred_ang32_33 (SSE4): runs MODE_3_33 four times, stepping the
; destination (r0) and the source (r2) by 8 bytes between runs.
; In: r0 = destination, r1 = stride, r2 = source.
; MODE_3_33 is defined elsewhere in this file; presumably it fills an 8-byte-wide
; strip per run and consumes r3/r5/m7 as set up here - verify against the macro.
11561 INIT_XMM sse4
11562 cglobal intra_pred_ang32_33, 3,7,8
11563 mov r6, r0 ; r6 remembers the strip base, r0 is reloaded from it
11564 lea r5, [r1 * 3] ; r5 = 3 * stride
11565 mova m7, [pw_1024]
11566 lea r3, [ang_table + 16 * 16]
11567 mov r4d, 4 ; four runs
11568 .loop:
11569 MODE_3_33 0
11570 add r2, 8 ; advance source
11571 add r6, 8 ; advance strip base ...
11572 mov r0, r6 ; ... and restart r0 from it
11573 dec r4
11574 jnz .loop
11575 RET
11576
11577 ;-----------------------------------------------------------------------------------------
11578 ; start of intra_pred_ang32 angular modes avx2 asm
11579 ;-----------------------------------------------------------------------------------------
11580
11581 %if ARCH_X86_64 == 1
11582 INIT_YMM avx2
11583
11584 ; TRANSPOSE_32x8_AVX2: store eight 32-byte vectors, either transposed or as plain rows.
11585 ; %1-%8 - registers holding the eight vectors (overwritten on the transposed path)
11586 ; %9 - temp register
11587 ; %10 - suffix that keeps the .skip/.end labels unique per invocation
; Flag contract: consumes ZF left by an earlier "test r7d, r7d" in the caller.
;   ZF = 1 -> transposed path: writes 32 rows of 8 bytes each
;   ZF = 0 -> .skip path: writes 8 rows of 32 bytes each
; The macro itself only uses SIMD ops, mov-type stores, lea and jmp, so a second
; invocation sees the same flags unless the caller changes them in between.
; Registers: r0 = output pointer (advanced), r1 = stride, r5 = 3 * stride,
; r6 = 4 * stride, r4 = base pointer (read on the transposed path only).
11588 %macro TRANSPOSE_32x8_AVX2 10
11589 jnz .skip%10 ; ZF clear -> store the vectors as rows
11590
11591 ; transpose 8x32 to 32x8 and then store
; step 1: interleave bytes of neighbouring vectors
11592 punpcklbw m%9, m%1, m%2
11593 punpckhbw m%1, m%2
11594 punpcklbw m%2, m%3, m%4
11595 punpckhbw m%3, m%4
11596 punpcklbw m%4, m%5, m%6
11597 punpckhbw m%5, m%6
11598 punpcklbw m%6, m%7, m%8
11599 punpckhbw m%7, m%8
11600
; step 2: interleave words
11601 punpcklwd m%8, m%9, m%2
11602 punpckhwd m%9, m%2
11603 punpcklwd m%2, m%4, m%6
11604 punpckhwd m%4, m%6
11605 punpcklwd m%6, m%1, m%3
11606 punpckhwd m%1, m%3
11607 punpcklwd m%3, m%5, m%7
11608 punpckhwd m%5, m%7
11609
; step 3: interleave dwords; each qword is now one 8-byte output row
11610 punpckldq m%7, m%8, m%2
11611 punpckhdq m%8, m%2
11612 punpckldq m%2, m%6, m%3
11613 punpckhdq m%6, m%3
11614 punpckldq m%3, m%9, m%4
11615 punpckhdq m%9, m%4
11616 punpckldq m%4, m%1, m%5
11617 punpckhdq m%1, m%5
11618
; low 128-bit lanes: output rows 0-15, four rows per group
11619 movq [r0 + r1 * 0], xm%7
11620 movhps [r0 + r1 * 1], xm%7
11621 movq [r0 + r1 * 2], xm%8
11622 movhps [r0 + r5 * 1], xm%8
11623
11624 lea r0, [r0 + r6] ; lea keeps the flags intact
11625
11626 movq [r0 + r1 * 0], xm%3
11627 movhps [r0 + r1 * 1], xm%3
11628 movq [r0 + r1 * 2], xm%9
11629 movhps [r0 + r5 * 1], xm%9
11630
11631 lea r0, [r0 + r6]
11632
11633 movq [r0 + r1 * 0], xm%2
11634 movhps [r0 + r1 * 1], xm%2
11635 movq [r0 + r1 * 2], xm%6
11636 movhps [r0 + r5 * 1], xm%6
11637
11638 lea r0, [r0 + r6]
11639
11640 movq [r0 + r1 * 0], xm%4
11641 movhps [r0 + r1 * 1], xm%4
11642 movq [r0 + r1 * 2], xm%1
11643 movhps [r0 + r5 * 1], xm%1
11644
11645 lea r0, [r0 + r6]
11646
; vpermq 00001110b copies the high 128-bit lane into the low lane so the same
; movq/movhps pattern can store output rows 16-31
11647 vpermq m%8, m%8, 00001110b
11648 vpermq m%7, m%7, 00001110b
11649 vpermq m%6, m%6, 00001110b
11650 vpermq m%3, m%3, 00001110b
11651 vpermq m%9, m%9, 00001110b
11652 vpermq m%2, m%2, 00001110b
11653 vpermq m%4, m%4, 00001110b
11654 vpermq m%1, m%1, 00001110b
11655
11656 movq [r0 + r1 * 0], xm%7
11657 movhps [r0 + r1 * 1], xm%7
11658 movq [r0 + r1 * 2], xm%8
11659 movhps [r0 + r5 * 1], xm%8
11660
11661 lea r0, [r0 + r6]
11662
11663 movq [r0 + r1 * 0], xm%3
11664 movhps [r0 + r1 * 1], xm%3
11665 movq [r0 + r1 * 2], xm%9
11666 movhps [r0 + r5 * 1], xm%9
11667
11668 lea r0, [r0 + r6]
11669
11670 movq [r0 + r1 * 0], xm%2
11671 movhps [r0 + r1 * 1], xm%2
11672 movq [r0 + r1 * 2], xm%6
11673 movhps [r0 + r5 * 1], xm%6
11674
11675 lea r0, [r0 + r6]
11676
11677 movq [r0 + r1 * 0], xm%4
11678 movhps [r0 + r1 * 1], xm%4
11679 movq [r0 + r1 * 2], xm%1
11680 movhps [r0 + r5 * 1], xm%1
11681
11682 lea r0, [r4 + 8] ; back to the top row, 8 bytes right of the base held in r4
11683 jmp .end%10
11684 .skip%10:
; row path: %1-%8 go to eight consecutive rows, 32 bytes each
11685 movu [r0 + r1 * 0], m%1
11686 movu [r0 + r1 * 1], m%2
11687 movu [r0 + r1 * 2], m%3
11688 movu [r0 + r5 * 1], m%4
11689
11690 lea r0, [r0 + r6]
11691
11692 movu [r0 + r1 * 0], m%5
11693 movu [r0 + r1 * 1], m%6
11694 movu [r0 + r1 * 2], m%7
11695 movu [r0 + r5 * 1], m%8
11696
11697 lea r0, [r0 + r6] ; r0 now points 8 rows further down
11698 .end%10:
11699 %endmacro
11700
;-----------------------------------------------------------------------------
; ang32_mode_3_33_row_0_15 (AVX2): shared body called twice by
; intra_pred_ang32_3 and intra_pred_ang32_33.
; Builds 16 vectors of 32 bytes from the bytes at r2 + 1 onward and passes them,
; eight at a time, to TRANSPOSE_32x8_AVX2.
; In:  r0 = output pointer, r1 = stride, r2 = source,
;      r3 = ang_table_avx2 + 16 * 32 (a bracketed comment [n] marks table row n,
;           addressed as r3 + (n - 16) * 32),
;      r4 = output base (needed only when r7d == 0), r5 = 3 * stride,
;      r6 = 4 * stride, r7d = 0 for transposed stores / non-zero for row stores,
;      m7 = pw_1024 (pmulhrsw by it rounds and shifts right by 5, assuming the
;           constant holds words of 1024)
; The ZF produced by "test" below is consumed by both TRANSPOSE_32x8_AVX2
; invocations; no instruction in between may modify the flags.
; Clobbers m0-m6, m8-m12 and r0.
;-----------------------------------------------------------------------------
11701 cglobal ang32_mode_3_33_row_0_15
11702 test r7d, r7d ; ZF selects the store layout inside TRANSPOSE_32x8_AVX2
11703 ; vectors 0 to 7 of this call
11704 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11705 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
11706 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
11707 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
11708
11709 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
11710 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
11711 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
11712
11713 pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
11714 pmulhrsw m4, m7
11715 pmaddubsw m1, m2, [r3 + 10 * 32]
11716 pmulhrsw m1, m7
11717 packuswb m4, m1
11718
11719 palignr m5, m2, m0, 2
11720 palignr m1, m3, m2, 2
11721 pmaddubsw m5, [r3 + 4 * 32] ; [20]
11722 pmulhrsw m5, m7
11723 pmaddubsw m1, [r3 + 4 * 32]
11724 pmulhrsw m1, m7
11725 packuswb m5, m1
11726
11727 palignr m6, m2, m0, 4
11728 palignr m1, m3, m2, 4
11729 pmaddubsw m6, [r3 - 2 * 32] ; [14]
11730 pmulhrsw m6, m7
11731 pmaddubsw m1, [r3 - 2 * 32]
11732 pmulhrsw m1, m7
11733 packuswb m6, m1
11734
11735 palignr m8, m2, m0, 6
11736 palignr m1, m3, m2, 6
11737 pmaddubsw m8, [r3 - 8 * 32] ; [8]
11738 pmulhrsw m8, m7
11739 pmaddubsw m1, [r3 - 8 * 32]
11740 pmulhrsw m1, m7
11741 packuswb m8, m1
11742
11743 palignr m10, m2, m0, 8
11744 palignr m11, m3, m2, 8
11745 pmaddubsw m9, m10, [r3 - 14 * 32] ; [2]
11746 pmulhrsw m9, m7
11747 pmaddubsw m1, m11, [r3 - 14 * 32]
11748 pmulhrsw m1, m7
11749 packuswb m9, m1
11750
11751 pmaddubsw m10, [r3 + 12 * 32] ; [28]
11752 pmulhrsw m10, m7
11753 pmaddubsw m11, [r3 + 12 * 32]
11754 pmulhrsw m11, m7
11755 packuswb m10, m11
11756
11757 palignr m11, m2, m0, 10
11758 palignr m1, m3, m2, 10
11759 pmaddubsw m11, [r3 + 6 * 32] ; [22]
11760 pmulhrsw m11, m7
11761 pmaddubsw m1, [r3 + 6 * 32]
11762 pmulhrsw m1, m7
11763 packuswb m11, m1
11764
11765 palignr m12, m2, m0, 12
11766 palignr m1, m3, m2, 12
11767 pmaddubsw m12, [r3] ; [16]
11768 pmulhrsw m12, m7
11769 pmaddubsw m1, [r3]
11770 pmulhrsw m1, m7
11771 packuswb m12, m1
11772
11773 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 ; m1 is scratch; m0/m2/m3 stay live
11774
11775 ; vectors 8 to 15 of this call
11776 palignr m4, m2, m0, 14
11777 palignr m1, m3, m2, 14
11778 pmaddubsw m4, [r3 - 6 * 32] ; [10]
11779 pmulhrsw m4, m7
11780 pmaddubsw m1, [r3 - 6 * 32]
11781 pmulhrsw m1, m7
11782 packuswb m4, m1
11783
11784 pmaddubsw m5, m2, [r3 - 12 * 32] ; [4]
11785 pmulhrsw m5, m7
11786 pmaddubsw m1, m3, [r3 - 12 * 32]
11787 pmulhrsw m1, m7
11788 packuswb m5, m1
11789
11790 pmaddubsw m6, m2, [r3 + 14 * 32] ; [30]
11791 pmulhrsw m6, m7
11792 pmaddubsw m1, m3, [r3 + 14 * 32]
11793 pmulhrsw m1, m7
11794 packuswb m6, m1
11795
11796 movu m0, [r2 + 25] ; m0 is reused: byte pairs that follow those held in m3
11797 movu m1, [r2 + 26]
11798 punpcklbw m0, m1
11799
11800 palignr m8, m3, m2, 2
11801 palignr m1, m0, m3, 2
11802 pmaddubsw m8, [r3 + 8 * 32] ; [24]
11803 pmulhrsw m8, m7
11804 pmaddubsw m1, [r3 + 8 * 32]
11805 pmulhrsw m1, m7
11806 packuswb m8, m1
11807
11808 palignr m9, m3, m2, 4
11809 palignr m1, m0, m3, 4
11810 pmaddubsw m9, [r3 + 2 * 32] ; [18]
11811 pmulhrsw m9, m7
11812 pmaddubsw m1, [r3 + 2 * 32]
11813 pmulhrsw m1, m7
11814 packuswb m9, m1
11815
11816 palignr m10, m3, m2, 6
11817 palignr m1, m0, m3, 6
11818 pmaddubsw m10, [r3 - 4 * 32] ; [12]
11819 pmulhrsw m10, m7
11820 pmaddubsw m1, [r3 - 4 * 32]
11821 pmulhrsw m1, m7
11822 packuswb m10, m1
11823
11824 palignr m11, m3, m2, 8
11825 palignr m1, m0, m3, 8
11826 pmaddubsw m11, [r3 - 10 * 32] ; [6]
11827 pmulhrsw m11, m7
11828 pmaddubsw m1, [r3 - 10 * 32]
11829 pmulhrsw m1, m7
11830 packuswb m11, m1
11831
11832 movu m12, [r2 + 14] ; last vector is an unweighted copy of the source
11833
11834 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
11835 ret
11836
; intra_pred_ang32_3 (AVX2): two passes of ang32_mode_3_33_row_0_15 with
; r7d = 0, i.e. every batch of vectors is stored transposed.
; In: r0 = destination, r1 = stride, r2 = source (data is read from r2 + 64 on).
11837 INIT_YMM avx2
11838 cglobal intra_pred_ang32_3, 3,8,13
11839 xor r7d, r7d ; r7d = 0 -> transposed stores in the helper
11840 mov r4, r0 ; r4 = base pointer the helper restarts r0 from
11841 mova m7, [pw_1024]
11842 lea r6, [r1 * 4] ; r6 -> 4 * stride
11843 lea r5, [r1 * 3] ; r5 -> 3 * stride
11844 lea r3, [ang_table_avx2 + 32 * 16]
11845 add r2, 64
11846
11847 call ang32_mode_3_33_row_0_15
11848
11849 add r2, 13 ; source offset for the second pass
11850 add r4, 16 ; second pass starts 16 bytes to the right
11851 mov r0, r4
11852
11853 call ang32_mode_3_33_row_0_15
11854 RET
11855
; intra_pred_ang32_33 (AVX2): two passes of ang32_mode_3_33_row_0_15 with
; r7d = 1, i.e. every vector is stored as one 32-byte row and r0 simply keeps
; advancing downwards (r4 is not needed on this path).
; In: r0 = destination, r1 = stride, r2 = source.
11856 INIT_YMM avx2
11857 cglobal intra_pred_ang32_33, 3,8,13
11858 mov r7d, 1 ; non-zero -> row stores; the helper re-tests r7d itself
11859 mova m7, [pw_1024]
11860 lea r6, [r1 * 4] ; r6 -> 4 * stride
11861 lea r5, [r1 * 3] ; r5 -> 3 * stride
11862 lea r3, [ang_table_avx2 + 32 * 16]
11863
11864
11865 call ang32_mode_3_33_row_0_15
11866
11867 add r2, 13 ; source offset for the second pass
11868
11869 call ang32_mode_3_33_row_0_15
11870 RET
11871
;-----------------------------------------------------------------------------
; ang32_mode_4_32_row_0_15 (AVX2): first half of intra_pred_ang32_4 / _32.
; Builds 16 vectors of 32 bytes from the bytes at r2 + 1 onward and passes them,
; eight at a time, to TRANSPOSE_32x8_AVX2.
; In:  r0 = output pointer, r1 = stride, r2 = source,
;      r3 = ang_table_avx2 + 16 * 32 (a bracketed comment [n] marks table row n,
;           addressed as r3 + (n - 16) * 32),
;      r4 = output base (needed only when r7d == 0), r5 = 3 * stride,
;      r6 = 4 * stride, r7d = 0 for transposed stores / non-zero for row stores,
;      m7 = pw_1024
; The ZF produced by "test" below is consumed by both TRANSPOSE_32x8_AVX2
; invocations; no instruction in between may modify the flags.
; Clobbers m0-m6, m8-m12 and r0.
;-----------------------------------------------------------------------------
11872 cglobal ang32_mode_4_32_row_0_15
11873 test r7d, r7d ; ZF selects the store layout inside TRANSPOSE_32x8_AVX2
11874 ; vectors 0 to 7 of this call
11875 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
11876 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
11877 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
11878 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
11879
11880 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
11881 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
11882 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
11883
11884 pmaddubsw m4, m0, [r3 + 5 * 32] ; [21]
11885 pmulhrsw m4, m7
11886 pmaddubsw m1, m2, [r3 + 5 * 32]
11887 pmulhrsw m1, m7
11888 packuswb m4, m1
11889
11890 palignr m6, m2, m0, 2
11891 palignr m1, m3, m2, 2
11892 pmaddubsw m5, m6, [r3 - 6 * 32] ; [10]
11893 pmulhrsw m5, m7
11894 pmaddubsw m8, m1, [r3 - 6 * 32]
11895 pmulhrsw m8, m7
11896 packuswb m5, m8
11897
11898 pmaddubsw m6, [r3 + 15 * 32] ; [31]
11899 pmulhrsw m6, m7
11900 pmaddubsw m1, [r3 + 15 * 32]
11901 pmulhrsw m1, m7
11902 packuswb m6, m1
11903
11904 palignr m8, m2, m0, 4
11905 palignr m1, m3, m2, 4
11906 pmaddubsw m8, [r3 + 4 * 32] ; [20]
11907 pmulhrsw m8, m7
11908 pmaddubsw m1, [r3 + 4 * 32]
11909 pmulhrsw m1, m7
11910 packuswb m8, m1
11911
11912 palignr m10, m2, m0, 6
11913 palignr m11, m3, m2, 6
11914 pmaddubsw m9, m10, [r3 - 7 * 32] ; [9]
11915 pmulhrsw m9, m7
11916 pmaddubsw m1, m11, [r3 - 7 * 32]
11917 pmulhrsw m1, m7
11918 packuswb m9, m1
11919
11920 pmaddubsw m10, [r3 + 14 * 32] ; [30]
11921 pmulhrsw m10, m7
11922 pmaddubsw m11, [r3 + 14 * 32]
11923 pmulhrsw m11, m7
11924 packuswb m10, m11
11925
11926 palignr m11, m2, m0, 8
11927 palignr m1, m3, m2, 8
11928 pmaddubsw m11, [r3 + 3 * 32] ; [19]
11929 pmulhrsw m11, m7
11930 pmaddubsw m1, [r3 + 3 * 32]
11931 pmulhrsw m1, m7
11932 packuswb m11, m1
11933
11934 palignr m12, m2, m0, 10
11935 palignr m1, m3, m2, 10
11936 pmaddubsw m12, [r3 - 8 * 32] ; [8]
11937 pmulhrsw m12, m7
11938 pmaddubsw m1, [r3 - 8 * 32]
11939 pmulhrsw m1, m7
11940 packuswb m12, m1
11941
11942 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 ; m1 is scratch; m0/m2/m3 stay live
11943
11944 ; vectors 8 to 15 of this call
11945 palignr m4, m2, m0, 10
11946 palignr m1, m3, m2, 10
11947 pmaddubsw m4, [r3 + 13 * 32] ; [29]
11948 pmulhrsw m4, m7
11949 pmaddubsw m1, [r3 + 13 * 32]
11950 pmulhrsw m1, m7
11951 packuswb m4, m1
11952
11953 palignr m5, m2, m0, 12
11954 palignr m1, m3, m2, 12
11955 pmaddubsw m5, [r3 + 2 * 32] ; [18]
11956 pmulhrsw m5, m7
11957 pmaddubsw m1, [r3 + 2 * 32]
11958 pmulhrsw m1, m7
11959 packuswb m5, m1
11960
11961 palignr m8, m2, m0, 14
11962 palignr m1, m3, m2, 14
11963 pmaddubsw m6, m8, [r3 - 9 * 32] ; [7]
11964 pmulhrsw m6, m7
11965 pmaddubsw m9, m1, [r3 - 9 * 32]
11966 pmulhrsw m9, m7
11967 packuswb m6, m9
11968
11969 pmaddubsw m8, [r3 + 12 * 32] ; [28]
11970 pmulhrsw m8, m7
11971 pmaddubsw m1, [r3 + 12 * 32]
11972 pmulhrsw m1, m7
11973 packuswb m8, m1
11974
11975 pmaddubsw m9, m2, [r3 + 1 * 32] ; [17]
11976 pmulhrsw m9, m7
11977 pmaddubsw m1, m3, [r3 + 1 * 32]
11978 pmulhrsw m1, m7
11979 packuswb m9, m1
11980
11981 movu m0, [r2 + 25] ; m0 is reused: byte pairs that follow those held in m3
11982 movu m1, [r2 + 26]
11983 punpcklbw m0, m1
11984
11985 palignr m11, m3, m2, 2
11986 palignr m1, m0, m3, 2
11987 pmaddubsw m10, m11, [r3 - 10 * 32] ; [6]
11988 pmulhrsw m10, m7
11989 pmaddubsw m12, m1, [r3 - 10 * 32]
11990 pmulhrsw m12, m7
11991 packuswb m10, m12
11992
11993 pmaddubsw m11, [r3 + 11 * 32] ; [27]
11994 pmulhrsw m11, m7
11995 pmaddubsw m1, [r3 + 11 * 32]
11996 pmulhrsw m1, m7
11997 packuswb m11, m1
11998
11999 palignr m0, m3, 4 ; must read m3 before the next line overwrites it
12000 palignr m3, m2, 4
12001 pmaddubsw m3, [r3] ; [16]
12002 pmulhrsw m3, m7
12003 pmaddubsw m0, [r3]
12004 pmulhrsw m0, m7
12005 packuswb m3, m0
12006
12007 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 3, 0, 8 ; last vector lives in m3, m0 is scratch
12008 ret
12009
;-----------------------------------------------------------------------------
; ang32_mode_4_32_row_16_31 (AVX2): second half of intra_pred_ang32_4 / _32.
; Called after ang32_mode_4_32_row_0_15 with r2 advanced by 11 by the caller,
; so all offsets and lane comments below are relative to that advanced r2.
; Builds 16 vectors of 32 bytes and passes them, eight at a time, to
; TRANSPOSE_32x8_AVX2.
; In:  r0 = output pointer, r1 = stride, r2 = source (already advanced),
;      r3 = ang_table_avx2 + 16 * 32 (a bracketed comment [n] marks table row n,
;           addressed as r3 + (n - 16) * 32),
;      r4 = output base (needed only when r7d == 0), r5 = 3 * stride,
;      r6 = 4 * stride, r7d = 0 for transposed stores / non-zero for row stores,
;      m7 = pw_1024
; The ZF produced by "test" below is consumed by both TRANSPOSE_32x8_AVX2
; invocations; no instruction in between may modify the flags.
; Clobbers m0-m6, m8-m12 and r0.
;-----------------------------------------------------------------------------
12010 cglobal ang32_mode_4_32_row_16_31
12011 test r7d, r7d ; ZF selects the store layout inside TRANSPOSE_32x8_AVX2
12012 ; vectors 0 to 7 of this call (16 to 23 overall, going by the routine name)
12013 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12014 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
12015 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
12016 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
12017
12018 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
12019 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
12020 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
12021
12022 pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
12023 pmulhrsw m4, m7
12024 pmaddubsw m1, m2, [r3 - 11 * 32]
12025 pmulhrsw m1, m7
12026 packuswb m4, m1
12027
12028 pmaddubsw m5, m0, [r3 + 10 * 32] ; [26]
12029 pmulhrsw m5, m7
12030 pmaddubsw m1, m2, [r3 + 10 * 32]
12031 pmulhrsw m1, m7
12032 packuswb m5, m1
12033
12034 palignr m6, m2, m0, 2
12035 palignr m1, m3, m2, 2
12036 pmaddubsw m6, [r3 - 1 * 32] ; [15]
12037 pmulhrsw m6, m7
12038 pmaddubsw m1, [r3 - 1 * 32]
12039 pmulhrsw m1, m7
12040 packuswb m6, m1
12041
12042 palignr m9, m2, m0, 4
12043 palignr m10, m3, m2, 4
12044 pmaddubsw m8, m9, [r3 - 12 * 32] ; [4]
12045 pmulhrsw m8, m7
12046 pmaddubsw m1, m10, [r3 - 12 * 32]
12047 pmulhrsw m1, m7
12048 packuswb m8, m1
12049
12050 pmaddubsw m9, [r3 + 9 * 32] ; [25]
12051 pmulhrsw m9, m7
12052 pmaddubsw m10, [r3 + 9 * 32]
12053 pmulhrsw m10, m7
12054 packuswb m9, m10
12055
12056 palignr m10, m2, m0, 6
12057 palignr m11, m3, m2, 6
12058 pmaddubsw m10, [r3 - 2 * 32] ; [14]
12059 pmulhrsw m10, m7
12060 pmaddubsw m11, [r3 - 2 * 32]
12061 pmulhrsw m11, m7
12062 packuswb m10, m11
12063
12064 palignr m12, m2, m0, 8
12065 palignr m1, m3, m2, 8
12066 pmaddubsw m11, m12, [r3 - 13 * 32] ; [3]
12067 pmulhrsw m11, m7
12068 pmaddubsw m1, [r3 - 13 * 32]
12069 pmulhrsw m1, m7
12070 packuswb m11, m1
12071
12072 palignr m1, m3, m2, 8 ; rebuild: m1 was consumed in place just above
12073 pmaddubsw m12, [r3 + 8 * 32] ; [24]
12074 pmulhrsw m12, m7
12075 pmaddubsw m1, [r3 + 8 * 32]
12076 pmulhrsw m1, m7
12077 packuswb m12, m1
12078
12079 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 ; m1 is scratch; m0/m2/m3 stay live
12080
12081 ; vectors 8 to 15 of this call (24 to 31 overall, going by the routine name)
12082 palignr m4, m2, m0, 10
12083 palignr m1, m3, m2, 10
12084 pmaddubsw m4, [r3 - 3 * 32] ; [13]
12085 pmulhrsw m4, m7
12086 pmaddubsw m1, [r3 - 3 * 32]
12087 pmulhrsw m1, m7
12088 packuswb m4, m1
12089
12090 palignr m6, m2, m0, 12
12091 palignr m8, m3, m2, 12
12092 pmaddubsw m5, m6, [r3 - 14 * 32] ; [2]
12093 pmulhrsw m5, m7
12094 pmaddubsw m1, m8, [r3 - 14 * 32]
12095 pmulhrsw m1, m7
12096 packuswb m5, m1
12097
12098 pmaddubsw m6, [r3 + 7 * 32] ; [23]
12099 pmulhrsw m6, m7
12100 pmaddubsw m8, [r3 + 7 * 32]
12101 pmulhrsw m8, m7
12102 packuswb m6, m8
12103
12104 palignr m8, m2, m0, 14
12105 palignr m1, m3, m2, 14
12106 pmaddubsw m8, [r3 - 4 * 32] ; [12]
12107 pmulhrsw m8, m7
12108 pmaddubsw m1, [r3 - 4 * 32]
12109 pmulhrsw m1, m7
12110 packuswb m8, m1
12111
12112 pmaddubsw m9, m2, [r3 - 15 * 32] ; [1]
12113 pmulhrsw m9, m7
12114 pmaddubsw m1, m3, [r3 - 15 * 32]
12115 pmulhrsw m1, m7
12116 packuswb m9, m1
12117
12118 pmaddubsw m10, m2, [r3 + 6 * 32] ; [22]
12119 pmulhrsw m10, m7
12120 pmaddubsw m1, m3, [r3 + 6 * 32]
12121 pmulhrsw m1, m7
12122 packuswb m10, m1
12123
12124 movu m0, [r2 + 25] ; m0 is reused: byte pairs that follow those held in m3
12125 movu m1, [r2 + 26]
12126 punpcklbw m0, m1
12127
12128 palignr m11, m3, m2, 2
12129 palignr m1, m0, m3, 2
12130 pmaddubsw m11, [r3 - 5 * 32] ; [11]
12131 pmulhrsw m11, m7
12132 pmaddubsw m1, [r3 - 5 * 32]
12133 pmulhrsw m1, m7
12134 packuswb m11, m1
12135
12136 movu m12, [r2 + 11] ; last vector is an unweighted copy of the source
12137
12138 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
12139 ret
12140
; intra_pred_ang32_4 (AVX2): ang32_mode_4_32_row_0_15 followed by
; ang32_mode_4_32_row_16_31 with r7d = 0, i.e. every batch of vectors is
; stored transposed.
; In: r0 = destination, r1 = stride, r2 = source (data is read from r2 + 64 on).
12141 INIT_YMM avx2
12142 cglobal intra_pred_ang32_4, 3,8,13
12143 xor r7d, r7d ; r7d = 0 -> transposed stores in the helpers
12144 mov r4, r0 ; r4 = base pointer the helpers restart r0 from
12145 mova m7, [pw_1024]
12146 lea r6, [r1 * 4] ; r6 -> 4 * stride
12147 lea r5, [r1 * 3] ; r5 -> 3 * stride
12148 lea r3, [ang_table_avx2 + 32 * 16]
12149 add r2, 64
12150
12151 call ang32_mode_4_32_row_0_15
12152
12153 add r2, 11 ; source offset expected by the second helper
12154 add r4, 16 ; second half starts 16 bytes to the right
12155 mov r0, r4
12156
12157 call ang32_mode_4_32_row_16_31
12158 RET
12159
; intra_pred_ang32_32 (AVX2): ang32_mode_4_32_row_0_15 followed by
; ang32_mode_4_32_row_16_31 with r7d = 1, i.e. every vector is stored as one
; 32-byte row and r0 keeps advancing downwards (r4 is not needed on this path).
; In: r0 = destination, r1 = stride, r2 = source.
12160 INIT_YMM avx2
12161 cglobal intra_pred_ang32_32, 3,8,13
12162 mov r7d, 1 ; non-zero -> row stores; the helpers re-test r7d themselves
12163 mova m7, [pw_1024]
12164 lea r6, [r1 * 4] ; r6 -> 4 * stride
12165 lea r5, [r1 * 3] ; r5 -> 3 * stride
12166 lea r3, [ang_table_avx2 + 32 * 16]
12167
12168
12169 call ang32_mode_4_32_row_0_15
12170
12171 add r2, 11 ; source offset expected by the second helper
12172
12173 call ang32_mode_4_32_row_16_31
12174 RET
12175
;-----------------------------------------------------------------------------
; ang32_mode_5_31_row_0_15 (AVX2): first half of intra_pred_ang32_5 / _31.
; Builds 16 vectors of 32 bytes from the bytes at r2 + 1 onward and passes them,
; eight at a time, to TRANSPOSE_32x8_AVX2.
; In:  r0 = output pointer, r1 = stride, r2 = source,
;      r3 = ang_table_avx2 + 16 * 32 (a bracketed comment [n] marks table row n,
;           addressed as r3 + (n - 16) * 32),
;      r4 = output base (needed only when r7d == 0), r5 = 3 * stride,
;      r6 = 4 * stride, r7d = 0 for transposed stores / non-zero for row stores,
;      m7 = pw_1024
; The ZF produced by "test" below is consumed by both TRANSPOSE_32x8_AVX2
; invocations; no instruction in between may modify the flags.
; Clobbers m0-m6, m8-m12 and r0.
;-----------------------------------------------------------------------------
12176 cglobal ang32_mode_5_31_row_0_15
12177 test r7d, r7d ; ZF selects the store layout inside TRANSPOSE_32x8_AVX2
12178 ; vectors 0 to 7 of this call
12179 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12180 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
12181 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
12182 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
12183
12184 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
12185 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
12186 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
12187
12188 pmaddubsw m4, m0, [r3 + 1 * 32] ; [17]
12189 pmulhrsw m4, m7
12190 pmaddubsw m1, m2, [r3 + 1 * 32]
12191 pmulhrsw m1, m7
12192 packuswb m4, m1
12193
12194 palignr m6, m2, m0, 2
12195 palignr m1, m3, m2, 2
12196 pmaddubsw m5, m6, [r3 - 14 * 32] ; [2]
12197 pmulhrsw m5, m7
12198 pmaddubsw m8, m1, [r3 - 14 * 32]
12199 pmulhrsw m8, m7
12200 packuswb m5, m8
12201
12202 pmaddubsw m6, [r3 + 3 * 32] ; [19]
12203 pmulhrsw m6, m7
12204 pmaddubsw m1, [r3 + 3 * 32]
12205 pmulhrsw m1, m7
12206 packuswb m6, m1
12207
12208 palignr m9, m2, m0, 4
12209 palignr m10, m3, m2, 4
12210 pmaddubsw m8, m9, [r3 - 12 * 32] ; [4]
12211 pmulhrsw m8, m7
12212 pmaddubsw m1, m10, [r3 - 12 * 32]
12213 pmulhrsw m1, m7
12214 packuswb m8, m1
12215
12216 pmaddubsw m9, [r3 + 5 * 32] ; [21]
12217 pmulhrsw m9, m7
12218 pmaddubsw m10, [r3 + 5 * 32]
12219 pmulhrsw m10, m7
12220 packuswb m9, m10
12221
12222 palignr m11, m2, m0, 6
12223 palignr m12, m3, m2, 6
12224 pmaddubsw m10, m11, [r3 - 10 * 32] ; [6]
12225 pmulhrsw m10, m7
12226 pmaddubsw m1, m12, [r3 - 10 * 32]
12227 pmulhrsw m1, m7
12228 packuswb m10, m1
12229
12230 pmaddubsw m11, [r3 + 7 * 32] ; [23]
12231 pmulhrsw m11, m7
12232 pmaddubsw m12, [r3 + 7 * 32]
12233 pmulhrsw m12, m7
12234 packuswb m11, m12
12235
12236 palignr m12, m2, m0, 8
12237 palignr m1, m3, m2, 8
12238 pmaddubsw m12, [r3 - 8 * 32] ; [8]
12239 pmulhrsw m12, m7
12240 pmaddubsw m1, [r3 - 8 * 32]
12241 pmulhrsw m1, m7
12242 packuswb m12, m1
12243
12244 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 ; m1 is scratch; m0/m2/m3 stay live
12245
12246 ; vectors 8 to 15 of this call
12247 palignr m4, m2, m0, 8
12248 palignr m1, m3, m2, 8
12249 pmaddubsw m4, [r3 + 9 * 32] ; [25]
12250 pmulhrsw m4, m7
12251 pmaddubsw m1, [r3 + 9 * 32]
12252 pmulhrsw m1, m7
12253 packuswb m4, m1
12254
12255 palignr m6, m2, m0, 10
12256 palignr m1, m3, m2, 10
12257 pmaddubsw m5, m6, [r3 - 6 * 32] ; [10]
12258 pmulhrsw m5, m7
12259 pmaddubsw m8, m1, [r3 - 6 * 32]
12260 pmulhrsw m8, m7
12261 packuswb m5, m8
12262
12263 pmaddubsw m6, [r3 + 11 * 32] ; [27]
12264 pmulhrsw m6, m7
12265 pmaddubsw m1, [r3 + 11 * 32]
12266 pmulhrsw m1, m7
12267 packuswb m6, m1
12268
12269 palignr m9, m2, m0, 12
12270 palignr m1, m3, m2, 12
12271 pmaddubsw m8, m9, [r3 - 4 * 32] ; [12]
12272 pmulhrsw m8, m7
12273 pmaddubsw m10, m1, [r3 - 4 * 32]
12274 pmulhrsw m10, m7
12275 packuswb m8, m10
12276
12277 pmaddubsw m9, [r3 + 13 * 32] ; [29]
12278 pmulhrsw m9, m7
12279 pmaddubsw m1, [r3 + 13 * 32]
12280 pmulhrsw m1, m7
12281 packuswb m9, m1
12282
12283 palignr m11, m2, m0, 14
12284 palignr m1, m3, m2, 14
12285 pmaddubsw m10, m11, [r3 - 2 * 32] ; [14]
12286 pmulhrsw m10, m7
12287 pmaddubsw m12, m1, [r3 - 2 * 32]
12288 pmulhrsw m12, m7
12289 packuswb m10, m12
12290
12291 pmaddubsw m11, [r3 + 15 * 32] ; [31]
12292 pmulhrsw m11, m7
12293 pmaddubsw m1, [r3 + 15 * 32]
12294 pmulhrsw m1, m7
12295 packuswb m11, m1
12296
12297 pmaddubsw m2, [r3] ; [16]
12298 pmulhrsw m2, m7
12299 pmaddubsw m3, [r3]
12300 pmulhrsw m3, m7
12301 packuswb m2, m3
12302
12303 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8 ; last vector lives in m2, m0 is scratch
12304 ret
12305
;-----------------------------------------------------------------------------
; ang32_mode_5_31_row_16_31 (AVX2): second half of intra_pred_ang32_5 / _31.
; Called after ang32_mode_5_31_row_0_15 with r2 advanced by 9 by the caller,
; so all offsets and lane comments below are relative to that advanced r2.
; Builds 16 vectors of 32 bytes and passes them, eight at a time, to
; TRANSPOSE_32x8_AVX2.
; In:  r0 = output pointer, r1 = stride, r2 = source (already advanced),
;      r3 = ang_table_avx2 + 16 * 32 (a bracketed comment [n] marks table row n,
;           addressed as r3 + (n - 16) * 32),
;      r4 = output base (needed only when r7d == 0), r5 = 3 * stride,
;      r6 = 4 * stride, r7d = 0 for transposed stores / non-zero for row stores,
;      m7 = pw_1024
; The ZF produced by "test" below is consumed by both TRANSPOSE_32x8_AVX2
; invocations; no instruction in between may modify the flags.
; Clobbers m0-m6, m8-m12 and r0.
;-----------------------------------------------------------------------------
12306 cglobal ang32_mode_5_31_row_16_31
12307 test r7d, r7d ; ZF selects the store layout inside TRANSPOSE_32x8_AVX2
12308 ; vectors 0 to 7 of this call (16 to 23 overall, going by the routine name)
12309 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12310 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
12311 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
12312 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
12313
12314 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
12315 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
12316 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
12317
12318 pmaddubsw m4, m0, [r3 - 15 * 32] ; [1]
12319 pmulhrsw m4, m7
12320 pmaddubsw m1, m2, [r3 - 15 * 32]
12321 pmulhrsw m1, m7
12322 packuswb m4, m1
12323
12324 pmaddubsw m5, m0, [r3 + 2 * 32] ; [18]
12325 pmulhrsw m5, m7
12326 pmaddubsw m8, m2, [r3 + 2 * 32]
12327 pmulhrsw m8, m7
12328 packuswb m5, m8
12329
12330 palignr m8, m2, m0, 2
12331 palignr m9, m3, m2, 2
12332 pmaddubsw m6, m8, [r3 - 13 * 32] ; [3]
12333 pmulhrsw m6, m7
12334 pmaddubsw m1, m9, [r3 - 13 * 32]
12335 pmulhrsw m1, m7
12336 packuswb m6, m1
12337
12338 pmaddubsw m8, [r3 + 4 * 32] ; [20]
12339 pmulhrsw m8, m7
12340 pmaddubsw m9, [r3 + 4 * 32]
12341 pmulhrsw m9, m7
12342 packuswb m8, m9
12343
12344 palignr m10, m2, m0, 4
12345 palignr m1, m3, m2, 4
12346 pmaddubsw m9, m10, [r3 - 11 * 32] ; [5]
12347 pmulhrsw m9, m7
12348 pmaddubsw m11, m1, [r3 - 11 * 32]
12349 pmulhrsw m11, m7
12350 packuswb m9, m11
12351
12352 pmaddubsw m10, [r3 + 6 * 32] ; [22]
12353 pmulhrsw m10, m7
12354 pmaddubsw m1, [r3 + 6 * 32]
12355 pmulhrsw m1, m7
12356 packuswb m10, m1
12357
12358 palignr m12, m2, m0, 6
12359 palignr m1, m3, m2, 6
12360 pmaddubsw m11, m12, [r3 - 9 * 32] ; [7]
12361 pmulhrsw m11, m7
12362 pmaddubsw m1, [r3 - 9 * 32]
12363 pmulhrsw m1, m7
12364 packuswb m11, m1
12365
12366 palignr m1, m3, m2, 6 ; rebuild: m1 was consumed in place just above
12367 pmaddubsw m12, [r3 + 8 * 32] ; [24]
12368 pmulhrsw m12, m7
12369 pmaddubsw m1, [r3 + 8 * 32]
12370 pmulhrsw m1, m7
12371 packuswb m12, m1
12372
12373 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 ; m1 is scratch; m0/m2/m3 stay live
12374
12375 ; vectors 8 to 15 of this call (24 to 31 overall, going by the routine name)
12376 palignr m5, m2, m0, 8
12377 palignr m8, m3, m2, 8
12378 pmaddubsw m4, m5, [r3 - 7 * 32] ; [9]
12379 pmulhrsw m4, m7
12380 pmaddubsw m1, m8, [r3 - 7 * 32]
12381 pmulhrsw m1, m7
12382 packuswb m4, m1
12383
12384 pmaddubsw m5, [r3 + 10 * 32] ; [26]
12385 pmulhrsw m5, m7
12386 pmaddubsw m8, [r3 + 10 * 32]
12387 pmulhrsw m8, m7
12388 packuswb m5, m8
12389
12390 palignr m8, m2, m0, 10
12391 palignr m9, m3, m2, 10
12392 pmaddubsw m6, m8, [r3 - 5 * 32] ; [11]
12393 pmulhrsw m6, m7
12394 pmaddubsw m1, m9, [r3 - 5 * 32]
12395 pmulhrsw m1, m7
12396 packuswb m6, m1
12397
12398 pmaddubsw m8, [r3 + 12 * 32] ; [28]
12399 pmulhrsw m8, m7
12400 pmaddubsw m9, [r3 + 12 * 32]
12401 pmulhrsw m9, m7
12402 packuswb m8, m9
12403
12404 palignr m10, m2, m0, 12
12405 palignr m11, m3, m2, 12
12406 pmaddubsw m9, m10, [r3 - 3 * 32] ; [13]
12407 pmulhrsw m9, m7
12408 pmaddubsw m1, m11, [r3 - 3 * 32]
12409 pmulhrsw m1, m7
12410 packuswb m9, m1
12411
12412 pmaddubsw m10, [r3 + 14 * 32] ; [30]
12413 pmulhrsw m10, m7
12414 pmaddubsw m11, [r3 + 14 * 32]
12415 pmulhrsw m11, m7
12416 packuswb m10, m11
12417
12418 palignr m11, m2, m0, 14
12419 palignr m1, m3, m2, 14
12420 pmaddubsw m11, [r3 - 1 * 32] ; [15]
12421 pmulhrsw m11, m7
12422 pmaddubsw m1, [r3 - 1 * 32]
12423 pmulhrsw m1, m7
12424 packuswb m11, m1
12425
12426 movu m2, [r2 + 9] ; last vector is an unweighted copy of the source
12427 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8 ; last vector lives in m2, m0 is scratch
12428 ret
12429
; intra_pred_ang32_5 (AVX2): ang32_mode_5_31_row_0_15 followed by
; ang32_mode_5_31_row_16_31 with r7d = 0, i.e. every batch of vectors is
; stored transposed.
; In: r0 = destination, r1 = stride, r2 = source (data is read from r2 + 64 on).
12430 INIT_YMM avx2
12431 cglobal intra_pred_ang32_5, 3,8,13
12432 xor r7d, r7d ; r7d = 0 -> transposed stores in the helpers
12433 mov r4, r0 ; r4 = base pointer the helpers restart r0 from
12434 mova m7, [pw_1024]
12435 lea r6, [r1 * 4] ; r6 -> 4 * stride
12436 lea r5, [r1 * 3] ; r5 -> 3 * stride
12437 lea r3, [ang_table_avx2 + 32 * 16]
12438 add r2, 64
12439
12440 call ang32_mode_5_31_row_0_15
12441
12442 add r2, 9 ; source offset expected by the second helper
12443 add r4, 16 ; second half starts 16 bytes to the right
12444 mov r0, r4
12445
12446 call ang32_mode_5_31_row_16_31
12447 RET
12448
; intra_pred_ang32_31 (AVX2): ang32_mode_5_31_row_0_15 followed by
; ang32_mode_5_31_row_16_31 with r7d = 1, i.e. every vector is stored as one
; 32-byte row and r0 keeps advancing downwards (r4 is not needed on this path).
; In: r0 = destination, r1 = stride, r2 = source.
12449 INIT_YMM avx2
12450 cglobal intra_pred_ang32_31, 3,8,13
12451 mov r7d, 1 ; non-zero -> row stores; the helpers re-test r7d themselves
12452 mova m7, [pw_1024]
12453 lea r6, [r1 * 4] ; r6 -> 4 * stride
12454 lea r5, [r1 * 3] ; r5 -> 3 * stride
12455 lea r3, [ang_table_avx2 + 32 * 16]
12456
12457
12458 call ang32_mode_5_31_row_0_15
12459
12460 add r2, 9 ; source offset expected by the second helper
12461
12462 call ang32_mode_5_31_row_16_31
12463 RET
12464
;-----------------------------------------------------------------------------
; ang32_mode_6_30_row_0_15 (AVX2): builds 16 vectors of 32 bytes from the bytes
; at r2 + 1 onward and passes them, eight at a time, to TRANSPOSE_32x8_AVX2.
; Register contract as read from this body (its callers are outside this
; section - presumably intra_pred_ang32_6 / _30, verify):
;      r0 = output pointer, r1 = stride, r2 = source,
;      r3 = weight-table pointer; the bracketed [n] comments match
;           r3 + (n - 16) * 32, i.e. r3 presumably = ang_table_avx2 + 16 * 32,
;      r4 / r5 / r6 = inputs of TRANSPOSE_32x8_AVX2 (output base, 3 * stride and
;           4 * stride in the other ang32 AVX2 entry points),
;      r7d = 0 for transposed stores / non-zero for row stores,
;      m7 = pmulhrsw multiplier
; The ZF produced by "test" below is consumed by both TRANSPOSE_32x8_AVX2
; invocations; no instruction in between may modify the flags.
; Clobbers m0-m6, m8-m12 and r0.
;-----------------------------------------------------------------------------
12465 cglobal ang32_mode_6_30_row_0_15
12466 test r7d, r7d ; ZF selects the store layout inside TRANSPOSE_32x8_AVX2
12467 ; vectors 0 to 7 of this call
12468 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12469 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
12470 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
12471 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
12472
12473 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
12474 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
12475 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
12476
12477 pmaddubsw m4, m0, [r3 - 3 * 32] ; [13]
12478 pmulhrsw m4, m7
12479 pmaddubsw m1, m2, [r3 - 3 * 32]
12480 pmulhrsw m1, m7
12481 packuswb m4, m1
12482
12483 pmaddubsw m5, m0, [r3 + 10 * 32] ; [26]
12484 pmulhrsw m5, m7
12485 pmaddubsw m8, m2, [r3 + 10 * 32]
12486 pmulhrsw m8, m7
12487 packuswb m5, m8
12488
12489 palignr m8, m2, m0, 2
12490 palignr m1, m3, m2, 2
12491 pmaddubsw m6, m8, [r3 - 9 * 32] ; [7]
12492 pmulhrsw m6, m7
12493 pmaddubsw m9, m1, [r3 - 9 * 32]
12494 pmulhrsw m9, m7
12495 packuswb m6, m9
12496
12497 pmaddubsw m8, [r3 + 4 * 32] ; [20]
12498 pmulhrsw m8, m7
12499 pmaddubsw m1, [r3 + 4 * 32]
12500 pmulhrsw m1, m7
12501 packuswb m8, m1
12502
12503 palignr m11, m2, m0, 4
12504 palignr m1, m3, m2, 4
12505 pmaddubsw m9, m11, [r3 - 15 * 32] ; [1]
12506 pmulhrsw m9, m7
12507 pmaddubsw m12, m1, [r3 - 15 * 32]
12508 pmulhrsw m12, m7
12509 packuswb m9, m12
12510
12511 pmaddubsw m10, m11, [r3 - 2 * 32] ; [14]
12512 pmulhrsw m10, m7
12513 pmaddubsw m12, m1, [r3 - 2 * 32]
12514 pmulhrsw m12, m7
12515 packuswb m10, m12
12516
12517 pmaddubsw m11, [r3 + 11 * 32] ; [27]
12518 pmulhrsw m11, m7
12519 pmaddubsw m1, [r3 + 11 * 32]
12520 pmulhrsw m1, m7
12521 packuswb m11, m1
12522
12523 palignr m12, m2, m0, 6
12524 palignr m1, m3, m2, 6
12525 pmaddubsw m12, [r3 - 8 * 32] ; [8]
12526 pmulhrsw m12, m7
12527 pmaddubsw m1, [r3 - 8 * 32]
12528 pmulhrsw m1, m7
12529 packuswb m12, m1
12530
12531 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 ; m1 is scratch; m0/m2/m3 stay live
12532
12533 ; vectors 8 to 15 of this call
12534 palignr m4, m2, m0, 6
12535 palignr m1, m3, m2, 6
12536 pmaddubsw m4, [r3 + 5 * 32] ; [21]
12537 pmulhrsw m4, m7
12538 pmaddubsw m1, [r3 + 5 * 32]
12539 pmulhrsw m1, m7
12540 packuswb m4, m1
12541
12542 palignr m8, m2, m0, 8
12543 palignr m1, m3, m2, 8
12544 pmaddubsw m5, m8, [r3 - 14 * 32] ; [2]
12545 pmulhrsw m5, m7
12546 pmaddubsw m9, m1, [r3 - 14 * 32]
12547 pmulhrsw m9, m7
12548 packuswb m5, m9
12549
12550 pmaddubsw m6, m8, [r3 - 1 * 32] ; [15]
12551 pmulhrsw m6, m7
12552 pmaddubsw m9, m1, [r3 - 1 * 32]
12553 pmulhrsw m9, m7
12554 packuswb m6, m9
12555
12556 pmaddubsw m8, [r3 + 12 * 32] ; [28]
12557 pmulhrsw m8, m7
12558 pmaddubsw m1, [r3 + 12 * 32]
12559 pmulhrsw m1, m7
12560 packuswb m8, m1
12561
12562 palignr m10, m2, m0, 10
12563 palignr m1, m3, m2, 10
12564 pmaddubsw m9, m10, [r3 - 7 * 32] ; [9]
12565 pmulhrsw m9, m7
12566 pmaddubsw m11, m1, [r3 - 7 * 32]
12567 pmulhrsw m11, m7
12568 packuswb m9, m11
12569
12570 pmaddubsw m10, [r3 + 6 * 32] ; [22]
12571 pmulhrsw m10, m7
12572 pmaddubsw m1, m1, [r3 + 6 * 32]
12573 pmulhrsw m1, m7
12574 packuswb m10, m1
12575
12576 palignr m3, m2, 12 ; must read m2 before the next line overwrites it
12577 palignr m2, m0, 12
12578 pmaddubsw m11, m2, [r3 - 13 * 32] ; [3]
12579 pmulhrsw m11, m7
12580 pmaddubsw m1, m3, [r3 - 13 * 32]
12581 pmulhrsw m1, m7
12582 packuswb m11, m1
12583
12584 pmaddubsw m2, [r3] ; [16]
12585 pmulhrsw m2, m7
12586 pmaddubsw m3, [r3]
12587 pmulhrsw m3, m7
12588 packuswb m2, m3
12589
12590 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8 ; last vector lives in m2, m0 is scratch
12591 ret
12592
; ang32_mode_6_30_row_16_31
; Second-half helper of the 32x32 angular predictors for modes 6 and 30. It is
; called by intra_pred_ang32_6 / intra_pred_ang32_30 after
; ang32_mode_6_30_row_0_15 and produces 16 lines of prediction in two groups
; of eight, each group handed to TRANSPOSE_32x8_AVX2.
;
; Register contract, as set up by the callers visible in this file:
;   r2  - reference pixels (the caller has already advanced it by 6 bytes)
;   r3  - ang_table_avx2 + 32 * 16, so [r3 + k * 32] is table row 16 + k; the
;         bracketed number after each pmaddubsw is that row index (the weight)
;   m7  - loaded from pw_1024; pmulhrsw by 1024 is a rounded shift right by 5
;         (assumes pw_1024 holds words of 1024 - TODO confirm)
;   r7d - 0 when called for mode 6, 1 when called for mode 30
; The weights advance by 13 (mod 32) per line: 29, 10, 23, ..., 19, 0.
;
; NOTE(review): TRANSPOSE_32x8_AVX2 is defined outside this view. It presumably
; consumes the flags from the `test` below plus r0/r1/r5/r6 to choose between
; a transposed and a direct store - confirm against the macro.
12593 cglobal ang32_mode_6_30_row_16_31
12594 test r7d, r7d ; set ZF from the mode selector; nothing below up to the macro writes EFLAGS
12595 ; rows 0 to 7 of this helper (block rows 16 to 23, going by the helper's name)
12596 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12597 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
12598 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
12599 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
12600
; Interleave each pixel with its right-hand neighbour so that one pmaddubsw
; against a weight row yields the two-tap sum for every output pixel.
12601 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
12602 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
12603 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
12604
; Every line below follows the same pattern: multiply-add both halves by the
; weight row, round with m7, pack the two halves back to bytes.
12605 pmaddubsw m4, m0, [r3 + 13 * 32] ; [29]
12606 pmulhrsw m4, m7
12607 pmaddubsw m1, m2, [r3 + 13 * 32]
12608 pmulhrsw m1, m7
12609 packuswb m4, m1
12610
; palignr by 2 bytes steps the interleaved pairs forward by one reference pixel.
12611 palignr m6, m2, m0, 2
12612 palignr m1, m3, m2, 2
12613 pmaddubsw m5, m6, [r3 - 6 * 32] ; [10]
12614 pmulhrsw m5, m7
12615 pmaddubsw m8, m1, [r3 - 6 * 32]
12616 pmulhrsw m8, m7
12617 packuswb m5, m8
12618
12619 pmaddubsw m6, [r3 + 7 * 32] ; [23]
12620 pmulhrsw m6, m7
12621 pmaddubsw m1, [r3 + 7 * 32]
12622 pmulhrsw m1, m7
12623 packuswb m6, m1
12624
12625 palignr m10, m2, m0, 4
12626 palignr m1, m3, m2, 4
12627 pmaddubsw m8, m10, [r3 - 12 * 32] ; [4]
12628 pmulhrsw m8, m7
12629 pmaddubsw m11, m1, [r3 - 12 * 32]
12630 pmulhrsw m11, m7
12631 packuswb m8, m11
12632
12633 pmaddubsw m9, m10, [r3 + 1 * 32] ; [17]
12634 pmulhrsw m9, m7
12635 pmaddubsw m11, m1, [r3 + 1 * 32]
12636 pmulhrsw m11, m7
12637 packuswb m9, m11
12638
12639 pmaddubsw m10, [r3 + 14 * 32] ; [30]
12640 pmulhrsw m10, m7
12641 pmaddubsw m1, [r3 + 14 * 32]
12642 pmulhrsw m1, m7
12643 packuswb m10, m1
12644
12645 palignr m12, m2, m0, 6
12646 palignr m1, m3, m2, 6
12647 pmaddubsw m11, m12, [r3 - 5 * 32] ; [11]
12648 pmulhrsw m11, m7
12649 pmaddubsw m1, [r3 - 5 * 32]
12650 pmulhrsw m1, m7
12651 packuswb m11, m1
12652
12653 palignr m1, m3, m2, 6 ; m1 was overwritten by the previous line's result, so rebuild the shifted high half
12654 pmaddubsw m12, [r3 + 8 * 32] ; [24]
12655 pmulhrsw m12, m7
12656 pmaddubsw m1, [r3 + 8 * 32]
12657 pmulhrsw m1, m7
12658 packuswb m12, m1
12659
12660 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
12661
12662 ; rows 8 to 15 of this helper (block rows 24 to 31, going by the helper's name)
12663 palignr m6, m2, m0, 8
12664 palignr m1, m3, m2, 8
12665 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
12666 pmulhrsw m4, m7
12667 pmaddubsw m8, m1, [r3 - 11 * 32]
12668 pmulhrsw m8, m7
12669 packuswb m4, m8
12670
12671 pmaddubsw m5, m6, [r3 + 2 * 32] ; [18]
12672 pmulhrsw m5, m7
12673 pmaddubsw m9, m1, [r3 + 2 * 32]
12674 pmulhrsw m9, m7
12675 packuswb m5, m9
12676
12677 pmaddubsw m6, [r3 + 15 * 32] ; [31]
12678 pmulhrsw m6, m7
12679 pmaddubsw m1, [r3 + 15 * 32]
12680 pmulhrsw m1, m7
12681 packuswb m6, m1
12682
12683 palignr m9, m2, m0, 10
12684 palignr m1, m3, m2, 10
12685 pmaddubsw m8, m9, [r3 - 4 * 32] ; [12]
12686 pmulhrsw m8, m7
12687 pmaddubsw m10, m1, [r3 - 4 * 32]
12688 pmulhrsw m10, m7
12689 packuswb m8, m10
12690
12691 pmaddubsw m9, [r3 + 9 * 32] ; [25]
12692 pmulhrsw m9, m7
12693 pmaddubsw m1, [r3 + 9 * 32]
12694 pmulhrsw m1, m7
12695 packuswb m9, m1
12696
; The last shift is done in place: m0/m2/m3 are not needed unshifted any more.
12697 palignr m3, m2, 12
12698 palignr m2, m0, 12
12699 pmaddubsw m10, m2, [r3 - 10 * 32] ; [6]
12700 pmulhrsw m10, m7
12701 pmaddubsw m1, m3, [r3 - 10 * 32]
12702 pmulhrsw m1, m7
12703 packuswb m10, m1
12704
12705 pmaddubsw m2, [r3 + 3 * 32] ; [19]
12706 pmulhrsw m2, m7
12707 pmaddubsw m3, [r3 + 3 * 32]
12708 pmulhrsw m3, m7
12709 packuswb m2, m3
12710
; Weight 0: the final line needs no interpolation and is loaded straight from
; the reference pixels.
12711 movu m3, [r2 + 8] ; [0]
12712
12713 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 8
12714 ret
12715
; intra_pred_ang32_6 (AVX2): 32x32 angular intra prediction, mode 6.
; cglobal declares 3 arguments, 8 general-purpose and 13 vector registers.
;   r0 - destination, r1 - destination stride, r2 - reference pixels
;        (roles as used by the stores/loads in this file's predictors)
; The work is split over the two helpers shared with mode 30; r7d = 0 tells
; them they are running for mode 6.
12716 INIT_YMM avx2
12717 cglobal intra_pred_ang32_6, 3,8,13
12718 add r2, 64 ; use the reference samples 64 bytes into the buffer (presumably the left-neighbour half - TODO confirm layout)
12719 lea r3, [ang_table_avx2 + 32 * 16] ; centre of the weight table, helpers index it as [r3 +/- k * 32]
12720 lea r5, [r1 * 3] ; r5 -> 3 * stride
12721 lea r6, [r1 * 4] ; r6 -> 4 * stride
12722 mova m7, [pw_1024] ; rounding multiplier for pmulhrsw
12723 mov r4, r0 ; keep the original destination so the second half can be offset from it
12724 xor r7d, r7d ; r7d = 0 -> mode 6 path in the helpers
12725
12726 call ang32_mode_6_30_row_0_15
12727
; Second half: destination moves 16 bytes to the right and the reference
; pointer advances by 6 pixels (17 * 13 / 32 = 6, matching the helpers'
; per-line weight step of 13).
12728 add r4, 16
12729 mov r0, r4
12730 add r2, 6
12731
12732 call ang32_mode_6_30_row_16_31
12733 RET
12734
; intra_pred_ang32_30 (AVX2): 32x32 angular intra prediction, mode 30.
; Uses the same two helpers as mode 6 but with r7d = 1 and without the
; 64-byte reference offset or the destination bookkeeping in r4.
;   r0 - destination, r1 - destination stride, r2 - reference pixels
; NOTE(review): r0 is not adjusted between the two calls here, so the helpers
; (via TRANSPOSE_32x8_AVX2, not visible in this view) presumably advance or
; offset the destination themselves on this path - confirm.
12735 INIT_YMM avx2
12736 cglobal intra_pred_ang32_30, 3,8,13
12737 lea r3, [ang_table_avx2 + 32 * 16] ; centre of the weight table, helpers index it as [r3 +/- k * 32]
12738 lea r5, [r1 * 3] ; r5 -> 3 * stride
12739 lea r6, [r1 * 4] ; r6 -> 4 * stride
12740 mova m7, [pw_1024] ; rounding multiplier for pmulhrsw
12741 xor r7d, r7d
12742 inc r7d ; r7d = 1 -> mode 30 path in the helpers
12743
12744 call ang32_mode_6_30_row_0_15
12745
12746 add r2, 6 ; reference pointer advances by 6 pixels for the second half (17 * 13 / 32 = 6)
12747
12748 call ang32_mode_6_30_row_16_31
12749 RET
12750
; ang32_mode_7_29_row_0_15
; First-half helper of the 32x32 angular predictors for modes 7 and 29
; (called by intra_pred_ang32_7 / intra_pred_ang32_29). Produces 16 lines of
; prediction in two groups of eight, each handed to TRANSPOSE_32x8_AVX2.
;
; Register contract, as set up by the callers visible in this file:
;   r2  - reference pixels
;   r3  - ang_table_avx2 + 32 * 16, so [r3 + k * 32] is table row 16 + k; the
;         bracketed number after each pmaddubsw is that row index (the weight)
;   m7  - loaded from pw_1024; pmulhrsw by 1024 is a rounded shift right by 5
;         (assumes pw_1024 holds words of 1024 - TODO confirm)
;   r7d - 0 when called for mode 7, 1 when called for mode 29
; The weights advance by 9 (mod 32) per line: 9, 18, 27, 4, ..., 7, 16.
;
; NOTE(review): TRANSPOSE_32x8_AVX2 is defined outside this view. It presumably
; consumes the flags from the `test` below plus r0/r1/r5/r6 to choose between
; a transposed and a direct store - confirm against the macro.
12751 cglobal ang32_mode_7_29_row_0_15
12752 test r7d, r7d ; set ZF from the mode selector; nothing below up to the macro writes EFLAGS
12753 ; rows 0 to 7
12754 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12755 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
12756 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
12757 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
12758
; Interleave each pixel with its right-hand neighbour so that one pmaddubsw
; against a weight row yields the two-tap sum for every output pixel.
12759 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
12760 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
12761 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
12762
; Every line below follows the same pattern: multiply-add both halves by the
; weight row, round with m7, pack the two halves back to bytes.
12763 pmaddubsw m4, m0, [r3 - 7 * 32] ; [9]
12764 pmulhrsw m4, m7
12765 pmaddubsw m1, m2, [r3 - 7 * 32]
12766 pmulhrsw m1, m7
12767 packuswb m4, m1
12768
12769 pmaddubsw m5, m0, [r3 + 2 * 32] ; [18]
12770 pmulhrsw m5, m7
12771 pmaddubsw m8, m2, [r3 + 2 * 32]
12772 pmulhrsw m8, m7
12773 packuswb m5, m8
12774
12775 pmaddubsw m6, m0, [r3 + 11 * 32] ; [27]
12776 pmulhrsw m6, m7
12777 pmaddubsw m9, m2, [r3 + 11 * 32]
12778 pmulhrsw m9, m7
12779 packuswb m6, m9
12780
; palignr by 2 bytes steps the interleaved pairs forward by one reference pixel.
12781 palignr m11, m2, m0, 2
12782 palignr m1, m3, m2, 2
12783 pmaddubsw m8, m11, [r3 - 12 * 32] ; [4]
12784 pmulhrsw m8, m7
12785 pmaddubsw m12, m1, [r3 - 12 * 32]
12786 pmulhrsw m12, m7
12787 packuswb m8, m12
12788
12789 pmaddubsw m9, m11, [r3 - 3 * 32] ; [13]
12790 pmulhrsw m9, m7
12791 pmaddubsw m12, m1, [r3 - 3 * 32]
12792 pmulhrsw m12, m7
12793 packuswb m9, m12
12794
12795 pmaddubsw m10, m11, [r3 + 6 * 32] ; [22]
12796 pmulhrsw m10, m7
12797 pmaddubsw m12, m1, [r3 + 6 * 32]
12798 pmulhrsw m12, m7
12799 packuswb m10, m12
12800
12801 pmaddubsw m11, [r3 + 15 * 32] ; [31]
12802 pmulhrsw m11, m7
12803 pmaddubsw m1, [r3 + 15 * 32]
12804 pmulhrsw m1, m7
12805 packuswb m11, m1
12806
12807 palignr m12, m2, m0, 4
12808 palignr m1, m3, m2, 4
12809 pmaddubsw m12, [r3 - 8 * 32] ; [8]
12810 pmulhrsw m12, m7
12811 pmaddubsw m1, [r3 - 8 * 32]
12812 pmulhrsw m1, m7
12813 packuswb m12, m1
12814
12815 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
12816
12817 ; rows 8 to 15
12818 palignr m5, m2, m0, 4
12819 palignr m1, m3, m2, 4
12820 pmaddubsw m4, m5, [r3 + 1 * 32] ; [17]
12821 pmulhrsw m4, m7
12822 pmaddubsw m8, m1, [r3 + 1 * 32]
12823 pmulhrsw m8, m7
12824 packuswb m4, m8
12825
12826 pmaddubsw m5, [r3 + 10 * 32] ; [26]
12827 pmulhrsw m5, m7
12828 pmaddubsw m1, [r3 + 10 * 32]
12829 pmulhrsw m1, m7
12830 packuswb m5, m1
12831
12832 palignr m10, m2, m0, 6
12833 palignr m1, m3, m2, 6
12834 pmaddubsw m6, m10, [r3 - 13 * 32] ; [3]
12835 pmulhrsw m6, m7
12836 pmaddubsw m9, m1, [r3 - 13 * 32]
12837 pmulhrsw m9, m7
12838 packuswb m6, m9
12839
12840 pmaddubsw m8, m10, [r3 - 4 * 32] ; [12]
12841 pmulhrsw m8, m7
12842 pmaddubsw m11, m1, [r3 - 4 * 32]
12843 pmulhrsw m11, m7
12844 packuswb m8, m11
12845
12846 pmaddubsw m9, m10, [r3 + 5 * 32] ; [21]
12847 pmulhrsw m9, m7
12848 pmaddubsw m11, m1, [r3 + 5 * 32]
12849 pmulhrsw m11, m7
12850 packuswb m9, m11
12851
12852 pmaddubsw m10, [r3 + 14 * 32] ; [30]
12853 pmulhrsw m10, m7
12854 pmaddubsw m1, [r3 + 14 * 32]
12855 pmulhrsw m1, m7
12856 packuswb m10, m1
12857
; The last shift is done in place: m0/m2/m3 are not needed unshifted any more.
12858 palignr m3, m2, 8
12859 palignr m2, m0, 8
12860 pmaddubsw m11, m2, [r3 - 9 * 32] ; [7]
12861 pmulhrsw m11, m7
12862 pmaddubsw m1, m3, [r3 - 9 * 32]
12863 pmulhrsw m1, m7
12864 packuswb m11, m1
12865
12866 pmaddubsw m2, [r3] ; [16]
12867 pmulhrsw m2, m7
12868 pmaddubsw m3, [r3]
12869 pmulhrsw m3, m7
12870 packuswb m2, m3
12871
12872 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
12873 ret
12874
; ang32_mode_7_29_row_16_31
; Second-half helper of the 32x32 angular predictors for modes 7 and 29. It is
; called by intra_pred_ang32_7 / intra_pred_ang32_29 after
; ang32_mode_7_29_row_0_15 and produces 16 lines of prediction in two groups
; of eight, each handed to TRANSPOSE_32x8_AVX2.
;
; Register contract, as set up by the callers visible in this file:
;   r2  - reference pixels (the caller has already advanced it by 4 bytes)
;   r3  - ang_table_avx2 + 32 * 16, so [r3 + k * 32] is table row 16 + k; the
;         bracketed number after each pmaddubsw is that row index (the weight)
;   m7  - loaded from pw_1024; pmulhrsw by 1024 is a rounded shift right by 5
;         (assumes pw_1024 holds words of 1024 - TODO confirm)
;   r7d - 0 when called for mode 7, 1 when called for mode 29
; The weights advance by 9 (mod 32) per line: 25, 2, 11, ..., 23, 0.
;
; NOTE(review): TRANSPOSE_32x8_AVX2 is defined outside this view. It presumably
; consumes the flags from the `test` below plus r0/r1/r5/r6 to choose between
; a transposed and a direct store - confirm against the macro.
12875 cglobal ang32_mode_7_29_row_16_31
12876 test r7d, r7d ; set ZF from the mode selector; nothing below up to the macro writes EFLAGS
12877 ; rows 0 to 7 of this helper (block rows 16 to 23, going by the helper's name)
12878 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
12879 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
12880 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
12881 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
12882
; Interleave each pixel with its right-hand neighbour so that one pmaddubsw
; against a weight row yields the two-tap sum for every output pixel.
12883 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
12884 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
12885 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
12886
; Every line below follows the same pattern: multiply-add both halves by the
; weight row, round with m7, pack the two halves back to bytes.
12887 pmaddubsw m4, m0, [r3 + 9 * 32] ; [25]
12888 pmulhrsw m4, m7
12889 pmaddubsw m1, m2, [r3 + 9 * 32]
12890 pmulhrsw m1, m7
12891 packuswb m4, m1
12892
; palignr by 2 bytes steps the interleaved pairs forward by one reference pixel.
12893 palignr m9, m2, m0, 2
12894 palignr m1, m3, m2, 2
12895 pmaddubsw m5, m9, [r3 - 14 * 32] ; [2]
12896 pmulhrsw m5, m7
12897 pmaddubsw m8, m1, [r3 - 14 * 32]
12898 pmulhrsw m8, m7
12899 packuswb m5, m8
12900
12901 pmaddubsw m6, m9, [r3 - 5 * 32] ; [11]
12902 pmulhrsw m6, m7
12903 pmaddubsw m10, m1, [r3 - 5 * 32]
12904 pmulhrsw m10, m7
12905 packuswb m6, m10
12906
12907 pmaddubsw m8, m9, [r3 + 4 * 32] ; [20]
12908 pmulhrsw m8, m7
12909 pmaddubsw m10, m1, [r3 + 4 * 32]
12910 pmulhrsw m10, m7
12911 packuswb m8, m10
12912
12913 pmaddubsw m9, [r3 + 13 * 32] ; [29]
12914 pmulhrsw m9, m7
12915 pmaddubsw m1, [r3 + 13 * 32]
12916 pmulhrsw m1, m7
12917 packuswb m9, m1
12918
12919 palignr m12, m2, m0, 4
12920 palignr m1, m3, m2, 4
12921 pmaddubsw m10, m12, [r3 - 10 * 32] ; [6]
12922 pmulhrsw m10, m7
12923 pmaddubsw m11, m1, [r3 - 10 * 32]
12924 pmulhrsw m11, m7
12925 packuswb m10, m11
12926
12927 pmaddubsw m11, m12, [r3 - 1 * 32] ; [15]
12928 pmulhrsw m11, m7
12929 pmaddubsw m1, [r3 - 1 * 32]
12930 pmulhrsw m1, m7
12931 packuswb m11, m1
12932
12933 palignr m1, m3, m2, 4 ; m1 was overwritten by the previous line's result, so rebuild the shifted high half
12934 pmaddubsw m12, [r3 + 8 * 32] ; [24]
12935 pmulhrsw m12, m7
12936 pmaddubsw m1, [r3 + 8 * 32]
12937 pmulhrsw m1, m7
12938 packuswb m12, m1
12939
12940 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
12941
12942 ; rows 8 to 15 of this helper (block rows 24 to 31, going by the helper's name)
12943 palignr m8, m2, m0, 6
12944 palignr m1, m3, m2, 6
12945 pmaddubsw m4, m8, [r3 - 15 * 32] ; [1]
12946 pmulhrsw m4, m7
12947 pmaddubsw m9, m1, [r3 - 15 * 32]
12948 pmulhrsw m9, m7
12949 packuswb m4, m9
12950
12951 pmaddubsw m5, m8, [r3 - 6 * 32] ; [10]
12952 pmulhrsw m5, m7
12953 pmaddubsw m9, m1, [r3 - 6 * 32]
12954 pmulhrsw m9, m7
12955 packuswb m5, m9
12956
12957 pmaddubsw m6, m8, [r3 + 3 * 32] ; [19]
12958 pmulhrsw m6, m7
12959 pmaddubsw m9, m1, [r3 + 3 * 32]
12960 pmulhrsw m9, m7
12961 packuswb m6, m9
12962
12963 pmaddubsw m8, [r3 + 12 * 32] ; [28]
12964 pmulhrsw m8, m7
12965 pmaddubsw m1, [r3 + 12 * 32]
12966 pmulhrsw m1, m7
12967 packuswb m8, m1
12968
; The last shift is done in place: m0/m2/m3 are not needed unshifted any more.
12969 palignr m3, m2, 8
12970 palignr m2, m0, 8
12971 pmaddubsw m9, m2, [r3 - 11 * 32] ; [5]
12972 pmulhrsw m9, m7
12973 pmaddubsw m1, m3, [r3 - 11 * 32]
12974 pmulhrsw m1, m7
12975 packuswb m9, m1
12976
12977 pmaddubsw m10, m2, [r3 - 2 * 32] ; [14]
12978 pmulhrsw m10, m7
12979 pmaddubsw m1, m3, [r3 - 2 * 32]
12980 pmulhrsw m1, m7
12981 packuswb m10, m1
12982
12983 pmaddubsw m2, [r3 + 7 * 32] ; [23]
12984 pmulhrsw m2, m7
12985 pmaddubsw m3, [r3 + 7 * 32]
12986 pmulhrsw m3, m7
12987 packuswb m2, m3
12988
; Weight 0: the final line needs no interpolation and is loaded straight from
; the reference pixels.
12989 movu m1, [r2 + 6] ; [0]
12990
12991 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 1, 0, 8
12992 ret
12993
; intra_pred_ang32_7 (AVX2): 32x32 angular intra prediction, mode 7.
; cglobal declares 3 arguments, 8 general-purpose and 13 vector registers.
;   r0 - destination, r1 - destination stride, r2 - reference pixels
;        (roles as used by the stores/loads in this file's predictors)
; The work is split over the two helpers shared with mode 29; r7d = 0 tells
; them they are running for mode 7.
12994 INIT_YMM avx2
12995 cglobal intra_pred_ang32_7, 3,8,13
12996 add r2, 64 ; use the reference samples 64 bytes into the buffer (presumably the left-neighbour half - TODO confirm layout)
12997 lea r3, [ang_table_avx2 + 32 * 16] ; centre of the weight table, helpers index it as [r3 +/- k * 32]
12998 lea r5, [r1 * 3] ; r5 -> 3 * stride
12999 lea r6, [r1 * 4] ; r6 -> 4 * stride
13000 mova m7, [pw_1024] ; rounding multiplier for pmulhrsw
13001 mov r4, r0 ; keep the original destination so the second half can be offset from it
13002 xor r7d, r7d ; r7d = 0 -> mode 7 path in the helpers
13003
13004 call ang32_mode_7_29_row_0_15
13005
; Second half: destination moves 16 bytes to the right and the reference
; pointer advances by 4 pixels (17 * 9 / 32 = 4, matching the helpers'
; per-line weight step of 9).
13006 add r4, 16
13007 mov r0, r4
13008 add r2, 4
13009
13010 call ang32_mode_7_29_row_16_31
13011 RET
13012
; intra_pred_ang32_29 (AVX2): 32x32 angular intra prediction, mode 29.
; Uses the same two helpers as mode 7 but with r7d = 1 and without the
; 64-byte reference offset or the destination bookkeeping in r4.
;   r0 - destination, r1 - destination stride, r2 - reference pixels
; NOTE(review): r0 is not adjusted between the two calls here, so the helpers
; (via TRANSPOSE_32x8_AVX2, not visible in this view) presumably advance or
; offset the destination themselves on this path - confirm.
13013 INIT_YMM avx2
13014 cglobal intra_pred_ang32_29, 3,8,13
13015 lea r3, [ang_table_avx2 + 32 * 16] ; centre of the weight table, helpers index it as [r3 +/- k * 32]
13016 lea r5, [r1 * 3] ; r5 -> 3 * stride
13017 lea r6, [r1 * 4] ; r6 -> 4 * stride
13018 mova m7, [pw_1024] ; rounding multiplier for pmulhrsw
13019 xor r7d, r7d
13020 inc r7d ; r7d = 1 -> mode 29 path in the helpers
13021
13022 call ang32_mode_7_29_row_0_15
13023
13024 add r2, 4 ; reference pointer advances by 4 pixels for the second half (17 * 9 / 32 = 4)
13025
13026 call ang32_mode_7_29_row_16_31
13027 RET
13028
; ang32_mode_8_28_avx2
; Shared body of the 32x32 angular predictors for modes 8 and 28 (called by
; intra_pred_ang32_8 / intra_pred_ang32_28). Unlike the mode 6/7 helpers this
; one produces all 32 lines itself, in four groups of eight, each handed to
; TRANSPOSE_32x8_AVX2.
;
; Register contract, as set up by the callers visible in this file:
;   r2  - reference pixels
;   r3  - ang_table_avx2 + 32 * 16, so [r3 + k * 32] is table row 16 + k; the
;         bracketed number after each pmaddubsw is that row index (the weight)
;   r4  - copy of the destination pointer (set by intra_pred_ang32_8 only)
;   m7  - loaded from pw_1024; pmulhrsw by 1024 is a rounded shift right by 5
;         (assumes pw_1024 holds words of 1024 - TODO confirm)
;   r7d - 0 when called for mode 8, 1 when called for mode 28
; The weights advance by 5 (mod 32) per line: 5, 10, 15, ..., 27, 0.
;
; NOTE(review): TRANSPOSE_32x8_AVX2 is defined outside this view. It presumably
; consumes the flags from the `test` below plus r0/r1/r5/r6 to choose between
; a transposed and a direct store - confirm against the macro.
13029 cglobal ang32_mode_8_28_avx2
13030 test r7d, r7d ; set ZF from the mode selector; also consumed by the jnz before rows 16 to 23
13031 ; rows 0 to 7
13032 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
13033 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
13034 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
13035 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
13036
; Interleave each pixel with its right-hand neighbour so that one pmaddubsw
; against a weight row yields the two-tap sum for every output pixel.
13037 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
13038 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
13039 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
13040
; Every line below follows the same pattern: multiply-add both halves by the
; weight row, round with m7, pack the two halves back to bytes.
13041 pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
13042 pmulhrsw m4, m7
13043 pmaddubsw m1, m2, [r3 - 11 * 32]
13044 pmulhrsw m1, m7
13045 packuswb m4, m1
13046
13047 pmaddubsw m5, m0, [r3 - 6 * 32] ; [10]
13048 pmulhrsw m5, m7
13049 pmaddubsw m8, m2, [r3 - 6 * 32]
13050 pmulhrsw m8, m7
13051 packuswb m5, m8
13052
13053 pmaddubsw m6, m0, [r3 - 1 * 32] ; [15]
13054 pmulhrsw m6, m7
13055 pmaddubsw m9, m2, [r3 - 1 * 32]
13056 pmulhrsw m9, m7
13057 packuswb m6, m9
13058
13059 pmaddubsw m8, m0, [r3 + 4 * 32] ; [20]
13060 pmulhrsw m8, m7
13061 pmaddubsw m12, m2, [r3 + 4 * 32]
13062 pmulhrsw m12, m7
13063 packuswb m8, m12
13064
13065 pmaddubsw m9, m0, [r3 + 9 * 32] ; [25]
13066 pmulhrsw m9, m7
13067 pmaddubsw m12, m2, [r3 + 9 * 32]
13068 pmulhrsw m12, m7
13069 packuswb m9, m12
13070
13071 pmaddubsw m10, m0, [r3 + 14 * 32] ; [30]
13072 pmulhrsw m10, m7
13073 pmaddubsw m12, m2, [r3 + 14 * 32]
13074 pmulhrsw m12, m7
13075 packuswb m10, m12
13076
; palignr by 2 bytes steps the interleaved pairs forward by one reference pixel.
13077 palignr m12, m2, m0, 2
13078 palignr m1, m3, m2, 2
13079 pmaddubsw m11, m12, [r3 - 13 * 32] ; [3]
13080 pmulhrsw m11, m7
13081 pmaddubsw m1, [r3 - 13 * 32]
13082 pmulhrsw m1, m7
13083 packuswb m11, m1
13084
13085 palignr m1, m3, m2, 2 ; m1 was overwritten by the previous line's result, so rebuild the shifted high half
13086 pmaddubsw m12, [r3 - 8 * 32] ; [8]
13087 pmulhrsw m12, m7
13088 pmaddubsw m1, [r3 - 8 * 32]
13089 pmulhrsw m1, m7
13090 packuswb m12, m1
13091
13092 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
13093
13094 ; rows 8 to 15
13095
13096 palignr m8, m2, m0, 2
13097 palignr m1, m3, m2, 2
13098 pmaddubsw m4, m8, [r3 - 3 * 32] ; [13]
13099 pmulhrsw m4, m7
13100 pmaddubsw m9, m1, [r3 - 3 * 32]
13101 pmulhrsw m9, m7
13102 packuswb m4, m9
13103
13104 pmaddubsw m5, m8, [r3 + 2 * 32] ; [18]
13105 pmulhrsw m5, m7
13106 pmaddubsw m9, m1, [r3 + 2 * 32]
13107 pmulhrsw m9, m7
13108 packuswb m5, m9
13109
13110 pmaddubsw m6, m8, [r3 + 7 * 32] ; [23]
13111 pmulhrsw m6, m7
13112 pmaddubsw m9, m1, [r3 + 7 * 32]
13113 pmulhrsw m9, m7
13114 packuswb m6, m9
13115
13116 pmaddubsw m8, [r3 + 12 * 32] ; [28]
13117 pmulhrsw m8, m7
13118 pmaddubsw m1, [r3 + 12 * 32]
13119 pmulhrsw m1, m7
13120 packuswb m8, m1
13121
13122 palignr m12, m2, m0, 4
13123 palignr m1, m3, m2, 4
13124 pmaddubsw m9, m12, [r3 - 15 * 32] ; [1]
13125 pmulhrsw m9, m7
13126 pmaddubsw m11, m1, [r3 - 15 * 32]
13127 pmulhrsw m11, m7
13128 packuswb m9, m11
13129
13130 pmaddubsw m10, m12, [r3 - 10 * 32] ; [6]
13131 pmulhrsw m10, m7
13132 pmaddubsw m11, m1, [r3 - 10 * 32]
13133 pmulhrsw m11, m7
13134 packuswb m10, m11
13135
13136 pmaddubsw m11, m12, [r3 - 5 * 32] ; [11]
13137 pmulhrsw m11, m7
13138 pmaddubsw m1, [r3 - 5 * 32]
13139 pmulhrsw m1, m7
13140 packuswb m11, m1
13141
13142 palignr m1, m3, m2, 4 ; rebuild the shifted high half clobbered just above
13143 pmaddubsw m12, [r3] ; [16]
13144 pmulhrsw m12, m7
13145 pmaddubsw m1, [r3]
13146 pmulhrsw m1, m7
13147 packuswb m12, m1
13148
13149 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
13150
13151 ; rows 16 to 23
13152
; When r7d == 0 (mode 8 caller) the destination is moved to r4 + mmsize/2,
; i.e. 16 bytes right of the original with YMM registers, mirroring the
; explicit `add r4, 16` the mode 6/7 entry points do between their helpers.
; NOTE(review): this branch relies on ZF from the `test r7d, r7d` at entry
; still being intact. The SIMD instructions in between do not write EFLAGS;
; it assumes TRANSPOSE_32x8_AVX2 preserves them as well - confirm.
13153 jnz .doNotAdjustBufferPtr
13154 lea r4, [r4 + mmsize/2]
13155 mov r0, r4
13156 .doNotAdjustBufferPtr:
13157
13158 palignr m6, m2, m0, 4
13159 palignr m1, m3, m2, 4
13160 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
13161 pmulhrsw m4, m7
13162 pmaddubsw m8, m1, [r3 + 5 * 32]
13163 pmulhrsw m8, m7
13164 packuswb m4, m8
13165
13166 pmaddubsw m5, m6, [r3 + 10 * 32] ; [26]
13167 pmulhrsw m5, m7
13168 pmaddubsw m8, m1, [r3 + 10 * 32]
13169 pmulhrsw m8, m7
13170 packuswb m5, m8
13171
13172 pmaddubsw m6, [r3 + 15 * 32] ; [31]
13173 pmulhrsw m6, m7
13174 pmaddubsw m1, [r3 + 15 * 32]
13175 pmulhrsw m1, m7
13176 packuswb m6, m1
13177
13178 palignr m12, m2, m0, 6
13179 palignr m1, m3, m2, 6
13180 pmaddubsw m8, m12, [r3 - 12 * 32] ; [4]
13181 pmulhrsw m8, m7
13182 pmaddubsw m11, m1, [r3 - 12 * 32]
13183 pmulhrsw m11, m7
13184 packuswb m8, m11
13185
13186 pmaddubsw m9, m12, [r3 - 7 * 32] ; [9]
13187 pmulhrsw m9, m7
13188 pmaddubsw m11, m1, [r3 - 7 * 32]
13189 pmulhrsw m11, m7
13190 packuswb m9, m11
13191
13192 pmaddubsw m10, m12, [r3 - 2 * 32] ; [14]
13193 pmulhrsw m10, m7
13194 pmaddubsw m11, m1, [r3 - 2 * 32]
13195 pmulhrsw m11, m7
13196 packuswb m10, m11
13197
13198 pmaddubsw m11, m12, [r3 + 3 * 32] ; [19]
13199 pmulhrsw m11, m7
13200 pmaddubsw m1, [r3 + 3 * 32]
13201 pmulhrsw m1, m7
13202 packuswb m11, m1
13203
13204 palignr m1, m3, m2, 6 ; rebuild the shifted high half clobbered just above
13205 pmaddubsw m12, [r3 + 8 * 32] ; [24]
13206 pmulhrsw m12, m7
13207 pmaddubsw m1, [r3 + 8 * 32]
13208 pmulhrsw m1, m7
13209 packuswb m12, m1
13210
13211 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 16
13212
13213 ; rows 24 to 31
13214 palignr m4, m2, m0, 6
13215 palignr m1, m3, m2, 6
13216 pmaddubsw m4, [r3 + 13 * 32] ; [29]
13217 pmulhrsw m4, m7
13218 pmaddubsw m1, [r3 + 13 * 32]
13219 pmulhrsw m1, m7
13220 packuswb m4, m1
13221
; The last shift is done in place: m0/m2/m3 are not needed unshifted any more.
13222 palignr m3, m2, 8
13223 palignr m2, m0, 8
13224 pmaddubsw m5, m2, [r3 - 14 * 32] ; [2]
13225 pmulhrsw m5, m7
13226 pmaddubsw m9, m3, [r3 - 14 * 32]
13227 pmulhrsw m9, m7
13228 packuswb m5, m9
13229
13230 pmaddubsw m6, m2, [r3 - 9 * 32] ; [7]
13231 pmulhrsw m6, m7
13232 pmaddubsw m9, m3, [r3 - 9 * 32]
13233 pmulhrsw m9, m7
13234 packuswb m6, m9
13235
13236 pmaddubsw m8, m2, [r3 - 4 * 32] ; [12]
13237 pmulhrsw m8, m7
13238 pmaddubsw m1, m3, [r3 - 4 * 32]
13239 pmulhrsw m1, m7
13240 packuswb m8, m1
13241
13242 pmaddubsw m9, m2, [r3 + 1 * 32] ; [17]
13243 pmulhrsw m9, m7
13244 pmaddubsw m11, m3, [r3 + 1 * 32]
13245 pmulhrsw m11, m7
13246 packuswb m9, m11
13247
13248 pmaddubsw m10, m2, [r3 + 6 * 32] ; [22]
13249 pmulhrsw m10, m7
13250 pmaddubsw m1, m3, [r3 + 6 * 32]
13251 pmulhrsw m1, m7
13252 packuswb m10, m1
13253
13254 pmaddubsw m2, [r3 + 11 * 32] ; [27]
13255 pmulhrsw m2, m7
13256 pmaddubsw m3, [r3 + 11 * 32]
13257 pmulhrsw m3, m7
13258 packuswb m2, m3
13259
; Weight 0: the final line needs no interpolation and is loaded straight from
; the reference pixels.
13260 movu m3, [r2 + 6] ; [0]
13261
13262 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 24
13263 ret
13264
; intra_pred_ang32_8 (AVX2): 32x32 angular intra prediction, mode 8.
; cglobal declares 3 arguments, 8 general-purpose and 13 vector registers.
;   r0 - destination, r1 - destination stride, r2 - reference pixels
;        (roles as used by the stores/loads in this file's predictors)
; Sets up the shared register contract and lets ang32_mode_8_28_avx2 produce
; the whole block; r7d = 0 selects the mode 8 path in that helper.
13265 INIT_YMM avx2
13266 cglobal intra_pred_ang32_8, 3,8,13
13267 add r2, 64 ; use the reference samples 64 bytes into the buffer (presumably the left-neighbour half - TODO confirm layout)
13268 lea r3, [ang_table_avx2 + 32 * 16] ; centre of the weight table, the helper indexes it as [r3 +/- k * 32]
13269 lea r5, [r1 * 3] ; r5 -> 3 * stride
13270 lea r6, [r1 * 4] ; r6 -> 4 * stride
13271 mova m7, [pw_1024] ; rounding multiplier for pmulhrsw
13272 mov r4, r0 ; the helper derives the second-half destination from r4
13273 xor r7d, r7d ; r7d = 0 -> mode 8 path in the helper
13274
13275 call ang32_mode_8_28_avx2
13276 RET
13277
; intra_pred_ang32_28 (AVX2): 32x32 angular intra prediction, mode 28.
; Same helper as mode 8, but with r7d = 1 and no 64-byte reference offset.
; r4 is not initialised here; the helper only touches r4 when r7d == 0.
;   r0 - destination, r1 - destination stride, r2 - reference pixels
13278 INIT_YMM avx2
13279 cglobal intra_pred_ang32_28, 3,8,13
13280 lea r3, [ang_table_avx2 + 32 * 16] ; centre of the weight table, the helper indexes it as [r3 +/- k * 32]
13281 lea r5, [r1 * 3] ; r5 -> 3 * stride
13282 lea r6, [r1 * 4] ; r6 -> 4 * stride
13283 mova m7, [pw_1024] ; rounding multiplier for pmulhrsw
13284 xor r7d, r7d
13285 inc r7d ; r7d = 1 -> mode 28 path in the helper
13286
13287 call ang32_mode_8_28_avx2
13288 RET
13289
; intra_pred_ang32_9 (AVX2): 32x32 angular intra prediction, mode 9.
; cglobal declares 3 arguments, 5 general-purpose and 8 vector registers.
;   r0 - destination (advanced four rows at a time)
;   r1 - destination stride
;   r2 - reference pixels; samples are read from r2 + mmsize*2 (64) + 1 onwards
;
; Fully unrolled, one 32-byte destination row per group of instructions:
;   1. palignr selects the 16 reference bytes starting i bytes further along
;      (i = row index within the current 16-row half),
;   2. pshufb with ang32_shuf_mode9 rearranges them for the multiply,
;   3. pmaddubsw against the two halves of angHor_tab_9 forms the weighted
;      sums, pmulhrsw by pw_1024 rounds them, packuswb narrows to bytes,
;   4. the row is stored directly (no transpose step in this function).
; Constant registers: m0/m1 = weight table halves, m2 = pw_1024, m7 = shuffle.
13290 INIT_YMM avx2
13291 cglobal intra_pred_ang32_9, 3,5,8
13292 vbroadcasti128 m0, [angHor_tab_9]
13293 vbroadcasti128 m1, [angHor_tab_9 + mmsize/2]
13294 mova m2, [pw_1024]
13295 mova m7, [ang32_shuf_mode9]
13296 lea r3, [r1 * 3] ; r3 -> 3 * stride
13297
; Rows 0 to 15 slide a window over the 32 bytes formed by m6:m3
; (m3 = reference bytes 1..16, m6 = reference bytes 17..32, each broadcast to
; both 128-bit lanes).
13298 vbroadcasti128 m3, [r2 + mmsize*2 + 1]
13299 vbroadcasti128 m6, [r2 + mmsize*2 + 17]
13300
13301 pshufb m5, m3, m7
13302 pmaddubsw m4, m5, m0
13303 pmaddubsw m5, m1
13304 pmulhrsw m4, m2
13305 pmulhrsw m5, m2
13306 packuswb m4, m5
13307 movu [r0], m4 ; row 0
13308
13309 palignr m5, m6, m3, 1
13310 pshufb m5, m7
13311 pmaddubsw m4, m5, m0
13312 pmaddubsw m5, m1
13313 pmulhrsw m4, m2
13314 pmulhrsw m5, m2
13315 packuswb m4, m5
13316 movu [r0 + r1], m4 ; row 1
13317
13318 palignr m5, m6, m3, 2
13319 pshufb m5, m7
13320 pmaddubsw m4, m5, m0
13321 pmaddubsw m5, m1
13322 pmulhrsw m4, m2
13323 pmulhrsw m5, m2
13324 packuswb m4, m5
13325 movu [r0 + r1*2], m4 ; row 2
13326
13327 palignr m5, m6, m3, 3
13328 pshufb m5, m7
13329 pmaddubsw m4, m5, m0
13330 pmaddubsw m5, m1
13331 pmulhrsw m4, m2
13332 pmulhrsw m5, m2
13333 packuswb m4, m5
13334 movu [r0 + r3], m4 ; row 3
13335
13336 lea r0, [r0 + r1 * 4]
13337
13338 palignr m5, m6, m3, 4
13339 pshufb m5, m7
13340 pmaddubsw m4, m5, m0
13341 pmaddubsw m5, m1
13342 pmulhrsw m4, m2
13343 pmulhrsw m5, m2
13344 packuswb m4, m5
13345 movu [r0], m4 ; row 4
13346
13347 palignr m5, m6, m3, 5
13348 pshufb m5, m7
13349 pmaddubsw m4, m5, m0
13350 pmaddubsw m5, m1
13351 pmulhrsw m4, m2
13352 pmulhrsw m5, m2
13353 packuswb m4, m5
13354 movu [r0 + r1], m4 ; row 5
13355
13356 palignr m5, m6, m3, 6
13357 pshufb m5, m7
13358 pmaddubsw m4, m5, m0
13359 pmaddubsw m5, m1
13360 pmulhrsw m4, m2
13361 pmulhrsw m5, m2
13362 packuswb m4, m5
13363 movu [r0 + r1*2], m4 ; row 6
13364
13365 palignr m5, m6, m3, 7
13366 pshufb m5, m7
13367 pmaddubsw m4, m5, m0
13368 pmaddubsw m5, m1
13369 pmulhrsw m4, m2
13370 pmulhrsw m5, m2
13371 packuswb m4, m5
13372 movu [r0 + r3], m4 ; row 7
13373
13374 lea r0, [r0 + r1 * 4]
13375
13376 palignr m5, m6, m3, 8
13377 pshufb m5, m7
13378 pmaddubsw m4, m5, m0
13379 pmaddubsw m5, m1
13380 pmulhrsw m4, m2
13381 pmulhrsw m5, m2
13382 packuswb m4, m5
13383 movu [r0], m4 ; row 8
13384
13385 palignr m5, m6, m3, 9
13386 pshufb m5, m7
13387 pmaddubsw m4, m5, m0
13388 pmaddubsw m5, m1
13389 pmulhrsw m4, m2
13390 pmulhrsw m5, m2
13391 packuswb m4, m5
13392 movu [r0 + r1], m4 ; row 9
13393
13394 palignr m5, m6, m3, 10
13395 pshufb m5, m7
13396 pmaddubsw m4, m5, m0
13397 pmaddubsw m5, m1
13398 pmulhrsw m4, m2
13399 pmulhrsw m5, m2
13400 packuswb m4, m5
13401 movu [r0 + r1*2], m4 ; row 10
13402
13403 palignr m5, m6, m3, 11
13404 pshufb m5, m7
13405 pmaddubsw m4, m5, m0
13406 pmaddubsw m5, m1
13407 pmulhrsw m4, m2
13408 pmulhrsw m5, m2
13409 packuswb m4, m5
13410 movu [r0 + r3], m4 ; row 11
13411
13412 lea r0, [r0 + r1 * 4]
13413
13414 palignr m5, m6, m3, 12
13415 pshufb m5, m7
13416 pmaddubsw m4, m5, m0
13417 pmaddubsw m5, m1
13418 pmulhrsw m4, m2
13419 pmulhrsw m5, m2
13420 packuswb m4, m5
13421 movu [r0], m4 ; row 12
13422
13423 palignr m5, m6, m3, 13
13424 pshufb m5, m7
13425 pmaddubsw m4, m5, m0
13426 pmaddubsw m5, m1
13427 pmulhrsw m4, m2
13428 pmulhrsw m5, m2
13429 packuswb m4, m5
13430 movu [r0 + r1], m4 ; row 13
13431
13432 palignr m5, m6, m3, 14
13433 pshufb m5, m7
13434 pmaddubsw m4, m5, m0
13435 pmaddubsw m5, m1
13436 pmulhrsw m4, m2
13437 pmulhrsw m5, m2
13438 packuswb m4, m5
13439 movu [r0 + r1*2], m4 ; row 14
13440
13441 palignr m5, m6, m3, 15
13442 pshufb m5, m7
13443 pmaddubsw m4, m5, m0
13444 pmaddubsw m5, m1
13445 pmulhrsw m4, m2
13446 pmulhrsw m5, m2
13447 packuswb m4, m5
13448 movu [r0 + r3], m4 ; row 15
13449
13450 lea r0, [r0 + r1 * 4]
13451
; Rows 16 to 31: the window now slides over m3:m6, with m3 reloaded to hold
; reference bytes 33..48 and m6 (bytes 17..32) becoming the low part.
13452 vbroadcasti128 m3, [r2 + mmsize*2 + 33]
13453
13454 pshufb m5, m6, m7
13455 pmaddubsw m4, m5, m0
13456 pmaddubsw m5, m1
13457 pmulhrsw m4, m2
13458 pmulhrsw m5, m2
13459 packuswb m4, m5
13460 movu [r0], m4 ; row 16
13461
13462 palignr m5, m3, m6, 1
13463 pshufb m5, m7
13464 pmaddubsw m4, m5, m0
13465 pmaddubsw m5, m1
13466 pmulhrsw m4, m2
13467 pmulhrsw m5, m2
13468 packuswb m4, m5
13469 movu [r0 + r1], m4 ; row 17
13470
13471 palignr m5, m3, m6, 2
13472 pshufb m5, m7
13473 pmaddubsw m4, m5, m0
13474 pmaddubsw m5, m1
13475 pmulhrsw m4, m2
13476 pmulhrsw m5, m2
13477 packuswb m4, m5
13478 movu [r0 + r1*2], m4 ; row 18
13479
13480 palignr m5, m3, m6, 3
13481 pshufb m5, m7
13482 pmaddubsw m4, m5, m0
13483 pmaddubsw m5, m1
13484 pmulhrsw m4, m2
13485 pmulhrsw m5, m2
13486 packuswb m4, m5
13487 movu [r0 + r3], m4 ; row 19
13488
13489 lea r0, [r0 + r1 * 4]
13490
13491 palignr m5, m3, m6, 4
13492 pshufb m5, m7
13493 pmaddubsw m4, m5, m0
13494 pmaddubsw m5, m1
13495 pmulhrsw m4, m2
13496 pmulhrsw m5, m2
13497 packuswb m4, m5
13498 movu [r0], m4 ; row 20
13499
13500 palignr m5, m3, m6, 5
13501 pshufb m5, m7
13502 pmaddubsw m4, m5, m0
13503 pmaddubsw m5, m1
13504 pmulhrsw m4, m2
13505 pmulhrsw m5, m2
13506 packuswb m4, m5
13507 movu [r0 + r1], m4 ; row 21
13508
13509 palignr m5, m3, m6, 6
13510 pshufb m5, m7
13511 pmaddubsw m4, m5, m0
13512 pmaddubsw m5, m1
13513 pmulhrsw m4, m2
13514 pmulhrsw m5, m2
13515 packuswb m4, m5
13516 movu [r0 + r1*2], m4 ; row 22
13517
13518 palignr m5, m3, m6, 7
13519 pshufb m5, m7
13520 pmaddubsw m4, m5, m0
13521 pmaddubsw m5, m1
13522 pmulhrsw m4, m2
13523 pmulhrsw m5, m2
13524 packuswb m4, m5
13525 movu [r0 + r3], m4 ; row 23
13526
13527 lea r0, [r0 + r1 * 4]
13528
13529 palignr m5, m3, m6, 8
13530 pshufb m5, m7
13531 pmaddubsw m4, m5, m0
13532 pmaddubsw m5, m1
13533 pmulhrsw m4, m2
13534 pmulhrsw m5, m2
13535 packuswb m4, m5
13536 movu [r0], m4 ; row 24
13537
13538 palignr m5, m3, m6, 9
13539 pshufb m5, m7
13540 pmaddubsw m4, m5, m0
13541 pmaddubsw m5, m1
13542 pmulhrsw m4, m2
13543 pmulhrsw m5, m2
13544 packuswb m4, m5
13545 movu [r0 + r1], m4 ; row 25
13546
13547 palignr m5, m3, m6, 10
13548 pshufb m5, m7
13549 pmaddubsw m4, m5, m0
13550 pmaddubsw m5, m1
13551 pmulhrsw m4, m2
13552 pmulhrsw m5, m2
13553 packuswb m4, m5
13554 movu [r0 + r1*2], m4 ; row 26
13555
13556 palignr m5, m3, m6, 11
13557 pshufb m5, m7
13558 pmaddubsw m4, m5, m0
13559 pmaddubsw m5, m1
13560 pmulhrsw m4, m2
13561 pmulhrsw m5, m2
13562 packuswb m4, m5
13563 movu [r0 + r3], m4 ; row 27
13564
13565 lea r0, [r0 + r1 * 4]
13566
13567 palignr m5, m3, m6, 12
13568 pshufb m5, m7
13569 pmaddubsw m4, m5, m0
13570 pmaddubsw m5, m1
13571 pmulhrsw m4, m2
13572 pmulhrsw m5, m2
13573 packuswb m4, m5
13574 movu [r0], m4 ; row 28
13575
13576 palignr m5, m3, m6, 13
13577 pshufb m5, m7
13578 pmaddubsw m4, m5, m0
13579 pmaddubsw m5, m1
13580 pmulhrsw m4, m2
13581 pmulhrsw m5, m2
13582 packuswb m4, m5
13583 movu [r0 + r1], m4 ; row 29
13584
13585 palignr m5, m3, m6, 14
13586 pshufb m5, m7
13587 pmaddubsw m4, m5, m0
13588 pmaddubsw m5, m1
13589 pmulhrsw m4, m2
13590 pmulhrsw m5, m2
13591 packuswb m4, m5
13592 movu [r0 + r1*2], m4 ; row 30
13593
13594 palignr m5, m3, m6, 15
13595 pshufb m5, m7
13596 pmaddubsw m4, m5, m0
13597 pmaddubsw m5, m1
13598 pmulhrsw m4, m2
13599 pmulhrsw m5, m2
13600 packuswb m4, m5
13601 movu [r0 + r3], m4 ; row 31
13602 RET
13603
13604 cglobal intra_pred_ang32_27, 3,5,6
13605 lea r3, [ang_table_avx2 + 32 * 16]
13606 lea r4, [r1 * 3] ; r4 -> 3 * stride
13607 mova m5, [pw_1024]
13608
13609 ; rows 0 to 7
13610 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
13611 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
13612 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
13613 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
13614
13615 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
13616 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
13617 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
13618
13619 pmaddubsw m4, m0, [r3 - 14 * 32] ; [2]
13620 pmulhrsw m4, m5
13621 pmaddubsw m1, m2, [r3 - 14 * 32]
13622 pmulhrsw m1, m5
13623 packuswb m4, m1
13624 movu [r0], m4
13625
13626 pmaddubsw m4, m0, [r3 - 12 * 32] ; [4]
13627 pmulhrsw m4, m5
13628 pmaddubsw m1, m2, [r3 - 12 * 32]
13629 pmulhrsw m1, m5
13630 packuswb m4, m1
13631 movu [r0 + r1], m4
13632
13633 pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
13634 pmulhrsw m4, m5
13635 pmaddubsw m1, m2, [r3 - 10 * 32]
13636 pmulhrsw m1, m5
13637 packuswb m4, m1
13638 movu [r0 + r1*2], m4
13639
13640 pmaddubsw m4, m0, [r3 - 8 * 32] ; [8]
13641 pmulhrsw m4, m5
13642 pmaddubsw m1, m2, [r3 - 8 * 32]
13643 pmulhrsw m1, m5
13644 packuswb m4, m1
13645 movu [r0 + r4], m4
13646
13647 lea r0, [r0 + r1 * 4]
13648
13649 pmaddubsw m4, m0, [r3 - 6 * 32] ; [10]
13650 pmulhrsw m4, m5
13651 pmaddubsw m1, m2, [r3 - 6 * 32]
13652 pmulhrsw m1, m5
13653 packuswb m4, m1
13654 movu [r0], m4
13655
13656 pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
13657 pmulhrsw m4, m5
13658 pmaddubsw m1, m2, [r3 - 4 * 32]
13659 pmulhrsw m1, m5
13660 packuswb m4, m1
13661 movu [r0 + r1], m4
13662
13663 pmaddubsw m4, m0, [r3 - 2 * 32] ; [14]
13664 pmulhrsw m4, m5
13665 pmaddubsw m1, m2, [r3 - 2 * 32]
13666 pmulhrsw m1, m5
13667 packuswb m4, m1
13668 movu [r0 + r1*2], m4
13669
13670 pmaddubsw m4, m0, [r3] ; [16]
13671 pmulhrsw m4, m5
13672 pmaddubsw m1, m2, [r3]
13673 pmulhrsw m1, m5
13674 packuswb m4, m1
13675 movu [r0 + r4], m4
13676
13677 lea r0, [r0 + r1 * 4]
13678
13679 ; rows 8 to 15
13680 pmaddubsw m4, m0, [r3 + 2 * 32] ; [18]
13681 pmulhrsw m4, m5
13682 pmaddubsw m1, m2, [r3 + 2 * 32]
13683 pmulhrsw m1, m5
13684 packuswb m4, m1
13685 movu [r0], m4
13686
13687 pmaddubsw m4, m0, [r3 + 4 * 32] ; [20]
13688 pmulhrsw m4, m5
13689 pmaddubsw m1, m2, [r3 + 4 * 32]
13690 pmulhrsw m1, m5
13691 packuswb m4, m1
13692 movu [r0 + r1], m4
13693
13694 pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
13695 pmulhrsw m4, m5
13696 pmaddubsw m1, m2, [r3 + 6 * 32]
13697 pmulhrsw m1, m5
13698 packuswb m4, m1
13699 movu [r0 + r1*2], m4
13700
13701 pmaddubsw m4, m0, [r3 + 8 * 32] ; [24]
13702 pmulhrsw m4, m5
13703 pmaddubsw m1, m2, [r3 + 8 * 32]
13704 pmulhrsw m1, m5
13705 packuswb m4, m1
13706 movu [r0 + r4], m4
13707
13708 lea r0, [r0 + r1 * 4]
13709
13710 pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
13711 pmulhrsw m4, m5
13712 pmaddubsw m1, m2, [r3 + 10 * 32]
13713 pmulhrsw m1, m5
13714 packuswb m4, m1
13715 movu [r0], m4
13716
13717 pmaddubsw m4, m0, [r3 + 12 * 32] ; [28]
13718 pmulhrsw m4, m5
13719 pmaddubsw m1, m2, [r3 + 12 * 32]
13720 pmulhrsw m1, m5
13721 packuswb m4, m1
13722 movu [r0 + r1], m4
13723
13724 pmaddubsw m4, m0, [r3 + 14 * 32] ; [30]
13725 pmulhrsw m4, m5
13726 pmaddubsw m1, m2, [r3 + 14 * 32]
13727 pmulhrsw m1, m5
13728 packuswb m4, m1
13729 movu [r0 + r1*2], m4
13730
13731 palignr m3, m2, 2
13732 palignr m2, m0, 2
13733 movu m1, [r2 + 2] ; [0]
13734 movu [r0 + r4], m1
13735
13736 lea r0, [r0 + r1 * 4]
13737
13738 ; rows 16 to 23
13739 pmaddubsw m4, m2, [r3 - 14 * 32] ; [2]
13740 pmulhrsw m4, m5
13741 pmaddubsw m1, m3, [r3 - 14 * 32]
13742 pmulhrsw m1, m5
13743 packuswb m4, m1
13744 movu [r0], m4
13745
13746 pmaddubsw m4, m2, [r3 - 12 * 32] ; [4]
13747 pmulhrsw m4, m5
13748 pmaddubsw m1, m3, [r3 - 12 * 32]
13749 pmulhrsw m1, m5
13750 packuswb m4, m1
13751 movu [r0 + r1], m4
13752
13753 pmaddubsw m4, m2, [r3 - 10 * 32] ; [6]
13754 pmulhrsw m4, m5
13755 pmaddubsw m1, m3, [r3 - 10 * 32]
13756 pmulhrsw m1, m5
13757 packuswb m4, m1
13758 movu [r0 + r1*2], m4
13759
13760 pmaddubsw m4, m2, [r3 - 8 * 32] ; [8]
13761 pmulhrsw m4, m5
13762 pmaddubsw m1, m3, [r3 - 8 * 32]
13763 pmulhrsw m1, m5
13764 packuswb m4, m1
13765 movu [r0 + r4], m4
13766
13767 lea r0, [r0 + r1 * 4]
13768
13769 pmaddubsw m4, m2, [r3 - 6 * 32] ; [10]
13770 pmulhrsw m4, m5
13771 pmaddubsw m1, m3, [r3 - 6 * 32]
13772 pmulhrsw m1, m5
13773 packuswb m4, m1
13774 movu [r0], m4
13775
13776 pmaddubsw m4, m2, [r3 - 4 * 32] ; [12]
13777 pmulhrsw m4, m5
13778 pmaddubsw m1, m3, [r3 - 4 * 32]
13779 pmulhrsw m1, m5
13780 packuswb m4, m1
13781 movu [r0 + r1], m4
13782
13783 pmaddubsw m4, m2, [r3 - 2 * 32] ; [14]
13784 pmulhrsw m4, m5
13785 pmaddubsw m1, m3, [r3 - 2 * 32]
13786 pmulhrsw m1, m5
13787 packuswb m4, m1
13788 movu [r0 + r1*2], m4
13789
13790 pmaddubsw m4, m2, [r3] ; [16]
13791 pmulhrsw m4, m5
13792 pmaddubsw m1, m3, [r3]
13793 pmulhrsw m1, m5
13794 packuswb m4, m1
13795 movu [r0 + r4], m4
13796
13797 lea r0, [r0 + r1 * 4]
13798
13799 ; rows 8 to 15
13800 pmaddubsw m4, m2, [r3 + 2 * 32] ; [18]
13801 pmulhrsw m4, m5
13802 pmaddubsw m1, m3, [r3 + 2 * 32]
13803 pmulhrsw m1, m5
13804 packuswb m4, m1
13805 movu [r0], m4
13806
13807 pmaddubsw m4, m2, [r3 + 4 * 32] ; [20]
13808 pmulhrsw m4, m5
13809 pmaddubsw m1, m3, [r3 + 4 * 32]
13810 pmulhrsw m1, m5
13811 packuswb m4, m1
13812 movu [r0 + r1], m4
13813
13814 pmaddubsw m4, m2, [r3 + 6 * 32] ; [22]
13815 pmulhrsw m4, m5
13816 pmaddubsw m1, m3, [r3 + 6 * 32]
13817 pmulhrsw m1, m5
13818 packuswb m4, m1
13819 movu [r0 + r1*2], m4
13820
13821 pmaddubsw m4, m2, [r3 + 8 * 32] ; [24]
13822 pmulhrsw m4, m5
13823 pmaddubsw m1, m3, [r3 + 8 * 32]
13824 pmulhrsw m1, m5
13825 packuswb m4, m1
13826 movu [r0 + r4], m4
13827
13828 lea r0, [r0 + r1 * 4]
13829
13830 pmaddubsw m4, m2, [r3 + 10 * 32] ; [26]
13831 pmulhrsw m4, m5
13832 pmaddubsw m1, m3, [r3 + 10 * 32]
13833 pmulhrsw m1, m5
13834 packuswb m4, m1
13835 movu [r0], m4
13836
13837 pmaddubsw m4, m2, [r3 + 12 * 32] ; [28]
13838 pmulhrsw m4, m5
13839 pmaddubsw m1, m3, [r3 + 12 * 32]
13840 pmulhrsw m1, m5
13841 packuswb m4, m1
13842 movu [r0 + r1], m4
13843
13844 pmaddubsw m2, [r3 + 14 * 32] ; [30]
13845 pmulhrsw m2, m5
13846 pmaddubsw m3, [r3 + 14 * 32]
13847 pmulhrsw m3, m5
13848 packuswb m2, m3
13849 movu [r0 + r1*2], m2
13850
13851 movu m1, [r2 + 3] ; [0]
13852 movu [r0 + r4], m1
13853 RET
13854
;-----------------------------------------------------------------------------
; intra_pred_ang32_10
;   r0 = destination (written), r1 = destination row stride, r2 = reference
;   buffer (read).  r3 is loaded by cglobal but never used; r4 is scratch.
;
; Emits 32 rows, one full-register store per row.  Row y is the single byte at
; [r2 + mmsize*2 + 1 + y] replicated across the whole row: 16 source bytes are
; broadcast to both 128-bit lanes and pshufb with an all-equal index vector
; (0 for the first row of a half, +1 per following row) picks one of them.
; NOTE(review): presumably those bytes are the left-neighbour column and this
; is the pure horizontal prediction mode - confirm against the caller.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_10, 5,5,4
    mova            m1, [pb_1]                      ; added to the shuffle index once per row
    lea             r4, [r1 * 3]                    ; r4 = 3 * stride: 4th row of each 4-row group

%assign ang10_row 0
%rep 32
  %if (ang10_row & 15) == 0
    ; first row of a 16-row half: index back to byte 0, fetch the next 16 source bytes
    pxor            m0, m0
    vbroadcasti128  m2, [r2 + mmsize*2 + (ang10_row / 16) * (mmsize/2) + 1]
  %else
    paddb           m0, m1                          ; select the next source byte
  %endif
    pshufb          m3, m2, m0                      ; replicate the selected byte over the row
  %if (ang10_row & 3) == 0
    movu            [r0], m3
  %elif (ang10_row & 3) == 1
    movu            [r0 + r1], m3
  %elif (ang10_row & 3) == 2
    movu            [r0 + r1 * 2], m3
  %else
    movu            [r0 + r4], m3
    %if ang10_row != 31
    lea             r0, [r0 + r1 * 4]               ; advance to the next 4-row group
    %endif
  %endif
  %assign ang10_row ang10_row + 1
%endrep
%undef ang10_row
    RET
13980
;-----------------------------------------------------------------------------
; intra_pred_ang32_11
;   r0 = destination (written), r1 = destination row stride, r2 = reference
;   buffer (read).  r3 = 3 * stride (scratch).
;
; Emits 32 rows.  For every row a 16-byte reference window (identical in both
; lanes) is rearranged into byte pairs by ang32_shuf_mode11, each pair is
; combined with a pair of byte weights (pmaddubsw) and the 16-bit sums are
; scaled with pmulhrsw by pw_1024 before being packed back to bytes.
; NOTE(review): assuming pw_1024 holds 1024 in every word, the pmulhrsw is a
; rounding shift right by 5 - confirm against the constant's definition.
;
; The window starts as [r2+16], [r2+0], then the bytes from [r2 + mmsize*2 + 1]
; onward, and slides forward by one byte per row (palignr across m6:m3).
; After 16 rows the look-ahead register becomes the window and a new
; look-ahead is loaded 16 bytes further on.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_11, 3,4,8
    lea             r3, [r1 * 3]                    ; r3 = 3 * stride: 4th row of each 4-row group
    mova            m2, [pw_1024]                   ; pmulhrsw multiplier
    mova            m7, [ang32_shuf_mode11]         ; byte-pair gather pattern
    vbroadcasti128  m0, [angHor_tab_11]             ; weights -> bytes 0-7 of each 16-byte lane
    vbroadcasti128  m1, [angHor_tab_11 + mmsize/2]  ; weights -> bytes 8-15 of each 16-byte lane

    ; build the initial window in both lanes of m3, and the look-ahead in m6
    movu            xm3, [r2 + mmsize*2 - 1]
    pinsrb          xm3, [r2 + 0], 1                ; byte 1 <- [r2 + 0]
    pinsrb          xm3, [r2 + 16], 0               ; byte 0 <- [r2 + 16]
    vinserti128     m3, m3, xm3, 1                  ; duplicate the window into the high lane
    vbroadcasti128  m6, [r2 + mmsize*2 + 15]        ; the 16 bytes following the window

%assign ang11_row 0
%rep 32
  %assign ang11_shift (ang11_row & 15)
  %if ang11_shift == 0
    %if ang11_row == 16
    ; second half: the look-ahead becomes the window, fetch a new look-ahead
    mova            m3, m6
    vbroadcasti128  m6, [r2 + mmsize*2 + 15 + 16]
    %endif
    pshufb          m5, m3, m7
  %else
    palignr         m5, m6, m3, ang11_shift         ; window advanced by ang11_shift bytes
    pshufb          m5, m7
  %endif
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
  %if (ang11_row & 3) == 0
    movu            [r0], m4
  %elif (ang11_row & 3) == 1
    movu            [r0 + r1], m4
  %elif (ang11_row & 3) == 2
    movu            [r0 + r1 * 2], m4
  %else
    movu            [r0 + r3], m4
    %if ang11_row != 31
    lea             r0, [r0 + r1 * 4]               ; advance to the next 4-row group
    %endif
  %endif
  %assign ang11_row ang11_row + 1
%endrep
%undef ang11_shift
%undef ang11_row
    RET
14299
;-----------------------------------------------------------------------------
; intra_pred_ang32_25
;   r0 = destination (written), r1 = destination row stride, r2 = reference
;   buffer (read).  r3 = centre of the weight table (ang_table_avx2 + 16 rows
;   of 32 bytes), r4 = 3 * stride.
;
; Emits 32 rows in two passes of 16.  m0/m2 hold the reference bytes
; interleaved with their successors ([r2+i], [r2+i+1] pairs); every weighted
; row is pmaddubsw of those pairs with one 32-byte table row, scaled with
; pmulhrsw by pw_1024 and packed to bytes.  Within a pass the table row index
; runs 30, 28, ..., 2 and the 16th row is a plain copy of the reference.
; NOTE(review): assuming pw_1024 holds 1024 in every word, the pmulhrsw is a
; rounding shift right by 5 - confirm against the constant's definition.
;
; Only m0-m5 are used, so six vector registers are declared (the original
; declared seven; with x86inc a count above six makes the Win64 prologue
; save and restore xmm6 for no benefit).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_25, 3,5,6
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]

    ; m3 supplies the two bytes that precede each lane once the pair window is
    ; slid back for the second pass.  Only bytes 14/15 of each lane are ever
    ; consumed (palignr ..., 14), so the stale low bytes of xm3 are harmless.
    pinsrb          xm3, [r2], 15
    pinsrb          xm3, [r2 + mmsize*2 + 16], 14

    punpckhbw       m2, m0, m1                      ; pairs for output bytes 8-15 of each lane
    punpcklbw       m0, m1                          ; pairs for output bytes 0-7 of each lane
    vinserti128     m3, m3, xm2, 1                  ; high lane of m3 <- low lane of m2

%assign ang25_row 0
%rep 32
  %assign ang25_sub (ang25_row & 15)
  %assign ang25_off (14 - 2 * ang25_sub) * 32       ; table row = 16 + ang25_off / 32
  %if ang25_row == 16
    ; second pass: slide the pair window back by one pair (m2 first, it needs the old m0)
    palignr         m2, m0, 14
    palignr         m0, m3, 14
  %endif
  %if ang25_sub == 15
    ; last row of a pass: unweighted copy of the reference
    %if ang25_row == 15
    movu            m1, [r2]
    %else
    movu            m1, [r2 + 1]
    palignr         m1, m3, 14                      ; prepend bytes 14/15 of each m3 lane
    %endif
    movu            [r0 + r4], m1
  %else
    pmaddubsw       m4, m0, [r3 + ang25_off]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 + ang25_off]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    %if (ang25_row & 3) == 0
    movu            [r0], m4
    %elif (ang25_row & 3) == 1
    movu            [r0 + r1], m4
    %elif (ang25_row & 3) == 2
    movu            [r0 + r1 * 2], m4
    %else
    movu            [r0 + r4], m4
    %endif
  %endif
  %if (ang25_row & 3) == 3
    %if ang25_row != 31
    lea             r0, [r0 + r1 * 4]               ; advance to the next 4-row group
    %endif
  %endif
  %assign ang25_row ang25_row + 1
%endrep
%undef ang25_off
%undef ang25_sub
%undef ang25_row
    RET
14552
;-----------------------------------------------------------------------------
; intra_pred_ang32_12
;   r0 = destination (written), r1 = destination row stride, r2 = reference
;   buffer (read).  r3 = 3 * stride (scratch).
;
; Emits 32 rows.  For every row a 16-byte reference window (identical in both
; lanes) is rearranged into byte pairs by the two ang32_shuf_mode12 patterns,
; each pair is combined with a pair of byte weights from ang32_fact_mode12
; (pmaddubsw) and the 16-bit sums are scaled with pmulhrsw by pw_1024 before
; being packed back to bytes.
; NOTE(review): assuming pw_1024 holds 1024 in every word, the pmulhrsw is a
; rounding shift right by 5 - confirm against the constant's definition.
;
; The window starts as [r2+26], [r2+19], [r2+13], [r2+6], [r2+0], then the
; bytes from [r2 + mmsize*2 + 1] onward, and slides forward by one byte per
; row (palignr across m6:m3).  After 16 rows the look-ahead register becomes
; the window and a new look-ahead is loaded 16 bytes further on.
;
; NOTE(review): m8 is used, which needs 64-bit mode - confirm the function
; sits inside an x86-64-only guard (not visible from this block).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_12, 3,4,9
    lea             r3, [r1 * 3]                    ; r3 = 3 * stride: 4th row of each 4-row group
    mova            m2, [pw_1024]                   ; pmulhrsw multiplier
    movu            m0, [ang32_fact_mode12]         ; weights -> bytes 0-7 of each 16-byte lane
    movu            m1, [ang32_fact_mode12 + mmsize]; weights -> bytes 8-15 of each 16-byte lane
    mova            m7, [ang32_shuf_mode12]         ; pair gather pattern paired with m0
    mova            m8, [ang32_shuf_mode12 + mmsize]; pair gather pattern paired with m1

    ; build the initial window in both lanes of m3, and the look-ahead in m6
    movu            xm4, [r2 + mmsize*2 - 4]
    pinsrb          xm4, [r2 + 0], 4                ; byte 4 <- [r2 + 0]
    pinsrb          xm4, [r2 + 6], 3                ; byte 3 <- [r2 + 6]
    pinsrb          xm4, [r2 + 13], 2               ; byte 2 <- [r2 + 13]
    pinsrb          xm4, [r2 + 19], 1               ; byte 1 <- [r2 + 19]
    pinsrb          xm4, [r2 + 26], 0               ; byte 0 <- [r2 + 26]
    vinserti128     m3, m4, xm4, 1                  ; duplicate the window into the high lane
    vbroadcasti128  m6, [r2 + mmsize*2 + 12]        ; the 16 bytes following the window

%assign ang12_row 0
%rep 32
  %assign ang12_shift (ang12_row & 15)
  %if ang12_shift == 0
    %if ang12_row == 16
    ; second half: the look-ahead becomes the window, fetch a new look-ahead
    mova            m3, m6
    vbroadcasti128  m6, [r2 + mmsize*2 + 12 + 16]
    %endif
    pshufb          m5, m3, m8
    pshufb          m4, m3, m7
  %else
    palignr         m4, m6, m3, ang12_shift         ; window advanced by ang12_shift bytes
    pshufb          m5, m4, m8                      ; m5 first: it still needs the unshuffled m4
    pshufb          m4, m7
  %endif
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
  %if (ang12_row & 3) == 0
    movu            [r0], m4
  %elif (ang12_row & 3) == 1
    movu            [r0 + r1], m4
  %elif (ang12_row & 3) == 2
    movu            [r0 + r1 * 2], m4
  %else
    movu            [r0 + r3], m4
    %if ang12_row != 31
    lea             r0, [r0 + r1 * 4]               ; advance to the next 4-row group
    %endif
  %endif
  %assign ang12_row ang12_row + 1
%endrep
%undef ang12_shift
%undef ang12_row
    RET
14907
14908 cglobal intra_pred_ang32_24, 3,5,8
14909 lea r3, [ang_table_avx2 + 32 * 16]
14910 lea r4, [r1 * 3]
14911 mova m5, [pw_1024]
14912
14913 ; rows 0 to 7
14914 movu m0, [r2 + 0]
14915 movu m1, [r2 + 1]
14916 punpckhbw m2, m0, m1
14917 punpcklbw m0, m1
14918
14919 movu m4, [r2 + mmsize*2]
14920 pshufb m4, [ang32_shuf_mode24]
14921 mova m3, [ang32_shuf_mode24 + mmsize]
14922 vpermd m4, m3, m4 ; [6 6 13 13 19 19 26 26 x x x...]
14923 palignr m3, m0, m4, 1
14924 vinserti128 m3, m3, xm2, 1
14925
14926 pmaddubsw m4, m0, [r3 + 11 * 32] ; [27]
14927 pmulhrsw m4, m5
14928 pmaddubsw m1, m2, [r3 + 11 * 32]
14929 pmulhrsw m1, m5
14930 packuswb m4, m1
14931 movu [r0], m4
14932
14933 pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
14934 pmulhrsw m4, m5
14935 pmaddubsw m1, m2, [r3 + 6 * 32]
14936 pmulhrsw m1, m5
14937 packuswb m4, m1
14938 movu [r0 + r1], m4
14939
14940 pmaddubsw m4, m0, [r3 + 1 * 32] ; [17]
14941 pmulhrsw m4, m5
14942 pmaddubsw m1, m2, [r3 + 1 * 32]
14943 pmulhrsw m1, m5
14944 packuswb m4, m1
14945 movu [r0 + r1*2], m4
14946
14947 pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
14948 pmulhrsw m4, m5
14949 pmaddubsw m1, m2, [r3 - 4 * 32]
14950 pmulhrsw m1, m5
14951 packuswb m4, m1
14952 movu [r0 + r4], m4
14953
14954 lea r0, [r0 + r1 * 4]
14955
14956 pmaddubsw m4, m0, [r3 - 9 * 32] ; [7]
14957 pmulhrsw m4, m5
14958 pmaddubsw m1, m2, [r3 - 9 * 32]
14959 pmulhrsw m1, m5
14960 packuswb m4, m1
14961 movu [r0], m4
14962
14963 pmaddubsw m4, m0, [r3 - 14 * 32] ; [2]
14964 pmulhrsw m4, m5
14965 pmaddubsw m1, m2, [r3 - 14 * 32]
14966 pmulhrsw m1, m5
14967 packuswb m4, m1
14968 movu [r0 + r1], m4
14969
14970 palignr m6, m0, m3, 14
14971 palignr m7, m2, m0, 14
14972
14973 pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
14974 pmulhrsw m4, m5
14975 pmaddubsw m1, m7, [r3 + 13 * 32]
14976 pmulhrsw m1, m5
14977 packuswb m4, m1
14978 movu [r0 + r1*2], m4
14979
14980 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
14981 pmulhrsw m4, m5
14982 pmaddubsw m1, m7, [r3 + 8 * 32]
14983 pmulhrsw m1, m5
14984 packuswb m4, m1
14985 movu [r0 + r4], m4
14986
14987 lea r0, [r0 + r1 * 4]
14988
14989 ; rows 8 to 15
14990 pmaddubsw m4, m6, [r3 + 3 * 32] ; [19]
14991 pmulhrsw m4, m5
14992 pmaddubsw m1, m7, [r3 + 3 * 32]
14993 pmulhrsw m1, m5
14994 packuswb m4, m1
14995 movu [r0], m4
14996
14997 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
14998 pmulhrsw m4, m5
14999 pmaddubsw m1, m7, [r3 - 2 * 32]
15000 pmulhrsw m1, m5
15001 packuswb m4, m1
15002 movu [r0 + r1], m4
15003
15004 pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
15005 pmulhrsw m4, m5
15006 pmaddubsw m1, m7, [r3 - 7 * 32]
15007 pmulhrsw m1, m5
15008 packuswb m4, m1
15009 movu [r0 + r1*2], m4
15010
15011 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
15012 pmulhrsw m4, m5
15013 pmaddubsw m1, m7, [r3 - 12 * 32]
15014 pmulhrsw m1, m5
15015 packuswb m4, m1
15016 movu [r0 + r4], m4
15017
15018 lea r0, [r0 + r1 * 4]
15019
15020 palignr m6, m0, m3, 12
15021 palignr m7, m2, m0, 12
15022
15023 pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
15024 pmulhrsw m4, m5
15025 pmaddubsw m1, m7, [r3 + 15 * 32]
15026 pmulhrsw m1, m5
15027 packuswb m4, m1
15028 movu [r0], m4
15029
15030 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
15031 pmulhrsw m4, m5
15032 pmaddubsw m1, m7, [r3 + 10 * 32]
15033 pmulhrsw m1, m5
15034 packuswb m4, m1
15035 movu [r0 + r1], m4
15036
15037 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
15038 pmulhrsw m4, m5
15039 pmaddubsw m1, m7, [r3 + 5 * 32]
15040 pmulhrsw m1, m5
15041 packuswb m4, m1
15042 movu [r0 + r1 * 2], m4
15043
15044 pmaddubsw m4, m6, [r3] ; [16]
15045 pmulhrsw m4, m5
15046 pmaddubsw m1, m7, [r3]
15047 pmulhrsw m1, m5
15048 packuswb m4, m1
15049 movu [r0 + r4], m4
15050
15051 lea r0, [r0 + r1 * 4]
15052
15053 ; rows 16 to 23
15054 pmaddubsw m4, m6, [r3 - 5 * 32] ; [11]
15055 pmulhrsw m4, m5
15056 pmaddubsw m1, m7, [r3 - 5 * 32]
15057 pmulhrsw m1, m5
15058 packuswb m4, m1
15059 movu [r0], m4
15060
15061 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
15062 pmulhrsw m4, m5
15063 pmaddubsw m1, m7, [r3 - 10 * 32]
15064 pmulhrsw m1, m5
15065 packuswb m4, m1
15066 movu [r0 + r1], m4
15067
15068 pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
15069 pmulhrsw m4, m5
15070 pmaddubsw m1, m7, [r3 - 15 * 32]
15071 pmulhrsw m1, m5
15072 packuswb m4, m1
15073 movu [r0 + r1*2], m4
15074
15075 palignr m6, m0, m3, 10
15076 palignr m7, m2, m0, 10
15077
15078 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
15079 pmulhrsw m4, m5
15080 pmaddubsw m1, m7, [r3 + 12 * 32]
15081 pmulhrsw m1, m5
15082 packuswb m4, m1
15083 movu [r0 + r4], m4
15084
15085 lea r0, [r0 + r1 * 4]
15086
15087 pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
15088 pmulhrsw m4, m5
15089 pmaddubsw m1, m7, [r3 + 7 * 32]
15090 pmulhrsw m1, m5
15091 packuswb m4, m1
15092 movu [r0], m4
15093
15094 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
15095 pmulhrsw m4, m5
15096 pmaddubsw m1, m7, [r3 + 2 * 32]
15097 pmulhrsw m1, m5
15098 packuswb m4, m1
15099 movu [r0 + r1], m4
15100
15101 pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
15102 pmulhrsw m4, m5
15103 pmaddubsw m1, m7, [r3 - 3 * 32]
15104 pmulhrsw m1, m5
15105 packuswb m4, m1
15106 movu [r0 + r1*2], m4
15107
15108 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
15109 pmulhrsw m4, m5
15110 pmaddubsw m1, m7, [r3 - 8 * 32]
15111 pmulhrsw m1, m5
15112 packuswb m4, m1
15113 movu [r0 + r4], m4
15114
15115 lea r0, [r0 + r1 * 4]
15116
15117 ; rows 24 to 31
15118 pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
15119 pmulhrsw m4, m5
15120 pmaddubsw m1, m7, [r3 - 13 * 32]
15121 pmulhrsw m1, m5
15122 packuswb m4, m1
15123 movu [r0], m4
15124
15125 palignr m6, m0, m3, 8
15126 palignr m7, m2, m0, 8
15127
15128 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
15129 pmulhrsw m4, m5
15130 pmaddubsw m1, m7, [r3 + 14 * 32]
15131 pmulhrsw m1, m5
15132 packuswb m4, m1
15133 movu [r0 + r1], m4
15134
15135 pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
15136 pmulhrsw m4, m5
15137 pmaddubsw m1, m7, [r3 + 9 * 32]
15138 pmulhrsw m1, m5
15139 packuswb m4, m1
15140 movu [r0 + r1 * 2], m4
15141
15142 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
15143 pmulhrsw m4, m5
15144 pmaddubsw m1, m7, [r3 + 4 * 32]
15145 pmulhrsw m1, m5
15146 packuswb m4, m1
15147 movu [r0 + r4], m4
15148
15149 lea r0, [r0 + r1 * 4]
15150
15151 pmaddubsw m4, m6, [r3 - 1 * 32] ; [15]
15152 pmulhrsw m4, m5
15153 pmaddubsw m1, m7, [r3 - 1 * 32]
15154 pmulhrsw m1, m5
15155 packuswb m4, m1
15156 movu [r0], m4
15157
15158 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
15159 pmulhrsw m4, m5
15160 pmaddubsw m1, m7, [r3 - 6 * 32]
15161 pmulhrsw m1, m5
15162 packuswb m4, m1
15163 movu [r0 + r1], m4
15164
15165 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
15166 pmulhrsw m4, m5
15167 pmaddubsw m1, m7, [r3 - 11 * 32]
15168 pmulhrsw m1, m5
15169 packuswb m4, m1
15170 movu [r0 + r1*2], m4
15171
15172 pand m6, [pw_00ff]
15173 pand m7, [pw_00ff]
15174 packuswb m6, m7
15175 movu [r0 + r4], m6
15176 RET
15177
;-----------------------------------------------------------------------------
; intra_pred_ang32_13
;
; Register roles, as used in this function:
;   r0      destination pointer, advanced four rows at a time
;   r1      distance between consecutive destination rows
;   r2      reference sample buffer (only read here)
;   r3      r1 * 3, address of the fourth row in each group
;   m0/m1   per-column weights from ang32_fact_mode13 (table defined elsewhere)
;   m2      pw_1024 (presumably words of 1024, so pmulhrsw acts as a rounding
;           >> 5 -- constant is defined elsewhere, confirm there)
;   m7/m8   pshufb patterns from ang32_shuf_mode13 for the two halves of a row
;   m6:m3   sliding byte window over the reference samples
;
; Every output row is produced from the window shifted by one more byte than
; the previous row: shifts 1..15 via palignr, then m3 itself for the 16th row.
; After 16 rows the window advances by 16 bytes and the pattern repeats,
; giving 32 stores of mmsize bytes in total.
; NOTE(review): assumes INIT_YMM avx2 is in effect from earlier in the file.
;-----------------------------------------------------------------------------

; Weight both shuffled halves (m4, m5), round, pack to bytes, store at [%1].
%macro ANG32_MODE13_BLEND_STORE 1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [%1], m4
%endmacro

; One row from the window m3:m6 shifted right by %1 bytes; stored at [%2].
%macro ANG32_MODE13_ROW 2
    palignr         m5, m3, m6, %1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    ANG32_MODE13_BLEND_STORE %2
%endmacro

; The 16th row of a group: the window is exactly m3, no palignr needed.
%macro ANG32_MODE13_ROW_FULL 1
    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    ANG32_MODE13_BLEND_STORE %1
%endmacro

; Sixteen consecutive rows. r0 is advanced by 12 rows inside the macro; the
; caller steps over the final group of four if more rows follow.
%macro ANG32_MODE13_16ROWS 0
    ANG32_MODE13_ROW         1, r0
    ANG32_MODE13_ROW         2, r0 + r1
    ANG32_MODE13_ROW         3, r0 + r1 * 2
    ANG32_MODE13_ROW         4, r0 + r3
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE13_ROW         5, r0
    ANG32_MODE13_ROW         6, r0 + r1
    ANG32_MODE13_ROW         7, r0 + r1 * 2
    ANG32_MODE13_ROW         8, r0 + r3
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE13_ROW         9, r0
    ANG32_MODE13_ROW        10, r0 + r1
    ANG32_MODE13_ROW        11, r0 + r1 * 2
    ANG32_MODE13_ROW        12, r0 + r3
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE13_ROW        13, r0
    ANG32_MODE13_ROW        14, r0 + r1
    ANG32_MODE13_ROW        15, r0 + r1 * 2
    ANG32_MODE13_ROW_FULL       r0 + r3
%endmacro

cglobal intra_pred_ang32_13, 3,4,9
    movu            m0, [ang32_fact_mode13]
    movu            m1, [ang32_fact_mode13 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode13]
    mova            m8, [ang32_shuf_mode13 + mmsize]
    lea             r3, [r1 * 3]

    ; Build the low part of the window in m6. Original author's note on the
    ; gathered reference indices: [28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2....]
    ; (the shuffle/permute tables used here are defined elsewhere).
    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode13 + mmsize*2]
    mova            m3, [ang32_shuf_mode24 + mmsize*1]
    vpermd          m6, m3, m6
    palignr         m6, m6, 1

    ; High part of the window: 16 bytes from the second reference array,
    ; replicated into both 128-bit lanes.
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    ; rows 0 to 15
    ANG32_MODE13_16ROWS
    lea             r0, [r0 + r1 * 4]

    ; Advance the window by 16 bytes: the old high part becomes the low part
    ; and the next 16 reference bytes are loaded.
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]

    ; rows 16 to 31
    ANG32_MODE13_16ROWS
    RET
15529
;-----------------------------------------------------------------------------
; intra_pred_ang32_23
;
; Register roles, as used in this function:
;   r0      destination pointer, advanced four rows at a time
;   r1      distance between consecutive destination rows
;   r2      reference sample buffer (only read here)
;   r3      ang_table_avx2 + 16 rows, so row n of the table is at
;           [r3 + (n - 16) * 32]
;   r4      r1 * 3, address of the fourth row in each group
;   m5      pw_1024 (presumably words of 1024, so pmulhrsw acts as a rounding
;           >> 5 -- constant is defined elsewhere, confirm there)
;   m0/m2   byte pairs ([r2 + i], [r2 + i + 1]) from punpcklbw / punpckhbw
;   m3      pairs that precede m0, built from [r2 + mmsize*2] through
;           ang32_shuf_mode23 (table defined elsewhere)
;   m6/m7   current pair window, re-derived with palignr as rows progress
;
; 32 rows of mmsize bytes are stored. Each row multiplies the current pair
; window by one row of ang_table_avx2; the final row skips the multiply and
; masks each pair with pw_00ff instead.
; NOTE(review): assumes INIT_YMM avx2 is in effect from earlier in the file.
;-----------------------------------------------------------------------------

; One output row.
;   %1 / %2  pair registers feeding the low / high half of the packed result
;   %3       ang_table_avx2 row index (0..31) holding the pair weights
;   %4       destination address expression
%macro ANG32_MODE23_ROW 4
    pmaddubsw       m4, %1, [r3 + (%3 - 16) * 32]
    pmulhrsw        m4, m5
    pmaddubsw       m1, %2, [r3 + (%3 - 16) * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [%4], m4
%endmacro

; Re-derive the window m6/m7 from m3:m0:m2 with a per-lane byte shift of %1.
; Each step of 2 fewer bytes... is one pair further back: 14 = one pair back,
; 12 = two pairs back, and so on.
%macro ANG32_MODE23_SLIDE 1
    palignr         m6, m0, m3, %1
    palignr         m7, m2, m0, %1
%endmacro

cglobal intra_pred_ang32_23, 3,5,8
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; Interleave each reference byte with its right-hand neighbour.
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; Build m3, the pairs that sit in front of m0.
    movu            m4, [r2 + mmsize*2]
    pshufb          m4, [ang32_shuf_mode23]
    vpermq          m4, m4, q1313
    palignr         m3, m0, m4, 1
    vinserti128     m3, m3, xm2, 1

    ; rows 0 to 7
    ANG32_MODE23_ROW    m0, m2, 23, r0
    ANG32_MODE23_ROW    m0, m2, 14, r0 + r1
    ANG32_MODE23_ROW    m0, m2,  5, r0 + r1 * 2
    ANG32_MODE23_SLIDE  14
    ANG32_MODE23_ROW    m6, m7, 28, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE23_ROW    m6, m7, 19, r0
    ANG32_MODE23_ROW    m6, m7, 10, r0 + r1
    ANG32_MODE23_ROW    m6, m7,  1, r0 + r1 * 2
    ANG32_MODE23_SLIDE  12
    ANG32_MODE23_ROW    m6, m7, 24, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    ANG32_MODE23_ROW    m6, m7, 15, r0
    ANG32_MODE23_ROW    m6, m7,  6, r0 + r1
    ANG32_MODE23_SLIDE  10
    ANG32_MODE23_ROW    m6, m7, 29, r0 + r1 * 2
    ANG32_MODE23_ROW    m6, m7, 20, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE23_ROW    m6, m7, 11, r0
    ANG32_MODE23_ROW    m6, m7,  2, r0 + r1
    ANG32_MODE23_SLIDE  8
    ANG32_MODE23_ROW    m6, m7, 25, r0 + r1 * 2
    ANG32_MODE23_ROW    m6, m7, 16, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    ANG32_MODE23_ROW    m6, m7,  7, r0
    ANG32_MODE23_SLIDE  6
    ANG32_MODE23_ROW    m6, m7, 30, r0 + r1
    ANG32_MODE23_ROW    m6, m7, 21, r0 + r1 * 2
    ANG32_MODE23_ROW    m6, m7, 12, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE23_ROW    m6, m7,  3, r0
    ANG32_MODE23_SLIDE  4
    ANG32_MODE23_ROW    m6, m7, 26, r0 + r1
    ANG32_MODE23_ROW    m6, m7, 17, r0 + r1 * 2
    ANG32_MODE23_ROW    m6, m7,  8, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    ANG32_MODE23_SLIDE  2
    ANG32_MODE23_ROW    m6, m7, 31, r0
    ANG32_MODE23_ROW    m6, m7, 22, r0 + r1
    ANG32_MODE23_ROW    m6, m7, 13, r0 + r1 * 2
    ANG32_MODE23_ROW    m6, m7,  4, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ; The window has moved back a full register: m3/m0 are used directly.
    ANG32_MODE23_ROW    m3, m0, 27, r0
    ANG32_MODE23_ROW    m3, m0, 18, r0 + r1
    ANG32_MODE23_ROW    m3, m0,  9, r0 + r1 * 2

    ; Last row: no weighting. Mask each pair with pw_00ff (presumably keeps
    ; the low byte of every word -- constant defined elsewhere) and pack.
    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET
15805
;-----------------------------------------------------------------------------
; intra_pred_ang32_14
;
; Register roles, as used in this function:
;   r0      destination pointer, advanced four rows at a time
;   r1      distance between consecutive destination rows
;   r2      reference sample buffer (only read here)
;   r3      r1 * 3, address of the fourth row in each group
;   m0/m1   per-column weights from ang32_fact_mode14 (table defined elsewhere)
;   m2      pw_1024 (presumably words of 1024, so pmulhrsw acts as a rounding
;           >> 5 -- constant is defined elsewhere, confirm there)
;   m7/m8   pshufb patterns from ang32_shuf_mode14 for the two halves of a row
;   m6:m3   sliding byte window over the reference samples
;
; Every output row is produced from the window shifted by one more byte than
; the previous row: shifts 1..15 via palignr, then m3 itself for the 16th row.
; After 16 rows the window advances by 16 bytes and the pattern repeats,
; giving 32 stores of mmsize bytes in total.
; NOTE(review): assumes INIT_YMM avx2 is in effect from earlier in the file.
;-----------------------------------------------------------------------------

; Weight both shuffled halves (m4, m5), round, pack to bytes, store at [%1].
%macro ANG32_MODE14_BLEND_STORE 1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [%1], m4
%endmacro

; One row from the window m3:m6 shifted right by %1 bytes; stored at [%2].
%macro ANG32_MODE14_ROW 2
    palignr         m5, m3, m6, %1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    ANG32_MODE14_BLEND_STORE %2
%endmacro

; The 16th row of a group: the window is exactly m3, no palignr needed.
%macro ANG32_MODE14_ROW_FULL 1
    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    ANG32_MODE14_BLEND_STORE %1
%endmacro

; Sixteen consecutive rows. r0 is advanced by 12 rows inside the macro; the
; caller steps over the final group of four if more rows follow.
%macro ANG32_MODE14_16ROWS 0
    ANG32_MODE14_ROW         1, r0
    ANG32_MODE14_ROW         2, r0 + r1
    ANG32_MODE14_ROW         3, r0 + r1 * 2
    ANG32_MODE14_ROW         4, r0 + r3
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE14_ROW         5, r0
    ANG32_MODE14_ROW         6, r0 + r1
    ANG32_MODE14_ROW         7, r0 + r1 * 2
    ANG32_MODE14_ROW         8, r0 + r3
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE14_ROW         9, r0
    ANG32_MODE14_ROW        10, r0 + r1
    ANG32_MODE14_ROW        11, r0 + r1 * 2
    ANG32_MODE14_ROW        12, r0 + r3
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE14_ROW        13, r0
    ANG32_MODE14_ROW        14, r0 + r1
    ANG32_MODE14_ROW        15, r0 + r1 * 2
    ANG32_MODE14_ROW_FULL       r0 + r3
%endmacro

cglobal intra_pred_ang32_14, 3,4,9
    movu            m0, [ang32_fact_mode14]
    movu            m1, [ang32_fact_mode14 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode14]
    mova            m8, [ang32_shuf_mode14 + mmsize]
    lea             r3, [r1 * 3]

    ; Build the low part of the window in m6. Original author's note on the
    ; gathered reference indices:
    ; [30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2...]
    ; (the shuffle table used here is defined elsewhere).
    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode14 + mmsize*2]
    vpermq          m6, m6, 01110111b
    pslldq          m6, m6, 1

    ; High part of the window: 16 bytes from the second reference array,
    ; replicated into both 128-bit lanes.
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    ; rows 0 to 15
    ANG32_MODE14_16ROWS
    lea             r0, [r0 + r1 * 4]

    ; Advance the window by 16 bytes: the old high part becomes the low part
    ; and the next 16 reference bytes are loaded.
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]

    ; rows 16 to 31
    ANG32_MODE14_16ROWS
    RET
16156
;-----------------------------------------------------------------------------
; intra_pred_ang32_22
;
; Register roles, as used in this function:
;   r0      destination pointer, advanced four rows at a time
;   r1      distance between consecutive destination rows
;   r2      reference sample buffer (only read here)
;   r3      ang_table_avx2 + 16 rows, so row n of the table is at
;           [r3 + (n - 16) * 32]
;   r4      r1 * 3, address of the fourth row in each group
;   m5      pw_1024 (presumably words of 1024, so pmulhrsw acts as a rounding
;           >> 5 -- constant is defined elsewhere, confirm there)
;   m0/m2   byte pairs ([r2 + i], [r2 + i + 1]) from punpcklbw / punpckhbw
;   m3, m8  pairs that precede m0 (m3) and m3 (m8), built from
;           [r2 + mmsize*2 + 2] through ang32_shuf_mode22 (defined elsewhere)
;   m6/m7   current pair window, re-derived with palignr as rows progress
;
; 32 rows of mmsize bytes are stored. Each row multiplies the current pair
; window by one row of ang_table_avx2; the final row skips the multiply and
; masks each pair with pw_00ff instead.
; NOTE(review): assumes INIT_YMM avx2 is in effect from earlier in the file.
;-----------------------------------------------------------------------------

; One output row.
;   %1 / %2  pair registers feeding the low / high half of the packed result
;   %3       ang_table_avx2 row index (0..31) holding the pair weights
;   %4       destination address expression
%macro ANG32_MODE22_ROW 4
    pmaddubsw       m4, %1, [r3 + (%3 - 16) * 32]
    pmulhrsw        m4, m5
    pmaddubsw       m1, %2, [r3 + (%3 - 16) * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [%4], m4
%endmacro

; Re-derive the window m6/m7 from three adjacent pair registers.
;   %1 = lowest, %2 = middle, %3 = highest register of the run
;   %4 = per-lane byte shift passed to palignr
%macro ANG32_MODE22_SLIDE 4
    palignr         m6, %2, %1, %4
    palignr         m7, %3, %2, %4
%endmacro

cglobal intra_pred_ang32_22, 3,5,9
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; Interleave each reference byte with its right-hand neighbour.
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; Build m3 and m8, the pairs that sit in front of m0.
    movu            m4, [r2 + mmsize*2 + 2]
    pshufb          m4, [ang32_shuf_mode22]
    vextracti128    xm8, m4, 1

    palignr         m3, m0, m4, 2
    palignr         m3, m8, 15
    vinserti128     m3, m3, xm2, 1
    vinserti128     m8, m8, xm0, 1

    ; rows 0 to 7
    ANG32_MODE22_ROW    m0, m2, 19, r0
    ANG32_MODE22_ROW    m0, m2,  6, r0 + r1
    ANG32_MODE22_SLIDE  m3, m0, m2, 14
    ANG32_MODE22_ROW    m6, m7, 25, r0 + r1 * 2
    ANG32_MODE22_ROW    m6, m7, 12, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE22_SLIDE  m3, m0, m2, 12
    ANG32_MODE22_ROW    m6, m7, 31, r0
    ANG32_MODE22_ROW    m6, m7, 18, r0 + r1
    ANG32_MODE22_ROW    m6, m7,  5, r0 + r1 * 2
    ANG32_MODE22_SLIDE  m3, m0, m2, 10
    ANG32_MODE22_ROW    m6, m7, 24, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    ANG32_MODE22_ROW    m6, m7, 11, r0
    ANG32_MODE22_SLIDE  m3, m0, m2, 8
    ANG32_MODE22_ROW    m6, m7, 30, r0 + r1
    ANG32_MODE22_ROW    m6, m7, 17, r0 + r1 * 2
    ANG32_MODE22_ROW    m6, m7,  4, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE22_SLIDE  m3, m0, m2, 6
    ANG32_MODE22_ROW    m6, m7, 23, r0
    ANG32_MODE22_ROW    m6, m7, 10, r0 + r1
    ANG32_MODE22_SLIDE  m3, m0, m2, 4
    ANG32_MODE22_ROW    m6, m7, 29, r0 + r1 * 2
    ANG32_MODE22_ROW    m6, m7, 16, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    ANG32_MODE22_ROW    m6, m7,  3, r0
    ANG32_MODE22_SLIDE  m3, m0, m2, 2
    ANG32_MODE22_ROW    m6, m7, 22, r0 + r1
    ANG32_MODE22_ROW    m6, m7,  9, r0 + r1 * 2
    ; The window has moved back a full register: m3/m0 are used directly.
    ANG32_MODE22_ROW    m3, m0, 28, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE22_ROW    m3, m0, 15, r0
    ANG32_MODE22_ROW    m3, m0,  2, r0 + r1
    ; From here the window is taken from the run m8:m3:m0.
    ANG32_MODE22_SLIDE  m8, m3, m0, 14
    ANG32_MODE22_ROW    m6, m7, 21, r0 + r1 * 2
    ANG32_MODE22_ROW    m6, m7,  8, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    ANG32_MODE22_SLIDE  m8, m3, m0, 12
    ANG32_MODE22_ROW    m6, m7, 27, r0
    ANG32_MODE22_ROW    m6, m7, 14, r0 + r1
    ANG32_MODE22_ROW    m6, m7,  1, r0 + r1 * 2
    ANG32_MODE22_SLIDE  m8, m3, m0, 10
    ANG32_MODE22_ROW    m6, m7, 20, r0 + r4
    lea             r0, [r0 + r1 * 4]

    ANG32_MODE22_ROW    m6, m7,  7, r0

    ; Shift the run in place for the last three rows. m0 must be updated
    ; first because it still needs the old value of m3.
    palignr         m0, m3, 8
    palignr         m3, m8, 8
    ANG32_MODE22_ROW    m3, m0, 26, r0 + r1
    ANG32_MODE22_ROW    m3, m0, 13, r0 + r1 * 2

    ; Last row: no weighting. Mask each pair with pw_00ff (presumably keeps
    ; the low byte of every word -- constant defined elsewhere) and pack.
    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET
16446
;-----------------------------------------------------------------------------
; intra_pred_ang32_15
; Writes 32 output rows, one full vector register (m4) per row, to r0.
; cglobal declares 3 arguments, 4 general-purpose and 9 vector registers.
;   r0 = destination pointer (advanced by 4 * r1 after every four rows)
;   r1 = destination stride in bytes
;   r2 = reference sample buffer (bytes)
;   r3 = 3 * r1 (scratch, address of every fourth row)
; Vector register roles:
;   m0, m1 = pmaddubsw weights for the m4 / m5 halves (constant for all rows)
;   m2     = pw_1024, pmulhrsw factor
;   m7, m8 = pshufb masks for the m4 / m5 halves
;   m3:m6  = sliding reference window (m3 = upper part, m6 = lower part)
;   m4, m5 = per-row temporaries; packuswb m4, m5 forms the stored row
; Every row uses the same weights; only the palignr byte offset into the
; m3:m6 window changes (+1 per row). The m5 offset trails the m4 offset by 2.
; NOTE(review): presumably the AVX2 mode-15 predictor for 32x32 blocks with a
; (dst, dstStride, srcPix, ...) prototype - confirm against the C declaration.
;-----------------------------------------------------------------------------
16447 cglobal intra_pred_ang32_15, 3,4,9
16448 movu m0, [ang32_fact_mode15] ; weights multiplied with m4
16449 movu m1, [ang32_fact_mode15 + mmsize] ; weights multiplied with m5
16450 mova m2, [pw_1024] ; presumably words of 1024, making pmulhrsw a rounded >> 5 - confirm
16451 mova m7, [ang32_shuf_mode15] ; shuffle mask applied to m4
16452 mova m8, [ang32_shuf_mode15 + mmsize] ; shuffle mask applied to m5
16453 lea r3, [r1 * 3] ; r3 = 3 * stride
16454
16455 ; prepare for [30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2...]
16456
16457 movu m6, [r2]
16458 pshufb m6, [ang32_shuf_mode15 + mmsize*2] ; presumably gathers the bytes listed above (mask defined elsewhere)
16459 vpermq m6, m6, 01110111b ; qwords 3,1,3,1: both 128-bit lanes hold the same 16 bytes
16460
16461 movu xm3, [r2 + mmsize*2] ; 16 bytes from the second segment of the reference buffer
16462 pinsrb xm3, [r2], 0 ; byte 0 is replaced by the byte at r2[0]
16463 vpermq m3, m3, 01000100b ; qwords 0,1,0,1: replicate the low 128 bits into both lanes
16464
16465 palignr m4, m3, m6, 2
16466 pshufb m4, m7
16467 pshufb m5, m6, m8 ; m5 offset is 0 here, so m6 is shuffled directly
16468 pmaddubsw m4, m0
16469 pmaddubsw m5, m1
16470 pmulhrsw m4, m2
16471 pmulhrsw m5, m2
16472 packuswb m4, m5
16473 movu [r0], m4 ; row 0
16474
16475 palignr m4, m3, m6, 3
16476 pshufb m4, m7
16477 palignr m5, m3, m6, 1
16478 pshufb m5, m8
16479 pmaddubsw m4, m0
16480 pmaddubsw m5, m1
16481 pmulhrsw m4, m2
16482 pmulhrsw m5, m2
16483 packuswb m4, m5
16484 movu [r0 + r1], m4 ; row 1
16485
16486 palignr m4, m3, m6, 4
16487 pshufb m4, m7
16488 palignr m5, m3, m6, 2
16489 pshufb m5, m8
16490 pmaddubsw m4, m0
16491 pmaddubsw m5, m1
16492 pmulhrsw m4, m2
16493 pmulhrsw m5, m2
16494 packuswb m4, m5
16495 movu [r0 + r1 * 2], m4 ; row 2
16496
16497 palignr m4, m3, m6, 5
16498 pshufb m4, m7
16499 palignr m5, m3, m6, 3
16500 pshufb m5, m8
16501 pmaddubsw m4, m0
16502 pmaddubsw m5, m1
16503 pmulhrsw m4, m2
16504 pmulhrsw m5, m2
16505 packuswb m4, m5
16506 movu [r0 + r3], m4 ; row 3
16507
16508 lea r0, [r0 + r1 * 4]
16509
16510 palignr m4, m3, m6, 6
16511 pshufb m4, m7
16512 palignr m5, m3, m6, 4
16513 pshufb m5, m8
16514 pmaddubsw m4, m0
16515 pmaddubsw m5, m1
16516 pmulhrsw m4, m2
16517 pmulhrsw m5, m2
16518 packuswb m4, m5
16519 movu [r0], m4 ; row 4
16520
16521 palignr m4, m3, m6, 7
16522 pshufb m4, m7
16523 palignr m5, m3, m6, 5
16524 pshufb m5, m8
16525 pmaddubsw m4, m0
16526 pmaddubsw m5, m1
16527 pmulhrsw m4, m2
16528 pmulhrsw m5, m2
16529 packuswb m4, m5
16530 movu [r0 + r1], m4 ; row 5
16531
16532 palignr m4, m3, m6, 8
16533 pshufb m4, m7
16534 palignr m5, m3, m6, 6
16535 pshufb m5, m8
16536 pmaddubsw m4, m0
16537 pmaddubsw m5, m1
16538 pmulhrsw m4, m2
16539 pmulhrsw m5, m2
16540 packuswb m4, m5
16541 movu [r0 + r1 * 2], m4 ; row 6
16542
16543 palignr m4, m3, m6, 9
16544 pshufb m4, m7
16545 palignr m5, m3, m6, 7
16546 pshufb m5, m8
16547 pmaddubsw m4, m0
16548 pmaddubsw m5, m1
16549 pmulhrsw m4, m2
16550 pmulhrsw m5, m2
16551 packuswb m4, m5
16552 movu [r0 + r3], m4 ; row 7
16553
16554 lea r0, [r0 + r1 * 4]
16555
16556 palignr m4, m3, m6, 10
16557 pshufb m4, m7
16558 palignr m5, m3, m6, 8
16559 pshufb m5, m8
16560 pmaddubsw m4, m0
16561 pmaddubsw m5, m1
16562 pmulhrsw m4, m2
16563 pmulhrsw m5, m2
16564 packuswb m4, m5
16565 movu [r0], m4 ; row 8
16566
16567 palignr m4, m3, m6, 11
16568 pshufb m4, m7
16569 palignr m5, m3, m6, 9
16570 pshufb m5, m8
16571 pmaddubsw m4, m0
16572 pmaddubsw m5, m1
16573 pmulhrsw m4, m2
16574 pmulhrsw m5, m2
16575 packuswb m4, m5
16576 movu [r0 + r1], m4 ; row 9
16577
16578 palignr m4, m3, m6, 12
16579 pshufb m4, m7
16580 palignr m5, m3, m6, 10
16581 pshufb m5, m8
16582 pmaddubsw m4, m0
16583 pmaddubsw m5, m1
16584 pmulhrsw m4, m2
16585 pmulhrsw m5, m2
16586 packuswb m4, m5
16587 movu [r0 + r1 * 2], m4 ; row 10
16588
16589 palignr m4, m3, m6, 13
16590 pshufb m4, m7
16591 palignr m5, m3, m6, 11
16592 pshufb m5, m8
16593 pmaddubsw m4, m0
16594 pmaddubsw m5, m1
16595 pmulhrsw m4, m2
16596 pmulhrsw m5, m2
16597 packuswb m4, m5
16598 movu [r0 + r3], m4 ; row 11
16599
16600 lea r0, [r0 + r1 * 4]
16601
16602 palignr m4, m3, m6, 14
16603 pshufb m4, m7
16604 palignr m5, m3, m6, 12
16605 pshufb m5, m8
16606 pmaddubsw m4, m0
16607 pmaddubsw m5, m1
16608 pmulhrsw m4, m2
16609 pmulhrsw m5, m2
16610 packuswb m4, m5
16611 movu [r0], m4 ; row 12
16612
16613 palignr m4, m3, m6, 15
16614 pshufb m4, m7
16615 palignr m5, m3, m6, 13
16616 pshufb m5, m8
16617 pmaddubsw m4, m0
16618 pmaddubsw m5, m1
16619 pmulhrsw m4, m2
16620 pmulhrsw m5, m2
16621 packuswb m4, m5
16622 movu [r0 + r1], m4 ; row 13
16623
16624 pshufb m4, m3, m7 ; m4 offset has reached 16, so m3 is shuffled directly
16625 palignr m5, m3, m6, 14
16626 pshufb m5, m8
16627 pmaddubsw m4, m0
16628 pmaddubsw m5, m1
16629 pmulhrsw m4, m2
16630 pmulhrsw m5, m2
16631 packuswb m4, m5
16632 movu [r0 + r1 * 2], m4 ; row 14
16633
; Slide the window by 16 bytes. The m5 source for row 15 is taken from the
; old window first; then m6 takes the old m3 and m3 is loaded with the next
; 16 reference bytes (broadcast to both lanes).
16634 palignr m5, m3, m6, 15
16635 mova m6, m3
16636 vbroadcasti128 m3, [r2 + mmsize*2 + 16]
16637
16638 palignr m4, m3, m6, 1
16639 pshufb m4, m7
16640 pshufb m5, m8
16641 pmaddubsw m4, m0
16642 pmaddubsw m5, m1
16643 pmulhrsw m4, m2
16644 pmulhrsw m5, m2
16645 packuswb m4, m5
16646 movu [r0 + r3], m4 ; row 15
16647
16648 lea r0, [r0 + r1 * 4]
16649
; Rows 16..30 repeat the offset sequence of rows 0..14 on the shifted window.
16650 palignr m4, m3, m6, 2
16651 pshufb m4, m7
16652 pshufb m5, m6, m8
16653 pmaddubsw m4, m0
16654 pmaddubsw m5, m1
16655 pmulhrsw m4, m2
16656 pmulhrsw m5, m2
16657 packuswb m4, m5
16658 movu [r0], m4 ; row 16
16659
16660 palignr m4, m3, m6, 3
16661 pshufb m4, m7
16662 palignr m5, m3, m6, 1
16663 pshufb m5, m8
16664 pmaddubsw m4, m0
16665 pmaddubsw m5, m1
16666 pmulhrsw m4, m2
16667 pmulhrsw m5, m2
16668 packuswb m4, m5
16669 movu [r0 + r1], m4 ; row 17
16670
16671 palignr m4, m3, m6, 4
16672 pshufb m4, m7
16673 palignr m5, m3, m6, 2
16674 pshufb m5, m8
16675 pmaddubsw m4, m0
16676 pmaddubsw m5, m1
16677 pmulhrsw m4, m2
16678 pmulhrsw m5, m2
16679 packuswb m4, m5
16680 movu [r0 + r1 * 2], m4 ; row 18
16681
16682 palignr m4, m3, m6, 5
16683 pshufb m4, m7
16684 palignr m5, m3, m6, 3
16685 pshufb m5, m8
16686 pmaddubsw m4, m0
16687 pmaddubsw m5, m1
16688 pmulhrsw m4, m2
16689 pmulhrsw m5, m2
16690 packuswb m4, m5
16691 movu [r0 + r3], m4 ; row 19
16692
16693 lea r0, [r0 + r1 * 4]
16694
16695 palignr m4, m3, m6, 6
16696 pshufb m4, m7
16697 palignr m5, m3, m6, 4
16698 pshufb m5, m8
16699 pmaddubsw m4, m0
16700 pmaddubsw m5, m1
16701 pmulhrsw m4, m2
16702 pmulhrsw m5, m2
16703 packuswb m4, m5
16704 movu [r0], m4 ; row 20
16705
16706 palignr m4, m3, m6, 7
16707 pshufb m4, m7
16708 palignr m5, m3, m6, 5
16709 pshufb m5, m8
16710 pmaddubsw m4, m0
16711 pmaddubsw m5, m1
16712 pmulhrsw m4, m2
16713 pmulhrsw m5, m2
16714 packuswb m4, m5
16715 movu [r0 + r1], m4 ; row 21
16716
16717 palignr m4, m3, m6, 8
16718 pshufb m4, m7
16719 palignr m5, m3, m6, 6
16720 pshufb m5, m8
16721 pmaddubsw m4, m0
16722 pmaddubsw m5, m1
16723 pmulhrsw m4, m2
16724 pmulhrsw m5, m2
16725 packuswb m4, m5
16726 movu [r0 + r1 * 2], m4 ; row 22
16727
16728 palignr m4, m3, m6, 9
16729 pshufb m4, m7
16730 palignr m5, m3, m6, 7
16731 pshufb m5, m8
16732 pmaddubsw m4, m0
16733 pmaddubsw m5, m1
16734 pmulhrsw m4, m2
16735 pmulhrsw m5, m2
16736 packuswb m4, m5
16737 movu [r0 + r3], m4 ; row 23
16738
16739 lea r0, [r0 + r1 * 4]
16740
16741 palignr m4, m3, m6, 10
16742 pshufb m4, m7
16743 palignr m5, m3, m6, 8
16744 pshufb m5, m8
16745 pmaddubsw m4, m0
16746 pmaddubsw m5, m1
16747 pmulhrsw m4, m2
16748 pmulhrsw m5, m2
16749 packuswb m4, m5
16750 movu [r0], m4 ; row 24
16751
16752 palignr m4, m3, m6, 11
16753 pshufb m4, m7
16754 palignr m5, m3, m6, 9
16755 pshufb m5, m8
16756 pmaddubsw m4, m0
16757 pmaddubsw m5, m1
16758 pmulhrsw m4, m2
16759 pmulhrsw m5, m2
16760 packuswb m4, m5
16761 movu [r0 + r1], m4 ; row 25
16762
16763 palignr m4, m3, m6, 12
16764 pshufb m4, m7
16765 palignr m5, m3, m6, 10
16766 pshufb m5, m8
16767 pmaddubsw m4, m0
16768 pmaddubsw m5, m1
16769 pmulhrsw m4, m2
16770 pmulhrsw m5, m2
16771 packuswb m4, m5
16772 movu [r0 + r1 * 2], m4 ; row 26
16773
16774 palignr m4, m3, m6, 13
16775 pshufb m4, m7
16776 palignr m5, m3, m6, 11
16777 pshufb m5, m8
16778 pmaddubsw m4, m0
16779 pmaddubsw m5, m1
16780 pmulhrsw m4, m2
16781 pmulhrsw m5, m2
16782 packuswb m4, m5
16783 movu [r0 + r3], m4 ; row 27
16784
16785 lea r0, [r0 + r1 * 4]
16786
16787 palignr m4, m3, m6, 14
16788 pshufb m4, m7
16789 palignr m5, m3, m6, 12
16790 pshufb m5, m8
16791 pmaddubsw m4, m0
16792 pmaddubsw m5, m1
16793 pmulhrsw m4, m2
16794 pmulhrsw m5, m2
16795 packuswb m4, m5
16796 movu [r0], m4 ; row 28
16797
16798 palignr m4, m3, m6, 15
16799 pshufb m4, m7
16800 palignr m5, m3, m6, 13
16801 pshufb m5, m8
16802 pmaddubsw m4, m0
16803 pmaddubsw m5, m1
16804 pmulhrsw m4, m2
16805 pmulhrsw m5, m2
16806 packuswb m4, m5
16807 movu [r0 + r1], m4 ; row 29
16808
16809 pshufb m4, m3, m7
16810 palignr m5, m3, m6, 14
16811 pshufb m5, m8
16812 pmaddubsw m4, m0
16813 pmaddubsw m5, m1
16814 pmulhrsw m4, m2
16815 pmulhrsw m5, m2
16816 packuswb m4, m5
16817 movu [r0 + r1 * 2], m4 ; row 30
16818
; Last row: m5 still comes from the current window; m6 is then reused to hold
; the following 16 reference bytes so that m6:m3 supplies the m4 source.
16819 palignr m5, m3, m6, 15
16820 vbroadcasti128 m6, [r2 + mmsize*2 + 32]
16821
16822 palignr m4, m6, m3, 1
16823 pshufb m4, m7
16824 pshufb m5, m8
16825 pmaddubsw m4, m0
16826 pmaddubsw m5, m1
16827 pmulhrsw m4, m2
16828 pmulhrsw m5, m2
16829 packuswb m4, m5
16830 movu [r0 + r3], m4 ; row 31
16831 RET
16832
;-----------------------------------------------------------------------------
; intra_pred_ang32_21
; Writes 32 output rows, one full vector register per row, to r0.
; cglobal declares 3 arguments, 5 general-purpose and 9 vector registers.
;   r0 = destination pointer (advanced by 4 * r1 after every four rows)
;   r1 = destination stride in bytes
;   r2 = reference sample buffer (bytes)
;   r3 = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table entry 16 + k;
;        the bracketed number in each row comment is that entry index
;   r4 = 3 * r1 (scratch, address of every fourth row)
; Vector register roles:
;   m5     = pw_1024, pmulhrsw factor (presumably a rounded >> 5 - confirm)
;   m0, m2 = low / high byte interleave of [r2] and [r2 + 1]
;   m3, m8 = continuation of that pair stream below m0, built from the bytes
;            at r2 + mmsize*2 after shuffling with ang32_shuf_mode21
;   m6, m7 = palignr windows for the current row(s); reused by consecutive
;            rows that need the same byte offset
;   m4, m1 = per-row temporaries; packuswb m4, m1 forms the stored row
; Unlike the per-column weight scheme, each row here takes its pmaddubsw
; weights from a different ang_table_avx2 entry.
; NOTE(review): presumably the AVX2 mode-21 predictor for 32x32 blocks -
; confirm the prototype against the C declaration.
;-----------------------------------------------------------------------------
16833 cglobal intra_pred_ang32_21, 3,5,9
16834 lea r3, [ang_table_avx2 + 32 * 16] ; bias the table pointer so offsets -16..+15 reach entries 0..31
16835 lea r4, [r1 * 3] ; r4 = 3 * stride
16836 mova m5, [pw_1024]
16837
16838 ; rows 0 to 7
16839 movu m0, [r2 + 0]
16840 movu m1, [r2 + 1]
16841 punpckhbw m2, m0, m1 ; m2 = (ref[i], ref[i+1]) byte pairs, high half of each lane
16842 punpcklbw m0, m1 ; m0 = (ref[i], ref[i+1]) byte pairs, low half of each lane
16843
16844 movu m4, [r2 + mmsize*2]
16845 pshufb m4, [ang32_shuf_mode21] ; mask defined elsewhere; arranges the bytes used to extend the stream
16846 vextracti128 xm6, m4, 1 ; xm6 = upper 128 bits of the shuffled bytes
16847
16848 palignr m3, m0, m4, 1
16849 palignr m8, m3, m6, 1
16850 vinserti128 m3, m3, xm2, 1 ; upper lane of m3 = low lane of m2
16851 vinserti128 m8, m8, xm0, 1 ; upper lane of m8 = low lane of m0
16852
16853 pmaddubsw m4, m0, [r3 - 1 * 32] ; [15]
16854 pmulhrsw m4, m5
16855 pmaddubsw m1, m2, [r3 - 1 * 32]
16856 pmulhrsw m1, m5
16857 packuswb m4, m1
16858 movu [r0], m4
16859
16860 palignr m6, m0, m3, 14
16861 palignr m7, m2, m0, 14
16862 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
16863 pmulhrsw m4, m5
16864 pmaddubsw m1, m7, [r3 + 14 * 32]
16865 pmulhrsw m1, m5
16866 packuswb m4, m1
16867 movu [r0 + r1], m4
16868
16869 pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
16870 pmulhrsw m4, m5
16871 pmaddubsw m1, m7, [r3 - 3 * 32]
16872 pmulhrsw m1, m5
16873 packuswb m4, m1
16874 movu [r0 + r1*2], m4
16875
16876 palignr m6, m0, m3, 12
16877 palignr m7, m2, m0, 12
16878 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
16879 pmulhrsw m4, m5
16880 pmaddubsw m1, m7, [r3 + 12 * 32]
16881 pmulhrsw m1, m5
16882 packuswb m4, m1
16883 movu [r0 + r4], m4
16884
16885 lea r0, [r0 + r1 * 4]
16886
16887 pmaddubsw m4, m6, [r3 - 5 * 32] ; [11]
16888 pmulhrsw m4, m5
16889 pmaddubsw m1, m7, [r3 - 5 * 32]
16890 pmulhrsw m1, m5
16891 packuswb m4, m1
16892 movu [r0], m4
16893
16894 palignr m6, m0, m3, 10
16895 palignr m7, m2, m0, 10
16896 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
16897 pmulhrsw m4, m5
16898 pmaddubsw m1, m7, [r3 + 10 * 32]
16899 pmulhrsw m1, m5
16900 packuswb m4, m1
16901 movu [r0 + r1], m4
16902
16903 pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
16904 pmulhrsw m4, m5
16905 pmaddubsw m1, m7, [r3 - 7 * 32]
16906 pmulhrsw m1, m5
16907 packuswb m4, m1
16908 movu [r0 + r1*2], m4
16909
16910 palignr m6, m0, m3, 8
16911 palignr m7, m2, m0, 8
16912
16913 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
16914 pmulhrsw m4, m5
16915 pmaddubsw m1, m7, [r3 + 8 * 32]
16916 pmulhrsw m1, m5
16917 packuswb m4, m1
16918 movu [r0 + r4], m4
16919
16920 lea r0, [r0 + r1 * 4]
16921
16922 ; rows 8 to 15
16923 pmaddubsw m4, m6, [r3 - 9 * 32] ; [7]
16924 pmulhrsw m4, m5
16925 pmaddubsw m1, m7, [r3 - 9 * 32]
16926 pmulhrsw m1, m5
16927 packuswb m4, m1
16928 movu [r0], m4
16929
16930 palignr m6, m0, m3, 6
16931 palignr m7, m2, m0, 6
16932 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
16933 pmulhrsw m4, m5
16934 pmaddubsw m1, m7, [r3 + 6 * 32]
16935 pmulhrsw m1, m5
16936 packuswb m4, m1
16937 movu [r0 + r1], m4
16938
16939 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
16940 pmulhrsw m4, m5
16941 pmaddubsw m1, m7, [r3 - 11 * 32]
16942 pmulhrsw m1, m5
16943 packuswb m4, m1
16944 movu [r0 + r1*2], m4
16945
16946 palignr m6, m0, m3, 4
16947 palignr m7, m2, m0, 4
16948 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
16949 pmulhrsw m4, m5
16950 pmaddubsw m1, m7, [r3 + 4 * 32]
16951 pmulhrsw m1, m5
16952 packuswb m4, m1
16953 movu [r0 + r4], m4
16954
16955 lea r0, [r0 + r1 * 4]
16956
16957 pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
16958 pmulhrsw m4, m5
16959 pmaddubsw m1, m7, [r3 - 13 * 32]
16960 pmulhrsw m1, m5
16961 packuswb m4, m1
16962 movu [r0], m4
16963
16964 palignr m6, m0, m3, 2
16965 palignr m7, m2, m0, 2
16966 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
16967 pmulhrsw m4, m5
16968 pmaddubsw m1, m7, [r3 + 2 * 32]
16969 pmulhrsw m1, m5
16970 packuswb m4, m1
16971 movu [r0 + r1], m4
16972
16973 pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
16974 pmulhrsw m4, m5
16975 pmaddubsw m1, m7, [r3 - 15 * 32]
16976 pmulhrsw m1, m5
16977 packuswb m4, m1
16978 movu [r0 + r1 * 2], m4
16979
; The window has moved a full 16 bytes: m3/m0 now play the role m0/m2 had.
16980 pmaddubsw m4, m3, [r3] ; [16]
16981 pmulhrsw m4, m5
16982 pmaddubsw m1, m0, [r3]
16983 pmulhrsw m1, m5
16984 packuswb m4, m1
16985 movu [r0 + r4], m4
16986
16987 lea r0, [r0 + r1 * 4]
16988
16989 ; rows 16 to 23
16990 palignr m6, m3, m8, 14
16991 palignr m7, m0, m3, 14
16992 pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
16993 pmulhrsw m4, m5
16994 pmaddubsw m1, m7, [r3 + 15 * 32]
16995 pmulhrsw m1, m5
16996 packuswb m4, m1
16997 movu [r0], m4
16998
16999 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
17000 pmulhrsw m4, m5
17001 pmaddubsw m1, m7, [r3 - 2 * 32]
17002 pmulhrsw m1, m5
17003 packuswb m4, m1
17004 movu [r0 + r1], m4
17005
17006 palignr m6, m3, m8, 12
17007 palignr m7, m0, m3, 12
17008 pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
17009 pmulhrsw m4, m5
17010 pmaddubsw m1, m7, [r3 + 13 * 32]
17011 pmulhrsw m1, m5
17012 packuswb m4, m1
17013 movu [r0 + r1*2], m4
17014
17015 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
17016 pmulhrsw m4, m5
17017 pmaddubsw m1, m7, [r3 - 4 * 32]
17018 pmulhrsw m1, m5
17019 packuswb m4, m1
17020 movu [r0 + r4], m4
17021
17022 lea r0, [r0 + r1 * 4]
17023
17024 palignr m6, m3, m8, 10
17025 palignr m7, m0, m3, 10
17026 pmaddubsw m4, m6, [r3 + 11 * 32] ; [27]
17027 pmulhrsw m4, m5
17028 pmaddubsw m1, m7, [r3 + 11 * 32]
17029 pmulhrsw m1, m5
17030 packuswb m4, m1
17031 movu [r0], m4
17032
17033 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
17034 pmulhrsw m4, m5
17035 pmaddubsw m1, m7, [r3 - 6 * 32]
17036 pmulhrsw m1, m5
17037 packuswb m4, m1
17038 movu [r0 + r1], m4
17039
17040 palignr m6, m3, m8, 8
17041 palignr m7, m0, m3, 8
17042 pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
17043 pmulhrsw m4, m5
17044 pmaddubsw m1, m7, [r3 + 9 * 32]
17045 pmulhrsw m1, m5
17046 packuswb m4, m1
17047 movu [r0 + r1*2], m4
17048
17049 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
17050 pmulhrsw m4, m5
17051 pmaddubsw m1, m7, [r3 - 8 * 32]
17052 pmulhrsw m1, m5
17053 packuswb m4, m1
17054 movu [r0 + r4], m4
17055
17056 lea r0, [r0 + r1 * 4]
17057
17058 ; rows 24 to 31
17059 palignr m6, m3, m8, 6
17060 palignr m7, m0, m3, 6
17061 pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
17062 pmulhrsw m4, m5
17063 pmaddubsw m1, m7, [r3 + 7 * 32]
17064 pmulhrsw m1, m5
17065 packuswb m4, m1
17066 movu [r0], m4
17067
17068 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
17069 pmulhrsw m4, m5
17070 pmaddubsw m1, m7, [r3 - 10 * 32]
17071 pmulhrsw m1, m5
17072 packuswb m4, m1
17073 movu [r0 + r1], m4
17074
17075 palignr m6, m3, m8, 4
17076 palignr m7, m0, m3, 4
17077 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
17078 pmulhrsw m4, m5
17079 pmaddubsw m1, m7, [r3 + 5 * 32]
17080 pmulhrsw m1, m5
17081 packuswb m4, m1
17082 movu [r0 + r1 * 2], m4
17083
17084 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
17085 pmulhrsw m4, m5
17086 pmaddubsw m1, m7, [r3 - 12 * 32]
17087 pmulhrsw m1, m5
17088 packuswb m4, m1
17089 movu [r0 + r4], m4
17090
17091 lea r0, [r0 + r1 * 4]
17092
17093 palignr m6, m3, m8, 2
17094 palignr m7, m0, m3, 2
17095 pmaddubsw m4, m6, [r3 + 3 * 32] ; [19]
17096 pmulhrsw m4, m5
17097 pmaddubsw m1, m7, [r3 + 3 * 32]
17098 pmulhrsw m1, m5
17099 packuswb m4, m1
17100 movu [r0], m4
17101
17102 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
17103 pmulhrsw m4, m5
17104 pmaddubsw m1, m7, [r3 - 14 * 32]
17105 pmulhrsw m1, m5
17106 packuswb m4, m1
17107 movu [r0 + r1], m4
17108
17109 pmaddubsw m4, m8, [r3 + 1 * 32] ; [17]
17110 pmulhrsw m4, m5
17111 pmaddubsw m1, m3, [r3 + 1 * 32]
17112 pmulhrsw m1, m5
17113 packuswb m4, m1
17114 movu [r0 + r1*2], m4
17115
; Row 31 needs no weighting: masking each word with pw_00ff (presumably
; 0x00ff words - confirm) keeps the first byte of every pair, and packuswb
; narrows those words back to bytes.
17116 pand m8, [pw_00ff]
17117 pand m3, [pw_00ff]
17118 packuswb m8, m3
17119 movu [r0 + r4], m8
17120 RET
17121
;-----------------------------------------------------------------------------
; intra_pred_ang32_16
; Writes 32 output rows, one full vector register (m4) per row, to r0.
; cglobal declares 3 arguments, 4 general-purpose and 10 vector registers.
;   r0 = destination pointer (advanced by 4 * r1 after every four rows)
;   r1 = destination stride in bytes
;   r2 = reference sample buffer (bytes)
;   r3 = 3 * r1 (scratch, address of every fourth row)
; Vector register roles:
;   m0, m1 = pmaddubsw weights for the m4 / m5 halves (constant for all rows)
;   m2     = pw_1024, pmulhrsw factor
;   m7, m8 = pshufb masks for the m4 / m5 halves
;   m9, m6, m3 = three consecutive 16-byte pieces of the reference window
;            (lowest to highest) for rows 0..15; for rows 16..31 the order is
;            m6, m3, m9 after m9 is reloaded with the next 16 bytes
;   m4, m5 = per-row temporaries
; The m4 byte offset grows by 1 per row; the m5 offset trails it by 11 bytes
; and therefore reads from the next-lower pair of window registers.
; After packuswb, vpermq q3120 swaps the two middle quadwords of the result
; before it is stored.
; NOTE(review): presumably the AVX2 mode-16 predictor for 32x32 blocks -
; confirm the prototype against the C declaration.
;-----------------------------------------------------------------------------
17122 cglobal intra_pred_ang32_16, 3,4,10
17123 movu m0, [ang32_fact_mode16] ; weights multiplied with m4
17124 movu m1, [ang32_fact_mode16 + mmsize] ; weights multiplied with m5
17125 mova m2, [pw_1024] ; presumably words of 1024, making pmulhrsw a rounded >> 5 - confirm
17126 mova m7, [ang32_shuf_mode16] ; shuffle mask applied to m4
17127 mova m8, [ang32_shuf_mode16 + mmsize] ; shuffle mask applied to m5
17128 lea r3, [r1 * 3] ; r3 = 3 * stride
17129
17130 ; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2...]
17131
17132 movu m6, [r2]
17133 pshufb m6, [ang32_shuf_mode16 + mmsize*2] ; presumably gathers the bytes listed above (mask defined elsewhere)
17134 mova m9, m6
17135 mova m3, [ang32_shuf_mode16 + mmsize*3] ; dword indices for vpermd (m3 is only a temporary here)
17136 vpermd m6, m3, m6 ; cross-lane dword permute of the gathered bytes
17137 vpermq m9, m9, q3232 ; both lanes of m9 = upper 128 bits of the gathered bytes
17138 pslldq m9, 4
17139 palignr m6, m9, 15 ; shift m6 up one byte, pulling in the top byte of m9
17140 pslldq m9, 1
17141
17142 vbroadcasti128 m3, [r2 + mmsize*2 + 1] ; next 16 reference bytes, in both lanes
17143
17144 palignr m4, m3, m6, 1
17145 palignr m5, m6, m9, 6
17146 pshufb m4, m7
17147 pshufb m5, m8
17148 pmaddubsw m4, m0
17149 pmaddubsw m5, m1
17150 pmulhrsw m4, m2
17151 pmulhrsw m5, m2
17152 packuswb m4, m5
17153 vpermq m4, m4, q3120
17154 movu [r0], m4 ; row 0
17155
17156 palignr m4, m3, m6, 2
17157 palignr m5, m6, m9, 7
17158 pshufb m4, m7
17159 pshufb m5, m8
17160 pmaddubsw m4, m0
17161 pmaddubsw m5, m1
17162 pmulhrsw m4, m2
17163 pmulhrsw m5, m2
17164 packuswb m4, m5
17165 vpermq m4, m4, q3120
17166 movu [r0 + r1], m4 ; row 1
17167
17168 palignr m4, m3, m6, 3
17169 palignr m5, m6, m9, 8
17170 pshufb m4, m7
17171 pshufb m5, m8
17172 pmaddubsw m4, m0
17173 pmaddubsw m5, m1
17174 pmulhrsw m4, m2
17175 pmulhrsw m5, m2
17176 packuswb m4, m5
17177 vpermq m4, m4, q3120
17178 movu [r0 + r1 * 2], m4 ; row 2
17179
17180 palignr m4, m3, m6, 4
17181 palignr m5, m6, m9, 9
17182 pshufb m4, m7
17183 pshufb m5, m8
17184 pmaddubsw m4, m0
17185 pmaddubsw m5, m1
17186 pmulhrsw m4, m2
17187 pmulhrsw m5, m2
17188 packuswb m4, m5
17189 vpermq m4, m4, q3120
17190 movu [r0 + r3], m4 ; row 3
17191
17192 lea r0, [r0 + r1 * 4]
17193
17194 palignr m4, m3, m6, 5
17195 palignr m5, m6, m9, 10
17196 pshufb m4, m7
17197 pshufb m5, m8
17198 pmaddubsw m4, m0
17199 pmaddubsw m5, m1
17200 pmulhrsw m4, m2
17201 pmulhrsw m5, m2
17202 packuswb m4, m5
17203 vpermq m4, m4, q3120
17204 movu [r0], m4 ; row 4
17205
17206 palignr m4, m3, m6, 6
17207 palignr m5, m6, m9, 11
17208 pshufb m4, m7
17209 pshufb m5, m8
17210 pmaddubsw m4, m0
17211 pmaddubsw m5, m1
17212 pmulhrsw m4, m2
17213 pmulhrsw m5, m2
17214 packuswb m4, m5
17215 vpermq m4, m4, q3120
17216 movu [r0 + r1], m4 ; row 5
17217
17218 palignr m4, m3, m6, 7
17219 palignr m5, m6, m9, 12
17220 pshufb m4, m7
17221 pshufb m5, m8
17222 pmaddubsw m4, m0
17223 pmaddubsw m5, m1
17224 pmulhrsw m4, m2
17225 pmulhrsw m5, m2
17226 packuswb m4, m5
17227 vpermq m4, m4, q3120
17228 movu [r0 + r1 * 2], m4 ; row 6
17229
17230 palignr m4, m3, m6, 8
17231 palignr m5, m6, m9, 13
17232 pshufb m4, m7
17233 pshufb m5, m8
17234 pmaddubsw m4, m0
17235 pmaddubsw m5, m1
17236 pmulhrsw m4, m2
17237 pmulhrsw m5, m2
17238 packuswb m4, m5
17239 vpermq m4, m4, q3120
17240 movu [r0 + r3], m4 ; row 7
17241
17242 lea r0, [r0 + r1 * 4]
17243
17244 palignr m4, m3, m6, 9
17245 palignr m5, m6, m9, 14
17246 pshufb m4, m7
17247 pshufb m5, m8
17248 pmaddubsw m4, m0
17249 pmaddubsw m5, m1
17250 pmulhrsw m4, m2
17251 pmulhrsw m5, m2
17252 packuswb m4, m5
17253 vpermq m4, m4, q3120
17254 movu [r0], m4 ; row 8
17255
17256 palignr m4, m3, m6, 10
17257 palignr m5, m6, m9, 15
17258 pshufb m4, m7
17259 pshufb m5, m8
17260 pmaddubsw m4, m0
17261 pmaddubsw m5, m1
17262 pmulhrsw m4, m2
17263 pmulhrsw m5, m2
17264 packuswb m4, m5
17265 vpermq m4, m4, q3120
17266 movu [r0 + r1], m4 ; row 9
17267
17268 palignr m4, m3, m6, 11
17269 pshufb m4, m7
17270 pshufb m5, m6, m8 ; m5 offset into m6:m9 has reached 16, so m6 is shuffled directly
17271 pmaddubsw m4, m0
17272 pmaddubsw m5, m1
17273 pmulhrsw m4, m2
17274 pmulhrsw m5, m2
17275 packuswb m4, m5
17276 vpermq m4, m4, q3120
17277 movu [r0 + r1 * 2], m4 ; row 10
17278
; From here the m5 source comes from the m3:m6 pair (offsets 1..).
17279 palignr m4, m3, m6, 12
17280 palignr m5, m3, m6, 1
17281 pshufb m4, m7
17282 pshufb m5, m8
17283 pmaddubsw m4, m0
17284 pmaddubsw m5, m1
17285 pmulhrsw m4, m2
17286 pmulhrsw m5, m2
17287 packuswb m4, m5
17288 vpermq m4, m4, q3120
17289 movu [r0 + r3], m4 ; row 11
17290
17291 lea r0, [r0 + r1 * 4]
17292
17293 palignr m4, m3, m6, 13
17294 palignr m5, m3, m6, 2
17295 pshufb m4, m7
17296 pshufb m5, m8
17297 pmaddubsw m4, m0
17298 pmaddubsw m5, m1
17299 pmulhrsw m4, m2
17300 pmulhrsw m5, m2
17301 packuswb m4, m5
17302 vpermq m4, m4, q3120
17303 movu [r0], m4 ; row 12
17304
17305 palignr m4, m3, m6, 14
17306 palignr m5, m3, m6, 3
17307 pshufb m4, m7
17308 pshufb m5, m8
17309 pmaddubsw m4, m0
17310 pmaddubsw m5, m1
17311 pmulhrsw m4, m2
17312 pmulhrsw m5, m2
17313 packuswb m4, m5
17314 vpermq m4, m4, q3120
17315 movu [r0 + r1], m4 ; row 13
17316
17317 palignr m4, m3, m6, 15
17318 palignr m5, m3, m6, 4
17319 pshufb m4, m7
17320 pshufb m5, m8
17321 pmaddubsw m4, m0
17322 pmaddubsw m5, m1
17323 pmulhrsw m4, m2
17324 pmulhrsw m5, m2
17325 packuswb m4, m5
17326 vpermq m4, m4, q3120
17327 movu [r0 + r1 * 2], m4 ; row 14
17328
17329 palignr m5, m3, m6, 5
17330 pshufb m4, m3, m7 ; m4 offset has reached 16, so m3 is shuffled directly
17331 pshufb m5, m8
17332 pmaddubsw m4, m0
17333 pmaddubsw m5, m1
17334 pmulhrsw m4, m2
17335 pmulhrsw m5, m2
17336 packuswb m4, m5
17337 vpermq m4, m4, q3120
17338 movu [r0 + r3], m4 ; row 15
17339
17340 lea r0, [r0 + r1 * 4]
17341
; m9 is no longer needed as the lowest piece; reload it with the next 16
; reference bytes so the window is now m6 (low), m3 (middle), m9 (high).
17342 vbroadcasti128 m9, [r2 + mmsize*2 + 17]
17343
17344 palignr m4, m9, m3, 1
17345 palignr m5, m3, m6, 6
17346 pshufb m4, m7
17347 pshufb m5, m8
17348 pmaddubsw m4, m0
17349 pmaddubsw m5, m1
17350 pmulhrsw m4, m2
17351 pmulhrsw m5, m2
17352 packuswb m4, m5
17353 vpermq m4, m4, q3120
17354 movu [r0], m4 ; row 16
17355
17356 palignr m4, m9, m3, 2
17357 palignr m5, m3, m6, 7
17358 pshufb m4, m7
17359 pshufb m5, m8
17360 pmaddubsw m4, m0
17361 pmaddubsw m5, m1
17362 pmulhrsw m4, m2
17363 pmulhrsw m5, m2
17364 packuswb m4, m5
17365 vpermq m4, m4, q3120
17366 movu [r0 + r1], m4 ; row 17
17367
17368 palignr m4, m9, m3, 3
17369 palignr m5, m3, m6, 8
17370 pshufb m4, m7
17371 pshufb m5, m8
17372 pmaddubsw m4, m0
17373 pmaddubsw m5, m1
17374 pmulhrsw m4, m2
17375 pmulhrsw m5, m2
17376 packuswb m4, m5
17377 vpermq m4, m4, q3120
17378 movu [r0 + r1 * 2], m4 ; row 18
17379
17380 palignr m4, m9, m3, 4
17381 palignr m5, m3, m6, 9
17382 pshufb m4, m7
17383 pshufb m5, m8
17384 pmaddubsw m4, m0
17385 pmaddubsw m5, m1
17386 pmulhrsw m4, m2
17387 pmulhrsw m5, m2
17388 packuswb m4, m5
17389 vpermq m4, m4, q3120
17390 movu [r0 + r3], m4 ; row 19
17391
17392 lea r0, [r0 + r1 * 4]
17393
17394 palignr m4, m9, m3, 5
17395 palignr m5, m3, m6, 10
17396 pshufb m4, m7
17397 pshufb m5, m8
17398 pmaddubsw m4, m0
17399 pmaddubsw m5, m1
17400 pmulhrsw m4, m2
17401 pmulhrsw m5, m2
17402 packuswb m4, m5
17403 vpermq m4, m4, q3120
17404 movu [r0], m4 ; row 20
17405
17406 palignr m4, m9, m3, 6
17407 palignr m5, m3, m6, 11
17408 pshufb m4, m7
17409 pshufb m5, m8
17410 pmaddubsw m4, m0
17411 pmaddubsw m5, m1
17412 pmulhrsw m4, m2
17413 pmulhrsw m5, m2
17414 packuswb m4, m5
17415 vpermq m4, m4, q3120
17416 movu [r0 + r1], m4 ; row 21
17417
17418 palignr m4, m9, m3, 7
17419 palignr m5, m3, m6, 12
17420 pshufb m4, m7
17421 pshufb m5, m8
17422 pmaddubsw m4, m0
17423 pmaddubsw m5, m1
17424 pmulhrsw m4, m2
17425 pmulhrsw m5, m2
17426 packuswb m4, m5
17427 vpermq m4, m4, q3120
17428 movu [r0 + r1 * 2], m4 ; row 22
17429
17430 palignr m4, m9, m3, 8
17431 palignr m5, m3, m6, 13
17432 pshufb m4, m7
17433 pshufb m5, m8
17434 pmaddubsw m4, m0
17435 pmaddubsw m5, m1
17436 pmulhrsw m4, m2
17437 pmulhrsw m5, m2
17438 packuswb m4, m5
17439 vpermq m4, m4, q3120
17440 movu [r0 + r3], m4 ; row 23
17441
17442 lea r0, [r0 + r1 * 4]
17443
17444 palignr m4, m9, m3, 9
17445 palignr m5, m3, m6, 14
17446 pshufb m4, m7
17447 pshufb m5, m8
17448 pmaddubsw m4, m0
17449 pmaddubsw m5, m1
17450 pmulhrsw m4, m2
17451 pmulhrsw m5, m2
17452 packuswb m4, m5
17453 vpermq m4, m4, q3120
17454 movu [r0], m4 ; row 24
17455
17456 palignr m4, m9, m3, 10
17457 palignr m5, m3, m6, 15
17458 pshufb m4, m7
17459 pshufb m5, m8
17460 pmaddubsw m4, m0
17461 pmaddubsw m5, m1
17462 pmulhrsw m4, m2
17463 pmulhrsw m5, m2
17464 packuswb m4, m5
17465 vpermq m4, m4, q3120
17466 movu [r0 + r1], m4 ; row 25
17467
17468 palignr m4, m9, m3, 11
17469 pshufb m4, m7
17470 pshufb m5, m3, m8 ; m5 offset into m3:m6 has reached 16, so m3 is shuffled directly
17471 pmaddubsw m4, m0
17472 pmaddubsw m5, m1
17473 pmulhrsw m4, m2
17474 pmulhrsw m5, m2
17475 packuswb m4, m5
17476 vpermq m4, m4, q3120
17477 movu [r0 + r1 * 2], m4 ; row 26
17478
17479 palignr m4, m9, m3, 12
17480 palignr m5, m9, m3, 1
17481 pshufb m4, m7
17482 pshufb m5, m8
17483 pmaddubsw m4, m0
17484 pmaddubsw m5, m1
17485 pmulhrsw m4, m2
17486 pmulhrsw m5, m2
17487 packuswb m4, m5
17488 vpermq m4, m4, q3120
17489 movu [r0 + r3], m4 ; row 27
17490
17491 lea r0, [r0 + r1 * 4]
17492
17493 palignr m4, m9, m3, 13
17494 palignr m5, m9, m3, 2
17495 pshufb m4, m7
17496 pshufb m5, m8
17497 pmaddubsw m4, m0
17498 pmaddubsw m5, m1
17499 pmulhrsw m4, m2
17500 pmulhrsw m5, m2
17501 packuswb m4, m5
17502 vpermq m4, m4, q3120
17503 movu [r0], m4 ; row 28
17504
17505 palignr m4, m9, m3, 14
17506 palignr m5, m9, m3, 3
17507 pshufb m4, m7
17508 pshufb m5, m8
17509 pmaddubsw m4, m0
17510 pmaddubsw m5, m1
17511 pmulhrsw m4, m2
17512 pmulhrsw m5, m2
17513 packuswb m4, m5
17514 vpermq m4, m4, q3120
17515 movu [r0 + r1], m4 ; row 29
17516
17517 palignr m4, m9, m3, 15
17518 palignr m5, m9, m3, 4
17519 pshufb m4, m7
17520 pshufb m5, m8
17521 pmaddubsw m4, m0
17522 pmaddubsw m5, m1
17523 pmulhrsw m4, m2
17524 pmulhrsw m5, m2
17525 packuswb m4, m5
17526 vpermq m4, m4, q3120
17527 movu [r0 + r1 * 2], m4 ; row 30
17528
17529 palignr m5, m9, m3, 5
17530 pshufb m4, m9, m7 ; m4 offset has reached 16, so m9 is shuffled directly
17531 pshufb m5, m8
17532 pmaddubsw m4, m0
17533 pmaddubsw m5, m1
17534 pmulhrsw m4, m2
17535 pmulhrsw m5, m2
17536 packuswb m4, m5
17537 vpermq m4, m4, q3120
17538 movu [r0 + r3], m4 ; row 31
17539 RET
17540
;-----------------------------------------------------------------------------
; intra_pred_ang32_20
; Writes 32 output rows, one full vector register per row, to r0.
; cglobal declares 3 arguments, 5 general-purpose and 10 vector registers.
;   r0 = destination pointer (advanced by 4 * r1 after every four rows)
;   r1 = destination stride in bytes
;   r2 = reference sample buffer (bytes)
;   r3 = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table entry 16 + k;
;        the bracketed number in each row comment is that entry index
;   r4 = 3 * r1 (scratch, address of every fourth row)
; Vector register roles:
;   m5         = pw_1024, pmulhrsw factor (presumably a rounded >> 5 - confirm)
;   m0, m2     = low / high byte interleave of [r2] and [r2 + 1]
;   m3, m8, m9 = successive continuations of that pair stream below m0, built
;                from the bytes at r2 + mmsize*2 after shuffling with the
;                ang32_shuf_mode20 masks
;   m6, m7     = palignr windows for the current row(s); reused by consecutive
;                rows that need the same byte offset
;   m4, m1     = per-row temporaries; packuswb m4, m1 forms the stored row
; The window steps down through (m2:m0:m3), then (m0:m3:m8), then (m3:m8:m9)
; as the rows progress.
; NOTE(review): presumably the AVX2 mode-20 predictor for 32x32 blocks -
; confirm the prototype against the C declaration.
;-----------------------------------------------------------------------------
17541 cglobal intra_pred_ang32_20, 3,5,10
17542 lea r3, [ang_table_avx2 + 32 * 16] ; bias the table pointer so offsets -16..+15 reach entries 0..31
17543 lea r4, [r1 * 3] ; r4 = 3 * stride
17544 mova m5, [pw_1024]
17545
17546 ; rows 0 to 7
17547 movu m0, [r2 + 0]
17548 movu m1, [r2 + 1]
17549 punpckhbw m2, m0, m1 ; m2 = (ref[i], ref[i+1]) byte pairs, high half of each lane
17550 punpcklbw m0, m1 ; m0 = (ref[i], ref[i+1]) byte pairs, low half of each lane
17551
17552 movu m4, [r2 + mmsize*2]
17553 pshufb m4, [ang32_shuf_mode20] ; mask defined elsewhere; first stage of arranging the extension bytes
17554 mova m9, m4
17555 vpermq m9, m9, q3333 ; broadcast quadword 3 of the shuffled bytes
17556 mova m7, m4
17557 vpermq m7, m7, q1111 ; broadcast quadword 1 of the shuffled bytes (m7 is a temporary here)
17558 palignr m4, m7, 14
17559 pshufb m4, [ang32_shuf_mode20 + mmsize*1] ; second-stage shuffle
17560
17561 vextracti128 xm6, m4, 1 ; xm6 = upper 128 bits of the arranged bytes
17562 palignr m3, m0, m4, 1
17563 palignr m8, m3, m6, 1
17564 vinserti128 m3, m3, xm2, 1 ; upper lane of m3 = low lane of m2
17565 vinserti128 m8, m8, xm0, 1 ; upper lane of m8 = low lane of m0
17566 vinserti128 m9, m9, xm3, 1 ; upper lane of m9 = low lane of m3
17567
17568 pmaddubsw m4, m0, [r3 - 5 * 32] ; [11]
17569 pmulhrsw m4, m5
17570 pmaddubsw m1, m2, [r3 - 5 * 32]
17571 pmulhrsw m1, m5
17572 packuswb m4, m1
17573 movu [r0], m4
17574
17575 palignr m6, m0, m3, 14
17576 palignr m7, m2, m0, 14
17577 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
17578 pmulhrsw m4, m5
17579 pmaddubsw m1, m7, [r3 + 6 * 32]
17580 pmulhrsw m1, m5
17581 packuswb m4, m1
17582 movu [r0 + r1], m4
17583
17584 pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
17585 pmulhrsw m4, m5
17586 pmaddubsw m1, m7, [r3 - 15 * 32]
17587 pmulhrsw m1, m5
17588 packuswb m4, m1
17589 movu [r0 + r1*2], m4
17590
17591 palignr m6, m0, m3, 12
17592 palignr m7, m2, m0, 12
17593 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
17594 pmulhrsw m4, m5
17595 pmaddubsw m1, m7, [r3 - 4 * 32]
17596 pmulhrsw m1, m5
17597 packuswb m4, m1
17598 movu [r0 + r4], m4
17599
17600 lea r0, [r0 + r1 * 4]
17601
17602 palignr m6, m0, m3, 10
17603 palignr m7, m2, m0, 10
17604 pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
17605 pmulhrsw m4, m5
17606 pmaddubsw m1, m7, [r3 + 7 * 32]
17607 pmulhrsw m1, m5
17608 packuswb m4, m1
17609 movu [r0], m4
17610
17611 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
17612 pmulhrsw m4, m5
17613 pmaddubsw m1, m7, [r3 - 14 * 32]
17614 pmulhrsw m1, m5
17615 packuswb m4, m1
17616 movu [r0 + r1], m4
17617
17618 palignr m6, m0, m3, 8
17619 palignr m7, m2, m0, 8
17620 pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
17621 pmulhrsw m4, m5
17622 pmaddubsw m1, m7, [r3 - 3 * 32]
17623 pmulhrsw m1, m5
17624 packuswb m4, m1
17625 movu [r0 + r1*2], m4
17626
17627 palignr m6, m0, m3, 6
17628 palignr m7, m2, m0, 6
17629 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
17630 pmulhrsw m4, m5
17631 pmaddubsw m1, m7, [r3 + 8 * 32]
17632 pmulhrsw m1, m5
17633 packuswb m4, m1
17634 movu [r0 + r4], m4
17635
17636 lea r0, [r0 + r1 * 4]
17637
17638 ; rows 8 to 15
17639 pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
17640 pmulhrsw m4, m5
17641 pmaddubsw m1, m7, [r3 - 13 * 32]
17642 pmulhrsw m1, m5
17643 packuswb m4, m1
17644 movu [r0], m4
17645
17646 palignr m6, m0, m3, 4
17647 palignr m7, m2, m0, 4
17648 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
17649 pmulhrsw m4, m5
17650 pmaddubsw m1, m7, [r3 - 2 * 32]
17651 pmulhrsw m1, m5
17652 packuswb m4, m1
17653 movu [r0 + r1], m4
17654
17655 palignr m6, m0, m3, 2
17656 palignr m7, m2, m0, 2
17657 pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
17658 pmulhrsw m4, m5
17659 pmaddubsw m1, m7, [r3 + 9 * 32]
17660 pmulhrsw m1, m5
17661 packuswb m4, m1
17662 movu [r0 + r1*2], m4
17663
17664 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
17665 pmulhrsw m4, m5
17666 pmaddubsw m1, m7, [r3 - 12 * 32]
17667 pmulhrsw m1, m5
17668 packuswb m4, m1
17669 movu [r0 + r4], m4
17670
17671 lea r0, [r0 + r1 * 4]
17672
; The window has moved a full 16 bytes: m3/m0 now play the role m0/m2 had.
17673 pmaddubsw m4, m3, [r3 - 1 * 32] ; [15]
17674 pmulhrsw m4, m5
17675 pmaddubsw m1, m0, [r3 - 1 * 32]
17676 pmulhrsw m1, m5
17677 packuswb m4, m1
17678 movu [r0], m4
17679
17680 palignr m6, m3, m8, 14
17681 palignr m7, m0, m3, 14
17682 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
17683 pmulhrsw m4, m5
17684 pmaddubsw m1, m7, [r3 + 10 * 32]
17685 pmulhrsw m1, m5
17686 packuswb m4, m1
17687 movu [r0 + r1], m4
17688
17689 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
17690 pmulhrsw m4, m5
17691 pmaddubsw m1, m7, [r3 - 11 * 32]
17692 pmulhrsw m1, m5
17693 packuswb m4, m1
17694 movu [r0 + r1 * 2], m4
17695
17696 palignr m6, m3, m8, 12
17697 palignr m7, m0, m3, 12
17698 pmaddubsw m4, m6, [r3] ; [16]
17699 pmulhrsw m4, m5
17700 pmaddubsw m1, m7, [r3]
17701 pmulhrsw m1, m5
17702 packuswb m4, m1
17703 movu [r0 + r4], m4
17704
17705 lea r0, [r0 + r1 * 4]
17706
17707 ; rows 16 to 23
17708 palignr m6, m3, m8, 10
17709 palignr m7, m0, m3, 10
17710 pmaddubsw m4, m6, [r3 + 11 * 32] ; [27]
17711 pmulhrsw m4, m5
17712 pmaddubsw m1, m7, [r3 + 11 * 32]
17713 pmulhrsw m1, m5
17714 packuswb m4, m1
17715 movu [r0], m4
17716
17717 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
17718 pmulhrsw m4, m5
17719 pmaddubsw m1, m7, [r3 - 10 * 32]
17720 pmulhrsw m1, m5
17721 packuswb m4, m1
17722 movu [r0 + r1], m4
17723
17724 palignr m6, m3, m8, 8
17725 palignr m7, m0, m3, 8
17726 pmaddubsw m4, m6, [r3 + 1 * 32] ; [17]
17727 pmulhrsw m4, m5
17728 pmaddubsw m1, m7, [r3 + 1 * 32]
17729 pmulhrsw m1, m5
17730 packuswb m4, m1
17731 movu [r0 + r1*2], m4
17732
17733 palignr m6, m3, m8, 6
17734 palignr m7, m0, m3, 6
17735 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
17736 pmulhrsw m4, m5
17737 pmaddubsw m1, m7, [r3 + 12 * 32]
17738 pmulhrsw m1, m5
17739 packuswb m4, m1
17740 movu [r0 + r4], m4
17741
17742 lea r0, [r0 + r1 * 4]
17743
17744 pmaddubsw m4, m6, [r3 - 9 * 32] ; [7]
17745 pmulhrsw m4, m5
17746 pmaddubsw m1, m7, [r3 - 9 * 32]
17747 pmulhrsw m1, m5
17748 packuswb m4, m1
17749 movu [r0], m4
17750
17751 palignr m6, m3, m8, 4
17752 palignr m7, m0, m3, 4
17753 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
17754 pmulhrsw m4, m5
17755 pmaddubsw m1, m7, [r3 + 2 * 32]
17756 pmulhrsw m1, m5
17757 packuswb m4, m1
17758 movu [r0 + r1], m4
17759
17760 palignr m6, m3, m8, 2
17761 palignr m7, m0, m3, 2
17762 pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
17763 pmulhrsw m4, m5
17764 pmaddubsw m1, m7, [r3 + 13 * 32]
17765 pmulhrsw m1, m5
17766 packuswb m4, m1
17767 movu [r0 + r1*2], m4
17768
17769 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
17770 pmulhrsw m4, m5
17771 pmaddubsw m1, m7, [r3 - 8 * 32]
17772 pmulhrsw m1, m5
17773 packuswb m4, m1
17774 movu [r0 + r4], m4
17775
17776 lea r0, [r0 + r1 * 4]
17777
17778 ; rows 24 to 31
; The window has moved another 16 bytes: m8/m3 are now used directly and the
; remaining rows read from m8:m9 and m3:m8.
17779 pmaddubsw m4, m8, [r3 + 3 * 32] ; [19]
17780 pmulhrsw m4, m5
17781 pmaddubsw m1, m3, [r3 + 3 * 32]
17782 pmulhrsw m1, m5
17783 packuswb m4, m1
17784 movu [r0], m4
17785
17786 palignr m6, m8, m9, 14
17787 palignr m7, m3, m8, 14
17788 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
17789 pmulhrsw m4, m5
17790 pmaddubsw m1, m7, [r3 + 14 * 32]
17791 pmulhrsw m1, m5
17792 packuswb m4, m1
17793 movu [r0 + r1], m4
17794
17795 pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
17796 pmulhrsw m4, m5
17797 pmaddubsw m1, m7, [r3 - 7 * 32]
17798 pmulhrsw m1, m5
17799 packuswb m4, m1
17800 movu [r0 + r1 * 2], m4
17801
17802 palignr m6, m8, m9, 12
17803 palignr m7, m3, m8, 12
17804 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
17805 pmulhrsw m4, m5
17806 pmaddubsw m1, m7, [r3 + 4 * 32]
17807 pmulhrsw m1, m5
17808 packuswb m4, m1
17809 movu [r0 + r4], m4
17810
17811 lea r0, [r0 + r1 * 4]
17812
17813 palignr m6, m8, m9, 10
17814 palignr m7, m3, m8, 10
17815 pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
17816 pmulhrsw m4, m5
17817 pmaddubsw m1, m7, [r3 + 15 * 32]
17818 pmulhrsw m1, m5
17819 packuswb m4, m1
17820 movu [r0], m4
17821
17822 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
17823 pmulhrsw m4, m5
17824 pmaddubsw m1, m7, [r3 - 6 * 32]
17825 pmulhrsw m1, m5
17826 packuswb m4, m1
17827 movu [r0 + r1], m4
17828
17829 palignr m6, m8, m9, 8
17830 palignr m7, m3, m8, 8
17831 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
17832 pmulhrsw m4, m5
17833 pmaddubsw m1, m7, [r3 + 5 * 32]
17834 pmulhrsw m1, m5
17835 packuswb m4, m1
17836 movu [r0 + r1*2], m4
17837
; Row 31 reuses the row-30 windows without weighting: masking each word with
; pw_00ff (presumably 0x00ff words - confirm) keeps the first byte of every
; pair, and packuswb narrows those words back to bytes.
17838 pand m6, [pw_00ff]
17839 pand m7, [pw_00ff]
17840 packuswb m6, m7
17841 movu [r0 + r4], m6
17842 RET
17843
17844 cglobal intra_pred_ang32_17, 3,4,8
; intra_pred_ang32_17: writes 32 rows of 32 bytes each (angular intra prediction, mode 17 per the symbol name).
; Register use as seen in the body:
;   r0 = destination pointer (advanced by 4 rows after every group of four stores)
;   r1 = destination stride, r3 = 3 * stride
;   r2 = source/reference pixel pointer
;   m0 = pmaddubsw coefficients, m2 = pmulhrsw multiplier, m7 = per-row byte shuffle
;        (all three are loaded once and only read afterwards)
;   m1/m3/m6 = source vectors from which palignr cuts a window for each row
;   m4/m5 = per-row temporaries
; Every row follows the same recipe: two palignr windows -> pshufb m7 -> pmaddubsw m0 ->
; pmulhrsw m2 -> packuswb -> vpermq q3120 -> 32-byte store.  From one row to the next both
; palignr byte counts grow by one; when a count would reach 16 the window moves on to the next
; pair of source vectors.
17845 movu m0, [ang32_fact_mode17]
17846 mova m2, [pw_1024] ; presumably words of 1024, which makes pmulhrsw a rounded >> 5 -- confirm against the constant
17847 mova m7, [ang32_shuf_mode17]
17848 lea r3, [r1 * 3]
17849
17850 ; prepare for [31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2...]
17851
17852 movu m6, [r2]
17853 pshufb m6, [ang32_shuf_mode17 + mmsize] ; second 32-byte entry of the mode-17 shuffle table
17854 mova m1, m6 ; keep a copy of the shuffled bytes before m6 is permuted
17855 mova m3, [ang32_shuf_mode16 + mmsize*3] ; dword permutation mask, borrowed from the mode-16 table
17856 vpermd m6, m3, m6
17857 vpermq m1, m1, q3232 ; duplicate the upper 128 bits into both lanes
17858 pslldq m1, 4
17859
17860 movu xm4, [r2 + mmsize*2] ; 16 bytes starting at r2 + 64
17861 pinsrb xm4, [r2], 0 ; replace byte 0 with the byte at [r2]
17862 vinserti128 m3, m4, xm4, 1 ; m3 = those 16 bytes in both lanes
17863
17864 palignr m4, m3, m6, 2
17865 palignr m5, m6, m1, 5
17866 pshufb m4, m7
17867 pshufb m5, m7
17868 pmaddubsw m4, m0
17869 pmaddubsw m5, m0
17870 pmulhrsw m4, m2
17871 pmulhrsw m5, m2
17872 packuswb m4, m5
17873 vpermq m4, m4, q3120 ; packuswb works per 128-bit lane; reorder the qwords before the store
17874 movu [r0], m4
17875
17876 palignr m4, m3, m6, 3
17877 palignr m5, m6, m1, 6
17878 pshufb m4, m7
17879 pshufb m5, m7
17880 pmaddubsw m4, m0
17881 pmaddubsw m5, m0
17882 pmulhrsw m4, m2
17883 pmulhrsw m5, m2
17884 packuswb m4, m5
17885 vpermq m4, m4, q3120
17886 movu [r0 + r1], m4
17887
17888 palignr m4, m3, m6, 4
17889 palignr m5, m6, m1, 7
17890 pshufb m4, m7
17891 pshufb m5, m7
17892 pmaddubsw m4, m0
17893 pmaddubsw m5, m0
17894 pmulhrsw m4, m2
17895 pmulhrsw m5, m2
17896 packuswb m4, m5
17897 vpermq m4, m4, q3120
17898 movu [r0 + r1 * 2], m4
17899
17900 palignr m4, m3, m6, 5
17901 palignr m5, m6, m1, 8
17902 pshufb m4, m7
17903 pshufb m5, m7
17904 pmaddubsw m4, m0
17905 pmaddubsw m5, m0
17906 pmulhrsw m4, m2
17907 pmulhrsw m5, m2
17908 packuswb m4, m5
17909 vpermq m4, m4, q3120
17910 movu [r0 + r3], m4
17911
17912 lea r0, [r0 + r1 * 4] ; advance to rows 4..7
17913
17914 palignr m4, m3, m6, 6
17915 palignr m5, m6, m1, 9
17916 pshufb m4, m7
17917 pshufb m5, m7
17918 pmaddubsw m4, m0
17919 pmaddubsw m5, m0
17920 pmulhrsw m4, m2
17921 pmulhrsw m5, m2
17922 packuswb m4, m5
17923 vpermq m4, m4, q3120
17924 movu [r0], m4
17925
17926 palignr m4, m3, m6, 7
17927 palignr m5, m6, m1, 10
17928 pshufb m4, m7
17929 pshufb m5, m7
17930 pmaddubsw m4, m0
17931 pmaddubsw m5, m0
17932 pmulhrsw m4, m2
17933 pmulhrsw m5, m2
17934 packuswb m4, m5
17935 vpermq m4, m4, q3120
17936 movu [r0 + r1], m4
17937
17938 palignr m4, m3, m6, 8
17939 palignr m5, m6, m1, 11
17940 pshufb m4, m7
17941 pshufb m5, m7
17942 pmaddubsw m4, m0
17943 pmaddubsw m5, m0
17944 pmulhrsw m4, m2
17945 pmulhrsw m5, m2
17946 packuswb m4, m5
17947 vpermq m4, m4, q3120
17948 movu [r0 + r1 * 2], m4
17949
17950 palignr m4, m3, m6, 9
17951 palignr m5, m6, m1, 12
17952 pshufb m4, m7
17953 pshufb m5, m7
17954 pmaddubsw m4, m0
17955 pmaddubsw m5, m0
17956 pmulhrsw m4, m2
17957 pmulhrsw m5, m2
17958 packuswb m4, m5
17959 vpermq m4, m4, q3120
17960 movu [r0 + r3], m4
17961
17962 lea r0, [r0 + r1 * 4] ; advance to rows 8..11
17963
17964 palignr m4, m3, m6, 10
17965 palignr m5, m6, m1, 13
17966 pshufb m4, m7
17967 pshufb m5, m7
17968 pmaddubsw m4, m0
17969 pmaddubsw m5, m0
17970 pmulhrsw m4, m2
17971 pmulhrsw m5, m2
17972 packuswb m4, m5
17973 vpermq m4, m4, q3120
17974 movu [r0], m4
17975
17976 palignr m4, m3, m6, 11
17977 palignr m5, m6, m1, 14
17978 pshufb m4, m7
17979 pshufb m5, m7
17980 pmaddubsw m4, m0
17981 pmaddubsw m5, m0
17982 pmulhrsw m4, m2
17983 pmulhrsw m5, m2
17984 packuswb m4, m5
17985 vpermq m4, m4, q3120
17986 movu [r0 + r1], m4
17987
17988 palignr m4, m3, m6, 12
17989 palignr m5, m6, m1, 15
17990 pshufb m4, m7
17991 pshufb m5, m7
17992 pmaddubsw m4, m0
17993 pmaddubsw m5, m0
17994 pmulhrsw m4, m2
17995 pmulhrsw m5, m2
17996 packuswb m4, m5
17997 vpermq m4, m4, q3120
17998 movu [r0 + r1 * 2], m4
17999
18000 palignr m4, m3, m6, 13
18001 pshufb m4, m7
18002 pshufb m5, m6, m7 ; the m6:m1 window has reached a shift of 16, i.e. m6 itself
18003 pmaddubsw m4, m0
18004 pmaddubsw m5, m0
18005 pmulhrsw m4, m2
18006 pmulhrsw m5, m2
18007 packuswb m4, m5
18008 vpermq m4, m4, q3120
18009 movu [r0 + r3], m4
18010
18011 lea r0, [r0 + r1 * 4] ; advance to rows 12..15
18012
18013 palignr m4, m3, m6, 14
18014 palignr m5, m3, m6, 1 ; second window now slides over m3:m6
18015 pshufb m4, m7
18016 pshufb m5, m7
18017 pmaddubsw m4, m0
18018 pmaddubsw m5, m0
18019 pmulhrsw m4, m2
18020 pmulhrsw m5, m2
18021 packuswb m4, m5
18022 vpermq m4, m4, q3120
18023 movu [r0], m4
18024
18025 palignr m4, m3, m6, 15
18026 palignr m5, m3, m6, 2
18027 pshufb m4, m7
18028 pshufb m5, m7
18029 pmaddubsw m4, m0
18030 pmaddubsw m5, m0
18031 pmulhrsw m4, m2
18032 pmulhrsw m5, m2
18033 packuswb m4, m5
18034 vpermq m4, m4, q3120
18035 movu [r0 + r1], m4
18036
18037 palignr m5, m3, m6, 3
18038 pshufb m4, m3, m7 ; the m3:m6 window has reached a shift of 16, i.e. m3 itself
18039 pshufb m5, m7
18040 pmaddubsw m4, m0
18041 pmaddubsw m5, m0
18042 pmulhrsw m4, m2
18043 pmulhrsw m5, m2
18044 packuswb m4, m5
18045 vpermq m4, m4, q3120
18046 movu [r0 + r1 * 2], m4
18047
18048 vbroadcasti128 m1, [r2 + mmsize*2 + 16] ; m1 is free now: reload it with the 16 bytes at r2 + 80 (both lanes)
18049 palignr m4, m1, m3, 1 ; first window now slides over m1:m3
18050 palignr m5, m3, m6, 4
18051 pshufb m4, m7
18052 pshufb m5, m7
18053 pmaddubsw m4, m0
18054 pmaddubsw m5, m0
18055 pmulhrsw m4, m2
18056 pmulhrsw m5, m2
18057 packuswb m4, m5
18058 vpermq m4, m4, q3120
18059 movu [r0 + r3], m4
18060
18061 lea r0, [r0 + r1 * 4] ; advance to rows 16..19
18062
18063 palignr m4, m1, m3, 2
18064 palignr m5, m3, m6, 5
18065 pshufb m4, m7
18066 pshufb m5, m7
18067 pmaddubsw m4, m0
18068 pmaddubsw m5, m0
18069 pmulhrsw m4, m2
18070 pmulhrsw m5, m2
18071 packuswb m4, m5
18072 vpermq m4, m4, q3120
18073 movu [r0], m4
18074
18075 palignr m4, m1, m3, 3
18076 palignr m5, m3, m6, 6
18077 pshufb m4, m7
18078 pshufb m5, m7
18079 pmaddubsw m4, m0
18080 pmaddubsw m5, m0
18081 pmulhrsw m4, m2
18082 pmulhrsw m5, m2
18083 packuswb m4, m5
18084 vpermq m4, m4, q3120
18085 movu [r0 + r1], m4
18086
18087 palignr m4, m1, m3, 4
18088 palignr m5, m3, m6, 7
18089 pshufb m4, m7
18090 pshufb m5, m7
18091 pmaddubsw m4, m0
18092 pmaddubsw m5, m0
18093 pmulhrsw m4, m2
18094 pmulhrsw m5, m2
18095 packuswb m4, m5
18096 vpermq m4, m4, q3120
18097 movu [r0 + r1 * 2], m4
18098
18099 palignr m4, m1, m3, 5
18100 palignr m5, m3, m6, 8
18101 pshufb m4, m7
18102 pshufb m5, m7
18103 pmaddubsw m4, m0
18104 pmaddubsw m5, m0
18105 pmulhrsw m4, m2
18106 pmulhrsw m5, m2
18107 packuswb m4, m5
18108 vpermq m4, m4, q3120
18109 movu [r0 + r3], m4
18110
18111 lea r0, [r0 + r1 * 4] ; advance to rows 20..23
18112
18113 palignr m4, m1, m3, 6
18114 palignr m5, m3, m6, 9
18115 pshufb m4, m7
18116 pshufb m5, m7
18117 pmaddubsw m4, m0
18118 pmaddubsw m5, m0
18119 pmulhrsw m4, m2
18120 pmulhrsw m5, m2
18121 packuswb m4, m5
18122 vpermq m4, m4, q3120
18123 movu [r0], m4
18124
18125 palignr m4, m1, m3, 7
18126 palignr m5, m3, m6, 10
18127 pshufb m4, m7
18128 pshufb m5, m7
18129 pmaddubsw m4, m0
18130 pmaddubsw m5, m0
18131 pmulhrsw m4, m2
18132 pmulhrsw m5, m2
18133 packuswb m4, m5
18134 vpermq m4, m4, q3120
18135 movu [r0 + r1], m4
18136
18137 palignr m4, m1, m3, 8
18138 palignr m5, m3, m6, 11
18139 pshufb m4, m7
18140 pshufb m5, m7
18141 pmaddubsw m4, m0
18142 pmaddubsw m5, m0
18143 pmulhrsw m4, m2
18144 pmulhrsw m5, m2
18145 packuswb m4, m5
18146 vpermq m4, m4, q3120
18147 movu [r0 + r1 * 2], m4
18148
18149 palignr m4, m1, m3, 9
18150 palignr m5, m3, m6, 12
18151 pshufb m4, m7
18152 pshufb m5, m7
18153 pmaddubsw m4, m0
18154 pmaddubsw m5, m0
18155 pmulhrsw m4, m2
18156 pmulhrsw m5, m2
18157 packuswb m4, m5
18158 vpermq m4, m4, q3120
18159 movu [r0 + r3], m4
18160
18161 lea r0, [r0 + r1 * 4] ; advance to rows 24..27
18162
18163 palignr m4, m1, m3, 10
18164 palignr m5, m3, m6, 13
18165 pshufb m4, m7
18166 pshufb m5, m7
18167 pmaddubsw m4, m0
18168 pmaddubsw m5, m0
18169 pmulhrsw m4, m2
18170 pmulhrsw m5, m2
18171 packuswb m4, m5
18172 vpermq m4, m4, q3120
18173 movu [r0], m4
18174
18175 palignr m4, m1, m3, 11
18176 palignr m5, m3, m6, 14
18177 pshufb m4, m7
18178 pshufb m5, m7
18179 pmaddubsw m4, m0
18180 pmaddubsw m5, m0
18181 pmulhrsw m4, m2
18182 pmulhrsw m5, m2
18183 packuswb m4, m5
18184 vpermq m4, m4, q3120
18185 movu [r0 + r1], m4
18186
18187 palignr m4, m1, m3, 12
18188 palignr m5, m3, m6, 15
18189 pshufb m4, m7
18190 pshufb m5, m7
18191 pmaddubsw m4, m0
18192 pmaddubsw m5, m0
18193 pmulhrsw m4, m2
18194 pmulhrsw m5, m2
18195 packuswb m4, m5
18196 vpermq m4, m4, q3120
18197 movu [r0 + r1 * 2], m4
18198
18199 palignr m4, m1, m3, 13
18200 pshufb m4, m7
18201 pshufb m5, m3, m7 ; second window has reached m3 itself
18202 pmaddubsw m4, m0
18203 pmaddubsw m5, m0
18204 pmulhrsw m4, m2
18205 pmulhrsw m5, m2
18206 packuswb m4, m5
18207 vpermq m4, m4, q3120
18208 movu [r0 + r3], m4
18209
18210 lea r0, [r0 + r1 * 4] ; advance to rows 28..31
18211
18212 palignr m4, m1, m3, 14
18213 palignr m5, m1, m3, 1
18214 pshufb m4, m7
18215 pshufb m5, m7
18216 pmaddubsw m4, m0
18217 pmaddubsw m5, m0
18218 pmulhrsw m4, m2
18219 pmulhrsw m5, m2
18220 packuswb m4, m5
18221 vpermq m4, m4, q3120
18222 movu [r0], m4
18223
18224 palignr m4, m1, m3, 15
18225 palignr m5, m1, m3, 2
18226 pshufb m4, m7
18227 pshufb m5, m7
18228 pmaddubsw m4, m0
18229 pmaddubsw m5, m0
18230 pmulhrsw m4, m2
18231 pmulhrsw m5, m2
18232 packuswb m4, m5
18233 vpermq m4, m4, q3120
18234 movu [r0 + r1], m4
18235
18236 vbroadcasti128 m6, [r2 + mmsize*2 + mmsize] ; m6 is free now: reload it with the 16 bytes at r2 + 96 (both lanes)
18237 palignr m5, m1, m3, 3
18238 pshufb m4, m1, m7 ; first window has reached m1 itself
18239 pshufb m5, m7
18240 pmaddubsw m4, m0
18241 pmaddubsw m5, m0
18242 pmulhrsw m4, m2
18243 pmulhrsw m5, m2
18244 packuswb m4, m5
18245 vpermq m4, m4, q3120
18246 movu [r0 + r1 * 2], m4
18247
18248 palignr m4, m6, m1, 1
18249 palignr m5, m1, m3, 4
18250 pshufb m4, m7
18251 pshufb m5, m7
18252 pmaddubsw m4, m0
18253 pmaddubsw m5, m0
18254 pmulhrsw m4, m2
18255 pmulhrsw m5, m2
18256 packuswb m4, m5
18257 vpermq m4, m4, q3120
18258 movu [r0 + r3], m4
18259 RET
18260
18261 cglobal intra_pred_ang32_19, 3,5,10
; intra_pred_ang32_19: writes 32 rows of 32 bytes each (angular intra prediction, mode 19 per the symbol name).
; Register use as seen in the body:
;   r0 = destination pointer (advanced by 4 rows after every group of four stores)
;   r1 = destination stride, r4 = 3 * stride
;   r2 = source/reference pixel pointer
;   r3 = ang_table_avx2 + 16 entries of 32 bytes, so entry n is addressed as [r3 + (n - 16) * 32];
;        the bracketed number in each row comment is that n
;   m5 = pmulhrsw multiplier
;   m0/m2/m3/m8/m9 = interleaved source vectors; m6/m7 = palignr windows cut from them
;   m4/m1 = per-row results (left and right half before packuswb)
; Each row is: pmaddubsw with one coefficient entry -> pmulhrsw m5 -> packuswb -> 32-byte store.
; Rows that reuse the previous row's m6/m7 only change the coefficient entry.
18262 lea r3, [ang_table_avx2 + 32 * 16]
18263 lea r4, [r1 * 3]
18264 mova m5, [pw_1024] ; presumably words of 1024, which makes pmulhrsw a rounded >> 5 -- confirm against the constant
18265
18266 ; rows 0 to 7
18267 movu m0, [r2 + 0]
18268 movu m1, [r2 + 1]
18269 punpckhbw m2, m0, m1 ; interleave byte i with byte i+1: the pairs pmaddubsw multiplies and adds
18270 punpcklbw m0, m1
18271
18272 movu m4, [r2 + mmsize*2] ; 32 bytes starting at r2 + 64
18273 pshufb m4, [ang32_shuf_mode17 + mmsize*1] ; byte shuffle borrowed from the mode-17 table
18274 mova m3, [ang32_shuf_mode19 + mmsize*1]
18275 mova m6, [ang32_shuf_mode19 + mmsize*2]
18276 mova m9, m4
18277 vpermd m4, m3, m4 ; two different dword permutations of the same shuffled bytes
18278 vpermd m9, m6, m9
18279 pshufb m4, [ang32_shuf_mode19]
18280 pshufb m9, [ang32_shuf_mode19]
18281
18282 vextracti128 xm6, m4, 1
18283 palignr m3, m0, m4, 1
18284 palignr m8, m3, m6, 1
18285 palignr m7, m8, m9, 1
18286 vinserti128 m3, m3, xm2, 1 ; m3, m8, m9: the source vectors the later rows slide their windows over
18287 vinserti128 m8, m8, xm0, 1
18288 vinserti128 m9, m7, xm3, 1
18289
18290 pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
18291 pmulhrsw m4, m5
18292 pmaddubsw m1, m2, [r3 - 10 * 32]
18293 pmulhrsw m1, m5
18294 packuswb m4, m1
18295 movu [r0], m4
18296
18297 palignr m6, m0, m3, 14
18298 palignr m7, m2, m0, 14
18299 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
18300 pmulhrsw m4, m5
18301 pmaddubsw m1, m7, [r3 - 4 * 32]
18302 pmulhrsw m1, m5
18303 packuswb m4, m1
18304 movu [r0 + r1], m4
18305
18306 palignr m6, m0, m3, 12
18307 palignr m7, m2, m0, 12
18308 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
18309 pmulhrsw m4, m5
18310 pmaddubsw m1, m7, [r3 + 2 * 32]
18311 pmulhrsw m1, m5
18312 packuswb m4, m1
18313 movu [r0 + r1*2], m4
18314
18315 palignr m6, m0, m3, 10
18316 palignr m7, m2, m0, 10
18317 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
18318 pmulhrsw m4, m5
18319 pmaddubsw m1, m7, [r3 + 8 * 32]
18320 pmulhrsw m1, m5
18321 packuswb m4, m1
18322 movu [r0 + r4], m4
18323
18324 lea r0, [r0 + r1 * 4]
18325
18326 palignr m6, m0, m3, 8
18327 palignr m7, m2, m0, 8
18328 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
18329 pmulhrsw m4, m5
18330 pmaddubsw m1, m7, [r3 + 14 * 32]
18331 pmulhrsw m1, m5
18332 packuswb m4, m1
18333 movu [r0], m4
18334
18335 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
18336 pmulhrsw m4, m5
18337 pmaddubsw m1, m7, [r3 - 12 * 32]
18338 pmulhrsw m1, m5
18339 packuswb m4, m1
18340 movu [r0 + r1], m4
18341
18342 palignr m6, m0, m3, 6
18343 palignr m7, m2, m0, 6
18344 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
18345 pmulhrsw m4, m5
18346 pmaddubsw m1, m7, [r3 - 6 * 32]
18347 pmulhrsw m1, m5
18348 packuswb m4, m1
18349 movu [r0 + r1*2], m4
18350
18351 palignr m6, m0, m3, 4
18352 palignr m7, m2, m0, 4
18353 pmaddubsw m4, m6, [r3] ; [16]
18354 pmulhrsw m4, m5
18355 pmaddubsw m1, m7, [r3]
18356 pmulhrsw m1, m5
18357 packuswb m4, m1
18358 movu [r0 + r4], m4
18359
18360 lea r0, [r0 + r1 * 4]
18361
18362 ; rows 8 to 15
18363 palignr m6, m0, m3, 2
18364 palignr m7, m2, m0, 2
18365 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
18366 pmulhrsw m4, m5
18367 pmaddubsw m1, m7, [r3 + 6 * 32]
18368 pmulhrsw m1, m5
18369 packuswb m4, m1
18370 movu [r0], m4
18371
18372 pmaddubsw m4, m3, [r3 + 12 * 32] ; [28]
18373 pmulhrsw m4, m5
18374 pmaddubsw m1, m0, [r3 + 12 * 32]
18375 pmulhrsw m1, m5
18376 packuswb m4, m1
18377 movu [r0 + r1], m4
18378
18379 pmaddubsw m4, m3, [r3 - 14 * 32] ; [2]
18380 pmulhrsw m4, m5
18381 pmaddubsw m1, m0, [r3 - 14 * 32]
18382 pmulhrsw m1, m5
18383 packuswb m4, m1
18384 movu [r0 + r1*2], m4
18385
18386 palignr m6, m3, m8, 14
18387 palignr m7, m0, m3, 14
18388 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
18389 pmulhrsw m4, m5
18390 pmaddubsw m1, m7, [r3 - 8 * 32]
18391 pmulhrsw m1, m5
18392 packuswb m4, m1
18393 movu [r0 + r4], m4
18394
18395 lea r0, [r0 + r1 * 4]
18396
18397 palignr m6, m3, m8, 12
18398 palignr m7, m0, m3, 12
18399 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
18400 pmulhrsw m4, m5
18401 pmaddubsw m1, m7, [r3 - 2 * 32]
18402 pmulhrsw m1, m5
18403 packuswb m4, m1
18404 movu [r0], m4
18405
18406 palignr m6, m3, m8, 10
18407 palignr m7, m0, m3, 10
18408 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
18409 pmulhrsw m4, m5
18410 pmaddubsw m1, m7, [r3 + 4 * 32]
18411 pmulhrsw m1, m5
18412 packuswb m4, m1
18413 movu [r0 + r1], m4
18414
18415 palignr m6, m3, m8, 8
18416 palignr m7, m0, m3, 8
18417 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
18418 pmulhrsw m4, m5
18419 pmaddubsw m1, m7, [r3 + 10 * 32]
18420 pmulhrsw m1, m5
18421 packuswb m4, m1
18422 movu [r0 + r1 * 2], m4
18423
18424 pand m6, [pw_00ff] ; no pmaddubsw for this row: mask each word of the window (presumably to its low byte) and pack
18425 pand m7, [pw_00ff]
18426 packuswb m6, m7
18427 movu [r0 + r4], m6
18428
18429 lea r0, [r0 + r1 * 4]
18430
18431 ; rows 16 to 23
18432 palignr m6, m3, m8, 6
18433 palignr m7, m0, m3, 6
18434 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
18435 pmulhrsw m4, m5
18436 pmaddubsw m1, m7, [r3 - 10 * 32]
18437 pmulhrsw m1, m5
18438 packuswb m4, m1
18439 movu [r0], m4
18440
18441 palignr m6, m3, m8, 4
18442 palignr m7, m0, m3, 4
18443 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
18444 pmulhrsw m4, m5
18445 pmaddubsw m1, m7, [r3 - 4 * 32]
18446 pmulhrsw m1, m5
18447 packuswb m4, m1
18448 movu [r0 + r1], m4
18449
18450 palignr m6, m3, m8, 2
18451 palignr m7, m0, m3, 2
18452 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
18453 pmulhrsw m4, m5
18454 pmaddubsw m1, m7, [r3 + 2 * 32]
18455 pmulhrsw m1, m5
18456 packuswb m4, m1
18457 movu [r0 + r1*2], m4
18458
18459 pmaddubsw m4, m8, [r3 + 8 * 32] ; [24]
18460 pmulhrsw m4, m5
18461 pmaddubsw m1, m3, [r3 + 8 * 32]
18462 pmulhrsw m1, m5
18463 packuswb m4, m1
18464 movu [r0 + r4], m4
18465
18466 lea r0, [r0 + r1 * 4]
18467
18468 palignr m6, m8, m9, 14
18469 palignr m7, m3, m8, 14
18470 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
18471 pmulhrsw m4, m5
18472 pmaddubsw m1, m7, [r3 + 14 * 32]
18473 pmulhrsw m1, m5
18474 packuswb m4, m1
18475 movu [r0], m4
18476
18477 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
18478 pmulhrsw m4, m5
18479 pmaddubsw m1, m7, [r3 - 12 * 32]
18480 pmulhrsw m1, m5
18481 packuswb m4, m1
18482 movu [r0 + r1], m4
18483
18484 palignr m6, m8, m9, 12
18485 palignr m7, m3, m8, 12
18486 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
18487 pmulhrsw m4, m5
18488 pmaddubsw m1, m7, [r3 - 6 * 32]
18489 pmulhrsw m1, m5
18490 packuswb m4, m1
18491 movu [r0 + r1*2], m4
18492
18493 palignr m6, m8, m9, 10
18494 palignr m7, m3, m8, 10
18495 pmaddubsw m4, m6, [r3] ; [16]
18496 pmulhrsw m4, m5
18497 pmaddubsw m1, m7, [r3]
18498 pmulhrsw m1, m5
18499 packuswb m4, m1
18500 movu [r0 + r4], m4
18501
18502 lea r0, [r0 + r1 * 4]
18503
18504 ; rows 24 to 31
18505 palignr m6, m8, m9, 8
18506 palignr m7, m3, m8, 8
18507 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
18508 pmulhrsw m4, m5
18509 pmaddubsw m1, m7, [r3 + 6 * 32]
18510 pmulhrsw m1, m5
18511 packuswb m4, m1
18512 movu [r0], m4
18513
18514 palignr m6, m8, m9, 6
18515 palignr m7, m3, m8, 6
18516 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
18517 pmulhrsw m4, m5
18518 pmaddubsw m1, m7, [r3 + 12 * 32]
18519 pmulhrsw m1, m5
18520 packuswb m4, m1
18521 movu [r0 + r1], m4
18522
18523 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
18524 pmulhrsw m4, m5
18525 pmaddubsw m1, m7, [r3 - 14 * 32]
18526 pmulhrsw m1, m5
18527 packuswb m4, m1
18528 movu [r0 + r1*2], m4
18529
18530 palignr m6, m8, m9, 4
18531 palignr m7, m3, m8, 4
18532 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
18533 pmulhrsw m4, m5
18534 pmaddubsw m1, m7, [r3 - 8 * 32]
18535 pmulhrsw m1, m5
18536 packuswb m4, m1
18537 movu [r0 + r4], m4
18538
18539 lea r0, [r0 + r1 * 4]
18540
18541 vpbroadcastb m0, [r2 + mmsize*2 + 31] ; m0 is free now: fill it with the single byte at r2 + 95
18542 palignr m1, m9, m0, 1
18543 vinserti128 m0, m1, xm8, 1 ; m0 becomes the source vector that precedes m9 for the last rows
18544
18545 palignr m6, m8, m9, 2
18546 palignr m7, m3, m8, 2
18547 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
18548 pmulhrsw m4, m5
18549 pmaddubsw m1, m7, [r3 - 2 * 32]
18550 pmulhrsw m1, m5
18551 packuswb m4, m1
18552 movu [r0], m4
18553
18554 pmaddubsw m4, m9, [r3 + 4 * 32] ; [20]
18555 pmulhrsw m4, m5
18556 pmaddubsw m1, m8, [r3 + 4 * 32]
18557 pmulhrsw m1, m5
18558 packuswb m4, m1
18559 movu [r0 + r1], m4
18560
18561 palignr m6, m9, m0, 14
18562 palignr m7, m8, m9, 14
18563 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
18564 pmulhrsw m4, m5
18565 pmaddubsw m1, m7, [r3 + 10 * 32]
18566 pmulhrsw m1, m5
18567 packuswb m4, m1
18568 movu [r0 + r1 * 2], m4
18569
18570 pand m6, [pw_00ff] ; last row: masked copy of the window, same as the final row of "rows 8 to 15"
18571 pand m7, [pw_00ff]
18572 packuswb m6, m7
18573 movu [r0 + r4], m6
18574 RET
18575
18576 %endif ; ARCH_X86_64
18577 ;-----------------------------------------------------------------------------------------
18578 ; end of intra_pred_ang32 angular modes avx2 asm
18579 ;-----------------------------------------------------------------------------------------
18580
18581 ;-----------------------------------------------------------------------------------------
18582 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
18583 ;-----------------------------------------------------------------------------------------
18584 INIT_YMM avx2
; intra_pred_ang8_3: AVX2 8x8 angular intra prediction for mode 3 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src (first three intraPredAng8 arguments); r3 = 3 * stride.
; Reads 16 reference bytes at r2 + 17, computes the 64 output bytes two 8-byte lines per
; 128-bit lane, then rearranges them (punpck*/vpermd) before storing eight 8-byte rows.
18585 cglobal intra_pred_ang8_3, 3,4,5
18586 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18587 vbroadcasti128 m0, [r2 + 17] ; same 16 reference bytes in both lanes
18588
18589 pshufb m1, m0, [c_ang8_src1_9_2_10] ; gather the byte pairs each output line needs
18590 pshufb m2, m0, [c_ang8_src3_11_4_12]
18591 pshufb m4, m0, [c_ang8_src5_13_5_13]
18592 pshufb m0, [c_ang8_src6_14_7_15]
18593
18594 pmaddubsw m1, [c_ang8_26_20] ; weight each byte pair and add -> 16-bit sums
18595 pmulhrsw m1, m3
18596 pmaddubsw m2, [c_ang8_14_8]
18597 pmulhrsw m2, m3
18598 pmaddubsw m4, [c_ang8_2_28]
18599 pmulhrsw m4, m3
18600 pmaddubsw m0, [c_ang8_22_16]
18601 pmulhrsw m0, m3
18602 packuswb m1, m2 ; back to bytes with unsigned saturation
18603 packuswb m4, m0
18604
18605 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lanes of m1 and m4
18606 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lanes of m1 and m4
18607 punpcklbw m4, m2, m1 ; byte/word interleave + vpermd: looks like an 8x8 byte transpose -- verify against trans8_shuf
18608 punpckhbw m2, m1
18609 punpcklwd m1, m4, m2
18610 punpckhwd m4, m2
18611 mova m0, [trans8_shuf]
18612 vpermd m1, m0, m1
18613 vpermd m4, m0, m4
18614
18615 lea r3, [3 * r1]
18616 movq [r0], xm1 ; rows 0..3 come from m1, 8 bytes each
18617 movhps [r0 + r1], xm1
18618 vextracti128 xm2, m1, 1
18619 movq [r0 + 2 * r1], xm2
18620 movhps [r0 + r3], xm2
18621 lea r0, [r0 + 4 * r1]
18622 movq [r0], xm4 ; rows 4..7 come from m4
18623 movhps [r0 + r1], xm4
18624 vextracti128 xm2, m4, 1
18625 movq [r0 + 2 * r1], xm2
18626 movhps [r0 + r3], xm2
18627 RET
18628
18629 INIT_YMM avx2
; intra_pred_ang8_33: AVX2 8x8 angular intra prediction for mode 33 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Same shuffle and coefficient tables as intra_pred_ang8_3, but the reference bytes are read
; from r2 + 1 and the packed lines are stored directly, without the rearrangement step.
18630 cglobal intra_pred_ang8_33, 3,4,5
18631 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18632 vbroadcasti128 m0, [r2 + 1] ; same 16 reference bytes in both lanes
18633
18634 pshufb m1, m0, [c_ang8_src1_9_2_10] ; gather the byte pairs each output line needs
18635 pshufb m2, m0, [c_ang8_src3_11_4_12]
18636 pshufb m4, m0, [c_ang8_src5_13_5_13]
18637 pshufb m0, [c_ang8_src6_14_7_15]
18638
18639 pmaddubsw m1, [c_ang8_26_20] ; weight each byte pair and add -> 16-bit sums
18640 pmulhrsw m1, m3
18641 pmaddubsw m2, [c_ang8_14_8]
18642 pmulhrsw m2, m3
18643 pmaddubsw m4, [c_ang8_2_28]
18644 pmulhrsw m4, m3
18645 pmaddubsw m0, [c_ang8_22_16]
18646 pmulhrsw m0, m3
18647 packuswb m1, m2 ; per lane: low qword from the first operand, high qword from the second
18648 packuswb m4, m0
18649
18650 lea r3, [3 * r1]
18651 movq [r0], xm1 ; row 0 = low lane, low qword
18652 vextracti128 xm2, m1, 1
18653 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
18654 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
18655 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
18656 lea r0, [r0 + 4 * r1]
18657 movq [r0], xm4 ; rows 4..7: same pattern from m4
18658 vextracti128 xm2, m4, 1
18659 movq [r0 + r1], xm2
18660 movhps [r0 + 2 * r1], xm4
18661 movhps [r0 + r3], xm2
18662 RET
18663
18664 INIT_YMM avx2
; intra_pred_ang8_4: AVX2 8x8 angular intra prediction for mode 4 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Reads 16 reference bytes at r2 + 17, computes the 64 output bytes two 8-byte lines per
; 128-bit lane, then rearranges them (punpck*/vpermd) before storing eight 8-byte rows.
18665 cglobal intra_pred_ang8_4, 3,4,5
18666 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18667 vbroadcasti128 m0, [r2 + 17] ; same 16 reference bytes in both lanes
18668
18669 pshufb m1, m0, [c_ang8_src1_9_2_10] ; gather the byte pairs each output line needs
18670 pshufb m2, m0, [c_ang8_src2_10_3_11]
18671 pshufb m4, m0, [c_ang8_src4_12_4_12]
18672 pshufb m0, [c_ang8_src5_13_6_14]
18673
18674 pmaddubsw m1, [c_ang8_21_10] ; weight each byte pair and add -> 16-bit sums
18675 pmulhrsw m1, m3
18676 pmaddubsw m2, [c_ang8_31_20]
18677 pmulhrsw m2, m3
18678 pmaddubsw m4, [c_ang8_9_30]
18679 pmulhrsw m4, m3
18680 pmaddubsw m0, [c_ang8_19_8]
18681 pmulhrsw m0, m3
18682 packuswb m1, m2 ; back to bytes with unsigned saturation
18683 packuswb m4, m0
18684
18685 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lanes of m1 and m4
18686 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lanes of m1 and m4
18687 punpcklbw m4, m2, m1 ; byte/word interleave + vpermd: looks like an 8x8 byte transpose -- verify against trans8_shuf
18688 punpckhbw m2, m1
18689 punpcklwd m1, m4, m2
18690 punpckhwd m4, m2
18691 mova m0, [trans8_shuf]
18692 vpermd m1, m0, m1
18693 vpermd m4, m0, m4
18694
18695 lea r3, [3 * r1]
18696 movq [r0], xm1 ; rows 0..3 come from m1, 8 bytes each
18697 movhps [r0 + r1], xm1
18698 vextracti128 xm2, m1, 1
18699 movq [r0 + 2 * r1], xm2
18700 movhps [r0 + r3], xm2
18701 lea r0, [r0 + 4 * r1]
18702 movq [r0], xm4 ; rows 4..7 come from m4
18703 movhps [r0 + r1], xm4
18704 vextracti128 xm2, m4, 1
18705 movq [r0 + 2 * r1], xm2
18706 movhps [r0 + r3], xm2
18707 RET
18708
18709 INIT_YMM avx2
; intra_pred_ang8_32: AVX2 8x8 angular intra prediction for mode 32 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Same shuffle and coefficient tables as intra_pred_ang8_4, but the reference bytes are read
; from r2 + 1 and the packed lines are stored directly, without the rearrangement step.
18710 cglobal intra_pred_ang8_32, 3,4,5
18711 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18712 vbroadcasti128 m0, [r2 + 1] ; same 16 reference bytes in both lanes
18713
18714 pshufb m1, m0, [c_ang8_src1_9_2_10] ; gather the byte pairs each output line needs
18715 pshufb m2, m0, [c_ang8_src2_10_3_11]
18716 pshufb m4, m0, [c_ang8_src4_12_4_12]
18717 pshufb m0, [c_ang8_src5_13_6_14]
18718
18719 pmaddubsw m1, [c_ang8_21_10] ; weight each byte pair and add -> 16-bit sums
18720 pmulhrsw m1, m3
18721 pmaddubsw m2, [c_ang8_31_20]
18722 pmulhrsw m2, m3
18723 pmaddubsw m4, [c_ang8_9_30]
18724 pmulhrsw m4, m3
18725 pmaddubsw m0, [c_ang8_19_8]
18726 pmulhrsw m0, m3
18727 packuswb m1, m2 ; per lane: low qword from the first operand, high qword from the second
18728 packuswb m4, m0
18729
18730 lea r3, [3 * r1]
18731 movq [r0], xm1 ; row 0 = low lane, low qword
18732 vextracti128 xm2, m1, 1
18733 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
18734 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
18735 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
18736 lea r0, [r0 + 4 * r1]
18737 movq [r0], xm4 ; rows 4..7: same pattern from m4
18738 vextracti128 xm2, m4, 1
18739 movq [r0 + r1], xm2
18740 movhps [r0 + 2 * r1], xm4
18741 movhps [r0 + r3], xm2
18742 RET
18743
18744
18745 INIT_YMM avx2
; intra_pred_ang8_5: AVX2 8x8 angular intra prediction for mode 5 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Reads 16 reference bytes at r2 + 17, computes the 64 output bytes two 8-byte lines per
; 128-bit lane, then rearranges them (punpck*/vpermd) before storing eight 8-byte rows.
18746 cglobal intra_pred_ang8_5, 3, 4, 5
18747 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18748 vbroadcasti128 m0, [r2 + 17] ; same 16 reference bytes in both lanes
18749
18750 pshufb m1, m0, [c_ang8_src1_9_2_10] ; gather the byte pairs each output line needs
18751 pshufb m2, m0, [c_ang8_src2_10_3_11]
18752 pshufb m4, m0, [c_ang8_src3_11_4_12]
18753 pshufb m0, [c_ang8_src4_12_5_13]
18754
18755 pmaddubsw m1, [c_ang8_17_2] ; weight each byte pair and add -> 16-bit sums
18756 pmulhrsw m1, m3
18757 pmaddubsw m2, [c_ang8_19_4]
18758 pmulhrsw m2, m3
18759 pmaddubsw m4, [c_ang8_21_6]
18760 pmulhrsw m4, m3
18761 pmaddubsw m0, [c_ang8_23_8]
18762 pmulhrsw m0, m3
18763 packuswb m1, m2 ; back to bytes with unsigned saturation
18764 packuswb m4, m0
18765
18766 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lanes of m1 and m4
18767 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lanes of m1 and m4
18768 punpcklbw m4, m2, m1 ; byte/word interleave + vpermd: looks like an 8x8 byte transpose -- verify against trans8_shuf
18769 punpckhbw m2, m1
18770 punpcklwd m1, m4, m2
18771 punpckhwd m4, m2
18772 mova m0, [trans8_shuf]
18773 vpermd m1, m0, m1
18774 vpermd m4, m0, m4
18775
18776 lea r3, [3 * r1]
18777 movq [r0], xm1 ; rows 0..3 come from m1, 8 bytes each
18778 movhps [r0 + r1], xm1
18779 vextracti128 xm2, m1, 1
18780 movq [r0 + 2 * r1], xm2
18781 movhps [r0 + r3], xm2
18782 lea r0, [r0 + 4 * r1]
18783 movq [r0], xm4 ; rows 4..7 come from m4
18784 movhps [r0 + r1], xm4
18785 vextracti128 xm2, m4, 1
18786 movq [r0 + 2 * r1], xm2
18787 movhps [r0 + r3], xm2
18788 RET
18789
18790 INIT_YMM avx2
; intra_pred_ang8_31: AVX2 8x8 angular intra prediction for mode 31 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Same shuffle and coefficient tables as intra_pred_ang8_5, but the reference bytes are read
; from r2 + 1 and the packed lines are stored directly, without the rearrangement step.
18791 cglobal intra_pred_ang8_31, 3, 4, 5
18792 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18793 vbroadcasti128 m0, [r2 + 1] ; same 16 reference bytes in both lanes
18794
18795 pshufb m1, m0, [c_ang8_src1_9_2_10] ; gather the byte pairs each output line needs
18796 pshufb m2, m0, [c_ang8_src2_10_3_11]
18797 pshufb m4, m0, [c_ang8_src3_11_4_12]
18798 pshufb m0, [c_ang8_src4_12_5_13]
18799
18800 pmaddubsw m1, [c_ang8_17_2] ; weight each byte pair and add -> 16-bit sums
18801 pmulhrsw m1, m3
18802 pmaddubsw m2, [c_ang8_19_4]
18803 pmulhrsw m2, m3
18804 pmaddubsw m4, [c_ang8_21_6]
18805 pmulhrsw m4, m3
18806 pmaddubsw m0, [c_ang8_23_8]
18807 pmulhrsw m0, m3
18808 packuswb m1, m2 ; per lane: low qword from the first operand, high qword from the second
18809 packuswb m4, m0
18810
18811 lea r3, [3 * r1]
18812 movq [r0], xm1 ; row 0 = low lane, low qword
18813 vextracti128 xm2, m1, 1
18814 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
18815 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
18816 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
18817 lea r0, [r0 + 4 * r1]
18818 movq [r0], xm4 ; rows 4..7: same pattern from m4
18819 vextracti128 xm2, m4, 1
18820 movq [r0 + r1], xm2
18821 movhps [r0 + 2 * r1], xm4
18822 movhps [r0 + r3], xm2
18823 RET
18824
18825
18826 INIT_YMM avx2
; intra_pred_ang8_6: AVX2 8x8 angular intra prediction for mode 6 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Reads 16 reference bytes at r2 + 17, computes the 64 output bytes two 8-byte lines per
; 128-bit lane, then rearranges them (punpck*/vpermd) before storing eight 8-byte rows.
18827 cglobal intra_pred_ang8_6, 3, 4, 5
18828 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18829 vbroadcasti128 m0, [r2 + 17] ; same 16 reference bytes in both lanes
18830
18831 pshufb m1, m0, [intra_pred_shuff_0_8] ; pairs (0,1),(1,2),...,(7,8) in the low lane
18832 pshufb m2, m0, [c_ang8_src2_10_2_10]
18833 pshufb m4, m0, [c_ang8_src3_11_3_11]
18834 pshufb m0, [c_ang8_src3_11_4_12]
18835
18836 pmaddubsw m1, [c_ang8_13_26] ; weight each byte pair and add -> 16-bit sums
18837 pmulhrsw m1, m3
18838 pmaddubsw m2, [c_ang8_7_20]
18839 pmulhrsw m2, m3
18840 pmaddubsw m4, [c_ang8_1_14]
18841 pmulhrsw m4, m3
18842 pmaddubsw m0, [c_ang8_27_8]
18843 pmulhrsw m0, m3
18844 packuswb m1, m2 ; back to bytes with unsigned saturation
18845 packuswb m4, m0
18846
18847 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lanes of m1 and m4
18848 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lanes of m1 and m4
18849 punpcklbw m4, m2, m1 ; byte/word interleave + vpermd: looks like an 8x8 byte transpose -- verify against trans8_shuf
18850 punpckhbw m2, m1
18851 punpcklwd m1, m4, m2
18852 punpckhwd m4, m2
18853 mova m0, [trans8_shuf]
18854 vpermd m1, m0, m1
18855 vpermd m4, m0, m4
18856
18857 lea r3, [3 * r1]
18858 movq [r0], xm1 ; rows 0..3 come from m1, 8 bytes each
18859 movhps [r0 + r1], xm1
18860 vextracti128 xm2, m1, 1
18861 movq [r0 + 2 * r1], xm2
18862 movhps [r0 + r3], xm2
18863 lea r0, [r0 + 4 * r1]
18864 movq [r0], xm4 ; rows 4..7 come from m4
18865 movhps [r0 + r1], xm4
18866 vextracti128 xm2, m4, 1
18867 movq [r0 + 2 * r1], xm2
18868 movhps [r0 + r3], xm2
18869 RET
18870
18871 INIT_YMM avx2
; intra_pred_ang8_30: AVX2 8x8 angular intra prediction for mode 30 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Same shuffle and coefficient tables as intra_pred_ang8_6, but the reference bytes are read
; from r2 + 1 and the packed lines are stored directly, without the rearrangement step.
18872 cglobal intra_pred_ang8_30, 3, 4, 5
18873 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18874 vbroadcasti128 m0, [r2 + 1] ; same 16 reference bytes in both lanes
18875
18876 pshufb m1, m0, [intra_pred_shuff_0_8] ; pairs (0,1),(1,2),...,(7,8) in the low lane
18877 pshufb m2, m0, [c_ang8_src2_10_2_10]
18878 pshufb m4, m0, [c_ang8_src3_11_3_11]
18879 pshufb m0, [c_ang8_src3_11_4_12]
18880
18881 pmaddubsw m1, [c_ang8_13_26] ; weight each byte pair and add -> 16-bit sums
18882 pmulhrsw m1, m3
18883 pmaddubsw m2, [c_ang8_7_20]
18884 pmulhrsw m2, m3
18885 pmaddubsw m4, [c_ang8_1_14]
18886 pmulhrsw m4, m3
18887 pmaddubsw m0, [c_ang8_27_8]
18888 pmulhrsw m0, m3
18889 packuswb m1, m2 ; per lane: low qword from the first operand, high qword from the second
18890 packuswb m4, m0
18891
18892 lea r3, [3 * r1]
18893 movq [r0], xm1 ; row 0 = low lane, low qword
18894 vextracti128 xm2, m1, 1
18895 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
18896 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
18897 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
18898 lea r0, [r0 + 4 * r1]
18899 movq [r0], xm4 ; rows 4..7: same pattern from m4
18900 vextracti128 xm2, m4, 1
18901 movq [r0 + r1], xm2
18902 movhps [r0 + 2 * r1], xm4
18903 movhps [r0 + r3], xm2
18904 RET
18905
18906
18907 INIT_YMM avx2
; intra_pred_ang8_9: AVX2 8x8 angular intra prediction for mode 9 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride; r4 = coefficient table base.
; Reads 16 reference bytes at r2 + 17.  All eight lines use the same byte pairs
; (intra_pred_shuff_0_8); only the coefficients differ, taken from the four 32-byte entries
; of c_ang8_mode_27 (the table is shared with intra_pred_ang8_27).  The packed result is
; rearranged (punpck*/vpermd) before the eight 8-byte rows are stored.
18908 cglobal intra_pred_ang8_9, 3, 5, 5
18909 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18910 vbroadcasti128 m0, [r2 + 17] ; same 16 reference bytes in both lanes
18911
18912 pshufb m0, [intra_pred_shuff_0_8] ; pairs (0,1),(1,2),...,(7,8) in the low lane
18913
18914 lea r4, [c_ang8_mode_27]
18915 pmaddubsw m1, m0, [r4] ; one table entry per pair of output lines
18916 pmulhrsw m1, m3
18917 pmaddubsw m2, m0, [r4 + mmsize]
18918 pmulhrsw m2, m3
18919 pmaddubsw m4, m0, [r4 + 2 * mmsize]
18920 pmulhrsw m4, m3
18921 pmaddubsw m0, [r4 + 3 * mmsize]
18922 pmulhrsw m0, m3
18923 packuswb m1, m2 ; back to bytes with unsigned saturation
18924 packuswb m4, m0
18925
18926 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lanes of m1 and m4
18927 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lanes of m1 and m4
18928 punpcklbw m4, m2, m1 ; byte/word interleave + vpermd: looks like an 8x8 byte transpose -- verify against trans8_shuf
18929 punpckhbw m2, m1
18930 punpcklwd m1, m4, m2
18931 punpckhwd m4, m2
18932 mova m0, [trans8_shuf]
18933 vpermd m1, m0, m1
18934 vpermd m4, m0, m4
18935
18936 lea r3, [3 * r1]
18937 movq [r0], xm1 ; rows 0..3 come from m1, 8 bytes each
18938 movhps [r0 + r1], xm1
18939 vextracti128 xm2, m1, 1
18940 movq [r0 + 2 * r1], xm2
18941 movhps [r0 + r3], xm2
18942 lea r0, [r0 + 4 * r1]
18943 movq [r0], xm4 ; rows 4..7 come from m4
18944 movhps [r0 + r1], xm4
18945 vextracti128 xm2, m4, 1
18946 movq [r0 + 2 * r1], xm2
18947 movhps [r0 + r3], xm2
18948 RET
18949
18950 INIT_YMM avx2
; intra_pred_ang8_27: AVX2 8x8 angular intra prediction for mode 27 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride; r4 = coefficient table base.
; Reads 16 reference bytes at r2 + 1.  All eight lines use the same byte pairs
; (intra_pred_shuff_0_8); only the coefficients differ, taken from the four 32-byte entries
; of c_ang8_mode_27.  The packed lines are stored directly as rows.
18951 cglobal intra_pred_ang8_27, 3, 5, 5
18952 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18953 vbroadcasti128 m0, [r2 + 1] ; same 16 reference bytes in both lanes
18954
18955 pshufb m0, [intra_pred_shuff_0_8] ; pairs (0,1),(1,2),...,(7,8) in the low lane
18956
18957 lea r4, [c_ang8_mode_27]
18958 pmaddubsw m1, m0, [r4] ; one table entry per pair of output lines
18959 pmulhrsw m1, m3
18960 pmaddubsw m2, m0, [r4 + mmsize]
18961 pmulhrsw m2, m3
18962 pmaddubsw m4, m0, [r4 + 2 * mmsize]
18963 pmulhrsw m4, m3
18964 pmaddubsw m0, [r4 + 3 * mmsize]
18965 pmulhrsw m0, m3
18966 packuswb m1, m2 ; per lane: low qword from the first operand, high qword from the second
18967 packuswb m4, m0
18968
18969 lea r3, [3 * r1]
18970 movq [r0], xm1 ; row 0 = low lane, low qword
18971 vextracti128 xm2, m1, 1
18972 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
18973 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
18974 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
18975 lea r0, [r0 + 4 * r1]
18976 movq [r0], xm4 ; rows 4..7: same pattern from m4
18977 vextracti128 xm2, m4, 1
18978 movq [r0 + r1], xm2
18979 movhps [r0 + 2 * r1], xm4
18980 movhps [r0 + r3], xm2
18981 RET
18982
18983 INIT_YMM avx2
; intra_pred_ang8_25: AVX2 8x8 angular intra prediction for mode 25 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride; r4 = coefficient table base.
; Unlike the other 8x8 routines here, the 16 reference bytes are read from r2 + 0, so the
; byte at [r2] itself takes part.  All eight lines use the same byte pairs
; (intra_pred_shuff_0_8) with coefficients from the four 32-byte entries of c_ang8_mode_25.
; The packed lines are stored directly as rows.
18984 cglobal intra_pred_ang8_25, 3, 5, 5
18985 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
18986 vbroadcasti128 m0, [r2] ; same 16 reference bytes in both lanes
18987
18988 pshufb m0, [intra_pred_shuff_0_8] ; pairs (0,1),(1,2),...,(7,8) in the low lane
18989
18990 lea r4, [c_ang8_mode_25]
18991 pmaddubsw m1, m0, [r4] ; one table entry per pair of output lines
18992 pmulhrsw m1, m3
18993 pmaddubsw m2, m0, [r4 + mmsize]
18994 pmulhrsw m2, m3
18995 pmaddubsw m4, m0, [r4 + 2 * mmsize]
18996 pmulhrsw m4, m3
18997 pmaddubsw m0, [r4 + 3 * mmsize]
18998 pmulhrsw m0, m3
18999 packuswb m1, m2 ; per lane: low qword from the first operand, high qword from the second
19000 packuswb m4, m0
19001
19002 lea r3, [3 * r1]
19003 movq [r0], xm1 ; row 0 = low lane, low qword
19004 vextracti128 xm2, m1, 1
19005 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19006 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19007 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19008 lea r0, [r0 + 4 * r1]
19009 movq [r0], xm4 ; rows 4..7: same pattern from m4
19010 vextracti128 xm2, m4, 1
19011 movq [r0 + r1], xm2
19012 movhps [r0 + 2 * r1], xm4
19013 movhps [r0 + r3], xm2
19014 RET
19015
19016
19017 INIT_YMM avx2
; intra_pred_ang8_7: AVX2 8x8 angular intra prediction for mode 7 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Reads 16 reference bytes at r2 + 17, computes the 64 output bytes two 8-byte lines per
; 128-bit lane, then rearranges them (punpck*/vpermd) before storing eight 8-byte rows.
19018 cglobal intra_pred_ang8_7, 3, 4, 5
19019 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
19020 vbroadcasti128 m0, [r2 + 17] ; same 16 reference bytes in both lanes
19021
19022 pshufb m1, m0, [intra_pred_shuff_0_8] ; pairs (0,1),(1,2),...,(7,8) in the low lane
19023 pshufb m2, m0, [c_ang8_src1_9_2_10]
19024 pshufb m4, m0, [c_ang8_src2_10_2_10]
19025 pshufb m0, [c_ang8_src2_10_3_11]
19026
19027 pmaddubsw m1, [c_ang8_9_18] ; weight each byte pair and add -> 16-bit sums
19028 pmulhrsw m1, m3
19029 pmaddubsw m2, [c_ang8_27_4]
19030 pmulhrsw m2, m3
19031 pmaddubsw m4, [c_ang8_13_22]
19032 pmulhrsw m4, m3
19033 pmaddubsw m0, [c_ang8_31_8]
19034 pmulhrsw m0, m3
19035 packuswb m1, m2 ; back to bytes with unsigned saturation
19036 packuswb m4, m0
19037
19038 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lanes of m1 and m4
19039 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lanes of m1 and m4
19040 punpcklbw m4, m2, m1 ; byte/word interleave + vpermd: looks like an 8x8 byte transpose -- verify against trans8_shuf
19041 punpckhbw m2, m1
19042 punpcklwd m1, m4, m2
19043 punpckhwd m4, m2
19044 mova m0, [trans8_shuf]
19045 vpermd m1, m0, m1
19046 vpermd m4, m0, m4
19047
19048 lea r3, [3 * r1]
19049 movq [r0], xm1 ; rows 0..3 come from m1, 8 bytes each
19050 movhps [r0 + r1], xm1
19051 vextracti128 xm2, m1, 1
19052 movq [r0 + 2 * r1], xm2
19053 movhps [r0 + r3], xm2
19054 lea r0, [r0 + 4 * r1]
19055 movq [r0], xm4 ; rows 4..7 come from m4
19056 movhps [r0 + r1], xm4
19057 vextracti128 xm2, m4, 1
19058 movq [r0 + 2 * r1], xm2
19059 movhps [r0 + r3], xm2
19060 RET
19061
19062 INIT_YMM avx2
; intra_pred_ang8_29: AVX2 8x8 angular intra prediction for mode 29 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Same shuffle and coefficient tables as intra_pred_ang8_7, but the reference bytes are read
; from r2 + 1 and the packed lines are stored directly, without the rearrangement step.
19063 cglobal intra_pred_ang8_29, 3, 4, 5
19064 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
19065 vbroadcasti128 m0, [r2 + 1] ; same 16 reference bytes in both lanes
19066
19067 pshufb m1, m0, [intra_pred_shuff_0_8] ; pairs (0,1),(1,2),...,(7,8) in the low lane
19068 pshufb m2, m0, [c_ang8_src1_9_2_10]
19069 pshufb m4, m0, [c_ang8_src2_10_2_10]
19070 pshufb m0, [c_ang8_src2_10_3_11]
19071
19072 pmaddubsw m1, [c_ang8_9_18] ; weight each byte pair and add -> 16-bit sums
19073 pmulhrsw m1, m3
19074 pmaddubsw m2, [c_ang8_27_4]
19075 pmulhrsw m2, m3
19076 pmaddubsw m4, [c_ang8_13_22]
19077 pmulhrsw m4, m3
19078 pmaddubsw m0, [c_ang8_31_8]
19079 pmulhrsw m0, m3
19080 packuswb m1, m2 ; per lane: low qword from the first operand, high qword from the second
19081 packuswb m4, m0
19082
19083 lea r3, [3 * r1]
19084 movq [r0], xm1 ; row 0 = low lane, low qword
19085 vextracti128 xm2, m1, 1
19086 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19087 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19088 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19089 lea r0, [r0 + 4 * r1]
19090 movq [r0], xm4 ; rows 4..7: same pattern from m4
19091 vextracti128 xm2, m4, 1
19092 movq [r0 + r1], xm2
19093 movhps [r0 + 2 * r1], xm4
19094 movhps [r0 + r3], xm2
19095 RET
19096
19097
19098 INIT_YMM avx2
; intra_pred_ang8_8: AVX2 8x8 angular intra prediction for mode 8 (per the symbol name).
; r0 = dst, r1 = dst stride, r2 = src; r3 = 3 * stride.
; Reads 16 reference bytes at r2 + 17.  The first six output lines all use the same byte
; pairs (intra_pred_shuff_0_8), so that shuffle is done once into m5 and reused as the
; source of three 3-operand pmaddubsw; the last two lines use c_ang8_src2_10_2_10.
; The packed result is rearranged (punpck*/vpermd) before the eight 8-byte rows are stored.
19099 cglobal intra_pred_ang8_8, 3, 4, 6
19100 mova m3, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024 (rounded >> 5) -- confirm
19101 vbroadcasti128 m0, [r2 + 17] ; same 16 reference bytes in both lanes
19103
19104 pshufb m5, m0, [intra_pred_shuff_0_8] ; shared source for m1, m2 and m4 (shuffled once, not three times)
19107 pshufb m0, [c_ang8_src2_10_2_10]
19108
19109 pmaddubsw m1, m5, [c_ang8_5_10] ; weight each byte pair and add -> 16-bit sums
19110 pmulhrsw m1, m3
19111 pmaddubsw m2, m5, [c_ang8_15_20]
19112 pmulhrsw m2, m3
19113 pmaddubsw m4, m5, [c_ang8_25_30]
19114 pmulhrsw m4, m3
19115 pmaddubsw m0, [c_ang8_3_8]
19116 pmulhrsw m0, m3
19117 packuswb m1, m2 ; back to bytes with unsigned saturation
19118 packuswb m4, m0
19119
19120 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lanes of m1 and m4
19121 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lanes of m1 and m4
19122 punpcklbw m4, m2, m1 ; byte/word interleave + vpermd: looks like an 8x8 byte transpose -- verify against trans8_shuf
19123 punpckhbw m2, m1
19124 punpcklwd m1, m4, m2
19125 punpckhwd m4, m2
19126 mova m0, [trans8_shuf]
19127 vpermd m1, m0, m1
19128 vpermd m4, m0, m4
19129
19130 lea r3, [3 * r1]
19131 movq [r0], xm1 ; rows 0..3 come from m1, 8 bytes each
19132 movhps [r0 + r1], xm1
19133 vextracti128 xm2, m1, 1
19134 movq [r0 + 2 * r1], xm2
19135 movhps [r0 + r3], xm2
19136 lea r0, [r0 + 4 * r1]
19137 movq [r0], xm4 ; rows 4..7 come from m4
19138 movhps [r0 + r1], xm4
19139 vextracti128 xm2, m4, 1
19140 movq [r0 + 2 * r1], xm2
19141 movhps [r0 + r3], xm2
19142 RET
19143
;-----------------------------------------------------------------------------
; intra_pred_ang8_28 (AVX2): produces an 8x8 byte block, one 8-byte row per store.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   (16 bytes are read starting at r2 + 1), r3 = scratch (3 * r1)
; Uses the same shuffle masks and weight tables as intra_pred_ang8_8, but
; stores the packed rows directly, without the interleave/permute stage.
; NOTE(review): m1, m2 and m4 are all produced by the same shuffle of m0 with
; m5, so the three registers hold identical data before pmaddubsw.
;-----------------------------------------------------------------------------
19144 INIT_YMM avx2
19145 cglobal intra_pred_ang8_28, 3, 4, 6
19146 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19147 vbroadcasti128 m0, [r2 + 1] ; same 16 source bytes in both 128-bit lanes
19148 mova m5, [intra_pred_shuff_0_8]
19149
19150 pshufb m1, m0, m5
19151 pshufb m2, m0, m5
19152 pshufb m4, m0, m5
19153 pshufb m0, [c_ang8_src2_10_2_10]
19154
19155 pmaddubsw m1, [c_ang8_5_10] ; weighted sum of each byte pair -> words
19156 pmulhrsw m1, m3
19157 pmaddubsw m2, [c_ang8_15_20]
19158 pmulhrsw m2, m3
19159 pmaddubsw m4, [c_ang8_25_30]
19160 pmulhrsw m4, m3
19161 pmaddubsw m0, [c_ang8_3_8]
19162 pmulhrsw m0, m3
19163 packuswb m1, m2 ; per lane: 8 bytes from m1, then 8 bytes from m2
19164 packuswb m4, m0 ; per lane: 8 bytes from m4, then 8 bytes from m0
19165
19166 lea r3, [3 * r1]
19167 movq [r0], xm1 ; row 0 = low lane, low qword
19168 vextracti128 xm2, m1, 1
19169 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19170 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19171 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19172 lea r0, [r0 + 4 * r1] ; advance to rows 4..7
19173 movq [r0], xm4
19174 vextracti128 xm2, m4, 1
19175 movq [r0 + r1], xm2
19176 movhps [r0 + 2 * r1], xm4
19177 movhps [r0 + r3], xm2
19178 RET
19179
19180
;-----------------------------------------------------------------------------
; intra_pred_ang8_11 (AVX2): produces an 8x8 byte block.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_25 weight table
; The reference is the 16 bytes at r2 + 16 with byte 0 replaced by [r2].
; All four pmaddubsw steps use the same paired reference and differ only in
; the table row. The result goes through the interleave/permute stage
; (apparently an 8x8 transpose; trans8_shuf is defined elsewhere - TODO confirm).
;-----------------------------------------------------------------------------
19181 INIT_YMM avx2
19182 cglobal intra_pred_ang8_11, 3, 5, 5
19183 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19184 movu xm1, [r2 + 16]
19185 pinsrb xm1, [r2], 0 ; byte 0 <- [r2]
19186 pshufb xm1, [intra_pred_shuff_0_8] ; adjacent byte pairs (0,1)(1,2)...(7,8)
19187 vinserti128 m0, m1, xm1, 1 ; duplicate into both lanes
19188
19189 lea r4, [c_ang8_mode_25]
19190 pmaddubsw m1, m0, [r4]
19191 pmulhrsw m1, m3
19192 pmaddubsw m2, m0, [r4 + mmsize]
19193 pmulhrsw m2, m3
19194 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19195 pmulhrsw m4, m3
19196 pmaddubsw m0, [r4 + 3 * mmsize]
19197 pmulhrsw m0, m3
19198 packuswb m1, m2
19199 packuswb m4, m0
19200
19201 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lane of m1 : low lane of m4
19202 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lane of m1 : high lane of m4
19203 punpcklbw m4, m2, m1 ; interleave bytes
19204 punpckhbw m2, m1
19205 punpcklwd m1, m4, m2 ; interleave words
19206 punpckhwd m4, m2
19207 mova m0, [trans8_shuf]
19208 vpermd m1, m0, m1 ; final dword reorder across lanes
19209 vpermd m4, m0, m4
19210
19211 lea r3, [3 * r1]
19212 movq [r0], xm1 ; rows 0..3 come from m1
19213 movhps [r0 + r1], xm1
19214 vextracti128 xm2, m1, 1
19215 movq [r0 + 2 * r1], xm2
19216 movhps [r0 + r3], xm2
19217 lea r0, [r0 + 4 * r1] ; rows 4..7 come from m4
19218 movq [r0], xm4
19219 movhps [r0 + r1], xm4
19220 vextracti128 xm2, m4, 1
19221 movq [r0 + 2 * r1], xm2
19222 movhps [r0 + r3], xm2
19223 RET
19224
;-----------------------------------------------------------------------------
; intra_pred_ang8_15 (AVX2): produces an 8x8 byte block.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_15 weight table,
;   r5 = address of the intra_pred_shuff_0_8 pairing mask
; xm5 holds the running reference: it starts as the 16 bytes at r2 + 16 with
; byte 0 replaced by [r2]. Before each pmaddubsw it is shifted up one byte and
; a new byte from [r2 + 2/4/6/8] is inserted at position 0. Each step puts the
; previous reference in the low lane and the extended one in the high lane.
; The result goes through the interleave/permute stage (apparently an 8x8
; transpose; trans8_shuf is defined elsewhere - TODO confirm).
;-----------------------------------------------------------------------------
19225 INIT_YMM avx2
19226 cglobal intra_pred_ang8_15, 3, 6, 6
19227 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19228 movu xm5, [r2 + 16]
19229 pinsrb xm5, [r2], 0 ; byte 0 <- [r2]
19230 lea r5, [intra_pred_shuff_0_8]
19231 mova xm0, xm5 ; low lane = current reference
19232 pslldq xm5, 1
19233 pinsrb xm5, [r2 + 2], 0 ; prepend [r2 + 2]
19234 vinserti128 m0, m0, xm5, 1 ; high lane = extended reference
19235 pshufb m0, [r5]
19236
19237 lea r4, [c_ang8_mode_15]
19238 pmaddubsw m1, m0, [r4]
19239 pmulhrsw m1, m3
19240 mova xm0, xm5
19241 pslldq xm5, 1
19242 pinsrb xm5, [r2 + 4], 0 ; prepend [r2 + 4]
19243 vinserti128 m0, m0, xm5, 1
19244 pshufb m0, [r5]
19245 pmaddubsw m2, m0, [r4 + mmsize]
19246 pmulhrsw m2, m3
19247 mova xm0, xm5
19248 pslldq xm5, 1
19249 pinsrb xm5, [r2 + 6], 0 ; prepend [r2 + 6]
19250 vinserti128 m0, m0, xm5, 1
19251 pshufb m0, [r5]
19252 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19253 pmulhrsw m4, m3
19254 mova xm0, xm5
19255 pslldq xm5, 1
19256 pinsrb xm5, [r2 + 8], 0 ; prepend [r2 + 8]
19257 vinserti128 m0, m0, xm5, 1
19258 pshufb m0, [r5]
19259 pmaddubsw m0, [r4 + 3 * mmsize]
19260 pmulhrsw m0, m3
19261 packuswb m1, m2
19262 packuswb m4, m0
19263
19264 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lane of m1 : low lane of m4
19265 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lane of m1 : high lane of m4
19266 punpcklbw m4, m2, m1 ; interleave bytes
19267 punpckhbw m2, m1
19268 punpcklwd m1, m4, m2 ; interleave words
19269 punpckhwd m4, m2
19270 mova m0, [trans8_shuf]
19271 vpermd m1, m0, m1 ; final dword reorder across lanes
19272 vpermd m4, m0, m4
19273
19274 lea r3, [3 * r1]
19275 movq [r0], xm1 ; rows 0..3 come from m1
19276 movhps [r0 + r1], xm1
19277 vextracti128 xm2, m1, 1
19278 movq [r0 + 2 * r1], xm2
19279 movhps [r0 + r3], xm2
19280 lea r0, [r0 + 4 * r1] ; rows 4..7 come from m4
19281 movq [r0], xm4
19282 movhps [r0 + r1], xm4
19283 vextracti128 xm2, m4, 1
19284 movq [r0 + 2 * r1], xm2
19285 movhps [r0 + r3], xm2
19286 RET
19287
;-----------------------------------------------------------------------------
; intra_pred_ang8_16 (AVX2): produces an 8x8 byte block.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * negated stride)
; r0 is first moved to the last row (r0 + 7 * r1) and r1 is negated, so the
; stores below walk from the bottom row upward, four rows per pass.
; m0 holds the 16-byte angHor8_tab_16 weights in both lanes; m1/m2 are shuffle
; masks loaded from intra_pred8_shuff16 (+0 and +8).
;-----------------------------------------------------------------------------
19288 INIT_YMM avx2
19289 cglobal intra_pred_ang8_16, 3,4,7
19290 lea r0, [r0 + r1 * 8]
19291 sub r0, r1 ; r0 -> row 7
19292 neg r1 ; step upward from here on
19293 lea r3, [r1 * 3]
19294 vbroadcasti128 m0, [angHor8_tab_16] ; m0 = factor
19295 mova m1, [intra_pred8_shuff16] ; m1 = 4 of Row shuffle
19296 movu m2, [intra_pred8_shuff16 + 8] ; m2 = 4 of Row shuffle
19297
19298 ; prepare reference pixel
19299 movq xm3, [r2 + 16 + 1] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x]
19300 movhps xm3, [r2 + 2] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x]
19301 pslldq xm3, 1
19302 pinsrb xm3, [r2], 0 ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8]
19303 pshufb xm3, [c_ang8_mode_16]
19304 vinserti128 m3, m3, xm3, 1 ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1 0 2 3 5 6 8]
19305
19306 ; process 4 rows
19307 pshufb m4, m3, m1
19308 pshufb m5, m3, m2
19309 psrldq m3, 4 ; drop 4 reference bytes per lane for the second pass
19310 punpcklbw m6, m5, m4 ; interleave the two shuffled byte streams into pairs
19311 punpckhbw m5, m4
19312 pmaddubsw m6, m0
19313 pmulhrsw m6, [pw_1024] ; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19314 pmaddubsw m5, m0
19315 pmulhrsw m5, [pw_1024]
19316 packuswb m6, m5
19317 vextracti128 xm5, m6, 1
19318 movq [r0], xm6
19319 movhps [r0 + r1], xm6
19320 movq [r0 + r1 * 2], xm5
19321 movhps [r0 + r3], xm5
19322
19323 ; process 4 rows
19324 lea r0, [r0 + r1 * 4] ; r1 is negative: move up four rows
19325 pshufb m4, m3, m1
19326 pshufb m5, m3, m2
19327 punpcklbw m6, m5, m4
19328 punpckhbw m5, m4
19329 pmaddubsw m6, m0
19330 pmulhrsw m6, [pw_1024]
19331 pmaddubsw m5, m0
19332 pmulhrsw m5, [pw_1024]
19333 packuswb m6, m5
19334 vextracti128 xm5, m6, 1
19335 movq [r0], xm6
19336 movhps [r0 + r1], xm6
19337 movq [r0 + r1 * 2], xm5
19338 movhps [r0 + r3], xm5
19339 RET
19340
19341 %if 1
;-----------------------------------------------------------------------------
; intra_pred_ang8_20 (AVX2), variant selected by the enclosing "%if 1".
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = base of the angHor8_tab_20 weight table, r4 = 3 * negated stride
; r0 is first moved to the last row (r0 + 7 * r1) and r1 is negated, so the
; stores walk from the bottom row upward, four rows per pass.
; m5 is loaded from intra_pred_shuff_0_8 + 16, i.e. the (0,1)(1,2).. pairing
; mask in the low lane and the (1,2)(2,3).. mask in the high lane.
; NOTE(review): the palignr memory operand is [r2 - 15], so the 16-byte load
; covers the 15 bytes preceding r2; only its last byte ([r2]) survives the
; shift. Confirm the caller's buffer makes that read safe.
;-----------------------------------------------------------------------------
19342 INIT_YMM avx2
19343 cglobal intra_pred_ang8_20, 3,5,6
19344 lea r0, [r0 + r1 * 8]
19345 sub r0, r1 ; r0 -> row 7
19346 neg r1 ; step upward from here on
19347 lea r3, [angHor8_tab_20]
19348 lea r4, [r1 * 3]
19349 movu m5, [intra_pred_shuff_0_8 + 16]
19350
19351 ; prepare reference pixel
19352 movq xm1, [r2 + 1] ; xm1 = [ 1 2 3 4 5 6 7 8 x x x x x x x x]
19353 movhps xm1, [r2 + 16 + 2] ; xm1 = [ 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8 x]
19354 palignr xm1, xm1, [r2 - 15], 15 ; xm1 = [ 0 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8]
19355 pshufb xm1, [c_ang8_mode_20]
19356 vinserti128 m1, m1, xm1, 1 ; duplicate into both lanes
19357
19358 ; process 4 rows
19359 pshufb m3, m1, m5
19360 psrldq m1, 2
19361 pmaddubsw m3, [r3 + 0 * 16]
19362 pmulhrsw m3, [pw_1024] ; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19363
19364 pshufb m4, m1, [intra_pred_shuff_0_8] ; same pairing mask in both lanes for this step
19365 psrldq m1, 1
19366 pmaddubsw m4, [r3 + 2 * 16]
19367 pmulhrsw m4, [pw_1024]
19368
19369 packuswb m3, m4
19370 vextracti128 xm4, m3, 1
19371 movq [r0], xm3
19372 movq [r0 + r1], xm4
19373 movhps [r0 + r1 * 2], xm3
19374 movhps [r0 + r4], xm4
19375
19376 ; process 4 rows
19377 lea r0, [r0 + r1 * 4] ; r1 is negative: move up four rows
19378 pshufb m3, m1, m5
19379 psrldq m1, 1
19380 pmaddubsw m3, [r3 + 4 * 16]
19381 pmulhrsw m3, [pw_1024]
19382
19383 pshufb m4, m1, m5
19384 pmaddubsw m4, [r3 + 6 * 16]
19385 pmulhrsw m4, [pw_1024]
19386
19387 packuswb m3, m4
19388 vextracti128 xm4, m3, 1
19389 movq [r0], xm3
19390 movq [r0 + r1], xm4
19391 movhps [r0 + r1 * 2], xm3
19392 movhps [r0 + r4], xm4
19393 RET
19394
19395 %else
;-----------------------------------------------------------------------------
; intra_pred_ang8_20 (AVX2), alternative implementation. It sits in the %else
; arm of "%if 1", so it is never assembled as the file stands.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_20 table,
;   r5 = address of the intra_pred_shuff_0_8 pairing mask
; xm5 starts as the 16 bytes at r2 and is extended one byte at a time with
; bytes from [r2 + 16 + 2/3/5/6/8] inserted at position 0. Rows are stored
; top-down.
;-----------------------------------------------------------------------------
19396 INIT_YMM avx2
19397 cglobal intra_pred_ang8_20, 3, 6, 6
19398 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19399 movu xm5, [r2]
19400 lea r5, [intra_pred_shuff_0_8]
19401 mova xm0, xm5 ; low lane = current reference
19402 pslldq xm5, 1
19403 pinsrb xm5, [r2 + 2 + 16], 0
19404 vinserti128 m0, m0, xm5, 1 ; high lane = extended reference
19405 pshufb m0, [r5]
19406
19407 lea r4, [c_ang8_mode_20]
19408 pmaddubsw m1, m0, [r4]
19409 pmulhrsw m1, m3
19410 mova xm0, xm5
19411 pslldq xm5, 1
19412 pinsrb xm5, [r2 + 3 + 16], 0
19413 vinserti128 m0, m0, xm5, 1
19414 pshufb m0, [r5]
19415 pmaddubsw m2, m0, [r4 + mmsize]
19416 pmulhrsw m2, m3
19417 pslldq xm5, 1
19418 pinsrb xm5, [r2 + 5 + 16], 0
19419 vinserti128 m0, m5, xm5, 1 ; here both lanes get the same extended reference
19420 pshufb m0, [r5]
19421 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19422 pmulhrsw m4, m3
19423 pslldq xm5, 1
19424 pinsrb xm5, [r2 + 6 + 16], 0
19425 mova xm0, xm5
19426 pslldq xm5, 1
19427 pinsrb xm5, [r2 + 8 + 16], 0
19428 vinserti128 m0, m0, xm5, 1
19429 pshufb m0, [r5]
19430 pmaddubsw m0, [r4 + 3 * mmsize]
19431 pmulhrsw m0, m3
19432
19433 packuswb m1, m2
19434 packuswb m4, m0
19435
19436 lea r3, [3 * r1]
19437 movq [r0], xm1 ; row 0 = low lane, low qword
19438 vextracti128 xm2, m1, 1
19439 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19440 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19441 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19442 lea r0, [r0 + 4 * r1] ; advance to rows 4..7
19443 movq [r0], xm4
19444 vextracti128 xm2, m4, 1
19445 movq [r0 + r1], xm2
19446 movhps [r0 + 2 * r1], xm4
19447 movhps [r0 + r3], xm2
19448 RET
19449 %endif
19450
;-----------------------------------------------------------------------------
; intra_pred_ang8_21 (AVX2): produces an 8x8 byte block, one 8-byte row per store.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_15 weight table,
;   r5 = address of the intra_pred_shuff_0_8 pairing mask
; Same weight table and reference-extension scheme as intra_pred_ang8_15, but
; the running reference starts at r2 and the inserted bytes come from
; [r2 + 16 + 2/4/6/8]. Rows are stored directly, with no interleave/permute stage.
;-----------------------------------------------------------------------------
19451 INIT_YMM avx2
19452 cglobal intra_pred_ang8_21, 3, 6, 6
19453 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19454 movu xm5, [r2]
19455 lea r5, [intra_pred_shuff_0_8]
19456 mova xm0, xm5 ; low lane = current reference
19457 pslldq xm5, 1
19458 pinsrb xm5, [r2 + 2 + 16], 0
19459 vinserti128 m0, m0, xm5, 1 ; high lane = extended reference
19460 pshufb m0, [r5]
19461
19462 lea r4, [c_ang8_mode_15]
19463 pmaddubsw m1, m0, [r4]
19464 pmulhrsw m1, m3
19465 mova xm0, xm5
19466 pslldq xm5, 1
19467 pinsrb xm5, [r2 + 4 + 16], 0
19468 vinserti128 m0, m0, xm5, 1
19469 pshufb m0, [r5]
19470 pmaddubsw m2, m0, [r4 + mmsize]
19471 pmulhrsw m2, m3
19472 mova xm0, xm5
19473 pslldq xm5, 1
19474 pinsrb xm5, [r2 + 6 + 16], 0
19475 vinserti128 m0, m0, xm5, 1
19476 pshufb m0, [r5]
19477 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19478 pmulhrsw m4, m3
19479 mova xm0, xm5
19480 pslldq xm5, 1
19481 pinsrb xm5, [r2 + 8 + 16], 0
19482 vinserti128 m0, m0, xm5, 1
19483 pshufb m0, [r5]
19484 pmaddubsw m0, [r4 + 3 * mmsize]
19485 pmulhrsw m0, m3
19486 packuswb m1, m2
19487 packuswb m4, m0
19488
19489 lea r3, [3 * r1]
19490 movq [r0], xm1 ; row 0 = low lane, low qword
19491 vextracti128 xm2, m1, 1
19492 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19493 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19494 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19495 lea r0, [r0 + 4 * r1] ; advance to rows 4..7
19496 movq [r0], xm4
19497 vextracti128 xm2, m4, 1
19498 movq [r0 + r1], xm2
19499 movhps [r0 + 2 * r1], xm4
19500 movhps [r0 + r3], xm2
19501 RET
19502
;-----------------------------------------------------------------------------
; intra_pred_ang8_22 (AVX2): produces an 8x8 byte block, one 8-byte row per store.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_14 weight table,
;   r5 = address of the intra_pred_shuff_0_8 pairing mask
; The first pmaddubsw uses the 16 bytes at r2 in both lanes. Before each later
; step xm5 is shifted up one byte and a byte from [r2 + 16 + 2/5/7] is inserted
; at position 0. In the last step only the high lane is refreshed; the low lane
; keeps the pairs from the previous step.
;-----------------------------------------------------------------------------
19503 INIT_YMM avx2
19504 cglobal intra_pred_ang8_22, 3, 6, 6
19505 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19506 movu xm5, [r2]
19507 lea r5, [intra_pred_shuff_0_8]
19508 vinserti128 m0, m5, xm5, 1 ; same reference in both lanes
19509 pshufb m0, [r5]
19510
19511 lea r4, [c_ang8_mode_14]
19512 pmaddubsw m1, m0, [r4]
19513 pmulhrsw m1, m3
19514 pslldq xm5, 1
19515 pinsrb xm5, [r2 + 2 + 16], 0
19516 vinserti128 m0, m5, xm5, 1
19517 pshufb m0, [r5]
19518 pmaddubsw m2, m0, [r4 + mmsize]
19519 pmulhrsw m2, m3
19520 pslldq xm5, 1
19521 pinsrb xm5, [r2 + 5 + 16], 0
19522 vinserti128 m0, m5, xm5, 1
19523 pshufb m0, [r5]
19524 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19525 pmulhrsw m4, m3
19526 pslldq xm5, 1
19527 pinsrb xm5, [r2 + 7 + 16], 0
19528 pshufb xm5, [r5]
19529 vinserti128 m0, m0, xm5, 1 ; low lane keeps the previous pairs, high lane gets the new ones
19530 pmaddubsw m0, [r4 + 3 * mmsize]
19531 pmulhrsw m0, m3
19532 packuswb m1, m2
19533 packuswb m4, m0
19534
19535 lea r3, [3 * r1]
19536 movq [r0], xm1 ; row 0 = low lane, low qword
19537 vextracti128 xm2, m1, 1
19538 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19539 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19540 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19541 lea r0, [r0 + 4 * r1] ; advance to rows 4..7
19542 movq [r0], xm4
19543 vextracti128 xm2, m4, 1
19544 movq [r0 + r1], xm2
19545 movhps [r0 + 2 * r1], xm4
19546 movhps [r0 + r3], xm2
19547 RET
19548
;-----------------------------------------------------------------------------
; intra_pred_ang8_14 (AVX2): produces an 8x8 byte block.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_14 weight table,
;   r5 = address of the intra_pred_shuff_0_8 pairing mask
; Same weight table and step structure as intra_pred_ang8_22, but the
; reference starts as the 16 bytes at r2 + 16 with byte 0 replaced by [r2],
; and the inserted bytes come from [r2 + 2/5/7]. The result goes through the
; interleave/permute stage before storing (apparently an 8x8 transpose;
; trans8_shuf is defined elsewhere - TODO confirm).
;-----------------------------------------------------------------------------
19549 INIT_YMM avx2
19550 cglobal intra_pred_ang8_14, 3, 6, 6
19551 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19552 movu xm5, [r2 + 16]
19553 pinsrb xm5, [r2], 0 ; byte 0 <- [r2]
19554 lea r5, [intra_pred_shuff_0_8]
19555 vinserti128 m0, m5, xm5, 1 ; same reference in both lanes
19556 pshufb m0, [r5]
19557
19558 lea r4, [c_ang8_mode_14]
19559 pmaddubsw m1, m0, [r4]
19560 pmulhrsw m1, m3
19561 pslldq xm5, 1
19562 pinsrb xm5, [r2 + 2], 0
19563 vinserti128 m0, m5, xm5, 1
19564 pshufb m0, [r5]
19565 pmaddubsw m2, m0, [r4 + mmsize]
19566 pmulhrsw m2, m3
19567 pslldq xm5, 1
19568 pinsrb xm5, [r2 + 5], 0
19569 vinserti128 m0, m5, xm5, 1
19570 pshufb m0, [r5]
19571 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19572 pmulhrsw m4, m3
19573 pslldq xm5, 1
19574 pinsrb xm5, [r2 + 7], 0
19575 pshufb xm5, [r5]
19576 vinserti128 m0, m0, xm5, 1 ; low lane keeps the previous pairs, high lane gets the new ones
19577 pmaddubsw m0, [r4 + 3 * mmsize]
19578 pmulhrsw m0, m3
19579 packuswb m1, m2
19580 packuswb m4, m0
19581
19582 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lane of m1 : low lane of m4
19583 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lane of m1 : high lane of m4
19584 punpcklbw m4, m2, m1 ; interleave bytes
19585 punpckhbw m2, m1
19586 punpcklwd m1, m4, m2 ; interleave words
19587 punpckhwd m4, m2
19588 mova m0, [trans8_shuf]
19589 vpermd m1, m0, m1 ; final dword reorder across lanes
19590 vpermd m4, m0, m4
19591
19592 lea r3, [3 * r1]
19593 movq [r0], xm1 ; rows 0..3 come from m1
19594 movhps [r0 + r1], xm1
19595 vextracti128 xm2, m1, 1
19596 movq [r0 + 2 * r1], xm2
19597 movhps [r0 + r3], xm2
19598 lea r0, [r0 + 4 * r1] ; rows 4..7 come from m4
19599 movq [r0], xm4
19600 movhps [r0 + r1], xm4
19601 vextracti128 xm2, m4, 1
19602 movq [r0 + 2 * r1], xm2
19603 movhps [r0 + r3], xm2
19604 RET
19605
;-----------------------------------------------------------------------------
; intra_pred_ang8_13 (AVX2): produces an 8x8 byte block.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_13 weight table,
;   r5 = address of the intra_pred_shuff_0_8 pairing mask
; The reference starts as the 16 bytes at r2 + 16 with byte 0 replaced by [r2].
; It is extended twice, with bytes from [r2 + 4] and [r2 + 7]; m0's lanes are
; updated one at a time so successive pmaddubsw steps see old:new, new:new and
; new:newer reference pairs. The result goes through the interleave/permute
; stage before storing (apparently an 8x8 transpose; trans8_shuf is defined
; elsewhere - TODO confirm).
;-----------------------------------------------------------------------------
19606 INIT_YMM avx2
19607 cglobal intra_pred_ang8_13, 3, 6, 6
19608 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19609 movu xm5, [r2 + 16]
19610 pinsrb xm5, [r2], 0 ; byte 0 <- [r2]
19611 lea r5, [intra_pred_shuff_0_8]
19612 vinserti128 m0, m5, xm5, 1 ; same reference in both lanes
19613 pshufb m0, [r5]
19614
19615 lea r4, [c_ang8_mode_13]
19616 pmaddubsw m1, m0, [r4]
19617 pmulhrsw m1, m3
19618 pslldq xm5, 1
19619 pinsrb xm5, [r2 + 4], 0 ; prepend [r2 + 4]
19620 pshufb xm4, xm5, [r5] ; xm4 = pairs of the extended reference
19621 vinserti128 m0, m0, xm4, 1 ; high lane <- extended pairs
19622 pmaddubsw m2, m0, [r4 + mmsize]
19623 pmulhrsw m2, m3
19624 vinserti128 m0, m0, xm4, 0 ; low lane <- extended pairs as well
19625 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19626 pmulhrsw m4, m3
19627 pslldq xm5, 1
19628 pinsrb xm5, [r2 + 7], 0 ; prepend [r2 + 7]
19629 pshufb xm5, [r5]
19630 vinserti128 m0, m0, xm5, 1 ; high lane <- newest pairs
19631 pmaddubsw m0, [r4 + 3 * mmsize]
19632 pmulhrsw m0, m3
19633 packuswb m1, m2
19634 packuswb m4, m0
19635
19636 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lane of m1 : low lane of m4
19637 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lane of m1 : high lane of m4
19638 punpcklbw m4, m2, m1 ; interleave bytes
19639 punpckhbw m2, m1
19640 punpcklwd m1, m4, m2 ; interleave words
19641 punpckhwd m4, m2
19642 mova m0, [trans8_shuf]
19643 vpermd m1, m0, m1 ; final dword reorder across lanes
19644 vpermd m4, m0, m4
19645
19646 lea r3, [3 * r1]
19647 movq [r0], xm1 ; rows 0..3 come from m1
19648 movhps [r0 + r1], xm1
19649 vextracti128 xm2, m1, 1
19650 movq [r0 + 2 * r1], xm2
19651 movhps [r0 + r3], xm2
19652 lea r0, [r0 + 4 * r1] ; rows 4..7 come from m4
19653 movq [r0], xm4
19654 movhps [r0 + r1], xm4
19655 vextracti128 xm2, m4, 1
19656 movq [r0 + 2 * r1], xm2
19657 movhps [r0 + r3], xm2
19658 RET
19659
19660
;-----------------------------------------------------------------------------
; intra_pred_ang8_23 (AVX2): produces an 8x8 byte block, one 8-byte row per store.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_13 weight table,
;   r5 = address of the intra_pred_shuff_0_8 pairing mask
; Same weight table and lane-update scheme as intra_pred_ang8_13, but the
; reference starts at r2 and the inserted bytes come from [r2 + 16 + 4] and
; [r2 + 16 + 7]. Rows are stored directly, with no interleave/permute stage.
;-----------------------------------------------------------------------------
19661 INIT_YMM avx2
19662 cglobal intra_pred_ang8_23, 3, 6, 6
19663 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19664 movu xm5, [r2]
19665 lea r5, [intra_pred_shuff_0_8]
19666 vinserti128 m0, m5, xm5, 1 ; same reference in both lanes
19667 pshufb m0, [r5]
19668
19669 lea r4, [c_ang8_mode_13]
19670 pmaddubsw m1, m0, [r4]
19671 pmulhrsw m1, m3
19672 pslldq xm5, 1
19673 pinsrb xm5, [r2 + 4 + 16], 0
19674 pshufb xm4, xm5, [r5] ; xm4 = pairs of the extended reference
19675 vinserti128 m0, m0, xm4, 1 ; high lane <- extended pairs
19676 pmaddubsw m2, m0, [r4 + mmsize]
19677 pmulhrsw m2, m3
19678 vinserti128 m0, m0, xm4, 0 ; low lane <- extended pairs as well
19679 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19680 pmulhrsw m4, m3
19681 pslldq xm5, 1
19682 pinsrb xm5, [r2 + 7 + 16], 0
19683 pshufb xm5, [r5]
19684 vinserti128 m0, m0, xm5, 1 ; high lane <- newest pairs
19685 pmaddubsw m0, [r4 + 3 * mmsize]
19686 pmulhrsw m0, m3
19687
19688 packuswb m1, m2
19689 packuswb m4, m0
19690
19691 lea r3, [3 * r1]
19692 movq [r0], xm1 ; row 0 = low lane, low qword
19693 vextracti128 xm2, m1, 1
19694 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19695 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19696 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19697 lea r0, [r0 + 4 * r1] ; advance to rows 4..7
19698 movq [r0], xm4
19699 vextracti128 xm2, m4, 1
19700 movq [r0 + r1], xm2
19701 movhps [r0 + 2 * r1], xm4
19702 movhps [r0 + r3], xm2
19703 RET
19704
;-----------------------------------------------------------------------------
; intra_pred_ang8_12 (AVX2): produces an 8x8 byte block.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_24 weight table
; The reference is the 16 bytes at r2 + 16 with byte 0 replaced by [r2],
; already paired by intra_pred_shuff_0_8. The first three pmaddubsw steps share
; it; the fourth uses the pairs shifted up by one pair, with a new leading pair
; ([r2 + 6], [r2 + 0]). The result goes through the interleave/permute stage
; before storing (apparently an 8x8 transpose; trans8_shuf is defined
; elsewhere - TODO confirm).
;-----------------------------------------------------------------------------
19705 INIT_YMM avx2
19706 cglobal intra_pred_ang8_12, 3, 5, 5
19707 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19708 movu xm1, [r2 + 16]
19709 pinsrb xm1, [r2], 0 ; byte 0 <- [r2]
19710 pshufb xm1, [intra_pred_shuff_0_8] ; adjacent byte pairs
19711 vinserti128 m0, m1, xm1, 1 ; duplicate into both lanes
19712
19713 lea r4, [c_ang8_mode_24]
19714 pmaddubsw m1, m0, [r4]
19715 pmulhrsw m1, m3
19716 pmaddubsw m2, m0, [r4 + mmsize]
19717 pmulhrsw m2, m3
19718 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19719 pmulhrsw m4, m3
19720 pslldq xm0, 2 ; make room for one new byte pair at the front
19721 pinsrb xm0, [r2 + 6], 0
19722 pinsrb xm0, [r2 + 0], 1
19723 vinserti128 m0, m0, xm0, 1 ; duplicate the updated pairs into the high lane
19724 pmaddubsw m0, [r4 + 3 * mmsize]
19725 pmulhrsw m0, m3
19726 packuswb m1, m2
19727 packuswb m4, m0
19728
19729 vperm2i128 m2, m1, m4, 00100000b ; m2 = low lane of m1 : low lane of m4
19730 vperm2i128 m1, m1, m4, 00110001b ; m1 = high lane of m1 : high lane of m4
19731 punpcklbw m4, m2, m1 ; interleave bytes
19732 punpckhbw m2, m1
19733 punpcklwd m1, m4, m2 ; interleave words
19734 punpckhwd m4, m2
19735 mova m0, [trans8_shuf]
19736 vpermd m1, m0, m1 ; final dword reorder across lanes
19737 vpermd m4, m0, m4
19738
19739 lea r3, [3 * r1]
19740 movq [r0], xm1 ; rows 0..3 come from m1
19741 movhps [r0 + r1], xm1
19742 vextracti128 xm2, m1, 1
19743 movq [r0 + 2 * r1], xm2
19744 movhps [r0 + r3], xm2
19745 lea r0, [r0 + 4 * r1] ; rows 4..7 come from m4
19746 movq [r0], xm4
19747 movhps [r0 + r1], xm4
19748 vextracti128 xm2, m4, 1
19749 movq [r0 + 2 * r1], xm2
19750 movhps [r0 + r3], xm2
19751 RET
19752
;-----------------------------------------------------------------------------
; intra_pred_ang8_24 (AVX2): produces an 8x8 byte block, one 8-byte row per store.
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1), r4 = base of the c_ang8_mode_24 weight table
; Same weight table and step structure as intra_pred_ang8_12, but the
; reference is the 16 bytes at r2 and the new leading pair for the fourth step
; is ([r2 + 16 + 6], [r2 + 0]). Rows are stored directly, with no
; interleave/permute stage.
;-----------------------------------------------------------------------------
19753 INIT_YMM avx2
19754 cglobal intra_pred_ang8_24, 3, 5, 5
19755 mova m3, [pw_1024] ; pmulhrsw multiplier; a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm
19756 vbroadcasti128 m0, [r2] ; same 16 source bytes in both 128-bit lanes
19757
19758 pshufb m0, [intra_pred_shuff_0_8] ; adjacent byte pairs
19759
19760 lea r4, [c_ang8_mode_24]
19761 pmaddubsw m1, m0, [r4]
19762 pmulhrsw m1, m3
19763 pmaddubsw m2, m0, [r4 + mmsize]
19764 pmulhrsw m2, m3
19765 pmaddubsw m4, m0, [r4 + 2 * mmsize]
19766 pmulhrsw m4, m3
19767 pslldq xm0, 2 ; make room for one new byte pair at the front
19768 pinsrb xm0, [r2 + 16 + 6], 0
19769 pinsrb xm0, [r2 + 0], 1
19770 vinserti128 m0, m0, xm0, 1 ; duplicate the updated pairs into the high lane
19771 pmaddubsw m0, [r4 + 3 * mmsize]
19772 pmulhrsw m0, m3
19773 packuswb m1, m2
19774 packuswb m4, m0
19775
19776 lea r3, [3 * r1]
19777 movq [r0], xm1 ; row 0 = low lane, low qword
19778 vextracti128 xm2, m1, 1
19779 movq [r0 + r1], xm2 ; row 1 = high lane, low qword
19780 movhps [r0 + 2 * r1], xm1 ; row 2 = low lane, high qword
19781 movhps [r0 + r3], xm2 ; row 3 = high lane, high qword
19782 lea r0, [r0 + 4 * r1] ; advance to rows 4..7
19783 movq [r0], xm4
19784 vextracti128 xm2, m4, 1
19785 movq [r0 + r1], xm2
19786 movhps [r0 + 2 * r1], xm4
19787 movhps [r0 + r3], xm2
19788 RET
19789
; INTRA_PRED_ANG16_MC0 dst0, dst1, idx
;   Weights m1 and m2 with table entry [r4 + idx * mmsize], scales both by m0
;   via pmulhrsw, packs to bytes and stores two 16-byte rows:
;   low 128-bit lane -> [dst0], high 128-bit lane -> [dst1].
;   Inputs: m0 (pmulhrsw multiplier), m1/m2 (paired source bytes), r4 (table base).
;   Clobbers: m3, m4.
19790 %macro INTRA_PRED_ANG16_MC0 3
19791 pmaddubsw m3, m1, [r4 + %3 * mmsize]
19792 pmulhrsw m3, m0
19793 pmaddubsw m4, m2, [r4 + %3 * mmsize]
19794 pmulhrsw m4, m0
19795 packuswb m3, m4 ; per lane: 8 bytes from m3, then 8 bytes from m4
19796 movu [%1], xm3
19797 vextracti128 xm4, m3, 1
19798 movu [%2], xm4
19799 %endmacro
19800
; INTRA_PRED_ANG16_MC1 idx
;   Emits four rows at r0, r0 + r1, r0 + 2 * r1 and r0 + r3 using two
;   INTRA_PRED_ANG16_MC0 expansions with table entries idx and idx + 1.
;   r3 is used as the fourth row offset (presumably 3 * r1, set by the caller).
19801 %macro INTRA_PRED_ANG16_MC1 1
19802 INTRA_PRED_ANG16_MC0 r0, r0 + r1, %1
19803 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, (%1 + 1)
19804 %endmacro
19805
; INTRA_PRED_ANG16_MC2 off
;   Loads the source for one step: m1 = 16 bytes at [r2 + off], m2 = 16 bytes
;   at [r2 + off + 8], each broadcast to both lanes and shuffled with m5.
;   Inputs: r2 (source pointer), m5 (shuffle mask). Outputs: m1, m2.
19806 %macro INTRA_PRED_ANG16_MC2 1
19807 vbroadcasti128 m1, [r2 + %1]
19808 pshufb m1, m5
19809 vbroadcasti128 m2, [r2 + (%1 + 8)]
19810 pshufb m2, m5
19811 %endmacro
19812
; INTRA_PRED_ANG16_MC3 dst, idx
;   Produces a single 16-byte row: combines the low lanes of m1 and m2 into m1,
;   weights with [r4 + idx * mmsize], scales by m0, packs, and stores the low
;   128 bits to [dst].
;   Inputs: m0, m1, m2, r4. Modifies m1 and m3.
19813 %macro INTRA_PRED_ANG16_MC3 2
19814 vperm2i128 m1, m1, m2, 00100000b ; m1 = low lane of m1 : low lane of m2
19815 pmaddubsw m3, m1, [r4 + (%2 * mmsize)]
19816 pmulhrsw m3, m0
19817 packuswb m3, m3
19818 vpermq m3, m3, 11011000b ; qword order 0,2,1,3: gathers both lanes' bytes into the low 128 bits
19819 movu [%1], xm3
19820 %endmacro
19821
; INTRA_PRED_ANG16_MC4 dst0, dst1, idx
;   Computes one new row of words into m4 (low lanes of m1 and m2, weighted
;   with [r4 + idx * mmsize], scaled by m0), packs it together with the value
;   already in m3, and stores two 16-byte rows to [dst0] and [dst1].
;   m3 is therefore an input: it must hold the other row's word results on
;   entry (NOTE(review): set up by the caller, not visible in this macro).
;   Inputs: m0, m1, m2, m3, r4. Modifies m1, m3, m4.
19822 %macro INTRA_PRED_ANG16_MC4 3
19823 vperm2i128 m1, m1, m2, 00100000b ; m1 = low lane of m1 : low lane of m2
19824 pmaddubsw m4, m1, [r4 + (%3 * mmsize)]
19825 pmulhrsw m4, m0
19826 packuswb m3, m4
19827 vpermq m3, m3, 11011000b ; qword order 0,2,1,3: low 128 bits = bytes of m3, high = bytes of m4
19828 movu [%1], xm3
19829 vextracti128 xm3, m3, 1
19830 movu [%2], xm3
19831 %endmacro
19832
19833 %if ARCH_X86_64 == 1
; INTRA_PRED_TRANS_STORE_16x16
;   Takes eight 256-bit registers m0..m7, interleaves them at byte, word and
;   dword granularity, fixes up the qword order with vpermq, and stores the
;   result as sixteen 16-byte rows starting at r0 (row stride r1, r3 used as
;   the 3 * r1 offset). The unpack ladder looks like a 16x16 byte transpose of
;   the input - TODO confirm against the callers.
;   Uses m8 as an extra temporary (the macro sits inside an ARCH_X86_64 guard).
;   On exit r0 has advanced by 12 rows; m0..m8 are clobbered.
19834 %macro INTRA_PRED_TRANS_STORE_16x16 0
19835 punpcklbw m8, m0, m1 ; stage 1: interleave bytes of register pairs
19836 punpckhbw m0, m1
19837
19838 punpcklbw m1, m2, m3
19839 punpckhbw m2, m3
19840
19841 punpcklbw m3, m4, m5
19842 punpckhbw m4, m5
19843
19844 punpcklbw m5, m6, m7
19845 punpckhbw m6, m7
19846
19847 punpcklwd m7, m8, m1 ; stage 2: interleave words
19848 punpckhwd m8, m1
19849
19850 punpcklwd m1, m3, m5
19851 punpckhwd m3, m5
19852
19853 punpcklwd m5, m0, m2
19854 punpckhwd m0, m2
19855
19856 punpcklwd m2, m4, m6
19857 punpckhwd m4, m6
19858
19859 punpckldq m6, m7, m1 ; stage 3: interleave dwords
19860 punpckhdq m7, m1
19861
19862 punpckldq m1, m8, m3
19863 punpckhdq m8, m3
19864
19865 punpckldq m3, m5, m2
19866 punpckhdq m5, m2
19867
19868 punpckldq m2, m0, m4
19869 punpckhdq m0, m4
19870
19871 vpermq m6, m6, 0xD8 ; 0xD8 = qword order 0,2,1,3 (swap the two middle qwords)
19872 vpermq m7, m7, 0xD8
19873 vpermq m1, m1, 0xD8
19874 vpermq m8, m8, 0xD8
19875 vpermq m3, m3, 0xD8
19876 vpermq m5, m5, 0xD8
19877 vpermq m2, m2, 0xD8
19878 vpermq m0, m0, 0xD8
19879
19880 movu [r0], xm6 ; rows 0..3: m6 (low, high lane), m7 (low, high lane)
19881 vextracti128 xm4, m6, 1
19882 movu [r0 + r1], xm4
19883
19884 movu [r0 + 2 * r1], xm7
19885 vextracti128 xm4, m7, 1
19886 movu [r0 + r3], xm4
19887
19888 lea r0, [r0 + 4 * r1]
19889
19890 movu [r0], xm1 ; rows 4..7: m1, m8
19891 vextracti128 xm4, m1, 1
19892 movu [r0 + r1], xm4
19893
19894 movu [r0 + 2 * r1], xm8
19895 vextracti128 xm4, m8, 1
19896 movu [r0 + r3], xm4
19897
19898 lea r0, [r0 + 4 * r1]
19899
19900 movu [r0], xm3 ; rows 8..11: m3, m5
19901 vextracti128 xm4, m3, 1
19902 movu [r0 + r1], xm4
19903
19904 movu [r0 + 2 * r1], xm5
19905 vextracti128 xm4, m5, 1
19906 movu [r0 + r3], xm4
19907
19908 lea r0, [r0 + 4 * r1]
19909
19910 movu [r0], xm2 ; rows 12..15: m2, m0
19911 vextracti128 xm4, m2, 1
19912 movu [r0 + r1], xm4
19913
19914 movu [r0 + 2 * r1], xm0
19915 vextracti128 xm4, m0, 1
19916 movu [r0 + r3], xm4
19917 %endmacro
19918
; INTRA_PRED_ANG16_CAL_ROW out, tmp, idx
;   out = packuswb of (m9 weighted) and (m10 weighted), both using table entry
;   [r4 + idx * mmsize] and scaled by m11 via pmulhrsw. Nothing is stored.
;   Inputs: m9, m10 (paired source bytes), m11 (pmulhrsw multiplier), r4 (table base).
;   Clobbers: tmp.
19919 %macro INTRA_PRED_ANG16_CAL_ROW 3
19920 pmaddubsw %1, m9, [r4 + (%3 * mmsize)]
19921 pmulhrsw %1, m11
19922 pmaddubsw %2, m10, [r4 + (%3 * mmsize)]
19923 pmulhrsw %2, m11
19924 packuswb %1, %2
19925 %endmacro
19926
19927
;-----------------------------------------------------------------------------
; intra_pred_ang16_12 (AVX2): produces a 16x16 byte block, two 16-byte rows per
; step (low lane -> one row, high lane -> the next).
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1)
;   m0/m1 = first/second 16 bytes of angHor_tab_12, broadcast to both lanes
;   m2    = pw_1024 pmulhrsw multiplier (a rounded >> 5 if it holds words of
;           1024 - TODO confirm)
;   m7/m8 = shuffle masks from ang16_shuf_mode12 (+0 and +mmsize)
;   m3    = reference window, m6 = bytes that follow it (fed in by palignr)
; mmsize is 32 here (INIT_YMM), so [r2 + mmsize - 2] is r2 + 30.
; Each step slides the window by 2 more bytes (palignr 2, 4, ... 14).
;-----------------------------------------------------------------------------
19928 INIT_YMM avx2
19929 cglobal intra_pred_ang16_12, 3,4,9
19930 vbroadcasti128 m0, [angHor_tab_12]
19931 vbroadcasti128 m1, [angHor_tab_12 + mmsize/2]
19932 mova m2, [pw_1024]
19933 mova m7, [ang16_shuf_mode12]
19934 mova m8, [ang16_shuf_mode12 + mmsize]
19935 lea r3, [r1 * 3]
19936
19937 movu xm4, [r2 + mmsize - 2]
19938 pinsrb xm4, [r2 + 0], 2 ; overwrite bytes 2, 1, 0 with [r2], [r2 + 6], [r2 + 13]
19939 pinsrb xm4, [r2 + 6], 1
19940 pinsrb xm4, [r2 + 13], 0
19941 vbroadcasti128 m6, [r2 + mmsize + 14]
19942 vinserti128 m3, m4, xm4, 1 ; same reference window in both lanes
19943
19944 pshufb m4, m3, m7 ; rows 0-1
19945 pshufb m5, m3, m8
19946 pmaddubsw m4, m0
19947 pmaddubsw m5, m1
19948 pmulhrsw m4, m2
19949 pmulhrsw m5, m2
19950 packuswb m4, m5
19951 movu [r0], xm4
19952 vextracti128 [r0 + r1], m4, 1
19953
19954 palignr m5, m6, m3, 2 ; rows 2-3
19955 pshufb m4, m5, m7
19956 pshufb m5, m8
19957
19958 pmaddubsw m4, m0
19959 pmaddubsw m5, m1
19960 pmulhrsw m4, m2
19961 pmulhrsw m5, m2
19962 packuswb m4, m5
19963 movu [r0 + r1 * 2], xm4
19964 vextracti128 [r0 + r3], m4, 1
19965 lea r0, [r0 + r1 * 4]
19966
19967 palignr m5, m6, m3, 4 ; rows 4-5
19968 pshufb m4, m5, m7
19969 pshufb m5, m8
19970
19971 pmaddubsw m4, m0
19972 pmaddubsw m5, m1
19973 pmulhrsw m4, m2
19974 pmulhrsw m5, m2
19975 packuswb m4, m5
19976 movu [r0], xm4
19977 vextracti128 [r0 + r1], m4, 1
19978
19979 palignr m5, m6, m3, 6 ; rows 6-7
19980 pshufb m4, m5, m7
19981 pshufb m5, m8
19982
19983 pmaddubsw m4, m0
19984 pmaddubsw m5, m1
19985 pmulhrsw m4, m2
19986 pmulhrsw m5, m2
19987 packuswb m4, m5
19988 movu [r0 + r1 * 2], xm4
19989 vextracti128 [r0 + r3], m4, 1
19990 lea r0, [r0 + r1 * 4]
19991
19992 palignr m5, m6, m3, 8 ; rows 8-9
19993 pshufb m4, m5, m7
19994 pshufb m5, m8
19995
19996 pmaddubsw m4, m0
19997 pmaddubsw m5, m1
19998 pmulhrsw m4, m2
19999 pmulhrsw m5, m2
20000 packuswb m4, m5
20001 movu [r0], xm4
20002 vextracti128 [r0 + r1], m4, 1
20003
20004 palignr m5, m6, m3, 10 ; rows 10-11
20005 pshufb m4, m5, m7
20006 pshufb m5, m8
20007
20008 pmaddubsw m4, m0
20009 pmaddubsw m5, m1
20010 pmulhrsw m4, m2
20011 pmulhrsw m5, m2
20012 packuswb m4, m5
20013 movu [r0 + r1 * 2], xm4
20014 vextracti128 [r0 + r3], m4, 1
20015 lea r0, [r0 + r1 * 4]
20016
20017 palignr m5, m6, m3, 12 ; rows 12-13
20018 pshufb m4, m5, m7
20019 pshufb m5, m8
20020
20021 pmaddubsw m4, m0
20022 pmaddubsw m5, m1
20023 pmulhrsw m4, m2
20024 pmulhrsw m5, m2
20025 packuswb m4, m5
20026 movu [r0], xm4
20027 vextracti128 [r0 + r1], m4, 1
20028
20029 palignr m5, m6, m3, 14 ; rows 14-15
20030 pshufb m4, m5, m7
20031 pshufb m5, m8
20032
20033 pmaddubsw m4, m0
20034 pmaddubsw m5, m1
20035 pmulhrsw m4, m2
20036 pmulhrsw m5, m2
20037 packuswb m4, m5
20038 movu [r0 + r1 * 2], xm4
20039 vextracti128 [r0 + r3], m4, 1
20040 RET
20041
;-----------------------------------------------------------------------------
; intra_pred_ang16_13 (AVX2): produces a 16x16 byte block, two 16-byte rows per
; step (low lane -> one row, high lane -> the next).
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1)
;   m0/m1 = first/second 16 bytes of angHor_tab_13, broadcast to both lanes
;   m2    = pw_1024 pmulhrsw multiplier (a rounded >> 5 if it holds words of
;           1024 - TODO confirm)
;   m7/m8 = shuffle masks from ang16_shuf_mode13 (+0 and +mmsize)
;   m3    = reference window, m6 = bytes that follow it (fed in by palignr)
; The window is built from the bytes at r2 + mmsize + 1 (mmsize = 32), preceded
; by the top 5 bytes of [r2] after it is shuffled by the mask at
; ang16_shuf_mode13 + 2 * mmsize (palignr by 11).
; Each step slides the window by 2 more bytes (palignr 2, 4, ... 14).
;-----------------------------------------------------------------------------
20042 INIT_YMM avx2
20043 cglobal intra_pred_ang16_13, 3,4,9
20044 vbroadcasti128 m0, [angHor_tab_13]
20045 vbroadcasti128 m1, [angHor_tab_13 + mmsize/2]
20046 mova m2, [pw_1024]
20047 mova m7, [ang16_shuf_mode13]
20048 mova m8, [ang16_shuf_mode13 + mmsize]
20049 lea r3, [r1 * 3]
20050
20051 vbroadcasti128 m3, [r2 + mmsize + 1]
20052 vbroadcasti128 m4, [r2]
20053 pshufb m4, [ang16_shuf_mode13 + mmsize * 2]
20054
20055 palignr m3, m4, 11 ; per lane: bytes 11..15 of m4 followed by bytes 0..10 of m3
20056 vbroadcasti128 m6, [r2 + mmsize + 12]
20057
20058 pshufb m4, m3, m7 ; rows 0-1
20059 pshufb m5, m3, m8
20060 pmaddubsw m4, m0
20061 pmaddubsw m5, m1
20062 pmulhrsw m4, m2
20063 pmulhrsw m5, m2
20064 packuswb m4, m5
20065 movu [r0], xm4
20066 vextracti128 [r0 + r1], m4, 1
20067
20068 palignr m5, m6, m3, 2 ; rows 2-3
20069 pshufb m4, m5, m7
20070 pshufb m5, m8
20071
20072 pmaddubsw m4, m0
20073 pmaddubsw m5, m1
20074 pmulhrsw m4, m2
20075 pmulhrsw m5, m2
20076 packuswb m4, m5
20077 movu [r0 + r1 * 2], xm4
20078 vextracti128 [r0 + r3], m4, 1
20079 lea r0, [r0 + r1 * 4]
20080
20081 palignr m5, m6, m3, 4 ; rows 4-5
20082 pshufb m4, m5, m7
20083 pshufb m5, m8
20084
20085 pmaddubsw m4, m0
20086 pmaddubsw m5, m1
20087 pmulhrsw m4, m2
20088 pmulhrsw m5, m2
20089 packuswb m4, m5
20090 movu [r0], xm4
20091 vextracti128 [r0 + r1], m4, 1
20092
20093 palignr m5, m6, m3, 6 ; rows 6-7
20094 pshufb m4, m5, m7
20095 pshufb m5, m8
20096
20097 pmaddubsw m4, m0
20098 pmaddubsw m5, m1
20099 pmulhrsw m4, m2
20100 pmulhrsw m5, m2
20101 packuswb m4, m5
20102 movu [r0 + r1 * 2], xm4
20103 vextracti128 [r0 + r3], m4, 1
20104 lea r0, [r0 + r1 * 4]
20105
20106 palignr m5, m6, m3, 8 ; rows 8-9
20107 pshufb m4, m5, m7
20108 pshufb m5, m8
20109
20110 pmaddubsw m4, m0
20111 pmaddubsw m5, m1
20112 pmulhrsw m4, m2
20113 pmulhrsw m5, m2
20114 packuswb m4, m5
20115 movu [r0], xm4
20116 vextracti128 [r0 + r1], m4, 1
20117
20118 palignr m5, m6, m3, 10 ; rows 10-11
20119 pshufb m4, m5, m7
20120 pshufb m5, m8
20121
20122 pmaddubsw m4, m0
20123 pmaddubsw m5, m1
20124 pmulhrsw m4, m2
20125 pmulhrsw m5, m2
20126 packuswb m4, m5
20127 movu [r0 + r1 * 2], xm4
20128 vextracti128 [r0 + r3], m4, 1
20129 lea r0, [r0 + r1 * 4]
20130
20131 palignr m5, m6, m3, 12 ; rows 12-13
20132 pshufb m4, m5, m7
20133 pshufb m5, m8
20134
20135 pmaddubsw m4, m0
20136 pmaddubsw m5, m1
20137 pmulhrsw m4, m2
20138 pmulhrsw m5, m2
20139 packuswb m4, m5
20140 movu [r0], xm4
20141 vextracti128 [r0 + r1], m4, 1
20142
20143 palignr m5, m6, m3, 14 ; rows 14-15
20144 pshufb m4, m5, m7
20145 pshufb m5, m8
20146
20147 pmaddubsw m4, m0
20148 pmaddubsw m5, m1
20149 pmulhrsw m4, m2
20150 pmulhrsw m5, m2
20151 packuswb m4, m5
20152 movu [r0 + r1 * 2], xm4
20153 vextracti128 [r0 + r3], m4, 1
20154 RET
20155
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 (AVX2): produces a 16x16 byte block, two 16-byte rows per
; step (low lane -> one row, high lane -> the next).
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1)
;   m0/m1 = first/second 16 bytes of angHor_tab_14, broadcast to both lanes
;   m2    = pw_1024 pmulhrsw multiplier (a rounded >> 5 if it holds words of
;           1024 - TODO confirm)
;   m7/m8 = shuffle masks from ang16_shuf_mode14 (+0 and +mmsize)
;   m3    = reference window, m6 = bytes that follow it (fed in by palignr)
; The window is built from the bytes at r2 + mmsize + 1 (mmsize = 32), preceded
; by the top 7 bytes of [r2] after it is shuffled by the mask at
; ang16_shuf_mode14 + 2 * mmsize (palignr by 9).
; Each step slides the window by 2 more bytes (palignr 2, 4, ... 14).
;-----------------------------------------------------------------------------
20156 INIT_YMM avx2
20157 cglobal intra_pred_ang16_14, 3,4,9
20158 vbroadcasti128 m0, [angHor_tab_14]
20159 vbroadcasti128 m1, [angHor_tab_14 + mmsize/2]
20160 mova m2, [pw_1024]
20161 mova m7, [ang16_shuf_mode14]
20162 mova m8, [ang16_shuf_mode14 + mmsize]
20163 lea r3, [r1 * 3]
20164
20165 vbroadcasti128 m3, [r2 + mmsize + 1]
20166 vbroadcasti128 m4, [r2]
20167 pshufb m4, [ang16_shuf_mode14 + mmsize * 2]
20168 palignr m3, m4, 9 ; per lane: bytes 9..15 of m4 followed by bytes 0..8 of m3
20169 vbroadcasti128 m6, [r2 + mmsize + 10]
20170
20171 pshufb m4, m3, m7 ; rows 0-1
20172 pshufb m5, m3, m8
20173 pmaddubsw m4, m0
20174 pmaddubsw m5, m1
20175 pmulhrsw m4, m2
20176 pmulhrsw m5, m2
20177 packuswb m4, m5
20178 movu [r0], xm4
20179 vextracti128 [r0 + r1], m4, 1
20180
20181 palignr m5, m6, m3, 2 ; rows 2-3
20182 pshufb m4, m5, m7
20183 pshufb m5, m8
20184
20185 pmaddubsw m4, m0
20186 pmaddubsw m5, m1
20187 pmulhrsw m4, m2
20188 pmulhrsw m5, m2
20189 packuswb m4, m5
20190 movu [r0 + r1 * 2], xm4
20191 vextracti128 [r0 + r3], m4, 1
20192 lea r0, [r0 + r1 * 4]
20193
20194 palignr m5, m6, m3, 4 ; rows 4-5
20195 pshufb m4, m5, m7
20196 pshufb m5, m8
20197
20198 pmaddubsw m4, m0
20199 pmaddubsw m5, m1
20200 pmulhrsw m4, m2
20201 pmulhrsw m5, m2
20202 packuswb m4, m5
20203 movu [r0], xm4
20204 vextracti128 [r0 + r1], m4, 1
20205
20206 palignr m5, m6, m3, 6 ; rows 6-7
20207 pshufb m4, m5, m7
20208 pshufb m5, m8
20209
20210 pmaddubsw m4, m0
20211 pmaddubsw m5, m1
20212 pmulhrsw m4, m2
20213 pmulhrsw m5, m2
20214 packuswb m4, m5
20215 movu [r0 + r1 * 2], xm4
20216 vextracti128 [r0 + r3], m4, 1
20217 lea r0, [r0 + r1 * 4]
20218
20219 palignr m5, m6, m3, 8 ; rows 8-9
20220 pshufb m4, m5, m7
20221 pshufb m5, m8
20222
20223 pmaddubsw m4, m0
20224 pmaddubsw m5, m1
20225 pmulhrsw m4, m2
20226 pmulhrsw m5, m2
20227 packuswb m4, m5
20228 movu [r0], xm4
20229 vextracti128 [r0 + r1], m4, 1
20230
20231 palignr m5, m6, m3, 10 ; rows 10-11
20232 pshufb m4, m5, m7
20233 pshufb m5, m8
20234
20235 pmaddubsw m4, m0
20236 pmaddubsw m5, m1
20237 pmulhrsw m4, m2
20238 pmulhrsw m5, m2
20239 packuswb m4, m5
20240 movu [r0 + r1 * 2], xm4
20241 vextracti128 [r0 + r3], m4, 1
20242 lea r0, [r0 + r1 * 4]
20243
20244 palignr m5, m6, m3, 12 ; rows 12-13
20245 pshufb m4, m5, m7
20246 pshufb m5, m8
20247
20248 pmaddubsw m4, m0
20249 pmaddubsw m5, m1
20250 pmulhrsw m4, m2
20251 pmulhrsw m5, m2
20252 packuswb m4, m5
20253 movu [r0], xm4
20254 vextracti128 [r0 + r1], m4, 1
20255
20256 palignr m5, m6, m3, 14 ; rows 14-15
20257 pshufb m4, m5, m7
20258 pshufb m5, m8
20259
20260 pmaddubsw m4, m0
20261 pmaddubsw m5, m1
20262 pmulhrsw m4, m2
20263 pmulhrsw m5, m2
20264 packuswb m4, m5
20265 movu [r0 + r1 * 2], xm4
20266 vextracti128 [r0 + r3], m4, 1
20267 RET
20268
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 (AVX2): produces a 16x16 byte block, two 16-byte rows per
; step (low lane -> one row, high lane -> the next).
;   r0 = destination pointer, r1 = destination row stride, r2 = source bytes
;   r3 = scratch (3 * r1)
;   m0/m1 = first/second 16 bytes of angHor_tab_15, broadcast to both lanes
;   m2    = pw_1024 pmulhrsw multiplier (a rounded >> 5 if it holds words of
;           1024 - TODO confirm)
;   m7/m8 = shuffle masks from ang16_shuf_mode15 (+0 and +mmsize)
;   m3    = reference window, m6 = bytes that follow it (fed in by palignr)
; The window is built from the bytes at r2 + mmsize + 1 (mmsize = 32), preceded
; by the top 9 bytes of [r2] after it is shuffled by the mask at
; ang16_shuf_mode15 + 2 * mmsize (palignr by 7).
; Each step slides the window by 2 more bytes (palignr 2, 4, ... 14).
;-----------------------------------------------------------------------------
20269 INIT_YMM avx2
20270 cglobal intra_pred_ang16_15, 3,4,9
20271 vbroadcasti128 m0, [angHor_tab_15]
20272 vbroadcasti128 m1, [angHor_tab_15 + mmsize/2]
20273 mova m2, [pw_1024]
20274 mova m7, [ang16_shuf_mode15]
20275 mova m8, [ang16_shuf_mode15 + mmsize]
20276 lea r3, [r1 * 3]
20277
20278 vbroadcasti128 m3, [r2 + mmsize + 1]
20279 vbroadcasti128 m4, [r2]
20280 pshufb m4, [ang16_shuf_mode15 + mmsize * 2]
20281 palignr m3, m3, m4, 7 ; per lane: bytes 7..15 of m4 followed by bytes 0..6 of m3
20282 vbroadcasti128 m6, [r2 + mmsize + 8]
20283
20284 pshufb m4, m3, m7 ; rows 0-1
20285 pshufb m5, m3, m8
20286 pmaddubsw m4, m0
20287 pmaddubsw m5, m1
20288 pmulhrsw m4, m2
20289 pmulhrsw m5, m2
20290 packuswb m4, m5
20291 movu [r0], xm4
20292 vextracti128 [r0 + r1], m4, 1
20293
20294 palignr m5, m6, m3, 2 ; rows 2-3
20295 pshufb m4, m5, m7
20296 pshufb m5, m8
20297
20298 pmaddubsw m4, m0
20299 pmaddubsw m5, m1
20300 pmulhrsw m4, m2
20301 pmulhrsw m5, m2
20302 packuswb m4, m5
20303 movu [r0 + r1 * 2], xm4
20304 vextracti128 [r0 + r3], m4, 1
20305 lea r0, [r0 + r1 * 4]
20306
20307 palignr m5, m6, m3, 4 ; rows 4-5
20308 pshufb m4, m5, m7
20309 pshufb m5, m8
20310
20311 pmaddubsw m4, m0
20312 pmaddubsw m5, m1
20313 pmulhrsw m4, m2
20314 pmulhrsw m5, m2
20315 packuswb m4, m5
20316 movu [r0], xm4
20317 vextracti128 [r0 + r1], m4, 1
20318
20319 palignr m5, m6, m3, 6 ; rows 6-7
20320 pshufb m4, m5, m7
20321 pshufb m5, m8
20322
20323 pmaddubsw m4, m0
20324 pmaddubsw m5, m1
20325 pmulhrsw m4, m2
20326 pmulhrsw m5, m2
20327 packuswb m4, m5
20328 movu [r0 + r1 * 2], xm4
20329 vextracti128 [r0 + r3], m4, 1
20330 lea r0, [r0 + r1 * 4]
20331
20332 palignr m5, m6, m3, 8 ; rows 8-9
20333 pshufb m4, m5, m7
20334 pshufb m5, m8
20335
20336 pmaddubsw m4, m0
20337 pmaddubsw m5, m1
20338 pmulhrsw m4, m2
20339 pmulhrsw m5, m2
20340 packuswb m4, m5
20341 movu [r0], xm4
20342 vextracti128 [r0 + r1], m4, 1
20343
20344 palignr m5, m6, m3, 10 ; rows 10-11
20345 pshufb m4, m5, m7
20346 pshufb m5, m8
20347
20348 pmaddubsw m4, m0
20349 pmaddubsw m5, m1
20350 pmulhrsw m4, m2
20351 pmulhrsw m5, m2
20352 packuswb m4, m5
20353 movu [r0 + r1 * 2], xm4
20354 vextracti128 [r0 + r3], m4, 1
20355 lea r0, [r0 + r1 * 4]
20356
20357 palignr m5, m6, m3, 12 ; rows 12-13
20358 pshufb m4, m5, m7
20359 pshufb m5, m8
20360
20361 pmaddubsw m4, m0
20362 pmaddubsw m5, m1
20363 pmulhrsw m4, m2
20364 pmulhrsw m5, m2
20365 packuswb m4, m5
20366 movu [r0], xm4
20367 vextracti128 [r0 + r1], m4, 1
20368
20369 palignr m5, m6, m3, 14 ; rows 14-15
20370 pshufb m4, m5, m7
20371 pshufb m5, m8
20372
20373 pmaddubsw m4, m0
20374 pmaddubsw m5, m1
20375 pmulhrsw m4, m2
20376 pmulhrsw m5, m2
20377 packuswb m4, m5
20378 movu [r0 + r1 * 2], xm4
20379 vextracti128 [r0 + r3], m4, 1
20380 RET
20381
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 (AVX2) -- declared with 3 args, 4 GPRs, 9 vector regs.
; Uses r0 as store base, r1 as the step between stored lines and r2 as the
; load base (presumably dst, dstStride and the reference-sample buffer --
; confirm against the C prototype).
; Eight filter steps each write two 16-byte lines (low lane, then high lane),
; for 16 lines in total.
;-----------------------------------------------------------------------------
20382 INIT_YMM avx2
20383 cglobal intra_pred_ang16_16, 3,4,9
20384 vbroadcasti128 m0, [angHor_tab_16] ; m0/m1 = byte-pair multipliers for pmaddubsw
20385 vbroadcasti128 m1, [angHor_tab_16 + mmsize/2]
20386 mova m2, [pw_1024] ; pmulhrsw factor (presumably words of 1024, i.e. a rounded >> 5)
20387 mova m7, [ang16_shuf_mode16] ; m7/m8 = per-step byte selectors
20388 mova m8, [ang16_shuf_mode16 + mmsize]
20389 lea r3, [r1 * 3] ; r3 = 3 * r1
20390
; Build the working reference vector m3: bytes from [r2] reordered by the
; third shuffle row, followed by bytes from [r2 + mmsize + 1].
20391 vbroadcasti128 m3, [r2 + mmsize + 1]
20392 vbroadcasti128 m4, [r2]
20393 pshufb m4, [ang16_shuf_mode16 + mmsize * 2]
20394 palignr m3, m4, 5
20395 vbroadcasti128 m6, [r2 + mmsize + 6] ; continuation bytes shifted in by the palignr steps below
20396
; lines 0-1: select byte pairs, multiply-accumulate, round, pack, store
20397 pshufb m4, m3, m7
20398 pshufb m5, m3, m8
20399 pmaddubsw m4, m0
20400 pmaddubsw m5, m1
20401 pmulhrsw m4, m2
20402 pmulhrsw m5, m2
20403 packuswb m4, m5
20404 movu [r0], xm4
20405 vextracti128 [r0 + r1], m4, 1
20406
; lines 2-3: same filter on the reference advanced by 2 bytes
20407 palignr m5, m6, m3, 2
20408 pshufb m4, m5, m7
20409 pshufb m5, m8
20410
20411 pmaddubsw m4, m0
20412 pmaddubsw m5, m1
20413 pmulhrsw m4, m2
20414 pmulhrsw m5, m2
20415 packuswb m4, m5
20416 movu [r0 + r1 * 2], xm4
20417 vextracti128 [r0 + r3], m4, 1
20418 lea r0, [r0 + r1 * 4] ; advance the store base by four lines
20419
; lines 4-5
20420 palignr m5, m6, m3, 4
20421 pshufb m4, m5, m7
20422 pshufb m5, m8
20423
20424 pmaddubsw m4, m0
20425 pmaddubsw m5, m1
20426 pmulhrsw m4, m2
20427 pmulhrsw m5, m2
20428 packuswb m4, m5
20429 movu [r0], xm4
20430 vextracti128 [r0 + r1], m4, 1
20431
; lines 6-7
20432 palignr m5, m6, m3, 6
20433 pshufb m4, m5, m7
20434 pshufb m5, m8
20435
20436 pmaddubsw m4, m0
20437 pmaddubsw m5, m1
20438 pmulhrsw m4, m2
20439 pmulhrsw m5, m2
20440 packuswb m4, m5
20441 movu [r0 + r1 * 2], xm4
20442 vextracti128 [r0 + r3], m4, 1
20443 lea r0, [r0 + r1 * 4]
20444
; lines 8-9
20445 palignr m5, m6, m3, 8
20446 pshufb m4, m5, m7
20447 pshufb m5, m8
20448
20449 pmaddubsw m4, m0
20450 pmaddubsw m5, m1
20451 pmulhrsw m4, m2
20452 pmulhrsw m5, m2
20453 packuswb m4, m5
20454 movu [r0], xm4
20455 vextracti128 [r0 + r1], m4, 1
20456
; lines 10-11
20457 palignr m5, m6, m3, 10
20458 pshufb m4, m5, m7
20459 pshufb m5, m8
20460
20461 pmaddubsw m4, m0
20462 pmaddubsw m5, m1
20463 pmulhrsw m4, m2
20464 pmulhrsw m5, m2
20465 packuswb m4, m5
20466 movu [r0 + r1 * 2], xm4
20467 vextracti128 [r0 + r3], m4, 1
20468 lea r0, [r0 + r1 * 4]
20469
; lines 12-13
20470 palignr m5, m6, m3, 12
20471 pshufb m4, m5, m7
20472 pshufb m5, m8
20473
20474 pmaddubsw m4, m0
20475 pmaddubsw m5, m1
20476 pmulhrsw m4, m2
20477 pmulhrsw m5, m2
20478 packuswb m4, m5
20479 movu [r0], xm4
20480 vextracti128 [r0 + r1], m4, 1
20481
; lines 14-15
20482 palignr m5, m6, m3, 14
20483 pshufb m4, m5, m7
20484 pshufb m5, m8
20485
20486 pmaddubsw m4, m0
20487 pmaddubsw m5, m1
20488 pmulhrsw m4, m2
20489 pmulhrsw m5, m2
20490 packuswb m4, m5
20491 movu [r0 + r1 * 2], xm4
20492 vextracti128 [r0 + r3], m4, 1
20493 RET
20494
;-----------------------------------------------------------------------------
; intra_pred_ang16_17 (AVX2) -- declared with 3 args, 4 GPRs, 9 vector regs.
; Uses r0 as store base, r1 as the step between stored lines and r2 as the
; load base (presumably dst, dstStride and the reference-sample buffer --
; confirm against the C prototype).
; Same eight-step structure as the other 16x16 functions built on angHor_tab_*:
; each step writes two 16-byte lines, 16 lines in total.
;-----------------------------------------------------------------------------
20495 INIT_YMM avx2
20496 cglobal intra_pred_ang16_17, 3,4,9
20497 vbroadcasti128 m0, [angHor_tab_17] ; m0/m1 = byte-pair multipliers for pmaddubsw
20498 vbroadcasti128 m1, [angHor_tab_17 + mmsize/2]
20499 mova m2, [pw_1024] ; pmulhrsw factor (presumably words of 1024, i.e. a rounded >> 5)
20500 mova m7, [ang16_shuf_mode17] ; m7/m8 = per-step byte selectors
20501 mova m8, [ang16_shuf_mode17 + mmsize]
20502 lea r3, [r1 * 3] ; r3 = 3 * r1
20503
; Build the working reference vector m3: bytes from [r2] reordered by the
; third shuffle row, followed by bytes from [r2 + mmsize + 1].
20504 vbroadcasti128 m3, [r2 + mmsize + 1]
20505 vbroadcasti128 m4, [r2]
20506 pshufb m4, [ang16_shuf_mode17 + mmsize * 2]
20507 palignr m3, m4, 3
20508 vbroadcasti128 m6, [r2 + mmsize + 4] ; continuation bytes shifted in by the palignr steps below
20509
; lines 0-1: select byte pairs, multiply-accumulate, round, pack, store
20510 pshufb m4, m3, m7
20511 pshufb m5, m3, m8
20512 pmaddubsw m4, m0
20513 pmaddubsw m5, m1
20514 pmulhrsw m4, m2
20515 pmulhrsw m5, m2
20516 packuswb m4, m5
20517 movu [r0], xm4
20518 vextracti128 [r0 + r1], m4, 1
20519
; lines 2-3: same filter on the reference advanced by 2 bytes
20520 palignr m5, m6, m3, 2
20521 pshufb m4, m5, m7
20522 pshufb m5, m8
20523
20524 pmaddubsw m4, m0
20525 pmaddubsw m5, m1
20526 pmulhrsw m4, m2
20527 pmulhrsw m5, m2
20528 packuswb m4, m5
20529 movu [r0 + r1 * 2], xm4
20530 vextracti128 [r0 + r3], m4, 1
20531 lea r0, [r0 + r1 * 4] ; advance the store base by four lines
20532
; lines 4-5
20533 palignr m5, m6, m3, 4
20534 pshufb m4, m5, m7
20535 pshufb m5, m8
20536
20537 pmaddubsw m4, m0
20538 pmaddubsw m5, m1
20539 pmulhrsw m4, m2
20540 pmulhrsw m5, m2
20541 packuswb m4, m5
20542 movu [r0], xm4
20543 vextracti128 [r0 + r1], m4, 1
20544
; lines 6-7
20545 palignr m5, m6, m3, 6
20546 pshufb m4, m5, m7
20547 pshufb m5, m8
20548
20549 pmaddubsw m4, m0
20550 pmaddubsw m5, m1
20551 pmulhrsw m4, m2
20552 pmulhrsw m5, m2
20553 packuswb m4, m5
20554 movu [r0 + r1 * 2], xm4
20555 vextracti128 [r0 + r3], m4, 1
20556 lea r0, [r0 + r1 * 4]
20557
; lines 8-9
20558 palignr m5, m6, m3, 8
20559 pshufb m4, m5, m7
20560 pshufb m5, m8
20561
20562 pmaddubsw m4, m0
20563 pmaddubsw m5, m1
20564 pmulhrsw m4, m2
20565 pmulhrsw m5, m2
20566 packuswb m4, m5
20567 movu [r0], xm4
20568 vextracti128 [r0 + r1], m4, 1
20569
; lines 10-11
20570 palignr m5, m6, m3, 10
20571 pshufb m4, m5, m7
20572 pshufb m5, m8
20573
20574 pmaddubsw m4, m0
20575 pmaddubsw m5, m1
20576 pmulhrsw m4, m2
20577 pmulhrsw m5, m2
20578 packuswb m4, m5
20579 movu [r0 + r1 * 2], xm4
20580 vextracti128 [r0 + r3], m4, 1
20581 lea r0, [r0 + r1 * 4]
20582
; lines 12-13
20583 palignr m5, m6, m3, 12
20584 pshufb m4, m5, m7
20585 pshufb m5, m8
20586
20587 pmaddubsw m4, m0
20588 pmaddubsw m5, m1
20589 pmulhrsw m4, m2
20590 pmulhrsw m5, m2
20591 packuswb m4, m5
20592 movu [r0], xm4
20593 vextracti128 [r0 + r1], m4, 1
20594
; lines 14-15
20595 palignr m5, m6, m3, 14
20596 pshufb m4, m5, m7
20597 pshufb m5, m8
20598
20599 pmaddubsw m4, m0
20600 pmaddubsw m5, m1
20601 pmulhrsw m4, m2
20602 pmulhrsw m5, m2
20603 packuswb m4, m5
20604 movu [r0 + r1 * 2], xm4
20605 vextracti128 [r0 + r3], m4, 1
20606 RET
20607
;-----------------------------------------------------------------------------
; intra_pred_ang16_11 (AVX2) -- declared with 3 args, 4 GPRs, 8 vector regs.
; Uses r0 as store base, r1 as the step between stored lines and r2 as the
; load base (presumably dst, dstStride and the reference-sample buffer --
; confirm against the C prototype).
; A single shuffle (m7) produces the byte pairs for both lines of a step;
; m0 and m1 then apply two different multiplier sets to the same pairs.
; Eight steps write two 16-byte lines each, 16 lines in total.
;-----------------------------------------------------------------------------
20608 INIT_YMM avx2
20609 cglobal intra_pred_ang16_11, 3,4,8
20610 vbroadcasti128 m0, [angHor_tab_11]
20611 vbroadcasti128 m1, [angHor_tab_11 + mmsize/2]
20612 mova m2, [pw_1024] ; pmulhrsw factor (presumably words of 1024, i.e. a rounded >> 5)
20613 mova m7, [ang32_shuf_mode9] ; NOTE(review): ang32_* table used by a 16x16 function -- looks intentional (shared pattern), confirm
20614 lea r3, [r1 * 3] ; r3 = 3 * r1
20615
20616 ; prepare for [0 -1 -2...]
20617
20618 movu xm3, [r2 + mmsize]
20619 pinsrb xm3, [r2], 0 ; replace byte 0 with the byte at [r2]
20620 vbroadcasti128 m6, [r2 + mmsize + 16] ; continuation bytes shifted in by the palignr steps below
20621 vinserti128 m3, m3, xm3, 1 ; duplicate the low lane into the high lane
20622
; lines 0-1
20623 pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
20624 pmaddubsw m4, m5, m0
20625 pmaddubsw m5, m1
20626 pmulhrsw m4, m2
20627 pmulhrsw m5, m2
20628 packuswb m4, m5
20629 movu [r0], xm4
20630 vextracti128 [r0 + r1], m4, 1
20631
; lines 2-3: reference advanced by 2 bytes
20632 palignr m5, m6, m3, 2
20633 pshufb m5, m7
20634 pmaddubsw m4, m5, m0
20635 pmaddubsw m5, m1
20636 pmulhrsw m4, m2
20637 pmulhrsw m5, m2
20638 packuswb m4, m5
20639 movu [r0 + r1 * 2], xm4
20640 vextracti128 [r0 + r3], m4, 1
20641
20642 lea r0, [r0 + r1 * 4] ; advance the store base by four lines
20643
; lines 4-5
20644 palignr m5, m6, m3, 4
20645 pshufb m5, m7
20646 pmaddubsw m4, m5, m0
20647 pmaddubsw m5, m1
20648 pmulhrsw m4, m2
20649 pmulhrsw m5, m2
20650 packuswb m4, m5
20651 movu [r0], xm4
20652 vextracti128 [r0 + r1], m4, 1
20653
; lines 6-7
20654 palignr m5, m6, m3, 6
20655 pshufb m5, m7
20656 pmaddubsw m4, m5, m0
20657 pmaddubsw m5, m1
20658 pmulhrsw m4, m2
20659 pmulhrsw m5, m2
20660 packuswb m4, m5
20661 movu [r0 + r1 * 2], xm4
20662 vextracti128 [r0 + r3], m4, 1
20663
20664 lea r0, [r0 + r1 * 4]
20665
; lines 8-9
20666 palignr m5, m6, m3, 8
20667 pshufb m5, m7
20668 pmaddubsw m4, m5, m0
20669 pmaddubsw m5, m1
20670 pmulhrsw m4, m2
20671 pmulhrsw m5, m2
20672 packuswb m4, m5
20673 movu [r0], xm4
20674 vextracti128 [r0 + r1], m4, 1
20675
; lines 10-11
20676 palignr m5, m6, m3, 10
20677 pshufb m5, m7
20678 pmaddubsw m4, m5, m0
20679 pmaddubsw m5, m1
20680 pmulhrsw m4, m2
20681 pmulhrsw m5, m2
20682 packuswb m4, m5
20683 movu [r0 + r1 * 2], xm4
20684 vextracti128 [r0 + r3], m4, 1
20685
20686 lea r0, [r0 + r1 * 4]
20687
; lines 12-13
20688 palignr m5, m6, m3, 12
20689 pshufb m5, m7
20690 pmaddubsw m4, m5, m0
20691 pmaddubsw m5, m1
20692 pmulhrsw m4, m2
20693 pmulhrsw m5, m2
20694 packuswb m4, m5
20695 movu [r0], xm4
20696 vextracti128 [r0 + r1], m4, 1
20697
; lines 14-15
20698 palignr m5, m6, m3, 14
20699 pshufb m5, m7
20700 pmaddubsw m4, m5, m0
20701 pmaddubsw m5, m1
20702 pmulhrsw m4, m2
20703 pmulhrsw m5, m2
20704 packuswb m4, m5
20705 movu [r0 + r1 * 2], xm4
20706 vextracti128 [r0 + r3], m4, 1
20707 RET
20708
20709
20710 ; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
20711 %if ARCH_X86_64 == 1
20712 INIT_YMM avx2
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_8x32 in1..in8, tmp1..tmp4   (all arguments are ymm numbers)
;   %1-%8  : eight packed 32-byte input registers
;   %9-%12 : scratch registers (only used on the transpose path)
; Control: the carry flag on entry selects the path.
;   CF = 0 : byte-transpose the inputs with punpck* and store the result
;   CF = 1 : jump to .skip, reorder the qwords of each input (vpermq q3120)
;            and store the inputs without transposing
; Both paths write sixteen 16-byte lines using r0 (base), r1, r5 and r6; the
; callers in this file set r5 = 3 * r1 and r6 = 4 * r1.
; Clobbers: r0 (advanced by 3 * r6), %1-%8 on both paths, %9-%12 on the
; transpose path, and EFLAGS (add).
; .skip/.end are local labels, so the macro can be expanded only once per
; enclosing global label.
;-----------------------------------------------------------------------------
20713 %macro TRANSPOSE_STORE_8x32 12
20714 jc .skip
20715
; first four inputs: interleave bytes, then words
20716 punpcklbw m%9, m%1, m%2
20717 punpckhbw m%1, m%2
20718 punpcklbw m%10, m%3, m%4
20719 punpckhbw m%3, m%4
20720
20721 punpcklwd m%11, m%9, m%10
20722 punpckhwd m%9, m%10
20723 punpcklwd m%10, m%1, m%3
20724 punpckhwd m%1, m%3
20725
20726 punpckldq m%12, m%11, m%10
20727 punpckhdq m%11, m%10
20728 punpckldq m%10, m%9, m%1
20729 punpckhdq m%9, m%1
20730
; last four inputs: same interleave sequence, reusing %1-%4 as temporaries
20731 punpcklbw m%1, m%5, m%6
20732 punpckhbw m%5, m%6
20733 punpcklbw m%2, m%7, m%8
20734 punpckhbw m%7, m%8
20735
20736 punpcklwd m%3, m%1, m%2
20737 punpckhwd m%1, m%2
20738 punpcklwd m%4, m%5, m%7
20739 punpckhwd m%5, m%7
20740
20741 punpckldq m%2, m%3, m%4
20742 punpckhdq m%3, m%4
20743 punpckldq m%4, m%1, m%5
20744 punpckhdq m%1, m%5
20745
; merge the two halves dword-wise into the final output lines
20746 punpckldq m%5, m%12, m%2
20747 punpckhdq m%6, m%12, m%2
20748 punpckldq m%7, m%10, m%4
20749 punpckhdq m%8, m%10, m%4
20750
20751 punpckldq m%2, m%11, m%3
20752 punpckhdq m%11, m%11, m%3
20753 punpckldq m%4, m%9, m%1
20754 punpckhdq m%9, m%9, m%1
20755
; low lanes -> first eight lines
20756 movu [r0 + r1 * 0], xm%5
20757 movu [r0 + r1 * 1], xm%6
20758 movu [r0 + r1 * 2], xm%2
20759 movu [r0 + r5 * 1], xm%11
20760
20761 add r0, r6
20762
20763 movu [r0 + r1 * 0], xm%7
20764 movu [r0 + r1 * 1], xm%8
20765 movu [r0 + r1 * 2], xm%4
20766 movu [r0 + r5 * 1], xm%9
20767
20768 add r0, r6
20769
; high lanes -> last eight lines
20770 vextracti128 [r0 + r1 * 0], m%5, 1
20771 vextracti128 [r0 + r1 * 1], m%6, 1
20772 vextracti128 [r0 + r1 * 2], m%2, 1
20773 vextracti128 [r0 + r5 * 1], m%11, 1
20774
20775 add r0, r6
20776
20777 vextracti128 [r0 + r1 * 0], m%7, 1
20778 vextracti128 [r0 + r1 * 1], m%8, 1
20779 vextracti128 [r0 + r1 * 2], m%4, 1
20780 vextracti128 [r0 + r5 * 1], m%9, 1
20781 jmp .end
20782
; CF = 1 path: no transpose, only a qword reorder inside each register
20783 .skip:
20784 vpermq m%1, m%1, q3120
20785 vpermq m%2, m%2, q3120
20786 vpermq m%3, m%3, q3120
20787 vpermq m%4, m%4, q3120
20788 vpermq m%5, m%5, q3120
20789 vpermq m%6, m%6, q3120
20790 vpermq m%7, m%7, q3120
20791 vpermq m%8, m%8, q3120
20792
; low lanes of %1-%8 -> first eight lines
20793 movu [r0 + r1 * 0], xm%1
20794 movu [r0 + r1 * 1], xm%2
20795 movu [r0 + r1 * 2], xm%3
20796 movu [r0 + r5 * 1], xm%4
20797
20798 add r0, r6
20799
20800 movu [r0 + r1 * 0], xm%5
20801 movu [r0 + r1 * 1], xm%6
20802 movu [r0 + r1 * 2], xm%7
20803 movu [r0 + r5 * 1], xm%8
20804
20805 add r0, r6
20806
; high lanes of %1-%8 -> last eight lines
20807 vextracti128 [r0 + r1 * 0], m%1, 1
20808 vextracti128 [r0 + r1 * 1], m%2, 1
20809 vextracti128 [r0 + r1 * 2], m%3, 1
20810 vextracti128 [r0 + r5 * 1], m%4, 1
20811
20812 add r0, r6
20813
20814 vextracti128 [r0 + r1 * 0], m%5, 1
20815 vextracti128 [r0 + r1 * 1], m%6, 1
20816 vextracti128 [r0 + r1 * 2], m%7, 1
20817 vextracti128 [r0 + r5 * 1], m%8, 1
20818 .end:
20819 %endmacro
20820
;-----------------------------------------------------------------------------
; ang16_mode_3_33 -- shared body of intra_pred_ang16_3 and intra_pred_ang16_33.
; Entered with `call`; expects the caller to have set up:
;   r0 = store base, r1 = line step, r2 = load base
;   r3 = ang_table_avx2 + 16 * 32 (so [r3 + k * 32] is table entry 16 + k)
;   r5 = 3 * r1, r6 = 4 * r1, m7 = [pw_1024]
;   CF = path selector consumed by TRANSPOSE_STORE_8x32 (0 = transpose)
; No instruction before the macro's jc may modify EFLAGS.
; The bracketed number after each pmaddubsw is the table entry it uses.
; Clobbers m0-m6, m8-m12, r0 and EFLAGS.
;-----------------------------------------------------------------------------
20821 cglobal ang16_mode_3_33
20822 ; rows 0 to 7
20823 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
20824 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
20825
20826 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
20827 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
20828 vextracti128 xm1, m0, 1
20829 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
20830 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
20831
20832 pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
20833 pmulhrsw m4, m7
20834
20835 palignr m5, m2, m0, 2
20836 pmaddubsw m5, [r3 + 4 * 32] ; [20]
20837 pmulhrsw m5, m7
20838
20839 palignr m6, m2, m0, 4
20840 palignr m8, m2, m0, 6
20841 pmaddubsw m6, [r3 - 2 * 32] ; [14]
20842 pmulhrsw m6, m7
20843 pmaddubsw m8, [r3 - 8 * 32] ; [8]
20844 pmulhrsw m8, m7
20845
20846 palignr m10, m2, m0, 8
20847 pmaddubsw m9, m10, [r3 - 14 * 32] ; [2]
20848 pmulhrsw m9, m7
20849 pmaddubsw m10, [r3 + 12 * 32] ; [28]
20850 pmulhrsw m10, m7
20851
20852 palignr m11, m2, m0, 10
20853 palignr m12, m2, m0, 12
20854 pmaddubsw m11, [r3 + 6 * 32] ; [22]
20855 pmulhrsw m11, m7
20856 pmaddubsw m12, [r3] ; [16]
20857 pmulhrsw m12, m7
20858
20859 ; rows 8 to 15
20860 palignr m3, m2, m0, 14
; (a dead "palignr m1, m1, m2, 14" was removed here: m1 is reloaded by
;  "movu xm1, [r2 + 26]" below before anything reads it)
20862 pmaddubsw m3, [r3 - 6 * 32] ; [10]
20863 pmulhrsw m3, m7
20864 packuswb m4, m3 ; each packuswb pairs a row 0-7 result with a row 8-15 result
20865
20866 pmaddubsw m3, m2, [r3 - 12 * 32] ; [4]
20867 pmulhrsw m3, m7
20868 packuswb m5, m3
20869
20870 pmaddubsw m3, m2, [r3 + 14 * 32] ; [30]
20871 pmulhrsw m3, m7
20872 packuswb m6, m3
20873
; extend the reference: m1 = [ high lane of m2 | interleaved bytes from r2 + 25 / r2 + 26 ]
20874 movu xm0, [r2 + 25]
20875 movu xm1, [r2 + 26]
20876 punpcklbw m0, m1
20877 mova m1, m2
20878 vinserti128 m1, m1, xm0, 0
20879 vpermq m1, m1, 01001110b ; swap the two 128-bit lanes
20880
20881 palignr m3, m1, m2, 2
20882 pmaddubsw m3, [r3 + 8 * 32] ; [24]
20883 pmulhrsw m3, m7
20884 packuswb m8, m3
20885
20886 palignr m3, m1, m2, 4
20887 pmaddubsw m3, [r3 + 2 * 32] ; [18]
20888 pmulhrsw m3, m7
20889 packuswb m9, m3
20890
20891 palignr m3, m1, m2, 6
20892 pmaddubsw m3, [r3 - 4 * 32] ; [12]
20893 pmulhrsw m3, m7
20894 packuswb m10, m3
20895
20896 palignr m3, m1, m2, 8
20897 pmaddubsw m3, [r3 - 10 * 32] ; [6]
20898 pmulhrsw m3, m7
20899 packuswb m11, m3
20900
; last row is an unfiltered copy: zero-extend 16 bytes so packuswb returns them unchanged
20901 pmovzxbw m1, [r2 + 14]
20902 packuswb m12, m1
20903
20904 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
20905 ret
20906
;-----------------------------------------------------------------------------
; intra_pred_ang16_3 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_3_33 and calls it with CF = 0,
; which makes TRANSPOSE_STORE_8x32 take its transposing store path.
;-----------------------------------------------------------------------------
20907 INIT_YMM avx2
20908 cglobal intra_pred_ang16_3, 3, 7, 13
20909 add r2, 32 ; read from 32 bytes into the source buffer (presumably the second neighbour array -- confirm)
20910 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
20911 lea r5, [r1 * 3] ; r5 -> 3 * stride
20912 lea r6, [r1 * 4] ; r6 -> 4 * stride
20913 mova m7, [pw_1024]
20914 clc ; CF = 0; must follow the add above, which writes EFLAGS
20915
20916 call ang16_mode_3_33
20917 RET
20918
;-----------------------------------------------------------------------------
; intra_pred_ang16_33 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_3_33 and calls it with CF = 1,
; which makes TRANSPOSE_STORE_8x32 store without transposing. Unlike
; intra_pred_ang16_3, r2 is used as passed in.
;-----------------------------------------------------------------------------
20919 INIT_YMM avx2
20920 cglobal intra_pred_ang16_33, 3, 7, 13
20921 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
20922 lea r5, [r1 * 3] ; r5 -> 3 * stride
20923 lea r6, [r1 * 4] ; r6 -> 4 * stride
20924 mova m7, [pw_1024]
20925 stc ; CF = 1 -> the macro's "jc .skip" is taken
20926
20927 call ang16_mode_3_33
20928 RET
20929
;-----------------------------------------------------------------------------
; ang16_mode_4_32 -- shared body of intra_pred_ang16_4 and intra_pred_ang16_32.
; Entered with `call`; expects the caller to have set up:
;   r0 = store base, r1 = line step, r2 = load base
;   r3 = ang_table_avx2 + 16 * 32 (so [r3 + k * 32] is table entry 16 + k)
;   r5 = 3 * r1, r6 = 4 * r1, m7 = [pw_1024]
;   CF = path selector consumed by TRANSPOSE_STORE_8x32 (0 = transpose)
; No instruction before the macro's jc may modify EFLAGS.
; The bracketed number after each pmaddubsw is the table entry it uses.
;-----------------------------------------------------------------------------
20930 cglobal ang16_mode_4_32
20931 ; rows 0 to 7
20932 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
20933 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
20934
20935 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
20936 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
20937 vextracti128 xm1, m0, 1
20938 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
20939 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
20940
20941 pmaddubsw m4, m0, [r3 + 5 * 32] ; [21]
20942 pmulhrsw m4, m7
20943
20944 palignr m1, m2, m0, 2
20945 pmaddubsw m5, m1, [r3 - 6 * 32] ; [10]
20946 pmulhrsw m5, m7
20947
20948 palignr m8, m2, m0, 4
20949 pmaddubsw m6, m1, [r3 + 15 * 32] ; [31]
20950 pmulhrsw m6, m7
20951 pmaddubsw m8, [r3 + 4 * 32] ; [20]
20952 pmulhrsw m8, m7
20953
20954 palignr m10, m2, m0, 6
20955 pmaddubsw m9, m10, [r3 - 7 * 32] ; [9]
20956 pmulhrsw m9, m7
20957 pmaddubsw m10, [r3 + 14 * 32] ; [30]
20958 pmulhrsw m10, m7
20959
20960 palignr m11, m2, m0, 8
20961 palignr m1, m2, m0, 10
20962 pmaddubsw m11, [r3 + 3 * 32] ; [19]
20963 pmulhrsw m11, m7
20964 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
20965 pmulhrsw m12, m7
20966
20967 ; rows 8 to 15
20968 pmaddubsw m3, m1, [r3 + 13 * 32] ; [29]
20969 pmulhrsw m3, m7
20970 packuswb m4, m3 ; each packuswb pairs a row 0-7 result with a row 8-15 result
20971
20972 palignr m3, m2, m0, 12
20973 pmaddubsw m3, m3, [r3 + 2 * 32] ; [18]
20974 pmulhrsw m3, m7
20975 packuswb m5, m3
20976
20977 palignr m1, m2, m0, 14
20978 pmaddubsw m3, m1, [r3 - 9 * 32] ; [7]
20979 pmulhrsw m3, m7
20980 packuswb m6, m3
20981
20982 pmaddubsw m3, m1, [r3 + 12 * 32] ; [28]
20983 pmulhrsw m3, m7
20984 packuswb m8, m3
20985
20986 palignr m3, m2, m0, 16 ; a 16-byte shift yields m2 unchanged in each lane
20987 pmaddubsw m3, [r3 + 1 * 32] ; [17]
20988 pmulhrsw m3, m7
20989 packuswb m9, m3
20990
; extend the reference: m1 = [ high lane of m2 | interleaved bytes from r2 + 25 / r2 + 26 ]
20991 movu xm0, [r2 + 25]
20992 movu xm1, [r2 + 26]
20993 punpcklbw m0, m1
20994 mova m1, m2
20995 vinserti128 m1, m1, xm0, 0
20996 vpermq m1, m1, 01001110b ; swap the two 128-bit lanes
20997
20998 palignr m0, m1, m2, 2
20999 pmaddubsw m3, m0, [r3 - 10 * 32] ; [6]
21000 pmulhrsw m3, m7
21001 packuswb m10, m3
21002
21003 pmaddubsw m3, m0, [r3 + 11 * 32] ; [27]
21004 pmulhrsw m3, m7
21005 packuswb m11, m3
21006
21007 palignr m1, m1, m2, 4
21008 pmaddubsw m1, [r3] ; [16]
21009 pmulhrsw m1, m7
21010 packuswb m12, m1
21011
21012 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
21013 ret
21014
;-----------------------------------------------------------------------------
; intra_pred_ang16_4 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_4_32 and calls it with CF = 0,
; which makes TRANSPOSE_STORE_8x32 take its transposing store path.
;-----------------------------------------------------------------------------
21015 INIT_YMM avx2
21016 cglobal intra_pred_ang16_4, 3, 7, 13
21017 add r2, 32 ; read from 32 bytes into the source buffer (presumably the second neighbour array -- confirm)
21018 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
21019 lea r5, [r1 * 3] ; r5 -> 3 * stride
21020 lea r6, [r1 * 4] ; r6 -> 4 * stride
21021 mova m7, [pw_1024]
21022 clc ; CF = 0; must follow the add above, which writes EFLAGS
21023
21024 call ang16_mode_4_32
21025 RET
21026
;-----------------------------------------------------------------------------
; intra_pred_ang16_32 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_4_32 and calls it with CF = 1,
; which makes TRANSPOSE_STORE_8x32 store without transposing. Unlike
; intra_pred_ang16_4, r2 is used as passed in.
;-----------------------------------------------------------------------------
21027 INIT_YMM avx2
21028 cglobal intra_pred_ang16_32, 3, 7, 13
21029 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
21030 lea r5, [r1 * 3] ; r5 -> 3 * stride
21031 lea r6, [r1 * 4] ; r6 -> 4 * stride
21032 mova m7, [pw_1024]
21033 stc ; CF = 1 -> the macro's "jc .skip" is taken
21034
21035 call ang16_mode_4_32
21036 RET
21037
;-----------------------------------------------------------------------------
; ang16_mode_5 -- body of intra_pred_ang16_5.
; Entered with `call`; expects the caller to have set up:
;   r0 = store base, r1 = line step, r2 = load base
;   r3 = ang_table_avx2 + 16 * 32 (so [r3 + k * 32] is table entry 16 + k)
;   r5 = 3 * r1, r6 = 4 * r1, m7 = [pw_1024]
;   CF = path selector consumed by TRANSPOSE_STORE_8x32 (0 = transpose)
; No instruction before the macro's jc may modify EFLAGS.
; The bracketed number after each pmaddubsw is the table entry it uses.
;-----------------------------------------------------------------------------
21038 cglobal ang16_mode_5
21039 ; rows 0 to 7
21040 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
21041 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
21042
21043 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21044 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21045 vextracti128 xm1, m0, 1
21046 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21047 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21048
21049 pmaddubsw m4, m0, [r3 + 1 * 32] ; [17]
21050 pmulhrsw m4, m7
21051
21052 palignr m1, m2, m0, 2
21053 pmaddubsw m5, m1, [r3 - 14 * 32] ; [2]
21054 pmulhrsw m5, m7
21055
21056 palignr m3, m2, m0, 4
21057 pmaddubsw m6, m1, [r3 + 3 * 32] ; [19]
21058 pmulhrsw m6, m7
21059 pmaddubsw m8, m3, [r3 - 12 * 32] ; [4]
21060 pmulhrsw m8, m7
21061 pmaddubsw m9, m3, [r3 + 5 * 32] ; [21]
21062 pmulhrsw m9, m7
21063
21064 palignr m3, m2, m0, 6
21065 pmaddubsw m10, m3, [r3 - 10 * 32] ; [6]
21066 pmulhrsw m10, m7
21067
21068 palignr m1, m2, m0, 8
21069 pmaddubsw m11, m3, [r3 + 7 * 32] ; [23]
21070 pmulhrsw m11, m7
21071 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
21072 pmulhrsw m12, m7
21073
21074 ; rows 8 to 15
21075 pmaddubsw m3, m1, [r3 + 9 * 32] ; [25]
21076 pmulhrsw m3, m7
21077 packuswb m4, m3 ; each packuswb pairs a row 0-7 result with a row 8-15 result
21078
21079 palignr m1, m2, m0, 10
21080 pmaddubsw m3, m1, [r3 - 6 * 32] ; [10]
21081 pmulhrsw m3, m7
21082 packuswb m5, m3
21083
21084 pmaddubsw m3, m1, [r3 + 11 * 32] ; [27]
21085 pmulhrsw m3, m7
21086 packuswb m6, m3
21087
21088 palignr m1, m2, m0, 12
21089 pmaddubsw m3, m1, [r3 - 4 * 32] ; [12]
21090 pmulhrsw m3, m7
21091 packuswb m8, m3
21092
21093 pmaddubsw m3, m1, [r3 + 13 * 32] ; [29]
21094 pmulhrsw m3, m7
21095 packuswb m9, m3
21096
21097 palignr m1, m2, m0, 14
21098 pmaddubsw m3, m1, [r3 - 2 * 32] ; [14]
21099 pmulhrsw m3, m7
21100 packuswb m10, m3
21101
21102 pmaddubsw m3, m1, [r3 + 15 * 32] ; [31]
21103 pmulhrsw m3, m7
21104 packuswb m11, m3
21105
21106 palignr m1, m2, m0, 16 ; a 16-byte shift yields m2 unchanged in each lane
21107 pmaddubsw m1, [r3] ; [16]
21108 pmulhrsw m1, m7
21109 packuswb m12, m1
21110
21111 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
21112 ret
21113
;-----------------------------------------------------------------------------
; intra_pred_ang16_5 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_5 and calls it with CF = 0,
; which makes TRANSPOSE_STORE_8x32 take its transposing store path.
;-----------------------------------------------------------------------------
21114 INIT_YMM avx2
21115 cglobal intra_pred_ang16_5, 3, 7, 13
21116 add r2, 32 ; read from 32 bytes into the source buffer (presumably the second neighbour array -- confirm)
21117 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
21118 lea r5, [r1 * 3] ; r5 -> 3 * stride
21119 lea r6, [r1 * 4] ; r6 -> 4 * stride
21120 mova m7, [pw_1024]
21121 clc ; CF = 0; must follow the add above, which writes EFLAGS
21122
21123 call ang16_mode_5
21124 RET
21125
;-----------------------------------------------------------------------------
; ang16_mode_6 -- body of intra_pred_ang16_6.
; Entered with `call`; expects the caller to have set up:
;   r0 = store base, r1 = line step, r2 = load base
;   r3 = ang_table_avx2 + 16 * 32 (so [r3 + k * 32] is table entry 16 + k)
;   r5 = 3 * r1, r6 = 4 * r1, m7 = [pw_1024]
;   CF = path selector consumed by TRANSPOSE_STORE_8x32 (0 = transpose)
; No instruction before the macro's jc may modify EFLAGS.
; The bracketed number after each pmaddubsw is the table entry it uses.
;-----------------------------------------------------------------------------
21126 cglobal ang16_mode_6
21127 ; rows 0 to 7
21128 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
21129 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
21130
21131 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21132 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21133 vextracti128 xm1, m0, 1
21134 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21135 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21136
21137 pmaddubsw m4, m0, [r3 - 3 * 32] ; [13]
21138 pmulhrsw m4, m7
21139
21140 pmaddubsw m5, m0, [r3 + 10 * 32] ; [26]
21141 pmulhrsw m5, m7
21142
21143 palignr m3, m2, m0, 2
21144 pmaddubsw m6, m3, [r3 - 9 * 32] ; [7]
21145 pmulhrsw m6, m7
21146 pmaddubsw m8, m3, [r3 + 4 * 32] ; [20]
21147 pmulhrsw m8, m7
21148
21149 palignr m3, m2, m0, 4
21150 pmaddubsw m9, m3, [r3 - 15 * 32] ; [1]
21151 pmulhrsw m9, m7
21152
21153 pmaddubsw m10, m3, [r3 - 2 * 32] ; [14]
21154 pmulhrsw m10, m7
21155
21156 pmaddubsw m11, m3, [r3 + 11 * 32] ; [27]
21157 pmulhrsw m11, m7
21158
21159 palignr m1, m2, m0, 6
21160 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
21161 pmulhrsw m12, m7
21162
21163 ; rows 8 to 15
21164 pmaddubsw m3, m1, [r3 + 5 * 32] ; [21]
21165 pmulhrsw m3, m7
21166 packuswb m4, m3 ; each packuswb pairs a row 0-7 result with a row 8-15 result
21167
21168 palignr m1, m2, m0, 8
21169 pmaddubsw m3, m1, [r3 - 14 * 32] ; [2]
21170 pmulhrsw m3, m7
21171 packuswb m5, m3
21172
21173 pmaddubsw m3, m1, [r3 - 1 * 32] ; [15]
21174 pmulhrsw m3, m7
21175 packuswb m6, m3
21176
21177 pmaddubsw m3, m1, [r3 + 12 * 32] ; [28]
21178 pmulhrsw m3, m7
21179 packuswb m8, m3
21180
21181 palignr m1, m2, m0, 10
21182 pmaddubsw m3, m1, [r3 - 7 * 32] ; [9]
21183 pmulhrsw m3, m7
21184 packuswb m9, m3
21185
21186 pmaddubsw m3, m1, [r3 + 6 * 32] ; [22]
21187 pmulhrsw m3, m7
21188 packuswb m10, m3
21189
21190 palignr m1, m2, m0, 12
21191 pmaddubsw m3, m1, [r3 - 13 * 32] ; [3]
21192 pmulhrsw m3, m7
21193 packuswb m11, m3
21194
21195 pmaddubsw m1, [r3] ; [16]
21196 pmulhrsw m1, m7
21197 packuswb m12, m1
21198
21199 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
21200 ret
21201
;-----------------------------------------------------------------------------
; intra_pred_ang16_6 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_6 and calls it with CF = 0,
; which makes TRANSPOSE_STORE_8x32 take its transposing store path.
;-----------------------------------------------------------------------------
21202 INIT_YMM avx2
21203 cglobal intra_pred_ang16_6, 3, 7, 13
21204 add r2, 32 ; read from 32 bytes into the source buffer (presumably the second neighbour array -- confirm)
21205 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
21206 lea r5, [r1 * 3] ; r5 -> 3 * stride
21207 lea r6, [r1 * 4] ; r6 -> 4 * stride
21208 mova m7, [pw_1024]
21209 clc ; CF = 0; must follow the add above, which writes EFLAGS
21210
21211 call ang16_mode_6
21212 RET
21213
;-----------------------------------------------------------------------------
; ang16_mode_7 -- body of intra_pred_ang16_7.
; Entered with `call`; expects the caller to have set up:
;   r0 = store base, r1 = line step, r2 = load base
;   r3 = ang_table_avx2 + 16 * 32 (so [r3 + k * 32] is table entry 16 + k)
;   r5 = 3 * r1, r6 = 4 * r1, m7 = [pw_1024]
;   CF = path selector consumed by TRANSPOSE_STORE_8x32 (0 = transpose)
; No instruction before the macro's jc may modify EFLAGS.
; The bracketed number after each pmaddubsw is the table entry it uses.
;-----------------------------------------------------------------------------
21214 cglobal ang16_mode_7
21215 ; rows 0 to 7
21216 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
21217 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
21218
21219 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21220 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21221 vextracti128 xm1, m0, 1
21222 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21223 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21224
21225 pmaddubsw m4, m0, [r3 - 7 * 32] ; [9]
21226 pmulhrsw m4, m7
21227
21228 pmaddubsw m5, m0, [r3 + 2 * 32] ; [18]
21229 pmulhrsw m5, m7
21230 pmaddubsw m6, m0, [r3 + 11 * 32] ; [27]
21231 pmulhrsw m6, m7
21232
21233 palignr m3, m2, m0, 2
21234 pmaddubsw m8, m3, [r3 - 12 * 32] ; [4]
21235 pmulhrsw m8, m7
21236
21237 pmaddubsw m9, m3, [r3 - 3 * 32] ; [13]
21238 pmulhrsw m9, m7
21239
21240 pmaddubsw m10, m3, [r3 + 6 * 32] ; [22]
21241 pmulhrsw m10, m7
21242
21243 pmaddubsw m11, m3, [r3 + 15 * 32] ; [31]
21244 pmulhrsw m11, m7
21245
21246 palignr m1, m2, m0, 4
21247 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
21248 pmulhrsw m12, m7
21249
21250 ; rows 8 to 15
21251 pmaddubsw m3, m1, [r3 + 1 * 32] ; [17]
21252 pmulhrsw m3, m7
21253 packuswb m4, m3 ; each packuswb pairs a row 0-7 result with a row 8-15 result
21254
21255 pmaddubsw m3, m1, [r3 + 10 * 32] ; [26]
21256 pmulhrsw m3, m7
21257 packuswb m5, m3
21258
21259 palignr m1, m2, m0, 6
21260 pmaddubsw m3, m1, [r3 - 13 * 32] ; [3]
21261 pmulhrsw m3, m7
21262 packuswb m6, m3
21263
21264 pmaddubsw m3, m1, [r3 - 4 * 32] ; [12]
21265 pmulhrsw m3, m7
21266 packuswb m8, m3
21267
21268 pmaddubsw m3, m1, [r3 + 5 * 32] ; [21]
21269 pmulhrsw m3, m7
21270 packuswb m9, m3
21271
21272 pmaddubsw m3, m1, [r3 + 14 * 32] ; [30]
21273 pmulhrsw m3, m7
21274 packuswb m10, m3
21275
21276 palignr m1, m2, m0, 8
21277 pmaddubsw m3, m1, [r3 - 9 * 32] ; [7]
21278 pmulhrsw m3, m7
21279 packuswb m11, m3
21280
21281 pmaddubsw m1, [r3] ; [16]
21282 pmulhrsw m1, m7
21283 packuswb m12, m1
21284
21285 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
21286 ret
21287
;-----------------------------------------------------------------------------
; intra_pred_ang16_7 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_7 and calls it with CF = 0,
; which makes TRANSPOSE_STORE_8x32 take its transposing store path.
;-----------------------------------------------------------------------------
21288 INIT_YMM avx2
21289 cglobal intra_pred_ang16_7, 3, 7, 13
21290 add r2, 32 ; read from 32 bytes into the source buffer (presumably the second neighbour array -- confirm)
21291 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
21292 lea r5, [r1 * 3] ; r5 -> 3 * stride
21293 lea r6, [r1 * 4] ; r6 -> 4 * stride
21294 mova m7, [pw_1024]
21295 clc ; CF = 0; must follow the add above, which writes EFLAGS
21296
21297 call ang16_mode_7
21298 RET
21299
;-----------------------------------------------------------------------------
; ang16_mode_8 -- body of intra_pred_ang16_8.
; Entered with `call`; expects the caller to have set up:
;   r0 = store base, r1 = line step, r2 = load base
;   r3 = ang_table_avx2 + 16 * 32 (so [r3 + k * 32] is table entry 16 + k)
;   r5 = 3 * r1, r6 = 4 * r1, m7 = [pw_1024]
;   CF = path selector consumed by TRANSPOSE_STORE_8x32 (0 = transpose)
; No instruction before the macro's jc may modify EFLAGS.
; The bracketed number after each pmaddubsw is the table entry it uses.
;-----------------------------------------------------------------------------
21300 cglobal ang16_mode_8
21301 ; rows 0 to 7
21302 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
21303 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
21304
21305 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21306 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21307 vextracti128 xm1, m0, 1
21308 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
21309 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
21310
21311 pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
21312 pmulhrsw m4, m7
21313 pmaddubsw m5, m0, [r3 - 6 * 32] ; [10]
21314 pmulhrsw m5, m7
21315
21316 pmaddubsw m6, m0, [r3 - 1 * 32] ; [15]
21317 pmulhrsw m6, m7
21318 pmaddubsw m8, m0, [r3 + 4 * 32] ; [20]
21319 pmulhrsw m8, m7
21320 pmaddubsw m9, m0, [r3 + 9 * 32] ; [25]
21321 pmulhrsw m9, m7
21322
21323 pmaddubsw m10, m0, [r3 + 14 * 32] ; [30]
21324 pmulhrsw m10, m7
21325 palignr m1, m2, m0, 2
21326 pmaddubsw m11, m1, [r3 - 13 * 32] ; [3]
21327 pmulhrsw m11, m7
21328 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
21329 pmulhrsw m12, m7
21330
21331 ; rows 8 to 15
21332 pmaddubsw m3, m1, [r3 - 3 * 32] ; [13]
21333 pmulhrsw m3, m7
21334 packuswb m4, m3 ; each packuswb pairs a row 0-7 result with a row 8-15 result
21335 pmaddubsw m3, m1, [r3 + 2 * 32] ; [18]
21336 pmulhrsw m3, m7
21337 packuswb m5, m3
21338
21339 pmaddubsw m3, m1, [r3 + 7 * 32] ; [23]
21340 pmulhrsw m3, m7
21341 packuswb m6, m3
21342 pmaddubsw m3, m1, [r3 + 12 * 32] ; [28]
21343 pmulhrsw m3, m7
21344 packuswb m8, m3
21345
21346 palignr m1, m2, m0, 4
21347 pmaddubsw m3, m1, [r3 - 15 * 32] ; [1]
21348 pmulhrsw m3, m7
21349 packuswb m9, m3
21350 pmaddubsw m3, m1, [r3 - 10 * 32] ; [6]
21351 pmulhrsw m3, m7
21352 packuswb m10, m3
21353
21354 pmaddubsw m3, m1, [r3 - 5 * 32] ; [11]
21355 pmulhrsw m3, m7
21356 packuswb m11, m3
21357 pmaddubsw m1, [r3] ; [16]
21358 pmulhrsw m1, m7
21359 packuswb m12, m1
21360
21361 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
21362 ret
21363
;-----------------------------------------------------------------------------
; intra_pred_ang16_8 (AVX2) -- declared with 3 args, 7 GPRs, 13 vector regs.
; Sets up the registers expected by ang16_mode_8 and calls it with CF = 0,
; which makes TRANSPOSE_STORE_8x32 take its transposing store path.
;-----------------------------------------------------------------------------
21364 INIT_YMM avx2
21365 cglobal intra_pred_ang16_8, 3, 7, 13
21366 add r2, 32 ; read from 32 bytes into the source buffer (presumably the second neighbour array -- confirm)
21367 lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> table entry 16, so +/-k * 32 reaches entry 16 +/- k
21368 lea r5, [r1 * 3] ; r5 -> 3 * stride
21369 lea r6, [r1 * 4] ; r6 -> 4 * stride
21370 mova m7, [pw_1024]
21371 clc ; CF = 0; must follow the add above, which writes EFLAGS
21372
21373 call ang16_mode_8
21374 RET
21375 %endif ; ARCH_X86_64
21376
;-----------------------------------------------------------------------------
; intra_pred_ang16_9 (AVX2) -- declared with 3 args, 4 GPRs, 8 vector regs.
; Uses r0 as store base, r1 as the step between stored lines and r2 as the
; load base (presumably dst, dstStride and the reference-sample buffer --
; confirm against the C prototype).
; A single shuffle (m7) produces the byte pairs for both lines of a step;
; m0 and m1 then apply two different multiplier sets to the same pairs.
; Eight steps write two 16-byte lines each, 16 lines in total.
;-----------------------------------------------------------------------------
21377 INIT_YMM avx2
21378 cglobal intra_pred_ang16_9, 3,4,8
21379 vbroadcasti128 m0, [angHor_tab_9]
21380 vbroadcasti128 m1, [angHor_tab_9 + mmsize/2]
21381 mova m2, [pw_1024] ; pmulhrsw factor (presumably words of 1024, i.e. a rounded >> 5)
21382 lea r3, [r1 * 3] ; r3 = 3 * r1
21383 mova m7, [ang16_shuf_mode9]
21384
21385 vbroadcasti128 m6, [r2 + mmsize + 17] ; continuation bytes shifted in by the palignr steps below
21386 vbroadcasti128 m3, [r2 + mmsize + 1] ; first 16 reference bytes, in both lanes
21387
; lines 0-1
21388 pshufb m5, m3, m7
21389 pmaddubsw m4, m5, m0
21390 pmaddubsw m5, m1
21391 pmulhrsw m4, m2
21392 pmulhrsw m5, m2
21393 packuswb m4, m5
21394 movu [r0], xm4
21395 vextracti128 [r0 + r1], m4, 1
21396
; lines 2-3: reference advanced by 2 bytes
21397 palignr m5, m6, m3, 2
21398 pshufb m5, m7
21399 pmaddubsw m4, m5, m0
21400 pmaddubsw m5, m1
21401 pmulhrsw m4, m2
21402 pmulhrsw m5, m2
21403 packuswb m4, m5
21404 movu [r0 + r1 * 2], xm4
21405 vextracti128 [r0 + r3], m4, 1
21406
21407 lea r0, [r0 + r1 * 4] ; advance the store base by four lines
21408
; lines 4-5
21409 palignr m5, m6, m3, 4
21410 pshufb m5, m7
21411 pmaddubsw m4, m5, m0
21412 pmaddubsw m5, m1
21413 pmulhrsw m4, m2
21414 pmulhrsw m5, m2
21415 packuswb m4, m5
21416 movu [r0], xm4
21417 vextracti128 [r0 + r1], m4, 1
21418
; lines 6-7
21419 palignr m5, m6, m3, 6
21420 pshufb m5, m7
21421 pmaddubsw m4, m5, m0
21422 pmaddubsw m5, m1
21423 pmulhrsw m4, m2
21424 pmulhrsw m5, m2
21425 packuswb m4, m5
21426 movu [r0 + r1 * 2], xm4
21427 vextracti128 [r0 + r3], m4, 1
21428
21429 lea r0, [r0 + r1 * 4]
21430
; lines 8-9
21431 palignr m5, m6, m3, 8
21432 pshufb m5, m7
21433 pmaddubsw m4, m5, m0
21434 pmaddubsw m5, m1
21435 pmulhrsw m4, m2
21436 pmulhrsw m5, m2
21437 packuswb m4, m5
21438 movu [r0], xm4
21439 vextracti128 [r0 + r1], m4, 1
21440
; lines 10-11
21441 palignr m5, m6, m3, 10
21442 pshufb m5, m7
21443 pmaddubsw m4, m5, m0
21444 pmaddubsw m5, m1
21445 pmulhrsw m4, m2
21446 pmulhrsw m5, m2
21447 packuswb m4, m5
21448 movu [r0 + r1 * 2], xm4
21449 vextracti128 [r0 + r3], m4, 1
21450
21451 lea r0, [r0 + r1 * 4]
21452
; lines 12-13
21453 palignr m5, m6, m3, 12
21454 pshufb m5, m7
21455 pmaddubsw m4, m5, m0
21456 pmaddubsw m5, m1
21457 pmulhrsw m4, m2
21458 pmulhrsw m5, m2
21459 packuswb m4, m5
21460 movu [r0], xm4
21461 vextracti128 [r0 + r1], m4, 1
21462
; lines 14-15
21463 palignr m5, m6, m3, 14
21464 pshufb m5, m7
21465 pmaddubsw m4, m5, m0
21466 pmaddubsw m5, m1
21467 pmulhrsw m4, m2
21468 pmulhrsw m5, m2
21469 packuswb m4, m5
21470 movu [r0 + r1 * 2], xm4
21471 vextracti128 [r0 + r3], m4, 1
21472 RET
21473 %endif
21474
;-----------------------------------------------------------------------------
; intra_pred_ang16_25 (AVX2) -- declared with 3 args, 5 GPRs, 5 vector regs.
; Uses r0 as store base, r1 as line step and r2 as load base (presumably dst,
; dstStride and the reference-sample buffer -- confirm against the C prototype).
; The filtering and stores are done by INTRA_PRED_ANG16_MC1, a macro defined
; elsewhere in this file; it is invoked four times, with r0 advanced by
; 4 * r1 between invocations (presumably four lines per invocation -- confirm
; against the macro definition).
;-----------------------------------------------------------------------------
21475 INIT_YMM avx2
21476 cglobal intra_pred_ang16_25, 3, 5, 5
21477 mova m0, [pw_1024]
21478
; m1/m2 = adjacent-byte pairs built from [r2] and [r2 + 8]
21479 vbroadcasti128 m1, [r2]
21480 pshufb m1, [intra_pred_shuff_0_8]
21481 vbroadcasti128 m2, [r2 + 8]
21482 pshufb m2, [intra_pred_shuff_0_8]
21483
21484 lea r3, [3 * r1] ; r3 = 3 * r1
21485 lea r4, [c_ang16_mode_25] ; r4 -> coefficient table for this mode
21486
21487 INTRA_PRED_ANG16_MC1 0
21488
21489 lea r0, [r0 + 4 * r1]
21490 INTRA_PRED_ANG16_MC1 2
21491
21492 add r4, 4 * mmsize ; step to the next group of coefficient rows
21493
21494 lea r0, [r0 + 4 * r1]
21495 INTRA_PRED_ANG16_MC1 0
21496
21497 lea r0, [r0 + 4 * r1]
21498 INTRA_PRED_ANG16_MC1 2
21499 RET
21500
;-----------------------------------------------------------------------------
; intra_pred_ang16_28 (AVX2) -- declared with 3 args, 5 GPRs, 6 vector regs.
; Uses r0 as store base, r1 as line step and r2 as load base (presumably dst,
; dstStride and the reference-sample buffer -- confirm against the C prototype).
; All loads, filtering and stores are done by the INTRA_PRED_ANG16_MC0/MC1/MC2
; macros, which are defined elsewhere in this file; this function only
; sequences them and advances r0 (store base) and r4 (coefficient pointer).
; NOTE(review): MC2 <n> presumably reloads the reference at byte offset n
; through the m5 shuffle -- confirm against the macro definition.
;-----------------------------------------------------------------------------
21501 INIT_YMM avx2
21502 cglobal intra_pred_ang16_28, 3, 5, 6
21503 mova m0, [pw_1024]
21504 mova m5, [intra_pred_shuff_0_8] ; kept in m5 for the macros below
21505 lea r3, [3 * r1] ; r3 = 3 * r1
21506 lea r4, [c_ang16_mode_28] ; r4 -> coefficient table for this mode
21507
21508 INTRA_PRED_ANG16_MC2 1
21509 INTRA_PRED_ANG16_MC1 0
21510
21511 lea r0, [r0 + 4 * r1]
21512
21513 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
21514
21515 INTRA_PRED_ANG16_MC2 2
21516 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3
21517
21518 lea r0, [r0 + 4 * r1]
21519 add r4, 4 * mmsize ; step to the next group of coefficient rows
21520
21521 INTRA_PRED_ANG16_MC1 0
21522 INTRA_PRED_ANG16_MC2 3
21523
21524 lea r0, [r0 + 4 * r1]
21525 INTRA_PRED_ANG16_MC1 2
21526 RET
21527
;-----------------------------------------------------------------------------
; intra_pred_ang16_27 (AVX2) -- declared with 3 args, 5 GPRs, 5 vector regs.
; Uses r0 as store base, r1 as line step and r2 as load base (presumably dst,
; dstStride and the reference-sample buffer -- confirm against the C prototype).
; Most lines are produced by the INTRA_PRED_ANG16_MC0/MC1 macros (defined
; elsewhere in this file); the final two lines are computed inline because
; the second of them reads the reference at a different byte offset (r2 + 2).
;-----------------------------------------------------------------------------
21528 INIT_YMM avx2
21529 cglobal intra_pred_ang16_27, 3, 5, 5
21530 mova m0, [pw_1024]
21531 lea r3, [3 * r1] ; r3 = 3 * r1
21532 lea r4, [c_ang16_mode_27] ; r4 -> coefficient table for this mode
21533
; m1/m2 = adjacent-byte pairs built from [r2 + 1] and [r2 + 9]
21534 vbroadcasti128 m1, [r2 + 1]
21535 pshufb m1, [intra_pred_shuff_0_8]
21536 vbroadcasti128 m2, [r2 + 9]
21537 pshufb m2, [intra_pred_shuff_0_8]
21538
21539 INTRA_PRED_ANG16_MC1 0
21540
21541 lea r0, [r0 + 4 * r1]
21542 INTRA_PRED_ANG16_MC1 2
21543
21544 lea r0, [r0 + 4 * r1]
21545 add r4, 4 * mmsize ; step to the next group of coefficient rows
21546 INTRA_PRED_ANG16_MC1 0
21547
21548 lea r0, [r0 + 4 * r1]
21549 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
21550
; last two lines, computed inline
21551 vperm2i128 m1, m1, m2, 00100000b ; m1 = [ low lane of m2 | low lane of m1 ]
21552 pmaddubsw m3, m1, [r4 + 3 * mmsize]
21553 pmulhrsw m3, m0
21554 vbroadcasti128 m2, [r2 + 2] ; reference shifted by one more byte for the final line
21555 pshufb m2, [intra_pred_shuff_0_15]
21556 pmaddubsw m2, [r4 + 4 * mmsize]
21557 pmulhrsw m2, m0
21558 packuswb m3, m2
21559 vpermq m3, m3, 11011000b ; reorder qwords 0,2,1,3 so each lane holds one complete line
21560 movu [r0 + 2 * r1], xm3
21561 vextracti128 xm4, m3, 1
21562 movu [r0 + r3], xm4
21563 RET
21564
;-----------------------------------------------------------------------------
; intra_pred_ang16_29 (AVX2) -- declared with 3 args, 5 GPRs, 6 vector regs.
; Uses r0 as store base, r1 as line step and r2 as load base (presumably dst,
; dstStride and the reference-sample buffer -- confirm against the C prototype).
; All loads, filtering and stores are done by the INTRA_PRED_ANG16_MC0/MC2/MC3
; macros, which are defined elsewhere in this file; this function only
; sequences them and advances r0 (store base) and r4 (coefficient pointer).
; The vector register count is 6 because m5 is written below (m0-m5 are live);
; this matches the sibling functions for modes 28, 30 and 31.
;-----------------------------------------------------------------------------
21565 INIT_YMM avx2
21566 cglobal intra_pred_ang16_29, 3, 5, 6
21567 mova m0, [pw_1024]
21568 mova m5, [intra_pred_shuff_0_8] ; kept in m5 for the macros below
21569 lea r3, [3 * r1] ; r3 = 3 * r1
21570 lea r4, [c_ang16_mode_29] ; r4 -> coefficient table for this mode
21571
21572 INTRA_PRED_ANG16_MC2 1
21573 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
21574 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1
21575
21576 INTRA_PRED_ANG16_MC2 2
21577 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
21578
21579 lea r0, [r0 + r1 * 4]
21580 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
21581
21582 INTRA_PRED_ANG16_MC2 3
21583 add r4, 4 * mmsize ; step to the next group of coefficient rows
21584 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
21585 lea r0, [r0 + r1 * 4]
21586 INTRA_PRED_ANG16_MC3 r0 + r1, 1
21587
21588 INTRA_PRED_ANG16_MC2 4
21589 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2
21590 lea r0, [r0 + r1 * 4]
21591 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
21592
21593 add r4, 4 * mmsize
21594
21595 INTRA_PRED_ANG16_MC2 5
21596 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
21597 RET
21598
; intra_pred_ang16_30 (AVX2)
; Registers as used below: r0 = base address for row stores, r1 = row stride,
; r2 = reference bytes (read by the MC2 macro), r3 = 3 * r1, r4 = coefficient
; table for this mode.
; INTRA_PRED_ANG16_MC0 / MC2 / MC3 are defined earlier in the file. From their
; call sites MC2 appears to load reference pairs at the given offset, MC0 to
; produce two rows and MC3 one row -- confirm against their definitions.
21599 INIT_YMM avx2
21600 cglobal intra_pred_ang16_30, 3, 5, 6
21601 mova m0, [pw_1024] ; pmulhrsw multiplier (rounded >> 5 if pw_1024 is words of 1024, as named)
21602 mova m5, [intra_pred_shuff_0_8] ; shuffle mask producing adjacent byte pairs (i, i + 1)
21603 lea r3, [3 * r1] ; r3 = 3 * stride
21604 lea r4, [c_ang16_mode_30] ; r4 = coefficient table base
21605
21606 INTRA_PRED_ANG16_MC2 1
21607 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
21608
21609 INTRA_PRED_ANG16_MC2 2
21610 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1
21611
21612 INTRA_PRED_ANG16_MC2 3
21613 lea r0, [r0 + 4 * r1] ; advance store base by four rows
21614 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
21615 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3
21616
21617 INTRA_PRED_ANG16_MC2 4
21618 add r4, 4 * mmsize ; advance coefficient pointer by four vector rows
21619 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
21620
21621 INTRA_PRED_ANG16_MC2 5
21622 lea r0, [r0 + 4 * r1]
21623 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
21624 INTRA_PRED_ANG16_MC3 r0 + r3 , 2
21625
21626 INTRA_PRED_ANG16_MC2 6
21627 lea r0, [r0 + 4 * r1]
21628 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
21629
21630 INTRA_PRED_ANG16_MC2 7
21631 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4 ; index 4: r4 is not advanced a second time in this kernel
21632 RET
21633
; intra_pred_ang16_31 (AVX2)
; Registers as used below: r0 = base address for row stores, r1 = row stride,
; r2 = reference bytes (read by the MC2 macro), r3 = 3 * r1, r4 = coefficient
; table for this mode.
; INTRA_PRED_ANG16_MC0 / MC2 / MC3 are defined earlier in the file. From their
; call sites MC2 appears to load reference pairs at the given offset, MC0 to
; produce two rows and MC3 one row -- confirm against their definitions.
; The reference offset passed to MC2 grows by one on every step (1 .. 9).
21634 INIT_YMM avx2
21635 cglobal intra_pred_ang16_31, 3, 5, 6
21636 mova m0, [pw_1024] ; pmulhrsw multiplier (rounded >> 5 if pw_1024 is words of 1024, as named)
21637 mova m5, [intra_pred_shuff_0_8] ; shuffle mask producing adjacent byte pairs (i, i + 1)
21638 lea r3, [3 * r1] ; r3 = 3 * stride
21639 lea r4, [c_ang16_mode_31] ; r4 = coefficient table base
21640
21641 INTRA_PRED_ANG16_MC2 1
21642 INTRA_PRED_ANG16_MC3 r0, 0
21643
21644 INTRA_PRED_ANG16_MC2 2
21645 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
21646
21647 INTRA_PRED_ANG16_MC2 3
21648 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
21649
21650 INTRA_PRED_ANG16_MC2 4
21651 lea r0, [r0 + 4 * r1] ; advance store base by four rows
21652 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
21653
21654 INTRA_PRED_ANG16_MC2 5
21655 add r4, 4 * mmsize ; advance coefficient pointer by four vector rows
21656 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
21657
21658 INTRA_PRED_ANG16_MC2 6
21659 lea r0, [r0 + 4 * r1]
21660 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
21661
21662 INTRA_PRED_ANG16_MC2 7
21663 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
21664
21665 INTRA_PRED_ANG16_MC2 8
21666 lea r0, [r0 + 4 * r1]
21667 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
21668
21669 INTRA_PRED_ANG16_MC2 9
21670 INTRA_PRED_ANG16_MC3 r0 + r3, 4 ; index 4: r4 is not advanced a second time in this kernel
21671 RET
21672
; intra_pred_ang16_24 (AVX2)
; Registers as used below: r0 = base address for row stores, r1 = row stride,
; r2 = reference bytes, r3 = 3 * r1, r4 = coefficient table for this mode.
; Besides the bytes around r2 this kernel patches in single bytes from
; r2 + 38 and r2 + 45 -- presumably projected samples from the other reference
; edge; confirm against the reference buffer layout.
; INTRA_PRED_ANG16_MC0 / MC1 / MC2 are defined earlier in the file; the inline
; sequences below set up m1/m2 by hand before invoking MC0 / MC1.
21673 INIT_YMM avx2
21674 cglobal intra_pred_ang16_24, 3, 5, 6
21675 mova m0, [pw_1024] ; pmulhrsw multiplier (rounded >> 5 if pw_1024 is words of 1024, as named)
21676 mova m5, [intra_pred_shuff_0_8] ; shuffle mask producing adjacent byte pairs (i, i + 1)
21677 lea r3, [3 * r1] ; r3 = 3 * stride
21678 lea r4, [c_ang16_mode_24] ; r4 = coefficient table base
21679
21680 INTRA_PRED_ANG16_MC2 0
21681 INTRA_PRED_ANG16_MC1 0
21682
21683 lea r0, [r0 + 4 * r1] ; advance store base by four rows
21684 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
21685
; Reference window moved one byte down: 16 bytes from r2 - 1 with byte 0
; replaced by the byte at r2 + 38, duplicated into both lanes.
21686 movu xm1, [r2 - 1]
21687 pinsrb xm1, [r2 + 38], 0
21688 vinserti128 m1, m1, xm1, 1
21689 pshufb m1, m5
21690 vbroadcasti128 m2, [r2 + 7]
21691 pshufb m2, m5
21692 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3
21693
21694 lea r0, [r0 + 4 * r1]
21695 add r4, 4 * mmsize ; advance coefficient pointer by four vector rows
21696
21697 INTRA_PRED_ANG16_MC1 0
21698
; Window moved down once more: 16 bytes from r2 - 2 with bytes 0 and 1
; replaced by the bytes at r2 + 45 and r2 + 38.
21699 movu xm1, [r2 - 2]
21700 pinsrb xm1, [r2 + 45], 0
21701 pinsrb xm1, [r2 + 38], 1
21702 vinserti128 m1, m1, xm1, 1
21703 pshufb m1, m5
21704 vbroadcasti128 m2, [r2 + 6]
21705 pshufb m2, m5
21706
21707 lea r0, [r0 + 4 * r1]
21708
21709 INTRA_PRED_ANG16_MC1 2
21710 RET
21711
; INTRA_PRED_ANG16_MC5 %1, %2
; Slides the running reference window in xm6 up by one byte and refills m1/m2.
;   %1 = byte offset from r2 of the sample inserted at position 0 of xm6
;   %2 = byte offset from r2 of the 16 bytes broadcast into m2
; Expects m5 to hold the pair-shuffle mask and xm6 the previous window.
; Modifies xm6, m1 and m2.
21712 %macro INTRA_PRED_ANG16_MC5 2
21713 pslldq xm6, xm6, 1 ; shift window up one byte, freeing byte 0
21714 pinsrb xm6, [r2 + %1], 0 ; new lowest sample
21715 vinserti128 m1, m6, xm6, 1 ; same 16 bytes in both lanes of m1
21716 pshufb m1, m5
21717 vbroadcasti128 m2, [r2 + %2]
21718 pshufb m2, m5
21719 %endmacro
21720
; intra_pred_ang16_23 (AVX2)
; Registers as used below: r0 = base address for row stores, r1 = row stride,
; r2 = reference bytes, r3 = 3 * r1, r4 = coefficient table for this mode.
; xm6 carries a sliding 16-byte reference window that is extended one byte at
; a time (INTRA_PRED_ANG16_MC5) with bytes taken from r2 + 36 / 39 / 43 / 46
; -- presumably projected samples from the other reference edge; confirm
; against the reference buffer layout.
; INTRA_PRED_ANG16_MC0 / MC2 / MC3 are defined earlier in the file.
21721 INIT_YMM avx2
21722 cglobal intra_pred_ang16_23, 3, 5, 7
21723 mova m0, [pw_1024] ; pmulhrsw multiplier (rounded >> 5 if pw_1024 is words of 1024, as named)
21724 mova m5, [intra_pred_shuff_0_8] ; shuffle mask producing adjacent byte pairs (i, i + 1)
21725 lea r3, [3 * r1] ; r3 = 3 * stride
21726 lea r4, [c_ang16_mode_23] ; r4 = coefficient table base
21727
21728 INTRA_PRED_ANG16_MC2 0
21729 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
21730 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1
21731
; Initialise the sliding window: 16 bytes from r2 - 1, byte 0 replaced by the
; byte at r2 + 36.
21732 movu xm6, [r2 - 1]
21733 pinsrb xm6, [r2 + 36], 0
21734 vinserti128 m1, m6, xm6, 1
21735 pshufb m1, m5
21736 vbroadcasti128 m2, [r2 + 7]
21737 pshufb m2, m5
21738 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
21739
21740 lea r0, [r0 + 4 * r1] ; advance store base by four rows
21741
21742 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
21743
21744 add r4, 4 * mmsize ; advance coefficient pointer by four vector rows
21745
21746 INTRA_PRED_ANG16_MC5 39, 6
21747 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
21748
21749 lea r0, [r0 + 4 * r1]
21750
21751 INTRA_PRED_ANG16_MC3 r0 + r1, 1
21752 INTRA_PRED_ANG16_MC5 43, 5
21753 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2
21754
21755 lea r0, [r0 + 4 * r1]
21756
21757 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
21758
21759 add r4, 4 * mmsize
21760
21761 INTRA_PRED_ANG16_MC5 46, 4
21762 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
21763 RET
21764
; intra_pred_ang16_22 (AVX2)
; Registers as used below: r0 = base address for row stores, r1 = row stride,
; r2 = reference bytes, r3 = 3 * r1, r4 = coefficient table for this mode.
; xm6 carries a sliding 16-byte reference window that is extended one byte at
; a time (INTRA_PRED_ANG16_MC5) with bytes taken from r2 + 34 / 37 / 39 / 42 /
; 44 / 47 -- presumably projected samples from the other reference edge;
; confirm against the reference buffer layout.
; INTRA_PRED_ANG16_MC0 / MC2 / MC3 are defined earlier in the file.
21765 INIT_YMM avx2
21766 cglobal intra_pred_ang16_22, 3, 5, 7
21767 mova m0, [pw_1024] ; pmulhrsw multiplier (rounded >> 5 if pw_1024 is words of 1024, as named)
21768 mova m5, [intra_pred_shuff_0_8] ; shuffle mask producing adjacent byte pairs (i, i + 1)
21769 lea r3, [3 * r1] ; r3 = 3 * stride
21770 lea r4, [c_ang16_mode_22] ; r4 = coefficient table base
21771
21772 INTRA_PRED_ANG16_MC2 0
21773 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
21774
; Initialise the sliding window: 16 bytes from r2 - 1, byte 0 replaced by the
; byte at r2 + 34.
21775 movu xm6, [r2 - 1]
21776 pinsrb xm6, [r2 + 34], 0
21777 vinserti128 m1, m6, xm6, 1
21778 pshufb m1, m5
21779 vbroadcasti128 m2, [r2 + 7]
21780 pshufb m2, m5
21781 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1
21782
21783 lea r0, [r0 + 4 * r1] ; advance store base by four rows
21784
21785 INTRA_PRED_ANG16_MC5 37, 6
21786 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
21787 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3
21788
21789 add r4, 4 * mmsize ; advance coefficient pointer by four vector rows
21790
21791 INTRA_PRED_ANG16_MC5 39, 5
21792 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
21793
21794 lea r0, [r0 + 4 * r1]
21795
21796 INTRA_PRED_ANG16_MC5 42, 4
21797 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
21798 INTRA_PRED_ANG16_MC3 r0 + r3, 2
21799
21800 lea r0, [r0 + 4 * r1]
21801
21802 INTRA_PRED_ANG16_MC5 44, 3
21803 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
21804 INTRA_PRED_ANG16_MC5 47, 2
21805 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4 ; index 4: r4 is advanced only once in this kernel
21806 RET
21807
; INTRA_PRED_ANG32_ALIGNR_STORE %1
; Advances r0 by four rows, then stores four 32-byte rows built with palignr
; from the register pair m1:m0 at byte shifts %1, %1 + 1, %1 + 2, %1 + 3.
; Expects r1 = row stride and r3 = 3 * r1. Modifies r0 and m2.
21808 %macro INTRA_PRED_ANG32_ALIGNR_STORE 1
21809 lea r0, [r0 + 4 * r1]
21810 palignr m2, m1, m0, %1
21811 movu [r0], m2
21812 palignr m2, m1, m0, (%1 + 1)
21813 movu [r0 + r1], m2
21814 palignr m2, m1, m0, (%1 + 2)
21815 movu [r0 + 2 * r1], m2
21816 palignr m2, m1, m0, (%1 + 3)
21817 movu [r0 + r3], m2
21818 %endmacro
21819
; intra_pred_ang32_34 (AVX2)
; Pure copy kernel: row k (k = 0 .. 31) receives the 32 reference bytes that
; start at r2 + 2 + k. r0 = base address for row stores, r1 = row stride,
; r2 = reference bytes, r3 = 3 * r1.
; palignr on ymm registers works per 128-bit lane. Because m1 is loaded exactly
; 16 bytes after m0, "palignr m2, m1, m0, n" yields the 32 contiguous bytes
; that start n bytes after m0's load address (valid for n = 0 .. 16).
21820 INIT_YMM avx2
21821 cglobal intra_pred_ang32_34, 3, 4,3
21822 lea r3, [3 * r1]
21823
21824 movu m0, [r2 + 2]
21825 movu m1, [r2 + 18]
21826 movu [r0], m0 ; row 0
21827 palignr m2, m1, m0, 1
21828 movu [r0 + r1], m2 ; row 1
21829 palignr m2, m1, m0, 2
21830 movu [r0 + 2 * r1], m2 ; row 2
21831 palignr m2, m1, m0, 3
21832 movu [r0 + r3], m2 ; row 3
21833
21834 INTRA_PRED_ANG32_ALIGNR_STORE 4 ; rows 4 .. 7
21835 INTRA_PRED_ANG32_ALIGNR_STORE 8 ; rows 8 .. 11
21836 INTRA_PRED_ANG32_ALIGNR_STORE 12 ; rows 12 .. 15
21837
21838 lea r0, [r0 + 4 * r1]
21839 palignr m2, m1, m0, 16 ; shift 16 selects m1 unchanged
21840 movu [r0], m2 ; row 16
21841 movu m0, [r2 + 19] ; reload the register pair 17 bytes further on
21842 movu [r0 + r1], m0 ; row 17
21843 movu m1, [r2 + 35]
21844 palignr m2, m1, m0, 1
21845 movu [r0 + 2 * r1], m2 ; row 18
21846 palignr m2, m1, m0, 2
21847 movu [r0 + r3], m2 ; row 19
21848
21849 INTRA_PRED_ANG32_ALIGNR_STORE 3 ; rows 20 .. 23
21850 INTRA_PRED_ANG32_ALIGNR_STORE 7 ; rows 24 .. 27
21851 INTRA_PRED_ANG32_ALIGNR_STORE 11 ; rows 28 .. 31
21852 RET
21853
; intra_pred_ang32_2 (AVX2)
; Pure copy kernel: row k (k = 0 .. 31) receives the 32 reference bytes that
; start at r2 + 64 + 2 + k. Same instruction sequence as intra_pred_ang32_34
; with every load displaced by 64 bytes.
; r0 = base address for row stores, r1 = row stride, r2 = reference bytes,
; r3 = 3 * r1.
; palignr on ymm registers works per 128-bit lane. Because m1 is loaded exactly
; 16 bytes after m0, "palignr m2, m1, m0, n" yields the 32 contiguous bytes
; that start n bytes after m0's load address (valid for n = 0 .. 16).
21854 INIT_YMM avx2
21855 cglobal intra_pred_ang32_2, 3, 4,3
21856 lea r3, [3 * r1]
21857
21858 movu m0, [r2 + 64 + 2]
21859 movu m1, [r2 + 64 + 18]
21860 movu [r0], m0 ; row 0
21861 palignr m2, m1, m0, 1
21862 movu [r0 + r1], m2 ; row 1
21863 palignr m2, m1, m0, 2
21864 movu [r0 + 2 * r1], m2 ; row 2
21865 palignr m2, m1, m0, 3
21866 movu [r0 + r3], m2 ; row 3
21867
21868 INTRA_PRED_ANG32_ALIGNR_STORE 4 ; rows 4 .. 7
21869 INTRA_PRED_ANG32_ALIGNR_STORE 8 ; rows 8 .. 11
21870 INTRA_PRED_ANG32_ALIGNR_STORE 12 ; rows 12 .. 15
21871
21872 lea r0, [r0 + 4 * r1]
21873 palignr m2, m1, m0, 16 ; shift 16 selects m1 unchanged
21874 movu [r0], m2 ; row 16
21875 movu m0, [r2 + 64 + 19] ; reload the register pair 17 bytes further on
21876 movu [r0 + r1], m0 ; row 17
21877 movu m1, [r2 + 64 + 35]
21878 palignr m2, m1, m0, 1
21879 movu [r0 + 2 * r1], m2 ; row 18
21880 palignr m2, m1, m0, 2
21881 movu [r0 + r3], m2 ; row 19
21882
21883 INTRA_PRED_ANG32_ALIGNR_STORE 3 ; rows 20 .. 23
21884 INTRA_PRED_ANG32_ALIGNR_STORE 7 ; rows 24 .. 27
21885 INTRA_PRED_ANG32_ALIGNR_STORE 11 ; rows 28 .. 31
21886 RET
21887
; INTRA_PRED_ANG32_STORE
; Advances r0 by four rows and stores the same 32-byte vector m0 to each of
; the four rows. Expects r1 = row stride and r3 = 3 * r1. Modifies r0.
21888 %macro INTRA_PRED_ANG32_STORE 0
21889 lea r0, [r0 + 4 * r1]
21890 movu [r0], m0
21891 movu [r0 + r1], m0
21892 movu [r0 + r1 * 2], m0
21893 movu [r0 + r3], m0
21894 %endmacro
21895
; intra_pred_ang32_26 (AVX2)
; Replicates the 32 reference bytes starting at r2 + 1 into 32 consecutive
; rows (4 stored inline + 7 macro invocations of 4 rows each).
; r0 = base address for row stores, r1 = row stride, r2 = reference bytes,
; r3 = 3 * r1.
21896 INIT_YMM avx2
21897 cglobal intra_pred_ang32_26, 3, 4, 1
21898 lea r3, [3 * r1]
21899 movu m0, [r2 + 1]
21900 movu [r0], m0
21901 movu [r0 + r1], m0
21902 movu [r0 + r1 * 2], m0
21903 movu [r0 + r3], m0
21904
21905 INTRA_PRED_ANG32_STORE
21906 INTRA_PRED_ANG32_STORE
21907 INTRA_PRED_ANG32_STORE
21908 INTRA_PRED_ANG32_STORE
21909 INTRA_PRED_ANG32_STORE
21910 INTRA_PRED_ANG32_STORE
21911 INTRA_PRED_ANG32_STORE
21912 RET
21913
; INTRA_PRED_STORE_4x4
; Stores four 4-byte rows from m0: dwords 0 and 1 of the low lane go to rows
; 0 and 1, dwords 0 and 1 of the high lane go to rows 2 and 3.
; Expects r0 = first row address, r1 = row stride.
; Modifies r0 (advanced by two rows) and xm0 (overwritten with the high lane).
21914 %macro INTRA_PRED_STORE_4x4 0
21915 movd [r0], xm0
21916 pextrd [r0 + r1], xm0, 1
21917 vextracti128 xm0, m0, 1
21918 lea r0, [r0 + 2 * r1]
21919 movd [r0], xm0
21920 pextrd [r0 + r1], xm0, 1
21921 %endmacro
21922
; INTRA_PRED_TRANS_STORE_4x4
; Gathers the low qword of each lane of m0 into the low 128 bits, reorders the
; 16 bytes with the c_trans_4x4 shuffle table (defined elsewhere; presumably a
; 4x4 byte transpose -- confirm) and stores the four dwords as four rows.
; Expects r0 = first row address, r1 = row stride.
; Modifies r0 (advanced by two rows) and m0.
21923 %macro INTRA_PRED_TRANS_STORE_4x4 0
21924 vpermq m0, m0, 00001000b ; qword 0 <- qword 0, qword 1 <- qword 2
21925 pshufb m0, [c_trans_4x4]
21926
21927 ;store
21928 movd [r0], xm0
21929 pextrd [r0 + r1], xm0, 1
21930 lea r0, [r0 + 2 * r1]
21931 pextrd [r0], xm0, 2
21932 pextrd [r0 + r1], xm0, 3
21933 %endmacro
21934
; intra_pred_ang4_27 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred_shuff_0_4,
; multiply-adds byte pairs against c_ang4_mode_27, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
21935 INIT_YMM avx2
21936 cglobal intra_pred_ang4_27, 3, 3, 1
21937 vbroadcasti128 m0, [r2 + 1]
21938 pshufb m0, [intra_pred_shuff_0_4]
21939 pmaddubsw m0, [c_ang4_mode_27]
21940 pmulhrsw m0, [pw_1024]
21941 packuswb m0, m0
21942
21943 INTRA_PRED_STORE_4x4
21944 RET
21945
; intra_pred_ang4_28 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred_shuff_0_4,
; multiply-adds byte pairs against c_ang4_mode_28, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
21946 INIT_YMM avx2
21947 cglobal intra_pred_ang4_28, 3, 3, 1
21948 vbroadcasti128 m0, [r2 + 1]
21949 pshufb m0, [intra_pred_shuff_0_4]
21950 pmaddubsw m0, [c_ang4_mode_28]
21951 pmulhrsw m0, [pw_1024]
21952 packuswb m0, m0
21953
21954 INTRA_PRED_STORE_4x4
21955 RET
21956
; intra_pred_ang4_29 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred4_shuff1,
; multiply-adds byte pairs against c_ang4_mode_29, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
21957 INIT_YMM avx2
21958 cglobal intra_pred_ang4_29, 3, 3, 1
21959 vbroadcasti128 m0, [r2 + 1]
21960 pshufb m0, [intra_pred4_shuff1]
21961 pmaddubsw m0, [c_ang4_mode_29]
21962 pmulhrsw m0, [pw_1024]
21963 packuswb m0, m0
21964
21965 INTRA_PRED_STORE_4x4
21966 RET
21967
; intra_pred_ang4_30 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred4_shuff2,
; multiply-adds byte pairs against c_ang4_mode_30, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
21968 INIT_YMM avx2
21969 cglobal intra_pred_ang4_30, 3, 3, 1
21970 vbroadcasti128 m0, [r2 + 1]
21971 pshufb m0, [intra_pred4_shuff2]
21972 pmaddubsw m0, [c_ang4_mode_30]
21973 pmulhrsw m0, [pw_1024]
21974 packuswb m0, m0
21975
21976 INTRA_PRED_STORE_4x4
21977 RET
21978
; intra_pred_ang4_31 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred4_shuff31,
; multiply-adds byte pairs against c_ang4_mode_31, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
21979 INIT_YMM avx2
21980 cglobal intra_pred_ang4_31, 3, 3, 1
21981 vbroadcasti128 m0, [r2 + 1]
21982 pshufb m0, [intra_pred4_shuff31]
21983 pmaddubsw m0, [c_ang4_mode_31]
21984 pmulhrsw m0, [pw_1024]
21985 packuswb m0, m0
21986
21987 INTRA_PRED_STORE_4x4
21988 RET
21989
; intra_pred_ang4_32 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred4_shuff31
; (the same shuffle table the mode-31 kernel uses), multiply-adds byte pairs
; against c_ang4_mode_32, rounds with pmulhrsw by pw_1024, packs to bytes and
; stores four rows.
21990 INIT_YMM avx2
21991 cglobal intra_pred_ang4_32, 3, 3, 1
21992 vbroadcasti128 m0, [r2 + 1]
21993 pshufb m0, [intra_pred4_shuff31]
21994 pmaddubsw m0, [c_ang4_mode_32]
21995 pmulhrsw m0, [pw_1024]
21996 packuswb m0, m0
21997
21998 INTRA_PRED_STORE_4x4
21999 RET
22000
; intra_pred_ang4_33 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred4_shuff33,
; multiply-adds byte pairs against c_ang4_mode_33, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22001 INIT_YMM avx2
22002 cglobal intra_pred_ang4_33, 3, 3, 1
22003 vbroadcasti128 m0, [r2 + 1]
22004 pshufb m0, [intra_pred4_shuff33]
22005 pmaddubsw m0, [c_ang4_mode_33]
22006 pmulhrsw m0, [pw_1024]
22007 packuswb m0, m0
22008
22009 INTRA_PRED_STORE_4x4
22010 RET
22011
22012
; intra_pred_ang4_3 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2 + 1, arranges them with intra_pred4_shuff3 and
; reuses the mode-33 coefficient table (c_ang4_mode_33); the result goes
; through INTRA_PRED_TRANS_STORE_4x4 rather than the plain row store.
22013 INIT_YMM avx2
22014 cglobal intra_pred_ang4_3, 3, 3, 1
22015 vbroadcasti128 m0, [r2 + 1]
22016 pshufb m0, [intra_pred4_shuff3]
22017 pmaddubsw m0, [c_ang4_mode_33]
22018 pmulhrsw m0, [pw_1024]
22019 packuswb m0, m0
22020
22021 INTRA_PRED_TRANS_STORE_4x4
22022 RET
22023
; intra_pred_ang4_4 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff5 and
; reuses the mode-32 coefficient table (c_ang4_mode_32); the result goes
; through INTRA_PRED_TRANS_STORE_4x4.
22024 INIT_YMM avx2
22025 cglobal intra_pred_ang4_4, 3, 3, 1
22026 vbroadcasti128 m0, [r2]
22027 pshufb m0, [intra_pred4_shuff5]
22028 pmaddubsw m0, [c_ang4_mode_32]
22029 pmulhrsw m0, [pw_1024]
22030 packuswb m0, m0
22031
22032 INTRA_PRED_TRANS_STORE_4x4
22033 RET
22034
; intra_pred_ang4_5 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff5,
; multiply-adds byte pairs against c_ang4_mode_5, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22035 INIT_YMM avx2
22036 cglobal intra_pred_ang4_5, 3, 3, 1
22037 vbroadcasti128 m0, [r2]
22038 pshufb m0, [intra_pred4_shuff5]
22039 pmaddubsw m0, [c_ang4_mode_5]
22040 pmulhrsw m0, [pw_1024]
22041 packuswb m0, m0
22042
22043 INTRA_PRED_TRANS_STORE_4x4
22044 RET
22045
; intra_pred_ang4_6 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff6,
; multiply-adds byte pairs against c_ang4_mode_6, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22046 INIT_YMM avx2
22047 cglobal intra_pred_ang4_6, 3, 3, 1
22048 vbroadcasti128 m0, [r2]
22049 pshufb m0, [intra_pred4_shuff6]
22050 pmaddubsw m0, [c_ang4_mode_6]
22051 pmulhrsw m0, [pw_1024]
22052 packuswb m0, m0
22053
22054 INTRA_PRED_TRANS_STORE_4x4
22055 RET
22056
; intra_pred_ang4_7 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff7,
; multiply-adds byte pairs against c_ang4_mode_7, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22057 INIT_YMM avx2
22058 cglobal intra_pred_ang4_7, 3, 3, 1
22059 vbroadcasti128 m0, [r2]
22060 pshufb m0, [intra_pred4_shuff7]
22061 pmaddubsw m0, [c_ang4_mode_7]
22062 pmulhrsw m0, [pw_1024]
22063 packuswb m0, m0
22064
22065 INTRA_PRED_TRANS_STORE_4x4
22066 RET
22067
; intra_pred_ang4_8 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff9 (the
; same shuffle table the mode-9 kernel uses), multiply-adds byte pairs against
; c_ang4_mode_8, rounds with pmulhrsw by pw_1024, packs to bytes and stores
; through INTRA_PRED_TRANS_STORE_4x4.
22068 INIT_YMM avx2
22069 cglobal intra_pred_ang4_8, 3, 3, 1
22070 vbroadcasti128 m0, [r2]
22071 pshufb m0, [intra_pred4_shuff9]
22072 pmaddubsw m0, [c_ang4_mode_8]
22073 pmulhrsw m0, [pw_1024]
22074 packuswb m0, m0
22075
22076 INTRA_PRED_TRANS_STORE_4x4
22077 RET
22078
; intra_pred_ang4_9 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff9,
; multiply-adds byte pairs against c_ang4_mode_9, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22079 INIT_YMM avx2
22080 cglobal intra_pred_ang4_9, 3, 3, 1
22081 vbroadcasti128 m0, [r2]
22082 pshufb m0, [intra_pred4_shuff9]
22083 pmaddubsw m0, [c_ang4_mode_9]
22084 pmulhrsw m0, [pw_1024]
22085 packuswb m0, m0
22086
22087 INTRA_PRED_TRANS_STORE_4x4
22088 RET
22089
; intra_pred_ang4_11 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff12 (the
; same shuffle table the mode-12 kernel uses), multiply-adds byte pairs against
; c_ang4_mode_11, rounds with pmulhrsw by pw_1024, packs to bytes and stores
; through INTRA_PRED_TRANS_STORE_4x4.
22090 INIT_YMM avx2
22091 cglobal intra_pred_ang4_11, 3, 3, 1
22092 vbroadcasti128 m0, [r2]
22093 pshufb m0, [intra_pred4_shuff12]
22094 pmaddubsw m0, [c_ang4_mode_11]
22095 pmulhrsw m0, [pw_1024]
22096 packuswb m0, m0
22097
22098 INTRA_PRED_TRANS_STORE_4x4
22099 RET
22100
; intra_pred_ang4_12 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff12,
; multiply-adds byte pairs against c_ang4_mode_12, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22101 INIT_YMM avx2
22102 cglobal intra_pred_ang4_12, 3, 3, 1
22103 vbroadcasti128 m0, [r2]
22104 pshufb m0, [intra_pred4_shuff12]
22105 pmaddubsw m0, [c_ang4_mode_12]
22106 pmulhrsw m0, [pw_1024]
22107 packuswb m0, m0
22108
22109 INTRA_PRED_TRANS_STORE_4x4
22110 RET
22111
; intra_pred_ang4_13 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff13,
; multiply-adds byte pairs against c_ang4_mode_13, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22112 INIT_YMM avx2
22113 cglobal intra_pred_ang4_13, 3, 3, 1
22114 vbroadcasti128 m0, [r2]
22115 pshufb m0, [intra_pred4_shuff13]
22116 pmaddubsw m0, [c_ang4_mode_13]
22117 pmulhrsw m0, [pw_1024]
22118 packuswb m0, m0
22119
22120 INTRA_PRED_TRANS_STORE_4x4
22121 RET
22122
; intra_pred_ang4_14 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff14,
; multiply-adds byte pairs against c_ang4_mode_14, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22123 INIT_YMM avx2
22124 cglobal intra_pred_ang4_14, 3, 3, 1
22125 vbroadcasti128 m0, [r2]
22126 pshufb m0, [intra_pred4_shuff14]
22127 pmaddubsw m0, [c_ang4_mode_14]
22128 pmulhrsw m0, [pw_1024]
22129 packuswb m0, m0
22130
22131 INTRA_PRED_TRANS_STORE_4x4
22132 RET
22133
; intra_pred_ang4_15 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff15,
; multiply-adds byte pairs against c_ang4_mode_15, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22134 INIT_YMM avx2
22135 cglobal intra_pred_ang4_15, 3, 3, 1
22136 vbroadcasti128 m0, [r2]
22137 pshufb m0, [intra_pred4_shuff15]
22138 pmaddubsw m0, [c_ang4_mode_15]
22139 pmulhrsw m0, [pw_1024]
22140 packuswb m0, m0
22141
22142 INTRA_PRED_TRANS_STORE_4x4
22143 RET
22144
; intra_pred_ang4_16 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff16,
; multiply-adds byte pairs against c_ang4_mode_16, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22145 INIT_YMM avx2
22146 cglobal intra_pred_ang4_16, 3, 3, 1
22147 vbroadcasti128 m0, [r2]
22148 pshufb m0, [intra_pred4_shuff16]
22149 pmaddubsw m0, [c_ang4_mode_16]
22150 pmulhrsw m0, [pw_1024]
22151 packuswb m0, m0
22152
22153 INTRA_PRED_TRANS_STORE_4x4
22154 RET
22155
; intra_pred_ang4_17 (AVX2): 4x4 kernel with transposing store.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff17,
; multiply-adds byte pairs against c_ang4_mode_17, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores through INTRA_PRED_TRANS_STORE_4x4.
22156 INIT_YMM avx2
22157 cglobal intra_pred_ang4_17, 3, 3, 1
22158 vbroadcasti128 m0, [r2]
22159 pshufb m0, [intra_pred4_shuff17]
22160 pmaddubsw m0, [c_ang4_mode_17]
22161 pmulhrsw m0, [pw_1024]
22162 packuswb m0, m0
22163
22164 INTRA_PRED_TRANS_STORE_4x4
22165 RET
22166
; intra_pred_ang4_19 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff19,
; multiply-adds byte pairs against c_ang4_mode_19, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22167 INIT_YMM avx2
22168 cglobal intra_pred_ang4_19, 3, 3, 1
22169 vbroadcasti128 m0, [r2]
22170 pshufb m0, [intra_pred4_shuff19]
22171 pmaddubsw m0, [c_ang4_mode_19]
22172 pmulhrsw m0, [pw_1024]
22173 packuswb m0, m0
22174
22175 INTRA_PRED_STORE_4x4
22176 RET
22177
; intra_pred_ang4_20 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff20,
; multiply-adds byte pairs against c_ang4_mode_20, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22178 INIT_YMM avx2
22179 cglobal intra_pred_ang4_20, 3, 3, 1
22180 vbroadcasti128 m0, [r2]
22181 pshufb m0, [intra_pred4_shuff20]
22182 pmaddubsw m0, [c_ang4_mode_20]
22183 pmulhrsw m0, [pw_1024]
22184 packuswb m0, m0
22185
22186 INTRA_PRED_STORE_4x4
22187 RET
22188
; intra_pred_ang4_21 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff21,
; multiply-adds byte pairs against c_ang4_mode_21, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22189 INIT_YMM avx2
22190 cglobal intra_pred_ang4_21, 3, 3, 1
22191 vbroadcasti128 m0, [r2]
22192 pshufb m0, [intra_pred4_shuff21]
22193 pmaddubsw m0, [c_ang4_mode_21]
22194 pmulhrsw m0, [pw_1024]
22195 packuswb m0, m0
22196
22197 INTRA_PRED_STORE_4x4
22198 RET
22199
; intra_pred_ang4_22 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff22,
; multiply-adds byte pairs against c_ang4_mode_22, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22200 INIT_YMM avx2
22201 cglobal intra_pred_ang4_22, 3, 3, 1
22202 vbroadcasti128 m0, [r2]
22203 pshufb m0, [intra_pred4_shuff22]
22204 pmaddubsw m0, [c_ang4_mode_22]
22205 pmulhrsw m0, [pw_1024]
22206 packuswb m0, m0
22207
22208 INTRA_PRED_STORE_4x4
22209 RET
22210
; intra_pred_ang4_23 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred4_shuff23,
; multiply-adds byte pairs against c_ang4_mode_23, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22211 INIT_YMM avx2
22212 cglobal intra_pred_ang4_23, 3, 3, 1
22213 vbroadcasti128 m0, [r2]
22214 pshufb m0, [intra_pred4_shuff23]
22215 pmaddubsw m0, [c_ang4_mode_23]
22216 pmulhrsw m0, [pw_1024]
22217 packuswb m0, m0
22218
22219 INTRA_PRED_STORE_4x4
22220 RET
22221
; intra_pred_ang4_24 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred_shuff_0_4,
; multiply-adds byte pairs against c_ang4_mode_24, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22222 INIT_YMM avx2
22223 cglobal intra_pred_ang4_24, 3, 3, 1
22224 vbroadcasti128 m0, [r2]
22225 pshufb m0, [intra_pred_shuff_0_4]
22226 pmaddubsw m0, [c_ang4_mode_24]
22227 pmulhrsw m0, [pw_1024]
22228 packuswb m0, m0
22229
22230 INTRA_PRED_STORE_4x4
22231 RET
22232
; intra_pred_ang4_25 (AVX2): 4x4 kernel.
; r0 = first row address, r1 = row stride, r2 = reference bytes.
; Broadcasts the 16 bytes at r2, arranges them with intra_pred_shuff_0_4,
; multiply-adds byte pairs against c_ang4_mode_25, rounds with pmulhrsw by
; pw_1024, packs to bytes and stores four rows.
22233 INIT_YMM avx2
22234 cglobal intra_pred_ang4_25, 3, 3, 1
22235 vbroadcasti128 m0, [r2]
22236 pshufb m0, [intra_pred_shuff_0_4]
22237 pmaddubsw m0, [c_ang4_mode_25]
22238 pmulhrsw m0, [pw_1024]
22239 packuswb m0, m0
22240
22241 INTRA_PRED_STORE_4x4
22242 RET
22243
22244 ;-----------------------------------------------------------------------------------
22245 ; void intra_filter_NxN(const pixel* references, pixel* filtered)
22246 ;-----------------------------------------------------------------------------------
; intra_filter_4x4 (SSE4)
; r0 = references (input bytes), r1 = filtered (output bytes).
; For indices 0 .. 15 computes (prev + 2 * cur + next + pw_2) >> 2 on 16-bit
; lanes (a [1 2 1] smoothing if pw_2 holds words of 2, as its name suggests).
; Index 0 uses samples 1 and 9 as its neighbours and index 9 uses sample 0 as
; its previous neighbour. Indices 8 (topLast) and 16 (LeftLast) are copied
; through unfiltered at the end.
; NOTE(review): pmovzxbw [r0 + 16] reads the 8 bytes at indices 16 .. 23; the
; input buffer must be readable that far -- confirm with the caller.
22247 INIT_XMM sse4
22248 cglobal intra_filter_4x4, 2,4,5
22249 mov r2b, byte [r0 + 8] ; topLast
22250 mov r3b, byte [r0 + 16] ; LeftLast
22251
22252 ; filtering top
22253 pmovzxbw m0, [r0 + 0] ; samples 0 .. 7 as words
22254 pmovzxbw m1, [r0 + 8] ; samples 8 .. 15 as words
22255 pmovzxbw m2, [r0 + 16] ; samples 16 .. 23 as words
22256
22257 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
22258 palignr m3, m1, m0, 4
22259 pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1]
22260
22261 psllw m0, 1 ; 2 * samples[i]
22262 paddw m4, m3
22263 paddw m0, m4
22264 paddw m0, [pw_2]
22265 psrlw m0, 2
22266
22267 ; filtering left
22268 palignr m4, m1, m1, 14 ; [14 13 12 11 10 9 8 15] rotate up by one word
22269 pinsrb m4, [r0], 2 ; [14 13 12 11 10 9 0 15] samples[i - 1], sample 0 precedes sample 9
22270 palignr m3, m2, m1, 4
22271 pshufb m3, [intra_filter4_shuf1] ; [16 15 14 13 12 11 10 17] samples[i + 1]
22272
22273 psllw m1, 1
22274 paddw m4, m3
22275 paddw m1, m4
22276 paddw m1, [pw_2]
22277 psrlw m1, 2
22278 packuswb m0, m1
22279
22280 movu [r1], m0
22281 mov [r1 + 8], r2b ; topLast
22282 mov [r1 + 16], r3b ; LeftLast
22283 RET
22284
; intra_filter_8x8 (SSE4)
; r0 = references (input bytes), r1 = filtered (output bytes).
; For indices 0 .. 31 computes (prev + 2 * cur + next + pw_2) >> 2 on 16-bit
; lanes (a [1 2 1] smoothing if pw_2 holds words of 2, as its name suggests).
; Index 0 uses samples 1 and 17 as its neighbours and index 17 uses sample 0
; as its previous neighbour. Indices 16 (topLast) and 32 (LeftLast) are copied
; through unfiltered at the end.
; NOTE(review): pmovzxbw [r0 + 32] reads the 8 bytes at indices 32 .. 39; the
; input buffer must be readable that far -- confirm with the caller.
22285 INIT_XMM sse4
22286 cglobal intra_filter_8x8, 2,4,6
22287 mov r2b, byte [r0 + 16] ; topLast
22288 mov r3b, byte [r0 + 32] ; LeftLast
22289
22290 ; filtering top
22291 pmovzxbw m0, [r0 + 0] ; samples 0 .. 7 as words
22292 pmovzxbw m1, [r0 + 8] ; samples 8 .. 15 as words
22293 pmovzxbw m2, [r0 + 16] ; samples 16 .. 23 as words
22294
22295 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
22296 palignr m5, m1, m0, 2
22297 pinsrb m5, [r0 + 17], 0 ; [8 7 6 5 4 3 2 17] samples[i + 1]
22298
22299 palignr m3, m1, m0, 14 ; samples[i - 1] for 8 .. 15, taken before m0 is doubled
22300 psllw m0, 1
22301 paddw m4, m5
22302 paddw m0, m4
22303 paddw m0, [pw_2]
22304 psrlw m0, 2
22305
22306 palignr m4, m2, m1, 2 ; samples[i + 1] for 8 .. 15
22307 psllw m1, 1
22308 paddw m4, m3
22309 paddw m1, m4
22310 paddw m1, [pw_2]
22311 psrlw m1, 2
22312
22313 packuswb m0, m1
22314 movu [r1], m0 ; filtered 0 .. 15
22315
22316 ; filtering left
22317 pmovzxbw m1, [r0 + 24] ; samples 24 .. 31 as words
22318 pmovzxbw m0, [r0 + 32] ; samples 32 .. 39 as words
22319
22320 palignr m4, m2, m2, 14 ; rotate samples 16 .. 23 up by one word
22321 pinsrb m4, [r0], 2 ; samples[i - 1] for 16 .. 23, sample 0 precedes sample 17
22322 palignr m5, m1, m2, 2 ; samples[i + 1] for 16 .. 23
22323
22324 palignr m3, m1, m2, 14 ; samples[i - 1] for 24 .. 31
22325 palignr m0, m1, 2 ; samples[i + 1] for 24 .. 31
22326
22327 psllw m2, 1
22328 paddw m4, m5
22329 paddw m2, m4
22330 paddw m2, [pw_2]
22331 psrlw m2, 2
22332
22333 psllw m1, 1
22334 paddw m0, m3
22335 paddw m1, m0
22336 paddw m1, [pw_2]
22337 psrlw m1, 2
22338
22339 packuswb m2, m1
22340 movu [r1 + 16], m2 ; filtered 16 .. 31; byte 16 is replaced on the next line
22341 mov [r1 + 16], r2b ; topLast
22342 mov [r1 + 32], r3b ; LeftLast
22343 RET
22344
; intra_filter_16x16 (SSE4)
; r0 = references (input bytes), r1 = filtered (output bytes).
; For indices 0 .. 63 computes (prev + 2 * cur + next + pw_2) >> 2 on 16-bit
; lanes (a [1 2 1] smoothing if pw_2 holds words of 2, as its name suggests),
; eight samples per register, sixteen output bytes per store.
; Index 0 uses samples 1 and 33 as its neighbours and index 33 uses sample 0
; as its previous neighbour. Indices 32 (topLast) and 64 (LeftLast) are copied
; through unfiltered at the end.
; NOTE(review): pmovzxbw [r0 + 64] reads the 8 bytes at indices 64 .. 71; the
; input buffer must be readable that far -- confirm with the caller.
22345 INIT_XMM sse4
22346 cglobal intra_filter_16x16, 2,4,6
22347 mov r2b, byte [r0 + 32] ; topLast
22348 mov r3b, byte [r0 + 64] ; LeftLast
22349
22350 ; filtering top
22351 pmovzxbw m0, [r0 + 0] ; samples 0 .. 7 as words
22352 pmovzxbw m1, [r0 + 8] ; samples 8 .. 15 as words
22353 pmovzxbw m2, [r0 + 16] ; samples 16 .. 23 as words
22354
22355 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
22356 palignr m5, m1, m0, 2
22357 pinsrb m5, [r0 + 33], 0 ; [8 7 6 5 4 3 2 33] samples[i + 1]
22358
22359 palignr m3, m1, m0, 14 ; samples[i - 1] for 8 .. 15, taken before m0 is doubled
22360 psllw m0, 1
22361 paddw m4, m5
22362 paddw m0, m4
22363 paddw m0, [pw_2]
22364 psrlw m0, 2
22365
22366 palignr m4, m2, m1, 2 ; samples[i + 1] for 8 .. 15
22367 psllw m5, m1, 1 ; m1 is kept intact for the next group
22368 paddw m4, m3
22369 paddw m5, m4
22370 paddw m5, [pw_2]
22371 psrlw m5, 2
22372 packuswb m0, m5
22373 movu [r1], m0 ; filtered 0 .. 15
22374
22375 pmovzxbw m0, [r0 + 24] ; samples 24 .. 31 as words
22376 pmovzxbw m5, [r0 + 32] ; samples 32 .. 39 as words
22377
22378 palignr m3, m2, m1, 14 ; samples[i - 1] for 16 .. 23
22379 palignr m4, m0, m2, 2 ; samples[i + 1] for 16 .. 23
22380
22381 psllw m1, m2, 1
22382 paddw m3, m4
22383 paddw m1, m3
22384 paddw m1, [pw_2]
22385 psrlw m1, 2
22386
22387 palignr m3, m0, m2, 14 ; samples[i - 1] for 24 .. 31
22388 palignr m4, m5, m0, 2 ; samples[i + 1] for 24 .. 31
22389
22390 psllw m0, 1
22391 paddw m4, m3
22392 paddw m0, m4
22393 paddw m0, [pw_2]
22394 psrlw m0, 2
22395 packuswb m1, m0
22396 movu [r1 + 16], m1 ; filtered 16 .. 31
22397
22398 ; filtering left
22399 pmovzxbw m1, [r0 + 40] ; samples 40 .. 47 as words
22400 pmovzxbw m2, [r0 + 48] ; samples 48 .. 55 as words
22401
22402 palignr m4, m5, m5, 14 ; rotate samples 32 .. 39 up by one word
22403 pinsrb m4, [r0], 2 ; samples[i - 1] for 32 .. 39, sample 0 precedes sample 33
22404 palignr m0, m1, m5, 2 ; samples[i + 1] for 32 .. 39
22405
22406 psllw m3, m5, 1
22407 paddw m4, m0
22408 paddw m3, m4
22409 paddw m3, [pw_2]
22410 psrlw m3, 2
22411
22412 palignr m0, m1, m5, 14 ; samples[i - 1] for 40 .. 47
22413 palignr m4, m2, m1, 2 ; samples[i + 1] for 40 .. 47
22414
22415 psllw m5, m1, 1
22416 paddw m4, m0
22417 paddw m5, m4
22418 paddw m5, [pw_2]
22419 psrlw m5, 2
22420 packuswb m3, m5
22421 movu [r1 + 32], m3 ; filtered 32 .. 47; byte 32 is replaced by topLast below
22422
22423 pmovzxbw m5, [r0 + 56] ; samples 56 .. 63 as words
22424 pmovzxbw m0, [r0 + 64] ; samples 64 .. 71 as words
22425
22426 palignr m3, m2, m1, 14 ; samples[i - 1] for 48 .. 55
22427 palignr m4, m5, m2, 2 ; samples[i + 1] for 48 .. 55
22428
22429 psllw m1, m2, 1
22430 paddw m3, m4
22431 paddw m1, m3
22432 paddw m1, [pw_2]
22433 psrlw m1, 2
22434
22435 palignr m3, m5, m2, 14 ; samples[i - 1] for 56 .. 63
22436 palignr m4, m0, m5, 2 ; samples[i + 1] for 56 .. 63
22437
22438 psllw m5, 1
22439 paddw m4, m3
22440 paddw m5, m4
22441 paddw m5, [pw_2]
22442 psrlw m5, 2
22443 packuswb m1, m5
22444 movu [r1 + 48], m1 ; filtered 48 .. 63
22445
22446 mov [r1 + 32], r2b ; topLast
22447 mov [r1 + 64], r3b ; LeftLast
22448 RET
22449
; intra_filter_32x32 (SSE4)
; r0 = references (input bytes), r1 = filtered (output bytes).
; For indices 0 .. 127 computes (prev + 2 * cur + next + pw_2) >> 2 on 16-bit
; lanes (a [1 2 1] smoothing if pw_2 holds words of 2, as its name suggests),
; eight samples per register, sixteen output bytes per store. Each group reuses
; the two registers loaded by the previous group as its lower neighbours.
; Index 0 uses samples 1 and 65 as its neighbours and index 65 uses sample 0
; as its previous neighbour. Indices 64 (topLast) and 128 (LeftLast) are
; copied through unfiltered at the end.
; NOTE(review): pmovzxbw [r0 + 128] reads the 8 bytes at indices 128 .. 135;
; the input buffer must be readable that far -- confirm with the caller.
22450 INIT_XMM sse4
22451 cglobal intra_filter_32x32, 2,4,6
22452 mov r2b, byte [r0 + 64] ; topLast
22453 mov r3b, byte [r0 + 128] ; LeftLast
22454
22455 ; filtering top
22456 ; 0 to 15
22457 pmovzxbw m0, [r0 + 0]
22458 pmovzxbw m1, [r0 + 8]
22459 pmovzxbw m2, [r0 + 16]
22460
22461 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
22462 palignr m5, m1, m0, 2
22463 pinsrb m5, [r0 + 65], 0 ; [8 7 6 5 4 3 2 65] samples[i + 1]
22464
22465 palignr m3, m1, m0, 14 ; samples[i - 1] for 8 .. 15, taken before m0 is doubled
22466 psllw m0, 1
22467 paddw m4, m5
22468 paddw m0, m4
22469 paddw m0, [pw_2]
22470 psrlw m0, 2
22471
22472 palignr m4, m2, m1, 2 ; samples[i + 1] for 8 .. 15
22473 psllw m5, m1, 1 ; m1 is kept intact for the next group
22474 paddw m4, m3
22475 paddw m5, m4
22476 paddw m5, [pw_2]
22477 psrlw m5, 2
22478 packuswb m0, m5
22479 movu [r1], m0
22480
22481 ; 16 to 31
22482 pmovzxbw m0, [r0 + 24]
22483 pmovzxbw m5, [r0 + 32]
22484
22485 palignr m3, m2, m1, 14
22486 palignr m4, m0, m2, 2
22487
22488 psllw m1, m2, 1
22489 paddw m3, m4
22490 paddw m1, m3
22491 paddw m1, [pw_2]
22492 psrlw m1, 2
22493
22494 palignr m3, m0, m2, 14
22495 palignr m4, m5, m0, 2
22496
22497 psllw m2, m0, 1 ; m0 is kept intact for the next group
22498 paddw m4, m3
22499 paddw m2, m4
22500 paddw m2, [pw_2]
22501 psrlw m2, 2
22502 packuswb m1, m2
22503 movu [r1 + 16], m1
22504
22505 ; 32 to 47
22506 pmovzxbw m1, [r0 + 40]
22507 pmovzxbw m2, [r0 + 48]
22508
22509 palignr m3, m5, m0, 14
22510 palignr m4, m1, m5, 2
22511
22512 psllw m0, m5, 1
22513 paddw m3, m4
22514 paddw m0, m3
22515 paddw m0, [pw_2]
22516 psrlw m0, 2
22517
22518 palignr m3, m1, m5, 14
22519 palignr m4, m2, m1, 2
22520
22521 psllw m5, m1, 1 ; m1 is kept intact for the next group
22522 paddw m4, m3
22523 paddw m5, m4
22524 paddw m5, [pw_2]
22525 psrlw m5, 2
22526 packuswb m0, m5
22527 movu [r1 + 32], m0
22528
22529 ; 48 to 63
22530 pmovzxbw m0, [r0 + 56]
22531 pmovzxbw m5, [r0 + 64]
22532
22533 palignr m3, m2, m1, 14
22534 palignr m4, m0, m2, 2
22535
22536 psllw m1, m2, 1
22537 paddw m3, m4
22538 paddw m1, m3
22539 paddw m1, [pw_2]
22540 psrlw m1, 2
22541
22542 palignr m3, m0, m2, 14
22543 palignr m4, m5, m0, 2
22544
22545 psllw m0, 1
22546 paddw m4, m3
22547 paddw m0, m4
22548 paddw m0, [pw_2]
22549 psrlw m0, 2
22550 packuswb m1, m0
22551 movu [r1 + 48], m1
22552
22553 ; filtering left
22554 ; 64 to 79
22555 pmovzxbw m1, [r0 + 72]
22556 pmovzxbw m2, [r0 + 80]
22557
22558 palignr m4, m5, m5, 14 ; rotate samples 64 .. 71 up by one word
22559 pinsrb m4, [r0], 2 ; samples[i - 1] for 64 .. 71, sample 0 precedes sample 65
22560 palignr m0, m1, m5, 2
22561
22562 psllw m3, m5, 1
22563 paddw m4, m0
22564 paddw m3, m4
22565 paddw m3, [pw_2]
22566 psrlw m3, 2
22567
22568 palignr m0, m1, m5, 14
22569 palignr m4, m2, m1, 2
22570
22571 psllw m5, m1, 1
22572 paddw m4, m0
22573 paddw m5, m4
22574 paddw m5, [pw_2]
22575 psrlw m5, 2
22576 packuswb m3, m5
22577 movu [r1 + 64], m3 ; byte 64 is replaced by topLast below
22578
22579 ; 80 to 95
22580 pmovzxbw m5, [r0 + 88]
22581 pmovzxbw m0, [r0 + 96]
22582
22583 palignr m3, m2, m1, 14
22584 palignr m4, m5, m2, 2
22585
22586 psllw m1, m2, 1
22587 paddw m3, m4
22588 paddw m1, m3
22589 paddw m1, [pw_2]
22590 psrlw m1, 2
22591
22592 palignr m3, m5, m2, 14
22593 palignr m4, m0, m5, 2
22594
22595 psllw m2, m5, 1 ; m5 is kept intact for the next group
22596 paddw m4, m3
22597 paddw m2, m4
22598 paddw m2, [pw_2]
22599 psrlw m2, 2
22600 packuswb m1, m2
22601 movu [r1 + 80], m1
22602
22603 ; 96 to 111
22604 pmovzxbw m1, [r0 + 104]
22605 pmovzxbw m2, [r0 + 112]
22606
22607 palignr m3, m0, m5, 14
22608 palignr m4, m1, m0, 2
22609
22610 psllw m5, m0, 1
22611 paddw m3, m4
22612 paddw m5, m3
22613 paddw m5, [pw_2]
22614 psrlw m5, 2
22615
22616 palignr m3, m1, m0, 14
22617 palignr m4, m2, m1, 2
22618
22619 psllw m0, m1, 1 ; m1 is kept intact for the next group
22620 paddw m4, m3
22621 paddw m0, m4
22622 paddw m0, [pw_2]
22623 psrlw m0, 2
22624 packuswb m5, m0
22625 movu [r1 + 96], m5
22626
22627 ; 112 to 127
22628 pmovzxbw m5, [r0 + 120]
22629 pmovzxbw m0, [r0 + 128] ; only word 0 (sample 128) is consumed below
22630
22631 palignr m3, m2, m1, 14
22632 palignr m4, m5, m2, 2
22633
22634 psllw m1, m2, 1
22635 paddw m3, m4
22636 paddw m1, m3
22637 paddw m1, [pw_2]
22638 psrlw m1, 2
22639
22640 palignr m3, m5, m2, 14
22641 palignr m4, m0, m5, 2
22642
22643 psllw m5, 1
22644 paddw m4, m3
22645 paddw m5, m4
22646 paddw m5, [pw_2]
22647 psrlw m5, 2
22648 packuswb m1, m5
22649 movu [r1 + 112], m1
22650
22651 mov [r1 + 64], r2b ; topLast
22652 mov [r1 + 128], r3b ; LeftLast
22653 RET
22654
; intra_filter_4x4 (AVX2)
; r0 = references (input bytes), r1 = filtered (output bytes).
; Same result layout as the SSE4 intra_filter_4x4, but samples 0 .. 7 and
; 8 .. 15 are processed together in the low and high 128-bit lanes of one ymm
; register. Computes (prev + 2 * cur + next + pw_2) >> 2 per 16-bit lane, then
; copies index 8 (topLast) and index 16 (LeftLast) through unfiltered.
; NOTE(review): pmovzxbw m1, [r0 + 8] reads the 16 bytes at indices 8 .. 23;
; the input buffer must be readable that far -- confirm with the caller.
22655 INIT_YMM avx2
22656 cglobal intra_filter_4x4, 2,4,4
22657 mov r2b, byte [r0 + 8] ; topLast
22658 mov r3b, byte [r0 + 16] ; LeftLast
22659
22660 ; filtering top
22661 pmovzxbw m0, [r0] ; samples 0 .. 15 as words
22662 vpbroadcastw m2, xm0 ; sample 0 in every word
22663 pmovzxbw m1, [r0 + 8] ; samples 8 .. 23 as words
22664
22665 palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
22666 pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
22667 palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10]
22668 palignr m1, m1, 14 ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1]
22669
22670 psllw m0, 1 ; 2 * samples[i]
22671 paddw m3, m1
22672 paddw m0, m3
22673 paddw m0, [pw_2]
22674 psrlw m0, 2
22675
22676 packuswb m0, m0 ; per-lane pack: each lane holds its 8 result bytes twice
22677 vpermq m0, m0, 10001000b ; gather qwords 0 and 2 into the low 128 bits
22678
22679 movu [r1], xm0 ; filtered 0 .. 15
22680 mov [r1 + 8], r2b ; topLast
22681 mov [r1 + 16], r3b ; LeftLast
22682 RET