Mercurial > hg > forks > libbpg
comparison x265/source/common/x86/intrapred8.asm @ 0:772086c29cc7
Initial import.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 16 Nov 2016 11:16:33 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:772086c29cc7 |
---|---|
1 ;***************************************************************************** | |
2 ;* Copyright (C) 2013 x265 project | |
3 ;* | |
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> | |
5 ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com> | |
6 ;* | |
7 ;* This program is free software; you can redistribute it and/or modify | |
8 ;* it under the terms of the GNU General Public License as published by | |
9 ;* the Free Software Foundation; either version 2 of the License, or | |
10 ;* (at your option) any later version. | |
11 ;* | |
12 ;* This program is distributed in the hope that it will be useful, | |
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 ;* GNU General Public License for more details. | |
16 ;* | |
17 ;* You should have received a copy of the GNU General Public License | |
18 ;* along with this program; if not, write to the Free Software | |
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
20 ;* | |
21 ;* This program is also available under a commercial proprietary license. | |
22 ;* For more information, contact us at license @ x265.com. | |
23 ;*****************************************************************************/ | |
24 | |
25 %include "x86inc.asm" | |
26 %include "x86util.asm" | |
27 | |
28 SECTION_RODATA 32 | |
29 | |
; intra_pred_shuff_0_8: 48 bytes of adjacent-index byte pairs (i, i+1).
; The first 16-byte row (pairs starting at 0) appears twice, followed by one
; row whose pairs start at 1.
const intra_pred_shuff_0_8, db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
                            db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
                            db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
32 | |
; intra_pred_shuff_15_0: byte indices 15 down to 0, emitted twice (32 bytes).
intra_pred_shuff_15_0:
    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
34 | |
; intra_filter4_shuf0/1/2: three 32-byte index tables, each a 16-byte pattern
; emitted twice. Only the first two bytes (and, for shuf2, the tail) differ.
intra_filter4_shuf0:
    db  2,  3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
    db  2,  3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf1:
    db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
    db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf2:
    db  4,  5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    db  4,  5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
38 | |
; Miscellaneous 16-byte byte tables. tab_S1/tab_S2/tab_Si are declared through
; the `const` macro; the rest are plain file-local labels.
pb_0_8:         times 8 db 0, 8
pb_unpackbw1:   times 2 db 1, 8, 2, 8, 3, 8, 4, 8
pb_swap8:       times 2 db 7, 6, 5, 4, 3, 2, 1, 0
c_trans_4x4:    db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
const tab_S1,   db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
const tab_S2,   db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
const tab_Si,   db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
pb_fact0:       db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
; c_mode32_*: 16-byte index tables, one per label.
; NOTE(review): the label names suggest 32x32 angular modes 12..18 - confirm at
; the use sites, which are outside this view.
c_mode32_12_0:
    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
c_mode32_13_0:
    db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_13_shuf:
    db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
c_mode32_14_shuf:
    db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
c_mode32_14_0:
    db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_15_0:
    db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_16_0:
    db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
c_mode32_17_0:
    db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
c_mode32_18_0:
    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; 16-byte index tables.
;   c_shuf8_0      adjacent pairs (i, i+1) for i = 0..7
;   c_deinterval8  alternates index i with index i + 8
;   pb_unpackbq    eight 0 bytes followed by eight 1 bytes
c_shuf8_0:
    db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
c_deinterval8:
    db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_unpackbq:
    times 8 db 0
    times 8 db 1

; c_mode16_*: one 16-byte index table per label.
c_mode16_12:
    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
c_mode16_13:
    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
c_mode16_14:
    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
c_mode16_15:
    db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
c_mode16_16:
    db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
c_mode16_17:
    db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
c_mode16_18:
    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
66 | |
ALIGN 32
; 32-byte tables in two 16-byte halves.
;   c_ang8_src*  adjacent index pairs (i, i+1)
;   c_ang8_A_B   one byte pair per half, repeated 8 times; each pair sums to 32
c_ang8_src1_9_2_10:
    db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
    db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_26_20:
    times 8 db  6, 26
    times 8 db 12, 20
c_ang8_src3_11_4_12:
    db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
    db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_14_8:
    times 8 db 18, 14
    times 8 db 24,  8
c_ang8_src5_13_5_13:
    db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
    db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_2_28:
    times 8 db 30,  2
    times 8 db  4, 28
c_ang8_src6_14_7_15:
    db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
    db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
c_ang8_22_16:
    times 8 db 10, 22
    times 8 db 16, 16
76 | |
; 32-byte tables in two 16-byte halves: c_ang8_src* hold adjacent index pairs,
; the others hold one byte pair (summing to 32) repeated 8 times per half.
c_ang8_21_10:
    times 8 db 11, 21
    times 8 db 22, 10
c_ang8_src2_10_3_11:
    db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
    db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_20:
    times 8 db  1, 31
    times 8 db 12, 20
c_ang8_src4_12_4_12:
    db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
    db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_9_30:
    times 8 db 23,  9
    times 8 db  2, 30
c_ang8_src5_13_6_14:
    db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
    db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
c_ang8_19_8:
    times 8 db 13, 19
    times 8 db 24,  8
84 | |
; 32-byte tables in two 16-byte halves: each c_ang8_A_B table holds one byte
; pair (summing to 32) repeated 8 times per half; c_ang8_src4_12_5_13 holds
; adjacent index pairs (i, i+1).
; c_ang8_23_8 is written without a trailing comma after its last byte, so the
; data list has no empty final operand for the assembler to reject or warn on.
c_ang8_17_2:
    times 8 db 15, 17
    times 8 db 30,  2
c_ang8_19_4:
    times 8 db 13, 19
    times 8 db 28,  4
c_ang8_21_6:
    times 8 db 11, 21
    times 8 db 26,  6
c_ang8_23_8:
    times 8 db  9, 23
    times 8 db 24,  8
c_ang8_src4_12_5_13:
    db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
    db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
90 | |
; 32-byte tables in two 16-byte halves: c_ang8_A_B tables hold one byte pair
; (summing to 32) repeated 8 times per half; c_ang8_src* tables hold the same
; row of adjacent index pairs in both halves.
c_ang8_13_26:
    times 8 db 19, 13
    times 8 db  6, 26
c_ang8_7_20:
    times 8 db 25,  7
    times 8 db 12, 20
c_ang8_1_14:
    times 8 db 31,  1
    times 8 db 18, 14
c_ang8_27_8:
    times 8 db  5, 27
    times 8 db 24,  8
c_ang8_src2_10_2_10:
    times 2 db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_src3_11_3_11:
    times 2 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
97 | |
; 32-byte tables: one byte pair (summing to 32) repeated 8 times per 16-byte half.
c_ang8_31_8:
    times 8 db  1, 31
    times 8 db 24,  8
c_ang8_13_22:
    times 8 db 19, 13
    times 8 db 10, 22
c_ang8_27_4:
    times 8 db  5, 27
    times 8 db 28,  4
c_ang8_9_18:
    times 8 db 23,  9
    times 8 db 14, 18
102 | |
; 32-byte tables: one byte pair (summing to 32) repeated 8 times per 16-byte half.
c_ang8_5_10:
    times 8 db 27,  5
    times 8 db 22, 10
c_ang8_15_20:
    times 8 db 17, 15
    times 8 db 12, 20
c_ang8_25_30:
    times 8 db  7, 25
    times 8 db  2, 30
c_ang8_3_8:
    times 8 db 29,  3
    times 8 db 24,  8
107 | |
; c_ang8_mode_27: eight 16-byte rows; row k (1-based) is the byte pair
; (32 - 2k, 2k) repeated 8 times.
c_ang8_mode_27:
    times 8 db 30,  2
    times 8 db 28,  4
    times 8 db 26,  6
    times 8 db 24,  8
    times 8 db 22, 10
    times 8 db 20, 12
    times 8 db 18, 14
    times 8 db 16, 16
112 | |
; c_ang8_mode_25: eight 16-byte rows; row k (1-based) is the byte pair
; (2k, 32 - 2k) repeated 8 times.
c_ang8_mode_25:
    times 8 db  2, 30
    times 8 db  4, 28
    times 8 db  6, 26
    times 8 db  8, 24
    times 8 db 10, 22
    times 8 db 12, 20
    times 8 db 14, 18
    times 8 db 16, 16
117 | |
; c_ang8_mode_24: eight 16-byte rows, each one byte pair (summing to 32)
; repeated 8 times.
c_ang8_mode_24:
    times 8 db  5, 27
    times 8 db 10, 22
    times 8 db 15, 17
    times 8 db 20, 12
    times 8 db 25,  7
    times 8 db 30,  2
    times 8 db  3, 29
    times 8 db  8, 24
122 | |
ALIGN 32
; c_ang16_mode_25: sixteen 16-byte rows; row k (1-based) is the byte pair
; (2k, 32 - 2k) repeated 8 times.
c_ang16_mode_25:
    times 8 db  2, 30
    times 8 db  4, 28
    times 8 db  6, 26
    times 8 db  8, 24
    times 8 db 10, 22
    times 8 db 12, 20
    times 8 db 14, 18
    times 8 db 16, 16
    times 8 db 18, 14
    times 8 db 20, 12
    times 8 db 22, 10
    times 8 db 24,  8
    times 8 db 26,  6
    times 8 db 28,  4
    times 8 db 30,  2
    times 8 db 32,  0
132 | |
ALIGN 32
; c_ang16_mode_11: sixteen 16-byte rows, each one byte pair (summing to 32)
; repeated 8 times. Rows are stored interleaved: the pair (2k, 32 - 2k) is
; immediately followed by the pair (2k + 16, 16 - 2k).
c_ang16_mode_11:
    times 8 db  2, 30
    times 8 db 18, 14
    times 8 db  4, 28
    times 8 db 20, 12
    times 8 db  6, 26
    times 8 db 22, 10
    times 8 db  8, 24
    times 8 db 24,  8
    times 8 db 10, 22
    times 8 db 26,  6
    times 8 db 12, 20
    times 8 db 28,  4
    times 8 db 14, 18
    times 8 db 30,  2
    times 8 db 16, 16
    times 8 db 32,  0
142 | |
143 | |
ALIGN 32
; c_ang16_mode_12: sixteen 16-byte rows, each one byte pair (summing to 32)
; repeated 8 times.
c_ang16_mode_12:
    times 8 db  5, 27
    times 8 db 13, 19
    times 8 db 10, 22
    times 8 db 18, 14
    times 8 db 15, 17
    times 8 db 23,  9
    times 8 db 20, 12
    times 8 db 28,  4
    times 8 db 25,  7
    times 8 db  1, 31
    times 8 db 30,  2
    times 8 db  6, 26
    times 8 db  3, 29
    times 8 db 11, 21
    times 8 db  8, 24
    times 8 db 16, 16
153 | |
154 | |
ALIGN 32
; c_ang16_mode_13: sixteen 16-byte rows, each one byte pair (summing to 32)
; repeated 8 times.
c_ang16_mode_13:
    times 8 db  9, 23
    times 8 db 17, 15
    times 8 db 18, 14
    times 8 db 26,  6
    times 8 db 27,  5
    times 8 db  3, 29
    times 8 db  4, 28
    times 8 db 12, 20
    times 8 db 13, 19
    times 8 db 21, 11
    times 8 db 22, 10
    times 8 db 30,  2
    times 8 db 31,  1
    times 8 db  7, 25
    times 8 db  8, 24
    times 8 db 16, 16
164 | |
ALIGN 32
; c_ang16_mode_28: sixteen 16-byte rows, each one byte pair (summing to 32)
; repeated 8 times.
c_ang16_mode_28:
    times 8 db 27,  5
    times 8 db 22, 10
    times 8 db 17, 15
    times 8 db 12, 20
    times 8 db  7, 25
    times 8 db  2, 30
    times 8 db 29,  3
    times 8 db 24,  8
    times 8 db 19, 13
    times 8 db 14, 18
    times 8 db  9, 23
    times 8 db  4, 28
    times 8 db 31,  1
    times 8 db 26,  6
    times 8 db 21, 11
    times 8 db 16, 16
174 | |
ALIGN 32
; c_ang16_mode_9: sixteen 16-byte rows, each one byte pair (summing to 32)
; repeated 8 times. Rows are stored interleaved: the pair (32 - 2k, 2k) is
; immediately followed by the pair (16 - 2k, 16 + 2k); the final row is (32, 0).
c_ang16_mode_9:
    times 8 db 30,  2
    times 8 db 14, 18
    times 8 db 28,  4
    times 8 db 12, 20
    times 8 db 26,  6
    times 8 db 10, 22
    times 8 db 24,  8
    times 8 db  8, 24
    times 8 db 22, 10
    times 8 db  6, 26
    times 8 db 20, 12
    times 8 db  4, 28
    times 8 db 18, 14
    times 8 db  2, 30
    times 8 db 16, 16
    times 8 db 32,  0
184 | |
ALIGN 32
; c_ang16_mode_27: 288 bytes. Fourteen 16-byte rows stepping the pair from
; (30, 2) to (4, 28), then 32 bytes of (2, 30) and 32 bytes of (32, 0).
c_ang16_mode_27:
    times 8  db 30,  2
    times 8  db 28,  4
    times 8  db 26,  6
    times 8  db 24,  8
    times 8  db 22, 10
    times 8  db 20, 12
    times 8  db 18, 14
    times 8  db 16, 16
    times 8  db 14, 18
    times 8  db 12, 20
    times 8  db 10, 22
    times 8  db  8, 24
    times 8  db  6, 26
    times 8  db  4, 28
    times 16 db  2, 30
    times 16 db 32,  0
195 | |
ALIGN 32
; intra_pred_shuff_0_15: 32 bytes of adjacent index pairs (i, i+1) for
; i = 0..14; the final pair is (15, 15) rather than (15, 16).
intra_pred_shuff_0_15:
    db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
    db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15
198 | |
ALIGN 32
; c_ang16_mode_29: 288 bytes of byte pairs (each summing to 32). Every pair is
; repeated 8 times (16 bytes), except (5, 27) and (6, 26), which fill a whole
; 32-byte row.
c_ang16_mode_29:
    times 8  db 23,  9
    times 8  db 14, 18
    times 16 db  5, 27
    times 8  db 28,  4
    times 8  db 19, 13
    times 8  db 10, 22
    times 8  db  1, 31
    times 8  db 24,  8
    times 8  db 15, 17
    times 16 db  6, 26
    times 8  db 29,  3
    times 8  db 20, 12
    times 8  db 11, 21
    times 8  db  2, 30
    times 8  db 25,  7
    times 8  db 16, 16
209 | |
ALIGN 32
; c_ang16_mode_30: 288 bytes of byte pairs (each summing to 32). Every pair is
; repeated 8 times (16 bytes), except (5, 27) and (4, 28), which fill a whole
; 32-byte row.
c_ang16_mode_30:
    times 8  db 19, 13
    times 8  db  6, 26
    times 8  db 25,  7
    times 8  db 12, 20
    times 8  db 31,  1
    times 8  db 18, 14
    times 16 db  5, 27
    times 8  db 24,  8
    times 8  db 11, 21
    times 8  db 30,  2
    times 8  db 17, 15
    times 16 db  4, 28
    times 8  db 23,  9
    times 8  db 10, 22
    times 8  db 29,  3
    times 8  db 16, 16
220 | |
ALIGN 32
; c_ang16_mode_31: 288 bytes of byte pairs (each summing to 32). The first and
; last 32-byte rows hold a single pair; the rows in between hold two pairs of
; 16 bytes each.
c_ang16_mode_31:
    times 16 db 15, 17
    times 8  db 30,  2
    times 8  db 13, 19
    times 8  db 28,  4
    times 8  db 11, 21
    times 8  db 26,  6
    times 8  db  9, 23
    times 8  db 24,  8
    times 8  db  7, 25
    times 8  db 22, 10
    times 8  db  5, 27
    times 8  db 20, 12
    times 8  db  3, 29
    times 8  db 18, 14
    times 8  db  1, 31
    times 16 db 16, 16
231 | |
ALIGN 32
; c_ang16_mode_24: sixteen 16-byte rows, each one byte pair (summing to 32)
; repeated 8 times.
c_ang16_mode_24:
    times 8 db  5, 27
    times 8 db 10, 22
    times 8 db 15, 17
    times 8 db 20, 12
    times 8 db 25,  7
    times 8 db 30,  2
    times 8 db  3, 29
    times 8 db  8, 24
    times 8 db 13, 19
    times 8 db 18, 14
    times 8 db 23,  9
    times 8 db 28,  4
    times 8 db  1, 31
    times 8 db  6, 26
    times 8 db 11, 21
    times 8 db 16, 16
241 | |
ALIGN 32
; c_ang16_mode_23: 288 bytes of byte pairs (each summing to 32). Every pair is
; repeated 8 times (16 bytes), except (27, 5) and (26, 6), which fill a whole
; 32-byte row.
c_ang16_mode_23:
    times 8  db  9, 23
    times 8  db 18, 14
    times 16 db 27,  5
    times 8  db  4, 28
    times 8  db 13, 19
    times 8  db 22, 10
    times 8  db 31,  1
    times 8  db  8, 24
    times 8  db 17, 15
    times 16 db 26,  6
    times 8  db  3, 29
    times 8  db 12, 20
    times 8  db 21, 11
    times 8  db 30,  2
    times 8  db  7, 25
    times 8  db 16, 16
252 | |
ALIGN 32
; c_ang16_mode_22: 288 bytes of byte pairs (each summing to 32). Every pair is
; repeated 8 times (16 bytes), except (27, 5) and (28, 4), which fill a whole
; 32-byte row.
c_ang16_mode_22:
    times 8  db 13, 19
    times 8  db 26,  6
    times 8  db  7, 25
    times 8  db 20, 12
    times 8  db  1, 31
    times 8  db 14, 18
    times 16 db 27,  5
    times 8  db  8, 24
    times 8  db 21, 11
    times 8  db  2, 30
    times 8  db 15, 17
    times 16 db 28,  4
    times 8  db  9, 23
    times 8  db 22, 10
    times 8  db  3, 29
    times 8  db 16, 16
263 | |
ALIGN 32
; 32-byte index tables built from four 8-byte rows of adjacent pairs
; (s, s+1, s+1, s+2, s+2, s+3, s+3, s+4). Tables differ only in the start
; index s of each row; repeated rows are folded with `times`.
intra_pred_shuff_0_4:                       ; rows start at 0, 0, 0, 0
    times 4 db 0, 1, 1, 2, 2, 3, 3, 4
intra_pred4_shuff1:                         ; rows start at 0, 0, 0, 1
    times 3 db 0, 1, 1, 2, 2, 3, 3, 4
            db 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff2:                         ; rows start at 0, 0, 1, 1
    times 2 db 0, 1, 1, 2, 2, 3, 3, 4
    times 2 db 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff31:                        ; rows start at 0, 1, 1, 2
            db 0, 1, 1, 2, 2, 3, 3, 4
    times 2 db 1, 2, 2, 3, 3, 4, 4, 5
            db 2, 3, 3, 4, 4, 5, 5, 6
intra_pred4_shuff33:                        ; rows start at 0, 1, 2, 3
            db 0, 1, 1, 2, 2, 3, 3, 4
            db 1, 2, 2, 3, 3, 4, 4, 5
            db 2, 3, 3, 4, 4, 5, 5, 6
            db 3, 4, 4, 5, 5, 6, 6, 7
; 32-byte index tables built from four 8-byte rows of adjacent pairs
; (s, s+1, s+1, s+2, s+2, s+3, s+3, s+4) with start indices in 8..11.
; intra_pred4_shuff4 and intra_pred4_shuff5 hold identical bytes.
intra_pred4_shuff3:                         ; rows start at 8, 9, 10, 11
            db  8,  9,  9, 10, 10, 11, 11, 12
            db  9, 10, 10, 11, 11, 12, 12, 13
            db 10, 11, 11, 12, 12, 13, 13, 14
            db 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff4:                         ; rows start at 9, 10, 10, 11
            db  9, 10, 10, 11, 11, 12, 12, 13
    times 2 db 10, 11, 11, 12, 12, 13, 13, 14
            db 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff5:                         ; rows start at 9, 10, 10, 11
            db  9, 10, 10, 11, 11, 12, 12, 13
    times 2 db 10, 11, 11, 12, 12, 13, 13, 14
            db 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff6:                         ; rows start at 9, 9, 10, 10
    times 2 db  9, 10, 10, 11, 11, 12, 12, 13
    times 2 db 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff7:                         ; rows start at 9, 9, 9, 10
    times 3 db  9, 10, 10, 11, 11, 12, 12, 13
            db 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff9:                         ; rows start at 9, 9, 9, 9
    times 4 db  9, 10, 10, 11, 11, 12, 12, 13
; 32-byte index tables, four 8-byte rows each. Every table starts with the row
; (0, 9, 9, 10, 10, 11, 11, 12); later rows prepend low indices (0..4) ahead of
; the 9.. run. Repeated rows are folded with `times`.
intra_pred4_shuff12:
    times 4 db 0, 9, 9, 10, 10, 11, 11, 12
intra_pred4_shuff13:
    times 3 db 0, 9, 9, 10, 10, 11, 11, 12
            db 4, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff14:
    times 2 db 0, 9, 9, 10, 10, 11, 11, 12
    times 2 db 2, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff15:
            db 0, 9, 9, 10, 10, 11, 11, 12
    times 2 db 2, 0, 0, 9, 9, 10, 10, 11
            db 4, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff16:
            db 0, 9, 9, 10, 10, 11, 11, 12
    times 2 db 2, 0, 0, 9, 9, 10, 10, 11
            db 3, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff17:
            db 0, 9, 9, 10, 10, 11, 11, 12
            db 1, 0, 0, 9, 9, 10, 10, 11
            db 2, 1, 1, 0, 0, 9, 9, 10
            db 4, 2, 2, 1, 1, 0, 0, 9
; 32-byte index tables, four 8-byte rows each. Every table starts with the row
; (0, 1, 1, 2, 2, 3, 3, 4); later rows prepend indices from 9..12 ahead of the
; 0.. run. Repeated rows are folded with `times`.
intra_pred4_shuff19:
            db  0,  1,  1, 2, 2, 3, 3, 4
            db  9,  0,  0, 1, 1, 2, 2, 3
            db 10,  9,  9, 0, 0, 1, 1, 2
            db 12, 10, 10, 9, 9, 0, 0, 1
intra_pred4_shuff20:
            db  0,  1,  1, 2, 2, 3, 3, 4
    times 2 db 10,  0,  0, 1, 1, 2, 2, 3
            db 11, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff21:
            db  0,  1,  1, 2, 2, 3, 3, 4
    times 2 db 10,  0,  0, 1, 1, 2, 2, 3
            db 12, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff22:
    times 2 db  0,  1,  1, 2, 2, 3, 3, 4
    times 2 db 10,  0,  0, 1, 1, 2, 2, 3
intra_pred4_shuff23:
    times 3 db  0,  1,  1, 2, 2, 3, 3, 4
            db 12,  0,  0, 1, 1, 2, 2, 3
287 | |
; c_ang4_mode_*: 32 bytes per table, laid out as four 8-byte rows. Each row is
; one byte pair (summing to 32) repeated 4 times.
; Modes 5..9 hold the same bytes as modes 31..27 respectively.
c_ang4_mode_27:
    times 4 db 30,  2
    times 4 db 28,  4
    times 4 db 26,  6
    times 4 db 24,  8
c_ang4_mode_28:
    times 4 db 27,  5
    times 4 db 22, 10
    times 4 db 17, 15
    times 4 db 12, 20
c_ang4_mode_29:
    times 4 db 23,  9
    times 4 db 14, 18
    times 4 db  5, 27
    times 4 db 28,  4
c_ang4_mode_30:
    times 4 db 19, 13
    times 4 db  6, 26
    times 4 db 25,  7
    times 4 db 12, 20
c_ang4_mode_31:
    times 4 db 15, 17
    times 4 db 30,  2
    times 4 db 13, 19
    times 4 db 28,  4
c_ang4_mode_32:
    times 4 db 11, 21
    times 4 db 22, 10
    times 4 db  1, 31
    times 4 db 12, 20
c_ang4_mode_33:
    times 4 db  6, 26
    times 4 db 12, 20
    times 4 db 18, 14
    times 4 db 24,  8
c_ang4_mode_5:
    times 4 db 15, 17
    times 4 db 30,  2
    times 4 db 13, 19
    times 4 db 28,  4
c_ang4_mode_6:
    times 4 db 19, 13
    times 4 db  6, 26
    times 4 db 25,  7
    times 4 db 12, 20
c_ang4_mode_7:
    times 4 db 23,  9
    times 4 db 14, 18
    times 4 db  5, 27
    times 4 db 28,  4
c_ang4_mode_8:
    times 4 db 27,  5
    times 4 db 22, 10
    times 4 db 17, 15
    times 4 db 12, 20
c_ang4_mode_9:
    times 4 db 30,  2
    times 4 db 28,  4
    times 4 db 26,  6
    times 4 db 24,  8
c_ang4_mode_11:
    times 4 db  2, 30
    times 4 db  4, 28
    times 4 db  6, 26
    times 4 db  8, 24
c_ang4_mode_12:
    times 4 db  5, 27
    times 4 db 10, 22
    times 4 db 15, 17
    times 4 db 20, 12
c_ang4_mode_13:
    times 4 db  9, 23
    times 4 db 18, 14
    times 4 db 27,  5
    times 4 db  4, 28
c_ang4_mode_14:
    times 4 db 13, 19
    times 4 db 26,  6
    times 4 db  7, 25
    times 4 db 20, 12
; c_ang4_mode_15: four 8-byte rows, each one byte pair (summing to 32)
; repeated 4 times - exactly 32 bytes, the same size as every other
; c_ang4_mode_* table and the same bytes as c_ang4_mode_21.
; The table must not carry a 33rd byte: the tables that follow are placed
; back to back from a 32-byte-aligned start, so one extra byte here would move
; all of them off their 32-byte spacing.
c_ang4_mode_15:
    times 4 db 17, 15
    times 4 db  2, 30
    times 4 db 19, 13
    times 4 db  4, 28
; c_ang4_mode_*: 32 bytes per table, laid out as four 8-byte rows. Each row is
; one byte pair (summing to 32) repeated 4 times.
; c_ang4_mode_17 and c_ang4_mode_19 hold identical bytes.
c_ang4_mode_16:
    times 4 db 21, 11
    times 4 db 10, 22
    times 4 db 31,  1
    times 4 db 20, 12
c_ang4_mode_17:
    times 4 db 26,  6
    times 4 db 20, 12
    times 4 db 14, 18
    times 4 db  8, 24
c_ang4_mode_19:
    times 4 db 26,  6
    times 4 db 20, 12
    times 4 db 14, 18
    times 4 db  8, 24
c_ang4_mode_20:
    times 4 db 21, 11
    times 4 db 10, 22
    times 4 db 31,  1
    times 4 db 20, 12
c_ang4_mode_21:
    times 4 db 17, 15
    times 4 db  2, 30
    times 4 db 19, 13
    times 4 db  4, 28
c_ang4_mode_22:
    times 4 db 13, 19
    times 4 db 26,  6
    times 4 db  7, 25
    times 4 db 20, 12
c_ang4_mode_23:
    times 4 db  9, 23
    times 4 db 18, 14
    times 4 db 27,  5
    times 4 db  4, 28
c_ang4_mode_24:
    times 4 db  5, 27
    times 4 db 10, 22
    times 4 db 15, 17
    times 4 db 20, 12
c_ang4_mode_25:
    times 4 db  2, 30
    times 4 db  4, 28
    times 4 db  6, 26
    times 4 db  8, 24
314 | |
ALIGN 32
;; (blkSize - 1 - x) for blkSize = 4 and x = 0..3, as words, stored twice
pw_planar4_0:         times 2 dw 3, 2, 1, 0
ALIGN 32
; 8x8 angular weight table, mode 13: eight 16-byte groups, each group being
; one byte pair (summing to 32) repeated eight times.
c_ang8_mode_13:       times 8 db  9, 23
                      times 8 db 18, 14
                      times 8 db 27,  5
                      times 8 db  4, 28
                      times 8 db 13, 19
                      times 8 db 22, 10
                      times 8 db 31,  1
                      times 8 db  8, 24
323 | |
ALIGN 32
; 8x8 angular weight table, mode 14: eight 16-byte groups, each group being
; one byte pair (summing to 32) repeated eight times.
c_ang8_mode_14:       times 8 db 13, 19
                      times 8 db 26,  6
                      times 8 db  7, 25
                      times 8 db 20, 12
                      times 8 db  1, 31
                      times 8 db 14, 18
                      times 8 db 27,  5
                      times 8 db  8, 24
329 | |
ALIGN 32
; 8x8 angular weight table, mode 15: eight 16-byte groups, each group being
; one byte pair (summing to 32) repeated eight times.
c_ang8_mode_15:       times 8 db 17, 15
                      times 8 db  2, 30
                      times 8 db 19, 13
                      times 8 db  4, 28
                      times 8 db 21, 11
                      times 8 db  6, 26
                      times 8 db 23,  9
                      times 8 db  8, 24
335 | |
; 16 byte indices (values 0..15).  Presumably a byte-shuffle control for the
; 8x8 mode 16 path; the code that reads it is outside this section.
336 const c_ang8_mode_16, db 8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 10, 12, 13, 15, 0, 0 | |
337 | |
; Five 8-byte index rows; every row equals the previous row plus one.
338 const intra_pred8_shuff16, db 0, 1, 1, 2, 3, 3, 4, 5 | |
339 db 1, 2, 2, 3, 4, 4, 5, 6 | |
340 db 2, 3, 3, 4, 5, 5, 6, 7 | |
341 db 3, 4, 4, 5, 6, 6, 7, 8 | |
342 db 4, 5, 5, 6, 7, 7, 8, 9 | |
343 | |
; Eight weight pairs (32 - f, f) with f = 11, 22, 1, 12, 23, 2, 13, 24,
; i.e. f advances by 11 modulo 32.
344 const angHor8_tab_16, db (32-11), 11, (32-22), 22, (32-1 ), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24 | |
345 | |
; 16 byte indices (values 0..15); consumer is outside this section.
346 const c_ang8_mode_20, db 15, 13, 12, 10, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0 | |
347 | |
348 ; NOTE: this big table improves speed by ~10%. If a broadcast instruction that also works on the high 128 bits becomes available in the future, the table can be removed. | |
; Eight 16-byte groups, each one weight pair (32 - f, f) repeated eight times,
; for f = 24, 13, 2, 23, 12, 1, 22, 11 (the angHor8_tab_16 order reversed).
349 const angHor8_tab_20, times 8 db (32-24), 24 | |
350 times 8 db (32-13), 13 | |
351 times 8 db (32- 2), 2 | |
352 times 8 db (32-23), 23 | |
353 times 8 db (32-12), 12 | |
354 times 8 db (32- 1), 1 | |
355 times 8 db (32-22), 22 | |
356 times 8 db (32-11), 11 | |
357 | |
; 32 bytes: the index pair (0, 1) eight times, then (1, 2) eight times.
358 const ang16_shuf_mode9, times 8 db 0, 1 | |
359 times 8 db 1, 2 | |
360 | |
; 16 weight pairs (32 - f, f) for f = 2, 4, ..., 32.  The last pair is (0, 32).
361 const angHor_tab_9, db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16 | |
362 db (32-18), 18, (32-20), 20, (32-22), 22, (32-24), 24, (32-26), 26, (32-28), 28, (32-30), 30, (32-32), 32 | |
363 | |
; 16 weight pairs (32 - f, f) for f = 30, 28, ..., 0.  The last pair is (32, 0).
364 const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16 | |
365 db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0 | |
366 | |
; 64 bytes of adjacent index pairs (k, k + 1).  Presumably byte-shuffle
; controls for the 16x16 mode 12 path; the consumer is outside this section.
367 const ang16_shuf_mode12, db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3 | |
368 db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2 | |
369 | |
; 16 weight pairs (32 - f, f); f starts at 27 and decreases by 5 modulo 32.
370 const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24 | |
371 db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16 | |
372 | |
; First two lines: 64 bytes of adjacent index pairs (k, k + 1).
; Third line: a further 32 bytes whose two 16-byte halves are identical.
; Presumably byte-shuffle controls; the consumer is outside this section.
373 const ang16_shuf_mode13, db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4 | |
374 db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2 | |
375 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0 | |
376 | |
; 16 weight pairs (32 - f, f); f starts at 23 and decreases by 9 modulo 32.
377 const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24 | |
378 db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16 | |
379 | |
; First two lines: 64 bytes of adjacent index pairs (k, k + 1).
; Third line: a further 32 bytes whose two 16-byte halves are identical.
; Presumably byte-shuffle controls; the consumer is outside this section.
380 const ang16_shuf_mode14, db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5 | |
381 db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2 | |
382 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0 | |
383 | |
; 16 weight pairs (32 - f, f); f starts at 19 and decreases by 13 modulo 32.
384 const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24 | |
385 db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16 | |
386 | |
; First two lines: 64 bytes of adjacent index pairs (k, k + 1).
; Third line: a further 32 bytes whose two 16-byte halves are identical.
; Presumably byte-shuffle controls; the consumer is outside this section.
387 const ang16_shuf_mode15, db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6 | |
388 db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2 | |
389 db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0 | |
390 | |
; 16 weight pairs (32 - f, f); f starts at 15 and decreases by 17 modulo 32.
391 const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24 | |
392 db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16 | |
393 | |
; First two lines: 64 bytes of adjacent index pairs (k, k + 1).
; Third line: a further 32 bytes whose two 16-byte halves are identical.
; Presumably byte-shuffle controls; the consumer is outside this section.
394 const ang16_shuf_mode16, db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7 | |
395 db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2 | |
396 db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0 | |
397 | |
; 16 weight pairs (32 - f, f); f starts at 11 and increases by 11 modulo 32.
398 const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24 | |
399 db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16 | |
400 | |
; First two lines: 64 bytes of adjacent index pairs (k, k + 1).
; Third line: a further 32 bytes whose two 16-byte halves are identical.
; Presumably byte-shuffle controls; the consumer is outside this section.
401 const ang16_shuf_mode17, db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8 | |
402 db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2 | |
403 db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0 | |
404 | |
; 16 weight pairs (32 - f, f); f starts at 6 and increases by 6 modulo 32.
405 const angHor_tab_17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16 | |
406 db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0 | |
407 | |
408 ; Constants for 32x32 angular intra prediction (intrapred_angle32x32), modes 1 to 33 | |
; 32 bytes: the index pair (0, 1) eight times, then (1, 2) eight times.
409 const ang32_shuf_mode9, times 8 db 0, 1 | |
410 times 8 db 1, 2 | |
411 | |
; Same two 16-byte groups as ang32_shuf_mode9, in the opposite order.
412 const ang32_shuf_mode11, times 8 db 1, 2 | |
413 times 8 db 0, 1 | |
414 | |
; 32 weight pairs (32 - f, f); f starts at 27 and decreases by 5 modulo 32.
; The running sequence is stored in line order 1, 3, 2, 4 (line 3 continues
; where line 1 stops, line 2 where line 3 stops).
415 const ang32_fact_mode12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7), 7, (32- 2), 2, (32-29), 29, (32-24), 24 | |
416 db (32-11), 11, (32- 6), 6, (32- 1), 1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8), 8 | |
417 db (32-19), 19, (32-14), 14, (32- 9), 9, (32- 4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16 | |
418 db (32- 3), 3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15, (32-10), 10, (32- 5), 5, (32- 0), 0 | |
; Index tables for modes 12 and 24: byte indices, plus one line of eight
; dwords.  The consuming code is outside this section.
419 const ang32_shuf_mode12, db 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 | |
420 db 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 | |
421 const ang32_shuf_mode24, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 3, 3 | |
422 dd 0, 0, 7, 3, 0, 0, 7, 3 | |
423 | |
; 32 weight pairs (32 - f, f); f starts at 23 and decreases by 9 modulo 32.
; The running sequence is stored in line order 1, 3, 2, 4.
424 const ang32_fact_mode13, db (32-23), 23, (32-14), 14, (32- 5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1), 1, (32-24), 24 | |
425 db (32- 7), 7, (32-30), 30, (32-21), 21, (32-12), 12, (32- 3), 3, (32-26), 26, (32-17), 17, (32- 8), 8 | |
426 db (32-15), 15, (32- 6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32- 2), 2, (32-25), 25, (32-16), 16 | |
427 db (32-31), 31, (32-22), 22, (32-13), 13, (32- 4), 4, (32-27), 27, (32-18), 18, (32- 9), 9, (32- 0), 0 | |
; Byte-index tables for modes 13 and 23; the consuming code is outside this section.
428 const ang32_shuf_mode13, db 14, 15, 14, 15, 14, 15, 13, 14, 13, 14, 13, 14, 13, 14, 12, 13, 10, 11, 9, 10, 9, 10, 9, 10, 9, 10, 8, 9, 8, 9, 8, 9 | |
429 db 12, 13, 12, 13, 11, 12, 11, 12, 11, 12, 11, 12, 10, 11, 10, 11, 7, 8, 7, 8, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 6, 7 | |
430 db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 9, 5, 2 | |
431 const ang32_shuf_mode23, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 11, 11, 7, 7, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 5, 5, 2, 2 | |
432 | |
; 32 weight pairs (32 - f, f); f starts at 19 and decreases by 13 modulo 32.
; The running sequence is stored in line order 1, 3, 2, 4.
433 const ang32_fact_mode14, db (32-19), 19, (32- 6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5), 5, (32-24), 24 | |
434 db (32- 3), 3, (32-22), 22, (32- 9), 9, (32-28), 28, (32-15), 15, (32- 2), 2, (32-21), 21, (32- 8), 8 | |
435 db (32-11), 11, (32-30), 30, (32-17), 17, (32- 4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16 | |
436 db (32-27), 27, (32-14), 14, (32- 1), 1, (32-20), 20, (32- 7), 7, (32-26), 26, (32-13), 13, (32- 0), 0 | |
; Byte-index tables for modes 14 and 22; the consuming code is outside this section.
437 const ang32_shuf_mode14, db 14, 15, 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 12, 13, 11, 12, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 5, 6, 5, 6 | |
438 db 11, 12, 10, 11, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3 | |
439 db 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 9, 6, 4, 1 | |
440 const ang32_shuf_mode22, db 0, 0, 15, 15, 13, 13, 10, 10, 8, 8, 5, 5, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 7, 7, 4, 4, 2 | |
441 | |
; 32 weight pairs (32 - f, f); f starts at 15 and decreases by 17 modulo 32.
; The running sequence is stored in line order 1, 3, 2, 4.
442 const ang32_fact_mode15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9), 9, (32-24), 24 | |
443 db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8), 8 | |
444 db (32- 7), 7, (32-22), 22, (32- 5), 5, (32-20), 20, (32- 3), 3, (32-18), 18, (32- 1), 1, (32-16), 16 | |
445 db (32-23), 23, (32- 6), 6, (32-21), 21, (32- 4), 4, (32-19), 19, (32- 2), 2, (32-17), 17, (32- 0), 0 | |
; Byte-index tables for modes 15 and 21; the consuming code is outside this section.
446 const ang32_shuf_mode15, db 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 11, 12, 11, 12, 10, 11, 5, 6, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3 | |
447 db 12, 13, 11, 12, 11, 12, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1 | |
448 db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 7, 5, 3, 1 | |
449 const ang32_shuf_mode21, db 15, 15, 13, 13, 11, 11, 9, 9, 8, 8, 6, 6, 4, 4, 2, 2, 14, 14, 12, 12, 10, 10, 8, 8, 7, 7, 5, 5, 3, 3, 1, 1 | |
450 | |
; 32 weight pairs (32 - f, f); f starts at 11 and increases by 11 modulo 32.
; Unlike the mode 12..15 tables, the four lines here follow the running
; sequence in plain order 1, 2, 3, 4.
451 const ang32_fact_mode16, db (32-11), 11, (32-22), 22, (32- 1), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24 | |
452 db (32- 3), 3, (32-14), 14, (32-25), 25, (32- 4), 4, (32-15), 15, (32-26), 26, (32- 5), 5, (32-16), 16 | |
453 db (32-27), 27, (32- 6), 6, (32-17), 17, (32-28), 28, (32- 7), 7, (32-18), 18, (32-29), 29, (32- 8), 8 | |
454 db (32-19), 19, (32-30), 30, (32- 9), 9, (32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0), 0 | |
; Index tables for modes 16 and 20: byte indices, plus one line of eight
; dwords.  The consuming code is outside this section.
455 const ang32_shuf_mode16, db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 5, 6, 4, 5 | |
456 db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6 | |
457 db 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 13, 11, 10, 8, 7, 5, 4, 2, 1 | |
458 dd 7, 1, 2, 3, 7, 1, 2, 3 | |
459 const ang32_shuf_mode20, db 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 14, 15, 8, 7, 5, 4, 2, 1, 0, 0, 14, 13, 13, 11, 11, 10, 10, 8 | |
460 db 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 1, 1, 0, 0 | |
461 | |
; 16 weight pairs (32 - f, f); f starts at 6 and increases by 6 modulo 32.
; This table holds 16 pairs, not 32 like ang32_fact_mode12..16.
462 const ang32_fact_mode17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16 | |
463 db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0 | |
; Index tables for modes 17 and 19: byte indices, plus two lines of eight
; dwords each.  The consuming code is outside this section.
464 const ang32_shuf_mode17, db 14, 15, 13, 14, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 4, 5, 3, 4, 2, 3, 2, 3 | |
465 db 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0 | |
466 const ang32_shuf_mode19, db 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15 | |
467 dd 0, 0, 2, 3, 0, 0, 7, 1 | |
468 dd 0, 0, 5, 6, 0, 0, 0, 0 | |
469 | |
; 32 entries of 16 bytes: entry x is the byte pair (32 - x, x) repeated eight
; times, for x = 0..31.  Leaves the preprocessor variable x at 32.
470 const ang_table | |
471 %assign x 0 | |
472 %rep 32 | |
473 times 8 db (32-x), x | |
474 %assign x x+1 | |
475 %endrep | |
476 | |
; 32 entries of 32 bytes: entry x is the byte pair (32 - x, x) repeated sixteen
; times, for x = 0..31.  Leaves the preprocessor variable x at 32.
477 const ang_table_avx2 | |
478 %assign x 0 | |
479 %rep 32 | |
480 times 16 db (32-x), x | |
481 %assign x x+1 | |
482 %endrep | |
483 | |
; Word-sized variant: 32 entries of 16 bytes, entry x being the word pair
; (32 - x, x) repeated four times, for x = 0..31.  Leaves x at 32.
484 const pw_ang_table | |
485 %assign x 0 | |
486 %rep 32 | |
487 times 4 dw (32-x), x | |
488 %assign x x+1 | |
489 %endrep | |
490 | |
491 SECTION .text | |
492 cextern pb_1 | |
493 cextern pw_2 | |
494 cextern pw_3 | |
495 cextern pw_4 | |
496 cextern pw_7 | |
497 cextern pw_8 | |
498 cextern pw_16 | |
499 cextern pw_15 | |
500 cextern pw_31 | |
501 cextern pw_32 | |
502 cextern pw_257 | |
503 cextern pw_512 | |
504 cextern pw_1024 | |
505 cextern pw_4096 | |
506 cextern pw_00ff | |
507 cextern pb_unpackbd1 | |
508 cextern multiL | |
509 cextern multiH | |
510 cextern multiH2 | |
511 cextern multiH3 | |
512 cextern multi_2Row | |
513 cextern trans8_shuf | |
514 cextern pw_planar16_mul | |
515 cextern pw_planar32_mul | |
516 | |
517 ;--------------------------------------------------------------------------------------------- | |
518 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
519 ;--------------------------------------------------------------------------------------------- | |
; 4x4 DC prediction, SSE2.
; Register use in this body: r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter.
; r3 (dirMode) is never read and is reused as scratch.
; The block is filled with DC = (sum of 8 neighbour bytes + 4) >> 3.  When
; bFilter is non-zero, row 0 and column 0 are then re-blended with the neighbours.
; pw_4 and pw_257 are external; presumably packed words holding 4 and 257.
520 INIT_XMM sse2 | |
521 cglobal intra_pred_dc4, 5,5,3 | |
; step past srcPix[0]; all later offsets are relative to the incremented r2
522 inc r2 | |
523 pxor m0, m0 | |
524 movu m1, [r2] | |
; 0xF8 moves source dwords 0 and 2 (bytes [r2 + 0..3] and [r2 + 8..11]) into
; the low qword, so the low psadbw result is the sum of exactly those 8 bytes
525 pshufd m1, m1, 0xF8 | |
526 psadbw m1, m0 ; m1 = sum | |
527 | |
; ZF from this test is consumed by "jz .end" below; the SSE instructions,
; movd, lea and mov stores in between do not write EFLAGS
528 test r4d, r4d | |
529 | |
530 paddw m1, [pw_4] | |
531 psraw m1, 3 | |
532 movd r4d, m1 ; r4d = dc_val | |
; dc * 257 puts the DC byte into both bytes of the word
533 pmullw m1, [pw_257] | |
534 pshuflw m1, m1, 0x00 | |
535 | |
536 ; store DC 4x4 | |
537 lea r3, [r1 * 3] | |
538 movd [r0], m1 | |
539 movd [r0 + r1], m1 | |
540 movd [r0 + r1 * 2], m1 | |
541 movd [r0 + r3], m1 | |
542 | |
543 ; do DC filter | |
544 jz .end | |
545 lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2 | |
546 add r4d, r3d ; r4d = DC * 3 + 2 | |
547 movd m1, r4d | |
548 pshuflw m1, m1, 0 ; m1 = pixDCx3 | |
549 | |
550 ; filter top | |
; row 0: (byte at [r2 + x] + DC * 3 + 2) >> 2 for x = 0..3
551 movd m2, [r2] | |
552 punpcklbw m2, m0 | |
553 paddw m2, m1 | |
554 psraw m2, 2 | |
555 packuswb m2, m2 | |
556 movd [r0], m2 ; overwrite top-left pixel, we will update it later | |
557 | |
558 ; filter top-left | |
; dst[0] = (byte [r2 + 8] + byte [r2] + DC * 2 + 2) >> 2
559 movzx r4d, byte [r2 + 8] | |
560 add r3d, r4d | |
561 movzx r4d, byte [r2] | |
562 add r3d, r4d | |
563 shr r3d, 2 | |
564 mov [r0], r3b | |
565 | |
566 ; filter left | |
; column 0 of rows 1..3: (byte at [r2 + 9 + k] + DC * 3 + 2) >> 2, k = 0..2;
; the packed result is peeled off one byte per row
567 add r0, r1 | |
568 movq m2, [r2 + 9] | |
569 punpcklbw m2, m0 | |
570 paddw m2, m1 | |
571 psraw m2, 2 | |
572 packuswb m2, m2 | |
573 %if ARCH_X86_64 | |
574 movq r4, m2 | |
575 mov [r0], r4b | |
576 shr r4, 8 | |
577 mov [r0 + r1], r4b | |
578 shr r4, 8 | |
579 mov [r0 + r1 * 2], r4b | |
580 %else | |
; the 32-bit build uses r2 as the scratch register instead of r4;
; srcPix is not read again after this point
581 movd r2d, m2 | |
582 mov [r0], r2b | |
583 shr r2, 8 | |
584 mov [r0 + r1], r2b | |
585 shr r2, 8 | |
586 mov [r0 + r1 * 2], r2b | |
587 %endif | |
588 .end: | |
589 RET | |
590 | |
591 ;--------------------------------------------------------------------------------------------- | |
592 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
593 ;--------------------------------------------------------------------------------------------- | |
; 8x8 DC prediction, SSE2.
; Register use in this body: r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter.
; r3, r5 and r6 are scratch.
; DC = (sum of bytes [r2 + 1..8] and [r2 + 17..24] + 8) >> 4.  When bFilter is
; non-zero, row 0 and column 0 are re-blended with the neighbours.
; pw_2, pw_8 and pw_257 are external; presumably packed words of 2, 8 and 257.
594 INIT_XMM sse2 | |
595 cglobal intra_pred_dc8, 5, 7, 3 | |
596 pxor m0, m0 | |
597 movh m1, [r2 + 1] | |
598 movh m2, [r2 + 17] | |
599 punpcklqdq m1, m2 | |
600 psadbw m1, m0 | |
; fold the high-qword partial sum onto the low one
601 pshufd m2, m1, 2 | |
602 paddw m1, m2 | |
603 | |
604 paddw m1, [pw_8] | |
605 psraw m1, 4 | |
606 pmullw m1, [pw_257] | |
607 pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] | |
608 | |
; ZF from this test is consumed by "jz .end" below; only lea and SSE stores
; sit in between, and none of them write EFLAGS
609 test r4d, r4d | |
610 | |
611 ; store DC 8x8 | |
; r6 = 3 * stride, r5 = 5 * stride (then 7 * stride for the last row)
612 lea r6, [r1 + r1 * 2] | |
613 lea r5, [r6 + r1 * 2] | |
614 movh [r0], m1 | |
615 movh [r0 + r1], m1 | |
616 movh [r0 + r1 * 2], m1 | |
617 movh [r0 + r6], m1 | |
618 movh [r0 + r1 * 4], m1 | |
619 movh [r0 + r5], m1 | |
620 movh [r0 + r6 * 2], m1 | |
621 lea r5, [r5 + r1 * 2] | |
622 movh [r0 + r5], m1 | |
623 | |
624 ; Do DC Filter | |
625 jz .end | |
; psrlw 8 turns each dc * 257 word back into plain DC
626 psrlw m1, 8 | |
627 movq m2, [pw_2] | |
628 pmullw m2, m1 | |
629 paddw m2, [pw_2] | |
; movd copies the low dword, i.e. two DC * 2 + 2 words; the upper copy starts
; at bit 16 and cannot reach the low byte stored after the "shr r3d, 2" below
630 movd r4d, m2 ; r4d = DC * 2 + 2 | |
631 paddw m1, m2 ; m1 = DC * 3 + 2 | |
632 pshufd m1, m1, 0 | |
633 | |
634 ; filter top | |
635 movq m2, [r2 + 1] | |
636 punpcklbw m2, m0 | |
637 paddw m2, m1 | |
638 psraw m2, 2 ; (pixel + DC * 3 + 2) >> 2 | |
639 packuswb m2, m2 | |
640 movh [r0], m2 | |
641 | |
642 ; filter top-left | |
; dst[0] = (byte [r2 + 17] + byte [r2 + 1] + DC * 2 + 2) >> 2
643 movzx r3d, byte [r2 + 17] | |
644 add r4d, r3d | |
645 movzx r3d, byte [r2 + 1] | |
646 add r3d, r4d | |
647 shr r3d, 2 | |
648 mov [r0], r3b | |
649 | |
650 ; filter left | |
; column 0 of rows 1..7 from bytes [r2 + 18..24]; r2 is reused as the byte
; scratch register once the source has been loaded into m2
651 movq m2, [r2 + 18] | |
652 punpcklbw m2, m0 | |
653 paddw m2, m1 | |
654 psraw m2, 2 | |
655 packuswb m2, m2 | |
656 movd r2d, m2 | |
657 lea r0, [r0 + r1] | |
658 lea r5, [r6 + r1 * 2] | |
659 mov [r0], r2b | |
660 shr r2, 8 | |
661 mov [r0 + r1], r2b | |
662 shr r2, 8 | |
663 mov [r0 + r1 * 2], r2b | |
664 shr r2, 8 | |
665 mov [r0 + r6], r2b | |
; bring packed bytes 4..7 down into the low dword for the remaining rows
666 pshufd m2, m2, 0x01 | |
667 movd r2d, m2 | |
668 mov [r0 + r1 * 4], r2b | |
669 shr r2, 8 | |
670 mov [r0 + r5], r2b | |
671 shr r2, 8 | |
672 mov [r0 + r6 * 2], r2b | |
673 | |
674 .end: | |
675 RET | |
676 | |
677 ;-------------------------------------------------------------------------------------------- | |
678 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
679 ;-------------------------------------------------------------------------------------------- | |
; 16x16 DC prediction, SSE2.
; Register use in this body: r0 = dst, r1 = dstStride, r2 = srcPix, r4 = bFilter.
; The x86-64 build requests 10 GPRs and keeps row offsets in r6..r9 so r0 never
; moves; the 32-bit build requests 7, walks r0 down the block and keeps the
; original dst in r6.
; DC = (sum of bytes [r2 + 1..16] and [r2 + 33..48] + 16) >> 5.  When bFilter is
; non-zero, row 0 and column 0 are re-blended with the neighbours.
; pw_2, pw_16 and pw_257 are external; presumably packed words of 2, 16 and 257.
680 INIT_XMM sse2 | |
681 %if ARCH_X86_64 | |
682 cglobal intra_pred_dc16, 5, 10, 4 | |
683 %else | |
684 cglobal intra_pred_dc16, 5, 7, 4 | |
685 %endif | |
686 pxor m0, m0 | |
687 movu m1, [r2 + 1] | |
688 movu m2, [r2 + 33] | |
689 psadbw m1, m0 | |
690 psadbw m2, m0 | |
691 paddw m1, m2 | |
; fold the high-qword partial sum onto the low one
692 pshufd m2, m1, 2 | |
693 paddw m1, m2 | |
694 | |
695 paddw m1, [pw_16] | |
696 psraw m1, 5 | |
697 pmullw m1, [pw_257] | |
698 pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] | |
699 pshufd m1, m1, 0x00 | |
700 | |
701 | |
; ZF from this test is consumed by "jz .end" after the stores; only lea, mov
; and SSE stores sit in between, and none of them write EFLAGS
702 test r4d, r4d | |
703 | |
704 ; store DC 16x16 | |
705 %if ARCH_X86_64 | |
706 lea r6, [r1 + r1 * 2] ;index 3 | |
707 lea r7, [r1 + r1 * 4] ;index 5 | |
708 lea r8, [r6 + r1 * 4] ;index 7 | |
709 lea r9, [r0 + r8] ;base + 7 | |
; rows 0..15 in order, addressed from r0 or from r9 (= row 7)
710 movu [r0], m1 | |
711 movu [r0 + r1], m1 | |
712 movu [r0 + r1 * 2], m1 | |
713 movu [r0 + r6], m1 | |
714 movu [r0 + r1 * 4], m1 | |
715 movu [r0 + r7], m1 | |
716 movu [r0 + r6 * 2], m1 | |
717 movu [r0 + r8], m1 | |
718 movu [r0 + r1 * 8], m1 | |
719 movu [r9 + r1 * 2], m1 | |
720 movu [r0 + r7 * 2], m1 | |
721 movu [r9 + r1 * 4], m1 | |
722 movu [r0 + r6 * 4], m1 | |
723 movu [r9 + r6 * 2], m1 | |
724 movu [r0 + r8 * 2], m1 | |
725 movu [r9 + r1 * 8], m1 | |
726 %else ;32 bit | |
; r6 remembers row 0 for the filter stage; r0 advances two rows at a time
727 mov r6, r0 | |
728 movu [r0], m1 | |
729 movu [r0 + r1], m1 | |
730 lea r0, [r0 + r1 * 2] | |
731 movu [r0], m1 | |
732 movu [r0 + r1], m1 | |
733 lea r0, [r0 + r1 * 2] | |
734 movu [r0], m1 | |
735 movu [r0 + r1], m1 | |
736 lea r0, [r0 + r1 * 2] | |
737 movu [r0], m1 | |
738 movu [r0 + r1], m1 | |
739 lea r0, [r0 + r1 * 2] | |
740 movu [r0], m1 | |
741 movu [r0 + r1], m1 | |
742 lea r0, [r0 + r1 * 2] | |
743 movu [r0], m1 | |
744 movu [r0 + r1], m1 | |
745 lea r0, [r0 + r1 * 2] | |
746 movu [r0], m1 | |
747 movu [r0 + r1], m1 | |
748 lea r0, [r0 + r1 * 2] | |
749 movu [r0], m1 | |
750 movu [r0 + r1], m1 | |
751 %endif | |
752 ; Do DC Filter | |
753 jz .end | |
; psrlw 8 turns each dc * 257 word back into plain DC; m2 = DC * 2 + 2,
; then m1 = DC * 3 + 2 in every word
754 psrlw m1, 8 | |
755 mova m2, [pw_2] | |
756 pmullw m2, m1 | |
757 paddw m2, [pw_2] | |
; low dword = two DC * 2 + 2 words; the upper copy starts at bit 16 and cannot
; reach the low byte stored after the "shr r3d, 2" below
758 movd r4d, m2 | |
759 paddw m1, m2 | |
760 | |
761 ; filter top | |
; m2 / m3 = filtered row 0, pixels 0..7 / 8..15: (pixel + DC * 3 + 2) >> 2
762 movh m2, [r2 + 1] | |
763 punpcklbw m2, m0 | |
764 paddw m2, m1 | |
765 psraw m2, 2 | |
766 packuswb m2, m2 | |
767 movh m3, [r2 + 9] | |
768 punpcklbw m3, m0 | |
769 paddw m3, m1 | |
770 psraw m3, 2 | |
771 packuswb m3, m3 | |
772 | |
773 ; filter top-left | |
; r3b = (byte [r2 + 33] + byte [r2 + 1] + DC * 2 + 2) >> 2
774 movzx r5d, byte [r2 + 33] | |
775 add r4d, r5d | |
776 movzx r3d, byte [r2 + 1] | |
777 add r3d, r4d | |
778 shr r3d, 2 | |
779 | |
780 %if ARCH_X86_64 | |
781 movh [r0], m2 | |
782 movh [r0 + 8], m3 | |
783 mov [r0], r3b | |
784 %else ;32 bit | |
785 movh [r6], m2 | |
786 movh [r6 + 8], m3 | |
787 mov [r6], r3b | |
; r6 now points at row 1 for the left-column stores below
788 add r6, r1 | |
789 %endif | |
790 | |
791 ; filter left | |
; m2 / m3 = filtered column-0 bytes built from [r2 + 34..41] / [r2 + 42..49];
; fifteen of them are stored, one per row 1..15
792 movh m2, [r2 + 34] | |
793 punpcklbw m2, m0 | |
794 paddw m2, m1 | |
795 psraw m2, 2 | |
796 packuswb m2, m2 | |
797 | |
798 movh m3, [r2 + 42] | |
799 punpcklbw m3, m0 | |
800 paddw m3, m1 | |
801 psraw m3, 2 | |
802 packuswb m3, m3 | |
803 %if ARCH_X86_64 | |
; rows 1..8 from the eight bytes of m2, reusing the r6..r9 row offsets
804 movh r3, m2 | |
805 mov [r0 + r1], r3b | |
806 shr r3, 8 | |
807 mov [r0 + r1 * 2], r3b | |
808 shr r3, 8 | |
809 mov [r0 + r6], r3b | |
810 shr r3, 8 | |
811 mov [r0 + r1 * 4], r3b | |
812 shr r3, 8 | |
813 mov [r0 + r7], r3b | |
814 shr r3, 8 | |
815 mov [r0 + r6 * 2], r3b | |
816 shr r3, 8 | |
817 mov [r0 + r8], r3b | |
818 shr r3, 8 | |
819 mov [r0 + r1 * 8], r3b | |
; rows 9..15 from the first seven bytes of m3
820 movh r3, m3 | |
821 mov [r9 + r1 * 2], r3b | |
822 shr r3, 8 | |
823 mov [r0 + r7 * 2], r3b | |
824 shr r3, 8 | |
825 mov [r9 + r1 * 4], r3b | |
826 shr r3, 8 | |
827 mov [r0 + r6 * 4], r3b | |
828 shr r3, 8 | |
829 mov [r9 + r6 * 2], r3b | |
830 shr r3, 8 | |
831 mov [r0 + r8 * 2], r3b | |
832 shr r3, 8 | |
833 mov [r9 + r1 * 8], r3b | |
834 %else ;32 bit | |
; four bytes at a time through r2d (srcPix is no longer needed); pshufd 0x01
; brings the next four packed bytes into the low dword; r6 advances two rows
; after every second store
835 movd r2d, m2 | |
836 pshufd m2, m2, 0x01 | |
837 mov [r6], r2b | |
838 shr r2, 8 | |
839 mov [r6 + r1], r2b | |
840 shr r2, 8 | |
841 mov [r6 + r1 * 2], r2b | |
842 lea r6, [r6 + r1 * 2] | |
843 shr r2, 8 | |
844 mov [r6 + r1], r2b | |
845 movd r2d, m2 | |
846 mov [r6 + r1 * 2], r2b | |
847 lea r6, [r6 + r1 * 2] | |
848 shr r2, 8 | |
849 mov [r6 + r1], r2b | |
850 shr r2, 8 | |
851 mov [r6 + r1 * 2], r2b | |
852 lea r6, [r6 + r1 * 2] | |
853 shr r2, 8 | |
854 mov [r6 + r1], r2b | |
855 movd r2d, m3 | |
856 pshufd m3, m3, 0x01 | |
857 mov [r6 + r1 * 2], r2b | |
858 lea r6, [r6 + r1 * 2] | |
859 shr r2, 8 | |
860 mov [r6 + r1], r2b | |
861 shr r2, 8 | |
862 mov [r6 + r1 * 2], r2b | |
863 lea r6, [r6 + r1 * 2] | |
864 shr r2, 8 | |
865 mov [r6 + r1], r2b | |
866 movd r2d, m3 | |
867 mov [r6 + r1 * 2], r2b | |
868 lea r6, [r6 + r1 * 2] | |
869 shr r2, 8 | |
870 mov [r6 + r1], r2b | |
871 shr r2, 8 | |
872 mov [r6 + r1 * 2], r2b | |
873 %endif | |
874 .end: | |
875 RET | |
876 | |
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;
; 32x32 DC prediction, SSE2.
; Only the first three arguments are loaded: r0 = dst, r1 = dstStride,
; r2 = srcPix.  No edge filtering is performed in this routine.
; DC = (sum of bytes [r2 + 1..32] and [r2 + 65..96] + 32) >> 6, broadcast to
; every pixel of the 32x32 block.
; pw_32 and pw_257 are external; presumably packed words of 32 and 257.
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc32, 3, 3, 5
    pxor            m0, m0
    movu            m1, [r2 + 1]
    movu            m2, [r2 + 17]
    movu            m3, [r2 + 65]
    movu            m4, [r2 + 81]
    psadbw          m1, m0
    psadbw          m2, m0
    psadbw          m3, m0
    psadbw          m4, m0
    paddw           m1, m2
    paddw           m3, m4
    paddw           m1, m3
    pshufd          m2, m1, 2           ; fold the high-qword partial sum onto the low one
    paddw           m1, m2

    paddw           m1, [pw_32]
    psraw           m1, 6
    pmullw          m1, [pw_257]        ; DC byte into both bytes of the word
    pshuflw         m1, m1, 0x00        ; m1 = byte [dc_val ...]
    pshufd          m1, m1, 0x00

%assign x 0
%rep 16
    ; store two full 32-pixel rows per iteration (16 iterations = 32 rows)
    movu            [r0], m1
    movu            [r0 + r1], m1
    movu            [r0 + 16], m1
    movu            [r0 + r1 + 16], m1
%if x < 15
    ; advance to the next row pair; nothing follows the last pair, so the
    ; pointer update is omitted there
    lea             r0, [r0 + 2 * r1]
%endif
%assign x x+1
%endrep
    RET
916 | |
917 ;--------------------------------------------------------------------------------------- | |
918 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
919 ;--------------------------------------------------------------------------------------- | |
; 4x4 planar prediction, SSE2.  r0 = dst, r1 = dstStride, r2 = srcPix; the two
; trailing int arguments are not loaded.
; m1 = bytes [r2 + 1..8] widened to words (above[x]; word 4 is topRight),
; m2 = bytes [r2 + 9..16] widened to words (left[y]; word 4 is bottomLeft).
; m3 is the running per-column term and m4 the amount added to it per row;
; each row is (left[y] * pw_planar4_0 + m3) >> 3.
; multi_2Row, pw_3 and pw_4 are external; the existing comments describe
; multi_2Row as the (x + 1) factors.
920 INIT_XMM sse2 | |
921 cglobal intra_pred_planar4, 3,3,5 | |
922 pxor m0, m0 | |
923 movh m1, [r2 + 1] | |
924 punpcklbw m1, m0 | |
925 movh m2, [r2 + 9] | |
926 punpcklbw m2, m0 | |
; broadcast word 4 of m1 / m2 to all eight words
927 pshufhw m3, m1, 0 ; topRight | |
928 pshufd m3, m3, 0xAA | |
929 pshufhw m4, m2, 0 ; bottomLeft | |
930 pshufd m4, m4, 0xAA | |
931 pmullw m3, [multi_2Row] ; (x + 1) * topRight | |
932 pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x] | |
933 paddw m3, [pw_4] | |
934 paddw m3, m4 | |
935 paddw m3, m0 | |
; m4 = bottomLeft - above[x]: adding it once per row raises the bottomLeft
; weight by one and lowers the above[x] weight by one
936 psubw m4, m1 | |
937 | |
; row 0
938 pshuflw m1, m2, 0 | |
939 pmullw m1, [pw_planar4_0] | |
940 paddw m1, m3 | |
941 paddw m3, m4 | |
942 psraw m1, 3 | |
943 packuswb m1, m1 | |
944 movd [r0], m1 | |
945 | |
; row 1
946 pshuflw m1, m2, 01010101b | |
947 pmullw m1, [pw_planar4_0] | |
948 paddw m1, m3 | |
949 paddw m3, m4 | |
950 psraw m1, 3 | |
951 packuswb m1, m1 | |
952 movd [r0 + r1], m1 | |
953 lea r0, [r0 + 2 * r1] | |
954 | |
; row 2
955 pshuflw m1, m2, 10101010b | |
956 pmullw m1, [pw_planar4_0] | |
957 paddw m1, m3 | |
958 paddw m3, m4 | |
959 psraw m1, 3 | |
960 packuswb m1, m1 | |
961 movd [r0], m1 | |
962 | |
; row 3 (no further update of m3 needed)
963 pshuflw m1, m2, 11111111b | |
964 pmullw m1, [pw_planar4_0] | |
965 paddw m1, m3 | |
966 psraw m1, 3 | |
967 packuswb m1, m1 | |
968 movd [r0 + r1], m1 | |
969 RET | |
970 | |
971 ;--------------------------------------------------------------------------------------- | |
972 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
973 ;--------------------------------------------------------------------------------------- | |
; 8x8 planar prediction, SSE2.  r0 = dst, r1 = dstStride, r2 = srcPix; the two
; trailing int arguments are not loaded.
; m1 = bytes [r2 + 1..8] as words (above[x]), m2 = bytes [r2 + 17..24] as words
; (left[y]), m3 = running per-column term, m4 = amount added to m3 per row.
; Each row is (left[y] * [pw_planar16_mul + mmsize] + m3) >> 4.
; multiL, pw_7, pw_8, pw_00ff and pw_planar16_mul are external; the second half
; of pw_planar16_mul presumably holds the (blkSize - 1 - x) factors for 8 columns.
974 INIT_XMM sse2 | |
975 cglobal intra_pred_planar8, 3,3,6 | |
976 pxor m0, m0 | |
977 movh m1, [r2 + 1] | |
978 punpcklbw m1, m0 | |
979 movh m2, [r2 + 17] | |
980 punpcklbw m2, m0 | |
981 | |
982 movd m3, [r2 + 9] ; topRight = above[8]; | |
983 movd m4, [r2 + 25] ; bottomLeft = left[8]; | |
984 | |
; keep the low byte of each loaded word, then broadcast word 0 to all eight words
985 pand m3, [pw_00ff] | |
986 pand m4, [pw_00ff] | |
987 pshuflw m3, m3, 0x00 | |
988 pshuflw m4, m4, 0x00 | |
989 pshufd m3, m3, 0x44 | |
990 pshufd m4, m4, 0x44 | |
991 pmullw m3, [multiL] ; (x + 1) * topRight | |
992 pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x] | |
993 paddw m3, [pw_8] | |
994 paddw m3, m4 | |
995 paddw m3, m0 | |
; m4 = bottomLeft - above[x], the per-row increment of m3
996 psubw m4, m1 | |
997 | |
; INTRA_PRED_PLANAR_8 y: emit row y.  Broadcasts word y of m2 into m5
; (pshuflw for y < 4, pshufhw for y >= 4), stores 8 pixels at [r0], and for
; every row except the last adds m4 to m3 and steps r0 down one row.
998 %macro INTRA_PRED_PLANAR_8 1 | |
999 %if (%1 < 4) | |
1000 pshuflw m5, m2, 0x55 * %1 | |
1001 pshufd m5, m5, 0 | |
1002 %else | |
1003 pshufhw m5, m2, 0x55 * (%1 - 4) | |
1004 pshufd m5, m5, 0xAA | |
1005 %endif | |
1006 pmullw m5, [pw_planar16_mul + mmsize] | |
1007 paddw m5, m3 | |
1008 psraw m5, 4 | |
1009 packuswb m5, m5 | |
1010 movh [r0], m5 | |
1011 %if (%1 < 7) | |
1012 paddw m3, m4 | |
1013 lea r0, [r0 + r1] | |
1014 %endif | |
1015 %endmacro | |
1016 | |
1017 INTRA_PRED_PLANAR_8 0 | |
1018 INTRA_PRED_PLANAR_8 1 | |
1019 INTRA_PRED_PLANAR_8 2 | |
1020 INTRA_PRED_PLANAR_8 3 | |
1021 INTRA_PRED_PLANAR_8 4 | |
1022 INTRA_PRED_PLANAR_8 5 | |
1023 INTRA_PRED_PLANAR_8 6 | |
1024 INTRA_PRED_PLANAR_8 7 | |
1025 RET | |
1026 | |
1027 ;--------------------------------------------------------------------------------------- | |
1028 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
1029 ;--------------------------------------------------------------------------------------- | |
; 16x16 planar prediction, SSE2.  r0 = dst, r1 = dstStride, r2 = srcPix; the
; two trailing int arguments are not loaded.
; Columns are handled in two 8-word halves:
;   m3 / m4 = running per-column term for columns 0..7 / 8..15
;   m6 / m1 = amount added to m3 / m4 per row (bottomLeft - above[x])
; m2 / m7 first hold above[0..7] / above[8..15] and are then reloaded with
; left[0..7] / left[8..15] (bytes [r2 + 33..48]).
; multiL, multiH, pw_15, pw_16, pw_00ff and pw_planar16_mul are external.
1030 INIT_XMM sse2 | |
1031 cglobal intra_pred_planar16, 3,5,8 | |
1032 pxor m0, m0 | |
1033 movh m2, [r2 + 1] | |
1034 punpcklbw m2, m0 | |
1035 movh m7, [r2 + 9] | |
1036 punpcklbw m7, m0 | |
1037 | |
1038 movd m3, [r2 + 17] ; topRight = above[16] | |
1039 movd m6, [r2 + 49] ; bottomLeft = left[16] | |
; keep the low byte of each loaded word, then broadcast word 0 to all eight words
1040 pand m3, [pw_00ff] | |
1041 pand m6, [pw_00ff] | |
1042 pshuflw m3, m3, 0x00 | |
1043 pshuflw m6, m6, 0x00 | |
1044 pshufd m3, m3, 0x44 ; v_topRight | |
1045 pshufd m6, m6, 0x44 ; v_bottomLeft | |
1046 pmullw m4, m3, [multiH] ; (x + 1) * topRight | |
1047 pmullw m3, [multiL] ; (x + 1) * topRight | |
1048 pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x] | |
1049 pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x] | |
1050 paddw m4, [pw_16] | |
1051 paddw m3, [pw_16] | |
1052 paddw m4, m6 | |
1053 paddw m3, m6 | |
1054 paddw m4, m5 | |
1055 paddw m3, m1 | |
; per-row increments: m1 for columns 8..15, m6 for columns 0..7
1056 psubw m1, m6, m7 | |
1057 psubw m6, m2 | |
1058 | |
; m2 / m7 are free now; reload them with the left column as words
1059 movh m2, [r2 + 33] | |
1060 punpcklbw m2, m0 | |
1061 movh m7, [r2 + 41] | |
1062 punpcklbw m7, m0 | |
1063 | |
; INTRA_PRED_PLANAR_16 y: emit row y.  Broadcasts left[y] into m5 (word y of m2
; for y < 8, word y - 8 of m7 otherwise).  For y > 0 it first advances both
; accumulators and r0 by one row.  m0 is used as scratch for the high half;
; the 16 result pixels are stored with one movu.
1064 %macro INTRA_PRED_PLANAR_16 1 | |
1065 %if (%1 < 4) | |
1066 pshuflw m5, m2, 0x55 * %1 | |
1067 pshufd m5, m5, 0 | |
1068 %else | |
1069 %if (%1 < 8) | |
1070 pshufhw m5, m2, 0x55 * (%1 - 4) | |
1071 pshufd m5, m5, 0xAA | |
1072 %else | |
1073 %if (%1 < 12) | |
1074 pshuflw m5, m7, 0x55 * (%1 - 8) | |
1075 pshufd m5, m5, 0 | |
1076 %else | |
1077 pshufhw m5, m7, 0x55 * (%1 - 12) | |
1078 pshufd m5, m5, 0xAA | |
1079 %endif | |
1080 %endif | |
1081 %endif | |
1082 %if (%1 > 0) | |
1083 paddw m3, m6 | |
1084 paddw m4, m1 | |
1085 lea r0, [r0 + r1] | |
1086 %endif | |
1087 pmullw m0, m5, [pw_planar16_mul + mmsize] | |
1088 pmullw m5, [pw_planar16_mul] | |
1089 paddw m0, m4 | |
1090 paddw m5, m3 | |
1091 psraw m5, 5 | |
1092 psraw m0, 5 | |
1093 packuswb m5, m0 | |
1094 movu [r0], m5 | |
1095 %endmacro | |
1096 | |
1097 INTRA_PRED_PLANAR_16 0 | |
1098 INTRA_PRED_PLANAR_16 1 | |
1099 INTRA_PRED_PLANAR_16 2 | |
1100 INTRA_PRED_PLANAR_16 3 | |
1101 INTRA_PRED_PLANAR_16 4 | |
1102 INTRA_PRED_PLANAR_16 5 | |
1103 INTRA_PRED_PLANAR_16 6 | |
1104 INTRA_PRED_PLANAR_16 7 | |
1105 INTRA_PRED_PLANAR_16 8 | |
1106 INTRA_PRED_PLANAR_16 9 | |
1107 INTRA_PRED_PLANAR_16 10 | |
1108 INTRA_PRED_PLANAR_16 11 | |
1109 INTRA_PRED_PLANAR_16 12 | |
1110 INTRA_PRED_PLANAR_16 13 | |
1111 INTRA_PRED_PLANAR_16 14 | |
1112 INTRA_PRED_PLANAR_16 15 | |
1113 RET | |
1114 | |
1115 ;--------------------------------------------------------------------------------------- | |
1116 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
1117 ;--------------------------------------------------------------------------------------- | |
1118 INIT_XMM sse2 | |
1119 %if ARCH_X86_64 == 1 | |
; 32x32 planar prediction, SSE2, x86-64 variant (uses xmm0..xmm15).
; r0 = dst, r1 = dstStride, r2 = srcPix; the trailing int arguments are not loaded.
; Columns are handled as four groups of eight words:
;   m0..m3   = running per-column terms for columns 0..7, 8..15, 16..23, 24..31
;   m8..m11  = amounts added to m0..m3 per row (bottomLeft - above[x])
;   m12..m15 = per-column multipliers applied to left[y], loaded once from
;              pw_planar32_mul and pw_planar16_mul
; multiL/H/H2/H3, pw_31, pw_32, pw_00ff and the pw_planar*_mul tables are external.
1120 cglobal intra_pred_planar32, 3,3,16 | |
1121 movd m3, [r2 + 33] ; topRight = above[32] | |
1122 | |
1123 pxor m7, m7 | |
; keep the low byte of each loaded word, then broadcast word 0 to all eight words
1124 pand m3, [pw_00ff] | |
1125 pshuflw m3, m3, 0x00 | |
1126 pshufd m3, m3, 0x44 | |
1127 | |
1128 pmullw m0, m3, [multiL] ; (x + 1) * topRight | |
1129 pmullw m1, m3, [multiH] ; (x + 1) * topRight | |
1130 pmullw m2, m3, [multiH2] ; (x + 1) * topRight | |
1131 pmullw m3, [multiH3] ; (x + 1) * topRight | |
1132 | |
1133 movd m11, [r2 + 97] ; bottomLeft = left[32] | |
1134 pand m11, [pw_00ff] | |
1135 pshuflw m11, m11, 0x00 | |
1136 pshufd m11, m11, 0x44 | |
; m5 = bottomLeft + 32, added to all four accumulators
1137 mova m5, m11 | |
1138 paddw m5, [pw_32] | |
1139 | |
1140 paddw m0, m5 | |
1141 paddw m1, m5 | |
1142 paddw m2, m5 | |
1143 paddw m3, m5 | |
1144 mova m8, m11 | |
1145 mova m9, m11 | |
1146 mova m10, m11 | |
1147 mova m12, [pw_31] | |
; for each group of eight above[] pixels: delta = bottomLeft - above[x],
; accumulator += 31 * above[x]
1148 movh m4, [r2 + 1] | |
1149 punpcklbw m4, m7 | |
1150 psubw m8, m4 | |
1151 pmullw m4, m12 | |
1152 paddw m0, m4 | |
1153 | |
1154 movh m4, [r2 + 9] | |
1155 punpcklbw m4, m7 | |
1156 psubw m9, m4 | |
1157 pmullw m4, m12 | |
1158 paddw m1, m4 | |
1159 | |
1160 movh m4, [r2 + 17] | |
1161 punpcklbw m4, m7 | |
1162 psubw m10, m4 | |
1163 pmullw m4, m12 | |
1164 paddw m2, m4 | |
1165 | |
1166 movh m4, [r2 + 25] | |
1167 punpcklbw m4, m7 | |
1168 psubw m11, m4 | |
1169 pmullw m4, m12 | |
1170 paddw m3, m4 | |
; m12 is reused from here on as the first multiplier vector
1171 mova m12, [pw_planar32_mul] | |
1172 mova m13, [pw_planar32_mul + mmsize] | |
1173 mova m14, [pw_planar16_mul] | |
1174 mova m15, [pw_planar16_mul + mmsize] | |
; PROCESS reg: reg holds left[y] broadcast to eight words.  Writes one
; 32-pixel row at [r0] as (reg * multiplier + accumulator) >> 6.
; Clobbers m5, m6 and reg itself.
1175 %macro PROCESS 1 | |
1176 pmullw m5, %1, m12 | |
1177 pmullw m6, %1, m13 | |
1178 paddw m5, m0 | |
1179 paddw m6, m1 | |
1180 psraw m5, 6 | |
1181 psraw m6, 6 | |
1182 packuswb m5, m6 | |
1183 movu [r0], m5 | |
1184 | |
1185 pmullw m5, %1, m14 | |
1186 pmullw %1, m15 | |
1187 paddw m5, m2 | |
1188 paddw %1, m3 | |
1189 psraw m5, 6 | |
1190 psraw %1, 6 | |
1191 packuswb m5, %1 | |
1192 movu [r0 + 16], m5 | |
1193 %endmacro | |
1194 | |
; INCREMENT: advance all four accumulators by their per-row delta and move
; r0 down one row.
1195 %macro INCREMENT 0 | |
1196 paddw m2, m10 | |
1197 paddw m3, m11 | |
1198 paddw m0, m8 | |
1199 paddw m1, m9 | |
1200 add r0, r1 | |
1201 %endmacro | |
1202 | |
; Outer loop: four groups of eight left[] pixels (bytes [r2 + 65 + x * 8 ...]).
; Inner loop: one output row per pixel.  m7 is re-zeroed for each group
; because PROCESS overwrites it.
1203 %assign x 0 | |
1204 %rep 4 | |
1205 pxor m7, m7 | |
1206 movq m4, [r2 + 65 + x * 8] | |
1207 punpcklbw m4, m7 | |
1208 %assign y 0 | |
1209 %rep 8 | |
1210 %if y < 4 | |
1211 pshuflw m7, m4, 0x55 * y | |
1212 pshufd m7, m7, 0x44 | |
1213 %else | |
1214 pshufhw m7, m4, 0x55 * (y - 4) | |
1215 pshufd m7, m7, 0xEE | |
1216 %endif | |
1217 PROCESS m7 | |
; x + y reaches 10 only for x = 3, y = 7, so INCREMENT is skipped only after
; the very last row
1218 %if x + y < 10 | |
1219 INCREMENT | |
1220 %endif | |
1221 %assign y y+1 | |
1222 %endrep | |
1223 %assign x x+1 | |
1224 %endrep | |
1225 RET | |
1226 | |
1227 %else ;end ARCH_X86_64, start ARCH_X86_32 | |
1228 cglobal intra_pred_planar32, 3,3,8,0-(4*mmsize) | |
1229 movd m3, [r2 + 33] ; topRight = above[32] | |
1230 | |
1231 pxor m7, m7 | |
1232 pand m3, [pw_00ff] | |
1233 pshuflw m3, m3, 0x00 | |
1234 pshufd m3, m3, 0x44 | |
1235 | |
1236 pmullw m0, m3, [multiL] ; (x + 1) * topRight | |
1237 pmullw m1, m3, [multiH] ; (x + 1) * topRight | |
1238 pmullw m2, m3, [multiH2] ; (x + 1) * topRight | |
1239 pmullw m3, [multiH3] ; (x + 1) * topRight | |
1240 | |
1241 movd m6, [r2 + 97] ; bottomLeft = left[32] | |
1242 pand m6, [pw_00ff] | |
1243 pshuflw m6, m6, 0x00 | |
1244 pshufd m6, m6, 0x44 | |
1245 mova m5, m6 | |
1246 paddw m5, [pw_32] | |
1247 | |
1248 paddw m0, m5 | |
1249 paddw m1, m5 | |
1250 paddw m2, m5 | |
1251 paddw m3, m5 | |
1252 | |
1253 movh m4, [r2 + 1] | |
1254 punpcklbw m4, m7 | |
1255 psubw m5, m6, m4 | |
1256 mova [rsp + 0 * mmsize], m5 | |
1257 pmullw m4, [pw_31] | |
1258 paddw m0, m4 | |
1259 movh m4, [r2 + 9] | |
1260 punpcklbw m4, m7 | |
1261 psubw m5, m6, m4 | |
1262 mova [rsp + 1 * mmsize], m5 | |
1263 pmullw m4, [pw_31] | |
1264 paddw m1, m4 | |
1265 movh m4, [r2 + 17] | |
1266 punpcklbw m4, m7 | |
1267 psubw m5, m6, m4 | |
1268 mova [rsp + 2 * mmsize], m5 | |
1269 pmullw m4, [pw_31] | |
1270 paddw m2, m4 | |
1271 movh m4, [r2 + 25] | |
1272 punpcklbw m4, m7 | |
1273 psubw m5, m6, m4 | |
1274 mova [rsp + 3 * mmsize], m5 | |
1275 pmullw m4, [pw_31] | |
1276 paddw m3, m4 | |
1277 %macro PROCESS 1 | |
1278 pmullw m5, %1, [pw_planar32_mul] | |
1279 pmullw m6, %1, [pw_planar32_mul + mmsize] | |
1280 paddw m5, m0 | |
1281 paddw m6, m1 | |
1282 psraw m5, 6 | |
1283 psraw m6, 6 | |
1284 packuswb m5, m6 | |
1285 movu [r0], m5 | |
1286 pmullw m5, %1, [pw_planar16_mul] | |
1287 pmullw %1, [pw_planar16_mul + mmsize] | |
1288 paddw m5, m2 | |
1289 paddw %1, m3 | |
1290 psraw m5, 6 | |
1291 psraw %1, 6 | |
1292 packuswb m5, %1 | |
1293 movu [r0 + 16], m5 | |
1294 %endmacro | |
1295 | |
1296 %macro INCREMENT 0 | |
1297 paddw m0, [rsp + 0 * mmsize] | |
1298 paddw m1, [rsp + 1 * mmsize] | |
1299 paddw m2, [rsp + 2 * mmsize] | |
1300 paddw m3, [rsp + 3 * mmsize] | |
1301 add r0, r1 | |
1302 %endmacro | |
1303 | |
1304 %assign y 0 | |
1305 %rep 4 | |
1306 pxor m7, m7 | |
1307 movq m4, [r2 + 65 + y * 8] | |
1308 punpcklbw m4, m7 | |
1309 %assign x 0 | |
1310 %rep 8 | |
1311 %if x < 4 | |
1312 pshuflw m7, m4, 0x55 * x | |
1313 pshufd m7, m7, 0x44 | |
1314 %else | |
1315 pshufhw m7, m4, 0x55 * (x - 4) | |
1316 pshufd m7, m7, 0xEE | |
1317 %endif | |
1318 | |
1319 PROCESS m7 | |
1320 %if x + y < 10 | |
1321 INCREMENT | |
1322 %endif | |
1323 %assign x x+1 | |
1324 %endrep | |
1325 %assign y y+1 | |
1326 %endrep | |
1327 RET | |
1328 | |
1329 %endif ; end ARCH_X86_32 | |
1330 | |
;-----------------------------------------------------------------------------
; STORE_4x4: write the 16 bytes of m0 to dst as four rows of 4 bytes.
;   in:       m0 = row0 | row1 | row2 | row3 (low to high), r0 = dst, r1 = stride
;   clobbers: m0 (shifted out) and r1 (left holding 3 * stride)
;-----------------------------------------------------------------------------
%macro STORE_4x4 0
    movd        [r0], m0                ; row 0
    psrldq      m0, 4
    movd        [r0 + r1], m0           ; row 1
    psrldq      m0, 4
    movd        [r0 + r1 * 2], m0       ; row 2
    psrldq      m0, 4
    lea         r1, [r1 * 3]            ; the original stride is lost from here on
    movd        [r0 + r1], m0           ; row 3
%endmacro
1341 | |
;-----------------------------------------------------------------------------
; TRANSPOSE_4x4: transpose a 4x4 block of words and pack it to bytes.
;   in:       m0 = lines 0,1 (4 words each), m2 = lines 2,3
;   out:      m0 = 16 bytes, transposed (byte 4*i + j = input line j, element i)
;   clobbers: m1, m2
;-----------------------------------------------------------------------------
%macro TRANSPOSE_4x4 0
    ; interleave the two lines of each register word by word: a0 b0 a1 b1 ...
    pshufd      m0, m0, 0xD8
    pshuflw     m0, m0, 0xD8
    pshufhw     m0, m0, 0xD8
    pshufd      m1, m2, 0xD8
    pshuflw     m1, m1, 0xD8
    pshufhw     m1, m1, 0xD8
    ; merge the register pairs dword-wise, then saturate the words to bytes
    punpckhdq   m2, m0, m1
    punpckldq   m0, m1
    packuswb    m0, m2
%endmacro
1354 | |
1355 ;----------------------------------------------------------------------------------------- | |
1356 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
1357 ;----------------------------------------------------------------------------------------- | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_2: 4x4 prediction that is a pure copy of shifted references.
;   r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode
; Reads 8 bytes from src + 2 when dirMode == 34, otherwise from src + 10;
; output row y is reference bytes y .. y + 3.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_ang4_2, 3,5,1
    lea         r4, [r2 + 2]            ; candidate source for dirMode == 34
    add         r2, 10                  ; default source
    cmp         r3m, byte 34
    cmove       r2, r4

    movh        m0, [r2]
    movd        [r0], m0                ; row 0
    psrldq      m0, 1
    movd        [r0 + r1], m0           ; row 1
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0       ; row 2
    psrldq      m0, 1
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m0           ; row 3
    RET
1375 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_3: 4x4 angular prediction, mode 3.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 9)
; Line k is built from adjacent reference pairs starting k pixels in, weighted
; by pw_ang_table entries 26, 20, 14, 8 (table defined outside this chunk),
; rounded with (+16) >> 5, then the block is transposed and stored.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_ang4_3, 3,3,5
    pxor        m4, m4                  ; zero for byte -> word unpacking
    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    movh        m1, m3                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m3, 2
    movh        m2, m3                  ;[x x x x x x x x 7 6 6 5 5 4 4 3]
    psrldq      m3, 2                   ;[x x x x x x x x 8 7 7 6 6 5 5 4]

    punpcklbw   m0, m4
    punpcklbw   m1, m4
    punpcklbw   m2, m4
    punpcklbw   m3, m4
    pmaddwd     m0, [pw_ang_table + 26 * 16]
    pmaddwd     m1, [pw_ang_table + 20 * 16]
    pmaddwd     m2, [pw_ang_table + 14 * 16]
    pmaddwd     m3, [pw_ang_table + 8 * 16]
    packssdw    m0, m1                  ; lines 0,1
    packssdw    m2, m3                  ; lines 2,3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1408 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_4: 4x4 angular prediction, mode 4.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 9)
; Lines use reference offsets 0, 1, 1, 2 with pw_ang_table entries
; 21, 10, 31, 20; rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_4, 3,3,5
    pxor        m4, m4                  ; zero for byte -> word unpacking
    movh        m1, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
    psrldq      m1, 1
    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    punpcklbw   m0, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m3, m2, [pw_ang_table + 10 * 16]    ; line 1
    pmaddwd     m0, [pw_ang_table + 21 * 16]        ; line 0
    pmaddwd     m2, [pw_ang_table + 31 * 16]        ; line 2
    pmaddwd     m1, [pw_ang_table + 20 * 16]        ; line 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1438 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_5: 4x4 angular prediction, mode 5.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 9)
; Lines use reference offsets 0, 1, 1, 2 with pw_ang_table entries
; 17, 2, 19, 4; rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_5, 3,3,5
    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ;[x x x x x 8 8 7 7 6 6 5 5 4 4 3]

    pxor        m1, m1                  ; zero for byte -> word unpacking
    punpcklbw   m0, m1
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pmaddwd     m4, m2, [pw_ang_table + 2 * 16]     ; line 1
    pmaddwd     m0, [pw_ang_table + 17 * 16]        ; line 0
    pmaddwd     m2, [pw_ang_table + 19 * 16]        ; line 2
    pmaddwd     m3, [pw_ang_table + 4 * 16]         ; line 3
    packssdw    m0, m4
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1468 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_6: 4x4 angular prediction, mode 6.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 9)
; Lines use reference offsets 0, 0, 1, 1 with pw_ang_table entries
; 13, 26, 7, 20; rounded with (+16) >> 5, transposed, stored.
; Only four XMM registers are declared, so m3 is reused as scratch.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_6, 3,3,4
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m2, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    punpcklbw   m0, m1
    punpcklbw   m2, m1
    pmaddwd     m3, m0, [pw_ang_table + 26 * 16]    ; line 1
    pmaddwd     m0, [pw_ang_table + 13 * 16]        ; line 0
    packssdw    m0, m3
    pmaddwd     m3, m2, [pw_ang_table + 20 * 16]    ; line 3
    pmaddwd     m2, [pw_ang_table + 7 * 16]         ; line 2
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1496 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_7: 4x4 angular prediction, mode 7.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 9)
; Lines use reference offsets 0, 0, 0, 1 with pw_ang_table entries
; 9, 18, 27, 4; rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_7, 3,3,5
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m3, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    punpcklbw   m0, m1
    punpcklbw   m3, m1
    pmaddwd     m4, m0, [pw_ang_table + 18 * 16]    ; line 1
    pmaddwd     m2, m0, [pw_ang_table + 27 * 16]    ; line 2
    pmaddwd     m0, [pw_ang_table + 9 * 16]         ; line 0
    pmaddwd     m3, [pw_ang_table + 4 * 16]         ; line 3
    packssdw    m0, m4
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1524 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_8: 4x4 angular prediction, mode 8.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 9)
; All four lines use the same reference pairs with pw_ang_table entries
; 5, 10, 15, 20; rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_8, 3,3,5
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m0, m1

    pmaddwd     m3, m0, [pw_ang_table + 10 * 16]    ; line 1
    pmaddwd     m2, m0, [pw_ang_table + 15 * 16]    ; line 2
    pmaddwd     m4, m0, [pw_ang_table + 20 * 16]    ; line 3
    pmaddwd     m0, [pw_ang_table + 5 * 16]         ; line 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1550 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_9: 4x4 angular prediction, mode 9.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 9)
; All four lines use the same reference pairs with pw_ang_table entries
; 2, 4, 6, 8; rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_9, 3,3,5
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m0, [r2 + 9]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m0, m1

    pmaddwd     m3, m0, [pw_ang_table + 4 * 16]     ; line 1
    pmaddwd     m2, m0, [pw_ang_table + 6 * 16]     ; line 2
    pmaddwd     m4, m0, [pw_ang_table + 8 * 16]     ; line 3
    pmaddwd     m0, [pw_ang_table + 2 * 16]         ; line 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1576 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_10: 4x4 prediction, mode 10 (each row is one replicated pixel).
;   r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter
; Row y is filled with the byte at src[9 + y]. When bFilter != 0, row 0 is
; instead src[9] + ((src[1 + x] - src[0]) >> 1), saturated to 8 bits.
; r1 is left holding 3 * stride.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_10, 3,5,4
    movd        m0, [r2 + 9]            ;[4 3 2 1]
    punpcklbw   m0, m0
    punpcklwd   m0, m0                  ;[4 4 4 4 3 3 3 3 2 2 2 2 1 1 1 1]
    pshufd      m1, m0, 1               ; row 1
    movhlps     m2, m0                  ; row 2
    pshufd      m3, m0, 3               ; row 3
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m3
    cmp         r4m, byte 0
    jz         .quit                    ; unfiltered: row 0 is already in m0

    ; filter row 0
    pxor        m3, m3
    movh        m1, [r2]                ;[7 6 5 4 3 2 1 0]
    punpcklbw   m0, m3                  ; row 0 as words
    punpcklbw   m1, m3
    pshuflw     m2, m1, 0x00            ; src[0] replicated in the low 4 words
    psrldq      m1, 2                   ; src[1..] as words
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

.quit:
    movd        [r0], m0
    RET
1606 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_11: 4x4 angular prediction, mode 11.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line is src[0] (A) followed by src[9..12]. All four lines use the
; same pairs with pw_ang_table entries 30, 28, 26, 24; rounded with
; (+16) >> 5, transposed, stored.
; Note: the 8-byte load at src - 7 reads bytes in front of src; only its last
; byte (src[0]) is used.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_11, 3,3,5
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    pmaddwd     m3, m0, [pw_ang_table + 28 * 16]    ; line 1
    pmaddwd     m2, m0, [pw_ang_table + 26 * 16]    ; line 2
    pmaddwd     m4, m0, [pw_ang_table + 24 * 16]    ; line 3
    pmaddwd     m0, [pw_ang_table + 30 * 16]        ; line 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1634 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_12: 4x4 angular prediction, mode 12.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line is src[0] (A) followed by src[9..12]. All four lines use the
; same pairs with pw_ang_table entries 27, 22, 17, 12; rounded with
; (+16) >> 5, transposed, stored.
; Note: the 8-byte load at src - 7 reads bytes in front of src; only its last
; byte (src[0]) is used.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_12, 3,3,5
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    pmaddwd     m3, m0, [pw_ang_table + 22 * 16]    ; line 1
    pmaddwd     m2, m0, [pw_ang_table + 17 * 16]    ; line 2
    pmaddwd     m4, m0, [pw_ang_table + 12 * 16]    ; line 3
    pmaddwd     m0, [pw_ang_table + 27 * 16]        ; line 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1662 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_24: 4x4 angular prediction, mode 24.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line is src[0] (A) followed by src[1..4]. All four rows use the
; same pairs with pw_ang_table entries 27, 22, 17, 12; rounded with
; (+16) >> 5 and stored row by row (no transpose).
; Note: the 8-byte load at src - 7 reads bytes in front of src; only its last
; byte (src[0]) is used.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_24, 3,3,5
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    pmaddwd     m3, m0, [pw_ang_table + 22 * 16]    ; row 1
    pmaddwd     m2, m0, [pw_ang_table + 17 * 16]    ; row 2
    pmaddwd     m4, m0, [pw_ang_table + 12 * 16]    ; row 3
    pmaddwd     m0, [pw_ang_table + 27 * 16]        ; row 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
1689 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_13: 4x4 angular prediction, mode 13.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: B = src[4], A = src[0], then src[9..12].
; Lines 0-2 start at A with pw_ang_table entries 23, 14, 5; line 3 starts at
; B with entry 28. Rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_13, 3,3,5
    movd        m0, [r2 + 3]            ;[x x B x]
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m2, [r2 + 9]            ;[4 3 2 1]
    punpcklbw   m0, m1                  ;[x x x x A B x x]
    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    movh        m3, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1                  ; m1 is free now: zero for unpacking
    punpcklbw   m0, m1
    punpcklbw   m3, m1
    pmaddwd     m4, m0, [pw_ang_table + 14 * 16]    ; line 1
    pmaddwd     m2, m0, [pw_ang_table + 5 * 16]     ; line 2
    pmaddwd     m0, [pw_ang_table + 23 * 16]        ; line 0
    pmaddwd     m3, [pw_ang_table + 28 * 16]        ; line 3
    packssdw    m0, m4
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1722 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_14: 4x4 angular prediction, mode 14.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: B = src[2], A = src[0], then src[9..12].
; Lines 0-1 start at A with pw_ang_table entries 19, 6; lines 2-3 start at B
; with entries 25, 12. Rounded with (+16) >> 5, transposed, stored.
; Only four XMM registers are declared, so m3 is reused as scratch.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_14, 3,3,4
    movd        m0, [r2 + 1]            ;[x x B x]
    movd        m1, [r2 - 1]            ;[x x A x]
    punpcklbw   m0, m1                  ;[A B x x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1                  ; m1 is free now: zero for unpacking
    punpcklbw   m0, m1
    punpcklbw   m2, m1
    pmaddwd     m3, m0, [pw_ang_table + 6 * 16]     ; line 1
    pmaddwd     m0, [pw_ang_table + 19 * 16]        ; line 0
    packssdw    m0, m3
    pmaddwd     m3, m2, [pw_ang_table + 12 * 16]    ; line 3
    pmaddwd     m2, [pw_ang_table + 25 * 16]        ; line 2
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1755 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_15: 4x4 angular prediction, mode 15.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: C = src[4], B = src[2], A = src[0], then src[9..12].
; Line 0 starts at A (table entry 15), lines 1-2 at B (30, 13), line 3 at C
; (28). Rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_15, 3,3,5
    pxor        m4, m4                  ; zero for byte -> word unpacking
    movd        m0, [r2]                ;[x x x A]
    movd        m1, [r2 + 2]            ;[x x x B]
    punpcklbw   m1, m0                  ;[x x A B]
    movd        m0, [r2 + 3]            ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x x x x x x 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    punpcklbw   m0, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m3, m2, [pw_ang_table + 30 * 16]    ; line 1
    pmaddwd     m0, [pw_ang_table + 15 * 16]        ; line 0
    pmaddwd     m2, [pw_ang_table + 13 * 16]        ; line 2
    pmaddwd     m1, [pw_ang_table + 28 * 16]        ; line 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1792 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_16: 4x4 angular prediction, mode 16.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: C = src[3], B = src[2], A = src[0], then src[9..12].
; Line 0 starts at A (table entry 11), lines 1-2 at B (22, 1), line 3 at C
; (12). Rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_16, 3,3,5
    pxor        m4, m4                  ; zero for byte -> word unpacking
    movd        m2, [r2]                ;[x x x A]
    movd        m1, [r2 + 2]            ;[x x x B]
    punpcklbw   m1, m2                  ;[x x A B]
    movd        m0, [r2 + 2]            ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x x x x x x 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    punpcklbw   m0, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m3, m2, [pw_ang_table + 22 * 16]    ; line 1
    pmaddwd     m0, [pw_ang_table + 11 * 16]        ; line 0
    pmaddwd     m2, [pw_ang_table + 1 * 16]         ; line 2
    pmaddwd     m1, [pw_ang_table + 12 * 16]        ; line 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1829 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_17: 4x4 angular prediction, mode 17.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: D = src[4], C = src[2], B = src[1], A = src[0], then
; src[9..12]. Lines 0..3 start at A, B, C, D with pw_ang_table entries
; 6, 12, 18, 24. Rounded with (+16) >> 5, transposed, stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_17, 3,3,5
    movd        m0, [r2 + 4]            ;[x x x D]
    movd        m4, [r2 + 2]            ;[x x x C]
    movd        m3, [r2 + 1]            ;[x x x B]
    movd        m2, [r2]                ;[x x x A]
    punpcklbw   m0, m4                  ;[x x C D]
    punpcklbw   m3, m2                  ;[x x A B]
    punpcklwd   m0, m3                  ;[A B C D]
    movd        m1, [r2 + 9]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x x x x x x 1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x x x x 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m3, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4                  ; m4 is free now: zero for unpacking
    punpcklbw   m0, m4
    punpcklbw   m3, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]         ; line 0
    pmaddwd     m3, [pw_ang_table + 12 * 16]        ; line 1
    pmaddwd     m2, [pw_ang_table + 18 * 16]        ; line 2
    pmaddwd     m1, [pw_ang_table + 24 * 16]        ; line 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
1869 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_18: 4x4 prediction, mode 18 (pure diagonal copy).
;   r0 = dst, r1 = dstStride, r2 = src
; Builds the 8-byte line [src[11] src[10] src[9] src[0] src[1..4]] and stores
; row y from byte offset 3 - y of that line. r3 is used as scratch.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_18, 3,4,2
    movd        m1, [r2 + 1]            ; src[1..4]
    mov         r3d, [r2 + 8]           ; src[8..11]
    mov         r3b, byte [r2]          ; replace src[8] with src[0]
    bswap       r3d                     ; -> src[11] src[10] src[9] src[0]
    movd        m0, r3d
    punpckldq   m0, m1

    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0           ; row 3
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0       ; row 2
    psrldq      m0, 1
    movd        [r0 + r1], m0           ; row 1
    psrldq      m0, 1
    movd        [r0], m0                ; row 0
    RET
1887 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_19: 4x4 angular prediction, mode 19.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: D = src[12], C = src[10], B = src[9], A = src[0], then
; src[1..4]. Rows 0..3 start at A, B, C, D with pw_ang_table entries
; 6, 12, 18, 24. Rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_19, 3,3,5
    movd        m0, [r2 + 12]           ;[x x x D]
    movd        m4, [r2 + 10]           ;[x x x C]
    movd        m3, [r2 + 9]            ;[x x x B]
    movd        m2, [r2]                ;[x x x A]
    punpcklbw   m0, m4                  ;[x x C D]
    punpcklbw   m3, m2                  ;[x x A B]
    punpcklwd   m0, m3                  ;[A B C D]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C D]
    punpcklbw   m0, m0                  ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x x x x x x 1 A A B B C C D]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x x x x 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m3, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m4, m4                  ; m4 is free now: zero for unpacking
    punpcklbw   m0, m4
    punpcklbw   m3, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m0, [pw_ang_table + 6 * 16]         ; row 0
    pmaddwd     m3, [pw_ang_table + 12 * 16]        ; row 1
    pmaddwd     m2, [pw_ang_table + 18 * 16]        ; row 2
    pmaddwd     m1, [pw_ang_table + 24 * 16]        ; row 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
1926 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_20: 4x4 angular prediction, mode 20.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: C = src[11], B = src[10], A = src[0], then src[1..4].
; Row 0 starts at A (table entry 11), rows 1-2 at B (22, 1), row 3 at C (12).
; Rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_20, 3,3,5
    pxor        m4, m4                  ; zero for byte -> word unpacking
    movd        m2, [r2]                ;[x x x A]
    movd        m1, [r2 + 10]           ;[x x x B]
    punpcklbw   m1, m2                  ;[x x A B]
    movd        m0, [r2 + 10]           ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x x x x x x 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    punpcklbw   m0, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m3, m2, [pw_ang_table + 22 * 16]    ; row 1
    pmaddwd     m0, [pw_ang_table + 11 * 16]        ; row 0
    pmaddwd     m2, [pw_ang_table + 1 * 16]         ; row 2
    pmaddwd     m1, [pw_ang_table + 12 * 16]        ; row 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
1962 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_21: 4x4 angular prediction, mode 21.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: C = src[12], B = src[10], A = src[0], then src[1..4].
; Row 0 starts at A (table entry 15), rows 1-2 at B (30, 13), row 3 at C (28).
; Rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_21, 3,3,5
    pxor        m4, m4                  ; zero for byte -> word unpacking
    movd        m0, [r2]                ;[x x x A]
    movd        m1, [r2 + 10]           ;[x x x B]
    punpcklbw   m1, m0                  ;[x x A B]
    movd        m0, [r2 + 11]           ;[x x C x]
    punpcklwd   m0, m1                  ;[A B C x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B C x]
    psrldq      m0, 1                   ;[x 4 3 2 1 A B C]
    punpcklbw   m0, m0                  ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq      m0, 1
    movh        m1, m0                  ;[x x x x x x x x 2 1 1 A A B B C]
    psrldq      m0, 2
    movh        m2, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    punpcklbw   m0, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m3, m2, [pw_ang_table + 30 * 16]    ; row 1
    pmaddwd     m0, [pw_ang_table + 15 * 16]        ; row 0
    pmaddwd     m2, [pw_ang_table + 13 * 16]        ; row 2
    pmaddwd     m1, [pw_ang_table + 28 * 16]        ; row 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
1998 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_22: 4x4 angular prediction, mode 22.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: B = src[10], A = src[0], then src[1..4].
; Rows 0-1 start at A with pw_ang_table entries 19, 6; rows 2-3 start at B
; with entries 25, 12. Rounded with (+16) >> 5 and stored row by row.
; Only four XMM registers are declared, so m3 is reused as scratch.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_22, 3,3,4
    movd        m0, [r2 + 9]            ;[x x B x]
    movd        m1, [r2 - 1]            ;[x x A x]
    punpcklbw   m0, m1                  ;[A B x x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpckldq   m0, m1                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0                  ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq      m0, 1
    movh        m2, m0                  ;[x x x x x x x x 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1                  ; m1 is free now: zero for unpacking
    punpcklbw   m0, m1
    punpcklbw   m2, m1
    pmaddwd     m3, m0, [pw_ang_table + 6 * 16]     ; row 1
    pmaddwd     m0, [pw_ang_table + 19 * 16]        ; row 0
    packssdw    m0, m3
    pmaddwd     m3, m2, [pw_ang_table + 12 * 16]    ; row 3
    pmaddwd     m2, [pw_ang_table + 25 * 16]        ; row 2
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2030 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_23: 4x4 angular prediction, mode 23.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line: B = src[12], A = src[0], then src[1..4].
; Rows 0-2 start at A with pw_ang_table entries 23, 14, 5; row 3 starts at B
; with entry 28. Rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_23, 3,3,5
    movd        m0, [r2 + 11]           ;[x x B x]
    movd        m1, [r2 - 1]            ;[x x A x]
    movd        m2, [r2 + 1]            ;[4 3 2 1]
    punpcklbw   m0, m1                  ;[x x x x A B x x]
    punpckldq   m0, m2                  ;[4 3 2 1 A B x x]
    psrldq      m0, 2                   ;[x x 4 3 2 1 A B]
    punpcklbw   m0, m0
    psrldq      m0, 1
    mova        m3, m0                  ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq      m0, 2                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1                  ; m1 is free now: zero for unpacking
    punpcklbw   m0, m1
    punpcklbw   m3, m1
    pmaddwd     m4, m0, [pw_ang_table + 14 * 16]    ; row 1
    pmaddwd     m2, m0, [pw_ang_table + 5 * 16]     ; row 2
    pmaddwd     m0, [pw_ang_table + 23 * 16]        ; row 0
    pmaddwd     m3, [pw_ang_table + 28 * 16]        ; row 3
    packssdw    m0, m4
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2062 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_25: 4x4 angular prediction, mode 25.
;   r0 = dst, r1 = dstStride, r2 = src
; Reference line is src[0] (A) followed by src[1..4]. All four rows use the
; same pairs with pw_ang_table entries 30, 28, 26, 24; rounded with
; (+16) >> 5 and stored row by row (no transpose).
; Note: the 8-byte load at src - 7 reads bytes in front of src; only its last
; byte (src[0]) is used.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_25, 3,3,5
    movh        m0, [r2 - 7]            ;[A x x x x x x x]
    movd        m1, [r2 + 1]            ;[4 3 2 1]
    punpcklbw   m1, m1                  ;[4 4 3 3 2 2 1 1]
    punpcklqdq  m0, m1                  ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq      m0, 7                   ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor        m1, m1
    punpcklbw   m0, m1
    pmaddwd     m3, m0, [pw_ang_table + 28 * 16]    ; row 1
    pmaddwd     m2, m0, [pw_ang_table + 26 * 16]    ; row 2
    pmaddwd     m4, m0, [pw_ang_table + 24 * 16]    ; row 3
    pmaddwd     m0, [pw_ang_table + 30 * 16]        ; row 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2089 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_26: 4x4 prediction, mode 26 (every row copies src[1..4]).
;   r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter
; When bFilter != 0 the first column is overwritten with
; src[1] + ((src[9 + y] - src[0]) >> 1), saturated to 8 bits.
; r3 is scratch (3 * stride); r2 is reused as a scratch integer at the end.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_26, 3,4,4
    lea         r3, [r1 * 3]
    movd        m0, [r2 + 1]            ;[4 3 2 1]

    ; store
    movd        [r0], m0
    movd        [r0 + r1], m0
    movd        [r0 + r1 * 2], m0
    movd        [r0 + r3], m0

    ; filter
    cmp         r4m, byte 0
    jz         .quit

    pxor        m3, m3
    movd        m2, [r2]
    movd        m1, [r2 + 9]
    punpcklbw   m0, m3
    punpcklbw   m2, m3
    punpcklbw   m1, m3                  ; src[9..12] as words
    pshuflw     m0, m0, 0x00            ; src[1] replicated
    pshuflw     m2, m2, 0x00            ; src[0] replicated
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

    ; scatter the four filtered bytes down the first column
    movd        r2, m0
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
    shr         r2, 8
    mov         [r0 + r3], r2b

.quit:
    RET
2128 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_27: 4x4 angular prediction, mode 27.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 1)
; All four rows use the same reference pairs with pw_ang_table entries
; 2, 4, 6, 8; rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_27, 3,3,5
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m0, m1

    pmaddwd     m3, m0, [pw_ang_table + 4 * 16]     ; row 1
    pmaddwd     m2, m0, [pw_ang_table + 6 * 16]     ; row 2
    pmaddwd     m4, m0, [pw_ang_table + 8 * 16]     ; row 3
    pmaddwd     m0, [pw_ang_table + 2 * 16]         ; row 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2153 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_28: 4x4 angular prediction, mode 28.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 1)
; All four rows use the same reference pairs with pw_ang_table entries
; 5, 10, 15, 20; rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_28, 3,3,5
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m0, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m0, m0
    psrldq      m0, 1                   ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m0, m1

    pmaddwd     m3, m0, [pw_ang_table + 10 * 16]    ; row 1
    pmaddwd     m2, m0, [pw_ang_table + 15 * 16]    ; row 2
    pmaddwd     m4, m0, [pw_ang_table + 20 * 16]    ; row 3
    pmaddwd     m0, [pw_ang_table + 5 * 16]         ; row 0
    packssdw    m0, m3
    packssdw    m2, m4
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2178 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_29: 4x4 angular prediction, mode 29.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 1)
; Rows use reference offsets 0, 0, 0, 1 with pw_ang_table entries
; 9, 18, 27, 4; rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_29, 3,3,5
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    movh        m0, m3                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m3, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    punpcklbw   m0, m1
    punpcklbw   m3, m1
    pmaddwd     m4, m0, [pw_ang_table + 18 * 16]    ; row 1
    pmaddwd     m2, m0, [pw_ang_table + 27 * 16]    ; row 2
    pmaddwd     m0, [pw_ang_table + 9 * 16]         ; row 0
    pmaddwd     m3, [pw_ang_table + 4 * 16]         ; row 3
    packssdw    m0, m4
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2205 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_30: 4x4 angular prediction, mode 30.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 1)
; Rows use reference offsets 0, 0, 1, 1 with pw_ang_table entries
; 13, 26, 7, 20; rounded with (+16) >> 5 and stored row by row.
; Only four XMM registers are declared, so m3 is reused as scratch.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_30, 3,3,4
    pxor        m1, m1                  ; zero for byte -> word unpacking
    movh        m2, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m2, m2
    psrldq      m2, 1
    movh        m0, m2                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m2, 2                   ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    punpcklbw   m0, m1
    punpcklbw   m2, m1
    pmaddwd     m3, m0, [pw_ang_table + 26 * 16]    ; row 1
    pmaddwd     m0, [pw_ang_table + 13 * 16]        ; row 0
    packssdw    m0, m3
    pmaddwd     m3, m2, [pw_ang_table + 20 * 16]    ; row 3
    pmaddwd     m2, [pw_ang_table + 7 * 16]         ; row 2
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2232 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_31: 4x4 angular prediction, mode 31.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 1)
; Rows use reference offsets 0, 1, 1, 2 with pw_ang_table entries
; 17, 2, 19, 4; rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_31, 3,3,5
    movh        m3, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m3, m3
    psrldq      m3, 1
    mova        m0, m3                  ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq      m3, 2
    mova        m2, m3                  ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]
    psrldq      m3, 2                   ;[x x x x x 8 8 7 7 6 6 5 5 4 4 3]

    pxor        m1, m1                  ; zero for byte -> word unpacking
    punpcklbw   m0, m1
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pmaddwd     m4, m2, [pw_ang_table + 2 * 16]     ; row 1
    pmaddwd     m0, [pw_ang_table + 17 * 16]        ; row 0
    pmaddwd     m2, [pw_ang_table + 19 * 16]        ; row 2
    pmaddwd     m3, [pw_ang_table + 4 * 16]         ; row 3
    packssdw    m0, m4
    packssdw    m2, m3
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2261 | |
;-----------------------------------------------------------------------------
; intra_pred_ang4_32: 4x4 angular prediction, mode 32.
;   r0 = dst, r1 = dstStride, r2 = src (references read from src + 1)
; Rows use reference offsets 0, 1, 1, 2 with pw_ang_table entries
; 21, 10, 31, 20; rounded with (+16) >> 5 and stored row by row.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_32, 3,3,5
    pxor        m4, m4                  ; zero for byte -> word unpacking
    movh        m1, [r2 + 1]            ;[8 7 6 5 4 3 2 1]
    punpcklbw   m1, m1
    psrldq      m1, 1
    movh        m0, m1                  ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq      m1, 2
    movh        m2, m1                  ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq      m1, 2                   ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    punpcklbw   m0, m4
    punpcklbw   m2, m4
    punpcklbw   m1, m4
    pmaddwd     m3, m2, [pw_ang_table + 10 * 16]    ; row 1
    pmaddwd     m0, [pw_ang_table + 21 * 16]        ; row 0
    pmaddwd     m2, [pw_ang_table + 31 * 16]        ; row 2
    pmaddwd     m1, [pw_ang_table + 20 * 16]        ; row 3
    packssdw    m0, m3
    packssdw    m2, m1
    paddw       m0, [pw_16]
    paddw       m2, [pw_16]
    psraw       m0, 5
    psraw       m2, 5
    packuswb    m0, m2

    STORE_4x4
    RET
2290 | |
; 4x4 angular intra prediction, mode 33 (going by the function name). | |
; Each row uses its own sample-pair vector (offsets 0, +1, +2, +3 from [r2 + 1]) and its own | |
; pw_ang_table entry (26, 20, 14, 8); results get [pw_16] added and are shifted right by 5. | |
; NOTE(review): STORE_4x4 is defined outside this chunk; presumably it writes the 16 bytes of m0 as four rows - confirm. | |
2291 cglobal intra_pred_ang4_33, 3,3,5 | |
2292 movh m3, [r2 + 1] ; [8 7 6 5 4 3 2 1] | |
2293 punpcklbw m3, m3 | |
2294 psrldq m3, 1 | |
2295 movh m0, m3 ;[x x x x x x x x 5 4 4 3 3 2 2 1] | |
2296 psrldq m3, 2 | |
2297 movh m1, m3 ;[x x x x x x x x 6 5 5 4 4 3 3 2] | |
2298 psrldq m3, 2 | |
2299 movh m2, m3 ;[x x x x x x x x 7 6 6 5 5 4 4 3] | |
2300 psrldq m3, 2 ;[x x x x x x x x 8 7 7 6 6 5 5 4] | |
2301 | |
; m4 = 0, used to zero-extend bytes to words. | |
2302 pxor m4, m4 | |
2303 punpcklbw m1, m4 | |
2304 pmaddwd m1, [pw_ang_table + 20 * 16] | |
2305 punpcklbw m0, m4 | |
2306 pmaddwd m0, [pw_ang_table + 26 * 16] | |
; m0 = first two result rows as words. | |
2307 packssdw m0, m1 | |
2308 paddw m0, [pw_16] | |
2309 psraw m0, 5 | |
2310 punpcklbw m3, m4 | |
2311 pmaddwd m3, [pw_ang_table + 8 * 16] | |
2312 punpcklbw m2, m4 | |
2313 pmaddwd m2, [pw_ang_table + 14 * 16] | |
; m2 = last two result rows as words. | |
2314 packssdw m2, m3 | |
2315 paddw m2, [pw_16] | |
2316 psraw m2, 5 | |
; Narrow all 16 results to bytes with unsigned saturation. | |
2317 packuswb m0, m2 | |
2318 | |
2319 STORE_4x4 | |
2320 RET | |
2321 | |
2322 ;--------------------------------------------------------------------------------------------- | |
2323 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
2324 ;--------------------------------------------------------------------------------------------- | |
; 4x4 DC prediction. | |
; dc_val = (sum of the 4 bytes at srcPix + 1 and the 4 bytes at srcPix + 9, + 4) / 8; all 16 pixels are set to dc_val. | |
; If bFilter (r4d) is non-zero, row 0 and column 0 are afterwards overwritten with edge-filtered values. | |
2325 INIT_XMM sse4 | |
2326 cglobal intra_pred_dc4, 5,5,3 | |
; After the inc, [r2] is the first of the 4 "top" bytes and [r2 + 8] the first of the 4 "left" bytes. | |
2327 inc r2 | |
2328 pxor m0, m0 | |
2329 movd m1, [r2] | |
2330 movd m2, [r2 + 8] | |
2331 punpckldq m1, m2 | |
2332 psadbw m1, m0 ; m1 = sum | |
2333 | |
; Latch ZF = (bFilter == 0) here because r4d is reused for dc_val below. The instructions between | |
; this test and 'jz .end' (SSE ops, movd, lea) do not write EFLAGS, so the result is still valid there. | |
2334 test r4d, r4d | |
2335 | |
2336 pmulhrsw m1, [pw_4096] ; m1 = (sum + 4) / 8 | |
2337 movd r4d, m1 ; r4d = dc_val | |
2338 pshufb m1, m0 ; m1 = byte [dc_val ...] | |
2339 | |
2340 ; store DC 4x4 | |
2341 lea r3, [r1 * 3] | |
2342 movd [r0], m1 | |
2343 movd [r0 + r1], m1 | |
2344 movd [r0 + r1 * 2], m1 | |
2345 movd [r0 + r3], m1 | |
2346 | |
2347 ; do DC filter | |
; ZF still comes from 'test r4d, r4d' above: skip the filter when bFilter == 0. | |
2348 jz .end | |
2349 lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2 | |
2350 add r4d, r3d ; r4d = DC * 3 + 2 | |
2351 movd m1, r4d | |
2352 pshuflw m1, m1, 0 ; m1 = pixDCx3 | |
2353 pshufd m1, m1, 0 | |
2354 | |
2355 ; filter top | |
; m2 words 0..3 come from [r2 .. r2 + 3], words 4..7 from [r2 + 9 .. r2 + 12]; each becomes (byte + DC * 3 + 2) >> 2. | |
2356 movd m2, [r2] | |
2357 movd m0, [r2 + 9] | |
2358 punpckldq m2, m0 | |
2359 pmovzxbw m2, m2 | |
2360 paddw m2, m1 | |
2361 psraw m2, 2 | |
2362 packuswb m2, m2 | |
2363 movd [r0], m2 ; overwrite top-left pixel, we will update it later | |
2364 | |
2365 ; filter top-left | |
; dst[0] = (byte [r2 + 8] + byte [r2] + DC * 2 + 2) >> 2 | |
2366 movzx r4d, byte [r2 + 8] | |
2367 add r3d, r4d | |
2368 movzx r4d, byte [r2] | |
2369 add r3d, r4d | |
2370 shr r3d, 2 | |
2371 mov [r0], r3b | |
2372 | |
2373 ; filter left | |
; Bytes 4..6 of m2 (filtered from [r2 + 9 .. r2 + 11]) go to column 0 of rows 1..3. | |
2374 add r0, r1 | |
2375 pextrb [r0], m2, 4 | |
2376 pextrb [r0 + r1], m2, 5 | |
2377 pextrb [r0 + r1 * 2], m2, 6 | |
2378 | |
2379 .end: | |
2380 RET | |
2381 | |
2382 ;--------------------------------------------------------------------------------------------- | |
2383 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
2384 ;--------------------------------------------------------------------------------------------- | |
; 8x8 DC prediction. | |
; dc_val = (sum of the 8 bytes at srcPix + 1 and the 8 bytes at srcPix + 17, + 8) / 16; all 64 pixels are set to dc_val. | |
; If bFilter (r4d) is non-zero, row 0 and column 0 are afterwards overwritten with edge-filtered values. | |
2385 INIT_XMM sse4 | |
2386 cglobal intra_pred_dc8, 5, 7, 3 | |
; r3 = srcPix + 17 (first "left" byte), r2 = srcPix + 1 (first "top" byte). | |
2387 lea r3, [r2 + 17] | |
2388 inc r2 | |
2389 pxor m0, m0 | |
2390 movh m1, [r2] | |
2391 movh m2, [r3] | |
2392 punpcklqdq m1, m2 | |
; psadbw against zero yields one byte-sum per 64-bit half; pshufd/paddw adds the two halves. | |
2393 psadbw m1, m0 | |
2394 pshufd m2, m1, 2 | |
2395 paddw m1, m2 | |
2396 | |
2397 movd r5d, m1 | |
2398 add r5d, 8 | |
2399 shr r5d, 4 ; r5d = dc_val = (sum + 8) / 16 | |
2400 movd m1, r5d | |
2401 pshufb m1, m0 ; m1 = byte [dc_val ...] | |
2402 | |
; Latch ZF = (bFilter == 0) here because r4d is reused below. The mov/movh/lea instructions up to | |
; 'jz .end' do not write EFLAGS, so the result is still valid there. | |
2403 test r4d, r4d | |
2404 | |
2405 ; store DC 8x8 | |
; r6 keeps the original dst pointer for the filter stage; r0 is advanced two rows at a time. | |
2406 mov r6, r0 | |
2407 movh [r0], m1 | |
2408 movh [r0 + r1], m1 | |
2409 lea r0, [r0 + r1 * 2] | |
2410 movh [r0], m1 | |
2411 movh [r0 + r1], m1 | |
2412 lea r0, [r0 + r1 * 2] | |
2413 movh [r0], m1 | |
2414 movh [r0 + r1], m1 | |
2415 lea r0, [r0 + r1 * 2] | |
2416 movh [r0], m1 | |
2417 movh [r0 + r1], m1 | |
2418 | |
2419 ; Do DC Filter | |
2420 jz .end | |
2421 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 | |
2422 add r5d, r4d ; r5d = DC * 3 + 2 | |
2423 movd m1, r5d | |
2424 pshuflw m1, m1, 0 ; m1 = pixDCx3 | |
2425 pshufd m1, m1, 0 | |
2426 | |
2427 ; filter top | |
; row 0: each of the 8 bytes at [r2] becomes (byte + DC * 3 + 2) >> 2 | |
2428 pmovzxbw m2, [r2] | |
2429 paddw m2, m1 | |
2430 psraw m2, 2 | |
2431 packuswb m2, m2 | |
2432 movh [r6], m2 | |
2433 | |
2434 ; filter top-left | |
; dst[0] = (byte [r3] + byte [r2] + DC * 2 + 2) >> 2 | |
2435 movzx r5d, byte [r3] | |
2436 add r4d, r5d | |
2437 movzx r3d, byte [r2] | |
2438 add r3d, r4d | |
2439 shr r3d, 2 | |
2440 mov [r6], r3b | |
2441 | |
2442 ; filter left | |
; m2 = filtered versions of the 8 bytes at [r2 + 17]; bytes 0..6 go to column 0 of rows 1..7. | |
2443 add r6, r1 | |
2444 pmovzxbw m2, [r2 + 17] | |
2445 paddw m2, m1 | |
2446 psraw m2, 2 | |
2447 packuswb m2, m2 | |
2448 pextrb [r6], m2, 0 | |
2449 pextrb [r6 + r1], m2, 1 | |
2450 pextrb [r6 + 2 * r1], m2, 2 | |
2451 lea r6, [r6 + r1 * 2] | |
2452 pextrb [r6 + r1], m2, 3 | |
2453 pextrb [r6 + r1 * 2], m2, 4 | |
; Byte 6 (row 7) is stored before byte 5 (row 6) because r1 is replaced by 3 * r1 right after. | |
2454 pextrb [r6 + r1 * 4], m2, 6 | |
2455 lea r1, [r1 * 3] | |
2456 pextrb [r6 + r1], m2, 5 | |
2457 | |
2458 .end: | |
2459 RET | |
2460 | |
2461 ;-------------------------------------------------------------------------------------------- | |
2462 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
2463 ;-------------------------------------------------------------------------------------------- | |
; 16x16 DC prediction. | |
; dc_val = (sum of the 16 bytes at srcPix + 1 and the 16 bytes at srcPix + 33, + 16) / 32; the whole block is set to dc_val. | |
; If bFilter (r4d) is non-zero, row 0 and column 0 are afterwards overwritten with edge-filtered values. | |
2464 INIT_XMM sse4 | |
2465 cglobal intra_pred_dc16, 5, 7, 4 | |
; r3 = srcPix + 33 (first "left" byte), r2 = srcPix + 1 (first "top" byte). | |
2466 lea r3, [r2 + 33] | |
2467 inc r2 | |
2468 pxor m0, m0 | |
2469 movu m1, [r2] | |
2470 movu m2, [r3] | |
; psadbw against zero yields one byte-sum per 64-bit half; the paddw/pshufd/paddw sequence adds all four partial sums. | |
2471 psadbw m1, m0 | |
2472 psadbw m2, m0 | |
2473 paddw m1, m2 | |
2474 pshufd m2, m1, 2 | |
2475 paddw m1, m2 | |
2476 | |
2477 movd r5d, m1 | |
2478 add r5d, 16 | |
2479 shr r5d, 5 ; r5d = dc_val = (sum + 16) / 32 | |
2480 movd m1, r5d | |
2481 pshufb m1, m0 ; m1 = byte [dc_val ...] | |
2482 | |
; Latch ZF = (bFilter == 0) here because r4d is reused below. The mov/movu/lea instructions up to | |
; 'jz .end' do not write EFLAGS, so the result is still valid there. | |
2483 test r4d, r4d | |
2484 | |
2485 ; store DC 16x16 | |
; r6 keeps the original dst pointer for the filter stage; r0 is advanced two rows at a time. | |
2486 mov r6, r0 | |
2487 movu [r0], m1 | |
2488 movu [r0 + r1], m1 | |
2489 lea r0, [r0 + r1 * 2] | |
2490 movu [r0], m1 | |
2491 movu [r0 + r1], m1 | |
2492 lea r0, [r0 + r1 * 2] | |
2493 movu [r0], m1 | |
2494 movu [r0 + r1], m1 | |
2495 lea r0, [r0 + r1 * 2] | |
2496 movu [r0], m1 | |
2497 movu [r0 + r1], m1 | |
2498 lea r0, [r0 + r1 * 2] | |
2499 movu [r0], m1 | |
2500 movu [r0 + r1], m1 | |
2501 lea r0, [r0 + r1 * 2] | |
2502 movu [r0], m1 | |
2503 movu [r0 + r1], m1 | |
2504 lea r0, [r0 + r1 * 2] | |
2505 movu [r0], m1 | |
2506 movu [r0 + r1], m1 | |
2507 lea r0, [r0 + r1 * 2] | |
2508 movu [r0], m1 | |
2509 movu [r0 + r1], m1 | |
2510 | |
2511 ; Do DC Filter | |
2512 jz .end | |
2513 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 | |
2514 add r5d, r4d ; r5d = DC * 3 + 2 | |
2515 movd m1, r5d | |
2516 pshuflw m1, m1, 0 ; m1 = pixDCx3 | |
2517 pshufd m1, m1, 0 | |
2518 | |
2519 ; filter top | |
; row 0, done in two 8-pixel halves: each byte at [r2 .. r2 + 15] becomes (byte + DC * 3 + 2) >> 2 | |
2520 pmovzxbw m2, [r2] | |
2521 paddw m2, m1 | |
2522 psraw m2, 2 | |
2523 packuswb m2, m2 | |
2524 movh [r6], m2 | |
2525 pmovzxbw m3, [r2 + 8] | |
2526 paddw m3, m1 | |
2527 psraw m3, 2 | |
2528 packuswb m3, m3 | |
2529 movh [r6 + 8], m3 | |
2530 | |
2531 ; filter top-left | |
; dst[0] = (byte [r3] + byte [r2] + DC * 2 + 2) >> 2 | |
2532 movzx r5d, byte [r3] | |
2533 add r4d, r5d | |
2534 movzx r3d, byte [r2] | |
2535 add r3d, r4d | |
2536 shr r3d, 2 | |
2537 mov [r6], r3b | |
2538 | |
2539 ; filter left | |
; m2 = filtered versions of the 8 bytes at [r2 + 33]; bytes 0..7 go to column 0 of rows 1..8. | |
2540 add r6, r1 | |
2541 pmovzxbw m2, [r2 + 33] | |
2542 paddw m2, m1 | |
2543 psraw m2, 2 | |
2544 packuswb m2, m2 | |
2545 pextrb [r6], m2, 0 | |
2546 pextrb [r6 + r1], m2, 1 | |
2547 pextrb [r6 + r1 * 2], m2, 2 | |
2548 lea r6, [r6 + r1 * 2] | |
2549 pextrb [r6 + r1], m2, 3 | |
2550 pextrb [r6 + r1 * 2], m2, 4 | |
2551 lea r6, [r6 + r1 * 2] | |
2552 pextrb [r6 + r1], m2, 5 | |
2553 pextrb [r6 + r1 * 2], m2, 6 | |
2554 lea r6, [r6 + r1 * 2] | |
2555 pextrb [r6 + r1], m2, 7 | |
2556 | |
; m3 = filtered versions of the 8 bytes at [r2 + 41]; bytes 0..6 go to column 0 of rows 9..15. | |
2557 pmovzxbw m3, [r2 + 41] | |
2558 paddw m3, m1 | |
2559 psraw m3, 2 | |
2560 packuswb m3, m3 | |
2561 pextrb [r6 + r1 * 2], m3, 0 | |
2562 lea r6, [r6 + r1 * 2] | |
2563 pextrb [r6 + r1], m3, 1 | |
2564 pextrb [r6 + r1 * 2], m3, 2 | |
2565 lea r6, [r6 + r1 * 2] | |
2566 pextrb [r6 + r1], m3, 3 | |
2567 pextrb [r6 + r1 * 2], m3, 4 | |
2568 lea r6, [r6 + r1 * 2] | |
2569 pextrb [r6 + r1], m3, 5 | |
2570 pextrb [r6 + r1 * 2], m3, 6 | |
2571 | |
2572 .end: | |
2573 RET | |
2574 | |
2575 ;--------------------------------------------------------------------------------------------- | |
2576 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
2577 ;--------------------------------------------------------------------------------------------- | |
; 32x32 DC prediction, SSE4 version. | |
; dc_val = (sum of the 32 bytes at srcPix + 1 and the 32 bytes at srcPix + 65, + 32) / 64; the whole block is set to dc_val. | |
; Only three arguments are loaded (cglobal ..., 3, ...) and there is no edge-filter stage in this routine. | |
2578 INIT_XMM sse4 | |
2579 cglobal intra_pred_dc32, 3, 5, 5 | |
; r3 = srcPix + 65 (first "left" byte), r2 = srcPix + 1 (first "top" byte). | |
2580 lea r3, [r2 + 65] | |
2581 inc r2 | |
2582 pxor m0, m0 | |
2583 movu m1, [r2] | |
2584 movu m2, [r2 + 16] | |
2585 movu m3, [r3] | |
2586 movu m4, [r3 + 16] | |
; psadbw against zero yields one byte-sum per 64-bit half of each register; the adds below combine all eight partial sums. | |
2587 psadbw m1, m0 | |
2588 psadbw m2, m0 | |
2589 psadbw m3, m0 | |
2590 psadbw m4, m0 | |
2591 paddw m1, m2 | |
2592 paddw m3, m4 | |
2593 paddw m1, m3 | |
2594 pshufd m2, m1, 2 | |
2595 paddw m1, m2 | |
2596 | |
2597 movd r4d, m1 | |
2598 add r4d, 32 | |
2599 shr r4d, 6 ; r4d = dc_val = (sum + 32) / 64 | |
2600 movd m1, r4d | |
2601 pshufb m1, m0 ; m1 = byte [dc_val ...] | |
2602 | |
2603 %rep 2 | |
2604 ; store 16 rows of 32 bytes each (two 16-byte stores per row); the %rep 2 covers all 32 rows | |
2605 movu [r0], m1 | |
2606 movu [r0 + r1], m1 | |
2607 movu [r0 + 16], m1 | |
2608 movu [r0 + r1 + 16],m1 | |
2609 lea r0, [r0 + 2 * r1] | |
2610 movu [r0], m1 | |
2611 movu [r0 + r1], m1 | |
2612 movu [r0 + 16], m1 | |
2613 movu [r0 + r1 + 16],m1 | |
2614 lea r0, [r0 + 2 * r1] | |
2615 movu [r0], m1 | |
2616 movu [r0 + r1], m1 | |
2617 movu [r0 + 16], m1 | |
2618 movu [r0 + r1 + 16],m1 | |
2619 lea r0, [r0 + 2 * r1] | |
2620 movu [r0], m1 | |
2621 movu [r0 + r1], m1 | |
2622 movu [r0 + 16], m1 | |
2623 movu [r0 + r1 + 16],m1 | |
2624 lea r0, [r0 + 2 * r1] | |
2625 movu [r0], m1 | |
2626 movu [r0 + r1], m1 | |
2627 movu [r0 + 16], m1 | |
2628 movu [r0 + r1 + 16],m1 | |
2629 lea r0, [r0 + 2 * r1] | |
2630 movu [r0], m1 | |
2631 movu [r0 + r1], m1 | |
2632 movu [r0 + 16], m1 | |
2633 movu [r0 + r1 + 16],m1 | |
2634 lea r0, [r0 + 2 * r1] | |
2635 movu [r0], m1 | |
2636 movu [r0 + r1], m1 | |
2637 movu [r0 + 16], m1 | |
2638 movu [r0 + r1 + 16],m1 | |
2639 lea r0, [r0 + 2 * r1] | |
2640 movu [r0], m1 | |
2641 movu [r0 + r1], m1 | |
2642 movu [r0 + 16], m1 | |
2643 movu [r0 + r1 + 16],m1 | |
; In the second %rep pass this final pointer advance is not used by any later instruction. | |
2644 lea r0, [r0 + 2 * r1] | |
2645 %endrep | |
2646 | |
2647 RET | |
2648 | |
2649 ;--------------------------------------------------------------------------------------------- | |
2650 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) | |
2651 ;--------------------------------------------------------------------------------------------- | |
; 32x32 DC prediction, AVX2 version (assembled only when ARCH_X86_64 == 1). | |
; dc_val = (sum of the 32 bytes at srcPix + 1 and the 32 bytes at srcPix + 65, + 32) / 64; each row is one 32-byte store. | |
; Only three arguments are loaded and there is no edge-filter stage in this routine. | |
2652 %if ARCH_X86_64 == 1 | |
2653 INIT_YMM avx2 | |
2654 cglobal intra_pred_dc32, 3, 4, 3 | |
; r3 = 3 * stride, so four rows can be addressed from one base pointer. | |
2655 lea r3, [r1 * 3] | |
2656 pxor m0, m0 | |
2657 movu m1, [r2 + 1] | |
2658 movu m2, [r2 + 65] | |
; psadbw against zero yields one byte-sum per 64-bit element; the steps below fold them into element 0. | |
2659 psadbw m1, m0 | |
2660 psadbw m2, m0 | |
2661 paddw m1, m2 | |
2662 vextracti128 xm2, m1, 1 | |
2663 paddw m1, m2 | |
2664 pshufd m2, m1, 2 | |
2665 paddw m1, m2 | |
2666 | |
2667 pmulhrsw m1, [pw_512] ; sum = (sum + 32) / 64 | |
2668 vpbroadcastb m1, xm1 ; m1 = byte [dc_val ...] | |
2669 | |
; 8 groups of 4 rows = 32 rows. | |
2670 movu [r0 + r1 * 0], m1 | |
2671 movu [r0 + r1 * 1], m1 | |
2672 movu [r0 + r1 * 2], m1 | |
2673 movu [r0 + r3 * 1], m1 | |
2674 lea r0, [r0 + 4 * r1] | |
2675 movu [r0 + r1 * 0], m1 | |
2676 movu [r0 + r1 * 1], m1 | |
2677 movu [r0 + r1 * 2], m1 | |
2678 movu [r0 + r3 * 1], m1 | |
2679 lea r0, [r0 + 4 * r1] | |
2680 movu [r0 + r1 * 0], m1 | |
2681 movu [r0 + r1 * 1], m1 | |
2682 movu [r0 + r1 * 2], m1 | |
2683 movu [r0 + r3 * 1], m1 | |
2684 lea r0, [r0 + 4 * r1] | |
2685 movu [r0 + r1 * 0], m1 | |
2686 movu [r0 + r1 * 1], m1 | |
2687 movu [r0 + r1 * 2], m1 | |
2688 movu [r0 + r3 * 1], m1 | |
2689 lea r0, [r0 + 4 * r1] | |
2690 movu [r0 + r1 * 0], m1 | |
2691 movu [r0 + r1 * 1], m1 | |
2692 movu [r0 + r1 * 2], m1 | |
2693 movu [r0 + r3 * 1], m1 | |
2694 lea r0, [r0 + 4 * r1] | |
2695 movu [r0 + r1 * 0], m1 | |
2696 movu [r0 + r1 * 1], m1 | |
2697 movu [r0 + r1 * 2], m1 | |
2698 movu [r0 + r3 * 1], m1 | |
2699 lea r0, [r0 + 4 * r1] | |
2700 movu [r0 + r1 * 0], m1 | |
2701 movu [r0 + r1 * 1], m1 | |
2702 movu [r0 + r1 * 2], m1 | |
2703 movu [r0 + r3 * 1], m1 | |
2704 lea r0, [r0 + 4 * r1] | |
2705 movu [r0 + r1 * 0], m1 | |
2706 movu [r0 + r1 * 1], m1 | |
2707 movu [r0 + r1 * 2], m1 | |
2708 movu [r0 + r3 * 1], m1 | |
2709 RET | |
2710 %endif ;; ARCH_X86_64 == 1 | |
2711 | |
2712 ;--------------------------------------------------------------------------------------- | |
2713 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
2714 ;--------------------------------------------------------------------------------------- | |
; 4x4 planar prediction. | |
; m1 = words from the 8 bytes at srcPix + 1 (above), m2 = words from the 8 bytes at srcPix + 9 (presumably left, by analogy with planar8). | |
; m3 is a running row accumulator: it starts as multi_2Row * topRight + pw_4 + bottomLeft + pw_3 * above | |
; and has m4 = (bottomLeft - above) added after every row. Each row is (m2[y] * pw_planar4_0 + m3) >> 3. | |
; NOTE(review): multi_2Row, pw_3, pw_4 and pw_planar4_0 are defined outside this chunk. | |
2715 INIT_XMM sse4 | |
2716 cglobal intra_pred_planar4, 3,3,7 | |
2717 pmovzxbw m1, [r2 + 1] | |
2718 pmovzxbw m2, [r2 + 9] | |
2719 pshufhw m3, m1, 0 ; topRight | |
2720 pshufd m3, m3, 0xAA | |
2721 pshufhw m4, m2, 0 ; bottomLeft | |
2722 pshufd m4, m4, 0xAA | |
2723 pmullw m3, [multi_2Row] ; (x + 1) * topRight | |
2724 pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x] | |
2725 mova m6, [pw_planar4_0] | |
2726 paddw m3, [pw_4] | |
2727 paddw m3, m4 | |
2728 paddw m3, m0 | |
2729 psubw m4, m1 | |
2730 | |
; row 0: broadcast word 0 of m2 across the low four words | |
2731 pshuflw m5, m2, 0 | |
2732 pmullw m5, m6 | |
2733 paddw m5, m3 | |
2734 paddw m3, m4 | |
2735 psraw m5, 3 | |
2736 packuswb m5, m5 | |
2737 movd [r0], m5 | |
2738 | |
; row 1: word 1 of m2 | |
2739 pshuflw m5, m2, 01010101b | |
2740 pmullw m5, m6 | |
2741 paddw m5, m3 | |
2742 paddw m3, m4 | |
2743 psraw m5, 3 | |
2744 packuswb m5, m5 | |
2745 movd [r0 + r1], m5 | |
2746 lea r0, [r0 + 2 * r1] | |
2747 | |
; row 2: word 2 of m2 | |
2748 pshuflw m5, m2, 10101010b | |
2749 pmullw m5, m6 | |
2750 paddw m5, m3 | |
2751 paddw m3, m4 | |
2752 psraw m5, 3 | |
2753 packuswb m5, m5 | |
2754 movd [r0], m5 | |
2755 | |
; row 3: word 3 of m2 | |
2756 pshuflw m5, m2, 11111111b | |
2757 pmullw m5, m6 | |
2758 paddw m5, m3 | |
; This last accumulator update is not consumed by any later instruction. | |
2759 paddw m3, m4 | |
2760 psraw m5, 3 | |
2761 packuswb m5, m5 | |
2762 movd [r0 + r1], m5 | |
2763 RET | |
2764 | |
2765 ;--------------------------------------------------------------------------------------- | |
2766 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
2767 ;--------------------------------------------------------------------------------------- | |
; 8x8 planar prediction. | |
; m1 = above[0..7] as words (from srcPix + 1), m2 = left[0..7] as words (from srcPix + 17). | |
; m3 is a running row accumulator: multiL * topRight + pw_8 + bottomLeft + pw_7 * above, | |
; with m4 = (bottomLeft - above) added after every row. Each row is (left[y] * m6 + m3) >> 4. | |
; NOTE(review): multiL, pw_7, pw_8 and pw_planar16_mul are defined outside this chunk. | |
2768 INIT_XMM sse4 | |
2769 cglobal intra_pred_planar8, 3,3,7 | |
2770 pmovzxbw m1, [r2 + 1] | |
2771 pmovzxbw m2, [r2 + 17] | |
2772 | |
2773 movd m3, [r2 + 9] ; topRight = above[8]; | |
2774 movd m4, [r2 + 25] ; bottomLeft = left[8]; | |
2775 | |
; pshufb with an all-zero mask broadcasts byte 0; punpcklbw with zero then widens to words. | |
2776 pxor m0, m0 | |
2777 pshufb m3, m0 | |
2778 pshufb m4, m0 | |
2779 punpcklbw m3, m0 ; v_topRight | |
2780 punpcklbw m4, m0 ; v_bottomLeft | |
2781 pmullw m3, [multiL] ; (x + 1) * topRight | |
2782 pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x] | |
; m6 = second 16-byte half of pw_planar16_mul, the per-column multiplier applied to left[y]. | |
2783 mova m6, [pw_planar16_mul + mmsize] | |
2784 paddw m3, [pw_8] | |
2785 paddw m3, m4 | |
2786 paddw m3, m0 | |
2787 psubw m4, m1 | |
2788 | |
; INTRA_PRED_PLANAR8 y: broadcast word y of m2 to all 8 words, emit one 8-pixel row at [r0], | |
; then advance r0 by one stride and m3 by m4. Clobbers m5. | |
2789 %macro INTRA_PRED_PLANAR8 1 | |
2790 %if (%1 < 4) | |
2791 pshuflw m5, m2, 0x55 * %1 | |
2792 pshufd m5, m5, 0 | |
2793 %else | |
2794 pshufhw m5, m2, 0x55 * (%1 - 4) | |
2795 pshufd m5, m5, 0xAA | |
2796 %endif | |
2797 pmullw m5, m6 | |
2798 paddw m5, m3 | |
2799 paddw m3, m4 | |
2800 psraw m5, 4 | |
2801 packuswb m5, m5 | |
2802 movh [r0], m5 | |
2803 lea r0, [r0 + r1] | |
2804 %endmacro | |
2805 | |
2806 INTRA_PRED_PLANAR8 0 | |
2807 INTRA_PRED_PLANAR8 1 | |
2808 INTRA_PRED_PLANAR8 2 | |
2809 INTRA_PRED_PLANAR8 3 | |
2810 INTRA_PRED_PLANAR8 4 | |
2811 INTRA_PRED_PLANAR8 5 | |
2812 INTRA_PRED_PLANAR8 6 | |
2813 INTRA_PRED_PLANAR8 7 | |
2814 RET | |
2815 | |
2816 ;--------------------------------------------------------------------------------------- | |
2817 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
2818 ;--------------------------------------------------------------------------------------- | |
; 16x16 planar prediction, SSE4 version; every row is computed as two 8-word halves. | |
; m3 / m4 are the running accumulators for columns 0..7 / 8..15; after each row they are advanced by | |
; m6 = (bottomLeft - above[0..7]) and m1 = (bottomLeft - above[8..15]). Each row is (left[y] * pw_planar16_mul + acc) >> 5. | |
; NOTE(review): multiL, multiH, pw_15, pw_16 and pw_planar16_mul are defined outside this chunk. | |
2819 INIT_XMM sse4 | |
2820 cglobal intra_pred_planar16, 3,3,8 | |
2821 pmovzxbw m2, [r2 + 1] | |
2822 pmovzxbw m7, [r2 + 9] | |
2823 | |
2824 movd m3, [r2 + 17] ; topRight = above[16] | |
2825 movd m6, [r2 + 49] ; bottomLeft = left[16] | |
2826 | |
; pshufb with an all-zero mask broadcasts byte 0; punpcklbw with zero then widens to words. | |
2827 pxor m0, m0 | |
2828 pshufb m3, m0 | |
2829 pshufb m6, m0 | |
2830 punpcklbw m3, m0 ; v_topRight | |
2831 punpcklbw m6, m0 ; v_bottomLeft | |
2832 pmullw m4, m3, [multiH] ; (x + 1) * topRight | |
2833 pmullw m3, [multiL] ; (x + 1) * topRight | |
2834 pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x] | |
2835 pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x] | |
2836 paddw m4, [pw_16] | |
2837 paddw m3, [pw_16] | |
2838 paddw m4, m6 | |
2839 paddw m3, m6 | |
2840 paddw m4, m5 | |
2841 paddw m3, m1 | |
; per-row increments: m1 for the high half, m6 for the low half | |
2842 psubw m1, m6, m7 | |
2843 psubw m6, m2 | |
2844 | |
; m2 / m7 are reused from here on for the 16 left samples (bytes at srcPix + 33 .. srcPix + 48), as words. | |
2845 pmovzxbw m2, [r2 + 33] | |
2846 pmovzxbw m7, [r2 + 41] | |
2847 | |
; INTRA_PRED_PLANAR16 y: broadcast left sample y (word y of m2 for y < 8, word y - 8 of m7 otherwise), | |
; emit one 16-pixel row at [r0], then advance r0 by one stride and the accumulators m3/m4. Clobbers m0, m5. | |
2848 %macro INTRA_PRED_PLANAR16 1 | |
2849 %if (%1 < 4) | |
2850 pshuflw m5, m2, 0x55 * %1 | |
2851 pshufd m5, m5, 0 | |
2852 %else | |
2853 %if (%1 < 8) | |
2854 pshufhw m5, m2, 0x55 * (%1 - 4) | |
2855 pshufd m5, m5, 0xAA | |
2856 %else | |
2857 %if (%1 < 12) | |
2858 pshuflw m5, m7, 0x55 * (%1 - 8) | |
2859 pshufd m5, m5, 0 | |
2860 %else | |
2861 pshufhw m5, m7, 0x55 * (%1 - 12) | |
2862 pshufd m5, m5, 0xAA | |
2863 %endif | |
2864 %endif | |
2865 %endif | |
2866 pmullw m0, m5, [pw_planar16_mul + mmsize] | |
2867 pmullw m5, [pw_planar16_mul] | |
2868 paddw m0, m4 | |
2869 paddw m5, m3 | |
2870 paddw m3, m6 | |
2871 paddw m4, m1 | |
2872 psraw m5, 5 | |
2873 psraw m0, 5 | |
2874 packuswb m5, m0 | |
2875 movu [r0], m5 | |
2876 lea r0, [r0 + r1] | |
2877 %endmacro | |
2878 | |
2879 INTRA_PRED_PLANAR16 0 | |
2880 INTRA_PRED_PLANAR16 1 | |
2881 INTRA_PRED_PLANAR16 2 | |
2882 INTRA_PRED_PLANAR16 3 | |
2883 INTRA_PRED_PLANAR16 4 | |
2884 INTRA_PRED_PLANAR16 5 | |
2885 INTRA_PRED_PLANAR16 6 | |
2886 INTRA_PRED_PLANAR16 7 | |
2887 INTRA_PRED_PLANAR16 8 | |
2888 INTRA_PRED_PLANAR16 9 | |
2889 INTRA_PRED_PLANAR16 10 | |
2890 INTRA_PRED_PLANAR16 11 | |
2891 INTRA_PRED_PLANAR16 12 | |
2892 INTRA_PRED_PLANAR16 13 | |
2893 INTRA_PRED_PLANAR16 14 | |
2894 INTRA_PRED_PLANAR16 15 | |
2895 RET | |
2896 | |
2897 ;--------------------------------------------------------------------------------------- | |
2898 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
2899 ;--------------------------------------------------------------------------------------- | |
; 16x16 planar prediction, AVX2 version; a 256-bit register holds one full row of 16 words, | |
; and each macro invocation produces two rows. | |
; m3 is the running row accumulator, m4 = (bottomLeft - above) its per-row increment, m0 = pw_planar16_mul, m5 = pw_00ff. | |
; NOTE(review): multiL is read here as 32 bytes, so this relies on how multiL is laid out outside this chunk. | |
2900 INIT_YMM avx2 | |
2901 cglobal intra_pred_planar16, 3,3,6 | |
; vpbroadcastw picks up two source bytes per word; the pand with m5 below keeps only the low byte. | |
2902 vpbroadcastw m3, [r2 + 17] | |
2903 mova m5, [pw_00ff] | |
2904 vpbroadcastw m4, [r2 + 49] | |
2905 mova m0, [pw_planar16_mul] | |
2906 pmovzxbw m2, [r2 + 1] | |
2907 pand m3, m5 ; v_topRight | |
2908 pand m4, m5 ; v_bottomLeft | |
2909 | |
2910 pmullw m3, [multiL] ; (x + 1) * topRight | |
2911 pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x] | |
2912 paddw m3, [pw_16] | |
2913 paddw m3, m4 | |
2914 paddw m3, m1 | |
2915 psubw m4, m2 | |
; r2 now points at the first left sample (srcPix + 33). | |
2916 add r2, 33 | |
2917 | |
; INTRA_PRED_PLANAR16_AVX2 y: loads the two bytes at [r2 + y] (left[y] in the low byte, left[y + 1] in the high byte), | |
; computes rows y and y + 1, stores them at [r0] and [r0 + r1], and advances r0 by two strides. Clobbers m1, m2. | |
2918 %macro INTRA_PRED_PLANAR16_AVX2 1 | |
2919 vpbroadcastw m1, [r2 + %1] | |
2920 vpsrlw m2, m1, 8 | |
2921 pand m1, m5 | |
2922 | |
2923 pmullw m1, m0 | |
2924 pmullw m2, m0 | |
2925 paddw m1, m3 | |
2926 paddw m3, m4 | |
2927 psraw m1, 5 | |
2928 paddw m2, m3 | |
2929 psraw m2, 5 | |
2930 paddw m3, m4 | |
; packuswb works per 128-bit lane, so the vpermq (qword order 0,2,1,3) regroups the result into | |
; row y in the low 128 bits and row y + 1 in the high 128 bits. | |
2931 packuswb m1, m2 | |
2932 vpermq m1, m1, 11011000b | |
2933 movu [r0], xm1 | |
2934 vextracti128 [r0 + r1], m1, 1 | |
2935 lea r0, [r0 + r1 * 2] | |
2936 %endmacro | |
2937 INTRA_PRED_PLANAR16_AVX2 0 | |
2938 INTRA_PRED_PLANAR16_AVX2 2 | |
2939 INTRA_PRED_PLANAR16_AVX2 4 | |
2940 INTRA_PRED_PLANAR16_AVX2 6 | |
2941 INTRA_PRED_PLANAR16_AVX2 8 | |
2942 INTRA_PRED_PLANAR16_AVX2 10 | |
2943 INTRA_PRED_PLANAR16_AVX2 12 | |
2944 INTRA_PRED_PLANAR16_AVX2 14 | |
2945 %undef INTRA_PRED_PLANAR16_AVX2 | |
2946 RET | |
2947 | |
2948 ;--------------------------------------------------------------------------------------- | |
2949 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
2950 ;--------------------------------------------------------------------------------------- | |
; 32x32 planar prediction, SSE4 version; every row is computed as four 8-word quarters. | |
; m0..m3 are the running accumulators for columns 0..7, 8..15, 16..23, 24..31; | |
; m8..m11 hold the matching per-row increments (bottomLeft - above). | |
; On x86-64 m8..m11 are registers; otherwise they are %define'd as four stack slots reserved by cglobal. | |
; NOTE(review): multiL/multiH/multiH2/multiH3, pw_31, pw_32, pw_planar32_mul and pw_planar16_mul are defined outside this chunk. | |
2951 INIT_XMM sse4 | |
2952 %if ARCH_X86_64 == 1 | |
2953 cglobal intra_pred_planar32, 3,4,12 | |
2954 %else | |
2955 cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize) | |
2956 %define m8 [rsp + 0 * mmsize] | |
2957 %define m9 [rsp + 1 * mmsize] | |
2958 %define m10 [rsp + 2 * mmsize] | |
2959 %define m11 [rsp + 3 * mmsize] | |
2960 %endif | |
2961 movd m3, [r2 + 33] ; topRight = above[32] | |
2962 | |
; m7 stays zero for the whole function: it is the broadcast mask for pshufb and the zero source for punpcklbw. | |
2963 pxor m7, m7 | |
2964 pshufb m3, m7 | |
2965 punpcklbw m3, m7 ; v_topRight | |
2966 | |
2967 pmullw m0, m3, [multiL] ; (x + 1) * topRight | |
2968 pmullw m1, m3, [multiH] ; (x + 1) * topRight | |
2969 pmullw m2, m3, [multiH2] ; (x + 1) * topRight | |
2970 pmullw m3, [multiH3] ; (x + 1) * topRight | |
2971 | |
2972 movd m6, [r2 + 97] ; bottomLeft = left[32] | |
2973 pshufb m6, m7 | |
2974 punpcklbw m6, m7 ; v_bottomLeft | |
2975 | |
2976 paddw m0, m6 | |
2977 paddw m1, m6 | |
2978 paddw m2, m6 | |
2979 paddw m3, m6 | |
2980 paddw m0, [pw_32] | |
2981 paddw m1, [pw_32] | |
2982 paddw m2, [pw_32] | |
2983 paddw m3, [pw_32] | |
; For each 8-column quarter: add pw_31 * above to the accumulator and save (bottomLeft - above) as its per-row increment. | |
2984 pmovzxbw m4, [r2 + 1] | |
2985 pmullw m5, m4, [pw_31] | |
2986 paddw m0, m5 | |
2987 psubw m5, m6, m4 | |
2988 mova m8, m5 | |
2989 pmovzxbw m4, [r2 + 9] | |
2990 pmullw m5, m4, [pw_31] | |
2991 paddw m1, m5 | |
2992 psubw m5, m6, m4 | |
2993 mova m9, m5 | |
2994 pmovzxbw m4, [r2 + 17] | |
2995 pmullw m5, m4, [pw_31] | |
2996 paddw m2, m5 | |
2997 psubw m5, m6, m4 | |
2998 mova m10, m5 | |
2999 pmovzxbw m4, [r2 + 25] | |
3000 pmullw m5, m4, [pw_31] | |
3001 paddw m3, m5 | |
3002 psubw m5, m6, m4 | |
3003 mova m11, m5 | |
3004 add r2, 65 ; (2 * blkSize + 1) | |
3005 | |
; INTRA_PRED_PLANAR32: broadcast the byte at [r2] (current left sample) to words, emit one 32-pixel row at [r0] | |
; as two 16-byte stores, then advance r0 by one stride, r2 by one byte and all four accumulators. Clobbers m4, m5, m6. | |
3006 %macro INTRA_PRED_PLANAR32 0 | |
3007 movd m4, [r2] | |
3008 pshufb m4, m7 | |
3009 punpcklbw m4, m7 | |
3010 pmullw m5, m4, [pw_planar32_mul] | |
3011 pmullw m6, m4, [pw_planar32_mul + mmsize] | |
3012 paddw m5, m0 | |
3013 paddw m6, m1 | |
3014 paddw m0, m8 | |
3015 paddw m1, m9 | |
3016 psraw m5, 6 | |
3017 psraw m6, 6 | |
3018 packuswb m5, m6 | |
3019 movu [r0], m5 | |
3020 pmullw m5, m4, [pw_planar16_mul] | |
3021 pmullw m4, [pw_planar16_mul + mmsize] | |
3022 paddw m5, m2 | |
3023 paddw m4, m3 | |
3024 paddw m2, m10 | |
3025 paddw m3, m11 | |
3026 psraw m5, 6 | |
3027 psraw m4, 6 | |
3028 packuswb m5, m4 | |
3029 movu [r0 + 16], m5 | |
3030 | |
3031 lea r0, [r0 + r1] | |
3032 inc r2 | |
3033 %endmacro | |
3034 | |
; 4 loop iterations x 8 rows each = 32 rows. | |
3035 mov r3, 4 | |
3036 .loop: | |
3037 INTRA_PRED_PLANAR32 | |
3038 INTRA_PRED_PLANAR32 | |
3039 INTRA_PRED_PLANAR32 | |
3040 INTRA_PRED_PLANAR32 | |
3041 INTRA_PRED_PLANAR32 | |
3042 INTRA_PRED_PLANAR32 | |
3043 INTRA_PRED_PLANAR32 | |
3044 INTRA_PRED_PLANAR32 | |
3045 dec r3 | |
3046 jnz .loop | |
3047 RET | |
3048 | |
3049 ;--------------------------------------------------------------------------------------- | |
3050 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) | |
3051 ;--------------------------------------------------------------------------------------- | |
; 32x32 planar prediction, AVX2 version (assembled only when ARCH_X86_64 == 1). | |
; Every row is computed as two 16-word halves: m0 / m3 are the running accumulators for columns 0..15 / 16..31, | |
; m1 / m2 their per-row increments (bottomLeft - above); m9 = pw_planar32_mul, m10 = pw_planar16_mul, m6 = pw_00ff. | |
; Each macro invocation produces two rows; 16 invocations cover the block. | |
; NOTE(review): cglobal declares 4 general-purpose registers, but only r0..r2 are referenced in this body. | |
3052 %if ARCH_X86_64 == 1 | |
3053 INIT_YMM avx2 | |
3054 cglobal intra_pred_planar32, 3,4,11 | |
3055 mova m6, [pw_00ff] | |
; vpbroadcastw picks up two source bytes per word; the pand with m6 below keeps only the low byte. | |
3056 vpbroadcastw m3, [r2 + 33] ; topRight = above[32] | |
3057 vpbroadcastw m2, [r2 + 97] ; bottomLeft = left[32] | |
3058 pand m3, m6 | |
3059 pand m2, m6 | |
3060 | |
3061 pmullw m0, m3, [multiL] ; (x + 1) * topRight | |
3062 pmullw m3, [multiH2] ; (x + 1) * topRight | |
3063 | |
3064 paddw m0, m2 | |
3065 paddw m3, m2 | |
3066 paddw m0, [pw_32] | |
3067 paddw m3, [pw_32] | |
3068 | |
; m4 = above[0..15], m1 = above[16..31] as words. | |
3069 pmovzxbw m4, [r2 + 1] | |
3070 pmovzxbw m1, [r2 + 17] | |
3071 pmullw m5, m4, [pw_31] | |
3072 paddw m0, m5 | |
; m5 = bottomLeft - above[0..15] (moved into m1 below); m2 becomes bottomLeft - above[16..31]. | |
3073 psubw m5, m2, m4 | |
3074 psubw m2, m1 | |
3075 pmullw m1, [pw_31] | |
3076 paddw m3, m1 | |
3077 mova m1, m5 | |
3078 | |
3079 add r2, 65 ; (2 * blkSize + 1) | |
3080 mova m9, [pw_planar32_mul] | |
3081 mova m10, [pw_planar16_mul] | |
3082 | |
; INTRA_PRED_PLANAR32_AVX2: loads the two bytes at [r2] (two consecutive left samples), computes two rows, | |
; stores them at [r0] and [r0 + r1], then advances r2 by 2 and r0 by two strides. Clobbers m4, m5, m7, m8. | |
3083 %macro INTRA_PRED_PLANAR32_AVX2 0 | |
3084 vpbroadcastw m4, [r2] | |
3085 vpsrlw m7, m4, 8 | |
3086 pand m4, m6 | |
3087 | |
3088 pmullw m5, m4, m9 | |
3089 pmullw m4, m4, m10 | |
3090 paddw m5, m0 | |
3091 paddw m4, m3 | |
3092 paddw m0, m1 | |
3093 paddw m3, m2 | |
3094 psraw m5, 6 | |
3095 psraw m4, 6 | |
3096 packuswb m5, m4 | |
3097 pmullw m8, m7, m9 | |
3098 pmullw m7, m7, m10 | |
; packuswb works per 128-bit lane; vpermq (qword order 0,2,1,3) restores left-to-right pixel order. | |
3099 vpermq m5, m5, 11011000b | |
3100 paddw m8, m0 | |
3101 paddw m7, m3 | |
3102 paddw m0, m1 | |
3103 paddw m3, m2 | |
3104 psraw m8, 6 | |
3105 psraw m7, 6 | |
3106 packuswb m8, m7 | |
3107 add r2, 2 | |
3108 vpermq m8, m8, 11011000b | |
3109 | |
3110 movu [r0], m5 | |
3111 movu [r0 + r1], m8 | |
3112 lea r0, [r0 + r1 * 2] | |
3113 %endmacro | |
3114 INTRA_PRED_PLANAR32_AVX2 | |
3115 INTRA_PRED_PLANAR32_AVX2 | |
3116 INTRA_PRED_PLANAR32_AVX2 | |
3117 INTRA_PRED_PLANAR32_AVX2 | |
3118 INTRA_PRED_PLANAR32_AVX2 | |
3119 INTRA_PRED_PLANAR32_AVX2 | |
3120 INTRA_PRED_PLANAR32_AVX2 | |
3121 INTRA_PRED_PLANAR32_AVX2 | |
3122 INTRA_PRED_PLANAR32_AVX2 | |
3123 INTRA_PRED_PLANAR32_AVX2 | |
3124 INTRA_PRED_PLANAR32_AVX2 | |
3125 INTRA_PRED_PLANAR32_AVX2 | |
3126 INTRA_PRED_PLANAR32_AVX2 | |
3127 INTRA_PRED_PLANAR32_AVX2 | |
3128 INTRA_PRED_PLANAR32_AVX2 | |
3129 INTRA_PRED_PLANAR32_AVX2 | |
3130 %undef INTRA_PRED_PLANAR32_AVX2 | |
3131 RET | |
3132 %endif ;; ARCH_X86_64 == 1 | |
3133 | |
3134 ;----------------------------------------------------------------------------------------- | |
3135 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
3136 ;----------------------------------------------------------------------------------------- | |
; 4x4 angular prediction shared by modes 2 and 34: a pure copy, no interpolation. | |
; Source is srcPix + 2 when dirMode (r3m) == 34, otherwise srcPix + 10. | |
; Output row y is the 4 bytes starting y bytes into the 8 bytes loaded from that source. | |
3137 INIT_XMM ssse3 | |
3138 cglobal intra_pred_ang4_2, 3,5,3 | |
3139 lea r4, [r2 + 2] | |
3140 add r2, 10 | |
3141 cmp r3m, byte 34 | |
3142 cmove r2, r4 | |
3143 | |
3144 movh m0, [r2] | |
3145 movd [r0], m0 | |
; The previous contents of m1 / m2 only enter the top bytes of the palignr result; the movd stores use the low 4 bytes only. | |
3146 palignr m1, m0, 1 | |
3147 movd [r0 + r1], m1 | |
3148 palignr m2, m0, 2 | |
3149 movd [r0 + r1 * 2], m2 | |
3150 lea r1, [r1 * 3] | |
3151 psrldq m0, 3 | |
3152 movd [r0 + r1], m0 | |
3153 RET | |
3154 | |
; 4x4 angular prediction shared by modes 3 and 33. | |
; Reads 8 reference bytes at srcPix + 1 when dirMode (r3m) == 33, otherwise at srcPix + 9. | |
; This function also hosts .do_filter4x4, the shared tail that intra_pred_ang4_4 .. intra_pred_ang4_9 jump into. | |
; The ZF produced by 'cmp r3m, ...' is consumed much later by 'jz .store', so no instruction in between may write EFLAGS. | |
3155 INIT_XMM sse4 | |
3156 cglobal intra_pred_ang4_3, 3,5,5 | |
3157 mov r4, 1 | |
3158 cmp r3m, byte 33 | |
3159 mov r3, 9 | |
3160 cmove r3, r4 | |
3161 | |
3162 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] | |
3163 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] | |
3164 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3165 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] | |
3166 palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] | |
3167 palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4] | |
3168 punpcklqdq m0, m1 | |
3169 punpcklqdq m2, m3 | |
3170 | |
3171 lea r3, [ang_table + 20 * 16] | |
3172 movh m3, [r3 + 6 * 16] ; [26] | |
3173 movhps m3, [r3] ; [20] | |
3174 movh m4, [r3 - 6 * 16] ; [14] | |
3175 movhps m4, [r3 - 12 * 16] ; [ 8] | |
; Jump over the ALIGN padding below straight to the shared tail. | |
3176 jmp .do_filter4x4 | |
3177 | |
3178 ; NOTE: shared path; inputs are m0 = sample pairs for rows [1 0], m2 = sample pairs for rows [3 2], m3/m4 = coefficients, ZF set = no transpose | |
3179 ALIGN 16 | |
3180 .do_filter4x4: | |
; pmaddubsw + pmulhrsw by [pw_1024] computes (pair . coefficients + 16) >> 5, assuming pw_1024 holds 1024 in every word. | |
3181 mova m1, [pw_1024] | |
3182 | |
3183 pmaddubsw m0, m3 | |
3184 pmulhrsw m0, m1 | |
3185 pmaddubsw m2, m4 | |
3186 pmulhrsw m2, m1 | |
3187 packuswb m0, m2 | |
3188 | |
3189 ; NOTE: the mode matched by the entry point's cmp (33 for ang4_3) is stored without the transpose. FRAGILE: ZF still comes from that cmp, so no instruction before this point may modify EFLAGS | |
3190 jz .store | |
3191 | |
3192 ; transpose 4x4 | |
3193 pshufb m0, [c_trans_4x4] | |
3194 | |
3195 .store: | |
3196 ; store the four 4-byte rows of m0 (row 0 with movd, rows 1-3 with pextrd) | |
3197 movd [r0], m0 | |
3198 pextrd [r0 + r1], m0, 1 | |
3199 pextrd [r0 + r1 * 2], m0, 2 | |
3200 lea r1, [r1 * 3] | |
3201 pextrd [r0 + r1], m0, 3 | |
3202 RET | |
3203 | |
; 4x4 angular prediction shared by modes 4 and 32. | |
; Reads 8 reference bytes at srcPix + 1 when dirMode (r3m) == 32, otherwise at srcPix + 9, | |
; prepares sample pairs (m0, m2) and coefficients (m3, m4: ang_table entries 21, 10, 31, 20), | |
; then tail-jumps into intra_pred_ang4_3's .do_filter4x4, which still reads ZF from the cmp below. | |
3204 cglobal intra_pred_ang4_4, 3,5,5 | |
; r4 = 1; done before the cmp so that the flags consumed later are the cmp's. | |
3205 xor r4, r4 | |
3206 inc r4 | |
3207 cmp r3m, byte 32 | |
3208 mov r3, 9 | |
3209 cmove r3, r4 | |
3210 | |
3211 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] | |
3212 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] | |
3213 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3214 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] | |
3215 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] | |
3216 punpcklqdq m0, m1 | |
3217 punpcklqdq m2, m1, m3 | |
3218 | |
3219 lea r3, [ang_table + 18 * 16] | |
3220 movh m3, [r3 + 3 * 16] ; [21] | |
3221 movhps m3, [r3 - 8 * 16] ; [10] | |
3222 movh m4, [r3 + 13 * 16] ; [31] | |
3223 movhps m4, [r3 + 2 * 16] ; [20] | |
3224 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3225 | |
; 4x4 angular prediction shared by modes 5 and 31. | |
; Reads 8 reference bytes at srcPix + 1 when dirMode (r3m) == 31, otherwise at srcPix + 9, | |
; prepares sample pairs (m0, m2) and coefficients (m3, m4: ang_table entries 17, 2, 19, 4), | |
; then tail-jumps into intra_pred_ang4_3's .do_filter4x4, which still reads ZF from the cmp below. | |
3226 cglobal intra_pred_ang4_5, 3,5,5 | |
; r4 = 1; done before the cmp so that the flags consumed later are the cmp's. | |
3227 xor r4, r4 | |
3228 inc r4 | |
3229 cmp r3m, byte 31 | |
3230 mov r3, 9 | |
3231 cmove r3, r4 | |
3232 | |
3233 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] | |
3234 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] | |
3235 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3236 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] | |
3237 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] | |
3238 punpcklqdq m0, m1 | |
3239 punpcklqdq m2, m1, m3 | |
3240 | |
3241 lea r3, [ang_table + 10 * 16] | |
3242 movh m3, [r3 + 7 * 16] ; [17] | |
3243 movhps m3, [r3 - 8 * 16] ; [ 2] | |
3244 movh m4, [r3 + 9 * 16] ; [19] | |
3245 movhps m4, [r3 - 6 * 16] ; [ 4] | |
3246 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3247 | |
; 4x4 angular prediction shared by modes 6 and 30. | |
; Reads 8 reference bytes at srcPix + 1 when dirMode (r3m) == 30, otherwise at srcPix + 9. | |
; Rows 0/1 share one sample-pair vector (m0) and rows 2/3 share the next one (m2); coefficients are ang_table entries 13, 26, 7, 20. | |
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4, which still reads ZF from the cmp below. | |
3248 cglobal intra_pred_ang4_6, 3,5,5 | |
; r4 = 1; done before the cmp so that the flags consumed later are the cmp's. | |
3249 xor r4, r4 | |
3250 inc r4 | |
3251 cmp r3m, byte 30 | |
3252 mov r3, 9 | |
3253 cmove r3, r4 | |
3254 | |
3255 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] | |
3256 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] | |
3257 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3258 palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] | |
; Duplicate the low 8 bytes into both halves so one register feeds two rows. | |
3259 punpcklqdq m0, m0 | |
3260 punpcklqdq m2, m2 | |
3261 | |
3262 lea r3, [ang_table + 19 * 16] | |
3263 movh m3, [r3 - 6 * 16] ; [13] | |
3264 movhps m3, [r3 + 7 * 16] ; [26] | |
3265 movh m4, [r3 - 12 * 16] ; [ 7] | |
3266 movhps m4, [r3 + 1 * 16] ; [20] | |
3267 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3268 | |
; 4x4 angular prediction shared by modes 7 and 29. | |
; Reads 8 reference bytes at srcPix + 1 when dirMode (r3m) == 29, otherwise at srcPix + 9. | |
; Rows 0..2 use the unshifted sample pairs and row 3 the pairs shifted by one sample; coefficients are ang_table entries 9, 18, 27, 4. | |
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4, which still reads ZF from the cmp below. | |
3269 cglobal intra_pred_ang4_7, 3,5,5 | |
; r4 = 1; done before the cmp so that the flags consumed later are the cmp's. | |
3270 xor r4, r4 | |
3271 inc r4 | |
3272 cmp r3m, byte 29 | |
3273 mov r3, 9 | |
3274 cmove r3, r4 | |
3275 | |
3276 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] | |
3277 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] | |
3278 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3279 palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] | |
; m2 = [shifted pairs : unshifted pairs]; m0 = unshifted pairs in both halves. | |
3280 punpcklqdq m2, m0, m3 | |
3281 punpcklqdq m0, m0 | |
3282 | |
3283 lea r3, [ang_table + 20 * 16] | |
3284 movh m3, [r3 - 11 * 16] ; [ 9] | |
3285 movhps m3, [r3 - 2 * 16] ; [18] | |
3286 movh m4, [r3 + 7 * 16] ; [27] | |
3287 movhps m4, [r3 - 16 * 16] ; [ 4] | |
3288 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3289 | |
; 4x4 angular prediction shared by modes 8 and 28. | |
; Reads 8 reference bytes at srcPix + 1 when dirMode (r3m) == 28, otherwise at srcPix + 9. | |
; All four rows use the same unshifted sample pairs; only the coefficients differ (ang_table entries 5, 10, 15, 20). | |
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4, which still reads ZF from the cmp below. | |
3290 cglobal intra_pred_ang4_8, 3,5,5 | |
; r4 = 1; done before the cmp so that the flags consumed later are the cmp's. | |
3291 xor r4, r4 | |
3292 inc r4 | |
3293 cmp r3m, byte 28 | |
3294 mov r3, 9 | |
3295 cmove r3, r4 | |
3296 | |
3297 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] | |
3298 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] | |
3299 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3300 punpcklqdq m0, m0 | |
3301 mova m2, m0 | |
3302 | |
3303 lea r3, [ang_table + 13 * 16] | |
3304 movh m3, [r3 - 8 * 16] ; [ 5] | |
3305 movhps m3, [r3 - 3 * 16] ; [10] | |
3306 movh m4, [r3 + 2 * 16] ; [15] | |
3307 movhps m4, [r3 + 7 * 16] ; [20] | |
3308 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3309 | |
; 4x4 angular prediction shared by modes 9 and 27. | |
; Reads 8 reference bytes at srcPix + 1 when dirMode (r3m) == 27, otherwise at srcPix + 9. | |
; All four rows use the same unshifted sample pairs; only the coefficients differ (ang_table entries 2, 4, 6, 8). | |
; Tail-jumps into intra_pred_ang4_3's .do_filter4x4, which still reads ZF from the cmp below. | |
3310 cglobal intra_pred_ang4_9, 3,5,5 | |
; r4 = 1; done before the cmp so that the flags consumed later are the cmp's. | |
3311 xor r4, r4 | |
3312 inc r4 | |
3313 cmp r3m, byte 27 | |
3314 mov r3, 9 | |
3315 cmove r3, r4 | |
3316 | |
3317 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] | |
3318 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] | |
3319 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3320 punpcklqdq m0, m0 | |
3321 mova m2, m0 | |
3322 | |
3323 lea r3, [ang_table + 4 * 16] | |
3324 movh m3, [r3 - 2 * 16] ; [ 2] | |
3325 movhps m3, [r3 - 0 * 16] ; [ 4] | |
3326 movh m4, [r3 + 2 * 16] ; [ 6] | |
3327 movhps m4, [r3 + 4 * 16] ; [ 8] | |
3328 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3329 | |
; intra_pred_ang4_10 -- 4x4 prediction that fills every output line with one
; reference pixel taken from src + 9 .. src + 12.
; In:  r0 = dst, r1 = dst stride, r2 = reference pixels, r4m = filter flag.
; Lines 1..3 are stored first; line 0 is stored last at .quit, either as is or
; after the optional filter has adjusted it.
; NOTE(review): pb_unpackbd1, pb_0_8 and pb_unpackbw1 are defined outside this
; section; their effect is described from the existing inline comments -- confirm.
3330 cglobal intra_pred_ang4_10, 3,3,4 | |
3331 movd m0, [r2 + 9] ; 4 bytes loaded: [4 3 2 1] | |
; presumably replicates each of the 4 loaded bytes across one dword
3332 pshufb m0, [pb_unpackbd1] | |
; m1/m2/m3 = dwords 1/2/3 of m0, i.e. output lines 1, 2 and 3
3333 pshufd m1, m0, 1 | |
3334 movhlps m2, m0 | |
3335 pshufd m3, m0, 3 | |
3336 movd [r0 + r1], m1 | |
3337 movd [r0 + r1 * 2], m2 | |
; r1 becomes 3 * stride from here on
3338 lea r1, [r1 * 3] | |
3339 movd [r0 + r1], m3 | |
; skip the filter when the fifth argument is zero
3340 cmp r4m, byte 0 | |
3341 jz .quit | |
3342 | |
3343 ; filter | |
; line0[x] = saturate_u8(m0[x] + ((src[1 + x] - src[0]) >> 1)), computed on words
3344 pmovzxbw m0, m0 ; [-1 -1 -1 -1] | |
3345 movh m1, [r2] ; [4 3 2 1 0] | |
3346 pshufb m2, m1, [pb_0_8] ; [0 0 0 0] | |
3347 pshufb m1, [pb_unpackbw1] ; [4 3 2 1] | |
3348 psubw m1, m2 | |
3349 psraw m1, 1 | |
3350 paddw m0, m1 | |
3351 packuswb m0, m0 | |
3352 .quit: | |
; store output line 0 (low dword of m0)
3353 movd [r0], m0 | |
3354 RET | |
3355 | |
; intra_pred_ang4_26 -- 4x4 prediction that copies the 4 reference pixels at
; src + 1 into every output line.
; In:  r0 = dst, r1 = dst stride, r2 = reference pixels, r4m = filter flag.
; When the filter flag is non-zero, the first byte of each output line is
; rewritten afterwards with a filtered value.
; NOTE(review): pb_0_8 and pb_unpackbw1 are defined outside this section; their
; effect is described from the existing inline comments -- confirm.
3356 INIT_XMM sse4 | |
3357 cglobal intra_pred_ang4_26, 3,4,3 | |
3358 movd m0, [r2 + 1] ; 4 bytes loaded: [4 3 2 1] | |
3359 | |
3360 ; store | |
3361 movd [r0], m0 | |
3362 movd [r0 + r1], m0 | |
3363 movd [r0 + r1 * 2], m0 | |
; r3 = 3 * stride (dirMode is not needed by this kernel)
3364 lea r3, [r1 * 3] | |
3365 movd [r0 + r3], m0 | |
3366 | |
3367 ; filter | |
3368 cmp r4m, byte 0 | |
3369 jz .quit | |
3370 | |
; column0[y] = saturate_u8(src[1] + ((src[9 + y] - src[0]) >> 1)), computed on words
3371 pshufb m0, [pb_0_8] ; [ 1 1 1 1] | |
3372 movh m1, [r2 + 8] ; [-4 -3 -2 -1 0] | |
; byte 0 of the load is replaced by src[0]
3373 pinsrb m1, [r2], 0 | |
3374 pshufb m2, m1, [pb_0_8] ; [0 0 0 0] | |
3375 pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1] | |
3376 psubw m1, m2 | |
3377 psraw m1, 1 | |
3378 paddw m0, m1 | |
3379 packuswb m0, m0 | |
3380 | |
; one filtered byte per output line, written to the first column
3381 pextrb [r0], m0, 0 | |
3382 pextrb [r0 + r1], m0, 1 | |
3383 pextrb [r0 + r1 * 2], m0, 2 | |
3384 pextrb [r0 + r3], m0, 3 | |
3385 .quit: | |
3386 RET | |
3387 | |
; intra_pred_ang4_11 -- 4x4 angular intra prediction, entered for dirMode 11 and 25.
; In:  r2 = reference pixels, r3m = dirMode.
; Out: m0 = m2 = neighbour pairs starting at src[0] (duplicated in both halves),
;      m3/m4 = low 8 bytes of ang_table rows 30, 28 / 26, 24.
;      Control continues in .do_filter4x4 of intra_pred_ang4_3 (not shown here).
3388 cglobal intra_pred_ang4_11, 3,5,5 | |
3389 xor r4, r4 | |
; r3 = 0 when dirMode == 25, otherwise 8; ZF stays valid up to the final jmp
3390 cmp r3m, byte 25 | |
3391 mov r3, 8 | |
3392 cmove r3, r4 | |
3393 | |
3394 movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] | |
; byte 0 always comes from src[0], whichever offset was selected
3395 pinsrb m0, [r2], 0 | |
3396 palignr m1, m0, 1 ; [x x x x 4 3 2 1] | |
3397 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] | |
3398 punpcklqdq m0, m0 | |
3399 mova m2, m0 | |
3400 | |
; ang_table rows are addressed 16 bytes apart, relative to row 24
3401 lea r3, [ang_table + 24 * 16] | |
3402 | |
3403 movh m3, [r3 + 6 * 16] ; [30] | |
3404 movhps m3, [r3 + 4 * 16] ; [28] | |
3405 movh m4, [r3 + 2 * 16] ; [26] | |
3406 movhps m4, [r3 + 0 * 16] ; [24] | |
3407 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3408 | |
; intra_pred_ang4_12 -- 4x4 angular intra prediction, entered for dirMode 12 and 24.
; In:  r2 = reference pixels, r3m = dirMode.
; Out: m0 = m2 = neighbour pairs starting at src[0] (duplicated in both halves),
;      m3/m4 = low 8 bytes of ang_table rows 27, 22 / 17, 12.
;      Control continues in .do_filter4x4 of intra_pred_ang4_3 (not shown here).
3409 cglobal intra_pred_ang4_12, 3,5,5 | |
3410 xor r4, r4 | |
; r3 = 0 when dirMode == 24, otherwise 8; ZF stays valid up to the final jmp
3411 cmp r3m, byte 24 | |
3412 mov r3, 8 | |
3413 cmove r3, r4 | |
3414 | |
3415 movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] | |
; byte 0 always comes from src[0], whichever offset was selected
3416 pinsrb m0, [r2], 0 | |
3417 palignr m1, m0, 1 ; [x x x x 4 3 2 1] | |
3418 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] | |
3419 punpcklqdq m0, m0 | |
3420 mova m2, m0 | |
3421 | |
; ang_table rows are addressed 16 bytes apart, relative to row 20
3422 lea r3, [ang_table + 20 * 16] | |
3423 movh m3, [r3 + 7 * 16] ; [27] | |
3424 movhps m3, [r3 + 2 * 16] ; [22] | |
3425 movh m4, [r3 - 3 * 16] ; [17] | |
3426 movhps m4, [r3 - 8 * 16] ; [12] | |
3427 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3428 | |
; intra_pred_ang4_13 -- 4x4 angular intra prediction, entered for dirMode 13 and 23.
; In:  r2 = reference pixels, r3m = dirMode.
; The main reference run is read at offset r4 and one extra pixel is fetched
; from the other half of the array at offset r3 + 4.
; Out: m0 = pairs starting at src[0] (both halves), m2 = the same pairs (low)
;      and the pairs shifted one pixel towards the extra pixel (high),
;      m3/m4 = low 8 bytes of ang_table rows 23, 14 / 5, 28.
;      Control continues in .do_filter4x4 of intra_pred_ang4_3 (not shown here).
; NOTE(review): cglobal asks for 4 loaded arguments although r3 is overwritten
; before it is read (dirMode is read through r3m); ang4_16/17 ask for 3.
3429 cglobal intra_pred_ang4_13, 4,5,5 | |
3430 xor r4, r4 | |
; dirMode == 23: r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; mov/jz/xchg leave the flags alone, so ZF stays valid up to the final jmp.
3431 cmp r3m, byte 23 | |
3432 mov r3, 8 | |
3433 jz .next | |
3434 xchg r3, r4 | |
3435 .next: | |
; The load starts at r2 + r4 - 1 (one byte before r2 when r4 == 0). That first
; byte is only a placeholder: it is shifted out of m0/m2 and overwritten in m1.
3436 movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] | |
3437 pinsrb m1, [r2], 1 | |
3438 palignr m0, m1, 1 ; [x x x 4 3 2 1 0] | |
3439 palignr m2, m1, 2 ; [x x x x 4 3 2 1] | |
; extra pixel taken from the other half of the reference array
3440 pinsrb m1, [r2 + r3 + 4], 0 | |
3441 punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x] | |
3442 punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0] | |
3443 punpcklqdq m2, m0, m1 | |
3444 punpcklqdq m0, m0 | |
3445 | |
; ang_table rows are addressed 16 bytes apart, relative to row 21
3446 lea r3, [ang_table + 21 * 16] | |
3447 movh m3, [r3 + 2 * 16] ; [23] | |
3448 movhps m3, [r3 - 7 * 16] ; [14] | |
3449 movh m4, [r3 - 16 * 16] ; [ 5] | |
3450 movhps m4, [r3 + 7 * 16] ; [28] | |
3451 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3452 | |
; intra_pred_ang4_14 -- 4x4 angular intra prediction, entered for dirMode 14 and 22.
; In:  r2 = reference pixels, r3m = dirMode.
; The main reference run is read at offset r4 and one extra pixel is fetched
; from the other half of the array at offset r3 + 2.
; Out: m0 = pairs starting at src[0] (both halves), m2 = pairs shifted one pixel
;      towards the extra pixel (both halves),
;      m3/m4 = low 8 bytes of ang_table rows 19, 6 / 25, 12.
;      Control continues in .do_filter4x4 of intra_pred_ang4_3 (not shown here).
3453 cglobal intra_pred_ang4_14, 4,5,5 | |
3454 xor r4, r4 | |
; dirMode == 22: r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; ZF stays valid up to the final jmp.
3455 cmp r3m, byte 22 | |
3456 mov r3, 8 | |
3457 jz .next | |
3458 xchg r3, r4 | |
3459 .next: | |
; The load starts one byte before r2 + r4; that byte is a placeholder which is
; shifted out of m0/m1 and overwritten in m2 below.
3460 movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] | |
3461 pinsrb m2, [r2], 1 | |
3462 palignr m0, m2, 1 ; [x x x 4 3 2 1 0] | |
3463 palignr m1, m2, 2 ; [x x x x 4 3 2 1] | |
; extra pixel taken from the other half of the reference array
3464 pinsrb m2, [r2 + r3 + 2], 0 | |
3465 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] | |
3466 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] | |
3467 punpcklqdq m0, m0 | |
3468 punpcklqdq m2, m2 | |
3469 | |
; ang_table rows are addressed 16 bytes apart, relative to row 19
3470 lea r3, [ang_table + 19 * 16] | |
3471 movh m3, [r3 + 0 * 16] ; [19] | |
3472 movhps m3, [r3 - 13 * 16] ; [ 6] | |
3473 movh m4, [r3 + 6 * 16] ; [25] | |
3474 movhps m4, [r3 - 7 * 16] ; [12] | |
3475 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3476 | |
; intra_pred_ang4_15 -- 4x4 angular intra prediction, entered for dirMode 15 and 21.
; In:  r2 = reference pixels, r3m = dirMode.
; The main reference run is read at offset r4; two extra pixels are fetched from
; the other half of the array at offsets r3 + 2 and r3 + 4.
; Out: m0 = pairs at shift 0 (low) / shift 1 (high), m2 = pairs at shift 1 (low)
;      / shift 2 (high), where "shift" counts pixels moved towards the extras;
;      m3/m4 = low 8 bytes of ang_table rows 15, 30 / 13, 28.
;      Control continues in .do_filter4x4 of intra_pred_ang4_3 (not shown here).
3477 cglobal intra_pred_ang4_15, 4,5,5 | |
3478 xor r4, r4 | |
; dirMode == 21: r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; ZF stays valid up to the final jmp.
3479 cmp r3m, byte 21 | |
3480 mov r3, 8 | |
3481 jz .next | |
3482 xchg r3, r4 | |
3483 .next: | |
; The load starts one byte before r2 + r4; that byte is a placeholder which is
; shifted out of m0/m1 and overwritten in m2 below.
3484 movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] | |
3485 pinsrb m2, [r2], 1 | |
3486 palignr m0, m2, 1 ; [x x x 4 3 2 1 0] | |
3487 palignr m1, m2, 2 ; [x x x x 4 3 2 1] | |
; first extra pixel (x)
3488 pinsrb m2, [r2 + r3 + 2], 0 | |
3489 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] | |
; second extra pixel (y)
3490 pinsrb m3, [r2 + r3 + 4], 0 | |
3491 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] | |
3492 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] | |
3493 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] | |
3494 punpcklqdq m0, m2 | |
3495 punpcklqdq m2, m4 | |
3496 | |
; m3/m4 are reused for the weights; rows are addressed relative to row 23
3497 lea r3, [ang_table + 23 * 16] | |
3498 movh m3, [r3 - 8 * 16] ; [15] | |
3499 movhps m3, [r3 + 7 * 16] ; [30] | |
3500 movh m4, [r3 - 10 * 16] ; [13] | |
3501 movhps m4, [r3 + 5 * 16] ; [28] | |
3502 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3503 | |
; intra_pred_ang4_16 -- 4x4 angular intra prediction, entered for dirMode 16 and 20.
; In:  r2 = reference pixels, r3m = dirMode.
; The main reference run is read at offset r4; two extra pixels are fetched from
; the other half of the array at offsets r3 + 2 and r3 + 3.
; Out: m0 = pairs at shift 0 (low) / shift 1 (high), m2 = pairs at shift 1 (low)
;      / shift 2 (high), where "shift" counts pixels moved towards the extras;
;      m3/m4 = low 8 bytes of ang_table rows 11, 22 / 1, 12.
;      Control continues in .do_filter4x4 of intra_pred_ang4_3 (not shown here).
3504 cglobal intra_pred_ang4_16, 3,5,5 | |
3505 xor r4, r4 | |
; dirMode == 20: r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; ZF stays valid up to the final jmp.
3506 cmp r3m, byte 20 | |
3507 mov r3, 8 | |
3508 jz .next | |
3509 xchg r3, r4 | |
3510 .next: | |
; The load starts one byte before r2 + r4; that byte is a placeholder which is
; shifted out of m0/m1 and overwritten in m2 below.
3511 movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] | |
3512 pinsrb m2, [r2], 1 | |
3513 palignr m0, m2, 1 ; [x x x 4 3 2 1 0] | |
3514 palignr m1, m2, 2 ; [x x x x 4 3 2 1] | |
; first extra pixel (x)
3515 pinsrb m2, [r2 + r3 + 2], 0 | |
3516 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] | |
; second extra pixel (y)
3517 pinsrb m3, [r2 + r3 + 3], 0 | |
3518 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] | |
3519 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] | |
3520 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] | |
3521 punpcklqdq m0, m2 | |
3522 punpcklqdq m2, m4 | |
3523 | |
; m3/m4 are reused for the weights; rows are addressed relative to row 19
3524 lea r3, [ang_table + 19 * 16] | |
3525 movh m3, [r3 - 8 * 16] ; [11] | |
3526 movhps m3, [r3 + 3 * 16] ; [22] | |
3527 movh m4, [r3 - 18 * 16] ; [ 1] | |
3528 movhps m4, [r3 - 7 * 16] ; [12] | |
3529 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3530 | |
; intra_pred_ang4_17 -- 4x4 angular intra prediction, entered for dirMode 17 and 19.
; In:  r2 = reference pixels, r3m = dirMode.
; The main reference run is read at offset r4; three extra pixels are fetched
; from the other half of the array at offsets r3 + 1, r3 + 2 and r3 + 4.
; Out: m0 = pairs at shift 0 (low) / shift 1 (high), m2 = pairs at shift 2 (low)
;      / shift 3 (high), where "shift" counts pixels moved towards the extras;
;      m3/m4 = low 8 bytes of ang_table rows 6, 12 / 18, 24.
;      Control continues in .do_filter4x4 of intra_pred_ang4_3 (not shown here).
3531 cglobal intra_pred_ang4_17, 3,5,5 | |
3532 xor r4, r4 | |
; dirMode == 19: r3 = 8, r4 = 0; otherwise r3 = 0, r4 = 8.
; ZF stays valid up to the final jmp.
3533 cmp r3m, byte 19 | |
3534 mov r3, 8 | |
3535 jz .next | |
3536 xchg r3, r4 | |
3537 .next: | |
; The load starts one byte before r2 + r4; that byte is a placeholder which is
; shifted out of m0/m1 and overwritten in m3 below.
3538 movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x] | |
3539 pinsrb m3, [r2], 1 | |
3540 palignr m0, m3, 1 ; [- - - 4 3 2 1 0] | |
3541 palignr m1, m3, 2 ; [- - - - 4 3 2 1] | |
; keep the un-interleaved run; it is interleaved with m3 two lines below
3542 mova m4, m0 | |
3543 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] | |
; first extra pixel (x)
3544 pinsrb m3, [r2 + r3 + 1], 0 | |
3545 punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x] | |
3546 punpcklqdq m0, m1 | |
3547 | |
3548 pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y] | |
; second extra pixel (y)
3549 pinsrb m2, [r2 + r3 + 2], 0 | |
3550 pslldq m1, m2, 1 ; [4 3 2 1 0 x y z] | |
; third extra pixel (z)
3551 pinsrb m1, [r2 + r3 + 4], 0 | |
3552 punpcklbw m1, m2 ; [1 0 0 x x y y z] | |
3553 punpcklbw m2, m3 ; [2 1 1 0 0 x x y] | |
3554 punpcklqdq m2, m1 | |
3555 | |
; m3/m4 are reused for the weights; rows are addressed relative to row 14
3556 lea r3, [ang_table + 14 * 16] | |
3557 movh m3, [r3 - 8 * 16] ; [ 6] | |
3558 movhps m3, [r3 - 2 * 16] ; [12] | |
3559 movh m4, [r3 + 4 * 16] ; [18] | |
3560 movhps m4, [r3 + 10 * 16] ; [24] | |
3561 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) | |
3562 | |
; intra_pred_ang4_18 -- 4x4 prediction along the exact diagonal: output line y
; is the 4-byte window starting (3 - y) bytes into the 8-byte vector
;   [src[11] src[10] src[9] src[0] src[1] src[2] src[3] src[4]].
; In:  r0 = dst, r1 = dst stride, r2 = reference pixels (read only).
; The reversed low dword is assembled entirely in r3d: src[8..11] is loaded,
; its low byte is replaced with src[0], and bswap reverses the byte order.
; Earlier revisions patched src[8] in memory and restored it before returning;
; building the value in a register leaves the caller's buffer untouched and
; needs one general-purpose register less.
3563 cglobal intra_pred_ang4_18, 3,4,1 | |
; r3d = src[8..11]
3564 mov r3d, [r2 + 8] | |
; replace the low byte (src[8]) with src[0]
3565 mov r3b, byte [r2] | |
; r3d bytes, low to high: src[11] src[10] src[9] src[0]
3568 bswap r3d | |
3569 movd m0, r3d | |
3570 | |
3571 pinsrd m0, [r2 + 1], 1 ; [- 3 2 1 0 -1 -2 -3] | |
3572 lea r3, [r1 * 3] | |
; lines are stored bottom-up while the vector is shifted right one byte at a time
3573 movd [r0 + r3], m0 | |
3574 psrldq m0, 1 | |
3575 movd [r0 + r1 * 2], m0 | |
3576 psrldq m0, 1 | |
3577 movd [r0 + r1], m0 | |
3578 psrldq m0, 1 | |
3579 movd [r0], m0 | |
3581 RET | |
3582 | |
3583 ;----------------------------------------------------------------------------------------- | |
3584 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
3585 ;----------------------------------------------------------------------------------------- | |
; intra_pred_ang8_2 -- 8x8 prediction for dirMode 2 and 34: output line y is the
; 8-byte window starting y bytes into a 16-byte reference run.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The run starts at src + 2 when dirMode == 34, otherwise at src + 18.
3586 INIT_XMM ssse3 | |
3587 cglobal intra_pred_ang8_2, 3,5,2 | |
; r4 = src + 2, r2 = src + 18; keep r4 when dirMode == 34
3588 lea r4, [r2 + 2] | |
3589 add r2, 18 | |
3590 cmp r3m, byte 34 | |
3591 cmove r2, r4 | |
3592 movu m0, [r2] | |
; r4 = 3 * stride
3593 lea r4, [r1 * 3] | |
3594 | |
; each palignr shifts m0 right by one more byte; only the low 8 bytes are stored
3595 movh [r0], m0 | |
3596 palignr m1, m0, 1 | |
3597 movh [r0 + r1], m1 | |
3598 palignr m1, m0, 2 | |
3599 movh [r0 + r1 * 2], m1 | |
3600 palignr m1, m0, 3 | |
3601 movh [r0 + r4], m1 | |
3602 palignr m1, m0, 4 | |
; advance dst by four lines for the lower half of the block
3603 lea r0, [r0 + r1 * 4] | |
3604 movh [r0], m1 | |
3605 palignr m1, m0, 5 | |
3606 movh [r0 + r1], m1 | |
3607 palignr m1, m0, 6 | |
3608 movh [r0 + r1 * 2], m1 | |
3609 palignr m1, m0, 7 | |
3610 movh [r0 + r4], m1 | |
3611 RET | |
3612 | |
; intra_pred_ang8_3 -- 8x8 angular intra prediction, entered for dirMode 3 and 33.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run starts at src + 1 when dirMode == 33, otherwise at src + 17.
; Eight 8-byte result vectors are packed as m4 (0,1), m5 (2,3), m6 (4,5) and
; m1 (6,7). This function also owns the tail shared by the other 8x8 angular
; kernels: .transpose8x8 (skipped when ZF is set) followed by .store.
; Every pmaddubsw/pmulhrsw pair forms a weighted sum of neighbouring pixels with
; one ang_table row and scales it with pw_1024 (presumably words of 1024, which
; makes pmulhrsw a rounded shift right by 5 -- the constant is defined elsewhere).
3613 INIT_XMM sse4 | |
3614 cglobal intra_pred_ang8_3, 3,5,8 | |
3615 lea r4, [r2 + 1] | |
3616 add r2, 17 | |
; ZF from this cmp is consumed by cmove here and again by jz at .transpose8x8;
; no instruction in between modifies the flags
3617 cmp r3m, byte 33 | |
3618 cmove r2, r4 | |
; two table bases so that every row used below is reached with a small offset
3619 lea r3, [ang_table + 22 * 16] | |
3620 lea r4, [ang_table + 8 * 16] | |
3621 mova m3, [pw_1024] | |
3622 | |
3623 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3624 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3625 | |
3626 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
3627 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3628 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
3629 | |
3630 pmaddubsw m4, m0, [r3 + 4 * 16] ; [26] | |
3631 pmulhrsw m4, m3 | |
3632 pmaddubsw m1, [r3 - 2 * 16] ; [20] | |
3633 pmulhrsw m1, m3 | |
3634 packuswb m4, m1 | |
3635 | |
3636 palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] | |
3637 | |
3638 pmaddubsw m5, [r3 - 8 * 16] ; [14] | |
3639 pmulhrsw m5, m3 | |
3640 | |
3641 palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] | |
3642 | |
3643 pmaddubsw m6, [r4] ; [ 8] | |
3644 pmulhrsw m6, m3 | |
3645 packuswb m5, m6 | |
3646 | |
3647 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] | |
3648 | |
3649 pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2] | |
3650 pmulhrsw m6, m3 | |
3651 | |
3652 pmaddubsw m1, [r3 + 6 * 16] ; [28] | |
3653 pmulhrsw m1, m3 | |
3654 packuswb m6, m1 | |
3655 | |
3656 palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] | |
3657 | |
3658 pmaddubsw m1, [r3] ; [22] | |
3659 pmulhrsw m1, m3 | |
3660 | |
3661 palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] | |
3662 | |
3663 pmaddubsw m2, [r3 - 6 * 16] ; [16] | |
3664 pmulhrsw m2, m3 | |
3665 packuswb m1, m2 | |
; jump over the alignment padding emitted by ALIGN 16
3666 jmp .transpose8x8 | |
3667 | |
; Shared tail. Entry contract, as used below: result vectors in m4, m5, m6, m1;
; r0 = dst, r1 = stride; ZF set means "store as is", ZF clear means "transpose
; first". m0 and m2 are clobbered by the transpose, r4 by .store.
3668 ALIGN 16 | |
3669 .transpose8x8: | |
3670 jz .store | |
3671 | |
3672 ; transpose 8x8 | |
; byte interleave twice, then dword interleave; the result is again in m4, m5, m6, m1
3673 punpckhbw m0, m4, m5 | |
3674 punpcklbw m4, m5 | |
3675 punpckhbw m2, m4, m0 | |
3676 punpcklbw m4, m0 | |
3677 | |
3678 punpckhbw m0, m6, m1 | |
3679 punpcklbw m6, m1 | |
3680 punpckhbw m1, m6, m0 | |
3681 punpcklbw m6, m0 | |
3682 | |
3683 punpckhdq m5, m4, m6 | |
3684 punpckldq m4, m6 | |
3685 punpckldq m6, m2, m1 | |
3686 punpckhdq m2, m1 | |
3687 mova m1, m2 | |
3688 | |
3689 .store: | |
; r4 = 3 * stride; low/high quadword of each vector is one 8-pixel output line
3690 lea r4, [r1 * 3] | |
3691 movh [r0], m4 | |
3692 movhps [r0 + r1], m4 | |
3693 movh [r0 + r1 * 2], m5 | |
3694 movhps [r0 + r4], m5 | |
; r0 now points at line 3, so the offsets below address lines 4..7
3695 add r0, r4 | |
3696 movh [r0 + r1], m6 | |
3697 movhps [r0 + r1 * 2], m6 | |
3698 movh [r0 + r4], m1 | |
3699 movhps [r0 + r1 * 4], m1 | |
3700 RET | |
3701 | |
; intra_pred_ang8_4 -- 8x8 angular intra prediction, entered for dirMode 4 and 32.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run starts at src + 1 when dirMode == 32, otherwise at src + 17.
; Results are packed as m4, m5, m6, m1 (two 8-byte vectors each), the layout
; expected by .transpose8x8 in intra_pred_ang8_3; ZF from the cmp is still valid
; at the jmp and makes that tail skip the transpose when dirMode == 32.
; pw_1024 is defined elsewhere; presumably it turns pmulhrsw into a rounded >> 5.
3702 cglobal intra_pred_ang8_4, 3,5,8 | |
3703 lea r4, [r2 + 1] | |
3704 add r2, 17 | |
3705 cmp r3m, byte 32 | |
3706 cmove r2, r4 | |
; two table bases: row 24 (r3) and row 10 (r4)
3707 lea r3, [ang_table + 24 * 16] | |
3708 lea r4, [ang_table + 10 * 16] | |
3709 mova m3, [pw_1024] | |
3710 | |
3711 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3712 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3713 | |
3714 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
3715 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3716 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
; the same pixel pairs are used with two different weights (10 and 31)
3717 mova m5, m1 | |
3718 | |
3719 pmaddubsw m4, m0, [r3 - 3 * 16] ; [21] | |
3720 pmulhrsw m4, m3 | |
3721 pmaddubsw m1, [r4] ; [10] | |
3722 pmulhrsw m1, m3 | |
3723 packuswb m4, m1 | |
3724 | |
3725 pmaddubsw m5, [r3 + 7 * 16] ; [31] | |
3726 pmulhrsw m5, m3 | |
3727 | |
3728 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] | |
3729 | |
3730 pmaddubsw m6, [r3 - 4 * 16] ; [ 20] | |
3731 pmulhrsw m6, m3 | |
3732 packuswb m5, m6 | |
3733 | |
3734 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] | |
3735 | |
3736 pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9] | |
3737 pmulhrsw m6, m3 | |
3738 | |
3739 pmaddubsw m1, [r3 + 6 * 16] ; [30] | |
3740 pmulhrsw m1, m3 | |
3741 packuswb m6, m1 | |
3742 | |
3743 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] | |
3744 | |
3745 pmaddubsw m1, [r3 - 5 * 16] ; [19] | |
3746 pmulhrsw m1, m3 | |
3747 | |
3748 palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] | |
3749 | |
3750 pmaddubsw m2, [r4 - 2 * 16] ; [8] | |
3751 pmulhrsw m2, m3 | |
3752 packuswb m1, m2 | |
3753 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
3754 | |
; intra_pred_ang8_5 -- 8x8 angular intra prediction, entered for dirMode 5 and 31.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run starts at src + 1 when dirMode == 31, otherwise at src + 17.
; Results are packed as m4, m5, m6, m1 (two 8-byte vectors each), the layout
; expected by .transpose8x8 in intra_pred_ang8_3; ZF from the cmp is still valid
; at the jmp and makes that tail skip the transpose when dirMode == 31.
; pw_1024 is defined elsewhere; presumably it turns pmulhrsw into a rounded >> 5.
3755 cglobal intra_pred_ang8_5, 3,5,8 | |
3756 lea r4, [r2 + 1] | |
3757 add r2, 17 | |
3758 cmp r3m, byte 31 | |
3759 cmove r2, r4 | |
; two table bases: row 17 (r3) and row 2 (r4)
3760 lea r3, [ang_table + 17 * 16] | |
3761 lea r4, [ang_table + 2 * 16] | |
3762 mova m3, [pw_1024] | |
3763 | |
3764 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3765 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3766 | |
3767 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
3768 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3769 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
; each shifted pair vector is used twice, with two different weights
3770 mova m5, m1 | |
3771 | |
3772 pmaddubsw m4, m0, [r3] ; [17] | |
3773 pmulhrsw m4, m3 | |
3774 pmaddubsw m1, [r4] ; [2] | |
3775 pmulhrsw m1, m3 | |
3776 packuswb m4, m1 | |
3777 | |
3778 pmaddubsw m5, [r3 + 2 * 16] ; [19] | |
3779 pmulhrsw m5, m3 | |
3780 | |
3781 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] | |
3782 mova m1, m6 | |
3783 | |
3784 pmaddubsw m1, [r4 + 2 * 16] ; [4] | |
3785 pmulhrsw m1, m3 | |
3786 packuswb m5, m1 | |
3787 | |
3788 pmaddubsw m6, [r3 + 4 * 16] ; [21] | |
3789 pmulhrsw m6, m3 | |
3790 | |
3791 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] | |
3792 | |
3793 mova m7, m1 | |
3794 pmaddubsw m7, [r4 + 4 * 16] ; [6] | |
3795 pmulhrsw m7, m3 | |
3796 packuswb m6, m7 | |
3797 | |
3798 pmaddubsw m1, [r3 + 6 * 16] ; [23] | |
3799 pmulhrsw m1, m3 | |
3800 | |
3801 palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] | |
3802 | |
3803 pmaddubsw m2, [r4 + 6 * 16] ; [8] | |
3804 pmulhrsw m2, m3 | |
3805 packuswb m1, m2 | |
3806 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
3807 | |
; intra_pred_ang8_6 -- 8x8 angular intra prediction, entered for dirMode 6 and 30.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run starts at src + 1 when dirMode == 30, otherwise at src + 17.
; Results are packed as m4, m5, m6, m1 (two 8-byte vectors each), the layout
; expected by .transpose8x8 in intra_pred_ang8_3; ZF from the cmp is still valid
; at the jmp and makes that tail skip the transpose when dirMode == 30.
; Here the pw_1024 scale lives in m7 and m3 is used as a temporary.
3808 cglobal intra_pred_ang8_6, 3,5,8 | |
3809 lea r4, [r2 + 1] | |
3810 add r2, 17 | |
3811 cmp r3m, byte 30 | |
3812 cmove r2, r4 | |
; two table bases: row 20 (r3) and row 8 (r4)
3813 lea r3, [ang_table + 20 * 16] | |
3814 lea r4, [ang_table + 8 * 16] | |
3815 mova m7, [pw_1024] | |
3816 | |
3817 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3818 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3819 | |
3820 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
3821 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
; the unshifted pairs are used with weights 13 and 26
3822 mova m1, m0 | |
3823 | |
3824 pmaddubsw m4, m0, [r3 - 7 * 16] ; [13] | |
3825 pmulhrsw m4, m7 | |
3826 pmaddubsw m1, [r3 + 6 * 16] ; [26] | |
3827 pmulhrsw m1, m7 | |
3828 packuswb m4, m1 | |
3829 | |
3830 palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
3831 | |
3832 pmaddubsw m5, m6, [r4 - 1 * 16] ; [7] | |
3833 pmulhrsw m5, m7 | |
3834 | |
3835 pmaddubsw m6, [r3] ; [20] | |
3836 pmulhrsw m6, m7 | |
3837 packuswb m5, m6 | |
3838 | |
3839 palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] | |
3840 | |
3841 pmaddubsw m6, m1, [r4 - 7 * 16] ; [1] | |
3842 pmulhrsw m6, m7 | |
3843 | |
3844 mova m3, m1 | |
3845 pmaddubsw m3, [r3 - 6 * 16] ; [14] | |
3846 pmulhrsw m3, m7 | |
3847 packuswb m6, m3 | |
3848 | |
3849 pmaddubsw m1, [r3 + 7 * 16] ; [27] | |
3850 pmulhrsw m1, m7 | |
3851 | |
3852 palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] | |
3853 | |
3854 pmaddubsw m2, [r4] ; [8] | |
3855 pmulhrsw m2, m7 | |
3856 packuswb m1, m2 | |
3857 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
3858 | |
; intra_pred_ang8_7 -- 8x8 angular intra prediction, entered for dirMode 7 and 29.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run starts at src + 1 when dirMode == 29, otherwise at src + 17.
; Results are packed as m4, m5, m6, m1 (two 8-byte vectors each), the layout
; expected by .transpose8x8 in intra_pred_ang8_3; ZF from the cmp is still valid
; at the jmp and makes that tail skip the transpose when dirMode == 29.
; Here the pw_1024 scale lives in m7 and m3 is used as a temporary.
3859 cglobal intra_pred_ang8_7, 3,5,8 | |
3860 lea r4, [r2 + 1] | |
3861 add r2, 17 | |
3862 cmp r3m, byte 29 | |
3863 cmove r2, r4 | |
; two table bases: row 24 (r3) and row 6 (r4)
3864 lea r3, [ang_table + 24 * 16] | |
3865 lea r4, [ang_table + 6 * 16] | |
3866 mova m7, [pw_1024] | |
3867 | |
3868 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3869 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3870 | |
3871 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
3872 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3873 | |
; the unshifted pairs in m0 serve three weights (9, 18, 27)
3874 pmaddubsw m4, m0, [r4 + 3 * 16] ; [9] | |
3875 pmulhrsw m4, m7 | |
3876 pmaddubsw m3, m0, [r3 - 6 * 16] ; [18] | |
3877 pmulhrsw m3, m7 | |
3878 packuswb m4, m3 | |
3879 | |
3880 pmaddubsw m5, m0, [r3 + 3 * 16] ; [27] | |
3881 pmulhrsw m5, m7 | |
3882 | |
3883 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
3884 | |
; the pairs shifted by one pixel serve four weights (4, 13, 22, 31)
3885 pmaddubsw m6, m1, [r4 - 2 * 16] ; [4] | |
3886 pmulhrsw m6, m7 | |
3887 packuswb m5, m6 | |
3888 | |
3889 pmaddubsw m6, m1, [r4 + 7 * 16] ; [13] | |
3890 pmulhrsw m6, m7 | |
3891 | |
3892 mova m3, m1 | |
3893 pmaddubsw m3, [r3 - 2 * 16] ; [22] | |
3894 pmulhrsw m3, m7 | |
3895 packuswb m6, m3 | |
3896 | |
3897 pmaddubsw m1, [r3 + 7 * 16] ; [31] | |
3898 pmulhrsw m1, m7 | |
3899 | |
3900 palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] | |
3901 | |
3902 pmaddubsw m2, [r4 + 2 * 16] ; [8] | |
3903 pmulhrsw m2, m7 | |
3904 packuswb m1, m2 | |
3905 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
3906 | |
; intra_pred_ang8_8 -- 8x8 angular intra prediction, entered for dirMode 8 and 28.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run starts at src + 1 when dirMode == 28, otherwise at src + 17.
; Results are packed as m4, m5, m6, m1 (two 8-byte vectors each), the layout
; expected by .transpose8x8 in intra_pred_ang8_3; ZF from the cmp is still valid
; at the jmp and makes that tail skip the transpose when dirMode == 28.
; Here the pw_1024 scale lives in m7 and m3 is used as a temporary.
3907 cglobal intra_pred_ang8_8, 3,5,8 | |
3908 lea r4, [r2 + 1] | |
3909 add r2, 17 | |
3910 cmp r3m, byte 28 | |
3911 cmove r2, r4 | |
; two table bases: row 23 (r3) and row 8 (r4)
3912 lea r3, [ang_table + 23 * 16] | |
3913 lea r4, [ang_table + 8 * 16] | |
3914 mova m7, [pw_1024] | |
3915 | |
3916 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3917 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3918 | |
3919 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
3920 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3921 palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
3922 | |
; the first six result vectors all use the unshifted pairs in m0
3923 pmaddubsw m4, m0, [r4 - 3 * 16] ; [5] | |
3924 pmulhrsw m4, m7 | |
3925 pmaddubsw m3, m0, [r4 + 2 * 16] ; [10] | |
3926 pmulhrsw m3, m7 | |
3927 packuswb m4, m3 | |
3928 | |
3929 pmaddubsw m5, m0, [r3 - 8 * 16] ; [15] | |
3930 pmulhrsw m5, m7 | |
3931 | |
3932 pmaddubsw m6, m0, [r3 - 3 * 16] ; [20] | |
3933 pmulhrsw m6, m7 | |
3934 packuswb m5, m6 | |
3935 | |
3936 pmaddubsw m6, m0, [r3 + 2 * 16] ; [25] | |
3937 pmulhrsw m6, m7 | |
3938 | |
3939 pmaddubsw m0, [r3 + 7 * 16] ; [30] | |
3940 pmulhrsw m0, m7 | |
3941 packuswb m6, m0 | |
3942 | |
; the last two use the pairs shifted by one pixel (m2)
3943 pmaddubsw m1, m2, [r4 - 5 * 16] ; [3] | |
3944 pmulhrsw m1, m7 | |
3945 | |
3946 pmaddubsw m2, [r4] ; [8] | |
3947 pmulhrsw m2, m7 | |
3948 packuswb m1, m2 | |
3949 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
3950 | |
; intra_pred_ang8_9 -- 8x8 angular intra prediction, entered for dirMode 9 and 27.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run starts at src + 1 when dirMode == 27, otherwise at src + 17.
; All eight result vectors use the same pixel pairs (m0) with weights
; 2, 4, ..., 16. Results are packed as m4, m5, m6, m1, the layout expected by
; .transpose8x8 in intra_pred_ang8_3; ZF from the cmp is still valid at the jmp
; and makes that tail skip the transpose when dirMode == 27.
3951 cglobal intra_pred_ang8_9, 3,5,8 | |
3952 lea r4, [r2 + 1] | |
3953 add r2, 17 | |
3954 cmp r3m, byte 27 | |
3955 cmove r2, r4 | |
; single table base: row 10
3956 lea r3, [ang_table + 10 * 16] | |
3957 mova m7, [pw_1024] | |
3958 | |
3959 movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3960 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3961 | |
3962 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3963 | |
3964 pmaddubsw m4, m0, [r3 - 8 * 16] ; [2] | |
3965 pmulhrsw m4, m7 | |
3966 pmaddubsw m3, m0, [r3 - 6 * 16] ; [4] | |
3967 pmulhrsw m3, m7 | |
3968 packuswb m4, m3 | |
3969 | |
3970 pmaddubsw m5, m0, [r3 - 4 * 16] ; [6] | |
3971 pmulhrsw m5, m7 | |
3972 | |
3973 pmaddubsw m6, m0, [r3 - 2 * 16] ; [8] | |
3974 pmulhrsw m6, m7 | |
3975 packuswb m5, m6 | |
3976 | |
3977 pmaddubsw m6, m0, [r3] ; [10] | |
3978 pmulhrsw m6, m7 | |
3979 | |
3980 pmaddubsw m2, m0, [r3 + 2 * 16] ; [12] | |
3981 pmulhrsw m2, m7 | |
3982 packuswb m6, m2 | |
3983 | |
3984 pmaddubsw m1, m0, [r3 + 4 * 16] ; [14] | |
3985 pmulhrsw m1, m7 | |
3986 | |
3987 pmaddubsw m0, [r3 + 6 * 16] ; [16] | |
3988 pmulhrsw m0, m7 | |
3989 packuswb m1, m0 | |
3990 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
3991 | |
; intra_pred_ang8_10 -- 8x8 prediction that fills each output line with one
; reference pixel taken from src + 17 .. src + 24.
; In:  r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter.
; Lines 1..7 are stored first; line 0 is stored last at .quit, either as is or
; after the optional filter has adjusted it.
; NOTE(review): pb_unpackbq is defined elsewhere; from its use here it presumably
; replicates byte 0 over the low quadword and byte 1 over the high one -- confirm.
3992 cglobal intra_pred_ang8_10, 3,6,5 | |
3993 movh m0, [r2 + 17] | |
3994 mova m4, [pb_unpackbq] | |
; m1/m2/m3 = reference bytes 2.., 4.., 6.. expanded into two output lines each
3995 palignr m1, m0, 2 | |
3996 pshufb m1, m4 | |
3997 palignr m2, m0, 4 | |
3998 pshufb m2, m4 | |
3999 palignr m3, m0, 6 | |
4000 pshufb m3, m4 | |
; m0 = lines 0 (low quadword) and 1 (high quadword)
4001 pshufb m0, m4 | |
4002 | |
; r5 = 3 * stride, r3 = address of line 4
4003 lea r5, [r1 * 3] | |
4004 movhps [r0 + r1], m0 | |
4005 movh [r0 + r1 * 2], m1 | |
4006 movhps [r0 + r5], m1 | |
4007 lea r3, [r0 + r1 * 4] | |
4008 movh [r3], m2 | |
4009 movhps [r3 + r1], m2 | |
4010 movh [r3 + r1 * 2], m3 | |
4011 movhps [r3 + r5], m3 | |
4012 | |
4013 ; filter | |
4014 cmp r4m, byte 0 | |
4015 jz .quit | |
4016 | |
; line0[x] = saturate_u8(line0[x] + ((src[1 + x] - src[0]) >> 1)), on words
4017 pmovzxbw m0, m0 | |
4018 movu m1, [r2] | |
4019 palignr m2, m1, 1 | |
4020 pshufb m1, m4 | |
4021 pmovzxbw m1, m1 | |
4022 pmovzxbw m2, m2 | |
4023 psubw m2, m1 | |
4024 psraw m2, 1 | |
4025 paddw m0, m2 | |
4026 packuswb m0, m0 | |
4027 | |
4028 .quit: | |
; store output line 0 (low quadword of m0)
4029 movh [r0], m0 | |
4030 RET | |
4031 | |
; intra_pred_ang8_26 -- 8x8 prediction that copies the 8 reference pixels at
; src + 1 into every output line.
; In:  r0 = dst, r1 = dstStride, r2 = src, r4m = bFilter.
; When bFilter is non-zero, the first byte of each output line is rewritten
; afterwards with a filtered value.
; NOTE(review): pb_unpackbq is defined elsewhere; from its use here it presumably
; replicates byte 0 over the low quadword and byte 1 over the high one -- confirm.
4032 cglobal intra_pred_ang8_26, 3,6,3 | |
4033 movu m2, [r2] | |
; m0 = src[1..], the line that is replicated
4034 palignr m0, m2, 1 | |
; r5 = 3 * stride, r3 = address of line 4
4035 lea r5, [r1 * 3] | |
4036 movh [r0], m0 | |
4037 movh [r0 + r1], m0 | |
4038 movh [r0 + r1 * 2], m0 | |
4039 movh [r0 + r5], m0 | |
4040 lea r3, [r0 + r1 * 4] | |
4041 movh [r3], m0 | |
4042 movh [r3 + r1], m0 | |
4043 movh [r3 + r1 * 2], m0 | |
4044 movh [r3 + r5], m0 | |
4045 | |
4046 ; filter | |
4047 cmp r4m, byte 0 | |
4048 jz .quit | |
4049 | |
; column0[y] = saturate_u8(src[1] + ((src[17 + y] - src[0]) >> 1)), on words
4050 pshufb m2, [pb_unpackbq] | |
4051 movhlps m1, m2 | |
4052 pmovzxbw m2, m2 | |
4053 movu m0, [r2 + 17] | |
4054 pmovzxbw m1, m1 | |
4055 pmovzxbw m0, m0 | |
4056 psubw m0, m2 | |
4057 psraw m0, 1 | |
4058 paddw m1, m0 | |
4059 packuswb m1, m1 | |
; one filtered byte per output line, written to the first column
4060 pextrb [r0], m1, 0 | |
4061 pextrb [r0 + r1], m1, 1 | |
4062 pextrb [r0 + r1 * 2], m1, 2 | |
4063 pextrb [r0 + r5], m1, 3 | |
4064 pextrb [r3], m1, 4 | |
4065 pextrb [r3 + r1], m1, 5 | |
4066 pextrb [r3 + r1 * 2], m1, 6 | |
4067 pextrb [r3 + r5], m1, 7 | |
4068 .quit: | |
4069 RET | |
4070 | |
; intra_pred_ang8_11 -- 8x8 angular intra prediction, entered for dirMode 11 and 25.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The reference run is read at src + 0 when dirMode == 25, otherwise at
; src + 16; in both cases its first byte is replaced by src[0].
; All eight result vectors use the same pixel pairs (m0) with weights
; 30, 28, ..., 16. Results are packed as m4, m5, m6, m1, the layout expected by
; .transpose8x8 in intra_pred_ang8_3; ZF from the cmp is still valid at the jmp
; and makes that tail skip the transpose when dirMode == 25.
4071 cglobal intra_pred_ang8_11, 3,5,8 | |
4072 xor r4, r4 | |
; r3 = 0 when dirMode == 25, otherwise 16
4073 cmp r3m, byte 25 | |
4074 mov r3, 16 | |
4075 cmove r3, r4 | |
4076 | |
4077 movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4078 pinsrb m0, [r2], 0 | |
4079 palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4080 | |
4081 punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4082 | |
; single table base: row 23
4083 lea r3, [ang_table + 23 * 16] | |
4084 mova m7, [pw_1024] | |
4085 | |
4086 pmaddubsw m4, m0, [r3 + 7 * 16] ; [30] | |
4087 pmulhrsw m4, m7 | |
4088 pmaddubsw m3, m0, [r3 + 5 * 16] ; [28] | |
4089 pmulhrsw m3, m7 | |
4090 packuswb m4, m3 | |
4091 | |
4092 pmaddubsw m5, m0, [r3 + 3 * 16] ; [26] | |
4093 pmulhrsw m5, m7 | |
4094 | |
4095 pmaddubsw m6, m0, [r3 + 1 * 16] ; [24] | |
4096 pmulhrsw m6, m7 | |
4097 packuswb m5, m6 | |
4098 | |
4099 pmaddubsw m6, m0, [r3 - 1 * 16] ; [22] | |
4100 pmulhrsw m6, m7 | |
4101 | |
4102 pmaddubsw m2, m0, [r3 - 3 * 16] ; [20] | |
4103 pmulhrsw m2, m7 | |
4104 packuswb m6, m2 | |
4105 | |
4106 pmaddubsw m1, m0, [r3 - 5 * 16] ; [18] | |
4107 pmulhrsw m1, m7 | |
4108 | |
4109 pmaddubsw m0, [r3 - 7 * 16] ; [16] | |
4110 pmulhrsw m0, m7 | |
4111 packuswb m1, m0 | |
4112 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
4113 | |
; intra_pred_ang8_12 -- 8x8 angular intra prediction, entered for dirMode 12 and 24.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The main reference run is read at offset r4 (first byte replaced by src[0]);
; one extra pixel "a" is fetched from the other half of the array at r3 + 6.
; Results are packed as m4, m5, m6, m1, the layout expected by .transpose8x8 in
; intra_pred_ang8_3; ZF from the cmp is still valid at the jmp (mov, jz, xchg,
; lea and the SIMD instructions leave the flags alone) and makes that tail skip
; the transpose when dirMode == 24.
4114 cglobal intra_pred_ang8_12, 3,5,8 | |
4115 xor r4, r4 | |
; dirMode == 24: r3 = 16, r4 = 0; otherwise r3 = 0, r4 = 16
4116 cmp r3m, byte 24 | |
4117 mov r3, 16 | |
4118 jz .next | |
4119 xchg r3, r4 | |
4120 .next: | |
4121 | |
4122 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4123 pinsrb m1, [r2], 0 | |
4124 pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] | |
4125 pinsrb m0, [r2 + r3 + 6], 0 | |
4126 | |
; r4 is reused as the table base (row 22 first, row 7 further down)
4127 lea r4, [ang_table + 22 * 16] | |
4128 mova m7, [pw_1024] | |
4129 | |
4130 punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] | |
4131 punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
4132 palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4133 | |
4134 pmaddubsw m4, m2, [r4 + 5 * 16] ; [27] | |
4135 pmulhrsw m4, m7 | |
4136 pmaddubsw m3, m2, [r4] ; [22] | |
4137 pmulhrsw m3, m7 | |
4138 packuswb m4, m3 | |
4139 | |
; the last result pair (m1) is computed early, from the pairs that include "a"
4140 pmaddubsw m1, m0, [r4 + 7 * 16] ; [29] | |
4141 pmulhrsw m1, m7 | |
4142 | |
4143 pmaddubsw m0, [r4 + 2 * 16] ; [24] | |
4144 pmulhrsw m0, m7 | |
4145 packuswb m1, m0 | |
4146 | |
4147 pmaddubsw m5, m2, [r4 - 5 * 16] ; [17] | |
4148 pmulhrsw m5, m7 | |
4149 | |
4150 lea r4, [ang_table + 7 * 16] | |
4151 pmaddubsw m6, m2, [r4 + 5 * 16] ; [12] | |
4152 pmulhrsw m6, m7 | |
4153 packuswb m5, m6 | |
4154 | |
4155 pmaddubsw m6, m2, [r4] ; [7] | |
4156 pmulhrsw m6, m7 | |
4157 | |
4158 pmaddubsw m2, [r4 - 5 * 16] ; [2] | |
4159 pmulhrsw m2, m7 | |
4160 packuswb m6, m2 | |
4161 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
4162 | |
; intra_pred_ang8_13 -- 8x8 angular intra prediction, entered for dirMode 13 and 23.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The main reference run is read at offset r4 (first byte replaced by src[0]);
; two extra pixels "a" and "b" are fetched from the other half of the array at
; r3 + 4 and r3 + 7.
; Results are packed as m4, m5, m6, m1, the layout expected by .transpose8x8 in
; intra_pred_ang8_3; ZF from the cmp is still valid at the jmp and makes that
; tail skip the transpose when dirMode == 23.
; NOTE(review): cglobal asks for 4 loaded arguments although r3 is overwritten
; before it is read (dirMode is read through r3m).
4163 cglobal intra_pred_ang8_13, 4,5,8 | |
4164 xor r4, r4 | |
; dirMode == 23: r3 = 16, r4 = 0; otherwise r3 = 0, r4 = 16
4165 cmp r3m, byte 23 | |
4166 mov r3, 16 | |
4167 jz .next | |
4168 xchg r3, r4 | |
4169 .next: | |
4170 | |
4171 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4172 pinsrb m1, [r2], 0 | |
4173 pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] | |
4174 pinsrb m1, [r2 + r3 + 4], 0 | |
4175 pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] | |
4176 pinsrb m0, [r2 + r3 + 7], 0 | |
4177 punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] | |
4178 punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] | |
4179 palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
4180 palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4181 | |
; r4 is reused as the table base (row 24 first, row 13 further down)
4182 lea r4, [ang_table + 24 * 16] | |
4183 mova m7, [pw_1024] | |
4184 | |
; m4, m6 and m0 are computed ahead of the packs that consume them
4185 pmaddubsw m4, m5, [r4 - 1 * 16] ; [23] | |
4186 pmulhrsw m4, m7 | |
4187 | |
4188 pmaddubsw m6, m1, [r4 + 4 * 16] ; [28] | |
4189 pmulhrsw m6, m7 | |
4190 | |
4191 pmaddubsw m0, [r4] ; [24] | |
4192 pmulhrsw m0, m7 | |
4193 | |
4194 lea r4, [ang_table + 13 * 16] | |
4195 pmaddubsw m3, m5, [r4 + 1 * 16] ; [14] | |
4196 pmulhrsw m3, m7 | |
4197 packuswb m4, m3 | |
4198 | |
4199 pmaddubsw m5, [r4 - 8 * 16] ; [5] | |
4200 pmulhrsw m5, m7 | |
4201 packuswb m5, m6 | |
4202 | |
4203 pmaddubsw m6, m1, [r4 + 6 * 16] ; [19] | |
4204 pmulhrsw m6, m7 | |
4205 | |
4206 pmaddubsw m2, m1, [r4 - 3 * 16] ; [10] | |
4207 pmulhrsw m2, m7 | |
4208 packuswb m6, m2 | |
4209 | |
4210 pmaddubsw m1, [r4 - 12 * 16] ; [1] | |
4211 pmulhrsw m1, m7 | |
4212 packuswb m1, m0 | |
4213 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
4214 | |
; intra_pred_ang8_14 -- 8x8 angular intra prediction, entered for dirMode 14 and 22.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The main reference run is read at offset r4 - 2; three extra pixels "a", "b"
; and "c" are fetched from the other half of the array at r3 + 2, r3 + 5 and
; r3 + 7.
; Results are packed as m4, m5, m6, m1, the layout expected by .transpose8x8 in
; intra_pred_ang8_3; ZF from the cmp is still valid at the jmp and makes that
; tail skip the transpose when dirMode == 22.
4215 cglobal intra_pred_ang8_14, 4,5,8 | |
4216 xor r4, r4 | |
; dirMode == 22: r3 = 16, r4 = 0; otherwise r3 = 0, r4 = 16
4217 cmp r3m, byte 22 | |
4218 mov r3, 16 | |
4219 jz .next | |
4220 xchg r3, r4 | |
4221 .next: | |
4222 | |
; The load starts two bytes before r2 + r4 (before r2 itself when r4 == 0).
; Bytes 0..2 of the result are placeholders: all three are overwritten by the
; pinsrb instructions that follow before m1 is used.
4223 movu m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] | |
4224 pinsrb m1, [r2], 2 | |
4225 pinsrb m1, [r2 + r3 + 2], 1 | |
4226 pinsrb m1, [r2 + r3 + 5], 0 | |
4227 pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] | |
4228 pinsrb m0, [r2 + r3 + 7], 0 | |
4229 punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] | |
4230 punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] | |
4231 palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] | |
4232 palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
4233 palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4234 | |
; r4 is reused as the table base (row 24 first, row 12 further down);
; the pw_1024 scale lives in m3 in this kernel
4235 lea r4, [ang_table + 24 * 16] | |
4236 mova m3, [pw_1024] | |
4237 | |
4238 pmaddubsw m4, m2, [r4 - 5 * 16] ; [19] | |
4239 pmulhrsw m4, m3 | |
4240 | |
4241 pmaddubsw m0, [r4] ; [24] | |
4242 pmulhrsw m0, m3 | |
4243 | |
4244 pmaddubsw m5, m6, [r4 + 1 * 16] ; [25] | |
4245 pmulhrsw m5, m3 | |
4246 | |
4247 lea r4, [ang_table + 12 * 16] | |
4248 pmaddubsw m6, [r4] ; [12] | |
4249 pmulhrsw m6, m3 | |
4250 packuswb m5, m6 | |
4251 | |
4252 pmaddubsw m6, m1, [r4 + 19 * 16] ; [31] | |
4253 pmulhrsw m6, m3 | |
4254 | |
4255 pmaddubsw m2, [r4 - 6 * 16] ; [6] | |
4256 pmulhrsw m2, m3 | |
4257 packuswb m4, m2 | |
4258 | |
4259 pmaddubsw m2, m1, [r4 + 6 * 16] ; [18] | |
4260 pmulhrsw m2, m3 | |
4261 packuswb m6, m2 | |
4262 | |
4263 pmaddubsw m1, [r4 - 7 * 16] ; [5] | |
4264 pmulhrsw m1, m3 | |
4265 packuswb m1, m0 | |
4266 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
4267 | |
; intra_pred_ang8_15 -- 8x8 angular intra prediction, entered for dirMode 15 and 21.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The main reference run is read at offset r4 (first byte replaced by src[0]).
; Extra pixels come from the other half of the array at offset r3: three via a
; pshufb with c_mode16_15 (table defined elsewhere; its top 3 output bytes are
; the ones kept by the palignr) and a fourth from r3 + 8.
; Results are packed as m4, m5, m6, m1, the layout expected by .transpose8x8 in
; intra_pred_ang8_3; ZF from the cmp is still valid at the jmp and makes that
; tail skip the transpose when dirMode == 21.
4268 cglobal intra_pred_ang8_15, 4,5,8 | |
4269 xor r4, r4 | |
; dirMode == 21: r3 = 16, r4 = 0; otherwise r3 = 0, r4 = 16
4270 cmp r3m, byte 21 | |
4271 mov r3, 16 | |
4272 jz .next | |
4273 xchg r3, r4 | |
4274 .next: | |
4275 | |
4276 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4277 pinsrb m1, [r2], 0 | |
4278 movu m2, [r2 + r3] | |
4279 pshufb m2, [c_mode16_15] | |
; concatenate m1:m2 and drop 13 bytes: 3 shuffled extras below 13 main pixels
4280 palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] | |
4281 pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] | |
4282 pinsrb m0, [r2 + r3 + 8], 0 | |
4283 punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] | |
4284 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] | |
4285 palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] | |
4286 palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] | |
4287 palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
4288 palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4289 | |
; r4 is reused as the table base (row 23 first, row 11 further down);
; the pw_1024 scale lives in m3 in this kernel
4290 lea r4, [ang_table + 23 * 16] | |
4291 mova m3, [pw_1024] | |
4292 | |
4293 pmaddubsw m4, [r4 - 8 * 16] ; [15] | |
4294 pmulhrsw m4, m3 | |
4295 | |
4296 pmaddubsw m2, m5, [r4 + 7 * 16] ; [30] | |
4297 pmulhrsw m2, m3 | |
4298 packuswb m4, m2 | |
4299 | |
4300 pmaddubsw m5, [r4 - 10 * 16] ; [13] | |
4301 pmulhrsw m5, m3 | |
4302 | |
4303 pmaddubsw m2, m6, [r4 + 5 * 16] ; [28] | |
4304 pmulhrsw m2, m3 | |
4305 packuswb m5, m2 | |
4306 | |
4307 pmaddubsw m2, m1, [r4 + 3 * 16] ; [26] | |
4308 pmulhrsw m2, m3 | |
4309 | |
4310 pmaddubsw m0, [r4 + 1 * 16] ; [24] | |
4311 pmulhrsw m0, m3 | |
4312 | |
4313 lea r4, [ang_table + 11 * 16] | |
4314 pmaddubsw m6, [r4] ; [11] | |
4315 pmulhrsw m6, m3 | |
4316 packuswb m6, m2 | |
4317 | |
4318 pmaddubsw m1, [r4 - 2 * 16] ; [9] | |
4319 pmulhrsw m1, m3 | |
4320 packuswb m1, m0 | |
4321 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
4322 | |
; intra_pred_ang8_16 -- 8x8 angular intra prediction, entered for dirMode 16 and 20.
; In:  r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode.
; The main reference run is read at offset r4 (first byte replaced by src[0]).
; Extra pixels come from the other half of the array at offset r3: four via a
; pshufb with c_mode16_16 (table defined elsewhere; its top 4 output bytes are
; the ones kept by the palignr) and a fifth from r3 + 8.
; Results are packed as m4, m5, m6, m1, the layout expected by .transpose8x8 in
; intra_pred_ang8_3; ZF from the cmp is still valid at the jmp and makes that
; tail skip the transpose when dirMode == 20.
4323 cglobal intra_pred_ang8_16, 4,5,8 | |
4324 xor r4, r4 | |
; dirMode == 20: r3 = 16, r4 = 0; otherwise r3 = 0, r4 = 16
4325 cmp r3m, byte 20 | |
4326 mov r3, 16 | |
4327 jz .next | |
4328 xchg r3, r4 | |
4329 .next: | |
4330 | |
4331 movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4332 pinsrb m1, [r2], 0 | |
4333 movu m2, [r2 + r3] | |
4334 pshufb m2, [c_mode16_16] | |
; concatenate m1:m2 and drop 12 bytes: 4 shuffled extras below 12 main pixels
4335 palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] | |
4336 pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] | |
4337 pinsrb m0, [r2 + r3 + 8], 0 | |
4338 punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] | |
4339 punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e] | |
4340 palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] | |
4341 palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] | |
4342 palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] | |
4343 palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
4344 palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4345 | |
; r4 is reused as the table base (row 22 first, row 9 further down)
4346 lea r4, [ang_table + 22 * 16] | |
4347 mova m7, [pw_1024] | |
4348 | |
4349 pmaddubsw m3, m5, [r4] ; [22] | |
4350 pmulhrsw m3, m7 | |
4351 | |
4352 pmaddubsw m0, [r4 + 2 * 16] ; [24] | |
4353 pmulhrsw m0, m7 | |
4354 | |
4355 lea r4, [ang_table + 9 * 16] | |
4356 | |
4357 pmaddubsw m4, [r4 + 2 * 16] ; [11] | |
4358 pmulhrsw m4, m7 | |
4359 packuswb m4, m3 | |
4360 | |
4361 pmaddubsw m2, [r4 + 3 * 16] ; [12] | |
4362 pmulhrsw m2, m7 | |
4363 | |
4364 pmaddubsw m5, [r4 - 8 * 16] ; [1] | |
4365 pmulhrsw m5, m7 | |
4366 packuswb m5, m2 | |
4367 | |
; m6's pixel pairs are needed with two weights (23 and 2), so keep a copy
4368 mova m2, m6 | |
4369 pmaddubsw m6, [r4 + 14 * 16] ; [23] | |
4370 pmulhrsw m6, m7 | |
4371 | |
4372 pmaddubsw m2, [r4 - 7 * 16] ; [2] | |
4373 pmulhrsw m2, m7 | |
4374 packuswb m6, m2 | |
4375 | |
4376 pmaddubsw m1, [r4 + 4 * 16] ; [13] | |
4377 pmulhrsw m1, m7 | |
4378 packuswb m1, m0 | |
4379 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
4380 | |
; intra_pred_ang8_17: 8x8 angular intra prediction kernel.
; cglobal 4,5,8 -> four arguments loaded, five GPRs and eight XMM registers used.
; Reads r2 (reference pixel buffer) and r3m (dirMode). r0/r1 are not touched in this body;
; NOTE(review): presumably dst/stride, consumed by the shared tail jumped to at the end - confirm.
; The compare against 19 decides which 16-byte half of the buffer is the main reference:
;   dirMode == 19 -> main reference at r2 + 0,  side reference at r2 + 16
;   otherwise     -> main reference at r2 + 16, side reference at r2 + 0
; Bracketed numbers in the trailing comments are the ang_table row used (byte offset / 16).
; Results are left packed two 8-pixel rows per register in m4, m5, m6 and m1.
4381 cglobal intra_pred_ang8_17, 4,5,8 | |
4382 xor r4, r4 | |
4383 cmp r3m, byte 19 | |
; mov and xchg do not write EFLAGS, so ZF from the cmp above survives them.
4384 mov r3, 16 | |
4385 jz .next | |
4386 xchg r3, r4 | |
4387 .next: | |
4388 | |
4389 movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
; byte 0 of the main reference is always taken from r2[0]
4390 pinsrb m2, [r2], 0 | |
4391 movu m1, [r2 + r3] | |
; c_mode16_17 is a shuffle table defined elsewhere; it reorders the side-reference bytes
; that get prepended to the main reference by the palignr below.
4392 pshufb m1, [c_mode16_17] | |
4393 palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] | |
4394 pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f] | |
; the byte shifted in by pslldq is zero; replace it with side-reference byte 7 ("f")
4395 pinsrb m0, [r2 + r3 + 7], 0 | |
; interleave each pixel with its neighbour so pmaddubsw can weight adjacent pairs
4396 punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
4397 punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f] | |
4398 | |
4399 palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] | |
4400 palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
4401 palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4402 | |
4403 lea r4, [ang_table + 17 * 16] | |
; pw_1024 is defined elsewhere (presumably words of 1024, which would make pmulhrsw a
; rounded shift right by 5 - TODO confirm)
4404 mova m3, [pw_1024] | |
4405 | |
4406 pmaddubsw m2, [r4 - 5 * 16] ; [12] | |
4407 pmulhrsw m2, m3 | |
4408 | |
4409 pmaddubsw m4, [r4 - 11 * 16] ; [6] | |
4410 pmulhrsw m4, m3 | |
4411 packuswb m4, m2 | |
4412 | |
4413 pmaddubsw m5, [r4 + 1 * 16] ; [18] | |
4414 pmulhrsw m5, m3 | |
4415 | |
; m1:m0 still hold the full interleaved reference; further windows are cut on demand
4416 palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] | |
4417 pmaddubsw m2, [r4 + 7 * 16] ; [24] | |
4418 pmulhrsw m2, m3 | |
4419 packuswb m5, m2 | |
4420 | |
4421 palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] | |
; the same pixel pairs are used with two different weight rows, so keep a copy
4422 mova m2, m6 | |
4423 pmaddubsw m6, [r4 + 13 * 16] ; [30] | |
4424 pmulhrsw m6, m3 | |
4425 | |
4426 pmaddubsw m2, [r4 - 13 * 16] ; [4] | |
4427 pmulhrsw m2, m3 | |
4428 packuswb m6, m2 | |
4429 | |
4430 palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e] | |
4431 pmaddubsw m1, [r4 - 7 * 16] ; [10] | |
4432 pmulhrsw m1, m3 | |
4433 | |
4434 pmaddubsw m0, [r4 - 1 * 16] ; [16] | |
4435 pmulhrsw m0, m3 | |
4436 packuswb m1, m0 | |
; Tail shared with intra_pred_ang8_3. Nothing since the cmp has written EFLAGS (only mov,
; xchg, lea and SSE instructions follow it), so ZF still reflects dirMode == 19 here.
; NOTE(review): the .transpose8x8 tail presumably branches on that flag and expects the
; rows in m4/m5/m6/m1 - confirm before inserting any flag-writing instruction above.
4437 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) | |
4438 | |
; Helper for intra_pred_ang8_18: discard the lowest byte of m0, then store the low
; 8 bytes of the shifted register at [%1]. Clobbers nothing besides m0.
%macro ANG8_18_NEXT_ROW 1
    psrldq      m0, 1
    movh        [%1], m0
%endmacro

;-----------------------------------------------------------------------------
; intra_pred_ang8_18: 8x8 prediction where each row is the row below it shifted
; by one byte.
; cglobal 4,4,1 -> four arguments loaded, four GPRs and one XMM register used.
;   r0 = destination base, r1 = row pitch (every store is r0 + k * r1)
;   r2 = reference pixel buffer (reused as dst + 4 * stride once loading is done)
; A single 16-byte vector is built and then written bottom row first, dropping
; one byte between rows.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_18, 4,4,1
    ; low half: 16 bytes from r2 + 16 with byte 0 replaced by r2[0], reordered through
    ; pb_swap8 (table defined elsewhere; presumably reverses the low 8 bytes - TODO confirm)
    movu        m0, [r2 + 16]
    pinsrb      m0, [r2], 0
    pshufb      m0, [pb_swap8]
    ; high half: the 8 bytes at r2[1..8]
    movhps      m0, [r2 + 1]

    lea         r3, [r1 * 3]            ; r3 = 3 * stride
    lea         r2, [r0 + r1 * 4]       ; r2 = dst + 4 * stride (source pointer is dead now)

    movh        [r2 + r3], m0           ; row 7
    ANG8_18_NEXT_ROW r2 + r1 * 2        ; row 6
    ANG8_18_NEXT_ROW r2 + r1            ; row 5
    ANG8_18_NEXT_ROW r2                 ; row 4
    ANG8_18_NEXT_ROW r0 + r3            ; row 3
    ANG8_18_NEXT_ROW r0 + r1 * 2        ; row 2
    ANG8_18_NEXT_ROW r0 + r1            ; row 1
    ANG8_18_NEXT_ROW r0                 ; row 0
    RET
4462 | |
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_8x8 col, transpose, rows01, rows23, rows45, rows67
;
; Writes one 8x8 byte tile held in four XMM registers (two 8-byte rows per
; register: low half = even row, high half = odd row).
;
;   %1  column block index; only used when %2 == 1, where the tile is written
;       at byte offset %1 * 8 from the row start
;   %2  1 = transpose the tile before storing, anything else = store as-is
;   %3..%6  XMM registers with rows 0/1, 2/3, 4/5 and 6/7
;
; Register contract:
;   r0 = destination, r1 = stride, r5 = 3 * stride (both paths)
;   r6 = r0 + 4 * stride (transpose path only)
; Transpose path: clobbers m0 and %3..%6, so none of %3..%6 may be m0;
;                 r0 and r6 are left unchanged.
; Store path:     leaves the XMM registers intact and advances r0 by 8 rows.
;-----------------------------------------------------------------------------
%macro TRANSPOSE_STORE_8x8 6
  %if %2 == 1
    ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
    ; byte interleave rows 0..3: %3 = columns 0-3, %4 = columns 4-7
    punpckhbw   m0, %3, %4
    punpcklbw   %3, %4
    punpckhbw   %4, %3, m0
    punpcklbw   %3, m0

    ; byte interleave rows 4..7: %5 = columns 0-3, %6 = columns 4-7
    ; (uses %6 rather than a hard-coded m1 so any register may carry rows 6/7)
    punpckhbw   m0, %5, %6
    punpcklbw   %5, %6
    punpckhbw   %6, %5, m0
    punpcklbw   %5, m0

    ; dword interleave joins the upper and lower 4-row halves of every column
    punpckhdq   m0, %3, %5              ; output rows 2, 3
    punpckldq   %3, %5                  ; output rows 0, 1
    punpckldq   %5, %4, %6              ; output rows 4, 5
    punpckhdq   %4, %6                  ; output rows 6, 7

    movh        [r0 +        %1 * 8], %3
    movhps      [r0 + r1   + %1 * 8], %3
    movh        [r0 + r1*2 + %1 * 8], m0
    movhps      [r0 + r5   + %1 * 8], m0
    movh        [r6 +        %1 * 8], %5
    movhps      [r6 + r1   + %1 * 8], %5
    movh        [r6 + r1*2 + %1 * 8], %4
    movhps      [r6 + r5   + %1 * 8], %4
  %else
    ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
    movh        [r0         ], %3
    movhps      [r0 + r1    ], %3
    movh        [r0 + r1 * 2], %4
    movhps      [r0 + r5    ], %4
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], %5
    movhps      [r0 + r1    ], %5
    movh        [r0 + r1 * 2], %6
    movhps      [r0 + r5    ], %6
    lea         r0, [r0 + r1 * 4]       ; leave r0 on the row after this tile
  %endif
%endmacro
4503 | |
4504 ;------------------------------------------------------------------------------------------ | |
4505 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
4506 ;------------------------------------------------------------------------------------------ | |
INIT_XMM ssse3
; intra_pred_ang16_2: 16x16 prediction in which row k is the 16-byte window starting
; k bytes into the reference.
; cglobal 3,5,3 -> three arguments loaded, five GPRs and three XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer, r3m = dirMode
; The window starts at r2 + 2 when dirMode == 34 and at r2 + 34 otherwise.
cglobal intra_pred_ang16_2, 3,5,3
    lea         r4, [r2 + 2]            ; candidate start for dirMode 34
    add         r2, 34                  ; default start
    cmp         r3m, byte 34
    cmove       r2, r4

    movu        m0, [r2]                ; reference bytes  0..15
    movu        m1, [r2 + 16]           ; reference bytes 16..31
    movu        [r0], m0                ; row 0 needs no shift

    ; Rows 1..15: odd rows are stored at r0 + stride, after which r0 moves down two
    ; rows; even rows are stored at the new r0. r0 is not advanced after the last row.
%assign ang16_2_row 1
%rep 15
    palignr     m2, m1, m0, ang16_2_row
  %if ang16_2_row & 1
    movu        [r0 + r1], m2
    %if ang16_2_row < 15
    lea         r0, [r0 + r1 * 2]
    %endif
  %else
    movu        [r0], m2
  %endif
  %assign ang16_2_row ang16_2_row + 1
%endrep
%undef ang16_2_row
    RET
4554 | |
4555 INIT_XMM sse4 | |
; intra_pred_ang16_3: 16x16 angular intra prediction, written through the transposing
; path of TRANSPOSE_STORE_8x8.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_33 in this file; this variant starts 32 bytes
; into the reference buffer and transposes each 8x8 tile on store.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
4556 cglobal intra_pred_ang16_3, 3,7,8 | |
; NOTE(review): +32 presumably selects the second (left-neighbour) half of the reference
; buffer - confirm against the caller's layout.
4557 add r2, 32 | |
4558 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing an 8-row band of the output
4559 mov r4d, 2 | |
4560 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4561 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride (rows 4..7 of the current band) | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
4562 mova m7, [pw_1024] | |
4563 | |
4564 .loop: | |
4565 movu m0, [r2 + 1] | |
; two-operand palignr: m1 = (m1:m0) >> 1 byte. The top byte comes from m1's previous
; contents; it is never consumed because the largest palignr shift below is 12.
4566 palignr m1, m0, 1 | |
4567 | |
; interleave each pixel with its right neighbour: m2 = high pairs, m0 = low pairs
4568 punpckhbw m2, m0, m1 | |
4569 punpcklbw m0, m1 | |
4570 palignr m1, m2, m0, 2 | |
4571 | |
4572 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] | |
4573 pmulhrsw m4, m7 | |
4574 pmaddubsw m1, [r3 + 4 * 16] ; [20] | |
4575 pmulhrsw m1, m7 | |
4576 packuswb m4, m1 | |
4577 | |
4578 palignr m5, m2, m0, 4 | |
4579 | |
4580 pmaddubsw m5, [r3 - 2 * 16] ; [14] | |
4581 pmulhrsw m5, m7 | |
4582 | |
4583 palignr m6, m2, m0, 6 | |
4584 | |
4585 pmaddubsw m6, [r3 - 8 * 16] ; [ 8] | |
4586 pmulhrsw m6, m7 | |
4587 packuswb m5, m6 | |
4588 | |
4589 palignr m1, m2, m0, 8 | |
4590 | |
4591 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] | |
4592 pmulhrsw m6, m7 | |
4593 | |
4594 pmaddubsw m1, [r3 + 12 * 16] ; [28] | |
4595 pmulhrsw m1, m7 | |
4596 packuswb m6, m1 | |
4597 | |
4598 palignr m1, m2, m0, 10 | |
4599 | |
4600 pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
4601 pmulhrsw m1, m7 | |
4602 | |
4603 palignr m2, m0, 12 | |
4604 | |
4605 pmaddubsw m2, [r3] ; [16] | |
4606 pmulhrsw m2, m7 | |
4607 packuswb m1, m2 | |
4608 | |
; first tile -> columns 0..7 of the current 8-row band (macro clobbers m0)
4609 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
4610 | |
; second tile: same scheme, starting 7 bytes further into the reference
4611 movu m0, [r2 + 8] | |
4612 palignr m1, m0, 1 | |
4613 | |
4614 punpckhbw m2, m0, m1 | |
4615 punpcklbw m0, m1 | |
4616 palignr m5, m2, m0, 2 | |
4617 | |
4618 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] | |
4619 pmulhrsw m4, m7 | |
4620 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] | |
4621 pmulhrsw m1, m7 | |
4622 packuswb m4, m1 | |
4623 | |
4624 pmaddubsw m5, [r3 + 14 * 16] ; [30] | |
4625 pmulhrsw m5, m7 | |
4626 | |
4627 palignr m6, m2, m0, 4 | |
4628 | |
4629 pmaddubsw m6, [r3 + 8 * 16] ; [24] | |
4630 pmulhrsw m6, m7 | |
4631 packuswb m5, m6 | |
4632 | |
4633 palignr m1, m2, m0, 6 | |
4634 | |
4635 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] | |
4636 pmulhrsw m6, m7 | |
4637 | |
4638 palignr m1, m2, m0, 8 | |
4639 | |
4640 pmaddubsw m1, [r3 - 4 * 16] ; [12] | |
4641 pmulhrsw m1, m7 | |
4642 packuswb m6, m1 | |
4643 | |
4644 palignr m1, m2, m0, 10 | |
4645 | |
4646 pmaddubsw m1, [r3 - 10 * 16] ; [06] | |
4647 pmulhrsw m1, m7 | |
4648 packuswb m1, m1 | |
4649 | |
; the last row of the tile is an unweighted copy: 8 reference bytes go straight into
; the high half of m1
4650 movhps m1, [r2 + 14] ; [00] | |
4651 | |
; second tile -> columns 8..15 of the same band
4652 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4653 | |
; next band: r0 moves down 8 rows (old r6 + 4 rows), r6 stays 4 rows below r0,
; and the reference pointer advances by 8 pixels
4654 lea r0, [r6 + r1 * 4] | |
4655 lea r6, [r6 + r1 * 8] | |
4656 add r2, 8 | |
4657 dec r4 | |
4658 jnz .loop | |
4659 RET | |
4660 | |
4661 INIT_XMM sse4 | |
; intra_pred_ang16_33: 16x16 angular intra prediction stored without transposition.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_3 in this file; this variant reads from the
; start of the reference buffer and writes each 8-pixel-wide column top to bottom.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
4662 cglobal intra_pred_ang16_33, 3,7,8 | |
4663 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing one 8-pixel-wide, 16-row column
4664 mov r4d, 2 | |
; r5 = 3 * stride
4665 lea r5, [r1 * 3] | |
; r6 keeps the original destination so the second pass can start at dst + 8
4666 mov r6, r0 | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
4667 mova m7, [pw_1024] | |
4668 | |
4669 .loop: | |
4670 movu m0, [r2 + 1] | |
; two-operand palignr: m1 = (m1:m0) >> 1 byte. The top byte comes from m1's previous
; contents; it is never consumed because the largest palignr shift below is 12.
4671 palignr m1, m0, 1 | |
4672 | |
; interleave each pixel with its right neighbour: m2 = high pairs, m0 = low pairs
4673 punpckhbw m2, m0, m1 | |
4674 punpcklbw m0, m1 | |
4675 palignr m1, m2, m0, 2 | |
4676 | |
4677 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] | |
4678 pmulhrsw m4, m7 | |
4679 pmaddubsw m1, [r3 + 4 * 16] ; [20] | |
4680 pmulhrsw m1, m7 | |
4681 packuswb m4, m1 | |
4682 | |
4683 palignr m5, m2, m0, 4 | |
4684 | |
4685 pmaddubsw m5, [r3 - 2 * 16] ; [14] | |
4686 pmulhrsw m5, m7 | |
4687 | |
4688 palignr m6, m2, m0, 6 | |
4689 | |
4690 pmaddubsw m6, [r3 - 8 * 16] ; [ 8] | |
4691 pmulhrsw m6, m7 | |
4692 packuswb m5, m6 | |
4693 | |
4694 palignr m1, m2, m0, 8 | |
4695 | |
4696 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] | |
4697 pmulhrsw m6, m7 | |
4698 | |
4699 pmaddubsw m1, [r3 + 12 * 16] ; [28] | |
4700 pmulhrsw m1, m7 | |
4701 packuswb m6, m1 | |
4702 | |
4703 palignr m1, m2, m0, 10 | |
4704 | |
4705 pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
4706 pmulhrsw m1, m7 | |
4707 | |
4708 palignr m2, m0, 12 | |
4709 | |
4710 pmaddubsw m2, [r3] ; [16] | |
4711 pmulhrsw m2, m7 | |
4712 packuswb m1, m2 | |
4713 | |
; rows 0..7 of this column; the store path of the macro advances r0 by 8 rows
4714 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4715 | |
; rows 8..15: same scheme, starting 7 bytes further into the reference
4716 movu m0, [r2 + 8] | |
4717 palignr m1, m0, 1 | |
4718 | |
4719 punpckhbw m2, m0, m1 | |
4720 punpcklbw m0, m1 | |
4721 palignr m5, m2, m0, 2 | |
4722 | |
4723 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] | |
4724 pmulhrsw m4, m7 | |
4725 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] | |
4726 pmulhrsw m1, m7 | |
4727 packuswb m4, m1 | |
4728 | |
4729 pmaddubsw m5, [r3 + 14 * 16] ; [30] | |
4730 pmulhrsw m5, m7 | |
4731 | |
4732 palignr m6, m2, m0, 4 | |
4733 | |
4734 pmaddubsw m6, [r3 + 8 * 16] ; [24] | |
4735 pmulhrsw m6, m7 | |
4736 packuswb m5, m6 | |
4737 | |
4738 palignr m1, m2, m0, 6 | |
4739 | |
4740 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] | |
4741 pmulhrsw m6, m7 | |
4742 | |
4743 palignr m1, m2, m0, 8 | |
4744 | |
4745 pmaddubsw m1, [r3 - 4 * 16] ; [12] | |
4746 pmulhrsw m1, m7 | |
4747 packuswb m6, m1 | |
4748 | |
4749 palignr m1, m2, m0, 10 | |
4750 | |
4751 pmaddubsw m1, [r3 - 10 * 16] ; [06] | |
4752 pmulhrsw m1, m7 | |
4753 packuswb m1, m1 | |
4754 | |
; the last row is an unweighted copy of 8 reference bytes
4755 movh m2, [r2 + 14] ; [00] | |
4756 | |
; rows 8..15 are stored by hand instead of through the macro because the final row
; lives in m2 rather than in the high half of m1
4757 movh [r0 ], m4 | |
4758 movhps [r0 + r1 ], m4 | |
4759 movh [r0 + r1 * 2], m5 | |
4760 movhps [r0 + r5 ], m5 | |
4761 lea r0, [r0 + r1 * 4] | |
4762 movh [r0 ], m6 | |
4763 movhps [r0 + r1 ], m6 | |
4764 movh [r0 + r1 * 2], m1 | |
4765 movh [r0 + r5 ], m2 | |
4766 | |
; next pass: right-hand 8-pixel column of the block, reference advanced by 8 pixels
4767 lea r0, [r6 + 8] | |
4768 add r2, 8 | |
4769 dec r4 | |
4770 jnz .loop | |
4771 RET | |
4772 | |
4773 INIT_XMM sse4 | |
; intra_pred_ang16_4: 16x16 angular intra prediction, written through the transposing
; path of TRANSPOSE_STORE_8x8.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_32 in this file; this variant starts 32 bytes
; into the reference buffer and transposes each 8x8 tile on store.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
4774 cglobal intra_pred_ang16_4, 3,7,8 | |
; NOTE(review): +32 presumably selects the second (left-neighbour) half of the reference
; buffer - confirm against the caller's layout.
4775 add r2, 32 | |
4776 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing an 8-row band of the output
4777 mov r4d, 2 | |
4778 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4779 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride (rows 4..7 of the current band) | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
4780 mova m7, [pw_1024] | |
4781 | |
4782 .loop: | |
4783 movu m0, [r2 + 1] | |
; two-operand palignr: m1 = (m1:m0) >> 1 byte. The top byte comes from m1's previous
; contents; it is never consumed because the largest palignr shift below is 10.
4784 palignr m1, m0, 1 | |
4785 | |
; interleave each pixel with its right neighbour: m2 = high pairs, m0 = low pairs
4786 punpckhbw m2, m0, m1 | |
4787 punpcklbw m0, m1 | |
4788 palignr m1, m2, m0, 2 | |
; the window in m1 is used with two weight rows ([10] and [31]); keep a copy in m5
4789 mova m5, m1 | |
4790 | |
4791 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] | |
4792 pmulhrsw m4, m7 | |
4793 pmaddubsw m1, [r3 - 6 * 16] ; [10] | |
4794 pmulhrsw m1, m7 | |
4795 packuswb m4, m1 | |
4796 | |
4797 pmaddubsw m5, [r3 + 15 * 16] ; [31] | |
4798 pmulhrsw m5, m7 | |
4799 | |
4800 palignr m6, m2, m0, 4 | |
4801 | |
4802 pmaddubsw m6, [r3 + 4 * 16] ; [ 20] | |
4803 pmulhrsw m6, m7 | |
4804 packuswb m5, m6 | |
4805 | |
4806 palignr m1, m2, m0, 6 | |
4807 | |
4808 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] | |
4809 pmulhrsw m6, m7 | |
4810 | |
4811 pmaddubsw m1, [r3 + 14 * 16] ; [30] | |
4812 pmulhrsw m1, m7 | |
4813 packuswb m6, m1 | |
4814 | |
4815 palignr m1, m2, m0, 8 | |
4816 | |
4817 pmaddubsw m1, [r3 + 3 * 16] ; [19] | |
4818 pmulhrsw m1, m7 | |
4819 | |
4820 palignr m2, m0, 10 | |
4821 | |
; result goes to m3 so that m2 (this window) survives for the first row of the next tile
4822 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] | |
4823 pmulhrsw m3, m7 | |
4824 packuswb m1, m3 | |
4825 | |
; first tile -> columns 0..7 of the current band (macro clobbers m0 and its four
; row registers, m2 is preserved)
4826 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
4827 | |
; first row of the second tile reuses the window kept in m2
4828 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] | |
4829 pmulhrsw m4, m7 | |
4830 | |
; reload and re-interleave the reference 5 bytes further on for the remaining rows
4831 movu m0, [r2 + 6] | |
4832 palignr m1, m0, 1 | |
4833 | |
4834 punpckhbw m2, m0, m1 | |
4835 punpcklbw m0, m1 | |
4836 palignr m1, m2, m0, 2 | |
4837 | |
4838 pmaddubsw m1, [r3 + 2 * 16] ; [18] | |
4839 pmulhrsw m1, m7 | |
4840 packuswb m4, m1 | |
4841 | |
4842 palignr m5, m2, m0, 4 | |
4843 mova m6, m5 | |
4844 | |
4845 pmaddubsw m5, [r3 - 9 * 16] ; [07] | |
4846 pmulhrsw m5, m7 | |
4847 | |
4848 pmaddubsw m6, [r3 + 12 * 16] ; [28] | |
4849 pmulhrsw m6, m7 | |
4850 packuswb m5, m6 | |
4851 | |
4852 palignr m6, m2, m0, 6 | |
4853 | |
4854 pmaddubsw m6, [r3 + 16] ; [17] | |
4855 pmulhrsw m6, m7 | |
4856 | |
4857 palignr m1, m2, m0, 8 | |
4858 palignr m2, m0, 10 | |
4859 | |
4860 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] | |
4861 pmulhrsw m3, m7 | |
4862 packuswb m6, m3 | |
4863 | |
4864 pmaddubsw m1, [r3 + 11 * 16] ; [27] | |
4865 pmulhrsw m1, m7 | |
4866 | |
4867 pmaddubsw m2, [r3] ; [16] | |
4868 pmulhrsw m2, m7 | |
4869 packuswb m1, m2 | |
4870 | |
; second tile -> columns 8..15 of the same band
4871 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4872 | |
; next band: r0 moves down 8 rows (old r6 + 4 rows), r6 stays 4 rows below r0,
; and the reference pointer advances by 8 pixels
4873 lea r0, [r6 + r1 * 4] | |
4874 lea r6, [r6 + r1 * 8] | |
4875 add r2, 8 | |
4876 dec r4 | |
4877 jnz .loop | |
4878 RET | |
4879 | |
4880 INIT_XMM sse4 | |
; intra_pred_ang16_32: 16x16 angular intra prediction stored without transposition.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_4 in this file; this variant reads from the
; start of the reference buffer and writes each 8-pixel-wide column top to bottom.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
4881 cglobal intra_pred_ang16_32, 3,7,8 | |
4882 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing one 8-pixel-wide, 16-row column
4883 mov r4d, 2 | |
4884 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
; r6 keeps the original destination so the second pass can start at dst + 8
4885 mov r6, r0 | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
4886 mova m7, [pw_1024] | |
4887 | |
4888 .loop: | |
4889 movu m0, [r2 + 1] | |
; two-operand palignr: m1 = (m1:m0) >> 1 byte. The top byte comes from m1's previous
; contents; it is never consumed because the largest palignr shift below is 10.
4890 palignr m1, m0, 1 | |
4891 | |
; interleave each pixel with its right neighbour: m2 = high pairs, m0 = low pairs
4892 punpckhbw m2, m0, m1 | |
4893 punpcklbw m0, m1 | |
4894 palignr m1, m2, m0, 2 | |
; the window in m1 is used with two weight rows ([10] and [31]); keep a copy in m5
4895 mova m5, m1 | |
4896 | |
4897 | |
4898 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] | |
4899 pmulhrsw m4, m7 | |
4900 pmaddubsw m1, [r3 - 6 * 16] ; [10] | |
4901 pmulhrsw m1, m7 | |
4902 packuswb m4, m1 | |
4903 | |
4904 pmaddubsw m5, [r3 + 15 * 16] ; [31] | |
4905 pmulhrsw m5, m7 | |
4906 | |
4907 palignr m6, m2, m0, 4 | |
4908 | |
4909 pmaddubsw m6, [r3 + 4 * 16] ; [ 20] | |
4910 pmulhrsw m6, m7 | |
4911 packuswb m5, m6 | |
4912 | |
4913 palignr m1, m2, m0, 6 | |
4914 | |
4915 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] | |
4916 pmulhrsw m6, m7 | |
4917 | |
4918 pmaddubsw m1, [r3 + 14 * 16] ; [30] | |
4919 pmulhrsw m1, m7 | |
4920 packuswb m6, m1 | |
4921 | |
4922 palignr m1, m2, m0, 8 | |
4923 | |
4924 pmaddubsw m1, [r3 + 3 * 16] ; [19] | |
4925 pmulhrsw m1, m7 | |
4926 | |
4927 palignr m2, m0, 10 | |
4928 | |
; result goes to m3 so that m2 (this window) survives for the first row of the next tile
4929 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] | |
4930 pmulhrsw m3, m7 | |
4931 packuswb m1, m3 | |
4932 | |
; rows 0..7 of this column; the store path of the macro advances r0 by 8 rows and
; leaves the XMM registers untouched
4933 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4934 | |
; first row of the second tile reuses the window kept in m2
4935 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] | |
4936 pmulhrsw m4, m7 | |
4937 | |
; reload and re-interleave the reference 5 bytes further on for the remaining rows
4938 movu m0, [r2 + 6] | |
4939 palignr m1, m0, 1 | |
4940 | |
4941 punpckhbw m2, m0, m1 | |
4942 punpcklbw m0, m1 | |
4943 palignr m1, m2, m0, 2 | |
4944 | |
4945 pmaddubsw m1, [r3 + 2 * 16] ; [18] | |
4946 pmulhrsw m1, m7 | |
4947 packuswb m4, m1 | |
4948 | |
4949 palignr m5, m2, m0, 4 | |
4950 mova m6, m5 | |
4951 | |
4952 pmaddubsw m5, [r3 - 9 * 16] ; [07] | |
4953 pmulhrsw m5, m7 | |
4954 | |
4955 pmaddubsw m6, [r3 + 12 * 16] ; [28] | |
4956 pmulhrsw m6, m7 | |
4957 packuswb m5, m6 | |
4958 | |
4959 palignr m6, m2, m0, 6 | |
4960 | |
4961 pmaddubsw m6, [r3 + 16] ; [17] | |
4962 pmulhrsw m6, m7 | |
4963 | |
4964 palignr m1, m2, m0, 8 | |
4965 palignr m2, m0, 10 | |
4966 | |
4967 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] | |
4968 pmulhrsw m3, m7 | |
4969 packuswb m6, m3 | |
4970 | |
4971 pmaddubsw m1, [r3 + 11 * 16] ; [27] | |
4972 pmulhrsw m1, m7 | |
4973 | |
4974 pmaddubsw m2, [r3] ; [16] | |
4975 pmulhrsw m2, m7 | |
4976 packuswb m1, m2 | |
4977 | |
; rows 8..15 of this column (r0 was already advanced by the previous store)
4978 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
4979 | |
; next pass: right-hand 8-pixel column of the block, reference advanced by 8 pixels
4980 lea r0, [r6 + 8] | |
4981 add r2, 8 | |
4982 dec r4 | |
4983 jnz .loop | |
4984 RET | |
4985 | |
4986 INIT_XMM sse4 | |
; intra_pred_ang16_5: 16x16 angular intra prediction, written through the transposing
; path of TRANSPOSE_STORE_8x8.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_31 in this file; this variant starts 32 bytes
; into the reference buffer and transposes each 8x8 tile on store.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
4987 cglobal intra_pred_ang16_5, 3,7,8 | |
; NOTE(review): +32 presumably selects the second (left-neighbour) half of the reference
; buffer - confirm against the caller's layout.
4988 add r2, 32 | |
4989 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing an 8-row band of the output
4990 mov r4d, 2 | |
4991 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4992 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride (rows 4..7 of the current band) | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
4993 mova m7, [pw_1024] | |
4994 | |
4995 .loop: | |
; the neighbour vector is loaded directly from r2 + 2, so all 16 interleaved pairs are
; valid; m2 (high pairs) and m3 (low pairs) stay live for the whole loop body
4996 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4997 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
4998 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
4999 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5000 | |
5001 palignr m5, m2, m3, 2 | |
5002 | |
5003 pmaddubsw m4, m3, [r3 + 16] ; [17] | |
5004 pmulhrsw m4, m7 | |
5005 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] | |
5006 pmulhrsw m1, m7 | |
5007 packuswb m4, m1 | |
5008 | |
5009 palignr m6, m2, m3, 4 | |
5010 | |
5011 pmaddubsw m5, [r3 + 3 * 16] ; [19] | |
5012 pmulhrsw m5, m7 | |
5013 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] | |
5014 pmulhrsw m1, m7 | |
5015 packuswb m5, m1 | |
5016 | |
5017 palignr m1, m2, m3, 6 | |
5018 | |
5019 pmaddubsw m6, [r3 + 5 * 16] ; [21] | |
5020 pmulhrsw m6, m7 | |
5021 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] | |
5022 pmulhrsw m0, m7 | |
5023 packuswb m6, m0 | |
5024 | |
5025 palignr m0, m2, m3, 8 | |
5026 | |
5027 pmaddubsw m1, [r3 + 7 * 16] ; [23] | |
5028 pmulhrsw m1, m7 | |
5029 pmaddubsw m0, [r3 - 8 * 16] ; [8] | |
5030 pmulhrsw m0, m7 | |
5031 packuswb m1, m0 | |
5032 | |
; first tile -> columns 0..7 of the current band (macro clobbers m0 and its four
; row registers; m2 and m3 are preserved)
5033 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5034 | |
; second tile continues from the same interleaved reference held in m2:m3
5035 palignr m4, m2, m3, 8 | |
5036 palignr m5, m2, m3, 10 | |
5037 | |
5038 pmaddubsw m4, [r3 + 9 * 16] ; [25] | |
5039 pmulhrsw m4, m7 | |
5040 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] | |
5041 pmulhrsw m1, m7 | |
5042 packuswb m4, m1 | |
5043 | |
5044 palignr m6, m2, m3, 12 | |
5045 | |
5046 pmaddubsw m5, [r3 + 11 * 16] ; [27] | |
5047 pmulhrsw m5, m7 | |
5048 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] | |
5049 pmulhrsw m1, m7 | |
5050 packuswb m5, m1 | |
5051 | |
5052 palignr m1, m2, m3, 14 | |
5053 | |
5054 pmaddubsw m6, [r3 + 13 * 16] ; [29] | |
5055 pmulhrsw m6, m7 | |
5056 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] | |
5057 pmulhrsw m0, m7 | |
5058 packuswb m6, m0 | |
5059 | |
5060 pmaddubsw m1, [r3 + 15 * 16] ; [31] | |
5061 pmulhrsw m1, m7 | |
; the last row uses the high pairs in m2 unshifted
5062 pmaddubsw m2, [r3] ; [16] | |
5063 pmulhrsw m2, m7 | |
5064 packuswb m1, m2 | |
5065 | |
; second tile -> columns 8..15 of the same band
5066 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5067 | |
; next band: r0 moves down 8 rows (old r6 + 4 rows), r6 stays 4 rows below r0,
; and the reference pointer advances by 8 pixels
5068 lea r0, [r6 + r1 * 4] | |
5069 lea r6, [r6 + r1 * 8] | |
5070 add r2, 8 | |
5071 dec r4 | |
5072 jnz .loop | |
5073 RET | |
5074 | |
5075 INIT_XMM sse4 | |
; intra_pred_ang16_31: 16x16 angular intra prediction stored without transposition.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_5 in this file; this variant reads from the
; start of the reference buffer and writes each 8-pixel-wide column top to bottom.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
5076 cglobal intra_pred_ang16_31, 3,7,8 | |
5077 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing one 8-pixel-wide, 16-row column
5078 mov r4d, 2 | |
5079 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
; r6 keeps the original destination so the second pass can start at dst + 8
5080 mov r6, r0 | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
5081 mova m7, [pw_1024] | |
5082 | |
5083 .loop: | |
; the neighbour vector is loaded directly from r2 + 2, so all 16 interleaved pairs are
; valid; m2 (high pairs) and m3 (low pairs) stay live for the whole loop body
5084 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5085 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5086 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
5087 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5088 | |
5089 palignr m5, m2, m3, 2 | |
5090 | |
5091 pmaddubsw m4, m3, [r3 + 16] ; [17] | |
5092 pmulhrsw m4, m7 | |
5093 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] | |
5094 pmulhrsw m1, m7 | |
5095 packuswb m4, m1 | |
5096 | |
5097 palignr m6, m2, m3, 4 | |
5098 | |
5099 pmaddubsw m5, [r3 + 3 * 16] ; [19] | |
5100 pmulhrsw m5, m7 | |
5101 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] | |
5102 pmulhrsw m1, m7 | |
5103 packuswb m5, m1 | |
5104 | |
5105 palignr m1, m2, m3, 6 | |
5106 | |
5107 pmaddubsw m6, [r3 + 5 * 16] ; [21] | |
5108 pmulhrsw m6, m7 | |
5109 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] | |
5110 pmulhrsw m0, m7 | |
5111 packuswb m6, m0 | |
5112 | |
5113 palignr m0, m2, m3, 8 | |
5114 | |
5115 pmaddubsw m1, [r3 + 7 * 16] ; [23] | |
5116 pmulhrsw m1, m7 | |
5117 pmaddubsw m0, [r3 - 8 * 16] ; [8] | |
5118 pmulhrsw m0, m7 | |
5119 packuswb m1, m0 | |
5120 | |
; rows 0..7 of this column; the store path of the macro advances r0 by 8 rows and
; leaves the XMM registers untouched
5121 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5122 | |
; rows 8..15 continue from the same interleaved reference held in m2:m3
5123 palignr m4, m2, m3, 8 | |
5124 palignr m5, m2, m3, 10 | |
5125 | |
5126 pmaddubsw m4, [r3 + 9 * 16] ; [25] | |
5127 pmulhrsw m4, m7 | |
5128 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] | |
5129 pmulhrsw m1, m7 | |
5130 packuswb m4, m1 | |
5131 | |
5132 palignr m6, m2, m3, 12 | |
5133 | |
5134 pmaddubsw m5, [r3 + 11 * 16] ; [27] | |
5135 pmulhrsw m5, m7 | |
5136 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] | |
5137 pmulhrsw m1, m7 | |
5138 packuswb m5, m1 | |
5139 | |
5140 palignr m1, m2, m3, 14 | |
5141 | |
5142 pmaddubsw m6, [r3 + 13 * 16] ; [29] | |
5143 pmulhrsw m6, m7 | |
5144 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] | |
5145 pmulhrsw m0, m7 | |
5146 packuswb m6, m0 | |
5147 | |
5148 pmaddubsw m1, [r3 + 15 * 16] ; [31] | |
5149 pmulhrsw m1, m7 | |
; the last row uses the high pairs in m2 unshifted
5150 pmaddubsw m2, [r3] ; [16] | |
5151 pmulhrsw m2, m7 | |
5152 packuswb m1, m2 | |
5153 | |
; rows 8..15 of this column (r0 was already advanced by the previous store)
5154 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5155 | |
; next pass: right-hand 8-pixel column of the block, reference advanced by 8 pixels
5156 lea r0, [r6 + 8] | |
5157 add r2, 8 | |
5158 dec r4 | |
5159 jnz .loop | |
5160 RET | |
5161 | |
5162 INIT_XMM sse4 | |
; intra_pred_ang16_6: 16x16 angular intra prediction, written through the transposing
; path of TRANSPOSE_STORE_8x8.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_30 in this file; this variant starts 32 bytes
; into the reference buffer and transposes each 8x8 tile on store.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
5163 cglobal intra_pred_ang16_6, 3,7,8 | |
; NOTE(review): +32 presumably selects the second (left-neighbour) half of the reference
; buffer - confirm against the caller's layout.
5164 add r2, 32 | |
5165 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing an 8-row band of the output
5166 mov r4d, 2 | |
5167 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5168 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride (rows 4..7 of the current band) | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
5169 mova m7, [pw_1024] | |
5170 | |
5171 .loop: | |
; "x" marks the byte inherited from m1's previous contents by the two-operand palignr;
; it is never consumed because the largest palignr shift below is 12.
; m2 (high pairs) and m3 (low pairs) stay live for the whole loop body.
5172 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5173 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5174 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
5175 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5176 | |
5177 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] | |
5178 pmulhrsw m4, m7 | |
5179 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] | |
5180 pmulhrsw m1, m7 | |
5181 packuswb m4, m1 | |
5182 | |
5183 palignr m6, m2, m3, 2 | |
5184 | |
5185 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] | |
5186 pmulhrsw m5, m7 | |
5187 pmaddubsw m6, [r3 + 4 * 16] ; [20] | |
5188 pmulhrsw m6, m7 | |
5189 packuswb m5, m6 | |
5190 | |
5191 palignr m1, m2, m3, 4 | |
5192 | |
5193 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] | |
5194 pmulhrsw m6, m7 | |
5195 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] | |
5196 pmulhrsw m0, m7 | |
5197 packuswb m6, m0 | |
5198 | |
5199 palignr m0, m2, m3, 6 | |
5200 | |
5201 pmaddubsw m1, [r3 + 11 * 16] ; [27] | |
5202 pmulhrsw m1, m7 | |
5203 pmaddubsw m0, [r3 - 8 * 16] ; [8] | |
5204 pmulhrsw m0, m7 | |
5205 packuswb m1, m0 | |
5206 | |
; first tile -> columns 0..7 of the current band (macro clobbers m0 and its four
; row registers; m2 and m3 are preserved)
5207 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5208 | |
; second tile continues from the same interleaved reference held in m2:m3
5209 palignr m4, m2, m3, 6 | |
5210 palignr m6, m2, m3, 8 | |
5211 | |
5212 pmaddubsw m4, [r3 + 5 * 16] ; [21] | |
5213 pmulhrsw m4, m7 | |
5214 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] | |
5215 pmulhrsw m1, m7 | |
5216 packuswb m4, m1 | |
5217 | |
5218 pmaddubsw m5, m6, [r3 - 16] ; [15] | |
5219 pmulhrsw m5, m7 | |
5220 pmaddubsw m6, [r3 + 12 * 16] ; [28] | |
5221 pmulhrsw m6, m7 | |
5222 packuswb m5, m6 | |
5223 | |
5224 palignr m0, m2, m3, 10 | |
5225 | |
5226 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] | |
5227 pmulhrsw m6, m7 | |
5228 pmaddubsw m0, [r3 + 6 * 16] ; [22] | |
5229 pmulhrsw m0, m7 | |
5230 packuswb m6, m0 | |
5231 | |
5232 palignr m2, m3, 12 | |
5233 | |
5234 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] | |
5235 pmulhrsw m1, m7 | |
5236 pmaddubsw m2, [r3] ; [16] | |
5237 pmulhrsw m2, m7 | |
5238 packuswb m1, m2 | |
5239 | |
; second tile -> columns 8..15 of the same band
5240 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5241 | |
; next band: r0 moves down 8 rows (old r6 + 4 rows), r6 stays 4 rows below r0,
; and the reference pointer advances by 8 pixels
5242 lea r0, [r6 + r1 * 4] | |
5243 lea r6, [r6 + r1 * 8] | |
5244 add r2, 8 | |
5245 dec r4 | |
5246 jnz .loop | |
5247 RET | |
5248 | |
5249 INIT_XMM sse4 | |
; intra_pred_ang16_30: 16x16 angular intra prediction stored without transposition.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_6 in this file; this variant reads from the
; start of the reference buffer and writes each 8-pixel-wide column top to bottom.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
5250 cglobal intra_pred_ang16_30, 3,7,8 | |
5251 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing one 8-pixel-wide, 16-row column
5252 mov r4d, 2 | |
5253 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
; r6 keeps the original destination so the second pass can start at dst + 8
5254 mov r6, r0 | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
5255 mova m7, [pw_1024] | |
5256 | |
5257 .loop: | |
; "x" marks the byte inherited from m1's previous contents by the two-operand palignr;
; it is never consumed because the largest palignr shift below is 12.
; m2 (high pairs) and m3 (low pairs) stay live for the whole loop body.
5258 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5259 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5260 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
5261 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5262 | |
5263 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] | |
5264 pmulhrsw m4, m7 | |
5265 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] | |
5266 pmulhrsw m1, m7 | |
5267 packuswb m4, m1 | |
5268 | |
5269 palignr m6, m2, m3, 2 | |
5270 | |
5271 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] | |
5272 pmulhrsw m5, m7 | |
5273 pmaddubsw m6, [r3 + 4 * 16] ; [20] | |
5274 pmulhrsw m6, m7 | |
5275 packuswb m5, m6 | |
5276 | |
5277 palignr m1, m2, m3, 4 | |
5278 | |
5279 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] | |
5280 pmulhrsw m6, m7 | |
5281 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] | |
5282 pmulhrsw m0, m7 | |
5283 packuswb m6, m0 | |
5284 | |
5285 palignr m0, m2, m3, 6 | |
5286 | |
5287 pmaddubsw m1, [r3 + 11 * 16] ; [27] | |
5288 pmulhrsw m1, m7 | |
5289 pmaddubsw m0, [r3 - 8 * 16] ; [8] | |
5290 pmulhrsw m0, m7 | |
5291 packuswb m1, m0 | |
5292 | |
; rows 0..7 of this column; the store path of the macro advances r0 by 8 rows and
; leaves the XMM registers untouched
5293 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5294 | |
; rows 8..15 continue from the same interleaved reference held in m2:m3
5295 palignr m4, m2, m3, 6 | |
5296 palignr m6, m2, m3, 8 | |
5297 | |
5298 pmaddubsw m4, [r3 + 5 * 16] ; [21] | |
5299 pmulhrsw m4, m7 | |
5300 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] | |
5301 pmulhrsw m1, m7 | |
5302 packuswb m4, m1 | |
5303 | |
5304 pmaddubsw m5, m6, [r3 - 16] ; [15] | |
5305 pmulhrsw m5, m7 | |
5306 pmaddubsw m6, [r3 + 12 * 16] ; [28] | |
5307 pmulhrsw m6, m7 | |
5308 packuswb m5, m6 | |
5309 | |
5310 palignr m0, m2, m3, 10 | |
5311 | |
5312 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] | |
5313 pmulhrsw m6, m7 | |
5314 pmaddubsw m0, [r3 + 6 * 16] ; [22] | |
5315 pmulhrsw m0, m7 | |
5316 packuswb m6, m0 | |
5317 | |
5318 palignr m2, m3, 12 | |
5319 | |
5320 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] | |
5321 pmulhrsw m1, m7 | |
5322 pmaddubsw m2, [r3] ; [16] | |
5323 pmulhrsw m2, m7 | |
5324 packuswb m1, m2 | |
5325 | |
; rows 8..15 of this column (r0 was already advanced by the previous store)
5326 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5327 | |
; next pass: right-hand 8-pixel column of the block, reference advanced by 8 pixels
5328 lea r0, [r6 + 8] | |
5329 add r2, 8 | |
5330 dec r4 | |
5331 jnz .loop | |
5332 RET | |
5333 | |
5334 INIT_XMM sse4 | |
; intra_pred_ang16_7: 16x16 angular intra prediction, written through the transposing
; path of TRANSPOSE_STORE_8x8.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_29 in this file; this variant starts 32 bytes
; into the reference buffer and transposes each 8x8 tile on store.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
5335 cglobal intra_pred_ang16_7, 3,7,8 | |
; NOTE(review): +32 presumably selects the second (left-neighbour) half of the reference
; buffer - confirm against the caller's layout.
5336 add r2, 32 | |
5337 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing an 8-row band of the output
5338 mov r4d, 2 | |
5339 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5340 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride (rows 4..7 of the current band) | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
5341 mova m7, [pw_1024] | |
5342 | |
5343 .loop: | |
; "x" marks the byte inherited from m1's previous contents by the two-operand palignr;
; it is never consumed because the largest palignr shift below is 8.
; m2 (high pairs) and m3 (low pairs) stay live for the whole loop body.
5344 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5345 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5346 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
5347 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5348 | |
; the unshifted pairs in m3 serve three consecutive rows ([9], [18], [27])
5349 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] | |
5350 pmulhrsw m4, m7 | |
5351 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] | |
5352 pmulhrsw m0, m7 | |
5353 packuswb m4, m0 | |
5354 | |
5355 palignr m1, m2, m3, 2 | |
5356 | |
5357 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] | |
5358 pmulhrsw m5, m7 | |
5359 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] | |
5360 pmulhrsw m6, m7 | |
5361 packuswb m5, m6 | |
5362 | |
5363 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] | |
5364 pmulhrsw m6, m7 | |
5365 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] | |
5366 pmulhrsw m0, m7 | |
5367 packuswb m6, m0 | |
5368 | |
5369 palignr m0, m2, m3, 4 | |
5370 | |
5371 pmaddubsw m1, [r3 + 15 * 16] ; [31] | |
5372 pmulhrsw m1, m7 | |
5373 pmaddubsw m0, [r3 - 8 * 16] ; [8] | |
5374 pmulhrsw m0, m7 | |
5375 packuswb m1, m0 | |
5376 | |
; first tile -> columns 0..7 of the current band (macro clobbers m0 and its four
; row registers; m2 and m3 are preserved)
5377 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5378 | |
; second tile continues from the same interleaved reference held in m2:m3
5379 palignr m1, m2, m3, 4 | |
5380 | |
5381 pmaddubsw m4, m1, [r3 + 16] ; [17] | |
5382 pmulhrsw m4, m7 | |
5383 pmaddubsw m1, [r3 + 10 * 16] ; [26] | |
5384 pmulhrsw m1, m7 | |
5385 packuswb m4, m1 | |
5386 | |
5387 palignr m0, m2, m3, 6 | |
5388 | |
5389 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] | |
5390 pmulhrsw m5, m7 | |
5391 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] | |
5392 pmulhrsw m6, m7 | |
5393 packuswb m5, m6 | |
5394 | |
5395 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] | |
5396 pmulhrsw m6, m7 | |
5397 pmaddubsw m0, [r3 + 14 * 16] ; [30] | |
5398 pmulhrsw m0, m7 | |
5399 packuswb m6, m0 | |
5400 | |
5401 palignr m2, m3, 8 | |
5402 | |
5403 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] | |
5404 pmulhrsw m1, m7 | |
5405 pmaddubsw m2, [r3] ; [16] | |
5406 pmulhrsw m2, m7 | |
5407 packuswb m1, m2 | |
5408 | |
; second tile -> columns 8..15 of the same band
5409 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5410 | |
; next band: r0 moves down 8 rows (old r6 + 4 rows), r6 stays 4 rows below r0,
; and the reference pointer advances by 8 pixels
5411 lea r0, [r6 + r1 * 4] | |
5412 lea r6, [r6 + r1 * 8] | |
5413 add r2, 8 | |
5414 dec r4 | |
5415 jnz .loop | |
5416 RET | |
5417 | |
5418 INIT_XMM sse4 | |
; intra_pred_ang16_29: 16x16 angular intra prediction stored without transposition.
; cglobal 3,7,8 -> three arguments loaded, seven GPRs and eight XMM registers used.
;   r0 = destination, r1 = stride, r2 = reference pixel buffer
; The arithmetic matches intra_pred_ang16_7 in this file; this variant reads from the
; start of the reference buffer and writes each 8-pixel-wide column top to bottom.
; Bracketed numbers are the ang_table row used (r3 points at row 16, so the byte offset
; is (row - 16) * 16).
5419 cglobal intra_pred_ang16_29, 3,7,8 | |
5420 lea r3, [ang_table + 16 * 16] | |
; two loop passes, each producing one 8-pixel-wide, 16-row column
5421 mov r4d, 2 | |
5422 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
; r6 keeps the original destination so the second pass can start at dst + 8
5423 mov r6, r0 | |
; pw_1024 is defined elsewhere (presumably words of 1024 -> pmulhrsw acts as a rounded >> 5)
5424 mova m7, [pw_1024] | |
5425 | |
5426 .loop: | |
; "x" marks the byte inherited from m1's previous contents by the two-operand palignr;
; it is never consumed because the largest palignr shift below is 8.
; m2 (high pairs) and m3 (low pairs) stay live for the whole loop body.
5427 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5428 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5429 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
5430 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5431 | |
; the unshifted pairs in m3 serve three consecutive rows ([9], [18], [27])
5432 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] | |
5433 pmulhrsw m4, m7 | |
5434 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] | |
5435 pmulhrsw m0, m7 | |
5436 packuswb m4, m0 | |
5437 | |
5438 palignr m1, m2, m3, 2 | |
5439 | |
5440 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] | |
5441 pmulhrsw m5, m7 | |
5442 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] | |
5443 pmulhrsw m6, m7 | |
5444 packuswb m5, m6 | |
5445 | |
5446 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] | |
5447 pmulhrsw m6, m7 | |
5448 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] | |
5449 pmulhrsw m0, m7 | |
5450 packuswb m6, m0 | |
5451 | |
5452 palignr m0, m2, m3, 4 | |
5453 | |
5454 pmaddubsw m1, [r3 + 15 * 16] ; [31] | |
5455 pmulhrsw m1, m7 | |
5456 pmaddubsw m0, [r3 - 8 * 16] ; [8] | |
5457 pmulhrsw m0, m7 | |
5458 packuswb m1, m0 | |
5459 | |
; rows 0..7 of this column; the store path of the macro advances r0 by 8 rows and
; leaves the XMM registers untouched
5460 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5461 | |
; rows 8..15 continue from the same interleaved reference held in m2:m3
5462 palignr m1, m2, m3, 4 | |
5463 | |
5464 pmaddubsw m4, m1, [r3 + 16] ; [17] | |
5465 pmulhrsw m4, m7 | |
5466 pmaddubsw m1, [r3 + 10 * 16] ; [26] | |
5467 pmulhrsw m1, m7 | |
5468 packuswb m4, m1 | |
5469 | |
5470 palignr m0, m2, m3, 6 | |
5471 | |
5472 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] | |
5473 pmulhrsw m5, m7 | |
5474 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] | |
5475 pmulhrsw m6, m7 | |
5476 packuswb m5, m6 | |
5477 | |
5478 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] | |
5479 pmulhrsw m6, m7 | |
5480 pmaddubsw m0, [r3 + 14 * 16] ; [30] | |
5481 pmulhrsw m0, m7 | |
5482 packuswb m6, m0 | |
5483 | |
5484 palignr m2, m3, 8 | |
5485 | |
5486 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] | |
5487 pmulhrsw m1, m7 | |
5488 pmaddubsw m2, [r3] ; [16] | |
5489 pmulhrsw m2, m7 | |
5490 packuswb m1, m2 | |
5491 | |
; rows 8..15 of this column (r0 was already advanced by the previous store)
5492 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5493 | |
; next pass: right-hand 8-pixel column of the block, reference advanced by 8 pixels
5494 lea r0, [r6 + 8] | |
5495 add r2, 8 | |
5496 dec r4 | |
5497 jnz .loop | |
5498 RET | |
5499 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_8 (SSE4): 16-wide angular intra prediction.
; Register use in this routine:
;   r0, r1 - presumably destination pointer and stride; the stores themselves
;            are done by TRANSPOSE_STORE_8x8, which is defined outside this view
;   r2     - reference samples, advanced by 32 bytes before the loop
;   r3     - ang_table + 16 * 16, so [r3 + k * 16] is table entry 16 + k
;            (the bracketed number in each line comment)
;   r4d    - pass counter: two passes, each 8 reference bytes further on
;   r5, r6 - 3 * stride and r0 + 4 * stride
;   m7     - [pw_1024], multiplier for the pmulhrsw that follows each
;            pmaddubsw (a rounded >> 5 if pw_1024 holds words of 1024 -
;            TODO confirm against its definition)
;-----------------------------------------------------------------------------
5500 INIT_XMM sse4 | |
5501 cglobal intra_pred_ang16_8, 3,7,8 | |
5502 add r2, 32 | |
5503 lea r3, [ang_table + 16 * 16] | |
5504 mov r4d, 2 | |
5505 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5506 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
5507 mova m7, [pw_1024] | |
5508 | |
5509 .loop: | |
; Interleave every reference byte with its successor so that one pmaddubsw
; produces a two-tap weighted sum per output sample.
5510 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5511 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5512 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
5513 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5514 | |
5515 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] | |
5516 pmulhrsw m4, m7 | |
5517 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] | |
5518 pmulhrsw m2, m7 | |
5519 packuswb m4, m2 | |
5520 | |
5521 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] | |
5522 pmulhrsw m5, m7 | |
5523 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] | |
5524 pmulhrsw m6, m7 | |
5525 packuswb m5, m6 | |
5526 | |
5527 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] | |
5528 pmulhrsw m6, m7 | |
5529 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] | |
5530 pmulhrsw m2, m7 | |
5531 packuswb m6, m2 | |
5532 | |
; m2 / m3: the same byte pairs, starting one / two reference samples later.
5533 palignr m2, m0, m1, 2 | |
5534 palignr m3, m0, m1, 4 | |
5535 | |
5536 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] | |
5537 pmulhrsw m1, m7 | |
5538 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] | |
5539 pmulhrsw m0, m7 | |
5540 packuswb m1, m0 | |
5541 | |
; First eight predicted lines of this pass (two per register).
5542 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5543 | |
5544 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] | |
5545 pmulhrsw m4, m7 | |
5546 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] | |
5547 pmulhrsw m5, m7 | |
5548 packuswb m4, m5 | |
5549 | |
5550 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] | |
5551 pmulhrsw m5, m7 | |
5552 pmaddubsw m2, [r3 + 12 * 16] ; [28] | |
5553 pmulhrsw m2, m7 | |
5554 packuswb m5, m2 | |
5555 | |
5556 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] | |
5557 pmulhrsw m6, m7 | |
5558 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] | |
5559 pmulhrsw m1, m7 | |
5560 packuswb m6, m1 | |
5561 | |
5562 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] | |
5563 pmulhrsw m1, m7 | |
5564 pmaddubsw m3, [r3] ; [16] | |
5565 pmulhrsw m3, m7 | |
5566 packuswb m1, m3 | |
5567 | |
; Remaining eight predicted lines of this pass.
5568 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5569 | |
; Next pass: r0 = r6 + 4 * stride, r6 += 8 * stride, reference advances 8 bytes.
5570 lea r0, [r6 + r1 * 4] | |
5571 lea r6, [r6 + r1 * 8] | |
5572 add r2, 8 | |
5573 dec r4 | |
5574 jnz .loop | |
5575 RET | |
5576 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_28 (SSE4): 16-wide angular intra prediction.
; Uses the same weight sequence as intra_pred_ang16_8, but reads the
; reference samples at r2 without the +32 offset and passes 0 as the second
; TRANSPOSE_STORE_8x8 argument (macro defined outside this view).
; Register use in this routine:
;   r0, r1 - presumably destination pointer and stride (stores are inside
;            the macro)
;   r2     - reference samples
;   r3     - ang_table + 16 * 16, so [r3 + k * 16] is table entry 16 + k
;   r4d    - pass counter: two passes, each 8 reference bytes further on
;   r5     - 3 * stride
;   r6     - copy of the original r0, used to derive the second pass address
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
5577 INIT_XMM sse4 | |
5578 cglobal intra_pred_ang16_28, 3,7,8 | |
5579 lea r3, [ang_table + 16 * 16] | |
5580 mov r4d, 2 | |
5581 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5582 mov r6, r0 | |
5583 mova m7, [pw_1024] | |
5584 | |
5585 .loop: | |
; Interleave every reference byte with its successor so that one pmaddubsw
; produces a two-tap weighted sum per output sample.
5586 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5587 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5588 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
5589 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5590 | |
5591 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] | |
5592 pmulhrsw m4, m7 | |
5593 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] | |
5594 pmulhrsw m2, m7 | |
5595 packuswb m4, m2 | |
5596 | |
5597 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] | |
5598 pmulhrsw m5, m7 | |
5599 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] | |
5600 pmulhrsw m6, m7 | |
5601 packuswb m5, m6 | |
5602 | |
5603 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] | |
5604 pmulhrsw m6, m7 | |
5605 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] | |
5606 pmulhrsw m2, m7 | |
5607 packuswb m6, m2 | |
5608 | |
; m2 / m3: the same byte pairs, starting one / two reference samples later.
5609 palignr m2, m0, m1, 2 | |
5610 palignr m3, m0, m1, 4 | |
5611 | |
5612 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] | |
5613 pmulhrsw m1, m7 | |
5614 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] | |
5615 pmulhrsw m0, m7 | |
5616 packuswb m1, m0 | |
5617 | |
; First eight predicted lines of this pass (two per register).
5618 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5619 | |
5620 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] | |
5621 pmulhrsw m4, m7 | |
5622 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] | |
5623 pmulhrsw m5, m7 | |
5624 packuswb m4, m5 | |
5625 | |
5626 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] | |
5627 pmulhrsw m5, m7 | |
5628 pmaddubsw m2, [r3 + 12 * 16] ; [28] | |
5629 pmulhrsw m2, m7 | |
5630 packuswb m5, m2 | |
5631 | |
5632 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] | |
5633 pmulhrsw m6, m7 | |
5634 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] | |
5635 pmulhrsw m1, m7 | |
5636 packuswb m6, m1 | |
5637 | |
5638 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] | |
5639 pmulhrsw m1, m7 | |
5640 pmaddubsw m3, [r3] ; [16] | |
5641 pmulhrsw m3, m7 | |
5642 packuswb m1, m3 | |
5643 | |
; Remaining eight predicted lines of this pass.
5644 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5645 | |
; Next pass: restart 8 bytes to the right of the original r0 and read the
; reference 8 bytes further on.
5646 lea r0, [r6 + 8] | |
5647 add r2, 8 | |
5648 dec r4 | |
5649 jnz .loop | |
5650 RET | |
5651 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_9 (SSE4): 16-wide angular intra prediction.
; All fifteen weighted lines are built from the same interleaved byte pairs
; (m2) using table entries 2, 4, ..., 30; the sixteenth line is the reference
; shifted by one byte, taken unweighted.
; Register use in this routine:
;   r0, r1 - presumably destination pointer and stride; the stores are done
;            by TRANSPOSE_STORE_8x8, which is defined outside this view
;   r2     - reference samples, advanced by 32 bytes before the loop
;   r3     - ang_table + 16 * 16, so [r3 + k * 16] is table entry 16 + k
;   r4d    - pass counter: two passes, each 8 reference bytes further on
;   r5, r6 - 3 * stride and r0 + 4 * stride
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
5652 INIT_XMM sse4 | |
5653 cglobal intra_pred_ang16_9, 3,7,8 | |
5654 add r2, 32 | |
5655 lea r3, [ang_table + 16 * 16] | |
5656 mov r4d, 2 | |
5657 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5658 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
5659 mova m7, [pw_1024] | |
5660 | |
5661 .loop: | |
; m3 keeps the reference shifted by one byte; m2 becomes the interleaved
; (sample, next sample) pairs fed to pmaddubsw.
5662 movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5663 palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5664 punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5665 | |
5666 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] | |
5667 pmulhrsw m4, m7 | |
5668 pmaddubsw m0, m2, [r3 - 12 * 16] ; [4] | |
5669 pmulhrsw m0, m7 | |
5670 packuswb m4, m0 | |
5671 | |
5672 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] | |
5673 pmulhrsw m5, m7 | |
5674 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] | |
5675 pmulhrsw m6, m7 | |
5676 packuswb m5, m6 | |
5677 | |
5678 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] | |
5679 pmulhrsw m6, m7 | |
5680 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] | |
5681 pmulhrsw m0, m7 | |
5682 packuswb m6, m0 | |
5683 | |
5684 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] | |
5685 pmulhrsw m1, m7 | |
5686 pmaddubsw m0, m2, [r3] ; [16] | |
5687 pmulhrsw m0, m7 | |
5688 packuswb m1, m0 | |
5689 | |
; First eight predicted lines of this pass (two per register).
5690 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5691 | |
5692 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] | |
5693 pmulhrsw m4, m7 | |
5694 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] | |
5695 pmulhrsw m5, m7 | |
5696 packuswb m4, m5 | |
5697 | |
5698 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] | |
5699 pmulhrsw m5, m7 | |
5700 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] | |
5701 pmulhrsw m6, m7 | |
5702 packuswb m5, m6 | |
5703 | |
5704 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] | |
5705 pmulhrsw m6, m7 | |
5706 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] | |
5707 pmulhrsw m1, m7 | |
5708 packuswb m6, m1 | |
5709 | |
5710 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] | |
5711 pmulhrsw m1, m7 | |
5712 packuswb m1, m1 | |
5713 | |
; The last line needs no weighting: the low 8 bytes of the shifted reference
; (m3) go into the high half of m1.
5714 punpcklqdq m1, m3 ; [00] | |
5715 | |
; Remaining eight predicted lines of this pass.
5716 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5717 | |
; Next pass: r0 = r6 + 4 * stride, r6 += 8 * stride, reference advances 8 bytes.
5718 lea r0, [r6 + r1 * 4] | |
5719 lea r6, [r6 + r1 * 8] | |
5720 add r2, 8 | |
5721 dec r4 | |
5722 jnz .loop | |
5723 RET | |
5724 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_27 (SSE4): 16-wide angular intra prediction.
; Uses the same weight sequence as intra_pred_ang16_9 (table entries 2, 4,
; ..., 30 plus one unweighted line), but reads the reference at r2 without
; the +32 offset. The second group of eight lines is written directly as
; eight 8-byte rows instead of through TRANSPOSE_STORE_8x8.
; Register use in this routine:
;   r0     - destination pointer
;   r1     - destination stride
;   r2     - reference samples
;   r3     - ang_table + 16 * 16, so [r3 + k * 16] is table entry 16 + k
;   r4d    - pass counter: two passes, each 8 reference bytes further on
;   r5     - 3 * stride
;   r6     - copy of the original r0, used to derive the second pass address
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
5725 INIT_XMM sse4 | |
5726 cglobal intra_pred_ang16_27, 3,7,8 | |
5727 lea r3, [ang_table + 16 * 16] | |
5728 mov r4d, 2 | |
5729 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5730 mov r6, r0 | |
5731 mova m7, [pw_1024] | |
5732 | |
5733 .loop: | |
; m2 keeps the reference shifted by one byte; m3 becomes the interleaved
; (sample, next sample) pairs fed to pmaddubsw.
5734 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5735 palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
5736 punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
5737 | |
5738 pmaddubsw m4, m3, [r3 - 14 * 16] ; [2] | |
5739 pmulhrsw m4, m7 | |
5740 pmaddubsw m0, m3, [r3 - 12 * 16] ; [4] | |
5741 pmulhrsw m0, m7 | |
5742 packuswb m4, m0 | |
5743 | |
5744 pmaddubsw m5, m3, [r3 - 10 * 16] ; [6] | |
5745 pmulhrsw m5, m7 | |
5746 pmaddubsw m6, m3, [r3 - 8 * 16] ; [8] | |
5747 pmulhrsw m6, m7 | |
5748 packuswb m5, m6 | |
5749 | |
5750 pmaddubsw m6, m3, [r3 - 6 * 16] ; [10] | |
5751 pmulhrsw m6, m7 | |
5752 pmaddubsw m0, m3, [r3 - 4 * 16] ; [12] | |
5753 pmulhrsw m0, m7 | |
5754 packuswb m6, m0 | |
5755 | |
5756 pmaddubsw m1, m3, [r3 - 2 * 16] ; [14] | |
5757 pmulhrsw m1, m7 | |
5758 pmaddubsw m0, m3, [r3] ; [16] | |
5759 pmulhrsw m0, m7 | |
5760 packuswb m1, m0 | |
5761 | |
; First eight predicted lines of this pass (two per register).
5762 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5763 | |
5764 pmaddubsw m4, m3, [r3 + 2 * 16] ; [18] | |
5765 pmulhrsw m4, m7 | |
5766 pmaddubsw m5, m3, [r3 + 4 * 16] ; [20] | |
5767 pmulhrsw m5, m7 | |
5768 packuswb m4, m5 | |
5769 | |
5770 pmaddubsw m5, m3, [r3 + 6 * 16] ; [22] | |
5771 pmulhrsw m5, m7 | |
5772 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] | |
5773 pmulhrsw m6, m7 | |
5774 packuswb m5, m6 | |
5775 | |
5776 pmaddubsw m6, m3, [r3 + 10 * 16] ; [26] | |
5777 pmulhrsw m6, m7 | |
5778 pmaddubsw m1, m3, [r3 + 12 * 16] ; [28] | |
5779 pmulhrsw m1, m7 | |
5780 packuswb m6, m1 | |
5781 | |
5782 pmaddubsw m1, m3, [r3 + 14 * 16] ; [30] | |
5783 pmulhrsw m1, m7 | |
5784 packuswb m1, m1 | |
5785 | |
; Write eight 8-byte rows directly: low/high halves of m4, m5, m6, then the
; low half of m1, and finally the unweighted shifted reference (m2).
5786 movh [r0 ], m4 | |
5787 movhps [r0 + r1 ], m4 | |
5788 movh [r0 + r1 * 2], m5 | |
5789 movhps [r0 + r5 ], m5 | |
5790 lea r0, [r0 + r1 * 4] | |
5791 movh [r0 ], m6 | |
5792 movhps [r0 + r1 ], m6 | |
5793 movh [r0 + r1 * 2], m1 | |
5794 movh [r0 + r5 ], m2 | |
5795 | |
; Next pass: restart 8 bytes to the right of the original r0 and read the
; reference 8 bytes further on.
5796 lea r0, [r6 + 8] | |
5797 add r2, 8 | |
5798 dec r4 | |
5799 jnz .loop | |
5800 RET | |
5801 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_10 (SSE4): fills each of the 16 destination rows with a
; single replicated byte taken from the 16 samples at [r2 + 1 + 32].
; Row k (k = 1..15) is byte k of that vector broadcast across 16 bytes;
; row 0 is byte 0 broadcast, optionally adjusted by the filter step below.
; Register use in this routine:
;   r0 - destination pointer
;   r1 - destination stride
;   r2 - reference samples
;   r3 - reused as a running row pointer (the incoming fourth argument is
;        not read)
;   r4 - fifth argument, the filter flag: non-zero enables the row 0 filter
;   r5 - 3 * stride
;   m7 - all-zero pshufb mask, i.e. "broadcast byte 0 to every lane"
;-----------------------------------------------------------------------------
5802 INIT_XMM sse4 | |
5803 cglobal intra_pred_ang16_10, 5,6,8 | |
5804 lea r5, [r1 * 3] | |
5805 pxor m7, m7 | |
5806 | |
; palignr brings byte k down to lane 0, pshufb with the zero mask replicates it.
5807 movu m0, [r2 + 1 + 32] | |
5808 palignr m1, m0, 1 | |
5809 pshufb m1, m7 | |
5810 palignr m2, m0, 2 | |
5811 pshufb m2, m7 | |
5812 palignr m3, m0, 3 | |
5813 pshufb m3, m7 | |
5814 palignr m4, m0, 4 | |
5815 pshufb m4, m7 | |
5816 palignr m5, m0, 5 | |
5817 pshufb m5, m7 | |
5818 palignr m6, m0, 6 | |
5819 pshufb m6, m7 | |
5820 | |
; Rows 1..6 (row 0 is written last, after the optional filter).
5821 movu [r0 + r1], m1 | |
5822 movu [r0 + r1 * 2], m2 | |
5823 movu [r0 + r5], m3 | |
5824 lea r3, [r0 + r1 * 4] | |
5825 movu [r3], m4 | |
5826 movu [r3 + r1], m5 | |
5827 movu [r3 + r1 * 2], m6 | |
5828 | |
5829 palignr m1, m0, 7 | |
5830 pshufb m1, m7 | |
; movhlps moves the upper 8 bytes down, so lane 0 holds byte 8.
5831 movhlps m2, m0 | |
5832 pshufb m2, m7 | |
5833 palignr m3, m0, 9 | |
5834 pshufb m3, m7 | |
5835 palignr m4, m0, 10 | |
5836 pshufb m4, m7 | |
5837 palignr m5, m0, 11 | |
5838 pshufb m5, m7 | |
5839 palignr m6, m0, 12 | |
5840 pshufb m6, m7 | |
5841 | |
; Rows 7..12.
5842 movu [r3 + r5], m1 | |
5843 lea r3, [r3 + r1 * 4] | |
5844 movu [r3], m2 | |
5845 movu [r3 + r1], m3 | |
5846 movu [r3 + r1 * 2], m4 | |
5847 movu [r3 + r5], m5 | |
5848 lea r3, [r3 + r1 * 4] | |
5849 movu [r3], m6 | |
5850 | |
5851 palignr m1, m0, 13 | |
5852 pshufb m1, m7 | |
5853 palignr m2, m0, 14 | |
5854 pshufb m2, m7 | |
5855 palignr m3, m0, 15 | |
5856 pshufb m3, m7 | |
; m0 becomes byte 0 replicated: the unfiltered row 0.
5857 pshufb m0, m7 | |
5858 | |
; Rows 13..15.
5859 movu [r3 + r1], m1 | |
5860 movu [r3 + r1 * 2], m2 | |
5861 movu [r3 + r5], m3 | |
5862 | |
5863 ; filter | |
; Test the whole 32-bit flag rather than only its low 16 bits, so any
; non-zero int enables the filter; behaviour for 0 and 1 is unchanged.
5864 test r4d, r4d | |
5865 jz .quit | |
; row0[x] = saturate_u8(row0 + ((byte [r2 + 1 + x] - byte [r2]) >> 1)),
; computed in 16-bit lanes; packuswb performs the unsigned saturation.
5866 pmovzxbw m0, m0 | |
5867 mova m1, m0 | |
5868 movu m2, [r2] | |
5869 movu m3, [r2 + 1] | |
5870 | |
5871 pshufb m2, m7 | |
5872 pmovzxbw m2, m2 | |
5873 movhlps m4, m3 | |
5874 pmovzxbw m3, m3 | |
5875 pmovzxbw m4, m4 | |
5876 psubw m3, m2 | |
5877 psubw m4, m2 | |
5878 psraw m3, 1 | |
5879 psraw m4, 1 | |
5880 paddw m0, m3 | |
5881 paddw m1, m4 | |
5882 packuswb m0, m1 | |
5883 .quit: | |
; Row 0, filtered or not.
5884 movu [r0], m0 | |
5885 RET | |
5886 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_26 (SSE4): copies the 16 samples at [r2 + 1] into each of
; the 16 destination rows. When the filter flag (fifth argument) is non-zero,
; the first byte of every row is then overwritten with a filtered value.
; Register use in this routine:
;   r0             - destination pointer
;   r1             - destination stride
;   r2             - reference samples
;   r4             - 3 * stride (the incoming flag is saved in "bfilter" first)
;   r3, r5, r6     - r0 + 4, 8 and 12 strides
;   bfilter        - saved filter flag: r7w on x86-64, a stack dword on x86-32
; NOTE(review): the x86-64 path tests only the low 16 bits of the flag (r7w)
; while the x86-32 path tests the full dword - confirm this is intended.
;-----------------------------------------------------------------------------
5887 INIT_XMM sse4 | |
5888 %if ARCH_X86_64 == 1 | |
5889 cglobal intra_pred_ang16_26, 3,8,5 | |
5890 mov r7, r4mp | |
5891 %define bfilter r7w | |
5892 %else | |
5893 cglobal intra_pred_ang16_26, 5,7,5,0-4 | |
5894 %define bfilter dword[rsp] | |
5895 mov bfilter, r4 | |
5896 %endif | |
5897 movu m0, [r2 + 1] | |
5898 | |
5899 lea r4, [r1 * 3] | |
5900 lea r3, [r0 + r1 * 4] | |
5901 lea r5, [r3 + r1 * 4] | |
5902 lea r6, [r5 + r1 * 4] | |
5903 | |
; All 16 rows receive the same 16 bytes.
5904 movu [r0], m0 | |
5905 movu [r0 + r1], m0 | |
5906 movu [r0 + r1 * 2], m0 | |
5907 movu [r0 + r4], m0 | |
5908 movu [r3], m0 | |
5909 movu [r3 + r1], m0 | |
5910 movu [r3 + r1 * 2], m0 | |
5911 movu [r3 + r4], m0 | |
5912 movu [r5], m0 | |
5913 movu [r5 + r1], m0 | |
5914 movu [r5 + r1 * 2], m0 | |
5915 movu [r5 + r4], m0 | |
5916 | |
5917 movu [r6], m0 | |
5918 movu [r6 + r1], m0 | |
5919 movu [r6 + r1 * 2], m0 | |
5920 movu [r6 + r4], m0 | |
5921 | |
5922 ; filter | |
5923 cmp bfilter, byte 0 | |
5924 jz .quit | |
5925 | |
; col0[y] = saturate_u8(byte [r2 + 1] + ((byte [r2 + 33 + y] - byte [r2]) >> 1)),
; computed in 16-bit lanes; m4 = 0 is the "broadcast byte 0" pshufb mask.
5926 pxor m4, m4 | |
5927 pshufb m0, m4 | |
5928 pmovzxbw m0, m0 | |
5929 mova m1, m0 | |
5930 movu m2, [r2 + 32] | |
; Byte 0 of m2 is replaced by the byte at [r2] before it is broadcast.
5931 pinsrb m2, [r2], 0 | |
5932 movu m3, [r2 + 1 + 32] | |
5933 | |
5934 pshufb m2, m4 | |
5935 pmovzxbw m2, m2 | |
5936 movhlps m4, m3 | |
5937 pmovzxbw m3, m3 | |
5938 pmovzxbw m4, m4 | |
5939 psubw m3, m2 | |
5940 psubw m4, m2 | |
5941 psraw m3, 1 | |
5942 psraw m4, 1 | |
5943 paddw m0, m3 | |
5944 paddw m1, m4 | |
5945 packuswb m0, m1 | |
5946 | |
; Scatter the 16 filtered bytes down the first column, one per row.
5947 pextrb [r0], m0, 0 | |
5948 pextrb [r0 + r1], m0, 1 | |
5949 pextrb [r0 + r1 * 2], m0, 2 | |
5950 pextrb [r0 + r4], m0, 3 | |
5951 pextrb [r3], m0, 4 | |
5952 pextrb [r3 + r1], m0, 5 | |
5953 pextrb [r3 + r1 * 2], m0, 6 | |
5954 pextrb [r3 + r4], m0, 7 | |
5955 pextrb [r5], m0, 8 | |
5956 pextrb [r5 + r1], m0, 9 | |
5957 pextrb [r5 + r1 * 2], m0, 10 | |
5958 pextrb [r5 + r4], m0, 11 | |
5959 pextrb [r6], m0, 12 | |
5960 pextrb [r6 + r1], m0, 13 | |
5961 pextrb [r6 + r1 * 2], m0, 14 | |
5962 pextrb [r6 + r4], m0, 15 | |
5963 .quit: | |
5964 RET | |
5965 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_11 (SSE4): 16-wide angular intra prediction, written as
; two unrolled halves instead of a loop.
; Each half weights one interleaved byte-pair vector (m3) with table entries
; 30, 28, ..., 2 and appends the unweighted reference (m2) as the last line.
; The first half reads [r2 + 32] with byte 0 replaced by the byte at [r2];
; the second half reads [r2 + 40].
; Register use in this routine:
;   r0, r1 - presumably destination pointer and stride; the stores are done
;            by TRANSPOSE_STORE_8x8, which is defined outside this view
;   r2     - reference samples
;   r3     - ang_table + 16 * 16, so [r3 + k * 16] is table entry 16 + k
;   r5, r6 - 3 * stride and r0 + 4 * stride
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
5966 INIT_XMM sse4 | |
5967 cglobal intra_pred_ang16_11, 3,7,8 | |
5968 lea r3, [ang_table + 16 * 16] | |
5969 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5970 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
5971 mova m7, [pw_1024] | |
5972 | |
; ---- first half ----
5973 movu m3, [r2 + 32] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
5974 pinsrb m3, [r2], 0 | |
; m2 keeps the uninterleaved samples for the final, unweighted line.
5975 mova m2, m3 | |
5976 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5977 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
5978 | |
5979 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] | |
5980 pmulhrsw m4, m7 | |
5981 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] | |
5982 pmulhrsw m0, m7 | |
5983 packuswb m4, m0 | |
5984 | |
5985 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] | |
5986 pmulhrsw m5, m7 | |
5987 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] | |
5988 pmulhrsw m6, m7 | |
5989 packuswb m5, m6 | |
5990 | |
5991 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] | |
5992 pmulhrsw m6, m7 | |
5993 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] | |
5994 pmulhrsw m0, m7 | |
5995 packuswb m6, m0 | |
5996 | |
5997 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] | |
5998 pmulhrsw m1, m7 | |
5999 pmaddubsw m0, m3, [r3] ; [16] | |
6000 pmulhrsw m0, m7 | |
6001 packuswb m1, m0 | |
6002 | |
6003 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6004 | |
6005 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] | |
6006 pmulhrsw m4, m7 | |
6007 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] | |
6008 pmulhrsw m5, m7 | |
6009 packuswb m4, m5 | |
6010 | |
6011 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] | |
6012 pmulhrsw m5, m7 | |
6013 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] | |
6014 pmulhrsw m6, m7 | |
6015 packuswb m5, m6 | |
6016 | |
6017 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] | |
6018 pmulhrsw m6, m7 | |
6019 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] | |
6020 pmulhrsw m1, m7 | |
6021 packuswb m6, m1 | |
6022 | |
6023 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] | |
6024 pmulhrsw m1, m7 | |
6025 packuswb m1, m1 | |
; High half of m1 = low 8 bytes of the unweighted samples.
6026 punpcklqdq m1, m2 ;[00] | |
6027 | |
6028 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6029 | |
; ---- second half: r0 = r6 + 4 * stride, r6 += 8 * stride ----
6030 lea r0, [r6 + r1 * 4] | |
6031 lea r6, [r6 + r1 * 8] | |
6032 | |
; 8 bytes further into the reference; no byte 0 substitution this time.
6033 movu m3, [r2 + 40] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
6034 mova m2, m3 | |
6035 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6036 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6037 | |
6038 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] | |
6039 pmulhrsw m4, m7 | |
6040 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] | |
6041 pmulhrsw m0, m7 | |
6042 packuswb m4, m0 | |
6043 | |
6044 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] | |
6045 pmulhrsw m5, m7 | |
6046 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] | |
6047 pmulhrsw m6, m7 | |
6048 packuswb m5, m6 | |
6049 | |
6050 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] | |
6051 pmulhrsw m6, m7 | |
6052 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] | |
6053 pmulhrsw m0, m7 | |
6054 packuswb m6, m0 | |
6055 | |
6056 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] | |
6057 pmulhrsw m1, m7 | |
6058 pmaddubsw m0, m3, [r3] ; [16] | |
6059 pmulhrsw m0, m7 | |
6060 packuswb m1, m0 | |
6061 | |
6062 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6063 | |
6064 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] | |
6065 pmulhrsw m4, m7 | |
6066 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] | |
6067 pmulhrsw m5, m7 | |
6068 packuswb m4, m5 | |
6069 | |
6070 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] | |
6071 pmulhrsw m5, m7 | |
6072 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] | |
6073 pmulhrsw m6, m7 | |
6074 packuswb m5, m6 | |
6075 | |
6076 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] | |
6077 pmulhrsw m6, m7 | |
6078 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] | |
6079 pmulhrsw m1, m7 | |
6080 packuswb m6, m1 | |
6081 | |
6082 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] | |
6083 pmulhrsw m1, m7 | |
6084 packuswb m1, m1 | |
6085 punpcklqdq m1, m2 ;[00] | |
6086 | |
6087 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6088 RET | |
6089 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_25 (SSE4): 16-wide angular intra prediction.
; Uses the same weight sequence as intra_pred_ang16_11 (table entries 30, 28,
; ..., 2 plus one unweighted line), but reads the reference directly at r2
; and runs as a two-pass loop. The second group of eight lines is written
; directly as eight 8-byte rows instead of through TRANSPOSE_STORE_8x8.
; Register use in this routine:
;   r0     - destination pointer
;   r1     - destination stride
;   r2     - reference samples
;   r3     - ang_table + 16 * 16, so [r3 + k * 16] is table entry 16 + k
;   r4d    - pass counter: two passes, each 8 reference bytes further on
;   r5     - 3 * stride
;   r6     - copy of the original r0, used to derive the second pass address
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
6090 INIT_XMM sse4 | |
6091 cglobal intra_pred_ang16_25, 3,7,8 | |
6092 lea r3, [ang_table + 16 * 16] | |
6093 mov r4d, 2 | |
6094 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
6095 mov r6, r0 | |
6096 mova m7, [pw_1024] | |
6097 | |
6098 .loop: | |
6099 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
; m2 keeps the uninterleaved samples for the final, unweighted row.
6100 mova m2, m3 | |
6101 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6102 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6103 | |
6104 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] | |
6105 pmulhrsw m4, m7 | |
6106 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] | |
6107 pmulhrsw m0, m7 | |
6108 packuswb m4, m0 | |
6109 | |
6110 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] | |
6111 pmulhrsw m5, m7 | |
6112 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] | |
6113 pmulhrsw m6, m7 | |
6114 packuswb m5, m6 | |
6115 | |
6116 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] | |
6117 pmulhrsw m6, m7 | |
6118 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] | |
6119 pmulhrsw m0, m7 | |
6120 packuswb m6, m0 | |
6121 | |
6122 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] | |
6123 pmulhrsw m1, m7 | |
6124 pmaddubsw m0, m3, [r3] ; [16] | |
6125 pmulhrsw m0, m7 | |
6126 packuswb m1, m0 | |
6127 | |
; First eight predicted lines of this pass (two per register).
6128 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
6129 | |
6130 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] | |
6131 pmulhrsw m4, m7 | |
6132 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] | |
6133 pmulhrsw m5, m7 | |
6134 packuswb m4, m5 | |
6135 | |
6136 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] | |
6137 pmulhrsw m5, m7 | |
6138 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] | |
6139 pmulhrsw m6, m7 | |
6140 packuswb m5, m6 | |
6141 | |
6142 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] | |
6143 pmulhrsw m6, m7 | |
6144 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] | |
6145 pmulhrsw m1, m7 | |
6146 packuswb m6, m1 | |
6147 | |
6148 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] | |
6149 pmulhrsw m1, m7 | |
6150 packuswb m1, m1 | |
6151 | |
; Write eight 8-byte rows directly: low/high halves of m4, m5, m6, then the
; low half of m1, and finally the unweighted reference samples (m2).
6152 movh [r0 ], m4 | |
6153 movhps [r0 + r1 ], m4 | |
6154 movh [r0 + r1 * 2], m5 | |
6155 movhps [r0 + r5 ], m5 | |
6156 lea r0, [r0 + r1 * 4] | |
6157 movh [r0 ], m6 | |
6158 movhps [r0 + r1 ], m6 | |
6159 movh [r0 + r1 * 2], m1 | |
6160 movh [r0 + r5 ], m2 | |
6161 | |
; Next pass: restart 8 bytes to the right of the original r0 and read the
; reference 8 bytes further on.
6162 lea r0, [r6 + 8] | |
6163 add r2, 8 | |
6164 dec r4 | |
6165 jnz .loop | |
6166 RET | |
6167 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_12 (SSE4): 16-wide angular intra prediction, written as
; two unrolled halves. Weights follow the sequence 27, 22, 17, 12, 7, 2, 29,
; 24, 19, 14, 9, 4, 31, 26, 21, 16; each time the sequence wraps upwards the
; byte-pair vector is shifted so that it starts one sample earlier.
; Register use in this routine:
;   r0, r1 - presumably destination pointer and stride; the stores are done
;            by TRANSPOSE_STORE_8x8, which is defined outside this view
;   r2     - reference samples
;   r4     - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k
;   r5, r6 - 3 * stride and r0 + 4 * stride
;   m2     - samples from [r2] rearranged by the c_mode16_12 shuffle; its top
;            bytes supply the extra leading samples (the shuffle table is
;            defined outside this view)
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
6168 INIT_XMM sse4 | |
6169 cglobal intra_pred_ang16_12, 4,7,8 | |
6170 lea r4, [ang_table + 16 * 16] | |
6171 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
6172 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
6173 mova m7, [pw_1024] | |
6174 | |
; ---- first half ----
; Main samples: [r2 + 32] with byte 0 replaced by the byte at [r2].
6175 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
6176 pinsrb m3, [r2], 0 | |
6177 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
6178 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
6179 movu m2, [r2] | |
6180 pshufb m2, [c_mode16_12] | |
6181 | |
6182 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6183 | |
6184 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] | |
6185 pmulhrsw m4, m7 | |
6186 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] | |
6187 pmulhrsw m1, m7 | |
6188 packuswb m4, m1 | |
6189 | |
6190 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] | |
6191 pmulhrsw m5, m7 | |
6192 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] | |
6193 pmulhrsw m6, m7 | |
6194 packuswb m5, m6 | |
6195 | |
6196 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] | |
6197 pmulhrsw m6, m7 | |
6198 pmaddubsw m0, [r4 - 14 * 16] ; [2] | |
6199 pmulhrsw m0, m7 | |
6200 packuswb m6, m0 | |
6201 | |
; Byte 15 of m2 becomes byte 0 of m3; the doubled samples move up one byte,
; giving byte pairs that start one sample earlier.
6202 palignr m3, m2, 15 | |
6203 | |
6204 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
6205 pmulhrsw m1, m7 | |
6206 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6207 pmulhrsw m0, m7 | |
6208 packuswb m1, m0 | |
6209 | |
6210 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6211 | |
6212 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
6213 pmulhrsw m4, m7 | |
6214 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
6215 pmulhrsw m5, m7 | |
6216 packuswb m4, m5 | |
6217 | |
6218 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
6219 pmulhrsw m5, m7 | |
6220 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
6221 pmulhrsw m6, m7 | |
6222 packuswb m5, m6 | |
6223 | |
; Bytes 14..15 of m2 are prepended: the pairs start one further sample earlier.
6224 palignr m3, m2, 14 | |
6225 | |
6226 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
6227 pmulhrsw m6, m7 | |
6228 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
6229 pmulhrsw m1, m7 | |
6230 packuswb m6, m1 | |
6231 | |
6232 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
6233 pmulhrsw m1, m7 | |
6234 pmaddubsw m3, [r4] ; [16] | |
6235 pmulhrsw m3, m7 | |
6236 packuswb m1, m3 | |
6237 | |
6238 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6239 | |
; ---- second half: r0 = r6 + 4 * stride, r6 += 8 * stride ----
6240 lea r0, [r6 + r1 * 4] | |
6241 lea r6, [r6 + r1 * 8] | |
6242 | |
; Pairs are built from the upper 8 samples of [r2 + 1 + 32]; the lower 8
; samples are parked in the high half of m2 to serve as the leading samples
; for the two later shifts.
6243 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6244 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
6245 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
6246 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] | |
6247 | |
6248 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] | |
6249 pmulhrsw m4, m7 | |
6250 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
6251 pmulhrsw m5, m7 | |
6252 packuswb m4, m5 | |
6253 | |
6254 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] | |
6255 pmulhrsw m5, m7 | |
6256 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
6257 pmulhrsw m6, m7 | |
6258 packuswb m5, m6 | |
6259 | |
6260 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] | |
6261 pmulhrsw m6, m7 | |
6262 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] | |
6263 pmulhrsw m0, m7 | |
6264 packuswb m6, m0 | |
6265 | |
6266 palignr m3, m2, 14 | |
6267 | |
6268 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
6269 pmulhrsw m1, m7 | |
6270 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6271 pmulhrsw m0, m7 | |
6272 packuswb m1, m0 | |
6273 | |
6274 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6275 | |
6276 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
6277 pmulhrsw m4, m7 | |
6278 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
6279 pmulhrsw m5, m7 | |
6280 packuswb m5, m6 | |
6281 | |
6282 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
6283 pmulhrsw m5, m7 | |
6284 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
6285 pmulhrsw m6, m7 | |
6286 packuswb m5, m6 | |
6287 | |
; Move m2 up one byte so the next earlier sample pair sits in bytes 14..15.
6288 pslldq m2, 1 | |
6289 palignr m3, m2, 14 | |
6290 | |
6291 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
6292 pmulhrsw m6, m7 | |
6293 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
6294 pmulhrsw m1, m7 | |
6295 packuswb m6, m1 | |
6296 | |
6297 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
6298 pmulhrsw m1, m7 | |
6299 pmaddubsw m3, [r4] ; [16] | |
6300 pmulhrsw m3, m7 | |
6301 packuswb m1, m3 | |
6302 | |
6303 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6304 RET | |
6305 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_24 (SSE4): 16-wide angular intra prediction, written as
; two unrolled halves. Uses the same weight sequence and c_mode16_12 shuffle
; as intra_pred_ang16_12, with the roles of the two reference areas swapped:
; the main samples come from [r2] and the shuffled leading samples from
; [r2 + 32]. TRANSPOSE_STORE_8x8 is invoked with 0 as its second argument.
; Register use in this routine:
;   r0, r1 - presumably destination pointer and stride; the stores are done
;            by TRANSPOSE_STORE_8x8, which is defined outside this view
;   r2     - reference samples
;   r4     - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k
;   r5     - 3 * stride
;   r6     - copy of the original r0, used to derive the second half address
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
6306 INIT_XMM sse4 | |
6307 cglobal intra_pred_ang16_24, 4,7,8 | |
6308 lea r4, [ang_table + 16 * 16] | |
6309 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
6310 mov r6, r0 | |
6311 mova m7, [pw_1024] | |
6312 | |
; ---- first half ----
6313 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
6314 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
6315 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
6316 movu m2, [r2 + 32] | |
6317 pshufb m2, [c_mode16_12] | |
6318 | |
6319 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6320 | |
6321 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] | |
6322 pmulhrsw m4, m7 | |
6323 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] | |
6324 pmulhrsw m1, m7 | |
6325 packuswb m4, m1 | |
6326 | |
6327 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] | |
6328 pmulhrsw m5, m7 | |
6329 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] | |
6330 pmulhrsw m6, m7 | |
6331 packuswb m5, m6 | |
6332 | |
6333 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] | |
6334 pmulhrsw m6, m7 | |
6335 pmaddubsw m0, [r4 - 14 * 16] ; [2] | |
6336 pmulhrsw m0, m7 | |
6337 packuswb m6, m0 | |
6338 | |
; Byte 15 of m2 becomes byte 0 of m3; the doubled samples move up one byte,
; giving byte pairs that start one sample earlier.
6339 palignr m3, m2, 15 | |
6340 | |
6341 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
6342 pmulhrsw m1, m7 | |
6343 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6344 pmulhrsw m0, m7 | |
6345 packuswb m1, m0 | |
6346 | |
6347 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
6348 | |
6349 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
6350 pmulhrsw m4, m7 | |
6351 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
6352 pmulhrsw m5, m7 | |
6353 packuswb m4, m5 | |
6354 | |
6355 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
6356 pmulhrsw m5, m7 | |
6357 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
6358 pmulhrsw m6, m7 | |
6359 packuswb m5, m6 | |
6360 | |
; Bytes 14..15 of m2 are prepended: the pairs start one further sample earlier.
6361 palignr m3, m2, 14 | |
6362 | |
6363 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
6364 pmulhrsw m6, m7 | |
6365 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
6366 pmulhrsw m1, m7 | |
6367 packuswb m6, m1 | |
6368 | |
6369 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
6370 pmulhrsw m1, m7 | |
6371 pmaddubsw m3, [r4] ; [16] | |
6372 pmulhrsw m3, m7 | |
6373 packuswb m1, m3 | |
6374 | |
6375 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
6376 | |
; ---- second half: restart 8 bytes to the right of the original r0 ----
6377 lea r0, [r6 + 8] | |
6378 | |
; Pairs are built from the upper 8 samples of [r2 + 1]; the lower 8 samples
; are parked in the high half of m2 to serve as the leading samples for the
; two later shifts.
6379 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6380 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
6381 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
6382 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] | |
6383 | |
6384 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] | |
6385 pmulhrsw m4, m7 | |
6386 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
6387 pmulhrsw m5, m7 | |
6388 packuswb m4, m5 | |
6389 | |
6390 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] | |
6391 pmulhrsw m5, m7 | |
6392 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
6393 pmulhrsw m6, m7 | |
6394 packuswb m5, m6 | |
6395 | |
6396 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] | |
6397 pmulhrsw m6, m7 | |
6398 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] | |
6399 pmulhrsw m0, m7 | |
6400 packuswb m6, m0 | |
6401 | |
6402 palignr m3, m2, 14 | |
6403 | |
6404 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
6405 pmulhrsw m1, m7 | |
6406 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6407 pmulhrsw m0, m7 | |
6408 packuswb m1, m0 | |
6409 | |
6410 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
6411 | |
6412 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
6413 pmulhrsw m4, m7 | |
6414 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
6415 pmulhrsw m5, m7 | |
6416 packuswb m4, m5 | |
6417 | |
6418 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
6419 pmulhrsw m5, m7 | |
6420 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
6421 pmulhrsw m6, m7 | |
6422 packuswb m5, m6 | |
6423 | |
; Move m2 up one byte so the next earlier sample pair sits in bytes 14..15.
6424 pslldq m2, 1 | |
6425 palignr m3, m2, 14 | |
6426 | |
6427 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
6428 pmulhrsw m6, m7 | |
6429 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
6430 pmulhrsw m1, m7 | |
6431 packuswb m6, m1 | |
6432 | |
6433 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
6434 pmulhrsw m1, m7 | |
6435 pmaddubsw m3, [r4] ; [16] | |
6436 pmulhrsw m3, m7 | |
6437 packuswb m1, m3 | |
6438 | |
6439 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
6440 RET | |
6441 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_13 (SSE4): 16-wide angular intra prediction, written as
; two unrolled halves. Weights follow the sequence 23, 14, 5, 28, 19, 10, 1,
; 24, 15, 6, 29, 20, 11, 2, 25, 16; each time the sequence wraps upwards the
; byte-pair vector is shifted so that it starts one sample earlier.
; Register use in this routine:
;   r0, r1 - presumably destination pointer and stride; the stores are done
;            by TRANSPOSE_STORE_8x8, which is defined outside this view
;   r2     - reference samples
;   r4     - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k
;   r5, r6 - 3 * stride and r0 + 4 * stride
;   m2     - samples from [r2] rearranged by the c_mode16_13 shuffle; its top
;            bytes supply the extra leading samples (the shuffle table is
;            defined outside this view)
;   m7     - [pw_1024], multiplier for the pmulhrsw after each pmaddubsw
;            (a rounded >> 5 if pw_1024 holds words of 1024 - TODO confirm)
;-----------------------------------------------------------------------------
6442 INIT_XMM sse4 | |
6443 cglobal intra_pred_ang16_13, 4,7,8 | |
6444 lea r4, [ang_table + 16 * 16] | |
6445 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
6446 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
6447 mova m7, [pw_1024] | |
6448 | |
; ---- first half ----
; Main samples: [r2 + 32] with byte 0 replaced by the byte at [r2].
6449 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
6450 pinsrb m3, [r2], 0 | |
6451 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
6452 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
6453 movu m2, [r2] | |
6454 pshufb m2, [c_mode16_13] | |
6455 | |
6456 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6457 | |
6458 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] | |
6459 pmulhrsw m4, m7 | |
6460 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] | |
6461 pmulhrsw m0, m7 | |
6462 packuswb m4, m0 | |
6463 | |
6464 pmaddubsw m5, [r4 - 11 * 16] ; [05] | |
6465 pmulhrsw m5, m7 | |
6466 | |
; Byte 15 of m2 becomes byte 0 of m3; the doubled samples move up one byte,
; giving byte pairs that start one sample earlier.
6467 palignr m3, m2, 15 | |
6468 | |
6469 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
6470 pmulhrsw m6, m7 | |
6471 packuswb m5, m6 | |
6472 | |
6473 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
6474 pmulhrsw m6, m7 | |
6475 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
6476 pmulhrsw m0, m7 | |
6477 packuswb m6, m0 | |
6478 | |
6479 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
6480 pmulhrsw m1, m7 | |
6481 | |
; Bytes 14..15 of m2 are prepended: the pairs start one further sample earlier.
6482 palignr m3, m2, 14 | |
6483 | |
6484 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6485 pmulhrsw m0, m7 | |
6486 packuswb m1, m0 | |
6487 | |
6488 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6489 | |
6490 pmaddubsw m4, m3, [r4 - 16] ; [15] | |
6491 pmulhrsw m4, m7 | |
6492 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
6493 pmulhrsw m5, m7 | |
6494 packuswb m4, m5 | |
6495 | |
; From here on each step first moves m2 up one byte, so that the next earlier
; sample pair sits in bytes 14..15 before it is prepended to m3.
6496 pslldq m2, 1 | |
6497 palignr m3, m2, 14 | |
6498 | |
6499 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
6500 pmulhrsw m5, m7 | |
6501 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
6502 pmulhrsw m6, m7 | |
6503 packuswb m5, m6 | |
6504 | |
6505 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
6506 pmulhrsw m6, m7 | |
6507 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
6508 pmulhrsw m1, m7 | |
6509 packuswb m6, m1 | |
6510 | |
6511 pslldq m2, 1 | |
6512 palignr m3, m2, 14 | |
6513 | |
6514 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
6515 pmulhrsw m1, m7 | |
6516 pmaddubsw m3, [r4] ; [16] | |
6517 pmulhrsw m3, m7 | |
6518 packuswb m1, m3 | |
6519 | |
6520 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6521 | |
; ---- second half: r0 = r6 + 4 * stride, r6 += 8 * stride ----
6522 lea r0, [r6 + r1 * 4] | |
6523 lea r6, [r6 + r1 * 8] | |
6524 | |
; Pairs are built from the upper 8 samples of [r2 + 1 + 32]; the lower 8
; samples are parked in the high half of m2 to serve as the leading samples
; for the later shifts.
6525 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6526 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
6527 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
6528 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] | |
6529 | |
6530 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] | |
6531 pmulhrsw m4, m7 | |
6532 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
6533 pmulhrsw m5, m7 | |
6534 packuswb m4, m5 | |
6535 | |
6536 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
6537 pmulhrsw m5, m7 | |
6538 | |
6539 palignr m3, m2, 14 | |
6540 | |
6541 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
6542 pmulhrsw m6, m7 | |
6543 packuswb m5, m6 | |
6544 | |
6545 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
6546 pmulhrsw m6, m7 | |
6547 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
6548 pmulhrsw m0, m7 | |
6549 packuswb m6, m0 | |
6550 | |
6551 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
6552 pmulhrsw m1, m7 | |
6553 | |
6554 pslldq m2, 1 | |
6555 palignr m3, m2, 14 | |
6556 | |
6557 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6558 pmulhrsw m0, m7 | |
6559 packuswb m1, m0 | |
6560 | |
6561 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6562 | |
6563 pmaddubsw m4, m3, [r4 - 16] ; [15] | |
6564 pmulhrsw m4, m7 | |
6565 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
6566 pmulhrsw m5, m7 | |
6567 packuswb m4, m5 | |
6568 | |
6569 pslldq m2, 1 | |
6570 palignr m3, m2, 14 | |
6571 | |
6572 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
6573 pmulhrsw m5, m7 | |
6574 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
6575 pmulhrsw m6, m7 | |
6576 packuswb m5, m6 | |
6577 | |
6578 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
6579 pmulhrsw m6, m7 | |
6580 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
6581 pmulhrsw m1, m7 | |
6582 packuswb m6, m1 | |
6583 | |
6584 pslldq m2, 1 | |
6585 palignr m3, m2, 14 | |
6586 | |
6587 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
6588 pmulhrsw m1, m7 | |
6589 pmaddubsw m3, [r4] ; [16] | |
6590 pmulhrsw m3, m7 | |
6591 packuswb m1, m3 | |
6592 | |
6593 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6594 RET | |
6595 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_23 -- SSE4 (XMM) angular intra prediction; per the symbol
; name this is the 16x16 block, mode 23 variant.
; cglobal 4,7,8 (x86inc): 4 arguments, 7 GPRs (r0-r6), 8 vector regs (m0-m7).
;   r0 : output pointer as consumed by TRANSPOSE_STORE_8x8 (macro is defined
;        outside this chunk -- NOTE(review): confirm its use of r0/r1/r5/r6)
;   r1 : stride (r5 is set to 3 * r1 below)
;   r2 : reference samples; bytes are read at [r2], [r2 + 1] and [r2 + 32]
;   r3 : not referenced by this routine
; The block is produced as two halves; the second half is written from
; r6 + 8, i.e. 8 bytes further along the row than the first.
; Each step is pmaddubsw (sample pairs in m3/m5 times an ang_table row) followed
; by pmulhrsw with m7.  If pw_1024 holds words of 1024 (defined outside this
; chunk), pmulhrsw computes (x + 16) >> 5.  The trailing [nn] on each step is
; the ang_table row used: r4 points at row 16, so [r4 + k * 16] is row 16 + k.
;-----------------------------------------------------------------------------
6596 INIT_XMM sse4 | |
6597 cglobal intra_pred_ang16_23, 4,7,8 | |
6598 lea r4, [ang_table + 16 * 16] | |
6599 lea r5, [r1 * 3] ; r5 = 3 * stride | |
; r6 keeps the incoming r0 so the second half can be addressed as r6 + 8.
6600 mov r6, r0 | |
6601 mova m7, [pw_1024] | |
6602 | |
; First half: build the adjacent-sample byte pairs that pmaddubsw consumes.
6603 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
6604 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
6605 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
; m2 = 16 bytes from [r2 + 32] permuted by c_mode16_13 (table defined outside
; this chunk).  Its top bytes are shifted into the low end of m3 by the
; palignr / pslldq steps below.
6606 movu m2, [r2 + 32] | |
6607 pshufb m2, [c_mode16_13] | |
6608 | |
6609 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6610 | |
6611 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] | |
6612 pmulhrsw m4, m7 | |
6613 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] | |
6614 pmulhrsw m0, m7 | |
6615 packuswb m4, m0 | |
6616 | |
6617 pmaddubsw m5, [r4 - 11 * 16] ; [05] | |
6618 pmulhrsw m5, m7 | |
6619 | |
; m3 = top byte of m2 followed by the low 15 bytes of the old m3.
6620 palignr m3, m2, 15 | |
6621 | |
6622 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
6623 pmulhrsw m6, m7 | |
6624 packuswb m5, m6 | |
6625 | |
6626 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
6627 pmulhrsw m6, m7 | |
6628 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
6629 pmulhrsw m0, m7 | |
6630 packuswb m6, m0 | |
6631 | |
6632 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
6633 pmulhrsw m1, m7 | |
6634 | |
; From here on each advance inserts one new byte pair (top two bytes of m2)
; below the existing pairs in m3.
6635 palignr m3, m2, 14 | |
6636 | |
6637 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6638 pmulhrsw m0, m7 | |
6639 packuswb m1, m0 | |
6640 | |
6641 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
6642 | |
6643 pmaddubsw m4, m3, [r4 - 16] ; [15] | |
6644 pmulhrsw m4, m7 | |
6645 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
6646 pmulhrsw m5, m7 | |
6647 packuswb m4, m5 | |
6648 | |
6649 pslldq m2, 1 | |
6650 palignr m3, m2, 14 | |
6651 | |
6652 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
6653 pmulhrsw m5, m7 | |
6654 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
6655 pmulhrsw m6, m7 | |
6656 packuswb m5, m6 | |
6657 | |
6658 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
6659 pmulhrsw m6, m7 | |
6660 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
6661 pmulhrsw m1, m7 | |
6662 packuswb m6, m1 | |
6663 | |
6664 pslldq m2, 1 | |
6665 palignr m3, m2, 14 | |
6666 | |
6667 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
6668 pmulhrsw m1, m7 | |
6669 pmaddubsw m3, [r4] ; [16] | |
6670 pmulhrsw m3, m7 | |
6671 packuswb m1, m3 | |
6672 | |
6673 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
6674 | |
; Second half: re-base r0 to the saved pointer plus 8 bytes and rebuild the
; sample pairs from [r2 + 1].
6675 lea r0, [r6 + 8] | |
6676 | |
6677 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6678 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
6679 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
6680 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] (high half replaced, low half of m2 kept) | |
6681 | |
6682 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] | |
6683 pmulhrsw m4, m7 | |
6684 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
6685 pmulhrsw m5, m7 | |
6686 packuswb m4, m5 | |
6687 | |
6688 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
6689 pmulhrsw m5, m7 | |
6690 | |
6691 palignr m3, m2, 14 | |
6692 | |
6693 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
6694 pmulhrsw m6, m7 | |
6695 packuswb m5, m6 | |
6696 | |
6697 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
6698 pmulhrsw m6, m7 | |
6699 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
6700 pmulhrsw m0, m7 | |
6701 packuswb m6, m0 | |
6702 | |
6703 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
6704 pmulhrsw m1, m7 | |
6705 | |
6706 pslldq m2, 1 | |
6707 palignr m3, m2, 14 | |
6708 | |
6709 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6710 pmulhrsw m0, m7 | |
6711 packuswb m1, m0 | |
6712 | |
6713 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
6714 | |
6715 pmaddubsw m4, m3, [r4 - 16] ; [15] | |
6716 pmulhrsw m4, m7 | |
6717 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
6718 pmulhrsw m5, m7 | |
6719 packuswb m4, m5 | |
6720 | |
6721 pslldq m2, 1 | |
6722 palignr m3, m2, 14 | |
6723 | |
6724 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
6725 pmulhrsw m5, m7 | |
6726 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
6727 pmulhrsw m6, m7 | |
6728 packuswb m5, m6 | |
6729 | |
6730 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
6731 pmulhrsw m6, m7 | |
6732 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
6733 pmulhrsw m1, m7 | |
6734 packuswb m6, m1 | |
6735 | |
6736 pslldq m2, 1 | |
6737 palignr m3, m2, 14 | |
6738 | |
6739 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
6740 pmulhrsw m1, m7 | |
6741 pmaddubsw m3, [r4] ; [16] | |
6742 pmulhrsw m3, m7 | |
6743 packuswb m1, m3 | |
6744 | |
6745 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
6746 RET | |
6747 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 -- SSE4 (XMM) angular intra prediction; per the symbol
; name this is the 16x16 block, mode 14 variant.
; cglobal 4,7,8 (x86inc): 4 arguments, 7 GPRs (r0-r6), 8 vector regs (m0-m7).
;   r0 : output pointer as consumed by TRANSPOSE_STORE_8x8 (macro is defined
;        outside this chunk -- NOTE(review): confirm its use of r0/r1/r5/r6)
;   r1 : stride (r5 is set to 3 * r1 below)
;   r2 : reference samples; bytes are read at [r2], [r2 + 32] and [r2 + 33]
;   r3 : not referenced by this routine
; The block is produced as two halves; for the second half r0 is re-based to
; r6 + 4 * stride and r6 is advanced by 8 * stride.
; Each step is pmaddubsw (sample pairs in m3/m5 times an ang_table row) followed
; by pmulhrsw with m7.  If pw_1024 holds words of 1024 (defined outside this
; chunk), pmulhrsw computes (x + 16) >> 5.  The trailing [nn] on each step is
; the ang_table row used: r4 points at row 16, so [r4 + k * 16] is row 16 + k.
;-----------------------------------------------------------------------------
6748 INIT_XMM sse4 | |
6749 cglobal intra_pred_ang16_14, 4,7,8 | |
6750 lea r4, [ang_table + 16 * 16] | |
6751 lea r5, [r1 * 3] ; r5 = 3 * stride | |
6752 lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
6753 mova m7, [pw_1024] | |
6754 | |
; First half: load 16 samples from [r2 + 32], then overwrite byte 0 with the
; byte at [r2] before building the adjacent-sample pairs.
6755 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
6756 pinsrb m3, [r2], 0 | |
6757 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
6758 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
; m2 = 16 bytes from [r2] permuted by c_mode16_14 (table defined outside this
; chunk).  Its top bytes are shifted into the low end of m3 by the
; palignr / pslldq steps below.
6759 movu m2, [r2] | |
6760 pshufb m2, [c_mode16_14] | |
6761 | |
6762 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6763 | |
6764 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] | |
6765 pmulhrsw m4, m7 | |
6766 pmaddubsw m5, [r4 - 10 * 16] ; [06] | |
6767 pmulhrsw m5, m7 | |
6768 packuswb m4, m5 | |
6769 | |
; m3 = top byte of m2 followed by the low 15 bytes of the old m3.
6770 palignr m3, m2, 15 | |
6771 | |
6772 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
6773 pmulhrsw m5, m7 | |
6774 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
6775 pmulhrsw m6, m7 | |
6776 packuswb m5, m6 | |
6777 | |
; From here on each advance inserts one new byte pair (top two bytes of m2)
; below the existing pairs in m3.
6778 palignr m3, m2, 14 | |
6779 | |
6780 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
6781 pmulhrsw m6, m7 | |
6782 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
6783 pmulhrsw m0, m7 | |
6784 packuswb m6, m0 | |
6785 | |
6786 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
6787 pmulhrsw m1, m7 | |
6788 | |
6789 pslldq m2, 1 | |
6790 palignr m3, m2, 14 | |
6791 | |
6792 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6793 pmulhrsw m0, m7 | |
6794 packuswb m1, m0 | |
6795 | |
6796 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6797 | |
6798 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
6799 pmulhrsw m4, m7 | |
6800 | |
6801 pslldq m2, 1 | |
6802 palignr m3, m2, 14 | |
6803 | |
6804 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
6805 pmulhrsw m5, m7 | |
6806 packuswb m4, m5 | |
6807 | |
6808 pmaddubsw m5, m3, [r4 + 16] ; [17] | |
6809 pmulhrsw m5, m7 | |
6810 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
6811 pmulhrsw m6, m7 | |
6812 packuswb m5, m6 | |
6813 | |
6814 pslldq m2, 1 | |
6815 palignr m3, m2, 14 | |
6816 | |
6817 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
6818 pmulhrsw m6, m7 | |
6819 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
6820 pmulhrsw m1, m7 | |
6821 packuswb m6, m1 | |
6822 | |
6823 pslldq m2, 1 | |
6824 palignr m3, m2, 14 | |
6825 | |
6826 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
6827 pmulhrsw m1, m7 | |
6828 pmaddubsw m3, [r4] ; [16] | |
6829 pmulhrsw m3, m7 | |
6830 packuswb m1, m3 | |
6831 | |
6832 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6833 | |
; Second half: re-base r0 / r6 by whole rows and rebuild the sample pairs from
; [r2 + 1 + 32].
6834 lea r0, [r6 + r1 * 4] | |
6835 lea r6, [r6 + r1 * 8] | |
6836 | |
6837 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6838 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
6839 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
6840 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] (high half replaced, low half of m2 kept) | |
6841 | |
6842 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
6843 pmulhrsw m4, m7 | |
6844 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
6845 pmulhrsw m5, m7 | |
6846 packuswb m4, m5 | |
6847 | |
6848 palignr m3, m2, 14 | |
6849 | |
6850 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
6851 pmulhrsw m5, m7 | |
6852 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
6853 pmulhrsw m6, m7 | |
6854 packuswb m5, m6 | |
6855 | |
6856 pslldq m2, 1 | |
6857 palignr m3, m2, 14 | |
6858 | |
6859 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
6860 pmulhrsw m6, m7 | |
6861 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
6862 pmulhrsw m0, m7 | |
6863 packuswb m6, m0 | |
6864 | |
6865 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
6866 pmulhrsw m1, m7 | |
6867 | |
6868 pslldq m2, 1 | |
6869 palignr m3, m2, 14 | |
6870 | |
6871 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6872 pmulhrsw m0, m7 | |
6873 packuswb m1, m0 | |
6874 | |
6875 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
6876 | |
6877 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
6878 pmulhrsw m4, m7 | |
6879 | |
6880 pslldq m2, 1 | |
6881 palignr m3, m2, 14 | |
6882 | |
6883 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
6884 pmulhrsw m5, m7 | |
6885 packuswb m4, m5 | |
6886 | |
6887 pmaddubsw m5, m3, [r4 + 16] ; [17] | |
6888 pmulhrsw m5, m7 | |
6889 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
6890 pmulhrsw m6, m7 | |
6891 packuswb m5, m6 | |
6892 | |
6893 pslldq m2, 1 | |
6894 palignr m3, m2, 14 | |
6895 | |
6896 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
6897 pmulhrsw m6, m7 | |
6898 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
6899 pmulhrsw m1, m7 | |
6900 packuswb m6, m1 | |
6901 | |
6902 pslldq m2, 1 | |
6903 palignr m3, m2, 14 | |
6904 | |
6905 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
6906 pmulhrsw m1, m7 | |
6907 pmaddubsw m3, [r4] ; [16] | |
6908 pmulhrsw m3, m7 | |
6909 packuswb m1, m3 | |
6910 | |
6911 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
6912 RET | |
6913 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_22 -- SSE4 (XMM) angular intra prediction; per the symbol
; name this is the 16x16 block, mode 22 variant.
; cglobal 4,7,8 (x86inc): 4 arguments, 7 GPRs (r0-r6), 8 vector regs (m0-m7).
;   r0 : output pointer as consumed by TRANSPOSE_STORE_8x8 (macro is defined
;        outside this chunk -- NOTE(review): confirm its use of r0/r1/r5/r6)
;   r1 : stride (r5 is set to 3 * r1 below)
;   r2 : reference samples; bytes are read at [r2], [r2 + 1] and [r2 + 32]
;   r3 : not referenced by this routine
; The block is produced as two halves; the second half is written from
; r6 + 8, i.e. 8 bytes further along the row than the first.
; Each step is pmaddubsw (sample pairs in m3/m5 times an ang_table row) followed
; by pmulhrsw with m7.  If pw_1024 holds words of 1024 (defined outside this
; chunk), pmulhrsw computes (x + 16) >> 5.  The trailing [nn] on each step is
; the ang_table row used: r4 points at row 16, so [r4 + k * 16] is row 16 + k.
; The row sequence and the c_mode16_14 shuffle are the same ones used by
; intra_pred_ang16_14; the differences are the source offsets and the second
; TRANSPOSE_STORE_8x8 argument (0 here).
;-----------------------------------------------------------------------------
6914 INIT_XMM sse4 | |
6915 cglobal intra_pred_ang16_22, 4,7,8 | |
6916 lea r4, [ang_table + 16 * 16] | |
6917 lea r5, [r1 * 3] ; r5 = 3 * stride | |
; r6 keeps the incoming r0 so the second half can be addressed as r6 + 8.
6918 mov r6, r0 | |
6919 mova m7, [pw_1024] | |
6920 | |
; First half: build the adjacent-sample byte pairs that pmaddubsw consumes.
6921 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
6922 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
6923 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
; m2 = 16 bytes from [r2 + 32] permuted by c_mode16_14 (table defined outside
; this chunk).  Its top bytes are shifted into the low end of m3 by the
; palignr / pslldq steps below.
6924 movu m2, [r2 + 32] | |
6925 pshufb m2, [c_mode16_14] | |
6926 | |
6927 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
6928 | |
6929 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] | |
6930 pmulhrsw m4, m7 | |
6931 pmaddubsw m5, [r4 - 10 * 16] ; [06] | |
6932 pmulhrsw m5, m7 | |
6933 packuswb m4, m5 | |
6934 | |
; m3 = top byte of m2 followed by the low 15 bytes of the old m3.
6935 palignr m3, m2, 15 | |
6936 | |
6937 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
6938 pmulhrsw m5, m7 | |
6939 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
6940 pmulhrsw m6, m7 | |
6941 packuswb m5, m6 | |
6942 | |
; From here on each advance inserts one new byte pair (top two bytes of m2)
; below the existing pairs in m3.
6943 palignr m3, m2, 14 | |
6944 | |
6945 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
6946 pmulhrsw m6, m7 | |
6947 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
6948 pmulhrsw m0, m7 | |
6949 packuswb m6, m0 | |
6950 | |
6951 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
6952 pmulhrsw m1, m7 | |
6953 | |
6954 pslldq m2, 1 | |
6955 palignr m3, m2, 14 | |
6956 | |
6957 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
6958 pmulhrsw m0, m7 | |
6959 packuswb m1, m0 | |
6960 | |
6961 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
6962 | |
6963 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
6964 pmulhrsw m4, m7 | |
6965 | |
6966 pslldq m2, 1 | |
6967 palignr m3, m2, 14 | |
6968 | |
6969 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
6970 pmulhrsw m5, m7 | |
6971 packuswb m4, m5 | |
6972 | |
6973 pmaddubsw m5, m3, [r4 + 16] ; [17] | |
6974 pmulhrsw m5, m7 | |
6975 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
6976 pmulhrsw m6, m7 | |
6977 packuswb m5, m6 | |
6978 | |
6979 pslldq m2, 1 | |
6980 palignr m3, m2, 14 | |
6981 | |
6982 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
6983 pmulhrsw m6, m7 | |
6984 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
6985 pmulhrsw m1, m7 | |
6986 packuswb m6, m1 | |
6987 | |
6988 pslldq m2, 1 | |
6989 palignr m3, m2, 14 | |
6990 | |
6991 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
6992 pmulhrsw m1, m7 | |
6993 pmaddubsw m3, [r4] ; [16] | |
6994 pmulhrsw m3, m7 | |
6995 packuswb m1, m3 | |
6996 | |
6997 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
6998 | |
; Second half: re-base r0 to the saved pointer plus 8 bytes and rebuild the
; sample pairs from [r2 + 1].
6999 lea r0, [r6 + 8] | |
7000 | |
7001 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
7002 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
7003 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
7004 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] (high half replaced, low half of m2 kept) | |
7005 | |
7006 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
7007 pmulhrsw m4, m7 | |
7008 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
7009 pmulhrsw m5, m7 | |
7010 packuswb m4, m5 | |
7011 | |
7012 palignr m3, m2, 14 | |
7013 | |
7014 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
7015 pmulhrsw m5, m7 | |
7016 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
7017 pmulhrsw m6, m7 | |
7018 packuswb m5, m6 | |
7019 | |
7020 pslldq m2, 1 | |
7021 palignr m3, m2, 14 | |
7022 | |
7023 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
7024 pmulhrsw m6, m7 | |
7025 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
7026 pmulhrsw m0, m7 | |
7027 packuswb m6, m0 | |
7028 | |
7029 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
7030 pmulhrsw m1, m7 | |
7031 | |
7032 pslldq m2, 1 | |
7033 palignr m3, m2, 14 | |
7034 | |
7035 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7036 pmulhrsw m0, m7 | |
7037 packuswb m1, m0 | |
7038 | |
7039 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
7040 | |
7041 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
7042 pmulhrsw m4, m7 | |
7043 | |
7044 pslldq m2, 1 | |
7045 palignr m3, m2, 14 | |
7046 | |
7047 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
7048 pmulhrsw m5, m7 | |
7049 packuswb m4, m5 | |
7050 | |
7051 pmaddubsw m5, m3, [r4 + 16] ; [17] | |
7052 pmulhrsw m5, m7 | |
7053 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
7054 pmulhrsw m6, m7 | |
7055 packuswb m5, m6 | |
7056 | |
7057 pslldq m2, 1 | |
7058 palignr m3, m2, 14 | |
7059 | |
7060 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
7061 pmulhrsw m6, m7 | |
7062 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
7063 pmulhrsw m1, m7 | |
7064 packuswb m6, m1 | |
7065 | |
7066 pslldq m2, 1 | |
7067 palignr m3, m2, 14 | |
7068 | |
7069 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
7070 pmulhrsw m1, m7 | |
7071 pmaddubsw m3, [r4] ; [16] | |
7072 pmulhrsw m3, m7 | |
7073 packuswb m1, m3 | |
7074 | |
7075 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
7076 RET | |
7077 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 -- SSE4 (XMM) angular intra prediction; per the symbol
; name this is the 16x16 block, mode 15 variant.
; cglobal 4,7,8 (x86inc): 4 arguments, 7 GPRs (r0-r6), 8 vector regs (m0-m7).
;   r0 : output pointer as consumed by TRANSPOSE_STORE_8x8 (macro is defined
;        outside this chunk -- NOTE(review): confirm its use of r0/r1/r5/r6)
;   r1 : stride (r5 is set to 3 * r1 below)
;   r2 : reference samples; bytes are read at [r2], [r2 + 32] and [r2 + 33]
;   r3 : not referenced by this routine
; The block is produced as two halves; for the second half r0 is re-based to
; r6 + 4 * stride and r6 is advanced by 8 * stride.
; Each step is pmaddubsw (sample pairs in m3/m4 times an ang_table row) followed
; by pmulhrsw with m7.  If pw_1024 holds words of 1024 (defined outside this
; chunk), pmulhrsw computes (x + 16) >> 5.  The trailing [nn] on each step is
; the ang_table row used: r4 points at row 16, so [r4 + k * 16] is row 16 + k.
;-----------------------------------------------------------------------------
7078 INIT_XMM sse4 | |
7079 cglobal intra_pred_ang16_15, 4,7,8 | |
7080 lea r4, [ang_table + 16 * 16] | |
7081 lea r5, [r1 * 3] ; r5 = 3 * stride | |
7082 lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
7083 mova m7, [pw_1024] | |
7084 | |
; First half: load 16 samples from [r2 + 32], then overwrite byte 0 with the
; byte at [r2] before building the adjacent-sample pairs.
7085 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
7086 pinsrb m3, [r2], 0 | |
7087 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
7088 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
; m2 = 16 bytes from [r2] permuted by c_mode16_15 (table defined outside this
; chunk).  Its top bytes are shifted into the low end of m3 by the
; palignr / pslldq steps below.
7089 movu m2, [r2] | |
7090 pshufb m2, [c_mode16_15] | |
7091 | |
7092 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
7093 | |
7094 pmaddubsw m4, [r4 - 16] ; [15] | |
7095 pmulhrsw m4, m7 | |
7096 | |
; m3 = top byte of m2 followed by the low 15 bytes of the old m3.
7097 palignr m3, m2, 15 | |
7098 | |
7099 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
7100 pmulhrsw m5, m7 | |
7101 packuswb m4, m5 | |
7102 | |
7103 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
7104 pmulhrsw m5, m7 | |
7105 | |
; From here on each advance inserts one new byte pair (top two bytes of m2)
; below the existing pairs in m3.
7106 palignr m3, m2, 14 | |
7107 | |
7108 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
7109 pmulhrsw m6, m7 | |
7110 packuswb m5, m6 | |
7111 | |
7112 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
7113 pmulhrsw m6, m7 | |
7114 | |
7115 pslldq m2, 1 | |
7116 palignr m3, m2, 14 | |
7117 | |
7118 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
7119 pmulhrsw m0, m7 | |
7120 packuswb m6, m0 | |
7121 | |
7122 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
7123 pmulhrsw m1, m7 | |
7124 | |
7125 pslldq m2, 1 | |
7126 palignr m3, m2, 14 | |
7127 | |
7128 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7129 pmulhrsw m0, m7 | |
7130 packuswb m1, m0 | |
7131 | |
7132 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
7133 | |
7134 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
7135 pmulhrsw m4, m7 | |
7136 | |
7137 pslldq m2, 1 | |
7138 palignr m3, m2, 14 | |
7139 | |
7140 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7141 pmulhrsw m5, m7 | |
7142 packuswb m4, m5 | |
7143 | |
7144 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
7145 pmulhrsw m5, m7 | |
7146 | |
7147 pslldq m2, 1 | |
7148 palignr m3, m2, 14 | |
7149 | |
7150 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
7151 pmulhrsw m6, m7 | |
7152 packuswb m5, m6 | |
7153 | |
7154 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
7155 pmulhrsw m6, m7 | |
7156 | |
7157 pslldq m2, 1 | |
7158 palignr m3, m2, 14 | |
7159 | |
7160 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
7161 pmulhrsw m1, m7 | |
7162 packuswb m6, m1 | |
7163 | |
7164 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
7165 pmulhrsw m1, m7 | |
7166 | |
7167 pslldq m2, 1 | |
7168 palignr m3, m2, 14 | |
7169 | |
7170 pmaddubsw m3, [r4] ; [16] | |
7171 pmulhrsw m3, m7 | |
7172 packuswb m1, m3 | |
7173 | |
7174 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
7175 | |
; Second half: re-base r0 / r6 by whole rows and rebuild the sample pairs from
; [r2 + 1 + 32].  movlhps only replaces the high half of m2, so the bytes left
; in its low half by the first half's shifts are still consumed below.
7176 lea r0, [r6 + r1 * 4] | |
7177 lea r6, [r6 + r1 * 8] | |
7178 | |
7179 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
7180 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
7181 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
7182 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] | |
7183 | |
7184 pmaddubsw m4, m3, [r4 - 16] ; [15] | |
7185 pmulhrsw m4, m7 | |
7186 | |
7187 palignr m3, m2, 14 | |
7188 | |
7189 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
7190 pmulhrsw m5, m7 | |
7191 packuswb m4, m5 | |
7192 | |
7193 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
7194 pmulhrsw m5, m7 | |
7195 | |
7196 pslldq m2, 1 | |
7197 palignr m3, m2, 14 | |
7198 | |
7199 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
7200 pmulhrsw m6, m7 | |
7201 packuswb m5, m6 | |
7202 | |
7203 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
7204 pmulhrsw m6, m7 | |
7205 | |
7206 pslldq m2, 1 | |
7207 palignr m3, m2, 14 | |
7208 | |
7209 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
7210 pmulhrsw m0, m7 | |
7211 packuswb m6, m0 | |
7212 | |
7213 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
7214 pmulhrsw m1, m7 | |
7215 | |
7216 pslldq m2, 1 | |
7217 palignr m3, m2, 14 | |
7218 | |
7219 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7220 pmulhrsw m0, m7 | |
7221 packuswb m1, m0 | |
7222 | |
7223 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
7224 | |
7225 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
7226 pmulhrsw m4, m7 | |
7227 | |
7228 pslldq m2, 1 | |
7229 palignr m3, m2, 14 | |
7230 | |
7231 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7232 pmulhrsw m5, m7 | |
7233 packuswb m4, m5 | |
7234 | |
7235 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
7236 pmulhrsw m5, m7 | |
7237 | |
7238 pslldq m2, 1 | |
7239 palignr m3, m2, 14 | |
7240 | |
7241 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
7242 pmulhrsw m6, m7 | |
7243 packuswb m5, m6 | |
7244 | |
7245 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
7246 pmulhrsw m6, m7 | |
7247 | |
7248 pslldq m2, 1 | |
7249 palignr m3, m2, 14 | |
7250 | |
7251 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
7252 pmulhrsw m1, m7 | |
7253 packuswb m6, m1 | |
7254 | |
7255 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
7256 pmulhrsw m1, m7 | |
7257 | |
7258 pslldq m2, 1 | |
7259 palignr m3, m2, 14 | |
7260 | |
7261 pmaddubsw m3, [r4] ; [16] | |
7262 pmulhrsw m3, m7 | |
7263 packuswb m1, m3 | |
7264 | |
7265 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
7266 RET | |
7267 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_21 -- SSE4 (XMM) angular intra prediction; per the symbol
; name this is the 16x16 block, mode 21 variant.
; cglobal 4,7,8 (x86inc): 4 arguments, 7 GPRs (r0-r6), 8 vector regs (m0-m7).
;   r0 : output pointer as consumed by TRANSPOSE_STORE_8x8 (macro is defined
;        outside this chunk -- NOTE(review): confirm its use of r0/r1/r5/r6)
;   r1 : stride (r5 is set to 3 * r1 below)
;   r2 : reference samples; bytes are read at [r2], [r2 + 1] and [r2 + 32]
;   r3 : not referenced by this routine
; The block is produced as two halves; the second half is written from
; r6 + 8, i.e. 8 bytes further along the row than the first.
; Each step is pmaddubsw (sample pairs in m3/m4 times an ang_table row) followed
; by pmulhrsw with m7.  If pw_1024 holds words of 1024 (defined outside this
; chunk), pmulhrsw computes (x + 16) >> 5.  The trailing [nn] on each step is
; the ang_table row used: r4 points at row 16, so [r4 + k * 16] is row 16 + k.
; The row sequence and the c_mode16_15 shuffle are the same ones used by
; intra_pred_ang16_15; the differences are the source offsets and the second
; TRANSPOSE_STORE_8x8 argument (0 here).
;-----------------------------------------------------------------------------
7268 INIT_XMM sse4 | |
7269 cglobal intra_pred_ang16_21, 4,7,8 | |
7270 lea r4, [ang_table + 16 * 16] | |
7271 lea r5, [r1 * 3] ; r5 = 3 * stride | |
; r6 keeps the incoming r0 so the second half can be addressed as r6 + 8.
7272 mov r6, r0 | |
7273 mova m7, [pw_1024] | |
7274 | |
; First half: build the adjacent-sample byte pairs that pmaddubsw consumes.
7275 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
7276 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
7277 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
; m2 = 16 bytes from [r2 + 32] with byte 0 overwritten by the byte at [r2],
; then permuted by c_mode16_15 (table defined outside this chunk).  Its top
; bytes are shifted into the low end of m3 by the palignr / pslldq steps below.
7278 movu m2, [r2 + 32] | |
7279 pinsrb m2, [r2], 0 | |
7280 pshufb m2, [c_mode16_15] | |
7281 | |
7282 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
7283 | |
7284 pmaddubsw m4, [r4 - 16] ; [15] | |
7285 pmulhrsw m4, m7 | |
7286 | |
; m3 = top byte of m2 followed by the low 15 bytes of the old m3.
7287 palignr m3, m2, 15 | |
7288 | |
7289 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
7290 pmulhrsw m5, m7 | |
7291 packuswb m4, m5 | |
7292 | |
7293 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
7294 pmulhrsw m5, m7 | |
7295 | |
; From here on each advance inserts one new byte pair (top two bytes of m2)
; below the existing pairs in m3.
7296 palignr m3, m2, 14 | |
7297 | |
7298 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
7299 pmulhrsw m6, m7 | |
7300 packuswb m5, m6 | |
7301 | |
7302 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
7303 pmulhrsw m6, m7 | |
7304 | |
7305 pslldq m2, 1 | |
7306 palignr m3, m2, 14 | |
7307 | |
7308 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
7309 pmulhrsw m0, m7 | |
7310 packuswb m6, m0 | |
7311 | |
7312 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
7313 pmulhrsw m1, m7 | |
7314 | |
7315 pslldq m2, 1 | |
7316 palignr m3, m2, 14 | |
7317 | |
7318 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7319 pmulhrsw m0, m7 | |
7320 packuswb m1, m0 | |
7321 | |
7322 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
7323 | |
7324 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
7325 pmulhrsw m4, m7 | |
7326 | |
7327 pslldq m2, 1 | |
7328 palignr m3, m2, 14 | |
7329 | |
7330 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7331 pmulhrsw m5, m7 | |
7332 packuswb m4, m5 | |
7333 | |
7334 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
7335 pmulhrsw m5, m7 | |
7336 | |
7337 pslldq m2, 1 | |
7338 palignr m3, m2, 14 | |
7339 | |
7340 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
7341 pmulhrsw m6, m7 | |
7342 packuswb m5, m6 | |
7343 | |
7344 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
7345 pmulhrsw m6, m7 | |
7346 | |
7347 pslldq m2, 1 | |
7348 palignr m3, m2, 14 | |
7349 | |
7350 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
7351 pmulhrsw m1, m7 | |
7352 packuswb m6, m1 | |
7353 | |
7354 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
7355 pmulhrsw m1, m7 | |
7356 | |
7357 pslldq m2, 1 | |
7358 palignr m3, m2, 14 | |
7359 | |
7360 pmaddubsw m3, [r4] ; [16] | |
7361 pmulhrsw m3, m7 | |
7362 packuswb m1, m3 | |
7363 | |
7364 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
7365 | |
; Second half: re-base r0 to the saved pointer plus 8 bytes and rebuild the
; sample pairs from [r2 + 1].  movlhps only replaces the high half of m2, so
; the bytes left in its low half by the first half's shifts are still consumed.
7366 lea r0, [r6 + 8] | |
7367 | |
7368 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
7369 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
7370 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
7371 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] | |
7372 | |
7373 pmaddubsw m4, m3, [r4 - 16] ; [15] | |
7374 pmulhrsw m4, m7 | |
7375 | |
7376 palignr m3, m2, 14 | |
7377 | |
7378 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
7379 pmulhrsw m5, m7 | |
7380 packuswb m4, m5 | |
7381 | |
7382 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
7383 pmulhrsw m5, m7 | |
7384 | |
7385 pslldq m2, 1 | |
7386 palignr m3, m2, 14 | |
7387 | |
7388 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
7389 pmulhrsw m6, m7 | |
7390 packuswb m5, m6 | |
7391 | |
7392 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
7393 pmulhrsw m6, m7 | |
7394 | |
7395 pslldq m2, 1 | |
7396 palignr m3, m2, 14 | |
7397 | |
7398 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
7399 pmulhrsw m0, m7 | |
7400 packuswb m6, m0 | |
7401 | |
7402 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
7403 pmulhrsw m1, m7 | |
7404 | |
7405 pslldq m2, 1 | |
7406 palignr m3, m2, 14 | |
7407 | |
7408 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7409 pmulhrsw m0, m7 | |
7410 packuswb m1, m0 | |
7411 | |
7412 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
7413 | |
7414 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
7415 pmulhrsw m4, m7 | |
7416 | |
7417 pslldq m2, 1 | |
7418 palignr m3, m2, 14 | |
7419 | |
7420 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7421 pmulhrsw m5, m7 | |
7422 packuswb m4, m5 | |
7423 | |
7424 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
7425 pmulhrsw m5, m7 | |
7426 | |
7427 pslldq m2, 1 | |
7428 palignr m3, m2, 14 | |
7429 | |
7430 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
7431 pmulhrsw m6, m7 | |
7432 packuswb m5, m6 | |
7433 | |
7434 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
7435 pmulhrsw m6, m7 | |
7436 | |
7437 pslldq m2, 1 | |
7438 palignr m3, m2, 14 | |
7439 | |
7440 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
7441 pmulhrsw m1, m7 | |
7442 packuswb m6, m1 | |
7443 | |
7444 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
7445 pmulhrsw m1, m7 | |
7446 | |
7447 pslldq m2, 1 | |
7448 palignr m3, m2, 14 | |
7449 | |
7450 pmaddubsw m3, [r4] ; [16] | |
7451 pmulhrsw m3, m7 | |
7452 packuswb m1, m3 | |
7453 | |
7454 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
7455 RET | |
7456 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 -- SSE4 (XMM) angular intra prediction; per the symbol
; name this is the 16x16 block, mode 16 variant.
; cglobal 4,7,8 (x86inc): 4 arguments, 7 GPRs (r0-r6), 8 vector regs (m0-m7).
;   r0 : output pointer as consumed by TRANSPOSE_STORE_8x8 (macro is defined
;        outside this chunk -- NOTE(review): confirm its use of r0/r1/r5/r6)
;   r1 : stride (r5 is set to 3 * r1 below)
;   r2 : reference samples; bytes are read at [r2], [r2 + 32] and [r2 + 33]
;   r3 : not referenced by this routine
; The block is produced as two halves; for the second half r0 is re-based to
; r6 + 4 * stride and r6 is advanced by 8 * stride.
; Each step is pmaddubsw (sample pairs in m3/m4 times an ang_table row) followed
; by pmulhrsw with m7.  If pw_1024 holds words of 1024 (defined outside this
; chunk), pmulhrsw computes (x + 16) >> 5.  The trailing [nn] on each step is
; the ang_table row used: r4 points at row 16, so [r4 + k * 16] is row 16 + k.
; The bracketed lists next to the m2 shifts are the original annotations of
; m2's contents after each shift.
;-----------------------------------------------------------------------------
7457 INIT_XMM sse4 | |
7458 cglobal intra_pred_ang16_16, 4,7,8 | |
7459 lea r4, [ang_table + 16 * 16] | |
7460 lea r5, [r1 * 3] ; r5 = 3 * stride | |
7461 lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
7462 mova m7, [pw_1024] | |
7463 | |
; First half: load 16 samples from [r2 + 32], then overwrite byte 0 with the
; byte at [r2] before building the adjacent-sample pairs.
7464 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
7465 pinsrb m3, [r2], 0 | |
7466 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
7467 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
; m2 = 16 bytes from [r2] permuted by c_mode16_16 (table defined outside this
; chunk).  Its top bytes are shifted into the low end of m3 by the
; palignr / pslldq steps below.
7468 movu m2, [r2] | |
7469 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] | |
7470 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
7471 | |
7472 pmaddubsw m4, [r4 - 5 * 16] ; [11] | |
7473 pmulhrsw m4, m7 | |
7474 | |
; m3 = top byte of m2 followed by the low 15 bytes of the old m3.
7475 palignr m3, m2, 15 | |
7476 | |
7477 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7478 pmulhrsw m5, m7 | |
7479 packuswb m4, m5 | |
7480 | |
7481 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
7482 pmulhrsw m5, m7 | |
7483 | |
; From here on each advance inserts one new byte pair (top two bytes of m2)
; below the existing pairs in m3.
7484 palignr m3, m2, 14 | |
7485 | |
7486 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
7487 pmulhrsw m6, m7 | |
7488 packuswb m5, m6 | |
7489 | |
7490 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] | |
7491 palignr m3, m2, 14 | |
7492 | |
7493 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
7494 pmulhrsw m6, m7 | |
7495 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
7496 pmulhrsw m0, m7 | |
7497 packuswb m6, m0 | |
7498 | |
7499 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
7500 palignr m3, m2, 14 | |
7501 | |
7502 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
7503 pmulhrsw m1, m7 | |
7504 | |
7505 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] | |
7506 palignr m3, m2, 14 | |
7507 | |
7508 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7509 pmulhrsw m0, m7 | |
7510 packuswb m1, m0 | |
7511 | |
7512 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
7513 | |
7514 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
7515 pmulhrsw m4, m7 | |
7516 | |
7517 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] | |
7518 palignr m3, m2, 14 | |
7519 | |
7520 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
7521 pmulhrsw m5, m7 | |
7522 packuswb m4, m5 | |
7523 | |
7524 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] | |
7525 palignr m3, m2, 14 | |
7526 | |
7527 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
7528 pmulhrsw m5, m7 | |
7529 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
7530 pmulhrsw m6, m7 | |
7531 packuswb m5, m6 | |
7532 | |
7533 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] | |
7534 palignr m3, m2, 14 | |
7535 | |
7536 pmaddubsw m6, m3, [r4 - 16] ; [15] | |
7537 pmulhrsw m6, m7 | |
7538 | |
7539 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] | |
7540 palignr m3, m2, 14 | |
7541 | |
7542 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
7543 pmulhrsw m1, m7 | |
7544 packuswb m6, m1 | |
7545 | |
7546 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
7547 pmulhrsw m1, m7 | |
7548 | |
7549 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] | |
7550 palignr m3, m2, 14 | |
7551 | |
7552 pmaddubsw m3, [r4] ; [16] | |
7553 pmulhrsw m3, m7 | |
7554 packuswb m1, m3 | |
7555 | |
7556 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
7557 | |
; Second half: re-base r0 / r6 by whole rows and rebuild the sample pairs from
; [r2 + 1 + 32].  m2 is first rotated right by 6 bytes so the bytes that are
; still needed sit in its low half, then movlhps replaces only the high half.
7558 lea r0, [r6 + r1 * 4] | |
7559 lea r6, [r6 + r1 * 8] | |
7560 | |
7561 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
7562 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
7563 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
7564 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
7565 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] | |
7566 | |
7567 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
7568 pmulhrsw m4, m7 | |
7569 | |
7570 palignr m3, m2, 14 | |
7571 | |
7572 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7573 pmulhrsw m5, m7 | |
7574 packuswb m4, m5 | |
7575 | |
7576 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
7577 pmulhrsw m5, m7 | |
7578 | |
7579 pslldq m2, 1 | |
7580 palignr m3, m2, 14 | |
7581 | |
7582 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
7583 pmulhrsw m6, m7 | |
7584 packuswb m5, m6 | |
7585 | |
7586 pslldq m2, 1 | |
7587 palignr m3, m2, 14 | |
7588 | |
7589 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
7590 pmulhrsw m6, m7 | |
7591 | |
7592 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
7593 pmulhrsw m0, m7 | |
7594 packuswb m6, m0 | |
7595 | |
7596 pslldq m2, 1 | |
7597 palignr m3, m2, 14 | |
7598 | |
7599 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
7600 pmulhrsw m1, m7 | |
7601 | |
7602 pslldq m2, 1 | |
7603 palignr m3, m2, 14 | |
7604 | |
7605 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7606 pmulhrsw m0, m7 | |
7607 packuswb m1, m0 | |
7608 | |
7609 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
7610 | |
7611 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
7612 pmulhrsw m4, m7 | |
7613 | |
7614 pslldq m2, 1 | |
7615 palignr m3, m2, 14 | |
7616 | |
7617 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
7618 pmulhrsw m5, m7 | |
7619 packuswb m4, m5 | |
7620 | |
7621 pslldq m2, 1 | |
7622 palignr m3, m2, 14 | |
7623 | |
7624 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
7625 pmulhrsw m5, m7 | |
7626 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
7627 pmulhrsw m6, m7 | |
7628 packuswb m5, m6 | |
7629 | |
7630 pslldq m2, 1 | |
7631 palignr m3, m2, 14 | |
7632 | |
7633 pmaddubsw m6, m3, [r4 - 16] ; [15] | |
7634 pmulhrsw m6, m7 | |
7635 | |
7636 pslldq m2, 1 | |
7637 palignr m3, m2, 14 | |
7638 | |
7639 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
7640 pmulhrsw m1, m7 | |
7641 packuswb m6, m1 | |
7642 | |
7643 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
7644 pmulhrsw m1, m7 | |
7645 | |
7646 pslldq m2, 1 | |
7647 palignr m3, m2, 14 | |
7648 | |
7649 pmaddubsw m3, [r4] ; [16] | |
7650 pmulhrsw m3, m7 | |
7651 packuswb m1, m3 | |
7652 | |
7653 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
7654 RET | |
7655 | |
7656 INIT_XMM sse4 | |
7657 cglobal intra_pred_ang16_20, 4,7,8 | |
7658 lea r4, [ang_table + 16 * 16] | |
7659 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7660 mov r6, r0 | |
7661 mova m7, [pw_1024] | |
7662 | |
7663 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
7664 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
7665 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
7666 movu m2, [r2 + 32] | |
7667 pinsrb m2, [r2], 0 | |
7668 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] | |
7669 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
7670 | |
7671 pmaddubsw m4, [r4 - 5 * 16] ; [11] | |
7672 pmulhrsw m4, m7 | |
7673 | |
7674 palignr m3, m2, 15 | |
7675 | |
7676 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7677 pmulhrsw m5, m7 | |
7678 packuswb m4, m5 | |
7679 | |
7680 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
7681 pmulhrsw m5, m7 | |
7682 | |
7683 palignr m3, m2, 14 | |
7684 | |
7685 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
7686 pmulhrsw m6, m7 | |
7687 packuswb m5, m6 | |
7688 | |
7689 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] | |
7690 palignr m3, m2, 14 | |
7691 | |
7692 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
7693 pmulhrsw m6, m7 | |
7694 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
7695 pmulhrsw m0, m7 | |
7696 packuswb m6, m0 | |
7697 | |
7698 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
7699 palignr m3, m2, 14 | |
7700 | |
7701 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
7702 pmulhrsw m1, m7 | |
7703 | |
7704 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] | |
7705 palignr m3, m2, 14 | |
7706 | |
7707 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7708 pmulhrsw m0, m7 | |
7709 packuswb m1, m0 | |
7710 | |
7711 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
7712 | |
7713 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
7714 pmulhrsw m4, m7 | |
7715 | |
7716 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] | |
7717 palignr m3, m2, 14 | |
7718 | |
7719 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
7720 pmulhrsw m5, m7 | |
7721 packuswb m4, m5 | |
7722 | |
7723 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] | |
7724 palignr m3, m2, 14 | |
7725 | |
7726 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
7727 pmulhrsw m5, m7 | |
7728 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
7729 pmulhrsw m6, m7 | |
7730 packuswb m5, m6 | |
7731 | |
7732 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] | |
7733 palignr m3, m2, 14 | |
7734 | |
7735 pmaddubsw m6, m3, [r4 - 16] ; [15] | |
7736 pmulhrsw m6, m7 | |
7737 | |
7738 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] | |
7739 palignr m3, m2, 14 | |
7740 | |
7741 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
7742 pmulhrsw m1, m7 | |
7743 packuswb m6, m1 | |
7744 | |
7745 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
7746 pmulhrsw m1, m7 | |
7747 | |
7748 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] | |
7749 palignr m3, m2, 14 | |
7750 | |
7751 pmaddubsw m3, [r4] ; [16] | |
7752 pmulhrsw m3, m7 | |
7753 packuswb m1, m3 | |
7754 | |
7755 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
7756 | |
7757 lea r0, [r6 + 8] | |
7758 | |
7759 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
7760 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
7761 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
7762 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
7763 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] | |
7764 | |
7765 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
7766 pmulhrsw m4, m7 | |
7767 | |
7768 palignr m3, m2, 14 | |
7769 | |
7770 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
7771 pmulhrsw m5, m7 | |
7772 packuswb m4, m5 | |
7773 | |
7774 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
7775 pmulhrsw m5, m7 | |
7776 | |
7777 pslldq m2, 1 | |
7778 palignr m3, m2, 14 | |
7779 | |
7780 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
7781 pmulhrsw m6, m7 | |
7782 packuswb m5, m6 | |
7783 | |
7784 pslldq m2, 1 | |
7785 palignr m3, m2, 14 | |
7786 | |
7787 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
7788 pmulhrsw m6, m7 | |
7789 | |
7790 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
7791 pmulhrsw m0, m7 | |
7792 packuswb m6, m0 | |
7793 | |
7794 pslldq m2, 1 | |
7795 palignr m3, m2, 14 | |
7796 | |
7797 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
7798 pmulhrsw m1, m7 | |
7799 | |
7800 pslldq m2, 1 | |
7801 palignr m3, m2, 14 | |
7802 | |
7803 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
7804 pmulhrsw m0, m7 | |
7805 packuswb m1, m0 | |
7806 | |
7807 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
7808 | |
7809 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
7810 pmulhrsw m4, m7 | |
7811 | |
7812 pslldq m2, 1 | |
7813 palignr m3, m2, 14 | |
7814 | |
7815 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
7816 pmulhrsw m5, m7 | |
7817 packuswb m4, m5 | |
7818 | |
7819 pslldq m2, 1 | |
7820 palignr m3, m2, 14 | |
7821 | |
7822 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
7823 pmulhrsw m5, m7 | |
7824 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
7825 pmulhrsw m6, m7 | |
7826 packuswb m5, m6 | |
7827 | |
7828 pslldq m2, 1 | |
7829 palignr m3, m2, 14 | |
7830 | |
7831 pmaddubsw m6, m3, [r4 - 16] ; [15] | |
7832 pmulhrsw m6, m7 | |
7833 | |
7834 pslldq m2, 1 | |
7835 palignr m3, m2, 14 | |
7836 | |
7837 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
7838 pmulhrsw m1, m7 | |
7839 packuswb m6, m1 | |
7840 | |
7841 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
7842 pmulhrsw m1, m7 | |
7843 | |
7844 pslldq m2, 1 | |
7845 palignr m3, m2, 14 | |
7846 | |
7847 pmaddubsw m3, [r4] ; [16] | |
7848 pmulhrsw m3, m7 | |
7849 packuswb m1, m3 | |
7850 | |
7851 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
7852 RET | |
7853 | |
; intra_pred_ang16_17 (SSE4 build): by its name, the mode-17 angular intra
; predictor for 16-wide blocks - NOTE(review): confirm against the C caller.
; Register roles as used in this body:
;   r0 = destination pointer, r1 = destination stride, r2 = reference samples
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] addresses table row (16 + k);
;        the bracketed number after each pmaddubsw is that row index
;   r5 = 3 * stride, r6 = r0 + 4 * stride (presumably both are consumed by
;        TRANSPOSE_STORE_8x8, which is defined outside this chunk - confirm)
;   m7 = pw_1024 (presumably eight words of 1024); pmulhrsw by 1024 computes
;        the rounded shift (x + 16) >> 5
;   m3 = current interleaved sample pairs, m2 = shuffled reference bytes that
;        are fed into m3 one pair at a time via pslldq + palignr
; The body produces two groups of 16 weighted rows; each group is emitted by
; two TRANSPOSE_STORE_8x8 invocations.
7854 INIT_XMM sse4 | |
7855 cglobal intra_pred_ang16_17, 4,7,8 | |
7856 lea r4, [ang_table + 16 * 16] | |
7857 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7858 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
7859 mova m7, [pw_1024] | |
7860 | |
7861 movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
7862 pinsrb m3, [r2], 0 | |
7863 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
7864 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
7865 movu m2, [r2] | |
7866 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] | |
7867 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
7868 | |
7869 pmaddubsw m4, [r4 - 10 * 16] ; [06] | |
7870 pmulhrsw m4, m7 | |
7871 | |
7872 palignr m3, m2, 15 | |
7873 | |
7874 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
7875 pmulhrsw m5, m7 | |
7876 packuswb m4, m5 | |
7877 | |
7878 palignr m3, m2, 14 | |
7879 | |
7880 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
7881 pmulhrsw m5, m7 | |
7882 | |
7883 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] | |
7884 pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] | |
7885 palignr m3, m2, 14 | |
7886 | |
7887 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
7888 pmulhrsw m6, m7 | |
7889 packuswb m5, m6 | |
7890 | |
7891 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] | |
7892 palignr m3, m2, 14 | |
7893 | |
7894 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
7895 pmulhrsw m6, m7 | |
7896 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
7897 pmulhrsw m0, m7 | |
7898 packuswb m6, m0 | |
7899 | |
7900 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] | |
7901 palignr m3, m2, 14 | |
7902 | |
7903 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
7904 pmulhrsw m1, m7 | |
7905 | |
7906 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
7907 palignr m3, m2, 14 | |
7908 | |
7909 pmaddubsw m0, m3, [r4] ; [16] | |
7910 pmulhrsw m0, m7 | |
7911 packuswb m1, m0 | |
7912 | |
7913 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
7914 | |
7915 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] | |
7916 palignr m3, m2, 14 | |
7917 | |
7918 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
7919 pmulhrsw m4, m7 | |
7920 | |
7921 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] | |
7922 palignr m3, m2, 14 | |
7923 | |
7924 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
7925 pmulhrsw m5, m7 | |
7926 packuswb m4, m5 | |
7927 | |
7928 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
7929 pmulhrsw m5, m7 | |
7930 | |
7931 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] | |
7932 palignr m3, m2, 14 | |
7933 | |
7934 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
7935 pmulhrsw m6, m7 | |
7936 packuswb m5, m6 | |
7937 | |
7938 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] | |
7939 palignr m3, m2, 14 | |
7940 | |
7941 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
7942 pmulhrsw m6, m7 | |
7943 | |
7944 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] | |
7945 palignr m3, m2, 14 | |
7946 | |
7947 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
7948 pmulhrsw m1, m7 | |
7949 packuswb m6, m1 | |
7950 | |
7951 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] | |
7952 palignr m3, m2, 14 | |
7953 | |
7954 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
7955 pmulhrsw m1, m7 | |
7956 pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
7957 pmulhrsw m3, m7 | |
7958 packuswb m1, m3 | |
7959 | |
7960 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
7961 | |
; Second group: move both destination pointers down by 8 rows
; (r0 = old r6 + 4 * stride, r6 += 8 * stride) and repeat the same
; coefficient sequence with samples reloaded from [r2 + 1 + 32].
7962 lea r0, [r6 + r1 * 4] | |
7963 lea r6, [r6 + r1 * 8] | |
7964 | |
7965 movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
7966 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
7967 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
7968 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
7969 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] | |
7970 | |
7971 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] | |
7972 pmulhrsw m4, m7 | |
7973 | |
7974 palignr m3, m2, 14 | |
7975 | |
7976 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
7977 pmulhrsw m5, m7 | |
7978 packuswb m4, m5 | |
7979 | |
7980 pslldq m2, 1 | |
7981 palignr m3, m2, 14 | |
7982 | |
7983 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
7984 pmulhrsw m5, m7 | |
7985 | |
7986 pslldq m2, 1 | |
7987 palignr m3, m2, 14 | |
7988 | |
7989 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
7990 pmulhrsw m6, m7 | |
7991 packuswb m5, m6 | |
7992 | |
7993 pslldq m2, 1 | |
7994 palignr m3, m2, 14 | |
7995 | |
7996 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
7997 pmulhrsw m6, m7 | |
7998 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
7999 pmulhrsw m0, m7 | |
8000 packuswb m6, m0 | |
8001 | |
8002 pslldq m2, 1 | |
8003 palignr m3, m2, 14 | |
8004 | |
8005 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
8006 pmulhrsw m1, m7 | |
8007 | |
8008 pslldq m2, 1 | |
8009 palignr m3, m2, 14 | |
8010 | |
8011 pmaddubsw m0, m3, [r4] ; [16] | |
8012 pmulhrsw m0, m7 | |
8013 packuswb m1, m0 | |
8014 | |
8015 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
8016 | |
8017 pslldq m2, 1 | |
8018 palignr m3, m2, 14 | |
8019 | |
8020 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
8021 pmulhrsw m4, m7 | |
8022 | |
8023 pslldq m2, 1 | |
8024 palignr m3, m2, 14 | |
8025 | |
8026 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
8027 pmulhrsw m5, m7 | |
8028 packuswb m4, m5 | |
8029 | |
8030 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
8031 pmulhrsw m5, m7 | |
8032 | |
8033 pslldq m2, 1 | |
8034 palignr m3, m2, 14 | |
8035 | |
8036 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
8037 pmulhrsw m6, m7 | |
8038 packuswb m5, m6 | |
8039 | |
8040 pslldq m2, 1 | |
8041 palignr m3, m2, 14 | |
8042 | |
8043 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
8044 pmulhrsw m6, m7 | |
8045 | |
8046 pslldq m2, 1 | |
8047 palignr m3, m2, 14 | |
8048 | |
8049 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
8050 pmulhrsw m1, m7 | |
8051 packuswb m6, m1 | |
8052 | |
8053 pslldq m2, 1 | |
8054 palignr m3, m2, 14 | |
8055 | |
8056 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
8057 pmulhrsw m1, m7 | |
8058 pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
8059 pmulhrsw m3, m7 | |
8060 packuswb m1, m3 | |
8061 | |
8062 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
8063 RET | |
8064 | |
; intra_pred_ang16_19 (SSE4 build): by its name, the mode-19 angular intra
; predictor for 16-wide blocks - NOTE(review): confirm against the C caller.
; It applies the same coefficient rows and the same c_mode16_17 shuffle as
; intra_pred_ang16_17, but with the two reference halves swapped: the main
; samples come from [r2] and the shuffled ones from [r2 + 32] (byte 0 of
; which is replaced by the byte at [r2]).
; Register roles as used in this body:
;   r0 = destination pointer, r1 = destination stride, r2 = reference samples
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] addresses table row (16 + k);
;        the bracketed number after each pmaddubsw is that row index
;   r5 = 3 * stride, r6 = copy of the original r0 (used to derive the
;        destination of the second group, 8 bytes further along the row)
;   m7 = pw_1024 (presumably eight words of 1024); pmulhrsw by 1024 computes
;        the rounded shift (x + 16) >> 5
; TRANSPOSE_STORE_8x8 is defined outside this chunk; its second argument is 0
; here and 1 in intra_pred_ang16_17 - presumably a transpose flag, confirm.
8065 INIT_XMM sse4 | |
8066 cglobal intra_pred_ang16_19, 4,7,8 | |
8067 lea r4, [ang_table + 16 * 16] | |
8068 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
8069 mov r6, r0 | |
8070 mova m7, [pw_1024] | |
8071 | |
8072 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
8073 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
8074 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
8075 movu m2, [r2 + 32] | |
8076 pinsrb m2, [r2], 0 | |
8077 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] | |
8078 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
8079 | |
8080 pmaddubsw m4, [r4 - 10 * 16] ; [06] | |
8081 pmulhrsw m4, m7 | |
8082 | |
8083 palignr m3, m2, 15 | |
8084 | |
8085 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
8086 pmulhrsw m5, m7 | |
8087 packuswb m4, m5 | |
8088 | |
8089 palignr m3, m2, 14 | |
8090 | |
8091 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
8092 pmulhrsw m5, m7 | |
8093 | |
8094 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] | |
8095 pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] | |
8096 palignr m3, m2, 14 | |
8097 | |
8098 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
8099 pmulhrsw m6, m7 | |
8100 packuswb m5, m6 | |
8101 | |
8102 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] | |
8103 palignr m3, m2, 14 | |
8104 | |
8105 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
8106 pmulhrsw m6, m7 | |
8107 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
8108 pmulhrsw m0, m7 | |
8109 packuswb m6, m0 | |
8110 | |
8111 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] | |
8112 palignr m3, m2, 14 | |
8113 | |
8114 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
8115 pmulhrsw m1, m7 | |
8116 | |
8117 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
8118 palignr m3, m2, 14 | |
8119 | |
8120 pmaddubsw m0, m3, [r4] ; [16] | |
8121 pmulhrsw m0, m7 | |
8122 packuswb m1, m0 | |
8123 | |
8124 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
8125 | |
8126 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] | |
8127 palignr m3, m2, 14 | |
8128 | |
8129 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
8130 pmulhrsw m4, m7 | |
8131 | |
8132 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] | |
8133 palignr m3, m2, 14 | |
8134 | |
8135 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
8136 pmulhrsw m5, m7 | |
8137 packuswb m4, m5 | |
8138 | |
8139 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
8140 pmulhrsw m5, m7 | |
8141 | |
8142 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] | |
8143 palignr m3, m2, 14 | |
8144 | |
8145 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
8146 pmulhrsw m6, m7 | |
8147 packuswb m5, m6 | |
8148 | |
8149 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] | |
8150 palignr m3, m2, 14 | |
8151 | |
8152 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
8153 pmulhrsw m6, m7 | |
8154 | |
8155 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] | |
8156 palignr m3, m2, 14 | |
8157 | |
8158 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
8159 pmulhrsw m1, m7 | |
8160 packuswb m6, m1 | |
8161 | |
8162 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] | |
8163 palignr m3, m2, 14 | |
8164 | |
8165 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
8166 pmulhrsw m1, m7 | |
8167 pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
8168 pmulhrsw m3, m7 | |
8169 packuswb m1, m3 | |
8170 | |
8171 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
8172 | |
; Second group: destination moves 8 bytes along the row from the saved
; start (r6), and the same coefficient sequence is repeated with samples
; reloaded from [r2 + 1].
8173 lea r0, [r6 + 8] | |
8174 | |
8175 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
8176 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
8177 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
8178 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
8179 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] | |
8180 | |
8181 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] | |
8182 pmulhrsw m4, m7 | |
8183 | |
8184 palignr m3, m2, 14 | |
8185 | |
8186 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
8187 pmulhrsw m5, m7 | |
8188 packuswb m4, m5 | |
8189 | |
8190 pslldq m2, 1 | |
8191 palignr m3, m2, 14 | |
8192 | |
8193 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
8194 pmulhrsw m5, m7 | |
8195 | |
8196 pslldq m2, 1 | |
8197 palignr m3, m2, 14 | |
8198 | |
8199 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
8200 pmulhrsw m6, m7 | |
8201 packuswb m5, m6 | |
8202 | |
8203 pslldq m2, 1 | |
8204 palignr m3, m2, 14 | |
8205 | |
8206 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
8207 pmulhrsw m6, m7 | |
8208 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
8209 pmulhrsw m0, m7 | |
8210 packuswb m6, m0 | |
8211 | |
8212 pslldq m2, 1 | |
8213 palignr m3, m2, 14 | |
8214 | |
8215 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
8216 pmulhrsw m1, m7 | |
8217 | |
8218 pslldq m2, 1 | |
8219 palignr m3, m2, 14 | |
8220 | |
8221 pmaddubsw m0, m3, [r4] ; [16] | |
8222 pmulhrsw m0, m7 | |
8223 packuswb m1, m0 | |
8224 | |
8225 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
8226 | |
8227 pslldq m2, 1 | |
8228 palignr m3, m2, 14 | |
8229 | |
8230 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
8231 pmulhrsw m4, m7 | |
8232 | |
8233 pslldq m2, 1 | |
8234 palignr m3, m2, 14 | |
8235 | |
8236 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
8237 pmulhrsw m5, m7 | |
8238 packuswb m4, m5 | |
8239 | |
8240 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
8241 pmulhrsw m5, m7 | |
8242 | |
8243 pslldq m2, 1 | |
8244 palignr m3, m2, 14 | |
8245 | |
8246 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
8247 pmulhrsw m6, m7 | |
8248 packuswb m5, m6 | |
8249 | |
8250 pslldq m2, 1 | |
8251 palignr m3, m2, 14 | |
8252 | |
8253 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
8254 pmulhrsw m6, m7 | |
8255 | |
8256 pslldq m2, 1 | |
8257 palignr m3, m2, 14 | |
8258 | |
8259 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
8260 pmulhrsw m1, m7 | |
8261 packuswb m6, m1 | |
8262 | |
8263 pslldq m2, 1 | |
8264 palignr m3, m2, 14 | |
8265 | |
8266 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
8267 pmulhrsw m1, m7 | |
8268 pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
8269 pmulhrsw m3, m7 | |
8270 packuswb m1, m3 | |
8271 | |
8272 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
8273 RET | |
8274 | |
; intra_pred_ang16_18 (SSE4 build): writes 16 rows of 16 bytes with no
; weighting at all - every row is a 16-byte window over the 32-byte
; concatenation m0:m1, and the window slides down by one byte per row.
;   m0 = 16 bytes loaded from [r2]
;   m1 = 16 bytes loaded from [r2 + 32], reordered by the c_mode16_18 shuffle
;   row 0 = m0; row k (1..15) = palignr(m0, m1, 16 - k), i.e. the lowest k
;   bytes come from the top of m1 and the rest from the bottom of m0
; Register roles: r0 = destination, r1 = destination stride. After the two
; loads r2 is no longer needed as a source pointer and is reused as
; 2 * stride; r3 = 3 * stride and r4 = 4 * stride.
8275 INIT_XMM sse4 | |
8276 cglobal intra_pred_ang16_18, 4,5,3 | |
8277 movu m0, [r2] | |
8278 movu m1, [r2 + 32] | |
8279 mova m2, [c_mode16_18] | |
8280 pshufb m1, m2 | |
8281 | |
8282 lea r2, [r1 * 2] | |
8283 lea r3, [r1 * 3] | |
8284 lea r4, [r1 * 4] | |
; rows 0-3
8285 movu [r0], m0 | |
8286 palignr m2, m0, m1, 15 | |
8287 movu [r0 + r1], m2 | |
8288 palignr m2, m0, m1, 14 | |
8289 movu [r0 + r2], m2 | |
8290 palignr m2, m0, m1, 13 | |
8291 movu [r0 + r3], m2 | |
; rows 4-7
8292 lea r0, [r0 + r4] | |
8293 palignr m2, m0, m1, 12 | |
8294 movu [r0], m2 | |
8295 palignr m2, m0, m1, 11 | |
8296 movu [r0 + r1], m2 | |
8297 palignr m2, m0, m1, 10 | |
8298 movu [r0 + r2], m2 | |
8299 palignr m2, m0, m1, 9 | |
8300 movu [r0 + r3], m2 | |
; rows 8-11
8301 lea r0, [r0 + r4] | |
8302 palignr m2, m0, m1, 8 | |
8303 movu [r0], m2 | |
8304 palignr m2, m0, m1, 7 | |
8305 movu [r0 + r1], m2 | |
8306 palignr m2, m0, m1, 6 | |
8307 movu [r0 + r2], m2 | |
8308 palignr m2, m0, m1, 5 | |
8309 movu [r0 + r3], m2 | |
; rows 12-15 (the last row overwrites m0 in place, it is not needed again)
8310 lea r0, [r0 + r4] | |
8311 palignr m2, m0, m1, 4 | |
8312 movu [r0], m2 | |
8313 palignr m2, m0, m1, 3 | |
8314 movu [r0 + r1], m2 | |
8315 palignr m2, m0, m1, 2 | |
8316 movu [r0 + r2], m2 | |
8317 palignr m0, m1, 1 | |
8318 movu [r0 + r3], m0 | |
8319 RET | |
8320 | |
8321 ; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8 | |
; Parameters:
;   %1      column-group index; only used on the transpose path, where it
;           adds %1 * 8 bytes to every store address
;   %2      1 = transpose the 8x8 result before storing, otherwise store the
;           rows as computed
;   %3-%10  coefficient-row offsets c0..c7, one per input register m0..m7;
;           a non-zero value cN selects [r4 + cN * 16]
; A zero offset means "no weighting" for that register: the bytes already in
; its low half are passed through (an odd register is widened with pmovzxbw
; so it can share the common packuswb step; an even register is left as is
; and receives its partner's packed bytes in its high half via movlhps).
; Registers expected from the invoking code: r0 = destination, r1 = stride,
; r3 = address of the pshufb mask, r4 = coefficient table base,
; r5 (presumably 3 * stride - confirm at the call sites), and r6 on the
; transpose path (presumably r0 + 4 * stride - confirm at the call sites).
; Clobbers m0-m7 (m1 is reloaded with pw_1024 part-way through); on the
; non-transpose path r0 is advanced by 4 * stride.
8322 %macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7 | |
; --- rows 0/1 -> packed into m0 (m1 is still live here, so pw_1024 is
;     used straight from memory)
8323 %if %3 == 0 | |
8324 %else | |
8325 pshufb m0, [r3] | |
8326 pmaddubsw m0, [r4 + %3 * 16] | |
8327 pmulhrsw m0, [pw_1024] | |
8328 %endif | |
8329 %if %4 == 0 | |
8330 pmovzxbw m1, m1 | |
8331 %else | |
8332 pshufb m1, [r3] | |
8333 pmaddubsw m1, [r4 + %4 * 16] | |
8334 pmulhrsw m1, [pw_1024] | |
8335 %endif | |
8336 %if %3 == 0 | |
8337 packuswb m1, m1 | |
8338 movlhps m0, m1 | |
8339 %else | |
8340 packuswb m0, m1 | |
8341 %endif | |
; m1 is free from here on and holds the rounding multiplier
8342 mova m1, [pw_1024] | |
; --- rows 2/3 -> packed into m2
8343 %if %5 == 0 | |
8344 %else | |
8345 pshufb m2, [r3] | |
8346 pmaddubsw m2, [r4 + %5 * 16] | |
8347 pmulhrsw m2, m1 | |
8348 %endif | |
8349 %if %6 == 0 | |
8350 pmovzxbw m3, m3 | |
8351 %else | |
8352 pshufb m3, [r3] | |
8353 pmaddubsw m3, [r4 + %6 * 16] | |
8354 pmulhrsw m3, m1 | |
8355 %endif | |
8356 %if %5 == 0 | |
8357 packuswb m3, m3 | |
8358 movlhps m2, m3 | |
8359 %else | |
8360 packuswb m2, m3 | |
8361 %endif | |
; --- rows 4/5 -> packed into m4
8362 %if %7 == 0 | |
8363 %else | |
8364 pshufb m4, [r3] | |
8365 pmaddubsw m4, [r4 + %7 * 16] | |
8366 pmulhrsw m4, m1 | |
8367 %endif | |
8368 %if %8 == 0 | |
8369 pmovzxbw m5, m5 | |
8370 %else | |
8371 pshufb m5, [r3] | |
8372 pmaddubsw m5, [r4 + %8 * 16] | |
8373 pmulhrsw m5, m1 | |
8374 %endif | |
8375 %if %7 == 0 | |
8376 packuswb m5, m5 | |
8377 movlhps m4, m5 | |
8378 %else | |
8379 packuswb m4, m5 | |
8380 %endif | |
; --- rows 6/7 -> packed into m6
8381 %if %9 == 0 | |
8382 %else | |
8383 pshufb m6, [r3] | |
8384 pmaddubsw m6, [r4 + %9 * 16] | |
8385 pmulhrsw m6, m1 | |
8386 %endif | |
8387 %if %10 == 0 | |
8388 pmovzxbw m7, m7 | |
8389 %else | |
8390 pshufb m7, [r3] | |
8391 pmaddubsw m7, [r4 + %10 * 16] | |
8392 pmulhrsw m7, m1 | |
8393 %endif | |
8394 %if %9 == 0 | |
8395 packuswb m7, m7 | |
8396 movlhps m6, m7 | |
8397 %else | |
8398 packuswb m6, m7 | |
8399 %endif | |
8400 | |
8401 %if %2 == 1 | |
8402 ; transpose | |
8403 punpckhbw m1, m0, m2 | |
8404 punpcklbw m0, m2 | |
8405 punpckhbw m3, m0, m1 | |
8406 punpcklbw m0, m1 | |
8407 | |
8408 punpckhbw m1, m4, m6 | |
8409 punpcklbw m4, m6 | |
8410 punpckhbw m6, m4, m1 | |
8411 punpcklbw m4, m1 | |
8412 | |
8413 punpckhdq m2, m0, m4 | |
8414 punpckldq m0, m4 | |
8415 punpckldq m4, m3, m6 | |
8416 punpckhdq m3, m6 | |
8417 | |
; transposed result: 8 bytes per row, rows 0-3 relative to r0 and
; rows 4-7 relative to r6, each shifted right by %1 * 8 bytes
8418 movh [r0 + + %1 * 8], m0 | |
8419 movhps [r0 + r1 + %1 * 8], m0 | |
8420 movh [r0 + r1*2 + %1 * 8], m2 | |
8421 movhps [r0 + r5 + %1 * 8], m2 | |
8422 movh [r6 + %1 * 8], m4 | |
8423 movhps [r6 + r1 + %1 * 8], m4 | |
8424 movh [r6 + r1*2 + %1 * 8], m3 | |
8425 movhps [r6 + r5 + %1 * 8], m3 | |
8426 %else | |
; untransposed result: 8 consecutive rows starting at r0; %1 is ignored
; and r0 is left pointing at row 4
8427 movh [r0 ], m0 | |
8428 movhps [r0 + r1 ], m0 | |
8429 movh [r0 + r1 * 2], m2 | |
8430 movhps [r0 + r5 ], m2 | |
8431 lea r0, [r0 + r1 * 4] | |
8432 movh [r0 ], m4 | |
8433 movhps [r0 + r1 ], m4 | |
8434 movh [r0 + r1 * 2], m6 | |
8435 movhps [r0 + r5 ], m6 | |
8436 %endif | |
8437 %endmacro | |
8438 | |
; MODE_3_33 <flag>: shared body for what the name suggests are angular modes
; 3 and 33 - NOTE(review): confirm at the invocation sites.
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   (defined outside this chunk; presumably a transpose selector - confirm).
; Registers expected from the invoking code:
;   r2 = reference samples
;   r3 = coefficient table base biased by 16 rows, so [r3 + k * 16] is table
;        row (16 + k); the bracketed number after each pmaddubsw is that row
;   m7 = rounding multiplier for pmulhrsw (not loaded in this macro)
; The macro emits four groups of eight weighted rows (32 rows in total),
; reloading the samples from [r2 + 1], [r2 + 8], [r2 + 14] and [r2 + 21];
; groups are handed to TRANSPOSE_STORE_8x8 with indices 0..3. The rows
; marked [00] need no weighting and are copied straight from memory with
; movhps. Clobbers m0-m2 and m4-m6.
8439 %macro MODE_3_33 1 | |
8440 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
8441 palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
8442 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
8443 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
8444 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
8445 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] | |
8446 pmulhrsw m4, m7 | |
8447 pmaddubsw m1, [r3 + 4 * 16] ; [20] | |
8448 pmulhrsw m1, m7 | |
8449 packuswb m4, m1 | |
8450 palignr m5, m2, m0, 4 | |
8451 pmaddubsw m5, [r3 - 2 * 16] ; [14] | |
8452 pmulhrsw m5, m7 | |
8453 palignr m6, m2, m0, 6 | |
8454 pmaddubsw m6, [r3 - 8 * 16] ; [ 8] | |
8455 pmulhrsw m6, m7 | |
8456 packuswb m5, m6 | |
8457 palignr m1, m2, m0, 8 | |
8458 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] | |
8459 pmulhrsw m6, m7 | |
8460 pmaddubsw m1, [r3 + 12 * 16] ; [28] | |
8461 pmulhrsw m1, m7 | |
8462 packuswb m6, m1 | |
8463 palignr m1, m2, m0, 10 | |
8464 pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
8465 pmulhrsw m1, m7 | |
8466 palignr m2, m0, 12 | |
8467 pmaddubsw m2, [r3] ; [16] | |
8468 pmulhrsw m2, m7 | |
8469 packuswb m1, m2 | |
8470 | |
8471 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
8472 | |
8473 movu m0, [r2 + 8] | |
8474 palignr m1, m0, 1 | |
8475 punpckhbw m2, m0, m1 | |
8476 punpcklbw m0, m1 | |
8477 palignr m5, m2, m0, 2 | |
8478 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] | |
8479 pmulhrsw m4, m7 | |
8480 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] | |
8481 pmulhrsw m1, m7 | |
8482 packuswb m4, m1 | |
8483 pmaddubsw m5, [r3 + 14 * 16] ; [30] | |
8484 pmulhrsw m5, m7 | |
8485 palignr m6, m2, m0, 4 | |
8486 pmaddubsw m6, [r3 + 8 * 16] ; [24] | |
8487 pmulhrsw m6, m7 | |
8488 packuswb m5, m6 | |
8489 palignr m1, m2, m0, 6 | |
8490 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] | |
8491 pmulhrsw m6, m7 | |
8492 palignr m1, m2, m0, 8 | |
8493 pmaddubsw m1, [r3 - 4 * 16] ; [12] | |
8494 pmulhrsw m1, m7 | |
8495 packuswb m6, m1 | |
8496 palignr m1, m2, m0, 10 | |
8497 pmaddubsw m1, [r3 - 10 * 16] ; [06] | |
8498 pmulhrsw m1, m7 | |
8499 packuswb m1, m1 | |
8500 movhps m1, [r2 + 14] ; [00] | |
8501 | |
8502 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
8503 | |
8504 movu m0, [r2 + 14] | |
8505 palignr m1, m0, 1 | |
8506 punpckhbw m2, m0, m1 | |
8507 punpcklbw m0, m1 | |
8508 palignr m1, m2, m0, 2 | |
8509 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] | |
8510 pmulhrsw m4, m7 | |
8511 pmaddubsw m1, [r3 + 4 * 16] ; [20] | |
8512 pmulhrsw m1, m7 | |
8513 packuswb m4, m1 | |
8514 palignr m5, m2, m0, 4 | |
8515 pmaddubsw m5, [r3 - 2 * 16] ; [14] | |
8516 pmulhrsw m5, m7 | |
8517 palignr m6, m2, m0, 6 | |
8518 pmaddubsw m6, [r3 - 8 * 16] ; [ 8] | |
8519 pmulhrsw m6, m7 | |
8520 packuswb m5, m6 | |
8521 palignr m1, m2, m0, 8 | |
8522 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] | |
8523 pmulhrsw m6, m7 | |
8524 pmaddubsw m1, [r3 + 12 * 16] ; [28] | |
8525 pmulhrsw m1, m7 | |
8526 packuswb m6, m1 | |
8527 palignr m1, m2, m0, 10 | |
8528 pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
8529 pmulhrsw m1, m7 | |
8530 palignr m2, m0, 12 | |
8531 pmaddubsw m2, [r3] ; [16] | |
8532 pmulhrsw m2, m7 | |
8533 packuswb m1, m2 | |
8534 | |
8535 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
8536 | |
8537 movu m0, [r2 + 21] | |
8538 palignr m1, m0, 1 | |
8539 punpckhbw m2, m0, m1 | |
8540 punpcklbw m0, m1 | |
8541 palignr m5, m2, m0, 2 | |
8542 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] | |
8543 pmulhrsw m4, m7 | |
8544 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] | |
8545 pmulhrsw m1, m7 | |
8546 packuswb m4, m1 | |
8547 pmaddubsw m5, [r3 + 14 * 16] ; [30] | |
8548 pmulhrsw m5, m7 | |
8549 palignr m6, m2, m0, 4 | |
8550 pmaddubsw m6, [r3 + 8 * 16] ; [24] | |
8551 pmulhrsw m6, m7 | |
8552 packuswb m5, m6 | |
8553 palignr m1, m2, m0, 6 | |
8554 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] | |
8555 pmulhrsw m6, m7 | |
8556 palignr m1, m2, m0, 8 | |
8557 pmaddubsw m1, [r3 - 4 * 16] ; [12] | |
8558 pmulhrsw m1, m7 | |
8559 packuswb m6, m1 | |
8560 palignr m1, m2, m0, 10 | |
8561 pmaddubsw m1, [r3 - 10 * 16] ; [06] | |
8562 pmulhrsw m1, m7 | |
8563 packuswb m1, m1 | |
8564 movhps m1, [r2 + 27] ; [00] | |
8565 | |
8566 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
8567 %endmacro | |
8568 | |
; MODE_4_32 <flag>: shared body for what the name suggests are angular modes
; 4 and 32 - NOTE(review): confirm at the invocation sites.
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   (defined outside this chunk; presumably a transpose selector - confirm).
; Registers expected from the invoking code:
;   r2 = reference samples
;   r3 = coefficient table base biased by 16 rows, so [r3 + k * 16] is table
;        row (16 + k); the bracketed number after each pmaddubsw is that row
;   m7 = rounding multiplier for pmulhrsw (not loaded in this macro)
; The macro emits four groups of eight weighted rows, reloading the samples
; from [r2 + 1], [r2 + 6], [r2 + 12] and [r2 + 17]. The first row of the
; second group ([29]) is computed from m2 left over from the first group,
; before m2 is overwritten by the reload. The final [00] row is copied
; straight from [r2 + 22]. Clobbers m0-m6.
8569 %macro MODE_4_32 1 | |
8570 movu m0, [r2 + 1] | |
8571 palignr m1, m0, 1 | |
8572 punpckhbw m2, m0, m1 | |
8573 punpcklbw m0, m1 | |
8574 palignr m1, m2, m0, 2 | |
8575 mova m5, m1 | |
8576 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] | |
8577 pmulhrsw m4, m7 | |
8578 pmaddubsw m1, [r3 - 6 * 16] ; [10] | |
8579 pmulhrsw m1, m7 | |
8580 packuswb m4, m1 | |
8581 pmaddubsw m5, [r3 + 15 * 16] ; [31] | |
8582 pmulhrsw m5, m7 | |
8583 palignr m6, m2, m0, 4 | |
8584 pmaddubsw m6, [r3 + 4 * 16] ; [ 20] | |
8585 pmulhrsw m6, m7 | |
8586 packuswb m5, m6 | |
8587 palignr m1, m2, m0, 6 | |
8588 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] | |
8589 pmulhrsw m6, m7 | |
8590 pmaddubsw m1, [r3 + 14 * 16] ; [30] | |
8591 pmulhrsw m1, m7 | |
8592 packuswb m6, m1 | |
8593 palignr m1, m2, m0, 8 | |
8594 pmaddubsw m1, [r3 + 3 * 16] ; [19] | |
8595 pmulhrsw m1, m7 | |
8596 palignr m2, m0, 10 | |
8597 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] | |
8598 pmulhrsw m3, m7 | |
8599 packuswb m1, m3 | |
8600 | |
8601 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
8602 | |
8603 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] | |
8604 pmulhrsw m4, m7 | |
8605 movu m0, [r2 + 6] | |
8606 palignr m1, m0, 1 | |
8607 punpckhbw m2, m0, m1 | |
8608 punpcklbw m0, m1 | |
8609 palignr m1, m2, m0, 2 | |
8610 pmaddubsw m1, [r3 + 2 * 16] ; [18] | |
8611 pmulhrsw m1, m7 | |
8612 packuswb m4, m1 | |
8613 palignr m5, m2, m0, 4 | |
8614 mova m6, m5 | |
8615 pmaddubsw m5, [r3 - 9 * 16] ; [07] | |
8616 pmulhrsw m5, m7 | |
8617 pmaddubsw m6, [r3 + 12 * 16] ; [28] | |
8618 pmulhrsw m6, m7 | |
8619 packuswb m5, m6 | |
8620 palignr m6, m2, m0, 6 | |
8621 pmaddubsw m6, [r3 + 16] ; [17] | |
8622 pmulhrsw m6, m7 | |
8623 palignr m1, m2, m0, 8 | |
8624 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] | |
8625 pmulhrsw m3, m7 | |
8626 packuswb m6, m3 | |
8627 pmaddubsw m1, [r3 + 11 * 16] ; [27] | |
8628 pmulhrsw m1, m7 | |
8629 palignr m2, m0, 10 | |
8630 pmaddubsw m2, [r3] ; [16] | |
8631 pmulhrsw m2, m7 | |
8632 packuswb m1, m2 | |
8633 | |
8634 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
8635 | |
8636 movu m0, [r2 + 12] | |
8637 palignr m1, m0, 1 | |
8638 punpckhbw m2, m0, m1 | |
8639 punpcklbw m0, m1 | |
8640 mova m1, m0 | |
8641 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] | |
8642 pmulhrsw m4, m7 | |
8643 pmaddubsw m1, [r3 + 10 * 16] ; [26] | |
8644 pmulhrsw m1, m7 | |
8645 packuswb m4, m1 | |
8646 palignr m5, m2, m0, 2 | |
8647 pmaddubsw m5, [r3 - 16] ; [15] | |
8648 pmulhrsw m5, m7 | |
8649 palignr m6, m2, m0, 4 | |
8650 mova m1, m6 | |
8651 pmaddubsw m1, [r3 - 12 * 16] ; [4] | |
8652 pmulhrsw m1, m7 | |
8653 packuswb m5, m1 | |
8654 pmaddubsw m6, [r3 + 9 * 16] ; [25] | |
8655 pmulhrsw m6, m7 | |
8656 palignr m1, m2, m0, 6 | |
8657 pmaddubsw m1, [r3 - 2 * 16] ; [14] | |
8658 pmulhrsw m1, m7 | |
8659 packuswb m6, m1 | |
8660 palignr m1, m2, m0, 8 | |
8661 mova m2, m1 | |
8662 pmaddubsw m1, [r3 - 13 * 16] ; [3] | |
8663 pmulhrsw m1, m7 | |
8664 pmaddubsw m2, [r3 + 8 * 16] ; [24] | |
8665 pmulhrsw m2, m7 | |
8666 packuswb m1, m2 | |
8667 | |
8668 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
8669 | |
8670 movu m0, [r2 + 17] | |
8671 palignr m1, m0, 1 | |
8672 punpckhbw m2, m0, m1 | |
8673 punpcklbw m0, m1 | |
8674 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13] | |
8675 pmulhrsw m4, m7 | |
8676 palignr m5, m2, m0, 2 | |
8677 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] | |
8678 pmulhrsw m1, m7 | |
8679 packuswb m4, m1 | |
8680 pmaddubsw m5, [r3 + 7 * 16] ; [23] | |
8681 pmulhrsw m5, m7 | |
8682 palignr m6, m2, m0, 4 | |
8683 pmaddubsw m6, [r3 - 4 * 16] ; [12] | |
8684 pmulhrsw m6, m7 | |
8685 packuswb m5, m6 | |
8686 palignr m6, m2, m0, 6 | |
8687 mova m1, m6 | |
8688 pmaddubsw m6, [r3 - 15 * 16] ; [1] | |
8689 pmulhrsw m6, m7 | |
8690 pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
8691 pmulhrsw m1, m7 | |
8692 packuswb m6, m1 | |
8693 palignr m1, m2, m0, 8 | |
8694 pmaddubsw m1, [r3 - 5 * 16] ; [11] | |
8695 pmulhrsw m1, m7 | |
8696 packuswb m1, m1 | |
8697 movhps m1, [r2 + 22] ; [00] | |
8698 | |
8699 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
8700 %endmacro | |
8701 | |
; MODE_5_31 <flag>: shared body for what the name suggests are angular modes
; 5 and 31 - NOTE(review): confirm at the invocation sites.
;   %1 is forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   (defined outside this chunk; presumably a transpose selector - confirm).
; Registers expected from the invoking code:
;   r2 = reference samples
;   r3 = coefficient table base biased by 16 rows, so [r3 + k * 16] is table
;        row (16 + k); the bracketed number after each pmaddubsw is that row
;   m7 = rounding multiplier for pmulhrsw (not loaded in this macro)
; The macro emits four groups of eight weighted rows, reloading the samples
; from [r2 + 1], [r2 + 5], [r2 + 10] and [r2 + 14]. The final [00] row is
; copied straight from [r2 + 18]. Clobbers m0-m6.
8702 %macro MODE_5_31 1 | |
8703 movu m0, [r2 + 1] | |
8704 palignr m1, m0, 1 | |
8705 punpckhbw m2, m0, m1 | |
8706 punpcklbw m0, m1 | |
8707 palignr m1, m2, m0, 2 | |
8708 mova m5, m1 | |
8709 pmaddubsw m4, m0, [r3 + 16] ; [17] | |
8710 pmulhrsw m4, m7 | |
8711 pmaddubsw m1, [r3 - 14 * 16] ; [2] | |
8712 pmulhrsw m1, m7 | |
8713 packuswb m4, m1 | |
8714 pmaddubsw m5, [r3 + 3 * 16] ; [19] | |
8715 pmulhrsw m5, m7 | |
8716 palignr m6, m2, m0, 4 | |
8717 mova m1, m6 | |
8718 pmaddubsw m6, [r3 - 12 * 16] ; [4] | |
8719 pmulhrsw m6, m7 | |
8720 packuswb m5, m6 | |
8721 pmaddubsw m6, m1, [r3 + 5 * 16] ; [21] | |
8722 pmulhrsw m6, m7 | |
8723 palignr m1, m2, m0, 6 | |
8724 mova m3, m1 | |
8725 pmaddubsw m3, [r3 - 10 * 16] ; [6] | |
8726 pmulhrsw m3, m7 | |
8727 packuswb m6, m3 | |
8728 pmaddubsw m1, [r3 + 7 * 16] ; [23] | |
8729 pmulhrsw m1, m7 | |
8730 palignr m2, m0, 8 | |
8731 pmaddubsw m2, [r3 - 8 * 16] ; [8] | |
8732 pmulhrsw m2, m7 | |
8733 packuswb m1, m2 | |
8734 | |
8735 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
8736 | |
8737 movu m0, [r2 + 5] | |
8738 palignr m1, m0, 1 | |
8739 punpckhbw m2, m0, m1 | |
8740 punpcklbw m0, m1 | |
8741 palignr m1, m2, m0, 2 | |
8742 mova m5, m1 | |
8743 pmaddubsw m4, m0, [r3 + 9 * 16] ; [25] | |
8744 pmulhrsw m4, m7 | |
8745 pmaddubsw m1, [r3 - 6 * 16] ; [10] | |
8746 pmulhrsw m1, m7 | |
8747 packuswb m4, m1 | |
8748 pmaddubsw m5, [r3 + 11 * 16] ; [27] | |
8749 pmulhrsw m5, m7 | |
8750 palignr m6, m2, m0, 4 | |
8751 mova m1, m6 | |
8752 pmaddubsw m6, [r3 - 4 * 16] ; [12] | |
8753 pmulhrsw m6, m7 | |
8754 packuswb m5, m6 | |
8755 pmaddubsw m6, m1, [r3 + 13 * 16] ; [29] | |
8756 pmulhrsw m6, m7 | |
8757 palignr m1, m2, m0, 6 | |
8758 mova m3, m1 | |
8759 pmaddubsw m3, [r3 - 2 * 16] ; [14] | |
8760 pmulhrsw m3, m7 | |
8761 packuswb m6, m3 | |
8762 pmaddubsw m1, [r3 + 15 * 16] ; [31] | |
8763 pmulhrsw m1, m7 | |
8764 palignr m2, m0, 8 | |
8765 pmaddubsw m2, [r3] ; [16] | |
8766 pmulhrsw m2, m7 | |
8767 packuswb m1, m2 | |
8768 | |
8769 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
8770 | |
8771 movu m0, [r2 + 10] | |
8772 palignr m1, m0, 1 | |
8773 punpckhbw m2, m0, m1 | |
8774 punpcklbw m0, m1 | |
8775 mova m1, m0 | |
8776 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] | |
8777 pmulhrsw m4, m7 | |
8778 pmaddubsw m1, [r3 + 2 * 16] ; [18] | |
8779 pmulhrsw m1, m7 | |
8780 packuswb m4, m1 | |
8781 palignr m5, m2, m0, 2 | |
8782 mova m1, m5 | |
8783 pmaddubsw m5, [r3 - 13 * 16] ; [3] | |
8784 pmulhrsw m5, m7 | |
8785 pmaddubsw m1, [r3 + 4 * 16] ; [20] | |
8786 pmulhrsw m1, m7 | |
8787 packuswb m5, m1 | |
8788 palignr m1, m2, m0, 4 | |
8789 pmaddubsw m6, m1, [r3 - 11 * 16] ; [5] | |
8790 pmulhrsw m6, m7 | |
8791 pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
8792 pmulhrsw m1, m7 | |
8793 packuswb m6, m1 | |
8794 palignr m2, m0, 6 | |
8795 pmaddubsw m1, m2, [r3 - 9 * 16] ; [7] | |
8796 pmulhrsw m1, m7 | |
8797 pmaddubsw m2, [r3 + 8 * 16] ; [24] | |
8798 pmulhrsw m2, m7 | |
8799 packuswb m1, m2 | |
8800 | |
8801 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
8802 | |
8803 movu m0, [r2 + 14] | |
8804 palignr m1, m0, 1 | |
8805 punpckhbw m2, m0, m1 | |
8806 punpcklbw m0, m1 | |
8807 mova m1, m0 | |
8808 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] | |
8809 pmulhrsw m4, m7 | |
8810 pmaddubsw m1, [r3 + 10 * 16] ; [26] | |
8811 pmulhrsw m1, m7 | |
8812 packuswb m4, m1 | |
8813 palignr m5, m2, m0, 2 | |
8814 mova m1, m5 | |
8815 pmaddubsw m5, [r3 - 5 * 16] ; [11] | |
8816 pmulhrsw m5, m7 | |
8817 pmaddubsw m1, [r3 + 12 * 16] ; [28] | |
8818 pmulhrsw m1, m7 | |
8819 packuswb m5, m1 | |
8820 palignr m1, m2, m0, 4 | |
8821 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] | |
8822 pmulhrsw m6, m7 | |
8823 pmaddubsw m1, [r3 + 14 * 16] ; [30] | |
8824 pmulhrsw m1, m7 | |
8825 packuswb m6, m1 | |
8826 palignr m2, m0, 6 | |
8827 pmaddubsw m1, m2, [r3 - 16] ; [15] | |
8828 pmulhrsw m1, m7 | |
8829 packuswb m1, m1 | |
8830 movhps m1, [r2 + 18] ; [00] | |
8831 | |
8832 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
8833 %endmacro | |
8834 | |
;-----------------------------------------------------------------------------
; MODE_6_30 <store-mode>
; Straight-line kernel: builds 32 eight-byte result vectors from the reference
; bytes at r2 and hands them, eight at a time, to TRANSPOSE_STORE_8x8 as
; tiles 0..3.  The bracketed number on each pmaddubsw line is the weight-table
; entry used; it advances by 13 modulo 32 from one vector to the next, and the
; interleaved source pairs are advanced by one pixel (2 bytes) at each wrap.
;   %1 - forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   r2 - reference byte pointer
;   r3 - weight table pointer; [r3 + k * 16] is annotated as entry [16 + k]
;   m7 - word multiplier for pmulhrsw (the caller in this file loads pw_1024;
;        presumably this yields a rounded >> 5 -- confirm against the constant)
; Writes m0-m6.  TRANSPOSE_STORE_8x8 is defined elsewhere; its own register
; and pointer usage is not visible here.
;-----------------------------------------------------------------------------
8835 %macro MODE_6_30 1 | |
; tile 0: interleave byte pairs (p[i], p[i+1]) from [r2 + 1]; m0 = low pairs, m2 = high pairs
8836 movu m0, [r2 + 1] | |
8837 palignr m1, m0, 1 | |
8838 punpckhbw m2, m0, m1 | |
8839 punpcklbw m0, m1 | |
8840 mova m1, m0 | |
8841 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13] | |
8842 pmulhrsw m4, m7 | |
8843 pmaddubsw m1, [r3 + 10 * 16] ; [26] | |
8844 pmulhrsw m1, m7 | |
8845 packuswb m4, m1 | |
8846 palignr m6, m2, m0, 2 | |
8847 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] | |
8848 pmulhrsw m5, m7 | |
8849 pmaddubsw m6, [r3 + 4 * 16] ; [20] | |
8850 pmulhrsw m6, m7 | |
8851 packuswb m5, m6 | |
8852 palignr m1, m2, m0, 4 | |
8853 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] | |
8854 pmulhrsw m6, m7 | |
8855 pmaddubsw m3, m1, [r3 - 2 * 16] ; [14] | |
8856 pmulhrsw m3, m7 | |
8857 packuswb m6, m3 | |
8858 pmaddubsw m1, [r3 + 11 * 16] ; [27] | |
8859 pmulhrsw m1, m7 | |
8860 palignr m2, m0, 6 | |
8861 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] | |
8862 pmulhrsw m3, m7 | |
8863 packuswb m1, m3 | |
8864 | |
8865 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
8866 | |
; tile 1: the first vector still uses the window left in m2 by tile 0,
; then the source is reloaded from [r2 + 5]
8867 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] | |
8868 pmulhrsw m4, m7 | |
8869 movu m0, [r2 + 5] | |
8870 palignr m1, m0, 1 | |
8871 punpckhbw m2, m0, m1 | |
8872 punpcklbw m0, m1 | |
8873 mova m6, m0 | |
8874 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] | |
8875 pmulhrsw m1, m7 | |
8876 packuswb m4, m1 | |
8877 pmaddubsw m5, m6, [r3 - 16] ; [15] | |
8878 pmulhrsw m5, m7 | |
8879 pmaddubsw m6, [r3 + 12 * 16] ; [28] | |
8880 pmulhrsw m6, m7 | |
8881 packuswb m5, m6 | |
8882 palignr m3, m2, m0, 2 | |
8883 pmaddubsw m6, m3, [r3 - 7 * 16] ; [9] | |
8884 pmulhrsw m6, m7 | |
8885 pmaddubsw m3, [r3 + 6 * 16] ; [22] | |
8886 pmulhrsw m3, m7 | |
8887 packuswb m6, m3 | |
8888 palignr m2, m0, 4 | |
8889 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] | |
8890 pmulhrsw m1, m7 | |
8891 pmaddubsw m3, m2, [r3] ; [16] | |
8892 pmulhrsw m3, m7 | |
8893 packuswb m1, m3 | |
8894 | |
8895 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
8896 | |
; tile 2: first vector again reuses m2 from the previous tile; reload from [r2 + 7]
8897 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] | |
8898 pmulhrsw m4, m7 | |
8899 movu m0, [r2 + 7] | |
8900 palignr m1, m0, 1 | |
8901 punpckhbw m2, m0, m1 | |
8902 punpcklbw m0, m1 | |
8903 palignr m5, m2, m0, 2 | |
8904 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] | |
8905 pmulhrsw m1, m7 | |
8906 packuswb m4, m1 | |
8907 pmaddubsw m5, [r3 + 7 * 16] ; [23] | |
8908 pmulhrsw m5, m7 | |
8909 palignr m1, m2, m0, 4 | |
8910 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] | |
8911 pmulhrsw m6, m7 | |
8912 packuswb m5, m6 | |
8913 pmaddubsw m6, m1, [r3 + 16] ; [17] | |
8914 pmulhrsw m6, m7 | |
8915 pmaddubsw m1, [r3 + 14 * 16] ; [30] | |
8916 pmulhrsw m1, m7 | |
8917 packuswb m6, m1 | |
8918 palignr m2, m2, m0, 6 | |
8919 pmaddubsw m1, m2, [r3 - 5 * 16] ; [11] | |
8920 pmulhrsw m1, m7 | |
8921 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24] | |
8922 pmulhrsw m2, m7 | |
8923 packuswb m1, m2 | |
8924 | |
8925 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
8926 | |
; tile 3: reload from [r2 + 11]; the last vector (entry [00]) is 8 raw bytes
; copied from [r2 + 14] into the high half of m1, with no multiply
8927 movu m0, [r2 + 11] | |
8928 palignr m1, m0, 1 | |
8929 punpckhbw m2, m0, m1 | |
8930 punpcklbw m0, m1 | |
8931 mova m5, m0 | |
8932 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] | |
8933 pmulhrsw m4, m7 | |
8934 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] | |
8935 pmulhrsw m3, m7 | |
8936 packuswb m4, m3 | |
8937 pmaddubsw m5, [r3 + 15 * 16] ; [31] | |
8938 pmulhrsw m5, m7 | |
8939 palignr m6, m2, m0, 2 | |
8940 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] | |
8941 pmulhrsw m1, m7 | |
8942 packuswb m5, m1 | |
8943 pmaddubsw m6, [r3 + 9 * 16] ; [25] | |
8944 pmulhrsw m6, m7 | |
8945 palignr m1, m2, m0, 4 | |
8946 pmaddubsw m2, m1, [r3 - 10 * 16] ; [6] | |
8947 pmulhrsw m2, m7 | |
8948 packuswb m6, m2 | |
8949 pmaddubsw m1, [r3 + 3 * 16] ; [19] | |
8950 pmulhrsw m1, m7 | |
8951 packuswb m1, m1 | |
8952 movhps m1, [r2 + 14] ; [00] | |
8953 | |
8954 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
8955 %endmacro | |
8956 | |
;-----------------------------------------------------------------------------
; MODE_7_29 <store-mode>
; Straight-line kernel: builds 32 eight-byte result vectors from the reference
; bytes at r2 and passes them, eight at a time, to TRANSPOSE_STORE_8x8 as
; tiles 0..3.  The bracketed weight-table entry on each pmaddubsw line
; advances by 9 modulo 32 per vector; the interleaved source pairs move on by
; one pixel (2 bytes) whenever that index wraps.
;   %1 - forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   r2 - reference byte pointer
;   r3 - weight table pointer; [r3 + k * 16] is annotated as entry [16 + k]
;   m7 - word multiplier for pmulhrsw (the caller in this file loads pw_1024;
;        presumably this yields a rounded >> 5 -- confirm against the constant)
; Writes m0-m6.  TRANSPOSE_STORE_8x8 is defined elsewhere; its own register
; and pointer usage is not visible here.
;-----------------------------------------------------------------------------
8957 %macro MODE_7_29 1 | |
; tile 0: byte pairs (p[i], p[i+1]) from [r2 + 1]; m0 = low pairs, m2 = high pairs
8958 movu m0, [r2 + 1] | |
8959 palignr m1, m0, 1 | |
8960 punpckhbw m2, m0, m1 | |
8961 punpcklbw m0, m1 | |
8962 mova m5, m0 | |
8963 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] | |
8964 pmulhrsw m4, m7 | |
8965 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] | |
8966 pmulhrsw m3, m7 | |
8967 packuswb m4, m3 | |
8968 pmaddubsw m5, [r3 + 11 * 16] ; [27] | |
8969 pmulhrsw m5, m7 | |
; m1 = pairs advanced by one pixel, m2 = pairs advanced by two pixels
8970 palignr m1, m2, m0, 2 | |
8971 palignr m2, m0, 4 | |
8972 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] | |
8973 pmulhrsw m6, m7 | |
8974 packuswb m5, m6 | |
8975 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] | |
8976 pmulhrsw m6, m7 | |
8977 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] | |
8978 pmulhrsw m0, m7 | |
8979 packuswb m6, m0 | |
8980 pmaddubsw m1, [r3 + 15 * 16] ; [31] | |
8981 pmulhrsw m1, m7 | |
8982 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] | |
8983 pmulhrsw m0, m7 | |
8984 packuswb m1, m0 | |
8985 | |
8986 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
8987 | |
; tile 1: the first two vectors reuse m2 from tile 0, then reload from [r2 + 4]
8988 pmaddubsw m4, m2, [r3 + 16] ; [17] | |
8989 pmulhrsw m4, m7 | |
8990 pmaddubsw m2, [r3 + 10 * 16] ; [26] | |
8991 pmulhrsw m2, m7 | |
8992 packuswb m4, m2 | |
8993 movu m0, [r2 + 4] | |
8994 palignr m1, m0, 1 | |
8995 punpckhbw m2, m0, m1 | |
8996 punpcklbw m0, m1 | |
8997 palignr m2, m0, 2 | |
8998 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] | |
8999 pmulhrsw m5, m7 | |
9000 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] | |
9001 pmulhrsw m6, m7 | |
9002 packuswb m5, m6 | |
9003 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] | |
9004 pmulhrsw m6, m7 | |
9005 pmaddubsw m0, [r3 + 14 * 16] ; [30] | |
9006 pmulhrsw m0, m7 | |
9007 packuswb m6, m0 | |
9008 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] | |
9009 pmulhrsw m1, m7 | |
9010 pmaddubsw m3, m2, [r3] ; [16] | |
9011 pmulhrsw m3, m7 | |
9012 packuswb m1, m3 | |
9013 | |
9014 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
9015 | |
; tile 2: first vector reuses m2 from tile 1, then reload from [r2 + 6]
9016 pmaddubsw m4, m2, [r3 + 9 * 16] ; [25] | |
9017 pmulhrsw m4, m7 | |
9018 movu m0, [r2 + 6] | |
9019 palignr m1, m0, 1 | |
9020 punpckhbw m2, m0, m1 | |
9021 punpcklbw m0, m1 | |
9022 palignr m2, m0, 2 | |
9023 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] | |
9024 pmulhrsw m1, m7 | |
9025 packuswb m4, m1 | |
9026 pmaddubsw m5, m0, [r3 - 5 * 16] ; [11] | |
9027 pmulhrsw m5, m7 | |
9028 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] | |
9029 pmulhrsw m6, m7 | |
9030 packuswb m5, m6 | |
9031 pmaddubsw m6, m0, [r3 + 13 * 16] ; [29] | |
9032 pmulhrsw m6, m7 | |
9033 pmaddubsw m1, m2, [r3 - 10 * 16] ; [6] | |
9034 pmulhrsw m1, m7 | |
9035 packuswb m6, m1 | |
9036 pmaddubsw m1, m2, [r3 - 16] ; [15] | |
9037 pmulhrsw m1, m7 | |
9038 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24] | |
9039 pmulhrsw m2, m7 | |
9040 packuswb m1, m2 | |
9041 | |
9042 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
9043 | |
; tile 3: reload from [r2 + 8]; the last vector (entry [0]) is 8 raw bytes
; copied from [r2 + 10] into the high half of m1, with no multiply
9044 movu m0, [r2 + 8] | |
9045 palignr m1, m0, 1 | |
9046 punpckhbw m2, m0, m1 | |
9047 punpcklbw m0, m1 | |
9048 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] | |
9049 pmulhrsw m4, m7 | |
9050 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] | |
9051 pmulhrsw m3, m7 | |
9052 packuswb m4, m3 | |
9053 pmaddubsw m5, m0, [r3 + 3 * 16] ; [19] | |
9054 pmulhrsw m5, m7 | |
9055 pmaddubsw m6, m0, [r3 + 12 * 16] ; [28] | |
9056 pmulhrsw m6, m7 | |
9057 packuswb m5, m6 | |
9058 palignr m2, m0, 2 | |
9059 pmaddubsw m6, m2, [r3 - 11 * 16] ; [5] | |
9060 pmulhrsw m6, m7 | |
9061 pmaddubsw m0, m2, [r3 - 2 * 16] ; [14] | |
9062 pmulhrsw m0, m7 | |
9063 packuswb m6, m0 | |
9064 pmaddubsw m1, m2, [r3 + 7 * 16] ; [23] | |
9065 pmulhrsw m1, m7 | |
9066 packuswb m1, m1 | |
9067 movhps m1, [r2 + 10] ; [0] | |
9068 | |
9069 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
9070 %endmacro | |
9071 | |
;-----------------------------------------------------------------------------
; MODE_8_28 <store-mode>
; Straight-line kernel: builds 32 eight-byte result vectors from the reference
; bytes at r2 and passes them, eight at a time, to TRANSPOSE_STORE_8x8 as
; tiles 0..3.  The bracketed weight-table entry on each pmaddubsw line
; advances by 5 modulo 32 per vector; the source window moves on by one pixel
; whenever that index wraps.
;   %1 - forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   r2 - reference byte pointer
;   r3 - weight table pointer; [r3 + k * 16] is annotated as entry [16 + k]
;   m7 - word multiplier for pmulhrsw (the caller in this file loads pw_1024;
;        presumably this yields a rounded >> 5 -- confirm against the constant)
; Writes m0-m6.  TRANSPOSE_STORE_8x8 is defined elsewhere; its own register
; and pointer usage is not visible here.
;-----------------------------------------------------------------------------
9072 %macro MODE_8_28 1 | |
; tile 0: m0 = byte pairs from [r2 + 1], m2 = the same pairs advanced by one pixel
9073 movu m0, [r2 + 1] | |
9074 palignr m1, m0, 1 | |
9075 punpckhbw m2, m0, m1 | |
9076 punpcklbw m0, m1 | |
9077 palignr m2, m0, 2 | |
9078 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] | |
9079 pmulhrsw m4, m7 | |
9080 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] | |
9081 pmulhrsw m3, m7 | |
9082 packuswb m4, m3 | |
9083 pmaddubsw m5, m0, [r3 - 1 * 16] ; [15] | |
9084 pmulhrsw m5, m7 | |
9085 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] | |
9086 pmulhrsw m6, m7 | |
9087 packuswb m5, m6 | |
9088 pmaddubsw m6, m0, [r3 + 9 * 16] ; [25] | |
9089 pmulhrsw m6, m7 | |
9090 pmaddubsw m0, [r3 + 14 * 16] ; [30] | |
9091 pmulhrsw m0, m7 | |
9092 packuswb m6, m0 | |
9093 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] | |
9094 pmulhrsw m1, m7 | |
9095 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] | |
9096 pmulhrsw m0, m7 | |
9097 packuswb m1, m0 | |
9098 | |
9099 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
9100 | |
; tile 1: four vectors from the window left in m2, then reload from [r2 + 3]
9101 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] | |
9102 pmulhrsw m4, m7 | |
9103 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] | |
9104 pmulhrsw m5, m7 | |
9105 packuswb m4, m5 | |
9106 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] | |
9107 pmulhrsw m5, m7 | |
9108 pmaddubsw m2, [r3 + 12 * 16] ; [28] | |
9109 pmulhrsw m2, m7 | |
9110 packuswb m5, m2 | |
9111 movu m0, [r2 + 3] | |
9112 palignr m1, m0, 1 | |
; NOTE(review): the punpckhbw result in m2 is overwritten by "mova m2, m0"
; below before anything reads it
9113 punpckhbw m2, m0, m1 | |
9114 punpcklbw m0, m1 | |
9115 pmaddubsw m6, m0, [r3 - 15 * 16] ; [01] | |
9116 pmulhrsw m6, m7 | |
9117 pmaddubsw m1, m0, [r3 - 10 * 16] ; [06] | |
9118 pmulhrsw m1, m7 | |
9119 packuswb m6, m1 | |
9120 pmaddubsw m1, m0, [r3 - 5 * 16] ; [11] | |
9121 pmulhrsw m1, m7 | |
; keep the current window in m2 for the start of tile 2
9122 mova m2, m0 | |
9123 pmaddubsw m0, [r3] ; [16] | |
9124 pmulhrsw m0, m7 | |
9125 packuswb m1, m0 | |
9126 | |
9127 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
9128 | |
; tile 2: three vectors from the saved window in m2, then reload from [r2 + 4]
9129 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] | |
9130 pmulhrsw m4, m7 | |
9131 pmaddubsw m5, m2, [r3 + 10 * 16] ; [26] | |
9132 pmulhrsw m5, m7 | |
9133 packuswb m4, m5 | |
9134 pmaddubsw m5, m2, [r3 + 15 * 16] ; [31] | |
9135 pmulhrsw m5, m7 | |
9136 movu m0, [r2 + 4] | |
9137 palignr m1, m0, 1 | |
; NOTE(review): this punpckhbw result is overwritten by the pmaddubsw into m2
; two lines below before anything reads it
9138 punpckhbw m2, m0, m1 | |
9139 punpcklbw m0, m1 | |
9140 pmaddubsw m2, m0, [r3 - 12 * 16] ; [4] | |
9141 pmulhrsw m2, m7 | |
9142 packuswb m5, m2 | |
9143 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] | |
9144 pmulhrsw m6, m7 | |
9145 pmaddubsw m1, m0, [r3 - 2 * 16] ; [14] | |
9146 pmulhrsw m1, m7 | |
9147 packuswb m6, m1 | |
9148 pmaddubsw m1, m0, [r3 + 3 * 16] ; [19] | |
9149 pmulhrsw m1, m7 | |
; keep the current window in m2 for the start of tile 3
9150 mova m2, m0 | |
9151 pmaddubsw m0, [r3 + 8 * 16] ; [24] | |
9152 pmulhrsw m0, m7 | |
9153 packuswb m1, m0 | |
9154 | |
9155 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
9156 | |
; tile 3: first vector from the saved window, then reload from [r2 + 5]; the
; last vector (entry [00]) is 8 raw bytes copied from [r2 + 6] into the high
; half of m1, with no multiply
9157 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] | |
9158 pmulhrsw m4, m7 | |
9159 movu m0, [r2 + 5] | |
9160 palignr m1, m0, 1 | |
; NOTE(review): m2 is not read again in this macro after this punpckhbw
9161 punpckhbw m2, m0, m1 | |
9162 punpcklbw m0, m1 | |
9163 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] | |
9164 pmulhrsw m1, m7 | |
9165 packuswb m4, m1 | |
9166 pmaddubsw m5, m0, [r3 - 9 * 16] ; [7] | |
9167 pmulhrsw m5, m7 | |
9168 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] | |
9169 pmulhrsw m6, m7 | |
9170 packuswb m5, m6 | |
9171 pmaddubsw m6, m0, [r3 + 16] ; [17] | |
9172 pmulhrsw m6, m7 | |
9173 pmaddubsw m1, m0, [r3 + 6 * 16] ; [22] | |
9174 pmulhrsw m1, m7 | |
9175 packuswb m6, m1 | |
9176 pmaddubsw m1, m0, [r3 + 11 * 16] ; [27] | |
9177 pmulhrsw m1, m7 | |
9178 packuswb m1, m1 | |
9179 movhps m1, [r2 + 6] ; [00] | |
9180 | |
9181 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
9182 %endmacro | |
9183 | |
;-----------------------------------------------------------------------------
; MODE_9_27 <store-mode>
; Straight-line kernel: builds 32 eight-byte result vectors from the reference
; bytes at r2 and passes them, eight at a time, to TRANSPOSE_STORE_8x8 as
; tiles 0..3.  The bracketed weight-table entry on each pmaddubsw line
; advances by 2 per vector, so one source window serves 16 vectors: tiles 0-1
; use the byte pairs from [r2 + 1], tiles 2-3 the byte pairs from [r2 + 2].
;   %1 - forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   r2 - reference byte pointer
;   r3 - weight table pointer; [r3 + k * 16] is annotated as entry [16 + k]
;   m7 - word multiplier for pmulhrsw (the caller in this file loads pw_1024;
;        presumably this yields a rounded >> 5 -- confirm against the constant)
; Writes m0-m6.  TRANSPOSE_STORE_8x8 is defined elsewhere; its own register
; and pointer usage is not visible here.
;-----------------------------------------------------------------------------
9184 %macro MODE_9_27 1 | |
; tile 0: m2 = low byte pairs (p[i], p[i+1]) from [r2 + 1]
9185 movu m2, [r2 + 1] | |
9186 palignr m1, m2, 1 | |
; NOTE(review): the high pairs in m0 are never read; m0 is overwritten by the
; entry-[16] pmaddubsw below
9187 punpckhbw m0, m2, m1 | |
9188 punpcklbw m2, m1 | |
9189 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] | |
9190 pmulhrsw m4, m7 | |
9191 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] | |
9192 pmulhrsw m3, m7 | |
9193 packuswb m4, m3 | |
9194 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] | |
9195 pmulhrsw m5, m7 | |
9196 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] | |
9197 pmulhrsw m6, m7 | |
9198 packuswb m5, m6 | |
9199 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] | |
9200 pmulhrsw m6, m7 | |
9201 pmaddubsw m3, m2, [r3 - 4 * 16] ; [12] | |
9202 pmulhrsw m3, m7 | |
9203 packuswb m6, m3 | |
9204 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] | |
9205 pmulhrsw m1, m7 | |
9206 pmaddubsw m0, m2, [r3] ; [16] | |
9207 pmulhrsw m0, m7 | |
9208 packuswb m1, m0 | |
9209 | |
9210 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
9211 | |
; tile 1: same window (m2), upper half of the weight table; the last vector
; (entry [00]) is 8 raw bytes from [r2 + 2] placed in the high half of m1
9212 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] | |
9213 pmulhrsw m4, m7 | |
9214 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] | |
9215 pmulhrsw m5, m7 | |
9216 packuswb m4, m5 | |
9217 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] | |
9218 pmulhrsw m5, m7 | |
9219 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] | |
9220 pmulhrsw m6, m7 | |
9221 packuswb m5, m6 | |
9222 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] | |
9223 pmulhrsw m6, m7 | |
9224 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] | |
9225 pmulhrsw m1, m7 | |
9226 packuswb m6, m1 | |
9227 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] | |
9228 pmulhrsw m1, m7 | |
9229 packuswb m1, m1 | |
9230 movhps m1, [r2 + 2] ; [00] | |
9231 | |
9232 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
9233 | |
; tile 2: window advanced by one byte -- m2 = low byte pairs from [r2 + 2]
9234 movu m2, [r2 + 2] | |
9235 palignr m1, m2, 1 | |
9236 punpcklbw m2, m1 | |
9237 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] | |
9238 pmulhrsw m4, m7 | |
9239 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] | |
9240 pmulhrsw m3, m7 | |
9241 packuswb m4, m3 | |
9242 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] | |
9243 pmulhrsw m5, m7 | |
9244 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] | |
9245 pmulhrsw m6, m7 | |
9246 packuswb m5, m6 | |
9247 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] | |
9248 pmulhrsw m6, m7 | |
9249 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] | |
9250 pmulhrsw m0, m7 | |
9251 packuswb m6, m0 | |
9252 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] | |
9253 pmulhrsw m1, m7 | |
9254 pmaddubsw m0, m2, [r3] ; [16] | |
9255 pmulhrsw m0, m7 | |
9256 packuswb m1, m0 | |
9257 | |
9258 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
9259 | |
; tile 3: the [r2 + 2] window is rebuilt from memory, then the upper half of
; the weight table is applied; the last vector (entry [00]) is 8 raw bytes
; from [r2 + 3]
9260 movu m2, [r2 + 2] | |
9261 palignr m1, m2, 1 | |
9262 punpcklbw m2, m1 | |
9263 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] | |
9264 pmulhrsw m4, m7 | |
9265 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] | |
9266 pmulhrsw m5, m7 | |
9267 packuswb m4, m5 | |
9268 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] | |
9269 pmulhrsw m5, m7 | |
9270 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] | |
9271 pmulhrsw m6, m7 | |
9272 packuswb m5, m6 | |
9273 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] | |
9274 pmulhrsw m6, m7 | |
9275 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] | |
9276 pmulhrsw m1, m7 | |
9277 packuswb m6, m1 | |
9278 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] | |
9279 pmulhrsw m1, m7 | |
9280 packuswb m1, m1 | |
9281 movhps m1, [r2 + 3] ; [00] | |
9282 | |
9283 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
9284 %endmacro | |
9285 | |
;-----------------------------------------------------------------------------
; MODE_12_24 <store-mode>
; Straight-line kernel: builds 32 eight-byte result vectors from the reference
; bytes around r2 and passes them, eight at a time, to TRANSPOSE_STORE_8x8 as
; tiles 0..3.  The bracketed weight-table entry on each pmaddubsw line goes
; DOWN by 5 modulo 32 per vector (27, 22, 17, ...), and the source window
; steps backwards through memory: loads are taken at r2, r2 - 2, r2 - 3 and
; r2 - 4.
;   %1 - forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;   r2 - reference byte pointer (bytes below r2 are read as well)
;   r4 - weight table pointer; [r4 + k * 16] is annotated as entry [16 + k]
;   m7 - word multiplier for pmulhrsw, set up by the caller (not visible here)
; Writes m0-m6.  TRANSPOSE_STORE_8x8 and pb_fact0 are defined elsewhere.
;-----------------------------------------------------------------------------
9286 %macro MODE_12_24 1 | |
; tile 0: m2 = byte pairs from [r2], m0 = the same pairs advanced by one pixel;
; the advanced window (m0) is consumed first, then the unshifted one (m2)
9287 movu m2, [r2] | |
9288 palignr m1, m2, 1 | |
9289 punpckhbw m0, m2, m1 | |
9290 punpcklbw m2, m1 | |
9291 palignr m0, m2, 2 | |
9292 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] | |
9293 pmulhrsw m4, m7 | |
9294 pmaddubsw m3, m0, [r4 + 6 * 16] ; [22] | |
9295 pmulhrsw m3, m7 | |
9296 packuswb m4, m3 | |
9297 pmaddubsw m5, m0, [r4 + 16] ; [17] | |
9298 pmulhrsw m5, m7 | |
9299 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] | |
9300 pmulhrsw m6, m7 | |
9301 packuswb m5, m6 | |
9302 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] | |
9303 pmulhrsw m6, m7 | |
9304 pmaddubsw m3, m0, [r4 - 14 * 16] ; [2] | |
9305 pmulhrsw m3, m7 | |
9306 packuswb m6, m3 | |
9307 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] | |
9308 pmulhrsw m1, m7 | |
9309 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] | |
9310 pmulhrsw m3, m7 | |
9311 packuswb m1, m3 | |
9312 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
; tile 1: four more vectors from m2, then reload at [r2 - 2] and use the
; one-pixel-advanced pairs (window starting at r2 - 1)
9313 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] | |
9314 pmulhrsw m4, m7 | |
9315 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] | |
9316 pmulhrsw m5, m7 | |
9317 packuswb m4, m5 | |
9318 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] | |
9319 pmulhrsw m5, m7 | |
9320 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
9321 pmulhrsw m6, m7 | |
9322 packuswb m5, m6 | |
9323 movu m0, [r2 - 2] | |
9324 palignr m1, m0, 1 | |
9325 punpckhbw m2, m0, m1 | |
9326 punpcklbw m0, m1 | |
9327 palignr m2, m0, 2 | |
9328 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] | |
9329 pmulhrsw m6, m7 | |
9330 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] | |
9331 pmulhrsw m1, m7 | |
9332 packuswb m6, m1 | |
9333 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] | |
9334 pmulhrsw m1, m7 | |
9335 pmaddubsw m3, m2, [r4] ; [16] | |
9336 pmulhrsw m3, m7 | |
9337 packuswb m1, m3 | |
9338 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
; tile 2: three vectors from the window left in m2, then reload at [r2 - 3]
9339 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11] | |
9340 pmulhrsw m4, m7 | |
9341 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] | |
9342 pmulhrsw m3, m7 | |
9343 packuswb m4, m3 | |
9344 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] | |
9345 pmulhrsw m5, m7 | |
9346 movu m0, [r2 - 3] | |
9347 palignr m1, m0, 1 | |
9348 punpckhbw m2, m0, m1 | |
9349 punpcklbw m0, m1 | |
9350 palignr m2, m0, 2 | |
9351 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
9352 pmulhrsw m6, m7 | |
9353 packuswb m5, m6 | |
9354 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] | |
9355 pmulhrsw m6, m7 | |
9356 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] | |
9357 pmulhrsw m3, m7 | |
9358 packuswb m6, m3 | |
9359 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] | |
9360 pmulhrsw m1, m7 | |
9361 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] | |
9362 pmulhrsw m3, m7 | |
9363 packuswb m1, m3 | |
9364 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
; tile 3: first vector from the window left in m2, then reload at [r2 - 4]
9365 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] | |
9366 pmulhrsw m4, m7 | |
9367 movu m2, [r2 - 4] | |
9368 palignr m1, m2, 1 | |
9369 punpckhbw m0, m2, m1 | |
9370 punpcklbw m2, m1 | |
9371 palignr m0, m2, 2 | |
9372 pmaddubsw m5, m0, [r4 + 14 * 16] ; [30] | |
9373 pmulhrsw m5, m7 | |
9374 packuswb m4, m5 | |
9375 pmaddubsw m5, m0, [r4 + 9 * 16] ; [25] | |
9376 pmulhrsw m5, m7 | |
9377 pmaddubsw m6, m0, [r4 + 4 * 16] ; [20] | |
9378 pmulhrsw m6, m7 | |
9379 packuswb m5, m6 | |
9380 pmaddubsw m6, m0, [r4 - 16] ; [15] | |
9381 pmulhrsw m6, m7 | |
9382 pmaddubsw m1, m0, [r4 - 6 * 16] ; [10] | |
9383 pmulhrsw m1, m7 | |
9384 packuswb m6, m1 | |
9385 pmaddubsw m1, m0, [r4 - 11 * 16] ; [05] | |
9386 pmulhrsw m1, m7 | |
; last vector: no multiply -- the pair register m0 is shuffled through
; pb_fact0, widened to words and packed into the high half of m1
; (presumably pb_fact0 picks one byte of each pair; table not visible -- confirm)
9387 movu m2, [pb_fact0] | |
9388 pshufb m0, m2 | |
9389 pmovzxbw m0, m0 | |
9390 packuswb m1, m0 | |
9391 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
9392 %endmacro | |
9393 | |
9394 ;------------------------------------------------------------------------------------------ | |
9395 ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
9396 ;------------------------------------------------------------------------------------------ | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_2: r0 = dst, r1 = dstStride, r2 = src, r3m = dirMode
; Pure copy kernel, no arithmetic.  The reference base is src when
; dirMode == 34 and src + 64 otherwise.  Output row k (k = 0..31) is the 32
; bytes starting at base + 2 + k, written as two unaligned 16-byte stores.
; Rows 0-15 are cut out of m0:m1:m3 (base + 2, + 18, + 34) with palignr;
; rows 16-31 repeat the pattern on m1:m3:m0 after m0 is reloaded from
; base + 50.
; Uses r3 = 3 * stride, r4 as a temporary, m0-m3.
;-----------------------------------------------------------------------------
9397 INIT_XMM ssse3 | |
9398 cglobal intra_pred_ang32_2, 3,5,4 | |
; select the reference base: r4 keeps the original src, r2 becomes src + 64,
; and cmove restores src when dirMode == 34
9399 lea r4, [r2] | |
9400 add r2, 64 | |
9401 cmp r3m, byte 34 | |
9402 cmove r2, r4 | |
9403 movu m0, [r2 + 2] | |
9404 movu m1, [r2 + 18] | |
9405 movu m3, [r2 + 34] | |
9406 | |
; r3 is free to reuse here: dirMode was already consumed by the cmp above
9407 lea r3, [r1 * 3] | |
9408 | |
; rows 0-3
9409 movu [r0], m0 | |
9410 movu [r0 + 16], m1 | |
9411 palignr m2, m1, m0, 1 | |
9412 movu [r0 + r1], m2 | |
9413 palignr m2, m3, m1, 1 | |
9414 movu [r0 + r1 + 16], m2 | |
9415 palignr m2, m1, m0, 2 | |
9416 movu [r0 + r1 * 2], m2 | |
9417 palignr m2, m3, m1, 2 | |
9418 movu [r0 + r1 * 2 + 16], m2 | |
9419 palignr m2, m1, m0, 3 | |
9420 movu [r0 + r3], m2 | |
9421 palignr m2, m3, m1, 3 | |
9422 movu [r0 + r3 + 16], m2 | |
9423 | |
9424 lea r0, [r0 + r1 * 4] | |
9425 | |
; rows 4-7
9426 palignr m2, m1, m0, 4 | |
9427 movu [r0], m2 | |
9428 palignr m2, m3, m1, 4 | |
9429 movu [r0 + 16], m2 | |
9430 palignr m2, m1, m0, 5 | |
9431 movu [r0 + r1], m2 | |
9432 palignr m2, m3, m1, 5 | |
9433 movu [r0 + r1 + 16], m2 | |
9434 palignr m2, m1, m0, 6 | |
9435 movu [r0 + r1 * 2], m2 | |
9436 palignr m2, m3, m1, 6 | |
9437 movu [r0 + r1 * 2 + 16], m2 | |
9438 palignr m2, m1, m0, 7 | |
9439 movu [r0 + r3], m2 | |
9440 palignr m2, m3, m1, 7 | |
9441 movu [r0 + r3 + 16], m2 | |
9442 | |
9443 lea r0, [r0 + r1 * 4] | |
9444 | |
; rows 8-11
9445 palignr m2, m1, m0, 8 | |
9446 movu [r0], m2 | |
9447 palignr m2, m3, m1, 8 | |
9448 movu [r0 + 16], m2 | |
9449 palignr m2, m1, m0, 9 | |
9450 movu [r0 + r1], m2 | |
9451 palignr m2, m3, m1, 9 | |
9452 movu [r0 + r1 + 16], m2 | |
9453 palignr m2, m1, m0, 10 | |
9454 movu [r0 + r1 * 2], m2 | |
9455 palignr m2, m3, m1, 10 | |
9456 movu [r0 + r1 * 2 + 16], m2 | |
9457 palignr m2, m1, m0, 11 | |
9458 movu [r0 + r3], m2 | |
9459 palignr m2, m3, m1, 11 | |
9460 movu [r0 + r3 + 16], m2 | |
9461 | |
9462 lea r0, [r0 + r1 * 4] | |
9463 | |
; rows 12-15
9464 palignr m2, m1, m0, 12 | |
9465 movu [r0], m2 | |
9466 palignr m2, m3, m1, 12 | |
9467 movu [r0 + 16], m2 | |
9468 palignr m2, m1, m0, 13 | |
9469 movu [r0 + r1], m2 | |
9470 palignr m2, m3, m1, 13 | |
9471 movu [r0 + r1 + 16], m2 | |
9472 palignr m2, m1, m0, 14 | |
9473 movu [r0 + r1 * 2], m2 | |
9474 palignr m2, m3, m1, 14 | |
9475 movu [r0 + r1 * 2 + 16], m2 | |
9476 palignr m2, m1, m0, 15 | |
9477 movu [r0 + r3], m2 | |
9478 palignr m2, m3, m1, 15 | |
9479 movu [r0 + r3 + 16], m2 | |
9480 | |
9481 lea r0, [r0 + r1 * 4] | |
9482 | |
; rows 16-19: the window slides on by 16 bytes -- m1 and m3 become the low and
; middle parts, and m0 is reloaded with the next 16 bytes (base + 50)
9483 movu [r0], m1 | |
9484 movu m0, [r2 + 50] | |
9485 movu [r0 + 16], m3 | |
9486 palignr m2, m3, m1, 1 | |
9487 movu [r0 + r1], m2 | |
9488 palignr m2, m0, m3, 1 | |
9489 movu [r0 + r1 + 16], m2 | |
9490 palignr m2, m3, m1, 2 | |
9491 movu [r0 + r1 * 2], m2 | |
9492 palignr m2, m0, m3, 2 | |
9493 movu [r0 + r1 * 2 + 16], m2 | |
9494 palignr m2, m3, m1, 3 | |
9495 movu [r0 + r3], m2 | |
9496 palignr m2, m0, m3, 3 | |
9497 movu [r0 + r3 + 16], m2 | |
9498 | |
9499 lea r0, [r0 + r1 * 4] | |
9500 | |
; rows 20-23
9501 palignr m2, m3, m1, 4 | |
9502 movu [r0], m2 | |
9503 palignr m2, m0, m3, 4 | |
9504 movu [r0 + 16], m2 | |
9505 palignr m2, m3, m1, 5 | |
9506 movu [r0 + r1], m2 | |
9507 palignr m2, m0, m3, 5 | |
9508 movu [r0 + r1 + 16], m2 | |
9509 palignr m2, m3, m1, 6 | |
9510 movu [r0 + r1 * 2], m2 | |
9511 palignr m2, m0, m3, 6 | |
9512 movu [r0 + r1 * 2 + 16], m2 | |
9513 palignr m2, m3, m1, 7 | |
9514 movu [r0 + r3], m2 | |
9515 palignr m2, m0, m3, 7 | |
9516 movu [r0 + r3 + 16], m2 | |
9517 | |
9518 lea r0, [r0 + r1 * 4] | |
9519 | |
; rows 24-27
9520 palignr m2, m3, m1, 8 | |
9521 movu [r0], m2 | |
9522 palignr m2, m0, m3, 8 | |
9523 movu [r0 + 16], m2 | |
9524 palignr m2, m3, m1, 9 | |
9525 movu [r0 + r1], m2 | |
9526 palignr m2, m0, m3, 9 | |
9527 movu [r0 + r1 + 16], m2 | |
9528 palignr m2, m3, m1, 10 | |
9529 movu [r0 + r1 * 2], m2 | |
9530 palignr m2, m0, m3, 10 | |
9531 movu [r0 + r1 * 2 + 16], m2 | |
9532 palignr m2, m3, m1, 11 | |
9533 movu [r0 + r3], m2 | |
9534 palignr m2, m0, m3, 11 | |
9535 movu [r0 + r3 + 16], m2 | |
9536 | |
9537 lea r0, [r0 + r1 * 4] | |
9538 | |
; rows 28-31
9539 palignr m2, m3, m1, 12 | |
9540 movu [r0], m2 | |
9541 palignr m2, m0, m3, 12 | |
9542 movu [r0 + 16], m2 | |
9543 palignr m2, m3, m1, 13 | |
9544 movu [r0 + r1], m2 | |
9545 palignr m2, m0, m3, 13 | |
9546 movu [r0 + r1 + 16], m2 | |
9547 palignr m2, m3, m1, 14 | |
9548 movu [r0 + r1 * 2], m2 | |
9549 palignr m2, m0, m3, 14 | |
9550 movu [r0 + r1 * 2 + 16], m2 | |
9551 palignr m2, m3, m1, 15 | |
9552 movu [r0 + r3], m2 | |
9553 palignr m2, m0, m3, 15 | |
9554 movu [r0 + r3 + 16], m2 | |
9555 RET | |
9556 | |
;-----------------------------------------------------------------------------
; ANG32_MODE_3_TO_9_FN <mode>, <paired-mode>
; Emits the function intra_pred_ang32_<mode>.  The seven functions for modes
; 3..9 differ only in their name and in the MODE_<mode>_<paired-mode> kernel
; they invoke, so they are generated from this single template.
;
; Generated function: r0 = dst, r1 = dstStride, r2 = src
;   r2 - advanced by 64 bytes on entry, then by 8 bytes per loop pass
;   r3 - ang_table + 16 * 16, i.e. table entry 16, so the kernels can reach
;        entries 1..31 with offsets of -15..+15 entries
;   r4 - loop counter, 4 passes
;   r5 - 3 * stride
;   r6 - dst + 4 * stride; r0 and r6 both advance by 8 rows per pass
;        (r5/r6 are presumably consumed by TRANSPOSE_STORE_8x8 inside the
;        kernel macro, which is defined elsewhere -- confirm)
;   m7 - pw_1024, the pmulhrsw multiplier used by the kernel
; The kernel is always invoked with argument 1, as in the original functions.
;-----------------------------------------------------------------------------
9557 INIT_XMM sse4 | |
9558 %macro ANG32_MODE_3_TO_9_FN 2 ; mode, paired mode | |
9559 cglobal intra_pred_ang32_%1, 3,7,8 | |
9560 add r2, 64 | |
9561 lea r3, [ang_table + 16 * 16] | |
9562 mov r4d, 4 | |
9563 lea r5, [r1 * 3] ; r5 = 3 * stride | |
9564 lea r6, [r0 + r1 * 4] ; r6 = dst + 4 * stride | |
9565 mova m7, [pw_1024] | |
9566 .loop: | |
9567 MODE_%1_%2 1 | |
9568 lea r0, [r6 + r1 * 4] ; r0 += 8 rows (r6 is 4 rows ahead of r0) | |
9569 lea r6, [r6 + r1 * 8] ; r6 += 8 rows | |
9570 add r2, 8 | |
9571 dec r4 | |
9572 jnz .loop | |
9573 RET | |
9574 %endmacro | |
9575 | |
9576 ANG32_MODE_3_TO_9_FN 3, 33 | |
9577 ANG32_MODE_3_TO_9_FN 4, 32 | |
9578 ANG32_MODE_3_TO_9_FN 5, 31 | |
9579 ANG32_MODE_3_TO_9_FN 6, 30 | |
9580 ANG32_MODE_3_TO_9_FN 7, 29 | |
9581 ANG32_MODE_3_TO_9_FN 8, 28 | |
9582 ANG32_MODE_3_TO_9_FN 9, 27 | |
9675 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_10: r0 = dst, r1 = dstStride, r2 = src, r4 = bFilter
; (r3/dirMode is loaded by cglobal but immediately reused to hold bFilter).
; Every output row is a single reference byte replicated across 32 bytes.
; The loop runs twice; each pass reads 16 bytes at [r2 + 1 + 64] and writes
; 16 rows.  Row 0 of a pass is written last (at .quit) so that the optional
; filter can modify it first.
; Stack: two mmsize slots, aliased as m8/m9, hold the 16 bytes at [src] and
; [src + 1] as captured before the loop.
; NOTE(review): when bFilter != 0 the 16 filtered bytes are stored to both
; halves of the row, and the filter runs again on the second pass with the
; same m8/m9 data.  Presumably callers never pass bFilter != 0 for 32x32
; blocks -- confirm before relying on this path.
;-----------------------------------------------------------------------------
9676 INIT_XMM sse4 | |
9677 cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize) | |
9678 %define m8 [rsp + 0 * mmsize] | |
9679 %define m9 [rsp + 1 * mmsize] | |
9680 pxor m7, m7 ; all-zero pshufb mask: replicates byte 0 | |
9681 mov r6, 2 ; two passes of 16 rows | |
9682 movu m0, [r2] | |
9683 movu m1, [r2 + 1] | |
9684 mova m8, m0 | |
9685 mova m9, m1 | |
9686 mov r3d, r4d ; r3d = bFilter, r4 is reused below | |
9687 lea r4, [r1 * 3] ; r4 = 3 * stride | |
9688 | |
9689 .loop: | |
9690 movu m0, [r2 + 1 + 64] | |
; rows 1-6: palignr moves byte k of m0 into byte 0, pshufb then replicates it
; (the upper bytes of the palignr result are don't-care)
9691 palignr m1, m0, 1 | |
9692 pshufb m1, m7 | |
9693 palignr m2, m0, 2 | |
9694 pshufb m2, m7 | |
9695 palignr m3, m0, 3 | |
9696 pshufb m3, m7 | |
9697 palignr m4, m0, 4 | |
9698 pshufb m4, m7 | |
9699 palignr m5, m0, 5 | |
9700 pshufb m5, m7 | |
9701 palignr m6, m0, 6 | |
9702 pshufb m6, m7 | |
9703 | |
9704 movu [r0 + r1], m1 | |
9705 movu [r0 + r1 + 16], m1 | |
9706 movu [r0 + r1 * 2], m2 | |
9707 movu [r0 + r1 * 2 + 16], m2 | |
9708 movu [r0 + r4], m3 | |
9709 movu [r0 + r4 + 16], m3 | |
9710 lea r5, [r0 + r1 * 4] | |
9711 movu [r5], m4 | |
9712 movu [r5 + 16], m4 | |
9713 movu [r5 + r1], m5 | |
9714 movu [r5 + r1 + 16], m5 | |
9715 movu [r5 + r1 * 2], m6 | |
9716 movu [r5 + r1 * 2 + 16], m6 | |
9717 | |
; rows 7-12 (movhlps brings byte 8 of m0 down to byte 0)
9718 palignr m1, m0, 7 | |
9719 pshufb m1, m7 | |
9720 movhlps m2, m0 | |
9721 pshufb m2, m7 | |
9722 palignr m3, m0, 9 | |
9723 pshufb m3, m7 | |
9724 palignr m4, m0, 10 | |
9725 pshufb m4, m7 | |
9726 palignr m5, m0, 11 | |
9727 pshufb m5, m7 | |
9728 palignr m6, m0, 12 | |
9729 pshufb m6, m7 | |
9730 | |
9731 movu [r5 + r4], m1 | |
9732 movu [r5 + r4 + 16], m1 | |
9733 lea r5, [r5 + r1 * 4] | |
9734 movu [r5], m2 | |
9735 movu [r5 + 16], m2 | |
9736 movu [r5 + r1], m3 | |
9737 movu [r5 + r1 + 16], m3 | |
9738 movu [r5 + r1 * 2], m4 | |
9739 movu [r5 + r1 * 2 + 16], m4 | |
9740 movu [r5 + r4], m5 | |
9741 movu [r5 + r4 + 16], m5 | |
9742 lea r5, [r5 + r1 * 4] | |
9743 movu [r5], m6 | |
9744 movu [r5 + 16], m6 | |
9745 | |
; rows 13-15, then m0 itself becomes row 0 (byte 0 replicated)
9746 palignr m1, m0, 13 | |
9747 pshufb m1, m7 | |
9748 palignr m2, m0, 14 | |
9749 pshufb m2, m7 | |
9750 palignr m3, m0, 15 | |
9751 pshufb m3, m7 | |
9752 pshufb m0, m7 | |
9753 | |
9754 movu [r5 + r1], m1 | |
9755 movu [r5 + r1 + 16], m1 | |
9756 movu [r5 + r1 * 2], m2 | |
9757 movu [r5 + r1 * 2 + 16], m2 | |
9758 movu [r5 + r4], m3 | |
9759 movu [r5 + r4 + 16], m3 | |
9760 | |
9761 ; filter: row0[x] = sat8(row0 + ((m9[x] - m8[0]) >> 1)), computed in words | |
9762 test r3d, r3d | |
9763 jz .quit | |
; m0 is a replicated byte, so both word halves are the same widened value
9765 pmovzxbw m0, m0 | |
9766 mova m1, m0 | |
9767 movu m2, m8 | |
9768 movu m3, m9 | |
9769 | |
9770 pshufb m2, m7 | |
9771 pmovzxbw m2, m2 | |
9772 movhlps m4, m3 | |
9773 pmovzxbw m3, m3 | |
9774 pmovzxbw m4, m4 | |
9775 psubw m3, m2 | |
9776 psubw m4, m2 | |
9777 psraw m3, 1 | |
9778 psraw m4, 1 | |
9779 paddw m0, m3 | |
9780 paddw m1, m4 | |
9781 packuswb m0, m1 | |
9782 | |
9783 .quit: | |
9784 movu [r0], m0 | |
9785 movu [r0 + 16], m0 | |
; dec sets ZF for the jnz below; the two lea instructions leave flags untouched
9786 dec r6 | |
9787 lea r0, [r5 + r1 * 4] ; r0 += 16 rows | |
9788 lea r2, [r2 + 16] | |
9789 jnz .loop | |
9790 RET | |
9791 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_11: r0 = dst, r1 = dstStride, r2 = src
; Builds a private copy of the reference pixels in a 64-byte-aligned stack
; buffer, then runs four loop passes, each invoking PROC32_8x8 (defined
; elsewhere) four times.
;
; Stack buffer layout after setup:
;   [rsp + 0]       src[16]  (byte 0 of a 16-byte replication; bytes 1..15 of
;                            that store are overwritten by the copy below)
;   [rsp + 1]       src[0]
;   [rsp + 2..48]   src[65 .. 111]
;   [rsp + 63]      loop counter (4)
;   [rsp + 64]      caller's rsp, restored before RET
;
; Registers handed to PROC32_8x8:
;   r2 = rsp + 1 (+8 per pass), r3 = c_shuf8_0, r4 = ang_table,
;   r5 = 3 * stride, r6 = dst + 4 * stride, m0-m7 = copies of the 16 bytes
;   loaded from [r2] or [r2 - 1].
; m5 and m6 are written with copies of m7 before every PROC32_8x8 call, so
; no constants are preloaded into them ahead of the loop.
;-----------------------------------------------------------------------------
9792 INIT_XMM sse4 | |
9793 cglobal intra_pred_ang32_11, 4,7,8 | |
9794 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line | |
9795 mov r6, rsp | |
9796 sub rsp, 64+gprsize | |
9797 and rsp, ~63 | |
9798 mov [rsp+64], r6 | |
9799 | |
9800 ; collect reference pixel | |
9801 movu m0, [r2 + 16] | |
9802 pxor m1, m1 | |
9803 pshufb m0, m1 ; replicate byte src[16] into all 16 lanes | |
9804 mova [rsp], m0 | |
9805 movu m0, [r2 + 64] | |
9806 pinsrb m0, [r2], 0 | |
9807 movu m1, [r2 + 16 + 64] | |
9808 movu m2, [r2 + 32 + 64] | |
9809 movu [rsp + 1], m0 | |
9810 movu [rsp + 1 + 16], m1 | |
9811 movu [rsp + 1 + 32], m2 | |
9812 mov [rsp + 63], byte 4 | |
9813 | |
9814 ; filter | |
9815 lea r2, [rsp + 1] ; r2 -> [0] | |
9816 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
9817 lea r4, [ang_table] ; r4 -> ang_table | |
9818 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
9819 lea r6, [r0 + r1 * 4] ; r6 -> dst + 4 * stride | |
9822 | |
9823 .loop: | |
9824 ; Row[0 - 7] | |
9825 movu m7, [r2] | |
9826 mova m0, m7 | |
9827 mova m1, m7 | |
9828 mova m2, m7 | |
9829 mova m3, m7 | |
9830 mova m4, m7 | |
9831 mova m5, m7 | |
9832 mova m6, m7 | |
9833 PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16 | |
9834 | |
9835 ; Row[8 - 15] | |
9836 movu m7, [r2] | |
9837 mova m0, m7 | |
9838 mova m1, m7 | |
9839 mova m2, m7 | |
9840 mova m3, m7 | |
9841 mova m4, m7 | |
9842 mova m5, m7 | |
9843 mova m6, m7 | |
9844 PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0 | |
9845 | |
9846 ; Row[16 - 23] | |
9847 movu m7, [r2 - 1] | |
9848 mova m0, m7 | |
9849 mova m1, m7 | |
9850 mova m2, m7 | |
9851 mova m3, m7 | |
9852 mova m4, m7 | |
9853 mova m5, m7 | |
9854 mova m6, m7 | |
9855 PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16 | |
9856 | |
9857 ; Row[24 - 31] | |
9858 movu m7, [r2 - 1] | |
9859 mova m0, m7 | |
9860 mova m1, m7 | |
9861 mova m2, m7 | |
9862 mova m3, m7 | |
9863 mova m4, m7 | |
9864 mova m5, m7 | |
9865 mova m6, m7 | |
9866 PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0 | |
9867 | |
9868 lea r0, [r6 + r1 * 4] | |
9869 lea r6, [r6 + r1 * 8] | |
9870 add r2, 8 | |
9871 dec byte [rsp + 63] | |
9872 jnz .loop | |
9873 mov rsp, [rsp+64] | |
9874 RET | |
9875 | |
; MODE_12_24_ROW0 %1
; Emits the first strip of a 32x32 angular prediction as four groups of
; eight rows, each group handed to TRANSPOSE_STORE_8x8 (group index 0..3).
; The macro name suggests it serves modes 12 and 24 -- TODO confirm callers.
; Inputs (set up by the caller):
;   r2, r3  byte pointers into the reference samples
;   r4      base of the coefficient table; rows are read as [r4 + k * 16]
;   m7      multiplier used by every pmulhrsw rounding step
;   above   16-byte scratch slot %define'd by the caller
; %1 == 1 replaces byte 0 of each [r2] load with the byte at [r3]; %1 is also
; forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8.
; Clobbers m0-m6. Bracketed numbers in trailing comments label the table row
; (fraction) used by that multiply.
9876 %macro MODE_12_24_ROW0 1 | |
; Build the `above` scratch vector: 16 bytes from [r3 + 6] permuted through
; c_mode32_12_0, with byte 12 patched from [r3 + 26].
9877 movu m0, [r3 + 6] | |
9878 pshufb m0, [c_mode32_12_0] | |
9879 pinsrb m0, [r3 + 26], 12 | |
9880 mova above, m0 | |
; Interleave each byte of [r2] with its right-hand neighbour so pmaddubsw
; can weight the pair in one step.
9881 movu m2, [r2] | |
9882 %if %1 == 1 | |
9883 pinsrb m2, [r3], 0 | |
9884 %endif | |
9885 palignr m1, m2, 1 | |
9886 punpcklbw m2, m1 | |
9887 pmaddubsw m4, m2, [r4 + 11 * 16] ; [27] | |
9888 pmulhrsw m4, m7 | |
9889 pmaddubsw m3, m2, [r4 + 6 * 16] ; [22] | |
9890 pmulhrsw m3, m7 | |
9891 packuswb m4, m3 | |
9892 pmaddubsw m5, m2, [r4 + 16] ; [17] | |
9893 pmulhrsw m5, m7 | |
9894 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] | |
9895 pmulhrsw m6, m7 | |
9896 packuswb m5, m6 | |
9897 pmaddubsw m6, m2, [r4 - 9 * 16] ; [7] | |
9898 pmulhrsw m6, m7 | |
9899 pmaddubsw m3, m2, [r4 - 14 * 16] ; [2] | |
9900 pmulhrsw m3, m7 | |
9901 packuswb m6, m3 | |
; Shift the window one sample back: the top byte of `above` is pulled in
; below byte 0 of the [r2] data before re-interleaving.
9902 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
9903 %if %1 == 1 | |
9904 pinsrb m1, [r3], 0 | |
9905 %endif | |
9906 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] | |
9907 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
9908 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] | |
9909 pmulhrsw m1, m7 | |
9910 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] | |
9911 pmulhrsw m3, m7 | |
9912 packuswb m1, m3 | |
9913 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
9914 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] | |
9915 pmulhrsw m4, m7 | |
9916 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] | |
9917 pmulhrsw m5, m7 | |
9918 packuswb m4, m5 | |
9919 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] | |
9920 pmulhrsw m5, m7 | |
9921 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
9922 pmulhrsw m6, m7 | |
9923 packuswb m5, m6 | |
; Each further step slides the interleaved pairs by one sample (2 bytes),
; feeding in the next bytes taken from the top of `above`.
9924 palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] | |
9925 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] | |
9926 pmulhrsw m6, m7 | |
9927 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] | |
9928 pmulhrsw m1, m7 | |
9929 packuswb m6, m1 | |
9930 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] | |
9931 pmulhrsw m1, m7 | |
9932 pmaddubsw m3, m2, [r4] ; [16] | |
9933 pmulhrsw m3, m7 | |
9934 packuswb m1, m3 | |
9935 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
9936 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11] | |
9937 pmulhrsw m4, m7 | |
9938 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] | |
9939 pmulhrsw m3, m7 | |
9940 packuswb m4, m3 | |
9941 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] | |
9942 pmulhrsw m5, m7 | |
9943 pslldq m1, above, 1 | |
9944 palignr m2, m1, 14 | |
9945 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
9946 pmulhrsw m6, m7 | |
9947 packuswb m5, m6 | |
9948 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] | |
9949 pmulhrsw m6, m7 | |
9950 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] | |
9951 pmulhrsw m3, m7 | |
9952 packuswb m6, m3 | |
9953 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] | |
9954 pmulhrsw m1, m7 | |
9955 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] | |
9956 pmulhrsw m3, m7 | |
9957 packuswb m1, m3 | |
9958 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
9959 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] | |
9960 pmulhrsw m4, m7 | |
9961 pslldq m1, above, 2 | |
9962 palignr m2, m1, 14 | |
9963 pmaddubsw m5, m2, [r4 + 14 * 16] ; [30] | |
9964 pmulhrsw m5, m7 | |
9965 packuswb m4, m5 | |
9966 pmaddubsw m5, m2, [r4 + 9 * 16] ; [25] | |
9967 pmulhrsw m5, m7 | |
9968 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] | |
9969 pmulhrsw m6, m7 | |
9970 packuswb m5, m6 | |
9971 pmaddubsw m6, m2, [r4 - 16] ; [15] | |
9972 pmulhrsw m6, m7 | |
9973 pmaddubsw m1, m2, [r4 - 6 * 16] ; [10] | |
9974 pmulhrsw m1, m7 | |
9975 packuswb m6, m1 | |
9976 pmaddubsw m1, m2, [r4 - 11 * 16] ; [05] | |
9977 pmulhrsw m1, m7 | |
; The final row is not multiplied: bytes are picked with pb_fact0 and widened
; to words so they can be packed next to m1. Presumably this is the
; fraction-0 (plain copy) row -- TODO confirm against pb_fact0.
9978 movu m0, [pb_fact0] | |
9979 pshufb m2, m0 | |
9980 pmovzxbw m2, m2 | |
9981 packuswb m1, m2 | |
9982 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
9983 %endmacro | |
9984 | |
; intra_pred_ang32_12 (SSE4)
; Declared with 3 arguments, 7 GPRs, 8 XMM registers and one mmsize-wide
; stack slot, which is exposed to the macros below as `above`.
; Register roles established here:
;   r0  output pointer (presumably dst -- TODO confirm argument order)
;   r1  stride (per the existing "3 * stride" comments)
;   r2  reference pointer, advanced by 64 before use
;   r3  copy of the original r2; later reused as the loop counter
;   r4  ang_table + 16 * 16, so the macros can index rows with +/- offsets
;   r5  3 * r1,  r6 = r0 + 4 * r1
;   m7  pw_1024, the pmulhrsw rounding multiplier
; The first strip goes through MODE_12_24_ROW0, the remaining three through
; MODE_12_24 in a counted loop.
9985 INIT_XMM sse4 | |
9986 cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize) | |
9987 %define above [rsp + 0 * mmsize] | |
9988 mov r3, r2 | |
9989 add r2, 64 | |
9990 lea r4, [ang_table + 16 * 16] | |
9991 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
9992 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
9993 mova m7, [pw_1024] | |
9994 | |
9995 MODE_12_24_ROW0 1 | |
; Move to the next strip: r0 = r6 + 4 * r1, r6 += 8 * r1. r2 advances by
; only 7 after the first strip, then by 8 on every loop pass.
9996 lea r0, [r6 + r1 * 4] | |
9997 lea r6, [r6 + r1 * 8] | |
9998 add r2, 7 | |
; r3 is no longer needed as a pointer from here on; it counts the 3
; remaining strips.
9999 mov r3, 3 | |
10000 .loop: | |
10001 MODE_12_24 1 | |
10002 lea r0, [r6 + r1 * 4] | |
10003 lea r6, [r6 + r1 * 8] | |
10004 add r2, 8 | |
10005 dec r3 | |
10006 jnz .loop | |
10007 RET | |
10008 | |
; MODE_13_23_ROW0 %1
; First-strip variant for the "13/23" angle (name suggests modes 13 and 23
; -- TODO confirm callers). Produces four groups of eight rows, each passed
; to TRANSPOSE_STORE_8x8 with group index 0..3.
; Inputs (set up by the caller):
;   r2, r3  byte pointers into the reference samples
;   r4      base of the coefficient table; rows are read as [r4 + k * 16]
;   m7      multiplier used by every pmulhrsw rounding step
;   above   16-byte scratch slot %define'd by the caller
; %1 == 1 replaces byte 0 of each [r2] load with the byte at [r3]; %1 is also
; forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8.
; Clobbers m0-m6. Bracketed numbers in trailing comments label the table row
; (fraction) used by that multiply.
10009 %macro MODE_13_23_ROW0 1 | |
; Build the `above` scratch vector from two 16-byte loads ([r3 + 1] and
; [r3 + 15]), each permuted with c_mode32_13_0, merged dword-wise and then
; reordered with c_mode32_13_shuf.
10010 movu m0, [r3 + 1] | |
10011 movu m1, [r3 + 15] | |
10012 pshufb m0, [c_mode32_13_0] | |
10013 pshufb m1, [c_mode32_13_0] | |
10014 punpckldq m0, m1 | |
10015 pshufb m0, [c_mode32_13_shuf] | |
10016 mova above, m0 | |
; Interleave each byte of [r2] with its right-hand neighbour for pmaddubsw.
10017 movu m2, [r2] | |
10018 %if (%1 == 1) | |
10019 pinsrb m2, [r3], 0 | |
10020 %endif | |
10021 palignr m1, m2, 1 | |
10022 punpcklbw m2, m1 | |
10023 pmaddubsw m4, m2, [r4 + 7 * 16] ; [23] | |
10024 pmulhrsw m4, m7 | |
10025 pmaddubsw m3, m2, [r4 - 2 * 16] ; [14] | |
10026 pmulhrsw m3, m7 | |
10027 packuswb m4, m3 | |
10028 pmaddubsw m5, m2, [r4 - 11 * 16] ; [5] | |
10029 pmulhrsw m5, m7 | |
; Shift the window one sample back, pulling the top byte of `above` in
; below byte 0 of the [r2] data.
10030 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
10031 %if (%1 == 1) | |
10032 pinsrb m1, [r3], 0 | |
10033 %endif | |
10034 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] | |
10035 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
10036 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
10037 pmulhrsw m6, m7 | |
10038 packuswb m5, m6 | |
10039 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] | |
10040 pmulhrsw m6, m7 | |
10041 pmaddubsw m0, m2, [r4 - 6 * 16] ; [10] | |
10042 pmulhrsw m0, m7 | |
10043 packuswb m6, m0 | |
10044 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] | |
10045 pmulhrsw m1, m7 | |
10046 palignr m2, above, 14 | |
10047 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] | |
10048 pmulhrsw m3, m7 | |
10049 packuswb m1, m3 | |
10050 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
10051 pmaddubsw m4, m2, [r4 - 16] ; [15] | |
10052 pmulhrsw m4, m7 | |
10053 pmaddubsw m5, m2, [r4 - 10 * 16] ; [6] | |
10054 pmulhrsw m5, m7 | |
10055 packuswb m4, m5 | |
; From here m0 holds a progressively left-shifted copy of `above`; each
; palignr by 14 slides the pair window by one sample and feeds in the next
; byte from the top of m0.
10056 pslldq m0, above, 1 | |
10057 palignr m2, m0, 14 | |
10058 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] | |
10059 pmulhrsw m5, m7 | |
10060 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] | |
10061 pmulhrsw m6, m7 | |
10062 packuswb m5, m6 | |
10063 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] | |
10064 pmulhrsw m6, m7 | |
10065 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] | |
10066 pmulhrsw m1, m7 | |
10067 packuswb m6, m1 | |
10068 pslldq m0, 1 | |
10069 palignr m2, m0, 14 | |
10070 pmaddubsw m1, m2, [r4 + 9 * 16] ; [25] | |
10071 pmulhrsw m1, m7 | |
10072 pmaddubsw m0, m2, [r4] ; [16] | |
10073 pmulhrsw m0, m7 | |
10074 packuswb m1, m0 | |
10075 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
10076 pmaddubsw m4, m2, [r4 - 9 * 16] ; [7] | |
10077 pmulhrsw m4, m7 | |
; m0 was overwritten by a product above, so the shifted copy is rebuilt
; from `above` with the accumulated shift (3 bytes).
10078 pslldq m0, above, 3 | |
10079 palignr m2, m0, 14 | |
10080 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] | |
10081 pmulhrsw m3, m7 | |
10082 packuswb m4, m3 | |
10083 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] | |
10084 pmulhrsw m5, m7 | |
10085 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] | |
10086 pmulhrsw m6, m7 | |
10087 packuswb m5, m6 | |
10088 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] | |
10089 pmulhrsw m6, m7 | |
10090 pslldq m0, 1 | |
10091 palignr m2, m0, 14 | |
10092 pmaddubsw m0, m2, [r4 + 10 * 16] ; [26] | |
10093 pmulhrsw m0, m7 | |
10094 packuswb m6, m0 | |
10095 pmaddubsw m1, m2, [r4 + 16] ; [17] | |
10096 pmulhrsw m1, m7 | |
10097 pmaddubsw m0, m2, [r4 - 8 * 16] ; [8] | |
10098 pmulhrsw m0, m7 | |
10099 packuswb m1, m0 | |
10100 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
; Rebuild the shifted copy again (accumulated shift is now 5 bytes).
10101 pslldq m0, above, 5 | |
10102 palignr m2, m0, 14 | |
10103 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] | |
10104 pmulhrsw m4, m7 | |
10105 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] | |
10106 pmulhrsw m5, m7 | |
10107 packuswb m4, m5 | |
10108 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] | |
10109 pmulhrsw m5, m7 | |
10110 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
10111 pmulhrsw m6, m7 | |
10112 packuswb m5, m6 | |
10113 pslldq m0, 1 | |
10114 palignr m2, m0, 14 | |
10115 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] | |
10116 pmulhrsw m6, m7 | |
10117 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] | |
10118 pmulhrsw m1, m7 | |
10119 packuswb m6, m1 | |
10120 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] | |
10121 pmulhrsw m1, m7 | |
; Unlike MODE_13_23, the last row here goes through the regular multiply,
; using table row [r4 - 16 * 16] (labelled [00]).
10122 pmaddubsw m3, m2, [r4 - 16 * 16] ; [00] | |
10123 pmulhrsw m3, m7 | |
10124 packuswb m1, m3 | |
10125 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
10126 %endmacro | |
10127 | |
; MODE_13_23 %1, %2
; Steady-state strip for the "13/23" angle: all reference bytes are read
; directly through r2 (at offsets 0, -2, -4, -6 and -7), no `above` scratch
; vector is involved. Produces four groups of eight rows, each passed to
; TRANSPOSE_STORE_8x8 with group index 0..3 and %1 as its second argument.
; Inputs (set up by the caller):
;   r2  byte pointer into the reference samples
;   r3  byte pointer, only dereferenced when (%1 & %2) == 1
;   r4  base of the coefficient table; rows are read as [r4 + k * 16]
;   m7  multiplier used by every pmulhrsw rounding step
; When (%1 & %2) == 1, byte 0 of the final [r2 - 7] load is replaced by the
; byte at [r3].
; Clobbers m0-m6. Bracketed numbers in trailing comments label the table row
; (fraction) used by that multiply.
10128 %macro MODE_13_23 2 | |
; m2 = pairs starting at sample 0, m0 = the same pairs advanced by one
; sample (built from the high half of the interleave).
10129 movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
10130 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
10131 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] | |
10132 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
10133 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] | |
10134 pmaddubsw m4, m0, [r4 + 7 * 16] ; [23] | |
10135 pmulhrsw m4, m7 | |
10136 pmaddubsw m3, m0, [r4 - 2 * 16] ; [14] | |
10137 pmulhrsw m3, m7 | |
10138 packuswb m4, m3 | |
10139 pmaddubsw m5, m0, [r4 - 11 * 16] ; [05] | |
10140 pmulhrsw m5, m7 | |
10141 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
10142 pmulhrsw m6, m7 | |
10143 packuswb m5, m6 | |
10144 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] | |
10145 pmulhrsw m6, m7 | |
10146 pmaddubsw m3, m2, [r4 - 6 * 16] ; [10] | |
10147 pmulhrsw m3, m7 | |
10148 packuswb m6, m3 | |
10149 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] | |
10150 pmulhrsw m1, m7 | |
; Same two-window construction, two bytes further back in the reference.
10151 movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1] | |
10152 palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
10153 punpckhbw m0, m2, m3 | |
10154 punpcklbw m2, m3 | |
10155 palignr m0, m2, 2 | |
10156 pmaddubsw m3, m0, [r4 + 8 * 16] ; [24] | |
10157 pmulhrsw m3, m7 | |
10158 packuswb m1, m3 | |
; Keep the advanced window in m3 for use after the store macro.
10159 mova m3, m0 | |
10160 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
10161 pmaddubsw m4, m3, [r4 - 16] ; [15] | |
10162 pmulhrsw m4, m7 | |
10163 pmaddubsw m5, m3, [r4 - 10 * 16] ; [6] | |
10164 pmulhrsw m5, m7 | |
10165 packuswb m4, m5 | |
10166 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] | |
10167 pmulhrsw m5, m7 | |
10168 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] | |
10169 pmulhrsw m6, m7 | |
10170 packuswb m5, m6 | |
10171 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] | |
10172 pmulhrsw m6, m7 | |
10173 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] | |
10174 pmulhrsw m1, m7 | |
10175 packuswb m6, m1 | |
; NOTE: the lane-index comments on the loads below are relative to the load
; address ([r2 - 4], [r2 - 6], [r2 - 7]), not to r2 itself.
10176 movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
10177 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
10178 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] | |
10179 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
10180 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] | |
10181 pmaddubsw m1, m0, [r4 + 9 * 16] ; [25] | |
10182 pmulhrsw m1, m7 | |
10183 pmaddubsw m3, m0, [r4] ; [16] | |
10184 pmulhrsw m3, m7 | |
10185 packuswb m1, m3 | |
10186 mova m3, m0 | |
10187 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
10188 pmaddubsw m4, m3, [r4 - 9 * 16] ; [7] | |
10189 pmulhrsw m4, m7 | |
10190 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] | |
10191 pmulhrsw m3, m7 | |
10192 packuswb m4, m3 | |
10193 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] | |
10194 pmulhrsw m5, m7 | |
10195 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] | |
10196 pmulhrsw m6, m7 | |
10197 packuswb m5, m6 | |
10198 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] | |
10199 pmulhrsw m6, m7 | |
10200 movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
10201 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
10202 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] | |
10203 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
10204 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] | |
10205 pmaddubsw m3, m0, [r4 + 10 * 16] ; [26] | |
10206 pmulhrsw m3, m7 | |
10207 packuswb m6, m3 | |
10208 pmaddubsw m1, m0, [r4 + 16] ; [17] | |
10209 pmulhrsw m1, m7 | |
10210 pmaddubsw m3, m0, [r4 - 8 * 16] ; [8] | |
10211 pmulhrsw m3, m7 | |
10212 packuswb m1, m3 | |
10213 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
10214 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] | |
10215 pmulhrsw m4, m7 | |
10216 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] | |
10217 pmulhrsw m5, m7 | |
10218 packuswb m4, m5 | |
10219 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] | |
10220 pmulhrsw m5, m7 | |
10221 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
10222 pmulhrsw m6, m7 | |
10223 packuswb m5, m6 | |
; Last reference window. Only when both macro arguments are 1 is its first
; byte substituted with the byte at [r3] (r3 must still be a valid pointer
; in that case).
10224 movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
10225 %if ((%1 & %2) == 1) | |
10226 pinsrb m2, [r3], 0 | |
10227 %endif | |
10228 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
10229 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
10230 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] | |
10231 pmulhrsw m6, m7 | |
10232 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] | |
10233 pmulhrsw m1, m7 | |
10234 packuswb m6, m1 | |
10235 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] | |
10236 pmulhrsw m1, m7 | |
; The final row is not multiplied: bytes are picked with pb_fact0 and widened
; to words so they can be packed next to m1. Presumably this is the
; fraction-0 (plain copy) row -- TODO confirm against pb_fact0.
10237 movu m0, [pb_fact0] | |
10238 pshufb m2, m0 | |
10239 pmovzxbw m2, m2 | |
10240 packuswb m1, m2 | |
10241 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
10242 %endmacro | |
10243 | |
; intra_pred_ang32_13 (SSE4)
; Declared with 3 arguments, 7 GPRs, 8 XMM registers and one mmsize-wide
; stack slot, which is exposed to the macros below as `above`.
; Register roles established here:
;   r0  output pointer (presumably dst -- TODO confirm argument order)
;   r1  stride (per the existing "3 * stride" comments)
;   r2  reference pointer, advanced by 64 before use
;   r3  copy of the original r2; later reused as the loop counter
;   r4  ang_table + 16 * 16, so the macros can index rows with +/- offsets
;   r5  3 * r1,  r6 = r0 + 4 * r1
;   m7  pw_1024, the pmulhrsw rounding multiplier
; Strip 0 uses MODE_13_23_ROW0, strip 1 uses MODE_13_23 with %2 = 1 (which
; still dereferences r3), strips 2-3 use MODE_13_23 with %2 = 0 in a loop.
10244 INIT_XMM sse4 | |
10245 cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize) | |
10246 %define above [rsp + 0 * mmsize] | |
10247 mov r3, r2 | |
10248 add r2, 64 | |
10249 lea r4, [ang_table + 16 * 16] | |
10250 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
10251 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
10252 mova m7, [pw_1024] | |
10253 | |
10254 MODE_13_23_ROW0 1 | |
; Move to the next strip: r0 = r6 + 4 * r1, r6 += 8 * r1. r2 advances by
; only 7 after the first strip, then by 8 afterwards.
10255 lea r0, [r6 + r1 * 4] | |
10256 lea r6, [r6 + r1 * 8] | |
10257 add r2, 7 | |
10258 | |
10259 MODE_13_23 1, 1 | |
10260 lea r0, [r6 + r1 * 4] | |
10261 lea r6, [r6 + r1 * 8] | |
10262 add r2, 8 | |
; r3 stops being a pointer here, which is why the loop body passes %2 = 0
; (the macro then never reads [r3]).
10263 mov r3, 2 | |
10264 .loop: | |
10265 MODE_13_23 1, 0 | |
10266 lea r0, [r6 + r1 * 4] | |
10267 lea r6, [r6 + r1 * 8] | |
10268 add r2, 8 | |
10269 dec r3 | |
10270 jnz .loop | |
10271 RET | |
10272 | |
; intra_pred_ang32_14 (SSE4)
; Declared with 3 arguments, 7 GPRs and 8 XMM registers. The stack frame is
; managed by hand: rsp is lowered and rounded down to a 64-byte boundary,
; and the caller's rsp is parked at [rsp + 64] until just before RET.
; Scratch layout after the "collect" phase:
;   [rsp +  0 .. 15]  bytes selected from [r2] and [r2 + 15] by c_mode32_14_0
;   [rsp + 13 .. 44]  32 bytes copied from [r2 + 65]
;   [rsp + 63]        loop counter byte (4 passes)
; The main loop then reads only from this scratch area via r2.
10273 INIT_XMM sse4 | |
10274 cglobal intra_pred_ang32_14, 3,7,8 | |
10275 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line | |
10276 mov r6, rsp | |
10277 sub rsp, 64+gprsize | |
10278 and rsp, ~63 | |
10279 mov [rsp+64], r6 | |
10280 | |
10281 ; collect reference pixel | |
10282 movu m0, [r2] | |
10283 movu m1, [r2 + 15] | |
10284 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] | |
10285 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] | |
10286 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] | |
10287 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] | |
10288 mova [rsp], m0 | |
10289 movu m0, [r2 + 1 + 64] | |
10290 movu m1, [r2 + 1 + 16 + 64] | |
; The copy at +13 overwrites the three don't-care bytes at the top of the
; vector stored at [rsp].
10291 movu [rsp + 13], m0 | |
10292 movu [rsp + 13 + 16], m1 | |
10293 mov [rsp + 63], byte 4 | |
10294 | |
10295 ; filter | |
10296 lea r2, [rsp + 13] ; r2 -> [0] | |
10297 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
10298 lea r4, [ang_table] ; r4 -> ang_table | |
10299 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
10300 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
; NOTE(review): m5 and m6 are overwritten at the top of .loop before
; PROC32_8x8 runs, so these two loads look dead unless PROC32_8x8 relies on
; them in a way not visible here -- TODO confirm.
10301 mova m5, [pw_1024] ; m5 -> 1024 | |
10302 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
10303 | |
; Each pass issues four PROC32_8x8 invocations (block index 0..3). For each
; one, m7 holds 16 scratch bytes and m0-m6 receive copies of m7 shifted right
; by 0-3 bytes. The two-operand palignr fills the vacated top bytes from the
; destination's previous contents; presumably only the low bytes are
; consumed by PROC32_8x8 -- TODO confirm.
10304 .loop: | |
10305 ; Row[0 - 7] | |
10306 movu m7, [r2 - 4] | |
10307 palignr m0, m7, 3 | |
10308 mova m1, m0 | |
10309 palignr m2, m7, 2 | |
10310 mova m3, m2 | |
10311 palignr m4, m7, 1 | |
10312 mova m5, m4 | |
10313 mova m6, m4 | |
10314 PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24 | |
10315 | |
10316 ; Row[8 - 15] | |
10317 movu m7, [r2 - 7] | |
10318 palignr m0, m7, 3 | |
10319 palignr m1, m7, 2 | |
10320 mova m2, m1 | |
10321 mova m3, m1 | |
10322 palignr m4, m7, 1 | |
10323 mova m5, m4 | |
10324 mova m6, m7 | |
10325 PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16 | |
10326 | |
10327 ; Row[16 - 23] | |
10328 movu m7, [r2 - 10] | |
10329 palignr m0, m7, 3 | |
10330 palignr m1, m7, 2 | |
10331 mova m2, m1 | |
10332 palignr m3, m7, 1 | |
10333 mova m4, m3 | |
10334 mova m5, m3 | |
10335 mova m6, m7 | |
10336 PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8 | |
10337 | |
10338 ; Row[24 - 31] | |
10339 movu m7, [r2 - 13] | |
10340 palignr m0, m7, 2 | |
10341 mova m1, m0 | |
10342 mova m2, m0 | |
10343 palignr m3, m7, 1 | |
10344 mova m4, m3 | |
10345 mova m5, m7 | |
10346 mova m6, m7 | |
10347 PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0 | |
10348 | |
; Next pass: r0 = r6 + 4 * r1, r6 += 8 * r1, scratch pointer r2 += 8. The
; countdown lives in the scratch byte at [rsp + 63].
10349 lea r0, [r6 + r1 * 4] | |
10350 lea r6, [r6 + r1 * 8] | |
10351 add r2, 8 | |
10352 dec byte [rsp + 63] | |
10353 jnz .loop | |
; Restore the caller's stack pointer saved in the prologue.
10354 mov rsp, [rsp+64] | |
10355 RET | |
10356 | |
; intra_pred_ang32_15 (SSE4)
; Declared with 4 arguments, 7 GPRs and 8 XMM registers. The stack frame is
; managed by hand: rsp is lowered and rounded down to a 64-byte boundary,
; and the caller's rsp is parked at [rsp + 64] until just before RET.
; Scratch layout after the "collect" phase (later stores overlap earlier ones
; on purpose, so the store order matters):
;   [rsp +  0 .. 15]  bytes selected from [r2 + 15] by c_mode32_15_0
;   [rsp +  8 .. 23]  bytes selected from [r2] by c_mode32_15_0
;   [rsp + 17 .. 48]  32 bytes copied from [r2 + 65]
;   [rsp + 63]        loop counter byte (4 passes)
; The main loop then reads only from this scratch area via r2.
10357 INIT_XMM sse4 | |
10358 cglobal intra_pred_ang32_15, 4,7,8 | |
10359 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line | |
10360 mov r6, rsp | |
10361 sub rsp, 64+gprsize | |
10362 and rsp, ~63 | |
10363 mov [rsp+64], r6 | |
10364 | |
10365 ; collect reference pixel | |
10366 movu m0, [r2] | |
10367 movu m1, [r2 + 15] | |
10368 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] | |
10369 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] | |
10370 mova [rsp], m1 | |
10371 movu [rsp + 8], m0 | |
10372 movu m0, [r2 + 1 + 64] | |
10373 movu m1, [r2 + 1 + 16 + 64] | |
10374 movu [rsp + 17], m0 | |
10375 movu [rsp + 17 + 16], m1 | |
10376 mov [rsp + 63], byte 4 | |
10377 | |
10378 ; filter | |
10379 lea r2, [rsp + 17] ; r2 -> [0] | |
10380 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
10381 lea r4, [ang_table] ; r4 -> ang_table | |
10382 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
10383 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
; NOTE(review): m5 and m6 are overwritten at the top of .loop before
; PROC32_8x8 runs, so these two loads look dead unless PROC32_8x8 relies on
; them in a way not visible here -- TODO confirm.
10384 mova m5, [pw_1024] ; m5 -> 1024 | |
10385 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
10386 | |
; Each pass issues four PROC32_8x8 invocations (block index 0..3). For each
; one, m7 holds 16 scratch bytes and m0-m6 receive copies of m7 shifted right
; by 0-4 bytes. The two-operand palignr fills the vacated top bytes from the
; destination's previous contents; presumably only the low bytes are
; consumed by PROC32_8x8 -- TODO confirm.
10387 .loop: | |
10388 ; Row[0 - 7] | |
10389 movu m7, [r2 - 5] | |
10390 palignr m0, m7, 4 | |
10391 palignr m1, m7, 3 | |
10392 mova m2, m1 | |
10393 palignr m3, m7, 2 | |
10394 mova m4, m3 | |
10395 palignr m5, m7, 1 | |
10396 mova m6, m5 | |
10397 PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24 | |
10398 | |
10399 ; Row[8 - 15] | |
10400 movu m7, [r2 - 9] | |
10401 palignr m0, m7, 4 | |
10402 palignr m1, m7, 3 | |
10403 mova m2, m1 | |
10404 palignr m3, m7, 2 | |
10405 mova m4, m3 | |
10406 palignr m5, m7, 1 | |
10407 mova m6, m5 | |
10408 PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16 | |
10409 | |
10410 ; Row[16 - 23] | |
10411 movu m7, [r2 - 13] | |
10412 palignr m0, m7, 3 | |
10413 mova m1, m0 | |
10414 palignr m2, m7, 2 | |
10415 mova m3, m2 | |
10416 palignr m4, m7, 1 | |
10417 mova m5, m4 | |
10418 mova m6, m7 | |
10419 PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8 | |
10420 | |
10421 ; Row[24 - 31] | |
10422 movu m7, [r2 - 17] | |
10423 palignr m0, m7, 3 | |
10424 mova m1, m0 | |
10425 palignr m2, m7, 2 | |
10426 mova m3, m2 | |
10427 palignr m4, m7, 1 | |
10428 mova m5, m4 | |
10429 mova m6, m7 | |
10430 PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0 | |
10431 | |
; Next pass: r0 = r6 + 4 * r1, r6 += 8 * r1, scratch pointer r2 += 8. The
; countdown lives in the scratch byte at [rsp + 63].
10432 lea r0, [r6 + r1 * 4] | |
10433 lea r6, [r6 + r1 * 8] | |
10434 add r2, 8 | |
10435 dec byte [rsp + 63] | |
10436 jnz .loop | |
; Restore the caller's stack pointer saved in the prologue.
10437 mov rsp, [rsp+64] | |
10438 RET | |
10439 | |
; intra_pred_ang32_16 (SSE4)
; Declared with 4 arguments, 7 GPRs and 8 XMM registers. The stack frame is
; managed by hand: rsp is lowered and rounded down to a 64-byte boundary,
; and the caller's rsp is parked at [rsp + 64] until just before RET.
; Scratch layout after the "collect" phase (later stores overlap earlier ones
; on purpose, so the store order matters):
;   [rsp +  0 .. 15]  bytes selected from [r2 + 15] by c_mode32_16_0
;   [rsp + 10 .. 25]  bytes selected from [r2] by c_mode32_16_0
;   [rsp + 21 .. 52]  32 bytes copied from [r2 + 65]
;   [rsp + 63]        loop counter byte (4 passes)
; The main loop then reads only from this scratch area via r2.
10440 INIT_XMM sse4 | |
10441 cglobal intra_pred_ang32_16, 4,7,8 | |
10442 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line | |
10443 mov r6, rsp | |
10444 sub rsp, 64+gprsize | |
10445 and rsp, ~63 | |
10446 mov [rsp+64], r6 | |
10447 | |
10448 ; collect reference pixel | |
10449 movu m0, [r2] | |
10450 movu m1, [r2 + 15] | |
10451 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] | |
10452 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] | |
10453 mova [rsp], m1 | |
10454 movu [rsp + 10], m0 | |
10455 movu m0, [r2 + 1 + 64] | |
10456 movu m1, [r2 + 1 + 16 + 64] | |
10457 movu [rsp + 21], m0 | |
10458 movu [rsp + 21 + 16], m1 | |
10459 mov [rsp + 63], byte 4 | |
10460 | |
10461 ; filter | |
10462 lea r2, [rsp + 21] ; r2 -> [0] | |
10463 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
10464 lea r4, [ang_table] ; r4 -> ang_table | |
10465 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
10466 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
; NOTE(review): m5 and m6 are overwritten at the top of .loop before
; PROC32_8x8 runs, so these two loads look dead unless PROC32_8x8 relies on
; them in a way not visible here -- TODO confirm.
10467 mova m5, [pw_1024] ; m5 -> 1024 | |
10468 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
10469 | |
; Each pass issues four PROC32_8x8 invocations (block index 0..3). For each
; one, m7 holds 16 scratch bytes and m0-m6 receive copies of m7 shifted right
; by 0-5 bytes. The two-operand palignr fills the vacated top bytes from the
; destination's previous contents; presumably only the low bytes are
; consumed by PROC32_8x8 -- TODO confirm.
10470 .loop: | |
10471 ; Row[0 - 7] | |
10472 movu m7, [r2 - 6] | |
10473 palignr m0, m7, 5 | |
10474 palignr m1, m7, 4 | |
10475 mova m2, m1 | |
10476 palignr m3, m7, 3 | |
10477 palignr m4, m7, 2 | |
10478 mova m5, m4 | |
10479 palignr m6, m7, 1 | |
10480 PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24 | |
10481 | |
10482 ; Row[8 - 15] | |
10483 movu m7, [r2 - 11] | |
10484 palignr m0, m7, 5 | |
10485 palignr m1, m7, 4 | |
10486 palignr m2, m7, 3 | |
10487 mova m3, m2 | |
10488 palignr m4, m7, 2 | |
10489 palignr m5, m7, 1 | |
10490 mova m6, m5 | |
10491 PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16 | |
10492 | |
10493 ; Row[16 - 23] | |
10494 movu m7, [r2 - 16] | |
10495 palignr m0, m7, 4 | |
10496 mova m1, m0 | |
10497 palignr m2, m7, 3 | |
10498 palignr m3, m7, 2 | |
10499 mova m4, m3 | |
10500 palignr m5, m7, 1 | |
10501 mova m6, m7 | |
10502 PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8 | |
10503 | |
10504 ; Row[24 - 31] | |
10505 movu m7, [r2 - 21] | |
10506 palignr m0, m7, 4 | |
10507 palignr m1, m7, 3 | |
10508 mova m2, m1 | |
10509 palignr m3, m7, 2 | |
10510 palignr m4, m7, 1 | |
10511 mova m5, m4 | |
10512 mova m6, m7 | |
10513 PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0 | |
10514 | |
; Next pass: r0 = r6 + 4 * r1, r6 += 8 * r1, scratch pointer r2 += 8. The
; countdown lives in the scratch byte at [rsp + 63].
10515 lea r0, [r6 + r1 * 4] | |
10516 lea r6, [r6 + r1 * 8] | |
10517 add r2, 8 | |
10518 dec byte [rsp + 63] | |
10519 jnz .loop | |
; Restore the caller's stack pointer saved in the prologue.
10520 mov rsp, [rsp+64] | |
10521 RET | |
10522 | |
; intra_pred_ang32_17 (SSE4)
; Declared with 4 arguments, 7 GPRs and 8 XMM registers. The stack frame is
; managed by hand: rsp is lowered and rounded down to a 64-byte boundary,
; and the caller's rsp is parked at [rsp + 64] until just before RET.
; Scratch layout after the "collect" phase (later stores overlap earlier ones
; on purpose, so the store order matters):
;   [rsp +  0 .. 15]  bytes selected from [r2 + 16] by c_mode32_17_0
;   [rsp + 13 .. 28]  bytes selected from [r2] by c_mode32_17_0
;   [rsp + 26 .. 57]  32 bytes copied from [r2 + 65]
;   [rsp + 63]        loop counter byte (4 passes)
; The main loop then reads only from this scratch area via r2, which starts
; at rsp + 25, i.e. one byte below the copied 32-byte run.
10523 INIT_XMM sse4 | |
10524 cglobal intra_pred_ang32_17, 4,7,8 | |
10525 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line | |
10526 mov r6, rsp | |
10527 sub rsp, 64+gprsize | |
10528 and rsp, ~63 | |
10529 mov [rsp+64], r6 | |
10530 | |
10531 ; collect reference pixel | |
10532 movu m0, [r2] | |
10533 movu m1, [r2 + 16] | |
10534 pshufb m0, [c_mode32_17_0] | |
10535 pshufb m1, [c_mode32_17_0] | |
10536 mova [rsp ], m1 | |
10537 movu [rsp + 13], m0 | |
10538 movu m0, [r2 + 1 + 64] | |
10539 movu m1, [r2 + 1 + 16 + 64] | |
10540 movu [rsp + 26], m0 | |
10541 movu [rsp + 26 + 16], m1 | |
10542 mov [rsp + 63], byte 4 | |
10543 | |
10544 ; filter | |
10545 lea r2, [rsp + 25] ; r2 -> [0] | |
10546 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
10547 lea r4, [ang_table] ; r4 -> ang_table | |
10548 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
10549 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
; NOTE(review): m5 and m6 are overwritten at the top of .loop before
; PROC32_8x8 runs, so these two loads look dead unless PROC32_8x8 relies on
; them in a way not visible here -- TODO confirm.
10550 mova m5, [pw_1024] ; m5 -> 1024 | |
10551 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
10552 | |
; Each pass issues four PROC32_8x8 invocations (block index 0..3). For each
; one, m7 holds 16 scratch bytes and m0-m6 receive copies of m7 shifted right
; by 0-6 bytes. The two-operand palignr fills the vacated top bytes from the
; destination's previous contents; presumably only the low bytes are
; consumed by PROC32_8x8 -- TODO confirm.
10553 .loop: | |
10554 ; Row[0 - 7] | |
10555 movu m7, [r2 - 6] | |
10556 palignr m0, m7, 6 | |
10557 palignr m1, m7, 5 | |
10558 palignr m2, m7, 4 | |
10559 palignr m3, m7, 3 | |
10560 palignr m4, m7, 2 | |
10561 mova m5, m4 | |
10562 palignr m6, m7, 1 | |
10563 PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16 | |
10564 | |
10565 ; Row[8 - 15] | |
10566 movu m7, [r2 - 12] | |
10567 palignr m0, m7, 5 | |
10568 palignr m1, m7, 4 | |
10569 mova m2, m1 | |
10570 palignr m3, m7, 3 | |
10571 palignr m4, m7, 2 | |
10572 palignr m5, m7, 1 | |
10573 mova m6, m7 | |
10574 PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0 | |
10575 | |
10576 ; Row[16 - 23] | |
10577 movu m7, [r2 - 19] | |
10578 palignr m0, m7, 6 | |
10579 palignr m1, m7, 5 | |
10580 palignr m2, m7, 4 | |
10581 palignr m3, m7, 3 | |
10582 palignr m4, m7, 2 | |
10583 mova m5, m4 | |
10584 palignr m6, m7, 1 | |
10585 PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16 | |
10586 | |
10587 ; Row[24 - 31] | |
10588 movu m7, [r2 - 25] | |
10589 palignr m0, m7, 5 | |
10590 palignr m1, m7, 4 | |
10591 mova m2, m1 | |
10592 palignr m3, m7, 3 | |
10593 palignr m4, m7, 2 | |
10594 palignr m5, m7, 1 | |
10595 mova m6, m7 | |
10596 PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0 | |
10597 | |
; Next pass: r0 = r6 + 4 * r1, r6 += 8 * r1, scratch pointer r2 += 8. The
; countdown lives in the scratch byte at [rsp + 63].
10598 lea r0, [r6 + r1 * 4] | |
10599 lea r6, [r6 + r1 * 8] | |
10600 add r2, 8 | |
10601 dec byte [rsp + 63] | |
10602 jnz .loop | |
; Restore the caller's stack pointer saved in the prologue.
10603 mov rsp, [rsp+64] | |
10604 | |
10605 RET | |
10606 | |
; intra_pred_ang32_18 (AVX2 build)
; Writes 32 rows of 32 bytes to r0 with row pitch r1. No arithmetic is done:
; every row is a byte-shifted window over the reference samples.
;   row 0        = 32 bytes at [r2]
;   rows 1..15   = row 0 shifted right by the row index, with bytes from
;                  [r2 + 65 ..] entering on the left in reversed order
;   row 16       = reversed [r2 + 65 .. r2 + 80] followed by [r2 .. r2 + 15]
;   rows 17..31  = continue the same shift, pulling in reversed bytes from
;                  [r2 + 81 .. r2 + 96]
; Because ymm palignr works on each 128-bit lane separately, the "incoming"
; register is arranged as { low lane = reversed side bytes, high lane = low
; half of the row being shifted } so that one palignr yields a full row.
; Registers: r3 = 3 * r1; m0-m2 are the only vector registers used.
10607 INIT_YMM avx2 | |
10608 cglobal intra_pred_ang32_18, 4, 4, 3 | |
10609 movu m0, [r2] | |
; xm1 = 16 bytes from [r2 + 65] in reversed byte order
; (intra_pred_shuff_15_0 is the index list 15..0).
10610 movu xm1, [r2 + 1 + 64] | |
10611 pshufb xm1, [intra_pred_shuff_15_0] | |
10612 mova xm2, xm0 | |
10613 vinserti128 m1, m1, xm2, 1 | |
10614 | |
10615 lea r3, [r1 * 3] | |
10616 | |
; Rows 0-15: shift count goes 15, 14, ..., 1 as the row index increases.
10617 movu [r0], m0 | |
10618 palignr m2, m0, m1, 15 | |
10619 movu [r0 + r1], m2 | |
10620 palignr m2, m0, m1, 14 | |
10621 movu [r0 + r1 * 2], m2 | |
10622 palignr m2, m0, m1, 13 | |
10623 movu [r0 + r3], m2 | |
10624 | |
10625 lea r0, [r0 + r1 * 4] | |
10626 palignr m2, m0, m1, 12 | |
10627 movu [r0], m2 | |
10628 palignr m2, m0, m1, 11 | |
10629 movu [r0 + r1], m2 | |
10630 palignr m2, m0, m1, 10 | |
10631 movu [r0 + r1 * 2], m2 | |
10632 palignr m2, m0, m1, 9 | |
10633 movu [r0 + r3], m2 | |
10634 | |
10635 lea r0, [r0 + r1 * 4] | |
10636 palignr m2, m0, m1, 8 | |
10637 movu [r0], m2 | |
10638 palignr m2, m0, m1, 7 | |
10639 movu [r0 + r1], m2 | |
10640 palignr m2, m0, m1, 6 | |
10641 movu [r0 + r1 * 2], m2 | |
10642 palignr m2, m0, m1, 5 | |
10643 movu [r0 + r3], m2 | |
10644 | |
10645 lea r0, [r0 + r1 * 4] | |
10646 palignr m2, m0, m1, 4 | |
10647 movu [r0], m2 | |
10648 palignr m2, m0, m1, 3 | |
10649 movu [r0 + r1], m2 | |
10650 palignr m2, m0, m1, 2 | |
10651 movu [r0 + r1 * 2], m2 | |
10652 palignr m2, m0, m1, 1 | |
10653 movu [r0 + r3], m2 | |
10654 | |
; Row 16 is exactly m1.
10655 lea r0, [r0 + r1 * 4] | |
10656 movu [r0], m1 | |
10657 | |
; Rows 17-31: m1 now plays the role m0 had above; m0 is rebuilt as
; { reversed [r2 + 81 ..], low lane of m1 } to supply the incoming bytes.
10658 movu xm0, [r2 + 64 + 17] | |
10659 pshufb xm0, [intra_pred_shuff_15_0] | |
10660 vinserti128 m0, m0, xm1, 1 | |
10661 | |
10662 palignr m2, m1, m0, 15 | |
10663 movu [r0 + r1], m2 | |
10664 palignr m2, m1, m0, 14 | |
10665 movu [r0 + r1 * 2], m2 | |
10666 palignr m2, m1, m0, 13 | |
10667 movu [r0 + r3], m2 | |
10668 | |
10669 lea r0, [r0 + r1 * 4] | |
10670 palignr m2, m1, m0, 12 | |
10671 movu [r0], m2 | |
10672 palignr m2, m1, m0, 11 | |
10673 movu [r0 + r1], m2 | |
10674 palignr m2, m1, m0, 10 | |
10675 movu [r0 + r1 * 2], m2 | |
10676 palignr m2, m1, m0, 9 | |
10677 movu [r0 + r3], m2 | |
10678 | |
10679 lea r0, [r0 + r1 * 4] | |
10680 palignr m2, m1, m0, 8 | |
10681 movu [r0], m2 | |
10682 palignr m2, m1, m0, 7 | |
10683 movu [r0 + r1], m2 | |
10684 palignr m2, m1, m0,6 | |
10685 movu [r0 + r1 * 2], m2 | |
10686 palignr m2, m1, m0, 5 | |
10687 movu [r0 + r3], m2 | |
10688 | |
10689 lea r0, [r0 + r1 * 4] | |
10690 palignr m2, m1, m0, 4 | |
10691 movu [r0], m2 | |
10692 palignr m2, m1, m0, 3 | |
10693 movu [r0 + r1], m2 | |
10694 palignr m2, m1, m0,2 | |
10695 movu [r0 + r1 * 2], m2 | |
10696 palignr m2, m1, m0, 1 | |
10697 movu [r0 + r3], m2 | |
10698 RET | |
10699 | |
; intra_pred_ang32_18 (SSE4 build)
; Writes 32 rows of 32 bytes to r0 with row pitch r1, each row as two 16-byte
; halves. No arithmetic is done: every row is a byte-shifted window over the
; reference samples.
;   m0 = [r2 .. r2 + 15], m1 = [r2 + 16 .. r2 + 31]
;   m2 = [r2 + 65 .. r2 + 80] and m3 = [r2 + 81 .. r2 + 96], both permuted
;        with c_mode32_18_0 (byte order reversed, per the inline comments)
;   rows 0..15:  left half = m0:m2 window, right half = m1:m0 window
;   rows 16..31: left half = m2:m3 window, right half = m0:m2 window
; The shift count goes 16 (plain copy), 15, ..., 1 within each group of 16.
; r2 is recycled as 2 * r1 once the four loads are done; r3 = 3 * r1 and
; r4 = 4 * r1.
10700 INIT_XMM sse4 | |
10701 cglobal intra_pred_ang32_18, 4,5,5 | |
10702 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
10703 movu m1, [r2 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16] | |
10704 movu m2, [r2 + 1 + 64] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
10705 movu m3, [r2 + 17 + 64] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
10706 | |
; All reads through r2 are complete; it is safe to overwrite it now.
10707 lea r2, [r1 * 2] | |
10708 lea r3, [r1 * 3] | |
10709 lea r4, [r1 * 4] | |
10710 | |
10711 movu [r0], m0 | |
10712 movu [r0 + 16], m1 | |
10713 | |
10714 pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
10715 pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32] | |
10716 | |
10717 palignr m4, m0, m2, 15 | |
10718 movu [r0 + r1], m4 | |
10719 palignr m4, m1, m0, 15 | |
10720 movu [r0 + r1 + 16], m4 | |
10721 palignr m4, m0, m2, 14 | |
10722 movu [r0 + r2], m4 | |
10723 palignr m4, m1, m0, 14 | |
10724 movu [r0 + r2 + 16], m4 | |
10725 palignr m4, m0, m2, 13 | |
10726 movu [r0 + r3], m4 | |
10727 palignr m4, m1, m0, 13 | |
10728 movu [r0 + r3 + 16], m4 | |
10729 | |
10730 lea r0, [r0 + r4] | |
10731 | |
10732 palignr m4, m0, m2, 12 | |
10733 movu [r0], m4 | |
10734 palignr m4, m1, m0, 12 | |
10735 movu [r0 + 16], m4 | |
10736 palignr m4, m0, m2, 11 | |
10737 movu [r0 + r1], m4 | |
10738 palignr m4, m1, m0, 11 | |
10739 movu [r0 + r1 + 16], m4 | |
10740 palignr m4, m0, m2, 10 | |
10741 movu [r0 + r2], m4 | |
10742 palignr m4, m1, m0, 10 | |
10743 movu [r0 + r2 + 16], m4 | |
10744 palignr m4, m0, m2, 9 | |
10745 movu [r0 + r3], m4 | |
10746 palignr m4, m1, m0, 9 | |
10747 movu [r0 + r3 + 16], m4 | |
10748 | |
10749 lea r0, [r0 + r4] | |
10750 | |
10751 palignr m4, m0, m2, 8 | |
10752 movu [r0], m4 | |
10753 palignr m4, m1, m0, 8 | |
10754 movu [r0 + 16], m4 | |
10755 palignr m4, m0, m2, 7 | |
10756 movu [r0 + r1], m4 | |
10757 palignr m4, m1, m0, 7 | |
10758 movu [r0 + r1 + 16], m4 | |
10759 palignr m4, m0, m2, 6 | |
10760 movu [r0 + r2], m4 | |
10761 palignr m4, m1, m0, 6 | |
10762 movu [r0 + r2 + 16], m4 | |
10763 palignr m4, m0, m2, 5 | |
10764 movu [r0 + r3], m4 | |
10765 palignr m4, m1, m0, 5 | |
10766 movu [r0 + r3 + 16], m4 | |
10767 | |
10768 lea r0, [r0 + r4] | |
10769 | |
10770 palignr m4, m0, m2, 4 | |
10771 movu [r0], m4 | |
10772 palignr m4, m1, m0, 4 | |
10773 movu [r0 + 16], m4 | |
10774 palignr m4, m0, m2, 3 | |
10775 movu [r0 + r1], m4 | |
10776 palignr m4, m1, m0, 3 | |
10777 movu [r0 + r1 + 16], m4 | |
10778 palignr m4, m0, m2, 2 | |
10779 movu [r0 + r2], m4 | |
10780 palignr m4, m1, m0, 2 | |
10781 movu [r0 + r2 + 16], m4 | |
10782 palignr m4, m0, m2, 1 | |
10783 movu [r0 + r3], m4 | |
10784 palignr m4, m1, m0, 1 | |
10785 movu [r0 + r3 + 16], m4 | |
10786 | |
10787 lea r0, [r0 + r4] | |
10788 | |
; Row 16: the window has moved a full 16 bytes, so the halves are m2 and m0
; unshifted. Rows 17-31 repeat the pattern one register further along.
10789 movu [r0], m2 | |
10790 movu [r0 + 16], m0 | |
10791 palignr m4, m2, m3, 15 | |
10792 movu [r0 + r1], m4 | |
10793 palignr m4, m0, m2, 15 | |
10794 movu [r0 + r1 + 16], m4 | |
10795 palignr m4, m2, m3, 14 | |
10796 movu [r0 + r2], m4 | |
10797 palignr m4, m0, m2, 14 | |
10798 movu [r0 + r2 + 16], m4 | |
10799 palignr m4, m2, m3, 13 | |
10800 movu [r0 + r3], m4 | |
10801 palignr m4, m0, m2, 13 | |
10802 movu [r0 + r3 + 16], m4 | |
10803 | |
10804 lea r0, [r0 + r4] | |
10805 | |
10806 palignr m4, m2, m3, 12 | |
10807 movu [r0], m4 | |
10808 palignr m4, m0, m2, 12 | |
10809 movu [r0 + 16], m4 | |
10810 palignr m4, m2, m3, 11 | |
10811 movu [r0 + r1], m4 | |
10812 palignr m4, m0, m2, 11 | |
10813 movu [r0 + r1 + 16], m4 | |
10814 palignr m4, m2, m3, 10 | |
10815 movu [r0 + r2], m4 | |
10816 palignr m4, m0, m2, 10 | |
10817 movu [r0 + r2 + 16], m4 | |
10818 palignr m4, m2, m3, 9 | |
10819 movu [r0 + r3], m4 | |
10820 palignr m4, m0, m2, 9 | |
10821 movu [r0 + r3 + 16], m4 | |
10822 | |
10823 lea r0, [r0 + r4] | |
10824 | |
10825 palignr m4, m2, m3, 8 | |
10826 movu [r0], m4 | |
10827 palignr m4, m0, m2, 8 | |
10828 movu [r0 + 16], m4 | |
10829 palignr m4, m2, m3, 7 | |
10830 movu [r0 + r1], m4 | |
10831 palignr m4, m0, m2, 7 | |
10832 movu [r0 + r1 + 16], m4 | |
10833 palignr m4, m2, m3, 6 | |
10834 movu [r0 + r2], m4 | |
10835 palignr m4, m0, m2, 6 | |
10836 movu [r0 + r2 + 16], m4 | |
10837 palignr m4, m2, m3, 5 | |
10838 movu [r0 + r3], m4 | |
10839 palignr m4, m0, m2, 5 | |
10840 movu [r0 + r3 + 16], m4 | |
10841 | |
10842 lea r0, [r0 + r4] | |
10843 | |
10844 palignr m4, m2, m3, 4 | |
10845 movu [r0], m4 | |
10846 palignr m4, m0, m2, 4 | |
10847 movu [r0 + 16], m4 | |
10848 palignr m4, m2, m3, 3 | |
10849 movu [r0 + r1], m4 | |
10850 palignr m4, m0, m2, 3 | |
10851 movu [r0 + r1 + 16], m4 | |
10852 palignr m4, m2, m3, 2 | |
10853 movu [r0 + r2], m4 | |
10854 palignr m4, m0, m2, 2 | |
10855 movu [r0 + r2 + 16], m4 | |
10856 palignr m4, m2, m3, 1 | |
10857 movu [r0 + r3], m4 | |
10858 palignr m4, m0, m2, 1 | |
10859 movu [r0 + r3 + 16], m4 | |
10860 RET | |
10861 | |
; intra_pred_ang32_19 (SSE4)
; Declared with 4 arguments, 7 GPRs and 8 XMM registers. The stack frame is
; managed by hand: rsp is lowered and rounded down to a 64-byte boundary,
; and the caller's rsp is parked at [rsp + 64] until just before RET.
; Structurally this mirrors intra_pred_ang32_17 with the two halves of the
; reference buffer swapped (the shuffled bytes come from r2 + 64, the
; straight copy from r2 + 1) and with PROC32_8x8 invoked with 0 as its second
; argument instead of 1.
; Scratch layout after the "collect" phase (later stores overlap earlier ones
; on purpose, so the store order matters):
;   [rsp +  0 .. 15]  bytes selected from [r2 + 80] by c_mode32_17_0
;   [rsp + 13 .. 28]  bytes selected by c_mode32_17_0 from [r2 + 64] after its
;                     byte 0 has been replaced with the byte at [r2]
;   [rsp + 26 .. 57]  32 bytes copied from [r2 + 1]
;   [rsp + 63]        loop counter byte (4 passes)
; Output stepping differs from mode 17: r0 moves down by 4 * r1 between the
; blocks of a pass, and each new pass starts 8 bytes to the right (r6 += 8).
10862 INIT_XMM sse4 | |
10863 cglobal intra_pred_ang32_19, 4,7,8 | |
10864 ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line | |
10865 mov r6, rsp | |
10866 sub rsp, 64+gprsize | |
10867 and rsp, ~63 | |
10868 mov [rsp+64], r6 | |
10869 | |
10870 ; collect reference pixel | |
10871 movu m0, [r2 + 64] | |
10872 pinsrb m0, [r2], 0 | |
10873 movu m1, [r2 + 16 + 64] | |
10874 pshufb m0, [c_mode32_17_0] | |
10875 pshufb m1, [c_mode32_17_0] | |
10876 mova [rsp ], m1 | |
10877 movu [rsp + 13], m0 | |
10878 movu m0, [r2 + 1] | |
10879 movu m1, [r2 + 1 + 16] | |
10880 movu [rsp + 26], m0 | |
10881 movu [rsp + 26 + 16], m1 | |
10882 mov [rsp + 63], byte 4 | |
10883 | |
10884 ; filter | |
10885 lea r2, [rsp + 25] ; r2 -> [0] | |
10886 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
10887 lea r4, [ang_table] ; r4 -> ang_table | |
10888 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
10889 lea r6, [r0] ; r6 -> r0 | |
; NOTE(review): m5 and m6 are overwritten at the top of .loop before
; PROC32_8x8 runs, so these two loads look dead unless PROC32_8x8 relies on
; them in a way not visible here -- TODO confirm.
10890 mova m5, [pw_1024] ; m5 -> 1024 | |
10891 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
10892 | |
; Each pass issues four PROC32_8x8 invocations (block index 0..3). For each
; one, m7 holds 16 scratch bytes and m0-m6 receive copies of m7 shifted right
; by 0-6 bytes. The two-operand palignr fills the vacated top bytes from the
; destination's previous contents; presumably only the low bytes are
; consumed by PROC32_8x8 -- TODO confirm.
10893 .loop: | |
10894 ; Row[0 - 7] | |
10895 movu m7, [r2 - 6] | |
10896 palignr m0, m7, 6 | |
10897 palignr m1, m7, 5 | |
10898 palignr m2, m7, 4 | |
10899 palignr m3, m7, 3 | |
10900 palignr m4, m7, 2 | |
10901 mova m5, m4 | |
10902 palignr m6, m7, 1 | |
10903 PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16 | |
10904 | |
10905 ; Row[8 - 15] | |
10906 movu m7, [r2 - 12] | |
10907 palignr m0, m7, 5 | |
10908 palignr m1, m7, 4 | |
10909 mova m2, m1 | |
10910 palignr m3, m7, 3 | |
10911 palignr m4, m7, 2 | |
10912 palignr m5, m7, 1 | |
10913 mova m6, m7 | |
10914 lea r0, [r0 + r1 * 4] | |
10915 PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0 | |
10916 | |
10917 ; Row[16 - 23] | |
10918 movu m7, [r2 - 19] | |
10919 palignr m0, m7, 6 | |
10920 palignr m1, m7, 5 | |
10921 palignr m2, m7, 4 | |
10922 palignr m3, m7, 3 | |
10923 palignr m4, m7, 2 | |
10924 mova m5, m4 | |
10925 palignr m6, m7, 1 | |
10926 lea r0, [r0 + r1 * 4] | |
10927 PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16 | |
10928 | |
10929 ; Row[24 - 31] | |
10930 movu m7, [r2 - 25] | |
10931 palignr m0, m7, 5 | |
10932 palignr m1, m7, 4 | |
10933 mova m2, m1 | |
10934 palignr m3, m7, 3 | |
10935 palignr m4, m7, 2 | |
10936 palignr m5, m7, 1 | |
10937 mova m6, m7 | |
10938 lea r0, [r0 + r1 * 4] | |
10939 PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0 | |
10940 | |
; Next pass: restart r0 from the saved base r6 advanced by 8 bytes, and move
; the scratch pointer r2 by 8. The countdown lives at [rsp + 63].
10941 add r6, 8 | |
10942 mov r0, r6 | |
10943 add r2, 8 | |
10944 dec byte [rsp + 63] | |
10945 jnz .loop | |
; Restore the caller's stack pointer saved in the prologue.
10946 mov rsp, [rsp+64] | |
10947 RET | |
10948 | |
; intra_pred_ang32_20 (SSE4)
; Register use visible in this routine:
;   r0 = destination pointer, r1 = destination stride,
;   r2 = reference pixels on entry (rebound to a stack copy before .loop),
;   r3 = c_shuf8_0, r4 = ang_table, r5 = 3 * stride, r6 = column base for r0.
; The stack is realigned to 64 bytes for a scratch reference buffer; the
; caller's rsp lives at [rsp+64] until it is restored before RET, and the
; byte at [rsp+63] counts the 4 loop passes (8 bytes of r0/r2 per pass).
; NOTE(review): the m5/m6 preloads are overwritten at the top of .loop before
; PROC32_8x8 is reached; presumably unused by the macro - confirm.
10949 INIT_XMM sse4 | |
10950 cglobal intra_pred_ang32_20, 4,7,8 | |
10951 ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
10952 mov r6, rsp | |
10953 sub rsp, 64+gprsize | |
10954 and rsp, ~63 | |
10955 mov [rsp+64], r6 | |
10956 | |
10957 ; collect reference pixel | |
; Shuffled bytes from r2+64.. (byte 0 replaced by [r2]) and r2+79.. go to
; rsp+0 / rsp+10; the 32 bytes from r2+1 are then copied to rsp+21.
10958 movu m0, [r2 + 64] | |
10959 pinsrb m0, [r2], 0 | |
10960 movu m1, [r2 + 15 + 64] | |
10961 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] | |
10962 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] | |
10963 mova [rsp], m1 | |
10964 movu [rsp + 10], m0 | |
10965 movu m0, [r2 + 1] | |
10966 movu m1, [r2 + 1 + 16] | |
10967 movu [rsp + 21], m0 | |
10968 movu [rsp + 21 + 16], m1 | |
10969 mov [rsp + 63], byte 4 | |
10970 | |
10971 ; filter | |
10972 lea r2, [rsp + 21] ; r2 -> [0] | |
10973 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
10974 lea r4, [ang_table] ; r4 -> ang_table | |
10975 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
10976 lea r6, [r0] ; r6 -> r0 | |
10977 mova m5, [pw_1024] ; m5 -> 1024 | |
10978 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
10979 | |
10980 .loop: | |
10981 ; Row[0 - 7] | |
; m7 = 16 reference bytes from r2-6; m0..m6 are byte-shifted views of it,
; with a register copied wherever two consecutive rows use the same offset.
10982 movu m7, [r2 - 6] | |
10983 palignr m0, m7, 5 | |
10984 palignr m1, m7, 4 | |
10985 mova m2, m1 | |
10986 palignr m3, m7, 3 | |
10987 palignr m4, m7, 2 | |
10988 mova m5, m4 | |
10989 palignr m6, m7, 1 | |
10990 PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24 | |
10991 | |
10992 ; Row[8 - 15] | |
10993 movu m7, [r2 - 11] | |
10994 palignr m0, m7, 5 | |
10995 palignr m1, m7, 4 | |
10996 palignr m2, m7, 3 | |
10997 mova m3, m2 | |
10998 palignr m4, m7, 2 | |
10999 palignr m5, m7, 1 | |
11000 mova m6, m5 | |
11001 lea r0, [r0 + r1 * 4] | |
11002 PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16 | |
11003 | |
11004 ; Row[16 - 23] | |
11005 movu m7, [r2 - 16] | |
11006 palignr m0, m7, 4 | |
11007 mova m1, m0 | |
11008 palignr m2, m7, 3 | |
11009 palignr m3, m7, 2 | |
11010 mova m4, m3 | |
11011 palignr m5, m7, 1 | |
11012 mova m6, m7 | |
11013 lea r0, [r0 + r1 * 4] | |
11014 PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8 | |
11015 | |
11016 ; Row[24 - 31] | |
11017 movu m7, [r2 - 21] | |
11018 palignr m0, m7, 4 | |
11019 palignr m1, m7, 3 | |
11020 mova m2, m1 | |
11021 palignr m3, m7, 2 | |
11022 palignr m4, m7, 1 | |
11023 mova m5, m4 | |
11024 mova m6, m7 | |
11025 lea r0, [r0 + r1 * 4] | |
11026 PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0 | |
11027 | |
; next pass: destination base and reference window both move 8 bytes forward
11028 add r6, 8 | |
11029 mov r0, r6 | |
11030 add r2, 8 | |
11031 dec byte [rsp + 63] | |
11032 jnz .loop | |
; restore the caller's stack pointer saved in the prologue
11033 mov rsp, [rsp+64] | |
11034 RET | |
11035 | |
; intra_pred_ang32_21 (SSE4)
; Register use visible in this routine:
;   r0 = destination pointer, r1 = destination stride,
;   r2 = reference pixels on entry (rebound to a stack copy before .loop),
;   r3 = c_shuf8_0, r4 = ang_table, r5 = 3 * stride, r6 = column base for r0.
; The stack is realigned to 64 bytes for a scratch reference buffer; the
; caller's rsp lives at [rsp+64] until it is restored before RET, and the
; byte at [rsp+63] counts the 4 loop passes (8 bytes of r0/r2 per pass).
; NOTE(review): the m5/m6 preloads are overwritten at the top of .loop before
; PROC32_8x8 is reached; presumably unused by the macro - confirm.
11036 INIT_XMM sse4 | |
11037 cglobal intra_pred_ang32_21, 4,7,8 | |
11038 ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
11039 mov r6, rsp | |
11040 sub rsp, 64+gprsize | |
11041 and rsp, ~63 | |
11042 mov [rsp+64], r6 | |
11043 | |
11044 ; collect reference pixel | |
; Shuffled bytes from r2+64.. (byte 0 replaced by [r2]) and r2+79.. go to
; rsp+0 / rsp+8; the 32 bytes from r2+1 are then copied to rsp+17.
11045 movu m0, [r2 + 64] | |
11046 pinsrb m0, [r2], 0 | |
11047 movu m1, [r2 + 15 + 64] | |
11048 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] | |
11049 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] | |
11050 mova [rsp], m1 | |
11051 movu [rsp + 8], m0 | |
11052 movu m0, [r2 + 1] | |
11053 movu m1, [r2 + 1 + 16] | |
11054 movu [rsp + 17], m0 | |
11055 movu [rsp + 17 + 16], m1 | |
11056 mov [rsp + 63], byte 4 | |
11057 | |
11058 ; filter | |
11059 lea r2, [rsp + 17] ; r2 -> [0] | |
11060 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
11061 lea r4, [ang_table] ; r4 -> ang_table | |
11062 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
11063 lea r6, [r0] ; r6 -> r0 | |
11064 mova m5, [pw_1024] ; m5 -> 1024 | |
11065 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
11066 | |
11067 .loop: | |
11068 ; Row[0 - 7] | |
; m7 = 16 reference bytes from r2-5; rows are paired, so each byte-shifted
; view produced by palignr is duplicated into the following register.
11069 movu m7, [r2 - 5] | |
11070 palignr m0, m7, 4 | |
11071 palignr m1, m7, 3 | |
11072 mova m2, m1 | |
11073 palignr m3, m7, 2 | |
11074 mova m4, m3 | |
11075 palignr m5, m7, 1 | |
11076 mova m6, m5 | |
11077 PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24 | |
11078 | |
11079 ; Row[8 - 15] | |
11080 movu m7, [r2 - 9] | |
11081 palignr m0, m7, 4 | |
11082 palignr m1, m7, 3 | |
11083 mova m2, m1 | |
11084 palignr m3, m7, 2 | |
11085 mova m4, m3 | |
11086 palignr m5, m7, 1 | |
11087 mova m6, m5 | |
11088 lea r0, [r0 + r1 * 4] | |
11089 PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16 | |
11090 | |
11091 ; Row[16 - 23] | |
11092 movu m7, [r2 - 13] | |
11093 palignr m0, m7, 3 | |
11094 mova m1, m0 | |
11095 palignr m2, m7, 2 | |
11096 mova m3, m2 | |
11097 palignr m4, m7, 1 | |
11098 mova m5, m4 | |
11099 mova m6, m7 | |
11100 lea r0, [r0 + r1 * 4] | |
11101 PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8 | |
11102 | |
11103 ; Row[24 - 31] | |
11104 movu m7, [r2 - 17] | |
11105 palignr m0, m7, 3 | |
11106 mova m1, m0 | |
11107 palignr m2, m7, 2 | |
11108 mova m3, m2 | |
11109 palignr m4, m7, 1 | |
11110 mova m5, m4 | |
11111 mova m6, m7 | |
11112 lea r0, [r0 + r1 * 4] | |
11113 PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0 | |
11114 | |
; next pass: destination base and reference window both move 8 bytes forward
11115 add r6, 8 | |
11116 mov r0, r6 | |
11117 add r2, 8 | |
11118 dec byte [rsp + 63] | |
11119 jnz .loop | |
; restore the caller's stack pointer saved in the prologue
11120 mov rsp, [rsp+64] | |
11121 RET | |
11122 | |
; intra_pred_ang32_22 (SSE4)
; Register use visible in this routine:
;   r0 = destination pointer, r1 = destination stride,
;   r2 = reference pixels on entry (rebound to a stack copy before .loop),
;   r3 = c_shuf8_0, r4 = ang_table, r5 = 3 * stride, r6 = column base for r0.
; The stack is realigned to 64 bytes for a scratch reference buffer; the
; caller's rsp lives at [rsp+64] until it is restored before RET, and the
; byte at [rsp+63] counts the 4 loop passes (8 bytes of r0/r2 per pass).
; NOTE(review): the m5/m6 preloads are overwritten at the top of .loop before
; PROC32_8x8 is reached; presumably unused by the macro - confirm.
11123 INIT_XMM sse4 | |
11124 cglobal intra_pred_ang32_22, 4,7,8 | |
11125 ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
11126 mov r6, rsp | |
11127 sub rsp, 64+gprsize | |
11128 and rsp, ~63 | |
11129 mov [rsp+64], r6 | |
11130 | |
11131 ; collect reference pixel | |
; Unlike the neighbouring modes, the two shuffled vectors are merged into a
; single register (pslldq + palignr) and written with one aligned store; the
; 32 bytes from r2+1 are then copied to rsp+13.
11132 movu m0, [r2 + 64] | |
11133 pinsrb m0, [r2], 0 | |
11134 movu m1, [r2 + 15 + 64] | |
11135 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] | |
11136 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] | |
11137 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] | |
11138 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] | |
11139 mova [rsp], m0 | |
11140 movu m0, [r2 + 1] | |
11141 movu m1, [r2 + 1 + 16] | |
11142 movu [rsp + 13], m0 | |
11143 movu [rsp + 13 + 16], m1 | |
11144 mov [rsp + 63], byte 4 | |
11145 | |
11146 ; filter | |
11147 lea r2, [rsp + 13] ; r2 -> [0] | |
11148 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
11149 lea r4, [ang_table] ; r4 -> ang_table | |
11150 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
11151 lea r6, [r0] ; r6 -> r0 | |
11152 mova m5, [pw_1024] ; m5 -> 1024 | |
11153 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
11154 | |
11155 .loop: | |
11156 ; Row[0 - 7] | |
; m7 = 16 reference bytes from r2-4; m0..m6 are byte-shifted views of it,
; copied into two or three registers where consecutive rows share an offset.
11157 movu m7, [r2 - 4] | |
11158 palignr m0, m7, 3 | |
11159 mova m1, m0 | |
11160 palignr m2, m7, 2 | |
11161 mova m3, m2 | |
11162 palignr m4, m7, 1 | |
11163 mova m5, m4 | |
11164 mova m6, m4 | |
11165 PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24 | |
11166 | |
11167 ; Row[8 - 15] | |
11168 movu m7, [r2 - 7] | |
11169 palignr m0, m7, 3 | |
11170 palignr m1, m7, 2 | |
11171 mova m2, m1 | |
11172 mova m3, m1 | |
11173 palignr m4, m7, 1 | |
11174 mova m5, m4 | |
11175 mova m6, m7 | |
11176 lea r0, [r0 + r1 * 4] | |
11177 PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16 | |
11178 | |
11179 ; Row[16 - 23] | |
11180 movu m7, [r2 - 10] | |
11181 palignr m0, m7, 3 | |
11182 palignr m1, m7, 2 | |
11183 mova m2, m1 | |
11184 palignr m3, m7, 1 | |
11185 mova m4, m3 | |
11186 mova m5, m3 | |
11187 mova m6, m7 | |
11188 lea r0, [r0 + r1 * 4] | |
11189 PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8 | |
11190 | |
11191 ; Row[24 - 31] | |
11192 movu m7, [r2 - 13] | |
11193 palignr m0, m7, 2 | |
11194 mova m1, m0 | |
11195 mova m2, m0 | |
11196 palignr m3, m7, 1 | |
11197 mova m4, m3 | |
11198 mova m5, m7 | |
11199 mova m6, m7 | |
11200 lea r0, [r0 + r1 * 4] | |
11201 PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0 | |
11202 | |
; next pass: destination base and reference window both move 8 bytes forward
11203 add r6, 8 | |
11204 mov r0, r6 | |
11205 add r2, 8 | |
11206 dec byte [rsp + 63] | |
11207 jnz .loop | |
; restore the caller's stack pointer saved in the prologue
11208 mov rsp, [rsp+64] | |
11209 RET | |
11210 | |
; intra_pred_ang32_23 (SSE4)
; Requests one mmsize of stack through cglobal and names it "above".
; Setup: r3 = r2 + 64, r4 = ang_table + 16 * 16, r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; The first 8-byte column strip is produced by MODE_13_23_ROW0; r3 is then
; reused as a counter for the remaining 3 strips, produced by MODE_13_23.
; Note the reference pointer r2 moves by 7 after the first strip and by 8
; after each later one.
; NOTE(review): how the macros consume r3/"above" is not visible here -
; confirm against the MODE_13_23_ROW0 / MODE_13_23 definitions.
11211 INIT_XMM sse4 | |
11212 cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize) | |
11213 %define above [rsp + 0 * mmsize] | |
11214 lea r3, [r2 + 64] | |
11215 lea r4, [ang_table + 16 * 16] | |
11216 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
11217 mov r6, r0 | |
11218 mova m7, [pw_1024] | |
11219 | |
11220 MODE_13_23_ROW0 0 | |
11221 add r6, 8 | |
11222 mov r0, r6 | |
11223 add r2, 7 | |
; r3 no longer holds r2 + 64 from here on; it is the remaining-strip counter
11224 mov r3, 3 | |
11225 .loop: | |
11226 MODE_13_23 0, 0 | |
11227 add r6, 8 | |
11228 mov r0, r6 | |
11229 add r2, 8 | |
11230 dec r3 | |
11231 jnz .loop | |
11232 RET | |
11233 | |
; intra_pred_ang32_24 (SSE4)
; Requests one mmsize of stack through cglobal and names it "above".
; Setup: r3 = r2 + 64, r4 = ang_table + 16 * 16, r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; The first 8-byte column strip is produced by MODE_12_24_ROW0; r3 is then
; reused as a counter for the remaining 3 strips, produced by MODE_12_24.
; Note the reference pointer r2 moves by 7 after the first strip and by 8
; after each later one.
; NOTE(review): how the macros consume r3/"above" is not visible here -
; confirm against the MODE_12_24_ROW0 / MODE_12_24 definitions.
11234 INIT_XMM sse4 | |
11235 cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize) | |
11236 %define above [rsp + 0 * mmsize] | |
11237 lea r3, [r2 + 64] | |
11238 lea r4, [ang_table + 16 * 16] | |
11239 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
11240 mov r6, r0 | |
11241 mova m7, [pw_1024] | |
11242 | |
11243 MODE_12_24_ROW0 0 | |
11244 add r6, 8 | |
11245 mov r0, r6 | |
11246 add r2, 7 | |
; r3 no longer holds r2 + 64 from here on; it is the remaining-strip counter
11247 mov r3, 3 | |
11248 .loop: | |
11249 MODE_12_24 0 | |
11250 add r6, 8 | |
11251 mov r0, r6 | |
11252 add r2, 8 | |
11253 dec r3 | |
11254 jnz .loop | |
11255 RET | |
11256 | |
; intra_pred_ang32_25 (SSE4)
; Register use visible in this routine:
;   r0 = destination pointer, r1 = destination stride,
;   r2 = reference pixels on entry (rebound to a stack copy before .loop),
;   r3 = c_shuf8_0, r4 = ang_table, r5 = 3 * stride, r6 = column base for r0.
; The stack is realigned to 64 bytes for a scratch reference buffer; the
; caller's rsp lives at [rsp+64] until it is restored before RET, and the
; byte at [rsp+63] counts the 4 loop passes (8 bytes of r0/r2 per pass).
; Only two distinct source windows are used per pass: [r2] for the first two
; row groups and [r2 - 1] for the last two.
; NOTE(review): the m5/m6 preloads are overwritten at the top of .loop before
; PROC32_8x8 is reached; presumably unused by the macro - confirm.
11257 INIT_XMM sse4 | |
11258 cglobal intra_pred_ang32_25, 4,7,8 | |
11259 ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
11260 mov r6, rsp | |
11261 sub rsp, 64+gprsize | |
11262 and rsp, ~63 | |
11263 mov [rsp+64], r6 | |
11264 | |
11265 ; collect reference pixel | |
; The byte at r2+80 is broadcast (pshufb with an all-zero mask) and stored at
; rsp+0; the 48 bytes from r2 are then copied to rsp+1, so only the byte at
; rsp+0 keeps the broadcast value.
11266 movu m0, [r2 + 16 + 64] | |
11267 pxor m1, m1 | |
11268 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] | |
11269 mova [rsp], m0 | |
11270 movu m0, [r2] | |
11271 movu m1, [r2 + 16] | |
11272 movu m2, [r2 + 32] | |
11273 movu [rsp + 1], m0 | |
11274 movu [rsp + 1 + 16], m1 | |
11275 movu [rsp + 1 + 32], m2 | |
11276 mov [rsp + 63], byte 4 | |
11277 | |
11278 ; filter | |
11279 lea r2, [rsp + 1] ; r2 -> [0] | |
11280 lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
11281 lea r4, [ang_table] ; r4 -> ang_table | |
11282 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
11283 lea r6, [r0] ; r6 -> r0 | |
11284 mova m5, [pw_1024] ; m5 -> 1024 | |
11285 mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
11286 | |
11287 .loop: | |
11288 ; Row[0 - 7] | |
; all eight rows of this group read the same window, so m7 is simply copied
11289 movu m7, [r2] | |
11290 mova m0, m7 | |
11291 mova m1, m7 | |
11292 mova m2, m7 | |
11293 mova m3, m7 | |
11294 mova m4, m7 | |
11295 mova m5, m7 | |
11296 mova m6, m7 | |
11297 PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16 | |
11298 | |
11299 ; Row[8 - 15] | |
11300 movu m7, [r2] | |
11301 mova m0, m7 | |
11302 mova m1, m7 | |
11303 mova m2, m7 | |
11304 mova m3, m7 | |
11305 mova m4, m7 | |
11306 mova m5, m7 | |
11307 mova m6, m7 | |
11308 lea r0, [r0 + r1 * 4] | |
11309 PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0 | |
11310 | |
11311 ; Row[16 - 23] | |
; the window moves back by one byte for the lower half of the block
11312 movu m7, [r2 - 1] | |
11313 mova m0, m7 | |
11314 mova m1, m7 | |
11315 mova m2, m7 | |
11316 mova m3, m7 | |
11317 mova m4, m7 | |
11318 mova m5, m7 | |
11319 mova m6, m7 | |
11320 lea r0, [r0 + r1 * 4] | |
11321 PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16 | |
11322 | |
11323 ; Row[24 - 31] | |
11324 movu m7, [r2 - 1] | |
11325 mova m0, m7 | |
11326 mova m1, m7 | |
11327 mova m2, m7 | |
11328 mova m3, m7 | |
11329 mova m4, m7 | |
11330 mova m5, m7 | |
11331 mova m6, m7 | |
11332 lea r0, [r0 + r1 * 4] | |
11333 PROC32_8x8 3, 0, 14,12,10,8,6,4,2,0 | |
11334 | |
; next pass: destination base and reference window both move 8 bytes forward
11335 add r6, 8 | |
11336 mov r0, r6 | |
11337 add r2, 8 | |
11338 dec byte [rsp + 63] | |
11339 jnz .loop | |
; restore the caller's stack pointer saved in the prologue
11340 mov rsp, [rsp+64] | |
11341 RET | |
11342 | |
; intra_pred_ang32_26 (SSE4)
; Copies 16 reference bytes from [r2 + 1] into every one of 32 destination
; rows, in two passes of 16 columns each (r6 = pass counter).
; Register use visible in this routine:
;   r0 = destination pointer, r1 = destination stride, r2 = reference pixels,
;   r3d = copy of the fifth argument (r4d), tested before the "filter" step,
;   r4 = 3 * stride, r5 = running row pointer.
; m8/m9 are not real registers here: they are %defined as two mmsize stack
; slots holding the vectors loaded from r2+64 (byte 0 replaced by [r2]) and
; r2+65 at entry.
; Each group of four rows is now stored exactly once per pass; the row
; pointer is no longer rewound to row 4 half way through, which used to
; re-store rows 4-15 with the same data.
; NOTE(review): when r3d is non-zero the filter step runs on both passes with
; the same m8/m9 while r0 and r2 have advanced by 16, so the second pass
; rewrites column 16 rather than column 0; presumably callers never enable
; the filter for this block size - confirm before relying on it.
11343 INIT_XMM sse4 | |
11344 cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize) | |
11345 %define m8 [rsp + 0 * mmsize] | |
11346 %define m9 [rsp + 1 * mmsize] | |
11347 mov r6, 2 | |
11348 movu m0, [r2 + 64] | |
11349 pinsrb m0, [r2], 0 | |
11350 movu m1, [r2 + 1 + 64] | |
11351 mova m8, m0 | |
11352 mova m9, m1 | |
11353 mov r3d, r4d | |
11354 lea r4, [r1 * 3] | |
11355 | |
11356 .loop: | |
11357 movu m0, [r2 + 1] | |
11358 | |
; rows 0-3
11359 movu [r0], m0 | |
11360 movu [r0 + r1], m0 | |
11361 movu [r0 + r1 * 2], m0 | |
11362 movu [r0 + r4], m0 | |
; rows 4-7
11363 lea r5, [r0 + r1 * 4] | |
11364 movu [r5], m0 | |
11365 movu [r5 + r1], m0 | |
11366 movu [r5 + r1 * 2], m0 | |
11367 movu [r5 + r4], m0 | |
; rows 8-11
11368 lea r5, [r5 + r1 * 4] | |
11369 movu [r5], m0 | |
11370 movu [r5 + r1], m0 | |
11371 movu [r5 + r1 * 2], m0 | |
11372 movu [r5 + r4], m0 | |
; rows 12-15
11373 lea r5, [r5 + r1 * 4] | |
11374 movu [r5], m0 | |
11375 movu [r5 + r1], m0 | |
11376 movu [r5 + r1 * 2], m0 | |
11377 movu [r5 + r4], m0 | |
; rows 16-19
11393 lea r5, [r5 + r1 * 4] | |
11394 movu [r5], m0 | |
11395 movu [r5 + r1], m0 | |
11396 movu [r5 + r1 * 2], m0 | |
11397 movu [r5 + r4], m0 | |
; rows 20-23
11398 lea r5, [r5 + r1 * 4] | |
11399 movu [r5], m0 | |
11400 movu [r5 + r1], m0 | |
11401 movu [r5 + r1 * 2], m0 | |
11402 movu [r5 + r4], m0 | |
; rows 24-27
11403 lea r5, [r5 + r1 * 4] | |
11404 movu [r5], m0 | |
11405 movu [r5 + r1], m0 | |
11406 movu [r5 + r1 * 2], m0 | |
11407 movu [r5 + r4], m0 | |
; rows 28-31
11408 lea r5, [r5 + r1 * 4] | |
11409 movu [r5], m0 | |
11410 movu [r5 + r1], m0 | |
11411 movu [r5 + r1 * 2], m0 | |
11412 movu [r5 + r4], m0 | |
11413 | |
11414 ; filter | |
; skipped entirely when the saved fifth argument is zero
11415 test r3d, r3d | |
11416 jz .quit | |
11417 | |
; m0/m1 = byte 0 of the row vector, broadcast and widened to words
11418 pxor m4, m4 | |
11419 pshufb m0, m4 | |
11420 pmovzxbw m0, m0 | |
11421 mova m1, m0 | |
11422 movu m2, m8 | |
11423 movu m3, m9 | |
11424 | |
; m2 = byte 0 of the m8 slot, broadcast and widened; m3/m4 = low/high eight
; bytes of the m9 slot widened to words
11425 pshufb m2, m4 | |
11426 pmovzxbw m2, m2 | |
11427 movhlps m4, m3 | |
11428 pmovzxbw m3, m3 | |
11429 pmovzxbw m4, m4 | |
; out = saturate_u8(m0 + ((m9[i] - m8[0]) >> 1)) for 16 values
11430 psubw m3, m2 | |
11431 psubw m4, m2 | |
11432 psraw m3, 1 | |
11433 psraw m4, 1 | |
11434 paddw m0, m3 | |
11435 paddw m1, m4 | |
11436 packuswb m0, m1 | |
11437 | |
; one result byte goes to the first column of each of rows 0-15 at r0
11438 pextrb [r0], m0, 0 | |
11439 pextrb [r0 + r1], m0, 1 | |
11440 pextrb [r0 + r1 * 2], m0, 2 | |
11441 pextrb [r0 + r4], m0, 3 | |
11442 lea r5, [r0 + r1 * 4] | |
11443 pextrb [r5], m0, 4 | |
11444 pextrb [r5 + r1], m0, 5 | |
11445 pextrb [r5 + r1 * 2], m0, 6 | |
11446 pextrb [r5 + r4], m0, 7 | |
11447 lea r5, [r5 + r1 * 4] | |
11448 pextrb [r5], m0, 8 | |
11449 pextrb [r5 + r1], m0, 9 | |
11450 pextrb [r5 + r1 * 2], m0, 10 | |
11451 pextrb [r5 + r4], m0, 11 | |
11452 lea r5, [r5 + r1 * 4] | |
11453 pextrb [r5], m0, 12 | |
11454 pextrb [r5 + r1], m0, 13 | |
11455 pextrb [r5 + r1 * 2], m0, 14 | |
11456 pextrb [r5 + r4], m0, 15 | |
11457 | |
11458 .quit: | |
; advance source and destination by 16 columns for the second pass
11459 lea r2, [r2 + 16] | |
11460 add r0, 16 | |
11461 dec r6d | |
11462 jnz .loop | |
11463 RET | |
11464 | |
11465 INIT_XMM sse4 | |
; intra_pred_ang32_27 (SSE4): thin driver around the MODE_9_27 macro.
; Setup: r3 = ang_table + 16 * 16, r4 = pass counter (4), r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; After each macro invocation the destination base and r2 move 8 bytes on.
; NOTE(review): which of these registers MODE_9_27 consumes or clobbers is
; not visible here - confirm against the macro definition.
11466 cglobal intra_pred_ang32_27, 3,7,8 | |
11467 lea r3, [ang_table + 16 * 16] | |
11468 mov r4d, 4 | |
11469 lea r5, [r1 * 3] | |
11470 mov r6, r0 | |
11471 mova m7, [pw_1024] | |
11472 .loop: | |
11473 MODE_9_27 0 | |
11474 add r6, 8 | |
11475 mov r0, r6 | |
11476 add r2, 8 | |
11477 dec r4 | |
11478 jnz .loop | |
11479 RET | |
11480 | |
11481 INIT_XMM sse4 | |
; intra_pred_ang32_28 (SSE4): thin driver around the MODE_8_28 macro.
; Setup: r3 = ang_table + 16 * 16, r4 = pass counter (4), r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; After each macro invocation the destination base and r2 move 8 bytes on.
; NOTE(review): which of these registers MODE_8_28 consumes or clobbers is
; not visible here - confirm against the macro definition.
11482 cglobal intra_pred_ang32_28, 3,7,8 | |
11483 lea r3, [ang_table + 16 * 16] | |
11484 mov r4d, 4 | |
11485 lea r5, [r1 * 3] | |
11486 mov r6, r0 | |
11487 mova m7, [pw_1024] | |
11488 .loop: | |
11489 MODE_8_28 0 | |
11490 add r6, 8 | |
11491 mov r0, r6 | |
11492 add r2, 8 | |
11493 dec r4 | |
11494 jnz .loop | |
11495 RET | |
11496 | |
11497 INIT_XMM sse4 | |
; intra_pred_ang32_29 (SSE4): thin driver around the MODE_7_29 macro.
; Setup: r3 = ang_table + 16 * 16, r4 = pass counter (4), r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; After each macro invocation the destination base and r2 move 8 bytes on.
; NOTE(review): which of these registers MODE_7_29 consumes or clobbers is
; not visible here - confirm against the macro definition.
11498 cglobal intra_pred_ang32_29, 3,7,8 | |
11499 lea r3, [ang_table + 16 * 16] | |
11500 mov r4d, 4 | |
11501 lea r5, [r1 * 3] | |
11502 mov r6, r0 | |
11503 mova m7, [pw_1024] | |
11504 .loop: | |
11505 MODE_7_29 0 | |
11506 add r6, 8 | |
11507 mov r0, r6 | |
11508 add r2, 8 | |
11509 dec r4 | |
11510 jnz .loop | |
11511 RET | |
11512 | |
11513 INIT_XMM sse4 | |
; intra_pred_ang32_30 (SSE4): thin driver around the MODE_6_30 macro.
; Setup: r3 = ang_table + 16 * 16, r4 = pass counter (4), r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; After each macro invocation the destination base and r2 move 8 bytes on.
; NOTE(review): which of these registers MODE_6_30 consumes or clobbers is
; not visible here - confirm against the macro definition.
11514 cglobal intra_pred_ang32_30, 3,7,8 | |
11515 lea r3, [ang_table + 16 * 16] | |
11516 mov r4d, 4 | |
11517 lea r5, [r1 * 3] | |
11518 mov r6, r0 | |
11519 mova m7, [pw_1024] | |
11520 .loop: | |
11521 MODE_6_30 0 | |
11522 add r6, 8 | |
11523 mov r0, r6 | |
11524 add r2, 8 | |
11525 dec r4 | |
11526 jnz .loop | |
11527 RET | |
11528 | |
11529 INIT_XMM sse4 | |
; intra_pred_ang32_31 (SSE4): thin driver around the MODE_5_31 macro.
; Setup: r3 = ang_table + 16 * 16, r4 = pass counter (4), r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; After each macro invocation the destination base and r2 move 8 bytes on.
; NOTE(review): which of these registers MODE_5_31 consumes or clobbers is
; not visible here - confirm against the macro definition.
11530 cglobal intra_pred_ang32_31, 3,7,8 | |
11531 lea r3, [ang_table + 16 * 16] | |
11532 mov r4d, 4 | |
11533 lea r5, [r1 * 3] | |
11534 mov r6, r0 | |
11535 mova m7, [pw_1024] | |
11536 .loop: | |
11537 MODE_5_31 0 | |
11538 add r6, 8 | |
11539 mov r0, r6 | |
11540 add r2, 8 | |
11541 dec r4 | |
11542 jnz .loop | |
11543 RET | |
11544 | |
11545 INIT_XMM sse4 | |
; intra_pred_ang32_32 (SSE4): thin driver around the MODE_4_32 macro.
; Setup: r3 = ang_table + 16 * 16, r4 = pass counter (4), r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; After each macro invocation the destination base and r2 move 8 bytes on.
; NOTE(review): which of these registers MODE_4_32 consumes or clobbers is
; not visible here - confirm against the macro definition.
11546 cglobal intra_pred_ang32_32, 3,7,8 | |
11547 lea r3, [ang_table + 16 * 16] | |
11548 mov r4d, 4 | |
11549 lea r5, [r1 * 3] | |
11550 mov r6, r0 | |
11551 mova m7, [pw_1024] | |
11552 .loop: | |
11553 MODE_4_32 0 | |
11554 add r6, 8 | |
11555 mov r0, r6 | |
11556 add r2, 8 | |
11557 dec r4 | |
11558 jnz .loop | |
11559 RET | |
11560 | |
11561 INIT_XMM sse4 | |
; intra_pred_ang32_33 (SSE4): thin driver around the MODE_3_33 macro.
; Setup: r3 = ang_table + 16 * 16, r4 = pass counter (4), r5 = 3 * stride,
; r6 = copy of r0 (column base), m7 = pw_1024.
; After each macro invocation the destination base and r2 move 8 bytes on.
; NOTE(review): which of these registers MODE_3_33 consumes or clobbers is
; not visible here - confirm against the macro definition.
11562 cglobal intra_pred_ang32_33, 3,7,8 | |
11563 lea r3, [ang_table + 16 * 16] | |
11564 mov r4d, 4 | |
11565 lea r5, [r1 * 3] | |
11566 mov r6, r0 | |
11567 mova m7, [pw_1024] | |
11568 .loop: | |
11569 MODE_3_33 0 | |
11570 add r6, 8 | |
11571 mov r0, r6 | |
11572 add r2, 8 | |
11573 dec r4 | |
11574 jnz .loop | |
11575 RET | |
11576 | |
11577 ;----------------------------------------------------------------------------------------- | |
11578 ; start of intra_pred_ang32 angular modes avx2 asm | |
11579 ;----------------------------------------------------------------------------------------- | |
11580 | |
11581 %if ARCH_X86_64 == 1 | |
11582 INIT_YMM avx2 | |
11583 | |
; TRANSPOSE_32x8_AVX2: store eight 32-byte vectors either transposed or
; as-is, selected by the zero flag that is live when the macro starts.
;   ZF set   (jnz not taken): the eight vectors are byte-transposed and
;            written as 8-byte pieces to 32 destination rows.
;   ZF clear (jnz taken):     the eight vectors are written unchanged as
;            eight consecutive 32-byte rows.
; The macro does not set the flags itself; the helper routines that use it
; execute "test r7d, r7d" on entry and nothing in the transposing path
; (SIMD ops, mov, lea, jmp) modifies the flags, so a second expansion in the
; same routine sees the same ZF.
; Registers expected from the surrounding code: r0 = destination,
; r1 = stride, r5 = 3 * stride, r6 = step applied to r0 between groups of
; four rows (the callers in this file load it with 4 * stride),
; r4 = destination base used to reposition r0 after the transposed store.
11584 ; register mapping : | |
11585 ; %1-%8 - output registers | |
11586 ; %9 - temp register | |
11587 ; %10 - for label naming | |
11588 %macro TRANSPOSE_32x8_AVX2 10 | |
11589 jnz .skip%10 | |
11590 | |
11591 ; transpose 8x32 to 32x8 and then store | |
; stage 1: interleave bytes of neighbouring vectors
11592 punpcklbw m%9, m%1, m%2 | |
11593 punpckhbw m%1, m%2 | |
11594 punpcklbw m%2, m%3, m%4 | |
11595 punpckhbw m%3, m%4 | |
11596 punpcklbw m%4, m%5, m%6 | |
11597 punpckhbw m%5, m%6 | |
11598 punpcklbw m%6, m%7, m%8 | |
11599 punpckhbw m%7, m%8 | |
11600 | |
; stage 2: interleave 16-bit pairs
11601 punpcklwd m%8, m%9, m%2 | |
11602 punpckhwd m%9, m%2 | |
11603 punpcklwd m%2, m%4, m%6 | |
11604 punpckhwd m%4, m%6 | |
11605 punpcklwd m%6, m%1, m%3 | |
11606 punpckhwd m%1, m%3 | |
11607 punpcklwd m%3, m%5, m%7 | |
11608 punpckhwd m%5, m%7 | |
11609 | |
; stage 3: interleave 32-bit groups; every 64-bit half now holds one
; 8-byte output row
11610 punpckldq m%7, m%8, m%2 | |
11611 punpckhdq m%8, m%2 | |
11612 punpckldq m%2, m%6, m%3 | |
11613 punpckhdq m%6, m%3 | |
11614 punpckldq m%3, m%9, m%4 | |
11615 punpckhdq m%9, m%4 | |
11616 punpckldq m%4, m%1, m%5 | |
11617 punpckhdq m%1, m%5 | |
11618 | |
; store the 16 rows held in the low 128-bit lanes, four rows per group
11619 movq [r0 + r1 * 0], xm%7 | |
11620 movhps [r0 + r1 * 1], xm%7 | |
11621 movq [r0 + r1 * 2], xm%8 | |
11622 movhps [r0 + r5 * 1], xm%8 | |
11623 | |
11624 lea r0, [r0 + r6] | |
11625 | |
11626 movq [r0 + r1 * 0], xm%3 | |
11627 movhps [r0 + r1 * 1], xm%3 | |
11628 movq [r0 + r1 * 2], xm%9 | |
11629 movhps [r0 + r5 * 1], xm%9 | |
11630 | |
11631 lea r0, [r0 + r6] | |
11632 | |
11633 movq [r0 + r1 * 0], xm%2 | |
11634 movhps [r0 + r1 * 1], xm%2 | |
11635 movq [r0 + r1 * 2], xm%6 | |
11636 movhps [r0 + r5 * 1], xm%6 | |
11637 | |
11638 lea r0, [r0 + r6] | |
11639 | |
11640 movq [r0 + r1 * 0], xm%4 | |
11641 movhps [r0 + r1 * 1], xm%4 | |
11642 movq [r0 + r1 * 2], xm%1 | |
11643 movhps [r0 + r5 * 1], xm%1 | |
11644 | |
11645 lea r0, [r0 + r6] | |
11646 | |
; vpermq with 00001110b copies the upper 128-bit lane into the lower one so
; the same xmm store sequence can emit the remaining 16 rows
11647 vpermq m%8, m%8, 00001110b | |
11648 vpermq m%7, m%7, 00001110b | |
11649 vpermq m%6, m%6, 00001110b | |
11650 vpermq m%3, m%3, 00001110b | |
11651 vpermq m%9, m%9, 00001110b | |
11652 vpermq m%2, m%2, 00001110b | |
11653 vpermq m%4, m%4, 00001110b | |
11654 vpermq m%1, m%1, 00001110b | |
11655 | |
11656 movq [r0 + r1 * 0], xm%7 | |
11657 movhps [r0 + r1 * 1], xm%7 | |
11658 movq [r0 + r1 * 2], xm%8 | |
11659 movhps [r0 + r5 * 1], xm%8 | |
11660 | |
11661 lea r0, [r0 + r6] | |
11662 | |
11663 movq [r0 + r1 * 0], xm%3 | |
11664 movhps [r0 + r1 * 1], xm%3 | |
11665 movq [r0 + r1 * 2], xm%9 | |
11666 movhps [r0 + r5 * 1], xm%9 | |
11667 | |
11668 lea r0, [r0 + r6] | |
11669 | |
11670 movq [r0 + r1 * 0], xm%2 | |
11671 movhps [r0 + r1 * 1], xm%2 | |
11672 movq [r0 + r1 * 2], xm%6 | |
11673 movhps [r0 + r5 * 1], xm%6 | |
11674 | |
11675 lea r0, [r0 + r6] | |
11676 | |
11677 movq [r0 + r1 * 0], xm%4 | |
11678 movhps [r0 + r1 * 1], xm%4 | |
11679 movq [r0 + r1 * 2], xm%1 | |
11680 movhps [r0 + r5 * 1], xm%1 | |
11681 | |
; reposition r0 to the destination base plus 8 bytes (next 8-column strip);
; r4 itself is left untouched
11682 lea r0, [r4 + 8] | |
11683 jmp .end%10 | |
11684 .skip%10: | |
; non-transposed path: eight full-width rows, r0 ends 8 rows further down
11685 movu [r0 + r1 * 0], m%1 | |
11686 movu [r0 + r1 * 1], m%2 | |
11687 movu [r0 + r1 * 2], m%3 | |
11688 movu [r0 + r5 * 1], m%4 | |
11689 | |
11690 lea r0, [r0 + r6] | |
11691 | |
11692 movu [r0 + r1 * 0], m%5 | |
11693 movu [r0 + r1 * 1], m%6 | |
11694 movu [r0 + r1 * 2], m%7 | |
11695 movu [r0 + r5 * 1], m%8 | |
11696 | |
11697 lea r0, [r0 + r6] | |
11698 .end%10: | |
11699 %endmacro | |
11700 | |
; ang32_mode_3_33_row_0_15: shared AVX2 helper for the mode 3 / mode 33
; entry points; computes 16 interpolated 32-byte vectors in two batches of
; eight and hands each batch to TRANSPOSE_32x8_AVX2.
; Declared without an argument list and ended with a plain "ret", so every
; register must be prepared by the caller:
;   r0 = destination, r1 = stride, r2 = reference pixels,
;   r3 = ang_table_avx2 + 32 * 16 (so [r3 + k * 32] is table entry 16 + k;
;        the bracketed numbers in the comments below are those entry indices),
;   r4 = destination base (used only by the transposed store path),
;   r5 = 3 * stride, r6 = 4 * stride, m7 = pw_1024,
;   r7d = 0 selects the transposed store, non-zero the direct store.
; The flags produced by "test r7d, r7d" stay live until the jnz inside both
; macro expansions; no instruction in between modifies them.
; Each vector is built as pmaddubsw (pixel pairs x table entry), pmulhrsw by
; m7, then packuswb of the two halves.
11701 cglobal ang32_mode_3_33_row_0_15 | |
11702 test r7d, r7d | |
11703 ; rows 0 to 7 | |
11704 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11705 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
11706 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
11707 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
11708 | |
11709 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
11710 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
11711 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
11712 | |
11713 pmaddubsw m4, m0, [r3 + 10 * 32] ; [26] | |
11714 pmulhrsw m4, m7 | |
11715 pmaddubsw m1, m2, [r3 + 10 * 32] | |
11716 pmulhrsw m1, m7 | |
11717 packuswb m4, m1 | |
11718 | |
; from here on palignr by 2*n bytes slides the interleaved pixel pairs
; forward by n reference positions
11719 palignr m5, m2, m0, 2 | |
11720 palignr m1, m3, m2, 2 | |
11721 pmaddubsw m5, [r3 + 4 * 32] ; [20] | |
11722 pmulhrsw m5, m7 | |
11723 pmaddubsw m1, [r3 + 4 * 32] | |
11724 pmulhrsw m1, m7 | |
11725 packuswb m5, m1 | |
11726 | |
11727 palignr m6, m2, m0, 4 | |
11728 palignr m1, m3, m2, 4 | |
11729 pmaddubsw m6, [r3 - 2 * 32] ; [14] | |
11730 pmulhrsw m6, m7 | |
11731 pmaddubsw m1, [r3 - 2 * 32] | |
11732 pmulhrsw m1, m7 | |
11733 packuswb m6, m1 | |
11734 | |
11735 palignr m8, m2, m0, 6 | |
11736 palignr m1, m3, m2, 6 | |
11737 pmaddubsw m8, [r3 - 8 * 32] ; [8] | |
11738 pmulhrsw m8, m7 | |
11739 pmaddubsw m1, [r3 - 8 * 32] | |
11740 pmulhrsw m1, m7 | |
11741 packuswb m8, m1 | |
11742 | |
; m10/m11 keep the shifted pixel pairs because two consecutive vectors
; ([2] and [28]) are derived from the same position
11743 palignr m10, m2, m0, 8 | |
11744 palignr m11, m3, m2, 8 | |
11745 pmaddubsw m9, m10, [r3 - 14 * 32] ; [2] | |
11746 pmulhrsw m9, m7 | |
11747 pmaddubsw m1, m11, [r3 - 14 * 32] | |
11748 pmulhrsw m1, m7 | |
11749 packuswb m9, m1 | |
11750 | |
11751 pmaddubsw m10, [r3 + 12 * 32] ; [28] | |
11752 pmulhrsw m10, m7 | |
11753 pmaddubsw m11, [r3 + 12 * 32] | |
11754 pmulhrsw m11, m7 | |
11755 packuswb m10, m11 | |
11756 | |
11757 palignr m11, m2, m0, 10 | |
11758 palignr m1, m3, m2, 10 | |
11759 pmaddubsw m11, [r3 + 6 * 32] ; [22] | |
11760 pmulhrsw m11, m7 | |
11761 pmaddubsw m1, [r3 + 6 * 32] | |
11762 pmulhrsw m1, m7 | |
11763 packuswb m11, m1 | |
11764 | |
11765 palignr m12, m2, m0, 12 | |
11766 palignr m1, m3, m2, 12 | |
11767 pmaddubsw m12, [r3] ; [16] | |
11768 pmulhrsw m12, m7 | |
11769 pmaddubsw m1, [r3] | |
11770 pmulhrsw m1, m7 | |
11771 packuswb m12, m1 | |
11772 | |
11773 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
11774 | |
11775 ; rows 8 to 15 | |
11776 palignr m4, m2, m0, 14 | |
11777 palignr m1, m3, m2, 14 | |
11778 pmaddubsw m4, [r3 - 6 * 32] ; [10] | |
11779 pmulhrsw m4, m7 | |
11780 pmaddubsw m1, [r3 - 6 * 32] | |
11781 pmulhrsw m1, m7 | |
11782 packuswb m4, m1 | |
11783 | |
11784 pmaddubsw m5, m2, [r3 - 12 * 32] ; [4] | |
11785 pmulhrsw m5, m7 | |
11786 pmaddubsw m1, m3, [r3 - 12 * 32] | |
11787 pmulhrsw m1, m7 | |
11788 packuswb m5, m1 | |
11789 | |
11790 pmaddubsw m6, m2, [r3 + 14 * 32] ; [30] | |
11791 pmulhrsw m6, m7 | |
11792 pmaddubsw m1, m3, [r3 + 14 * 32] | |
11793 pmulhrsw m1, m7 | |
11794 packuswb m6, m1 | |
11795 | |
; m0 is recycled: it now holds the interleaved pairs starting at r2 + 25,
; i.e. the continuation after m3
11796 movu m0, [r2 + 25] | |
11797 movu m1, [r2 + 26] | |
11798 punpcklbw m0, m1 | |
11799 | |
11800 palignr m8, m3, m2, 2 | |
11801 palignr m1, m0, m3, 2 | |
11802 pmaddubsw m8, [r3 + 8 * 32] ; [24] | |
11803 pmulhrsw m8, m7 | |
11804 pmaddubsw m1, [r3 + 8 * 32] | |
11805 pmulhrsw m1, m7 | |
11806 packuswb m8, m1 | |
11807 | |
11808 palignr m9, m3, m2, 4 | |
11809 palignr m1, m0, m3, 4 | |
11810 pmaddubsw m9, [r3 + 2 * 32] ; [18] | |
11811 pmulhrsw m9, m7 | |
11812 pmaddubsw m1, [r3 + 2 * 32] | |
11813 pmulhrsw m1, m7 | |
11814 packuswb m9, m1 | |
11815 | |
11816 palignr m10, m3, m2, 6 | |
11817 palignr m1, m0, m3, 6 | |
11818 pmaddubsw m10, [r3 - 4 * 32] ; [12] | |
11819 pmulhrsw m10, m7 | |
11820 pmaddubsw m1, [r3 - 4 * 32] | |
11821 pmulhrsw m1, m7 | |
11822 packuswb m10, m1 | |
11823 | |
11824 palignr m11, m3, m2, 8 | |
11825 palignr m1, m0, m3, 8 | |
11826 pmaddubsw m11, [r3 - 10 * 32] ; [6] | |
11827 pmulhrsw m11, m7 | |
11828 pmaddubsw m1, [r3 - 10 * 32] | |
11829 pmulhrsw m1, m7 | |
11830 packuswb m11, m1 | |
11831 | |
; the last vector of the batch is a plain copy of 32 reference bytes from
; r2 + 14; no table multiply is applied to it
11832 movu m12, [r2 + 14] | |
11833 | |
11834 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8 | |
11835 ret | |
11836 | |
; intra_pred_ang32_3 (AVX2): entry point that drives
; ang32_mode_3_33_row_0_15 twice with r7d = 0, i.e. through the transposed
; store path of TRANSPOSE_32x8_AVX2.
; r2 is advanced by 64 bytes before use.
; NOTE(review): presumably this selects the second neighbour array inside the
; reference buffer - confirm against the caller's buffer layout.
; Between the two helper calls the destination base moves 16 bytes to the
; right and the reference pointer moves on by 13.
11837 INIT_YMM avx2 | |
11838 cglobal intra_pred_ang32_3, 3,8,13 | |
11839 add r2, 64 | |
11840 lea r3, [ang_table_avx2 + 32 * 16] | |
11841 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
11842 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
11843 mova m7, [pw_1024] | |
; r4 = destination base; the helper's store macro resets r0 relative to it
11844 mov r4, r0 | |
; r7d = 0 -> the helper's "test r7d, r7d" sets ZF -> transposed store
11845 xor r7d, r7d | |
11846 | |
11847 call ang32_mode_3_33_row_0_15 | |
11848 | |
11849 add r4, 16 | |
11850 mov r0, r4 | |
11851 add r2, 13 | |
11852 | |
11853 call ang32_mode_3_33_row_0_15 | |
11854 RET | |
11855 | |
; intra_pred_ang32_33 (AVX2): entry point that drives
; ang32_mode_3_33_row_0_15 twice with r7d = 1, i.e. through the direct
; (non-transposed) store path of TRANSPOSE_32x8_AVX2.
; r4 is not initialised here; the direct store path does not read it, and r0
; simply keeps advancing inside the helper. Between the two helper calls
; only the reference pointer is moved (by 13).
11856 INIT_YMM avx2 | |
11857 cglobal intra_pred_ang32_33, 3,8,13 | |
11858 lea r3, [ang_table_avx2 + 32 * 16] | |
11859 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
11860 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
11861 mova m7, [pw_1024] | |
; r7d = 1 -> the helper's "test r7d, r7d" clears ZF -> direct row store
11862 xor r7d, r7d | |
11863 inc r7d | |
11864 | |
11865 call ang32_mode_3_33_row_0_15 | |
11866 | |
11867 add r2, 13 | |
11868 | |
11869 call ang32_mode_3_33_row_0_15 | |
11870 RET | |
11871 | |
; ang32_mode_4_32_row_0_15: AVX2 helper that computes 16 interpolated
; 32-byte vectors in two batches of eight and hands each batch to
; TRANSPOSE_32x8_AVX2.
; Declared without an argument list and ended with a plain "ret", so every
; register must be prepared by the caller.
; Registers read below: r0 = destination, r1 = stride, r2 = reference pixels,
; r3 = table pointer indexed as [r3 + k * 32], r5/r6 = row steps used by the
; store macro, m7 = pmulhrsw multiplier, r7d = store path selector
; (0 -> transposed store, which also reads r4; non-zero -> direct store).
; NOTE(review): presumably r3 = ang_table_avx2 + 32 * 16, r5 = 3 * stride,
; r6 = 4 * stride and m7 = pw_1024 as in the mode 3/33 entry points - the
; callers of this helper are not visible here, confirm. Under that
; assumption the bracketed numbers below are table entry indices (16 + k).
; The flags produced by "test r7d, r7d" stay live until the jnz inside both
; macro expansions; no instruction in between modifies them.
11872 cglobal ang32_mode_4_32_row_0_15 | |
11873 test r7d, r7d | |
11874 ; rows 0 to 7 | |
11875 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
11876 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
11877 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
11878 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
11879 | |
11880 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
11881 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
11882 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
11883 | |
11884 pmaddubsw m4, m0, [r3 + 5 * 32] ; [21] | |
11885 pmulhrsw m4, m7 | |
11886 pmaddubsw m1, m2, [r3 + 5 * 32] | |
11887 pmulhrsw m1, m7 | |
11888 packuswb m4, m1 | |
11889 | |
; m6/m1 keep the pixel pairs shifted by one position because two
; consecutive vectors ([10] and [31]) are derived from it
11890 palignr m6, m2, m0, 2 | |
11891 palignr m1, m3, m2, 2 | |
11892 pmaddubsw m5, m6, [r3 - 6 * 32] ; [10] | |
11893 pmulhrsw m5, m7 | |
11894 pmaddubsw m8, m1, [r3 - 6 * 32] | |
11895 pmulhrsw m8, m7 | |
11896 packuswb m5, m8 | |
11897 | |
11898 pmaddubsw m6, [r3 + 15 * 32] ; [31] | |
11899 pmulhrsw m6, m7 | |
11900 pmaddubsw m1, [r3 + 15 * 32] | |
11901 pmulhrsw m1, m7 | |
11902 packuswb m6, m1 | |
11903 | |
11904 palignr m8, m2, m0, 4 | |
11905 palignr m1, m3, m2, 4 | |
11906 pmaddubsw m8, [r3 + 4 * 32] ; [20] | |
11907 pmulhrsw m8, m7 | |
11908 pmaddubsw m1, [r3 + 4 * 32] | |
11909 pmulhrsw m1, m7 | |
11910 packuswb m8, m1 | |
11911 | |
11912 palignr m10, m2, m0, 6 | |
11913 palignr m11, m3, m2, 6 | |
11914 pmaddubsw m9, m10, [r3 - 7 * 32] ; [9] | |
11915 pmulhrsw m9, m7 | |
11916 pmaddubsw m1, m11, [r3 - 7 * 32] | |
11917 pmulhrsw m1, m7 | |
11918 packuswb m9, m1 | |
11919 | |
11920 pmaddubsw m10, [r3 + 14 * 32] ; [30] | |
11921 pmulhrsw m10, m7 | |
11922 pmaddubsw m11, [r3 + 14 * 32] | |
11923 pmulhrsw m11, m7 | |
11924 packuswb m10, m11 | |
11925 | |
11926 palignr m11, m2, m0, 8 | |
11927 palignr m1, m3, m2, 8 | |
11928 pmaddubsw m11, [r3 + 3 * 32] ; [19] | |
11929 pmulhrsw m11, m7 | |
11930 pmaddubsw m1, [r3 + 3 * 32] | |
11931 pmulhrsw m1, m7 | |
11932 packuswb m11, m1 | |
11933 | |
11934 palignr m12, m2, m0, 10 | |
11935 palignr m1, m3, m2, 10 | |
11936 pmaddubsw m12, [r3 - 8 * 32] ; [8] | |
11937 pmulhrsw m12, m7 | |
11938 pmaddubsw m1, [r3 - 8 * 32] | |
11939 pmulhrsw m1, m7 | |
11940 packuswb m12, m1 | |
11941 | |
11942 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
11943 | |
11944 ; rows 8 to 15 | |
; the first vector of this batch reuses the same 10-byte shift as the last
; vector of the previous batch, with a different table entry
11945 palignr m4, m2, m0, 10 | |
11946 palignr m1, m3, m2, 10 | |
11947 pmaddubsw m4, [r3 + 13 * 32] ; [29] | |
11948 pmulhrsw m4, m7 | |
11949 pmaddubsw m1, [r3 + 13 * 32] | |
11950 pmulhrsw m1, m7 | |
11951 packuswb m4, m1 | |
11952 | |
11953 palignr m5, m2, m0, 12 | |
11954 palignr m1, m3, m2, 12 | |
11955 pmaddubsw m5, [r3 + 2 * 32] ; [18] | |
11956 pmulhrsw m5, m7 | |
11957 pmaddubsw m1, [r3 + 2 * 32] | |
11958 pmulhrsw m1, m7 | |
11959 packuswb m5, m1 | |
11960 | |
11961 palignr m8, m2, m0, 14 | |
11962 palignr m1, m3, m2, 14 | |
11963 pmaddubsw m6, m8, [r3 - 9 * 32] ; [7] | |
11964 pmulhrsw m6, m7 | |
11965 pmaddubsw m9, m1, [r3 - 9 * 32] | |
11966 pmulhrsw m9, m7 | |
11967 packuswb m6, m9 | |
11968 | |
11969 pmaddubsw m8, [r3 + 12 * 32] ; [28] | |
11970 pmulhrsw m8, m7 | |
11971 pmaddubsw m1, [r3 + 12 * 32] | |
11972 pmulhrsw m1, m7 | |
11973 packuswb m8, m1 | |
11974 | |
11975 pmaddubsw m9, m2, [r3 + 1 * 32] ; [17] | |
11976 pmulhrsw m9, m7 | |
11977 pmaddubsw m1, m3, [r3 + 1 * 32] | |
11978 pmulhrsw m1, m7 | |
11979 packuswb m9, m1 | |
11980 | |
; m0 is recycled: it now holds the interleaved pairs starting at r2 + 25,
; i.e. the continuation after m3
11981 movu m0, [r2 + 25] | |
11982 movu m1, [r2 + 26] | |
11983 punpcklbw m0, m1 | |
11984 | |
11985 palignr m11, m3, m2, 2 | |
11986 palignr m1, m0, m3, 2 | |
11987 pmaddubsw m10, m11, [r3 - 10 * 32] ; [6] | |
11988 pmulhrsw m10, m7 | |
11989 pmaddubsw m12, m1, [r3 - 10 * 32] | |
11990 pmulhrsw m12, m7 | |
11991 packuswb m10, m12 | |
11992 | |
11993 pmaddubsw m11, [r3 + 11 * 32] ; [27] | |
11994 pmulhrsw m11, m7 | |
11995 pmaddubsw m1, [r3 + 11 * 32] | |
11996 pmulhrsw m1, m7 | |
11997 packuswb m11, m1 | |
11998 | |
; the final vector is built in place in m3/m0 (two-operand palignr), which
; is why the macro below receives m3 as its eighth register and m0 as temp
11999 palignr m0, m3, 4 | |
12000 palignr m3, m2, 4 | |
12001 pmaddubsw m3, [r3] ; [16] | |
12002 pmulhrsw m3, m7 | |
12003 pmaddubsw m0, [r3] | |
12004 pmulhrsw m0, m7 | |
12005 packuswb m3, m0 | |
12006 | |
12007 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 3, 0, 8 | |
12008 ret | |
12009 | |
;-----------------------------------------------------------------------
; ang32_mode_4_32_row_16_31
; Second-half helper for intra_pred_ang32_4 / intra_pred_ang32_32: builds
; 16 result vectors of 32 bytes (two groups of 8) and passes each group
; to TRANSPOSE_32x8_AVX2 (macro defined outside this excerpt).
;
; Expected on entry, as set up by the two callers in this file:
;   r2  = reference pointer, already advanced by 11 bytes by the caller
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table row k + 16;
;         the bracketed number on each first pmaddubsw is that row index
;   m7  = [pw_1024]
;   r7d = 0 when called for mode 4, 1 when called for mode 32
;   r0/r1/r4/r5/r6 are not touched by the instructions written here;
;   presumably the transpose/store macro uses them - confirm.
;
; Per-vector recipe: pmaddubsw weights adjacent byte pairs with one table
; row, pmulhrsw by m7 rescales with rounding, packuswb narrows to bytes.
; palignr by 2 * n bytes moves the pair window n reference bytes forward.
; NOTE(review): the "rows 0 to 7" / "rows 8 to 15" labels below look
; relative to this routine's own 16 outputs, not to the whole block.
;-----------------------------------------------------------------------
12010 cglobal ang32_mode_4_32_row_16_31 | |
; Only sets flags; no instruction visible in this routine reads them and
; the vector ops below leave EFLAGS alone. Presumably the transpose macro
; branches on ZF - confirm against its definition.
12011 test r7d, r7d | |
12012 ; rows 0 to 7 | |
12013 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12014 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
12015 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
12016 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
12017 | |
; Interleave each byte with its successor so pmaddubsw sees (p[i], p[i+1]).
12018 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
12019 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
12020 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
12021 | |
12022 pmaddubsw m4, m0, [r3 - 11 * 32] ; [5] | |
12023 pmulhrsw m4, m7 | |
12024 pmaddubsw m1, m2, [r3 - 11 * 32] | |
12025 pmulhrsw m1, m7 | |
12026 packuswb m4, m1 | |
12027 | |
12028 pmaddubsw m5, m0, [r3 + 10 * 32] ; [26] | |
12029 pmulhrsw m5, m7 | |
12030 pmaddubsw m1, m2, [r3 + 10 * 32] | |
12031 pmulhrsw m1, m7 | |
12032 packuswb m5, m1 | |
12033 | |
12034 palignr m6, m2, m0, 2 | |
12035 palignr m1, m3, m2, 2 | |
12036 pmaddubsw m6, [r3 - 1 * 32] ; [15] | |
12037 pmulhrsw m6, m7 | |
12038 pmaddubsw m1, [r3 - 1 * 32] | |
12039 pmulhrsw m1, m7 | |
12040 packuswb m6, m1 | |
12041 | |
; The same aligned pair (m9/m10) feeds two table rows: [4] and [25].
12042 palignr m9, m2, m0, 4 | |
12043 palignr m10, m3, m2, 4 | |
12044 pmaddubsw m8, m9, [r3 - 12 * 32] ; [4] | |
12045 pmulhrsw m8, m7 | |
12046 pmaddubsw m1, m10, [r3 - 12 * 32] | |
12047 pmulhrsw m1, m7 | |
12048 packuswb m8, m1 | |
12049 | |
12050 pmaddubsw m9, [r3 + 9 * 32] ; [25] | |
12051 pmulhrsw m9, m7 | |
12052 pmaddubsw m10, [r3 + 9 * 32] | |
12053 pmulhrsw m10, m7 | |
12054 packuswb m9, m10 | |
12055 | |
12056 palignr m10, m2, m0, 6 | |
12057 palignr m11, m3, m2, 6 | |
12058 pmaddubsw m10, [r3 - 2 * 32] ; [14] | |
12059 pmulhrsw m10, m7 | |
12060 pmaddubsw m11, [r3 - 2 * 32] | |
12061 pmulhrsw m11, m7 | |
12062 packuswb m10, m11 | |
12063 | |
12064 palignr m12, m2, m0, 8 | |
12065 palignr m1, m3, m2, 8 | |
12066 pmaddubsw m11, m12, [r3 - 13 * 32] ; [3] | |
12067 pmulhrsw m11, m7 | |
12068 pmaddubsw m1, [r3 - 13 * 32] | |
12069 pmulhrsw m1, m7 | |
12070 packuswb m11, m1 | |
12071 | |
; m1 was overwritten by the pmaddubsw above, so the 8-byte alignment of
; m3:m2 is rebuilt before it is weighted with row [24].
12072 palignr m1, m3, m2, 8 | |
12073 pmaddubsw m12, [r3 + 8 * 32] ; [24] | |
12074 pmulhrsw m12, m7 | |
12075 pmaddubsw m1, [r3 + 8 * 32] | |
12076 pmulhrsw m1, m7 | |
12077 packuswb m12, m1 | |
12078 | |
; First group of eight vectors: m4, m5, m6, m8, m9, m10, m11, m12.
; NOTE(review): trailing "1, 0" looks like a scratch register number and
; a 0/8 tag for the macro - confirm against the macro definition.
12079 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
12080 | |
12081 ; rows 8 to 15 | |
12082 palignr m4, m2, m0, 10 | |
12083 palignr m1, m3, m2, 10 | |
12084 pmaddubsw m4, [r3 - 3 * 32] ; [13] | |
12085 pmulhrsw m4, m7 | |
12086 pmaddubsw m1, [r3 - 3 * 32] | |
12087 pmulhrsw m1, m7 | |
12088 packuswb m4, m1 | |
12089 | |
12090 palignr m6, m2, m0, 12 | |
12091 palignr m8, m3, m2, 12 | |
12092 pmaddubsw m5, m6, [r3 - 14 * 32] ; [2] | |
12093 pmulhrsw m5, m7 | |
12094 pmaddubsw m1, m8, [r3 - 14 * 32] | |
12095 pmulhrsw m1, m7 | |
12096 packuswb m5, m1 | |
12097 | |
12098 pmaddubsw m6, [r3 + 7 * 32] ; [23] | |
12099 pmulhrsw m6, m7 | |
12100 pmaddubsw m8, [r3 + 7 * 32] | |
12101 pmulhrsw m8, m7 | |
12102 packuswb m6, m8 | |
12103 | |
12104 palignr m8, m2, m0, 14 | |
12105 palignr m1, m3, m2, 14 | |
12106 pmaddubsw m8, [r3 - 4 * 32] ; [12] | |
12107 pmulhrsw m8, m7 | |
12108 pmaddubsw m1, [r3 - 4 * 32] | |
12109 pmulhrsw m1, m7 | |
12110 packuswb m8, m1 | |
12111 | |
; Window has advanced a full 16 bytes: m2/m3 are used without palignr.
12112 pmaddubsw m9, m2, [r3 - 15 * 32] ; [1] | |
12113 pmulhrsw m9, m7 | |
12114 pmaddubsw m1, m3, [r3 - 15 * 32] | |
12115 pmulhrsw m1, m7 | |
12116 packuswb m9, m1 | |
12117 | |
12118 pmaddubsw m10, m2, [r3 + 6 * 32] ; [22] | |
12119 pmulhrsw m10, m7 | |
12120 pmaddubsw m1, m3, [r3 + 6 * 32] | |
12121 pmulhrsw m1, m7 | |
12122 packuswb m10, m1 | |
12123 | |
; m0 is no longer needed as the low pair vector; reload it with the pairs
; starting at r2 + 25 so that m0:m3 can be aligned past the end of m3.
12124 movu m0, [r2 + 25] | |
12125 movu m1, [r2 + 26] | |
12126 punpcklbw m0, m1 | |
12127 | |
12128 palignr m11, m3, m2, 2 | |
12129 palignr m1, m0, m3, 2 | |
12130 pmaddubsw m11, [r3 - 5 * 32] ; [11] | |
12131 pmulhrsw m11, m7 | |
12132 pmaddubsw m1, [r3 - 5 * 32] | |
12133 pmulhrsw m1, m7 | |
12134 packuswb m11, m1 | |
12135 | |
; Eighth vector of this group is a plain 32-byte load: no table weighting.
12136 movu m12, [r2 + 11] | |
12137 | |
12138 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8 | |
12139 ret | |
12140 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_4 - AVX2 entry point (cglobal: 3 args, 8 GPRs,
; 13 vector registers).
; Arguments as used here: r0 (copied to r4; presumably the destination),
; r1 (stride, per the comments below), r2 (pointer the helpers load from).
; Runs the two shared mode-4/32 helpers back to back with r7d = 0.
;-----------------------------------------------------------------------
12141 INIT_YMM avx2 | |
12142 cglobal intra_pred_ang32_4, 3,8,13 | |
; This mode reads starting 64 bytes into the buffer at r2; intra_pred_ang32_32
; uses the same helpers without this offset.
12143 add r2, 64 | |
; Point r3 at table row 16 so the helpers reach every row with a small
; signed displacement (r3 - 15 * 32 .. r3 + 15 * 32).
12144 lea r3, [ang_table_avx2 + 32 * 16] | |
12145 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
12146 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
12147 mova m7, [pw_1024] | |
; Keep a copy of the original r0 in r4 for the re-base after the first call.
12148 mov r4, r0 | |
; r7d = 0 selects this mode's path in the helpers (they test r7d on entry).
12149 xor r7d, r7d | |
12150 | |
12151 call ang32_mode_4_32_row_0_15 | |
12152 | |
; Second half is written starting 16 bytes past the position held in r4.
; NOTE(review): this assumes r4 still holds the original r0 after the
; call; the helper/macro internals are not visible here - confirm.
12153 add r4, 16 | |
12154 mov r0, r4 | |
; The second helper expects r2 advanced by 11 bytes.
12155 add r2, 11 | |
12156 | |
12157 call ang32_mode_4_32_row_16_31 | |
12158 RET | |
12159 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_32 - AVX2 entry point (cglobal: 3 args, 8 GPRs,
; 13 vector registers).
; Uses the same two helpers as intra_pred_ang32_4 but with r7d = 1 and
; without the 64-byte offset on r2.
; r0 is not re-based between the two calls, so the second helper continues
; from wherever the first one leaves r0 (presumably advanced by the
; transpose/store macro - confirm against its definition).
;-----------------------------------------------------------------------
12160 INIT_YMM avx2 | |
12161 cglobal intra_pred_ang32_32, 3,8,13 | |
; Point r3 at table row 16 so the helpers reach every row with a small
; signed displacement.
12162 lea r3, [ang_table_avx2 + 32 * 16] | |
12163 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
12164 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
12165 mova m7, [pw_1024] | |
; r7d = 1: the helpers' "test r7d, r7d" will see a non-zero value.
12166 xor r7d, r7d | |
12167 inc r7d | |
12168 | |
12169 call ang32_mode_4_32_row_0_15 | |
12170 | |
; The second helper expects r2 advanced by 11 bytes.
12171 add r2, 11 | |
12172 | |
12173 call ang32_mode_4_32_row_16_31 | |
12174 RET | |
12175 | |
;-----------------------------------------------------------------------
; ang32_mode_5_31_row_0_15
; First-half helper for intra_pred_ang32_5 / intra_pred_ang32_31: builds
; 16 result vectors of 32 bytes (two groups of 8) and passes each group
; to TRANSPOSE_32x8_AVX2 (macro defined outside this excerpt).
;
; Expected on entry, as set up by the two callers in this file:
;   r2  = reference pointer (intra_pred_ang32_5 passes it advanced by 64,
;         intra_pred_ang32_31 passes it unchanged)
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table row k + 16;
;         the bracketed number on each first pmaddubsw is that row index
;   m7  = [pw_1024]
;   r7d = 0 when called for mode 5, 1 when called for mode 31
;
; Per-vector recipe: pmaddubsw weights adjacent byte pairs with one table
; row, pmulhrsw by m7 rescales with rounding, packuswb narrows to bytes.
; palignr by 2 * n bytes moves the pair window n reference bytes forward.
;-----------------------------------------------------------------------
12176 cglobal ang32_mode_5_31_row_0_15 | |
; Only sets flags; no instruction visible in this routine reads them and
; the vector ops below leave EFLAGS alone. Presumably the transpose macro
; branches on ZF - confirm against its definition.
12177 test r7d, r7d | |
12178 ; rows 0 to 7 | |
12179 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12180 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
12181 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
12182 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
12183 | |
; Interleave each byte with its successor so pmaddubsw sees (p[i], p[i+1]).
12184 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
12185 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
12186 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
12187 | |
12188 pmaddubsw m4, m0, [r3 + 1 * 32] ; [17] | |
12189 pmulhrsw m4, m7 | |
12190 pmaddubsw m1, m2, [r3 + 1 * 32] | |
12191 pmulhrsw m1, m7 | |
12192 packuswb m4, m1 | |
12193 | |
; From here on each aligned pair is reused for two table rows.
12194 palignr m6, m2, m0, 2 | |
12195 palignr m1, m3, m2, 2 | |
12196 pmaddubsw m5, m6, [r3 - 14 * 32] ; [2] | |
12197 pmulhrsw m5, m7 | |
12198 pmaddubsw m8, m1, [r3 - 14 * 32] | |
12199 pmulhrsw m8, m7 | |
12200 packuswb m5, m8 | |
12201 | |
12202 pmaddubsw m6, [r3 + 3 * 32] ; [19] | |
12203 pmulhrsw m6, m7 | |
12204 pmaddubsw m1, [r3 + 3 * 32] | |
12205 pmulhrsw m1, m7 | |
12206 packuswb m6, m1 | |
12207 | |
12208 palignr m9, m2, m0, 4 | |
12209 palignr m10, m3, m2, 4 | |
12210 pmaddubsw m8, m9, [r3 - 12 * 32] ; [4] | |
12211 pmulhrsw m8, m7 | |
12212 pmaddubsw m1, m10, [r3 - 12 * 32] | |
12213 pmulhrsw m1, m7 | |
12214 packuswb m8, m1 | |
12215 | |
12216 pmaddubsw m9, [r3 + 5 * 32] ; [21] | |
12217 pmulhrsw m9, m7 | |
12218 pmaddubsw m10, [r3 + 5 * 32] | |
12219 pmulhrsw m10, m7 | |
12220 packuswb m9, m10 | |
12221 | |
12222 palignr m11, m2, m0, 6 | |
12223 palignr m12, m3, m2, 6 | |
12224 pmaddubsw m10, m11, [r3 - 10 * 32] ; [6] | |
12225 pmulhrsw m10, m7 | |
12226 pmaddubsw m1, m12, [r3 - 10 * 32] | |
12227 pmulhrsw m1, m7 | |
12228 packuswb m10, m1 | |
12229 | |
12230 pmaddubsw m11, [r3 + 7 * 32] ; [23] | |
12231 pmulhrsw m11, m7 | |
12232 pmaddubsw m12, [r3 + 7 * 32] | |
12233 pmulhrsw m12, m7 | |
12234 packuswb m11, m12 | |
12235 | |
12236 palignr m12, m2, m0, 8 | |
12237 palignr m1, m3, m2, 8 | |
12238 pmaddubsw m12, [r3 - 8 * 32] ; [8] | |
12239 pmulhrsw m12, m7 | |
12240 pmaddubsw m1, [r3 - 8 * 32] | |
12241 pmulhrsw m1, m7 | |
12242 packuswb m12, m1 | |
12243 | |
; First group of eight vectors: m4, m5, m6, m8, m9, m10, m11, m12.
; NOTE(review): trailing "1, 0" looks like a scratch register number and
; a 0/8 tag for the macro - confirm against the macro definition.
12244 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
12245 | |
12246 ; rows 8 to 15 | |
; Same 8-byte alignment as the last vector of the first group, rebuilt
; because m12/m1 were consumed above.
12247 palignr m4, m2, m0, 8 | |
12248 palignr m1, m3, m2, 8 | |
12249 pmaddubsw m4, [r3 + 9 * 32] ; [25] | |
12250 pmulhrsw m4, m7 | |
12251 pmaddubsw m1, [r3 + 9 * 32] | |
12252 pmulhrsw m1, m7 | |
12253 packuswb m4, m1 | |
12254 | |
12255 palignr m6, m2, m0, 10 | |
12256 palignr m1, m3, m2, 10 | |
12257 pmaddubsw m5, m6, [r3 - 6 * 32] ; [10] | |
12258 pmulhrsw m5, m7 | |
12259 pmaddubsw m8, m1, [r3 - 6 * 32] | |
12260 pmulhrsw m8, m7 | |
12261 packuswb m5, m8 | |
12262 | |
12263 pmaddubsw m6, [r3 + 11 * 32] ; [27] | |
12264 pmulhrsw m6, m7 | |
12265 pmaddubsw m1, [r3 + 11 * 32] | |
12266 pmulhrsw m1, m7 | |
12267 packuswb m6, m1 | |
12268 | |
12269 palignr m9, m2, m0, 12 | |
12270 palignr m1, m3, m2, 12 | |
12271 pmaddubsw m8, m9, [r3 - 4 * 32] ; [12] | |
12272 pmulhrsw m8, m7 | |
12273 pmaddubsw m10, m1, [r3 - 4 * 32] | |
12274 pmulhrsw m10, m7 | |
12275 packuswb m8, m10 | |
12276 | |
12277 pmaddubsw m9, [r3 + 13 * 32] ; [29] | |
12278 pmulhrsw m9, m7 | |
12279 pmaddubsw m1, [r3 + 13 * 32] | |
12280 pmulhrsw m1, m7 | |
12281 packuswb m9, m1 | |
12282 | |
12283 palignr m11, m2, m0, 14 | |
12284 palignr m1, m3, m2, 14 | |
12285 pmaddubsw m10, m11, [r3 - 2 * 32] ; [14] | |
12286 pmulhrsw m10, m7 | |
12287 pmaddubsw m12, m1, [r3 - 2 * 32] | |
12288 pmulhrsw m12, m7 | |
12289 packuswb m10, m12 | |
12290 | |
12291 pmaddubsw m11, [r3 + 15 * 32] ; [31] | |
12292 pmulhrsw m11, m7 | |
12293 pmaddubsw m1, [r3 + 15 * 32] | |
12294 pmulhrsw m1, m7 | |
12295 packuswb m11, m1 | |
12296 | |
; Window has advanced a full 16 bytes: m2/m3 are weighted in place, which
; is safe because this is their last use in the routine.
12297 pmaddubsw m2, [r3] ; [16] | |
12298 pmulhrsw m2, m7 | |
12299 pmaddubsw m3, [r3] | |
12300 pmulhrsw m3, m7 | |
12301 packuswb m2, m3 | |
12302 | |
; Second group: the eighth vector lives in m2 instead of m12.
12303 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8 | |
12304 ret | |
12305 | |
;-----------------------------------------------------------------------
; ang32_mode_5_31_row_16_31
; Second-half helper for intra_pred_ang32_5 / intra_pred_ang32_31: builds
; 16 result vectors of 32 bytes (two groups of 8) and passes each group
; to TRANSPOSE_32x8_AVX2 (macro defined outside this excerpt).
;
; Expected on entry, as set up by the two callers in this file:
;   r2  = reference pointer, already advanced by 9 bytes by the caller
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table row k + 16;
;         the bracketed number on each first pmaddubsw is that row index
;   m7  = [pw_1024]
;   r7d = 0 when called for mode 5, 1 when called for mode 31
;
; Per-vector recipe: pmaddubsw weights adjacent byte pairs with one table
; row, pmulhrsw by m7 rescales with rounding, packuswb narrows to bytes.
; palignr by 2 * n bytes moves the pair window n reference bytes forward.
; NOTE(review): the "rows 0 to 7" / "rows 8 to 15" labels below look
; relative to this routine's own 16 outputs, not to the whole block.
;-----------------------------------------------------------------------
12306 cglobal ang32_mode_5_31_row_16_31 | |
; Only sets flags; no instruction visible in this routine reads them and
; the vector ops below leave EFLAGS alone. Presumably the transpose macro
; branches on ZF - confirm against its definition.
12307 test r7d, r7d | |
12308 ; rows 0 to 7 | |
12309 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12310 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
12311 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
12312 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
12313 | |
; Interleave each byte with its successor so pmaddubsw sees (p[i], p[i+1]).
12314 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
12315 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
12316 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
12317 | |
12318 pmaddubsw m4, m0, [r3 - 15 * 32] ; [1] | |
12319 pmulhrsw m4, m7 | |
12320 pmaddubsw m1, m2, [r3 - 15 * 32] | |
12321 pmulhrsw m1, m7 | |
12322 packuswb m4, m1 | |
12323 | |
12324 pmaddubsw m5, m0, [r3 + 2 * 32] ; [18] | |
12325 pmulhrsw m5, m7 | |
12326 pmaddubsw m8, m2, [r3 + 2 * 32] | |
12327 pmulhrsw m8, m7 | |
12328 packuswb m5, m8 | |
12329 | |
; Each aligned pair below is reused for two table rows.
12330 palignr m8, m2, m0, 2 | |
12331 palignr m9, m3, m2, 2 | |
12332 pmaddubsw m6, m8, [r3 - 13 * 32] ; [3] | |
12333 pmulhrsw m6, m7 | |
12334 pmaddubsw m1, m9, [r3 - 13 * 32] | |
12335 pmulhrsw m1, m7 | |
12336 packuswb m6, m1 | |
12337 | |
12338 pmaddubsw m8, [r3 + 4 * 32] ; [20] | |
12339 pmulhrsw m8, m7 | |
12340 pmaddubsw m9, [r3 + 4 * 32] | |
12341 pmulhrsw m9, m7 | |
12342 packuswb m8, m9 | |
12343 | |
12344 palignr m10, m2, m0, 4 | |
12345 palignr m1, m3, m2, 4 | |
12346 pmaddubsw m9, m10, [r3 - 11 * 32] ; [5] | |
12347 pmulhrsw m9, m7 | |
12348 pmaddubsw m11, m1, [r3 - 11 * 32] | |
12349 pmulhrsw m11, m7 | |
12350 packuswb m9, m11 | |
12351 | |
12352 pmaddubsw m10, [r3 + 6 * 32] ; [22] | |
12353 pmulhrsw m10, m7 | |
12354 pmaddubsw m1, [r3 + 6 * 32] | |
12355 pmulhrsw m1, m7 | |
12356 packuswb m10, m1 | |
12357 | |
12358 palignr m12, m2, m0, 6 | |
12359 palignr m1, m3, m2, 6 | |
12360 pmaddubsw m11, m12, [r3 - 9 * 32] ; [7] | |
12361 pmulhrsw m11, m7 | |
12362 pmaddubsw m1, [r3 - 9 * 32] | |
12363 pmulhrsw m1, m7 | |
12364 packuswb m11, m1 | |
12365 | |
; m1 was overwritten by the pmaddubsw above, so the 6-byte alignment of
; m3:m2 is rebuilt before it is weighted with row [24].
12366 palignr m1, m3, m2, 6 | |
12367 pmaddubsw m12, [r3 + 8 * 32] ; [24] | |
12368 pmulhrsw m12, m7 | |
12369 pmaddubsw m1, [r3 + 8 * 32] | |
12370 pmulhrsw m1, m7 | |
12371 packuswb m12, m1 | |
12372 | |
; First group of eight vectors: m4, m5, m6, m8, m9, m10, m11, m12.
; NOTE(review): trailing "1, 0" looks like a scratch register number and
; a 0/8 tag for the macro - confirm against the macro definition.
12373 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
12374 | |
12375 ; rows 8 to 15 | |
12376 palignr m5, m2, m0, 8 | |
12377 palignr m8, m3, m2, 8 | |
12378 pmaddubsw m4, m5, [r3 - 7 * 32] ; [9] | |
12379 pmulhrsw m4, m7 | |
12380 pmaddubsw m1, m8, [r3 - 7 * 32] | |
12381 pmulhrsw m1, m7 | |
12382 packuswb m4, m1 | |
12383 | |
12384 pmaddubsw m5, [r3 + 10 * 32] ; [26] | |
12385 pmulhrsw m5, m7 | |
12386 pmaddubsw m8, [r3 + 10 * 32] | |
12387 pmulhrsw m8, m7 | |
12388 packuswb m5, m8 | |
12389 | |
12390 palignr m8, m2, m0, 10 | |
12391 palignr m9, m3, m2, 10 | |
12392 pmaddubsw m6, m8, [r3 - 5 * 32] ; [11] | |
12393 pmulhrsw m6, m7 | |
12394 pmaddubsw m1, m9, [r3 - 5 * 32] | |
12395 pmulhrsw m1, m7 | |
12396 packuswb m6, m1 | |
12397 | |
12398 pmaddubsw m8, [r3 + 12 * 32] ; [28] | |
12399 pmulhrsw m8, m7 | |
12400 pmaddubsw m9, [r3 + 12 * 32] | |
12401 pmulhrsw m9, m7 | |
12402 packuswb m8, m9 | |
12403 | |
12404 palignr m10, m2, m0, 12 | |
12405 palignr m11, m3, m2, 12 | |
12406 pmaddubsw m9, m10, [r3 - 3 * 32] ; [13] | |
12407 pmulhrsw m9, m7 | |
12408 pmaddubsw m1, m11, [r3 - 3 * 32] | |
12409 pmulhrsw m1, m7 | |
12410 packuswb m9, m1 | |
12411 | |
12412 pmaddubsw m10, [r3 + 14 * 32] ; [30] | |
12413 pmulhrsw m10, m7 | |
12414 pmaddubsw m11, [r3 + 14 * 32] | |
12415 pmulhrsw m11, m7 | |
12416 packuswb m10, m11 | |
12417 | |
12418 palignr m11, m2, m0, 14 | |
12419 palignr m1, m3, m2, 14 | |
12420 pmaddubsw m11, [r3 - 1 * 32] ; [15] | |
12421 pmulhrsw m11, m7 | |
12422 pmaddubsw m1, [r3 - 1 * 32] | |
12423 pmulhrsw m1, m7 | |
12424 packuswb m11, m1 | |
12425 | |
; Eighth vector of this group is a plain 32-byte load: no table weighting.
; m2 is free to overwrite because the palignr above was its last use.
12426 movu m2, [r2 + 9] | |
12427 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8 | |
12428 ret | |
12429 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_5 - AVX2 entry point (cglobal: 3 args, 8 GPRs,
; 13 vector registers).
; Arguments as used here: r0 (copied to r4; presumably the destination),
; r1 (stride, per the comments below), r2 (pointer the helpers load from).
; Runs the two shared mode-5/31 helpers back to back with r7d = 0.
;-----------------------------------------------------------------------
12430 INIT_YMM avx2 | |
12431 cglobal intra_pred_ang32_5, 3,8,13 | |
; This mode reads starting 64 bytes into the buffer at r2; intra_pred_ang32_31
; uses the same helpers without this offset.
12432 add r2, 64 | |
; Point r3 at table row 16 so the helpers reach every row with a small
; signed displacement (r3 - 15 * 32 .. r3 + 15 * 32).
12433 lea r3, [ang_table_avx2 + 32 * 16] | |
12434 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
12435 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
12436 mova m7, [pw_1024] | |
; Keep a copy of the original r0 in r4 for the re-base after the first call.
12437 mov r4, r0 | |
; r7d = 0 selects this mode's path in the helpers (they test r7d on entry).
12438 xor r7d, r7d | |
12439 | |
12440 call ang32_mode_5_31_row_0_15 | |
12441 | |
; Second half is written starting 16 bytes past the position held in r4.
; NOTE(review): this assumes r4 still holds the original r0 after the
; call; the helper/macro internals are not visible here - confirm.
12442 add r4, 16 | |
12443 mov r0, r4 | |
; The second helper expects r2 advanced by 9 bytes.
12444 add r2, 9 | |
12445 | |
12446 call ang32_mode_5_31_row_16_31 | |
12447 RET | |
12448 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_31 - AVX2 entry point (cglobal: 3 args, 8 GPRs,
; 13 vector registers).
; Uses the same two helpers as intra_pred_ang32_5 but with r7d = 1 and
; without the 64-byte offset on r2.
; r0 is not re-based between the two calls, so the second helper continues
; from wherever the first one leaves r0 (presumably advanced by the
; transpose/store macro - confirm against its definition).
;-----------------------------------------------------------------------
12449 INIT_YMM avx2 | |
12450 cglobal intra_pred_ang32_31, 3,8,13 | |
; Point r3 at table row 16 so the helpers reach every row with a small
; signed displacement.
12451 lea r3, [ang_table_avx2 + 32 * 16] | |
12452 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
12453 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
12454 mova m7, [pw_1024] | |
; r7d = 1: the helpers' "test r7d, r7d" will see a non-zero value.
12455 xor r7d, r7d | |
12456 inc r7d | |
12457 | |
12458 call ang32_mode_5_31_row_0_15 | |
12459 | |
; The second helper expects r2 advanced by 9 bytes.
12460 add r2, 9 | |
12461 | |
12462 call ang32_mode_5_31_row_16_31 | |
12463 RET | |
12464 | |
;-----------------------------------------------------------------------
; ang32_mode_6_30_row_0_15
; First-half helper for intra_pred_ang32_6 / intra_pred_ang32_30: builds
; 16 result vectors of 32 bytes (two groups of 8) and passes each group
; to TRANSPOSE_32x8_AVX2 (macro defined outside this excerpt).
;
; Expected on entry, as set up by the two callers in this file:
;   r2  = reference pointer (intra_pred_ang32_6 passes it advanced by 64,
;         intra_pred_ang32_30 passes it unchanged)
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table row k + 16;
;         the bracketed number on each first pmaddubsw is that row index
;   m7  = [pw_1024]
;   r7d = 0 when called for mode 6, 1 when called for mode 30
;
; Per-vector recipe: pmaddubsw weights adjacent byte pairs with one table
; row, pmulhrsw by m7 rescales with rounding, packuswb narrows to bytes.
; palignr by 2 * n bytes moves the pair window n reference bytes forward.
;-----------------------------------------------------------------------
12465 cglobal ang32_mode_6_30_row_0_15 | |
; Only sets flags; no instruction visible in this routine reads them and
; the vector ops below leave EFLAGS alone. Presumably the transpose macro
; branches on ZF - confirm against its definition.
12466 test r7d, r7d | |
12467 ; rows 0 to 7 | |
12468 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12469 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
12470 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
12471 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
12472 | |
; Interleave each byte with its successor so pmaddubsw sees (p[i], p[i+1]).
12473 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
12474 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
12475 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
12476 | |
12477 pmaddubsw m4, m0, [r3 - 3 * 32] ; [13] | |
12478 pmulhrsw m4, m7 | |
12479 pmaddubsw m1, m2, [r3 - 3 * 32] | |
12480 pmulhrsw m1, m7 | |
12481 packuswb m4, m1 | |
12482 | |
12483 pmaddubsw m5, m0, [r3 + 10 * 32] ; [26] | |
12484 pmulhrsw m5, m7 | |
12485 pmaddubsw m8, m2, [r3 + 10 * 32] | |
12486 pmulhrsw m8, m7 | |
12487 packuswb m5, m8 | |
12488 | |
12489 palignr m8, m2, m0, 2 | |
12490 palignr m1, m3, m2, 2 | |
12491 pmaddubsw m6, m8, [r3 - 9 * 32] ; [7] | |
12492 pmulhrsw m6, m7 | |
12493 pmaddubsw m9, m1, [r3 - 9 * 32] | |
12494 pmulhrsw m9, m7 | |
12495 packuswb m6, m9 | |
12496 | |
12497 pmaddubsw m8, [r3 + 4 * 32] ; [20] | |
12498 pmulhrsw m8, m7 | |
12499 pmaddubsw m1, [r3 + 4 * 32] | |
12500 pmulhrsw m1, m7 | |
12501 packuswb m8, m1 | |
12502 | |
; The 4-byte alignment (m11/m1) is shared by three table rows: [1], [14], [27].
12503 palignr m11, m2, m0, 4 | |
12504 palignr m1, m3, m2, 4 | |
12505 pmaddubsw m9, m11, [r3 - 15 * 32] ; [1] | |
12506 pmulhrsw m9, m7 | |
12507 pmaddubsw m12, m1, [r3 - 15 * 32] | |
12508 pmulhrsw m12, m7 | |
12509 packuswb m9, m12 | |
12510 | |
12511 pmaddubsw m10, m11, [r3 - 2 * 32] ; [14] | |
12512 pmulhrsw m10, m7 | |
12513 pmaddubsw m12, m1, [r3 - 2 * 32] | |
12514 pmulhrsw m12, m7 | |
12515 packuswb m10, m12 | |
12516 | |
12517 pmaddubsw m11, [r3 + 11 * 32] ; [27] | |
12518 pmulhrsw m11, m7 | |
12519 pmaddubsw m1, [r3 + 11 * 32] | |
12520 pmulhrsw m1, m7 | |
12521 packuswb m11, m1 | |
12522 | |
12523 palignr m12, m2, m0, 6 | |
12524 palignr m1, m3, m2, 6 | |
12525 pmaddubsw m12, [r3 - 8 * 32] ; [8] | |
12526 pmulhrsw m12, m7 | |
12527 pmaddubsw m1, [r3 - 8 * 32] | |
12528 pmulhrsw m1, m7 | |
12529 packuswb m12, m1 | |
12530 | |
; First group of eight vectors: m4, m5, m6, m8, m9, m10, m11, m12.
; NOTE(review): trailing "1, 0" looks like a scratch register number and
; a 0/8 tag for the macro - confirm against the macro definition.
12531 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
12532 | |
12533 ; rows 8 to 15 | |
; Same 6-byte alignment as the last vector of the first group, rebuilt
; because m12/m1 were consumed above.
12534 palignr m4, m2, m0, 6 | |
12535 palignr m1, m3, m2, 6 | |
12536 pmaddubsw m4, [r3 + 5 * 32] ; [21] | |
12537 pmulhrsw m4, m7 | |
12538 pmaddubsw m1, [r3 + 5 * 32] | |
12539 pmulhrsw m1, m7 | |
12540 packuswb m4, m1 | |
12541 | |
; The 8-byte alignment (m8/m1) is shared by three table rows: [2], [15], [28].
12542 palignr m8, m2, m0, 8 | |
12543 palignr m1, m3, m2, 8 | |
12544 pmaddubsw m5, m8, [r3 - 14 * 32] ; [2] | |
12545 pmulhrsw m5, m7 | |
12546 pmaddubsw m9, m1, [r3 - 14 * 32] | |
12547 pmulhrsw m9, m7 | |
12548 packuswb m5, m9 | |
12549 | |
12550 pmaddubsw m6, m8, [r3 - 1 * 32] ; [15] | |
12551 pmulhrsw m6, m7 | |
12552 pmaddubsw m9, m1, [r3 - 1 * 32] | |
12553 pmulhrsw m9, m7 | |
12554 packuswb m6, m9 | |
12555 | |
12556 pmaddubsw m8, [r3 + 12 * 32] ; [28] | |
12557 pmulhrsw m8, m7 | |
12558 pmaddubsw m1, [r3 + 12 * 32] | |
12559 pmulhrsw m1, m7 | |
12560 packuswb m8, m1 | |
12561 | |
12562 palignr m10, m2, m0, 10 | |
12563 palignr m1, m3, m2, 10 | |
12564 pmaddubsw m9, m10, [r3 - 7 * 32] ; [9] | |
12565 pmulhrsw m9, m7 | |
12566 pmaddubsw m11, m1, [r3 - 7 * 32] | |
12567 pmulhrsw m11, m7 | |
12568 packuswb m9, m11 | |
12569 | |
12570 pmaddubsw m10, [r3 + 6 * 32] ; [22] | |
12571 pmulhrsw m10, m7 | |
; Three-operand spelling with destination == first source; the operation
; is the same as the two-operand "pmaddubsw m1, [...]" used elsewhere.
12572 pmaddubsw m1, m1, [r3 + 6 * 32] | |
12573 pmulhrsw m1, m7 | |
12574 packuswb m10, m1 | |
12575 | |
; In-place realignment: m3 must be updated first because it still needs
; the unshifted m2; afterwards m2 is shifted using m0.
12576 palignr m3, m2, 12 | |
12577 palignr m2, m0, 12 | |
12578 pmaddubsw m11, m2, [r3 - 13 * 32] ; [3] | |
12579 pmulhrsw m11, m7 | |
12580 pmaddubsw m1, m3, [r3 - 13 * 32] | |
12581 pmulhrsw m1, m7 | |
12582 packuswb m11, m1 | |
12583 | |
; m2/m3 are weighted in place: this is their last use in the routine.
12584 pmaddubsw m2, [r3] ; [16] | |
12585 pmulhrsw m2, m7 | |
12586 pmaddubsw m3, [r3] | |
12587 pmulhrsw m3, m7 | |
12588 packuswb m2, m3 | |
12589 | |
; Second group: the eighth vector lives in m2 instead of m12.
12590 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8 | |
12591 ret | |
12592 | |
;-----------------------------------------------------------------------
; ang32_mode_6_30_row_16_31
; Second-half helper for intra_pred_ang32_6 / intra_pred_ang32_30: builds
; 16 result vectors of 32 bytes (two groups of 8) and passes each group
; to TRANSPOSE_32x8_AVX2 (macro defined outside this excerpt).
;
; Expected on entry, as set up by the two callers in this file:
;   r2  = reference pointer, already advanced by 6 bytes by the caller
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table row k + 16;
;         the bracketed number on each first pmaddubsw is that row index
;   m7  = [pw_1024]
;   r7d = 0 when called for mode 6, 1 when called for mode 30
;
; Per-vector recipe: pmaddubsw weights adjacent byte pairs with one table
; row, pmulhrsw by m7 rescales with rounding, packuswb narrows to bytes.
; palignr by 2 * n bytes moves the pair window n reference bytes forward.
; NOTE(review): the "rows 0 to 7" / "rows 8 to 15" labels below look
; relative to this routine's own 16 outputs, not to the whole block.
;-----------------------------------------------------------------------
12593 cglobal ang32_mode_6_30_row_16_31 | |
; Only sets flags; no instruction visible in this routine reads them and
; the vector ops below leave EFLAGS alone. Presumably the transpose macro
; branches on ZF - confirm against its definition.
12594 test r7d, r7d | |
12595 ; rows 0 to 7 | |
12596 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12597 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
12598 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
12599 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
12600 | |
; Interleave each byte with its successor so pmaddubsw sees (p[i], p[i+1]).
12601 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
12602 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
12603 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
12604 | |
12605 pmaddubsw m4, m0, [r3 + 13 * 32] ; [29] | |
12606 pmulhrsw m4, m7 | |
12607 pmaddubsw m1, m2, [r3 + 13 * 32] | |
12608 pmulhrsw m1, m7 | |
12609 packuswb m4, m1 | |
12610 | |
12611 palignr m6, m2, m0, 2 | |
12612 palignr m1, m3, m2, 2 | |
12613 pmaddubsw m5, m6, [r3 - 6 * 32] ; [10] | |
12614 pmulhrsw m5, m7 | |
12615 pmaddubsw m8, m1, [r3 - 6 * 32] | |
12616 pmulhrsw m8, m7 | |
12617 packuswb m5, m8 | |
12618 | |
12619 pmaddubsw m6, [r3 + 7 * 32] ; [23] | |
12620 pmulhrsw m6, m7 | |
12621 pmaddubsw m1, [r3 + 7 * 32] | |
12622 pmulhrsw m1, m7 | |
12623 packuswb m6, m1 | |
12624 | |
; The 4-byte alignment (m10/m1) is shared by three table rows: [4], [17], [30].
12625 palignr m10, m2, m0, 4 | |
12626 palignr m1, m3, m2, 4 | |
12627 pmaddubsw m8, m10, [r3 - 12 * 32] ; [4] | |
12628 pmulhrsw m8, m7 | |
12629 pmaddubsw m11, m1, [r3 - 12 * 32] | |
12630 pmulhrsw m11, m7 | |
12631 packuswb m8, m11 | |
12632 | |
12633 pmaddubsw m9, m10, [r3 + 1 * 32] ; [17] | |
12634 pmulhrsw m9, m7 | |
12635 pmaddubsw m11, m1, [r3 + 1 * 32] | |
12636 pmulhrsw m11, m7 | |
12637 packuswb m9, m11 | |
12638 | |
12639 pmaddubsw m10, [r3 + 14 * 32] ; [30] | |
12640 pmulhrsw m10, m7 | |
12641 pmaddubsw m1, [r3 + 14 * 32] | |
12642 pmulhrsw m1, m7 | |
12643 packuswb m10, m1 | |
12644 | |
12645 palignr m12, m2, m0, 6 | |
12646 palignr m1, m3, m2, 6 | |
12647 pmaddubsw m11, m12, [r3 - 5 * 32] ; [11] | |
12648 pmulhrsw m11, m7 | |
12649 pmaddubsw m1, [r3 - 5 * 32] | |
12650 pmulhrsw m1, m7 | |
12651 packuswb m11, m1 | |
12652 | |
; m1 was overwritten by the pmaddubsw above, so the 6-byte alignment of
; m3:m2 is rebuilt before it is weighted with row [24].
12653 palignr m1, m3, m2, 6 | |
12654 pmaddubsw m12, [r3 + 8 * 32] ; [24] | |
12655 pmulhrsw m12, m7 | |
12656 pmaddubsw m1, [r3 + 8 * 32] | |
12657 pmulhrsw m1, m7 | |
12658 packuswb m12, m1 | |
12659 | |
; First group of eight vectors: m4, m5, m6, m8, m9, m10, m11, m12.
; NOTE(review): trailing "1, 0" looks like a scratch register number and
; a 0/8 tag for the macro - confirm against the macro definition.
12660 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
12661 | |
12662 ; rows 8 to 15 | |
; The 8-byte alignment (m6/m1) is shared by three table rows: [5], [18], [31].
12663 palignr m6, m2, m0, 8 | |
12664 palignr m1, m3, m2, 8 | |
12665 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5] | |
12666 pmulhrsw m4, m7 | |
12667 pmaddubsw m8, m1, [r3 - 11 * 32] | |
12668 pmulhrsw m8, m7 | |
12669 packuswb m4, m8 | |
12670 | |
12671 pmaddubsw m5, m6, [r3 + 2 * 32] ; [18] | |
12672 pmulhrsw m5, m7 | |
12673 pmaddubsw m9, m1, [r3 + 2 * 32] | |
12674 pmulhrsw m9, m7 | |
12675 packuswb m5, m9 | |
12676 | |
12677 pmaddubsw m6, [r3 + 15 * 32] ; [31] | |
12678 pmulhrsw m6, m7 | |
12679 pmaddubsw m1, [r3 + 15 * 32] | |
12680 pmulhrsw m1, m7 | |
12681 packuswb m6, m1 | |
12682 | |
12683 palignr m9, m2, m0, 10 | |
12684 palignr m1, m3, m2, 10 | |
12685 pmaddubsw m8, m9, [r3 - 4 * 32] ; [12] | |
12686 pmulhrsw m8, m7 | |
12687 pmaddubsw m10, m1, [r3 - 4 * 32] | |
12688 pmulhrsw m10, m7 | |
12689 packuswb m8, m10 | |
12690 | |
12691 pmaddubsw m9, [r3 + 9 * 32] ; [25] | |
12692 pmulhrsw m9, m7 | |
12693 pmaddubsw m1, [r3 + 9 * 32] | |
12694 pmulhrsw m1, m7 | |
12695 packuswb m9, m1 | |
12696 | |
; In-place realignment: m3 must be updated first because it still needs
; the unshifted m2; afterwards m2 is shifted using m0.
12697 palignr m3, m2, 12 | |
12698 palignr m2, m0, 12 | |
12699 pmaddubsw m10, m2, [r3 - 10 * 32] ; [6] | |
12700 pmulhrsw m10, m7 | |
12701 pmaddubsw m1, m3, [r3 - 10 * 32] | |
12702 pmulhrsw m1, m7 | |
12703 packuswb m10, m1 | |
12704 | |
; m2/m3 are weighted in place; m3 is then free for the plain load below.
12705 pmaddubsw m2, [r3 + 3 * 32] ; [19] | |
12706 pmulhrsw m2, m7 | |
12707 pmaddubsw m3, [r3 + 3 * 32] | |
12708 pmulhrsw m3, m7 | |
12709 packuswb m2, m3 | |
12710 | |
; Eighth vector of this group is a plain 32-byte load: no table weighting.
12711 movu m3, [r2 + 8] ; [0] | |
12712 | |
; Second group: vectors seven and eight live in m2 and m3 here.
12713 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 8 | |
12714 ret | |
12715 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_6 - AVX2 entry point (cglobal: 3 args, 8 GPRs,
; 13 vector registers).
; Arguments as used here: r0 (copied to r4; presumably the destination),
; r1 (stride, per the comments below), r2 (pointer the helpers load from).
; Runs the two shared mode-6/30 helpers back to back with r7d = 0.
;-----------------------------------------------------------------------
12716 INIT_YMM avx2 | |
12717 cglobal intra_pred_ang32_6, 3,8,13 | |
; This mode reads starting 64 bytes into the buffer at r2; intra_pred_ang32_30
; uses the same helpers without this offset.
12718 add r2, 64 | |
; Point r3 at table row 16 so the helpers reach every row with a small
; signed displacement (r3 - 15 * 32 .. r3 + 15 * 32).
12719 lea r3, [ang_table_avx2 + 32 * 16] | |
12720 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
12721 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
12722 mova m7, [pw_1024] | |
; Keep a copy of the original r0 in r4 for the re-base after the first call.
12723 mov r4, r0 | |
; r7d = 0 selects this mode's path in the helpers (they test r7d on entry).
12724 xor r7d, r7d | |
12725 | |
12726 call ang32_mode_6_30_row_0_15 | |
12727 | |
; Second half is written starting 16 bytes past the position held in r4.
; NOTE(review): this assumes r4 still holds the original r0 after the
; call; the helper/macro internals are not visible here - confirm.
12728 add r4, 16 | |
12729 mov r0, r4 | |
; The second helper expects r2 advanced by 6 bytes.
12730 add r2, 6 | |
12731 | |
12732 call ang32_mode_6_30_row_16_31 | |
12733 RET | |
12734 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_30 - AVX2 entry point (cglobal: 3 args, 8 GPRs,
; 13 vector registers).
; Uses the same two helpers as intra_pred_ang32_6 but with r7d = 1 and
; without the 64-byte offset on r2.
; r0 is not re-based between the two calls, so the second helper continues
; from wherever the first one leaves r0 (presumably advanced by the
; transpose/store macro - confirm against its definition).
;-----------------------------------------------------------------------
12735 INIT_YMM avx2 | |
12736 cglobal intra_pred_ang32_30, 3,8,13 | |
; Point r3 at table row 16 so the helpers reach every row with a small
; signed displacement.
12737 lea r3, [ang_table_avx2 + 32 * 16] | |
12738 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
12739 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
12740 mova m7, [pw_1024] | |
; r7d = 1: the helpers' "test r7d, r7d" will see a non-zero value.
12741 xor r7d, r7d | |
12742 inc r7d | |
12743 | |
12744 call ang32_mode_6_30_row_0_15 | |
12745 | |
; The second helper expects r2 advanced by 6 bytes.
12746 add r2, 6 | |
12747 | |
12748 call ang32_mode_6_30_row_16_31 | |
12749 RET | |
12750 | |
;-----------------------------------------------------------------------
; ang32_mode_7_29_row_0_15
; First-half helper for the mode-7 / mode-29 entry points: builds 16
; result vectors of 32 bytes (two groups of 8) and passes each group to
; TRANSPOSE_32x8_AVX2 (macro defined outside this excerpt).
;
; Expected on entry (intra_pred_ang32_7 in this file sets these up):
;   r2  = reference pointer (intra_pred_ang32_7 passes it advanced by 64)
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + k * 32] is table row k + 16;
;         the bracketed number on each first pmaddubsw is that row index
;   m7  = [pw_1024]
;   r7d = 0 when called from intra_pred_ang32_7; presumably 1 for the
;         mode-29 entry point, as with the other mode pairs - confirm
;
; Per-vector recipe: pmaddubsw weights adjacent byte pairs with one table
; row, pmulhrsw by m7 rescales with rounding, packuswb narrows to bytes.
; palignr by 2 * n bytes moves the pair window n reference bytes forward.
;-----------------------------------------------------------------------
12751 cglobal ang32_mode_7_29_row_0_15 | |
; Only sets flags; no instruction visible in this routine reads them and
; the vector ops below leave EFLAGS alone. Presumably the transpose macro
; branches on ZF - confirm against its definition.
12752 test r7d, r7d | |
12753 ; rows 0 to 7 | |
12754 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12755 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
12756 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
12757 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
12758 | |
; Interleave each byte with its successor so pmaddubsw sees (p[i], p[i+1]).
12759 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
12760 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
12761 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
12762 | |
; The unshifted pair (m0/m2) is shared by three table rows: [9], [18], [27].
12763 pmaddubsw m4, m0, [r3 - 7 * 32] ; [9] | |
12764 pmulhrsw m4, m7 | |
12765 pmaddubsw m1, m2, [r3 - 7 * 32] | |
12766 pmulhrsw m1, m7 | |
12767 packuswb m4, m1 | |
12768 | |
12769 pmaddubsw m5, m0, [r3 + 2 * 32] ; [18] | |
12770 pmulhrsw m5, m7 | |
12771 pmaddubsw m8, m2, [r3 + 2 * 32] | |
12772 pmulhrsw m8, m7 | |
12773 packuswb m5, m8 | |
12774 | |
12775 pmaddubsw m6, m0, [r3 + 11 * 32] ; [27] | |
12776 pmulhrsw m6, m7 | |
12777 pmaddubsw m9, m2, [r3 + 11 * 32] | |
12778 pmulhrsw m9, m7 | |
12779 packuswb m6, m9 | |
12780 | |
; The 2-byte alignment (m11/m1) is shared by four table rows:
; [4], [13], [22], [31].
12781 palignr m11, m2, m0, 2 | |
12782 palignr m1, m3, m2, 2 | |
12783 pmaddubsw m8, m11, [r3 - 12 * 32] ; [4] | |
12784 pmulhrsw m8, m7 | |
12785 pmaddubsw m12, m1, [r3 - 12 * 32] | |
12786 pmulhrsw m12, m7 | |
12787 packuswb m8, m12 | |
12788 | |
12789 pmaddubsw m9, m11, [r3 - 3 * 32] ; [13] | |
12790 pmulhrsw m9, m7 | |
12791 pmaddubsw m12, m1, [r3 - 3 * 32] | |
12792 pmulhrsw m12, m7 | |
12793 packuswb m9, m12 | |
12794 | |
12795 pmaddubsw m10, m11, [r3 + 6 * 32] ; [22] | |
12796 pmulhrsw m10, m7 | |
12797 pmaddubsw m12, m1, [r3 + 6 * 32] | |
12798 pmulhrsw m12, m7 | |
12799 packuswb m10, m12 | |
12800 | |
12801 pmaddubsw m11, [r3 + 15 * 32] ; [31] | |
12802 pmulhrsw m11, m7 | |
12803 pmaddubsw m1, [r3 + 15 * 32] | |
12804 pmulhrsw m1, m7 | |
12805 packuswb m11, m1 | |
12806 | |
12807 palignr m12, m2, m0, 4 | |
12808 palignr m1, m3, m2, 4 | |
12809 pmaddubsw m12, [r3 - 8 * 32] ; [8] | |
12810 pmulhrsw m12, m7 | |
12811 pmaddubsw m1, [r3 - 8 * 32] | |
12812 pmulhrsw m1, m7 | |
12813 packuswb m12, m1 | |
12814 | |
; First group of eight vectors: m4, m5, m6, m8, m9, m10, m11, m12.
; NOTE(review): trailing "1, 0" looks like a scratch register number and
; a 0/8 tag for the macro - confirm against the macro definition.
12815 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
12816 | |
12817 ; rows 8 to 15 | |
; Same 4-byte alignment as the last vector of the first group, rebuilt
; because m12/m1 were consumed above.
12818 palignr m5, m2, m0, 4 | |
12819 palignr m1, m3, m2, 4 | |
12820 pmaddubsw m4, m5, [r3 + 1 * 32] ; [17] | |
12821 pmulhrsw m4, m7 | |
12822 pmaddubsw m8, m1, [r3 + 1 * 32] | |
12823 pmulhrsw m8, m7 | |
12824 packuswb m4, m8 | |
12825 | |
12826 pmaddubsw m5, [r3 + 10 * 32] ; [26] | |
12827 pmulhrsw m5, m7 | |
12828 pmaddubsw m1, [r3 + 10 * 32] | |
12829 pmulhrsw m1, m7 | |
12830 packuswb m5, m1 | |
12831 | |
; The 6-byte alignment (m10/m1) is shared by four table rows:
; [3], [12], [21], [30].
12832 palignr m10, m2, m0, 6 | |
12833 palignr m1, m3, m2, 6 | |
12834 pmaddubsw m6, m10, [r3 - 13 * 32] ; [3] | |
12835 pmulhrsw m6, m7 | |
12836 pmaddubsw m9, m1, [r3 - 13 * 32] | |
12837 pmulhrsw m9, m7 | |
12838 packuswb m6, m9 | |
12839 | |
12840 pmaddubsw m8, m10, [r3 - 4 * 32] ; [12] | |
12841 pmulhrsw m8, m7 | |
12842 pmaddubsw m11, m1, [r3 - 4 * 32] | |
12843 pmulhrsw m11, m7 | |
12844 packuswb m8, m11 | |
12845 | |
12846 pmaddubsw m9, m10, [r3 + 5 * 32] ; [21] | |
12847 pmulhrsw m9, m7 | |
12848 pmaddubsw m11, m1, [r3 + 5 * 32] | |
12849 pmulhrsw m11, m7 | |
12850 packuswb m9, m11 | |
12851 | |
12852 pmaddubsw m10, [r3 + 14 * 32] ; [30] | |
12853 pmulhrsw m10, m7 | |
12854 pmaddubsw m1, [r3 + 14 * 32] | |
12855 pmulhrsw m1, m7 | |
12856 packuswb m10, m1 | |
12857 | |
; In-place realignment: m3 must be updated first because it still needs
; the unshifted m2; afterwards m2 is shifted using m0.
12858 palignr m3, m2, 8 | |
12859 palignr m2, m0, 8 | |
12860 pmaddubsw m11, m2, [r3 - 9 * 32] ; [7] | |
12861 pmulhrsw m11, m7 | |
12862 pmaddubsw m1, m3, [r3 - 9 * 32] | |
12863 pmulhrsw m1, m7 | |
12864 packuswb m11, m1 | |
12865 | |
; m2/m3 are weighted in place: this is their last use in the routine.
12866 pmaddubsw m2, [r3] ; [16] | |
12867 pmulhrsw m2, m7 | |
12868 pmaddubsw m3, [r3] | |
12869 pmulhrsw m3, m7 | |
12870 packuswb m2, m3 | |
12871 | |
; Second group: the eighth vector lives in m2 instead of m12.
12872 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8 | |
12873 ret | |
12874 | |
;-----------------------------------------------------------------------
; ang32_mode_7_29_row_16_31
; Second-half helper for the mode-7 / mode-29 entry points: builds 16
; result vectors of 32 bytes (two groups of 8) and passes each group to
; TRANSPOSE_32x8_AVX2 (macro defined outside this excerpt).
;
; Expected on entry:
;   r2  = reference pointer; NOTE(review): the caller's adjustment before
;         this call is outside this excerpt - confirm at the call site
;   r3  = ang_table_avx2 + 16 * 32 (set by intra_pred_ang32_7), so
;         [r3 + k * 32] is table row k + 16; the bracketed number on each
;         first pmaddubsw is that row index
;   m7  = [pw_1024] (set by intra_pred_ang32_7)
;   r7d = mode selector tested on entry
;
; Per-vector recipe: pmaddubsw weights adjacent byte pairs with one table
; row, pmulhrsw by m7 rescales with rounding, packuswb narrows to bytes.
; palignr by 2 * n bytes moves the pair window n reference bytes forward.
; NOTE(review): the "rows 0 to 7" / "rows 8 to 15" labels below look
; relative to this routine's own 16 outputs, not to the whole block.
;-----------------------------------------------------------------------
12875 cglobal ang32_mode_7_29_row_16_31 | |
; Only sets flags; no instruction visible in this routine reads them and
; the vector ops below leave EFLAGS alone. Presumably the transpose macro
; branches on ZF - confirm against its definition.
12876 test r7d, r7d | |
12877 ; rows 0 to 7 | |
12878 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
12879 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
12880 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
12881 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
12882 | |
; Interleave each byte with its successor so pmaddubsw sees (p[i], p[i+1]).
12883 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
12884 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
12885 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
12886 | |
12887 pmaddubsw m4, m0, [r3 + 9 * 32] ; [25] | |
12888 pmulhrsw m4, m7 | |
12889 pmaddubsw m1, m2, [r3 + 9 * 32] | |
12890 pmulhrsw m1, m7 | |
12891 packuswb m4, m1 | |
12892 | |
; The 2-byte alignment (m9/m1) is shared by four table rows:
; [2], [11], [20], [29].
12893 palignr m9, m2, m0, 2 | |
12894 palignr m1, m3, m2, 2 | |
12895 pmaddubsw m5, m9, [r3 - 14 * 32] ; [2] | |
12896 pmulhrsw m5, m7 | |
12897 pmaddubsw m8, m1, [r3 - 14 * 32] | |
12898 pmulhrsw m8, m7 | |
12899 packuswb m5, m8 | |
12900 | |
12901 pmaddubsw m6, m9, [r3 - 5 * 32] ; [11] | |
12902 pmulhrsw m6, m7 | |
12903 pmaddubsw m10, m1, [r3 - 5 * 32] | |
12904 pmulhrsw m10, m7 | |
12905 packuswb m6, m10 | |
12906 | |
12907 pmaddubsw m8, m9, [r3 + 4 * 32] ; [20] | |
12908 pmulhrsw m8, m7 | |
12909 pmaddubsw m10, m1, [r3 + 4 * 32] | |
12910 pmulhrsw m10, m7 | |
12911 packuswb m8, m10 | |
12912 | |
12913 pmaddubsw m9, [r3 + 13 * 32] ; [29] | |
12914 pmulhrsw m9, m7 | |
12915 pmaddubsw m1, [r3 + 13 * 32] | |
12916 pmulhrsw m1, m7 | |
12917 packuswb m9, m1 | |
12918 | |
; The 4-byte alignment is shared by three table rows: [6], [15], [24].
12919 palignr m12, m2, m0, 4 | |
12920 palignr m1, m3, m2, 4 | |
12921 pmaddubsw m10, m12, [r3 - 10 * 32] ; [6] | |
12922 pmulhrsw m10, m7 | |
12923 pmaddubsw m11, m1, [r3 - 10 * 32] | |
12924 pmulhrsw m11, m7 | |
12925 packuswb m10, m11 | |
12926 | |
12927 pmaddubsw m11, m12, [r3 - 1 * 32] ; [15] | |
12928 pmulhrsw m11, m7 | |
12929 pmaddubsw m1, [r3 - 1 * 32] | |
12930 pmulhrsw m1, m7 | |
12931 packuswb m11, m1 | |
12932 | |
; m1 was overwritten by the pmaddubsw above, so the 4-byte alignment of
; m3:m2 is rebuilt before it is weighted with row [24].
12933 palignr m1, m3, m2, 4 | |
12934 pmaddubsw m12, [r3 + 8 * 32] ; [24] | |
12935 pmulhrsw m12, m7 | |
12936 pmaddubsw m1, [r3 + 8 * 32] | |
12937 pmulhrsw m1, m7 | |
12938 packuswb m12, m1 | |
12939 | |
; First group of eight vectors: m4, m5, m6, m8, m9, m10, m11, m12.
; NOTE(review): trailing "1, 0" looks like a scratch register number and
; a 0/8 tag for the macro - confirm against the macro definition.
12940 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
12941 | |
12942 ; rows 8 to 15 | |
; The 6-byte alignment (m8/m1) is shared by four table rows:
; [1], [10], [19], [28].
12943 palignr m8, m2, m0, 6 | |
12944 palignr m1, m3, m2, 6 | |
12945 pmaddubsw m4, m8, [r3 - 15 * 32] ; [1] | |
12946 pmulhrsw m4, m7 | |
12947 pmaddubsw m9, m1, [r3 - 15 * 32] | |
12948 pmulhrsw m9, m7 | |
12949 packuswb m4, m9 | |
12950 | |
12951 pmaddubsw m5, m8, [r3 - 6 * 32] ; [10] | |
12952 pmulhrsw m5, m7 | |
12953 pmaddubsw m9, m1, [r3 - 6 * 32] | |
12954 pmulhrsw m9, m7 | |
12955 packuswb m5, m9 | |
12956 | |
12957 pmaddubsw m6, m8, [r3 + 3 * 32] ; [19] | |
12958 pmulhrsw m6, m7 | |
12959 pmaddubsw m9, m1, [r3 + 3 * 32] | |
12960 pmulhrsw m9, m7 | |
12961 packuswb m6, m9 | |
12962 | |
12963 pmaddubsw m8, [r3 + 12 * 32] ; [28] | |
12964 pmulhrsw m8, m7 | |
12965 pmaddubsw m1, [r3 + 12 * 32] | |
12966 pmulhrsw m1, m7 | |
12967 packuswb m8, m1 | |
12968 | |
; In-place realignment: m3 must be updated first because it still needs
; the unshifted m2; afterwards m2 is shifted using m0. The realigned pair
; is shared by three table rows: [5], [14], [23].
12969 palignr m3, m2, 8 | |
12970 palignr m2, m0, 8 | |
12971 pmaddubsw m9, m2, [r3 - 11 * 32] ; [5] | |
12972 pmulhrsw m9, m7 | |
12973 pmaddubsw m1, m3, [r3 - 11 * 32] | |
12974 pmulhrsw m1, m7 | |
12975 packuswb m9, m1 | |
12976 | |
12977 pmaddubsw m10, m2, [r3 - 2 * 32] ; [14] | |
12978 pmulhrsw m10, m7 | |
12979 pmaddubsw m1, m3, [r3 - 2 * 32] | |
12980 pmulhrsw m1, m7 | |
12981 packuswb m10, m1 | |
12982 | |
; m2/m3 are weighted in place: this is their last use as pair vectors.
12983 pmaddubsw m2, [r3 + 7 * 32] ; [23] | |
12984 pmulhrsw m2, m7 | |
12985 pmaddubsw m3, [r3 + 7 * 32] | |
12986 pmulhrsw m3, m7 | |
12987 packuswb m2, m3 | |
12988 | |
; Eighth vector of this group is a plain 32-byte load: no table weighting.
12989 movu m1, [r2 + 6] ; [0] | |
12990 | |
; Second group: vectors seven and eight live in m2 and m1 here.
12991 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 1, 0, 8 | |
12992 ret | |
12993 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_7 (AVX2) - 32x32 angular intra prediction, mode 7.
; x86inc cglobal "3,8,13": 3 arguments loaded (r0, r1, r2), 8 GPRs and
; 13 vector registers in use.
; As used below: r0 = destination pointer, r1 = destination stride,
; r2 = reference pixel buffer.
; The prediction itself is produced by the two ang32_mode_7_29_row_*
; helpers, which intra_pred_ang32_29 calls as well. This entry point
; passes r7d = 0, the mode-29 entry point passes r7d = 1.
; NOTE(review): the helpers' entry code is not visible from here;
; presumably they branch on r7d to pick the store layout - confirm.
;-----------------------------------------------------------------------
12994 INIT_YMM avx2 | |
12995 cglobal intra_pred_ang32_7, 3,8,13 | |
; Skip the first 64 bytes of the reference buffer (mode 29 does not).
; NOTE(review): presumably this selects the left-neighbour half of the buffer - confirm against the caller.
12996 add r2, 64 | |
; r3 points 16 entries (32 bytes each) into ang_table_avx2 so the helpers
; can reach entries with signed displacements of the form [r3 +/- k * 32].
12997 lea r3, [ang_table_avx2 + 32 * 16] | |
12998 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
12999 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the helpers for rounding.
13000 mova m7, [pw_1024] | |
; Keep a copy of the destination pointer; it is advanced and reloaded
; into r0 between the two helper calls.
13001 mov r4, r0 | |
; r7d = 0 identifies this caller to the shared helpers.
13002 xor r7d, r7d | |
13003 | |
13004 call ang32_mode_7_29_row_0_15 | |
13005 | |
; Second helper call starts 16 bytes further along the destination row.
13006 add r4, 16 | |
13007 mov r0, r4 | |
; Advance the reference pointer by 4 bytes for the second helper.
13008 add r2, 4 | |
13009 | |
13010 call ang32_mode_7_29_row_16_31 | |
13011 RET | |
13012 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_29 (AVX2) - 32x32 angular intra prediction, mode 29.
; x86inc cglobal "3,8,13": 3 arguments loaded (r0, r1, r2), 8 GPRs and
; 13 vector registers in use.
; Uses the same ang32_mode_7_29_row_* helpers as intra_pred_ang32_7.
; Differences from that entry point, as visible here: r2 is used without
; the +64 offset, r0/r4 are not touched between the helper calls, and
; r7d is set to 1 instead of 0.
; NOTE(review): the helpers' entry code is not visible from here;
; presumably r7d != 0 makes them keep their own r0 bookkeeping - confirm.
;-----------------------------------------------------------------------
13013 INIT_YMM avx2 | |
13014 cglobal intra_pred_ang32_29, 3,8,13 | |
; r3 points 16 entries (32 bytes each) into ang_table_avx2 so the helpers
; can reach entries with signed displacements of the form [r3 +/- k * 32].
13015 lea r3, [ang_table_avx2 + 32 * 16] | |
13016 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
13017 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the helpers for rounding.
13018 mova m7, [pw_1024] | |
; r7d = 1 identifies this caller to the shared helpers.
13019 xor r7d, r7d | |
13020 inc r7d | |
13021 | |
13022 call ang32_mode_7_29_row_0_15 | |
13023 | |
; Advance the reference pointer by 4 bytes for the second helper.
13024 add r2, 4 | |
13025 | |
13026 call ang32_mode_7_29_row_16_31 | |
13027 RET | |
13028 | |
;-----------------------------------------------------------------------
; ang32_mode_8_28_avx2 - shared body for intra_pred_ang32_8 and
; intra_pred_ang32_28. Declared with cglobal without register counts and
; ends in a plain "ret"; it is reached via "call" from those entry points.
;
; Inputs set up by the callers:
;   r0  = destination pointer (consumed by TRANSPOSE_32x8_AVX2)
;   r2  = reference pixels; bytes r2+1 .. r2+49 are read here
;   r3  = ang_table_avx2 + 16 * 32, so [r3 + (f - 16) * 32] is the entry
;         for fraction f (the bracketed number in each comment)
;   r4  = copy of the destination pointer (only meaningful when r7d == 0)
;   m7  = pw_1024, the pmulhrsw rounding multiplier
;   r7d = 0 for the mode-8 caller, 1 for the mode-28 caller
;
; Per the comments, the fraction advances by 5 per output line:
; 5, 10, 15, ... wrapping modulo 32; every wrap moves the source window
; one pixel further (palignr by 2 bytes on the interleaved pairs).
;
; Each output line is built as:
;   pmaddubsw (pixel pairs x table entry) -> pmulhrsw m7 -> packuswb
; NOTE(review): assuming pw_1024 holds words of 1024, pmulhrsw yields
; (x + 16) >> 5; the table entries are presumably (32 - f, f) byte
; pairs - neither table is visible from here.
;
; FLAGS: the "test r7d, r7d" on the first line is consumed by the jnz in
; the "rows 16 to 23" section. None of the SIMD instructions in between
; modify EFLAGS.
; NOTE(review): TRANSPOSE_32x8_AVX2 is defined elsewhere; it must also
; leave ZF intact (and may itself branch on it) - confirm before
; inserting any flag-writing instruction into this routine.
;-----------------------------------------------------------------------
13029 cglobal ang32_mode_8_28_avx2 | |
13030 test r7d, r7d | |
13031 ; rows 0 to 7 | |
13032 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
13033 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
13034 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
13035 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
13036 | |
; Interleave each pixel with its right-hand neighbour so pmaddubsw can
; blend the pair in one step. m0/m2/m3 stay live for the whole routine.
13037 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
13038 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
13039 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
13040 | |
13041 pmaddubsw m4, m0, [r3 - 11 * 32] ; [5] | |
13042 pmulhrsw m4, m7 | |
13043 pmaddubsw m1, m2, [r3 - 11 * 32] | |
13044 pmulhrsw m1, m7 | |
13045 packuswb m4, m1 | |
13046 | |
13047 pmaddubsw m5, m0, [r3 - 6 * 32] ; [10] | |
13048 pmulhrsw m5, m7 | |
13049 pmaddubsw m8, m2, [r3 - 6 * 32] | |
13050 pmulhrsw m8, m7 | |
13051 packuswb m5, m8 | |
13052 | |
13053 pmaddubsw m6, m0, [r3 - 1 * 32] ; [15] | |
13054 pmulhrsw m6, m7 | |
13055 pmaddubsw m9, m2, [r3 - 1 * 32] | |
13056 pmulhrsw m9, m7 | |
13057 packuswb m6, m9 | |
13058 | |
13059 pmaddubsw m8, m0, [r3 + 4 * 32] ; [20] | |
13060 pmulhrsw m8, m7 | |
13061 pmaddubsw m12, m2, [r3 + 4 * 32] | |
13062 pmulhrsw m12, m7 | |
13063 packuswb m8, m12 | |
13064 | |
13065 pmaddubsw m9, m0, [r3 + 9 * 32] ; [25] | |
13066 pmulhrsw m9, m7 | |
13067 pmaddubsw m12, m2, [r3 + 9 * 32] | |
13068 pmulhrsw m12, m7 | |
13069 packuswb m9, m12 | |
13070 | |
13071 pmaddubsw m10, m0, [r3 + 14 * 32] ; [30] | |
13072 pmulhrsw m10, m7 | |
13073 pmaddubsw m12, m2, [r3 + 14 * 32] | |
13074 pmulhrsw m12, m7 | |
13075 packuswb m10, m12 | |
13076 | |
; Fraction wrapped past 32: shift the pair window by one pixel (2 bytes).
13077 palignr m12, m2, m0, 2 | |
13078 palignr m1, m3, m2, 2 | |
13079 pmaddubsw m11, m12, [r3 - 13 * 32] ; [3] | |
13080 pmulhrsw m11, m7 | |
13081 pmaddubsw m1, [r3 - 13 * 32] | |
13082 pmulhrsw m1, m7 | |
13083 packuswb m11, m1 | |
13084 | |
; m1 was overwritten in place above, so the shifted high half is rebuilt.
13085 palignr m1, m3, m2, 2 | |
13086 pmaddubsw m12, [r3 - 8 * 32] ; [8] | |
13087 pmulhrsw m12, m7 | |
13088 pmaddubsw m1, [r3 - 8 * 32] | |
13089 pmulhrsw m1, m7 | |
13090 packuswb m12, m1 | |
13091 | |
; NOTE(review): macro defined elsewhere; by its name it transposes and
; stores the eight lines just computed. The last argument steps
; 0, 8, 0+16, 24 across the four invocations - presumably an output offset.
13092 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0 | |
13093 | |
13094 ; rows 8 to 15 | |
13095 | |
13096 palignr m8, m2, m0, 2 | |
13097 palignr m1, m3, m2, 2 | |
13098 pmaddubsw m4, m8, [r3 - 3 * 32] ; [13] | |
13099 pmulhrsw m4, m7 | |
13100 pmaddubsw m9, m1, [r3 - 3 * 32] | |
13101 pmulhrsw m9, m7 | |
13102 packuswb m4, m9 | |
13103 | |
13104 pmaddubsw m5, m8, [r3 + 2 * 32] ; [18] | |
13105 pmulhrsw m5, m7 | |
13106 pmaddubsw m9, m1, [r3 + 2 * 32] | |
13107 pmulhrsw m9, m7 | |
13108 packuswb m5, m9 | |
13109 | |
13110 pmaddubsw m6, m8, [r3 + 7 * 32] ; [23] | |
13111 pmulhrsw m6, m7 | |
13112 pmaddubsw m9, m1, [r3 + 7 * 32] | |
13113 pmulhrsw m9, m7 | |
13114 packuswb m6, m9 | |
13115 | |
13116 pmaddubsw m8, [r3 + 12 * 32] ; [28] | |
13117 pmulhrsw m8, m7 | |
13118 pmaddubsw m1, [r3 + 12 * 32] | |
13119 pmulhrsw m1, m7 | |
13120 packuswb m8, m1 | |
13121 | |
; Second wrap: window is now two pixels (4 bytes) past the start.
13122 palignr m12, m2, m0, 4 | |
13123 palignr m1, m3, m2, 4 | |
13124 pmaddubsw m9, m12, [r3 - 15 * 32] ; [1] | |
13125 pmulhrsw m9, m7 | |
13126 pmaddubsw m11, m1, [r3 - 15 * 32] | |
13127 pmulhrsw m11, m7 | |
13128 packuswb m9, m11 | |
13129 | |
13130 pmaddubsw m10, m12, [r3 - 10 * 32] ; [6] | |
13131 pmulhrsw m10, m7 | |
13132 pmaddubsw m11, m1, [r3 - 10 * 32] | |
13133 pmulhrsw m11, m7 | |
13134 packuswb m10, m11 | |
13135 | |
13136 pmaddubsw m11, m12, [r3 - 5 * 32] ; [11] | |
13137 pmulhrsw m11, m7 | |
13138 pmaddubsw m1, [r3 - 5 * 32] | |
13139 pmulhrsw m1, m7 | |
13140 packuswb m11, m1 | |
13141 | |
13142 palignr m1, m3, m2, 4 | |
13143 pmaddubsw m12, [r3] ; [16] | |
13144 pmulhrsw m12, m7 | |
13145 pmaddubsw m1, [r3] | |
13146 pmulhrsw m1, m7 | |
13147 packuswb m12, m1 | |
13148 | |
13149 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8 | |
13150 | |
13151 ; rows 16 to 23 | |
13152 | |
; Consumes ZF from the "test r7d, r7d" at entry. When r7d == 0 (mode-8
; caller) the destination moves mmsize/2 = 16 bytes past the saved r4;
; when r7d != 0 (mode-28 caller) r0 and r4 are left alone.
13153 jnz .doNotAdjustBufferPtr | |
13154 lea r4, [r4 + mmsize/2] | |
13155 mov r0, r4 | |
13156 .doNotAdjustBufferPtr: | |
13157 | |
13158 palignr m6, m2, m0, 4 | |
13159 palignr m1, m3, m2, 4 | |
13160 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21] | |
13161 pmulhrsw m4, m7 | |
13162 pmaddubsw m8, m1, [r3 + 5 * 32] | |
13163 pmulhrsw m8, m7 | |
13164 packuswb m4, m8 | |
13165 | |
13166 pmaddubsw m5, m6, [r3 + 10 * 32] ; [26] | |
13167 pmulhrsw m5, m7 | |
13168 pmaddubsw m8, m1, [r3 + 10 * 32] | |
13169 pmulhrsw m8, m7 | |
13170 packuswb m5, m8 | |
13171 | |
13172 pmaddubsw m6, [r3 + 15 * 32] ; [31] | |
13173 pmulhrsw m6, m7 | |
13174 pmaddubsw m1, [r3 + 15 * 32] | |
13175 pmulhrsw m1, m7 | |
13176 packuswb m6, m1 | |
13177 | |
; Third wrap: window is three pixels (6 bytes) past the start.
13178 palignr m12, m2, m0, 6 | |
13179 palignr m1, m3, m2, 6 | |
13180 pmaddubsw m8, m12, [r3 - 12 * 32] ; [4] | |
13181 pmulhrsw m8, m7 | |
13182 pmaddubsw m11, m1, [r3 - 12 * 32] | |
13183 pmulhrsw m11, m7 | |
13184 packuswb m8, m11 | |
13185 | |
13186 pmaddubsw m9, m12, [r3 - 7 * 32] ; [9] | |
13187 pmulhrsw m9, m7 | |
13188 pmaddubsw m11, m1, [r3 - 7 * 32] | |
13189 pmulhrsw m11, m7 | |
13190 packuswb m9, m11 | |
13191 | |
13192 pmaddubsw m10, m12, [r3 - 2 * 32] ; [14] | |
13193 pmulhrsw m10, m7 | |
13194 pmaddubsw m11, m1, [r3 - 2 * 32] | |
13195 pmulhrsw m11, m7 | |
13196 packuswb m10, m11 | |
13197 | |
13198 pmaddubsw m11, m12, [r3 + 3 * 32] ; [19] | |
13199 pmulhrsw m11, m7 | |
13200 pmaddubsw m1, [r3 + 3 * 32] | |
13201 pmulhrsw m1, m7 | |
13202 packuswb m11, m1 | |
13203 | |
13204 palignr m1, m3, m2, 6 | |
13205 pmaddubsw m12, [r3 + 8 * 32] ; [24] | |
13206 pmulhrsw m12, m7 | |
13207 pmaddubsw m1, [r3 + 8 * 32] | |
13208 pmulhrsw m1, m7 | |
13209 packuswb m12, m1 | |
13210 | |
13211 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 16 | |
13212 | |
13213 ; rows 24 to 31 | |
13214 palignr m4, m2, m0, 6 | |
13215 palignr m1, m3, m2, 6 | |
13216 pmaddubsw m4, [r3 + 13 * 32] ; [29] | |
13217 pmulhrsw m4, m7 | |
13218 pmaddubsw m1, [r3 + 13 * 32] | |
13219 pmulhrsw m1, m7 | |
13220 packuswb m4, m1 | |
13221 | |
; Fourth wrap (8 bytes). m2/m3 are overwritten in place because the
; unshifted source registers are not needed again after this point.
13222 palignr m3, m2, 8 | |
13223 palignr m2, m0, 8 | |
13224 pmaddubsw m5, m2, [r3 - 14 * 32] ; [2] | |
13225 pmulhrsw m5, m7 | |
13226 pmaddubsw m9, m3, [r3 - 14 * 32] | |
13227 pmulhrsw m9, m7 | |
13228 packuswb m5, m9 | |
13229 | |
13230 pmaddubsw m6, m2, [r3 - 9 * 32] ; [7] | |
13231 pmulhrsw m6, m7 | |
13232 pmaddubsw m9, m3, [r3 - 9 * 32] | |
13233 pmulhrsw m9, m7 | |
13234 packuswb m6, m9 | |
13235 | |
13236 pmaddubsw m8, m2, [r3 - 4 * 32] ; [12] | |
13237 pmulhrsw m8, m7 | |
13238 pmaddubsw m1, m3, [r3 - 4 * 32] | |
13239 pmulhrsw m1, m7 | |
13240 packuswb m8, m1 | |
13241 | |
13242 pmaddubsw m9, m2, [r3 + 1 * 32] ; [17] | |
13243 pmulhrsw m9, m7 | |
13244 pmaddubsw m11, m3, [r3 + 1 * 32] | |
13245 pmulhrsw m11, m7 | |
13246 packuswb m9, m11 | |
13247 | |
13248 pmaddubsw m10, m2, [r3 + 6 * 32] ; [22] | |
13249 pmulhrsw m10, m7 | |
13250 pmaddubsw m1, m3, [r3 + 6 * 32] | |
13251 pmulhrsw m1, m7 | |
13252 packuswb m10, m1 | |
13253 | |
13254 pmaddubsw m2, [r3 + 11 * 32] ; [27] | |
13255 pmulhrsw m2, m7 | |
13256 pmaddubsw m3, [r3 + 11 * 32] | |
13257 pmulhrsw m3, m7 | |
13258 packuswb m2, m3 | |
13259 | |
; Last line has fraction 0, so no blending is needed: it is a straight
; copy of 32 reference bytes starting at r2 + 6.
13260 movu m3, [r2 + 6] ; [0] | |
13261 | |
13262 TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 24 | |
13263 ret | |
13264 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_8 (AVX2) - 32x32 angular intra prediction, mode 8.
; x86inc cglobal "3,8,13": 3 arguments loaded (r0, r1, r2), 8 GPRs and
; 13 vector registers in use.
; As used below: r0 = destination pointer, r1 = destination stride,
; r2 = reference pixel buffer.
; Sets up the register contract of ang32_mode_8_28_avx2 and calls it
; with r7d = 0; that helper tests r7d and, when it is zero, advances the
; saved destination pointer in r4 half-way through.
;-----------------------------------------------------------------------
13265 INIT_YMM avx2 | |
13266 cglobal intra_pred_ang32_8, 3,8,13 | |
; Skip the first 64 bytes of the reference buffer (mode 28 does not).
; NOTE(review): presumably this selects the left-neighbour half of the buffer - confirm against the caller.
13267 add r2, 64 | |
; r3 points 16 entries (32 bytes each) into ang_table_avx2 so the helper
; can address entries as [r3 +/- k * 32].
13268 lea r3, [ang_table_avx2 + 32 * 16] | |
13269 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
13270 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw rounding multiplier used by the helper.
13271 mova m7, [pw_1024] | |
; r4 keeps the original destination; the helper derives its second-half
; destination from it.
13272 mov r4, r0 | |
; r7d = 0 selects the helper path that adjusts r4/r0.
13273 xor r7d, r7d | |
13274 | |
13275 call ang32_mode_8_28_avx2 | |
13276 RET | |
13277 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_28 (AVX2) - 32x32 angular intra prediction, mode 28.
; x86inc cglobal "3,8,13": 3 arguments loaded (r0, r1, r2), 8 GPRs and
; 13 vector registers in use.
; Calls the same ang32_mode_8_28_avx2 helper as intra_pred_ang32_8, but
; with r7d = 1 and without the +64 reference offset. r4 is not
; initialised here; the helper only touches r4 when r7d == 0.
;-----------------------------------------------------------------------
13278 INIT_YMM avx2 | |
13279 cglobal intra_pred_ang32_28, 3,8,13 | |
; r3 points 16 entries (32 bytes each) into ang_table_avx2 so the helper
; can address entries as [r3 +/- k * 32].
13280 lea r3, [ang_table_avx2 + 32 * 16] | |
13281 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
13282 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw rounding multiplier used by the helper.
13283 mova m7, [pw_1024] | |
; r7d = 1 makes the helper skip its r4/r0 adjustment.
13284 xor r7d, r7d | |
13285 inc r7d | |
13286 | |
13287 call ang32_mode_8_28_avx2 | |
13288 RET | |
13289 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_9 (AVX2) - 32x32 angular intra prediction, mode 9.
; x86inc cglobal "3,5,8": 3 arguments loaded (r0, r1, r2), 5 GPRs and
; 8 vector registers in use.
; r0 = destination (advanced by 4 * stride after every four rows),
; r1 = destination stride, r2 = reference pixel buffer.
;
; Register roles:
;   m0 = first 16 bytes of angHor_tab_9, broadcast to both lanes
;   m1 = next 16 bytes of angHor_tab_9, broadcast to both lanes
;   m2 = pw_1024, pmulhrsw rounding multiplier
;   m7 = ang32_shuf_mode9 pshufb mask
;   m3, m6 = two consecutive 16-byte reference chunks (broadcast)
;   r3 = 3 * stride
;
; Every output row i is produced the same way: take the 16-byte window
; starting i bytes into the (m6:m3) pair with palignr, rearrange it with
; the m7 mask, multiply-add against m0 (-> m4) and m1 (-> m5), round with
; pmulhrsw, pack to bytes and store 32 bytes.
; NOTE(review): the contents of angHor_tab_9 and ang32_shuf_mode9 are not
; visible from here; presumably the mask builds the pixel pairs and the
; table holds the per-column blend weights - confirm.
;-----------------------------------------------------------------------
13290 INIT_YMM avx2 | |
13291 cglobal intra_pred_ang32_9, 3,5,8 | |
13292 vbroadcasti128 m0, [angHor_tab_9] | |
13293 vbroadcasti128 m1, [angHor_tab_9 + mmsize/2] | |
13294 mova m2, [pw_1024] | |
13295 mova m7, [ang32_shuf_mode9] | |
13296 lea r3, [r1 * 3] | |
13297 | |
; mmsize is 32 here, so these read 16 bytes at r2 + 65 and at r2 + 81.
13298 vbroadcasti128 m3, [r2 + mmsize*2 + 1] | |
13299 vbroadcasti128 m6, [r2 + mmsize*2 + 17] | |
13300 | |
; Row 0 uses m3 directly (window offset 0).
13301 pshufb m5, m3, m7 | |
13302 pmaddubsw m4, m5, m0 | |
13303 pmaddubsw m5, m1 | |
13304 pmulhrsw m4, m2 | |
13305 pmulhrsw m5, m2 | |
13306 packuswb m4, m5 | |
13307 movu [r0], m4 | |
13308 | |
; Rows 1 to 15: window offset equals the palignr immediate.
13309 palignr m5, m6, m3, 1 | |
13310 pshufb m5, m7 | |
13311 pmaddubsw m4, m5, m0 | |
13312 pmaddubsw m5, m1 | |
13313 pmulhrsw m4, m2 | |
13314 pmulhrsw m5, m2 | |
13315 packuswb m4, m5 | |
13316 movu [r0 + r1], m4 | |
13317 | |
13318 palignr m5, m6, m3, 2 | |
13319 pshufb m5, m7 | |
13320 pmaddubsw m4, m5, m0 | |
13321 pmaddubsw m5, m1 | |
13322 pmulhrsw m4, m2 | |
13323 pmulhrsw m5, m2 | |
13324 packuswb m4, m5 | |
13325 movu [r0 + r1*2], m4 | |
13326 | |
13327 palignr m5, m6, m3, 3 | |
13328 pshufb m5, m7 | |
13329 pmaddubsw m4, m5, m0 | |
13330 pmaddubsw m5, m1 | |
13331 pmulhrsw m4, m2 | |
13332 pmulhrsw m5, m2 | |
13333 packuswb m4, m5 | |
13334 movu [r0 + r3], m4 | |
13335 | |
13336 lea r0, [r0 + r1 * 4] | |
13337 | |
13338 palignr m5, m6, m3, 4 | |
13339 pshufb m5, m7 | |
13340 pmaddubsw m4, m5, m0 | |
13341 pmaddubsw m5, m1 | |
13342 pmulhrsw m4, m2 | |
13343 pmulhrsw m5, m2 | |
13344 packuswb m4, m5 | |
13345 movu [r0], m4 | |
13346 | |
13347 palignr m5, m6, m3, 5 | |
13348 pshufb m5, m7 | |
13349 pmaddubsw m4, m5, m0 | |
13350 pmaddubsw m5, m1 | |
13351 pmulhrsw m4, m2 | |
13352 pmulhrsw m5, m2 | |
13353 packuswb m4, m5 | |
13354 movu [r0 + r1], m4 | |
13355 | |
13356 palignr m5, m6, m3, 6 | |
13357 pshufb m5, m7 | |
13358 pmaddubsw m4, m5, m0 | |
13359 pmaddubsw m5, m1 | |
13360 pmulhrsw m4, m2 | |
13361 pmulhrsw m5, m2 | |
13362 packuswb m4, m5 | |
13363 movu [r0 + r1*2], m4 | |
13364 | |
13365 palignr m5, m6, m3, 7 | |
13366 pshufb m5, m7 | |
13367 pmaddubsw m4, m5, m0 | |
13368 pmaddubsw m5, m1 | |
13369 pmulhrsw m4, m2 | |
13370 pmulhrsw m5, m2 | |
13371 packuswb m4, m5 | |
13372 movu [r0 + r3], m4 | |
13373 | |
13374 lea r0, [r0 + r1 * 4] | |
13375 | |
13376 palignr m5, m6, m3, 8 | |
13377 pshufb m5, m7 | |
13378 pmaddubsw m4, m5, m0 | |
13379 pmaddubsw m5, m1 | |
13380 pmulhrsw m4, m2 | |
13381 pmulhrsw m5, m2 | |
13382 packuswb m4, m5 | |
13383 movu [r0], m4 | |
13384 | |
13385 palignr m5, m6, m3, 9 | |
13386 pshufb m5, m7 | |
13387 pmaddubsw m4, m5, m0 | |
13388 pmaddubsw m5, m1 | |
13389 pmulhrsw m4, m2 | |
13390 pmulhrsw m5, m2 | |
13391 packuswb m4, m5 | |
13392 movu [r0 + r1], m4 | |
13393 | |
13394 palignr m5, m6, m3, 10 | |
13395 pshufb m5, m7 | |
13396 pmaddubsw m4, m5, m0 | |
13397 pmaddubsw m5, m1 | |
13398 pmulhrsw m4, m2 | |
13399 pmulhrsw m5, m2 | |
13400 packuswb m4, m5 | |
13401 movu [r0 + r1*2], m4 | |
13402 | |
13403 palignr m5, m6, m3, 11 | |
13404 pshufb m5, m7 | |
13405 pmaddubsw m4, m5, m0 | |
13406 pmaddubsw m5, m1 | |
13407 pmulhrsw m4, m2 | |
13408 pmulhrsw m5, m2 | |
13409 packuswb m4, m5 | |
13410 movu [r0 + r3], m4 | |
13411 | |
13412 lea r0, [r0 + r1 * 4] | |
13413 | |
13414 palignr m5, m6, m3, 12 | |
13415 pshufb m5, m7 | |
13416 pmaddubsw m4, m5, m0 | |
13417 pmaddubsw m5, m1 | |
13418 pmulhrsw m4, m2 | |
13419 pmulhrsw m5, m2 | |
13420 packuswb m4, m5 | |
13421 movu [r0], m4 | |
13422 | |
13423 palignr m5, m6, m3, 13 | |
13424 pshufb m5, m7 | |
13425 pmaddubsw m4, m5, m0 | |
13426 pmaddubsw m5, m1 | |
13427 pmulhrsw m4, m2 | |
13428 pmulhrsw m5, m2 | |
13429 packuswb m4, m5 | |
13430 movu [r0 + r1], m4 | |
13431 | |
13432 palignr m5, m6, m3, 14 | |
13433 pshufb m5, m7 | |
13434 pmaddubsw m4, m5, m0 | |
13435 pmaddubsw m5, m1 | |
13436 pmulhrsw m4, m2 | |
13437 pmulhrsw m5, m2 | |
13438 packuswb m4, m5 | |
13439 movu [r0 + r1*2], m4 | |
13440 | |
13441 palignr m5, m6, m3, 15 | |
13442 pshufb m5, m7 | |
13443 pmaddubsw m4, m5, m0 | |
13444 pmaddubsw m5, m1 | |
13445 pmulhrsw m4, m2 | |
13446 pmulhrsw m5, m2 | |
13447 packuswb m4, m5 | |
13448 movu [r0 + r3], m4 | |
13449 | |
13450 lea r0, [r0 + r1 * 4] | |
13451 | |
; Rows 16 to 31: m3 is reloaded with the next 16 reference bytes
; (r2 + 97) and the register roles swap - m6 is now the low chunk and m3
; the high chunk, hence "palignr m5, m3, m6, i" below.
13452 vbroadcasti128 m3, [r2 + mmsize*2 + 33] | |
13453 | |
13454 pshufb m5, m6, m7 | |
13455 pmaddubsw m4, m5, m0 | |
13456 pmaddubsw m5, m1 | |
13457 pmulhrsw m4, m2 | |
13458 pmulhrsw m5, m2 | |
13459 packuswb m4, m5 | |
13460 movu [r0], m4 | |
13461 | |
13462 palignr m5, m3, m6, 1 | |
13463 pshufb m5, m7 | |
13464 pmaddubsw m4, m5, m0 | |
13465 pmaddubsw m5, m1 | |
13466 pmulhrsw m4, m2 | |
13467 pmulhrsw m5, m2 | |
13468 packuswb m4, m5 | |
13469 movu [r0 + r1], m4 | |
13470 | |
13471 palignr m5, m3, m6, 2 | |
13472 pshufb m5, m7 | |
13473 pmaddubsw m4, m5, m0 | |
13474 pmaddubsw m5, m1 | |
13475 pmulhrsw m4, m2 | |
13476 pmulhrsw m5, m2 | |
13477 packuswb m4, m5 | |
13478 movu [r0 + r1*2], m4 | |
13479 | |
13480 palignr m5, m3, m6, 3 | |
13481 pshufb m5, m7 | |
13482 pmaddubsw m4, m5, m0 | |
13483 pmaddubsw m5, m1 | |
13484 pmulhrsw m4, m2 | |
13485 pmulhrsw m5, m2 | |
13486 packuswb m4, m5 | |
13487 movu [r0 + r3], m4 | |
13488 | |
13489 lea r0, [r0 + r1 * 4] | |
13490 | |
13491 palignr m5, m3, m6, 4 | |
13492 pshufb m5, m7 | |
13493 pmaddubsw m4, m5, m0 | |
13494 pmaddubsw m5, m1 | |
13495 pmulhrsw m4, m2 | |
13496 pmulhrsw m5, m2 | |
13497 packuswb m4, m5 | |
13498 movu [r0], m4 | |
13499 | |
13500 palignr m5, m3, m6, 5 | |
13501 pshufb m5, m7 | |
13502 pmaddubsw m4, m5, m0 | |
13503 pmaddubsw m5, m1 | |
13504 pmulhrsw m4, m2 | |
13505 pmulhrsw m5, m2 | |
13506 packuswb m4, m5 | |
13507 movu [r0 + r1], m4 | |
13508 | |
13509 palignr m5, m3, m6, 6 | |
13510 pshufb m5, m7 | |
13511 pmaddubsw m4, m5, m0 | |
13512 pmaddubsw m5, m1 | |
13513 pmulhrsw m4, m2 | |
13514 pmulhrsw m5, m2 | |
13515 packuswb m4, m5 | |
13516 movu [r0 + r1*2], m4 | |
13517 | |
13518 palignr m5, m3, m6, 7 | |
13519 pshufb m5, m7 | |
13520 pmaddubsw m4, m5, m0 | |
13521 pmaddubsw m5, m1 | |
13522 pmulhrsw m4, m2 | |
13523 pmulhrsw m5, m2 | |
13524 packuswb m4, m5 | |
13525 movu [r0 + r3], m4 | |
13526 | |
13527 lea r0, [r0 + r1 * 4] | |
13528 | |
13529 palignr m5, m3, m6, 8 | |
13530 pshufb m5, m7 | |
13531 pmaddubsw m4, m5, m0 | |
13532 pmaddubsw m5, m1 | |
13533 pmulhrsw m4, m2 | |
13534 pmulhrsw m5, m2 | |
13535 packuswb m4, m5 | |
13536 movu [r0], m4 | |
13537 | |
13538 palignr m5, m3, m6, 9 | |
13539 pshufb m5, m7 | |
13540 pmaddubsw m4, m5, m0 | |
13541 pmaddubsw m5, m1 | |
13542 pmulhrsw m4, m2 | |
13543 pmulhrsw m5, m2 | |
13544 packuswb m4, m5 | |
13545 movu [r0 + r1], m4 | |
13546 | |
13547 palignr m5, m3, m6, 10 | |
13548 pshufb m5, m7 | |
13549 pmaddubsw m4, m5, m0 | |
13550 pmaddubsw m5, m1 | |
13551 pmulhrsw m4, m2 | |
13552 pmulhrsw m5, m2 | |
13553 packuswb m4, m5 | |
13554 movu [r0 + r1*2], m4 | |
13555 | |
13556 palignr m5, m3, m6, 11 | |
13557 pshufb m5, m7 | |
13558 pmaddubsw m4, m5, m0 | |
13559 pmaddubsw m5, m1 | |
13560 pmulhrsw m4, m2 | |
13561 pmulhrsw m5, m2 | |
13562 packuswb m4, m5 | |
13563 movu [r0 + r3], m4 | |
13564 | |
13565 lea r0, [r0 + r1 * 4] | |
13566 | |
13567 palignr m5, m3, m6, 12 | |
13568 pshufb m5, m7 | |
13569 pmaddubsw m4, m5, m0 | |
13570 pmaddubsw m5, m1 | |
13571 pmulhrsw m4, m2 | |
13572 pmulhrsw m5, m2 | |
13573 packuswb m4, m5 | |
13574 movu [r0], m4 | |
13575 | |
13576 palignr m5, m3, m6, 13 | |
13577 pshufb m5, m7 | |
13578 pmaddubsw m4, m5, m0 | |
13579 pmaddubsw m5, m1 | |
13580 pmulhrsw m4, m2 | |
13581 pmulhrsw m5, m2 | |
13582 packuswb m4, m5 | |
13583 movu [r0 + r1], m4 | |
13584 | |
13585 palignr m5, m3, m6, 14 | |
13586 pshufb m5, m7 | |
13587 pmaddubsw m4, m5, m0 | |
13588 pmaddubsw m5, m1 | |
13589 pmulhrsw m4, m2 | |
13590 pmulhrsw m5, m2 | |
13591 packuswb m4, m5 | |
13592 movu [r0 + r1*2], m4 | |
13593 | |
13594 palignr m5, m3, m6, 15 | |
13595 pshufb m5, m7 | |
13596 pmaddubsw m4, m5, m0 | |
13597 pmaddubsw m5, m1 | |
13598 pmulhrsw m4, m2 | |
13599 pmulhrsw m5, m2 | |
13600 packuswb m4, m5 | |
13601 movu [r0 + r3], m4 | |
13602 RET | |
13603 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_27 (AVX2) - 32x32 angular intra prediction, mode 27.
; x86inc cglobal "3,5,6": 3 arguments loaded (r0, r1, r2), 5 GPRs and
; 6 vector registers in use. No INIT_YMM of its own: it relies on the
; ymm/avx2 setting still in effect from earlier in the file.
; r0 = destination (advanced by 4 * stride after every four rows),
; r1 = destination stride, r2 = reference pixel buffer.
;
; Register roles:
;   r3 = ang_table_avx2 + 16 * 32, so [r3 + (f - 16) * 32] is the entry
;        for fraction f (the bracketed number in each comment)
;   r4 = 3 * stride
;   m5 = pw_1024, pmulhrsw rounding multiplier
;   m0/m2 = interleaved pixel pairs (low / high part of the row),
;   m3    = pairs for the following pixels, used once the window shifts
;
; Per the comments the fraction advances by 2 per row: rows 0-14 use
; fractions 2..30, row 15 has fraction 0 and is a plain copy from
; r2 + 2; the window then shifts by one pixel and rows 16-30 repeat
; fractions 2..30, with row 31 a plain copy from r2 + 3.
; Rows are stored directly; there is no transpose step.
;-----------------------------------------------------------------------
13604 cglobal intra_pred_ang32_27, 3,5,6 | |
13605 lea r3, [ang_table_avx2 + 32 * 16] | |
13606 lea r4, [r1 * 3] ; r4 -> 3 * stride | |
13607 mova m5, [pw_1024] | |
13608 | |
13609 ; rows 0 to 7 | |
13610 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
13611 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
13612 movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] | |
13613 movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18] | |
13614 | |
; Interleave each pixel with its right-hand neighbour for pmaddubsw.
13615 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
13616 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
13617 punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17] | |
13618 | |
13619 pmaddubsw m4, m0, [r3 - 14 * 32] ; [2] | |
13620 pmulhrsw m4, m5 | |
13621 pmaddubsw m1, m2, [r3 - 14 * 32] | |
13622 pmulhrsw m1, m5 | |
13623 packuswb m4, m1 | |
13624 movu [r0], m4 | |
13625 | |
13626 pmaddubsw m4, m0, [r3 - 12 * 32] ; [4] | |
13627 pmulhrsw m4, m5 | |
13628 pmaddubsw m1, m2, [r3 - 12 * 32] | |
13629 pmulhrsw m1, m5 | |
13630 packuswb m4, m1 | |
13631 movu [r0 + r1], m4 | |
13632 | |
13633 pmaddubsw m4, m0, [r3 - 10 * 32] ; [6] | |
13634 pmulhrsw m4, m5 | |
13635 pmaddubsw m1, m2, [r3 - 10 * 32] | |
13636 pmulhrsw m1, m5 | |
13637 packuswb m4, m1 | |
13638 movu [r0 + r1*2], m4 | |
13639 | |
13640 pmaddubsw m4, m0, [r3 - 8 * 32] ; [8] | |
13641 pmulhrsw m4, m5 | |
13642 pmaddubsw m1, m2, [r3 - 8 * 32] | |
13643 pmulhrsw m1, m5 | |
13644 packuswb m4, m1 | |
13645 movu [r0 + r4], m4 | |
13646 | |
13647 lea r0, [r0 + r1 * 4] | |
13648 | |
13649 pmaddubsw m4, m0, [r3 - 6 * 32] ; [10] | |
13650 pmulhrsw m4, m5 | |
13651 pmaddubsw m1, m2, [r3 - 6 * 32] | |
13652 pmulhrsw m1, m5 | |
13653 packuswb m4, m1 | |
13654 movu [r0], m4 | |
13655 | |
13656 pmaddubsw m4, m0, [r3 - 4 * 32] ; [12] | |
13657 pmulhrsw m4, m5 | |
13658 pmaddubsw m1, m2, [r3 - 4 * 32] | |
13659 pmulhrsw m1, m5 | |
13660 packuswb m4, m1 | |
13661 movu [r0 + r1], m4 | |
13662 | |
13663 pmaddubsw m4, m0, [r3 - 2 * 32] ; [14] | |
13664 pmulhrsw m4, m5 | |
13665 pmaddubsw m1, m2, [r3 - 2 * 32] | |
13666 pmulhrsw m1, m5 | |
13667 packuswb m4, m1 | |
13668 movu [r0 + r1*2], m4 | |
13669 | |
13670 pmaddubsw m4, m0, [r3] ; [16] | |
13671 pmulhrsw m4, m5 | |
13672 pmaddubsw m1, m2, [r3] | |
13673 pmulhrsw m1, m5 | |
13674 packuswb m4, m1 | |
13675 movu [r0 + r4], m4 | |
13676 | |
13677 lea r0, [r0 + r1 * 4] | |
13678 | |
13679 ; rows 8 to 15 | |
13680 pmaddubsw m4, m0, [r3 + 2 * 32] ; [18] | |
13681 pmulhrsw m4, m5 | |
13682 pmaddubsw m1, m2, [r3 + 2 * 32] | |
13683 pmulhrsw m1, m5 | |
13684 packuswb m4, m1 | |
13685 movu [r0], m4 | |
13686 | |
13687 pmaddubsw m4, m0, [r3 + 4 * 32] ; [20] | |
13688 pmulhrsw m4, m5 | |
13689 pmaddubsw m1, m2, [r3 + 4 * 32] | |
13690 pmulhrsw m1, m5 | |
13691 packuswb m4, m1 | |
13692 movu [r0 + r1], m4 | |
13693 | |
13694 pmaddubsw m4, m0, [r3 + 6 * 32] ; [22] | |
13695 pmulhrsw m4, m5 | |
13696 pmaddubsw m1, m2, [r3 + 6 * 32] | |
13697 pmulhrsw m1, m5 | |
13698 packuswb m4, m1 | |
13699 movu [r0 + r1*2], m4 | |
13700 | |
13701 pmaddubsw m4, m0, [r3 + 8 * 32] ; [24] | |
13702 pmulhrsw m4, m5 | |
13703 pmaddubsw m1, m2, [r3 + 8 * 32] | |
13704 pmulhrsw m1, m5 | |
13705 packuswb m4, m1 | |
13706 movu [r0 + r4], m4 | |
13707 | |
13708 lea r0, [r0 + r1 * 4] | |
13709 | |
13710 pmaddubsw m4, m0, [r3 + 10 * 32] ; [26] | |
13711 pmulhrsw m4, m5 | |
13712 pmaddubsw m1, m2, [r3 + 10 * 32] | |
13713 pmulhrsw m1, m5 | |
13714 packuswb m4, m1 | |
13715 movu [r0], m4 | |
13716 | |
13717 pmaddubsw m4, m0, [r3 + 12 * 32] ; [28] | |
13718 pmulhrsw m4, m5 | |
13719 pmaddubsw m1, m2, [r3 + 12 * 32] | |
13720 pmulhrsw m1, m5 | |
13721 packuswb m4, m1 | |
13722 movu [r0 + r1], m4 | |
13723 | |
13724 pmaddubsw m4, m0, [r3 + 14 * 32] ; [30] | |
13725 pmulhrsw m4, m5 | |
13726 pmaddubsw m1, m2, [r3 + 14 * 32] | |
13727 pmulhrsw m1, m5 | |
13728 packuswb m4, m1 | |
13729 movu [r0 + r1*2], m4 | |
13730 | |
; Shift the pair window by one pixel (2 bytes) for rows 16-31. m3 must be
; updated before m2 because it still needs the old m2 as its low source.
13731 palignr m3, m2, 2 | |
13732 palignr m2, m0, 2 | |
; Row 15 has fraction 0: straight copy of 32 reference bytes from r2 + 2.
13733 movu m1, [r2 + 2] ; [0] | |
13734 movu [r0 + r4], m1 | |
13735 | |
13736 lea r0, [r0 + r1 * 4] | |
13737 | |
13738 ; rows 16 to 23 | |
13739 pmaddubsw m4, m2, [r3 - 14 * 32] ; [2] | |
13740 pmulhrsw m4, m5 | |
13741 pmaddubsw m1, m3, [r3 - 14 * 32] | |
13742 pmulhrsw m1, m5 | |
13743 packuswb m4, m1 | |
13744 movu [r0], m4 | |
13745 | |
13746 pmaddubsw m4, m2, [r3 - 12 * 32] ; [4] | |
13747 pmulhrsw m4, m5 | |
13748 pmaddubsw m1, m3, [r3 - 12 * 32] | |
13749 pmulhrsw m1, m5 | |
13750 packuswb m4, m1 | |
13751 movu [r0 + r1], m4 | |
13752 | |
13753 pmaddubsw m4, m2, [r3 - 10 * 32] ; [6] | |
13754 pmulhrsw m4, m5 | |
13755 pmaddubsw m1, m3, [r3 - 10 * 32] | |
13756 pmulhrsw m1, m5 | |
13757 packuswb m4, m1 | |
13758 movu [r0 + r1*2], m4 | |
13759 | |
13760 pmaddubsw m4, m2, [r3 - 8 * 32] ; [8] | |
13761 pmulhrsw m4, m5 | |
13762 pmaddubsw m1, m3, [r3 - 8 * 32] | |
13763 pmulhrsw m1, m5 | |
13764 packuswb m4, m1 | |
13765 movu [r0 + r4], m4 | |
13766 | |
13767 lea r0, [r0 + r1 * 4] | |
13768 | |
13769 pmaddubsw m4, m2, [r3 - 6 * 32] ; [10] | |
13770 pmulhrsw m4, m5 | |
13771 pmaddubsw m1, m3, [r3 - 6 * 32] | |
13772 pmulhrsw m1, m5 | |
13773 packuswb m4, m1 | |
13774 movu [r0], m4 | |
13775 | |
13776 pmaddubsw m4, m2, [r3 - 4 * 32] ; [12] | |
13777 pmulhrsw m4, m5 | |
13778 pmaddubsw m1, m3, [r3 - 4 * 32] | |
13779 pmulhrsw m1, m5 | |
13780 packuswb m4, m1 | |
13781 movu [r0 + r1], m4 | |
13782 | |
13783 pmaddubsw m4, m2, [r3 - 2 * 32] ; [14] | |
13784 pmulhrsw m4, m5 | |
13785 pmaddubsw m1, m3, [r3 - 2 * 32] | |
13786 pmulhrsw m1, m5 | |
13787 packuswb m4, m1 | |
13788 movu [r0 + r1*2], m4 | |
13789 | |
13790 pmaddubsw m4, m2, [r3] ; [16] | |
13791 pmulhrsw m4, m5 | |
13792 pmaddubsw m1, m3, [r3] | |
13793 pmulhrsw m1, m5 | |
13794 packuswb m4, m1 | |
13795 movu [r0 + r4], m4 | |
13796 | |
13797 lea r0, [r0 + r1 * 4] | |
13798 | |
13799 ; rows 24 to 31 | |
13800 pmaddubsw m4, m2, [r3 + 2 * 32] ; [18] | |
13801 pmulhrsw m4, m5 | |
13802 pmaddubsw m1, m3, [r3 + 2 * 32] | |
13803 pmulhrsw m1, m5 | |
13804 packuswb m4, m1 | |
13805 movu [r0], m4 | |
13806 | |
13807 pmaddubsw m4, m2, [r3 + 4 * 32] ; [20] | |
13808 pmulhrsw m4, m5 | |
13809 pmaddubsw m1, m3, [r3 + 4 * 32] | |
13810 pmulhrsw m1, m5 | |
13811 packuswb m4, m1 | |
13812 movu [r0 + r1], m4 | |
13813 | |
13814 pmaddubsw m4, m2, [r3 + 6 * 32] ; [22] | |
13815 pmulhrsw m4, m5 | |
13816 pmaddubsw m1, m3, [r3 + 6 * 32] | |
13817 pmulhrsw m1, m5 | |
13818 packuswb m4, m1 | |
13819 movu [r0 + r1*2], m4 | |
13820 | |
13821 pmaddubsw m4, m2, [r3 + 8 * 32] ; [24] | |
13822 pmulhrsw m4, m5 | |
13823 pmaddubsw m1, m3, [r3 + 8 * 32] | |
13824 pmulhrsw m1, m5 | |
13825 packuswb m4, m1 | |
13826 movu [r0 + r4], m4 | |
13827 | |
13828 lea r0, [r0 + r1 * 4] | |
13829 | |
13830 pmaddubsw m4, m2, [r3 + 10 * 32] ; [26] | |
13831 pmulhrsw m4, m5 | |
13832 pmaddubsw m1, m3, [r3 + 10 * 32] | |
13833 pmulhrsw m1, m5 | |
13834 packuswb m4, m1 | |
13835 movu [r0], m4 | |
13836 | |
13837 pmaddubsw m4, m2, [r3 + 12 * 32] ; [28] | |
13838 pmulhrsw m4, m5 | |
13839 pmaddubsw m1, m3, [r3 + 12 * 32] | |
13840 pmulhrsw m1, m5 | |
13841 packuswb m4, m1 | |
13842 movu [r0 + r1], m4 | |
13843 | |
; Last blended row: m2/m3 are consumed in place since nothing reads them afterwards.
13844 pmaddubsw m2, [r3 + 14 * 32] ; [30] | |
13845 pmulhrsw m2, m5 | |
13846 pmaddubsw m3, [r3 + 14 * 32] | |
13847 pmulhrsw m3, m5 | |
13848 packuswb m2, m3 | |
13849 movu [r0 + r1*2], m2 | |
13850 | |
; Row 31 has fraction 0: straight copy of 32 reference bytes from r2 + 3.
13851 movu m1, [r2 + 3] ; [0] | |
13852 movu [r0 + r4], m1 | |
13853 RET | |
13854 | |
;-----------------------------------------------------------------------
; intra_pred_ang32_10 (AVX2) - 32x32 intra prediction, mode 10.
; x86inc cglobal "5,5,4": 5 arguments loaded, 5 GPRs and 4 vector
; registers in use. Only r0, r1 and r2 are read as arguments below; r4 is
; overwritten with 3 * stride straight away. No INIT_YMM of its own: it
; relies on the ymm/avx2 setting still in effect from earlier in the file.
; r0 = destination (advanced by 4 * stride after every four rows),
; r1 = destination stride, r2 = reference pixel buffer.
;
; Each output row is a single reference byte replicated across all 32
; bytes of the row:
;   m2 = 16 reference bytes, broadcast to both 128-bit lanes
;   m0 = pshufb index vector, every byte equal to the current row index
;        within the 16-byte chunk (starts at 0, +1 per row via pb_1)
;   m3 = pshufb(m2, m0) = byte m0 of the chunk in every position
; Rows 0-15 take their byte from r2 + 65 + row, rows 16-31 from
; r2 + 81 + (row - 16)  (mmsize is 32 here).
;-----------------------------------------------------------------------
13855 cglobal intra_pred_ang32_10, 5,5,4 | |
13856 pxor m0, m0 | |
13857 mova m1, [pb_1] | |
13858 lea r4, [r1 * 3] | |
13859 | |
; First 16 reference bytes, at r2 + 65.
13860 vbroadcasti128 m2, [r2 + mmsize*2 + 1] | |
13861 | |
13862 pshufb m3, m2, m0 | |
13863 movu [r0], m3 | |
13864 paddb m0, m1 | |
13865 pshufb m3, m2, m0 | |
13866 movu [r0 + r1], m3 | |
13867 paddb m0, m1 | |
13868 pshufb m3, m2, m0 | |
13869 movu [r0 + r1 * 2], m3 | |
13870 paddb m0, m1 | |
13871 pshufb m3, m2, m0 | |
13872 movu [r0 + r4], m3 | |
13873 | |
13874 lea r0, [r0 + r1 * 4] | |
13875 | |
13876 paddb m0, m1 | |
13877 pshufb m3, m2, m0 | |
13878 movu [r0], m3 | |
13879 paddb m0, m1 | |
13880 pshufb m3, m2, m0 | |
13881 movu [r0 + r1], m3 | |
13882 paddb m0, m1 | |
13883 pshufb m3, m2, m0 | |
13884 movu [r0 + r1 * 2], m3 | |
13885 paddb m0, m1 | |
13886 pshufb m3, m2, m0 | |
13887 movu [r0 + r4], m3 | |
13888 | |
13889 lea r0, [r0 + r1 * 4] | |
13890 | |
13891 paddb m0, m1 | |
13892 pshufb m3, m2, m0 | |
13893 movu [r0], m3 | |
13894 paddb m0, m1 | |
13895 pshufb m3, m2, m0 | |
13896 movu [r0 + r1], m3 | |
13897 paddb m0, m1 | |
13898 pshufb m3, m2, m0 | |
13899 movu [r0 + r1 * 2], m3 | |
13900 paddb m0, m1 | |
13901 pshufb m3, m2, m0 | |
13902 movu [r0 + r4], m3 | |
13903 | |
13904 lea r0, [r0 + r1 * 4] | |
13905 | |
13906 paddb m0, m1 | |
13907 pshufb m3, m2, m0 | |
13908 movu [r0], m3 | |
13909 paddb m0, m1 | |
13910 pshufb m3, m2, m0 | |
13911 movu [r0 + r1], m3 | |
13912 paddb m0, m1 | |
13913 pshufb m3, m2, m0 | |
13914 movu [r0 + r1 * 2], m3 | |
13915 paddb m0, m1 | |
13916 pshufb m3, m2, m0 | |
13917 movu [r0 + r4], m3 | |
13918 | |
13919 lea r0, [r0 + r1 * 4] | |
; Rows 16 to 31: restart the index at 0 and load the next 16 reference
; bytes, at r2 + 81.
13920 pxor m0, m0 | |
13921 vbroadcasti128 m2, [r2 + mmsize*2 + mmsize/2 + 1] | |
13922 | |
13923 pshufb m3, m2, m0 | |
13924 movu [r0], m3 | |
13925 paddb m0, m1 | |
13926 pshufb m3, m2, m0 | |
13927 movu [r0 + r1], m3 | |
13928 paddb m0, m1 | |
13929 pshufb m3, m2, m0 | |
13930 movu [r0 + r1 * 2], m3 | |
13931 paddb m0, m1 | |
13932 pshufb m3, m2, m0 | |
13933 movu [r0 + r4], m3 | |
13934 | |
13935 lea r0, [r0 + r1 * 4] | |
13936 | |
13937 paddb m0, m1 | |
13938 pshufb m3, m2, m0 | |
13939 movu [r0], m3 | |
13940 paddb m0, m1 | |
13941 pshufb m3, m2, m0 | |
13942 movu [r0 + r1], m3 | |
13943 paddb m0, m1 | |
13944 pshufb m3, m2, m0 | |
13945 movu [r0 + r1 * 2], m3 | |
13946 paddb m0, m1 | |
13947 pshufb m3, m2, m0 | |
13948 movu [r0 + r4], m3 | |
13949 | |
13950 lea r0, [r0 + r1 * 4] | |
13951 | |
13952 paddb m0, m1 | |
13953 pshufb m3, m2, m0 | |
13954 movu [r0], m3 | |
13955 paddb m0, m1 | |
13956 pshufb m3, m2, m0 | |
13957 movu [r0 + r1], m3 | |
13958 paddb m0, m1 | |
13959 pshufb m3, m2, m0 | |
13960 movu [r0 + r1 * 2], m3 | |
13961 paddb m0, m1 | |
13962 pshufb m3, m2, m0 | |
13963 movu [r0 + r4], m3 | |
13964 | |
13965 lea r0, [r0 + r1 * 4] | |
13966 | |
13967 paddb m0, m1 | |
13968 pshufb m3, m2, m0 | |
13969 movu [r0], m3 | |
13970 paddb m0, m1 | |
13971 pshufb m3, m2, m0 | |
13972 movu [r0 + r1], m3 | |
13973 paddb m0, m1 | |
13974 pshufb m3, m2, m0 | |
13975 movu [r0 + r1 * 2], m3 | |
13976 paddb m0, m1 | |
13977 pshufb m3, m2, m0 | |
13978 movu [r0 + r4], m3 | |
13979 RET | |
13980 | |
13981 cglobal intra_pred_ang32_11, 3,4,8 | |
13982 vbroadcasti128 m0, [angHor_tab_11] | |
13983 vbroadcasti128 m1, [angHor_tab_11 + mmsize/2] | |
13984 mova m2, [pw_1024] | |
13985 mova m7, [ang32_shuf_mode11] | |
13986 lea r3, [r1 * 3] | |
13987 | |
13988 ; prepare for [16 0 -1 -2 ...] | |
13989 movu xm3, [r2 + mmsize*2 - 1] | |
13990 vbroadcasti128 m6, [r2 + mmsize*2 + 15] | |
13991 | |
13992 pinsrb xm3, [r2 + 0], 1 | |
13993 pinsrb xm3, [r2 + 16], 0 | |
13994 vinserti128 m3, m3, xm3, 1 ; [16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] | |
13995 | |
13996 pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 16 0 16 0 16 0 16 0 16 0 16 0 16 0 16 0] | |
13997 pmaddubsw m4, m5, m0 | |
13998 pmaddubsw m5, m1 | |
13999 pmulhrsw m4, m2 | |
14000 pmulhrsw m5, m2 | |
14001 packuswb m4, m5 | |
14002 movu [r0], m4 | |
14003 | |
14004 palignr m5, m6, m3, 1 | |
14005 pshufb m5, m7 | |
14006 pmaddubsw m4, m5, m0 | |
14007 pmaddubsw m5, m1 | |
14008 pmulhrsw m4, m2 | |
14009 pmulhrsw m5, m2 | |
14010 packuswb m4, m5 | |
14011 movu [r0 + r1], m4 | |
14012 | |
14013 palignr m5, m6, m3, 2 | |
14014 pshufb m5, m7 | |
14015 pmaddubsw m4, m5, m0 | |
14016 pmaddubsw m5, m1 | |
14017 pmulhrsw m4, m2 | |
14018 pmulhrsw m5, m2 | |
14019 packuswb m4, m5 | |
14020 movu [r0 + r1 * 2], m4 | |
14021 | |
14022 palignr m5, m6, m3, 3 | |
14023 pshufb m5, m7 | |
14024 pmaddubsw m4, m5, m0 | |
14025 pmaddubsw m5, m1 | |
14026 pmulhrsw m4, m2 | |
14027 pmulhrsw m5, m2 | |
14028 packuswb m4, m5 | |
14029 movu [r0 + r3], m4 | |
14030 | |
14031 lea r0, [r0 + r1 * 4] | |
14032 | |
14033 palignr m5, m6, m3, 4 | |
14034 pshufb m5, m7 | |
14035 pmaddubsw m4, m5, m0 | |
14036 pmaddubsw m5, m1 | |
14037 pmulhrsw m4, m2 | |
14038 pmulhrsw m5, m2 | |
14039 packuswb m4, m5 | |
14040 movu [r0], m4 | |
14041 | |
14042 palignr m5, m6, m3, 5 | |
14043 pshufb m5, m7 | |
14044 pmaddubsw m4, m5, m0 | |
14045 pmaddubsw m5, m1 | |
14046 pmulhrsw m4, m2 | |
14047 pmulhrsw m5, m2 | |
14048 packuswb m4, m5 | |
14049 movu [r0 + r1], m4 | |
14050 | |
14051 palignr m5, m6, m3, 6 | |
14052 pshufb m5, m7 | |
14053 pmaddubsw m4, m5, m0 | |
14054 pmaddubsw m5, m1 | |
14055 pmulhrsw m4, m2 | |
14056 pmulhrsw m5, m2 | |
14057 packuswb m4, m5 | |
14058 movu [r0 + r1 * 2], m4 | |
14059 | |
14060 palignr m5, m6, m3, 7 | |
14061 pshufb m5, m7 | |
14062 pmaddubsw m4, m5, m0 | |
14063 pmaddubsw m5, m1 | |
14064 pmulhrsw m4, m2 | |
14065 pmulhrsw m5, m2 | |
14066 packuswb m4, m5 | |
14067 movu [r0 + r3], m4 | |
14068 | |
14069 lea r0, [r0 + r1 * 4] | |
14070 | |
14071 palignr m5, m6, m3, 8 | |
14072 pshufb m5, m7 | |
14073 pmaddubsw m4, m5, m0 | |
14074 pmaddubsw m5, m1 | |
14075 pmulhrsw m4, m2 | |
14076 pmulhrsw m5, m2 | |
14077 packuswb m4, m5 | |
14078 movu [r0], m4 | |
14079 | |
14080 palignr m5, m6, m3, 9 | |
14081 pshufb m5, m7 | |
14082 pmaddubsw m4, m5, m0 | |
14083 pmaddubsw m5, m1 | |
14084 pmulhrsw m4, m2 | |
14085 pmulhrsw m5, m2 | |
14086 packuswb m4, m5 | |
14087 movu [r0 + r1], m4 | |
14088 | |
14089 palignr m5, m6, m3, 10 | |
14090 pshufb m5, m7 | |
14091 | |
14092 pmaddubsw m4, m5, m0 | |
14093 pmaddubsw m5, m1 | |
14094 pmulhrsw m4, m2 | |
14095 pmulhrsw m5, m2 | |
14096 packuswb m4, m5 | |
14097 movu [r0 + r1 * 2], m4 | |
14098 | |
14099 palignr m5, m6, m3, 11 | |
14100 pshufb m5, m7 | |
14101 pmaddubsw m4, m5, m0 | |
14102 pmaddubsw m5, m1 | |
14103 pmulhrsw m4, m2 | |
14104 pmulhrsw m5, m2 | |
14105 packuswb m4, m5 | |
14106 movu [r0 + r3], m4 | |
14107 | |
14108 lea r0, [r0 + r1 * 4] | |
14109 | |
14110 palignr m5, m6, m3, 12 | |
14111 pshufb m5, m7 | |
14112 pmaddubsw m4, m5, m0 | |
14113 pmaddubsw m5, m1 | |
14114 pmulhrsw m4, m2 | |
14115 pmulhrsw m5, m2 | |
14116 packuswb m4, m5 | |
14117 movu [r0], m4 | |
14118 | |
14119 palignr m5, m6, m3, 13 | |
14120 pshufb m5, m7 | |
14121 pmaddubsw m4, m5, m0 | |
14122 pmaddubsw m5, m1 | |
14123 pmulhrsw m4, m2 | |
14124 pmulhrsw m5, m2 | |
14125 packuswb m4, m5 | |
14126 movu [r0 + r1], m4 | |
14127 | |
14128 palignr m5, m6, m3, 14 | |
14129 pshufb m5, m7 | |
14130 pmaddubsw m4, m5, m0 | |
14131 pmaddubsw m5, m1 | |
14132 pmulhrsw m4, m2 | |
14133 pmulhrsw m5, m2 | |
14134 packuswb m4, m5 | |
14135 movu [r0 + r1 * 2], m4 | |
14136 | |
14137 palignr m5, m6, m3, 15 | |
14138 pshufb m5, m7 | |
14139 pmaddubsw m4, m5, m0 | |
14140 pmaddubsw m5, m1 | |
14141 pmulhrsw m4, m2 | |
14142 pmulhrsw m5, m2 | |
14143 packuswb m4, m5 | |
14144 movu [r0 + r3], m4 | |
14145 | |
14146 lea r0, [r0 + r1 * 4] | |
14147 | |
14148 mova m3, m6 | |
14149 vbroadcasti128 m6, [r2 + mmsize*2 + 15 + 16] | |
14150 pshufb m5, m3, m7 | |
14151 pmaddubsw m4, m5, m0 | |
14152 pmaddubsw m5, m1 | |
14153 pmulhrsw m4, m2 | |
14154 pmulhrsw m5, m2 | |
14155 packuswb m4, m5 | |
14156 movu [r0], m4 | |
14157 | |
14158 palignr m5, m6, m3, 1 | |
14159 pshufb m5, m7 | |
14160 pmaddubsw m4, m5, m0 | |
14161 pmaddubsw m5, m1 | |
14162 pmulhrsw m4, m2 | |
14163 pmulhrsw m5, m2 | |
14164 packuswb m4, m5 | |
14165 movu [r0 + r1], m4 | |
14166 | |
14167 palignr m5, m6, m3, 2 | |
14168 pshufb m5, m7 | |
14169 pmaddubsw m4, m5, m0 | |
14170 pmaddubsw m5, m1 | |
14171 pmulhrsw m4, m2 | |
14172 pmulhrsw m5, m2 | |
14173 packuswb m4, m5 | |
14174 movu [r0 + r1 * 2], m4 | |
14175 | |
14176 palignr m5, m6, m3, 3 | |
14177 pshufb m5, m7 | |
14178 pmaddubsw m4, m5, m0 | |
14179 pmaddubsw m5, m1 | |
14180 pmulhrsw m4, m2 | |
14181 pmulhrsw m5, m2 | |
14182 packuswb m4, m5 | |
14183 movu [r0 + r3], m4 | |
14184 | |
14185 lea r0, [r0 + r1 * 4] | |
14186 | |
14187 palignr m5, m6, m3, 4 | |
14188 pshufb m5, m7 | |
14189 pmaddubsw m4, m5, m0 | |
14190 pmaddubsw m5, m1 | |
14191 pmulhrsw m4, m2 | |
14192 pmulhrsw m5, m2 | |
14193 packuswb m4, m5 | |
14194 movu [r0], m4 | |
14195 | |
14196 palignr m5, m6, m3, 5 | |
14197 pshufb m5, m7 | |
14198 pmaddubsw m4, m5, m0 | |
14199 pmaddubsw m5, m1 | |
14200 pmulhrsw m4, m2 | |
14201 pmulhrsw m5, m2 | |
14202 packuswb m4, m5 | |
14203 movu [r0 + r1], m4 | |
14204 | |
14205 palignr m5, m6, m3, 6 | |
14206 pshufb m5, m7 | |
14207 pmaddubsw m4, m5, m0 | |
14208 pmaddubsw m5, m1 | |
14209 pmulhrsw m4, m2 | |
14210 pmulhrsw m5, m2 | |
14211 packuswb m4, m5 | |
14212 movu [r0 + r1 * 2], m4 | |
14213 | |
14214 palignr m5, m6, m3, 7 | |
14215 pshufb m5, m7 | |
14216 pmaddubsw m4, m5, m0 | |
14217 pmaddubsw m5, m1 | |
14218 pmulhrsw m4, m2 | |
14219 pmulhrsw m5, m2 | |
14220 packuswb m4, m5 | |
14221 movu [r0 + r3], m4 | |
14222 | |
14223 lea r0, [r0 + r1 * 4] | |
14224 | |
14225 palignr m5, m6, m3, 8 | |
14226 pshufb m5, m7 | |
14227 pmaddubsw m4, m5, m0 | |
14228 pmaddubsw m5, m1 | |
14229 pmulhrsw m4, m2 | |
14230 pmulhrsw m5, m2 | |
14231 packuswb m4, m5 | |
14232 movu [r0], m4 | |
14233 | |
14234 palignr m5, m6, m3, 9 | |
14235 pshufb m5, m7 | |
14236 pmaddubsw m4, m5, m0 | |
14237 pmaddubsw m5, m1 | |
14238 pmulhrsw m4, m2 | |
14239 pmulhrsw m5, m2 | |
14240 packuswb m4, m5 | |
14241 movu [r0 + r1], m4 | |
14242 | |
14243 palignr m5, m6, m3, 10 | |
14244 pshufb m5, m7 | |
14245 pmaddubsw m4, m5, m0 | |
14246 pmaddubsw m5, m1 | |
14247 pmulhrsw m4, m2 | |
14248 pmulhrsw m5, m2 | |
14249 packuswb m4, m5 | |
14250 movu [r0 + r1 * 2], m4 | |
14251 | |
14252 palignr m5, m6, m3, 11 | |
14253 pshufb m5, m7 | |
14254 pmaddubsw m4, m5, m0 | |
14255 pmaddubsw m5, m1 | |
14256 pmulhrsw m4, m2 | |
14257 pmulhrsw m5, m2 | |
14258 packuswb m4, m5 | |
14259 movu [r0 + r3], m4 | |
14260 | |
14261 lea r0, [r0 + r1 * 4] | |
14262 | |
14263 palignr m5, m6, m3, 12 | |
14264 pshufb m5, m7 | |
14265 pmaddubsw m4, m5, m0 | |
14266 pmaddubsw m5, m1 | |
14267 pmulhrsw m4, m2 | |
14268 pmulhrsw m5, m2 | |
14269 packuswb m4, m5 | |
14270 movu [r0], m4 | |
14271 | |
14272 palignr m5, m6, m3, 13 | |
14273 pshufb m5, m7 | |
14274 pmaddubsw m4, m5, m0 | |
14275 pmaddubsw m5, m1 | |
14276 pmulhrsw m4, m2 | |
14277 pmulhrsw m5, m2 | |
14278 packuswb m4, m5 | |
14279 movu [r0 + r1], m4 | |
14280 | |
14281 palignr m5, m6, m3, 14 | |
14282 pshufb m5, m7 | |
14283 pmaddubsw m4, m5, m0 | |
14284 pmaddubsw m5, m1 | |
14285 pmulhrsw m4, m2 | |
14286 pmulhrsw m5, m2 | |
14287 packuswb m4, m5 | |
14288 movu [r0 + r1 * 2], m4 | |
14289 | |
14290 palignr m5, m6, m3, 15 | |
14291 pshufb m5, m7 | |
14292 pmaddubsw m4, m5, m0 | |
14293 pmaddubsw m5, m1 | |
14294 pmulhrsw m4, m2 | |
14295 pmulhrsw m5, m2 | |
14296 packuswb m4, m5 | |
14297 movu [r0 + r3], m4 | |
14298 RET | |
14299 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_25 -- 32x32 angular intra prediction, mode 25, 8-bit samples
;
; Register roles (as used below):
;   r0    destination pointer, bumped by four lines after every fourth store
;   r1    destination pitch in bytes
;   r2    reference-sample buffer; the main reference is read from [r2 + 0..],
;         one extra byte is fetched from [r2 + mmsize*2 + 16]
;         (presumably the second neighbour array -- confirm against the caller)
;   r3    ang_table_avx2 + 16 * 32, so table rows are addressed as -16..+15
;   r4    3 * r1
;   m5    pw_1024; used as the pmulhrsw multiplier
;         (presumably words of 1024, i.e. a rounded >> 5 -- confirm)
;   m0/m2 byte pairs {ref[i], ref[i+1]} (low / high half of each 128-bit lane)
;   m3    bytes shifted into m0 when the window moves by one sample
;   m1/m4 scratch
;-----------------------------------------------------------------------------

; One output line: weight the byte pairs in m0/m2 with table row %2, round,
; pack back to bytes and store 32 bytes to %1.  Clobbers m1 and m4.
;   %1 = destination memory operand
;   %2 = table row relative to r3 (the original listing tags it as [16 + %2])
%macro ANG32_MODE25_ROW 2
    pmaddubsw       m4, m0, [r3 + (%2) * 32]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 + (%2) * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            %1, m4
%endmacro

; Four consecutive output lines followed by the destination advance.
;   %1..%4 = table rows for lines 0..3 of the group
%macro ANG32_MODE25_QUAD 4
    ANG32_MODE25_ROW [r0],          %1
    ANG32_MODE25_ROW [r0 + r1],     %2
    ANG32_MODE25_ROW [r0 + r1 * 2], %3
    ANG32_MODE25_ROW [r0 + r4],     %4
    lea             r0, [r0 + r1 * 4]
%endmacro

; The first fourteen lines of each 16-line half use the same table rows.
; Leaves r0 pointing at line 12 of the half.
%macro ANG32_MODE25_LINES_0_13 0
    ANG32_MODE25_QUAD 14, 12, 10,  8
    ANG32_MODE25_QUAD  6,  4,  2,  0
    ANG32_MODE25_QUAD -2, -4, -6, -8
    ANG32_MODE25_ROW [r0],      -10
    ANG32_MODE25_ROW [r0 + r1], -12
%endmacro

cglobal intra_pred_ang32_25, 3,5,7
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; build the interleaved reference pairs
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]

    ; bytes 14/15 of xm3 are the pair that palignr pulls into lane 0 later on;
    ; the remaining bytes of xm3 are never consumed
    pinsrb          xm3, [r2], 15
    pinsrb          xm3, [r2 + mmsize*2 + 16], 14

    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1
    vinserti128     m3, m3, xm2, 1      ; upper lane of m3 = lower lane of m2

    ; ---- lines 0..15 ----
    ANG32_MODE25_LINES_0_13
    ANG32_MODE25_ROW [r0 + r1 * 2], -14

    ; line 15 is an unweighted copy of the reference
    movu            m1, [r2]
    movu            [r0 + r4], m1

    lea             r0, [r0 + r1 * 4]

    ; move the window by one byte pair per lane: m2 takes its new pair from m0,
    ; m0 takes its new pair from m3
    palignr         m2, m0, 14
    palignr         m0, m3, 14

    ; ---- lines 16..31 ----
    ANG32_MODE25_LINES_0_13

    ; line 30: m0/m2 are dead afterwards, so compute in place
    pmaddubsw       m0, [r3 - 14 * 32]
    pmulhrsw        m0, m5
    pmaddubsw       m2, [r3 - 14 * 32]
    pmulhrsw        m2, m5
    packuswb        m0, m2
    movu            [r0 + r1 * 2], m0

    ; line 31 is again an unweighted copy, taken one more step along the window
    movu            m1, [r2 + 1]
    palignr         m1, m3, 14
    movu            [r0 + r4], m1
    RET
14552 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_12 -- 32x32 angular intra prediction, mode 12, 8-bit samples
;
; Declares nine vector registers (m8 is used), so this needs a target with
; sixteen SIMD registers; any build guard for that lives outside this block.
;
; Register roles (as used below):
;   r0    destination pointer, bumped by four lines after every fourth store
;   r1    destination pitch in bytes
;   r2    reference-sample buffer; the window is read from [r2 + mmsize*2 - 4..]
;         and five bytes are patched in from [r2 + 0/6/13/19/26]
;         (original note: "prepare for [26, 19, 13, 6, 0, -1, -2....]")
;   r3    3 * r1
;   m0/m1 weight vectors from ang32_fact_mode12 (first / second 32 bytes)
;   m7/m8 pshufb selectors from ang32_shuf_mode12 (first / second 32 bytes)
;   m2    pw_1024; used as the pmulhrsw multiplier
;         (presumably words of 1024, i.e. a rounded >> 5 -- confirm)
;   m3    current 16-byte window, identical in both 128-bit lanes
;   m6    the 16 bytes that follow the window, broadcast to both lanes
;   m4/m5 scratch
;-----------------------------------------------------------------------------

; Weight the shuffled pairs in m4/m5, round, pack to bytes, store 32 bytes to %1.
%macro ANG32_MODE12_FINISH 1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            %1, m4
%endmacro

; One output line taken from the window advanced by %2 bytes (1..15).
;   %1 = destination memory operand
;   %2 = palignr byte count
%macro ANG32_MODE12_ROW 2
    palignr         m4, m6, m3, %2
    pshufb          m5, m4, m8
    pshufb          m4, m7
    ANG32_MODE12_FINISH %1
%endmacro

; Sixteen output lines for the window currently held in m3/m6.
; On exit r0 points at line 12 of the group (the caller advances it if needed).
%macro ANG32_MODE12_16_LINES 0
    ; line 0 uses the window as it is, no palignr required
    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    ANG32_MODE12_FINISH [r0]
    ANG32_MODE12_ROW [r0 + r1],      1
    ANG32_MODE12_ROW [r0 + r1 * 2],  2
    ANG32_MODE12_ROW [r0 + r3],      3

    lea             r0, [r0 + r1 * 4]

    ANG32_MODE12_ROW [r0],           4
    ANG32_MODE12_ROW [r0 + r1],      5
    ANG32_MODE12_ROW [r0 + r1 * 2],  6
    ANG32_MODE12_ROW [r0 + r3],      7

    lea             r0, [r0 + r1 * 4]

    ANG32_MODE12_ROW [r0],           8
    ANG32_MODE12_ROW [r0 + r1],      9
    ANG32_MODE12_ROW [r0 + r1 * 2], 10
    ANG32_MODE12_ROW [r0 + r3],     11

    lea             r0, [r0 + r1 * 4]

    ANG32_MODE12_ROW [r0],          12
    ANG32_MODE12_ROW [r0 + r1],     13
    ANG32_MODE12_ROW [r0 + r1 * 2], 14
    ANG32_MODE12_ROW [r0 + r3],     15
%endmacro

cglobal intra_pred_ang32_12, 3,4,9
    movu            m0, [ang32_fact_mode12]
    movu            m1, [ang32_fact_mode12 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode12]
    mova            m8, [ang32_shuf_mode12 + mmsize]
    lea             r3, [r1 * 3]

    ; initial window: 16 bytes starting four bytes before [r2 + mmsize*2],
    ; with its first five bytes replaced by individually fetched samples
    movu            xm4, [r2 + mmsize*2 - 4]
    vbroadcasti128  m6, [r2 + mmsize*2 + 12]

    pinsrb          xm4, [r2 + 0], 4
    pinsrb          xm4, [r2 + 6], 3
    pinsrb          xm4, [r2 + 13], 2
    pinsrb          xm4, [r2 + 19], 1
    pinsrb          xm4, [r2 + 26], 0
    vinserti128     m3, m4, xm4, 1      ; same 16 bytes in both lanes

    ; ---- lines 0..15 ----
    ANG32_MODE12_16_LINES

    ; step the window forward by 16 bytes
    lea             r0, [r0 + r1 * 4]
    mova            m3, m6
    vbroadcasti128  m6, [r2 + mmsize*2 + 12 + 16]

    ; ---- lines 16..31 ----
    ANG32_MODE12_16_LINES
    RET
14907 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_24 -- 32x32 angular intra prediction, mode 24, 8-bit samples
;
; Register roles (as used below):
;   r0    destination pointer, bumped by four lines after every fourth store
;   r1    destination pitch in bytes
;   r2    reference-sample buffer; the main reference is read from [r2 + 0..],
;         the extension samples are gathered from the 32 bytes at
;         [r2 + mmsize*2] (presumably the second neighbour array -- confirm)
;   r3    ang_table_avx2 + 16 * 32, so table rows are addressed as -16..+15
;   r4    3 * r1
;   m5    pw_1024; used as the pmulhrsw multiplier
;         (presumably words of 1024, i.e. a rounded >> 5 -- confirm)
;   m0/m2 byte pairs {ref[i], ref[i+1]} (low / high half of each 128-bit lane)
;   m3    bytes that sit in front of m0 once the window is moved
;   m6/m7 moved copies of the m0/m2 window (rebuilt by palignr from m3:m0:m2)
;   m1/m4 scratch
;-----------------------------------------------------------------------------

; One output line: weight the byte pairs in m%2 / m%3 with table row %4,
; round, pack back to bytes and store 32 bytes to %1.  Clobbers m1 and m4.
;   %1 = destination memory operand
;   %2 = register number holding the low-half pairs
;   %3 = register number holding the high-half pairs
;   %4 = table row relative to r3 (the original listing tags it as [16 + %4])
%macro ANG32_MODE24_ROW 4
    pmaddubsw       m4, m%2, [r3 + (%4) * 32]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m%3, [r3 + (%4) * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            %1, m4
%endmacro

; Rebuild m6/m7 as the m3:m0 / m0:m2 byte streams taken at byte offset %1,
; independently in each 128-bit lane.
%macro ANG32_MODE24_MOVE_WINDOW 1
    palignr         m6, m0, m3, %1
    palignr         m7, m2, m0, %1
%endmacro

cglobal intra_pred_ang32_24, 3,5,8
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; build the interleaved reference pairs
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; gather the extension samples with the two ang32_shuf_mode24 tables
    ; (original note on the vpermd result: [6 6 13 13 19 19 26 26 x x x...])
    movu            m4, [r2 + mmsize*2]
    pshufb          m4, [ang32_shuf_mode24]
    mova            m3, [ang32_shuf_mode24 + mmsize]
    vpermd          m4, m3, m4
    palignr         m3, m0, m4, 1
    vinserti128     m3, m3, xm2, 1      ; upper lane of m3 = lower lane of m2

    ; ---- lines 0..7 ----
    ANG32_MODE24_ROW [r0],          0, 2,  11
    ANG32_MODE24_ROW [r0 + r1],     0, 2,   6
    ANG32_MODE24_ROW [r0 + r1 * 2], 0, 2,   1
    ANG32_MODE24_ROW [r0 + r4],     0, 2,  -4

    lea             r0, [r0 + r1 * 4]

    ANG32_MODE24_ROW [r0],          0, 2,  -9
    ANG32_MODE24_ROW [r0 + r1],     0, 2, -14

    ANG32_MODE24_MOVE_WINDOW 14

    ANG32_MODE24_ROW [r0 + r1 * 2], 6, 7,  13
    ANG32_MODE24_ROW [r0 + r4],     6, 7,   8

    lea             r0, [r0 + r1 * 4]

    ; ---- lines 8..15 ----
    ANG32_MODE24_ROW [r0],          6, 7,   3
    ANG32_MODE24_ROW [r0 + r1],     6, 7,  -2
    ANG32_MODE24_ROW [r0 + r1 * 2], 6, 7,  -7
    ANG32_MODE24_ROW [r0 + r4],     6, 7, -12

    lea             r0, [r0 + r1 * 4]

    ANG32_MODE24_MOVE_WINDOW 12

    ANG32_MODE24_ROW [r0],          6, 7,  15
    ANG32_MODE24_ROW [r0 + r1],     6, 7,  10
    ANG32_MODE24_ROW [r0 + r1 * 2], 6, 7,   5
    ANG32_MODE24_ROW [r0 + r4],     6, 7,   0

    lea             r0, [r0 + r1 * 4]

    ; ---- lines 16..23 ----
    ANG32_MODE24_ROW [r0],          6, 7,  -5
    ANG32_MODE24_ROW [r0 + r1],     6, 7, -10
    ANG32_MODE24_ROW [r0 + r1 * 2], 6, 7, -15

    ANG32_MODE24_MOVE_WINDOW 10

    ANG32_MODE24_ROW [r0 + r4],     6, 7,  12

    lea             r0, [r0 + r1 * 4]

    ANG32_MODE24_ROW [r0],          6, 7,   7
    ANG32_MODE24_ROW [r0 + r1],     6, 7,   2
    ANG32_MODE24_ROW [r0 + r1 * 2], 6, 7,  -3
    ANG32_MODE24_ROW [r0 + r4],     6, 7,  -8

    lea             r0, [r0 + r1 * 4]

    ; ---- lines 24..31 ----
    ANG32_MODE24_ROW [r0],          6, 7, -13

    ANG32_MODE24_MOVE_WINDOW 8

    ANG32_MODE24_ROW [r0 + r1],     6, 7,  14
    ANG32_MODE24_ROW [r0 + r1 * 2], 6, 7,   9
    ANG32_MODE24_ROW [r0 + r4],     6, 7,   4

    lea             r0, [r0 + r1 * 4]

    ANG32_MODE24_ROW [r0],          6, 7,  -1
    ANG32_MODE24_ROW [r0 + r1],     6, 7,  -6
    ANG32_MODE24_ROW [r0 + r1 * 2], 6, 7, -11

    ; line 31 needs no weighting: mask each word with pw_00ff (presumably
    ; 0x00ff words, which keeps the first byte of every pair) and pack
    pand            m6, [pw_00ff]
    pand            m7, [pw_00ff]
    packuswb        m6, m7
    movu            [r0 + r4], m6
    RET
15177 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_13
;
; Register roles, as used by the code below:
;   r0 = output pointer (advanced by four rows at a time)
;   r1 = output row pitch in bytes
;   r2 = input reference-sample pointer
;   r3 = 3 * r1 (address of the fourth row of each group)
;   m0/m1 = per-lane multipliers loaded from ang32_fact_mode13
;   m2    = pw_1024 (presumably words of 1024, which would make pmulhrsw a
;           rounded shift right by 5 -- TODO confirm against the constant)
;   m7/m8 = byte shuffles that gather the sample pairs for the low/high half
;   m6:m3 = sliding window of reference bytes (m6 = older, m3 = newer)
;
; Writes 32 rows of mmsize bytes each, as two identical 16-row passes.
;-----------------------------------------------------------------------------

; Weighted sum of the byte pairs already gathered in m4/m5, rounded, packed to
; bytes and stored to [%1].
%macro ANG32_M13_EMIT 1 ; %1 = destination address expression
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [%1], m4
%endmacro

; One output row taken from the m6:m3 window shifted right by %1 bytes.
%macro ANG32_M13_ROW 2 ; %1 = byte shift (1..15), %2 = destination address
    palignr         m5, m3, m6, %1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    ANG32_M13_EMIT  %2
%endmacro

; One output row taken from m3 alone (the window shifted by a full 16 bytes).
%macro ANG32_M13_ROW_FULL 1 ; %1 = destination address
    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    ANG32_M13_EMIT  %1
%endmacro

; Sixteen consecutive rows.  r0 is advanced three times (12 rows); on exit it
; points at the first row of the last group of four.
%macro ANG32_M13_HALF 0
    ANG32_M13_ROW       1,  r0
    ANG32_M13_ROW       2,  r0 + r1
    ANG32_M13_ROW       3,  r0 + r1 * 2
    ANG32_M13_ROW       4,  r0 + r3

    lea             r0, [r0 + r1 * 4]

    ANG32_M13_ROW       5,  r0
    ANG32_M13_ROW       6,  r0 + r1
    ANG32_M13_ROW       7,  r0 + r1 * 2
    ANG32_M13_ROW       8,  r0 + r3

    lea             r0, [r0 + r1 * 4]

    ANG32_M13_ROW       9,  r0
    ANG32_M13_ROW       10, r0 + r1
    ANG32_M13_ROW       11, r0 + r1 * 2
    ANG32_M13_ROW       12, r0 + r3

    lea             r0, [r0 + r1 * 4]

    ANG32_M13_ROW       13, r0
    ANG32_M13_ROW       14, r0 + r1
    ANG32_M13_ROW       15, r0 + r1 * 2
    ANG32_M13_ROW_FULL  r0 + r3
%endmacro

cglobal intra_pred_ang32_13, 3,4,9
    movu            m0, [ang32_fact_mode13]
    movu            m1, [ang32_fact_mode13 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode13]
    mova            m8, [ang32_shuf_mode13 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2....]
    ; The bytes at [r2] are reordered (pshufb + vpermd) to form the older part
    ; of the window; the permutation indices come from the mode-24 shuffle table.
    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode13 + mmsize*2]
    mova            m3, [ang32_shuf_mode24 + mmsize*1]
    vpermd          m6, m3, m6
    palignr         m6, m6, 1
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    ; rows 0 to 15
    ANG32_M13_HALF

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 31: slide the window on by 16 bytes and repeat
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]

    ANG32_M13_HALF
    RET
15529 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_23
;
; Register roles, as used by the code below:
;   r0 = output pointer (advanced by four rows at a time)
;   r1 = output row pitch in bytes
;   r2 = input reference-sample pointer
;   r3 = address of entry 16 of ang_table_avx2 (entries are 32 bytes apart)
;   r4 = 3 * r1 (address of the fourth row of each group)
;   m5 = pw_1024 (presumably words of 1024, which would make pmulhrsw a
;        rounded shift right by 5 -- TODO confirm against the constant)
;   m0/m2 = interleaved pairs of the bytes at [r2] and [r2 + 1] (low/high)
;   m3    = pairs that precede m0, built from the samples at [r2 + mmsize*2]
;   m6/m7 = current window, re-extracted with palignr whenever it moves
;
; Writes 32 rows of mmsize bytes each.
;-----------------------------------------------------------------------------

; One output row: both source halves are multiplied by ang_table_avx2 entry %3,
; rounded, packed to bytes and stored to [%4].
%macro ANG32_M23_ROW 4 ; %1 = low source, %2 = high source, %3 = table entry, %4 = dst address
    pmaddubsw       m4, %1, [r3 + ((%3) - 16) * 32]
    pmulhrsw        m4, m5
    pmaddubsw       m1, %2, [r3 + ((%3) - 16) * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [%4], m4
%endmacro

; Re-extract the working window m6/m7 from the m2:m0:m3 chain at byte shift %1.
%macro ANG32_M23_WINDOW 1 ; %1 = byte shift
    palignr         m6, m0, m3, %1
    palignr         m7, m2, m0, %1
%endmacro

cglobal intra_pred_ang32_23, 3,5,8
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; interleave each byte at [r2] with its right-hand neighbour
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; build the pairs that sit before m0 from the samples at [r2 + mmsize*2]
    movu            m4, [r2 + mmsize*2]
    pshufb          m4, [ang32_shuf_mode23]
    vpermq          m4, m4, q1313
    palignr         m3, m0, m4, 1
    vinserti128     m3, m3, xm2, 1

    ; rows 0 to 7
    ANG32_M23_ROW   m0, m2, 23, r0
    ANG32_M23_ROW   m0, m2, 14, r0 + r1
    ANG32_M23_ROW   m0, m2, 5,  r0 + r1 * 2

    ANG32_M23_WINDOW 14
    ANG32_M23_ROW   m6, m7, 28, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ANG32_M23_ROW   m6, m7, 19, r0
    ANG32_M23_ROW   m6, m7, 10, r0 + r1
    ANG32_M23_ROW   m6, m7, 1,  r0 + r1 * 2

    ANG32_M23_WINDOW 12
    ANG32_M23_ROW   m6, m7, 24, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    ANG32_M23_ROW   m6, m7, 15, r0
    ANG32_M23_ROW   m6, m7, 6,  r0 + r1

    ANG32_M23_WINDOW 10
    ANG32_M23_ROW   m6, m7, 29, r0 + r1 * 2
    ANG32_M23_ROW   m6, m7, 20, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ANG32_M23_ROW   m6, m7, 11, r0
    ANG32_M23_ROW   m6, m7, 2,  r0 + r1

    ANG32_M23_WINDOW 8
    ANG32_M23_ROW   m6, m7, 25, r0 + r1 * 2
    ANG32_M23_ROW   m6, m7, 16, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    ANG32_M23_ROW   m6, m7, 7,  r0

    ANG32_M23_WINDOW 6
    ANG32_M23_ROW   m6, m7, 30, r0 + r1
    ANG32_M23_ROW   m6, m7, 21, r0 + r1 * 2
    ANG32_M23_ROW   m6, m7, 12, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ANG32_M23_ROW   m6, m7, 3,  r0

    ANG32_M23_WINDOW 4
    ANG32_M23_ROW   m6, m7, 26, r0 + r1
    ANG32_M23_ROW   m6, m7, 17, r0 + r1 * 2
    ANG32_M23_ROW   m6, m7, 8,  r0 + r4

    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    ANG32_M23_WINDOW 2
    ANG32_M23_ROW   m6, m7, 31, r0
    ANG32_M23_ROW   m6, m7, 22, r0 + r1
    ANG32_M23_ROW   m6, m7, 13, r0 + r1 * 2
    ANG32_M23_ROW   m6, m7, 4,  r0 + r4

    lea             r0, [r0 + r1 * 4]

    ; the window has moved a whole register: m3/m0 are used directly
    ANG32_M23_ROW   m3, m0, 27, r0
    ANG32_M23_ROW   m3, m0, 18, r0 + r1
    ANG32_M23_ROW   m3, m0, 9,  r0 + r1 * 2

    ; last row: no multiply, the pairs are masked with pw_00ff and packed
    ; (presumably keeping the first byte of every pair -- TODO confirm)
    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET
15805 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_14
;
; Register roles, as used by the code below:
;   r0 = output pointer (advanced by four rows at a time)
;   r1 = output row pitch in bytes
;   r2 = input reference-sample pointer
;   r3 = 3 * r1 (address of the fourth row of each group)
;   m0/m1 = per-lane multipliers loaded from ang32_fact_mode14
;   m2    = pw_1024 (presumably words of 1024, which would make pmulhrsw a
;           rounded shift right by 5 -- TODO confirm against the constant)
;   m7/m8 = byte shuffles that gather the sample pairs for the low/high half
;   m6:m3 = sliding window of reference bytes (m6 = older, m3 = newer)
;
; Writes 32 rows of mmsize bytes each, as two identical 16-row passes.
;-----------------------------------------------------------------------------

; Weighted sum of the byte pairs already gathered in m4/m5, rounded, packed to
; bytes and stored to [%1].
%macro ANG32_M14_EMIT 1 ; %1 = destination address expression
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [%1], m4
%endmacro

; One output row taken from the m6:m3 window shifted right by %1 bytes.
%macro ANG32_M14_ROW 2 ; %1 = byte shift (1..15), %2 = destination address
    palignr         m5, m3, m6, %1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    ANG32_M14_EMIT  %2
%endmacro

; One output row taken from m3 alone (the window shifted by a full 16 bytes).
%macro ANG32_M14_ROW_FULL 1 ; %1 = destination address
    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    ANG32_M14_EMIT  %1
%endmacro

; Sixteen consecutive rows.  r0 is advanced three times (12 rows); on exit it
; points at the first row of the last group of four.
%macro ANG32_M14_HALF 0
    ANG32_M14_ROW       1,  r0
    ANG32_M14_ROW       2,  r0 + r1
    ANG32_M14_ROW       3,  r0 + r1 * 2
    ANG32_M14_ROW       4,  r0 + r3

    lea             r0, [r0 + r1 * 4]

    ANG32_M14_ROW       5,  r0
    ANG32_M14_ROW       6,  r0 + r1
    ANG32_M14_ROW       7,  r0 + r1 * 2
    ANG32_M14_ROW       8,  r0 + r3

    lea             r0, [r0 + r1 * 4]

    ANG32_M14_ROW       9,  r0
    ANG32_M14_ROW       10, r0 + r1
    ANG32_M14_ROW       11, r0 + r1 * 2
    ANG32_M14_ROW       12, r0 + r3

    lea             r0, [r0 + r1 * 4]

    ANG32_M14_ROW       13, r0
    ANG32_M14_ROW       14, r0 + r1
    ANG32_M14_ROW       15, r0 + r1 * 2
    ANG32_M14_ROW_FULL  r0 + r3
%endmacro

cglobal intra_pred_ang32_14, 3,4,9
    movu            m0, [ang32_fact_mode14]
    movu            m1, [ang32_fact_mode14 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode14]
    mova            m8, [ang32_shuf_mode14 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2...]
    ; The bytes at [r2] are reordered (pshufb + vpermq) and moved up one byte
    ; to form the older part of the window.
    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode14 + mmsize*2]
    vpermq          m6, m6, 01110111b
    pslldq          m6, m6, 1
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    ; rows 0 to 15
    ANG32_M14_HALF

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 31: slide the window on by 16 bytes and repeat
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]

    ANG32_M14_HALF
    RET
16156 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_22
;
; Register roles, as used by the code below:
;   r0 = output pointer (advanced by four rows at a time)
;   r1 = output row pitch in bytes
;   r2 = input reference-sample pointer
;   r3 = address of entry 16 of ang_table_avx2 (entries are 32 bytes apart)
;   r4 = 3 * r1 (address of the fourth row of each group)
;   m5 = pw_1024 (presumably words of 1024, which would make pmulhrsw a
;        rounded shift right by 5 -- TODO confirm against the constant)
;   m0/m2 = interleaved pairs of the bytes at [r2] and [r2 + 1] (low/high)
;   m3, m8 = pairs that precede m0 (m8 oldest), built from the samples at
;            [r2 + mmsize*2 + 2]
;   m6/m7 = current window, re-extracted with palignr whenever it moves
;
; Writes 32 rows of mmsize bytes each.
;-----------------------------------------------------------------------------

; One output row: both source halves are multiplied by ang_table_avx2 entry %3,
; rounded, packed to bytes and stored to [%4].
%macro ANG32_M22_ROW 4 ; %1 = low source, %2 = high source, %3 = table entry, %4 = dst address
    pmaddubsw       m4, %1, [r3 + ((%3) - 16) * 32]
    pmulhrsw        m4, m5
    pmaddubsw       m1, %2, [r3 + ((%3) - 16) * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [%4], m4
%endmacro

; Re-extract the working window m6/m7 from the register chain %1:%2:%3
; (newest to oldest) at byte shift %4.
%macro ANG32_M22_WINDOW 4 ; %1 = newest, %2 = middle, %3 = oldest, %4 = byte shift
    palignr         m6, %2, %3, %4
    palignr         m7, %1, %2, %4
%endmacro

cglobal intra_pred_ang32_22, 3,5,9
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; interleave each byte at [r2] with its right-hand neighbour
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    ; build the pairs that sit before m0 from the samples at [r2 + mmsize*2 + 2]
    movu            m4, [r2 + mmsize*2 + 2]
    pshufb          m4, [ang32_shuf_mode22]
    vextracti128    xm8, m4, 1

    palignr         m3, m0, m4, 2
    palignr         m3, m8, 15
    vinserti128     m3, m3, xm2, 1
    vinserti128     m8, m8, xm0, 1

    ; rows 0 to 7
    ANG32_M22_ROW   m0, m2, 19, r0
    ANG32_M22_ROW   m0, m2, 6,  r0 + r1

    ANG32_M22_WINDOW m2, m0, m3, 14
    ANG32_M22_ROW   m6, m7, 25, r0 + r1 * 2
    ANG32_M22_ROW   m6, m7, 12, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ANG32_M22_WINDOW m2, m0, m3, 12
    ANG32_M22_ROW   m6, m7, 31, r0
    ANG32_M22_ROW   m6, m7, 18, r0 + r1
    ANG32_M22_ROW   m6, m7, 5,  r0 + r1 * 2

    ANG32_M22_WINDOW m2, m0, m3, 10
    ANG32_M22_ROW   m6, m7, 24, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    ANG32_M22_ROW   m6, m7, 11, r0

    ANG32_M22_WINDOW m2, m0, m3, 8
    ANG32_M22_ROW   m6, m7, 30, r0 + r1
    ANG32_M22_ROW   m6, m7, 17, r0 + r1 * 2
    ANG32_M22_ROW   m6, m7, 4,  r0 + r4

    lea             r0, [r0 + r1 * 4]

    ANG32_M22_WINDOW m2, m0, m3, 6
    ANG32_M22_ROW   m6, m7, 23, r0
    ANG32_M22_ROW   m6, m7, 10, r0 + r1

    ANG32_M22_WINDOW m2, m0, m3, 4
    ANG32_M22_ROW   m6, m7, 29, r0 + r1 * 2
    ANG32_M22_ROW   m6, m7, 16, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    ANG32_M22_ROW   m6, m7, 3,  r0

    ANG32_M22_WINDOW m2, m0, m3, 2
    ANG32_M22_ROW   m6, m7, 22, r0 + r1
    ANG32_M22_ROW   m6, m7, 9,  r0 + r1 * 2

    ; the window has moved a whole register: m3/m0 are used directly
    ANG32_M22_ROW   m3, m0, 28, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ANG32_M22_ROW   m3, m0, 15, r0
    ANG32_M22_ROW   m3, m0, 2,  r0 + r1

    ; from here the window is cut from the older chain m0:m3:m8
    ANG32_M22_WINDOW m0, m3, m8, 14
    ANG32_M22_ROW   m6, m7, 21, r0 + r1 * 2
    ANG32_M22_ROW   m6, m7, 8,  r0 + r4

    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    ANG32_M22_WINDOW m0, m3, m8, 12
    ANG32_M22_ROW   m6, m7, 27, r0
    ANG32_M22_ROW   m6, m7, 14, r0 + r1
    ANG32_M22_ROW   m6, m7, 1,  r0 + r1 * 2

    ANG32_M22_WINDOW m0, m3, m8, 10
    ANG32_M22_ROW   m6, m7, 20, r0 + r4

    lea             r0, [r0 + r1 * 4]

    ANG32_M22_ROW   m6, m7, 7,  r0

    ; final window is built in place: m0/m3 are overwritten (m0 first, since
    ; it still needs the old m3) and stay live for the remaining rows
    palignr         m0, m3, 8
    palignr         m3, m8, 8
    ANG32_M22_ROW   m3, m0, 26, r0 + r1
    ANG32_M22_ROW   m3, m0, 13, r0 + r1 * 2

    ; last row: no multiply, the pairs are masked with pw_00ff and packed
    ; (presumably keeping the first byte of every pair -- TODO confirm)
    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET
16446 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_15 - writes 32 rows of mmsize predicted bytes starting at [r0].
; The name suggests HEVC angular intra mode 15 for 32x32 blocks (naming only;
; not verifiable from this routine alone).
; cglobal 3,4,9: 3 arguments, 4 GPRs (r0-r3), 9 vector registers (m0-m8).
;   r0 = destination pointer, advanced by 4 rows after every 4 stores
;   r1 = destination row stride in bytes
;   r2 = reference sample buffer (only read here)
;   r3 = r1 * 3, used to address the 4th row of each group
; Every row follows the same recipe: take a byte window out of the m3:m6
; register pair with palignr, pair the bytes up with pshufb (m7 / m8),
; pmaddubsw against the multipliers in m0 / m1, round with pmulhrsw by m2,
; pack both results to bytes and store one full vector.
; NOTE(review): vpermq/vbroadcasti128 imply a YMM scope (mmsize = 32) opened
; before this routine - confirm against the preceding INIT_* line.
;-----------------------------------------------------------------------------
16447 cglobal intra_pred_ang32_15, 3,4,9 | |
; m0 multiplies the m4 path, m1 the m5 path; m7/m8 are the matching shuffles.
; pmulhrsw by m2 acts as (x + 16) >> 5, assuming pw_1024 holds 1024 per word.
16448 movu m0, [ang32_fact_mode15] | |
16449 movu m1, [ang32_fact_mode15 + mmsize] | |
16450 mova m2, [pw_1024] | |
16451 mova m7, [ang32_shuf_mode15] | |
16452 mova m8, [ang32_shuf_mode15 + mmsize] | |
16453 lea r3, [r1 * 3] | |
16454 | |
16455 ; prepare for [30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2...] | |
16456 | |
; m6: bytes of [r2] gathered by the third shuffle table; the vpermq then puts
; {qword 3, qword 1} of that result into both 128-bit lanes.
16457 movu m6, [r2] | |
16458 pshufb m6, [ang32_shuf_mode15 + mmsize*2] | |
16459 vpermq m6, m6, 01110111b | |
16460 | |
; m3: 16 bytes from [r2 + mmsize*2] with byte 0 replaced by the byte at [r2],
; duplicated into both lanes (vpermq 01000100b = qwords 0,1,0,1).
16461 movu xm3, [r2 + mmsize*2] | |
16462 pinsrb xm3, [r2], 0 | |
16463 vpermq m3, m3, 01000100b | |
16464 | |
; rows 0 to 3
; Row 0: the m5 path needs no shift yet, so it shuffles m6 directly.
16465 palignr m4, m3, m6, 2 | |
16466 pshufb m4, m7 | |
16467 pshufb m5, m6, m8 | |
16468 pmaddubsw m4, m0 | |
16469 pmaddubsw m5, m1 | |
16470 pmulhrsw m4, m2 | |
16471 pmulhrsw m5, m2 | |
16472 packuswb m4, m5 | |
16473 movu [r0], m4 | |
16474 | |
; From here on each row moves both windows up by one byte: the m4 path uses
; shift k + 2 and the m5 path shift k over the same m3:m6 pair.
16475 palignr m4, m3, m6, 3 | |
16476 pshufb m4, m7 | |
16477 palignr m5, m3, m6, 1 | |
16478 pshufb m5, m8 | |
16479 pmaddubsw m4, m0 | |
16480 pmaddubsw m5, m1 | |
16481 pmulhrsw m4, m2 | |
16482 pmulhrsw m5, m2 | |
16483 packuswb m4, m5 | |
16484 movu [r0 + r1], m4 | |
16485 | |
16486 palignr m4, m3, m6, 4 | |
16487 pshufb m4, m7 | |
16488 palignr m5, m3, m6, 2 | |
16489 pshufb m5, m8 | |
16490 pmaddubsw m4, m0 | |
16491 pmaddubsw m5, m1 | |
16492 pmulhrsw m4, m2 | |
16493 pmulhrsw m5, m2 | |
16494 packuswb m4, m5 | |
16495 movu [r0 + r1 * 2], m4 | |
16496 | |
16497 palignr m4, m3, m6, 5 | |
16498 pshufb m4, m7 | |
16499 palignr m5, m3, m6, 3 | |
16500 pshufb m5, m8 | |
16501 pmaddubsw m4, m0 | |
16502 pmaddubsw m5, m1 | |
16503 pmulhrsw m4, m2 | |
16504 pmulhrsw m5, m2 | |
16505 packuswb m4, m5 | |
16506 movu [r0 + r3], m4 | |
16507 | |
16508 lea r0, [r0 + r1 * 4] | |
16509 | |
; rows 4 to 7
16510 palignr m4, m3, m6, 6 | |
16511 pshufb m4, m7 | |
16512 palignr m5, m3, m6, 4 | |
16513 pshufb m5, m8 | |
16514 pmaddubsw m4, m0 | |
16515 pmaddubsw m5, m1 | |
16516 pmulhrsw m4, m2 | |
16517 pmulhrsw m5, m2 | |
16518 packuswb m4, m5 | |
16519 movu [r0], m4 | |
16520 | |
16521 palignr m4, m3, m6, 7 | |
16522 pshufb m4, m7 | |
16523 palignr m5, m3, m6, 5 | |
16524 pshufb m5, m8 | |
16525 pmaddubsw m4, m0 | |
16526 pmaddubsw m5, m1 | |
16527 pmulhrsw m4, m2 | |
16528 pmulhrsw m5, m2 | |
16529 packuswb m4, m5 | |
16530 movu [r0 + r1], m4 | |
16531 | |
16532 palignr m4, m3, m6, 8 | |
16533 pshufb m4, m7 | |
16534 palignr m5, m3, m6, 6 | |
16535 pshufb m5, m8 | |
16536 pmaddubsw m4, m0 | |
16537 pmaddubsw m5, m1 | |
16538 pmulhrsw m4, m2 | |
16539 pmulhrsw m5, m2 | |
16540 packuswb m4, m5 | |
16541 movu [r0 + r1 * 2], m4 | |
16542 | |
16543 palignr m4, m3, m6, 9 | |
16544 pshufb m4, m7 | |
16545 palignr m5, m3, m6, 7 | |
16546 pshufb m5, m8 | |
16547 pmaddubsw m4, m0 | |
16548 pmaddubsw m5, m1 | |
16549 pmulhrsw m4, m2 | |
16550 pmulhrsw m5, m2 | |
16551 packuswb m4, m5 | |
16552 movu [r0 + r3], m4 | |
16553 | |
16554 lea r0, [r0 + r1 * 4] | |
16555 | |
; rows 8 to 11
16556 palignr m4, m3, m6, 10 | |
16557 pshufb m4, m7 | |
16558 palignr m5, m3, m6, 8 | |
16559 pshufb m5, m8 | |
16560 pmaddubsw m4, m0 | |
16561 pmaddubsw m5, m1 | |
16562 pmulhrsw m4, m2 | |
16563 pmulhrsw m5, m2 | |
16564 packuswb m4, m5 | |
16565 movu [r0], m4 | |
16566 | |
16567 palignr m4, m3, m6, 11 | |
16568 pshufb m4, m7 | |
16569 palignr m5, m3, m6, 9 | |
16570 pshufb m5, m8 | |
16571 pmaddubsw m4, m0 | |
16572 pmaddubsw m5, m1 | |
16573 pmulhrsw m4, m2 | |
16574 pmulhrsw m5, m2 | |
16575 packuswb m4, m5 | |
16576 movu [r0 + r1], m4 | |
16577 | |
16578 palignr m4, m3, m6, 12 | |
16579 pshufb m4, m7 | |
16580 palignr m5, m3, m6, 10 | |
16581 pshufb m5, m8 | |
16582 pmaddubsw m4, m0 | |
16583 pmaddubsw m5, m1 | |
16584 pmulhrsw m4, m2 | |
16585 pmulhrsw m5, m2 | |
16586 packuswb m4, m5 | |
16587 movu [r0 + r1 * 2], m4 | |
16588 | |
16589 palignr m4, m3, m6, 13 | |
16590 pshufb m4, m7 | |
16591 palignr m5, m3, m6, 11 | |
16592 pshufb m5, m8 | |
16593 pmaddubsw m4, m0 | |
16594 pmaddubsw m5, m1 | |
16595 pmulhrsw m4, m2 | |
16596 pmulhrsw m5, m2 | |
16597 packuswb m4, m5 | |
16598 movu [r0 + r3], m4 | |
16599 | |
16600 lea r0, [r0 + r1 * 4] | |
16601 | |
; rows 12 to 15
16602 palignr m4, m3, m6, 14 | |
16603 pshufb m4, m7 | |
16604 palignr m5, m3, m6, 12 | |
16605 pshufb m5, m8 | |
16606 pmaddubsw m4, m0 | |
16607 pmaddubsw m5, m1 | |
16608 pmulhrsw m4, m2 | |
16609 pmulhrsw m5, m2 | |
16610 packuswb m4, m5 | |
16611 movu [r0], m4 | |
16612 | |
16613 palignr m4, m3, m6, 15 | |
16614 pshufb m4, m7 | |
16615 palignr m5, m3, m6, 13 | |
16616 pshufb m5, m8 | |
16617 pmaddubsw m4, m0 | |
16618 pmaddubsw m5, m1 | |
16619 pmulhrsw m4, m2 | |
16620 pmulhrsw m5, m2 | |
16621 packuswb m4, m5 | |
16622 movu [r0 + r1], m4 | |
16623 | |
; Row 14: the m4 window has reached a shift of 16, i.e. m3 itself.
16624 pshufb m4, m3, m7 | |
16625 palignr m5, m3, m6, 14 | |
16626 pshufb m5, m8 | |
16627 pmaddubsw m4, m0 | |
16628 pmaddubsw m5, m1 | |
16629 pmulhrsw m4, m2 | |
16630 pmulhrsw m5, m2 | |
16631 packuswb m4, m5 | |
16632 movu [r0 + r1 * 2], m4 | |
16633 | |
; Row 15: take the m5 window from the old pair first, then slide the pair:
; m6 <- m3, m3 <- the next 16 reference bytes broadcast to both lanes.
16634 palignr m5, m3, m6, 15 | |
16635 mova m6, m3 | |
16636 vbroadcasti128 m3, [r2 + mmsize*2 + 16] | |
16637 | |
16638 palignr m4, m3, m6, 1 | |
16639 pshufb m4, m7 | |
16640 pshufb m5, m8 | |
16641 pmaddubsw m4, m0 | |
16642 pmaddubsw m5, m1 | |
16643 pmulhrsw m4, m2 | |
16644 pmulhrsw m5, m2 | |
16645 packuswb m4, m5 | |
16646 movu [r0 + r3], m4 | |
16647 | |
16648 lea r0, [r0 + r1 * 4] | |
16649 | |
; rows 16 to 19 - same shift schedule as rows 0 to 15, on the new m3:m6 pair
16650 palignr m4, m3, m6, 2 | |
16651 pshufb m4, m7 | |
16652 pshufb m5, m6, m8 | |
16653 pmaddubsw m4, m0 | |
16654 pmaddubsw m5, m1 | |
16655 pmulhrsw m4, m2 | |
16656 pmulhrsw m5, m2 | |
16657 packuswb m4, m5 | |
16658 movu [r0], m4 | |
16659 | |
16660 palignr m4, m3, m6, 3 | |
16661 pshufb m4, m7 | |
16662 palignr m5, m3, m6, 1 | |
16663 pshufb m5, m8 | |
16664 pmaddubsw m4, m0 | |
16665 pmaddubsw m5, m1 | |
16666 pmulhrsw m4, m2 | |
16667 pmulhrsw m5, m2 | |
16668 packuswb m4, m5 | |
16669 movu [r0 + r1], m4 | |
16670 | |
16671 palignr m4, m3, m6, 4 | |
16672 pshufb m4, m7 | |
16673 palignr m5, m3, m6, 2 | |
16674 pshufb m5, m8 | |
16675 pmaddubsw m4, m0 | |
16676 pmaddubsw m5, m1 | |
16677 pmulhrsw m4, m2 | |
16678 pmulhrsw m5, m2 | |
16679 packuswb m4, m5 | |
16680 movu [r0 + r1 * 2], m4 | |
16681 | |
16682 palignr m4, m3, m6, 5 | |
16683 pshufb m4, m7 | |
16684 palignr m5, m3, m6, 3 | |
16685 pshufb m5, m8 | |
16686 pmaddubsw m4, m0 | |
16687 pmaddubsw m5, m1 | |
16688 pmulhrsw m4, m2 | |
16689 pmulhrsw m5, m2 | |
16690 packuswb m4, m5 | |
16691 movu [r0 + r3], m4 | |
16692 | |
16693 lea r0, [r0 + r1 * 4] | |
16694 | |
; rows 20 to 23
16695 palignr m4, m3, m6, 6 | |
16696 pshufb m4, m7 | |
16697 palignr m5, m3, m6, 4 | |
16698 pshufb m5, m8 | |
16699 pmaddubsw m4, m0 | |
16700 pmaddubsw m5, m1 | |
16701 pmulhrsw m4, m2 | |
16702 pmulhrsw m5, m2 | |
16703 packuswb m4, m5 | |
16704 movu [r0], m4 | |
16705 | |
16706 palignr m4, m3, m6, 7 | |
16707 pshufb m4, m7 | |
16708 palignr m5, m3, m6, 5 | |
16709 pshufb m5, m8 | |
16710 pmaddubsw m4, m0 | |
16711 pmaddubsw m5, m1 | |
16712 pmulhrsw m4, m2 | |
16713 pmulhrsw m5, m2 | |
16714 packuswb m4, m5 | |
16715 movu [r0 + r1], m4 | |
16716 | |
16717 palignr m4, m3, m6, 8 | |
16718 pshufb m4, m7 | |
16719 palignr m5, m3, m6, 6 | |
16720 pshufb m5, m8 | |
16721 pmaddubsw m4, m0 | |
16722 pmaddubsw m5, m1 | |
16723 pmulhrsw m4, m2 | |
16724 pmulhrsw m5, m2 | |
16725 packuswb m4, m5 | |
16726 movu [r0 + r1 * 2], m4 | |
16727 | |
16728 palignr m4, m3, m6, 9 | |
16729 pshufb m4, m7 | |
16730 palignr m5, m3, m6, 7 | |
16731 pshufb m5, m8 | |
16732 pmaddubsw m4, m0 | |
16733 pmaddubsw m5, m1 | |
16734 pmulhrsw m4, m2 | |
16735 pmulhrsw m5, m2 | |
16736 packuswb m4, m5 | |
16737 movu [r0 + r3], m4 | |
16738 | |
16739 lea r0, [r0 + r1 * 4] | |
16740 | |
; rows 24 to 27
16741 palignr m4, m3, m6, 10 | |
16742 pshufb m4, m7 | |
16743 palignr m5, m3, m6, 8 | |
16744 pshufb m5, m8 | |
16745 pmaddubsw m4, m0 | |
16746 pmaddubsw m5, m1 | |
16747 pmulhrsw m4, m2 | |
16748 pmulhrsw m5, m2 | |
16749 packuswb m4, m5 | |
16750 movu [r0], m4 | |
16751 | |
16752 palignr m4, m3, m6, 11 | |
16753 pshufb m4, m7 | |
16754 palignr m5, m3, m6, 9 | |
16755 pshufb m5, m8 | |
16756 pmaddubsw m4, m0 | |
16757 pmaddubsw m5, m1 | |
16758 pmulhrsw m4, m2 | |
16759 pmulhrsw m5, m2 | |
16760 packuswb m4, m5 | |
16761 movu [r0 + r1], m4 | |
16762 | |
16763 palignr m4, m3, m6, 12 | |
16764 pshufb m4, m7 | |
16765 palignr m5, m3, m6, 10 | |
16766 pshufb m5, m8 | |
16767 pmaddubsw m4, m0 | |
16768 pmaddubsw m5, m1 | |
16769 pmulhrsw m4, m2 | |
16770 pmulhrsw m5, m2 | |
16771 packuswb m4, m5 | |
16772 movu [r0 + r1 * 2], m4 | |
16773 | |
16774 palignr m4, m3, m6, 13 | |
16775 pshufb m4, m7 | |
16776 palignr m5, m3, m6, 11 | |
16777 pshufb m5, m8 | |
16778 pmaddubsw m4, m0 | |
16779 pmaddubsw m5, m1 | |
16780 pmulhrsw m4, m2 | |
16781 pmulhrsw m5, m2 | |
16782 packuswb m4, m5 | |
16783 movu [r0 + r3], m4 | |
16784 | |
16785 lea r0, [r0 + r1 * 4] | |
16786 | |
; rows 28 to 31
16787 palignr m4, m3, m6, 14 | |
16788 pshufb m4, m7 | |
16789 palignr m5, m3, m6, 12 | |
16790 pshufb m5, m8 | |
16791 pmaddubsw m4, m0 | |
16792 pmaddubsw m5, m1 | |
16793 pmulhrsw m4, m2 | |
16794 pmulhrsw m5, m2 | |
16795 packuswb m4, m5 | |
16796 movu [r0], m4 | |
16797 | |
16798 palignr m4, m3, m6, 15 | |
16799 pshufb m4, m7 | |
16800 palignr m5, m3, m6, 13 | |
16801 pshufb m5, m8 | |
16802 pmaddubsw m4, m0 | |
16803 pmaddubsw m5, m1 | |
16804 pmulhrsw m4, m2 | |
16805 pmulhrsw m5, m2 | |
16806 packuswb m4, m5 | |
16807 movu [r0 + r1], m4 | |
16808 | |
16809 pshufb m4, m3, m7 | |
16810 palignr m5, m3, m6, 14 | |
16811 pshufb m5, m8 | |
16812 pmaddubsw m4, m0 | |
16813 pmaddubsw m5, m1 | |
16814 pmulhrsw m4, m2 | |
16815 pmulhrsw m5, m2 | |
16816 packuswb m4, m5 | |
16817 movu [r0 + r1 * 2], m4 | |
16818 | |
; Row 31: m5 still comes from the current pair; only the m4 window needs the
; following 16 reference bytes, which are loaded into m6 (now the high half).
16819 palignr m5, m3, m6, 15 | |
16820 vbroadcasti128 m6, [r2 + mmsize*2 + 32] | |
16821 | |
16822 palignr m4, m6, m3, 1 | |
16823 pshufb m4, m7 | |
16824 pshufb m5, m8 | |
16825 pmaddubsw m4, m0 | |
16826 pmaddubsw m5, m1 | |
16827 pmulhrsw m4, m2 | |
16828 pmulhrsw m5, m2 | |
16829 packuswb m4, m5 | |
16830 movu [r0 + r3], m4 | |
16831 RET | |
16832 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_21 - writes 32 rows of mmsize predicted bytes starting at [r0].
; The name suggests HEVC angular intra mode 21 for 32x32 blocks (naming only;
; not verifiable from this routine alone).
; cglobal 3,5,9: 3 arguments, 5 GPRs (r0-r4), 9 vector registers (m0-m8).
;   r0 = destination pointer, advanced by 4 rows after every 4 stores
;   r1 = destination row stride in bytes
;   r2 = reference sample buffer (only read here)
;   r3 = ang_table_avx2 + 16 entries, so [r3 + k * 32] is table entry 16 + k;
;        the bracketed number in each trailing comment is that entry index
;   r4 = r1 * 3, used to address the 4th row of each group
;   m5 = pw_1024; pmulhrsw by it acts as (x + 16) >> 5, assuming 1024 per word
; Each row multiplies two registers of interleaved byte pairs by one table
; entry (pmaddubsw), rounds, packs to bytes and stores one full vector.
; NOTE(review): the 32-byte table stride and the vextracti128/vinserti128 usage
; imply a YMM scope (mmsize = 32) opened before this routine - confirm.
;-----------------------------------------------------------------------------
16833 cglobal intra_pred_ang32_21, 3,5,9 | |
16834 lea r3, [ang_table_avx2 + 32 * 16] | |
16835 lea r4, [r1 * 3] | |
16836 mova m5, [pw_1024] | |
16837 | |
16838 ; rows 0 to 7 | |
; m0 / m2 = low / high byte interleave (per 128-bit lane) of [r2] and [r2 + 1],
; i.e. (p[i], p[i + 1]) pairs ready for pmaddubsw.
16839 movu m0, [r2 + 0] | |
16840 movu m1, [r2 + 1] | |
16841 punpckhbw m2, m0, m1 | |
16842 punpcklbw m0, m1 | |
16843 | |
; m4 = bytes of [r2 + mmsize*2] reordered by ang32_shuf_mode21; xm6 = its upper lane.
16844 movu m4, [r2 + mmsize*2] | |
16845 pshufb m4, [ang32_shuf_mode21] | |
16846 vextracti128 xm6, m4, 1 | |
16847 | |
; m3 and m8 are the registers that sit "below" m0 when the pair window is
; walked backwards with palignr in the rows that follow. Their upper lanes are
; overwritten with the low lanes of m2 and m0 respectively.
16848 palignr m3, m0, m4, 1 | |
16849 palignr m8, m3, m6, 1 | |
16850 vinserti128 m3, m3, xm2, 1 | |
16851 vinserti128 m8, m8, xm0, 1 | |
16852 | |
16853 pmaddubsw m4, m0, [r3 - 1 * 32] ; [15] | |
16854 pmulhrsw m4, m5 | |
16855 pmaddubsw m1, m2, [r3 - 1 * 32] | |
16856 pmulhrsw m1, m5 | |
16857 packuswb m4, m1 | |
16858 movu [r0], m4 | |
16859 | |
; Each new palignr pair (shift 14, 12, ... 2) moves the m6/m7 window back by
; two bytes, i.e. one interleaved pair; rows in between reuse the same window
; with a different table entry.
16860 palignr m6, m0, m3, 14 | |
16861 palignr m7, m2, m0, 14 | |
16862 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] | |
16863 pmulhrsw m4, m5 | |
16864 pmaddubsw m1, m7, [r3 + 14 * 32] | |
16865 pmulhrsw m1, m5 | |
16866 packuswb m4, m1 | |
16867 movu [r0 + r1], m4 | |
16868 | |
16869 pmaddubsw m4, m6, [r3 - 3 * 32] ; [13] | |
16870 pmulhrsw m4, m5 | |
16871 pmaddubsw m1, m7, [r3 - 3 * 32] | |
16872 pmulhrsw m1, m5 | |
16873 packuswb m4, m1 | |
16874 movu [r0 + r1*2], m4 | |
16875 | |
16876 palignr m6, m0, m3, 12 | |
16877 palignr m7, m2, m0, 12 | |
16878 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28] | |
16879 pmulhrsw m4, m5 | |
16880 pmaddubsw m1, m7, [r3 + 12 * 32] | |
16881 pmulhrsw m1, m5 | |
16882 packuswb m4, m1 | |
16883 movu [r0 + r4], m4 | |
16884 | |
16885 lea r0, [r0 + r1 * 4] | |
16886 | |
16887 pmaddubsw m4, m6, [r3 - 5 * 32] ; [11] | |
16888 pmulhrsw m4, m5 | |
16889 pmaddubsw m1, m7, [r3 - 5 * 32] | |
16890 pmulhrsw m1, m5 | |
16891 packuswb m4, m1 | |
16892 movu [r0], m4 | |
16893 | |
16894 palignr m6, m0, m3, 10 | |
16895 palignr m7, m2, m0, 10 | |
16896 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] | |
16897 pmulhrsw m4, m5 | |
16898 pmaddubsw m1, m7, [r3 + 10 * 32] | |
16899 pmulhrsw m1, m5 | |
16900 packuswb m4, m1 | |
16901 movu [r0 + r1], m4 | |
16902 | |
16903 pmaddubsw m4, m6, [r3 - 7 * 32] ; [9] | |
16904 pmulhrsw m4, m5 | |
16905 pmaddubsw m1, m7, [r3 - 7 * 32] | |
16906 pmulhrsw m1, m5 | |
16907 packuswb m4, m1 | |
16908 movu [r0 + r1*2], m4 | |
16909 | |
16910 palignr m6, m0, m3, 8 | |
16911 palignr m7, m2, m0, 8 | |
16912 | |
16913 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24] | |
16914 pmulhrsw m4, m5 | |
16915 pmaddubsw m1, m7, [r3 + 8 * 32] | |
16916 pmulhrsw m1, m5 | |
16917 packuswb m4, m1 | |
16918 movu [r0 + r4], m4 | |
16919 | |
16920 lea r0, [r0 + r1 * 4] | |
16921 | |
16922 ; rows 8 to 15 | |
16923 pmaddubsw m4, m6, [r3 - 9 * 32] ; [7] | |
16924 pmulhrsw m4, m5 | |
16925 pmaddubsw m1, m7, [r3 - 9 * 32] | |
16926 pmulhrsw m1, m5 | |
16927 packuswb m4, m1 | |
16928 movu [r0], m4 | |
16929 | |
16930 palignr m6, m0, m3, 6 | |
16931 palignr m7, m2, m0, 6 | |
16932 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] | |
16933 pmulhrsw m4, m5 | |
16934 pmaddubsw m1, m7, [r3 + 6 * 32] | |
16935 pmulhrsw m1, m5 | |
16936 packuswb m4, m1 | |
16937 movu [r0 + r1], m4 | |
16938 | |
16939 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5] | |
16940 pmulhrsw m4, m5 | |
16941 pmaddubsw m1, m7, [r3 - 11 * 32] | |
16942 pmulhrsw m1, m5 | |
16943 packuswb m4, m1 | |
16944 movu [r0 + r1*2], m4 | |
16945 | |
16946 palignr m6, m0, m3, 4 | |
16947 palignr m7, m2, m0, 4 | |
16948 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20] | |
16949 pmulhrsw m4, m5 | |
16950 pmaddubsw m1, m7, [r3 + 4 * 32] | |
16951 pmulhrsw m1, m5 | |
16952 packuswb m4, m1 | |
16953 movu [r0 + r4], m4 | |
16954 | |
16955 lea r0, [r0 + r1 * 4] | |
16956 | |
16957 pmaddubsw m4, m6, [r3 - 13 * 32] ; [3] | |
16958 pmulhrsw m4, m5 | |
16959 pmaddubsw m1, m7, [r3 - 13 * 32] | |
16960 pmulhrsw m1, m5 | |
16961 packuswb m4, m1 | |
16962 movu [r0], m4 | |
16963 | |
16964 palignr m6, m0, m3, 2 | |
16965 palignr m7, m2, m0, 2 | |
16966 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] | |
16967 pmulhrsw m4, m5 | |
16968 pmaddubsw m1, m7, [r3 + 2 * 32] | |
16969 pmulhrsw m1, m5 | |
16970 packuswb m4, m1 | |
16971 movu [r0 + r1], m4 | |
16972 | |
16973 pmaddubsw m4, m6, [r3 - 15 * 32] ; [1] | |
16974 pmulhrsw m4, m5 | |
16975 pmaddubsw m1, m7, [r3 - 15 * 32] | |
16976 pmulhrsw m1, m5 | |
16977 packuswb m4, m1 | |
16978 movu [r0 + r1 * 2], m4 | |
16979 | |
; The window has moved a whole register: m3 / m0 are used directly.
16980 pmaddubsw m4, m3, [r3] ; [16] | |
16981 pmulhrsw m4, m5 | |
16982 pmaddubsw m1, m0, [r3] | |
16983 pmulhrsw m1, m5 | |
16984 packuswb m4, m1 | |
16985 movu [r0 + r4], m4 | |
16986 | |
16987 lea r0, [r0 + r1 * 4] | |
16988 | |
16989 ; rows 16 to 23 | |
; Same walk, one register further back: windows now come from m3:m8 and m0:m3.
16990 palignr m6, m3, m8, 14 | |
16991 palignr m7, m0, m3, 14 | |
16992 pmaddubsw m4, m6, [r3 + 15 * 32] ; [31] | |
16993 pmulhrsw m4, m5 | |
16994 pmaddubsw m1, m7, [r3 + 15 * 32] | |
16995 pmulhrsw m1, m5 | |
16996 packuswb m4, m1 | |
16997 movu [r0], m4 | |
16998 | |
16999 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] | |
17000 pmulhrsw m4, m5 | |
17001 pmaddubsw m1, m7, [r3 - 2 * 32] | |
17002 pmulhrsw m1, m5 | |
17003 packuswb m4, m1 | |
17004 movu [r0 + r1], m4 | |
17005 | |
17006 palignr m6, m3, m8, 12 | |
17007 palignr m7, m0, m3, 12 | |
17008 pmaddubsw m4, m6, [r3 + 13 * 32] ; [29] | |
17009 pmulhrsw m4, m5 | |
17010 pmaddubsw m1, m7, [r3 + 13 * 32] | |
17011 pmulhrsw m1, m5 | |
17012 packuswb m4, m1 | |
17013 movu [r0 + r1*2], m4 | |
17014 | |
17015 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] | |
17016 pmulhrsw m4, m5 | |
17017 pmaddubsw m1, m7, [r3 - 4 * 32] | |
17018 pmulhrsw m1, m5 | |
17019 packuswb m4, m1 | |
17020 movu [r0 + r4], m4 | |
17021 | |
17022 lea r0, [r0 + r1 * 4] | |
17023 | |
17024 palignr m6, m3, m8, 10 | |
17025 palignr m7, m0, m3, 10 | |
17026 pmaddubsw m4, m6, [r3 + 11 * 32] ; [27] | |
17027 pmulhrsw m4, m5 | |
17028 pmaddubsw m1, m7, [r3 + 11 * 32] | |
17029 pmulhrsw m1, m5 | |
17030 packuswb m4, m1 | |
17031 movu [r0], m4 | |
17032 | |
17033 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] | |
17034 pmulhrsw m4, m5 | |
17035 pmaddubsw m1, m7, [r3 - 6 * 32] | |
17036 pmulhrsw m1, m5 | |
17037 packuswb m4, m1 | |
17038 movu [r0 + r1], m4 | |
17039 | |
17040 palignr m6, m3, m8, 8 | |
17041 palignr m7, m0, m3, 8 | |
17042 pmaddubsw m4, m6, [r3 + 9 * 32] ; [25] | |
17043 pmulhrsw m4, m5 | |
17044 pmaddubsw m1, m7, [r3 + 9 * 32] | |
17045 pmulhrsw m1, m5 | |
17046 packuswb m4, m1 | |
17047 movu [r0 + r1*2], m4 | |
17048 | |
17049 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] | |
17050 pmulhrsw m4, m5 | |
17051 pmaddubsw m1, m7, [r3 - 8 * 32] | |
17052 pmulhrsw m1, m5 | |
17053 packuswb m4, m1 | |
17054 movu [r0 + r4], m4 | |
17055 | |
17056 lea r0, [r0 + r1 * 4] | |
17057 | |
17058 ; rows 24 to 31 | |
17059 palignr m6, m3, m8, 6 | |
17060 palignr m7, m0, m3, 6 | |
17061 pmaddubsw m4, m6, [r3 + 7 * 32] ; [23] | |
17062 pmulhrsw m4, m5 | |
17063 pmaddubsw m1, m7, [r3 + 7 * 32] | |
17064 pmulhrsw m1, m5 | |
17065 packuswb m4, m1 | |
17066 movu [r0], m4 | |
17067 | |
17068 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6] | |
17069 pmulhrsw m4, m5 | |
17070 pmaddubsw m1, m7, [r3 - 10 * 32] | |
17071 pmulhrsw m1, m5 | |
17072 packuswb m4, m1 | |
17073 movu [r0 + r1], m4 | |
17074 | |
17075 palignr m6, m3, m8, 4 | |
17076 palignr m7, m0, m3, 4 | |
17077 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21] | |
17078 pmulhrsw m4, m5 | |
17079 pmaddubsw m1, m7, [r3 + 5 * 32] | |
17080 pmulhrsw m1, m5 | |
17081 packuswb m4, m1 | |
17082 movu [r0 + r1 * 2], m4 | |
17083 | |
17084 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] | |
17085 pmulhrsw m4, m5 | |
17086 pmaddubsw m1, m7, [r3 - 12 * 32] | |
17087 pmulhrsw m1, m5 | |
17088 packuswb m4, m1 | |
17089 movu [r0 + r4], m4 | |
17090 | |
17091 lea r0, [r0 + r1 * 4] | |
17092 | |
17093 palignr m6, m3, m8, 2 | |
17094 palignr m7, m0, m3, 2 | |
17095 pmaddubsw m4, m6, [r3 + 3 * 32] ; [19] | |
17096 pmulhrsw m4, m5 | |
17097 pmaddubsw m1, m7, [r3 + 3 * 32] | |
17098 pmulhrsw m1, m5 | |
17099 packuswb m4, m1 | |
17100 movu [r0 + r1], m4 | |
17101 | |
17102 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2] | |
17103 pmulhrsw m4, m5 | |
17104 pmaddubsw m1, m7, [r3 - 14 * 32] | |
17105 pmulhrsw m1, m5 | |
17106 packuswb m4, m1 | |
17107 movu [r0 + r1], m4 | |
17108 | |
17109 pmaddubsw m4, m8, [r3 + 1 * 32] ; [17] | |
17110 pmulhrsw m4, m5 | |
17111 pmaddubsw m1, m3, [r3 + 1 * 32] | |
17112 pmulhrsw m1, m5 | |
17113 packuswb m4, m1 | |
17114 movu [r0 + r1*2], m4 | |
17115 | |
; Last row: no table multiply. Each word of m8 / m3 is masked with pw_00ff
; (presumably 0x00ff per word, keeping the first byte of every pair) and the
; words are packed straight to bytes.
17116 pand m8, [pw_00ff] | |
17117 pand m3, [pw_00ff] | |
17118 packuswb m8, m3 | |
17119 movu [r0 + r4], m8 | |
17120 RET | |
17121 | |
;-----------------------------------------------------------------------------
; intra_pred_ang32_16 - writes 32 rows of mmsize predicted bytes starting at [r0].
; The name suggests HEVC angular intra mode 16 for 32x32 blocks (naming only;
; not verifiable from this routine alone).
; cglobal 3,4,10: 3 arguments, 4 GPRs (r0-r3), 10 vector registers (m0-m9).
;   r0 = destination pointer, advanced by 4 rows after every 4 stores
;   r1 = destination row stride in bytes
;   r2 = reference sample buffer (only read here)
;   r3 = r1 * 3, used to address the 4th row of each group
; Every row follows the same recipe: cut two byte windows out of the
; m3:m6:m9 register chain with palignr, pair the bytes up with pshufb (m7 / m8),
; pmaddubsw against the multipliers in m0 / m1, round with pmulhrsw by m2,
; pack to bytes, fix the lane order with vpermq q3120 and store one vector.
; NOTE(review): vpermd/vpermq/vbroadcasti128 imply a YMM scope (mmsize = 32)
; opened before this routine - confirm against the preceding INIT_* line.
;-----------------------------------------------------------------------------
17122 cglobal intra_pred_ang32_16, 3,4,10 | |
; m0 multiplies the m4 path, m1 the m5 path; m7/m8 are the matching shuffles.
; pmulhrsw by m2 acts as (x + 16) >> 5, assuming pw_1024 holds 1024 per word.
17123 movu m0, [ang32_fact_mode16] | |
17124 movu m1, [ang32_fact_mode16 + mmsize] | |
17125 mova m2, [pw_1024] | |
17126 mova m7, [ang32_shuf_mode16] | |
17127 mova m8, [ang32_shuf_mode16 + mmsize] | |
17128 lea r3, [r1 * 3] | |
17129 | |
17130 ; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2...] | |
17131 | |
; m6 = bytes of [r2] gathered by the third table, then its dwords permuted by
;      the indices of the fourth table (held temporarily in m3).
; m9 = upper 128 bits of the gathered vector copied to both lanes (q3232).
; The pslldq / palignr / pslldq sequence moves m6 up by one byte per lane,
; filling byte 0 from the top of m9, and leaves m9 shifted left by 5 bytes.
17132 movu m6, [r2] | |
17133 pshufb m6, [ang32_shuf_mode16 + mmsize*2] | |
17134 mova m9, m6 | |
17135 mova m3, [ang32_shuf_mode16 + mmsize*3] | |
17136 vpermd m6, m3, m6 | |
17137 vpermq m9, m9, q3232 | |
17138 pslldq m9, 4 | |
17139 palignr m6, m9, 15 | |
17140 pslldq m9, 1 | |
17141 | |
; m3 = 16 reference bytes from [r2 + mmsize*2 + 1], broadcast to both lanes.
17142 vbroadcasti128 m3, [r2 + mmsize*2 + 1] | |
17143 | |
; rows 0 to 3
; The m4 window is cut from m3:m6 at shift k, the m5 window from m6:m9 at
; shift k + 5; both advance by one byte per row.
; packuswb packs per 128-bit lane, so vpermq q3120 swaps the two middle qwords
; to put all of m4's bytes ahead of m5's in the stored row.
17144 palignr m4, m3, m6, 1 | |
17145 palignr m5, m6, m9, 6 | |
17146 pshufb m4, m7 | |
17147 pshufb m5, m8 | |
17148 pmaddubsw m4, m0 | |
17149 pmaddubsw m5, m1 | |
17150 pmulhrsw m4, m2 | |
17151 pmulhrsw m5, m2 | |
17152 packuswb m4, m5 | |
17153 vpermq m4, m4, q3120 | |
17154 movu [r0], m4 | |
17155 | |
17156 palignr m4, m3, m6, 2 | |
17157 palignr m5, m6, m9, 7 | |
17158 pshufb m4, m7 | |
17159 pshufb m5, m8 | |
17160 pmaddubsw m4, m0 | |
17161 pmaddubsw m5, m1 | |
17162 pmulhrsw m4, m2 | |
17163 pmulhrsw m5, m2 | |
17164 packuswb m4, m5 | |
17165 vpermq m4, m4, q3120 | |
17166 movu [r0 + r1], m4 | |
17167 | |
17168 palignr m4, m3, m6, 3 | |
17169 palignr m5, m6, m9, 8 | |
17170 pshufb m4, m7 | |
17171 pshufb m5, m8 | |
17172 pmaddubsw m4, m0 | |
17173 pmaddubsw m5, m1 | |
17174 pmulhrsw m4, m2 | |
17175 pmulhrsw m5, m2 | |
17176 packuswb m4, m5 | |
17177 vpermq m4, m4, q3120 | |
17178 movu [r0 + r1 * 2], m4 | |
17179 | |
17180 palignr m4, m3, m6, 4 | |
17181 palignr m5, m6, m9, 9 | |
17182 pshufb m4, m7 | |
17183 pshufb m5, m8 | |
17184 pmaddubsw m4, m0 | |
17185 pmaddubsw m5, m1 | |
17186 pmulhrsw m4, m2 | |
17187 pmulhrsw m5, m2 | |
17188 packuswb m4, m5 | |
17189 vpermq m4, m4, q3120 | |
17190 movu [r0 + r3], m4 | |
17191 | |
17192 lea r0, [r0 + r1 * 4] | |
17193 | |
; rows 4 to 7
17194 palignr m4, m3, m6, 5 | |
17195 palignr m5, m6, m9, 10 | |
17196 pshufb m4, m7 | |
17197 pshufb m5, m8 | |
17198 pmaddubsw m4, m0 | |
17199 pmaddubsw m5, m1 | |
17200 pmulhrsw m4, m2 | |
17201 pmulhrsw m5, m2 | |
17202 packuswb m4, m5 | |
17203 vpermq m4, m4, q3120 | |
17204 movu [r0], m4 | |
17205 | |
17206 palignr m4, m3, m6, 6 | |
17207 palignr m5, m6, m9, 11 | |
17208 pshufb m4, m7 | |
17209 pshufb m5, m8 | |
17210 pmaddubsw m4, m0 | |
17211 pmaddubsw m5, m1 | |
17212 pmulhrsw m4, m2 | |
17213 pmulhrsw m5, m2 | |
17214 packuswb m4, m5 | |
17215 vpermq m4, m4, q3120 | |
17216 movu [r0 + r1], m4 | |
17217 | |
17218 palignr m4, m3, m6, 7 | |
17219 palignr m5, m6, m9, 12 | |
17220 pshufb m4, m7 | |
17221 pshufb m5, m8 | |
17222 pmaddubsw m4, m0 | |
17223 pmaddubsw m5, m1 | |
17224 pmulhrsw m4, m2 | |
17225 pmulhrsw m5, m2 | |
17226 packuswb m4, m5 | |
17227 vpermq m4, m4, q3120 | |
17228 movu [r0 + r1 * 2], m4 | |
17229 | |
17230 palignr m4, m3, m6, 8 | |
17231 palignr m5, m6, m9, 13 | |
17232 pshufb m4, m7 | |
17233 pshufb m5, m8 | |
17234 pmaddubsw m4, m0 | |
17235 pmaddubsw m5, m1 | |
17236 pmulhrsw m4, m2 | |
17237 pmulhrsw m5, m2 | |
17238 packuswb m4, m5 | |
17239 vpermq m4, m4, q3120 | |
17240 movu [r0 + r3], m4 | |
17241 | |
17242 lea r0, [r0 + r1 * 4] | |
17243 | |
; rows 8 to 11
17244 palignr m4, m3, m6, 9 | |
17245 palignr m5, m6, m9, 14 | |
17246 pshufb m4, m7 | |
17247 pshufb m5, m8 | |
17248 pmaddubsw m4, m0 | |
17249 pmaddubsw m5, m1 | |
17250 pmulhrsw m4, m2 | |
17251 pmulhrsw m5, m2 | |
17252 packuswb m4, m5 | |
17253 vpermq m4, m4, q3120 | |
17254 movu [r0], m4 | |
17255 | |
17256 palignr m4, m3, m6, 10 | |
17257 palignr m5, m6, m9, 15 | |
17258 pshufb m4, m7 | |
17259 pshufb m5, m8 | |
17260 pmaddubsw m4, m0 | |
17261 pmaddubsw m5, m1 | |
17262 pmulhrsw m4, m2 | |
17263 pmulhrsw m5, m2 | |
17264 packuswb m4, m5 | |
17265 vpermq m4, m4, q3120 | |
17266 movu [r0 + r1], m4 | |
17267 | |
; Row 10: the m5 window has reached a shift of 16 over m6:m9, i.e. m6 itself.
17268 palignr m4, m3, m6, 11 | |
17269 pshufb m4, m7 | |
17270 pshufb m5, m6, m8 | |
17271 pmaddubsw m4, m0 | |
17272 pmaddubsw m5, m1 | |
17273 pmulhrsw m4, m2 | |
17274 pmulhrsw m5, m2 | |
17275 packuswb m4, m5 | |
17276 vpermq m4, m4, q3120 | |
17277 movu [r0 + r1 * 2], m4 | |
17278 | |
; From row 11 the m5 window is cut from m3:m6 as well (shift restarts at 1).
17279 palignr m4, m3, m6, 12 | |
17280 palignr m5, m3, m6, 1 | |
17281 pshufb m4, m7 | |
17282 pshufb m5, m8 | |
17283 pmaddubsw m4, m0 | |
17284 pmaddubsw m5, m1 | |
17285 pmulhrsw m4, m2 | |
17286 pmulhrsw m5, m2 | |
17287 packuswb m4, m5 | |
17288 vpermq m4, m4, q3120 | |
17289 movu [r0 + r3], m4 | |
17290 | |
17291 lea r0, [r0 + r1 * 4] | |
17292 | |
; rows 12 to 15
17293 palignr m4, m3, m6, 13 | |
17294 palignr m5, m3, m6, 2 | |
17295 pshufb m4, m7 | |
17296 pshufb m5, m8 | |
17297 pmaddubsw m4, m0 | |
17298 pmaddubsw m5, m1 | |
17299 pmulhrsw m4, m2 | |
17300 pmulhrsw m5, m2 | |
17301 packuswb m4, m5 | |
17302 vpermq m4, m4, q3120 | |
17303 movu [r0], m4 | |
17304 | |
17305 palignr m4, m3, m6, 14 | |
17306 palignr m5, m3, m6, 3 | |
17307 pshufb m4, m7 | |
17308 pshufb m5, m8 | |
17309 pmaddubsw m4, m0 | |
17310 pmaddubsw m5, m1 | |
17311 pmulhrsw m4, m2 | |
17312 pmulhrsw m5, m2 | |
17313 packuswb m4, m5 | |
17314 vpermq m4, m4, q3120 | |
17315 movu [r0 + r1], m4 | |
17316 | |
17317 palignr m4, m3, m6, 15 | |
17318 palignr m5, m3, m6, 4 | |
17319 pshufb m4, m7 | |
17320 pshufb m5, m8 | |
17321 pmaddubsw m4, m0 | |
17322 pmaddubsw m5, m1 | |
17323 pmulhrsw m4, m2 | |
17324 pmulhrsw m5, m2 | |
17325 packuswb m4, m5 | |
17326 vpermq m4, m4, q3120 | |
17327 movu [r0 + r1 * 2], m4 | |
17328 | |
; Row 15: the m4 window has reached a shift of 16 over m3:m6, i.e. m3 itself.
17329 palignr m5, m3, m6, 5 | |
17330 pshufb m4, m3, m7 | |
17331 pshufb m5, m8 | |
17332 pmaddubsw m4, m0 | |
17333 pmaddubsw m5, m1 | |
17334 pmulhrsw m4, m2 | |
17335 pmulhrsw m5, m2 | |
17336 packuswb m4, m5 | |
17337 vpermq m4, m4, q3120 | |
17338 movu [r0 + r3], m4 | |
17339 | |
17340 lea r0, [r0 + r1 * 4] | |
17341 | |
; Extend the chain upwards: m9 is reused for the next 16 reference bytes, so
; the m4 window is now cut from m9:m3 while m5 keeps walking through m3:m6.
17342 vbroadcasti128 m9, [r2 + mmsize*2 + 17] | |
17343 | |
; rows 16 to 19
17344 palignr m4, m9, m3, 1 | |
17345 palignr m5, m3, m6, 6 | |
17346 pshufb m4, m7 | |
17347 pshufb m5, m8 | |
17348 pmaddubsw m4, m0 | |
17349 pmaddubsw m5, m1 | |
17350 pmulhrsw m4, m2 | |
17351 pmulhrsw m5, m2 | |
17352 packuswb m4, m5 | |
17353 vpermq m4, m4, q3120 | |
17354 movu [r0], m4 | |
17355 | |
17356 palignr m4, m9, m3, 2 | |
17357 palignr m5, m3, m6, 7 | |
17358 pshufb m4, m7 | |
17359 pshufb m5, m8 | |
17360 pmaddubsw m4, m0 | |
17361 pmaddubsw m5, m1 | |
17362 pmulhrsw m4, m2 | |
17363 pmulhrsw m5, m2 | |
17364 packuswb m4, m5 | |
17365 vpermq m4, m4, q3120 | |
17366 movu [r0 + r1], m4 | |
17367 | |
17368 palignr m4, m9, m3, 3 | |
17369 palignr m5, m3, m6, 8 | |
17370 pshufb m4, m7 | |
17371 pshufb m5, m8 | |
17372 pmaddubsw m4, m0 | |
17373 pmaddubsw m5, m1 | |
17374 pmulhrsw m4, m2 | |
17375 pmulhrsw m5, m2 | |
17376 packuswb m4, m5 | |
17377 vpermq m4, m4, q3120 | |
17378 movu [r0 + r1 * 2], m4 | |
17379 | |
17380 palignr m4, m9, m3, 4 | |
17381 palignr m5, m3, m6, 9 | |
17382 pshufb m4, m7 | |
17383 pshufb m5, m8 | |
17384 pmaddubsw m4, m0 | |
17385 pmaddubsw m5, m1 | |
17386 pmulhrsw m4, m2 | |
17387 pmulhrsw m5, m2 | |
17388 packuswb m4, m5 | |
17389 vpermq m4, m4, q3120 | |
17390 movu [r0 + r3], m4 | |
17391 | |
17392 lea r0, [r0 + r1 * 4] | |
17393 | |
; rows 20 to 23
17394 palignr m4, m9, m3, 5 | |
17395 palignr m5, m3, m6, 10 | |
17396 pshufb m4, m7 | |
17397 pshufb m5, m8 | |
17398 pmaddubsw m4, m0 | |
17399 pmaddubsw m5, m1 | |
17400 pmulhrsw m4, m2 | |
17401 pmulhrsw m5, m2 | |
17402 packuswb m4, m5 | |
17403 vpermq m4, m4, q3120 | |
17404 movu [r0], m4 | |
17405 | |
17406 palignr m4, m9, m3, 6 | |
17407 palignr m5, m3, m6, 11 | |
17408 pshufb m4, m7 | |
17409 pshufb m5, m8 | |
17410 pmaddubsw m4, m0 | |
17411 pmaddubsw m5, m1 | |
17412 pmulhrsw m4, m2 | |
17413 pmulhrsw m5, m2 | |
17414 packuswb m4, m5 | |
17415 vpermq m4, m4, q3120 | |
17416 movu [r0 + r1], m4 | |
17417 | |
17418 palignr m4, m9, m3, 7 | |
17419 palignr m5, m3, m6, 12 | |
17420 pshufb m4, m7 | |
17421 pshufb m5, m8 | |
17422 pmaddubsw m4, m0 | |
17423 pmaddubsw m5, m1 | |
17424 pmulhrsw m4, m2 | |
17425 pmulhrsw m5, m2 | |
17426 packuswb m4, m5 | |
17427 vpermq m4, m4, q3120 | |
17428 movu [r0 + r1 * 2], m4 | |
17429 | |
17430 palignr m4, m9, m3, 8 | |
17431 palignr m5, m3, m6, 13 | |
17432 pshufb m4, m7 | |
17433 pshufb m5, m8 | |
17434 pmaddubsw m4, m0 | |
17435 pmaddubsw m5, m1 | |
17436 pmulhrsw m4, m2 | |
17437 pmulhrsw m5, m2 | |
17438 packuswb m4, m5 | |
17439 vpermq m4, m4, q3120 | |
17440 movu [r0 + r3], m4 | |
17441 | |
17442 lea r0, [r0 + r1 * 4] | |
17443 | |
; rows 24 to 27
17444 palignr m4, m9, m3, 9 | |
17445 palignr m5, m3, m6, 14 | |
17446 pshufb m4, m7 | |
17447 pshufb m5, m8 | |
17448 pmaddubsw m4, m0 | |
17449 pmaddubsw m5, m1 | |
17450 pmulhrsw m4, m2 | |
17451 pmulhrsw m5, m2 | |
17452 packuswb m4, m5 | |
17453 vpermq m4, m4, q3120 | |
17454 movu [r0], m4 | |
17455 | |
17456 palignr m4, m9, m3, 10 | |
17457 palignr m5, m3, m6, 15 | |
17458 pshufb m4, m7 | |
17459 pshufb m5, m8 | |
17460 pmaddubsw m4, m0 | |
17461 pmaddubsw m5, m1 | |
17462 pmulhrsw m4, m2 | |
17463 pmulhrsw m5, m2 | |
17464 packuswb m4, m5 | |
17465 vpermq m4, m4, q3120 | |
17466 movu [r0 + r1], m4 | |
17467 | |
; Row 26: the m5 window has reached m3 itself; afterwards it moves to m9:m3.
17468 palignr m4, m9, m3, 11 | |
17469 pshufb m4, m7 | |
17470 pshufb m5, m3, m8 | |
17471 pmaddubsw m4, m0 | |
17472 pmaddubsw m5, m1 | |
17473 pmulhrsw m4, m2 | |
17474 pmulhrsw m5, m2 | |
17475 packuswb m4, m5 | |
17476 vpermq m4, m4, q3120 | |
17477 movu [r0 + r1 * 2], m4 | |
17478 | |
17479 palignr m4, m9, m3, 12 | |
17480 palignr m5, m9, m3, 1 | |
17481 pshufb m4, m7 | |
17482 pshufb m5, m8 | |
17483 pmaddubsw m4, m0 | |
17484 pmaddubsw m5, m1 | |
17485 pmulhrsw m4, m2 | |
17486 pmulhrsw m5, m2 | |
17487 packuswb m4, m5 | |
17488 vpermq m4, m4, q3120 | |
17489 movu [r0 + r3], m4 | |
17490 | |
17491 lea r0, [r0 + r1 * 4] | |
17492 | |
; rows 28 to 31
17493 palignr m4, m9, m3, 13 | |
17494 palignr m5, m9, m3, 2 | |
17495 pshufb m4, m7 | |
17496 pshufb m5, m8 | |
17497 pmaddubsw m4, m0 | |
17498 pmaddubsw m5, m1 | |
17499 pmulhrsw m4, m2 | |
17500 pmulhrsw m5, m2 | |
17501 packuswb m4, m5 | |
17502 vpermq m4, m4, q3120 | |
17503 movu [r0], m4 | |
17504 | |
17505 palignr m4, m9, m3, 14 | |
17506 palignr m5, m9, m3, 3 | |
17507 pshufb m4, m7 | |
17508 pshufb m5, m8 | |
17509 pmaddubsw m4, m0 | |
17510 pmaddubsw m5, m1 | |
17511 pmulhrsw m4, m2 | |
17512 pmulhrsw m5, m2 | |
17513 packuswb m4, m5 | |
17514 vpermq m4, m4, q3120 | |
17515 movu [r0 + r1], m4 | |
17516 | |
17517 palignr m4, m9, m3, 15 | |
17518 palignr m5, m9, m3, 4 | |
17519 pshufb m4, m7 | |
17520 pshufb m5, m8 | |
17521 pmaddubsw m4, m0 | |
17522 pmaddubsw m5, m1 | |
17523 pmulhrsw m4, m2 | |
17524 pmulhrsw m5, m2 | |
17525 packuswb m4, m5 | |
17526 vpermq m4, m4, q3120 | |
17527 movu [r0 + r1 * 2], m4 | |
17528 | |
; Row 31: the m4 window has reached m9 itself.
17529 palignr m5, m9, m3, 5 | |
17530 pshufb m4, m9, m7 | |
17531 pshufb m5, m8 | |
17532 pmaddubsw m4, m0 | |
17533 pmaddubsw m5, m1 | |
17534 pmulhrsw m4, m2 | |
17535 pmulhrsw m5, m2 | |
17536 packuswb m4, m5 | |
17537 vpermq m4, m4, q3120 | |
17538 movu [r0 + r3], m4 | |
17539 RET | |
17540 | |
;-----------------------------------------------------------------------------------------
; intra_pred_ang32_20: AVX2 angular intra prediction, 32 rows, mode 20.
; cglobal (x86inc) declares 3 arguments, 5 GPRs and 10 vector registers.
;   r0 = destination; rows are stored at r0, r0 + r1, r0 + r1*2, r0 + r4 and
;        r0 is then advanced by 4 rows
;   r1 = destination stride, r4 = 3 * r1
;   r2 = reference pixel buffer; read at r2 + 0, r2 + 1 and r2 + mmsize*2
;   r3 = ang_table_avx2 + 32 * 16, so [r3 + k * 32] addresses table row 16 + k;
;        the bracketed number after each first pmaddubsw is that row index
;   m5 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming pw_1024 holds
;        16-bit 1024s -- TODO confirm against the constant's definition
; Each output row is: pmaddubsw of interleaved byte pairs with one table row,
; pmulhrsw rounding, packuswb back to bytes, one full-register store.
; NOTE(review): uses m8/m9, so this sits inside the ARCH_X86_64 block closed
; further down in the file.
;-----------------------------------------------------------------------------------------
17541 cglobal intra_pred_ang32_20, 3,5,10 | |
17542 lea r3, [ang_table_avx2 + 32 * 16] | |
17543 lea r4, [r1 * 3] | |
17544 mova m5, [pw_1024] | |
17545 | |
17546 ; rows 0 to 7 | |
; Interleave every reference byte with its right-hand neighbour (r2 + 1) so that
; pmaddubsw can weight each pair: m0 = low halves, m2 = high halves of each lane.
17547 movu m0, [r2 + 0] | |
17548 movu m1, [r2 + 1] | |
17549 punpckhbw m2, m0, m1 | |
17550 punpcklbw m0, m1 | |
17551 | |
; Load the second reference segment (r2 + mmsize*2) and reorder it with the
; ang32_shuf_mode20 tables. m3, m8 and m9 built below are the extension
; registers that later rows palignr against as the window moves.
17552 movu m4, [r2 + mmsize*2] | |
17553 pshufb m4, [ang32_shuf_mode20] | |
17554 mova m9, m4 | |
17555 vpermq m9, m9, q3333 | |
17556 mova m7, m4 | |
17557 vpermq m7, m7, q1111 | |
17558 palignr m4, m7, 14 | |
17559 pshufb m4, [ang32_shuf_mode20 + mmsize*1] | |
17560 | |
17561 vextracti128 xm6, m4, 1 | |
17562 palignr m3, m0, m4, 1 | |
17563 palignr m8, m3, m6, 1 | |
17564 vinserti128 m3, m3, xm2, 1 | |
17565 vinserti128 m8, m8, xm0, 1 | |
17566 vinserti128 m9, m9, xm3, 1 | |
17567 | |
17568 pmaddubsw m4, m0, [r3 - 5 * 32] ; [11] | |
17569 pmulhrsw m4, m5 | |
17570 pmaddubsw m1, m2, [r3 - 5 * 32] | |
17571 pmulhrsw m1, m5 | |
17572 packuswb m4, m1 | |
17573 movu [r0], m4 | |
17574 | |
; From here on m6/m7 hold the current source window; when two consecutive rows
; share a window only the table row changes and the palignr pair is not repeated.
17575 palignr m6, m0, m3, 14 | |
17576 palignr m7, m2, m0, 14 | |
17577 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] | |
17578 pmulhrsw m4, m5 | |
17579 pmaddubsw m1, m7, [r3 + 6 * 32] | |
17580 pmulhrsw m1, m5 | |
17581 packuswb m4, m1 | |
17582 movu [r0 + r1], m4 | |
17583 | |
17584 pmaddubsw m4, m6, [r3 - 15 * 32] ; [1] | |
17585 pmulhrsw m4, m5 | |
17586 pmaddubsw m1, m7, [r3 - 15 * 32] | |
17587 pmulhrsw m1, m5 | |
17588 packuswb m4, m1 | |
17589 movu [r0 + r1*2], m4 | |
17590 | |
17591 palignr m6, m0, m3, 12 | |
17592 palignr m7, m2, m0, 12 | |
17593 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] | |
17594 pmulhrsw m4, m5 | |
17595 pmaddubsw m1, m7, [r3 - 4 * 32] | |
17596 pmulhrsw m1, m5 | |
17597 packuswb m4, m1 | |
17598 movu [r0 + r4], m4 | |
17599 | |
17600 lea r0, [r0 + r1 * 4] | |
17601 | |
17602 palignr m6, m0, m3, 10 | |
17603 palignr m7, m2, m0, 10 | |
17604 pmaddubsw m4, m6, [r3 + 7 * 32] ; [23] | |
17605 pmulhrsw m4, m5 | |
17606 pmaddubsw m1, m7, [r3 + 7 * 32] | |
17607 pmulhrsw m1, m5 | |
17608 packuswb m4, m1 | |
17609 movu [r0], m4 | |
17610 | |
17611 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2] | |
17612 pmulhrsw m4, m5 | |
17613 pmaddubsw m1, m7, [r3 - 14 * 32] | |
17614 pmulhrsw m1, m5 | |
17615 packuswb m4, m1 | |
17616 movu [r0 + r1], m4 | |
17617 | |
17618 palignr m6, m0, m3, 8 | |
17619 palignr m7, m2, m0, 8 | |
17620 pmaddubsw m4, m6, [r3 - 3 * 32] ; [13] | |
17621 pmulhrsw m4, m5 | |
17622 pmaddubsw m1, m7, [r3 - 3 * 32] | |
17623 pmulhrsw m1, m5 | |
17624 packuswb m4, m1 | |
17625 movu [r0 + r1*2], m4 | |
17626 | |
17627 palignr m6, m0, m3, 6 | |
17628 palignr m7, m2, m0, 6 | |
17629 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24] | |
17630 pmulhrsw m4, m5 | |
17631 pmaddubsw m1, m7, [r3 + 8 * 32] | |
17632 pmulhrsw m1, m5 | |
17633 packuswb m4, m1 | |
17634 movu [r0 + r4], m4 | |
17635 | |
17636 lea r0, [r0 + r1 * 4] | |
17637 | |
17638 ; rows 8 to 15 | |
17639 pmaddubsw m4, m6, [r3 - 13 * 32] ; [3] | |
17640 pmulhrsw m4, m5 | |
17641 pmaddubsw m1, m7, [r3 - 13 * 32] | |
17642 pmulhrsw m1, m5 | |
17643 packuswb m4, m1 | |
17644 movu [r0], m4 | |
17645 | |
17646 palignr m6, m0, m3, 4 | |
17647 palignr m7, m2, m0, 4 | |
17648 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] | |
17649 pmulhrsw m4, m5 | |
17650 pmaddubsw m1, m7, [r3 - 2 * 32] | |
17651 pmulhrsw m1, m5 | |
17652 packuswb m4, m1 | |
17653 movu [r0 + r1], m4 | |
17654 | |
17655 palignr m6, m0, m3, 2 | |
17656 palignr m7, m2, m0, 2 | |
17657 pmaddubsw m4, m6, [r3 + 9 * 32] ; [25] | |
17658 pmulhrsw m4, m5 | |
17659 pmaddubsw m1, m7, [r3 + 9 * 32] | |
17660 pmulhrsw m1, m5 | |
17661 packuswb m4, m1 | |
17662 movu [r0 + r1*2], m4 | |
17663 | |
17664 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] | |
17665 pmulhrsw m4, m5 | |
17666 pmaddubsw m1, m7, [r3 - 12 * 32] | |
17667 pmulhrsw m1, m5 | |
17668 packuswb m4, m1 | |
17669 movu [r0 + r4], m4 | |
17670 | |
17671 lea r0, [r0 + r1 * 4] | |
17672 | |
; The window has moved a full 16 bytes: the sources are now m3/m0 themselves,
; and the following rows palignr (m3, m8) and (m0, m3).
17673 pmaddubsw m4, m3, [r3 - 1 * 32] ; [15] | |
17674 pmulhrsw m4, m5 | |
17675 pmaddubsw m1, m0, [r3 - 1 * 32] | |
17676 pmulhrsw m1, m5 | |
17677 packuswb m4, m1 | |
17678 movu [r0], m4 | |
17679 | |
17680 palignr m6, m3, m8, 14 | |
17681 palignr m7, m0, m3, 14 | |
17682 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] | |
17683 pmulhrsw m4, m5 | |
17684 pmaddubsw m1, m7, [r3 + 10 * 32] | |
17685 pmulhrsw m1, m5 | |
17686 packuswb m4, m1 | |
17687 movu [r0 + r1], m4 | |
17688 | |
17689 pmaddubsw m4, m6, [r3 - 11 * 32] ; [5] | |
17690 pmulhrsw m4, m5 | |
17691 pmaddubsw m1, m7, [r3 - 11 * 32] | |
17692 pmulhrsw m1, m5 | |
17693 packuswb m4, m1 | |
17694 movu [r0 + r1 * 2], m4 | |
17695 | |
17696 palignr m6, m3, m8, 12 | |
17697 palignr m7, m0, m3, 12 | |
17698 pmaddubsw m4, m6, [r3] ; [16] | |
17699 pmulhrsw m4, m5 | |
17700 pmaddubsw m1, m7, [r3] | |
17701 pmulhrsw m1, m5 | |
17702 packuswb m4, m1 | |
17703 movu [r0 + r4], m4 | |
17704 | |
17705 lea r0, [r0 + r1 * 4] | |
17706 | |
17707 ; rows 16 to 23 | |
17708 palignr m6, m3, m8, 10 | |
17709 palignr m7, m0, m3, 10 | |
17710 pmaddubsw m4, m6, [r3 + 11 * 32] ; [27] | |
17711 pmulhrsw m4, m5 | |
17712 pmaddubsw m1, m7, [r3 + 11 * 32] | |
17713 pmulhrsw m1, m5 | |
17714 packuswb m4, m1 | |
17715 movu [r0], m4 | |
17716 | |
17717 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6] | |
17718 pmulhrsw m4, m5 | |
17719 pmaddubsw m1, m7, [r3 - 10 * 32] | |
17720 pmulhrsw m1, m5 | |
17721 packuswb m4, m1 | |
17722 movu [r0 + r1], m4 | |
17723 | |
17724 palignr m6, m3, m8, 8 | |
17725 palignr m7, m0, m3, 8 | |
17726 pmaddubsw m4, m6, [r3 + 1 * 32] ; [17] | |
17727 pmulhrsw m4, m5 | |
17728 pmaddubsw m1, m7, [r3 + 1 * 32] | |
17729 pmulhrsw m1, m5 | |
17730 packuswb m4, m1 | |
17731 movu [r0 + r1*2], m4 | |
17732 | |
17733 palignr m6, m3, m8, 6 | |
17734 palignr m7, m0, m3, 6 | |
17735 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28] | |
17736 pmulhrsw m4, m5 | |
17737 pmaddubsw m1, m7, [r3 + 12 * 32] | |
17738 pmulhrsw m1, m5 | |
17739 packuswb m4, m1 | |
17740 movu [r0 + r4], m4 | |
17741 | |
17742 lea r0, [r0 + r1 * 4] | |
17743 | |
17744 pmaddubsw m4, m6, [r3 - 9 * 32] ; [7] | |
17745 pmulhrsw m4, m5 | |
17746 pmaddubsw m1, m7, [r3 - 9 * 32] | |
17747 pmulhrsw m1, m5 | |
17748 packuswb m4, m1 | |
17749 movu [r0], m4 | |
17750 | |
17751 palignr m6, m3, m8, 4 | |
17752 palignr m7, m0, m3, 4 | |
17753 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] | |
17754 pmulhrsw m4, m5 | |
17755 pmaddubsw m1, m7, [r3 + 2 * 32] | |
17756 pmulhrsw m1, m5 | |
17757 packuswb m4, m1 | |
17758 movu [r0 + r1], m4 | |
17759 | |
17760 palignr m6, m3, m8, 2 | |
17761 palignr m7, m0, m3, 2 | |
17762 pmaddubsw m4, m6, [r3 + 13 * 32] ; [29] | |
17763 pmulhrsw m4, m5 | |
17764 pmaddubsw m1, m7, [r3 + 13 * 32] | |
17765 pmulhrsw m1, m5 | |
17766 packuswb m4, m1 | |
17767 movu [r0 + r1*2], m4 | |
17768 | |
17769 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] | |
17770 pmulhrsw m4, m5 | |
17771 pmaddubsw m1, m7, [r3 - 8 * 32] | |
17772 pmulhrsw m1, m5 | |
17773 packuswb m4, m1 | |
17774 movu [r0 + r4], m4 | |
17775 | |
17776 lea r0, [r0 + r1 * 4] | |
17777 | |
17778 ; rows 24 to 31 | |
; Another full 16-byte step: sources are m8/m3, then palignr (m8, m9) / (m3, m8).
17779 pmaddubsw m4, m8, [r3 + 3 * 32] ; [19] | |
17780 pmulhrsw m4, m5 | |
17781 pmaddubsw m1, m3, [r3 + 3 * 32] | |
17782 pmulhrsw m1, m5 | |
17783 packuswb m4, m1 | |
17784 movu [r0], m4 | |
17785 | |
17786 palignr m6, m8, m9, 14 | |
17787 palignr m7, m3, m8, 14 | |
17788 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] | |
17789 pmulhrsw m4, m5 | |
17790 pmaddubsw m1, m7, [r3 + 14 * 32] | |
17791 pmulhrsw m1, m5 | |
17792 packuswb m4, m1 | |
17793 movu [r0 + r1], m4 | |
17794 | |
17795 pmaddubsw m4, m6, [r3 - 7 * 32] ; [9] | |
17796 pmulhrsw m4, m5 | |
17797 pmaddubsw m1, m7, [r3 - 7 * 32] | |
17798 pmulhrsw m1, m5 | |
17799 packuswb m4, m1 | |
17800 movu [r0 + r1 * 2], m4 | |
17801 | |
17802 palignr m6, m8, m9, 12 | |
17803 palignr m7, m3, m8, 12 | |
17804 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20] | |
17805 pmulhrsw m4, m5 | |
17806 pmaddubsw m1, m7, [r3 + 4 * 32] | |
17807 pmulhrsw m1, m5 | |
17808 packuswb m4, m1 | |
17809 movu [r0 + r4], m4 | |
17810 | |
17811 lea r0, [r0 + r1 * 4] | |
17812 | |
17813 palignr m6, m8, m9, 10 | |
17814 palignr m7, m3, m8, 10 | |
17815 pmaddubsw m4, m6, [r3 + 15 * 32] ; [31] | |
17816 pmulhrsw m4, m5 | |
17817 pmaddubsw m1, m7, [r3 + 15 * 32] | |
17818 pmulhrsw m1, m5 | |
17819 packuswb m4, m1 | |
17820 movu [r0], m4 | |
17821 | |
17822 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] | |
17823 pmulhrsw m4, m5 | |
17824 pmaddubsw m1, m7, [r3 - 6 * 32] | |
17825 pmulhrsw m1, m5 | |
17826 packuswb m4, m1 | |
17827 movu [r0 + r1], m4 | |
17828 | |
17829 palignr m6, m8, m9, 8 | |
17830 palignr m7, m3, m8, 8 | |
17831 pmaddubsw m4, m6, [r3 + 5 * 32] ; [21] | |
17832 pmulhrsw m4, m5 | |
17833 pmaddubsw m1, m7, [r3 + 5 * 32] | |
17834 pmulhrsw m1, m5 | |
17835 packuswb m4, m1 | |
17836 movu [r0 + r1*2], m4 | |
17837 | |
; Final row: no table multiply. Masking with pw_00ff and packing keeps only the
; first byte of every interleaved pair from the same window (m6/m7) --
; assumes pw_00ff is words of 0x00ff, TODO confirm.
17838 pand m6, [pw_00ff] | |
17839 pand m7, [pw_00ff] | |
17840 packuswb m6, m7 | |
17841 movu [r0 + r4], m6 | |
17842 RET | |
17843 | |
;-----------------------------------------------------------------------------------------
; intra_pred_ang32_17: AVX2 angular intra prediction, 32 rows, mode 17.
; cglobal (x86inc) declares 3 arguments, 4 GPRs and 8 vector registers.
;   r0 = destination; rows are stored at r0, r0 + r1, r0 + r1*2, r0 + r3 and
;        r0 is then advanced by 4 rows
;   r1 = destination stride, r3 = 3 * r1
;   r2 = reference pixel buffer; read at r2 and from r2 + mmsize*2 onwards
;   m0 = ang32_fact_mode17 (pmaddubsw multipliers)
;   m7 = ang32_shuf_mode17 (pshufb control applied to every window)
;   m2 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming pw_1024 holds
;        16-bit 1024s -- TODO confirm against the constant's definition
; Every row takes two palignr windows whose byte offset grows by one per row,
; shuffles both with m7, applies pmaddubsw with m0 and pmulhrsw with m2, packs
; to bytes, restores lane order with vpermq q3120 and stores one row.
; Where the offset would reach 16 the window equals the upper source register,
; so that row shuffles the register directly instead of issuing a palignr.
;-----------------------------------------------------------------------------------------
17844 cglobal intra_pred_ang32_17, 3,4,8 | |
17845 movu m0, [ang32_fact_mode17] | |
17846 mova m2, [pw_1024] | |
17847 mova m7, [ang32_shuf_mode17] | |
17848 lea r3, [r1 * 3] | |
17849 | |
17850 ; prepare for [31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2...] | |
17851 | |
; m6 = bytes at r2 reordered by the second ang32_shuf_mode17 table and a vpermd;
; m1 = a lane-duplicated copy of that data shifted left by 4 bytes.
17852 movu m6, [r2] | |
17853 pshufb m6, [ang32_shuf_mode17 + mmsize] | |
17854 mova m1, m6 | |
17855 mova m3, [ang32_shuf_mode16 + mmsize*3] | |
17856 vpermd m6, m3, m6 | |
17857 vpermq m1, m1, q3232 | |
17858 pslldq m1, 4 | |
17859 | |
; m3 = 16 bytes from r2 + mmsize*2 with byte 0 replaced by [r2], copied to both lanes.
17860 movu xm4, [r2 + mmsize*2] | |
17861 pinsrb xm4, [r2], 0 | |
17862 vinserti128 m3, m4, xm4, 1 | |
17863 | |
17864 palignr m4, m3, m6, 2 | |
17865 palignr m5, m6, m1, 5 | |
17866 pshufb m4, m7 | |
17867 pshufb m5, m7 | |
17868 pmaddubsw m4, m0 | |
17869 pmaddubsw m5, m0 | |
17870 pmulhrsw m4, m2 | |
17871 pmulhrsw m5, m2 | |
17872 packuswb m4, m5 | |
17873 vpermq m4, m4, q3120 | |
17874 movu [r0], m4 | |
17875 | |
17876 palignr m4, m3, m6, 3 | |
17877 palignr m5, m6, m1, 6 | |
17878 pshufb m4, m7 | |
17879 pshufb m5, m7 | |
17880 pmaddubsw m4, m0 | |
17881 pmaddubsw m5, m0 | |
17882 pmulhrsw m4, m2 | |
17883 pmulhrsw m5, m2 | |
17884 packuswb m4, m5 | |
17885 vpermq m4, m4, q3120 | |
17886 movu [r0 + r1], m4 | |
17887 | |
17888 palignr m4, m3, m6, 4 | |
17889 palignr m5, m6, m1, 7 | |
17890 pshufb m4, m7 | |
17891 pshufb m5, m7 | |
17892 pmaddubsw m4, m0 | |
17893 pmaddubsw m5, m0 | |
17894 pmulhrsw m4, m2 | |
17895 pmulhrsw m5, m2 | |
17896 packuswb m4, m5 | |
17897 vpermq m4, m4, q3120 | |
17898 movu [r0 + r1 * 2], m4 | |
17899 | |
17900 palignr m4, m3, m6, 5 | |
17901 palignr m5, m6, m1, 8 | |
17902 pshufb m4, m7 | |
17903 pshufb m5, m7 | |
17904 pmaddubsw m4, m0 | |
17905 pmaddubsw m5, m0 | |
17906 pmulhrsw m4, m2 | |
17907 pmulhrsw m5, m2 | |
17908 packuswb m4, m5 | |
17909 vpermq m4, m4, q3120 | |
17910 movu [r0 + r3], m4 | |
17911 | |
17912 lea r0, [r0 + r1 * 4] | |
17913 | |
17914 palignr m4, m3, m6, 6 | |
17915 palignr m5, m6, m1, 9 | |
17916 pshufb m4, m7 | |
17917 pshufb m5, m7 | |
17918 pmaddubsw m4, m0 | |
17919 pmaddubsw m5, m0 | |
17920 pmulhrsw m4, m2 | |
17921 pmulhrsw m5, m2 | |
17922 packuswb m4, m5 | |
17923 vpermq m4, m4, q3120 | |
17924 movu [r0], m4 | |
17925 | |
17926 palignr m4, m3, m6, 7 | |
17927 palignr m5, m6, m1, 10 | |
17928 pshufb m4, m7 | |
17929 pshufb m5, m7 | |
17930 pmaddubsw m4, m0 | |
17931 pmaddubsw m5, m0 | |
17932 pmulhrsw m4, m2 | |
17933 pmulhrsw m5, m2 | |
17934 packuswb m4, m5 | |
17935 vpermq m4, m4, q3120 | |
17936 movu [r0 + r1], m4 | |
17937 | |
17938 palignr m4, m3, m6, 8 | |
17939 palignr m5, m6, m1, 11 | |
17940 pshufb m4, m7 | |
17941 pshufb m5, m7 | |
17942 pmaddubsw m4, m0 | |
17943 pmaddubsw m5, m0 | |
17944 pmulhrsw m4, m2 | |
17945 pmulhrsw m5, m2 | |
17946 packuswb m4, m5 | |
17947 vpermq m4, m4, q3120 | |
17948 movu [r0 + r1 * 2], m4 | |
17949 | |
17950 palignr m4, m3, m6, 9 | |
17951 palignr m5, m6, m1, 12 | |
17952 pshufb m4, m7 | |
17953 pshufb m5, m7 | |
17954 pmaddubsw m4, m0 | |
17955 pmaddubsw m5, m0 | |
17956 pmulhrsw m4, m2 | |
17957 pmulhrsw m5, m2 | |
17958 packuswb m4, m5 | |
17959 vpermq m4, m4, q3120 | |
17960 movu [r0 + r3], m4 | |
17961 | |
17962 lea r0, [r0 + r1 * 4] | |
17963 | |
17964 palignr m4, m3, m6, 10 | |
17965 palignr m5, m6, m1, 13 | |
17966 pshufb m4, m7 | |
17967 pshufb m5, m7 | |
17968 pmaddubsw m4, m0 | |
17969 pmaddubsw m5, m0 | |
17970 pmulhrsw m4, m2 | |
17971 pmulhrsw m5, m2 | |
17972 packuswb m4, m5 | |
17973 vpermq m4, m4, q3120 | |
17974 movu [r0], m4 | |
17975 | |
17976 palignr m4, m3, m6, 11 | |
17977 palignr m5, m6, m1, 14 | |
17978 pshufb m4, m7 | |
17979 pshufb m5, m7 | |
17980 pmaddubsw m4, m0 | |
17981 pmaddubsw m5, m0 | |
17982 pmulhrsw m4, m2 | |
17983 pmulhrsw m5, m2 | |
17984 packuswb m4, m5 | |
17985 vpermq m4, m4, q3120 | |
17986 movu [r0 + r1], m4 | |
17987 | |
17988 palignr m4, m3, m6, 12 | |
17989 palignr m5, m6, m1, 15 | |
17990 pshufb m4, m7 | |
17991 pshufb m5, m7 | |
17992 pmaddubsw m4, m0 | |
17993 pmaddubsw m5, m0 | |
17994 pmulhrsw m4, m2 | |
17995 pmulhrsw m5, m2 | |
17996 packuswb m4, m5 | |
17997 vpermq m4, m4, q3120 | |
17998 movu [r0 + r1 * 2], m4 | |
17999 | |
; Second window would be (m6, m1) at offset 16, i.e. m6 itself: shuffle it directly.
18000 palignr m4, m3, m6, 13 | |
18001 pshufb m4, m7 | |
18002 pshufb m5, m6, m7 | |
18003 pmaddubsw m4, m0 | |
18004 pmaddubsw m5, m0 | |
18005 pmulhrsw m4, m2 | |
18006 pmulhrsw m5, m2 | |
18007 packuswb m4, m5 | |
18008 vpermq m4, m4, q3120 | |
18009 movu [r0 + r3], m4 | |
18010 | |
18011 lea r0, [r0 + r1 * 4] | |
18012 | |
; The second window now continues in the (m3, m6) pair, starting again at offset 1.
18013 palignr m4, m3, m6, 14 | |
18014 palignr m5, m3, m6, 1 | |
18015 pshufb m4, m7 | |
18016 pshufb m5, m7 | |
18017 pmaddubsw m4, m0 | |
18018 pmaddubsw m5, m0 | |
18019 pmulhrsw m4, m2 | |
18020 pmulhrsw m5, m2 | |
18021 packuswb m4, m5 | |
18022 vpermq m4, m4, q3120 | |
18023 movu [r0], m4 | |
18024 | |
18025 palignr m4, m3, m6, 15 | |
18026 palignr m5, m3, m6, 2 | |
18027 pshufb m4, m7 | |
18028 pshufb m5, m7 | |
18029 pmaddubsw m4, m0 | |
18030 pmaddubsw m5, m0 | |
18031 pmulhrsw m4, m2 | |
18032 pmulhrsw m5, m2 | |
18033 packuswb m4, m5 | |
18034 vpermq m4, m4, q3120 | |
18035 movu [r0 + r1], m4 | |
18036 | |
; First window would be (m3, m6) at offset 16, i.e. m3 itself: shuffle it directly.
18037 palignr m5, m3, m6, 3 | |
18038 pshufb m4, m3, m7 | |
18039 pshufb m5, m7 | |
18040 pmaddubsw m4, m0 | |
18041 pmaddubsw m5, m0 | |
18042 pmulhrsw m4, m2 | |
18043 pmulhrsw m5, m2 | |
18044 packuswb m4, m5 | |
18045 vpermq m4, m4, q3120 | |
18046 movu [r0 + r1 * 2], m4 | |
18047 | |
; m1 is recycled: it now holds the next 16 reference bytes (r2 + mmsize*2 + 16)
; in both lanes, and the first window continues in the (m1, m3) pair.
18048 vbroadcasti128 m1, [r2 + mmsize*2 + 16] | |
18049 palignr m4, m1, m3, 1 | |
18050 palignr m5, m3, m6, 4 | |
18051 pshufb m4, m7 | |
18052 pshufb m5, m7 | |
18053 pmaddubsw m4, m0 | |
18054 pmaddubsw m5, m0 | |
18055 pmulhrsw m4, m2 | |
18056 pmulhrsw m5, m2 | |
18057 packuswb m4, m5 | |
18058 vpermq m4, m4, q3120 | |
18059 movu [r0 + r3], m4 | |
18060 | |
18061 lea r0, [r0 + r1 * 4] | |
18062 | |
18063 palignr m4, m1, m3, 2 | |
18064 palignr m5, m3, m6, 5 | |
18065 pshufb m4, m7 | |
18066 pshufb m5, m7 | |
18067 pmaddubsw m4, m0 | |
18068 pmaddubsw m5, m0 | |
18069 pmulhrsw m4, m2 | |
18070 pmulhrsw m5, m2 | |
18071 packuswb m4, m5 | |
18072 vpermq m4, m4, q3120 | |
18073 movu [r0], m4 | |
18074 | |
18075 palignr m4, m1, m3, 3 | |
18076 palignr m5, m3, m6, 6 | |
18077 pshufb m4, m7 | |
18078 pshufb m5, m7 | |
18079 pmaddubsw m4, m0 | |
18080 pmaddubsw m5, m0 | |
18081 pmulhrsw m4, m2 | |
18082 pmulhrsw m5, m2 | |
18083 packuswb m4, m5 | |
18084 vpermq m4, m4, q3120 | |
18085 movu [r0 + r1], m4 | |
18086 | |
18087 palignr m4, m1, m3, 4 | |
18088 palignr m5, m3, m6, 7 | |
18089 pshufb m4, m7 | |
18090 pshufb m5, m7 | |
18091 pmaddubsw m4, m0 | |
18092 pmaddubsw m5, m0 | |
18093 pmulhrsw m4, m2 | |
18094 pmulhrsw m5, m2 | |
18095 packuswb m4, m5 | |
18096 vpermq m4, m4, q3120 | |
18097 movu [r0 + r1 * 2], m4 | |
18098 | |
18099 palignr m4, m1, m3, 5 | |
18100 palignr m5, m3, m6, 8 | |
18101 pshufb m4, m7 | |
18102 pshufb m5, m7 | |
18103 pmaddubsw m4, m0 | |
18104 pmaddubsw m5, m0 | |
18105 pmulhrsw m4, m2 | |
18106 pmulhrsw m5, m2 | |
18107 packuswb m4, m5 | |
18108 vpermq m4, m4, q3120 | |
18109 movu [r0 + r3], m4 | |
18110 | |
18111 lea r0, [r0 + r1 * 4] | |
18112 | |
18113 palignr m4, m1, m3, 6 | |
18114 palignr m5, m3, m6, 9 | |
18115 pshufb m4, m7 | |
18116 pshufb m5, m7 | |
18117 pmaddubsw m4, m0 | |
18118 pmaddubsw m5, m0 | |
18119 pmulhrsw m4, m2 | |
18120 pmulhrsw m5, m2 | |
18121 packuswb m4, m5 | |
18122 vpermq m4, m4, q3120 | |
18123 movu [r0], m4 | |
18124 | |
18125 palignr m4, m1, m3, 7 | |
18126 palignr m5, m3, m6, 10 | |
18127 pshufb m4, m7 | |
18128 pshufb m5, m7 | |
18129 pmaddubsw m4, m0 | |
18130 pmaddubsw m5, m0 | |
18131 pmulhrsw m4, m2 | |
18132 pmulhrsw m5, m2 | |
18133 packuswb m4, m5 | |
18134 vpermq m4, m4, q3120 | |
18135 movu [r0 + r1], m4 | |
18136 | |
18137 palignr m4, m1, m3, 8 | |
18138 palignr m5, m3, m6, 11 | |
18139 pshufb m4, m7 | |
18140 pshufb m5, m7 | |
18141 pmaddubsw m4, m0 | |
18142 pmaddubsw m5, m0 | |
18143 pmulhrsw m4, m2 | |
18144 pmulhrsw m5, m2 | |
18145 packuswb m4, m5 | |
18146 vpermq m4, m4, q3120 | |
18147 movu [r0 + r1 * 2], m4 | |
18148 | |
18149 palignr m4, m1, m3, 9 | |
18150 palignr m5, m3, m6, 12 | |
18151 pshufb m4, m7 | |
18152 pshufb m5, m7 | |
18153 pmaddubsw m4, m0 | |
18154 pmaddubsw m5, m0 | |
18155 pmulhrsw m4, m2 | |
18156 pmulhrsw m5, m2 | |
18157 packuswb m4, m5 | |
18158 vpermq m4, m4, q3120 | |
18159 movu [r0 + r3], m4 | |
18160 | |
18161 lea r0, [r0 + r1 * 4] | |
18162 | |
18163 palignr m4, m1, m3, 10 | |
18164 palignr m5, m3, m6, 13 | |
18165 pshufb m4, m7 | |
18166 pshufb m5, m7 | |
18167 pmaddubsw m4, m0 | |
18168 pmaddubsw m5, m0 | |
18169 pmulhrsw m4, m2 | |
18170 pmulhrsw m5, m2 | |
18171 packuswb m4, m5 | |
18172 vpermq m4, m4, q3120 | |
18173 movu [r0], m4 | |
18174 | |
18175 palignr m4, m1, m3, 11 | |
18176 palignr m5, m3, m6, 14 | |
18177 pshufb m4, m7 | |
18178 pshufb m5, m7 | |
18179 pmaddubsw m4, m0 | |
18180 pmaddubsw m5, m0 | |
18181 pmulhrsw m4, m2 | |
18182 pmulhrsw m5, m2 | |
18183 packuswb m4, m5 | |
18184 vpermq m4, m4, q3120 | |
18185 movu [r0 + r1], m4 | |
18186 | |
18187 palignr m4, m1, m3, 12 | |
18188 palignr m5, m3, m6, 15 | |
18189 pshufb m4, m7 | |
18190 pshufb m5, m7 | |
18191 pmaddubsw m4, m0 | |
18192 pmaddubsw m5, m0 | |
18193 pmulhrsw m4, m2 | |
18194 pmulhrsw m5, m2 | |
18195 packuswb m4, m5 | |
18196 vpermq m4, m4, q3120 | |
18197 movu [r0 + r1 * 2], m4 | |
18198 | |
; Second window would be (m3, m6) at offset 16, i.e. m3 itself: shuffle it directly.
18199 palignr m4, m1, m3, 13 | |
18200 pshufb m4, m7 | |
18201 pshufb m5, m3, m7 | |
18202 pmaddubsw m4, m0 | |
18203 pmaddubsw m5, m0 | |
18204 pmulhrsw m4, m2 | |
18205 pmulhrsw m5, m2 | |
18206 packuswb m4, m5 | |
18207 vpermq m4, m4, q3120 | |
18208 movu [r0 + r3], m4 | |
18209 | |
18210 lea r0, [r0 + r1 * 4] | |
18211 | |
18212 palignr m4, m1, m3, 14 | |
18213 palignr m5, m1, m3, 1 | |
18214 pshufb m4, m7 | |
18215 pshufb m5, m7 | |
18216 pmaddubsw m4, m0 | |
18217 pmaddubsw m5, m0 | |
18218 pmulhrsw m4, m2 | |
18219 pmulhrsw m5, m2 | |
18220 packuswb m4, m5 | |
18221 vpermq m4, m4, q3120 | |
18222 movu [r0], m4 | |
18223 | |
18224 palignr m4, m1, m3, 15 | |
18225 palignr m5, m1, m3, 2 | |
18226 pshufb m4, m7 | |
18227 pshufb m5, m7 | |
18228 pmaddubsw m4, m0 | |
18229 pmaddubsw m5, m0 | |
18230 pmulhrsw m4, m2 | |
18231 pmulhrsw m5, m2 | |
18232 packuswb m4, m5 | |
18233 vpermq m4, m4, q3120 | |
18234 movu [r0 + r1], m4 | |
18235 | |
; m6 is recycled for the next 16 reference bytes (r2 + mmsize*2 + mmsize), needed
; by the last row; this row's first window is m1 itself (offset 16 of (m1, m3)).
18236 vbroadcasti128 m6, [r2 + mmsize*2 + mmsize] | |
18237 palignr m5, m1, m3, 3 | |
18238 pshufb m4, m1, m7 | |
18239 pshufb m5, m7 | |
18240 pmaddubsw m4, m0 | |
18241 pmaddubsw m5, m0 | |
18242 pmulhrsw m4, m2 | |
18243 pmulhrsw m5, m2 | |
18244 packuswb m4, m5 | |
18245 vpermq m4, m4, q3120 | |
18246 movu [r0 + r1 * 2], m4 | |
18247 | |
18248 palignr m4, m6, m1, 1 | |
18249 palignr m5, m1, m3, 4 | |
18250 pshufb m4, m7 | |
18251 pshufb m5, m7 | |
18252 pmaddubsw m4, m0 | |
18253 pmaddubsw m5, m0 | |
18254 pmulhrsw m4, m2 | |
18255 pmulhrsw m5, m2 | |
18256 packuswb m4, m5 | |
18257 vpermq m4, m4, q3120 | |
18258 movu [r0 + r3], m4 | |
18259 RET | |
18260 | |
;-----------------------------------------------------------------------------------------
; intra_pred_ang32_19: AVX2 angular intra prediction, 32 rows, mode 19.
; cglobal (x86inc) declares 3 arguments, 5 GPRs and 10 vector registers.
;   r0 = destination; rows are stored at r0, r0 + r1, r0 + r1*2, r0 + r4 and
;        r0 is then advanced by 4 rows
;   r1 = destination stride, r4 = 3 * r1
;   r2 = reference pixel buffer; read at r2 + 0, r2 + 1 and from r2 + mmsize*2
;   r3 = ang_table_avx2 + 32 * 16, so [r3 + k * 32] addresses table row 16 + k;
;        the bracketed number after each first pmaddubsw is that row index
;   m5 = pw_1024; pmulhrsw by 1024 equals (x + 16) >> 5, assuming pw_1024 holds
;        16-bit 1024s -- TODO confirm against the constant's definition
; Each output row is: pmaddubsw of interleaved byte pairs with one table row,
; pmulhrsw rounding, packuswb back to bytes, one full-register store.
; Two rows (the pand/packuswb sequences) skip the multiply and copy bytes.
; NOTE(review): uses m8/m9, so this sits inside the ARCH_X86_64 block closed
; right after this function.
;-----------------------------------------------------------------------------------------
18261 cglobal intra_pred_ang32_19, 3,5,10 | |
18262 lea r3, [ang_table_avx2 + 32 * 16] | |
18263 lea r4, [r1 * 3] | |
18264 mova m5, [pw_1024] | |
18265 | |
18266 ; rows 0 to 7 | |
; Interleave every reference byte with its right-hand neighbour (r2 + 1) so that
; pmaddubsw can weight each pair: m0 = low halves, m2 = high halves of each lane.
18267 movu m0, [r2 + 0] | |
18268 movu m1, [r2 + 1] | |
18269 punpckhbw m2, m0, m1 | |
18270 punpcklbw m0, m1 | |
18271 | |
; Load the second reference segment (r2 + mmsize*2), reorder it with the
; ang32_shuf_mode17 / ang32_shuf_mode19 tables, and build m3, m8 and m9: the
; extension registers that later rows palignr against as the window moves.
18272 movu m4, [r2 + mmsize*2] | |
18273 pshufb m4, [ang32_shuf_mode17 + mmsize*1] | |
18274 mova m3, [ang32_shuf_mode19 + mmsize*1] | |
18275 mova m6, [ang32_shuf_mode19 + mmsize*2] | |
18276 mova m9, m4 | |
18277 vpermd m4, m3, m4 | |
18278 vpermd m9, m6, m9 | |
18279 pshufb m4, [ang32_shuf_mode19] | |
18280 pshufb m9, [ang32_shuf_mode19] | |
18281 | |
18282 vextracti128 xm6, m4, 1 | |
18283 palignr m3, m0, m4, 1 | |
18284 palignr m8, m3, m6, 1 | |
18285 palignr m7, m8, m9, 1 | |
18286 vinserti128 m3, m3, xm2, 1 | |
18287 vinserti128 m8, m8, xm0, 1 | |
18288 vinserti128 m9, m7, xm3, 1 | |
18289 | |
18290 pmaddubsw m4, m0, [r3 - 10 * 32] ; [6] | |
18291 pmulhrsw m4, m5 | |
18292 pmaddubsw m1, m2, [r3 - 10 * 32] | |
18293 pmulhrsw m1, m5 | |
18294 packuswb m4, m1 | |
18295 movu [r0], m4 | |
18296 | |
; From here on m6/m7 hold the current source window; when two consecutive rows
; share a window only the table row changes and the palignr pair is not repeated.
18297 palignr m6, m0, m3, 14 | |
18298 palignr m7, m2, m0, 14 | |
18299 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] | |
18300 pmulhrsw m4, m5 | |
18301 pmaddubsw m1, m7, [r3 - 4 * 32] | |
18302 pmulhrsw m1, m5 | |
18303 packuswb m4, m1 | |
18304 movu [r0 + r1], m4 | |
18305 | |
18306 palignr m6, m0, m3, 12 | |
18307 palignr m7, m2, m0, 12 | |
18308 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] | |
18309 pmulhrsw m4, m5 | |
18310 pmaddubsw m1, m7, [r3 + 2 * 32] | |
18311 pmulhrsw m1, m5 | |
18312 packuswb m4, m1 | |
18313 movu [r0 + r1*2], m4 | |
18314 | |
18315 palignr m6, m0, m3, 10 | |
18316 palignr m7, m2, m0, 10 | |
18317 pmaddubsw m4, m6, [r3 + 8 * 32] ; [24] | |
18318 pmulhrsw m4, m5 | |
18319 pmaddubsw m1, m7, [r3 + 8 * 32] | |
18320 pmulhrsw m1, m5 | |
18321 packuswb m4, m1 | |
18322 movu [r0 + r4], m4 | |
18323 | |
18324 lea r0, [r0 + r1 * 4] | |
18325 | |
18326 palignr m6, m0, m3, 8 | |
18327 palignr m7, m2, m0, 8 | |
18328 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] | |
18329 pmulhrsw m4, m5 | |
18330 pmaddubsw m1, m7, [r3 + 14 * 32] | |
18331 pmulhrsw m1, m5 | |
18332 packuswb m4, m1 | |
18333 movu [r0], m4 | |
18334 | |
18335 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] | |
18336 pmulhrsw m4, m5 | |
18337 pmaddubsw m1, m7, [r3 - 12 * 32] | |
18338 pmulhrsw m1, m5 | |
18339 packuswb m4, m1 | |
18340 movu [r0 + r1], m4 | |
18341 | |
18342 palignr m6, m0, m3, 6 | |
18343 palignr m7, m2, m0, 6 | |
18344 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] | |
18345 pmulhrsw m4, m5 | |
18346 pmaddubsw m1, m7, [r3 - 6 * 32] | |
18347 pmulhrsw m1, m5 | |
18348 packuswb m4, m1 | |
18349 movu [r0 + r1*2], m4 | |
18350 | |
18351 palignr m6, m0, m3, 4 | |
18352 palignr m7, m2, m0, 4 | |
18353 pmaddubsw m4, m6, [r3] ; [16] | |
18354 pmulhrsw m4, m5 | |
18355 pmaddubsw m1, m7, [r3] | |
18356 pmulhrsw m1, m5 | |
18357 packuswb m4, m1 | |
18358 movu [r0 + r4], m4 | |
18359 | |
18360 lea r0, [r0 + r1 * 4] | |
18361 | |
18362 ; rows 8 to 15 | |
18363 palignr m6, m0, m3, 2 | |
18364 palignr m7, m2, m0, 2 | |
18365 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] | |
18366 pmulhrsw m4, m5 | |
18367 pmaddubsw m1, m7, [r3 + 6 * 32] | |
18368 pmulhrsw m1, m5 | |
18369 packuswb m4, m1 | |
18370 movu [r0], m4 | |
18371 | |
; The window has moved a full 16 bytes: the sources are now m3/m0 themselves,
; and the following rows palignr (m3, m8) and (m0, m3).
18372 pmaddubsw m4, m3, [r3 + 12 * 32] ; [28] | |
18373 pmulhrsw m4, m5 | |
18374 pmaddubsw m1, m0, [r3 + 12 * 32] | |
18375 pmulhrsw m1, m5 | |
18376 packuswb m4, m1 | |
18377 movu [r0 + r1], m4 | |
18378 | |
18379 pmaddubsw m4, m3, [r3 - 14 * 32] ; [2] | |
18380 pmulhrsw m4, m5 | |
18381 pmaddubsw m1, m0, [r3 - 14 * 32] | |
18382 pmulhrsw m1, m5 | |
18383 packuswb m4, m1 | |
18384 movu [r0 + r1*2], m4 | |
18385 | |
18386 palignr m6, m3, m8, 14 | |
18387 palignr m7, m0, m3, 14 | |
18388 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] | |
18389 pmulhrsw m4, m5 | |
18390 pmaddubsw m1, m7, [r3 - 8 * 32] | |
18391 pmulhrsw m1, m5 | |
18392 packuswb m4, m1 | |
18393 movu [r0 + r4], m4 | |
18394 | |
18395 lea r0, [r0 + r1 * 4] | |
18396 | |
18397 palignr m6, m3, m8, 12 | |
18398 palignr m7, m0, m3, 12 | |
18399 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] | |
18400 pmulhrsw m4, m5 | |
18401 pmaddubsw m1, m7, [r3 - 2 * 32] | |
18402 pmulhrsw m1, m5 | |
18403 packuswb m4, m1 | |
18404 movu [r0], m4 | |
18405 | |
18406 palignr m6, m3, m8, 10 | |
18407 palignr m7, m0, m3, 10 | |
18408 pmaddubsw m4, m6, [r3 + 4 * 32] ; [20] | |
18409 pmulhrsw m4, m5 | |
18410 pmaddubsw m1, m7, [r3 + 4 * 32] | |
18411 pmulhrsw m1, m5 | |
18412 packuswb m4, m1 | |
18413 movu [r0 + r1], m4 | |
18414 | |
18415 palignr m6, m3, m8, 8 | |
18416 palignr m7, m0, m3, 8 | |
18417 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] | |
18418 pmulhrsw m4, m5 | |
18419 pmaddubsw m1, m7, [r3 + 10 * 32] | |
18420 pmulhrsw m1, m5 | |
18421 packuswb m4, m1 | |
18422 movu [r0 + r1 * 2], m4 | |
18423 | |
; Row without a table multiply: masking with pw_00ff and packing keeps only the
; first byte of every interleaved pair from the same window (m6/m7) --
; assumes pw_00ff is words of 0x00ff, TODO confirm.
18424 pand m6, [pw_00ff] | |
18425 pand m7, [pw_00ff] | |
18426 packuswb m6, m7 | |
18427 movu [r0 + r4], m6 | |
18428 | |
18429 lea r0, [r0 + r1 * 4] | |
18430 | |
18431 ; rows 16 to 23 | |
18432 palignr m6, m3, m8, 6 | |
18433 palignr m7, m0, m3, 6 | |
18434 pmaddubsw m4, m6, [r3 - 10 * 32] ; [6] | |
18435 pmulhrsw m4, m5 | |
18436 pmaddubsw m1, m7, [r3 - 10 * 32] | |
18437 pmulhrsw m1, m5 | |
18438 packuswb m4, m1 | |
18439 movu [r0], m4 | |
18440 | |
18441 palignr m6, m3, m8, 4 | |
18442 palignr m7, m0, m3, 4 | |
18443 pmaddubsw m4, m6, [r3 - 4 * 32] ; [12] | |
18444 pmulhrsw m4, m5 | |
18445 pmaddubsw m1, m7, [r3 - 4 * 32] | |
18446 pmulhrsw m1, m5 | |
18447 packuswb m4, m1 | |
18448 movu [r0 + r1], m4 | |
18449 | |
18450 palignr m6, m3, m8, 2 | |
18451 palignr m7, m0, m3, 2 | |
18452 pmaddubsw m4, m6, [r3 + 2 * 32] ; [18] | |
18453 pmulhrsw m4, m5 | |
18454 pmaddubsw m1, m7, [r3 + 2 * 32] | |
18455 pmulhrsw m1, m5 | |
18456 packuswb m4, m1 | |
18457 movu [r0 + r1*2], m4 | |
18458 | |
; Another full 16-byte step: sources are m8/m3, then palignr (m8, m9) / (m3, m8).
18459 pmaddubsw m4, m8, [r3 + 8 * 32] ; [24] | |
18460 pmulhrsw m4, m5 | |
18461 pmaddubsw m1, m3, [r3 + 8 * 32] | |
18462 pmulhrsw m1, m5 | |
18463 packuswb m4, m1 | |
18464 movu [r0 + r4], m4 | |
18465 | |
18466 lea r0, [r0 + r1 * 4] | |
18467 | |
18468 palignr m6, m8, m9, 14 | |
18469 palignr m7, m3, m8, 14 | |
18470 pmaddubsw m4, m6, [r3 + 14 * 32] ; [30] | |
18471 pmulhrsw m4, m5 | |
18472 pmaddubsw m1, m7, [r3 + 14 * 32] | |
18473 pmulhrsw m1, m5 | |
18474 packuswb m4, m1 | |
18475 movu [r0], m4 | |
18476 | |
18477 pmaddubsw m4, m6, [r3 - 12 * 32] ; [4] | |
18478 pmulhrsw m4, m5 | |
18479 pmaddubsw m1, m7, [r3 - 12 * 32] | |
18480 pmulhrsw m1, m5 | |
18481 packuswb m4, m1 | |
18482 movu [r0 + r1], m4 | |
18483 | |
18484 palignr m6, m8, m9, 12 | |
18485 palignr m7, m3, m8, 12 | |
18486 pmaddubsw m4, m6, [r3 - 6 * 32] ; [10] | |
18487 pmulhrsw m4, m5 | |
18488 pmaddubsw m1, m7, [r3 - 6 * 32] | |
18489 pmulhrsw m1, m5 | |
18490 packuswb m4, m1 | |
18491 movu [r0 + r1*2], m4 | |
18492 | |
18493 palignr m6, m8, m9, 10 | |
18494 palignr m7, m3, m8, 10 | |
18495 pmaddubsw m4, m6, [r3] ; [16] | |
18496 pmulhrsw m4, m5 | |
18497 pmaddubsw m1, m7, [r3] | |
18498 pmulhrsw m1, m5 | |
18499 packuswb m4, m1 | |
18500 movu [r0 + r4], m4 | |
18501 | |
18502 lea r0, [r0 + r1 * 4] | |
18503 | |
18504 ; rows 24 to 31 | |
18505 palignr m6, m8, m9, 8 | |
18506 palignr m7, m3, m8, 8 | |
18507 pmaddubsw m4, m6, [r3 + 6 * 32] ; [22] | |
18508 pmulhrsw m4, m5 | |
18509 pmaddubsw m1, m7, [r3 + 6 * 32] | |
18510 pmulhrsw m1, m5 | |
18511 packuswb m4, m1 | |
18512 movu [r0], m4 | |
18513 | |
18514 palignr m6, m8, m9, 6 | |
18515 palignr m7, m3, m8, 6 | |
18516 pmaddubsw m4, m6, [r3 + 12 * 32] ; [28] | |
18517 pmulhrsw m4, m5 | |
18518 pmaddubsw m1, m7, [r3 + 12 * 32] | |
18519 pmulhrsw m1, m5 | |
18520 packuswb m4, m1 | |
18521 movu [r0 + r1], m4 | |
18522 | |
18523 pmaddubsw m4, m6, [r3 - 14 * 32] ; [2] | |
18524 pmulhrsw m4, m5 | |
18525 pmaddubsw m1, m7, [r3 - 14 * 32] | |
18526 pmulhrsw m1, m5 | |
18527 packuswb m4, m1 | |
18528 movu [r0 + r1*2], m4 | |
18529 | |
18530 palignr m6, m8, m9, 4 | |
18531 palignr m7, m3, m8, 4 | |
18532 pmaddubsw m4, m6, [r3 - 8 * 32] ; [8] | |
18533 pmulhrsw m4, m5 | |
18534 pmaddubsw m1, m7, [r3 - 8 * 32] | |
18535 pmulhrsw m1, m5 | |
18536 packuswb m4, m1 | |
18537 movu [r0 + r4], m4 | |
18538 | |
18539 lea r0, [r0 + r1 * 4] | |
18540 | |
; The old m0 contents are not read again below, so m0 is rebuilt as one more
; extension register from the broadcast byte at r2 + mmsize*2 + 31, m9 and xm8;
; it feeds the palignr (m9, m0) used for the last two rows.
18541 vpbroadcastb m0, [r2 + mmsize*2 + 31] | |
18542 palignr m1, m9, m0, 1 | |
18543 vinserti128 m0, m1, xm8, 1 | |
18544 | |
18545 palignr m6, m8, m9, 2 | |
18546 palignr m7, m3, m8, 2 | |
18547 pmaddubsw m4, m6, [r3 - 2 * 32] ; [14] | |
18548 pmulhrsw m4, m5 | |
18549 pmaddubsw m1, m7, [r3 - 2 * 32] | |
18550 pmulhrsw m1, m5 | |
18551 packuswb m4, m1 | |
18552 movu [r0], m4 | |
18553 | |
18554 pmaddubsw m4, m9, [r3 + 4 * 32] ; [20] | |
18555 pmulhrsw m4, m5 | |
18556 pmaddubsw m1, m8, [r3 + 4 * 32] | |
18557 pmulhrsw m1, m5 | |
18558 packuswb m4, m1 | |
18559 movu [r0 + r1], m4 | |
18560 | |
18561 palignr m6, m9, m0, 14 | |
18562 palignr m7, m8, m9, 14 | |
18563 pmaddubsw m4, m6, [r3 + 10 * 32] ; [26] | |
18564 pmulhrsw m4, m5 | |
18565 pmaddubsw m1, m7, [r3 + 10 * 32] | |
18566 pmulhrsw m1, m5 | |
18567 packuswb m4, m1 | |
18568 movu [r0 + r1 * 2], m4 | |
18569 | |
; Final row: again a masked byte copy of the current window, no table multiply.
18570 pand m6, [pw_00ff] | |
18571 pand m7, [pw_00ff] | |
18572 packuswb m6, m7 | |
18573 movu [r0 + r4], m6 | |
18574 RET | |
18575 | |
18576 %endif ; ARCH_X86_64 | |
18577 ;----------------------------------------------------------------------------------------- | |
18578 ; end of intra_pred_ang32 angular modes avx2 asm | |
18579 ;----------------------------------------------------------------------------------------- | |
18580 | |
18581 ;----------------------------------------------------------------------------------------- | |
18582 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) | |
18583 ;----------------------------------------------------------------------------------------- | |
; intra_pred_ang8_3: AVX2 8x8 angular intra prediction, mode 3.
; cglobal (x86inc) declares 3 arguments, 4 GPRs and 5 vector registers.
;   r0 = dst, r1 = dstStride, r2 = src (first three arguments of the prototype
;   in the banner comment above); r3 = 3 * r1 for the fourth row of each group.
; Same shuffle and coefficient tables as intra_pred_ang8_33, but the reference
; is read from r2 + 17 and the packed result is reordered before storing.
18584 INIT_YMM avx2 | |
18585 cglobal intra_pred_ang8_3, 3,4,5 | |
18586 mova m3, [pw_1024] | |
; 16 reference bytes starting at r2 + 17, duplicated into both 128-bit lanes.
18587 vbroadcasti128 m0, [r2 + 17] | |
18588 | |
; Four shuffles of the same reference give four registers of source bytes;
; presumably each holds the pixel pairs for two output lines, as the table
; names suggest -- TODO confirm against the c_ang8_src* definitions.
18589 pshufb m1, m0, [c_ang8_src1_9_2_10] | |
18590 pshufb m2, m0, [c_ang8_src3_11_4_12] | |
18591 pshufb m4, m0, [c_ang8_src5_13_5_13] | |
18592 pshufb m0, [c_ang8_src6_14_7_15] | |
18593 | |
; Weighted sum of byte pairs, then pmulhrsw with pw_1024, which equals
; (x + 16) >> 5 if pw_1024 holds 16-bit 1024s (TODO confirm).
18594 pmaddubsw m1, [c_ang8_26_20] | |
18595 pmulhrsw m1, m3 | |
18596 pmaddubsw m2, [c_ang8_14_8] | |
18597 pmulhrsw m2, m3 | |
18598 pmaddubsw m4, [c_ang8_2_28] | |
18599 pmulhrsw m4, m3 | |
18600 pmaddubsw m0, [c_ang8_22_16] | |
18601 pmulhrsw m0, m3 | |
18602 packuswb m1, m2 | |
18603 packuswb m4, m0 | |
18604 | |
; Regroup lanes, interleave bytes then words, and permute dwords with
; trans8_shuf; presumably an 8x8 transpose of the packed result (this step is
; absent from intra_pred_ang8_33) -- TODO confirm.
18605 vperm2i128 m2, m1, m4, 00100000b | |
18606 vperm2i128 m1, m1, m4, 00110001b | |
18607 punpcklbw m4, m2, m1 | |
18608 punpckhbw m2, m1 | |
18609 punpcklwd m1, m4, m2 | |
18610 punpckhwd m4, m2 | |
18611 mova m0, [trans8_shuf] | |
18612 vpermd m1, m0, m1 | |
18613 vpermd m4, m0, m4 | |
18614 | |
; Store eight 8-byte rows: low/high qwords of each 128-bit half of m1, then m4.
18615 lea r3, [3 * r1] | |
18616 movq [r0], xm1 | |
18617 movhps [r0 + r1], xm1 | |
18618 vextracti128 xm2, m1, 1 | |
18619 movq [r0 + 2 * r1], xm2 | |
18620 movhps [r0 + r3], xm2 | |
18621 lea r0, [r0 + 4 * r1] | |
18622 movq [r0], xm4 | |
18623 movhps [r0 + r1], xm4 | |
18624 vextracti128 xm2, m4, 1 | |
18625 movq [r0 + 2 * r1], xm2 | |
18626 movhps [r0 + r3], xm2 | |
18627 RET | |
18628 | |
; intra_pred_ang8_33: AVX2 8x8 angular intra prediction, mode 33.
; cglobal (x86inc) declares 3 arguments, 4 GPRs and 5 vector registers.
;   r0 = dst, r1 = dstStride, r2 = src (first three arguments of the
;   intraPredAng8 prototype comment earlier in the file); r3 = 3 * r1.
; Same shuffle and coefficient tables as intra_pred_ang8_3, but the reference
; is read from r2 + 1 and the packed result is stored without reordering.
18629 INIT_YMM avx2 | |
18630 cglobal intra_pred_ang8_33, 3,4,5 | |
18631 mova m3, [pw_1024] | |
; 16 reference bytes starting at r2 + 1, duplicated into both 128-bit lanes.
18632 vbroadcasti128 m0, [r2 + 1] | |
18633 | |
18634 pshufb m1, m0, [c_ang8_src1_9_2_10] | |
18635 pshufb m2, m0, [c_ang8_src3_11_4_12] | |
18636 pshufb m4, m0, [c_ang8_src5_13_5_13] | |
18637 pshufb m0, [c_ang8_src6_14_7_15] | |
18638 | |
; Weighted sum of byte pairs, then pmulhrsw with pw_1024, which equals
; (x + 16) >> 5 if pw_1024 holds 16-bit 1024s (TODO confirm).
18639 pmaddubsw m1, [c_ang8_26_20] | |
18640 pmulhrsw m1, m3 | |
18641 pmaddubsw m2, [c_ang8_14_8] | |
18642 pmulhrsw m2, m3 | |
18643 pmaddubsw m4, [c_ang8_2_28] | |
18644 pmulhrsw m4, m3 | |
18645 pmaddubsw m0, [c_ang8_22_16] | |
18646 pmulhrsw m0, m3 | |
18647 packuswb m1, m2 | |
18648 packuswb m4, m0 | |
18649 | |
; Store eight 8-byte rows. Row order within each group of four is: low qword of
; the low lane, low qword of the high lane, then the two high qwords.
18650 lea r3, [3 * r1] | |
18651 movq [r0], xm1 | |
18652 vextracti128 xm2, m1, 1 | |
18653 movq [r0 + r1], xm2 | |
18654 movhps [r0 + 2 * r1], xm1 | |
18655 movhps [r0 + r3], xm2 | |
18656 lea r0, [r0 + 4 * r1] | |
18657 movq [r0], xm4 | |
18658 vextracti128 xm2, m4, 1 | |
18659 movq [r0 + r1], xm2 | |
18660 movhps [r0 + 2 * r1], xm4 | |
18661 movhps [r0 + r3], xm2 | |
18662 RET | |
18663 | |
; intra_pred_ang8_4: AVX2 8x8 angular intra prediction, mode 4.
; cglobal (x86inc) declares 3 arguments, 4 GPRs and 5 vector registers.
;   r0 = dst, r1 = dstStride, r2 = src (first three arguments of the
;   intraPredAng8 prototype comment earlier in the file); r3 = 3 * r1.
; Same shuffle and coefficient tables as intra_pred_ang8_32, but the reference
; is read from r2 + 17 and the packed result is reordered before storing.
18664 INIT_YMM avx2 | |
18665 cglobal intra_pred_ang8_4, 3,4,5 | |
18666 mova m3, [pw_1024] | |
; 16 reference bytes starting at r2 + 17, duplicated into both 128-bit lanes.
18667 vbroadcasti128 m0, [r2 + 17] | |
18668 | |
18669 pshufb m1, m0, [c_ang8_src1_9_2_10] | |
18670 pshufb m2, m0, [c_ang8_src2_10_3_11] | |
18671 pshufb m4, m0, [c_ang8_src4_12_4_12] | |
18672 pshufb m0, [c_ang8_src5_13_6_14] | |
18673 | |
; Weighted sum of byte pairs, then pmulhrsw with pw_1024, which equals
; (x + 16) >> 5 if pw_1024 holds 16-bit 1024s (TODO confirm).
18674 pmaddubsw m1, [c_ang8_21_10] | |
18675 pmulhrsw m1, m3 | |
18676 pmaddubsw m2, [c_ang8_31_20] | |
18677 pmulhrsw m2, m3 | |
18678 pmaddubsw m4, [c_ang8_9_30] | |
18679 pmulhrsw m4, m3 | |
18680 pmaddubsw m0, [c_ang8_19_8] | |
18681 pmulhrsw m0, m3 | |
18682 packuswb m1, m2 | |
18683 packuswb m4, m0 | |
18684 | |
; Regroup lanes, interleave bytes then words, and permute dwords with
; trans8_shuf; presumably an 8x8 transpose of the packed result (this step is
; absent from intra_pred_ang8_32) -- TODO confirm.
18685 vperm2i128 m2, m1, m4, 00100000b | |
18686 vperm2i128 m1, m1, m4, 00110001b | |
18687 punpcklbw m4, m2, m1 | |
18688 punpckhbw m2, m1 | |
18689 punpcklwd m1, m4, m2 | |
18690 punpckhwd m4, m2 | |
18691 mova m0, [trans8_shuf] | |
18692 vpermd m1, m0, m1 | |
18693 vpermd m4, m0, m4 | |
18694 | |
; Store eight 8-byte rows: low/high qwords of each 128-bit half of m1, then m4.
18695 lea r3, [3 * r1] | |
18696 movq [r0], xm1 | |
18697 movhps [r0 + r1], xm1 | |
18698 vextracti128 xm2, m1, 1 | |
18699 movq [r0 + 2 * r1], xm2 | |
18700 movhps [r0 + r3], xm2 | |
18701 lea r0, [r0 + 4 * r1] | |
18702 movq [r0], xm4 | |
18703 movhps [r0 + r1], xm4 | |
18704 vextracti128 xm2, m4, 1 | |
18705 movq [r0 + 2 * r1], xm2 | |
18706 movhps [r0 + r3], xm2 | |
18707 RET | |
18708 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_32 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 32, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Same shuffle and weight tables as intra_pred_ang8_4, but the source is the
; 16 bytes at [r2 + 1] and the result is stored without the transpose step.
; pmulhrsw by pw_1024 is a rounded >> 5, assuming pw_1024 holds words of 1024.
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
18709 INIT_YMM avx2 | |
18710 cglobal intra_pred_ang8_32, 3,4,5 | |
18711 mova m3, [pw_1024] | |
18712 vbroadcasti128 m0, [r2 + 1] | |
18713 | |
18714 pshufb m1, m0, [c_ang8_src1_9_2_10] | |
18715 pshufb m2, m0, [c_ang8_src2_10_3_11] | |
18716 pshufb m4, m0, [c_ang8_src4_12_4_12] | |
18717 pshufb m0, [c_ang8_src5_13_6_14] | |
18718 | |
18719 pmaddubsw m1, [c_ang8_21_10] | |
18720 pmulhrsw m1, m3 | |
18721 pmaddubsw m2, [c_ang8_31_20] | |
18722 pmulhrsw m2, m3 | |
18723 pmaddubsw m4, [c_ang8_9_30] | |
18724 pmulhrsw m4, m3 | |
18725 pmaddubsw m0, [c_ang8_19_8] | |
18726 pmulhrsw m0, m3 | |
18727 packuswb m1, m2 | |
18728 packuswb m4, m0 | |
18729 | |
18730 lea r3, [3 * r1] | |
18731 movq [r0], xm1 | |
18732 vextracti128 xm2, m1, 1 | |
18733 movq [r0 + r1], xm2 | |
18734 movhps [r0 + 2 * r1], xm1 | |
18735 movhps [r0 + r3], xm2 | |
18736 lea r0, [r0 + 4 * r1] | |
18737 movq [r0], xm4 | |
18738 vextracti128 xm2, m4, 1 | |
18739 movq [r0 + r1], xm2 | |
18740 movhps [r0 + 2 * r1], xm4 | |
18741 movhps [r0 + r3], xm2 | |
18742 RET | |
18743 | |
18744 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_5 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 5, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Source is the 16 bytes at [r2 + 17]; c_ang8_src* shuffles build the sample
; pairs, c_ang8_* tables supply the byte weights for pmaddubsw, and pmulhrsw by
; pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
18745 INIT_YMM avx2 | |
18746 cglobal intra_pred_ang8_5, 3, 4, 5 | |
18747 mova m3, [pw_1024] | |
18748 vbroadcasti128 m0, [r2 + 17] | |
18749 | |
18750 pshufb m1, m0, [c_ang8_src1_9_2_10] | |
18751 pshufb m2, m0, [c_ang8_src2_10_3_11] | |
18752 pshufb m4, m0, [c_ang8_src3_11_4_12] | |
18753 pshufb m0, [c_ang8_src4_12_5_13] | |
18754 | |
18755 pmaddubsw m1, [c_ang8_17_2] | |
18756 pmulhrsw m1, m3 | |
18757 pmaddubsw m2, [c_ang8_19_4] | |
18758 pmulhrsw m2, m3 | |
18759 pmaddubsw m4, [c_ang8_21_6] | |
18760 pmulhrsw m4, m3 | |
18761 pmaddubsw m0, [c_ang8_23_8] | |
18762 pmulhrsw m0, m3 | |
18763 packuswb m1, m2 | |
18764 packuswb m4, m0 | |
18765 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
18766 vperm2i128 m2, m1, m4, 00100000b | |
18767 vperm2i128 m1, m1, m4, 00110001b | |
18768 punpcklbw m4, m2, m1 | |
18769 punpckhbw m2, m1 | |
18770 punpcklwd m1, m4, m2 | |
18771 punpckhwd m4, m2 | |
18772 mova m0, [trans8_shuf] | |
18773 vpermd m1, m0, m1 | |
18774 vpermd m4, m0, m4 | |
18775 | |
18776 lea r3, [3 * r1] | |
18777 movq [r0], xm1 | |
18778 movhps [r0 + r1], xm1 | |
18779 vextracti128 xm2, m1, 1 | |
18780 movq [r0 + 2 * r1], xm2 | |
18781 movhps [r0 + r3], xm2 | |
18782 lea r0, [r0 + 4 * r1] | |
18783 movq [r0], xm4 | |
18784 movhps [r0 + r1], xm4 | |
18785 vextracti128 xm2, m4, 1 | |
18786 movq [r0 + 2 * r1], xm2 | |
18787 movhps [r0 + r3], xm2 | |
18788 RET | |
18789 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_31 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 31, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Same shuffle and weight tables as intra_pred_ang8_5, but the source is the
; 16 bytes at [r2 + 1] and the result is stored without the transpose step.
; pmulhrsw by pw_1024 is a rounded >> 5, assuming pw_1024 holds words of 1024.
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
18790 INIT_YMM avx2 | |
18791 cglobal intra_pred_ang8_31, 3, 4, 5 | |
18792 mova m3, [pw_1024] | |
18793 vbroadcasti128 m0, [r2 + 1] | |
18794 | |
18795 pshufb m1, m0, [c_ang8_src1_9_2_10] | |
18796 pshufb m2, m0, [c_ang8_src2_10_3_11] | |
18797 pshufb m4, m0, [c_ang8_src3_11_4_12] | |
18798 pshufb m0, [c_ang8_src4_12_5_13] | |
18799 | |
18800 pmaddubsw m1, [c_ang8_17_2] | |
18801 pmulhrsw m1, m3 | |
18802 pmaddubsw m2, [c_ang8_19_4] | |
18803 pmulhrsw m2, m3 | |
18804 pmaddubsw m4, [c_ang8_21_6] | |
18805 pmulhrsw m4, m3 | |
18806 pmaddubsw m0, [c_ang8_23_8] | |
18807 pmulhrsw m0, m3 | |
18808 packuswb m1, m2 | |
18809 packuswb m4, m0 | |
18810 | |
18811 lea r3, [3 * r1] | |
18812 movq [r0], xm1 | |
18813 vextracti128 xm2, m1, 1 | |
18814 movq [r0 + r1], xm2 | |
18815 movhps [r0 + 2 * r1], xm1 | |
18816 movhps [r0 + r3], xm2 | |
18817 lea r0, [r0 + 4 * r1] | |
18818 movq [r0], xm4 | |
18819 vextracti128 xm2, m4, 1 | |
18820 movq [r0 + r1], xm2 | |
18821 movhps [r0 + 2 * r1], xm4 | |
18822 movhps [r0 + r3], xm2 | |
18823 RET | |
18824 | |
18825 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_6 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 6, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Source is the 16 bytes at [r2 + 17]. The first row pair uses the plain
; (i, i+1) pairing of intra_pred_shuff_0_8; the others use c_ang8_src* masks.
; pmaddubsw applies the c_ang8_* byte weights; pmulhrsw by pw_1024 rounds with
; >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
18826 INIT_YMM avx2 | |
18827 cglobal intra_pred_ang8_6, 3, 4, 5 | |
18828 mova m3, [pw_1024] | |
18829 vbroadcasti128 m0, [r2 + 17] | |
18830 | |
18831 pshufb m1, m0, [intra_pred_shuff_0_8] | |
18832 pshufb m2, m0, [c_ang8_src2_10_2_10] | |
18833 pshufb m4, m0, [c_ang8_src3_11_3_11] | |
18834 pshufb m0, [c_ang8_src3_11_4_12] | |
18835 | |
18836 pmaddubsw m1, [c_ang8_13_26] | |
18837 pmulhrsw m1, m3 | |
18838 pmaddubsw m2, [c_ang8_7_20] | |
18839 pmulhrsw m2, m3 | |
18840 pmaddubsw m4, [c_ang8_1_14] | |
18841 pmulhrsw m4, m3 | |
18842 pmaddubsw m0, [c_ang8_27_8] | |
18843 pmulhrsw m0, m3 | |
18844 packuswb m1, m2 | |
18845 packuswb m4, m0 | |
18846 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
18847 vperm2i128 m2, m1, m4, 00100000b | |
18848 vperm2i128 m1, m1, m4, 00110001b | |
18849 punpcklbw m4, m2, m1 | |
18850 punpckhbw m2, m1 | |
18851 punpcklwd m1, m4, m2 | |
18852 punpckhwd m4, m2 | |
18853 mova m0, [trans8_shuf] | |
18854 vpermd m1, m0, m1 | |
18855 vpermd m4, m0, m4 | |
18856 | |
18857 lea r3, [3 * r1] | |
18858 movq [r0], xm1 | |
18859 movhps [r0 + r1], xm1 | |
18860 vextracti128 xm2, m1, 1 | |
18861 movq [r0 + 2 * r1], xm2 | |
18862 movhps [r0 + r3], xm2 | |
18863 lea r0, [r0 + 4 * r1] | |
18864 movq [r0], xm4 | |
18865 movhps [r0 + r1], xm4 | |
18866 vextracti128 xm2, m4, 1 | |
18867 movq [r0 + 2 * r1], xm2 | |
18868 movhps [r0 + r3], xm2 | |
18869 RET | |
18870 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_30 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 30, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Same shuffle and weight tables as intra_pred_ang8_6, but the source is the
; 16 bytes at [r2 + 1] and the result is stored without the transpose step.
; pmulhrsw by pw_1024 is a rounded >> 5, assuming pw_1024 holds words of 1024.
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
18871 INIT_YMM avx2 | |
18872 cglobal intra_pred_ang8_30, 3, 4, 5 | |
18873 mova m3, [pw_1024] | |
18874 vbroadcasti128 m0, [r2 + 1] | |
18875 | |
18876 pshufb m1, m0, [intra_pred_shuff_0_8] | |
18877 pshufb m2, m0, [c_ang8_src2_10_2_10] | |
18878 pshufb m4, m0, [c_ang8_src3_11_3_11] | |
18879 pshufb m0, [c_ang8_src3_11_4_12] | |
18880 | |
18881 pmaddubsw m1, [c_ang8_13_26] | |
18882 pmulhrsw m1, m3 | |
18883 pmaddubsw m2, [c_ang8_7_20] | |
18884 pmulhrsw m2, m3 | |
18885 pmaddubsw m4, [c_ang8_1_14] | |
18886 pmulhrsw m4, m3 | |
18887 pmaddubsw m0, [c_ang8_27_8] | |
18888 pmulhrsw m0, m3 | |
18889 packuswb m1, m2 | |
18890 packuswb m4, m0 | |
18891 | |
18892 lea r3, [3 * r1] | |
18893 movq [r0], xm1 | |
18894 vextracti128 xm2, m1, 1 | |
18895 movq [r0 + r1], xm2 | |
18896 movhps [r0 + 2 * r1], xm1 | |
18897 movhps [r0 + r3], xm2 | |
18898 lea r0, [r0 + 4 * r1] | |
18899 movq [r0], xm4 | |
18900 vextracti128 xm2, m4, 1 | |
18901 movq [r0 + r1], xm2 | |
18902 movhps [r0 + 2 * r1], xm4 | |
18903 movhps [r0 + r3], xm2 | |
18904 RET | |
18905 | |
18906 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_9 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 9, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_27 weight table.
; The 16 bytes at [r2 + 17] are paired as (i, i+1) once via intra_pred_shuff_0_8;
; all eight rows reuse that pairing and differ only in the weights, taken from
; four consecutive mmsize-byte entries at r4. pmulhrsw by pw_1024 rounds with
; >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
18907 INIT_YMM avx2 | |
18908 cglobal intra_pred_ang8_9, 3, 5, 5 | |
18909 mova m3, [pw_1024] | |
18910 vbroadcasti128 m0, [r2 + 17] | |
18911 | |
18912 pshufb m0, [intra_pred_shuff_0_8] | |
18913 | |
18914 lea r4, [c_ang8_mode_27] | |
18915 pmaddubsw m1, m0, [r4] | |
18916 pmulhrsw m1, m3 | |
18917 pmaddubsw m2, m0, [r4 + mmsize] | |
18918 pmulhrsw m2, m3 | |
18919 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
18920 pmulhrsw m4, m3 | |
18921 pmaddubsw m0, [r4 + 3 * mmsize] | |
18922 pmulhrsw m0, m3 | |
18923 packuswb m1, m2 | |
18924 packuswb m4, m0 | |
18925 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
18926 vperm2i128 m2, m1, m4, 00100000b | |
18927 vperm2i128 m1, m1, m4, 00110001b | |
18928 punpcklbw m4, m2, m1 | |
18929 punpckhbw m2, m1 | |
18930 punpcklwd m1, m4, m2 | |
18931 punpckhwd m4, m2 | |
18932 mova m0, [trans8_shuf] | |
18933 vpermd m1, m0, m1 | |
18934 vpermd m4, m0, m4 | |
18935 | |
18936 lea r3, [3 * r1] | |
18937 movq [r0], xm1 | |
18938 movhps [r0 + r1], xm1 | |
18939 vextracti128 xm2, m1, 1 | |
18940 movq [r0 + 2 * r1], xm2 | |
18941 movhps [r0 + r3], xm2 | |
18942 lea r0, [r0 + 4 * r1] | |
18943 movq [r0], xm4 | |
18944 movhps [r0 + r1], xm4 | |
18945 vextracti128 xm2, m4, 1 | |
18946 movq [r0 + 2 * r1], xm2 | |
18947 movhps [r0 + r3], xm2 | |
18948 RET | |
18949 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_27 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 27, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_27 weight table.
; The 16 bytes at [r2 + 1] are paired as (i, i+1) once via intra_pred_shuff_0_8;
; all eight rows reuse that pairing and differ only in the weights, taken from
; four consecutive mmsize-byte entries at r4. pmulhrsw by pw_1024 rounds with
; >> 5 (assuming pw_1024 holds words of 1024).
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
18950 INIT_YMM avx2 | |
18951 cglobal intra_pred_ang8_27, 3, 5, 5 | |
18952 mova m3, [pw_1024] | |
18953 vbroadcasti128 m0, [r2 + 1] | |
18954 | |
18955 pshufb m0, [intra_pred_shuff_0_8] | |
18956 | |
18957 lea r4, [c_ang8_mode_27] | |
18958 pmaddubsw m1, m0, [r4] | |
18959 pmulhrsw m1, m3 | |
18960 pmaddubsw m2, m0, [r4 + mmsize] | |
18961 pmulhrsw m2, m3 | |
18962 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
18963 pmulhrsw m4, m3 | |
18964 pmaddubsw m0, [r4 + 3 * mmsize] | |
18965 pmulhrsw m0, m3 | |
18966 packuswb m1, m2 | |
18967 packuswb m4, m0 | |
18968 | |
18969 lea r3, [3 * r1] | |
18970 movq [r0], xm1 | |
18971 vextracti128 xm2, m1, 1 | |
18972 movq [r0 + r1], xm2 | |
18973 movhps [r0 + 2 * r1], xm1 | |
18974 movhps [r0 + r3], xm2 | |
18975 lea r0, [r0 + 4 * r1] | |
18976 movq [r0], xm4 | |
18977 vextracti128 xm2, m4, 1 | |
18978 movq [r0 + r1], xm2 | |
18979 movhps [r0 + 2 * r1], xm4 | |
18980 movhps [r0 + r3], xm2 | |
18981 RET | |
18982 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_25 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 25, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_25 weight table.
; Unlike the [r2 + 1] variants, the source here starts at [r2] itself, so the
; first reference byte takes part in the (i, i+1) pairing.
; All eight rows reuse that pairing and differ only in the weights, taken from
; four consecutive mmsize-byte entries at r4. pmulhrsw by pw_1024 rounds with
; >> 5 (assuming pw_1024 holds words of 1024).
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
18983 INIT_YMM avx2 | |
18984 cglobal intra_pred_ang8_25, 3, 5, 5 | |
18985 mova m3, [pw_1024] | |
18986 vbroadcasti128 m0, [r2] | |
18987 | |
18988 pshufb m0, [intra_pred_shuff_0_8] | |
18989 | |
18990 lea r4, [c_ang8_mode_25] | |
18991 pmaddubsw m1, m0, [r4] | |
18992 pmulhrsw m1, m3 | |
18993 pmaddubsw m2, m0, [r4 + mmsize] | |
18994 pmulhrsw m2, m3 | |
18995 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
18996 pmulhrsw m4, m3 | |
18997 pmaddubsw m0, [r4 + 3 * mmsize] | |
18998 pmulhrsw m0, m3 | |
18999 packuswb m1, m2 | |
19000 packuswb m4, m0 | |
19001 | |
19002 lea r3, [3 * r1] | |
19003 movq [r0], xm1 | |
19004 vextracti128 xm2, m1, 1 | |
19005 movq [r0 + r1], xm2 | |
19006 movhps [r0 + 2 * r1], xm1 | |
19007 movhps [r0 + r3], xm2 | |
19008 lea r0, [r0 + 4 * r1] | |
19009 movq [r0], xm4 | |
19010 vextracti128 xm2, m4, 1 | |
19011 movq [r0 + r1], xm2 | |
19012 movhps [r0 + 2 * r1], xm4 | |
19013 movhps [r0 + r3], xm2 | |
19014 RET | |
19015 | |
19016 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_7 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 7, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Source is the 16 bytes at [r2 + 17]. The first row pair uses the plain
; (i, i+1) pairing of intra_pred_shuff_0_8; the others use c_ang8_src* masks.
; pmaddubsw applies the c_ang8_* byte weights; pmulhrsw by pw_1024 rounds with
; >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
19017 INIT_YMM avx2 | |
19018 cglobal intra_pred_ang8_7, 3, 4, 5 | |
19019 mova m3, [pw_1024] | |
19020 vbroadcasti128 m0, [r2 + 17] | |
19021 | |
19022 pshufb m1, m0, [intra_pred_shuff_0_8] | |
19023 pshufb m2, m0, [c_ang8_src1_9_2_10] | |
19024 pshufb m4, m0, [c_ang8_src2_10_2_10] | |
19025 pshufb m0, [c_ang8_src2_10_3_11] | |
19026 | |
19027 pmaddubsw m1, [c_ang8_9_18] | |
19028 pmulhrsw m1, m3 | |
19029 pmaddubsw m2, [c_ang8_27_4] | |
19030 pmulhrsw m2, m3 | |
19031 pmaddubsw m4, [c_ang8_13_22] | |
19032 pmulhrsw m4, m3 | |
19033 pmaddubsw m0, [c_ang8_31_8] | |
19034 pmulhrsw m0, m3 | |
19035 packuswb m1, m2 | |
19036 packuswb m4, m0 | |
19037 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
19038 vperm2i128 m2, m1, m4, 00100000b | |
19039 vperm2i128 m1, m1, m4, 00110001b | |
19040 punpcklbw m4, m2, m1 | |
19041 punpckhbw m2, m1 | |
19042 punpcklwd m1, m4, m2 | |
19043 punpckhwd m4, m2 | |
19044 mova m0, [trans8_shuf] | |
19045 vpermd m1, m0, m1 | |
19046 vpermd m4, m0, m4 | |
19047 | |
19048 lea r3, [3 * r1] | |
19049 movq [r0], xm1 | |
19050 movhps [r0 + r1], xm1 | |
19051 vextracti128 xm2, m1, 1 | |
19052 movq [r0 + 2 * r1], xm2 | |
19053 movhps [r0 + r3], xm2 | |
19054 lea r0, [r0 + 4 * r1] | |
19055 movq [r0], xm4 | |
19056 movhps [r0 + r1], xm4 | |
19057 vextracti128 xm2, m4, 1 | |
19058 movq [r0 + 2 * r1], xm2 | |
19059 movhps [r0 + r3], xm2 | |
19060 RET | |
19061 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_29 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 29, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Same shuffle and weight tables as intra_pred_ang8_7, but the source is the
; 16 bytes at [r2 + 1] and the result is stored without the transpose step.
; pmulhrsw by pw_1024 is a rounded >> 5, assuming pw_1024 holds words of 1024.
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
19062 INIT_YMM avx2 | |
19063 cglobal intra_pred_ang8_29, 3, 4, 5 | |
19064 mova m3, [pw_1024] | |
19065 vbroadcasti128 m0, [r2 + 1] | |
19066 | |
19067 pshufb m1, m0, [intra_pred_shuff_0_8] | |
19068 pshufb m2, m0, [c_ang8_src1_9_2_10] | |
19069 pshufb m4, m0, [c_ang8_src2_10_2_10] | |
19070 pshufb m0, [c_ang8_src2_10_3_11] | |
19071 | |
19072 pmaddubsw m1, [c_ang8_9_18] | |
19073 pmulhrsw m1, m3 | |
19074 pmaddubsw m2, [c_ang8_27_4] | |
19075 pmulhrsw m2, m3 | |
19076 pmaddubsw m4, [c_ang8_13_22] | |
19077 pmulhrsw m4, m3 | |
19078 pmaddubsw m0, [c_ang8_31_8] | |
19079 pmulhrsw m0, m3 | |
19080 packuswb m1, m2 | |
19081 packuswb m4, m0 | |
19082 | |
19083 lea r3, [3 * r1] | |
19084 movq [r0], xm1 | |
19085 vextracti128 xm2, m1, 1 | |
19086 movq [r0 + r1], xm2 | |
19087 movhps [r0 + 2 * r1], xm1 | |
19088 movhps [r0 + r3], xm2 | |
19089 lea r0, [r0 + 4 * r1] | |
19090 movq [r0], xm4 | |
19091 vextracti128 xm2, m4, 1 | |
19092 movq [r0 + r1], xm2 | |
19093 movhps [r0 + 2 * r1], xm4 | |
19094 movhps [r0 + r3], xm2 | |
19095 RET | |
19096 | |
19097 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_8 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 8, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Source is the 16 bytes at [r2 + 17]. Rows 0-5 all use the plain (i, i+1)
; pairing of intra_pred_shuff_0_8, so it is computed once into m5 and shared;
; only the last row pair needs a different mask (c_ang8_src2_10_2_10).
; pmaddubsw applies the c_ang8_* byte weights; pmulhrsw by pw_1024 rounds with
; >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
19098 INIT_YMM avx2 | |
19099 cglobal intra_pred_ang8_8, 3, 4, 6 | |
19100 mova m3, [pw_1024] | |
19101 vbroadcasti128 m0, [r2 + 17] | |
19103 | |
; m5 = shared (i, i+1) sample pairs; must be taken before m0 is overwritten.
19104 pshufb m5, m0, [intra_pred_shuff_0_8] | |
19107 pshufb m0, [c_ang8_src2_10_2_10] | |
19108 | |
; 3-operand pmaddubsw keeps m5 intact: dst = pairs (unsigned) * weights (signed).
19109 pmaddubsw m1, m5, [c_ang8_5_10] | |
19110 pmulhrsw m1, m3 | |
19111 pmaddubsw m2, m5, [c_ang8_15_20] | |
19112 pmulhrsw m2, m3 | |
19113 pmaddubsw m4, m5, [c_ang8_25_30] | |
19114 pmulhrsw m4, m3 | |
19115 pmaddubsw m0, [c_ang8_3_8] | |
19116 pmulhrsw m0, m3 | |
19117 packuswb m1, m2 | |
19118 packuswb m4, m0 | |
19119 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
19120 vperm2i128 m2, m1, m4, 00100000b | |
19121 vperm2i128 m1, m1, m4, 00110001b | |
19122 punpcklbw m4, m2, m1 | |
19123 punpckhbw m2, m1 | |
19124 punpcklwd m1, m4, m2 | |
19125 punpckhwd m4, m2 | |
19126 mova m0, [trans8_shuf] | |
19127 vpermd m1, m0, m1 | |
19128 vpermd m4, m0, m4 | |
19129 | |
19130 lea r3, [3 * r1] | |
19131 movq [r0], xm1 | |
19132 movhps [r0 + r1], xm1 | |
19133 vextracti128 xm2, m1, 1 | |
19134 movq [r0 + 2 * r1], xm2 | |
19135 movhps [r0 + r3], xm2 | |
19136 lea r0, [r0 + 4 * r1] | |
19137 movq [r0], xm4 | |
19138 movhps [r0 + r1], xm4 | |
19139 vextracti128 xm2, m4, 1 | |
19140 movq [r0 + 2 * r1], xm2 | |
19141 movhps [r0 + r3], xm2 | |
19142 RET | |
19143 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_28 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 28, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; Source is the 16 bytes at [r2 + 1]. Rows 0-5 all use the plain (i, i+1)
; pairing of intra_pred_shuff_0_8, so it is computed once into m5 and shared;
; only the last row pair needs a different mask (c_ang8_src2_10_2_10).
; pmaddubsw applies the c_ang8_* byte weights; pmulhrsw by pw_1024 rounds with
; >> 5 (assuming pw_1024 holds words of 1024).
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
19144 INIT_YMM avx2 | |
19145 cglobal intra_pred_ang8_28, 3, 4, 6 | |
19146 mova m3, [pw_1024] | |
19147 vbroadcasti128 m0, [r2 + 1] | |
19149 | |
; m5 = shared (i, i+1) sample pairs; must be taken before m0 is overwritten.
19150 pshufb m5, m0, [intra_pred_shuff_0_8] | |
19153 pshufb m0, [c_ang8_src2_10_2_10] | |
19154 | |
; 3-operand pmaddubsw keeps m5 intact: dst = pairs (unsigned) * weights (signed).
19155 pmaddubsw m1, m5, [c_ang8_5_10] | |
19156 pmulhrsw m1, m3 | |
19157 pmaddubsw m2, m5, [c_ang8_15_20] | |
19158 pmulhrsw m2, m3 | |
19159 pmaddubsw m4, m5, [c_ang8_25_30] | |
19160 pmulhrsw m4, m3 | |
19161 pmaddubsw m0, [c_ang8_3_8] | |
19162 pmulhrsw m0, m3 | |
19163 packuswb m1, m2 | |
19164 packuswb m4, m0 | |
19165 | |
19166 lea r3, [3 * r1] | |
19167 movq [r0], xm1 | |
19168 vextracti128 xm2, m1, 1 | |
19169 movq [r0 + r1], xm2 | |
19170 movhps [r0 + 2 * r1], xm1 | |
19171 movhps [r0 + r3], xm2 | |
19172 lea r0, [r0 + 4 * r1] | |
19173 movq [r0], xm4 | |
19174 vextracti128 xm2, m4, 1 | |
19175 movq [r0 + r1], xm2 | |
19176 movhps [r0 + 2 * r1], xm4 | |
19177 movhps [r0 + r3], xm2 | |
19178 RET | |
19179 | |
19180 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_11 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 11, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_25 weight table.
; The source vector is the 16 bytes at [r2 + 16] with byte 0 replaced by [r2];
; it is paired as (i, i+1) in the low lane and then copied to the high lane.
; All eight rows reuse that pairing with weights from four consecutive
; mmsize-byte entries at r4. pmulhrsw by pw_1024 rounds with >> 5 (assuming
; pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
19181 INIT_YMM avx2 | |
19182 cglobal intra_pred_ang8_11, 3, 5, 5 | |
19183 mova m3, [pw_1024] | |
19184 movu xm1, [r2 + 16] | |
19185 pinsrb xm1, [r2], 0 | |
19186 pshufb xm1, [intra_pred_shuff_0_8] | |
19187 vinserti128 m0, m1, xm1, 1 | |
19188 | |
19189 lea r4, [c_ang8_mode_25] | |
19190 pmaddubsw m1, m0, [r4] | |
19191 pmulhrsw m1, m3 | |
19192 pmaddubsw m2, m0, [r4 + mmsize] | |
19193 pmulhrsw m2, m3 | |
19194 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19195 pmulhrsw m4, m3 | |
19196 pmaddubsw m0, [r4 + 3 * mmsize] | |
19197 pmulhrsw m0, m3 | |
19198 packuswb m1, m2 | |
19199 packuswb m4, m0 | |
19200 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
19201 vperm2i128 m2, m1, m4, 00100000b | |
19202 vperm2i128 m1, m1, m4, 00110001b | |
19203 punpcklbw m4, m2, m1 | |
19204 punpckhbw m2, m1 | |
19205 punpcklwd m1, m4, m2 | |
19206 punpckhwd m4, m2 | |
19207 mova m0, [trans8_shuf] | |
19208 vpermd m1, m0, m1 | |
19209 vpermd m4, m0, m4 | |
19210 | |
19211 lea r3, [3 * r1] | |
19212 movq [r0], xm1 | |
19213 movhps [r0 + r1], xm1 | |
19214 vextracti128 xm2, m1, 1 | |
19215 movq [r0 + 2 * r1], xm2 | |
19216 movhps [r0 + r3], xm2 | |
19217 lea r0, [r0 + 4 * r1] | |
19218 movq [r0], xm4 | |
19219 movhps [r0 + r1], xm4 | |
19220 vextracti128 xm2, m4, 1 | |
19221 movq [r0 + 2 * r1], xm2 | |
19222 movhps [r0 + r3], xm2 | |
19223 RET | |
19224 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_15 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 15, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_15 weight table, r5 = intra_pred_shuff_0_8.
; xm5 is a running reference vector: it starts as the 16 bytes at [r2 + 16]
; with byte 0 replaced by [r2]; each step shifts it up one byte and inserts a
; new byte 0 taken from [r2 + 2], [r2 + 4], [r2 + 6] and [r2 + 8] in turn.
; For every row pair m0 holds the previous xm5 in the low lane and the newly
; extended xm5 in the high lane, paired as (i, i+1) and weighted from r4.
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
19225 INIT_YMM avx2 | |
19226 cglobal intra_pred_ang8_15, 3, 6, 6 | |
19227 mova m3, [pw_1024] | |
19228 movu xm5, [r2 + 16] | |
19229 pinsrb xm5, [r2], 0 | |
19230 lea r5, [intra_pred_shuff_0_8] | |
19231 mova xm0, xm5 | |
19232 pslldq xm5, 1 | |
19233 pinsrb xm5, [r2 + 2], 0 | |
19234 vinserti128 m0, m0, xm5, 1 | |
19235 pshufb m0, [r5] | |
19236 | |
19237 lea r4, [c_ang8_mode_15] | |
19238 pmaddubsw m1, m0, [r4] | |
19239 pmulhrsw m1, m3 | |
19240 mova xm0, xm5 | |
19241 pslldq xm5, 1 | |
19242 pinsrb xm5, [r2 + 4], 0 | |
19243 vinserti128 m0, m0, xm5, 1 | |
19244 pshufb m0, [r5] | |
19245 pmaddubsw m2, m0, [r4 + mmsize] | |
19246 pmulhrsw m2, m3 | |
19247 mova xm0, xm5 | |
19248 pslldq xm5, 1 | |
19249 pinsrb xm5, [r2 + 6], 0 | |
19250 vinserti128 m0, m0, xm5, 1 | |
19251 pshufb m0, [r5] | |
19252 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19253 pmulhrsw m4, m3 | |
19254 mova xm0, xm5 | |
19255 pslldq xm5, 1 | |
19256 pinsrb xm5, [r2 + 8], 0 | |
19257 vinserti128 m0, m0, xm5, 1 | |
19258 pshufb m0, [r5] | |
19259 pmaddubsw m0, [r4 + 3 * mmsize] | |
19260 pmulhrsw m0, m3 | |
19261 packuswb m1, m2 | |
19262 packuswb m4, m0 | |
19263 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
19264 vperm2i128 m2, m1, m4, 00100000b | |
19265 vperm2i128 m1, m1, m4, 00110001b | |
19266 punpcklbw m4, m2, m1 | |
19267 punpckhbw m2, m1 | |
19268 punpcklwd m1, m4, m2 | |
19269 punpckhwd m4, m2 | |
19270 mova m0, [trans8_shuf] | |
19271 vpermd m1, m0, m1 | |
19272 vpermd m4, m0, m4 | |
19273 | |
19274 lea r3, [3 * r1] | |
19275 movq [r0], xm1 | |
19276 movhps [r0 + r1], xm1 | |
19277 vextracti128 xm2, m1, 1 | |
19278 movq [r0 + 2 * r1], xm2 | |
19279 movhps [r0 + r3], xm2 | |
19280 lea r0, [r0 + 4 * r1] | |
19281 movq [r0], xm4 | |
19282 movhps [r0 + r1], xm4 | |
19283 vextracti128 xm2, m4, 1 | |
19284 movq [r0 + 2 * r1], xm2 | |
19285 movhps [r0 + r3], xm2 | |
19286 RET | |
19287 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_16 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 16, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes.
; On entry r0 is advanced to the last row (dst + 7 * stride) and r1 is negated,
; so the eight rows are written from the bottom of the block upwards;
; r3 = 3 * (negated stride).
; m0 = weights broadcast from angHor8_tab_16, m1/m2 = shuffle masks read from
; intra_pred8_shuff16 and intra_pred8_shuff16 + 8. The rounding constant
; pw_1024 is used straight from memory (rounded >> 5, assuming words of 1024).
;-----------------------------------------------------------------------------
19288 INIT_YMM avx2 | |
19289 cglobal intra_pred_ang8_16, 3,4,7 | |
19290 lea r0, [r0 + r1 * 8] | |
19291 sub r0, r1 | |
19292 neg r1 | |
19293 lea r3, [r1 * 3] | |
19294 vbroadcasti128 m0, [angHor8_tab_16] ; m0 = factor | |
19295 mova m1, [intra_pred8_shuff16] ; m1 = 4 of Row shuffle | |
19296 movu m2, [intra_pred8_shuff16 + 8] ; m2 = 4 of Row shuffle | |
19297 | |
19298 ; prepare reference pixel | |
19299 movq xm3, [r2 + 16 + 1] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x] | |
19300 movhps xm3, [r2 + 2] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x] | |
19301 pslldq xm3, 1 | |
19302 pinsrb xm3, [r2], 0 ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8] | |
19303 pshufb xm3, [c_ang8_mode_16] | |
19304 vinserti128 m3, m3, xm3, 1 ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1 0 2 3 5 6 8] | |
19305 | |
19306 ; process 4 rows | |
19307 pshufb m4, m3, m1 | |
19308 pshufb m5, m3, m2 | |
; Drop 4 reference bytes per lane so the second group reads the next window.
19309 psrldq m3, 4 | |
19310 punpcklbw m6, m5, m4 | |
19311 punpckhbw m5, m4 | |
19312 pmaddubsw m6, m0 | |
19313 pmulhrsw m6, [pw_1024] | |
19314 pmaddubsw m5, m0 | |
19315 pmulhrsw m5, [pw_1024] | |
19316 packuswb m6, m5 | |
19317 vextracti128 xm5, m6, 1 | |
19318 movq [r0], xm6 | |
19319 movhps [r0 + r1], xm6 | |
19320 movq [r0 + r1 * 2], xm5 | |
19321 movhps [r0 + r3], xm5 | |
19322 | |
19323 ; process 4 rows | |
19324 lea r0, [r0 + r1 * 4] | |
19325 pshufb m4, m3, m1 | |
19326 pshufb m5, m3, m2 | |
19327 punpcklbw m6, m5, m4 | |
19328 punpckhbw m5, m4 | |
19329 pmaddubsw m6, m0 | |
19330 pmulhrsw m6, [pw_1024] | |
19331 pmaddubsw m5, m0 | |
19332 pmulhrsw m5, [pw_1024] | |
19333 packuswb m6, m5 | |
19334 vextracti128 xm5, m6, 1 | |
19335 movq [r0], xm6 | |
19336 movhps [r0 + r1], xm6 | |
19337 movq [r0 + r1 * 2], xm5 | |
19338 movhps [r0 + r3], xm5 | |
19339 RET | |
19340 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_20 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 20, going by the name). Two alternative implementations are kept;
; the constant "%if 1" selects the first, so the %else body is never assembled.
;
; Active version -- register use: r0 = dst, r1 = dst stride in bytes,
; r2 = reference bytes, r3 = angHor8_tab_20 weights, r4 = 3 * (negated stride).
; r0 is advanced to the last row (dst + 7 * stride) and r1 negated, so rows are
; written from the bottom of the block upwards.
; m5 is loaded from intra_pred_shuff_0_8 + 16: its low lane is the (i, i+1)
; pairing and its high lane the same pairing shifted by one sample, per the
; table layout at the top of this file.
;-----------------------------------------------------------------------------
19341 %if 1 | |
19342 INIT_YMM avx2 | |
19343 cglobal intra_pred_ang8_20, 3,5,6 | |
19344 lea r0, [r0 + r1 * 8] | |
19345 sub r0, r1 | |
19346 neg r1 | |
19347 lea r3, [angHor8_tab_20] | |
19348 lea r4, [r1 * 3] | |
19349 movu m5, [intra_pred_shuff_0_8 + 16] | |
19350 | |
19351 ; prepare reference pixel | |
19352 movq xm1, [r2 + 1] ; xm1 = [ 1 2 3 4 5 6 7 8 x x x x x x x x] | |
19353 movhps xm1, [r2 + 16 + 2] ; xm1 = [ 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8 x] | |
; NOTE(review): the 16-byte memory operand starts 15 bytes below r2; only its
; last byte ([r2]) survives the shift. Confirm the caller's buffer makes the
; bytes in front of r2 safe to read.
19354 palignr xm1, xm1, [r2 - 15], 15 ; xm1 = [ 0 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8] | |
19355 pshufb xm1, [c_ang8_mode_20] | |
19356 vinserti128 m1, m1, xm1, 1 | |
19357 | |
19358 ; process 4 rows | |
19359 pshufb m3, m1, m5 | |
19360 psrldq m1, 2 | |
19361 pmaddubsw m3, [r3 + 0 * 16] | |
19362 pmulhrsw m3, [pw_1024] | |
19363 | |
; This pair uses the unshifted pairing in both lanes (table base, not m5).
19364 pshufb m4, m1, [intra_pred_shuff_0_8] | |
19365 psrldq m1, 1 | |
19366 pmaddubsw m4, [r3 + 2 * 16] | |
19367 pmulhrsw m4, [pw_1024] | |
19368 | |
19369 packuswb m3, m4 | |
19370 vextracti128 xm4, m3, 1 | |
19371 movq [r0], xm3 | |
19372 movq [r0 + r1], xm4 | |
19373 movhps [r0 + r1 * 2], xm3 | |
19374 movhps [r0 + r4], xm4 | |
19375 | |
19376 ; process 4 rows | |
19377 lea r0, [r0 + r1 * 4] | |
19378 pshufb m3, m1, m5 | |
19379 psrldq m1, 1 | |
19380 pmaddubsw m3, [r3 + 4 * 16] | |
19381 pmulhrsw m3, [pw_1024] | |
19382 | |
19383 pshufb m4, m1, m5 | |
19384 pmaddubsw m4, [r3 + 6 * 16] | |
19385 pmulhrsw m4, [pw_1024] | |
19386 | |
19387 packuswb m3, m4 | |
19388 vextracti128 xm4, m3, 1 | |
19389 movq [r0], xm3 | |
19390 movq [r0 + r1], xm4 | |
19391 movhps [r0 + r1 * 2], xm3 | |
19392 movhps [r0 + r4], xm4 | |
19393 RET | |
19394 | |
19395 %else | |
; Alternative implementation, currently disabled by the "%if 1" above.
; It grows a reference vector in xm5 one byte at a time (bytes from
; [r2 + 16 + 2/3/5/6/8]) and stores rows top-down using c_ang8_mode_20 weights.
19396 INIT_YMM avx2 | |
19397 cglobal intra_pred_ang8_20, 3, 6, 6 | |
19398 mova m3, [pw_1024] | |
19399 movu xm5, [r2] | |
19400 lea r5, [intra_pred_shuff_0_8] | |
19401 mova xm0, xm5 | |
19402 pslldq xm5, 1 | |
19403 pinsrb xm5, [r2 + 2 + 16], 0 | |
19404 vinserti128 m0, m0, xm5, 1 | |
19405 pshufb m0, [r5] | |
19406 | |
19407 lea r4, [c_ang8_mode_20] | |
19408 pmaddubsw m1, m0, [r4] | |
19409 pmulhrsw m1, m3 | |
19410 mova xm0, xm5 | |
19411 pslldq xm5, 1 | |
19412 pinsrb xm5, [r2 + 3 + 16], 0 | |
19413 vinserti128 m0, m0, xm5, 1 | |
19414 pshufb m0, [r5] | |
19415 pmaddubsw m2, m0, [r4 + mmsize] | |
19416 pmulhrsw m2, m3 | |
19417 pslldq xm5, 1 | |
19418 pinsrb xm5, [r2 + 5 + 16], 0 | |
19419 vinserti128 m0, m5, xm5, 1 | |
19420 pshufb m0, [r5] | |
19421 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19422 pmulhrsw m4, m3 | |
19423 pslldq xm5, 1 | |
19424 pinsrb xm5, [r2 + 6 + 16], 0 | |
19425 mova xm0, xm5 | |
19426 pslldq xm5, 1 | |
19427 pinsrb xm5, [r2 + 8 + 16], 0 | |
19428 vinserti128 m0, m0, xm5, 1 | |
19429 pshufb m0, [r5] | |
19430 pmaddubsw m0, [r4 + 3 * mmsize] | |
19431 pmulhrsw m0, m3 | |
19432 | |
19433 packuswb m1, m2 | |
19434 packuswb m4, m0 | |
19435 | |
19436 lea r3, [3 * r1] | |
19437 movq [r0], xm1 | |
19438 vextracti128 xm2, m1, 1 | |
19439 movq [r0 + r1], xm2 | |
19440 movhps [r0 + 2 * r1], xm1 | |
19441 movhps [r0 + r3], xm2 | |
19442 lea r0, [r0 + 4 * r1] | |
19443 movq [r0], xm4 | |
19444 vextracti128 xm2, m4, 1 | |
19445 movq [r0 + r1], xm2 | |
19446 movhps [r0 + 2 * r1], xm4 | |
19447 movhps [r0 + r3], xm2 | |
19448 RET | |
19449 %endif | |
19450 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_21 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 21, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_15 weight table, r5 = intra_pred_shuff_0_8.
; xm5 is a running reference vector: it starts as the 16 bytes at [r2]; each
; step shifts it up one byte and inserts a new byte 0 taken from
; [r2 + 16 + 2], [r2 + 16 + 4], [r2 + 16 + 6] and [r2 + 16 + 8] in turn.
; For every row pair m0 holds the previous xm5 in the low lane and the newly
; extended xm5 in the high lane, paired as (i, i+1) and weighted from r4.
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
19451 INIT_YMM avx2 | |
19452 cglobal intra_pred_ang8_21, 3, 6, 6 | |
19453 mova m3, [pw_1024] | |
19454 movu xm5, [r2] | |
19455 lea r5, [intra_pred_shuff_0_8] | |
19456 mova xm0, xm5 | |
19457 pslldq xm5, 1 | |
19458 pinsrb xm5, [r2 + 2 + 16], 0 | |
19459 vinserti128 m0, m0, xm5, 1 | |
19460 pshufb m0, [r5] | |
19461 | |
19462 lea r4, [c_ang8_mode_15] | |
19463 pmaddubsw m1, m0, [r4] | |
19464 pmulhrsw m1, m3 | |
19465 mova xm0, xm5 | |
19466 pslldq xm5, 1 | |
19467 pinsrb xm5, [r2 + 4 + 16], 0 | |
19468 vinserti128 m0, m0, xm5, 1 | |
19469 pshufb m0, [r5] | |
19470 pmaddubsw m2, m0, [r4 + mmsize] | |
19471 pmulhrsw m2, m3 | |
19472 mova xm0, xm5 | |
19473 pslldq xm5, 1 | |
19474 pinsrb xm5, [r2 + 6 + 16], 0 | |
19475 vinserti128 m0, m0, xm5, 1 | |
19476 pshufb m0, [r5] | |
19477 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19478 pmulhrsw m4, m3 | |
19479 mova xm0, xm5 | |
19480 pslldq xm5, 1 | |
19481 pinsrb xm5, [r2 + 8 + 16], 0 | |
19482 vinserti128 m0, m0, xm5, 1 | |
19483 pshufb m0, [r5] | |
19484 pmaddubsw m0, [r4 + 3 * mmsize] | |
19485 pmulhrsw m0, m3 | |
19486 packuswb m1, m2 | |
19487 packuswb m4, m0 | |
19488 | |
19489 lea r3, [3 * r1] | |
19490 movq [r0], xm1 | |
19491 vextracti128 xm2, m1, 1 | |
19492 movq [r0 + r1], xm2 | |
19493 movhps [r0 + 2 * r1], xm1 | |
19494 movhps [r0 + r3], xm2 | |
19495 lea r0, [r0 + 4 * r1] | |
19496 movq [r0], xm4 | |
19497 vextracti128 xm2, m4, 1 | |
19498 movq [r0 + r1], xm2 | |
19499 movhps [r0 + 2 * r1], xm4 | |
19500 movhps [r0 + r3], xm2 | |
19501 RET | |
19502 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_22 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 22, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_14 weight table, r5 = intra_pred_shuff_0_8.
; xm5 is a running reference vector starting as the 16 bytes at [r2]. The
; first three row pairs use the same vector in both lanes; before the second,
; third and fourth pair xm5 is shifted up one byte and gets a new byte 0 from
; [r2 + 16 + 2], [r2 + 16 + 5] and [r2 + 16 + 7]. For the last pair only the
; high lane is replaced, so its low lane still holds the third pair's input.
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
19503 INIT_YMM avx2 | |
19504 cglobal intra_pred_ang8_22, 3, 6, 6 | |
19505 mova m3, [pw_1024] | |
19506 movu xm5, [r2] | |
19507 lea r5, [intra_pred_shuff_0_8] | |
19508 vinserti128 m0, m5, xm5, 1 | |
19509 pshufb m0, [r5] | |
19510 | |
19511 lea r4, [c_ang8_mode_14] | |
19512 pmaddubsw m1, m0, [r4] | |
19513 pmulhrsw m1, m3 | |
19514 pslldq xm5, 1 | |
19515 pinsrb xm5, [r2 + 2 + 16], 0 | |
19516 vinserti128 m0, m5, xm5, 1 | |
19517 pshufb m0, [r5] | |
19518 pmaddubsw m2, m0, [r4 + mmsize] | |
19519 pmulhrsw m2, m3 | |
19520 pslldq xm5, 1 | |
19521 pinsrb xm5, [r2 + 5 + 16], 0 | |
19522 vinserti128 m0, m5, xm5, 1 | |
19523 pshufb m0, [r5] | |
19524 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19525 pmulhrsw m4, m3 | |
19526 pslldq xm5, 1 | |
19527 pinsrb xm5, [r2 + 7 + 16], 0 | |
19528 pshufb xm5, [r5] | |
19529 vinserti128 m0, m0, xm5, 1 | |
19530 pmaddubsw m0, [r4 + 3 * mmsize] | |
19531 pmulhrsw m0, m3 | |
19532 packuswb m1, m2 | |
19533 packuswb m4, m0 | |
19534 | |
19535 lea r3, [3 * r1] | |
19536 movq [r0], xm1 | |
19537 vextracti128 xm2, m1, 1 | |
19538 movq [r0 + r1], xm2 | |
19539 movhps [r0 + 2 * r1], xm1 | |
19540 movhps [r0 + r3], xm2 | |
19541 lea r0, [r0 + 4 * r1] | |
19542 movq [r0], xm4 | |
19543 vextracti128 xm2, m4, 1 | |
19544 movq [r0 + r1], xm2 | |
19545 movhps [r0 + 2 * r1], xm4 | |
19546 movhps [r0 + r3], xm2 | |
19547 RET | |
19548 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_14 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 14, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_14 weight table, r5 = intra_pred_shuff_0_8.
; xm5 is a running reference vector: the 16 bytes at [r2 + 16] with byte 0
; replaced by [r2]. The first three row pairs use the same vector in both
; lanes; before the second, third and fourth pair xm5 is shifted up one byte
; and gets a new byte 0 from [r2 + 2], [r2 + 5] and [r2 + 7]. For the last
; pair only the high lane is replaced, so its low lane still holds the third
; pair's input.
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
19549 INIT_YMM avx2 | |
19550 cglobal intra_pred_ang8_14, 3, 6, 6 | |
19551 mova m3, [pw_1024] | |
19552 movu xm5, [r2 + 16] | |
19553 pinsrb xm5, [r2], 0 | |
19554 lea r5, [intra_pred_shuff_0_8] | |
19555 vinserti128 m0, m5, xm5, 1 | |
19556 pshufb m0, [r5] | |
19557 | |
19558 lea r4, [c_ang8_mode_14] | |
19559 pmaddubsw m1, m0, [r4] | |
19560 pmulhrsw m1, m3 | |
19561 pslldq xm5, 1 | |
19562 pinsrb xm5, [r2 + 2], 0 | |
19563 vinserti128 m0, m5, xm5, 1 | |
19564 pshufb m0, [r5] | |
19565 pmaddubsw m2, m0, [r4 + mmsize] | |
19566 pmulhrsw m2, m3 | |
19567 pslldq xm5, 1 | |
19568 pinsrb xm5, [r2 + 5], 0 | |
19569 vinserti128 m0, m5, xm5, 1 | |
19570 pshufb m0, [r5] | |
19571 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19572 pmulhrsw m4, m3 | |
19573 pslldq xm5, 1 | |
19574 pinsrb xm5, [r2 + 7], 0 | |
19575 pshufb xm5, [r5] | |
19576 vinserti128 m0, m0, xm5, 1 | |
19577 pmaddubsw m0, [r4 + 3 * mmsize] | |
19578 pmulhrsw m0, m3 | |
19579 packuswb m1, m2 | |
19580 packuswb m4, m0 | |
19581 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
19582 vperm2i128 m2, m1, m4, 00100000b | |
19583 vperm2i128 m1, m1, m4, 00110001b | |
19584 punpcklbw m4, m2, m1 | |
19585 punpckhbw m2, m1 | |
19586 punpcklwd m1, m4, m2 | |
19587 punpckhwd m4, m2 | |
19588 mova m0, [trans8_shuf] | |
19589 vpermd m1, m0, m1 | |
19590 vpermd m4, m0, m4 | |
19591 | |
19592 lea r3, [3 * r1] | |
19593 movq [r0], xm1 | |
19594 movhps [r0 + r1], xm1 | |
19595 vextracti128 xm2, m1, 1 | |
19596 movq [r0 + 2 * r1], xm2 | |
19597 movhps [r0 + r3], xm2 | |
19598 lea r0, [r0 + 4 * r1] | |
19599 movq [r0], xm4 | |
19600 movhps [r0 + r1], xm4 | |
19601 vextracti128 xm2, m4, 1 | |
19602 movq [r0 + 2 * r1], xm2 | |
19603 movhps [r0 + r3], xm2 | |
19604 RET | |
19605 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_13 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 13, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_13 weight table, r5 = intra_pred_shuff_0_8.
; xm5 is a running reference vector: the 16 bytes at [r2 + 16] with byte 0
; replaced by [r2]; it is extended twice by shifting up one byte and inserting
; [r2 + 4], then [r2 + 7], as the new byte 0.
; Lane contents of m0 for the four row pairs (base = unextended vector,
; ext1/ext2 = after the first/second extension, all paired as (i, i+1)):
;   pair 0: base | base    pair 1: base | ext1
;   pair 2: ext1 | ext1    pair 3: ext1 | ext2
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
19606 INIT_YMM avx2 | |
19607 cglobal intra_pred_ang8_13, 3, 6, 6 | |
19608 mova m3, [pw_1024] | |
19609 movu xm5, [r2 + 16] | |
19610 pinsrb xm5, [r2], 0 | |
19611 lea r5, [intra_pred_shuff_0_8] | |
19612 vinserti128 m0, m5, xm5, 1 | |
19613 pshufb m0, [r5] | |
19614 | |
19615 lea r4, [c_ang8_mode_13] | |
19616 pmaddubsw m1, m0, [r4] | |
19617 pmulhrsw m1, m3 | |
19618 pslldq xm5, 1 | |
19619 pinsrb xm5, [r2 + 4], 0 | |
19620 pshufb xm4, xm5, [r5] | |
19621 vinserti128 m0, m0, xm4, 1 | |
19622 pmaddubsw m2, m0, [r4 + mmsize] | |
19623 pmulhrsw m2, m3 | |
19624 vinserti128 m0, m0, xm4, 0 | |
19625 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19626 pmulhrsw m4, m3 | |
19627 pslldq xm5, 1 | |
19628 pinsrb xm5, [r2 + 7], 0 | |
19629 pshufb xm5, [r5] | |
19630 vinserti128 m0, m0, xm5, 1 | |
19631 pmaddubsw m0, [r4 + 3 * mmsize] | |
19632 pmulhrsw m0, m3 | |
19633 packuswb m1, m2 | |
19634 packuswb m4, m0 | |
19635 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
19636 vperm2i128 m2, m1, m4, 00100000b | |
19637 vperm2i128 m1, m1, m4, 00110001b | |
19638 punpcklbw m4, m2, m1 | |
19639 punpckhbw m2, m1 | |
19640 punpcklwd m1, m4, m2 | |
19641 punpckhwd m4, m2 | |
19642 mova m0, [trans8_shuf] | |
19643 vpermd m1, m0, m1 | |
19644 vpermd m4, m0, m4 | |
19645 | |
19646 lea r3, [3 * r1] | |
19647 movq [r0], xm1 | |
19648 movhps [r0 + r1], xm1 | |
19649 vextracti128 xm2, m1, 1 | |
19650 movq [r0 + 2 * r1], xm2 | |
19651 movhps [r0 + r3], xm2 | |
19652 lea r0, [r0 + 4 * r1] | |
19653 movq [r0], xm4 | |
19654 movhps [r0 + r1], xm4 | |
19655 vextracti128 xm2, m4, 1 | |
19656 movq [r0 + 2 * r1], xm2 | |
19657 movhps [r0 + r3], xm2 | |
19658 RET | |
19659 | |
19660 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_23 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 23, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_13 weight table, r5 = intra_pred_shuff_0_8.
; xm5 is a running reference vector starting as the 16 bytes at [r2]; it is
; extended twice by shifting up one byte and inserting [r2 + 4 + 16], then
; [r2 + 7 + 16], as the new byte 0.
; Lane contents of m0 for the four row pairs (base = unextended vector,
; ext1/ext2 = after the first/second extension, all paired as (i, i+1)):
;   pair 0: base | base    pair 1: base | ext1
;   pair 2: ext1 | ext1    pair 3: ext1 | ext2
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
19661 INIT_YMM avx2 | |
19662 cglobal intra_pred_ang8_23, 3, 6, 6 | |
19663 mova m3, [pw_1024] | |
19664 movu xm5, [r2] | |
19665 lea r5, [intra_pred_shuff_0_8] | |
19666 vinserti128 m0, m5, xm5, 1 | |
19667 pshufb m0, [r5] | |
19668 | |
19669 lea r4, [c_ang8_mode_13] | |
19670 pmaddubsw m1, m0, [r4] | |
19671 pmulhrsw m1, m3 | |
19672 pslldq xm5, 1 | |
19673 pinsrb xm5, [r2 + 4 + 16], 0 | |
19674 pshufb xm4, xm5, [r5] | |
19675 vinserti128 m0, m0, xm4, 1 | |
19676 pmaddubsw m2, m0, [r4 + mmsize] | |
19677 pmulhrsw m2, m3 | |
19678 vinserti128 m0, m0, xm4, 0 | |
19679 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19680 pmulhrsw m4, m3 | |
19681 pslldq xm5, 1 | |
19682 pinsrb xm5, [r2 + 7 + 16], 0 | |
19683 pshufb xm5, [r5] | |
19684 vinserti128 m0, m0, xm5, 1 | |
19685 pmaddubsw m0, [r4 + 3 * mmsize] | |
19686 pmulhrsw m0, m3 | |
19687 | |
19688 packuswb m1, m2 | |
19689 packuswb m4, m0 | |
19690 | |
19691 lea r3, [3 * r1] | |
19692 movq [r0], xm1 | |
19693 vextracti128 xm2, m1, 1 | |
19694 movq [r0 + r1], xm2 | |
19695 movhps [r0 + 2 * r1], xm1 | |
19696 movhps [r0 + r3], xm2 | |
19697 lea r0, [r0 + 4 * r1] | |
19698 movq [r0], xm4 | |
19699 vextracti128 xm2, m4, 1 | |
19700 movq [r0 + r1], xm2 | |
19701 movhps [r0 + 2 * r1], xm4 | |
19702 movhps [r0 + r3], xm2 | |
19703 RET | |
19704 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_12 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 12, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_24 weight table.
; The source vector is the 16 bytes at [r2 + 16] with byte 0 replaced by [r2],
; paired as (i, i+1) and duplicated to both lanes; the first three row pairs
; share it. For the last row pair the already-paired vector is shifted up by
; one pair (2 bytes) and the vacated pair is filled with ([r2 + 6], [r2]).
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; The packed result is rearranged (8x8 byte transpose if trans8_shuf is the
; dword interleave {0,4,1,5,2,6,3,7} -- TODO confirm) before being stored.
;-----------------------------------------------------------------------------
19705 INIT_YMM avx2 | |
19706 cglobal intra_pred_ang8_12, 3, 5, 5 | |
19707 mova m3, [pw_1024] | |
19708 movu xm1, [r2 + 16] | |
19709 pinsrb xm1, [r2], 0 | |
19710 pshufb xm1, [intra_pred_shuff_0_8] | |
19711 vinserti128 m0, m1, xm1, 1 | |
19712 | |
19713 lea r4, [c_ang8_mode_24] | |
19714 pmaddubsw m1, m0, [r4] | |
19715 pmulhrsw m1, m3 | |
19716 pmaddubsw m2, m0, [r4 + mmsize] | |
19717 pmulhrsw m2, m3 | |
19718 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19719 pmulhrsw m4, m3 | |
19720 pslldq xm0, 2 | |
19721 pinsrb xm0, [r2 + 6], 0 | |
19722 pinsrb xm0, [r2 + 0], 1 | |
19723 vinserti128 m0, m0, xm0, 1 | |
19724 pmaddubsw m0, [r4 + 3 * mmsize] | |
19725 pmulhrsw m0, m3 | |
19726 packuswb m1, m2 | |
19727 packuswb m4, m0 | |
19728 | |
; Regroup lanes, interleave bytes then words, and finally permute dwords.
19729 vperm2i128 m2, m1, m4, 00100000b | |
19730 vperm2i128 m1, m1, m4, 00110001b | |
19731 punpcklbw m4, m2, m1 | |
19732 punpckhbw m2, m1 | |
19733 punpcklwd m1, m4, m2 | |
19734 punpckhwd m4, m2 | |
19735 mova m0, [trans8_shuf] | |
19736 vpermd m1, m0, m1 | |
19737 vpermd m4, m0, m4 | |
19738 | |
19739 lea r3, [3 * r1] | |
19740 movq [r0], xm1 | |
19741 movhps [r0 + r1], xm1 | |
19742 vextracti128 xm2, m1, 1 | |
19743 movq [r0 + 2 * r1], xm2 | |
19744 movhps [r0 + r3], xm2 | |
19745 lea r0, [r0 + 4 * r1] | |
19746 movq [r0], xm4 | |
19747 movhps [r0 + r1], xm4 | |
19748 vextracti128 xm2, m4, 1 | |
19749 movq [r0 + 2 * r1], xm2 | |
19750 movhps [r0 + r3], xm2 | |
19751 RET | |
19752 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_24 (AVX2): 8x8 angular intra prediction on 8-bit samples
; (mode 24, going by the name).
; Register use below: r0 = dst, r1 = dst stride in bytes, r2 = reference bytes,
; r4 = base of the c_ang8_mode_24 weight table.
; The 16 bytes at [r2] are broadcast to both lanes and paired as (i, i+1); the
; first three row pairs share that vector. For the last row pair the
; already-paired vector is shifted up by one pair (2 bytes) and the vacated
; pair is filled with ([r2 + 16 + 6], [r2]).
; pmulhrsw by pw_1024 rounds with >> 5 (assuming pw_1024 holds words of 1024).
; Row order on store: lane0.lo, lane1.lo, lane0.hi, lane1.hi of m1, then m4.
;-----------------------------------------------------------------------------
19753 INIT_YMM avx2 | |
19754 cglobal intra_pred_ang8_24, 3, 5, 5 | |
19755 mova m3, [pw_1024] | |
19756 vbroadcasti128 m0, [r2] | |
19757 | |
19758 pshufb m0, [intra_pred_shuff_0_8] | |
19759 | |
19760 lea r4, [c_ang8_mode_24] | |
19761 pmaddubsw m1, m0, [r4] | |
19762 pmulhrsw m1, m3 | |
19763 pmaddubsw m2, m0, [r4 + mmsize] | |
19764 pmulhrsw m2, m3 | |
19765 pmaddubsw m4, m0, [r4 + 2 * mmsize] | |
19766 pmulhrsw m4, m3 | |
19767 pslldq xm0, 2 | |
19768 pinsrb xm0, [r2 + 16 + 6], 0 | |
19769 pinsrb xm0, [r2 + 0], 1 | |
19770 vinserti128 m0, m0, xm0, 1 | |
19771 pmaddubsw m0, [r4 + 3 * mmsize] | |
19772 pmulhrsw m0, m3 | |
19773 packuswb m1, m2 | |
19774 packuswb m4, m0 | |
19775 | |
19776 lea r3, [3 * r1] | |
19777 movq [r0], xm1 | |
19778 vextracti128 xm2, m1, 1 | |
19779 movq [r0 + r1], xm2 | |
19780 movhps [r0 + 2 * r1], xm1 | |
19781 movhps [r0 + r3], xm2 | |
19782 lea r0, [r0 + 4 * r1] | |
19783 movq [r0], xm4 | |
19784 vextracti128 xm2, m4, 1 | |
19785 movq [r0 + r1], xm2 | |
19786 movhps [r0 + 2 * r1], xm4 | |
19787 movhps [r0 + r3], xm2 | |
19788 RET | |
19789 | |
; INTRA_PRED_ANG16_MC0 dst_lo, dst_hi, idx
;   %1 = address receiving the low 128 bits of the result
;   %2 = address receiving the high 128 bits of the result
;   %3 = coefficient table entry, in mmsize units from r4
; Expects m1 and m2 to hold the source byte pairs and m0 the rounding
; multiplier. Clobbers m3 and m4.
19790 %macro INTRA_PRED_ANG16_MC0 3 | |
19791 pmaddubsw m3, m1, [r4 + %3 * mmsize] | |
19792 pmulhrsw m3, m0 | |
19793 pmaddubsw m4, m2, [r4 + %3 * mmsize] | |
19794 pmulhrsw m4, m0 | |
19795 packuswb m3, m4 | |
19796 movu [%1], xm3 | |
19797 vextracti128 xm4, m3, 1 | |
19798 movu [%2], xm4 | |
19799 %endmacro | |
19800 | |
; INTRA_PRED_ANG16_MC1 idx
;   Expands INTRA_PRED_ANG16_MC0 twice: table entry %1 is stored to r0 and
;   r0 + r1, table entry %1 + 1 to r0 + 2 * r1 and r0 + r3.
;   Inherits the register expectations and clobbers of INTRA_PRED_ANG16_MC0.
19801 %macro INTRA_PRED_ANG16_MC1 1 | |
19802 INTRA_PRED_ANG16_MC0 r0, r0 + r1, %1 | |
19803 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, (%1 + 1) | |
19804 %endmacro | |
19805 | |
; INTRA_PRED_ANG16_MC2 offset
;   Loads 16 bytes from r2 + %1 into both lanes of m1 and 16 bytes from
;   r2 + %1 + 8 into both lanes of m2, then shuffles each with the mask in m5.
19806 %macro INTRA_PRED_ANG16_MC2 1 | |
19807 vbroadcasti128 m1, [r2 + %1] | |
19808 pshufb m1, m5 | |
19809 vbroadcasti128 m2, [r2 + (%1 + 8)] | |
19810 pshufb m2, m5 | |
19811 %endmacro | |
19812 | |
; INTRA_PRED_ANG16_MC3 dst, idx
;   %1 = address receiving 16 result bytes
;   %2 = coefficient table entry, in mmsize units from r4
; Merges the low lanes of m1 and m2 into m1 (m1 is overwritten), applies the
; table entry, rounds with m0, packs to bytes and reorders the quadwords
; (vpermq 11011000b selects 0, 2, 1, 3) so the low 128 bits form one row.
; The result stays in m3.
19813 %macro INTRA_PRED_ANG16_MC3 2 | |
19814 vperm2i128 m1, m1, m2, 00100000b | |
19815 pmaddubsw m3, m1, [r4 + (%2 * mmsize)] | |
19816 pmulhrsw m3, m0 | |
19817 packuswb m3, m3 | |
19818 vpermq m3, m3, 11011000b | |
19819 movu [%1], xm3 | |
19820 %endmacro | |
19821 | |
; INTRA_PRED_ANG16_MC4 dst0, dst1, idx
;   %1 = address receiving the low 128 bits of the packed result
;   %2 = address receiving the high 128 bits of the packed result
;   %3 = coefficient table entry, in mmsize units from r4
; Computes a new word result in m4 and packs it together with the word data
; already present in m3, so m3 must be prepared by the caller beforehand.
; m1 is overwritten; m3 ends up holding its own upper half in xm3.
19822 %macro INTRA_PRED_ANG16_MC4 3 | |
19823 vperm2i128 m1, m1, m2, 00100000b | |
19824 pmaddubsw m4, m1, [r4 + (%3 * mmsize)] | |
19825 pmulhrsw m4, m0 | |
19826 packuswb m3, m4 | |
19827 vpermq m3, m3, 11011000b | |
19828 movu [%1], xm3 | |
19829 vextracti128 xm3, m3, 1 | |
19830 movu [%2], xm3 | |
19831 %endmacro | |
19832 | |
19833 %if ARCH_X86_64 == 1 | |
; INTRA_PRED_TRANS_STORE_16x16
;   Takes eight 32-byte vectors in m0..m7, interleaves them at byte, word and
;   dword granularity (presumably a 16x16 byte transpose - confirm against the
;   callers' register layout) and writes sixteen 16-byte rows.
;   Expects r0 = output pointer, r1 = row pitch, r3 = 3 * r1.
;   Uses m8 as scratch, overwrites m0..m7, and leaves r0 advanced by 12 * r1.
19834 %macro INTRA_PRED_TRANS_STORE_16x16 0 | |
; Stage 1: interleave bytes of neighbouring input vectors.
19835 punpcklbw m8, m0, m1 | |
19836 punpckhbw m0, m1 | |
19837 | |
19838 punpcklbw m1, m2, m3 | |
19839 punpckhbw m2, m3 | |
19840 | |
19841 punpcklbw m3, m4, m5 | |
19842 punpckhbw m4, m5 | |
19843 | |
19844 punpcklbw m5, m6, m7 | |
19845 punpckhbw m6, m7 | |
19846 | |
; Stage 2: interleave words.
19847 punpcklwd m7, m8, m1 | |
19848 punpckhwd m8, m1 | |
19849 | |
19850 punpcklwd m1, m3, m5 | |
19851 punpckhwd m3, m5 | |
19852 | |
19853 punpcklwd m5, m0, m2 | |
19854 punpckhwd m0, m2 | |
19855 | |
19856 punpcklwd m2, m4, m6 | |
19857 punpckhwd m4, m6 | |
19858 | |
; Stage 3: interleave dwords.
19859 punpckldq m6, m7, m1 | |
19860 punpckhdq m7, m1 | |
19861 | |
19862 punpckldq m1, m8, m3 | |
19863 punpckhdq m8, m3 | |
19864 | |
19865 punpckldq m3, m5, m2 | |
19866 punpckhdq m5, m2 | |
19867 | |
19868 punpckldq m2, m0, m4 | |
19869 punpckhdq m0, m4 | |
19870 | |
; 0xD8 reorders the quadwords as 0, 2, 1, 3 so each 128-bit half is one row.
19871 vpermq m6, m6, 0xD8 | |
19872 vpermq m7, m7, 0xD8 | |
19873 vpermq m1, m1, 0xD8 | |
19874 vpermq m8, m8, 0xD8 | |
19875 vpermq m3, m3, 0xD8 | |
19876 vpermq m5, m5, 0xD8 | |
19877 vpermq m2, m2, 0xD8 | |
19878 vpermq m0, m0, 0xD8 | |
19879 | |
; Rows 0-3 (m4 is reused as a scratch for each upper half).
19880 movu [r0], xm6 | |
19881 vextracti128 xm4, m6, 1 | |
19882 movu [r0 + r1], xm4 | |
19883 | |
19884 movu [r0 + 2 * r1], xm7 | |
19885 vextracti128 xm4, m7, 1 | |
19886 movu [r0 + r3], xm4 | |
19887 | |
19888 lea r0, [r0 + 4 * r1] | |
19889 | |
; Rows 4-7.
19890 movu [r0], xm1 | |
19891 vextracti128 xm4, m1, 1 | |
19892 movu [r0 + r1], xm4 | |
19893 | |
19894 movu [r0 + 2 * r1], xm8 | |
19895 vextracti128 xm4, m8, 1 | |
19896 movu [r0 + r3], xm4 | |
19897 | |
19898 lea r0, [r0 + 4 * r1] | |
19899 | |
; Rows 8-11.
19900 movu [r0], xm3 | |
19901 vextracti128 xm4, m3, 1 | |
19902 movu [r0 + r1], xm4 | |
19903 | |
19904 movu [r0 + 2 * r1], xm5 | |
19905 vextracti128 xm4, m5, 1 | |
19906 movu [r0 + r3], xm4 | |
19907 | |
19908 lea r0, [r0 + 4 * r1] | |
19909 | |
; Rows 12-15.
19910 movu [r0], xm2 | |
19911 vextracti128 xm4, m2, 1 | |
19912 movu [r0 + r1], xm4 | |
19913 | |
19914 movu [r0 + 2 * r1], xm0 | |
19915 vextracti128 xm4, m0, 1 | |
19916 movu [r0 + r3], xm4 | |
19917 %endmacro | |
19918 | |
; INTRA_PRED_ANG16_CAL_ROW dst, tmp, idx
;   %1 = register receiving the packed byte result
;   %2 = scratch register (overwritten)
;   %3 = coefficient table entry, in mmsize units from r4
; Expects m9 and m10 to hold the source byte pairs and m11 the rounding
; multiplier.
19919 %macro INTRA_PRED_ANG16_CAL_ROW 3 | |
19920 pmaddubsw %1, m9, [r4 + (%3 * mmsize)] | |
19921 pmulhrsw %1, m11 | |
19922 pmaddubsw %2, m10, [r4 + (%3 * mmsize)] | |
19923 pmulhrsw %2, m11 | |
19924 packuswb %1, %2 | |
19925 %endmacro | |
19926 | |
19927 | |
; intra_pred_ang16_12 (AVX2): writes a 16x16 byte block, two rows per step.
; Register roles as used below:
;   r0 = output pointer, r1 = output row pitch, r2 = source byte buffer,
;   r3 = 3 * r1
;   m0/m1 = the two 16-byte halves of angHor_tab_12 broadcast to both lanes
;           (m0 weights bytes 0-7 of a row, m1 bytes 8-15)
;   m2 = rounding multiplier from pw_1024
;        ((x + 16) >> 5 if every word is 1024 - TODO confirm)
;   m7/m8 = shuffle masks that build the source pairs for each half-row
;   m3 = base source window, m6 = the 16 bytes that follow it
; Each step slides the window by 2 more bytes (palignr m6:m3); after the
; final pack the low lane is the even row and the high lane the odd row.
19928 INIT_YMM avx2 | |
19929 cglobal intra_pred_ang16_12, 3,4,9 | |
19930 vbroadcasti128 m0, [angHor_tab_12] | |
19931 vbroadcasti128 m1, [angHor_tab_12 + mmsize/2] | |
19932 mova m2, [pw_1024] | |
19933 mova m7, [ang16_shuf_mode12] | |
19934 mova m8, [ang16_shuf_mode12 + mmsize] | |
19935 lea r3, [r1 * 3] | |
19936 | |
; xm4 = 16 bytes at r2 + 30 with bytes 0, 1, 2 replaced by the bytes at
; r2 + 13, r2 + 6 and r2 + 0 respectively; copied to both lanes as m3.
19937 movu xm4, [r2 + mmsize - 2] | |
19938 pinsrb xm4, [r2 + 0], 2 | |
19939 pinsrb xm4, [r2 + 6], 1 | |
19940 pinsrb xm4, [r2 + 13], 0 | |
19941 vbroadcasti128 m6, [r2 + mmsize + 14] | |
19942 vinserti128 m3, m4, xm4, 1 | |
19943 | |
; rows 0 and 1
19944 pshufb m4, m3, m7 | |
19945 pshufb m5, m3, m8 | |
19946 pmaddubsw m4, m0 | |
19947 pmaddubsw m5, m1 | |
19948 pmulhrsw m4, m2 | |
19949 pmulhrsw m5, m2 | |
19950 packuswb m4, m5 | |
19951 movu [r0], xm4 | |
19952 vextracti128 [r0 + r1], m4, 1 | |
19953 | |
; rows 2 and 3
19954 palignr m5, m6, m3, 2 | |
19955 pshufb m4, m5, m7 | |
19956 pshufb m5, m8 | |
19957 | |
19958 pmaddubsw m4, m0 | |
19959 pmaddubsw m5, m1 | |
19960 pmulhrsw m4, m2 | |
19961 pmulhrsw m5, m2 | |
19962 packuswb m4, m5 | |
19963 movu [r0 + r1 * 2], xm4 | |
19964 vextracti128 [r0 + r3], m4, 1 | |
19965 lea r0, [r0 + r1 * 4] | |
19966 | |
; rows 4 and 5
19967 palignr m5, m6, m3, 4 | |
19968 pshufb m4, m5, m7 | |
19969 pshufb m5, m8 | |
19970 | |
19971 pmaddubsw m4, m0 | |
19972 pmaddubsw m5, m1 | |
19973 pmulhrsw m4, m2 | |
19974 pmulhrsw m5, m2 | |
19975 packuswb m4, m5 | |
19976 movu [r0], xm4 | |
19977 vextracti128 [r0 + r1], m4, 1 | |
19978 | |
; rows 6 and 7
19979 palignr m5, m6, m3, 6 | |
19980 pshufb m4, m5, m7 | |
19981 pshufb m5, m8 | |
19982 | |
19983 pmaddubsw m4, m0 | |
19984 pmaddubsw m5, m1 | |
19985 pmulhrsw m4, m2 | |
19986 pmulhrsw m5, m2 | |
19987 packuswb m4, m5 | |
19988 movu [r0 + r1 * 2], xm4 | |
19989 vextracti128 [r0 + r3], m4, 1 | |
19990 lea r0, [r0 + r1 * 4] | |
19991 | |
; rows 8 and 9
19992 palignr m5, m6, m3, 8 | |
19993 pshufb m4, m5, m7 | |
19994 pshufb m5, m8 | |
19995 | |
19996 pmaddubsw m4, m0 | |
19997 pmaddubsw m5, m1 | |
19998 pmulhrsw m4, m2 | |
19999 pmulhrsw m5, m2 | |
20000 packuswb m4, m5 | |
20001 movu [r0], xm4 | |
20002 vextracti128 [r0 + r1], m4, 1 | |
20003 | |
; rows 10 and 11
20004 palignr m5, m6, m3, 10 | |
20005 pshufb m4, m5, m7 | |
20006 pshufb m5, m8 | |
20007 | |
20008 pmaddubsw m4, m0 | |
20009 pmaddubsw m5, m1 | |
20010 pmulhrsw m4, m2 | |
20011 pmulhrsw m5, m2 | |
20012 packuswb m4, m5 | |
20013 movu [r0 + r1 * 2], xm4 | |
20014 vextracti128 [r0 + r3], m4, 1 | |
20015 lea r0, [r0 + r1 * 4] | |
20016 | |
; rows 12 and 13
20017 palignr m5, m6, m3, 12 | |
20018 pshufb m4, m5, m7 | |
20019 pshufb m5, m8 | |
20020 | |
20021 pmaddubsw m4, m0 | |
20022 pmaddubsw m5, m1 | |
20023 pmulhrsw m4, m2 | |
20024 pmulhrsw m5, m2 | |
20025 packuswb m4, m5 | |
20026 movu [r0], xm4 | |
20027 vextracti128 [r0 + r1], m4, 1 | |
20028 | |
; rows 14 and 15
20029 palignr m5, m6, m3, 14 | |
20030 pshufb m4, m5, m7 | |
20031 pshufb m5, m8 | |
20032 | |
20033 pmaddubsw m4, m0 | |
20034 pmaddubsw m5, m1 | |
20035 pmulhrsw m4, m2 | |
20036 pmulhrsw m5, m2 | |
20037 packuswb m4, m5 | |
20038 movu [r0 + r1 * 2], xm4 | |
20039 vextracti128 [r0 + r3], m4, 1 | |
20040 RET | |
20041 | |
; intra_pred_ang16_13 (AVX2): writes a 16x16 byte block, two rows per step.
; Register roles as used below:
;   r0 = output pointer, r1 = output row pitch, r2 = source byte buffer,
;   r3 = 3 * r1
;   m0/m1 = the two 16-byte halves of angHor_tab_13 broadcast to both lanes
;           (m0 weights bytes 0-7 of a row, m1 bytes 8-15)
;   m2 = rounding multiplier from pw_1024
;        ((x + 16) >> 5 if every word is 1024 - TODO confirm)
;   m7/m8 = shuffle masks that build the source pairs for each half-row
;   m3 = base source window, m6 = the 16 bytes at r2 + mmsize + 12
; Each step slides the window by 2 more bytes (palignr m6:m3); after the
; final pack the low lane is the even row and the high lane the odd row.
20042 INIT_YMM avx2 | |
20043 cglobal intra_pred_ang16_13, 3,4,9 | |
20044 vbroadcasti128 m0, [angHor_tab_13] | |
20045 vbroadcasti128 m1, [angHor_tab_13 + mmsize/2] | |
20046 mova m2, [pw_1024] | |
20047 mova m7, [ang16_shuf_mode13] | |
20048 mova m8, [ang16_shuf_mode13 + mmsize] | |
20049 lea r3, [r1 * 3] | |
20050 | |
; m4 = first 16 source bytes rearranged by the third ang16_shuf_mode13 mask.
20051 vbroadcasti128 m3, [r2 + mmsize + 1] | |
20052 vbroadcasti128 m4, [r2] | |
20053 pshufb m4, [ang16_shuf_mode13 + mmsize * 2] | |
20054 | |
; m3 = top 5 bytes of m4 followed by the first 11 bytes at r2 + mmsize + 1.
20055 palignr m3, m4, 11 | |
20056 vbroadcasti128 m6, [r2 + mmsize + 12] | |
20057 | |
; rows 0 and 1
20058 pshufb m4, m3, m7 | |
20059 pshufb m5, m3, m8 | |
20060 pmaddubsw m4, m0 | |
20061 pmaddubsw m5, m1 | |
20062 pmulhrsw m4, m2 | |
20063 pmulhrsw m5, m2 | |
20064 packuswb m4, m5 | |
20065 movu [r0], xm4 | |
20066 vextracti128 [r0 + r1], m4, 1 | |
20067 | |
; rows 2 and 3
20068 palignr m5, m6, m3, 2 | |
20069 pshufb m4, m5, m7 | |
20070 pshufb m5, m8 | |
20071 | |
20072 pmaddubsw m4, m0 | |
20073 pmaddubsw m5, m1 | |
20074 pmulhrsw m4, m2 | |
20075 pmulhrsw m5, m2 | |
20076 packuswb m4, m5 | |
20077 movu [r0 + r1 * 2], xm4 | |
20078 vextracti128 [r0 + r3], m4, 1 | |
20079 lea r0, [r0 + r1 * 4] | |
20080 | |
; rows 4 and 5
20081 palignr m5, m6, m3, 4 | |
20082 pshufb m4, m5, m7 | |
20083 pshufb m5, m8 | |
20084 | |
20085 pmaddubsw m4, m0 | |
20086 pmaddubsw m5, m1 | |
20087 pmulhrsw m4, m2 | |
20088 pmulhrsw m5, m2 | |
20089 packuswb m4, m5 | |
20090 movu [r0], xm4 | |
20091 vextracti128 [r0 + r1], m4, 1 | |
20092 | |
; rows 6 and 7
20093 palignr m5, m6, m3, 6 | |
20094 pshufb m4, m5, m7 | |
20095 pshufb m5, m8 | |
20096 | |
20097 pmaddubsw m4, m0 | |
20098 pmaddubsw m5, m1 | |
20099 pmulhrsw m4, m2 | |
20100 pmulhrsw m5, m2 | |
20101 packuswb m4, m5 | |
20102 movu [r0 + r1 * 2], xm4 | |
20103 vextracti128 [r0 + r3], m4, 1 | |
20104 lea r0, [r0 + r1 * 4] | |
20105 | |
; rows 8 and 9
20106 palignr m5, m6, m3, 8 | |
20107 pshufb m4, m5, m7 | |
20108 pshufb m5, m8 | |
20109 | |
20110 pmaddubsw m4, m0 | |
20111 pmaddubsw m5, m1 | |
20112 pmulhrsw m4, m2 | |
20113 pmulhrsw m5, m2 | |
20114 packuswb m4, m5 | |
20115 movu [r0], xm4 | |
20116 vextracti128 [r0 + r1], m4, 1 | |
20117 | |
; rows 10 and 11
20118 palignr m5, m6, m3, 10 | |
20119 pshufb m4, m5, m7 | |
20120 pshufb m5, m8 | |
20121 | |
20122 pmaddubsw m4, m0 | |
20123 pmaddubsw m5, m1 | |
20124 pmulhrsw m4, m2 | |
20125 pmulhrsw m5, m2 | |
20126 packuswb m4, m5 | |
20127 movu [r0 + r1 * 2], xm4 | |
20128 vextracti128 [r0 + r3], m4, 1 | |
20129 lea r0, [r0 + r1 * 4] | |
20130 | |
; rows 12 and 13
20131 palignr m5, m6, m3, 12 | |
20132 pshufb m4, m5, m7 | |
20133 pshufb m5, m8 | |
20134 | |
20135 pmaddubsw m4, m0 | |
20136 pmaddubsw m5, m1 | |
20137 pmulhrsw m4, m2 | |
20138 pmulhrsw m5, m2 | |
20139 packuswb m4, m5 | |
20140 movu [r0], xm4 | |
20141 vextracti128 [r0 + r1], m4, 1 | |
20142 | |
; rows 14 and 15
20143 palignr m5, m6, m3, 14 | |
20144 pshufb m4, m5, m7 | |
20145 pshufb m5, m8 | |
20146 | |
20147 pmaddubsw m4, m0 | |
20148 pmaddubsw m5, m1 | |
20149 pmulhrsw m4, m2 | |
20150 pmulhrsw m5, m2 | |
20151 packuswb m4, m5 | |
20152 movu [r0 + r1 * 2], xm4 | |
20153 vextracti128 [r0 + r3], m4, 1 | |
20154 RET | |
20155 | |
; intra_pred_ang16_14 (AVX2): writes a 16x16 byte block, two rows per step.
; Register roles as used below:
;   r0 = output pointer, r1 = output row pitch, r2 = source byte buffer,
;   r3 = 3 * r1
;   m0/m1 = the two 16-byte halves of angHor_tab_14 broadcast to both lanes
;           (m0 weights bytes 0-7 of a row, m1 bytes 8-15)
;   m2 = rounding multiplier from pw_1024
;        ((x + 16) >> 5 if every word is 1024 - TODO confirm)
;   m7/m8 = shuffle masks that build the source pairs for each half-row
;   m3 = base source window, m6 = the 16 bytes at r2 + mmsize + 10
; Each step slides the window by 2 more bytes (palignr m6:m3); after the
; final pack the low lane is the even row and the high lane the odd row.
20156 INIT_YMM avx2 | |
20157 cglobal intra_pred_ang16_14, 3,4,9 | |
20158 vbroadcasti128 m0, [angHor_tab_14] | |
20159 vbroadcasti128 m1, [angHor_tab_14 + mmsize/2] | |
20160 mova m2, [pw_1024] | |
20161 mova m7, [ang16_shuf_mode14] | |
20162 mova m8, [ang16_shuf_mode14 + mmsize] | |
20163 lea r3, [r1 * 3] | |
20164 | |
; m4 = first 16 source bytes rearranged by the third ang16_shuf_mode14 mask;
; m3 = top 7 bytes of m4 followed by the first 9 bytes at r2 + mmsize + 1.
20165 vbroadcasti128 m3, [r2 + mmsize + 1] | |
20166 vbroadcasti128 m4, [r2] | |
20167 pshufb m4, [ang16_shuf_mode14 + mmsize * 2] | |
20168 palignr m3, m4, 9 | |
20169 vbroadcasti128 m6, [r2 + mmsize + 10] | |
20170 | |
; rows 0 and 1
20171 pshufb m4, m3, m7 | |
20172 pshufb m5, m3, m8 | |
20173 pmaddubsw m4, m0 | |
20174 pmaddubsw m5, m1 | |
20175 pmulhrsw m4, m2 | |
20176 pmulhrsw m5, m2 | |
20177 packuswb m4, m5 | |
20178 movu [r0], xm4 | |
20179 vextracti128 [r0 + r1], m4, 1 | |
20180 | |
; rows 2 and 3
20181 palignr m5, m6, m3, 2 | |
20182 pshufb m4, m5, m7 | |
20183 pshufb m5, m8 | |
20184 | |
20185 pmaddubsw m4, m0 | |
20186 pmaddubsw m5, m1 | |
20187 pmulhrsw m4, m2 | |
20188 pmulhrsw m5, m2 | |
20189 packuswb m4, m5 | |
20190 movu [r0 + r1 * 2], xm4 | |
20191 vextracti128 [r0 + r3], m4, 1 | |
20192 lea r0, [r0 + r1 * 4] | |
20193 | |
; rows 4 and 5
20194 palignr m5, m6, m3, 4 | |
20195 pshufb m4, m5, m7 | |
20196 pshufb m5, m8 | |
20197 | |
20198 pmaddubsw m4, m0 | |
20199 pmaddubsw m5, m1 | |
20200 pmulhrsw m4, m2 | |
20201 pmulhrsw m5, m2 | |
20202 packuswb m4, m5 | |
20203 movu [r0], xm4 | |
20204 vextracti128 [r0 + r1], m4, 1 | |
20205 | |
; rows 6 and 7
20206 palignr m5, m6, m3, 6 | |
20207 pshufb m4, m5, m7 | |
20208 pshufb m5, m8 | |
20209 | |
20210 pmaddubsw m4, m0 | |
20211 pmaddubsw m5, m1 | |
20212 pmulhrsw m4, m2 | |
20213 pmulhrsw m5, m2 | |
20214 packuswb m4, m5 | |
20215 movu [r0 + r1 * 2], xm4 | |
20216 vextracti128 [r0 + r3], m4, 1 | |
20217 lea r0, [r0 + r1 * 4] | |
20218 | |
; rows 8 and 9
20219 palignr m5, m6, m3, 8 | |
20220 pshufb m4, m5, m7 | |
20221 pshufb m5, m8 | |
20222 | |
20223 pmaddubsw m4, m0 | |
20224 pmaddubsw m5, m1 | |
20225 pmulhrsw m4, m2 | |
20226 pmulhrsw m5, m2 | |
20227 packuswb m4, m5 | |
20228 movu [r0], xm4 | |
20229 vextracti128 [r0 + r1], m4, 1 | |
20230 | |
; rows 10 and 11
20231 palignr m5, m6, m3, 10 | |
20232 pshufb m4, m5, m7 | |
20233 pshufb m5, m8 | |
20234 | |
20235 pmaddubsw m4, m0 | |
20236 pmaddubsw m5, m1 | |
20237 pmulhrsw m4, m2 | |
20238 pmulhrsw m5, m2 | |
20239 packuswb m4, m5 | |
20240 movu [r0 + r1 * 2], xm4 | |
20241 vextracti128 [r0 + r3], m4, 1 | |
20242 lea r0, [r0 + r1 * 4] | |
20243 | |
; rows 12 and 13
20244 palignr m5, m6, m3, 12 | |
20245 pshufb m4, m5, m7 | |
20246 pshufb m5, m8 | |
20247 | |
20248 pmaddubsw m4, m0 | |
20249 pmaddubsw m5, m1 | |
20250 pmulhrsw m4, m2 | |
20251 pmulhrsw m5, m2 | |
20252 packuswb m4, m5 | |
20253 movu [r0], xm4 | |
20254 vextracti128 [r0 + r1], m4, 1 | |
20255 | |
; rows 14 and 15
20256 palignr m5, m6, m3, 14 | |
20257 pshufb m4, m5, m7 | |
20258 pshufb m5, m8 | |
20259 | |
20260 pmaddubsw m4, m0 | |
20261 pmaddubsw m5, m1 | |
20262 pmulhrsw m4, m2 | |
20263 pmulhrsw m5, m2 | |
20264 packuswb m4, m5 | |
20265 movu [r0 + r1 * 2], xm4 | |
20266 vextracti128 [r0 + r3], m4, 1 | |
20267 RET | |
20268 | |
; intra_pred_ang16_15 (AVX2): writes a 16x16 byte block, two rows per step.
; Register roles as used below:
;   r0 = output pointer, r1 = output row pitch, r2 = source byte buffer,
;   r3 = 3 * r1
;   m0/m1 = the two 16-byte halves of angHor_tab_15 broadcast to both lanes
;           (m0 weights bytes 0-7 of a row, m1 bytes 8-15)
;   m2 = rounding multiplier from pw_1024
;        ((x + 16) >> 5 if every word is 1024 - TODO confirm)
;   m7/m8 = shuffle masks that build the source pairs for each half-row
;   m3 = base source window, m6 = the 16 bytes at r2 + mmsize + 8
; Each step slides the window by 2 more bytes (palignr m6:m3); after the
; final pack the low lane is the even row and the high lane the odd row.
20269 INIT_YMM avx2 | |
20270 cglobal intra_pred_ang16_15, 3,4,9 | |
20271 vbroadcasti128 m0, [angHor_tab_15] | |
20272 vbroadcasti128 m1, [angHor_tab_15 + mmsize/2] | |
20273 mova m2, [pw_1024] | |
20274 mova m7, [ang16_shuf_mode15] | |
20275 mova m8, [ang16_shuf_mode15 + mmsize] | |
20276 lea r3, [r1 * 3] | |
20277 | |
; m4 = first 16 source bytes rearranged by the third ang16_shuf_mode15 mask;
; m3 = top 9 bytes of m4 followed by the first 7 bytes at r2 + mmsize + 1.
20278 vbroadcasti128 m3, [r2 + mmsize + 1] | |
20279 vbroadcasti128 m4, [r2] | |
20280 pshufb m4, [ang16_shuf_mode15 + mmsize * 2] | |
20281 palignr m3, m3, m4, 7 | |
20282 vbroadcasti128 m6, [r2 + mmsize + 8] | |
20283 | |
; rows 0 and 1
20284 pshufb m4, m3, m7 | |
20285 pshufb m5, m3, m8 | |
20286 pmaddubsw m4, m0 | |
20287 pmaddubsw m5, m1 | |
20288 pmulhrsw m4, m2 | |
20289 pmulhrsw m5, m2 | |
20290 packuswb m4, m5 | |
20291 movu [r0], xm4 | |
20292 vextracti128 [r0 + r1], m4, 1 | |
20293 | |
; rows 2 and 3
20294 palignr m5, m6, m3, 2 | |
20295 pshufb m4, m5, m7 | |
20296 pshufb m5, m8 | |
20297 | |
20298 pmaddubsw m4, m0 | |
20299 pmaddubsw m5, m1 | |
20300 pmulhrsw m4, m2 | |
20301 pmulhrsw m5, m2 | |
20302 packuswb m4, m5 | |
20303 movu [r0 + r1 * 2], xm4 | |
20304 vextracti128 [r0 + r3], m4, 1 | |
20305 lea r0, [r0 + r1 * 4] | |
20306 | |
; rows 4 and 5
20307 palignr m5, m6, m3, 4 | |
20308 pshufb m4, m5, m7 | |
20309 pshufb m5, m8 | |
20310 | |
20311 pmaddubsw m4, m0 | |
20312 pmaddubsw m5, m1 | |
20313 pmulhrsw m4, m2 | |
20314 pmulhrsw m5, m2 | |
20315 packuswb m4, m5 | |
20316 movu [r0], xm4 | |
20317 vextracti128 [r0 + r1], m4, 1 | |
20318 | |
; rows 6 and 7
20319 palignr m5, m6, m3, 6 | |
20320 pshufb m4, m5, m7 | |
20321 pshufb m5, m8 | |
20322 | |
20323 pmaddubsw m4, m0 | |
20324 pmaddubsw m5, m1 | |
20325 pmulhrsw m4, m2 | |
20326 pmulhrsw m5, m2 | |
20327 packuswb m4, m5 | |
20328 movu [r0 + r1 * 2], xm4 | |
20329 vextracti128 [r0 + r3], m4, 1 | |
20330 lea r0, [r0 + r1 * 4] | |
20331 | |
; rows 8 and 9
20332 palignr m5, m6, m3, 8 | |
20333 pshufb m4, m5, m7 | |
20334 pshufb m5, m8 | |
20335 | |
20336 pmaddubsw m4, m0 | |
20337 pmaddubsw m5, m1 | |
20338 pmulhrsw m4, m2 | |
20339 pmulhrsw m5, m2 | |
20340 packuswb m4, m5 | |
20341 movu [r0], xm4 | |
20342 vextracti128 [r0 + r1], m4, 1 | |
20343 | |
; rows 10 and 11
20344 palignr m5, m6, m3, 10 | |
20345 pshufb m4, m5, m7 | |
20346 pshufb m5, m8 | |
20347 | |
20348 pmaddubsw m4, m0 | |
20349 pmaddubsw m5, m1 | |
20350 pmulhrsw m4, m2 | |
20351 pmulhrsw m5, m2 | |
20352 packuswb m4, m5 | |
20353 movu [r0 + r1 * 2], xm4 | |
20354 vextracti128 [r0 + r3], m4, 1 | |
20355 lea r0, [r0 + r1 * 4] | |
20356 | |
; rows 12 and 13
20357 palignr m5, m6, m3, 12 | |
20358 pshufb m4, m5, m7 | |
20359 pshufb m5, m8 | |
20360 | |
20361 pmaddubsw m4, m0 | |
20362 pmaddubsw m5, m1 | |
20363 pmulhrsw m4, m2 | |
20364 pmulhrsw m5, m2 | |
20365 packuswb m4, m5 | |
20366 movu [r0], xm4 | |
20367 vextracti128 [r0 + r1], m4, 1 | |
20368 | |
; rows 14 and 15
20369 palignr m5, m6, m3, 14 | |
20370 pshufb m4, m5, m7 | |
20371 pshufb m5, m8 | |
20372 | |
20373 pmaddubsw m4, m0 | |
20374 pmaddubsw m5, m1 | |
20375 pmulhrsw m4, m2 | |
20376 pmulhrsw m5, m2 | |
20377 packuswb m4, m5 | |
20378 movu [r0 + r1 * 2], xm4 | |
20379 vextracti128 [r0 + r3], m4, 1 | |
20380 RET | |
20381 | |
; intra_pred_ang16_16 (AVX2): writes a 16x16 byte block, two rows per step.
; Register roles as used below:
;   r0 = output pointer, r1 = output row pitch, r2 = source byte buffer,
;   r3 = 3 * r1
;   m0/m1 = the two 16-byte halves of angHor_tab_16 broadcast to both lanes
;           (m0 weights bytes 0-7 of a row, m1 bytes 8-15)
;   m2 = rounding multiplier from pw_1024
;        ((x + 16) >> 5 if every word is 1024 - TODO confirm)
;   m7/m8 = shuffle masks that build the source pairs for each half-row
;   m3 = base source window, m6 = the 16 bytes at r2 + mmsize + 6
; Each step slides the window by 2 more bytes (palignr m6:m3); after the
; final pack the low lane is the even row and the high lane the odd row.
20382 INIT_YMM avx2 | |
20383 cglobal intra_pred_ang16_16, 3,4,9 | |
20384 vbroadcasti128 m0, [angHor_tab_16] | |
20385 vbroadcasti128 m1, [angHor_tab_16 + mmsize/2] | |
20386 mova m2, [pw_1024] | |
20387 mova m7, [ang16_shuf_mode16] | |
20388 mova m8, [ang16_shuf_mode16 + mmsize] | |
20389 lea r3, [r1 * 3] | |
20390 | |
; m4 = first 16 source bytes rearranged by the third ang16_shuf_mode16 mask;
; m3 = top 11 bytes of m4 followed by the first 5 bytes at r2 + mmsize + 1.
20391 vbroadcasti128 m3, [r2 + mmsize + 1] | |
20392 vbroadcasti128 m4, [r2] | |
20393 pshufb m4, [ang16_shuf_mode16 + mmsize * 2] | |
20394 palignr m3, m4, 5 | |
20395 vbroadcasti128 m6, [r2 + mmsize + 6] | |
20396 | |
; rows 0 and 1
20397 pshufb m4, m3, m7 | |
20398 pshufb m5, m3, m8 | |
20399 pmaddubsw m4, m0 | |
20400 pmaddubsw m5, m1 | |
20401 pmulhrsw m4, m2 | |
20402 pmulhrsw m5, m2 | |
20403 packuswb m4, m5 | |
20404 movu [r0], xm4 | |
20405 vextracti128 [r0 + r1], m4, 1 | |
20406 | |
; rows 2 and 3
20407 palignr m5, m6, m3, 2 | |
20408 pshufb m4, m5, m7 | |
20409 pshufb m5, m8 | |
20410 | |
20411 pmaddubsw m4, m0 | |
20412 pmaddubsw m5, m1 | |
20413 pmulhrsw m4, m2 | |
20414 pmulhrsw m5, m2 | |
20415 packuswb m4, m5 | |
20416 movu [r0 + r1 * 2], xm4 | |
20417 vextracti128 [r0 + r3], m4, 1 | |
20418 lea r0, [r0 + r1 * 4] | |
20419 | |
; rows 4 and 5
20420 palignr m5, m6, m3, 4 | |
20421 pshufb m4, m5, m7 | |
20422 pshufb m5, m8 | |
20423 | |
20424 pmaddubsw m4, m0 | |
20425 pmaddubsw m5, m1 | |
20426 pmulhrsw m4, m2 | |
20427 pmulhrsw m5, m2 | |
20428 packuswb m4, m5 | |
20429 movu [r0], xm4 | |
20430 vextracti128 [r0 + r1], m4, 1 | |
20431 | |
; rows 6 and 7
20432 palignr m5, m6, m3, 6 | |
20433 pshufb m4, m5, m7 | |
20434 pshufb m5, m8 | |
20435 | |
20436 pmaddubsw m4, m0 | |
20437 pmaddubsw m5, m1 | |
20438 pmulhrsw m4, m2 | |
20439 pmulhrsw m5, m2 | |
20440 packuswb m4, m5 | |
20441 movu [r0 + r1 * 2], xm4 | |
20442 vextracti128 [r0 + r3], m4, 1 | |
20443 lea r0, [r0 + r1 * 4] | |
20444 | |
; rows 8 and 9
20445 palignr m5, m6, m3, 8 | |
20446 pshufb m4, m5, m7 | |
20447 pshufb m5, m8 | |
20448 | |
20449 pmaddubsw m4, m0 | |
20450 pmaddubsw m5, m1 | |
20451 pmulhrsw m4, m2 | |
20452 pmulhrsw m5, m2 | |
20453 packuswb m4, m5 | |
20454 movu [r0], xm4 | |
20455 vextracti128 [r0 + r1], m4, 1 | |
20456 | |
; rows 10 and 11
20457 palignr m5, m6, m3, 10 | |
20458 pshufb m4, m5, m7 | |
20459 pshufb m5, m8 | |
20460 | |
20461 pmaddubsw m4, m0 | |
20462 pmaddubsw m5, m1 | |
20463 pmulhrsw m4, m2 | |
20464 pmulhrsw m5, m2 | |
20465 packuswb m4, m5 | |
20466 movu [r0 + r1 * 2], xm4 | |
20467 vextracti128 [r0 + r3], m4, 1 | |
20468 lea r0, [r0 + r1 * 4] | |
20469 | |
; rows 12 and 13
20470 palignr m5, m6, m3, 12 | |
20471 pshufb m4, m5, m7 | |
20472 pshufb m5, m8 | |
20473 | |
20474 pmaddubsw m4, m0 | |
20475 pmaddubsw m5, m1 | |
20476 pmulhrsw m4, m2 | |
20477 pmulhrsw m5, m2 | |
20478 packuswb m4, m5 | |
20479 movu [r0], xm4 | |
20480 vextracti128 [r0 + r1], m4, 1 | |
20481 | |
; rows 14 and 15
20482 palignr m5, m6, m3, 14 | |
20483 pshufb m4, m5, m7 | |
20484 pshufb m5, m8 | |
20485 | |
20486 pmaddubsw m4, m0 | |
20487 pmaddubsw m5, m1 | |
20488 pmulhrsw m4, m2 | |
20489 pmulhrsw m5, m2 | |
20490 packuswb m4, m5 | |
20491 movu [r0 + r1 * 2], xm4 | |
20492 vextracti128 [r0 + r3], m4, 1 | |
20493 RET | |
20494 | |
; intra_pred_ang16_17 (AVX2): writes a 16x16 byte block, two rows per step.
; Register roles as used below:
;   r0 = output pointer, r1 = output row pitch, r2 = source byte buffer,
;   r3 = 3 * r1
;   m0/m1 = the two 16-byte halves of angHor_tab_17 broadcast to both lanes
;           (m0 weights bytes 0-7 of a row, m1 bytes 8-15)
;   m2 = rounding multiplier from pw_1024
;        ((x + 16) >> 5 if every word is 1024 - TODO confirm)
;   m7/m8 = shuffle masks that build the source pairs for each half-row
;   m3 = base source window, m6 = the 16 bytes at r2 + mmsize + 4
; Each step slides the window by 2 more bytes (palignr m6:m3); after the
; final pack the low lane is the even row and the high lane the odd row.
20495 INIT_YMM avx2 | |
20496 cglobal intra_pred_ang16_17, 3,4,9 | |
20497 vbroadcasti128 m0, [angHor_tab_17] | |
20498 vbroadcasti128 m1, [angHor_tab_17 + mmsize/2] | |
20499 mova m2, [pw_1024] | |
20500 mova m7, [ang16_shuf_mode17] | |
20501 mova m8, [ang16_shuf_mode17 + mmsize] | |
20502 lea r3, [r1 * 3] | |
20503 | |
; m4 = first 16 source bytes rearranged by the third ang16_shuf_mode17 mask;
; m3 = top 13 bytes of m4 followed by the first 3 bytes at r2 + mmsize + 1.
20504 vbroadcasti128 m3, [r2 + mmsize + 1] | |
20505 vbroadcasti128 m4, [r2] | |
20506 pshufb m4, [ang16_shuf_mode17 + mmsize * 2] | |
20507 palignr m3, m4, 3 | |
20508 vbroadcasti128 m6, [r2 + mmsize + 4] | |
20509 | |
; rows 0 and 1
20510 pshufb m4, m3, m7 | |
20511 pshufb m5, m3, m8 | |
20512 pmaddubsw m4, m0 | |
20513 pmaddubsw m5, m1 | |
20514 pmulhrsw m4, m2 | |
20515 pmulhrsw m5, m2 | |
20516 packuswb m4, m5 | |
20517 movu [r0], xm4 | |
20518 vextracti128 [r0 + r1], m4, 1 | |
20519 | |
; rows 2 and 3
20520 palignr m5, m6, m3, 2 | |
20521 pshufb m4, m5, m7 | |
20522 pshufb m5, m8 | |
20523 | |
20524 pmaddubsw m4, m0 | |
20525 pmaddubsw m5, m1 | |
20526 pmulhrsw m4, m2 | |
20527 pmulhrsw m5, m2 | |
20528 packuswb m4, m5 | |
20529 movu [r0 + r1 * 2], xm4 | |
20530 vextracti128 [r0 + r3], m4, 1 | |
20531 lea r0, [r0 + r1 * 4] | |
20532 | |
; rows 4 and 5
20533 palignr m5, m6, m3, 4 | |
20534 pshufb m4, m5, m7 | |
20535 pshufb m5, m8 | |
20536 | |
20537 pmaddubsw m4, m0 | |
20538 pmaddubsw m5, m1 | |
20539 pmulhrsw m4, m2 | |
20540 pmulhrsw m5, m2 | |
20541 packuswb m4, m5 | |
20542 movu [r0], xm4 | |
20543 vextracti128 [r0 + r1], m4, 1 | |
20544 | |
; rows 6 and 7
20545 palignr m5, m6, m3, 6 | |
20546 pshufb m4, m5, m7 | |
20547 pshufb m5, m8 | |
20548 | |
20549 pmaddubsw m4, m0 | |
20550 pmaddubsw m5, m1 | |
20551 pmulhrsw m4, m2 | |
20552 pmulhrsw m5, m2 | |
20553 packuswb m4, m5 | |
20554 movu [r0 + r1 * 2], xm4 | |
20555 vextracti128 [r0 + r3], m4, 1 | |
20556 lea r0, [r0 + r1 * 4] | |
20557 | |
; rows 8 and 9
20558 palignr m5, m6, m3, 8 | |
20559 pshufb m4, m5, m7 | |
20560 pshufb m5, m8 | |
20561 | |
20562 pmaddubsw m4, m0 | |
20563 pmaddubsw m5, m1 | |
20564 pmulhrsw m4, m2 | |
20565 pmulhrsw m5, m2 | |
20566 packuswb m4, m5 | |
20567 movu [r0], xm4 | |
20568 vextracti128 [r0 + r1], m4, 1 | |
20569 | |
; rows 10 and 11
20570 palignr m5, m6, m3, 10 | |
20571 pshufb m4, m5, m7 | |
20572 pshufb m5, m8 | |
20573 | |
20574 pmaddubsw m4, m0 | |
20575 pmaddubsw m5, m1 | |
20576 pmulhrsw m4, m2 | |
20577 pmulhrsw m5, m2 | |
20578 packuswb m4, m5 | |
20579 movu [r0 + r1 * 2], xm4 | |
20580 vextracti128 [r0 + r3], m4, 1 | |
20581 lea r0, [r0 + r1 * 4] | |
20582 | |
; rows 12 and 13
20583 palignr m5, m6, m3, 12 | |
20584 pshufb m4, m5, m7 | |
20585 pshufb m5, m8 | |
20586 | |
20587 pmaddubsw m4, m0 | |
20588 pmaddubsw m5, m1 | |
20589 pmulhrsw m4, m2 | |
20590 pmulhrsw m5, m2 | |
20591 packuswb m4, m5 | |
20592 movu [r0], xm4 | |
20593 vextracti128 [r0 + r1], m4, 1 | |
20594 | |
; rows 14 and 15
20595 palignr m5, m6, m3, 14 | |
20596 pshufb m4, m5, m7 | |
20597 pshufb m5, m8 | |
20598 | |
20599 pmaddubsw m4, m0 | |
20600 pmaddubsw m5, m1 | |
20601 pmulhrsw m4, m2 | |
20602 pmulhrsw m5, m2 | |
20603 packuswb m4, m5 | |
20604 movu [r0 + r1 * 2], xm4 | |
20605 vextracti128 [r0 + r3], m4, 1 | |
20606 RET | |
20607 | |
; intra_pred_ang16_11 (AVX2): writes a 16x16 byte block, two rows per step.
; Register roles as used below:
;   r0 = output pointer, r1 = output row pitch, r2 = source byte buffer,
;   r3 = 3 * r1
;   m0/m1 = the two 16-byte halves of angHor_tab_11 broadcast to both lanes
;           (m0 weights bytes 0-7 of a row, m1 bytes 8-15)
;   m2 = rounding multiplier from pw_1024
;        ((x + 16) >> 5 if every word is 1024 - TODO confirm)
;   m7 = single shuffle mask (ang32_shuf_mode9); one shuffled vector feeds
;        both half-row multiplies
;   m3 = base source window, m6 = the 16 bytes at r2 + mmsize + 16
; Each step slides the window by 2 more bytes (palignr m6:m3); after the
; final pack the low lane is the even row and the high lane the odd row.
20608 INIT_YMM avx2 | |
20609 cglobal intra_pred_ang16_11, 3,4,8 | |
20610 vbroadcasti128 m0, [angHor_tab_11] | |
20611 vbroadcasti128 m1, [angHor_tab_11 + mmsize/2] | |
20612 mova m2, [pw_1024] | |
20613 mova m7, [ang32_shuf_mode9] | |
20614 lea r3, [r1 * 3] | |
20615 | |
20616 ; prepare for [0 -1 -2...] | |
20617 | |
; xm3 = 16 bytes at r2 + mmsize with byte 0 replaced by [r2]; both lanes of m3.
20618 movu xm3, [r2 + mmsize] | |
20619 pinsrb xm3, [r2], 0 | |
20620 vbroadcasti128 m6, [r2 + mmsize + 16] | |
20621 vinserti128 m3, m3, xm3, 1 | |
20622 | |
; rows 0 and 1
20623 pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2] | |
20624 pmaddubsw m4, m5, m0 | |
20625 pmaddubsw m5, m1 | |
20626 pmulhrsw m4, m2 | |
20627 pmulhrsw m5, m2 | |
20628 packuswb m4, m5 | |
20629 movu [r0], xm4 | |
20630 vextracti128 [r0 + r1], m4, 1 | |
20631 | |
; rows 2 and 3
20632 palignr m5, m6, m3, 2 | |
20633 pshufb m5, m7 | |
20634 pmaddubsw m4, m5, m0 | |
20635 pmaddubsw m5, m1 | |
20636 pmulhrsw m4, m2 | |
20637 pmulhrsw m5, m2 | |
20638 packuswb m4, m5 | |
20639 movu [r0 + r1 * 2], xm4 | |
20640 vextracti128 [r0 + r3], m4, 1 | |
20641 | |
20642 lea r0, [r0 + r1 * 4] | |
20643 | |
; rows 4 and 5
20644 palignr m5, m6, m3, 4 | |
20645 pshufb m5, m7 | |
20646 pmaddubsw m4, m5, m0 | |
20647 pmaddubsw m5, m1 | |
20648 pmulhrsw m4, m2 | |
20649 pmulhrsw m5, m2 | |
20650 packuswb m4, m5 | |
20651 movu [r0], xm4 | |
20652 vextracti128 [r0 + r1], m4, 1 | |
20653 | |
; rows 6 and 7
20654 palignr m5, m6, m3, 6 | |
20655 pshufb m5, m7 | |
20656 pmaddubsw m4, m5, m0 | |
20657 pmaddubsw m5, m1 | |
20658 pmulhrsw m4, m2 | |
20659 pmulhrsw m5, m2 | |
20660 packuswb m4, m5 | |
20661 movu [r0 + r1 * 2], xm4 | |
20662 vextracti128 [r0 + r3], m4, 1 | |
20663 | |
20664 lea r0, [r0 + r1 * 4] | |
20665 | |
; rows 8 and 9
20666 palignr m5, m6, m3, 8 | |
20667 pshufb m5, m7 | |
20668 pmaddubsw m4, m5, m0 | |
20669 pmaddubsw m5, m1 | |
20670 pmulhrsw m4, m2 | |
20671 pmulhrsw m5, m2 | |
20672 packuswb m4, m5 | |
20673 movu [r0], xm4 | |
20674 vextracti128 [r0 + r1], m4, 1 | |
20675 | |
; rows 10 and 11
20676 palignr m5, m6, m3, 10 | |
20677 pshufb m5, m7 | |
20678 pmaddubsw m4, m5, m0 | |
20679 pmaddubsw m5, m1 | |
20680 pmulhrsw m4, m2 | |
20681 pmulhrsw m5, m2 | |
20682 packuswb m4, m5 | |
20683 movu [r0 + r1 * 2], xm4 | |
20684 vextracti128 [r0 + r3], m4, 1 | |
20685 | |
20686 lea r0, [r0 + r1 * 4] | |
20687 | |
; rows 12 and 13
20688 palignr m5, m6, m3, 12 | |
20689 pshufb m5, m7 | |
20690 pmaddubsw m4, m5, m0 | |
20691 pmaddubsw m5, m1 | |
20692 pmulhrsw m4, m2 | |
20693 pmulhrsw m5, m2 | |
20694 packuswb m4, m5 | |
20695 movu [r0], xm4 | |
20696 vextracti128 [r0 + r1], m4, 1 | |
20697 | |
; rows 14 and 15
20698 palignr m5, m6, m3, 14 | |
20699 pshufb m5, m7 | |
20700 pmaddubsw m4, m5, m0 | |
20701 pmaddubsw m5, m1 | |
20702 pmulhrsw m4, m2 | |
20703 pmulhrsw m5, m2 | |
20704 packuswb m4, m5 | |
20705 movu [r0 + r1 * 2], xm4 | |
20706 vextracti128 [r0 + r3], m4, 1 | |
20707 RET | |
20708 | |
20709 | |
20710 ; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm | |
20711 %if ARCH_X86_64 == 1 | |
20712 INIT_YMM avx2 | |
; TRANSPOSE_STORE_8x32 d0, d1, d2, d3, d4, d5, d6, d7, t0, t1, t2, t3
;   %1..%8  = register numbers of the eight 32-byte data vectors
;   %9..%12 = register numbers of four scratch vectors
; Writes sixteen 16-byte rows starting at r0. The carry flag selects the path:
;   CF = 0 : interleave the data at byte/word/dword granularity before storing
;   CF = 1 : (.skip) reorder quadwords with vpermq q3120 and store directly
; Expects r1 = row pitch, r5 = 3 * r1, and r6 as the step applied after every
; four rows (callers in this file load it with 4 * r1). r0 is advanced by
; 3 * r6 and the data registers are overwritten on both paths.
; The macro defines the local labels .skip and .end, so it can be expanded
; only once within the scope of a given non-local label.
20713 %macro TRANSPOSE_STORE_8x32 12 | |
20714 jc .skip | |
20715 | |
; First four inputs: bytes -> words -> dwords.
20716 punpcklbw m%9, m%1, m%2 | |
20717 punpckhbw m%1, m%2 | |
20718 punpcklbw m%10, m%3, m%4 | |
20719 punpckhbw m%3, m%4 | |
20720 | |
20721 punpcklwd m%11, m%9, m%10 | |
20722 punpckhwd m%9, m%10 | |
20723 punpcklwd m%10, m%1, m%3 | |
20724 punpckhwd m%1, m%3 | |
20725 | |
20726 punpckldq m%12, m%11, m%10 | |
20727 punpckhdq m%11, m%10 | |
20728 punpckldq m%10, m%9, m%1 | |
20729 punpckhdq m%9, m%1 | |
20730 | |
; Last four inputs: same three stages, reusing %1..%4 as temporaries.
20731 punpcklbw m%1, m%5, m%6 | |
20732 punpckhbw m%5, m%6 | |
20733 punpcklbw m%2, m%7, m%8 | |
20734 punpckhbw m%7, m%8 | |
20735 | |
20736 punpcklwd m%3, m%1, m%2 | |
20737 punpckhwd m%1, m%2 | |
20738 punpcklwd m%4, m%5, m%7 | |
20739 punpckhwd m%5, m%7 | |
20740 | |
20741 punpckldq m%2, m%3, m%4 | |
20742 punpckhdq m%3, m%4 | |
20743 punpckldq m%4, m%1, m%5 | |
20744 punpckhdq m%1, m%5 | |
20745 | |
; Combine the two halves into the final row vectors.
20746 punpckldq m%5, m%12, m%2 | |
20747 punpckhdq m%6, m%12, m%2 | |
20748 punpckldq m%7, m%10, m%4 | |
20749 punpckhdq m%8, m%10, m%4 | |
20750 | |
20751 punpckldq m%2, m%11, m%3 | |
20752 punpckhdq m%11, m%11, m%3 | |
20753 punpckldq m%4, m%9, m%1 | |
20754 punpckhdq m%9, m%9, m%1 | |
20755 | |
; Low 128-bit halves supply rows 0-7, high halves rows 8-15.
20756 movu [r0 + r1 * 0], xm%5 | |
20757 movu [r0 + r1 * 1], xm%6 | |
20758 movu [r0 + r1 * 2], xm%2 | |
20759 movu [r0 + r5 * 1], xm%11 | |
20760 | |
20761 add r0, r6 | |
20762 | |
20763 movu [r0 + r1 * 0], xm%7 | |
20764 movu [r0 + r1 * 1], xm%8 | |
20765 movu [r0 + r1 * 2], xm%4 | |
20766 movu [r0 + r5 * 1], xm%9 | |
20767 | |
20768 add r0, r6 | |
20769 | |
20770 vextracti128 [r0 + r1 * 0], m%5, 1 | |
20771 vextracti128 [r0 + r1 * 1], m%6, 1 | |
20772 vextracti128 [r0 + r1 * 2], m%2, 1 | |
20773 vextracti128 [r0 + r5 * 1], m%11, 1 | |
20774 | |
20775 add r0, r6 | |
20776 | |
20777 vextracti128 [r0 + r1 * 0], m%7, 1 | |
20778 vextracti128 [r0 + r1 * 1], m%8, 1 | |
20779 vextracti128 [r0 + r1 * 2], m%4, 1 | |
20780 vextracti128 [r0 + r5 * 1], m%9, 1 | |
20781 jmp .end | |
20782 | |
; Direct path: q3120 swaps the two middle quadwords so each 128-bit half
; becomes one contiguous row; low halves go to rows 0-7, high to rows 8-15.
20783 .skip: | |
20784 vpermq m%1, m%1, q3120 | |
20785 vpermq m%2, m%2, q3120 | |
20786 vpermq m%3, m%3, q3120 | |
20787 vpermq m%4, m%4, q3120 | |
20788 vpermq m%5, m%5, q3120 | |
20789 vpermq m%6, m%6, q3120 | |
20790 vpermq m%7, m%7, q3120 | |
20791 vpermq m%8, m%8, q3120 | |
20792 | |
20793 movu [r0 + r1 * 0], xm%1 | |
20794 movu [r0 + r1 * 1], xm%2 | |
20795 movu [r0 + r1 * 2], xm%3 | |
20796 movu [r0 + r5 * 1], xm%4 | |
20797 | |
20798 add r0, r6 | |
20799 | |
20800 movu [r0 + r1 * 0], xm%5 | |
20801 movu [r0 + r1 * 1], xm%6 | |
20802 movu [r0 + r1 * 2], xm%7 | |
20803 movu [r0 + r5 * 1], xm%8 | |
20804 | |
20805 add r0, r6 | |
20806 | |
20807 vextracti128 [r0 + r1 * 0], m%1, 1 | |
20808 vextracti128 [r0 + r1 * 1], m%2, 1 | |
20809 vextracti128 [r0 + r1 * 2], m%3, 1 | |
20810 vextracti128 [r0 + r5 * 1], m%4, 1 | |
20811 | |
20812 add r0, r6 | |
20813 | |
20814 vextracti128 [r0 + r1 * 0], m%5, 1 | |
20815 vextracti128 [r0 + r1 * 1], m%6, 1 | |
20816 vextracti128 [r0 + r1 * 2], m%7, 1 | |
20817 vextracti128 [r0 + r5 * 1], m%8, 1 | |
20818 .end: | |
20819 %endmacro | |
20820 | |
; ang16_mode_3_33: shared body of intra_pred_ang16_3 and intra_pred_ang16_33.
; Entered with a plain call, so it ends in a bare ret and has no prologue.
; Inputs set up by the caller:
;   r0 = output pointer, r1 = output row pitch
;   r2 = source byte buffer (samples are read from r2 + 1 onwards)
;   r3 = coefficient table pointer biased by 16 entries (callers pass
;        ang_table_avx2 + 16 * 32), so [r3 + k * 32] is table entry 16 + k;
;        the bracketed number after each multiply is that entry index
;   r5 = 3 * r1, r6 = 4 * r1 (consumed by TRANSPOSE_STORE_8x32)
;   m7 = rounding multiplier from pw_1024
;   CF = store-path selector for TRANSPOSE_STORE_8x32; no instruction between
;        entry and the macro's jc modifies the flags
; Uses m0..m6 and m8..m12. Rows 0-7 are computed as words first; each of
; rows 8-15 is then packed into the same register as the row eight above it.
20821 cglobal ang16_mode_3_33 | |
20822 ; rows 0 to 7 | |
20823 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
20824 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
20825 | |
20826 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
20827 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
20828 vextracti128 xm1, m0, 1 | |
20829 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
20830 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
20831 | |
20832 pmaddubsw m4, m0, [r3 + 10 * 32] ; [26] | |
20833 pmulhrsw m4, m7 | |
20834 | |
20835 palignr m5, m2, m0, 2 | |
20836 pmaddubsw m5, [r3 + 4 * 32] ; [20] | |
20837 pmulhrsw m5, m7 | |
20838 | |
20839 palignr m6, m2, m0, 4 | |
20840 palignr m8, m2, m0, 6 | |
20841 pmaddubsw m6, [r3 - 2 * 32] ; [14] | |
20842 pmulhrsw m6, m7 | |
20843 pmaddubsw m8, [r3 - 8 * 32] ; [8] | |
20844 pmulhrsw m8, m7 | |
20845 | |
20846 palignr m10, m2, m0, 8 | |
20847 pmaddubsw m9, m10, [r3 - 14 * 32] ; [2] | |
20848 pmulhrsw m9, m7 | |
20849 pmaddubsw m10, [r3 + 12 * 32] ; [28] | |
20850 pmulhrsw m10, m7 | |
20851 | |
20852 palignr m11, m2, m0, 10 | |
20853 palignr m12, m2, m0, 12 | |
20854 pmaddubsw m11, [r3 + 6 * 32] ; [22] | |
20855 pmulhrsw m11, m7 | |
20856 pmaddubsw m12, [r3] ; [16] | |
20857 pmulhrsw m12, m7 | |
20858 | |
20859 ; rows 8 to 15 | |
20860 palignr m3, m2, m0, 14 | |
20862 pmaddubsw m3, [r3 - 6 * 32] ; [10] | |
20863 pmulhrsw m3, m7 | |
20864 packuswb m4, m3 | |
20865 | |
20866 pmaddubsw m3, m2, [r3 - 12 * 32] ; [4] | |
20867 pmulhrsw m3, m7 | |
20868 packuswb m5, m3 | |
20869 | |
20870 pmaddubsw m3, m2, [r3 + 14 * 32] ; [30] | |
20871 pmulhrsw m3, m7 | |
20872 packuswb m6, m3 | |
20873 | |
; Build m1 = m2 advanced by 16 bytes of pairs: the new pairs come from
; r2 + 25 / r2 + 26 and the halves are swapped so that palignr m1:m2 keeps
; sliding the same window.
20874 movu xm0, [r2 + 25] | |
20875 movu xm1, [r2 + 26] | |
20876 punpcklbw m0, m1 | |
20877 mova m1, m2 | |
20878 vinserti128 m1, m1, xm0, 0 | |
20879 vpermq m1, m1, 01001110b | |
20880 | |
20881 palignr m3, m1, m2, 2 | |
20882 pmaddubsw m3, [r3 + 8 * 32] ; [24] | |
20883 pmulhrsw m3, m7 | |
20884 packuswb m8, m3 | |
20885 | |
20886 palignr m3, m1, m2, 4 | |
20887 pmaddubsw m3, [r3 + 2 * 32] ; [18] | |
20888 pmulhrsw m3, m7 | |
20889 packuswb m9, m3 | |
20890 | |
20891 palignr m3, m1, m2, 6 | |
20892 pmaddubsw m3, [r3 - 4 * 32] ; [12] | |
20893 pmulhrsw m3, m7 | |
20894 packuswb m10, m3 | |
20895 | |
20896 palignr m3, m1, m2, 8 | |
20897 pmaddubsw m3, [r3 - 10 * 32] ; [6] | |
20898 pmulhrsw m3, m7 | |
20899 packuswb m11, m3 | |
20900 | |
; Last row needs no weighting: the 16 bytes at r2 + 14 are widened to words
; only so they can be packed alongside m12.
20901 pmovzxbw m1, [r2 + 14] | |
20902 packuswb m12, m1 | |
20903 | |
20904 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 | |
20905 ret | |
20906 | |
; intra_pred_ang16_3: AVX2 entry point that prepares registers and then
; runs the shared body ang16_mode_3_33.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
; r1 is the stride (see the r5/r6 comments below); r2 is the pointer the
; shared body loads source bytes from.
20907 INIT_YMM avx2 | |
20908 cglobal intra_pred_ang16_3, 3, 7, 13 | |
; Move the source pointer forward by 32 bytes; the mode-33 entry point
; does not do this.
20909 add r2, 32 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
20910 lea r3, [ang_table_avx2 + 16 * 32] | |
20911 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
20912 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the shared body.
20913 mova m7, [pw_1024] | |
; Carry is cleared here and set by the mode-33 entry point. It must be
; written after the add above, which modifies flags.
; NOTE(review): presumably the store macro in the shared body branches on
; carry to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
20914 clc | |
20915 | |
20916 call ang16_mode_3_33 | |
20917 RET | |
20918 | |
; intra_pred_ang16_33: AVX2 entry point that prepares registers and then
; runs the shared body ang16_mode_3_33.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
; Unlike the mode-3 entry point, r2 is passed through unchanged and the
; carry flag is set instead of cleared.
20919 INIT_YMM avx2 | |
20920 cglobal intra_pred_ang16_33, 3, 7, 13 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
20921 lea r3, [ang_table_avx2 + 16 * 32] | |
20922 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
20923 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the shared body.
20924 mova m7, [pw_1024] | |
; NOTE(review): presumably the store macro in the shared body branches on
; carry to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
20925 stc | |
20926 | |
20927 call ang16_mode_3_33 | |
20928 RET | |
20929 | |
; ang16_mode_4_32: shared body for intra_pred_ang16_4 (carry clear, r2
; advanced by 32) and intra_pred_ang16_32 (carry set).
; Inputs prepared by the caller:
;   r2 = source pointer, r3 = ang_table_avx2 + 16 * 32, m7 = [pw_1024].
;   r5/r6 (3x/4x stride) and the carry flag are not touched here; only
;   vector instructions are used, so carry is still intact at the store macro.
;   NOTE(review): presumably TRANSPOSE_STORE_8x32 consumes them -- confirm.
; Every result is pmaddubsw(byte pairs, table row) followed by pmulhrsw by
; m7; the bracketed number in each comment is the table row index.
; packuswb then places a "rows 8 to 15" result into the upper half of each
; lane of the matching "rows 0 to 7" register.
20930 cglobal ang16_mode_4_32 | |
20931 ; rows 0 to 7 | |
20932 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
20933 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
20934 | |
; Interleave neighbouring bytes, then regroup the lanes so that m0 holds
; pairs 1..17 and m2 holds pairs 9..25; palignr m2:m0 by 2*k then gives
; the pairs starting k bytes further along.
20935 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
20936 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
20937 vextracti128 xm1, m0, 1 | |
20938 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
20939 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
20940 | |
20941 pmaddubsw m4, m0, [r3 + 5 * 32] ; [21] | |
20942 pmulhrsw m4, m7 | |
20943 | |
20944 palignr m1, m2, m0, 2 | |
20945 pmaddubsw m5, m1, [r3 - 6 * 32] ; [10] | |
20946 pmulhrsw m5, m7 | |
20947 | |
20948 palignr m8, m2, m0, 4 | |
20949 pmaddubsw m6, m1, [r3 + 15 * 32] ; [31] | |
20950 pmulhrsw m6, m7 | |
20951 pmaddubsw m8, [r3 + 4 * 32] ; [20] | |
20952 pmulhrsw m8, m7 | |
20953 | |
20954 palignr m10, m2, m0, 6 | |
20955 pmaddubsw m9, m10, [r3 - 7 * 32] ; [9] | |
20956 pmulhrsw m9, m7 | |
20957 pmaddubsw m10, [r3 + 14 * 32] ; [30] | |
20958 pmulhrsw m10, m7 | |
20959 | |
20960 palignr m11, m2, m0, 8 | |
20961 palignr m1, m2, m0, 10 | |
20962 pmaddubsw m11, [r3 + 3 * 32] ; [19] | |
20963 pmulhrsw m11, m7 | |
20964 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8] | |
20965 pmulhrsw m12, m7 | |
20966 | |
20967 ; rows 8 to 15 | |
20968 pmaddubsw m3, m1, [r3 + 13 * 32] ; [29] | |
20969 pmulhrsw m3, m7 | |
20970 packuswb m4, m3 | |
20971 | |
20972 palignr m3, m2, m0, 12 | |
20973 pmaddubsw m3, m3, [r3 + 2 * 32] ; [18] | |
20974 pmulhrsw m3, m7 | |
20975 packuswb m5, m3 | |
20976 | |
20977 palignr m1, m2, m0, 14 | |
20978 pmaddubsw m3, m1, [r3 - 9 * 32] ; [7] | |
20979 pmulhrsw m3, m7 | |
20980 packuswb m6, m3 | |
20981 | |
20982 pmaddubsw m3, m1, [r3 + 12 * 32] ; [28] | |
20983 pmulhrsw m3, m7 | |
20984 packuswb m8, m3 | |
20985 | |
; A per-lane palignr of m2:m0 by 16 bytes yields m2 unchanged, so m2 is
; fed to pmaddubsw directly instead of being copied through palignr first.
20987 pmaddubsw m3, m2, [r3 + 1 * 32] ; [17] | |
20988 pmulhrsw m3, m7 | |
20989 packuswb m9, m3 | |
20990 | |
; Build the next window: interleave bytes from r2 + 25 / r2 + 26, place
; them in m1 alongside the upper half of m2, and swap lanes so that
; palignr m1:m2 continues the sequence beyond pair 25.
20991 movu xm0, [r2 + 25] | |
20992 movu xm1, [r2 + 26] | |
20993 punpcklbw m0, m1 | |
20994 mova m1, m2 | |
20995 vinserti128 m1, m1, xm0, 0 | |
20996 vpermq m1, m1, 01001110b | |
20997 | |
20998 palignr m0, m1, m2, 2 | |
20999 pmaddubsw m3, m0, [r3 - 10 * 32] ; [6] | |
21000 pmulhrsw m3, m7 | |
21001 packuswb m10, m3 | |
21002 | |
21003 pmaddubsw m3, m0, [r3 + 11 * 32] ; [27] | |
21004 pmulhrsw m3, m7 | |
21005 packuswb m11, m3 | |
21006 | |
21007 palignr m1, m1, m2, 4 | |
21008 pmaddubsw m1, [r3] ; [16] | |
21009 pmulhrsw m1, m7 | |
21010 packuswb m12, m1 | |
21011 | |
; Hand the eight packed result registers to the store macro; m0-m3 are
; passed as the remaining arguments (presumably scratch -- confirm).
21012 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 | |
21013 ret | |
21014 | |
; intra_pred_ang16_4: AVX2 entry point that prepares registers and then
; runs the shared body ang16_mode_4_32.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
21015 INIT_YMM avx2 | |
21016 cglobal intra_pred_ang16_4, 3, 7, 13 | |
; Move the source pointer forward by 32 bytes; the mode-32 entry point
; does not do this.
21017 add r2, 32 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
21018 lea r3, [ang_table_avx2 + 16 * 32] | |
21019 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
21020 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the shared body.
21021 mova m7, [pw_1024] | |
; Carry is cleared here and set by the mode-32 entry point. It must be
; written after the add above, which modifies flags.
; NOTE(review): presumably the store macro in the shared body branches on
; carry to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
21022 clc | |
21023 | |
21024 call ang16_mode_4_32 | |
21025 RET | |
21026 | |
; intra_pred_ang16_32: AVX2 entry point that prepares registers and then
; runs the shared body ang16_mode_4_32.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
; Unlike the mode-4 entry point, r2 is passed through unchanged and the
; carry flag is set instead of cleared.
21027 INIT_YMM avx2 | |
21028 cglobal intra_pred_ang16_32, 3, 7, 13 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
21029 lea r3, [ang_table_avx2 + 16 * 32] | |
21030 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
21031 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the shared body.
21032 mova m7, [pw_1024] | |
; NOTE(review): presumably the store macro in the shared body branches on
; carry to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
21033 stc | |
21034 | |
21035 call ang16_mode_4_32 | |
21036 RET | |
21037 | |
; ang16_mode_5: body called by intra_pred_ang16_5.
; Inputs prepared by the caller:
;   r2 = source pointer, r3 = ang_table_avx2 + 16 * 32, m7 = [pw_1024].
;   r5/r6 (3x/4x stride) and the carry flag are not touched here; only
;   vector instructions are used, so carry is still intact at the store macro.
;   NOTE(review): presumably TRANSPOSE_STORE_8x32 consumes them -- confirm.
; Every result is pmaddubsw(byte pairs, table row) followed by pmulhrsw by
; m7; the bracketed number in each comment is the table row index.
; packuswb then places a "rows 8 to 15" result into the upper half of each
; lane of the matching "rows 0 to 7" register.
21038 cglobal ang16_mode_5 | |
21039 ; rows 0 to 7 | |
21040 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
21041 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
21042 | |
; Interleave neighbouring bytes, then regroup the lanes so that m0 holds
; pairs 1..17 and m2 holds pairs 9..25; palignr m2:m0 by 2*k then gives
; the pairs starting k bytes further along.
21043 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21044 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21045 vextracti128 xm1, m0, 1 | |
21046 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21047 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21048 | |
21049 pmaddubsw m4, m0, [r3 + 1 * 32] ; [17] | |
21050 pmulhrsw m4, m7 | |
21051 | |
21052 palignr m1, m2, m0, 2 | |
21053 pmaddubsw m5, m1, [r3 - 14 * 32] ; [2] | |
21054 pmulhrsw m5, m7 | |
21055 | |
21056 palignr m3, m2, m0, 4 | |
21057 pmaddubsw m6, m1, [r3 + 3 * 32] ; [19] | |
21058 pmulhrsw m6, m7 | |
21059 pmaddubsw m8, m3, [r3 - 12 * 32] ; [4] | |
21060 pmulhrsw m8, m7 | |
21061 pmaddubsw m9, m3, [r3 + 5 * 32] ; [21] | |
21062 pmulhrsw m9, m7 | |
21063 | |
21064 palignr m3, m2, m0, 6 | |
21065 pmaddubsw m10, m3, [r3 - 10 * 32] ; [6] | |
21066 pmulhrsw m10, m7 | |
21067 | |
21068 palignr m1, m2, m0, 8 | |
21069 pmaddubsw m11, m3, [r3 + 7 * 32] ; [23] | |
21070 pmulhrsw m11, m7 | |
21071 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8] | |
21072 pmulhrsw m12, m7 | |
21073 | |
21074 ; rows 8 to 15 | |
21075 pmaddubsw m3, m1, [r3 + 9 * 32] ; [25] | |
21076 pmulhrsw m3, m7 | |
21077 packuswb m4, m3 | |
21078 | |
21079 palignr m1, m2, m0, 10 | |
21080 pmaddubsw m3, m1, [r3 - 6 * 32] ; [10] | |
21081 pmulhrsw m3, m7 | |
21082 packuswb m5, m3 | |
21083 | |
21084 pmaddubsw m3, m1, [r3 + 11 * 32] ; [27] | |
21085 pmulhrsw m3, m7 | |
21086 packuswb m6, m3 | |
21087 | |
21088 palignr m1, m2, m0, 12 | |
21089 pmaddubsw m3, m1, [r3 - 4 * 32] ; [12] | |
21090 pmulhrsw m3, m7 | |
21091 packuswb m8, m3 | |
21092 | |
21093 pmaddubsw m3, m1, [r3 + 13 * 32] ; [29] | |
21094 pmulhrsw m3, m7 | |
21095 packuswb m9, m3 | |
21096 | |
21097 palignr m1, m2, m0, 14 | |
21098 pmaddubsw m3, m1, [r3 - 2 * 32] ; [14] | |
21099 pmulhrsw m3, m7 | |
21100 packuswb m10, m3 | |
21101 | |
21102 pmaddubsw m3, m1, [r3 + 15 * 32] ; [31] | |
21103 pmulhrsw m3, m7 | |
21104 packuswb m11, m3 | |
21105 | |
; A per-lane palignr of m2:m0 by 16 bytes yields m2 unchanged, so m2 is
; fed to pmaddubsw directly instead of being copied through palignr first.
21107 pmaddubsw m1, m2, [r3] ; [16] | |
21108 pmulhrsw m1, m7 | |
21109 packuswb m12, m1 | |
21110 | |
; Hand the eight packed result registers to the store macro; m0-m3 are
; passed as the remaining arguments (presumably scratch -- confirm).
21111 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 | |
21112 ret | |
21113 | |
; intra_pred_ang16_5: AVX2 entry point that prepares registers and then
; runs ang16_mode_5.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
21114 INIT_YMM avx2 | |
21115 cglobal intra_pred_ang16_5, 3, 7, 13 | |
; Move the source pointer forward by 32 bytes before the body reads it.
21116 add r2, 32 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
21117 lea r3, [ang_table_avx2 + 16 * 32] | |
21118 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
21119 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the body.
21120 mova m7, [pw_1024] | |
; Carry must be cleared after the add above, which modifies flags.
; NOTE(review): presumably the store macro in the body branches on carry
; to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
21121 clc | |
21122 | |
21123 call ang16_mode_5 | |
21124 RET | |
21125 | |
; ang16_mode_6: body called by intra_pred_ang16_6.
; Inputs prepared by the caller:
;   r2 = source pointer, r3 = ang_table_avx2 + 16 * 32, m7 = [pw_1024].
;   r5/r6 (3x/4x stride) and the carry flag are not touched here; only
;   vector instructions are used, so carry is still intact at the store macro.
;   NOTE(review): presumably TRANSPOSE_STORE_8x32 consumes them -- confirm.
; Every result is pmaddubsw(byte pairs, table row) followed by pmulhrsw by
; m7; the bracketed number in each comment is the table row index.
; packuswb then places a "rows 8 to 15" result into the upper half of each
; lane of the matching "rows 0 to 7" register.
21126 cglobal ang16_mode_6 | |
21127 ; rows 0 to 7 | |
21128 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
21129 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
21130 | |
; Interleave neighbouring bytes, then regroup the lanes so that m0 holds
; pairs 1..17 and m2 holds pairs 9..25; palignr m2:m0 by 2*k then gives
; the pairs starting k bytes further along.
21131 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21132 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21133 vextracti128 xm1, m0, 1 | |
21134 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21135 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21136 | |
21137 pmaddubsw m4, m0, [r3 - 3 * 32] ; [13] | |
21138 pmulhrsw m4, m7 | |
21139 | |
21140 pmaddubsw m5, m0, [r3 + 10 * 32] ; [26] | |
21141 pmulhrsw m5, m7 | |
21142 | |
21143 palignr m3, m2, m0, 2 | |
21144 pmaddubsw m6, m3, [r3 - 9 * 32] ; [7] | |
21145 pmulhrsw m6, m7 | |
21146 pmaddubsw m8, m3, [r3 + 4 * 32] ; [20] | |
21147 pmulhrsw m8, m7 | |
21148 | |
21149 palignr m3, m2, m0, 4 | |
21150 pmaddubsw m9, m3, [r3 - 15 * 32] ; [1] | |
21151 pmulhrsw m9, m7 | |
21152 | |
21153 pmaddubsw m10, m3, [r3 - 2 * 32] ; [14] | |
21154 pmulhrsw m10, m7 | |
21155 | |
21156 pmaddubsw m11, m3, [r3 + 11 * 32] ; [27] | |
21157 pmulhrsw m11, m7 | |
21158 | |
21159 palignr m1, m2, m0, 6 | |
21160 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8] | |
21161 pmulhrsw m12, m7 | |
21162 | |
21163 ; rows 8 to 15 | |
21164 pmaddubsw m3, m1, [r3 + 5 * 32] ; [21] | |
21165 pmulhrsw m3, m7 | |
21166 packuswb m4, m3 | |
21167 | |
21168 palignr m1, m2, m0, 8 | |
21169 pmaddubsw m3, m1, [r3 - 14 * 32] ; [2] | |
21170 pmulhrsw m3, m7 | |
21171 packuswb m5, m3 | |
21172 | |
21173 pmaddubsw m3, m1, [r3 - 1 * 32] ; [15] | |
21174 pmulhrsw m3, m7 | |
21175 packuswb m6, m3 | |
21176 | |
21177 pmaddubsw m3, m1, [r3 + 12 * 32] ; [28] | |
21178 pmulhrsw m3, m7 | |
21179 packuswb m8, m3 | |
21180 | |
21181 palignr m1, m2, m0, 10 | |
21182 pmaddubsw m3, m1, [r3 - 7 * 32] ; [9] | |
21183 pmulhrsw m3, m7 | |
21184 packuswb m9, m3 | |
21185 | |
21186 pmaddubsw m3, m1, [r3 + 6 * 32] ; [22] | |
21187 pmulhrsw m3, m7 | |
21188 packuswb m10, m3 | |
21189 | |
; The same shifted window (m1) serves both of the last two results.
21190 palignr m1, m2, m0, 12 | |
21191 pmaddubsw m3, m1, [r3 - 13 * 32] ; [3] | |
21192 pmulhrsw m3, m7 | |
21193 packuswb m11, m3 | |
21194 | |
21195 pmaddubsw m1, [r3] ; [16] | |
21196 pmulhrsw m1, m7 | |
21197 packuswb m12, m1 | |
21198 | |
; Hand the eight packed result registers to the store macro; m0-m3 are
; passed as the remaining arguments (presumably scratch -- confirm).
21199 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 | |
21200 ret | |
21201 | |
; intra_pred_ang16_6: AVX2 entry point that prepares registers and then
; runs ang16_mode_6.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
21202 INIT_YMM avx2 | |
21203 cglobal intra_pred_ang16_6, 3, 7, 13 | |
; Move the source pointer forward by 32 bytes before the body reads it.
21204 add r2, 32 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
21205 lea r3, [ang_table_avx2 + 16 * 32] | |
21206 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
21207 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the body.
21208 mova m7, [pw_1024] | |
; Carry must be cleared after the add above, which modifies flags.
; NOTE(review): presumably the store macro in the body branches on carry
; to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
21209 clc | |
21210 | |
21211 call ang16_mode_6 | |
21212 RET | |
21213 | |
; ang16_mode_7: body called by intra_pred_ang16_7.
; Inputs prepared by the caller:
;   r2 = source pointer, r3 = ang_table_avx2 + 16 * 32, m7 = [pw_1024].
;   r5/r6 (3x/4x stride) and the carry flag are not touched here; only
;   vector instructions are used, so carry is still intact at the store macro.
;   NOTE(review): presumably TRANSPOSE_STORE_8x32 consumes them -- confirm.
; Every result is pmaddubsw(byte pairs, table row) followed by pmulhrsw by
; m7; the bracketed number in each comment is the table row index.
; packuswb then places a "rows 8 to 15" result into the upper half of each
; lane of the matching "rows 0 to 7" register.
21214 cglobal ang16_mode_7 | |
21215 ; rows 0 to 7 | |
21216 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
21217 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
21218 | |
; Interleave neighbouring bytes, then regroup the lanes so that m0 holds
; pairs 1..17 and m2 holds pairs 9..25; palignr m2:m0 by 2*k then gives
; the pairs starting k bytes further along.
21219 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21220 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21221 vextracti128 xm1, m0, 1 | |
21222 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21223 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21224 | |
21225 pmaddubsw m4, m0, [r3 - 7 * 32] ; [9] | |
21226 pmulhrsw m4, m7 | |
21227 | |
21228 pmaddubsw m5, m0, [r3 + 2 * 32] ; [18] | |
21229 pmulhrsw m5, m7 | |
21230 pmaddubsw m6, m0, [r3 + 11 * 32] ; [27] | |
21231 pmulhrsw m6, m7 | |
21232 | |
21233 palignr m3, m2, m0, 2 | |
21234 pmaddubsw m8, m3, [r3 - 12 * 32] ; [4] | |
21235 pmulhrsw m8, m7 | |
21236 | |
21237 pmaddubsw m9, m3, [r3 - 3 * 32] ; [13] | |
21238 pmulhrsw m9, m7 | |
21239 | |
21240 pmaddubsw m10, m3, [r3 + 6 * 32] ; [22] | |
21241 pmulhrsw m10, m7 | |
21242 | |
21243 pmaddubsw m11, m3, [r3 + 15 * 32] ; [31] | |
21244 pmulhrsw m11, m7 | |
21245 | |
21246 palignr m1, m2, m0, 4 | |
21247 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8] | |
21248 pmulhrsw m12, m7 | |
21249 | |
21250 ; rows 8 to 15 | |
21251 pmaddubsw m3, m1, [r3 + 1 * 32] ; [17] | |
21252 pmulhrsw m3, m7 | |
21253 packuswb m4, m3 | |
21254 | |
21255 pmaddubsw m3, m1, [r3 + 10 * 32] ; [26] | |
21256 pmulhrsw m3, m7 | |
21257 packuswb m5, m3 | |
21258 | |
21259 palignr m1, m2, m0, 6 | |
21260 pmaddubsw m3, m1, [r3 - 13 * 32] ; [3] | |
21261 pmulhrsw m3, m7 | |
21262 packuswb m6, m3 | |
21263 | |
21264 pmaddubsw m3, m1, [r3 - 4 * 32] ; [12] | |
21265 pmulhrsw m3, m7 | |
21266 packuswb m8, m3 | |
21267 | |
21268 pmaddubsw m3, m1, [r3 + 5 * 32] ; [21] | |
21269 pmulhrsw m3, m7 | |
21270 packuswb m9, m3 | |
21271 | |
21272 pmaddubsw m3, m1, [r3 + 14 * 32] ; [30] | |
21273 pmulhrsw m3, m7 | |
21274 packuswb m10, m3 | |
21275 | |
; The same shifted window (m1) serves both of the last two results.
21276 palignr m1, m2, m0, 8 | |
21277 pmaddubsw m3, m1, [r3 - 9 * 32] ; [7] | |
21278 pmulhrsw m3, m7 | |
21279 packuswb m11, m3 | |
21280 | |
21281 pmaddubsw m1, [r3] ; [16] | |
21282 pmulhrsw m1, m7 | |
21283 packuswb m12, m1 | |
21284 | |
; Hand the eight packed result registers to the store macro; m0-m3 are
; passed as the remaining arguments (presumably scratch -- confirm).
21285 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 | |
21286 ret | |
21287 | |
; intra_pred_ang16_7: AVX2 entry point that prepares registers and then
; runs ang16_mode_7.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
21288 INIT_YMM avx2 | |
21289 cglobal intra_pred_ang16_7, 3, 7, 13 | |
; Move the source pointer forward by 32 bytes before the body reads it.
21290 add r2, 32 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
21291 lea r3, [ang_table_avx2 + 16 * 32] | |
21292 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
21293 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the body.
21294 mova m7, [pw_1024] | |
; Carry must be cleared after the add above, which modifies flags.
; NOTE(review): presumably the store macro in the body branches on carry
; to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
21295 clc | |
21296 | |
21297 call ang16_mode_7 | |
21298 RET | |
21299 | |
; ang16_mode_8: body called by intra_pred_ang16_8.
; Inputs prepared by the caller:
;   r2 = source pointer, r3 = ang_table_avx2 + 16 * 32, m7 = [pw_1024].
;   r5/r6 (3x/4x stride) and the carry flag are not touched here; only
;   vector instructions are used, so carry is still intact at the store macro.
;   NOTE(review): presumably TRANSPOSE_STORE_8x32 consumes them -- confirm.
; Every result is pmaddubsw(byte pairs, table row) followed by pmulhrsw by
; m7; the bracketed number in each comment is the table row index.
; packuswb then places a "rows 8 to 15" result into the upper half of each
; lane of the matching "rows 0 to 7" register.
21300 cglobal ang16_mode_8 | |
21301 ; rows 0 to 7 | |
21302 movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
21303 movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
21304 | |
; Interleave neighbouring bytes, then regroup the lanes so that m0 holds
; pairs 1..17 and m2 holds pairs 9..25; palignr m2:m0 by 2*k then gives
; the pairs starting k bytes further along.
21305 punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21306 punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21307 vextracti128 xm1, m0, 1 | |
21308 vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
21309 vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
21310 | |
; The first six results all use the unshifted window m0.
21311 pmaddubsw m4, m0, [r3 - 11 * 32] ; [5] | |
21312 pmulhrsw m4, m7 | |
21313 pmaddubsw m5, m0, [r3 - 6 * 32] ; [10] | |
21314 pmulhrsw m5, m7 | |
21315 | |
21316 pmaddubsw m6, m0, [r3 - 1 * 32] ; [15] | |
21317 pmulhrsw m6, m7 | |
21318 pmaddubsw m8, m0, [r3 + 4 * 32] ; [20] | |
21319 pmulhrsw m8, m7 | |
21320 pmaddubsw m9, m0, [r3 + 9 * 32] ; [25] | |
21321 pmulhrsw m9, m7 | |
21322 | |
21323 pmaddubsw m10, m0, [r3 + 14 * 32] ; [30] | |
21324 pmulhrsw m10, m7 | |
21325 palignr m1, m2, m0, 2 | |
21326 pmaddubsw m11, m1, [r3 - 13 * 32] ; [3] | |
21327 pmulhrsw m11, m7 | |
21328 pmaddubsw m12, m1, [r3 - 8 * 32] ; [8] | |
21329 pmulhrsw m12, m7 | |
21330 | |
21331 ; rows 8 to 15 | |
21332 pmaddubsw m3, m1, [r3 - 3 * 32] ; [13] | |
21333 pmulhrsw m3, m7 | |
21334 packuswb m4, m3 | |
21335 pmaddubsw m3, m1, [r3 + 2 * 32] ; [18] | |
21336 pmulhrsw m3, m7 | |
21337 packuswb m5, m3 | |
21338 | |
21339 pmaddubsw m3, m1, [r3 + 7 * 32] ; [23] | |
21340 pmulhrsw m3, m7 | |
21341 packuswb m6, m3 | |
21342 pmaddubsw m3, m1, [r3 + 12 * 32] ; [28] | |
21343 pmulhrsw m3, m7 | |
21344 packuswb m8, m3 | |
21345 | |
21346 palignr m1, m2, m0, 4 | |
21347 pmaddubsw m3, m1, [r3 - 15 * 32] ; [1] | |
21348 pmulhrsw m3, m7 | |
21349 packuswb m9, m3 | |
21350 pmaddubsw m3, m1, [r3 - 10 * 32] ; [6] | |
21351 pmulhrsw m3, m7 | |
21352 packuswb m10, m3 | |
21353 | |
21354 pmaddubsw m3, m1, [r3 - 5 * 32] ; [11] | |
21355 pmulhrsw m3, m7 | |
21356 packuswb m11, m3 | |
21357 pmaddubsw m1, [r3] ; [16] | |
21358 pmulhrsw m1, m7 | |
21359 packuswb m12, m1 | |
21360 | |
; Hand the eight packed result registers to the store macro; m0-m3 are
; passed as the remaining arguments (presumably scratch -- confirm).
21361 TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3 | |
21362 ret | |
21363 | |
; intra_pred_ang16_8: AVX2 entry point that prepares registers and then
; runs ang16_mode_8.
; cglobal arguments: 3 parameters (r0-r2), 7 general registers, 13 vector registers.
21364 INIT_YMM avx2 | |
21365 cglobal intra_pred_ang16_8, 3, 7, 13 | |
; Move the source pointer forward by 32 bytes before the body reads it.
21366 add r2, 32 | |
; r3 points at table row 16 (rows are 32 bytes), so offsets of
; -16..+15 rows reach rows 0..31.
21367 lea r3, [ang_table_avx2 + 16 * 32] | |
21368 lea r5, [r1 * 3] ; r5 -> 3 * stride | |
21369 lea r6, [r1 * 4] ; r6 -> 4 * stride | |
; m7 is the pmulhrsw multiplier used by the body.
21370 mova m7, [pw_1024] | |
; Carry must be cleared after the add above, which modifies flags.
; NOTE(review): presumably the store macro in the body branches on carry
; to pick the output layout -- confirm against TRANSPOSE_STORE_8x32.
21371 clc | |
21372 | |
21373 call ang16_mode_8 | |
21374 RET | |
21375 %endif ; ARCH_X86_64 | |
21376 | |
; intra_pred_ang16_9: writes 16 rows of 16 bytes to r0 (row pitch r1),
; two rows per step over eight steps.
; cglobal arguments: 3 parameters (r0-r2), 4 general registers, 8 vector registers.
; Register roles:
;   m0, m1 = first and second 16-byte halves of angHor_tab_9, each
;            broadcast to both lanes
;   m2     = [pw_1024], the pmulhrsw multiplier
;   m7     = byte shuffle ang16_shuf_mode9
;   m3, m6 = 16 source bytes from r2 + mmsize + 1 and r2 + mmsize + 17,
;            each broadcast to both lanes
;   r3     = 3 * r1
; Each step takes the m6:m3 window shifted by 0, 2, ..., 14 bytes,
; shuffles it, multiplies by both table halves, rounds, packs, and stores
; the low lane to one row and the high lane to the next.
21377 INIT_YMM avx2 | |
21378 cglobal intra_pred_ang16_9, 3,4,8 | |
21379 vbroadcasti128 m0, [angHor_tab_9] | |
21380 vbroadcasti128 m1, [angHor_tab_9 + mmsize/2] | |
21381 mova m2, [pw_1024] | |
21382 lea r3, [r1 * 3] | |
21383 mova m7, [ang16_shuf_mode9] | |
21384 | |
21385 vbroadcasti128 m6, [r2 + mmsize + 17] | |
21386 vbroadcasti128 m3, [r2 + mmsize + 1] | |
21387 | |
; Step 0 (no shift): rows 0 and 1.
21388 pshufb m5, m3, m7 | |
21389 pmaddubsw m4, m5, m0 | |
21390 pmaddubsw m5, m1 | |
21391 pmulhrsw m4, m2 | |
21392 pmulhrsw m5, m2 | |
21393 packuswb m4, m5 | |
21394 movu [r0], xm4 | |
21395 vextracti128 [r0 + r1], m4, 1 | |
21396 | |
; Step 1 (shift 2): rows 2 and 3.
21397 palignr m5, m6, m3, 2 | |
21398 pshufb m5, m7 | |
21399 pmaddubsw m4, m5, m0 | |
21400 pmaddubsw m5, m1 | |
21401 pmulhrsw m4, m2 | |
21402 pmulhrsw m5, m2 | |
21403 packuswb m4, m5 | |
21404 movu [r0 + r1 * 2], xm4 | |
21405 vextracti128 [r0 + r3], m4, 1 | |
21406 | |
21407 lea r0, [r0 + r1 * 4] | |
21408 | |
; Step 2 (shift 4): rows 4 and 5.
21409 palignr m5, m6, m3, 4 | |
21410 pshufb m5, m7 | |
21411 pmaddubsw m4, m5, m0 | |
21412 pmaddubsw m5, m1 | |
21413 pmulhrsw m4, m2 | |
21414 pmulhrsw m5, m2 | |
21415 packuswb m4, m5 | |
21416 movu [r0], xm4 | |
21417 vextracti128 [r0 + r1], m4, 1 | |
21418 | |
; Step 3 (shift 6): rows 6 and 7.
21419 palignr m5, m6, m3, 6 | |
21420 pshufb m5, m7 | |
21421 pmaddubsw m4, m5, m0 | |
21422 pmaddubsw m5, m1 | |
21423 pmulhrsw m4, m2 | |
21424 pmulhrsw m5, m2 | |
21425 packuswb m4, m5 | |
21426 movu [r0 + r1 * 2], xm4 | |
21427 vextracti128 [r0 + r3], m4, 1 | |
21428 | |
21429 lea r0, [r0 + r1 * 4] | |
21430 | |
; Step 4 (shift 8): rows 8 and 9.
21431 palignr m5, m6, m3, 8 | |
21432 pshufb m5, m7 | |
21433 pmaddubsw m4, m5, m0 | |
21434 pmaddubsw m5, m1 | |
21435 pmulhrsw m4, m2 | |
21436 pmulhrsw m5, m2 | |
21437 packuswb m4, m5 | |
21438 movu [r0], xm4 | |
21439 vextracti128 [r0 + r1], m4, 1 | |
21440 | |
; Step 5 (shift 10): rows 10 and 11.
21441 palignr m5, m6, m3, 10 | |
21442 pshufb m5, m7 | |
21443 pmaddubsw m4, m5, m0 | |
21444 pmaddubsw m5, m1 | |
21445 pmulhrsw m4, m2 | |
21446 pmulhrsw m5, m2 | |
21447 packuswb m4, m5 | |
21448 movu [r0 + r1 * 2], xm4 | |
21449 vextracti128 [r0 + r3], m4, 1 | |
21450 | |
21451 lea r0, [r0 + r1 * 4] | |
21452 | |
; Step 6 (shift 12): rows 12 and 13.
21453 palignr m5, m6, m3, 12 | |
21454 pshufb m5, m7 | |
21455 pmaddubsw m4, m5, m0 | |
21456 pmaddubsw m5, m1 | |
21457 pmulhrsw m4, m2 | |
21458 pmulhrsw m5, m2 | |
21459 packuswb m4, m5 | |
21460 movu [r0], xm4 | |
21461 vextracti128 [r0 + r1], m4, 1 | |
21462 | |
; Step 7 (shift 14): rows 14 and 15.
21463 palignr m5, m6, m3, 14 | |
21464 pshufb m5, m7 | |
21465 pmaddubsw m4, m5, m0 | |
21466 pmaddubsw m5, m1 | |
21467 pmulhrsw m4, m2 | |
21468 pmulhrsw m5, m2 | |
21469 packuswb m4, m5 | |
21470 movu [r0 + r1 * 2], xm4 | |
21471 vextracti128 [r0 + r3], m4, 1 | |
21472 RET | |
21473 %endif | |
21474 | |
; intra_pred_ang16_25: 16x16 predictor built from four invocations of
; INTRA_PRED_ANG16_MC1 (macro defined elsewhere in this file).
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 5 vector registers.
; Register roles set up here:
;   m0     = [pw_1024]
;   m1, m2 = 16 source bytes from r2 and r2 + 8, each broadcast to both
;            lanes and shuffled with intra_pred_shuff_0_8
;   r3     = 3 * r1, r4 = pointer into the c_ang16_mode_25 table
; NOTE(review): each MC1 invocation presumably emits four rows (r0 advances
; by 4 * r1 between them) and its argument presumably selects a table
; offset relative to r4 -- confirm against the macro definition.
21475 INIT_YMM avx2 | |
21476 cglobal intra_pred_ang16_25, 3, 5, 5 | |
21477 mova m0, [pw_1024] | |
21478 | |
21479 vbroadcasti128 m1, [r2] | |
21480 pshufb m1, [intra_pred_shuff_0_8] | |
21481 vbroadcasti128 m2, [r2 + 8] | |
21482 pshufb m2, [intra_pred_shuff_0_8] | |
21483 | |
21484 lea r3, [3 * r1] | |
21485 lea r4, [c_ang16_mode_25] | |
21486 | |
21487 INTRA_PRED_ANG16_MC1 0 | |
21488 | |
21489 lea r0, [r0 + 4 * r1] | |
21490 INTRA_PRED_ANG16_MC1 2 | |
21491 | |
; Advance the table pointer by four vector-sized rows for the second half.
21492 add r4, 4 * mmsize | |
21493 | |
21494 lea r0, [r0 + 4 * r1] | |
21495 INTRA_PRED_ANG16_MC1 0 | |
21496 | |
21497 lea r0, [r0 + 4 * r1] | |
21498 INTRA_PRED_ANG16_MC1 2 | |
21499 RET | |
21500 | |
; intra_pred_ang16_28: 16x16 predictor assembled from the
; INTRA_PRED_ANG16_MC0/MC1/MC2 macros (defined elsewhere in this file).
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 6 vector registers.
; Register roles set up here:
;   m0 = [pw_1024], m5 = intra_pred_shuff_0_8 shuffle,
;   r3 = 3 * r1, r4 = pointer into the c_ang16_mode_28 table
; NOTE(review): MC2 presumably reloads the source window at the given
; offset, MC1 presumably emits four rows, and MC0 presumably emits the two
; rows at the given addresses -- confirm against the macro definitions.
21501 INIT_YMM avx2 | |
21502 cglobal intra_pred_ang16_28, 3, 5, 6 | |
21503 mova m0, [pw_1024] | |
21504 mova m5, [intra_pred_shuff_0_8] | |
21505 lea r3, [3 * r1] | |
21506 lea r4, [c_ang16_mode_28] | |
21507 | |
21508 INTRA_PRED_ANG16_MC2 1 | |
21509 INTRA_PRED_ANG16_MC1 0 | |
21510 | |
21511 lea r0, [r0 + 4 * r1] | |
21512 | |
21513 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 | |
21514 | |
21515 INTRA_PRED_ANG16_MC2 2 | |
21516 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3 | |
21517 | |
21518 lea r0, [r0 + 4 * r1] | |
; Advance the table pointer by four vector-sized rows for the second half.
21519 add r4, 4 * mmsize | |
21520 | |
21521 INTRA_PRED_ANG16_MC1 0 | |
21522 INTRA_PRED_ANG16_MC2 3 | |
21523 | |
21524 lea r0, [r0 + 4 * r1] | |
21525 INTRA_PRED_ANG16_MC1 2 | |
21526 RET | |
21527 | |
; intra_pred_ang16_27: 16x16 predictor; the first 14 rows come from the
; INTRA_PRED_ANG16_MC1/MC0 macros (defined elsewhere in this file) and the
; last two rows are computed inline.
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 5 vector registers.
; Register roles set up here:
;   m0     = [pw_1024]
;   m1, m2 = 16 source bytes from r2 + 1 and r2 + 9, each broadcast to
;            both lanes and shuffled with intra_pred_shuff_0_8
;   r3     = 3 * r1, r4 = pointer into the c_ang16_mode_27 table
; NOTE(review): the row counts attributed to MC1 (four) and MC0 (two) are
; inferred from how r0 advances -- confirm against the macro definitions.
21528 INIT_YMM avx2 | |
21529 cglobal intra_pred_ang16_27, 3, 5, 5 | |
21530 mova m0, [pw_1024] | |
21531 lea r3, [3 * r1] | |
21532 lea r4, [c_ang16_mode_27] | |
21533 | |
21534 vbroadcasti128 m1, [r2 + 1] | |
21535 pshufb m1, [intra_pred_shuff_0_8] | |
21536 vbroadcasti128 m2, [r2 + 9] | |
21537 pshufb m2, [intra_pred_shuff_0_8] | |
21538 | |
21539 INTRA_PRED_ANG16_MC1 0 | |
21540 | |
21541 lea r0, [r0 + 4 * r1] | |
21542 INTRA_PRED_ANG16_MC1 2 | |
21543 | |
21544 lea r0, [r0 + 4 * r1] | |
21545 add r4, 4 * mmsize | |
21546 INTRA_PRED_ANG16_MC1 0 | |
21547 | |
21548 lea r0, [r0 + 4 * r1] | |
21549 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 | |
21550 | |
; Final two rows, computed inline. m1 becomes low lane of m1 + low lane of
; m2 and is multiplied by the table row at r4 + 3 * mmsize.
21551 vperm2i128 m1, m1, m2, 00100000b | |
21552 pmaddubsw m3, m1, [r4 + 3 * mmsize] | |
21553 pmulhrsw m3, m0 | |
; The second operand is a fresh window from r2 + 2 using a different
; shuffle (intra_pred_shuff_0_15) and the table row at r4 + 4 * mmsize.
21554 vbroadcasti128 m2, [r2 + 2] | |
21555 pshufb m2, [intra_pred_shuff_0_15] | |
21556 pmaddubsw m2, [r4 + 4 * mmsize] | |
21557 pmulhrsw m2, m0 | |
21558 packuswb m3, m2 | |
; packuswb interleaves the two sources per lane; reorder the quadwords to
; 0, 2, 1, 3 so that each lane holds one complete 16-byte row.
21559 vpermq m3, m3, 11011000b | |
21560 movu [r0 + 2 * r1], xm3 | |
21561 vextracti128 xm4, m3, 1 | |
21562 movu [r0 + r3], xm4 | |
21563 RET | |
21564 | |
; intra_pred_ang16_29: 16x16 predictor assembled from the
; INTRA_PRED_ANG16_MC0/MC2/MC3 macros (defined elsewhere in this file).
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 6 vector
; registers. The vector count is 6 because m5 is loaded below; this also
; matches the sibling entry points that share the same prologue.
; Register roles set up here:
;   m0 = [pw_1024], m5 = intra_pred_shuff_0_8 shuffle,
;   r3 = 3 * r1, r4 = pointer into the c_ang16_mode_29 table
; NOTE(review): MC2 presumably reloads the source window at the given
; offset, MC0 presumably emits the two rows at the given addresses and MC3
; a single row -- confirm against the macro definitions.
21565 INIT_YMM avx2 | |
21566 cglobal intra_pred_ang16_29, 3, 5, 6 | |
21567 mova m0, [pw_1024] | |
21568 mova m5, [intra_pred_shuff_0_8] | |
21569 lea r3, [3 * r1] | |
21570 lea r4, [c_ang16_mode_29] | |
21571 | |
21572 INTRA_PRED_ANG16_MC2 1 | |
21573 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 | |
21574 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1 | |
21575 | |
21576 INTRA_PRED_ANG16_MC2 2 | |
21577 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 | |
21578 | |
21579 lea r0, [r0 + r1 * 4] | |
21580 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3 | |
21581 | |
21582 INTRA_PRED_ANG16_MC2 3 | |
; Advance the table pointer by four vector-sized rows.
21583 add r4, 4 * mmsize | |
21584 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 | |
21585 lea r0, [r0 + r1 * 4] | |
21586 INTRA_PRED_ANG16_MC3 r0 + r1, 1 | |
21587 | |
21588 INTRA_PRED_ANG16_MC2 4 | |
21589 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2 | |
21590 lea r0, [r0 + r1 * 4] | |
21591 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 | |
21592 | |
; Advance the table pointer by four more vector-sized rows.
21593 add r4, 4 * mmsize | |
21594 | |
21595 INTRA_PRED_ANG16_MC2 5 | |
21596 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0 | |
21597 RET | |
21598 | |
; intra_pred_ang16_30: 16x16 predictor assembled from the
; INTRA_PRED_ANG16_MC0/MC2/MC3 macros (defined elsewhere in this file).
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 6 vector registers.
; Register roles set up here:
;   m0 = [pw_1024], m5 = intra_pred_shuff_0_8 shuffle,
;   r3 = 3 * r1, r4 = pointer into the c_ang16_mode_30 table
; NOTE(review): MC2 presumably reloads the source window at the given
; offset, MC0 presumably emits the two rows at the given addresses and MC3
; a single row -- confirm against the macro definitions.
21599 INIT_YMM avx2 | |
21600 cglobal intra_pred_ang16_30, 3, 5, 6 | |
21601 mova m0, [pw_1024] | |
21602 mova m5, [intra_pred_shuff_0_8] | |
21603 lea r3, [3 * r1] | |
21604 lea r4, [c_ang16_mode_30] | |
21605 | |
21606 INTRA_PRED_ANG16_MC2 1 | |
21607 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 | |
21608 | |
21609 INTRA_PRED_ANG16_MC2 2 | |
21610 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1 | |
21611 | |
21612 INTRA_PRED_ANG16_MC2 3 | |
21613 lea r0, [r0 + 4 * r1] | |
21614 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 | |
21615 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3 | |
21616 | |
21617 INTRA_PRED_ANG16_MC2 4 | |
; Advance the table pointer by four vector-sized rows.
21618 add r4, 4 * mmsize | |
21619 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 | |
21620 | |
21621 INTRA_PRED_ANG16_MC2 5 | |
21622 lea r0, [r0 + 4 * r1] | |
21623 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 | |
21624 INTRA_PRED_ANG16_MC3 r0 + r3 , 2 | |
21625 | |
21626 INTRA_PRED_ANG16_MC2 6 | |
21627 lea r0, [r0 + 4 * r1] | |
21628 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 | |
21629 | |
21630 INTRA_PRED_ANG16_MC2 7 | |
21631 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4 | |
21632 RET | |
21633 | |
; intra_pred_ang16_31: 16x16 predictor assembled from the
; INTRA_PRED_ANG16_MC0/MC2/MC3 macros (defined elsewhere in this file).
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 6 vector registers.
; Register roles set up here:
;   m0 = [pw_1024], m5 = intra_pred_shuff_0_8 shuffle,
;   r3 = 3 * r1, r4 = pointer into the c_ang16_mode_31 table
; NOTE(review): MC2 presumably reloads the source window at the given
; offset (1 through 9 here), MC0 presumably emits the two rows at the given
; addresses and MC3 a single row -- confirm against the macro definitions.
21634 INIT_YMM avx2 | |
21635 cglobal intra_pred_ang16_31, 3, 5, 6 | |
21636 mova m0, [pw_1024] | |
21637 mova m5, [intra_pred_shuff_0_8] | |
21638 lea r3, [3 * r1] | |
21639 lea r4, [c_ang16_mode_31] | |
21640 | |
21641 INTRA_PRED_ANG16_MC2 1 | |
21642 INTRA_PRED_ANG16_MC3 r0, 0 | |
21643 | |
21644 INTRA_PRED_ANG16_MC2 2 | |
21645 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 | |
21646 | |
21647 INTRA_PRED_ANG16_MC2 3 | |
21648 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 | |
21649 | |
21650 INTRA_PRED_ANG16_MC2 4 | |
21651 lea r0, [r0 + 4 * r1] | |
21652 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3 | |
21653 | |
21654 INTRA_PRED_ANG16_MC2 5 | |
; Advance the table pointer by four vector-sized rows.
21655 add r4, 4 * mmsize | |
21656 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 | |
21657 | |
21658 INTRA_PRED_ANG16_MC2 6 | |
21659 lea r0, [r0 + 4 * r1] | |
21660 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 | |
21661 | |
21662 INTRA_PRED_ANG16_MC2 7 | |
21663 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 | |
21664 | |
21665 INTRA_PRED_ANG16_MC2 8 | |
21666 lea r0, [r0 + 4 * r1] | |
21667 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3 | |
21668 | |
21669 INTRA_PRED_ANG16_MC2 9 | |
21670 INTRA_PRED_ANG16_MC3 r0 + r3, 4 | |
21671 RET | |
21672 | |
; intra_pred_ang16_24: 16x16 predictor assembled from the
; INTRA_PRED_ANG16_MC0/MC1/MC2 macros (defined elsewhere in this file).
; Twice along the way the source window is rebuilt by hand: 16 bytes are
; loaded from just before r2 and the leading byte(s) are replaced with
; bytes fetched from r2 + 38 / r2 + 45.
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 6 vector registers.
; Register roles set up here:
;   m0 = [pw_1024], m5 = intra_pred_shuff_0_8 shuffle,
;   r3 = 3 * r1, r4 = pointer into the c_ang16_mode_24 table
; NOTE(review): MC1 presumably emits four rows and MC0 two rows from the
; current m1/m2 window -- confirm against the macro definitions.
21673 INIT_YMM avx2 | |
21674 cglobal intra_pred_ang16_24, 3, 5, 6 | |
21675 mova m0, [pw_1024] | |
21676 mova m5, [intra_pred_shuff_0_8] | |
21677 lea r3, [3 * r1] | |
21678 lea r4, [c_ang16_mode_24] | |
21679 | |
21680 INTRA_PRED_ANG16_MC2 0 | |
21681 INTRA_PRED_ANG16_MC1 0 | |
21682 | |
21683 lea r0, [r0 + 4 * r1] | |
21684 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 | |
21685 | |
; Rebuild the window one byte earlier: m1 = 16 bytes from r2 - 1 with byte 0
; replaced by the byte at r2 + 38, copied to both lanes and shuffled;
; m2 = 16 bytes from r2 + 7, broadcast and shuffled.
21686 movu xm1, [r2 - 1] | |
21687 pinsrb xm1, [r2 + 38], 0 | |
21688 vinserti128 m1, m1, xm1, 1 | |
21689 pshufb m1, m5 | |
21690 vbroadcasti128 m2, [r2 + 7] | |
21691 pshufb m2, m5 | |
21692 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3 | |
21693 | |
21694 lea r0, [r0 + 4 * r1] | |
; Advance the table pointer by four vector-sized rows for the second half.
21695 add r4, 4 * mmsize | |
21696 | |
21697 INTRA_PRED_ANG16_MC1 0 | |
21698 | |
; Rebuild the window two bytes earlier: bytes 0 and 1 are replaced by the
; bytes at r2 + 45 and r2 + 38 respectively.
21699 movu xm1, [r2 - 2] | |
21700 pinsrb xm1, [r2 + 45], 0 | |
21701 pinsrb xm1, [r2 + 38], 1 | |
21702 vinserti128 m1, m1, xm1, 1 | |
21703 pshufb m1, m5 | |
21704 vbroadcasti128 m2, [r2 + 6] | |
21705 pshufb m2, m5 | |
21706 | |
21707 lea r0, [r0 + 4 * r1] | |
21708 | |
21709 INTRA_PRED_ANG16_MC1 2 | |
21710 RET | |
21711 | |
; INTRA_PRED_ANG16_MC5 %1, %2: slide the source window back by one byte.
;   %1 = byte offset from r2 of the new leading byte
;   %2 = byte offset from r2 of the 16 bytes loaded into m2
; xm6 is shifted up one byte and the byte at r2 + %1 is inserted at
; position 0; the result is copied to both lanes of m1 and shuffled with
; m5. m2 receives the 16 bytes at r2 + %2, broadcast to both lanes and
; shuffled with m5.
; Requires xm6 to hold the previous window and m5 the shuffle control.
; Modifies xm6, m1 and m2.
21712 %macro INTRA_PRED_ANG16_MC5 2 | |
21713 pslldq xm6, xm6, 1 | |
21714 pinsrb xm6, [r2 + %1], 0 | |
21715 vinserti128 m1, m6, xm6, 1 | |
21716 pshufb m1, m5 | |
21717 vbroadcasti128 m2, [r2 + %2] | |
21718 pshufb m2, m5 | |
21719 %endmacro | |
21720 | |
; intra_pred_ang16_23: 16x16 predictor assembled from the
; INTRA_PRED_ANG16_MC0/MC2/MC3 macros (defined elsewhere in this file) and
; INTRA_PRED_ANG16_MC5, which slides the source window back one byte.
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 7 vector registers.
; Register roles set up here:
;   m0 = [pw_1024], m5 = intra_pred_shuff_0_8 shuffle,
;   r3 = 3 * r1, r4 = pointer into the c_ang16_mode_23 table,
;   xm6 = running 16-byte source window used by MC5
; NOTE(review): MC0 presumably emits the two rows at the given addresses
; and MC3 a single row -- confirm against the macro definitions.
21721 INIT_YMM avx2 | |
21722 cglobal intra_pred_ang16_23, 3, 5, 7 | |
21723 mova m0, [pw_1024] | |
21724 mova m5, [intra_pred_shuff_0_8] | |
21725 lea r3, [3 * r1] | |
21726 lea r4, [c_ang16_mode_23] | |
21727 | |
21728 INTRA_PRED_ANG16_MC2 0 | |
21729 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 | |
21730 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1 | |
21731 | |
; Seed xm6: 16 bytes from r2 - 1 with byte 0 replaced by the byte at
; r2 + 36. Later MC5 invocations keep extending this window backwards.
21732 movu xm6, [r2 - 1] | |
21733 pinsrb xm6, [r2 + 36], 0 | |
21734 vinserti128 m1, m6, xm6, 1 | |
21735 pshufb m1, m5 | |
21736 vbroadcasti128 m2, [r2 + 7] | |
21737 pshufb m2, m5 | |
21738 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 | |
21739 | |
21740 lea r0, [r0 + 4 * r1] | |
21741 | |
21742 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3 | |
21743 | |
; Advance the table pointer by four vector-sized rows.
21744 add r4, 4 * mmsize | |
21745 | |
21746 INTRA_PRED_ANG16_MC5 39, 6 | |
21747 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 | |
21748 | |
21749 lea r0, [r0 + 4 * r1] | |
21750 | |
21751 INTRA_PRED_ANG16_MC3 r0 + r1, 1 | |
21752 INTRA_PRED_ANG16_MC5 43, 5 | |
21753 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2 | |
21754 | |
21755 lea r0, [r0 + 4 * r1] | |
21756 | |
21757 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 | |
21758 | |
; Advance the table pointer by four more vector-sized rows.
21759 add r4, 4 * mmsize | |
21760 | |
21761 INTRA_PRED_ANG16_MC5 46, 4 | |
21762 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0 | |
21763 RET | |
21764 | |
; intra_pred_ang16_22: 16x16 predictor assembled from the
; INTRA_PRED_ANG16_MC0/MC2/MC3 macros (defined elsewhere in this file) and
; INTRA_PRED_ANG16_MC5, which slides the source window back one byte.
; cglobal arguments: 3 parameters (r0-r2), 5 general registers, 7 vector registers.
; Register roles set up here:
;   m0 = [pw_1024], m5 = intra_pred_shuff_0_8 shuffle,
;   r3 = 3 * r1, r4 = pointer into the c_ang16_mode_22 table,
;   xm6 = running 16-byte source window used by MC5
; NOTE(review): MC0 presumably emits the two rows at the given addresses
; and MC3 a single row -- confirm against the macro definitions.
21765 INIT_YMM avx2 | |
21766 cglobal intra_pred_ang16_22, 3, 5, 7 | |
21767 mova m0, [pw_1024] | |
21768 mova m5, [intra_pred_shuff_0_8] | |
21769 lea r3, [3 * r1] | |
21770 lea r4, [c_ang16_mode_22] | |
21771 | |
21772 INTRA_PRED_ANG16_MC2 0 | |
21773 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 | |
21774 | |
; Seed xm6: 16 bytes from r2 - 1 with byte 0 replaced by the byte at
; r2 + 34. Later MC5 invocations keep extending this window backwards.
21775 movu xm6, [r2 - 1] | |
21776 pinsrb xm6, [r2 + 34], 0 | |
21777 vinserti128 m1, m6, xm6, 1 | |
21778 pshufb m1, m5 | |
21779 vbroadcasti128 m2, [r2 + 7] | |
21780 pshufb m2, m5 | |
21781 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1 | |
21782 | |
21783 lea r0, [r0 + 4 * r1] | |
21784 | |
21785 INTRA_PRED_ANG16_MC5 37, 6 | |
21786 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 | |
21787 INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3 | |
21788 | |
; Advance the table pointer by four vector-sized rows.
21789 add r4, 4 * mmsize | |
21790 | |
21791 INTRA_PRED_ANG16_MC5 39, 5 | |
21792 INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 | |
21793 | |
21794 lea r0, [r0 + 4 * r1] | |
21795 | |
21796 INTRA_PRED_ANG16_MC5 42, 4 | |
21797 INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 | |
21798 INTRA_PRED_ANG16_MC3 r0 + r3, 2 | |
21799 | |
21800 lea r0, [r0 + 4 * r1] | |
21801 | |
21802 INTRA_PRED_ANG16_MC5 44, 3 | |
21803 INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 | |
21804 INTRA_PRED_ANG16_MC5 47, 2 | |
21805 INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4 | |
21806 RET | |
21807 | |
; INTRA_PRED_ANG32_ALIGNR_STORE %1: advance r0 by four rows, then store
; four 32-byte rows formed by shifting the m1:m0 pair by %1, %1 + 1,
; %1 + 2 and %1 + 3 bytes.
; Requires r1 = row pitch and r3 = 3 * r1. Modifies r0 and m2.
; palignr works per 128-bit lane, so callers load m1 from 16 bytes past m0;
; each lane then sees the 16 bytes that follow it.
21808 %macro INTRA_PRED_ANG32_ALIGNR_STORE 1 | |
21809 lea r0, [r0 + 4 * r1] | |
21810 palignr m2, m1, m0, %1 | |
21811 movu [r0], m2 | |
21812 palignr m2, m1, m0, (%1 + 1) | |
21813 movu [r0 + r1], m2 | |
21814 palignr m2, m1, m0, (%1 + 2) | |
21815 movu [r0 + 2 * r1], m2 | |
21816 palignr m2, m1, m0, (%1 + 3) | |
21817 movu [r0 + r3], m2 | |
21818 %endmacro | |
21819 | |
; intra_pred_ang32_34: writes 32 rows of 32 bytes to r0 (row pitch r1);
; row i is a straight copy of the 32 source bytes starting at r2 + 2 + i.
; cglobal arguments: 3 parameters (r0-r2), 4 general registers, 3 vector registers.
; m0/m1 hold source windows 16 bytes apart so that the per-lane palignr
; yields a contiguous 32-byte window at each shift; r3 = 3 * r1.
21820 INIT_YMM avx2 | |
21821 cglobal intra_pred_ang32_34, 3, 4,3 | |
21822 lea r3, [3 * r1] | |
21823 | |
; Rows 0-3: source offsets 2..5.
21824 movu m0, [r2 + 2] | |
21825 movu m1, [r2 + 18] | |
21826 movu [r0], m0 | |
21827 palignr m2, m1, m0, 1 | |
21828 movu [r0 + r1], m2 | |
21829 palignr m2, m1, m0, 2 | |
21830 movu [r0 + 2 * r1], m2 | |
21831 palignr m2, m1, m0, 3 | |
21832 movu [r0 + r3], m2 | |
21833 | |
; Rows 4-15: source offsets 6..17.
21834 INTRA_PRED_ANG32_ALIGNR_STORE 4 | |
21835 INTRA_PRED_ANG32_ALIGNR_STORE 8 | |
21836 INTRA_PRED_ANG32_ALIGNR_STORE 12 | |
21837 | |
21838 lea r0, [r0 + 4 * r1] | |
; Row 16 is the window at offset 18, which m1 already holds; a palignr
; shift of 16 would only copy m1, so it is stored directly.
21840 movu [r0], m1 | |
; Rows 17-19: reload the window pair at offsets 19 and 35.
21841 movu m0, [r2 + 19] | |
21842 movu [r0 + r1], m0 | |
21843 movu m1, [r2 + 35] | |
21844 palignr m2, m1, m0, 1 | |
21845 movu [r0 + 2 * r1], m2 | |
21846 palignr m2, m1, m0, 2 | |
21847 movu [r0 + r3], m2 | |
21848 | |
; Rows 20-31: source offsets 22..33.
21849 INTRA_PRED_ANG32_ALIGNR_STORE 3 | |
21850 INTRA_PRED_ANG32_ALIGNR_STORE 7 | |
21851 INTRA_PRED_ANG32_ALIGNR_STORE 11 | |
21852 RET | |
21853 | |
; intra_pred_ang32_2: writes 32 rows of 32 bytes to r0 (row pitch r1);
; row i is a straight copy of the 32 source bytes starting at
; r2 + 64 + 2 + i.
; cglobal arguments: 3 parameters (r0-r2), 4 general registers, 3 vector registers.
; m0/m1 hold source windows 16 bytes apart so that the per-lane palignr
; yields a contiguous 32-byte window at each shift; r3 = 3 * r1.
21854 INIT_YMM avx2 | |
21855 cglobal intra_pred_ang32_2, 3, 4,3 | |
21856 lea r3, [3 * r1] | |
21857 | |
; Rows 0-3: source offsets 64 + 2..5.
21858 movu m0, [r2 + 64 + 2] | |
21859 movu m1, [r2 + 64 + 18] | |
21860 movu [r0], m0 | |
21861 palignr m2, m1, m0, 1 | |
21862 movu [r0 + r1], m2 | |
21863 palignr m2, m1, m0, 2 | |
21864 movu [r0 + 2 * r1], m2 | |
21865 palignr m2, m1, m0, 3 | |
21866 movu [r0 + r3], m2 | |
21867 | |
; Rows 4-15: source offsets 64 + 6..17.
21868 INTRA_PRED_ANG32_ALIGNR_STORE 4 | |
21869 INTRA_PRED_ANG32_ALIGNR_STORE 8 | |
21870 INTRA_PRED_ANG32_ALIGNR_STORE 12 | |
21871 | |
21872 lea r0, [r0 + 4 * r1] | |
; Row 16 is the window at offset 64 + 18, which m1 already holds; a
; palignr shift of 16 would only copy m1, so it is stored directly.
21874 movu [r0], m1 | |
; Rows 17-19: reload the window pair at offsets 64 + 19 and 64 + 35.
21875 movu m0, [r2 + 64 + 19] | |
21876 movu [r0 + r1], m0 | |
21877 movu m1, [r2 + 64 + 35] | |
21878 palignr m2, m1, m0, 1 | |
21879 movu [r0 + 2 * r1], m2 | |
21880 palignr m2, m1, m0, 2 | |
21881 movu [r0 + r3], m2 | |
21882 | |
; Rows 20-31: source offsets 64 + 22..33.
21883 INTRA_PRED_ANG32_ALIGNR_STORE 3 | |
21884 INTRA_PRED_ANG32_ALIGNR_STORE 7 | |
21885 INTRA_PRED_ANG32_ALIGNR_STORE 11 | |
21886 RET | |
21887 | |
; INTRA_PRED_ANG32_STORE: advance r0 by four rows, then write the same
; 32 bytes (m0) to each of those four rows.
; Requires r1 = row pitch and r3 = 3 * r1. Modifies r0.
21888 %macro INTRA_PRED_ANG32_STORE 0 | |
21889 lea r0, [r0 + 4 * r1] | |
21890 movu [r0], m0 | |
21891 movu [r0 + r1], m0 | |
21892 movu [r0 + r1 * 2], m0 | |
21893 movu [r0 + r3], m0 | |
21894 %endmacro | |
21895 | |
; intra_pred_ang32_26: writes 32 identical rows to r0 (row pitch r1); each
; row is the 32 source bytes starting at r2 + 1.
; cglobal arguments: 3 parameters (r0-r2), 4 general registers, 1 vector register.
21896 INIT_YMM avx2 | |
21897 cglobal intra_pred_ang32_26, 3, 4, 1 | |
21898 lea r3, [3 * r1] | |
21899 movu m0, [r2 + 1] | |
; Rows 0-3.
21900 movu [r0], m0 | |
21901 movu [r0 + r1], m0 | |
21902 movu [r0 + r1 * 2], m0 | |
21903 movu [r0 + r3], m0 | |
21904 | |
; Rows 4-31: seven more groups of four rows.
21905 INTRA_PRED_ANG32_STORE | |
21906 INTRA_PRED_ANG32_STORE | |
21907 INTRA_PRED_ANG32_STORE | |
21908 INTRA_PRED_ANG32_STORE | |
21909 INTRA_PRED_ANG32_STORE | |
21910 INTRA_PRED_ANG32_STORE | |
21911 INTRA_PRED_ANG32_STORE | |
21912 RET | |
21913 | |
; INTRA_PRED_STORE_4x4: write four 4-byte rows from m0 to r0 (row pitch r1).
; Rows 0 and 1 are dwords 0 and 1 of the low 128-bit lane, rows 2 and 3 are
; dwords 0 and 1 of the high lane.
; Modifies r0 (advanced by two rows) and xm0 (replaced by the high lane).
21914 %macro INTRA_PRED_STORE_4x4 0 | |
21915 movd [r0], xm0 | |
21916 pextrd [r0 + r1], xm0, 1 | |
; Bring the high lane down so the same two dword positions can be stored again.
21917 vextracti128 xm0, m0, 1 | |
21918 lea r0, [r0 + 2 * r1] | |
21919 movd [r0], xm0 | |
21920 pextrd [r0 + r1], xm0, 1 | |
21921 %endmacro | |
21922 | |
; INTRA_PRED_TRANS_STORE_4x4: gather the low qword of each 128-bit lane of m0
; into the low lane, reorder those 16 bytes with c_trans_4x4, then write the
; four dwords of xm0 as four rows at r0 (row pitch r1).
; NOTE(review): c_trans_4x4 is defined outside this chunk; by its name it is
; presumably a 4x4 byte transpose - confirm against the table.
; Modifies r0 (advanced by two rows) and m0.
21923 %macro INTRA_PRED_TRANS_STORE_4x4 0 | |
; Immediate 00001000b: result qword 0 = source qword 0, result qword 1 = source qword 2.
21924 vpermq m0, m0, 00001000b | |
21925 pshufb m0, [c_trans_4x4] | |
21926 | |
21927 ;store | |
21928 movd [r0], xm0 | |
21929 pextrd [r0 + r1], xm0, 1 | |
21930 lea r0, [r0 + 2 * r1] | |
21931 pextrd [r0], xm0, 2 | |
21932 pextrd [r0 + r1], xm0, 3 | |
21933 %endmacro | |
21934 | |
; intra_pred_ang4_27 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
21935 INIT_YMM avx2 | |
21936 cglobal intra_pred_ang4_27, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
21937 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
21938 pshufb m0, [intra_pred_shuff_0_4] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
21939 pmaddubsw m0, [c_ang4_mode_27] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
21940 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
21941 packuswb m0, m0 | |
21942 | |
21943 INTRA_PRED_STORE_4x4 | |
21944 RET | |
21945 | |
; intra_pred_ang4_28 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
21946 INIT_YMM avx2 | |
21947 cglobal intra_pred_ang4_28, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
21948 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
21949 pshufb m0, [intra_pred_shuff_0_4] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
21950 pmaddubsw m0, [c_ang4_mode_28] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
21951 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
21952 packuswb m0, m0 | |
21953 | |
21954 INTRA_PRED_STORE_4x4 | |
21955 RET | |
21956 | |
; intra_pred_ang4_29 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
21957 INIT_YMM avx2 | |
21958 cglobal intra_pred_ang4_29, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
21959 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
21960 pshufb m0, [intra_pred4_shuff1] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
21961 pmaddubsw m0, [c_ang4_mode_29] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
21962 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
21963 packuswb m0, m0 | |
21964 | |
21965 INTRA_PRED_STORE_4x4 | |
21966 RET | |
21967 | |
; intra_pred_ang4_30 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
21968 INIT_YMM avx2 | |
21969 cglobal intra_pred_ang4_30, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
21970 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
21971 pshufb m0, [intra_pred4_shuff2] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
21972 pmaddubsw m0, [c_ang4_mode_30] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
21973 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
21974 packuswb m0, m0 | |
21975 | |
21976 INTRA_PRED_STORE_4x4 | |
21977 RET | |
21978 | |
; intra_pred_ang4_31 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
21979 INIT_YMM avx2 | |
21980 cglobal intra_pred_ang4_31, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
21981 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
21982 pshufb m0, [intra_pred4_shuff31] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
21983 pmaddubsw m0, [c_ang4_mode_31] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
21984 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
21985 packuswb m0, m0 | |
21986 | |
21987 INTRA_PRED_STORE_4x4 | |
21988 RET | |
21989 | |
; intra_pred_ang4_32 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; Uses the same shuffle table as intra_pred_ang4_31 with its own weight table.
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
21990 INIT_YMM avx2 | |
21991 cglobal intra_pred_ang4_32, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
21992 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
21993 pshufb m0, [intra_pred4_shuff31] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
21994 pmaddubsw m0, [c_ang4_mode_32] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
21995 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
21996 packuswb m0, m0 | |
21997 | |
21998 INTRA_PRED_STORE_4x4 | |
21999 RET | |
22000 | |
; intra_pred_ang4_33 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22001 INIT_YMM avx2 | |
22002 cglobal intra_pred_ang4_33, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22003 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
22004 pshufb m0, [intra_pred4_shuff33] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22005 pmaddubsw m0, [c_ang4_mode_33] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22006 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22007 packuswb m0, m0 | |
22008 | |
22009 INTRA_PRED_STORE_4x4 | |
22010 RET | |
22011 | |
22012 | |
; intra_pred_ang4_3 (AVX2): 4x4 block computed from the 16 bytes at r2 + 1 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; Uses the weight table of mode 33 (c_ang4_mode_33) with its own shuffle table.
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22013 INIT_YMM avx2 | |
22014 cglobal intra_pred_ang4_3, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22015 vbroadcasti128 m0, [r2 + 1] | |
; Byte reorder through a table defined outside this chunk.
22016 pshufb m0, [intra_pred4_shuff3] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22017 pmaddubsw m0, [c_ang4_mode_33] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22018 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22019 packuswb m0, m0 | |
22020 | |
22021 INTRA_PRED_TRANS_STORE_4x4 | |
22022 RET | |
22023 | |
; intra_pred_ang4_4 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; Uses the shuffle table of intra_pred_ang4_5 and the weight table of mode 32.
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22024 INIT_YMM avx2 | |
22025 cglobal intra_pred_ang4_4, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22026 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22027 pshufb m0, [intra_pred4_shuff5] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22028 pmaddubsw m0, [c_ang4_mode_32] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22029 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22030 packuswb m0, m0 | |
22031 | |
22032 INTRA_PRED_TRANS_STORE_4x4 | |
22033 RET | |
22034 | |
; intra_pred_ang4_5 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22035 INIT_YMM avx2 | |
22036 cglobal intra_pred_ang4_5, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22037 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22038 pshufb m0, [intra_pred4_shuff5] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22039 pmaddubsw m0, [c_ang4_mode_5] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22040 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22041 packuswb m0, m0 | |
22042 | |
22043 INTRA_PRED_TRANS_STORE_4x4 | |
22044 RET | |
22045 | |
; intra_pred_ang4_6 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22046 INIT_YMM avx2 | |
22047 cglobal intra_pred_ang4_6, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22048 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22049 pshufb m0, [intra_pred4_shuff6] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22050 pmaddubsw m0, [c_ang4_mode_6] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22051 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22052 packuswb m0, m0 | |
22053 | |
22054 INTRA_PRED_TRANS_STORE_4x4 | |
22055 RET | |
22056 | |
; intra_pred_ang4_7 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22057 INIT_YMM avx2 | |
22058 cglobal intra_pred_ang4_7, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22059 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22060 pshufb m0, [intra_pred4_shuff7] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22061 pmaddubsw m0, [c_ang4_mode_7] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22062 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22063 packuswb m0, m0 | |
22064 | |
22065 INTRA_PRED_TRANS_STORE_4x4 | |
22066 RET | |
22067 | |
; intra_pred_ang4_8 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; Uses the same shuffle table as intra_pred_ang4_9 with its own weight table.
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22068 INIT_YMM avx2 | |
22069 cglobal intra_pred_ang4_8, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22070 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22071 pshufb m0, [intra_pred4_shuff9] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22072 pmaddubsw m0, [c_ang4_mode_8] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22073 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22074 packuswb m0, m0 | |
22075 | |
22076 INTRA_PRED_TRANS_STORE_4x4 | |
22077 RET | |
22078 | |
; intra_pred_ang4_9 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22079 INIT_YMM avx2 | |
22080 cglobal intra_pred_ang4_9, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22081 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22082 pshufb m0, [intra_pred4_shuff9] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22083 pmaddubsw m0, [c_ang4_mode_9] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22084 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22085 packuswb m0, m0 | |
22086 | |
22087 INTRA_PRED_TRANS_STORE_4x4 | |
22088 RET | |
22089 | |
; intra_pred_ang4_11 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; Uses the same shuffle table as intra_pred_ang4_12 with its own weight table.
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22090 INIT_YMM avx2 | |
22091 cglobal intra_pred_ang4_11, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22092 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22093 pshufb m0, [intra_pred4_shuff12] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22094 pmaddubsw m0, [c_ang4_mode_11] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22095 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22096 packuswb m0, m0 | |
22097 | |
22098 INTRA_PRED_TRANS_STORE_4x4 | |
22099 RET | |
22100 | |
; intra_pred_ang4_12 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22101 INIT_YMM avx2 | |
22102 cglobal intra_pred_ang4_12, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22103 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22104 pshufb m0, [intra_pred4_shuff12] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22105 pmaddubsw m0, [c_ang4_mode_12] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22106 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22107 packuswb m0, m0 | |
22108 | |
22109 INTRA_PRED_TRANS_STORE_4x4 | |
22110 RET | |
22111 | |
; intra_pred_ang4_13 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22112 INIT_YMM avx2 | |
22113 cglobal intra_pred_ang4_13, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22114 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22115 pshufb m0, [intra_pred4_shuff13] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22116 pmaddubsw m0, [c_ang4_mode_13] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22117 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22118 packuswb m0, m0 | |
22119 | |
22120 INTRA_PRED_TRANS_STORE_4x4 | |
22121 RET | |
22122 | |
; intra_pred_ang4_14 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22123 INIT_YMM avx2 | |
22124 cglobal intra_pred_ang4_14, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22125 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22126 pshufb m0, [intra_pred4_shuff14] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22127 pmaddubsw m0, [c_ang4_mode_14] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22128 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22129 packuswb m0, m0 | |
22130 | |
22131 INTRA_PRED_TRANS_STORE_4x4 | |
22132 RET | |
22133 | |
; intra_pred_ang4_15 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22134 INIT_YMM avx2 | |
22135 cglobal intra_pred_ang4_15, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22136 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22137 pshufb m0, [intra_pred4_shuff15] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22138 pmaddubsw m0, [c_ang4_mode_15] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22139 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22140 packuswb m0, m0 | |
22141 | |
22142 INTRA_PRED_TRANS_STORE_4x4 | |
22143 RET | |
22144 | |
; intra_pred_ang4_16 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22145 INIT_YMM avx2 | |
22146 cglobal intra_pred_ang4_16, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22147 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22148 pshufb m0, [intra_pred4_shuff16] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22149 pmaddubsw m0, [c_ang4_mode_16] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22150 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22151 packuswb m0, m0 | |
22152 | |
22153 INTRA_PRED_TRANS_STORE_4x4 | |
22154 RET | |
22155 | |
; intra_pred_ang4_17 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_TRANS_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22156 INIT_YMM avx2 | |
22157 cglobal intra_pred_ang4_17, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22158 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22159 pshufb m0, [intra_pred4_shuff17] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22160 pmaddubsw m0, [c_ang4_mode_17] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22161 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22162 packuswb m0, m0 | |
22163 | |
22164 INTRA_PRED_TRANS_STORE_4x4 | |
22165 RET | |
22166 | |
; intra_pred_ang4_19 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22167 INIT_YMM avx2 | |
22168 cglobal intra_pred_ang4_19, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22169 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22170 pshufb m0, [intra_pred4_shuff19] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22171 pmaddubsw m0, [c_ang4_mode_19] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22172 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22173 packuswb m0, m0 | |
22174 | |
22175 INTRA_PRED_STORE_4x4 | |
22176 RET | |
22177 | |
; intra_pred_ang4_20 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22178 INIT_YMM avx2 | |
22179 cglobal intra_pred_ang4_20, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22180 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22181 pshufb m0, [intra_pred4_shuff20] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22182 pmaddubsw m0, [c_ang4_mode_20] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22183 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22184 packuswb m0, m0 | |
22185 | |
22186 INTRA_PRED_STORE_4x4 | |
22187 RET | |
22188 | |
; intra_pred_ang4_21 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22189 INIT_YMM avx2 | |
22190 cglobal intra_pred_ang4_21, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22191 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22192 pshufb m0, [intra_pred4_shuff21] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22193 pmaddubsw m0, [c_ang4_mode_21] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22194 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22195 packuswb m0, m0 | |
22196 | |
22197 INTRA_PRED_STORE_4x4 | |
22198 RET | |
22199 | |
; intra_pred_ang4_22 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22200 INIT_YMM avx2 | |
22201 cglobal intra_pred_ang4_22, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22202 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22203 pshufb m0, [intra_pred4_shuff22] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22204 pmaddubsw m0, [c_ang4_mode_22] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22205 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22206 packuswb m0, m0 | |
22207 | |
22208 INTRA_PRED_STORE_4x4 | |
22209 RET | |
22210 | |
; intra_pred_ang4_23 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22211 INIT_YMM avx2 | |
22212 cglobal intra_pred_ang4_23, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22213 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22214 pshufb m0, [intra_pred4_shuff23] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22215 pmaddubsw m0, [c_ang4_mode_23] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22216 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22217 packuswb m0, m0 | |
22218 | |
22219 INTRA_PRED_STORE_4x4 | |
22220 RET | |
22221 | |
; intra_pred_ang4_24 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; Uses the intra_pred_shuff_0_4 shuffle table, as intra_pred_ang4_25 does.
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22222 INIT_YMM avx2 | |
22223 cglobal intra_pred_ang4_24, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22224 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22225 pshufb m0, [intra_pred_shuff_0_4] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22226 pmaddubsw m0, [c_ang4_mode_24] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22227 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22228 packuswb m0, m0 | |
22229 | |
22230 INTRA_PRED_STORE_4x4 | |
22231 RET | |
22232 | |
; intra_pred_ang4_25 (AVX2): 4x4 block computed from the 16 bytes at r2 and
; written through INTRA_PRED_STORE_4x4 (r0 = output pointer, r1 = row pitch).
; Uses the intra_pred_shuff_0_4 shuffle table, as intra_pred_ang4_24 does.
; x86inc cglobal operands: 3 arguments, 3 GPRs, 1 vector register.
22233 INIT_YMM avx2 | |
22234 cglobal intra_pred_ang4_25, 3, 3, 1 | |
; Same 16 source bytes in both 128-bit lanes.
22235 vbroadcasti128 m0, [r2] | |
; Byte reorder through a table defined outside this chunk.
22236 pshufb m0, [intra_pred_shuff_0_4] | |
; Unsigned bytes of m0 times signed bytes of the table; adjacent products summed into words.
22237 pmaddubsw m0, [c_ang4_mode_25] | |
; NOTE(review): assuming pw_1024 holds words of 1024, this is a rounded (x + 16) >> 5.
22238 pmulhrsw m0, [pw_1024] | |
; Words back to bytes with unsigned saturation.
22239 packuswb m0, m0 | |
22240 | |
22241 INTRA_PRED_STORE_4x4 | |
22242 RET | |
22243 | |
22244 ;----------------------------------------------------------------------------------- | |
22245 ; void intra_filter_NxN(const pixel* references, pixel* filtered) | |
22246 ;----------------------------------------------------------------------------------- | |
; intra_filter_4x4 (SSE4): smooths reference samples 0..16.
; r0 = references (input), r1 = filtered (output).
; Each output is (s[i - 1] + 2 * s[i] + s[i + 1] + pw_2) >> 2; samples 8 and 16
; are copied through unfiltered.  Lane comments list word 7 down to word 0.
; NOTE(review): pw_2 is defined elsewhere; presumably words of 2 (the rounding term).
; x86inc cglobal operands: 2 arguments, 4 GPRs, 5 vector registers.
22247 INIT_XMM sse4 | |
22248 cglobal intra_filter_4x4, 2,4,5 | |
; Keep the two unfiltered end samples; they are written back after the vector store.
22249 mov r2b, byte [r0 + 8] ; topLast | |
22250 mov r3b, byte [r0 + 16] ; LeftLast | |
22251 | |
22252 ; filtering top | |
; m0 / m1 / m2 = samples 0-7 / 8-15 / 16-23 widened to words.
22253 pmovzxbw m0, [r0 + 0] | |
22254 pmovzxbw m1, [r0 + 8] | |
22255 pmovzxbw m2, [r0 + 16] | |
22256 | |
22257 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] | |
22258 palignr m3, m1, m0, 4 | |
22259 pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1] | |
22260 | |
; m0 = (2 * s[i] + s[i - 1] + s[i + 1] + pw_2) >> 2 for samples 0-7.
22261 psllw m0, 1 | |
22262 paddw m4, m3 | |
22263 paddw m0, m4 | |
22264 paddw m0, [pw_2] | |
22265 psrlw m0, 2 | |
22266 | |
22267 ; filtering left | |
22268 palignr m4, m1, m1, 14 ; [14 13 12 11 10 9 8 15] samples[i - 1] | |
22269 pinsrb m4, [r0], 2 ; [14 13 12 11 10 9 0 15] samples[i - 1], sample 0 is the neighbour of sample 9 | |
22270 palignr m3, m2, m1, 4 | |
; Same rotation as for the top half, giving samples[i + 1] for samples 8-15.
22271 pshufb m3, [intra_filter4_shuf1] | |
22272 | |
22273 psllw m1, 1 | |
22274 paddw m4, m3 | |
22275 paddw m1, m4 | |
22276 paddw m1, [pw_2] | |
22277 psrlw m1, 2 | |
22278 packuswb m0, m1 | |
22279 | |
22280 movu [r1], m0 | |
; Overwrite output byte 8 (computed from wrapped-around neighbours) and set byte 16.
22281 mov [r1 + 8], r2b ; topLast | |
22282 mov [r1 + 16], r3b ; LeftLast | |
22283 RET | |
22284 | |
; intra_filter_8x8 (SSE4): smooths reference samples 0..32.
; r0 = references (input), r1 = filtered (output).
; Each output is (s[i - 1] + 2 * s[i] + s[i + 1] + pw_2) >> 2; samples 16 and 32
; are copied through unfiltered.  Lane comments list word 7 down to word 0.
; NOTE(review): pw_2 is defined elsewhere; presumably words of 2 (the rounding term).
; x86inc cglobal operands: 2 arguments, 4 GPRs, 6 vector registers.
22285 INIT_XMM sse4 | |
22286 cglobal intra_filter_8x8, 2,4,6 | |
; Keep the two unfiltered end samples; they are written back after the vector stores.
22287 mov r2b, byte [r0 + 16] ; topLast | |
22288 mov r3b, byte [r0 + 32] ; LeftLast | |
22289 | |
22290 ; filtering top | |
; m0 / m1 / m2 = samples 0-7 / 8-15 / 16-23 widened to words.
22291 pmovzxbw m0, [r0 + 0] | |
22292 pmovzxbw m1, [r0 + 8] | |
22293 pmovzxbw m2, [r0 + 16] | |
22294 | |
22295 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] | |
22296 palignr m5, m1, m0, 2 | |
22297 pinsrb m5, [r0 + 17], 0 ; [8 7 6 5 4 3 2 17] samples[i + 1] | |
22298 | |
; samples[i - 1] for samples 8-15; must be taken before m0 is doubled below.
22299 palignr m3, m1, m0, 14 | |
22300 psllw m0, 1 | |
22301 paddw m4, m5 | |
22302 paddw m0, m4 | |
22303 paddw m0, [pw_2] | |
22304 psrlw m0, 2 | |
22305 | |
; samples[i + 1] for samples 8-15.
22306 palignr m4, m2, m1, 2 | |
22307 psllw m1, 1 | |
22308 paddw m4, m3 | |
22309 paddw m1, m4 | |
22310 paddw m1, [pw_2] | |
22311 psrlw m1, 2 | |
22312 | |
22313 packuswb m0, m1 | |
22314 movu [r1], m0 | |
22315 | |
22316 ; filtering left | |
; m1 / m0 = samples 24-31 / 32-39; m2 still holds samples 16-23.
22317 pmovzxbw m1, [r0 + 24] | |
22318 pmovzxbw m0, [r0 + 32] | |
22319 | |
; samples[i - 1] for samples 16-23: rotate by one word, then make sample 0 the
; neighbour of sample 17 (word 1).
22320 palignr m4, m2, m2, 14 | |
22321 pinsrb m4, [r0], 2 | |
; samples[i + 1] for samples 16-23.
22322 palignr m5, m1, m2, 2 | |
22323 | |
; samples[i - 1] and samples[i + 1] for samples 24-31.
22324 palignr m3, m1, m2, 14 | |
22325 palignr m0, m1, 2 | |
22326 | |
22327 psllw m2, 1 | |
22328 paddw m4, m5 | |
22329 paddw m2, m4 | |
22330 paddw m2, [pw_2] | |
22331 psrlw m2, 2 | |
22332 | |
22333 psllw m1, 1 | |
22334 paddw m0, m3 | |
22335 paddw m1, m0 | |
22336 paddw m1, [pw_2] | |
22337 psrlw m1, 2 | |
22338 | |
22339 packuswb m2, m1 | |
22340 movu [r1 + 16], m2 | |
; Overwrite output byte 16 (computed from a wrapped-around neighbour) and set byte 32.
22341 mov [r1 + 16], r2b ; topLast | |
22342 mov [r1 + 32], r3b ; LeftLast | |
22343 RET | |
22344 | |
; intra_filter_16x16 (SSE4): smooths reference samples 0..64.
; r0 = references (input), r1 = filtered (output).
; Each output is (s[i - 1] + 2 * s[i] + s[i + 1] + pw_2) >> 2; samples 32 and 64
; are copied through unfiltered.  Samples are processed eight at a time as
; words; "palignr x, hi, lo, 14" builds samples[i - 1] and "palignr x, hi, lo, 2"
; builds samples[i + 1] from two adjacent groups.
; Lane comments list word 7 down to word 0.
; NOTE(review): pw_2 is defined elsewhere; presumably words of 2 (the rounding term).
; x86inc cglobal operands: 2 arguments, 4 GPRs, 6 vector registers.
22345 INIT_XMM sse4 | |
22346 cglobal intra_filter_16x16, 2,4,6 | |
; Keep the two unfiltered end samples; they are written back after the vector stores.
22347 mov r2b, byte [r0 + 32] ; topLast | |
22348 mov r3b, byte [r0 + 64] ; LeftLast | |
22349 | |
22350 ; filtering top | |
; m0 / m1 / m2 = samples 0-7 / 8-15 / 16-23 widened to words.
22351 pmovzxbw m0, [r0 + 0] | |
22352 pmovzxbw m1, [r0 + 8] | |
22353 pmovzxbw m2, [r0 + 16] | |
22354 | |
22355 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] | |
22356 palignr m5, m1, m0, 2 | |
22357 pinsrb m5, [r0 + 33], 0 ; [8 7 6 5 4 3 2 33] samples[i + 1] | |
22358 | |
; samples[i - 1] for samples 8-15; must be taken before m0 is doubled below.
22359 palignr m3, m1, m0, 14 | |
22360 psllw m0, 1 | |
22361 paddw m4, m5 | |
22362 paddw m0, m4 | |
22363 paddw m0, [pw_2] | |
22364 psrlw m0, 2 | |
22365 | |
; Samples 8-15; the three-operand shift leaves m1 intact for the next group.
22366 palignr m4, m2, m1, 2 | |
22367 psllw m5, m1, 1 | |
22368 paddw m4, m3 | |
22369 paddw m5, m4 | |
22370 paddw m5, [pw_2] | |
22371 psrlw m5, 2 | |
22372 packuswb m0, m5 | |
22373 movu [r1], m0 | |
22374 | |
; m0 / m5 = samples 24-31 / 32-39.
22375 pmovzxbw m0, [r0 + 24] | |
22376 pmovzxbw m5, [r0 + 32] | |
22377 | |
; Samples 16-23.
22378 palignr m3, m2, m1, 14 | |
22379 palignr m4, m0, m2, 2 | |
22380 | |
22381 psllw m1, m2, 1 | |
22382 paddw m3, m4 | |
22383 paddw m1, m3 | |
22384 paddw m1, [pw_2] | |
22385 psrlw m1, 2 | |
22386 | |
; Samples 24-31.
22387 palignr m3, m0, m2, 14 | |
22388 palignr m4, m5, m0, 2 | |
22389 | |
22390 psllw m0, 1 | |
22391 paddw m4, m3 | |
22392 paddw m0, m4 | |
22393 paddw m0, [pw_2] | |
22394 psrlw m0, 2 | |
22395 packuswb m1, m0 | |
22396 movu [r1 + 16], m1 | |
22397 | |
22398 ; filtering left | |
; m1 / m2 = samples 40-47 / 48-55; m5 still holds samples 32-39.
22399 pmovzxbw m1, [r0 + 40] | |
22400 pmovzxbw m2, [r0 + 48] | |
22401 | |
; samples[i - 1] for samples 32-39: rotate by one word, then make sample 0 the
; neighbour of sample 33 (word 1).
22402 palignr m4, m5, m5, 14 | |
22403 pinsrb m4, [r0], 2 | |
22404 palignr m0, m1, m5, 2 | |
22405 | |
22406 psllw m3, m5, 1 | |
22407 paddw m4, m0 | |
22408 paddw m3, m4 | |
22409 paddw m3, [pw_2] | |
22410 psrlw m3, 2 | |
22411 | |
; Samples 40-47.
22412 palignr m0, m1, m5, 14 | |
22413 palignr m4, m2, m1, 2 | |
22414 | |
22415 psllw m5, m1, 1 | |
22416 paddw m4, m0 | |
22417 paddw m5, m4 | |
22418 paddw m5, [pw_2] | |
22419 psrlw m5, 2 | |
22420 packuswb m3, m5 | |
22421 movu [r1 + 32], m3 | |
22422 | |
; m5 / m0 = samples 56-63 / 64-71.
22423 pmovzxbw m5, [r0 + 56] | |
22424 pmovzxbw m0, [r0 + 64] | |
22425 | |
; Samples 48-55.
22426 palignr m3, m2, m1, 14 | |
22427 palignr m4, m5, m2, 2 | |
22428 | |
22429 psllw m1, m2, 1 | |
22430 paddw m3, m4 | |
22431 paddw m1, m3 | |
22432 paddw m1, [pw_2] | |
22433 psrlw m1, 2 | |
22434 | |
; Samples 56-63.
22435 palignr m3, m5, m2, 14 | |
22436 palignr m4, m0, m5, 2 | |
22437 | |
22438 psllw m5, 1 | |
22439 paddw m4, m3 | |
22440 paddw m5, m4 | |
22441 paddw m5, [pw_2] | |
22442 psrlw m5, 2 | |
22443 packuswb m1, m5 | |
22444 movu [r1 + 48], m1 | |
22445 | |
; Overwrite output byte 32 (computed from a wrapped-around neighbour) and set byte 64.
22446 mov [r1 + 32], r2b ; topLast | |
22447 mov [r1 + 64], r3b ; LeftLast | |
22448 RET | |
22449 | |
; intra_filter_32x32 (SSE4): smooths reference samples 0..128.
; r0 = references (input), r1 = filtered (output).
; Each output is (s[i - 1] + 2 * s[i] + s[i + 1] + pw_2) >> 2; samples 64 and 128
; are copied through unfiltered.  Samples are processed eight at a time as
; words; "palignr x, hi, lo, 14" builds samples[i - 1] and "palignr x, hi, lo, 2"
; builds samples[i + 1] from two adjacent groups.
; Lane comments list word 7 down to word 0.
; NOTE(review): pw_2 is defined elsewhere; presumably words of 2 (the rounding term).
; x86inc cglobal operands: 2 arguments, 4 GPRs, 6 vector registers.
22450 INIT_XMM sse4 | |
22451 cglobal intra_filter_32x32, 2,4,6 | |
; Keep the two unfiltered end samples; they are written back after the vector stores.
22452 mov r2b, byte [r0 + 64] ; topLast | |
22453 mov r3b, byte [r0 + 128] ; LeftLast | |
22454 | |
22455 ; filtering top | |
22456 ; 0 to 15 | |
; m0 / m1 / m2 = samples 0-7 / 8-15 / 16-23 widened to words.
22457 pmovzxbw m0, [r0 + 0] | |
22458 pmovzxbw m1, [r0 + 8] | |
22459 pmovzxbw m2, [r0 + 16] | |
22460 | |
22461 pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] | |
22462 palignr m5, m1, m0, 2 | |
22463 pinsrb m5, [r0 + 65], 0 ; [8 7 6 5 4 3 2 65] samples[i + 1] | |
22464 | |
; samples[i - 1] for samples 8-15; must be taken before m0 is doubled below.
22465 palignr m3, m1, m0, 14 | |
22466 psllw m0, 1 | |
22467 paddw m4, m5 | |
22468 paddw m0, m4 | |
22469 paddw m0, [pw_2] | |
22470 psrlw m0, 2 | |
22471 | |
; Samples 8-15; the three-operand shift leaves m1 intact for the next group.
22472 palignr m4, m2, m1, 2 | |
22473 psllw m5, m1, 1 | |
22474 paddw m4, m3 | |
22475 paddw m5, m4 | |
22476 paddw m5, [pw_2] | |
22477 psrlw m5, 2 | |
22478 packuswb m0, m5 | |
22479 movu [r1], m0 | |
22480 | |
22481 ; 16 to 31 | |
; m0 / m5 = samples 24-31 / 32-39.
22482 pmovzxbw m0, [r0 + 24] | |
22483 pmovzxbw m5, [r0 + 32] | |
22484 | |
22485 palignr m3, m2, m1, 14 | |
22486 palignr m4, m0, m2, 2 | |
22487 | |
22488 psllw m1, m2, 1 | |
22489 paddw m3, m4 | |
22490 paddw m1, m3 | |
22491 paddw m1, [pw_2] | |
22492 psrlw m1, 2 | |
22493 | |
22494 palignr m3, m0, m2, 14 | |
22495 palignr m4, m5, m0, 2 | |
22496 | |
; Result goes to m2 so that the raw samples 24-31 in m0 survive for the next group.
22497 psllw m2, m0, 1 | |
22498 paddw m4, m3 | |
22499 paddw m2, m4 | |
22500 paddw m2, [pw_2] | |
22501 psrlw m2, 2 | |
22502 packuswb m1, m2 | |
22503 movu [r1 + 16], m1 | |
22504 | |
22505 ; 32 to 47 | |
; m1 / m2 = samples 40-47 / 48-55.
22506 pmovzxbw m1, [r0 + 40] | |
22507 pmovzxbw m2, [r0 + 48] | |
22508 | |
22509 palignr m3, m5, m0, 14 | |
22510 palignr m4, m1, m5, 2 | |
22511 | |
22512 psllw m0, m5, 1 | |
22513 paddw m3, m4 | |
22514 paddw m0, m3 | |
22515 paddw m0, [pw_2] | |
22516 psrlw m0, 2 | |
22517 | |
22518 palignr m3, m1, m5, 14 | |
22519 palignr m4, m2, m1, 2 | |
22520 | |
22521 psllw m5, m1, 1 | |
22522 paddw m4, m3 | |
22523 paddw m5, m4 | |
22524 paddw m5, [pw_2] | |
22525 psrlw m5, 2 | |
22526 packuswb m0, m5 | |
22527 movu [r1 + 32], m0 | |
22528 | |
22529 ; 48 to 63 | |
; m0 / m5 = samples 56-63 / 64-71.
22530 pmovzxbw m0, [r0 + 56] | |
22531 pmovzxbw m5, [r0 + 64] | |
22532 | |
22533 palignr m3, m2, m1, 14 | |
22534 palignr m4, m0, m2, 2 | |
22535 | |
22536 psllw m1, m2, 1 | |
22537 paddw m3, m4 | |
22538 paddw m1, m3 | |
22539 paddw m1, [pw_2] | |
22540 psrlw m1, 2 | |
22541 | |
22542 palignr m3, m0, m2, 14 | |
22543 palignr m4, m5, m0, 2 | |
22544 | |
22545 psllw m0, 1 | |
22546 paddw m4, m3 | |
22547 paddw m0, m4 | |
22548 paddw m0, [pw_2] | |
22549 psrlw m0, 2 | |
22550 packuswb m1, m0 | |
22551 movu [r1 + 48], m1 | |
22552 | |
22553 ; filtering left | |
22554 ; 64 to 79 | |
; m1 / m2 = samples 72-79 / 80-87; m5 still holds samples 64-71.
22555 pmovzxbw m1, [r0 + 72] | |
22556 pmovzxbw m2, [r0 + 80] | |
22557 | |
; samples[i - 1] for samples 64-71: rotate by one word, then make sample 0 the
; neighbour of sample 65 (word 1).
22558 palignr m4, m5, m5, 14 | |
22559 pinsrb m4, [r0], 2 | |
22560 palignr m0, m1, m5, 2 | |
22561 | |
22562 psllw m3, m5, 1 | |
22563 paddw m4, m0 | |
22564 paddw m3, m4 | |
22565 paddw m3, [pw_2] | |
22566 psrlw m3, 2 | |
22567 | |
22568 palignr m0, m1, m5, 14 | |
22569 palignr m4, m2, m1, 2 | |
22570 | |
22571 psllw m5, m1, 1 | |
22572 paddw m4, m0 | |
22573 paddw m5, m4 | |
22574 paddw m5, [pw_2] | |
22575 psrlw m5, 2 | |
22576 packuswb m3, m5 | |
22577 movu [r1 + 64], m3 | |
22578 | |
22579 ; 80 to 95 | |
; m5 / m0 = samples 88-95 / 96-103.
22580 pmovzxbw m5, [r0 + 88] | |
22581 pmovzxbw m0, [r0 + 96] | |
22582 | |
22583 palignr m3, m2, m1, 14 | |
22584 palignr m4, m5, m2, 2 | |
22585 | |
22586 psllw m1, m2, 1 | |
22587 paddw m3, m4 | |
22588 paddw m1, m3 | |
22589 paddw m1, [pw_2] | |
22590 psrlw m1, 2 | |
22591 | |
22592 palignr m3, m5, m2, 14 | |
22593 palignr m4, m0, m5, 2 | |
22594 | |
; Result goes to m2 so that the raw samples 88-95 in m5 survive for the next group.
22595 psllw m2, m5, 1 | |
22596 paddw m4, m3 | |
22597 paddw m2, m4 | |
22598 paddw m2, [pw_2] | |
22599 psrlw m2, 2 | |
22600 packuswb m1, m2 | |
22601 movu [r1 + 80], m1 | |
22602 | |
22603 ; 96 to 111 | |
; m1 / m2 = samples 104-111 / 112-119.
22604 pmovzxbw m1, [r0 + 104] | |
22605 pmovzxbw m2, [r0 + 112] | |
22606 | |
22607 palignr m3, m0, m5, 14 | |
22608 palignr m4, m1, m0, 2 | |
22609 | |
22610 psllw m5, m0, 1 | |
22611 paddw m3, m4 | |
22612 paddw m5, m3 | |
22613 paddw m5, [pw_2] | |
22614 psrlw m5, 2 | |
22615 | |
22616 palignr m3, m1, m0, 14 | |
22617 palignr m4, m2, m1, 2 | |
22618 | |
22619 psllw m0, m1, 1 | |
22620 paddw m4, m3 | |
22621 paddw m0, m4 | |
22622 paddw m0, [pw_2] | |
22623 psrlw m0, 2 | |
22624 packuswb m5, m0 | |
22625 movu [r1 + 96], m5 | |
22626 | |
22627 ; 112 to 127 | |
; m5 / m0 = samples 120-127 / 128-135.
22628 pmovzxbw m5, [r0 + 120] | |
22629 pmovzxbw m0, [r0 + 128] | |
22630 | |
22631 palignr m3, m2, m1, 14 | |
22632 palignr m4, m5, m2, 2 | |
22633 | |
22634 psllw m1, m2, 1 | |
22635 paddw m3, m4 | |
22636 paddw m1, m3 | |
22637 paddw m1, [pw_2] | |
22638 psrlw m1, 2 | |
22639 | |
22640 palignr m3, m5, m2, 14 | |
22641 palignr m4, m0, m5, 2 | |
22642 | |
22643 psllw m5, 1 | |
22644 paddw m4, m3 | |
22645 paddw m5, m4 | |
22646 paddw m5, [pw_2] | |
22647 psrlw m5, 2 | |
22648 packuswb m1, m5 | |
22649 movu [r1 + 112], m1 | |
22650 | |
; Overwrite output byte 64 (computed from a wrapped-around neighbour) and set byte 128.
22651 mov [r1 + 64], r2b ; topLast | |
22652 mov [r1 + 128], r3b ; LeftLast | |
22653 RET | |
22654 | |
; intra_filter_4x4 (AVX2): smooths reference samples 0..16 with one YMM pass.
; r0 = references (input), r1 = filtered (output).
; The low 128-bit lane handles samples 0-7 and the high lane samples 8-15; each
; output is (s[i - 1] + 2 * s[i] + s[i + 1] + pw_2) >> 2, and samples 8 and 16
; are copied through unfiltered.  Lane comments list word 7 down to word 0.
; NOTE(review): pw_2 is defined elsewhere; presumably words of 2 (the rounding term).
; x86inc cglobal operands: 2 arguments, 4 GPRs, 4 vector registers.
22655 INIT_YMM avx2 | |
22656 cglobal intra_filter_4x4, 2,4,4 | |
; Keep the two unfiltered end samples; they are written back after the vector store.
22657 mov r2b, byte [r0 + 8] ; topLast | |
22658 mov r3b, byte [r0 + 16] ; LeftLast | |
22659 | |
22660 ; filtering top | |
; m0 = samples 0-15 as words; m2 = sample 0 replicated into every word.
22661 pmovzxbw m0, [r0] | |
22662 vpbroadcastw m2, xm0 | |
; m1 = samples 8-23 as words.
22663 pmovzxbw m1, [r0 + 8] | |
22664 | |
22665 palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0] | |
22666 pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1] | |
22667 palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2] | |
22668 palignr m1, m1, 14 ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1] | |
22669 | |
22670 psllw m0, 1 | |
22671 paddw m3, m1 | |
22672 paddw m0, m3 | |
22673 paddw m0, [pw_2] | |
22674 psrlw m0, 2 | |
22675 | |
; Pack each lane to 8 bytes, then gather qwords 0 and 2 so xm0 holds all 16 results.
22676 packuswb m0, m0 | |
22677 vpermq m0, m0, 10001000b | |
22678 | |
22679 movu [r1], xm0 | |
; Overwrite output byte 8 (computed from wrapped-around neighbours) and set byte 16.
22680 mov [r1 + 8], r2b ; topLast | |
22681 mov [r1 + 16], r3b ; LeftLast | |
22682 RET | |