Mercurial > hg > forks > libbpg
comparison x265/source/common/x86/intrapred8_allangs.asm @ 0:772086c29cc7
Initial import.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 16 Nov 2016 11:16:33 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:772086c29cc7 |
---|---|
1 ;***************************************************************************** | |
2 ;* Copyright (C) 2013 x265 project | |
3 ;* | |
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> | |
5 ;* Praveen Tiwari <praveen@multicorewareinc.com> | |
6 ;* | |
7 ;* This program is free software; you can redistribute it and/or modify | |
8 ;* it under the terms of the GNU General Public License as published by | |
9 ;* the Free Software Foundation; either version 2 of the License, or | |
10 ;* (at your option) any later version. | |
11 ;* | |
12 ;* This program is distributed in the hope that it will be useful, | |
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 ;* GNU General Public License for more details. | |
16 ;* | |
17 ;* You should have received a copy of the GNU General Public License | |
18 ;* along with this program; if not, write to the Free Software | |
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
20 ;* | |
21 ;* This program is also available under a commercial proprietary license. | |
22 ;* For more information, contact us at license @ x265.com. | |
23 ;*****************************************************************************/ | |
24 | |
25 %include "x86inc.asm" | |
26 %include "x86util.asm" | |
27 | |
28 SECTION_RODATA 32 | |
29 | |
; all_ang4_shuff: 27 rows of 32 byte indices (all values are in 0..12).
; NOTE(review): no code visible in this part of the file references the table;
; the values look like pshufb source-byte selectors for a 32-byte-wide
; implementation defined further down - confirm against its user before editing.
30 all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 | |
31 db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7 | |
32 db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6 | |
33 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5 | |
34 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5 | |
35 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4 | |
36 db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 | |
37 db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12 | |
38 db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11 | |
39 db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11 | |
40 db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10 | |
41 db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10 | |
42 db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9 | |
43 db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0 | |
44 db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1 | |
45 db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2 | |
46 db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2 | |
47 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3 | |
48 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3 | |
49 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4 | |
50 db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4 | |
51 db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5 | |
52 db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6 | |
53 db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6 | |
54 db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7 | |
55 db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8 | |
56 db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8 | |
57 | |
; all_ang4: 28 rows of 32 bytes. Each row is four 8-byte groups, and each group
; repeats one byte pair (a, b) four times; every pair in the table sums to 32.
; The second 14 rows repeat the first 14 in reverse order.
; NOTE(review): presumably pmaddubsw interpolation weights for a 32-byte-wide
; implementation defined further down; no code visible here reads the table - confirm.
58 all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8 | |
59 db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20 | |
60 db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4 | |
61 db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20 | |
62 db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4 | |
63 db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20 | |
64 db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8 | |
65 db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24 | |
66 db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12 | |
67 db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28 | |
68 db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12 | |
69 db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28 | |
70 db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12 | |
71 db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24 | |
72 db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24 | |
73 db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12 | |
74 db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28 | |
75 db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12 | |
76 db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28 | |
77 db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12 | |
78 db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24 | |
79 db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8 | |
80 db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20 | |
81 db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4 | |
82 db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20 | |
83 db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4 | |
84 db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20 | |
85 db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8 | |
86 | |
87 | |
88 SECTION .text | |
89 | |
90 ; global constant | |
91 cextern pw_1024 | |
92 | |
93 ; common constant with intrapred8.asm | |
94 cextern ang_table | |
95 cextern pw_ang_table | |
96 cextern tab_S1 | |
97 cextern tab_S2 | |
98 cextern tab_Si | |
99 cextern pw_16 | |
100 cextern pb_000000000000000F | |
101 cextern pb_0000000000000F0F | |
102 cextern pw_FFFFFFFFFFFFFFF0 | |
103 | |
104 | |
105 ;----------------------------------------------------------------------------- | |
106 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) | |
107 ;----------------------------------------------------------------------------- | |
108 INIT_XMM sse4 | |
109 cglobal all_angs_pred_4x4, 4, 4, 8 | |
110 | |
111 ; mode 2 | |
112 | |
113 movh m0, [r1 + 10] | |
114 movd [r0], m0 | |
115 | |
116 palignr m1, m0, 1 | |
117 movd [r0 + 4], m1 | |
118 | |
119 palignr m1, m0, 2 | |
120 movd [r0 + 8], m1 | |
121 | |
122 palignr m1, m0, 3 | |
123 movd [r0 + 12], m1 | |
124 | |
125 ; mode 3 | |
126 | |
127 mova m2, [pw_1024] | |
128 | |
129 pslldq m1, m0, 1 | |
130 pinsrb m1, [r1 + 9], 0 | |
131 punpcklbw m1, m0 | |
132 | |
133 lea r3, [ang_table] | |
134 | |
135 pmaddubsw m6, m1, [r3 + 26 * 16] | |
136 pmulhrsw m6, m2 | |
137 packuswb m6, m6 | |
138 movd [r0 + 16], m6 | |
139 | |
140 palignr m0, m1, 2 | |
141 | |
142 mova m7, [r3 + 20 * 16] | |
143 | |
144 pmaddubsw m3, m0, m7 | |
145 pmulhrsw m3, m2 | |
146 packuswb m3, m3 | |
147 movd [r0 + 20], m3 | |
148 | |
149 ; mode 6 [row 3] | |
150 movd [r0 + 76], m3 | |
151 | |
152 palignr m3, m1, 4 | |
153 | |
154 pmaddubsw m4, m3, [r3 + 14 * 16] | |
155 pmulhrsw m4, m2 | |
156 packuswb m4, m4 | |
157 movd [r0 + 24], m4 | |
158 | |
159 palignr m4, m1, 6 | |
160 | |
161 pmaddubsw m4, [r3 + 8 * 16] | |
162 pmulhrsw m4, m2 | |
163 packuswb m4, m4 | |
164 movd [r0 + 28], m4 | |
165 | |
166 ; mode 4 | |
167 | |
168 pmaddubsw m5, m1, [r3 + 21 * 16] | |
169 pmulhrsw m5, m2 | |
170 packuswb m5, m5 | |
171 movd [r0 + 32], m5 | |
172 | |
173 pmaddubsw m5, m0, [r3 + 10 * 16] | |
174 pmulhrsw m5, m2 | |
175 packuswb m5, m5 | |
176 movd [r0 + 36], m5 | |
177 | |
178 pmaddubsw m5, m0, [r3 + 31 * 16] | |
179 pmulhrsw m5, m2 | |
180 packuswb m5, m5 | |
181 movd [r0 + 40], m5 | |
182 | |
183 pmaddubsw m4, m3, m7 | |
184 pmulhrsw m4, m2 | |
185 packuswb m4, m4 | |
186 movd [r0 + 44], m4 | |
187 | |
188 ; mode 5 | |
189 | |
190 pmaddubsw m5, m1, [r3 + 17 * 16] | |
191 pmulhrsw m5, m2 | |
192 packuswb m5, m5 | |
193 movd [r0 + 48], m5 | |
194 | |
195 pmaddubsw m5, m0, [r3 + 2 * 16] | |
196 pmulhrsw m5, m2 | |
197 packuswb m5, m5 | |
198 movd [r0 + 52], m5 | |
199 | |
200 pmaddubsw m5, m0, [r3 + 19 * 16] | |
201 pmulhrsw m5, m2 | |
202 packuswb m5, m5 | |
203 movd [r0 + 56], m5 | |
204 | |
205 pmaddubsw m4, m3, [r3 + 4 * 16] | |
206 pmulhrsw m4, m2 | |
207 packuswb m4, m4 | |
208 movd [r0 + 60], m4 | |
209 | |
210 ; mode 6 | |
211 | |
212 pmaddubsw m5, m1, [r3 + 13 * 16] | |
213 pmulhrsw m5, m2 | |
214 packuswb m5, m5 | |
215 movd [r0 + 64], m5 | |
216 | |
217 movd [r0 + 68], m6 | |
218 | |
219 pmaddubsw m5, m0, [r3 + 7 * 16] | |
220 pmulhrsw m5, m2 | |
221 packuswb m5, m5 | |
222 movd [r0 + 72], m5 | |
223 | |
224 ; mode 7 | |
225 | |
226 pmaddubsw m5, m1, [r3 + 9 * 16] | |
227 pmulhrsw m5, m2 | |
228 packuswb m5, m5 | |
229 movd [r0 + 80], m5 | |
230 | |
231 pmaddubsw m5, m1, [r3 + 18 * 16] | |
232 pmulhrsw m5, m2 | |
233 packuswb m5, m5 | |
234 movd [r0 + 84], m5 | |
235 | |
236 pmaddubsw m5, m1, [r3 + 27 * 16] | |
237 pmulhrsw m5, m2 | |
238 packuswb m5, m5 | |
239 movd [r0 + 88], m5 | |
240 | |
241 pmaddubsw m5, m0, [r3 + 4 * 16] | |
242 pmulhrsw m5, m2 | |
243 packuswb m5, m5 | |
244 movd [r0 + 92], m5 | |
245 | |
246 ; mode 8 | |
247 | |
248 pmaddubsw m5, m1, [r3 + 5 * 16] | |
249 pmulhrsw m5, m2 | |
250 packuswb m5, m5 | |
251 movd [r0 + 96], m5 | |
252 | |
253 pmaddubsw m5, m1, [r3 + 10 * 16] | |
254 pmulhrsw m5, m2 | |
255 packuswb m5, m5 | |
256 movd [r0 + 100], m5 | |
257 | |
258 pmaddubsw m5, m1, [r3 + 15 * 16] | |
259 pmulhrsw m5, m2 | |
260 packuswb m5, m5 | |
261 movd [r0 + 104], m5 | |
262 | |
263 pmaddubsw m5, m1, [r3 + 20 * 16] | |
264 pmulhrsw m5, m2 | |
265 packuswb m5, m5 | |
266 movd [r0 + 108], m5 | |
267 | |
268 ; mode 9 | |
269 | |
270 pmaddubsw m5, m1, [r3 + 2 * 16] | |
271 pmulhrsw m5, m2 | |
272 packuswb m5, m5 | |
273 movd [r0 + 112], m5 | |
274 | |
275 pmaddubsw m5, m1, [r3 + 4 * 16] | |
276 pmulhrsw m5, m2 | |
277 packuswb m5, m5 | |
278 movd [r0 + 116], m5 | |
279 | |
280 pmaddubsw m5, m1, [r3 + 6 * 16] | |
281 pmulhrsw m5, m2 | |
282 packuswb m5, m5 | |
283 movd [r0 + 120], m5 | |
284 | |
285 pmaddubsw m5, m1, [r3 + 8 * 16] | |
286 pmulhrsw m5, m2 | |
287 packuswb m5, m5 | |
288 movd [r0 + 124], m5 | |
289 | |
290 ; mode 10 | |
291 | |
292 movd m3, [r1 + 9] | |
293 pshufd m4, m3, 0 | |
294 movu [r0 + 128], m4 | |
295 | |
296 pxor m5, m5 | |
297 movd m7, [r1 + 1] | |
298 pshufd m4, m7, 0 | |
299 punpcklbw m4, m5 | |
300 | |
301 pinsrb m7, [r1], 0 | |
302 pshufb m6, m7, m5 | |
303 punpcklbw m6, m5 | |
304 | |
305 psubw m4, m6 | |
306 psraw m4, 1 | |
307 | |
308 pshufb m6, m3, m5 | |
309 punpcklbw m6, m5 | |
310 | |
311 paddw m4, m6 | |
312 packuswb m4, m5 | |
313 | |
314 pextrb [r0 + 128], m4, 0 | |
315 pextrb [r0 + 132], m4, 1 | |
316 pextrb [r0 + 136], m4, 2 | |
317 pextrb [r0 + 140], m4, 3 | |
318 | |
319 ; mode 11 | |
320 | |
321 pslldq m1, m1, 2 | |
322 pinsrb m1, [r1], 0 | |
323 pinsrb m1, [r1 + 9], 1 | |
324 | |
325 pmaddubsw m3, m1, [r3 + 30 * 16] | |
326 pmulhrsw m3, m2 | |
327 packuswb m3, m3 | |
328 movd [r0 + 144], m3 | |
329 | |
330 pmaddubsw m3, m1, [r3 + 28 * 16] | |
331 pmulhrsw m3, m2 | |
332 packuswb m3, m3 | |
333 movd [r0 + 148], m3 | |
334 | |
335 pmaddubsw m3, m1, [r3 + 26 * 16] | |
336 pmulhrsw m3, m2 | |
337 packuswb m3, m3 | |
338 movd [r0 + 152], m3 | |
339 | |
340 pmaddubsw m3, m1, [r3 + 24 * 16] | |
341 pmulhrsw m3, m2 | |
342 packuswb m3, m3 | |
343 movd [r0 + 156], m3 | |
344 | |
345 ; mode 12 | |
346 | |
347 pmaddubsw m3, m1, [r3 + 27 * 16] | |
348 pmulhrsw m3, m2 | |
349 packuswb m3, m3 | |
350 movd [r0 + 160], m3 | |
351 | |
352 pmaddubsw m3, m1, [r3 + 22 * 16] | |
353 pmulhrsw m3, m2 | |
354 packuswb m3, m3 | |
355 movd [r0 + 164], m3 | |
356 | |
357 pmaddubsw m3, m1, [r3 + 17 * 16] | |
358 pmulhrsw m3, m2 | |
359 packuswb m3, m3 | |
360 movd [r0 + 168], m3 | |
361 | |
362 pmaddubsw m3, m1, [r3 + 12 * 16] | |
363 pmulhrsw m3, m2 | |
364 packuswb m3, m3 | |
365 movd [r0 + 172], m3 | |
366 | |
367 ; mode 13 | |
368 | |
369 pmaddubsw m3, m1, [r3 + 23 * 16] | |
370 pmulhrsw m3, m2 | |
371 packuswb m3, m3 | |
372 movd [r0 + 176], m3 | |
373 | |
374 pmaddubsw m3, m1, [r3 + 14 * 16] | |
375 pmulhrsw m3, m2 | |
376 packuswb m3, m3 | |
377 movd [r0 + 180], m3 | |
378 | |
379 pmaddubsw m3, m1, [r3 + 5 * 16] | |
380 pmulhrsw m3, m2 | |
381 packuswb m3, m3 | |
382 movd [r0 + 184], m3 | |
383 | |
384 pslldq m5, m1, 2 | |
385 pinsrb m5, [r1 + 0], 1 | |
386 pinsrb m5, [r1 + 4], 0 | |
387 | |
388 pmaddubsw m4, m5, [r3 + 28 * 16] | |
389 pmulhrsw m4, m2 | |
390 packuswb m4, m4 | |
391 movd [r0 + 188], m4 | |
392 | |
393 ; mode 14 | |
394 | |
395 pmaddubsw m4, m1, [r3 + 19 * 16] | |
396 pmulhrsw m4, m2 | |
397 packuswb m4, m4 | |
398 movd [r0 + 192], m4 | |
399 | |
400 pmaddubsw m7, m1, [r3 + 6 * 16] | |
401 pmulhrsw m7, m2 | |
402 packuswb m7, m7 | |
403 movd [r0 + 196], m7 | |
404 | |
405 pinsrb m5, [r1 + 2], 0 | |
406 | |
407 pmaddubsw m4, m5, [r3 + 25 * 16] | |
408 pmulhrsw m4, m2 | |
409 packuswb m4, m4 | |
410 movd [r0 + 200], m4 | |
411 | |
412 pmaddubsw m4, m5, [r3 + 12 * 16] | |
413 pmulhrsw m4, m2 | |
414 packuswb m4, m4 | |
415 movd [r0 + 204], m4 | |
416 | |
417 ; mode 15 | |
418 | |
419 pmaddubsw m4, m1, [r3 + 15 * 16] | |
420 pmulhrsw m4, m2 | |
421 packuswb m4, m4 | |
422 movd [r0 + 208], m4 | |
423 | |
424 pmaddubsw m4, m5, [r3 + 30 * 16] | |
425 pmulhrsw m4, m2 | |
426 packuswb m4, m4 | |
427 movd [r0 + 212], m4 | |
428 | |
429 pmaddubsw m4, m5, [r3 + 13 * 16] | |
430 pmulhrsw m4, m2 | |
431 packuswb m4, m4 | |
432 movd [r0 + 216], m4 | |
433 | |
434 pslldq m4, m5, 2 | |
435 pinsrb m4, [r1 + 2], 1 | |
436 pinsrb m4, [r1 + 4], 0 | |
437 | |
438 pmaddubsw m6, m4, [r3 + 28 * 16] | |
439 pmulhrsw m6, m2 | |
440 packuswb m6, m6 | |
441 movd [r0 + 220], m6 | |
442 | |
443 ; mode 16 | |
444 | |
445 pmaddubsw m6, m1, [r3 + 11 * 16] | |
446 pmulhrsw m6, m2 | |
447 packuswb m6, m6 | |
448 movd [r0 + 224], m6 | |
449 | |
450 pmaddubsw m6, m5, [r3 + 22 * 16] | |
451 pmulhrsw m6, m2 | |
452 packuswb m6, m6 | |
453 movd [r0 + 228], m6 | |
454 | |
455 pmaddubsw m6, m5, [r3 + 1 * 16] | |
456 pmulhrsw m6, m2 | |
457 packuswb m6, m6 | |
458 movd [r0 + 232], m6 | |
459 | |
460 pinsrb m4, [r1 + 3], 0 | |
461 | |
462 pmaddubsw m4, [r3 + 12 * 16] | |
463 pmulhrsw m4, m2 | |
464 packuswb m4, m4 | |
465 movd [r0 + 236], m4 | |
466 | |
467 ; mode 17 | |
468 | |
469 movd [r0 + 240], m7 | |
470 | |
471 pslldq m1, 2 | |
472 pinsrb m1, [r1 + 1], 0 | |
473 pinsrb m1, [r1 + 0], 1 | |
474 | |
475 pmaddubsw m3, m1, [r3 + 12 * 16] | |
476 pmulhrsw m3, m2 | |
477 packuswb m3, m3 | |
478 movd [r0 + 244], m3 | |
479 | |
480 pslldq m1, 2 | |
481 pinsrb m1, [r1 + 1], 1 | |
482 pinsrb m1, [r1 + 2], 0 | |
483 | |
484 pmaddubsw m3, m1, [r3 + 18 * 16] | |
485 pmulhrsw m3, m2 | |
486 packuswb m3, m3 | |
487 movd [r0 + 248], m3 | |
488 | |
489 pslldq m1, 2 | |
490 pinsrb m1, [r1 + 2], 1 | |
491 pinsrb m1, [r1 + 4], 0 | |
492 | |
493 pmaddubsw m1, [r3 + 24 * 16] | |
494 pmulhrsw m1, m2 | |
495 packuswb m1, m1 | |
496 movd [r0 + 252], m1 | |
497 | |
498 ; mode 18 | |
499 | |
500 movh m1, [r1] | |
501 movd [r0 + 256], m1 | |
502 | |
503 pslldq m3, m1, 1 | |
504 pinsrb m3, [r1 + 9], 0 | |
505 movd [r0 + 260], m3 | |
506 | |
507 pslldq m4, m3, 1 | |
508 pinsrb m4, [r1 + 10], 0 | |
509 movd [r0 + 264], m4 | |
510 | |
511 pslldq m4, 1 | |
512 pinsrb m4, [r1 + 11], 0 | |
513 movd [r0 + 268], m4 | |
514 | |
515 ; mode 19 | |
516 | |
517 palignr m3, m1, 1 | |
518 punpcklbw m1, m3 | |
519 | |
520 pmaddubsw m7, m1, [r3 + 6 * 16] | |
521 pmulhrsw m7, m2 | |
522 packuswb m7, m7 | |
523 movd [r0 + 272], m7 | |
524 | |
525 pslldq m3, m1, 2 | |
526 pinsrb m3, [r1], 1 | |
527 pinsrb m3, [r1 + 9], 0 | |
528 | |
529 pmaddubsw m4, m3, [r3 + 12 * 16] | |
530 pmulhrsw m4, m2 | |
531 packuswb m4, m4 | |
532 movd [r0 + 276], m4 | |
533 | |
534 pslldq m4, m3, 2 | |
535 pinsrb m4, [r1 + 9], 1 | |
536 pinsrb m4, [r1 + 10], 0 | |
537 | |
538 pmaddubsw m5, m4, [r3 + 18 * 16] | |
539 pmulhrsw m5, m2 | |
540 packuswb m5, m5 | |
541 movd [r0 + 280], m5 | |
542 | |
543 pslldq m4, 2 | |
544 pinsrb m4, [r1 + 10], 1 | |
545 pinsrb m4, [r1 + 12], 0 | |
546 | |
547 pmaddubsw m4, [r3 + 24 * 16] | |
548 pmulhrsw m4, m2 | |
549 packuswb m4, m4 | |
550 movd [r0 + 284], m4 | |
551 | |
552 ; mode 20 | |
553 | |
554 pmaddubsw m4, m1, [r3 + 11 * 16] | |
555 pmulhrsw m4, m2 | |
556 packuswb m4, m4 | |
557 movd [r0 + 288], m4 | |
558 | |
559 pinsrb m3, [r1 + 10], 0 | |
560 | |
561 pmaddubsw m4, m3, [r3 + 22 * 16] | |
562 pmulhrsw m4, m2 | |
563 packuswb m4, m4 | |
564 movd [r0 + 292], m4 | |
565 | |
566 pmaddubsw m4, m3, [r3 + 1 * 16] | |
567 pmulhrsw m4, m2 | |
568 packuswb m4, m4 | |
569 movd [r0 + 296], m4 | |
570 | |
571 pslldq m6, m3, 2 | |
572 pinsrb m6, [r1 + 10], 1 | |
573 pinsrb m6, [r1 + 11], 0 | |
574 | |
575 pmaddubsw m5, m6, [r3 + 12 * 16] | |
576 pmulhrsw m5, m2 | |
577 packuswb m5, m5 | |
578 movd [r0 + 300], m5 | |
579 | |
580 ; mode 21 | |
581 | |
582 pmaddubsw m4, m1, [r3 + 15 * 16] | |
583 pmulhrsw m4, m2 | |
584 packuswb m4, m4 | |
585 movd [r0 + 304], m4 | |
586 | |
587 pmaddubsw m4, m3, [r3 + 30 * 16] | |
588 pmulhrsw m4, m2 | |
589 packuswb m4, m4 | |
590 movd [r0 + 308], m4 | |
591 | |
592 pmaddubsw m4, m3, [r3 + 13 * 16] | |
593 pmulhrsw m4, m2 | |
594 packuswb m4, m4 | |
595 movd [r0 + 312], m4 | |
596 | |
597 pinsrb m6, [r1 + 12], 0 | |
598 | |
599 pmaddubsw m6, [r3 + 28 * 16] | |
600 pmulhrsw m6, m2 | |
601 packuswb m6, m6 | |
602 movd [r0 + 316], m6 | |
603 | |
604 ; mode 22 | |
605 | |
606 pmaddubsw m4, m1, [r3 + 19 * 16] | |
607 pmulhrsw m4, m2 | |
608 packuswb m4, m4 | |
609 movd [r0 + 320], m4 | |
610 | |
611 movd [r0 + 324], m7 | |
612 | |
613 pmaddubsw m4, m3, [r3 + 25 * 16] | |
614 pmulhrsw m4, m2 | |
615 packuswb m4, m4 | |
616 movd [r0 + 328], m4 | |
617 | |
618 pmaddubsw m4, m3, [r3 + 12 * 16] | |
619 pmulhrsw m4, m2 | |
620 packuswb m4, m4 | |
621 movd [r0 + 332], m4 | |
622 | |
623 ; mode 23 | |
624 | |
625 pmaddubsw m4, m1, [r3 + 23 * 16] | |
626 pmulhrsw m4, m2 | |
627 packuswb m4, m4 | |
628 movd [r0 + 336], m4 | |
629 | |
630 pmaddubsw m4, m1, [r3 + 14 * 16] | |
631 pmulhrsw m4, m2 | |
632 packuswb m4, m4 | |
633 movd [r0 + 340], m4 | |
634 | |
635 pmaddubsw m4, m1, [r3 + 5 * 16] | |
636 pmulhrsw m4, m2 | |
637 packuswb m4, m4 | |
638 movd [r0 + 344], m4 | |
639 | |
640 pinsrb m3, [r1 + 12], 0 | |
641 | |
642 pmaddubsw m3, [r3 + 28 * 16] | |
643 pmulhrsw m3, m2 | |
644 packuswb m3, m3 | |
645 movd [r0 + 348], m3 | |
646 | |
647 ; mode 24 | |
648 | |
649 pmaddubsw m3, m1, [r3 + 27 * 16] | |
650 pmulhrsw m3, m2 | |
651 packuswb m3, m3 | |
652 movd [r0 + 352], m3 | |
653 | |
654 pmaddubsw m3, m1, [r3 + 22 * 16] | |
655 pmulhrsw m3, m2 | |
656 packuswb m3, m3 | |
657 movd [r0 + 356], m3 | |
658 | |
659 pmaddubsw m3, m1, [r3 + 17 * 16] | |
660 pmulhrsw m3, m2 | |
661 packuswb m3, m3 | |
662 movd [r0 + 360], m3 | |
663 | |
664 pmaddubsw m3, m1, [r3 + 12 * 16] | |
665 pmulhrsw m3, m2 | |
666 packuswb m3, m3 | |
667 movd [r0 + 364], m3 | |
668 | |
669 ; mode 25 | |
670 | |
671 pmaddubsw m3, m1, [r3 + 30 * 16] | |
672 pmulhrsw m3, m2 | |
673 packuswb m3, m3 | |
674 movd [r0 + 368], m3 | |
675 | |
676 pmaddubsw m3, m1, [r3 + 28 * 16] | |
677 pmulhrsw m3, m2 | |
678 packuswb m3, m3 | |
679 movd [r0 + 372], m3 | |
680 | |
681 pmaddubsw m3, m1, [r3 + 26 * 16] | |
682 pmulhrsw m3, m2 | |
683 packuswb m3, m3 | |
684 movd [r0 + 376], m3 | |
685 | |
686 pmaddubsw m1, [r3 + 24 * 16] | |
687 pmulhrsw m1, m2 | |
688 packuswb m1, m1 | |
689 movd [r0 + 380], m1 | |
690 | |
691 ; mode 26 | |
692 | |
693 movh m1, [r1 + 1] | |
694 pshufd m3, m1, 0 | |
695 movu [r0 + 384], m3 | |
696 | |
697 pxor m4, m4 | |
698 movd m5, [r1 + 9] | |
699 pshufd m5, m5, 0 | |
700 punpcklbw m5, m4 | |
701 | |
702 pinsrb m6, [r1], 0 | |
703 pshufb m6, m4 | |
704 punpcklbw m6, m4 | |
705 | |
706 psubw m5, m6 | |
707 psraw m5, 1 | |
708 | |
709 pshufb m6, m1, m4 | |
710 punpcklbw m6, m4 | |
711 | |
712 paddw m5, m6 | |
713 packuswb m5, m4 | |
714 | |
715 pextrb [r0 + 384], m5, 0 | |
716 pextrb [r0 + 388], m5, 1 | |
717 pextrb [r0 + 392], m5, 2 | |
718 pextrb [r0 + 396], m5, 3 | |
719 | |
720 ; mode 27 | |
721 | |
722 palignr m3, m1, 1 | |
723 punpcklbw m1, m3 | |
724 | |
725 pmaddubsw m3, m1, [r3 + 2 * 16] | |
726 pmulhrsw m3, m2 | |
727 packuswb m3, m3 | |
728 movd [r0 + 400], m3 | |
729 | |
730 pmaddubsw m3, m1, [r3 + 4 * 16] | |
731 pmulhrsw m3, m2 | |
732 packuswb m3, m3 | |
733 movd [r0 + 404], m3 | |
734 | |
735 pmaddubsw m3, m1, [r3 + 6 * 16] | |
736 pmulhrsw m3, m2 | |
737 packuswb m3, m3 | |
738 movd [r0 + 408], m3 | |
739 | |
740 pmaddubsw m3, m1, [r3 + 8 * 16] | |
741 pmulhrsw m3, m2 | |
742 packuswb m3, m3 | |
743 movd [r0 + 412], m3 | |
744 | |
745 ; mode 28 | |
746 | |
747 pmaddubsw m3, m1, [r3 + 5 * 16] | |
748 pmulhrsw m3, m2 | |
749 packuswb m3, m3 | |
750 movd [r0 + 416], m3 | |
751 | |
752 pmaddubsw m3, m1, [r3 + 10 * 16] | |
753 pmulhrsw m3, m2 | |
754 packuswb m3, m3 | |
755 movd [r0 + 420], m3 | |
756 | |
757 pmaddubsw m3, m1, [r3 + 15 * 16] | |
758 pmulhrsw m3, m2 | |
759 packuswb m3, m3 | |
760 movd [r0 + 424], m3 | |
761 | |
762 pmaddubsw m3, m1, [r3 + 20 * 16] | |
763 pmulhrsw m3, m2 | |
764 packuswb m3, m3 | |
765 movd [r0 + 428], m3 | |
766 | |
767 ; mode 29 | |
768 | |
769 pmaddubsw m3, m1, [r3 + 9 * 16] | |
770 pmulhrsw m3, m2 | |
771 packuswb m3, m3 | |
772 movd [r0 + 432], m3 | |
773 | |
774 pmaddubsw m3, m1, [r3 + 18 * 16] | |
775 pmulhrsw m3, m2 | |
776 packuswb m3, m3 | |
777 movd [r0 + 436], m3 | |
778 | |
779 pmaddubsw m3, m1, [r3 + 27 * 16] | |
780 pmulhrsw m3, m2 | |
781 packuswb m3, m3 | |
782 movd [r0 + 440], m3 | |
783 | |
784 palignr m3, m1, 2 | |
785 | |
786 pmaddubsw m4, m3, [r3 + 4 * 16] | |
787 pmulhrsw m4, m2 | |
788 packuswb m4, m4 | |
789 movd [r0 + 444], m4 | |
790 | |
791 ; mode 30 | |
792 | |
793 pmaddubsw m4, m1, [r3 + 13 * 16] | |
794 pmulhrsw m4, m2 | |
795 packuswb m4, m4 | |
796 movd [r0 + 448], m4 | |
797 | |
798 pmaddubsw m7, m1, [r3 + 26 * 16] | |
799 pmulhrsw m7, m2 | |
800 packuswb m7, m7 | |
801 movd [r0 + 452], m7 | |
802 | |
803 pmaddubsw m5, m3, [r3 + 7 * 16] | |
804 pmulhrsw m5, m2 | |
805 packuswb m5, m5 | |
806 movd [r0 + 456], m5 | |
807 | |
808 pmaddubsw m6, m3, [r3 + 20 * 16] | |
809 pmulhrsw m6, m2 | |
810 packuswb m6, m6 | |
811 movd [r0 + 460], m6 | |
812 | |
813 ; mode 31 | |
814 | |
815 pmaddubsw m4, m1, [r3 + 17 * 16] | |
816 pmulhrsw m4, m2 | |
817 packuswb m4, m4 | |
818 movd [r0 + 464], m4 | |
819 | |
820 pmaddubsw m5, m3, [r3 + 2 * 16] | |
821 pmulhrsw m5, m2 | |
822 packuswb m5, m5 | |
823 movd [r0 + 468], m5 | |
824 | |
825 pmaddubsw m5, m3, [r3 + 19 * 16] | |
826 pmulhrsw m5, m2 | |
827 packuswb m5, m5 | |
828 movd [r0 + 472], m5 | |
829 | |
830 palignr m4, m3, 2 | |
831 | |
832 pmaddubsw m5, m4, [r3 + 4 * 16] | |
833 pmulhrsw m5, m2 | |
834 packuswb m5, m5 | |
835 movd [r0 + 476], m5 | |
836 | |
837 ; mode 32 | |
838 | |
839 pmaddubsw m5, m1, [r3 + 21 * 16] | |
840 pmulhrsw m5, m2 | |
841 packuswb m5, m5 | |
842 movd [r0 + 480], m5 | |
843 | |
844 pmaddubsw m5, m3, [r3 + 10 * 16] | |
845 pmulhrsw m5, m2 | |
846 packuswb m5, m5 | |
847 movd [r0 + 484], m5 | |
848 | |
849 pmaddubsw m5, m3, [r3 + 31 * 16] | |
850 pmulhrsw m5, m2 | |
851 packuswb m5, m5 | |
852 movd [r0 + 488], m5 | |
853 | |
854 pmaddubsw m5, m4, [r3 + 20 * 16] | |
855 pmulhrsw m5, m2 | |
856 packuswb m5, m5 | |
857 movd [r0 + 492], m5 | |
858 | |
859 ; mode 33 | |
860 | |
861 movd [r0 + 496], m7 | |
862 | |
863 movd [r0 + 500], m6 | |
864 | |
865 pmaddubsw m5, m4, [r3 + 14 * 16] | |
866 pmulhrsw m5, m2 | |
867 packuswb m5, m5 | |
868 movd [r0 + 504], m5 | |
869 | |
870 psrldq m4, 2 | |
871 | |
872 pmaddubsw m4, [r3 + 8 * 16] | |
873 pmulhrsw m4, m2 | |
874 packuswb m4, m4 | |
875 movd [r0 + 508], m4 | |
876 | |
877 ; mode 34 | |
878 | |
879 movh m7, [r1 + 2] | |
880 movd [r0 + 512], m7 | |
881 | |
882 psrldq m7, 1 | |
883 movd [r0 + 516], m7 | |
884 | |
885 psrldq m7, 1 | |
886 movd [r0 + 520], m7 | |
887 | |
888 psrldq m7, 1 | |
889 movd [r0 + 524], m7 | |
890 | |
891 RET | |
892 | |
893 ;------------------------------------------------------------------------------ | |
894 ; void all_angs_pred_8x8(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) | |
895 ;------------------------------------------------------------------------------ | |
896 INIT_XMM sse4 | |
897 cglobal all_angs_pred_8x8, 3,4,8 | |
898 ; mode 2 | |
899 | |
900 movu m0, [r2 + 18] | |
901 palignr m1, m0, 1 | |
902 punpcklqdq m2, m0, m1 | |
903 movu [r0], m2 | |
904 | |
905 palignr m1, m0, 2 | |
906 palignr m2, m0, 3 | |
907 punpcklqdq m1, m2 | |
908 movu [r0 + 16], m1 | |
909 | |
910 palignr m1, m0, 4 | |
911 palignr m2, m0, 5 | |
912 punpcklqdq m1, m2 | |
913 movu [r0 + 32], m1 | |
914 | |
915 palignr m1, m0, 6 | |
916 palignr m2, m0, 7 | |
917 punpcklqdq m1, m2 | |
918 movu [r0 + 48], m1 | |
919 | |
920 ; mode 3 [row 0, 1] | |
921 | |
922 mova m7, [pw_1024] | |
923 lea r3, [ang_table] | |
924 | |
925 movu m0, [r1 + 17] | |
926 | |
927 palignr m1, m0, 1 | |
928 palignr m2, m0, 2 | |
929 | |
930 punpcklbw m3, m0, m1 | |
931 pmaddubsw m4, m3, [r3 + 26 * 16] | |
932 pmulhrsw m4, m7 | |
933 | |
934 punpcklbw m1, m2 | |
935 pmaddubsw m5, m1, [r3 + 20 * 16] | |
936 pmulhrsw m5, m7 | |
937 | |
938 packuswb m4, m5 | |
939 | |
940 movu [r0 + 64], m4 | |
941 | |
942 ; mode 6 [row 1] | |
943 | |
944 movh [r0 + 264], m4 | |
945 | |
946 ; mode 6 [row 3] | |
947 | |
948 movhps [r0 + 280], m4 | |
949 | |
950 ; mode 4 [row 0, 1] | |
951 | |
952 pmaddubsw m4, m3, [r3 + 21 * 16] | |
953 pmulhrsw m4, m7 | |
954 | |
955 pmaddubsw m5, m1, [r3 + 10 * 16] | |
956 pmulhrsw m5, m7 | |
957 | |
958 packuswb m4, m5 | |
959 movu [r0 + 128], m4 | |
960 | |
961 ; mode 5 [row 0, 1] | |
962 | |
963 pmaddubsw m4, m3, [r3 + 17 * 16] | |
964 pmulhrsw m4, m7 | |
965 | |
966 pmaddubsw m5, m1, [r3 + 2 * 16] | |
967 pmulhrsw m5, m7 | |
968 | |
969 packuswb m4, m5 | |
970 movu [r0 + 192], m4 | |
971 | |
972 ; mode 6 [row 0] | |
973 | |
974 pmaddubsw m4, m3, [r3 + 13 * 16] | |
975 pmulhrsw m4, m7 | |
976 | |
977 pxor m5, m5 | |
978 | |
979 packuswb m4, m5 | |
980 movh [r0 + 256], m4 | |
981 | |
982 ; mode 7 [row 0, 1] | |
983 | |
984 pmaddubsw m4, m3, [r3 + 9 * 16] | |
985 pmulhrsw m4, m7 | |
986 | |
987 pmaddubsw m5, m3, [r3 + 18 * 16] | |
988 pmulhrsw m5, m7 | |
989 | |
990 packuswb m4, m5 | |
991 movu [r0 + 320], m4 | |
992 | |
993 ; mode 8 [row 0, 1] | |
994 | |
995 pmaddubsw m4, m3, [r3 + 5 * 16] | |
996 pmulhrsw m4, m7 | |
997 | |
998 pmaddubsw m5, m3, [r3 + 10 * 16] | |
999 pmulhrsw m5, m7 | |
1000 | |
1001 packuswb m4, m5 | |
1002 movu [r0 + 384], m4 | |
1003 | |
1004 ; mode 8 [row 2, 3] | |
1005 | |
1006 pmaddubsw m4, m3, [r3 + 15 * 16] | |
1007 pmulhrsw m4, m7 | |
1008 | |
1009 pmaddubsw m5, m3, [r3 + 20 * 16] | |
1010 pmulhrsw m5, m7 | |
1011 | |
1012 packuswb m4, m5 | |
1013 movu [r0 + 400], m4 | |
1014 | |
1015 ; mode 8 [row 4, 5] | |
1016 | |
1017 pmaddubsw m4, m3, [r3 + 25 * 16] | |
1018 pmulhrsw m4, m7 | |
1019 | |
1020 pmaddubsw m5, m3, [r3 + 30 * 16] | |
1021 pmulhrsw m5, m7 | |
1022 | |
1023 packuswb m4, m5 | |
1024 movu [r0 + 416], m4 | |
1025 | |
1026 ; mode 8 [row 6, 7] | |
1027 | |
1028 pmaddubsw m4, m1, [r3 + 3 * 16] | |
1029 pmulhrsw m4, m7 | |
1030 | |
1031 pmaddubsw m5, m1, [r3 + 8 * 16] | |
1032 pmulhrsw m5, m7 | |
1033 | |
1034 packuswb m4, m5 | |
1035 movu [r0 + 432], m4 | |
1036 | |
1037 ; mode 9 [row 0, 1] | |
1038 | |
1039 pmaddubsw m4, m3, [r3 + 2 * 16] | |
1040 pmulhrsw m4, m7 | |
1041 | |
1042 pmaddubsw m5, m3, [r3 + 4 * 16] | |
1043 pmulhrsw m5, m7 | |
1044 | |
1045 packuswb m4, m5 | |
1046 movu [r0 + 448], m4 | |
1047 | |
1048 ; mode 9 [row 2, 3] | |
1049 | |
1050 pmaddubsw m4, m3, [r3 + 6 * 16] | |
1051 pmulhrsw m4, m7 | |
1052 | |
1053 pmaddubsw m5, m3, [r3 + 8 * 16] | |
1054 pmulhrsw m5, m7 | |
1055 | |
1056 packuswb m4, m5 | |
1057 movu [r0 + 464], m4 | |
1058 | |
1059 ; mode 9 [row 4, 5] | |
1060 | |
1061 pmaddubsw m4, m3, [r3 + 10 * 16] | |
1062 pmulhrsw m4, m7 | |
1063 | |
1064 pmaddubsw m5, m3, [r3 + 12 * 16] | |
1065 pmulhrsw m5, m7 | |
1066 | |
1067 packuswb m4, m5 | |
1068 movu [r0 + 480], m4 | |
1069 | |
1070 ; mode 9 [row 6, 7] | |
1071 | |
1072 pmaddubsw m4, m3, [r3 + 14 * 16] | |
1073 pmulhrsw m4, m7 | |
1074 | |
1075 pmaddubsw m5, m3, [r3 + 16 * 16] | |
1076 pmulhrsw m5, m7 | |
1077 | |
1078 packuswb m4, m5 | |
1079 movu [r0 + 496], m4 | |
1080 | |
1081 ; mode 7 [row 2, 3] | |
1082 | |
1083 pmaddubsw m4, m3, [r3 + 27 * 16] | |
1084 pmulhrsw m4, m7 | |
1085 | |
1086 pmaddubsw m5, m1, [r3 + 4 * 16] | |
1087 pmulhrsw m5, m7 | |
1088 | |
1089 packuswb m4, m5 | |
1090 movu [r0 + 336], m4 | |
1091 | |
1092 ; mode 7 [row 4, 5] | |
1093 | |
1094 pmaddubsw m4, m1, [r3 + 13 * 16] | |
1095 pmulhrsw m4, m7 | |
1096 | |
1097 pmaddubsw m5, m1, [r3 + 22 * 16] | |
1098 pmulhrsw m5, m7 | |
1099 | |
1100 packuswb m4, m5 | |
1101 movu [r0 + 352], m4 | |
1102 | |
1103 ; mode 6 [row 2] | |
1104 | |
1105 pmaddubsw m4, m1, [r3 + 7 * 16] | |
1106 pmulhrsw m4, m7 | |
1107 | |
1108 pxor m5, m5 | |
1109 | |
1110 packuswb m4, m5 | |
1111 movh [r0 + 272], m4 | |
1112 | |
1113 ; mode 3 [row 2, 3] | |
1114 | |
1115 palignr m1, m0, 3 | |
1116 palignr m3, m0, 4 | |
1117 | |
1118 punpcklbw m2, m1 | |
1119 pmaddubsw m5, m2, [r3 + 14 * 16] | |
1120 pmulhrsw m5, m7 | |
1121 | |
1122 punpcklbw m1, m3 | |
1123 pmaddubsw m6, m1, [r3 + 8 * 16] | |
1124 pmulhrsw m6, m7 | |
1125 | |
1126 packuswb m5, m6 | |
1127 movu [r0 + 80], m5 | |
1128 | |
1129 ; mode 6 [row 7] | |
1130 | |
1131 movhps [r0 + 312], m5 | |
1132 | |
1133 ; mode 6 [row 5] | |
1134 | |
1135 movh [r0 + 296], m5 | |
1136 | |
1137 ; mode 4 [calculate and store row 4, 5] | |
1138 | |
1139 pmaddubsw m4, m1, [r3 + 9 * 16] | |
1140 pmulhrsw m4, m7 | |
1141 | |
1142 pmaddubsw m5, m1, [r3 + 30 * 16] | |
1143 pmulhrsw m5, m7 | |
1144 | |
1145 packuswb m4, m5 | |
1146 movu [r0 + 160], m4 | |
1147 | |
1148 ; mode 5 [row 4, 5] | |
1149 | |
1150 pmaddubsw m4, m2, [r3 + 21 * 16] | |
1151 pmulhrsw m4, m7 | |
1152 | |
1153 pmaddubsw m5, m1, [r3 + 6 * 16] | |
1154 pmulhrsw m5, m7 | |
1155 | |
1156 packuswb m4, m5 | |
1157 movu [r0 + 224], m4 | |
1158 | |
1159 ; mode 6 [row 4] | |
1160 | |
1161 pmaddubsw m5, m2, [r3 + 1 * 16] | |
1162 pmulhrsw m5, m7 | |
1163 | |
1164 pxor m6, m6 | |
1165 | |
1166 packuswb m5, m6 | |
1167 movh [r0 + 288], m5 | |
1168 | |
1169 ; mode 6 [row 6] | |
1170 | |
1171 pmaddubsw m5, m2, [r3 + 27 * 16] | |
1172 pmulhrsw m5, m7 | |
1173 | |
1174 pxor m6, m6 | |
1175 | |
1176 packuswb m5, m6 | |
1177 movh [r0 + 304], m5 | |
1178 | |
1179 ; mode 5 [calculate row 6] | |
1180 | |
1181 pmaddubsw m6, m1, [r3 + 23 * 16] | |
1182 pmulhrsw m6, m7 | |
1183 | |
1184 ; mode 3 [row 4, 5] | |
1185 | |
1186 palignr m1, m0, 5 | |
1187 | |
1188 punpcklbw m3, m1 | |
1189 pmaddubsw m4, m3, [r3 + 2 * 16] | |
1190 pmulhrsw m4, m7 | |
1191 | |
1192 pmaddubsw m5, m3, [r3 + 28 * 16] | |
1193 pmulhrsw m5, m7 | |
1194 | |
1195 packuswb m4, m5 | |
1196 movu [r0 + 96], m4 | |
1197 | |
1198 ; mode 4 [calculate row 6] | |
1199 | |
1200 pmaddubsw m5, m3, [r3 + 19 * 16] | |
1201 pmulhrsw m5, m7 | |
1202 | |
1203 ; mode 5 [calculate row 7, store row 6 and 7] | |
1204 | |
1205 pmaddubsw m4, m3, [r3 + 8 * 16] | |
1206 pmulhrsw m4, m7 | |
1207 | |
1208 packuswb m6, m4 | |
1209 movu [r0 + 240], m6 | |
1210 | |
1211 ; mode 3 [row 6, 7] | |
1212 | |
1213 palignr m2, m0, 6 | |
1214 palignr m3, m0, 7 | |
1215 | |
1216 punpcklbw m1, m2 | |
1217 pmaddubsw m4, m1, [r3 + 22 * 16] | |
1218 pmulhrsw m4, m7 | |
1219 | |
1220 punpcklbw m2, m3 | |
1221 pmaddubsw m2, [r3 + 16 * 16] | |
1222 pmulhrsw m2, m7 | |
1223 | |
1224 packuswb m4, m2 | |
1225 movu [r0 + 112], m4 | |
1226 | |
1227 ; mode 4 [calculate row 7] | |
1228 | |
1229 pmaddubsw m2, m1, [r3 + 8 * 16] | |
1230 pmulhrsw m2, m7 | |
1231 | |
1232 ; mode 4 [store row 6 and 7] | |
1233 | |
1234 packuswb m5, m2 | |
1235 movu [r0 + 176], m5 | |
1236 | |
1237 ; mode 4 [row 2, 3] | |
1238 | |
1239 palignr m1, m0, 1 | |
1240 palignr m2, m0, 2 | |
1241 palignr m3, m0, 3 | |
1242 | |
1243 punpcklbw m1, m2 | |
1244 pmaddubsw m4, m1, [r3 + 31 * 16] | |
1245 pmulhrsw m4, m7 | |
1246 | |
1247 punpcklbw m2, m3 | |
1248 pmaddubsw m5, m2, [r3 + 20 * 16] | |
1249 pmulhrsw m5, m7 | |
1250 | |
1251 packuswb m4, m5 | |
1252 movu [r0 + 144], m4 | |
1253 | |
1254 ; mode 5 [row 2, 3] | |
1255 | |
1256 pmaddubsw m4, m1, [r3 + 19 * 16] | |
1257 pmulhrsw m4, m7 | |
1258 | |
1259 pmaddubsw m5, m2, [r3 + 4 * 16] | |
1260 pmulhrsw m5, m7 | |
1261 | |
1262 packuswb m4, m5 | |
1263 movu [r0 + 208], m4 | |
1264 | |
1265 ; mode 7 [row 6, 7] | |
1266 | |
1267 pmaddubsw m4, m1, [r3 + 31 * 16] | |
1268 pmulhrsw m4, m7 | |
1269 | |
1270 pmaddubsw m5, m2, [r3 + 8 * 16] | |
1271 pmulhrsw m5, m7 | |
1272 | |
1273 packuswb m4, m5 | |
1274 movu [r0 + 368], m4 | |
1275 | |
1276 ; mode 10 | |
1277 | |
1278 pshufb m1, m0, [tab_Si] | |
1279 movu [r0 + 512], m1 | |
1280 movu [r0 + 528], m1 | |
1281 movu [r0 + 544], m1 | |
1282 movu [r0 + 560], m1 | |
1283 | |
1284 pxor m0, m0 | |
1285 | |
1286 pshufb m1, m1, m0 | |
1287 punpcklbw m1, m0 | |
1288 | |
1289 movu m2, [r1] | |
1290 | |
1291 pshufb m3, m2, m0 | |
1292 punpcklbw m3, m0 | |
1293 | |
1294 psrldq m4, m2, 1 | |
1295 punpcklbw m4, m0 | |
1296 | |
1297 movu m2, [r1 + 9] | |
1298 punpcklbw m2, m0 | |
1299 | |
1300 psubw m4, m3 | |
1301 psubw m2, m3 | |
1302 | |
1303 psraw m4, 1 | |
1304 psraw m2, 1 | |
1305 | |
1306 paddw m4, m1 | |
1307 paddw m2, m1 | |
1308 | |
1309 packuswb m4, m2 | |
1310 | |
1311 pextrb [r0 + 512], m4, 0 | |
1312 pextrb [r0 + 520], m4, 1 | |
1313 pextrb [r0 + 528], m4, 2 | |
1314 pextrb [r0 + 536], m4, 3 | |
1315 pextrb [r0 + 544], m4, 4 | |
1316 pextrb [r0 + 552], m4, 5 | |
1317 pextrb [r0 + 560], m4, 6 | |
1318 pextrb [r0 + 568], m4, 7 | |
1319 | |
1320 ; mode 11 [row 0, 1] | |
1321 | |
1322 movu m0, [r1 + 16] | |
1323 pinsrb m0, [r1], 0 | |
1324 palignr m1, m0, 1 | |
1325 punpcklbw m2, m0, m1 | |
1326 | |
1327 pmaddubsw m3, m2, [r3 + 30 * 16] | |
1328 pmulhrsw m3, m7 | |
1329 | |
1330 pmaddubsw m4, m2, [r3 + 28 * 16] | |
1331 pmulhrsw m4, m7 | |
1332 | |
1333 packuswb m3, m4 | |
1334 movu [r0 + 576], m3 | |
1335 | |
1336 ; mode 11 [row 2, 3] | |
1337 | |
1338 pmaddubsw m3, m2, [r3 + 26 * 16] | |
1339 pmulhrsw m3, m7 | |
1340 | |
1341 pmaddubsw m4, m2, [r3 + 24 * 16] | |
1342 pmulhrsw m4, m7 | |
1343 | |
1344 packuswb m3, m4 | |
1345 movu [r0 + 592], m3 | |
1346 | |
1347 ; mode 11 [row 4, 5] | |
1348 | |
1349 pmaddubsw m3, m2, [r3 + 22 * 16] | |
1350 pmulhrsw m3, m7 | |
1351 | |
1352 pmaddubsw m4, m2, [r3 + 20 * 16] | |
1353 pmulhrsw m4, m7 | |
1354 | |
1355 packuswb m5, m3, m4 | |
1356 movu [r0 + 608], m5 | |
1357 | |
1358 ; mode 12 [row 0, 1] | |
1359 | |
1360 pmaddubsw m4, m2, [r3 + 27 * 16] | |
1361 pmulhrsw m4, m7 | |
1362 | |
1363 packuswb m4, m3 | |
1364 movu [r0 + 640], m4 | |
1365 | |
1366 ; mode 11 [row 6, 7] | |
1367 | |
1368 pmaddubsw m3, m2, [r3 + 18 * 16] | |
1369 pmulhrsw m3, m7 | |
1370 | |
1371 pmaddubsw m4, m2, [r3 + 16 * 16] | |
1372 pmulhrsw m4, m7 | |
1373 | |
1374 packuswb m3, m4 | |
1375 movu [r0 + 624], m3 | |
1376 | |
1377 ; mode 12 [row 2, 3] | |
1378 | |
1379 pmaddubsw m3, m2, [r3 + 17 * 16] | |
1380 pmulhrsw m3, m7 | |
1381 | |
1382 pmaddubsw m4, m2, [r3 + 12 * 16] | |
1383 pmulhrsw m4, m7 | |
1384 | |
1385 packuswb m3, m4 | |
1386 movu [r0 + 656], m3 | |
1387 | |
1388 ; mode 12 [row 4, 5] | |
1389 | |
1390 pmaddubsw m3, m2, [r3 + 7 * 16] | |
1391 pmulhrsw m3, m7 | |
1392 | |
1393 pmaddubsw m4, m2, [r3 + 2 * 16] | |
1394 pmulhrsw m4, m7 | |
1395 | |
1396 packuswb m3, m4 | |
1397 movu [r0 + 672], m3 | |
1398 | |
1399 ; mode 12 [row 6, 7] | |
1400 | |
1401 pslldq m3, m2, 2 | |
1402 pinsrb m3, [r1 + 0], 1 | |
1403 pinsrb m3, [r1 + 6], 0 | |
1404 | |
1405 pmaddubsw m4, m3, [r3 + 29 * 16] | |
1406 pmulhrsw m4, m7 | |
1407 | |
1408 pmaddubsw m5, m3, [r3 + 24 * 16] | |
1409 pmulhrsw m5, m7 | |
1410 | |
1411 packuswb m4, m5 | |
1412 movu [r0 + 688], m4 | |
1413 | |
1414 ; mode 13 [row 0, 1] | |
1415 | |
1416 pmaddubsw m4, m2, [r3 + 23 * 16] | |
1417 pmulhrsw m4, m7 | |
1418 | |
1419 pmaddubsw m5, m2, [r3 + 14 * 16] | |
1420 pmulhrsw m5, m7 | |
1421 | |
1422 packuswb m4, m5 | |
1423 movu [r0 + 704], m4 | |
1424 | |
1425 ; mode 13 [row 2, 3] | |
1426 | |
1427 pmaddubsw m4, m2, [r3 + 5 * 16] | |
1428 pmulhrsw m4, m7 | |
1429 | |
1430 pinsrb m3, [r1 + 4], 0 | |
1431 pmaddubsw m5, m3, [r3 + 28 * 16] | |
1432 pmulhrsw m5, m7 | |
1433 | |
1434 packuswb m4, m5 | |
1435 movu [r0 + 720], m4 | |
1436 | |
1437 ; mode 13 [row 4, 5] | |
1438 | |
1439 pmaddubsw m4, m3, [r3 + 19 * 16] | |
1440 pmulhrsw m4, m7 | |
1441 | |
1442 pmaddubsw m5, m3, [r3 + 10 * 16] | |
1443 pmulhrsw m5, m7 | |
1444 | |
1445 packuswb m4, m5 | |
1446 movu [r0 + 736], m4 | |
1447 | |
1448 ; mode 13 [row 6, 7] | |
1449 | |
1450 pmaddubsw m4, m3, [r3 + 1 * 16] | |
1451 pmulhrsw m4, m7 | |
1452 | |
1453 pslldq m5, m3, 2 | |
1454 pinsrb m5, [r1 + 4], 1 | |
1455 pinsrb m5, [r1 + 7], 0 | |
1456 | |
1457 pmaddubsw m5, [r3 + 24 * 16] | |
1458 pmulhrsw m5, m7 | |
1459 | |
1460 packuswb m4, m5 | |
1461 movu [r0 + 752], m4 | |
1462 | |
1463 ; mode 14 [row 0, 1] | |
1464 | |
1465 pmaddubsw m4, m2, [r3 + 19 * 16] | |
1466 pmulhrsw m4, m7 | |
1467 | |
1468 pmaddubsw m5, m2, [r3 + 6 * 16] | |
1469 pmulhrsw m5, m7 | |
1470 | |
1471 packuswb m4, m5 | |
1472 movu [r0 + 768], m4 | |
1473 | |
1474 ; mode 14 [row 2, 3] | |
1475 | |
1476 pinsrb m3, [r1 + 2], 0 | |
1477 | |
1478 pmaddubsw m4, m3, [r3 + 25 * 16] | |
1479 pmulhrsw m4, m7 | |
1480 | |
1481 pmaddubsw m5, m3, [r3 + 12 * 16] | |
1482 pmulhrsw m5, m7 | |
1483 | |
1484 packuswb m4, m5 | |
1485 movu [r0 + 784], m4 | |
1486 | |
1487 ; mode 14 [row 4, 5] | |
1488 | |
1489 pslldq m1, m3, 2 | |
1490 pinsrb m1, [r1 + 2], 1 | |
1491 pinsrb m1, [r1 + 5], 0 | |
1492 | |
1493 pmaddubsw m4, m1, [r3 + 31 * 16] | |
1494 pmulhrsw m4, m7 | |
1495 | |
1496 pmaddubsw m5, m1, [r3 + 18 * 16] | |
1497 pmulhrsw m5, m7 | |
1498 | |
1499 packuswb m4, m5 | |
1500 movu [r0 + 800], m4 | |
1501 | |
1502 ; mode 14 [row 6, 7] | |
1503 | |
1504 pmaddubsw m4, m1, [r3 + 5 * 16] | |
1505 pmulhrsw m4, m7 | |
1506 | |
1507 pslldq m1, 2 | |
1508 pinsrb m1, [r1 + 5], 1 | |
1509 pinsrb m1, [r1 + 7], 0 | |
1510 | |
1511 pmaddubsw m5, m1, [r3 + 24 * 16] | |
1512 pmulhrsw m5, m7 | |
1513 | |
1514 packuswb m4, m5 | |
1515 movu [r0 + 816], m4 | |
1516 | |
1517 ; mode 15 [row 0, 1] | |
1518 | |
1519 pmaddubsw m4, m2, [r3 + 15 * 16] | |
1520 pmulhrsw m4, m7 | |
1521 | |
1522 pmaddubsw m5, m3, [r3 + 30 * 16] | |
1523 pmulhrsw m5, m7 | |
1524 | |
1525 packuswb m4, m5 | |
1526 movu [r0 + 832], m4 | |
1527 | |
1528 ; mode 15 [row 2, 3] | |
1529 | |
1530 pmaddubsw m4, m3, [r3 + 13 * 16] | |
1531 pmulhrsw m4, m7 | |
1532 | |
1533 pslldq m1, m3, 2 | |
1534 pinsrb m1, [r1 + 2], 1 | |
1535 pinsrb m1, [r1 + 4], 0 | |
1536 | |
1537 pmaddubsw m5, m1, [r3 + 28 * 16] | |
1538 pmulhrsw m5, m7 | |
1539 | |
1540 packuswb m4, m5 | |
1541 movu [r0 + 848], m4 | |
1542 | |
1543 ; mode 15 [row 4, 5] | |
1544 | |
1545 pmaddubsw m4, m1, [r3 + 11 * 16] | |
1546 pmulhrsw m4, m7 | |
1547 | |
1548 pslldq m1, 2 | |
1549 pinsrb m1, [r1 + 4], 1 | |
1550 pinsrb m1, [r1 + 6], 0 | |
1551 | |
1552 pmaddubsw m5, m1, [r3 + 26 * 16] | |
1553 pmulhrsw m5, m7 | |
1554 | |
1555 packuswb m4, m5 | |
1556 movu [r0 + 864], m4 | |
1557 | |
1558 ; mode 15 [row 6, 7] | |
1559 | |
1560 pmaddubsw m4, m1, [r3 + 9 * 16] | |
1561 pmulhrsw m4, m7 | |
1562 | |
1563 pslldq m1, 2 | |
1564 pinsrb m1, [r1 + 6], 1 | |
1565 pinsrb m1, [r1 + 8], 0 | |
1566 | |
1567 pmaddubsw m1, [r3 + 24 * 16] | |
1568 pmulhrsw m1, m7 | |
1569 | |
1570 packuswb m4, m1 | |
1571 movu [r0 + 880], m4 | |
1572 | |
1573 ; mode 16 [row 0, 1] | |
1574 | |
1575 pmaddubsw m4, m2, [r3 + 11 * 16] | |
1576 pmulhrsw m4, m7 | |
1577 | |
1578 pmaddubsw m5, m3, [r3 + 22 * 16] | |
1579 pmulhrsw m5, m7 | |
1580 | |
1581 packuswb m4, m5 | |
1582 movu [r0 + 896], m4 | |
1583 | |
1584 ; mode 16 [row 2, 3] | |
1585 | |
1586 pmaddubsw m4, m3, [r3 + 1 * 16] | |
1587 pmulhrsw m4, m7 | |
1588 | |
1589 pslldq m3, 2 | |
1590 pinsrb m3, [r1 + 2], 1 | |
1591 pinsrb m3, [r1 + 3], 0 | |
1592 | |
1593 pmaddubsw m5, m3, [r3 + 12 * 16] | |
1594 pmulhrsw m5, m7 | |
1595 | |
1596 packuswb m4, m5 | |
1597 movu [r0 + 912], m4 | |
1598 | |
1599 ; mode 16 [row 4, 5] | |
1600 | |
1601 pslldq m3, 2 | |
1602 pinsrb m3, [r1 + 3], 1 | |
1603 pinsrb m3, [r1 + 5], 0 | |
1604 | |
1605 pmaddubsw m4, m3, [r3 + 23 * 16] | |
1606 pmulhrsw m4, m7 | |
1607 | |
1608 pmaddubsw m5, m3, [r3 + 2 * 16] | |
1609 pmulhrsw m5, m7 | |
1610 | |
1611 packuswb m4, m5 | |
1612 movu [r0 + 928], m4 | |
1613 | |
1614 ; mode 16 [row 6, 7] | |
1615 | |
1616 pslldq m3, 2 | |
1617 pinsrb m3, [r1 + 5], 1 | |
1618 pinsrb m3, [r1 + 6], 0 | |
1619 | |
1620 pmaddubsw m4, m3, [r3 + 13 * 16] | |
1621 pmulhrsw m4, m7 | |
1622 | |
1623 pslldq m3, 2 | |
1624 pinsrb m3, [r1 + 6], 1 | |
1625 pinsrb m3, [r1 + 8], 0 | |
1626 | |
1627 pmaddubsw m3, [r3 + 24 * 16] | |
1628 pmulhrsw m3, m7 | |
1629 | |
1630 packuswb m4, m3 | |
1631 movu [r0 + 944], m4 | |
1632 | |
1633 ; mode 17 [row 0, 1] | |
1634 | |
1635 pmaddubsw m4, m2, [r3 + 6 * 16] | |
1636 pmulhrsw m4, m7 | |
1637 | |
1638 pslldq m2, 2 | |
1639 pinsrb m2, [r1 + 0], 1 | |
1640 pinsrb m2, [r1 + 1], 0 | |
1641 | |
1642 pmaddubsw m3, m2, [r3 + 12 * 16] | |
1643 pmulhrsw m3, m7 | |
1644 | |
1645 packuswb m4, m3 | |
1646 movu [r0 + 960], m4 | |
1647 | |
1648 ; mode 17 [row 2, 3] | |
1649 | |
1650 pslldq m2, 2 | |
1651 pinsrb m2, [r1 + 1], 1 | |
1652 pinsrb m2, [r1 + 2], 0 | |
1653 | |
1654 pmaddubsw m4, m2, [r3 + 18 * 16] | |
1655 pmulhrsw m4, m7 | |
1656 | |
1657 pslldq m2, 2 | |
1658 pinsrb m2, [r1 + 2], 1 | |
1659 pinsrb m2, [r1 + 4], 0 | |
1660 | |
1661 pmaddubsw m3, m2, [r3 + 24 * 16] | |
1662 pmulhrsw m3, m7 | |
1663 | |
1664 packuswb m4, m3 | |
1665 movu [r0 + 976], m4 | |
1666 | |
1667 ; mode 17 [row 4, 5] | |
1668 | |
1669 pslldq m2, 2 | |
1670 pinsrb m2, [r1 + 4], 1 | |
1671 pinsrb m2, [r1 + 5], 0 | |
1672 | |
1673 pmaddubsw m4, m2, [r3 + 30 * 16] | |
1674 pmulhrsw m4, m7 | |
1675 | |
1676 pmaddubsw m3, m2, [r3 + 4 * 16] | |
1677 pmulhrsw m3, m7 | |
1678 | |
1679 packuswb m4, m3 | |
1680 movu [r0 + 992], m4 | |
1681 | |
1682 ; mode 17 [row 6, 7] | |
1683 | |
1684 pslldq m2, 2 | |
1685 pinsrb m2, [r1 + 5], 1 | |
1686 pinsrb m2, [r1 + 6], 0 | |
1687 | |
1688 pmaddubsw m4, m2, [r3 + 10 * 16] | |
1689 pmulhrsw m4, m7 | |
1690 | |
1691 pslldq m2, 2 | |
1692 pinsrb m2, [r1 + 6], 1 | |
1693 pinsrb m2, [r1 + 7], 0 | |
1694 | |
1695 pmaddubsw m3, m2, [r3 + 16 * 16] | |
1696 pmulhrsw m3, m7 | |
1697 | |
1698 packuswb m4, m3 | |
1699 movu [r0 + 1008], m4 | |
1700 | |
1701 ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7] | |
1702 | |
1703 movh m1, [r2] | |
1704 | |
1705 pslldq m2, m1, 1 | |
1706 pinsrb m2, [r2 + 1 + 16], 0 | |
1707 punpcklqdq m1, m2 | |
1708 movu [r0 + 1024], m1 | |
1709 | |
1710 pslldq m2, 1 | |
1711 pinsrb m2, [r2 + 2 + 16], 0 | |
1712 | |
1713 pslldq m0, m2, 1 | |
1714 pinsrb m0, [r2 + 3 + 16], 0 | |
1715 punpcklqdq m2, m0 | |
1716 movu [r0 + 1040], m2 | |
1717 | |
1718 pslldq m0, 1 | |
1719 pinsrb m0, [r2 + 4 + 16], 0 | |
1720 | |
1721 pslldq m2, m0, 1 | |
1722 pinsrb m2, [r2 + 5 + 16], 0 | |
1723 punpcklqdq m0, m2 | |
1724 movu [r0 + 1056], m0 | |
1725 | |
1726 pslldq m2, 1 | |
1727 pinsrb m2, [r2 + 6 + 16], 0 | |
1728 | |
1729 pslldq m0, m2, 1 | |
1730 pinsrb m0, [r2 + 7 + 16], 0 | |
1731 punpcklqdq m2, m0 | |
1732 movu [r0 + 1072], m2 | |
1733 | |
1734 ; mode 19 [row 0, 1] | |
1735 | |
1736 movu m0, [r1] | |
1737 palignr m1, m0, 1 | |
1738 punpcklbw m0, m1 | |
1739 | |
1740 pmaddubsw m1, m0, [r3 + 6 * 16] | |
1741 pmulhrsw m1, m7 | |
1742 | |
1743 pslldq m2, m0, 2 | |
1744 pinsrb m2, [r1], 1 | |
1745 pinsrb m2, [r1 + 1 + 16], 0 | |
1746 | |
1747 pmaddubsw m3, m2, [r3 + 12 * 16] | |
1748 pmulhrsw m3, m7 | |
1749 | |
1750 packuswb m1, m3 | |
1751 movu [r0 + 1088], m1 | |
1752 | |
1753 ; mode 19 [row 2, 3] | |
1754 | |
1755 pslldq m2, 2 | |
1756 pinsrb m2, [r1 + 1 + 16], 1 | |
1757 pinsrb m2, [r1 + 2 + 16], 0 | |
1758 | |
1759 pmaddubsw m4, m2, [r3 + 18 * 16] | |
1760 pmulhrsw m4, m7 | |
1761 | |
1762 pslldq m2, 2 | |
1763 pinsrb m2, [r1 + 2 + 16], 1 | |
1764 pinsrb m2, [r1 + 4 + 16], 0 | |
1765 | |
1766 pmaddubsw m5, m2, [r3 + 24 * 16] | |
1767 pmulhrsw m5, m7 | |
1768 | |
1769 packuswb m4, m5 | |
1770 movu [r0 + 1104], m4 | |
1771 | |
1772 ; mode 19 [row 4, 5] | |
1773 | |
1774 pslldq m2, 2 | |
1775 pinsrb m2, [r1 + 4 + 16], 1 | |
1776 pinsrb m2, [r1 + 5 + 16], 0 | |
1777 | |
1778 pmaddubsw m4, m2, [r3 + 30 * 16] | |
1779 pmulhrsw m4, m7 | |
1780 | |
1781 pmaddubsw m5, m2, [r3 + 4 * 16] | |
1782 pmulhrsw m5, m7 | |
1783 | |
1784 packuswb m4, m5 | |
1785 movu [r0 + 1120], m4 | |
1786 | |
1787 ; mode 19 [row 6, 7] | |
1788 | |
1789 pslldq m2, 2 | |
1790 pinsrb m2, [r1 + 5 + 16], 1 | |
1791 pinsrb m2, [r1 + 6 + 16], 0 | |
1792 | |
1793 pmaddubsw m4, m2, [r3 + 10 * 16] | |
1794 pmulhrsw m4, m7 | |
1795 | |
1796 pslldq m2, 2 | |
1797 pinsrb m2, [r1 + 6 + 16], 1 | |
1798 pinsrb m2, [r1 + 7 + 16], 0 | |
1799 | |
1800 pmaddubsw m2, [r3 + 16 * 16] | |
1801 pmulhrsw m2, m7 | |
1802 | |
1803 packuswb m4, m2 | |
1804 movu [r0 + 1136], m4 | |
1805 | |
1806 ; mode 20 [row 0, 1] | |
1807 | |
1808 pmaddubsw m3, m0, [r3 + 11 * 16] | |
1809 pmulhrsw m3, m7 | |
1810 | |
1811 pslldq m1, m0, 2 | |
1812 pinsrb m1, [r1 + 0], 1 | |
1813 pinsrb m1, [r1 + 2 + 16], 0 | |
1814 | |
1815 pmaddubsw m4, m1, [r3 + 22 * 16] | |
1816 pmulhrsw m4, m7 | |
1817 | |
1818 packuswb m3, m4 | |
1819 movu [r0 + 1152], m3 | |
1820 | |
1821 ; mode 20 [row 2, 3] | |
1822 | |
1823 pmaddubsw m3, m1, [r3 + 1 * 16] | |
1824 pmulhrsw m3, m7 | |
1825 | |
1826 pslldq m2, m1, 2 | |
1827 pinsrb m2, [r1 + 2 + 16], 1 | |
1828 pinsrb m2, [r1 + 3 + 16], 0 | |
1829 | |
1830 pmaddubsw m4, m2, [r3 + 12 * 16] | |
1831 pmulhrsw m4, m7 | |
1832 | |
1833 packuswb m3, m4 | |
1834 movu [r0 + 1168], m3 | |
1835 | |
1836 ; mode 20 [row 4, 5] | |
1837 | |
1838 pslldq m2, 2 | |
1839 pinsrb m2, [r1 + 3 + 16], 1 | |
1840 pinsrb m2, [r1 + 5 + 16], 0 | |
1841 | |
1842 pmaddubsw m3, m2, [r3 + 23 * 16] | |
1843 pmulhrsw m3, m7 | |
1844 | |
1845 pmaddubsw m4, m2, [r3 + 2 * 16] | |
1846 pmulhrsw m4, m7 | |
1847 | |
1848 packuswb m3, m4 | |
1849 movu [r0 + 1184], m3 | |
1850 | |
1851 ; mode 20 [row 6, 7] | |
1852 | |
1853 pslldq m2, 2 | |
1854 pinsrb m2, [r1 + 5 + 16], 1 | |
1855 pinsrb m2, [r1 + 6 + 16], 0 | |
1856 | |
1857 pmaddubsw m3, m2, [r3 + 13 * 16] | |
1858 pmulhrsw m3, m7 | |
1859 | |
1860 pslldq m2, 2 | |
1861 pinsrb m2, [r1 + 6 + 16], 1 | |
1862 pinsrb m2, [r1 + 8 + 16], 0 | |
1863 | |
1864 pmaddubsw m4, m2, [r3 + 24 * 16] | |
1865 pmulhrsw m4, m7 | |
1866 | |
1867 packuswb m3, m4 | |
1868 movu [r0 + 1200], m3 | |
1869 | |
1870 ; mode 21 [row 0, 1] | |
1871 | |
1872 pmaddubsw m2, m0, [r3 + 15 * 16] | |
1873 pmulhrsw m2, m7 | |
1874 | |
1875 pmaddubsw m3, m1, [r3 + 30 * 16] | |
1876 pmulhrsw m3, m7 | |
1877 | |
1878 packuswb m2, m3 | |
1879 movu [r0 + 1216], m2 | |
1880 | |
1881 ; mode 21 [row 2, 3] | |
1882 | |
1883 pmaddubsw m2, m1, [r3 + 13 * 16] | |
1884 pmulhrsw m2, m7 | |
1885 | |
1886 pslldq m3, m1, 2 | |
1887 pinsrb m3, [r1 + 2 + 16], 1 | |
1888 pinsrb m3, [r1 + 4 + 16], 0 | |
1889 | |
1890 pmaddubsw m4, m3, [r3 + 28 * 16] | |
1891 pmulhrsw m4, m7 | |
1892 | |
1893 packuswb m2, m4 | |
1894 movu [r0 + 1232], m2 | |
1895 | |
1896 ; mode 21 [row 4, 5] | |
1897 | |
1898 pmaddubsw m2, m3, [r3 + 11 * 16] | |
1899 pmulhrsw m2, m7 | |
1900 | |
1901 pslldq m3, 2 | |
1902 pinsrb m3, [r1 + 4 + 16], 1 | |
1903 pinsrb m3, [r1 + 6 + 16], 0 | |
1904 | |
1905 pmaddubsw m4, m3, [r3 + 26 * 16] | |
1906 pmulhrsw m4, m7 | |
1907 | |
1908 packuswb m2, m4 | |
1909 movu [r0 + 1248], m2 | |
1910 | |
1911 ; mode 21 [row 6, 7] | |
1912 | |
1913 pmaddubsw m2, m3, [r3 + 9 * 16] | |
1914 pmulhrsw m2, m7 | |
1915 | |
1916 pslldq m3, 2 | |
1917 pinsrb m3, [r1 + 6 + 16], 1 | |
1918 pinsrb m3, [r1 + 8 + 16], 0 | |
1919 | |
1920 pmaddubsw m4, m3, [r3 + 24 * 16] | |
1921 pmulhrsw m4, m7 | |
1922 | |
1923 packuswb m2, m4 | |
1924 movu [r0 + 1264], m2 | |
1925 | |
1926 ; mode 22 [row 0, 1] | |
1927 | |
1928 pmaddubsw m2, m0, [r3 + 19 * 16] | |
1929 pmulhrsw m2, m7 | |
1930 | |
1931 pmaddubsw m4, m0, [r3 + 6 * 16] | |
1932 pmulhrsw m4, m7 | |
1933 | |
1934 packuswb m2, m4 | |
1935 movu [r0 + 1280], m2 | |
1936 | |
1937 ; mode 22 [row 2, 3] | |
1938 | |
1939 pmaddubsw m2, m1, [r3 + 25 * 16] | |
1940 pmulhrsw m2, m7 | |
1941 | |
1942 pmaddubsw m3, m1, [r3 + 12 * 16] | |
1943 pmulhrsw m3, m7 | |
1944 | |
1945 packuswb m2, m3 | |
1946 movu [r0 + 1296], m2 | |
1947 | |
1948 ; mode 22 [row 4, 5] | |
1949 | |
1950 pslldq m1, 2 | |
1951 pinsrb m1, [r1 + 5 + 16], 0 | |
1952 pinsrb m1, [r1 + 2 + 16], 1 | |
1953 | |
1954 pmaddubsw m2, m1, [r3 + 31 * 16] | |
1955 pmulhrsw m2, m7 | |
1956 | |
1957 pmaddubsw m3, m1, [r3 + 18 * 16] | |
1958 pmulhrsw m3, m7 | |
1959 | |
1960 packuswb m2, m3 | |
1961 movu [r0 + 1312], m2 | |
1962 | |
1963 ; mode 22 [row 6, 7] | |
1964 | |
1965 pmaddubsw m2, m1, [r3 + 5 * 16] | |
1966 pmulhrsw m2, m7 | |
1967 | |
1968 pslldq m1, 2 | |
1969 pinsrb m1, [r1 + 5 + 16], 1 | |
1970 pinsrb m1, [r1 + 7 + 16], 0 | |
1971 | |
1972 pmaddubsw m1, [r3 + 24 * 16] | |
1973 pmulhrsw m1, m7 | |
1974 | |
1975 packuswb m2, m1 | |
1976 movu [r0 + 1328], m2 | |
1977 | |
1978 ; mode 23 [row 0, 1] | |
1979 | |
1980 pmaddubsw m2, m0, [r3 + 23 * 16] | |
1981 pmulhrsw m2, m7 | |
1982 | |
1983 pmaddubsw m3, m0, [r3 + 14 * 16] | |
1984 pmulhrsw m3, m7 | |
1985 | |
1986 packuswb m2, m3 | |
1987 movu [r0 + 1344], m2 | |
1988 | |
1989 ; mode 23 [row 2, 3] | |
1990 | |
1991 pmaddubsw m2, m0, [r3 + 5 * 16] | |
1992 pmulhrsw m2, m7 | |
1993 | |
1994 pslldq m1, m0, 2 | |
1995 pinsrb m1, [r1], 1 | |
1996 pinsrb m1, [r1 + 4 + 16], 0 | |
1997 | |
1998 pmaddubsw m3, m1, [r3 + 28 * 16] | |
1999 pmulhrsw m3, m7 | |
2000 | |
2001 packuswb m2, m3 | |
2002 movu [r0 + 1360], m2 | |
2003 | |
2004 ; mode 23 [row 4, 5] | |
2005 | |
2006 pmaddubsw m2, m1, [r3 + 19 * 16] | |
2007 pmulhrsw m2, m7 | |
2008 | |
2009 pmaddubsw m3, m1, [r3 + 10 * 16] | |
2010 pmulhrsw m3, m7 | |
2011 | |
2012 packuswb m2, m3 | |
2013 movu [r0 + 1376], m2 | |
2014 | |
2015 ; mode 23 [row 6, 7] | |
2016 | |
2017 pmaddubsw m2, m1, [r3 + 1 * 16] | |
2018 pmulhrsw m2, m7 | |
2019 | |
2020 pslldq m3, m1, 2 | |
2021 pinsrb m3, [r1 + 4 + 16], 1 | |
2022 pinsrb m3, [r1 + 7 + 16], 0 | |
2023 | |
2024 pmaddubsw m3, [r3 + 24 * 16] | |
2025 pmulhrsw m3, m7 | |
2026 | |
2027 packuswb m2, m3 | |
2028 movu [r0 + 1392], m2 | |
2029 | |
2030 ; mode 24 [row 0, 1] | |
2031 | |
2032 pmaddubsw m2, m0, [r3 + 27 * 16] | |
2033 pmulhrsw m2, m7 | |
2034 | |
2035 pmaddubsw m5, m0, [r3 + 22 * 16] | |
2036 pmulhrsw m5, m7 | |
2037 | |
2038 packuswb m2, m5 | |
2039 movu [r0 + 1408], m2 | |
2040 | |
2041 ; mode 24 [row 2, 3] | |
2042 | |
2043 pmaddubsw m2, m0, [r3 + 17 * 16] | |
2044 pmulhrsw m2, m7 | |
2045 | |
2046 pmaddubsw m3, m0, [r3 + 12 * 16] | |
2047 pmulhrsw m3, m7 | |
2048 | |
2049 packuswb m2, m3 | |
2050 movu [r0 + 1424], m2 | |
2051 | |
2052 ; mode 24 [row 4, 5] | |
2053 | |
2054 pmaddubsw m2, m0, [r3 + 7 * 16] | |
2055 pmulhrsw m2, m7 | |
2056 | |
2057 pmaddubsw m3, m0, [r3 + 2 * 16] | |
2058 pmulhrsw m3, m7 | |
2059 | |
2060 packuswb m2, m3 | |
2061 movu [r0 + 1440], m2 | |
2062 | |
2063 ; mode 24 [row 6, 7] | |
2064 | |
2065 pinsrb m1, [r1 + 6 + 16], 0 | |
2066 | |
2067 pmaddubsw m2, m1, [r3 + 29 * 16] | |
2068 pmulhrsw m2, m7 | |
2069 | |
2070 pmaddubsw m1, [r3 + 24 * 16] | |
2071 pmulhrsw m1, m7 | |
2072 | |
2073 packuswb m2, m1 | |
2074 movu [r0 + 1456], m2 | |
2075 | |
2076 ; mode 25 [row 0, 1] | |
2077 | |
2078 pmaddubsw m2, m0, [r3 + 30 * 16] | |
2079 pmulhrsw m2, m7 | |
2080 | |
2081 pmaddubsw m1, m0, [r3 + 28 * 16] | |
2082 pmulhrsw m1, m7 | |
2083 | |
2084 packuswb m2, m1 | |
2085 movu [r0 + 1472], m2 | |
2086 | |
2087 ; mode 25 [row 2, 3] | |
2088 | |
2089 pmaddubsw m2, m0, [r3 + 26 * 16] | |
2090 pmulhrsw m2, m7 | |
2091 | |
2092 pmaddubsw m1, m0, [r3 + 24 * 16] | |
2093 pmulhrsw m1, m7 | |
2094 | |
2095 packuswb m2, m1 | |
2096 movu [r0 + 1488], m2 | |
2097 | |
2098 ; mode 25 [row 4, 5] | |
2099 | |
2100 pmaddubsw m1, m0, [r3 + 20 * 16] | |
2101 pmulhrsw m1, m7 | |
2102 | |
2103 packuswb m5, m1 | |
2104 movu [r0 + 1504], m5 | |
2105 | |
2106 ; mode 25 [row 6, 7] | |
2107 | |
2108 pmaddubsw m2, m0, [r3 + 18 * 16] | |
2109 pmulhrsw m2, m7 | |
2110 | |
2111 pmaddubsw m1, m0, [r3 + 16 * 16] | |
2112 pmulhrsw m1, m7 | |
2113 | |
2114 packuswb m2, m1 | |
2115 movu [r0 + 1520], m2 | |
2116 | |
2117 ; mode 26 | |
2118 | |
2119 movu m0, [r1 + 1] | |
2120 | |
2121 pshufb m1, m0, [tab_Si] | |
2122 movu [r0 + 1536], m1 | |
2123 movu [r0 + 1552], m1 | |
2124 movu [r0 + 1568], m1 | |
2125 movu [r0 + 1584], m1 | |
2126 | |
2127 pxor m5, m5 | |
2128 | |
2129 pshufb m1, m1, m5 | |
2130 punpcklbw m1, m5 | |
2131 | |
2132 movu m2, [r1 + 16] | |
2133 pinsrb m2, [r1], 0 | |
2134 | |
2135 pshufb m3, m2, m5 | |
2136 punpcklbw m3, m5 | |
2137 | |
2138 psrldq m4, m2, 1 | |
2139 punpcklbw m4, m5 | |
2140 | |
2141 movu m2, [r1 + 9 + 16] | |
2142 punpcklbw m2, m5 | |
2143 | |
2144 psubw m4, m3 | |
2145 psubw m2, m3 | |
2146 | |
2147 psraw m4, 1 | |
2148 psraw m2, 1 | |
2149 | |
2150 paddw m4, m1 | |
2151 paddw m2, m1 | |
2152 | |
2153 packuswb m4, m2 | |
2154 | |
2155 pextrb [r0 + 1536], m4, 0 | |
2156 pextrb [r0 + 1544], m4, 1 | |
2157 pextrb [r0 + 1552], m4, 2 | |
2158 pextrb [r0 + 1560], m4, 3 | |
2159 pextrb [r0 + 1568], m4, 4 | |
2160 pextrb [r0 + 1576], m4, 5 | |
2161 pextrb [r0 + 1584], m4, 6 | |
2162 pextrb [r0 + 1592], m4, 7 | |
2163 | |
2164 ; mode 27 [row 0, 1] | |
2165 | |
2166 palignr m6, m0, 1 | |
2167 punpcklbw m4, m0, m6 | |
2168 | |
2169 pmaddubsw m1, m4, [r3 + 2 * 16] | |
2170 pmulhrsw m1, m7 | |
2171 | |
2172 pmaddubsw m2, m4, [r3 + 4 * 16] | |
2173 pmulhrsw m2, m7 | |
2174 | |
2175 packuswb m1, m2 | |
2176 movu [r0 + 1600], m1 | |
2177 | |
2178 ; mode 27 [row 2, 3] | |
2179 | |
2180 pmaddubsw m1, m4, [r3 + 6 * 16] | |
2181 pmulhrsw m1, m7 | |
2182 | |
2183 pmaddubsw m2, m4, [r3 + 8 * 16] | |
2184 pmulhrsw m2, m7 | |
2185 | |
2186 packuswb m1, m2 | |
2187 movu [r0 + 1616], m1 | |
2188 | |
2189 ; mode 27 [row 4, 5] | |
2190 | |
2191 pmaddubsw m3, m4, [r3 + 10 * 16] | |
2192 pmulhrsw m3, m7 | |
2193 | |
2194 pmaddubsw m2, m4, [r3 + 12 * 16] | |
2195 pmulhrsw m2, m7 | |
2196 | |
2197 packuswb m1, m3, m2 | |
2198 movu [r0 + 1632], m1 | |
2199 | |
2200 ; mode 27 [row 6, 7] | |
2201 | |
2202 pmaddubsw m1, m4, [r3 + 14 * 16] | |
2203 pmulhrsw m1, m7 | |
2204 | |
2205 pmaddubsw m2, m4, [r3 + 16 * 16] | |
2206 pmulhrsw m2, m7 | |
2207 | |
2208 packuswb m1, m2 | |
2209 movu [r0 + 1648], m1 | |
2210 | |
2211 ; mode 28 [row 0, 1] | |
2212 | |
2213 pmaddubsw m1, m4, [r3 + 5 * 16] | |
2214 pmulhrsw m1, m7 | |
2215 | |
2216 packuswb m1, m3 | |
2217 movu [r0 + 1664], m1 | |
2218 | |
2219 ; mode 28 [row 2, 3] | |
2220 | |
2221 pmaddubsw m1, m4, [r3 + 15 * 16] | |
2222 pmulhrsw m1, m7 | |
2223 | |
2224 pmaddubsw m2, m4, [r3 + 20 * 16] | |
2225 pmulhrsw m2, m7 | |
2226 | |
2227 packuswb m1, m2 | |
2228 movu [r0 + 1680], m1 | |
2229 | |
2230 ; mode 28 [row 4, 5] | |
2231 | |
2232 pmaddubsw m1, m4, [r3 + 25 * 16] | |
2233 pmulhrsw m1, m7 | |
2234 | |
2235 pmaddubsw m2, m4, [r3 + 30 * 16] | |
2236 pmulhrsw m2, m7 | |
2237 | |
2238 packuswb m1, m2 | |
2239 movu [r0 + 1696], m1 | |
2240 | |
2241 ; mode 28 [row 6, 7] | |
2242 | |
2243 palignr m1, m0, 2 | |
2244 punpcklbw m5, m6, m1 | |
2245 | |
2246 pmaddubsw m2, m5, [r3 + 3 * 16] | |
2247 pmulhrsw m2, m7 | |
2248 | |
2249 pmaddubsw m3, m5, [r3 + 8 * 16] | |
2250 pmulhrsw m3, m7 | |
2251 | |
2252 packuswb m2, m3 | |
2253 movu [r0 + 1712], m2 | |
2254 | |
2255 ; mode 29 [row 0, 1] | |
2256 | |
2257 pmaddubsw m2, m4, [r3 + 9 * 16] | |
2258 pmulhrsw m2, m7 | |
2259 | |
2260 pmaddubsw m3, m4, [r3 + 18 * 16] | |
2261 pmulhrsw m3, m7 | |
2262 | |
2263 packuswb m2, m3 | |
2264 movu [r0 + 1728], m2 | |
2265 | |
2266 ; mode 29 [row 2, 3] | |
2267 | |
2268 pmaddubsw m2, m4, [r3 + 27 * 16] | |
2269 pmulhrsw m2, m7 | |
2270 | |
2271 pmaddubsw m3, m5, [r3 + 4 * 16] | |
2272 pmulhrsw m3, m7 | |
2273 | |
2274 packuswb m2, m3 | |
2275 movu [r0 + 1744], m2 | |
2276 | |
2277 ; mode 29 [row 4, 5] | |
2278 | |
2279 pmaddubsw m2, m5, [r3 + 13 * 16] | |
2280 pmulhrsw m2, m7 | |
2281 | |
2282 pmaddubsw m3, m5, [r3 + 22 * 16] | |
2283 pmulhrsw m3, m7 | |
2284 | |
2285 packuswb m2, m3 | |
2286 movu [r0 + 1760], m2 | |
2287 | |
2288 ; mode 29 [row 6, 7] | |
2289 | |
2290 pmaddubsw m2, m5, [r3 + 31 * 16] | |
2291 pmulhrsw m2, m7 | |
2292 | |
2293 palignr m6, m0, 3 | |
2294 punpcklbw m1, m6 | |
2295 | |
2296 pmaddubsw m3, m1, [r3 + 8 * 16] | |
2297 pmulhrsw m3, m7 | |
2298 | |
2299 packuswb m2, m3 | |
2300 movu [r0 + 1776], m2 | |
2301 | |
2302 ; mode 32 [row 2] | |
2303 | |
2304 movh [r0 + 1936], m2 | |
2305 | |
2306 ; mode 30 [row 0, 1] | |
2307 | |
2308 pmaddubsw m2, m4, [r3 + 13 * 16] | |
2309 pmulhrsw m2, m7 | |
2310 | |
2311 pmaddubsw m3, m4, [r3 + 26 * 16] | |
2312 pmulhrsw m3, m7 | |
2313 | |
2314 packuswb m2, m3 | |
2315 movu [r0 + 1792], m2 | |
2316 | |
2317 ; mode 30 [row 2, 3] | |
2318 | |
2319 pmaddubsw m2, m5, [r3 + 7 * 16] | |
2320 pmulhrsw m2, m7 | |
2321 | |
2322 pmaddubsw m3, m5, [r3 + 20 * 16] | |
2323 pmulhrsw m3, m7 | |
2324 | |
2325 packuswb m2, m3 | |
2326 movu [r0 + 1808], m2 | |
2327 | |
2328 ; mode 33 [row 1] | |
2329 | |
2330 movhps [r0 + 1992], m2 | |
2331 | |
2332 ; mode 30 [row 4, 5] | |
2333 | |
2334 pmaddubsw m2, m1, [r3 + 1 * 16] | |
2335 pmulhrsw m2, m7 | |
2336 | |
2337 pmaddubsw m3, m1, [r3 + 14 * 16] | |
2338 pmulhrsw m3, m7 | |
2339 | |
2340 packuswb m2, m3 | |
2341 movu [r0 + 1824], m2 | |
2342 | |
2343 ; mode 33 [row 2] | |
2344 | |
2345 movhps [r0 + 2000], m2 | |
2346 | |
2347 ; mode 30 [row 6, 7] | |
2348 | |
2349 pmaddubsw m2, m1, [r3 + 27 * 16] | |
2350 pmulhrsw m2, m7 | |
2351 | |
2352 psrldq m0, 4 | |
2353 punpcklbw m6, m0 | |
2354 | |
2355 pmaddubsw m3, m6, [r3 + 8 * 16] | |
2356 pmulhrsw m3, m7 | |
2357 | |
2358 packuswb m2, m3 | |
2359 movu [r0 + 1840], m2 | |
2360 | |
2361 ; mode 33 [row 3] | |
2362 | |
2363 movhps [r0 + 2008], m2 | |
2364 | |
2365 ; mode 31 [row 0, 1] | |
2366 | |
2367 pmaddubsw m2, m4, [r3 + 17 * 16] | |
2368 pmulhrsw m2, m7 | |
2369 | |
2370 pmaddubsw m3, m5, [r3 + 2 * 16] | |
2371 pmulhrsw m3, m7 | |
2372 | |
2373 packuswb m2, m3 | |
2374 movu [r0 + 1856], m2 | |
2375 | |
2376 ; mode 31 [row 2, 3] | |
2377 | |
2378 pmaddubsw m2, m5, [r3 + 19 * 16] | |
2379 pmulhrsw m2, m7 | |
2380 | |
2381 pmaddubsw m3, m1, [r3 + 4 * 16] | |
2382 pmulhrsw m3, m7 | |
2383 | |
2384 packuswb m2, m3 | |
2385 movu [r0 + 1872], m2 | |
2386 | |
2387 ; mode 31 [row 4, 5] | |
2388 | |
2389 pmaddubsw m2, m1, [r3 + 21 * 16] | |
2390 pmulhrsw m2, m7 | |
2391 | |
2392 pmaddubsw m3, m6, [r3 + 6 * 16] | |
2393 pmulhrsw m3, m7 | |
2394 | |
2395 packuswb m2, m3 | |
2396 movu [r0 + 1888], m2 | |
2397 | |
2398 ; mode 31 [row 6, 7] | |
2399 | |
2400 pmaddubsw m2, m6, [r3 + 23 * 16] | |
2401 pmulhrsw m2, m7 | |
2402 | |
2403 movu m3, [r1 + 6] | |
2404 punpcklbw m0, m3 | |
2405 | |
2406 pmaddubsw m3, m0, [r3 + 8 * 16] | |
2407 pmulhrsw m3, m7 | |
2408 | |
2409 packuswb m2, m3 | |
2410 movu [r0 + 1904], m2 | |
2411 | |
2412 ; mode 32 [row 0, 1] | |
2413 | |
2414 pmaddubsw m2, m4, [r3 + 21 * 16] | |
2415 pmulhrsw m2, m7 | |
2416 | |
2417 pmaddubsw m3, m5, [r3 + 10 * 16] | |
2418 pmulhrsw m3, m7 | |
2419 | |
2420 packuswb m2, m3 | |
2421 movu [r0 + 1920], m2 | |
2422 | |
2423 ; mode 32 [row 3] | |
2424 | |
2425 pmaddubsw m2, m1, [r3 + 20 * 16] | |
2426 pmulhrsw m2, m7 | |
2427 | |
2428 pxor m3, m3 | |
2429 | |
2430 packuswb m2, m3 | |
2431 movh [r0 + 1944], m2 | |
2432 | |
2433 ; mode 32 [row 4, 5] | |
2434 | |
2435 pmaddubsw m2, m6, [r3 + 9 * 16] | |
2436 pmulhrsw m2, m7 | |
2437 | |
2438 pmaddubsw m3, m6, [r3 + 30 * 16] | |
2439 pmulhrsw m3, m7 | |
2440 | |
2441 packuswb m2, m3 | |
2442 movu [r0 + 1952], m2 | |
2443 | |
2444 ; mode 33 [row 4, 5] | |
2445 | |
2446 pmaddubsw m2, m0, [r3 + 2 * 16] | |
2447 pmulhrsw m2, m7 | |
2448 | |
2449 pmaddubsw m3, m0, [r3 + 28 * 16] | |
2450 pmulhrsw m3, m7 | |
2451 | |
2452 packuswb m2, m3 | |
2453 movu [r0 + 2016], m2 | |
2454 | |
2455 ; mode 32 [row 6] | |
2456 | |
2457 pmaddubsw m2, m0, [r3 + 19 * 16] | |
2458 pmulhrsw m2, m7 | |
2459 | |
2460 ; mode 32 [row 7] | |
2461 | |
2462 movu m0, [r1 + 6] | |
2463 palignr m3, m0, 1 | |
2464 punpcklbw m0, m3 | |
2465 | |
2466 pmaddubsw m3, m0, [r3 + 8 * 16] | |
2467 pmulhrsw m3, m7 | |
2468 | |
2469 packuswb m2, m3 | |
2470 movu [r0 + 1968], m2 | |
2471 | |
2472 ; mode 33 [row 6, 7] | |
2473 | |
2474 pmaddubsw m2, m0, [r3 + 22 * 16] | |
2475 pmulhrsw m2, m7 | |
2476 | |
2477 movu m0, [r1 + 7] | |
2478 palignr m3, m0, 1 | |
2479 punpcklbw m0, m3 | |
2480 | |
2481 pmaddubsw m3, m0, [r3 + 16 * 16] | |
2482 pmulhrsw m3, m7 | |
2483 | |
2484 packuswb m2, m3 | |
2485 movu [r0 + 2032], m2 | |
2486 | |
2487 ; mode 33 [row 0] | |
2488 | |
2489 pmaddubsw m2, m4, [r3 + 26 * 16] | |
2490 pmulhrsw m2, m7 | |
2491 | |
2492 pxor m3, m3 | |
2493 | |
2494 packuswb m2, m3 | |
2495 movh [r0 + 1984], m2 | |
2496 | |
2497 ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7] | |
2498 | |
2499 movu m0, [r2 + 2] | |
2500 palignr m1, m0, 1 | |
2501 punpcklqdq m2, m0, m1 | |
2502 movu [r0 + 2048], m2 | |
2503 | |
2504 palignr m1, m0, 2 | |
2505 palignr m2, m0, 3 | |
2506 punpcklqdq m1, m2 | |
2507 movu [r0 + 2064], m1 | |
2508 | |
2509 palignr m1, m0, 4 | |
2510 palignr m2, m0, 5 | |
2511 punpcklqdq m1, m2 | |
2512 movu [r0 + 2080], m1 | |
2513 | |
2514 palignr m1, m0, 6 | |
2515 palignr m2, m0, 7 | |
2516 punpcklqdq m1, m2 | |
2517 movu [r0 + 2096], m1 | |
2518 RET | |
2519 | |
2520 ;-------------------------------------------------------------------------------- | |
2521 ; void all_angs_pred_16x16(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) | |
2522 ;-------------------------------------------------------------------------------- | |
2523 INIT_XMM sse4 | |
2524 cglobal all_angs_pred_16x16, 3,4,8 | |
2525 ; mode 2 | |
2526 | |
2527 movu m0, [r2 + 2 + 32] | |
2528 movu [r0 + 0 * 16], m0 | |
2529 | |
2530 movu m1, m0 | |
2531 | |
2532 movu m6, [r2 + 18 + 32] | |
2533 palignr m5, m6, m0, 1 | |
2534 movu [r0 + 1 * 16], m5 | |
2535 | |
2536 movu m4, m5 | |
2537 | |
2538 palignr m5, m6, m0, 2 | |
2539 movu [r0 + 2 * 16], m5 | |
2540 palignr m5, m6, m0, 3 | |
2541 movu [r0 + 3 * 16], m5 | |
2542 palignr m5, m6, m0, 4 | |
2543 movu [r0 + 4 * 16], m5 | |
2544 palignr m5, m6, m0, 5 | |
2545 movu [r0 + 5 * 16], m5 | |
2546 palignr m5, m6, m0, 6 | |
2547 movu [r0 + 6 * 16], m5 | |
2548 palignr m5, m6, m0, 7 | |
2549 movu [r0 + 7 * 16], m5 | |
2550 | |
2551 movu m7, m5 | |
2552 | |
2553 palignr m5, m6, m0, 8 | |
2554 movu [r0 + 8 * 16], m5 | |
2555 | |
2556 movu m2, m5 | |
2557 | |
2558 palignr m5, m6, m0, 9 | |
2559 movu [r0 + 9 * 16], m5 | |
2560 | |
2561 palignr m3, m6, m0, 10 | |
2562 movu [r0 + 10 * 16], m3 | |
2563 palignr m3, m6, m0, 11 | |
2564 movu [r0 + 11 * 16], m3 | |
2565 palignr m3, m6, m0, 12 | |
2566 movu [r0 + 12 * 16], m3 | |
2567 | |
2568 ; mode 3 [row 15] | |
2569 movu [r0 + (3-2)*16*16 + 15 * 16], m3 | |
2570 | |
2571 palignr m3, m6, m0, 13 | |
2572 movu [r0 + 13 * 16], m3 | |
2573 palignr m3, m6, m0, 14 | |
2574 movu [r0 + 14 * 16], m3 | |
2575 palignr m3, m6, m0, 15 | |
2576 movu [r0 + 15 * 16], m3 | |
2577 | |
2578 ; mode 3 [row 0] | |
2579 lea r3, [ang_table] | |
2580 movu m3, [pw_1024] | |
2581 movu m0, [r2 + 1 + 32] | |
2582 punpcklbw m0, m1 | |
2583 | |
2584 ; mode 17 [row 8 - second half] | |
2585 pmaddubsw m1, m0, [r3 + 22 * 16] | |
2586 pmulhrsw m1, m3 | |
2587 packuswb m1, m1 | |
2588 movh [r0 + 248 * 16 + 8], m1 | |
2589 ; mode 17 [row 8 - second half] end | |
2590 | |
2591 pmaddubsw m1, m0, [r3 + 26 * 16] | |
2592 pmulhrsw m1, m3 | |
2593 punpcklbw m7, m2 | |
2594 pmaddubsw m2, m7, [r3 + 26 * 16] | |
2595 pmulhrsw m2, m3 | |
2596 packuswb m1, m2 | |
2597 movu [r0 + 16 * 16], m1 | |
2598 | |
2599 ;mode 6 [row 1] | |
2600 movu [r0 + 65 * 16], m1 | |
2601 | |
2602 ; mode 4 [row 0] | |
2603 pmaddubsw m1, m0, [r3 + 21 * 16] | |
2604 pmulhrsw m1, m3 | |
2605 pmaddubsw m2, m7, [r3 + 21 * 16] | |
2606 pmulhrsw m2, m3 | |
2607 packuswb m1, m2 | |
2608 movu [r0 + 32 * 16], m1 | |
2609 | |
2610 ; mode 5 [row 0] | |
2611 pmaddubsw m1, m0, [r3 + 17 * 16] | |
2612 pmulhrsw m1, m3 | |
2613 pmaddubsw m2, m7, [r3 + 17 * 16] | |
2614 pmulhrsw m2, m3 | |
2615 packuswb m1, m2 | |
2616 movu [r0 + 48 * 16], m1 | |
2617 | |
2618 ; mode 6 [row 0] | |
2619 pmaddubsw m1, m0, [r3 + 13 * 16] | |
2620 pmulhrsw m1, m3 | |
2621 pmaddubsw m2, m7, [r3 + 13 * 16] | |
2622 pmulhrsw m2, m3 | |
2623 packuswb m1, m2 | |
2624 movu [r0 + 64 * 16], m1 | |
2625 | |
2626 ; mode 7 [row 0] | |
2627 pmaddubsw m1, m0, [r3 + 9 * 16] | |
2628 pmulhrsw m1, m3 | |
2629 pmaddubsw m2, m7, [r3 + 9 * 16] | |
2630 pmulhrsw m2, m3 | |
2631 packuswb m1, m2 | |
2632 movu [r0 + 80 * 16], m1 | |
2633 | |
2634 ; mode 7 [row 1] | |
2635 pmaddubsw m1, m0, [r3 + 18 * 16] | |
2636 pmulhrsw m1, m3 | |
2637 pmaddubsw m2, m7, [r3 + 18 * 16] | |
2638 pmulhrsw m2, m3 | |
2639 packuswb m1, m2 | |
2640 movu [r0 + 81 * 16], m1 | |
2641 | |
2642 ; mode 7 [row 2] | |
2643 pmaddubsw m1, m0, [r3 + 27 * 16] | |
2644 pmulhrsw m1, m3 | |
2645 pmaddubsw m2, m7, [r3 + 27 * 16] | |
2646 pmulhrsw m2, m3 | |
2647 packuswb m1, m2 | |
2648 movu [r0 + 82 * 16], m1 | |
2649 | |
2650 ; mode 8 [row 0] | |
2651 pmaddubsw m1, m0, [r3 + 5 * 16] | |
2652 pmulhrsw m1, m3 | |
2653 pmaddubsw m2, m7, [r3 + 5 * 16] | |
2654 pmulhrsw m2, m3 | |
2655 packuswb m1, m2 | |
2656 movu [r0 + 96 * 16], m1 | |
2657 | |
2658 ; mode 8 [row 1] | |
2659 pmaddubsw m1, m0, [r3 + 10 * 16] | |
2660 pmulhrsw m1, m3 | |
2661 pmaddubsw m2, m7, [r3 + 10 * 16] | |
2662 pmulhrsw m2, m3 | |
2663 packuswb m1, m2 | |
2664 movu [r0 + 97 * 16], m1 | |
2665 | |
2666 ; mode 8 [row 2] | |
2667 pmaddubsw m1, m0, [r3 + 15 * 16] | |
2668 pmulhrsw m1, m3 | |
2669 pmaddubsw m2, m7, [r3 + 15 * 16] | |
2670 pmulhrsw m2, m3 | |
2671 packuswb m1, m2 | |
2672 movu [r0 + 98 * 16], m1 | |
2673 | |
2674 ; mode 8 [row 3] | |
2675 pmaddubsw m1, m0, [r3 + 20 * 16] | |
2676 pmulhrsw m1, m3 | |
2677 pmaddubsw m2, m7, [r3 + 20 * 16] | |
2678 pmulhrsw m2, m3 | |
2679 packuswb m1, m2 | |
2680 movu [r0 + 99 * 16], m1 | |
2681 | |
2682 ; mode 8 [row 4] | |
2683 pmaddubsw m1, m0, [r3 + 25 * 16] | |
2684 pmulhrsw m1, m3 | |
2685 pmaddubsw m2, m7, [r3 + 25 * 16] | |
2686 pmulhrsw m2, m3 | |
2687 packuswb m1, m2 | |
2688 movu [r0 + 100 * 16], m1 | |
2689 | |
2690 ; mode 8 [row 5] | |
2691 pmaddubsw m1, m0, [r3 + 30 * 16] | |
2692 pmulhrsw m1, m3 | |
2693 pmaddubsw m2, m7, [r3 + 30 * 16] | |
2694 pmulhrsw m2, m3 | |
2695 packuswb m1, m2 | |
2696 movu [r0 + 101 * 16], m1 | |
2697 | |
2698 ; mode 15 [row 13 - second half] | |
2699 pmaddubsw m1, m0, [r3 + 18 * 16] | |
2700 pmulhrsw m1, m3 | |
2701 packuswb m1, m1 | |
2702 movh [r0 + 221 * 16 + 8], m1 | |
2703 ; mode 15 [row 13 - second half] end | |
2704 | |
2705 ; mode 15 [row 14 - second half] | |
2706 pmaddubsw m1, m0, [r3 + 1 * 16] | |
2707 pmulhrsw m1, m3 | |
2708 packuswb m1, m1 | |
2709 movh [r0 + 222 * 16 + 8], m1 | |
2710 ; mode 15 [row 14 - second half] end | |
2711 | |
2712 ; mode 16 [row 10 - second half] | |
2713 pmaddubsw m1, m0, [r3 + 25 * 16] | |
2714 pmulhrsw m1, m3 | |
2715 packuswb m1, m1 | |
2716 movh [r0 + 234 * 16 + 8], m1 | |
2717 ; mode 16 [row 10 - second half] end | |
2718 | |
2719 ; mode 16 [row 11 - second half] | |
2720 pmaddubsw m1, m0, [r3 + 4 * 16] | |
2721 pmulhrsw m1, m3 | |
2722 packuswb m1, m1 | |
2723 movh [r0 + 235 * 16 + 8], m1 | |
2724 ; mode 16 [row 11 - second half] end | |
2725 | |
2726 ; mode 3 [row 1] | |
2727 movu m6, [r3 + 20 * 16] | |
2728 movu m0, [r2 + 2 + 32] | |
2729 punpcklbw m0, m4 | |
2730 | |
2731 ; mode 17 [row 7 - second half] | |
2732 pmaddubsw m1, m0, [r3 + 16 * 16] | |
2733 pmulhrsw m1, m3 | |
2734 packuswb m1, m1 | |
2735 movh [r0 + 247 * 16 + 8], m1 | |
2736 | |
2737 ; mode 17 [row 7 - second half] end | |
2738 pmaddubsw m1, m0, m6 | |
2739 pmulhrsw m1, m3 | |
2740 movu m2, [r2 + 10 + 32] | |
2741 punpcklbw m2, m5 | |
2742 pmaddubsw m4, m2, m6 | |
2743 pmulhrsw m4, m3 | |
2744 packuswb m1, m4 | |
2745 movu [r0 + 17 * 16], m1 | |
2746 | |
2747 ;mode 6 [row 3] | |
2748 movu [r0 + 67 * 16], m1 | |
2749 | |
2750 ; mode 4 row [row 1] | |
2751 pmaddubsw m1, m0, [r3 + 10 * 16] | |
2752 pmulhrsw m1, m3 | |
2753 pmaddubsw m4, m2, [r3 + 10 * 16] | |
2754 pmulhrsw m4, m3 | |
2755 packuswb m1, m4 | |
2756 movu [r0 + 33 * 16], m1 | |
2757 | |
2758 ; mode 4 row [row 2] | |
2759 pmaddubsw m1, m0, [r3 + 31 * 16] | |
2760 pmulhrsw m1, m3 | |
2761 pmaddubsw m4, m2, [r3 + 31 * 16] | |
2762 pmulhrsw m4, m3 | |
2763 packuswb m1, m4 | |
2764 movu [r0 + 34 * 16], m1 | |
2765 | |
2766 ; mode 7 [row 6] | |
2767 movu [r0 + 86 * 16], m1 | |
2768 | |
2769 ; mode 5 row [row 1] | |
2770 pmaddubsw m1, m0, [r3 + 2 * 16] | |
2771 pmulhrsw m1, m3 | |
2772 pmaddubsw m4, m2, [r3 + 2 * 16] | |
2773 pmulhrsw m4, m3 | |
2774 packuswb m1, m4 | |
2775 movu [r0 + 49 * 16], m1 | |
2776 | |
2777 ; mode 5 row [row 2] | |
2778 pmaddubsw m1, m0, [r3 + 19 * 16] | |
2779 pmulhrsw m1, m3 | |
2780 pmaddubsw m4, m2, [r3 + 19 * 16] | |
2781 pmulhrsw m4, m3 | |
2782 packuswb m1, m4 | |
2783 movu [r0 + 50 * 16], m1 | |
2784 | |
2785 ; mode 6 [row 2] | |
2786 pmaddubsw m1, m0, [r3 + 7 * 16] | |
2787 pmulhrsw m1, m3 | |
2788 pmaddubsw m4, m2, [r3 + 7 * 16] | |
2789 pmulhrsw m4, m3 | |
2790 packuswb m1, m4 | |
2791 movu [r0 + 66 * 16], m1 | |
2792 | |
2793 ; mode 7 [row 3] | |
2794 pmaddubsw m1, m0, [r3 + 4 * 16] | |
2795 pmulhrsw m1, m3 | |
2796 pmaddubsw m4, m2, [r3 + 4 * 16] | |
2797 pmulhrsw m4, m3 | |
2798 packuswb m1, m4 | |
2799 movu [r0 + 83 * 16], m1 | |
2800 | |
2801 ; mode 7 [row 4] | |
2802 pmaddubsw m1, m0, [r3 + 13 * 16] | |
2803 pmulhrsw m1, m3 | |
2804 pmaddubsw m4, m2, [r3 + 13 * 16] | |
2805 pmulhrsw m4, m3 | |
2806 packuswb m1, m4 | |
2807 movu [r0 + 84 * 16], m1 | |
2808 | |
2809 ; mode 8 [row 8] | |
2810 movu [r0 + 104 * 16], m1 | |
2811 | |
2812 ; mode 7 [row 5] | |
2813 pmaddubsw m1, m0, [r3 + 22 * 16] | |
2814 pmulhrsw m1, m3 | |
2815 pmaddubsw m4, m2, [r3 + 22 * 16] | |
2816 pmulhrsw m4, m3 | |
2817 packuswb m1, m4 | |
2818 movu [r0 + 85 * 16], m1 | |
2819 | |
2820 ; mode 8 [row 6] | |
2821 pmaddubsw m1, m0, [r3 + 3 * 16] | |
2822 pmulhrsw m1, m3 | |
2823 pmaddubsw m4, m2, [r3 + 3 * 16] | |
2824 pmulhrsw m4, m3 | |
2825 packuswb m1, m4 | |
2826 movu [r0 + 102 * 16], m1 | |
2827 | |
2828 ; mode 8 [row 7] | |
2829 pmaddubsw m1, m0, [r3 + 8 * 16] | |
2830 pmulhrsw m1, m3 | |
2831 pmaddubsw m4, m2, [r3 + 8 * 16] | |
2832 pmulhrsw m4, m3 | |
2833 packuswb m1, m4 | |
2834 movu [r0 + 103 * 16], m1 | |
2835 | |
2836 ; mode 8 [row 9] | |
2837 pmaddubsw m1, m0, [r3 + 18 * 16] | |
2838 pmulhrsw m1, m3 | |
2839 pmaddubsw m4, m2, [r3 + 18 * 16] | |
2840 pmulhrsw m4, m3 | |
2841 packuswb m1, m4 | |
2842 movu [r0 + 105 * 16], m1 | |
2843 | |
2844 ; mode 8 [row 10] | |
2845 pmaddubsw m1, m0, [r3 + 23 * 16] | |
2846 pmulhrsw m1, m3 | |
2847 pmaddubsw m4, m2, [r3 + 23 * 16] | |
2848 pmulhrsw m4, m3 | |
2849 packuswb m1, m4 | |
2850 movu [r0 + 106 * 16], m1 | |
2851 | |
2852 ; mode 8 [row 11] | |
2853 pmaddubsw m1, m0, [r3 + 28 * 16] | |
2854 pmulhrsw m1, m3 | |
2855 pmaddubsw m4, m2, [r3 + 28 * 16] | |
2856 pmulhrsw m4, m3 | |
2857 packuswb m1, m4 | |
2858 movu [r0 + 107 * 16], m1 | |
2859 | |
2860 ; mode 3 [row 2] | |
2861 movu m0, [r2 + 3 + 32] | |
2862 movd m1, [r2 + 19 + 32] | |
2863 palignr m1, m0, 1 | |
2864 punpcklbw m0, m1 | |
2865 | |
2866 ; mode 17 [row 6 - second half] | |
2867 pmaddubsw m1, m0, [r3 + 10 * 16] | |
2868 pmulhrsw m1, m3 | |
2869 packuswb m1, m1 | |
2870 movh [r0 + 246 * 16 + 8], m1 | |
2871 ; mode 17 [row 6 - second half] end | |
2872 | |
2873 pmaddubsw m1, m0, [r3 + 14 * 16] | |
2874 pmulhrsw m1, m3 | |
2875 movu m2, [r2 + 11 + 32] | |
2876 movd m4, [r2 + 27 + 32] | |
2877 palignr m4, m2, 1 | |
2878 punpcklbw m2, m4 | |
2879 pmaddubsw m4, m2, [r3 + 14 * 16] | |
2880 pmulhrsw m4, m3 | |
2881 packuswb m1, m4 | |
2882 movu [r0 + 18 * 16], m1 | |
2883 | |
2884 ; mode 6 [row 5] | |
2885 movu [r0 + 69 * 16], m1 | |
2886 | |
2887 ; mode 4 row [row 3] | |
2888 pmaddubsw m1, m0, [r3 + 20 * 16] | |
2889 pmulhrsw m1, m3 | |
2890 pmaddubsw m4, m2, [r3 + 20 * 16] | |
2891 pmulhrsw m4, m3 | |
2892 packuswb m1, m4 | |
2893 movu [r0 + 35 * 16], m1 | |
2894 | |
2895 ; mode 5 row [row 3] | |
2896 pmaddubsw m1, m0, [r3 + 4 * 16] | |
2897 pmulhrsw m1, m3 | |
2898 pmaddubsw m4, m2, [r3 + 4 * 16] | |
2899 pmulhrsw m4, m3 | |
2900 packuswb m1, m4 | |
2901 movu [r0 + 51 * 16], m1 | |
2902 | |
2903 ; mode 5 row [row 4] | |
2904 pmaddubsw m1, m0, [r3 + 21 * 16] | |
2905 pmulhrsw m1, m3 | |
2906 pmaddubsw m4, m2, [r3 + 21 * 16] | |
2907 pmulhrsw m4, m3 | |
2908 packuswb m1, m4 | |
2909 movu [r0 + 52 * 16], m1 | |
2910 | |
2911 ; mode 6 [row 4] | |
2912 pmaddubsw m1, m0, [r3 + 1 * 16] | |
2913 pmulhrsw m1, m3 | |
2914 pmaddubsw m4, m2, [r3 + 1 * 16] | |
2915 pmulhrsw m4, m3 | |
2916 packuswb m1, m4 | |
2917 movu [r0 + 68 * 16], m1 | |
2918 | |
2919 ; mode 6 [row 6] | |
2920 pmaddubsw m1, m0, [r3 + 27 * 16] | |
2921 pmulhrsw m1, m3 | |
2922 pmaddubsw m4, m2, [r3 + 27 * 16] | |
2923 pmulhrsw m4, m3 | |
2924 packuswb m1, m4 | |
2925 movu [r0 + 70 * 16], m1 | |
2926 | |
2927 ; mode 7 [row 7] | |
2928 pmaddubsw m1, m0, [r3 + 8 * 16] | |
2929 pmulhrsw m1, m3 | |
2930 pmaddubsw m4, m2, [r3 + 8 * 16] | |
2931 pmulhrsw m4, m3 | |
2932 packuswb m1, m4 | |
2933 movu [r0 + 87 * 16], m1 | |
2934 | |
2935 ; mode 7 [row 8] | |
2936 pmaddubsw m1, m0, [r3 + 17 * 16] | |
2937 pmulhrsw m1, m3 | |
2938 pmaddubsw m4, m2, [r3 + 17 * 16] | |
2939 pmulhrsw m4, m3 | |
2940 packuswb m1, m4 | |
2941 movu [r0 + 88 * 16], m1 | |
2942 | |
2943 ; mode 7 [row 9] | |
2944 pmaddubsw m1, m0, [r3 + 26 * 16] | |
2945 pmulhrsw m1, m3 | |
2946 pmaddubsw m4, m2, [r3 + 26 * 16] | |
2947 pmulhrsw m4, m3 | |
2948 packuswb m1, m4 | |
2949 movu [r0 + 89 * 16], m1 | |
2950 | |
2951 ; mode 8 [row 12] | |
2952 pmaddubsw m1, m0, [r3 + 1 * 16] | |
2953 pmulhrsw m1, m3 | |
2954 pmaddubsw m4, m2, [r3 + 1 * 16] | |
2955 pmulhrsw m4, m3 | |
2956 packuswb m1, m4 | |
2957 movu [r0 + 108 * 16], m1 | |
2958 | |
2959 ; mode 8 [row 13] | |
2960 pmaddubsw m1, m0, [r3 + 6 * 16] | |
2961 pmulhrsw m1, m3 | |
2962 pmaddubsw m4, m2, [r3 + 6 * 16] | |
2963 pmulhrsw m4, m3 | |
2964 packuswb m1, m4 | |
2965 movu [r0 + 109 * 16], m1 | |
2966 | |
2967 ; mode 8 [row 14] | |
2968 pmaddubsw m1, m0, [r3 + 11 * 16] | |
2969 pmulhrsw m1, m3 | |
2970 pmaddubsw m4, m2, [r3 + 11 * 16] | |
2971 pmulhrsw m4, m3 | |
2972 packuswb m1, m4 | |
2973 movu [r0 + 110 * 16], m1 | |
2974 | |
2975 ; mode 8 [row 15] | |
2976 pmaddubsw m1, m0, [r3 + 16 * 16] | |
2977 pmulhrsw m1, m3 | |
2978 pmaddubsw m4, m2, [r3 + 16 * 16] | |
2979 pmulhrsw m4, m3 | |
2980 packuswb m1, m4 | |
2981 movu [r0 + 111 * 16], m1 | |
2982 | |
2983 ; mode 3 [row 3] | |
2984 movu m0, [r2 + 4 + 32] | |
2985 movd m1, [r2 + 20 + 32] | |
2986 palignr m1, m0, 1 | |
2987 punpcklbw m0, m1 | |
2988 | |
2989 ; mode 17 [row 4 - second half] | |
2990 pmaddubsw m1, m0, [r3 + 30 * 16] | |
2991 pmulhrsw m1, m3 | |
2992 packuswb m1, m1 | |
2993 movh [r0 + 244 * 16 + 8], m1 | |
2994 ; mode 17 [row 4 - second half] end | |
2995 | |
2996 ; mode 17 [row 5 - second half] | |
2997 pmaddubsw m1, m0, [r3 + 4 * 16] | |
2998 pmulhrsw m1, m3 | |
2999 packuswb m1, m1 | |
3000 movh [r0 + 245 * 16 + 8], m1 | |
3001 ; mode 17 [row 5 - second half] end | |
3002 | |
3003 pmaddubsw m1, m0, [r3 + 8 * 16] | |
3004 pmulhrsw m1, m3 | |
3005 movu m2, [r2 + 12 + 32] | |
3006 movd m4, [r2 + 28 + 32] | |
3007 palignr m4, m2, 1 | |
3008 punpcklbw m2, m4 | |
3009 pmaddubsw m4, m2, [r3 + 8 * 16] | |
3010 pmulhrsw m4, m3 | |
3011 packuswb m1, m4 | |
3012 movu [r0 + 19 * 16], m1 | |
3013 | |
3014 ; mode 6 [row 7] | |
3015 movu [r0 + 71 * 16], m1 | |
3016 | |
3017 ; mode 4 row [row 4] | |
3018 pmaddubsw m1, m0, [r3 + 9 * 16] | |
3019 pmulhrsw m1, m3 | |
3020 pmaddubsw m4, m2, [r3 + 9 * 16] | |
3021 pmulhrsw m4, m3 | |
3022 packuswb m1, m4 | |
3023 movu [r0 + 36 * 16], m1 | |
3024 | |
3025 ; mode 4 row [row 5] | |
3026 pmaddubsw m1, m0, [r3 + 30 * 16] | |
3027 pmulhrsw m1, m3 | |
3028 pmaddubsw m4, m2, [r3 + 30 * 16] | |
3029 pmulhrsw m4, m3 | |
3030 packuswb m1, m4 | |
3031 movu [r0 + 37 * 16], m1 | |
3032 | |
3033 ; mode 7 row [row 13] | |
3034 movu [r0 + 93 * 16], m1 | |
3035 | |
3036 ; mode 5 row [row 5] | |
3037 pmaddubsw m1, m0, [r3 + 6 * 16] | |
3038 pmulhrsw m1, m3 | |
3039 pmaddubsw m4, m2, [r3 + 6 * 16] | |
3040 pmulhrsw m4, m3 | |
3041 packuswb m1, m4 | |
3042 movu [r0 + 53 * 16], m1 | |
3043 | |
3044 ; mode 5 row [row 6] | |
3045 pmaddubsw m1, m0, [r3 + 23 * 16] | |
3046 pmulhrsw m1, m3 | |
3047 pmaddubsw m4, m2, [r3 + 23 * 16] | |
3048 pmulhrsw m4, m3 | |
3049 packuswb m1, m4 | |
3050 movu [r0 + 54 * 16], m1 | |
3051 | |
3052 ; mode 6 [row 8] | |
3053 pmaddubsw m1, m0, [r3 + 21 * 16] | |
3054 pmulhrsw m1, m3 | |
3055 pmaddubsw m4, m2, [r3 + 21 * 16] | |
3056 pmulhrsw m4, m3 | |
3057 packuswb m1, m4 | |
3058 movu [r0 + 72 * 16], m1 | |
3059 | |
3060 ; mode 7 [row 12] | |
3061 movu [r0 + 92 * 16], m1 | |
3062 | |
3063 ; mode 7 [row 10] | |
3064 pmaddubsw m1, m0, [r3 + 3 * 16] | |
3065 pmulhrsw m1, m3 | |
3066 pmaddubsw m4, m2, [r3 + 3 * 16] | |
3067 pmulhrsw m4, m3 | |
3068 packuswb m1, m4 | |
3069 movu [r0 + 90 * 16], m1 | |
3070 | |
3071 ; mode 7 [row 11] | |
3072 pmaddubsw m1, m0, [r3 + 12 * 16] | |
3073 pmulhrsw m1, m3 | |
3074 pmaddubsw m4, m2, [r3 + 12 * 16] | |
3075 pmulhrsw m4, m3 | |
3076 packuswb m1, m4 | |
3077 movu [r0 + 91 * 16], m1 | |
3078 | |
3079 ; mode 3 [row 4] | |
3080 movu m0, [r2 + 5 + 32] | |
3081 movd m1, [r2 + 20 + 32] | |
3082 palignr m1, m0, 1 | |
3083 punpcklbw m0, m1 | |
3084 | |
3085 ; mode 17 [row 3 - second half] | |
3086 pmaddubsw m1, m0, [r3 + 24 * 16] | |
3087 pmulhrsw m1, m3 | |
3088 packuswb m1, m1 | |
3089 movh [r0 + 243 * 16 + 8], m1 | |
3090 | |
3091 ; mode 17 [row 3 - second half] end | |
3092 pmaddubsw m1, m0, [r3 + 2 * 16] | |
3093 pmulhrsw m1, m3 | |
3094 movu m2, [r2 + 13 + 32] | |
3095 movd m4, [r2 + 29 + 32] | |
3096 palignr m4, m2, 1 | |
3097 punpcklbw m2, m4 | |
3098 pmaddubsw m4, m2, [r3 + 2 * 16] | |
3099 pmulhrsw m4, m3 | |
3100 packuswb m1, m4 | |
3101 movu [r0 + 20 * 16], m1 | |
3102 | |
3103 ;mode 6 [row 9] | |
3104 movu [r0 + 73 * 16], m1 | |
3105 | |
3106 ; mode 4 row [row 6] | |
3107 movu m6, [r3 + 19 * 16] | |
3108 pmaddubsw m1, m0, m6 | |
3109 pmulhrsw m1, m3 | |
3110 pmaddubsw m4, m2, m6 | |
3111 pmulhrsw m4, m3 | |
3112 packuswb m1, m4 | |
3113 movu [r0 + 38 * 16], m1 | |
3114 | |
3115 ; mode 3 [row 5] | |
3116 pmaddubsw m1, m0, [r3 + 28 * 16] | |
3117 pmulhrsw m1, m3 | |
3118 pmaddubsw m4, m2, [r3 + 28 * 16] | |
3119 pmulhrsw m4, m3 | |
3120 packuswb m1, m4 | |
3121 movu [r0 + 21 * 16], m1 | |
3122 | |
3123 ;mode 6 [row 11] | |
3124 movu [r0 + 75 * 16], m1 | |
3125 | |
3126 ; mode 5 row [row 7] | |
3127 pmaddubsw m1, m0, [r3 + 8 * 16] | |
3128 pmulhrsw m1, m3 | |
3129 pmaddubsw m4, m2, [r3 + 8 * 16] | |
3130 pmulhrsw m4, m3 | |
3131 packuswb m1, m4 | |
3132 movu [r0 + 55 * 16], m1 | |
3133 | |
3134 ; mode 5 row [row 8] | |
3135 pmaddubsw m1, m0, [r3 + 25 * 16] | |
3136 pmulhrsw m1, m3 | |
3137 pmaddubsw m4, m2, [r3 + 25 * 16] | |
3138 pmulhrsw m4, m3 | |
3139 packuswb m1, m4 | |
3140 movu [r0 + 56 * 16], m1 | |
3141 | |
3142 ; mode 6 [row 10] | |
3143 pmaddubsw m1, m0, [r3 + 15 * 16] | |
3144 pmulhrsw m1, m3 | |
3145 pmaddubsw m4, m2, [r3 + 15 * 16] | |
3146 pmulhrsw m4, m3 | |
3147 packuswb m1, m4 | |
3148 movu [r0 + 74 * 16], m1 | |
3149 | |
3150 ; mode 7 [row 14] | |
3151 pmaddubsw m1, m0, [r3 + 7 * 16] | |
3152 pmulhrsw m1, m3 | |
3153 pmaddubsw m4, m2, [r3 + 7 * 16] | |
3154 pmulhrsw m4, m3 | |
3155 packuswb m1, m4 | |
3156 movu [r0 + 94 * 16], m1 | |
3157 | |
3158 ; mode 7 [row 15] | |
3159 pmaddubsw m1, m0, [r3 + 16 * 16] | |
3160 pmulhrsw m1, m3 | |
3161 pmaddubsw m4, m2, [r3 + 16 * 16] | |
3162 pmulhrsw m4, m3 | |
3163 packuswb m1, m4 | |
3164 movu [r0 + 95 * 16], m1 | |
3165 | |
3166 ; mode 3 [row 6] | |
3167 movu m0, [r2 + 6 + 32] | |
3168 movd m1, [r2 + 22 + 32] | |
3169 palignr m1, m0, 1 | |
3170 punpcklbw m0, m1 | |
3171 | |
3172 ; mode 17 [row 2 - second half] | |
3173 pmaddubsw m1, m0, [r3 + 18 * 16] | |
3174 pmulhrsw m1, m3 | |
3175 packuswb m1, m1 | |
3176 movh [r0 + 242 * 16 + 8], m1 | |
3177 ; mode 17 [row 2 - second half] end | |
3178 | |
3179 pmaddubsw m1, m0, [r3 + 22 * 16] | |
3180 pmulhrsw m1, m3 | |
3181 movu m2, [r2 + 14 + 32] | |
3182 movd m4, [r2 + 30 + 32] | |
3183 palignr m4, m2, 1 | |
3184 punpcklbw m2, m4 | |
3185 pmaddubsw m4, m2, [r3 + 22 * 16] | |
3186 pmulhrsw m4, m3 | |
3187 packuswb m1, m4 | |
3188 movu [r0 + 22 * 16], m1 | |
3189 | |
3190 ; mode 6 [row 13] | |
3191 movu [r0 + 77 * 16], m1 | |
3192 | |
3193 ; mode 4 row [row 7] | |
3194 pmaddubsw m1, m0, [r3 + 8 * 16] | |
3195 pmulhrsw m1, m3 | |
3196 pmaddubsw m4, m2, [r3 + 8 * 16] | |
3197 pmulhrsw m4, m3 | |
3198 packuswb m1, m4 | |
3199 movu [r0 + 39 * 16], m1 | |
3200 | |
3201 ; mode 4 row [row 8] | |
3202 pmaddubsw m1, m0, [r3 + 29 * 16] | |
3203 pmulhrsw m1, m3 | |
3204 pmaddubsw m4, m2, [r3 + 29 * 16] | |
3205 pmulhrsw m4, m3 | |
3206 packuswb m1, m4 | |
3207 movu [r0 + 40 * 16], m1 | |
3208 | |
3209 ; mode 5 row [row 9] | |
3210 pmaddubsw m1, m0, [r3 + 10 * 16] | |
3211 pmulhrsw m1, m3 | |
3212 pmaddubsw m4, m2, [r3 + 10 * 16] | |
3213 pmulhrsw m4, m3 | |
3214 packuswb m1, m4 | |
3215 movu [r0 + 57 * 16], m1 | |
3216 | |
3217 ; mode 5 row [row 10] | |
3218 pmaddubsw m1, m0, [r3 + 27 * 16] | |
3219 pmulhrsw m1, m3 | |
3220 pmaddubsw m4, m2, [r3 + 27 * 16] | |
3221 pmulhrsw m4, m3 | |
3222 packuswb m1, m4 | |
3223 movu [r0 + 58 * 16], m1 | |
3224 | |
3225 ; mode 6 [row 12] | |
3226 pmaddubsw m1, m0, [r3 + 9 * 16] | |
3227 pmulhrsw m1, m3 | |
3228 pmaddubsw m4, m2, [r3 + 9 * 16] | |
3229 pmulhrsw m4, m3 | |
3230 packuswb m1, m4 | |
3231 movu [r0 + 76 * 16], m1 | |
3232 | |
3233 ; mode 3 [row 7] | |
3234 movu m0, [r2 + 7 + 32] | |
3235 movd m1, [r2 + 27 + 32] | |
3236 palignr m1, m0, 1 | |
3237 punpcklbw m0, m1 | |
3238 | |
3239 ; mode 17 [row 1 - second half] | |
3240 pmaddubsw m1, m0, [r3 + 12 * 16] | |
3241 pmulhrsw m1, m3 | |
3242 packuswb m1, m1 | |
3243 movh [r0 + 241 * 16 + 8], m1 | |
3244 ; mode 17 [row 1 - second half] end | |
3245 | |
3246 pmaddubsw m1, m0, [r3 + 16 * 16] | |
3247 pmulhrsw m1, m3 | |
3248 movu m2, [r2 + 15 + 32] | |
3249 movd m4, [r2 + 25 + 32] | |
3250 palignr m4, m2, 1 | |
3251 punpcklbw m2, m4 | |
3252 pmaddubsw m4, m2, [r3 + 16 * 16] | |
3253 pmulhrsw m4, m3 | |
3254 packuswb m1, m4 | |
3255 movu [r0 + 23 * 16], m1 | |
3256 | |
3257 ; mode 6 [row 15] | |
3258 movu [r0 + 79 * 16], m1 | |
3259 | |
3260 ; mode 4 row [row 9] | |
3261 pmaddubsw m1, m0, [r3 + 18 * 16] | |
3262 pmulhrsw m1, m3 | |
3263 pmaddubsw m4, m2, [r3 + 18 * 16] | |
3264 pmulhrsw m4, m3 | |
3265 packuswb m1, m4 | |
3266 movu [r0 + 41 * 16], m1 | |
3267 | |
3268 ; mode 5 row [row 11] | |
3269 pmaddubsw m1, m0, [r3 + 12 * 16] | |
3270 pmulhrsw m1, m3 | |
3271 pmaddubsw m4, m2, [r3 + 12 * 16] | |
3272 pmulhrsw m4, m3 | |
3273 packuswb m1, m4 | |
3274 movu [r0 + 59 * 16], m1 | |
3275 | |
3276 ; mode 5 row [row 12] | |
3277 pmaddubsw m1, m0, [r3 + 29 * 16] | |
3278 pmulhrsw m1, m3 | |
3279 pmaddubsw m4, m2, [r3 + 29 * 16] | |
3280 pmulhrsw m4, m3 | |
3281 packuswb m1, m4 | |
3282 movu [r0 + 60 * 16], m1 | |
3283 | |
3284 ; mode 6 [row 14] | |
3285 pmaddubsw m1, m0, [r3 + 3 * 16] | |
3286 pmulhrsw m1, m3 | |
3287 pmaddubsw m4, m2, [r3 + 3 * 16] | |
3288 pmulhrsw m4, m3 | |
3289 packuswb m1, m4 | |
3290 movu [r0 + 78 * 16], m1 | |
3291 | |
3292 ; mode 3 [row 8] | |
3293 movu m0, [r2 + 8 + 32] | |
3294 movd m1, [r2 + 24 + 32] | |
3295 palignr m1, m0, 1 | |
3296 punpcklbw m0, m1 | |
3297 pmaddubsw m1, m0, [r3 + 10 * 16] | |
3298 pmulhrsw m1, m3 | |
3299 movu m2, [r2 + 16 + 32] | |
3300 psrldq m4, m2, 1 | |
3301 pinsrb m4, [r2 + 32], 15 | |
3302 punpcklbw m2, m4 | |
3303 pmaddubsw m4, m2, [r3 + 10 * 16] | |
3304 pmulhrsw m4, m3 | |
3305 packuswb m1, m4 | |
3306 movu [r0 + 24 * 16], m1 | |
3307 | |
3308 ; mode 4 row [row 10] | |
3309 pmaddubsw m1, m0, [r3 + 7 * 16] | |
3310 pmulhrsw m1, m3 | |
3311 pmaddubsw m4, m2, [r3 + 7 * 16] | |
3312 pmulhrsw m4, m3 | |
3313 packuswb m1, m4 | |
3314 movu [r0 + 42 * 16], m1 | |
3315 | |
3316 ; mode 4 row [row 11] | |
3317 pmaddubsw m1, m0, [r3 + 28 * 16] | |
3318 pmulhrsw m1, m3 | |
3319 pmaddubsw m4, m2, [r3 + 28 * 16] | |
3320 pmulhrsw m4, m3 | |
3321 packuswb m1, m4 | |
3322 movu [r0 + 43 * 16], m1 | |
3323 | |
3324 ; mode 5 row [row 13] | |
3325 pmaddubsw m1, m0, [r3 + 14 * 16] | |
3326 pmulhrsw m1, m3 | |
3327 pmaddubsw m4, m2, [r3 + 14 * 16] | |
3328 pmulhrsw m4, m3 | |
3329 packuswb m1, m4 | |
3330 movu [r0 + 61 * 16], m1 | |
3331 | |
3332 ; mode 5 row [row 14] | |
3333 pmaddubsw m1, m0, [r3 + 31 * 16] | |
3334 pmulhrsw m1, m3 | |
3335 pmaddubsw m4, m2, [r3 + 31 * 16] | |
3336 pmulhrsw m4, m3 | |
3337 packuswb m1, m4 | |
3338 movu [r0 + 62 * 16], m1 | |
3339 | |
3340 ; mode 3 [row 9] | |
3341 movu m0, [r2 + 9 + 32] | |
3342 movd m1, [r2 + 16 + 32] | |
3343 palignr m1, m0, 1 | |
3344 punpcklbw m0, m1 | |
3345 pmaddubsw m1, m0, [r3 + 4 * 16] | |
3346 pmulhrsw m1, m3 | |
3347 movu m2, [r2 + 17 + 32] | |
3348 movd m4, [r2 + 33 + 32] | |
3349 palignr m4, m2, 1 | |
3350 punpcklbw m2, m4 | |
3351 pmaddubsw m4, m2, [r3 + 4 * 16] | |
3352 pmulhrsw m4, m3 | |
3353 packuswb m1, m4 | |
3354 movu [r0 + 25 * 16], m1 | |
3355 | |
3356 ; mode 4 row [row 12] | |
3357 pmaddubsw m1, m0, [r3 + 17 * 16] | |
3358 pmulhrsw m1, m3 | |
3359 pmaddubsw m4, m2, [r3 + 17 * 16] | |
3360 pmulhrsw m4, m3 | |
3361 packuswb m1, m4 | |
3362 movu [r0 + 44 * 16], m1 | |
3363 | |
3364 ; mode 3 [row 10] | |
3365 pmaddubsw m1, m0, [r3 + 30 * 16] | |
3366 pmulhrsw m1, m3 | |
3367 pmaddubsw m4, m2, [r3 + 30 * 16] | |
3368 pmulhrsw m4, m3 | |
3369 packuswb m1, m4 | |
3370 movu [r0 + 26 * 16], m1 | |
3371 | |
3372 ; mode 5 row [row 15] | |
3373 pmaddubsw m1, m0, [r3 + 16 * 16] | |
3374 pmulhrsw m1, m3 | |
3375 pmaddubsw m4, m2, [r3 + 16 * 16] | |
3376 pmulhrsw m4, m3 | |
3377 packuswb m1, m4 | |
3378 movu [r0 + 63 * 16], m1 | |
3379 | |
3380 ; mode 3 [row 11] | |
3381 movu m0, [r2 + 10 + 32] | |
3382 movd m1, [r2 + 26 + 32] | |
3383 palignr m1, m0, 1 | |
3384 punpcklbw m0, m1 | |
3385 pmaddubsw m1, m0, [r3 + 24 * 16] | |
3386 pmulhrsw m1, m3 | |
3387 movu m2, [r2 + 18 + 32] | |
3388 movd m4, [r2 + 34 + 32] | |
3389 palignr m4, m2, 1 | |
3390 punpcklbw m2, m4 | |
3391 pmaddubsw m4, m2, [r3 + 24 * 16] | |
3392 pmulhrsw m4, m3 | |
3393 packuswb m1, m4 | |
3394 movu [r0 + 27 * 16], m1 | |
3395 | |
3396 ; mode 4 row [row 13] | |
3397 pmaddubsw m1, m0, [r3 + 6 * 16] | |
3398 pmulhrsw m1, m3 | |
3399 pmaddubsw m4, m2, [r3 + 6 * 16] | |
3400 pmulhrsw m4, m3 | |
3401 packuswb m1, m4 | |
3402 movu [r0 + 45 * 16], m1 | |
3403 | |
3404 ; mode 4 row [row 14] | |
3405 pmaddubsw m1, m0, [r3 + 27 * 16] | |
3406 pmulhrsw m1, m3 | |
3407 pmaddubsw m4, m2, [r3 + 27 * 16] | |
3408 pmulhrsw m4, m3 | |
3409 packuswb m1, m4 | |
3410 movu [r0 + 46 * 16], m1 | |
3411 | |
3412 ; mode 3 [row 12] | |
3413 movu m0, [r2 + 11 + 32] | |
3414 movd m1, [r2 + 27 + 32] | |
3415 palignr m1, m0, 1 | |
3416 punpcklbw m0, m1 | |
3417 pmaddubsw m1, m0, [r3 + 18 * 16] | |
3418 pmulhrsw m1, m3 | |
3419 movu m2, [r2 + 19 + 32] | |
3420 movd m4, [r2 + 35 + 32] | |
3421 palignr m4, m2, 1 | |
3422 punpcklbw m2, m4 | |
3423 pmaddubsw m4, m2, [r3 + 18 * 16] | |
3424 pmulhrsw m4, m3 | |
3425 packuswb m1, m4 | |
3426 movu [r0 + 28 * 16], m1 | |
3427 | |
3428 ; mode 4 row [row 15] | |
3429 pmaddubsw m1, m0, [r3 + 16 * 16] | |
3430 pmulhrsw m1, m3 | |
3431 pmaddubsw m4, m2, [r3 + 16 * 16] | |
3432 pmulhrsw m4, m3 | |
3433 packuswb m1, m4 | |
3434 movu [r0 + 47 * 16], m1 | |
3435 | |
3436 ; mode 3 [row 13] | |
3437 movu m0, [r2 + 12 + 32] | |
3438 movd m1, [r2 + 28 + 32] | |
3439 palignr m1, m0, 1 | |
3440 punpcklbw m0, m1 | |
3441 pmaddubsw m1, m0, [r3 + 12 * 16] | |
3442 pmulhrsw m1, m3 | |
3443 movu m2, [r2 + 20 + 32] | |
3444 movd m4, [r2 + 36 + 32] | |
3445 palignr m4, m2, 1 | |
3446 punpcklbw m2, m4 | |
3447 pmaddubsw m4, m2, [r3 + 12 * 16] | |
3448 pmulhrsw m4, m3 | |
3449 packuswb m1, m4 | |
3450 movu [r0 + 29 * 16], m1 | |
3451 | |
3452 ; mode 3 [row 14] | |
3453 movu m0, [r2 + 13 + 32] | |
3454 movd m1, [r2 + 29 + 32] | |
3455 palignr m1, m0, 1 | |
3456 punpcklbw m0, m1 | |
3457 pmaddubsw m1, m0, [r3 + 6 * 16] | |
3458 pmulhrsw m1, m3 | |
3459 movu m2, [r2 + 21 + 32] | |
3460 movd m4, [r2 + 37 + 32] | |
3461 palignr m4, m2, 1 | |
3462 punpcklbw m2, m4 | |
3463 pmaddubsw m4, m2, [r3 + 6 * 16] | |
3464 pmulhrsw m4, m3 | |
3465 packuswb m1, m4 | |
3466 movu [r0 + 30 * 16], m1 | |
3467 | |
3468 ; mode 9 | |
3469 movu m0, [r1 + 1 + 32] | |
3470 movd m1, [r1 + 17 + 32] | |
3471 palignr m1, m0, 1 | |
3472 | |
3473 ; mode 9 [row 15] | |
3474 movu [r0 + 127 * 16], m1 | |
3475 | |
3476 ; mode 9 [row 0] | |
3477 punpcklbw m0, m1 | |
3478 pmaddubsw m1, m0, [r3 + 2 * 16] | |
3479 pmulhrsw m1, m3 | |
3480 movu m7, [r1 + 9 + 32] | |
3481 movd m4, [r2 + 25 + 32] | |
3482 palignr m2, m7, 1 | |
3483 punpcklbw m7, m2 | |
3484 pmaddubsw m2, m7, [r3 + 2 * 16] | |
3485 pmulhrsw m2, m3 | |
3486 packuswb m1, m2 | |
3487 movu [r0 + 112 * 16], m1 | |
3488 | |
3489 ; mode 9 [row 1] | |
3490 pmaddubsw m1, m0, [r3 + 4 * 16] | |
3491 pmulhrsw m1, m3 | |
3492 pmaddubsw m2, m7, [r3 + 4 * 16] | |
3493 pmulhrsw m2, m3 | |
3494 packuswb m1, m2 | |
3495 movu [r0 + 113 * 16], m1 | |
3496 | |
3497 ; mode 9 [row 2] | |
3498 pmaddubsw m1, m0, [r3 + 6 * 16] | |
3499 pmulhrsw m1, m3 | |
3500 pmaddubsw m2, m7, [r3 + 6 * 16] | |
3501 pmulhrsw m2, m3 | |
3502 packuswb m1, m2 | |
3503 movu [r0 + 114 * 16], m1 | |
3504 | |
3505 ; mode 9 [row 3] | |
3506 pmaddubsw m1, m0, [r3 + 8 * 16] | |
3507 pmulhrsw m1, m3 | |
3508 pmaddubsw m2, m7, [r3 + 8 * 16] | |
3509 pmulhrsw m2, m3 | |
3510 packuswb m1, m2 | |
3511 movu [r0 + 115 * 16], m1 | |
3512 | |
3513 ; mode 9 [row 4] | |
3514 pmaddubsw m1, m0, [r3 + 10 * 16] | |
3515 pmulhrsw m1, m3 | |
3516 pmaddubsw m2, m7, [r3 + 10 * 16] | |
3517 pmulhrsw m2, m3 | |
3518 packuswb m1, m2 | |
3519 movu [r0 + 116 * 16], m1 | |
3520 | |
3521 ; mode 9 [row 5] | |
3522 pmaddubsw m1, m0, [r3 + 12 * 16] | |
3523 pmulhrsw m1, m3 | |
3524 pmaddubsw m2, m7, [r3 + 12 * 16] | |
3525 pmulhrsw m2, m3 | |
3526 packuswb m1, m2 | |
3527 movu [r0 + 117 * 16], m1 | |
3528 | |
3529 ; mode 9 [row 6] | |
3530 pmaddubsw m1, m0, [r3 + 14 * 16] | |
3531 pmulhrsw m1, m3 | |
3532 pmaddubsw m2, m7, [r3 + 14 * 16] | |
3533 pmulhrsw m2, m3 | |
3534 packuswb m1, m2 | |
3535 movu [r0 + 118 * 16], m1 | |
3536 | |
3537 ; mode 9 [row 7] | |
3538 pmaddubsw m1, m0, [r3 + 16 * 16] | |
3539 pmulhrsw m1, m3 | |
3540 pmaddubsw m2, m7, [r3 + 16 * 16] | |
3541 pmulhrsw m2, m3 | |
3542 packuswb m1, m2 | |
3543 movu [r0 + 119 * 16], m1 | |
3544 | |
3545 ; mode 9 [row 8] | |
3546 pmaddubsw m1, m0, [r3 + 18 * 16] | |
3547 pmulhrsw m1, m3 | |
3548 pmaddubsw m2, m7, [r3 + 18 * 16] | |
3549 pmulhrsw m2, m3 | |
3550 packuswb m1, m2 | |
3551 movu [r0 + 120 * 16], m1 | |
3552 | |
3553 ; mode 9 [row 9] | |
3554 pmaddubsw m1, m0, [r3 + 20 * 16] | |
3555 pmulhrsw m1, m3 | |
3556 pmaddubsw m2, m7, [r3 + 20 * 16] | |
3557 pmulhrsw m2, m3 | |
3558 packuswb m1, m2 | |
3559 movu [r0 + 121 * 16], m1 | |
3560 | |
3561 ; mode 9 [row 10] | |
3562 pmaddubsw m1, m0, [r3 + 22 * 16] | |
3563 pmulhrsw m1, m3 | |
3564 pmaddubsw m2, m7, [r3 + 22 * 16] | |
3565 pmulhrsw m2, m3 | |
3566 packuswb m1, m2 | |
3567 movu [r0 + 122 * 16], m1 | |
3568 | |
3569 ; mode 9 [row 11] | |
3570 pmaddubsw m1, m0, [r3 + 24 * 16] | |
3571 pmulhrsw m1, m3 | |
3572 pmaddubsw m2, m7, [r3 + 24 * 16] | |
3573 pmulhrsw m2, m3 | |
3574 packuswb m1, m2 | |
3575 movu [r0 + 123 * 16], m1 | |
3576 | |
3577 ; mode 9 [row 12] | |
3578 pmaddubsw m1, m0, [r3 + 26 * 16] | |
3579 pmulhrsw m1, m3 | |
3580 pmaddubsw m2, m7, [r3 + 26 * 16] | |
3581 pmulhrsw m2, m3 | |
3582 packuswb m1, m2 | |
3583 movu [r0 + 124 * 16], m1 | |
3584 | |
3585 ; mode 9 [row 13] | |
3586 pmaddubsw m1, m0, [r3 + 28 * 16] | |
3587 pmulhrsw m1, m3 | |
3588 pmaddubsw m2, m7, [r3 + 28 * 16] | |
3589 pmulhrsw m2, m3 | |
3590 packuswb m1, m2 | |
3591 movu [r0 + 125 * 16], m1 | |
3592 | |
3593 ; mode 9 [row 14] | |
3594 pmaddubsw m1, m0, [r3 + 30 * 16] | |
3595 pmulhrsw m1, m3 | |
3596 pmaddubsw m2, m7, [r3 + 30 * 16] | |
3597 pmulhrsw m2, m3 | |
3598 packuswb m1, m2 | |
3599 movu [r0 + 126 * 16], m1 | |
3600 | |
3601 ; mode 10 | |
3602 movu m1, [r1 + 1 + 32] | |
3603 movu [r0 + 128 * 16], m1 | |
3604 movu [r0 + 129 * 16], m1 | |
3605 movu [r0 + 130 * 16], m1 | |
3606 movu [r0 + 131 * 16], m1 | |
3607 movu [r0 + 132 * 16], m1 | |
3608 movu [r0 + 133 * 16], m1 | |
3609 movu [r0 + 134 * 16], m1 | |
3610 movu [r0 + 135 * 16], m1 | |
3611 movu [r0 + 136 * 16], m1 | |
3612 movu [r0 + 137 * 16], m1 | |
3613 movu [r0 + 138 * 16], m1 | |
3614 movu [r0 + 139 * 16], m1 | |
3615 movu [r0 + 140 * 16], m1 | |
3616 movu [r0 + 141 * 16], m1 | |
3617 movu [r0 + 142 * 16], m1 | |
3618 movu [r0 + 143 * 16], m1 | |
3619 | |
3620 pxor m0, m0 | |
3621 pshufb m1, m1, m0 | |
3622 punpcklbw m1, m0 | |
3623 pinsrb m2, [r1], 0 | |
3624 pshufb m2, m2, m0 | |
3625 punpcklbw m2, m0 | |
3626 movu m4, [r1 + 1] | |
3627 punpcklbw m5, m4, m0 | |
3628 punpckhbw m4, m0 | |
3629 psubw m5, m2 | |
3630 psubw m4, m2 | |
3631 psraw m5, 1 | |
3632 psraw m4, 1 | |
3633 paddw m5, m1 | |
3634 paddw m4, m1 | |
3635 packuswb m5, m4 | |
3636 | |
3637 pextrb [r0 + 128 * 16], m5, 0 | |
3638 pextrb [r0 + 129 * 16], m5, 1 | |
3639 pextrb [r0 + 130 * 16], m5, 2 | |
3640 pextrb [r0 + 131 * 16], m5, 3 | |
3641 pextrb [r0 + 132 * 16], m5, 4 | |
3642 pextrb [r0 + 133 * 16], m5, 5 | |
3643 pextrb [r0 + 134 * 16], m5, 6 | |
3644 pextrb [r0 + 135 * 16], m5, 7 | |
3645 pextrb [r0 + 136 * 16], m5, 8 | |
3646 pextrb [r0 + 137 * 16], m5, 9 | |
3647 pextrb [r0 + 138 * 16], m5, 10 | |
3648 pextrb [r0 + 139 * 16], m5, 11 | |
3649 pextrb [r0 + 140 * 16], m5, 12 | |
3650 pextrb [r0 + 141 * 16], m5, 13 | |
3651 pextrb [r0 + 142 * 16], m5, 14 | |
3652 pextrb [r0 + 143 * 16], m5, 15 | |
3653 | |
3654 ; mode 11 | |
3655 movu m0, [r1 + 32] | |
3656 pinsrb m0, [r1], 0 | |
3657 | |
3658 ; mode 11 [row 15] | |
3659 movu [r0 + 159 * 16], m0 | |
3660 | |
3661 ; mode 11 [row 0] | |
3662 movu m1, [r1 + 1 + 32] | |
3663 punpcklbw m0, m1 | |
3664 pmaddubsw m1, m0, [r3 + 30 * 16] | |
3665 pmulhrsw m1, m3 | |
3666 movu m7, [r1 + 8 + 32] | |
3667 movu m2, [r1 + 9 + 32] | |
3668 punpcklbw m7, m2 | |
3669 pmaddubsw m2, m7, [r3 + 30 * 16] | |
3670 pmulhrsw m2, m3 | |
3671 packuswb m1, m2 | |
3672 movu [r0 + 144 * 16], m1 | |
3673 | |
3674 ; mode 11 [row 1] | |
3675 pmaddubsw m1, m0, [r3 + 28 * 16] | |
3676 pmulhrsw m1, m3 | |
3677 pmaddubsw m2, m7, [r3 + 28 * 16] | |
3678 pmulhrsw m2, m3 | |
3679 packuswb m1, m2 | |
3680 movu [r0 + 145 * 16], m1 | |
3681 | |
3682 ; mode 11 [row 2] | |
3683 pmaddubsw m1, m0, [r3 + 26 * 16] | |
3684 pmulhrsw m1, m3 | |
3685 pmaddubsw m2, m7, [r3 + 26 * 16] | |
3686 pmulhrsw m2, m3 | |
3687 packuswb m1, m2 | |
3688 movu [r0 + 146 * 16], m1 | |
3689 | |
3690 ; mode 11 [row 3] | |
3691 pmaddubsw m1, m0, [r3 + 24 * 16] | |
3692 pmulhrsw m1, m3 | |
3693 pmaddubsw m2, m7, [r3 + 24 * 16] | |
3694 pmulhrsw m2, m3 | |
3695 packuswb m1, m2 | |
3696 movu [r0 + 147 * 16], m1 | |
3697 | |
3698 ; mode 11 [row 4] | |
3699 pmaddubsw m1, m0, [r3 + 22 * 16] | |
3700 pmulhrsw m1, m3 | |
3701 pmaddubsw m2, m7, [r3 + 22 * 16] | |
3702 pmulhrsw m2, m3 | |
3703 packuswb m1, m2 | |
3704 movu [r0 + 148 * 16], m1 | |
3705 | |
3706 ; mode 11 [row 5] | |
3707 pmaddubsw m1, m0, [r3 + 20 * 16] | |
3708 pmulhrsw m1, m3 | |
3709 pmaddubsw m2, m7, [r3 + 20 * 16] | |
3710 pmulhrsw m2, m3 | |
3711 packuswb m1, m2 | |
3712 movu [r0 + 149 * 16], m1 | |
3713 | |
3714 ; mode 11 [row 6] | |
3715 pmaddubsw m1, m0, [r3 + 18 * 16] | |
3716 pmulhrsw m1, m3 | |
3717 pmaddubsw m2, m7, [r3 + 18 * 16] | |
3718 pmulhrsw m2, m3 | |
3719 packuswb m1, m2 | |
3720 movu [r0 + 150 * 16], m1 | |
3721 | |
3722 ; mode 11 [row 7] | |
3723 pmaddubsw m1, m0, [r3 + 16 * 16] | |
3724 pmulhrsw m1, m3 | |
3725 pmaddubsw m2, m7, [r3 + 16 * 16] | |
3726 pmulhrsw m2, m3 | |
3727 packuswb m1, m2 | |
3728 movu [r0 + 151 * 16], m1 | |
3729 | |
3730 ; mode 11 [row 8] | |
3731 pmaddubsw m1, m0, [r3 + 14 * 16] | |
3732 pmulhrsw m1, m3 | |
3733 pmaddubsw m2, m7, [r3 + 14 * 16] | |
3734 pmulhrsw m2, m3 | |
3735 packuswb m1, m2 | |
3736 movu [r0 + 152 * 16], m1 | |
3737 | |
3738 ; mode 11 [row 9] | |
3739 pmaddubsw m1, m0, [r3 + 12 * 16] | |
3740 pmulhrsw m1, m3 | |
3741 pmaddubsw m2, m7, [r3 + 12 * 16] | |
3742 pmulhrsw m2, m3 | |
3743 packuswb m1, m2 | |
3744 movu [r0 + 153 * 16], m1 | |
3745 | |
3746 ; mode 11 [row 10] | |
3747 pmaddubsw m1, m0, [r3 + 10 * 16] | |
3748 pmulhrsw m1, m3 | |
3749 pmaddubsw m2, m7, [r3 + 10 * 16] | |
3750 pmulhrsw m2, m3 | |
3751 packuswb m1, m2 | |
3752 movu [r0 + 154 * 16], m1 | |
3753 | |
3754 ; mode 11 [row 11] | |
3755 pmaddubsw m1, m0, [r3 + 8 * 16] | |
3756 pmulhrsw m1, m3 | |
3757 pmaddubsw m2, m7, [r3 + 8 * 16] | |
3758 pmulhrsw m2, m3 | |
3759 packuswb m1, m2 | |
3760 movu [r0 + 155 * 16], m1 | |
3761 | |
3762 ; mode 11 [row 12] | |
3763 pmaddubsw m1, m0, [r3 + 6 * 16] | |
3764 pmulhrsw m1, m3 | |
3765 pmaddubsw m2, m7, [r3 + 6 * 16] | |
3766 pmulhrsw m2, m3 | |
3767 packuswb m1, m2 | |
3768 movu [r0 + 156 * 16], m1 | |
3769 | |
3770 ; mode 11 [row 13] | |
3771 pmaddubsw m1, m0, [r3 + 4 * 16] | |
3772 pmulhrsw m1, m3 | |
3773 pmaddubsw m2, m7, [r3 + 4 * 16] | |
3774 pmulhrsw m2, m3 | |
3775 packuswb m1, m2 | |
3776 movu [r0 + 157 * 16], m1 | |
3777 | |
3778 ; mode 11 [row 14] | |
3779 pmaddubsw m1, m0, [r3 + 2 * 16] | |
3780 pmulhrsw m1, m3 | |
3781 pmaddubsw m2, m7, [r3 + 2 * 16] | |
3782 pmulhrsw m2, m3 | |
3783 packuswb m1, m2 | |
3784 movu [r0 + 158 * 16], m1 | |
3785 | |
3786 ; mode 12 [row 0] | |
3787 movu m0, [r2 + 32] | |
3788 pinsrb m0, [r2], 0 | |
3789 movu m1, [r2 + 1 + 32] | |
3790 punpcklbw m0, m1 | |
3791 pmaddubsw m1, m0, [r3 + 27 * 16] | |
3792 pmulhrsw m1, m3 | |
3793 movu m7, [r2 + 8 + 32] | |
3794 movd m2, [r2 + 24 + 32] | |
3795 palignr m2, m7, 1 | |
3796 punpcklbw m7, m2 | |
3797 pmaddubsw m2, m7, [r3 + 27 * 16] | |
3798 pmulhrsw m2, m3 | |
3799 packuswb m1, m2 | |
3800 movu [r0 + 160 * 16], m1 | |
3801 | |
3802 ; mode 12 [row 1] | |
3803 pmaddubsw m1, m0, [r3 + 22 * 16] | |
3804 pmulhrsw m1, m3 | |
3805 pmaddubsw m2, m7, [r3 + 22 * 16] | |
3806 pmulhrsw m2, m3 | |
3807 packuswb m1, m2 | |
3808 movu [r0 + 161 * 16], m1 | |
3809 | |
3810 ; mode 12 [row 2] | |
3811 pmaddubsw m1, m0, [r3 + 17 * 16] | |
3812 pmulhrsw m1, m3 | |
3813 pmaddubsw m2, m7, [r3 + 17 * 16] | |
3814 pmulhrsw m2, m3 | |
3815 packuswb m1, m2 | |
3816 movu [r0 + 162 * 16], m1 | |
3817 | |
3818 ; mode 12 [row 3] | |
3819 pmaddubsw m1, m0, [r3 + 12 * 16] | |
3820 pmulhrsw m1, m3 | |
3821 pmaddubsw m2, m7, [r3 + 12 * 16] | |
3822 pmulhrsw m2, m3 | |
3823 packuswb m1, m2 | |
3824 movu [r0 + 163 * 16], m1 | |
3825 | |
3826 ; mode 12 [row 4] | |
3827 pmaddubsw m1, m0, [r3 + 7 * 16] | |
3828 pmulhrsw m1, m3 | |
3829 pmaddubsw m2, m7, [r3 + 7 * 16] | |
3830 pmulhrsw m2, m3 | |
3831 packuswb m1, m2 | |
3832 movu [r0 + 164 * 16], m1 | |
3833 | |
3834 ; mode 12 [row 5] | |
3835 pmaddubsw m1, m0, [r3 + 2 * 16] | |
3836 pmulhrsw m1, m3 | |
3837 pmaddubsw m2, m7, [r3 + 2 * 16] | |
3838 pmulhrsw m2, m3 | |
3839 packuswb m1, m2 | |
3840 movu [r0 + 165 * 16], m1 | |
3841 | |
3842 ; mode 13 [row 0] | |
3843 pmaddubsw m1, m0, [r3 + 23 * 16] | |
3844 pmulhrsw m1, m3 | |
3845 pmaddubsw m2, m7, [r3 + 23 * 16] | |
3846 pmulhrsw m2, m3 | |
3847 packuswb m1, m2 | |
3848 movu [r0 + 176 * 16], m1 | |
3849 | |
3850 ; mode 13 [row 1] | |
3851 pmaddubsw m1, m0, [r3 + 14 * 16] | |
3852 pmulhrsw m1, m3 | |
3853 pmaddubsw m2, m7, [r3 + 14 * 16] | |
3854 pmulhrsw m2, m3 | |
3855 packuswb m1, m2 | |
3856 movu [r0 + 177 * 16], m1 | |
3857 | |
3858 ; mode 13 [row 2] | |
3859 pmaddubsw m1, m0, [r3 + 5 * 16] | |
3860 pmulhrsw m1, m3 | |
3861 pmaddubsw m2, m7, [r3 + 5 * 16] | |
3862 pmulhrsw m2, m3 | |
3863 packuswb m1, m2 | |
3864 movu [r0 + 178 * 16], m1 | |
3865 | |
3866 ; mode 14 [row 0] | |
3867 pmaddubsw m1, m0, [r3 + 19 * 16] | |
3868 pmulhrsw m1, m3 | |
3869 pmaddubsw m2, m7, [r3 + 19 * 16] | |
3870 pmulhrsw m2, m3 | |
3871 packuswb m1, m2 | |
3872 movu [r0 + 192 * 16], m1 | |
3873 | |
3874 ; mode 14 [row 1] | |
3875 pmaddubsw m1, m0, [r3 + 6 * 16] | |
3876 pmulhrsw m1, m3 | |
3877 pmaddubsw m2, m7, [r3 + 6 * 16] | |
3878 pmulhrsw m2, m3 | |
3879 packuswb m1, m2 | |
3880 movu [r0 + 193 * 16], m1 | |
3881 | |
3882 ; mode 17 [row 0] | |
3883 movu [r0 + 240 * 16], m1 | |
3884 | |
3885 ; mode 15 [row 0] | |
3886 pmaddubsw m1, m0, [r3 + 15 * 16] | |
3887 pmulhrsw m1, m3 | |
3888 pmaddubsw m2, m7, [r3 + 15 * 16] | |
3889 pmulhrsw m2, m3 | |
3890 packuswb m1, m2 | |
3891 movu [r0 + 208 * 16], m1 | |
3892 | |
3893 ; mode 15 [row 15 - second half] | |
3894 pmaddubsw m1, m0, [r3 + 16 * 16] | |
3895 pmulhrsw m1, m3 | |
3896 packuswb m1, m1 | |
3897 movh [r0 + 223 * 16 + 8], m1 | |
3898 ; mode 15 [row 15 - second half] end | |
3899 | |
3900 ; mode 16 [row 0] | |
3901 pmaddubsw m1, m0, [r3 + 11 * 16] | |
3902 pmulhrsw m1, m3 | |
3903 pmaddubsw m2, m7, [r3 + 11 * 16] | |
3904 pmulhrsw m2, m3 | |
3905 packuswb m1, m2 | |
3906 movu [r0 + 224 * 16], m1 | |
3907 | |
3908 ; mode 17 [row 9 - second half] | |
3909 pmaddubsw m1, m0, [r3 + 28 * 16] | |
3910 pmulhrsw m1, m3 | |
3911 packuswb m1, m1 | |
3912 movh [r0 + 249 * 16 + 8], m1 | |
3913 ; mode 17 [row 9 - second half] end | |
3914 | |
3915 ; mode 17 [row 10 - second half] | |
3916 pmaddubsw m1, m0, [r3 + 2 * 16] | |
3917 pmulhrsw m1, m3 | |
3918 packuswb m1, m1 | |
3919 movh [r0 + 250 * 16 + 8], m1 | |
3920 ; mode 17 [row 10 - second half] end | |
3921 | |
3922 ; mode 17 [row 1 - first half] | |
3923 pslldq m6, m0, 2 | |
3924 pinsrb m6, [r2], 1 | |
3925 pinsrb m6, [r2 + 1], 0 | |
3926 pmaddubsw m1, m6, [r3 + 12 * 16] | |
3927 pmulhrsw m1, m3 | |
3928 packuswb m1, m1 | |
3929 movh [r0 + 241 * 16], m1 | |
3930 | |
3931 ; mode 17 [row 11 - second half] | |
3932 pmaddubsw m1, m6, [r3 + 8 * 16] | |
3933 pmulhrsw m1, m3 | |
3934 packuswb m1, m1 | |
3935 movh [r0 + 251 * 16 + 8], m1 | |
3936 ; mode 17 [row 11 - second half] end | |
3937 | |
3938 ; mode 17 [row 2 - first half] | |
3939 pslldq m6, 2 | |
3940 pinsrb m6, [r2 + 1], 1 | |
3941 pinsrb m6, [r2 + 2], 0 | |
3942 pmaddubsw m1, m6, [r3 + 18 * 16] | |
3943 pmulhrsw m1, m3 | |
3944 packuswb m1, m1 | |
3945 movh [r0 + 242 * 16], m1 | |
3946 | |
3947 ; mode 17 [row 12 - second half] | |
3948 pmaddubsw m1, m6, [r3 + 14 * 16] | |
3949 pmulhrsw m1, m3 | |
3950 packuswb m1, m1 | |
3951 movh [r0 + 252 * 16 + 8], m1 | |
3952 ; mode 17 [row 12 - second half] end | |
3953 | |
3954 ; mode 17 [row 3 - first half] | |
3955 pslldq m6, 2 | |
3956 pinsrb m6, [r2 + 2], 1 | |
3957 pinsrb m6, [r2 + 4], 0 | |
3958 pmaddubsw m1, m6, [r3 + 24 * 16] | |
3959 pmulhrsw m1, m3 | |
3960 packuswb m1, m1 | |
3961 movh [r0 + 243 * 16], m1 | |
3962 | |
3963 ; mode 17 [row 13 - second half] | |
3964 pmaddubsw m1, m6, [r3 + 20 * 16] | |
3965 pmulhrsw m1, m3 | |
3966 packuswb m1, m1 | |
3967 movh [r0 + 253 * 16 + 8], m1 | |
3968 | |
3969 ; mode 17 [row 4 - first half] | |
3970 pslldq m6, 2 | |
3971 pinsrb m6, [r2 + 4], 1 | |
3972 pinsrb m6, [r2 + 5], 0 | |
3973 pmaddubsw m1, m6, [r3 + 30 * 16] | |
3974 pmulhrsw m1, m3 | |
3975 packuswb m1, m1 | |
3976 movh [r0 + 244 * 16], m1 | |
3977 | |
3978 ; mode 17 [row 5 - first half] | |
3979 pmaddubsw m1, m6, [r3 + 4 * 16] | |
3980 pmulhrsw m1, m3 | |
3981 packuswb m1, m1 | |
3982 movh [r0 + 245 * 16], m1 | |
3983 | |
3984 ; mode 17 [row 14 - second half] | |
3985 pmaddubsw m1, m6, [r3 + 26 * 16] | |
3986 pmulhrsw m1, m3 | |
3987 packuswb m1, m1 | |
3988 movh [r0 + 254 * 16 + 8], m1 | |
3989 ; mode 17 [row 14 - second half] end | |
3990 | |
3991 ; mode 17 [row 6 - first half] | |
3992 pslldq m6, 2 | |
3993 pinsrb m6, [r2 + 5], 1 | |
3994 pinsrb m6, [r2 + 6], 0 | |
3995 pmaddubsw m1, m6, [r3 + 10 * 16] | |
3996 pmulhrsw m1, m3 | |
3997 packuswb m1, m1 | |
3998 movh [r0 + 246 * 16], m1 | |
3999 | |
4000 ; mode 17 [row 7 - first half] | |
4001 pslldq m6, 2 | |
4002 pinsrb m6, [r2 + 6], 1 | |
4003 pinsrb m6, [r2 + 7], 0 | |
4004 pmaddubsw m1, m6, [r3 + 16 * 16] | |
4005 pmulhrsw m1, m3 | |
4006 packuswb m1, m1 | |
4007 movh [r0 + 247 * 16], m1 | |
4008 | |
4009 ; mode 17 [row 8 - first half] | |
4010 pslldq m6, 2 | |
4011 pinsrb m6, [r2 + 7], 1 | |
4012 pinsrb m6, [r2 + 9], 0 | |
4013 pmaddubsw m1, m6, [r3 + 22 * 16] | |
4014 pmulhrsw m1, m3 | |
4015 packuswb m1, m1 | |
4016 movh [r0 + 248 * 16], m1 | |
4017 | |
4018 ; mode 17 [row 9 - first half] | |
4019 pslldq m6, 2 | |
4020 pinsrb m6, [r2 + 9], 1 | |
4021 pinsrb m6, [r2 + 10], 0 | |
4022 pmaddubsw m1, m6, [r3 + 28 * 16] | |
4023 pmulhrsw m1, m3 | |
4024 packuswb m1, m1 | |
4025 movh [r0 + 249 * 16], m1 | |
4026 | |
4027 ; mode 17 [row 10 - first half] | |
4028 pmaddubsw m1, m6, [r3 + 2 * 16] | |
4029 pmulhrsw m1, m3 | |
4030 packuswb m1, m1 | |
4031 movh [r0 + 250 * 16], m1 | |
4032 | |
4033 ; mode 17 [row 11 - first half] | |
4034 pslldq m6, 2 | |
4035 pinsrb m6, [r2 + 10], 1 | |
4036 pinsrb m6, [r2 + 11], 0 | |
4037 pmaddubsw m1, m6, [r3 + 8 * 16] | |
4038 pmulhrsw m1, m3 | |
4039 packuswb m1, m1 | |
4040 movh [r0 + 251 * 16], m1 | |
4041 | |
4042 ; mode 17 [row 12 - first half] | |
4043 pslldq m6, 2 | |
4044 pinsrb m6, [r2 + 11], 1 | |
4045 pinsrb m6, [r2 + 12], 0 | |
4046 pmaddubsw m1, m6, [r3 + 14 * 16] | |
4047 pmulhrsw m1, m3 | |
4048 packuswb m1, m1 | |
4049 movh [r0 + 252 * 16], m1 | |
4050 | |
4051 ; mode 17 [row 13 - first half] | |
4052 pslldq m6, 2 | |
4053 pinsrb m6, [r2 + 12], 1 | |
4054 pinsrb m6, [r2 + 14], 0 | |
4055 pmaddubsw m1, m6, [r3 + 20 * 16] | |
4056 pmulhrsw m1, m3 | |
4057 packuswb m1, m1 | |
4058 movh [r0 + 253 * 16], m1 | |
4059 | |
4060 ; mode 17 [row 14 - first half] | |
4061 pslldq m6, 2 | |
4062 pinsrb m6, [r2 + 14], 1 | |
4063 pinsrb m6, [r2 + 15], 0 | |
4064 pmaddubsw m1, m6, [r3 + 26 * 16] | |
4065 pmulhrsw m1, m3 | |
4066 packuswb m1, m1 | |
4067 movh [r0 + 254 * 16], m1 | |
4068 | |
4069 ; mode 16 [row 12 - second half] | |
4070 pmaddubsw m1, m0, [r3 + 15 * 16] | |
4071 pmulhrsw m1, m3 | |
4072 packuswb m1, m1 | |
4073 movh [r0 + 236 * 16 + 8], m1 | |
4074 ; mode 16 [row 12 - second half] end | |
4075 | |
4076 ; mode 12 [row 6] | |
4077 pslldq m2, m0, 2 | |
4078 pinsrb m2, [r2], 1 | |
4079 pinsrb m2, [r2 + 6], 0 | |
4080 pmaddubsw m1, m2, [r3 + 29 * 16] | |
4081 pmulhrsw m1, m3 | |
4082 movu m0, [r2 + 7 + 32] | |
4083 psrldq m4, m0, 1 | |
4084 punpcklbw m0, m4 | |
4085 pmaddubsw m4, m0, [r3 + 29 * 16] | |
4086 pmulhrsw m4, m3 | |
4087 packuswb m1, m4 | |
4088 movu [r0 + 166 * 16], m1 | |
4089 | |
4090 ; mode 12 [row 7] | |
4091 pmaddubsw m1, m2, [r3 + 24 * 16] | |
4092 pmulhrsw m1, m3 | |
4093 pmaddubsw m4, m0, [r3 + 24 * 16] | |
4094 pmulhrsw m4, m3 | |
4095 packuswb m1, m4 | |
4096 movu [r0 + 167 * 16], m1 | |
4097 | |
4098 ; mode 12 [row 8] | |
4099 pmaddubsw m1, m2, [r3 + 19 * 16] | |
4100 pmulhrsw m1, m3 | |
4101 pmaddubsw m4, m0, [r3 + 19 * 16] | |
4102 pmulhrsw m4, m3 | |
4103 packuswb m1, m4 | |
4104 movu [r0 + 168 * 16], m1 | |
4105 | |
4106 ; mode 12 [row 9] | |
4107 pmaddubsw m1, m2, [r3 + 14 * 16] | |
4108 pmulhrsw m1, m3 | |
4109 pmaddubsw m4, m0, [r3 + 14 * 16] | |
4110 pmulhrsw m4, m3 | |
4111 packuswb m1, m4 | |
4112 movu [r0 + 169 * 16], m1 | |
4113 | |
4114 ; mode 12 [row 10] | |
4115 pmaddubsw m1, m2, [r3 + 9 * 16] | |
4116 pmulhrsw m1, m3 | |
4117 pmaddubsw m4, m0, [r3 + 9 * 16] | |
4118 pmulhrsw m4, m3 | |
4119 packuswb m1, m4 | |
4120 movu [r0 + 170 * 16], m1 | |
4121 | |
4122 ; mode 12 [row 11] | |
4123 pmaddubsw m1, m2, [r3 + 4 * 16] | |
4124 pmulhrsw m1, m3 | |
4125 pmaddubsw m4, m0, [r3 + 4 * 16] | |
4126 pmulhrsw m4, m3 | |
4127 packuswb m1, m4 | |
4128 movu [r0 + 171 * 16], m1 | |
4129 | |
4130 ; mode 13 [row 3] | |
4131 pinsrb m7, m2, [r2 + 4], 0 | |
4132 pmaddubsw m1, m7, [r3 + 28 * 16] | |
4133 pmulhrsw m1, m3 | |
4134 pmaddubsw m4, m0, [r3 + 28 * 16] | |
4135 pmulhrsw m4, m3 | |
4136 packuswb m1, m4 | |
4137 movu [r0 + 179 * 16], m1 | |
4138 | |
4139 ; mode 13 [row 4] | |
4140 pmaddubsw m1, m7, [r3 + 19 * 16] | |
4141 pmulhrsw m1, m3 | |
4142 pmaddubsw m4, m0, [r3 + 19 * 16] | |
4143 pmulhrsw m4, m3 | |
4144 packuswb m1, m4 | |
4145 movu [r0 + 180 * 16], m1 | |
4146 | |
4147 ; mode 13 [row 5] | |
4148 pmaddubsw m1, m7, [r3 + 10 * 16] | |
4149 pmulhrsw m1, m3 | |
4150 pmaddubsw m4, m0, [r3 + 10 * 16] | |
4151 pmulhrsw m4, m3 | |
4152 packuswb m1, m4 | |
4153 movu [r0 + 181 * 16], m1 | |
4154 | |
4155 ; mode 13 [row 6] | |
4156 pmaddubsw m1, m7, [r3 + 1 * 16] | |
4157 pmulhrsw m1, m3 | |
4158 pmaddubsw m4, m0, [r3 + 1 * 16] | |
4159 pmulhrsw m4, m3 | |
4160 packuswb m1, m4 | |
4161 movu [r0 + 182 * 16], m1 | |
4162 | |
4163 ; mode 14 [row 2] | |
4164 pinsrb m5, m7, [r2 + 2], 0 | |
4165 pmaddubsw m1, m5, [r3 + 25 * 16] | |
4166 pmulhrsw m1, m3 | |
4167 pmaddubsw m4, m0, [r3 + 25 * 16] | |
4168 pmulhrsw m4, m3 | |
4169 packuswb m1, m4 | |
4170 movu [r0 + 194 * 16], m1 | |
4171 | |
4172 ; mode 14 [row 3] | |
4173 pmaddubsw m1, m5, [r3 + 12 * 16] | |
4174 pmulhrsw m1, m3 | |
4175 pmaddubsw m4, m0, [r3 + 12 * 16] | |
4176 pmulhrsw m4, m3 | |
4177 packuswb m1, m4 | |
4178 movu [r0 + 195 * 16], m1 | |
4179 | |
4180 ; mode 15 [row 1] | |
4181 pmaddubsw m1, m5, [r3 + 30 * 16] | |
4182 pmulhrsw m1, m3 | |
4183 pmaddubsw m4, m0, [r3 + 30 * 16] | |
4184 pmulhrsw m4, m3 | |
4185 packuswb m1, m4 | |
4186 movu [r0 + 209 * 16], m1 | |
4187 | |
4188 ; mode 15 [row 2] | |
4189 pmaddubsw m1, m5, [r3 + 13 * 16] | |
4190 pmulhrsw m1, m3 | |
4191 pmaddubsw m4, m0, [r3 + 13 * 16] | |
4192 pmulhrsw m4, m3 | |
4193 packuswb m1, m4 | |
4194 movu [r0 + 210 * 16], m1 | |
4195 | |
4196 ; mode 16 [row 1] | |
4197 pmaddubsw m1, m5, [r3 + 22 * 16] | |
4198 pmulhrsw m1, m3 | |
4199 pmaddubsw m4, m0, [r3 + 22 * 16] | |
4200 pmulhrsw m4, m3 | |
4201 packuswb m1, m4 | |
4202 movu [r0 + 225 * 16], m1 | |
4203 | |
4204 ; mode 16 [row 2] | |
4205 pmaddubsw m1, m5, [r3 + 1 * 16] | |
4206 pmulhrsw m1, m3 | |
4207 pmaddubsw m4, m0, [r3 + 1 * 16] | |
4208 pmulhrsw m4, m3 | |
4209 packuswb m1, m4 | |
4210 movu [r0 + 226 * 16], m1 | |
4211 | |
4212 ; mode 16 [row 13 - second half] | |
4213 pmaddubsw m1, m5, [r3 + 26 * 16] | |
4214 pmulhrsw m1, m3 | |
4215 packuswb m1, m1 | |
4216 movh [r0 + 237 * 16 + 8], m1 | |
4217 ; mode 16 [row 13 - second half] end | |
4218 | |
4219 ; mode 16 [row 14 - second half] | |
4220 pmaddubsw m1, m5, [r3 + 5 * 16] | |
4221 pmulhrsw m1, m3 | |
4222 packuswb m1, m1 | |
4223 movh [r0 + 238 * 16 + 8], m1 | |
4224 ; mode 16 [row 14 - second half] end | |
4225 | |
4226 ; mode 16 [row 3 - first half] | |
4227 pslldq m6, m5, 2 | |
4228 pinsrb m6, [r2 + 2], 1 | |
4229 pinsrb m6, [r2 + 3], 0 | |
4230 pmaddubsw m1, m6, [r3 + 12 * 16] | |
4231 pmulhrsw m1, m3 | |
4232 packuswb m1, m1 | |
4233 movh [r0 + 227 * 16], m1 | |
4234 | |
4235 ; mode 16 [row 15 - second half] | |
4236 pmaddubsw m1, m6, [r3 + 16 * 16] | |
4237 pmulhrsw m1, m3 | |
4238 packuswb m1, m1 | |
4239 movh [r0 + 239 * 16 + 8], m1 | |
4240 ; mode 16 [row 15 - second half] end | |
4241 | |
4242 ; mode 16 [row 4- first half] | |
4243 pslldq m6, 2 | |
4244 pinsrb m6, [r2 + 3], 1 | |
4245 pinsrb m6, [r2 + 5], 0 | |
4246 pmaddubsw m1, m6, [r3 + 23 * 16] | |
4247 pmulhrsw m1, m3 | |
4248 packuswb m1, m1 | |
4249 movh [r0 + 228 * 16], m1 | |
4250 | |
4251 ; mode 16 [row 5- first half] | |
4252 pmaddubsw m1, m6, [r3 + 2 * 16] | |
4253 pmulhrsw m1, m3 | |
4254 packuswb m1, m1 | |
4255 movh [r0 + 229 * 16], m1 | |
4256 | |
4257 ; mode 16 [row 6- first half] | |
4258 pslldq m6, 2 | |
4259 pinsrb m6, [r2 + 5], 1 | |
4260 pinsrb m6, [r2 + 6], 0 | |
4261 pmaddubsw m1, m6, [r3 + 13 * 16] | |
4262 pmulhrsw m1, m3 | |
4263 packuswb m1, m1 | |
4264 movh [r0 + 230 * 16], m1 | |
4265 | |
4266 ; mode 16 [row 7- first half] | |
4267 pslldq m6, 2 | |
4268 pinsrb m6, [r2 + 6], 1 | |
4269 pinsrb m6, [r2 + 8], 0 | |
4270 pmaddubsw m1, m6, [r3 + 24 * 16] | |
4271 pmulhrsw m1, m3 | |
4272 packuswb m1, m1 | |
4273 movh [r0 + 231 * 16], m1 | |
4274 | |
4275 ; mode 16 [row 8- first half] | |
4276 pmaddubsw m1, m6, [r3 + 3 * 16] | |
4277 pmulhrsw m1, m3 | |
4278 packuswb m1, m1 | |
4279 movh [r0 + 232 * 16], m1 | |
4280 ; mode 16 [row 8 - first half] end | |
4281 | |
4282 ; mode 16 [row 9- first half] | |
4283 pslldq m6, 2 | |
4284 pinsrb m6, [r2 + 8], 1 | |
4285 pinsrb m6, [r2 + 9], 0 | |
4286 pmaddubsw m1, m6, [r3 + 14 * 16] | |
4287 pmulhrsw m1, m3 | |
4288 packuswb m1, m1 | |
4289 movh [r0 + 233 * 16], m1 | |
4290 | |
4291 ; mode 16 [row 10 - first half] | |
4292 pslldq m6, 2 | |
4293 pinsrb m6, [r2 + 9], 1 | |
4294 pinsrb m6, [r2 + 11], 0 | |
4295 pmaddubsw m1, m6, [r3 + 25 * 16] | |
4296 pmulhrsw m1, m3 | |
4297 packuswb m1, m1 | |
4298 movh [r0 + 234 * 16], m1 | |
4299 | |
4300 ; mode 16 [row 11 - first half] | |
4301 pmaddubsw m1, m6, [r3 + 4 * 16] | |
4302 pmulhrsw m1, m3 | |
4303 packuswb m1, m1 | |
4304 movh [r0 + 235 * 16], m1 | |
4305 | |
4306 ; mode 16 [row 12 - first half] | |
4307 pslldq m6, 2 | |
4308 pinsrb m6, [r2 + 11], 1 | |
4309 pinsrb m6, [r2 + 12], 0 | |
4310 pmaddubsw m1, m6, [r3 + 15 * 16] | |
4311 pmulhrsw m1, m3 | |
4312 packuswb m1, m1 | |
4313 movh [r0 + 236 * 16], m1 | |
4314 | |
4315 ; mode 16 [row 13 - first half] | |
4316 pslldq m6, 2 | |
4317 pinsrb m6, [r2 + 12], 1 | |
4318 pinsrb m6, [r2 + 14], 0 | |
4319 pmaddubsw m1, m6, [r3 + 26 * 16] | |
4320 pmulhrsw m1, m3 | |
4321 packuswb m1, m1 | |
4322 movh [r0 + 237 * 16], m1 | |
4323 | |
4324 ; mode 16 [row 14 - first half] | |
4325 pmaddubsw m1, m6, [r3 + 5 * 16] | |
4326 pmulhrsw m1, m3 | |
4327 packuswb m1, m1 | |
4328 movh [r0 + 238 * 16], m1 | |
4329 | |
4330 ; mode 16 [row 15 - first half] | |
4331 pslldq m6, 2 | |
4332 pinsrb m6, [r2 + 14], 1 | |
4333 pinsrb m6, [r2 + 15], 0 | |
4334 pmaddubsw m1, m6, [r3 + 16 * 16] | |
4335 pmulhrsw m1, m3 | |
4336 packuswb m1, m1 | |
4337 movh [r0 + 239 * 16], m1 | |
4338 | |
4339 ; mode 14 [row 4] | |
4340 pslldq m5, 2 | |
4341 pinsrb m5, [r2 + 2], 1 | |
4342 pinsrb m5, [r2 + 5], 0 | |
4343 movu m4, [r2 + 6 + 32] | |
4344 psrldq m0, m4, 1 | |
4345 punpcklbw m4, m0 | |
4346 | |
4347 ; mode 16 [row 3 - second half] | |
4348 pmaddubsw m1, m4, [r3 + 12 * 16] | |
4349 pmulhrsw m1, m3 | |
4350 packuswb m1, m1 | |
4351 movh [r0 + 227 * 16 + 8], m1 | |
4352 | |
4353 ; mode 16 [row 3 - second half] end | |
4354 pmaddubsw m1, m5, [r3 + 31 * 16] | |
4355 pmulhrsw m1, m3 | |
4356 pmaddubsw m0, m4, [r3 + 31 * 16] | |
4357 pmulhrsw m0, m3 | |
4358 packuswb m1, m0 | |
4359 movu [r0 + 196 * 16], m1 | |
4360 | |
4361 ; mode 14 [row 5] | |
4362 pmaddubsw m1, m5, [r3 + 18 * 16] | |
4363 pmulhrsw m1, m3 | |
4364 pmaddubsw m0, m4, [r3 + 18 * 16] | |
4365 pmulhrsw m0, m3 | |
4366 packuswb m1, m0 | |
4367 movu [r0 + 197 * 16], m1 | |
4368 | |
4369 ; mode 14 [row 6] | |
4370 pmaddubsw m1, m5, [r3 + 5 * 16] | |
4371 pmulhrsw m1, m3 | |
4372 pmaddubsw m0, m4, [r3 + 5 * 16] | |
4373 pmulhrsw m0, m3 | |
4374 packuswb m1, m0 | |
4375 movu [r0 + 198 * 16], m1 | |
4376 | |
4377 ; mode 15 [row 3] | |
4378 movu m6, m5 | |
4379 pinsrb m6, [r2 + 4], 0 | |
4380 pmaddubsw m1, m6, [r3 + 28 * 16] | |
4381 pmulhrsw m1, m3 | |
4382 pmaddubsw m0, m4, [r3 + 28 * 16] | |
4383 pmulhrsw m0, m3 | |
4384 packuswb m1, m0 | |
4385 movu [r0 + 211 * 16], m1 | |
4386 | |
4387 ; mode 15 [row 4] | |
4388 pmaddubsw m1, m6, [r3 + 11 * 16] | |
4389 pmulhrsw m1, m3 | |
4390 pmaddubsw m0, m4, [r3 + 11 * 16] | |
4391 pmulhrsw m0, m3 | |
4392 packuswb m1, m0 | |
4393 movu [r0 + 212 * 16], m1 | |
4394 | |
4395 ; mode 15 [row 5 - first half] | |
4396 pslldq m6, 2 | |
4397 pinsrb m6, [r2 + 4], 1 | |
4398 pinsrb m6, [r2 + 6], 0 | |
4399 pmaddubsw m1, m6, [r3 + 26 * 16] | |
4400 pmulhrsw m1, m3 | |
4401 packuswb m1, m1 | |
4402 movh [r0 + 213 * 16], m1 | |
4403 | |
4404 ; mode 15 [row 6 - first half] | |
4405 pmaddubsw m1, m6, [r3 + 9 * 16] | |
4406 pmulhrsw m1, m3 | |
4407 packuswb m1, m1 | |
4408 movh [r0 + 214 * 16], m1 | |
4409 | |
4410 ; mode 15 [row 7 - first half] | |
4411 pslldq m6, 2 | |
4412 pinsrb m6, [r2 + 6], 1 | |
4413 pinsrb m6, [r2 + 8], 0 | |
4414 pmaddubsw m1, m6, [r3 + 24 * 16] | |
4415 pmulhrsw m1, m3 | |
4416 packuswb m1, m1 | |
4417 movh [r0 + 215 * 16], m1 | |
4418 | |
4419 ; mode 15 [row 8 - first half] | |
4420 pmaddubsw m1, m6, [r3 + 7 * 16] | |
4421 pmulhrsw m1, m3 | |
4422 packuswb m1, m1 | |
4423 movh [r0 + 216 * 16], m1 | |
4424 | |
4425 ; mode 15 [row 9 - first half] | |
4426 pslldq m6, 2 | |
4427 pinsrb m6, [r2 + 8], 1 | |
4428 pinsrb m6, [r2 + 9], 0 | |
4429 pmaddubsw m1, m6, [r3 + 22 * 16] | |
4430 pmulhrsw m1, m3 | |
4431 packuswb m1, m1 | |
4432 movh [r0 + 217 * 16], m1 | |
4433 | |
4434 ; mode 15 [row 10 - first half] | |
4435 pmaddubsw m1, m6, [r3 + 5 * 16] | |
4436 pmulhrsw m1, m3 | |
4437 packuswb m1, m1 | |
4438 movh [r0 + 218 * 16], m1 | |
4439 | |
4440 ; mode 15 [row 11 - first half] | |
4441 pslldq m6, 2 | |
4442 pinsrb m6, [r2 + 9], 1 | |
4443 pinsrb m6, [r2 + 11], 0 | |
4444 pmaddubsw m1, m6, [r3 + 20 * 16] | |
4445 pmulhrsw m1, m3 | |
4446 packuswb m1, m1 | |
4447 movh [r0 + 219 * 16], m1 | |
4448 | |
4449 ; mode 15 [row 12 - first half] | |
4450 pmaddubsw m1, m6, [r3 + 3 * 16] | |
4451 pmulhrsw m1, m3 | |
4452 packuswb m1, m1 | |
4453 movh [r0 + 220 * 16], m1 | |
4454 | |
4455 ; mode 15 [row 13 - first half] | |
4456 pslldq m6, 2 | |
4457 pinsrb m6, [r2 + 11], 1 | |
4458 pinsrb m6, [r2 + 13], 0 | |
4459 pmaddubsw m1, m6, [r3 + 18 * 16] | |
4460 pmulhrsw m1, m3 | |
4461 packuswb m1, m1 | |
4462 movh [r0 + 221 * 16], m1 | |
4463 | |
4464 ; mode 15 [row 14 - first half] | |
4465 pmaddubsw m1, m6, [r3 + 1 * 16] | |
4466 pmulhrsw m1, m3 | |
4467 packuswb m1, m1 | |
4468 movh [r0 + 222 * 16], m1 | |
4469 | |
4470 ; mode 15 [row 15 - first half] | |
4471 pslldq m6, 2 | |
4472 pinsrb m6, [r2 + 13], 1 | |
4473 pinsrb m6, [r2 + 15], 0 | |
4474 pmaddubsw m1, m6, [r3 + 16 * 16] | |
4475 pmulhrsw m1, m3 | |
4476 packuswb m1, m1 | |
4477 movh [r0 + 223 * 16], m1 | |
4478 | |
4479 ; mode 14 [row 7] | |
4480 pslldq m5, 2 | |
4481 pinsrb m5, [r2 + 5], 1 | |
4482 pinsrb m5, [r2 + 7], 0 | |
4483 movu m0, [r2 + 5 + 32] | |
4484 psrldq m6, m0, 1 | |
4485 punpcklbw m0, m6 | |
4486 | |
4487 ; mode 15 [row 5 - second half] | |
4488 pmaddubsw m1, m0, [r3 + 26 * 16] | |
4489 pmulhrsw m1, m3 | |
4490 packuswb m1, m1 | |
4491 movh [r0 + 213 * 16 + 8], m1 | |
4492 ; mode 15 [row 5 - second half] end | |
4493 | |
4494 ; mode 15 [row 6 - second half] | |
4495 pmaddubsw m1, m0, [r3 + 9 * 16] | |
4496 pmulhrsw m1, m3 | |
4497 packuswb m1, m1 | |
4498 movh [r0 + 214 * 16 + 8], m1 | |
4499 ; mode 15 [row 6 - second half] end | |
4500 | |
4501 ; mode 16 [row 4 - second half] | |
4502 pmaddubsw m1, m0, [r3 + 23 * 16] | |
4503 pmulhrsw m1, m3 | |
4504 packuswb m1, m1 | |
4505 movh [r0 + 228 * 16 + 8], m1 | |
4506 ; mode 16 [row 4 - second half] end | |
4507 | |
4508 ; mode 16 [row 5 - second half] | |
4509 pmaddubsw m1, m0, [r3 + 2 * 16] | |
4510 pmulhrsw m1, m3 | |
4511 packuswb m1, m1 | |
4512 movh [r0 + 229 * 16 + 8], m1 | |
4513 | |
4514 ; mode 16 [row 5 - second half] end | |
4515 pmaddubsw m1, m5, [r3 + 24 * 16] | |
4516 pmulhrsw m1, m3 | |
4517 pmaddubsw m6, m0, [r3 + 24 * 16] | |
4518 pmulhrsw m6, m3 | |
4519 packuswb m1, m6 | |
4520 movu [r0 + 199 * 16], m1 | |
4521 | |
4522 ; mode 14 [row 8] | |
4523 pmaddubsw m1, m5, [r3 + 11 * 16] | |
4524 pmulhrsw m1, m3 | |
4525 pmaddubsw m6, m0, [r3 + 11 * 16] | |
4526 pmulhrsw m6, m3 | |
4527 packuswb m1, m6 | |
4528 movu [r0 + 200 * 16], m1 | |
4529 | |
4530 ; mode 14 [row 9] | |
4531 pslldq m5, 2 | |
4532 pinsrb m5, [r2 + 7], 1 | |
4533 pinsrb m5, [r2 + 10], 0 | |
4534 movu m0, [r2 + 4 + 32] | |
4535 psrldq m6, m0, 1 | |
4536 punpcklbw m0, m6 | |
4537 | |
4538 ; mode 15 [row 7 - second half] | |
4539 pmaddubsw m1, m0, [r3 + 24 * 16] | |
4540 pmulhrsw m1, m3 | |
4541 packuswb m1, m1 | |
4542 movh [r0 + 215 * 16 + 8], m1 | |
4543 ; mode 15 [row 7 - second half] end | |
4544 | |
4545 ; mode 15 [row 8 - second half] | |
4546 pmaddubsw m1, m0, [r3 + 7 * 16] | |
4547 pmulhrsw m1, m3 | |
4548 packuswb m1, m1 | |
4549 movh [r0 + 216 * 16 + 8], m1 | |
4550 ; mode 15 [row 8 - second half] end | |
4551 | |
4552 ; mode 16 [row 6 - second half] | |
4553 pmaddubsw m1, m0, [r3 + 13 * 16] | |
4554 pmulhrsw m1, m3 | |
4555 packuswb m1, m1 | |
4556 movh [r0 + 230 * 16 + 8], m1 | |
4557 ; mode 16 [row 6 - second half] end | |
4558 | |
4559 ; mode 14 [row 9] - compute and store | |
4560 pmaddubsw m1, m5, [r3 + 30 * 16] | |
4561 pmulhrsw m1, m3 | |
4562 pmaddubsw m6, m0, [r3 + 30 * 16] | |
4563 pmulhrsw m6, m3 | |
4564 packuswb m1, m6 | |
4565 movu [r0 + 201 * 16], m1 | |
4566 | |
4567 ; mode 14 [row 10] | |
4568 pmaddubsw m1, m5, [r3 + 17 * 16] | |
4569 pmulhrsw m1, m3 | |
4570 pmaddubsw m6, m0, [r3 + 17 * 16] | |
4571 pmulhrsw m6, m3 | |
4572 packuswb m1, m6 | |
4573 movu [r0 + 202 * 16], m1 | |
4574 | |
4575 ; mode 14 [row 11] | |
4576 pmaddubsw m1, m5, [r3 + 4 * 16] | |
4577 pmulhrsw m1, m3 | |
4578 pmaddubsw m6, m0, [r3 + 4 * 16] | |
4579 pmulhrsw m6, m3 | |
4580 packuswb m1, m6 | |
4581 movu [r0 + 203 * 16], m1 | |
4582 | |
4583 ; mode 14 [row 12] | |
4584 pslldq m5, 2 | |
4585 pinsrb m5, [r2 + 10], 1 | |
4586 pinsrb m5, [r2 + 12], 0 | |
4587 movu m0, [r2 + 3 + 32] | |
4588 psrldq m6, m0, 1 | |
4589 punpcklbw m0, m6 | |
4590 | |
4591 ; mode 15 [row 9 - second half] | |
4592 pmaddubsw m1, m0, [r3 + 22 * 16] | |
4593 pmulhrsw m1, m3 | |
4594 packuswb m1, m1 | |
4595 movh [r0 + 217 * 16 + 8], m1 | |
4596 ; mode 15 [row 9 - second half] end | |
4597 | |
4598 ; mode 15 [row 10 - second half] | |
4599 pmaddubsw m1, m0, [r3 + 5 * 16] | |
4600 pmulhrsw m1, m3 | |
4601 packuswb m1, m1 | |
4602 movh [r0 + 218 * 16 + 8], m1 | |
4603 ; mode 15 [row 10 - second half] end | |
4604 | |
4605 ; mode 16 [row 7 - second half] | |
4606 pmaddubsw m1, m0, [r3 + 24 * 16] | |
4607 pmulhrsw m1, m3 | |
4608 packuswb m1, m1 | |
4609 movh [r0 + 231 * 16 + 8], m1 | |
4610 ; mode 16 [row 7 - second half] end | |
4611 | |
4612 ; mode 16 [row 8 - second half] | |
4613 pmaddubsw m1, m0, [r3 + 3 * 16] | |
4614 pmulhrsw m1, m3 | |
4615 packuswb m1, m1 | |
4616 movh [r0 + 232 * 16 + 8], m1 | |
4617 ; mode 16 [row 8 - second half] end | |
4618 | |
4619 pmaddubsw m1, m5, [r3 + 23 * 16] | |
4620 pmulhrsw m1, m3 | |
4621 pmaddubsw m6, m0, [r3 + 23 * 16] | |
4622 pmulhrsw m6, m3 | |
4623 packuswb m1, m6 | |
4624 movu [r0 + 204 * 16], m1 | |
4625 | |
4626 ; mode 14 [row 13] | |
4627 pmaddubsw m1, m5, [r3 + 10 * 16] | |
4628 pmulhrsw m1, m3 | |
4629 pmaddubsw m6, m0, [r3 + 10 * 16] | |
4630 pmulhrsw m6, m3 | |
4631 packuswb m1, m6 | |
4632 movu [r0 + 205 * 16], m1 | |
4633 | |
4634 ; mode 14 [row 14] | |
4635 pslldq m5, 2 | |
4636 pinsrb m5, [r2 + 12], 1 | |
4637 pinsrb m5, [r2 + 15], 0 | |
4638 movu m0, [r2 + 2 + 32] | |
4639 psrldq m6, m0, 1 | |
4640 punpcklbw m0, m6 | |
4641 | |
4642 ; mode 15 [row 11 - second half] | |
4643 pmaddubsw m1, m0, [r3 + 20 * 16] | |
4644 pmulhrsw m1, m3 | |
4645 packuswb m1, m1 | |
4646 movh [r0 + 219 * 16 + 8], m1 | |
4647 ; mode 15 [row 11 - second half] end | |
4648 | |
4649 ; mode 15 [row 12 - second half] | |
4650 pmaddubsw m1, m0, [r3 + 3 * 16] | |
4651 pmulhrsw m1, m3 | |
4652 packuswb m1, m1 | |
4653 movh [r0 + 220 * 16 + 8], m1 | |
4654 ; mode 15 [row 12 - second half] end | |
4655 | |
4656 ; mode 16 [row 9 - second half] | |
4657 pmaddubsw m1, m0, [r3 + 14 * 16] | |
4658 pmulhrsw m1, m3 | |
4659 packuswb m1, m1 | |
4660 movh [r0 + 233 * 16 + 8], m1 | |
4661 | |
4662 ; mode 16 [row 9 - second half] end | |
4663 pmaddubsw m1, m5, [r3 + 29 * 16] | |
4664 pmulhrsw m1, m3 | |
4665 pmaddubsw m6, m0, [r3 + 29 * 16] | |
4666 pmulhrsw m6, m3 | |
4667 packuswb m1, m6 | |
4668 movu [r0 + 206 * 16], m1 | |
4669 | |
4670 ; mode 14 [row 15] | |
4671 pmaddubsw m1, m5, [r3 + 16 * 16] | |
4672 pmulhrsw m1, m3 | |
4673 pmaddubsw m6, m0, [r3 + 16 * 16] | |
4674 pmulhrsw m6, m3 | |
4675 packuswb m1, m6 | |
4676 movu [r0 + 207 * 16], m1 | |
4677 | |
4678 ; mode 12 [row 12] | |
4679 pslldq m0, m2, 2 | |
4680 pinsrb m0, [r2 + 6], 1 | |
4681 pinsrb m0, [r2 + 13], 0 | |
4682 pmaddubsw m1, m0, [r3 + 31 * 16] | |
4683 pmulhrsw m1, m3 | |
4684 pmaddubsw m5, m4, [r3 + 31 * 16] | |
4685 pmulhrsw m5, m3 | |
4686 packuswb m1, m5 | |
4687 movu [r0 + 172 * 16], m1 | |
4688 | |
4689 ; mode 12 [row 13] | |
4690 pmaddubsw m1, m0, [r3 + 26 * 16] | |
4691 pmulhrsw m1, m3 | |
4692 pmaddubsw m5, m4, [r3 + 26 * 16] | |
4693 pmulhrsw m5, m3 | |
4694 packuswb m1, m5 | |
4695 movu [r0 + 173 * 16], m1 | |
4696 | |
4697 ; mode 12 [row 14] | |
4698 pmaddubsw m1, m0, [r3 + 21 * 16] | |
4699 pmulhrsw m1, m3 | |
4700 pmaddubsw m5, m4, [r3 + 21 * 16] | |
4701 pmulhrsw m5, m3 | |
4702 packuswb m1, m5 | |
4703 movu [r0 + 174 * 16], m1 | |
4704 | |
4705 ; mode 12 [row 15] | |
4706 pmaddubsw m1, m0, [r3 + 16 * 16] | |
4707 pmulhrsw m1, m3 | |
4708 pmaddubsw m5, m4, [r3 + 16 * 16] | |
4709 pmulhrsw m5, m3 | |
4710 packuswb m1, m5 | |
4711 movu [r0 + 175 * 16], m1 | |
4712 | |
4713 ; mode 13 [row 7] | |
4714 pslldq m7, 2 | |
4715 pinsrb m7, [r2 + 4], 1 | |
4716 pinsrb m7, [r2 + 7], 0 | |
4717 pmaddubsw m1, m7, [r3 + 24 * 16] | |
4718 pmulhrsw m1, m3 | |
4719 pmaddubsw m5, m4, [r3 + 24 * 16] | |
4720 pmulhrsw m5, m3 | |
4721 packuswb m1, m5 | |
4722 movu [r0 + 183 * 16], m1 | |
4723 | |
4724 ; mode 13 [row 8] | |
4725 pmaddubsw m1, m7, [r3 + 15 * 16] | |
4726 pmulhrsw m1, m3 | |
4727 pmaddubsw m5, m4, [r3 + 15 * 16] | |
4728 pmulhrsw m5, m3 | |
4729 packuswb m1, m5 | |
4730 movu [r0 + 184 * 16], m1 | |
4731 | |
4732 ; mode 13 [row 9] | |
4733 pmaddubsw m1, m7, [r3 + 6 * 16] | |
4734 pmulhrsw m1, m3 | |
4735 pmaddubsw m5, m4, [r3 + 6 * 16] | |
4736 pmulhrsw m5, m3 | |
4737 packuswb m1, m5 | |
4738 movu [r0 + 185 * 16], m1 | |
4739 | |
4740 ; mode 13 [row 10] | |
4741 pslldq m7, 2 | |
4742 pinsrb m7, [r2 + 7], 1 | |
4743 pinsrb m7, [r2 + 11], 0 | |
4744 pmaddubsw m1, m7, [r3 + 29 * 16] | |
4745 pmulhrsw m1, m3 | |
4746 movu m4, [r2 + 5 + 32] | |
4747 psrldq m5, m4, 1 | |
4748 punpcklbw m4, m5 | |
4749 pmaddubsw m5, m4, [r3 + 29 * 16] | |
4750 pmulhrsw m5, m3 | |
4751 packuswb m1, m5 | |
4752 movu [r0 + 186 * 16], m1 | |
4753 | |
4754 ; mode 13 [row 11] | |
4755 pmaddubsw m1, m7, [r3 + 20 * 16] | |
4756 pmulhrsw m1, m3 | |
4757 pmaddubsw m5, m4, [r3 + 20 * 16] | |
4758 pmulhrsw m5, m3 | |
4759 packuswb m1, m5 | |
4760 movu [r0 + 187 * 16], m1 | |
4761 | |
4762 ; mode 13 [row 12] | |
4763 pmaddubsw m1, m7, [r3 + 11 * 16] | |
4764 pmulhrsw m1, m3 | |
4765 pmaddubsw m5, m4, [r3 + 11 * 16] | |
4766 pmulhrsw m5, m3 | |
4767 packuswb m1, m5 | |
4768 movu [r0 + 188 * 16], m1 | |
4769 | |
4770 ; mode 13 [row 13] | |
4771 pmaddubsw m1, m7, [r3 + 2 * 16] | |
4772 pmulhrsw m1, m3 | |
4773 pmaddubsw m5, m4, [r3 + 2 * 16] | |
4774 pmulhrsw m5, m3 | |
4775 packuswb m1, m5 | |
4776 movu [r0 + 189 * 16], m1 | |
4777 | |
4778 ; mode 13 [row 14] | |
4779 pslldq m7, 2 | |
4780 pinsrb m7, [r2 + 11], 1 | |
4781 pinsrb m7, [r2 + 14], 0 | |
4782 pmaddubsw m1, m7, [r3 + 25 * 16] | |
4783 pmulhrsw m1, m3 | |
4784 movu m4, [r2 + 4 + 32] | |
4785 psrldq m5, m4, 1 | |
4786 punpcklbw m4, m5 | |
4787 pmaddubsw m5, m4, [r3 + 25 * 16] | |
4788 pmulhrsw m5, m3 | |
4789 packuswb m1, m5 | |
4790 movu [r0 + 190 * 16], m1 | |
4791 | |
4792 ; mode 13 [row 15] | |
4793 pmaddubsw m1, m7, [r3 + 16 * 16] | |
4794 pmulhrsw m1, m3 | |
4795 pmaddubsw m5, m4, [r3 + 16 * 16] | |
4796 pmulhrsw m5, m3 | |
4797 packuswb m1, m5 | |
4798 movu [r0 + 191 * 16], m1 | |
4799 | |
4800 ; mode 17 [row 15] | |
4801 movu m0, [r2] | |
4802 pshufb m1, m0, [tab_S1] | |
4803 movu [r0 + 255 * 16], m1 | |
4804 movu m2, [r2 + 32] | |
4805 pinsrb m2, [r2], 0 | |
4806 movd [r0 + 255 * 16 + 12], m2 | |
4807 | |
4808 ; mode 18 [row 0] | |
4809 movu [r0 + 256 * 16], m0 | |
4810 | |
4811 ; mode 18 [rows 1 - 15] | |
4812 pslldq m4, m0, 1 | |
4813 pinsrb m4, [r2 + 1 + 32], 0 | |
4814 movu [r0 + 257 * 16], m4 | |
4815 pslldq m4, 1 | |
4816 pinsrb m4, [r2 + 2 + 32], 0 | |
4817 movu [r0 + 258 * 16], m4 | |
4818 pslldq m4, 1 | |
4819 pinsrb m4, [r2 + 3 + 32], 0 | |
4820 movu [r0 + 259 * 16], m4 | |
4821 pslldq m4, 1 | |
4822 pinsrb m4, [r2 + 4 + 32], 0 | |
4823 movu [r0 + 260 * 16], m4 | |
4824 pslldq m4, 1 | |
4825 pinsrb m4, [r2 + 5 + 32], 0 | |
4826 movu [r0 + 261 * 16], m4 | |
4827 pslldq m4, 1 | |
4828 pinsrb m4, [r2 + 6 + 32], 0 | |
4829 movu [r0 + 262 * 16], m4 | |
4830 pslldq m4, 1 | |
4831 pinsrb m4, [r2 + 7 + 32], 0 | |
4832 movu [r0 + 263 * 16], m4 | |
4833 pslldq m4, 1 | |
4834 pinsrb m4, [r2 + 8 + 32], 0 | |
4835 movu [r0 + 264 * 16], m4 | |
4836 pslldq m4, 1 | |
4837 pinsrb m4, [r2 + 9 + 32], 0 | |
4838 movu [r0 + 265 * 16], m4 | |
4839 pslldq m4, 1 | |
4840 pinsrb m4, [r2 + 10 + 32], 0 | |
4841 movu [r0 + 266 * 16], m4 | |
4842 pslldq m4, 1 | |
4843 pinsrb m4, [r2 + 11 + 32], 0 | |
4844 movu [r0 + 267 * 16], m4 | |
4845 pslldq m4, 1 | |
4846 pinsrb m4, [r2 + 12 + 32], 0 | |
4847 movu [r0 + 268 * 16], m4 | |
4848 pslldq m4, 1 | |
4849 pinsrb m4, [r2 + 13 + 32], 0 | |
4850 movu [r0 + 269 * 16], m4 | |
4851 pslldq m4, 1 | |
4852 pinsrb m4, [r2 + 14 + 32], 0 | |
4853 movu [r0 + 270 * 16], m4 | |
4854 pslldq m4, 1 | |
4855 pinsrb m4, [r2 + 15 + 32], 0 | |
4856 movu [r0 + 271 * 16], m4 | |
4857 | |
4858 ; mode 19 [row 0] | |
4859 psrldq m2, m0, 1 | |
4860 punpcklbw m0, m2 | |
4861 movu m5, [r2 + 8] | |
4862 psrldq m6, m5, 1 | |
4863 punpcklbw m5, m6 | |
4864 pmaddubsw m4, m0, [r3 + 6 * 16] | |
4865 pmulhrsw m4, m3 | |
4866 pmaddubsw m6, m5, [r3 + 6 * 16] | |
4867 pmulhrsw m6, m3 | |
4868 packuswb m4, m6 | |
4869 movu [r0 + 272 * 16], m4 | |
4870 | |
4871 ; mode 20 [row 0] | |
4872 pmaddubsw m4, m0, [r3 + 11 * 16] | |
4873 pmulhrsw m4, m3 | |
4874 pmaddubsw m6, m5, [r3 + 11 * 16] | |
4875 pmulhrsw m6, m3 | |
4876 packuswb m4, m6 | |
4877 movu [r0 + 288 * 16], m4 | |
4878 | |
4879 ; mode 21 [row 0] | |
4880 pmaddubsw m4, m0, [r3 + 15 * 16] | |
4881 pmulhrsw m4, m3 | |
4882 pmaddubsw m6, m5, [r3 + 15 * 16] | |
4883 pmulhrsw m6, m3 | |
4884 packuswb m4, m6 | |
4885 movu [r0 + 304 * 16], m4 | |
4886 | |
4887 ; mode 22 [row 0] | |
4888 pmaddubsw m4, m0, [r3 + 19 * 16] | |
4889 pmulhrsw m4, m3 | |
4890 pmaddubsw m6, m5, [r3 + 19 * 16] | |
4891 pmulhrsw m6, m3 | |
4892 packuswb m4, m6 | |
4893 movu [r0 + 320 * 16], m4 | |
4894 | |
4895 ; mode 22 [row 1] | |
4896 pmaddubsw m4, m0, [r3 + 6 * 16] | |
4897 pmulhrsw m4, m3 | |
4898 pmaddubsw m6, m5, [r3 + 6 * 16] | |
4899 pmulhrsw m6, m3 | |
4900 packuswb m4, m6 | |
4901 movu [r0 + 321 * 16], m4 | |
4902 | |
4903 ; mode 23 [row 0] | |
4904 pmaddubsw m4, m0, [r3 + 23 * 16] | |
4905 pmulhrsw m4, m3 | |
4906 pmaddubsw m6, m5, [r3 + 23 * 16] | |
4907 pmulhrsw m6, m3 | |
4908 packuswb m4, m6 | |
4909 movu [r0 + 336 * 16], m4 | |
4910 | |
4911 ; mode 23 [row 1] | |
4912 pmaddubsw m4, m0, [r3 + 14 * 16] | |
4913 pmulhrsw m4, m3 | |
4914 pmaddubsw m6, m5, [r3 + 14 * 16] | |
4915 pmulhrsw m6, m3 | |
4916 packuswb m4, m6 | |
4917 movu [r0 + 337 * 16], m4 | |
4918 | |
4919 ; mode 23 [row 2] | |
4920 pmaddubsw m4, m0, [r3 + 5 * 16] | |
4921 pmulhrsw m4, m3 | |
4922 pmaddubsw m6, m5, [r3 + 5 * 16] | |
4923 pmulhrsw m6, m3 | |
4924 packuswb m4, m6 | |
4925 movu [r0 + 338 * 16], m4 | |
4926 | |
4927 ; mode 24 [row 0] | |
4928 pmaddubsw m4, m0, [r3 + 27 * 16] | |
4929 pmulhrsw m4, m3 | |
4930 pmaddubsw m6, m5, [r3 + 27 * 16] | |
4931 pmulhrsw m6, m3 | |
4932 packuswb m4, m6 | |
4933 movu [r0 + 352 * 16], m4 | |
4934 | |
4935 ; mode 24 [row 1] | |
4936 pmaddubsw m4, m0, [r3 + 22 * 16] | |
4937 pmulhrsw m4, m3 | |
4938 pmaddubsw m6, m5, [r3 + 22 * 16] | |
4939 pmulhrsw m6, m3 | |
4940 packuswb m4, m6 | |
4941 movu [r0 + 353 * 16], m4 | |
4942 | |
4943 ; mode 24 [row 2] | |
4944 pmaddubsw m4, m0, [r3 + 17 * 16] | |
4945 pmulhrsw m4, m3 | |
4946 pmaddubsw m6, m5, [r3 + 17 * 16] | |
4947 pmulhrsw m6, m3 | |
4948 packuswb m4, m6 | |
4949 movu [r0 + 354 * 16], m4 | |
4950 | |
4951 ; mode 24 [row 3] | |
4952 pmaddubsw m4, m0, [r3 + 12 * 16] | |
4953 pmulhrsw m4, m3 | |
4954 pmaddubsw m6, m5, [r3 + 12 * 16] | |
4955 pmulhrsw m6, m3 | |
4956 packuswb m4, m6 | |
4957 movu [r0 + 355 * 16], m4 | |
4958 | |
4959 ; mode 24 [row 4] | |
4960 pmaddubsw m4, m0, [r3 + 7 * 16] | |
4961 pmulhrsw m4, m3 | |
4962 pmaddubsw m6, m5, [r3 + 7 * 16] | |
4963 pmulhrsw m6, m3 | |
4964 packuswb m4, m6 | |
4965 movu [r0 + 356 * 16], m4 | |
4966 | |
4967 ; mode 24 [row 5] | |
4968 pmaddubsw m4, m0, [r3 + 2 * 16] | |
4969 pmulhrsw m4, m3 | |
4970 pmaddubsw m6, m5, [r3 + 2 * 16] | |
4971 pmulhrsw m6, m3 | |
4972 packuswb m4, m6 | |
4973 movu [r0 + 357 * 16], m4 | |
4974 | |
4975 ; mode 24 [row 6 - first half] | |
4976 pslldq m7, m0, 2 | |
4977 pinsrb m7, [r2 + 0], 1 | |
4978 pinsrb m7, [r2 + 6 + 32], 0 | |
4979 pmaddubsw m4, m7, [r3 + 29 * 16] | |
4980 pmulhrsw m4, m3 | |
4981 packuswb m4, m4 | |
4982 movh [r0 + 358 * 16], m4 | |
4983 | |
4984 ; mode 24 [row 7 - first half] | |
4985 pmaddubsw m4, m7, [r3 + 24 * 16] | |
4986 pmulhrsw m4, m3 | |
4987 packuswb m4, m4 | |
4988 movh [r0 + 359 * 16], m4 | |
4989 | |
4990 ; mode 24 [row 8 - first half] | |
4991 pmaddubsw m4, m7, [r3 + 19 * 16] | |
4992 pmulhrsw m4, m3 | |
4993 packuswb m4, m4 | |
4994 movh [r0 + 360 * 16], m4 | |
4995 | |
4996 ; mode 24 [row 9 - first half] | |
4997 pmaddubsw m4, m7, [r3 + 14 * 16] | |
4998 pmulhrsw m4, m3 | |
4999 packuswb m4, m4 | |
5000 movh [r0 + 361 * 16], m4 | |
5001 | |
5002 ; mode 24 [row 10 - first half] | |
5003 pmaddubsw m4, m7, [r3 + 9 * 16] | |
5004 pmulhrsw m4, m3 | |
5005 packuswb m4, m4 | |
5006 movh [r0 + 362 * 16], m4 | |
5007 | |
5008 ; mode 24 [row 11 - first half] | |
5009 pmaddubsw m4, m7, [r3 + 4 * 16] | |
5010 pmulhrsw m4, m3 | |
5011 packuswb m4, m4 | |
5012 movh [r0 + 363 * 16], m4 | |
5013 | |
5014 ; mode 24 [row 12 - first half] | |
5015 pslldq m7, 2 | |
5016 pinsrb m7, [r2 + 6 + 32], 1 | |
5017 pinsrb m7, [r2 + 13 + 32], 0 | |
5018 pmaddubsw m4, m7, [r3 + 31 * 16] | |
5019 pmulhrsw m4, m3 | |
5020 packuswb m4, m4 | |
5021 movh [r0 + 364 * 16], m4 | |
5022 | |
5023 ; mode 24 [row 13 - first half] | |
5024 pmaddubsw m4, m7, [r3 + 26 * 16] | |
5025 pmulhrsw m4, m3 | |
5026 packuswb m4, m4 | |
5027 movh [r0 + 365 * 16], m4 | |
5028 | |
5029 ; mode 24 [row 14 - first half] | |
5030 pmaddubsw m4, m7, [r3 + 21 * 16] | |
5031 pmulhrsw m4, m3 | |
5032 packuswb m4, m4 | |
5033 movh [r0 + 366 * 16], m4 | |
5034 | |
5035 ; mode 24 [row 15 - first half] | |
5036 pmaddubsw m4, m7, [r3 + 16 * 16] | |
5037 pmulhrsw m4, m3 | |
5038 packuswb m4, m4 | |
5039 movh [r0 + 367 * 16], m4 | |
5040 | |
5041 ; mode 23 [row 3 - first half] | |
5042 pslldq m7, m0, 2 | |
5043 pinsrb m7, [r2 + 0], 1 | |
5044 pinsrb m7, [r2 + 4 + 32], 0 | |
5045 pmaddubsw m4, m7, [r3 + 28 * 16] | |
5046 pmulhrsw m4, m3 | |
5047 packuswb m4, m4 | |
5048 movh [r0 + 339 * 16], m4 | |
5049 | |
5050 ; mode 23 [row 4 - first half] | |
5051 pmaddubsw m4, m7, [r3 + 19 * 16] | |
5052 pmulhrsw m4, m3 | |
5053 packuswb m4, m4 | |
5054 movh [r0 + 340 * 16], m4 | |
5055 | |
5056 ; mode 23 [row 5 - first half] | |
5057 pmaddubsw m4, m7, [r3 + 10 * 16] | |
5058 pmulhrsw m4, m3 | |
5059 packuswb m4, m4 | |
5060 movh [r0 + 341 * 16], m4 | |
5061 | |
5062 ; mode 23 [row 6 - first half] | |
5063 pmaddubsw m4, m7, [r3 + 1 * 16] | |
5064 pmulhrsw m4, m3 | |
5065 packuswb m4, m4 | |
5066 movh [r0 + 342 * 16], m4 | |
5067 | |
5068 ; mode 23 [row 7 - first half] | |
5069 pslldq m7, 2 | |
5070 pinsrb m7, [r2 + 4 + 32], 1 | |
5071 pinsrb m7, [r2 + 7 + 32], 0 | |
5072 pmaddubsw m4, m7, [r3 + 24 * 16] | |
5073 pmulhrsw m4, m3 | |
5074 packuswb m4, m4 | |
5075 movh [r0 + 343 * 16], m4 | |
5076 | |
5077 ; mode 23 [row 8 - first half] | |
5078 pmaddubsw m4, m7, [r3 + 15 * 16] | |
5079 pmulhrsw m4, m3 | |
5080 packuswb m4, m4 | |
5081 movh [r0 + 344 * 16], m4 | |
5082 | |
5083 ; mode 23 [row 9 - first half] | |
5084 pmaddubsw m4, m7, [r3 + 6 * 16] | |
5085 pmulhrsw m4, m3 | |
5086 packuswb m4, m4 | |
5087 movh [r0 + 345 * 16], m4 | |
5088 | |
5089 ; mode 23 [row 10 - first half] | |
5090 pslldq m7, 2 | |
5091 pinsrb m7, [r2 + 7 + 32], 1 | |
5092 pinsrb m7, [r2 + 11 + 32], 0 | |
5093 pmaddubsw m4, m7, [r3 + 29 * 16] | |
5094 pmulhrsw m4, m3 | |
5095 packuswb m4, m4 | |
5096 movh [r0 + 346 * 16], m4 | |
5097 | |
5098 ; mode 23 [row 11 - first half] | |
5099 pmaddubsw m4, m7, [r3 + 20 * 16] | |
5100 pmulhrsw m4, m3 | |
5101 packuswb m4, m4 | |
5102 movh [r0 + 347 * 16], m4 | |
5103 | |
5104 ; mode 23 [row 12 - first half] | |
5105 pmaddubsw m4, m7, [r3 + 11 * 16] | |
5106 pmulhrsw m4, m3 | |
5107 packuswb m4, m4 | |
5108 movh [r0 + 348 * 16], m4 | |
5109 | |
5110 ; mode 23 [row 13 - first half] | |
5111 pmaddubsw m4, m7, [r3 + 2 * 16] | |
5112 pmulhrsw m4, m3 | |
5113 packuswb m4, m4 | |
5114 movh [r0 + 349 * 16], m4 | |
5115 | |
5116 ; mode 23 [row 14 - first half] | |
5117 pslldq m7, 2 | |
5118 pinsrb m7, [r2 + 11 + 32], 1 | |
5119 pinsrb m7, [r2 + 14 + 32], 0 | |
5120 pmaddubsw m4, m7, [r3 + 25 * 16] | |
5121 pmulhrsw m4, m3 | |
5122 packuswb m4, m4 | |
5123 movh [r0 + 350 * 16], m4 | |
5124 | |
5125 ; mode 23 [row 15 - first half] | |
5126 pmaddubsw m4, m7, [r3 + 16 * 16] | |
5127 pmulhrsw m4, m3 | |
5128 packuswb m4, m4 | |
5129 movh [r0 + 351 * 16], m4 | |
5130 | |
5131 ; mode 21 [row 15 - second half] | |
5132 pmaddubsw m4, m0, [r3 + 16 * 16] | |
5133 pmulhrsw m4, m3 | |
5134 packuswb m4, m4 | |
5135 movh [r0 + 319 * 16 + 8], m4 | |
5136 ; mode 21 [row 15 - second half] end | |
5137 | |
5138 ; mode 20 [row 1 - first half] | |
5139 pslldq m7, m0, 2 | |
5140 pinsrb m7, [r2 + 0], 1 | |
5141 pinsrb m7, [r2 + 2 + 32], 0 | |
5142 pmaddubsw m4, m7, [r3 + 22 * 16] | |
5143 pmulhrsw m4, m3 | |
5144 packuswb m4, m4 | |
5145 movh [r0 + 289 * 16], m4 | |
5146 | |
5147 ; mode 20 [row 2 - first half] | |
5148 pmaddubsw m4, m7, [r3 + 1 * 16] | |
5149 pmulhrsw m4, m3 | |
5150 packuswb m4, m4 | |
5151 movh [r0 + 290 * 16], m4 | |
5152 | |
5153 ; mode 21 [row 1 - first half] | |
5154 pmaddubsw m4, m7, [r3 + 30 * 16] | |
5155 pmulhrsw m4, m3 | |
5156 packuswb m4, m4 | |
5157 movh [r0 + 305 * 16], m4 | |
5158 | |
5159 ; mode 21 [row 2 - first half] | |
5160 pmaddubsw m4, m7, [r3 + 13 * 16] | |
5161 pmulhrsw m4, m3 | |
5162 packuswb m4, m4 | |
5163 movh [r0 + 306 * 16], m4 | |
5164 | |
5165 ; mode 22 [row 2 - first half] | |
5166 pmaddubsw m4, m7, [r3 + 25 * 16] | |
5167 pmulhrsw m4, m3 | |
5168 packuswb m4, m4 | |
5169 movh [r0 + 322 * 16], m4 | |
5170 | |
5171 ; mode 22 [row 3 - first half] | |
5172 pmaddubsw m4, m7, [r3 + 12 * 16] | |
5173 pmulhrsw m4, m3 | |
5174 packuswb m4, m4 | |
5175 movh [r0 + 323 * 16], m4 | |
5176 | |
5177 ; mode 22 [row 4 - first half] | |
5178 pslldq m1, m7, 2 | |
5179 pinsrb m1, [r2 + 2 + 32], 1 | |
5180 pinsrb m1, [r2 + 5 + 32], 0 | |
5181 pmaddubsw m4, m1, [r3 + 31 * 16] | |
5182 pmulhrsw m4, m3 | |
5183 packuswb m4, m4 | |
5184 movh [r0 + 324 * 16], m4 | |
5185 | |
5186 ; mode 22 [row 5 - first half] | |
5187 pmaddubsw m4, m1, [r3 + 18 * 16] | |
5188 pmulhrsw m4, m3 | |
5189 packuswb m4, m4 | |
5190 movh [r0 + 325 * 16], m4 | |
5191 | |
5192 ; mode 22 [row 6 - first half] | |
5193 pmaddubsw m4, m1, [r3 + 5 * 16] | |
5194 pmulhrsw m4, m3 | |
5195 packuswb m4, m4 | |
5196 movh [r0 + 326 * 16], m4 | |
5197 | |
5198 ; mode 22 [row 7 - first half] | |
5199 pslldq m1, 2 | |
5200 pinsrb m1, [r2 + 5 + 32], 1 | |
5201 pinsrb m1, [r2 + 7 + 32], 0 | |
5202 pmaddubsw m4, m1, [r3 + 24 * 16] | |
5203 pmulhrsw m4, m3 | |
5204 packuswb m4, m4 | |
5205 movh [r0 + 327 * 16], m4 | |
5206 | |
5207 ; mode 22 [row 8 - first half] | |
5208 pmaddubsw m4, m1, [r3 + 11 * 16] | |
5209 pmulhrsw m4, m3 | |
5210 packuswb m4, m4 | |
5211 movh [r0 + 328 * 16], m4 | |
5212 | |
5213 ; mode 22 [row 9 - first half] | |
5214 pslldq m1, 2 | |
5215 pinsrb m1, [r2 + 7 + 32], 1 | |
5216 pinsrb m1, [r2 + 10 + 32], 0 | |
5217 pmaddubsw m4, m1, [r3 + 30 * 16] | |
5218 pmulhrsw m4, m3 | |
5219 packuswb m4, m4 | |
5220 movh [r0 + 329 * 16], m4 | |
5221 | |
5222 ; mode 22 [row 10 - first half] | |
5223 pmaddubsw m4, m1, [r3 + 17 * 16] | |
5224 pmulhrsw m4, m3 | |
5225 packuswb m4, m4 | |
5226 movh [r0 + 330 * 16], m4 | |
5227 | |
5228 ; mode 22 [row 11 - first half] | |
5229 pmaddubsw m4, m1, [r3 + 4 * 16] | |
5230 pmulhrsw m4, m3 | |
5231 packuswb m4, m4 | |
5232 movh [r0 + 331 * 16], m4 | |
5233 | |
5234 ; mode 22 [row 12 - first half] | |
5235 pslldq m1, 2 | |
5236 pinsrb m1, [r2 + 10 + 32], 1 | |
5237 pinsrb m1, [r2 + 12 + 32], 0 | |
5238 pmaddubsw m4, m1, [r3 + 23 * 16] | |
5239 pmulhrsw m4, m3 | |
5240 packuswb m4, m4 | |
5241 movh [r0 + 332 * 16], m4 | |
5242 | |
5243 ; mode 22 [row 13 - first half] | |
5244 pmaddubsw m4, m1, [r3 + 10 * 16] | |
5245 pmulhrsw m4, m3 | |
5246 packuswb m4, m4 | |
5247 movh [r0 + 333 * 16], m4 | |
5248 | |
5249 ; mode 22 [row 14 - first half] | |
5250 pslldq m1, 2 | |
5251 pinsrb m1, [r2 + 12 + 32], 1 | |
5252 pinsrb m1, [r2 + 15 + 32], 0 | |
5253 pmaddubsw m4, m1, [r3 + 29 * 16] | |
5254 pmulhrsw m4, m3 | |
5255 packuswb m4, m4 | |
5256 movh [r0 + 334 * 16], m4 | |
5257 | |
5258 ; mode 22 [row 15 - first half] | |
5259 pmaddubsw m4, m1, [r3 + 16 * 16] | |
5260 pmulhrsw m4, m3 | |
5261 packuswb m4, m4 | |
5262 movh [r0 + 335 * 16], m4 | |
5263 | |
5264 ; mode 21 [row 3 - first half] | |
5265 pslldq m6, m7, 2 | |
5266 pinsrb m6, [r2 + 2 + 32], 1 | |
5267 pinsrb m6, [r2 + 4 + 32], 0 | |
5268 pmaddubsw m4, m6, [r3 + 28 * 16] | |
5269 pmulhrsw m4, m3 | |
5270 packuswb m4, m4 | |
5271 movh [r0 + 307 * 16], m4 | |
5272 | |
5273 ; mode 21 [row 4 - first half] | |
5274 pmaddubsw m4, m6, [r3 + 11 * 16] | |
5275 pmulhrsw m4, m3 | |
5276 packuswb m4, m4 | |
5277 movh [r0 + 308 * 16], m4 | |
5278 | |
5279 ; mode 21 [row 5 - first half] | |
5280 pslldq m6, 2 | |
5281 pinsrb m6, [r2 + 4 + 32], 1 | |
5282 pinsrb m6, [r2 + 6 + 32], 0 | |
5283 pmaddubsw m4, m6, [r3 + 26 * 16] | |
5284 pmulhrsw m4, m3 | |
5285 packuswb m4, m4 | |
5286 movh [r0 + 309 * 16], m4 | |
5287 | |
5288 ; mode 21 [row 6 - first half] | |
5289 pmaddubsw m4, m6, [r3 + 9 * 16] | |
5290 pmulhrsw m4, m3 | |
5291 packuswb m4, m4 | |
5292 movh [r0 + 310 * 16], m4 | |
5293 | |
5294 ; mode 21 [row 7 - first half] | |
5295 pslldq m6, 2 | |
5296 pinsrb m6, [r2 + 6 + 32], 1 | |
5297 pinsrb m6, [r2 + 8 + 32], 0 | |
5298 pmaddubsw m4, m6, [r3 + 24 * 16] | |
5299 pmulhrsw m4, m3 | |
5300 packuswb m4, m4 | |
5301 movh [r0 + 311 * 16], m4 | |
5302 | |
5303 ; mode 21 [row 8 - first half] | |
5304 pmaddubsw m4, m6, [r3 + 7 * 16] | |
5305 pmulhrsw m4, m3 | |
5306 packuswb m4, m4 | |
5307 movh [r0 + 312 * 16], m4 | |
5308 | |
5309 ; mode 21 [row 9 - first half] | |
5310 pslldq m6, 2 | |
5311 pinsrb m6, [r2 + 8 + 32], 1 | |
5312 pinsrb m6, [r2 + 9 + 32], 0 | |
5313 pmaddubsw m4, m6, [r3 + 22 * 16] | |
5314 pmulhrsw m4, m3 | |
5315 packuswb m4, m4 | |
5316 movh [r0 + 313 * 16], m4 | |
5317 | |
5318 ; mode 21 [row 10 - first half] | |
5319 pmaddubsw m4, m6, [r3 + 5 * 16] | |
5320 pmulhrsw m4, m3 | |
5321 packuswb m4, m4 | |
5322 movh [r0 + 314 * 16], m4 | |
5323 | |
5324 ; mode 21 [row 11 - first half] | |
5325 pslldq m6, 2 | |
5326 pinsrb m6, [r2 + 9 + 32], 1 | |
5327 pinsrb m6, [r2 + 11 + 32], 0 | |
5328 pmaddubsw m4, m6, [r3 + 20 * 16] | |
5329 pmulhrsw m4, m3 | |
5330 packuswb m4, m4 | |
5331 movh [r0 + 315 * 16], m4 | |
5332 | |
5333 ; mode 21 [row 12 - first half] | |
5334 pmaddubsw m4, m6, [r3 + 3 * 16] | |
5335 pmulhrsw m4, m3 | |
5336 packuswb m4, m4 | |
5337 movh [r0 + 316 * 16], m4 | |
5338 | |
5339 ; mode 21 [row 13 - first half] | |
5340 pslldq m6, 2 | |
5341 pinsrb m6, [r2 + 11 + 32], 1 | |
5342 pinsrb m6, [r2 + 13 + 32], 0 | |
5343 pmaddubsw m4, m6, [r3 + 18 * 16] | |
5344 pmulhrsw m4, m3 | |
5345 packuswb m4, m4 | |
5346 movh [r0 + 317 * 16], m4 | |
5347 | |
5348 ; mode 21 [row 14 - first half] | |
5349 pmaddubsw m4, m6, [r3 + 1 * 16] | |
5350 pmulhrsw m4, m3 | |
5351 packuswb m4, m4 | |
5352 movh [r0 + 318 * 16], m4 | |
5353 | |
5354 ; mode 21 [row 15 - first half] | |
5355 pslldq m6, 2 | |
5356 pinsrb m6, [r2 + 32 + 13], 1 | |
5357 pinsrb m6, [r2 + 32 + 15], 0 | |
5358 pmaddubsw m4, m6, [r3 + 16 * 16] | |
5359 pmulhrsw m4, m3 | |
5360 packuswb m4, m4 | |
5361 movh [r0 + 319 * 16], m4 | |
5362 | |
5363 ; mode 20 [row 13 - second half] | |
5364 pmaddubsw m4, m7, [r3 + 26 * 16] | |
5365 pmulhrsw m4, m3 | |
5366 packuswb m4, m4 | |
5367 movh [r0 + 301 * 16 + 8], m4 | |
5368 ; mode 20 [row 13 - second half] end | |
5369 | |
5370 ; mode 20 [row 14 - second half] | |
5371 pmaddubsw m4, m7, [r3 + 5 * 16] | |
5372 pmulhrsw m4, m3 | |
5373 packuswb m4, m4 | |
5374 movh [r0 + 302 * 16 + 8], m4 | |
5375 ; mode 20 [row 14 - second half] end | |
5376 | |
5377 ; mode 20 [row 3 - first half] | |
5378 pslldq m7, 2 | |
5379 pinsrb m7, [r2 + 32 + 2], 1 | |
5380 pinsrb m7, [r2 + 32 + 3], 0 | |
5381 pmaddubsw m4, m7, [r3 + 12 * 16] | |
5382 pmulhrsw m4, m3 | |
5383 packuswb m4, m4 | |
5384 movh [r0 + 291 * 16], m4 | |
5385 | |
5386 ; mode 20 [row 15 - second half] | |
5387 pmaddubsw m4, m7, [r3 + 16 * 16] | |
5388 pmulhrsw m4, m3 | |
5389 packuswb m4, m4 | |
5390 movh [r0 + 303 * 16 + 8], m4 | |
5391 ; mode 20 [row 15 - second half] end | |
5392 | |
5393 ; mode 20 [row 4 - first half] | |
5394 pslldq m7, 2 | |
5395 pinsrb m7, [r2 + 32 + 3], 1 | |
5396 pinsrb m7, [r2 + 32 + 5], 0 | |
5397 pmaddubsw m4, m7, [r3 + 23 * 16] | |
5398 pmulhrsw m4, m3 | |
5399 packuswb m4, m4 | |
5400 movh [r0 + 292 * 16], m4 | |
5401 | |
5402 ; mode 20 [row 5 - first half] | |
5403 pmaddubsw m4, m7, [r3 + 2 * 16] | |
5404 pmulhrsw m4, m3 | |
5405 packuswb m4, m4 | |
5406 movh [r0 + 293 * 16], m4 | |
5407 | |
5408 ; mode 20 [row 6 - first half] | |
5409 pslldq m7, 2 | |
5410 pinsrb m7, [r2 + 32 + 5], 1 | |
5411 pinsrb m7, [r2 + 32 + 6], 0 | |
5412 pmaddubsw m4, m7, [r3 + 13 * 16] | |
5413 pmulhrsw m4, m3 | |
5414 packuswb m4, m4 | |
5415 movh [r0 + 294 * 16], m4 | |
5416 | |
5417 ; mode 20 [row 7 - first half] | |
5418 pslldq m7, 2 | |
5419 pinsrb m7, [r2 + 32 + 6], 1 | |
5420 pinsrb m7, [r2 + 32 + 8], 0 | |
5421 pmaddubsw m4, m7, [r3 + 24 * 16] | |
5422 pmulhrsw m4, m3 | |
5423 packuswb m4, m4 | |
5424 movh [r0 + 295 * 16], m4 | |
5425 | |
5426 ; mode 20 [row 8 - first half] | |
5427 pmaddubsw m4, m7, [r3 + 3 * 16] | |
5428 pmulhrsw m4, m3 | |
5429 packuswb m4, m4 | |
5430 movh [r0 + 296 * 16], m4 | |
5431 | |
5432 ; mode 20 [row 9 - first half] | |
5433 pslldq m7, 2 | |
5434 pinsrb m7, [r2 + 32 + 8], 1 | |
5435 pinsrb m7, [r2 + 32 + 9], 0 | |
5436 pmaddubsw m4, m7, [r3 + 14 * 16] | |
5437 pmulhrsw m4, m3 | |
5438 packuswb m4, m4 | |
5439 movh [r0 + 297 * 16], m4 | |
5440 | |
5441 ; mode 20 [row 10 - first half] | |
5442 pslldq m7, 2 | |
5443 pinsrb m7, [r2 + 32 + 9], 1 | |
5444 pinsrb m7, [r2 + 32 + 11], 0 | |
5445 pmaddubsw m4, m7, [r3 + 25 * 16] | |
5446 pmulhrsw m4, m3 | |
5447 packuswb m4, m4 | |
5448 movh [r0 + 298 * 16], m4 | |
5449 | |
5450 ; mode 20 [row 11 - first half] | |
5451 pmaddubsw m4, m7, [r3 + 4 * 16] | |
5452 pmulhrsw m4, m3 | |
5453 packuswb m4, m4 | |
5454 movh [r0 + 299 * 16], m4 | |
5455 | |
5456 ; mode 20 [row 12 - first half] | |
5457 movu m1, [r3 + 15 * 16] | |
5458 pslldq m7, 2 | |
5459 pinsrb m7, [r2 + 32 + 11], 1 | |
5460 pinsrb m7, [r2 + 32 + 12], 0 | |
5461 pmaddubsw m4, m7, [r3 + 15 * 16] | |
5462 pmulhrsw m4, m3 | |
5463 packuswb m4, m4 | |
5464 movh [r0 + 300 * 16], m4 | |
5465 | |
5466 ; mode 20 [row 13 - first half] | |
5467 pslldq m7, 2 | |
5468 pinsrb m7, [r2 + 32 + 12], 1 | |
5469 pinsrb m7, [r2 + 32 + 14], 0 | |
5470 pmaddubsw m4, m7, [r3 + 26 * 16] | |
5471 pmulhrsw m4, m3 | |
5472 packuswb m4, m4 | |
5473 movh [r0 + 301 * 16], m4 | |
5474 | |
5475 ; mode 20 [row 14 - first half] | |
5476 pmaddubsw m4, m7, [r3 + 5 * 16] | |
5477 pmulhrsw m4, m3 | |
5478 packuswb m4, m4 | |
5479 movh [r0 + 302 * 16], m4 | |
5480 | |
5481 ; mode 20 [row 15 - first half] | |
5482 pslldq m7, 2 | |
5483 pinsrb m7, [r2 + 32 + 14], 1 | |
5484 pinsrb m7, [r2 + 32 + 15], 0 | |
5485 pmaddubsw m4, m7, [r3 + 16 * 16] | |
5486 pmulhrsw m4, m3 | |
5487 packuswb m4, m4 | |
5488 movh [r0 + 303 * 16], m4 | |
5489 | |
5490 ; mode 19 [row 1] | |
5491 pslldq m0, 2 | |
5492 pinsrb m0, [r2], 1 | |
5493 pinsrb m0, [r2 + 32 + 1], 0 | |
5494 pslldq m5, 2 | |
5495 pinsrb m5, [r2 + 8], 1 | |
5496 pinsrb m5, [r2 + 7], 0 | |
5497 | |
5498 ; mode 20 [row 1 - second half] | |
5499 pmaddubsw m4, m5, [r3 + 22 * 16] | |
5500 pmulhrsw m4, m3 | |
5501 packuswb m4, m4 | |
5502 movh [r0 + 289 * 16 + 8], m4 | |
5503 ; mode 20 [row 1 - second half] end | |
5504 | |
5505 ; mode 20 [row 2 - second half] | |
5506 pmaddubsw m4, m5, [r3 + 1 * 16] | |
5507 pmulhrsw m4, m3 | |
5508 packuswb m4, m4 | |
5509 movh [r0 + 290 * 16 + 8], m4 | |
5510 ; mode 20 [row 2 - second half] end | |
5511 | |
5512 ; mode 21 [row 1 - second half] | |
5513 pmaddubsw m4, m5, [r3 + 30 * 16] | |
5514 pmulhrsw m4, m3 | |
5515 packuswb m4, m4 | |
5516 movh [r0 + 305 * 16 + 8], m4 | |
5517 ; mode 21 [row 1 - second half] end | |
5518 | |
5519 ; mode 21 [row 2 - second half] | |
5520 pmaddubsw m4, m5, [r3 + 13 * 16] | |
5521 pmulhrsw m4, m3 | |
5522 packuswb m4, m4 | |
5523 movh [r0 + 306 * 16 + 8], m4 | |
5524 ; mode 21 [row 2 - second half] end | |
5525 | |
5526 ; mode 21 [row 3 - second half] (written with the row 4 weight; re-stored with the row 3 weight further down) | |
5527 pmaddubsw m4, m5, [r3 + 11 * 16] | |
5528 pmulhrsw m4, m3 | |
5529 packuswb m4, m4 | |
5530 movh [r0 + 307 * 16 + 8], m4 | |
5531 ; mode 21 [row 3 - second half] end | |
5532 | |
5533 ; mode 22 [row 2 - second half] | |
5534 pmaddubsw m4, m5, [r3 + 25 * 16] | |
5535 pmulhrsw m4, m3 | |
5536 packuswb m4, m4 | |
5537 movh [r0 + 322 * 16 + 8], m4 | |
5538 ; mode 22 [row 2 - second half] end | |
5539 | |
5540 ; mode 22 [row 3 - second half] | |
5541 pmaddubsw m4, m5, [r3 + 12 * 16] | |
5542 pmulhrsw m4, m3 | |
5543 packuswb m4, m4 | |
5544 movh [r0 + 323 * 16 + 8], m4 | |
5545 ; mode 22 [row 3 - second half] end | |
5546 | |
5547 ; mode 23 [row 3 - second half] | |
5548 pmaddubsw m4, m5, [r3 + 28 * 16] | |
5549 pmulhrsw m4, m3 | |
5550 packuswb m4, m4 | |
5551 movh [r0 + 339 * 16 + 8], m4 | |
5552 ; mode 23 [row 3 - second half] end | |
5553 | |
5554 ; mode 23 [row 4 - second half] | |
5555 pmaddubsw m4, m5, [r3 + 19 * 16] | |
5556 pmulhrsw m4, m3 | |
5557 packuswb m4, m4 | |
5558 movh [r0 + 340 * 16 + 8], m4 | |
5559 ; mode 23 [row 4 - second half] end | |
5560 | |
5561 ; mode 23 [row 5 - second half] | |
5562 pmaddubsw m4, m5, [r3 + 10 * 16] | |
5563 pmulhrsw m4, m3 | |
5564 packuswb m4, m4 | |
5565 movh [r0 + 341 * 16 + 8], m4 | |
5566 ; mode 23 [row 5 - second half] end | |
5567 | |
5568 ; mode 23 [row 6 - second half] | |
5569 pmaddubsw m4, m5, [r3 + 1 * 16] | |
5570 pmulhrsw m4, m3 | |
5571 packuswb m4, m4 | |
5572 movh [r0 + 342 * 16 + 8], m4 | |
5573 ; mode 23 [row 6 - second half] end | |
5574 | |
5575 ; mode 24 [row 6 - second half] | |
5576 pmaddubsw m4, m5, [r3 + 29 * 16] | |
5577 pmulhrsw m4, m3 | |
5578 packuswb m4, m4 | |
5579 movh [r0 + 358 * 16 + 8], m4 | |
5580 ; mode 24 [row 6 - second half] end | |
5581 | |
5582 ; mode 24 [row 7 - second half] | |
5583 pmaddubsw m4, m5, [r3 + 24 * 16] | |
5584 pmulhrsw m4, m3 | |
5585 packuswb m4, m4 | |
5586 movh [r0 + 359 * 16 + 8], m4 | |
5587 ; mode 24 [row 7 - second half] end | |
5588 | |
5589 ; mode 24 [row 8 - second half] | |
5590 pmaddubsw m4, m5, [r3 + 19 * 16] | |
5591 pmulhrsw m4, m3 | |
5592 packuswb m4, m4 | |
5593 movh [r0 + 360 * 16 + 8], m4 | |
5594 ; mode 24 [row 8 - second half] end | |
5595 | |
5596 ; mode 24 [row 9 - second half] | |
5597 pmaddubsw m4, m5, [r3 + 14 * 16] | |
5598 pmulhrsw m4, m3 | |
5599 packuswb m4, m4 | |
5600 movh [r0 + 361 * 16 + 8], m4 | |
5601 ; mode 24 [row 9 - second half] end | |
5602 | |
5603 ; mode 24 [row 10 - second half] | |
5604 pmaddubsw m4, m5, [r3 + 9 * 16] | |
5605 pmulhrsw m4, m3 | |
5606 packuswb m4, m4 | |
5607 movh [r0 + 362 * 16 + 8], m4 | |
5608 ; mode 24 [row 10 - second half] end | |
5609 | |
5610 ; mode 24 [row 11 - second half] | |
5611 pmaddubsw m4, m5, [r3 + 4 * 16] | |
5612 pmulhrsw m4, m3 | |
5613 packuswb m4, m4 | |
5614 movh [r0 + 363 * 16 + 8], m4 | |
5615 ; mode 24 [row 11 - second half] end | |
5616 | |
5617 pmaddubsw m4, m0, [r3 + 12 * 16] | |
5618 pmulhrsw m4, m3 | |
5619 pmaddubsw m6, m5, [r3 + 12 * 16] | |
5620 pmulhrsw m6, m3 | |
5621 packuswb m4, m6 | |
5622 movu [r0 + 273 * 16], m4 | |
5623 | |
5624 ; mode 19 [row 2] | |
5625 pslldq m0, 2 | |
5626 pinsrb m0, [r2 + 32 + 1], 1 | |
5627 pinsrb m0, [r2 + 32 + 2], 0 | |
5628 pslldq m5, 2 | |
5629 pinsrb m5, [r2 + 7], 1 | |
5630 pinsrb m5, [r2 + 6], 0 | |
5631 | |
5632 ; mode 20 [row 3 - second half] | |
5633 pmaddubsw m4, m5, [r3 + 12 * 16] | |
5634 pmulhrsw m4, m3 | |
5635 packuswb m4, m4 | |
5636 movh [r0 + 291 * 16 + 8], m4 | |
5637 ; mode 20 [row 3 - second half] end | |
5638 | |
5639 ; mode 21 [row 3 - second half] | |
5640 pmaddubsw m4, m5, [r3 + 28 * 16] | |
5641 pmulhrsw m4, m3 | |
5642 packuswb m4, m4 | |
5643 movh [r0 + 307 * 16 + 8], m4 | |
5644 ; mode 21 [row 3 - second half] end | |
5645 | |
5646 ; mode 21 [row 4 - second half] | |
5647 pmaddubsw m4, m5, [r3 + 11 * 16] | |
5648 pmulhrsw m4, m3 | |
5649 packuswb m4, m4 | |
5650 movh [r0 + 308 * 16 + 8], m4 | |
5651 ; mode 21 [row 4 - second half] end | |
5652 | |
5653 ; mode 22 [row 4 - second half] | |
5654 pmaddubsw m4, m5, [r3 + 31 * 16] | |
5655 pmulhrsw m4, m3 | |
5656 packuswb m4, m4 | |
5657 movh [r0 + 324 * 16 + 8], m4 | |
5658 ; mode 22 [row 4 - second half] end | |
5659 | |
5660 ; mode 22 [row 5 - second half] | |
5661 pmaddubsw m4, m5, [r3 + 18 * 16] | |
5662 pmulhrsw m4, m3 | |
5663 packuswb m4, m4 | |
5664 movh [r0 + 325 * 16 + 8], m4 | |
5665 ; mode 22 [row 5 - second half] end | |
5666 | |
5667 ; mode 22 [row 6 - second half] | |
5668 pmaddubsw m4, m5, [r3 + 5 * 16] | |
5669 pmulhrsw m4, m3 | |
5670 packuswb m4, m4 | |
5671 movh [r0 + 326 * 16 + 8], m4 | |
5672 ; mode 22 [row 6 - second half] end | |
5673 | |
5674 ; mode 23 [row 7 - second half] | |
5675 pmaddubsw m4, m5, [r3 + 24 * 16] | |
5676 pmulhrsw m4, m3 | |
5677 packuswb m4, m4 | |
5678 movh [r0 + 343 * 16 + 8], m4 | |
5679 ; mode 23 [row 7 - second half] end | |
5680 | |
5681 ; mode 23 [row 8 - second half] | |
5682 pmaddubsw m4, m5, [r3 + 15 * 16] | |
5683 pmulhrsw m4, m3 | |
5684 packuswb m4, m4 | |
5685 movh [r0 + 344 * 16 + 8], m4 | |
5686 ; mode 23 [row 8 - second half] end | |
5687 | |
5688 ; mode 23 [row 9 - second half] | |
5689 pmaddubsw m4, m5, [r3 + 6 * 16] | |
5690 pmulhrsw m4, m3 | |
5691 packuswb m4, m4 | |
5692 movh [r0 + 345 * 16 + 8], m4 | |
5693 ; mode 23 [row 9 - second half] end | |
5694 | |
5695 ; mode 24 [row 12 - second half] | |
5696 pmaddubsw m4, m5, [r3 + 31 * 16] | |
5697 pmulhrsw m4, m3 | |
5698 packuswb m4, m4 | |
5699 movh [r0 + 364 * 16 + 8], m4 | |
5700 ; mode 24 [row 12 - second half] end | |
5701 | |
5702 ; mode 24 [row 13 - second half] | |
5703 pmaddubsw m4, m5, [r3 + 26 * 16] | |
5704 pmulhrsw m4, m3 | |
5705 packuswb m4, m4 | |
5706 movh [r0 + 365 * 16 + 8], m4 | |
5707 ; mode 24 [row 13 - second half] end | |
5708 | |
5709 ; mode 24 [row 14 - second half] | |
5710 pmaddubsw m4, m5, [r3 + 21 * 16] | |
5711 pmulhrsw m4, m3 | |
5712 packuswb m4, m4 | |
5713 movh [r0 + 366 * 16 + 8], m4 | |
5714 ; mode 24 [row 14 - second half] end | |
5715 | |
5716 ; mode 24 [row 15 - second half] | |
5717 pmaddubsw m4, m5, [r3 + 16 * 16] | |
5718 pmulhrsw m4, m3 | |
5719 packuswb m4, m4 | |
5720 movh [r0 + 367 * 16 + 8], m4 | |
5721 ; mode 24 [row 15 - second half] end | |
5722 | |
5723 pmaddubsw m4, m0, [r3 + 18 * 16] | |
5724 pmulhrsw m4, m3 | |
5725 pmaddubsw m6, m5, [r3 + 18 * 16] | |
5726 pmulhrsw m6, m3 | |
5727 packuswb m4, m6 | |
5728 movu [r0 + 274 * 16], m4 | |
5729 | |
5730 ; mode 19 [row 3] | |
5731 pslldq m0, 2 | |
5732 pinsrb m0, [r2 + 32 + 2], 1 | |
5733 pinsrb m0, [r2 + 32 + 4], 0 | |
5734 pslldq m5, 2 | |
5735 pinsrb m5, [r2 + 6], 1 | |
5736 pinsrb m5, [r2 + 5], 0 | |
5737 | |
5738 ; mode 20 [row 4 - second half] | |
5739 pmaddubsw m4, m5, [r3 + 23 * 16] | |
5740 pmulhrsw m4, m3 | |
5741 packuswb m4, m4 | |
5742 movh [r0 + 292 * 16 + 8], m4 | |
5743 ; mode 20 [row 4 - second half] end | |
5744 | |
5745 ; mode 20 [row 5 - second half] | |
5746 pmaddubsw m4, m5, [r3 + 2 * 16] | |
5747 pmulhrsw m4, m3 | |
5748 packuswb m4, m4 | |
5749 movh [r0 + 293 * 16 + 8], m4 | |
5750 ; mode 20 [row 5 - second half] end | |
5751 | |
5752 ; mode 21 [row 5 - second half] | |
5753 pmaddubsw m4, m5, [r3 + 26 * 16] | |
5754 pmulhrsw m4, m3 | |
5755 packuswb m4, m4 | |
5756 movh [r0 + 309 * 16 + 8], m4 | |
5757 ; mode 21 [row 5 - second half] end | |
5758 | |
5759 ; mode 21 [row 6 - second half] | |
5760 pmaddubsw m4, m5, [r3 + 9 * 16] | |
5761 pmulhrsw m4, m3 | |
5762 packuswb m4, m4 | |
5763 movh [r0 + 310 * 16 + 8], m4 | |
5764 ; mode 21 [row 6 - second half] end | |
5765 | |
5766 ; mode 22 [row 7 - second half] | |
5767 pmaddubsw m4, m5, [r3 + 24 * 16] | |
5768 pmulhrsw m4, m3 | |
5769 packuswb m4, m4 | |
5770 movh [r0 + 327 * 16 + 8], m4 | |
5771 ; mode 22 [row 7 - second half] end | |
5772 | |
5773 ; mode 22 [row 8 - second half] | |
5774 pmaddubsw m4, m5, [r3 + 11 * 16] | |
5775 pmulhrsw m4, m3 | |
5776 packuswb m4, m4 | |
5777 movh [r0 + 328 * 16 + 8], m4 | |
5778 ; mode 22 [row 8 - second half] end | |
5779 | |
5780 ; mode 23 [row 10 - second half] | |
5781 pmaddubsw m4, m5, [r3 + 29 * 16] | |
5782 pmulhrsw m4, m3 | |
5783 packuswb m4, m4 | |
5784 movh [r0 + 346 * 16 + 8], m4 | |
5785 ; mode 23 [row 10 - second half] end | |
5786 | |
5787 ; mode 23 [row 11 - second half] | |
5788 pmaddubsw m4, m5, [r3 + 20 * 16] | |
5789 pmulhrsw m4, m3 | |
5790 packuswb m4, m4 | |
5791 movh [r0 + 347 * 16 + 8], m4 | |
5792 ; mode 23 [row 11 - second half] end | |
5793 | |
5794 ; mode 23 [row 12 - second half] | |
5795 pmaddubsw m4, m5, [r3 + 11 * 16] | |
5796 pmulhrsw m4, m3 | |
5797 packuswb m4, m4 | |
5798 movh [r0 + 348 * 16 + 8], m4 | |
5799 ; mode 23 [row 12 - second half] end | |
5800 | |
5801 ; mode 23 [row 13 - second half] | |
5802 pmaddubsw m4, m5, [r3 + 2 * 16] | |
5803 pmulhrsw m4, m3 | |
5804 packuswb m4, m4 | |
5805 movh [r0 + 349 * 16 + 8], m4 | |
5806 ; mode 23 [row 13 - second half] end | |
5807 | |
5808 pmaddubsw m4, m0, [r3 + 24 * 16] | |
5809 pmulhrsw m4, m3 | |
5810 pmaddubsw m6, m5, [r3 + 24 * 16] | |
5811 pmulhrsw m6, m3 | |
5812 packuswb m4, m6 | |
5813 movu [r0 + 275 * 16], m4 | |
5814 | |
5815 ; mode 19 [row 4] | |
5816 pslldq m0, 2 | |
5817 pinsrb m0, [r2 + 32 + 4], 1 | |
5818 pinsrb m0, [r2 + 32 + 5], 0 | |
5819 pslldq m5, 2 | |
5820 pinsrb m5, [r2 + 5], 1 | |
5821 pinsrb m5, [r2 + 4], 0 | |
5822 | |
5823 ; mode 20 [row 6 - second half] | |
5824 pmaddubsw m4, m5, [r3 + 13 * 16] | |
5825 pmulhrsw m4, m3 | |
5826 packuswb m4, m4 | |
5827 movh [r0 + 294 * 16 + 8], m4 | |
5828 ; mode 20 [row 6 - second half] end | |
5829 | |
5830 ; mode 21 [row 7 - second half] | |
5831 pmaddubsw m4, m5, [r3 + 24 * 16] | |
5832 pmulhrsw m4, m3 | |
5833 packuswb m4, m4 | |
5834 movh [r0 + 311 * 16 + 8], m4 | |
5835 ; mode 21 [row 7 - second half] end | |
5836 | |
5837 ; mode 21 [row 8 - second half] | |
5838 pmaddubsw m4, m5, [r3 + 7 * 16] | |
5839 pmulhrsw m4, m3 | |
5840 packuswb m4, m4 | |
5841 movh [r0 + 312 * 16 + 8], m4 | |
5842 ; mode 21 [row 8 - second half] end | |
5843 | |
5844 ; mode 22 [row 9 - second half] | |
5845 pmaddubsw m4, m5, [r3 + 30 * 16] | |
5846 pmulhrsw m4, m3 | |
5847 packuswb m4, m4 | |
5848 movh [r0 + 329 * 16 + 8], m4 | |
5849 ; mode 22 [row 9 - second half] end | |
5850 | |
5851 ; mode 22 [row 10 - second half] | |
5852 pmaddubsw m4, m5, [r3 + 17 * 16] | |
5853 pmulhrsw m4, m3 | |
5854 packuswb m4, m4 | |
5855 movh [r0 + 330 * 16 + 8], m4 | |
5856 ; mode 22 [row 10 - second half] end | |
5857 | |
5858 ; mode 22 [row 11 - second half] | |
5859 pmaddubsw m4, m5, [r3 + 4 * 16] | |
5860 pmulhrsw m4, m3 | |
5861 packuswb m4, m4 | |
5862 movh [r0 + 331 * 16 + 8], m4 | |
5863 ; mode 22 [row 11 - second half] end | |
5864 | |
5865 ; mode 23 [row 14 - second half] | |
5866 pmaddubsw m4, m5, [r3 + 25 * 16] | |
5867 pmulhrsw m4, m3 | |
5868 packuswb m4, m4 | |
5869 movh [r0 + 350 * 16 + 8], m4 | |
5870 ; mode 23 [row 14 - second half] end | |
5871 | |
5872 ; mode 23 [row 15 - second half] | |
5873 pmaddubsw m4, m5, [r3 + 16 * 16] | |
5874 pmulhrsw m4, m3 | |
5875 packuswb m4, m4 | |
5876 movh [r0 + 351 * 16 + 8], m4 | |
5877 ; mode 23 [row 15 - second half] end | |
5878 | |
5879 pmaddubsw m4, m0, [r3 + 30 * 16] | |
5880 pmulhrsw m4, m3 | |
5881 pmaddubsw m6, m5, [r3 + 30 * 16] | |
5882 pmulhrsw m6, m3 | |
5883 packuswb m4, m6 | |
5884 movu [r0 + 276 * 16], m4 | |
5885 | |
5886 ; mode 19 [row 5] | |
5887 pmaddubsw m4, m0, [r3 + 4 * 16] | |
5888 pmulhrsw m4, m3 | |
5889 pmaddubsw m6, m5, [r3 + 4 * 16] | |
5890 pmulhrsw m6, m3 | |
5891 packuswb m4, m6 | |
5892 movu [r0 + 277 * 16], m4 | |
5893 | |
5894 ; mode 19 [row 6] | |
5895 pslldq m0, 2 | |
5896 pinsrb m0, [r2 + 32 + 5], 1 | |
5897 pinsrb m0, [r2 + 32 + 6], 0 | |
5898 pslldq m5, 2 | |
5899 pinsrb m5, [r2 + 4], 1 | |
5900 pinsrb m5, [r2 + 3], 0 | |
5901 | |
5902 ; mode 20 [row 7 - second half] | |
5903 pmaddubsw m4, m5, [r3 + 24 * 16] | |
5904 pmulhrsw m4, m3 | |
5905 packuswb m4, m4 | |
5906 movh [r0 + 295 * 16 + 8], m4 | |
5907 ; mode 20 [row 7 - second half] end | |
5908 | |
5909 ; mode 20 [row 8 - second half] | |
5910 pmaddubsw m4, m5, [r3 + 3 * 16] | |
5911 pmulhrsw m4, m3 | |
5912 packuswb m4, m4 | |
5913 movh [r0 + 296 * 16 + 8], m4 | |
5914 ; mode 20 [row 8 - second half] end | |
5915 | |
5916 ; mode 21 [row 9 - second half] | |
5917 pmaddubsw m4, m5, [r3 + 22 * 16] | |
5918 pmulhrsw m4, m3 | |
5919 packuswb m4, m4 | |
5920 movh [r0 + 313 * 16 + 8], m4 | |
5921 ; mode 21 [row 9 - second half] end | |
5922 | |
5923 ; mode 21 [row 10 - second half] | |
5924 pmaddubsw m4, m5, [r3 + 5 * 16] | |
5925 pmulhrsw m4, m3 | |
5926 packuswb m4, m4 | |
5927 movh [r0 + 314 * 16 + 8], m4 | |
5928 ; mode 21 [row 10 - second half] end | |
5929 | |
5930 ; mode 22 [row 12 - second half] | |
5931 pmaddubsw m4, m5, [r3 + 23 * 16] | |
5932 pmulhrsw m4, m3 | |
5933 packuswb m4, m4 | |
5934 movh [r0 + 332 * 16 + 8], m4 | |
5935 ; mode 22 [row 12 - second half] end | |
5936 | |
5937 ; mode 22 [row 13 - second half] | |
5938 pmaddubsw m4, m5, [r3 + 10 * 16] | |
5939 pmulhrsw m4, m3 | |
5940 packuswb m4, m4 | |
5941 movh [r0 + 333 * 16 + 8], m4 | |
5942 ; mode 22 [row 13 - second half] end | |
5943 | |
5944 pmaddubsw m4, m0, [r3 + 10 * 16] | |
5945 pmulhrsw m4, m3 | |
5946 pmaddubsw m6, m5, [r3 + 10 * 16] | |
5947 pmulhrsw m6, m3 | |
5948 packuswb m4, m6 | |
5949 movu [r0 + 278 * 16], m4 | |
5950 | |
5951 ; mode 19 [row 7] | |
5952 pslldq m0, 2 | |
5953 pinsrb m0, [r2 + 32 + 6], 1 | |
5954 pinsrb m0, [r2 + 32 + 7], 0 | |
5955 pslldq m5, 2 | |
5956 pinsrb m5, [r2 + 3], 1 | |
5957 pinsrb m5, [r2 + 2], 0 | |
5958 | |
5959 ; mode 20 [row 9 - second half] | |
5960 pmaddubsw m4, m5, [r3 + 14 * 16] | |
5961 pmulhrsw m4, m3 | |
5962 packuswb m4, m4 | |
5963 movh [r0 + 297 * 16 + 8], m4 | |
5964 ; mode 20 [row 9 - second half] end | |
5965 | |
5966 ; mode 21 [row 11 - second half] | |
5967 pmaddubsw m4, m5, [r3 + 20 * 16] | |
5968 pmulhrsw m4, m3 | |
5969 packuswb m4, m4 | |
5970 movh [r0 + 315 * 16 + 8], m4 | |
5971 ; mode 21 [row 11 - second half] end | |
5972 | |
5973 ; mode 21 [row 12 - second half] | |
5974 pmaddubsw m4, m5, [r3 + 3 * 16] | |
5975 pmulhrsw m4, m3 | |
5976 packuswb m4, m4 | |
5977 movh [r0 + 316 * 16 + 8], m4 | |
5978 ; mode 21 [row 12 - second half] end | |
5979 | |
5980 ; mode 22 [row 14 - second half] | |
5981 pmaddubsw m4, m5, [r3 + 29 * 16] | |
5982 pmulhrsw m4, m3 | |
5983 packuswb m4, m4 | |
5984 movh [r0 + 334 * 16 + 8], m4 | |
5985 ; mode 22 [row 14 - second half] end | |
5986 | |
5987 ; mode 22 [row 15 - second half] | |
5988 pmaddubsw m4, m5, [r3 + 16 * 16] | |
5989 pmulhrsw m4, m3 | |
5990 packuswb m4, m4 | |
5991 movh [r0 + 335 * 16 + 8], m4 | |
5992 ; mode 22 [row 15 - second half] end | |
5993 | |
5994 pmaddubsw m4, m0, [r3 + 16 * 16] | |
5995 pmulhrsw m4, m3 | |
5996 pmaddubsw m6, m5, [r3 + 16 * 16] | |
5997 pmulhrsw m6, m3 | |
5998 packuswb m4, m6 | |
5999 movu [r0 + 279 * 16], m4 | |
6000 | |
6001 ; mode 19 [row 8] | |
6002 pslldq m0, 2 | |
6003 pinsrb m0, [r2 + 32 + 7], 1 | |
6004 pinsrb m0, [r2 + 32 + 9], 0 | |
6005 pslldq m5, 2 | |
6006 pinsrb m5, [r2 + 2], 1 | |
6007 pinsrb m5, [r2 + 1], 0 | |
6008 | |
6009 ; mode 20 [row 10 - second half] | |
6010 pmaddubsw m4, m5, [r3 + 25 * 16] | |
6011 pmulhrsw m4, m3 | |
6012 packuswb m4, m4 | |
6013 movh [r0 + 298 * 16 + 8], m4 | |
6014 ; mode 20 [row 10 - second half] end | |
6015 | |
6016 ; mode 20 [row 11 - second half] | |
6017 pmaddubsw m4, m5, [r3 + 4 * 16] | |
6018 pmulhrsw m4, m3 | |
6019 packuswb m4, m4 | |
6020 movh [r0 + 299 * 16 + 8], m4 | |
6021 ; mode 20 [row 11 - second half] end | |
6022 | |
6023 ; mode 21 [row 13 - second half] | |
6024 pmaddubsw m4, m5, [r3 + 18 * 16] | |
6025 pmulhrsw m4, m3 | |
6026 packuswb m4, m4 | |
6027 movh [r0 + 317 * 16 + 8], m4 | |
6028 ; mode 21 [row 13 - second half] end | |
6029 | |
6030 ; mode 21 [row 14 - second half] | |
6031 pmaddubsw m4, m5, [r3 + 1 * 16] | |
6032 pmulhrsw m4, m3 | |
6033 packuswb m4, m4 | |
6034 movh [r0 + 318 * 16 + 8], m4 | |
6035 ; mode 21 [row 14 - second half] end | |
6036 | |
6037 pmaddubsw m4, m0, [r3 + 22 * 16] | |
6038 pmulhrsw m4, m3 | |
6039 pmaddubsw m6, m5, [r3 + 22 * 16] | |
6040 pmulhrsw m6, m3 | |
6041 packuswb m4, m6 | |
6042 movu [r0 + 280 * 16], m4 | |
6043 | |
6044 ; mode 19 [row 9] | |
6045 pslldq m0, 2 | |
6046 pinsrb m0, [r2 + 32 + 9], 1 | |
6047 pinsrb m0, [r2 + 32 + 10], 0 | |
6048 pslldq m5, 2 | |
6049 pinsrb m5, [r2 + 1], 1 | |
6050 pinsrb m5, [r2 + 0], 0 | |
6051 | |
6052 ; mode 20 [row 12 - second half] | |
6053 pmaddubsw m4, m5, [r3 + 15 * 16] | |
6054 pmulhrsw m4, m3 | |
6055 packuswb m4, m4 | |
6056 movh [r0 + 300 * 16 + 8], m4 | |
6057 ; mode 20 [row 12 - second half] end | |
6058 | |
6059 pmaddubsw m4, m0, [r3 + 28 * 16] | |
6060 pmulhrsw m4, m3 | |
6061 pmaddubsw m6, m5, [r3 + 28 * 16] | |
6062 pmulhrsw m6, m3 | |
6063 packuswb m4, m6 | |
6064 movu [r0 + 281 * 16], m4 | |
6065 | |
6066 ; mode 19 [row 10] | |
6067 pmaddubsw m4, m0, [r3 + 2 * 16] | |
6068 pmulhrsw m4, m3 | |
6069 pmaddubsw m6, m5, [r3 + 2 * 16] | |
6070 pmulhrsw m6, m3 | |
6071 packuswb m4, m6 | |
6072 movu [r0 + 282 * 16], m4 | |
6073 | |
6074 ; mode 19 [row 11] | |
6075 pslldq m0, 2 | |
6076 pinsrb m0, [r2 + 32 + 10], 1 | |
6077 pinsrb m0, [r2 + 32 + 11], 0 | |
6078 pmaddubsw m4, m0, [r3 + 8 * 16] | |
6079 pmulhrsw m4, m3 | |
6080 pslldq m5, 2 | |
6081 pinsrb m5, [r2], 1 | |
6082 pinsrb m5, [r2 + 32 + 1], 0 | |
6083 pmaddubsw m6, m5, [r3 + 8 * 16] | |
6084 pmulhrsw m6, m3 | |
6085 packuswb m4, m6 | |
6086 movu [r0 + 283 * 16], m4 | |
6087 | |
6088 ; mode 19 [row 12] | |
6089 pslldq m0, 2 | |
6090 pinsrb m0, [r2 + 32 + 11], 1 | |
6091 pinsrb m0, [r2 + 32 + 12], 0 | |
6092 pslldq m5, 2 | |
6093 pinsrb m5, [r2 + 32 + 1], 1 | |
6094 pinsrb m5, [r2 + 32 + 2], 0 | |
6095 pmaddubsw m4, m0, [r3 + 14 * 16] | |
6096 pmulhrsw m4, m3 | |
6097 pmaddubsw m6, m5, [r3 + 14 * 16] | |
6098 pmulhrsw m6, m3 | |
6099 packuswb m4, m6 | |
6100 movu [r0 + 284 * 16], m4 | |
6101 | |
6102 ; mode 19 [row 13] | |
6103 pslldq m0, 2 | |
6104 pinsrb m0, [r2 + 32 + 12], 1 | |
6105 pinsrb m0, [r2 + 32 + 14], 0 | |
6106 pmaddubsw m4, m0, [r3 + 20 * 16] | |
6107 pmulhrsw m4, m3 | |
6108 pslldq m5, 2 | |
6109 pinsrb m5, [r2 + 32 + 2], 1 | |
6110 pinsrb m5, [r2 + 32 + 4], 0 | |
6111 pmaddubsw m6, m5, [r3 + 20 * 16] | |
6112 pmulhrsw m6, m3 | |
6113 packuswb m4, m6 | |
6114 movu [r0 + 285 * 16], m4 | |
6115 | |
6116 ; mode 19 [row 14] | |
6117 pslldq m0, 2 | |
6118 pinsrb m0, [r2 + 32 + 14], 1 | |
6119 pinsrb m0, [r2 + 32 + 15], 0 | |
6120 pmaddubsw m4, m0, [r3 + 26 * 16] | |
6121 pmulhrsw m4, m3 | |
6122 pslldq m5, 2 | |
6123 pinsrb m5, [r2 + 32 + 4], 1 | |
6124 pinsrb m5, [r2 + 32 + 5], 0 | |
6125 pmaddubsw m6, m5, [r3 + 26 * 16] | |
6126 pmulhrsw m6, m3 | |
6127 packuswb m4, m6 | |
6128 movu [r0 + 286 * 16], m4 | |
6129 | |
6130 ; mode 19 [row 15] | |
6131 movu m0, [r2 + 32] | |
6132 pshufb m0, [tab_S1] | |
6133 movu [r0 + 287 * 16], m0 | |
6134 movd m1, [r2] | |
6135 movd [r0 + 287 * 16 + 12], m1 | |
6136 | |
6137 ; mode 25 | |
6138 movu m1, [r1] | |
6139 | |
6140 ; mode 26 [all rows] | |
6141 psrldq m6, m1, 1 | |
6142 pinsrb m6, [r1 + 16], 15 | |
6143 movu m7, m6 | |
6144 movu [r0 + 384 * 16], m6 | |
6145 movu [r0 + 385 * 16], m6 | |
6146 movu [r0 + 386 * 16], m6 | |
6147 movu [r0 + 387 * 16], m6 | |
6148 movu [r0 + 388 * 16], m6 | |
6149 movu [r0 + 389 * 16], m6 | |
6150 movu [r0 + 390 * 16], m6 | |
6151 movu [r0 + 391 * 16], m6 | |
6152 movu [r0 + 392 * 16], m6 | |
6153 movu [r0 + 393 * 16], m6 | |
6154 movu [r0 + 394 * 16], m6 | |
6155 movu [r0 + 395 * 16], m6 | |
6156 movu [r0 + 396 * 16], m6 | |
6157 movu [r0 + 397 * 16], m6 | |
6158 movu [r0 + 398 * 16], m6 | |
6159 movu [r0 + 399 * 16], m6 | |
6160 | |
6161 pxor m0, m0 | |
6162 pshufb m6, m6, m0 | |
6163 punpcklbw m6, m0 | |
6164 pinsrb m2, [r1], 0 | |
6165 pshufb m2, m2, m0 | |
6166 punpcklbw m2, m0 | |
6167 movu m4, [r1 + 1 + 32] | |
6168 punpcklbw m5, m4, m0 | |
6169 punpckhbw m4, m0 | |
6170 psubw m5, m2 | |
6171 psubw m4, m2 | |
6172 psraw m5, 1 | |
6173 psraw m4, 1 | |
6174 paddw m5, m6 | |
6175 paddw m4, m6 | |
6176 packuswb m5, m4 | |
6177 | |
6178 pextrb [r0 + 384 * 16], m5, 0 | |
6179 pextrb [r0 + 385 * 16], m5, 1 | |
6180 pextrb [r0 + 386 * 16], m5, 2 | |
6181 pextrb [r0 + 387 * 16], m5, 3 | |
6182 pextrb [r0 + 388 * 16], m5, 4 | |
6183 pextrb [r0 + 389 * 16], m5, 5 | |
6184 pextrb [r0 + 390 * 16], m5, 6 | |
6185 pextrb [r0 + 391 * 16], m5, 7 | |
6186 pextrb [r0 + 392 * 16], m5, 8 | |
6187 pextrb [r0 + 393 * 16], m5, 9 | |
6188 pextrb [r0 + 394 * 16], m5, 10 | |
6189 pextrb [r0 + 395 * 16], m5, 11 | |
6190 pextrb [r0 + 396 * 16], m5, 12 | |
6191 pextrb [r0 + 397 * 16], m5, 13 | |
6192 pextrb [r0 + 398 * 16], m5, 14 | |
6193 pextrb [r0 + 399 * 16], m5, 15 | |
6194 | |
6195 ; mode 25 [row 15] | |
6196 movu [r0 + 383 * 16], m1 | |
6197 | |
6198 ; mode 25 [row 0] | |
6199 psrldq m2, m1, 1 | |
6200 punpcklbw m1, m2 | |
6201 movu m2, [r1 + 8] | |
6202 psrldq m4, m2, 1 | |
6203 punpcklbw m2, m4 | |
6204 pmaddubsw m4, m1, [r3 + 30 * 16] | |
6205 pmulhrsw m4, m3 | |
6206 pmaddubsw m5, m2, [r3 + 30 * 16] | |
6207 pmulhrsw m5, m3 | |
6208 packuswb m4, m5 | |
6209 movu [r0 + 368 * 16], m4 | |
6210 | |
6211 ; mode 25 [row 1] | |
6212 pmaddubsw m4, m1, [r3 + 28 * 16] | |
6213 pmulhrsw m4, m3 | |
6214 pmaddubsw m5, m2, [r3 + 28 * 16] | |
6215 pmulhrsw m5, m3 | |
6216 packuswb m4, m5 | |
6217 movu [r0 + 369 * 16], m4 | |
6218 | |
6219 ; mode 25 [row 2] | |
6220 pmaddubsw m4, m1, [r3 + 26 * 16] | |
6221 pmulhrsw m4, m3 | |
6222 pmaddubsw m5, m2, [r3 + 26 * 16] | |
6223 pmulhrsw m5, m3 | |
6224 packuswb m4, m5 | |
6225 movu [r0 + 370 * 16], m4 | |
6226 | |
6227 ; mode 25 [row 3] | |
6228 pmaddubsw m4, m1, [r3 + 24 * 16] | |
6229 pmulhrsw m4, m3 | |
6230 pmaddubsw m5, m2, [r3 + 24 * 16] | |
6231 pmulhrsw m5, m3 | |
6232 packuswb m4, m5 | |
6233 movu [r0 + 371 * 16], m4 | |
6234 | |
6235 ; mode 25 [row 4] | |
6236 pmaddubsw m4, m1, [r3 + 22 * 16] | |
6237 pmulhrsw m4, m3 | |
6238 pmaddubsw m5, m2, [r3 + 22 * 16] | |
6239 pmulhrsw m5, m3 | |
6240 packuswb m4, m5 | |
6241 movu [r0 + 372 * 16], m4 | |
6242 | |
6243 ; mode 25 [row 5] | |
6244 pmaddubsw m4, m1, [r3 + 20 * 16] | |
6245 pmulhrsw m4, m3 | |
6246 pmaddubsw m5, m2, [r3 + 20 * 16] | |
6247 pmulhrsw m5, m3 | |
6248 packuswb m4, m5 | |
6249 movu [r0 + 373 * 16], m4 | |
6250 | |
6251 ; mode 25 [row 6] | |
6252 pmaddubsw m4, m1, [r3 + 18 * 16] | |
6253 pmulhrsw m4, m3 | |
6254 pmaddubsw m5, m2, [r3 + 18 * 16] | |
6255 pmulhrsw m5, m3 | |
6256 packuswb m4, m5 | |
6257 movu [r0 + 374 * 16], m4 | |
6258 | |
6259 ; mode 25 [row 7] | |
6260 pmaddubsw m4, m1, [r3 + 16 * 16] | |
6261 pmulhrsw m4, m3 | |
6262 pmaddubsw m5, m2, [r3 + 16 * 16] | |
6263 pmulhrsw m5, m3 | |
6264 packuswb m4, m5 | |
6265 movu [r0 + 375 * 16], m4 | |
6266 | |
6267 ; mode 25 [row 8] | |
6268 pmaddubsw m4, m1, [r3 + 14 * 16] | |
6269 pmulhrsw m4, m3 | |
6270 pmaddubsw m5, m2, [r3 + 14 * 16] | |
6271 pmulhrsw m5, m3 | |
6272 packuswb m4, m5 | |
6273 movu [r0 + 376 * 16], m4 | |
6274 | |
6275 ; mode 25 [row 9] | |
6276 pmaddubsw m4, m1, [r3 + 12 * 16] | |
6277 pmulhrsw m4, m3 | |
6278 pmaddubsw m5, m2, [r3 + 12 * 16] | |
6279 pmulhrsw m5, m3 | |
6280 packuswb m4, m5 | |
6281 movu [r0 + 377 * 16], m4 | |
6282 | |
6283 ; mode 25 [row 10] | |
6284 pmaddubsw m4, m1, [r3 + 10 * 16] | |
6285 pmulhrsw m4, m3 | |
6286 pmaddubsw m5, m2, [r3 + 10 * 16] | |
6287 pmulhrsw m5, m3 | |
6288 packuswb m4, m5 | |
6289 movu [r0 + 378 * 16], m4 | |
6290 | |
6291 ; mode 25 [row 11] | |
6292 pmaddubsw m4, m1, [r3 + 8 * 16] | |
6293 pmulhrsw m4, m3 | |
6294 pmaddubsw m5, m2, [r3 + 8 * 16] | |
6295 pmulhrsw m5, m3 | |
6296 packuswb m4, m5 | |
6297 movu [r0 + 379 * 16], m4 | |
6298 | |
6299 ; mode 25 [row 12] | |
6300 pmaddubsw m4, m1, [r3 + 6 * 16] | |
6301 pmulhrsw m4, m3 | |
6302 pmaddubsw m5, m2, [r3 + 6 * 16] | |
6303 pmulhrsw m5, m3 | |
6304 packuswb m4, m5 | |
6305 movu [r0 + 380 * 16], m4 | |
6306 | |
6307 ; mode 25 [row 13] | |
6308 pmaddubsw m4, m1, [r3 + 4 * 16] | |
6309 pmulhrsw m4, m3 | |
6310 pmaddubsw m5, m2, [r3 + 4 * 16] | |
6311 pmulhrsw m5, m3 | |
6312 packuswb m4, m5 | |
6313 movu [r0 + 381 * 16], m4 | |
6314 | |
6315 ; mode 25 [row 14] | |
6316 pmaddubsw m4, m1, [r3 + 2 * 16] | |
6317 pmulhrsw m4, m3 | |
6318 pmaddubsw m5, m2, [r3 + 2 * 16] | |
6319 pmulhrsw m5, m3 | |
6320 packuswb m4, m5 | |
6321 movu [r0 + 382 * 16], m4 | |
6322 | |
6323 ; mode 27 [row 15] | |
6324 psrldq m6, m7, 1 | |
6325 punpcklbw m7, m6 | |
6326 pinsrb m6, [r1 + 17], 15 | |
6327 movu [r0 + 415 * 16], m6 | |
6328 | |
6329 ; mode 27 [row 0] | |
6330 movu m4, [r1 + 9] | |
6331 psrldq m5, m4, 1 | |
6332 punpcklbw m4, m5 | |
6333 pmaddubsw m6, m7, [r3 + 2 * 16] | |
6334 pmulhrsw m6, m3 | |
6335 pmaddubsw m5, m4, [r3 + 2 * 16] | |
6336 pmulhrsw m5, m3 | |
6337 packuswb m6, m5 | |
6338 movu [r0 + 400 * 16], m6 | |
6339 | |
6340 ; mode 27 [row 1] | |
6341 pmaddubsw m6, m7, [r3 + 4 * 16] | |
6342 pmulhrsw m6, m3 | |
6343 pmaddubsw m5, m4, [r3 + 4 * 16] | |
6344 pmulhrsw m5, m3 | |
6345 packuswb m6, m5 | |
6346 movu [r0 + 401 * 16], m6 | |
6347 | |
6348 ; mode 27 [row 2] | |
6349 pmaddubsw m6, m7, [r3 + 6 * 16] | |
6350 pmulhrsw m6, m3 | |
6351 pmaddubsw m5, m4, [r3 + 6 * 16] | |
6352 pmulhrsw m5, m3 | |
6353 packuswb m6, m5 | |
6354 movu [r0 + 402 * 16], m6 | |
6355 | |
6356 ; mode 27 [row 3] | |
6357 pmaddubsw m6, m7, [r3 + 8 * 16] | |
6358 pmulhrsw m6, m3 | |
6359 pmaddubsw m5, m4, [r3 + 8 * 16] | |
6360 pmulhrsw m5, m3 | |
6361 packuswb m6, m5 | |
6362 movu [r0 + 403 * 16], m6 | |
6363 | |
6364 ; mode 27 [row 4] | |
6365 pmaddubsw m6, m7, [r3 + 10 * 16] | |
6366 pmulhrsw m6, m3 | |
6367 pmaddubsw m5, m4, [r3 + 10 * 16] | |
6368 pmulhrsw m5, m3 | |
6369 packuswb m6, m5 | |
6370 movu [r0 + 404 * 16], m6 | |
6371 | |
6372 ; mode 27 [row 5] | |
6373 pmaddubsw m6, m7, [r3 + 12 * 16] | |
6374 pmulhrsw m6, m3 | |
6375 pmaddubsw m5, m4, [r3 + 12 * 16] | |
6376 pmulhrsw m5, m3 | |
6377 packuswb m6, m5 | |
6378 movu [r0 + 405 * 16], m6 | |
6379 | |
6380 ; mode 27 [row 6] | |
6381 pmaddubsw m6, m7, [r3 + 14 * 16] | |
6382 pmulhrsw m6, m3 | |
6383 pmaddubsw m5, m4, [r3 + 14 * 16] | |
6384 pmulhrsw m5, m3 | |
6385 packuswb m6, m5 | |
6386 movu [r0 + 406 * 16], m6 | |
6387 | |
6388 ; mode 27 [row 7] | |
6389 pmaddubsw m6, m7, [r3 + 16 * 16] | |
6390 pmulhrsw m6, m3 | |
6391 pmaddubsw m5, m4, [r3 + 16 * 16] | |
6392 pmulhrsw m5, m3 | |
6393 packuswb m6, m5 | |
6394 movu [r0 + 407 * 16], m6 | |
6395 | |
6396 ; mode 27 [row 8] | |
6397 pmaddubsw m6, m7, [r3 + 18 * 16] | |
6398 pmulhrsw m6, m3 | |
6399 pmaddubsw m5, m4, [r3 + 18 * 16] | |
6400 pmulhrsw m5, m3 | |
6401 packuswb m6, m5 | |
6402 movu [r0 + 408 * 16], m6 | |
6403 | |
6404 ; mode 27 [row 9] | |
6405 pmaddubsw m6, m7, [r3 + 20 * 16] | |
6406 pmulhrsw m6, m3 | |
6407 pmaddubsw m5, m4, [r3 + 20 * 16] | |
6408 pmulhrsw m5, m3 | |
6409 packuswb m6, m5 | |
6410 movu [r0 + 409 * 16], m6 | |
6411 | |
6412 ; mode 27 [row 10] | |
6413 pmaddubsw m6, m7, [r3 + 22 * 16] | |
6414 pmulhrsw m6, m3 | |
6415 pmaddubsw m5, m4, [r3 + 22 * 16] | |
6416 pmulhrsw m5, m3 | |
6417 packuswb m6, m5 | |
6418 movu [r0 + 410 * 16], m6 | |
6419 | |
6420 ; mode 27 [row 11] | |
6421 pmaddubsw m6, m7, [r3 + 24 * 16] | |
6422 pmulhrsw m6, m3 | |
6423 pmaddubsw m5, m4, [r3 + 24 * 16] | |
6424 pmulhrsw m5, m3 | |
6425 packuswb m6, m5 | |
6426 movu [r0 + 411 * 16], m6 | |
6427 | |
6428 ; mode 27 [row 12] | |
6429 pmaddubsw m6, m7, [r3 + 26 * 16] | |
6430 pmulhrsw m6, m3 | |
6431 pmaddubsw m5, m4, [r3 + 26 * 16] | |
6432 pmulhrsw m5, m3 | |
6433 packuswb m6, m5 | |
6434 movu [r0 + 412 * 16], m6 | |
6435 | |
6436 ; mode 27 [row 13] | |
6437 pmaddubsw m6, m7, [r3 + 28 * 16] | |
6438 pmulhrsw m6, m3 | |
6439 pmaddubsw m5, m4, [r3 + 28 * 16] | |
6440 pmulhrsw m5, m3 | |
6441 packuswb m6, m5 | |
6442 movu [r0 + 413 * 16], m6 | |
6443 | |
6444 ; mode 27 [row 14] | |
6445 pmaddubsw m6, m7, [r3 + 30 * 16] | |
6446 pmulhrsw m6, m3 | |
6447 pmaddubsw m5, m4, [r3 + 30 * 16] | |
6448 pmulhrsw m5, m3 | |
6449 packuswb m6, m5 | |
6450 movu [r0 + 414 * 16], m6 | |
6451 | |
6452 ; mode 28 [row 0] | |
6453 movu m1, [r2 + 1] | |
6454 psrldq m2, m1, 1 | |
6455 punpcklbw m1, m2 | |
6456 movu m4, [r2 + 9] | |
6457 psrldq m5, m4, 1 | |
6458 punpcklbw m4, m5 | |
6459 pmaddubsw m2, m1, [r3 + 5 * 16] | |
6460 pmulhrsw m2, m3 | |
6461 pmaddubsw m5, m4, [r3 + 5 * 16] | |
6462 pmulhrsw m5, m3 | |
6463 packuswb m2, m5 | |
6464 movu [r0 + 416 * 16], m2 | |
6465 | |
6466 ; mode 28 [row 0] | |
6467 pmaddubsw m2, m1, [r3 + 5 * 16] | |
6468 pmulhrsw m2, m3 | |
6469 pmaddubsw m5, m4, [r3 + 5 * 16] | |
6470 pmulhrsw m5, m3 | |
6471 packuswb m2, m5 | |
6472 movu [r0 + 416 * 16], m2 | |
6473 | |
6474 ; mode 28 [row 1] | |
6475 pmaddubsw m2, m1, [r3 + 10 * 16] | |
6476 pmulhrsw m2, m3 | |
6477 pmaddubsw m5, m4, [r3 + 10 * 16] | |
6478 pmulhrsw m5, m3 | |
6479 packuswb m2, m5 | |
6480 movu [r0 + 417 * 16], m2 | |
6481 | |
6482 ; mode 28 [row 2] | |
6483 pmaddubsw m2, m1, [r3 + 15 * 16] | |
6484 pmulhrsw m2, m3 | |
6485 pmaddubsw m5, m4, [r3 + 15 * 16] | |
6486 pmulhrsw m5, m3 | |
6487 packuswb m2, m5 | |
6488 movu [r0 + 418 * 16], m2 | |
6489 | |
6490 ; mode 28 [row 3] | |
6491 pmaddubsw m2, m1, [r3 + 20 * 16] | |
6492 pmulhrsw m2, m3 | |
6493 pmaddubsw m5, m4, [r3 + 20 * 16] | |
6494 pmulhrsw m5, m3 | |
6495 packuswb m2, m5 | |
6496 movu [r0 + 419 * 16], m2 | |
6497 | |
6498 ; mode 28 [row 4] | |
6499 pmaddubsw m2, m1, [r3 + 25 * 16] | |
6500 pmulhrsw m2, m3 | |
6501 pmaddubsw m5, m4, [r3 + 25 * 16] | |
6502 pmulhrsw m5, m3 | |
6503 packuswb m2, m5 | |
6504 movu [r0 + 420 * 16], m2 | |
6505 | |
6506 ; mode 28 [row 5] | |
6507 pmaddubsw m2, m1, [r3 + 30 * 16] | |
6508 pmulhrsw m2, m3 | |
6509 pmaddubsw m5, m4, [r3 + 30 * 16] | |
6510 pmulhrsw m5, m3 | |
6511 packuswb m2, m5 | |
6512 movu [r0 + 421 * 16], m2 | |
6513 | |
6514 ; mode 29 [row 0] | |
6515 pmaddubsw m2, m1, [r3 + 9 * 16] | |
6516 pmulhrsw m2, m3 | |
6517 pmaddubsw m5, m4, [r3 + 9 * 16] | |
6518 pmulhrsw m5, m3 | |
6519 packuswb m2, m5 | |
6520 movu [r0 + 432 * 16], m2 | |
6521 | |
6522 ; mode 29 [row 1] | |
6523 pmaddubsw m2, m1, [r3 + 18 * 16] | |
6524 pmulhrsw m2, m3 | |
6525 pmaddubsw m5, m4, [r3 + 18 * 16] | |
6526 pmulhrsw m5, m3 | |
6527 packuswb m2, m5 | |
6528 movu [r0 + 433 * 16], m2 | |
6529 | |
6530 ; mode 29 [row 2] | |
6531 pmaddubsw m2, m1, [r3 + 27 * 16] | |
6532 pmulhrsw m2, m3 | |
6533 pmaddubsw m5, m4, [r3 + 27 * 16] | |
6534 pmulhrsw m5, m3 | |
6535 packuswb m2, m5 | |
6536 movu [r0 + 434 * 16], m2 | |
6537 | |
6538 ; mode 30 [row 0] | |
6539 pmaddubsw m2, m1, [r3 + 13 * 16] | |
6540 pmulhrsw m2, m3 | |
6541 pmaddubsw m5, m4, [r3 + 13 * 16] | |
6542 pmulhrsw m5, m3 | |
6543 packuswb m2, m5 | |
6544 movu [r0 + 448 * 16], m2 | |
6545 | |
6546 ; mode 30 [row 1] | |
6547 pmaddubsw m2, m1, [r3 + 26 * 16] | |
6548 pmulhrsw m2, m3 | |
6549 pmaddubsw m5, m4, [r3 + 26 * 16] | |
6550 pmulhrsw m5, m3 | |
6551 packuswb m2, m5 | |
6552 movu [r0 + 449 * 16], m2 | |
6553 | |
6554 ; mode 33 [row 0] | |
6555 movu [r0 + 496 * 16], m2 | |
6556 | |
6557 ; mode 31 [row 0] | |
6558 pmaddubsw m2, m1, [r3 + 17 * 16] | |
6559 pmulhrsw m2, m3 | |
6560 pmaddubsw m5, m4, [r3 + 17 * 16] | |
6561 pmulhrsw m5, m3 | |
6562 packuswb m2, m5 | |
6563 movu [r0 + 464 * 16], m2 | |
6564 | |
6565 ; mode 32 [row 0] | |
6566 pmaddubsw m2, m1, [r3 + 21 * 16] | |
6567 pmulhrsw m2, m3 | |
6568 pmaddubsw m5, m4, [r3 + 21 * 16] | |
6569 pmulhrsw m5, m3 | |
6570 packuswb m2, m5 | |
6571 movu [r0 + 480 * 16], m2 | |
6572 | |
6573 ; mode 28 [row 6] | |
6574 movd m7, [r2 + 9] | |
6575 palignr m7, m1, 2 | |
6576 pmaddubsw m2, m7, [r3 + 3 * 16] | |
6577 pmulhrsw m2, m3 | |
6578 movd m6, [r2 + 17] | |
6579 palignr m6, m4, 2 | |
6580 pmaddubsw m5, m6, [r3 + 3 * 16] | |
6581 pmulhrsw m5, m3 | |
6582 packuswb m2, m5 | |
6583 movu [r0 + 422 * 16], m2 | |
6584 | |
6585 ; mode 28 [row 7] | |
6586 pmaddubsw m2, m7, [r3 + 8 * 16] | |
6587 pmulhrsw m2, m3 | |
6588 pmaddubsw m5, m6, [r3 + 8 * 16] | |
6589 pmulhrsw m5, m3 | |
6590 packuswb m2, m5 | |
6591 movu [r0 + 423 * 16], m2 | |
6592 | |
6593 ; mode 28 [row 8] | |
6594 pmaddubsw m2, m7, [r3 + 13 * 16] | |
6595 pmulhrsw m2, m3 | |
6596 pmaddubsw m5, m6, [r3 + 13 * 16] | |
6597 pmulhrsw m5, m3 | |
6598 packuswb m2, m5 | |
6599 movu [r0 + 424 * 16], m2 | |
6600 | |
6601 ; mode 28 [row 9] | |
6602 pmaddubsw m2, m7, [r3 + 18 * 16] | |
6603 pmulhrsw m2, m3 | |
6604 pmaddubsw m5, m6, [r3 + 18 * 16] | |
6605 pmulhrsw m5, m3 | |
6606 packuswb m2, m5 | |
6607 movu [r0 + 425 * 16], m2 | |
6608 | |
6609 ; mode 28 [row 10] | |
6610 pmaddubsw m2, m7, [r3 + 23 * 16] | |
6611 pmulhrsw m2, m3 | |
6612 pmaddubsw m5, m6, [r3 + 23 * 16] | |
6613 pmulhrsw m5, m3 | |
6614 packuswb m2, m5 | |
6615 movu [r0 + 426 * 16], m2 | |
6616 | |
6617 ; mode 29 [row 3] | |
6618 pmaddubsw m2, m7, [r3 + 4 * 16] | |
6619 pmulhrsw m2, m3 | |
6620 pmaddubsw m5, m6, [r3 + 4 * 16] | |
6621 pmulhrsw m5, m3 | |
6622 packuswb m2, m5 | |
6623 movu [r0 + 435 * 16], m2 | |
6624 | |
6625 ; mode 29 [row 4] | |
6626 pmaddubsw m2, m7, [r3 + 13 * 16] | |
6627 pmulhrsw m2, m3 | |
6628 pmaddubsw m5, m6, [r3 + 13 * 16] | |
6629 pmulhrsw m5, m3 | |
6630 packuswb m2, m5 | |
6631 movu [r0 + 436 * 16], m2 | |
6632 | |
6633 ; mode 29 [row 5] | |
6634 pmaddubsw m2, m7, [r3 + 22 * 16] | |
6635 pmulhrsw m2, m3 | |
6636 pmaddubsw m5, m6, [r3 + 22 * 16] | |
6637 pmulhrsw m5, m3 | |
6638 packuswb m2, m5 | |
6639 movu [r0 + 437 * 16], m2 | |
6640 | |
6641 ; mode 29 [row 6] | |
6642 pmaddubsw m2, m7, [r3 + 31 * 16] | |
6643 pmulhrsw m2, m3 | |
6644 pmaddubsw m5, m6, [r3 + 31 * 16] | |
6645 pmulhrsw m5, m3 | |
6646 packuswb m2, m5 | |
6647 movu [r0 + 438 * 16], m2 | |
6648 | |
6649 ; mode 32 [row 2] | |
6650 movu [r0 + 482 * 16], m2 | |
6651 | |
6652 ; mode 30 [row 2] | |
6653 pmaddubsw m2, m7, [r3 + 7 * 16] | |
6654 pmulhrsw m2, m3 | |
6655 pmaddubsw m5, m6, [r3 + 7 * 16] | |
6656 pmulhrsw m5, m3 | |
6657 packuswb m2, m5 | |
6658 movu [r0 + 450 * 16], m2 | |
6659 | |
6660 ; mode 30 [row 3] | |
6661 pmaddubsw m2, m7, [r3 + 20 * 16] | |
6662 pmulhrsw m2, m3 | |
6663 pmaddubsw m5, m6, [r3 + 20 * 16] | |
6664 pmulhrsw m5, m3 | |
6665 packuswb m2, m5 | |
6666 movu [r0 + 451 * 16], m2 | |
6667 | |
6668 ; mode 33 [row 1] | |
6669 movu [r0 + 497 * 16], m2 | |
6670 | |
6671 ; mode 31 [row 1] | |
6672 pmaddubsw m2, m7, [r3 + 2 * 16] | |
6673 pmulhrsw m2, m3 | |
6674 pmaddubsw m5, m6, [r3 + 2 * 16] | |
6675 pmulhrsw m5, m3 | |
6676 packuswb m2, m5 | |
6677 movu [r0 + 465 * 16], m2 | |
6678 | |
6679 ; mode 31 [row 2] | |
6680 pmaddubsw m2, m7, [r3 + 19 * 16] | |
6681 pmulhrsw m2, m3 | |
6682 pmaddubsw m5, m6, [r3 + 19 * 16] | |
6683 pmulhrsw m5, m3 | |
6684 packuswb m2, m5 | |
6685 movu [r0 + 466 * 16], m2 | |
6686 | |
6687 ; mode 32 [row 1] | |
6688 pmaddubsw m2, m7, [r3 + 10 * 16] | |
6689 pmulhrsw m2, m3 | |
6690 pmaddubsw m5, m6, [r3 + 10 * 16] | |
6691 pmulhrsw m5, m3 | |
6692 packuswb m2, m5 | |
6693 movu [r0 + 481 * 16], m2 | |
6694 | |
6695 ; mode 28 [row 11] | |
6696 pmaddubsw m2, m7, [r3 + 28 * 16] | |
6697 pmulhrsw m2, m3 | |
6698 pmaddubsw m5, m6, [r3 + 28 * 16] | |
6699 pmulhrsw m5, m3 | |
6700 packuswb m2, m5 | |
6701 movu [r0 + 427 * 16], m2 | |
6702 | |
6703 ; mode 28 [row 12] | |
6704 movd m1, [r2 + 10] | |
6705 palignr m1, m7, 2 | |
6706 pmaddubsw m2, m1, [r3 + 1 * 16] | |
6707 pmulhrsw m2, m3 | |
6708 movd m4, [r2 + 18] | |
6709 palignr m4, m6, 2 | |
6710 pmaddubsw m5, m4, [r3 + 1 * 16] | |
6711 pmulhrsw m5, m3 | |
6712 packuswb m2, m5 | |
6713 movu [r0 + 428 * 16], m2 | |
6714 | |
6715 ; mode 30 [row 4] | |
6716 movu [r0 + 452 * 16], m2 | |
6717 | |
6718 ; mode 28 [row 13] | |
6719 pmaddubsw m2, m1, [r3 + 6 * 16] | |
6720 pmulhrsw m2, m3 | |
6721 pmaddubsw m5, m4, [r3 + 6 * 16] | |
6722 pmulhrsw m5, m3 | |
6723 packuswb m2, m5 | |
6724 movu [r0 + 429 * 16], m2 | |
6725 | |
6726 ; mode 28 [row 14] | |
6727 pmaddubsw m2, m1, [r3 + 11 * 16] | |
6728 pmulhrsw m2, m3 | |
6729 pmaddubsw m5, m4, [r3 + 11 * 16] | |
6730 pmulhrsw m5, m3 | |
6731 packuswb m2, m5 | |
6732 movu [r0 + 430 * 16], m2 | |
6733 | |
6734 ; mode 28 [row 15] | |
6735 pmaddubsw m2, m1, [r3 + 16 * 16] | |
6736 pmulhrsw m2, m3 | |
6737 pmaddubsw m5, m4, [r3 + 16 * 16] | |
6738 pmulhrsw m5, m3 | |
6739 packuswb m2, m5 | |
6740 movu [r0 + 431 * 16], m2 | |
6741 | |
6742 ; mode 29 [row 7] | |
6743 pmaddubsw m2, m1, [r3 + 8 * 16] | |
6744 pmulhrsw m2, m3 | |
6745 pmaddubsw m5, m4, [r3 + 8 * 16] | |
6746 pmulhrsw m5, m3 | |
6747 packuswb m2, m5 | |
6748 movu [r0 + 439 * 16], m2 | |
6749 | |
6750 ; mode 29 [row 8] | |
6751 pmaddubsw m2, m1, [r3 + 17 * 16] | |
6752 pmulhrsw m2, m3 | |
6753 pmaddubsw m5, m4, [r3 + 17 * 16] | |
6754 pmulhrsw m5, m3 | |
6755 packuswb m2, m5 | |
6756 movu [r0 + 440 * 16], m2 | |
6757 | |
6758 ; mode 29 [row 9] | |
6759 pmaddubsw m2, m1, [r3 + 26 * 16] | |
6760 pmulhrsw m2, m3 | |
6761 pmaddubsw m5, m4, [r3 + 26 * 16] | |
6762 pmulhrsw m5, m3 | |
6763 packuswb m2, m5 | |
6764 movu [r0 + 441 * 16], m2 | |
6765 | |
6766 ; mode 30 [row 5] | |
6767 pmaddubsw m2, m1, [r3 + 14 * 16] | |
6768 pmulhrsw m2, m3 | |
6769 pmaddubsw m5, m4, [r3 + 14 * 16] | |
6770 pmulhrsw m5, m3 | |
6771 packuswb m2, m5 | |
6772 movu [r0 + 453 * 16], m2 | |
6773 | |
6774 ; mode 33 [row 2] | |
6775 movu [r0 + 498 * 16], m2 | |
6776 | |
6777 ; mode 30 [row 6] | |
6778 pmaddubsw m2, m1, [r3 + 27 * 16] | |
6779 pmulhrsw m2, m3 | |
6780 pmaddubsw m5, m4, [r3 + 27 * 16] | |
6781 pmulhrsw m5, m3 | |
6782 packuswb m2, m5 | |
6783 movu [r0 + 454 * 16], m2 | |
6784 | |
6785 ; mode 31 [row 3] | |
6786 pmaddubsw m2, m1, [r3 + 4 * 16] | |
6787 pmulhrsw m2, m3 | |
6788 pmaddubsw m5, m4, [r3 + 4 * 16] | |
6789 pmulhrsw m5, m3 | |
6790 packuswb m2, m5 | |
6791 movu [r0 + 467 * 16], m2 | |
6792 | |
6793 ; mode 31 [row 4] | |
6794 pmaddubsw m2, m1, [r3 + 21 * 16] | |
6795 pmulhrsw m2, m3 | |
6796 pmaddubsw m5, m4, [r3 + 21 * 16] | |
6797 pmulhrsw m5, m3 | |
6798 packuswb m2, m5 | |
6799 movu [r0 + 468 * 16], m2 | |
6800 | |
6801 ; mode 32 [row 3] | |
6802 pmaddubsw m2, m1, [r3 + 20 * 16] | |
6803 pmulhrsw m2, m3 | |
6804 pmaddubsw m5, m4, [r3 + 20 * 16] | |
6805 pmulhrsw m5, m3 | |
6806 packuswb m2, m5 | |
6807 movu [r0 + 483 * 16], m2 | |
6808 | |
6809 ; mode 29 [row 10] | |
6810 movd m7, [r2 + 11] | |
6811 palignr m7, m1, 2 | |
6812 pmaddubsw m2, m7, [r3 + 3 * 16] | |
6813 pmulhrsw m2, m3 | |
6814 movd m6, [r2 + 19] | |
6815 palignr m6, m4, 2 | |
6816 pmaddubsw m5, m6, [r3 + 3 * 16] | |
6817 pmulhrsw m5, m3 | |
6818 packuswb m2, m5 | |
6819 movu [r0 + 442 * 16], m2 | |
6820 | |
6821 ; mode 29 [row 11] | |
6822 pmaddubsw m2, m7, [r3 + 12 * 16] | |
6823 pmulhrsw m2, m3 | |
6824 pmaddubsw m5, m6, [r3 + 12 * 16] | |
6825 pmulhrsw m5, m3 | |
6826 packuswb m2, m5 | |
6827 movu [r0 + 443 * 16], m2 | |
6828 | |
6829 ; mode 29 [row 12] | |
6830 pmaddubsw m2, m7, [r3 + 21 * 16] | |
6831 pmulhrsw m2, m3 | |
6832 pmaddubsw m5, m6, [r3 + 21 * 16] | |
6833 pmulhrsw m5, m3 | |
6834 packuswb m2, m5 | |
6835 movu [r0 + 444 * 16], m2 | |
6836 | |
6837 ; mode 30 [row 8] | |
6838 movu [r0 + 456 * 16], m2 | |
6839 | |
6840 ; mode 29 [row 13] | |
6841 pmaddubsw m2, m7, [r3 + 30 * 16] | |
6842 pmulhrsw m2, m3 | |
6843 pmaddubsw m5, m6, [r3 + 30 * 16] | |
6844 pmulhrsw m5, m3 | |
6845 packuswb m2, m5 | |
6846 movu [r0 + 445 * 16], m2 | |
6847 | |
6848 ; mode 32 [row 5] | |
6849 movu [r0 + 485 * 16], m2 | |
6850 | |
6851 ; mode 30 [row 7] | |
6852 pmaddubsw m2, m7, [r3 + 8 * 16] | |
6853 pmulhrsw m2, m3 | |
6854 pmaddubsw m5, m6, [r3 + 8 * 16] | |
6855 pmulhrsw m5, m3 | |
6856 packuswb m2, m5 | |
6857 movu [r0 + 455 * 16], m2 | |
6858 | |
6859 ; mode 33 [row 3] | |
6860 movu [r0 + 499 * 16], m2 | |
6861 | |
6862 ; mode 31 [row 5] | |
6863 pmaddubsw m2, m7, [r3 + 6 * 16] | |
6864 pmulhrsw m2, m3 | |
6865 pmaddubsw m5, m6, [r3 + 6 * 16] | |
6866 pmulhrsw m5, m3 | |
6867 packuswb m2, m5 | |
6868 movu [r0 + 469 * 16], m2 | |
6869 | |
6870 ; mode 31 [row 6] | |
6871 pmaddubsw m2, m7, [r3 + 23 * 16] | |
6872 pmulhrsw m2, m3 | |
6873 pmaddubsw m5, m6, [r3 + 23 * 16] | |
6874 pmulhrsw m5, m3 | |
6875 packuswb m2, m5 | |
6876 movu [r0 + 470 * 16], m2 | |
6877 | |
6878 ; mode 32 [row 4] | |
6879 pmaddubsw m2, m7, [r3 + 9 * 16] | |
6880 pmulhrsw m2, m3 | |
6881 pmaddubsw m5, m6, [r3 + 9 * 16] | |
6882 pmulhrsw m5, m3 | |
6883 packuswb m2, m5 | |
6884 movu [r0 + 484 * 16], m2 | |
6885 | |
6886 movu m1, m7 | |
6887 movu m4, m6 | |
6888 | |
6889 ; mode 29 [row 14] | |
6890 movu m1, [r2 + 12] | |
6891 palignr m1, m7, 2 | |
6892 pmaddubsw m2, m1, [r3 + 7 * 16] | |
6893 pmulhrsw m2, m3 | |
6894 movd m4, [r2 + 20] | |
6895 palignr m4, m6, 2 | |
6896 pmaddubsw m5, m4, [r3 + 7 * 16] | |
6897 pmulhrsw m5, m3 | |
6898 packuswb m2, m5 | |
6899 movu [r0 + 446 * 16], m2 | |
6900 | |
6901 ; mode 29 [row 15] | |
6902 pmaddubsw m2, m1, [r3 + 16 * 16] | |
6903 pmulhrsw m2, m3 | |
6904 pmaddubsw m5, m4, [r3 + 16 * 16] | |
6905 pmulhrsw m5, m3 | |
6906 packuswb m2, m5 | |
6907 movu [r0 + 447 * 16], m2 | |
6908 | |
6909 ; mode 30 [row 9] | |
6910 pmaddubsw m2, m1, [r3 + 2 * 16] | |
6911 pmulhrsw m2, m3 | |
6912 pmaddubsw m5, m4, [r3 + 2 * 16] | |
6913 pmulhrsw m5, m3 | |
6914 packuswb m2, m5 | |
6915 movu [r0 + 457 * 16], m2 | |
6916 | |
6917 ; mode 33 [row 4] | |
6918 movu [r0 + 500 * 16], m2 | |
6919 | |
6920 ; mode 30 [row 10] | |
6921 pmaddubsw m2, m1, [r3 + 15 * 16] | |
6922 pmulhrsw m2, m3 | |
6923 pmaddubsw m5, m4, [r3 + 15 * 16] | |
6924 pmulhrsw m5, m3 | |
6925 packuswb m2, m5 | |
6926 movu [r0 + 458 * 16], m2 | |
6927 | |
6928 ; mode 30 [row 11] | |
6929 pmaddubsw m2, m1, [r3 + 28 * 16] | |
6930 pmulhrsw m2, m3 | |
6931 pmaddubsw m5, m4, [r3 + 28 * 16] | |
6932 pmulhrsw m5, m3 | |
6933 packuswb m2, m5 | |
6934 movu [r0 + 459 * 16], m2 | |
6935 | |
6936 ; mode 33 [row 5] | |
6937 movu [r0 + 501 * 16], m2 | |
6938 | |
6939 ; mode 31 [row 7] | |
6940 pmaddubsw m2, m1, [r3 + 8 * 16] | |
6941 pmulhrsw m2, m3 | |
6942 pmaddubsw m5, m4, [r3 + 8 * 16] | |
6943 pmulhrsw m5, m3 | |
6944 packuswb m2, m5 | |
6945 movu [r0 + 471 * 16], m2 | |
6946 | |
6947 ; mode 31 [row 8] | |
6948 pmaddubsw m2, m1, [r3 + 25 * 16] | |
6949 pmulhrsw m2, m3 | |
6950 pmaddubsw m5, m4, [r3 + 25 * 16] | |
6951 pmulhrsw m5, m3 | |
6952 packuswb m2, m5 | |
6953 movu [r0 + 472 * 16], m2 | |
6954 | |
6955 ; mode 32 [row 6] | |
6956 pmaddubsw m2, m1, [r3 + 19 * 16] | |
6957 pmulhrsw m2, m3 | |
6958 pmaddubsw m5, m4, [r3 + 19 * 16] | |
6959 pmulhrsw m5, m3 | |
6960 packuswb m2, m5 | |
6961 movu [r0 + 486 * 16], m2 | |
6962 | |
6963 ; mode 30 [row 12] | |
6964 movd m7, [r2 + 13] | |
6965 palignr m7, m1, 2 | |
6966 pmaddubsw m2, m7, [r3 + 9 * 16] | |
6967 pmulhrsw m2, m3 | |
6968 movd m6, [r2 + 21] | |
6969 palignr m6, m4, 2 | |
6970 pmaddubsw m5, m6, [r3 + 9 * 16] | |
6971 pmulhrsw m5, m3 | |
6972 packuswb m2, m5 | |
6973 movu [r0 + 460 * 16], m2 | |
6974 | |
6975 ; mode 30 [row 13] | |
6976 pmaddubsw m2, m7, [r3 + 22 * 16] | |
6977 pmulhrsw m2, m3 | |
6978 pmaddubsw m5, m6, [r3 + 22 * 16] | |
6979 pmulhrsw m5, m3 | |
6980 packuswb m2, m5 | |
6981 movu [r0 + 461 * 16], m2 | |
6982 | |
6983 ; mode 33 [row 6] | |
6984 movu [r0 + 502 * 16], m2 | |
6985 | |
6986 ; mode 31 [row 9] | |
6987 pmaddubsw m2, m7, [r3 + 10 * 16] | |
6988 pmulhrsw m2, m3 | |
6989 pmaddubsw m5, m6, [r3 + 10 * 16] | |
6990 pmulhrsw m5, m3 | |
6991 packuswb m2, m5 | |
6992 movu [r0 + 473 * 16], m2 | |
6993 | |
6994 ; mode 31 [row 10] | |
6995 pmaddubsw m2, m7, [r3 + 27 * 16] | |
6996 pmulhrsw m2, m3 | |
6997 pmaddubsw m5, m6, [r3 + 27 * 16] | |
6998 pmulhrsw m5, m3 | |
6999 packuswb m2, m5 | |
7000 movu [r0 + 474 * 16], m2 | |
7001 | |
7002 ; mode 32 [row 7] | |
7003 pmaddubsw m2, m7, [r3 + 8 * 16] | |
7004 pmulhrsw m2, m3 | |
7005 pmaddubsw m5, m6, [r3 + 8 * 16] | |
7006 pmulhrsw m5, m3 | |
7007 packuswb m2, m5 | |
7008 movu [r0 + 487 * 16], m2 | |
7009 | |
7010 ; mode 32 [row 8] | |
7011 pmaddubsw m2, m7, [r3 + 29 * 16] | |
7012 pmulhrsw m2, m3 | |
7013 pmaddubsw m5, m6, [r3 + 29 * 16] | |
7014 pmulhrsw m5, m3 | |
7015 packuswb m2, m5 | |
7016 movu [r0 + 488 * 16], m2 | |
7017 | |
7018 | |
7019 movu m1, m7 | |
7020 movu m4, m6 | |
7021 | |
7022 ; mode 30 [row 14] | |
7023 movd m1, [r2 + 14] | |
7024 palignr m1, m7, 2 | |
7025 pmaddubsw m2, m1, [r3 + 3 * 16] | |
7026 pmulhrsw m2, m3 | |
7027 movd m4, [r2 + 22] | |
7028 palignr m4, m6, 2 | |
7029 pmaddubsw m5, m4, [r3 + 3 * 16] | |
7030 pmulhrsw m5, m3 | |
7031 packuswb m2, m5 | |
7032 movu [r0 + 462 * 16], m2 | |
7033 | |
7034 ; mode 30 [row 15] | |
7035 pmaddubsw m2, m1, [r3 + 16 * 16] | |
7036 pmulhrsw m2, m3 | |
7037 pmaddubsw m5, m4, [r3 + 16 * 16] | |
7038 pmulhrsw m5, m3 | |
7039 packuswb m2, m5 | |
7040 movu [r0 + 463 * 16], m2 | |
7041 | |
7042 ; mode 33 [row 7] | |
7043 movu [r0 + 503 * 16], m2 | |
7044 | |
7045 ; mode 31 [row 11] | |
7046 pmaddubsw m2, m1, [r3 + 12 * 16] | |
7047 pmulhrsw m2, m3 | |
7048 pmaddubsw m5, m4, [r3 + 12 * 16] | |
7049 pmulhrsw m5, m3 | |
7050 packuswb m2, m5 | |
7051 movu [r0 + 475 * 16], m2 | |
7052 | |
7053 ; mode 31 [row 12] | |
7054 pmaddubsw m2, m1, [r3 + 29 * 16] | |
7055 pmulhrsw m2, m3 | |
7056 pmaddubsw m5, m4, [r3 + 29 * 16] | |
7057 pmulhrsw m5, m3 | |
7058 packuswb m2, m5 | |
7059 movu [r0 + 476 * 16], m2 | |
7060 | |
7061 ; mode 32 [row 9] | |
7062 pmaddubsw m2, m1, [r3 + 18 * 16] | |
7063 pmulhrsw m2, m3 | |
7064 pmaddubsw m5, m4, [r3 + 18 * 16] | |
7065 pmulhrsw m5, m3 | |
7066 packuswb m2, m5 | |
7067 movu [r0 + 489 * 16], m2 | |
7068 | |
7069 ; mode 31 [row 13] | |
7070 movd m7, [r2 + 15] | |
7071 palignr m7, m1, 2 | |
7072 pmaddubsw m2, m7, [r3 + 14 * 16] | |
7073 pmulhrsw m2, m3 | |
7074 movd m6, [r2 + 23] | |
7075 palignr m6, m4, 2 | |
7076 pmaddubsw m5, m6, [r3 + 14 * 16] | |
7077 pmulhrsw m5, m3 | |
7078 packuswb m2, m5 | |
7079 movu [r0 + 477 * 16], m2 | |
7080 | |
7081 ; mode 31 [row 14] | |
7082 pmaddubsw m2, m7, [r3 + 31 * 16] | |
7083 pmulhrsw m2, m3 | |
7084 pmaddubsw m5, m6, [r3 + 31 * 16] | |
7085 pmulhrsw m5, m3 | |
7086 packuswb m2, m5 | |
7087 movu [r0 + 478 * 16], m2 | |
7088 | |
7089 ; mode 32 [row 10] | |
7090 pmaddubsw m2, m7, [r3 + 7 * 16] | |
7091 pmulhrsw m2, m3 | |
7092 pmaddubsw m5, m6, [r3 + 7 * 16] | |
7093 pmulhrsw m5, m3 | |
7094 packuswb m2, m5 | |
7095 movu [r0 + 490 * 16], m2 | |
7096 | |
7097 ; mode 32 [row 11] | |
7098 pmaddubsw m2, m7, [r3 + 28 * 16] | |
7099 pmulhrsw m2, m3 | |
7100 pmaddubsw m5, m6, [r3 + 28 * 16] | |
7101 pmulhrsw m5, m3 | |
7102 packuswb m2, m5 | |
7103 movu [r0 + 491 * 16], m2 | |
7104 | |
7105 ; mode 33 [row 8] | |
7106 pmaddubsw m2, m7, [r3 + 10 * 16] | |
7107 pmulhrsw m2, m3 | |
7108 pmaddubsw m5, m6, [r3 + 10 * 16] | |
7109 pmulhrsw m5, m3 | |
7110 packuswb m2, m5 | |
7111 movu [r0 + 504 * 16], m2 | |
7112 | |
7113 ; mode 31 [row 15] | |
7114 movd m1, [r2 + 16] | |
7115 palignr m1, m7, 2 | |
7116 pmaddubsw m2, m1, [r3 + 16 * 16] | |
7117 pmulhrsw m2, m3 | |
7118 movd m4, [r2 + 24] | |
7119 palignr m4, m6, 2 | |
7120 pmaddubsw m5, m4, [r3 + 16 * 16] | |
7121 pmulhrsw m5, m3 | |
7122 packuswb m2, m5 | |
7123 movu [r0 + 479 * 16], m2 | |
7124 | |
7125 ; mode 32 [row 12] | |
7126 pmaddubsw m2, m1, [r3 + 17 * 16] | |
7127 pmulhrsw m2, m3 | |
7128 pmaddubsw m5, m4, [r3 + 17 * 16] | |
7129 pmulhrsw m5, m3 | |
7130 packuswb m2, m5 | |
7131 movu [r0 + 492 * 16], m2 | |
7132 | |
7133 ; mode 33 [row 9] | |
7134 pmaddubsw m2, m1, [r3 + 4 * 16] | |
7135 pmulhrsw m2, m3 | |
7136 pmaddubsw m5, m4, [r3 + 4 * 16] | |
7137 pmulhrsw m5, m3 | |
7138 packuswb m2, m5 | |
7139 movu [r0 + 505 * 16], m2 | |
7140 | |
7141 ; mode 33 [row 10] | |
7142 pmaddubsw m2, m1, [r3 + 30 * 16] | |
7143 pmulhrsw m2, m3 | |
7144 pmaddubsw m5, m4, [r3 + 30 * 16] | |
7145 pmulhrsw m5, m3 | |
7146 packuswb m2, m5 | |
7147 movu [r0 + 506 * 16], m2 | |
7148 | |
7149 ; mode 33 [row 9] (repeats the row 9 computation and store) | |
7150 pmaddubsw m2, m1, [r3 + 4 * 16] | |
7151 pmulhrsw m2, m3 | |
7152 pmaddubsw m5, m4, [r3 + 4 * 16] | |
7153 pmulhrsw m5, m3 | |
7154 packuswb m2, m5 | |
7155 movu [r0 + 505 * 16], m2 | |
7156 | |
7157 ; mode 32 [row 13] | |
7158 movd m7, [r2 + 17] | |
7159 palignr m7, m1, 2 | |
7160 pmaddubsw m2, m7, [r3 + 6 * 16] | |
7161 pmulhrsw m2, m3 | |
7162 | |
7163 movd m6, [r2 + 25] | |
7164 palignr m6, m4, 2 | |
7165 pmaddubsw m5, m6, [r3 + 6 * 16] | |
7166 pmulhrsw m5, m3 | |
7167 packuswb m2, m5 | |
7168 movu [r0 + 493 * 16], m2 | |
7169 | |
7170 ; mode 32 [row 14] | |
7171 pmaddubsw m2, m7, [r3 + 27 * 16] | |
7172 pmulhrsw m2, m3 | |
7173 pmaddubsw m5, m6, [r3 + 27 * 16] | |
7174 pmulhrsw m5, m3 | |
7175 packuswb m2, m5 | |
7176 movu [r0 + 494 * 16], m2 | |
7177 | |
7178 ; mode 33 [row 11] | |
7179 pmaddubsw m2, m7, [r3 + 24 * 16] | |
7180 pmulhrsw m2, m3 | |
7181 pmaddubsw m5, m6, [r3 + 24 * 16] | |
7182 pmulhrsw m5, m3 | |
7183 packuswb m2, m5 | |
7184 movu [r0 + 507 * 16], m2 | |
7185 | |
7186 ; mode 32 [row 15] | |
7187 movd m1, [r2 + 18] | |
7188 palignr m1, m7, 2 | |
7189 pmaddubsw m2, m1, [r3 + 16 * 16] | |
7190 pmulhrsw m2, m3 | |
7191 psrldq m4, 2 | |
7192 pinsrb m4, [r2 + 26], 14 | |
7193 pinsrb m4, [r2 + 27], 15 | |
7194 movd m4, [r2 + 26] | |
7195 palignr m4, m6, 2 | |
7196 pmaddubsw m5, m4, [r3 + 16 * 16] | |
7197 pmulhrsw m5, m3 | |
7198 packuswb m2, m5 | |
7199 movu [r0 + 495 * 16], m2 | |
7200 | |
7201 ; mode 33 [row 12] | |
7202 pmaddubsw m2, m1, [r3 + 18 * 16] | |
7203 pmulhrsw m2, m3 | |
7204 pmaddubsw m5, m4, [r3 + 18 * 16] | |
7205 pmulhrsw m5, m3 | |
7206 packuswb m2, m5 | |
7207 movu [r0 + 508 * 16], m2 | |
7208 | |
7209 ; mode 33 [row 13] | |
7210 movd m7, [r2 + 19] | |
7211 palignr m7, m1, 2 | |
7212 pmaddubsw m2, m7, [r3 + 12 * 16] | |
7213 pmulhrsw m2, m3 | |
7214 movd m6, [r2 + 27] | |
7215 palignr m6, m4, 2 | |
7216 pmaddubsw m5, m6, [r3 + 12 * 16] | |
7217 pmulhrsw m5, m3 | |
7218 packuswb m2, m5 | |
7219 movu [r0 + 509 * 16], m2 | |
7220 | |
7221 ; mode 33 [row 14] | |
7222 movd m1, [r2 + 20] | |
7223 palignr m1, m7, 2 | |
7224 pmaddubsw m2, m1, [r3 + 6 * 16] | |
7225 pmulhrsw m2, m3 | |
7226 movd m4, [r2 + 28] | |
7227 palignr m4, m6, 2 | |
7228 pmaddubsw m5, m4, [r3 + 6 * 16] | |
7229 pmulhrsw m5, m3 | |
7230 packuswb m2, m5 | |
7231 movu [r0 + 510 * 16], m2 | |
7232 | |
7233 ; mode 34 [row 0] | |
7234 movu m1, [r2 + 2] | |
7235 movu [r0 + 512 * 16], m1 | |
7236 movu m2, [r2 + 18] | |
7237 palignr m3, m2, m1, 1 | |
7238 movu [r0 + 513 * 16], m3 | |
7239 palignr m3, m2, m1, 2 | |
7240 movu [r0 + 514 * 16], m3 | |
7241 palignr m3, m2, m1, 3 | |
7242 movu [r0 + 515 * 16], m3 | |
7243 palignr m3, m2, m1, 4 | |
7244 movu [r0 + 516 * 16], m3 | |
7245 palignr m3, m2, m1, 5 | |
7246 movu [r0 + 517 * 16], m3 | |
7247 palignr m3, m2, m1, 6 | |
7248 movu [r0 + 518 * 16], m3 | |
7249 palignr m3, m2, m1, 7 | |
7250 movu [r0 + 519 * 16], m3 | |
7251 palignr m3, m2, m1, 8 | |
7252 movu [r0 + 520 * 16], m3 | |
7253 palignr m3, m2, m1, 9 | |
7254 movu [r0 + 521 * 16], m3 | |
7255 palignr m3, m2, m1, 10 | |
7256 movu [r0 + 522 * 16], m3 | |
7257 palignr m3, m2, m1, 11 | |
7258 movu [r0 + 523 * 16], m3 | |
7259 palignr m3, m2, m1, 12 | |
7260 movu [r0 + 524 * 16], m3 | |
7261 | |
7262 ; mode 33 [row 15] | |
7263 movu [r0 + 511 * 16], m3 | |
7264 | |
7265 ; mode 34 | |
7266 palignr m3, m2, m1, 13 | |
7267 movu [r0 + 525 * 16], m3 | |
7268 palignr m3, m2, m1, 14 | |
7269 movu [r0 + 526 * 16], m3 | |
7270 palignr m3, m2, m1, 15 | |
7271 movu [r0 + 527 * 16], m3 | |
7272 RET | |
7273 | |
7274 ;-------------------------------------------------------------------------------- | |
7275 ; void all_angs_pred_32x32(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) | |
7276 ;-------------------------------------------------------------------------------- | |
7277 INIT_XMM sse4 | |
7278 cglobal all_angs_pred_32x32, 3,7,8, 0-4 | |
7279 mov r6d, [r1 + 64] | |
7280 mov r3d, [r1] | |
7281 mov [rsp], r6d | |
7282 mov [r1 + 64], r3b | |
7283 mov r3d, [r2] | |
7284 mov r6d, [r2 + 64] | |
7285 mov [r2 + 64], r3b | |
7286 | |
7287 lea r3, [r2] | |
7288 lea r4, [r2 + 64] | |
7289 lea r2, [r1 + 64] | |
7290 | |
7291 ;mode 2[row 0] | |
7292 movu m0, [r4 + 2] | |
7293 movu [r0 + 0 * 16], m0 | |
7294 movu m1, [r4 + 18] | |
7295 movu [r0 + 1 * 16], m1 | |
7296 | |
7297 ;mode 9 [row 15] | |
7298 movu [r0 + 478 * 16], m0 | |
7299 movu [r0 + 479 * 16], m1 | |
7300 | |
7301 ;mode 2[row 1] | |
7302 movu m2, [r4 + 34] | |
7303 palignr m3, m1, m0, 1 | |
7304 movu [r0 + 2 * 16], m3 | |
7305 palignr m4, m2, m1, 1 | |
7306 movu [r0 + 3 * 16], m4 | |
7307 | |
7308 ; mode 9 [row 31] | |
7309 movu [r0 + 510 * 16], m3 | |
7310 movu [r0 + 511 * 16], m4 | |
7311 | |
7312 ;mode 2[row 17] | |
7313 movu [r0 + 34 * 16], m4 | |
7314 movu m5, [r4 + 35] | |
7315 movu [r0 + 35 * 16], m5 | |
7316 | |
7317 ;mode 2[row 2] | |
7318 palignr m3, m1, m0, 2 | |
7319 movu [r0 + 4 * 16], m3 | |
7320 palignr m4, m2, m1, 2 | |
7321 movu [r0 + 5 * 16], m4 | |
7322 | |
7323 ;mode 2[row 18] | |
7324 movu [r0 + 36 * 16], m4 | |
7325 movu m6, [r4 + 51] | |
7326 palignr m7, m6, m5, 1 | |
7327 movu [r0 + 37 * 16], m7 | |
7328 | |
7329 ;mode 2[row 3] | |
7330 palignr m3, m1, m0, 3 | |
7331 movu [r0 + 6 * 16], m3 | |
7332 palignr m4, m2, m1, 3 | |
7333 movu [r0 + 7 * 16], m4 | |
7334 | |
7335 ;mode 2[row 19] | |
7336 movu [r0 + 38 * 16], m4 | |
7337 palignr m7, m6, m5, 2 | |
7338 movu [r0 + 39 * 16], m7 | |
7339 | |
7340 ;mode 2[row 4] | |
7341 palignr m3, m1, m0, 4 | |
7342 movu [r0 + 8 * 16], m3 | |
7343 palignr m4, m2, m1, 4 | |
7344 movu [r0 + 9 * 16], m4 | |
7345 | |
7346 ; mode 8 [row 31] | |
7347 movu [r0 + 446 * 16], m3 | |
7348 movu [r0 + 447 * 16], m4 | |
7349 | |
7350 ;mode 2[row 20] | |
7351 movu [r0 + 40 * 16], m4 | |
7352 palignr m7, m6, m5, 3 | |
7353 movu [r0 + 41 * 16], m7 | |
7354 | |
7355 ; mode 4 [row 31] | |
7356 movu [r0 + 190 * 16], m4 | |
7357 movu [r0 + 191 * 16], m7 | |
7358 | |
7359 ;mode 2[row 5] | |
7360 palignr m3, m1, m0, 5 | |
7361 movu [r0 + 10 * 16], m3 | |
7362 palignr m4, m2, m1, 5 | |
7363 movu [r0 + 11 * 16], m4 | |
7364 | |
7365 ;mode 2[row 21] | |
7366 movu [r0 + 42 * 16], m4 | |
7367 palignr m7, m6, m5, 4 | |
7368 movu [r0 + 43 * 16], m7 | |
7369 | |
7370 ;mode 2[row 6] | |
7371 palignr m3, m1, m0, 6 | |
7372 movu [r0 + 12 * 16], m3 | |
7373 palignr m4, m2, m1, 6 | |
7374 movu [r0 + 13 * 16], m4 | |
7375 | |
7376 ;mode 2[row 22] | |
7377 movu [r0 + 44 * 16], m4 | |
7378 palignr m7, m6, m5, 5 | |
7379 movu [r0 + 45 * 16], m7 | |
7380 | |
7381 ;mode 2[row 7] | |
7382 palignr m3, m1, m0, 7 | |
7383 movu [r0 + 14 * 16], m3 | |
7384 palignr m4, m2, m1, 7 | |
7385 movu [r0 + 15 * 16], m4 | |
7386 | |
7387 ;mode 2[row 23] | |
7388 movu [r0 + 46 * 16], m4 | |
7389 palignr m7, m6, m5, 6 | |
7390 movu [r0 + 47 * 16], m7 | |
7391 | |
7392 ;mode 2[row 8] | |
7393 palignr m3, m1, m0, 8 | |
7394 movu [r0 + 16 * 16], m3 | |
7395 palignr m4, m2, m1, 8 | |
7396 movu [r0 + 17 * 16], m4 | |
7397 | |
7398 ;mode 7[row 31] | |
7399 movu [r0 + 382 * 16], m3 | |
7400 movu [r0 + 383 * 16], m4 | |
7401 | |
7402 ;mode 2[row 24] | |
7403 movu [r0 + 48 * 16], m4 | |
7404 palignr m7, m6, m5, 7 | |
7405 movu [r0 + 49 * 16], m7 | |
7406 | |
7407 ;mode 2[row 9] | |
7408 palignr m3, m1, m0, 9 | |
7409 movu [r0 + 18 * 16], m3 | |
7410 palignr m4, m2, m1, 9 | |
7411 movu [r0 + 19 * 16], m4 | |
7412 | |
7413 ;mode 2[row 25] | |
7414 movu [r0 + 50 * 16], m4 | |
7415 palignr m7, m6, m5, 8 | |
7416 movu [r0 + 51 * 16], m7 | |
7417 | |
7418 ; mode 3 [row 31] | |
7419 movu [r0 + 126 * 16], m4 | |
7420 movu [r0 + 127 * 16], m7 | |
7421 | |
7422 ;mode 2[row 10] | |
7423 palignr m3, m1, m0, 10 | |
7424 movu [r0 + 20 * 16], m3 | |
7425 palignr m4, m2, m1, 10 | |
7426 movu [r0 + 21 * 16], m4 | |
7427 | |
7428 ;mode 2[row 26] | |
7429 movu [r0 + 52 * 16], m4 | |
7430 palignr m7, m6, m5, 9 | |
7431 movu [r0 + 53 * 16], m7 | |
7432 | |
7433 ;mode 2[row 11] | |
7434 palignr m3, m1, m0, 11 | |
7435 movu [r0 + 22 * 16], m3 | |
7436 palignr m4, m2, m1, 11 | |
7437 movu [r0 + 23 * 16], m4 | |
7438 | |
7439 ;mode 2[row 27] | |
7440 movu [r0 + 54 * 16], m4 | |
7441 palignr m7, m6, m5, 10 | |
7442 movu [r0 + 55 * 16], m7 | |
7443 | |
7444 ;mode 2[row 12] | |
7445 palignr m3, m1, m0, 12 | |
7446 movu [r0 + 24 * 16], m3 | |
7447 palignr m4, m2, m1, 12 | |
7448 movu [r0 + 25 * 16], m4 | |
7449 | |
7450 ; mode 6 [row 31] | |
7451 movu [r0 + 318 * 16], m3 | |
7452 movu [r0 + 319 * 16], m4 | |
7453 | |
7454 ; mode 3 [row 15] | |
7455 movu [r0 + 94 * 16], m3 | |
7456 movu [r0 + 95 * 16], m4 | |
7457 | |
7458 ;mode 2[row 28] | |
7459 movu [r0 + 56 * 16], m4 | |
7460 palignr m7, m6, m5, 11 | |
7461 movu [r0 + 57 * 16], m7 | |
7462 | |
7463 ;mode 2[row 13] | |
7464 palignr m3, m1, m0, 13 | |
7465 movu [r0 + 26 * 16], m3 | |
7466 palignr m4, m2, m1, 13 | |
7467 movu [r0 + 27 * 16], m4 | |
7468 | |
7469 ;mode 2[row 29] | |
7470 movu [r0 + 58 * 16], m4 | |
7471 palignr m7, m6, m5, 12 | |
7472 movu [r0 + 59 * 16], m7 | |
7473 | |
7474 ;mode 2[row 14] | |
7475 palignr m3, m1, m0, 14 | |
7476 movu [r0 + 28 * 16], m3 | |
7477 palignr m4, m2, m1, 14 | |
7478 movu [r0 + 29 * 16], m4 | |
7479 | |
7480 ;mode 2[row 30] | |
7481 movu [r0 + 60 * 16], m4 | |
7482 palignr m7, m6, m5, 13 | |
7483 movu [r0 + 61 * 16], m7 | |
7484 | |
7485 ;mode 2[row 15] | |
7486 palignr m3, m1, m0, 15 | |
7487 movu [r0 + 30 * 16], m3 | |
7488 palignr m4, m2, m1, 15 | |
7489 movu [r0 + 31 * 16], m4 | |
7490 | |
7491 ;mode 2[row 31] | |
7492 movu [r0 + 62 * 16], m4 | |
7493 palignr m7, m6, m5, 14 | |
7494 movu [r0 + 63 * 16], m7 | |
7495 | |
7496 ;mode 2[row 16] | |
7497 movu [r0 + 32 * 16], m1 | |
7498 movu [r0 + 33 * 16], m2 | |
7499 | |
7500 ; mode 5[row 31] | |
7501 movu [r0 + 254 * 16], m1 | |
7502 movu [r0 + 255 * 16], m2 | |
7503 | |
7504 ; mode 3 [row 0] | |
7505 lea r5, [ang_table] | |
7506 movu m6, [r5 + 26 * 16] | |
7507 movu m7, [pw_1024 ] | |
7508 movu m1, [r4 + 1 ] | |
7509 punpcklbw m1, m0 | |
7510 pmaddubsw m0, m1, m6 | |
7511 pmulhrsw m0, m7 | |
7512 movu m2, [r4 + 9] | |
7513 movd m3, [r4 + 10] | |
7514 palignr m3, m2, 1 | |
7515 punpcklbw m2, m3 | |
7516 pmaddubsw m3, m2, m6 | |
7517 pmulhrsw m3, m7 | |
7518 packuswb m0, m3 | |
7519 movu [r0 + 64 * 16], m0 | |
7520 | |
7521 ; mode 6 [row 1 - first half] | |
7522 movu [r0 + 258 * 16], m0 | |
7523 | |
7524 ; mode 9 [row 12 - first half] | |
7525 movu [r0 + 472 * 16], m0 | |
7526 | |
7527 movu m0, [r4 + 17] | |
7528 movd m3, [r4 + 18] | |
7529 palignr m3, m0, 1 | |
7530 punpcklbw m0, m3 | |
7531 pmaddubsw m3, m0, m6 | |
7532 pmulhrsw m3, m7 | |
7533 movu m4, [r4 + 25] | |
7534 movd m5, [r4 + 26] | |
7535 palignr m5, m4, 1 | |
7536 punpcklbw m4, m5 | |
7537 pmaddubsw m5, m4, m6 | |
7538 pmulhrsw m5, m7 | |
7539 packuswb m3, m5 | |
7540 movu [r0 + 65 * 16], m3 | |
7541 | |
7542 ; mode 6 [row 1 - second half] | |
7543 movu [r0 + 259 * 16], m3 | |
7544 | |
7545 ; mode 9 [row 12 - second half] | |
7546 movu [r0 + 473 * 16], m3 | |
7547 | |
7548 ; mode 4 [row 0] | |
7549 movu m6, [r5 + 21 * 16] | |
7550 pmaddubsw m3, m1, m6 | |
7551 pmulhrsw m3, m7 | |
7552 pmaddubsw m5, m2, m6 | |
7553 pmulhrsw m5, m7 | |
7554 packuswb m3, m5 | |
7555 movu [r0 + 128 * 16], m3 | |
7556 pmaddubsw m3, m0, m6 | |
7557 pmulhrsw m3, m7 | |
7558 pmaddubsw m5, m4, m6 | |
7559 pmulhrsw m5, m7 | |
7560 packuswb m3, m5 | |
7561 movu [r0 + 129 * 16], m3 | |
7562 | |
7563 ; mode 5 [row 0] | |
7564 movu m6, [r5 + 17 * 16] | |
7565 pmaddubsw m3, m1, m6 | |
7566 pmulhrsw m3, m7 | |
7567 pmaddubsw m5, m2, m6 | |
7568 pmulhrsw m5, m7 | |
7569 packuswb m3, m5 | |
7570 movu [r0 + 192 * 16], m3 | |
7571 pmaddubsw m3, m0, m6 | |
7572 pmulhrsw m3, m7 | |
7573 pmaddubsw m5, m4, m6 | |
7574 pmulhrsw m5, m7 | |
7575 packuswb m3, m5 | |
7576 movu [r0 + 193 * 16], m3 | |
7577 | |
7578 ; mode 6 [row 0] | |
7579 movu m6, [r5 + 13 * 16] | |
7580 pmaddubsw m3, m1, m6 | |
7581 pmulhrsw m3, m7 | |
7582 pmaddubsw m5, m2, m6 | |
7583 pmulhrsw m5, m7 | |
7584 packuswb m3, m5 | |
7585 movu [r0 + 256 * 16], m3 | |
7586 pmaddubsw m3, m0, m6 | |
7587 pmulhrsw m3, m7 | |
7588 pmaddubsw m5, m4, m6 | |
7589 pmulhrsw m5, m7 | |
7590 packuswb m3, m5 | |
7591 movu [r0 + 257 * 16], m3 | |
7592 | |
7593 ; mode 7 [row 0] | |
7594 movu m6, [r5 + 9 * 16] | |
7595 pmaddubsw m3, m1, m6 | |
7596 pmulhrsw m3, m7 | |
7597 pmaddubsw m5, m2, m6 | |
7598 pmulhrsw m5, m7 | |
7599 packuswb m3, m5 | |
7600 movu [r0 + 320 * 16], m3 | |
7601 pmaddubsw m3, m0, m6 | |
7602 pmulhrsw m3, m7 | |
7603 pmaddubsw m5, m4, m6 | |
7604 pmulhrsw m5, m7 | |
7605 packuswb m3, m5 | |
7606 movu [r0 + 321 * 16], m3 | |
7607 | |
7608 ; mode 7 [row 1] | |
7609 movu m6, [r5 + 18 * 16] | |
7610 pmaddubsw m3, m1, m6 | |
7611 pmulhrsw m3, m7 | |
7612 pmaddubsw m5, m2, m6 | |
7613 pmulhrsw m5, m7 | |
7614 packuswb m3, m5 | |
7615 movu [r0 + 322 * 16], m3 | |
7616 | |
7617 ; mode 9 [row 8 - first half] | |
7618 movu [r0 + 464 * 16], m3 | |
7619 | |
7620 pmaddubsw m3, m0, m6 | |
7621 pmulhrsw m3, m7 | |
7622 pmaddubsw m5, m4, m6 | |
7623 pmulhrsw m5, m7 | |
7624 packuswb m3, m5 | |
7625 movu [r0 + 323 * 16], m3 | |
7626 | |
7627 ; mode 9 [row 8 - second half] | |
7628 movu [r0 + 465 * 16], m3 | |
7629 | |
7630 ; mode 7 [row 2] | |
7631 movu m6, [r5 + 27 * 16] | |
7632 pmaddubsw m3, m1, m6 | |
7633 pmulhrsw m3, m7 | |
7634 pmaddubsw m5, m2, m6 | |
7635 pmulhrsw m5, m7 | |
7636 packuswb m3, m5 | |
7637 movu [r0 + 324 * 16], m3 | |
7638 pmaddubsw m3, m0, m6 | |
7639 pmulhrsw m3, m7 | |
7640 pmaddubsw m5, m4, m6 | |
7641 pmulhrsw m5, m7 | |
7642 packuswb m3, m5 | |
7643 movu [r0 + 325 * 16], m3 | |
7644 | |
7645 ; mode 8 [row 0] | |
7646 movu m6, [r5 + 5 * 16] | |
7647 pmaddubsw m3, m1, m6 | |
7648 pmulhrsw m3, m7 | |
7649 pmaddubsw m5, m2, m6 | |
7650 pmulhrsw m5, m7 | |
7651 packuswb m3, m5 | |
7652 movu [r0 + 384 * 16], m3 | |
7653 pmaddubsw m3, m0, m6 | |
7654 pmulhrsw m3, m7 | |
7655 pmaddubsw m5, m4, m6 | |
7656 pmulhrsw m5, m7 | |
7657 packuswb m3, m5 | |
7658 movu [r0 + 385 * 16], m3 | |
7659 | |
7660 ; mode 8 [row 1] | |
7661 movu m6, [r5 + 10 * 16] | |
7662 pmaddubsw m3, m1, m6 | |
7663 pmulhrsw m3, m7 | |
7664 pmaddubsw m5, m2, m6 | |
7665 pmulhrsw m5, m7 | |
7666 packuswb m3, m5 | |
7667 movu [r0 + 386 * 16], m3 | |
7668 | |
7669 ; mode 9 [row 4 - first half] | |
7670 movu [r0 + 456 * 16], m3 | |
7671 | |
7672 pmaddubsw m3, m0, m6 | |
7673 pmulhrsw m3, m7 | |
7674 pmaddubsw m5, m4, m6 | |
7675 pmulhrsw m5, m7 | |
7676 packuswb m3, m5 | |
7677 movu [r0 + 387 * 16], m3 | |
7678 | |
7679 ; mode 9 [row 4 - second half] | |
7680 movu [r0 + 457 * 16], m3 | |
7681 | |
7682 ; mode 8 [row 2] | |
7683 movu m6, [r5 + 15 * 16] | |
7684 pmaddubsw m3, m1, m6 | |
7685 pmulhrsw m3, m7 | |
7686 pmaddubsw m5, m2, m6 | |
7687 pmulhrsw m5, m7 | |
7688 packuswb m3, m5 | |
7689 movu [r0 + 388 * 16], m3 | |
7690 pmaddubsw m3, m0, m6 | |
7691 pmulhrsw m3, m7 | |
7692 pmaddubsw m5, m4, m6 | |
7693 pmulhrsw m5, m7 | |
7694 packuswb m3, m5 | |
7695 movu [r0 + 389 * 16], m3 | |
7696 | |
7697 ; mode 8 [row 3] | |
7698 movu m6, [r5 + 20 * 16] | |
7699 pmaddubsw m3, m1, m6 | |
7700 pmulhrsw m3, m7 | |
7701 pmaddubsw m5, m2, m6 | |
7702 pmulhrsw m5, m7 | |
7703 packuswb m3, m5 | |
7704 movu [r0 + 390 * 16], m3 | |
7705 | |
7706 ; mode 9 [row 9 - first half] | |
7707 movu [r0 + 466 * 16], m3 | |
7708 | |
7709 pmaddubsw m3, m0, m6 | |
7710 pmulhrsw m3, m7 | |
7711 pmaddubsw m5, m4, m6 | |
7712 pmulhrsw m5, m7 | |
7713 packuswb m3, m5 | |
7714 movu [r0 + 391 * 16], m3 | |
7715 | |
7716 ; mode 9 [row 9 - second half] | |
7717 movu [r0 + 467 * 16], m3 | |
7718 | |
7719 ; mode 8 [row 4] | |
7720 movu m6, [r5 + 25 * 16] | |
7721 pmaddubsw m3, m1, m6 | |
7722 pmulhrsw m3, m7 | |
7723 pmaddubsw m5, m2, m6 | |
7724 pmulhrsw m5, m7 | |
7725 packuswb m3, m5 | |
7726 movu [r0 + 392 * 16], m3 | |
7727 pmaddubsw m3, m0, m6 | |
7728 pmulhrsw m3, m7 | |
7729 pmaddubsw m5, m4, m6 | |
7730 pmulhrsw m5, m7 | |
7731 packuswb m3, m5 | |
7732 movu [r0 + 393 * 16], m3 | |
7733 | |
7734 ; mode 8 [row 5] | |
7735 movu m6, [r5 + 30 * 16] | |
7736 pmaddubsw m3, m1, m6 | |
7737 pmulhrsw m3, m7 | |
7738 pmaddubsw m5, m2, m6 | |
7739 pmulhrsw m5, m7 | |
7740 packuswb m3, m5 | |
7741 movu [r0 + 394 * 16], m3 | |
7742 | |
7743 ; mode 9 [row 14 - first half] | |
7744 movu [r0 + 476 * 16], m3 | |
7745 | |
7746 pmaddubsw m3, m0, m6 | |
7747 pmulhrsw m3, m7 | |
7748 pmaddubsw m5, m4, m6 | |
7749 pmulhrsw m5, m7 | |
7750 packuswb m3, m5 | |
7751 movu [r0 + 395 * 16], m3 | |
7752 | |
7753 ; mode 9 [row 14 - second half] | |
7754 movu [r0 + 477 * 16], m3 | |
7755 | |
7756 ; mode 9 [row 0] | |
7757 movu m6, [r5 + 2 * 16] | |
7758 pmaddubsw m3, m1, m6 | |
7759 pmulhrsw m3, m7 | |
7760 pmaddubsw m5, m2, m6 | |
7761 pmulhrsw m5, m7 | |
7762 packuswb m3, m5 | |
7763 movu [r0 + 448 * 16], m3 | |
7764 pmaddubsw m3, m0, m6 | |
7765 pmulhrsw m3, m7 | |
7766 pmaddubsw m5, m4, m6 | |
7767 pmulhrsw m5, m7 | |
7768 packuswb m3, m5 | |
7769 movu [r0 + 449 * 16], m3 | |
7770 | |
7771 ; mode 9 [row 1] | |
7772 movu m6, [r5 + 4 * 16] | |
7773 pmaddubsw m3, m1, m6 | |
7774 pmulhrsw m3, m7 | |
7775 pmaddubsw m5, m2, m6 | |
7776 pmulhrsw m5, m7 | |
7777 packuswb m3, m5 | |
7778 movu [r0 + 450 * 16], m3 | |
7779 pmaddubsw m3, m0, m6 | |
7780 pmulhrsw m3, m7 | |
7781 pmaddubsw m5, m4, m6 | |
7782 pmulhrsw m5, m7 | |
7783 packuswb m3, m5 | |
7784 movu [r0 + 451 * 16], m3 | |
7785 | |
7786 ; mode 9 [row 2] | |
7787 movu m6, [r5 + 6 * 16] | |
7788 pmaddubsw m3, m1, m6 | |
7789 pmulhrsw m3, m7 | |
7790 pmaddubsw m5, m2, m6 | |
7791 pmulhrsw m5, m7 | |
7792 packuswb m3, m5 | |
7793 movu [r0 + 452 * 16], m3 | |
7794 pmaddubsw m3, m0, m6 | |
7795 pmulhrsw m3, m7 | |
7796 pmaddubsw m5, m4, m6 | |
7797 pmulhrsw m5, m7 | |
7798 packuswb m3, m5 | |
7799 movu [r0 + 453 * 16], m3 | |
7800 | |
7801 ; mode 9 [row 3] | |
7802 movu m6, [r5 + 8 * 16] | |
7803 pmaddubsw m3, m1, m6 | |
7804 pmulhrsw m3, m7 | |
7805 pmaddubsw m5, m2, m6 | |
7806 pmulhrsw m5, m7 | |
7807 packuswb m3, m5 | |
7808 movu [r0 + 454 * 16], m3 | |
7809 pmaddubsw m3, m0, m6 | |
7810 pmulhrsw m3, m7 | |
7811 pmaddubsw m5, m4, m6 | |
7812 pmulhrsw m5, m7 | |
7813 packuswb m3, m5 | |
7814 movu [r0 + 455 * 16], m3 | |
7815 | |
7816 ; mode 9 [row 5] | |
7817 movu m6, [r5 + 12 * 16] | |
7818 pmaddubsw m3, m1, m6 | |
7819 pmulhrsw m3, m7 | |
7820 pmaddubsw m5, m2, m6 | |
7821 pmulhrsw m5, m7 | |
7822 packuswb m3, m5 | |
7823 movu [r0 + 458 * 16], m3 | |
7824 pmaddubsw m3, m0, m6 | |
7825 pmulhrsw m3, m7 | |
7826 pmaddubsw m5, m4, m6 | |
7827 pmulhrsw m5, m7 | |
7828 packuswb m3, m5 | |
7829 movu [r0 + 459 * 16], m3 | |
7830 | |
7831 ; mode 9 [row 6] | |
7832 movu m6, [r5 + 14 * 16] | |
7833 pmaddubsw m3, m1, m6 | |
7834 pmulhrsw m3, m7 | |
7835 pmaddubsw m5, m2, m6 | |
7836 pmulhrsw m5, m7 | |
7837 packuswb m3, m5 | |
7838 movu [r0 + 460 * 16], m3 | |
7839 pmaddubsw m3, m0, m6 | |
7840 pmulhrsw m3, m7 | |
7841 pmaddubsw m5, m4, m6 | |
7842 pmulhrsw m5, m7 | |
7843 packuswb m3, m5 | |
7844 movu [r0 + 461 * 16], m3 | |
7845 | |
7846 ; mode 9 [row 7] | |
7847 movu m6, [r5 + 16 * 16] | |
7848 pmaddubsw m3, m1, m6 | |
7849 pmulhrsw m3, m7 | |
7850 pmaddubsw m5, m2, m6 | |
7851 pmulhrsw m5, m7 | |
7852 packuswb m3, m5 | |
7853 movu [r0 + 462 * 16], m3 | |
7854 pmaddubsw m3, m0, m6 | |
7855 pmulhrsw m3, m7 | |
7856 pmaddubsw m5, m4, m6 | |
7857 pmulhrsw m5, m7 | |
7858 packuswb m3, m5 | |
7859 movu [r0 + 463 * 16], m3 | |
7860 | |
7861 ; mode 9 [row 10] | |
7862 movu m6, [r5 + 22 * 16] | |
7863 pmaddubsw m3, m1, m6 | |
7864 pmulhrsw m3, m7 | |
7865 pmaddubsw m5, m2, m6 | |
7866 pmulhrsw m5, m7 | |
7867 packuswb m3, m5 | |
7868 movu [r0 + 468 * 16], m3 | |
7869 pmaddubsw m3, m0, m6 | |
7870 pmulhrsw m3, m7 | |
7871 pmaddubsw m5, m4, m6 | |
7872 pmulhrsw m5, m7 | |
7873 packuswb m3, m5 | |
7874 movu [r0 + 469 * 16], m3 | |
7875 | |
7876 ; mode 9 [row 11] | |
7877 movu m6, [r5 + 24 * 16] | |
7878 pmaddubsw m3, m1, m6 | |
7879 pmulhrsw m3, m7 | |
7880 pmaddubsw m5, m2, m6 | |
7881 pmulhrsw m5, m7 | |
7882 packuswb m3, m5 | |
7883 movu [r0 + 470 * 16], m3 | |
7884 pmaddubsw m3, m0, m6 | |
7885 pmulhrsw m3, m7 | |
7886 pmaddubsw m5, m4, m6 | |
7887 pmulhrsw m5, m7 | |
7888 packuswb m3, m5 | |
7889 movu [r0 + 471 * 16], m3 | |
7890 | |
7891 ; mode 9 [row 13] | |
7892 movu m6, [r5 + 28 * 16] | |
7893 pmaddubsw m3, m1, m6 | |
7894 pmulhrsw m3, m7 | |
7895 pmaddubsw m5, m2, m6 | |
7896 pmulhrsw m5, m7 | |
7897 packuswb m3, m5 | |
7898 movu [r0 + 474 * 16], m3 | |
7899 pmaddubsw m3, m0, m6 | |
7900 pmulhrsw m3, m7 | |
7901 pmaddubsw m5, m4, m6 | |
7902 pmulhrsw m5, m7 | |
7903 packuswb m3, m5 | |
7904 movu [r0 + 475 * 16], m3 | |
7905 | |
7906 ; mode 3 [row 1] | |
7907 movu m6, [r5 + 20 * 16] | |
7908 movu m0, [r4 + 2] | |
7909 movd m1, [r4 + 3] | |
7910 palignr m1, m0, 1 | |
7911 punpcklbw m0, m1 | |
7912 pmaddubsw m1, m0, m6 | |
7913 pmulhrsw m1, m7 | |
7914 movu m2, [r4 + 10] | |
7915 movd m3, [r4 + 11] | |
7916 palignr m3, m2, 1 | |
7917 punpcklbw m2, m3 | |
7918 pmaddubsw m3, m2, m6 | |
7919 pmulhrsw m3, m7 | |
7920 packuswb m1, m3 | |
7921 movu [r0 + 66 * 16], m1 | |
7922 | |
7923 ; mode 6 [row 3 - first half] | |
7924 movu [r0 + 262 * 16], m1 | |
7925 | |
7926 ; mode 9 [row 25 - first half] | |
7927 movu [r0 + 498 * 16], m1 | |
7928 | |
7929 movu m1, [r4 + 18] | |
7930 movd m3, [r4 + 19] | |
7931 palignr m3, m1, 1 | |
7932 punpcklbw m1, m3 | |
7933 pmaddubsw m3, m1, m6 | |
7934 pmulhrsw m3, m7 | |
7935 movu m4, [r4 + 26] | |
7936 movd m5, [r4 + 27] | |
7937 palignr m5, m4, 1 | |
7938 punpcklbw m4, m5 | |
7939 pmaddubsw m5, m4, m6 | |
7940 pmulhrsw m5, m7 | |
7941 packuswb m3, m5 | |
7942 movu [r0 + 67 * 16], m3 | |
7943 | |
7944 ; mode 6 [row 3 - second half] | |
7945 movu [r0 + 263 * 16], m3 | |
7946 | |
7947 ; mode 9 [row 25 - second half] | |
7948 movu [r0 + 499 * 16], m3 | |
7949 | |
7950 ; mode 4 [row 1] | |
7951 movu m6, [r5 + 10 * 16] | |
7952 pmaddubsw m3, m0, m6 | |
7953 pmulhrsw m3, m7 | |
7954 pmaddubsw m5, m2, m6 | |
7955 pmulhrsw m5, m7 | |
7956 packuswb m3, m5 | |
7957 movu [r0 + 130 * 16], m3 | |
7958 | |
7959 ; mode 9 [row 20 - first half] | |
7960 movu [r0 + 488 * 16], m3 | |
7961 | |
7962 pmaddubsw m3, m1, m6 | |
7963 pmulhrsw m3, m7 | |
7964 pmaddubsw m5, m4, m6 | |
7965 pmulhrsw m5, m7 | |
7966 packuswb m3, m5 | |
7967 movu [r0 + 131 * 16], m3 | |
7968 | |
7969 ; mode 9 [row 20 - second half] | |
7970 movu [r0 + 489 * 16], m3 | |
7971 | |
7972 ; mode 4 [row 2] | |
7973 movu m6, [r5 + 31 * 16] | |
7974 pmaddubsw m3, m0, m6 | |
7975 pmulhrsw m3, m7 | |
7976 pmaddubsw m5, m2, m6 | |
7977 pmulhrsw m5, m7 | |
7978 packuswb m3, m5 | |
7979 movu [r0 + 132 * 16], m3 | |
7980 | |
7981 ; mode 7 [row 6 - first half] | |
7982 movu [r0 + 332 * 16], m3 | |
7983 | |
7984 pmaddubsw m3, m1, m6 | |
7985 pmulhrsw m3, m7 | |
7986 pmaddubsw m5, m4, m6 | |
7987 pmulhrsw m5, m7 | |
7988 packuswb m3, m5 | |
7989 movu [r0 + 133 * 16], m3 | |
7990 | |
7991 ; mode 7 [row 6 - second half] | |
7992 movu [r0 + 333 * 16], m3 | |
7993 | |
7994 ; mode 5 [row 1] | |
7995 movu m6, [r5 + 2 * 16] | |
7996 pmaddubsw m3, m0, m6 | |
7997 pmulhrsw m3, m7 | |
7998 pmaddubsw m5, m2, m6 | |
7999 pmulhrsw m5, m7 | |
8000 packuswb m3, m5 | |
8001 movu [r0 + 194 * 16], m3 | |
8002 | |
8003 ; mode 9 [row 16 - first half] | |
8004 movu [r0 + 480 * 16], m3 | |
8005 | |
8006 pmaddubsw m3, m1, m6 | |
8007 pmulhrsw m3, m7 | |
8008 pmaddubsw m5, m4, m6 | |
8009 pmulhrsw m5, m7 | |
8010 packuswb m3, m5 | |
8011 movu [r0 + 195 * 16], m3 | |
8012 | |
8013 ; mode 9 [row 16 - second half] | |
8014 movu [r0 + 481 * 16], m3 | |
8015 | |
8016 ; mode 5 [row 2] | |
8017 movu m6, [r5 + 19 * 16] | |
8018 pmaddubsw m3, m0, m6 | |
8019 pmulhrsw m3, m7 | |
8020 pmaddubsw m5, m2, m6 | |
8021 pmulhrsw m5, m7 | |
8022 packuswb m3, m5 | |
8023 movu [r0 + 196 * 16], m3 | |
8024 pmaddubsw m3, m1, m6 | |
8025 pmulhrsw m3, m7 | |
8026 pmaddubsw m5, m4, m6 | |
8027 pmulhrsw m5, m7 | |
8028 packuswb m3, m5 | |
8029 movu [r0 + 197 * 16], m3 | |
8030 | |
8031 ; mode 6 [row 2] | |
8032 movu m6, [r5 + 7 * 16] | |
8033 pmaddubsw m3, m0, m6 | |
8034 pmulhrsw m3, m7 | |
8035 pmaddubsw m5, m2, m6 | |
8036 pmulhrsw m5, m7 | |
8037 packuswb m3, m5 | |
8038 movu [r0 + 260 * 16], m3 | |
8039 pmaddubsw m3, m1, m6 | |
8040 pmulhrsw m3, m7 | |
8041 pmaddubsw m5, m4, m6 | |
8042 pmulhrsw m5, m7 | |
8043 packuswb m3, m5 | |
8044 movu [r0 + 261 * 16], m3 | |
8045 | |
8046 ; mode 7 [row 3] | |
8047 movu m6, [r5 + 4 * 16] | |
8048 pmaddubsw m3, m0, m6 | |
8049 pmulhrsw m3, m7 | |
8050 pmaddubsw m5, m2, m6 | |
8051 pmulhrsw m5, m7 | |
8052 packuswb m3, m5 | |
8053 movu [r0 + 326 * 16], m3 | |
8054 | |
8055 ; mode 9 [row 17 - first half] | |
8056 movu [r0 + 482 * 16], m3 | |
8057 | |
8058 pmaddubsw m3, m1, m6 | |
8059 pmulhrsw m3, m7 | |
8060 pmaddubsw m5, m4, m6 | |
8061 pmulhrsw m5, m7 | |
8062 packuswb m3, m5 | |
8063 movu [r0 + 327 * 16], m3 | |
8064 | |
8065 ; mode 9 [row 17 - second half] | |
8066 movu [r0 + 483 * 16], m3 | |
8067 | |
8068 ; mode 7 [row 4] | |
8069 movu m6, [r5 + 13 * 16] | |
8070 pmaddubsw m3, m0, m6 | |
8071 pmulhrsw m3, m7 | |
8072 pmaddubsw m5, m2, m6 | |
8073 pmulhrsw m5, m7 | |
8074 packuswb m3, m5 | |
8075 movu [r0 + 328 * 16], m3 | |
8076 | |
8077 ; mode 8 [row 8 - first half] | |
8078 movu [r0 + 400 * 16], m3 | |
8079 | |
8080 pmaddubsw m3, m1, m6 | |
8081 pmulhrsw m3, m7 | |
8082 pmaddubsw m5, m4, m6 | |
8083 pmulhrsw m5, m7 | |
8084 packuswb m3, m5 | |
8085 movu [r0 + 329 * 16], m3 | |
8086 | |
8087 ; mode 8 [row 8 - second half] | |
8088 movu [r0 + 401 * 16], m3 | |
8089 | |
8090 ; mode 7 [row 5] | |
8091 movu m6, [r5 + 22 * 16] | |
8092 pmaddubsw m3, m0, m6 | |
8093 pmulhrsw m3, m7 | |
8094 pmaddubsw m5, m2, m6 | |
8095 pmulhrsw m5, m7 | |
8096 packuswb m3, m5 | |
8097 movu [r0 + 330 * 16], m3 | |
8098 | |
8099 ; mode 9 [row 26 - first half] | |
8100 movu [r0 + 500 * 16], m3 | |
8101 | |
8102 pmaddubsw m3, m1, m6 | |
8103 pmulhrsw m3, m7 | |
8104 pmaddubsw m5, m4, m6 | |
8105 pmulhrsw m5, m7 | |
8106 packuswb m3, m5 | |
8107 movu [r0 + 331 * 16], m3 | |
8108 | |
8109 ; mode 9 [row 26 - second half] | |
8110 movu [r0 + 501 * 16], m3 | |
8111 | |
8112 ; mode 8 [row 6] | |
8113 movu m6, [r5 + 3 * 16] | |
8114 pmaddubsw m3, m0, m6 | |
8115 pmulhrsw m3, m7 | |
8116 pmaddubsw m5, m2, m6 | |
8117 pmulhrsw m5, m7 | |
8118 packuswb m3, m5 | |
8119 movu [r0 + 396 * 16], m3 | |
8120 pmaddubsw m3, m1, m6 | |
8121 pmulhrsw m3, m7 | |
8122 pmaddubsw m5, m4, m6 | |
8123 pmulhrsw m5, m7 | |
8124 packuswb m3, m5 | |
8125 movu [r0 + 397 * 16], m3 | |
8126 | |
8127 ; mode 9 [row 18] | |
8128 movu m6, [r5 + 6 * 16] | |
8129 pmaddubsw m3, m0, m6 | |
8130 pmulhrsw m3, m7 | |
8131 pmaddubsw m5, m2, m6 | |
8132 pmulhrsw m5, m7 | |
8133 packuswb m3, m5 | |
8134 movu [r0 + 484 * 16], m3 | |
8135 pmaddubsw m3, m1, m6 | |
8136 pmulhrsw m3, m7 | |
8137 pmaddubsw m5, m4, m6 | |
8138 pmulhrsw m5, m7 | |
8139 packuswb m3, m5 | |
8140 movu [r0 + 485 * 16], m3 | |
8141 | |
8142 ; mode 9 [row 21] | |
8143 movu m6, [r5 + 12 * 16] | |
8144 pmaddubsw m3, m0, m6 | |
8145 pmulhrsw m3, m7 | |
8146 pmaddubsw m5, m2, m6 | |
8147 pmulhrsw m5, m7 | |
8148 packuswb m3, m5 | |
8149 movu [r0 + 490 * 16], m3 | |
8150 pmaddubsw m3, m1, m6 | |
8151 pmulhrsw m3, m7 | |
8152 pmaddubsw m5, m4, m6 | |
8153 pmulhrsw m5, m7 | |
8154 packuswb m3, m5 | |
8155 movu [r0 + 491 * 16], m3 | |
8156 | |
8157 ; mode 9 [row 22] | |
8158 movu m6, [r5 + 14 * 16] | |
8159 pmaddubsw m3, m0, m6 | |
8160 pmulhrsw m3, m7 | |
8161 pmaddubsw m5, m2, m6 | |
8162 pmulhrsw m5, m7 | |
8163 packuswb m3, m5 | |
8164 movu [r0 + 492 * 16], m3 | |
8165 pmaddubsw m3, m1, m6 | |
8166 pmulhrsw m3, m7 | |
8167 pmaddubsw m5, m4, m6 | |
8168 pmulhrsw m5, m7 | |
8169 packuswb m3, m5 | |
8170 movu [r0 + 493 * 16], m3 | |
8171 | |
8172 ; mode 9 [row 23] | |
8173 movu m6, [r5 + 16 * 16] | |
8174 pmaddubsw m3, m0, m6 | |
8175 pmulhrsw m3, m7 | |
8176 pmaddubsw m5, m2, m6 | |
8177 pmulhrsw m5, m7 | |
8178 packuswb m3, m5 | |
8179 movu [r0 + 494 * 16], m3 | |
8180 pmaddubsw m3, m1, m6 | |
8181 pmulhrsw m3, m7 | |
8182 pmaddubsw m5, m4, m6 | |
8183 pmulhrsw m5, m7 | |
8184 packuswb m3, m5 | |
8185 movu [r0 + 495 * 16], m3 | |
8186 | |
8187 ; mode 9 [row 27] | |
8188 movu m6, [r5 + 24 * 16] | |
8189 pmaddubsw m3, m0, m6 | |
8190 pmulhrsw m3, m7 | |
8191 pmaddubsw m5, m2, m6 | |
8192 pmulhrsw m5, m7 | |
8193 packuswb m3, m5 | |
8194 movu [r0 + 502 * 16], m3 | |
8195 pmaddubsw m3, m1, m6 | |
8196 pmulhrsw m3, m7 | |
8197 pmaddubsw m5, m4, m6 | |
8198 pmulhrsw m5, m7 | |
8199 packuswb m3, m5 | |
8200 movu [r0 + 503 * 16], m3 | |
8201 | |
8202 ; mode 9 [row 28] | |
8203 movu m6, [r5 + 26 * 16] | |
8204 pmaddubsw m3, m0, m6 | |
8205 pmulhrsw m3, m7 | |
8206 pmaddubsw m5, m2, m6 | |
8207 pmulhrsw m5, m7 | |
8208 packuswb m3, m5 | |
8209 movu [r0 + 504 * 16], m3 | |
8210 pmaddubsw m3, m1, m6 | |
8211 pmulhrsw m3, m7 | |
8212 pmaddubsw m5, m4, m6 | |
8213 pmulhrsw m5, m7 | |
8214 packuswb m3, m5 | |
8215 movu [r0 + 505 * 16], m3 | |
8216 | |
8217 ; mode 9 [row 30] | |
8218 movu m6, [r5 + 30 * 16] | |
8219 pmaddubsw m3, m0, m6 | |
8220 pmulhrsw m3, m7 | |
8221 pmaddubsw m5, m2, m6 | |
8222 pmulhrsw m5, m7 | |
8223 packuswb m3, m5 | |
8224 movu [r0 + 508 * 16], m3 | |
8225 pmaddubsw m3, m1, m6 | |
8226 pmulhrsw m3, m7 | |
8227 pmaddubsw m5, m4, m6 | |
8228 pmulhrsw m5, m7 | |
8229 packuswb m3, m5 | |
8230 movu [r0 + 509 * 16], m3 | |
8231 | |
8232 ; mode 8 [row 7] | |
8233 movu m6, [r5 + 8 * 16] | |
8234 pmaddubsw m3, m0, m6 | |
8235 pmulhrsw m3, m7 | |
8236 pmaddubsw m5, m2, m6 | |
8237 pmulhrsw m5, m7 | |
8238 packuswb m3, m5 | |
8239 movu [r0 + 398 * 16], m3 | |
8240 | |
8241 ; mode 9 [row 19 - first half] | |
8242 movu [r0 + 486 * 16], m3 | |
8243 | |
8244 pmaddubsw m3, m1, m6 | |
8245 pmulhrsw m3, m7 | |
8246 pmaddubsw m5, m4, m6 | |
8247 pmulhrsw m5, m7 | |
8248 packuswb m3, m5 | |
8249 movu [r0 + 399 * 16], m3 | |
8250 | |
8251 ; mode 9 [row 19 - second half] | |
8252 movu [r0 + 487 * 16], m3 | |
8253 | |
8254 ; mode 8 [row 9] | |
8255 movu m6, [r5 + 18 * 16] | |
8256 pmaddubsw m3, m0, m6 | |
8257 pmulhrsw m3, m7 | |
8258 pmaddubsw m5, m2, m6 | |
8259 pmulhrsw m5, m7 | |
8260 packuswb m3, m5 | |
8261 movu [r0 + 402 * 16], m3 | |
8262 | |
8263 ; mode 9 [row 24 - first half] | |
8264 movu [r0 + 496 * 16], m3 | |
8265 | |
8266 pmaddubsw m3, m1, m6 | |
8267 pmulhrsw m3, m7 | |
8268 pmaddubsw m5, m4, m6 | |
8269 pmulhrsw m5, m7 | |
8270 packuswb m3, m5 | |
8271 movu [r0 + 403 * 16], m3 | |
8272 | |
8273 ; mode 9 [row 24 - second half] | |
8274 movu [r0 + 497 * 16], m3 | |
8275 | |
8276 ; mode 8 [row 10] | |
8277 movu m6, [r5 + 23 * 16] | |
8278 pmaddubsw m3, m0, m6 | |
8279 pmulhrsw m3, m7 | |
8280 pmaddubsw m5, m2, m6 | |
8281 pmulhrsw m5, m7 | |
8282 packuswb m3, m5 | |
8283 movu [r0 + 404 * 16], m3 | |
8284 pmaddubsw m3, m1, m6 | |
8285 pmulhrsw m3, m7 | |
8286 pmaddubsw m5, m4, m6 | |
8287 pmulhrsw m5, m7 | |
8288 packuswb m3, m5 | |
8289 movu [r0 + 405 * 16], m3 | |
8290 | |
8291 ; mode 8 [row 11] | |
8292 movu m6, [r5 + 28 * 16] | |
8293 pmaddubsw m3, m0, m6 | |
8294 pmulhrsw m3, m7 | |
8295 pmaddubsw m5, m2, m6 | |
8296 pmulhrsw m5, m7 | |
8297 packuswb m3, m5 | |
8298 movu [r0 + 406 * 16], m3 | |
8299 | |
8300 ; mode 9 [row 29 - first half] | |
8301 movu [r0 + 506 * 16], m3 | |
8302 | |
8303 pmaddubsw m3, m1, m6 | |
8304 pmulhrsw m3, m7 | |
8305 pmaddubsw m5, m4, m6 | |
8306 pmulhrsw m5, m7 | |
8307 packuswb m3, m5 | |
8308 movu [r0 + 407 * 16], m3 | |
8309 | |
8310 ; mode 9 [row 29 - second half] | |
8311 movu [r0 + 507 * 16], m3 | |
8312 | |
8313 ; mode 3 [row 2] | |
8314 movu m6, [r5 + 14 * 16] | |
8315 movu m0, [r4 + 3] | |
8316 movd m1, [r4 + 4] | |
8317 palignr m1, m0, 1 | |
8318 punpcklbw m0, m1 | |
8319 pmaddubsw m1, m0, m6 | |
8320 pmulhrsw m1, m7 | |
8321 movu m2, [r4 + 11] | |
8322 movd m3, [r4 + 12] | |
8323 palignr m3, m2, 1 | |
8324 punpcklbw m2, m3 | |
8325 pmaddubsw m3, m2, m6 | |
8326 pmulhrsw m3, m7 | |
8327 packuswb m1, m3 | |
8328 movu [r0 + 68 * 16], m1 | |
8329 | |
8330 ; mode 6 [row 5 - first half] | |
8331 movu [r0 + 266 * 16], m1 | |
8332 | |
8333 movu m1, [r4 + 19] | |
8334 movd m3, [r4 + 20] | |
8335 palignr m3, m1, 1 | |
8336 punpcklbw m1, m3 | |
8337 pmaddubsw m3, m1, m6 | |
8338 pmulhrsw m3, m7 | |
8339 movu m4, [r4 + 27] | |
8340 movd m5, [r4 + 28] | |
8341 palignr m5, m4, 1 | |
8342 punpcklbw m4, m5 | |
8343 pmaddubsw m5, m4, m6 | |
8344 pmulhrsw m5, m7 | |
8345 packuswb m3, m5 | |
8346 movu [r0 + 69 * 16], m3 | |
8347 | |
8348 ; mode 6 [row 5 - second half] | |
8349 movu [r0 + 267 * 16], m3 | |
8350 | |
8351 ; mode 4 [row 3] | |
8352 movu m6, [r5 + 20 * 16] | |
8353 pmaddubsw m3, m0, m6 | |
8354 pmulhrsw m3, m7 | |
8355 pmaddubsw m5, m2, m6 | |
8356 pmulhrsw m5, m7 | |
8357 packuswb m3, m5 | |
8358 movu [r0 + 134 * 16], m3 | |
8359 pmaddubsw m3, m1, m6 | |
8360 pmulhrsw m3, m7 | |
8361 pmaddubsw m5, m4, m6 | |
8362 pmulhrsw m5, m7 | |
8363 packuswb m3, m5 | |
8364 movu [r0 + 135 * 16], m3 | |
8365 | |
8366 ; mode 5 [row 3] | |
8367 movu m6, [r5 + 4 * 16] | |
8368 pmaddubsw m3, m0, m6 | |
8369 pmulhrsw m3, m7 | |
8370 pmaddubsw m5, m2, m6 | |
8371 pmulhrsw m5, m7 | |
8372 packuswb m3, m5 | |
8373 movu [r0 + 198 * 16], m3 | |
8374 pmaddubsw m3, m1, m6 | |
8375 pmulhrsw m3, m7 | |
8376 pmaddubsw m5, m4, m6 | |
8377 pmulhrsw m5, m7 | |
8378 packuswb m3, m5 | |
8379 movu [r0 + 199 * 16], m3 | |
8380 | |
8381 ; mode 5 [row 4] | |
8382 movu m6, [r5 + 21 * 16] | |
8383 pmaddubsw m3, m0, m6 | |
8384 pmulhrsw m3, m7 | |
8385 pmaddubsw m5, m2, m6 | |
8386 pmulhrsw m5, m7 | |
8387 packuswb m3, m5 | |
8388 movu [r0 + 200 * 16], m3 | |
8389 | |
8390 ; mode 8 [row 16 - first half] | |
8391 movu [r0 + 416 * 16], m3 | |
8392 | |
8393 pmaddubsw m3, m1, m6 | |
8394 pmulhrsw m3, m7 | |
8395 pmaddubsw m5, m4, m6 | |
8396 pmulhrsw m5, m7 | |
8397 packuswb m3, m5 | |
8398 movu [r0 + 201 * 16], m3 | |
8399 | |
8400 ; mode 8 [row 16 - second half] | |
8401 movu [r0 + 417 * 16], m3 | |
8402 | |
8403 ; mode 6 [row 4] | |
8404 movu m6, [r5 + 1 * 16] | |
8405 pmaddubsw m3, m0, m6 | |
8406 pmulhrsw m3, m7 | |
8407 pmaddubsw m5, m2, m6 | |
8408 pmulhrsw m5, m7 | |
8409 packuswb m3, m5 | |
8410 movu [r0 + 264 * 16], m3 | |
8411 | |
8412 ; mode 8 [row 12 - first half] | |
8413 movu [r0 + 408 * 16], m3 | |
8414 | |
8415 pmaddubsw m3, m1, m6 | |
8416 pmulhrsw m3, m7 | |
8417 pmaddubsw m5, m4, m6 | |
8418 pmulhrsw m5, m7 | |
8419 packuswb m3, m5 | |
8420 movu [r0 + 265 * 16], m3 | |
8421 | |
8422 ; mode 8 [row 12 - second half] | |
8423 movu [r0 + 409 * 16], m3 | |
8424 | |
8425 ; mode 6 [row 6] | |
8426 movu m6, [r5 + 27 * 16] | |
8427 pmaddubsw m3, m0, m6 | |
8428 pmulhrsw m3, m7 | |
8429 pmaddubsw m5, m2, m6 | |
8430 pmulhrsw m5, m7 | |
8431 packuswb m3, m5 | |
8432 movu [r0 + 268 * 16], m3 | |
8433 pmaddubsw m3, m1, m6 | |
8434 pmulhrsw m3, m7 | |
8435 pmaddubsw m5, m4, m6 | |
8436 pmulhrsw m5, m7 | |
8437 packuswb m3, m5 | |
8438 movu [r0 + 269 * 16], m3 | |
8439 | |
8440 ; mode 7 [row 7] | |
8441 movu m6, [r5 + 8 * 16] | |
8442 pmaddubsw m3, m0, m6 | |
8443 pmulhrsw m3, m7 | |
8444 pmaddubsw m5, m2, m6 | |
8445 pmulhrsw m5, m7 | |
8446 packuswb m3, m5 | |
8447 movu [r0 + 334 * 16], m3 | |
8448 pmaddubsw m3, m1, m6 | |
8449 pmulhrsw m3, m7 | |
8450 pmaddubsw m5, m4, m6 | |
8451 pmulhrsw m5, m7 | |
8452 packuswb m3, m5 | |
8453 movu [r0 + 335 * 16], m3 | |
8454 | |
8455 ; mode 7 [row 8] | |
8456 movu m6, [r5 + 17 * 16] | |
8457 pmaddubsw m3, m0, m6 | |
8458 pmulhrsw m3, m7 | |
8459 pmaddubsw m5, m2, m6 | |
8460 pmulhrsw m5, m7 | |
8461 packuswb m3, m5 | |
8462 movu [r0 + 336 * 16], m3 | |
8463 pmaddubsw m3, m1, m6 | |
8464 pmulhrsw m3, m7 | |
8465 pmaddubsw m5, m4, m6 | |
8466 pmulhrsw m5, m7 | |
8467 packuswb m3, m5 | |
8468 movu [r0 + 337 * 16], m3 | |
8469 | |
8470 ; mode 7 [row 9] | |
8471 movu m6, [r5 + 26 * 16] | |
8472 pmaddubsw m3, m0, m6 | |
8473 pmulhrsw m3, m7 | |
8474 pmaddubsw m5, m2, m6 | |
8475 pmulhrsw m5, m7 | |
8476 packuswb m3, m5 | |
8477 movu [r0 + 338 * 16], m3 | |
8478 | |
8479 ; mode 8 [row 17 - first half] | |
8480 movu [r0 + 418 * 16], m3 | |
8481 | |
8482 pmaddubsw m3, m1, m6 | |
8483 pmulhrsw m3, m7 | |
8484 pmaddubsw m5, m4, m6 | |
8485 pmulhrsw m5, m7 | |
8486 packuswb m3, m5 | |
8487 movu [r0 + 339 * 16], m3 | |
8488 | |
8489 ; mode 8 [row 17 - second half] | |
8490 movu [r0 + 419 * 16], m3 | |
8491 | |
8492 ; mode 8 [row 13] | |
8493 movu m6, [r5 + 6 * 16] | |
8494 pmaddubsw m3, m0, m6 | |
8495 pmulhrsw m3, m7 | |
8496 pmaddubsw m5, m2, m6 | |
8497 pmulhrsw m5, m7 | |
8498 packuswb m3, m5 | |
8499 movu [r0 + 410 * 16], m3 | |
8500 pmaddubsw m3, m1, m6 | |
8501 pmulhrsw m3, m7 | |
8502 pmaddubsw m5, m4, m6 | |
8503 pmulhrsw m5, m7 | |
8504 packuswb m3, m5 | |
8505 movu [r0 + 411 * 16], m3 | |
8506 | |
8507 ; mode 8 [row 14] | |
8508 movu m6, [r5 + 11 * 16] | |
8509 pmaddubsw m3, m0, m6 | |
8510 pmulhrsw m3, m7 | |
8511 pmaddubsw m5, m2, m6 | |
8512 pmulhrsw m5, m7 | |
8513 packuswb m3, m5 | |
8514 movu [r0 + 412 * 16], m3 | |
8515 pmaddubsw m3, m1, m6 | |
8516 pmulhrsw m3, m7 | |
8517 pmaddubsw m5, m4, m6 | |
8518 pmulhrsw m5, m7 | |
8519 packuswb m3, m5 | |
8520 movu [r0 + 413 * 16], m3 | |
8521 | |
8522 ; mode 8 [row 15] | |
8523 movu m6, [r5 + 16 * 16] | |
8524 pmaddubsw m3, m0, m6 | |
8525 pmulhrsw m3, m7 | |
8526 pmaddubsw m5, m2, m6 | |
8527 pmulhrsw m5, m7 | |
8528 packuswb m3, m5 | |
8529 movu [r0 + 414 * 16], m3 | |
8530 pmaddubsw m3, m1, m6 | |
8531 pmulhrsw m3, m7 | |
8532 pmaddubsw m5, m4, m6 | |
8533 pmulhrsw m5, m7 | |
8534 packuswb m3, m5 | |
8535 movu [r0 + 415 * 16], m3 | |
8536 | |
8537 ; mode 8 [row 18] | |
8538 movu m6, [r5 + 31 * 16] | |
8539 pmaddubsw m3, m0, m6 | |
8540 pmulhrsw m3, m7 | |
8541 pmaddubsw m5, m2, m6 | |
8542 pmulhrsw m5, m7 | |
8543 packuswb m3, m5 | |
8544 movu [r0 + 420 * 16], m3 | |
8545 pmaddubsw m3, m1, m6 | |
8546 pmulhrsw m3, m7 | |
8547 pmaddubsw m5, m4, m6 | |
8548 pmulhrsw m5, m7 | |
8549 packuswb m3, m5 | |
8550 movu [r0 + 421 * 16], m3 | |
8551 | |
8552 ; mode 3 [row 3] | |
8553 movu m6, [r5 + 8 * 16] | |
8554 movu m0, [r4 + 4] | |
8555 movd m1, [r4 + 5] | |
8556 palignr m1, m0, 1 | |
8557 punpcklbw m0, m1 | |
8558 pmaddubsw m1, m0, m6 | |
8559 pmulhrsw m1, m7 | |
8560 movu m2, [r4 + 12] | |
8561 movd m3, [r4 + 13] | |
8562 palignr m3, m2, 1 | |
8563 punpcklbw m2, m3 | |
8564 pmaddubsw m3, m2, m6 | |
8565 pmulhrsw m3, m7 | |
8566 packuswb m1, m3 | |
8567 movu [r0 + 70 * 16], m1 | |
8568 | |
8569 ; mode 6 [row 7 - first half] | |
8570 movu [r0 + 270 * 16], m1 | |
8571 | |
8572 movu m1, [r4 + 20] | |
8573 movd m3, [r4 + 21] | |
8574 palignr m3, m1, 1 | |
8575 punpcklbw m1, m3 | |
8576 pmaddubsw m3, m1, m6 | |
8577 pmulhrsw m3, m7 | |
8578 movu m4, [r4 + 28] | |
8579 movd m5, [r4 + 29] | |
8580 palignr m5, m4, 1 | |
8581 punpcklbw m4, m5 | |
8582 pmaddubsw m5, m4, m6 | |
8583 pmulhrsw m5, m7 | |
8584 packuswb m3, m5 | |
8585 movu [r0 + 71 * 16], m3 | |
8586 | |
8587 ; mode 6 [row 7 - second half] | |
8588 movu [r0 + 271 * 16], m3 | |
8589 | |
8590 ; mode 4 [row 4] | |
8591 movu m6, [r5 + 9 * 16] | |
8592 pmaddubsw m3, m0, m6 | |
8593 pmulhrsw m3, m7 | |
8594 pmaddubsw m5, m2, m6 | |
8595 pmulhrsw m5, m7 | |
8596 packuswb m3, m5 | |
8597 movu [r0 + 136 * 16], m3 | |
8598 | |
8599 ; mode 8 [row 20 - first half] | |
8600 movu [r0 + 424 * 16], m3 | |
8601 | |
8602 pmaddubsw m3, m1, m6 | |
8603 pmulhrsw m3, m7 | |
8604 pmaddubsw m5, m4, m6 | |
8605 pmulhrsw m5, m7 | |
8606 packuswb m3, m5 | |
8607 movu [r0 + 137 * 16], m3 | |
8608 | |
8609 ; mode 8 [row 20 - second half] | |
8610 movu [r0 + 425 * 16], m3 | |
8611 | |
8612 ; mode 4 [row 5] | |
8613 movu m6, [r5 + 30 * 16] | |
8614 pmaddubsw m3, m0, m6 | |
8615 pmulhrsw m3, m7 | |
8616 pmaddubsw m5, m2, m6 | |
8617 pmulhrsw m5, m7 | |
8618 packuswb m3, m5 | |
8619 movu [r0 + 138 * 16], m3 | |
8620 | |
8621 ; mode 7 [row 13 - first half] | |
8622 movu [r0 + 346 * 16], m3 | |
8623 | |
8624 pmaddubsw m3, m1, m6 | |
8625 pmulhrsw m3, m7 | |
8626 pmaddubsw m5, m4, m6 | |
8627 pmulhrsw m5, m7 | |
8628 packuswb m3, m5 | |
8629 movu [r0 + 139 * 16], m3 | |
8630 | |
8631 ; mode 7 [row 13 - second half] | |
8632 movu [r0 + 347 * 16], m3 | |
8633 | |
8634 ; mode 5 [row 5] | |
8635 movu m6, [r5 + 6 * 16] | |
8636 pmaddubsw m3, m0, m6 | |
8637 pmulhrsw m3, m7 | |
8638 pmaddubsw m5, m2, m6 | |
8639 pmulhrsw m5, m7 | |
8640 packuswb m3, m5 | |
8641 movu [r0 + 202 * 16], m3 | |
8642 pmaddubsw m3, m1, m6 | |
8643 pmulhrsw m3, m7 | |
8644 pmaddubsw m5, m4, m6 | |
8645 pmulhrsw m5, m7 | |
8646 packuswb m3, m5 | |
8647 movu [r0 + 203 * 16], m3 | |
8648 | |
8649 ; mode 5 [row 6] | |
8650 movu m6, [r5 + 23 * 16] | |
8651 pmaddubsw m3, m0, m6 | |
8652 pmulhrsw m3, m7 | |
8653 pmaddubsw m5, m2, m6 | |
8654 pmulhrsw m5, m7 | |
8655 packuswb m3, m5 | |
8656 movu [r0 + 204 * 16], m3 | |
8657 pmaddubsw m3, m1, m6 | |
8658 pmulhrsw m3, m7 | |
8659 pmaddubsw m5, m4, m6 | |
8660 pmulhrsw m5, m7 | |
8661 packuswb m3, m5 | |
8662 movu [r0 + 205 * 16], m3 | |
8663 | |
8664 ; mode 6 [row 8] | |
8665 movu m6, [r5 + 21 * 16] | |
8666 pmaddubsw m3, m0, m6 | |
8667 pmulhrsw m3, m7 | |
8668 pmaddubsw m5, m2, m6 | |
8669 pmulhrsw m5, m7 | |
8670 packuswb m3, m5 | |
8671 movu [r0 + 272 * 16], m3 | |
8672 | |
8673 ; mode 7 [row 12 - first half] | |
8674 movu [r0 + 344 * 16], m3 | |
8675 | |
8676 pmaddubsw m3, m1, m6 | |
8677 pmulhrsw m3, m7 | |
8678 pmaddubsw m5, m4, m6 | |
8679 pmulhrsw m5, m7 | |
8680 packuswb m3, m5 | |
8681 movu [r0 + 273 * 16], m3 | |
8682 | |
8683 ; mode 7 [row 12 - second half] | |
8684 movu [r0 + 345 * 16], m3 | |
8685 | |
8686 ; mode 7 [row 10] | |
8687 movu m6, [r5 + 3 * 16] | |
8688 pmaddubsw m3, m0, m6 | |
8689 pmulhrsw m3, m7 | |
8690 pmaddubsw m5, m2, m6 | |
8691 pmulhrsw m5, m7 | |
8692 packuswb m3, m5 | |
8693 movu [r0 + 340 * 16], m3 | |
8694 pmaddubsw m3, m1, m6 | |
8695 pmulhrsw m3, m7 | |
8696 pmaddubsw m5, m4, m6 | |
8697 pmulhrsw m5, m7 | |
8698 packuswb m3, m5 | |
8699 movu [r0 + 341 * 16], m3 | |
8700 | |
8701 ; mode 7 [row 11] | |
8702 movu m6, [r5 + 12 * 16] | |
8703 pmaddubsw m3, m0, m6 | |
8704 pmulhrsw m3, m7 | |
8705 pmaddubsw m5, m2, m6 | |
8706 pmulhrsw m5, m7 | |
8707 packuswb m3, m5 | |
8708 movu [r0 + 342 * 16], m3 | |
8709 pmaddubsw m3, m1, m6 | |
8710 pmulhrsw m3, m7 | |
8711 pmaddubsw m5, m4, m6 | |
8712 pmulhrsw m5, m7 | |
8713 packuswb m3, m5 | |
8714 movu [r0 + 343 * 16], m3 | |
8715 | |
8716 ; mode 8 [row 19] | |
8717 movu m6, [r5 + 4 * 16] | |
8718 pmaddubsw m3, m0, m6 | |
8719 pmulhrsw m3, m7 | |
8720 pmaddubsw m5, m2, m6 | |
8721 pmulhrsw m5, m7 | |
8722 packuswb m3, m5 | |
8723 movu [r0 + 422 * 16], m3 | |
8724 pmaddubsw m3, m1, m6 | |
8725 pmulhrsw m3, m7 | |
8726 pmaddubsw m5, m4, m6 | |
8727 pmulhrsw m5, m7 | |
8728 packuswb m3, m5 | |
8729 movu [r0 + 423 * 16], m3 | |
8730 | |
8731 ; mode 8 [row 21] | |
8732 movu m6, [r5 + 14 * 16] | |
8733 pmaddubsw m3, m0, m6 | |
8734 pmulhrsw m3, m7 | |
8735 pmaddubsw m5, m2, m6 | |
8736 pmulhrsw m5, m7 | |
8737 packuswb m3, m5 | |
8738 movu [r0 + 426 * 16], m3 | |
8739 pmaddubsw m3, m1, m6 | |
8740 pmulhrsw m3, m7 | |
8741 pmaddubsw m5, m4, m6 | |
8742 pmulhrsw m5, m7 | |
8743 packuswb m3, m5 | |
8744 movu [r0 + 427 * 16], m3 | |
8745 | |
8746 ; mode 8 [row 22] | |
8747 movu m6, [r5 + 19 * 16] | |
8748 pmaddubsw m3, m0, m6 | |
8749 pmulhrsw m3, m7 | |
8750 pmaddubsw m5, m2, m6 | |
8751 pmulhrsw m5, m7 | |
8752 packuswb m3, m5 | |
8753 movu [r0 + 428 * 16], m3 | |
8754 pmaddubsw m3, m1, m6 | |
8755 pmulhrsw m3, m7 | |
8756 pmaddubsw m5, m4, m6 | |
8757 pmulhrsw m5, m7 | |
8758 packuswb m3, m5 | |
8759 movu [r0 + 429 * 16], m3 | |
8760 | |
8761 ; mode 8 [row 23] | |
8762 movu m6, [r5 + 24 * 16] | |
8763 pmaddubsw m3, m0, m6 | |
8764 pmulhrsw m3, m7 | |
8765 pmaddubsw m5, m2, m6 | |
8766 pmulhrsw m5, m7 | |
8767 packuswb m3, m5 | |
8768 movu [r0 + 430 * 16], m3 | |
8769 pmaddubsw m3, m1, m6 | |
8770 pmulhrsw m3, m7 | |
8771 pmaddubsw m5, m4, m6 | |
8772 pmulhrsw m5, m7 | |
8773 packuswb m3, m5 | |
8774 movu [r0 + 431 * 16], m3 | |
8775 | |
8776 ; mode 8 [row 24] | |
8777 movu m6, [r5 + 29 * 16] | |
8778 pmaddubsw m3, m0, m6 | |
8779 pmulhrsw m3, m7 | |
8780 pmaddubsw m5, m2, m6 | |
8781 pmulhrsw m5, m7 | |
8782 packuswb m3, m5 | |
8783 movu [r0 + 432 * 16], m3 | |
8784 pmaddubsw m3, m1, m6 | |
8785 pmulhrsw m3, m7 | |
8786 pmaddubsw m5, m4, m6 | |
8787 pmulhrsw m5, m7 | |
8788 packuswb m3, m5 | |
8789 movu [r0 + 433 * 16], m3 | |
8790 | |
8791 ; mode 3 [row 4] | |
8792 movu m6, [r5 + 2 * 16] | |
8793 movu m0, [r4 + 5] | |
8794 movd m1, [r4 + 6] | |
8795 palignr m1, m0, 1 | |
8796 punpcklbw m0, m1 | |
8797 pmaddubsw m1, m0, m6 | |
8798 pmulhrsw m1, m7 | |
8799 movu m2, [r4 + 13] | |
8800 movd m3, [r4 + 14] | |
8801 palignr m3, m2, 1 | |
8802 punpcklbw m2, m3 | |
8803 pmaddubsw m3, m2, m6 | |
8804 pmulhrsw m3, m7 | |
8805 packuswb m1, m3 | |
8806 movu [r0 + 72 * 16], m1 | |
8807 | |
8808 ; mode 6 [row 9 - first half] | |
8809 movu [r0 + 274 * 16], m1 | |
8810 | |
8811 ; mode 8 [row 25 - first half] | |
8812 movu [r0 + 434 * 16], m1 | |
8813 | |
8814 movu m1, [r4 + 21] | |
8815 movd m3, [r4 + 22] | |
8816 palignr m3, m1, 1 | |
8817 punpcklbw m1, m3 | |
8818 pmaddubsw m3, m1, m6 | |
8819 pmulhrsw m3, m7 | |
8820 movu m4, [r4 + 29] | |
8821 movd m5, [r4 + 30] | |
8822 palignr m5, m4, 1 | |
8823 punpcklbw m4, m5 | |
8824 pmaddubsw m5, m4, m6 | |
8825 pmulhrsw m5, m7 | |
8826 packuswb m3, m5 | |
8827 movu [r0 + 73 * 16], m3 | |
8828 | |
8829 ; mode 6 [row 9 - second half] | |
8830 movu [r0 + 275 * 16], m3 | |
8831 | |
8832 ; mode 8 [row 25 - second half] | |
8833 movu [r0 + 435 * 16], m3 | |
8834 | |
8835 ; mode 3 [row 5] | |
8836 movu m6, [r5 + 28 * 16] | |
8837 pmaddubsw m3, m0, m6 | |
8838 pmulhrsw m3, m7 | |
8839 pmaddubsw m5, m2, m6 | |
8840 pmulhrsw m5, m7 | |
8841 packuswb m3, m5 | |
8842 movu [r0 + 74 * 16], m3 | |
8843 | |
8844 ; mode 6 [row 11 - first half] | |
8845 movu [r0 + 278 * 16], m3 | |
8846 | |
8847 pmaddubsw m3, m1, m6 | |
8848 pmulhrsw m3, m7 | |
8849 pmaddubsw m5, m4, m6 | |
8850 pmulhrsw m5, m7 | |
8851 packuswb m3, m5 | |
8852 movu [r0 + 75 * 16], m3 | |
8853 | |
8854 ; mode 6 [row 11 - second half] | |
8855 movu [r0 + 279 * 16], m3 | |
8856 | |
8857 ; mode 4 [row 6] | |
8858 movu m6, [r5 + 19 * 16] | |
8859 pmaddubsw m3, m0, m6 | |
8860 pmulhrsw m3, m7 | |
8861 pmaddubsw m5, m2, m6 | |
8862 pmulhrsw m5, m7 | |
8863 packuswb m3, m5 | |
8864 movu [r0 + 140 * 16], m3 | |
8865 pmaddubsw m3, m1, m6 | |
8866 pmulhrsw m3, m7 | |
8867 pmaddubsw m5, m4, m6 | |
8868 pmulhrsw m5, m7 | |
8869 packuswb m3, m5 | |
8870 movu [r0 + 141 * 16], m3 | |
8871 | |
8872 ; mode 5 [row 7] | |
8873 movu m6, [r5 + 8 * 16] | |
8874 pmaddubsw m3, m0, m6 | |
8875 pmulhrsw m3, m7 | |
8876 pmaddubsw m5, m2, m6 | |
8877 pmulhrsw m5, m7 | |
8878 packuswb m3, m5 | |
8879 movu [r0 + 206 * 16], m3 | |
8880 pmaddubsw m3, m1, m6 | |
8881 pmulhrsw m3, m7 | |
8882 pmaddubsw m5, m4, m6 | |
8883 pmulhrsw m5, m7 | |
8884 packuswb m3, m5 | |
8885 movu [r0 + 207 * 16], m3 | |
8886 | |
8887 ; mode 5 [row 8] | |
8888 movu m6, [r5 + 25 * 16] | |
8889 pmaddubsw m3, m0, m6 | |
8890 pmulhrsw m3, m7 | |
8891 pmaddubsw m5, m2, m6 | |
8892 pmulhrsw m5, m7 | |
8893 packuswb m3, m5 | |
8894 movu [r0 + 208 * 16], m3 | |
8895 | |
8896 ; mode 7 [row 16 - first half] | |
8897 movu [r0 + 352 * 16], m3 | |
8898 | |
8899 pmaddubsw m3, m1, m6 | |
8900 pmulhrsw m3, m7 | |
8901 pmaddubsw m5, m4, m6 | |
8902 pmulhrsw m5, m7 | |
8903 packuswb m3, m5 | |
8904 movu [r0 + 209 * 16], m3 | |
8905 | |
8906 ; mode 7 [row 16 - second half] | |
8907 movu [r0 + 353 * 16], m3 | |
8908 | |
8909 ; mode 6 [row 10] | |
8910 movu m6, [r5 + 15 * 16] | |
8911 pmaddubsw m3, m0, m6 | |
8912 pmulhrsw m3, m7 | |
8913 pmaddubsw m5, m2, m6 | |
8914 pmulhrsw m5, m7 | |
8915 packuswb m3, m5 | |
8916 movu [r0 + 276 * 16], m3 | |
8917 pmaddubsw m3, m1, m6 | |
8918 pmulhrsw m3, m7 | |
8919 pmaddubsw m5, m4, m6 | |
8920 pmulhrsw m5, m7 | |
8921 packuswb m3, m5 | |
8922 movu [r0 + 277 * 16], m3 | |
8923 | |
8924 ; mode 7 [row 14] | |
8925 movu m6, [r5 + 7 * 16] | |
8926 pmaddubsw m3, m0, m6 | |
8927 pmulhrsw m3, m7 | |
8928 pmaddubsw m5, m2, m6 | |
8929 pmulhrsw m5, m7 | |
8930 packuswb m3, m5 | |
8931 movu [r0 + 348 * 16], m3 | |
8932 | |
8933 ; mode 8 [row 26 - first half] | |
8934 movu [r0 + 436 * 16], m3 | |
8935 | |
8936 pmaddubsw m3, m1, m6 | |
8937 pmulhrsw m3, m7 | |
8938 pmaddubsw m5, m4, m6 | |
8939 pmulhrsw m5, m7 | |
8940 packuswb m3, m5 | |
8941 movu [r0 + 349 * 16], m3 | |
8942 | |
8943 ; mode 8 [row 26 - second half] | |
8944 movu [r0 + 437 * 16], m3 | |
8945 | |
8946 ; mode 7 [row 15] | |
8947 movu m6, [r5 + 16 * 16] | |
8948 pmaddubsw m3, m0, m6 | |
8949 pmulhrsw m3, m7 | |
8950 pmaddubsw m5, m2, m6 | |
8951 pmulhrsw m5, m7 | |
8952 packuswb m3, m5 | |
8953 movu [r0 + 350 * 16], m3 | |
8954 pmaddubsw m3, m1, m6 | |
8955 pmulhrsw m3, m7 | |
8956 pmaddubsw m5, m4, m6 | |
8957 pmulhrsw m5, m7 | |
8958 packuswb m3, m5 | |
8959 movu [r0 + 351 * 16], m3 | |
8960 | |
8961 ; mode 8 [row 27] | |
8962 movu m6, [r5 + 12 * 16] | |
8963 pmaddubsw m3, m0, m6 | |
8964 pmulhrsw m3, m7 | |
8965 pmaddubsw m5, m2, m6 | |
8966 pmulhrsw m5, m7 | |
8967 packuswb m3, m5 | |
8968 movu [r0 + 438 * 16], m3 | |
8969 pmaddubsw m3, m1, m6 | |
8970 pmulhrsw m3, m7 | |
8971 pmaddubsw m5, m4, m6 | |
8972 pmulhrsw m5, m7 | |
8973 packuswb m3, m5 | |
8974 movu [r0 + 439 * 16], m3 | |
8975 | |
8976 ; mode 8 [row 28] | |
8977 movu m6, [r5 + 17 * 16] | |
8978 pmaddubsw m3, m0, m6 | |
8979 pmulhrsw m3, m7 | |
8980 pmaddubsw m5, m2, m6 | |
8981 pmulhrsw m5, m7 | |
8982 packuswb m3, m5 | |
8983 movu [r0 + 440 * 16], m3 | |
8984 pmaddubsw m3, m1, m6 | |
8985 pmulhrsw m3, m7 | |
8986 pmaddubsw m5, m4, m6 | |
8987 pmulhrsw m5, m7 | |
8988 packuswb m3, m5 | |
8989 movu [r0 + 441 * 16], m3 | |
8990 | |
8991 ; mode 8 [row 29] | |
8992 movu m6, [r5 + 22 * 16] | |
8993 pmaddubsw m3, m0, m6 | |
8994 pmulhrsw m3, m7 | |
8995 pmaddubsw m5, m2, m6 | |
8996 pmulhrsw m5, m7 | |
8997 packuswb m3, m5 | |
8998 movu [r0 + 442 * 16], m3 | |
8999 pmaddubsw m3, m1, m6 | |
9000 pmulhrsw m3, m7 | |
9001 pmaddubsw m5, m4, m6 | |
9002 pmulhrsw m5, m7 | |
9003 packuswb m3, m5 | |
9004 movu [r0 + 443 * 16], m3 | |
9005 | |
9006 ; mode 8 [row 30] | |
9007 movu m6, [r5 + 27 * 16] | |
9008 pmaddubsw m3, m0, m6 | |
9009 pmulhrsw m3, m7 | |
9010 pmaddubsw m5, m2, m6 | |
9011 pmulhrsw m5, m7 | |
9012 packuswb m3, m5 | |
9013 movu [r0 + 444 * 16], m3 | |
9014 pmaddubsw m3, m1, m6 | |
9015 pmulhrsw m3, m7 | |
9016 pmaddubsw m5, m4, m6 | |
9017 pmulhrsw m5, m7 | |
9018 packuswb m3, m5 | |
9019 movu [r0 + 445 * 16], m3 | |
9020 | |
9021 ; mode 3 [row 6] | |
9022 movu m6, [r5 + 22 * 16] | |
9023 movu m0, [r4 + 6] | |
9024 movd m1, [r4 + 7] | |
9025 palignr m1, m0, 1 | |
9026 punpcklbw m0, m1 | |
9027 pmaddubsw m1, m0, m6 | |
9028 pmulhrsw m1, m7 | |
9029 movu m2, [r4 + 14] | |
9030 movd m3, [r4 + 15] | |
9031 palignr m3, m2, 1 | |
9032 punpcklbw m2, m3 | |
9033 pmaddubsw m3, m2, m6 | |
9034 pmulhrsw m3, m7 | |
9035 packuswb m1, m3 | |
9036 movu [r0 + 76 * 16], m1 | |
9037 | |
9038 ; mode 6 [row 13 - first half] | |
9039 movu [r0 + 282 * 16], m1 | |
9040 | |
9041 movu m1, [r4 + 22] | |
9042 movd m3, [r4 + 23] | |
9043 palignr m3, m1, 1 | |
9044 punpcklbw m1, m3 | |
9045 pmaddubsw m3, m1, m6 | |
9046 pmulhrsw m3, m7 | |
9047 movu m4, [r4 + 30] | |
9048 movd m5, [r4 + 31] | |
9049 palignr m5, m4, 1 | |
9050 punpcklbw m4, m5 | |
9051 pmaddubsw m5, m4, m6 | |
9052 pmulhrsw m5, m7 | |
9053 packuswb m3, m5 | |
9054 movu [r0 + 77 * 16], m3 | |
9055 | |
9056 ; mode 6 [row 13 - second half] | |
9057 movu [r0 + 283 * 16], m3 | |
9058 | |
9059 ; mode 4 [row 7] | |
9060 movu m6, [r5 + 8 * 16] | |
9061 pmaddubsw m3, m0, m6 | |
9062 pmulhrsw m3, m7 | |
9063 pmaddubsw m5, m2, m6 | |
9064 pmulhrsw m5, m7 | |
9065 packuswb m3, m5 | |
9066 movu [r0 + 142 * 16], m3 | |
9067 pmaddubsw m3, m1, m6 | |
9068 pmulhrsw m3, m7 | |
9069 pmaddubsw m5, m4, m6 | |
9070 pmulhrsw m5, m7 | |
9071 packuswb m3, m5 | |
9072 movu [r0 + 143 * 16], m3 | |
9073 | |
9074 ; mode 4 [row 8] | |
9075 movu m6, [r5 + 29 * 16] | |
9076 pmaddubsw m3, m0, m6 | |
9077 pmulhrsw m3, m7 | |
9078 pmaddubsw m5, m2, m6 | |
9079 pmulhrsw m5, m7 | |
9080 packuswb m3, m5 | |
9081 movu [r0 + 144 * 16], m3 | |
9082 | |
9083 ; mode 7 [row 20 - first half] | |
9084 movu [r0 + 360 * 16], m3 | |
9085 | |
9086 pmaddubsw m3, m1, m6 | |
9087 pmulhrsw m3, m7 | |
9088 pmaddubsw m5, m4, m6 | |
9089 pmulhrsw m5, m7 | |
9090 packuswb m3, m5 | |
9091 movu [r0 + 145 * 16], m3 | |
9092 | |
9093 ; mode 7 [row 20 - second half] | |
9094 movu [r0 + 361 * 16], m3 | |
9095 | |
9096 ; mode 5 [row 9] | |
9097 movu m6, [r5 + 10 * 16] | |
9098 pmaddubsw m3, m0, m6 | |
9099 pmulhrsw m3, m7 | |
9100 pmaddubsw m5, m2, m6 | |
9101 pmulhrsw m5, m7 | |
9102 packuswb m3, m5 | |
9103 movu [r0 + 210 * 16], m3 | |
9104 pmaddubsw m3, m1, m6 | |
9105 pmulhrsw m3, m7 | |
9106 pmaddubsw m5, m4, m6 | |
9107 pmulhrsw m5, m7 | |
9108 packuswb m3, m5 | |
9109 movu [r0 + 211 * 16], m3 | |
9110 | |
9111 ; mode 5 [row 10] | |
9112 movu m6, [r5 + 27 * 16] | |
9113 pmaddubsw m3, m0, m6 | |
9114 pmulhrsw m3, m7 | |
9115 pmaddubsw m5, m2, m6 | |
9116 pmulhrsw m5, m7 | |
9117 packuswb m3, m5 | |
9118 movu [r0 + 212 * 16], m3 | |
9119 pmaddubsw m3, m1, m6 | |
9120 pmulhrsw m3, m7 | |
9121 pmaddubsw m5, m4, m6 | |
9122 pmulhrsw m5, m7 | |
9123 packuswb m3, m5 | |
9124 movu [r0 + 213 * 16], m3 | |
9125 | |
9126 ; mode 7 [row 17] | |
9127 movu m6, [r5 + 2 * 16] | |
9128 pmaddubsw m3, m0, m6 | |
9129 pmulhrsw m3, m7 | |
9130 pmaddubsw m5, m2, m6 | |
9131 pmulhrsw m5, m7 | |
9132 packuswb m3, m5 | |
9133 movu [r0 + 354 * 16], m3 | |
9134 pmaddubsw m3, m1, m6 | |
9135 pmulhrsw m3, m7 | |
9136 pmaddubsw m5, m4, m6 | |
9137 pmulhrsw m5, m7 | |
9138 packuswb m3, m5 | |
9139 movu [r0 + 355 * 16], m3 | |
9140 | |
9141 ; mode 7 [row 18] | |
9142 movu m6, [r5 + 11 * 16] | |
9143 pmaddubsw m3, m0, m6 | |
9144 pmulhrsw m3, m7 | |
9145 pmaddubsw m5, m2, m6 | |
9146 pmulhrsw m5, m7 | |
9147 packuswb m3, m5 | |
9148 movu [r0 + 356 * 16], m3 | |
9149 pmaddubsw m3, m1, m6 | |
9150 pmulhrsw m3, m7 | |
9151 pmaddubsw m5, m4, m6 | |
9152 pmulhrsw m5, m7 | |
9153 packuswb m3, m5 | |
9154 movu [r0 + 357 * 16], m3 | |
9155 | |
9156 ; mode 7 [row 19] | |
9157 movu m6, [r5 + 20 * 16] | |
9158 pmaddubsw m3, m0, m6 | |
9159 pmulhrsw m3, m7 | |
9160 pmaddubsw m5, m2, m6 | |
9161 pmulhrsw m5, m7 | |
9162 packuswb m3, m5 | |
9163 movu [r0 + 358 * 16], m3 | |
9164 pmaddubsw m3, m1, m6 | |
9165 pmulhrsw m3, m7 | |
9166 pmaddubsw m5, m4, m6 | |
9167 pmulhrsw m5, m7 | |
9168 packuswb m3, m5 | |
9169 movu [r0 + 359 * 16], m3 | |
9170 | |
9171 ; mode 6 [row 12] | |
9172 movu m6, [r5 + 9 * 16] | |
9173 pmaddubsw m3, m0, m6 | |
9174 pmulhrsw m3, m7 | |
9175 pmaddubsw m5, m2, m6 | |
9176 pmulhrsw m5, m7 | |
9177 packuswb m3, m5 | |
9178 movu [r0 + 280 * 16], m3 | |
9179 pmaddubsw m3, m1, m6 | |
9180 pmulhrsw m3, m7 | |
9181 pmaddubsw m5, m4, m6 | |
9182 pmulhrsw m5, m7 | |
9183 packuswb m3, m5 | |
9184 movu [r0 + 281 * 16], m3 | |
9185 | |
9186 ; mode 3 [row 7] | |
9187 movu m6, [r5 + 16 * 16] | |
9188 movu m0, [r4 + 7] | |
9189 movd m1, [r4 + 8] | |
9190 palignr m1, m0, 1 | |
9191 punpcklbw m0, m1 | |
9192 pmaddubsw m1, m0, m6 | |
9193 pmulhrsw m1, m7 | |
9194 movu m2, [r4 + 15] | |
9195 movd m3, [r4 + 16] | |
9196 palignr m3, m2, 1 | |
9197 punpcklbw m2, m3 | |
9198 pmaddubsw m3, m2, m6 | |
9199 pmulhrsw m3, m7 | |
9200 packuswb m1, m3 | |
9201 movu [r0 + 78 * 16], m1 | |
9202 | |
9203 ; mode 6 [row 15 - first half] | |
9204 movu [r0 + 286 * 16], m1 | |
9205 | |
9206 movu m1, [r4 + 23] | |
9207 movd m3, [r4 + 24] | |
9208 palignr m3, m1, 1 | |
9209 punpcklbw m1, m3 | |
9210 pmaddubsw m3, m1, m6 | |
9211 pmulhrsw m3, m7 | |
9212 movu m4, [r4 + 31] | |
9213 movd m5, [r4 + 32] | |
9214 palignr m5, m4, 1 | |
9215 punpcklbw m4, m5 | |
9216 pmaddubsw m5, m4, m6 | |
9217 pmulhrsw m5, m7 | |
9218 packuswb m3, m5 | |
9219 movu [r0 + 79 * 16], m3 | |
9220 | |
9221 ; mode 6 [row 15 - second half] | |
9222 movu [r0 + 287 * 16], m3 | |
9223 | |
9224 ; mode 4 [row 9] | |
9225 movu m6, [r5 + 18 * 16] | |
9226 pmaddubsw m3, m0, m6 | |
9227 pmulhrsw m3, m7 | |
9228 pmaddubsw m5, m2, m6 | |
9229 pmulhrsw m5, m7 | |
9230 packuswb m3, m5 | |
9231 movu [r0 + 146 * 16], m3 | |
9232 pmaddubsw m3, m1, m6 | |
9233 pmulhrsw m3, m7 | |
9234 pmaddubsw m5, m4, m6 | |
9235 pmulhrsw m5, m7 | |
9236 packuswb m3, m5 | |
9237 movu [r0 + 147 * 16], m3 | |
9238 | |
9239 ; mode 5 [row 11] | |
9240 movu m6, [r5 + 12 * 16] | |
9241 pmaddubsw m3, m0, m6 | |
9242 pmulhrsw m3, m7 | |
9243 pmaddubsw m5, m2, m6 | |
9244 pmulhrsw m5, m7 | |
9245 packuswb m3, m5 | |
9246 movu [r0 + 214 * 16], m3 | |
9247 pmaddubsw m3, m1, m6 | |
9248 pmulhrsw m3, m7 | |
9249 pmaddubsw m5, m4, m6 | |
9250 pmulhrsw m5, m7 | |
9251 packuswb m3, m5 | |
9252 movu [r0 + 215 * 16], m3 | |
9253 | |
9254 ; mode 5 [row 12] | |
9255 movu m6, [r5 + 29 * 16] | |
9256 pmaddubsw m3, m0, m6 | |
9257 pmulhrsw m3, m7 | |
9258 pmaddubsw m5, m2, m6 | |
9259 pmulhrsw m5, m7 | |
9260 packuswb m3, m5 | |
9261 movu [r0 + 216 * 16], m3 | |
9262 | |
9263 ; mode 6 [row 16 - first half] | |
9264 movu [r0 + 288 * 16], m3 | |
9265 | |
9266 pmaddubsw m3, m1, m6 | |
9267 pmulhrsw m3, m7 | |
9268 pmaddubsw m5, m4, m6 | |
9269 pmulhrsw m5, m7 | |
9270 packuswb m3, m5 | |
9271 movu [r0 + 217 * 16], m3 | |
9272 | |
9273 ; mode 6 [row 16 - second half] | |
9274 movu [r0 + 289 * 16], m3 | |
9275 | |
9276 ; mode 6 [row 14] | |
9277 movu m6, [r5 + 3 * 16] | |
9278 pmaddubsw m3, m0, m6 | |
9279 pmulhrsw m3, m7 | |
9280 pmaddubsw m5, m2, m6 | |
9281 pmulhrsw m5, m7 | |
9282 packuswb m3, m5 | |
9283 movu [r0 + 284 * 16], m3 | |
9284 pmaddubsw m3, m1, m6 | |
9285 pmulhrsw m3, m7 | |
9286 pmaddubsw m5, m4, m6 | |
9287 pmulhrsw m5, m7 | |
9288 packuswb m3, m5 | |
9289 movu [r0 + 285 * 16], m3 | |
9290 | |
9291 ; mode 7 [row 21] | |
9292 movu m6, [r5 + 6 * 16] | |
9293 pmaddubsw m3, m0, m6 | |
9294 pmulhrsw m3, m7 | |
9295 pmaddubsw m5, m2, m6 | |
9296 pmulhrsw m5, m7 | |
9297 packuswb m3, m5 | |
9298 movu [r0 + 362 * 16], m3 | |
9299 pmaddubsw m3, m1, m6 | |
9300 pmulhrsw m3, m7 | |
9301 pmaddubsw m5, m4, m6 | |
9302 pmulhrsw m5, m7 | |
9303 packuswb m3, m5 | |
9304 movu [r0 + 363 * 16], m3 | |
9305 | |
9306 ; mode 7 [row 22] | |
9307 movu m6, [r5 + 15 * 16] | |
9308 pmaddubsw m3, m0, m6 | |
9309 pmulhrsw m3, m7 | |
9310 pmaddubsw m5, m2, m6 | |
9311 pmulhrsw m5, m7 | |
9312 packuswb m3, m5 | |
9313 movu [r0 + 364 * 16], m3 | |
9314 pmaddubsw m3, m1, m6 | |
9315 pmulhrsw m3, m7 | |
9316 pmaddubsw m5, m4, m6 | |
9317 pmulhrsw m5, m7 | |
9318 packuswb m3, m5 | |
9319 movu [r0 + 365 * 16], m3 | |
9320 | |
9321 ; mode 7 [row 23] | |
9322 movu m6, [r5 + 24 * 16] | |
9323 pmaddubsw m3, m0, m6 | |
9324 pmulhrsw m3, m7 | |
9325 pmaddubsw m5, m2, m6 | |
9326 pmulhrsw m5, m7 | |
9327 packuswb m3, m5 | |
9328 movu [r0 + 366 * 16], m3 | |
9329 pmaddubsw m3, m1, m6 | |
9330 pmulhrsw m3, m7 | |
9331 pmaddubsw m5, m4, m6 | |
9332 pmulhrsw m5, m7 | |
9333 packuswb m3, m5 | |
9334 movu [r0 + 367 * 16], m3 | |
9335 | |
9336 ; mode 3 [row 8] | |
9337 movu m6, [r5 + 10 * 16] | |
9338 movu m0, [r4 + 8] | |
9339 movd m1, [r4 + 9] | |
9340 palignr m1, m0, 1 | |
9341 punpcklbw m0, m1 | |
9342 pmaddubsw m1, m0, m6 | |
9343 pmulhrsw m1, m7 | |
9344 movu m2, [r4 + 16] | |
9345 movd m3, [r4 + 17] | |
9346 palignr m3, m2, 1 | |
9347 punpcklbw m2, m3 | |
9348 pmaddubsw m3, m2, m6 | |
9349 pmulhrsw m3, m7 | |
9350 packuswb m1, m3 | |
9351 movu [r0 + 80 * 16], m1 | |
9352 | |
9353 ; mode 6 [row 17 - first half] | |
9354 movu [r0 + 290 * 16], m1 | |
9355 | |
9356 ; mode 7 [row 25 - first half] | |
9357 movu [r0 + 370 * 16], m1 | |
9358 | |
9359 movu m1, [r4 + 24] | |
9360 movd m3, [r4 + 25] | |
9361 palignr m3, m1, 1 | |
9362 punpcklbw m1, m3 | |
9363 pmaddubsw m3, m1, m6 | |
9364 pmulhrsw m3, m7 | |
9365 movu m4, [r4 + 32] | |
9366 movd m5, [r4 + 33] | |
9367 palignr m5, m4, 1 | |
9368 punpcklbw m4, m5 | |
9369 pmaddubsw m5, m4, m6 | |
9370 pmulhrsw m5, m7 | |
9371 packuswb m3, m5 | |
9372 movu [r0 + 81 * 16], m3 | |
9373 | |
9374 ; mode 6 [row 17 - second half] | |
9375 movu [r0 + 291 * 16], m3 | |
9376 | |
9377 ; mode 7 [row 25 - second half] | |
9378 movu [r0 + 371 * 16], m3 | |
9379 | |
9380 ; mode 4 [row 10] | |
9381 movu m6, [r5 + 7 * 16] | |
9382 pmaddubsw m3, m0, m6 | |
9383 pmulhrsw m3, m7 | |
9384 pmaddubsw m5, m2, m6 | |
9385 pmulhrsw m5, m7 | |
9386 packuswb m3, m5 | |
9387 movu [r0 + 148 * 16], m3 | |
9388 pmaddubsw m3, m1, m6 | |
9389 pmulhrsw m3, m7 | |
9390 pmaddubsw m5, m4, m6 | |
9391 pmulhrsw m5, m7 | |
9392 packuswb m3, m5 | |
9393 movu [r0 + 149 * 16], m3 | |
9394 | |
9395 ; mode 4 [row 11] | |
9396 movu m6, [r5 + 28 * 16] | |
9397 pmaddubsw m3, m0, m6 | |
9398 pmulhrsw m3, m7 | |
9399 pmaddubsw m5, m2, m6 | |
9400 pmulhrsw m5, m7 | |
9401 packuswb m3, m5 | |
9402 movu [r0 + 150 * 16], m3 | |
9403 | |
9404 ; mode 7 [row 27 - first half] | |
9405 movu [r0 + 374 * 16], m3 | |
9406 | |
9407 pmaddubsw m3, m1, m6 | |
9408 pmulhrsw m3, m7 | |
9409 pmaddubsw m5, m4, m6 | |
9410 pmulhrsw m5, m7 | |
9411 packuswb m3, m5 | |
9412 movu [r0 + 151 * 16], m3 | |
9413 | |
9414 ; mode 7 [row 27 - second half] | |
9415 movu [r0 + 375 * 16], m3 | |
9416 | |
9417 ; mode 5 [row 13] | |
9418 movu m6, [r5 + 14 * 16] | |
9419 pmaddubsw m3, m0, m6 | |
9420 pmulhrsw m3, m7 | |
9421 pmaddubsw m5, m2, m6 | |
9422 pmulhrsw m5, m7 | |
9423 packuswb m3, m5 | |
9424 movu [r0 + 218 * 16], m3 | |
9425 pmaddubsw m3, m1, m6 | |
9426 pmulhrsw m3, m7 | |
9427 pmaddubsw m5, m4, m6 | |
9428 pmulhrsw m5, m7 | |
9429 packuswb m3, m5 | |
9430 movu [r0 + 219 * 16], m3 | |
9431 | |
9432 ; mode 5 [row 14] | |
9433 movu m6, [r5 + 31 * 16] | |
9434 pmaddubsw m3, m0, m6 | |
9435 pmulhrsw m3, m7 | |
9436 pmaddubsw m5, m2, m6 | |
9437 pmulhrsw m5, m7 | |
9438 packuswb m3, m5 | |
9439 movu [r0 + 220 * 16], m3 | |
9440 pmaddubsw m3, m1, m6 | |
9441 pmulhrsw m3, m7 | |
9442 pmaddubsw m5, m4, m6 | |
9443 pmulhrsw m5, m7 | |
9444 packuswb m3, m5 | |
9445 movu [r0 + 221 * 16], m3 | |
9446 | |
9447 ; mode 6 [row 18] | |
9448 movu m6, [r5 + 23 * 16] | |
9449 pmaddubsw m3, m0, m6 | |
9450 pmulhrsw m3, m7 | |
9451 pmaddubsw m5, m2, m6 | |
9452 pmulhrsw m5, m7 | |
9453 packuswb m3, m5 | |
9454 movu [r0 + 292 * 16], m3 | |
9455 pmaddubsw m3, m1, m6 | |
9456 pmulhrsw m3, m7 | |
9457 pmaddubsw m5, m4, m6 | |
9458 pmulhrsw m5, m7 | |
9459 packuswb m3, m5 | |
9460 movu [r0 + 293 * 16], m3 | |
9461 | |
9462 ; mode 7 [row 24] | |
9463 movu m6, [r5 + 1 * 16] | |
9464 pmaddubsw m3, m0, m6 | |
9465 pmulhrsw m3, m7 | |
9466 pmaddubsw m5, m2, m6 | |
9467 pmulhrsw m5, m7 | |
9468 packuswb m3, m5 | |
9469 movu [r0 + 368 * 16], m3 | |
9470 pmaddubsw m3, m1, m6 | |
9471 pmulhrsw m3, m7 | |
9472 pmaddubsw m5, m4, m6 | |
9473 pmulhrsw m5, m7 | |
9474 packuswb m3, m5 | |
9475 movu [r0 + 369 * 16], m3 | |
9476 | |
9477 ; mode 7 [row 26] | |
9478 movu m6, [r5 + 19 * 16] | |
9479 pmaddubsw m3, m0, m6 | |
9480 pmulhrsw m3, m7 | |
9481 pmaddubsw m5, m2, m6 | |
9482 pmulhrsw m5, m7 | |
9483 packuswb m3, m5 | |
9484 movu [r0 + 372 * 16], m3 | |
9485 pmaddubsw m3, m1, m6 | |
9486 pmulhrsw m3, m7 | |
9487 pmaddubsw m5, m4, m6 | |
9488 pmulhrsw m5, m7 | |
9489 packuswb m3, m5 | |
9490 movu [r0 + 373 * 16], m3 | |
9491 | |
9492 ; mode 3 [row 9] | |
9493 movu m6, [r5 + 4 * 16] | |
9494 movu m0, [r4 + 9] | |
9495 movd m1, [r4 + 10] | |
9496 palignr m1, m0, 1 | |
9497 punpcklbw m0, m1 | |
9498 pmaddubsw m1, m0, m6 | |
9499 pmulhrsw m1, m7 | |
9500 movu m2, [r4 + 17] | |
9501 movd m3, [r4 + 18] | |
9502 palignr m3, m2, 1 | |
9503 punpcklbw m2, m3 | |
9504 pmaddubsw m3, m2, m6 | |
9505 pmulhrsw m3, m7 | |
9506 packuswb m1, m3 | |
9507 movu [r0 + 82 * 16], m1 | |
9508 | |
9509 ; mode 6 [row 19 - first half] | |
9510 movu [r0 + 294 * 16], m1 | |
9511 | |
9512 movu m1, [r4 + 25] | |
9513 movd m3, [r4 + 26] | |
9514 palignr m3, m1, 1 | |
9515 punpcklbw m1, m3 | |
9516 pmaddubsw m3, m1, m6 | |
9517 pmulhrsw m3, m7 | |
9518 movu m4, [r4 + 33] | |
9519 movd m5, [r4 + 34] | |
9520 palignr m5, m4, 1 | |
9521 punpcklbw m4, m5 | |
9522 pmaddubsw m5, m4, m6 | |
9523 pmulhrsw m5, m7 | |
9524 packuswb m3, m5 | |
9525 movu [r0 + 83 * 16], m3 | |
9526 | |
9527 ; mode 6 [row 19 - second half] | |
9528 movu [r0 + 295 * 16], m3 | |
9529 | |
9530 ; mode 4 [row 12] | |
9531 movu m6, [r5 + 17 * 16] | |
9532 pmaddubsw m3, m0, m6 | |
9533 pmulhrsw m3, m7 | |
9534 pmaddubsw m5, m2, m6 | |
9535 pmulhrsw m5, m7 | |
9536 packuswb m3, m5 | |
9537 movu [r0 + 152 * 16], m3 | |
9538 | |
9539 ; mode 6 [row 20 - first half] | |
9540 movu [r0 + 296 * 16], m3 | |
9541 | |
9542 pmaddubsw m3, m1, m6 | |
9543 pmulhrsw m3, m7 | |
9544 pmaddubsw m5, m4, m6 | |
9545 pmulhrsw m5, m7 | |
9546 packuswb m3, m5 | |
9547 movu [r0 + 153 * 16], m3 | |
9548 | |
9549 ; mode 6 [row 20 - second half] | |
9550 movu [r0 + 297 * 16], m3 | |
9551 | |
9552 ; mode 3 [row 10] | |
9553 movu m6, [r5 + 30 * 16] | |
9554 pmaddubsw m3, m0, m6 | |
9555 pmulhrsw m3, m7 | |
9556 pmaddubsw m5, m2, m6 | |
9557 pmulhrsw m5, m7 | |
9558 packuswb m3, m5 | |
9559 movu [r0 + 84 * 16], m3 | |
9560 | |
9561 ; mode 6 [row 21 - first half] | |
9562 movu [r0 + 298 * 16], m3 | |
9563 | |
9564 pmaddubsw m3, m1, m6 | |
9565 pmulhrsw m3, m7 | |
9566 pmaddubsw m5, m4, m6 | |
9567 pmulhrsw m5, m7 | |
9568 packuswb m3, m5 | |
9569 movu [r0 + 85 * 16], m3 | |
9570 | |
9571 ; mode 6 [row 21 - second half] | |
9572 movu [r0 + 299 * 16], m3 | |
9573 | |
9574 ; mode 5 [row 15] | |
9575 movu m6, [r5 + 16 * 16] | |
9576 pmaddubsw m3, m0, m6 | |
9577 pmulhrsw m3, m7 | |
9578 pmaddubsw m5, m2, m6 | |
9579 pmulhrsw m5, m7 | |
9580 packuswb m3, m5 | |
9581 movu [r0 + 222 * 16], m3 | |
9582 pmaddubsw m3, m1, m6 | |
9583 pmulhrsw m3, m7 | |
9584 pmaddubsw m5, m4, m6 | |
9585 pmulhrsw m5, m7 | |
9586 packuswb m3, m5 | |
9587 movu [r0 + 223 * 16], m3 | |
9588 | |
9589 ; mode 7 [row 28] | |
9590 movu m6, [r5 + 5 * 16] | |
9591 pmaddubsw m3, m0, m6 | |
9592 pmulhrsw m3, m7 | |
9593 pmaddubsw m5, m2, m6 | |
9594 pmulhrsw m5, m7 | |
9595 packuswb m3, m5 | |
9596 movu [r0 + 376 * 16], m3 | |
9597 pmaddubsw m3, m1, m6 | |
9598 pmulhrsw m3, m7 | |
9599 pmaddubsw m5, m4, m6 | |
9600 pmulhrsw m5, m7 | |
9601 packuswb m3, m5 | |
9602 movu [r0 + 377 * 16], m3 | |
9603 | |
9604 ; mode 7 [row 29] | |
9605 movu m6, [r5 + 14 * 16] | |
9606 pmaddubsw m3, m0, m6 | |
9607 pmulhrsw m3, m7 | |
9608 pmaddubsw m5, m2, m6 | |
9609 pmulhrsw m5, m7 | |
9610 packuswb m3, m5 | |
9611 movu [r0 + 378 * 16], m3 | |
9612 pmaddubsw m3, m1, m6 | |
9613 pmulhrsw m3, m7 | |
9614 pmaddubsw m5, m4, m6 | |
9615 pmulhrsw m5, m7 | |
9616 packuswb m3, m5 | |
9617 movu [r0 + 379 * 16], m3 | |
9618 | |
9619 ; mode 7 [row 30] | |
9620 movu m6, [r5 + 23 * 16] | |
9621 pmaddubsw m3, m0, m6 | |
9622 pmulhrsw m3, m7 | |
9623 pmaddubsw m5, m2, m6 | |
9624 pmulhrsw m5, m7 | |
9625 packuswb m3, m5 | |
9626 movu [r0 + 380 * 16], m3 | |
9627 pmaddubsw m3, m1, m6 | |
9628 pmulhrsw m3, m7 | |
9629 pmaddubsw m5, m4, m6 | |
9630 pmulhrsw m5, m7 | |
9631 packuswb m3, m5 | |
9632 movu [r0 + 381 * 16], m3 | |
9633 | |
9634 ; mode 3 [row 11] | |
9635 movu m6, [r5 + 24 * 16] | |
9636 movu m0, [r4 + 10] | |
9637 movd m1, [r4 + 11] | |
9638 palignr m1, m0, 1 | |
9639 punpcklbw m0, m1 | |
9640 pmaddubsw m1, m0, m6 | |
9641 pmulhrsw m1, m7 | |
9642 movu m2, [r4 + 18] | |
9643 movd m3, [r4 + 19] | |
9644 palignr m3, m2, 1 | |
9645 punpcklbw m2, m3 | |
9646 pmaddubsw m3, m2, m6 | |
9647 pmulhrsw m3, m7 | |
9648 packuswb m1, m3 | |
9649 movu [r0 + 86 * 16], m1 | |
9650 | |
9651 ; mode 6 [row 23 - first half] | |
9652 movu [r0 + 302 * 16], m1 | |
9653 | |
9654 movu m1, [r4 + 26] | |
9655 movd m3, [r4 + 27] | |
9656 palignr m3, m1, 1 | |
9657 punpcklbw m1, m3 | |
9658 pmaddubsw m3, m1, m6 | |
9659 pmulhrsw m3, m7 | |
9660 movu m4, [r4 + 34] | |
9661 movd m5, [r4 + 35] | |
9662 palignr m5, m4, 1 | |
9663 punpcklbw m4, m5 | |
9664 pmaddubsw m5, m4, m6 | |
9665 pmulhrsw m5, m7 | |
9666 packuswb m3, m5 | |
9667 movu [r0 + 87 * 16], m3 | |
9668 | |
9669 ; mode 6 [row 23 - second half] | |
9670 movu [r0 + 303 * 16], m3 | |
9671 | |
9672 ; mode 4 [row 13] | |
9673 movu m6, [r5 + 6 * 16] | |
9674 pmaddubsw m3, m0, m6 | |
9675 pmulhrsw m3, m7 | |
9676 pmaddubsw m5, m2, m6 | |
9677 pmulhrsw m5, m7 | |
9678 packuswb m3, m5 | |
9679 movu [r0 + 154 * 16], m3 | |
9680 pmaddubsw m3, m1, m6 | |
9681 pmulhrsw m3, m7 | |
9682 pmaddubsw m5, m4, m6 | |
9683 pmulhrsw m5, m7 | |
9684 packuswb m3, m5 | |
9685 movu [r0 + 155 * 16], m3 | |
9686 | |
9687 ; mode 4 [row 14] | |
9688 movu m6, [r5 + 27 * 16] | |
9689 pmaddubsw m3, m0, m6 | |
9690 pmulhrsw m3, m7 | |
9691 pmaddubsw m5, m2, m6 | |
9692 pmulhrsw m5, m7 | |
9693 packuswb m3, m5 | |
9694 movu [r0 + 156 * 16], m3 | |
9695 pmaddubsw m3, m1, m6 | |
9696 pmulhrsw m3, m7 | |
9697 pmaddubsw m5, m4, m6 | |
9698 pmulhrsw m5, m7 | |
9699 packuswb m3, m5 | |
9700 movu [r0 + 157 * 16], m3 | |
9701 | |
9702 ; mode 5 [row 16] | |
9703 movu m6, [r5 + 1 * 16] | |
9704 pmaddubsw m3, m0, m6 | |
9705 pmulhrsw m3, m7 | |
9706 pmaddubsw m5, m2, m6 | |
9707 pmulhrsw m5, m7 | |
9708 packuswb m3, m5 | |
9709 movu [r0 + 224 * 16], m3 | |
9710 pmaddubsw m3, m1, m6 | |
9711 pmulhrsw m3, m7 | |
9712 pmaddubsw m5, m4, m6 | |
9713 pmulhrsw m5, m7 | |
9714 packuswb m3, m5 | |
9715 movu [r0 + 225 * 16], m3 | |
9716 | |
9717 ; mode 5 [row 17] | |
9718 movu m6, [r5 + 18 * 16] | |
9719 pmaddubsw m3, m0, m6 | |
9720 pmulhrsw m3, m7 | |
9721 pmaddubsw m5, m2, m6 | |
9722 pmulhrsw m5, m7 | |
9723 packuswb m3, m5 | |
9724 movu [r0 + 226 * 16], m3 | |
9725 pmaddubsw m3, m1, m6 | |
9726 pmulhrsw m3, m7 | |
9727 pmaddubsw m5, m4, m6 | |
9728 pmulhrsw m5, m7 | |
9729 packuswb m3, m5 | |
9730 movu [r0 + 227 * 16], m3 | |
9731 | |
9732 ; mode 6 [row 22] | |
9733 movu m6, [r5 + 11 * 16] | |
9734 pmaddubsw m3, m0, m6 | |
9735 pmulhrsw m3, m7 | |
9736 pmaddubsw m5, m2, m6 | |
9737 pmulhrsw m5, m7 | |
9738 packuswb m3, m5 | |
9739 movu [r0 + 300 * 16], m3 | |
9740 pmaddubsw m3, m1, m6 | |
9741 pmulhrsw m3, m7 | |
9742 pmaddubsw m5, m4, m6 | |
9743 pmulhrsw m5, m7 | |
9744 packuswb m3, m5 | |
9745 movu [r0 + 301 * 16], m3 | |
9746 | |
9747 ; mode 3 [row 12] | |
9748 movu m6, [r5 + 18 * 16] | |
9749 movu m0, [r4 + 11] | |
9750 movd m1, [r4 + 12] | |
9751 palignr m1, m0, 1 | |
9752 punpcklbw m0, m1 | |
9753 pmaddubsw m1, m0, m6 | |
9754 pmulhrsw m1, m7 | |
9755 movu m2, [r4 + 19] | |
9756 movd m3, [r4 + 20] | |
9757 palignr m3, m2, 1 | |
9758 punpcklbw m2, m3 | |
9759 pmaddubsw m3, m2, m6 | |
9760 pmulhrsw m3, m7 | |
9761 packuswb m1, m3 | |
9762 movu [r0 + 88 * 16], m1 | |
9763 | |
9764 ; mode 6 [row 25 - first half] | |
9765 movu [r0 + 306 * 16], m1 | |
9766 | |
9767 movu m1, [r4 + 27] | |
9768 movd m3, [r4 + 28] | |
9769 palignr m3, m1, 1 | |
9770 punpcklbw m1, m3 | |
9771 pmaddubsw m3, m1, m6 | |
9772 pmulhrsw m3, m7 | |
9773 movu m4, [r4 + 35] | |
9774 movd m5, [r4 + 36] | |
9775 palignr m5, m4, 1 | |
9776 punpcklbw m4, m5 | |
9777 pmaddubsw m5, m4, m6 | |
9778 pmulhrsw m5, m7 | |
9779 packuswb m3, m5 | |
9780 movu [r0 + 89 * 16], m3 | |
9781 | |
9782 ; mode 6 [row 25 - second half] | |
9783 movu [r0 + 307 * 16], m3 | |
9784 | |
9785 ; mode 4 [row 15] | |
9786 movu m6, [r5 + 16 * 16] | |
9787 pmaddubsw m3, m0, m6 | |
9788 pmulhrsw m3, m7 | |
9789 pmaddubsw m5, m2, m6 | |
9790 pmulhrsw m5, m7 | |
9791 packuswb m3, m5 | |
9792 movu [r0 + 158 * 16], m3 | |
9793 pmaddubsw m3, m1, m6 | |
9794 pmulhrsw m3, m7 | |
9795 pmaddubsw m5, m4, m6 | |
9796 pmulhrsw m5, m7 | |
9797 packuswb m3, m5 | |
9798 movu [r0 + 159 * 16], m3 | |
9799 | |
9800 ; mode 5 [row 18] | |
9801 movu m6, [r5 + 3 * 16] | |
9802 pmaddubsw m3, m0, m6 | |
9803 pmulhrsw m3, m7 | |
9804 pmaddubsw m5, m2, m6 | |
9805 pmulhrsw m5, m7 | |
9806 packuswb m3, m5 | |
9807 movu [r0 + 228 * 16], m3 | |
9808 pmaddubsw m3, m1, m6 | |
9809 pmulhrsw m3, m7 | |
9810 pmaddubsw m5, m4, m6 | |
9811 pmulhrsw m5, m7 | |
9812 packuswb m3, m5 | |
9813 movu [r0 + 229 * 16], m3 | |
9814 | |
9815 ; mode 5 [row 19] | |
9816 movu m6, [r5 + 20 * 16] | |
9817 pmaddubsw m3, m0, m6 | |
9818 pmulhrsw m3, m7 | |
9819 pmaddubsw m5, m2, m6 | |
9820 pmulhrsw m5, m7 | |
9821 packuswb m3, m5 | |
9822 movu [r0 + 230 * 16], m3 | |
9823 pmaddubsw m3, m1, m6 | |
9824 pmulhrsw m3, m7 | |
9825 pmaddubsw m5, m4, m6 | |
9826 pmulhrsw m5, m7 | |
9827 packuswb m3, m5 | |
9828 movu [r0 + 231 * 16], m3 | |
9829 | |
9830 ; mode 6 [row 24] | |
9831 movu m6, [r5 + 5 * 16] | |
9832 pmaddubsw m3, m0, m6 | |
9833 pmulhrsw m3, m7 | |
9834 pmaddubsw m5, m2, m6 | |
9835 pmulhrsw m5, m7 | |
9836 packuswb m3, m5 | |
9837 movu [r0 + 304 * 16], m3 | |
9838 pmaddubsw m3, m1, m6 | |
9839 pmulhrsw m3, m7 | |
9840 pmaddubsw m5, m4, m6 | |
9841 pmulhrsw m5, m7 | |
9842 packuswb m3, m5 | |
9843 movu [r0 + 305 * 16], m3 | |
9844 | |
9845 ; mode 6 [row 26] | |
9846 movu m6, [r5 + 31 * 16] | |
9847 pmaddubsw m3, m0, m6 | |
9848 pmulhrsw m3, m7 | |
9849 pmaddubsw m5, m2, m6 | |
9850 pmulhrsw m5, m7 | |
9851 packuswb m3, m5 | |
9852 movu [r0 + 308 * 16], m3 | |
9853 pmaddubsw m3, m1, m6 | |
9854 pmulhrsw m3, m7 | |
9855 pmaddubsw m5, m4, m6 | |
9856 pmulhrsw m5, m7 | |
9857 packuswb m3, m5 | |
9858 movu [r0 + 309 * 16], m3 | |
9859 | |
9860 ; mode 3 [row 13] | |
9861 movu m6, [r5 + 12 * 16] | |
9862 movu m0, [r4 + 12] | |
9863 movd m1, [r4 + 13] | |
9864 palignr m1, m0, 1 | |
9865 punpcklbw m0, m1 | |
9866 pmaddubsw m1, m0, m6 | |
9867 pmulhrsw m1, m7 | |
9868 movu m2, [r4 + 20] | |
9869 movd m3, [r4 + 21] | |
9870 palignr m3, m2, 1 | |
9871 punpcklbw m2, m3 | |
9872 pmaddubsw m3, m2, m6 | |
9873 pmulhrsw m3, m7 | |
9874 packuswb m1, m3 | |
9875 movu [r0 + 90 * 16], m1 | |
9876 | |
9877 movu m1, [r4 + 28] | |
9878 movd m3, [r4 + 29] | |
9879 palignr m3, m1, 1 | |
9880 punpcklbw m1, m3 | |
9881 pmaddubsw m3, m1, m6 | |
9882 pmulhrsw m3, m7 | |
9883 movu m4, [r4 + 36] | |
9884 movd m5, [r4 + 37] | |
9885 palignr m5, m4, 1 | |
9886 punpcklbw m4, m5 | |
9887 pmaddubsw m5, m4, m6 | |
9888 pmulhrsw m5, m7 | |
9889 packuswb m3, m5 | |
9890 movu [r0 + 91 * 16], m3 | |
9891 | |
9892 ; mode 4 [row 16] | |
9893 movu m6, [r5 + 5 * 16] | |
9894 pmaddubsw m3, m0, m6 | |
9895 pmulhrsw m3, m7 | |
9896 pmaddubsw m5, m2, m6 | |
9897 pmulhrsw m5, m7 | |
9898 packuswb m3, m5 | |
9899 movu [r0 + 160 * 16], m3 | |
9900 | |
9901 ; mode 5 [row 20 - first half] | |
9902 movu [r0 + 232 * 16], m3 | |
9903 | |
9904 pmaddubsw m3, m1, m6 | |
9905 pmulhrsw m3, m7 | |
9906 pmaddubsw m5, m4, m6 | |
9907 pmulhrsw m5, m7 | |
9908 packuswb m3, m5 | |
9909 movu [r0 + 161 * 16], m3 | |
9910 | |
9911 ; mode 5 [row 20 - second half] | |
9912 movu [r0 + 233 * 16], m3 | |
9913 | |
9914 ; mode 4 [row 17] | |
9915 movu m6, [r5 + 26 * 16] | |
9916 pmaddubsw m3, m0, m6 | |
9917 pmulhrsw m3, m7 | |
9918 pmaddubsw m5, m2, m6 | |
9919 pmulhrsw m5, m7 | |
9920 packuswb m3, m5 | |
9921 movu [r0 + 162 * 16], m3 | |
9922 pmaddubsw m3, m1, m6 | |
9923 pmulhrsw m3, m7 | |
9924 pmaddubsw m5, m4, m6 | |
9925 pmulhrsw m5, m7 | |
9926 packuswb m3, m5 | |
9927 movu [r0 + 163 * 16], m3 | |
9928 | |
9929 ; mode 5 [row 21] | |
9930 movu m6, [r5 + 22 * 16] | |
9931 pmaddubsw m3, m0, m6 | |
9932 pmulhrsw m3, m7 | |
9933 pmaddubsw m5, m2, m6 | |
9934 pmulhrsw m5, m7 | |
9935 packuswb m3, m5 | |
9936 movu [r0 + 234 * 16], m3 | |
9937 pmaddubsw m3, m1, m6 | |
9938 pmulhrsw m3, m7 | |
9939 pmaddubsw m5, m4, m6 | |
9940 pmulhrsw m5, m7 | |
9941 packuswb m3, m5 | |
9942 movu [r0 + 235 * 16], m3 | |
9943 | |
9944 ; mode 6 [row 27] | |
9945 movu m6, [r5 + 12 * 16] | |
9946 pmaddubsw m3, m0, m6 | |
9947 pmulhrsw m3, m7 | |
9948 pmaddubsw m5, m2, m6 | |
9949 pmulhrsw m5, m7 | |
9950 packuswb m3, m5 | |
9951 movu [r0 + 310 * 16], m3 | |
9952 pmaddubsw m3, m1, m6 | |
9953 pmulhrsw m3, m7 | |
9954 pmaddubsw m5, m4, m6 | |
9955 pmulhrsw m5, m7 | |
9956 packuswb m3, m5 | |
9957 movu [r0 + 311 * 16], m3 | |
9958 | |
9959 ; mode 6 [row 28] | |
9960 movu m6, [r5 + 25 * 16] | |
9961 pmaddubsw m3, m0, m6 | |
9962 pmulhrsw m3, m7 | |
9963 pmaddubsw m5, m2, m6 | |
9964 pmulhrsw m5, m7 | |
9965 packuswb m3, m5 | |
9966 movu [r0 + 312 * 16], m3 | |
9967 pmaddubsw m3, m1, m6 | |
9968 pmulhrsw m3, m7 | |
9969 pmaddubsw m5, m4, m6 | |
9970 pmulhrsw m5, m7 | |
9971 packuswb m3, m5 | |
9972 movu [r0 + 313 * 16], m3 | |
9973 | |
9974 ; mode 3 [row 14] | |
9975 movu m6, [r5 + 6 * 16] | |
9976 movu m0, [r4 + 13] | |
9977 movd m1, [r4 + 14] | |
9978 palignr m1, m0, 1 | |
9979 punpcklbw m0, m1 | |
9980 pmaddubsw m1, m0, m6 | |
9981 pmulhrsw m1, m7 | |
9982 movu m2, [r4 + 21] | |
9983 movd m3, [r4 + 22] | |
9984 palignr m3, m2, 1 | |
9985 punpcklbw m2, m3 | |
9986 pmaddubsw m3, m2, m6 | |
9987 pmulhrsw m3, m7 | |
9988 packuswb m1, m3 | |
9989 movu [r0 + 92 * 16], m1 | |
9990 | |
9991 ; mode 6 [row 29 - first half] | |
9992 movu [r0 + 314 * 16], m1 | |
9993 | |
9994 movu m1, [r4 + 29] | |
9995 movd m3, [r4 + 30] | |
9996 palignr m3, m1, 1 | |
9997 punpcklbw m1, m3 | |
9998 pmaddubsw m3, m1, m6 | |
9999 pmulhrsw m3, m7 | |
10000 movu m4, [r4 + 37] | |
10001 movd m5, [r4 + 38] | |
10002 palignr m5, m4, 1 | |
10003 punpcklbw m4, m5 | |
10004 pmaddubsw m5, m4, m6 | |
10005 pmulhrsw m5, m7 | |
10006 packuswb m3, m5 | |
10007 movu [r0 + 93 * 16], m3 | |
10008 | |
10009 ; mode 6 [row 29 - second half] | |
10010 movu [r0 + 315 * 16], m3 | |
10011 | |
10012 ; mode 4 [row 18] | |
10013 movu m6, [r5 + 15 * 16] | |
10014 pmaddubsw m3, m0, m6 | |
10015 pmulhrsw m3, m7 | |
10016 pmaddubsw m5, m2, m6 | |
10017 pmulhrsw m5, m7 | |
10018 packuswb m3, m5 | |
10019 movu [r0 + 164 * 16], m3 | |
10020 pmaddubsw m3, m1, m6 | |
10021 pmulhrsw m3, m7 | |
10022 pmaddubsw m5, m4, m6 | |
10023 pmulhrsw m5, m7 | |
10024 packuswb m3, m5 | |
10025 movu [r0 + 165 * 16], m3 | |
10026 | |
10027 ; mode 5 [row 22] | |
10028 movu m6, [r5 + 7 * 16] | |
10029 pmaddubsw m3, m0, m6 | |
10030 pmulhrsw m3, m7 | |
10031 pmaddubsw m5, m2, m6 | |
10032 pmulhrsw m5, m7 | |
10033 packuswb m3, m5 | |
10034 movu [r0 + 236 * 16], m3 | |
10035 pmaddubsw m3, m1, m6 | |
10036 pmulhrsw m3, m7 | |
10037 pmaddubsw m5, m4, m6 | |
10038 pmulhrsw m5, m7 | |
10039 packuswb m3, m5 | |
10040 movu [r0 + 237 * 16], m3 | |
10041 | |
10042 ; mode 5 [row 23] | |
10043 movu m6, [r5 + 24 * 16] | |
10044 pmaddubsw m3, m0, m6 | |
10045 pmulhrsw m3, m7 | |
10046 pmaddubsw m5, m2, m6 | |
10047 pmulhrsw m5, m7 | |
10048 packuswb m3, m5 | |
10049 movu [r0 + 238 * 16], m3 | |
10050 pmaddubsw m3, m1, m6 | |
10051 pmulhrsw m3, m7 | |
10052 pmaddubsw m5, m4, m6 | |
10053 pmulhrsw m5, m7 | |
10054 packuswb m3, m5 | |
10055 movu [r0 + 239 * 16], m3 | |
10056 | |
10057 ; mode 6 [row 30] | |
10058 movu m6, [r5 + 19 * 16] | |
10059 pmaddubsw m3, m0, m6 | |
10060 pmulhrsw m3, m7 | |
10061 pmaddubsw m5, m2, m6 | |
10062 pmulhrsw m5, m7 | |
10063 packuswb m3, m5 | |
10064 movu [r0 + 316 * 16], m3 | |
10065 pmaddubsw m3, m1, m6 | |
10066 pmulhrsw m3, m7 | |
10067 pmaddubsw m5, m4, m6 | |
10068 pmulhrsw m5, m7 | |
10069 packuswb m3, m5 | |
10070 movu [r0 + 317 * 16], m3 | |
10071 | |
10072 ; mode 3 [row 16] | |
10073 movu m6, [r5 + 26 * 16] | |
10074 movu m0, [r4 + 14] | |
10075 movd m1, [r4 + 15] | |
10076 palignr m1, m0, 1 | |
10077 punpcklbw m0, m1 | |
10078 pmaddubsw m1, m0, m6 | |
10079 pmulhrsw m1, m7 | |
10080 movu m2, [r4 + 22] | |
10081 movd m3, [r4 + 23] | |
10082 palignr m3, m2, 1 | |
10083 punpcklbw m2, m3 | |
10084 pmaddubsw m3, m2, m6 | |
10085 pmulhrsw m3, m7 | |
10086 packuswb m1, m3 | |
10087 movu [r0 + 96 * 16], m1 | |
10088 | |
10089 ; mode 5 [row 25 - first half] | |
10090 movu [r0 + 242 * 16], m1 | |
10091 | |
10092 movu m1, [r4 + 30] | |
10093 movd m3, [r4 + 31] | |
10094 palignr m3, m1, 1 | |
10095 punpcklbw m1, m3 | |
10096 pmaddubsw m3, m1, m6 | |
10097 pmulhrsw m3, m7 | |
10098 movu m4, [r4 + 38] | |
10099 movd m5, [r4 + 39] | |
10100 palignr m5, m4, 1 | |
10101 punpcklbw m4, m5 | |
10102 pmaddubsw m5, m4, m6 | |
10103 pmulhrsw m5, m7 | |
10104 packuswb m3, m5 | |
10105 movu [r0 + 97 * 16], m3 | |
10106 | |
10107 ; mode 5 [row 25 - second half] | |
10108 movu [r0 + 243 * 16], m3 | |
10109 | |
10110 ; mode 4 [row 19] | |
10111 movu m6, [r5 + 4 * 16] | |
10112 pmaddubsw m3, m0, m6 | |
10113 pmulhrsw m3, m7 | |
10114 pmaddubsw m5, m2, m6 | |
10115 pmulhrsw m5, m7 | |
10116 packuswb m3, m5 | |
10117 movu [r0 + 166 * 16], m3 | |
10118 pmaddubsw m3, m1, m6 | |
10119 pmulhrsw m3, m7 | |
10120 pmaddubsw m5, m4, m6 | |
10121 pmulhrsw m5, m7 | |
10122 packuswb m3, m5 | |
10123 movu [r0 + 167 * 16], m3 | |
10124 | |
10125 ; mode 4 [row 20] | |
10126 movu m6, [r5 + 25 * 16] | |
10127 pmaddubsw m3, m0, m6 | |
10128 pmulhrsw m3, m7 | |
10129 pmaddubsw m5, m2, m6 | |
10130 pmulhrsw m5, m7 | |
10131 packuswb m3, m5 | |
10132 movu [r0 + 168 * 16], m3 | |
10133 pmaddubsw m3, m1, m6 | |
10134 pmulhrsw m3, m7 | |
10135 pmaddubsw m5, m4, m6 | |
10136 pmulhrsw m5, m7 | |
10137 packuswb m3, m5 | |
10138 movu [r0 + 169 * 16], m3 | |
10139 | |
10140 ; mode 5 [row 24] | |
10141 movu m6, [r5 + 9 * 16] | |
10142 pmaddubsw m3, m0, m6 | |
10143 pmulhrsw m3, m7 | |
10144 pmaddubsw m5, m2, m6 | |
10145 pmulhrsw m5, m7 | |
10146 packuswb m3, m5 | |
10147 movu [r0 + 240 * 16], m3 | |
10148 pmaddubsw m3, m1, m6 | |
10149 pmulhrsw m3, m7 | |
10150 pmaddubsw m5, m4, m6 | |
10151 pmulhrsw m5, m7 | |
10152 packuswb m3, m5 | |
10153 movu [r0 + 241 * 16], m3 | |
10154 | |
10155 ; mode 3 [row 17] | |
10156 movu m6, [r5 + 20 * 16] | |
10157 movu m0, [r4 + 15] | |
10158 movd m1, [r4 + 16] | |
10159 palignr m1, m0, 1 | |
10160 punpcklbw m0, m1 | |
10161 pmaddubsw m1, m0, m6 | |
10162 pmulhrsw m1, m7 | |
10163 movu m2, [r4 + 23] | |
10164 movd m3, [r4 + 24] | |
10165 palignr m3, m2, 1 | |
10166 punpcklbw m2, m3 | |
10167 pmaddubsw m3, m2, m6 | |
10168 pmulhrsw m3, m7 | |
10169 packuswb m1, m3 | |
10170 movu [r0 + 98 * 16], m1 | |
10171 | |
10172 movu m1, [r4 + 31] | |
10173 movd m3, [r4 + 32] | |
10174 palignr m3, m1, 1 | |
10175 punpcklbw m1, m3 | |
10176 pmaddubsw m3, m1, m6 | |
10177 pmulhrsw m3, m7 | |
10178 movu m4, [r4 + 39] | |
10179 movd m5, [r4 + 40] | |
10180 palignr m5, m4, 1 | |
10181 punpcklbw m4, m5 | |
10182 pmaddubsw m5, m4, m6 | |
10183 pmulhrsw m5, m7 | |
10184 packuswb m3, m5 | |
10185 movu [r0 + 99 * 16], m3 | |
10186 | |
10187 ; mode 4 [row 21] | |
10188 movu m6, [r5 + 14 * 16] | |
10189 pmaddubsw m3, m0, m6 | |
10190 pmulhrsw m3, m7 | |
10191 pmaddubsw m5, m2, m6 | |
10192 pmulhrsw m5, m7 | |
10193 packuswb m3, m5 | |
10194 movu [r0 + 170 * 16], m3 | |
10195 pmaddubsw m3, m1, m6 | |
10196 pmulhrsw m3, m7 | |
10197 pmaddubsw m5, m4, m6 | |
10198 pmulhrsw m5, m7 | |
10199 packuswb m3, m5 | |
10200 movu [r0 + 171 * 16], m3 | |
10201 | |
10202 ; mode 5 [row 26] | |
10203 movu m6, [r5 + 11 * 16] | |
10204 pmaddubsw m3, m0, m6 | |
10205 pmulhrsw m3, m7 | |
10206 pmaddubsw m5, m2, m6 | |
10207 pmulhrsw m5, m7 | |
10208 packuswb m3, m5 | |
10209 movu [r0 + 244 * 16], m3 | |
10210 pmaddubsw m3, m1, m6 | |
10211 pmulhrsw m3, m7 | |
10212 pmaddubsw m5, m4, m6 | |
10213 pmulhrsw m5, m7 | |
10214 packuswb m3, m5 | |
10215 movu [r0 + 245 * 16], m3 | |
10216 | |
10217 ; mode 5 [row 27] | |
10218 movu m6, [r5 + 28 * 16] | |
10219 pmaddubsw m3, m0, m6 | |
10220 pmulhrsw m3, m7 | |
10221 pmaddubsw m5, m2, m6 | |
10222 pmulhrsw m5, m7 | |
10223 packuswb m3, m5 | |
10224 movu [r0 + 246 * 16], m3 | |
10225 pmaddubsw m3, m1, m6 | |
10226 pmulhrsw m3, m7 | |
10227 pmaddubsw m5, m4, m6 | |
10228 pmulhrsw m5, m7 | |
10229 packuswb m3, m5 | |
10230 movu [r0 + 247 * 16], m3 | |
10231 | |
10232 ; mode 3 [row 18] | |
10233 movu m6, [r5 + 14 * 16] | |
10234 movu m0, [r4 + 16] | |
10235 movd m1, [r4 + 17] | |
10236 palignr m1, m0, 1 | |
10237 punpcklbw m0, m1 | |
10238 pmaddubsw m1, m0, m6 | |
10239 pmulhrsw m1, m7 | |
10240 movu m2, [r4 + 24] | |
10241 movd m3, [r4 + 25] | |
10242 palignr m3, m2, 1 | |
10243 punpcklbw m2, m3 | |
10244 pmaddubsw m3, m2, m6 | |
10245 pmulhrsw m3, m7 | |
10246 packuswb m1, m3 | |
10247 movu [r0 + 100 * 16], m1 | |
10248 | |
10249 movu m1, [r4 + 32] | |
10250 movd m3, [r4 + 33] | |
10251 palignr m3, m1, 1 | |
10252 punpcklbw m1, m3 | |
10253 pmaddubsw m3, m1, m6 | |
10254 pmulhrsw m3, m7 | |
10255 movu m4, [r4 + 40] | |
10256 movd m5, [r4 + 41] | |
10257 palignr m5, m4, 1 | |
10258 punpcklbw m4, m5 | |
10259 pmaddubsw m5, m4, m6 | |
10260 pmulhrsw m5, m7 | |
10261 packuswb m3, m5 | |
10262 movu [r0 + 101 * 16], m3 | |
10263 | |
10264 ; mode 4 [row 22] | |
10265 movu m6, [r5 + 3 * 16] | |
10266 pmaddubsw m3, m0, m6 | |
10267 pmulhrsw m3, m7 | |
10268 pmaddubsw m5, m2, m6 | |
10269 pmulhrsw m5, m7 | |
10270 packuswb m3, m5 | |
10271 movu [r0 + 172 * 16], m3 | |
10272 pmaddubsw m3, m1, m6 | |
10273 pmulhrsw m3, m7 | |
10274 pmaddubsw m5, m4, m6 | |
10275 pmulhrsw m5, m7 | |
10276 packuswb m3, m5 | |
10277 movu [r0 + 173 * 16], m3 | |
10278 | |
10279 ; mode 4 [row 23] | |
10280 movu m6, [r5 + 24 * 16] | |
10281 pmaddubsw m3, m0, m6 | |
10282 pmulhrsw m3, m7 | |
10283 pmaddubsw m5, m2, m6 | |
10284 pmulhrsw m5, m7 | |
10285 packuswb m3, m5 | |
10286 movu [r0 + 174 * 16], m3 | |
10287 pmaddubsw m3, m1, m6 | |
10288 pmulhrsw m3, m7 | |
10289 pmaddubsw m5, m4, m6 | |
10290 pmulhrsw m5, m7 | |
10291 packuswb m3, m5 | |
10292 movu [r0 + 175 * 16], m3 | |
10293 | |
10294 ; mode 5 [row 28] | |
10295 movu m6, [r5 + 13 * 16] | |
10296 pmaddubsw m3, m0, m6 | |
10297 pmulhrsw m3, m7 | |
10298 pmaddubsw m5, m2, m6 | |
10299 pmulhrsw m5, m7 | |
10300 packuswb m3, m5 | |
10301 movu [r0 + 248 * 16], m3 | |
10302 pmaddubsw m3, m1, m6 | |
10303 pmulhrsw m3, m7 | |
10304 pmaddubsw m5, m4, m6 | |
10305 pmulhrsw m5, m7 | |
10306 packuswb m3, m5 | |
10307 movu [r0 + 249 * 16], m3 | |
10308 | |
10309 ; mode 5 [row 29] | |
10310 movu m6, [r5 + 30 * 16] | |
10311 pmaddubsw m3, m0, m6 | |
10312 pmulhrsw m3, m7 | |
10313 pmaddubsw m5, m2, m6 | |
10314 pmulhrsw m5, m7 | |
10315 packuswb m3, m5 | |
10316 movu [r0 + 250 * 16], m3 | |
10317 pmaddubsw m3, m1, m6 | |
10318 pmulhrsw m3, m7 | |
10319 pmaddubsw m5, m4, m6 | |
10320 pmulhrsw m5, m7 | |
10321 packuswb m3, m5 | |
10322 movu [r0 + 251 * 16], m3 | |
10323 | |
10324 ; mode 3 [row 19] | |
10325 movu m6, [r5 + 8 * 16] | |
10326 movu m0, [r4 + 17] | |
10327 movd m1, [r4 + 18] | |
10328 palignr m1, m0, 1 | |
10329 punpcklbw m0, m1 | |
10330 pmaddubsw m1, m0, m6 | |
10331 pmulhrsw m1, m7 | |
10332 movu m2, [r4 + 25] | |
10333 movd m3, [r4 + 26] | |
10334 palignr m3, m2, 1 | |
10335 punpcklbw m2, m3 | |
10336 pmaddubsw m3, m2, m6 | |
10337 pmulhrsw m3, m7 | |
10338 packuswb m1, m3 | |
10339 movu [r0 + 102 * 16], m1 | |
10340 | |
10341 movu m1, [r4 + 33] | |
10342 movd m3, [r4 + 34] | |
10343 palignr m3, m1, 1 | |
10344 punpcklbw m1, m3 | |
10345 pmaddubsw m3, m1, m6 | |
10346 pmulhrsw m3, m7 | |
10347 movu m4, [r4 + 41] | |
10348 movd m5, [r4 + 42] | |
10349 palignr m5, m4, 1 | |
10350 punpcklbw m4, m5 | |
10351 pmaddubsw m5, m4, m6 | |
10352 pmulhrsw m5, m7 | |
10353 packuswb m3, m5 | |
10354 movu [r0 + 103 * 16], m3 | |
10355 | |
10356 ; mode 4 [row 24] | |
10357 movu m6, [r5 + 13 * 16] | |
10358 pmaddubsw m3, m0, m6 | |
10359 pmulhrsw m3, m7 | |
10360 pmaddubsw m5, m2, m6 | |
10361 pmulhrsw m5, m7 | |
10362 packuswb m3, m5 | |
10363 movu [r0 + 176 * 16], m3 | |
10364 pmaddubsw m3, m1, m6 | |
10365 pmulhrsw m3, m7 | |
10366 pmaddubsw m5, m4, m6 | |
10367 pmulhrsw m5, m7 | |
10368 packuswb m3, m5 | |
10369 movu [r0 + 177 * 16], m3 | |
10370 | |
10371 ; mode 5 [row 30] | |
10372 movu m6, [r5 + 15 * 16] | |
10373 pmaddubsw m3, m0, m6 | |
10374 pmulhrsw m3, m7 | |
10375 pmaddubsw m5, m2, m6 | |
10376 pmulhrsw m5, m7 | |
10377 packuswb m3, m5 | |
10378 movu [r0 + 252 * 16], m3 | |
10379 pmaddubsw m3, m1, m6 | |
10380 pmulhrsw m3, m7 | |
10381 pmaddubsw m5, m4, m6 | |
10382 pmulhrsw m5, m7 | |
10383 packuswb m3, m5 | |
10384 movu [r0 + 253 * 16], m3 | |
10385 | |
10386 ; mode 3 [row 20] | |
10387 movu m6, [r5 + 2 * 16] | |
10388 movu m0, [r4 + 18] | |
10389 movd m1, [r4 + 19] | |
10390 palignr m1, m0, 1 | |
10391 punpcklbw m0, m1 | |
10392 pmaddubsw m1, m0, m6 | |
10393 pmulhrsw m1, m7 | |
10394 movu m2, [r4 + 26] | |
10395 movd m3, [r4 + 27] | |
10396 palignr m3, m2, 1 | |
10397 punpcklbw m2, m3 | |
10398 pmaddubsw m3, m2, m6 | |
10399 pmulhrsw m3, m7 | |
10400 packuswb m1, m3 | |
10401 movu [r0 + 104 * 16], m1 | |
10402 | |
10403 movu m1, [r4 + 34] | |
10404 movd m3, [r4 + 35] | |
10405 palignr m3, m1, 1 | |
10406 punpcklbw m1, m3 | |
10407 pmaddubsw m3, m1, m6 | |
10408 pmulhrsw m3, m7 | |
10409 movu m4, [r4 + 42] | |
10410 movd m5, [r4 + 43] | |
10411 palignr m5, m4, 1 | |
10412 punpcklbw m4, m5 | |
10413 pmaddubsw m5, m4, m6 | |
10414 pmulhrsw m5, m7 | |
10415 packuswb m3, m5 | |
10416 movu [r0 + 105 * 16], m3 | |
10417 | |
10418 ; mode 4 [row 25] | |
10419 pmaddubsw m3, m0, m6 | |
10420 pmulhrsw m3, m7 | |
10421 pmaddubsw m5, m2, m6 | |
10422 pmulhrsw m5, m7 | |
10423 packuswb m3, m5 | |
10424 movu [r0 + 178 * 16], m3 | |
10425 pmaddubsw m3, m1, m6 | |
10426 pmulhrsw m3, m7 | |
10427 pmaddubsw m5, m4, m6 | |
10428 pmulhrsw m5, m7 | |
10429 packuswb m3, m5 | |
10430 movu [r0 + 179 * 16], m3 | |
10431 | |
10432 ; mode 4 [row 26] | |
10433 movu m6, [r5 + 23 * 16] | |
10434 pmaddubsw m3, m0, m6 | |
10435 pmulhrsw m3, m7 | |
10436 pmaddubsw m5, m2, m6 | |
10437 pmulhrsw m5, m7 | |
10438 packuswb m3, m5 | |
10439 movu [r0 + 180 * 16], m3 | |
10440 pmaddubsw m3, m1, m6 | |
10441 pmulhrsw m3, m7 | |
10442 pmaddubsw m5, m4, m6 | |
10443 pmulhrsw m5, m7 | |
10444 packuswb m3, m5 | |
10445 movu [r0 + 181 * 16], m3 | |
10446 | |
10447 ; mode 3 [row 21] | |
10448 movu m6, [r5 + 28 * 16] | |
10449 pmaddubsw m3, m0, m6 | |
10450 pmulhrsw m3, m7 | |
10451 pmaddubsw m5, m2, m6 | |
10452 pmulhrsw m5, m7 | |
10453 packuswb m3, m5 | |
10454 movu [r0 + 106 * 16], m3 | |
10455 pmaddubsw m3, m1, m6 | |
10456 pmulhrsw m3, m7 | |
10457 pmaddubsw m5, m4, m6 | |
10458 pmulhrsw m5, m7 | |
10459 packuswb m3, m5 | |
10460 movu [r0 + 107 * 16], m3 | |
10461 | |
10462 ; mode 3 [row 22] | |
10463 movu m6, [r5 + 22 * 16] | |
10464 movu m0, [r4 + 19] | |
10465 movd m1, [r4 + 20] | |
10466 palignr m1, m0, 1 | |
10467 punpcklbw m0, m1 | |
10468 pmaddubsw m1, m0, m6 | |
10469 pmulhrsw m1, m7 | |
10470 movu m2, [r4 + 27] | |
10471 movd m3, [r4 + 28] | |
10472 palignr m3, m2, 1 | |
10473 punpcklbw m2, m3 | |
10474 pmaddubsw m3, m2, m6 | |
10475 pmulhrsw m3, m7 | |
10476 packuswb m1, m3 | |
10477 movu [r0 + 108 * 16], m1 | |
10478 | |
10479 movu m1, [r4 + 35] | |
10480 movd m3, [r4 + 36] | |
10481 palignr m3, m1, 1 | |
10482 punpcklbw m1, m3 | |
10483 pmaddubsw m3, m1, m6 | |
10484 pmulhrsw m3, m7 | |
10485 movu m4, [r4 + 43] | |
10486 movd m5, [r4 + 44] | |
10487 palignr m5, m4, 1 | |
10488 punpcklbw m4, m5 | |
10489 pmaddubsw m5, m4, m6 | |
10490 pmulhrsw m5, m7 | |
10491 packuswb m3, m5 | |
10492 movu [r0 + 109 * 16], m3 | |
10493 | |
10494 ; mode 4 [row 27] | |
10495 movu m6, [r5 + 12 * 16] | |
10496 pmaddubsw m3, m0, m6 | |
10497 pmulhrsw m3, m7 | |
10498 pmaddubsw m5, m2, m6 | |
10499 pmulhrsw m5, m7 | |
10500 packuswb m3, m5 | |
10501 movu [r0 + 182 * 16], m3 | |
10502 pmaddubsw m3, m1, m6 | |
10503 pmulhrsw m3, m7 | |
10504 pmaddubsw m5, m4, m6 | |
10505 pmulhrsw m5, m7 | |
10506 packuswb m3, m5 | |
10507 movu [r0 + 183 * 16], m3 | |
10508 | |
10509 ; mode 3 [row 23] | |
10510 movu m6, [r5 + 16 * 16] | |
10511 movu m0, [r4 + 20] | |
10512 movd m1, [r4 + 21] | |
10513 palignr m1, m0, 1 | |
10514 punpcklbw m0, m1 | |
10515 pmaddubsw m1, m0, m6 | |
10516 pmulhrsw m1, m7 | |
10517 movu m2, [r4 + 28] | |
10518 movd m3, [r4 + 29] | |
10519 palignr m3, m2, 1 | |
10520 punpcklbw m2, m3 | |
10521 pmaddubsw m3, m2, m6 | |
10522 pmulhrsw m3, m7 | |
10523 packuswb m1, m3 | |
10524 movu [r0 + 110 * 16], m1 | |
10525 | |
10526 movu m1, [r4 + 36] | |
10527 movd m3, [r4 + 37] | |
10528 palignr m3, m1, 1 | |
10529 punpcklbw m1, m3 | |
10530 pmaddubsw m3, m1, m6 | |
10531 pmulhrsw m3, m7 | |
10532 movu m4, [r4 + 44] | |
10533 movd m5, [r4 + 45] | |
10534 palignr m5, m4, 1 | |
10535 punpcklbw m4, m5 | |
10536 pmaddubsw m5, m4, m6 | |
10537 pmulhrsw m5, m7 | |
10538 packuswb m3, m5 | |
10539 movu [r0 + 111 * 16], m3 | |
10540 | |
10541 ; mode 4 [row 28] | |
10542 movu m6, [r5 + 1 * 16] | |
10543 pmaddubsw m3, m0, m6 | |
10544 pmulhrsw m3, m7 | |
10545 pmaddubsw m5, m2, m6 | |
10546 pmulhrsw m5, m7 | |
10547 packuswb m3, m5 | |
10548 movu [r0 + 184 * 16], m3 | |
10549 pmaddubsw m3, m1, m6 | |
10550 pmulhrsw m3, m7 | |
10551 pmaddubsw m5, m4, m6 | |
10552 pmulhrsw m5, m7 | |
10553 packuswb m3, m5 | |
10554 movu [r0 + 185 * 16], m3 | |
10555 | |
10556 ; mode 4 [row 29] | |
10557 movu m6, [r5 + 22 * 16] | |
10558 pmaddubsw m3, m0, m6 | |
10559 pmulhrsw m3, m7 | |
10560 pmaddubsw m5, m2, m6 | |
10561 pmulhrsw m5, m7 | |
10562 packuswb m3, m5 | |
10563 movu [r0 + 186 * 16], m3 | |
10564 pmaddubsw m3, m1, m6 | |
10565 pmulhrsw m3, m7 | |
10566 pmaddubsw m5, m4, m6 | |
10567 pmulhrsw m5, m7 | |
10568 packuswb m3, m5 | |
10569 movu [r0 + 187 * 16], m3 | |
10570 | |
10571 ; mode 3 [row 24] | |
10572 movu m6, [r5 + 10 * 16] | |
10573 movu m0, [r4 + 21] | |
10574 movd m1, [r4 + 22] | |
10575 palignr m1, m0, 1 | |
10576 punpcklbw m0, m1 | |
10577 pmaddubsw m1, m0, m6 | |
10578 pmulhrsw m1, m7 | |
10579 movu m2, [r4 + 29] | |
10580 movd m3, [r4 + 30] | |
10581 palignr m3, m2, 1 | |
10582 punpcklbw m2, m3 | |
10583 pmaddubsw m3, m2, m6 | |
10584 pmulhrsw m3, m7 | |
10585 packuswb m1, m3 | |
10586 movu [r0 + 112 * 16], m1 | |
10587 | |
10588 movu m1, [r4 + 37] | |
10589 movd m3, [r4 + 38] | |
10590 palignr m3, m1, 1 | |
10591 punpcklbw m1, m3 | |
10592 pmaddubsw m3, m1, m6 | |
10593 pmulhrsw m3, m7 | |
10594 movu m4, [r4 + 45] | |
10595 movd m5, [r4 + 46] | |
10596 palignr m5, m4, 1 | |
10597 punpcklbw m4, m5 | |
10598 pmaddubsw m5, m4, m6 | |
10599 pmulhrsw m5, m7 | |
10600 packuswb m3, m5 | |
10601 movu [r0 + 113 * 16], m3 | |
10602 | |
10603 ; mode 4 [row 30] | |
10604 movu m6, [r5 + 11 * 16] | |
10605 pmaddubsw m3, m0, m6 | |
10606 pmulhrsw m3, m7 | |
10607 pmaddubsw m5, m2, m6 | |
10608 pmulhrsw m5, m7 | |
10609 packuswb m3, m5 | |
10610 movu [r0 + 188 * 16], m3 | |
10611 pmaddubsw m3, m1, m6 | |
10612 pmulhrsw m3, m7 | |
10613 pmaddubsw m5, m4, m6 | |
10614 pmulhrsw m5, m7 | |
10615 packuswb m3, m5 | |
10616 movu [r0 + 189 * 16], m3 | |
10617 | |
10618 ; mode 3 [row 25] | |
10619 movu m6, [r5 + 4 * 16] | |
10620 movu m0, [r4 + 22] | |
10621 movd m1, [r4 + 23] | |
10622 palignr m1, m0, 1 | |
10623 punpcklbw m0, m1 | |
10624 pmaddubsw m1, m0, m6 | |
10625 pmulhrsw m1, m7 | |
10626 movu m2, [r4 + 30] | |
10627 movd m3, [r4 + 31] | |
10628 palignr m3, m2, 1 | |
10629 punpcklbw m2, m3 | |
10630 pmaddubsw m3, m2, m6 | |
10631 pmulhrsw m3, m7 | |
10632 packuswb m1, m3 | |
10633 movu [r0 + 114 * 16], m1 | |
10634 | |
10635 movu m1, [r4 + 38] | |
10636 movd m3, [r4 + 39] | |
10637 palignr m3, m1, 1 | |
10638 punpcklbw m1, m3 | |
10639 pmaddubsw m3, m1, m6 | |
10640 pmulhrsw m3, m7 | |
10641 movu m4, [r4 + 46] | |
10642 movd m5, [r4 + 47] | |
10643 palignr m5, m4, 1 | |
10644 punpcklbw m4, m5 | |
10645 pmaddubsw m5, m4, m6 | |
10646 pmulhrsw m5, m7 | |
10647 packuswb m3, m5 | |
10648 movu [r0 + 115 * 16], m3 | |
10649 | |
10650 ; mode 3 [row 26] | |
10651 movu m6, [r5 + 30 * 16] | |
10652 pmaddubsw m3, m0, m6 | |
10653 pmulhrsw m3, m7 | |
10654 pmaddubsw m5, m2, m6 | |
10655 pmulhrsw m5, m7 | |
10656 packuswb m3, m5 | |
10657 movu [r0 + 116 * 16], m3 | |
10658 pmaddubsw m3, m1, m6 | |
10659 pmulhrsw m3, m7 | |
10660 pmaddubsw m5, m4, m6 | |
10661 pmulhrsw m5, m7 | |
10662 packuswb m3, m5 | |
10663 movu [r0 + 117 * 16], m3 | |
10664 | |
10665 ; mode 3 [row 27] | |
10666 movu m6, [r5 + 24 * 16] | |
10667 movu m0, [r4 + 23] | |
10668 movd m1, [r4 + 24] | |
10669 palignr m1, m0, 1 | |
10670 punpcklbw m0, m1 | |
10671 pmaddubsw m1, m0, m6 | |
10672 pmulhrsw m1, m7 | |
10673 movu m2, [r4 + 31] | |
10674 movd m3, [r4 + 32] | |
10675 palignr m3, m2, 1 | |
10676 punpcklbw m2, m3 | |
10677 pmaddubsw m3, m2, m6 | |
10678 pmulhrsw m3, m7 | |
10679 packuswb m1, m3 | |
10680 movu [r0 + 118 * 16], m1 | |
10681 | |
10682 movu m1, [r4 + 39] | |
10683 movd m3, [r4 + 40] | |
10684 palignr m3, m1, 1 | |
10685 punpcklbw m1, m3 | |
10686 pmaddubsw m3, m1, m6 | |
10687 pmulhrsw m3, m7 | |
10688 movu m4, [r4 + 47] | |
10689 movd m5, [r4 + 48] | |
10690 palignr m5, m4, 1 | |
10691 punpcklbw m4, m5 | |
10692 pmaddubsw m5, m4, m6 | |
10693 pmulhrsw m5, m7 | |
10694 packuswb m3, m5 | |
10695 movu [r0 + 119 * 16], m3 | |
10696 | |
10697 ; mode 3 [row 28] | |
10698 movu m6, [r5 + 18 * 16] | |
10699 movu m0, [r4 + 24] | |
10700 movd m1, [r4 + 25] | |
10701 palignr m1, m0, 1 | |
10702 punpcklbw m0, m1 | |
10703 pmaddubsw m1, m0, m6 | |
10704 pmulhrsw m1, m7 | |
10705 movu m2, [r4 + 32] | |
10706 movd m3, [r4 + 33] | |
10707 palignr m3, m2, 1 | |
10708 punpcklbw m2, m3 | |
10709 pmaddubsw m3, m2, m6 | |
10710 pmulhrsw m3, m7 | |
10711 packuswb m1, m3 | |
10712 movu [r0 + 120 * 16], m1 | |
10713 | |
10714 movu m1, [r4 + 40] | |
10715 movd m3, [r4 + 41] | |
10716 palignr m3, m1, 1 | |
10717 punpcklbw m1, m3 | |
10718 pmaddubsw m3, m1, m6 | |
10719 pmulhrsw m3, m7 | |
10720 movu m4, [r4 + 48] | |
10721 movd m5, [r4 + 49] | |
10722 palignr m5, m4, 1 | |
10723 punpcklbw m4, m5 | |
10724 pmaddubsw m5, m4, m6 | |
10725 pmulhrsw m5, m7 | |
10726 packuswb m3, m5 | |
10727 movu [r0 + 121 * 16], m3 | |
10728 | |
10729 ; mode 3 [row 29] | |
10730 movu m6, [r5 + 12 * 16] | |
10731 movu m0, [r4 + 25] | |
10732 movd m1, [r4 + 26] | |
10733 palignr m1, m0, 1 | |
10734 punpcklbw m0, m1 | |
10735 pmaddubsw m1, m0, m6 | |
10736 pmulhrsw m1, m7 | |
10737 movu m2, [r4 + 33] | |
10738 movd m3, [r4 + 34] | |
10739 palignr m3, m2, 1 | |
10740 punpcklbw m2, m3 | |
10741 pmaddubsw m3, m2, m6 | |
10742 pmulhrsw m3, m7 | |
10743 packuswb m1, m3 | |
10744 movu [r0 + 122 * 16], m1 | |
10745 | |
10746 movu m1, [r4 + 41] | |
10747 movd m3, [r4 + 42] | |
10748 palignr m3, m1, 1 | |
10749 punpcklbw m1, m3 | |
10750 pmaddubsw m3, m1, m6 | |
10751 pmulhrsw m3, m7 | |
10752 movu m4, [r4 + 49] | |
10753 movd m5, [r4 + 50] | |
10754 palignr m5, m4, 1 | |
10755 punpcklbw m4, m5 | |
10756 pmaddubsw m5, m4, m6 | |
10757 pmulhrsw m5, m7 | |
10758 packuswb m3, m5 | |
10759 movu [r0 + 123 * 16], m3 | |
10760 | |
10761 ; mode 3 [row 30] | |
10762 movu m6, [r5 + 6 * 16] | |
10763 movu m0, [r4 + 26] | |
10764 movd m1, [r4 + 27] | |
10765 palignr m1, m0, 1 | |
10766 punpcklbw m0, m1 | |
10767 pmaddubsw m1, m0, m6 | |
10768 pmulhrsw m1, m7 | |
10769 movu m2, [r4 + 34] | |
10770 movd m3, [r4 + 35] | |
10771 palignr m3, m2, 1 | |
10772 punpcklbw m2, m3 | |
10773 pmaddubsw m3, m2, m6 | |
10774 pmulhrsw m3, m7 | |
10775 packuswb m1, m3 | |
10776 movu [r0 + 124 * 16], m1 | |
10777 | |
10778 movu m1, [r4 + 42] | |
10779 movd m3, [r4 + 43] | |
10780 palignr m3, m1, 1 | |
10781 punpcklbw m1, m3 | |
10782 pmaddubsw m3, m1, m6 | |
10783 pmulhrsw m3, m7 | |
10784 movu m4, [r4 + 50] | |
10785 movd m5, [r4 + 51] | |
10786 palignr m5, m4, 1 | |
10787 punpcklbw m4, m5 | |
10788 pmaddubsw m5, m4, m6 | |
10789 pmulhrsw m5, m7 | |
10790 packuswb m3, m5 | |
10791 movu [r0 + 125 * 16], m3 | |
10792 | |
10793 ; mode 10 | |
10794 movu m1, [r2 + 1] | |
10795 movu m2, [r2 + 17] | |
10796 movu [r0 + 512 * 16], m1 | |
10797 movu [r0 + 513 * 16], m2 | |
10798 movu [r0 + 514 * 16], m1 | |
10799 movu [r0 + 515 * 16], m2 | |
10800 movu [r0 + 516 * 16], m1 | |
10801 movu [r0 + 517 * 16], m2 | |
10802 movu [r0 + 518 * 16], m1 | |
10803 movu [r0 + 519 * 16], m2 | |
10804 movu [r0 + 520 * 16], m1 | |
10805 movu [r0 + 521 * 16], m2 | |
10806 movu [r0 + 522 * 16], m1 | |
10807 movu [r0 + 523 * 16], m2 | |
10808 movu [r0 + 524 * 16], m1 | |
10809 movu [r0 + 525 * 16], m2 | |
10810 movu [r0 + 526 * 16], m1 | |
10811 movu [r0 + 527 * 16], m2 | |
10812 | |
10813 movu [r0 + 528 * 16], m1 | |
10814 movu [r0 + 529 * 16], m2 | |
10815 movu [r0 + 530 * 16], m1 | |
10816 movu [r0 + 531 * 16], m2 | |
10817 movu [r0 + 532 * 16], m1 | |
10818 movu [r0 + 533 * 16], m2 | |
10819 movu [r0 + 534 * 16], m1 | |
10820 movu [r0 + 535 * 16], m2 | |
10821 movu [r0 + 536 * 16], m1 | |
10822 movu [r0 + 537 * 16], m2 | |
10823 movu [r0 + 538 * 16], m1 | |
10824 movu [r0 + 539 * 16], m2 | |
10825 movu [r0 + 540 * 16], m1 | |
10826 movu [r0 + 541 * 16], m2 | |
10827 movu [r0 + 542 * 16], m1 | |
10828 movu [r0 + 543 * 16], m2 | |
10829 | |
10830 movu [r0 + 544 * 16], m1 | |
10831 movu [r0 + 545 * 16], m2 | |
10832 movu [r0 + 546 * 16], m1 | |
10833 movu [r0 + 547 * 16], m2 | |
10834 movu [r0 + 548 * 16], m1 | |
10835 movu [r0 + 549 * 16], m2 | |
10836 movu [r0 + 550 * 16], m1 | |
10837 movu [r0 + 551 * 16], m2 | |
10838 movu [r0 + 552 * 16], m1 | |
10839 movu [r0 + 553 * 16], m2 | |
10840 movu [r0 + 554 * 16], m1 | |
10841 movu [r0 + 555 * 16], m2 | |
10842 movu [r0 + 556 * 16], m1 | |
10843 movu [r0 + 557 * 16], m2 | |
10844 movu [r0 + 558 * 16], m1 | |
10845 movu [r0 + 559 * 16], m2 | |
10846 | |
10847 movu [r0 + 560 * 16], m1 | |
10848 movu [r0 + 561 * 16], m2 | |
10849 movu [r0 + 562 * 16], m1 | |
10850 movu [r0 + 563 * 16], m2 | |
10851 movu [r0 + 564 * 16], m1 | |
10852 movu [r0 + 565 * 16], m2 | |
10853 movu [r0 + 566 * 16], m1 | |
10854 movu [r0 + 567 * 16], m2 | |
10855 movu [r0 + 568 * 16], m1 | |
10856 movu [r0 + 569 * 16], m2 | |
10857 movu [r0 + 570 * 16], m1 | |
10858 movu [r0 + 571 * 16], m2 | |
10859 movu [r0 + 572 * 16], m1 | |
10860 movu [r0 + 573 * 16], m2 | |
10861 movu [r0 + 574 * 16], m1 | |
10862 movu [r0 + 575 * 16], m2 | |
10863 | |
10864 ; mode 11 [row 0] | |
10865 movu m0, [r4] | |
10866 | |
10867 ; mode 11 [row 15 - first half] | |
10868 movu [r0 + 606 * 16], m0 | |
10869 | |
10870 movu [r0 + 606 * 16], m0 | |
10871 | |
10872 ; mode 12 [row 31] | |
10873 pslldq m6, m0, 4 | |
10874 pinsrb m6, [r3 + 26], 0 | |
10875 pinsrb m6, [r3 + 19], 1 | |
10876 pinsrb m6, [r3 + 13], 2 | |
10877 pinsrb m6, [r3 + 6], 3 | |
10878 movu [r0 + 702 * 16], m6 | |
10879 movu m6, [r4 + 12] | |
10880 movu [r0 + 703 * 16], m6 | |
10881 | |
10882 ; mode 11 [row 31] | |
10883 pslldq m6, m0, 1 | |
10884 pinsrb m6, [r3 + 16], 0 | |
10885 movu [r0 + 638 * 16], m6 | |
10886 movu m6, [r4 + 15] | |
10887 movu [r0 + 639 * 16], m6 | |
10888 | |
10889 movd m1, [r4 + 1] | |
10890 palignr m1, m0, 1 | |
10891 punpcklbw m0, m1 | |
10892 pmaddubsw m1, m0, [r5 + 30 * 16] | |
10893 pmulhrsw m1, m7 | |
10894 movu m2, [r4 + 8] | |
10895 movd m3, [r4 + 9] | |
10896 palignr m3, m2, 1 | |
10897 punpcklbw m2, m3 | |
10898 pmaddubsw m3, m2, [r5 + 30 * 16] | |
10899 pmulhrsw m3, m7 | |
10900 packuswb m1, m3 | |
10901 movu [r0 + 576 * 16], m1 | |
10902 | |
10903 movu m1, [r4 + 16] | |
10904 | |
10905 ; mode 11 [row 15 - second half] | |
10906 movu [r0 + 607 * 16], m1 | |
10907 | |
10908 movd m3, [r4 + 17] | |
10909 palignr m3, m1, 1 | |
10910 punpcklbw m1, m3 | |
10911 pmaddubsw m3, m1, [r5 + 30 * 16] | |
10912 pmulhrsw m3, m7 | |
10913 movu m4, [r4 + 24] | |
10914 movd m5, [r4 + 25] | |
10915 palignr m5, m4, 1 | |
10916 punpcklbw m4, m5 | |
10917 pmaddubsw m5, m4, [r5 + 30 * 16] | |
10918 pmulhrsw m5, m7 | |
10919 packuswb m3, m5 | |
10920 movu [r0 + 577 * 16], m3 | |
10921 | |
10922 ; mode 11 [row 1] | |
10923 pmaddubsw m3, m0, [r5 + 28 * 16] | |
10924 pmulhrsw m3, m7 | |
10925 pmaddubsw m5, m2, [r5 + 28 * 16] | |
10926 pmulhrsw m5, m7 | |
10927 packuswb m3, m5 | |
10928 movu [r0 + 578 * 16], m3 | |
10929 pmaddubsw m3, m1, [r5 + 28 * 16] | |
10930 pmulhrsw m3, m7 | |
10931 pmaddubsw m5, m4, [r5 + 28 * 16] | |
10932 pmulhrsw m5, m7 | |
10933 packuswb m3, m5 | |
10934 movu [r0 + 579 * 16], m3 | |
10935 | |
10936 ; mode 11 [row 2] | |
10937 pmaddubsw m3, m0, [r5 + 26 * 16] | |
10938 pmulhrsw m3, m7 | |
10939 pmaddubsw m5, m2, [r5 + 26 * 16] | |
10940 pmulhrsw m5, m7 | |
10941 packuswb m3, m5 | |
10942 movu [r0 + 580 * 16], m3 | |
10943 pmaddubsw m3, m1, [r5 + 26 * 16] | |
10944 pmulhrsw m3, m7 | |
10945 pmaddubsw m5, m4, [r5 + 26 * 16] | |
10946 pmulhrsw m5, m7 | |
10947 packuswb m3, m5 | |
10948 movu [r0 + 581 * 16], m3 | |
10949 | |
10950 ; mode 11 [row 3] | |
10951 pmaddubsw m3, m0, [r5 + 24 * 16] | |
10952 pmulhrsw m3, m7 | |
10953 pmaddubsw m5, m2, [r5 + 24 * 16] | |
10954 pmulhrsw m5, m7 | |
10955 packuswb m3, m5 | |
10956 movu [r0 + 582 * 16], m3 | |
10957 pmaddubsw m3, m1, [r5 + 24 * 16] | |
10958 pmulhrsw m3, m7 | |
10959 pmaddubsw m5, m4, [r5 + 24 * 16] | |
10960 pmulhrsw m5, m7 | |
10961 packuswb m3, m5 | |
10962 movu [r0 + 583 * 16], m3 | |
10963 | |
10964 ; mode 11 [row 4] | |
10965 pmaddubsw m3, m0, [r5 + 22 * 16] | |
10966 pmulhrsw m3, m7 | |
10967 pmaddubsw m5, m2, [r5 + 22 * 16] | |
10968 pmulhrsw m5, m7 | |
10969 packuswb m3, m5 | |
10970 movu [r0 + 584 * 16], m3 | |
10971 | |
10972 ; mode 12 [row 1 - first half] | |
10973 movu [r0 + 642 * 16], m3 | |
10974 | |
10975 pmaddubsw m3, m1, [r5 + 22 * 16] | |
10976 pmulhrsw m3, m7 | |
10977 pmaddubsw m5, m4, [r5 + 22 * 16] | |
10978 pmulhrsw m5, m7 | |
10979 packuswb m3, m5 | |
10980 movu [r0 + 585 * 16], m3 | |
10981 | |
10982 ; mode 12 [row 1 - second half] | |
10983 movu [r0 + 643 * 16], m3 | |
10984 | |
10985 ; mode 11 [row 5] | |
10986 pmaddubsw m3, m0, [r5 + 20 * 16] | |
10987 pmulhrsw m3, m7 | |
10988 pmaddubsw m5, m2, [r5 + 20 * 16] | |
10989 pmulhrsw m5, m7 | |
10990 packuswb m3, m5 | |
10991 movu [r0 + 586 * 16], m3 | |
10992 pmaddubsw m3, m1, [r5 + 20 * 16] | |
10993 pmulhrsw m3, m7 | |
10994 pmaddubsw m5, m4, [r5 + 20 * 16] | |
10995 pmulhrsw m5, m7 | |
10996 packuswb m3, m5 | |
10997 movu [r0 + 587 * 16], m3 | |
10998 | |
10999 ; mode 11 [row 6] | |
11000 pmaddubsw m3, m0, [r5 + 18 * 16] | |
11001 pmulhrsw m3, m7 | |
11002 pmaddubsw m5, m2, [r5 + 18 * 16] | |
11003 pmulhrsw m5, m7 | |
11004 packuswb m3, m5 | |
11005 movu [r0 + 588 * 16], m3 | |
11006 pmaddubsw m3, m1, [r5 + 18 * 16] | |
11007 pmulhrsw m3, m7 | |
11008 pmaddubsw m5, m4, [r5 + 18 * 16] | |
11009 pmulhrsw m5, m7 | |
11010 packuswb m3, m5 | |
11011 movu [r0 + 589 * 16], m3 | |
11012 | |
11013 ; mode 11 [row 7] | |
11014 pmaddubsw m3, m0, [r5 + 16 * 16] | |
11015 pmulhrsw m3, m7 | |
11016 pmaddubsw m5, m2, [r5 + 16 * 16] | |
11017 pmulhrsw m5, m7 | |
11018 packuswb m3, m5 | |
11019 movu [r0 + 590 * 16], m3 | |
11020 pmaddubsw m3, m1, [r5 + 16 * 16] | |
11021 pmulhrsw m3, m7 | |
11022 pmaddubsw m5, m4, [r5 + 16 * 16] | |
11023 pmulhrsw m5, m7 | |
11024 packuswb m3, m5 | |
11025 movu [r0 + 591 * 16], m3 | |
11026 | |
11027 ; mode 11 [row 8] | |
11028 pmaddubsw m3, m0, [r5 + 14 * 16] | |
11029 pmulhrsw m3, m7 | |
11030 pmaddubsw m5, m2, [r5 + 14 * 16] | |
11031 pmulhrsw m5, m7 | |
11032 packuswb m3, m5 | |
11033 movu [r0 + 592 * 16], m3 | |
11034 | |
11035 ; mode 13 [row 1 - first half] | |
11036 movu [r0 + 706 * 16], m3 | |
11037 | |
11038 pmaddubsw m3, m1, [r5 + 14 * 16] | |
11039 pmulhrsw m3, m7 | |
11040 pmaddubsw m5, m4, [r5 + 14 * 16] | |
11041 pmulhrsw m5, m7 | |
11042 packuswb m3, m5 | |
11043 movu [r0 + 593 * 16], m3 | |
11044 | |
11045 ; mode 13 [row 1 - second half] | |
11046 movu [r0 + 707 * 16], m3 | |
11047 | |
11048 ; mode 11 [row 9] | |
11049 pmaddubsw m3, m0, [r5 + 12 * 16] | |
11050 pmulhrsw m3, m7 | |
11051 pmaddubsw m5, m2, [r5 + 12 * 16] | |
11052 pmulhrsw m5, m7 | |
11053 packuswb m3, m5 | |
11054 movu [r0 + 594 * 16], m3 | |
11055 | |
11056 ; mode 12 [row 3 - first half] | |
11057 movu [r0 + 646 * 16], m3 | |
11058 | |
11059 pmaddubsw m3, m1, [r5 + 12 * 16] | |
11060 pmulhrsw m3, m7 | |
11061 pmaddubsw m5, m4, [r5 + 12 * 16] | |
11062 pmulhrsw m5, m7 | |
11063 packuswb m3, m5 | |
11064 movu [r0 + 595 * 16], m3 | |
11065 | |
11066 ; mode 12 [row 3 - second half] | |
11067 movu [r0 + 647 * 16], m3 | |
11068 | |
11069 ; mode 11 [row 10] | |
11070 pmaddubsw m3, m0, [r5 + 10 * 16] | |
11071 pmulhrsw m3, m7 | |
11072 pmaddubsw m5, m2, [r5 + 10 * 16] | |
11073 pmulhrsw m5, m7 | |
11074 packuswb m3, m5 | |
11075 movu [r0 + 596 * 16], m3 | |
11076 pmaddubsw m3, m1, [r5 + 10 * 16] | |
11077 pmulhrsw m3, m7 | |
11078 pmaddubsw m5, m4, [r5 + 10 * 16] | |
11079 pmulhrsw m5, m7 | |
11080 packuswb m3, m5 | |
11081 movu [r0 + 597 * 16], m3 | |
11082 | |
11083 ; mode 11 [row 11] | |
11084 pmaddubsw m3, m0, [r5 + 8 * 16] | |
11085 pmulhrsw m3, m7 | |
11086 pmaddubsw m5, m2, [r5 + 8 * 16] | |
11087 pmulhrsw m5, m7 | |
11088 packuswb m3, m5 | |
11089 movu [r0 + 598 * 16], m3 | |
11090 pmaddubsw m3, m1, [r5 + 8 * 16] | |
11091 pmulhrsw m3, m7 | |
11092 pmaddubsw m5, m4, [r5 + 8 * 16] | |
11093 pmulhrsw m5, m7 | |
11094 packuswb m3, m5 | |
11095 movu [r0 + 599 * 16], m3 | |
11096 | |
11097 ; mode 11 [row 12] | |
11098 pmaddubsw m3, m0, [r5 + 6 * 16] | |
11099 pmulhrsw m3, m7 | |
11100 pmaddubsw m5, m2, [r5 + 6 * 16] | |
11101 pmulhrsw m5, m7 | |
11102 packuswb m3, m5 | |
11103 movu [r0 + 600 * 16], m3 | |
11104 | |
11105 ; mode 14 [row 1 - first half] | |
11106 movu [r0 + 770 * 16], m3 | |
11107 | |
11108 pmaddubsw m3, m1, [r5 + 6 * 16] | |
11109 pmulhrsw m3, m7 | |
11110 pmaddubsw m5, m4, [r5 + 6 * 16] | |
11111 pmulhrsw m5, m7 | |
11112 packuswb m3, m5 | |
11113 movu [r0 + 601 * 16], m3 | |
11114 | |
11115 ; mode 14 [row 1 - second half] | |
11116 movu [r0 + 771 * 16], m3 | |
11117 | |
11118 ; mode 11 [row 13] | |
11119 pmaddubsw m3, m0, [r5 + 4 * 16] | |
11120 pmulhrsw m3, m7 | |
11121 pmaddubsw m5, m2, [r5 + 4 * 16] | |
11122 pmulhrsw m5, m7 | |
11123 packuswb m3, m5 | |
11124 movu [r0 + 602 * 16], m3 | |
11125 pmaddubsw m3, m1, [r5 + 4 * 16] | |
11126 pmulhrsw m3, m7 | |
11127 pmaddubsw m5, m4, [r5 + 4 * 16] | |
11128 pmulhrsw m5, m7 | |
11129 packuswb m3, m5 | |
11130 movu [r0 + 603 * 16], m3 | |
11131 | |
11132 ; mode 11 [row 14] | |
11133 pmaddubsw m3, m0, [r5 + 2 * 16] | |
11134 pmulhrsw m3, m7 | |
11135 pmaddubsw m5, m2, [r5 + 2 * 16] | |
11136 pmulhrsw m5, m7 | |
11137 packuswb m3, m5 | |
11138 movu [r0 + 604 * 16], m3 | |
11139 | |
11140 ; mode 12 [row 5 - first half] | |
11141 movu [r0 + 650 * 16], m3 | |
11142 | |
11143 pmaddubsw m3, m1, [r5 + 2 * 16] | |
11144 pmulhrsw m3, m7 | |
11145 pmaddubsw m5, m4, [r5 + 2 * 16] | |
11146 pmulhrsw m5, m7 | |
11147 packuswb m3, m5 | |
11148 movu [r0 + 605 * 16], m3 | |
11149 | |
11150 ; mode 12 [row 5 - second half] | |
11151 movu [r0 + 651 * 16], m3 | |
11152 | |
11153 ; mode 12 [row 0] | |
11154 pmaddubsw m3, m0, [r5 + 27 * 16] | |
11155 pmulhrsw m3, m7 | |
11156 pmaddubsw m5, m2, [r5 + 27 * 16] | |
11157 pmulhrsw m5, m7 | |
11158 packuswb m3, m5 | |
11159 movu [r0 + 640 * 16], m3 | |
11160 pmaddubsw m3, m1, [r5 + 27 * 16] | |
11161 pmulhrsw m3, m7 | |
11162 pmaddubsw m5, m4, [r5 + 27 * 16] | |
11163 pmulhrsw m5, m7 | |
11164 packuswb m3, m5 | |
11165 movu [r0 + 641 * 16], m3 | |
11166 | |
11167 ; mode 12 [row 2] | |
11168 pmaddubsw m3, m0, [r5 + 17 * 16] | |
11169 pmulhrsw m3, m7 | |
11170 pmaddubsw m5, m2, [r5 + 17 * 16] | |
11171 pmulhrsw m5, m7 | |
11172 packuswb m3, m5 | |
11173 movu [r0 + 644 * 16], m3 | |
11174 pmaddubsw m3, m1, [r5 + 17 * 16] | |
11175 pmulhrsw m3, m7 | |
11176 pmaddubsw m5, m4, [r5 + 17 * 16] | |
11177 pmulhrsw m5, m7 | |
11178 packuswb m3, m5 | |
11179 movu [r0 + 645 * 16], m3 | |
11180 | |
11181 ; mode 12 [row 4] | |
11182 pmaddubsw m3, m0, [r5 + 7 * 16] | |
11183 pmulhrsw m3, m7 | |
11184 pmaddubsw m5, m2, [r5 + 7 * 16] | |
11185 pmulhrsw m5, m7 | |
11186 packuswb m3, m5 | |
11187 movu [r0 + 648 * 16], m3 | |
11188 pmaddubsw m3, m1, [r5 + 7 * 16] | |
11189 pmulhrsw m3, m7 | |
11190 pmaddubsw m5, m4, [r5 + 7 * 16] | |
11191 pmulhrsw m5, m7 | |
11192 packuswb m3, m5 | |
11193 movu [r0 + 649 * 16], m3 | |
11194 | |
11195 ; mode 13 [row 0] | |
11196 pmaddubsw m3, m0, [r5 + 23 * 16] | |
11197 pmulhrsw m3, m7 | |
11198 pmaddubsw m5, m2, [r5 + 23 * 16] | |
11199 pmulhrsw m5, m7 | |
11200 packuswb m3, m5 | |
11201 movu [r0 + 704 * 16], m3 | |
11202 pmaddubsw m3, m1, [r5 + 23 * 16] | |
11203 pmulhrsw m3, m7 | |
11204 pmaddubsw m5, m4, [r5 + 23 * 16] | |
11205 pmulhrsw m5, m7 | |
11206 packuswb m3, m5 | |
11207 movu [r0 + 705 * 16], m3 | |
11208 | |
11209 ; mode 13 [row 2] | |
11210 pmaddubsw m3, m0, [r5 + 5 * 16] | |
11211 pmulhrsw m3, m7 | |
11212 pmaddubsw m5, m2, [r5 + 5 * 16] | |
11213 pmulhrsw m5, m7 | |
11214 packuswb m3, m5 | |
11215 movu [r0 + 708 * 16], m3 | |
11216 pmaddubsw m3, m1, [r5 + 5 * 16] | |
11217 pmulhrsw m3, m7 | |
11218 pmaddubsw m5, m4, [r5 + 5 * 16] | |
11219 pmulhrsw m5, m7 | |
11220 packuswb m3, m5 | |
11221 movu [r0 + 709 * 16], m3 | |
11222 | |
11223 ; mode 14 [row 0] | |
11224 pmaddubsw m3, m0, [r5 + 19 * 16] | |
11225 pmulhrsw m3, m7 | |
11226 pmaddubsw m5, m2, [r5 + 19 * 16] | |
11227 pmulhrsw m5, m7 | |
11228 packuswb m3, m5 | |
11229 movu [r0 + 768 * 16], m3 | |
11230 pmaddubsw m3, m1, [r5 + 19 * 16] | |
11231 pmulhrsw m3, m7 | |
11232 pmaddubsw m5, m4, [r5 + 19 * 16] | |
11233 pmulhrsw m5, m7 | |
11234 packuswb m3, m5 | |
11235 movu [r0 + 769 * 16], m3 | |
11236 | |
11237 ; mode 15 [row 0] | |
11238 pmaddubsw m3, m0, [r5 + 15 * 16] | |
11239 pmulhrsw m3, m7 | |
11240 pmaddubsw m5, m2, [r5 + 15 * 16] | |
11241 pmulhrsw m5, m7 | |
11242 packuswb m3, m5 | |
11243 movu [r0 + 832 * 16], m3 | |
11244 pmaddubsw m3, m1, [r5 + 15 * 16] | |
11245 pmulhrsw m3, m7 | |
11246 pmaddubsw m5, m4, [r5 + 15 * 16] | |
11247 pmulhrsw m5, m7 | |
11248 packuswb m3, m5 | |
11249 movu [r0 + 833 * 16], m3 | |
11250 | |
11251 ; mode 11 [row 16] | |
11252 pslldq m0, 2 | |
11253 pinsrb m0, [r4 + 0], 1 | |
11254 pinsrb m0, [r3 + 16], 0 | |
11255 pmaddubsw m3, m0, [r5 + 30 * 16] | |
11256 pmulhrsw m3, m7 | |
11257 pslldq m2, 2 | |
11258 pinsrb m2, [r4 + 8], 1 | |
11259 pinsrb m2, [r4 + 7], 0 | |
11260 pmaddubsw m5, m2, [r5 + 30 * 16] | |
11261 pmulhrsw m5, m7 | |
11262 packuswb m3, m5 | |
11263 movu [r0 + 608 * 16], m3 | |
11264 pslldq m1, 2 | |
11265 pinsrb m1, [r4 + 16], 1 | |
11266 pinsrb m1, [r4 + 15], 0 | |
11267 pmaddubsw m3, m1, [r5 + 30 * 16] | |
11268 pmulhrsw m3, m7 | |
11269 pslldq m4, 2 | |
11270 pinsrb m4, [r4 + 24], 1 | |
11271 pinsrb m4, [r4 + 23], 0 | |
11272 pmaddubsw m5, m4, [r5 + 30 * 16] | |
11273 pmulhrsw m5, m7 | |
11274 packuswb m3, m5 | |
11275 movu [r0 + 609 * 16], m3 | |
11276 | |
11277 ; mode 11 [row 17] | |
11278 pmaddubsw m3, m0, [r5 + 28 * 16] | |
11279 pmulhrsw m3, m7 | |
11280 pmaddubsw m5, m2, [r5 + 28 * 16] | |
11281 pmulhrsw m5, m7 | |
11282 packuswb m3, m5 | |
11283 movu [r0 + 610 * 16], m3 | |
11284 pmaddubsw m3, m1, [r5 + 28 * 16] | |
11285 pmulhrsw m3, m7 | |
11286 pmaddubsw m5, m4, [r5 + 28 * 16] | |
11287 pmulhrsw m5, m7 | |
11288 packuswb m3, m5 | |
11289 movu [r0 + 611 * 16], m3 | |
11290 | |
11291 ; mode 11 [row 18] | |
11292 pmaddubsw m3, m0, [r5 + 26 * 16] | |
11293 pmulhrsw m3, m7 | |
11294 pmaddubsw m5, m2, [r5 + 26 * 16] | |
11295 pmulhrsw m5, m7 | |
11296 packuswb m3, m5 | |
11297 movu [r0 + 612 * 16], m3 | |
11298 pmaddubsw m3, m1, [r5 + 26 * 16] | |
11299 pmulhrsw m3, m7 | |
11300 pmaddubsw m5, m4, [r5 + 26 * 16] | |
11301 pmulhrsw m5, m7 | |
11302 packuswb m3, m5 | |
11303 movu [r0 + 613 * 16], m3 | |
11304 | |
11305 ; mode 11 [row 19] | |
11306 pmaddubsw m3, m0, [r5 + 24 * 16] | |
11307 pmulhrsw m3, m7 | |
11308 pmaddubsw m5, m2, [r5 + 24 * 16] | |
11309 pmulhrsw m5, m7 | |
11310 packuswb m3, m5 | |
11311 movu [r0 + 614 * 16], m3 | |
11312 pmaddubsw m3, m1, [r5 + 24 * 16] | |
11313 pmulhrsw m3, m7 | |
11314 pmaddubsw m5, m4, [r5 + 24 * 16] | |
11315 pmulhrsw m5, m7 | |
11316 packuswb m3, m5 | |
11317 movu [r0 + 615 * 16], m3 | |
11318 | |
11319 ; mode 11 [row 20] | |
11320 pmaddubsw m3, m0, [r5 + 22 * 16] | |
11321 pmulhrsw m3, m7 | |
11322 pmaddubsw m5, m2, [r5 + 22 * 16] | |
11323 pmulhrsw m5, m7 | |
11324 packuswb m3, m5 | |
11325 movu [r0 + 616 * 16], m3 | |
11326 pmaddubsw m3, m1, [r5 + 22 * 16] | |
11327 pmulhrsw m3, m7 | |
11328 pmaddubsw m5, m4, [r5 + 22 * 16] | |
11329 pmulhrsw m5, m7 | |
11330 packuswb m3, m5 | |
11331 movu [r0 + 617 * 16], m3 | |
11332 | |
11333 ; mode 11 [row 21] | |
11334 pmaddubsw m3, m0, [r5 + 20 * 16] | |
11335 pmulhrsw m3, m7 | |
11336 pmaddubsw m5, m2, [r5 + 20 * 16] | |
11337 pmulhrsw m5, m7 | |
11338 packuswb m3, m5 | |
11339 movu [r0 + 618 * 16], m3 | |
11340 pmaddubsw m3, m1, [r5 + 20 * 16] | |
11341 pmulhrsw m3, m7 | |
11342 pmaddubsw m5, m4, [r5 + 20 * 16] | |
11343 pmulhrsw m5, m7 | |
11344 packuswb m3, m5 | |
11345 movu [r0 + 619 * 16], m3 | |
11346 | |
11347 ; mode 11 [row 22] | |
11348 pmaddubsw m3, m0, [r5 + 18 * 16] | |
11349 pmulhrsw m3, m7 | |
11350 pmaddubsw m5, m2, [r5 + 18 * 16] | |
11351 pmulhrsw m5, m7 | |
11352 packuswb m3, m5 | |
11353 movu [r0 + 620 * 16], m3 | |
11354 pmaddubsw m3, m1, [r5 + 18 * 16] | |
11355 pmulhrsw m3, m7 | |
11356 pmaddubsw m5, m4, [r5 + 18 * 16] | |
11357 pmulhrsw m5, m7 | |
11358 packuswb m3, m5 | |
11359 movu [r0 + 621 * 16], m3 | |
11360 | |
11361 ; mode 11 [row 23] | |
11362 pmaddubsw m3, m0, [r5 + 16 * 16] | |
11363 pmulhrsw m3, m7 | |
11364 pmaddubsw m5, m2, [r5 + 16 * 16] | |
11365 pmulhrsw m5, m7 | |
11366 packuswb m3, m5 | |
11367 movu [r0 + 622 * 16], m3 | |
11368 pmaddubsw m3, m1, [r5 + 16 * 16] | |
11369 pmulhrsw m3, m7 | |
11370 pmaddubsw m5, m4, [r5 + 16 * 16] | |
11371 pmulhrsw m5, m7 | |
11372 packuswb m3, m5 | |
11373 movu [r0 + 623 * 16], m3 | |
11374 | |
11375 ; mode 11 [row 24] | |
11376 pmaddubsw m3, m0, [r5 + 14 * 16] | |
11377 pmulhrsw m3, m7 | |
11378 pmaddubsw m5, m2, [r5 + 14 * 16] | |
11379 pmulhrsw m5, m7 | |
11380 packuswb m3, m5 | |
11381 movu [r0 + 624 * 16], m3 | |
11382 pmaddubsw m3, m1, [r5 + 14 * 16] | |
11383 pmulhrsw m3, m7 | |
11384 pmaddubsw m5, m4, [r5 + 14 * 16] | |
11385 pmulhrsw m5, m7 | |
11386 packuswb m3, m5 | |
11387 movu [r0 + 625 * 16], m3 | |
11388 | |
11389 ; mode 11 [row 25] | |
11390 pmaddubsw m3, m0, [r5 + 12 * 16] | |
11391 pmulhrsw m3, m7 | |
11392 pmaddubsw m5, m2, [r5 + 12 * 16] | |
11393 pmulhrsw m5, m7 | |
11394 packuswb m3, m5 | |
11395 movu [r0 + 626 * 16], m3 | |
11396 pmaddubsw m3, m1, [r5 + 12 * 16] | |
11397 pmulhrsw m3, m7 | |
11398 pmaddubsw m5, m4, [r5 + 12 * 16] | |
11399 pmulhrsw m5, m7 | |
11400 packuswb m3, m5 | |
11401 movu [r0 + 627 * 16], m3 | |
11402 | |
11403 ; mode 11 [row 26] | |
11404 pmaddubsw m3, m0, [r5 + 10 * 16] | |
11405 pmulhrsw m3, m7 | |
11406 pmaddubsw m5, m2, [r5 + 10 * 16] | |
11407 pmulhrsw m5, m7 | |
11408 packuswb m3, m5 | |
11409 movu [r0 + 628 * 16], m3 | |
11410 pmaddubsw m3, m1, [r5 + 10 * 16] | |
11411 pmulhrsw m3, m7 | |
11412 pmaddubsw m5, m4, [r5 + 10 * 16] | |
11413 pmulhrsw m5, m7 | |
11414 packuswb m3, m5 | |
11415 movu [r0 + 629 * 16], m3 | |
11416 | |
11417 ; mode 11 [row 27] | |
11418 pmaddubsw m3, m0, [r5 + 8 * 16] | |
11419 pmulhrsw m3, m7 | |
11420 pmaddubsw m5, m2, [r5 + 8 * 16] | |
11421 pmulhrsw m5, m7 | |
11422 packuswb m3, m5 | |
11423 movu [r0 + 630 * 16], m3 | |
11424 pmaddubsw m3, m1, [r5 + 8 * 16] | |
11425 pmulhrsw m3, m7 | |
11426 pmaddubsw m5, m4, [r5 + 8 * 16] | |
11427 pmulhrsw m5, m7 | |
11428 packuswb m3, m5 | |
11429 movu [r0 + 631 * 16], m3 | |
11430 | |
11431 ; mode 11 [row 28] | |
11432 pmaddubsw m3, m0, [r5 + 6 * 16] | |
11433 pmulhrsw m3, m7 | |
11434 pmaddubsw m5, m2, [r5 + 6 * 16] | |
11435 pmulhrsw m5, m7 | |
11436 packuswb m3, m5 | |
11437 movu [r0 + 632 * 16], m3 | |
11438 pmaddubsw m3, m1, [r5 + 6 * 16] | |
11439 pmulhrsw m3, m7 | |
11440 pmaddubsw m5, m4, [r5 + 6 * 16] | |
11441 pmulhrsw m5, m7 | |
11442 packuswb m3, m5 | |
11443 movu [r0 + 633 * 16], m3 | |
11444 | |
11445 ; mode 11 [row 29] | |
11446 pmaddubsw m3, m0, [r5 + 4 * 16] | |
11447 pmulhrsw m3, m7 | |
11448 pmaddubsw m5, m2, [r5 + 4 * 16] | |
11449 pmulhrsw m5, m7 | |
11450 packuswb m3, m5 | |
11451 movu [r0 + 634 * 16], m3 | |
11452 pmaddubsw m3, m1, [r5 + 4 * 16] | |
11453 pmulhrsw m3, m7 | |
11454 pmaddubsw m5, m4, [r5 + 4 * 16] | |
11455 pmulhrsw m5, m7 | |
11456 packuswb m3, m5 | |
11457 movu [r0 + 635 * 16], m3 | |
11458 | |
11459 ; mode 11 [row 30] | |
11460 pmaddubsw m3, m0, [r5 + 2 * 16] | |
11461 pmulhrsw m3, m7 | |
11462 pmaddubsw m5, m2, [r5 + 2 * 16] | |
11463 pmulhrsw m5, m7 | |
11464 packuswb m3, m5 | |
11465 movu [r0 + 636 * 16], m3 | |
11466 pmaddubsw m3, m1, [r5 + 2 * 16] | |
11467 pmulhrsw m3, m7 | |
11468 pmaddubsw m5, m4, [r5 + 2 * 16] | |
11469 pmulhrsw m5, m7 | |
11470 packuswb m3, m5 | |
11471 movu [r0 + 637 * 16], m3 | |
11472 | |
11473 ; mode 12 [row 6] | |
11474 pinsrb m0, [r3 + 6], 0 | |
11475 pmaddubsw m3, m0, [r5 + 29 * 16] | |
11476 pmulhrsw m3, m7 | |
11477 pmaddubsw m5, m2, [r5 + 29 * 16] | |
11478 pmulhrsw m5, m7 | |
11479 packuswb m3, m5 | |
11480 movu [r0 + 652 * 16], m3 | |
11481 pmaddubsw m3, m1, [r5 + 29 * 16] | |
11482 pmulhrsw m3, m7 | |
11483 pmaddubsw m5, m4, [r5 + 29 * 16] | |
11484 pmulhrsw m5, m7 | |
11485 packuswb m3, m5 | |
11486 movu [r0 + 653 * 16], m3 | |
11487 | |
11488 ; mode 12 [row 7] | |
11489 pmaddubsw m3, m0, [r5 + 24 * 16] | |
11490 pmulhrsw m3, m7 | |
11491 pmaddubsw m5, m2, [r5 + 24 * 16] | |
11492 pmulhrsw m5, m7 | |
11493 packuswb m3, m5 | |
11494 movu [r0 + 654 * 16], m3 | |
11495 pmaddubsw m3, m1, [r5 + 24 * 16] | |
11496 pmulhrsw m3, m7 | |
11497 pmaddubsw m5, m4, [r5 + 24 * 16] | |
11498 pmulhrsw m5, m7 | |
11499 packuswb m3, m5 | |
11500 movu [r0 + 655 * 16], m3 | |
11501 | |
11502 ; mode 12 [row 8] | |
11503 pmaddubsw m3, m0, [r5 + 19 * 16] | |
11504 pmulhrsw m3, m7 | |
11505 pmaddubsw m5, m2, [r5 + 19 * 16] | |
11506 pmulhrsw m5, m7 | |
11507 packuswb m3, m5 | |
11508 movu [r0 + 656 * 16], m3 | |
11509 pmaddubsw m3, m1, [r5 + 19 * 16] | |
11510 pmulhrsw m3, m7 | |
11511 pmaddubsw m5, m4, [r5 + 19 * 16] | |
11512 pmulhrsw m5, m7 | |
11513 packuswb m3, m5 | |
11514 movu [r0 + 657 * 16], m3 | |
11515 | |
11516 ; mode 12 [row 9] | |
11517 pmaddubsw m3, m0, [r5 + 14 * 16] | |
11518 pmulhrsw m3, m7 | |
11519 pmaddubsw m5, m2, [r5 + 14 * 16] | |
11520 pmulhrsw m5, m7 | |
11521 packuswb m3, m5 | |
11522 movu [r0 + 658 * 16], m3 | |
11523 pmaddubsw m3, m1, [r5 + 14 * 16] | |
11524 pmulhrsw m3, m7 | |
11525 pmaddubsw m5, m4, [r5 + 14 * 16] | |
11526 pmulhrsw m5, m7 | |
11527 packuswb m3, m5 | |
11528 movu [r0 + 659 * 16], m3 | |
11529 | |
11530 ; mode 12 [row 10] | |
11531 pmaddubsw m3, m0, [r5 + 9 * 16] | |
11532 pmulhrsw m3, m7 | |
11533 pmaddubsw m5, m2, [r5 + 9 * 16] | |
11534 pmulhrsw m5, m7 | |
11535 packuswb m3, m5 | |
11536 movu [r0 + 660 * 16], m3 | |
11537 pmaddubsw m3, m1, [r5 + 9 * 16] | |
11538 pmulhrsw m3, m7 | |
11539 pmaddubsw m5, m4, [r5 + 9 * 16] | |
11540 pmulhrsw m5, m7 | |
11541 packuswb m3, m5 | |
11542 movu [r0 + 661 * 16], m3 | |
11543 | |
11544 ; mode 12 [row 11] | |
11545 pmaddubsw m3, m0, [r5 + 4 * 16] | |
11546 pmulhrsw m3, m7 | |
11547 pmaddubsw m5, m2, [r5 + 4 * 16] | |
11548 pmulhrsw m5, m7 | |
11549 packuswb m3, m5 | |
11550 movu [r0 + 662 * 16], m3 | |
11551 pmaddubsw m3, m1, [r5 + 4 * 16] | |
11552 pmulhrsw m3, m7 | |
11553 pmaddubsw m5, m4, [r5 + 4 * 16] | |
11554 pmulhrsw m5, m7 | |
11555 packuswb m3, m5 | |
11556 movu [r0 + 663 * 16], m3 | |
11557 | |
11558 ; mode 13 [row 3] | |
11559 movu m6, m0 | |
11560 pinsrb m6, [r3 + 4], 0 | |
11561 pmaddubsw m3, m6, [r5 + 28 * 16] | |
11562 pmulhrsw m3, m7 | |
11563 pmaddubsw m5, m2, [r5 + 28 * 16] | |
11564 pmulhrsw m5, m7 | |
11565 packuswb m3, m5 | |
11566 movu [r0 + 710 * 16], m3 | |
11567 pmaddubsw m3, m1, [r5 + 28 * 16] | |
11568 pmulhrsw m3, m7 | |
11569 pmaddubsw m5, m4, [r5 + 28 * 16] | |
11570 pmulhrsw m5, m7 | |
11571 packuswb m3, m5 | |
11572 movu [r0 + 711 * 16], m3 | |
11573 | |
11574 ; mode 13 [row 4] | |
11575 pmaddubsw m3, m6, [r5 + 19 * 16] | |
11576 pmulhrsw m3, m7 | |
11577 pmaddubsw m5, m2, [r5 + 19 * 16] | |
11578 pmulhrsw m5, m7 | |
11579 packuswb m3, m5 | |
11580 movu [r0 + 712 * 16], m3 | |
11581 pmaddubsw m3, m1, [r5 + 19 * 16] | |
11582 pmulhrsw m3, m7 | |
11583 pmaddubsw m5, m4, [r5 + 19 * 16] | |
11584 pmulhrsw m5, m7 | |
11585 packuswb m3, m5 | |
11586 movu [r0 + 713 * 16], m3 | |
11587 | |
11588 ; mode 13 [row 5] | |
11589 pmaddubsw m3, m6, [r5 + 10 * 16] | |
11590 pmulhrsw m3, m7 | |
11591 pmaddubsw m5, m2, [r5 + 10 * 16] | |
11592 pmulhrsw m5, m7 | |
11593 packuswb m3, m5 | |
11594 movu [r0 + 714 * 16], m3 | |
11595 pmaddubsw m3, m1, [r5 + 10 * 16] | |
11596 pmulhrsw m3, m7 | |
11597 pmaddubsw m5, m4, [r5 + 10 * 16] | |
11598 pmulhrsw m5, m7 | |
11599 packuswb m3, m5 | |
11600 movu [r0 + 715 * 16], m3 | |
11601 | |
11602 ; mode 13 [row 6] | |
11603 pmaddubsw m3, m6, [r5 + 1 * 16] | |
11604 pmulhrsw m3, m7 | |
11605 pmaddubsw m5, m2, [r5 + 1 * 16] | |
11606 pmulhrsw m5, m7 | |
11607 packuswb m3, m5 | |
11608 movu [r0 + 716 * 16], m3 | |
11609 pmaddubsw m3, m1, [r5 + 1 * 16] | |
11610 pmulhrsw m3, m7 | |
11611 pmaddubsw m5, m4, [r5 + 1 * 16] | |
11612 pmulhrsw m5, m7 | |
11613 packuswb m3, m5 | |
11614 movu [r0 + 717 * 16], m3 | |
11615 | |
11616 ; mode 14 [row 2] | |
11617 movu m6, m0 | |
11618 pinsrb m6, [r4 + 0], 1 | |
11619 pinsrb m6, [r3 + 2], 0 | |
11620 pmaddubsw m3, m6, [r5 + 25 * 16] | |
11621 pmulhrsw m3, m7 | |
11622 pmaddubsw m5, m2, [r5 + 25 * 16] | |
11623 pmulhrsw m5, m7 | |
11624 packuswb m3, m5 | |
11625 movu [r0 + 772 * 16], m3 | |
11626 pmaddubsw m3, m1, [r5 + 25 * 16] | |
11627 pmulhrsw m3, m7 | |
11628 pmaddubsw m5, m4, [r5 + 25 * 16] | |
11629 pmulhrsw m5, m7 | |
11630 packuswb m3, m5 | |
11631 movu [r0 + 773 * 16], m3 | |
11632 | |
11633 ; mode 14 [row 3] | |
11634 pmaddubsw m3, m6, [r5 + 12 * 16] | |
11635 pmulhrsw m3, m7 | |
11636 pmaddubsw m5, m2, [r5 + 12 * 16] | |
11637 pmulhrsw m5, m7 | |
11638 packuswb m3, m5 | |
11639 movu [r0 + 774 * 16], m3 | |
11640 pmaddubsw m3, m1, [r5 + 12 * 16] | |
11641 pmulhrsw m3, m7 | |
11642 pmaddubsw m5, m4, [r5 + 12 * 16] | |
11643 pmulhrsw m5, m7 | |
11644 packuswb m3, m5 | |
11645 movu [r0 + 775 * 16], m3 | |
11646 | |
11647 ; mode 15 [row 1] | |
11648 pmaddubsw m3, m6, [r5 + 30 * 16] | |
11649 pmulhrsw m3, m7 | |
11650 pmaddubsw m5, m2, [r5 + 30 * 16] | |
11651 pmulhrsw m5, m7 | |
11652 packuswb m3, m5 | |
11653 movu [r0 + 834 * 16], m3 | |
11654 pmaddubsw m3, m1, [r5 + 30 * 16] | |
11655 pmulhrsw m3, m7 | |
11656 pmaddubsw m5, m4, [r5 + 30 * 16] | |
11657 pmulhrsw m5, m7 | |
11658 packuswb m3, m5 | |
11659 movu [r0 + 835 * 16], m3 | |
11660 | |
11661 ; mode 15 [row 2] | |
11662 pmaddubsw m3, m6, [r5 + 13 * 16] | |
11663 pmulhrsw m3, m7 | |
11664 pmaddubsw m5, m2, [r5 + 13 * 16] | |
11665 pmulhrsw m5, m7 | |
11666 packuswb m3, m5 | |
11667 movu [r0 + 836 * 16], m3 | |
11668 pmaddubsw m3, m1, [r5 + 13 * 16] | |
11669 pmulhrsw m3, m7 | |
11670 pmaddubsw m5, m4, [r5 + 13 * 16] | |
11671 pmulhrsw m5, m7 | |
11672 packuswb m3, m5 | |
11673 movu [r0 + 837 * 16], m3 | |
11674 | |
11675 ; mode 15 [row 3] | |
11676 pslldq m6, 2 | |
11677 pinsrb m6, [r3 + 2], 1 | |
11678 pinsrb m6, [r3 + 4], 0 | |
11679 pmaddubsw m3, m6, [r5 + 28 * 16] | |
11680 pmulhrsw m3, m7 | |
11681 pslldq m2, 2 | |
11682 pinsrb m2, [r4 + 7], 1 | |
11683 pinsrb m2, [r4 + 6], 0 | |
11684 pmaddubsw m5, m2, [r5 + 28 * 16] | |
11685 pmulhrsw m5, m7 | |
11686 packuswb m3, m5 | |
11687 movu [r0 + 838 * 16], m3 | |
11688 pslldq m1, 2 | |
11689 pinsrb m1, [r4 + 15], 1 | |
11690 pinsrb m1, [r4 + 14], 0 | |
11691 pmaddubsw m3, m1, [r5 + 28 * 16] | |
11692 pmulhrsw m3, m7 | |
11693 pslldq m4, 2 | |
11694 pinsrb m4, [r4 + 23], 1 | |
11695 pinsrb m4, [r4 + 22], 0 | |
11696 pmaddubsw m5, m4, [r5 + 28 * 16] | |
11697 pmulhrsw m5, m7 | |
11698 packuswb m3, m5 | |
11699 movu [r0 + 839 * 16], m3 | |
11700 | |
11701 ; mode 15 [row 4] | |
11702 pmaddubsw m3, m6, [r5 + 11 * 16] | |
11703 pmulhrsw m3, m7 | |
11704 pmaddubsw m5, m2, [r5 + 11 * 16] | |
11705 pmulhrsw m5, m7 | |
11706 packuswb m3, m5 | |
11707 movu [r0 + 840 * 16], m3 | |
11708 pmaddubsw m3, m1, [r5 + 11 * 16] | |
11709 pmulhrsw m3, m7 | |
11710 pmaddubsw m5, m4, [r5 + 11 * 16] | |
11711 pmulhrsw m5, m7 | |
11712 packuswb m3, m5 | |
11713 movu [r0 + 841 * 16], m3 | |
11714 | |
11715 ; mode 15 [row 5, 0-7] | |
11716 pslldq m6, 2 | |
11717 pinsrb m6, [r3 + 4], 1 | |
11718 pinsrb m6, [r3 + 6], 0 | |
11719 pmaddubsw m3, m6, [r5 + 26 * 16] | |
11720 pmulhrsw m3, m7 | |
11721 packuswb m3, m3 | |
11722 movh [r0 + 842 * 16], m3 | |
11723 | |
11724 ; mode 15 [row 6, 0-7] | |
11725 pmaddubsw m3, m6, [r5 + 9 * 16] | |
11726 pmulhrsw m3, m7 | |
11727 packuswb m3, m3 | |
11728 movh [r0 + 844 * 16], m3 | |
11729 | |
11730 ; mode 15 [row 7, 0-7] | |
11731 pslldq m6, 2 | |
11732 pinsrb m6, [r3 + 6], 1 | |
11733 pinsrb m6, [r3 + 8], 0 | |
11734 pmaddubsw m3, m6, [r5 + 24 * 16] | |
11735 pmulhrsw m3, m7 | |
11736 packuswb m3, m3 | |
11737 movh [r0 + 846 * 16], m3 | |
11738 | |
11739 ; mode 15 [row 8, 0-7] | |
11740 pmaddubsw m3, m6, [r5 + 7 * 16] | |
11741 pmulhrsw m3, m7 | |
11742 packuswb m3, m3 | |
11743 movh [r0 + 848 * 16], m3 | |
11744 | |
11745 ; mode 15 [row 9, 0-7] | |
11746 pslldq m6, 2 | |
11747 pinsrb m6, [r3 + 8], 1 | |
11748 pinsrb m6, [r3 + 9], 0 | |
11749 pmaddubsw m3, m6, [r5 + 22 * 16] | |
11750 pmulhrsw m3, m7 | |
11751 packuswb m3, m3 | |
11752 movh [r0 + 850 * 16], m3 | |
11753 | |
11754 ; mode 15 [row 10, 0-7] | |
11755 pmaddubsw m3, m6, [r5 + 5 * 16] | |
11756 pmulhrsw m3, m7 | |
11757 packuswb m3, m3 | |
11758 movh [r0 + 852 * 16], m3 | |
11759 | |
11760 ; mode 15 [row 11, 0-7] | |
11761 pslldq m6, 2 | |
11762 pinsrb m6, [r3 + 9], 1 | |
11763 pinsrb m6, [r3 + 11], 0 | |
11764 pmaddubsw m3, m6, [r5 + 20 * 16] | |
11765 pmulhrsw m3, m7 | |
11766 packuswb m3, m3 | |
11767 movh [r0 + 854 * 16], m3 | |
11768 | |
11769 ; mode 15 [row 12, 0-7] | |
11770 pmaddubsw m3, m6, [r5 + 3 * 16] | |
11771 pmulhrsw m3, m7 | |
11772 packuswb m3, m3 | |
11773 movh [r0 + 856 * 16], m3 | |
11774 | |
11775 ; mode 15 [row 13, 0-7] | |
11776 pslldq m6, 2 | |
11777 pinsrb m6, [r3 + 11], 1 | |
11778 pinsrb m6, [r3 + 13], 0 | |
11779 pmaddubsw m3, m6, [r5 + 18 * 16] | |
11780 pmulhrsw m3, m7 | |
11781 packuswb m3, m3 | |
11782 movh [r0 + 858 * 16], m3 | |
11783 | |
11784 ; mode 15 [row 14, 0-7] | |
11785 pmaddubsw m3, m6, [r5 + 1 * 16] | |
11786 pmulhrsw m3, m7 | |
11787 packuswb m3, m3 | |
11788 movh [r0 + 860 * 16], m3 | |
11789 | |
11790 ; mode 15 [row 15, 0-7] | |
11791 pslldq m6, 2 | |
11792 pinsrb m6, [r3 + 13], 1 | |
11793 pinsrb m6, [r3 + 15], 0 | |
11794 pmaddubsw m3, m6, [r5 + 16 * 16] | |
11795 pmulhrsw m3, m7 | |
11796 packuswb m3, m3 | |
11797 movh [r0 + 862 * 16], m3 | |
11798 | |
11799 ; mode 15 [row 16, 0-7] | |
11800 pslldq m6, 2 | |
11801 pinsrb m6, [r3 + 15], 1 | |
11802 pinsrb m6, [r3 + 17], 0 | |
11803 pmaddubsw m3, m6, [r5 + 31 * 16] | |
11804 pmulhrsw m3, m7 | |
11805 packuswb m3, m3 | |
11806 movh [r0 + 864 * 16], m3 | |
11807 | |
11808 ; mode 15 [row 17, 0-7] | |
11809 pmaddubsw m3, m6, [r5 + 14 * 16] | |
11810 pmulhrsw m3, m7 | |
11811 packuswb m3, m3 | |
11812 movh [r0 + 866 * 16], m3 | |
11813 | |
11814 ; mode 15 [row 18, 0-7] | |
11815 pslldq m6, 2 | |
11816 pinsrb m6, [r3 + 17], 1 | |
11817 pinsrb m6, [r3 + 19], 0 | |
11818 pmaddubsw m3, m6, [r5 + 29 * 16] | |
11819 pmulhrsw m3, m7 | |
11820 packuswb m3, m3 | |
11821 movh [r0 + 868 * 16], m3 | |
11822 | |
11823 ; mode 15 [row 19, 0-7] | |
11824 pmaddubsw m3, m6, [r5 + 12 * 16] | |
11825 pmulhrsw m3, m7 | |
11826 packuswb m3, m3 | |
11827 movh [r0 + 870 * 16], m3 | |
11828 | |
11829 ; mode 15 [row 20, 0-7] | |
11830 pslldq m6, 2 | |
11831 pinsrb m6, [r3 + 19], 1 | |
11832 pinsrb m6, [r3 + 21], 0 | |
11833 pmaddubsw m3, m6, [r5 + 27 * 16] | |
11834 pmulhrsw m3, m7 | |
11835 packuswb m3, m3 | |
11836 movh [r0 + 872 * 16], m3 | |
11837 | |
11838 ; mode 15 [row 21, 0-7] | |
11839 pmaddubsw m3, m6, [r5 + 10 * 16] | |
11840 pmulhrsw m3, m7 | |
11841 packuswb m3, m3 | |
11842 movh [r0 + 874 * 16], m3 | |
11843 | |
11844 ; mode 15 [row 22, 0-7] | |
11845 pslldq m6, 2 | |
11846 pinsrb m6, [r3 + 21], 1 | |
11847 pinsrb m6, [r3 + 23], 0 | |
11848 pmaddubsw m3, m6, [r5 + 25 * 16] | |
11849 pmulhrsw m3, m7 | |
11850 packuswb m3, m3 | |
11851 movh [r0 + 876 * 16], m3 | |
11852 | |
11853 ; mode 15 [row 23, 0-7] | |
11854 pmaddubsw m3, m6, [r5 + 8 * 16] | |
11855 pmulhrsw m3, m7 | |
11856 packuswb m3, m3 | |
11857 movh [r0 + 878 * 16], m3 | |
11858 | |
11859 ; mode 15 [row 24, 0-7] | |
11860 pslldq m6, 2 | |
11861 pinsrb m6, [r3 + 23], 1 | |
11862 pinsrb m6, [r3 + 24], 0 | |
11863 pmaddubsw m3, m6, [r5 + 23 * 16] | |
11864 pmulhrsw m3, m7 | |
11865 packuswb m3, m3 | |
11866 movh [r0 + 880 * 16], m3 | |
11867 | |
11868 ; mode 15 [row 25, 0-7] | |
11869 pmaddubsw m3, m6, [r5 + 6 * 16] | |
11870 pmulhrsw m3, m7 | |
11871 packuswb m3, m3 | |
11872 movh [r0 + 882 * 16], m3 | |
11873 | |
11874 ; mode 15 [row 26, 0-7] | |
11875 pslldq m6, 2 | |
11876 pinsrb m6, [r3 + 24], 1 | |
11877 pinsrb m6, [r3 + 26], 0 | |
11878 pmaddubsw m3, m6, [r5 + 21 * 16] | |
11879 pmulhrsw m3, m7 | |
11880 packuswb m3, m3 | |
11881 movh [r0 + 884 * 16], m3 | |
11882 | |
11883 ; mode 15 [row 27, 0-7] | |
11884 pmaddubsw m3, m6, [r5 + 4 * 16] | |
11885 pmulhrsw m3, m7 | |
11886 packuswb m3, m3 | |
11887 movh [r0 + 886 * 16], m3 | |
11888 | |
11889 ; mode 15 [row 28, 0-7] | |
11890 pslldq m6, 2 | |
11891 pinsrb m6, [r3 + 26], 1 | |
11892 pinsrb m6, [r3 + 28], 0 | |
11893 pmaddubsw m3, m6, [r5 + 19 * 16] | |
11894 pmulhrsw m3, m7 | |
11895 packuswb m3, m3 | |
11896 movh [r0 + 888 * 16], m3 | |
11897 | |
11898 ; mode 15 [row 29, 0-7] | |
11899 pmaddubsw m3, m6, [r5 + 2 * 16] | |
11900 pmulhrsw m3, m7 | |
11901 packuswb m3, m3 | |
11902 movh [r0 + 890 * 16], m3 | |
11903 | |
11904 ; mode 15 [row 30, 0-7] | |
11905 pslldq m6, 2 | |
11906 pinsrb m6, [r3 + 28], 1 | |
11907 pinsrb m6, [r3 + 30], 0 | |
11908 pmaddubsw m3, m6, [r5 + 17 * 16] | |
11909 pmulhrsw m3, m7 | |
11910 packuswb m3, m3 | |
11911 movh [r0 + 892 * 16], m3 | |
11912 | |
11913 ; mode 15 [row 31, 0-7] | |
11914 pshufb m3, m6, [tab_S2] | |
11915 movh [r0 + 894 * 16], m3 | |
11916 | |
11917 ; mode 12 [row 12] | |
11918 pslldq m0, 2 | |
11919 pinsrb m0, [r3 + 6], 1 | |
11920 pinsrb m0, [r3 + 13], 0 | |
11921 pmaddubsw m3, m0, [r5 + 31 * 16] | |
11922 pmulhrsw m3, m7 | |
11923 pmaddubsw m5, m2, [r5 + 31 * 16] | |
11924 pmulhrsw m5, m7 | |
11925 packuswb m3, m5 | |
11926 movu [r0 + 664 * 16], m3 | |
11927 pmaddubsw m3, m1, [r5 + 31 * 16] | |
11928 pmulhrsw m3, m7 | |
11929 pmaddubsw m5, m4, [r5 + 31 * 16] | |
11930 pmulhrsw m5, m7 | |
11931 packuswb m3, m5 | |
11932 movu [r0 + 665 * 16], m3 | |
11933 | |
11934 ; mode 12 [row 13] | |
11935 pmaddubsw m3, m0, [r5 + 26 * 16] | |
11936 pmulhrsw m3, m7 | |
11937 pmaddubsw m5, m2, [r5 + 26 * 16] | |
11938 pmulhrsw m5, m7 | |
11939 packuswb m3, m5 | |
11940 movu [r0 + 666 * 16], m3 | |
11941 pmaddubsw m3, m1, [r5 + 26 * 16] | |
11942 pmulhrsw m3, m7 | |
11943 pmaddubsw m5, m4, [r5 + 26 * 16] | |
11944 pmulhrsw m5, m7 | |
11945 packuswb m3, m5 | |
11946 movu [r0 + 667 * 16], m3 | |
11947 | |
11948 ; mode 12 [row 14] | |
11949 pmaddubsw m3, m0, [r5 + 21 * 16] | |
11950 pmulhrsw m3, m7 | |
11951 pmaddubsw m5, m2, [r5 + 21 * 16] | |
11952 pmulhrsw m5, m7 | |
11953 packuswb m3, m5 | |
11954 movu [r0 + 668 * 16], m3 | |
11955 pmaddubsw m3, m1, [r5 + 21 * 16] | |
11956 pmulhrsw m3, m7 | |
11957 pmaddubsw m5, m4, [r5 + 21 * 16] | |
11958 pmulhrsw m5, m7 | |
11959 packuswb m3, m5 | |
11960 movu [r0 + 669 * 16], m3 | |
11961 | |
11962 ; mode 12 [row 15] | |
11963 pmaddubsw m3, m0, [r5 + 16 * 16] | |
11964 pmulhrsw m3, m7 | |
11965 pmaddubsw m5, m2, [r5 + 16 * 16] | |
11966 pmulhrsw m5, m7 | |
11967 packuswb m3, m5 | |
11968 movu [r0 + 670 * 16], m3 | |
11969 pmaddubsw m3, m1, [r5 + 16 * 16] | |
11970 pmulhrsw m3, m7 | |
11971 pmaddubsw m5, m4, [r5 + 16 * 16] | |
11972 pmulhrsw m5, m7 | |
11973 packuswb m3, m5 | |
11974 movu [r0 + 671 * 16], m3 | |
11975 | |
11976 ; mode 12 [row 16] | |
11977 pmaddubsw m3, m0, [r5 + 11 * 16] | |
11978 pmulhrsw m3, m7 | |
11979 pmaddubsw m5, m2, [r5 + 11 * 16] | |
11980 pmulhrsw m5, m7 | |
11981 packuswb m3, m5 | |
11982 movu [r0 + 672 * 16], m3 | |
11983 pmaddubsw m3, m1, [r5 + 11 * 16] | |
11984 pmulhrsw m3, m7 | |
11985 pmaddubsw m5, m4, [r5 + 11 * 16] | |
11986 pmulhrsw m5, m7 | |
11987 packuswb m3, m5 | |
11988 movu [r0 + 673 * 16], m3 | |
11989 | |
11990 ; mode 12 [row 17] | |
11991 pmaddubsw m3, m0, [r5 + 6 * 16] | |
11992 pmulhrsw m3, m7 | |
11993 pmaddubsw m5, m2, [r5 + 6 * 16] | |
11994 pmulhrsw m5, m7 | |
11995 packuswb m3, m5 | |
11996 movu [r0 + 674 * 16], m3 | |
11997 pmaddubsw m3, m1, [r5 + 6 * 16] | |
11998 pmulhrsw m3, m7 | |
11999 pmaddubsw m5, m4, [r5 + 6 * 16] | |
12000 pmulhrsw m5, m7 | |
12001 packuswb m3, m5 | |
12002 movu [r0 + 675 * 16], m3 | |
12003 | |
12004 ; mode 12 [row 18] | |
12005 pmaddubsw m3, m0, [r5 + 1 * 16] | |
12006 pmulhrsw m3, m7 | |
12007 pmaddubsw m5, m2, [r5 + 1 * 16] | |
12008 pmulhrsw m5, m7 | |
12009 packuswb m3, m5 | |
12010 movu [r0 + 676 * 16], m3 | |
12011 pmaddubsw m3, m1, [r5 + 1 * 16] | |
12012 pmulhrsw m3, m7 | |
12013 pmaddubsw m5, m4, [r5 + 1 * 16] | |
12014 pmulhrsw m5, m7 | |
12015 packuswb m3, m5 | |
12016 movu [r0 + 677 * 16], m3 | |
12017 | |
12018 ; mode 13 [row 7] | |
12019 movu m6, m0 | |
12020 pinsrb m6, [r3 + 4], 2 | |
12021 pinsrb m6, [r3 + 4], 1 | |
12022 pinsrb m6, [r3 + 7], 0 | |
12023 pmaddubsw m3, m6, [r5 + 24 * 16] | |
12024 pmulhrsw m3, m7 | |
12025 pmaddubsw m5, m2, [r5 + 24 * 16] | |
12026 pmulhrsw m5, m7 | |
12027 packuswb m3, m5 | |
12028 movu [r0 + 718 * 16], m3 | |
12029 pmaddubsw m3, m1, [r5 + 24 * 16] | |
12030 pmulhrsw m3, m7 | |
12031 pmaddubsw m5, m4, [r5 + 24 * 16] | |
12032 pmulhrsw m5, m7 | |
12033 packuswb m3, m5 | |
12034 movu [r0 + 719 * 16], m3 | |
12035 | |
12036 ; mode 13 [row 8] | |
12037 pmaddubsw m3, m6, [r5 + 15 * 16] | |
12038 pmulhrsw m3, m7 | |
12039 pmaddubsw m5, m2, [r5 + 15 * 16] | |
12040 pmulhrsw m5, m7 | |
12041 packuswb m3, m5 | |
12042 movu [r0 + 720 * 16], m3 | |
12043 pmaddubsw m3, m1, [r5 + 15 * 16] | |
12044 pmulhrsw m3, m7 | |
12045 pmaddubsw m5, m4, [r5 + 15 * 16] | |
12046 pmulhrsw m5, m7 | |
12047 packuswb m3, m5 | |
12048 movu [r0 + 721 * 16], m3 | |
12049 | |
12050 ; mode 13 [row 9] | |
12051 pmaddubsw m3, m6, [r5 + 6 * 16] | |
12052 pmulhrsw m3, m7 | |
12053 pmaddubsw m5, m2, [r5 + 6 * 16] | |
12054 pmulhrsw m5, m7 | |
12055 packuswb m3, m5 | |
12056 movu [r0 + 722 * 16], m3 | |
12057 pmaddubsw m3, m1, [r5 + 6 * 16] | |
12058 pmulhrsw m3, m7 | |
12059 pmaddubsw m5, m4, [r5 + 6 * 16] | |
12060 pmulhrsw m5, m7 | |
12061 packuswb m3, m5 | |
12062 movu [r0 + 723 * 16], m3 | |
12063 | |
12064 ; mode 14 [row 4] | |
12065 pinsrb m6, [r3 + 2], 2 | |
12066 pinsrb m6, [r3 + 2], 1 | |
12067 pinsrb m6, [r3 + 5], 0 | |
12068 pmaddubsw m3, m6, [r5 + 31 * 16] | |
12069 pmulhrsw m3, m7 | |
12070 pmaddubsw m5, m2, [r5 + 31 * 16] | |
12071 pmulhrsw m5, m7 | |
12072 packuswb m3, m5 | |
12073 movu [r0 + 776 * 16], m3 | |
12074 pmaddubsw m3, m1, [r5 + 31 * 16] | |
12075 pmulhrsw m3, m7 | |
12076 pmaddubsw m5, m4, [r5 + 31 * 16] | |
12077 pmulhrsw m5, m7 | |
12078 packuswb m3, m5 | |
12079 movu [r0 + 777 * 16], m3 | |
12080 | |
12081 ; mode 14 [row 5] | |
12082 pmaddubsw m3, m6, [r5 + 18 * 16] | |
12083 pmulhrsw m3, m7 | |
12084 pmaddubsw m5, m2, [r5 + 18 * 16] | |
12085 pmulhrsw m5, m7 | |
12086 packuswb m3, m5 | |
12087 movu [r0 + 778 * 16], m3 | |
12088 pmaddubsw m3, m1, [r5 + 18 * 16] | |
12089 pmulhrsw m3, m7 | |
12090 pmaddubsw m5, m4, [r5 + 18 * 16] | |
12091 pmulhrsw m5, m7 | |
12092 packuswb m3, m5 | |
12093 movu [r0 + 779 * 16], m3 | |
12094 | |
12095 ; mode 14 [row 6] | |
12096 pmaddubsw m3, m6, [r5 + 5 * 16] | |
12097 pmulhrsw m3, m7 | |
12098 pmaddubsw m5, m2, [r5 + 5 * 16] | |
12099 pmulhrsw m5, m7 | |
12100 packuswb m3, m5 | |
12101 movu [r0 + 780 * 16], m3 | |
12102 pmaddubsw m3, m1, [r5 + 5 * 16] | |
12103 pmulhrsw m3, m7 | |
12104 pmaddubsw m5, m4, [r5 + 5 * 16] | |
12105 pmulhrsw m5, m7 | |
12106 packuswb m3, m5 | |
12107 movu [r0 + 781 * 16], m3 | |
12108 | |
12109 ; mode 14 [row 7] | |
12110 pslldq m6, 2 | |
12111 pinsrb m6, [r3 + 5], 1 | |
12112 pinsrb m6, [r3 + 7], 0 | |
12113 pmaddubsw m3, m6, [r5 + 24 * 16] | |
12114 pmulhrsw m3, m7 | |
12115 pslldq m2, 2 | |
12116 pinsrw m2, [r4 + 5], 0 | |
12117 pmaddubsw m5, m2, [r5 + 24 * 16] | |
12118 pmulhrsw m5, m7 | |
12119 packuswb m3, m5 | |
12120 movu [r0 + 782 * 16], m3 | |
12121 pslldq m1, 2 | |
12122 pinsrw m1, [r4 + 13], 0 | |
12123 pmaddubsw m3, m1, [r5 + 24 * 16] | |
12124 pmulhrsw m3, m7 | |
12125 pslldq m4, 2 | |
12126 pinsrw m4, [r4 + 21], 0 | |
12127 pmaddubsw m5, m4, [r5 + 24 * 16] | |
12128 pmulhrsw m5, m7 | |
12129 packuswb m3, m5 | |
12130 movu [r0 + 783 * 16], m3 | |
12131 | |
12132 ; mode 14 [row 8] | |
12133 pmaddubsw m3, m6, [r5 + 11 * 16] | |
12134 pmulhrsw m3, m7 | |
12135 pmaddubsw m5, m2, [r5 + 11 * 16] | |
12136 pmulhrsw m5, m7 | |
12137 packuswb m3, m5 | |
12138 movu [r0 + 784 * 16], m3 | |
12139 pmaddubsw m3, m1, [r5 + 11 * 16] | |
12140 pmulhrsw m3, m7 | |
12141 pmaddubsw m5, m4, [r5 + 11 * 16] | |
12142 pmulhrsw m5, m7 | |
12143 packuswb m3, m5 | |
12144 movu [r0 + 785 * 16], m3 | |
12145 | |
12146 ; mode 15 [row 5, 8-31] | |
12147 pmaddubsw m5, m2, [r5 + 26 * 16] | |
12148 pmulhrsw m5, m7 | |
12149 packuswb m5, m5 | |
12150 movh [r0 + 842 * 16 + 8], m5 | |
12151 pmaddubsw m3, m1, [r5 + 26 * 16] | |
12152 pmulhrsw m3, m7 | |
12153 pmaddubsw m5, m4, [r5 + 26 * 16] | |
12154 pmulhrsw m5, m7 | |
12155 packuswb m3, m5 | |
12156 movu [r0 + 843 * 16], m3 | |
12157 | |
12158 ; mode 15 [row 6, 8-31] | |
12159 pmaddubsw m5, m2, [r5 + 9 * 16] | |
12160 pmulhrsw m5, m7 | |
12161 packuswb m5, m5 | |
12162 movh [r0 + 844 * 16 + 8], m5 | |
12163 pmaddubsw m3, m1, [r5 + 9 * 16] | |
12164 pmulhrsw m3, m7 | |
12165 pmaddubsw m5, m4, [r5 + 9 * 16] | |
12166 pmulhrsw m5, m7 | |
12167 packuswb m3, m5 | |
12168 movu [r0 + 845 * 16], m3 | |
12169 | |
12170 ; mode 12 [row 19] | |
12171 pslldq m0, 2 | |
12172 pinsrb m0, [r3 + 13], 1 | |
12173 pinsrb m0, [r3 + 19], 0 | |
12174 pmaddubsw m3, m0, [r5 + 28 * 16] | |
12175 pmulhrsw m3, m7 | |
12176 pmaddubsw m5, m2, [r5 + 28 * 16] | |
12177 pmulhrsw m5, m7 | |
12178 packuswb m3, m5 | |
12179 movu [r0 + 678 * 16], m3 | |
12180 pmaddubsw m3, m1, [r5 + 28 * 16] | |
12181 pmulhrsw m3, m7 | |
12182 pmaddubsw m5, m4, [r5 + 28 * 16] | |
12183 pmulhrsw m5, m7 | |
12184 packuswb m3, m5 | |
12185 movu [r0 + 679 * 16], m3 | |
12186 | |
12187 ; mode 12 [row 20] | |
12188 pmaddubsw m3, m0, [r5 + 23 * 16] | |
12189 pmulhrsw m3, m7 | |
12190 pmaddubsw m5, m2, [r5 + 23 * 16] | |
12191 pmulhrsw m5, m7 | |
12192 packuswb m3, m5 | |
12193 movu [r0 + 680 * 16], m3 | |
12194 pmaddubsw m3, m1, [r5 + 23 * 16] | |
12195 pmulhrsw m3, m7 | |
12196 pmaddubsw m5, m4, [r5 + 23 * 16] | |
12197 pmulhrsw m5, m7 | |
12198 packuswb m3, m5 | |
12199 movu [r0 + 681 * 16], m3 | |
12200 | |
12201 ; mode 12 [row 21] | |
12202 pmaddubsw m3, m0, [r5 + 18 * 16] | |
12203 pmulhrsw m3, m7 | |
12204 pmaddubsw m5, m2, [r5 + 18 * 16] | |
12205 pmulhrsw m5, m7 | |
12206 packuswb m3, m5 | |
12207 movu [r0 + 682 * 16], m3 | |
12208 pmaddubsw m3, m1, [r5 + 18 * 16] | |
12209 pmulhrsw m3, m7 | |
12210 pmaddubsw m5, m4, [r5 + 18 * 16] | |
12211 pmulhrsw m5, m7 | |
12212 packuswb m3, m5 | |
12213 movu [r0 + 683 * 16], m3 | |
12214 | |
12215 ; mode 12 [row 22] | |
12216 pmaddubsw m3, m0, [r5 + 13 * 16] | |
12217 pmulhrsw m3, m7 | |
12218 pmaddubsw m5, m2, [r5 + 13 * 16] | |
12219 pmulhrsw m5, m7 | |
12220 packuswb m3, m5 | |
12221 movu [r0 + 684 * 16], m3 | |
12222 pmaddubsw m3, m1, [r5 + 13 * 16] | |
12223 pmulhrsw m3, m7 | |
12224 pmaddubsw m5, m4, [r5 + 13 * 16] | |
12225 pmulhrsw m5, m7 | |
12226 packuswb m3, m5 | |
12227 movu [r0 + 685 * 16], m3 | |
12228 | |
12229 ; mode 12 [row 23] | |
12230 pmaddubsw m3, m0, [r5 + 8 * 16] | |
12231 pmulhrsw m3, m7 | |
12232 pmaddubsw m5, m2, [r5 + 8 * 16] | |
12233 pmulhrsw m5, m7 | |
12234 packuswb m3, m5 | |
12235 movu [r0 + 686 * 16], m3 | |
12236 pmaddubsw m3, m1, [r5 + 8 * 16] | |
12237 pmulhrsw m3, m7 | |
12238 pmaddubsw m5, m4, [r5 + 8 * 16] | |
12239 pmulhrsw m5, m7 | |
12240 packuswb m3, m5 | |
12241 movu [r0 + 687 * 16], m3 | |
12242 | |
12243 ; mode 12 [row 24] | |
12244 pmaddubsw m3, m0, [r5 + 3 * 16] | |
12245 pmulhrsw m3, m7 | |
12246 pmaddubsw m5, m2, [r5 + 3 * 16] | |
12247 pmulhrsw m5, m7 | |
12248 packuswb m3, m5 | |
12249 movu [r0 + 688 * 16], m3 | |
12250 pmaddubsw m3, m1, [r5 + 3 * 16] | |
12251 pmulhrsw m3, m7 | |
12252 pmaddubsw m5, m4, [r5 + 3 * 16] | |
12253 pmulhrsw m5, m7 | |
12254 packuswb m3, m5 | |
12255 movu [r0 + 689 * 16], m3 | |
12256 | |
12257 ; mode 13 [row 10] | |
12258 movu m7, m6 | |
12259 movu m6, m0 | |
12260 pinsrb m6, [r3 + 4], 4 | |
12261 pinsrb m6, [r3 + 4], 3 | |
12262 pinsrb m6, [r3 + 7], 2 | |
12263 pinsrb m6, [r3 + 7], 1 | |
12264 pinsrb m6, [r3 + 11], 0 | |
12265 pmaddubsw m3, m6, [r5 + 29 * 16] | |
12266 pmulhrsw m3, [pw_1024] | |
12267 pmaddubsw m5, m2, [r5 + 29 * 16] | |
12268 pmulhrsw m5, [pw_1024] | |
12269 packuswb m3, m5 | |
12270 movu [r0 + 724 * 16], m3 | |
12271 pmaddubsw m3, m1, [r5 + 29 * 16] | |
12272 pmulhrsw m3, [pw_1024] | |
12273 pmaddubsw m5, m4, [r5 + 29 * 16] | |
12274 pmulhrsw m5, [pw_1024] | |
12275 packuswb m3, m5 | |
12276 movu [r0 + 725 * 16], m3 | |
12277 | |
12278 ; mode 13 [row 11] | |
12279 pmaddubsw m3, m6, [r5 + 20 * 16] | |
12280 pmulhrsw m3, [pw_1024] | |
12281 pmaddubsw m5, m2, [r5 + 20 * 16] | |
12282 pmulhrsw m5, [pw_1024] | |
12283 packuswb m3, m5 | |
12284 movu [r0 + 726 * 16], m3 | |
12285 pmaddubsw m3, m1, [r5 + 20 * 16] | |
12286 pmulhrsw m3, [pw_1024] | |
12287 pmaddubsw m5, m4, [r5 + 20 * 16] | |
12288 pmulhrsw m5, [pw_1024] | |
12289 packuswb m3, m5 | |
12290 movu [r0 + 727 * 16], m3 | |
12291 | |
12292 ; mode 13 [row 12] | |
12293 pmaddubsw m3, m6, [r5 + 11 * 16] | |
12294 pmulhrsw m3, [pw_1024] | |
12295 pmaddubsw m5, m2, [r5 + 11 * 16] | |
12296 pmulhrsw m5, [pw_1024] | |
12297 packuswb m3, m5 | |
12298 movu [r0 + 728 * 16], m3 | |
12299 pmaddubsw m3, m1, [r5 + 11 * 16] | |
12300 pmulhrsw m3, [pw_1024] | |
12301 pmaddubsw m5, m4, [r5 + 11 * 16] | |
12302 pmulhrsw m5, [pw_1024] | |
12303 packuswb m3, m5 | |
12304 movu [r0 + 729 * 16], m3 | |
12305 | |
12306 ; mode 13 [row 13] | |
12307 pmaddubsw m3, m6, [r5 + 2 * 16] | |
12308 pmulhrsw m3, [pw_1024] | |
12309 pmaddubsw m5, m2, [r5 + 2 * 16] | |
12310 pmulhrsw m5, [pw_1024] | |
12311 packuswb m3, m5 | |
12312 movu [r0 + 730 * 16], m3 | |
12313 pmaddubsw m3, m1, [r5 + 2 * 16] | |
12314 pmulhrsw m3, [pw_1024] | |
12315 pmaddubsw m5, m4, [r5 + 2 * 16] | |
12316 pmulhrsw m5, [pw_1024] | |
12317 packuswb m3, m5 | |
12318 movu [r0 + 731 * 16], m3 | |
12319 | |
12320 ; mode 14 [row 9] | |
12321 pslldq m7, 2 | |
12322 pinsrb m7, [r3 + 7], 1 | |
12323 pinsrb m7, [r3 + 10], 0 | |
12324 pmaddubsw m3, m7, [r5 + 30 * 16] | |
12325 pmulhrsw m3, [pw_1024] | |
12326 pslldq m2, 2 | |
12327 pinsrw m2, [r4 + 4], 0 | |
12328 pmaddubsw m5, m2, [r5 + 30 * 16] | |
12329 pmulhrsw m5, [pw_1024] | |
12330 packuswb m3, m5 | |
12331 movu [r0 + 786 * 16], m3 | |
12332 pslldq m1, 2 | |
12333 pinsrw m1, [r4 + 12], 0 | |
12334 pmaddubsw m3, m1, [r5 + 30 * 16] | |
12335 pmulhrsw m3, [pw_1024] | |
12336 pslldq m4, 2 | |
12337 pinsrb m4, [r4 + 21], 1 | |
12338 pinsrb m4, [r4 + 20], 0 | |
12339 pmaddubsw m5, m4, [r5 + 30 * 16] | |
12340 pmulhrsw m5, [pw_1024] | |
12341 packuswb m3, m5 | |
12342 movu [r0 + 787 * 16], m3 | |
12343 | |
12344 ; mode 14 [row 10] | |
12345 pmaddubsw m3, m7, [r5 + 17 * 16] | |
12346 pmulhrsw m3, [pw_1024] | |
12347 pmaddubsw m5, m2, [r5 + 17 * 16] | |
12348 pmulhrsw m5, [pw_1024] | |
12349 packuswb m3, m5 | |
12350 movu [r0 + 788 * 16], m3 | |
12351 pmaddubsw m3, m1, [r5 + 17 * 16] | |
12352 pmulhrsw m3, [pw_1024] | |
12353 pmaddubsw m5, m4, [r5 + 17 * 16] | |
12354 pmulhrsw m5, [pw_1024] | |
12355 packuswb m3, m5 | |
12356 movu [r0 + 789 * 16], m3 | |
12357 | |
12358 ; mode 14 [row 11] | |
12359 pmaddubsw m3, m7, [r5 + 4 * 16] | |
12360 pmulhrsw m3, [pw_1024] | |
12361 pmaddubsw m5, m2, [r5 + 4 * 16] | |
12362 pmulhrsw m5, [pw_1024] | |
12363 packuswb m3, m5 | |
12364 movu [r0 + 790 * 16], m3 | |
12365 pmaddubsw m3, m1, [r5 + 4 * 16] | |
12366 pmulhrsw m3, [pw_1024] | |
12367 pmaddubsw m5, m4, [r5 + 4 * 16] | |
12368 pmulhrsw m5, [pw_1024] | |
12369 packuswb m3, m5 | |
12370 movu [r0 + 791 * 16], m3 | |
12371 | |
12372 movu m6, [pw_1024] | |
12373 | |
12374 ; mode 15 [row 7, 8-31] | |
12375 pmaddubsw m5, m2, [r5 + 24 * 16] | |
12376 pmulhrsw m5, m6 | |
12377 packuswb m5, m5 | |
12378 movh [r0 + 846 * 16 + 8], m5 | |
12379 pmaddubsw m3, m1, [r5 + 24 * 16] | |
12380 pmulhrsw m3, m6 | |
12381 pmaddubsw m5, m4, [r5 + 24 * 16] | |
12382 pmulhrsw m5, m6 | |
12383 packuswb m3, m5 | |
12384 movu [r0 + 847 * 16], m3 | |
12385 | |
12386 ; mode 15 [row 8, 8-31] | |
12387 pmaddubsw m5, m2, [r5 + 7 * 16] | |
12388 pmulhrsw m5, m6 | |
12389 packuswb m5, m5 | |
12390 movh [r0 + 848 * 16 + 8], m5 | |
12391 pmaddubsw m3, m1, [r5 + 7 * 16] | |
12392 pmulhrsw m3, m6 | |
12393 pmaddubsw m5, m4, [r5 + 7 * 16] | |
12394 pmulhrsw m5, m6 | |
12395 packuswb m3, m5 | |
12396 movu [r0 + 849 * 16], m3 | |
12397 | |
12398 ; mode 12 [row 25] | |
12399 pslldq m0, 2 | |
12400 pinsrb m0, [r3 + 19], 1 | |
12401 pinsrb m0, [r3 + 26], 0 | |
12402 pmaddubsw m3, m0, [r5 + 30 * 16] | |
12403 pmulhrsw m3, [pw_1024] | |
12404 pmaddubsw m5, m2, [r5 + 30 * 16] | |
12405 pmulhrsw m5, [pw_1024] | |
12406 packuswb m3, m5 | |
12407 movu [r0 + 690 * 16], m3 | |
12408 pmaddubsw m3, m1, [r5 + 30 * 16] | |
12409 pmulhrsw m3, [pw_1024] | |
12410 pmaddubsw m5, m4, [r5 + 30 * 16] | |
12411 pmulhrsw m5, [pw_1024] | |
12412 packuswb m3, m5 | |
12413 movu [r0 + 691 * 16], m3 | |
12414 | |
12415 ; mode 12 [row 26] | |
12416 pmaddubsw m3, m0, [r5 + 25 * 16] | |
12417 pmulhrsw m3, [pw_1024] | |
12418 pmaddubsw m5, m2, [r5 + 25 * 16] | |
12419 pmulhrsw m5, [pw_1024] | |
12420 packuswb m3, m5 | |
12421 movu [r0 + 692 * 16], m3 | |
12422 pmaddubsw m3, m1, [r5 + 25 * 16] | |
12423 pmulhrsw m3, [pw_1024] | |
12424 pmaddubsw m5, m4, [r5 + 25 * 16] | |
12425 pmulhrsw m5, [pw_1024] | |
12426 packuswb m3, m5 | |
12427 movu [r0 + 693 * 16], m3 | |
12428 | |
12429 ; mode 12 [row 27] | |
12430 pmaddubsw m3, m0, [r5 + 20 * 16] | |
12431 pmulhrsw m3, [pw_1024] | |
12432 pmaddubsw m5, m2, [r5 + 20 * 16] | |
12433 pmulhrsw m5, [pw_1024] | |
12434 packuswb m3, m5 | |
12435 movu [r0 + 694 * 16], m3 | |
12436 pmaddubsw m3, m1, [r5 + 20 * 16] | |
12437 pmulhrsw m3, [pw_1024] | |
12438 pmaddubsw m5, m4, [r5 + 20 * 16] | |
12439 pmulhrsw m5, [pw_1024] | |
12440 packuswb m3, m5 | |
12441 movu [r0 + 695 * 16], m3 | |
12442 | |
12443 ; mode 12 [row 28] | |
12444 pmaddubsw m3, m0, [r5 + 15 * 16] | |
12445 pmulhrsw m3, [pw_1024] | |
12446 pmaddubsw m5, m2, [r5 + 15 * 16] | |
12447 pmulhrsw m5, [pw_1024] | |
12448 packuswb m3, m5 | |
12449 movu [r0 + 696 * 16], m3 | |
12450 pmaddubsw m3, m1, [r5 + 15 * 16] | |
12451 pmulhrsw m3, [pw_1024] | |
12452 pmaddubsw m5, m4, [r5 + 15 * 16] | |
12453 pmulhrsw m5, [pw_1024] | |
12454 packuswb m3, m5 | |
12455 movu [r0 + 697 * 16], m3 | |
12456 | |
12457 ; mode 12 [row 29] | |
12458 pmaddubsw m3, m0, [r5 + 10 * 16] | |
12459 pmulhrsw m3, [pw_1024] | |
12460 pmaddubsw m5, m2, [r5 + 10 * 16] | |
12461 pmulhrsw m5, [pw_1024] | |
12462 packuswb m3, m5 | |
12463 movu [r0 + 698 * 16], m3 | |
12464 pmaddubsw m3, m1, [r5 + 10 * 16] | |
12465 pmulhrsw m3, [pw_1024] | |
12466 pmaddubsw m5, m4, [r5 + 10 * 16] | |
12467 pmulhrsw m5, [pw_1024] | |
12468 packuswb m3, m5 | |
12469 movu [r0 + 699 * 16], m3 | |
12470 | |
12471 ; mode 12 [row 30] | |
12472 pmaddubsw m3, m0, [r5 + 5 * 16] | |
12473 pmulhrsw m3, [pw_1024] | |
12474 pmaddubsw m5, m2, [r5 + 5 * 16] | |
12475 pmulhrsw m5, [pw_1024] | |
12476 packuswb m3, m5 | |
12477 movu [r0 + 700 * 16], m3 | |
12478 pmaddubsw m3, m1, [r5 + 5 * 16] | |
12479 pmulhrsw m3, [pw_1024] | |
12480 pmaddubsw m5, m4, [r5 + 5 * 16] | |
12481 pmulhrsw m5, [pw_1024] | |
12482 packuswb m3, m5 | |
12483 movu [r0 + 701 * 16], m3 | |
12484 | |
12485 ; mode 13 [row 14] | |
12486 movu m6, m0 | |
12487 pinsrb m6, [r3 + 4], 6 | |
12488 pinsrb m6, [r3 + 4], 5 | |
12489 pinsrb m6, [r3 + 7], 4 | |
12490 pinsrb m6, [r3 + 7], 3 | |
12491 pinsrb m6, [r3 + 11], 2 | |
12492 pinsrb m6, [r3 + 11], 1 | |
12493 pinsrb m6, [r3 + 14], 0 | |
12494 pmaddubsw m3, m6, [r5 + 25 * 16] | |
12495 pmulhrsw m3, [pw_1024] | |
12496 pmaddubsw m5, m2, [r5 + 25 * 16] | |
12497 pmulhrsw m5, [pw_1024] | |
12498 packuswb m3, m5 | |
12499 movu [r0 + 732 * 16], m3 | |
12500 pmaddubsw m3, m1, [r5 + 25 * 16] | |
12501 pmulhrsw m3, [pw_1024] | |
12502 pmaddubsw m5, m4, [r5 + 25 * 16] | |
12503 pmulhrsw m5, [pw_1024] | |
12504 packuswb m3, m5 | |
12505 movu [r0 + 733 * 16], m3 | |
12506 | |
12507 ; mode 13 [row 15] | |
12508 pmaddubsw m3, m6, [r5 + 16 * 16] | |
12509 pmulhrsw m3, [pw_1024] | |
12510 pmaddubsw m5, m2, [r5 + 16 * 16] | |
12511 pmulhrsw m5, [pw_1024] | |
12512 packuswb m3, m5 | |
12513 movu [r0 + 734 * 16], m3 | |
12514 pmaddubsw m3, m1, [r5 + 16 * 16] | |
12515 pmulhrsw m3, [pw_1024] | |
12516 pmaddubsw m5, m4, [r5 + 16 * 16] | |
12517 pmulhrsw m5, [pw_1024] | |
12518 packuswb m3, m5 | |
12519 movu [r0 + 735 * 16], m3 | |
12520 | |
12521 ; mode 13 [row 16] | |
12522 pmaddubsw m3, m6, [r5 + 7 * 16] | |
12523 pmulhrsw m3, [pw_1024] | |
12524 pmaddubsw m5, m2, [r5 + 7 * 16] | |
12525 pmulhrsw m5, [pw_1024] | |
12526 packuswb m3, m5 | |
12527 movu [r0 + 736 * 16], m3 | |
12528 pmaddubsw m3, m1, [r5 + 7 * 16] | |
12529 pmulhrsw m3, [pw_1024] | |
12530 pmaddubsw m5, m4, [r5 + 7 * 16] | |
12531 pmulhrsw m5, [pw_1024] | |
12532 packuswb m3, m5 | |
12533 movu [r0 + 737 * 16], m3 | |
12534 | |
12535 ; mode 13 [row 17] | |
12536 pslldq m6, 2 | |
12537 pinsrb m6, [r3 + 14], 1 | |
12538 pinsrb m6, [r3 + 18], 0 | |
12539 pmaddubsw m3, m6, [r5 + 30 * 16] | |
12540 pmulhrsw m3, [pw_1024] | |
12541 pslldq m2, 2 | |
12542 pinsrw m2, [r4 + 3], 0 | |
12543 pmaddubsw m5, m2, [r5 + 30 * 16] | |
12544 pmulhrsw m5, [pw_1024] | |
12545 packuswb m3, m5 | |
12546 movu [r0 + 738 * 16], m3 | |
12547 pslldq m1, 2 | |
12548 pinsrw m1, [r4 + 11], 0 | |
12549 pmaddubsw m3, m1, [r5 + 30 * 16] | |
12550 pmulhrsw m3, [pw_1024] | |
12551 pslldq m4, 2 | |
12552 pinsrw m4, [r4 + 19], 0 | |
12553 pmaddubsw m5, m4, [r5 + 30 * 16] | |
12554 pmulhrsw m5, [pw_1024] | |
12555 packuswb m3, m5 | |
12556 movu [r0 + 739 * 16], m3 | |
12557 | |
12558 ; mode 13 [row 18] | |
12559 pmaddubsw m3, m6, [r5 + 21 * 16] | |
12560 pmulhrsw m3, [pw_1024] | |
12561 pmaddubsw m5, m2, [r5 + 21 * 16] | |
12562 pmulhrsw m5, [pw_1024] | |
12563 packuswb m3, m5 | |
12564 movu [r0 + 740 * 16], m3 | |
12565 pmaddubsw m3, m1, [r5 + 21 * 16] | |
12566 pmulhrsw m3, [pw_1024] | |
12567 pmaddubsw m5, m4, [r5 + 21 * 16] | |
12568 pmulhrsw m5, [pw_1024] | |
12569 packuswb m3, m5 | |
12570 movu [r0 + 741 * 16], m3 | |
12571 | |
12572 ; mode 13 [row 19] | |
12573 pmaddubsw m3, m6, [r5 + 12 * 16] | |
12574 pmulhrsw m3, [pw_1024] | |
12575 pmaddubsw m5, m2, [r5 + 12 * 16] | |
12576 pmulhrsw m5, [pw_1024] | |
12577 packuswb m3, m5 | |
12578 movu [r0 + 742 * 16], m3 | |
12579 pmaddubsw m3, m1, [r5 + 12 * 16] | |
12580 pmulhrsw m3, [pw_1024] | |
12581 pmaddubsw m5, m4, [r5 + 12 * 16] | |
12582 pmulhrsw m5, [pw_1024] | |
12583 packuswb m3, m5 | |
12584 movu [r0 + 743 * 16], m3 | |
12585 | |
12586 ; mode 13 [row 20] | |
12587 pmaddubsw m3, m6, [r5 + 3 * 16] | |
12588 pmulhrsw m3, [pw_1024] | |
12589 pmaddubsw m5, m2, [r5 + 3 * 16] | |
12590 pmulhrsw m5, [pw_1024] | |
12591 packuswb m3, m5 | |
12592 movu [r0 + 744 * 16], m3 | |
12593 pmaddubsw m3, m1, [r5 + 3 * 16] | |
12594 pmulhrsw m3, [pw_1024] | |
12595 pmaddubsw m5, m4, [r5 + 3 * 16] | |
12596 pmulhrsw m5, [pw_1024] | |
12597 packuswb m3, m5 | |
12598 movu [r0 + 745 * 16], m3 | |
12599 | |
12600 ; mode 14 [row 12] | |
12601 pslldq m7, 2 | |
12602 pinsrb m7, [r3 + 10], 1 | |
12603 pinsrb m7, [r3 + 12], 0 | |
12604 pmaddubsw m3, m7, [r5 + 23 * 16] | |
12605 pmulhrsw m3, [pw_1024] | |
12606 pmaddubsw m5, m2, [r5 + 23 * 16] | |
12607 pmulhrsw m5, [pw_1024] | |
12608 packuswb m3, m5 | |
12609 movu [r0 + 792 * 16], m3 | |
12610 pmaddubsw m3, m1, [r5 + 23 * 16] | |
12611 pmulhrsw m3, [pw_1024] | |
12612 pmaddubsw m5, m4, [r5 + 23 * 16] | |
12613 pmulhrsw m5, [pw_1024] | |
12614 packuswb m3, m5 | |
12615 movu [r0 + 793 * 16], m3 | |
12616 | |
12617 ; mode 14 [row 13] | |
12618 pmaddubsw m3, m7, [r5 + 10 * 16] | |
12619 pmulhrsw m3, [pw_1024] | |
12620 pmaddubsw m5, m2, [r5 + 10 * 16] | |
12621 pmulhrsw m5, [pw_1024] | |
12622 packuswb m3, m5 | |
12623 movu [r0 + 794 * 16], m3 | |
12624 pmaddubsw m3, m1, [r5 + 10 * 16] | |
12625 pmulhrsw m3, [pw_1024] | |
12626 pmaddubsw m5, m4, [r5 + 10 * 16] | |
12627 pmulhrsw m5, [pw_1024] | |
12628 packuswb m3, m5 | |
12629 movu [r0 + 795 * 16], m3 | |
12630 | |
12631 ; mode 15 [row 9] | |
12632 pmaddubsw m5, m2, [r5 + 22 * 16] | |
12633 pmulhrsw m5, [pw_1024] | |
12634 packuswb m5, m5 | |
12635 movu [r0 + 850 * 16 + 8], m5 | |
12636 pmaddubsw m3, m1, [r5 + 22 * 16] | |
12637 pmulhrsw m3, [pw_1024] | |
12638 pmaddubsw m5, m4, [r5 + 22 * 16] | |
12639 pmulhrsw m5, [pw_1024] | |
12640 packuswb m3, m5 | |
12641 movu [r0 + 851 * 16], m3 | |
12642 | |
12643 ; mode 15 [row 10] | |
12644 pmaddubsw m5, m2, [r5 + 5 * 16] | |
12645 pmulhrsw m5, [pw_1024] | |
12646 packuswb m5, m5 | |
12647 movu [r0 + 852 * 16 + 8], m5 | |
12648 pmaddubsw m3, m1, [r5 + 5 * 16] | |
12649 pmulhrsw m3, [pw_1024] | |
12650 pmaddubsw m5, m4, [r5 + 5 * 16] | |
12651 pmulhrsw m5, [pw_1024] | |
12652 packuswb m3, m5 | |
12653 movu [r0 + 853 * 16], m3 | |
12654 | |
12655 ; mode 13 [row 21] | |
12656 pslldq m6, 2 | |
12657 pinsrb m6, [r3 + 18], 1 | |
12658 pinsrb m6, [r3 + 21], 0 | |
12659 pmaddubsw m3, m6, [r5 + 26 * 16] | |
12660 pmulhrsw m3, [pw_1024] | |
12661 pslldq m2, 2 | |
12662 pinsrw m2, [r4 + 2], 0 | |
12663 pmaddubsw m5, m2, [r5 + 26 * 16] | |
12664 pmulhrsw m5, [pw_1024] | |
12665 packuswb m3, m5 | |
12666 movu [r0 + 746 * 16], m3 | |
12667 pslldq m1, 2 | |
12668 pinsrw m1, [r4 + 10], 0 | |
12669 pmaddubsw m3, m1, [r5 + 26 * 16] | |
12670 pmulhrsw m3, [pw_1024] | |
12671 pslldq m4, 2 | |
12672 pinsrw m4, [r4 + 18], 0 | |
12673 pmaddubsw m5, m4, [r5 + 26 * 16] | |
12674 pmulhrsw m5, [pw_1024] | |
12675 packuswb m3, m5 | |
12676 movu [r0 + 747 * 16], m3 | |
12677 | |
12678 ; mode 13 [row 22] | |
12679 pmaddubsw m3, m6, [r5 + 17 * 16] | |
12680 pmulhrsw m3, [pw_1024] | |
12681 pmaddubsw m5, m2, [r5 + 17 * 16] | |
12682 pmulhrsw m5, [pw_1024] | |
12683 packuswb m3, m5 | |
12684 movu [r0 + 748 * 16], m3 | |
12685 pmaddubsw m3, m1, [r5 + 17 * 16] | |
12686 pmulhrsw m3, [pw_1024] | |
12687 pmaddubsw m5, m4, [r5 + 17 * 16] | |
12688 pmulhrsw m5, [pw_1024] | |
12689 packuswb m3, m5 | |
12690 movu [r0 + 749 * 16], m3 | |
12691 | |
12692 ; mode 13 [row 23] | |
12693 pmaddubsw m3, m6, [r5 + 8 * 16] | |
12694 pmulhrsw m3, [pw_1024] | |
12695 pmaddubsw m5, m2, [r5 + 8 * 16] | |
12696 pmulhrsw m5, [pw_1024] | |
12697 packuswb m3, m5 | |
12698 movu [r0 + 750 * 16], m3 | |
12699 pmaddubsw m3, m1, [r5 + 8 * 16] | |
12700 pmulhrsw m3, [pw_1024] | |
12701 pmaddubsw m5, m4, [r5 + 8 * 16] | |
12702 pmulhrsw m5, [pw_1024] | |
12703 packuswb m3, m5 | |
12704 movu [r0 + 751 * 16], m3 | |
12705 | |
12706 ; mode 14 [row 14] | |
12707 pslldq m7, 2 | |
12708 pinsrb m7, [r3 + 12], 1 | |
12709 pinsrb m7, [r3 + 15], 0 | |
12710 pmaddubsw m3, m7, [r5 + 29 * 16] | |
12711 pmulhrsw m3, [pw_1024] | |
12712 pmaddubsw m5, m2, [r5 + 29 * 16] | |
12713 pmulhrsw m5, [pw_1024] | |
12714 packuswb m3, m5 | |
12715 movu [r0 + 796 * 16], m3 | |
12716 pmaddubsw m3, m1, [r5 + 29 * 16] | |
12717 pmulhrsw m3, [pw_1024] | |
12718 pmaddubsw m5, m4, [r5 + 29 * 16] | |
12719 pmulhrsw m5, [pw_1024] | |
12720 packuswb m3, m5 | |
12721 movu [r0 + 797 * 16], m3 | |
12722 | |
12723 ; mode 14 [row 15] | |
12724 pmaddubsw m3, m7, [r5 + 16 * 16] | |
12725 pmulhrsw m3, [pw_1024] | |
12726 pmaddubsw m5, m2, [r5 + 16 * 16] | |
12727 pmulhrsw m5, [pw_1024] | |
12728 packuswb m3, m5 | |
12729 movu [r0 + 798 * 16], m3 | |
12730 pmaddubsw m3, m1, [r5 + 16 * 16] | |
12731 pmulhrsw m3, [pw_1024] | |
12732 pmaddubsw m5, m4, [r5 + 16 * 16] | |
12733 pmulhrsw m5, [pw_1024] | |
12734 packuswb m3, m5 | |
12735 movu [r0 + 799 * 16], m3 | |
12736 | |
12737 ; mode 14 [row 16] | |
12738 pmaddubsw m3, m7, [r5 + 3 * 16] | |
12739 pmulhrsw m3, [pw_1024] | |
12740 pmaddubsw m5, m2, [r5 + 3 * 16] | |
12741 pmulhrsw m5, [pw_1024] | |
12742 packuswb m3, m5 | |
12743 movu [r0 + 800 * 16], m3 | |
12744 pmaddubsw m3, m1, [r5 + 3 * 16] | |
12745 pmulhrsw m3, [pw_1024] | |
12746 pmaddubsw m5, m4, [r5 + 3 * 16] | |
12747 pmulhrsw m5, [pw_1024] | |
12748 packuswb m3, m5 | |
12749 movu [r0 + 801 * 16], m3 | |
12750 | |
12751 ; mode 15 [row 11] | |
12752 pmaddubsw m5, m2, [r5 + 20 * 16] | |
12753 pmulhrsw m5, [pw_1024] | |
12754 packuswb m5, m5 | |
12755 movh [r0 + 854 * 16 + 8], m5 | |
12756 pmaddubsw m3, m1, [r5 + 20 * 16] | |
12757 pmulhrsw m3, [pw_1024] | |
12758 pmaddubsw m5, m4, [r5 + 20 * 16] | |
12759 pmulhrsw m5, [pw_1024] | |
12760 packuswb m3, m5 | |
12761 movu [r0 + 855 * 16], m3 | |
12762 | |
12763 ; mode 15 [row 12] | |
12764 pmaddubsw m5, m2, [r5 + 3 * 16] | |
12765 pmulhrsw m5, [pw_1024] | |
12766 packuswb m5, m5 | |
12767 movh [r0 + 856 * 16 + 8], m5 | |
12768 pmaddubsw m3, m1, [r5 + 3 * 16] | |
12769 pmulhrsw m3, [pw_1024] | |
12770 pmaddubsw m5, m4, [r5 + 3 * 16] | |
12771 pmulhrsw m5, [pw_1024] | |
12772 packuswb m3, m5 | |
12773 movu [r0 + 857 * 16], m3 | |
12774 | |
12775 ; mode 13 [row 24] | |
12776 pslldq m6, 2 | |
12777 pinsrb m6, [r3 + 21], 1 | |
12778 pinsrb m6, [r3 + 25], 0 | |
12779 pmaddubsw m3, m6, [r5 + 31 * 16] | |
12780 pmulhrsw m3, [pw_1024] | |
12781 pslldq m2, 2 | |
12782 pinsrw m2, [r4 + 1], 0 | |
12783 pmaddubsw m5, m2, [r5 + 31 * 16] | |
12784 pmulhrsw m5, [pw_1024] | |
12785 packuswb m3, m5 | |
12786 movu [r0 + 752 * 16], m3 | |
12787 pslldq m1, 2 | |
12788 pinsrw m1, [r4 + 9], 0 | |
12789 pmaddubsw m3, m1, [r5 + 31 * 16] | |
12790 pmulhrsw m3, [pw_1024] | |
12791 pslldq m4, 2 | |
12792 pinsrw m4, [r4 + 17], 0 | |
12793 pmaddubsw m5, m4, [r5 + 31 * 16] | |
12794 pmulhrsw m5, [pw_1024] | |
12795 packuswb m3, m5 | |
12796 movu [r0 + 753 * 16], m3 | |
12797 | |
12798 ; mode 13 [row 25] | |
12799 pmaddubsw m3, m6, [r5 + 22 * 16] | |
12800 pmulhrsw m3, [pw_1024] | |
12801 pmaddubsw m5, m2, [r5 + 22 * 16] | |
12802 pmulhrsw m5, [pw_1024] | |
12803 packuswb m3, m5 | |
12804 movu [r0 + 754 * 16], m3 | |
12805 pmaddubsw m3, m1, [r5 + 22 * 16] | |
12806 pmulhrsw m3, [pw_1024] | |
12807 pmaddubsw m5, m4, [r5 + 22 * 16] | |
12808 pmulhrsw m5, [pw_1024] | |
12809 packuswb m3, m5 | |
12810 movu [r0 + 755 * 16], m3 | |
12811 | |
12812 ; mode 13 [row 26] | |
12813 pmaddubsw m3, m6, [r5 + 13 * 16] | |
12814 pmulhrsw m3, [pw_1024] | |
12815 pmaddubsw m5, m2, [r5 + 13 * 16] | |
12816 pmulhrsw m5, [pw_1024] | |
12817 packuswb m3, m5 | |
12818 movu [r0 + 756 * 16], m3 | |
12819 pmaddubsw m3, m1, [r5 + 13 * 16] | |
12820 pmulhrsw m3, [pw_1024] | |
12821 pmaddubsw m5, m4, [r5 + 13 * 16] | |
12822 pmulhrsw m5, [pw_1024] | |
12823 packuswb m3, m5 | |
12824 movu [r0 + 757 * 16], m3 | |
12825 | |
12826 ; mode 13 [row 27] | |
12827 pmaddubsw m3, m6, [r5 + 4 * 16] | |
12828 pmulhrsw m3, [pw_1024] | |
12829 pmaddubsw m5, m2, [r5 + 4 * 16] | |
12830 pmulhrsw m5, [pw_1024] | |
12831 packuswb m3, m5 | |
12832 movu [r0 + 758 * 16], m3 | |
12833 pmaddubsw m3, m1, [r5 + 4 * 16] | |
12834 pmulhrsw m3, [pw_1024] | |
12835 pmaddubsw m5, m4, [r5 + 4 * 16] | |
12836 pmulhrsw m5, [pw_1024] | |
12837 packuswb m3, m5 | |
12838 movu [r0 + 759 * 16], m3 | |
12839 | |
12840 ; mode 14 [row 17] | |
12841 pslldq m7, 2 | |
12842 pinsrb m7, [r3 + 15], 1 | |
12843 pinsrb m7, [r3 + 17], 0 | |
12844 pmaddubsw m3, m7, [r5 + 22 * 16] | |
12845 pmulhrsw m3, [pw_1024] | |
12846 pmaddubsw m5, m2, [r5 + 22 * 16] | |
12847 pmulhrsw m5, [pw_1024] | |
12848 packuswb m3, m5 | |
12849 movu [r0 + 802 * 16], m3 | |
12850 pmaddubsw m3, m1, [r5 + 22 * 16] | |
12851 pmulhrsw m3, [pw_1024] | |
12852 pmaddubsw m5, m4, [r5 + 22 * 16] | |
12853 pmulhrsw m5, [pw_1024] | |
12854 packuswb m3, m5 | |
12855 movu [r0 + 803 * 16], m3 | |
12856 | |
12857 ; mode 14 [row 18] | |
12858 pmaddubsw m3, m7, [r5 + 9 * 16] | |
12859 pmulhrsw m3, [pw_1024] | |
12860 pmaddubsw m5, m2, [r5 + 9 * 16] | |
12861 pmulhrsw m5, [pw_1024] | |
12862 packuswb m3, m5 | |
12863 movu [r0 + 804 * 16], m3 | |
12864 pmaddubsw m3, m1, [r5 + 9 * 16] | |
12865 pmulhrsw m3, [pw_1024] | |
12866 pmaddubsw m5, m4, [r5 + 9 * 16] | |
12867 pmulhrsw m5, [pw_1024] | |
12868 packuswb m3, m5 | |
12869 movu [r0 + 805 * 16], m3 | |
12870 | |
12871 ; mode 15 [row 13] | |
12872 pmaddubsw m5, m2, [r5 + 18 * 16] | |
12873 pmulhrsw m5, [pw_1024] | |
12874 packuswb m5, m5 | |
12875 movh [r0 + 858 * 16 + 8], m5 | |
12876 pmaddubsw m3, m1, [r5 + 18 * 16] | |
12877 pmulhrsw m3, [pw_1024] | |
12878 pmaddubsw m5, m4, [r5 + 18 * 16] | |
12879 pmulhrsw m5, [pw_1024] | |
12880 packuswb m3, m5 | |
12881 movu [r0 + 859 * 16], m3 | |
12882 | |
12883 ; mode 15 [row 14] | |
12884 pmaddubsw m5, m2, [r5 + 1 * 16] | |
12885 pmulhrsw m5, [pw_1024] | |
12886 packuswb m5, m5 | |
12887 movh [r0 + 860 * 16 + 8], m5 | |
12888 pmaddubsw m3, m1, [r5 + 1 * 16] | |
12889 pmulhrsw m3, [pw_1024] | |
12890 pmaddubsw m5, m4, [r5 + 1 * 16] | |
12891 pmulhrsw m5, [pw_1024] | |
12892 packuswb m3, m5 | |
12893 movu [r0 + 861 * 16], m3 | |
12894 | |
12895 ; mode 13 [row 28] | |
12896 pslldq m6, 2 | |
12897 pinsrb m6, [r3 + 25], 1 | |
12898 pinsrb m6, [r3 + 28], 0 | |
12899 pmaddubsw m3, m6, [r5 + 27 * 16] | |
12900 pmulhrsw m3, [pw_1024] | |
12901 pslldq m2, 2 | |
12902 pinsrw m2, [r4 + 0], 0 | |
12903 pmaddubsw m5, m2, [r5 + 27 * 16] | |
12904 pmulhrsw m5, [pw_1024] | |
12905 packuswb m3, m5 | |
12906 movu [r0 + 760 * 16], m3 | |
12907 pslldq m1, 2 | |
12908 pinsrw m1, [r4 + 8], 0 | |
12909 pmaddubsw m3, m1, [r5 + 27 * 16] | |
12910 pmulhrsw m3, [pw_1024] | |
12911 pslldq m4, 2 | |
12912 pinsrw m4, [r4 + 16], 0 | |
12913 pmaddubsw m5, m4, [r5 + 27 * 16] | |
12914 pmulhrsw m5, [pw_1024] | |
12915 packuswb m3, m5 | |
12916 movu [r0 + 761 * 16], m3 | |
12917 | |
12918 ; mode 13 [row 29] | |
12919 pmaddubsw m3, m6, [r5 + 18 * 16] | |
12920 pmulhrsw m3, [pw_1024] | |
12921 pmaddubsw m5, m2, [r5 + 18 * 16] | |
12922 pmulhrsw m5, [pw_1024] | |
12923 packuswb m3, m5 | |
12924 movu [r0 + 762 * 16], m3 | |
12925 pmaddubsw m3, m1, [r5 + 18 * 16] | |
12926 pmulhrsw m3, [pw_1024] | |
12927 pmaddubsw m5, m4, [r5 + 18 * 16] | |
12928 pmulhrsw m5, [pw_1024] | |
12929 packuswb m3, m5 | |
12930 movu [r0 + 763 * 16], m3 | |
12931 | |
12932 ; mode 13 [row 30] | |
12933 pmaddubsw m3, m6, [r5 + 9 * 16] | |
12934 pmulhrsw m3, [pw_1024] | |
12935 pmaddubsw m5, m2, [r5 + 9 * 16] | |
12936 pmulhrsw m5, [pw_1024] | |
12937 packuswb m3, m5 | |
12938 movu [r0 + 764 * 16], m3 | |
12939 pmaddubsw m3, m1, [r5 + 9 * 16] | |
12940 pmulhrsw m3, [pw_1024] | |
12941 pmaddubsw m5, m4, [r5 + 9 * 16] | |
12942 pmulhrsw m5, [pw_1024] | |
12943 packuswb m3, m5 | |
12944 movu [r0 + 765 * 16], m3 | |
12945 | |
12946 ; mode 14 [row 19] | |
12947 pslldq m7, 2 | |
12948 pinsrb m7, [r3 + 17], 1 | |
12949 pinsrb m7, [r3 + 20], 0 | |
12950 pmaddubsw m3, m7, [r5 + 28 * 16] | |
12951 pmulhrsw m3, [pw_1024] | |
12952 pmaddubsw m5, m2, [r5 + 28 * 16] | |
12953 pmulhrsw m5, [pw_1024] | |
12954 packuswb m3, m5 | |
12955 movu [r0 + 806 * 16], m3 | |
12956 pmaddubsw m3, m1, [r5 + 28 * 16] | |
12957 pmulhrsw m3, [pw_1024] | |
12958 pmaddubsw m5, m4, [r5 + 28 * 16] | |
12959 pmulhrsw m5, [pw_1024] | |
12960 packuswb m3, m5 | |
12961 movu [r0 + 807 * 16], m3 | |
12962 | |
12963 ; mode 14 [row 20] | |
12964 pmaddubsw m3, m7, [r5 + 15 * 16] | |
12965 pmulhrsw m3, [pw_1024] | |
12966 pmaddubsw m5, m2, [r5 + 15 * 16] | |
12967 pmulhrsw m5, [pw_1024] | |
12968 packuswb m3, m5 | |
12969 movu [r0 + 808 * 16], m3 | |
12970 pmaddubsw m3, m1, [r5 + 15 * 16] | |
12971 pmulhrsw m3, [pw_1024] | |
12972 pmaddubsw m5, m4, [r5 + 15 * 16] | |
12973 pmulhrsw m5, [pw_1024] | |
12974 packuswb m3, m5 | |
12975 movu [r0 + 809 * 16], m3 | |
12976 | |
12977 ; mode 14 [row 21] | |
12978 pmaddubsw m3, m7, [r5 + 2 * 16] | |
12979 pmulhrsw m3, [pw_1024] | |
12980 pmaddubsw m5, m2, [r5 + 2 * 16] | |
12981 pmulhrsw m5, [pw_1024] | |
12982 packuswb m3, m5 | |
12983 movu [r0 + 810 * 16], m3 | |
12984 pmaddubsw m3, m1, [r5 + 2 * 16] | |
12985 pmulhrsw m3, [pw_1024] | |
12986 pmaddubsw m5, m4, [r5 + 2 * 16] | |
12987 pmulhrsw m5, [pw_1024] | |
12988 packuswb m3, m5 | |
12989 movu [r0 + 811 * 16], m3 | |
12990 | |
12991 ; mode 15 [row 15] | |
12992 pmaddubsw m5, m2, [r5 + 16 * 16] | |
12993 pmulhrsw m5, [pw_1024] | |
12994 packuswb m5, m5 | |
12995 movh [r0 + 862 * 16 + 8], m5 | |
12996 pmaddubsw m3, m1, [r5 + 16 * 16] | |
12997 pmulhrsw m3, [pw_1024] | |
12998 pmaddubsw m5, m4, [r5 + 16 * 16] | |
12999 pmulhrsw m5, [pw_1024] | |
13000 packuswb m3, m5 | |
13001 movu [r0 + 863 * 16], m3 | |
13002 | |
13003 ; mode 14 [row 22] | |
13004 pslldq m7, 2 | |
13005 pinsrb m7, [r3 + 20], 1 | |
13006 pinsrb m7, [r3 + 22], 0 | |
13007 pmaddubsw m3, m7, [r5 + 21 * 16] | |
13008 pmulhrsw m3, [pw_1024] | |
13009 pslldq m2, 2 | |
13010 pinsrb m2, [r4 + 0], 1 | |
13011 pinsrb m2, [r3 + 2], 0 | |
13012 pmaddubsw m5, m2, [r5 + 21 * 16] | |
13013 pmulhrsw m5, [pw_1024] | |
13014 packuswb m3, m5 | |
13015 movu [r0 + 812 * 16], m3 | |
13016 pslldq m1, 2 | |
13017 pinsrw m1, [r4 + 7], 0 | |
13018 pmaddubsw m3, m1, [r5 + 21 * 16] | |
13019 pmulhrsw m3, [pw_1024] | |
13020 pslldq m4, 2 | |
13021 pinsrw m4, [r4 + 15], 0 | |
13022 pmaddubsw m5, m4, [r5 + 21 * 16] | |
13023 pmulhrsw m5, [pw_1024] | |
13024 packuswb m3, m5 | |
13025 movu [r0 + 813 * 16], m3 | |
13026 | |
13027 ; mode 14 [row 23] | |
13028 pmaddubsw m3, m7, [r5 + 8 * 16] | |
13029 pmulhrsw m3, [pw_1024] | |
13030 pmaddubsw m5, m2, [r5 + 8 * 16] | |
13031 pmulhrsw m5, [pw_1024] | |
13032 packuswb m3, m5 | |
13033 movu [r0 + 814 * 16], m3 | |
13034 pmaddubsw m3, m1, [r5 + 8 * 16] | |
13035 pmulhrsw m3, [pw_1024] | |
13036 pmaddubsw m5, m4, [r5 + 8 * 16] | |
13037 pmulhrsw m5, [pw_1024] | |
13038 packuswb m3, m5 | |
13039 movu [r0 + 815 * 16], m3 | |
13040 | |
13041 ; mode 15 [row 16] | |
13042 pmaddubsw m5, m2, [r5 + 31 * 16] | |
13043 pmulhrsw m5, [pw_1024] | |
13044 packuswb m5, m5 | |
13045 movh [r0 + 864 * 16 + 8], m5 | |
13046 pmaddubsw m3, m1, [r5 + 31 * 16] | |
13047 pmulhrsw m3, [pw_1024] | |
13048 pmaddubsw m5, m4, [r5 + 31 * 16] | |
13049 pmulhrsw m5, [pw_1024] | |
13050 packuswb m3, m5 | |
13051 movu [r0 + 865 * 16], m3 | |
13052 | |
13053 ; mode 15 [row 17] | |
13054 pmaddubsw m5, m2, [r5 + 14 * 16] | |
13055 pmulhrsw m5, [pw_1024] | |
13056 packuswb m5, m5 | |
13057 movh [r0 + 866 * 16 + 8], m5 | |
13058 pmaddubsw m3, m1, [r5 + 14 * 16] | |
13059 pmulhrsw m3, [pw_1024] | |
13060 pmaddubsw m5, m4, [r5 + 14 * 16] | |
13061 pmulhrsw m5, [pw_1024] | |
13062 packuswb m3, m5 | |
13063 movu [r0 + 867 * 16], m3 | |
13064 | |
13065 ; mode 14 [row 24] | |
13066 pslldq m7, 2 | |
13067 pinsrb m7, [r3 + 22], 1 | |
13068 pinsrb m7, [r3 + 25], 0 | |
13069 pmaddubsw m3, m7, [r5 + 27 * 16] | |
13070 pmulhrsw m3, [pw_1024] | |
13071 pslldq m2, 2 | |
13072 pinsrb m2, [r3 + 2], 1 | |
13073 pinsrb m2, [r3 + 5], 0 | |
13074 pmaddubsw m5, m2, [r5 + 27 * 16] | |
13075 pmulhrsw m5, [pw_1024] | |
13076 packuswb m3, m5 | |
13077 movu [r0 + 816 * 16], m3 | |
13078 pslldq m1, 2 | |
13079 pinsrw m1, [r4 + 6], 0 | |
13080 pmaddubsw m3, m1, [r5 + 27 * 16] | |
13081 pmulhrsw m3, [pw_1024] | |
13082 pslldq m4, 2 | |
13083 pinsrw m4, [r4 + 14], 0 | |
13084 pmaddubsw m5, m4, [r5 + 27 * 16] | |
13085 pmulhrsw m5, [pw_1024] | |
13086 packuswb m3, m5 | |
13087 movu [r0 + 817 * 16], m3 | |
13088 | |
13089 ; mode 14 [row 25] | |
13090 pmaddubsw m3, m7, [r5 + 14 * 16] | |
13091 pmulhrsw m3, [pw_1024] | |
13092 pmaddubsw m5, m2, [r5 + 14 * 16] | |
13093 pmulhrsw m5, [pw_1024] | |
13094 packuswb m3, m5 | |
13095 movu [r0 + 818 * 16], m3 | |
13096 pmaddubsw m3, m1, [r5 + 14 * 16] | |
13097 pmulhrsw m3, [pw_1024] | |
13098 pmaddubsw m5, m4, [r5 + 14 * 16] | |
13099 pmulhrsw m5, [pw_1024] | |
13100 packuswb m3, m5 | |
13101 movu [r0 + 819 * 16], m3 | |
13102 | |
13103 ; mode 14 [row 26] | |
13104 pmaddubsw m3, m7, [r5 + 1 * 16] | |
13105 pmulhrsw m3, [pw_1024] | |
13106 pmaddubsw m5, m2, [r5 + 1 * 16] | |
13107 pmulhrsw m5, [pw_1024] | |
13108 packuswb m3, m5 | |
13109 movu [r0 + 820 * 16], m3 | |
13110 pmaddubsw m3, m1, [r5 + 1 * 16] | |
13111 pmulhrsw m3, [pw_1024] | |
13112 pmaddubsw m5, m4, [r5 + 1 * 16] | |
13113 pmulhrsw m5, [pw_1024] | |
13114 packuswb m3, m5 | |
13115 movu [r0 + 821 * 16], m3 | |
13116 | |
13117 ; mode 15 [row 18] | |
13118 pinsrb m2, [r3 + 4], 0 | |
13119 pmaddubsw m5, m2, [r5 + 29 * 16] | |
13120 pmulhrsw m5, [pw_1024] | |
13121 packuswb m5, m5 | |
13122 movh [r0 + 868 * 16 + 8], m5 | |
13123 pmaddubsw m3, m1, [r5 + 29 * 16] | |
13124 pmulhrsw m3, [pw_1024] | |
13125 pmaddubsw m5, m4, [r5 + 29 * 16] | |
13126 pmulhrsw m5, [pw_1024] | |
13127 packuswb m3, m5 | |
13128 movu [r0 + 869 * 16], m3 | |
13129 | |
13130 ; mode 15 [row 19] | |
13131 pmaddubsw m5, m2, [r5 + 12 * 16] | |
13132 pmulhrsw m5, [pw_1024] | |
13133 packuswb m5, m5 | |
13134 movh [r0 + 870 * 16 + 8], m5 | |
13135 pmaddubsw m3, m1, [r5 + 12 * 16] | |
13136 pmulhrsw m3, [pw_1024] | |
13137 pmaddubsw m5, m4, [r5 + 12 * 16] | |
13138 pmulhrsw m5, [pw_1024] | |
13139 packuswb m3, m5 | |
13140 movu [r0 + 871 * 16], m3 | |
13141 | |
13142 ; mode 15 [row 20 - 8 to 15] | |
13143 pslldq m3, m2, 2 | |
13144 pinsrb m3, [r3 + 4], 1 | |
13145 pinsrb m3, [r3 + 6], 0 | |
13146 pmaddubsw m5, m3, [r5 + 27 * 16] | |
13147 pmulhrsw m5, [pw_1024] | |
13148 packuswb m5, m5 | |
13149 movh [r0 + 872 * 16 + 8], m5 | |
13150 | |
13151 ; mode 15 [row 21 - 8 to 15] | |
13152 pmaddubsw m5, m3, [r5 + 10 * 16] | |
13153 pmulhrsw m5, [pw_1024] | |
13154 packuswb m5, m5 | |
13155 movh [r0 + 874 * 16 + 8], m5 | |
13156 | |
13157 ; mode 15 [row 22 - 8 to 15] | |
13158 pslldq m3, 2 | |
13159 pinsrb m3, [r3 + 6], 1 | |
13160 pinsrb m3, [r3 + 8], 0 | |
13161 pmaddubsw m5, m3, [r5 + 25 * 16] | |
13162 pmulhrsw m5, [pw_1024] | |
13163 packuswb m5, m5 | |
13164 movh [r0 + 876 * 16 + 8], m5 | |
13165 | |
13166 ; mode 15 [row 23 - 8 to 15] | |
13167 pmaddubsw m5, m3, [r5 + 8 * 16] | |
13168 pmulhrsw m5, [pw_1024] | |
13169 packuswb m5, m5 | |
13170 movh [r0 + 878 * 16 + 8], m5 | |
13171 | |
13172 ; mode 15 [row 24 - 8 to 15] | |
13173 pslldq m3, 2 | |
13174 pinsrb m3, [r3 + 8], 1 | |
13175 pinsrb m3, [r3 + 9], 0 | |
13176 pmaddubsw m5, m3, [r5 + 23 * 16] | |
13177 pmulhrsw m5, [pw_1024] | |
13178 packuswb m5, m5 | |
13179 movh [r0 + 880 * 16 + 8], m5 | |
13180 | |
13181 ; mode 15 [row 25 - 8 to 15] | |
13182 pmaddubsw m5, m3, [r5 + 6 * 16] | |
13183 pmulhrsw m5, [pw_1024] | |
13184 packuswb m5, m5 | |
13185 movh [r0 + 882 * 16 + 8], m5 | |
13186 | |
13187 ; mode 15 [row 26 - 8 to 15] | |
13188 pslldq m3, 2 | |
13189 pinsrb m3, [r3 + 9], 1 | |
13190 pinsrb m3, [r3 + 11], 0 | |
13191 pmaddubsw m5, m3, [r5 + 21 * 16] | |
13192 pmulhrsw m5, [pw_1024] | |
13193 packuswb m5, m5 | |
13194 movh [r0 + 884 * 16 + 8], m5 | |
13195 | |
13196 ; mode 15 [row 27 - 8 to 15] | |
13197 pmaddubsw m5, m3, [r5 + 4 * 16] | |
13198 pmulhrsw m5, [pw_1024] | |
13199 packuswb m5, m5 | |
13200 movh [r0 + 886 * 16 + 8], m5 | |
13201 | |
13202 ; mode 15 [row 28 - 8 to 15] | |
13203 pslldq m3, 2 | |
13204 pinsrb m3, [r3 + 11], 1 | |
13205 pinsrb m3, [r3 + 13], 0 | |
13206 pmaddubsw m5, m3, [r5 + 19 * 16] | |
13207 pmulhrsw m5, [pw_1024] | |
13208 packuswb m5, m5 | |
13209 movh [r0 + 888 * 16 + 8], m5 | |
13210 | |
13211 ; mode 15 [row 29 - 8 to 15] | |
13212 pmaddubsw m5, m3, [r5 + 2 * 16] | |
13213 pmulhrsw m5, [pw_1024] | |
13214 packuswb m5, m5 | |
13215 movh [r0 + 890 * 16 + 8], m5 | |
13216 | |
13217 ; mode 15 [row 30 - 8 to 15] | |
13218 pslldq m3, 2 | |
13219 pinsrb m3, [r3 + 13], 1 | |
13220 pinsrb m3, [r3 + 15], 0 | |
13221 pmaddubsw m5, m3, [r5 + 17 * 16] | |
13222 pmulhrsw m5, [pw_1024] | |
13223 packuswb m5, m5 | |
13224 movh [r0 + 892 * 16 + 8], m5 | |
13225 | |
13226 ; mode 15 [row 31 - 8 to 15] | |
13227 pshufb m5, m3, [tab_S2] | |
13228 movh [r0 + 894 * 16 + 8], m5 | |
13229 | |
13230 ; mode 14 [row 27] | |
13231 pinsrb m2, [r3 + 5], 0 | |
13232 pslldq m7, 2 | |
13233 pinsrb m7, [r3 + 25], 1 | |
13234 pinsrb m7, [r3 + 27], 0 | |
13235 pmaddubsw m3, m7, [r5 + 20 * 16] | |
13236 pmulhrsw m3, [pw_1024] | |
13237 pslldq m2, 2 | |
13238 pinsrb m2, [r3 + 5], 1 | |
13239 pinsrb m2, [r3 + 7], 0 | |
13240 pmaddubsw m5, m2, [r5 + 20 * 16] | |
13241 pmulhrsw m5, [pw_1024] | |
13242 packuswb m3, m5 | |
13243 movu [r0 + 822 * 16], m3 | |
13244 pslldq m1, 2 | |
13245 pinsrw m1, [r4 + 5], 0 | |
13246 pmaddubsw m3, m1, [r5 + 20 * 16] | |
13247 pmulhrsw m3, [pw_1024] | |
13248 pslldq m4, 2 | |
13249 pinsrw m4, [r4 + 13], 0 | |
13250 pmaddubsw m5, m4, [r5 + 20 * 16] | |
13251 pmulhrsw m5, [pw_1024] | |
13252 packuswb m3, m5 | |
13253 movu [r0 + 823 * 16], m3 | |
13254 | |
13255 ; mode 15 [row 20 - 16 to 31] | |
13256 pmaddubsw m3, m1, [r5 + 27 * 16] | |
13257 pmulhrsw m3, [pw_1024] | |
13258 pmaddubsw m5, m4, [r5 + 27 * 16] | |
13259 pmulhrsw m5, [pw_1024] | |
13260 packuswb m3, m5 | |
13261 movu [r0 + 873 * 16], m3 | |
13262 | |
13263 ; mode 15 [row 21 - 16 to 31] | |
13264 pmaddubsw m3, m1, [r5 + 10 * 16] | |
13265 pmulhrsw m3, [pw_1024] | |
13266 pmaddubsw m5, m4, [r5 + 10 * 16] | |
13267 pmulhrsw m5, [pw_1024] | |
13268 packuswb m3, m5 | |
13269 movu [r0 + 875 * 16], m3 | |
13270 | |
13271 ; mode 14 [row 28] | |
13272 pmaddubsw m3, m7, [r5 + 7 * 16] | |
13273 pmulhrsw m3, [pw_1024] | |
13274 pmaddubsw m5, m2, [r5 + 7 * 16] | |
13275 pmulhrsw m5, [pw_1024] | |
13276 packuswb m3, m5 | |
13277 movu [r0 + 824 * 16], m3 | |
13278 pmaddubsw m3, m1, [r5 + 7 * 16] | |
13279 pmulhrsw m3, [pw_1024] | |
13280 pmaddubsw m5, m4, [r5 + 7 * 16] | |
13281 pmulhrsw m5, [pw_1024] | |
13282 packuswb m3, m5 | |
13283 movu [r0 + 825 * 16], m3 | |
13284 | |
13285 ; mode 14 [row 29] | |
13286 pslldq m7, 2 | |
13287 pinsrb m7, [r3 + 27], 1 | |
13288 pinsrb m7, [r3 + 30], 0 | |
13289 pmaddubsw m3, m7, [r5 + 26 * 16] | |
13290 pmulhrsw m3, [pw_1024] | |
13291 pslldq m2, 2 | |
13292 pinsrb m2, [r3 + 7], 1 | |
13293 pinsrb m2, [r3 + 10], 0 | |
13294 pmaddubsw m5, m2, [r5 + 26 * 16] | |
13295 pmulhrsw m5, [pw_1024] | |
13296 packuswb m3, m5 | |
13297 movu [r0 + 826 * 16], m3 | |
13298 pslldq m1, 2 | |
13299 pinsrw m1, [r4 + 4], 0 | |
13300 pmaddubsw m3, m1, [r5 + 26 * 16] | |
13301 pmulhrsw m3, [pw_1024] | |
13302 pslldq m4, 2 | |
13303 pinsrw m4, [r4 + 12], 0 | |
13304 pmaddubsw m5, m4, [r5 + 26 * 16] | |
13305 pmulhrsw m5, [pw_1024] | |
13306 packuswb m3, m5 | |
13307 movu [r0 + 827 * 16], m3 | |
13308 | |
13309 ; mode 14 [row 30] | |
13310 pmaddubsw m3, m7, [r5 + 13 * 16] | |
13311 pmulhrsw m3, [pw_1024] | |
13312 pmaddubsw m5, m2, [r5 + 13 * 16] | |
13313 pmulhrsw m5, [pw_1024] | |
13314 packuswb m3, m5 | |
13315 movu [r0 + 828 * 16], m3 | |
13316 pmaddubsw m3, m1, [r5 + 13 * 16] | |
13317 pmulhrsw m3, [pw_1024] | |
13318 pmaddubsw m5, m4, [r5 + 13 * 16] | |
13319 pmulhrsw m5, [pw_1024] | |
13320 packuswb m3, m5 | |
13321 movu [r0 + 829 * 16], m3 | |
13322 | |
13323 ; mode 15 [row 22 - 16 to 31] | |
13324 pmaddubsw m3, m1, [r5 + 25 * 16] | |
13325 pmulhrsw m3, [pw_1024] | |
13326 pmaddubsw m5, m4, [r5 + 25 * 16] | |
13327 pmulhrsw m5, [pw_1024] | |
13328 packuswb m3, m5 | |
13329 movu [r0 + 877 * 16], m3 | |
13330 | |
13331 ; mode 15 [row 23 - 16 to 31] | |
13332 pmaddubsw m3, m1, [r5 + 8 * 16] | |
13333 pmulhrsw m3, [pw_1024] | |
13334 pmaddubsw m5, m4, [r5 + 8 * 16] | |
13335 pmulhrsw m5, [pw_1024] | |
13336 packuswb m3, m5 | |
13337 movu [r0 + 879 * 16], m3 | |
13338 | |
13339 ; mode 14 [row 31] | |
13340 pshufb m3, m7, [tab_S2] | |
13341 movh [r0 + 830 * 16], m3 | |
13342 pshufb m3, m2, [tab_S2] | |
13343 movh [r0 + 830 * 16 + 8], m3 | |
13344 pshufb m3, m1, [tab_S2] | |
13345 movh [r0 + 831 * 16], m3 | |
13346 pshufb m3, m4, [tab_S2] | |
13347 movh [r0 + 831 * 16 + 8], m3 | |
13348 | |
13349 ; mode 13 [row 31] | |
13350 pshufb m0, m6, [tab_S2] | |
13351 movh [r0 + 766 * 16], m0 | |
13352 movh m0, [r4] | |
13353 movh [r0 + 766 * 16 + 8], m0 | |
13354 movu m0, [r4 + 8] | |
13355 movu [r0 + 767 * 16], m0 | |
13356 | |
13357 ; mode 15 [row 24 - 16 to 31] | |
13358 pslldq m1, 2 | |
13359 pinsrw m1, [r4 + 3], 0 | |
13360 pmaddubsw m3, m1, [r5 + 23 * 16] | |
13361 pmulhrsw m3, [pw_1024] | |
13362 pslldq m4, 2 | |
13363 pinsrw m4, [r4 + 11], 0 | |
13364 pmaddubsw m5, m4, [r5 + 23 * 16] | |
13365 pmulhrsw m5, [pw_1024] | |
13366 packuswb m3, m5 | |
13367 movu [r0 + 881 * 16], m3 | |
13368 | |
13369 ; mode 15 [row 25 - 16 to 31] | |
13370 pmaddubsw m3, m1, [r5 + 6 * 16] | |
13371 pmulhrsw m3, [pw_1024] | |
13372 pmaddubsw m5, m4, [r5 + 6 * 16] | |
13373 pmulhrsw m5, [pw_1024] | |
13374 packuswb m3, m5 | |
13375 movu [r0 + 883 * 16], m3 | |
13376 | |
13377 ; mode 15 [row 26 - 16 to 31] | |
13378 pslldq m1, 2 | |
13379 pinsrw m1, [r4 + 2], 0 | |
13380 pmaddubsw m3, m1, [r5 + 21 * 16] | |
13381 pmulhrsw m3, [pw_1024] | |
13382 pslldq m4, 2 | |
13383 pinsrw m4, [r4 + 10], 0 | |
13384 pmaddubsw m5, m4, [r5 + 21 * 16] | |
13385 pmulhrsw m5, [pw_1024] | |
13386 packuswb m3, m5 | |
13387 movu [r0 + 885 * 16], m3 | |
13388 | |
13389 ; mode 15 [row 27 - 16 to 31] | |
13390 pmaddubsw m3, m1, [r5 + 4 * 16] | |
13391 pmulhrsw m3, [pw_1024] | |
13392 pmaddubsw m5, m4, [r5 + 4 * 16] | |
13393 pmulhrsw m5, [pw_1024] | |
13394 packuswb m3, m5 | |
13395 movu [r0 + 887 * 16], m3 | |
13396 | |
13397 ; mode 15 [row 28 - 16 to 31] | |
13398 pslldq m1, 2 | |
13399 pinsrw m1, [r4 + 1], 0 | |
13400 pmaddubsw m3, m1, [r5 + 19 * 16] | |
13401 pmulhrsw m3, [pw_1024] | |
13402 pslldq m4, 2 | |
13403 pinsrw m4, [r4 + 9], 0 | |
13404 pmaddubsw m5, m4, [r5 + 19 * 16] | |
13405 pmulhrsw m5, [pw_1024] | |
13406 packuswb m3, m5 | |
13407 movu [r0 + 889 * 16], m3 | |
13408 | |
13409 ; mode 15 [row 29 - 16 to 31] | |
13410 pmaddubsw m3, m1, [r5 + 2 * 16] | |
13411 pmulhrsw m3, [pw_1024] | |
13412 pmaddubsw m5, m4, [r5 + 2 * 16] | |
13413 pmulhrsw m5, [pw_1024] | |
13414 packuswb m3, m5 | |
13415 movu [r0 + 891 * 16], m3 | |
13416 | |
13417 ; mode 15 [row 30 - 16 to 31] | |
13418 pslldq m1, 2 | |
13419 pinsrw m1, [r4 + 0], 0 | |
13420 pmaddubsw m3, m1, [r5 + 17 * 16] | |
13421 pmulhrsw m3, [pw_1024] | |
13422 pslldq m4, 2 | |
13423 pinsrw m4, [r4 + 8], 0 | |
13424 pmaddubsw m5, m4, [r5 + 17 * 16] | |
13425 pmulhrsw m5, [pw_1024] | |
13426 packuswb m3, m5 | |
13427 movu [r0 + 893 * 16], m3 | |
13428 | |
13429 ; mode 15 [row 31 - 16 to 31] | |
13430 pshufb m5, m1, [tab_S2] | |
13431 movh [r0 + 895 * 16], m5 | |
13432 pshufb m5, m4, [tab_S2] | |
13433 movh [r0 + 895 * 16 + 8], m5 | |
13434 | |
13435 ; mode 16 [row 0] | |
13436 movu m6, [r5 + 11 * 16] | |
13437 movu m7, [pw_1024] | |
13438 movh m0, [r4 ] | |
13439 movh m1, [r4 + 1 ] | |
13440 punpcklbw m0, m1 | |
13441 pmaddubsw m1, m0, m6 | |
13442 pmulhrsw m1, m7 | |
13443 movh m2, [r4 + 8] | |
13444 movh m3, [r4 + 9] | |
13445 punpcklbw m2, m3 | |
13446 pmaddubsw m3, m2, m6 | |
13447 pmulhrsw m3, m7 | |
13448 packuswb m1, m3 | |
13449 movu [r0 + 896 * 16], m1 | |
13450 | |
13451 movh m1, [r4 + 16] | |
13452 movh m3, [r4 + 17] | |
13453 punpcklbw m1, m3 | |
13454 pmaddubsw m3, m1, m6 | |
13455 pmulhrsw m3, m7 | |
13456 movh m4, [r4 + 24] | |
13457 movh m5, [r4 + 25] | |
13458 punpcklbw m4, m5 | |
13459 pmaddubsw m5, m4, m6 | |
13460 pmulhrsw m5, m7 | |
13461 packuswb m3, m5 | |
13462 movu [r0 + 897 * 16], m3 | |
13463 | |
13464 ; mode 16 [row 1] | |
13465 movu m6, [r5 + 22 * 16] | |
13466 pslldq m0, 2 | |
13467 pinsrb m0, [r4], 1 | |
13468 pinsrb m0, [r3 + 2], 0 | |
13469 pmaddubsw m3, m0, m6 | |
13470 pmulhrsw m3, m7 | |
13471 pslldq m2, 2 | |
13472 pinsrw m2, [r4 + 7], 0 | |
13473 pmaddubsw m5, m2, m6 | |
13474 pmulhrsw m5, m7 | |
13475 packuswb m3, m5 | |
13476 movu [r0 + 898 * 16], m3 | |
13477 | |
13478 pslldq m1, 2 | |
13479 pinsrw m1, [r4 + 15], 0 | |
13480 pmaddubsw m3, m1, m6 | |
13481 pmulhrsw m3, m7 | |
13482 pslldq m4, 2 | |
13483 pinsrw m4, [r4 + 23], 0 | |
13484 pmaddubsw m5, m4, m6 | |
13485 pmulhrsw m5, m7 | |
13486 packuswb m3, m5 | |
13487 movu [r0 + 899 * 16], m3 | |
13488 | |
13489 ; mode 16 [row 2] | |
13490 movu m6, [r5 + 1 * 16] | |
13491 pmaddubsw m3, m0, m6 | |
13492 pmulhrsw m3, m7 | |
13493 pmaddubsw m5, m2, m6 | |
13494 pmulhrsw m5, m7 | |
13495 packuswb m3, m5 | |
13496 movu [r0 + 900 * 16], m3 | |
13497 | |
13498 pmaddubsw m3, m1, m6 | |
13499 pmulhrsw m3, m7 | |
13500 pmaddubsw m5, m4, m6 | |
13501 pmulhrsw m5, m7 | |
13502 packuswb m3, m5 | |
13503 movu [r0 + 901 * 16], m3 | |
13504 | |
13505 ; mode 16 [row 3] | |
13506 movu m6, [r5 + 12 * 16] | |
13507 pslldq m0, 2 | |
13508 pinsrb m0, [r3 + 2], 1 | |
13509 pinsrb m0, [r3 + 3], 0 | |
13510 pmaddubsw m3, m0, m6 | |
13511 pmulhrsw m3, m7 | |
13512 pslldq m2, 2 | |
13513 pinsrw m2, [r4 + 6], 0 | |
13514 pmaddubsw m5, m2, m6 | |
13515 pmulhrsw m5, m7 | |
13516 packuswb m3, m5 | |
13517 movu [r0 + 902 * 16], m3 | |
13518 | |
13519 pslldq m1, 2 | |
13520 pinsrw m1, [r4 + 14], 0 | |
13521 pmaddubsw m3, m1, m6 | |
13522 pmulhrsw m3, m7 | |
13523 pslldq m4, 2 | |
13524 pinsrw m4, [r4 + 22], 0 | |
13525 pmaddubsw m5, m4, m6 | |
13526 pmulhrsw m5, m7 | |
13527 packuswb m3, m5 | |
13528 movu [r0 + 903 * 16], m3 | |
13529 | |
13530 ; mode 16 [row 4] | |
13531 movu m6, [r5 + 23 * 16] | |
13532 pslldq m0, 2 | |
13533 pinsrb m0, [r3 + 3], 1 | |
13534 pinsrb m0, [r3 + 5], 0 | |
13535 pmaddubsw m3, m0, m6 | |
13536 pmulhrsw m3, m7 | |
13537 pslldq m2, 2 | |
13538 pinsrw m2, [r4 + 5], 0 | |
13539 pmaddubsw m5, m2, m6 | |
13540 pmulhrsw m5, m7 | |
13541 packuswb m3, m5 | |
13542 movu [r0 + 904 * 16], m3 | |
13543 | |
13544 pslldq m1, 2 | |
13545 pinsrw m1, [r4 + 13], 0 | |
13546 pmaddubsw m3, m1, m6 | |
13547 pmulhrsw m3, m7 | |
13548 pslldq m4, 2 | |
13549 pinsrw m4, [r4 + 21], 0 | |
13550 pmaddubsw m5, m4, m6 | |
13551 pmulhrsw m5, m7 | |
13552 packuswb m3, m5 | |
13553 movu [r0 + 905 * 16], m3 | |
13554 | |
13555 ; mode16 [row 5] | |
13556 movu m6, [r5 + 2 * 16] | |
13557 pmaddubsw m3, m0, m6 | |
13558 pmulhrsw m3, m7 | |
13559 pmaddubsw m5, m2, m6 | |
13560 pmulhrsw m5, m7 | |
13561 packuswb m3, m5 | |
13562 movu [r0 + 906 * 16], m3 | |
13563 | |
13564 pmaddubsw m3, m1, m6 | |
13565 pmulhrsw m3, m7 | |
13566 pmaddubsw m5, m4, m6 | |
13567 pmulhrsw m5, m7 | |
13568 packuswb m3, m5 | |
13569 movu [r0 + 907 * 16], m3 | |
13570 | |
13571 ; mode16 [row 6] | |
13572 movu m6, [r5 + 13 * 16] | |
13573 pslldq m0, 2 | |
13574 pinsrb m0, [r3 + 5], 1 | |
13575 pinsrb m0, [r3 + 6], 0 | |
13576 pmaddubsw m3, m0, m6 | |
13577 pmulhrsw m3, m7 | |
13578 pslldq m2, 2 | |
13579 pinsrb m2, [r4 + 5], 1 | |
13580 pinsrb m2, [r4 + 4], 0 | |
13581 pmaddubsw m5, m2, m6 | |
13582 pmulhrsw m5, m7 | |
13583 packuswb m3, m5 | |
13584 movu [r0 + 908 * 16], m3 | |
13585 pslldq m1, 2 | |
13586 pinsrw m1, [r4 + 12], 0 | |
13587 pmaddubsw m3, m1, m6 | |
13588 pmulhrsw m3, m7 | |
13589 pslldq m4, 2 | |
13590 pinsrw m4, [r4 + 20], 0 | |
13591 pmaddubsw m5, m4, m6 | |
13592 pmulhrsw m5, m7 | |
13593 packuswb m3, m5 | |
13594 movu [r0 + 909 * 16], m3 | |
13595 | |
13596 ; mode16 [row 7] | |
13597 movu m6, [r5 + 24 * 16] | |
13598 pslldq m0, 2 | |
13599 pinsrb m0, [r3 + 6], 1 | |
13600 pinsrb m0, [r3 + 8], 0 | |
13601 pmaddubsw m3, m0, m6 | |
13602 pmulhrsw m3, m7 | |
13603 pslldq m2, 2 | |
13604 pinsrw m2, [r4 + 3], 0 | |
13605 pmaddubsw m5, m2, m6 | |
13606 pmulhrsw m5, m7 | |
13607 packuswb m3, m5 | |
13608 movu [r0 + 910 * 16], m3 | |
13609 | |
13610 pslldq m1, 2 | |
13611 pinsrw m1, [r4 + 11], 0 | |
13612 pmaddubsw m3, m1, m6 | |
13613 pmulhrsw m3, m7 | |
13614 pslldq m4, 2 | |
13615 pinsrw m4, [r4 + 19], 0 | |
13616 pmaddubsw m5, m4, m6 | |
13617 pmulhrsw m5, m7 | |
13618 packuswb m3, m5 | |
13619 movu [r0 + 911 * 16], m3 | |
13620 | |
13621 ; mode16 [row 8] | |
13622 movu m6, [r5 + 3 * 16] | |
13623 pmaddubsw m3, m0, m6 | |
13624 pmulhrsw m3, m7 | |
13625 pmaddubsw m5, m2, m6 | |
13626 pmulhrsw m5, m7 | |
13627 packuswb m3, m5 | |
13628 movu [r0 + 912 * 16], m3 | |
13629 | |
13630 pmaddubsw m3, m1, m6 | |
13631 pmulhrsw m3, m7 | |
13632 pmaddubsw m5, m4, m6 | |
13633 pmulhrsw m5, m7 | |
13634 packuswb m3, m5 | |
13635 movu [r0 + 913 * 16], m3 | |
13636 | |
13637 ; mode16 [row 9] | |
13638 movu m6, [r5 + 14 * 16] | |
13639 pslldq m0, 2 | |
13640 pinsrb m0, [r3 + 8], 1 | |
13641 pinsrb m0, [r3 + 9], 0 | |
13642 pmaddubsw m3, m0, m6 | |
13643 pmulhrsw m3, m7 | |
13644 pslldq m2, 2 | |
13645 pinsrw m2, [r4 + 2], 0 | |
13646 pmaddubsw m5, m2, m6 | |
13647 pmulhrsw m5, m7 | |
13648 packuswb m3, m5 | |
13649 movu [r0 + 914 * 16], m3 | |
13650 | |
13651 pslldq m1, 2 | |
13652 pinsrw m1, [r4 + 10], 0 | |
13653 pmaddubsw m3, m1, m6 | |
13654 pmulhrsw m3, m7 | |
13655 pslldq m4, 2 | |
13656 pinsrw m4, [r4 + 18], 0 | |
13657 pmaddubsw m5, m4, m6 | |
13658 pmulhrsw m5, m7 | |
13659 packuswb m3, m5 | |
13660 movu [r0 + 915 * 16], m3 | |
13661 | |
13662 ; mode16 [row 10] | |
13663 movu m6, [r5 + 25 * 16] | |
13664 pslldq m0, 2 | |
13665 pinsrb m0, [r3 + 9], 1 | |
13666 pinsrb m0, [r3 + 11], 0 | |
13667 pmaddubsw m3, m0, m6 | |
13668 pmulhrsw m3, m7 | |
13669 pslldq m2, 2 | |
13670 pinsrw m2, [r4 + 1], 0 | |
13671 pmaddubsw m5, m2, m6 | |
13672 pmulhrsw m5, m7 | |
13673 packuswb m3, m5 | |
13674 movu [r0 + 916 * 16], m3 | |
13675 | |
13676 pslldq m1, 2 | |
13677 pinsrw m1, [r4 + 9], 0 | |
13678 pmaddubsw m3, m1, m6 | |
13679 pmulhrsw m3, m7 | |
13680 pslldq m4, 2 | |
13681 pinsrb m4, [r4 + 18], 1 | |
13682 pinsrb m4, [r4 + 17], 0 | |
13683 pmaddubsw m5, m4, m6 | |
13684 pmulhrsw m5, m7 | |
13685 packuswb m3, m5 | |
13686 movu [r0 + 917 * 16], m3 | |
13687 | |
13688 ; mode16 [row 11] | |
13689 movu m6, [r5 + 4 * 16] | |
13690 pmaddubsw m3, m0, m6 | |
13691 pmulhrsw m3, m7 | |
13692 pmaddubsw m5, m2, m6 | |
13693 pmulhrsw m5, m7 | |
13694 packuswb m3, m5 | |
13695 movu [r0 + 918 * 16], m3 | |
13696 | |
13697 pmaddubsw m3, m1, m6 | |
13698 pmulhrsw m3, m7 | |
13699 pmaddubsw m5, m4, m6 | |
13700 pmulhrsw m5, m7 | |
13701 packuswb m3, m5 | |
13702 movu [r0 + 919 * 16], m3 | |
13703 | |
13704 ; mode16 [row 12] | |
13705 movu m6, [r5 + 15 * 16] | |
13706 pslldq m0, 2 | |
13707 pinsrb m0, [r3 + 11], 1 | |
13708 pinsrb m0, [r3 + 12], 0 | |
13709 pmaddubsw m3, m0, m6 | |
13710 pmulhrsw m3, m7 | |
13711 pslldq m2, 2 | |
13712 pinsrw m2, [r4 + 0], 0 | |
13713 pmaddubsw m5, m2, m6 | |
13714 pmulhrsw m5, m7 | |
13715 packuswb m3, m5 | |
13716 movu [r0 + 920 * 16], m3 | |
13717 | |
13718 pslldq m1, 2 | |
13719 pinsrw m1, [r4 + 8], 0 | |
13720 pmaddubsw m3, m1, m6 | |
13721 pmulhrsw m3, m7 | |
13722 pslldq m4, 2 | |
13723 pinsrw m4, [r4 + 16], 0 | |
13724 pmaddubsw m5, m4, m6 | |
13725 pmulhrsw m5, m7 | |
13726 packuswb m3, m5 | |
13727 movu [r0 + 921 * 16], m3 | |
13728 | |
13729 ; mode16 [row 13] | |
13730 movu m6, [r5 + 26 * 16] | |
13731 pslldq m0, 2 | |
13732 pinsrb m0, [r3 + 12], 1 | |
13733 pinsrb m0, [r3 + 14], 0 | |
13734 pmaddubsw m3, m0, m6 | |
13735 pmulhrsw m3, m7 | |
13736 pslldq m2, 2 | |
13737 pinsrb m2, [r4 + 0], 1 | |
13738 pinsrb m2, [r3 + 2], 0 | |
13739 pmaddubsw m5, m2, m6 | |
13740 pmulhrsw m5, m7 | |
13741 packuswb m3, m5 | |
13742 movu [r0 + 922 * 16], m3 | |
13743 | |
13744 pslldq m1, 2 | |
13745 pinsrw m1, [r4 + 7], 0 | |
13746 pmaddubsw m3, m1, m6 | |
13747 pmulhrsw m3, m7 | |
13748 pslldq m4, 2 | |
13749 pinsrw m4, [r4 + 15], 0 | |
13750 pmaddubsw m5, m4, m6 | |
13751 pmulhrsw m5, m7 | |
13752 packuswb m3, m5 | |
13753 movu [r0 + 923 * 16], m3 | |
13754 | |
13755 ; mode16 [row 14] | |
13756 movu m6, [r5 + 5 * 16] | |
13757 pmaddubsw m3, m0, m6 | |
13758 pmulhrsw m3, m7 | |
13759 pmaddubsw m5, m2, m6 | |
13760 pmulhrsw m5, m7 | |
13761 packuswb m3, m5 | |
13762 movu [r0 + 924 * 16], m3 | |
13763 | |
13764 pmaddubsw m3, m1, m6 | |
13765 pmulhrsw m3, m7 | |
13766 pmaddubsw m5, m4, m6 | |
13767 pmulhrsw m5, m7 | |
13768 packuswb m3, m5 | |
13769 movu [r0 + 925 * 16], m3 | |
13770 | |
13771 ; mode16 [row 15] | |
13772 movu m6, [r5 + 16 * 16] | |
13773 pslldq m0, 2 | |
13774 pinsrb m0, [r3 + 14], 1 | |
13775 pinsrb m0, [r3 + 15], 0 | |
13776 pmaddubsw m3, m0, m6 | |
13777 pmulhrsw m3, m7 | |
13778 pslldq m2, 2 | |
13779 pinsrb m2, [r3 + 2], 1 | |
13780 pinsrb m2, [r3 + 3], 0 | |
13781 pmaddubsw m5, m2, m6 | |
13782 pmulhrsw m5, m7 | |
13783 packuswb m3, m5 | |
13784 movu [r0 + 926 * 16], m3 | |
13785 | |
13786 pslldq m1, 2 | |
13787 pinsrw m1, [r4 + 6], 0 | |
13788 pmaddubsw m3, m1, m6 | |
13789 pmulhrsw m3, m7 | |
13790 pslldq m4, 2 | |
13791 pinsrw m4, [r4 + 14], 0 | |
13792 pmaddubsw m5, m4, m6 | |
13793 pmulhrsw m5, m7 | |
13794 packuswb m3, m5 | |
13795 movu [r0 + 927 * 16], m3 | |
13796 | |
13797 ; mode16 [row 16] | |
13798 movu m6, [r5 + 27 * 16] | |
13799 pslldq m0, 2 | |
13800 pinsrb m0, [r3 + 15], 1 | |
13801 pinsrb m0, [r3 + 17], 0 | |
13802 pmaddubsw m3, m0, m6 | |
13803 pmulhrsw m3, m7 | |
13804 pslldq m2, 2 | |
13805 pinsrb m2, [r3 + 3], 1 | |
13806 pinsrb m2, [r3 + 5], 0 | |
13807 pmaddubsw m5, m2, m6 | |
13808 pmulhrsw m5, m7 | |
13809 packuswb m3, m5 | |
13810 movu [r0 + 928 * 16], m3 | |
13811 | |
13812 pslldq m1, 2 | |
13813 pinsrw m1, [r4 + 5], 0 | |
13814 pmaddubsw m3, m1, m6 | |
13815 pmulhrsw m3, m7 | |
13816 pslldq m4, 2 | |
13817 pinsrw m4, [r4 + 13], 0 | |
13818 pmaddubsw m5, m4, m6 | |
13819 pmulhrsw m5, m7 | |
13820 packuswb m3, m5 | |
13821 movu [r0 + 929 * 16], m3 | |
13822 | |
13823 ; mode16 [row 17] | |
13824 movu m6, [r5 + 6 * 16] | |
13825 pmaddubsw m3, m0, m6 | |
13826 pmulhrsw m3, m7 | |
13827 pmaddubsw m5, m2, m6 | |
13828 pmulhrsw m5, m7 | |
13829 packuswb m3, m5 | |
13830 movu [r0 + 930 * 16], m3 | |
13831 | |
13832 pmaddubsw m3, m1, m6 | |
13833 pmulhrsw m3, m7 | |
13834 pmaddubsw m5, m4, m6 | |
13835 pmulhrsw m5, m7 | |
13836 packuswb m3, m5 | |
13837 movu [r0 + 931 * 16], m3 | |
13838 | |
13839 ; mode16 [row 18] | |
13840 movu m6, [r5 + 17 * 16] | |
13841 pslldq m0, 2 | |
13842 pinsrb m0, [r3 + 17], 1 | |
13843 pinsrb m0, [r3 + 18], 0 | |
13844 pmaddubsw m3, m0, m6 | |
13845 pmulhrsw m3, m7 | |
13846 pslldq m2, 2 | |
13847 pinsrb m2, [r3 + 5], 1 | |
13848 pinsrb m2, [r3 + 6], 0 | |
13849 pmaddubsw m5, m2, m6 | |
13850 pmulhrsw m5, m7 | |
13851 packuswb m3, m5 | |
13852 movu [r0 + 932 * 16], m3 | |
13853 | |
13854 pslldq m1, 2 | |
13855 pinsrw m1, [r4 + 4], 0 | |
13856 pmaddubsw m3, m1, m6 | |
13857 pmulhrsw m3, m7 | |
13858 pslldq m4, 2 | |
13859 pinsrw m4, [r4 + 12], 0 | |
13860 pmaddubsw m5, m4, m6 | |
13861 pmulhrsw m5, m7 | |
13862 packuswb m3, m5 | |
13863 movu [r0 + 933 * 16], m3 | |
13864 | |
13865 ; mode16 [row 19] | |
13866 movu m6, [r5 + 28 * 16] | |
13867 pslldq m0, 2 | |
13868 pinsrb m0, [r3 + 18], 1 | |
13869 pinsrb m0, [r3 + 20], 0 | |
13870 pmaddubsw m3, m0, m6 | |
13871 pmulhrsw m3, m7 | |
13872 pslldq m2, 2 | |
13873 pinsrb m2, [r3 + 6], 1 | |
13874 pinsrb m2, [r3 + 8], 0 | |
13875 pmaddubsw m5, m2, m6 | |
13876 pmulhrsw m5, m7 | |
13877 packuswb m3, m5 | |
13878 movu [r0 + 934 * 16], m3 | |
13879 | |
13880 pslldq m1, 2 | |
13881 pinsrw m1, [r4 + 3], 0 | |
13882 pmaddubsw m3, m1, m6 | |
13883 pmulhrsw m3, m7 | |
13884 pslldq m4, 2 | |
13885 pinsrw m4, [r4 + 11], 0 | |
13886 pmaddubsw m5, m4, m6 | |
13887 pmulhrsw m5, m7 | |
13888 packuswb m3, m5 | |
13889 movu [r0 + 935 * 16], m3 | |
13890 | |
13891 ; mode16 [row 20] | |
13892 movu m6, [r5 + 7 * 16] | |
13893 pmaddubsw m3, m0, m6 | |
13894 pmulhrsw m3, m7 | |
13895 pmaddubsw m5, m2, m6 | |
13896 pmulhrsw m5, m7 | |
13897 packuswb m3, m5 | |
13898 movu [r0 + 936 * 16], m3 | |
13899 | |
13900 pmaddubsw m3, m1, m6 | |
13901 pmulhrsw m3, m7 | |
13902 pmaddubsw m5, m4, m6 | |
13903 pmulhrsw m5, m7 | |
13904 packuswb m3, m5 | |
13905 movu [r0 + 937 * 16], m3 | |
13906 | |
13907 ; mode16 [row 21] | |
13908 movu m6, [r5 + 18 * 16] | |
13909 pslldq m0, 2 | |
13910 pinsrb m0, [r3 + 20], 1 | |
13911 pinsrb m0, [r3 + 21], 0 | |
13912 pmaddubsw m3, m0, m6 | |
13913 pmulhrsw m3, m7 | |
13914 pslldq m2, 2 | |
13915 pinsrb m2, [r3 + 8], 1 | |
13916 pinsrb m2, [r3 + 9], 0 | |
13917 pmaddubsw m5, m2, m6 | |
13918 pmulhrsw m5, m7 | |
13919 packuswb m3, m5 | |
13920 movu [r0 + 938 * 16], m3 | |
13921 | |
13922 pslldq m1, 2 | |
13923 pinsrw m1, [r4 + 2], 0 | |
13924 pmaddubsw m3, m1, m6 | |
13925 pmulhrsw m3, m7 | |
13926 pslldq m4, 2 | |
13927 pinsrw m4, [r4 + 10], 0 | |
13928 pmaddubsw m5, m4, m6 | |
13929 pmulhrsw m5, m7 | |
13930 packuswb m3, m5 | |
13931 movu [r0 + 939 * 16], m3 | |
13932 | |
13933 ; mode16 [row 22] | |
13934 movu m6, [r5 + 29 * 16] | |
13935 pslldq m0, 2 | |
13936 pinsrb m0, [r3 + 21], 1 | |
13937 pinsrb m0, [r3 + 23], 0 | |
13938 pmaddubsw m3, m0, m6 | |
13939 pmulhrsw m3, m7 | |
13940 pslldq m2, 2 | |
13941 pinsrb m2, [r3 + 9], 1 | |
13942 pinsrb m2, [r3 + 11], 0 | |
13943 pmaddubsw m5, m2, m6 | |
13944 pmulhrsw m5, m7 | |
13945 packuswb m3, m5 | |
13946 movu [r0 + 940 * 16], m3 | |
13947 | |
13948 pslldq m1, 2 | |
13949 pinsrw m1, [r4 + 1], 0 | |
13950 pmaddubsw m3, m1, m6 | |
13951 pmulhrsw m3, m7 | |
13952 pslldq m4, 2 | |
13953 pinsrw m4, [r4 + 9], 0 | |
13954 pmaddubsw m5, m4, m6 | |
13955 pmulhrsw m5, m7 | |
13956 packuswb m3, m5 | |
13957 movu [r0 + 941 * 16], m3 | |
13958 | |
13959 ; mode16 [row 23] | |
13960 movu m6, [r5 + 8 * 16] | |
13961 pmaddubsw m3, m0, m6 | |
13962 pmulhrsw m3, m7 | |
13963 pmaddubsw m5, m2, m6 | |
13964 pmulhrsw m5, m7 | |
13965 packuswb m3, m5 | |
13966 movu [r0 + 942 * 16], m3 | |
13967 | |
13968 pmaddubsw m3, m1, m6 | |
13969 pmulhrsw m3, m7 | |
13970 pmaddubsw m5, m4, m6 | |
13971 pmulhrsw m5, m7 | |
13972 packuswb m3, m5 | |
13973 movu [r0 + 943 * 16], m3 | |
13974 | |
13975 ; mode16 [row 24] | |
13976 movu m6, [r5 + 19 * 16] | |
13977 pslldq m0, 2 | |
13978 pinsrb m0, [r3 + 23], 1 | |
13979 pinsrb m0, [r3 + 24], 0 | |
13980 pmaddubsw m3, m0, m6 | |
13981 pmulhrsw m3, m7 | |
13982 pslldq m2, 2 | |
13983 pinsrb m2, [r3 + 11], 1 | |
13984 pinsrb m2, [r3 + 12], 0 | |
13985 pmaddubsw m5, m2, m6 | |
13986 pmulhrsw m5, m7 | |
13987 packuswb m3, m5 | |
13988 movu [r0 + 944 * 16], m3 | |
13989 | |
13990 pslldq m1, 2 | |
13991 pinsrw m1, [r4 + 0], 0 | |
13992 pmaddubsw m3, m1, m6 | |
13993 pmulhrsw m3, m7 | |
13994 pslldq m4, 2 | |
13995 pinsrw m4, [r4 + 8], 0 | |
13996 pmaddubsw m5, m4, m6 | |
13997 pmulhrsw m5, m7 | |
13998 packuswb m3, m5 | |
13999 movu [r0 + 945 * 16], m3 | |
14000 | |
14001 ; mode16 [row 25] | |
14002 movu m6, [r5 + 30 * 16] | |
14003 pslldq m0, 2 | |
14004 pinsrb m0, [r3 + 24], 1 | |
14005 pinsrb m0, [r3 + 26], 0 | |
14006 pmaddubsw m3, m0, m6 | |
14007 pmulhrsw m3, m7 | |
14008 pslldq m2, 2 | |
14009 pinsrb m2, [r3 + 12], 1 | |
14010 pinsrb m2, [r3 + 14], 0 | |
14011 pmaddubsw m5, m2, m6 | |
14012 pmulhrsw m5, m7 | |
14013 packuswb m3, m5 | |
14014 movu [r0 + 946 * 16], m3 | |
14015 | |
14016 pslldq m1, 2 | |
14017 pinsrb m1, [r4 + 0], 1 | |
14018 pinsrb m1, [r3 + 2], 0 | |
14019 pmaddubsw m3, m1, m6 | |
14020 pmulhrsw m3, m7 | |
14021 pslldq m4, 2 | |
14022 pinsrw m4, [r4 + 7], 0 | |
14023 pmaddubsw m5, m4, m6 | |
14024 pmulhrsw m5, m7 | |
14025 packuswb m3, m5 | |
14026 movu [r0 + 947 * 16], m3 | |
14027 | |
14028 ; mode16 [row 26] | |
14029 movu m6, [r5 + 9 * 16] | |
14030 pmaddubsw m3, m0, m6 | |
14031 pmulhrsw m3, m7 | |
14032 pmaddubsw m5, m2, m6 | |
14033 pmulhrsw m5, m7 | |
14034 packuswb m3, m5 | |
14035 movu [r0 + 948 * 16], m3 | |
14036 | |
14037 pmaddubsw m3, m1, m6 | |
14038 pmulhrsw m3, m7 | |
14039 pmaddubsw m5, m4, m6 | |
14040 pmulhrsw m5, m7 | |
14041 packuswb m3, m5 | |
14042 movu [r0 + 949 * 16], m3 | |
14043 | |
14044 ; mode16 [row 27] | |
14045 movu m6, [r5 + 20 * 16] | |
14046 pslldq m0, 2 | |
14047 pinsrb m0, [r3 + 26], 1 | |
14048 pinsrb m0, [r3 + 27], 0 | |
14049 pmaddubsw m3, m0, m6 | |
14050 pmulhrsw m3, m7 | |
14051 pslldq m2, 2 | |
14052 pinsrb m2, [r3 + 14], 1 | |
14053 pinsrb m2, [r3 + 15], 0 | |
14054 pmaddubsw m5, m2, m6 | |
14055 pmulhrsw m5, m7 | |
14056 packuswb m3, m5 | |
14057 movu [r0 + 950 * 16], m3 | |
14058 | |
14059 pslldq m1, 2 | |
14060 pinsrb m1, [r3 + 2], 1 | |
14061 pinsrb m1, [r3 + 3], 0 | |
14062 pmaddubsw m3, m1, m6 | |
14063 pmulhrsw m3, m7 | |
14064 pslldq m4, 2 | |
14065 pinsrw m4, [r4 + 6], 0 | |
14066 pmaddubsw m5, m4, m6 | |
14067 pmulhrsw m5, m7 | |
14068 packuswb m3, m5 | |
14069 movu [r0 + 951 * 16], m3 | |
14070 | |
14071 ; mode16 [row 28] | |
14072 movu m6, [r5 + 31 * 16] | |
14073 pslldq m0, 2 | |
14074 pinsrb m0, [r3 + 27], 1 | |
14075 pinsrb m0, [r3 + 29], 0 | |
14076 pmaddubsw m3, m0, m6 | |
14077 pmulhrsw m3, m7 | |
14078 pslldq m2, 2 | |
14079 pinsrb m2, [r3 + 15], 1 | |
14080 pinsrb m2, [r3 + 17], 0 | |
14081 pmaddubsw m5, m2, m6 | |
14082 pmulhrsw m5, m7 | |
14083 packuswb m3, m5 | |
14084 movu [r0 + 952 * 16], m3 | |
14085 | |
14086 pslldq m1, 2 | |
14087 pinsrb m1, [r3 + 3], 1 | |
14088 pinsrb m1, [r3 + 5], 0 | |
14089 pmaddubsw m3, m1, m6 | |
14090 pmulhrsw m3, m7 | |
14091 pslldq m4, 2 | |
14092 pinsrw m4, [r4 + 5], 0 | |
14093 pmaddubsw m5, m4, m6 | |
14094 pmulhrsw m5, m7 | |
14095 packuswb m3, m5 | |
14096 movu [r0 + 953 * 16], m3 | |
14097 | |
14098 ; mode16 [row 29] | |
14099 movu m6, [r5 + 10 * 16] | |
14100 pmaddubsw m3, m0, m6 | |
14101 pmulhrsw m3, m7 | |
14102 pmaddubsw m5, m2, m6 | |
14103 pmulhrsw m5, m7 | |
14104 packuswb m3, m5 | |
14105 movu [r0 + 954 * 16], m3 | |
14106 | |
14107 pmaddubsw m3, m1, m6 | |
14108 pmulhrsw m3, m7 | |
14109 pmaddubsw m5, m4, m6 | |
14110 pmulhrsw m5, m7 | |
14111 packuswb m3, m5 | |
14112 movu [r0 + 955 * 16], m3 | |
14113 | |
14114 ; mode16 [row 30] | |
14115 movu m6, [r5 + 21 * 16] | |
14116 pslldq m0, 2 | |
14117 pinsrb m0, [r3 + 29], 1 | |
14118 pinsrb m0, [r3 + 30], 0 | |
14119 pmaddubsw m3, m0, m6 | |
14120 pmulhrsw m3, m7 | |
14121 pslldq m2, 2 | |
14122 pinsrb m2, [r3 + 17], 1 | |
14123 pinsrb m2, [r3 + 18], 0 | |
14124 pmaddubsw m5, m2, m6 | |
14125 pmulhrsw m5, m7 | |
14126 packuswb m3, m5 | |
14127 movu [r0 + 956 * 16], m3 | |
14128 | |
14129 pslldq m1, 2 | |
14130 pinsrb m1, [r3 + 5], 1 | |
14131 pinsrb m1, [r3 + 6], 0 | |
14132 pmaddubsw m3, m1, m6 | |
14133 pmulhrsw m3, m7 | |
14134 pslldq m4, 2 | |
14135 pinsrw m4, [r4 + 4], 0 | |
14136 pmaddubsw m5, m4, m6 | |
14137 pmulhrsw m5, m7 | |
14138 packuswb m3, m5 | |
14139 movu [r0 + 957 * 16], m3 | |
14140 | |
14141 ; mode16 [row 31] | |
14142 pshufb m5, m0, [tab_S2] | |
14143 movh [r0 + 958 * 16], m5 | |
14144 pshufb m5, m2, [tab_S2] | |
14145 movh [r0 + 958 * 16 + 8], m5 | |
14146 pshufb m5, m1, [tab_S2] | |
14147 movh [r0 + 959 * 16], m5 | |
14148 pshufb m5, m4, [tab_S2] | |
14149 movh [r0 + 959 * 16 + 8], m5 | |
14150 | |
14151 ; mode 17 [row 0] | |
14152 movu m6, [r5 + 6 * 16] | |
14153 movu m7, [pw_1024] | |
14154 movh m0, [r4 ] | |
14155 movh m1, [r4 + 1 ] | |
14156 punpcklbw m0, m1 | |
14157 pmaddubsw m1, m0, m6 | |
14158 pmulhrsw m1, m7 | |
14159 movh m2, [r4 + 8] | |
14160 movh m3, [r4 + 9] | |
14161 punpcklbw m2, m3 | |
14162 pmaddubsw m3, m2, m6 | |
14163 pmulhrsw m3, m7 | |
14164 packuswb m1, m3 | |
14165 movu [r0 + 960 * 16], m1 | |
14166 | |
14167 movh m1, [r4 + 16] | |
14168 movh m3, [r4 + 17] | |
14169 punpcklbw m1, m3 | |
14170 pmaddubsw m3, m1, m6 | |
14171 pmulhrsw m3, m7 | |
14172 movh m4, [r4 + 24] | |
14173 movh m5, [r4 + 25] | |
14174 punpcklbw m4, m5 | |
14175 pmaddubsw m5, m4, m6 | |
14176 pmulhrsw m5, m7 | |
14177 packuswb m3, m5 | |
14178 movu [r0 + 961 * 16], m3 | |
14179 | |
14180 ; mode17 [row 1] | |
14181 movu m6, [r5 + 12 * 16] | |
14182 pslldq m0, 2 | |
14183 pinsrb m0, [r3 + 0], 1 | |
14184 pinsrb m0, [r3 + 1], 0 | |
14185 pmaddubsw m3, m0, m6 | |
14186 pmulhrsw m3, m7 | |
14187 pslldq m2, 2 | |
14188 pinsrw m2, [r4 + 7], 0 | |
14189 pmaddubsw m5, m2, m6 | |
14190 pmulhrsw m5, m7 | |
14191 packuswb m3, m5 | |
14192 movu [r0 + 962 * 16], m3 | |
14193 | |
14194 pslldq m1, 2 | |
14195 pinsrw m1, [r4 + 15], 0 | |
14196 pmaddubsw m3, m1, m6 | |
14197 pmulhrsw m3, m7 | |
14198 pslldq m4, 2 | |
14199 pinsrw m4, [r4 + 23], 0 | |
14200 pmaddubsw m5, m4, m6 | |
14201 pmulhrsw m5, m7 | |
14202 packuswb m3, m5 | |
14203 movu [r0 + 963 * 16], m3 | |
14204 | |
14205 ; mode17 [row 2] | |
14206 movu m6, [r5 + 18 * 16] | |
14207 pslldq m0, 2 | |
14208 pinsrb m0, [r3 + 1], 1 | |
14209 pinsrb m0, [r3 + 2], 0 | |
14210 pmaddubsw m3, m0, m6 | |
14211 pmulhrsw m3, m7 | |
14212 pslldq m2, 2 | |
14213 pinsrw m2, [r4 + 6], 0 | |
14214 pmaddubsw m5, m2, m6 | |
14215 pmulhrsw m5, m7 | |
14216 packuswb m3, m5 | |
14217 movu [r0 + 964 * 16], m3 | |
14218 | |
14219 pslldq m1, 2 | |
14220 pinsrw m1, [r4 + 14], 0 | |
14221 pmaddubsw m3, m1, m6 | |
14222 pmulhrsw m3, m7 | |
14223 pslldq m4, 2 | |
14224 pinsrw m4, [r4 + 22], 0 | |
14225 pmaddubsw m5, m4, m6 | |
14226 pmulhrsw m5, m7 | |
14227 packuswb m3, m5 | |
14228 movu [r0 + 965 * 16], m3 | |
14229 | |
14230 ; mode17 [row 3] | |
14231 movu m6, [r5 + 24 * 16] | |
14232 pslldq m0, 2 | |
14233 pinsrb m0, [r3 + 2], 1 | |
14234 pinsrb m0, [r3 + 4], 0 | |
14235 pmaddubsw m3, m0, m6 | |
14236 pmulhrsw m3, m7 | |
14237 pslldq m2, 2 | |
14238 pinsrw m2, [r4 + 5], 0 | |
14239 pmaddubsw m5, m2, m6 | |
14240 pmulhrsw m5, m7 | |
14241 packuswb m3, m5 | |
14242 movu [r0 + 966 * 16], m3 | |
14243 | |
14244 pslldq m1, 2 | |
14245 pinsrw m1, [r4 + 13], 0 | |
14246 pmaddubsw m3, m1, m6 | |
14247 pmulhrsw m3, m7 | |
14248 pslldq m4, 2 | |
14249 pinsrw m4, [r4 + 21], 0 | |
14250 pmaddubsw m5, m4, m6 | |
14251 pmulhrsw m5, m7 | |
14252 packuswb m3, m5 | |
14253 movu [r0 + 967 * 16], m3 | |
14254 | |
14255 ; mode17 [row 4] | |
14256 movu m6, [r5 + 30 * 16] | |
14257 pslldq m0, 2 | |
14258 pinsrb m0, [r3 + 4], 1 | |
14259 pinsrb m0, [r3 + 5], 0 | |
14260 pmaddubsw m3, m0, m6 | |
14261 pmulhrsw m3, m7 | |
14262 pslldq m2, 2 | |
14263 pinsrw m2, [r4 + 4], 0 | |
14264 pmaddubsw m5, m2, m6 | |
14265 pmulhrsw m5, m7 | |
14266 packuswb m3, m5 | |
14267 movu [r0 + 968 * 16], m3 | |
14268 | |
14269 pslldq m1, 2 | |
14270 pinsrw m1, [r4 + 12], 0 | |
14271 pmaddubsw m3, m1, m6 | |
14272 pmulhrsw m3, m7 | |
14273 pslldq m4, 2 | |
14274 pinsrw m4, [r4 + 20], 0 | |
14275 pmaddubsw m5, m4, m6 | |
14276 pmulhrsw m5, m7 | |
14277 packuswb m3, m5 | |
14278 movu [r0 + 969 * 16], m3 | |
14279 | |
14280 ; mode17 [row 5] | |
14281 movu m6, [r5 + 4 * 16] | |
14282 pmaddubsw m3, m0, m6 | |
14283 pmulhrsw m3, m7 | |
14284 pmaddubsw m5, m2, m6 | |
14285 pmulhrsw m5, m7 | |
14286 packuswb m3, m5 | |
14287 movu [r0 + 970 * 16], m3 | |
14288 | |
14289 pmaddubsw m3, m1, m6 | |
14290 pmulhrsw m3, m7 | |
14291 pmaddubsw m5, m4, m6 | |
14292 pmulhrsw m5, m7 | |
14293 packuswb m3, m5 | |
14294 movu [r0 + 971 * 16], m3 | |
14295 | |
14296 ; mode17 [row 6] | |
14297 movu m6, [r5 + 10 * 16] | |
14298 pslldq m0, 2 | |
14299 pinsrb m0, [r3 + 5], 1 | |
14300 pinsrb m0, [r3 + 6], 0 | |
14301 pmaddubsw m3, m0, m6 | |
14302 pmulhrsw m3, m7 | |
14303 pslldq m2, 2 | |
14304 pinsrw m2, [r4 + 3], 0 | |
14305 pmaddubsw m5, m2, m6 | |
14306 pmulhrsw m5, m7 | |
14307 packuswb m3, m5 | |
14308 movu [r0 + 972 * 16], m3 | |
14309 | |
14310 pslldq m1, 2 | |
14311 pinsrw m1, [r4 + 11], 0 | |
14312 pmaddubsw m3, m1, m6 | |
14313 pmulhrsw m3, m7 | |
14314 pslldq m4, 2 | |
14315 pinsrw m4, [r4 + 19], 0 | |
14316 pmaddubsw m5, m4, m6 | |
14317 pmulhrsw m5, m7 | |
14318 packuswb m3, m5 | |
14319 movu [r0 + 973 * 16], m3 | |
14320 | |
14321 ; mode17 [row 7] | |
14322 movu m6, [r5 + 16 * 16] | |
14323 pslldq m0, 2 | |
14324 pinsrb m0, [r3 + 6], 1 | |
14325 pinsrb m0, [r3 + 7], 0 | |
14326 pmaddubsw m3, m0, m6 | |
14327 pmulhrsw m3, m7 | |
14328 pslldq m2, 2 | |
14329 pinsrw m2, [r4 + 2], 0 | |
14330 pmaddubsw m5, m2, m6 | |
14331 pmulhrsw m5, m7 | |
14332 packuswb m3, m5 | |
14333 movu [r0 + 974 * 16], m3 | |
14334 | |
14335 pslldq m1, 2 | |
14336 pinsrw m1, [r4 + 10], 0 | |
14337 pmaddubsw m3, m1, m6 | |
14338 pmulhrsw m3, m7 | |
14339 pslldq m4, 2 | |
14340 pinsrw m4, [r4 + 18], 0 | |
14341 pmaddubsw m5, m4, m6 | |
14342 pmulhrsw m5, m7 | |
14343 packuswb m3, m5 | |
14344 movu [r0 + 975 * 16], m3 | |
14345 | |
14346 ; mode17 [row 8] | |
14347 movu m6, [r5 + 22 * 16] | |
14348 pslldq m0, 2 | |
14349 pinsrb m0, [r3 + 7], 1 | |
14350 pinsrb m0, [r3 + 9], 0 | |
14351 pmaddubsw m3, m0, m6 | |
14352 pmulhrsw m3, m7 | |
14353 pslldq m2, 2 | |
14354 pinsrw m2, [r4 + 1], 0 | |
14355 pmaddubsw m5, m2, m6 | |
14356 pmulhrsw m5, m7 | |
14357 packuswb m3, m5 | |
14358 movu [r0 + 976 * 16], m3 | |
14359 | |
14360 pslldq m1, 2 | |
14361 pinsrw m1, [r4 + 9], 0 | |
14362 pmaddubsw m3, m1, m6 | |
14363 pmulhrsw m3, m7 | |
14364 pslldq m4, 2 | |
14365 pinsrw m4, [r4 + 17], 0 | |
14366 pmaddubsw m5, m4, m6 | |
14367 pmulhrsw m5, m7 | |
14368 packuswb m3, m5 | |
14369 movu [r0 + 977 * 16], m3 | |
14370 | |
14371 ; mode17 [row 9] | |
14372 movu m6, [r5 + 28 * 16] | |
14373 pslldq m0, 2 | |
14374 pinsrb m0, [r3 + 9], 1 | |
14375 pinsrb m0, [r3 + 10], 0 | |
14376 pmaddubsw m3, m0, m6 | |
14377 pmulhrsw m3, m7 | |
14378 pslldq m2, 2 | |
14379 pinsrw m2, [r4 + 0], 0 | |
14380 pmaddubsw m5, m2, m6 | |
14381 pmulhrsw m5, m7 | |
14382 packuswb m3, m5 | |
14383 movu [r0 + 978 * 16], m3 | |
14384 | |
14385 pslldq m1, 2 | |
14386 pinsrw m1, [r4 + 8], 0 | |
14387 pmaddubsw m3, m1, m6 | |
14388 pmulhrsw m3, m7 | |
14389 pslldq m4, 2 | |
14390 pinsrw m4, [r4 + 16], 0 | |
14391 pmaddubsw m5, m4, m6 | |
14392 pmulhrsw m5, m7 | |
14393 packuswb m3, m5 | |
14394 movu [r0 + 979 * 16], m3 | |
14395 | |
14396 ; mode17 [row 10] | |
14397 movu m6, [r5 + 2 * 16] | |
14398 pmaddubsw m3, m0, m6 | |
14399 pmulhrsw m3, m7 | |
14400 pmaddubsw m5, m2, m6 | |
14401 pmulhrsw m5, m7 | |
14402 packuswb m3, m5 | |
14403 movu [r0 + 980 * 16], m3 | |
14404 | |
14405 pmaddubsw m3, m1, m6 | |
14406 pmulhrsw m3, m7 | |
14407 pmaddubsw m5, m4, m6 | |
14408 pmulhrsw m5, m7 | |
14409 packuswb m3, m5 | |
14410 movu [r0 + 981 * 16], m3 | |
14411 | |
14412 ; mode17 [row 11] | |
14413 movu m6, [r5 + 8 * 16] | |
14414 pslldq m0, 2 | |
14415 pinsrb m0, [r3 + 10], 1 | |
14416 pinsrb m0, [r3 + 11], 0 | |
14417 pmaddubsw m3, m0, m6 | |
14418 pmulhrsw m3, m7 | |
14419 pslldq m2, 2 | |
14420 pinsrb m2, [r4 + 0], 1 | |
14421 pinsrb m2, [r3 + 1], 0 | |
14422 pmaddubsw m5, m2, m6 | |
14423 pmulhrsw m5, m7 | |
14424 packuswb m3, m5 | |
14425 movu [r0 + 982 * 16], m3 | |
14426 | |
14427 pslldq m1, 2 | |
14428 pinsrw m1, [r4 + 7], 0 | |
14429 pmaddubsw m3, m1, m6 | |
14430 pmulhrsw m3, m7 | |
14431 pslldq m4, 2 | |
14432 pinsrw m4, [r4 + 15], 0 | |
14433 pmaddubsw m5, m4, m6 | |
14434 pmulhrsw m5, m7 | |
14435 packuswb m3, m5 | |
14436 movu [r0 + 983 * 16], m3 | |
14437 | |
14438 ; mode17 [row 12] | |
14439 movu m6, [r5 + 14 * 16] | |
14440 pslldq m0, 2 | |
14441 pinsrb m0, [r3 + 11], 1 | |
14442 pinsrb m0, [r3 + 12], 0 | |
14443 pmaddubsw m3, m0, m6 | |
14444 pmulhrsw m3, m7 | |
14445 pslldq m2, 2 | |
14446 pinsrb m2, [r3 + 1], 1 | |
14447 pinsrb m2, [r3 + 2], 0 | |
14448 pmaddubsw m5, m2, m6 | |
14449 pmulhrsw m5, m7 | |
14450 packuswb m3, m5 | |
14451 movu [r0 + 984 * 16], m3 | |
14452 | |
14453 pslldq m1, 2 | |
14454 pinsrw m1, [r4 + 6], 0 | |
14455 pmaddubsw m3, m1, m6 | |
14456 pmulhrsw m3, m7 | |
14457 pslldq m4, 2 | |
14458 pinsrw m4, [r4 + 14], 0 | |
14459 pmaddubsw m5, m4, m6 | |
14460 pmulhrsw m5, m7 | |
14461 packuswb m3, m5 | |
14462 movu [r0 + 985 * 16], m3 | |
14463 | |
14464 ; mode17 [row 13] | |
14465 movu m6, [r5 + 20 * 16] | |
14466 pslldq m0, 2 | |
14467 pinsrb m0, [r3 + 12], 1 | |
14468 pinsrb m0, [r3 + 14], 0 | |
14469 pmaddubsw m3, m0, m6 | |
14470 pmulhrsw m3, m7 | |
14471 pslldq m2, 2 | |
14472 pinsrb m2, [r3 + 2], 1 | |
14473 pinsrb m2, [r3 + 4], 0 | |
14474 pmaddubsw m5, m2, m6 | |
14475 pmulhrsw m5, m7 | |
14476 packuswb m3, m5 | |
14477 movu [r0 + 986 * 16], m3 | |
14478 | |
14479 pslldq m1, 2 | |
14480 pinsrw m1, [r4 + 5], 0 | |
14481 pmaddubsw m3, m1, m6 | |
14482 pmulhrsw m3, m7 | |
14483 pslldq m4, 2 | |
14484 pinsrw m4, [r4 + 13], 0 | |
14485 pmaddubsw m5, m4, m6 | |
14486 pmulhrsw m5, m7 | |
14487 packuswb m3, m5 | |
14488 movu [r0 + 987 * 16], m3 | |
14489 | |
14490 ; mode17 [row 14] | |
14491 movu m6, [r5 + 26 * 16] | |
14492 pslldq m0, 2 | |
14493 pinsrb m0, [r3 + 14], 1 | |
14494 pinsrb m0, [r3 + 15], 0 | |
14495 pmaddubsw m3, m0, m6 | |
14496 pmulhrsw m3, m7 | |
14497 pslldq m2, 2 | |
14498 pinsrb m2, [r3 + 4], 1 | |
14499 pinsrb m2, [r3 + 5], 0 | |
14500 pmaddubsw m5, m2, m6 | |
14501 pmulhrsw m5, m7 | |
14502 packuswb m3, m5 | |
14503 movu [r0 + 988 * 16], m3 | |
14504 | |
14505 pslldq m1, 2 | |
14506 pinsrw m1, [r4 + 4], 0 | |
14507 pmaddubsw m3, m1, m6 | |
14508 pmulhrsw m3, m7 | |
14509 pslldq m4, 2 | |
14510 pinsrw m4, [r4 + 12], 0 | |
14511 pmaddubsw m5, m4, m6 | |
14512 pmulhrsw m5, m7 | |
14513 packuswb m3, m5 | |
14514 movu [r0 + 989 * 16], m3 | |
14515 | |
14516 ; mode17 [row 15] | |
14517 pshufb m5, m0, [tab_S2] | |
14518 movh [r0 + 990 * 16], m5 | |
14519 pshufb m5, m2, [tab_S2] | |
14520 movh [r0 + 990 * 16 + 8], m5 | |
14521 pshufb m5, m1, [tab_S2] | |
14522 movh [r0 + 991 * 16], m5 | |
14523 pshufb m5, m4, [tab_S2] | |
14524 movh [r0 + 991 * 16 + 8], m5 | |
14525 | |
14526 ; mode17 [row 16] | |
14527 movu m6, [r5 + 6 * 16] | |
14528 pslldq m0, 2 | |
14529 pinsrb m0, [r3 + 15], 1 | |
14530 pinsrb m0, [r3 + 16], 0 | |
14531 pmaddubsw m3, m0, m6 | |
14532 pmulhrsw m3, m7 | |
14533 pslldq m2, 2 | |
14534 pinsrb m2, [r3 + 5], 1 | |
14535 pinsrb m2, [r3 + 6], 0 | |
14536 pmaddubsw m5, m2, m6 | |
14537 pmulhrsw m5, m7 | |
14538 packuswb m3, m5 | |
14539 movu [r0 + 992 * 16], m3 | |
14540 | |
14541 pslldq m1, 2 | |
14542 pinsrw m1, [r4 + 3], 0 | |
14543 pmaddubsw m3, m1, m6 | |
14544 pmulhrsw m3, m7 | |
14545 pslldq m4, 2 | |
14546 pinsrw m4, [r4 + 11], 0 | |
14547 pmaddubsw m5, m4, m6 | |
14548 pmulhrsw m5, m7 | |
14549 packuswb m3, m5 | |
14550 movu [r0 + 993 * 16], m3 | |
14551 | |
14552 ; mode17 [row 17] | |
14553 movu m6, [r5 + 12 * 16] | |
14554 pslldq m0, 2 | |
14555 pinsrb m0, [r3 + 16], 1 | |
14556 pinsrb m0, [r3 + 17], 0 | |
14557 pmaddubsw m3, m0, m6 | |
14558 pmulhrsw m3, m7 | |
14559 pslldq m2, 2 | |
14560 pinsrb m2, [r3 + 6], 1 | |
14561 pinsrb m2, [r3 + 7], 0 | |
14562 pmaddubsw m5, m2, m6 | |
14563 pmulhrsw m5, m7 | |
14564 packuswb m3, m5 | |
14565 movu [r0 + 994 * 16], m3 | |
14566 | |
14567 pslldq m1, 2 | |
14568 pinsrw m1, [r4 + 2], 0 | |
14569 pmaddubsw m3, m1, m6 | |
14570 pmulhrsw m3, m7 | |
14571 pslldq m4, 2 | |
14572 pinsrw m4, [r4 + 10], 0 | |
14573 pmaddubsw m5, m4, m6 | |
14574 pmulhrsw m5, m7 | |
14575 packuswb m3, m5 | |
14576 movu [r0 + 995 * 16], m3 | |
14577 | |
14578 ; mode17 [row 18] | |
14579 movu m6, [r5 + 18 * 16] | |
14580 pslldq m0, 2 | |
14581 pinsrb m0, [r3 + 17], 1 | |
14582 pinsrb m0, [r3 + 18], 0 | |
14583 pmaddubsw m3, m0, m6 | |
14584 pmulhrsw m3, m7 | |
14585 pslldq m2, 2 | |
14586 pinsrb m2, [r3 + 7], 1 | |
14587 pinsrb m2, [r3 + 9], 0 | |
14588 pmaddubsw m5, m2, m6 | |
14589 pmulhrsw m5, m7 | |
14590 packuswb m3, m5 | |
14591 movu [r0 + 996 * 16], m3 | |
14592 | |
14593 pslldq m1, 2 | |
14594 pinsrw m1, [r4 + 1], 0 | |
14595 pmaddubsw m3, m1, m6 | |
14596 pmulhrsw m3, m7 | |
14597 pslldq m4, 2 | |
14598 pinsrw m4, [r4 + 9], 0 | |
14599 pmaddubsw m5, m4, m6 | |
14600 pmulhrsw m5, m7 | |
14601 packuswb m3, m5 | |
14602 movu [r0 + 997 * 16], m3 | |
14603 | |
14604 ; mode17 [row 19] | |
14605 movu m6, [r5 + 24 * 16] | |
14606 pslldq m0, 2 | |
14607 pinsrb m0, [r3 + 18], 1 | |
14608 pinsrb m0, [r3 + 20], 0 | |
14609 pmaddubsw m3, m0, m6 | |
14610 pmulhrsw m3, m7 | |
14611 pslldq m2, 2 | |
14612 pinsrb m2, [r3 + 9], 1 | |
14613 pinsrb m2, [r3 + 10], 0 | |
14614 pmaddubsw m5, m2, m6 | |
14615 pmulhrsw m5, m7 | |
14616 packuswb m3, m5 | |
14617 movu [r0 + 998 * 16], m3 | |
14618 | |
14619 pslldq m1, 2 | |
14620 pinsrw m1, [r4 + 0], 0 | |
14621 pmaddubsw m3, m1, m6 | |
14622 pmulhrsw m3, m7 | |
14623 pslldq m4, 2 | |
14624 pinsrw m4, [r4 + 8], 0 | |
14625 pmaddubsw m5, m4, m6 | |
14626 pmulhrsw m5, m7 | |
14627 packuswb m3, m5 | |
14628 movu [r0 + 999 * 16], m3 | |
14629 | |
14630 ; mode17 [row 20] | |
14631 movu m6, [r5 + 30 * 16] | |
14632 pslldq m0, 2 | |
14633 pinsrb m0, [r3 + 20], 1 | |
14634 pinsrb m0, [r3 + 21], 0 | |
14635 pmaddubsw m3, m0, m6 | |
14636 pmulhrsw m3, m7 | |
14637 pslldq m2, 2 | |
14638 pinsrb m2, [r3 + 10], 1 | |
14639 pinsrb m2, [r3 + 11], 0 | |
14640 pmaddubsw m5, m2, m6 | |
14641 pmulhrsw m5, m7 | |
14642 packuswb m3, m5 | |
14643 movu [r0 + 1000 * 16], m3 | |
14644 | |
14645 pslldq m1, 2 | |
14646 pinsrb m1, [r4 + 0], 1 | |
14647 pinsrb m1, [r3 + 1], 0 | |
14648 pmaddubsw m3, m1, m6 | |
14649 pmulhrsw m3, m7 | |
14650 pslldq m4, 2 | |
14651 ; byte-wise form kept for reference: pinsrb m4, [r4 + 8], 1 | |
14652 ; and pinsrb m4, [r4 + 7], 0 -- the single pinsrw below inserts the same two bytes | |
14653 pinsrw m4, [r4 + 7], 0 | |
14654 pmaddubsw m5, m4, m6 | |
14655 pmulhrsw m5, m7 | |
14656 packuswb m3, m5 | |
14657 movu [r0 + 1001 * 16], m3 | |
14658 | |
14659 ; mode17 [row 21] | |
14660 movu m6, [r5 + 4 * 16] | |
14661 pmaddubsw m3, m0, m6 | |
14662 pmulhrsw m3, m7 | |
14663 pmaddubsw m5, m2, m6 | |
14664 pmulhrsw m5, m7 | |
14665 packuswb m3, m5 | |
14666 movu [r0 + 1002 * 16], m3 | |
14667 | |
14668 pmaddubsw m3, m1, m6 | |
14669 pmulhrsw m3, m7 | |
14670 pmaddubsw m5, m4, m6 | |
14671 pmulhrsw m5, m7 | |
14672 packuswb m3, m5 | |
14673 movu [r0 + 1003 * 16], m3 | |
14674 | |
14675 ; mode17 [row 22] | |
14676 movu m6, [r5 + 10 * 16] | |
14677 pslldq m0, 2 | |
14678 pinsrb m0, [r3 + 21], 1 | |
14679 pinsrb m0, [r3 + 22], 0 | |
14680 pmaddubsw m3, m0, m6 | |
14681 pmulhrsw m3, m7 | |
14682 pslldq m2, 2 | |
14683 pinsrb m2, [r3 + 11], 1 | |
14684 pinsrb m2, [r3 + 12], 0 | |
14685 pmaddubsw m5, m2, m6 | |
14686 pmulhrsw m5, m7 | |
14687 packuswb m3, m5 | |
14688 movu [r0 + 1004 * 16], m3 | |
14689 | |
14690 pslldq m1, 2 | |
14691 pinsrb m1, [r3 + 1], 1 | |
14692 pinsrb m1, [r3 + 2], 0 | |
14693 pmaddubsw m3, m1, m6 | |
14694 pmulhrsw m3, m7 | |
14695 pslldq m4, 2 | |
14696 pinsrw m4, [r4 + 6], 0 | |
14697 pmaddubsw m5, m4, m6 | |
14698 pmulhrsw m5, m7 | |
14699 packuswb m3, m5 | |
14700 movu [r0 + 1005 * 16], m3 | |
14701 | |
14702 ; mode17 [row 23] | |
14703 movu m6, [r5 + 16 * 16] | |
14704 pslldq m0, 2 | |
14705 pinsrb m0, [r3 + 22], 1 | |
14706 pinsrb m0, [r3 + 23], 0 | |
14707 pmaddubsw m3, m0, m6 | |
14708 pmulhrsw m3, m7 | |
14709 pslldq m2, 2 | |
14710 pinsrb m2, [r3 + 12], 1 | |
14711 pinsrb m2, [r3 + 14], 0 | |
14712 pmaddubsw m5, m2, m6 | |
14713 pmulhrsw m5, m7 | |
14714 packuswb m3, m5 | |
14715 movu [r0 + 1006 * 16], m3 | |
14716 | |
14717 pslldq m1, 2 | |
14718 pinsrb m1, [r3 + 2], 1 | |
14719 pinsrb m1, [r3 + 4], 0 | |
14720 pmaddubsw m3, m1, m6 | |
14721 pmulhrsw m3, m7 | |
14722 pslldq m4, 2 | |
14723 pinsrw m4, [r4 + 5], 0 | |
14724 pmaddubsw m5, m4, m6 | |
14725 pmulhrsw m5, m7 | |
14726 packuswb m3, m5 | |
14727 movu [r0 + 1007 * 16], m3 | |
14728 | |
14729 ; mode17 [row 24] | |
14730 movu m6, [r5 + 22 * 16] | |
14731 pslldq m0, 2 | |
14732 pinsrb m0, [r3 + 23], 1 | |
14733 pinsrb m0, [r3 + 25], 0 | |
14734 pmaddubsw m3, m0, m6 | |
14735 pmulhrsw m3, m7 | |
14736 pslldq m2, 2 | |
14737 pinsrb m2, [r3 + 14], 1 | |
14738 pinsrb m2, [r3 + 15], 0 | |
14739 pmaddubsw m5, m2, m6 | |
14740 pmulhrsw m5, m7 | |
14741 packuswb m3, m5 | |
14742 movu [r0 + 1008 * 16], m3 | |
14743 | |
14744 pslldq m1, 2 | |
14745 pinsrb m1, [r3 + 4], 1 | |
14746 pinsrb m1, [r3 + 5], 0 | |
14747 pmaddubsw m3, m1, m6 | |
14748 pmulhrsw m3, m7 | |
14749 pslldq m4, 2 | |
14750 pinsrw m4, [r4 + 4], 0 | |
14751 pmaddubsw m5, m4, m6 | |
14752 pmulhrsw m5, m7 | |
14753 packuswb m3, m5 | |
14754 movu [r0 + 1009 * 16], m3 | |
14755 | |
14756 ; mode17 [row 25] | |
14757 movu m6, [r5 + 28 * 16] | |
14758 pslldq m0, 2 | |
14759 pinsrb m0, [r3 + 25], 1 | |
14760 pinsrb m0, [r3 + 26], 0 | |
14761 pmaddubsw m3, m0, m6 | |
14762 pmulhrsw m3, m7 | |
14763 pslldq m2, 2 | |
14764 pinsrb m2, [r3 + 15], 1 | |
14765 pinsrb m2, [r3 + 16], 0 | |
14766 pmaddubsw m5, m2, m6 | |
14767 pmulhrsw m5, m7 | |
14768 packuswb m3, m5 | |
14769 movu [r0 + 1010 * 16], m3 | |
14770 | |
14771 pslldq m1, 2 | |
14772 pinsrb m1, [r3 + 5], 1 | |
14773 pinsrb m1, [r3 + 6], 0 | |
14774 pmaddubsw m3, m1, m6 | |
14775 pmulhrsw m3, m7 | |
14776 pslldq m4, 2 | |
14777 pinsrw m4, [r4 + 3], 0 | |
14778 pmaddubsw m5, m4, m6 | |
14779 pmulhrsw m5, m7 | |
14780 packuswb m3, m5 | |
14781 movu [r0 + 1011 * 16], m3 | |
14782 | |
14783 ; mode17 [row 26] | |
14784 movu m6, [r5 + 2 * 16] | |
14785 pmaddubsw m3, m0, m6 | |
14786 pmulhrsw m3, m7 | |
14787 pmaddubsw m5, m2, m6 | |
14788 pmulhrsw m5, m7 | |
14789 packuswb m3, m5 | |
14790 movu [r0 + 1012 * 16], m3 | |
14791 | |
14792 pmaddubsw m3, m1, m6 | |
14793 pmulhrsw m3, m7 | |
14794 pmaddubsw m5, m4, m6 | |
14795 pmulhrsw m5, m7 | |
14796 packuswb m3, m5 | |
14797 movu [r0 + 1013 * 16], m3 | |
14798 | |
14799 ; mode17 [row 27] | |
14800 movu m6, [r5 + 8 * 16] | |
14801 pslldq m0, 2 | |
14802 pinsrb m0, [r3 + 26], 1 | |
14803 pinsrb m0, [r3 + 27], 0 | |
14804 pmaddubsw m3, m0, m6 | |
14805 pmulhrsw m3, m7 | |
14806 pslldq m2, 2 | |
14807 pinsrb m2, [r3 + 16], 1 | |
14808 pinsrb m2, [r3 + 17], 0 | |
14809 pmaddubsw m5, m2, m6 | |
14810 pmulhrsw m5, m7 | |
14811 packuswb m3, m5 | |
14812 movu [r0 + 1014 * 16], m3 | |
14813 | |
14814 pslldq m1, 2 | |
14815 pinsrb m1, [r3 + 6], 1 | |
14816 pinsrb m1, [r3 + 7], 0 | |
14817 pmaddubsw m3, m1, m6 | |
14818 pmulhrsw m3, m7 | |
14819 pslldq m4, 2 | |
14820 pinsrw m4, [r4 + 2], 0 | |
14821 pmaddubsw m5, m4, m6 | |
14822 pmulhrsw m5, m7 | |
14823 packuswb m3, m5 | |
14824 movu [r0 + 1015 * 16], m3 | |
14825 | |
14826 ; mode17 [row 28] | |
14827 movu m6, [r5 + 14 * 16] | |
14828 pslldq m0, 2 | |
14829 pinsrb m0, [r3 + 27], 1 | |
14830 pinsrb m0, [r3 + 28], 0 | |
14831 pmaddubsw m3, m0, m6 | |
14832 pmulhrsw m3, m7 | |
14833 pslldq m2, 2 | |
14834 pinsrb m2, [r3 + 17], 1 | |
14835 pinsrb m2, [r3 + 18], 0 | |
14836 pmaddubsw m5, m2, m6 | |
14837 pmulhrsw m5, m7 | |
14838 packuswb m3, m5 | |
14839 movu [r0 + 1016 * 16], m3 | |
14840 | |
14841 pslldq m1, 2 | |
14842 pinsrb m1, [r3 + 7], 1 | |
14843 pinsrb m1, [r3 + 9], 0 | |
14844 pmaddubsw m3, m1, m6 | |
14845 pmulhrsw m3, m7 | |
14846 pslldq m4, 2 | |
14847 pinsrw m4, [r4 + 1], 0 | |
14848 pmaddubsw m5, m4, m6 | |
14849 pmulhrsw m5, m7 | |
14850 packuswb m3, m5 | |
14851 movu [r0 + 1017 * 16], m3 | |
14852 | |
14853 ; mode17 [row 29] | |
14854 movu m6, [r5 + 20 * 16] | |
14855 pslldq m0, 2 | |
14856 pinsrb m0, [r3 + 28], 1 | |
14857 pinsrb m0, [r3 + 30], 0 | |
14858 pmaddubsw m3, m0, m6 | |
14859 pmulhrsw m3, m7 | |
14860 pslldq m2, 2 | |
14861 pinsrb m2, [r3 + 18], 1 | |
14862 pinsrb m2, [r3 + 20], 0 | |
14863 pmaddubsw m5, m2, m6 | |
14864 pmulhrsw m5, m7 | |
14865 packuswb m3, m5 | |
14866 movu [r0 + 1018 * 16], m3 | |
14867 | |
14868 pslldq m1, 2 | |
14869 pinsrb m1, [r3 + 9], 1 | |
14870 pinsrb m1, [r3 + 10], 0 | |
14871 pmaddubsw m3, m1, m6 | |
14872 pmulhrsw m3, m7 | |
14873 pslldq m4, 2 | |
14874 pinsrw m4, [r4 + 0], 0 | |
14875 pmaddubsw m5, m4, m6 | |
14876 pmulhrsw m5, m7 | |
14877 packuswb m3, m5 | |
14878 movu [r0 + 1019 * 16], m3 | |
14879 | |
14880 ; mode17 [row 30] | |
14881 movu m6, [r5 + 26 * 16] | |
14882 pslldq m0, 2 | |
14883 pinsrb m0, [r3 + 30], 1 | |
14884 pinsrb m0, [r3 + 31], 0 | |
14885 pmaddubsw m3, m0, m6 | |
14886 pmulhrsw m3, m7 | |
14887 pslldq m2, 2 | |
14888 pinsrb m2, [r3 + 20], 1 | |
14889 pinsrb m2, [r3 + 21], 0 | |
14890 pmaddubsw m5, m2, m6 | |
14891 pmulhrsw m5, m7 | |
14892 packuswb m3, m5 | |
14893 movu [r0 + 1020 * 16], m3 | |
14894 | |
14895 pslldq m1, 2 | |
14896 pinsrb m1, [r3 + 10], 1 | |
14897 pinsrb m1, [r3 + 11], 0 | |
14898 pmaddubsw m3, m1, m6 | |
14899 pmulhrsw m3, m7 | |
14900 pslldq m4, 2 | |
14901 pinsrb m4, [r4 + 0], 1 | |
14902 pinsrb m4, [r3 + 1], 0 | |
14903 pmaddubsw m5, m4, m6 | |
14904 pmulhrsw m5, m7 | |
14905 packuswb m3, m5 | |
14906 movu [r0 + 1021 * 16], m3 | |
14907 | |
14908 ; mode17 [row 31] | |
14909 pshufb m5, m0, [tab_S2] | |
14910 movh [r0 + 1022 * 16], m5 | |
14911 pshufb m5, m2, [tab_S2] | |
14912 movh [r0 + 1022 * 16 + 8], m5 | |
14913 pshufb m5, m1, [tab_S2] | |
14914 movh [r0 + 1023 * 16], m5 | |
14915 pshufb m5, m4, [tab_S2] | |
14916 movh [r0 + 1023 * 16 + 8], m5 | |
14917 | |
14918 ;mode 18[row 0] | |
14919 movu m0, [r3] | |
14920 movu [r0 + 1024 * 16], m0 | |
14921 movu m1, [r3 + 16] | |
14922 movu [r0 + 1025 * 16], m1 | |
14923 | |
14924 ;mode 18[row 1] | |
14925 pslldq m0, 1 | |
14926 pinsrb m0, [r4 + 1], 0 | |
14927 movu [r0 + 1026 * 16], m0 | |
14928 pslldq m1, 1 | |
14929 pinsrb m1, [r3 + 15], 0 | |
14930 movu [r0 + 1027 * 16], m1 | |
14931 | |
14932 ;mode 18[row 2] | |
14933 pslldq m0, 1 | |
14934 pinsrb m0, [r4 + 2], 0 | |
14935 movu [r0 + 1028 * 16], m0 | |
14936 pslldq m1, 1 | |
14937 pinsrb m1, [r3 + 14], 0 | |
14938 movu [r0 + 1029 * 16], m1 | |
14939 | |
14940 ;mode 18[row 3] | |
14941 pslldq m0, 1 | |
14942 pinsrb m0, [r4 + 3], 0 | |
14943 movu [r0 + 1030 * 16], m0 | |
14944 pslldq m1, 1 | |
14945 pinsrb m1, [r3 + 13], 0 | |
14946 movu [r0 + 1031 * 16], m1 | |
14947 | |
14948 ;mode 18[row 4] | |
14949 pslldq m0, 1 | |
14950 pinsrb m0, [r4 + 4], 0 | |
14951 movu [r0 + 1032 * 16], m0 | |
14952 pslldq m1, 1 | |
14953 pinsrb m1, [r3 + 12], 0 | |
14954 movu [r0 + 1033 * 16], m1 | |
14955 | |
14956 ;mode 18[row 5] | |
14957 pslldq m0, 1 | |
14958 pinsrb m0, [r4 + 5], 0 | |
14959 movu [r0 + 1034 * 16], m0 | |
14960 pslldq m1, 1 | |
14961 pinsrb m1, [r3 + 11], 0 | |
14962 movu [r0 + 1035 * 16], m1 | |
14963 | |
14964 ;mode 18[row 6] | |
14965 pslldq m0, 1 | |
14966 pinsrb m0, [r4 + 6], 0 | |
14967 movu [r0 + 1036 * 16], m0 | |
14968 pslldq m1, 1 | |
14969 pinsrb m1, [r3 + 10], 0 | |
14970 movu [r0 + 1037 * 16], m1 | |
14971 | |
14972 ;mode 18[row 7] | |
14973 pslldq m0, 1 | |
14974 pinsrb m0, [r4 + 7], 0 | |
14975 movu [r0 + 1038 * 16], m0 | |
14976 pslldq m1, 1 | |
14977 pinsrb m1, [r3 + 9], 0 | |
14978 movu [r0 + 1039 * 16], m1 | |
14979 | |
14980 ;mode 18[row 8] | |
14981 pslldq m0, 1 | |
14982 pinsrb m0, [r4 + 8], 0 | |
14983 movu [r0 + 1040 * 16], m0 | |
14984 pslldq m1, 1 | |
14985 pinsrb m1, [r3 + 8], 0 | |
14986 movu [r0 + 1041 * 16], m1 | |
14987 | |
14988 ;mode 18[row 9] | |
14989 pslldq m0, 1 | |
14990 pinsrb m0, [r4 + 9], 0 | |
14991 movu [r0 + 1042 * 16], m0 | |
14992 pslldq m1, 1 | |
14993 pinsrb m1, [r3 + 7], 0 | |
14994 movu [r0 + 1043 * 16], m1 | |
14995 | |
14996 ;mode 18[row 10] | |
14997 pslldq m0, 1 | |
14998 pinsrb m0, [r4 + 10], 0 | |
14999 movu [r0 + 1044 * 16], m0 | |
15000 pslldq m1, 1 | |
15001 pinsrb m1, [r3 + 6], 0 | |
15002 movu [r0 + 1045 * 16], m1 | |
15003 | |
15004 ;mode 18[row 11] | |
15005 pslldq m0, 1 | |
15006 pinsrb m0, [r4 + 11], 0 | |
15007 movu [r0 + 1046 * 16], m0 | |
15008 pslldq m1, 1 | |
15009 pinsrb m1, [r3 + 5], 0 | |
15010 movu [r0 + 1047 * 16], m1 | |
15011 | |
15012 ;mode 18[row 12] | |
15013 pslldq m0, 1 | |
15014 pinsrb m0, [r4 + 12], 0 | |
15015 movu [r0 + 1048 * 16], m0 | |
15016 pslldq m1, 1 | |
15017 pinsrb m1, [r3 + 4], 0 | |
15018 movu [r0 + 1049 * 16], m1 | |
15019 | |
15020 ;mode 18[row 13] | |
15021 pslldq m0, 1 | |
15022 pinsrb m0, [r4 + 13], 0 | |
15023 movu [r0 + 1050 * 16], m0 | |
15024 pslldq m1, 1 | |
15025 pinsrb m1, [r3 + 3], 0 | |
15026 movu [r0 + 1051 * 16], m1 | |
15027 | |
15028 ;mode 18[row 14] | |
15029 pslldq m0, 1 | |
15030 pinsrb m0, [r4 + 14], 0 | |
15031 movu [r0 + 1052 * 16], m0 | |
15032 pslldq m1, 1 | |
15033 pinsrb m1, [r3 + 2], 0 | |
15034 movu [r0 + 1053 * 16], m1 | |
15035 | |
15036 ;mode 18[row 15] | |
15037 pslldq m0, 1 | |
15038 pinsrb m0, [r4 + 15], 0 | |
15039 movu [r0 + 1054 * 16], m0 | |
15040 pslldq m1, 1 | |
15041 pinsrb m1, [r3 + 1], 0 | |
15042 movu [r0 + 1055 * 16], m1 | |
15043 | |
15044 ;mode 18[row 16] | |
15045 pslldq m0, 1 | |
15046 pinsrb m0, [r4 + 16], 0 | |
15047 movu [r0 + 1056 * 16], m0 | |
15048 pslldq m1, 1 | |
15049 pinsrb m1, [r3 + 0], 0 | |
15050 movu [r0 + 1057 * 16], m1 | |
15051 | |
15052 ;mode 18[row 17] | |
15053 pslldq m0, 1 | |
15054 pinsrb m0, [r4 + 17], 0 | |
15055 movu [r0 + 1058 * 16], m0 | |
15056 pslldq m1, 1 | |
15057 pinsrb m1, [r4 + 1], 0 | |
15058 movu [r0 + 1059 * 16], m1 | |
15059 | |
15060 ;mode 18[row 18] | |
15061 pslldq m0, 1 | |
15062 pinsrb m0, [r4 + 18], 0 | |
15063 movu [r0 + 1060 * 16], m0 | |
15064 pslldq m1, 1 | |
15065 pinsrb m1, [r4 + 2], 0 | |
15066 movu [r0 + 1061 * 16], m1 | |
15067 | |
15068 ;mode 18[row 19] | |
15069 pslldq m0, 1 | |
15070 pinsrb m0, [r4 + 19], 0 | |
15071 movu [r0 + 1062 * 16], m0 | |
15072 pslldq m1, 1 | |
15073 pinsrb m1, [r4 + 3], 0 | |
15074 movu [r0 + 1063 * 16], m1 | |
15075 | |
15076 ;mode 18[row 20] | |
15077 pslldq m0, 1 | |
15078 pinsrb m0, [r4 + 20], 0 | |
15079 movu [r0 + 1064 * 16], m0 | |
15080 pslldq m1, 1 | |
15081 pinsrb m1, [r4 + 4], 0 | |
15082 movu [r0 + 1065 * 16], m1 | |
15083 | |
15084 ;mode 18[row 21] | |
15085 pslldq m0, 1 | |
15086 pinsrb m0, [r4 + 21], 0 | |
15087 movu [r0 + 1066 * 16], m0 | |
15088 pslldq m1, 1 | |
15089 pinsrb m1, [r4 + 5], 0 | |
15090 movu [r0 + 1067 * 16], m1 | |
15091 | |
15092 ;mode 18[row 22] | |
15093 pslldq m0, 1 | |
15094 pinsrb m0, [r4 + 22], 0 | |
15095 movu [r0 + 1068 * 16], m0 | |
15096 pslldq m1, 1 | |
15097 pinsrb m1, [r4 + 6], 0 | |
15098 movu [r0 + 1069 * 16], m1 | |
15099 | |
15100 ;mode 18[row 23] | |
15101 pslldq m0, 1 | |
15102 pinsrb m0, [r4 + 23], 0 | |
15103 movu [r0 + 1070 * 16], m0 | |
15104 pslldq m1, 1 | |
15105 pinsrb m1, [r4 + 7], 0 | |
15106 movu [r0 + 1071 * 16], m1 | |
15107 | |
15108 ;mode 18[row 24] | |
15109 pslldq m0, 1 | |
15110 pinsrb m0, [r4 + 24], 0 | |
15111 movu [r0 + 1072 * 16], m0 | |
15112 pslldq m1, 1 | |
15113 pinsrb m1, [r4 + 8], 0 | |
15114 movu [r0 + 1073 * 16], m1 | |
15115 | |
15116 ;mode 18[row 25] | |
15117 pslldq m0, 1 | |
15118 pinsrb m0, [r4 + 25], 0 | |
15119 movu [r0 + 1074 * 16], m0 | |
15120 pslldq m1, 1 | |
15121 pinsrb m1, [r4 + 9], 0 | |
15122 movu [r0 + 1075 * 16], m1 | |
15123 | |
15124 ;mode 18[row 26] | |
15125 pslldq m0, 1 | |
15126 pinsrb m0, [r4 + 26], 0 | |
15127 movu [r0 + 1076 * 16], m0 | |
15128 pslldq m1, 1 | |
15129 pinsrb m1, [r4 + 10], 0 | |
15130 movu [r0 + 1077 * 16], m1 | |
15131 | |
15132 ;mode 18[row 27] | |
15133 pslldq m0, 1 | |
15134 pinsrb m0, [r4 + 27], 0 | |
15135 movu [r0 + 1078 * 16], m0 | |
15136 pslldq m1, 1 | |
15137 pinsrb m1, [r4 + 11], 0 | |
15138 movu [r0 + 1079 * 16], m1 | |
15139 | |
15140 ;mode 18[row 28] | |
15141 pslldq m0, 1 | |
15142 pinsrb m0, [r4 + 28], 0 | |
15143 movu [r0 + 1080 * 16], m0 | |
15144 pslldq m1, 1 | |
15145 pinsrb m1, [r4 + 12], 0 | |
15146 movu [r0 + 1081 * 16], m1 | |
15147 | |
15148 ;mode 18[row 29] | |
15149 pslldq m0, 1 | |
15150 pinsrb m0, [r4 + 29], 0 | |
15151 movu [r0 + 1082 * 16], m0 | |
15152 pslldq m1, 1 | |
15153 pinsrb m1, [r4 + 13], 0 | |
15154 movu [r0 + 1083 * 16], m1 | |
15155 | |
15156 ;mode 18[row 30] | |
15157 pslldq m0, 1 | |
15158 pinsrb m0, [r4 + 30], 0 | |
15159 movu [r0 + 1084 * 16], m0 | |
15160 pslldq m1, 1 | |
15161 pinsrb m1, [r4 + 14], 0 | |
15162 movu [r0 + 1085 * 16], m1 | |
15163 | |
15164 ;mode 18[row 31] | |
15165 pslldq m0, 1 | |
15166 pinsrb m0, [r4 + 31], 0 | |
15167 movu [r0 + 1086 * 16], m0 | |
15168 pslldq m1, 1 | |
15169 pinsrb m1, [r4 + 15], 0 | |
15170 movu [r0 + 1087 * 16], m1 | |
15171 | |
15172 ; mode 19 [row 0] | |
15173 movu m6, [r5 + 6 * 16] | |
15174 movu m0, [r3 ] | |
15175 movu m1, [r3 + 1 ] | |
15176 punpcklbw m0, m1 | |
15177 pmaddubsw m1, m0, m6 | |
15178 pmulhrsw m1, m7 | |
15179 movu m2, [r3 + 8] | |
15180 movu m3, [r3 + 9] | |
15181 punpcklbw m2, m3 | |
15182 pmaddubsw m3, m2, m6 | |
15183 pmulhrsw m3, m7 | |
15184 packuswb m1, m3 | |
15185 movu [r0 + 1088 * 16], m1 | |
15186 | |
15187 movu m1, [r3 + 16] | |
15188 movu m3, [r3 + 17] | |
15189 punpcklbw m1, m3 | |
15190 pmaddubsw m4, m1, m6 | |
15191 pmulhrsw m4, m7 | |
15192 movu m3, [r3 + 24] | |
15193 movu m5, [r3 + 25] | |
15194 punpcklbw m3, m5 | |
15195 pmaddubsw m5, m3, m6 | |
15196 pmulhrsw m5, m7 | |
15197 packuswb m4, m5 | |
15198 movu [r0 + 1089 * 16], m4 | |
15199 | |
15200 ; mode 19 [row 1] | |
15201 movu m6, [r5 + 12 * 16] | |
15202 pslldq m0, 2 | |
15203 pinsrb m0, [r4 + 0], 1 | |
15204 pinsrb m0, [r4 + 1], 0 | |
15205 pmaddubsw m4, m0, m6 | |
15206 pmulhrsw m4, m7 | |
15207 pslldq m2, 2 | |
15208 pinsrw m2, [r3 + 7], 0 | |
15209 pmaddubsw m5, m2, m6 | |
15210 pmulhrsw m5, m7 | |
15211 packuswb m4, m5 | |
15212 movu [r0 + 1090 * 16], m4 | |
15213 pslldq m1, 2 | |
15214 pinsrw m1, [r3 + 15], 0 | |
15215 pmaddubsw m4, m1, m6 | |
15216 pmulhrsw m4, m7 | |
15217 pslldq m3, 2 | |
15218 pinsrw m3, [r3 + 23], 0 | |
15219 pmaddubsw m5, m3, m6 | |
15220 pmulhrsw m5, m7 | |
15221 packuswb m4, m5 | |
15222 movu [r0 + 1091 * 16], m4 | |
15223 | |
15224 ; mode 19 [row 2] | |
15225 movu m6, [r5 + 18 * 16] | |
15226 pslldq m0, 2 | |
15227 pinsrb m0, [r4 + 1], 1 | |
15228 pinsrb m0, [r4 + 2], 0 | |
15229 pmaddubsw m4, m0, m6 | |
15230 pmulhrsw m4, m7 | |
15231 pslldq m2, 2 | |
15232 pinsrw m2, [r3 + 6], 0 | |
15233 pmaddubsw m5, m2, m6 | |
15234 pmulhrsw m5, m7 | |
15235 packuswb m4, m5 | |
15236 movu [r0 + 1092 * 16], m4 | |
15237 pslldq m1, 2 | |
15238 pinsrw m1, [r3 + 14], 0 | |
15239 pmaddubsw m4, m1, m6 | |
15240 pmulhrsw m4, m7 | |
15241 pslldq m3, 2 | |
15242 pinsrw m3, [r3 + 22], 0 | |
15243 pmaddubsw m5, m3, m6 | |
15244 pmulhrsw m5, m7 | |
15245 packuswb m4, m5 | |
15246 movu [r0 + 1093 * 16], m4 | |
15247 | |
15248 ; mode 19 [row 3] | |
15249 movu m6, [r5 + 24 * 16] | |
15250 pslldq m0, 2 | |
15251 pinsrb m0, [r4 + 2], 1 | |
15252 pinsrb m0, [r4 + 4], 0 | |
15253 pmaddubsw m4, m0, m6 | |
15254 pmulhrsw m4, m7 | |
15255 pslldq m2, 2 | |
15256 pinsrw m2, [r3 + 5], 0 | |
15257 pmaddubsw m5, m2, m6 | |
15258 pmulhrsw m5, m7 | |
15259 packuswb m4, m5 | |
15260 movu [r0 + 1094 * 16], m4 | |
15261 pslldq m1, 2 | |
15262 pinsrw m1, [r3 + 13], 0 | |
15263 pmaddubsw m4, m1, m6 | |
15264 pmulhrsw m4, m7 | |
15265 pslldq m3, 2 | |
15266 pinsrw m3, [r3 + 21], 0 | |
15267 pmaddubsw m5, m3, m6 | |
15268 pmulhrsw m5, m7 | |
15269 packuswb m4, m5 | |
15270 movu [r0 + 1095 * 16], m4 | |
15271 | |
15272 ; mode 19 [row 4] | |
15273 movu m6, [r5 + 30 * 16] | |
15274 pslldq m0, 2 | |
15275 pinsrb m0, [r4 + 4], 1 | |
15276 pinsrb m0, [r4 + 5], 0 | |
15277 pmaddubsw m4, m0, m6 | |
15278 pmulhrsw m4, m7 | |
15279 pslldq m2, 2 | |
15280 pinsrw m2, [r3 + 4], 0 | |
15281 pmaddubsw m5, m2, m6 | |
15282 pmulhrsw m5, m7 | |
15283 packuswb m4, m5 | |
15284 movu [r0 + 1096 * 16], m4 | |
15285 pslldq m1, 2 | |
15286 pinsrw m1, [r3 + 12], 0 | |
15287 pmaddubsw m4, m1, m6 | |
15288 pmulhrsw m4, m7 | |
15289 pslldq m3, 2 | |
15290 pinsrw m3, [r3 + 20], 0 | |
15291 pmaddubsw m5, m3, m6 | |
15292 pmulhrsw m5, m7 | |
15293 packuswb m4, m5 | |
15294 movu [r0 + 1097 * 16], m4 | |
15295 | |
15296 ; mode 19 [row 5] | |
15297 movu m6, [r5 + 4 * 16] | |
15298 pmaddubsw m4, m0, m6 | |
15299 pmulhrsw m4, m7 | |
15300 pmaddubsw m5, m2, m6 | |
15301 pmulhrsw m5, m7 | |
15302 packuswb m4, m5 | |
15303 movu [r0 + 1098 * 16], m4 | |
15304 pmaddubsw m4, m1, m6 | |
15305 pmulhrsw m4, m7 | |
15306 pmaddubsw m5, m3, m6 | |
15307 pmulhrsw m5, m7 | |
15308 packuswb m4, m5 | |
15309 movu [r0 + 1099 * 16], m4 | |
15310 | |
15311 ; mode 19 [row 6] | |
15312 movu m6, [r5 + 10 * 16] | |
15313 pslldq m0, 2 | |
15314 pinsrb m0, [r4 + 5], 1 | |
15315 pinsrb m0, [r4 + 6], 0 | |
15316 pmaddubsw m4, m0, m6 | |
15317 pmulhrsw m4, m7 | |
15318 pslldq m2, 2 | |
15319 pinsrw m2, [r3 + 3], 0 | |
15320 pmaddubsw m5, m2, m6 | |
15321 pmulhrsw m5, m7 | |
15322 packuswb m4, m5 | |
15323 movu [r0 + 1100 * 16], m4 | |
15324 pslldq m1, 2 | |
15325 pinsrw m1, [r3 + 11], 0 | |
15326 pmaddubsw m4, m1, m6 | |
15327 pmulhrsw m4, m7 | |
15328 pslldq m3, 2 | |
15329 pinsrw m3, [r3 + 19], 0 | |
15330 pmaddubsw m5, m3, m6 | |
15331 pmulhrsw m5, m7 | |
15332 packuswb m4, m5 | |
15333 movu [r0 + 1101 * 16], m4 | |
15334 | |
15335 ; mode 19 [row 7] | |
15336 movu m6, [r5 + 16 * 16] | |
15337 pslldq m0, 2 | |
15338 pinsrb m0, [r4 + 6], 1 | |
15339 pinsrb m0, [r4 + 7], 0 | |
15340 pmaddubsw m4, m0, m6 | |
15341 pmulhrsw m4, m7 | |
15342 pslldq m2, 2 | |
15343 pinsrw m2, [r3 + 2], 0 | |
15344 pmaddubsw m5, m2, m6 | |
15345 pmulhrsw m5, m7 | |
15346 packuswb m4, m5 | |
15347 movu [r0 + 1102 * 16], m4 | |
15348 pslldq m1, 2 | |
15349 pinsrw m1, [r3 + 10], 0 | |
15350 pmaddubsw m4, m1, m6 | |
15351 pmulhrsw m4, m7 | |
15352 pslldq m3, 2 | |
15353 pinsrw m3, [r3 + 18], 0 | |
15354 pmaddubsw m5, m3, m6 | |
15355 pmulhrsw m5, m7 | |
15356 packuswb m4, m5 | |
15357 movu [r0 + 1103 * 16], m4 | |
15358 | |
15359 ; mode 19 [row 8] | |
15360 movu m6, [r5 + 22 * 16] | |
15361 pslldq m0, 2 | |
15362 pinsrb m0, [r4 + 7], 1 | |
15363 pinsrb m0, [r4 + 9], 0 | |
15364 pmaddubsw m4, m0, m6 | |
15365 pmulhrsw m4, m7 | |
15366 pslldq m2, 2 | |
15367 pinsrw m2, [r3 + 1], 0 | |
15368 pmaddubsw m5, m2, m6 | |
15369 pmulhrsw m5, m7 | |
15370 packuswb m4, m5 | |
15371 movu [r0 + 1104 * 16], m4 | |
15372 pslldq m1, 2 | |
15373 pinsrw m1, [r3 + 9], 0 | |
15374 pmaddubsw m4, m1, m6 | |
15375 pmulhrsw m4, m7 | |
15376 pslldq m3, 2 | |
15377 pinsrw m3, [r3 + 17], 0 | |
15378 pmaddubsw m5, m3, m6 | |
15379 pmulhrsw m5, m7 | |
15380 packuswb m4, m5 | |
15381 movu [r0 + 1105 * 16], m4 | |
15382 | |
15383 ; mode 19 [row 9] | |
15384 movu m6, [r5 + 28 * 16] | |
15385 pslldq m0, 2 | |
15386 pinsrb m0, [r4 + 9], 1 | |
15387 pinsrb m0, [r4 + 10], 0 | |
15388 pmaddubsw m4, m0, m6 | |
15389 pmulhrsw m4, m7 | |
15390 pslldq m2, 2 | |
15391 pinsrw m2, [r3 + 0], 0 | |
15392 pmaddubsw m5, m2, m6 | |
15393 pmulhrsw m5, m7 | |
15394 packuswb m4, m5 | |
15395 movu [r0 + 1106 * 16], m4 | |
15396 pslldq m1, 2 | |
15397 pinsrw m1, [r3 + 8], 0 | |
15398 pmaddubsw m4, m1, m6 | |
15399 pmulhrsw m4, m7 | |
15400 pslldq m3, 2 | |
15401 pinsrw m3, [r3 + 16], 0 | |
15402 pmaddubsw m5, m3, m6 | |
15403 pmulhrsw m5, m7 | |
15404 packuswb m4, m5 | |
15405 movu [r0 + 1107 * 16], m4 | |
15406 | |
15407 ; mode 19 [row 10] | |
15408 movu m6, [r5 + 2 * 16] | |
15409 pmaddubsw m4, m0, m6 | |
15410 pmulhrsw m4, m7 | |
15411 pmaddubsw m5, m2, m6 | |
15412 pmulhrsw m5, m7 | |
15413 packuswb m4, m5 | |
15414 movu [r0 + 1108 * 16], m4 | |
15415 pmaddubsw m4, m1, m6 | |
15416 pmulhrsw m4, m7 | |
15417 pmaddubsw m5, m3, m6 | |
15418 pmulhrsw m5, m7 | |
15419 packuswb m4, m5 | |
15420 movu [r0 + 1109 * 16], m4 | |
15421 | |
15422 ; mode 19 [row 11] | |
15423 movu m6, [r5 + 8 * 16] | |
15424 pslldq m0, 2 | |
15425 pinsrb m0, [r4 + 10], 1 | |
15426 pinsrb m0, [r4 + 11], 0 | |
15427 pmaddubsw m4, m0, m6 | |
15428 pmulhrsw m4, m7 | |
15429 pslldq m2, 2 | |
15430 pinsrb m2, [r3 + 0], 1 | |
15431 pinsrb m2, [r4 + 1], 0 | |
15432 pmaddubsw m5, m2, m6 | |
15433 pmulhrsw m5, m7 | |
15434 packuswb m4, m5 | |
15435 movu [r0 + 1110 * 16], m4 | |
15436 pslldq m1, 2 | |
15437 pinsrw m1, [r3 + 7], 0 | |
15438 pmaddubsw m4, m1, m6 | |
15439 pmulhrsw m4, m7 | |
15440 pslldq m3, 2 | |
15441 pinsrw m3, [r3 + 15], 0 | |
15442 pmaddubsw m5, m3, m6 | |
15443 pmulhrsw m5, m7 | |
15444 packuswb m4, m5 | |
15445 movu [r0 + 1111 * 16], m4 | |
15446 | |
15447 ; mode 19 [row 12] | |
15448 movu m6, [r5 + 14 * 16] | |
15449 pslldq m0, 2 | |
15450 pinsrb m0, [r4 + 11], 1 | |
15451 pinsrb m0, [r4 + 12], 0 | |
15452 pmaddubsw m4, m0, m6 | |
15453 pmulhrsw m4, m7 | |
15454 pslldq m2, 2 | |
15455 pinsrb m2, [r4 + 1], 1 | |
15456 pinsrb m2, [r4 + 2], 0 | |
15457 pmaddubsw m5, m2, m6 | |
15458 pmulhrsw m5, m7 | |
15459 packuswb m4, m5 | |
15460 movu [r0 + 1112 * 16], m4 | |
15461 pslldq m1, 2 | |
15462 pinsrw m1, [r3 + 6], 0 | |
15463 pmaddubsw m4, m1, m6 | |
15464 pmulhrsw m4, m7 | |
15465 pslldq m3, 2 | |
15466 pinsrw m3, [r3 + 14], 0 | |
15467 pmaddubsw m5, m3, m6 | |
15468 pmulhrsw m5, m7 | |
15469 packuswb m4, m5 | |
15470 movu [r0 + 1113 * 16], m4 | |
15471 | |
15472 ; mode 19 [row 13] | |
15473 movu m6, [r5 + 20 * 16] | |
15474 pslldq m0, 2 | |
15475 pinsrb m0, [r4 + 12], 1 | |
15476 pinsrb m0, [r4 + 14], 0 | |
15477 pmaddubsw m4, m0, m6 | |
15478 pmulhrsw m4, m7 | |
15479 pslldq m2, 2 | |
15480 pinsrb m2, [r4 + 2], 1 | |
15481 pinsrb m2, [r4 + 4], 0 | |
15482 pmaddubsw m5, m2, m6 | |
15483 pmulhrsw m5, m7 | |
15484 packuswb m4, m5 | |
15485 movu [r0 + 1114 * 16], m4 | |
15486 pslldq m1, 2 | |
15487 pinsrw m1, [r3 + 5], 0 | |
15488 pmaddubsw m4, m1, m6 | |
15489 pmulhrsw m4, m7 | |
15490 pslldq m3, 2 | |
15491 pinsrw m3, [r3 + 13], 0 | |
15492 pmaddubsw m5, m3, m6 | |
15493 pmulhrsw m5, m7 | |
15494 packuswb m4, m5 | |
15495 movu [r0 + 1115 * 16], m4 | |
15496 | |
15497 ; mode 19 [row 14] | |
15498 movu m6, [r5 + 26 * 16] | |
15499 pslldq m0, 2 | |
15500 pinsrb m0, [r4 + 14], 1 | |
15501 pinsrb m0, [r4 + 15], 0 | |
15502 pmaddubsw m4, m0, m6 | |
15503 pmulhrsw m4, m7 | |
15504 pslldq m2, 2 | |
15505 pinsrb m2, [r4 + 4], 1 | |
15506 pinsrb m2, [r4 + 5], 0 | |
15507 pmaddubsw m5, m2, m6 | |
15508 pmulhrsw m5, m7 | |
15509 packuswb m4, m5 | |
15510 movu [r0 + 1116 * 16], m4 | |
15511 pslldq m1, 2 | |
15512 pinsrw m1, [r3 + 4], 0 | |
15513 pmaddubsw m4, m1, m6 | |
15514 pmulhrsw m4, m7 | |
15515 pslldq m3, 2 | |
15516 pinsrw m3, [r3 + 12], 0 | |
15517 pmaddubsw m5, m3, m6 | |
15518 pmulhrsw m5, m7 | |
15519 packuswb m4, m5 | |
15520 movu [r0 + 1117 * 16], m4 | |
15521 | |
15522 ; mode 19 [row 15] | |
15523 pshufb m5, m0, [tab_S2] | |
15524 movh [r0 + 1118 * 16], m5 | |
15525 pshufb m5, m2, [tab_S2] | |
15526 movh [r0 + 1118 * 16 + 8], m5 | |
15527 pshufb m5, m1, [tab_S2] | |
15528 movh [r0 + 1119 * 16], m5 | |
15529 pshufb m5, m3, [tab_S2] | |
15530 movh [r0 + 1119 * 16 + 8], m5 | |
15531 | |
15532 ; mode 19 [row 16] | |
15533 movu m6, [r5 + 6 * 16] | |
15534 pslldq m0, 2 | |
15535 pinsrb m0, [r4 + 15], 1 | |
15536 pinsrb m0, [r4 + 16], 0 | |
15537 pmaddubsw m4, m0, m6 | |
15538 pmulhrsw m4, m7 | |
15539 pslldq m2, 2 | |
15540 pinsrb m2, [r4 + 5], 1 | |
15541 pinsrb m2, [r4 + 6], 0 | |
15542 pmaddubsw m5, m2, m6 | |
15543 pmulhrsw m5, m7 | |
15544 packuswb m4, m5 | |
15545 movu [r0 + 1120 * 16], m4 | |
15546 pslldq m1, 2 | |
15547 pinsrw m1, [r3 + 3], 0 | |
15548 pmaddubsw m4, m1, m6 | |
15549 pmulhrsw m4, m7 | |
15550 pslldq m3, 2 | |
15551 pinsrw m3, [r3 + 11], 0 | |
15552 pmaddubsw m5, m3, m6 | |
15553 pmulhrsw m5, m7 | |
15554 packuswb m4, m5 | |
15555 movu [r0 + 1121 * 16], m4 | |
15556 | |
15557 ; mode 19 [row 17] | |
15558 movu m6, [r5 + 12 * 16] | |
15559 pslldq m0, 2 | |
15560 pinsrb m0, [r4 + 16], 1 | |
15561 pinsrb m0, [r4 + 17], 0 | |
15562 pmaddubsw m4, m0, m6 | |
15563 pmulhrsw m4, m7 | |
15564 pslldq m2, 2 | |
15565 pinsrb m2, [r4 + 6], 1 | |
15566 pinsrb m2, [r4 + 7], 0 | |
15567 pmaddubsw m5, m2, m6 | |
15568 pmulhrsw m5, m7 | |
15569 packuswb m4, m5 | |
15570 movu [r0 + 1122 * 16], m4 | |
15571 pslldq m1, 2 | |
15572 pinsrw m1, [r3 + 2], 0 | |
15573 pmaddubsw m4, m1, m6 | |
15574 pmulhrsw m4, m7 | |
15575 pslldq m3, 2 | |
15576 pinsrw m3, [r3 + 10], 0 | |
15577 pmaddubsw m5, m3, m6 | |
15578 pmulhrsw m5, m7 | |
15579 packuswb m4, m5 | |
15580 movu [r0 + 1123 * 16], m4 | |
15581 | |
15582 ; mode 19 [row 18] | |
15583 movu m6, [r5 + 18 * 16] | |
15584 pslldq m0, 2 | |
15585 pinsrb m0, [r4 + 17], 1 | |
15586 pinsrb m0, [r4 + 18], 0 | |
15587 pmaddubsw m4, m0, m6 | |
15588 pmulhrsw m4, m7 | |
15589 pslldq m2, 2 | |
15590 pinsrb m2, [r4 + 7], 1 | |
15591 pinsrb m2, [r4 + 9], 0 | |
15592 pmaddubsw m5, m2, m6 | |
15593 pmulhrsw m5, m7 | |
15594 packuswb m4, m5 | |
15595 movu [r0 + 1124 * 16], m4 | |
15596 pslldq m1, 2 | |
15597 pinsrw m1, [r3 + 1], 0 | |
15598 pmaddubsw m4, m1, m6 | |
15599 pmulhrsw m4, m7 | |
15600 pslldq m3, 2 | |
15601 pinsrw m3, [r3 + 9], 0 | |
15602 pmaddubsw m5, m3, m6 | |
15603 pmulhrsw m5, m7 | |
15604 packuswb m4, m5 | |
15605 movu [r0 + 1125 * 16], m4 | |
15606 | |
15607 ; mode 19 [row 19] | |
15608 movu m6, [r5 + 24 * 16] | |
15609 pslldq m0, 2 | |
15610 pinsrb m0, [r4 + 18], 1 | |
15611 pinsrb m0, [r4 + 20], 0 | |
15612 pmaddubsw m4, m0, m6 | |
15613 pmulhrsw m4, m7 | |
15614 pslldq m2, 2 | |
15615 pinsrb m2, [r4 + 9], 1 | |
15616 pinsrb m2, [r4 + 10], 0 | |
15617 pmaddubsw m5, m2, m6 | |
15618 pmulhrsw m5, m7 | |
15619 packuswb m4, m5 | |
15620 movu [r0 + 1126 * 16], m4 | |
15621 pslldq m1, 2 | |
15622 pinsrw m1, [r3 + 0], 0 | |
15623 pmaddubsw m4, m1, m6 | |
15624 pmulhrsw m4, m7 | |
15625 pslldq m3, 2 | |
15626 pinsrw m3, [r3 + 8], 0 | |
15627 pmaddubsw m5, m3, m6 | |
15628 pmulhrsw m5, m7 | |
15629 packuswb m4, m5 | |
15630 movu [r0 + 1127 * 16], m4 | |
15631 | |
15632 ; mode 19 [row 20] | |
15633 movu m6, [r5 + 30 * 16] | |
15634 pslldq m0, 2 | |
15635 pinsrb m0, [r4 + 20], 1 | |
15636 pinsrb m0, [r4 + 21], 0 | |
15637 pmaddubsw m4, m0, m6 | |
15638 pmulhrsw m4, m7 | |
15639 pslldq m2, 2 | |
15640 pinsrb m2, [r4 + 10], 1 | |
15641 pinsrb m2, [r4 + 11], 0 | |
15642 pmaddubsw m5, m2, m6 | |
15643 pmulhrsw m5, m7 | |
15644 packuswb m4, m5 | |
15645 movu [r0 + 1128 * 16], m4 | |
15646 pslldq m1, 2 | |
15647 pinsrb m1, [r4 + 0], 1 | |
15648 pinsrb m1, [r4 + 1], 0 | |
15649 pmaddubsw m4, m1, m6 | |
15650 pmulhrsw m4, m7 | |
15651 pslldq m3, 2 | |
15652 pinsrb m3, [r3 + 8], 1 | |
15653 pinsrb m3, [r3 + 7], 0 | |
15654 pmaddubsw m5, m3, m6 | |
15655 pmulhrsw m5, m7 | |
15656 packuswb m4, m5 | |
15657 movu [r0 + 1129 * 16], m4 | |
15658 | |
15659 ; mode 19 [row 21] | |
15660 movu m6, [r5 + 4 * 16] | |
15661 pmaddubsw m4, m0, m6 | |
15662 pmulhrsw m4, m7 | |
15663 pmaddubsw m5, m2, m6 | |
15664 pmulhrsw m5, m7 | |
15665 packuswb m4, m5 | |
15666 movu [r0 + 1130 * 16], m4 | |
15667 pmaddubsw m4, m1, m6 | |
15668 pmulhrsw m4, m7 | |
15669 pmaddubsw m5, m3, m6 | |
15670 pmulhrsw m5, m7 | |
15671 packuswb m4, m5 | |
15672 movu [r0 + 1131 * 16], m4 | |
15673 | |
15674 ; mode 19 [row 22] | |
15675 movu m6, [r5 + 10 * 16] | |
15676 pslldq m0, 2 | |
15677 pinsrb m0, [r4 + 21], 1 | |
15678 pinsrb m0, [r4 + 22], 0 | |
15679 pmaddubsw m4, m0, m6 | |
15680 pmulhrsw m4, m7 | |
15681 pslldq m2, 2 | |
15682 pinsrb m2, [r4 + 11], 1 | |
15683 pinsrb m2, [r4 + 12], 0 | |
15684 pmaddubsw m5, m2, m6 | |
15685 pmulhrsw m5, m7 | |
15686 packuswb m4, m5 | |
15687 movu [r0 + 1132 * 16], m4 | |
15688 pslldq m1, 2 | |
15689 pinsrb m1, [r4 + 1], 1 | |
15690 pinsrb m1, [r4 + 2], 0 | |
15691 pmaddubsw m4, m1, m6 | |
15692 pmulhrsw m4, m7 | |
15693 pslldq m3, 2 | |
15694 pinsrw m3, [r3 + 6], 0 | |
15695 pmaddubsw m5, m3, m6 | |
15696 pmulhrsw m5, m7 | |
15697 packuswb m4, m5 | |
15698 movu [r0 + 1133 * 16], m4 | |
15699 | |
15700 ; mode 19 [row 23] | |
15701 movu m6, [r5 + 16 * 16] | |
15702 pslldq m0, 2 | |
15703 pinsrb m0, [r4 + 22], 1 | |
15704 pinsrb m0, [r4 + 23], 0 | |
15705 pmaddubsw m4, m0, m6 | |
15706 pmulhrsw m4, m7 | |
15707 pslldq m2, 2 | |
15708 pinsrb m2, [r4 + 12], 1 | |
15709 pinsrb m2, [r4 + 14], 0 | |
15710 pmaddubsw m5, m2, m6 | |
15711 pmulhrsw m5, m7 | |
15712 packuswb m4, m5 | |
15713 movu [r0 + 1134 * 16], m4 | |
15714 pslldq m1, 2 | |
15715 pinsrb m1, [r4 + 2], 1 | |
15716 pinsrb m1, [r4 + 4], 0 | |
15717 pmaddubsw m4, m1, m6 | |
15718 pmulhrsw m4, m7 | |
15719 pslldq m3, 2 | |
15720 pinsrw m3, [r3 + 5], 0 | |
15721 pmaddubsw m5, m3, m6 | |
15722 pmulhrsw m5, m7 | |
15723 packuswb m4, m5 | |
15724 movu [r0 + 1135 * 16], m4 | |
15725 | |
15726 ; mode 19 [row 24] | |
15727 movu m6, [r5 + 22 * 16] | |
15728 pslldq m0, 2 | |
15729 pinsrb m0, [r4 + 23], 1 | |
15730 pinsrb m0, [r4 + 25], 0 | |
15731 pmaddubsw m4, m0, m6 | |
15732 pmulhrsw m4, m7 | |
15733 pslldq m2, 2 | |
15734 pinsrb m2, [r4 + 14], 1 | |
15735 pinsrb m2, [r4 + 15], 0 | |
15736 pmaddubsw m5, m2, m6 | |
15737 pmulhrsw m5, m7 | |
15738 packuswb m4, m5 | |
15739 movu [r0 + 1136 * 16], m4 | |
15740 pslldq m1, 2 | |
15741 pinsrb m1, [r4 + 4], 1 | |
15742 pinsrb m1, [r4 + 5], 0 | |
15743 pmaddubsw m4, m1, m6 | |
15744 pmulhrsw m4, m7 | |
15745 pslldq m3, 2 | |
15746 pinsrw m3, [r3 + 4], 0 | |
15747 pmaddubsw m5, m3, m6 | |
15748 pmulhrsw m5, m7 | |
15749 packuswb m4, m5 | |
15750 movu [r0 + 1137 * 16], m4 | |
15751 | |
15752 ; mode 19 [row 25] | |
15753 movu m6, [r5 + 28 * 16] | |
15754 pslldq m0, 2 | |
15755 pinsrb m0, [r4 + 25], 1 | |
15756 pinsrb m0, [r4 + 26], 0 | |
15757 pmaddubsw m4, m0, m6 | |
15758 pmulhrsw m4, m7 | |
15759 pslldq m2, 2 | |
15760 pinsrb m2, [r4 + 15], 1 | |
15761 pinsrb m2, [r4 + 16], 0 | |
15762 pmaddubsw m5, m2, m6 | |
15763 pmulhrsw m5, m7 | |
15764 packuswb m4, m5 | |
15765 movu [r0 + 1138 * 16], m4 | |
15766 pslldq m1, 2 | |
15767 pinsrb m1, [r4 + 5], 1 | |
15768 pinsrb m1, [r4 + 6], 0 | |
15769 pmaddubsw m4, m1, m6 | |
15770 pmulhrsw m4, m7 | |
15771 pslldq m3, 2 | |
15772 pinsrw m3, [r3 + 3], 0 | |
15773 pmaddubsw m5, m3, m6 | |
15774 pmulhrsw m5, m7 | |
15775 packuswb m4, m5 | |
15776 movu [r0 + 1139 * 16], m4 | |
15777 | |
15778 ; mode 19 [row 26] | |
15779 movu m6, [r5 + 2 * 16] | |
15780 pmaddubsw m4, m0, m6 | |
15781 pmulhrsw m4, m7 | |
15782 pmaddubsw m5, m2, m6 | |
15783 pmulhrsw m5, m7 | |
15784 packuswb m4, m5 | |
15785 movu [r0 + 1140 * 16], m4 | |
15786 pmaddubsw m4, m1, m6 | |
15787 pmulhrsw m4, m7 | |
15788 pmaddubsw m5, m3, m6 | |
15789 pmulhrsw m5, m7 | |
15790 packuswb m4, m5 | |
15791 movu [r0 + 1141 * 16], m4 | |
15792 | |
15793 ; mode 19 [row 27] | |
15794 movu m6, [r5 + 8 * 16] | |
15795 pslldq m0, 2 | |
15796 pinsrb m0, [r4 + 26], 1 | |
15797 pinsrb m0, [r4 + 27], 0 | |
15798 pmaddubsw m4, m0, m6 | |
15799 pmulhrsw m4, m7 | |
15800 pslldq m2, 2 | |
15801 pinsrb m2, [r4 + 16], 1 | |
15802 pinsrb m2, [r4 + 17], 0 | |
15803 pmaddubsw m5, m2, m6 | |
15804 pmulhrsw m5, m7 | |
15805 packuswb m4, m5 | |
15806 movu [r0 + 1142 * 16], m4 | |
15807 pslldq m1, 2 | |
15808 pinsrb m1, [r4 + 6], 1 | |
15809 pinsrb m1, [r4 + 7], 0 | |
15810 pmaddubsw m4, m1, m6 | |
15811 pmulhrsw m4, m7 | |
15812 pslldq m3, 2 | |
15813 pinsrw m3, [r3 + 2], 0 | |
15814 pmaddubsw m5, m3, m6 | |
15815 pmulhrsw m5, m7 | |
15816 packuswb m4, m5 | |
15817 movu [r0 + 1143 * 16], m4 | |
15818 | |
15819 ; mode 19 [row 28] | |
15820 movu m6, [r5 + 14 * 16] | |
15821 pslldq m0, 2 | |
15822 pinsrb m0, [r4 + 27], 1 | |
15823 pinsrb m0, [r4 + 28], 0 | |
15824 pmaddubsw m4, m0, m6 | |
15825 pmulhrsw m4, m7 | |
15826 pslldq m2, 2 | |
15827 pinsrb m2, [r4 + 17], 1 | |
15828 pinsrb m2, [r4 + 18], 0 | |
15829 pmaddubsw m5, m2, m6 | |
15830 pmulhrsw m5, m7 | |
15831 packuswb m4, m5 | |
15832 movu [r0 + 1144 * 16], m4 | |
15833 pslldq m1, 2 | |
15834 pinsrb m1, [r4 + 7], 1 | |
15835 pinsrb m1, [r4 + 9], 0 | |
15836 pmaddubsw m4, m1, m6 | |
15837 pmulhrsw m4, m7 | |
15838 pslldq m3, 2 | |
15839 pinsrw m3, [r3 + 1], 0 | |
15840 pmaddubsw m5, m3, m6 | |
15841 pmulhrsw m5, m7 | |
15842 packuswb m4, m5 | |
15843 movu [r0 + 1145 * 16], m4 | |
15844 | |
15845 ; mode 19 [row 29] | |
15846 movu m6, [r5 + 20 * 16] | |
15847 pslldq m0, 2 | |
15848 pinsrb m0, [r4 + 28], 1 | |
15849 pinsrb m0, [r4 + 30], 0 | |
15850 pmaddubsw m4, m0, m6 | |
15851 pmulhrsw m4, m7 | |
15852 pslldq m2, 2 | |
15853 pinsrb m2, [r4 + 18], 1 | |
15854 pinsrb m2, [r4 + 20], 0 | |
15855 pmaddubsw m5, m2, m6 | |
15856 pmulhrsw m5, m7 | |
15857 packuswb m4, m5 | |
15858 movu [r0 + 1146 * 16], m4 | |
15859 pslldq m1, 2 | |
15860 pinsrb m1, [r4 + 9], 1 | |
15861 pinsrb m1, [r4 + 10], 0 | |
15862 pmaddubsw m4, m1, m6 | |
15863 pmulhrsw m4, m7 | |
15864 pslldq m3, 2 | |
15865 pinsrw m3, [r3 + 0], 0 | |
15866 pmaddubsw m5, m3, m6 | |
15867 pmulhrsw m5, m7 | |
15868 packuswb m4, m5 | |
15869 movu [r0 + 1147 * 16], m4 | |
15870 | |
15871 ; mode 19 [row 30] | |
15872 movu m6, [r5 + 26 * 16] | |
15873 pslldq m0, 2 | |
15874 pinsrb m0, [r4 + 30], 1 | |
15875 pinsrb m0, [r4 + 31], 0 | |
15876 pmaddubsw m4, m0, m6 | |
15877 pmulhrsw m4, m7 | |
15878 pslldq m2, 2 | |
15879 pinsrb m2, [r4 + 20], 1 | |
15880 pinsrb m2, [r4 + 21], 0 | |
15881 pmaddubsw m5, m2, m6 | |
15882 pmulhrsw m5, m7 | |
15883 packuswb m4, m5 | |
15884 movu [r0 + 1148 * 16], m4 | |
15885 pslldq m1, 2 | |
15886 pinsrb m1, [r4 + 10], 1 | |
15887 pinsrb m1, [r4 + 11], 0 | |
15888 pmaddubsw m4, m1, m6 | |
15889 pmulhrsw m4, m7 | |
15890 pslldq m3, 2 | |
15891 pinsrb m3, [r4 + 0], 1 | |
15892 pinsrb m3, [r4 + 1], 0 | |
15893 pmaddubsw m5, m3, m6 | |
15894 pmulhrsw m5, m7 | |
15895 packuswb m4, m5 | |
15896 movu [r0 + 1149 * 16], m4 | |
15897 | |
15898 ; mode 19 [row 31] | |
15899 pshufb m5, m0, [tab_S2] | |
15900 movh [r0 + 1150 * 16], m5 | |
15901 pshufb m5, m2, [tab_S2] | |
15902 movh [r0 + 1150 * 16 + 8], m5 | |
15903 pshufb m5, m1, [tab_S2] | |
15904 movh [r0 + 1151 * 16], m5 | |
15905 pshufb m5, m3, [tab_S2] | |
15906 movh [r0 + 1151 * 16 + 8], m5 | |
15907 | |
15908 ; mode 20 [row 0] | |
15909 movu m6, [r5 + 11 * 16] | |
15910 movu m0, [r3 ] | |
15911 movu m1, [r3 + 1 ] | |
15912 punpcklbw m0, m1 | |
15913 pmaddubsw m1, m0, m6 | |
15914 pmulhrsw m1, m7 | |
15915 movu m2, [r3 + 8] | |
15916 movu m3, [r3 + 9] | |
15917 punpcklbw m2, m3 | |
15918 pmaddubsw m3, m2, m6 | |
15919 pmulhrsw m3, m7 | |
15920 packuswb m1, m3 | |
15921 movu [r0 + 1152 * 16], m1 | |
15922 | |
15923 movu m1, [r3 + 16] | |
15924 movu m3, [r3 + 17] | |
15925 punpcklbw m1, m3 | |
15926 pmaddubsw m4, m1, m6 | |
15927 pmulhrsw m4, m7 | |
15928 movu m3, [r3 + 24] | |
15929 movu m5, [r3 + 25] | |
15930 punpcklbw m3, m5 | |
15931 pmaddubsw m5, m3, m6 | |
15932 pmulhrsw m5, m7 | |
15933 packuswb m4, m5 | |
15934 movu [r0 + 1153 * 16], m4 | |
15935 | |
15936 ; mode 20 [row 1] | |
15937 movu m6, [r5 + 22 * 16] | |
15938 pslldq m0, 2 | |
15939 pinsrb m0, [r4 + 0], 1 | |
15940 pinsrb m0, [r4 + 2], 0 | |
15941 pmaddubsw m4, m0, m6 | |
15942 pmulhrsw m4, m7 | |
15943 pslldq m2, 2 | |
15944 pinsrw m2, [r3 + 7], 0 | |
15945 pmaddubsw m5, m2, m6 | |
15946 pmulhrsw m5, m7 | |
15947 packuswb m4, m5 | |
15948 movu [r0 + 1154 * 16], m4 | |
15949 pslldq m1, 2 | |
15950 pinsrw m1, [r3 + 15], 0 | |
15951 pmaddubsw m4, m1, m6 | |
15952 pmulhrsw m4, m7 | |
15953 pslldq m3, 2 | |
15954 pinsrw m3, [r3 + 23], 0 | |
15955 pmaddubsw m5, m3, m6 | |
15956 pmulhrsw m5, m7 | |
15957 packuswb m4, m5 | |
15958 movu [r0 + 1155 * 16], m4 | |
15959 | |
15960 ; mode 20 [row 2] | |
15961 movu m6, [r5 + 1 * 16] | |
15962 pmaddubsw m4, m0, m6 | |
15963 pmulhrsw m4, m7 | |
15964 pmaddubsw m5, m2, m6 | |
15965 pmulhrsw m5, m7 | |
15966 packuswb m4, m5 | |
15967 movu [r0 + 1156 * 16], m4 | |
15968 pmaddubsw m4, m1, m6 | |
15969 pmulhrsw m4, m7 | |
15970 pmaddubsw m5, m3, m6 | |
15971 pmulhrsw m5, m7 | |
15972 packuswb m4, m5 | |
15973 movu [r0 + 1157 * 16], m4 | |
15974 | |
15975 ; mode 20 [row 3] | |
15976 movu m6, [r5 + 12 * 16] | |
15977 pslldq m0, 2 | |
15978 pinsrb m0, [r4 + 2], 1 | |
15979 pinsrb m0, [r4 + 3], 0 | |
15980 pmaddubsw m4, m0, m6 | |
15981 pmulhrsw m4, m7 | |
15982 pslldq m2, 2 | |
15983 pinsrw m2, [r3 + 6], 0 | |
15984 pmaddubsw m5, m2, m6 | |
15985 pmulhrsw m5, m7 | |
15986 packuswb m4, m5 | |
15987 movu [r0 + 1158 * 16], m4 | |
15988 pslldq m1, 2 | |
15989 pinsrw m1, [r3 + 14], 0 | |
15990 pmaddubsw m4, m1, m6 | |
15991 pmulhrsw m4, m7 | |
15992 pslldq m3, 2 | |
15993 pinsrw m3, [r3 + 22], 0 | |
15994 pmaddubsw m5, m3, m6 | |
15995 pmulhrsw m5, m7 | |
15996 packuswb m4, m5 | |
15997 movu [r0 + 1159 * 16], m4 | |
15998 | |
15999 ; mode 20 [row 4] | |
16000 movu m6, [r5 + 23 * 16] | |
16001 pslldq m0, 2 | |
16002 pinsrb m0, [r4 + 3], 1 | |
16003 pinsrb m0, [r4 + 5], 0 | |
16004 pmaddubsw m4, m0, m6 | |
16005 pmulhrsw m4, m7 | |
16006 pslldq m2, 2 | |
16007 pinsrw m2, [r3 + 5], 0 | |
16008 pmaddubsw m5, m2, m6 | |
16009 pmulhrsw m5, m7 | |
16010 packuswb m4, m5 | |
16011 movu [r0 + 1160 * 16], m4 | |
16012 pslldq m1, 2 | |
16013 pinsrw m1, [r3 + 13], 0 | |
16014 pmaddubsw m4, m1, m6 | |
16015 pmulhrsw m4, m7 | |
16016 pslldq m3, 2 | |
16017 pinsrw m3, [r3 + 21], 0 | |
16018 pmaddubsw m5, m3, m6 | |
16019 pmulhrsw m5, m7 | |
16020 packuswb m4, m5 | |
16021 movu [r0 + 1161 * 16], m4 | |
16022 | |
16023 ; mode 20 [row 5] | |
16024 movu m6, [r5 + 2 * 16] | |
16025 pmaddubsw m4, m0, m6 | |
16026 pmulhrsw m4, m7 | |
16027 pmaddubsw m5, m2, m6 | |
16028 pmulhrsw m5, m7 | |
16029 packuswb m4, m5 | |
16030 movu [r0 + 1162 * 16], m4 | |
16031 pmaddubsw m4, m1, m6 | |
16032 pmulhrsw m4, m7 | |
16033 pmaddubsw m5, m3, m6 | |
16034 pmulhrsw m5, m7 | |
16035 packuswb m4, m5 | |
16036 movu [r0 + 1163 * 16], m4 | |
16037 | |
16038 ; mode 20 [row 6] | |
16039 movu m6, [r5 + 13 * 16] | |
16040 pslldq m0, 2 | |
16041 pinsrb m0, [r4 + 5], 1 | |
16042 pinsrb m0, [r4 + 6], 0 | |
16043 pmaddubsw m4, m0, m6 | |
16044 pmulhrsw m4, m7 | |
16045 pslldq m2, 2 | |
16046 pinsrw m2, [r3 + 4], 0 | |
16047 pmaddubsw m5, m2, m6 | |
16048 pmulhrsw m5, m7 | |
16049 packuswb m4, m5 | |
16050 movu [r0 + 1164 * 16], m4 | |
16051 pslldq m1, 2 | |
16052 pinsrw m1, [r3 + 12], 0 | |
16053 pmaddubsw m4, m1, m6 | |
16054 pmulhrsw m4, m7 | |
16055 pslldq m3, 2 | |
16056 pinsrw m3, [r3 + 20], 0 | |
16057 pmaddubsw m5, m3, m6 | |
16058 pmulhrsw m5, m7 | |
16059 packuswb m4, m5 | |
16060 movu [r0 + 1165 * 16], m4 | |
16061 | |
16062 ; mode 20 [row 7] | |
16063 movu m6, [r5 + 24 * 16] | |
16064 pslldq m0, 2 | |
16065 pinsrb m0, [r4 + 6], 1 | |
16066 pinsrb m0, [r4 + 8], 0 | |
16067 pmaddubsw m4, m0, m6 | |
16068 pmulhrsw m4, m7 | |
16069 pslldq m2, 2 | |
16070 pinsrw m2, [r3 + 3], 0 | |
16071 pmaddubsw m5, m2, m6 | |
16072 pmulhrsw m5, m7 | |
16073 packuswb m4, m5 | |
16074 movu [r0 + 1166 * 16], m4 | |
16075 pslldq m1, 2 | |
16076 pinsrw m1, [r3 + 11], 0 | |
16077 pmaddubsw m4, m1, m6 | |
16078 pmulhrsw m4, m7 | |
16079 pslldq m3, 2 | |
16080 pinsrw m3, [r3 + 19], 0 | |
16081 pmaddubsw m5, m3, m6 | |
16082 pmulhrsw m5, m7 | |
16083 packuswb m4, m5 | |
16084 movu [r0 + 1167 * 16], m4 | |
16085 | |
16086 ; mode 20 [row 8] | |
16087 movu m6, [r5 + 3 * 16] | |
16088 pmaddubsw m4, m0, m6 | |
16089 pmulhrsw m4, m7 | |
16090 pmaddubsw m5, m2, m6 | |
16091 pmulhrsw m5, m7 | |
16092 packuswb m4, m5 | |
16093 movu [r0 + 1168 * 16], m4 | |
16094 pmaddubsw m4, m1, m6 | |
16095 pmulhrsw m4, m7 | |
16096 pmaddubsw m5, m3, m6 | |
16097 pmulhrsw m5, m7 | |
16098 packuswb m4, m5 | |
16099 movu [r0 + 1169 * 16], m4 | |
16100 | |
16101 ; mode 20 [row 9] | |
16102 movu m6, [r5 + 14 * 16] | |
16103 pslldq m0, 2 | |
16104 pinsrb m0, [r4 + 8], 1 | |
16105 pinsrb m0, [r4 + 9], 0 | |
16106 pmaddubsw m4, m0, m6 | |
16107 pmulhrsw m4, m7 | |
16108 pslldq m2, 2 | |
16109 pinsrb m2, [r3 + 3], 1 | |
16110 pinsrb m2, [r3 + 2], 0 | |
16111 pmaddubsw m5, m2, m6 | |
16112 pmulhrsw m5, m7 | |
16113 packuswb m4, m5 | |
16114 movu [r0 + 1170 * 16], m4 | |
16115 pslldq m1, 2 | |
16116 pinsrw m1, [r3 + 10], 0 | |
16117 pmaddubsw m4, m1, m6 | |
16118 pmulhrsw m4, m7 | |
16119 pslldq m3, 2 | |
16120 pinsrw m3, [r3 + 18], 0 | |
16121 pmaddubsw m5, m3, m6 | |
16122 pmulhrsw m5, m7 | |
16123 packuswb m4, m5 | |
16124 movu [r0 + 1171 * 16], m4 | |
16125 | |
16126 ; mode 20 [row 10] | |
16127 movu m6, [r5 + 25 * 16] | |
16128 pslldq m0, 2 | |
16129 pinsrb m0, [r4 + 9], 1 | |
16130 pinsrb m0, [r4 + 11], 0 | |
16131 pmaddubsw m4, m0, m6 | |
16132 pmulhrsw m4, m7 | |
16133 pslldq m2, 2 | |
16134 pinsrw m2, [r3 + 1], 0 | |
16135 pmaddubsw m5, m2, m6 | |
16136 pmulhrsw m5, m7 | |
16137 packuswb m4, m5 | |
16138 movu [r0 + 1172 * 16], m4 | |
16139 pslldq m1, 2 | |
16140 pinsrw m1, [r3 + 9], 0 | |
16141 pmaddubsw m4, m1, m6 | |
16142 pmulhrsw m4, m7 | |
16143 pslldq m3, 2 | |
16144 pinsrw m3, [r3 + 17], 0 | |
16145 pmaddubsw m5, m3, m6 | |
16146 pmulhrsw m5, m7 | |
16147 packuswb m4, m5 | |
16148 movu [r0 + 1173 * 16], m4 | |
16149 | |
16150 ; mode 20 [row 11] | |
16151 movu m6, [r5 + 4 * 16] | |
16152 pmaddubsw m4, m0, m6 | |
16153 pmulhrsw m4, m7 | |
16154 pmaddubsw m5, m2, m6 | |
16155 pmulhrsw m5, m7 | |
16156 packuswb m4, m5 | |
16157 movu [r0 + 1174 * 16], m4 | |
16158 pmaddubsw m4, m1, m6 | |
16159 pmulhrsw m4, m7 | |
16160 pmaddubsw m5, m3, m6 | |
16161 pmulhrsw m5, m7 | |
16162 packuswb m4, m5 | |
16163 movu [r0 + 1175 * 16], m4 | |
16164 | |
16165 ; mode 20 [row 12] | |
16166 movu m6, [r5 + 15 * 16] | |
16167 pslldq m0, 2 | |
16168 pinsrb m0, [r4 + 11], 1 | |
16169 pinsrb m0, [r4 + 12], 0 | |
16170 pmaddubsw m4, m0, m6 | |
16171 pmulhrsw m4, m7 | |
16172 pslldq m2, 2 | |
16173 pinsrb m2, [r3 + 1], 1 | |
16174 pinsrb m2, [r3 + 0], 0 | |
16175 pmaddubsw m5, m2, m6 | |
16176 pmulhrsw m5, m7 | |
16177 packuswb m4, m5 | |
16178 movu [r0 + 1176 * 16], m4 | |
16179 pslldq m1, 2 | |
16180 pinsrw m1, [r3 + 8], 0 | |
16181 pmaddubsw m4, m1, m6 | |
16182 pmulhrsw m4, m7 | |
16183 pslldq m3, 2 | |
16184 pinsrw m3, [r3 + 16], 0 | |
16185 pmaddubsw m5, m3, m6 | |
16186 pmulhrsw m5, m7 | |
16187 packuswb m4, m5 | |
16188 movu [r0 + 1177 * 16], m4 | |
16189 | |
16190 ; mode 20 [row 13] | |
16191 movu m6, [r5 + 26 * 16] | |
16192 pslldq m0, 2 | |
16193 pinsrb m0, [r4 + 12], 1 | |
16194 pinsrb m0, [r4 + 14], 0 | |
16195 pmaddubsw m4, m0, m6 | |
16196 pmulhrsw m4, m7 | |
16197 pslldq m2, 2 | |
16198 pinsrb m2, [r4 + 0], 1 | |
16199 pinsrb m2, [r4 + 2], 0 | |
16200 pmaddubsw m5, m2, m6 | |
16201 pmulhrsw m5, m7 | |
16202 packuswb m4, m5 | |
16203 movu [r0 + 1178 * 16], m4 | |
16204 pslldq m1, 2 | |
16205 pinsrw m1, [r3 + 7], 0 | |
16206 pmaddubsw m4, m1, m6 | |
16207 pmulhrsw m4, m7 | |
16208 pslldq m3, 2 | |
16209 pinsrw m3, [r3 + 15], 0 | |
16210 pmaddubsw m5, m3, m6 | |
16211 pmulhrsw m5, m7 | |
16212 packuswb m4, m5 | |
16213 movu [r0 + 1179 * 16], m4 | |
16214 | |
16215 ; mode 20 [row 14] | |
16216 movu m6, [r5 + 5 * 16] | |
16217 pmaddubsw m4, m0, m6 | |
16218 pmulhrsw m4, m7 | |
16219 pmaddubsw m5, m2, m6 | |
16220 pmulhrsw m5, m7 | |
16221 packuswb m4, m5 | |
16222 movu [r0 + 1180 * 16], m4 | |
16223 pmaddubsw m4, m1, m6 | |
16224 pmulhrsw m4, m7 | |
16225 pmaddubsw m5, m3, m6 | |
16226 pmulhrsw m5, m7 | |
16227 packuswb m4, m5 | |
16228 movu [r0 + 1181 * 16], m4 | |
16229 | |
16230 ; mode 20 [row 15] | |
16231 movu m6, [r5 + 16 * 16] | |
16232 pslldq m0, 2 | |
16233 pinsrb m0, [r4 + 14], 1 | |
16234 pinsrb m0, [r4 + 15], 0 | |
16235 pmaddubsw m4, m0, m6 | |
16236 pmulhrsw m4, m7 | |
16237 pslldq m2, 2 | |
16238 pinsrb m2, [r4 + 2], 1 | |
16239 pinsrb m2, [r4 + 3], 0 | |
16240 pmaddubsw m5, m2, m6 | |
16241 pmulhrsw m5, m7 | |
16242 packuswb m4, m5 | |
16243 movu [r0 + 1182 * 16], m4 | |
16244 pslldq m1, 2 | |
16245 pinsrw m1, [r3 + 6], 0 | |
16246 pmaddubsw m4, m1, m6 | |
16247 pmulhrsw m4, m7 | |
16248 pslldq m3, 2 | |
16249 pinsrw m3, [r3 + 14], 0 | |
16250 pmaddubsw m5, m3, m6 | |
16251 pmulhrsw m5, m7 | |
16252 packuswb m4, m5 | |
16253 movu [r0 + 1183 * 16], m4 | |
16254 | |
16255 ; mode 20 [row 16] | |
16256 movu m6, [r5 + 27 * 16] | |
16257 pslldq m0, 2 | |
16258 pinsrb m0, [r4 + 15], 1 | |
16259 pinsrb m0, [r4 + 17], 0 | |
16260 pmaddubsw m4, m0, m6 | |
16261 pmulhrsw m4, m7 | |
16262 pslldq m2, 2 | |
16263 pinsrb m2, [r4 + 3], 1 | |
16264 pinsrb m2, [r4 + 5], 0 | |
16265 pmaddubsw m5, m2, m6 | |
16266 pmulhrsw m5, m7 | |
16267 packuswb m4, m5 | |
16268 movu [r0 + 1184 * 16], m4 | |
16269 pslldq m1, 2 | |
16270 pinsrw m1, [r3 + 5], 0 | |
16271 pmaddubsw m4, m1, m6 | |
16272 pmulhrsw m4, m7 | |
16273 pslldq m3, 2 | |
16274 pinsrw m3, [r3 + 13], 0 | |
16275 pmaddubsw m5, m3, m6 | |
16276 pmulhrsw m5, m7 | |
16277 packuswb m4, m5 | |
16278 movu [r0 + 1185 * 16], m4 | |
16279 | |
16280 ; mode 20 [row 17] | |
16281 movu m6, [r5 + 6 * 16] | |
16282 pmaddubsw m4, m0, m6 | |
16283 pmulhrsw m4, m7 | |
16284 pmaddubsw m5, m2, m6 | |
16285 pmulhrsw m5, m7 | |
16286 packuswb m4, m5 | |
16287 movu [r0 + 1186 * 16], m4 | |
16288 pmaddubsw m4, m1, m6 | |
16289 pmulhrsw m4, m7 | |
16290 pmaddubsw m5, m3, m6 | |
16291 pmulhrsw m5, m7 | |
16292 packuswb m4, m5 | |
16293 movu [r0 + 1187 * 16], m4 | |
16294 | |
16295 ; mode 20 [row 18] | |
16296 movu m6, [r5 + 17 * 16] | |
16297 pslldq m0, 2 | |
16298 pinsrb m0, [r4 + 17], 1 | |
16299 pinsrb m0, [r4 + 18], 0 | |
16300 pmaddubsw m4, m0, m6 | |
16301 pmulhrsw m4, m7 | |
16302 pslldq m2, 2 | |
16303 pinsrb m2, [r4 + 5], 1 | |
16304 pinsrb m2, [r4 + 6], 0 | |
16305 pmaddubsw m5, m2, m6 | |
16306 pmulhrsw m5, m7 | |
16307 packuswb m4, m5 | |
16308 movu [r0 + 1188 * 16], m4 | |
16309 pslldq m1, 2 | |
16310 pinsrw m1, [r3 + 4], 0 | |
16311 pmaddubsw m4, m1, m6 | |
16312 pmulhrsw m4, m7 | |
16313 pslldq m3, 2 | |
16314 pinsrw m3, [r3 + 12], 0 | |
16315 pmaddubsw m5, m3, m6 | |
16316 pmulhrsw m5, m7 | |
16317 packuswb m4, m5 | |
16318 movu [r0 + 1189 * 16], m4 | |
16319 | |
16320 ; mode 20 [row 19] | |
16321 movu m6, [r5 + 28 * 16] | |
16322 pslldq m0, 2 | |
16323 pinsrb m0, [r4 + 18], 1 | |
16324 pinsrb m0, [r4 + 20], 0 | |
16325 pmaddubsw m4, m0, m6 | |
16326 pmulhrsw m4, m7 | |
16327 pslldq m2, 2 | |
16328 pinsrb m2, [r4 + 6], 1 | |
16329 pinsrb m2, [r4 + 8], 0 | |
16330 pmaddubsw m5, m2, m6 | |
16331 pmulhrsw m5, m7 | |
16332 packuswb m4, m5 | |
16333 movu [r0 + 1190 * 16], m4 | |
16334 pslldq m1, 2 | |
16335 pinsrw m1, [r3 + 3], 0 | |
16336 pmaddubsw m4, m1, m6 | |
16337 pmulhrsw m4, m7 | |
16338 pslldq m3, 2 | |
16339 pinsrw m3, [r3 + 11], 0 | |
16340 pmaddubsw m5, m3, m6 | |
16341 pmulhrsw m5, m7 | |
16342 packuswb m4, m5 | |
16343 movu [r0 + 1191 * 16], m4 | |
16344 | |
16345 ; mode 20 [row 20] | |
16346 movu m6, [r5 + 7 * 16] | |
16347 pmaddubsw m4, m0, m6 | |
16348 pmulhrsw m4, m7 | |
16349 pmaddubsw m5, m2, m6 | |
16350 pmulhrsw m5, m7 | |
16351 packuswb m4, m5 | |
16352 movu [r0 + 1192 * 16], m4 | |
16353 pmaddubsw m4, m1, m6 | |
16354 pmulhrsw m4, m7 | |
16355 pmaddubsw m5, m3, m6 | |
16356 pmulhrsw m5, m7 | |
16357 packuswb m4, m5 | |
16358 movu [r0 + 1193 * 16], m4 | |
16359 | |
16360 ; mode 20 [row 21] | |
16361 movu m6, [r5 + 18 * 16] | |
16362 pslldq m0, 2 | |
16363 pinsrb m0, [r4 + 20], 1 | |
16364 pinsrb m0, [r4 + 21], 0 | |
16365 pmaddubsw m4, m0, m6 | |
16366 pmulhrsw m4, m7 | |
16367 pslldq m2, 2 | |
16368 pinsrb m2, [r4 + 8], 1 | |
16369 pinsrb m2, [r4 + 9], 0 | |
16370 pmaddubsw m5, m2, m6 | |
16371 pmulhrsw m5, m7 | |
16372 packuswb m4, m5 | |
16373 movu [r0 + 1194 * 16], m4 | |
16374 pslldq m1, 2 | |
16375 pinsrw m1, [r3 + 2], 0 | |
16376 pmaddubsw m4, m1, m6 | |
16377 pmulhrsw m4, m7 | |
16378 pslldq m3, 2 | |
16379 pinsrw m3, [r3 + 10], 0 | |
16380 pmaddubsw m5, m3, m6 | |
16381 pmulhrsw m5, m7 | |
16382 packuswb m4, m5 | |
16383 movu [r0 + 1195 * 16], m4 | |
16384 | |
16385 ; mode 20 [row 22] | |
16386 movu m6, [r5 + 29 * 16] | |
16387 pslldq m0, 2 | |
16388 pinsrb m0, [r4 + 21], 1 | |
16389 pinsrb m0, [r4 + 23], 0 | |
16390 pmaddubsw m4, m0, m6 | |
16391 pmulhrsw m4, m7 | |
16392 pslldq m2, 2 | |
16393 pinsrb m2, [r4 + 9], 1 | |
16394 pinsrb m2, [r4 + 11], 0 | |
16395 pmaddubsw m5, m2, m6 | |
16396 pmulhrsw m5, m7 | |
16397 packuswb m4, m5 | |
16398 movu [r0 + 1196 * 16], m4 | |
16399 pslldq m1, 2 | |
16400 pinsrw m1, [r3 + 1], 0 | |
16401 pmaddubsw m4, m1, m6 | |
16402 pmulhrsw m4, m7 | |
16403 pslldq m3, 2 | |
16404 pinsrw m3, [r3 + 9], 0 | |
16405 pmaddubsw m5, m3, m6 | |
16406 pmulhrsw m5, m7 | |
16407 packuswb m4, m5 | |
16408 movu [r0 + 1197 * 16], m4 | |
16409 | |
16410 ; mode 20 [row 23] | |
16411 movu m6, [r5 + 8 * 16] | |
16412 pmaddubsw m4, m0, m6 | |
16413 pmulhrsw m4, m7 | |
16414 pmaddubsw m5, m2, m6 | |
16415 pmulhrsw m5, m7 | |
16416 packuswb m4, m5 | |
16417 movu [r0 + 1198 * 16], m4 | |
16418 pmaddubsw m4, m1, m6 | |
16419 pmulhrsw m4, m7 | |
16420 pmaddubsw m5, m3, m6 | |
16421 pmulhrsw m5, m7 | |
16422 packuswb m4, m5 | |
16423 movu [r0 + 1199 * 16], m4 | |
16424 | |
16425 ; mode 20 [row 24] | |
16426 movu m6, [r5 + 19 * 16] | |
16427 pslldq m0, 2 | |
16428 pinsrb m0, [r4 + 23], 1 | |
16429 pinsrb m0, [r4 + 24], 0 | |
16430 pmaddubsw m4, m0, m6 | |
16431 pmulhrsw m4, m7 | |
16432 pslldq m2, 2 | |
16433 pinsrb m2, [r4 + 11], 1 | |
16434 pinsrb m2, [r4 + 12], 0 | |
16435 pmaddubsw m5, m2, m6 | |
16436 pmulhrsw m5, m7 | |
16437 packuswb m4, m5 | |
16438 movu [r0 + 1200 * 16], m4 | |
16439 pslldq m1, 2 | |
16440 pinsrw m1, [r3 + 0], 0 | |
16441 pmaddubsw m4, m1, m6 | |
16442 pmulhrsw m4, m7 | |
16443 pslldq m3, 2 | |
16444 pinsrw m3, [r3 + 8], 0 | |
16445 pmaddubsw m5, m3, m6 | |
16446 pmulhrsw m5, m7 | |
16447 packuswb m4, m5 | |
16448 movu [r0 + 1201 * 16], m4 | |
16449 | |
16450 ; mode 20 [row 25] | |
16451 movu m6, [r5 + 30 * 16] | |
16452 pslldq m0, 2 | |
16453 pinsrb m0, [r4 + 24], 1 | |
16454 pinsrb m0, [r4 + 26], 0 | |
16455 pmaddubsw m4, m0, m6 | |
16456 pmulhrsw m4, m7 | |
16457 pslldq m2, 2 | |
16458 pinsrb m2, [r4 + 12], 1 | |
16459 pinsrb m2, [r4 + 14], 0 | |
16460 pmaddubsw m5, m2, m6 | |
16461 pmulhrsw m5, m7 | |
16462 packuswb m4, m5 | |
16463 movu [r0 + 1202 * 16], m4 | |
16464 pslldq m1, 2 | |
16465 pinsrb m1, [r4 + 0], 1 | |
16466 pinsrb m1, [r4 + 2], 0 | |
16467 pmaddubsw m4, m1, m6 | |
16468 pmulhrsw m4, m7 | |
16469 pslldq m3, 2 | |
16470 pinsrw m3, [r3 + 7], 0 | |
16471 pmaddubsw m5, m3, m6 | |
16472 pmulhrsw m5, m7 | |
16473 packuswb m4, m5 | |
16474 movu [r0 + 1203 * 16], m4 | |
16475 | |
16476 ; mode 20 [row 26] | |
16477 movu m6, [r5 + 9 * 16] | |
16478 pmaddubsw m4, m0, m6 | |
16479 pmulhrsw m4, m7 | |
16480 pmaddubsw m5, m2, m6 | |
16481 pmulhrsw m5, m7 | |
16482 packuswb m4, m5 | |
16483 movu [r0 + 1204 * 16], m4 | |
16484 pmaddubsw m4, m1, m6 | |
16485 pmulhrsw m4, m7 | |
16486 pmaddubsw m5, m3, m6 | |
16487 pmulhrsw m5, m7 | |
16488 packuswb m4, m5 | |
16489 movu [r0 + 1205 * 16], m4 | |
16490 | |
16491 ; mode 20 [row 27] | |
16492 movu m6, [r5 + 20 * 16] | |
16493 pslldq m0, 2 | |
16494 pinsrb m0, [r4 + 26], 1 | |
16495 pinsrb m0, [r4 + 27], 0 | |
16496 pmaddubsw m4, m0, m6 | |
16497 pmulhrsw m4, m7 | |
16498 pslldq m2, 2 | |
16499 pinsrb m2, [r4 + 14], 1 | |
16500 pinsrb m2, [r4 + 15], 0 | |
16501 pmaddubsw m5, m2, m6 | |
16502 pmulhrsw m5, m7 | |
16503 packuswb m4, m5 | |
16504 movu [r0 + 1206 * 16], m4 | |
16505 pslldq m1, 2 | |
16506 pinsrb m1, [r4 + 2], 1 | |
16507 pinsrb m1, [r4 + 3], 0 | |
16508 pmaddubsw m4, m1, m6 | |
16509 pmulhrsw m4, m7 | |
16510 pslldq m3, 2 | |
16511 pinsrw m3, [r3 + 6], 0 | |
16512 pmaddubsw m5, m3, m6 | |
16513 pmulhrsw m5, m7 | |
16514 packuswb m4, m5 | |
16515 movu [r0 + 1207 * 16], m4 | |
16516 | |
16517 ; mode 20 [row 28] | |
16518 movu m6, [r5 + 31 * 16] | |
16519 pslldq m0, 2 | |
16520 pinsrb m0, [r4 + 27], 1 | |
16521 pinsrb m0, [r4 + 29], 0 | |
16522 pmaddubsw m4, m0, m6 | |
16523 pmulhrsw m4, m7 | |
16524 pslldq m2, 2 | |
16525 pinsrb m2, [r4 + 15], 1 | |
16526 pinsrb m2, [r4 + 17], 0 | |
16527 pmaddubsw m5, m2, m6 | |
16528 pmulhrsw m5, m7 | |
16529 packuswb m4, m5 | |
16530 movu [r0 + 1208 * 16], m4 | |
16531 pslldq m1, 2 | |
16532 pinsrb m1, [r4 + 3], 1 | |
16533 pinsrb m1, [r4 + 5], 0 | |
16534 pmaddubsw m4, m1, m6 | |
16535 pmulhrsw m4, m7 | |
16536 pslldq m3, 2 | |
16537 pinsrw m3, [r3 + 5], 0 | |
16538 pmaddubsw m5, m3, m6 | |
16539 pmulhrsw m5, m7 | |
16540 packuswb m4, m5 | |
16541 movu [r0 + 1209 * 16], m4 | |
16542 | |
16543 ; mode 20 [row 29] | |
16544 movu m6, [r5 + 10 * 16] | |
16545 pmaddubsw m4, m0, m6 | |
16546 pmulhrsw m4, m7 | |
16547 pmaddubsw m5, m2, m6 | |
16548 pmulhrsw m5, m7 | |
16549 packuswb m4, m5 | |
16550 movu [r0 + 1210 * 16], m4 | |
16551 pmaddubsw m4, m1, m6 | |
16552 pmulhrsw m4, m7 | |
16553 pmaddubsw m5, m3, m6 | |
16554 pmulhrsw m5, m7 | |
16555 packuswb m4, m5 | |
16556 movu [r0 + 1211 * 16], m4 | |
16557 | |
16558 ; mode 20 [row 30] | |
16559 movu m6, [r5 + 21 * 16] | |
16560 pslldq m0, 2 | |
16561 pinsrb m0, [r4 + 29], 1 | |
16562 pinsrb m0, [r4 + 30], 0 | |
16563 pmaddubsw m4, m0, m6 | |
16564 pmulhrsw m4, m7 | |
16565 pslldq m2, 2 | |
16566 pinsrb m2, [r4 + 17], 1 | |
16567 pinsrb m2, [r4 + 18], 0 | |
16568 pmaddubsw m5, m2, m6 | |
16569 pmulhrsw m5, m7 | |
16570 packuswb m4, m5 | |
16571 movu [r0 + 1212 * 16], m4 | |
16572 pslldq m1, 2 | |
16573 pinsrb m1, [r4 + 5], 1 | |
16574 pinsrb m1, [r4 + 6], 0 | |
16575 pmaddubsw m4, m1, m6 | |
16576 pmulhrsw m4, m7 | |
16577 pslldq m3, 2 | |
16578 pinsrw m3, [r3 + 4], 0 | |
16579 pmaddubsw m5, m3, m6 | |
16580 pmulhrsw m5, m7 | |
16581 packuswb m4, m5 | |
16582 movu [r0 + 1213 * 16], m4 | |
16583 | |
16584 ; mode 20 [row 31] | |
16585 pshufb m5, m0, [tab_S2] | |
16586 movh [r0 + 1214 * 16], m5 | |
16587 pshufb m5, m2, [tab_S2] | |
16588 movh [r0 + 1214 * 16 + 8], m5 | |
16589 pshufb m5, m1, [tab_S2] | |
16590 movh [r0 + 1215 * 16], m5 | |
16591 pshufb m5, m3, [tab_S2] | |
16592 movh [r0 + 1215 * 16 + 8], m5 | |
16593 | |
16594 ; mode 21 [row 0] | |
16595 movu m6, [r5 + 15 * 16] | |
16596 movu m0, [r3 ] | |
16597 movu m1, [r3 + 1 ] | |
16598 punpcklbw m0, m1 | |
16599 pmaddubsw m1, m0, m6 | |
16600 pmulhrsw m1, m7 | |
16601 movu m2, [r3 + 8] | |
16602 movu m3, [r3 + 9] | |
16603 punpcklbw m2, m3 | |
16604 pmaddubsw m3, m2, m6 | |
16605 pmulhrsw m3, m7 | |
16606 packuswb m1, m3 | |
16607 movu [r0 + 1216 * 16], m1 | |
16608 | |
16609 movu m1, [r3 + 16] | |
16610 movu m3, [r3 + 17] | |
16611 punpcklbw m1, m3 | |
16612 pmaddubsw m4, m1, m6 | |
16613 pmulhrsw m4, m7 | |
16614 movu m3, [r3 + 24] | |
16615 movu m5, [r3 + 25] | |
16616 punpcklbw m3, m5 | |
16617 pmaddubsw m5, m3, m6 | |
16618 pmulhrsw m5, m7 | |
16619 packuswb m4, m5 | |
16620 movu [r0 + 1217 * 16], m4 | |
16621 | |
16622 ; mode 21 [row 1] | |
16623 movu m6, [r5 + 30 * 16] | |
16624 pslldq m0, 2 | |
16625 pinsrb m0, [r4 + 0], 1 | |
16626 pinsrb m0, [r4 + 2], 0 | |
16627 pmaddubsw m4, m0, m6 | |
16628 pmulhrsw m4, m7 | |
16629 pslldq m2, 2 | |
16630 pinsrw m2, [r3 + 7], 0 | |
16631 pmaddubsw m5, m2, m6 | |
16632 pmulhrsw m5, m7 | |
16633 packuswb m4, m5 | |
16634 movu [r0 + 1218 * 16], m4 | |
16635 pslldq m1, 2 | |
16636 pinsrw m1, [r3 + 15], 0 | |
16637 pmaddubsw m4, m1, m6 | |
16638 pmulhrsw m4, m7 | |
16639 pslldq m3, 2 | |
16640 pinsrw m3, [r3 + 23], 0 | |
16641 pmaddubsw m5, m3, m6 | |
16642 pmulhrsw m5, m7 | |
16643 packuswb m4, m5 | |
16644 movu [r0 + 1219 * 16], m4 | |
16645 | |
16646 ; mode 21 [row 2] | |
16647 movu m6, [r5 + 13 * 16] | |
16648 pmaddubsw m4, m0, m6 | |
16649 pmulhrsw m4, m7 | |
16650 pmaddubsw m5, m2, m6 | |
16651 pmulhrsw m5, m7 | |
16652 packuswb m4, m5 | |
16653 movu [r0 + 1220 * 16], m4 | |
16654 pmaddubsw m4, m1, m6 | |
16655 pmulhrsw m4, m7 | |
16656 pmaddubsw m5, m3, m6 | |
16657 pmulhrsw m5, m7 | |
16658 packuswb m4, m5 | |
16659 movu [r0 + 1221 * 16], m4 | |
16660 | |
16661 ; mode 21 [row 3] | |
16662 movu m6, [r5 + 28 * 16] | |
16663 pslldq m0, 2 | |
16664 pinsrb m0, [r4 + 2], 1 | |
16665 pinsrb m0, [r4 + 4], 0 | |
16666 pmaddubsw m4, m0, m6 | |
16667 pmulhrsw m4, m7 | |
16668 pslldq m2, 2 | |
16669 pinsrw m2, [r3 + 6], 0 | |
16670 pmaddubsw m5, m2, m6 | |
16671 pmulhrsw m5, m7 | |
16672 packuswb m4, m5 | |
16673 movu [r0 + 1222 * 16], m4 | |
16674 pslldq m1, 2 | |
16675 pinsrw m1, [r3 + 14], 0 | |
16676 pmaddubsw m4, m1, m6 | |
16677 pmulhrsw m4, m7 | |
16678 pslldq m3, 2 | |
16679 pinsrw m3, [r3 + 22], 0 | |
16680 pmaddubsw m5, m3, m6 | |
16681 pmulhrsw m5, m7 | |
16682 packuswb m4, m5 | |
16683 movu [r0 + 1223 * 16], m4 | |
16684 | |
16685 ; mode 21 [row 4] | |
16686 movu m6, [r5 + 11 * 16] | |
16687 pmaddubsw m4, m0, m6 | |
16688 pmulhrsw m4, m7 | |
16689 pmaddubsw m5, m2, m6 | |
16690 pmulhrsw m5, m7 | |
16691 packuswb m4, m5 | |
16692 movu [r0 + 1224 * 16], m4 | |
16693 pmaddubsw m4, m1, m6 | |
16694 pmulhrsw m4, m7 | |
16695 pmaddubsw m5, m3, m6 | |
16696 pmulhrsw m5, m7 | |
16697 packuswb m4, m5 | |
16698 movu [r0 + 1225 * 16], m4 | |
16699 | |
16700 ; mode 21 [row 5] | |
16701 movu m6, [r5 + 26 * 16] | |
16702 pslldq m0, 2 | |
16703 pinsrb m0, [r4 + 4], 1 | |
16704 pinsrb m0, [r4 + 6], 0 | |
16705 pmaddubsw m4, m0, m6 | |
16706 pmulhrsw m4, m7 | |
16707 pslldq m2, 2 | |
16708 pinsrw m2, [r3 + 5], 0 | |
16709 pmaddubsw m5, m2, m6 | |
16710 pmulhrsw m5, m7 | |
16711 packuswb m4, m5 | |
16712 movu [r0 + 1226 * 16], m4 | |
16713 pslldq m1, 2 | |
16714 pinsrw m1, [r3 + 13], 0 | |
16715 pmaddubsw m4, m1, m6 | |
16716 pmulhrsw m4, m7 | |
16717 pslldq m3, 2 | |
16718 pinsrw m3, [r3 + 21], 0 | |
16719 pmaddubsw m5, m3, m6 | |
16720 pmulhrsw m5, m7 | |
16721 packuswb m4, m5 | |
16722 movu [r0 + 1227 * 16], m4 | |
16723 | |
16724 ; mode 21 [row 6] | |
16725 movu m6, [r5 + 9 * 16] | |
16726 pmaddubsw m4, m0, m6 | |
16727 pmulhrsw m4, m7 | |
16728 pmaddubsw m5, m2, m6 | |
16729 pmulhrsw m5, m7 | |
16730 packuswb m4, m5 | |
16731 movu [r0 + 1228 * 16], m4 | |
16732 pmaddubsw m4, m1, m6 | |
16733 pmulhrsw m4, m7 | |
16734 pmaddubsw m5, m3, m6 | |
16735 pmulhrsw m5, m7 | |
16736 packuswb m4, m5 | |
16737 movu [r0 + 1229 * 16], m4 | |
16738 | |
16739 ; mode 21 [row 7] | |
16740 movu m6, [r5 + 24 * 16] | |
16741 pslldq m0, 2 | |
16742 pinsrb m0, [r4 + 6], 1 | |
16743 pinsrb m0, [r4 + 8], 0 | |
16744 pmaddubsw m4, m0, m6 | |
16745 pmulhrsw m4, m7 | |
16746 pslldq m2, 2 | |
16747 pinsrw m2, [r3 + 4], 0 | |
16748 pmaddubsw m5, m2, m6 | |
16749 pmulhrsw m5, m7 | |
16750 packuswb m4, m5 | |
16751 movu [r0 + 1230 * 16], m4 | |
16752 pslldq m1, 2 | |
16753 pinsrw m1, [r3 + 12], 0 | |
16754 pmaddubsw m4, m1, m6 | |
16755 pmulhrsw m4, m7 | |
16756 pslldq m3, 2 | |
16757 pinsrw m3, [r3 + 20], 0 | |
16758 pmaddubsw m5, m3, m6 | |
16759 pmulhrsw m5, m7 | |
16760 packuswb m4, m5 | |
16761 movu [r0 + 1231 * 16], m4 | |
16762 | |
16763 ; mode 21 [row 8] | |
16764 movu m6, [r5 + 7 * 16] | |
16765 pmaddubsw m4, m0, m6 | |
16766 pmulhrsw m4, m7 | |
16767 pmaddubsw m5, m2, m6 | |
16768 pmulhrsw m5, m7 | |
16769 packuswb m4, m5 | |
16770 movu [r0 + 1232 * 16], m4 | |
16771 pmaddubsw m4, m1, m6 | |
16772 pmulhrsw m4, m7 | |
16773 pmaddubsw m5, m3, m6 | |
16774 pmulhrsw m5, m7 | |
16775 packuswb m4, m5 | |
16776 movu [r0 + 1233 * 16], m4 | |
16777 | |
16778 ; mode 21 [row 9] | |
16779 movu m6, [r5 + 22 * 16] | |
16780 pslldq m0, 2 | |
16781 pinsrb m0, [r4 + 8], 1 | |
16782 pinsrb m0, [r4 + 9], 0 | |
16783 pmaddubsw m4, m0, m6 | |
16784 pmulhrsw m4, m7 | |
16785 pslldq m2, 2 | |
16786 pinsrw m2, [r3 + 3], 0 | |
16787 pmaddubsw m5, m2, m6 | |
16788 pmulhrsw m5, m7 | |
16789 packuswb m4, m5 | |
16790 movu [r0 + 1234 * 16], m4 | |
16791 pslldq m1, 2 | |
16792 pinsrw m1, [r3 + 11], 0 | |
16793 pmaddubsw m4, m1, m6 | |
16794 pmulhrsw m4, m7 | |
16795 pslldq m3, 2 | |
16796 pinsrw m3, [r3 + 19], 0 | |
16797 pmaddubsw m5, m3, m6 | |
16798 pmulhrsw m5, m7 | |
16799 packuswb m4, m5 | |
16800 movu [r0 + 1235 * 16], m4 | |
16801 | |
16802 ; mode 21 [row 10] | |
16803 movu m6, [r5 + 5 * 16] | |
16804 pmaddubsw m4, m0, m6 | |
16805 pmulhrsw m4, m7 | |
16806 pmaddubsw m5, m2, m6 | |
16807 pmulhrsw m5, m7 | |
16808 packuswb m4, m5 | |
16809 movu [r0 + 1236 * 16], m4 | |
16810 pmaddubsw m4, m1, m6 | |
16811 pmulhrsw m4, m7 | |
16812 pmaddubsw m5, m3, m6 | |
16813 pmulhrsw m5, m7 | |
16814 packuswb m4, m5 | |
16815 movu [r0 + 1237 * 16], m4 | |
16816 | |
16817 ; mode 21 [row 11] | |
16818 movu m6, [r5 + 20 * 16] | |
16819 pslldq m0, 2 | |
16820 pinsrb m0, [r4 + 9], 1 | |
16821 pinsrb m0, [r4 + 11], 0 | |
16822 pmaddubsw m4, m0, m6 | |
16823 pmulhrsw m4, m7 | |
16824 pslldq m2, 2 | |
16825 pinsrw m2, [r3 + 2], 0 | |
16826 pmaddubsw m5, m2, m6 | |
16827 pmulhrsw m5, m7 | |
16828 packuswb m4, m5 | |
16829 movu [r0 + 1238 * 16], m4 | |
16830 pslldq m1, 2 | |
16831 pinsrw m1, [r3 + 10], 0 | |
16832 pmaddubsw m4, m1, m6 | |
16833 pmulhrsw m4, m7 | |
16834 pslldq m3, 2 | |
16835 pinsrw m3, [r3 + 18], 0 | |
16836 pmaddubsw m5, m3, m6 | |
16837 pmulhrsw m5, m7 | |
16838 packuswb m4, m5 | |
16839 movu [r0 + 1239 * 16], m4 | |
16840 | |
16841 ; mode 21 [row 12] | |
16842 movu m6, [r5 + 3 * 16] | |
16843 pmaddubsw m4, m0, m6 | |
16844 pmulhrsw m4, m7 | |
16845 pmaddubsw m5, m2, m6 | |
16846 pmulhrsw m5, m7 | |
16847 packuswb m4, m5 | |
16848 movu [r0 + 1240 * 16], m4 | |
16849 pmaddubsw m4, m1, m6 | |
16850 pmulhrsw m4, m7 | |
16851 pmaddubsw m5, m3, m6 | |
16852 pmulhrsw m5, m7 | |
16853 packuswb m4, m5 | |
16854 movu [r0 + 1241 * 16], m4 | |
16855 | |
16856 ; mode 21 [row 13] | |
16857 movu m6, [r5 + 18 * 16] | |
16858 pslldq m0, 2 | |
16859 pinsrb m0, [r4 + 11], 1 | |
16860 pinsrb m0, [r4 + 13], 0 | |
16861 pmaddubsw m4, m0, m6 | |
16862 pmulhrsw m4, m7 | |
16863 pslldq m2, 2 | |
16864 pinsrw m2, [r3 + 1], 0 | |
16865 pmaddubsw m5, m2, m6 | |
16866 pmulhrsw m5, m7 | |
16867 packuswb m4, m5 | |
16868 movu [r0 + 1242 * 16], m4 | |
16869 pslldq m1, 2 | |
16870 pinsrw m1, [r3 + 9], 0 | |
16871 pmaddubsw m4, m1, m6 | |
16872 pmulhrsw m4, m7 | |
16873 pslldq m3, 2 | |
16874 pinsrw m3, [r3 + 17], 0 | |
16875 pmaddubsw m5, m3, m6 | |
16876 pmulhrsw m5, m7 | |
16877 packuswb m4, m5 | |
16878 movu [r0 + 1243 * 16], m4 | |
16879 | |
16880 ; mode 21 [row 14] | |
16881 movu m6, [r5 + 1 * 16] | |
16882 pmaddubsw m4, m0, m6 | |
16883 pmulhrsw m4, m7 | |
16884 pmaddubsw m5, m2, m6 | |
16885 pmulhrsw m5, m7 | |
16886 packuswb m4, m5 | |
16887 movu [r0 + 1244 * 16], m4 | |
16888 pmaddubsw m4, m1, m6 | |
16889 pmulhrsw m4, m7 | |
16890 pmaddubsw m5, m3, m6 | |
16891 pmulhrsw m5, m7 | |
16892 packuswb m4, m5 | |
16893 movu [r0 + 1245 * 16], m4 | |
16894 | |
16895 ; mode 21 [row 15] | |
16896 movu m6, [r5 + 16 * 16] | |
16897 pslldq m0, 2 | |
16898 pinsrb m0, [r4 + 13], 1 | |
16899 pinsrb m0, [r4 + 15], 0 | |
16900 pmaddubsw m4, m0, m6 | |
16901 pmulhrsw m4, m7 | |
16902 pslldq m2, 2 | |
16903 pinsrw m2, [r3 + 0], 0 | |
16904 pmaddubsw m5, m2, m6 | |
16905 pmulhrsw m5, m7 | |
16906 packuswb m4, m5 | |
16907 movu [r0 + 1246 * 16], m4 | |
16908 pslldq m1, 2 | |
16909 pinsrw m1, [r3 + 8], 0 | |
16910 pmaddubsw m4, m1, m6 | |
16911 pmulhrsw m4, m7 | |
16912 pslldq m3, 2 | |
16913 pinsrw m3, [r3 + 16], 0 | |
16914 pmaddubsw m5, m3, m6 | |
16915 pmulhrsw m5, m7 | |
16916 packuswb m4, m5 | |
16917 movu [r0 + 1247 * 16], m4 | |
16918 | |
16919 ; mode 21 [row 16] | |
16920 movu m6, [r5 + 31 * 16] | |
16921 pslldq m0, 2 | |
16922 pinsrb m0, [r4 + 15], 1 | |
16923 pinsrb m0, [r4 + 17], 0 | |
16924 pmaddubsw m4, m0, m6 | |
16925 pmulhrsw m4, m7 | |
16926 pslldq m2, 2 | |
16927 pinsrb m2, [r4 + 0], 1 | |
16928 pinsrb m2, [r4 + 2], 0 | |
16929 pmaddubsw m5, m2, m6 | |
16930 pmulhrsw m5, m7 | |
16931 packuswb m4, m5 | |
16932 movu [r0 + 1248 * 16], m4 | |
16933 pslldq m1, 2 | |
16934 pinsrw m1, [r3 + 7], 0 | |
16935 pmaddubsw m4, m1, m6 | |
16936 pmulhrsw m4, m7 | |
16937 pslldq m3, 2 | |
16938 pinsrw m3, [r3 + 15], 0 | |
16939 pmaddubsw m5, m3, m6 | |
16940 pmulhrsw m5, m7 | |
16941 packuswb m4, m5 | |
16942 movu [r0 + 1249 * 16], m4 | |
16943 | |
16944 ; mode 21 [row 17] | |
16945 movu m6, [r5 + 14 * 16] | |
16946 pmaddubsw m4, m0, m6 | |
16947 pmulhrsw m4, m7 | |
16948 pmaddubsw m5, m2, m6 | |
16949 pmulhrsw m5, m7 | |
16950 packuswb m4, m5 | |
16951 movu [r0 + 1250 * 16], m4 | |
16952 pmaddubsw m4, m1, m6 | |
16953 pmulhrsw m4, m7 | |
16954 pmaddubsw m5, m3, m6 | |
16955 pmulhrsw m5, m7 | |
16956 packuswb m4, m5 | |
16957 movu [r0 + 1251 * 16], m4 | |
16958 | |
16959 ; mode 21 [row 18] | |
16960 movu m6, [r5 + 29 * 16] | |
16961 pslldq m0, 2 | |
16962 pinsrb m0, [r4 + 17], 1 | |
16963 pinsrb m0, [r4 + 19], 0 | |
16964 pmaddubsw m4, m0, m6 | |
16965 pmulhrsw m4, m7 | |
16966 pslldq m2, 2 | |
16967 pinsrb m2, [r4 + 2], 1 | |
16968 pinsrb m2, [r4 + 4], 0 | |
16969 pmaddubsw m5, m2, m6 | |
16970 pmulhrsw m5, m7 | |
16971 packuswb m4, m5 | |
16972 movu [r0 + 1252 * 16], m4 | |
16973 pslldq m1, 2 | |
16974 pinsrb m1, [r3 + 7], 1 | |
16975 pinsrb m1, [r3 + 6], 0 | |
16976 pmaddubsw m4, m1, m6 | |
16977 pmulhrsw m4, m7 | |
16978 pslldq m3, 2 | |
16979 pinsrb m3, [r3 + 15], 1 | |
16980 pinsrb m3, [r3 + 14], 0 | |
16981 pmaddubsw m5, m3, m6 | |
16982 pmulhrsw m5, m7 | |
16983 packuswb m4, m5 | |
16984 movu [r0 + 1253 * 16], m4 | |
16985 | |
16986 ; mode 21 [row 19] | |
16987 movu m6, [r5 + 12 * 16] | |
16988 pmaddubsw m4, m0, m6 | |
16989 pmulhrsw m4, m7 | |
16990 pmaddubsw m5, m2, m6 | |
16991 pmulhrsw m5, m7 | |
16992 packuswb m4, m5 | |
16993 movu [r0 + 1254 * 16], m4 | |
16994 pmaddubsw m4, m1, m6 | |
16995 pmulhrsw m4, m7 | |
16996 pmaddubsw m5, m3, m6 | |
16997 pmulhrsw m5, m7 | |
16998 packuswb m4, m5 | |
16999 movu [r0 + 1255 * 16], m4 | |
17000 | |
17001 ; mode 21 [row 20] | |
17002 movu m6, [r5 + 27 * 16] | |
17003 pslldq m0, 2 | |
17004 pinsrb m0, [r4 + 19], 1 | |
17005 pinsrb m0, [r4 + 21], 0 | |
17006 pmaddubsw m4, m0, m6 | |
17007 pmulhrsw m4, m7 | |
17008 pslldq m2, 2 | |
17009 pinsrb m2, [r4 + 4], 1 | |
17010 pinsrb m2, [r4 + 6], 0 | |
17011 pmaddubsw m5, m2, m6 | |
17012 pmulhrsw m5, m7 | |
17013 packuswb m4, m5 | |
17014 movu [r0 + 1256 * 16], m4 | |
17015 pslldq m1, 2 | |
17016 pinsrw m1, [r3 + 5], 0 | |
17017 pmaddubsw m4, m1, m6 | |
17018 pmulhrsw m4, m7 | |
17019 pslldq m3, 2 | |
17020 pinsrw m3, [r3 + 13], 0 | |
17021 pmaddubsw m5, m3, m6 | |
17022 pmulhrsw m5, m7 | |
17023 packuswb m4, m5 | |
17024 movu [r0 + 1257 * 16], m4 | |
17025 | |
17026 ; mode 21 [row 21] | |
17027 movu m6, [r5 + 10 * 16] | |
17028 pmaddubsw m4, m0, m6 | |
17029 pmulhrsw m4, m7 | |
17030 pmaddubsw m5, m2, m6 | |
17031 pmulhrsw m5, m7 | |
17032 packuswb m4, m5 | |
17033 movu [r0 + 1258 * 16], m4 | |
17034 pmaddubsw m4, m1, m6 | |
17035 pmulhrsw m4, m7 | |
17036 pmaddubsw m5, m3, m6 | |
17037 pmulhrsw m5, m7 | |
17038 packuswb m4, m5 | |
17039 movu [r0 + 1259 * 16], m4 | |
17040 | |
17041 ; mode 21 [row 22] | |
17042 movu m6, [r5 + 25 * 16] | |
17043 pslldq m0, 2 | |
17044 pinsrb m0, [r4 + 21], 1 | |
17045 pinsrb m0, [r4 + 23], 0 | |
17046 pmaddubsw m4, m0, m6 | |
17047 pmulhrsw m4, m7 | |
17048 pslldq m2, 2 | |
17049 pinsrb m2, [r4 + 6], 1 | |
17050 pinsrb m2, [r4 + 8], 0 | |
17051 pmaddubsw m5, m2, m6 | |
17052 pmulhrsw m5, m7 | |
17053 packuswb m4, m5 | |
17054 movu [r0 + 1260 * 16], m4 | |
17055 pslldq m1, 2 | |
17056 pinsrw m1, [r3 + 4], 0 | |
17057 pmaddubsw m4, m1, m6 | |
17058 pmulhrsw m4, m7 | |
17059 pslldq m3, 2 | |
17060 pinsrw m3, [r3 + 12], 0 | |
17061 pmaddubsw m5, m3, m6 | |
17062 pmulhrsw m5, m7 | |
17063 packuswb m4, m5 | |
17064 movu [r0 + 1261 * 16], m4 | |
17065 | |
17066 ; mode 21 [row 23] | |
17067 movu m6, [r5 + 8 * 16] | |
17068 pmaddubsw m4, m0, m6 | |
17069 pmulhrsw m4, m7 | |
17070 pmaddubsw m5, m2, m6 | |
17071 pmulhrsw m5, m7 | |
17072 packuswb m4, m5 | |
17073 movu [r0 + 1262 * 16], m4 | |
17074 pmaddubsw m4, m1, m6 | |
17075 pmulhrsw m4, m7 | |
17076 pmaddubsw m5, m3, m6 | |
17077 pmulhrsw m5, m7 | |
17078 packuswb m4, m5 | |
17079 movu [r0 + 1263 * 16], m4 | |
17080 | |
17081 ; mode 21 [row 24] | |
17082 movu m6, [r5 + 23 * 16] | |
17083 pslldq m0, 2 | |
17084 pinsrb m0, [r4 + 23], 1 | |
17085 pinsrb m0, [r4 + 24], 0 | |
17086 pmaddubsw m4, m0, m6 | |
17087 pmulhrsw m4, m7 | |
17088 pslldq m2, 2 | |
17089 pinsrb m2, [r4 + 8], 1 | |
17090 pinsrb m2, [r4 + 9], 0 | |
17091 pmaddubsw m5, m2, m6 | |
17092 pmulhrsw m5, m7 | |
17093 packuswb m4, m5 | |
17094 movu [r0 + 1264 * 16], m4 | |
17095 pslldq m1, 2 | |
17096 pinsrw m1, [r3 + 3], 0 | |
17097 pmaddubsw m4, m1, m6 | |
17098 pmulhrsw m4, m7 | |
17099 pslldq m3, 2 | |
17100 pinsrw m3, [r3 + 11], 0 | |
17101 pmaddubsw m5, m3, m6 | |
17102 pmulhrsw m5, m7 | |
17103 packuswb m4, m5 | |
17104 movu [r0 + 1265 * 16], m4 | |
17105 | |
17106 ; mode 21 [row 25] | |
17107 movu m6, [r5 + 6 * 16] | |
17108 pmaddubsw m4, m0, m6 | |
17109 pmulhrsw m4, m7 | |
17110 pmaddubsw m5, m2, m6 | |
17111 pmulhrsw m5, m7 | |
17112 packuswb m4, m5 | |
17113 movu [r0 + 1266 * 16], m4 | |
17114 pmaddubsw m4, m1, m6 | |
17115 pmulhrsw m4, m7 | |
17116 pmaddubsw m5, m3, m6 | |
17117 pmulhrsw m5, m7 | |
17118 packuswb m4, m5 | |
17119 movu [r0 + 1267 * 16], m4 | |
17120 | |
17121 ; mode 21 [row 26] | |
17122 movu m6, [r5 + 21 * 16] | |
17123 pslldq m0, 2 | |
17124 pinsrb m0, [r4 + 24], 1 | |
17125 pinsrb m0, [r4 + 26], 0 | |
17126 pmaddubsw m4, m0, m6 | |
17127 pmulhrsw m4, m7 | |
17128 pslldq m2, 2 | |
17129 pinsrb m2, [r4 + 9], 1 | |
17130 pinsrb m2, [r4 + 11], 0 | |
17131 pmaddubsw m5, m2, m6 | |
17132 pmulhrsw m5, m7 | |
17133 packuswb m4, m5 | |
17134 movu [r0 + 1268 * 16], m4 | |
17135 pslldq m1, 2 | |
17136 pinsrw m1, [r3 + 2], 0 | |
17137 pmaddubsw m4, m1, m6 | |
17138 pmulhrsw m4, m7 | |
17139 pslldq m3, 2 | |
17140 pinsrw m3, [r3 + 10], 0 | |
17141 pmaddubsw m5, m3, m6 | |
17142 pmulhrsw m5, m7 | |
17143 packuswb m4, m5 | |
17144 movu [r0 + 1269 * 16], m4 | |
17145 | |
17146 ; mode 21 [row 27] | |
17147 movu m6, [r5 + 4 * 16] | |
17148 pmaddubsw m4, m0, m6 | |
17149 pmulhrsw m4, m7 | |
17150 pmaddubsw m5, m2, m6 | |
17151 pmulhrsw m5, m7 | |
17152 packuswb m4, m5 | |
17153 movu [r0 + 1270 * 16], m4 | |
17154 pmaddubsw m4, m1, m6 | |
17155 pmulhrsw m4, m7 | |
17156 pmaddubsw m5, m3, m6 | |
17157 pmulhrsw m5, m7 | |
17158 packuswb m4, m5 | |
17159 movu [r0 + 1271 * 16], m4 | |
17160 | |
17161 ; mode 21 [row 28] | |
17162 movu m6, [r5 + 19 * 16] | |
17163 pslldq m0, 2 | |
17164 pinsrb m0, [r4 + 26], 1 | |
17165 pinsrb m0, [r4 + 28], 0 | |
17166 pmaddubsw m4, m0, m6 | |
17167 pmulhrsw m4, m7 | |
17168 pslldq m2, 2 | |
17169 pinsrb m2, [r4 + 11], 1 | |
17170 pinsrb m2, [r4 + 13], 0 | |
17171 pmaddubsw m5, m2, m6 | |
17172 pmulhrsw m5, m7 | |
17173 packuswb m4, m5 | |
17174 movu [r0 + 1272 * 16], m4 | |
17175 pslldq m1, 2 | |
17176 pinsrw m1, [r3 + 1], 0 | |
17177 pmaddubsw m4, m1, m6 | |
17178 pmulhrsw m4, m7 | |
17179 pslldq m3, 2 | |
17180 pinsrw m3, [r3 + 9], 0 | |
17181 pmaddubsw m5, m3, m6 | |
17182 pmulhrsw m5, m7 | |
17183 packuswb m4, m5 | |
17184 movu [r0 + 1273 * 16], m4 | |
17185 | |
17186 ; mode 21 [row 29] | |
17187 movu m6, [r5 + 2 * 16] | |
17188 pmaddubsw m4, m0, m6 | |
17189 pmulhrsw m4, m7 | |
17190 pmaddubsw m5, m2, m6 | |
17191 pmulhrsw m5, m7 | |
17192 packuswb m4, m5 | |
17193 movu [r0 + 1274 * 16], m4 | |
17194 pmaddubsw m4, m1, m6 | |
17195 pmulhrsw m4, m7 | |
17196 pmaddubsw m5, m3, m6 | |
17197 pmulhrsw m5, m7 | |
17198 packuswb m4, m5 | |
17199 movu [r0 + 1275 * 16], m4 | |
17200 | |
17201 ; mode 21 [row 30] | |
17202 movu m6, [r5 + 17 * 16] | |
17203 pslldq m0, 2 | |
17204 pinsrb m0, [r4 + 28], 1 | |
17205 pinsrb m0, [r4 + 30], 0 | |
17206 pmaddubsw m4, m0, m6 | |
17207 pmulhrsw m4, m7 | |
17208 pslldq m2, 2 | |
17209 pinsrb m2, [r4 + 13], 1 | |
17210 pinsrb m2, [r4 + 15], 0 | |
17211 pmaddubsw m5, m2, m6 | |
17212 pmulhrsw m5, m7 | |
17213 packuswb m4, m5 | |
17214 movu [r0 + 1276 * 16], m4 | |
17215 pslldq m1, 2 | |
17216 pinsrw m1, [r3 + 0], 0 | |
17217 pmaddubsw m4, m1, m6 | |
17218 pmulhrsw m4, m7 | |
17219 pslldq m3, 2 | |
17220 pinsrw m3, [r3 + 8], 0 | |
17221 pmaddubsw m5, m3, m6 | |
17222 pmulhrsw m5, m7 | |
17223 packuswb m4, m5 | |
17224 movu [r0 + 1277 * 16], m4 | |
17225 | |
17226 ; mode 21 [row 31] | |
17227 pshufb m5, m0, [tab_S2] | |
17228 movh [r0 + 1278 * 16], m5 | |
17229 pshufb m5, m2, [tab_S2] | |
17230 movh [r0 + 1278 * 16 + 8], m5 | |
17231 pshufb m5, m1, [tab_S2] | |
17232 movh [r0 + 1279 * 16], m5 | |
17233 pshufb m5, m3, [tab_S2] | |
17234 movh [r0 + 1279 * 16 + 8], m5 | |
17235 | |
17236 ; mode 22 [row 0] | |
17237 movu m6, [r5 + 19 * 16] | |
17238 movu m0, [r3 ] | |
17239 movu m1, [r3 + 1 ] | |
17240 punpcklbw m0, m1 | |
17241 pmaddubsw m1, m0, m6 | |
17242 pmulhrsw m1, m7 | |
17243 movu m2, [r3 + 8] | |
17244 movu m3, [r3 + 9] | |
17245 punpcklbw m2, m3 | |
17246 pmaddubsw m3, m2, m6 | |
17247 pmulhrsw m3, m7 | |
17248 packuswb m1, m3 | |
17249 movu [r0 + 1280 * 16], m1 | |
17250 | |
17251 movu m1, [r3 + 16] | |
17252 movu m3, [r3 + 17] | |
17253 punpcklbw m1, m3 | |
17254 pmaddubsw m4, m1, m6 | |
17255 pmulhrsw m4, m7 | |
17256 movu m3, [r3 + 24] | |
17257 movu m5, [r3 + 25] | |
17258 punpcklbw m3, m5 | |
17259 pmaddubsw m5, m3, m6 | |
17260 pmulhrsw m5, m7 | |
17261 packuswb m4, m5 | |
17262 movu [r0 + 1281 * 16], m4 | |
17263 | |
17264 ; mode 22 [row 1] | |
17265 movu m6, [r5 + 6 * 16] | |
17266 pmaddubsw m4, m0, m6 | |
17267 pmulhrsw m4, m7 | |
17268 pmaddubsw m5, m2, m6 | |
17269 pmulhrsw m5, m7 | |
17270 packuswb m4, m5 | |
17271 movu [r0 + 1282 * 16], m4 | |
17272 pmaddubsw m4, m1, m6 | |
17273 pmulhrsw m4, m7 | |
17274 pmaddubsw m5, m3, m6 | |
17275 pmulhrsw m5, m7 | |
17276 packuswb m4, m5 | |
17277 movu [r0 + 1283 * 16], m4 | |
17278 | |
17279 ; mode 22 [row 2] | |
17280 movu m6, [r5 + 25 * 16] | |
17281 pslldq m0, 2 | |
17282 pinsrb m0, [r4 + 0], 1 | |
17283 pinsrb m0, [r4 + 2], 0 | |
17284 pmaddubsw m4, m0, m6 | |
17285 pmulhrsw m4, m7 | |
17286 pslldq m2, 2 | |
17287 pinsrw m2, [r3 + 7], 0 | |
17288 pmaddubsw m5, m2, m6 | |
17289 pmulhrsw m5, m7 | |
17290 packuswb m4, m5 | |
17291 movu [r0 + 1284 * 16], m4 | |
17292 pslldq m1, 2 | |
17293 pinsrw m1, [r3 + 15], 0 | |
17294 pmaddubsw m4, m1, m6 | |
17295 pmulhrsw m4, m7 | |
17296 pslldq m3, 2 | |
17297 pinsrw m3, [r3 + 23], 0 | |
17298 pmaddubsw m5, m3, m6 | |
17299 pmulhrsw m5, m7 | |
17300 packuswb m4, m5 | |
17301 movu [r0 + 1285 * 16], m4 | |
17302 | |
17303 ; mode 22 [row 3] | |
17304 movu m6, [r5 + 12 * 16] | |
17305 pmaddubsw m4, m0, m6 | |
17306 pmulhrsw m4, m7 | |
17307 pmaddubsw m5, m2, m6 | |
17308 pmulhrsw m5, m7 | |
17309 packuswb m4, m5 | |
17310 movu [r0 + 1286 * 16], m4 | |
17311 pmaddubsw m4, m1, m6 | |
17312 pmulhrsw m4, m7 | |
17313 pmaddubsw m5, m3, m6 | |
17314 pmulhrsw m5, m7 | |
17315 packuswb m4, m5 | |
17316 movu [r0 + 1287 * 16], m4 | |
17317 | |
17318 ; mode 22 [row 4] | |
17319 movu m6, [r5 + 31 * 16] | |
17320 pslldq m0, 2 | |
17321 pinsrb m0, [r4 + 2], 1 | |
17322 pinsrb m0, [r4 + 5], 0 | |
17323 pmaddubsw m4, m0, m6 | |
17324 pmulhrsw m4, m7 | |
17325 pslldq m2, 2 | |
17326 pinsrw m2, [r3 + 6], 0 | |
17327 pmaddubsw m5, m2, m6 | |
17328 pmulhrsw m5, m7 | |
17329 packuswb m4, m5 | |
17330 movu [r0 + 1288 * 16], m4 | |
17331 pslldq m1, 2 | |
17332 pinsrw m1, [r3 + 14], 0 | |
17333 pmaddubsw m4, m1, m6 | |
17334 pmulhrsw m4, m7 | |
17335 pslldq m3, 2 | |
17336 pinsrw m3, [r3 + 22], 0 | |
17337 pmaddubsw m5, m3, m6 | |
17338 pmulhrsw m5, m7 | |
17339 packuswb m4, m5 | |
17340 movu [r0 + 1289 * 16], m4 | |
17341 | |
17342 ; mode 22 [row 5] | |
17343 movu m6, [r5 + 18 * 16] | |
17344 pmaddubsw m4, m0, m6 | |
17345 pmulhrsw m4, m7 | |
17346 pmaddubsw m5, m2, m6 | |
17347 pmulhrsw m5, m7 | |
17348 packuswb m4, m5 | |
17349 movu [r0 + 1290 * 16], m4 | |
17350 pmaddubsw m4, m1, m6 | |
17351 pmulhrsw m4, m7 | |
17352 pmaddubsw m5, m3, m6 | |
17353 pmulhrsw m5, m7 | |
17354 packuswb m4, m5 | |
17355 movu [r0 + 1291 * 16], m4 | |
17356 | |
17357 ; mode 22 [row 6] | |
17358 movu m6, [r5 + 5 * 16] | |
17359 pmaddubsw m4, m0, m6 | |
17360 pmulhrsw m4, m7 | |
17361 pmaddubsw m5, m2, m6 | |
17362 pmulhrsw m5, m7 | |
17363 packuswb m4, m5 | |
17364 movu [r0 + 1292 * 16], m4 | |
17365 pmaddubsw m4, m1, m6 | |
17366 pmulhrsw m4, m7 | |
17367 pmaddubsw m5, m3, m6 | |
17368 pmulhrsw m5, m7 | |
17369 packuswb m4, m5 | |
17370 movu [r0 + 1293 * 16], m4 | |
17371 | |
17372 ; mode 22 [row 7] | |
17373 movu m6, [r5 + 24 * 16] | |
17374 pslldq m0, 2 | |
17375 pinsrb m0, [r4 + 5], 1 | |
17376 pinsrb m0, [r4 + 7], 0 | |
17377 pmaddubsw m4, m0, m6 | |
17378 pmulhrsw m4, m7 | |
17379 pslldq m2, 2 | |
17380 pinsrw m2, [r3 + 5], 0 | |
17381 pmaddubsw m5, m2, m6 | |
17382 pmulhrsw m5, m7 | |
17383 packuswb m4, m5 | |
17384 movu [r0 + 1294 * 16], m4 | |
17385 pslldq m1, 2 | |
17386 pinsrw m1, [r3 + 13], 0 | |
17387 pmaddubsw m4, m1, m6 | |
17388 pmulhrsw m4, m7 | |
17389 pslldq m3, 2 | |
17390 pinsrw m3, [r3 + 21], 0 | |
17391 pmaddubsw m5, m3, m6 | |
17392 pmulhrsw m5, m7 | |
17393 packuswb m4, m5 | |
17394 movu [r0 + 1295 * 16], m4 | |
17395 | |
17396 ; mode 22 [row 8] | |
17397 movu m6, [r5 + 11 * 16] | |
17398 pmaddubsw m4, m0, m6 | |
17399 pmulhrsw m4, m7 | |
17400 pmaddubsw m5, m2, m6 | |
17401 pmulhrsw m5, m7 | |
17402 packuswb m4, m5 | |
17403 movu [r0 + 1296 * 16], m4 | |
17404 pmaddubsw m4, m1, m6 | |
17405 pmulhrsw m4, m7 | |
17406 pmaddubsw m5, m3, m6 | |
17407 pmulhrsw m5, m7 | |
17408 packuswb m4, m5 | |
17409 movu [r0 + 1297 * 16], m4 | |
17410 | |
17411 ; mode 22 [row 9] | |
17412 movu m6, [r5 + 30 * 16] | |
17413 pslldq m0, 2 | |
17414 pinsrb m0, [r4 + 7], 1 | |
17415 pinsrb m0, [r4 + 10], 0 | |
17416 pmaddubsw m4, m0, m6 | |
17417 pmulhrsw m4, m7 | |
17418 pslldq m2, 2 | |
17419 pinsrw m2, [r3 + 4], 0 | |
17420 pmaddubsw m5, m2, m6 | |
17421 pmulhrsw m5, m7 | |
17422 packuswb m4, m5 | |
17423 movu [r0 + 1298 * 16], m4 | |
17424 pslldq m1, 2 | |
17425 pinsrw m1, [r3 + 12], 0 | |
17426 pmaddubsw m4, m1, m6 | |
17427 pmulhrsw m4, m7 | |
17428 pslldq m3, 2 | |
17429 pinsrw m3, [r3 + 20], 0 | |
17430 pmaddubsw m5, m3, m6 | |
17431 pmulhrsw m5, m7 | |
17432 packuswb m4, m5 | |
17433 movu [r0 + 1299 * 16], m4 | |
17434 | |
17435 ; mode 22 [row 10] | |
17436 movu m6, [r5 + 17 * 16] | |
17437 pmaddubsw m4, m0, m6 | |
17438 pmulhrsw m4, m7 | |
17439 pmaddubsw m5, m2, m6 | |
17440 pmulhrsw m5, m7 | |
17441 packuswb m4, m5 | |
17442 movu [r0 + 1300 * 16], m4 | |
17443 pmaddubsw m4, m1, m6 | |
17444 pmulhrsw m4, m7 | |
17445 pmaddubsw m5, m3, m6 | |
17446 pmulhrsw m5, m7 | |
17447 packuswb m4, m5 | |
17448 movu [r0 + 1301 * 16], m4 | |
17449 | |
17450 ; mode 22 [row 11] | |
17451 movu m6, [r5 + 4 * 16] | |
17452 pmaddubsw m4, m0, m6 | |
17453 pmulhrsw m4, m7 | |
17454 pmaddubsw m5, m2, m6 | |
17455 pmulhrsw m5, m7 | |
17456 packuswb m4, m5 | |
17457 movu [r0 + 1302 * 16], m4 | |
17458 pmaddubsw m4, m1, m6 | |
17459 pmulhrsw m4, m7 | |
17460 pmaddubsw m5, m3, m6 | |
17461 pmulhrsw m5, m7 | |
17462 packuswb m4, m5 | |
17463 movu [r0 + 1303 * 16], m4 | |
17464 | |
17465 ; mode 22 [row 12] | |
17466 movu m6, [r5 + 23 * 16] | |
17467 pslldq m0, 2 | |
17468 pinsrb m0, [r4 + 10], 1 | |
17469 pinsrb m0, [r4 + 12], 0 | |
17470 pmaddubsw m4, m0, m6 | |
17471 pmulhrsw m4, m7 | |
17472 pslldq m2, 2 | |
17473 pinsrw m2, [r3 + 3], 0 | |
17474 pmaddubsw m5, m2, m6 | |
17475 pmulhrsw m5, m7 | |
17476 packuswb m4, m5 | |
17477 movu [r0 + 1304 * 16], m4 | |
17478 pslldq m1, 2 | |
17479 pinsrw m1, [r3 + 11], 0 | |
17480 pmaddubsw m4, m1, m6 | |
17481 pmulhrsw m4, m7 | |
17482 pslldq m3, 2 | |
17483 pinsrw m3, [r3 + 19], 0 | |
17484 pmaddubsw m5, m3, m6 | |
17485 pmulhrsw m5, m7 | |
17486 packuswb m4, m5 | |
17487 movu [r0 + 1305 * 16], m4 | |
17488 | |
17489 ; mode 22 [row 13] | |
17490 movu m6, [r5 + 10 * 16] | |
17491 pmaddubsw m4, m0, m6 | |
17492 pmulhrsw m4, m7 | |
17493 pmaddubsw m5, m2, m6 | |
17494 pmulhrsw m5, m7 | |
17495 packuswb m4, m5 | |
17496 movu [r0 + 1306 * 16], m4 | |
17497 pmaddubsw m4, m1, m6 | |
17498 pmulhrsw m4, m7 | |
17499 pmaddubsw m5, m3, m6 | |
17500 pmulhrsw m5, m7 | |
17501 packuswb m4, m5 | |
17502 movu [r0 + 1307 * 16], m4 | |
17503 | |
17504 ; mode 22 [row 14] | |
17505 movu m6, [r5 + 29 * 16] | |
17506 pslldq m0, 2 | |
17507 pinsrb m0, [r4 + 12], 1 | |
17508 pinsrb m0, [r4 + 15], 0 | |
17509 pmaddubsw m4, m0, m6 | |
17510 pmulhrsw m4, m7 | |
17511 pslldq m2, 2 | |
17512 pinsrw m2, [r3 + 2], 0 | |
17513 pmaddubsw m5, m2, m6 | |
17514 pmulhrsw m5, m7 | |
17515 packuswb m4, m5 | |
17516 movu [r0 + 1308 * 16], m4 | |
17517 pslldq m1, 2 | |
17518 pinsrw m1, [r3 + 10], 0 | |
17519 pmaddubsw m4, m1, m6 | |
17520 pmulhrsw m4, m7 | |
17521 pslldq m3, 2 | |
17522 pinsrw m3, [r3 + 18], 0 | |
17523 pmaddubsw m5, m3, m6 | |
17524 pmulhrsw m5, m7 | |
17525 packuswb m4, m5 | |
17526 movu [r0 + 1309 * 16], m4 | |
17527 | |
17528 ; mode 22 [row 15] | |
17529 movu m6, [r5 + 16 * 16] | |
17530 pmaddubsw m4, m0, m6 | |
17531 pmulhrsw m4, m7 | |
17532 pmaddubsw m5, m2, m6 | |
17533 pmulhrsw m5, m7 | |
17534 packuswb m4, m5 | |
17535 movu [r0 + 1310 * 16], m4 | |
17536 pmaddubsw m4, m1, m6 | |
17537 pmulhrsw m4, m7 | |
17538 pmaddubsw m5, m3, m6 | |
17539 pmulhrsw m5, m7 | |
17540 packuswb m4, m5 | |
17541 movu [r0 + 1311 * 16], m4 | |
17542 | |
17543 ; mode 22 [row 16] | |
17544 movu m6, [r5 + 3 * 16] | |
17545 pmaddubsw m4, m0, m6 | |
17546 pmulhrsw m4, m7 | |
17547 pmaddubsw m5, m2, m6 | |
17548 pmulhrsw m5, m7 | |
17549 packuswb m4, m5 | |
17550 movu [r0 + 1312 * 16], m4 | |
17551 pmaddubsw m4, m1, m6 | |
17552 pmulhrsw m4, m7 | |
17553 pmaddubsw m5, m3, m6 | |
17554 pmulhrsw m5, m7 | |
17555 packuswb m4, m5 | |
17556 movu [r0 + 1313 * 16], m4 | |
17557 | |
17558 ; mode 22 [row 17] | |
17559 movu m6, [r5 + 22 * 16] | |
17560 pslldq m0, 2 | |
17561 pinsrb m0, [r4 + 15], 1 | |
17562 pinsrb m0, [r4 + 17], 0 | |
17563 pmaddubsw m4, m0, m6 | |
17564 pmulhrsw m4, m7 | |
17565 pslldq m2, 2 | |
17566 pinsrw m2, [r3 + 1], 0 | |
17567 pmaddubsw m5, m2, m6 | |
17568 pmulhrsw m5, m7 | |
17569 packuswb m4, m5 | |
17570 movu [r0 + 1314 * 16], m4 | |
17571 pslldq m1, 2 | |
17572 pinsrw m1, [r3 + 9], 0 | |
17573 pmaddubsw m4, m1, m6 | |
17574 pmulhrsw m4, m7 | |
17575 pslldq m3, 2 | |
17576 pinsrw m3, [r3 + 17], 0 | |
17577 pmaddubsw m5, m3, m6 | |
17578 pmulhrsw m5, m7 | |
17579 packuswb m4, m5 | |
17580 movu [r0 + 1315 * 16], m4 | |
17581 | |
17582 ; mode 22 [row 18] | |
17583 movu m6, [r5 + 9 * 16] | |
17584 pmaddubsw m4, m0, m6 | |
17585 pmulhrsw m4, m7 | |
17586 pmaddubsw m5, m2, m6 | |
17587 pmulhrsw m5, m7 | |
17588 packuswb m4, m5 | |
17589 movu [r0 + 1316 * 16], m4 | |
17590 pmaddubsw m4, m1, m6 | |
17591 pmulhrsw m4, m7 | |
17592 pmaddubsw m5, m3, m6 | |
17593 pmulhrsw m5, m7 | |
17594 packuswb m4, m5 | |
17595 movu [r0 + 1317 * 16], m4 | |
17596 | |
17597 ; mode 22 [row 19] | |
17598 movu m6, [r5 + 28 * 16] | |
17599 pslldq m0, 2 | |
17600 pinsrb m0, [r4 + 17], 1 | |
17601 pinsrb m0, [r4 + 20], 0 | |
17602 pmaddubsw m4, m0, m6 | |
17603 pmulhrsw m4, m7 | |
17604 pslldq m2, 2 | |
17605 pinsrw m2, [r3 + 0], 0 | |
17606 pmaddubsw m5, m2, m6 | |
17607 pmulhrsw m5, m7 | |
17608 packuswb m4, m5 | |
17609 movu [r0 + 1318 * 16], m4 | |
17610 pslldq m1, 2 | |
17611 pinsrw m1, [r3 + 8], 0 | |
17612 pmaddubsw m4, m1, m6 | |
17613 pmulhrsw m4, m7 | |
17614 pslldq m3, 2 | |
17615 pinsrw m3, [r3 + 16], 0 | |
17616 pmaddubsw m5, m3, m6 | |
17617 pmulhrsw m5, m7 | |
17618 packuswb m4, m5 | |
17619 movu [r0 + 1319 * 16], m4 | |
17620 | |
17621 ; mode 22 [row 20] | |
17622 movu m6, [r5 + 15 * 16] | |
17623 pmaddubsw m4, m0, m6 | |
17624 pmulhrsw m4, m7 | |
17625 pmaddubsw m5, m2, m6 | |
17626 pmulhrsw m5, m7 | |
17627 packuswb m4, m5 | |
17628 movu [r0 + 1320 * 16], m4 | |
17629 pmaddubsw m4, m1, m6 | |
17630 pmulhrsw m4, m7 | |
17631 pmaddubsw m5, m3, m6 | |
17632 pmulhrsw m5, m7 | |
17633 packuswb m4, m5 | |
17634 movu [r0 + 1321 * 16], m4 | |
17635 | |
17636 ; mode 22 [row 21] | |
17637 movu m6, [r5 + 2 * 16] | |
17638 pmaddubsw m4, m0, m6 | |
17639 pmulhrsw m4, m7 | |
17640 pmaddubsw m5, m2, m6 | |
17641 pmulhrsw m5, m7 | |
17642 packuswb m4, m5 | |
17643 movu [r0 + 1322 * 16], m4 | |
17644 pmaddubsw m4, m1, m6 | |
17645 pmulhrsw m4, m7 | |
17646 pmaddubsw m5, m3, m6 | |
17647 pmulhrsw m5, m7 | |
17648 packuswb m4, m5 | |
17649 movu [r0 + 1323 * 16], m4 | |
17650 | |
17651 ; mode 22 [row 22] | |
17652 movu m6, [r5 + 21 * 16] | |
17653 pslldq m0, 2 | |
17654 pinsrb m0, [r4 + 20], 1 | |
17655 pinsrb m0, [r4 + 22], 0 | |
17656 pmaddubsw m4, m0, m6 | |
17657 pmulhrsw m4, m7 | |
17658 pslldq m2, 2 | |
17659 pinsrb m2, [r4 + 0], 1 | |
17660 pinsrb m2, [r4 + 2], 0 | |
17661 pmaddubsw m5, m2, m6 | |
17662 pmulhrsw m5, m7 | |
17663 packuswb m4, m5 | |
17664 movu [r0 + 1324 * 16], m4 | |
17665 pslldq m1, 2 | |
17666 pinsrw m1, [r3 + 7], 0 | |
17667 pmaddubsw m4, m1, m6 | |
17668 pmulhrsw m4, m7 | |
17669 pslldq m3, 2 | |
17670 pinsrw m3, [r3 + 15], 0 | |
17671 pmaddubsw m5, m3, m6 | |
17672 pmulhrsw m5, m7 | |
17673 packuswb m4, m5 | |
17674 movu [r0 + 1325 * 16], m4 | |
17675 | |
17676 ; mode 22 [row 23] | |
17677 movu m6, [r5 + 8 * 16] | |
17678 pmaddubsw m4, m0, m6 | |
17679 pmulhrsw m4, m7 | |
17680 pmaddubsw m5, m2, m6 | |
17681 pmulhrsw m5, m7 | |
17682 packuswb m4, m5 | |
17683 movu [r0 + 1326 * 16], m4 | |
17684 pmaddubsw m4, m1, m6 | |
17685 pmulhrsw m4, m7 | |
17686 pmaddubsw m5, m3, m6 | |
17687 pmulhrsw m5, m7 | |
17688 packuswb m4, m5 | |
17689 movu [r0 + 1327 * 16], m4 | |
17690 | |
17691 ; mode 22 [row 24] | |
17692 movu m6, [r5 + 27 * 16] | |
17693 pslldq m0, 2 | |
17694 pinsrb m0, [r4 + 22], 1 | |
17695 pinsrb m0, [r4 + 25], 0 | |
17696 pmaddubsw m4, m0, m6 | |
17697 pmulhrsw m4, m7 | |
17698 pslldq m2, 2 | |
17699 pinsrb m2, [r4 + 2], 1 | |
17700 pinsrb m2, [r4 + 5], 0 | |
17701 pmaddubsw m5, m2, m6 | |
17702 pmulhrsw m5, m7 | |
17703 packuswb m4, m5 | |
17704 movu [r0 + 1328 * 16], m4 | |
17705 pslldq m1, 2 | |
17706 pinsrw m1, [r3 + 6], 0 | |
17707 pmaddubsw m4, m1, m6 | |
17708 pmulhrsw m4, m7 | |
17709 pslldq m3, 2 | |
17710 pinsrw m3, [r3 + 14], 0 | |
17711 pmaddubsw m5, m3, m6 | |
17712 pmulhrsw m5, m7 | |
17713 packuswb m4, m5 | |
17714 movu [r0 + 1329 * 16], m4 | |
17715 | |
17716 ; mode 22 [row 25] | |
17717 movu m6, [r5 + 14 * 16] | |
17718 pmaddubsw m4, m0, m6 | |
17719 pmulhrsw m4, m7 | |
17720 pmaddubsw m5, m2, m6 | |
17721 pmulhrsw m5, m7 | |
17722 packuswb m4, m5 | |
17723 movu [r0 + 1330 * 16], m4 | |
17724 pmaddubsw m4, m1, m6 | |
17725 pmulhrsw m4, m7 | |
17726 pmaddubsw m5, m3, m6 | |
17727 pmulhrsw m5, m7 | |
17728 packuswb m4, m5 | |
17729 movu [r0 + 1331 * 16], m4 | |
17730 | |
17731 ; mode 22 [row 26] | |
17732 movu m6, [r5 + 1 * 16] | |
17733 pmaddubsw m4, m0, m6 | |
17734 pmulhrsw m4, m7 | |
17735 pmaddubsw m5, m2, m6 | |
17736 pmulhrsw m5, m7 | |
17737 packuswb m4, m5 | |
17738 movu [r0 + 1332 * 16], m4 | |
17739 pmaddubsw m4, m1, m6 | |
17740 pmulhrsw m4, m7 | |
17741 pmaddubsw m5, m3, m6 | |
17742 pmulhrsw m5, m7 | |
17743 packuswb m4, m5 | |
17744 movu [r0 + 1333 * 16], m4 | |
17745 | |
17746 ; mode 22 [row 27] | |
17747 movu m6, [r5 + 20 * 16] | |
17748 pslldq m0, 2 | |
17749 pinsrb m0, [r4 + 25], 1 | |
17750 pinsrb m0, [r4 + 27], 0 | |
17751 pmaddubsw m4, m0, m6 | |
17752 pmulhrsw m4, m7 | |
17753 pslldq m2, 2 | |
17754 pinsrb m2, [r4 + 5], 1 | |
17755 pinsrb m2, [r4 + 7], 0 | |
17756 pmaddubsw m5, m2, m6 | |
17757 pmulhrsw m5, m7 | |
17758 packuswb m4, m5 | |
17759 movu [r0 + 1334 * 16], m4 | |
17760 pslldq m1, 2 | |
17761 pinsrw m1, [r3 + 5], 0 | |
17762 pmaddubsw m4, m1, m6 | |
17763 pmulhrsw m4, m7 | |
17764 pslldq m3, 2 | |
17765 pinsrw m3, [r3 + 13], 0 | |
17766 pmaddubsw m5, m3, m6 | |
17767 pmulhrsw m5, m7 | |
17768 packuswb m4, m5 | |
17769 movu [r0 + 1335 * 16], m4 | |
17770 | |
17771 ; mode 22 [row 28] | |
17772 movu m6, [r5 + 7 * 16] | |
17773 pmaddubsw m4, m0, m6 | |
17774 pmulhrsw m4, m7 | |
17775 pmaddubsw m5, m2, m6 | |
17776 pmulhrsw m5, m7 | |
17777 packuswb m4, m5 | |
17778 movu [r0 + 1336 * 16], m4 | |
17779 pmaddubsw m4, m1, m6 | |
17780 pmulhrsw m4, m7 | |
17781 pmaddubsw m5, m3, m6 | |
17782 pmulhrsw m5, m7 | |
17783 packuswb m4, m5 | |
17784 movu [r0 + 1337 * 16], m4 | |
17785 | |
17786 ; mode 22 [row 29] | |
17787 movu m6, [r5 + 26 * 16] | |
17788 pslldq m0, 2 | |
17789 pinsrb m0, [r4 + 27], 1 | |
17790 pinsrb m0, [r4 + 30], 0 | |
17791 pmaddubsw m4, m0, m6 | |
17792 pmulhrsw m4, m7 | |
17793 pslldq m2, 2 | |
17794 pinsrb m2, [r4 + 7], 1 | |
17795 pinsrb m2, [r4 + 10], 0 | |
17796 pmaddubsw m5, m2, m6 | |
17797 pmulhrsw m5, m7 | |
17798 packuswb m4, m5 | |
17799 movu [r0 + 1338 * 16], m4 | |
17800 pslldq m1, 2 | |
17801 pinsrw m1, [r3 + 4], 0 | |
17802 pmaddubsw m4, m1, m6 | |
17803 pmulhrsw m4, m7 | |
17804 pslldq m3, 2 | |
17805 pinsrw m3, [r3 + 12], 0 | |
17806 pmaddubsw m5, m3, m6 | |
17807 pmulhrsw m5, m7 | |
17808 packuswb m4, m5 | |
17809 movu [r0 + 1339 * 16], m4 | |
17810 | |
17811 ; mode 22 [row 30] | |
17812 movu m6, [r5 + 13 * 16] | |
17813 pmaddubsw m4, m0, m6 | |
17814 pmulhrsw m4, m7 | |
17815 pmaddubsw m5, m2, m6 | |
17816 pmulhrsw m5, m7 | |
17817 packuswb m4, m5 | |
17818 movu [r0 + 1340 * 16], m4 | |
17819 pmaddubsw m4, m1, m6 | |
17820 pmulhrsw m4, m7 | |
17821 pmaddubsw m5, m3, m6 | |
17822 pmulhrsw m5, m7 | |
17823 packuswb m4, m5 | |
17824 movu [r0 + 1341 * 16], m4 | |
17825 | |
17826 ; mode 22 [row 31] | |
17827 pshufb m5, m0, [tab_S2] | |
17828 movh [r0 + 1342 * 16], m5 | |
17829 pshufb m5, m2, [tab_S2] | |
17830 movh [r0 + 1342 * 16 + 8], m5 | |
17831 pshufb m5, m1, [tab_S2] | |
17832 movh [r0 + 1343 * 16], m5 | |
17833 pshufb m5, m3, [tab_S2] | |
17834 movh [r0 + 1343 * 16 + 8], m5 | |
17835 | |
17836 ; mode 23 [row 0] | |
17837 movu m6, [r5 + 23 * 16] | |
17838 movu m0, [r3 ] | |
17839 movu m1, [r3 + 1 ] | |
17840 punpcklbw m0, m1 | |
17841 pmaddubsw m1, m0, m6 | |
17842 pmulhrsw m1, m7 | |
17843 movu m2, [r3 + 8] | |
17844 movu m3, [r3 + 9] | |
17845 punpcklbw m2, m3 | |
17846 pmaddubsw m3, m2, m6 | |
17847 pmulhrsw m3, m7 | |
17848 packuswb m1, m3 | |
17849 movu [r0 + 1344 * 16], m1 | |
17850 | |
17851 movu m1, [r3 + 16] | |
17852 movu m3, [r3 + 17] | |
17853 punpcklbw m1, m3 | |
17854 pmaddubsw m4, m1, m6 | |
17855 pmulhrsw m4, m7 | |
17856 movu m3, [r3 + 24] | |
17857 movu m5, [r3 + 25] | |
17858 punpcklbw m3, m5 | |
17859 pmaddubsw m5, m3, m6 | |
17860 pmulhrsw m5, m7 | |
17861 packuswb m4, m5 | |
17862 movu [r0 + 1345 * 16], m4 | |
17863 | |
17864 ; mode 23 [row 1] | |
17865 movu m6, [r5 + 14 * 16] | |
17866 pmaddubsw m4, m0, m6 | |
17867 pmulhrsw m4, m7 | |
17868 pmaddubsw m5, m2, m6 | |
17869 pmulhrsw m5, m7 | |
17870 packuswb m4, m5 | |
17871 movu [r0 + 1346 * 16], m4 | |
17872 pmaddubsw m4, m1, m6 | |
17873 pmulhrsw m4, m7 | |
17874 pmaddubsw m5, m3, m6 | |
17875 pmulhrsw m5, m7 | |
17876 packuswb m4, m5 | |
17877 movu [r0 + 1347 * 16], m4 | |
17878 | |
17879 ; mode 23 [row 2] | |
17880 movu m6, [r5 + 5 * 16] | |
17881 pmaddubsw m4, m0, m6 | |
17882 pmulhrsw m4, m7 | |
17883 pmaddubsw m5, m2, m6 | |
17884 pmulhrsw m5, m7 | |
17885 packuswb m4, m5 | |
17886 movu [r0 + 1348 * 16], m4 | |
17887 pmaddubsw m4, m1, m6 | |
17888 pmulhrsw m4, m7 | |
17889 pmaddubsw m5, m3, m6 | |
17890 pmulhrsw m5, m7 | |
17891 packuswb m4, m5 | |
17892 movu [r0 + 1349 * 16], m4 | |
17893 | |
17894 ; mode 23 [row 3] | |
17895 movu m6, [r5 + 28 * 16] | |
17896 pslldq m0, 2 | |
17897 pinsrb m0, [r4 + 0], 1 | |
17898 pinsrb m0, [r4 + 4], 0 | |
17899 pmaddubsw m4, m0, m6 | |
17900 pmulhrsw m4, m7 | |
17901 pslldq m2, 2 | |
17902 pinsrw m2, [r3 + 7], 0 | |
17903 pmaddubsw m5, m2, m6 | |
17904 pmulhrsw m5, m7 | |
17905 packuswb m4, m5 | |
17906 movu [r0 + 1350 * 16], m4 | |
17907 pslldq m1, 2 | |
17908 pinsrw m1, [r3 + 15], 0 | |
17909 pmaddubsw m4, m1, m6 | |
17910 pmulhrsw m4, m7 | |
17911 pslldq m3, 2 | |
17912 pinsrw m3, [r3 + 23], 0 | |
17913 pmaddubsw m5, m3, m6 | |
17914 pmulhrsw m5, m7 | |
17915 packuswb m4, m5 | |
17916 movu [r0 + 1351 * 16], m4 | |
17917 | |
17918 ; mode 23 [row 4] | |
17919 movu m6, [r5 + 19 * 16] | |
17920 pmaddubsw m4, m0, m6 | |
17921 pmulhrsw m4, m7 | |
17922 pmaddubsw m5, m2, m6 | |
17923 pmulhrsw m5, m7 | |
17924 packuswb m4, m5 | |
17925 movu [r0 + 1352 * 16], m4 | |
17926 pmaddubsw m4, m1, m6 | |
17927 pmulhrsw m4, m7 | |
17928 pmaddubsw m5, m3, m6 | |
17929 pmulhrsw m5, m7 | |
17930 packuswb m4, m5 | |
17931 movu [r0 + 1353 * 16], m4 | |
17932 | |
17933 ; mode 23 [row 5] | |
17934 movu m6, [r5 + 10 * 16] | |
17935 pmaddubsw m4, m0, m6 | |
17936 pmulhrsw m4, m7 | |
17937 pmaddubsw m5, m2, m6 | |
17938 pmulhrsw m5, m7 | |
17939 packuswb m4, m5 | |
17940 movu [r0 + 1354 * 16], m4 | |
17941 pmaddubsw m4, m1, m6 | |
17942 pmulhrsw m4, m7 | |
17943 pmaddubsw m5, m3, m6 | |
17944 pmulhrsw m5, m7 | |
17945 packuswb m4, m5 | |
17946 movu [r0 + 1355 * 16], m4 | |
17947 | |
17948 ; mode 23 [row 6] | |
17949 movu m6, [r5 + 1 * 16] | |
17950 pmaddubsw m4, m0, m6 | |
17951 pmulhrsw m4, m7 | |
17952 pmaddubsw m5, m2, m6 | |
17953 pmulhrsw m5, m7 | |
17954 packuswb m4, m5 | |
17955 movu [r0 + 1356 * 16], m4 | |
17956 pmaddubsw m4, m1, m6 | |
17957 pmulhrsw m4, m7 | |
17958 pmaddubsw m5, m3, m6 | |
17959 pmulhrsw m5, m7 | |
17960 packuswb m4, m5 | |
17961 movu [r0 + 1357 * 16], m4 | |
17962 | |
17963 ; mode 23 [row 7] | |
17964 movu m6, [r5 + 24 * 16] | |
17965 pslldq m0, 2 | |
17966 pinsrb m0, [r4 + 4], 1 | |
17967 pinsrb m0, [r4 + 7], 0 | |
17968 pmaddubsw m4, m0, m6 | |
17969 pmulhrsw m4, m7 | |
17970 pslldq m2, 2 | |
17971 pinsrw m2, [r3 + 6], 0 | |
17972 pmaddubsw m5, m2, m6 | |
17973 pmulhrsw m5, m7 | |
17974 packuswb m4, m5 | |
17975 movu [r0 + 1358 * 16], m4 | |
17976 pslldq m1, 2 | |
17977 pinsrw m1, [r3 + 14], 0 | |
17978 pmaddubsw m4, m1, m6 | |
17979 pmulhrsw m4, m7 | |
17980 pslldq m3, 2 | |
17981 pinsrw m3, [r3 + 22], 0 | |
17982 pmaddubsw m5, m3, m6 | |
17983 pmulhrsw m5, m7 | |
17984 packuswb m4, m5 | |
17985 movu [r0 + 1359 * 16], m4 | |
17986 | |
17987 ; mode 23 [row 8] | |
17988 movu m6, [r5 + 15 * 16] | |
17989 pmaddubsw m4, m0, m6 | |
17990 pmulhrsw m4, m7 | |
17991 pmaddubsw m5, m2, m6 | |
17992 pmulhrsw m5, m7 | |
17993 packuswb m4, m5 | |
17994 movu [r0 + 1360 * 16], m4 | |
17995 pmaddubsw m4, m1, m6 | |
17996 pmulhrsw m4, m7 | |
17997 pmaddubsw m5, m3, m6 | |
17998 pmulhrsw m5, m7 | |
17999 packuswb m4, m5 | |
18000 movu [r0 + 1361 * 16], m4 | |
18001 | |
18002 ; mode 23 [row 9] | |
18003 movu m6, [r5 + 6 * 16] | |
18004 pmaddubsw m4, m0, m6 | |
18005 pmulhrsw m4, m7 | |
18006 pmaddubsw m5, m2, m6 | |
18007 pmulhrsw m5, m7 | |
18008 packuswb m4, m5 | |
18009 movu [r0 + 1362 * 16], m4 | |
18010 pmaddubsw m4, m1, m6 | |
18011 pmulhrsw m4, m7 | |
18012 pmaddubsw m5, m3, m6 | |
18013 pmulhrsw m5, m7 | |
18014 packuswb m4, m5 | |
18015 movu [r0 + 1363 * 16], m4 | |
18016 | |
18017 ; mode 23 [row 10] | |
18018 movu m6, [r5 + 29 * 16] | |
18019 pslldq m0, 2 | |
18020 pinsrb m0, [r4 + 7], 1 | |
18021 pinsrb m0, [r4 + 11], 0 | |
18022 pmaddubsw m4, m0, m6 | |
18023 pmulhrsw m4, m7 | |
18024 pslldq m2, 2 | |
18025 pinsrw m2, [r3 + 5], 0 | |
18026 pmaddubsw m5, m2, m6 | |
18027 pmulhrsw m5, m7 | |
18028 packuswb m4, m5 | |
18029 movu [r0 + 1364 * 16], m4 | |
18030 pslldq m1, 2 | |
18031 pinsrw m1, [r3 + 13], 0 | |
18032 pmaddubsw m4, m1, m6 | |
18033 pmulhrsw m4, m7 | |
18034 pslldq m3, 2 | |
18035 pinsrw m3, [r3 + 21], 0 | |
18036 pmaddubsw m5, m3, m6 | |
18037 pmulhrsw m5, m7 | |
18038 packuswb m4, m5 | |
18039 movu [r0 + 1365 * 16], m4 | |
18040 | |
18041 ; mode 23 [row 11] | |
18042 movu m6, [r5 + 20 * 16] | |
18043 pmaddubsw m4, m0, m6 | |
18044 pmulhrsw m4, m7 | |
18045 pmaddubsw m5, m2, m6 | |
18046 pmulhrsw m5, m7 | |
18047 packuswb m4, m5 | |
18048 movu [r0 + 1366 * 16], m4 | |
18049 pmaddubsw m4, m1, m6 | |
18050 pmulhrsw m4, m7 | |
18051 pmaddubsw m5, m3, m6 | |
18052 pmulhrsw m5, m7 | |
18053 packuswb m4, m5 | |
18054 movu [r0 + 1367 * 16], m4 | |
18055 | |
18056 ; mode 23 [row 12] | |
18057 movu m6, [r5 + 11 * 16] | |
18058 pmaddubsw m4, m0, m6 | |
18059 pmulhrsw m4, m7 | |
18060 pmaddubsw m5, m2, m6 | |
18061 pmulhrsw m5, m7 | |
18062 packuswb m4, m5 | |
18063 movu [r0 + 1368 * 16], m4 | |
18064 pmaddubsw m4, m1, m6 | |
18065 pmulhrsw m4, m7 | |
18066 pmaddubsw m5, m3, m6 | |
18067 pmulhrsw m5, m7 | |
18068 packuswb m4, m5 | |
18069 movu [r0 + 1369 * 16], m4 | |
18070 | |
18071 ; mode 23 [row 13] | |
18072 movu m6, [r5 + 2 * 16] | |
18073 pmaddubsw m4, m0, m6 | |
18074 pmulhrsw m4, m7 | |
18075 pmaddubsw m5, m2, m6 | |
18076 pmulhrsw m5, m7 | |
18077 packuswb m4, m5 | |
18078 movu [r0 + 1370 * 16], m4 | |
18079 pmaddubsw m4, m1, m6 | |
18080 pmulhrsw m4, m7 | |
18081 pmaddubsw m5, m3, m6 | |
18082 pmulhrsw m5, m7 | |
18083 packuswb m4, m5 | |
18084 movu [r0 + 1371 * 16], m4 | |
18085 | |
18086 ; mode 23 [row 14] | |
18087 movu m6, [r5 + 25 * 16] | |
18088 pslldq m0, 2 | |
18089 pinsrb m0, [r4 + 11], 1 | |
18090 pinsrb m0, [r4 + 14], 0 | |
18091 pmaddubsw m4, m0, m6 | |
18092 pmulhrsw m4, m7 | |
18093 pslldq m2, 2 | |
18094 pinsrw m2, [r3 + 4], 0 | |
18095 pmaddubsw m5, m2, m6 | |
18096 pmulhrsw m5, m7 | |
18097 packuswb m4, m5 | |
18098 movu [r0 + 1372 * 16], m4 | |
18099 pslldq m1, 2 | |
18100 pinsrw m1, [r3 + 12], 0 | |
18101 pmaddubsw m4, m1, m6 | |
18102 pmulhrsw m4, m7 | |
18103 pslldq m3, 2 | |
18104 pinsrw m3, [r3 + 20], 0 | |
18105 pmaddubsw m5, m3, m6 | |
18106 pmulhrsw m5, m7 | |
18107 packuswb m4, m5 | |
18108 movu [r0 + 1373 * 16], m4 | |
18109 | |
18110 ; mode 23 [row 15] | |
18111 movu m6, [r5 + 16 * 16] | |
18112 pmaddubsw m4, m0, m6 | |
18113 pmulhrsw m4, m7 | |
18114 pmaddubsw m5, m2, m6 | |
18115 pmulhrsw m5, m7 | |
18116 packuswb m4, m5 | |
18117 movu [r0 + 1374 * 16], m4 | |
18118 pmaddubsw m4, m1, m6 | |
18119 pmulhrsw m4, m7 | |
18120 pmaddubsw m5, m3, m6 | |
18121 pmulhrsw m5, m7 | |
18122 packuswb m4, m5 | |
18123 movu [r0 + 1375 * 16], m4 | |
18124 | |
18125 ; mode 23 [row 16] | |
18126 movu m6, [r5 + 7 * 16] | |
18127 pmaddubsw m4, m0, m6 | |
18128 pmulhrsw m4, m7 | |
18129 pmaddubsw m5, m2, m6 | |
18130 pmulhrsw m5, m7 | |
18131 packuswb m4, m5 | |
18132 movu [r0 + 1376 * 16], m4 | |
18133 pmaddubsw m4, m1, m6 | |
18134 pmulhrsw m4, m7 | |
18135 pmaddubsw m5, m3, m6 | |
18136 pmulhrsw m5, m7 | |
18137 packuswb m4, m5 | |
18138 movu [r0 + 1377 * 16], m4 | |
18139 | |
18140 ; mode 23 [row 17] | |
18141 movu m6, [r5 + 30 * 16] | |
18142 pslldq m0, 2 | |
18143 pinsrb m0, [r4 + 14], 1 | |
18144 pinsrb m0, [r4 + 18], 0 | |
18145 pmaddubsw m4, m0, m6 | |
18146 pmulhrsw m4, m7 | |
18147 pslldq m2, 2 | |
18148 pinsrw m2, [r3 + 3], 0 | |
18149 pmaddubsw m5, m2, m6 | |
18150 pmulhrsw m5, m7 | |
18151 packuswb m4, m5 | |
18152 movu [r0 + 1378 * 16], m4 | |
18153 pslldq m1, 2 | |
18154 pinsrw m1, [r3 + 11], 0 | |
18155 pmaddubsw m4, m1, m6 | |
18156 pmulhrsw m4, m7 | |
18157 pslldq m3, 2 | |
18158 pinsrw m3, [r3 + 19], 0 | |
18159 pmaddubsw m5, m3, m6 | |
18160 pmulhrsw m5, m7 | |
18161 packuswb m4, m5 | |
18162 movu [r0 + 1379 * 16], m4 | |
18163 | |
18164 ; mode 23 [row 18] | |
18165 movu m6, [r5 + 21 * 16] | |
18166 pmaddubsw m4, m0, m6 | |
18167 pmulhrsw m4, m7 | |
18168 pmaddubsw m5, m2, m6 | |
18169 pmulhrsw m5, m7 | |
18170 packuswb m4, m5 | |
18171 movu [r0 + 1380 * 16], m4 | |
18172 pmaddubsw m4, m1, m6 | |
18173 pmulhrsw m4, m7 | |
18174 pmaddubsw m5, m3, m6 | |
18175 pmulhrsw m5, m7 | |
18176 packuswb m4, m5 | |
18177 movu [r0 + 1381 * 16], m4 | |
18178 | |
18179 ; mode 23 [row 19] | |
18180 movu m6, [r5 + 12 * 16] | |
18181 pmaddubsw m4, m0, m6 | |
18182 pmulhrsw m4, m7 | |
18183 pmaddubsw m5, m2, m6 | |
18184 pmulhrsw m5, m7 | |
18185 packuswb m4, m5 | |
18186 movu [r0 + 1382 * 16], m4 | |
18187 pmaddubsw m4, m1, m6 | |
18188 pmulhrsw m4, m7 | |
18189 pmaddubsw m5, m3, m6 | |
18190 pmulhrsw m5, m7 | |
18191 packuswb m4, m5 | |
18192 movu [r0 + 1383 * 16], m4 | |
18193 | |
18194 ; mode 23 [row 20] | |
18195 movu m6, [r5 + 3 * 16] | |
18196 pmaddubsw m4, m0, m6 | |
18197 pmulhrsw m4, m7 | |
18198 pmaddubsw m5, m2, m6 | |
18199 pmulhrsw m5, m7 | |
18200 packuswb m4, m5 | |
18201 movu [r0 + 1384 * 16], m4 | |
18202 pmaddubsw m4, m1, m6 | |
18203 pmulhrsw m4, m7 | |
18204 pmaddubsw m5, m3, m6 | |
18205 pmulhrsw m5, m7 | |
18206 packuswb m4, m5 | |
18207 movu [r0 + 1385 * 16], m4 | |
18208 | |
18209 ; mode 23 [row 21] | |
18210 movu m6, [r5 + 26 * 16] | |
18211 pslldq m0, 2 | |
18212 pinsrb m0, [r4 + 18], 1 | |
18213 pinsrb m0, [r4 + 21], 0 | |
18214 pmaddubsw m4, m0, m6 | |
18215 pmulhrsw m4, m7 | |
18216 pslldq m2, 2 | |
18217 pinsrw m2, [r3 + 2], 0 | |
18218 pmaddubsw m5, m2, m6 | |
18219 pmulhrsw m5, m7 | |
18220 packuswb m4, m5 | |
18221 movu [r0 + 1386 * 16], m4 | |
18222 pslldq m1, 2 | |
18223 pinsrw m1, [r3 + 10], 0 | |
18224 pmaddubsw m4, m1, m6 | |
18225 pmulhrsw m4, m7 | |
18226 pslldq m3, 2 | |
18227 pinsrw m3, [r3 + 18], 0 | |
18228 pmaddubsw m5, m3, m6 | |
18229 pmulhrsw m5, m7 | |
18230 packuswb m4, m5 | |
18231 movu [r0 + 1387 * 16], m4 | |
18232 | |
18233 ; mode 23 [row 22] | |
18234 movu m6, [r5 + 17 * 16] | |
18235 pmaddubsw m4, m0, m6 | |
18236 pmulhrsw m4, m7 | |
18237 pmaddubsw m5, m2, m6 | |
18238 pmulhrsw m5, m7 | |
18239 packuswb m4, m5 | |
18240 movu [r0 + 1388 * 16], m4 | |
18241 pmaddubsw m4, m1, m6 | |
18242 pmulhrsw m4, m7 | |
18243 pmaddubsw m5, m3, m6 | |
18244 pmulhrsw m5, m7 | |
18245 packuswb m4, m5 | |
18246 movu [r0 + 1389 * 16], m4 | |
18247 | |
18248 ; mode 23 [row 23] | |
18249 movu m6, [r5 + 8 * 16] | |
18250 pmaddubsw m4, m0, m6 | |
18251 pmulhrsw m4, m7 | |
18252 pmaddubsw m5, m2, m6 | |
18253 pmulhrsw m5, m7 | |
18254 packuswb m4, m5 | |
18255 movu [r0 + 1390 * 16], m4 | |
18256 pmaddubsw m4, m1, m6 | |
18257 pmulhrsw m4, m7 | |
18258 pmaddubsw m5, m3, m6 | |
18259 pmulhrsw m5, m7 | |
18260 packuswb m4, m5 | |
18261 movu [r0 + 1391 * 16], m4 | |
18262 | |
18263 ; mode 23 [row 24] | |
18264 movu m6, [r5 + 31 * 16] | |
18265 pslldq m0, 2 | |
18266 pinsrb m0, [r4 + 21], 1 | |
18267 pinsrb m0, [r4 + 25], 0 | |
18268 pmaddubsw m4, m0, m6 | |
18269 pmulhrsw m4, m7 | |
18270 pslldq m2, 2 | |
18271 pinsrw m2, [r3 + 1], 0 | |
18272 pmaddubsw m5, m2, m6 | |
18273 pmulhrsw m5, m7 | |
18274 packuswb m4, m5 | |
18275 movu [r0 + 1392 * 16], m4 | |
18276 pslldq m1, 2 | |
18277 pinsrw m1, [r3 + 9], 0 | |
18278 pmaddubsw m4, m1, m6 | |
18279 pmulhrsw m4, m7 | |
18280 pslldq m3, 2 | |
18281 pinsrw m3, [r3 + 17], 0 | |
18282 pmaddubsw m5, m3, m6 | |
18283 pmulhrsw m5, m7 | |
18284 packuswb m4, m5 | |
18285 movu [r0 + 1393 * 16], m4 | |
18286 | |
18287 ; mode 23 [row 25] | |
18288 movu m6, [r5 + 22 * 16] | |
18289 pmaddubsw m4, m0, m6 | |
18290 pmulhrsw m4, m7 | |
18291 pmaddubsw m5, m2, m6 | |
18292 pmulhrsw m5, m7 | |
18293 packuswb m4, m5 | |
18294 movu [r0 + 1394 * 16], m4 | |
18295 pmaddubsw m4, m1, m6 | |
18296 pmulhrsw m4, m7 | |
18297 pmaddubsw m5, m3, m6 | |
18298 pmulhrsw m5, m7 | |
18299 packuswb m4, m5 | |
18300 movu [r0 + 1395 * 16], m4 | |
18301 | |
18302 ; mode 23 [row 26] | |
18303 movu m6, [r5 + 13 * 16] | |
18304 pmaddubsw m4, m0, m6 | |
18305 pmulhrsw m4, m7 | |
18306 pmaddubsw m5, m2, m6 | |
18307 pmulhrsw m5, m7 | |
18308 packuswb m4, m5 | |
18309 movu [r0 + 1396 * 16], m4 | |
18310 pmaddubsw m4, m1, m6 | |
18311 pmulhrsw m4, m7 | |
18312 pmaddubsw m5, m3, m6 | |
18313 pmulhrsw m5, m7 | |
18314 packuswb m4, m5 | |
18315 movu [r0 + 1397 * 16], m4 | |
18316 | |
18317 ; mode 23 [row 27] | |
18318 movu m6, [r5 + 4 * 16] | |
18319 pmaddubsw m4, m0, m6 | |
18320 pmulhrsw m4, m7 | |
18321 pmaddubsw m5, m2, m6 | |
18322 pmulhrsw m5, m7 | |
18323 packuswb m4, m5 | |
18324 movu [r0 + 1398 * 16], m4 | |
18325 pmaddubsw m4, m1, m6 | |
18326 pmulhrsw m4, m7 | |
18327 pmaddubsw m5, m3, m6 | |
18328 pmulhrsw m5, m7 | |
18329 packuswb m4, m5 | |
18330 movu [r0 + 1399 * 16], m4 | |
18331 | |
18332 ; mode 23 [row 28] | |
18333 movu m6, [r5 + 27 * 16] | |
18334 pslldq m0, 2 | |
18335 pinsrb m0, [r4 + 25], 1 | |
18336 pinsrb m0, [r4 + 28], 0 | |
18337 pmaddubsw m4, m0, m6 | |
18338 pmulhrsw m4, m7 | |
18339 pslldq m2, 2 | |
18340 pinsrw m2, [r3 + 0], 0 | |
18341 pmaddubsw m5, m2, m6 | |
18342 pmulhrsw m5, m7 | |
18343 packuswb m4, m5 | |
18344 movu [r0 + 1400 * 16], m4 | |
18345 pslldq m1, 2 | |
18346 pinsrw m1, [r3 + 8], 0 | |
18347 pmaddubsw m4, m1, m6 | |
18348 pmulhrsw m4, m7 | |
18349 pslldq m3, 2 | |
18350 pinsrw m3, [r3 + 16], 0 | |
18351 pmaddubsw m5, m3, m6 | |
18352 pmulhrsw m5, m7 | |
18353 packuswb m4, m5 | |
18354 movu [r0 + 1401 * 16], m4 | |
18355 | |
18356 ; mode 23 [row 29] | |
18357 movu m6, [r5 + 18 * 16] | |
18358 pmaddubsw m4, m0, m6 | |
18359 pmulhrsw m4, m7 | |
18360 pmaddubsw m5, m2, m6 | |
18361 pmulhrsw m5, m7 | |
18362 packuswb m4, m5 | |
18363 movu [r0 + 1402 * 16], m4 | |
18364 pmaddubsw m4, m1, m6 | |
18365 pmulhrsw m4, m7 | |
18366 pmaddubsw m5, m3, m6 | |
18367 pmulhrsw m5, m7 | |
18368 packuswb m4, m5 | |
18369 movu [r0 + 1403 * 16], m4 | |
18370 | |
18371 ; mode 23 [row 30] | |
18372 movu m6, [r5 + 9 * 16] | |
18373 pmaddubsw m4, m0, m6 | |
18374 pmulhrsw m4, m7 | |
18375 pmaddubsw m5, m2, m6 | |
18376 pmulhrsw m5, m7 | |
18377 packuswb m4, m5 | |
18378 movu [r0 + 1404 * 16], m4 | |
18379 pmaddubsw m4, m1, m6 | |
18380 pmulhrsw m4, m7 | |
18381 pmaddubsw m5, m3, m6 | |
18382 pmulhrsw m5, m7 | |
18383 packuswb m4, m5 | |
18384 movu [r0 + 1405 * 16], m4 | |
18385 | |
18386 ; mode 23 [row 31] | |
18387 pshufb m5, m0, [tab_S2] | |
18388 movh [r0 + 1406 * 16], m5 | |
18389 pshufb m5, m2, [tab_S2] | |
18390 movh [r0 + 1406 * 16 + 8], m5 | |
18391 pshufb m5, m1, [tab_S2] | |
18392 movh [r0 + 1407 * 16], m5 | |
18393 pshufb m5, m3, [tab_S2] | |
18394 movh [r0 + 1407 * 16 + 8], m5 | |
18395 | |
18396 ; mode 24 [row 0] | |
18397 movu m6, [r5 + 27 * 16] | |
18398 movu m0, [r3 ] | |
18399 movu m1, [r3 + 1 ] | |
18400 punpcklbw m0, m1 | |
18401 pmaddubsw m4, m0, m6 | |
18402 pmulhrsw m4, m7 | |
18403 movu m2, [r3 + 8] | |
18404 movu m3, [r3 + 9] | |
18405 punpcklbw m2, m3 | |
18406 pmaddubsw m5, m2, m6 | |
18407 pmulhrsw m5, m7 | |
18408 packuswb m4, m5 | |
18409 movu [r0 + 1408 * 16], m4 | |
18410 | |
18411 movu m1, [r3 + 16] | |
18412 movu m3, [r3 + 17] | |
18413 punpcklbw m1, m3 | |
18414 pmaddubsw m4, m1, m6 | |
18415 pmulhrsw m4, m7 | |
18416 movu m3, [r3 + 24] | |
18417 movu m5, [r3 + 25] | |
18418 punpcklbw m3, m5 | |
18419 pmaddubsw m5, m3, m6 | |
18420 pmulhrsw m5, m7 | |
18421 packuswb m4, m5 | |
18422 movu [r0 + 1409 * 16], m4 | |
18423 | |
18424 ; mode 24 [row 1] | |
18425 movu m6, [r5 + 22 * 16] | |
18426 pmaddubsw m4, m0, m6 | |
18427 pmulhrsw m4, m7 | |
18428 pmaddubsw m5, m2, m6 | |
18429 pmulhrsw m5, m7 | |
18430 packuswb m4, m5 | |
18431 movu [r0 + 1410 * 16], m4 | |
18432 pmaddubsw m4, m1, m6 | |
18433 pmulhrsw m4, m7 | |
18434 pmaddubsw m5, m3, m6 | |
18435 pmulhrsw m5, m7 | |
18436 packuswb m4, m5 | |
18437 movu [r0 + 1411 * 16], m4 | |
18438 | |
18439 ; mode 24 [row 2] | |
18440 movu m6, [r5 + 17 * 16] | |
18441 pmaddubsw m4, m0, m6 | |
18442 pmulhrsw m4, m7 | |
18443 pmaddubsw m5, m2, m6 | |
18444 pmulhrsw m5, m7 | |
18445 packuswb m4, m5 | |
18446 movu [r0 + 1412 * 16], m4 | |
18447 pmaddubsw m4, m1, m6 | |
18448 pmulhrsw m4, m7 | |
18449 pmaddubsw m5, m3, m6 | |
18450 pmulhrsw m5, m7 | |
18451 packuswb m4, m5 | |
18452 movu [r0 + 1413 * 16], m4 | |
18453 | |
18454 ; mode 24 [row 3] | |
18455 movu m6, [r5 + 12 * 16] | |
18456 pmaddubsw m4, m0, m6 | |
18457 pmulhrsw m4, m7 | |
18458 pmaddubsw m5, m2, m6 | |
18459 pmulhrsw m5, m7 | |
18460 packuswb m4, m5 | |
18461 movu [r0 + 1414 * 16], m4 | |
18462 pmaddubsw m4, m1, m6 | |
18463 pmulhrsw m4, m7 | |
18464 pmaddubsw m5, m3, m6 | |
18465 pmulhrsw m5, m7 | |
18466 packuswb m4, m5 | |
18467 movu [r0 + 1415 * 16], m4 | |
18468 | |
18469 ; mode 24 [row 4] | |
18470 movu m6, [r5 + 7 * 16] | |
18471 pmaddubsw m4, m0, m6 | |
18472 pmulhrsw m4, m7 | |
18473 pmaddubsw m5, m2, m6 | |
18474 pmulhrsw m5, m7 | |
18475 packuswb m4, m5 | |
18476 movu [r0 + 1416 * 16], m4 | |
18477 pmaddubsw m4, m1, m6 | |
18478 pmulhrsw m4, m7 | |
18479 pmaddubsw m5, m3, m6 | |
18480 pmulhrsw m5, m7 | |
18481 packuswb m4, m5 | |
18482 movu [r0 + 1417 * 16], m4 | |
18483 | |
18484 ; mode 24 [row 5] | |
18485 movu m6, [r5 + 2 * 16] | |
18486 pmaddubsw m4, m0, m6 | |
18487 pmulhrsw m4, m7 | |
18488 pmaddubsw m5, m2, m6 | |
18489 pmulhrsw m5, m7 | |
18490 packuswb m4, m5 | |
18491 movu [r0 + 1418 * 16], m4 | |
18492 pmaddubsw m4, m1, m6 | |
18493 pmulhrsw m4, m7 | |
18494 pmaddubsw m5, m3, m6 | |
18495 pmulhrsw m5, m7 | |
18496 packuswb m4, m5 | |
18497 movu [r0 + 1419 * 16], m4 | |
18498 | |
18499 ; mode 24 [row 6] | |
18500 movu m6, [r5 + 29 * 16] | |
18501 pslldq m0, 2 | |
18502 pinsrb m0, [r4 + 0], 1 | |
18503 pinsrb m0, [r4 + 6], 0 | |
18504 pmaddubsw m4, m0, m6 | |
18505 pmulhrsw m4, m7 | |
18506 pslldq m2, 2 | |
18507 pinsrw m2, [r3 + 7], 0 | |
18508 pmaddubsw m5, m2, m6 | |
18509 pmulhrsw m5, m7 | |
18510 packuswb m4, m5 | |
18511 movu [r0 + 1420 * 16], m4 | |
18512 pslldq m1, 2 | |
18513 pinsrw m1, [r3 + 15], 0 | |
18514 pmaddubsw m4, m1, m6 | |
18515 pmulhrsw m4, m7 | |
18516 pslldq m3, 2 | |
18517 pinsrw m3, [r3 + 23], 0 | |
18518 pmaddubsw m5, m3, m6 | |
18519 pmulhrsw m5, m7 | |
18520 packuswb m4, m5 | |
18521 movu [r0 + 1421 * 16], m4 | |
18522 | |
18523 ; mode 24 [row 7] | |
18524 movu m6, [r5 + 24 * 16] | |
18525 pmaddubsw m4, m0, m6 | |
18526 pmulhrsw m4, m7 | |
18527 pmaddubsw m5, m2, m6 | |
18528 pmulhrsw m5, m7 | |
18529 packuswb m4, m5 | |
18530 movu [r0 + 1422 * 16], m4 | |
18531 pmaddubsw m4, m1, m6 | |
18532 pmulhrsw m4, m7 | |
18533 pmaddubsw m5, m3, m6 | |
18534 pmulhrsw m5, m7 | |
18535 packuswb m4, m5 | |
18536 movu [r0 + 1423 * 16], m4 | |
18537 | |
18538 ; mode 24 [row 8] | |
18539 movu m6, [r5 + 19 * 16] | |
18540 pmaddubsw m4, m0, m6 | |
18541 pmulhrsw m4, m7 | |
18542 pmaddubsw m5, m2, m6 | |
18543 pmulhrsw m5, m7 | |
18544 packuswb m4, m5 | |
18545 movu [r0 + 1424 * 16], m4 | |
18546 pmaddubsw m4, m1, m6 | |
18547 pmulhrsw m4, m7 | |
18548 pmaddubsw m5, m3, m6 | |
18549 pmulhrsw m5, m7 | |
18550 packuswb m4, m5 | |
18551 movu [r0 + 1425 * 16], m4 | |
18552 | |
18553 ; mode 24 [row 9] | |
18554 movu m6, [r5 + 14 * 16] | |
18555 pmaddubsw m4, m0, m6 | |
18556 pmulhrsw m4, m7 | |
18557 pmaddubsw m5, m2, m6 | |
18558 pmulhrsw m5, m7 | |
18559 packuswb m4, m5 | |
18560 movu [r0 + 1426 * 16], m4 | |
18561 pmaddubsw m4, m1, m6 | |
18562 pmulhrsw m4, m7 | |
18563 pmaddubsw m5, m3, m6 | |
18564 pmulhrsw m5, m7 | |
18565 packuswb m4, m5 | |
18566 movu [r0 + 1427 * 16], m4 | |
18567 | |
18568 ; mode 24 [row 10] | |
18569 movu m6, [r5 + 9 * 16] | |
18570 pmaddubsw m4, m0, m6 | |
18571 pmulhrsw m4, m7 | |
18572 pmaddubsw m5, m2, m6 | |
18573 pmulhrsw m5, m7 | |
18574 packuswb m4, m5 | |
18575 movu [r0 + 1428 * 16], m4 | |
18576 pmaddubsw m4, m1, m6 | |
18577 pmulhrsw m4, m7 | |
18578 pmaddubsw m5, m3, m6 | |
18579 pmulhrsw m5, m7 | |
18580 packuswb m4, m5 | |
18581 movu [r0 + 1429 * 16], m4 | |
18582 | |
18583 ; mode 24 [row 11] | |
18584 movu m6, [r5 + 4 * 16] | |
18585 pmaddubsw m4, m0, m6 | |
18586 pmulhrsw m4, m7 | |
18587 pmaddubsw m5, m2, m6 | |
18588 pmulhrsw m5, m7 | |
18589 packuswb m4, m5 | |
18590 movu [r0 + 1430 * 16], m4 | |
18591 pmaddubsw m4, m1, m6 | |
18592 pmulhrsw m4, m7 | |
18593 pmaddubsw m5, m3, m6 | |
18594 pmulhrsw m5, m7 | |
18595 packuswb m4, m5 | |
18596 movu [r0 + 1431 * 16], m4 | |
18597 | |
18598 ; mode 24 [row 12] | |
18599 movu m6, [r5 + 31 * 16] | |
18600 pslldq m0, 2 | |
18601 pinsrb m0, [r4 + 6], 1 | |
18602 pinsrb m0, [r4 + 13], 0 | |
18603 pmaddubsw m4, m0, m6 | |
18604 pmulhrsw m4, m7 | |
18605 pslldq m2, 2 | |
18606 pinsrw m2, [r3 + 6], 0 | |
18607 pmaddubsw m5, m2, m6 | |
18608 pmulhrsw m5, m7 | |
18609 packuswb m4, m5 | |
18610 movu [r0 + 1432 * 16], m4 | |
18611 pslldq m1, 2 | |
18612 pinsrw m1, [r3 + 14], 0 | |
18613 pmaddubsw m4, m1, m6 | |
18614 pmulhrsw m4, m7 | |
18615 pslldq m3, 2 | |
18616 pinsrw m3, [r3 + 22], 0 | |
18617 pmaddubsw m5, m3, m6 | |
18618 pmulhrsw m5, m7 | |
18619 packuswb m4, m5 | |
18620 movu [r0 + 1433 * 16], m4 | |
18621 | |
18622 ; mode 24 [row 13] | |
18623 movu m6, [r5 + 26 * 16] | |
18624 pmaddubsw m4, m0, m6 | |
18625 pmulhrsw m4, m7 | |
18626 pmaddubsw m5, m2, m6 | |
18627 pmulhrsw m5, m7 | |
18628 packuswb m4, m5 | |
18629 movu [r0 + 1434 * 16], m4 | |
18630 pmaddubsw m4, m1, m6 | |
18631 pmulhrsw m4, m7 | |
18632 pmaddubsw m5, m3, m6 | |
18633 pmulhrsw m5, m7 | |
18634 packuswb m4, m5 | |
18635 movu [r0 + 1435 * 16], m4 | |
18636 | |
18637 ; mode 24 [row 14] | |
18638 movu m6, [r5 + 21 * 16] | |
18639 pmaddubsw m4, m0, m6 | |
18640 pmulhrsw m4, m7 | |
18641 pmaddubsw m5, m2, m6 | |
18642 pmulhrsw m5, m7 | |
18643 packuswb m4, m5 | |
18644 movu [r0 + 1436 * 16], m4 | |
18645 pmaddubsw m4, m1, m6 | |
18646 pmulhrsw m4, m7 | |
18647 pmaddubsw m5, m3, m6 | |
18648 pmulhrsw m5, m7 | |
18649 packuswb m4, m5 | |
18650 movu [r0 + 1437 * 16], m4 | |
18651 | |
18652 ; mode 24 [row 15] | |
18653 movu m6, [r5 + 16 * 16] | |
18654 pmaddubsw m4, m0, m6 | |
18655 pmulhrsw m4, m7 | |
18656 pmaddubsw m5, m2, m6 | |
18657 pmulhrsw m5, m7 | |
18658 packuswb m4, m5 | |
18659 movu [r0 + 1438 * 16], m4 | |
18660 pmaddubsw m4, m1, m6 | |
18661 pmulhrsw m4, m7 | |
18662 pmaddubsw m5, m3, m6 | |
18663 pmulhrsw m5, m7 | |
18664 packuswb m4, m5 | |
18665 movu [r0 + 1439 * 16], m4 | |
18666 | |
18667 ; mode 24 [row 16] | |
18668 movu m6, [r5 + 11 * 16] | |
18669 pmaddubsw m4, m0, m6 | |
18670 pmulhrsw m4, m7 | |
18671 pmaddubsw m5, m2, m6 | |
18672 pmulhrsw m5, m7 | |
18673 packuswb m4, m5 | |
18674 movu [r0 + 1440 * 16], m4 | |
18675 pmaddubsw m4, m1, m6 | |
18676 pmulhrsw m4, m7 | |
18677 pmaddubsw m5, m3, m6 | |
18678 pmulhrsw m5, m7 | |
18679 packuswb m4, m5 | |
18680 movu [r0 + 1441 * 16], m4 | |
18681 | |
18682 ; mode 24 [row 17] | |
18683 movu m6, [r5 + 6 * 16] | |
18684 pmaddubsw m4, m0, m6 | |
18685 pmulhrsw m4, m7 | |
18686 pmaddubsw m5, m2, m6 | |
18687 pmulhrsw m5, m7 | |
18688 packuswb m4, m5 | |
18689 movu [r0 + 1442 * 16], m4 | |
18690 pmaddubsw m4, m1, m6 | |
18691 pmulhrsw m4, m7 | |
18692 pmaddubsw m5, m3, m6 | |
18693 pmulhrsw m5, m7 | |
18694 packuswb m4, m5 | |
18695 movu [r0 + 1443 * 16], m4 | |
18696 | |
18697 ; mode 24 [row 18] | |
18698 movu m6, [r5 + 1 * 16] | |
18699 pmaddubsw m4, m0, m6 | |
18700 pmulhrsw m4, m7 | |
18701 pmaddubsw m5, m2, m6 | |
18702 pmulhrsw m5, m7 | |
18703 packuswb m4, m5 | |
18704 movu [r0 + 1444 * 16], m4 | |
18705 pmaddubsw m4, m1, m6 | |
18706 pmulhrsw m4, m7 | |
18707 pmaddubsw m5, m3, m6 | |
18708 pmulhrsw m5, m7 | |
18709 packuswb m4, m5 | |
18710 movu [r0 + 1445 * 16], m4 | |
18711 | |
18712 ; mode 24 [row 19] | |
18713 movu m6, [r5 + 28 * 16] | |
18714 pslldq m0, 2 | |
18715 pinsrb m0, [r4 + 13], 1 | |
18716 pinsrb m0, [r4 + 19], 0 | |
18717 pmaddubsw m4, m0, m6 | |
18718 pmulhrsw m4, m7 | |
18719 pslldq m2, 2 | |
18720 pinsrw m2, [r3 + 5], 0 | |
18721 pmaddubsw m5, m2, m6 | |
18722 pmulhrsw m5, m7 | |
18723 packuswb m4, m5 | |
18724 movu [r0 + 1446 * 16], m4 | |
18725 pslldq m1, 2 | |
18726 pinsrw m1, [r3 + 13], 0 | |
18727 pmaddubsw m4, m1, m6 | |
18728 pmulhrsw m4, m7 | |
18729 pslldq m3, 2 | |
18730 pinsrw m3, [r3 + 21], 0 | |
18731 pmaddubsw m5, m3, m6 | |
18732 pmulhrsw m5, m7 | |
18733 packuswb m4, m5 | |
18734 movu [r0 + 1447 * 16], m4 | |
18735 | |
18736 ; mode 24 [row 20] | |
18737 movu m6, [r5 + 23 * 16] | |
18738 pmaddubsw m4, m0, m6 | |
18739 pmulhrsw m4, m7 | |
18740 pmaddubsw m5, m2, m6 | |
18741 pmulhrsw m5, m7 | |
18742 packuswb m4, m5 | |
18743 movu [r0 + 1448 * 16], m4 | |
18744 pmaddubsw m4, m1, m6 | |
18745 pmulhrsw m4, m7 | |
18746 pmaddubsw m5, m3, m6 | |
18747 pmulhrsw m5, m7 | |
18748 packuswb m4, m5 | |
18749 movu [r0 + 1449 * 16], m4 | |
18750 | |
18751 ; mode 24 [row 21] | |
18752 movu m6, [r5 + 18 * 16] | |
18753 pmaddubsw m4, m0, m6 | |
18754 pmulhrsw m4, m7 | |
18755 pmaddubsw m5, m2, m6 | |
18756 pmulhrsw m5, m7 | |
18757 packuswb m4, m5 | |
18758 movu [r0 + 1450 * 16], m4 | |
18759 pmaddubsw m4, m1, m6 | |
18760 pmulhrsw m4, m7 | |
18761 pmaddubsw m5, m3, m6 | |
18762 pmulhrsw m5, m7 | |
18763 packuswb m4, m5 | |
18764 movu [r0 + 1451 * 16], m4 | |
18765 | |
18766 ; mode 24 [row 22] | |
18767 movu m6, [r5 + 13 * 16] | |
18768 pmaddubsw m4, m0, m6 | |
18769 pmulhrsw m4, m7 | |
18770 pmaddubsw m5, m2, m6 | |
18771 pmulhrsw m5, m7 | |
18772 packuswb m4, m5 | |
18773 movu [r0 + 1452 * 16], m4 | |
18774 pmaddubsw m4, m1, m6 | |
18775 pmulhrsw m4, m7 | |
18776 pmaddubsw m5, m3, m6 | |
18777 pmulhrsw m5, m7 | |
18778 packuswb m4, m5 | |
18779 movu [r0 + 1453 * 16], m4 | |
18780 | |
18781 ; mode 24 [row 23] | |
18782 movu m6, [r5 + 8 * 16] | |
18783 pmaddubsw m4, m0, m6 | |
18784 pmulhrsw m4, m7 | |
18785 pmaddubsw m5, m2, m6 | |
18786 pmulhrsw m5, m7 | |
18787 packuswb m4, m5 | |
18788 movu [r0 + 1454 * 16], m4 | |
18789 pmaddubsw m4, m1, m6 | |
18790 pmulhrsw m4, m7 | |
18791 pmaddubsw m5, m3, m6 | |
18792 pmulhrsw m5, m7 | |
18793 packuswb m4, m5 | |
18794 movu [r0 + 1455 * 16], m4 | |
18795 | |
18796 ; mode 24 [row 24] | |
18797 movu m6, [r5 + 3 * 16] | |
18798 pmaddubsw m4, m0, m6 | |
18799 pmulhrsw m4, m7 | |
18800 pmaddubsw m5, m2, m6 | |
18801 pmulhrsw m5, m7 | |
18802 packuswb m4, m5 | |
18803 movu [r0 + 1456 * 16], m4 | |
18804 pmaddubsw m4, m1, m6 | |
18805 pmulhrsw m4, m7 | |
18806 pmaddubsw m5, m3, m6 | |
18807 pmulhrsw m5, m7 | |
18808 packuswb m4, m5 | |
18809 movu [r0 + 1457 * 16], m4 | |
18810 | |
18811 ; mode 24 [row 25] | |
18812 movu m6, [r5 + 30 * 16] | |
18813 pslldq m0, 2 | |
18814 pinsrb m0, [r4 + 19], 1 | |
18815 pinsrb m0, [r4 + 26], 0 | |
18816 pmaddubsw m4, m0, m6 | |
18817 pmulhrsw m4, m7 | |
18818 pslldq m2, 2 | |
18819 pinsrw m2, [r3 + 4], 0 | |
18820 pmaddubsw m5, m2, m6 | |
18821 pmulhrsw m5, m7 | |
18822 packuswb m4, m5 | |
18823 movu [r0 + 1458 * 16], m4 | |
18824 pslldq m1, 2 | |
18825 pinsrw m1, [r3 + 12], 0 | |
18826 pmaddubsw m4, m1, m6 | |
18827 pmulhrsw m4, m7 | |
18828 pslldq m3, 2 | |
18829 pinsrw m3, [r3 + 20], 0 | |
18830 pmaddubsw m5, m3, m6 | |
18831 pmulhrsw m5, m7 | |
18832 packuswb m4, m5 | |
18833 movu [r0 + 1459 * 16], m4 | |
18834 | |
18835 ; mode 24 [row 26] | |
18836 movu m6, [r5 + 25 * 16] | |
18837 pmaddubsw m4, m0, m6 | |
18838 pmulhrsw m4, m7 | |
18839 pmaddubsw m5, m2, m6 | |
18840 pmulhrsw m5, m7 | |
18841 packuswb m4, m5 | |
18842 movu [r0 + 1460 * 16], m4 | |
18843 pmaddubsw m4, m1, m6 | |
18844 pmulhrsw m4, m7 | |
18845 pmaddubsw m5, m3, m6 | |
18846 pmulhrsw m5, m7 | |
18847 packuswb m4, m5 | |
18848 movu [r0 + 1461 * 16], m4 | |
18849 | |
18850 ; mode 24 [row 27] | |
18851 movu m6, [r5 + 20 * 16] | |
18852 pmaddubsw m4, m0, m6 | |
18853 pmulhrsw m4, m7 | |
18854 pmaddubsw m5, m2, m6 | |
18855 pmulhrsw m5, m7 | |
18856 packuswb m4, m5 | |
18857 movu [r0 + 1462 * 16], m4 | |
18858 pmaddubsw m4, m1, m6 | |
18859 pmulhrsw m4, m7 | |
18860 pmaddubsw m5, m3, m6 | |
18861 pmulhrsw m5, m7 | |
18862 packuswb m4, m5 | |
18863 movu [r0 + 1463 * 16], m4 | |
18864 | |
18865 ; mode 24 [row 28] | |
18866 movu m6, [r5 + 15 * 16] | |
18867 pmaddubsw m4, m0, m6 | |
18868 pmulhrsw m4, m7 | |
18869 pmaddubsw m5, m2, m6 | |
18870 pmulhrsw m5, m7 | |
18871 packuswb m4, m5 | |
18872 movu [r0 + 1464 * 16], m4 | |
18873 pmaddubsw m4, m1, m6 | |
18874 pmulhrsw m4, m7 | |
18875 pmaddubsw m5, m3, m6 | |
18876 pmulhrsw m5, m7 | |
18877 packuswb m4, m5 | |
18878 movu [r0 + 1465 * 16], m4 | |
18879 | |
18880 ; mode 24 [row 29] | |
18881 movu m6, [r5 + 10 * 16] | |
18882 pmaddubsw m4, m0, m6 | |
18883 pmulhrsw m4, m7 | |
18884 pmaddubsw m5, m2, m6 | |
18885 pmulhrsw m5, m7 | |
18886 packuswb m4, m5 | |
18887 movu [r0 + 1466 * 16], m4 | |
18888 pmaddubsw m4, m1, m6 | |
18889 pmulhrsw m4, m7 | |
18890 pmaddubsw m5, m3, m6 | |
18891 pmulhrsw m5, m7 | |
18892 packuswb m4, m5 | |
18893 movu [r0 + 1467 * 16], m4 | |
18894 | |
18895 ; mode 24 [row 30] | |
18896 movu m6, [r5 + 5 * 16] | |
18897 pmaddubsw m4, m0, m6 | |
18898 pmulhrsw m4, m7 | |
18899 pmaddubsw m5, m2, m6 | |
18900 pmulhrsw m5, m7 | |
18901 packuswb m4, m5 | |
18902 movu [r0 + 1468 * 16], m4 | |
18903 pmaddubsw m4, m1, m6 | |
18904 pmulhrsw m4, m7 | |
18905 pmaddubsw m5, m3, m6 | |
18906 pmulhrsw m5, m7 | |
18907 packuswb m4, m5 | |
18908 movu [r0 + 1469 * 16], m4 | |
18909 | |
18910 ; mode 24 [row 31] | |
18911 pshufb m5, m0, [tab_S2] | |
18912 movh [r0 + 1470 * 16], m5 | |
18913 pshufb m5, m2, [tab_S2] | |
18914 movh [r0 + 1470 * 16 + 8], m5 | |
18915 pshufb m5, m1, [tab_S2] | |
18916 movh [r0 + 1471 * 16], m5 | |
18917 pshufb m5, m3, [tab_S2] | |
18918 movh [r0 + 1471 * 16 + 8], m5 | |
18919 | |
18920 ; mode 25 [row 0] | |
18921 movu m6, [r5 + 30 * 16] | |
18922 movu m0, [r3 ] | |
18923 movu m1, [r3 + 1 ] | |
18924 punpcklbw m0, m1 | |
18925 pmaddubsw m4, m0, m6 | |
18926 pmulhrsw m4, m7 | |
18927 movu m2, [r3 + 8] | |
18928 movu m3, [r3 + 9] | |
18929 punpcklbw m2, m3 | |
18930 pmaddubsw m5, m2, m6 | |
18931 pmulhrsw m5, m7 | |
18932 packuswb m4, m5 | |
18933 movu [r0 + 1472 * 16], m4 | |
18934 | |
18935 movu m1, [r3 + 16] | |
18936 movu m3, [r3 + 17] | |
18937 punpcklbw m1, m3 | |
18938 pmaddubsw m4, m1, m6 | |
18939 pmulhrsw m4, m7 | |
18940 movu m3, [r3 + 24] | |
18941 movu m5, [r3 + 25] | |
18942 punpcklbw m3, m5 | |
18943 pmaddubsw m5, m3, m6 | |
18944 pmulhrsw m5, m7 | |
18945 packuswb m4, m5 | |
18946 movu [r0 + 1473 * 16], m4 | |
18947 | |
18948 ; mode 25 [row 1] | |
18949 movu m6, [r5 + 28 * 16] | |
18950 pmaddubsw m4, m0, m6 | |
18951 pmulhrsw m4, m7 | |
18952 pmaddubsw m5, m2, m6 | |
18953 pmulhrsw m5, m7 | |
18954 packuswb m4, m5 | |
18955 movu [r0 + 1474 * 16], m4 | |
18956 pmaddubsw m4, m1, m6 | |
18957 pmulhrsw m4, m7 | |
18958 pmaddubsw m5, m3, m6 | |
18959 pmulhrsw m5, m7 | |
18960 packuswb m4, m5 | |
18961 movu [r0 + 1475 * 16], m4 | |
18962 | |
18963 ; mode 25 [row 2] | |
18964 movu m6, [r5 + 26 * 16] | |
18965 pmaddubsw m4, m0, m6 | |
18966 pmulhrsw m4, m7 | |
18967 pmaddubsw m5, m2, m6 | |
18968 pmulhrsw m5, m7 | |
18969 packuswb m4, m5 | |
18970 movu [r0 + 1476 * 16], m4 | |
18971 pmaddubsw m4, m1, m6 | |
18972 pmulhrsw m4, m7 | |
18973 pmaddubsw m5, m3, m6 | |
18974 pmulhrsw m5, m7 | |
18975 packuswb m4, m5 | |
18976 movu [r0 + 1477 * 16], m4 | |
18977 | |
18978 ; mode 25 [row 3] | |
18979 movu m6, [r5 + 24 * 16] | |
18980 pmaddubsw m4, m0, m6 | |
18981 pmulhrsw m4, m7 | |
18982 pmaddubsw m5, m2, m6 | |
18983 pmulhrsw m5, m7 | |
18984 packuswb m4, m5 | |
18985 movu [r0 + 1478 * 16], m4 | |
18986 pmaddubsw m4, m1, m6 | |
18987 pmulhrsw m4, m7 | |
18988 pmaddubsw m5, m3, m6 | |
18989 pmulhrsw m5, m7 | |
18990 packuswb m4, m5 | |
18991 movu [r0 + 1479 * 16], m4 | |
18992 | |
18993 ; mode 25 [row 4] | |
18994 movu m6, [r5 + 22 * 16] | |
18995 pmaddubsw m4, m0, m6 | |
18996 pmulhrsw m4, m7 | |
18997 pmaddubsw m5, m2, m6 | |
18998 pmulhrsw m5, m7 | |
18999 packuswb m4, m5 | |
19000 movu [r0 + 1480 * 16], m4 | |
19001 pmaddubsw m4, m1, m6 | |
19002 pmulhrsw m4, m7 | |
19003 pmaddubsw m5, m3, m6 | |
19004 pmulhrsw m5, m7 | |
19005 packuswb m4, m5 | |
19006 movu [r0 + 1481 * 16], m4 | |
19007 | |
19008 ; mode 25 [row 5] | |
19009 movu m6, [r5 + 20 * 16] | |
19010 pmaddubsw m4, m0, m6 | |
19011 pmulhrsw m4, m7 | |
19012 pmaddubsw m5, m2, m6 | |
19013 pmulhrsw m5, m7 | |
19014 packuswb m4, m5 | |
19015 movu [r0 + 1482 * 16], m4 | |
19016 pmaddubsw m4, m1, m6 | |
19017 pmulhrsw m4, m7 | |
19018 pmaddubsw m5, m3, m6 | |
19019 pmulhrsw m5, m7 | |
19020 packuswb m4, m5 | |
19021 movu [r0 + 1483 * 16], m4 | |
19022 | |
19023 ; mode 25 [row 6] | |
19024 movu m6, [r5 + 18 * 16] | |
19025 pmaddubsw m4, m0, m6 | |
19026 pmulhrsw m4, m7 | |
19027 pmaddubsw m5, m2, m6 | |
19028 pmulhrsw m5, m7 | |
19029 packuswb m4, m5 | |
19030 movu [r0 + 1484 * 16], m4 | |
19031 pmaddubsw m4, m1, m6 | |
19032 pmulhrsw m4, m7 | |
19033 pmaddubsw m5, m3, m6 | |
19034 pmulhrsw m5, m7 | |
19035 packuswb m4, m5 | |
19036 movu [r0 + 1485 * 16], m4 | |
19037 | |
19038 ; mode 25 [row 7] | |
19039 movu m6, [r5 + 16 * 16] | |
19040 pmaddubsw m4, m0, m6 | |
19041 pmulhrsw m4, m7 | |
19042 pmaddubsw m5, m2, m6 | |
19043 pmulhrsw m5, m7 | |
19044 packuswb m4, m5 | |
19045 movu [r0 + 1486 * 16], m4 | |
19046 pmaddubsw m4, m1, m6 | |
19047 pmulhrsw m4, m7 | |
19048 pmaddubsw m5, m3, m6 | |
19049 pmulhrsw m5, m7 | |
19050 packuswb m4, m5 | |
19051 movu [r0 + 1487 * 16], m4 | |
19052 | |
19053 ; mode 25 [row 8] | |
19054 movu m6, [r5 + 14 * 16] | |
19055 pmaddubsw m4, m0, m6 | |
19056 pmulhrsw m4, m7 | |
19057 pmaddubsw m5, m2, m6 | |
19058 pmulhrsw m5, m7 | |
19059 packuswb m4, m5 | |
19060 movu [r0 + 1488 * 16], m4 | |
19061 pmaddubsw m4, m1, m6 | |
19062 pmulhrsw m4, m7 | |
19063 pmaddubsw m5, m3, m6 | |
19064 pmulhrsw m5, m7 | |
19065 packuswb m4, m5 | |
19066 movu [r0 + 1489 * 16], m4 | |
19067 | |
19068 ; mode 25 [row 9] | |
19069 movu m6, [r5 + 12 * 16] | |
19070 pmaddubsw m4, m0, m6 | |
19071 pmulhrsw m4, m7 | |
19072 pmaddubsw m5, m2, m6 | |
19073 pmulhrsw m5, m7 | |
19074 packuswb m4, m5 | |
19075 movu [r0 + 1490 * 16], m4 | |
19076 pmaddubsw m4, m1, m6 | |
19077 pmulhrsw m4, m7 | |
19078 pmaddubsw m5, m3, m6 | |
19079 pmulhrsw m5, m7 | |
19080 packuswb m4, m5 | |
19081 movu [r0 + 1491 * 16], m4 | |
19082 | |
19083 ; mode 25 [row 10] | |
19084 movu m6, [r5 + 10 * 16] | |
19085 pmaddubsw m4, m0, m6 | |
19086 pmulhrsw m4, m7 | |
19087 pmaddubsw m5, m2, m6 | |
19088 pmulhrsw m5, m7 | |
19089 packuswb m4, m5 | |
19090 movu [r0 + 1492 * 16], m4 | |
19091 pmaddubsw m4, m1, m6 | |
19092 pmulhrsw m4, m7 | |
19093 pmaddubsw m5, m3, m6 | |
19094 pmulhrsw m5, m7 | |
19095 packuswb m4, m5 | |
19096 movu [r0 + 1493 * 16], m4 | |
19097 | |
19098 ; mode 25 [row 11] | |
19099 movu m6, [r5 + 8 * 16] | |
19100 pmaddubsw m4, m0, m6 | |
19101 pmulhrsw m4, m7 | |
19102 pmaddubsw m5, m2, m6 | |
19103 pmulhrsw m5, m7 | |
19104 packuswb m4, m5 | |
19105 movu [r0 + 1494 * 16], m4 | |
19106 pmaddubsw m4, m1, m6 | |
19107 pmulhrsw m4, m7 | |
19108 pmaddubsw m5, m3, m6 | |
19109 pmulhrsw m5, m7 | |
19110 packuswb m4, m5 | |
19111 movu [r0 + 1495 * 16], m4 | |
19112 | |
19113 ; mode 25 [row 12] | |
19114 movu m6, [r5 + 6 * 16] | |
19115 pmaddubsw m4, m0, m6 | |
19116 pmulhrsw m4, m7 | |
19117 pmaddubsw m5, m2, m6 | |
19118 pmulhrsw m5, m7 | |
19119 packuswb m4, m5 | |
19120 movu [r0 + 1496 * 16], m4 | |
19121 pmaddubsw m4, m1, m6 | |
19122 pmulhrsw m4, m7 | |
19123 pmaddubsw m5, m3, m6 | |
19124 pmulhrsw m5, m7 | |
19125 packuswb m4, m5 | |
19126 movu [r0 + 1497 * 16], m4 | |
19127 | |
19128 ; mode 25 [row 13] | |
19129 movu m6, [r5 + 4 * 16] | |
19130 pmaddubsw m4, m0, m6 | |
19131 pmulhrsw m4, m7 | |
19132 pmaddubsw m5, m2, m6 | |
19133 pmulhrsw m5, m7 | |
19134 packuswb m4, m5 | |
19135 movu [r0 + 1498 * 16], m4 | |
19136 pmaddubsw m4, m1, m6 | |
19137 pmulhrsw m4, m7 | |
19138 pmaddubsw m5, m3, m6 | |
19139 pmulhrsw m5, m7 | |
19140 packuswb m4, m5 | |
19141 movu [r0 + 1499 * 16], m4 | |
19142 | |
19143 ; mode 25 [row 14] | |
19144 movu m6, [r5 + 2 * 16] | |
19145 pmaddubsw m4, m0, m6 | |
19146 pmulhrsw m4, m7 | |
19147 pmaddubsw m5, m2, m6 | |
19148 pmulhrsw m5, m7 | |
19149 packuswb m4, m5 | |
19150 movu [r0 + 1500 * 16], m4 | |
19151 pmaddubsw m4, m1, m6 | |
19152 pmulhrsw m4, m7 | |
19153 pmaddubsw m5, m3, m6 | |
19154 pmulhrsw m5, m7 | |
19155 packuswb m4, m5 | |
19156 movu [r0 + 1501 * 16], m4 | |
19157 | |
19158 ; mode 25 [row 15] | |
19159 pshufb m5, m0, [tab_S2] | |
19160 movh [r0 + 1502 * 16], m5 | |
19161 pshufb m5, m2, [tab_S2] | |
19162 movh [r0 + 1502 * 16 + 8], m5 | |
19163 pshufb m5, m1, [tab_S2] | |
19164 movh [r0 + 1503 * 16], m5 | |
19165 pshufb m5, m3, [tab_S2] | |
19166 movh [r0 + 1503 * 16 + 8], m5 | |
19167 | |
19168 ; mode 25 [row 16] | |
19169 movu m6, [r5 + 30 * 16] | |
19170 pslldq m0, 2 | |
19171 pinsrb m0, [r4 + 0], 1 | |
19172 pinsrb m0, [r4 + 16], 0 | |
19173 pmaddubsw m4, m0, m6 | |
19174 pmulhrsw m4, m7 | |
19175 pslldq m2, 2 | |
19176 pinsrw m2, [r3 + 7], 0 | |
19177 pmaddubsw m5, m2, m6 | |
19178 pmulhrsw m5, m7 | |
19179 packuswb m4, m5 | |
19180 movu [r0 + 1504 * 16], m4 | |
19181 pslldq m1, 2 | |
19182 pinsrw m1, [r3 + 15], 0 | |
19183 pmaddubsw m4, m1, m6 | |
19184 pmulhrsw m4, m7 | |
19185 pslldq m3, 2 | |
19186 pinsrw m3, [r3 + 23], 0 | |
19187 pmaddubsw m5, m3, m6 | |
19188 pmulhrsw m5, m7 | |
19189 packuswb m4, m5 | |
19190 movu [r0 + 1505 * 16], m4 | |
19191 | |
19192 ; mode 25 [row 17] | |
19193 movu m6, [r5 + 28 * 16] | |
19194 pmaddubsw m4, m0, m6 | |
19195 pmulhrsw m4, m7 | |
19196 pmaddubsw m5, m2, m6 | |
19197 pmulhrsw m5, m7 | |
19198 packuswb m4, m5 | |
19199 movu [r0 + 1506 * 16], m4 | |
19200 pmaddubsw m4, m1, m6 | |
19201 pmulhrsw m4, m7 | |
19202 pmaddubsw m5, m3, m6 | |
19203 pmulhrsw m5, m7 | |
19204 packuswb m4, m5 | |
19205 movu [r0 + 1507 * 16], m4 | |
19206 | |
19207 ; mode 25 [row 18] | |
19208 movu m6, [r5 + 26 * 16] | |
19209 pmaddubsw m4, m0, m6 | |
19210 pmulhrsw m4, m7 | |
19211 pmaddubsw m5, m2, m6 | |
19212 pmulhrsw m5, m7 | |
19213 packuswb m4, m5 | |
19214 movu [r0 + 1508 * 16], m4 | |
19215 pmaddubsw m4, m1, m6 | |
19216 pmulhrsw m4, m7 | |
19217 pmaddubsw m5, m3, m6 | |
19218 pmulhrsw m5, m7 | |
19219 packuswb m4, m5 | |
19220 movu [r0 + 1509 * 16], m4 | |
19221 | |
19222 ; mode 25 [row 19] | |
19223 movu m6, [r5 + 24 * 16] | |
19224 pmaddubsw m4, m0, m6 | |
19225 pmulhrsw m4, m7 | |
19226 pmaddubsw m5, m2, m6 | |
19227 pmulhrsw m5, m7 | |
19228 packuswb m4, m5 | |
19229 movu [r0 + 1510 * 16], m4 | |
19230 pmaddubsw m4, m1, m6 | |
19231 pmulhrsw m4, m7 | |
19232 pmaddubsw m5, m3, m6 | |
19233 pmulhrsw m5, m7 | |
19234 packuswb m4, m5 | |
19235 movu [r0 + 1511 * 16], m4 | |
19236 | |
19237 ; mode 25 [row 20] | |
19238 movu m6, [r5 + 22 * 16] | |
19239 pmaddubsw m4, m0, m6 | |
19240 pmulhrsw m4, m7 | |
19241 pmaddubsw m5, m2, m6 | |
19242 pmulhrsw m5, m7 | |
19243 packuswb m4, m5 | |
19244 movu [r0 + 1512 * 16], m4 | |
19245 pmaddubsw m4, m1, m6 | |
19246 pmulhrsw m4, m7 | |
19247 pmaddubsw m5, m3, m6 | |
19248 pmulhrsw m5, m7 | |
19249 packuswb m4, m5 | |
19250 movu [r0 + 1513 * 16], m4 | |
19251 | |
19252 ; mode 25 [row 21] | |
19253 movu m6, [r5 + 20 * 16] | |
19254 pmaddubsw m4, m0, m6 | |
19255 pmulhrsw m4, m7 | |
19256 pmaddubsw m5, m2, m6 | |
19257 pmulhrsw m5, m7 | |
19258 packuswb m4, m5 | |
19259 movu [r0 + 1514 * 16], m4 | |
19260 pmaddubsw m4, m1, m6 | |
19261 pmulhrsw m4, m7 | |
19262 pmaddubsw m5, m3, m6 | |
19263 pmulhrsw m5, m7 | |
19264 packuswb m4, m5 | |
19265 movu [r0 + 1515 * 16], m4 | |
19266 | |
19267 ; mode 25 [row 22] | |
19268 movu m6, [r5 + 18 * 16] | |
19269 pmaddubsw m4, m0, m6 | |
19270 pmulhrsw m4, m7 | |
19271 pmaddubsw m5, m2, m6 | |
19272 pmulhrsw m5, m7 | |
19273 packuswb m4, m5 | |
19274 movu [r0 + 1516 * 16], m4 | |
19275 pmaddubsw m4, m1, m6 | |
19276 pmulhrsw m4, m7 | |
19277 pmaddubsw m5, m3, m6 | |
19278 pmulhrsw m5, m7 | |
19279 packuswb m4, m5 | |
19280 movu [r0 + 1517 * 16], m4 | |
19281 | |
19282 ; mode 25 [row 23] | |
19283 movu m6, [r5 + 16 * 16] | |
19284 pmaddubsw m4, m0, m6 | |
19285 pmulhrsw m4, m7 | |
19286 pmaddubsw m5, m2, m6 | |
19287 pmulhrsw m5, m7 | |
19288 packuswb m4, m5 | |
19289 movu [r0 + 1518 * 16], m4 | |
19290 pmaddubsw m4, m1, m6 | |
19291 pmulhrsw m4, m7 | |
19292 pmaddubsw m5, m3, m6 | |
19293 pmulhrsw m5, m7 | |
19294 packuswb m4, m5 | |
19295 movu [r0 + 1519 * 16], m4 | |
19296 | |
19297 ; mode 25 [row 24] | |
19298 movu m6, [r5 + 14 * 16] | |
19299 pmaddubsw m4, m0, m6 | |
19300 pmulhrsw m4, m7 | |
19301 pmaddubsw m5, m2, m6 | |
19302 pmulhrsw m5, m7 | |
19303 packuswb m4, m5 | |
19304 movu [r0 + 1520 * 16], m4 | |
19305 pmaddubsw m4, m1, m6 | |
19306 pmulhrsw m4, m7 | |
19307 pmaddubsw m5, m3, m6 | |
19308 pmulhrsw m5, m7 | |
19309 packuswb m4, m5 | |
19310 movu [r0 + 1521 * 16], m4 | |
19311 | |
19312 ; mode 25 [row 25] | |
19313 movu m6, [r5 + 12 * 16] | |
19314 pmaddubsw m4, m0, m6 | |
19315 pmulhrsw m4, m7 | |
19316 pmaddubsw m5, m2, m6 | |
19317 pmulhrsw m5, m7 | |
19318 packuswb m4, m5 | |
19319 movu [r0 + 1522 * 16], m4 | |
19320 pmaddubsw m4, m1, m6 | |
19321 pmulhrsw m4, m7 | |
19322 pmaddubsw m5, m3, m6 | |
19323 pmulhrsw m5, m7 | |
19324 packuswb m4, m5 | |
19325 movu [r0 + 1523 * 16], m4 | |
19326 | |
19327 ; mode 25 [row 26] | |
19328 movu m6, [r5 + 10 * 16] | |
19329 pmaddubsw m4, m0, m6 | |
19330 pmulhrsw m4, m7 | |
19331 pmaddubsw m5, m2, m6 | |
19332 pmulhrsw m5, m7 | |
19333 packuswb m4, m5 | |
19334 movu [r0 + 1524 * 16], m4 | |
19335 pmaddubsw m4, m1, m6 | |
19336 pmulhrsw m4, m7 | |
19337 pmaddubsw m5, m3, m6 | |
19338 pmulhrsw m5, m7 | |
19339 packuswb m4, m5 | |
19340 movu [r0 + 1525 * 16], m4 | |
19341 | |
19342 ; mode 25 [row 27] | |
19343 movu m6, [r5 + 8 * 16] | |
19344 pmaddubsw m4, m0, m6 | |
19345 pmulhrsw m4, m7 | |
19346 pmaddubsw m5, m2, m6 | |
19347 pmulhrsw m5, m7 | |
19348 packuswb m4, m5 | |
19349 movu [r0 + 1526 * 16], m4 | |
19350 pmaddubsw m4, m1, m6 | |
19351 pmulhrsw m4, m7 | |
19352 pmaddubsw m5, m3, m6 | |
19353 pmulhrsw m5, m7 | |
19354 packuswb m4, m5 | |
19355 movu [r0 + 1527 * 16], m4 | |
19356 | |
19357 ; mode 25 [row 28] | |
19358 movu m6, [r5 + 6 * 16] | |
19359 pmaddubsw m4, m0, m6 | |
19360 pmulhrsw m4, m7 | |
19361 pmaddubsw m5, m2, m6 | |
19362 pmulhrsw m5, m7 | |
19363 packuswb m4, m5 | |
19364 movu [r0 + 1528 * 16], m4 | |
19365 pmaddubsw m4, m1, m6 | |
19366 pmulhrsw m4, m7 | |
19367 pmaddubsw m5, m3, m6 | |
19368 pmulhrsw m5, m7 | |
19369 packuswb m4, m5 | |
19370 movu [r0 + 1529 * 16], m4 | |
19371 | |
19372 ; mode 25 [row 29] | |
19373 movu m6, [r5 + 4 * 16] | |
19374 pmaddubsw m4, m0, m6 | |
19375 pmulhrsw m4, m7 | |
19376 pmaddubsw m5, m2, m6 | |
19377 pmulhrsw m5, m7 | |
19378 packuswb m4, m5 | |
19379 movu [r0 + 1530 * 16], m4 | |
19380 pmaddubsw m4, m1, m6 | |
19381 pmulhrsw m4, m7 | |
19382 pmaddubsw m5, m3, m6 | |
19383 pmulhrsw m5, m7 | |
19384 packuswb m4, m5 | |
19385 movu [r0 + 1531 * 16], m4 | |
19386 | |
19387 ; mode 25 [row 30] | |
19388 movu m6, [r5 + 2 * 16] | |
19389 pmaddubsw m4, m0, m6 | |
19390 pmulhrsw m4, m7 | |
19391 pmaddubsw m5, m2, m6 | |
19392 pmulhrsw m5, m7 | |
19393 packuswb m4, m5 | |
19394 movu [r0 + 1532 * 16], m4 | |
19395 pmaddubsw m4, m1, m6 | |
19396 pmulhrsw m4, m7 | |
19397 pmaddubsw m5, m3, m6 | |
19398 pmulhrsw m5, m7 | |
19399 packuswb m4, m5 | |
19400 movu [r0 + 1533 * 16], m4 | |
19401 | |
19402 ; mode 25 [row 31] | |
19403 pshufb m5, m0, [tab_S2] | |
19404 movh [r0 + 1534 * 16], m5 | |
19405 pshufb m5, m2, [tab_S2] | |
19406 movh [r0 + 1534 * 16 + 8], m5 | |
19407 pshufb m5, m1, [tab_S2] | |
19408 movh [r0 + 1535 * 16], m5 | |
19409 pshufb m5, m3, [tab_S2] | |
19410 movh [r0 + 1535 * 16 + 8], m5 | |
19411 | |
19412 ; mode 26 | |
19413 movu m1, [r1 + 1] | |
19414 movu m2, [r1 + 17] | |
19415 movu [r0 + 1536 * 16], m1 | |
19416 movu [r0 + 1537 * 16], m2 | |
19417 movu [r0 + 1538 * 16], m1 | |
19418 movu [r0 + 1539 * 16], m2 | |
19419 movu [r0 + 1540 * 16], m1 | |
19420 movu [r0 + 1541 * 16], m2 | |
19421 movu [r0 + 1542 * 16], m1 | |
19422 movu [r0 + 1543 * 16], m2 | |
19423 movu [r0 + 1544 * 16], m1 | |
19424 movu [r0 + 1545 * 16], m2 | |
19425 movu [r0 + 1546 * 16], m1 | |
19426 movu [r0 + 1547 * 16], m2 | |
19427 movu [r0 + 1548 * 16], m1 | |
19428 movu [r0 + 1549 * 16], m2 | |
19429 movu [r0 + 1550 * 16], m1 | |
19430 movu [r0 + 1551 * 16], m2 | |
19431 | |
19432 movu [r0 + 1552 * 16], m1 | |
19433 movu [r0 + 1553 * 16], m2 | |
19434 movu [r0 + 1554 * 16], m1 | |
19435 movu [r0 + 1555 * 16], m2 | |
19436 movu [r0 + 1556 * 16], m1 | |
19437 movu [r0 + 1557 * 16], m2 | |
19438 movu [r0 + 1558 * 16], m1 | |
19439 movu [r0 + 1559 * 16], m2 | |
19440 movu [r0 + 1560 * 16], m1 | |
19441 movu [r0 + 1561 * 16], m2 | |
19442 movu [r0 + 1562 * 16], m1 | |
19443 movu [r0 + 1563 * 16], m2 | |
19444 movu [r0 + 1564 * 16], m1 | |
19445 movu [r0 + 1565 * 16], m2 | |
19446 movu [r0 + 1566 * 16], m1 | |
19447 movu [r0 + 1567 * 16], m2 | |
19448 | |
19449 movu [r0 + 1568 * 16], m1 | |
19450 movu [r0 + 1569 * 16], m2 | |
19451 movu [r0 + 1570 * 16], m1 | |
19452 movu [r0 + 1571 * 16], m2 | |
19453 movu [r0 + 1572 * 16], m1 | |
19454 movu [r0 + 1573 * 16], m2 | |
19455 movu [r0 + 1574 * 16], m1 | |
19456 movu [r0 + 1575 * 16], m2 | |
19457 movu [r0 + 1576 * 16], m1 | |
19458 movu [r0 + 1577 * 16], m2 | |
19459 movu [r0 + 1578 * 16], m1 | |
19460 movu [r0 + 1579 * 16], m2 | |
19461 movu [r0 + 1580 * 16], m1 | |
19462 movu [r0 + 1581 * 16], m2 | |
19463 movu [r0 + 1582 * 16], m1 | |
19464 movu [r0 + 1583 * 16], m2 | |
19465 | |
19466 movu [r0 + 1584 * 16], m1 | |
19467 movu [r0 + 1585 * 16], m2 | |
19468 movu [r0 + 1586 * 16], m1 | |
19469 movu [r0 + 1587 * 16], m2 | |
19470 movu [r0 + 1588 * 16], m1 | |
19471 movu [r0 + 1589 * 16], m2 | |
19472 movu [r0 + 1590 * 16], m1 | |
19473 movu [r0 + 1591 * 16], m2 | |
19474 movu [r0 + 1592 * 16], m1 | |
19475 movu [r0 + 1593 * 16], m2 | |
19476 movu [r0 + 1594 * 16], m1 | |
19477 movu [r0 + 1595 * 16], m2 | |
19478 movu [r0 + 1596 * 16], m1 | |
19479 movu [r0 + 1597 * 16], m2 | |
19480 movu [r0 + 1598 * 16], m1 | |
19481 movu [r0 + 1599 * 16], m2 | |
19482 | |
19483 ; mode 27 [row 0] | |
19484 movu m6, [r5 + 2 * 16] | |
19485 movu m0, [r3 + 1 ] | |
19486 movu m1, [r3 + 2 ] | |
19487 punpcklbw m0, m1 | |
19488 pmaddubsw m4, m0, m6 | |
19489 pmulhrsw m4, m7 | |
19490 movu m2, [r3 + 9] | |
19491 movu m3, [r3 + 10] | |
19492 punpcklbw m2, m3 | |
19493 pmaddubsw m5, m2, m6 | |
19494 pmulhrsw m5, m7 | |
19495 packuswb m4, m5 | |
19496 movu [r0 + 1600 * 16], m4 | |
19497 | |
19498 movu m1, [r3 + 17] | |
19499 movu m3, [r3 + 18] | |
19500 punpcklbw m1, m3 | |
19501 pmaddubsw m4, m1, m6 | |
19502 pmulhrsw m4, m7 | |
19503 movu m3, [r3 + 25] | |
19504 movu m5, [r3 + 26] | |
19505 punpcklbw m3, m5 | |
19506 pmaddubsw m5, m3, m6 | |
19507 pmulhrsw m5, m7 | |
19508 packuswb m4, m5 | |
19509 movu [r0 + 1601 * 16], m4 | |
19510 | |
19511 ; mode 27 [row 1] | |
19512 movu m6, [r5 + 4 * 16] | |
19513 pmaddubsw m4, m0, m6 | |
19514 pmulhrsw m4, m7 | |
19515 pmaddubsw m5, m2, m6 | |
19516 pmulhrsw m5, m7 | |
19517 packuswb m4, m5 | |
19518 movu [r0 + 1602 * 16], m4 | |
19519 pmaddubsw m4, m1, m6 | |
19520 pmulhrsw m4, m7 | |
19521 pmaddubsw m5, m3, m6 | |
19522 pmulhrsw m5, m7 | |
19523 packuswb m4, m5 | |
19524 movu [r0 + 1603 * 16], m4 | |
19525 | |
19526 ; mode 27 [row 2] | |
19527 movu m6, [r5 + 6 * 16] | |
19528 pmaddubsw m4, m0, m6 | |
19529 pmulhrsw m4, m7 | |
19530 pmaddubsw m5, m2, m6 | |
19531 pmulhrsw m5, m7 | |
19532 packuswb m4, m5 | |
19533 movu [r0 + 1604 * 16], m4 | |
19534 pmaddubsw m4, m1, m6 | |
19535 pmulhrsw m4, m7 | |
19536 pmaddubsw m5, m3, m6 | |
19537 pmulhrsw m5, m7 | |
19538 packuswb m4, m5 | |
19539 movu [r0 + 1605 * 16], m4 | |
19540 | |
19541 ; mode 27 [row 3] | |
19542 movu m6, [r5 + 8 * 16] | |
19543 pmaddubsw m4, m0, m6 | |
19544 pmulhrsw m4, m7 | |
19545 pmaddubsw m5, m2, m6 | |
19546 pmulhrsw m5, m7 | |
19547 packuswb m4, m5 | |
19548 movu [r0 + 1606 * 16], m4 | |
19549 pmaddubsw m4, m1, m6 | |
19550 pmulhrsw m4, m7 | |
19551 pmaddubsw m5, m3, m6 | |
19552 pmulhrsw m5, m7 | |
19553 packuswb m4, m5 | |
19554 movu [r0 + 1607 * 16], m4 | |
19555 | |
19556 ; mode 27 [row 4] | |
19557 movu m6, [r5 + 10 * 16] | |
19558 pmaddubsw m4, m0, m6 | |
19559 pmulhrsw m4, m7 | |
19560 pmaddubsw m5, m2, m6 | |
19561 pmulhrsw m5, m7 | |
19562 packuswb m4, m5 | |
19563 movu [r0 + 1608 * 16], m4 | |
19564 | |
19565 ; mode 28 [row 1 - first half] | |
19566 movu [r0 + 1666 * 16], m4 | |
19567 | |
19568 pmaddubsw m4, m1, m6 | |
19569 pmulhrsw m4, m7 | |
19570 pmaddubsw m5, m3, m6 | |
19571 pmulhrsw m5, m7 | |
19572 packuswb m4, m5 | |
19573 movu [r0 + 1609 * 16], m4 | |
19574 | |
19575 ; mode 28 [row 1 - second half] | |
19576 movu [r0 + 1667 * 16], m4 | |
19577 | |
19578 ; mode 27 [row 5] | |
19579 movu m6, [r5 + 12 * 16] | |
19580 pmaddubsw m4, m0, m6 | |
19581 pmulhrsw m4, m7 | |
19582 pmaddubsw m5, m2, m6 | |
19583 pmulhrsw m5, m7 | |
19584 packuswb m4, m5 | |
19585 movu [r0 + 1610 * 16], m4 | |
19586 | |
19587 pmaddubsw m4, m1, m6 | |
19588 pmulhrsw m4, m7 | |
19589 pmaddubsw m5, m3, m6 | |
19590 pmulhrsw m5, m7 | |
19591 packuswb m4, m5 | |
19592 movu [r0 + 1611 * 16], m4 | |
19593 | |
19594 ; mode 27 [row 6] | |
19595 movu m6, [r5 + 14 * 16] | |
19596 pmaddubsw m4, m0, m6 | |
19597 pmulhrsw m4, m7 | |
19598 pmaddubsw m5, m2, m6 | |
19599 pmulhrsw m5, m7 | |
19600 packuswb m4, m5 | |
19601 movu [r0 + 1612 * 16], m4 | |
19602 pmaddubsw m4, m1, m6 | |
19603 pmulhrsw m4, m7 | |
19604 pmaddubsw m5, m3, m6 | |
19605 pmulhrsw m5, m7 | |
19606 packuswb m4, m5 | |
19607 movu [r0 + 1613 * 16], m4 | |
19608 | |
19609 ; mode 27 [row 7] | |
19610 movu m6, [r5 + 16 * 16] | |
19611 pmaddubsw m4, m0, m6 | |
19612 pmulhrsw m4, m7 | |
19613 pmaddubsw m5, m2, m6 | |
19614 pmulhrsw m5, m7 | |
19615 packuswb m4, m5 | |
19616 movu [r0 + 1614 * 16], m4 | |
19617 pmaddubsw m4, m1, m6 | |
19618 pmulhrsw m4, m7 | |
19619 pmaddubsw m5, m3, m6 | |
19620 pmulhrsw m5, m7 | |
19621 packuswb m4, m5 | |
19622 movu [r0 + 1615 * 16], m4 | |
19623 | |
19624 ; mode 27 [row 8] | |
19625 movu m6, [r5 + 18 * 16] | |
19626 pmaddubsw m4, m0, m6 | |
19627 pmulhrsw m4, m7 | |
19628 pmaddubsw m5, m2, m6 | |
19629 pmulhrsw m5, m7 | |
19630 packuswb m4, m5 | |
19631 movu [r0 + 1616 * 16], m4 | |
19632 | |
19633 ; mode 29 [row 1 - first half] | |
19634 movu [r0 + 1730 * 16], m4 | |
19635 | |
19636 pmaddubsw m4, m1, m6 | |
19637 pmulhrsw m4, m7 | |
19638 pmaddubsw m5, m3, m6 | |
19639 pmulhrsw m5, m7 | |
19640 packuswb m4, m5 | |
19641 movu [r0 + 1617 * 16], m4 | |
19642 | |
19643 ; mode 29 [row 1 - second half] | |
19644 movu [r0 + 1731 * 16], m4 | |
19645 | |
19646 ; mode 27 [row 9] | |
19647 movu m6, [r5 + 20 * 16] | |
19648 pmaddubsw m4, m0, m6 | |
19649 pmulhrsw m4, m7 | |
19650 pmaddubsw m5, m2, m6 | |
19651 pmulhrsw m5, m7 | |
19652 packuswb m4, m5 | |
19653 movu [r0 + 1618 * 16], m4 | |
19654 | |
19655 ; mode 28 [row 3 - first half] | |
19656 movu [r0 + 1670 * 16], m4 | |
19657 | |
19658 pmaddubsw m4, m1, m6 | |
19659 pmulhrsw m4, m7 | |
19660 pmaddubsw m5, m3, m6 | |
19661 pmulhrsw m5, m7 | |
19662 packuswb m4, m5 | |
19663 movu [r0 + 1619 * 16], m4 | |
19664 | |
19665 ; mode 28 [row 3 - second half] | |
19666 movu [r0 + 1671 * 16], m4 | |
19667 | |
19668 ; mode 27 [row 10] | |
19669 movu m6, [r5 + 22 * 16] | |
19670 pmaddubsw m4, m0, m6 | |
19671 pmulhrsw m4, m7 | |
19672 pmaddubsw m5, m2, m6 | |
19673 pmulhrsw m5, m7 | |
19674 packuswb m4, m5 | |
19675 movu [r0 + 1620 * 16], m4 | |
19676 pmaddubsw m4, m1, m6 | |
19677 pmulhrsw m4, m7 | |
19678 pmaddubsw m5, m3, m6 | |
19679 pmulhrsw m5, m7 | |
19680 packuswb m4, m5 | |
19681 movu [r0 + 1621 * 16], m4 | |
19682 | |
19683 ; mode 27 [row 11] | |
19684 movu m6, [r5 + 24 * 16] | |
19685 pmaddubsw m4, m0, m6 | |
19686 pmulhrsw m4, m7 | |
19687 pmaddubsw m5, m2, m6 | |
19688 pmulhrsw m5, m7 | |
19689 packuswb m4, m5 | |
19690 movu [r0 + 1622 * 16], m4 | |
19691 pmaddubsw m4, m1, m6 | |
19692 pmulhrsw m4, m7 | |
19693 pmaddubsw m5, m3, m6 | |
19694 pmulhrsw m5, m7 | |
19695 packuswb m4, m5 | |
19696 movu [r0 + 1623 * 16], m4 | |
19697 | |
19698 ; mode 27 [row 12] | |
19699 movu m6, [r5 + 26 * 16] | |
19700 pmaddubsw m4, m0, m6 | |
19701 pmulhrsw m4, m7 | |
19702 pmaddubsw m5, m2, m6 | |
19703 pmulhrsw m5, m7 | |
19704 packuswb m4, m5 | |
19705 movu [r0 + 1624 * 16], m4 | |
19706 | |
19707 ; mode 30 [row 1 - first half] | |
19708 movu [r0 + 1794 * 16], m4 | |
19709 | |
19710 ; mode 33 [row 0 - first half] | |
19711 movu [r0 + 1984 * 16], m4 | |
19712 | |
19713 pmaddubsw m4, m1, m6 | |
19714 pmulhrsw m4, m7 | |
19715 pmaddubsw m5, m3, m6 | |
19716 pmulhrsw m5, m7 | |
19717 packuswb m4, m5 | |
19718 movu [r0 + 1625 * 16], m4 | |
19719 | |
19720 ; mode 30 [row 1 - second half] | |
19721 movu [r0 + 1795 * 16], m4 | |
19722 | |
19723 ; mode 33 [row 0 - second half] | |
19724 movu [r0 + 1985 * 16], m4 | |
19725 | |
19726 ; mode 27 [row 13] | |
19727 movu m6, [r5 + 28 * 16] | |
19728 pmaddubsw m4, m0, m6 | |
19729 pmulhrsw m4, m7 | |
19730 pmaddubsw m5, m2, m6 | |
19731 pmulhrsw m5, m7 | |
19732 packuswb m4, m5 | |
19733 movu [r0 + 1626 * 16], m4 | |
19734 pmaddubsw m4, m1, m6 | |
19735 pmulhrsw m4, m7 | |
19736 pmaddubsw m5, m3, m6 | |
19737 pmulhrsw m5, m7 | |
19738 packuswb m4, m5 | |
19739 movu [r0 + 1627 * 16], m4 | |
19740 | |
19741 ; mode 27 [row 14] | |
19742 movu m6, [r5 + 30 * 16] | |
19743 pmaddubsw m4, m0, m6 | |
19744 pmulhrsw m4, m7 | |
19745 pmaddubsw m5, m2, m6 | |
19746 pmulhrsw m5, m7 | |
19747 packuswb m4, m5 | |
19748 movu [r0 + 1628 * 16], m4 | |
19749 | |
19750 ; mode 28 [row 5 - first half] | |
19751 movu [r0 + 1674 * 16], m4 | |
19752 | |
19753 pmaddubsw m4, m1, m6 | |
19754 pmulhrsw m4, m7 | |
19755 pmaddubsw m5, m3, m6 | |
19756 pmulhrsw m5, m7 | |
19757 packuswb m4, m5 | |
19758 movu [r0 + 1629 * 16], m4 | |
19759 | |
19760 ; mode 28 [row 5 - second half] | |
19761 movu [r0 + 1675 * 16], m4 | |
19762 | |
19763 ; mode 28 [row 0] | |
19764 movu m6, [r5 + 5 * 16] | |
19765 pmaddubsw m4, m0, m6 | |
19766 pmulhrsw m4, m7 | |
19767 pmaddubsw m5, m2, m6 | |
19768 pmulhrsw m5, m7 | |
19769 packuswb m4, m5 | |
19770 movu [r0 + 1664 * 16], m4 | |
19771 pmaddubsw m4, m1, m6 | |
19772 pmulhrsw m4, m7 | |
19773 pmaddubsw m5, m3, m6 | |
19774 pmulhrsw m5, m7 | |
19775 packuswb m4, m5 | |
19776 movu [r0 + 1665 * 16], m4 | |
19777 | |
19778 ; mode 28 [row 2] | |
19779 movu m6, [r5 + 15 * 16] | |
19780 pmaddubsw m4, m0, m6 | |
19781 pmulhrsw m4, m7 | |
19782 pmaddubsw m5, m2, m6 | |
19783 pmulhrsw m5, m7 | |
19784 packuswb m4, m5 | |
19785 movu [r0 + 1668 * 16], m4 | |
19786 pmaddubsw m4, m1, m6 | |
19787 pmulhrsw m4, m7 | |
19788 pmaddubsw m5, m3, m6 | |
19789 pmulhrsw m5, m7 | |
19790 packuswb m4, m5 | |
19791 movu [r0 + 1669 * 16], m4 | |
19792 | |
19793 ; mode 28 [row 4] | |
19794 movu m6, [r5 + 25 * 16] | |
19795 pmaddubsw m4, m0, m6 | |
19796 pmulhrsw m4, m7 | |
19797 pmaddubsw m5, m2, m6 | |
19798 pmulhrsw m5, m7 | |
19799 packuswb m4, m5 | |
19800 movu [r0 + 1672 * 16], m4 | |
19801 pmaddubsw m4, m1, m6 | |
19802 pmulhrsw m4, m7 | |
19803 pmaddubsw m5, m3, m6 | |
19804 pmulhrsw m5, m7 | |
19805 packuswb m4, m5 | |
19806 movu [r0 + 1673 * 16], m4 | |
19807 | |
19808 ; mode 30 [row 0] | |
19809 movu m6, [r5 + 13 * 16] | |
19810 pmaddubsw m4, m0, m6 | |
19811 pmulhrsw m4, m7 | |
19812 pmaddubsw m5, m2, m6 | |
19813 pmulhrsw m5, m7 | |
19814 packuswb m4, m5 | |
19815 movu [r0 + 1792 * 16], m4 | |
19816 pmaddubsw m4, m1, m6 | |
19817 pmulhrsw m4, m7 | |
19818 pmaddubsw m5, m3, m6 | |
19819 pmulhrsw m5, m7 | |
19820 packuswb m4, m5 | |
19821 movu [r0 + 1793 * 16], m4 | |
19822 | |
19823 ; mode 29 [row 0] | |
19824 movu m6, [r5 + 9 * 16] | |
19825 pmaddubsw m4, m0, m6 | |
19826 pmulhrsw m4, m7 | |
19827 pmaddubsw m5, m2, m6 | |
19828 pmulhrsw m5, m7 | |
19829 packuswb m4, m5 | |
19830 movu [r0 + 1728 * 16], m4 | |
19831 pmaddubsw m4, m1, m6 | |
19832 pmulhrsw m4, m7 | |
19833 pmaddubsw m5, m3, m6 | |
19834 pmulhrsw m5, m7 | |
19835 packuswb m4, m5 | |
19836 movu [r0 + 1729 * 16], m4 | |
19837 | |
19838 ; mode 29 [row 2] | |
19839 movu m6, [r5 + 27 * 16] | |
19840 pmaddubsw m4, m0, m6 | |
19841 pmulhrsw m4, m7 | |
19842 pmaddubsw m5, m2, m6 | |
19843 pmulhrsw m5, m7 | |
19844 packuswb m4, m5 | |
19845 movu [r0 + 1732 * 16], m4 | |
19846 pmaddubsw m4, m1, m6 | |
19847 pmulhrsw m4, m7 | |
19848 pmaddubsw m5, m3, m6 | |
19849 pmulhrsw m5, m7 | |
19850 packuswb m4, m5 | |
19851 movu [r0 + 1733 * 16], m4 | |
19852 | |
19853 ; mode 31 [row 0] | |
19854 movu m6, [r5 + 17 * 16] | |
19855 pmaddubsw m4, m0, m6 | |
19856 pmulhrsw m4, m7 | |
19857 pmaddubsw m5, m2, m6 | |
19858 pmulhrsw m5, m7 | |
19859 packuswb m4, m5 | |
19860 movu [r0 + 1856 * 16], m4 | |
19861 pmaddubsw m4, m1, m6 | |
19862 pmulhrsw m4, m7 | |
19863 pmaddubsw m5, m3, m6 | |
19864 pmulhrsw m5, m7 | |
19865 packuswb m4, m5 | |
19866 movu [r0 + 1857 * 16], m4 | |
19867 | |
19868 ; mode 32 [row 0] | |
19869 movu m6, [r5 + 21 * 16] | |
19870 pmaddubsw m4, m0, m6 | |
19871 pmulhrsw m4, m7 | |
19872 pmaddubsw m5, m2, m6 | |
19873 pmulhrsw m5, m7 | |
19874 packuswb m4, m5 | |
19875 movu [r0 + 1920 * 16], m4 | |
19876 pmaddubsw m4, m1, m6 | |
19877 pmulhrsw m4, m7 | |
19878 pmaddubsw m5, m3, m6 | |
19879 pmulhrsw m5, m7 | |
19880 packuswb m4, m5 | |
19881 movu [r0 + 1921 * 16], m4 | |
19882 | |
19883 ; mode 27 [row 15] | |
19884 movu m0, [r3 + 2] | |
19885 movd m1, [r3 + 3] | |
19886 palignr m1, m0, 1 | |
19887 punpcklbw m0, m1 | |
19888 movu m2, [r3 + 10] | |
19889 movd m3, [r3 + 11] | |
19890 palignr m3, m2, 1 | |
19891 punpcklbw m2, m3 | |
19892 movu m1, [r3 + 18] | |
19893 movd m3, [r3 + 19] | |
19894 palignr m3, m1, 1 | |
19895 punpcklbw m1, m3 | |
19896 movu m4, [r3 + 26] | |
19897 movd m5, [r3 + 27] | |
19898 palignr m5, m4, 1 | |
19899 punpcklbw m4, m5 | |
19900 | |
19901 pshufb m5, m0, [tab_S2] | |
19902 movh [r0 + 1630 * 16], m5 | |
19903 pshufb m5, m2, [tab_S2] | |
19904 movh [r0 + 1630 * 16 + 8], m5 | |
19905 pshufb m5, m1, [tab_S2] | |
19906 movh [r0 + 1631 * 16], m5 | |
19907 pshufb m5, m4, [tab_S2] | |
19908 movh [r0 + 1631 * 16 + 8], m5 | |
19909 | |
19910 ; mode 27 [row 16] | |
19911 movu m6, [r5 + 2 * 16] | |
19912 pmaddubsw m3, m0, m6 | |
19913 pmulhrsw m3, m7 | |
19914 pmaddubsw m5, m2, m6 | |
19915 pmulhrsw m5, m7 | |
19916 packuswb m3, m5 | |
19917 movu [r0 + 1632 * 16], m3 | |
19918 | |
19919 ; mode 31 [row 1 - first half] | |
19920 movu [r0 + 1858 * 16], m3 | |
19921 | |
19922 pmaddubsw m3, m1, m6 | |
19923 pmulhrsw m3, m7 | |
19924 pmaddubsw m5, m4, m6 | |
19925 pmulhrsw m5, m7 | |
19926 packuswb m3, m5 | |
19927 movu [r0 + 1633 * 16], m3 | |
19928 | |
19929 ; mode 31 [row 1 - second half] | |
19930 movu [r0 + 1859 * 16], m3 | |
19931 | |
19932 ; mode 27 [row 17] | |
19933 movu m6, [r5 + 4 * 16] | |
19934 pmaddubsw m3, m0, m6 | |
19935 pmulhrsw m3, m7 | |
19936 pmaddubsw m5, m2, m6 | |
19937 pmulhrsw m5, m7 | |
19938 packuswb m3, m5 | |
19939 movu [r0 + 1634 * 16], m3 | |
19940 | |
19941 ; mode 29 [row 3 - first half] | |
19942 movu [r0 + 1734 * 16], m3 | |
19943 | |
19944 pmaddubsw m3, m1, m6 | |
19945 pmulhrsw m3, m7 | |
19946 pmaddubsw m5, m4, m6 | |
19947 pmulhrsw m5, m7 | |
19948 packuswb m3, m5 | |
19949 movu [r0 + 1635 * 16], m3 | |
19950 | |
19951 ; mode 29 [row 3 - second half] | |
19952 movu [r0 + 1735 * 16], m3 | |
19953 | |
19954 ; mode 27 [row 18] | |
19955 movu m6, [r5 + 6 * 16] | |
19956 pmaddubsw m3, m0, m6 | |
19957 pmulhrsw m3, m7 | |
19958 pmaddubsw m5, m2, m6 | |
19959 pmulhrsw m5, m7 | |
19960 packuswb m3, m5 | |
19961 movu [r0 + 1636 * 16], m3 | |
19962 pmaddubsw m3, m1, m6 | |
19963 pmulhrsw m3, m7 | |
19964 pmaddubsw m5, m4, m6 | |
19965 pmulhrsw m5, m7 | |
19966 packuswb m3, m5 | |
19967 movu [r0 + 1637 * 16], m3 | |
19968 | |
19969 ; mode 27 [row 19] | |
19970 movu m6, [r5 + 8 * 16] | |
19971 pmaddubsw m3, m0, m6 | |
19972 pmulhrsw m3, m7 | |
19973 pmaddubsw m5, m2, m6 | |
19974 pmulhrsw m5, m7 | |
19975 packuswb m3, m5 | |
19976 movu [r0 + 1638 * 16], m3 | |
19977 | |
19978 ; mode 28 [row 7 - first half] | |
19979 movu [r0 + 1678 * 16], m3 | |
19980 | |
19981 pmaddubsw m3, m1, m6 | |
19982 pmulhrsw m3, m7 | |
19983 pmaddubsw m5, m4, m6 | |
19984 pmulhrsw m5, m7 | |
19985 packuswb m3, m5 | |
19986 movu [r0 + 1639 * 16], m3 | |
19987 | |
19988 ; mode 28 [row 7 - second half] | |
19989 movu [r0 + 1679 * 16], m3 | |
19990 | |
19991 ; mode 27 [row 20] | |
19992 movu m6, [r5 + 10 * 16] | |
19993 pmaddubsw m3, m0, m6 | |
19994 pmulhrsw m3, m7 | |
19995 pmaddubsw m5, m2, m6 | |
19996 pmulhrsw m5, m7 | |
19997 packuswb m3, m5 | |
19998 movu [r0 + 1640 * 16], m3 | |
19999 | |
20000 ; mode 32 [row 1 - first half] | |
20001 movu [r0 + 1922 * 16], m3 | |
20002 | |
20003 pmaddubsw m3, m1, m6 | |
20004 pmulhrsw m3, m7 | |
20005 pmaddubsw m5, m4, m6 | |
20006 pmulhrsw m5, m7 | |
20007 packuswb m3, m5 | |
20008 movu [r0 + 1641 * 16], m3 | |
20009 | |
20010 ; mode 32 [row 1 - second half] | |
20011 movu [r0 + 1923 * 16], m3 | |
20012 | |
20013 ; mode 27 [row 21] | |
20014 movu m6, [r5 + 12 * 16] | |
20015 pmaddubsw m3, m0, m6 | |
20016 pmulhrsw m3, m7 | |
20017 pmaddubsw m5, m2, m6 | |
20018 pmulhrsw m5, m7 | |
20019 packuswb m3, m5 | |
20020 movu [r0 + 1642 * 16], m3 | |
20021 pmaddubsw m3, m1, m6 | |
20022 pmulhrsw m3, m7 | |
20023 pmaddubsw m5, m4, m6 | |
20024 pmulhrsw m5, m7 | |
20025 packuswb m3, m5 | |
20026 movu [r0 + 1643 * 16], m3 | |
20027 | |
20028 ; mode 27 [row 22] | |
20029 movu m6, [r5 + 14 * 16] | |
20030 pmaddubsw m3, m0, m6 | |
20031 pmulhrsw m3, m7 | |
20032 pmaddubsw m5, m2, m6 | |
20033 pmulhrsw m5, m7 | |
20034 packuswb m3, m5 | |
20035 movu [r0 + 1644 * 16], m3 | |
20036 pmaddubsw m3, m1, m6 | |
20037 pmulhrsw m3, m7 | |
20038 pmaddubsw m5, m4, m6 | |
20039 pmulhrsw m5, m7 | |
20040 packuswb m3, m5 | |
20041 movu [r0 + 1645 * 16], m3 | |
20042 | |
20043 ; mode 27 [row 23] | |
20044 movu m6, [r5 + 16 * 16] | |
20045 pmaddubsw m3, m0, m6 | |
20046 pmulhrsw m3, m7 | |
20047 pmaddubsw m5, m2, m6 | |
20048 pmulhrsw m5, m7 | |
20049 packuswb m3, m5 | |
20050 movu [r0 + 1646 * 16], m3 | |
20051 pmaddubsw m3, m1, m6 | |
20052 pmulhrsw m3, m7 | |
20053 pmaddubsw m5, m4, m6 | |
20054 pmulhrsw m5, m7 | |
20055 packuswb m3, m5 | |
20056 movu [r0 + 1647 * 16], m3 | |
20057 | |
20058 ; mode 27 [row 24] | |
20059 movu m6, [r5 + 18 * 16] | |
20060 pmaddubsw m3, m0, m6 | |
20061 pmulhrsw m3, m7 | |
20062 pmaddubsw m5, m2, m6 | |
20063 pmulhrsw m5, m7 | |
20064 packuswb m3, m5 | |
20065 movu [r0 + 1648 * 16], m3 | |
20066 | |
20067 ; mode 28 [row 9 - first half] | |
20068 movu [r0 + 1682 * 16], m3 | |
20069 | |
20070 pmaddubsw m3, m1, m6 | |
20071 pmulhrsw m3, m7 | |
20072 pmaddubsw m5, m4, m6 | |
20073 pmulhrsw m5, m7 | |
20074 packuswb m3, m5 | |
20075 movu [r0 + 1649 * 16], m3 | |
20076 | |
20077 ; mode 28 [row 9 - second half] | |
20078 movu [r0 + 1683 * 16], m3 | |
20079 | |
20080 ; mode 27 [row 25] | |
20081 movu m6, [r5 + 20 * 16] | |
20082 pmaddubsw m3, m0, m6 | |
20083 pmulhrsw m3, m7 | |
20084 pmaddubsw m5, m2, m6 | |
20085 pmulhrsw m5, m7 | |
20086 packuswb m3, m5 | |
20087 movu [r0 + 1650 * 16], m3 | |
20088 | |
20089 ; mode 30 [row 3 - first half] | |
20090 movu [r0 + 1798 * 16], m3 | |
20091 | |
20092 ; mode 33 [row 1 - first half] | |
20093 movu [r0 + 1986 * 16], m3 | |
20094 | |
20095 pmaddubsw m3, m1, m6 | |
20096 pmulhrsw m3, m7 | |
20097 pmaddubsw m5, m4, m6 | |
20098 pmulhrsw m5, m7 | |
20099 packuswb m3, m5 | |
20100 movu [r0 + 1651 * 16], m3 | |
20101 | |
20102 ; mode 30 [row 3 - second half] | |
20103 movu [r0 + 1799 * 16], m3 | |
20104 | |
20105 ; mode 33 [row 1 - second half] | |
20106 movu [r0 + 1987 * 16], m3 | |
20107 | |
20108 ; mode 27 [row 26] | |
20109 movu m6, [r5 + 22 * 16] | |
20110 pmaddubsw m3, m0, m6 | |
20111 pmulhrsw m3, m7 | |
20112 pmaddubsw m5, m2, m6 | |
20113 pmulhrsw m5, m7 | |
20114 packuswb m3, m5 | |
20115 movu [r0 + 1652 * 16], m3 | |
20116 | |
20117 ; mode 29 [row 5 - first half] | |
20118 movu [r0 + 1738 * 16], m3 | |
20119 | |
20120 pmaddubsw m3, m1, m6 | |
20121 pmulhrsw m3, m7 | |
20122 pmaddubsw m5, m4, m6 | |
20123 pmulhrsw m5, m7 | |
20124 packuswb m3, m5 | |
20125 movu [r0 + 1653 * 16], m3 | |
20126 | |
20127 ; mode 29 [row 5 - second half] | |
20128 movu [r0 + 1739 * 16], m3 | |
20129 | |
20130 ; mode 27 [row 27] | |
20131 movu m6, [r5 + 24 * 16] | |
20132 pmaddubsw m3, m0, m6 | |
20133 pmulhrsw m3, m7 | |
20134 pmaddubsw m5, m2, m6 | |
20135 pmulhrsw m5, m7 | |
20136 packuswb m3, m5 | |
20137 movu [r0 + 1654 * 16], m3 | |
20138 pmaddubsw m3, m1, m6 | |
20139 pmulhrsw m3, m7 | |
20140 pmaddubsw m5, m4, m6 | |
20141 pmulhrsw m5, m7 | |
20142 packuswb m3, m5 | |
20143 movu [r0 + 1655 * 16], m3 | |
20144 | |
20145 ; mode 27 [row 28] | |
20146 movu m6, [r5 + 26 * 16] | |
20147 pmaddubsw m3, m0, m6 | |
20148 pmulhrsw m3, m7 | |
20149 pmaddubsw m5, m2, m6 | |
20150 pmulhrsw m5, m7 | |
20151 packuswb m3, m5 | |
20152 movu [r0 + 1656 * 16], m3 | |
20153 pmaddubsw m3, m1, m6 | |
20154 pmulhrsw m3, m7 | |
20155 pmaddubsw m5, m4, m6 | |
20156 pmulhrsw m5, m7 | |
20157 packuswb m3, m5 | |
20158 movu [r0 + 1657 * 16], m3 | |
20159 | |
20160 ; mode 27 [row 29] | |
20161 movu m6, [r5 + 28 * 16] | |
20162 pmaddubsw m3, m0, m6 | |
20163 pmulhrsw m3, m7 | |
20164 pmaddubsw m5, m2, m6 | |
20165 pmulhrsw m5, m7 | |
20166 packuswb m3, m5 | |
20167 movu [r0 + 1658 * 16], m3 | |
20168 | |
20169 ; mode 28 [row 11 - first half] | |
20170 movu [r0 + 1686 * 16], m3 | |
20171 | |
20172 pmaddubsw m3, m1, m6 | |
20173 pmulhrsw m3, m7 | |
20174 pmaddubsw m5, m4, m6 | |
20175 pmulhrsw m5, m7 | |
20176 packuswb m3, m5 | |
20177 movu [r0 + 1659 * 16], m3 | |
20178 | |
20179 ; mode 28 [row 11 - second half] | |
20180 movu [r0 + 1687 * 16], m3 | |
20181 | |
20182 ; mode 27 [row 30] | |
20183 movu m6, [r5 + 30 * 16] | |
20184 pmaddubsw m3, m0, m6 | |
20185 pmulhrsw m3, m7 | |
20186 pmaddubsw m5, m2, m6 | |
20187 pmulhrsw m5, m7 | |
20188 packuswb m3, m5 | |
20189 movu [r0 + 1660 * 16], m3 | |
20190 pmaddubsw m3, m1, m6 | |
20191 pmulhrsw m3, m7 | |
20192 pmaddubsw m5, m4, m6 | |
20193 pmulhrsw m5, m7 | |
20194 packuswb m3, m5 | |
20195 movu [r0 + 1661 * 16], m3 | |
20196 | |
20197 ; mode 28 [row 6] | |
20198 movu m6, [r5 + 3 * 16] | |
20199 pmaddubsw m3, m0, m6 | |
20200 pmulhrsw m3, m7 | |
20201 pmaddubsw m5, m2, m6 | |
20202 pmulhrsw m5, m7 | |
20203 packuswb m3, m5 | |
20204 movu [r0 + 1676 * 16], m3 | |
20205 pmaddubsw m3, m1, m6 | |
20206 pmulhrsw m3, m7 | |
20207 pmaddubsw m5, m4, m6 | |
20208 pmulhrsw m5, m7 | |
20209 packuswb m3, m5 | |
20210 movu [r0 + 1677 * 16], m3 | |
20211 | |
20212 ; mode 28 [row 8] | |
20213 movu m6, [r5 + 13 * 16] | |
20214 pmaddubsw m3, m0, m6 | |
20215 pmulhrsw m3, m7 | |
20216 pmaddubsw m5, m2, m6 | |
20217 pmulhrsw m5, m7 | |
20218 packuswb m3, m5 | |
20219 movu [r0 + 1680 * 16], m3 | |
20220 | |
20221 ; mode 29 [row 4 - first half] | |
20222 movu [r0 + 1736 * 16], m3 | |
20223 | |
20224 pmaddubsw m3, m1, m6 | |
20225 pmulhrsw m3, m7 | |
20226 pmaddubsw m5, m4, m6 | |
20227 pmulhrsw m5, m7 | |
20228 packuswb m3, m5 | |
20229 movu [r0 + 1681 * 16], m3 | |
20230 | |
20231 ; mode 29 [row 4 - second half] | |
20232 movu [r0 + 1737 * 16], m3 | |
20233 | |
20234 ; mode 28 [row 10] | |
20235 movu m6, [r5 + 23 * 16] | |
20236 pmaddubsw m3, m0, m6 | |
20237 pmulhrsw m3, m7 | |
20238 pmaddubsw m5, m2, m6 | |
20239 pmulhrsw m5, m7 | |
20240 packuswb m3, m5 | |
20241 movu [r0 + 1684 * 16], m3 | |
20242 pmaddubsw m3, m1, m6 | |
20243 pmulhrsw m3, m7 | |
20244 pmaddubsw m5, m4, m6 | |
20245 pmulhrsw m5, m7 | |
20246 packuswb m3, m5 | |
20247 movu [r0 + 1685 * 16], m3 | |
20248 | |
20249 ; mode 29 [row 6] | |
20250 movu m6, [r5 + 31 * 16] | |
20251 pmaddubsw m3, m0, m6 | |
20252 pmulhrsw m3, m7 | |
20253 pmaddubsw m5, m2, m6 | |
20254 pmulhrsw m5, m7 | |
20255 packuswb m3, m5 | |
20256 movu [r0 + 1740 * 16], m3 | |
20257 | |
20258 ; mode 32 [row 2 - first half] | |
20259 movu [r0 + 1924 * 16], m3 | |
20260 | |
20261 pmaddubsw m3, m1, m6 | |
20262 pmulhrsw m3, m7 | |
20263 pmaddubsw m5, m4, m6 | |
20264 pmulhrsw m5, m7 | |
20265 packuswb m3, m5 | |
20266 movu [r0 + 1741 * 16], m3 | |
20267 | |
20268 ; mode 32 [row 2 - second half] | |
20269 movu [r0 + 1925 * 16], m3 | |
20270 | |
20271 ; mode 30 [row 2] | |
20272 movu m6, [r5 + 7 * 16] | |
20273 pmaddubsw m3, m0, m6 | |
20274 pmulhrsw m3, m7 | |
20275 pmaddubsw m5, m2, m6 | |
20276 pmulhrsw m5, m7 | |
20277 packuswb m3, m5 | |
20278 movu [r0 + 1796 * 16], m3 | |
20279 pmaddubsw m3, m1, m6 | |
20280 pmulhrsw m3, m7 | |
20281 pmaddubsw m5, m4, m6 | |
20282 pmulhrsw m5, m7 | |
20283 packuswb m3, m5 | |
20284 movu [r0 + 1797 * 16], m3 | |
20285 | |
20286 ; mode 31 [row 2] | |
20287 movu m6, [r5 + 19 * 16] | |
20288 pmaddubsw m3, m0, m6 | |
20289 pmulhrsw m3, m7 | |
20290 pmaddubsw m5, m2, m6 | |
20291 pmulhrsw m5, m7 | |
20292 packuswb m3, m5 | |
20293 movu [r0 + 1860 * 16], m3 | |
20294 pmaddubsw m3, m1, m6 | |
20295 pmulhrsw m3, m7 | |
20296 pmaddubsw m5, m4, m6 | |
20297 pmulhrsw m5, m7 | |
20298 packuswb m3, m5 | |
20299 movu [r0 + 1861 * 16], m3 | |
20300 | |
20301 ; mode 27 [row 31] | |
20302 movu m0, [r3 + 3] | |
20303 movd m1, [r3 + 4] | |
20304 palignr m1, m0, 1 | |
20305 punpcklbw m0, m1 | |
20306 movu m2, [r3 + 11] | |
20307 movd m3, [r3 + 12] | |
20308 palignr m3, m2, 1 | |
20309 punpcklbw m2, m3 | |
20310 movu m1, [r3 + 19] | |
20311 movd m3, [r3 + 20] | |
20312 palignr m3, m1, 1 | |
20313 punpcklbw m1, m3 | |
20314 movu m4, [r3 + 27] | |
20315 movd m5, [r3 + 28] | |
20316 palignr m5, m4, 1 | |
20317 punpcklbw m4, m5 | |
20318 | |
20319 pshufb m5, m0, [tab_S2] | |
20320 movh [r0 + 1662 * 16], m5 | |
20321 pshufb m5, m2, [tab_S2] | |
20322 movh [r0 + 1662 * 16 + 8], m5 | |
20323 pshufb m5, m1, [tab_S2] | |
20324 movh [r0 + 1663 * 16], m5 | |
20325 pshufb m5, m4, [tab_S2] | |
20326 movh [r0 + 1663 * 16 + 8], m5 | |
20327 | |
20328 ; mode 28 [row 12] | |
20329 movu m6, [r5 + 1 * 16] | |
20330 pmaddubsw m3, m0, m6 | |
20331 pmulhrsw m3, m7 | |
20332 pmaddubsw m5, m2, m6 | |
20333 pmulhrsw m5, m7 | |
20334 packuswb m3, m5 | |
20335 movu [r0 + 1688 * 16], m3 | |
20336 | |
20337 ; mode 30 [row 4 - first half] | |
20338 movu [r0 + 1800 * 16], m3 | |
20339 | |
20340 pmaddubsw m3, m1, m6 | |
20341 pmulhrsw m3, m7 | |
20342 pmaddubsw m5, m4, m6 | |
20343 pmulhrsw m5, m7 | |
20344 packuswb m3, m5 | |
20345 movu [r0 + 1689 * 16], m3 | |
20346 | |
20347 ; mode 30 [row 4 - second half] | |
20348 movu [r0 + 1801 * 16], m3 | |
20349 | |
20350 ; mode 28 [row 13] | |
20351 movu m6, [r5 + 6 * 16] | |
20352 pmaddubsw m3, m0, m6 | |
20353 pmulhrsw m3, m7 | |
20354 pmaddubsw m5, m2, m6 | |
20355 pmulhrsw m5, m7 | |
20356 packuswb m3, m5 | |
20357 movu [r0 + 1690 * 16], m3 | |
20358 pmaddubsw m3, m1, m6 | |
20359 pmulhrsw m3, m7 | |
20360 pmaddubsw m5, m4, m6 | |
20361 pmulhrsw m5, m7 | |
20362 packuswb m3, m5 | |
20363 movu [r0 + 1691 * 16], m3 | |
20364 | |
20365 ; mode 28 [row 14] | |
20366 movu m6, [r5 + 11 * 16] | |
20367 pmaddubsw m3, m0, m6 | |
20368 pmulhrsw m3, m7 | |
20369 pmaddubsw m5, m2, m6 | |
20370 pmulhrsw m5, m7 | |
20371 packuswb m3, m5 | |
20372 movu [r0 + 1692 * 16], m3 | |
20373 pmaddubsw m3, m1, m6 | |
20374 pmulhrsw m3, m7 | |
20375 pmaddubsw m5, m4, m6 | |
20376 pmulhrsw m5, m7 | |
20377 packuswb m3, m5 | |
20378 movu [r0 + 1693 * 16], m3 | |
20379 | |
20380 ; mode 28 [row 15] | |
20381 movu m6, [r5 + 16 * 16] | |
20382 pmaddubsw m3, m0, m6 | |
20383 pmulhrsw m3, m7 | |
20384 pmaddubsw m5, m2, m6 | |
20385 pmulhrsw m5, m7 | |
20386 packuswb m3, m5 | |
20387 movu [r0 + 1694 * 16], m3 | |
20388 pmaddubsw m3, m1, m6 | |
20389 pmulhrsw m3, m7 | |
20390 pmaddubsw m5, m4, m6 | |
20391 pmulhrsw m5, m7 | |
20392 packuswb m3, m5 | |
20393 movu [r0 + 1695 * 16], m3 | |
20394 | |
20395 ; mode 28 [row 16] | |
20396 movu m6, [r5 + 21 * 16] | |
20397 pmaddubsw m3, m0, m6 | |
20398 pmulhrsw m3, m7 | |
20399 pmaddubsw m5, m2, m6 | |
20400 pmulhrsw m5, m7 | |
20401 packuswb m3, m5 | |
20402 movu [r0 + 1696 * 16], m3 | |
20403 | |
20404 ; mode 31 [row 4 - first half] | |
20405 movu [r0 + 1864 * 16], m3 | |
20406 | |
20407 pmaddubsw m3, m1, m6 | |
20408 pmulhrsw m3, m7 | |
20409 pmaddubsw m5, m4, m6 | |
20410 pmulhrsw m5, m7 | |
20411 packuswb m3, m5 | |
20412 movu [r0 + 1697 * 16], m3 | |
20413 | |
20414 ; mode 31 [row 4 - second half] | |
20415 movu [r0 + 1865 * 16], m3 | |
20416 | |
20417 ; mode 28 [row 17] | |
20418 movu m6, [r5 + 26 * 16] | |
20419 pmaddubsw m3, m0, m6 | |
20420 pmulhrsw m3, m7 | |
20421 pmaddubsw m5, m2, m6 | |
20422 pmulhrsw m5, m7 | |
20423 packuswb m3, m5 | |
20424 movu [r0 + 1698 * 16], m3 | |
20425 | |
20426 ; mode 29 [row 9 - first half] | |
20427 movu [r0 + 1746 * 16], m3 | |
20428 | |
20429 pmaddubsw m3, m1, m6 | |
20430 pmulhrsw m3, m7 | |
20431 pmaddubsw m5, m4, m6 | |
20432 pmulhrsw m5, m7 | |
20433 packuswb m3, m5 | |
20434 movu [r0 + 1699 * 16], m3 | |
20435 | |
20436 ; mode 29 [row 9 - second half] | |
20437 movu [r0 + 1747 * 16], m3 | |
20438 | |
20439 ; mode 28 [row 18] | |
20440 movu m6, [r5 + 31 * 16] | |
20441 pmaddubsw m3, m0, m6 | |
20442 pmulhrsw m3, m7 | |
20443 pmaddubsw m5, m2, m6 | |
20444 pmulhrsw m5, m7 | |
20445 packuswb m3, m5 | |
20446 movu [r0 + 1700 * 16], m3 | |
20447 pmaddubsw m3, m1, m6 | |
20448 pmulhrsw m3, m7 | |
20449 pmaddubsw m5, m4, m6 | |
20450 pmulhrsw m5, m7 | |
20451 packuswb m3, m5 | |
20452 movu [r0 + 1701 * 16], m3 | |
20453 | |
20454 ; mode 29 [row 7] | |
20455 movu m6, [r5 + 8 * 16] | |
20456 pmaddubsw m3, m0, m6 | |
20457 pmulhrsw m3, m7 | |
20458 pmaddubsw m5, m2, m6 | |
20459 pmulhrsw m5, m7 | |
20460 packuswb m3, m5 | |
20461 movu [r0 + 1742 * 16], m3 | |
20462 pmaddubsw m3, m1, m6 | |
20463 pmulhrsw m3, m7 | |
20464 pmaddubsw m5, m4, m6 | |
20465 pmulhrsw m5, m7 | |
20466 packuswb m3, m5 | |
20467 movu [r0 + 1743 * 16], m3 | |
20468 | |
20469 ; mode 29 [row 8] | |
20470 movu m6, [r5 + 17 * 16] | |
20471 pmaddubsw m3, m0, m6 | |
20472 pmulhrsw m3, m7 | |
20473 pmaddubsw m5, m2, m6 | |
20474 pmulhrsw m5, m7 | |
20475 packuswb m3, m5 | |
20476 movu [r0 + 1744 * 16], m3 | |
20477 pmaddubsw m3, m1, m6 | |
20478 pmulhrsw m3, m7 | |
20479 pmaddubsw m5, m4, m6 | |
20480 pmulhrsw m5, m7 | |
20481 packuswb m3, m5 | |
20482 movu [r0 + 1745 * 16], m3 | |
20483 | |
20484 ; mode 30 [row 5] | |
20485 movu m6, [r5 + 14 * 16] | |
20486 pmaddubsw m3, m0, m6 | |
20487 pmulhrsw m3, m7 | |
20488 pmaddubsw m5, m2, m6 | |
20489 pmulhrsw m5, m7 | |
20490 packuswb m3, m5 | |
20491 movu [r0 + 1802 * 16], m3 | |
20492 | |
20493 ; mode 33 [row 2 - first half] | |
20494 movu [r0 + 1988 * 16], m3 | |
20495 | |
20496 pmaddubsw m3, m1, m6 | |
20497 pmulhrsw m3, m7 | |
20498 pmaddubsw m5, m4, m6 | |
20499 pmulhrsw m5, m7 | |
20500 packuswb m3, m5 | |
20501 movu [r0 + 1803 * 16], m3 | |
20502 | |
20503 ; mode 33 [row 2 - second half] | |
20504 movu [r0 + 1989 * 16], m3 | |
20505 | |
20506 ; mode 30 [row 6] | |
20507 movu m6, [r5 + 27 * 16] | |
20508 pmaddubsw m3, m0, m6 | |
20509 pmulhrsw m3, m7 | |
20510 pmaddubsw m5, m2, m6 | |
20511 pmulhrsw m5, m7 | |
20512 packuswb m3, m5 | |
20513 movu [r0 + 1804 * 16], m3 | |
20514 pmaddubsw m3, m1, m6 | |
20515 pmulhrsw m3, m7 | |
20516 pmaddubsw m5, m4, m6 | |
20517 pmulhrsw m5, m7 | |
20518 packuswb m3, m5 | |
20519 movu [r0 + 1805 * 16], m3 | |
20520 | |
20521 ; mode 31 [row 3] | |
20522 movu m6, [r5 + 4 * 16] | |
20523 pmaddubsw m3, m0, m6 | |
20524 pmulhrsw m3, m7 | |
20525 pmaddubsw m5, m2, m6 | |
20526 pmulhrsw m5, m7 | |
20527 packuswb m3, m5 | |
20528 movu [r0 + 1862 * 16], m3 | |
20529 pmaddubsw m3, m1, m6 | |
20530 pmulhrsw m3, m7 | |
20531 pmaddubsw m5, m4, m6 | |
20532 pmulhrsw m5, m7 | |
20533 packuswb m3, m5 | |
20534 movu [r0 + 1863 * 16], m3 | |
20535 | |
20536 ; mode 32 [row 3] | |
20537 movu m6, [r5 + 20 * 16] | |
20538 pmaddubsw m3, m0, m6 | |
20539 pmulhrsw m3, m7 | |
20540 pmaddubsw m5, m2, m6 | |
20541 pmulhrsw m5, m7 | |
20542 packuswb m3, m5 | |
20543 movu [r0 + 1926 * 16], m3 | |
20544 pmaddubsw m3, m1, m6 | |
20545 pmulhrsw m3, m7 | |
20546 pmaddubsw m5, m4, m6 | |
20547 pmulhrsw m5, m7 | |
20548 packuswb m3, m5 | |
20549 movu [r0 + 1927 * 16], m3 | |
20550 | |
20551 ; mode 28 [row 19] | |
20552 movu m6, [r5 + 4 * 16] | |
20553 movu m0, [r3 + 4] | |
20554 movd m1, [r3 + 5] | |
20555 palignr m1, m0, 1 | |
20556 punpcklbw m0, m1 | |
20557 pmaddubsw m3, m0, m6 | |
20558 pmulhrsw m3, m7 | |
20559 movu m2, [r3 + 12] | |
20560 movd m4, [r3 + 13] | |
20561 palignr m4, m2, 1 | |
20562 punpcklbw m2, m4 | |
20563 pmaddubsw m5, m2, m6 | |
20564 pmulhrsw m5, m7 | |
20565 packuswb m3, m5 | |
20566 movu [r0 + 1702 * 16], m3 | |
20567 | |
20568 movu m1, [r3 + 20] | |
20569 movd m3, [r3 + 21] | |
20570 palignr m3, m1, 1 | |
20571 punpcklbw m1, m3 | |
20572 pmaddubsw m3, m1, m6 | |
20573 pmulhrsw m3, m7 | |
20574 movu m4, [r3 + 28] | |
20575 movd m5, [r3 + 29] | |
20576 palignr m5, m4, 1 | |
20577 punpcklbw m4, m5 | |
20578 pmaddubsw m5, m4, m6 | |
20579 pmulhrsw m5, m7 | |
20580 packuswb m3, m5 | |
20581 movu [r0 + 1703 * 16], m3 | |
20582 | |
20583 ; mode 28 [row 20] | |
20584 movu m6, [r5 + 9 * 16] | |
20585 pmaddubsw m3, m0, m6 | |
20586 pmulhrsw m3, m7 | |
20587 pmaddubsw m5, m2, m6 | |
20588 pmulhrsw m5, m7 | |
20589 packuswb m3, m5 | |
20590 movu [r0 + 1704 * 16], m3 | |
20591 | |
20592 ; mode 32 [row 4 - first half] | |
20593 movu [r0 + 1928 * 16], m3 | |
20594 | |
20595 pmaddubsw m3, m1, m6 | |
20596 pmulhrsw m3, m7 | |
20597 pmaddubsw m5, m4, m6 | |
20598 pmulhrsw m5, m7 | |
20599 packuswb m3, m5 | |
20600 movu [r0 + 1705 * 16], m3 | |
20601 | |
20602 ; mode 32 [row 4 - second half] | |
20603 movu [r0 + 1929 * 16], m3 | |
20604 | |
20605 ; mode 28 [row 21] | |
20606 movu m6, [r5 + 14 * 16] | |
20607 pmaddubsw m3, m0, m6 | |
20608 pmulhrsw m3, m7 | |
20609 pmaddubsw m5, m2, m6 | |
20610 pmulhrsw m5, m7 | |
20611 packuswb m3, m5 | |
20612 movu [r0 + 1706 * 16], m3 | |
20613 pmaddubsw m3, m1, m6 | |
20614 pmulhrsw m3, m7 | |
20615 pmaddubsw m5, m4, m6 | |
20616 pmulhrsw m5, m7 | |
20617 packuswb m3, m5 | |
20618 movu [r0 + 1707 * 16], m3 | |
20619 | |
20620 ; mode 28 [row 22] | |
20621 movu m6, [r5 + 19 * 16] | |
20622 pmaddubsw m3, m0, m6 | |
20623 pmulhrsw m3, m7 | |
20624 pmaddubsw m5, m2, m6 | |
20625 pmulhrsw m5, m7 | |
20626 packuswb m3, m5 | |
20627 movu [r0 + 1708 * 16], m3 | |
20628 pmaddubsw m3, m1, m6 | |
20629 pmulhrsw m3, m7 | |
20630 pmaddubsw m5, m4, m6 | |
20631 pmulhrsw m5, m7 | |
20632 packuswb m3, m5 | |
20633 movu [r0 + 1709 * 16], m3 | |
20634 | |
20635 ; mode 28 [row 23] | |
20636 movu m6, [r5 + 24 * 16] | |
20637 pmaddubsw m3, m0, m6 | |
20638 pmulhrsw m3, m7 | |
20639 pmaddubsw m5, m2, m6 | |
20640 pmulhrsw m5, m7 | |
20641 packuswb m3, m5 | |
20642 movu [r0 + 1710 * 16], m3 | |
20643 pmaddubsw m3, m1, m6 | |
20644 pmulhrsw m3, m7 | |
20645 pmaddubsw m5, m4, m6 | |
20646 pmulhrsw m5, m7 | |
20647 packuswb m3, m5 | |
20648 movu [r0 + 1711 * 16], m3 | |
20649 | |
20650 ; mode 28 [row 24] | |
20651 movu m6, [r5 + 29 * 16] | |
20652 pmaddubsw m3, m0, m6 | |
20653 pmulhrsw m3, m7 | |
20654 pmaddubsw m5, m2, m6 | |
20655 pmulhrsw m5, m7 | |
20656 packuswb m3, m5 | |
20657 movu [r0 + 1712 * 16], m3 | |
20658 pmaddubsw m3, m1, m6 | |
20659 pmulhrsw m3, m7 | |
20660 pmaddubsw m5, m4, m6 | |
20661 pmulhrsw m5, m7 | |
20662 packuswb m3, m5 | |
20663 movu [r0 + 1713 * 16], m3 | |
20664 | |
20665 ; mode 29 [row 10] | |
20666 movu m6, [r5 + 3 * 16] | |
20667 pmaddubsw m3, m0, m6 | |
20668 pmulhrsw m3, m7 | |
20669 pmaddubsw m5, m2, m6 | |
20670 pmulhrsw m5, m7 | |
20671 packuswb m3, m5 | |
20672 movu [r0 + 1748 * 16], m3 | |
20673 pmaddubsw m3, m1, m6 | |
20674 pmulhrsw m3, m7 | |
20675 pmaddubsw m5, m4, m6 | |
20676 pmulhrsw m5, m7 | |
20677 packuswb m3, m5 | |
20678 movu [r0 + 1749 * 16], m3 | |
20679 | |
20680 ; mode 29 [row 11] | |
20681 movu m6, [r5 + 12 * 16] | |
20682 pmaddubsw m3, m0, m6 | |
20683 pmulhrsw m3, m7 | |
20684 pmaddubsw m5, m2, m6 | |
20685 pmulhrsw m5, m7 | |
20686 packuswb m3, m5 | |
20687 movu [r0 + 1750 * 16], m3 | |
20688 pmaddubsw m3, m1, m6 | |
20689 pmulhrsw m3, m7 | |
20690 pmaddubsw m5, m4, m6 | |
20691 pmulhrsw m5, m7 | |
20692 packuswb m3, m5 | |
20693 movu [r0 + 1751 * 16], m3 | |
20694 | |
20695 ; mode 29 [row 12] | |
20696 movu m6, [r5 + 21 * 16] | |
20697 pmaddubsw m3, m0, m6 | |
20698 pmulhrsw m3, m7 | |
20699 pmaddubsw m5, m2, m6 | |
20700 pmulhrsw m5, m7 | |
20701 packuswb m3, m5 | |
20702 movu [r0 + 1752 * 16], m3 | |
20703 | |
20704 ; mode 30 [row 8 - first half] | |
20705 movu [r0 + 1808 * 16], m3 | |
20706 | |
20707 pmaddubsw m3, m1, m6 | |
20708 pmulhrsw m3, m7 | |
20709 pmaddubsw m5, m4, m6 | |
20710 pmulhrsw m5, m7 | |
20711 packuswb m3, m5 | |
20712 movu [r0 + 1753 * 16], m3 | |
20713 | |
20714 ; mode 30 [row 8 - second half] | |
20715 movu [r0 + 1809 * 16], m3 | |
20716 | |
20717 ; mode 29 [row 13] | |
20718 movu m6, [r5 + 30 * 16] | |
20719 pmaddubsw m3, m0, m6 | |
20720 pmulhrsw m3, m7 | |
20721 pmaddubsw m5, m2, m6 | |
20722 pmulhrsw m5, m7 | |
20723 packuswb m3, m5 | |
20724 movu [r0 + 1754 * 16], m3 | |
20725 | |
20726 ; mode 32 [row 5 - first half] | |
20727 movu [r0 + 1930 * 16], m3 | |
20728 | |
20729 pmaddubsw m3, m1, m6 | |
20730 pmulhrsw m3, m7 | |
20731 pmaddubsw m5, m4, m6 | |
20732 pmulhrsw m5, m7 | |
20733 packuswb m3, m5 | |
20734 movu [r0 + 1755 * 16], m3 | |
20735 | |
20736 ; mode 32 [row 5 - second half] | |
20737 movu [r0 + 1931 * 16], m3 | |
20738 | |
20739 ; mode 30 [row 7] | |
20740 movu m6, [r5 + 8 * 16] | |
20741 pmaddubsw m3, m0, m6 | |
20742 pmulhrsw m3, m7 | |
20743 pmaddubsw m5, m2, m6 | |
20744 pmulhrsw m5, m7 | |
20745 packuswb m3, m5 | |
20746 movu [r0 + 1806 * 16], m3 | |
20747 | |
20748 ; mode 33 [row 3 - first half] | |
20749 movu [r0 + 1990 * 16], m3 | |
20750 | |
20751 pmaddubsw m3, m1, m6 | |
20752 pmulhrsw m3, m7 | |
20753 pmaddubsw m5, m4, m6 | |
20754 pmulhrsw m5, m7 | |
20755 packuswb m3, m5 | |
20756 movu [r0 + 1807 * 16], m3 | |
20757 | |
20758 ; mode 33 [row 3 - second half] | |
20759 movu [r0 + 1991 * 16], m3 | |
20760 | |
20761 ; mode 31 [row 5] | |
20762 movu m6, [r5 + 6 * 16] | |
20763 pmaddubsw m3, m0, m6 | |
20764 pmulhrsw m3, m7 | |
20765 pmaddubsw m5, m2, m6 | |
20766 pmulhrsw m5, m7 | |
20767 packuswb m3, m5 | |
20768 movu [r0 + 1866 * 16], m3 | |
20769 pmaddubsw m3, m1, m6 | |
20770 pmulhrsw m3, m7 | |
20771 pmaddubsw m5, m4, m6 | |
20772 pmulhrsw m5, m7 | |
20773 packuswb m3, m5 | |
20774 movu [r0 + 1867 * 16], m3 | |
20775 | |
20776 ; mode 31 [row 6] | |
20777 movu m6, [r5 + 23 * 16] | |
20778 pmaddubsw m3, m0, m6 | |
20779 pmulhrsw m3, m7 | |
20780 pmaddubsw m5, m2, m6 | |
20781 pmulhrsw m5, m7 | |
20782 packuswb m3, m5 | |
20783 movu [r0 + 1868 * 16], m3 | |
20784 pmaddubsw m3, m1, m6 | |
20785 pmulhrsw m3, m7 | |
20786 pmaddubsw m5, m4, m6 | |
20787 pmulhrsw m5, m7 | |
20788 packuswb m3, m5 | |
20789 movu [r0 + 1869 * 16], m3 | |
20790 | |
20791 ; mode 28 [row 25] | |
20792 movu m6, [r5 + 2 * 16] | |
20793 movu m0, [r3 + 5] | |
20794 movd m1, [r3 + 6] | |
20795 palignr m1, m0, 1 | |
20796 punpcklbw m0, m1 | |
20797 pmaddubsw m3, m0, m6 | |
20798 pmulhrsw m3, m7 | |
20799 movu m2, [r3 + 13] | |
20800 movd m4, [r3 + 14] | |
20801 palignr m4, m2, 1 | |
20802 punpcklbw m2, m4 | |
20803 pmaddubsw m5, m2, m6 | |
20804 pmulhrsw m5, m7 | |
20805 packuswb m3, m5 | |
20806 movu [r0 + 1714 * 16], m3 | |
20807 | |
20808 movu m1, [r3 + 21] | |
20809 movd m3, [r3 + 22] | |
20810 palignr m3, m1, 1 | |
20811 punpcklbw m1, m3 | |
20812 pmaddubsw m3, m1, m6 | |
20813 pmulhrsw m3, m7 | |
20814 movu m4, [r3 + 29] | |
20815 movd m5, [r3 + 30] | |
20816 palignr m5, m4, 1 | |
20817 punpcklbw m4, m5 | |
20818 pmaddubsw m5, m4, m6 | |
20819 pmulhrsw m5, m7 | |
20820 packuswb m3, m5 | |
20821 movu [r0 + 1715 * 16], m3 | |
20822 | |
20823 ; mode 28 [row 26] | |
20824 movu m6, [r5 + 7 * 16] | |
20825 pmaddubsw m3, m0, m6 | |
20826 pmulhrsw m3, m7 | |
20827 pmaddubsw m5, m2, m6 | |
20828 pmulhrsw m5, m7 | |
20829 packuswb m3, m5 | |
20830 movu [r0 + 1716 * 16], m3 | |
20831 | |
20832 ; mode 29 [row 14 - first half] | |
20833 movu [r0 + 1756 * 16], m3 | |
20834 | |
20835 pmaddubsw m3, m1, m6 | |
20836 pmulhrsw m3, m7 | |
20837 pmaddubsw m5, m4, m6 | |
20838 pmulhrsw m5, m7 | |
20839 packuswb m3, m5 | |
20840 movu [r0 + 1717 * 16], m3 | |
20841 | |
20842 ; mode 29 [row 14 - second half] | |
20843 movu [r0 + 1757 * 16], m3 | |
20844 | |
20845 ; mode 28 [row 27] | |
20846 movu m6, [r5 + 12 * 16] | |
20847 pmaddubsw m3, m0, m6 | |
20848 pmulhrsw m3, m7 | |
20849 pmaddubsw m5, m2, m6 | |
20850 pmulhrsw m5, m7 | |
20851 packuswb m3, m5 | |
20852 movu [r0 + 1718 * 16], m3 | |
20853 pmaddubsw m3, m1, m6 | |
20854 pmulhrsw m3, m7 | |
20855 pmaddubsw m5, m4, m6 | |
20856 pmulhrsw m5, m7 | |
20857 packuswb m3, m5 | |
20858 movu [r0 + 1719 * 16], m3 | |
20859 | |
20860 ; mode 28 [row 28] | |
20861 movu m6, [r5 + 17 * 16] | |
20862 pmaddubsw m3, m0, m6 | |
20863 pmulhrsw m3, m7 | |
20864 pmaddubsw m5, m2, m6 | |
20865 pmulhrsw m5, m7 | |
20866 packuswb m3, m5 | |
20867 movu [r0 + 1720 * 16], m3 | |
20868 pmaddubsw m3, m1, m6 | |
20869 pmulhrsw m3, m7 | |
20870 pmaddubsw m5, m4, m6 | |
20871 pmulhrsw m5, m7 | |
20872 packuswb m3, m5 | |
20873 movu [r0 + 1721 * 16], m3 | |
20874 | |
20875 ; mode 28 [row 29] | |
20876 movu m6, [r5 + 22 * 16] | |
20877 pmaddubsw m3, m0, m6 | |
20878 pmulhrsw m3, m7 | |
20879 pmaddubsw m5, m2, m6 | |
20880 pmulhrsw m5, m7 | |
20881 packuswb m3, m5 | |
20882 movu [r0 + 1722 * 16], m3 | |
20883 pmaddubsw m3, m1, m6 | |
20884 pmulhrsw m3, m7 | |
20885 pmaddubsw m5, m4, m6 | |
20886 pmulhrsw m5, m7 | |
20887 packuswb m3, m5 | |
20888 movu [r0 + 1723 * 16], m3 | |
20889 | |
20890 ; mode 28 [row 30] | |
20891 movu m6, [r5 + 27 * 16] | |
20892 pmaddubsw m3, m0, m6 | |
20893 pmulhrsw m3, m7 | |
20894 pmaddubsw m5, m2, m6 | |
20895 pmulhrsw m5, m7 | |
20896 packuswb m3, m5 | |
20897 movu [r0 + 1724 * 16], m3 | |
20898 pmaddubsw m3, m1, m6 | |
20899 pmulhrsw m3, m7 | |
20900 pmaddubsw m5, m4, m6 | |
20901 pmulhrsw m5, m7 | |
20902 packuswb m3, m5 | |
20903 movu [r0 + 1725 * 16], m3 | |
20904 | |
20905 ; mode 29 [row 15] | |
20906 movu m6, [r5 + 16 * 16] | |
20907 pmaddubsw m3, m0, m6 | |
20908 pmulhrsw m3, m7 | |
20909 pmaddubsw m5, m2, m6 | |
20910 pmulhrsw m5, m7 | |
20911 packuswb m3, m5 | |
20912 movu [r0 + 1758 * 16], m3 | |
20913 pmaddubsw m3, m1, m6 | |
20914 pmulhrsw m3, m7 | |
20915 pmaddubsw m5, m4, m6 | |
20916 pmulhrsw m5, m7 | |
20917 packuswb m3, m5 | |
20918 movu [r0 + 1759 * 16], m3 | |
20919 | |
20920 ; mode 29 [row 16] | |
20921 movu m6, [r5 + 25 * 16] | |
20922 pmaddubsw m3, m0, m6 | |
20923 pmulhrsw m3, m7 | |
20924 pmaddubsw m5, m2, m6 | |
20925 pmulhrsw m5, m7 | |
20926 packuswb m3, m5 | |
20927 movu [r0 + 1760 * 16], m3 | |
20928 pmaddubsw m3, m1, m6 | |
20929 pmulhrsw m3, m7 | |
20930 pmaddubsw m5, m4, m6 | |
20931 pmulhrsw m5, m7 | |
20932 packuswb m3, m5 | |
20933 movu [r0 + 1761 * 16], m3 | |
20934 | |
20935 ; mode 30 [row 9] | |
20936 movu m6, [r5 + 2 * 16] | |
20937 pmaddubsw m3, m0, m6 | |
20938 pmulhrsw m3, m7 | |
20939 pmaddubsw m5, m2, m6 | |
20940 pmulhrsw m5, m7 | |
20941 packuswb m3, m5 | |
20942 movu [r0 + 1810 * 16], m3 | |
20943 | |
20944 ; mode 33 [row 4 - first half] | |
20945 movu [r0 + 1992 * 16], m3 | |
20946 | |
20947 pmaddubsw m3, m1, m6 | |
20948 pmulhrsw m3, m7 | |
20949 pmaddubsw m5, m4, m6 | |
20950 pmulhrsw m5, m7 | |
20951 packuswb m3, m5 | |
20952 movu [r0 + 1811 * 16], m3 | |
20953 | |
20954 ; mode 33 [row 4 - second half] | |
20955 movu [r0 + 1993 * 16], m3 | |
20956 | |
20957 ; mode 30 [row 10] | |
20958 movu m6, [r5 + 15 * 16] | |
20959 pmaddubsw m3, m0, m6 | |
20960 pmulhrsw m3, m7 | |
20961 pmaddubsw m5, m2, m6 | |
20962 pmulhrsw m5, m7 | |
20963 packuswb m3, m5 | |
20964 movu [r0 + 1812 * 16], m3 | |
20965 pmaddubsw m3, m1, m6 | |
20966 pmulhrsw m3, m7 | |
20967 pmaddubsw m5, m4, m6 | |
20968 pmulhrsw m5, m7 | |
20969 packuswb m3, m5 | |
20970 movu [r0 + 1813 * 16], m3 | |
20971 | |
20972 ; mode 31 [row 7] | |
20973 movu m6, [r5 + 8 * 16] | |
20974 pmaddubsw m3, m0, m6 | |
20975 pmulhrsw m3, m7 | |
20976 pmaddubsw m5, m2, m6 | |
20977 pmulhrsw m5, m7 | |
20978 packuswb m3, m5 | |
20979 movu [r0 + 1870 * 16], m3 | |
20980 pmaddubsw m3, m1, m6 | |
20981 pmulhrsw m3, m7 | |
20982 pmaddubsw m5, m4, m6 | |
20983 pmulhrsw m5, m7 | |
20984 packuswb m3, m5 | |
20985 movu [r0 + 1871 * 16], m3 | |
20986 | |
20987 ; mode 31 [row 8] | |
20988 movu m6, [r5 + 25 * 16] | |
20989 pmaddubsw m3, m0, m6 | |
20990 pmulhrsw m3, m7 | |
20991 pmaddubsw m5, m2, m6 | |
20992 pmulhrsw m5, m7 | |
20993 packuswb m3, m5 | |
20994 movu [r0 + 1872 * 16], m3 | |
20995 pmaddubsw m3, m1, m6 | |
20996 pmulhrsw m3, m7 | |
20997 pmaddubsw m5, m4, m6 | |
20998 pmulhrsw m5, m7 | |
20999 packuswb m3, m5 | |
21000 movu [r0 + 1873 * 16], m3 | |
21001 | |
21002 ; mode 32 [row 6] | |
21003 movu m6, [r5 + 19 * 16] | |
21004 pmaddubsw m3, m0, m6 | |
21005 pmulhrsw m3, m7 | |
21006 pmaddubsw m5, m2, m6 | |
21007 pmulhrsw m5, m7 | |
21008 packuswb m3, m5 | |
21009 movu [r0 + 1932 * 16], m3 | |
21010 pmaddubsw m3, m1, m6 | |
21011 pmulhrsw m3, m7 | |
21012 pmaddubsw m5, m4, m6 | |
21013 pmulhrsw m5, m7 | |
21014 packuswb m3, m5 | |
21015 movu [r0 + 1933 * 16], m3 | |
21016 | |
21017 ; mode 30 [row 11] | |
21018 movu m6, [r5 + 28 * 16] | |
21019 pmaddubsw m3, m0, m6 | |
21020 pmulhrsw m3, m7 | |
21021 pmaddubsw m5, m2, m6 | |
21022 pmulhrsw m5, m7 | |
21023 packuswb m3, m5 | |
21024 movu [r0 + 1814 * 16], m3 | |
21025 | |
21026 ; mode 33 [row 5 - first half] | |
21027 movu [r0 + 1994 * 16], m3 | |
21028 | |
21029 pmaddubsw m3, m1, m6 | |
21030 pmulhrsw m3, m7 | |
21031 pmaddubsw m5, m4, m6 | |
21032 pmulhrsw m5, m7 | |
21033 packuswb m3, m5 | |
21034 movu [r0 + 1815 * 16], m3 | |
21035 | |
21036 ; mode 33 [row 5 - second half] | |
21037 movu [r0 + 1995 * 16], m3 | |
21038 | |
21039 ; mode 28 [row 31] | |
21040 movu m0, [r3 + 6] | |
21041 movd m1, [r3 + 7] | |
21042 palignr m1, m0, 1 | |
21043 punpcklbw m0, m1 | |
21044 movu m2, [r3 + 14] | |
21045 movd m3, [r3 + 15] | |
21046 palignr m3, m2, 1 | |
21047 punpcklbw m2, m3 | |
21048 movu m1, [r3 + 22] | |
21049 movd m3, [r3 + 23] | |
21050 palignr m3, m1, 1 | |
21051 punpcklbw m1, m3 | |
21052 movu m4, [r3 + 30] | |
21053 movd m5, [r3 + 31] | |
21054 palignr m5, m4, 1 | |
21055 punpcklbw m4, m5 | |
21056 | |
21057 pshufb m5, m0, [tab_S2] | |
21058 movh [r0 + 1726 * 16], m5 | |
21059 pshufb m5, m2, [tab_S2] | |
21060 movh [r0 + 1726 * 16 + 8], m5 | |
21061 pshufb m5, m1, [tab_S2] | |
21062 movh [r0 + 1727 * 16], m5 | |
21063 pshufb m5, m4, [tab_S2] | |
21064 movh [r0 + 1727 * 16 + 8], m5 | |
21065 | |
21066 ; mode 29 [row 17] | |
21067 movu m6, [r5 + 2 * 16] | |
21068 pmaddubsw m3, m0, m6 | |
21069 pmulhrsw m3, m7 | |
21070 pmaddubsw m5, m2, m6 | |
21071 pmulhrsw m5, m7 | |
21072 packuswb m3, m5 | |
21073 movu [r0 + 1762 * 16], m3 | |
21074 pmaddubsw m3, m1, m6 | |
21075 pmulhrsw m3, m7 | |
21076 pmaddubsw m5, m4, m6 | |
21077 pmulhrsw m5, m7 | |
21078 packuswb m3, m5 | |
21079 movu [r0 + 1763 * 16], m3 | |
21080 | |
21081 ; mode 29 [row 18] | |
21082 movu m6, [r5 + 11 * 16] | |
21083 pmaddubsw m3, m0, m6 | |
21084 pmulhrsw m3, m7 | |
21085 pmaddubsw m5, m2, m6 | |
21086 pmulhrsw m5, m7 | |
21087 packuswb m3, m5 | |
21088 movu [r0 + 1764 * 16], m3 | |
21089 pmaddubsw m3, m1, m6 | |
21090 pmulhrsw m3, m7 | |
21091 pmaddubsw m5, m4, m6 | |
21092 pmulhrsw m5, m7 | |
21093 packuswb m3, m5 | |
21094 movu [r0 + 1765 * 16], m3 | |
21095 | |
21096 ; mode 29 [row 19] | |
21097 movu m6, [r5 + 20 * 16] | |
21098 pmaddubsw m3, m0, m6 | |
21099 pmulhrsw m3, m7 | |
21100 pmaddubsw m5, m2, m6 | |
21101 pmulhrsw m5, m7 | |
21102 packuswb m3, m5 | |
21103 movu [r0 + 1766 * 16], m3 | |
21104 pmaddubsw m3, m1, m6 | |
21105 pmulhrsw m3, m7 | |
21106 pmaddubsw m5, m4, m6 | |
21107 pmulhrsw m5, m7 | |
21108 packuswb m3, m5 | |
21109 movu [r0 + 1767 * 16], m3 | |
21110 | |
21111 ; mode 29 [row 20] | |
21112 movu m6, [r5 + 29 * 16] | |
21113 pmaddubsw m3, m0, m6 | |
21114 pmulhrsw m3, m7 | |
21115 pmaddubsw m5, m2, m6 | |
21116 pmulhrsw m5, m7 | |
21117 packuswb m3, m5 | |
21118 movu [r0 + 1768 * 16], m3 | |
21119 | |
21120 ; mode 32 [row 8 - first half] | |
21121 movu [r0 + 1936 * 16], m3 | |
21122 | |
21123 pmaddubsw m3, m1, m6 | |
21124 pmulhrsw m3, m7 | |
21125 pmaddubsw m5, m4, m6 | |
21126 pmulhrsw m5, m7 | |
21127 packuswb m3, m5 | |
21128 movu [r0 + 1769 * 16], m3 | |
21129 | |
21130 ; mode 32 [row 8 - second half] | |
21131 movu [r0 + 1937 * 16], m3 | |
21132 | |
21133 ; mode 30 [row 12] | |
21134 movu m6, [r5 + 9 * 16] | |
21135 pmaddubsw m3, m0, m6 | |
21136 pmulhrsw m3, m7 | |
21137 pmaddubsw m5, m2, m6 | |
21138 pmulhrsw m5, m7 | |
21139 packuswb m3, m5 | |
21140 movu [r0 + 1816 * 16], m3 | |
21141 pmaddubsw m3, m1, m6 | |
21142 pmulhrsw m3, m7 | |
21143 pmaddubsw m5, m4, m6 | |
21144 pmulhrsw m5, m7 | |
21145 packuswb m3, m5 | |
21146 movu [r0 + 1817 * 16], m3 | |
21147 | |
21148 ; mode 30 [row 13] | |
21149 movu m6, [r5 + 22 * 16] | |
21150 pmaddubsw m3, m0, m6 | |
21151 pmulhrsw m3, m7 | |
21152 pmaddubsw m5, m2, m6 | |
21153 pmulhrsw m5, m7 | |
21154 packuswb m3, m5 | |
21155 movu [r0 + 1818 * 16], m3 | |
21156 | |
21157 ; mode 33 [row 6 - first half] | |
21158 movu [r0 + 1996 * 16], m3 | |
21159 | |
21160 pmaddubsw m3, m1, m6 | |
21161 pmulhrsw m3, m7 | |
21162 pmaddubsw m5, m4, m6 | |
21163 pmulhrsw m5, m7 | |
21164 packuswb m3, m5 | |
21165 movu [r0 + 1819 * 16], m3 | |
21166 | |
21167 ; mode 33 [row 6 - second half] | |
21168 movu [r0 + 1997 * 16], m3 | |
21169 | |
21170 ; mode 31 [row 9] | |
21171 movu m6, [r5 + 10 * 16] | |
21172 pmaddubsw m3, m0, m6 | |
21173 pmulhrsw m3, m7 | |
21174 pmaddubsw m5, m2, m6 | |
21175 pmulhrsw m5, m7 | |
21176 packuswb m3, m5 | |
21177 movu [r0 + 1874 * 16], m3 | |
21178 pmaddubsw m3, m1, m6 | |
21179 pmulhrsw m3, m7 | |
21180 pmaddubsw m5, m4, m6 | |
21181 pmulhrsw m5, m7 | |
21182 packuswb m3, m5 | |
21183 movu [r0 + 1875 * 16], m3 | |
21184 | |
21185 ; mode 31 [row 10] | |
21186 movu m6, [r5 + 27 * 16] | |
21187 pmaddubsw m3, m0, m6 | |
21188 pmulhrsw m3, m7 | |
21189 pmaddubsw m5, m2, m6 | |
21190 pmulhrsw m5, m7 | |
21191 packuswb m3, m5 | |
21192 movu [r0 + 1876 * 16], m3 | |
21193 pmaddubsw m3, m1, m6 | |
21194 pmulhrsw m3, m7 | |
21195 pmaddubsw m5, m4, m6 | |
21196 pmulhrsw m5, m7 | |
21197 packuswb m3, m5 | |
21198 movu [r0 + 1877 * 16], m3 | |
21199 | |
21200 ; mode 32 [row 7] | |
21201 movu m6, [r5 + 8 * 16] | |
21202 pmaddubsw m3, m0, m6 | |
21203 pmulhrsw m3, m7 | |
21204 pmaddubsw m5, m2, m6 | |
21205 pmulhrsw m5, m7 | |
21206 packuswb m3, m5 | |
21207 movu [r0 + 1934 * 16], m3 | |
21208 pmaddubsw m3, m1, m6 | |
21209 pmulhrsw m3, m7 | |
21210 pmaddubsw m5, m4, m6 | |
21211 pmulhrsw m5, m7 | |
21212 packuswb m3, m5 | |
21213 movu [r0 + 1935 * 16], m3 | |
21214 | |
21215 ; mode 29 [row 21] | |
21216 movu m6, [r5 + 6 * 16] | |
21217 movu m0, [r3 + 7] | |
21218 movd m1, [r3 + 8] | |
21219 palignr m1, m0, 1 | |
21220 punpcklbw m0, m1 | |
21221 pmaddubsw m3, m0, m6 | |
21222 pmulhrsw m3, m7 | |
21223 movu m2, [r3 + 15] | |
21224 movd m4, [r3 + 16] | |
21225 palignr m4, m2, 1 | |
21226 punpcklbw m2, m4 | |
21227 pmaddubsw m5, m2, m6 | |
21228 pmulhrsw m5, m7 | |
21229 packuswb m3, m5 | |
21230 movu [r0 + 1770 * 16], m3 | |
21231 | |
21232 movu m1, [r3 + 23] | |
21233 movd m3, [r3 + 24] | |
21234 palignr m3, m1, 1 | |
21235 punpcklbw m1, m3 | |
21236 pmaddubsw m3, m1, m6 | |
21237 pmulhrsw m3, m7 | |
21238 movu m4, [r3 + 31] | |
21239 movd m5, [r3 + 32] | |
21240 palignr m5, m4, 1 | |
21241 punpcklbw m4, m5 | |
21242 pmaddubsw m5, m4, m6 | |
21243 pmulhrsw m5, m7 | |
21244 packuswb m3, m5 | |
21245 movu [r0 + 1771 * 16], m3 | |
21246 | |
21247 ; mode 29 [row 22] | |
21248 movu m6, [r5 + 15 * 16] | |
21249 pmaddubsw m3, m0, m6 | |
21250 pmulhrsw m3, m7 | |
21251 pmaddubsw m5, m2, m6 | |
21252 pmulhrsw m5, m7 | |
21253 packuswb m3, m5 | |
21254 movu [r0 + 1772 * 16], m3 | |
21255 pmaddubsw m3, m1, m6 | |
21256 pmulhrsw m3, m7 | |
21257 pmaddubsw m5, m4, m6 | |
21258 pmulhrsw m5, m7 | |
21259 packuswb m3, m5 | |
21260 movu [r0 + 1773 * 16], m3 | |
21261 | |
21262 ; mode 29 [row 23] | |
21263 movu m6, [r5 + 24 * 16] | |
21264 pmaddubsw m3, m0, m6 | |
21265 pmulhrsw m3, m7 | |
21266 pmaddubsw m5, m2, m6 | |
21267 pmulhrsw m5, m7 | |
21268 packuswb m3, m5 | |
21269 movu [r0 + 1774 * 16], m3 | |
21270 pmaddubsw m3, m1, m6 | |
21271 pmulhrsw m3, m7 | |
21272 pmaddubsw m5, m4, m6 | |
21273 pmulhrsw m5, m7 | |
21274 packuswb m3, m5 | |
21275 movu [r0 + 1775 * 16], m3 | |
21276 | |
21277 ; mode 30 [row 14] | |
21278 movu m6, [r5 + 3 * 16] | |
21279 pmaddubsw m3, m0, m6 | |
21280 pmulhrsw m3, m7 | |
21281 pmaddubsw m5, m2, m6 | |
21282 pmulhrsw m5, m7 | |
21283 packuswb m3, m5 | |
21284 movu [r0 + 1820 * 16], m3 | |
21285 pmaddubsw m3, m1, m6 | |
21286 pmulhrsw m3, m7 | |
21287 pmaddubsw m5, m4, m6 | |
21288 pmulhrsw m5, m7 | |
21289 packuswb m3, m5 | |
21290 movu [r0 + 1821 * 16], m3 | |
21291 | |
21292 ; mode 30 [row 15] | |
21293 movu m6, [r5 + 16 * 16] | |
21294 pmaddubsw m3, m0, m6 | |
21295 pmulhrsw m3, m7 | |
21296 pmaddubsw m5, m2, m6 | |
21297 pmulhrsw m5, m7 | |
21298 packuswb m3, m5 | |
21299 movu [r0 + 1822 * 16], m3 | |
21300 | |
21301 ; mode 33 [row 7 - first half] | |
21302 movu [r0 + 1998 * 16], m3 | |
21303 | |
21304 pmaddubsw m3, m1, m6 | |
21305 pmulhrsw m3, m7 | |
21306 pmaddubsw m5, m4, m6 | |
21307 pmulhrsw m5, m7 | |
21308 packuswb m3, m5 | |
21309 movu [r0 + 1823 * 16], m3 | |
21310 | |
21311 ; mode 33 [row 7 - second half] | |
21312 movu [r0 + 1999 * 16], m3 | |
21313 | |
21314 ; mode 30 [row 16] | |
21315 movu m6, [r5 + 29 * 16] | |
21316 pmaddubsw m3, m0, m6 | |
21317 pmulhrsw m3, m7 | |
21318 pmaddubsw m5, m2, m6 | |
21319 pmulhrsw m5, m7 | |
21320 packuswb m3, m5 | |
21321 movu [r0 + 1824 * 16], m3 | |
21322 | |
21323 ; mode 31 [row 12 - first half] | |
21324 movu [r0 + 1880 * 16], m3 | |
21325 | |
21326 pmaddubsw m3, m1, m6 | |
21327 pmulhrsw m3, m7 | |
21328 pmaddubsw m5, m4, m6 | |
21329 pmulhrsw m5, m7 | |
21330 packuswb m3, m5 | |
21331 movu [r0 + 1825 * 16], m3 | |
21332 | |
21333 ; mode 31 [row 12 - second half] | |
21334 movu [r0 + 1881 * 16], m3 | |
21335 | |
21336 ; mode 31 [row 11] | |
21337 movu m6, [r5 + 12 * 16] | |
21338 pmaddubsw m3, m0, m6 | |
21339 pmulhrsw m3, m7 | |
21340 pmaddubsw m5, m2, m6 | |
21341 pmulhrsw m5, m7 | |
21342 packuswb m3, m5 | |
21343 movu [r0 + 1878 * 16], m3 | |
21344 pmaddubsw m3, m1, m6 | |
21345 pmulhrsw m3, m7 | |
21346 pmaddubsw m5, m4, m6 | |
21347 pmulhrsw m5, m7 | |
21348 packuswb m3, m5 | |
21349 movu [r0 + 1879 * 16], m3 | |
21350 | |
21351 ; mode 32 [row 9] | |
21352 movu m6, [r5 + 18 * 16] | |
21353 pmaddubsw m3, m0, m6 | |
21354 pmulhrsw m3, m7 | |
21355 pmaddubsw m5, m2, m6 | |
21356 pmulhrsw m5, m7 | |
21357 packuswb m3, m5 | |
21358 movu [r0 + 1938 * 16], m3 | |
21359 pmaddubsw m3, m1, m6 | |
21360 pmulhrsw m3, m7 | |
21361 pmaddubsw m5, m4, m6 | |
21362 pmulhrsw m5, m7 | |
21363 packuswb m3, m5 | |
21364 movu [r0 + 1939 * 16], m3 | |
21365 | |
21366 ; mode 29 [row 24] | |
21367 movu m6, [r5 + 1 * 16] | |
21368 movu m0, [r3 + 8] | |
21369 movd m1, [r3 + 9] | |
21370 palignr m1, m0, 1 | |
21371 punpcklbw m0, m1 | |
21372 pmaddubsw m3, m0, m6 | |
21373 pmulhrsw m3, m7 | |
21374 movu m2, [r3 + 16] | |
21375 movd m4, [r3 + 17] | |
21376 palignr m4, m2, 1 | |
21377 punpcklbw m2, m4 | |
21378 pmaddubsw m5, m2, m6 | |
21379 pmulhrsw m5, m7 | |
21380 packuswb m3, m5 | |
21381 movu [r0 + 1776 * 16], m3 | |
21382 | |
21383 movu m1, [r3 + 24] | |
21384 movd m3, [r3 + 25] | |
21385 palignr m3, m1, 1 | |
21386 punpcklbw m1, m3 | |
21387 pmaddubsw m3, m1, m6 | |
21388 pmulhrsw m3, m7 | |
21389 movu m4, [r3 + 32] | |
21390 movd m5, [r3 + 33] | |
21391 palignr m5, m4, 1 | |
21392 punpcklbw m4, m5 | |
21393 pmaddubsw m5, m4, m6 | |
21394 pmulhrsw m5, m7 | |
21395 packuswb m3, m5 | |
21396 movu [r0 + 1777 * 16], m3 | |
21397 | |
21398 ; mode 29 [row 25] | |
21399 movu m6, [r5 + 10 * 16] | |
21400 pmaddubsw m3, m0, m6 | |
21401 pmulhrsw m3, m7 | |
21402 pmaddubsw m5, m2, m6 | |
21403 pmulhrsw m5, m7 | |
21404 packuswb m3, m5 | |
21405 movu [r0 + 1778 * 16], m3 | |
21406 | |
21407 ; mode 30 [row 17 - first half] | |
21408 movu [r0 + 1826 * 16], m3 | |
21409 | |
21410 ; mode 33 [row 8 - first half] | |
21411 movu [r0 + 2000 * 16], m3 | |
21412 | |
21413 pmaddubsw m3, m1, m6 | |
21414 pmulhrsw m3, m7 | |
21415 pmaddubsw m5, m4, m6 | |
21416 pmulhrsw m5, m7 | |
21417 packuswb m3, m5 | |
21418 movu [r0 + 1779 * 16], m3 | |
21419 | |
21420 ; mode 30 [row 17 - second half] | |
21421 movu [r0 + 1827 * 16], m3 | |
21422 | |
21423 ; mode 33 [row 8 - second half] | |
21424 movu [r0 + 2001 * 16], m3 | |
21425 | |
21426 ; mode 29 [row 26] | |
21427 movu m6, [r5 + 19 * 16] | |
21428 pmaddubsw m3, m0, m6 | |
21429 pmulhrsw m3, m7 | |
21430 pmaddubsw m5, m2, m6 | |
21431 pmulhrsw m5, m7 | |
21432 packuswb m3, m5 | |
21433 movu [r0 + 1780 * 16], m3 | |
21434 pmaddubsw m3, m1, m6 | |
21435 pmulhrsw m3, m7 | |
21436 pmaddubsw m5, m4, m6 | |
21437 pmulhrsw m5, m7 | |
21438 packuswb m3, m5 | |
21439 movu [r0 + 1781 * 16], m3 | |
21440 | |
21441 ; mode 29 [row 27] | |
21442 movu m6, [r5 + 28 * 16] | |
21443 pmaddubsw m3, m0, m6 | |
21444 pmulhrsw m3, m7 | |
21445 pmaddubsw m5, m2, m6 | |
21446 pmulhrsw m5, m7 | |
21447 packuswb m3, m5 | |
21448 movu [r0 + 1782 * 16], m3 | |
21449 | |
21450 ; mode 32 [row 11 - first half] | |
21451 movu [r0 + 1942 * 16], m3 | |
21452 | |
21453 pmaddubsw m3, m1, m6 | |
21454 pmulhrsw m3, m7 | |
21455 pmaddubsw m5, m4, m6 | |
21456 pmulhrsw m5, m7 | |
21457 packuswb m3, m5 | |
21458 movu [r0 + 1783 * 16], m3 | |
21459 | |
21460 ; mode 32 [row 11 - second half] | |
21461 movu [r0 + 1943 * 16], m3 | |
21462 | |
21463 ; mode 30 [row 18] | |
21464 movu m6, [r5 + 23 * 16] | |
21465 pmaddubsw m3, m0, m6 | |
21466 pmulhrsw m3, m7 | |
21467 pmaddubsw m5, m2, m6 | |
21468 pmulhrsw m5, m7 | |
21469 packuswb m3, m5 | |
21470 movu [r0 + 1828 * 16], m3 | |
21471 pmaddubsw m3, m1, m6 | |
21472 pmulhrsw m3, m7 | |
21473 pmaddubsw m5, m4, m6 | |
21474 pmulhrsw m5, m7 | |
21475 packuswb m3, m5 | |
21476 movu [r0 + 1829 * 16], m3 | |
21477 | |
21478 ; mode 31 [row 13] | |
21479 movu m6, [r5 + 14 * 16] | |
21480 pmaddubsw m3, m0, m6 | |
21481 pmulhrsw m3, m7 | |
21482 pmaddubsw m5, m2, m6 | |
21483 pmulhrsw m5, m7 | |
21484 packuswb m3, m5 | |
21485 movu [r0 + 1882 * 16], m3 | |
21486 pmaddubsw m3, m1, m6 | |
21487 pmulhrsw m3, m7 | |
21488 pmaddubsw m5, m4, m6 | |
21489 pmulhrsw m5, m7 | |
21490 packuswb m3, m5 | |
21491 movu [r0 + 1883 * 16], m3 | |
21492 | |
21493 ; mode 31 [row 14] | |
21494 movu m6, [r5 + 31 * 16] | |
21495 pmaddubsw m3, m0, m6 | |
21496 pmulhrsw m3, m7 | |
21497 pmaddubsw m5, m2, m6 | |
21498 pmulhrsw m5, m7 | |
21499 packuswb m3, m5 | |
21500 movu [r0 + 1884 * 16], m3 | |
21501 pmaddubsw m3, m1, m6 | |
21502 pmulhrsw m3, m7 | |
21503 pmaddubsw m5, m4, m6 | |
21504 pmulhrsw m5, m7 | |
21505 packuswb m3, m5 | |
21506 movu [r0 + 1885 * 16], m3 | |
21507 | |
21508 ; mode 32 [row 10] | |
21509 movu m6, [r5 + 7 * 16] | |
21510 pmaddubsw m3, m0, m6 | |
21511 pmulhrsw m3, m7 | |
21512 pmaddubsw m5, m2, m6 | |
21513 pmulhrsw m5, m7 | |
21514 packuswb m3, m5 | |
21515 movu [r0 + 1940 * 16], m3 | |
21516 pmaddubsw m3, m1, m6 | |
21517 pmulhrsw m3, m7 | |
21518 pmaddubsw m5, m4, m6 | |
21519 pmulhrsw m5, m7 | |
21520 packuswb m3, m5 | |
21521 movu [r0 + 1941 * 16], m3 | |
21522 | |
21523 ; mode 29 [row 28] | |
21524 movu m6, [r5 + 5 * 16] | |
21525 movu m0, [r3 + 9] | |
21526 movd m1, [r3 + 10] | |
21527 palignr m1, m0, 1 | |
21528 punpcklbw m0, m1 | |
21529 pmaddubsw m3, m0, m6 | |
21530 pmulhrsw m3, m7 | |
21531 movu m2, [r3 + 17] | |
21532 movd m4, [r3 + 18] | |
21533 palignr m4, m2, 1 | |
21534 punpcklbw m2, m4 | |
21535 pmaddubsw m5, m2, m6 | |
21536 pmulhrsw m5, m7 | |
21537 packuswb m3, m5 | |
21538 movu [r0 + 1784 * 16], m3 | |
21539 | |
21540 movu m1, [r3 + 25] | |
21541 movd m3, [r3 + 26] | |
21542 palignr m3, m1, 1 | |
21543 punpcklbw m1, m3 | |
21544 pmaddubsw m3, m1, m6 | |
21545 pmulhrsw m3, m7 | |
21546 movu m4, [r3 + 33] | |
21547 movd m5, [r3 + 34] | |
21548 palignr m5, m4, 1 | |
21549 punpcklbw m4, m5 | |
21550 pmaddubsw m5, m4, m6 | |
21551 pmulhrsw m5, m7 | |
21552 packuswb m3, m5 | |
21553 movu [r0 + 1785 * 16], m3 | |
21554 | |
21555 ; mode 29 [row 29] | |
21556 movu m6, [r5 + 14 * 16] | |
21557 pmaddubsw m3, m0, m6 | |
21558 pmulhrsw m3, m7 | |
21559 pmaddubsw m5, m2, m6 | |
21560 pmulhrsw m5, m7 | |
21561 packuswb m3, m5 | |
21562 movu [r0 + 1786 * 16], m3 | |
21563 pmaddubsw m3, m1, m6 | |
21564 pmulhrsw m3, m7 | |
21565 pmaddubsw m5, m4, m6 | |
21566 pmulhrsw m5, m7 | |
21567 packuswb m3, m5 | |
21568 movu [r0 + 1787 * 16], m3 | |
21569 | |
21570 ; mode 29 [row 30] | |
21571 movu m6, [r5 + 23 * 16] | |
21572 pmaddubsw m3, m0, m6 | |
21573 pmulhrsw m3, m7 | |
21574 pmaddubsw m5, m2, m6 | |
21575 pmulhrsw m5, m7 | |
21576 packuswb m3, m5 | |
21577 movu [r0 + 1788 * 16], m3 | |
21578 pmaddubsw m3, m1, m6 | |
21579 pmulhrsw m3, m7 | |
21580 pmaddubsw m5, m4, m6 | |
21581 pmulhrsw m5, m7 | |
21582 packuswb m3, m5 | |
21583 movu [r0 + 1789 * 16], m3 | |
21584 | |
21585 ; mode 30 [row 19] | |
21586 movu m6, [r5 + 4 * 16] | |
21587 pmaddubsw m3, m0, m6 | |
21588 pmulhrsw m3, m7 | |
21589 pmaddubsw m5, m2, m6 | |
21590 pmulhrsw m5, m7 | |
21591 packuswb m3, m5 | |
21592 movu [r0 + 1830 * 16], m3 | |
21593 | |
21594 ; mode 33 [row 9 - first half] | |
21595 movu [r0 + 2002 * 16], m3 | |
21596 | |
21597 pmaddubsw m3, m1, m6 | |
21598 pmulhrsw m3, m7 | |
21599 pmaddubsw m5, m4, m6 | |
21600 pmulhrsw m5, m7 | |
21601 packuswb m3, m5 | |
21602 movu [r0 + 1831 * 16], m3 | |
21603 | |
21604 ; mode 33 [row 9 - second half] | |
21605 movu [r0 + 2003 * 16], m3 | |
21606 | |
21607 ; mode 30 [row 20] | |
21608 movu m6, [r5 + 17 * 16] | |
21609 pmaddubsw m3, m0, m6 | |
21610 pmulhrsw m3, m7 | |
21611 pmaddubsw m5, m2, m6 | |
21612 pmulhrsw m5, m7 | |
21613 packuswb m3, m5 | |
21614 movu [r0 + 1832 * 16], m3 | |
21615 | |
21616 ; mode 32 [row 12 - first half] | |
21617 movu [r0 + 1944 * 16], m3 | |
21618 | |
21619 pmaddubsw m3, m1, m6 | |
21620 pmulhrsw m3, m7 | |
21621 pmaddubsw m5, m4, m6 | |
21622 pmulhrsw m5, m7 | |
21623 packuswb m3, m5 | |
21624 movu [r0 + 1833 * 16], m3 | |
21625 | |
21626 ; mode 32 [row 12 - second half] | |
21627 movu [r0 + 1945 * 16], m3 | |
21628 | |
21629 ; mode 30 [row 21] | |
21630 movu m6, [r5 + 30 * 16] | |
21631 pmaddubsw m3, m0, m6 | |
21632 pmulhrsw m3, m7 | |
21633 pmaddubsw m5, m2, m6 | |
21634 pmulhrsw m5, m7 | |
21635 packuswb m3, m5 | |
21636 movu [r0 + 1834 * 16], m3 | |
21637 | |
21638 ; mode 33 [row 10 - first half] | |
21639 movu [r0 + 2004 * 16], m3 | |
21640 | |
21641 pmaddubsw m3, m1, m6 | |
21642 pmulhrsw m3, m7 | |
21643 pmaddubsw m5, m4, m6 | |
21644 pmulhrsw m5, m7 | |
21645 packuswb m3, m5 | |
21646 movu [r0 + 1835 * 16], m3 | |
21647 | |
21648 ; mode 33 [row 10 - second half] | |
21649 movu [r0 + 2005 * 16], m3 | |
21650 | |
21651 ; mode 31 [row 15] | |
21652 movu m6, [r5 + 16 * 16] | |
21653 pmaddubsw m3, m0, m6 | |
21654 pmulhrsw m3, m7 | |
21655 pmaddubsw m5, m2, m6 | |
21656 pmulhrsw m5, m7 | |
21657 packuswb m3, m5 | |
21658 movu [r0 + 1886 * 16], m3 | |
21659 pmaddubsw m3, m1, m6 | |
21660 pmulhrsw m3, m7 | |
21661 pmaddubsw m5, m4, m6 | |
21662 pmulhrsw m5, m7 | |
21663 packuswb m3, m5 | |
21664 movu [r0 + 1887 * 16], m3 | |
21665 | |
21666 ; mode 29 [row 31] | |
21667 movu m0, [r3 + 10] | |
21668 movd m1, [r3 + 11] | |
21669 palignr m1, m0, 1 | |
21670 punpcklbw m0, m1 | |
21671 movu m2, [r3 + 18] | |
21672 movd m3, [r3 + 19] | |
21673 palignr m3, m2, 1 | |
21674 punpcklbw m2, m3 | |
21675 movu m1, [r3 + 26] | |
21676 movd m3, [r3 + 27] | |
21677 palignr m3, m1, 1 | |
21678 punpcklbw m1, m3 | |
21679 movu m4, [r3 + 34] | |
21680 movd m5, [r3 + 35] | |
21681 palignr m5, m4, 1 | |
21682 punpcklbw m4, m5 | |
21683 | |
21684 pshufb m5, m0, [tab_S2] | |
21685 movh [r0 + 1790 * 16], m5 | |
21686 pshufb m5, m2, [tab_S2] | |
21687 movh [r0 + 1790 * 16 + 8], m5 | |
21688 pshufb m5, m1, [tab_S2] | |
21689 movh [r0 + 1791 * 16], m5 | |
21690 pshufb m5, m4, [tab_S2] | |
21691 movh [r0 + 1791 * 16 + 8], m5 | |
21692 | |
21693 ; mode 30 [row 22] | |
21694 movu m6, [r5 + 11 * 16] | |
21695 pmaddubsw m3, m0, m6 | |
21696 pmulhrsw m3, m7 | |
21697 pmaddubsw m5, m2, m6 | |
21698 pmulhrsw m5, m7 | |
21699 packuswb m3, m5 | |
21700 movu [r0 + 1836 * 16], m3 | |
21701 pmaddubsw m3, m1, m6 | |
21702 pmulhrsw m3, m7 | |
21703 pmaddubsw m5, m4, m6 | |
21704 pmulhrsw m5, m7 | |
21705 packuswb m3, m5 | |
21706 movu [r0 + 1837 * 16], m3 | |
21707 | |
21708 ; mode 30 [row 23] | |
21709 movu m6, [r5 + 24 * 16] | |
21710 pmaddubsw m3, m0, m6 | |
21711 pmulhrsw m3, m7 | |
21712 pmaddubsw m5, m2, m6 | |
21713 pmulhrsw m5, m7 | |
21714 packuswb m3, m5 | |
21715 movu [r0 + 1838 * 16], m3 | |
21716 | |
21717 ; mode 33 [row 11 - first half] | |
21718 movu [r0 + 2006 * 16], m3 | |
21719 | |
21720 pmaddubsw m3, m1, m6 | |
21721 pmulhrsw m3, m7 | |
21722 pmaddubsw m5, m4, m6 | |
21723 pmulhrsw m5, m7 | |
21724 packuswb m3, m5 | |
21725 movu [r0 + 1839 * 16], m3 | |
21726 | |
21727 ; mode 33 [row 11 - second half] | |
21728 movu [r0 + 2007 * 16], m3 | |
21729 | |
21730 ; mode 31 [row 16] | |
21731 movu m6, [r5 + 1 * 16] | |
21732 pmaddubsw m3, m0, m6 | |
21733 pmulhrsw m3, m7 | |
21734 pmaddubsw m5, m2, m6 | |
21735 pmulhrsw m5, m7 | |
21736 packuswb m3, m5 | |
21737 movu [r0 + 1888 * 16], m3 | |
21738 pmaddubsw m3, m1, m6 | |
21739 pmulhrsw m3, m7 | |
21740 pmaddubsw m5, m4, m6 | |
21741 pmulhrsw m5, m7 | |
21742 packuswb m3, m5 | |
21743 movu [r0 + 1889 * 16], m3 | |
21744 | |
21745 ; mode 31 [row 17] | |
21746 movu m6, [r5 + 18 * 16] | |
21747 pmaddubsw m3, m0, m6 | |
21748 pmulhrsw m3, m7 | |
21749 pmaddubsw m5, m2, m6 | |
21750 pmulhrsw m5, m7 | |
21751 packuswb m3, m5 | |
21752 movu [r0 + 1890 * 16], m3 | |
21753 pmaddubsw m3, m1, m6 | |
21754 pmulhrsw m3, m7 | |
21755 pmaddubsw m5, m4, m6 | |
21756 pmulhrsw m5, m7 | |
21757 packuswb m3, m5 | |
21758 movu [r0 + 1891 * 16], m3 | |
21759 | |
21760 ; mode 32 [row 13] | |
21761 movu m6, [r5 + 6 * 16] | |
21762 pmaddubsw m3, m0, m6 | |
21763 pmulhrsw m3, m7 | |
21764 pmaddubsw m5, m2, m6 | |
21765 pmulhrsw m5, m7 | |
21766 packuswb m3, m5 | |
21767 movu [r0 + 1946 * 16], m3 | |
21768 pmaddubsw m3, m1, m6 | |
21769 pmulhrsw m3, m7 | |
21770 pmaddubsw m5, m4, m6 | |
21771 pmulhrsw m5, m7 | |
21772 packuswb m3, m5 | |
21773 movu [r0 + 1947 * 16], m3 | |
21774 | |
21775 ; mode 32 [row 14] | |
21776 movu m6, [r5 + 27 * 16] | |
21777 pmaddubsw m3, m0, m6 | |
21778 pmulhrsw m3, m7 | |
21779 pmaddubsw m5, m2, m6 | |
21780 pmulhrsw m5, m7 | |
21781 packuswb m3, m5 | |
21782 movu [r0 + 1948 * 16], m3 | |
21783 pmaddubsw m3, m1, m6 | |
21784 pmulhrsw m3, m7 | |
21785 pmaddubsw m5, m4, m6 | |
21786 pmulhrsw m5, m7 | |
21787 packuswb m3, m5 | |
21788 movu [r0 + 1949 * 16], m3 | |
21789 | |
21790 ; mode 30 [row 24] | |
21791 movu m6, [r5 + 5 * 16] | |
21792 movu m0, [r3 + 11] | |
21793 movd m1, [r3 + 12] | |
21794 palignr m1, m0, 1 | |
21795 punpcklbw m0, m1 | |
21796 pmaddubsw m3, m0, m6 | |
21797 pmulhrsw m3, m7 | |
21798 movu m2, [r3 + 19] | |
21799 movd m4, [r3 + 20] | |
21800 palignr m4, m2, 1 | |
21801 punpcklbw m2, m4 | |
21802 pmaddubsw m5, m2, m6 | |
21803 pmulhrsw m5, m7 | |
21804 packuswb m3, m5 | |
21805 movu [r0 + 1840 * 16], m3 | |
21806 | |
21807 movu m1, [r3 + 27] | |
21808 movd m3, [r3 + 28] | |
21809 palignr m3, m1, 1 | |
21810 punpcklbw m1, m3 | |
21811 pmaddubsw m3, m1, m6 | |
21812 pmulhrsw m3, m7 | |
21813 movu m4, [r3 + 35] | |
21814 movd m5, [r3 + 36] | |
21815 palignr m5, m4, 1 | |
21816 punpcklbw m4, m5 | |
21817 pmaddubsw m5, m4, m6 | |
21818 pmulhrsw m5, m7 | |
21819 packuswb m3, m5 | |
21820 movu [r0 + 1841 * 16], m3 | |
21821 | |
21822 ; mode 30 [row 25] | |
21823 movu m6, [r5 + 18 * 16] | |
21824 pmaddubsw m3, m0, m6 | |
21825 pmulhrsw m3, m7 | |
21826 pmaddubsw m5, m2, m6 | |
21827 pmulhrsw m5, m7 | |
21828 packuswb m3, m5 | |
21829 movu [r0 + 1842 * 16], m3 | |
21830 | |
21831 ; mode 33 [row 12 - first half] | |
21832 movu [r0 + 2008 * 16], m3 | |
21833 | |
21834 pmaddubsw m3, m1, m6 | |
21835 pmulhrsw m3, m7 | |
21836 pmaddubsw m5, m4, m6 | |
21837 pmulhrsw m5, m7 | |
21838 packuswb m3, m5 | |
21839 movu [r0 + 1843 * 16], m3 | |
21840 | |
21841 ; mode 33 [row 12 - second half] | |
21842 movu [r0 + 2009 * 16], m3 | |
21843 | |
21844 ; mode 30 [row 26] | |
21845 movu m6, [r5 + 31 * 16] | |
21846 pmaddubsw m3, m0, m6 | |
21847 pmulhrsw m3, m7 | |
21848 pmaddubsw m5, m2, m6 | |
21849 pmulhrsw m5, m7 | |
21850 packuswb m3, m5 | |
21851 movu [r0 + 1844 * 16], m3 | |
21852 pmaddubsw m3, m1, m6 | |
21853 pmulhrsw m3, m7 | |
21854 pmaddubsw m5, m4, m6 | |
21855 pmulhrsw m5, m7 | |
21856 packuswb m3, m5 | |
21857 movu [r0 + 1845 * 16], m3 | |
21858 | |
21859 ; mode 31 [row 18] | |
21860 movu m6, [r5 + 3 * 16] | |
21861 pmaddubsw m3, m0, m6 | |
21862 pmulhrsw m3, m7 | |
21863 pmaddubsw m5, m2, m6 | |
21864 pmulhrsw m5, m7 | |
21865 packuswb m3, m5 | |
21866 movu [r0 + 1892 * 16], m3 | |
21867 pmaddubsw m3, m1, m6 | |
21868 pmulhrsw m3, m7 | |
21869 pmaddubsw m5, m4, m6 | |
21870 pmulhrsw m5, m7 | |
21871 packuswb m3, m5 | |
21872 movu [r0 + 1893 * 16], m3 | |
21873 | |
21874 ; mode 31 [row 19] | |
21875 movu m6, [r5 + 20 * 16] | |
21876 pmaddubsw m3, m0, m6 | |
21877 pmulhrsw m3, m7 | |
21878 pmaddubsw m5, m2, m6 | |
21879 pmulhrsw m5, m7 | |
21880 packuswb m3, m5 | |
21881 movu [r0 + 1894 * 16], m3 | |
21882 pmaddubsw m3, m1, m6 | |
21883 pmulhrsw m3, m7 | |
21884 pmaddubsw m5, m4, m6 | |
21885 pmulhrsw m5, m7 | |
21886 packuswb m3, m5 | |
21887 movu [r0 + 1895 * 16], m3 | |
21888 | |
21889 ; mode 32 [row 15] | |
21890 movu m6, [r5 + 16 * 16] | |
21891 pmaddubsw m3, m0, m6 | |
21892 pmulhrsw m3, m7 | |
21893 pmaddubsw m5, m2, m6 | |
21894 pmulhrsw m5, m7 | |
21895 packuswb m3, m5 | |
21896 movu [r0 + 1950 * 16], m3 | |
21897 pmaddubsw m3, m1, m6 | |
21898 pmulhrsw m3, m7 | |
21899 pmaddubsw m5, m4, m6 | |
21900 pmulhrsw m5, m7 | |
21901 packuswb m3, m5 | |
21902 movu [r0 + 1951 * 16], m3 | |
21903 | |
21904 ; mode 30 [row 27] | |
21905 movu m6, [r5 + 12 * 16] | |
21906 movu m0, [r3 + 12] | |
21907 movd m1, [r3 + 13] | |
21908 palignr m1, m0, 1 | |
21909 punpcklbw m0, m1 | |
21910 pmaddubsw m3, m0, m6 | |
21911 pmulhrsw m3, m7 | |
21912 movu m2, [r3 + 20] | |
21913 movd m4, [r3 + 21] | |
21914 palignr m4, m2, 1 | |
21915 punpcklbw m2, m4 | |
21916 pmaddubsw m5, m2, m6 | |
21917 pmulhrsw m5, m7 | |
21918 packuswb m3, m5 | |
21919 movu [r0 + 1846 * 16], m3 | |
21920 | |
21921 ; mode 33 [row 13 - first half] | |
21922 movu [r0 + 2010 * 16], m3 | |
21923 | |
21924 movu m1, [r3 + 28] | |
21925 movd m3, [r3 + 29] | |
21926 palignr m3, m1, 1 | |
21927 punpcklbw m1, m3 | |
21928 pmaddubsw m3, m1, m6 | |
21929 pmulhrsw m3, m7 | |
21930 movu m4, [r3 + 36] | |
21931 movd m5, [r3 + 37] | |
21932 palignr m5, m4, 1 | |
21933 punpcklbw m4, m5 | |
21934 pmaddubsw m5, m4, m6 | |
21935 pmulhrsw m5, m7 | |
21936 packuswb m3, m5 | |
21937 movu [r0 + 1847 * 16], m3 | |
21938 | |
21939 ; mode 33 [row 13 - second half] | |
21940 movu [r0 + 2011 * 16], m3 | |
21941 | |
21942 ; mode 30 [row 28] | |
21943 movu m6, [r5 + 25 * 16] | |
21944 pmaddubsw m3, m0, m6 | |
21945 pmulhrsw m3, m7 | |
21946 pmaddubsw m5, m2, m6 | |
21947 pmulhrsw m5, m7 | |
21948 packuswb m3, m5 | |
21949 movu [r0 + 1848 * 16], m3 | |
21950 pmaddubsw m3, m1, m6 | |
21951 pmulhrsw m3, m7 | |
21952 pmaddubsw m5, m4, m6 | |
21953 pmulhrsw m5, m7 | |
21954 packuswb m3, m5 | |
21955 movu [r0 + 1849 * 16], m3 | |
21956 | |
21957 ; mode 31 [row 20] | |
21958 movu m6, [r5 + 5 * 16] | |
21959 pmaddubsw m3, m0, m6 | |
21960 pmulhrsw m3, m7 | |
21961 pmaddubsw m5, m2, m6 | |
21962 pmulhrsw m5, m7 | |
21963 packuswb m3, m5 | |
21964 movu [r0 + 1896 * 16], m3 | |
21965 | |
21966 ; mode 32 [row 16 - first half] | |
21967 movu [r0 + 1952 * 16], m3 | |
21968 | |
21969 pmaddubsw m3, m1, m6 | |
21970 pmulhrsw m3, m7 | |
21971 pmaddubsw m5, m4, m6 | |
21972 pmulhrsw m5, m7 | |
21973 packuswb m3, m5 | |
21974 movu [r0 + 1897 * 16], m3 | |
21975 | |
21976 ; mode 32 [row 16 - second half] | |
21977 movu [r0 + 1953 * 16], m3 | |
21978 | |
21979 ; mode 31 [row 21] | |
21980 movu m6, [r5 + 22 * 16] | |
21981 pmaddubsw m3, m0, m6 | |
21982 pmulhrsw m3, m7 | |
21983 pmaddubsw m5, m2, m6 | |
21984 pmulhrsw m5, m7 | |
21985 packuswb m3, m5 | |
21986 movu [r0 + 1898 * 16], m3 | |
21987 pmaddubsw m3, m1, m6 | |
21988 pmulhrsw m3, m7 | |
21989 pmaddubsw m5, m4, m6 | |
21990 pmulhrsw m5, m7 | |
21991 packuswb m3, m5 | |
21992 movu [r0 + 1899 * 16], m3 | |
21993 | |
21994 ; mode 32 [row 17] | |
21995 movu m6, [r5 + 26 * 16] | |
21996 pmaddubsw m3, m0, m6 | |
21997 pmulhrsw m3, m7 | |
21998 pmaddubsw m5, m2, m6 | |
21999 pmulhrsw m5, m7 | |
22000 packuswb m3, m5 | |
22001 movu [r0 + 1954 * 16], m3 | |
22002 pmaddubsw m3, m1, m6 | |
22003 pmulhrsw m3, m7 | |
22004 pmaddubsw m5, m4, m6 | |
22005 pmulhrsw m5, m7 | |
22006 packuswb m3, m5 | |
22007 movu [r0 + 1955 * 16], m3 | |
22008 | |
22009 ; mode 30 [row 29] | |
22010 movu m6, [r5 + 6 * 16] | |
22011 movu m0, [r3 + 13] | |
22012 movd m1, [r3 + 14] | |
22013 palignr m1, m0, 1 | |
22014 punpcklbw m0, m1 | |
22015 pmaddubsw m3, m0, m6 | |
22016 pmulhrsw m3, m7 | |
22017 movu m2, [r3 + 21] | |
22018 movd m4, [r3 + 22] | |
22019 palignr m4, m2, 1 | |
22020 punpcklbw m2, m4 | |
22021 pmaddubsw m5, m2, m6 | |
22022 pmulhrsw m5, m7 | |
22023 packuswb m3, m5 | |
22024 movu [r0 + 1850 * 16], m3 | |
22025 | |
22026 ; mode 33 [row 14 - first half] | |
22027 movu [r0 + 2012 * 16], m3 | |
22028 | |
22029 movu m1, [r3 + 29] | |
22030 movd m3, [r3 + 30] | |
22031 palignr m3, m1, 1 | |
22032 punpcklbw m1, m3 | |
22033 pmaddubsw m3, m1, m6 | |
22034 pmulhrsw m3, m7 | |
22035 movu m4, [r3 + 37] | |
22036 movd m5, [r3 + 38] | |
22037 palignr m5, m4, 1 | |
22038 punpcklbw m4, m5 | |
22039 pmaddubsw m5, m4, m6 | |
22040 pmulhrsw m5, m7 | |
22041 packuswb m3, m5 | |
22042 movu [r0 + 1851 * 16], m3 | |
22043 | |
22044 ; mode 33 [row 14 - second half] | |
22045 movu [r0 + 2013 * 16], m3 | |
22046 | |
22047 ; mode 30 [row 30] | |
22048 movu m6, [r5 + 19 * 16] | |
22049 pmaddubsw m3, m0, m6 | |
22050 pmulhrsw m3, m7 | |
22051 pmaddubsw m5, m2, m6 | |
22052 pmulhrsw m5, m7 | |
22053 packuswb m3, m5 | |
22054 movu [r0 + 1852 * 16], m3 | |
22055 pmaddubsw m3, m1, m6 | |
22056 pmulhrsw m3, m7 | |
22057 pmaddubsw m5, m4, m6 | |
22058 pmulhrsw m5, m7 | |
22059 packuswb m3, m5 | |
22060 movu [r0 + 1853 * 16], m3 | |
22061 | |
22062 ; mode 31 [row 22] | |
22063 movu m6, [r5 + 7 * 16] | |
22064 pmaddubsw m3, m0, m6 | |
22065 pmulhrsw m3, m7 | |
22066 pmaddubsw m5, m2, m6 | |
22067 pmulhrsw m5, m7 | |
22068 packuswb m3, m5 | |
22069 movu [r0 + 1900 * 16], m3 | |
22070 pmaddubsw m3, m1, m6 | |
22071 pmulhrsw m3, m7 | |
22072 pmaddubsw m5, m4, m6 | |
22073 pmulhrsw m5, m7 | |
22074 packuswb m3, m5 | |
22075 movu [r0 + 1901 * 16], m3 | |
22076 | |
22077 ; mode 31 [row 23] | |
22078 movu m6, [r5 + 24 * 16] | |
22079 pmaddubsw m3, m0, m6 | |
22080 pmulhrsw m3, m7 | |
22081 pmaddubsw m5, m2, m6 | |
22082 pmulhrsw m5, m7 | |
22083 packuswb m3, m5 | |
22084 movu [r0 + 1902 * 16], m3 | |
22085 pmaddubsw m3, m1, m6 | |
22086 pmulhrsw m3, m7 | |
22087 pmaddubsw m5, m4, m6 | |
22088 pmulhrsw m5, m7 | |
22089 packuswb m3, m5 | |
22090 movu [r0 + 1903 * 16], m3 | |
22091 | |
22092 ; mode 32 [row 18] | |
22093 movu m6, [r5 + 15 * 16] | |
22094 pmaddubsw m3, m0, m6 | |
22095 pmulhrsw m3, m7 | |
22096 pmaddubsw m5, m2, m6 | |
22097 pmulhrsw m5, m7 | |
22098 packuswb m3, m5 | |
22099 movu [r0 + 1956 * 16], m3 | |
22100 pmaddubsw m3, m1, m6 | |
22101 pmulhrsw m3, m7 | |
22102 pmaddubsw m5, m4, m6 | |
22103 pmulhrsw m5, m7 | |
22104 packuswb m3, m5 | |
22105 movu [r0 + 1957 * 16], m3 | |
22106 | |
22107 ; mode 30 [row 31] | |
22108 movu m0, [r3 + 14] | |
22109 movd m1, [r3 + 15] | |
22110 palignr m1, m0, 1 | |
22111 punpcklbw m0, m1 | |
22112 movu m2, [r3 + 22] | |
22113 movd m3, [r3 + 23] | |
22114 palignr m3, m2, 1 | |
22115 punpcklbw m2, m3 | |
22116 movu m1, [r3 + 30] | |
22117 movd m3, [r3 + 31] | |
22118 palignr m3, m1, 1 | |
22119 punpcklbw m1, m3 | |
22120 movu m4, [r3 + 38] | |
22121 movd m5, [r3 + 39] | |
22122 palignr m5, m4, 1 | |
22123 punpcklbw m4, m5 | |
22124 | |
22125 pshufb m5, m0, [tab_S2] | |
22126 movh [r0 + 1854 * 16], m5 | |
22127 | |
22128 ; mode 33 [row 15 - first eight] | |
22129 movh [r0 + 2014 * 16], m5 | |
22130 | |
22131 pshufb m5, m2, [tab_S2] | |
22132 movh [r0 + 1854 * 16 + 8], m5 | |
22133 | |
22134 ; mode 33 [row 15 - second eight] | |
22135 movh [r0 + 2014 * 16 + 8], m5 | |
22136 | |
22137 pshufb m5, m1, [tab_S2] | |
22138 movh [r0 + 1855 * 16], m5 | |
22139 | |
22140 ; mode 33 [row 15 - third eight] | |
22141 movh [r0 + 2015 * 16], m5 | |
22142 | |
22143 pshufb m5, m4, [tab_S2] | |
22144 movh [r0 + 1855 * 16 + 8], m5 | |
22145 | |
22146 ; mode 33 [row 15 - fourth eight] | |
22147 movh [r0 + 2015 * 16 + 8], m5 | |
22148 | |
22149 ; mode 31 [row 24] | |
22150 movu m6, [r5 + 9 * 16] | |
22151 pmaddubsw m3, m0, m6 | |
22152 pmulhrsw m3, m7 | |
22153 pmaddubsw m5, m2, m6 | |
22154 pmulhrsw m5, m7 | |
22155 packuswb m3, m5 | |
22156 movu [r0 + 1904 * 16], m3 | |
22157 pmaddubsw m3, m1, m6 | |
22158 pmulhrsw m3, m7 | |
22159 pmaddubsw m5, m4, m6 | |
22160 pmulhrsw m5, m7 | |
22161 packuswb m3, m5 | |
22162 movu [r0 + 1905 * 16], m3 | |
22163 | |
22164 ; mode 31 [row 25] | |
22165 movu m6, [r5 + 26 * 16] | |
22166 pmaddubsw m3, m0, m6 | |
22167 pmulhrsw m3, m7 | |
22168 pmaddubsw m5, m2, m6 | |
22169 pmulhrsw m5, m7 | |
22170 packuswb m3, m5 | |
22171 movu [r0 + 1906 * 16], m3 | |
22172 | |
22173 ; mode 33 [row 16 - first half] | |
22174 movu [r0 + 2016 * 16], m3 | |
22175 | |
22176 pmaddubsw m3, m1, m6 | |
22177 pmulhrsw m3, m7 | |
22178 pmaddubsw m5, m4, m6 | |
22179 pmulhrsw m5, m7 | |
22180 packuswb m3, m5 | |
22181 movu [r0 + 1907 * 16], m3 | |
22182 | |
22183 ; mode 33 [row 16 - second half] | |
22184 movu [r0 + 2017 * 16], m3 | |
22185 | |
22186 ; mode 32 [row 19] | |
22187 movu m6, [r5 + 4 * 16] | |
22188 pmaddubsw m3, m0, m6 | |
22189 pmulhrsw m3, m7 | |
22190 pmaddubsw m5, m2, m6 | |
22191 pmulhrsw m5, m7 | |
22192 packuswb m3, m5 | |
22193 movu [r0 + 1958 * 16], m3 | |
22194 pmaddubsw m3, m1, m6 | |
22195 pmulhrsw m3, m7 | |
22196 pmaddubsw m5, m4, m6 | |
22197 pmulhrsw m5, m7 | |
22198 packuswb m3, m5 | |
22199 movu [r0 + 1959 * 16], m3 | |
22200 | |
22201 ; mode 32 [row 20] | |
22202 movu m6, [r5 + 25 * 16] | |
22203 pmaddubsw m3, m0, m6 | |
22204 pmulhrsw m3, m7 | |
22205 pmaddubsw m5, m2, m6 | |
22206 pmulhrsw m5, m7 | |
22207 packuswb m3, m5 | |
22208 movu [r0 + 1960 * 16], m3 | |
22209 pmaddubsw m3, m1, m6 | |
22210 pmulhrsw m3, m7 | |
22211 pmaddubsw m5, m4, m6 | |
22212 pmulhrsw m5, m7 | |
22213 packuswb m3, m5 | |
22214 movu [r0 + 1961 * 16], m3 | |
22215 | |
22216 ; mode 31 [row 26] | |
22217 movu m6, [r5 + 11 * 16] | |
22218 movu m0, [r3 + 15] | |
22219 movd m1, [r3 + 16] | |
22220 palignr m1, m0, 1 | |
22221 punpcklbw m0, m1 | |
22222 pmaddubsw m3, m0, m6 | |
22223 pmulhrsw m3, m7 | |
22224 movu m2, [r3 + 23] | |
22225 movd m4, [r3 + 24] | |
22226 palignr m4, m2, 1 | |
22227 punpcklbw m2, m4 | |
22228 pmaddubsw m5, m2, m6 | |
22229 pmulhrsw m5, m7 | |
22230 packuswb m3, m5 | |
22231 movu [r0 + 1908 * 16], m3 | |
22232 | |
22233 movu m1, [r3 + 31] | |
22234 movd m3, [r3 + 32] | |
22235 palignr m3, m1, 1 | |
22236 punpcklbw m1, m3 | |
22237 pmaddubsw m3, m1, m6 | |
22238 pmulhrsw m3, m7 | |
22239 movu m4, [r3 + 39] | |
22240 movd m5, [r3 + 40] | |
22241 palignr m5, m4, 1 | |
22242 punpcklbw m4, m5 | |
22243 pmaddubsw m5, m4, m6 | |
22244 pmulhrsw m5, m7 | |
22245 packuswb m3, m5 | |
22246 movu [r0 + 1909 * 16], m3 | |
22247 | |
22248 ; mode 31 [row 27] | |
22249 movu m6, [r5 + 28 * 16] | |
22250 pmaddubsw m3, m0, m6 | |
22251 pmulhrsw m3, m7 | |
22252 pmaddubsw m5, m2, m6 | |
22253 pmulhrsw m5, m7 | |
22254 packuswb m3, m5 | |
22255 movu [r0 + 1910 * 16], m3 | |
22256 pmaddubsw m3, m1, m6 | |
22257 pmulhrsw m3, m7 | |
22258 pmaddubsw m5, m4, m6 | |
22259 pmulhrsw m5, m7 | |
22260 packuswb m3, m5 | |
22261 movu [r0 + 1911 * 16], m3 | |
22262 | |
22263 ; mode 32 [row 21] | |
22264 movu m6, [r5 + 14 * 16] | |
22265 pmaddubsw m3, m0, m6 | |
22266 pmulhrsw m3, m7 | |
22267 pmaddubsw m5, m2, m6 | |
22268 pmulhrsw m5, m7 | |
22269 packuswb m3, m5 | |
22270 movu [r0 + 1962 * 16], m3 | |
22271 pmaddubsw m3, m1, m6 | |
22272 pmulhrsw m3, m7 | |
22273 pmaddubsw m5, m4, m6 | |
22274 pmulhrsw m5, m7 | |
22275 packuswb m3, m5 | |
22276 movu [r0 + 1963 * 16], m3 | |
22277 | |
22278 ; mode 33 [row 17] | |
22279 movu m6, [r5 + 20 * 16] | |
22280 pmaddubsw m3, m0, m6 | |
22281 pmulhrsw m3, m7 | |
22282 pmaddubsw m5, m2, m6 | |
22283 pmulhrsw m5, m7 | |
22284 packuswb m3, m5 | |
22285 movu [r0 + 2018 * 16], m3 | |
22286 pmaddubsw m3, m1, m6 | |
22287 pmulhrsw m3, m7 | |
22288 pmaddubsw m5, m4, m6 | |
22289 pmulhrsw m5, m7 | |
22290 packuswb m3, m5 | |
22291 movu [r0 + 2019 * 16], m3 | |
22292 | |
22293 ; mode 31 [row 28] | |
22294 movu m6, [r5 + 13 * 16] | |
22295 movu m0, [r3 + 16] | |
22296 movd m1, [r3 + 17] | |
22297 palignr m1, m0, 1 | |
22298 punpcklbw m0, m1 | |
22299 pmaddubsw m3, m0, m6 | |
22300 pmulhrsw m3, m7 | |
22301 movu m2, [r3 + 24] | |
22302 movd m4, [r3 + 25] | |
22303 palignr m4, m2, 1 | |
22304 punpcklbw m2, m4 | |
22305 pmaddubsw m5, m2, m6 | |
22306 pmulhrsw m5, m7 | |
22307 packuswb m3, m5 | |
22308 movu [r0 + 1912 * 16], m3 | |
22309 | |
22310 movu m1, [r3 + 32] | |
22311 movd m3, [r3 + 33] | |
22312 palignr m3, m1, 1 | |
22313 punpcklbw m1, m3 | |
22314 pmaddubsw m3, m1, m6 | |
22315 pmulhrsw m3, m7 | |
22316 movu m4, [r3 + 40] | |
22317 movd m5, [r3 + 41] | |
22318 palignr m5, m4, 1 | |
22319 punpcklbw m4, m5 | |
22320 pmaddubsw m5, m4, m6 | |
22321 pmulhrsw m5, m7 | |
22322 packuswb m3, m5 | |
22323 movu [r0 + 1913 * 16], m3 | |
22324 | |
22325 ; mode 31 [row 29] | |
22326 movu m6, [r5 + 30 * 16] | |
22327 pmaddubsw m3, m0, m6 | |
22328 pmulhrsw m3, m7 | |
22329 pmaddubsw m5, m2, m6 | |
22330 pmulhrsw m5, m7 | |
22331 packuswb m3, m5 | |
22332 movu [r0 + 1914 * 16], m3 | |
22333 pmaddubsw m3, m1, m6 | |
22334 pmulhrsw m3, m7 | |
22335 pmaddubsw m5, m4, m6 | |
22336 pmulhrsw m5, m7 | |
22337 packuswb m3, m5 | |
22338 movu [r0 + 1915 * 16], m3 | |
22339 | |
22340 ; mode 32 [row 22] | |
22341 movu m6, [r5 + 3 * 16] | |
22342 pmaddubsw m3, m0, m6 | |
22343 pmulhrsw m3, m7 | |
22344 pmaddubsw m5, m2, m6 | |
22345 pmulhrsw m5, m7 | |
22346 packuswb m3, m5 | |
22347 movu [r0 + 1964 * 16], m3 | |
22348 pmaddubsw m3, m1, m6 | |
22349 pmulhrsw m3, m7 | |
22350 pmaddubsw m5, m4, m6 | |
22351 pmulhrsw m5, m7 | |
22352 packuswb m3, m5 | |
22353 movu [r0 + 1965 * 16], m3 | |
22354 | |
22355 ; mode 32 [row 23] | |
22356 movu m6, [r5 + 24 * 16] | |
22357 pmaddubsw m3, m0, m6 | |
22358 pmulhrsw m3, m7 | |
22359 pmaddubsw m5, m2, m6 | |
22360 pmulhrsw m5, m7 | |
22361 packuswb m3, m5 | |
22362 movu [r0 + 1966 * 16], m3 | |
22363 pmaddubsw m3, m1, m6 | |
22364 pmulhrsw m3, m7 | |
22365 pmaddubsw m5, m4, m6 | |
22366 pmulhrsw m5, m7 | |
22367 packuswb m3, m5 | |
22368 movu [r0 + 1967 * 16], m3 | |
22369 | |
22370 ; mode 33 [row 18] | |
22371 movu m6, [r5 + 14 * 16] | |
22372 pmaddubsw m3, m0, m6 | |
22373 pmulhrsw m3, m7 | |
22374 pmaddubsw m5, m2, m6 | |
22375 pmulhrsw m5, m7 | |
22376 packuswb m3, m5 | |
22377 movu [r0 + 2020 * 16], m3 | |
22378 pmaddubsw m3, m1, m6 | |
22379 pmulhrsw m3, m7 | |
22380 pmaddubsw m5, m4, m6 | |
22381 pmulhrsw m5, m7 | |
22382 packuswb m3, m5 | |
22383 movu [r0 + 2021 * 16], m3 | |
22384 | |
22385 ; mode 31 [row 30] | |
22386 movu m6, [r5 + 15 * 16] | |
22387 movu m0, [r3 + 17] | |
22388 movd m1, [r3 + 18] | |
22389 palignr m1, m0, 1 | |
22390 punpcklbw m0, m1 | |
22391 pmaddubsw m3, m0, m6 | |
22392 pmulhrsw m3, m7 | |
22393 movu m2, [r3 + 25] | |
22394 movd m4, [r3 + 26] | |
22395 palignr m4, m2, 1 | |
22396 punpcklbw m2, m4 | |
22397 pmaddubsw m5, m2, m6 | |
22398 pmulhrsw m5, m7 | |
22399 packuswb m3, m5 | |
22400 movu [r0 + 1916 * 16], m3 | |
22401 | |
22402 movu m1, [r3 + 33] | |
22403 movd m3, [r3 + 34] | |
22404 palignr m3, m1, 1 | |
22405 punpcklbw m1, m3 | |
22406 pmaddubsw m3, m1, m6 | |
22407 pmulhrsw m3, m7 | |
22408 movu m4, [r3 + 41] | |
22409 movd m5, [r3 + 42] | |
22410 palignr m5, m4, 1 | |
22411 punpcklbw m4, m5 | |
22412 pmaddubsw m5, m4, m6 | |
22413 pmulhrsw m5, m7 | |
22414 packuswb m3, m5 | |
22415 movu [r0 + 1917 * 16], m3 | |
22416 | |
22417 ; mode 32 [row 24] | |
22418 movu m6, [r5 + 13 * 16] | |
22419 pmaddubsw m3, m0, m6 | |
22420 pmulhrsw m3, m7 | |
22421 pmaddubsw m5, m2, m6 | |
22422 pmulhrsw m5, m7 | |
22423 packuswb m3, m5 | |
22424 movu [r0 + 1968 * 16], m3 | |
22425 pmaddubsw m3, m1, m6 | |
22426 pmulhrsw m3, m7 | |
22427 pmaddubsw m5, m4, m6 | |
22428 pmulhrsw m5, m7 | |
22429 packuswb m3, m5 | |
22430 movu [r0 + 1969 * 16], m3 | |
22431 | |
22432 ; mode 33 [row 19] | |
22433 movu m6, [r5 + 8 * 16] | |
22434 pmaddubsw m3, m0, m6 | |
22435 pmulhrsw m3, m7 | |
22436 pmaddubsw m5, m2, m6 | |
22437 pmulhrsw m5, m7 | |
22438 packuswb m3, m5 | |
22439 movu [r0 + 2022 * 16], m3 | |
22440 pmaddubsw m3, m1, m6 | |
22441 pmulhrsw m3, m7 | |
22442 pmaddubsw m5, m4, m6 | |
22443 pmulhrsw m5, m7 | |
22444 packuswb m3, m5 | |
22445 movu [r0 + 2023 * 16], m3 | |
22446 | |
22447 ; mode 31 [row 31] | |
22448 movu m0, [r3 + 18] | |
22449 movd m1, [r3 + 19] | |
22450 palignr m1, m0, 1 | |
22451 punpcklbw m0, m1 | |
22452 movu m2, [r3 + 26] | |
22453 movd m3, [r3 + 27] | |
22454 palignr m3, m2, 1 | |
22455 punpcklbw m2, m3 | |
22456 movu m1, [r3 + 34] | |
22457 movd m3, [r3 + 35] | |
22458 palignr m3, m1, 1 | |
22459 punpcklbw m1, m3 | |
22460 movu m4, [r3 + 42] | |
22461 movd m5, [r3 + 43] | |
22462 palignr m5, m4, 1 | |
22463 punpcklbw m4, m5 | |
22464 | |
22465 pshufb m5, m0, [tab_S2] | |
22466 movh [r0 + 1918 * 16], m5 | |
22467 pshufb m5, m2, [tab_S2] | |
22468 movh [r0 + 1918 * 16 + 8], m5 | |
22469 pshufb m5, m1, [tab_S2] | |
22470 movh [r0 + 1919 * 16], m5 | |
22471 pshufb m5, m4, [tab_S2] | |
22472 movh [r0 + 1919 * 16 + 8], m5 | |
22473 | |
22474 ; mode 32 [row 25] | |
22475 movu m6, [r5 + 2 * 16] | |
22476 pmaddubsw m3, m0, m6 | |
22477 pmulhrsw m3, m7 | |
22478 pmaddubsw m5, m2, m6 | |
22479 pmulhrsw m5, m7 | |
22480 packuswb m3, m5 | |
22481 movu [r0 + 1970 * 16], m3 | |
22482 | |
22483 ; mode 33 [row 20 - first half] | |
22484 movu [r0 + 2024 * 16], m3 | |
22485 | |
22486 pmaddubsw m3, m1, m6 | |
22487 pmulhrsw m3, m7 | |
22488 pmaddubsw m5, m4, m6 | |
22489 pmulhrsw m5, m7 | |
22490 packuswb m3, m5 | |
22491 movu [r0 + 1971 * 16], m3 | |
22492 | |
22493 ; mode 33 [row 20 - second half] | |
22494 movu [r0 + 2025 * 16], m3 | |
22495 | |
22496 ; mode 32 [row 26] | |
22497 movu m6, [r5 + 23 * 16] | |
22498 pmaddubsw m3, m0, m6 | |
22499 pmulhrsw m3, m7 | |
22500 pmaddubsw m5, m2, m6 | |
22501 pmulhrsw m5, m7 | |
22502 packuswb m3, m5 | |
22503 movu [r0 + 1972 * 16], m3 | |
22504 pmaddubsw m3, m1, m6 | |
22505 pmulhrsw m3, m7 | |
22506 pmaddubsw m5, m4, m6 | |
22507 pmulhrsw m5, m7 | |
22508 packuswb m3, m5 | |
22509 movu [r0 + 1973 * 16], m3 | |
22510 | |
22511 ; mode 33 [row 21] | |
22512 movu m6, [r5 + 28 * 16] | |
22513 pmaddubsw m3, m0, m6 | |
22514 pmulhrsw m3, m7 | |
22515 pmaddubsw m5, m2, m6 | |
22516 pmulhrsw m5, m7 | |
22517 packuswb m3, m5 | |
22518 movu [r0 + 2026 * 16], m3 | |
22519 pmaddubsw m3, m1, m6 | |
22520 pmulhrsw m3, m7 | |
22521 pmaddubsw m5, m4, m6 | |
22522 pmulhrsw m5, m7 | |
22523 packuswb m3, m5 | |
22524 movu [r0 + 2027 * 16], m3 | |
22525 | |
22526 ; mode 32 [row 27] | |
22527 movu m6, [r5 + 12 * 16] | |
22528 movu m0, [r3 + 19] | |
22529 movd m1, [r3 + 20] | |
22530 palignr m1, m0, 1 | |
22531 punpcklbw m0, m1 | |
22532 pmaddubsw m3, m0, m6 | |
22533 pmulhrsw m3, m7 | |
22534 movu m2, [r3 + 27] | |
22535 movd m4, [r3 + 28] | |
22536 palignr m4, m2, 1 | |
22537 punpcklbw m2, m4 | |
22538 pmaddubsw m5, m2, m6 | |
22539 pmulhrsw m5, m7 | |
22540 packuswb m3, m5 | |
22541 movu [r0 + 1974 * 16], m3 | |
22542 | |
22543 movu m1, [r3 + 35] | |
22544 movd m3, [r3 + 36] | |
22545 palignr m3, m1, 1 | |
22546 punpcklbw m1, m3 | |
22547 pmaddubsw m3, m1, m6 | |
22548 pmulhrsw m3, m7 | |
22549 movu m4, [r3 + 43] | |
22550 movd m5, [r3 + 44] | |
22551 palignr m5, m4, 1 | |
22552 punpcklbw m4, m5 | |
22553 pmaddubsw m5, m4, m6 | |
22554 pmulhrsw m5, m7 | |
22555 packuswb m3, m5 | |
22556 movu [r0 + 1975 * 16], m3 | |
22557 | |
22558 ; mode 33 [row 22] | |
22559 movu m6, [r5 + 22 * 16] | |
22560 pmaddubsw m3, m0, m6 | |
22561 pmulhrsw m3, m7 | |
22562 pmaddubsw m5, m2, m6 | |
22563 pmulhrsw m5, m7 | |
22564 packuswb m3, m5 | |
22565 movu [r0 + 2028 * 16], m3 | |
22566 pmaddubsw m3, m1, m6 | |
22567 pmulhrsw m3, m7 | |
22568 pmaddubsw m5, m4, m6 | |
22569 pmulhrsw m5, m7 | |
22570 packuswb m3, m5 | |
22571 movu [r0 + 2029 * 16], m3 | |
22572 | |
22573 ; mode 32 [row 28] | |
22574 movu m6, [r5 + 1 * 16] | |
22575 movu m0, [r3 + 20] | |
22576 movd m1, [r3 + 21] | |
22577 palignr m1, m0, 1 | |
22578 punpcklbw m0, m1 | |
22579 pmaddubsw m3, m0, m6 | |
22580 pmulhrsw m3, m7 | |
22581 movu m2, [r3 + 28] | |
22582 movd m4, [r3 + 29] | |
22583 palignr m4, m2, 1 | |
22584 punpcklbw m2, m4 | |
22585 pmaddubsw m5, m2, m6 | |
22586 pmulhrsw m5, m7 | |
22587 packuswb m3, m5 | |
22588 movu [r0 + 1976 * 16], m3 | |
22589 | |
22590 movu m1, [r3 + 36] | |
22591 movd m3, [r3 + 37] | |
22592 palignr m3, m1, 1 | |
22593 punpcklbw m1, m3 | |
22594 pmaddubsw m3, m1, m6 | |
22595 pmulhrsw m3, m7 | |
22596 movu m4, [r3 + 44] | |
22597 movd m5, [r3 + 45] | |
22598 palignr m5, m4, 1 | |
22599 punpcklbw m4, m5 | |
22600 pmaddubsw m5, m4, m6 | |
22601 pmulhrsw m5, m7 | |
22602 packuswb m3, m5 | |
22603 movu [r0 + 1977 * 16], m3 | |
22604 | |
22605 ; mode 32 [row 29] | |
22606 movu m6, [r5 + 22 * 16] | |
22607 pmaddubsw m3, m0, m6 | |
22608 pmulhrsw m3, m7 | |
22609 pmaddubsw m5, m2, m6 | |
22610 pmulhrsw m5, m7 | |
22611 packuswb m3, m5 | |
22612 movu [r0 + 1978 * 16], m3 | |
22613 pmaddubsw m3, m1, m6 | |
22614 pmulhrsw m3, m7 | |
22615 pmaddubsw m5, m4, m6 | |
22616 pmulhrsw m5, m7 | |
22617 packuswb m3, m5 | |
22618 movu [r0 + 1979 * 16], m3 | |
22619 | |
22620 ; mode 33 [row 23] | |
22621 movu m6, [r5 + 16 * 16] | |
22622 pmaddubsw m3, m0, m6 | |
22623 pmulhrsw m3, m7 | |
22624 pmaddubsw m5, m2, m6 | |
22625 pmulhrsw m5, m7 | |
22626 packuswb m3, m5 | |
22627 movu [r0 + 2030 * 16], m3 | |
22628 pmaddubsw m3, m1, m6 | |
22629 pmulhrsw m3, m7 | |
22630 pmaddubsw m5, m4, m6 | |
22631 pmulhrsw m5, m7 | |
22632 packuswb m3, m5 | |
22633 movu [r0 + 2031 * 16], m3 | |
22634 | |
22635 ; mode 32 [row 30] | |
22636 movu m6, [r5 + 11 * 16] | |
22637 movu m0, [r3 + 21] | |
22638 movd m1, [r3 + 22] | |
22639 palignr m1, m0, 1 | |
22640 punpcklbw m0, m1 | |
22641 pmaddubsw m3, m0, m6 | |
22642 pmulhrsw m3, m7 | |
22643 movu m2, [r3 + 29] | |
22644 movd m4, [r3 + 30] | |
22645 palignr m4, m2, 1 | |
22646 punpcklbw m2, m4 | |
22647 pmaddubsw m5, m2, m6 | |
22648 pmulhrsw m5, m7 | |
22649 packuswb m3, m5 | |
22650 movu [r0 + 1980 * 16], m3 | |
22651 | |
22652 movu m1, [r3 + 37] | |
22653 movd m3, [r3 + 38] | |
22654 palignr m3, m1, 1 | |
22655 punpcklbw m1, m3 | |
22656 pmaddubsw m3, m1, m6 | |
22657 pmulhrsw m3, m7 | |
22658 movu m4, [r3 + 45] | |
22659 movd m5, [r3 + 46] | |
22660 palignr m5, m4, 1 | |
22661 punpcklbw m4, m5 | |
22662 pmaddubsw m5, m4, m6 | |
22663 pmulhrsw m5, m7 | |
22664 packuswb m3, m5 | |
22665 movu [r0 + 1981 * 16], m3 | |
22666 | |
22667 ; mode 33 [row 24] | |
22668 movu m6, [r5 + 10 * 16] | |
22669 pmaddubsw m3, m0, m6 | |
22670 pmulhrsw m3, m7 | |
22671 pmaddubsw m5, m2, m6 | |
22672 pmulhrsw m5, m7 | |
22673 packuswb m3, m5 | |
22674 movu [r0 + 2032 * 16], m3 | |
22675 pmaddubsw m3, m1, m6 | |
22676 pmulhrsw m3, m7 | |
22677 pmaddubsw m5, m4, m6 | |
22678 pmulhrsw m5, m7 | |
22679 packuswb m3, m5 | |
22680 movu [r0 + 2033 * 16], m3 | |
22681 | |
22682 ; mode 32 [row 31] | |
22683 movu m0, [r3 + 22] | |
22684 movd m1, [r3 + 23] | |
22685 palignr m1, m0, 1 | |
22686 punpcklbw m0, m1 | |
22687 movu m2, [r3 + 30] | |
22688 movd m3, [r3 + 31] | |
22689 palignr m3, m2, 1 | |
22690 punpcklbw m2, m3 | |
22691 movu m1, [r3 + 38] | |
22692 movd m3, [r3 + 39] | |
22693 palignr m3, m1, 1 | |
22694 punpcklbw m1, m3 | |
22695 movu m4, [r3 + 46] | |
22696 movd m5, [r3 + 47] | |
22697 palignr m5, m4, 1 | |
22698 punpcklbw m4, m5 | |
22699 | |
22700 pshufb m5, m0, [tab_S2] | |
22701 movh [r0 + 1982 * 16], m5 | |
22702 pshufb m5, m2, [tab_S2] | |
22703 movh [r0 + 1982 * 16 + 8], m5 | |
22704 pshufb m5, m1, [tab_S2] | |
22705 movh [r0 + 1983 * 16], m5 | |
22706 pshufb m5, m4, [tab_S2] | |
22707 movh [r0 + 1983 * 16 + 8], m5 | |
22708 | |
22709 ; mode 33 [row 25] | |
22710 movu m6, [r5 + 4 * 16] | |
22711 pmaddubsw m3, m0, m6 | |
22712 pmulhrsw m3, m7 | |
22713 pmaddubsw m5, m2, m6 | |
22714 pmulhrsw m5, m7 | |
22715 packuswb m3, m5 | |
22716 movu [r0 + 2034 * 16], m3 | |
22717 pmaddubsw m3, m1, m6 | |
22718 pmulhrsw m3, m7 | |
22719 pmaddubsw m5, m4, m6 | |
22720 pmulhrsw m5, m7 | |
22721 packuswb m3, m5 | |
22722 movu [r0 + 2035 * 16], m3 | |
22723 | |
22724 ; mode 33 [row 26] | |
22725 movu m6, [r5 + 30 * 16] | |
22726 pmaddubsw m3, m0, m6 | |
22727 pmulhrsw m3, m7 | |
22728 pmaddubsw m5, m2, m6 | |
22729 pmulhrsw m5, m7 | |
22730 packuswb m3, m5 | |
22731 movu [r0 + 2036 * 16], m3 | |
22732 pmaddubsw m3, m1, m6 | |
22733 pmulhrsw m3, m7 | |
22734 pmaddubsw m5, m4, m6 | |
22735 pmulhrsw m5, m7 | |
22736 packuswb m3, m5 | |
22737 movu [r0 + 2037 * 16], m3 | |
22738 | |
22739 ; mode 33 [row 27] | |
22740 movu m6, [r5 + 24 * 16] | |
22741 movu m0, [r3 + 23] | |
22742 movd m1, [r3 + 24] | |
22743 palignr m1, m0, 1 | |
22744 punpcklbw m0, m1 | |
22745 pmaddubsw m3, m0, m6 | |
22746 pmulhrsw m3, m7 | |
22747 movu m2, [r3 + 31] | |
22748 movd m4, [r3 + 32] | |
22749 palignr m4, m2, 1 | |
22750 punpcklbw m2, m4 | |
22751 pmaddubsw m5, m2, m6 | |
22752 pmulhrsw m5, m7 | |
22753 packuswb m3, m5 | |
22754 movu [r0 + 2038 * 16], m3 | |
22755 | |
22756 movu m1, [r3 + 39] | |
22757 movd m3, [r3 + 40] | |
22758 palignr m3, m1, 1 | |
22759 punpcklbw m1, m3 | |
22760 pmaddubsw m3, m1, m6 | |
22761 pmulhrsw m3, m7 | |
22762 movu m4, [r3 + 47] | |
22763 movd m5, [r3 + 48] | |
22764 palignr m5, m4, 1 | |
22765 punpcklbw m4, m5 | |
22766 pmaddubsw m5, m4, m6 | |
22767 pmulhrsw m5, m7 | |
22768 packuswb m3, m5 | |
22769 movu [r0 + 2039 * 16], m3 | |
22770 | |
22771 ; mode 33 [row 28] | |
22772 movu m6, [r5 + 18 * 16] | |
22773 movu m0, [r3 + 24] | |
22774 movd m1, [r3 + 25] | |
22775 palignr m1, m0, 1 | |
22776 punpcklbw m0, m1 | |
22777 pmaddubsw m3, m0, m6 | |
22778 pmulhrsw m3, m7 | |
22779 movu m2, [r3 + 32] | |
22780 movd m4, [r3 + 33] | |
22781 palignr m4, m2, 1 | |
22782 punpcklbw m2, m4 | |
22783 pmaddubsw m5, m2, m6 | |
22784 pmulhrsw m5, m7 | |
22785 packuswb m3, m5 | |
22786 movu [r0 + 2040 * 16], m3 | |
22787 | |
22788 movu m1, [r3 + 40] | |
22789 movd m3, [r3 + 41] | |
22790 palignr m3, m1, 1 | |
22791 punpcklbw m1, m3 | |
22792 pmaddubsw m3, m1, m6 | |
22793 pmulhrsw m3, m7 | |
22794 movu m4, [r3 + 48] | |
22795 movd m5, [r3 + 49] | |
22796 palignr m5, m4, 1 | |
22797 punpcklbw m4, m5 | |
22798 pmaddubsw m5, m4, m6 | |
22799 pmulhrsw m5, m7 | |
22800 packuswb m3, m5 | |
22801 movu [r0 + 2041 * 16], m3 | |
22802 | |
22803 ; mode 33 [row 29] | |
22804 movu m6, [r5 + 12 * 16] | |
22805 movu m0, [r3 + 25] | |
22806 movd m1, [r3 + 26] | |
22807 palignr m1, m0, 1 | |
22808 punpcklbw m0, m1 | |
22809 pmaddubsw m3, m0, m6 | |
22810 pmulhrsw m3, m7 | |
22811 movu m2, [r3 + 33] | |
22812 movd m4, [r3 + 34] | |
22813 palignr m4, m2, 1 | |
22814 punpcklbw m2, m4 | |
22815 pmaddubsw m5, m2, m6 | |
22816 pmulhrsw m5, m7 | |
22817 packuswb m3, m5 | |
22818 movu [r0 + 2042 * 16], m3 | |
22819 | |
22820 movu m1, [r3 + 41] | |
22821 movd m3, [r3 + 42] | |
22822 palignr m3, m1, 1 | |
22823 punpcklbw m1, m3 | |
22824 pmaddubsw m3, m1, m6 | |
22825 pmulhrsw m3, m7 | |
22826 movu m4, [r3 + 49] | |
22827 movd m5, [r3 + 50] | |
22828 palignr m5, m4, 1 | |
22829 punpcklbw m4, m5 | |
22830 pmaddubsw m5, m4, m6 | |
22831 pmulhrsw m5, m7 | |
22832 packuswb m3, m5 | |
22833 movu [r0 + 2043 * 16], m3 | |
22834 | |
22835 ; mode 33 [row 30] | |
22836 movu m6, [r5 + 6 * 16] | |
22837 movu m0, [r3 + 26] | |
22838 movd m1, [r3 + 27] | |
22839 palignr m1, m0, 1 | |
22840 punpcklbw m0, m1 | |
22841 pmaddubsw m3, m0, m6 | |
22842 pmulhrsw m3, m7 | |
22843 movu m2, [r3 + 34] | |
22844 movd m4, [r3 + 35] | |
22845 palignr m4, m2, 1 | |
22846 punpcklbw m2, m4 | |
22847 pmaddubsw m5, m2, m6 | |
22848 pmulhrsw m5, m7 | |
22849 packuswb m3, m5 | |
22850 movu [r0 + 2044 * 16], m3 | |
22851 | |
22852 movu m1, [r3 + 42] | |
22853 movd m3, [r3 + 43] | |
22854 palignr m3, m1, 1 | |
22855 punpcklbw m1, m3 | |
22856 pmaddubsw m3, m1, m6 | |
22857 pmulhrsw m3, m7 | |
22858 movu m4, [r3 + 50] | |
22859 movd m5, [r3 + 51] | |
22860 palignr m5, m4, 1 | |
22861 punpcklbw m4, m5 | |
22862 pmaddubsw m5, m4, m6 | |
22863 pmulhrsw m5, m7 | |
22864 packuswb m3, m5 | |
22865 movu [r0 + 2045 * 16], m3 | |
22866 | |
22867 ; mode 33 [row 31] | |
22868 movu m5, [r3 + 27] | |
22869 movu [r0 + 2046 * 16], m5 | |
22870 movu m5, [r3 + 43] | |
22871 movu [r0 + 2047 * 16], m5 | |
22872 | |
22873 ;mode 34 [row 0] | |
22874 movu m0, [r3 + 2] | |
22875 movu [r0 + 2048 * 16], m0 | |
22876 movu m1, [r3 + 18] | |
22877 movu [r0 + 2049 * 16], m1 | |
22878 | |
22879 ;mode 34 [row 1] | |
22880 movu m2, [r3 + 34] | |
22881 palignr m3, m1, m0, 1 | |
22882 movu [r0 + 2050 * 16], m3 | |
22883 palignr m4, m2, m1, 1 | |
22884 movu [r0 + 2051 * 16], m4 | |
22885 | |
22886 ;mode 34 [row 2] | |
22887 palignr m3, m1, m0, 2 | |
22888 movu [r0 + 2052 * 16], m3 | |
22889 palignr m4, m2, m1, 2 | |
22890 movu [r0 + 2053 * 16], m4 | |
22891 | |
22892 ;mode 34 [row 3] | |
22893 palignr m3, m1, m0, 3 | |
22894 movu [r0 + 2054 * 16], m3 | |
22895 palignr m4, m2, m1, 3 | |
22896 movu [r0 + 2055 * 16], m4 | |
22897 | |
22898 ;mode 34 [row 4] | |
22899 palignr m3, m1, m0, 4 | |
22900 movu [r0 + 2056 * 16], m3 | |
22901 palignr m4, m2, m1, 4 | |
22902 movu [r0 + 2057 * 16], m4 | |
22903 | |
22904 ;mode 34 [row 5] | |
22905 palignr m3, m1, m0, 5 | |
22906 movu [r0 + 2058 * 16], m3 | |
22907 palignr m4, m2, m1, 5 | |
22908 movu [r0 + 2059 * 16], m4 | |
22909 | |
22910 ;mode 34 [row 6] | |
22911 palignr m3, m1, m0, 6 | |
22912 movu [r0 + 2060 * 16], m3 | |
22913 palignr m4, m2, m1, 6 | |
22914 movu [r0 + 2061 * 16], m4 | |
22915 | |
22916 ;mode 34 [row 7] | |
22917 palignr m3, m1, m0, 7 | |
22918 movu [r0 + 2062 * 16], m3 | |
22919 palignr m4, m2, m1, 7 | |
22920 movu [r0 + 2063 * 16], m4 | |
22921 | |
22922 ;mode 34 [row 8] | |
22923 palignr m3, m1, m0, 8 | |
22924 movu [r0 + 2064 * 16], m3 | |
22925 palignr m4, m2, m1, 8 | |
22926 movu [r0 + 2065 * 16], m4 | |
22927 | |
22928 ;mode 34 [row 9] | |
22929 palignr m3, m1, m0, 9 | |
22930 movu [r0 + 2066 * 16], m3 | |
22931 palignr m4, m2, m1, 9 | |
22932 movu [r0 + 2067 * 16], m4 | |
22933 | |
22934 ;mode 34 [row 10] | |
22935 palignr m3, m1, m0, 10 | |
22936 movu [r0 + 2068 * 16], m3 | |
22937 palignr m4, m2, m1, 10 | |
22938 movu [r0 + 2069 * 16], m4 | |
22939 | |
22940 ;mode 34 [row 11] | |
22941 palignr m3, m1, m0, 11 | |
22942 movu [r0 + 2070 * 16], m3 | |
22943 palignr m4, m2, m1, 11 | |
22944 movu [r0 + 2071 * 16], m4 | |
22945 | |
22946 ;mode 34 [row 12] | |
22947 palignr m3, m1, m0, 12 | |
22948 movu [r0 + 2072 * 16], m3 | |
22949 palignr m4, m2, m1, 12 | |
22950 movu [r0 + 2073 * 16], m4 | |
22951 | |
22952 ;mode 34 [row 13] | |
22953 palignr m3, m1, m0, 13 | |
22954 movu [r0 + 2074 * 16], m3 | |
22955 palignr m4, m2, m1, 13 | |
22956 movu [r0 + 2075 * 16], m4 | |
22957 | |
22958 ;mode 34 [row 14] | |
22959 palignr m3, m1, m0, 14 | |
22960 movu [r0 + 2076 * 16], m3 | |
22961 palignr m4, m2, m1, 14 | |
22962 movu [r0 + 2077 * 16], m4 | |
22963 | |
22964 ;mode 34 [row 15] | |
22965 palignr m3, m1, m0, 15 | |
22966 movu [r0 + 2078 * 16], m3 | |
22967 palignr m4, m2, m1, 15 | |
22968 movu [r0 + 2079 * 16], m4 | |
22969 | |
22970 ;mode 34 [row 16] | |
22971 palignr m3, m1, m0, 16 | |
22972 movu [r0 + 2080 * 16], m3 | |
22973 palignr m4, m2, m1, 16 | |
22974 movu [r0 + 2081 * 16], m4 | |
22975 | |
22976 ;mode 34 [row 17] | |
22977 movu m0, [r3 + 19] | |
22978 movu [r0 + 2082 * 16], m0 | |
22979 movu m1, [r3 + 35] | |
22980 movu [r0 + 2083 * 16], m1 | |
22981 | |
22982 mov r2d, r6d | |
22983 mov [r4], r2b | |
22984 mov r2d, [rsp] | |
22985 mov [r1 + 64], r2b | |
22986 | |
22987 ;mode 34 [row 18] | |
22988 movu m2, [r3 + 51] | |
22989 palignr m3, m1, m0, 1 | |
22990 movu [r0 + 2084 * 16], m3 | |
22991 palignr m4, m2, m1, 1 | |
22992 movu [r0 + 2085 * 16], m4 | |
22993 | |
22994 ;mode 34 [row 19] | |
22995 palignr m3, m1, m0, 2 | |
22996 movu [r0 + 2086 * 16], m3 | |
22997 palignr m4, m2, m1, 2 | |
22998 movu [r0 + 2087 * 16], m4 | |
22999 | |
23000 ;mode 34 [row 20] | |
23001 palignr m3, m1, m0, 3 | |
23002 movu [r0 + 2088 * 16], m3 | |
23003 palignr m4, m2, m1, 3 | |
23004 movu [r0 + 2089 * 16], m4 | |
23005 | |
23006 ;mode 34 [row 21] | |
23007 palignr m3, m1, m0, 4 | |
23008 movu [r0 + 2090 * 16], m3 | |
23009 palignr m4, m2, m1, 4 | |
23010 movu [r0 + 2091 * 16], m4 | |
23011 | |
23012 ;mode 34 [row 22] | |
23013 palignr m3, m1, m0, 5 | |
23014 movu [r0 + 2092 * 16], m3 | |
23015 palignr m4, m2, m1, 5 | |
23016 movu [r0 + 2093 * 16], m4 | |
23017 | |
23018 ;mode 34 [row 23] | |
23019 palignr m3, m1, m0, 6 | |
23020 movu [r0 + 2094 * 16], m3 | |
23021 palignr m4, m2, m1, 6 | |
23022 movu [r0 + 2095 * 16], m4 | |
23023 | |
23024 ;mode 34 [row 24] | |
23025 palignr m3, m1, m0, 7 | |
23026 movu [r0 + 2096 * 16], m3 | |
23027 palignr m4, m2, m1, 7 | |
23028 movu [r0 + 2097 * 16], m4 | |
23029 | |
23030 ;mode 34 [row 25] | |
23031 palignr m3, m1, m0, 8 | |
23032 movu [r0 + 2098 * 16], m3 | |
23033 palignr m4, m2, m1, 8 | |
23034 movu [r0 + 2099 * 16], m4 | |
23035 | |
23036 ;mode 34 [row 26] | |
23037 palignr m3, m1, m0, 9 | |
23038 movu [r0 + 2100 * 16], m3 | |
23039 palignr m4, m2, m1, 9 | |
23040 movu [r0 + 2101 * 16], m4 | |
23041 | |
23042 ;mode 34 [row 27] | |
23043 palignr m3, m1, m0, 10 | |
23044 movu [r0 + 2102 * 16], m3 | |
23045 palignr m4, m2, m1, 10 | |
23046 movu [r0 + 2103 * 16], m4 | |
23047 | |
23048 ;mode 34 [row 28] | |
23049 palignr m3, m1, m0, 11 | |
23050 movu [r0 + 2104 * 16], m3 | |
23051 palignr m4, m2, m1, 11 | |
23052 movu [r0 + 2105 * 16], m4 | |
23053 | |
23054 ;mode 34 [row 29] | |
23055 palignr m3, m1, m0, 12 | |
23056 movu [r0 + 2106 * 16], m3 | |
23057 palignr m4, m2, m1, 12 | |
23058 movu [r0 + 2107 * 16], m4 | |
23059 | |
23060 ;mode 34 [row 30] | |
23061 palignr m3, m1, m0, 13 | |
23062 movu [r0 + 2108 * 16], m3 | |
23063 palignr m4, m2, m1, 13 | |
23064 movu [r0 + 2109 * 16], m4 | |
23065 | |
23066 ;mode 34 [row 31] | |
23067 palignr m3, m1, m0, 14 | |
23068 movu [r0 + 2110 * 16], m3 | |
23069 palignr m4, m2, m1, 14 | |
23070 movu [r0 + 2111 * 16], m4 | |
23071 RET | |
23072 | |
23073 | |
23074 ;----------------------------------------------------------------------------- | |
23075 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) | |
;
; AVX2 version. Produces the 4x4 prediction block for every mode 2..34 in one
; straight-line pass.
;
; Output layout: mode N occupies the 16 bytes at dest + (N - 2) * 16, so 33
;                blocks (528 bytes) are written in total. All stores are
;                unaligned (movu) or single-byte (pextrb).
;
; Registers:
;   r0 = dest, r1 = refPix
;   r2 = walking pointer into all_ang4       (overwrites the filtPix argument)
;   r3 = walking pointer into all_ang4_shuff (overwrites the bLuma argument)
;        -> filtPix and bLuma are never read by this routine.
;   m0 = 16 reference bytes broadcast to both 128-bit lanes
;   m5 = [pw_1024]; pmulhrsw by 1024 computes (x + 16) >> 5
;        (presumably pw_1024 is eight/sixteen words of 1024 -- defined elsewhere)
;   m1, m2 = per-mode results; m3, m4 = scratch for the mode 10 / 26 edge fix-up
;
; Interpolated modes follow one pattern:
;   pshufb    -> arrange reference bytes as pairs using a row of all_ang4_shuff
;   pmaddubsw -> multiply-add each byte pair with a row of all_ang4
;                (presumably weight pairs summing to 32 -- table not shown here)
;   pmulhrsw  -> round and shift right by 5
; Two modes are normally finished together: packuswb packs words to bytes
; within each 128-bit lane, vpermq 11011000b (qword order 0,2,1,3) regroups the
; result so the low 128 bits hold the first mode and the high 128 bits the
; second, and one 32-byte store fills both modes' slots.
23076 ;----------------------------------------------------------------------------- | |
23077 INIT_YMM avx2 | |
; x86inc cglobal: 4 arguments, 4 general-purpose registers, 6 vector registers.
23078 cglobal all_angs_pred_4x4, 4, 4, 6 | |
23079 | |
23080 mova m5, [pw_1024] | |
23081 lea r2, [all_ang4] | |
23082 lea r3, [all_ang4_shuff] | |
23083 | |
23084 ; mode 2 | |
; Pure copy, no weights: m0 = 16 bytes from refPix + 9 in both lanes; after the
; 1-byte shift the first shuffle row (0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6)
; builds four overlapping 4-byte rows.
23085 | |
23086 vbroadcasti128 m0, [r1 + 9] | |
23087 mova xm1, xm0 | |
23088 psrldq xm1, 1 | |
23089 pshufb xm1, [r3] | |
23090 movu [r0], xm1 | |
23091 | |
23092 ; mode 3 | |
23093 | |
23094 pshufb m1, m0, [r3 + 1 * mmsize] | |
23095 pmaddubsw m1, [r2] | |
23096 pmulhrsw m1, m5 | |
23097 | |
23098 ; mode 4 | |
; Finished together with mode 3: low 128 bits = mode 3, high 128 bits = mode 4.
23099 | |
23100 pshufb m2, m0, [r3 + 2 * mmsize] | |
23101 pmaddubsw m2, [r2 + 1 * mmsize] | |
23102 pmulhrsw m2, m5 | |
23103 packuswb m1, m2 | |
23104 vpermq m1, m1, 11011000b | |
23105 movu [r0 + (3 - 2) * 16], m1 | |
23106 | |
23107 ; mode 5 | |
; Reuses mode 4's shuffle row with its own weight row.
23108 | |
23109 pshufb m1, m0, [r3 + 2 * mmsize] | |
23110 pmaddubsw m1, [r2 + 2 * mmsize] | |
23111 pmulhrsw m1, m5 | |
23112 | |
23113 ; mode 6 | |
23114 | |
23115 pshufb m2, m0, [r3 + 3 * mmsize] | |
23116 pmaddubsw m2, [r2 + 3 * mmsize] | |
23117 pmulhrsw m2, m5 | |
23118 packuswb m1, m2 | |
23119 vpermq m1, m1, 11011000b | |
23120 movu [r0 + (5 - 2) * 16], m1 | |
23121 | |
; Step both table pointers past the four rows consumed so far.
23122 add r3, 4 * mmsize | |
23123 add r2, 4 * mmsize | |
23124 | |
23125 ; mode 7 | |
23126 | |
23127 pshufb m1, m0, [r3 + 0 * mmsize] | |
23128 pmaddubsw m1, [r2 + 0 * mmsize] | |
23129 pmulhrsw m1, m5 | |
23130 | |
23131 ; mode 8 | |
23132 | |
23133 pshufb m2, m0, [r3 + 1 * mmsize] | |
23134 pmaddubsw m2, [r2 + 1 * mmsize] | |
23135 pmulhrsw m2, m5 | |
23136 packuswb m1, m2 | |
23137 vpermq m1, m1, 11011000b | |
23138 movu [r0 + (7 - 2) * 16], m1 | |
23139 | |
23140 ; mode 9 | |
; Handled alone (same shuffle row as mode 8): m1 is packed with itself and only
; the low 16 bytes are stored.
23141 | |
23142 pshufb m1, m0, [r3 + 1 * mmsize] | |
23143 pmaddubsw m1, [r2 + 2 * mmsize] | |
23144 pmulhrsw m1, m5 | |
23145 packuswb m1, m1 | |
23146 vpermq m1, m1, 11011000b | |
23147 movu [r0 + (9 - 2) * 16], xm1 | |
23148 | |
23149 ; mode 10 | |
; Shuffle only, no weights.
23150 | |
23151 pshufb xm1, xm0, [r3 + 2 * mmsize] | |
23152 movu [r0 + (10 - 2) * 16], xm1 | |
23153 | |
; Edge fix-up for mode 10: byte 0 of each 4-byte row of the block just stored
; is replaced by
;   sat_u8(refPix[9] + ((refPix[1 + i] - refPix[0]) >> 1)),  i = 0..3
; computed in 16-bit words (xm1 = 0 serves as both the unpack zero and the
; "broadcast byte 0" shuffle mask).
; NOTE(review): this is applied unconditionally -- bLuma is not consulted;
; confirm callers only use this routine where the filter is wanted.
23154 pxor xm1, xm1 | |
23155 movd xm2, [r1 + 1] | |
23156 pshufd xm3, xm2, 0 | |
23157 punpcklbw xm3, xm1 | |
23158 pinsrb xm2, [r1], 0 | |
23159 pshufb xm4, xm2, xm1 | |
23160 punpcklbw xm4, xm1 | |
23161 psubw xm3, xm4 | |
23162 psraw xm3, 1 | |
23163 pshufb xm4, xm0, xm1 | |
23164 punpcklbw xm4, xm1 | |
23165 paddw xm3, xm4 | |
23166 packuswb xm3, xm1 | |
23167 | |
; 128 = (10 - 2) * 16; stride 4 = one row of the 4x4 block.
23168 pextrb [r0 + 128], xm3, 0 | |
23169 pextrb [r0 + 132], xm3, 1 | |
23170 pextrb [r0 + 136], xm3, 2 | |
23171 pextrb [r0 + 140], xm3, 3 | |
23172 | |
23173 ; mode 11 | |
; From here on m0 holds the 16 bytes starting at refPix + 0 (both lanes).
23174 | |
23175 vbroadcasti128 m0, [r1] | |
23176 pshufb m1, m0, [r3 + 3 * mmsize] | |
23177 pmaddubsw m1, [r2 + 3 * mmsize] | |
23178 pmulhrsw m1, m5 | |
23179 | |
23180 ; mode 12 | |
; Same shuffle row as mode 11; only the weight pointer advances.
23181 | |
23182 add r2, 4 * mmsize | |
23183 | |
23184 pshufb m2, m0, [r3 + 3 * mmsize] | |
23185 pmaddubsw m2, [r2 + 0 * mmsize] | |
23186 pmulhrsw m2, m5 | |
23187 packuswb m1, m2 | |
23188 vpermq m1, m1, 11011000b | |
23189 movu [r0 + (11 - 2) * 16], m1 | |
23190 | |
23191 ; mode 13 | |
23192 | |
23193 add r3, 4 * mmsize | |
23194 | |
23195 pshufb m1, m0, [r3 + 0 * mmsize] | |
23196 pmaddubsw m1, [r2 + 1 * mmsize] | |
23197 pmulhrsw m1, m5 | |
23198 | |
23199 ; mode 14 | |
23200 | |
23201 pshufb m2, m0, [r3 + 1 * mmsize] | |
23202 pmaddubsw m2, [r2 + 2 * mmsize] | |
23203 pmulhrsw m2, m5 | |
23204 packuswb m1, m2 | |
23205 vpermq m1, m1, 11011000b | |
23206 movu [r0 + (13 - 2) * 16], m1 | |
23207 | |
23208 ; mode 15 | |
23209 | |
23210 pshufb m1, m0, [r3 + 2 * mmsize] | |
23211 pmaddubsw m1, [r2 + 3 * mmsize] | |
23212 pmulhrsw m1, m5 | |
23213 | |
23214 ; mode 16 | |
23215 | |
23216 add r2, 4 * mmsize | |
23217 | |
23218 pshufb m2, m0, [r3 + 3 * mmsize] | |
23219 pmaddubsw m2, [r2 + 0 * mmsize] | |
23220 pmulhrsw m2, m5 | |
23221 packuswb m1, m2 | |
23222 vpermq m1, m1, 11011000b | |
23223 movu [r0 + (15 - 2) * 16], m1 | |
23224 | |
23225 ; mode 17 | |
; Packed with itself so its 16 result bytes sit in the low 128 bits of m1.
23226 | |
23227 add r3, 4 * mmsize | |
23228 | |
23229 pshufb m1, m0, [r3 + 0 * mmsize] | |
23230 pmaddubsw m1, [r2 + 1 * mmsize] | |
23231 pmulhrsw m1, m5 | |
23232 packuswb m1, m1 | |
23233 vpermq m1, m1, 11011000b | |
23234 | |
23235 ; mode 18 | |
; Shuffle only, no weights: the low 128 bits of the shuffle result become the
; high half of m1, so one 32-byte store writes the mode 17 and mode 18 slots.
23236 | |
23237 pshufb m2, m0, [r3 + 1 * mmsize] | |
23238 vinserti128 m1, m1, xm2, 1 | |
23239 movu [r0 + (17 - 2) * 16], m1 | |
23240 | |
23241 ; mode 19 | |
23242 | |
23243 pshufb m1, m0, [r3 + 2 * mmsize] | |
23244 pmaddubsw m1, [r2 + 2 * mmsize] | |
23245 pmulhrsw m1, m5 | |
23246 | |
23247 ; mode 20 | |
23248 | |
23249 pshufb m2, m0, [r3 + 3 * mmsize] | |
23250 pmaddubsw m2, [r2 + 3 * mmsize] | |
23251 pmulhrsw m2, m5 | |
23252 packuswb m1, m2 | |
23253 vpermq m1, m1, 11011000b | |
23254 movu [r0 + (19 - 2) * 16], m1 | |
23255 | |
23256 ; mode 21 | |
23257 | |
23258 add r2, 4 * mmsize | |
23259 add r3, 4 * mmsize | |
23260 | |
23261 pshufb m1, m0, [r3 + 0 * mmsize] | |
23262 pmaddubsw m1, [r2 + 0 * mmsize] | |
23263 pmulhrsw m1, m5 | |
23264 | |
23265 ; mode 22 | |
23266 | |
23267 pshufb m2, m0, [r3 + 1 * mmsize] | |
23268 pmaddubsw m2, [r2 + 1 * mmsize] | |
23269 pmulhrsw m2, m5 | |
23270 packuswb m1, m2 | |
23271 vpermq m1, m1, 11011000b | |
23272 movu [r0 + (21 - 2) * 16], m1 | |
23273 | |
23274 ; mode 23 | |
23275 | |
23276 pshufb m1, m0, [r3 + 2 * mmsize] | |
23277 pmaddubsw m1, [r2 + 2 * mmsize] | |
23278 pmulhrsw m1, m5 | |
23279 | |
23280 ; mode 24 | |
23281 | |
23282 pshufb m2, m0, [r3 + 3 * mmsize] | |
23283 pmaddubsw m2, [r2 + 3 * mmsize] | |
23284 pmulhrsw m2, m5 | |
23285 packuswb m1, m2 | |
23286 vpermq m1, m1, 11011000b | |
23287 movu [r0 + (23 - 2) * 16], m1 | |
23288 | |
23289 ; mode 25 | |
; Handled alone (same shuffle row as mode 24); only the low 16 bytes are stored.
23290 | |
23291 add r2, 4 * mmsize | |
23292 | |
23293 pshufb m1, m0, [r3 + 3 * mmsize] | |
23294 pmaddubsw m1, [r2 + 0 * mmsize] | |
23295 pmulhrsw m1, m5 | |
23296 packuswb m1, m1 | |
23297 vpermq m1, m1, 11011000b | |
23298 movu [r0 + (25 - 2) * 16], xm1 | |
23299 | |
23300 ; mode 26 | |
; Shuffle only, no weights.
23301 | |
23302 add r3, 4 * mmsize | |
23303 | |
23304 pshufb xm1, xm0, [r3 + 0 * mmsize] | |
23305 movu [r0 + (26 - 2) * 16], xm1 | |
23306 | |
; Edge fix-up for mode 26, mirroring the mode 10 one with the two reference
; runs swapped: byte 0 of each 4-byte row becomes
;   sat_u8(refPix[1] + ((refPix[9 + i] - refPix[0]) >> 1)),  i = 0..3
; xm4 is fully overwritten by the pshufb broadcast, so its previous contents
; outside byte 0 do not matter.
; NOTE(review): also applied regardless of bLuma -- confirm with callers.
23307 pxor xm1, xm1 | |
23308 movd xm2, [r1 + 9] | |
23309 pshufd xm3, xm2, 0 | |
23310 punpcklbw xm3, xm1 | |
23311 pinsrb xm4, [r1 + 0], 0 | |
23312 pshufb xm4, xm1 | |
23313 punpcklbw xm4, xm1 | |
23314 psubw xm3, xm4 | |
23315 psraw xm3, 1 | |
23316 psrldq xm2, xm0, 1 | |
23317 pshufb xm2, xm1 | |
23318 punpcklbw xm2, xm1 | |
23319 paddw xm3, xm2 | |
23320 packuswb xm3, xm1 | |
23321 | |
; 384 = (26 - 2) * 16; stride 4 = one row of the 4x4 block.
23322 pextrb [r0 + 384], xm3, 0 | |
23323 pextrb [r0 + 388], xm3, 1 | |
23324 pextrb [r0 + 392], xm3, 2 | |
23325 pextrb [r0 + 396], xm3, 3 | |
23326 | |
23327 ; mode 27 | |
23328 | |
23329 pshufb m1, m0, [r3 + 1 * mmsize] | |
23330 pmaddubsw m1, [r2 + 1 * mmsize] | |
23331 pmulhrsw m1, m5 | |
23332 | |
23333 ; mode 28 | |
; Same shuffle row as mode 27, different weight row.
23334 | |
23335 pshufb m2, m0, [r3 + 1 * mmsize] | |
23336 pmaddubsw m2, [r2 + 2 * mmsize] | |
23337 pmulhrsw m2, m5 | |
23338 packuswb m1, m2 | |
23339 vpermq m1, m1, 11011000b | |
23340 movu [r0 + (27 - 2) * 16], m1 | |
23341 | |
23342 ; mode 29 | |
23343 | |
23344 pshufb m1, m0, [r3 + 2 * mmsize] | |
23345 pmaddubsw m1, [r2 + 3 * mmsize] | |
23346 pmulhrsw m1, m5 | |
23347 | |
23348 ; mode 30 | |
23349 | |
23350 add r2, 4 * mmsize | |
23351 | |
23352 pshufb m2, m0, [r3 + 3 * mmsize] | |
23353 pmaddubsw m2, [r2 + 0 * mmsize] | |
23354 pmulhrsw m2, m5 | |
23355 packuswb m1, m2 | |
23356 vpermq m1, m1, 11011000b | |
23357 movu [r0 + (29 - 2) * 16], m1 | |
23358 | |
23359 ; mode 31 | |
23360 | |
23361 add r3, 4 * mmsize | |
23362 | |
23363 pshufb m1, m0, [r3 + 0 * mmsize] | |
23364 pmaddubsw m1, [r2 + 1 * mmsize] | |
23365 pmulhrsw m1, m5 | |
23366 | |
23367 ; mode 32 | |
; Same shuffle row as mode 31, different weight row.
23368 | |
23369 pshufb m2, m0, [r3 + 0 * mmsize] | |
23370 pmaddubsw m2, [r2 + 2 * mmsize] | |
23371 pmulhrsw m2, m5 | |
23372 packuswb m1, m2 | |
23373 vpermq m1, m1, 11011000b | |
23374 movu [r0 + (31 - 2) * 16], m1 | |
23375 | |
23376 ; mode 33 | |
; m2 still holds the mode 32 words here; after packuswb + vpermq its bytes end
; up in the high 128 bits of m1, which the vinserti128 below replaces, so only
; the mode 33 bytes in the low 128 bits survive.
23377 | |
23378 pshufb m1, m0, [r3 + 1 * mmsize] | |
23379 pmaddubsw m1, [r2 + 3 * mmsize] | |
23380 pmulhrsw m1, m5 | |
23381 packuswb m1, m2 | |
23382 vpermq m1, m1, 11011000b | |
23383 | |
23384 ; mode 34 | |
; Shuffle only, no weights (m0 is shuffled in place; it is not needed again).
; Its low 128 bits become the high half of m1 and the final 32-byte store
; fills the mode 33 and mode 34 slots.
23385 | |
23386 pshufb m0, [r3 + 2 * mmsize] | |
23387 vinserti128 m1, m1, xm0, 1 | |
23388 movu [r0 + (33 - 2) * 16], m1 | |
23389 RET | |
23390 | |
23391 ;----------------------------------------------------------------------------- | |
23392 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) | |
23393 ;----------------------------------------------------------------------------- | |
23394 INIT_XMM sse2 | |
23395 cglobal all_angs_pred_4x4, 4, 4, 8 | |
23396 | |
23397 ; mode 2 | |
23398 | |
23399 movh m6, [r1 + 9] | |
23400 mova m2, m6 | |
23401 psrldq m2, 1 | |
23402 movd [r0], m2 ;byte[A, B, C, D] | |
23403 psrldq m2, 1 | |
23404 movd [r0 + 4], m2 ;byte[B, C, D, E] | |
23405 psrldq m2, 1 | |
23406 movd [r0 + 8], m2 ;byte[C, D, E, F] | |
23407 psrldq m2, 1 | |
23408 movd [r0 + 12], m2 ;byte[D, E, F, G] | |
23409 | |
23410 ; mode 10/26 | |
23411 | |
23412 pxor m7, m7 | |
23413 pshufd m5, m6, 0 | |
23414 mova [r0 + 128], m5 ;mode 10 byte[9, A, B, C, 9, A, B, C, 9, A, B, C, 9, A, B, C] | |
23415 | |
23416 movd m4, [r1 + 1] | |
23417 pshufd m4, m4, 0 | |
23418 mova [r0 + 384], m4 ;mode 26 byte[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4] | |
23419 | |
23420 movd m1, [r1] | |
23421 punpcklbw m1, m7 | |
23422 pshuflw m1, m1, 0x00 | |
23423 punpcklqdq m1, m1 ;m1 = byte[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | |
23424 | |
23425 punpckldq m4, m5 | |
23426 punpcklbw m4, m7 ;m4 = word[1, 2, 3, 4, 9, A, B, C] | |
23427 pshuflw m2, m4, 0x00 | |
23428 pshufhw m2, m2, 0x00 ;m2 = word[1, 1, 1, 1, 9, 9, 9, 9] | |
23429 | |
23430 psubw m4, m1 | |
23431 psraw m4, 1 | |
23432 | |
23433 pshufd m2, m2, q1032 ;m2 = word[9, 9, 9, 9, 1, 1, 1, 1] | |
23434 paddw m4, m2 | |
23435 packuswb m4, m4 | |
23436 | |
23437 %if ARCH_X86_64 | |
23438 movq r2, m4 | |
23439 | |
23440 mov [r0 + 128], r2b ;mode 10 | |
23441 shr r2, 8 | |
23442 mov [r0 + 132], r2b | |
23443 shr r2, 8 | |
23444 mov [r0 + 136], r2b | |
23445 shr r2, 8 | |
23446 mov [r0 + 140], r2b | |
23447 shr r2, 8 | |
23448 mov [r0 + 384], r2b ;mode 26 | |
23449 shr r2d, 8 | |
23450 mov [r0 + 388], r2b | |
23451 shr r2d, 8 | |
23452 mov [r0 + 392], r2b | |
23453 shr r2d, 8 | |
23454 mov [r0 + 396], r2b | |
23455 | |
23456 %else | |
23457 movd r2d, m4 | |
23458 | |
23459 mov [r0 + 128], r2b ;mode 10 | |
23460 shr r2d, 8 | |
23461 mov [r0 + 132], r2b | |
23462 shr r2d, 8 | |
23463 mov [r0 + 136], r2b | |
23464 shr r2d, 8 | |
23465 mov [r0 + 140], r2b | |
23466 | |
23467 psrldq m4, 4 | |
23468 movd r2d, m4 | |
23469 | |
23470 mov [r0 + 384], r2b ;mode 26 | |
23471 shr r2d, 8 | |
23472 mov [r0 + 388], r2b | |
23473 shr r2d, 8 | |
23474 mov [r0 + 392], r2b | |
23475 shr r2d, 8 | |
23476 mov [r0 + 396], r2b | |
23477 %endif | |
23478 | |
23479 ; mode 3 | |
23480 | |
23481 mova m2, [pw_16] | |
23482 lea r3, [pw_ang_table + 7 * 16] | |
23483 lea r2, [pw_ang_table + 23 * 16] | |
23484 punpcklbw m6, m6 | |
23485 psrldq m6, 1 | |
23486 movh m1, m6 | |
23487 psrldq m6, 2 | |
23488 movh m0, m6 | |
23489 psrldq m6, 2 | |
23490 movh m3, m6 | |
23491 psrldq m6, 2 | |
23492 punpcklbw m1, m7 ;m1 = word[9, A, A, B, B, C, C, D] | |
23493 punpcklbw m0, m7 ;m0 = word[A, B, B, C, C, D, D, E] | |
23494 punpcklbw m3, m7 ;m3 = word[B, C, C, D, D, E, E, F] | |
23495 punpcklbw m6, m7 ;m6 = word[C, D, D, E, E, F, F, G] | |
23496 | |
23497 mova m7, [r2 - 3 * 16] | |
23498 | |
23499 pmaddwd m5, m1, [r2 + 3 * 16] | |
23500 pmaddwd m4, m0, m7 | |
23501 | |
23502 packssdw m5, m4 | |
23503 paddw m5, m2 | |
23504 psraw m5, 5 | |
23505 | |
23506 pmaddwd m4, m3, [r3 + 7 * 16] | |
23507 pmaddwd m6, [r3 + 1 * 16] | |
23508 | |
23509 packssdw m4, m6 | |
23510 paddw m4, m2 | |
23511 psraw m4, 5 | |
23512 | |
23513 packuswb m5, m4 | |
23514 mova [r0 + 16], m5 | |
23515 movd [r0 + 68], m5 ;mode 6 row 1 | |
23516 psrldq m5, 4 | |
23517 movd [r0 + 76], m5 ;mode 6 row 3 | |
23518 | |
23519 ; mode 4 | |
23520 | |
23521 pmaddwd m4, m0, [r2 + 8 * 16] | |
23522 pmaddwd m6, m3, m7 | |
23523 | |
23524 packssdw m4, m6 | |
23525 paddw m4, m2 | |
23526 psraw m4, 5 | |
23527 | |
23528 pmaddwd m5, m1, [r2 - 2 * 16] | |
23529 pmaddwd m6, m0, [r3 + 3 * 16] | |
23530 | |
23531 packssdw m5, m6 | |
23532 paddw m5, m2 | |
23533 psraw m5, 5 | |
23534 | |
23535 packuswb m5, m4 | |
23536 mova [r0 + 32], m5 | |
23537 | |
23538 ; mode 5 | |
23539 | |
23540 pmaddwd m5, m1, [r2 - 6 * 16] | |
23541 pmaddwd m6, m0, [r3 - 5 * 16] | |
23542 | |
23543 packssdw m5, m6 | |
23544 paddw m5, m2 | |
23545 psraw m5, 5 | |
23546 | |
23547 pmaddwd m4, m0, [r2 - 4 * 16] | |
23548 pmaddwd m3, [r3 - 3 * 16] | |
23549 | |
23550 packssdw m4, m3 | |
23551 paddw m4, m2 | |
23552 psraw m4, 5 | |
23553 | |
23554 packuswb m5, m4 | |
23555 mova [r0 + 48], m5 | |
23556 | |
23557 ; mode 6 | |
23558 | |
23559 pmaddwd m5, m1, [r3 + 6 * 16] | |
23560 pmaddwd m6, m0, [r3 + 0 * 16] | |
23561 | |
23562 packssdw m5, m6 | |
23563 paddw m5, m2 | |
23564 psraw m5, 5 | |
23565 | |
23566 packuswb m5, m6 | |
23567 movd [r0 + 64], m5 | |
23568 psrldq m5, 4 | |
23569 movd [r0 + 72], m5 | |
23570 | |
23571 ; mode 7 | |
23572 | |
23573 pmaddwd m5, m1, [r3 + 2 * 16] | |
23574 pmaddwd m6, m1, [r2 - 5 * 16] | |
23575 | |
23576 packssdw m5, m6 | |
23577 paddw m5, m2 | |
23578 psraw m5, 5 | |
23579 | |
23580 mova m3, [r2 + 4 * 16] | |
23581 pmaddwd m4, m1, m3 | |
23582 pmaddwd m0, [r3 - 3 * 16] | |
23583 | |
23584 packssdw m4, m0 | |
23585 paddw m4, m2 | |
23586 psraw m4, 5 | |
23587 | |
23588 packuswb m5, m4 | |
23589 mova [r0 + 80], m5 | |
23590 | |
23591 ; mode 8 | |
23592 | |
23593 mova m0, [r3 - 2 * 16] | |
23594 pmaddwd m5, m1, m0 | |
23595 pmaddwd m6, m1, [r3 + 3 * 16] | |
23596 | |
23597 packssdw m5, m6 | |
23598 paddw m5, m2 | |
23599 psraw m5, 5 | |
23600 | |
23601 pmaddwd m4, m1, [r3 + 8 * 16] | |
23602 pmaddwd m7, m1 | |
23603 | |
23604 packssdw m4, m7 | |
23605 paddw m4, m2 | |
23606 psraw m4, 5 | |
23607 | |
23608 packuswb m5, m4 | |
23609 mova [r0 + 96], m5 | |
23610 | |
23611 ; mode 9 | |
23612 | |
23613 pmaddwd m5, m1, [r3 - 5 * 16] | |
23614 pmaddwd m6, m1, [r3 - 3 * 16] | |
23615 | |
23616 packssdw m5, m6 | |
23617 paddw m5, m2 | |
23618 psraw m5, 5 | |
23619 | |
23620 pmaddwd m4, m1, [r3 - 1 * 16] | |
23621 pmaddwd m6, m1, [r3 + 1 * 16] | |
23622 | |
23623 packssdw m4, m6 | |
23624 paddw m4, m2 | |
23625 psraw m4, 5 | |
23626 | |
23627 packuswb m5, m4 | |
23628 mova [r0 + 112], m5 | |
23629 | |
23630 ; mode 11 | |
23631 | |
23632 movd m5, [r1] | |
23633 punpcklwd m5, m1 | |
23634 pand m5, [pb_0000000000000F0F] | |
23635 pslldq m1, 4 | |
23636 por m1, m5 ;m1 = word[0, 9, 9, A, A, B, B, C] | |
23637 | |
23638 pmaddwd m5, m1, [r2 + 7 * 16] | |
23639 pmaddwd m6, m1, [r2 + 5 * 16] | |
23640 | |
23641 packssdw m5, m6 | |
23642 paddw m5, m2 | |
23643 psraw m5, 5 | |
23644 | |
23645 pmaddwd m4, m1, [r2 + 3 * 16] | |
23646 pmaddwd m6, m1, [r2 + 1 * 16] | |
23647 | |
23648 packssdw m4, m6 | |
23649 paddw m4, m2 | |
23650 psraw m4, 5 | |
23651 | |
23652 packuswb m5, m4 | |
23653 mova [r0 + 144], m5 | |
23654 | |
23655 ; mode 12 | |
23656 | |
23657 pmaddwd m3, m1 | |
23658 pmaddwd m6, m1, [r2 - 1 * 16] | |
23659 | |
23660 packssdw m3, m6 | |
23661 paddw m3, m2 | |
23662 psraw m3, 5 | |
23663 | |
23664 pmaddwd m4, m1, [r2 - 6 * 16] | |
23665 pmaddwd m6, m1, [r3 + 5 * 16] | |
23666 | |
23667 packssdw m4, m6 | |
23668 paddw m4, m2 | |
23669 psraw m4, 5 | |
23670 | |
23671 packuswb m3, m4 | |
23672 mova [r0 + 160], m3 | |
23673 | |
23674 ; mode 13 | |
23675 | |
23676 mova m3, m1 | |
23677 movd m7, [r1 + 4] | |
23678 punpcklwd m7, m1 | |
23679 pand m7, [pb_0000000000000F0F] | |
23680 pslldq m3, 4 | |
23681 por m3, m7 ;m3 = word[4, 0, 0, 9, 9, A, A, B] | |
23682 | |
23683 pmaddwd m5, m1, [r2 + 0 * 16] | |
23684 pmaddwd m6, m1, [r3 + 7 * 16] | |
23685 | |
23686 packssdw m5, m6 | |
23687 paddw m5, m2 | |
23688 psraw m5, 5 | |
23689 | |
23690 pmaddwd m4, m1, m0 | |
23691 pmaddwd m6, m3, [r2 + 5 * 16] | |
23692 | |
23693 packssdw m4, m6 | |
23694 paddw m4, m2 | |
23695 psraw m4, 5 | |
23696 | |
23697 packuswb m5, m4 | |
23698 mova [r0 + 176], m5 | |
23699 | |
23700 ; mode 14 | |
23701 | |
23702 pmaddwd m5, m1, [r2 - 4 * 16] | |
23703 pmaddwd m6, m1, [r3 - 1 * 16] | |
23704 | |
23705 packssdw m5, m6 | |
23706 paddw m5, m2 | |
23707 psraw m5, 5 | |
23708 | |
23709 movd m6, [r1 + 2] | |
23710 pand m3, [pw_FFFFFFFFFFFFFFF0] | |
23711 pand m6, [pb_000000000000000F] | |
23712 por m3, m6 ;m3 = word[2, 0, 0, 9, 9, A, A, B] | |
23713 | |
23714 pmaddwd m4, m3, [r2 + 2 * 16] | |
23715 pmaddwd m6, m3, [r3 + 5 * 16] | |
23716 | |
23717 packssdw m4, m6 | |
23718 paddw m4, m2 | |
23719 psraw m4, 5 | |
23720 | |
23721 packuswb m5, m4 | |
23722 mova [r0 + 192], m5 | |
23723 psrldq m5, 4 | |
23724 movd [r0 + 240], m5 ;mode 17 row 0 | |
23725 | |
23726 ; mode 15 | |
23727 | |
23728 pmaddwd m5, m1, [r3 + 8 * 16] | |
23729 pmaddwd m6, m3, [r2 + 7 * 16] | |
23730 | |
23731 packssdw m5, m6 | |
23732 paddw m5, m2 | |
23733 psraw m5, 5 | |
23734 | |
23735 pmaddwd m6, m3, [r3 + 6 * 16] | |
23736 | |
23737 mova m0, m3 | |
23738 punpcklwd m7, m3 | |
23739 pslldq m0, 4 | |
23740 pand m7, [pb_0000000000000F0F] | |
23741 por m0, m7 ;m0 = word[4, 2, 2, 0, 0, 9, 9, A] | |
23742 | |
23743 pmaddwd m4, m0, [r2 + 5 * 16] | |
23744 | |
23745 packssdw m6, m4 | |
23746 paddw m6, m2 | |
23747 psraw m6, 5 | |
23748 | |
23749 packuswb m5, m6 | |
23750 mova [r0 + 208], m5 | |
23751 | |
23752 ; mode 16 | |
23753 | |
23754 pmaddwd m5, m1, [r3 + 4 * 16] | |
23755 pmaddwd m6, m3, [r2 - 1 * 16] | |
23756 | |
23757 packssdw m5, m6 | |
23758 paddw m5, m2 | |
23759 psraw m5, 5 | |
23760 | |
23761 pmaddwd m3, [r3 - 6 * 16] | |
23762 | |
23763 movd m6, [r1 + 3] | |
23764 pand m0, [pw_FFFFFFFFFFFFFFF0] | |
23765 pand m6, [pb_000000000000000F] | |
23766 por m0, m6 ;m0 = word[3, 2, 2, 0, 0, 9, 9, A] | |
23767 | |
23768 pmaddwd m0, [r3 + 5 * 16] | |
23769 packssdw m3, m0 | |
23770 paddw m3, m2 | |
23771 psraw m3, 5 | |
23772 | |
23773 packuswb m5, m3 | |
23774 mova [r0 + 224], m5 | |
23775 | |
23776 ; mode 17 | |
23777 | |
23778 movd m4, [r1 + 1] | |
23779 punpcklwd m4, m1 | |
23780 pand m4, [pb_0000000000000F0F] | |
23781 pslldq m1, 4 | |
23782 por m1, m4 ;m1 = word[1, 0, 0, 9, 9, A, A, B] | |
23783 | |
23784 pmaddwd m6, m1, [r3 + 5 * 16] | |
23785 | |
23786 packssdw m6, m6 | |
23787 paddw m6, m2 | |
23788 psraw m6, 5 | |
23789 | |
23790 movd m5, [r1 + 2] | |
23791 punpcklwd m5, m1 | |
23792 pand m5, [pb_0000000000000F0F] | |
23793 pslldq m1, 4 | |
23794 por m1, m5 ;m1 = word[2, 1, 1, 0, 0, 9, 9, A] | |
23795 | |
23796 pmaddwd m4, m1, [r2 - 5 * 16] | |
23797 | |
23798 punpcklwd m7, m1 | |
23799 pand m7, [pb_0000000000000F0F] | |
23800 pslldq m1, 4 | |
23801 por m1, m7 ;m1 = word[4, 2, 2, 1, 1, 0, 0, 9] | |
23802 | |
23803 pmaddwd m1, [r2 + 1 * 16] | |
23804 packssdw m4, m1 | |
23805 paddw m4, m2 | |
23806 psraw m4, 5 | |
23807 | |
23808 packuswb m6, m4 | |
23809 movd [r0 + 244], m6 | |
23810 psrldq m6, 8 | |
23811 movh [r0 + 248], m6 | |
23812 | |
23813 ; mode 18 | |
23814 | |
23815 movh m1, [r1] | |
23816 movd [r0 + 256], m1 ;byte[0, 1, 2, 3] | |
23817 | |
23818 movh m3, [r1 + 2] | |
23819 punpcklqdq m3, m1 | |
23820 psrldq m3, 7 | |
23821 movd [r0 + 260], m3 ;byte[9, 0, 1, 2] | |
23822 | |
23823 movh m4, [r1 + 3] | |
23824 punpcklqdq m4, m3 | |
23825 psrldq m4, 7 | |
23826 movd [r0 + 264], m4 ;byte[A, 9, 0, 1] | |
23827 | |
23828 movh m0, [r1 + 4] | |
23829 punpcklqdq m0, m4 | |
23830 psrldq m0, 7 | |
23831 movd [r0 + 268], m0 ;byte[B, A, 9, 0] | |
23832 | |
23833 ; mode 19 | |
23834 | |
23835 pxor m7, m7 | |
23836 punpcklbw m4, m3 | |
23837 punpcklbw m3, m1 | |
23838 punpcklbw m1, m1 | |
23839 punpcklbw m4, m7 ;m4 = word[A, 9, 9, 0, 0, 1, 1, 2] | |
23840 punpcklbw m3, m7 ;m3 = word[9, 0, 0, 1, 1, 2, 2, 3] | |
23841 psrldq m1, 1 | |
23842 punpcklbw m1, m7 ;m1 = word[0, 1, 1, 2, 2, 3, 3, 4] | |
23843 | |
23844 pmaddwd m6, m1, [r3 - 1 * 16] | |
23845 pmaddwd m7, m3, [r3 + 5 * 16] | |
23846 | |
23847 packssdw m6, m7 | |
23848 paddw m6, m2 | |
23849 psraw m6, 5 | |
23850 | |
23851 pmaddwd m5, m4, [r2 - 5 * 16] | |
23852 | |
23853 movd m7, [r1 + 12] | |
23854 punpcklwd m7, m4 | |
23855 pand m7, [pb_0000000000000F0F] | |
23856 pslldq m4, 4 | |
23857 por m4, m7 ;m4 = word[C, A, A, 9, 9, 0, 0, 1] | |
23858 | |
23859 pmaddwd m4, [r2 + 1 * 16] | |
23860 packssdw m5, m4 | |
23861 paddw m5, m2 | |
23862 psraw m5, 5 | |
23863 | |
23864 packuswb m6, m5 | |
23865 mova [r0 + 272], m6 | |
23866 movd [r0 + 324], m6 ;mode 22 row 1 | |
23867 | |
23868 ; mode 20 | |
23869 | |
23870 pmaddwd m5, m1, [r3 + 4 * 16] | |
23871 | |
23872 movd m4, [r1 + 10] | |
23873 pand m3, [pw_FFFFFFFFFFFFFFF0] | |
23874 pand m4, [pb_000000000000000F] | |
23875 por m3, m4 ;m3 = word[A, 0, 0, 1, 1, 2, 2, 3] | |
23876 | |
23877 pmaddwd m6, m3, [r2 - 1 * 16] | |
23878 | |
23879 packssdw m5, m6 | |
23880 paddw m5, m2 | |
23881 psraw m5, 5 | |
23882 | |
23883 pmaddwd m4, m3, [r3 - 6 * 16] | |
23884 | |
23885 punpcklwd m0, m3 | |
23886 pand m0, [pb_0000000000000F0F] | |
23887 mova m6, m3 | |
23888 pslldq m6, 4 | |
23889 por m0, m6 ;m0 = word[B, A, A, 0, 0, 1, 1, 2] | |
23890 | |
23891 pmaddwd m6, m0, [r3 + 5 * 16] | |
23892 | |
23893 packssdw m4, m6 | |
23894 paddw m4, m2 | |
23895 psraw m4, 5 | |
23896 | |
23897 packuswb m5, m4 | |
23898 mova [r0 + 288], m5 | |
23899 | |
23900 ; mode 21 | |
23901 | |
23902 pmaddwd m4, m1, [r3 + 8 * 16] | |
23903 pmaddwd m6, m3, [r2 + 7 * 16] | |
23904 | |
23905 packssdw m4, m6 | |
23906 paddw m4, m2 | |
23907 psraw m4, 5 | |
23908 | |
23909 pmaddwd m5, m3, [r3 + 6 * 16] | |
23910 | |
23911 pand m0, [pw_FFFFFFFFFFFFFFF0] | |
23912 pand m7, [pb_000000000000000F] | |
23913 por m0, m7 ;m0 = word[C, A, A, 0, 0, 1, 1, 2] | |
23914 | |
23915 pmaddwd m0, [r2 + 5 * 16] | |
23916 packssdw m5, m0 | |
23917 paddw m5, m2 | |
23918 psraw m5, 5 | |
23919 | |
23920 packuswb m4, m5 | |
23921 mova [r0 + 304], m4 | |
23922 | |
23923 ; mode 22 | |
23924 | |
23925 pmaddwd m4, m1, [r2 - 4 * 16] | |
23926 packssdw m4, m4 | |
23927 paddw m4, m2 | |
23928 psraw m4, 5 | |
23929 | |
23930 mova m0, [r3 + 5 * 16] | |
23931 pmaddwd m5, m3, [r2 + 2 * 16] | |
23932 pmaddwd m6, m3, m0 | |
23933 | |
23934 packssdw m5, m6 | |
23935 paddw m5, m2 | |
23936 psraw m5, 5 | |
23937 | |
23938 packuswb m4, m5 | |
23939 movd [r0 + 320], m4 | |
23940 psrldq m4, 8 | |
23941 movh [r0 + 328], m4 | |
23942 | |
23943 ; mode 23 | |
23944 | |
23945 pmaddwd m4, m1, [r2 + 0 * 16] | |
23946 pmaddwd m5, m1, [r3 + 7 * 16] | |
23947 | |
23948 packssdw m4, m5 | |
23949 paddw m4, m2 | |
23950 psraw m4, 5 | |
23951 | |
23952 pmaddwd m6, m1, [r3 - 2 * 16] | |
23953 | |
23954 pand m3, [pw_FFFFFFFFFFFFFFF0] | |
23955 por m3, m7 ;m3 = word[C, 0, 0, 1, 1, 2, 2, 3] | |
23956 | |
23957 pmaddwd m3, [r2 + 5 * 16] | |
23958 packssdw m6, m3 | |
23959 paddw m6, m2 | |
23960 psraw m6, 5 | |
23961 | |
23962 packuswb m4, m6 | |
23963 mova [r0 + 336], m4 | |
23964 | |
23965 ; mode 24 | |
23966 | |
23967 pmaddwd m4, m1, [r2 + 4 * 16] | |
23968 pmaddwd m5, m1, [r2 - 1 * 16] | |
23969 | |
23970 packssdw m4, m5 | |
23971 paddw m4, m2 | |
23972 psraw m4, 5 | |
23973 | |
23974 pmaddwd m6, m1, [r2 - 6 * 16] | |
23975 pmaddwd m0, m1 | |
23976 | |
23977 packssdw m6, m0 | |
23978 paddw m6, m2 | |
23979 psraw m6, 5 | |
23980 | |
23981 packuswb m4, m6 | |
23982 mova [r0 + 352], m4 | |
23983 | |
23984 ; mode 25 | |
23985 | |
23986 pmaddwd m4, m1, [r2 + 7 * 16] | |
23987 pmaddwd m5, m1, [r2 + 5 * 16] | |
23988 | |
23989 packssdw m4, m5 | |
23990 paddw m4, m2 | |
23991 psraw m4, 5 | |
23992 | |
23993 pmaddwd m6, m1, [r2 + 3 * 16] | |
23994 pmaddwd m1, [r2 + 1 * 16] | |
23995 | |
23996 packssdw m6, m1 | |
23997 paddw m6, m2 | |
23998 psraw m6, 5 | |
23999 | |
24000 packuswb m4, m6 | |
24001 mova [r0 + 368], m4 | |
24002 | |
24003 ; mode 27 | |
24004 | |
24005 movh m0, [r1 + 1] | |
24006 pxor m7, m7 | |
24007 punpcklbw m0, m0 | |
24008 psrldq m0, 1 | |
24009 movh m1, m0 | |
24010 psrldq m0, 2 | |
24011 movh m3, m0 | |
24012 psrldq m0, 2 | |
24013 punpcklbw m1, m7 ;m1 = word[1, 2, 2, 3, 3, 4, 4, 5] | |
24014 punpcklbw m3, m7 ;m3 = word[2, 3, 3, 4, 4, 5, 5, 6] | |
24015 punpcklbw m0, m7 ;m0 = word[3, 4, 4, 5, 5, 6, 6, 7] | |
24016 | |
24017 mova m7, [r3 - 3 * 16] | |
24018 | |
24019 pmaddwd m4, m1, [r3 - 5 * 16] | |
24020 pmaddwd m5, m1, m7 | |
24021 | |
24022 packssdw m4, m5 | |
24023 paddw m4, m2 | |
24024 psraw m4, 5 | |
24025 | |
24026 pmaddwd m6, m1, [r3 - 1 * 16] | |
24027 pmaddwd m5, m1, [r3 + 1 * 16] | |
24028 | |
24029 packssdw m6, m5 | |
24030 paddw m6, m2 | |
24031 psraw m6, 5 | |
24032 | |
24033 packuswb m4, m6 | |
24034 mova [r0 + 400], m4 | |
24035 | |
24036 ; mode 28 | |
24037 | |
24038 pmaddwd m4, m1, [r3 - 2 * 16] | |
24039 pmaddwd m5, m1, [r3 + 3 * 16] | |
24040 | |
24041 packssdw m4, m5 | |
24042 paddw m4, m2 | |
24043 psraw m4, 5 | |
24044 | |
24045 pmaddwd m6, m1, [r3 + 8 * 16] | |
24046 pmaddwd m5, m1, [r2 - 3 * 16] | |
24047 | |
24048 packssdw m6, m5 | |
24049 paddw m6, m2 | |
24050 psraw m6, 5 | |
24051 | |
24052 packuswb m4, m6 | |
24053 mova [r0 + 416], m4 | |
24054 | |
24055 ; mode 29 | |
24056 | |
24057 pmaddwd m4, m1, [r3 + 2 * 16] | |
24058 pmaddwd m6, m1, [r2 - 5 * 16] | |
24059 | |
24060 packssdw m4, m6 | |
24061 paddw m4, m2 | |
24062 psraw m4, 5 | |
24063 | |
24064 pmaddwd m6, m1, [r2 + 4 * 16] | |
24065 pmaddwd m5, m3, m7 | |
24066 | |
24067 packssdw m6, m5 | |
24068 paddw m6, m2 | |
24069 psraw m6, 5 | |
24070 | |
24071 packuswb m4, m6 | |
24072 mova [r0 + 432], m4 | |
24073 | |
24074 ; mode 30 | |
24075 | |
24076 pmaddwd m4, m1, [r3 + 6 * 16] | |
24077 pmaddwd m5, m1, [r2 + 3 * 16] | |
24078 | |
24079 packssdw m4, m5 | |
24080 paddw m4, m2 | |
24081 psraw m4, 5 | |
24082 | |
24083 pmaddwd m6, m3, [r3 + 0 * 16] | |
24084 pmaddwd m5, m3, [r2 - 3 * 16] | |
24085 | |
24086 packssdw m6, m5 | |
24087 paddw m6, m2 | |
24088 psraw m6, 5 | |
24089 | |
24090 packuswb m4, m6 | |
24091 mova [r0 + 448], m4 | |
24092 psrldq m4, 4 | |
24093 movh [r0 + 496], m4 ;mode 33 row 0 | |
24094 psrldq m4, 8 | |
24095 movd [r0 + 500], m4 ;mode 33 row 1 | |
24096 | |
24097 ; mode 31 | |
24098 | |
24099 pmaddwd m4, m1, [r2 - 6 * 16] | |
24100 pmaddwd m5, m3, [r3 - 5 * 16] | |
24101 | |
24102 packssdw m4, m5 | |
24103 paddw m4, m2 | |
24104 psraw m4, 5 | |
24105 | |
24106 pmaddwd m6, m3, [r2 - 4 * 16] | |
24107 pmaddwd m7, m0 | |
24108 | |
24109 packssdw m6, m7 | |
24110 paddw m6, m2 | |
24111 psraw m6, 5 | |
24112 | |
24113 packuswb m4, m6 | |
24114 mova [r0 + 464], m4 | |
24115 | |
24116 ; mode 32 | |
24117 | |
24118 pmaddwd m1, [r2 - 2 * 16] | |
24119 pmaddwd m5, m3, [r3 + 3 * 16] | |
24120 | |
24121 packssdw m1, m5 | |
24122 paddw m1, m2 | |
24123 psraw m1, 5 | |
24124 | |
24125 pmaddwd m3, [r2 + 8 * 16] | |
24126 pmaddwd m5, m0, [r2 - 3 * 16] | |
24127 packssdw m3, m5 | |
24128 paddw m3, m2 | |
24129 psraw m3, 5 | |
24130 | |
24131 packuswb m1, m3 | |
24132 mova [r0 + 480], m1 | |
24133 | |
24134 ; mode 33 | |
24135 | |
24136 pmaddwd m0, [r3 + 7 * 16] | |
24137 pxor m7, m7 | |
24138 movh m4, [r1 + 4] | |
24139 punpcklbw m4, m4 | |
24140 psrldq m4, 1 | |
24141 punpcklbw m4, m7 | |
24142 | |
24143 pmaddwd m4, [r3 + 1 * 16] | |
24144 | |
24145 packssdw m0, m4 | |
24146 paddw m0, m2 | |
24147 psraw m0, 5 | |
24148 | |
24149 packuswb m0, m0 | |
24150 movh [r0 + 504], m0 | |
24151 | |
24152 ; mode 34 | |
24153 | |
24154 movh m7, [r1 + 2] | |
24155 movd [r0 + 512], m7 ;byte[2, 3, 4, 5] | |
24156 | |
24157 psrldq m7, 1 | |
24158 movd [r0 + 516], m7 ;byte[3, 4, 5, 6] | |
24159 | |
24160 psrldq m7, 1 | |
24161 movd [r0 + 520], m7 ;byte[4, 5, 6, 7] | |
24162 | |
24163 psrldq m7, 1 | |
24164 movd [r0 + 524], m7 ;byte[5, 6, 7, 8] | |
24165 | |
24166 RET |