comparison x265/source/common/x86/intrapred8_allangs.asm @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:772086c29cc7
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5 ;* Praveen Tiwari <praveen@multicorewareinc.com>
6 ;*
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
11 ;*
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 ;*
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
24
25 %include "x86inc.asm"
26 %include "x86util.asm"
27
28 SECTION_RODATA 32
29
; all_ang4_shuff: 27 rows of 32 bytes each; every entry is a small byte index
; in the range 0..12. Nothing in the visible part of this file reads the table.
; NOTE(review): presumably per-mode pshufb control masks for a 4x4 all-angles
; path defined further down -- confirm at the use site before editing a row.
30 all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
31 db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
32 db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
33 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
34 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
35 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
36 db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
37 db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
38 db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
39 db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
40 db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
41 db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
42 db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
43 db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
44 db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
45 db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
46 db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
47 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
48 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
49 db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
50 db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
51 db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
52 db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
53 db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
54 db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
55 db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
56 db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
57
; all_ang4: 28 rows of 32 bytes. Each row is four 8-byte groups, and each group
; repeats a single byte pair (a, b) four times; every pair in the table
; satisfies a + b = 32. The second 14 rows are the first 14 in reverse order.
; Nothing in the visible part of this file reads the table.
; NOTE(review): the pairs look like pmaddubsw interpolation weights, one group
; per predicted row -- confirm at the use site before editing a row.
58 all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
59 db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
60 db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
61 db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
62 db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
63 db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
64 db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
65 db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
66 db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
67 db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
68 db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
69 db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
70 db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
71 db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
72 db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
73 db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
74 db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
75 db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
76 db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
77 db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
78 db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
79 db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
80 db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
81 db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
82 db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
83 db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
84 db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
85 db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
86
87
88 SECTION .text
89
90 ; global constant
91 cextern pw_1024
92
93 ; common constant with intrapred8.asm
94 cextern ang_table
95 cextern pw_ang_table
96 cextern tab_S1
97 cextern tab_S2
98 cextern tab_Si
99 cextern pw_16
100 cextern pb_000000000000000F
101 cextern pb_0000000000000F0F
102 cextern pw_FFFFFFFFFFFFFFF0
103
104
105 ;-----------------------------------------------------------------------------
106 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;
; Writes the 4x4 predictions for modes 2..34 back to back into dest: mode m
; occupies the 16 bytes at dest + (m - 2) * 16, stored as four 4-byte rows.
;
; Registers: r0 = dest, r1 = refPix. filtPix (r2) is never referenced here,
; and r3 (the 4th argument register, bLuma) is overwritten with the address of
; ang_table before it is ever read, so bLuma has no effect in this routine.
; m2 holds pw_1024 for the whole function.
;
; Every interpolated row uses the same four steps: pmaddubsw of a vector of
; adjacent reference-byte pairs with one 16-byte row of ang_table, pmulhrsw
; with pw_1024, packuswb, then movd of the low 4 bytes to dest. Several rows
; are shared between modes; the shared result is simply stored twice.
;
; NOTE(review): ang_table and pw_1024 are defined outside this file. The usual
; reading is that row f of ang_table holds (32 - f, f) weight pairs and that
; pw_1024 is 1024 in every word, which makes pmulhrsw a rounded ">> 5" --
; confirm against intrapred8.asm.
107 ;-----------------------------------------------------------------------------
108 INIT_XMM sse4
109 cglobal all_angs_pred_4x4, 4, 4, 8
110
111 ; mode 2 -> dest + 0: rows are the 4-byte windows starting at refPix[10], [11], [12], [13]
112
113 movh m0, [r1 + 10] ; m0 = refPix[10..17]
114 movd [r0], m0
115
116 palignr m1, m0, 1
117 movd [r0 + 4], m1
118
119 palignr m1, m0, 2
120 movd [r0 + 8], m1
121
122 palignr m1, m0, 3
123 movd [r0 + 12], m1
124
125 ; mode 3 -> dest + 16
126
127 mova m2, [pw_1024] ; m2 = pmulhrsw multiplier for every interpolated row below
128
129 pslldq m1, m0, 1
130 pinsrb m1, [r1 + 9], 0
131 punpcklbw m1, m0 ; m1 = adjacent pairs (refPix[9],refPix[10]), (refPix[10],refPix[11]), ...
132
133 lea r3, [ang_table] ; r3 = ang_table base; the bLuma value that was in r3 is lost here
134
135 pmaddubsw m6, m1, [r3 + 26 * 16]
136 pmulhrsw m6, m2
137 packuswb m6, m6
138 movd [r0 + 16], m6 ; m6 is kept: it is stored again as mode 6 row 1
139
140 palignr m0, m1, 2 ; m0 = pair vector starting one pixel later: (refPix[10],refPix[11]), ...
141
142 mova m7, [r3 + 20 * 16] ; weights row 20 cached in m7; reused for mode 4 row 3
143
144 pmaddubsw m3, m0, m7
145 pmulhrsw m3, m2
146 packuswb m3, m3
147 movd [r0 + 20], m3
148
149 ; mode 6 [row 3]: same value as mode 3 row 1, so it is stored now at dest + 64 + 12
150 movd [r0 + 76], m3
151
152 palignr m3, m1, 4 ; m3 = pair vector starting two pixels later
153
154 pmaddubsw m4, m3, [r3 + 14 * 16]
155 pmulhrsw m4, m2
156 packuswb m4, m4
157 movd [r0 + 24], m4
158
159 palignr m4, m1, 6 ; pair vector starting three pixels later
160
161 pmaddubsw m4, [r3 + 8 * 16]
162 pmulhrsw m4, m2
163 packuswb m4, m4
164 movd [r0 + 28], m4
165
166 ; mode 4 -> dest + 32
167
168 pmaddubsw m5, m1, [r3 + 21 * 16]
169 pmulhrsw m5, m2
170 packuswb m5, m5
171 movd [r0 + 32], m5
172
173 pmaddubsw m5, m0, [r3 + 10 * 16]
174 pmulhrsw m5, m2
175 packuswb m5, m5
176 movd [r0 + 36], m5
177
178 pmaddubsw m5, m0, [r3 + 31 * 16]
179 pmulhrsw m5, m2
180 packuswb m5, m5
181 movd [r0 + 40], m5
182
183 pmaddubsw m4, m3, m7 ; m7 = weights row 20 loaded in the mode 3 section
184 pmulhrsw m4, m2
185 packuswb m4, m4
186 movd [r0 + 44], m4
187
188 ; mode 5 -> dest + 48
189
190 pmaddubsw m5, m1, [r3 + 17 * 16]
191 pmulhrsw m5, m2
192 packuswb m5, m5
193 movd [r0 + 48], m5
194
195 pmaddubsw m5, m0, [r3 + 2 * 16]
196 pmulhrsw m5, m2
197 packuswb m5, m5
198 movd [r0 + 52], m5
199
200 pmaddubsw m5, m0, [r3 + 19 * 16]
201 pmulhrsw m5, m2
202 packuswb m5, m5
203 movd [r0 + 56], m5
204
205 pmaddubsw m4, m3, [r3 + 4 * 16]
206 pmulhrsw m4, m2
207 packuswb m4, m4
208 movd [r0 + 60], m4
209
210 ; mode 6 -> dest + 64 (row 3 at dest + 76 was already stored in the mode 3 section)
211
212 pmaddubsw m5, m1, [r3 + 13 * 16]
213 pmulhrsw m5, m2
214 packuswb m5, m5
215 movd [r0 + 64], m5
216
217 movd [r0 + 68], m6 ; row 1 = m6, the mode 3 row 0 result (weights row 26 on m1)
218
219 pmaddubsw m5, m0, [r3 + 7 * 16]
220 pmulhrsw m5, m2
221 packuswb m5, m5
222 movd [r0 + 72], m5
223
224 ; mode 7 -> dest + 80
225
226 pmaddubsw m5, m1, [r3 + 9 * 16]
227 pmulhrsw m5, m2
228 packuswb m5, m5
229 movd [r0 + 80], m5
230
231 pmaddubsw m5, m1, [r3 + 18 * 16]
232 pmulhrsw m5, m2
233 packuswb m5, m5
234 movd [r0 + 84], m5
235
236 pmaddubsw m5, m1, [r3 + 27 * 16]
237 pmulhrsw m5, m2
238 packuswb m5, m5
239 movd [r0 + 88], m5
240
241 pmaddubsw m5, m0, [r3 + 4 * 16]
242 pmulhrsw m5, m2
243 packuswb m5, m5
244 movd [r0 + 92], m5
245
246 ; mode 8 -> dest + 96 (all four rows use the unshifted pair vector m1)
247
248 pmaddubsw m5, m1, [r3 + 5 * 16]
249 pmulhrsw m5, m2
250 packuswb m5, m5
251 movd [r0 + 96], m5
252
253 pmaddubsw m5, m1, [r3 + 10 * 16]
254 pmulhrsw m5, m2
255 packuswb m5, m5
256 movd [r0 + 100], m5
257
258 pmaddubsw m5, m1, [r3 + 15 * 16]
259 pmulhrsw m5, m2
260 packuswb m5, m5
261 movd [r0 + 104], m5
262
263 pmaddubsw m5, m1, [r3 + 20 * 16]
264 pmulhrsw m5, m2
265 packuswb m5, m5
266 movd [r0 + 108], m5
267
268 ; mode 9 -> dest + 112 (all four rows use the unshifted pair vector m1)
269
270 pmaddubsw m5, m1, [r3 + 2 * 16]
271 pmulhrsw m5, m2
272 packuswb m5, m5
273 movd [r0 + 112], m5
274
275 pmaddubsw m5, m1, [r3 + 4 * 16]
276 pmulhrsw m5, m2
277 packuswb m5, m5
278 movd [r0 + 116], m5
279
280 pmaddubsw m5, m1, [r3 + 6 * 16]
281 pmulhrsw m5, m2
282 packuswb m5, m5
283 movd [r0 + 120], m5
284
285 pmaddubsw m5, m1, [r3 + 8 * 16]
286 pmulhrsw m5, m2
287 packuswb m5, m5
288 movd [r0 + 124], m5
289
290 ; mode 10 -> dest + 128: copy refPix[9..12] into all four rows, then patch byte 0 of each row
291
292 movd m3, [r1 + 9]
293 pshufd m4, m3, 0 ; replicate the 4 loaded bytes into every dword
294 movu [r0 + 128], m4
295
296 pxor m5, m5 ; m5 = 0: zero source for unpacking and an all-zero pshufb mask
297 movd m7, [r1 + 1]
298 pshufd m4, m7, 0
299 punpcklbw m4, m5 ; words = refPix[1..4]
300
301 pinsrb m7, [r1], 0
302 pshufb m6, m7, m5 ; broadcast refPix[0]
303 punpcklbw m6, m5
304
305 psubw m4, m6
306 psraw m4, 1 ; (refPix[1 + i] - refPix[0]) >> 1, arithmetic shift
307
308 pshufb m6, m3, m5 ; broadcast refPix[9]
309 punpcklbw m6, m5
310
311 paddw m4, m6
312 packuswb m4, m5 ; saturate to unsigned bytes
313
314 pextrb [r0 + 128], m4, 0
315 pextrb [r0 + 132], m4, 1
316 pextrb [r0 + 136], m4, 2
317 pextrb [r0 + 140], m4, 3
318
319 ; mode 11 -> dest + 144
320
321 pslldq m1, m1, 2
322 pinsrb m1, [r1], 0
323 pinsrb m1, [r1 + 9], 1 ; m1 = pair (refPix[0],refPix[9]) followed by the previous pairs
324
325 pmaddubsw m3, m1, [r3 + 30 * 16]
326 pmulhrsw m3, m2
327 packuswb m3, m3
328 movd [r0 + 144], m3
329
330 pmaddubsw m3, m1, [r3 + 28 * 16]
331 pmulhrsw m3, m2
332 packuswb m3, m3
333 movd [r0 + 148], m3
334
335 pmaddubsw m3, m1, [r3 + 26 * 16]
336 pmulhrsw m3, m2
337 packuswb m3, m3
338 movd [r0 + 152], m3
339
340 pmaddubsw m3, m1, [r3 + 24 * 16]
341 pmulhrsw m3, m2
342 packuswb m3, m3
343 movd [r0 + 156], m3
344
345 ; mode 12 -> dest + 160
346
347 pmaddubsw m3, m1, [r3 + 27 * 16]
348 pmulhrsw m3, m2
349 packuswb m3, m3
350 movd [r0 + 160], m3
351
352 pmaddubsw m3, m1, [r3 + 22 * 16]
353 pmulhrsw m3, m2
354 packuswb m3, m3
355 movd [r0 + 164], m3
356
357 pmaddubsw m3, m1, [r3 + 17 * 16]
358 pmulhrsw m3, m2
359 packuswb m3, m3
360 movd [r0 + 168], m3
361
362 pmaddubsw m3, m1, [r3 + 12 * 16]
363 pmulhrsw m3, m2
364 packuswb m3, m3
365 movd [r0 + 172], m3
366
367 ; mode 13 -> dest + 176
368
369 pmaddubsw m3, m1, [r3 + 23 * 16]
370 pmulhrsw m3, m2
371 packuswb m3, m3
372 movd [r0 + 176], m3
373
374 pmaddubsw m3, m1, [r3 + 14 * 16]
375 pmulhrsw m3, m2
376 packuswb m3, m3
377 movd [r0 + 180], m3
378
379 pmaddubsw m3, m1, [r3 + 5 * 16]
380 pmulhrsw m3, m2
381 packuswb m3, m3
382 movd [r0 + 184], m3
383
384 pslldq m5, m1, 2
385 pinsrb m5, [r1 + 0], 1
386 pinsrb m5, [r1 + 4], 0 ; m5 = pair (refPix[4],refPix[0]) followed by the pairs of m1
387
388 pmaddubsw m4, m5, [r3 + 28 * 16]
389 pmulhrsw m4, m2
390 packuswb m4, m4
391 movd [r0 + 188], m4
392
393 ; mode 14 -> dest + 192
394
395 pmaddubsw m4, m1, [r3 + 19 * 16]
396 pmulhrsw m4, m2
397 packuswb m4, m4
398 movd [r0 + 192], m4
399
400 pmaddubsw m7, m1, [r3 + 6 * 16]
401 pmulhrsw m7, m2
402 packuswb m7, m7
403 movd [r0 + 196], m7 ; m7 is kept: it is stored again as mode 17 row 0
404
405 pinsrb m5, [r1 + 2], 0 ; leading pair of m5 becomes (refPix[2],refPix[0])
406
407 pmaddubsw m4, m5, [r3 + 25 * 16]
408 pmulhrsw m4, m2
409 packuswb m4, m4
410 movd [r0 + 200], m4
411
412 pmaddubsw m4, m5, [r3 + 12 * 16]
413 pmulhrsw m4, m2
414 packuswb m4, m4
415 movd [r0 + 204], m4
416
417 ; mode 15 -> dest + 208
418
419 pmaddubsw m4, m1, [r3 + 15 * 16]
420 pmulhrsw m4, m2
421 packuswb m4, m4
422 movd [r0 + 208], m4
423
424 pmaddubsw m4, m5, [r3 + 30 * 16]
425 pmulhrsw m4, m2
426 packuswb m4, m4
427 movd [r0 + 212], m4
428
429 pmaddubsw m4, m5, [r3 + 13 * 16]
430 pmulhrsw m4, m2
431 packuswb m4, m4
432 movd [r0 + 216], m4
433
434 pslldq m4, m5, 2
435 pinsrb m4, [r1 + 2], 1
436 pinsrb m4, [r1 + 4], 0 ; m4 = pair (refPix[4],refPix[2]) followed by the pairs of m5
437
438 pmaddubsw m6, m4, [r3 + 28 * 16]
439 pmulhrsw m6, m2
440 packuswb m6, m6
441 movd [r0 + 220], m6
442
443 ; mode 16 -> dest + 224
444
445 pmaddubsw m6, m1, [r3 + 11 * 16]
446 pmulhrsw m6, m2
447 packuswb m6, m6
448 movd [r0 + 224], m6
449
450 pmaddubsw m6, m5, [r3 + 22 * 16]
451 pmulhrsw m6, m2
452 packuswb m6, m6
453 movd [r0 + 228], m6
454
455 pmaddubsw m6, m5, [r3 + 1 * 16]
456 pmulhrsw m6, m2
457 packuswb m6, m6
458 movd [r0 + 232], m6
459
460 pinsrb m4, [r1 + 3], 0 ; leading pair of m4 becomes (refPix[3],refPix[2])
461
462 pmaddubsw m4, [r3 + 12 * 16]
463 pmulhrsw m4, m2
464 packuswb m4, m4
465 movd [r0 + 236], m4
466
467 ; mode 17 -> dest + 240
468
469 movd [r0 + 240], m7 ; row 0 = m7, the mode 14 row 1 result (weights row 6 on m1)
470
471 pslldq m1, 2
472 pinsrb m1, [r1 + 1], 0
473 pinsrb m1, [r1 + 0], 1 ; each step below prepends one more pair to m1
474
475 pmaddubsw m3, m1, [r3 + 12 * 16]
476 pmulhrsw m3, m2
477 packuswb m3, m3
478 movd [r0 + 244], m3
479
480 pslldq m1, 2
481 pinsrb m1, [r1 + 1], 1
482 pinsrb m1, [r1 + 2], 0
483
484 pmaddubsw m3, m1, [r3 + 18 * 16]
485 pmulhrsw m3, m2
486 packuswb m3, m3
487 movd [r0 + 248], m3
488
489 pslldq m1, 2
490 pinsrb m1, [r1 + 2], 1
491 pinsrb m1, [r1 + 4], 0
492
493 pmaddubsw m1, [r3 + 24 * 16]
494 pmulhrsw m1, m2
495 packuswb m1, m1
496 movd [r0 + 252], m1
497
498 ; mode 18 -> dest + 256: row 0 = refPix[0..3]; each later row shifts in one byte from refPix[9], [10], [11]
499
500 movh m1, [r1]
501 movd [r0 + 256], m1
502
503 pslldq m3, m1, 1
504 pinsrb m3, [r1 + 9], 0
505 movd [r0 + 260], m3
506
507 pslldq m4, m3, 1
508 pinsrb m4, [r1 + 10], 0
509 movd [r0 + 264], m4
510
511 pslldq m4, 1
512 pinsrb m4, [r1 + 11], 0
513 movd [r0 + 268], m4
514
515 ; mode 19 -> dest + 272
516
517 palignr m3, m1, 1
518 punpcklbw m1, m3 ; m1 = adjacent pairs (refPix[0],refPix[1]), (refPix[1],refPix[2]), ...
519
520 pmaddubsw m7, m1, [r3 + 6 * 16]
521 pmulhrsw m7, m2
522 packuswb m7, m7
523 movd [r0 + 272], m7 ; m7 is kept: it is stored again as mode 22 row 1
524
525 pslldq m3, m1, 2
526 pinsrb m3, [r1], 1
527 pinsrb m3, [r1 + 9], 0 ; m3 = pair (refPix[9],refPix[0]) followed by the pairs of m1
528
529 pmaddubsw m4, m3, [r3 + 12 * 16]
530 pmulhrsw m4, m2
531 packuswb m4, m4
532 movd [r0 + 276], m4
533
534 pslldq m4, m3, 2
535 pinsrb m4, [r1 + 9], 1
536 pinsrb m4, [r1 + 10], 0
537
538 pmaddubsw m5, m4, [r3 + 18 * 16]
539 pmulhrsw m5, m2
540 packuswb m5, m5
541 movd [r0 + 280], m5
542
543 pslldq m4, 2
544 pinsrb m4, [r1 + 10], 1
545 pinsrb m4, [r1 + 12], 0
546
547 pmaddubsw m4, [r3 + 24 * 16]
548 pmulhrsw m4, m2
549 packuswb m4, m4
550 movd [r0 + 284], m4
551
552 ; mode 20 -> dest + 288
553
554 pmaddubsw m4, m1, [r3 + 11 * 16]
555 pmulhrsw m4, m2
556 packuswb m4, m4
557 movd [r0 + 288], m4
558
559 pinsrb m3, [r1 + 10], 0 ; leading pair of m3 becomes (refPix[10],refPix[0])
560
561 pmaddubsw m4, m3, [r3 + 22 * 16]
562 pmulhrsw m4, m2
563 packuswb m4, m4
564 movd [r0 + 292], m4
565
566 pmaddubsw m4, m3, [r3 + 1 * 16]
567 pmulhrsw m4, m2
568 packuswb m4, m4
569 movd [r0 + 296], m4
570
571 pslldq m6, m3, 2
572 pinsrb m6, [r1 + 10], 1
573 pinsrb m6, [r1 + 11], 0 ; m6 = pair (refPix[11],refPix[10]) followed by the pairs of m3
574
575 pmaddubsw m5, m6, [r3 + 12 * 16]
576 pmulhrsw m5, m2
577 packuswb m5, m5
578 movd [r0 + 300], m5
579
580 ; mode 21 -> dest + 304
581
582 pmaddubsw m4, m1, [r3 + 15 * 16]
583 pmulhrsw m4, m2
584 packuswb m4, m4
585 movd [r0 + 304], m4
586
587 pmaddubsw m4, m3, [r3 + 30 * 16]
588 pmulhrsw m4, m2
589 packuswb m4, m4
590 movd [r0 + 308], m4
591
592 pmaddubsw m4, m3, [r3 + 13 * 16]
593 pmulhrsw m4, m2
594 packuswb m4, m4
595 movd [r0 + 312], m4
596
597 pinsrb m6, [r1 + 12], 0 ; leading pair of m6 becomes (refPix[12],refPix[10])
598
599 pmaddubsw m6, [r3 + 28 * 16]
600 pmulhrsw m6, m2
601 packuswb m6, m6
602 movd [r0 + 316], m6
603
604 ; mode 22 -> dest + 320
605
606 pmaddubsw m4, m1, [r3 + 19 * 16]
607 pmulhrsw m4, m2
608 packuswb m4, m4
609 movd [r0 + 320], m4
610
611 movd [r0 + 324], m7 ; row 1 = m7, the mode 19 row 0 result (weights row 6 on m1)
612
613 pmaddubsw m4, m3, [r3 + 25 * 16]
614 pmulhrsw m4, m2
615 packuswb m4, m4
616 movd [r0 + 328], m4
617
618 pmaddubsw m4, m3, [r3 + 12 * 16]
619 pmulhrsw m4, m2
620 packuswb m4, m4
621 movd [r0 + 332], m4
622
623 ; mode 23 -> dest + 336
624
625 pmaddubsw m4, m1, [r3 + 23 * 16]
626 pmulhrsw m4, m2
627 packuswb m4, m4
628 movd [r0 + 336], m4
629
630 pmaddubsw m4, m1, [r3 + 14 * 16]
631 pmulhrsw m4, m2
632 packuswb m4, m4
633 movd [r0 + 340], m4
634
635 pmaddubsw m4, m1, [r3 + 5 * 16]
636 pmulhrsw m4, m2
637 packuswb m4, m4
638 movd [r0 + 344], m4
639
640 pinsrb m3, [r1 + 12], 0 ; leading pair of m3 becomes (refPix[12],refPix[0])
641
642 pmaddubsw m3, [r3 + 28 * 16]
643 pmulhrsw m3, m2
644 packuswb m3, m3
645 movd [r0 + 348], m3
646
647 ; mode 24 -> dest + 352 (all four rows use the pair vector m1)
648
649 pmaddubsw m3, m1, [r3 + 27 * 16]
650 pmulhrsw m3, m2
651 packuswb m3, m3
652 movd [r0 + 352], m3
653
654 pmaddubsw m3, m1, [r3 + 22 * 16]
655 pmulhrsw m3, m2
656 packuswb m3, m3
657 movd [r0 + 356], m3
658
659 pmaddubsw m3, m1, [r3 + 17 * 16]
660 pmulhrsw m3, m2
661 packuswb m3, m3
662 movd [r0 + 360], m3
663
664 pmaddubsw m3, m1, [r3 + 12 * 16]
665 pmulhrsw m3, m2
666 packuswb m3, m3
667 movd [r0 + 364], m3
668
669 ; mode 25 -> dest + 368 (all four rows use the pair vector m1; the last one consumes m1)
670
671 pmaddubsw m3, m1, [r3 + 30 * 16]
672 pmulhrsw m3, m2
673 packuswb m3, m3
674 movd [r0 + 368], m3
675
676 pmaddubsw m3, m1, [r3 + 28 * 16]
677 pmulhrsw m3, m2
678 packuswb m3, m3
679 movd [r0 + 372], m3
680
681 pmaddubsw m3, m1, [r3 + 26 * 16]
682 pmulhrsw m3, m2
683 packuswb m3, m3
684 movd [r0 + 376], m3
685
686 pmaddubsw m1, [r3 + 24 * 16]
687 pmulhrsw m1, m2
688 packuswb m1, m1
689 movd [r0 + 380], m1
690
691 ; mode 26 -> dest + 384: copy refPix[1..4] into all four rows, then patch byte 0 of each row
692
693 movh m1, [r1 + 1]
694 pshufd m3, m1, 0 ; replicate the low 4 bytes into every dword
695 movu [r0 + 384], m3
696
697 pxor m4, m4 ; m4 = 0: zero source for unpacking and an all-zero pshufb mask
698 movd m5, [r1 + 9]
699 pshufd m5, m5, 0
700 punpcklbw m5, m4 ; words = refPix[9..12]
701
702 pinsrb m6, [r1], 0 ; only byte 0 of m6 matters: the pshufb below broadcasts it
703 pshufb m6, m4
704 punpcklbw m6, m4
705
706 psubw m5, m6
707 psraw m5, 1 ; (refPix[9 + i] - refPix[0]) >> 1, arithmetic shift
708
709 pshufb m6, m1, m4 ; broadcast refPix[1]
710 punpcklbw m6, m4
711
712 paddw m5, m6
713 packuswb m5, m4 ; saturate to unsigned bytes
714
715 pextrb [r0 + 384], m5, 0
716 pextrb [r0 + 388], m5, 1
717 pextrb [r0 + 392], m5, 2
718 pextrb [r0 + 396], m5, 3
719
720 ; mode 27 -> dest + 400
721
722 palignr m3, m1, 1
723 punpcklbw m1, m3 ; m1 = adjacent pairs (refPix[1],refPix[2]), (refPix[2],refPix[3]), ...
724
725 pmaddubsw m3, m1, [r3 + 2 * 16]
726 pmulhrsw m3, m2
727 packuswb m3, m3
728 movd [r0 + 400], m3
729
730 pmaddubsw m3, m1, [r3 + 4 * 16]
731 pmulhrsw m3, m2
732 packuswb m3, m3
733 movd [r0 + 404], m3
734
735 pmaddubsw m3, m1, [r3 + 6 * 16]
736 pmulhrsw m3, m2
737 packuswb m3, m3
738 movd [r0 + 408], m3
739
740 pmaddubsw m3, m1, [r3 + 8 * 16]
741 pmulhrsw m3, m2
742 packuswb m3, m3
743 movd [r0 + 412], m3
744
745 ; mode 28 -> dest + 416
746
747 pmaddubsw m3, m1, [r3 + 5 * 16]
748 pmulhrsw m3, m2
749 packuswb m3, m3
750 movd [r0 + 416], m3
751
752 pmaddubsw m3, m1, [r3 + 10 * 16]
753 pmulhrsw m3, m2
754 packuswb m3, m3
755 movd [r0 + 420], m3
756
757 pmaddubsw m3, m1, [r3 + 15 * 16]
758 pmulhrsw m3, m2
759 packuswb m3, m3
760 movd [r0 + 424], m3
761
762 pmaddubsw m3, m1, [r3 + 20 * 16]
763 pmulhrsw m3, m2
764 packuswb m3, m3
765 movd [r0 + 428], m3
766
767 ; mode 29 -> dest + 432
768
769 pmaddubsw m3, m1, [r3 + 9 * 16]
770 pmulhrsw m3, m2
771 packuswb m3, m3
772 movd [r0 + 432], m3
773
774 pmaddubsw m3, m1, [r3 + 18 * 16]
775 pmulhrsw m3, m2
776 packuswb m3, m3
777 movd [r0 + 436], m3
778
779 pmaddubsw m3, m1, [r3 + 27 * 16]
780 pmulhrsw m3, m2
781 packuswb m3, m3
782 movd [r0 + 440], m3
783
784 palignr m3, m1, 2 ; m3 = pair vector starting one pixel later: (refPix[2],refPix[3]), ...
785
786 pmaddubsw m4, m3, [r3 + 4 * 16]
787 pmulhrsw m4, m2
788 packuswb m4, m4
789 movd [r0 + 444], m4
790
791 ; mode 30 -> dest + 448
792
793 pmaddubsw m4, m1, [r3 + 13 * 16]
794 pmulhrsw m4, m2
795 packuswb m4, m4
796 movd [r0 + 448], m4
797
798 pmaddubsw m7, m1, [r3 + 26 * 16]
799 pmulhrsw m7, m2
800 packuswb m7, m7
801 movd [r0 + 452], m7 ; m7 is kept: it is stored again as mode 33 row 0
802
803 pmaddubsw m5, m3, [r3 + 7 * 16]
804 pmulhrsw m5, m2
805 packuswb m5, m5
806 movd [r0 + 456], m5
807
808 pmaddubsw m6, m3, [r3 + 20 * 16]
809 pmulhrsw m6, m2
810 packuswb m6, m6
811 movd [r0 + 460], m6 ; m6 is kept: it is stored again as mode 33 row 1
812
813 ; mode 31 -> dest + 464
814
815 pmaddubsw m4, m1, [r3 + 17 * 16]
816 pmulhrsw m4, m2
817 packuswb m4, m4
818 movd [r0 + 464], m4
819
820 pmaddubsw m5, m3, [r3 + 2 * 16]
821 pmulhrsw m5, m2
822 packuswb m5, m5
823 movd [r0 + 468], m5
824
825 pmaddubsw m5, m3, [r3 + 19 * 16]
826 pmulhrsw m5, m2
827 packuswb m5, m5
828 movd [r0 + 472], m5
829
830 palignr m4, m3, 2 ; m4 = pair vector starting two pixels later: (refPix[3],refPix[4]), ...
831
832 pmaddubsw m5, m4, [r3 + 4 * 16]
833 pmulhrsw m5, m2
834 packuswb m5, m5
835 movd [r0 + 476], m5
836
837 ; mode 32 -> dest + 480
838
839 pmaddubsw m5, m1, [r3 + 21 * 16]
840 pmulhrsw m5, m2
841 packuswb m5, m5
842 movd [r0 + 480], m5
843
844 pmaddubsw m5, m3, [r3 + 10 * 16]
845 pmulhrsw m5, m2
846 packuswb m5, m5
847 movd [r0 + 484], m5
848
849 pmaddubsw m5, m3, [r3 + 31 * 16]
850 pmulhrsw m5, m2
851 packuswb m5, m5
852 movd [r0 + 488], m5
853
854 pmaddubsw m5, m4, [r3 + 20 * 16]
855 pmulhrsw m5, m2
856 packuswb m5, m5
857 movd [r0 + 492], m5
858
859 ; mode 33 -> dest + 496
860
861 movd [r0 + 496], m7 ; row 0 = m7, the mode 30 row 1 result (weights row 26 on m1)
862
863 movd [r0 + 500], m6 ; row 1 = m6, the mode 30 row 3 result (weights row 20 on m3)
864
865 pmaddubsw m5, m4, [r3 + 14 * 16]
866 pmulhrsw m5, m2
867 packuswb m5, m5
868 movd [r0 + 504], m5
869
870 psrldq m4, 2 ; advance the pair vector by one more pixel
871
872 pmaddubsw m4, [r3 + 8 * 16]
873 pmulhrsw m4, m2
874 packuswb m4, m4
875 movd [r0 + 508], m4
876
877 ; mode 34 -> dest + 512: rows are the 4-byte windows starting at refPix[2], [3], [4], [5]
878
879 movh m7, [r1 + 2]
880 movd [r0 + 512], m7
881
882 psrldq m7, 1
883 movd [r0 + 516], m7
884
885 psrldq m7, 1
886 movd [r0 + 520], m7
887
888 psrldq m7, 1
889 movd [r0 + 524], m7
890
891 RET
892
893 ;------------------------------------------------------------------------------
894 ; void all_angs_pred_8x8(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
895 ;------------------------------------------------------------------------------
896 INIT_XMM sse4
897 cglobal all_angs_pred_8x8, 3,4,8
898 ; mode 2
899
900 movu m0, [r2 + 18]
901 palignr m1, m0, 1
902 punpcklqdq m2, m0, m1
903 movu [r0], m2
904
905 palignr m1, m0, 2
906 palignr m2, m0, 3
907 punpcklqdq m1, m2
908 movu [r0 + 16], m1
909
910 palignr m1, m0, 4
911 palignr m2, m0, 5
912 punpcklqdq m1, m2
913 movu [r0 + 32], m1
914
915 palignr m1, m0, 6
916 palignr m2, m0, 7
917 punpcklqdq m1, m2
918 movu [r0 + 48], m1
919
920 ; mode 3 [row 0, 1]
921
922 mova m7, [pw_1024]
923 lea r3, [ang_table]
924
925 movu m0, [r1 + 17]
926
927 palignr m1, m0, 1
928 palignr m2, m0, 2
929
930 punpcklbw m3, m0, m1
931 pmaddubsw m4, m3, [r3 + 26 * 16]
932 pmulhrsw m4, m7
933
934 punpcklbw m1, m2
935 pmaddubsw m5, m1, [r3 + 20 * 16]
936 pmulhrsw m5, m7
937
938 packuswb m4, m5
939
940 movu [r0 + 64], m4
941
942 ; mode 6 [row 1]
943
944 movh [r0 + 264], m4
945
946 ; mode 6 [row 3]
947
948 movhps [r0 + 280], m4
949
950 ; mode 4 [row 0, 1]
951
952 pmaddubsw m4, m3, [r3 + 21 * 16]
953 pmulhrsw m4, m7
954
955 pmaddubsw m5, m1, [r3 + 10 * 16]
956 pmulhrsw m5, m7
957
958 packuswb m4, m5
959 movu [r0 + 128], m4
960
961 ; mode 5 [row 0, 1]
962
963 pmaddubsw m4, m3, [r3 + 17 * 16]
964 pmulhrsw m4, m7
965
966 pmaddubsw m5, m1, [r3 + 2 * 16]
967 pmulhrsw m5, m7
968
969 packuswb m4, m5
970 movu [r0 + 192], m4
971
972 ; mode 6 [row 0]
973
974 pmaddubsw m4, m3, [r3 + 13 * 16]
975 pmulhrsw m4, m7
976
977 pxor m5, m5
978
979 packuswb m4, m5
980 movh [r0 + 256], m4
981
982 ; mode 7 [row 0, 1]
983
984 pmaddubsw m4, m3, [r3 + 9 * 16]
985 pmulhrsw m4, m7
986
987 pmaddubsw m5, m3, [r3 + 18 * 16]
988 pmulhrsw m5, m7
989
990 packuswb m4, m5
991 movu [r0 + 320], m4
992
993 ; mode 8 [row 0, 1]
994
995 pmaddubsw m4, m3, [r3 + 5 * 16]
996 pmulhrsw m4, m7
997
998 pmaddubsw m5, m3, [r3 + 10 * 16]
999 pmulhrsw m5, m7
1000
1001 packuswb m4, m5
1002 movu [r0 + 384], m4
1003
1004 ; mode 8 [row 2, 3]
1005
1006 pmaddubsw m4, m3, [r3 + 15 * 16]
1007 pmulhrsw m4, m7
1008
1009 pmaddubsw m5, m3, [r3 + 20 * 16]
1010 pmulhrsw m5, m7
1011
1012 packuswb m4, m5
1013 movu [r0 + 400], m4
1014
1015 ; mode 8 [row 4, 5]
1016
1017 pmaddubsw m4, m3, [r3 + 25 * 16]
1018 pmulhrsw m4, m7
1019
1020 pmaddubsw m5, m3, [r3 + 30 * 16]
1021 pmulhrsw m5, m7
1022
1023 packuswb m4, m5
1024 movu [r0 + 416], m4
1025
1026 ; mode 8 [row 6, 7]
1027
1028 pmaddubsw m4, m1, [r3 + 3 * 16]
1029 pmulhrsw m4, m7
1030
1031 pmaddubsw m5, m1, [r3 + 8 * 16]
1032 pmulhrsw m5, m7
1033
1034 packuswb m4, m5
1035 movu [r0 + 432], m4
1036
1037 ; mode 9 [row 0, 1]
1038
1039 pmaddubsw m4, m3, [r3 + 2 * 16]
1040 pmulhrsw m4, m7
1041
1042 pmaddubsw m5, m3, [r3 + 4 * 16]
1043 pmulhrsw m5, m7
1044
1045 packuswb m4, m5
1046 movu [r0 + 448], m4
1047
1048 ; mode 9 [row 2, 3]
1049
1050 pmaddubsw m4, m3, [r3 + 6 * 16]
1051 pmulhrsw m4, m7
1052
1053 pmaddubsw m5, m3, [r3 + 8 * 16]
1054 pmulhrsw m5, m7
1055
1056 packuswb m4, m5
1057 movu [r0 + 464], m4
1058
1059 ; mode 9 [row 4, 5]
1060
1061 pmaddubsw m4, m3, [r3 + 10 * 16]
1062 pmulhrsw m4, m7
1063
1064 pmaddubsw m5, m3, [r3 + 12 * 16]
1065 pmulhrsw m5, m7
1066
1067 packuswb m4, m5
1068 movu [r0 + 480], m4
1069
1070 ; mode 9 [row 6, 7]
1071
1072 pmaddubsw m4, m3, [r3 + 14 * 16]
1073 pmulhrsw m4, m7
1074
1075 pmaddubsw m5, m3, [r3 + 16 * 16]
1076 pmulhrsw m5, m7
1077
1078 packuswb m4, m5
1079 movu [r0 + 496], m4
1080
1081 ; mode 7 [row 2, 3]
1082
1083 pmaddubsw m4, m3, [r3 + 27 * 16]
1084 pmulhrsw m4, m7
1085
1086 pmaddubsw m5, m1, [r3 + 4 * 16]
1087 pmulhrsw m5, m7
1088
1089 packuswb m4, m5
1090 movu [r0 + 336], m4
1091
1092 ; mode 7 [row 4, 5]
1093
1094 pmaddubsw m4, m1, [r3 + 13 * 16]
1095 pmulhrsw m4, m7
1096
1097 pmaddubsw m5, m1, [r3 + 22 * 16]
1098 pmulhrsw m5, m7
1099
1100 packuswb m4, m5
1101 movu [r0 + 352], m4
1102
1103 ; mode 6 [row 2]
1104
1105 pmaddubsw m4, m1, [r3 + 7 * 16]
1106 pmulhrsw m4, m7
1107
1108 pxor m5, m5
1109
1110 packuswb m4, m5
1111 movh [r0 + 272], m4
1112
1113 ; mode 3 [row 2, 3]
1114
1115 palignr m1, m0, 3
1116 palignr m3, m0, 4
1117
1118 punpcklbw m2, m1
1119 pmaddubsw m5, m2, [r3 + 14 * 16]
1120 pmulhrsw m5, m7
1121
1122 punpcklbw m1, m3
1123 pmaddubsw m6, m1, [r3 + 8 * 16]
1124 pmulhrsw m6, m7
1125
1126 packuswb m5, m6
1127 movu [r0 + 80], m5
1128
1129 ; mode 6 [row 7]
1130
1131 movhps [r0 + 312], m5
1132
1133 ; mode 6 [row 5]
1134
1135 movh [r0 + 296], m5
1136
1137 ; mode 4 [calculate and store row 4, 5]
1138
1139 pmaddubsw m4, m1, [r3 + 9 * 16]
1140 pmulhrsw m4, m7
1141
1142 pmaddubsw m5, m1, [r3 + 30 * 16]
1143 pmulhrsw m5, m7
1144
1145 packuswb m4, m5
1146 movu [r0 + 160], m4
1147
1148 ; mode 5 [row 4, 5]
1149
1150 pmaddubsw m4, m2, [r3 + 21 * 16]
1151 pmulhrsw m4, m7
1152
1153 pmaddubsw m5, m1, [r3 + 6 * 16]
1154 pmulhrsw m5, m7
1155
1156 packuswb m4, m5
1157 movu [r0 + 224], m4
1158
1159 ; mode 6 [row 4]
1160
1161 pmaddubsw m5, m2, [r3 + 1 * 16]
1162 pmulhrsw m5, m7
1163
1164 pxor m6, m6
1165
1166 packuswb m5, m6
1167 movh [r0 + 288], m5
1168
1169 ; mode 6 [row 6]
1170
1171 pmaddubsw m5, m2, [r3 + 27 * 16]
1172 pmulhrsw m5, m7
1173
1174 pxor m6, m6
1175
1176 packuswb m5, m6
1177 movh [r0 + 304], m5
1178
1179 ; mode 5 [calculate row 6]
1180
1181 pmaddubsw m6, m1, [r3 + 23 * 16]
1182 pmulhrsw m6, m7
1183
1184 ; mode 3 [row 4, 5]
1185
1186 palignr m1, m0, 5
1187
1188 punpcklbw m3, m1
1189 pmaddubsw m4, m3, [r3 + 2 * 16]
1190 pmulhrsw m4, m7
1191
1192 pmaddubsw m5, m3, [r3 + 28 * 16]
1193 pmulhrsw m5, m7
1194
1195 packuswb m4, m5
1196 movu [r0 + 96], m4
1197
1198 ; mode 4 [calculate row 6]
1199
1200 pmaddubsw m5, m3, [r3 + 19 * 16]
1201 pmulhrsw m5, m7
1202
1203 ; mode 5 [calculate row 7, store row 6 and 7]
1204
1205 pmaddubsw m4, m3, [r3 + 8 * 16]
1206 pmulhrsw m4, m7
1207
1208 packuswb m6, m4
1209 movu [r0 + 240], m6
1210
1211 ; mode 3 [row 6, 7]
1212
1213 palignr m2, m0, 6
1214 palignr m3, m0, 7
1215
1216 punpcklbw m1, m2
1217 pmaddubsw m4, m1, [r3 + 22 * 16]
1218 pmulhrsw m4, m7
1219
1220 punpcklbw m2, m3
1221 pmaddubsw m2, [r3 + 16 * 16]
1222 pmulhrsw m2, m7
1223
1224 packuswb m4, m2
1225 movu [r0 + 112], m4
1226
1227 ; mode 4 [calculate row 7]
1228
1229 pmaddubsw m2, m1, [r3 + 8 * 16]
1230 pmulhrsw m2, m7
1231
1232 ; mode 4 [store row 6 and 7]
1233
1234 packuswb m5, m2
1235 movu [r0 + 176], m5
1236
1237 ; mode 4 [row 2, 3]
1238
1239 palignr m1, m0, 1
1240 palignr m2, m0, 2
1241 palignr m3, m0, 3
1242
1243 punpcklbw m1, m2
1244 pmaddubsw m4, m1, [r3 + 31 * 16]
1245 pmulhrsw m4, m7
1246
1247 punpcklbw m2, m3
1248 pmaddubsw m5, m2, [r3 + 20 * 16]
1249 pmulhrsw m5, m7
1250
1251 packuswb m4, m5
1252 movu [r0 + 144], m4
1253
1254 ; mode 5 [row 2, 3]
1255
1256 pmaddubsw m4, m1, [r3 + 19 * 16]
1257 pmulhrsw m4, m7
1258
1259 pmaddubsw m5, m2, [r3 + 4 * 16]
1260 pmulhrsw m5, m7
1261
1262 packuswb m4, m5
1263 movu [r0 + 208], m4
1264
1265 ; mode 7 [row 6, 7]
1266
1267 pmaddubsw m4, m1, [r3 + 31 * 16]
1268 pmulhrsw m4, m7
1269
1270 pmaddubsw m5, m2, [r3 + 8 * 16]
1271 pmulhrsw m5, m7
1272
1273 packuswb m4, m5
1274 movu [r0 + 368], m4
1275
1276 ; mode 10
1277
1278 pshufb m1, m0, [tab_Si]
1279 movu [r0 + 512], m1
1280 movu [r0 + 528], m1
1281 movu [r0 + 544], m1
1282 movu [r0 + 560], m1
1283
1284 pxor m0, m0
1285
1286 pshufb m1, m1, m0
1287 punpcklbw m1, m0
1288
1289 movu m2, [r1]
1290
1291 pshufb m3, m2, m0
1292 punpcklbw m3, m0
1293
1294 psrldq m4, m2, 1
1295 punpcklbw m4, m0
1296
1297 movu m2, [r1 + 9]
1298 punpcklbw m2, m0
1299
1300 psubw m4, m3
1301 psubw m2, m3
1302
1303 psraw m4, 1
1304 psraw m2, 1
1305
1306 paddw m4, m1
1307 paddw m2, m1
1308
1309 packuswb m4, m2
1310
1311 pextrb [r0 + 512], m4, 0
1312 pextrb [r0 + 520], m4, 1
1313 pextrb [r0 + 528], m4, 2
1314 pextrb [r0 + 536], m4, 3
1315 pextrb [r0 + 544], m4, 4
1316 pextrb [r0 + 552], m4, 5
1317 pextrb [r0 + 560], m4, 6
1318 pextrb [r0 + 568], m4, 7
1319
1320 ; mode 11 [row 0, 1]
1321
1322 movu m0, [r1 + 16]
1323 pinsrb m0, [r1], 0
1324 palignr m1, m0, 1
1325 punpcklbw m2, m0, m1
1326
1327 pmaddubsw m3, m2, [r3 + 30 * 16]
1328 pmulhrsw m3, m7
1329
1330 pmaddubsw m4, m2, [r3 + 28 * 16]
1331 pmulhrsw m4, m7
1332
1333 packuswb m3, m4
1334 movu [r0 + 576], m3
1335
1336 ; mode 11 [row 2, 3]
1337
1338 pmaddubsw m3, m2, [r3 + 26 * 16]
1339 pmulhrsw m3, m7
1340
1341 pmaddubsw m4, m2, [r3 + 24 * 16]
1342 pmulhrsw m4, m7
1343
1344 packuswb m3, m4
1345 movu [r0 + 592], m3
1346
1347 ; mode 11 [row 4, 5]
1348
1349 pmaddubsw m3, m2, [r3 + 22 * 16]
1350 pmulhrsw m3, m7
1351
1352 pmaddubsw m4, m2, [r3 + 20 * 16]
1353 pmulhrsw m4, m7
1354
1355 packuswb m5, m3, m4
1356 movu [r0 + 608], m5
1357
1358 ; mode 12 [row 0, 1]
1359
1360 pmaddubsw m4, m2, [r3 + 27 * 16]
1361 pmulhrsw m4, m7
1362
1363 packuswb m4, m3
1364 movu [r0 + 640], m4
1365
1366 ; mode 11 [row 6, 7]
1367
1368 pmaddubsw m3, m2, [r3 + 18 * 16]
1369 pmulhrsw m3, m7
1370
1371 pmaddubsw m4, m2, [r3 + 16 * 16]
1372 pmulhrsw m4, m7
1373
1374 packuswb m3, m4
1375 movu [r0 + 624], m3
1376
1377 ; mode 12 [row 2, 3]
1378
1379 pmaddubsw m3, m2, [r3 + 17 * 16]
1380 pmulhrsw m3, m7
1381
1382 pmaddubsw m4, m2, [r3 + 12 * 16]
1383 pmulhrsw m4, m7
1384
1385 packuswb m3, m4
1386 movu [r0 + 656], m3
1387
1388 ; mode 12 [row 4, 5]
1389
1390 pmaddubsw m3, m2, [r3 + 7 * 16]
1391 pmulhrsw m3, m7
1392
1393 pmaddubsw m4, m2, [r3 + 2 * 16]
1394 pmulhrsw m4, m7
1395
1396 packuswb m3, m4
1397 movu [r0 + 672], m3
1398
1399 ; mode 12 [row 6, 7]
1400
1401 pslldq m3, m2, 2
1402 pinsrb m3, [r1 + 0], 1
1403 pinsrb m3, [r1 + 6], 0
1404
1405 pmaddubsw m4, m3, [r3 + 29 * 16]
1406 pmulhrsw m4, m7
1407
1408 pmaddubsw m5, m3, [r3 + 24 * 16]
1409 pmulhrsw m5, m7
1410
1411 packuswb m4, m5
1412 movu [r0 + 688], m4
1413
1414 ; mode 13 [row 0, 1]
1415
1416 pmaddubsw m4, m2, [r3 + 23 * 16]
1417 pmulhrsw m4, m7
1418
1419 pmaddubsw m5, m2, [r3 + 14 * 16]
1420 pmulhrsw m5, m7
1421
1422 packuswb m4, m5
1423 movu [r0 + 704], m4
1424
1425 ; mode 13 [row 2, 3]
1426
1427 pmaddubsw m4, m2, [r3 + 5 * 16]
1428 pmulhrsw m4, m7
1429
1430 pinsrb m3, [r1 + 4], 0
1431 pmaddubsw m5, m3, [r3 + 28 * 16]
1432 pmulhrsw m5, m7
1433
1434 packuswb m4, m5
1435 movu [r0 + 720], m4
1436
1437 ; mode 13 [row 4, 5]
1438
1439 pmaddubsw m4, m3, [r3 + 19 * 16]
1440 pmulhrsw m4, m7
1441
1442 pmaddubsw m5, m3, [r3 + 10 * 16]
1443 pmulhrsw m5, m7
1444
1445 packuswb m4, m5
1446 movu [r0 + 736], m4
1447
1448 ; mode 13 [row 6, 7]
1449
1450 pmaddubsw m4, m3, [r3 + 1 * 16]
1451 pmulhrsw m4, m7
1452
1453 pslldq m5, m3, 2
1454 pinsrb m5, [r1 + 4], 1
1455 pinsrb m5, [r1 + 7], 0
1456
1457 pmaddubsw m5, [r3 + 24 * 16]
1458 pmulhrsw m5, m7
1459
1460 packuswb m4, m5
1461 movu [r0 + 752], m4
1462
1463 ; mode 14 [row 0, 1]
1464
1465 pmaddubsw m4, m2, [r3 + 19 * 16]
1466 pmulhrsw m4, m7
1467
1468 pmaddubsw m5, m2, [r3 + 6 * 16]
1469 pmulhrsw m5, m7
1470
1471 packuswb m4, m5
1472 movu [r0 + 768], m4
1473
1474 ; mode 14 [row 2, 3]
1475
1476 pinsrb m3, [r1 + 2], 0
1477
1478 pmaddubsw m4, m3, [r3 + 25 * 16]
1479 pmulhrsw m4, m7
1480
1481 pmaddubsw m5, m3, [r3 + 12 * 16]
1482 pmulhrsw m5, m7
1483
1484 packuswb m4, m5
1485 movu [r0 + 784], m4
1486
1487 ; mode 14 [row 4, 5]
1488
1489 pslldq m1, m3, 2
1490 pinsrb m1, [r1 + 2], 1
1491 pinsrb m1, [r1 + 5], 0
1492
1493 pmaddubsw m4, m1, [r3 + 31 * 16]
1494 pmulhrsw m4, m7
1495
1496 pmaddubsw m5, m1, [r3 + 18 * 16]
1497 pmulhrsw m5, m7
1498
1499 packuswb m4, m5
1500 movu [r0 + 800], m4
1501
1502 ; mode 14 [row 6, 7]
1503
1504 pmaddubsw m4, m1, [r3 + 5 * 16]
1505 pmulhrsw m4, m7
1506
1507 pslldq m1, 2
1508 pinsrb m1, [r1 + 5], 1
1509 pinsrb m1, [r1 + 7], 0
1510
1511 pmaddubsw m5, m1, [r3 + 24 * 16]
1512 pmulhrsw m5, m7
1513
1514 packuswb m4, m5
1515 movu [r0 + 816], m4
1516
1517 ; mode 15 [row 0, 1]
1518
1519 pmaddubsw m4, m2, [r3 + 15 * 16]
1520 pmulhrsw m4, m7
1521
1522 pmaddubsw m5, m3, [r3 + 30 * 16]
1523 pmulhrsw m5, m7
1524
1525 packuswb m4, m5
1526 movu [r0 + 832], m4
1527
1528 ; mode 15 [row 2, 3]
1529
1530 pmaddubsw m4, m3, [r3 + 13 * 16]
1531 pmulhrsw m4, m7
1532
1533 pslldq m1, m3, 2
1534 pinsrb m1, [r1 + 2], 1
1535 pinsrb m1, [r1 + 4], 0
1536
1537 pmaddubsw m5, m1, [r3 + 28 * 16]
1538 pmulhrsw m5, m7
1539
1540 packuswb m4, m5
1541 movu [r0 + 848], m4
1542
1543 ; mode 15 [row 4, 5]
1544
1545 pmaddubsw m4, m1, [r3 + 11 * 16]
1546 pmulhrsw m4, m7
1547
1548 pslldq m1, 2
1549 pinsrb m1, [r1 + 4], 1
1550 pinsrb m1, [r1 + 6], 0
1551
1552 pmaddubsw m5, m1, [r3 + 26 * 16]
1553 pmulhrsw m5, m7
1554
1555 packuswb m4, m5
1556 movu [r0 + 864], m4
1557
1558 ; mode 15 [row 6, 7]
1559
1560 pmaddubsw m4, m1, [r3 + 9 * 16]
1561 pmulhrsw m4, m7
1562
1563 pslldq m1, 2
1564 pinsrb m1, [r1 + 6], 1
1565 pinsrb m1, [r1 + 8], 0
1566
1567 pmaddubsw m1, [r3 + 24 * 16]
1568 pmulhrsw m1, m7
1569
1570 packuswb m4, m1
1571 movu [r0 + 880], m4
1572
1573 ; mode 16 [row 0, 1]
1574
1575 pmaddubsw m4, m2, [r3 + 11 * 16]
1576 pmulhrsw m4, m7
1577
1578 pmaddubsw m5, m3, [r3 + 22 * 16]
1579 pmulhrsw m5, m7
1580
1581 packuswb m4, m5
1582 movu [r0 + 896], m4
1583
1584 ; mode 16 [row 2, 3]
1585
1586 pmaddubsw m4, m3, [r3 + 1 * 16]
1587 pmulhrsw m4, m7
1588
1589 pslldq m3, 2
1590 pinsrb m3, [r1 + 2], 1
1591 pinsrb m3, [r1 + 3], 0
1592
1593 pmaddubsw m5, m3, [r3 + 12 * 16]
1594 pmulhrsw m5, m7
1595
1596 packuswb m4, m5
1597 movu [r0 + 912], m4
1598
1599 ; mode 16 [row 4, 5]
1600
1601 pslldq m3, 2
1602 pinsrb m3, [r1 + 3], 1
1603 pinsrb m3, [r1 + 5], 0
1604
1605 pmaddubsw m4, m3, [r3 + 23 * 16]
1606 pmulhrsw m4, m7
1607
1608 pmaddubsw m5, m3, [r3 + 2 * 16]
1609 pmulhrsw m5, m7
1610
1611 packuswb m4, m5
1612 movu [r0 + 928], m4
1613
1614 ; mode 16 [row 6, 7]
1615
1616 pslldq m3, 2
1617 pinsrb m3, [r1 + 5], 1
1618 pinsrb m3, [r1 + 6], 0
1619
1620 pmaddubsw m4, m3, [r3 + 13 * 16]
1621 pmulhrsw m4, m7
1622
1623 pslldq m3, 2
1624 pinsrb m3, [r1 + 6], 1
1625 pinsrb m3, [r1 + 8], 0
1626
1627 pmaddubsw m3, [r3 + 24 * 16]
1628 pmulhrsw m3, m7
1629
1630 packuswb m4, m3
1631 movu [r0 + 944], m4
1632
1633 ; mode 17 [row 0, 1]
1634
1635 pmaddubsw m4, m2, [r3 + 6 * 16]
1636 pmulhrsw m4, m7
1637
1638 pslldq m2, 2
1639 pinsrb m2, [r1 + 0], 1
1640 pinsrb m2, [r1 + 1], 0
1641
1642 pmaddubsw m3, m2, [r3 + 12 * 16]
1643 pmulhrsw m3, m7
1644
1645 packuswb m4, m3
1646 movu [r0 + 960], m4
1647
1648 ; mode 17 [row 2, 3]
1649
1650 pslldq m2, 2
1651 pinsrb m2, [r1 + 1], 1
1652 pinsrb m2, [r1 + 2], 0
1653
1654 pmaddubsw m4, m2, [r3 + 18 * 16]
1655 pmulhrsw m4, m7
1656
1657 pslldq m2, 2
1658 pinsrb m2, [r1 + 2], 1
1659 pinsrb m2, [r1 + 4], 0
1660
1661 pmaddubsw m3, m2, [r3 + 24 * 16]
1662 pmulhrsw m3, m7
1663
1664 packuswb m4, m3
1665 movu [r0 + 976], m4
1666
1667 ; mode 17 [row 4, 5]
1668
1669 pslldq m2, 2
1670 pinsrb m2, [r1 + 4], 1
1671 pinsrb m2, [r1 + 5], 0
1672
1673 pmaddubsw m4, m2, [r3 + 30 * 16]
1674 pmulhrsw m4, m7
1675
1676 pmaddubsw m3, m2, [r3 + 4 * 16]
1677 pmulhrsw m3, m7
1678
1679 packuswb m4, m3
1680 movu [r0 + 992], m4
1681
1682 ; mode 17 [row 6, 7]
1683
1684 pslldq m2, 2
1685 pinsrb m2, [r1 + 5], 1
1686 pinsrb m2, [r1 + 6], 0
1687
1688 pmaddubsw m4, m2, [r3 + 10 * 16]
1689 pmulhrsw m4, m7
1690
1691 pslldq m2, 2
1692 pinsrb m2, [r1 + 6], 1
1693 pinsrb m2, [r1 + 7], 0
1694
1695 pmaddubsw m3, m2, [r3 + 16 * 16]
1696 pmulhrsw m3, m7
1697
1698 packuswb m4, m3
1699 movu [r0 + 1008], m4
1700
1701 ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
1702
1703 movh m1, [r2]
1704
1705 pslldq m2, m1, 1
1706 pinsrb m2, [r2 + 1 + 16], 0
1707 punpcklqdq m1, m2
1708 movu [r0 + 1024], m1
1709
1710 pslldq m2, 1
1711 pinsrb m2, [r2 + 2 + 16], 0
1712
1713 pslldq m0, m2, 1
1714 pinsrb m0, [r2 + 3 + 16], 0
1715 punpcklqdq m2, m0
1716 movu [r0 + 1040], m2
1717
1718 pslldq m0, 1
1719 pinsrb m0, [r2 + 4 + 16], 0
1720
1721 pslldq m2, m0, 1
1722 pinsrb m2, [r2 + 5 + 16], 0
1723 punpcklqdq m0, m2
1724 movu [r0 + 1056], m0
1725
1726 pslldq m2, 1
1727 pinsrb m2, [r2 + 6 + 16], 0
1728
1729 pslldq m0, m2, 1
1730 pinsrb m0, [r2 + 7 + 16], 0
1731 punpcklqdq m2, m0
1732 movu [r0 + 1072], m2
1733
1734 ; mode 19 [row 0, 1]
1735
1736 movu m0, [r1]
1737 palignr m1, m0, 1
1738 punpcklbw m0, m1
1739
1740 pmaddubsw m1, m0, [r3 + 6 * 16]
1741 pmulhrsw m1, m7
1742
1743 pslldq m2, m0, 2
1744 pinsrb m2, [r1], 1
1745 pinsrb m2, [r1 + 1 + 16], 0
1746
1747 pmaddubsw m3, m2, [r3 + 12 * 16]
1748 pmulhrsw m3, m7
1749
1750 packuswb m1, m3
1751 movu [r0 + 1088], m1
1752
1753 ; mode 19 [row 2, 3]
1754
1755 pslldq m2, 2
1756 pinsrb m2, [r1 + 1 + 16], 1
1757 pinsrb m2, [r1 + 2 + 16], 0
1758
1759 pmaddubsw m4, m2, [r3 + 18 * 16]
1760 pmulhrsw m4, m7
1761
1762 pslldq m2, 2
1763 pinsrb m2, [r1 + 2 + 16], 1
1764 pinsrb m2, [r1 + 4 + 16], 0
1765
1766 pmaddubsw m5, m2, [r3 + 24 * 16]
1767 pmulhrsw m5, m7
1768
1769 packuswb m4, m5
1770 movu [r0 + 1104], m4
1771
1772 ; mode 19 [row 4, 5]
1773
1774 pslldq m2, 2
1775 pinsrb m2, [r1 + 4 + 16], 1
1776 pinsrb m2, [r1 + 5 + 16], 0
1777
1778 pmaddubsw m4, m2, [r3 + 30 * 16]
1779 pmulhrsw m4, m7
1780
1781 pmaddubsw m5, m2, [r3 + 4 * 16]
1782 pmulhrsw m5, m7
1783
1784 packuswb m4, m5
1785 movu [r0 + 1120], m4
1786
1787 ; mode 19 [row 6, 7]
1788
1789 pslldq m2, 2
1790 pinsrb m2, [r1 + 5 + 16], 1
1791 pinsrb m2, [r1 + 6 + 16], 0
1792
1793 pmaddubsw m4, m2, [r3 + 10 * 16]
1794 pmulhrsw m4, m7
1795
1796 pslldq m2, 2
1797 pinsrb m2, [r1 + 6 + 16], 1
1798 pinsrb m2, [r1 + 7 + 16], 0
1799
1800 pmaddubsw m2, [r3 + 16 * 16]
1801 pmulhrsw m2, m7
1802
1803 packuswb m4, m2
1804 movu [r0 + 1136], m4
1805
1806 ; mode 20 [row 0, 1]
1807
1808 pmaddubsw m3, m0, [r3 + 11 * 16]
1809 pmulhrsw m3, m7
1810
1811 pslldq m1, m0, 2
1812 pinsrb m1, [r1 + 0], 1
1813 pinsrb m1, [r1 + 2 + 16], 0
1814
1815 pmaddubsw m4, m1, [r3 + 22 * 16]
1816 pmulhrsw m4, m7
1817
1818 packuswb m3, m4
1819 movu [r0 + 1152], m3
1820
1821 ; mode 20 [row 2, 3]
1822
1823 pmaddubsw m3, m1, [r3 + 1 * 16]
1824 pmulhrsw m3, m7
1825
1826 pslldq m2, m1, 2
1827 pinsrb m2, [r1 + 2 + 16], 1
1828 pinsrb m2, [r1 + 3 + 16], 0
1829
1830 pmaddubsw m4, m2, [r3 + 12 * 16]
1831 pmulhrsw m4, m7
1832
1833 packuswb m3, m4
1834 movu [r0 + 1168], m3
1835
1836 ; mode 20 [row 4, 5]
1837
1838 pslldq m2, 2
1839 pinsrb m2, [r1 + 3 + 16], 1
1840 pinsrb m2, [r1 + 5 + 16], 0
1841
1842 pmaddubsw m3, m2, [r3 + 23 * 16]
1843 pmulhrsw m3, m7
1844
1845 pmaddubsw m4, m2, [r3 + 2 * 16]
1846 pmulhrsw m4, m7
1847
1848 packuswb m3, m4
1849 movu [r0 + 1184], m3
1850
1851 ; mode 20 [row 6, 7]
1852
1853 pslldq m2, 2
1854 pinsrb m2, [r1 + 5 + 16], 1
1855 pinsrb m2, [r1 + 6 + 16], 0
1856
1857 pmaddubsw m3, m2, [r3 + 13 * 16]
1858 pmulhrsw m3, m7
1859
1860 pslldq m2, 2
1861 pinsrb m2, [r1 + 6 + 16], 1
1862 pinsrb m2, [r1 + 8 + 16], 0
1863
1864 pmaddubsw m4, m2, [r3 + 24 * 16]
1865 pmulhrsw m4, m7
1866
1867 packuswb m3, m4
1868 movu [r0 + 1200], m3
1869
1870 ; mode 21 [row 0, 1]
1871
1872 pmaddubsw m2, m0, [r3 + 15 * 16]
1873 pmulhrsw m2, m7
1874
1875 pmaddubsw m3, m1, [r3 + 30 * 16]
1876 pmulhrsw m3, m7
1877
1878 packuswb m2, m3
1879 movu [r0 + 1216], m2
1880
1881 ; mode 21 [row 2, 3]
1882
1883 pmaddubsw m2, m1, [r3 + 13 * 16]
1884 pmulhrsw m2, m7
1885
1886 pslldq m3, m1, 2
1887 pinsrb m3, [r1 + 2 + 16], 1
1888 pinsrb m3, [r1 + 4 + 16], 0
1889
1890 pmaddubsw m4, m3, [r3 + 28 * 16]
1891 pmulhrsw m4, m7
1892
1893 packuswb m2, m4
1894 movu [r0 + 1232], m2
1895
1896 ; mode 21 [row 4, 5]
1897
1898 pmaddubsw m2, m3, [r3 + 11 * 16]
1899 pmulhrsw m2, m7
1900
1901 pslldq m3, 2
1902 pinsrb m3, [r1 + 4 + 16], 1
1903 pinsrb m3, [r1 + 6 + 16], 0
1904
1905 pmaddubsw m4, m3, [r3 + 26 * 16]
1906 pmulhrsw m4, m7
1907
1908 packuswb m2, m4
1909 movu [r0 + 1248], m2
1910
1911 ; mode 21 [row 6, 7]
1912
1913 pmaddubsw m2, m3, [r3 + 9 * 16]
1914 pmulhrsw m2, m7
1915
1916 pslldq m3, 2
1917 pinsrb m3, [r1 + 6 + 16], 1
1918 pinsrb m3, [r1 + 8 + 16], 0
1919
1920 pmaddubsw m4, m3, [r3 + 24 * 16]
1921 pmulhrsw m4, m7
1922
1923 packuswb m2, m4
1924 movu [r0 + 1264], m2
1925
1926 ; mode 22 [row 0, 1]
1927
1928 pmaddubsw m2, m0, [r3 + 19 * 16]
1929 pmulhrsw m2, m7
1930
1931 pmaddubsw m4, m0, [r3 + 6 * 16]
1932 pmulhrsw m4, m7
1933
1934 packuswb m2, m4
1935 movu [r0 + 1280], m2
1936
1937 ; mode 22 [row 2, 3]
1938
1939 pmaddubsw m2, m1, [r3 + 25 * 16]
1940 pmulhrsw m2, m7
1941
1942 pmaddubsw m3, m1, [r3 + 12 * 16]
1943 pmulhrsw m3, m7
1944
1945 packuswb m2, m3
1946 movu [r0 + 1296], m2
1947
1948 ; mode 22 [row 4, 5]
1949
1950 pslldq m1, 2
1951 pinsrb m1, [r1 + 5 + 16], 0
1952 pinsrb m1, [r1 + 2 + 16], 1
1953
1954 pmaddubsw m2, m1, [r3 + 31 * 16]
1955 pmulhrsw m2, m7
1956
1957 pmaddubsw m3, m1, [r3 + 18 * 16]
1958 pmulhrsw m3, m7
1959
1960 packuswb m2, m3
1961 movu [r0 + 1312], m2
1962
1963 ; mode 22 [row 6, 7]
1964
1965 pmaddubsw m2, m1, [r3 + 5 * 16]
1966 pmulhrsw m2, m7
1967
1968 pslldq m1, 2
1969 pinsrb m1, [r1 + 5 + 16], 1
1970 pinsrb m1, [r1 + 7 + 16], 0
1971
1972 pmaddubsw m1, [r3 + 24 * 16]
1973 pmulhrsw m1, m7
1974
1975 packuswb m2, m1
1976 movu [r0 + 1328], m2
1977
1978 ; mode 23 [row 0, 1]
1979
1980 pmaddubsw m2, m0, [r3 + 23 * 16]
1981 pmulhrsw m2, m7
1982
1983 pmaddubsw m3, m0, [r3 + 14 * 16]
1984 pmulhrsw m3, m7
1985
1986 packuswb m2, m3
1987 movu [r0 + 1344], m2
1988
1989 ; mode 23 [row 2, 3]
1990
1991 pmaddubsw m2, m0, [r3 + 5 * 16]
1992 pmulhrsw m2, m7
1993
1994 pslldq m1, m0, 2
1995 pinsrb m1, [r1], 1
1996 pinsrb m1, [r1 + 4 + 16], 0
1997
1998 pmaddubsw m3, m1, [r3 + 28 * 16]
1999 pmulhrsw m3, m7
2000
2001 packuswb m2, m3
2002 movu [r0 + 1360], m2
2003
2004 ; mode 23 [row 4, 5]
2005
2006 pmaddubsw m2, m1, [r3 + 19 * 16]
2007 pmulhrsw m2, m7
2008
2009 pmaddubsw m3, m1, [r3 + 10 * 16]
2010 pmulhrsw m3, m7
2011
2012 packuswb m2, m3
2013 movu [r0 + 1376], m2
2014
2015 ; mode 23 [row 6, 7]
2016
2017 pmaddubsw m2, m1, [r3 + 1 * 16]
2018 pmulhrsw m2, m7
2019
2020 pslldq m3, m1, 2
2021 pinsrb m3, [r1 + 4 + 16], 1
2022 pinsrb m3, [r1 + 7 + 16], 0
2023
2024 pmaddubsw m3, [r3 + 24 * 16]
2025 pmulhrsw m3, m7
2026
2027 packuswb m2, m3
2028 movu [r0 + 1392], m2
2029
2030 ; mode 24 [row 0, 1]
2031
2032 pmaddubsw m2, m0, [r3 + 27 * 16]
2033 pmulhrsw m2, m7
2034
2035 pmaddubsw m5, m0, [r3 + 22 * 16]
2036 pmulhrsw m5, m7
2037
2038 packuswb m2, m5
2039 movu [r0 + 1408], m2
2040
2041 ; mode 24 [row 2, 3]
2042
2043 pmaddubsw m2, m0, [r3 + 17 * 16]
2044 pmulhrsw m2, m7
2045
2046 pmaddubsw m3, m0, [r3 + 12 * 16]
2047 pmulhrsw m3, m7
2048
2049 packuswb m2, m3
2050 movu [r0 + 1424], m2
2051
2052 ; mode 24 [row 4, 5]
2053
2054 pmaddubsw m2, m0, [r3 + 7 * 16]
2055 pmulhrsw m2, m7
2056
2057 pmaddubsw m3, m0, [r3 + 2 * 16]
2058 pmulhrsw m3, m7
2059
2060 packuswb m2, m3
2061 movu [r0 + 1440], m2
2062
2063 ; mode 24 [row 6, 7]
2064
2065 pinsrb m1, [r1 + 6 + 16], 0
2066
2067 pmaddubsw m2, m1, [r3 + 29 * 16]
2068 pmulhrsw m2, m7
2069
2070 pmaddubsw m1, [r3 + 24 * 16]
2071 pmulhrsw m1, m7
2072
2073 packuswb m2, m1
2074 movu [r0 + 1456], m2
2075
2076 ; mode 25 [row 0, 1]
2077
2078 pmaddubsw m2, m0, [r3 + 30 * 16]
2079 pmulhrsw m2, m7
2080
2081 pmaddubsw m1, m0, [r3 + 28 * 16]
2082 pmulhrsw m1, m7
2083
2084 packuswb m2, m1
2085 movu [r0 + 1472], m2
2086
2087 ; mode 25 [row 2, 3]
2088
2089 pmaddubsw m2, m0, [r3 + 26 * 16]
2090 pmulhrsw m2, m7
2091
2092 pmaddubsw m1, m0, [r3 + 24 * 16]
2093 pmulhrsw m1, m7
2094
2095 packuswb m2, m1
2096 movu [r0 + 1488], m2
2097
2098 ; mode 25 [row 4, 5]
2099
2100 pmaddubsw m1, m0, [r3 + 20 * 16]
2101 pmulhrsw m1, m7
2102
2103 packuswb m5, m1
2104 movu [r0 + 1504], m5
2105
2106 ; mode 25 [row 6, 7]
2107
2108 pmaddubsw m2, m0, [r3 + 18 * 16]
2109 pmulhrsw m2, m7
2110
2111 pmaddubsw m1, m0, [r3 + 16 * 16]
2112 pmulhrsw m1, m7
2113
2114 packuswb m2, m1
2115 movu [r0 + 1520], m2
2116
2117 ; mode 26
2118
2119 movu m0, [r1 + 1]
2120
2121 pshufb m1, m0, [tab_Si]
2122 movu [r0 + 1536], m1
2123 movu [r0 + 1552], m1
2124 movu [r0 + 1568], m1
2125 movu [r0 + 1584], m1
2126
2127 pxor m5, m5
2128
2129 pshufb m1, m1, m5
2130 punpcklbw m1, m5
2131
2132 movu m2, [r1 + 16]
2133 pinsrb m2, [r1], 0
2134
2135 pshufb m3, m2, m5
2136 punpcklbw m3, m5
2137
2138 psrldq m4, m2, 1
2139 punpcklbw m4, m5
2140
2141 movu m2, [r1 + 9 + 16]
2142 punpcklbw m2, m5
2143
2144 psubw m4, m3
2145 psubw m2, m3
2146
2147 psraw m4, 1
2148 psraw m2, 1
2149
2150 paddw m4, m1
2151 paddw m2, m1
2152
2153 packuswb m4, m2
2154
2155 pextrb [r0 + 1536], m4, 0
2156 pextrb [r0 + 1544], m4, 1
2157 pextrb [r0 + 1552], m4, 2
2158 pextrb [r0 + 1560], m4, 3
2159 pextrb [r0 + 1568], m4, 4
2160 pextrb [r0 + 1576], m4, 5
2161 pextrb [r0 + 1584], m4, 6
2162 pextrb [r0 + 1592], m4, 7
2163
2164 ; mode 27 [row 0, 1]
2165
2166 palignr m6, m0, 1
2167 punpcklbw m4, m0, m6
2168
2169 pmaddubsw m1, m4, [r3 + 2 * 16]
2170 pmulhrsw m1, m7
2171
2172 pmaddubsw m2, m4, [r3 + 4 * 16]
2173 pmulhrsw m2, m7
2174
2175 packuswb m1, m2
2176 movu [r0 + 1600], m1
2177
2178 ; mode 27 [row 2, 3]
2179
2180 pmaddubsw m1, m4, [r3 + 6 * 16]
2181 pmulhrsw m1, m7
2182
2183 pmaddubsw m2, m4, [r3 + 8 * 16]
2184 pmulhrsw m2, m7
2185
2186 packuswb m1, m2
2187 movu [r0 + 1616], m1
2188
2189 ; mode 27 [row 4, 5]
2190
2191 pmaddubsw m3, m4, [r3 + 10 * 16]
2192 pmulhrsw m3, m7
2193
2194 pmaddubsw m2, m4, [r3 + 12 * 16]
2195 pmulhrsw m2, m7
2196
2197 packuswb m1, m3, m2
2198 movu [r0 + 1632], m1
2199
2200 ; mode 27 [row 6, 7]
2201
2202 pmaddubsw m1, m4, [r3 + 14 * 16]
2203 pmulhrsw m1, m7
2204
2205 pmaddubsw m2, m4, [r3 + 16 * 16]
2206 pmulhrsw m2, m7
2207
2208 packuswb m1, m2
2209 movu [r0 + 1648], m1
2210
2211 ; mode 28 [row 0, 1]
2212
2213 pmaddubsw m1, m4, [r3 + 5 * 16]
2214 pmulhrsw m1, m7
2215
2216 packuswb m1, m3
2217 movu [r0 + 1664], m1
2218
2219 ; mode 28 [row 2, 3]
2220
2221 pmaddubsw m1, m4, [r3 + 15 * 16]
2222 pmulhrsw m1, m7
2223
2224 pmaddubsw m2, m4, [r3 + 20 * 16]
2225 pmulhrsw m2, m7
2226
2227 packuswb m1, m2
2228 movu [r0 + 1680], m1
2229
2230 ; mode 28 [row 4, 5]
2231
2232 pmaddubsw m1, m4, [r3 + 25 * 16]
2233 pmulhrsw m1, m7
2234
2235 pmaddubsw m2, m4, [r3 + 30 * 16]
2236 pmulhrsw m2, m7
2237
2238 packuswb m1, m2
2239 movu [r0 + 1696], m1
2240
2241 ; mode 28 [row 6, 7]
2242
2243 palignr m1, m0, 2
2244 punpcklbw m5, m6, m1
2245
2246 pmaddubsw m2, m5, [r3 + 3 * 16]
2247 pmulhrsw m2, m7
2248
2249 pmaddubsw m3, m5, [r3 + 8 * 16]
2250 pmulhrsw m3, m7
2251
2252 packuswb m2, m3
2253 movu [r0 + 1712], m2
2254
2255 ; mode 29 [row 0, 1]
2256
2257 pmaddubsw m2, m4, [r3 + 9 * 16]
2258 pmulhrsw m2, m7
2259
2260 pmaddubsw m3, m4, [r3 + 18 * 16]
2261 pmulhrsw m3, m7
2262
2263 packuswb m2, m3
2264 movu [r0 + 1728], m2
2265
2266 ; mode 29 [row 2, 3]
2267
2268 pmaddubsw m2, m4, [r3 + 27 * 16]
2269 pmulhrsw m2, m7
2270
2271 pmaddubsw m3, m5, [r3 + 4 * 16]
2272 pmulhrsw m3, m7
2273
2274 packuswb m2, m3
2275 movu [r0 + 1744], m2
2276
2277 ; mode 29 [row 4, 5]
2278
2279 pmaddubsw m2, m5, [r3 + 13 * 16]
2280 pmulhrsw m2, m7
2281
2282 pmaddubsw m3, m5, [r3 + 22 * 16]
2283 pmulhrsw m3, m7
2284
2285 packuswb m2, m3
2286 movu [r0 + 1760], m2
2287
2288 ; mode 29 [row 6, 7]
2289
2290 pmaddubsw m2, m5, [r3 + 31 * 16]
2291 pmulhrsw m2, m7
2292
2293 palignr m6, m0, 3
2294 punpcklbw m1, m6
2295
2296 pmaddubsw m3, m1, [r3 + 8 * 16]
2297 pmulhrsw m3, m7
2298
2299 packuswb m2, m3
2300 movu [r0 + 1776], m2
2301
2302 ; mode 32 [row 2]
2303
2304 movh [r0 + 1936], m2
2305
2306 ; mode 30 [row 0, 1]
2307
2308 pmaddubsw m2, m4, [r3 + 13 * 16]
2309 pmulhrsw m2, m7
2310
2311 pmaddubsw m3, m4, [r3 + 26 * 16]
2312 pmulhrsw m3, m7
2313
2314 packuswb m2, m3
2315 movu [r0 + 1792], m2
2316
2317 ; mode 30 [row 2, 3]
2318
2319 pmaddubsw m2, m5, [r3 + 7 * 16]
2320 pmulhrsw m2, m7
2321
2322 pmaddubsw m3, m5, [r3 + 20 * 16]
2323 pmulhrsw m3, m7
2324
2325 packuswb m2, m3
2326 movu [r0 + 1808], m2
2327
2328 ; mode 33 [row 1]
2329
2330 movhps [r0 + 1992], m2
2331
2332 ; mode 30 [row 4, 5]
2333
2334 pmaddubsw m2, m1, [r3 + 1 * 16]
2335 pmulhrsw m2, m7
2336
2337 pmaddubsw m3, m1, [r3 + 14 * 16]
2338 pmulhrsw m3, m7
2339
2340 packuswb m2, m3
2341 movu [r0 + 1824], m2
2342
2343 ; mode 33 [row 2]
2344
2345 movhps [r0 + 2000], m2
2346
2347 ; mode 30 [row 6, 7]
2348
2349 pmaddubsw m2, m1, [r3 + 27 * 16]
2350 pmulhrsw m2, m7
2351
2352 psrldq m0, 4
2353 punpcklbw m6, m0
2354
2355 pmaddubsw m3, m6, [r3 + 8 * 16]
2356 pmulhrsw m3, m7
2357
2358 packuswb m2, m3
2359 movu [r0 + 1840], m2
2360
2361 ; mode 33 [row 3]
2362
2363 movhps [r0 + 2008], m2
2364
2365 ; mode 31 [row 0, 1]
2366
2367 pmaddubsw m2, m4, [r3 + 17 * 16]
2368 pmulhrsw m2, m7
2369
2370 pmaddubsw m3, m5, [r3 + 2 * 16]
2371 pmulhrsw m3, m7
2372
2373 packuswb m2, m3
2374 movu [r0 + 1856], m2
2375
2376 ; mode 31 [row 2, 3]
2377
2378 pmaddubsw m2, m5, [r3 + 19 * 16]
2379 pmulhrsw m2, m7
2380
2381 pmaddubsw m3, m1, [r3 + 4 * 16]
2382 pmulhrsw m3, m7
2383
2384 packuswb m2, m3
2385 movu [r0 + 1872], m2
2386
2387 ; mode 31 [row 4, 5]
2388
2389 pmaddubsw m2, m1, [r3 + 21 * 16]
2390 pmulhrsw m2, m7
2391
2392 pmaddubsw m3, m6, [r3 + 6 * 16]
2393 pmulhrsw m3, m7
2394
2395 packuswb m2, m3
2396 movu [r0 + 1888], m2
2397
2398 ; mode 31 [row 6, 7]
2399
2400 pmaddubsw m2, m6, [r3 + 23 * 16]
2401 pmulhrsw m2, m7
2402
2403 movu m3, [r1 + 6]
2404 punpcklbw m0, m3
2405
2406 pmaddubsw m3, m0, [r3 + 8 * 16]
2407 pmulhrsw m3, m7
2408
2409 packuswb m2, m3
2410 movu [r0 + 1904], m2
2411
2412 ; mode 32 [row 0, 1]
2413
2414 pmaddubsw m2, m4, [r3 + 21 * 16]
2415 pmulhrsw m2, m7
2416
2417 pmaddubsw m3, m5, [r3 + 10 * 16]
2418 pmulhrsw m3, m7
2419
2420 packuswb m2, m3
2421 movu [r0 + 1920], m2
2422
2423 ; mode 32 [row 3]
2424
2425 pmaddubsw m2, m1, [r3 + 20 * 16]
2426 pmulhrsw m2, m7
2427
2428 pxor m3, m3
2429
2430 packuswb m2, m3
2431 movh [r0 + 1944], m2
2432
2433 ; mode 32 [row 4, 5]
2434
2435 pmaddubsw m2, m6, [r3 + 9 * 16]
2436 pmulhrsw m2, m7
2437
2438 pmaddubsw m3, m6, [r3 + 30 * 16]
2439 pmulhrsw m3, m7
2440
2441 packuswb m2, m3
2442 movu [r0 + 1952], m2
2443
2444 ; mode 33 [row 4, 5]
2445
2446 pmaddubsw m2, m0, [r3 + 2 * 16]
2447 pmulhrsw m2, m7
2448
2449 pmaddubsw m3, m0, [r3 + 28 * 16]
2450 pmulhrsw m3, m7
2451
2452 packuswb m2, m3
2453 movu [r0 + 2016], m2
2454
2455 ; mode 32 [row 6]
2456
2457 pmaddubsw m2, m0, [r3 + 19 * 16]
2458 pmulhrsw m2, m7
2459
2460 ; mode 32 [row 7]
2461
2462 movu m0, [r1 + 6]
2463 palignr m3, m0, 1
2464 punpcklbw m0, m3
2465
2466 pmaddubsw m3, m0, [r3 + 8 * 16]
2467 pmulhrsw m3, m7
2468
2469 packuswb m2, m3
2470 movu [r0 + 1968], m2
2471
2472 ; mode 33 [row 6, 7]
2473
2474 pmaddubsw m2, m0, [r3 + 22 * 16]
2475 pmulhrsw m2, m7
2476
2477 movu m0, [r1 + 7]
2478 palignr m3, m0, 1
2479 punpcklbw m0, m3
2480
2481 pmaddubsw m3, m0, [r3 + 16 * 16]
2482 pmulhrsw m3, m7
2483
2484 packuswb m2, m3
2485 movu [r0 + 2032], m2
2486
2487 ; mode 33 [row 0]
2488
2489 pmaddubsw m2, m4, [r3 + 26 * 16]
2490 pmulhrsw m2, m7
2491
2492 pxor m3, m3
2493
2494 packuswb m2, m3
2495 movh [r0 + 1984], m2
2496
2497 ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
2498
2499 movu m0, [r2 + 2]
2500 palignr m1, m0, 1
2501 punpcklqdq m2, m0, m1
2502 movu [r0 + 2048], m2
2503
2504 palignr m1, m0, 2
2505 palignr m2, m0, 3
2506 punpcklqdq m1, m2
2507 movu [r0 + 2064], m1
2508
2509 palignr m1, m0, 4
2510 palignr m2, m0, 5
2511 punpcklqdq m1, m2
2512 movu [r0 + 2080], m1
2513
2514 palignr m1, m0, 6
2515 palignr m2, m0, 7
2516 punpcklqdq m1, m2
2517 movu [r0 + 2096], m1
2518 RET
2519
2520 ;--------------------------------------------------------------------------------
2521 ; void all_angs_pred_16x16(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
2522 ;--------------------------------------------------------------------------------
2523 INIT_XMM sse4
2524 cglobal all_angs_pred_16x16, 3,4,8
2525 ; mode 2
2526
2527 movu m0, [r2 + 2 + 32]
2528 movu [r0 + 0 * 16], m0
2529
2530 movu m1, m0
2531
2532 movu m6, [r2 + 18 + 32]
2533 palignr m5, m6, m0, 1
2534 movu [r0 + 1 * 16], m5
2535
2536 movu m4, m5
2537
2538 palignr m5, m6, m0, 2
2539 movu [r0 + 2 * 16], m5
2540 palignr m5, m6, m0, 3
2541 movu [r0 + 3 * 16], m5
2542 palignr m5, m6, m0, 4
2543 movu [r0 + 4 * 16], m5
2544 palignr m5, m6, m0, 5
2545 movu [r0 + 5 * 16], m5
2546 palignr m5, m6, m0, 6
2547 movu [r0 + 6 * 16], m5
2548 palignr m5, m6, m0, 7
2549 movu [r0 + 7 * 16], m5
2550
2551 movu m7, m5
2552
2553 palignr m5, m6, m0, 8
2554 movu [r0 + 8 * 16], m5
2555
2556 movu m2, m5
2557
2558 palignr m5, m6, m0, 9
2559 movu [r0 + 9 * 16], m5
2560
2561 palignr m3, m6, m0, 10
2562 movu [r0 + 10 * 16], m3
2563 palignr m3, m6, m0, 11
2564 movu [r0 + 11 * 16], m3
2565 palignr m3, m6, m0, 12
2566 movu [r0 + 12 * 16], m3
2567
2568 ; mode 3 [row 15]
2569 movu [r0 + (3-2)*16*16 + 15 * 16], m3
2570
2571 palignr m3, m6, m0, 13
2572 movu [r0 + 13 * 16], m3
2573 palignr m3, m6, m0, 14
2574 movu [r0 + 14 * 16], m3
2575 palignr m3, m6, m0, 15
2576 movu [r0 + 15 * 16], m3
2577
2578 ; mode 3 [row 0]
2579 lea r3, [ang_table]
2580 movu m3, [pw_1024]
2581 movu m0, [r2 + 1 + 32]
2582 punpcklbw m0, m1
2583
2584 ; mode 17 [row 8 - second half]
2585 pmaddubsw m1, m0, [r3 + 22 * 16]
2586 pmulhrsw m1, m3
2587 packuswb m1, m1
2588 movh [r0 + 248 * 16 + 8], m1
2589 ; mode 17 [row 8 - second half] end
2590
2591 pmaddubsw m1, m0, [r3 + 26 * 16]
2592 pmulhrsw m1, m3
2593 punpcklbw m7, m2
2594 pmaddubsw m2, m7, [r3 + 26 * 16]
2595 pmulhrsw m2, m3
2596 packuswb m1, m2
2597 movu [r0 + 16 * 16], m1
2598
2599 ;mode 6 [row 1]
2600 movu [r0 + 65 * 16], m1
2601
2602 ; mode 4 [row 0]
2603 pmaddubsw m1, m0, [r3 + 21 * 16]
2604 pmulhrsw m1, m3
2605 pmaddubsw m2, m7, [r3 + 21 * 16]
2606 pmulhrsw m2, m3
2607 packuswb m1, m2
2608 movu [r0 + 32 * 16], m1
2609
2610 ; mode 5 [row 0]
2611 pmaddubsw m1, m0, [r3 + 17 * 16]
2612 pmulhrsw m1, m3
2613 pmaddubsw m2, m7, [r3 + 17 * 16]
2614 pmulhrsw m2, m3
2615 packuswb m1, m2
2616 movu [r0 + 48 * 16], m1
2617
2618 ; mode 6 [row 0]
2619 pmaddubsw m1, m0, [r3 + 13 * 16]
2620 pmulhrsw m1, m3
2621 pmaddubsw m2, m7, [r3 + 13 * 16]
2622 pmulhrsw m2, m3
2623 packuswb m1, m2
2624 movu [r0 + 64 * 16], m1
2625
2626 ; mode 7 [row 0]
2627 pmaddubsw m1, m0, [r3 + 9 * 16]
2628 pmulhrsw m1, m3
2629 pmaddubsw m2, m7, [r3 + 9 * 16]
2630 pmulhrsw m2, m3
2631 packuswb m1, m2
2632 movu [r0 + 80 * 16], m1
2633
2634 ; mode 7 [row 1]
2635 pmaddubsw m1, m0, [r3 + 18 * 16]
2636 pmulhrsw m1, m3
2637 pmaddubsw m2, m7, [r3 + 18 * 16]
2638 pmulhrsw m2, m3
2639 packuswb m1, m2
2640 movu [r0 + 81 * 16], m1
2641
2642 ; mode 7 [row 2]
2643 pmaddubsw m1, m0, [r3 + 27 * 16]
2644 pmulhrsw m1, m3
2645 pmaddubsw m2, m7, [r3 + 27 * 16]
2646 pmulhrsw m2, m3
2647 packuswb m1, m2
2648 movu [r0 + 82 * 16], m1
2649
2650 ; mode 8 [row 0]
2651 pmaddubsw m1, m0, [r3 + 5 * 16]
2652 pmulhrsw m1, m3
2653 pmaddubsw m2, m7, [r3 + 5 * 16]
2654 pmulhrsw m2, m3
2655 packuswb m1, m2
2656 movu [r0 + 96 * 16], m1
2657
2658 ; mode 8 [row 1]
2659 pmaddubsw m1, m0, [r3 + 10 * 16]
2660 pmulhrsw m1, m3
2661 pmaddubsw m2, m7, [r3 + 10 * 16]
2662 pmulhrsw m2, m3
2663 packuswb m1, m2
2664 movu [r0 + 97 * 16], m1
2665
2666 ; mode 8 [row 2]
2667 pmaddubsw m1, m0, [r3 + 15 * 16]
2668 pmulhrsw m1, m3
2669 pmaddubsw m2, m7, [r3 + 15 * 16]
2670 pmulhrsw m2, m3
2671 packuswb m1, m2
2672 movu [r0 + 98 * 16], m1
2673
2674 ; mode 8 [row 3]
2675 pmaddubsw m1, m0, [r3 + 20 * 16]
2676 pmulhrsw m1, m3
2677 pmaddubsw m2, m7, [r3 + 20 * 16]
2678 pmulhrsw m2, m3
2679 packuswb m1, m2
2680 movu [r0 + 99 * 16], m1
2681
2682 ; mode 8 [row 4]
2683 pmaddubsw m1, m0, [r3 + 25 * 16]
2684 pmulhrsw m1, m3
2685 pmaddubsw m2, m7, [r3 + 25 * 16]
2686 pmulhrsw m2, m3
2687 packuswb m1, m2
2688 movu [r0 + 100 * 16], m1
2689
2690 ; mode 8 [row 5]
2691 pmaddubsw m1, m0, [r3 + 30 * 16]
2692 pmulhrsw m1, m3
2693 pmaddubsw m2, m7, [r3 + 30 * 16]
2694 pmulhrsw m2, m3
2695 packuswb m1, m2
2696 movu [r0 + 101 * 16], m1
2697
2698 ; mode 15 [row 13 - second half]
2699 pmaddubsw m1, m0, [r3 + 18 * 16]
2700 pmulhrsw m1, m3
2701 packuswb m1, m1
2702 movh [r0 + 221 * 16 + 8], m1
2703 ; mode 15 [row 13 - second half] end
2704
2705 ; mode 15 [row 14 - second half]
2706 pmaddubsw m1, m0, [r3 + 1 * 16]
2707 pmulhrsw m1, m3
2708 packuswb m1, m1
2709 movh [r0 + 222 * 16 + 8], m1
2710 ; mode 15 [row 14 - second half] end
2711
2712 ; mode 16 [row 10 - second half]
2713 pmaddubsw m1, m0, [r3 + 25 * 16]
2714 pmulhrsw m1, m3
2715 packuswb m1, m1
2716 movh [r0 + 234 * 16 + 8], m1
2717 ; mode 16 [row 10 - second half] end
2718
2719 ; mode 16 [row 11 - second half]
2720 pmaddubsw m1, m0, [r3 + 4 * 16]
2721 pmulhrsw m1, m3
2722 packuswb m1, m1
2723 movh [r0 + 235 * 16 + 8], m1
2724 ; mode 16 [row 11 - second half] end
2725
2726 ; mode 3 [row 1]
2727 movu m6, [r3 + 20 * 16]
2728 movu m0, [r2 + 2 + 32]
2729 punpcklbw m0, m4
2730
2731 ; mode 17 [row 7 - second half]
2732 pmaddubsw m1, m0, [r3 + 16 * 16]
2733 pmulhrsw m1, m3
2734 packuswb m1, m1
2735 movh [r0 + 247 * 16 + 8], m1
2736
2737 ; mode 17 [row 7 - second half] end -- mode 3 [row 1] continues below
2738 pmaddubsw m1, m0, m6
2739 pmulhrsw m1, m3
2740 movu m2, [r2 + 10 + 32]
2741 punpcklbw m2, m5
2742 pmaddubsw m4, m2, m6
2743 pmulhrsw m4, m3
2744 packuswb m1, m4
2745 movu [r0 + 17 * 16], m1
2746
2747 ;mode 6 [row 3]
2748 movu [r0 + 67 * 16], m1
2749
2750 ; mode 4 [row 1]
2751 pmaddubsw m1, m0, [r3 + 10 * 16]
2752 pmulhrsw m1, m3
2753 pmaddubsw m4, m2, [r3 + 10 * 16]
2754 pmulhrsw m4, m3
2755 packuswb m1, m4
2756 movu [r0 + 33 * 16], m1
2757
2758 ; mode 4 [row 2]
2759 pmaddubsw m1, m0, [r3 + 31 * 16]
2760 pmulhrsw m1, m3
2761 pmaddubsw m4, m2, [r3 + 31 * 16]
2762 pmulhrsw m4, m3
2763 packuswb m1, m4
2764 movu [r0 + 34 * 16], m1
2765
2766 ; mode 7 [row 6]
2767 movu [r0 + 86 * 16], m1
2768
2769 ; mode 5 [row 1]
2770 pmaddubsw m1, m0, [r3 + 2 * 16]
2771 pmulhrsw m1, m3
2772 pmaddubsw m4, m2, [r3 + 2 * 16]
2773 pmulhrsw m4, m3
2774 packuswb m1, m4
2775 movu [r0 + 49 * 16], m1
2776
2777 ; mode 5 [row 2]
2778 pmaddubsw m1, m0, [r3 + 19 * 16]
2779 pmulhrsw m1, m3
2780 pmaddubsw m4, m2, [r3 + 19 * 16]
2781 pmulhrsw m4, m3
2782 packuswb m1, m4
2783 movu [r0 + 50 * 16], m1
2784
2785 ; mode 6 [row 2]
2786 pmaddubsw m1, m0, [r3 + 7 * 16]
2787 pmulhrsw m1, m3
2788 pmaddubsw m4, m2, [r3 + 7 * 16]
2789 pmulhrsw m4, m3
2790 packuswb m1, m4
2791 movu [r0 + 66 * 16], m1
2792
2793 ; mode 7 [row 3]
2794 pmaddubsw m1, m0, [r3 + 4 * 16]
2795 pmulhrsw m1, m3
2796 pmaddubsw m4, m2, [r3 + 4 * 16]
2797 pmulhrsw m4, m3
2798 packuswb m1, m4
2799 movu [r0 + 83 * 16], m1
2800
2801 ; mode 7 [row 4]
2802 pmaddubsw m1, m0, [r3 + 13 * 16]
2803 pmulhrsw m1, m3
2804 pmaddubsw m4, m2, [r3 + 13 * 16]
2805 pmulhrsw m4, m3
2806 packuswb m1, m4
2807 movu [r0 + 84 * 16], m1
2808
2809 ; mode 8 [row 8]
2810 movu [r0 + 104 * 16], m1
2811
2812 ; mode 7 [row 5]
2813 pmaddubsw m1, m0, [r3 + 22 * 16]
2814 pmulhrsw m1, m3
2815 pmaddubsw m4, m2, [r3 + 22 * 16]
2816 pmulhrsw m4, m3
2817 packuswb m1, m4
2818 movu [r0 + 85 * 16], m1
2819
2820 ; mode 8 [row 6]
2821 pmaddubsw m1, m0, [r3 + 3 * 16]
2822 pmulhrsw m1, m3
2823 pmaddubsw m4, m2, [r3 + 3 * 16]
2824 pmulhrsw m4, m3
2825 packuswb m1, m4
2826 movu [r0 + 102 * 16], m1
2827
2828 ; mode 8 [row 7]
2829 pmaddubsw m1, m0, [r3 + 8 * 16]
2830 pmulhrsw m1, m3
2831 pmaddubsw m4, m2, [r3 + 8 * 16]
2832 pmulhrsw m4, m3
2833 packuswb m1, m4
2834 movu [r0 + 103 * 16], m1
2835
2836 ; mode 8 [row 9]
2837 pmaddubsw m1, m0, [r3 + 18 * 16]
2838 pmulhrsw m1, m3
2839 pmaddubsw m4, m2, [r3 + 18 * 16]
2840 pmulhrsw m4, m3
2841 packuswb m1, m4
2842 movu [r0 + 105 * 16], m1
2843
2844 ; mode 8 [row 10]
2845 pmaddubsw m1, m0, [r3 + 23 * 16]
2846 pmulhrsw m1, m3
2847 pmaddubsw m4, m2, [r3 + 23 * 16]
2848 pmulhrsw m4, m3
2849 packuswb m1, m4
2850 movu [r0 + 106 * 16], m1
2851
2852 ; mode 8 [row 11]
2853 pmaddubsw m1, m0, [r3 + 28 * 16]
2854 pmulhrsw m1, m3
2855 pmaddubsw m4, m2, [r3 + 28 * 16]
2856 pmulhrsw m4, m3
2857 packuswb m1, m4
2858 movu [r0 + 107 * 16], m1
2859
2860 ; mode 3 [row 2]
2861 movu m0, [r2 + 3 + 32]
2862 movd m1, [r2 + 19 + 32]
2863 palignr m1, m0, 1
2864 punpcklbw m0, m1
2865
2866 ; mode 17 [row 6 - second half]
2867 pmaddubsw m1, m0, [r3 + 10 * 16]
2868 pmulhrsw m1, m3
2869 packuswb m1, m1
2870 movh [r0 + 246 * 16 + 8], m1
2871 ; mode 17 [row 6 - second half] end
2872
2873 pmaddubsw m1, m0, [r3 + 14 * 16]
2874 pmulhrsw m1, m3
2875 movu m2, [r2 + 11 + 32]
2876 movd m4, [r2 + 27 + 32]
2877 palignr m4, m2, 1
2878 punpcklbw m2, m4
2879 pmaddubsw m4, m2, [r3 + 14 * 16]
2880 pmulhrsw m4, m3
2881 packuswb m1, m4
2882 movu [r0 + 18 * 16], m1
2883
2884 ; mode 6 [row 5]
2885 movu [r0 + 69 * 16], m1
2886
2887 ; mode 4 [row 3]
2888 pmaddubsw m1, m0, [r3 + 20 * 16]
2889 pmulhrsw m1, m3
2890 pmaddubsw m4, m2, [r3 + 20 * 16]
2891 pmulhrsw m4, m3
2892 packuswb m1, m4
2893 movu [r0 + 35 * 16], m1
2894
2895 ; mode 5 [row 3]
2896 pmaddubsw m1, m0, [r3 + 4 * 16]
2897 pmulhrsw m1, m3
2898 pmaddubsw m4, m2, [r3 + 4 * 16]
2899 pmulhrsw m4, m3
2900 packuswb m1, m4
2901 movu [r0 + 51 * 16], m1
2902
2903 ; mode 5 [row 4]
2904 pmaddubsw m1, m0, [r3 + 21 * 16]
2905 pmulhrsw m1, m3
2906 pmaddubsw m4, m2, [r3 + 21 * 16]
2907 pmulhrsw m4, m3
2908 packuswb m1, m4
2909 movu [r0 + 52 * 16], m1
2910
2911 ; mode 6 [row 4]
2912 pmaddubsw m1, m0, [r3 + 1 * 16]
2913 pmulhrsw m1, m3
2914 pmaddubsw m4, m2, [r3 + 1 * 16]
2915 pmulhrsw m4, m3
2916 packuswb m1, m4
2917 movu [r0 + 68 * 16], m1
2918
2919 ; mode 6 [row 6]
2920 pmaddubsw m1, m0, [r3 + 27 * 16]
2921 pmulhrsw m1, m3
2922 pmaddubsw m4, m2, [r3 + 27 * 16]
2923 pmulhrsw m4, m3
2924 packuswb m1, m4
2925 movu [r0 + 70 * 16], m1
2926
2927 ; mode 7 [row 7]
2928 pmaddubsw m1, m0, [r3 + 8 * 16]
2929 pmulhrsw m1, m3
2930 pmaddubsw m4, m2, [r3 + 8 * 16]
2931 pmulhrsw m4, m3
2932 packuswb m1, m4
2933 movu [r0 + 87 * 16], m1
2934
2935 ; mode 7 [row 8]
2936 pmaddubsw m1, m0, [r3 + 17 * 16]
2937 pmulhrsw m1, m3
2938 pmaddubsw m4, m2, [r3 + 17 * 16]
2939 pmulhrsw m4, m3
2940 packuswb m1, m4
2941 movu [r0 + 88 * 16], m1
2942
2943 ; mode 7 [row 9]
2944 pmaddubsw m1, m0, [r3 + 26 * 16]
2945 pmulhrsw m1, m3
2946 pmaddubsw m4, m2, [r3 + 26 * 16]
2947 pmulhrsw m4, m3
2948 packuswb m1, m4
2949 movu [r0 + 89 * 16], m1
2950
2951 ; mode 8 [row 12]
2952 pmaddubsw m1, m0, [r3 + 1 * 16]
2953 pmulhrsw m1, m3
2954 pmaddubsw m4, m2, [r3 + 1 * 16]
2955 pmulhrsw m4, m3
2956 packuswb m1, m4
2957 movu [r0 + 108 * 16], m1
2958
2959 ; mode 8 [row 13]
2960 pmaddubsw m1, m0, [r3 + 6 * 16]
2961 pmulhrsw m1, m3
2962 pmaddubsw m4, m2, [r3 + 6 * 16]
2963 pmulhrsw m4, m3
2964 packuswb m1, m4
2965 movu [r0 + 109 * 16], m1
2966
2967 ; mode 8 [row 14]
2968 pmaddubsw m1, m0, [r3 + 11 * 16]
2969 pmulhrsw m1, m3
2970 pmaddubsw m4, m2, [r3 + 11 * 16]
2971 pmulhrsw m4, m3
2972 packuswb m1, m4
2973 movu [r0 + 110 * 16], m1
2974
2975 ; mode 8 [row 15]
2976 pmaddubsw m1, m0, [r3 + 16 * 16]
2977 pmulhrsw m1, m3
2978 pmaddubsw m4, m2, [r3 + 16 * 16]
2979 pmulhrsw m4, m3
2980 packuswb m1, m4
2981 movu [r0 + 111 * 16], m1
2982
2983 ; mode 3 [row 3]
2984 movu m0, [r2 + 4 + 32]
2985 movd m1, [r2 + 20 + 32]
2986 palignr m1, m0, 1
2987 punpcklbw m0, m1
2988
2989 ; mode 17 [row 4 - second half]
2990 pmaddubsw m1, m0, [r3 + 30 * 16]
2991 pmulhrsw m1, m3
2992 packuswb m1, m1
2993 movh [r0 + 244 * 16 + 8], m1
2994 ; mode 17 [row 4 - second half] end
2995
2996 ; mode 17 [row 5 - second half]
2997 pmaddubsw m1, m0, [r3 + 4 * 16]
2998 pmulhrsw m1, m3
2999 packuswb m1, m1
3000 movh [r0 + 245 * 16 + 8], m1
3001 ; mode 17 [row 5 - second half] end
3002
3003 pmaddubsw m1, m0, [r3 + 8 * 16]
3004 pmulhrsw m1, m3
3005 movu m2, [r2 + 12 + 32]
3006 movd m4, [r2 + 28 + 32]
3007 palignr m4, m2, 1
3008 punpcklbw m2, m4
3009 pmaddubsw m4, m2, [r3 + 8 * 16]
3010 pmulhrsw m4, m3
3011 packuswb m1, m4
3012 movu [r0 + 19 * 16], m1
3013
3014 ; mode 6 [row 7]
3015 movu [r0 + 71 * 16], m1
3016
3017 ; mode 4 row [row 4]
3018 pmaddubsw m1, m0, [r3 + 9 * 16]
3019 pmulhrsw m1, m3
3020 pmaddubsw m4, m2, [r3 + 9 * 16]
3021 pmulhrsw m4, m3
3022 packuswb m1, m4
3023 movu [r0 + 36 * 16], m1
3024
3025 ; mode 4 row [row 5]
3026 pmaddubsw m1, m0, [r3 + 30 * 16]
3027 pmulhrsw m1, m3
3028 pmaddubsw m4, m2, [r3 + 30 * 16]
3029 pmulhrsw m4, m3
3030 packuswb m1, m4
3031 movu [r0 + 37 * 16], m1
3032
3033 ; mode 7 row [row 13]
3034 movu [r0 + 93 * 16], m1
3035
3036 ; mode 5 row [row 5]
3037 pmaddubsw m1, m0, [r3 + 6 * 16]
3038 pmulhrsw m1, m3
3039 pmaddubsw m4, m2, [r3 + 6 * 16]
3040 pmulhrsw m4, m3
3041 packuswb m1, m4
3042 movu [r0 + 53 * 16], m1
3043
3044 ; mode 5 row [row 6]
3045 pmaddubsw m1, m0, [r3 + 23 * 16]
3046 pmulhrsw m1, m3
3047 pmaddubsw m4, m2, [r3 + 23 * 16]
3048 pmulhrsw m4, m3
3049 packuswb m1, m4
3050 movu [r0 + 54 * 16], m1
3051
3052 ; mode 6 [row 8]
3053 pmaddubsw m1, m0, [r3 + 21 * 16]
3054 pmulhrsw m1, m3
3055 pmaddubsw m4, m2, [r3 + 21 * 16]
3056 pmulhrsw m4, m3
3057 packuswb m1, m4
3058 movu [r0 + 72 * 16], m1
3059
3060 ; mode 7 [row 12]
3061 movu [r0 + 92 * 16], m1
3062
3063 ; mode 7 [row 10]
3064 pmaddubsw m1, m0, [r3 + 3 * 16]
3065 pmulhrsw m1, m3
3066 pmaddubsw m4, m2, [r3 + 3 * 16]
3067 pmulhrsw m4, m3
3068 packuswb m1, m4
3069 movu [r0 + 90 * 16], m1
3070
3071 ; mode 7 [row 11]
3072 pmaddubsw m1, m0, [r3 + 12 * 16]
3073 pmulhrsw m1, m3
3074 pmaddubsw m4, m2, [r3 + 12 * 16]
3075 pmulhrsw m4, m3
3076 packuswb m1, m4
3077 movu [r0 + 91 * 16], m1
3078
3079 ; mode 3 [row 4]
3080 movu m0, [r2 + 5 + 32]
3081 movd m1, [r2 + 20 + 32]
3082 palignr m1, m0, 1
3083 punpcklbw m0, m1
3084
3085 ; mode 17 [row 3 - second half]
3086 pmaddubsw m1, m0, [r3 + 24 * 16]
3087 pmulhrsw m1, m3
3088 packuswb m1, m1
3089 movh [r0 + 243 * 16 + 8], m1
3090
3091 ; mode 17 [row 3 - second half] end
3092 pmaddubsw m1, m0, [r3 + 2 * 16]
3093 pmulhrsw m1, m3
3094 movu m2, [r2 + 13 + 32]
3095 movd m4, [r2 + 29 + 32]
3096 palignr m4, m2, 1
3097 punpcklbw m2, m4
3098 pmaddubsw m4, m2, [r3 + 2 * 16]
3099 pmulhrsw m4, m3
3100 packuswb m1, m4
3101 movu [r0 + 20 * 16], m1
3102
3103 ;mode 6 [row 9]
3104 movu [r0 + 73 * 16], m1
3105
3106 ; mode 4 row [row 6]
3107 movu m6, [r3 + 19 * 16]
3108 pmaddubsw m1, m0, m6
3109 pmulhrsw m1, m3
3110 pmaddubsw m4, m2, m6
3111 pmulhrsw m4, m3
3112 packuswb m1, m4
3113 movu [r0 + 38 * 16], m1
3114
3115 ; mode 3 [row 5]
3116 pmaddubsw m1, m0, [r3 + 28 * 16]
3117 pmulhrsw m1, m3
3118 pmaddubsw m4, m2, [r3 + 28 * 16]
3119 pmulhrsw m4, m3
3120 packuswb m1, m4
3121 movu [r0 + 21 * 16], m1
3122
3123 ;mode 6 [row 11]
3124 movu [r0 + 75 * 16], m1
3125
3126 ; mode 5 row [row 7]
3127 pmaddubsw m1, m0, [r3 + 8 * 16]
3128 pmulhrsw m1, m3
3129 pmaddubsw m4, m2, [r3 + 8 * 16]
3130 pmulhrsw m4, m3
3131 packuswb m1, m4
3132 movu [r0 + 55 * 16], m1
3133
3134 ; mode 5 row [row 8]
3135 pmaddubsw m1, m0, [r3 + 25 * 16]
3136 pmulhrsw m1, m3
3137 pmaddubsw m4, m2, [r3 + 25 * 16]
3138 pmulhrsw m4, m3
3139 packuswb m1, m4
3140 movu [r0 + 56 * 16], m1
3141
3142 ; mode 6 [row 10]
3143 pmaddubsw m1, m0, [r3 + 15 * 16]
3144 pmulhrsw m1, m3
3145 pmaddubsw m4, m2, [r3 + 15 * 16]
3146 pmulhrsw m4, m3
3147 packuswb m1, m4
3148 movu [r0 + 74 * 16], m1
3149
3150 ; mode 7 [row 14]
3151 pmaddubsw m1, m0, [r3 + 7 * 16]
3152 pmulhrsw m1, m3
3153 pmaddubsw m4, m2, [r3 + 7 * 16]
3154 pmulhrsw m4, m3
3155 packuswb m1, m4
3156 movu [r0 + 94 * 16], m1
3157
3158 ; mode 7 [row 15]
3159 pmaddubsw m1, m0, [r3 + 16 * 16]
3160 pmulhrsw m1, m3
3161 pmaddubsw m4, m2, [r3 + 16 * 16]
3162 pmulhrsw m4, m3
3163 packuswb m1, m4
3164 movu [r0 + 95 * 16], m1
3165
3166 ; mode 3 [row 6]
3167 movu m0, [r2 + 6 + 32]
3168 movd m1, [r2 + 22 + 32]
3169 palignr m1, m0, 1
3170 punpcklbw m0, m1
3171
3172 ; mode 17 [row 2 - second half]
3173 pmaddubsw m1, m0, [r3 + 18 * 16]
3174 pmulhrsw m1, m3
3175 packuswb m1, m1
3176 movh [r0 + 242 * 16 + 8], m1
3177 ; mode 17 [row 2 - second half] end
3178
3179 pmaddubsw m1, m0, [r3 + 22 * 16]
3180 pmulhrsw m1, m3
3181 movu m2, [r2 + 14 + 32]
3182 movd m4, [r2 + 30 + 32]
3183 palignr m4, m2, 1
3184 punpcklbw m2, m4
3185 pmaddubsw m4, m2, [r3 + 22 * 16]
3186 pmulhrsw m4, m3
3187 packuswb m1, m4
3188 movu [r0 + 22 * 16], m1
3189
3190 ; mode 6 [row 13]
3191 movu [r0 + 77 * 16], m1
3192
3193 ; mode 4 row [row 7]
3194 pmaddubsw m1, m0, [r3 + 8 * 16]
3195 pmulhrsw m1, m3
3196 pmaddubsw m4, m2, [r3 + 8 * 16]
3197 pmulhrsw m4, m3
3198 packuswb m1, m4
3199 movu [r0 + 39 * 16], m1
3200
3201 ; mode 4 row [row 8]
3202 pmaddubsw m1, m0, [r3 + 29 * 16]
3203 pmulhrsw m1, m3
3204 pmaddubsw m4, m2, [r3 + 29 * 16]
3205 pmulhrsw m4, m3
3206 packuswb m1, m4
3207 movu [r0 + 40 * 16], m1
3208
3209 ; mode 5 row [row 9]
3210 pmaddubsw m1, m0, [r3 + 10 * 16]
3211 pmulhrsw m1, m3
3212 pmaddubsw m4, m2, [r3 + 10 * 16]
3213 pmulhrsw m4, m3
3214 packuswb m1, m4
3215 movu [r0 + 57 * 16], m1
3216
3217 ; mode 5 row [row 10]
3218 pmaddubsw m1, m0, [r3 + 27 * 16]
3219 pmulhrsw m1, m3
3220 pmaddubsw m4, m2, [r3 + 27 * 16]
3221 pmulhrsw m4, m3
3222 packuswb m1, m4
3223 movu [r0 + 58 * 16], m1
3224
3225 ; mode 6 [row 12]
3226 pmaddubsw m1, m0, [r3 + 9 * 16]
3227 pmulhrsw m1, m3
3228 pmaddubsw m4, m2, [r3 + 9 * 16]
3229 pmulhrsw m4, m3
3230 packuswb m1, m4
3231 movu [r0 + 76 * 16], m1
3232
3233 ; mode 3 [row 7]
3234 movu m0, [r2 + 7 + 32]
3235 movd m1, [r2 + 27 + 32]
3236 palignr m1, m0, 1
3237 punpcklbw m0, m1
3238
3239 ; mode 17 [row 1 - second half]
3240 pmaddubsw m1, m0, [r3 + 12 * 16]
3241 pmulhrsw m1, m3
3242 packuswb m1, m1
3243 movh [r0 + 241 * 16 + 8], m1
3244 ; mode 17 [row 1 - second half] end
3245
3246 pmaddubsw m1, m0, [r3 + 16 * 16]
3247 pmulhrsw m1, m3
3248 movu m2, [r2 + 15 + 32]
3249 movd m4, [r2 + 25 + 32]
3250 palignr m4, m2, 1
3251 punpcklbw m2, m4
3252 pmaddubsw m4, m2, [r3 + 16 * 16]
3253 pmulhrsw m4, m3
3254 packuswb m1, m4
3255 movu [r0 + 23 * 16], m1
3256
3257 ; mode 6 [row 15]
3258 movu [r0 + 79 * 16], m1
3259
3260 ; mode 4 row [row 9]
3261 pmaddubsw m1, m0, [r3 + 18 * 16]
3262 pmulhrsw m1, m3
3263 pmaddubsw m4, m2, [r3 + 18 * 16]
3264 pmulhrsw m4, m3
3265 packuswb m1, m4
3266 movu [r0 + 41 * 16], m1
3267
3268 ; mode 5 row [row 11]
3269 pmaddubsw m1, m0, [r3 + 12 * 16]
3270 pmulhrsw m1, m3
3271 pmaddubsw m4, m2, [r3 + 12 * 16]
3272 pmulhrsw m4, m3
3273 packuswb m1, m4
3274 movu [r0 + 59 * 16], m1
3275
3276 ; mode 5 row [row 12]
3277 pmaddubsw m1, m0, [r3 + 29 * 16]
3278 pmulhrsw m1, m3
3279 pmaddubsw m4, m2, [r3 + 29 * 16]
3280 pmulhrsw m4, m3
3281 packuswb m1, m4
3282 movu [r0 + 60 * 16], m1
3283
3284 ; mode 6 [row 14]
3285 pmaddubsw m1, m0, [r3 + 3 * 16]
3286 pmulhrsw m1, m3
3287 pmaddubsw m4, m2, [r3 + 3 * 16]
3288 pmulhrsw m4, m3
3289 packuswb m1, m4
3290 movu [r0 + 78 * 16], m1
3291
3292 ; mode 3 [row 8]
3293 movu m0, [r2 + 8 + 32]
3294 movd m1, [r2 + 24 + 32]
3295 palignr m1, m0, 1
3296 punpcklbw m0, m1
3297 pmaddubsw m1, m0, [r3 + 10 * 16]
3298 pmulhrsw m1, m3
3299 movu m2, [r2 + 16 + 32]
3300 psrldq m4, m2, 1
3301 pinsrb m4, [r2 + 32], 15
3302 punpcklbw m2, m4
3303 pmaddubsw m4, m2, [r3 + 10 * 16]
3304 pmulhrsw m4, m3
3305 packuswb m1, m4
3306 movu [r0 + 24 * 16], m1
3307
3308 ; mode 4 row [row 10]
3309 pmaddubsw m1, m0, [r3 + 7 * 16]
3310 pmulhrsw m1, m3
3311 pmaddubsw m4, m2, [r3 + 7 * 16]
3312 pmulhrsw m4, m3
3313 packuswb m1, m4
3314 movu [r0 + 42 * 16], m1
3315
3316 ; mode 4 row [row 11]
3317 pmaddubsw m1, m0, [r3 + 28 * 16]
3318 pmulhrsw m1, m3
3319 pmaddubsw m4, m2, [r3 + 28 * 16]
3320 pmulhrsw m4, m3
3321 packuswb m1, m4
3322 movu [r0 + 43 * 16], m1
3323
3324 ; mode 5 row [row 13]
3325 pmaddubsw m1, m0, [r3 + 14 * 16]
3326 pmulhrsw m1, m3
3327 pmaddubsw m4, m2, [r3 + 14 * 16]
3328 pmulhrsw m4, m3
3329 packuswb m1, m4
3330 movu [r0 + 61 * 16], m1
3331
3332 ; mode 5 row [row 14]
3333 pmaddubsw m1, m0, [r3 + 31 * 16]
3334 pmulhrsw m1, m3
3335 pmaddubsw m4, m2, [r3 + 31 * 16]
3336 pmulhrsw m4, m3
3337 packuswb m1, m4
3338 movu [r0 + 62 * 16], m1
3339
3340 ; mode 3 [row 9]
3341 movu m0, [r2 + 9 + 32]
3342 movd m1, [r2 + 16 + 32]
3343 palignr m1, m0, 1
3344 punpcklbw m0, m1
3345 pmaddubsw m1, m0, [r3 + 4 * 16]
3346 pmulhrsw m1, m3
3347 movu m2, [r2 + 17 + 32]
3348 movd m4, [r2 + 33 + 32]
3349 palignr m4, m2, 1
3350 punpcklbw m2, m4
3351 pmaddubsw m4, m2, [r3 + 4 * 16]
3352 pmulhrsw m4, m3
3353 packuswb m1, m4
3354 movu [r0 + 25 * 16], m1
3355
3356 ; mode 4 row [row 12]
3357 pmaddubsw m1, m0, [r3 + 17 * 16]
3358 pmulhrsw m1, m3
3359 pmaddubsw m4, m2, [r3 + 17 * 16]
3360 pmulhrsw m4, m3
3361 packuswb m1, m4
3362 movu [r0 + 44 * 16], m1
3363
3364 ; mode 3 [row 10]
3365 pmaddubsw m1, m0, [r3 + 30 * 16]
3366 pmulhrsw m1, m3
3367 pmaddubsw m4, m2, [r3 + 30 * 16]
3368 pmulhrsw m4, m3
3369 packuswb m1, m4
3370 movu [r0 + 26 * 16], m1
3371
3372 ; mode 5 row [row 15]
3373 pmaddubsw m1, m0, [r3 + 16 * 16]
3374 pmulhrsw m1, m3
3375 pmaddubsw m4, m2, [r3 + 16 * 16]
3376 pmulhrsw m4, m3
3377 packuswb m1, m4
3378 movu [r0 + 63 * 16], m1
3379
3380 ; mode 3 [row 11]
3381 movu m0, [r2 + 10 + 32]
3382 movd m1, [r2 + 26 + 32]
3383 palignr m1, m0, 1
3384 punpcklbw m0, m1
3385 pmaddubsw m1, m0, [r3 + 24 * 16]
3386 pmulhrsw m1, m3
3387 movu m2, [r2 + 18 + 32]
3388 movd m4, [r2 + 34 + 32]
3389 palignr m4, m2, 1
3390 punpcklbw m2, m4
3391 pmaddubsw m4, m2, [r3 + 24 * 16]
3392 pmulhrsw m4, m3
3393 packuswb m1, m4
3394 movu [r0 + 27 * 16], m1
3395
3396 ; mode 4 row [row 13]
3397 pmaddubsw m1, m0, [r3 + 6 * 16]
3398 pmulhrsw m1, m3
3399 pmaddubsw m4, m2, [r3 + 6 * 16]
3400 pmulhrsw m4, m3
3401 packuswb m1, m4
3402 movu [r0 + 45 * 16], m1
3403
3404 ; mode 4 row [row 14]
3405 pmaddubsw m1, m0, [r3 + 27 * 16]
3406 pmulhrsw m1, m3
3407 pmaddubsw m4, m2, [r3 + 27 * 16]
3408 pmulhrsw m4, m3
3409 packuswb m1, m4
3410 movu [r0 + 46 * 16], m1
3411
3412 ; mode 3 [row 12]
3413 movu m0, [r2 + 11 + 32]
3414 movd m1, [r2 + 27 + 32]
3415 palignr m1, m0, 1
3416 punpcklbw m0, m1
3417 pmaddubsw m1, m0, [r3 + 18 * 16]
3418 pmulhrsw m1, m3
3419 movu m2, [r2 + 19 + 32]
3420 movd m4, [r2 + 35 + 32]
3421 palignr m4, m2, 1
3422 punpcklbw m2, m4
3423 pmaddubsw m4, m2, [r3 + 18 * 16]
3424 pmulhrsw m4, m3
3425 packuswb m1, m4
3426 movu [r0 + 28 * 16], m1
3427
3428 ; mode 4 row [row 15]
3429 pmaddubsw m1, m0, [r3 + 16 * 16]
3430 pmulhrsw m1, m3
3431 pmaddubsw m4, m2, [r3 + 16 * 16]
3432 pmulhrsw m4, m3
3433 packuswb m1, m4
3434 movu [r0 + 47 * 16], m1
3435
3436 ; mode 3 [row 13]
3437 movu m0, [r2 + 12 + 32]
3438 movd m1, [r2 + 28 + 32]
3439 palignr m1, m0, 1
3440 punpcklbw m0, m1
3441 pmaddubsw m1, m0, [r3 + 12 * 16]
3442 pmulhrsw m1, m3
3443 movu m2, [r2 + 20 + 32]
3444 movd m4, [r2 + 36 + 32]
3445 palignr m4, m2, 1
3446 punpcklbw m2, m4
3447 pmaddubsw m4, m2, [r3 + 12 * 16]
3448 pmulhrsw m4, m3
3449 packuswb m1, m4
3450 movu [r0 + 29 * 16], m1
3451
3452 ; mode 3 [row 14]
3453 movu m0, [r2 + 13 + 32]
3454 movd m1, [r2 + 29 + 32]
3455 palignr m1, m0, 1
3456 punpcklbw m0, m1
3457 pmaddubsw m1, m0, [r3 + 6 * 16]
3458 pmulhrsw m1, m3
3459 movu m2, [r2 + 21 + 32]
3460 movd m4, [r2 + 37 + 32]
3461 palignr m4, m2, 1
3462 punpcklbw m2, m4
3463 pmaddubsw m4, m2, [r3 + 6 * 16]
3464 pmulhrsw m4, m3
3465 packuswb m1, m4
3466 movu [r0 + 30 * 16], m1
3467
3468 ; mode 9
3469 movu m0, [r1 + 1 + 32]
3470 movd m1, [r1 + 17 + 32]
3471 palignr m1, m0, 1
3472
3473 ; mode 9 [row 15]
3474 movu [r0 + 127 * 16], m1
3475
3476 ; mode 9 [row 0]
3477 punpcklbw m0, m1
3478 pmaddubsw m1, m0, [r3 + 2 * 16]
3479 pmulhrsw m1, m3
3480 movu m7, [r1 + 9 + 32]
3481 movd m4, [r2 + 25 + 32]
3482 palignr m2, m7, 1
3483 punpcklbw m7, m2
3484 pmaddubsw m2, m7, [r3 + 2 * 16]
3485 pmulhrsw m2, m3
3486 packuswb m1, m2
3487 movu [r0 + 112 * 16], m1
3488
3489 ; mode 9 [row 1]
3490 pmaddubsw m1, m0, [r3 + 4 * 16]
3491 pmulhrsw m1, m3
3492 pmaddubsw m2, m7, [r3 + 4 * 16]
3493 pmulhrsw m2, m3
3494 packuswb m1, m2
3495 movu [r0 + 113 * 16], m1
3496
3497 ; mode 9 [row 2]
3498 pmaddubsw m1, m0, [r3 + 6 * 16]
3499 pmulhrsw m1, m3
3500 pmaddubsw m2, m7, [r3 + 6 * 16]
3501 pmulhrsw m2, m3
3502 packuswb m1, m2
3503 movu [r0 + 114 * 16], m1
3504
3505 ; mode 9 [row 3]
3506 pmaddubsw m1, m0, [r3 + 8 * 16]
3507 pmulhrsw m1, m3
3508 pmaddubsw m2, m7, [r3 + 8 * 16]
3509 pmulhrsw m2, m3
3510 packuswb m1, m2
3511 movu [r0 + 115 * 16], m1
3512
3513 ; mode 9 [row 4]
3514 pmaddubsw m1, m0, [r3 + 10 * 16]
3515 pmulhrsw m1, m3
3516 pmaddubsw m2, m7, [r3 + 10 * 16]
3517 pmulhrsw m2, m3
3518 packuswb m1, m2
3519 movu [r0 + 116 * 16], m1
3520
3521 ; mode 9 [row 5]
3522 pmaddubsw m1, m0, [r3 + 12 * 16]
3523 pmulhrsw m1, m3
3524 pmaddubsw m2, m7, [r3 + 12 * 16]
3525 pmulhrsw m2, m3
3526 packuswb m1, m2
3527 movu [r0 + 117 * 16], m1
3528
3529 ; mode 9 [row 6]
3530 pmaddubsw m1, m0, [r3 + 14 * 16]
3531 pmulhrsw m1, m3
3532 pmaddubsw m2, m7, [r3 + 14 * 16]
3533 pmulhrsw m2, m3
3534 packuswb m1, m2
3535 movu [r0 + 118 * 16], m1
3536
3537 ; mode 9 [row 7]
3538 pmaddubsw m1, m0, [r3 + 16 * 16]
3539 pmulhrsw m1, m3
3540 pmaddubsw m2, m7, [r3 + 16 * 16]
3541 pmulhrsw m2, m3
3542 packuswb m1, m2
3543 movu [r0 + 119 * 16], m1
3544
3545 ; mode 9 [row 8]
3546 pmaddubsw m1, m0, [r3 + 18 * 16]
3547 pmulhrsw m1, m3
3548 pmaddubsw m2, m7, [r3 + 18 * 16]
3549 pmulhrsw m2, m3
3550 packuswb m1, m2
3551 movu [r0 + 120 * 16], m1
3552
3553 ; mode 9 [row 9]
3554 pmaddubsw m1, m0, [r3 + 20 * 16]
3555 pmulhrsw m1, m3
3556 pmaddubsw m2, m7, [r3 + 20 * 16]
3557 pmulhrsw m2, m3
3558 packuswb m1, m2
3559 movu [r0 + 121 * 16], m1
3560
3561 ; mode 9 [row 10]
3562 pmaddubsw m1, m0, [r3 + 22 * 16]
3563 pmulhrsw m1, m3
3564 pmaddubsw m2, m7, [r3 + 22 * 16]
3565 pmulhrsw m2, m3
3566 packuswb m1, m2
3567 movu [r0 + 122 * 16], m1
3568
3569 ; mode 9 [row 11]
3570 pmaddubsw m1, m0, [r3 + 24 * 16]
3571 pmulhrsw m1, m3
3572 pmaddubsw m2, m7, [r3 + 24 * 16]
3573 pmulhrsw m2, m3
3574 packuswb m1, m2
3575 movu [r0 + 123 * 16], m1
3576
3577 ; mode 9 [row 12]
3578 pmaddubsw m1, m0, [r3 + 26 * 16]
3579 pmulhrsw m1, m3
3580 pmaddubsw m2, m7, [r3 + 26 * 16]
3581 pmulhrsw m2, m3
3582 packuswb m1, m2
3583 movu [r0 + 124 * 16], m1
3584
3585 ; mode 9 [row 13]
3586 pmaddubsw m1, m0, [r3 + 28 * 16]
3587 pmulhrsw m1, m3
3588 pmaddubsw m2, m7, [r3 + 28 * 16]
3589 pmulhrsw m2, m3
3590 packuswb m1, m2
3591 movu [r0 + 125 * 16], m1
3592
3593 ; mode 9 [row 14]
3594 pmaddubsw m1, m0, [r3 + 30 * 16]
3595 pmulhrsw m1, m3
3596 pmaddubsw m2, m7, [r3 + 30 * 16]
3597 pmulhrsw m2, m3
3598 packuswb m1, m2
3599 movu [r0 + 126 * 16], m1
3600
3601 ; mode 10
3602 movu m1, [r1 + 1 + 32]
3603 movu [r0 + 128 * 16], m1
3604 movu [r0 + 129 * 16], m1
3605 movu [r0 + 130 * 16], m1
3606 movu [r0 + 131 * 16], m1
3607 movu [r0 + 132 * 16], m1
3608 movu [r0 + 133 * 16], m1
3609 movu [r0 + 134 * 16], m1
3610 movu [r0 + 135 * 16], m1
3611 movu [r0 + 136 * 16], m1
3612 movu [r0 + 137 * 16], m1
3613 movu [r0 + 138 * 16], m1
3614 movu [r0 + 139 * 16], m1
3615 movu [r0 + 140 * 16], m1
3616 movu [r0 + 141 * 16], m1
3617 movu [r0 + 142 * 16], m1
3618 movu [r0 + 143 * 16], m1
3619
3620 pxor m0, m0
3621 pshufb m1, m1, m0
3622 punpcklbw m1, m0
3623 pinsrb m2, [r1], 0
3624 pshufb m2, m2, m0
3625 punpcklbw m2, m0
3626 movu m4, [r1 + 1]
3627 punpcklbw m5, m4, m0
3628 punpckhbw m4, m0
3629 psubw m5, m2
3630 psubw m4, m2
3631 psraw m5, 1
3632 psraw m4, 1
3633 paddw m5, m1
3634 paddw m4, m1
3635 packuswb m5, m4
3636
3637 pextrb [r0 + 128 * 16], m5, 0
3638 pextrb [r0 + 129 * 16], m5, 1
3639 pextrb [r0 + 130 * 16], m5, 2
3640 pextrb [r0 + 131 * 16], m5, 3
3641 pextrb [r0 + 132 * 16], m5, 4
3642 pextrb [r0 + 133 * 16], m5, 5
3643 pextrb [r0 + 134 * 16], m5, 6
3644 pextrb [r0 + 135 * 16], m5, 7
3645 pextrb [r0 + 136 * 16], m5, 8
3646 pextrb [r0 + 137 * 16], m5, 9
3647 pextrb [r0 + 138 * 16], m5, 10
3648 pextrb [r0 + 139 * 16], m5, 11
3649 pextrb [r0 + 140 * 16], m5, 12
3650 pextrb [r0 + 141 * 16], m5, 13
3651 pextrb [r0 + 142 * 16], m5, 14
3652 pextrb [r0 + 143 * 16], m5, 15
3653
3654 ; mode 11
3655 movu m0, [r1 + 32]
3656 pinsrb m0, [r1], 0
3657
3658 ; mode 11 [row 15]
3659 movu [r0 + 159 * 16], m0
3660
3661 ; mode 11 [row 0]
3662 movu m1, [r1 + 1 + 32]
3663 punpcklbw m0, m1
3664 pmaddubsw m1, m0, [r3 + 30 * 16]
3665 pmulhrsw m1, m3
3666 movu m7, [r1 + 8 + 32]
3667 movu m2, [r1 + 9 + 32]
3668 punpcklbw m7, m2
3669 pmaddubsw m2, m7, [r3 + 30 * 16]
3670 pmulhrsw m2, m3
3671 packuswb m1, m2
3672 movu [r0 + 144 * 16], m1
3673
3674 ; mode 11 [row 1]
3675 pmaddubsw m1, m0, [r3 + 28 * 16]
3676 pmulhrsw m1, m3
3677 pmaddubsw m2, m7, [r3 + 28 * 16]
3678 pmulhrsw m2, m3
3679 packuswb m1, m2
3680 movu [r0 + 145 * 16], m1
3681
3682 ; mode 11 [row 2]
3683 pmaddubsw m1, m0, [r3 + 26 * 16]
3684 pmulhrsw m1, m3
3685 pmaddubsw m2, m7, [r3 + 26 * 16]
3686 pmulhrsw m2, m3
3687 packuswb m1, m2
3688 movu [r0 + 146 * 16], m1
3689
3690 ; mode 11 [row 3]
3691 pmaddubsw m1, m0, [r3 + 24 * 16]
3692 pmulhrsw m1, m3
3693 pmaddubsw m2, m7, [r3 + 24 * 16]
3694 pmulhrsw m2, m3
3695 packuswb m1, m2
3696 movu [r0 + 147 * 16], m1
3697
3698 ; mode 11 [row 4]
3699 pmaddubsw m1, m0, [r3 + 22 * 16]
3700 pmulhrsw m1, m3
3701 pmaddubsw m2, m7, [r3 + 22 * 16]
3702 pmulhrsw m2, m3
3703 packuswb m1, m2
3704 movu [r0 + 148 * 16], m1
3705
3706 ; mode 11 [row 5]
3707 pmaddubsw m1, m0, [r3 + 20 * 16]
3708 pmulhrsw m1, m3
3709 pmaddubsw m2, m7, [r3 + 20 * 16]
3710 pmulhrsw m2, m3
3711 packuswb m1, m2
3712 movu [r0 + 149 * 16], m1
3713
3714 ; mode 11 [row 6]
3715 pmaddubsw m1, m0, [r3 + 18 * 16]
3716 pmulhrsw m1, m3
3717 pmaddubsw m2, m7, [r3 + 18 * 16]
3718 pmulhrsw m2, m3
3719 packuswb m1, m2
3720 movu [r0 + 150 * 16], m1
3721
3722 ; mode 11 [row 7]
3723 pmaddubsw m1, m0, [r3 + 16 * 16]
3724 pmulhrsw m1, m3
3725 pmaddubsw m2, m7, [r3 + 16 * 16]
3726 pmulhrsw m2, m3
3727 packuswb m1, m2
3728 movu [r0 + 151 * 16], m1
3729
3730 ; mode 11 [row 8]
3731 pmaddubsw m1, m0, [r3 + 14 * 16]
3732 pmulhrsw m1, m3
3733 pmaddubsw m2, m7, [r3 + 14 * 16]
3734 pmulhrsw m2, m3
3735 packuswb m1, m2
3736 movu [r0 + 152 * 16], m1
3737
3738 ; mode 11 [row 9]
3739 pmaddubsw m1, m0, [r3 + 12 * 16]
3740 pmulhrsw m1, m3
3741 pmaddubsw m2, m7, [r3 + 12 * 16]
3742 pmulhrsw m2, m3
3743 packuswb m1, m2
3744 movu [r0 + 153 * 16], m1
3745
3746 ; mode 11 [row 10]
3747 pmaddubsw m1, m0, [r3 + 10 * 16]
3748 pmulhrsw m1, m3
3749 pmaddubsw m2, m7, [r3 + 10 * 16]
3750 pmulhrsw m2, m3
3751 packuswb m1, m2
3752 movu [r0 + 154 * 16], m1
3753
3754 ; mode 11 [row 11]
3755 pmaddubsw m1, m0, [r3 + 8 * 16]
3756 pmulhrsw m1, m3
3757 pmaddubsw m2, m7, [r3 + 8 * 16]
3758 pmulhrsw m2, m3
3759 packuswb m1, m2
3760 movu [r0 + 155 * 16], m1
3761
3762 ; mode 11 [row 12]
3763 pmaddubsw m1, m0, [r3 + 6 * 16]
3764 pmulhrsw m1, m3
3765 pmaddubsw m2, m7, [r3 + 6 * 16]
3766 pmulhrsw m2, m3
3767 packuswb m1, m2
3768 movu [r0 + 156 * 16], m1
3769
3770 ; mode 11 [row 13]
3771 pmaddubsw m1, m0, [r3 + 4 * 16]
3772 pmulhrsw m1, m3
3773 pmaddubsw m2, m7, [r3 + 4 * 16]
3774 pmulhrsw m2, m3
3775 packuswb m1, m2
3776 movu [r0 + 157 * 16], m1
3777
3778 ; mode 11 [row 14]
3779 pmaddubsw m1, m0, [r3 + 2 * 16]
3780 pmulhrsw m1, m3
3781 pmaddubsw m2, m7, [r3 + 2 * 16]
3782 pmulhrsw m2, m3
3783 packuswb m1, m2
3784 movu [r0 + 158 * 16], m1
3785
3786 ; mode 12 [row 0]
3787 movu m0, [r2 + 32]
3788 pinsrb m0, [r2], 0
3789 movu m1, [r2 + 1 + 32]
3790 punpcklbw m0, m1
3791 pmaddubsw m1, m0, [r3 + 27 * 16]
3792 pmulhrsw m1, m3
3793 movu m7, [r2 + 8 + 32]
3794 movd m2, [r2 + 24 + 32]
3795 palignr m2, m7, 1
3796 punpcklbw m7, m2
3797 pmaddubsw m2, m7, [r3 + 27 * 16]
3798 pmulhrsw m2, m3
3799 packuswb m1, m2
3800 movu [r0 + 160 * 16], m1
3801
3802 ; mode 12 [row 1]
3803 pmaddubsw m1, m0, [r3 + 22 * 16]
3804 pmulhrsw m1, m3
3805 pmaddubsw m2, m7, [r3 + 22 * 16]
3806 pmulhrsw m2, m3
3807 packuswb m1, m2
3808 movu [r0 + 161 * 16], m1
3809
3810 ; mode 12 [row 2]
3811 pmaddubsw m1, m0, [r3 + 17 * 16]
3812 pmulhrsw m1, m3
3813 pmaddubsw m2, m7, [r3 + 17 * 16]
3814 pmulhrsw m2, m3
3815 packuswb m1, m2
3816 movu [r0 + 162 * 16], m1
3817
3818 ; mode 12 [row 3]
3819 pmaddubsw m1, m0, [r3 + 12 * 16]
3820 pmulhrsw m1, m3
3821 pmaddubsw m2, m7, [r3 + 12 * 16]
3822 pmulhrsw m2, m3
3823 packuswb m1, m2
3824 movu [r0 + 163 * 16], m1
3825
3826 ; mode 12 [row 4]
3827 pmaddubsw m1, m0, [r3 + 7 * 16]
3828 pmulhrsw m1, m3
3829 pmaddubsw m2, m7, [r3 + 7 * 16]
3830 pmulhrsw m2, m3
3831 packuswb m1, m2
3832 movu [r0 + 164 * 16], m1
3833
3834 ; mode 12 [row 5]
3835 pmaddubsw m1, m0, [r3 + 2 * 16]
3836 pmulhrsw m1, m3
3837 pmaddubsw m2, m7, [r3 + 2 * 16]
3838 pmulhrsw m2, m3
3839 packuswb m1, m2
3840 movu [r0 + 165 * 16], m1
3841
3842 ; mode 13 [row 0]
3843 pmaddubsw m1, m0, [r3 + 23 * 16]
3844 pmulhrsw m1, m3
3845 pmaddubsw m2, m7, [r3 + 23 * 16]
3846 pmulhrsw m2, m3
3847 packuswb m1, m2
3848 movu [r0 + 176 * 16], m1
3849
3850 ; mode 13 [row 1]
3851 pmaddubsw m1, m0, [r3 + 14 * 16]
3852 pmulhrsw m1, m3
3853 pmaddubsw m2, m7, [r3 + 14 * 16]
3854 pmulhrsw m2, m3
3855 packuswb m1, m2
3856 movu [r0 + 177 * 16], m1
3857
3858 ; mode 13 [row 2]
3859 pmaddubsw m1, m0, [r3 + 5 * 16]
3860 pmulhrsw m1, m3
3861 pmaddubsw m2, m7, [r3 + 5 * 16]
3862 pmulhrsw m2, m3
3863 packuswb m1, m2
3864 movu [r0 + 178 * 16], m1
3865
3866 ; mode 14 [row 0]
3867 pmaddubsw m1, m0, [r3 + 19 * 16]
3868 pmulhrsw m1, m3
3869 pmaddubsw m2, m7, [r3 + 19 * 16]
3870 pmulhrsw m2, m3
3871 packuswb m1, m2
3872 movu [r0 + 192 * 16], m1
3873
3874 ; mode 14 [row 1]
3875 pmaddubsw m1, m0, [r3 + 6 * 16]
3876 pmulhrsw m1, m3
3877 pmaddubsw m2, m7, [r3 + 6 * 16]
3878 pmulhrsw m2, m3
3879 packuswb m1, m2
3880 movu [r0 + 193 * 16], m1
3881
3882 ; mode 17 [row 0]
3883 movu [r0 + 240 * 16], m1
3884
3885 ; mode 15 [row 0]
3886 pmaddubsw m1, m0, [r3 + 15 * 16]
3887 pmulhrsw m1, m3
3888 pmaddubsw m2, m7, [r3 + 15 * 16]
3889 pmulhrsw m2, m3
3890 packuswb m1, m2
3891 movu [r0 + 208 * 16], m1
3892
3893 ; mode 15 [row 15 - second half]
3894 pmaddubsw m1, m0, [r3 + 16 * 16]
3895 pmulhrsw m1, m3
3896 packuswb m1, m1
3897 movh [r0 + 223 * 16 + 8], m1
3898 ; mode 15 [row 15 - second half] end
3899
3900 ; mode 16 [row 0]
3901 pmaddubsw m1, m0, [r3 + 11 * 16]
3902 pmulhrsw m1, m3
3903 pmaddubsw m2, m7, [r3 + 11 * 16]
3904 pmulhrsw m2, m3
3905 packuswb m1, m2
3906 movu [r0 + 224 * 16], m1
3907
3908 ; mode 17 [row 9 - second half]
3909 pmaddubsw m1, m0, [r3 + 28 * 16]
3910 pmulhrsw m1, m3
3911 packuswb m1, m1
3912 movh [r0 + 249 * 16 + 8], m1
3913 ; mode 17 [row 9 - second half] end
3914
3915 ; mode 17 [row 10 - second half]
3916 pmaddubsw m1, m0, [r3 + 2 * 16]
3917 pmulhrsw m1, m3
3918 packuswb m1, m1
3919 movh [r0 + 250 * 16 + 8], m1
3920 ; mode 17 [row 10 - second half] end
3921
3922 ; mode 17 [row 1 - first half]
3923 pslldq m6, m0, 2
3924 pinsrb m6, [r2], 1
3925 pinsrb m6, [r2 + 1], 0
3926 pmaddubsw m1, m6, [r3 + 12 * 16]
3927 pmulhrsw m1, m3
3928 packuswb m1, m1
3929 movh [r0 + 241 * 16], m1
3930
3931 ; mode 17 [row 11 - second half]
3932 pmaddubsw m1, m6, [r3 + 8 * 16]
3933 pmulhrsw m1, m3
3934 packuswb m1, m1
3935 movh [r0 + 251 * 16 + 8], m1
3936 ; mode 17 [row 11 - second half] end
3937
3938 ; mode 17 [row 2 - first half]
3939 pslldq m6, 2
3940 pinsrb m6, [r2 + 1], 1
3941 pinsrb m6, [r2 + 2], 0
3942 pmaddubsw m1, m6, [r3 + 18 * 16]
3943 pmulhrsw m1, m3
3944 packuswb m1, m1
3945 movh [r0 + 242 * 16], m1
3946
3947 ; mode 17 [row 12 - second half]
3948 pmaddubsw m1, m6, [r3 + 14 * 16]
3949 pmulhrsw m1, m3
3950 packuswb m1, m1
3951 movh [r0 + 252 * 16 + 8], m1
3952 ; mode 17 [row 12 - second half] end
3953
3954 ; mode 17 [row 3 - first half]
3955 pslldq m6, 2
3956 pinsrb m6, [r2 + 2], 1
3957 pinsrb m6, [r2 + 4], 0
3958 pmaddubsw m1, m6, [r3 + 24 * 16]
3959 pmulhrsw m1, m3
3960 packuswb m1, m1
3961 movh [r0 + 243 * 16], m1
3962
3963 ; mode 17 [row 13 - second half]
3964 pmaddubsw m1, m6, [r3 + 20 * 16]
3965 pmulhrsw m1, m3
3966 packuswb m1, m1
3967 movh [r0 + 253 * 16 + 8], m1
3968
3969 ; mode 17 [row 4 - first half]
3970 pslldq m6, 2
3971 pinsrb m6, [r2 + 4], 1
3972 pinsrb m6, [r2 + 5], 0
3973 pmaddubsw m1, m6, [r3 + 30 * 16]
3974 pmulhrsw m1, m3
3975 packuswb m1, m1
3976 movh [r0 + 244 * 16], m1
3977
3978 ; mode 17 [row 5 - first half]
3979 pmaddubsw m1, m6, [r3 + 4 * 16]
3980 pmulhrsw m1, m3
3981 packuswb m1, m1
3982 movh [r0 + 245 * 16], m1
3983
3984 ; mode 17 [row 14 - second half]
3985 pmaddubsw m1, m6, [r3 + 26 * 16]
3986 pmulhrsw m1, m3
3987 packuswb m1, m1
3988 movh [r0 + 254 * 16 + 8], m1
3989 ; mode 17 [row 14 - second half] end
3990
3991 ; mode 17 [row 6 - first half]
3992 pslldq m6, 2
3993 pinsrb m6, [r2 + 5], 1
3994 pinsrb m6, [r2 + 6], 0
3995 pmaddubsw m1, m6, [r3 + 10 * 16]
3996 pmulhrsw m1, m3
3997 packuswb m1, m1
3998 movh [r0 + 246 * 16], m1
3999
4000 ; mode 17 [row 7 - first half]
4001 pslldq m6, 2
4002 pinsrb m6, [r2 + 6], 1
4003 pinsrb m6, [r2 + 7], 0
4004 pmaddubsw m1, m6, [r3 + 16 * 16]
4005 pmulhrsw m1, m3
4006 packuswb m1, m1
4007 movh [r0 + 247 * 16], m1
4008
4009 ; mode 17 [row 8 - first half]
4010 pslldq m6, 2
4011 pinsrb m6, [r2 + 7], 1
4012 pinsrb m6, [r2 + 9], 0
4013 pmaddubsw m1, m6, [r3 + 22 * 16]
4014 pmulhrsw m1, m3
4015 packuswb m1, m1
4016 movh [r0 + 248 * 16], m1
4017
4018 ; mode 17 [row 9 - first half]
4019 pslldq m6, 2
4020 pinsrb m6, [r2 + 9], 1
4021 pinsrb m6, [r2 + 10], 0
4022 pmaddubsw m1, m6, [r3 + 28 * 16]
4023 pmulhrsw m1, m3
4024 packuswb m1, m1
4025 movh [r0 + 249 * 16], m1
4026
4027 ; mode 17 [row 10 - first half]
4028 pmaddubsw m1, m6, [r3 + 2 * 16]
4029 pmulhrsw m1, m3
4030 packuswb m1, m1
4031 movh [r0 + 250 * 16], m1
4032
4033 ; mode 17 [row 11 - first half]
4034 pslldq m6, 2
4035 pinsrb m6, [r2 + 10], 1
4036 pinsrb m6, [r2 + 11], 0
4037 pmaddubsw m1, m6, [r3 + 8 * 16]
4038 pmulhrsw m1, m3
4039 packuswb m1, m1
4040 movh [r0 + 251 * 16], m1
4041
4042 ; mode 17 [row 12 - first half]
4043 pslldq m6, 2
4044 pinsrb m6, [r2 + 11], 1
4045 pinsrb m6, [r2 + 12], 0
4046 pmaddubsw m1, m6, [r3 + 14 * 16]
4047 pmulhrsw m1, m3
4048 packuswb m1, m1
4049 movh [r0 + 252 * 16], m1
4050
4051 ; mode 17 [row 13 - first half]
4052 pslldq m6, 2
4053 pinsrb m6, [r2 + 12], 1
4054 pinsrb m6, [r2 + 14], 0
4055 pmaddubsw m1, m6, [r3 + 20 * 16]
4056 pmulhrsw m1, m3
4057 packuswb m1, m1
4058 movh [r0 + 253 * 16], m1
4059
4060 ; mode 17 [row 14 - first half]
4061 pslldq m6, 2
4062 pinsrb m6, [r2 + 14], 1
4063 pinsrb m6, [r2 + 15], 0
4064 pmaddubsw m1, m6, [r3 + 26 * 16]
4065 pmulhrsw m1, m3
4066 packuswb m1, m1
4067 movh [r0 + 254 * 16], m1
4068
4069 ; mode 16 [row 12 - second half]
4070 pmaddubsw m1, m0, [r3 + 15 * 16]
4071 pmulhrsw m1, m3
4072 packuswb m1, m1
4073 movh [r0 + 236 * 16 + 8], m1
4074 ; mode 16 [row 12 - second half] end
4075
4076 ; mode 12 [row 6]
4077 pslldq m2, m0, 2
4078 pinsrb m2, [r2], 1
4079 pinsrb m2, [r2 + 6], 0
4080 pmaddubsw m1, m2, [r3 + 29 * 16]
4081 pmulhrsw m1, m3
4082 movu m0, [r2 + 7 + 32]
4083 psrldq m4, m0, 1
4084 punpcklbw m0, m4
4085 pmaddubsw m4, m0, [r3 + 29 * 16]
4086 pmulhrsw m4, m3
4087 packuswb m1, m4
4088 movu [r0 + 166 * 16], m1
4089
4090 ; mode 12 [row 7]
4091 pmaddubsw m1, m2, [r3 + 24 * 16]
4092 pmulhrsw m1, m3
4093 pmaddubsw m4, m0, [r3 + 24 * 16]
4094 pmulhrsw m4, m3
4095 packuswb m1, m4
4096 movu [r0 + 167 * 16], m1
4097
4098 ; mode 12 [row 8]
4099 pmaddubsw m1, m2, [r3 + 19 * 16]
4100 pmulhrsw m1, m3
4101 pmaddubsw m4, m0, [r3 + 19 * 16]
4102 pmulhrsw m4, m3
4103 packuswb m1, m4
4104 movu [r0 + 168 * 16], m1
4105
4106 ; mode 12 [row 9]
4107 pmaddubsw m1, m2, [r3 + 14 * 16]
4108 pmulhrsw m1, m3
4109 pmaddubsw m4, m0, [r3 + 14 * 16]
4110 pmulhrsw m4, m3
4111 packuswb m1, m4
4112 movu [r0 + 169 * 16], m1
4113
4114 ; mode 12 [row 10]
4115 pmaddubsw m1, m2, [r3 + 9 * 16]
4116 pmulhrsw m1, m3
4117 pmaddubsw m4, m0, [r3 + 9 * 16]
4118 pmulhrsw m4, m3
4119 packuswb m1, m4
4120 movu [r0 + 170 * 16], m1
4121
4122 ; mode 12 [row 11]
4123 pmaddubsw m1, m2, [r3 + 4 * 16]
4124 pmulhrsw m1, m3
4125 pmaddubsw m4, m0, [r3 + 4 * 16]
4126 pmulhrsw m4, m3
4127 packuswb m1, m4
4128 movu [r0 + 171 * 16], m1
4129
4130 ; mode 13 [row 3]
4131 pinsrb m7, m2, [r2 + 4], 0
4132 pmaddubsw m1, m7, [r3 + 28 * 16]
4133 pmulhrsw m1, m3
4134 pmaddubsw m4, m0, [r3 + 28 * 16]
4135 pmulhrsw m4, m3
4136 packuswb m1, m4
4137 movu [r0 + 179 * 16], m1
4138
4139 ; mode 13 [row 4]
4140 pmaddubsw m1, m7, [r3 + 19 * 16]
4141 pmulhrsw m1, m3
4142 pmaddubsw m4, m0, [r3 + 19 * 16]
4143 pmulhrsw m4, m3
4144 packuswb m1, m4
4145 movu [r0 + 180 * 16], m1
4146
4147 ; mode 13 [row 5]
4148 pmaddubsw m1, m7, [r3 + 10 * 16]
4149 pmulhrsw m1, m3
4150 pmaddubsw m4, m0, [r3 + 10 * 16]
4151 pmulhrsw m4, m3
4152 packuswb m1, m4
4153 movu [r0 + 181 * 16], m1
4154
4155 ; mode 13 [row 6]
4156 pmaddubsw m1, m7, [r3 + 1 * 16]
4157 pmulhrsw m1, m3
4158 pmaddubsw m4, m0, [r3 + 1 * 16]
4159 pmulhrsw m4, m3
4160 packuswb m1, m4
4161 movu [r0 + 182 * 16], m1
4162
4163 ; mode 14 [row 2]
4164 pinsrb m5, m7, [r2 + 2], 0
4165 pmaddubsw m1, m5, [r3 + 25 * 16]
4166 pmulhrsw m1, m3
4167 pmaddubsw m4, m0, [r3 + 25 * 16]
4168 pmulhrsw m4, m3
4169 packuswb m1, m4
4170 movu [r0 + 194 * 16], m1
4171
4172 ; mode 14 [row 3]
4173 pmaddubsw m1, m5, [r3 + 12 * 16]
4174 pmulhrsw m1, m3
4175 pmaddubsw m4, m0, [r3 + 12 * 16]
4176 pmulhrsw m4, m3
4177 packuswb m1, m4
4178 movu [r0 + 195 * 16], m1
4179
4180 ; mode 15 [row 1]
4181 pmaddubsw m1, m5, [r3 + 30 * 16]
4182 pmulhrsw m1, m3
4183 pmaddubsw m4, m0, [r3 + 30 * 16]
4184 pmulhrsw m4, m3
4185 packuswb m1, m4
4186 movu [r0 + 209 * 16], m1
4187
4188 ; mode 15 [row 2]
4189 pmaddubsw m1, m5, [r3 + 13 * 16]
4190 pmulhrsw m1, m3
4191 pmaddubsw m4, m0, [r3 + 13 * 16]
4192 pmulhrsw m4, m3
4193 packuswb m1, m4
4194 movu [r0 + 210 * 16], m1
4195
4196 ; mode 16 [row 1]
4197 pmaddubsw m1, m5, [r3 + 22 * 16]
4198 pmulhrsw m1, m3
4199 pmaddubsw m4, m0, [r3 + 22 * 16]
4200 pmulhrsw m4, m3
4201 packuswb m1, m4
4202 movu [r0 + 225 * 16], m1
4203
4204 ; mode 16 [row 2]
4205 pmaddubsw m1, m5, [r3 + 1 * 16]
4206 pmulhrsw m1, m3
4207 pmaddubsw m4, m0, [r3 + 1 * 16]
4208 pmulhrsw m4, m3
4209 packuswb m1, m4
4210 movu [r0 + 226 * 16], m1
4211
4212 ; mode 16 [row 13 - second half]
4213 pmaddubsw m1, m5, [r3 + 26 * 16]
4214 pmulhrsw m1, m3
4215 packuswb m1, m1
4216 movh [r0 + 237 * 16 + 8], m1
4217 ; mode 16 [row 13 - second half] end
4218
4219 ; mode 16 [row 14 - second half]
4220 pmaddubsw m1, m5, [r3 + 5 * 16]
4221 pmulhrsw m1, m3
4222 packuswb m1, m1
4223 movh [r0 + 238 * 16 + 8], m1
4224 ; mode 16 [row 14 - second half] end
4225
4226 ; mode 16 [row 3]
4227 pslldq m6, m5, 2
4228 pinsrb m6, [r2 + 2], 1
4229 pinsrb m6, [r2 + 3], 0
4230 pmaddubsw m1, m6, [r3 + 12 * 16]
4231 pmulhrsw m1, m3
4232 packuswb m1, m1
4233 movh [r0 + 227 * 16], m1
4234
4235 ; mode 16 [row 15 - second half]
4236 pmaddubsw m1, m6, [r3 + 16 * 16]
4237 pmulhrsw m1, m3
4238 packuswb m1, m1
4239 movh [r0 + 239 * 16 + 8], m1
4240 ; mode 16 [row 15 - second half] end
4241
4242 ; mode 16 [row 4- first half]
4243 pslldq m6, 2
4244 pinsrb m6, [r2 + 3], 1
4245 pinsrb m6, [r2 + 5], 0
4246 pmaddubsw m1, m6, [r3 + 23 * 16]
4247 pmulhrsw m1, m3
4248 packuswb m1, m1
4249 movh [r0 + 228 * 16], m1
4250
4251 ; mode 16 [row 5- first half]
4252 pmaddubsw m1, m6, [r3 + 2 * 16]
4253 pmulhrsw m1, m3
4254 packuswb m1, m1
4255 movh [r0 + 229 * 16], m1
4256
4257 ; mode 16 [row 6- first half]
4258 pslldq m6, 2
4259 pinsrb m6, [r2 + 5], 1
4260 pinsrb m6, [r2 + 6], 0
4261 pmaddubsw m1, m6, [r3 + 13 * 16]
4262 pmulhrsw m1, m3
4263 packuswb m1, m1
4264 movh [r0 + 230 * 16], m1
4265
4266 ; mode 16 [row 7- first half]
4267 pslldq m6, 2
4268 pinsrb m6, [r2 + 6], 1
4269 pinsrb m6, [r2 + 8], 0
4270 pmaddubsw m1, m6, [r3 + 24 * 16]
4271 pmulhrsw m1, m3
4272 packuswb m1, m1
4273 movh [r0 + 231 * 16], m1
4274
4275 ; mode 16 [row 8- first half]
4276 pmaddubsw m1, m6, [r3 + 3 * 16]
4277 pmulhrsw m1, m3
4278 packuswb m1, m1
4279 movh [r0 + 232 * 16], m1
4280 ; mode 16 [row 8 - first half] end
4281
4282 ; mode 16 [row 9- first half]
4283 pslldq m6, 2
4284 pinsrb m6, [r2 + 8], 1
4285 pinsrb m6, [r2 + 9], 0
4286 pmaddubsw m1, m6, [r3 + 14 * 16]
4287 pmulhrsw m1, m3
4288 packuswb m1, m1
4289 movh [r0 + 233 * 16], m1
4290
4291 ; mode 16 [row 10 - first half]
4292 pslldq m6, 2
4293 pinsrb m6, [r2 + 9], 1
4294 pinsrb m6, [r2 + 11], 0
4295 pmaddubsw m1, m6, [r3 + 25 * 16]
4296 pmulhrsw m1, m3
4297 packuswb m1, m1
4298 movh [r0 + 234 * 16], m1
4299
4300 ; mode 16 [row 11 - first half]
4301 pmaddubsw m1, m6, [r3 + 4 * 16]
4302 pmulhrsw m1, m3
4303 packuswb m1, m1
4304 movh [r0 + 235 * 16], m1
4305
4306 ; mode 16 [row 12 - first half]
4307 pslldq m6, 2
4308 pinsrb m6, [r2 + 11], 1
4309 pinsrb m6, [r2 + 12], 0
4310 pmaddubsw m1, m6, [r3 + 15 * 16]
4311 pmulhrsw m1, m3
4312 packuswb m1, m1
4313 movh [r0 + 236 * 16], m1
4314
4315 ; mode 16 [row 13 - first half]
4316 pslldq m6, 2
4317 pinsrb m6, [r2 + 12], 1
4318 pinsrb m6, [r2 + 14], 0
4319 pmaddubsw m1, m6, [r3 + 26 * 16]
4320 pmulhrsw m1, m3
4321 packuswb m1, m1
4322 movh [r0 + 237 * 16], m1
4323
4324 ; mode 16 [row 14 - first half]
4325 pmaddubsw m1, m6, [r3 + 5 * 16]
4326 pmulhrsw m1, m3
4327 packuswb m1, m1
4328 movh [r0 + 238 * 16], m1
4329
4330 ; mode 16 [row 15 - first half]
4331 pslldq m6, 2
4332 pinsrb m6, [r2 + 14], 1
4333 pinsrb m6, [r2 + 15], 0
4334 pmaddubsw m1, m6, [r3 + 16 * 16]
4335 pmulhrsw m1, m3
4336 packuswb m1, m1
4337 movh [r0 + 239 * 16], m1
4338
4339 ; mode 14 [row 4]
4340 pslldq m5, 2
4341 pinsrb m5, [r2 + 2], 1
4342 pinsrb m5, [r2 + 5], 0
4343 movu m4, [r2 + 6 + 32]
4344 psrldq m0, m4, 1
4345 punpcklbw m4, m0
4346
4347 ; mode 16 [row 3 - second half]
4348 pmaddubsw m1, m4, [r3 + 12 * 16]
4349 pmulhrsw m1, m3
4350 packuswb m1, m1
4351 movh [r0 + 227 * 16 + 8], m1
4352
4353 ; mode 16 [row 3 - second half] end
4354 pmaddubsw m1, m5, [r3 + 31 * 16]
4355 pmulhrsw m1, m3
4356 pmaddubsw m0, m4, [r3 + 31 * 16]
4357 pmulhrsw m0, m3
4358 packuswb m1, m0
4359 movu [r0 + 196 * 16], m1
4360
4361 ; mode 14 [row 5]
4362 pmaddubsw m1, m5, [r3 + 18 * 16]
4363 pmulhrsw m1, m3
4364 pmaddubsw m0, m4, [r3 + 18 * 16]
4365 pmulhrsw m0, m3
4366 packuswb m1, m0
4367 movu [r0 + 197 * 16], m1
4368
4369 ; mode 14 [row 6]
4370 pmaddubsw m1, m5, [r3 + 5 * 16]
4371 pmulhrsw m1, m3
4372 pmaddubsw m0, m4, [r3 + 5 * 16]
4373 pmulhrsw m0, m3
4374 packuswb m1, m0
4375 movu [r0 + 198 * 16], m1
4376
4377 ; mode 15 [row 3]
4378 movu m6, m5
4379 pinsrb m6, [r2 + 4], 0
4380 pmaddubsw m1, m6, [r3 + 28 * 16]
4381 pmulhrsw m1, m3
4382 pmaddubsw m0, m4, [r3 + 28 * 16]
4383 pmulhrsw m0, m3
4384 packuswb m1, m0
4385 movu [r0 + 211 * 16], m1
4386
4387 ; mode 15 [row 4]
4388 pmaddubsw m1, m6, [r3 + 11 * 16]
4389 pmulhrsw m1, m3
4390 pmaddubsw m0, m4, [r3 + 11 * 16]
4391 pmulhrsw m0, m3
4392 packuswb m1, m0
4393 movu [r0 + 212 * 16], m1
4394
4395 ; mode 15 [row 5 - first half]
4396 pslldq m6, 2
4397 pinsrb m6, [r2 + 4], 1
4398 pinsrb m6, [r2 + 6], 0
4399 pmaddubsw m1, m6, [r3 + 26 * 16]
4400 pmulhrsw m1, m3
4401 packuswb m1, m1
4402 movh [r0 + 213 * 16], m1
4403
4404 ; mode 15 [row 6 - first half]
4405 pmaddubsw m1, m6, [r3 + 9 * 16]
4406 pmulhrsw m1, m3
4407 packuswb m1, m1
4408 movh [r0 + 214 * 16], m1
4409
4410 ; mode 15 [row 7 - first half]
4411 pslldq m6, 2
4412 pinsrb m6, [r2 + 6], 1
4413 pinsrb m6, [r2 + 8], 0
4414 pmaddubsw m1, m6, [r3 + 24 * 16]
4415 pmulhrsw m1, m3
4416 packuswb m1, m1
4417 movh [r0 + 215 * 16], m1
4418
4419 ; mode 15 [row 8 - first half]
4420 pmaddubsw m1, m6, [r3 + 7 * 16]
4421 pmulhrsw m1, m3
4422 packuswb m1, m1
4423 movh [r0 + 216 * 16], m1
4424
4425 ; mode 15 [row 9 - first half]
4426 pslldq m6, 2
4427 pinsrb m6, [r2 + 8], 1
4428 pinsrb m6, [r2 + 9], 0
4429 pmaddubsw m1, m6, [r3 + 22 * 16]
4430 pmulhrsw m1, m3
4431 packuswb m1, m1
4432 movh [r0 + 217 * 16], m1
4433
4434 ; mode 15 [row 10 - first half]
4435 pmaddubsw m1, m6, [r3 + 5 * 16]
4436 pmulhrsw m1, m3
4437 packuswb m1, m1
4438 movh [r0 + 218 * 16], m1
4439
4440 ; mode 15 [row 11 - first half]
4441 pslldq m6, 2
4442 pinsrb m6, [r2 + 9], 1
4443 pinsrb m6, [r2 + 11], 0
4444 pmaddubsw m1, m6, [r3 + 20 * 16]
4445 pmulhrsw m1, m3
4446 packuswb m1, m1
4447 movh [r0 + 219 * 16], m1
4448
4449 ; mode 15 [row 12 - first half]
4450 pmaddubsw m1, m6, [r3 + 3 * 16]
4451 pmulhrsw m1, m3
4452 packuswb m1, m1
4453 movh [r0 + 220 * 16], m1
4454
4455 ; mode 15 [row 13 - first half]
4456 pslldq m6, 2
4457 pinsrb m6, [r2 + 11], 1
4458 pinsrb m6, [r2 + 13], 0
4459 pmaddubsw m1, m6, [r3 + 18 * 16]
4460 pmulhrsw m1, m3
4461 packuswb m1, m1
4462 movh [r0 + 221 * 16], m1
4463
4464 ; mode 15 [row 14 - first half]
4465 pmaddubsw m1, m6, [r3 + 1 * 16]
4466 pmulhrsw m1, m3
4467 packuswb m1, m1
4468 movh [r0 + 222 * 16], m1
4469
4470 ; mode 15 [row 15 - first half]
4471 pslldq m6, 2
4472 pinsrb m6, [r2 + 13], 1
4473 pinsrb m6, [r2 + 15], 0
4474 pmaddubsw m1, m6, [r3 + 16 * 16]
4475 pmulhrsw m1, m3
4476 packuswb m1, m1
4477 movh [r0 + 223 * 16], m1
4478
4479 ; mode 14 [row 7]
4480 pslldq m5, 2
4481 pinsrb m5, [r2 + 5], 1
4482 pinsrb m5, [r2 + 7], 0
4483 movu m0, [r2 + 5 + 32]
4484 psrldq m6, m0, 1
4485 punpcklbw m0, m6
4486
4487 ; mode 15 [row 5 - second half]
4488 pmaddubsw m1, m0, [r3 + 26 * 16]
4489 pmulhrsw m1, m3
4490 packuswb m1, m1
4491 movh [r0 + 213 * 16 + 8], m1
4492 ; mode 15 [row 5 - second half] end
4493
4494 ; mode 15 [row 6 - second half]
4495 pmaddubsw m1, m0, [r3 + 9 * 16]
4496 pmulhrsw m1, m3
4497 packuswb m1, m1
4498 movh [r0 + 214 * 16 + 8], m1
4499 ; mode 15 [row 6 - second half] end
4500
4501 ; mode 16 [row 4 - second half]
4502 pmaddubsw m1, m0, [r3 + 23 * 16]
4503 pmulhrsw m1, m3
4504 packuswb m1, m1
4505 movh [r0 + 228 * 16 + 8], m1
4506 ; mode 16 [row 4 - second half] end
4507
4508 ; mode 16 [row 5 - second half]
4509 pmaddubsw m1, m0, [r3 + 2 * 16]
4510 pmulhrsw m1, m3
4511 packuswb m1, m1
4512 movh [r0 + 229 * 16 + 8], m1
4513
4514 ; mode 16 [row 5 - second half] end
4515 pmaddubsw m1, m5, [r3 + 24 * 16]
4516 pmulhrsw m1, m3
4517 pmaddubsw m6, m0, [r3 + 24 * 16]
4518 pmulhrsw m6, m3
4519 packuswb m1, m6
4520 movu [r0 + 199 * 16], m1
4521
4522 ; mode 14 [row 8]
4523 pmaddubsw m1, m5, [r3 + 11 * 16]
4524 pmulhrsw m1, m3
4525 pmaddubsw m6, m0, [r3 + 11 * 16]
4526 pmulhrsw m6, m3
4527 packuswb m1, m6
4528 movu [r0 + 200 * 16], m1
4529
4530 ; mode 14 [row 9]
4531 pslldq m5, 2
4532 pinsrb m5, [r2 + 7], 1
4533 pinsrb m5, [r2 + 10], 0
4534 movu m0, [r2 + 4 + 32]
4535 psrldq m6, m0, 1
4536 punpcklbw m0, m6
4537
4538 ; mode 15 [row 7 - second half]
4539 pmaddubsw m1, m0, [r3 + 24 * 16]
4540 pmulhrsw m1, m3
4541 packuswb m1, m1
4542 movh [r0 + 215 * 16 + 8], m1
4543 ; mode 15 [row 7 - second half] end
4544
4545 ; mode 15 [row 8 - second half]
4546 pmaddubsw m1, m0, [r3 + 7 * 16]
4547 pmulhrsw m1, m3
4548 packuswb m1, m1
4549 movh [r0 + 216 * 16 + 8], m1
4550 ; mode 15 [row 8 - second half] end
4551
4552 ; mode 16 [row 6 - second half]
4553 pmaddubsw m1, m0, [r3 + 13 * 16]
4554 pmulhrsw m1, m3
4555 packuswb m1, m1
4556 movh [r0 + 230 * 16 + 8], m1
4557 ; mode 16 [row 6 - second half] end
4558
4559 ; mode 14 [row 9] - compute both halves and store
4560 pmaddubsw m1, m5, [r3 + 30 * 16]
4561 pmulhrsw m1, m3
4562 pmaddubsw m6, m0, [r3 + 30 * 16]
4563 pmulhrsw m6, m3
4564 packuswb m1, m6
4565 movu [r0 + 201 * 16], m1
4566
4567 ; mode 14 [row 10]
4568 pmaddubsw m1, m5, [r3 + 17 * 16]
4569 pmulhrsw m1, m3
4570 pmaddubsw m6, m0, [r3 + 17 * 16]
4571 pmulhrsw m6, m3
4572 packuswb m1, m6
4573 movu [r0 + 202 * 16], m1
4574
4575 ; mode 14 [row 11]
4576 pmaddubsw m1, m5, [r3 + 4 * 16]
4577 pmulhrsw m1, m3
4578 pmaddubsw m6, m0, [r3 + 4 * 16]
4579 pmulhrsw m6, m3
4580 packuswb m1, m6
4581 movu [r0 + 203 * 16], m1
4582
4583 ; mode 14 [row 12]
4584 pslldq m5, 2
4585 pinsrb m5, [r2 + 10], 1
4586 pinsrb m5, [r2 + 12], 0
4587 movu m0, [r2 + 3 + 32]
4588 psrldq m6, m0, 1
4589 punpcklbw m0, m6
4590
4591 ; mode 15 [row 9 - second half]
4592 pmaddubsw m1, m0, [r3 + 22 * 16]
4593 pmulhrsw m1, m3
4594 packuswb m1, m1
4595 movh [r0 + 217 * 16 + 8], m1
4596 ; mode 15 [row 9 - second half] end
4597
4598 ; mode 15 [row 10 - second half]
4599 pmaddubsw m1, m0, [r3 + 5 * 16]
4600 pmulhrsw m1, m3
4601 packuswb m1, m1
4602 movh [r0 + 218 * 16 + 8], m1
4603 ; mode 15 [row 10 - second half] end
4604
4605 ; mode 16 [row 7 - second half]
4606 pmaddubsw m1, m0, [r3 + 24 * 16]
4607 pmulhrsw m1, m3
4608 packuswb m1, m1
4609 movh [r0 + 231 * 16 + 8], m1
4610 ; mode 16 [row 7 - second half] end
4611
4612 ; mode 16 [row 8 - second half]
4613 pmaddubsw m1, m0, [r3 + 3 * 16]
4614 pmulhrsw m1, m3
4615 packuswb m1, m1
4616 movh [r0 + 232 * 16 + 8], m1
4617 ; mode 16 [row 8 - second half] end
4618
4619 pmaddubsw m1, m5, [r3 + 23 * 16]
4620 pmulhrsw m1, m3
4621 pmaddubsw m6, m0, [r3 + 23 * 16]
4622 pmulhrsw m6, m3
4623 packuswb m1, m6
4624 movu [r0 + 204 * 16], m1
4625
4626 ; mode 14 [row 13]
4627 pmaddubsw m1, m5, [r3 + 10 * 16]
4628 pmulhrsw m1, m3
4629 pmaddubsw m6, m0, [r3 + 10 * 16]
4630 pmulhrsw m6, m3
4631 packuswb m1, m6
4632 movu [r0 + 205 * 16], m1
4633
4634 ; mode 14 [row 14]
4635 pslldq m5, 2
4636 pinsrb m5, [r2 + 12], 1
4637 pinsrb m5, [r2 + 15], 0
4638 movu m0, [r2 + 2 + 32]
4639 psrldq m6, m0, 1
4640 punpcklbw m0, m6
4641
4642 ; mode 15 [row 11 - second half]
4643 pmaddubsw m1, m0, [r3 + 20 * 16]
4644 pmulhrsw m1, m3
4645 packuswb m1, m1
4646 movh [r0 + 219 * 16 + 8], m1
4647 ; mode 15 [row 11 - second half] end
4648
4649 ; mode 15 [row 12 - second half]
4650 pmaddubsw m1, m0, [r3 + 3 * 16]
4651 pmulhrsw m1, m3
4652 packuswb m1, m1
4653 movh [r0 + 220 * 16 + 8], m1
4654 ; mode 15 [row 12 - second half] end
4655
4656 ; mode 16 [row 9 - second half]
4657 pmaddubsw m1, m0, [r3 + 14 * 16]
4658 pmulhrsw m1, m3
4659 packuswb m1, m1
4660 movh [r0 + 233 * 16 + 8], m1
4661
4662 ; mode 16 [row 9 - second half] end
4663 pmaddubsw m1, m5, [r3 + 29 * 16]
4664 pmulhrsw m1, m3
4665 pmaddubsw m6, m0, [r3 + 29 * 16]
4666 pmulhrsw m6, m3
4667 packuswb m1, m6
4668 movu [r0 + 206 * 16], m1
4669
4670 ; mode 14 [row 15]
4671 pmaddubsw m1, m5, [r3 + 16 * 16]
4672 pmulhrsw m1, m3
4673 pmaddubsw m6, m0, [r3 + 16 * 16]
4674 pmulhrsw m6, m3
4675 packuswb m1, m6
4676 movu [r0 + 207 * 16], m1
4677
4678 ; mode 12 [row 12]
4679 pslldq m0, m2, 2
4680 pinsrb m0, [r2 + 6], 1
4681 pinsrb m0, [r2 + 13], 0
4682 pmaddubsw m1, m0, [r3 + 31 * 16]
4683 pmulhrsw m1, m3
4684 pmaddubsw m5, m4, [r3 + 31 * 16]
4685 pmulhrsw m5, m3
4686 packuswb m1, m5
4687 movu [r0 + 172 * 16], m1
4688
4689 ; mode 12 [row 13]
4690 pmaddubsw m1, m0, [r3 + 26 * 16]
4691 pmulhrsw m1, m3
4692 pmaddubsw m5, m4, [r3 + 26 * 16]
4693 pmulhrsw m5, m3
4694 packuswb m1, m5
4695 movu [r0 + 173 * 16], m1
4696
4697 ; mode 12 [row 14]
4698 pmaddubsw m1, m0, [r3 + 21 * 16]
4699 pmulhrsw m1, m3
4700 pmaddubsw m5, m4, [r3 + 21 * 16]
4701 pmulhrsw m5, m3
4702 packuswb m1, m5
4703 movu [r0 + 174 * 16], m1
4704
4705 ; mode 12 [row 15]
4706 pmaddubsw m1, m0, [r3 + 16 * 16]
4707 pmulhrsw m1, m3
4708 pmaddubsw m5, m4, [r3 + 16 * 16]
4709 pmulhrsw m5, m3
4710 packuswb m1, m5
4711 movu [r0 + 175 * 16], m1
4712
4713 ; mode 13 [row 7]
4714 pslldq m7, 2
4715 pinsrb m7, [r2 + 4], 1
4716 pinsrb m7, [r2 + 7], 0
4717 pmaddubsw m1, m7, [r3 + 24 * 16]
4718 pmulhrsw m1, m3
4719 pmaddubsw m5, m4, [r3 + 24 * 16]
4720 pmulhrsw m5, m3
4721 packuswb m1, m5
4722 movu [r0 + 183 * 16], m1
4723
4724 ; mode 13 [row 8]
4725 pmaddubsw m1, m7, [r3 + 15 * 16]
4726 pmulhrsw m1, m3
4727 pmaddubsw m5, m4, [r3 + 15 * 16]
4728 pmulhrsw m5, m3
4729 packuswb m1, m5
4730 movu [r0 + 184 * 16], m1
4731
4732 ; mode 13 [row 9]
4733 pmaddubsw m1, m7, [r3 + 6 * 16]
4734 pmulhrsw m1, m3
4735 pmaddubsw m5, m4, [r3 + 6 * 16]
4736 pmulhrsw m5, m3
4737 packuswb m1, m5
4738 movu [r0 + 185 * 16], m1
4739
4740 ; mode 13 [row 10]
4741 pslldq m7, 2
4742 pinsrb m7, [r2 + 7], 1
4743 pinsrb m7, [r2 + 11], 0
4744 pmaddubsw m1, m7, [r3 + 29 * 16]
4745 pmulhrsw m1, m3
4746 movu m4, [r2 + 5 + 32]
4747 psrldq m5, m4, 1
4748 punpcklbw m4, m5
4749 pmaddubsw m5, m4, [r3 + 29 * 16]
4750 pmulhrsw m5, m3
4751 packuswb m1, m5
4752 movu [r0 + 186 * 16], m1
4753
4754 ; mode 13 [row 11]
4755 pmaddubsw m1, m7, [r3 + 20 * 16]
4756 pmulhrsw m1, m3
4757 pmaddubsw m5, m4, [r3 + 20 * 16]
4758 pmulhrsw m5, m3
4759 packuswb m1, m5
4760 movu [r0 + 187 * 16], m1
4761
4762 ; mode 13 [row 12]
4763 pmaddubsw m1, m7, [r3 + 11 * 16]
4764 pmulhrsw m1, m3
4765 pmaddubsw m5, m4, [r3 + 11 * 16]
4766 pmulhrsw m5, m3
4767 packuswb m1, m5
4768 movu [r0 + 188 * 16], m1
4769
4770 ; mode 13 [row 13]
4771 pmaddubsw m1, m7, [r3 + 2 * 16]
4772 pmulhrsw m1, m3
4773 pmaddubsw m5, m4, [r3 + 2 * 16]
4774 pmulhrsw m5, m3
4775 packuswb m1, m5
4776 movu [r0 + 189 * 16], m1
4777
4778 ; mode 13 [row 14]
4779 pslldq m7, 2
4780 pinsrb m7, [r2 + 11], 1
4781 pinsrb m7, [r2 + 14], 0
4782 pmaddubsw m1, m7, [r3 + 25 * 16]
4783 pmulhrsw m1, m3
4784 movu m4, [r2 + 4 + 32]
4785 psrldq m5, m4, 1
4786 punpcklbw m4, m5
4787 pmaddubsw m5, m4, [r3 + 25 * 16]
4788 pmulhrsw m5, m3
4789 packuswb m1, m5
4790 movu [r0 + 190 * 16], m1
4791
4792 ; mode 13 [row 15]
4793 pmaddubsw m1, m7, [r3 + 16 * 16]
4794 pmulhrsw m1, m3
4795 pmaddubsw m5, m4, [r3 + 16 * 16]
4796 pmulhrsw m5, m3
4797 packuswb m1, m5
4798 movu [r0 + 191 * 16], m1
4799
4800 ; mode 17 [row 15]
4801 movu m0, [r2]
4802 pshufb m1, m0, [tab_S1]
4803 movu [r0 + 255 * 16], m1
4804 movu m2, [r2 + 32]
4805 pinsrb m2, [r2], 0
4806 movd [r0 + 255 * 16 + 12], m2
4807
4808 ; mode 18 [row 0]
4809 movu [r0 + 256 * 16], m0
4810
4811 ; mode 18 [row 1]
4812 pslldq m4, m0, 1
4813 pinsrb m4, [r2 + 1 + 32], 0
4814 movu [r0 + 257 * 16], m4
4815 pslldq m4, 1
4816 pinsrb m4, [r2 + 2 + 32], 0
4817 movu [r0 + 258 * 16], m4
4818 pslldq m4, 1
4819 pinsrb m4, [r2 + 3 + 32], 0
4820 movu [r0 + 259 * 16], m4
4821 pslldq m4, 1
4822 pinsrb m4, [r2 + 4 + 32], 0
4823 movu [r0 + 260 * 16], m4
4824 pslldq m4, 1
4825 pinsrb m4, [r2 + 5 + 32], 0
4826 movu [r0 + 261 * 16], m4
4827 pslldq m4, 1
4828 pinsrb m4, [r2 + 6 + 32], 0
4829 movu [r0 + 262 * 16], m4
4830 pslldq m4, 1
4831 pinsrb m4, [r2 + 7 + 32], 0
4832 movu [r0 + 263 * 16], m4
4833 pslldq m4, 1
4834 pinsrb m4, [r2 + 8 + 32], 0
4835 movu [r0 + 264 * 16], m4
4836 pslldq m4, 1
4837 pinsrb m4, [r2 + 9 + 32], 0
4838 movu [r0 + 265 * 16], m4
4839 pslldq m4, 1
4840 pinsrb m4, [r2 + 10 + 32], 0
4841 movu [r0 + 266 * 16], m4
4842 pslldq m4, 1
4843 pinsrb m4, [r2 + 11 + 32], 0
4844 movu [r0 + 267 * 16], m4
4845 pslldq m4, 1
4846 pinsrb m4, [r2 + 12 + 32], 0
4847 movu [r0 + 268 * 16], m4
4848 pslldq m4, 1
4849 pinsrb m4, [r2 + 13 + 32], 0
4850 movu [r0 + 269 * 16], m4
4851 pslldq m4, 1
4852 pinsrb m4, [r2 + 14 + 32], 0
4853 movu [r0 + 270 * 16], m4
4854 pslldq m4, 1
4855 pinsrb m4, [r2 + 15 + 32], 0
4856 movu [r0 + 271 * 16], m4
4857
4858 ; mode 19 [row 0]
4859 psrldq m2, m0, 1
4860 punpcklbw m0, m2
4861 movu m5, [r2 + 8]
4862 psrldq m6, m5, 1
4863 punpcklbw m5, m6
4864 pmaddubsw m4, m0, [r3 + 6 * 16]
4865 pmulhrsw m4, m3
4866 pmaddubsw m6, m5, [r3 + 6 * 16]
4867 pmulhrsw m6, m3
4868 packuswb m4, m6
4869 movu [r0 + 272 * 16], m4
4870
4871 ; mode 20 [row 0]
4872 pmaddubsw m4, m0, [r3 + 11 * 16]
4873 pmulhrsw m4, m3
4874 pmaddubsw m6, m5, [r3 + 11 * 16]
4875 pmulhrsw m6, m3
4876 packuswb m4, m6
4877 movu [r0 + 288 * 16], m4
4878
4879 ; mode 21 [row 0]
4880 pmaddubsw m4, m0, [r3 + 15 * 16]
4881 pmulhrsw m4, m3
4882 pmaddubsw m6, m5, [r3 + 15 * 16]
4883 pmulhrsw m6, m3
4884 packuswb m4, m6
4885 movu [r0 + 304 * 16], m4
4886
4887 ; mode 22 [row 0]
4888 pmaddubsw m4, m0, [r3 + 19 * 16]
4889 pmulhrsw m4, m3
4890 pmaddubsw m6, m5, [r3 + 19 * 16]
4891 pmulhrsw m6, m3
4892 packuswb m4, m6
4893 movu [r0 + 320 * 16], m4
4894
4895 ; mode 22 [row 1]
4896 pmaddubsw m4, m0, [r3 + 6 * 16]
4897 pmulhrsw m4, m3
4898 pmaddubsw m6, m5, [r3 + 6 * 16]
4899 pmulhrsw m6, m3
4900 packuswb m4, m6
4901 movu [r0 + 321 * 16], m4
4902
4903 ; mode 23 [row 0]
4904 pmaddubsw m4, m0, [r3 + 23 * 16]
4905 pmulhrsw m4, m3
4906 pmaddubsw m6, m5, [r3 + 23 * 16]
4907 pmulhrsw m6, m3
4908 packuswb m4, m6
4909 movu [r0 + 336 * 16], m4
4910
4911 ; mode 23 [row 1]
4912 pmaddubsw m4, m0, [r3 + 14 * 16]
4913 pmulhrsw m4, m3
4914 pmaddubsw m6, m5, [r3 + 14 * 16]
4915 pmulhrsw m6, m3
4916 packuswb m4, m6
4917 movu [r0 + 337 * 16], m4
4918
4919 ; mode 23 [row 2]
4920 pmaddubsw m4, m0, [r3 + 5 * 16]
4921 pmulhrsw m4, m3
4922 pmaddubsw m6, m5, [r3 + 5 * 16]
4923 pmulhrsw m6, m3
4924 packuswb m4, m6
4925 movu [r0 + 338 * 16], m4
4926
4927 ; mode 24 [row 0]
4928 pmaddubsw m4, m0, [r3 + 27 * 16]
4929 pmulhrsw m4, m3
4930 pmaddubsw m6, m5, [r3 + 27 * 16]
4931 pmulhrsw m6, m3
4932 packuswb m4, m6
4933 movu [r0 + 352 * 16], m4
4934
4935 ; mode 24 [row 1]
4936 pmaddubsw m4, m0, [r3 + 22 * 16]
4937 pmulhrsw m4, m3
4938 pmaddubsw m6, m5, [r3 + 22 * 16]
4939 pmulhrsw m6, m3
4940 packuswb m4, m6
4941 movu [r0 + 353 * 16], m4
4942
4943 ; mode 24 [row 2]
4944 pmaddubsw m4, m0, [r3 + 17 * 16]
4945 pmulhrsw m4, m3
4946 pmaddubsw m6, m5, [r3 + 17 * 16]
4947 pmulhrsw m6, m3
4948 packuswb m4, m6
4949 movu [r0 + 354 * 16], m4
4950
4951 ; mode 24 [row 3]
4952 pmaddubsw m4, m0, [r3 + 12 * 16]
4953 pmulhrsw m4, m3
4954 pmaddubsw m6, m5, [r3 + 12 * 16]
4955 pmulhrsw m6, m3
4956 packuswb m4, m6
4957 movu [r0 + 355 * 16], m4
4958
4959 ; mode 24 [row 4]
4960 pmaddubsw m4, m0, [r3 + 7 * 16]
4961 pmulhrsw m4, m3
4962 pmaddubsw m6, m5, [r3 + 7 * 16]
4963 pmulhrsw m6, m3
4964 packuswb m4, m6
4965 movu [r0 + 356 * 16], m4
4966
4967 ; mode 24 [row 5]
4968 pmaddubsw m4, m0, [r3 + 2 * 16]
4969 pmulhrsw m4, m3
4970 pmaddubsw m6, m5, [r3 + 2 * 16]
4971 pmulhrsw m6, m3
4972 packuswb m4, m6
4973 movu [r0 + 357 * 16], m4
4974
4975 ; mode 24 [row 6 - first half]
4976 pslldq m7, m0, 2
4977 pinsrb m7, [r2 + 0], 1
4978 pinsrb m7, [r2 + 6 + 32], 0
4979 pmaddubsw m4, m7, [r3 + 29 * 16]
4980 pmulhrsw m4, m3
4981 packuswb m4, m4
4982 movh [r0 + 358 * 16], m4
4983
4984 ; mode 24 [row 7 - first half]
4985 pmaddubsw m4, m7, [r3 + 24 * 16]
4986 pmulhrsw m4, m3
4987 packuswb m4, m4
4988 movh [r0 + 359 * 16], m4
4989
4990 ; mode 24 [row 8 - first half]
4991 pmaddubsw m4, m7, [r3 + 19 * 16]
4992 pmulhrsw m4, m3
4993 packuswb m4, m4
4994 movh [r0 + 360 * 16], m4
4995
4996 ; mode 24 [row 9 - first half]
4997 pmaddubsw m4, m7, [r3 + 14 * 16]
4998 pmulhrsw m4, m3
4999 packuswb m4, m4
5000 movh [r0 + 361 * 16], m4
5001
5002 ; mode 24 [row 10 - first half]
5003 pmaddubsw m4, m7, [r3 + 9 * 16]
5004 pmulhrsw m4, m3
5005 packuswb m4, m4
5006 movh [r0 + 362 * 16], m4
5007
5008 ; mode 24 [row 11 - first half]
5009 pmaddubsw m4, m7, [r3 + 4 * 16]
5010 pmulhrsw m4, m3
5011 packuswb m4, m4
5012 movh [r0 + 363 * 16], m4
5013
5014 ; mode 24 [row 12 - first half]
5015 pslldq m7, 2
5016 pinsrb m7, [r2 + 6 + 32], 1
5017 pinsrb m7, [r2 + 13 + 32], 0
5018 pmaddubsw m4, m7, [r3 + 31 * 16]
5019 pmulhrsw m4, m3
5020 packuswb m4, m4
5021 movh [r0 + 364 * 16], m4
5022
5023 ; mode 24 [row 13 - first half]
5024 pmaddubsw m4, m7, [r3 + 26 * 16]
5025 pmulhrsw m4, m3
5026 packuswb m4, m4
5027 movh [r0 + 365 * 16], m4
5028
5029 ; mode 24 [row 14 - first half]
5030 pmaddubsw m4, m7, [r3 + 21 * 16]
5031 pmulhrsw m4, m3
5032 packuswb m4, m4
5033 movh [r0 + 366 * 16], m4
5034
5035 ; mode 24 [row 15 - first half]
5036 pmaddubsw m4, m7, [r3 + 16 * 16]
5037 pmulhrsw m4, m3
5038 packuswb m4, m4
5039 movh [r0 + 367 * 16], m4
5040
5041 ; mode 23 [row 3 - first half]
5042 pslldq m7, m0, 2
5043 pinsrb m7, [r2 + 0], 1
5044 pinsrb m7, [r2 + 4 + 32], 0
5045 pmaddubsw m4, m7, [r3 + 28 * 16]
5046 pmulhrsw m4, m3
5047 packuswb m4, m4
5048 movh [r0 + 339 * 16], m4
5049
5050 ; mode 23 [row 4 - first half]
5051 pmaddubsw m4, m7, [r3 + 19 * 16]
5052 pmulhrsw m4, m3
5053 packuswb m4, m4
5054 movh [r0 + 340 * 16], m4
5055
5056 ; mode 23 [row 5 - first half]
5057 pmaddubsw m4, m7, [r3 + 10 * 16]
5058 pmulhrsw m4, m3
5059 packuswb m4, m4
5060 movh [r0 + 341 * 16], m4
5061
5062 ; mode 23 [row 6 - first half]
5063 pmaddubsw m4, m7, [r3 + 1 * 16]
5064 pmulhrsw m4, m3
5065 packuswb m4, m4
5066 movh [r0 + 342 * 16], m4
5067
5068 ; mode 23 [row 7 - first half]
5069 pslldq m7, 2
5070 pinsrb m7, [r2 + 4 + 32], 1
5071 pinsrb m7, [r2 + 7 + 32], 0
5072 pmaddubsw m4, m7, [r3 + 24 * 16]
5073 pmulhrsw m4, m3
5074 packuswb m4, m4
5075 movh [r0 + 343 * 16], m4
5076
5077 ; mode 23 [row 8 - first half]
5078 pmaddubsw m4, m7, [r3 + 15 * 16]
5079 pmulhrsw m4, m3
5080 packuswb m4, m4
5081 movh [r0 + 344 * 16], m4
5082
5083 ; mode 23 [row 9 - first half]
5084 pmaddubsw m4, m7, [r3 + 6 * 16]
5085 pmulhrsw m4, m3
5086 packuswb m4, m4
5087 movh [r0 + 345 * 16], m4
5088
5089 ; mode 23 [row 10 - first half]
5090 pslldq m7, 2
5091 pinsrb m7, [r2 + 7 + 32], 1
5092 pinsrb m7, [r2 + 11 + 32], 0
5093 pmaddubsw m4, m7, [r3 + 29 * 16]
5094 pmulhrsw m4, m3
5095 packuswb m4, m4
5096 movh [r0 + 346 * 16], m4
5097
5098 ; mode 23 [row 11 - first half]
5099 pmaddubsw m4, m7, [r3 + 20 * 16]
5100 pmulhrsw m4, m3
5101 packuswb m4, m4
5102 movh [r0 + 347 * 16], m4
5103
5104 ; mode 23 [row 12 - first half]
5105 pmaddubsw m4, m7, [r3 + 11 * 16]
5106 pmulhrsw m4, m3
5107 packuswb m4, m4
5108 movh [r0 + 348 * 16], m4
5109
5110 ; mode 23 [row 13 - first half]
5111 pmaddubsw m4, m7, [r3 + 2 * 16]
5112 pmulhrsw m4, m3
5113 packuswb m4, m4
5114 movh [r0 + 349 * 16], m4
5115
5116 ; mode 23 [row 14 - first half]
5117 pslldq m7, 2
5118 pinsrb m7, [r2 + 11 + 32], 1
5119 pinsrb m7, [r2 + 14 + 32], 0
5120 pmaddubsw m4, m7, [r3 + 25 * 16]
5121 pmulhrsw m4, m3
5122 packuswb m4, m4
5123 movh [r0 + 350 * 16], m4
5124
5125 ; mode 23 [row 15 - first half]
5126 pmaddubsw m4, m7, [r3 + 16 * 16]
5127 pmulhrsw m4, m3
5128 packuswb m4, m4
5129 movh [r0 + 351 * 16], m4
5130
5131 ; mode 21 [row 15 - second half]
5132 pmaddubsw m4, m0, [r3 + 16 * 16]
5133 pmulhrsw m4, m3
5134 packuswb m4, m4
5135 movh [r0 + 319 * 16 + 8], m4
5136 ; mode 21 [row 15 - second half] end
5137
5138 ; mode 20 [row 1 - first half]
5139 pslldq m7, m0, 2
5140 pinsrb m7, [r2 + 0], 1
5141 pinsrb m7, [r2 + 2 + 32], 0
5142 pmaddubsw m4, m7, [r3 + 22 * 16]
5143 pmulhrsw m4, m3
5144 packuswb m4, m4
5145 movh [r0 + 289 * 16], m4
5146
5147 ; mode 20 [row 2 - first half]
5148 pmaddubsw m4, m7, [r3 + 1 * 16]
5149 pmulhrsw m4, m3
5150 packuswb m4, m4
5151 movh [r0 + 290 * 16], m4
5152
5153 ; mode 21 [row 1 - first half]
5154 pmaddubsw m4, m7, [r3 + 30 * 16]
5155 pmulhrsw m4, m3
5156 packuswb m4, m4
5157 movh [r0 + 305 * 16], m4
5158
5159 ; mode 21 [row 2 - first half]
5160 pmaddubsw m4, m7, [r3 + 13 * 16]
5161 pmulhrsw m4, m3
5162 packuswb m4, m4
5163 movh [r0 + 306 * 16], m4
5164
5165 ; mode 22 [row 2 - first half]
5166 pmaddubsw m4, m7, [r3 + 25 * 16]
5167 pmulhrsw m4, m3
5168 packuswb m4, m4
5169 movh [r0 + 322 * 16], m4
5170
5171 ; mode 22 [row 3 - first half]
5172 pmaddubsw m4, m7, [r3 + 12 * 16]
5173 pmulhrsw m4, m3
5174 packuswb m4, m4
5175 movh [r0 + 323 * 16], m4
5176
5177 ; mode 22 [row 4 - first half]
5178 pslldq m1, m7, 2
5179 pinsrb m1, [r2 + 2 + 32], 1
5180 pinsrb m1, [r2 + 5 + 32], 0
5181 pmaddubsw m4, m1, [r3 + 31 * 16]
5182 pmulhrsw m4, m3
5183 packuswb m4, m4
5184 movh [r0 + 324 * 16], m4
5185
5186 ; mode 22 [row 5 - first half]
5187 pmaddubsw m4, m1, [r3 + 18 * 16]
5188 pmulhrsw m4, m3
5189 packuswb m4, m4
5190 movh [r0 + 325 * 16], m4
5191
5192 ; mode 22 [row 6 - first half]
5193 pmaddubsw m4, m1, [r3 + 5 * 16]
5194 pmulhrsw m4, m3
5195 packuswb m4, m4
5196 movh [r0 + 326 * 16], m4
5197
5198 ; mode 22 [row 7 - first half]
5199 pslldq m1, 2
5200 pinsrb m1, [r2 + 5 + 32], 1
5201 pinsrb m1, [r2 + 7 + 32], 0
5202 pmaddubsw m4, m1, [r3 + 24 * 16]
5203 pmulhrsw m4, m3
5204 packuswb m4, m4
5205 movh [r0 + 327 * 16], m4
5206
5207 ; mode 22 [row 8 - first half]
5208 pmaddubsw m4, m1, [r3 + 11 * 16]
5209 pmulhrsw m4, m3
5210 packuswb m4, m4
5211 movh [r0 + 328 * 16], m4
5212
5213 ; mode 22 [row 9 - first half]
5214 pslldq m1, 2
5215 pinsrb m1, [r2 + 7 + 32], 1
5216 pinsrb m1, [r2 + 10 + 32], 0
5217 pmaddubsw m4, m1, [r3 + 30 * 16]
5218 pmulhrsw m4, m3
5219 packuswb m4, m4
5220 movh [r0 + 329 * 16], m4
5221
5222 ; mode 22 [row 10 - first half]
5223 pmaddubsw m4, m1, [r3 + 17 * 16]
5224 pmulhrsw m4, m3
5225 packuswb m4, m4
5226 movh [r0 + 330 * 16], m4
5227
5228 ; mode 22 [row 11 - first half]
5229 pmaddubsw m4, m1, [r3 + 4 * 16]
5230 pmulhrsw m4, m3
5231 packuswb m4, m4
5232 movh [r0 + 331 * 16], m4
5233
5234 ; mode 22 [row 12 - first half]
5235 pslldq m1, 2
5236 pinsrb m1, [r2 + 10 + 32], 1
5237 pinsrb m1, [r2 + 12 + 32], 0
5238 pmaddubsw m4, m1, [r3 + 23 * 16]
5239 pmulhrsw m4, m3
5240 packuswb m4, m4
5241 movh [r0 + 332 * 16], m4
5242
5243 ; mode 22 [row 13 - first half]
5244 pmaddubsw m4, m1, [r3 + 10 * 16]
5245 pmulhrsw m4, m3
5246 packuswb m4, m4
5247 movh [r0 + 333 * 16], m4
5248
5249 ; mode 22 [row 14 - first half]
5250 pslldq m1, 2
5251 pinsrb m1, [r2 + 12 + 32], 1
5252 pinsrb m1, [r2 + 15 + 32], 0
5253 pmaddubsw m4, m1, [r3 + 29 * 16]
5254 pmulhrsw m4, m3
5255 packuswb m4, m4
5256 movh [r0 + 334 * 16], m4
5257
5258 ; mode 22 [row 15 - first half]
5259 pmaddubsw m4, m1, [r3 + 16 * 16]
5260 pmulhrsw m4, m3
5261 packuswb m4, m4
5262 movh [r0 + 335 * 16], m4
5263
5264 ; mode 21 [row 3 - first half]
5265 pslldq m6, m7, 2
5266 pinsrb m6, [r2 + 2 + 32], 1
5267 pinsrb m6, [r2 + 4 + 32], 0
5268 pmaddubsw m4, m6, [r3 + 28 * 16]
5269 pmulhrsw m4, m3
5270 packuswb m4, m4
5271 movh [r0 + 307 * 16], m4
5272
5273 ; mode 21 [row 4 - first half]
5274 pmaddubsw m4, m6, [r3 + 11 * 16]
5275 pmulhrsw m4, m3
5276 packuswb m4, m4
5277 movh [r0 + 308 * 16], m4
5278
5279 ; mode 21 [row 5 - first half]
5280 pslldq m6, 2
5281 pinsrb m6, [r2 + 4 + 32], 1
5282 pinsrb m6, [r2 + 6 + 32], 0
5283 pmaddubsw m4, m6, [r3 + 26 * 16]
5284 pmulhrsw m4, m3
5285 packuswb m4, m4
5286 movh [r0 + 309 * 16], m4
5287
5288 ; mode 21 [row 6 - first half]
5289 pmaddubsw m4, m6, [r3 + 9 * 16]
5290 pmulhrsw m4, m3
5291 packuswb m4, m4
5292 movh [r0 + 310 * 16], m4
5293
5294 ; mode 21 [row 7 - first half]
5295 pslldq m6, 2
5296 pinsrb m6, [r2 + 6 + 32], 1
5297 pinsrb m6, [r2 + 8 + 32], 0
5298 pmaddubsw m4, m6, [r3 + 24 * 16]
5299 pmulhrsw m4, m3
5300 packuswb m4, m4
5301 movh [r0 + 311 * 16], m4
5302
5303 ; mode 21 [row 8 - first half]
5304 pmaddubsw m4, m6, [r3 + 7 * 16]
5305 pmulhrsw m4, m3
5306 packuswb m4, m4
5307 movh [r0 + 312 * 16], m4
5308
5309 ; mode 21 [row 9 - first half]
5310 pslldq m6, 2
5311 pinsrb m6, [r2 + 8 + 32], 1
5312 pinsrb m6, [r2 + 9 + 32], 0
5313 pmaddubsw m4, m6, [r3 + 22 * 16]
5314 pmulhrsw m4, m3
5315 packuswb m4, m4
5316 movh [r0 + 313 * 16], m4
5317
5318 ; mode 21 [row 10 - first half]
5319 pmaddubsw m4, m6, [r3 + 5 * 16]
5320 pmulhrsw m4, m3
5321 packuswb m4, m4
5322 movh [r0 + 314 * 16], m4
5323
5324 ; mode 21 [row 11 - first half]
5325 pslldq m6, 2
5326 pinsrb m6, [r2 + 9 + 32], 1
5327 pinsrb m6, [r2 + 11 + 32], 0
5328 pmaddubsw m4, m6, [r3 + 20 * 16]
5329 pmulhrsw m4, m3
5330 packuswb m4, m4
5331 movh [r0 + 315 * 16], m4
5332
5333 ; mode 21 [row 12 - first half]
5334 pmaddubsw m4, m6, [r3 + 3 * 16]
5335 pmulhrsw m4, m3
5336 packuswb m4, m4
5337 movh [r0 + 316 * 16], m4
5338
5339 ; mode 21 [row 13 - first half]
5340 pslldq m6, 2
5341 pinsrb m6, [r2 + 11 + 32], 1
5342 pinsrb m6, [r2 + 13 + 32], 0
5343 pmaddubsw m4, m6, [r3 + 18 * 16]
5344 pmulhrsw m4, m3
5345 packuswb m4, m4
5346 movh [r0 + 317 * 16], m4
5347
5348 ; mode 21 [row 14 - first half]
5349 pmaddubsw m4, m6, [r3 + 1 * 16]
5350 pmulhrsw m4, m3
5351 packuswb m4, m4
5352 movh [r0 + 318 * 16], m4
5353
5354 ; mode 21 [row 15 - first half]
5355 pslldq m6, 2
5356 pinsrb m6, [r2 + 32 + 13], 1
5357 pinsrb m6, [r2 + 32 + 15], 0
5358 pmaddubsw m4, m6, [r3 + 16 * 16]
5359 pmulhrsw m4, m3
5360 packuswb m4, m4
5361 movh [r0 + 319 * 16], m4
5362
5363 ; mode 20 [row 13 - second half]
5364 pmaddubsw m4, m7, [r3 + 26 * 16]
5365 pmulhrsw m4, m3
5366 packuswb m4, m4
5367 movh [r0 + 301 * 16 + 8], m4
5368 ; mode 20 [row 13 - second half] end
5369
5370 ; mode 20 [row 14 - second half]
5371 pmaddubsw m4, m7, [r3 + 5 * 16]
5372 pmulhrsw m4, m3
5373 packuswb m4, m4
5374 movh [r0 + 302 * 16 + 8], m4
5375 ; mode 20 [row 14 - second half] end
5376
5377 ; mode 20 [row 3 - first half]
5378 pslldq m7, 2
5379 pinsrb m7, [r2 + 32 + 2], 1
5380 pinsrb m7, [r2 + 32 + 3], 0
5381 pmaddubsw m4, m7, [r3 + 12 * 16]
5382 pmulhrsw m4, m3
5383 packuswb m4, m4
5384 movh [r0 + 291 * 16], m4
5385
5386 ; mode 20 [row 15 - second half]
5387 pmaddubsw m4, m7, [r3 + 16 * 16]
5388 pmulhrsw m4, m3
5389 packuswb m4, m4
5390 movh [r0 + 303 * 16 + 8], m4
5391 ; mode 20 [row 15 - second half] end
5392
5393 ; mode 20 [row 4 - first half]
5394 pslldq m7, 2
5395 pinsrb m7, [r2 + 32 + 3], 1
5396 pinsrb m7, [r2 + 32 + 5], 0
5397 pmaddubsw m4, m7, [r3 + 23 * 16]
5398 pmulhrsw m4, m3
5399 packuswb m4, m4
5400 movh [r0 + 292 * 16], m4
5401
5402 ; mode 20 [row 5 - first half]
5403 pmaddubsw m4, m7, [r3 + 2 * 16]
5404 pmulhrsw m4, m3
5405 packuswb m4, m4
5406 movh [r0 + 293 * 16], m4
5407
5408 ; mode 20 [row 6 - first half]
5409 pslldq m7, 2
5410 pinsrb m7, [r2 + 32 + 5], 1
5411 pinsrb m7, [r2 + 32 + 6], 0
5412 pmaddubsw m4, m7, [r3 + 13 * 16]
5413 pmulhrsw m4, m3
5414 packuswb m4, m4
5415 movh [r0 + 294 * 16], m4
5416
5417 ; mode 20 [row 7 - first half]
5418 pslldq m7, 2
5419 pinsrb m7, [r2 + 32 + 6], 1
5420 pinsrb m7, [r2 + 32 + 8], 0
5421 pmaddubsw m4, m7, [r3 + 24 * 16]
5422 pmulhrsw m4, m3
5423 packuswb m4, m4
5424 movh [r0 + 295 * 16], m4
5425
5426 ; mode 20 [row 8 - first half]
5427 pmaddubsw m4, m7, [r3 + 3 * 16]
5428 pmulhrsw m4, m3
5429 packuswb m4, m4
5430 movh [r0 + 296 * 16], m4
5431
5432 ; mode 20 [row 9 - first half]
5433 pslldq m7, 2
5434 pinsrb m7, [r2 + 32 + 8], 1
5435 pinsrb m7, [r2 + 32 + 9], 0
5436 pmaddubsw m4, m7, [r3 + 14 * 16]
5437 pmulhrsw m4, m3
5438 packuswb m4, m4
5439 movh [r0 + 297 * 16], m4
5440
5441 ; mode 20 [row 10 - first half]
5442 pslldq m7, 2
5443 pinsrb m7, [r2 + 32 + 9], 1
5444 pinsrb m7, [r2 + 32 + 11], 0
5445 pmaddubsw m4, m7, [r3 + 25 * 16]
5446 pmulhrsw m4, m3
5447 packuswb m4, m4
5448 movh [r0 + 298 * 16], m4
5449
5450 ; mode 20 [row 11 - first half]
5451 pmaddubsw m4, m7, [r3 + 4 * 16]
5452 pmulhrsw m4, m3
5453 packuswb m4, m4
5454 movh [r0 + 299 * 16], m4
5455
5456 ; mode 20 [row 12 - first half]
5457 movu m1, [r3 + 15 * 16]
5458 pslldq m7, 2
5459 pinsrb m7, [r2 + 32 + 11], 1
5460 pinsrb m7, [r2 + 32 + 12], 0
5461 pmaddubsw m4, m7, [r3 + 15 * 16]
5462 pmulhrsw m4, m3
5463 packuswb m4, m4
5464 movh [r0 + 300 * 16], m4
5465
5466 ; mode 20 [row 13 - first half]
5467 pslldq m7, 2
5468 pinsrb m7, [r2 + 32 + 12], 1
5469 pinsrb m7, [r2 + 32 + 14], 0
5470 pmaddubsw m4, m7, [r3 + 26 * 16]
5471 pmulhrsw m4, m3
5472 packuswb m4, m4
5473 movh [r0 + 301 * 16], m4
5474
5475 ; mode 20 [row 14 - first half]
5476 pmaddubsw m4, m7, [r3 + 5 * 16]
5477 pmulhrsw m4, m3
5478 packuswb m4, m4
5479 movh [r0 + 302 * 16], m4
5480
5481 ; mode 20 [row 15 - first half]
5482 pslldq m7, 2
5483 pinsrb m7, [r2 + 32 + 14], 1
5484 pinsrb m7, [r2 + 32 + 15], 0
5485 pmaddubsw m4, m7, [r3 + 16 * 16]
5486 pmulhrsw m4, m3
5487 packuswb m4, m4
5488 movh [r0 + 303 * 16], m4
5489
5490 ; mode 19 [row 1]
5491 pslldq m0, 2
5492 pinsrb m0, [r2], 1
5493 pinsrb m0, [r2 + 32 + 1], 0
5494 pslldq m5, 2
5495 pinsrb m5, [r2 + 8], 1
5496 pinsrb m5, [r2 + 7], 0
5497
5498 ; mode 20 [row 1 - second half]
5499 pmaddubsw m4, m5, [r3 + 22 * 16]
5500 pmulhrsw m4, m3
5501 packuswb m4, m4
5502 movh [r0 + 289 * 16 + 8], m4
5503 ; mode 20 [row 1 - second half] end
5504
5505 ; mode 20 [row 2 - second half]
5506 pmaddubsw m4, m5, [r3 + 1 * 16]
5507 pmulhrsw m4, m3
5508 packuswb m4, m4
5509 movh [r0 + 290 * 16 + 8], m4
5510 ; mode 20 [row 2 - second half] end
5511
5512 ; mode 21 [row 1 - second half]
5513 pmaddubsw m4, m5, [r3 + 30 * 16]
5514 pmulhrsw m4, m3
5515 packuswb m4, m4
5516 movh [r0 + 305 * 16 + 8], m4
5517 ; mode 21 [row 1 - second half] end
5518
5519 ; mode 21 [row 2 - second half]
5520 pmaddubsw m4, m5, [r3 + 13 * 16]
5521 pmulhrsw m4, m3
5522 packuswb m4, m4
5523 movh [r0 + 306 * 16 + 8], m4
5524 ; mode 21 [row 2 - second half] end
5525
5526 ; mode 21 [row 4 - second half] (NOTE: written to the row 3 slot, 307 * 16 + 8; a later mode 21 row 3 store overwrites it)
5527 pmaddubsw m4, m5, [r3 + 11 * 16]
5528 pmulhrsw m4, m3
5529 packuswb m4, m4
5530 movh [r0 + 307 * 16 + 8], m4
5531 ; mode 21 [row 4 - second half] end
5532
5533 ; mode 22 [row 2 - second half]
5534 pmaddubsw m4, m5, [r3 + 25 * 16]
5535 pmulhrsw m4, m3
5536 packuswb m4, m4
5537 movh [r0 + 322 * 16 + 8], m4
5538 ; mode 22 [row 2 - second half] end
5539
5540 ; mode 22 [row 3 - second half]
5541 pmaddubsw m4, m5, [r3 + 12 * 16]
5542 pmulhrsw m4, m3
5543 packuswb m4, m4
5544 movh [r0 + 323 * 16 + 8], m4
5545 ; mode 22 [row 3 - second half] end
5546
5547 ; mode 23 [row 3 - second half]
5548 pmaddubsw m4, m5, [r3 + 28 * 16]
5549 pmulhrsw m4, m3
5550 packuswb m4, m4
5551 movh [r0 + 339 * 16 + 8], m4
5552 ; mode 23 [row 3 - second half] end
5553
5554 ; mode 23 [row 4 - second half]
5555 pmaddubsw m4, m5, [r3 + 19 * 16]
5556 pmulhrsw m4, m3
5557 packuswb m4, m4
5558 movh [r0 + 340 * 16 + 8], m4
5559 ; mode 23 [row 4 - second half] end
5560
5561 ; mode 23 [row 5 - second half]
5562 pmaddubsw m4, m5, [r3 + 10 * 16]
5563 pmulhrsw m4, m3
5564 packuswb m4, m4
5565 movh [r0 + 341 * 16 + 8], m4
5566 ; mode 23 [row 5 - second half] end
5567
5568 ; mode 23 [row 6 - second half]
5569 pmaddubsw m4, m5, [r3 + 1 * 16]
5570 pmulhrsw m4, m3
5571 packuswb m4, m4
5572 movh [r0 + 342 * 16 + 8], m4
5573 ; mode 23 [row 6 - second half] end
5574
5575 ; mode 24 [row 6 - second half]
5576 pmaddubsw m4, m5, [r3 + 29 * 16]
5577 pmulhrsw m4, m3
5578 packuswb m4, m4
5579 movh [r0 + 358 * 16 + 8], m4
5580 ; mode 24 [row 6 - second half] end
5581
5582 ; mode 24 [row 7 - second half]
5583 pmaddubsw m4, m5, [r3 + 24 * 16]
5584 pmulhrsw m4, m3
5585 packuswb m4, m4
5586 movh [r0 + 359 * 16 + 8], m4
5587 ; mode 24 [row 7 - second half] end
5588
5589 ; mode 24 [row 8 - second half]
5590 pmaddubsw m4, m5, [r3 + 19 * 16]
5591 pmulhrsw m4, m3
5592 packuswb m4, m4
5593 movh [r0 + 360 * 16 + 8], m4
5594 ; mode 24 [row 8 - second half] end
5595
5596 ; mode 24 [row 9 - second half]
5597 pmaddubsw m4, m5, [r3 + 14 * 16]
5598 pmulhrsw m4, m3
5599 packuswb m4, m4
5600 movh [r0 + 361 * 16 + 8], m4
5601 ; mode 24 [row 9 - second half] end
5602
5603 ; mode 24 [row 10 - second half]
5604 pmaddubsw m4, m5, [r3 + 9 * 16]
5605 pmulhrsw m4, m3
5606 packuswb m4, m4
5607 movh [r0 + 362 * 16 + 8], m4
5608 ; mode 24 [row 10 - second half] end
5609
5610 ; mode 24 [row 11 - second half]
5611 pmaddubsw m4, m5, [r3 + 4 * 16]
5612 pmulhrsw m4, m3
5613 packuswb m4, m4
5614 movh [r0 + 363 * 16 + 8], m4
5615 ; mode 24 [row 11 - second half] end
5616
5617 pmaddubsw m4, m0, [r3 + 12 * 16]
5618 pmulhrsw m4, m3
5619 pmaddubsw m6, m5, [r3 + 12 * 16]
5620 pmulhrsw m6, m3
5621 packuswb m4, m6
5622 movu [r0 + 273 * 16], m4
5623
5624 ; mode 19 [row 2]
5625 pslldq m0, 2
5626 pinsrb m0, [r2 + 32 + 1], 1
5627 pinsrb m0, [r2 + 32 + 2], 0
5628 pslldq m5, 2
5629 pinsrb m5, [r2 + 7], 1
5630 pinsrb m5, [r2 + 6], 0
5631
5632 ; mode 20 [row 3 - second half]
5633 pmaddubsw m4, m5, [r3 + 12 * 16]
5634 pmulhrsw m4, m3
5635 packuswb m4, m4
5636 movh [r0 + 291 * 16 + 8], m4
5637 ; mode 20 [row 3 - second half] end
5638
5639 ; mode 21 [row 3 - second half]
5640 pmaddubsw m4, m5, [r3 + 28 * 16]
5641 pmulhrsw m4, m3
5642 packuswb m4, m4
5643 movh [r0 + 307 * 16 + 8], m4
5644 ; mode 21 [row 3 - second half] end
5645
5646 ; mode 21 [row 4 - second half]
5647 pmaddubsw m4, m5, [r3 + 11 * 16]
5648 pmulhrsw m4, m3
5649 packuswb m4, m4
5650 movh [r0 + 308 * 16 + 8], m4
5651 ; mode 21 [row 4 - second half] end
5652
5653 ; mode 22 [row 4 - second half]
5654 pmaddubsw m4, m5, [r3 + 31 * 16]
5655 pmulhrsw m4, m3
5656 packuswb m4, m4
5657 movh [r0 + 324 * 16 + 8], m4
5658 ; mode 22 [row 4 - second half] end
5659
5660 ; mode 22 [row 5 - second half]
5661 pmaddubsw m4, m5, [r3 + 18 * 16]
5662 pmulhrsw m4, m3
5663 packuswb m4, m4
5664 movh [r0 + 325 * 16 + 8], m4
5665 ; mode 22 [row 5 - second half] end
5666
5667 ; mode 22 [row 6 - second half]
5668 pmaddubsw m4, m5, [r3 + 5 * 16]
5669 pmulhrsw m4, m3
5670 packuswb m4, m4
5671 movh [r0 + 326 * 16 + 8], m4
5672 ; mode 22 [row 6 - second half] end
5673
5674 ; mode 23 [row 7 - second half]
5675 pmaddubsw m4, m5, [r3 + 24 * 16]
5676 pmulhrsw m4, m3
5677 packuswb m4, m4
5678 movh [r0 + 343 * 16 + 8], m4
5679 ; mode 23 [row 7 - second half] end
5680
5681 ; mode 23 [row 8 - second half]
5682 pmaddubsw m4, m5, [r3 + 15 * 16]
5683 pmulhrsw m4, m3
5684 packuswb m4, m4
5685 movh [r0 + 344 * 16 + 8], m4
5686 ; mode 23 [row 8 - second half] end
5687
5688 ; mode 23 [row 9 - second half]
5689 pmaddubsw m4, m5, [r3 + 6 * 16]
5690 pmulhrsw m4, m3
5691 packuswb m4, m4
5692 movh [r0 + 345 * 16 + 8], m4
5693 ; mode 23 [row 9 - second half] end
5694
5695 ; mode 24 [row 12 - second half]
5696 pmaddubsw m4, m5, [r3 + 31 * 16]
5697 pmulhrsw m4, m3
5698 packuswb m4, m4
5699 movh [r0 + 364 * 16 + 8], m4
5700 ; mode 24 [row 12 - second half] end
5701
5702 ; mode 24 [row 13 - second half]
5703 pmaddubsw m4, m5, [r3 + 26 * 16]
5704 pmulhrsw m4, m3
5705 packuswb m4, m4
5706 movh [r0 + 365 * 16 + 8], m4
5707 ; mode 24 [row 13 - second half] end
5708
5709 ; mode 24 [row 14 - second half]
5710 pmaddubsw m4, m5, [r3 + 21 * 16]
5711 pmulhrsw m4, m3
5712 packuswb m4, m4
5713 movh [r0 + 366 * 16 + 8], m4
5714 ; mode 24 [row 14 - second half] end
5715
5716 ; mode 24 [row 15 - second half]
5717 pmaddubsw m4, m5, [r3 + 16 * 16]
5718 pmulhrsw m4, m3
5719 packuswb m4, m4
5720 movh [r0 + 367 * 16 + 8], m4
5721 ; mode 24 [row 15 - second half] end
5722
5723 pmaddubsw m4, m0, [r3 + 18 * 16]
5724 pmulhrsw m4, m3
5725 pmaddubsw m6, m5, [r3 + 18 * 16]
5726 pmulhrsw m6, m3
5727 packuswb m4, m6
5728 movu [r0 + 274 * 16], m4
5729
5730 ; mode 19 [row 3]
5731 pslldq m0, 2
5732 pinsrb m0, [r2 + 32 + 2], 1
5733 pinsrb m0, [r2 + 32 + 4], 0
5734 pslldq m5, 2
5735 pinsrb m5, [r2 + 6], 1
5736 pinsrb m5, [r2 + 5], 0
5737
5738 ; mode 20 [row 4 - second half]
5739 pmaddubsw m4, m5, [r3 + 23 * 16]
5740 pmulhrsw m4, m3
5741 packuswb m4, m4
5742 movh [r0 + 292 * 16 + 8], m4
5743 ; mode 20 [row 4 - second half] end
5744
5745 ; mode 20 [row 5 - second half]
5746 pmaddubsw m4, m5, [r3 + 2 * 16]
5747 pmulhrsw m4, m3
5748 packuswb m4, m4
5749 movh [r0 + 293 * 16 + 8], m4
5750 ; mode 20 [row 5 - second half] end
5751
5752 ; mode 21 [row 5 - second half]
5753 pmaddubsw m4, m5, [r3 + 26 * 16]
5754 pmulhrsw m4, m3
5755 packuswb m4, m4
5756 movh [r0 + 309 * 16 + 8], m4
5757 ; mode 21 [row 5 - second half] end
5758
5759 ; mode 21 [row 6 - second half]
5760 pmaddubsw m4, m5, [r3 + 9 * 16]
5761 pmulhrsw m4, m3
5762 packuswb m4, m4
5763 movh [r0 + 310 * 16 + 8], m4
5764 ; mode 21 [row 6 - second half] end
5765
5766 ; mode 22 [row 7 - second half]
5767 pmaddubsw m4, m5, [r3 + 24 * 16]
5768 pmulhrsw m4, m3
5769 packuswb m4, m4
5770 movh [r0 + 327 * 16 + 8], m4
5771 ; mode 22 [row 7 - second half] end
5772
5773 ; mode 22 [row 8 - second half]
5774 pmaddubsw m4, m5, [r3 + 11 * 16]
5775 pmulhrsw m4, m3
5776 packuswb m4, m4
5777 movh [r0 + 328 * 16 + 8], m4
5778 ; mode 22 [row 8 - second half] end
5779
5780 ; mode 23 [row 10 - second half]
5781 pmaddubsw m4, m5, [r3 + 29 * 16]
5782 pmulhrsw m4, m3
5783 packuswb m4, m4
5784 movh [r0 + 346 * 16 + 8], m4
5785 ; mode 23 [row 10 - second half] end
5786
5787 ; mode 23 [row 11 - second half]
5788 pmaddubsw m4, m5, [r3 + 20 * 16]
5789 pmulhrsw m4, m3
5790 packuswb m4, m4
5791 movh [r0 + 347 * 16 + 8], m4
5792 ; mode 23 [row 11 - second half] end
5793
5794 ; mode 23 [row 12 - second half]
5795 pmaddubsw m4, m5, [r3 + 11 * 16]
5796 pmulhrsw m4, m3
5797 packuswb m4, m4
5798 movh [r0 + 348 * 16 + 8], m4
5799 ; mode 23 [row 12 - second half] end
5800
5801 ; mode 23 [row 13 - second half]
5802 pmaddubsw m4, m5, [r3 + 2 * 16]
5803 pmulhrsw m4, m3
5804 packuswb m4, m4
5805 movh [r0 + 349 * 16 + 8], m4
5806 ; mode 23 [row 13 - second half] end
5807
5808 pmaddubsw m4, m0, [r3 + 24 * 16]
5809 pmulhrsw m4, m3
5810 pmaddubsw m6, m5, [r3 + 24 * 16]
5811 pmulhrsw m6, m3
5812 packuswb m4, m6
5813 movu [r0 + 275 * 16], m4
5814
5815 ; mode 19 [row 4]
5816 pslldq m0, 2
5817 pinsrb m0, [r2 + 32 + 4], 1
5818 pinsrb m0, [r2 + 32 + 5], 0
5819 pslldq m5, 2
5820 pinsrb m5, [r2 + 5], 1
5821 pinsrb m5, [r2 + 4], 0
5822
5823 ; mode 20 [row 6 - second half]
5824 pmaddubsw m4, m5, [r3 + 13 * 16]
5825 pmulhrsw m4, m3
5826 packuswb m4, m4
5827 movh [r0 + 294 * 16 + 8], m4
5828 ; mode 20 [row 6 - second half] end
5829
5830 ; mode 21 [row 7 - second half]
5831 pmaddubsw m4, m5, [r3 + 24 * 16]
5832 pmulhrsw m4, m3
5833 packuswb m4, m4
5834 movh [r0 + 311 * 16 + 8], m4
5835 ; mode 21 [row 7 - second half] end
5836
5837 ; mode 21 [row 8 - second half]
5838 pmaddubsw m4, m5, [r3 + 7 * 16]
5839 pmulhrsw m4, m3
5840 packuswb m4, m4
5841 movh [r0 + 312 * 16 + 8], m4
5842 ; mode 21 [row 8 - second half] end
5843
5844 ; mode 22 [row 9 - second half]
5845 pmaddubsw m4, m5, [r3 + 30 * 16]
5846 pmulhrsw m4, m3
5847 packuswb m4, m4
5848 movh [r0 + 329 * 16 + 8], m4
5849 ; mode 22 [row 9 - second half] end
5850
5851 ; mode 22 [row 10 - second half]
5852 pmaddubsw m4, m5, [r3 + 17 * 16]
5853 pmulhrsw m4, m3
5854 packuswb m4, m4
5855 movh [r0 + 330 * 16 + 8], m4
5856 ; mode 22 [row 10 - second half] end
5857
5858 ; mode 22 [row 11 - second half]
5859 pmaddubsw m4, m5, [r3 + 4 * 16]
5860 pmulhrsw m4, m3
5861 packuswb m4, m4
5862 movh [r0 + 331 * 16 + 8], m4
5863 ; mode 22 [row 11 - second half] end
5864
5865 ; mode 23 [row 14 - second half]
5866 pmaddubsw m4, m5, [r3 + 25 * 16]
5867 pmulhrsw m4, m3
5868 packuswb m4, m4
5869 movh [r0 + 350 * 16 + 8], m4
5870 ; mode 23 [row 14 - second half] end
5871
5872 ; mode 23 [row 15 - second half]
5873 pmaddubsw m4, m5, [r3 + 16 * 16]
5874 pmulhrsw m4, m3
5875 packuswb m4, m4
5876 movh [r0 + 351 * 16 + 8], m4
5877
5878 ; mode 23 [row 15 - second half] end
5879 pmaddubsw m4, m0, [r3 + 30 * 16]
5880 pmulhrsw m4, m3
5881 pmaddubsw m6, m5, [r3 + 30 * 16]
5882 pmulhrsw m6, m3
5883 packuswb m4, m6
5884 movu [r0 + 276 * 16], m4
5885
5886 ; mode 19 [row 5]
5887 pmaddubsw m4, m0, [r3 + 4 * 16]
5888 pmulhrsw m4, m3
5889 pmaddubsw m6, m5, [r3 + 4 * 16]
5890 pmulhrsw m6, m3
5891 packuswb m4, m6
5892 movu [r0 + 277 * 16], m4
5893
5894 ; mode 19 [row 6]
5895 pslldq m0, 2
5896 pinsrb m0, [r2 + 32 + 5], 1
5897 pinsrb m0, [r2 + 32 + 6], 0
5898 pslldq m5, 2
5899 pinsrb m5, [r2 + 4], 1
5900 pinsrb m5, [r2 + 3], 0
5901
5902 ; mode 20 [row 7 - second half]
5903 pmaddubsw m4, m5, [r3 + 24 * 16]
5904 pmulhrsw m4, m3
5905 packuswb m4, m4
5906 movh [r0 + 295 * 16 + 8], m4
5907 ; mode 20 [row 7 - second half] end
5908
5909 ; mode 20 [row 8 - second half]
5910 pmaddubsw m4, m5, [r3 + 3 * 16]
5911 pmulhrsw m4, m3
5912 packuswb m4, m4
5913 movh [r0 + 296 * 16 + 8], m4
5914 ; mode 20 [row 8 - second half] end
5915
5916 ; mode 21 [row 9 - second half]
5917 pmaddubsw m4, m5, [r3 + 22 * 16]
5918 pmulhrsw m4, m3
5919 packuswb m4, m4
5920 movh [r0 + 313 * 16 + 8], m4
5921 ; mode 21 [row 9 - second half] end
5922
5923 ; mode 21 [row 10 - second half]
5924 pmaddubsw m4, m5, [r3 + 5 * 16]
5925 pmulhrsw m4, m3
5926 packuswb m4, m4
5927 movh [r0 + 314 * 16 + 8], m4
5928 ; mode 21 [row 10 - second half] end
5929
5930 ; mode 22 [row 12 - second half]
5931 pmaddubsw m4, m5, [r3 + 23 * 16]
5932 pmulhrsw m4, m3
5933 packuswb m4, m4
5934 movh [r0 + 332 * 16 + 8], m4
5935 ; mode 22 [row 12 - second half] end
5936
5937 ; mode 22 [row 13 - second half]
5938 pmaddubsw m4, m5, [r3 + 10 * 16]
5939 pmulhrsw m4, m3
5940 packuswb m4, m4
5941 movh [r0 + 333 * 16 + 8], m4
5942 ; mode 22 [row 13 - second half] end
5943
5944 pmaddubsw m4, m0, [r3 + 10 * 16]
5945 pmulhrsw m4, m3
5946 pmaddubsw m6, m5, [r3 + 10 * 16]
5947 pmulhrsw m6, m3
5948 packuswb m4, m6
5949 movu [r0 + 278 * 16], m4
5950
5951 ; mode 19 [row 7]
5952 pslldq m0, 2
5953 pinsrb m0, [r2 + 32 + 6], 1
5954 pinsrb m0, [r2 + 32 + 7], 0
5955 pslldq m5, 2
5956 pinsrb m5, [r2 + 3], 1
5957 pinsrb m5, [r2 + 2], 0
5958
5959 ; mode 20 [row 9 - second half]
5960 pmaddubsw m4, m5, [r3 + 14 * 16]
5961 pmulhrsw m4, m3
5962 packuswb m4, m4
5963 movh [r0 + 297 * 16 + 8], m4
5964 ; mode 20 [row 9 - second half] end
5965
5966 ; mode 21 [row 11 - second half]
5967 pmaddubsw m4, m5, [r3 + 20 * 16]
5968 pmulhrsw m4, m3
5969 packuswb m4, m4
5970 movh [r0 + 315 * 16 + 8], m4
5971 ; mode 21 [row 11 - second half] end
5972
5973 ; mode 21 [row 12 - second half]
5974 pmaddubsw m4, m5, [r3 + 3 * 16]
5975 pmulhrsw m4, m3
5976 packuswb m4, m4
5977 movh [r0 + 316 * 16 + 8], m4
5978 ; mode 21 [row 12 - second half] end
5979
5980 ; mode 22 [row 14 - second half]
5981 pmaddubsw m4, m5, [r3 + 29 * 16]
5982 pmulhrsw m4, m3
5983 packuswb m4, m4
5984 movh [r0 + 334 * 16 + 8], m4
5985 ; mode 22 [row 14 - second half] end
5986
5987 ; mode 22 [row 15 - second half]
5988 pmaddubsw m4, m5, [r3 + 16 * 16]
5989 pmulhrsw m4, m3
5990 packuswb m4, m4
5991 movh [r0 + 335 * 16 + 8], m4
5992 ; mode 22 [row 15 - second half] end
5993
5994 pmaddubsw m4, m0, [r3 + 16 * 16]
5995 pmulhrsw m4, m3
5996 pmaddubsw m6, m5, [r3 + 16 * 16]
5997 pmulhrsw m6, m3
5998 packuswb m4, m6
5999 movu [r0 + 279 * 16], m4
6000
6001 ; mode 19 [row 8]
6002 pslldq m0, 2
6003 pinsrb m0, [r2 + 32 + 7], 1
6004 pinsrb m0, [r2 + 32 + 9], 0
6005 pslldq m5, 2
6006 pinsrb m5, [r2 + 2], 1
6007 pinsrb m5, [r2 + 1], 0
6008
6009 ; mode 20 [row 10 - second half]
6010 pmaddubsw m4, m5, [r3 + 25 * 16]
6011 pmulhrsw m4, m3
6012 packuswb m4, m4
6013 movh [r0 + 298 * 16 + 8], m4
6014 ; mode 20 [row 10 - second half] end
6015
6016 ; mode 20 [row 11 - second half]
6017 pmaddubsw m4, m5, [r3 + 4 * 16]
6018 pmulhrsw m4, m3
6019 packuswb m4, m4
6020 movh [r0 + 299 * 16 + 8], m4
6021 ; mode 20 [row 11 - second half] end
6022
6023 ; mode 21 [row 13 - second half]
6024 pmaddubsw m4, m5, [r3 + 18 * 16]
6025 pmulhrsw m4, m3
6026 packuswb m4, m4
6027 movh [r0 + 317 * 16 + 8], m4
6028 ; mode 21 [row 13 - second half] end
6029
6030 ; mode 21 [row 14 - second half]
6031 pmaddubsw m4, m5, [r3 + 1 * 16]
6032 pmulhrsw m4, m3
6033 packuswb m4, m4
6034 movh [r0 + 318 * 16 + 8], m4
6035 ; mode 21 [row 14 - second half] end
6036
6037 pmaddubsw m4, m0, [r3 + 22 * 16]
6038 pmulhrsw m4, m3
6039 pmaddubsw m6, m5, [r3 + 22 * 16]
6040 pmulhrsw m6, m3
6041 packuswb m4, m6
6042 movu [r0 + 280 * 16], m4
6043
6044 ; mode 19 [row 9]
6045 pslldq m0, 2
6046 pinsrb m0, [r2 + 32 + 9], 1
6047 pinsrb m0, [r2 + 32 + 10], 0
6048 pslldq m5, 2
6049 pinsrb m5, [r2 + 1], 1
6050 pinsrb m5, [r2 + 0], 0
6051
6052 ; mode 20 [row 12 - second half]
6053 pmaddubsw m4, m5, [r3 + 15 * 16]
6054 pmulhrsw m4, m3
6055 packuswb m4, m4
6056 movh [r0 + 300 * 16 + 8], m4
6057
6058 ; mode 20 [row 12 - second half] end
6059 pmaddubsw m4, m0, [r3 + 28 * 16]
6060 pmulhrsw m4, m3
6061 pmaddubsw m6, m5, [r3 + 28 * 16]
6062 pmulhrsw m6, m3
6063 packuswb m4, m6
6064 movu [r0 + 281 * 16], m4
6065
6066 ; mode 19 [row 10]
6067 pmaddubsw m4, m0, [r3 + 2 * 16]
6068 pmulhrsw m4, m3
6069 pmaddubsw m6, m5, [r3 + 2 * 16]
6070 pmulhrsw m6, m3
6071 packuswb m4, m6
6072 movu [r0 + 282 * 16], m4
6073
6074 ; mode 19 [row 11]
6075 pslldq m0, 2
6076 pinsrb m0, [r2 + 32 + 10], 1
6077 pinsrb m0, [r2 + 32 + 11], 0
6078 pmaddubsw m4, m0, [r3 + 8 * 16]
6079 pmulhrsw m4, m3
6080 pslldq m5, 2
6081 pinsrb m5, [r2], 1
6082 pinsrb m5, [r2 + 32 + 1], 0
6083 pmaddubsw m6, m5, [r3 + 8 * 16]
6084 pmulhrsw m6, m3
6085 packuswb m4, m6
6086 movu [r0 + 283 * 16], m4
6087
6088 ; mode 19 [row 12]
6089 pslldq m0, 2
6090 pinsrb m0, [r2 + 32 + 11], 1
6091 pinsrb m0, [r2 + 32 + 12], 0
6092 pslldq m5, 2
6093 pinsrb m5, [r2 + 32 + 1], 1
6094 pinsrb m5, [r2 + 32 + 2], 0
6095 pmaddubsw m4, m0, [r3 + 14 * 16]
6096 pmulhrsw m4, m3
6097 pmaddubsw m6, m5, [r3 + 14 * 16]
6098 pmulhrsw m6, m3
6099 packuswb m4, m6
6100 movu [r0 + 284 * 16], m4
6101
6102 ; mode 19 [row 13]
6103 pslldq m0, 2
6104 pinsrb m0, [r2 + 32 + 12], 1
6105 pinsrb m0, [r2 + 32 + 14], 0
6106 pmaddubsw m4, m0, [r3 + 20 * 16]
6107 pmulhrsw m4, m3
6108 pslldq m5, 2
6109 pinsrb m5, [r2 + 32 + 2], 1
6110 pinsrb m5, [r2 + 32 + 4], 0
6111 pmaddubsw m6, m5, [r3 + 20 * 16]
6112 pmulhrsw m6, m3
6113 packuswb m4, m6
6114 movu [r0 + 285 * 16], m4
6115
6116 ; mode 19 [row 14]
6117 pslldq m0, 2
6118 pinsrb m0, [r2 + 32 + 14], 1
6119 pinsrb m0, [r2 + 32 + 15], 0
6120 pmaddubsw m4, m0, [r3 + 26 * 16]
6121 pmulhrsw m4, m3
6122 pslldq m5, 2
6123 pinsrb m5, [r2 + 32 + 4], 1
6124 pinsrb m5, [r2 + 32 + 5], 0
6125 pmaddubsw m6, m5, [r3 + 26 * 16]
6126 pmulhrsw m6, m3
6127 packuswb m4, m6
6128 movu [r0 + 286 * 16], m4
6129
6130 ; mode 19 [row 15]
6131 movu m0, [r2 + 32]
6132 pshufb m0, [tab_S1]
6133 movu [r0 + 287 * 16], m0
6134 movd m1, [r2]
6135 movd [r0 + 287 * 16 + 12], m1
6136
6137 ; mode 25
6138 movu m1, [r1]
6139
6140 ; mode 26 [all rows]
6141 psrldq m6, m1, 1
6142 pinsrb m6, [r1 + 16], 15
6143 movu m7, m6
6144 movu [r0 + 384 * 16], m6
6145 movu [r0 + 385 * 16], m6
6146 movu [r0 + 386 * 16], m6
6147 movu [r0 + 387 * 16], m6
6148 movu [r0 + 388 * 16], m6
6149 movu [r0 + 389 * 16], m6
6150 movu [r0 + 390 * 16], m6
6151 movu [r0 + 391 * 16], m6
6152 movu [r0 + 392 * 16], m6
6153 movu [r0 + 393 * 16], m6
6154 movu [r0 + 394 * 16], m6
6155 movu [r0 + 395 * 16], m6
6156 movu [r0 + 396 * 16], m6
6157 movu [r0 + 397 * 16], m6
6158 movu [r0 + 398 * 16], m6
6159 movu [r0 + 399 * 16], m6
6160
6161 pxor m0, m0
6162 pshufb m6, m6, m0
6163 punpcklbw m6, m0
6164 pinsrb m2, [r1], 0
6165 pshufb m2, m2, m0
6166 punpcklbw m2, m0
6167 movu m4, [r1 + 1 + 32]
6168 punpcklbw m5, m4, m0
6169 punpckhbw m4, m0
6170 psubw m5, m2
6171 psubw m4, m2
6172 psraw m5, 1
6173 psraw m4, 1
6174 paddw m5, m6
6175 paddw m4, m6
6176 packuswb m5, m4
6177
6178 pextrb [r0 + 384 * 16], m5, 0
6179 pextrb [r0 + 385 * 16], m5, 1
6180 pextrb [r0 + 386 * 16], m5, 2
6181 pextrb [r0 + 387 * 16], m5, 3
6182 pextrb [r0 + 388 * 16], m5, 4
6183 pextrb [r0 + 389 * 16], m5, 5
6184 pextrb [r0 + 390 * 16], m5, 6
6185 pextrb [r0 + 391 * 16], m5, 7
6186 pextrb [r0 + 392 * 16], m5, 8
6187 pextrb [r0 + 393 * 16], m5, 9
6188 pextrb [r0 + 394 * 16], m5, 10
6189 pextrb [r0 + 395 * 16], m5, 11
6190 pextrb [r0 + 396 * 16], m5, 12
6191 pextrb [r0 + 397 * 16], m5, 13
6192 pextrb [r0 + 398 * 16], m5, 14
6193 pextrb [r0 + 399 * 16], m5, 15
6194
6195 ; mode 25 [row 15]
6196 movu [r0 + 383 * 16], m1
6197
6198 ; mode 25 [row 0]
6199 psrldq m2, m1, 1
6200 punpcklbw m1, m2
6201 movu m2, [r1 + 8]
6202 psrldq m4, m2, 1
6203 punpcklbw m2, m4
6204 pmaddubsw m4, m1, [r3 + 30 * 16]
6205 pmulhrsw m4, m3
6206 pmaddubsw m5, m2, [r3 + 30 * 16]
6207 pmulhrsw m5, m3
6208 packuswb m4, m5
6209 movu [r0 + 368 * 16], m4
6210
6211 ; mode 25 [row 1]
6212 pmaddubsw m4, m1, [r3 + 28 * 16]
6213 pmulhrsw m4, m3
6214 pmaddubsw m5, m2, [r3 + 28 * 16]
6215 pmulhrsw m5, m3
6216 packuswb m4, m5
6217 movu [r0 + 369 * 16], m4
6218
6219 ; mode 25 [row 2]
6220 pmaddubsw m4, m1, [r3 + 26 * 16]
6221 pmulhrsw m4, m3
6222 pmaddubsw m5, m2, [r3 + 26 * 16]
6223 pmulhrsw m5, m3
6224 packuswb m4, m5
6225 movu [r0 + 370 * 16], m4
6226
6227 ; mode 25 [row 3]
6228 pmaddubsw m4, m1, [r3 + 24 * 16]
6229 pmulhrsw m4, m3
6230 pmaddubsw m5, m2, [r3 + 24 * 16]
6231 pmulhrsw m5, m3
6232 packuswb m4, m5
6233 movu [r0 + 371 * 16], m4
6234
6235 ; mode 25 [row 4]
6236 pmaddubsw m4, m1, [r3 + 22 * 16]
6237 pmulhrsw m4, m3
6238 pmaddubsw m5, m2, [r3 + 22 * 16]
6239 pmulhrsw m5, m3
6240 packuswb m4, m5
6241 movu [r0 + 372 * 16], m4
6242
6243 ; mode 25 [row 5]
6244 pmaddubsw m4, m1, [r3 + 20 * 16]
6245 pmulhrsw m4, m3
6246 pmaddubsw m5, m2, [r3 + 20 * 16]
6247 pmulhrsw m5, m3
6248 packuswb m4, m5
6249 movu [r0 + 373 * 16], m4
6250
6251 ; mode 25 [row 6]
6252 pmaddubsw m4, m1, [r3 + 18 * 16]
6253 pmulhrsw m4, m3
6254 pmaddubsw m5, m2, [r3 + 18 * 16]
6255 pmulhrsw m5, m3
6256 packuswb m4, m5
6257 movu [r0 + 374 * 16], m4
6258
6259 ; mode 25 [row 7]
6260 pmaddubsw m4, m1, [r3 + 16 * 16]
6261 pmulhrsw m4, m3
6262 pmaddubsw m5, m2, [r3 + 16 * 16]
6263 pmulhrsw m5, m3
6264 packuswb m4, m5
6265 movu [r0 + 375 * 16], m4
6266
6267 ; mode 25 [row 8]
6268 pmaddubsw m4, m1, [r3 + 14 * 16]
6269 pmulhrsw m4, m3
6270 pmaddubsw m5, m2, [r3 + 14 * 16]
6271 pmulhrsw m5, m3
6272 packuswb m4, m5
6273 movu [r0 + 376 * 16], m4
6274
6275 ; mode 25 [row 9]
6276 pmaddubsw m4, m1, [r3 + 12 * 16]
6277 pmulhrsw m4, m3
6278 pmaddubsw m5, m2, [r3 + 12 * 16]
6279 pmulhrsw m5, m3
6280 packuswb m4, m5
6281 movu [r0 + 377 * 16], m4
6282
6283 ; mode 25 [row 10]
6284 pmaddubsw m4, m1, [r3 + 10 * 16]
6285 pmulhrsw m4, m3
6286 pmaddubsw m5, m2, [r3 + 10 * 16]
6287 pmulhrsw m5, m3
6288 packuswb m4, m5
6289 movu [r0 + 378 * 16], m4
6290
6291 ; mode 25 [row 11]
6292 pmaddubsw m4, m1, [r3 + 8 * 16]
6293 pmulhrsw m4, m3
6294 pmaddubsw m5, m2, [r3 + 8 * 16]
6295 pmulhrsw m5, m3
6296 packuswb m4, m5
6297 movu [r0 + 379 * 16], m4
6298
6299 ; mode 25 [row 12]
6300 pmaddubsw m4, m1, [r3 + 6 * 16]
6301 pmulhrsw m4, m3
6302 pmaddubsw m5, m2, [r3 + 6 * 16]
6303 pmulhrsw m5, m3
6304 packuswb m4, m5
6305 movu [r0 + 380 * 16], m4
6306
6307 ; mode 25 [row 13]
6308 pmaddubsw m4, m1, [r3 + 4 * 16]
6309 pmulhrsw m4, m3
6310 pmaddubsw m5, m2, [r3 + 4 * 16]
6311 pmulhrsw m5, m3
6312 packuswb m4, m5
6313 movu [r0 + 381 * 16], m4
6314
6315 ; mode 25 [row 14]
6316 pmaddubsw m4, m1, [r3 + 2 * 16]
6317 pmulhrsw m4, m3
6318 pmaddubsw m5, m2, [r3 + 2 * 16]
6319 pmulhrsw m5, m3
6320 packuswb m4, m5
6321 movu [r0 + 382 * 16], m4
6322
6323 ; mode 27 [row 15]
6324 psrldq m6, m7, 1
6325 punpcklbw m7, m6
6326 pinsrb m6, [r1 + 17], 15
6327 movu [r0 + 415 * 16], m6
6328
6329 ; mode 27 [row 0]
6330 movu m4, [r1 + 9]
6331 psrldq m5, m4, 1
6332 punpcklbw m4, m5
6333 pmaddubsw m6, m7, [r3 + 2 * 16]
6334 pmulhrsw m6, m3
6335 pmaddubsw m5, m4, [r3 + 2 * 16]
6336 pmulhrsw m5, m3
6337 packuswb m6, m5
6338 movu [r0 + 400 * 16], m6
6339
6340 ; mode 27 [row 1]
6341 pmaddubsw m6, m7, [r3 + 4 * 16]
6342 pmulhrsw m6, m3
6343 pmaddubsw m5, m4, [r3 + 4 * 16]
6344 pmulhrsw m5, m3
6345 packuswb m6, m5
6346 movu [r0 + 401 * 16], m6
6347
6348 ; mode 27 [row 2]
6349 pmaddubsw m6, m7, [r3 + 6 * 16]
6350 pmulhrsw m6, m3
6351 pmaddubsw m5, m4, [r3 + 6 * 16]
6352 pmulhrsw m5, m3
6353 packuswb m6, m5
6354 movu [r0 + 402 * 16], m6
6355
6356 ; mode 27 [row 3]
6357 pmaddubsw m6, m7, [r3 + 8 * 16]
6358 pmulhrsw m6, m3
6359 pmaddubsw m5, m4, [r3 + 8 * 16]
6360 pmulhrsw m5, m3
6361 packuswb m6, m5
6362 movu [r0 + 403 * 16], m6
6363
6364 ; mode 27 [row 4]
6365 pmaddubsw m6, m7, [r3 + 10 * 16]
6366 pmulhrsw m6, m3
6367 pmaddubsw m5, m4, [r3 + 10 * 16]
6368 pmulhrsw m5, m3
6369 packuswb m6, m5
6370 movu [r0 + 404 * 16], m6
6371
6372 ; mode 27 [row 5]
6373 pmaddubsw m6, m7, [r3 + 12 * 16]
6374 pmulhrsw m6, m3
6375 pmaddubsw m5, m4, [r3 + 12 * 16]
6376 pmulhrsw m5, m3
6377 packuswb m6, m5
6378 movu [r0 + 405 * 16], m6
6379
6380 ; mode 27 [row 6]
6381 pmaddubsw m6, m7, [r3 + 14 * 16]
6382 pmulhrsw m6, m3
6383 pmaddubsw m5, m4, [r3 + 14 * 16]
6384 pmulhrsw m5, m3
6385 packuswb m6, m5
6386 movu [r0 + 406 * 16], m6
6387
6388 ; mode 27 [row 7]
6389 pmaddubsw m6, m7, [r3 + 16 * 16]
6390 pmulhrsw m6, m3
6391 pmaddubsw m5, m4, [r3 + 16 * 16]
6392 pmulhrsw m5, m3
6393 packuswb m6, m5
6394 movu [r0 + 407 * 16], m6
6395
6396 ; mode 27 [row 8]
6397 pmaddubsw m6, m7, [r3 + 18 * 16]
6398 pmulhrsw m6, m3
6399 pmaddubsw m5, m4, [r3 + 18 * 16]
6400 pmulhrsw m5, m3
6401 packuswb m6, m5
6402 movu [r0 + 408 * 16], m6
6403
6404 ; mode 27 [row 9]
6405 pmaddubsw m6, m7, [r3 + 20 * 16]
6406 pmulhrsw m6, m3
6407 pmaddubsw m5, m4, [r3 + 20 * 16]
6408 pmulhrsw m5, m3
6409 packuswb m6, m5
6410 movu [r0 + 409 * 16], m6
6411
6412 ; mode 27 [row 10]
6413 pmaddubsw m6, m7, [r3 + 22 * 16]
6414 pmulhrsw m6, m3
6415 pmaddubsw m5, m4, [r3 + 22 * 16]
6416 pmulhrsw m5, m3
6417 packuswb m6, m5
6418 movu [r0 + 410 * 16], m6
6419
6420 ; mode 27 [row 11]
6421 pmaddubsw m6, m7, [r3 + 24 * 16]
6422 pmulhrsw m6, m3
6423 pmaddubsw m5, m4, [r3 + 24 * 16]
6424 pmulhrsw m5, m3
6425 packuswb m6, m5
6426 movu [r0 + 411 * 16], m6
6427
6428 ; mode 27 [row 12]
6429 pmaddubsw m6, m7, [r3 + 26 * 16]
6430 pmulhrsw m6, m3
6431 pmaddubsw m5, m4, [r3 + 26 * 16]
6432 pmulhrsw m5, m3
6433 packuswb m6, m5
6434 movu [r0 + 412 * 16], m6
6435
6436 ; mode 27 [row 13]
6437 pmaddubsw m6, m7, [r3 + 28 * 16]
6438 pmulhrsw m6, m3
6439 pmaddubsw m5, m4, [r3 + 28 * 16]
6440 pmulhrsw m5, m3
6441 packuswb m6, m5
6442 movu [r0 + 413 * 16], m6
6443
6444 ; mode 27 [row 14]
6445 pmaddubsw m6, m7, [r3 + 30 * 16]
6446 pmulhrsw m6, m3
6447 pmaddubsw m5, m4, [r3 + 30 * 16]
6448 pmulhrsw m5, m3
6449 packuswb m6, m5
6450 movu [r0 + 414 * 16], m6
6451
6452 ; mode 28 [row 0]
6453 movu m1, [r2 + 1]
6454 psrldq m2, m1, 1
6455 punpcklbw m1, m2
6456 movu m4, [r2 + 9]
6457 psrldq m5, m4, 1
6458 punpcklbw m4, m5
6459 pmaddubsw m2, m1, [r3 + 5 * 16]
6460 pmulhrsw m2, m3
6461 pmaddubsw m5, m4, [r3 + 5 * 16]
6462 pmulhrsw m5, m3
6463 packuswb m2, m5
6464 movu [r0 + 416 * 16], m2
6465
6466 ; mode 28 [row 0] (NOTE: repeats the identical computation and store performed just above)
6467 pmaddubsw m2, m1, [r3 + 5 * 16]
6468 pmulhrsw m2, m3
6469 pmaddubsw m5, m4, [r3 + 5 * 16]
6470 pmulhrsw m5, m3
6471 packuswb m2, m5
6472 movu [r0 + 416 * 16], m2
6473
6474 ; mode 28 [row 1]
6475 pmaddubsw m2, m1, [r3 + 10 * 16]
6476 pmulhrsw m2, m3
6477 pmaddubsw m5, m4, [r3 + 10 * 16]
6478 pmulhrsw m5, m3
6479 packuswb m2, m5
6480 movu [r0 + 417 * 16], m2
6481
6482 ; mode 28 [row 2]
6483 pmaddubsw m2, m1, [r3 + 15 * 16]
6484 pmulhrsw m2, m3
6485 pmaddubsw m5, m4, [r3 + 15 * 16]
6486 pmulhrsw m5, m3
6487 packuswb m2, m5
6488 movu [r0 + 418 * 16], m2
6489
6490 ; mode 28 [row 3]
6491 pmaddubsw m2, m1, [r3 + 20 * 16]
6492 pmulhrsw m2, m3
6493 pmaddubsw m5, m4, [r3 + 20 * 16]
6494 pmulhrsw m5, m3
6495 packuswb m2, m5
6496 movu [r0 + 419 * 16], m2
6497
6498 ; mode 28 [row 4]
6499 pmaddubsw m2, m1, [r3 + 25 * 16]
6500 pmulhrsw m2, m3
6501 pmaddubsw m5, m4, [r3 + 25 * 16]
6502 pmulhrsw m5, m3
6503 packuswb m2, m5
6504 movu [r0 + 420 * 16], m2
6505
6506 ; mode 28 [row 5]
6507 pmaddubsw m2, m1, [r3 + 30 * 16]
6508 pmulhrsw m2, m3
6509 pmaddubsw m5, m4, [r3 + 30 * 16]
6510 pmulhrsw m5, m3
6511 packuswb m2, m5
6512 movu [r0 + 421 * 16], m2
6513
6514 ; mode 29 [row 0]
6515 pmaddubsw m2, m1, [r3 + 9 * 16]
6516 pmulhrsw m2, m3
6517 pmaddubsw m5, m4, [r3 + 9 * 16]
6518 pmulhrsw m5, m3
6519 packuswb m2, m5
6520 movu [r0 + 432 * 16], m2
6521
6522 ; mode 29 [row 1]
6523 pmaddubsw m2, m1, [r3 + 18 * 16]
6524 pmulhrsw m2, m3
6525 pmaddubsw m5, m4, [r3 + 18 * 16]
6526 pmulhrsw m5, m3
6527 packuswb m2, m5
6528 movu [r0 + 433 * 16], m2
6529
6530 ; mode 29 [row 2]
6531 pmaddubsw m2, m1, [r3 + 27 * 16]
6532 pmulhrsw m2, m3
6533 pmaddubsw m5, m4, [r3 + 27 * 16]
6534 pmulhrsw m5, m3
6535 packuswb m2, m5
6536 movu [r0 + 434 * 16], m2
6537
6538 ; mode 30 [row 0]
6539 pmaddubsw m2, m1, [r3 + 13 * 16]
6540 pmulhrsw m2, m3
6541 pmaddubsw m5, m4, [r3 + 13 * 16]
6542 pmulhrsw m5, m3
6543 packuswb m2, m5
6544 movu [r0 + 448 * 16], m2
6545
6546 ; mode 30 [row 1]
6547 pmaddubsw m2, m1, [r3 + 26 * 16]
6548 pmulhrsw m2, m3
6549 pmaddubsw m5, m4, [r3 + 26 * 16]
6550 pmulhrsw m5, m3
6551 packuswb m2, m5
6552 movu [r0 + 449 * 16], m2
6553
6554 ; mode 33 [row 0]
6555 movu [r0 + 496 * 16], m2
6556
6557 ; mode 31 [row 0]
6558 pmaddubsw m2, m1, [r3 + 17 * 16]
6559 pmulhrsw m2, m3
6560 pmaddubsw m5, m4, [r3 + 17 * 16]
6561 pmulhrsw m5, m3
6562 packuswb m2, m5
6563 movu [r0 + 464 * 16], m2
6564
6565 ; mode 32 [row 0]
6566 pmaddubsw m2, m1, [r3 + 21 * 16]
6567 pmulhrsw m2, m3
6568 pmaddubsw m5, m4, [r3 + 21 * 16]
6569 pmulhrsw m5, m3
6570 packuswb m2, m5
6571 movu [r0 + 480 * 16], m2
6572
6573 ; mode 28 [row 6]
6574 movd m7, [r2 + 9]
6575 palignr m7, m1, 2
6576 pmaddubsw m2, m7, [r3 + 3 * 16]
6577 pmulhrsw m2, m3
6578 movd m6, [r2 + 17]
6579 palignr m6, m4, 2
6580 pmaddubsw m5, m6, [r3 + 3 * 16]
6581 pmulhrsw m5, m3
6582 packuswb m2, m5
6583 movu [r0 + 422 * 16], m2
6584
6585 ; mode 28 [row 7]
6586 pmaddubsw m2, m7, [r3 + 8 * 16]
6587 pmulhrsw m2, m3
6588 pmaddubsw m5, m6, [r3 + 8 * 16]
6589 pmulhrsw m5, m3
6590 packuswb m2, m5
6591 movu [r0 + 423 * 16], m2
6592
6593 ; mode 28 [row 8]
6594 pmaddubsw m2, m7, [r3 + 13 * 16]
6595 pmulhrsw m2, m3
6596 pmaddubsw m5, m6, [r3 + 13 * 16]
6597 pmulhrsw m5, m3
6598 packuswb m2, m5
6599 movu [r0 + 424 * 16], m2
6600
6601 ; mode 28 [row 9]
6602 pmaddubsw m2, m7, [r3 + 18 * 16]
6603 pmulhrsw m2, m3
6604 pmaddubsw m5, m6, [r3 + 18 * 16]
6605 pmulhrsw m5, m3
6606 packuswb m2, m5
6607 movu [r0 + 425 * 16], m2
6608
6609 ; mode 28 [row 10]
6610 pmaddubsw m2, m7, [r3 + 23 * 16]
6611 pmulhrsw m2, m3
6612 pmaddubsw m5, m6, [r3 + 23 * 16]
6613 pmulhrsw m5, m3
6614 packuswb m2, m5
6615 movu [r0 + 426 * 16], m2
6616
6617 ; mode 29 [row 3]
6618 pmaddubsw m2, m7, [r3 + 4 * 16]
6619 pmulhrsw m2, m3
6620 pmaddubsw m5, m6, [r3 + 4 * 16]
6621 pmulhrsw m5, m3
6622 packuswb m2, m5
6623 movu [r0 + 435 * 16], m2
6624
6625 ; mode 29 [row 4]
6626 pmaddubsw m2, m7, [r3 + 13 * 16]
6627 pmulhrsw m2, m3
6628 pmaddubsw m5, m6, [r3 + 13 * 16]
6629 pmulhrsw m5, m3
6630 packuswb m2, m5
6631 movu [r0 + 436 * 16], m2
6632
6633 ; mode 29 [row 5]
6634 pmaddubsw m2, m7, [r3 + 22 * 16]
6635 pmulhrsw m2, m3
6636 pmaddubsw m5, m6, [r3 + 22 * 16]
6637 pmulhrsw m5, m3
6638 packuswb m2, m5
6639 movu [r0 + 437 * 16], m2
6640
6641 ; mode 29 [row 6]
6642 pmaddubsw m2, m7, [r3 + 31 * 16]
6643 pmulhrsw m2, m3
6644 pmaddubsw m5, m6, [r3 + 31 * 16]
6645 pmulhrsw m5, m3
6646 packuswb m2, m5
6647 movu [r0 + 438 * 16], m2
6648
6649 ; mode 32 [row 2]
6650 movu [r0 + 482 * 16], m2
6651
6652 ; mode 30 [row 2]
6653 pmaddubsw m2, m7, [r3 + 7 * 16]
6654 pmulhrsw m2, m3
6655 pmaddubsw m5, m6, [r3 + 7 * 16]
6656 pmulhrsw m5, m3
6657 packuswb m2, m5
6658 movu [r0 + 450 * 16], m2
6659
6660 ; mode 30 [row 3]
6661 pmaddubsw m2, m7, [r3 + 20 * 16]
6662 pmulhrsw m2, m3
6663 pmaddubsw m5, m6, [r3 + 20 * 16]
6664 pmulhrsw m5, m3
6665 packuswb m2, m5
6666 movu [r0 + 451 * 16], m2
6667
6668 ; mode 33 [row 1]
6669 movu [r0 + 497 * 16], m2
6670
6671 ; mode 31 [row 1]
6672 pmaddubsw m2, m7, [r3 + 2 * 16]
6673 pmulhrsw m2, m3
6674 pmaddubsw m5, m6, [r3 + 2 * 16]
6675 pmulhrsw m5, m3
6676 packuswb m2, m5
6677 movu [r0 + 465 * 16], m2
6678
6679 ; mode 31 [row 2]
6680 pmaddubsw m2, m7, [r3 + 19 * 16]
6681 pmulhrsw m2, m3
6682 pmaddubsw m5, m6, [r3 + 19 * 16]
6683 pmulhrsw m5, m3
6684 packuswb m2, m5
6685 movu [r0 + 466 * 16], m2
6686
6687 ; mode 32 [row 1]
6688 pmaddubsw m2, m7, [r3 + 10 * 16]
6689 pmulhrsw m2, m3
6690 pmaddubsw m5, m6, [r3 + 10 * 16]
6691 pmulhrsw m5, m3
6692 packuswb m2, m5
6693 movu [r0 + 481 * 16], m2
6694
6695 ; mode 28 [row 11]
6696 pmaddubsw m2, m7, [r3 + 28 * 16]
6697 pmulhrsw m2, m3
6698 pmaddubsw m5, m6, [r3 + 28 * 16]
6699 pmulhrsw m5, m3
6700 packuswb m2, m5
6701 movu [r0 + 427 * 16], m2
6702
6703 ; mode 28 [row 12]
6704 movd m1, [r2 + 10]
6705 palignr m1, m7, 2
6706 pmaddubsw m2, m1, [r3 + 1 * 16]
6707 pmulhrsw m2, m3
6708 movd m4, [r2 + 18]
6709 palignr m4, m6, 2
6710 pmaddubsw m5, m4, [r3 + 1 * 16]
6711 pmulhrsw m5, m3
6712 packuswb m2, m5
6713 movu [r0 + 428 * 16], m2
6714
6715 ; mode 30 [row 4]
6716 movu [r0 + 452 * 16], m2
6717
6718 ; mode 28 [row 13]
6719 pmaddubsw m2, m1, [r3 + 6 * 16]
6720 pmulhrsw m2, m3
6721 pmaddubsw m5, m4, [r3 + 6 * 16]
6722 pmulhrsw m5, m3
6723 packuswb m2, m5
6724 movu [r0 + 429 * 16], m2
6725
6726 ; mode 28 [row 14]
6727 pmaddubsw m2, m1, [r3 + 11 * 16]
6728 pmulhrsw m2, m3
6729 pmaddubsw m5, m4, [r3 + 11 * 16]
6730 pmulhrsw m5, m3
6731 packuswb m2, m5
6732 movu [r0 + 430 * 16], m2
6733
6734 ; mode 28 [row 15]
6735 pmaddubsw m2, m1, [r3 + 16 * 16]
6736 pmulhrsw m2, m3
6737 pmaddubsw m5, m4, [r3 + 16 * 16]
6738 pmulhrsw m5, m3
6739 packuswb m2, m5
6740 movu [r0 + 431 * 16], m2
6741
6742 ; mode 29 [row 7]
6743 pmaddubsw m2, m1, [r3 + 8 * 16]
6744 pmulhrsw m2, m3
6745 pmaddubsw m5, m4, [r3 + 8 * 16]
6746 pmulhrsw m5, m3
6747 packuswb m2, m5
6748 movu [r0 + 439 * 16], m2
6749
6750 ; mode 29 [row 8]
6751 pmaddubsw m2, m1, [r3 + 17 * 16]
6752 pmulhrsw m2, m3
6753 pmaddubsw m5, m4, [r3 + 17 * 16]
6754 pmulhrsw m5, m3
6755 packuswb m2, m5
6756 movu [r0 + 440 * 16], m2
6757
6758 ; mode 29 [row 9]
6759 pmaddubsw m2, m1, [r3 + 26 * 16]
6760 pmulhrsw m2, m3
6761 pmaddubsw m5, m4, [r3 + 26 * 16]
6762 pmulhrsw m5, m3
6763 packuswb m2, m5
6764 movu [r0 + 441 * 16], m2
6765
6766 ; mode 30 [row 5]
6767 pmaddubsw m2, m1, [r3 + 14 * 16]
6768 pmulhrsw m2, m3
6769 pmaddubsw m5, m4, [r3 + 14 * 16]
6770 pmulhrsw m5, m3
6771 packuswb m2, m5
6772 movu [r0 + 453 * 16], m2
6773
6774 ; mode 33 [row 2]
6775 movu [r0 + 498 * 16], m2
6776
6777 ; mode 30 [row 6]
6778 pmaddubsw m2, m1, [r3 + 27 * 16]
6779 pmulhrsw m2, m3
6780 pmaddubsw m5, m4, [r3 + 27 * 16]
6781 pmulhrsw m5, m3
6782 packuswb m2, m5
6783 movu [r0 + 454 * 16], m2
6784
6785 ; mode 31 [row 3]
6786 pmaddubsw m2, m1, [r3 + 4 * 16]
6787 pmulhrsw m2, m3
6788 pmaddubsw m5, m4, [r3 + 4 * 16]
6789 pmulhrsw m5, m3
6790 packuswb m2, m5
6791 movu [r0 + 467 * 16], m2
6792
6793 ; mode 31 [row 4]
6794 pmaddubsw m2, m1, [r3 + 21 * 16]
6795 pmulhrsw m2, m3
6796 pmaddubsw m5, m4, [r3 + 21 * 16]
6797 pmulhrsw m5, m3
6798 packuswb m2, m5
6799 movu [r0 + 468 * 16], m2
6800
6801 ; mode 32 [row 3]
6802 pmaddubsw m2, m1, [r3 + 20 * 16]
6803 pmulhrsw m2, m3
6804 pmaddubsw m5, m4, [r3 + 20 * 16]
6805 pmulhrsw m5, m3
6806 packuswb m2, m5
6807 movu [r0 + 483 * 16], m2
6808
6809 ; mode 29 [row 10]
6810 movd m7, [r2 + 11]
6811 palignr m7, m1, 2
6812 pmaddubsw m2, m7, [r3 + 3 * 16]
6813 pmulhrsw m2, m3
6814 movd m6, [r2 + 19]
6815 palignr m6, m4, 2
6816 pmaddubsw m5, m6, [r3 + 3 * 16]
6817 pmulhrsw m5, m3
6818 packuswb m2, m5
6819 movu [r0 + 442 * 16], m2
6820
6821 ; mode 29 [row 11]
6822 pmaddubsw m2, m7, [r3 + 12 * 16]
6823 pmulhrsw m2, m3
6824 pmaddubsw m5, m6, [r3 + 12 * 16]
6825 pmulhrsw m5, m3
6826 packuswb m2, m5
6827 movu [r0 + 443 * 16], m2
6828
6829 ; mode 29 [row 12]
6830 pmaddubsw m2, m7, [r3 + 21 * 16]
6831 pmulhrsw m2, m3
6832 pmaddubsw m5, m6, [r3 + 21 * 16]
6833 pmulhrsw m5, m3
6834 packuswb m2, m5
6835 movu [r0 + 444 * 16], m2
6836
6837 ; mode 30 [row 8]
6838 movu [r0 + 456 * 16], m2
6839
6840 ; mode 29 [row 13]
6841 pmaddubsw m2, m7, [r3 + 30 * 16]
6842 pmulhrsw m2, m3
6843 pmaddubsw m5, m6, [r3 + 30 * 16]
6844 pmulhrsw m5, m3
6845 packuswb m2, m5
6846 movu [r0 + 445 * 16], m2
6847
6848 ; mode 32 [row 5]
6849 movu [r0 + 485 * 16], m2
6850
6851 ; mode 30 [row 7]
6852 pmaddubsw m2, m7, [r3 + 8 * 16]
6853 pmulhrsw m2, m3
6854 pmaddubsw m5, m6, [r3 + 8 * 16]
6855 pmulhrsw m5, m3
6856 packuswb m2, m5
6857 movu [r0 + 455 * 16], m2
6858
6859 ; mode 33 [row 3]
6860 movu [r0 + 499 * 16], m2
6861
6862 ; mode 31 [row 5]
6863 pmaddubsw m2, m7, [r3 + 6 * 16]
6864 pmulhrsw m2, m3
6865 pmaddubsw m5, m6, [r3 + 6 * 16]
6866 pmulhrsw m5, m3
6867 packuswb m2, m5
6868 movu [r0 + 469 * 16], m2
6869
6870 ; mode 31 [row 6]
6871 pmaddubsw m2, m7, [r3 + 23 * 16]
6872 pmulhrsw m2, m3
6873 pmaddubsw m5, m6, [r3 + 23 * 16]
6874 pmulhrsw m5, m3
6875 packuswb m2, m5
6876 movu [r0 + 470 * 16], m2
6877
6878 ; mode 32 [row 4]
6879 pmaddubsw m2, m7, [r3 + 9 * 16]
6880 pmulhrsw m2, m3
6881 pmaddubsw m5, m6, [r3 + 9 * 16]
6882 pmulhrsw m5, m3
6883 packuswb m2, m5
6884 movu [r0 + 484 * 16], m2
6885
6886 movu m1, m7
6887 movu m4, m6
6888
6889 ; mode 29 [row 14]
6890 movu m1, [r2 + 12]
6891 palignr m1, m7, 2
6892 pmaddubsw m2, m1, [r3 + 7 * 16]
6893 pmulhrsw m2, m3
6894 movd m4, [r2 + 20]
6895 palignr m4, m6, 2
6896 pmaddubsw m5, m4, [r3 + 7 * 16]
6897 pmulhrsw m5, m3
6898 packuswb m2, m5
6899 movu [r0 + 446 * 16], m2
6900
6901 ; mode 29 [row 15]
6902 pmaddubsw m2, m1, [r3 + 16 * 16]
6903 pmulhrsw m2, m3
6904 pmaddubsw m5, m4, [r3 + 16 * 16]
6905 pmulhrsw m5, m3
6906 packuswb m2, m5
6907 movu [r0 + 447 * 16], m2
6908
6909 ; mode 30 [row 9]
6910 pmaddubsw m2, m1, [r3 + 2 * 16]
6911 pmulhrsw m2, m3
6912 pmaddubsw m5, m4, [r3 + 2 * 16]
6913 pmulhrsw m5, m3
6914 packuswb m2, m5
6915 movu [r0 + 457 * 16], m2
6916
6917 ; mode 33 [row 4]
6918 movu [r0 + 500 * 16], m2
6919
6920 ; mode 30 [row 10]
6921 pmaddubsw m2, m1, [r3 + 15 * 16]
6922 pmulhrsw m2, m3
6923 pmaddubsw m5, m4, [r3 + 15 * 16]
6924 pmulhrsw m5, m3
6925 packuswb m2, m5
6926 movu [r0 + 458 * 16], m2
6927
6928 ; mode 30 [row 11]
6929 pmaddubsw m2, m1, [r3 + 28 * 16]
6930 pmulhrsw m2, m3
6931 pmaddubsw m5, m4, [r3 + 28 * 16]
6932 pmulhrsw m5, m3
6933 packuswb m2, m5
6934 movu [r0 + 459 * 16], m2
6935
6936 ; mode 33 [row 5]
6937 movu [r0 + 501 * 16], m2
6938
6939 ; mode 31 [row 7]
6940 pmaddubsw m2, m1, [r3 + 8 * 16]
6941 pmulhrsw m2, m3
6942 pmaddubsw m5, m4, [r3 + 8 * 16]
6943 pmulhrsw m5, m3
6944 packuswb m2, m5
6945 movu [r0 + 471 * 16], m2
6946
6947 ; mode 31 [row 8]
6948 pmaddubsw m2, m1, [r3 + 25 * 16]
6949 pmulhrsw m2, m3
6950 pmaddubsw m5, m4, [r3 + 25 * 16]
6951 pmulhrsw m5, m3
6952 packuswb m2, m5
6953 movu [r0 + 472 * 16], m2
6954
6955 ; mode 32 [row 6]
6956 pmaddubsw m2, m1, [r3 + 19 * 16]
6957 pmulhrsw m2, m3
6958 pmaddubsw m5, m4, [r3 + 19 * 16]
6959 pmulhrsw m5, m3
6960 packuswb m2, m5
6961 movu [r0 + 486 * 16], m2
6962
6963 ; mode 30 [row 12]
6964 movd m7, [r2 + 13]
6965 palignr m7, m1, 2
6966 pmaddubsw m2, m7, [r3 + 9 * 16]
6967 pmulhrsw m2, m3
6968 movd m6, [r2 + 21]
6969 palignr m6, m4, 2
6970 pmaddubsw m5, m6, [r3 + 9 * 16]
6971 pmulhrsw m5, m3
6972 packuswb m2, m5
6973 movu [r0 + 460 * 16], m2
6974
6975 ; mode 30 [row 13]
6976 pmaddubsw m2, m7, [r3 + 22 * 16]
6977 pmulhrsw m2, m3
6978 pmaddubsw m5, m6, [r3 + 22 * 16]
6979 pmulhrsw m5, m3
6980 packuswb m2, m5
6981 movu [r0 + 461 * 16], m2
6982
6983 ; mode 33 [row 6]
6984 movu [r0 + 502 * 16], m2
6985
6986 ; mode 31 [row 9]
6987 pmaddubsw m2, m7, [r3 + 10 * 16]
6988 pmulhrsw m2, m3
6989 pmaddubsw m5, m6, [r3 + 10 * 16]
6990 pmulhrsw m5, m3
6991 packuswb m2, m5
6992 movu [r0 + 473 * 16], m2
6993
6994 ; mode 31 [row 10]
6995 pmaddubsw m2, m7, [r3 + 27 * 16]
6996 pmulhrsw m2, m3
6997 pmaddubsw m5, m6, [r3 + 27 * 16]
6998 pmulhrsw m5, m3
6999 packuswb m2, m5
7000 movu [r0 + 474 * 16], m2
7001
7002 ; mode 32 [row 7]
7003 pmaddubsw m2, m7, [r3 + 8 * 16]
7004 pmulhrsw m2, m3
7005 pmaddubsw m5, m6, [r3 + 8 * 16]
7006 pmulhrsw m5, m3
7007 packuswb m2, m5
7008 movu [r0 + 487 * 16], m2
7009
7010 ; mode 32 [row 8]
7011 pmaddubsw m2, m7, [r3 + 29 * 16]
7012 pmulhrsw m2, m3
7013 pmaddubsw m5, m6, [r3 + 29 * 16]
7014 pmulhrsw m5, m3
7015 packuswb m2, m5
7016 movu [r0 + 488 * 16], m2
7017
7018
7019 movu m1, m7
7020 movu m4, m6
7021
7022 ; mode 30 [row 14]
7023 movd m1, [r2 + 14]
7024 palignr m1, m7, 2
7025 pmaddubsw m2, m1, [r3 + 3 * 16]
7026 pmulhrsw m2, m3
7027 movd m4, [r2 + 22]
7028 palignr m4, m6, 2
7029 pmaddubsw m5, m4, [r3 + 3 * 16]
7030 pmulhrsw m5, m3
7031 packuswb m2, m5
7032 movu [r0 + 462 * 16], m2
7033
7034 ; mode 30 [row 15]
7035 pmaddubsw m2, m1, [r3 + 16 * 16]
7036 pmulhrsw m2, m3
7037 pmaddubsw m5, m4, [r3 + 16 * 16]
7038 pmulhrsw m5, m3
7039 packuswb m2, m5
7040 movu [r0 + 463 * 16], m2
7041
7042 ; mode 33 [row 7]
7043 movu [r0 + 503 * 16], m2
7044
7045 ; mode 31 [row 11]
7046 pmaddubsw m2, m1, [r3 + 12 * 16]
7047 pmulhrsw m2, m3
7048 pmaddubsw m5, m4, [r3 + 12 * 16]
7049 pmulhrsw m5, m3
7050 packuswb m2, m5
7051 movu [r0 + 475 * 16], m2
7052
7053 ; mode 31 [row 12]
7054 pmaddubsw m2, m1, [r3 + 29 * 16]
7055 pmulhrsw m2, m3
7056 pmaddubsw m5, m4, [r3 + 29 * 16]
7057 pmulhrsw m5, m3
7058 packuswb m2, m5
7059 movu [r0 + 476 * 16], m2
7060
7061 ; mode 32 [row 9]
7062 pmaddubsw m2, m1, [r3 + 18 * 16]
7063 pmulhrsw m2, m3
7064 pmaddubsw m5, m4, [r3 + 18 * 16]
7065 pmulhrsw m5, m3
7066 packuswb m2, m5
7067 movu [r0 + 489 * 16], m2
7068
7069 ; mode 31 [row 13]
7070 movd m7, [r2 + 15]
7071 palignr m7, m1, 2
7072 pmaddubsw m2, m7, [r3 + 14 * 16]
7073 pmulhrsw m2, m3
7074 movd m6, [r2 + 23]
7075 palignr m6, m4, 2
7076 pmaddubsw m5, m6, [r3 + 14 * 16]
7077 pmulhrsw m5, m3
7078 packuswb m2, m5
7079 movu [r0 + 477 * 16], m2
7080
7081 ; mode 31 [row 14]
7082 pmaddubsw m2, m7, [r3 + 31 * 16]
7083 pmulhrsw m2, m3
7084 pmaddubsw m5, m6, [r3 + 31 * 16]
7085 pmulhrsw m5, m3
7086 packuswb m2, m5
7087 movu [r0 + 478 * 16], m2
7088
7089 ; mode 32 [row 10]
7090 pmaddubsw m2, m7, [r3 + 7 * 16]
7091 pmulhrsw m2, m3
7092 pmaddubsw m5, m6, [r3 + 7 * 16]
7093 pmulhrsw m5, m3
7094 packuswb m2, m5
7095 movu [r0 + 490 * 16], m2
7096
7097 ; mode 32 [row 11]
7098 pmaddubsw m2, m7, [r3 + 28 * 16]
7099 pmulhrsw m2, m3
7100 pmaddubsw m5, m6, [r3 + 28 * 16]
7101 pmulhrsw m5, m3
7102 packuswb m2, m5
7103 movu [r0 + 491 * 16], m2
7104
7105 ; mode 33 [row 8]
7106 pmaddubsw m2, m7, [r3 + 10 * 16]
7107 pmulhrsw m2, m3
7108 pmaddubsw m5, m6, [r3 + 10 * 16]
7109 pmulhrsw m5, m3
7110 packuswb m2, m5
7111 movu [r0 + 504 * 16], m2
7112
7113 ; mode 31 [row 15]
7114 movd m1, [r2 + 16]
7115 palignr m1, m7, 2
7116 pmaddubsw m2, m1, [r3 + 16 * 16]
7117 pmulhrsw m2, m3
7118 movd m4, [r2 + 24]
7119 palignr m4, m6, 2
7120 pmaddubsw m5, m4, [r3 + 16 * 16]
7121 pmulhrsw m5, m3
7122 packuswb m2, m5
7123 movu [r0 + 479 * 16], m2
7124
7125 ; mode 32 [row 12]
7126 pmaddubsw m2, m1, [r3 + 17 * 16]
7127 pmulhrsw m2, m3
7128 pmaddubsw m5, m4, [r3 + 17 * 16]
7129 pmulhrsw m5, m3
7130 packuswb m2, m5
7131 movu [r0 + 492 * 16], m2
7132
7133 ; mode 33 [row 9]
7134 pmaddubsw m2, m1, [r3 + 4 * 16]
7135 pmulhrsw m2, m3
7136 pmaddubsw m5, m4, [r3 + 4 * 16]
7137 pmulhrsw m5, m3
7138 packuswb m2, m5
7139 movu [r0 + 505 * 16], m2
7140
7141 ; mode 33 [row 10]
7142 pmaddubsw m2, m1, [r3 + 30 * 16]
7143 pmulhrsw m2, m3
7144 pmaddubsw m5, m4, [r3 + 30 * 16]
7145 pmulhrsw m5, m3
7146 packuswb m2, m5
7147 movu [r0 + 506 * 16], m2
7148
7149 ; mode 33 [row 9] (repeats the row 9 computation and store above)
7150 pmaddubsw m2, m1, [r3 + 4 * 16]
7151 pmulhrsw m2, m3
7152 pmaddubsw m5, m4, [r3 + 4 * 16]
7153 pmulhrsw m5, m3
7154 packuswb m2, m5
7155 movu [r0 + 505 * 16], m2
7156
7157 ; mode 32 [row 13]
7158 movd m7, [r2 + 17]
7159 palignr m7, m1, 2
7160 pmaddubsw m2, m7, [r3 + 6 * 16]
7161 pmulhrsw m2, m3
7162
7163 movd m6, [r2 + 25]
7164 palignr m6, m4, 2
7165 pmaddubsw m5, m6, [r3 + 6 * 16]
7166 pmulhrsw m5, m3
7167 packuswb m2, m5
7168 movu [r0 + 493 * 16], m2
7169
7170 ; mode 32 [row 14]
7171 pmaddubsw m2, m7, [r3 + 27 * 16]
7172 pmulhrsw m2, m3
7173 pmaddubsw m5, m6, [r3 + 27 * 16]
7174 pmulhrsw m5, m3
7175 packuswb m2, m5
7176 movu [r0 + 494 * 16], m2
7177
7178 ; mode 33 [row 11]
7179 pmaddubsw m2, m7, [r3 + 24 * 16]
7180 pmulhrsw m2, m3
7181 pmaddubsw m5, m6, [r3 + 24 * 16]
7182 pmulhrsw m5, m3
7183 packuswb m2, m5
7184 movu [r0 + 507 * 16], m2
7185
7186 ; mode 32 [row 15]
7187 movd m1, [r2 + 18]
7188 palignr m1, m7, 2
7189 pmaddubsw m2, m1, [r3 + 16 * 16]
7190 pmulhrsw m2, m3
7191 psrldq m4, 2
7192 pinsrb m4, [r2 + 26], 14
7193 pinsrb m4, [r2 + 27], 15
7194 movd m4, [r2 + 26]
7195 palignr m4, m6, 2
7196 pmaddubsw m5, m4, [r3 + 16 * 16]
7197 pmulhrsw m5, m3
7198 packuswb m2, m5
7199 movu [r0 + 495 * 16], m2
7200
7201 ; mode 33 [row 12]
7202 pmaddubsw m2, m1, [r3 + 18 * 16]
7203 pmulhrsw m2, m3
7204 pmaddubsw m5, m4, [r3 + 18 * 16]
7205 pmulhrsw m5, m3
7206 packuswb m2, m5
7207 movu [r0 + 508 * 16], m2
7208
7209 ; mode 33 [row 13]
7210 movd m7, [r2 + 19]
7211 palignr m7, m1, 2
7212 pmaddubsw m2, m7, [r3 + 12 * 16]
7213 pmulhrsw m2, m3
7214 movd m6, [r2 + 27]
7215 palignr m6, m4, 2
7216 pmaddubsw m5, m6, [r3 + 12 * 16]
7217 pmulhrsw m5, m3
7218 packuswb m2, m5
7219 movu [r0 + 509 * 16], m2
7220
7221 ; mode 33 [row 14]
7222 movd m1, [r2 + 20]
7223 palignr m1, m7, 2
7224 pmaddubsw m2, m1, [r3 + 6 * 16]
7225 pmulhrsw m2, m3
7226 movd m4, [r2 + 28]
7227 palignr m4, m6, 2
7228 pmaddubsw m5, m4, [r3 + 6 * 16]
7229 pmulhrsw m5, m3
7230 packuswb m2, m5
7231 movu [r0 + 510 * 16], m2
7232
7233 ; mode 34 [row 0]
7234 movu m1, [r2 + 2]
7235 movu [r0 + 512 * 16], m1
7236 movu m2, [r2 + 18]
7237 palignr m3, m2, m1, 1
7238 movu [r0 + 513 * 16], m3
7239 palignr m3, m2, m1, 2
7240 movu [r0 + 514 * 16], m3
7241 palignr m3, m2, m1, 3
7242 movu [r0 + 515 * 16], m3
7243 palignr m3, m2, m1, 4
7244 movu [r0 + 516 * 16], m3
7245 palignr m3, m2, m1, 5
7246 movu [r0 + 517 * 16], m3
7247 palignr m3, m2, m1, 6
7248 movu [r0 + 518 * 16], m3
7249 palignr m3, m2, m1, 7
7250 movu [r0 + 519 * 16], m3
7251 palignr m3, m2, m1, 8
7252 movu [r0 + 520 * 16], m3
7253 palignr m3, m2, m1, 9
7254 movu [r0 + 521 * 16], m3
7255 palignr m3, m2, m1, 10
7256 movu [r0 + 522 * 16], m3
7257 palignr m3, m2, m1, 11
7258 movu [r0 + 523 * 16], m3
7259 palignr m3, m2, m1, 12
7260 movu [r0 + 524 * 16], m3
7261
7262 ; mode 33 [row 15]
7263 movu [r0 + 511 * 16], m3
7264
7265 ; mode 34
7266 palignr m3, m2, m1, 13
7267 movu [r0 + 525 * 16], m3
7268 palignr m3, m2, m1, 14
7269 movu [r0 + 526 * 16], m3
7270 palignr m3, m2, m1, 15
7271 movu [r0 + 527 * 16], m3
7272 RET
7273
7274 ;--------------------------------------------------------------------------------
7275 ; void all_angs_pred_32x32(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
7276 ;--------------------------------------------------------------------------------
7277 INIT_XMM sse4
7278 cglobal all_angs_pred_32x32, 3,7,8, 0-4
7279 mov r6d, [r1 + 64]
7280 mov r3d, [r1]
7281 mov [rsp], r6d
7282 mov [r1 + 64], r3b
7283 mov r3d, [r2]
7284 mov r6d, [r2 + 64]
7285 mov [r2 + 64], r3b
7286
7287 lea r3, [r2]
7288 lea r4, [r2 + 64]
7289 lea r2, [r1 + 64]
7290
7291 ;mode 2[row 0]
7292 movu m0, [r4 + 2]
7293 movu [r0 + 0 * 16], m0
7294 movu m1, [r4 + 18]
7295 movu [r0 + 1 * 16], m1
7296
7297 ;mode 9 [row 15]
7298 movu [r0 + 478 * 16], m0
7299 movu [r0 + 479 * 16], m1
7300
7301 ;mode 2[row 1]
7302 movu m2, [r4 + 34]
7303 palignr m3, m1, m0, 1
7304 movu [r0 + 2 * 16], m3
7305 palignr m4, m2, m1, 1
7306 movu [r0 + 3 * 16], m4
7307
7308 ; mode 9 [row 31]
7309 movu [r0 + 510 * 16], m3
7310 movu [r0 + 511 * 16], m4
7311
7312 ;mode 2[row 17]
7313 movu [r0 + 34 * 16], m4
7314 movu m5, [r4 + 35]
7315 movu [r0 + 35 * 16], m5
7316
7317 ;mode 2[row 2]
7318 palignr m3, m1, m0, 2
7319 movu [r0 + 4 * 16], m3
7320 palignr m4, m2, m1, 2
7321 movu [r0 + 5 * 16], m4
7322
7323 ;mode 2[row 18]
7324 movu [r0 + 36 * 16], m4
7325 movu m6, [r4 + 51]
7326 palignr m7, m6, m5, 1
7327 movu [r0 + 37 * 16], m7
7328
7329 ;mode 2[row 3]
7330 palignr m3, m1, m0, 3
7331 movu [r0 + 6 * 16], m3
7332 palignr m4, m2, m1, 3
7333 movu [r0 + 7 * 16], m4
7334
7335 ;mode 2[row 19]
7336 movu [r0 + 38 * 16], m4
7337 palignr m7, m6, m5, 2
7338 movu [r0 + 39 * 16], m7
7339
7340 ;mode 2[row 4]
7341 palignr m3, m1, m0, 4
7342 movu [r0 + 8 * 16], m3
7343 palignr m4, m2, m1, 4
7344 movu [r0 + 9 * 16], m4
7345
7346 ; mode 8 [row 31]
7347 movu [r0 + 446 * 16], m3
7348 movu [r0 + 447 * 16], m4
7349
7350 ;mode 2[row 20]
7351 movu [r0 + 40 * 16], m4
7352 palignr m7, m6, m5, 3
7353 movu [r0 + 41 * 16], m7
7354
7355 ; mode 4 [row 31]
7356 movu [r0 + 190 * 16], m4
7357 movu [r0 + 191 * 16], m7
7358
7359 ;mode 2[row 5]
7360 palignr m3, m1, m0, 5
7361 movu [r0 + 10 * 16], m3
7362 palignr m4, m2, m1, 5
7363 movu [r0 + 11 * 16], m4
7364
7365 ;mode 2[row 21]
7366 movu [r0 + 42 * 16], m4
7367 palignr m7, m6, m5, 4
7368 movu [r0 + 43 * 16], m7
7369
7370 ;mode 2[row 6]
7371 palignr m3, m1, m0, 6
7372 movu [r0 + 12 * 16], m3
7373 palignr m4, m2, m1, 6
7374 movu [r0 + 13 * 16], m4
7375
7376 ;mode 2[row 22]
7377 movu [r0 + 44 * 16], m4
7378 palignr m7, m6, m5, 5
7379 movu [r0 + 45 * 16], m7
7380
7381 ;mode 2[row 7]
7382 palignr m3, m1, m0, 7
7383 movu [r0 + 14 * 16], m3
7384 palignr m4, m2, m1, 7
7385 movu [r0 + 15 * 16], m4
7386
7387 ;mode 2[row 23]
7388 movu [r0 + 46 * 16], m4
7389 palignr m7, m6, m5, 6
7390 movu [r0 + 47 * 16], m7
7391
7392 ;mode 2[row 8]
7393 palignr m3, m1, m0, 8
7394 movu [r0 + 16 * 16], m3
7395 palignr m4, m2, m1, 8
7396 movu [r0 + 17 * 16], m4
7397
7398 ;mode 7[row 31]
7399 movu [r0 + 382 * 16], m3
7400 movu [r0 + 383 * 16], m4
7401
7402 ;mode 2[row 24]
7403 movu [r0 + 48 * 16], m4
7404 palignr m7, m6, m5, 7
7405 movu [r0 + 49 * 16], m7
7406
7407 ;mode 2[row 9]
7408 palignr m3, m1, m0, 9
7409 movu [r0 + 18 * 16], m3
7410 palignr m4, m2, m1, 9
7411 movu [r0 + 19 * 16], m4
7412
7413 ;mode 2[row 25]
7414 movu [r0 + 50 * 16], m4
7415 palignr m7, m6, m5, 8
7416 movu [r0 + 51 * 16], m7
7417
7418 ; mode 3 [row 31]
7419 movu [r0 + 126 * 16], m4
7420 movu [r0 + 127 * 16], m7
7421
7422 ;mode 2[row 10]
7423 palignr m3, m1, m0, 10
7424 movu [r0 + 20 * 16], m3
7425 palignr m4, m2, m1, 10
7426 movu [r0 + 21 * 16], m4
7427
7428 ;mode 2[row 26]
7429 movu [r0 + 52 * 16], m4
7430 palignr m7, m6, m5, 9
7431 movu [r0 + 53 * 16], m7
7432
7433 ;mode 2[row 11]
7434 palignr m3, m1, m0, 11
7435 movu [r0 + 22 * 16], m3
7436 palignr m4, m2, m1, 11
7437 movu [r0 + 23 * 16], m4
7438
7439 ;mode 2[row 27]
7440 movu [r0 + 54 * 16], m4
7441 palignr m7, m6, m5, 10
7442 movu [r0 + 55 * 16], m7
7443
7444 ;mode 2[row 12]
7445 palignr m3, m1, m0, 12
7446 movu [r0 + 24 * 16], m3
7447 palignr m4, m2, m1, 12
7448 movu [r0 + 25 * 16], m4
7449
7450 ; mode 6 [row 31]
7451 movu [r0 + 318 * 16], m3
7452 movu [r0 + 319 * 16], m4
7453
7454 ; mode 3 [row 15]
7455 movu [r0 + 94 * 16], m3
7456 movu [r0 + 95 * 16], m4
7457
7458 ;mode 2[row 28]
7459 movu [r0 + 56 * 16], m4
7460 palignr m7, m6, m5, 11
7461 movu [r0 + 57 * 16], m7
7462
7463 ;mode 2[row 13]
7464 palignr m3, m1, m0, 13
7465 movu [r0 + 26 * 16], m3
7466 palignr m4, m2, m1, 13
7467 movu [r0 + 27 * 16], m4
7468
7469 ;mode 2[row 29]
7470 movu [r0 + 58 * 16], m4
7471 palignr m7, m6, m5, 12
7472 movu [r0 + 59 * 16], m7
7473
7474 ;mode 2[row 14]
7475 palignr m3, m1, m0, 14
7476 movu [r0 + 28 * 16], m3
7477 palignr m4, m2, m1, 14
7478 movu [r0 + 29 * 16], m4
7479
7480 ;mode 2[row 30]
7481 movu [r0 + 60 * 16], m4
7482 palignr m7, m6, m5, 13
7483 movu [r0 + 61 * 16], m7
7484
7485 ;mode 2[row 15]
7486 palignr m3, m1, m0, 15
7487 movu [r0 + 30 * 16], m3
7488 palignr m4, m2, m1, 15
7489 movu [r0 + 31 * 16], m4
7490
7491 ;mode 2[row 31]
7492 movu [r0 + 62 * 16], m4
7493 palignr m7, m6, m5, 14
7494 movu [r0 + 63 * 16], m7
7495
7496 ;mode 2[row 16]
7497 movu [r0 + 32 * 16], m1
7498 movu [r0 + 33 * 16], m2
7499
7500 ; mode 5[row 31]
7501 movu [r0 + 254 * 16], m1
7502 movu [r0 + 255 * 16], m2
7503
7504 ; mode 3 [row 0]
7505 lea r5, [ang_table]
7506 movu m6, [r5 + 26 * 16]
7507 movu m7, [pw_1024 ]
7508 movu m1, [r4 + 1 ]
7509 punpcklbw m1, m0
7510 pmaddubsw m0, m1, m6
7511 pmulhrsw m0, m7
7512 movu m2, [r4 + 9]
7513 movd m3, [r4 + 10]
7514 palignr m3, m2, 1
7515 punpcklbw m2, m3
7516 pmaddubsw m3, m2, m6
7517 pmulhrsw m3, m7
7518 packuswb m0, m3
7519 movu [r0 + 64 * 16], m0
7520
7521 ; mode 6 [row 1 - first half]
7522 movu [r0 + 258 * 16], m0
7523
7524 ; mode 9 [row 12 - first half]
7525 movu [r0 + 472 * 16], m0
7526
7527 movu m0, [r4 + 17]
7528 movd m3, [r4 + 18]
7529 palignr m3, m0, 1
7530 punpcklbw m0, m3
7531 pmaddubsw m3, m0, m6
7532 pmulhrsw m3, m7
7533 movu m4, [r4 + 25]
7534 movd m5, [r4 + 26]
7535 palignr m5, m4, 1
7536 punpcklbw m4, m5
7537 pmaddubsw m5, m4, m6
7538 pmulhrsw m5, m7
7539 packuswb m3, m5
7540 movu [r0 + 65 * 16], m3
7541
7542 ; mode 6 [row 1 - second half]
7543 movu [r0 + 259 * 16], m3
7544
7545 ; mode 9 [row 12 - second half]
7546 movu [r0 + 473 * 16], m3
7547
7548 ; mode 4 [row 0]
7549 movu m6, [r5 + 21 * 16]
7550 pmaddubsw m3, m1, m6
7551 pmulhrsw m3, m7
7552 pmaddubsw m5, m2, m6
7553 pmulhrsw m5, m7
7554 packuswb m3, m5
7555 movu [r0 + 128 * 16], m3
7556 pmaddubsw m3, m0, m6
7557 pmulhrsw m3, m7
7558 pmaddubsw m5, m4, m6
7559 pmulhrsw m5, m7
7560 packuswb m3, m5
7561 movu [r0 + 129 * 16], m3
7562
7563 ; mode 5 [row 0]
7564 movu m6, [r5 + 17 * 16]
7565 pmaddubsw m3, m1, m6
7566 pmulhrsw m3, m7
7567 pmaddubsw m5, m2, m6
7568 pmulhrsw m5, m7
7569 packuswb m3, m5
7570 movu [r0 + 192 * 16], m3
7571 pmaddubsw m3, m0, m6
7572 pmulhrsw m3, m7
7573 pmaddubsw m5, m4, m6
7574 pmulhrsw m5, m7
7575 packuswb m3, m5
7576 movu [r0 + 193 * 16], m3
7577
7578 ; mode 6 [row 0]
7579 movu m6, [r5 + 13 * 16]
7580 pmaddubsw m3, m1, m6
7581 pmulhrsw m3, m7
7582 pmaddubsw m5, m2, m6
7583 pmulhrsw m5, m7
7584 packuswb m3, m5
7585 movu [r0 + 256 * 16], m3
7586 pmaddubsw m3, m0, m6
7587 pmulhrsw m3, m7
7588 pmaddubsw m5, m4, m6
7589 pmulhrsw m5, m7
7590 packuswb m3, m5
7591 movu [r0 + 257 * 16], m3
7592
7593 ; mode 7 [row 0]
7594 movu m6, [r5 + 9 * 16]
7595 pmaddubsw m3, m1, m6
7596 pmulhrsw m3, m7
7597 pmaddubsw m5, m2, m6
7598 pmulhrsw m5, m7
7599 packuswb m3, m5
7600 movu [r0 + 320 * 16], m3
7601 pmaddubsw m3, m0, m6
7602 pmulhrsw m3, m7
7603 pmaddubsw m5, m4, m6
7604 pmulhrsw m5, m7
7605 packuswb m3, m5
7606 movu [r0 + 321 * 16], m3
7607
7608 ; mode 7 [row 1]
7609 movu m6, [r5 + 18 * 16]
7610 pmaddubsw m3, m1, m6
7611 pmulhrsw m3, m7
7612 pmaddubsw m5, m2, m6
7613 pmulhrsw m5, m7
7614 packuswb m3, m5
7615 movu [r0 + 322 * 16], m3
7616
7617 ; mode 9 [row 8 - first half]
7618 movu [r0 + 464 * 16], m3
7619
7620 pmaddubsw m3, m0, m6
7621 pmulhrsw m3, m7
7622 pmaddubsw m5, m4, m6
7623 pmulhrsw m5, m7
7624 packuswb m3, m5
7625 movu [r0 + 323 * 16], m3
7626
7627 ; mode 9 [row 8 - second half]
7628 movu [r0 + 465 * 16], m3
7629
7630 ; mode 7 [row 2]
7631 movu m6, [r5 + 27 * 16]
7632 pmaddubsw m3, m1, m6
7633 pmulhrsw m3, m7
7634 pmaddubsw m5, m2, m6
7635 pmulhrsw m5, m7
7636 packuswb m3, m5
7637 movu [r0 + 324 * 16], m3
7638 pmaddubsw m3, m0, m6
7639 pmulhrsw m3, m7
7640 pmaddubsw m5, m4, m6
7641 pmulhrsw m5, m7
7642 packuswb m3, m5
7643 movu [r0 + 325 * 16], m3
7644
7645 ; mode 8 [row 0]
7646 movu m6, [r5 + 5 * 16]
7647 pmaddubsw m3, m1, m6
7648 pmulhrsw m3, m7
7649 pmaddubsw m5, m2, m6
7650 pmulhrsw m5, m7
7651 packuswb m3, m5
7652 movu [r0 + 384 * 16], m3
7653 pmaddubsw m3, m0, m6
7654 pmulhrsw m3, m7
7655 pmaddubsw m5, m4, m6
7656 pmulhrsw m5, m7
7657 packuswb m3, m5
7658 movu [r0 + 385 * 16], m3
7659
7660 ; mode 8 [row 1]
7661 movu m6, [r5 + 10 * 16]
7662 pmaddubsw m3, m1, m6
7663 pmulhrsw m3, m7
7664 pmaddubsw m5, m2, m6
7665 pmulhrsw m5, m7
7666 packuswb m3, m5
7667 movu [r0 + 386 * 16], m3
7668
7669 ; mode 9 [row 4 - first half]
7670 movu [r0 + 456 * 16], m3
7671
7672 pmaddubsw m3, m0, m6
7673 pmulhrsw m3, m7
7674 pmaddubsw m5, m4, m6
7675 pmulhrsw m5, m7
7676 packuswb m3, m5
7677 movu [r0 + 387 * 16], m3
7678
7679 ; mode 9 [row 4 - second half]
7680 movu [r0 + 457 * 16], m3
7681
7682 ; mode 8 [row 2]
7683 movu m6, [r5 + 15 * 16]
7684 pmaddubsw m3, m1, m6
7685 pmulhrsw m3, m7
7686 pmaddubsw m5, m2, m6
7687 pmulhrsw m5, m7
7688 packuswb m3, m5
7689 movu [r0 + 388 * 16], m3
7690 pmaddubsw m3, m0, m6
7691 pmulhrsw m3, m7
7692 pmaddubsw m5, m4, m6
7693 pmulhrsw m5, m7
7694 packuswb m3, m5
7695 movu [r0 + 389 * 16], m3
7696
7697 ; mode 8 [row 3]
7698 movu m6, [r5 + 20 * 16]
7699 pmaddubsw m3, m1, m6
7700 pmulhrsw m3, m7
7701 pmaddubsw m5, m2, m6
7702 pmulhrsw m5, m7
7703 packuswb m3, m5
7704 movu [r0 + 390 * 16], m3
7705
7706 ; mode 9 [row 9 - first half]
7707 movu [r0 + 466 * 16], m3
7708
7709 pmaddubsw m3, m0, m6
7710 pmulhrsw m3, m7
7711 pmaddubsw m5, m4, m6
7712 pmulhrsw m5, m7
7713 packuswb m3, m5
7714 movu [r0 + 391 * 16], m3
7715
7716 ; mode 9 [row 9 - second half]
7717 movu [r0 + 467 * 16], m3
7718
7719 ; mode 8 [row 4]
7720 movu m6, [r5 + 25 * 16]
7721 pmaddubsw m3, m1, m6
7722 pmulhrsw m3, m7
7723 pmaddubsw m5, m2, m6
7724 pmulhrsw m5, m7
7725 packuswb m3, m5
7726 movu [r0 + 392 * 16], m3
7727 pmaddubsw m3, m0, m6
7728 pmulhrsw m3, m7
7729 pmaddubsw m5, m4, m6
7730 pmulhrsw m5, m7
7731 packuswb m3, m5
7732 movu [r0 + 393 * 16], m3
7733
7734 ; mode 8 [row 5]
7735 movu m6, [r5 + 30 * 16]
7736 pmaddubsw m3, m1, m6
7737 pmulhrsw m3, m7
7738 pmaddubsw m5, m2, m6
7739 pmulhrsw m5, m7
7740 packuswb m3, m5
7741 movu [r0 + 394 * 16], m3
7742
7743 ; mode 9 [row 14 - first half]
7744 movu [r0 + 476 * 16], m3
7745
7746 pmaddubsw m3, m0, m6
7747 pmulhrsw m3, m7
7748 pmaddubsw m5, m4, m6
7749 pmulhrsw m5, m7
7750 packuswb m3, m5
7751 movu [r0 + 395 * 16], m3
7752
7753 ; mode 9 [row 14 - second half]
7754 movu [r0 + 477 * 16], m3
7755
7756 ; mode 9 [row 0]
7757 movu m6, [r5 + 2 * 16]
7758 pmaddubsw m3, m1, m6
7759 pmulhrsw m3, m7
7760 pmaddubsw m5, m2, m6
7761 pmulhrsw m5, m7
7762 packuswb m3, m5
7763 movu [r0 + 448 * 16], m3
7764 pmaddubsw m3, m0, m6
7765 pmulhrsw m3, m7
7766 pmaddubsw m5, m4, m6
7767 pmulhrsw m5, m7
7768 packuswb m3, m5
7769 movu [r0 + 449 * 16], m3
7770
7771 ; mode 9 [row 1]
7772 movu m6, [r5 + 4 * 16]
7773 pmaddubsw m3, m1, m6
7774 pmulhrsw m3, m7
7775 pmaddubsw m5, m2, m6
7776 pmulhrsw m5, m7
7777 packuswb m3, m5
7778 movu [r0 + 450 * 16], m3
7779 pmaddubsw m3, m0, m6
7780 pmulhrsw m3, m7
7781 pmaddubsw m5, m4, m6
7782 pmulhrsw m5, m7
7783 packuswb m3, m5
7784 movu [r0 + 451 * 16], m3
7785
7786 ; mode 9 [row 2]
7787 movu m6, [r5 + 6 * 16]
7788 pmaddubsw m3, m1, m6
7789 pmulhrsw m3, m7
7790 pmaddubsw m5, m2, m6
7791 pmulhrsw m5, m7
7792 packuswb m3, m5
7793 movu [r0 + 452 * 16], m3
7794 pmaddubsw m3, m0, m6
7795 pmulhrsw m3, m7
7796 pmaddubsw m5, m4, m6
7797 pmulhrsw m5, m7
7798 packuswb m3, m5
7799 movu [r0 + 453 * 16], m3
7800
7801 ; mode 9 [row 3]
7802 movu m6, [r5 + 8 * 16]
7803 pmaddubsw m3, m1, m6
7804 pmulhrsw m3, m7
7805 pmaddubsw m5, m2, m6
7806 pmulhrsw m5, m7
7807 packuswb m3, m5
7808 movu [r0 + 454 * 16], m3
7809 pmaddubsw m3, m0, m6
7810 pmulhrsw m3, m7
7811 pmaddubsw m5, m4, m6
7812 pmulhrsw m5, m7
7813 packuswb m3, m5
7814 movu [r0 + 455 * 16], m3
7815
7816 ; mode 9 [row 5]
7817 movu m6, [r5 + 12 * 16]
7818 pmaddubsw m3, m1, m6
7819 pmulhrsw m3, m7
7820 pmaddubsw m5, m2, m6
7821 pmulhrsw m5, m7
7822 packuswb m3, m5
7823 movu [r0 + 458 * 16], m3
7824 pmaddubsw m3, m0, m6
7825 pmulhrsw m3, m7
7826 pmaddubsw m5, m4, m6
7827 pmulhrsw m5, m7
7828 packuswb m3, m5
7829 movu [r0 + 459 * 16], m3
7830
7831 ; mode 9 [row 6]
7832 movu m6, [r5 + 14 * 16]
7833 pmaddubsw m3, m1, m6
7834 pmulhrsw m3, m7
7835 pmaddubsw m5, m2, m6
7836 pmulhrsw m5, m7
7837 packuswb m3, m5
7838 movu [r0 + 460 * 16], m3
7839 pmaddubsw m3, m0, m6
7840 pmulhrsw m3, m7
7841 pmaddubsw m5, m4, m6
7842 pmulhrsw m5, m7
7843 packuswb m3, m5
7844 movu [r0 + 461 * 16], m3
7845
7846 ; mode 9 [row 7]
7847 movu m6, [r5 + 16 * 16]
7848 pmaddubsw m3, m1, m6
7849 pmulhrsw m3, m7
7850 pmaddubsw m5, m2, m6
7851 pmulhrsw m5, m7
7852 packuswb m3, m5
7853 movu [r0 + 462 * 16], m3
7854 pmaddubsw m3, m0, m6
7855 pmulhrsw m3, m7
7856 pmaddubsw m5, m4, m6
7857 pmulhrsw m5, m7
7858 packuswb m3, m5
7859 movu [r0 + 463 * 16], m3
7860
7861 ; mode 9 [row 10]
7862 movu m6, [r5 + 22 * 16]
7863 pmaddubsw m3, m1, m6
7864 pmulhrsw m3, m7
7865 pmaddubsw m5, m2, m6
7866 pmulhrsw m5, m7
7867 packuswb m3, m5
7868 movu [r0 + 468 * 16], m3
7869 pmaddubsw m3, m0, m6
7870 pmulhrsw m3, m7
7871 pmaddubsw m5, m4, m6
7872 pmulhrsw m5, m7
7873 packuswb m3, m5
7874 movu [r0 + 469 * 16], m3
7875
7876 ; mode 9 [row 11]
7877 movu m6, [r5 + 24 * 16]
7878 pmaddubsw m3, m1, m6
7879 pmulhrsw m3, m7
7880 pmaddubsw m5, m2, m6
7881 pmulhrsw m5, m7
7882 packuswb m3, m5
7883 movu [r0 + 470 * 16], m3
7884 pmaddubsw m3, m0, m6
7885 pmulhrsw m3, m7
7886 pmaddubsw m5, m4, m6
7887 pmulhrsw m5, m7
7888 packuswb m3, m5
7889 movu [r0 + 471 * 16], m3
7890
7891 ; mode 9 [row 13]
7892 movu m6, [r5 + 28 * 16]
7893 pmaddubsw m3, m1, m6
7894 pmulhrsw m3, m7
7895 pmaddubsw m5, m2, m6
7896 pmulhrsw m5, m7
7897 packuswb m3, m5
7898 movu [r0 + 474 * 16], m3
7899 pmaddubsw m3, m0, m6
7900 pmulhrsw m3, m7
7901 pmaddubsw m5, m4, m6
7902 pmulhrsw m5, m7
7903 packuswb m3, m5
7904 movu [r0 + 475 * 16], m3
7905
7906 ; mode 3 [row 1]
7907 movu m6, [r5 + 20 * 16]
7908 movu m0, [r4 + 2]
7909 movd m1, [r4 + 3]
7910 palignr m1, m0, 1
7911 punpcklbw m0, m1
7912 pmaddubsw m1, m0, m6
7913 pmulhrsw m1, m7
7914 movu m2, [r4 + 10]
7915 movd m3, [r4 + 11]
7916 palignr m3, m2, 1
7917 punpcklbw m2, m3
7918 pmaddubsw m3, m2, m6
7919 pmulhrsw m3, m7
7920 packuswb m1, m3
7921 movu [r0 + 66 * 16], m1
7922
7923 ; mode 6 [row 3 - first half]
7924 movu [r0 + 262 * 16], m1
7925
7926 ; mode 9 [row 25 - first half]
7927 movu [r0 + 498 * 16], m1
7928
7929 movu m1, [r4 + 18]
7930 movd m3, [r4 + 19]
7931 palignr m3, m1, 1
7932 punpcklbw m1, m3
7933 pmaddubsw m3, m1, m6
7934 pmulhrsw m3, m7
7935 movu m4, [r4 + 26]
7936 movd m5, [r4 + 27]
7937 palignr m5, m4, 1
7938 punpcklbw m4, m5
7939 pmaddubsw m5, m4, m6
7940 pmulhrsw m5, m7
7941 packuswb m3, m5
7942 movu [r0 + 67 * 16], m3
7943
7944 ; mode 6 [row 3 - second half]
7945 movu [r0 + 263 * 16], m3
7946
7947 ; mode 9 [row 25 - second half]
7948 movu [r0 + 499 * 16], m3
7949
7950 ; mode 4 [row 1]
7951 movu m6, [r5 + 10 * 16]
7952 pmaddubsw m3, m0, m6
7953 pmulhrsw m3, m7
7954 pmaddubsw m5, m2, m6
7955 pmulhrsw m5, m7
7956 packuswb m3, m5
7957 movu [r0 + 130 * 16], m3
7958
7959 ; mode 9 [row 20 - first half]
7960 movu [r0 + 488 * 16], m3
7961
7962 pmaddubsw m3, m1, m6
7963 pmulhrsw m3, m7
7964 pmaddubsw m5, m4, m6
7965 pmulhrsw m5, m7
7966 packuswb m3, m5
7967 movu [r0 + 131 * 16], m3
7968
7969 ; mode 9 [row 20 - second half]
7970 movu [r0 + 489 * 16], m3
7971
7972 ; mode 4 [row 2]
7973 movu m6, [r5 + 31 * 16]
7974 pmaddubsw m3, m0, m6
7975 pmulhrsw m3, m7
7976 pmaddubsw m5, m2, m6
7977 pmulhrsw m5, m7
7978 packuswb m3, m5
7979 movu [r0 + 132 * 16], m3
7980
7981 ; mode 7 [row 6 - first half]
7982 movu [r0 + 332 * 16], m3
7983
7984 pmaddubsw m3, m1, m6
7985 pmulhrsw m3, m7
7986 pmaddubsw m5, m4, m6
7987 pmulhrsw m5, m7
7988 packuswb m3, m5
7989 movu [r0 + 133 * 16], m3
7990
7991 ; mode 7 [row 6 - second half]
7992 movu [r0 + 333 * 16], m3
7993
7994 ; mode 5 [row 1]
7995 movu m6, [r5 + 2 * 16]
7996 pmaddubsw m3, m0, m6
7997 pmulhrsw m3, m7
7998 pmaddubsw m5, m2, m6
7999 pmulhrsw m5, m7
8000 packuswb m3, m5
8001 movu [r0 + 194 * 16], m3
8002
8003 ; mode 9 [row 16 - first half]
8004 movu [r0 + 480 * 16], m3
8005
8006 pmaddubsw m3, m1, m6
8007 pmulhrsw m3, m7
8008 pmaddubsw m5, m4, m6
8009 pmulhrsw m5, m7
8010 packuswb m3, m5
8011 movu [r0 + 195 * 16], m3
8012
8013 ; mode 9 [row 16 - second half]
8014 movu [r0 + 481 * 16], m3
8015
8016 ; mode 5 [row 2]
8017 movu m6, [r5 + 19 * 16]
8018 pmaddubsw m3, m0, m6
8019 pmulhrsw m3, m7
8020 pmaddubsw m5, m2, m6
8021 pmulhrsw m5, m7
8022 packuswb m3, m5
8023 movu [r0 + 196 * 16], m3
8024 pmaddubsw m3, m1, m6
8025 pmulhrsw m3, m7
8026 pmaddubsw m5, m4, m6
8027 pmulhrsw m5, m7
8028 packuswb m3, m5
8029 movu [r0 + 197 * 16], m3
8030
8031 ; mode 6 [row 2]
8032 movu m6, [r5 + 7 * 16]
8033 pmaddubsw m3, m0, m6
8034 pmulhrsw m3, m7
8035 pmaddubsw m5, m2, m6
8036 pmulhrsw m5, m7
8037 packuswb m3, m5
8038 movu [r0 + 260 * 16], m3
8039 pmaddubsw m3, m1, m6
8040 pmulhrsw m3, m7
8041 pmaddubsw m5, m4, m6
8042 pmulhrsw m5, m7
8043 packuswb m3, m5
8044 movu [r0 + 261 * 16], m3
8045
8046 ; mode 7 [row 3]
8047 movu m6, [r5 + 4 * 16]
8048 pmaddubsw m3, m0, m6
8049 pmulhrsw m3, m7
8050 pmaddubsw m5, m2, m6
8051 pmulhrsw m5, m7
8052 packuswb m3, m5
8053 movu [r0 + 326 * 16], m3
8054
8055 ; mode 9 [row 17 - first half]
8056 movu [r0 + 482 * 16], m3
8057
8058 pmaddubsw m3, m1, m6
8059 pmulhrsw m3, m7
8060 pmaddubsw m5, m4, m6
8061 pmulhrsw m5, m7
8062 packuswb m3, m5
8063 movu [r0 + 327 * 16], m3
8064
8065 ; mode 9 [row 17 - second half]
8066 movu [r0 + 483 * 16], m3
8067
8068 ; mode 7 [row 4]
8069 movu m6, [r5 + 13 * 16]
8070 pmaddubsw m3, m0, m6
8071 pmulhrsw m3, m7
8072 pmaddubsw m5, m2, m6
8073 pmulhrsw m5, m7
8074 packuswb m3, m5
8075 movu [r0 + 328 * 16], m3
8076
8077 ; mode 8 [row 8 - first half]
8078 movu [r0 + 400 * 16], m3
8079
8080 pmaddubsw m3, m1, m6
8081 pmulhrsw m3, m7
8082 pmaddubsw m5, m4, m6
8083 pmulhrsw m5, m7
8084 packuswb m3, m5
8085 movu [r0 + 329 * 16], m3
8086
8087 ; mode 8 [row 8 - second half]
8088 movu [r0 + 401 * 16], m3
8089
8090 ; mode 7 [row 5]
8091 movu m6, [r5 + 22 * 16]
8092 pmaddubsw m3, m0, m6
8093 pmulhrsw m3, m7
8094 pmaddubsw m5, m2, m6
8095 pmulhrsw m5, m7
8096 packuswb m3, m5
8097 movu [r0 + 330 * 16], m3
8098
8099 ; mode 9 [row 26 - first half]
8100 movu [r0 + 500 * 16], m3
8101
8102 pmaddubsw m3, m1, m6
8103 pmulhrsw m3, m7
8104 pmaddubsw m5, m4, m6
8105 pmulhrsw m5, m7
8106 packuswb m3, m5
8107 movu [r0 + 331 * 16], m3
8108
8109 ; mode 9 [row 26 - second half]
8110 movu [r0 + 501 * 16], m3
8111
8112 ; mode 8 [row 6]
8113 movu m6, [r5 + 3 * 16]
8114 pmaddubsw m3, m0, m6
8115 pmulhrsw m3, m7
8116 pmaddubsw m5, m2, m6
8117 pmulhrsw m5, m7
8118 packuswb m3, m5
8119 movu [r0 + 396 * 16], m3
8120 pmaddubsw m3, m1, m6
8121 pmulhrsw m3, m7
8122 pmaddubsw m5, m4, m6
8123 pmulhrsw m5, m7
8124 packuswb m3, m5
8125 movu [r0 + 397 * 16], m3
8126
8127 ; mode 9 [row 18]
8128 movu m6, [r5 + 6 * 16]
8129 pmaddubsw m3, m0, m6
8130 pmulhrsw m3, m7
8131 pmaddubsw m5, m2, m6
8132 pmulhrsw m5, m7
8133 packuswb m3, m5
8134 movu [r0 + 484 * 16], m3
8135 pmaddubsw m3, m1, m6
8136 pmulhrsw m3, m7
8137 pmaddubsw m5, m4, m6
8138 pmulhrsw m5, m7
8139 packuswb m3, m5
8140 movu [r0 + 485 * 16], m3
8141
8142 ; mode 9 [row 21]
8143 movu m6, [r5 + 12 * 16]
8144 pmaddubsw m3, m0, m6
8145 pmulhrsw m3, m7
8146 pmaddubsw m5, m2, m6
8147 pmulhrsw m5, m7
8148 packuswb m3, m5
8149 movu [r0 + 490 * 16], m3
8150 pmaddubsw m3, m1, m6
8151 pmulhrsw m3, m7
8152 pmaddubsw m5, m4, m6
8153 pmulhrsw m5, m7
8154 packuswb m3, m5
8155 movu [r0 + 491 * 16], m3
8156
8157 ; mode 9 [row 22]
8158 movu m6, [r5 + 14 * 16]
8159 pmaddubsw m3, m0, m6
8160 pmulhrsw m3, m7
8161 pmaddubsw m5, m2, m6
8162 pmulhrsw m5, m7
8163 packuswb m3, m5
8164 movu [r0 + 492 * 16], m3
8165 pmaddubsw m3, m1, m6
8166 pmulhrsw m3, m7
8167 pmaddubsw m5, m4, m6
8168 pmulhrsw m5, m7
8169 packuswb m3, m5
8170 movu [r0 + 493 * 16], m3
8171
8172 ; mode 9 [row 23]
8173 movu m6, [r5 + 16 * 16]
8174 pmaddubsw m3, m0, m6
8175 pmulhrsw m3, m7
8176 pmaddubsw m5, m2, m6
8177 pmulhrsw m5, m7
8178 packuswb m3, m5
8179 movu [r0 + 494 * 16], m3
8180 pmaddubsw m3, m1, m6
8181 pmulhrsw m3, m7
8182 pmaddubsw m5, m4, m6
8183 pmulhrsw m5, m7
8184 packuswb m3, m5
8185 movu [r0 + 495 * 16], m3
8186
8187 ; mode 9 [row 27]
8188 movu m6, [r5 + 24 * 16]
8189 pmaddubsw m3, m0, m6
8190 pmulhrsw m3, m7
8191 pmaddubsw m5, m2, m6
8192 pmulhrsw m5, m7
8193 packuswb m3, m5
8194 movu [r0 + 502 * 16], m3
8195 pmaddubsw m3, m1, m6
8196 pmulhrsw m3, m7
8197 pmaddubsw m5, m4, m6
8198 pmulhrsw m5, m7
8199 packuswb m3, m5
8200 movu [r0 + 503 * 16], m3
8201
8202 ; mode 9 [row 28]
8203 movu m6, [r5 + 26 * 16]
8204 pmaddubsw m3, m0, m6
8205 pmulhrsw m3, m7
8206 pmaddubsw m5, m2, m6
8207 pmulhrsw m5, m7
8208 packuswb m3, m5
8209 movu [r0 + 504 * 16], m3
8210 pmaddubsw m3, m1, m6
8211 pmulhrsw m3, m7
8212 pmaddubsw m5, m4, m6
8213 pmulhrsw m5, m7
8214 packuswb m3, m5
8215 movu [r0 + 505 * 16], m3
8216
8217 ; mode 9 [row 30]
8218 movu m6, [r5 + 30 * 16]
8219 pmaddubsw m3, m0, m6
8220 pmulhrsw m3, m7
8221 pmaddubsw m5, m2, m6
8222 pmulhrsw m5, m7
8223 packuswb m3, m5
8224 movu [r0 + 508 * 16], m3
8225 pmaddubsw m3, m1, m6
8226 pmulhrsw m3, m7
8227 pmaddubsw m5, m4, m6
8228 pmulhrsw m5, m7
8229 packuswb m3, m5
8230 movu [r0 + 509 * 16], m3
8231
8232 ; mode 8 [row 7]
8233 movu m6, [r5 + 8 * 16]
8234 pmaddubsw m3, m0, m6
8235 pmulhrsw m3, m7
8236 pmaddubsw m5, m2, m6
8237 pmulhrsw m5, m7
8238 packuswb m3, m5
8239 movu [r0 + 398 * 16], m3
8240
8241 ; mode 9 [row 19 - first half]
8242 movu [r0 + 486 * 16], m3
8243
8244 pmaddubsw m3, m1, m6
8245 pmulhrsw m3, m7
8246 pmaddubsw m5, m4, m6
8247 pmulhrsw m5, m7
8248 packuswb m3, m5
8249 movu [r0 + 399 * 16], m3
8250
8251 ; mode 9 [row 19 - second half]
8252 movu [r0 + 487 * 16], m3
8253
8254 ; mode 8 [row 9]
8255 movu m6, [r5 + 18 * 16]
8256 pmaddubsw m3, m0, m6
8257 pmulhrsw m3, m7
8258 pmaddubsw m5, m2, m6
8259 pmulhrsw m5, m7
8260 packuswb m3, m5
8261 movu [r0 + 402 * 16], m3
8262
8263 ; mode 9 [row 24 - first half]
8264 movu [r0 + 496 * 16], m3
8265
8266 pmaddubsw m3, m1, m6
8267 pmulhrsw m3, m7
8268 pmaddubsw m5, m4, m6
8269 pmulhrsw m5, m7
8270 packuswb m3, m5
8271 movu [r0 + 403 * 16], m3
8272
8273 ; mode 9 [row 24 - second half]
8274 movu [r0 + 497 * 16], m3
8275
8276 ; mode 8 [row 10]
8277 movu m6, [r5 + 23 * 16]
8278 pmaddubsw m3, m0, m6
8279 pmulhrsw m3, m7
8280 pmaddubsw m5, m2, m6
8281 pmulhrsw m5, m7
8282 packuswb m3, m5
8283 movu [r0 + 404 * 16], m3
8284 pmaddubsw m3, m1, m6
8285 pmulhrsw m3, m7
8286 pmaddubsw m5, m4, m6
8287 pmulhrsw m5, m7
8288 packuswb m3, m5
8289 movu [r0 + 405 * 16], m3
8290
8291 ; mode 8 [row 11]
8292 movu m6, [r5 + 28 * 16]
8293 pmaddubsw m3, m0, m6
8294 pmulhrsw m3, m7
8295 pmaddubsw m5, m2, m6
8296 pmulhrsw m5, m7
8297 packuswb m3, m5
8298 movu [r0 + 406 * 16], m3
8299
8300 ; mode 9 [row 29 - first half]
8301 movu [r0 + 506 * 16], m3
8302
8303 pmaddubsw m3, m1, m6
8304 pmulhrsw m3, m7
8305 pmaddubsw m5, m4, m6
8306 pmulhrsw m5, m7
8307 packuswb m3, m5
8308 movu [r0 + 407 * 16], m3
8309
8310 ; mode 9 [row 29 - second half]
8311 movu [r0 + 507 * 16], m3
8312
8313 ; mode 3 [row 2]
8314 movu m6, [r5 + 14 * 16]
8315 movu m0, [r4 + 3]
8316 movd m1, [r4 + 4]
8317 palignr m1, m0, 1
8318 punpcklbw m0, m1
8319 pmaddubsw m1, m0, m6
8320 pmulhrsw m1, m7
8321 movu m2, [r4 + 11]
8322 movd m3, [r4 + 12]
8323 palignr m3, m2, 1
8324 punpcklbw m2, m3
8325 pmaddubsw m3, m2, m6
8326 pmulhrsw m3, m7
8327 packuswb m1, m3
8328 movu [r0 + 68 * 16], m1
8329
8330 ; mode 6 [row 5 - first half]
8331 movu [r0 + 266 * 16], m1
8332
8333 movu m1, [r4 + 19]
8334 movd m3, [r4 + 20]
8335 palignr m3, m1, 1
8336 punpcklbw m1, m3
8337 pmaddubsw m3, m1, m6
8338 pmulhrsw m3, m7
8339 movu m4, [r4 + 27]
8340 movd m5, [r4 + 28]
8341 palignr m5, m4, 1
8342 punpcklbw m4, m5
8343 pmaddubsw m5, m4, m6
8344 pmulhrsw m5, m7
8345 packuswb m3, m5
8346 movu [r0 + 69 * 16], m3
8347
8348 ; mode 6 [row 5 - second half]
8349 movu [r0 + 267 * 16], m3
8350
8351 ; mode 4 [row 3]
8352 movu m6, [r5 + 20 * 16]
8353 pmaddubsw m3, m0, m6
8354 pmulhrsw m3, m7
8355 pmaddubsw m5, m2, m6
8356 pmulhrsw m5, m7
8357 packuswb m3, m5
8358 movu [r0 + 134 * 16], m3
8359 pmaddubsw m3, m1, m6
8360 pmulhrsw m3, m7
8361 pmaddubsw m5, m4, m6
8362 pmulhrsw m5, m7
8363 packuswb m3, m5
8364 movu [r0 + 135 * 16], m3
8365
8366 ; mode 5 [row 3]
8367 movu m6, [r5 + 4 * 16]
8368 pmaddubsw m3, m0, m6
8369 pmulhrsw m3, m7
8370 pmaddubsw m5, m2, m6
8371 pmulhrsw m5, m7
8372 packuswb m3, m5
8373 movu [r0 + 198 * 16], m3
8374 pmaddubsw m3, m1, m6
8375 pmulhrsw m3, m7
8376 pmaddubsw m5, m4, m6
8377 pmulhrsw m5, m7
8378 packuswb m3, m5
8379 movu [r0 + 199 * 16], m3
8380
8381 ; mode 5 [row 4]
8382 movu m6, [r5 + 21 * 16]
8383 pmaddubsw m3, m0, m6
8384 pmulhrsw m3, m7
8385 pmaddubsw m5, m2, m6
8386 pmulhrsw m5, m7
8387 packuswb m3, m5
8388 movu [r0 + 200 * 16], m3
8389
8390 ; mode 8 [row 16 - first half]
8391 movu [r0 + 416 * 16], m3
8392
8393 pmaddubsw m3, m1, m6
8394 pmulhrsw m3, m7
8395 pmaddubsw m5, m4, m6
8396 pmulhrsw m5, m7
8397 packuswb m3, m5
8398 movu [r0 + 201 * 16], m3
8399
8400 ; mode 8 [row 16 - second half]
8401 movu [r0 + 417 * 16], m3
8402
8403 ; mode 6 [row 4]
8404 movu m6, [r5 + 1 * 16]
8405 pmaddubsw m3, m0, m6
8406 pmulhrsw m3, m7
8407 pmaddubsw m5, m2, m6
8408 pmulhrsw m5, m7
8409 packuswb m3, m5
8410 movu [r0 + 264 * 16], m3
8411
8412 ; mode 8 [row 12 - first half]
8413 movu [r0 + 408 * 16], m3
8414
8415 pmaddubsw m3, m1, m6
8416 pmulhrsw m3, m7
8417 pmaddubsw m5, m4, m6
8418 pmulhrsw m5, m7
8419 packuswb m3, m5
8420 movu [r0 + 265 * 16], m3
8421
8422 ; mode 8 [row 12 - second half]
8423 movu [r0 + 409 * 16], m3
8424
8425 ; mode 6 [row 6]
8426 movu m6, [r5 + 27 * 16]
8427 pmaddubsw m3, m0, m6
8428 pmulhrsw m3, m7
8429 pmaddubsw m5, m2, m6
8430 pmulhrsw m5, m7
8431 packuswb m3, m5
8432 movu [r0 + 268 * 16], m3
8433 pmaddubsw m3, m1, m6
8434 pmulhrsw m3, m7
8435 pmaddubsw m5, m4, m6
8436 pmulhrsw m5, m7
8437 packuswb m3, m5
8438 movu [r0 + 269 * 16], m3
8439
8440 ; mode 7 [row 7]
8441 movu m6, [r5 + 8 * 16]
8442 pmaddubsw m3, m0, m6
8443 pmulhrsw m3, m7
8444 pmaddubsw m5, m2, m6
8445 pmulhrsw m5, m7
8446 packuswb m3, m5
8447 movu [r0 + 334 * 16], m3
8448 pmaddubsw m3, m1, m6
8449 pmulhrsw m3, m7
8450 pmaddubsw m5, m4, m6
8451 pmulhrsw m5, m7
8452 packuswb m3, m5
8453 movu [r0 + 335 * 16], m3
8454
8455 ; mode 7 [row 8]
8456 movu m6, [r5 + 17 * 16]
8457 pmaddubsw m3, m0, m6
8458 pmulhrsw m3, m7
8459 pmaddubsw m5, m2, m6
8460 pmulhrsw m5, m7
8461 packuswb m3, m5
8462 movu [r0 + 336 * 16], m3
8463 pmaddubsw m3, m1, m6
8464 pmulhrsw m3, m7
8465 pmaddubsw m5, m4, m6
8466 pmulhrsw m5, m7
8467 packuswb m3, m5
8468 movu [r0 + 337 * 16], m3
8469
8470 ; mode 7 [row 9]
8471 movu m6, [r5 + 26 * 16]
8472 pmaddubsw m3, m0, m6
8473 pmulhrsw m3, m7
8474 pmaddubsw m5, m2, m6
8475 pmulhrsw m5, m7
8476 packuswb m3, m5
8477 movu [r0 + 338 * 16], m3
8478
8479 ; mode 8 [row 17 - first half]
8480 movu [r0 + 418 * 16], m3
8481
8482 pmaddubsw m3, m1, m6
8483 pmulhrsw m3, m7
8484 pmaddubsw m5, m4, m6
8485 pmulhrsw m5, m7
8486 packuswb m3, m5
8487 movu [r0 + 339 * 16], m3
8488
8489 ; mode 8 [row 17 - second half]
8490 movu [r0 + 419 * 16], m3
8491
8492 ; mode 8 [row 13]
8493 movu m6, [r5 + 6 * 16]
8494 pmaddubsw m3, m0, m6
8495 pmulhrsw m3, m7
8496 pmaddubsw m5, m2, m6
8497 pmulhrsw m5, m7
8498 packuswb m3, m5
8499 movu [r0 + 410 * 16], m3
8500 pmaddubsw m3, m1, m6
8501 pmulhrsw m3, m7
8502 pmaddubsw m5, m4, m6
8503 pmulhrsw m5, m7
8504 packuswb m3, m5
8505 movu [r0 + 411 * 16], m3
8506
8507 ; mode 8 [row 14]
8508 movu m6, [r5 + 11 * 16]
8509 pmaddubsw m3, m0, m6
8510 pmulhrsw m3, m7
8511 pmaddubsw m5, m2, m6
8512 pmulhrsw m5, m7
8513 packuswb m3, m5
8514 movu [r0 + 412 * 16], m3
8515 pmaddubsw m3, m1, m6
8516 pmulhrsw m3, m7
8517 pmaddubsw m5, m4, m6
8518 pmulhrsw m5, m7
8519 packuswb m3, m5
8520 movu [r0 + 413 * 16], m3
8521
8522 ; mode 8 [row 15]
8523 movu m6, [r5 + 16 * 16]
8524 pmaddubsw m3, m0, m6
8525 pmulhrsw m3, m7
8526 pmaddubsw m5, m2, m6
8527 pmulhrsw m5, m7
8528 packuswb m3, m5
8529 movu [r0 + 414 * 16], m3
8530 pmaddubsw m3, m1, m6
8531 pmulhrsw m3, m7
8532 pmaddubsw m5, m4, m6
8533 pmulhrsw m5, m7
8534 packuswb m3, m5
8535 movu [r0 + 415 * 16], m3
8536
8537 ; mode 8 [row 18]
8538 movu m6, [r5 + 31 * 16]
8539 pmaddubsw m3, m0, m6
8540 pmulhrsw m3, m7
8541 pmaddubsw m5, m2, m6
8542 pmulhrsw m5, m7
8543 packuswb m3, m5
8544 movu [r0 + 420 * 16], m3
8545 pmaddubsw m3, m1, m6
8546 pmulhrsw m3, m7
8547 pmaddubsw m5, m4, m6
8548 pmulhrsw m5, m7
8549 packuswb m3, m5
8550 movu [r0 + 421 * 16], m3
8551
8552 ; mode 3 [row 3]
8553 movu m6, [r5 + 8 * 16]
8554 movu m0, [r4 + 4]
8555 movd m1, [r4 + 5]
8556 palignr m1, m0, 1
8557 punpcklbw m0, m1
8558 pmaddubsw m1, m0, m6
8559 pmulhrsw m1, m7
8560 movu m2, [r4 + 12]
8561 movd m3, [r4 + 13]
8562 palignr m3, m2, 1
8563 punpcklbw m2, m3
8564 pmaddubsw m3, m2, m6
8565 pmulhrsw m3, m7
8566 packuswb m1, m3
8567 movu [r0 + 70 * 16], m1
8568
8569 ; mode 6 [row 7 - first half]
8570 movu [r0 + 270 * 16], m1
8571
8572 movu m1, [r4 + 20]
8573 movd m3, [r4 + 21]
8574 palignr m3, m1, 1
8575 punpcklbw m1, m3
8576 pmaddubsw m3, m1, m6
8577 pmulhrsw m3, m7
8578 movu m4, [r4 + 28]
8579 movd m5, [r4 + 29]
8580 palignr m5, m4, 1
8581 punpcklbw m4, m5
8582 pmaddubsw m5, m4, m6
8583 pmulhrsw m5, m7
8584 packuswb m3, m5
8585 movu [r0 + 71 * 16], m3
8586
8587 ; mode 6 [row 7 - second half]
8588 movu [r0 + 271 * 16], m3
8589
8590 ; mode 4 [row 4]
8591 movu m6, [r5 + 9 * 16]
8592 pmaddubsw m3, m0, m6
8593 pmulhrsw m3, m7
8594 pmaddubsw m5, m2, m6
8595 pmulhrsw m5, m7
8596 packuswb m3, m5
8597 movu [r0 + 136 * 16], m3
8598
8599 ; mode 8 [row 20 - first half]
8600 movu [r0 + 424 * 16], m3
8601
8602 pmaddubsw m3, m1, m6
8603 pmulhrsw m3, m7
8604 pmaddubsw m5, m4, m6
8605 pmulhrsw m5, m7
8606 packuswb m3, m5
8607 movu [r0 + 137 * 16], m3
8608
8609 ; mode 8 [row 20 - second half]
8610 movu [r0 + 425 * 16], m3
8611
8612 ; mode 4 [row 5]
8613 movu m6, [r5 + 30 * 16]
8614 pmaddubsw m3, m0, m6
8615 pmulhrsw m3, m7
8616 pmaddubsw m5, m2, m6
8617 pmulhrsw m5, m7
8618 packuswb m3, m5
8619 movu [r0 + 138 * 16], m3
8620
8621 ; mode 7 [row 13 - first half]
8622 movu [r0 + 346 * 16], m3
8623
8624 pmaddubsw m3, m1, m6
8625 pmulhrsw m3, m7
8626 pmaddubsw m5, m4, m6
8627 pmulhrsw m5, m7
8628 packuswb m3, m5
8629 movu [r0 + 139 * 16], m3
8630
8631 ; mode 7 [row 13 - second half]
8632 movu [r0 + 347 * 16], m3
8633
8634 ; mode 5 [row 5]
8635 movu m6, [r5 + 6 * 16]
8636 pmaddubsw m3, m0, m6
8637 pmulhrsw m3, m7
8638 pmaddubsw m5, m2, m6
8639 pmulhrsw m5, m7
8640 packuswb m3, m5
8641 movu [r0 + 202 * 16], m3
8642 pmaddubsw m3, m1, m6
8643 pmulhrsw m3, m7
8644 pmaddubsw m5, m4, m6
8645 pmulhrsw m5, m7
8646 packuswb m3, m5
8647 movu [r0 + 203 * 16], m3
8648
8649 ; mode 5 [row 6]
8650 movu m6, [r5 + 23 * 16]
8651 pmaddubsw m3, m0, m6
8652 pmulhrsw m3, m7
8653 pmaddubsw m5, m2, m6
8654 pmulhrsw m5, m7
8655 packuswb m3, m5
8656 movu [r0 + 204 * 16], m3
8657 pmaddubsw m3, m1, m6
8658 pmulhrsw m3, m7
8659 pmaddubsw m5, m4, m6
8660 pmulhrsw m5, m7
8661 packuswb m3, m5
8662 movu [r0 + 205 * 16], m3
8663
8664 ; mode 6 [row 8]
8665 movu m6, [r5 + 21 * 16]
8666 pmaddubsw m3, m0, m6
8667 pmulhrsw m3, m7
8668 pmaddubsw m5, m2, m6
8669 pmulhrsw m5, m7
8670 packuswb m3, m5
8671 movu [r0 + 272 * 16], m3
8672
8673 ; mode 7 [row 12 - first half]
8674 movu [r0 + 344 * 16], m3
8675
8676 pmaddubsw m3, m1, m6
8677 pmulhrsw m3, m7
8678 pmaddubsw m5, m4, m6
8679 pmulhrsw m5, m7
8680 packuswb m3, m5
8681 movu [r0 + 273 * 16], m3
8682
8683 ; mode 7 [row 12 - second half]
8684 movu [r0 + 345 * 16], m3
8685
8686 ; mode 7 [row 10]
8687 movu m6, [r5 + 3 * 16]
8688 pmaddubsw m3, m0, m6
8689 pmulhrsw m3, m7
8690 pmaddubsw m5, m2, m6
8691 pmulhrsw m5, m7
8692 packuswb m3, m5
8693 movu [r0 + 340 * 16], m3
8694 pmaddubsw m3, m1, m6
8695 pmulhrsw m3, m7
8696 pmaddubsw m5, m4, m6
8697 pmulhrsw m5, m7
8698 packuswb m3, m5
8699 movu [r0 + 341 * 16], m3
8700
8701 ; mode 7 [row 11]
8702 movu m6, [r5 + 12 * 16]
8703 pmaddubsw m3, m0, m6
8704 pmulhrsw m3, m7
8705 pmaddubsw m5, m2, m6
8706 pmulhrsw m5, m7
8707 packuswb m3, m5
8708 movu [r0 + 342 * 16], m3
8709 pmaddubsw m3, m1, m6
8710 pmulhrsw m3, m7
8711 pmaddubsw m5, m4, m6
8712 pmulhrsw m5, m7
8713 packuswb m3, m5
8714 movu [r0 + 343 * 16], m3
8715
8716 ; mode 8 [row 19]
8717 movu m6, [r5 + 4 * 16]
8718 pmaddubsw m3, m0, m6
8719 pmulhrsw m3, m7
8720 pmaddubsw m5, m2, m6
8721 pmulhrsw m5, m7
8722 packuswb m3, m5
8723 movu [r0 + 422 * 16], m3
8724 pmaddubsw m3, m1, m6
8725 pmulhrsw m3, m7
8726 pmaddubsw m5, m4, m6
8727 pmulhrsw m5, m7
8728 packuswb m3, m5
8729 movu [r0 + 423 * 16], m3
8730
8731 ; mode 8 [row 21]
8732 movu m6, [r5 + 14 * 16]
8733 pmaddubsw m3, m0, m6
8734 pmulhrsw m3, m7
8735 pmaddubsw m5, m2, m6
8736 pmulhrsw m5, m7
8737 packuswb m3, m5
8738 movu [r0 + 426 * 16], m3
8739 pmaddubsw m3, m1, m6
8740 pmulhrsw m3, m7
8741 pmaddubsw m5, m4, m6
8742 pmulhrsw m5, m7
8743 packuswb m3, m5
8744 movu [r0 + 427 * 16], m3
8745
8746 ; mode 8 [row 22]
8747 movu m6, [r5 + 19 * 16]
8748 pmaddubsw m3, m0, m6
8749 pmulhrsw m3, m7
8750 pmaddubsw m5, m2, m6
8751 pmulhrsw m5, m7
8752 packuswb m3, m5
8753 movu [r0 + 428 * 16], m3
8754 pmaddubsw m3, m1, m6
8755 pmulhrsw m3, m7
8756 pmaddubsw m5, m4, m6
8757 pmulhrsw m5, m7
8758 packuswb m3, m5
8759 movu [r0 + 429 * 16], m3
8760
8761 ; mode 8 [row 23]
8762 movu m6, [r5 + 24 * 16]
8763 pmaddubsw m3, m0, m6
8764 pmulhrsw m3, m7
8765 pmaddubsw m5, m2, m6
8766 pmulhrsw m5, m7
8767 packuswb m3, m5
8768 movu [r0 + 430 * 16], m3
8769 pmaddubsw m3, m1, m6
8770 pmulhrsw m3, m7
8771 pmaddubsw m5, m4, m6
8772 pmulhrsw m5, m7
8773 packuswb m3, m5
8774 movu [r0 + 431 * 16], m3
8775
8776 ; mode 8 [row 24]
8777 movu m6, [r5 + 29 * 16]
8778 pmaddubsw m3, m0, m6
8779 pmulhrsw m3, m7
8780 pmaddubsw m5, m2, m6
8781 pmulhrsw m5, m7
8782 packuswb m3, m5
8783 movu [r0 + 432 * 16], m3
8784 pmaddubsw m3, m1, m6
8785 pmulhrsw m3, m7
8786 pmaddubsw m5, m4, m6
8787 pmulhrsw m5, m7
8788 packuswb m3, m5
8789 movu [r0 + 433 * 16], m3
8790
8791 ; mode 3 [row 4]
8792 movu m6, [r5 + 2 * 16]
8793 movu m0, [r4 + 5]
8794 movd m1, [r4 + 6]
8795 palignr m1, m0, 1
8796 punpcklbw m0, m1
8797 pmaddubsw m1, m0, m6
8798 pmulhrsw m1, m7
8799 movu m2, [r4 + 13]
8800 movd m3, [r4 + 14]
8801 palignr m3, m2, 1
8802 punpcklbw m2, m3
8803 pmaddubsw m3, m2, m6
8804 pmulhrsw m3, m7
8805 packuswb m1, m3
8806 movu [r0 + 72 * 16], m1
8807
8808 ; mode 6 [row 9 - first half]
8809 movu [r0 + 274 * 16], m1
8810
8811 ; mode 8 [row 25 - first half]
8812 movu [r0 + 434 * 16], m1
8813
8814 movu m1, [r4 + 21]
8815 movd m3, [r4 + 22]
8816 palignr m3, m1, 1
8817 punpcklbw m1, m3
8818 pmaddubsw m3, m1, m6
8819 pmulhrsw m3, m7
8820 movu m4, [r4 + 29]
8821 movd m5, [r4 + 30]
8822 palignr m5, m4, 1
8823 punpcklbw m4, m5
8824 pmaddubsw m5, m4, m6
8825 pmulhrsw m5, m7
8826 packuswb m3, m5
8827 movu [r0 + 73 * 16], m3
8828
8829 ; mode 6 [row 9 - second half]
8830 movu [r0 + 275 * 16], m3
8831
8832 ; mode 8 [row 25 - second half]
8833 movu [r0 + 435 * 16], m3
8834
8835 ; mode 3 [row 5]
8836 movu m6, [r5 + 28 * 16]
8837 pmaddubsw m3, m0, m6
8838 pmulhrsw m3, m7
8839 pmaddubsw m5, m2, m6
8840 pmulhrsw m5, m7
8841 packuswb m3, m5
8842 movu [r0 + 74 * 16], m3
8843
8844 ; mode 6 [row 11 - first half]
8845 movu [r0 + 278 * 16], m3
8846
8847 pmaddubsw m3, m1, m6
8848 pmulhrsw m3, m7
8849 pmaddubsw m5, m4, m6
8850 pmulhrsw m5, m7
8851 packuswb m3, m5
8852 movu [r0 + 75 * 16], m3
8853
8854 ; mode 6 [row 11 - second half]
8855 movu [r0 + 279 * 16], m3
8856
8857 ; mode 4 [row 6]
8858 movu m6, [r5 + 19 * 16]
8859 pmaddubsw m3, m0, m6
8860 pmulhrsw m3, m7
8861 pmaddubsw m5, m2, m6
8862 pmulhrsw m5, m7
8863 packuswb m3, m5
8864 movu [r0 + 140 * 16], m3
8865 pmaddubsw m3, m1, m6
8866 pmulhrsw m3, m7
8867 pmaddubsw m5, m4, m6
8868 pmulhrsw m5, m7
8869 packuswb m3, m5
8870 movu [r0 + 141 * 16], m3
8871
8872 ; mode 5 [row 7]
8873 movu m6, [r5 + 8 * 16]
8874 pmaddubsw m3, m0, m6
8875 pmulhrsw m3, m7
8876 pmaddubsw m5, m2, m6
8877 pmulhrsw m5, m7
8878 packuswb m3, m5
8879 movu [r0 + 206 * 16], m3
8880 pmaddubsw m3, m1, m6
8881 pmulhrsw m3, m7
8882 pmaddubsw m5, m4, m6
8883 pmulhrsw m5, m7
8884 packuswb m3, m5
8885 movu [r0 + 207 * 16], m3
8886
8887 ; mode 5 [row 8]
8888 movu m6, [r5 + 25 * 16]
8889 pmaddubsw m3, m0, m6
8890 pmulhrsw m3, m7
8891 pmaddubsw m5, m2, m6
8892 pmulhrsw m5, m7
8893 packuswb m3, m5
8894 movu [r0 + 208 * 16], m3
8895
8896 ; mode 7 [row 16 - first half]
8897 movu [r0 + 352 * 16], m3
8898
8899 pmaddubsw m3, m1, m6
8900 pmulhrsw m3, m7
8901 pmaddubsw m5, m4, m6
8902 pmulhrsw m5, m7
8903 packuswb m3, m5
8904 movu [r0 + 209 * 16], m3
8905
8906 ; mode 7 [row 16 - second half]
8907 movu [r0 + 353 * 16], m3
8908
8909 ; mode 6 [row 10]
8910 movu m6, [r5 + 15 * 16]
8911 pmaddubsw m3, m0, m6
8912 pmulhrsw m3, m7
8913 pmaddubsw m5, m2, m6
8914 pmulhrsw m5, m7
8915 packuswb m3, m5
8916 movu [r0 + 276 * 16], m3
8917 pmaddubsw m3, m1, m6
8918 pmulhrsw m3, m7
8919 pmaddubsw m5, m4, m6
8920 pmulhrsw m5, m7
8921 packuswb m3, m5
8922 movu [r0 + 277 * 16], m3
8923
8924 ; mode 7 [row 14]
8925 movu m6, [r5 + 7 * 16]
8926 pmaddubsw m3, m0, m6
8927 pmulhrsw m3, m7
8928 pmaddubsw m5, m2, m6
8929 pmulhrsw m5, m7
8930 packuswb m3, m5
8931 movu [r0 + 348 * 16], m3
8932
8933 ; mode 8 [row 26 - first half]
8934 movu [r0 + 436 * 16], m3
8935
8936 pmaddubsw m3, m1, m6
8937 pmulhrsw m3, m7
8938 pmaddubsw m5, m4, m6
8939 pmulhrsw m5, m7
8940 packuswb m3, m5
8941 movu [r0 + 349 * 16], m3
8942
8943 ; mode 8 [row 26 - second half]
8944 movu [r0 + 437 * 16], m3
8945
8946 ; mode 7 [row 15]
8947 movu m6, [r5 + 16 * 16]
8948 pmaddubsw m3, m0, m6
8949 pmulhrsw m3, m7
8950 pmaddubsw m5, m2, m6
8951 pmulhrsw m5, m7
8952 packuswb m3, m5
8953 movu [r0 + 350 * 16], m3
8954 pmaddubsw m3, m1, m6
8955 pmulhrsw m3, m7
8956 pmaddubsw m5, m4, m6
8957 pmulhrsw m5, m7
8958 packuswb m3, m5
8959 movu [r0 + 351 * 16], m3
8960
8961 ; mode 8 [row 27]
8962 movu m6, [r5 + 12 * 16]
8963 pmaddubsw m3, m0, m6
8964 pmulhrsw m3, m7
8965 pmaddubsw m5, m2, m6
8966 pmulhrsw m5, m7
8967 packuswb m3, m5
8968 movu [r0 + 438 * 16], m3
8969 pmaddubsw m3, m1, m6
8970 pmulhrsw m3, m7
8971 pmaddubsw m5, m4, m6
8972 pmulhrsw m5, m7
8973 packuswb m3, m5
8974 movu [r0 + 439 * 16], m3
8975
8976 ; mode 8 [row 28]
8977 movu m6, [r5 + 17 * 16]
8978 pmaddubsw m3, m0, m6
8979 pmulhrsw m3, m7
8980 pmaddubsw m5, m2, m6
8981 pmulhrsw m5, m7
8982 packuswb m3, m5
8983 movu [r0 + 440 * 16], m3
8984 pmaddubsw m3, m1, m6
8985 pmulhrsw m3, m7
8986 pmaddubsw m5, m4, m6
8987 pmulhrsw m5, m7
8988 packuswb m3, m5
8989 movu [r0 + 441 * 16], m3
8990
8991 ; mode 8 [row 29]
8992 movu m6, [r5 + 22 * 16]
8993 pmaddubsw m3, m0, m6
8994 pmulhrsw m3, m7
8995 pmaddubsw m5, m2, m6
8996 pmulhrsw m5, m7
8997 packuswb m3, m5
8998 movu [r0 + 442 * 16], m3
8999 pmaddubsw m3, m1, m6
9000 pmulhrsw m3, m7
9001 pmaddubsw m5, m4, m6
9002 pmulhrsw m5, m7
9003 packuswb m3, m5
9004 movu [r0 + 443 * 16], m3
9005
9006 ; mode 8 [row 30]
9007 movu m6, [r5 + 27 * 16]
9008 pmaddubsw m3, m0, m6
9009 pmulhrsw m3, m7
9010 pmaddubsw m5, m2, m6
9011 pmulhrsw m5, m7
9012 packuswb m3, m5
9013 movu [r0 + 444 * 16], m3
9014 pmaddubsw m3, m1, m6
9015 pmulhrsw m3, m7
9016 pmaddubsw m5, m4, m6
9017 pmulhrsw m5, m7
9018 packuswb m3, m5
9019 movu [r0 + 445 * 16], m3
9020
9021 ; mode 3 [row 6]
9022 movu m6, [r5 + 22 * 16]
9023 movu m0, [r4 + 6]
9024 movd m1, [r4 + 7]
9025 palignr m1, m0, 1
9026 punpcklbw m0, m1
9027 pmaddubsw m1, m0, m6
9028 pmulhrsw m1, m7
9029 movu m2, [r4 + 14]
9030 movd m3, [r4 + 15]
9031 palignr m3, m2, 1
9032 punpcklbw m2, m3
9033 pmaddubsw m3, m2, m6
9034 pmulhrsw m3, m7
9035 packuswb m1, m3
9036 movu [r0 + 76 * 16], m1
9037
9038 ; mode 6 [row 13 - first half]
9039 movu [r0 + 282 * 16], m1
9040
9041 movu m1, [r4 + 22]
9042 movd m3, [r4 + 23]
9043 palignr m3, m1, 1
9044 punpcklbw m1, m3
9045 pmaddubsw m3, m1, m6
9046 pmulhrsw m3, m7
9047 movu m4, [r4 + 30]
9048 movd m5, [r4 + 31]
9049 palignr m5, m4, 1
9050 punpcklbw m4, m5
9051 pmaddubsw m5, m4, m6
9052 pmulhrsw m5, m7
9053 packuswb m3, m5
9054 movu [r0 + 77 * 16], m3
9055
9056 ; mode 6 [row 13 - second half]
9057 movu [r0 + 283 * 16], m3
9058
9059 ; mode 4 [row 7]
9060 movu m6, [r5 + 8 * 16]
9061 pmaddubsw m3, m0, m6
9062 pmulhrsw m3, m7
9063 pmaddubsw m5, m2, m6
9064 pmulhrsw m5, m7
9065 packuswb m3, m5
9066 movu [r0 + 142 * 16], m3
9067 pmaddubsw m3, m1, m6
9068 pmulhrsw m3, m7
9069 pmaddubsw m5, m4, m6
9070 pmulhrsw m5, m7
9071 packuswb m3, m5
9072 movu [r0 + 143 * 16], m3
9073
9074 ; mode 4 [row 8]
9075 movu m6, [r5 + 29 * 16]
9076 pmaddubsw m3, m0, m6
9077 pmulhrsw m3, m7
9078 pmaddubsw m5, m2, m6
9079 pmulhrsw m5, m7
9080 packuswb m3, m5
9081 movu [r0 + 144 * 16], m3
9082
9083 ; mode 7 [row 20 - first half]
9084 movu [r0 + 360 * 16], m3
9085
9086 pmaddubsw m3, m1, m6
9087 pmulhrsw m3, m7
9088 pmaddubsw m5, m4, m6
9089 pmulhrsw m5, m7
9090 packuswb m3, m5
9091 movu [r0 + 145 * 16], m3
9092
9093 ; mode 7 [row 20 - second half]
9094 movu [r0 + 361 * 16], m3
9095
9096 ; mode 5 [row 9]
9097 movu m6, [r5 + 10 * 16]
9098 pmaddubsw m3, m0, m6
9099 pmulhrsw m3, m7
9100 pmaddubsw m5, m2, m6
9101 pmulhrsw m5, m7
9102 packuswb m3, m5
9103 movu [r0 + 210 * 16], m3
9104 pmaddubsw m3, m1, m6
9105 pmulhrsw m3, m7
9106 pmaddubsw m5, m4, m6
9107 pmulhrsw m5, m7
9108 packuswb m3, m5
9109 movu [r0 + 211 * 16], m3
9110
9111 ; mode 5 [row 10]
9112 movu m6, [r5 + 27 * 16]
9113 pmaddubsw m3, m0, m6
9114 pmulhrsw m3, m7
9115 pmaddubsw m5, m2, m6
9116 pmulhrsw m5, m7
9117 packuswb m3, m5
9118 movu [r0 + 212 * 16], m3
9119 pmaddubsw m3, m1, m6
9120 pmulhrsw m3, m7
9121 pmaddubsw m5, m4, m6
9122 pmulhrsw m5, m7
9123 packuswb m3, m5
9124 movu [r0 + 213 * 16], m3
9125
9126 ; mode 7 [row 17]
9127 movu m6, [r5 + 2 * 16]
9128 pmaddubsw m3, m0, m6
9129 pmulhrsw m3, m7
9130 pmaddubsw m5, m2, m6
9131 pmulhrsw m5, m7
9132 packuswb m3, m5
9133 movu [r0 + 354 * 16], m3
9134 pmaddubsw m3, m1, m6
9135 pmulhrsw m3, m7
9136 pmaddubsw m5, m4, m6
9137 pmulhrsw m5, m7
9138 packuswb m3, m5
9139 movu [r0 + 355 * 16], m3
9140
9141 ; mode 7 [row 18]
9142 movu m6, [r5 + 11 * 16]
9143 pmaddubsw m3, m0, m6
9144 pmulhrsw m3, m7
9145 pmaddubsw m5, m2, m6
9146 pmulhrsw m5, m7
9147 packuswb m3, m5
9148 movu [r0 + 356 * 16], m3
9149 pmaddubsw m3, m1, m6
9150 pmulhrsw m3, m7
9151 pmaddubsw m5, m4, m6
9152 pmulhrsw m5, m7
9153 packuswb m3, m5
9154 movu [r0 + 357 * 16], m3
9155
9156 ; mode 7 [row 19]
9157 movu m6, [r5 + 20 * 16]
9158 pmaddubsw m3, m0, m6
9159 pmulhrsw m3, m7
9160 pmaddubsw m5, m2, m6
9161 pmulhrsw m5, m7
9162 packuswb m3, m5
9163 movu [r0 + 358 * 16], m3
9164 pmaddubsw m3, m1, m6
9165 pmulhrsw m3, m7
9166 pmaddubsw m5, m4, m6
9167 pmulhrsw m5, m7
9168 packuswb m3, m5
9169 movu [r0 + 359 * 16], m3
9170
9171 ; mode 6 [row 12]
9172 movu m6, [r5 + 9 * 16]
9173 pmaddubsw m3, m0, m6
9174 pmulhrsw m3, m7
9175 pmaddubsw m5, m2, m6
9176 pmulhrsw m5, m7
9177 packuswb m3, m5
9178 movu [r0 + 280 * 16], m3
9179 pmaddubsw m3, m1, m6
9180 pmulhrsw m3, m7
9181 pmaddubsw m5, m4, m6
9182 pmulhrsw m5, m7
9183 packuswb m3, m5
9184 movu [r0 + 281 * 16], m3
9185
9186 ; mode 3 [row 7]
9187 movu m6, [r5 + 16 * 16]
9188 movu m0, [r4 + 7]
9189 movd m1, [r4 + 8]
9190 palignr m1, m0, 1
9191 punpcklbw m0, m1
9192 pmaddubsw m1, m0, m6
9193 pmulhrsw m1, m7
9194 movu m2, [r4 + 15]
9195 movd m3, [r4 + 16]
9196 palignr m3, m2, 1
9197 punpcklbw m2, m3
9198 pmaddubsw m3, m2, m6
9199 pmulhrsw m3, m7
9200 packuswb m1, m3
9201 movu [r0 + 78 * 16], m1
9202
9203 ; mode 6 [row 15 - first half]
9204 movu [r0 + 286 * 16], m1
9205
9206 movu m1, [r4 + 23]
9207 movd m3, [r4 + 24]
9208 palignr m3, m1, 1
9209 punpcklbw m1, m3
9210 pmaddubsw m3, m1, m6
9211 pmulhrsw m3, m7
9212 movu m4, [r4 + 31]
9213 movd m5, [r4 + 32]
9214 palignr m5, m4, 1
9215 punpcklbw m4, m5
9216 pmaddubsw m5, m4, m6
9217 pmulhrsw m5, m7
9218 packuswb m3, m5
9219 movu [r0 + 79 * 16], m3
9220
9221 ; mode 6 [row 15 - second half]
9222 movu [r0 + 287 * 16], m3
9223
9224 ; mode 4 [row 9]
9225 movu m6, [r5 + 18 * 16]
9226 pmaddubsw m3, m0, m6
9227 pmulhrsw m3, m7
9228 pmaddubsw m5, m2, m6
9229 pmulhrsw m5, m7
9230 packuswb m3, m5
9231 movu [r0 + 146 * 16], m3
9232 pmaddubsw m3, m1, m6
9233 pmulhrsw m3, m7
9234 pmaddubsw m5, m4, m6
9235 pmulhrsw m5, m7
9236 packuswb m3, m5
9237 movu [r0 + 147 * 16], m3
9238
9239 ; mode 5 [row 11]
9240 movu m6, [r5 + 12 * 16]
9241 pmaddubsw m3, m0, m6
9242 pmulhrsw m3, m7
9243 pmaddubsw m5, m2, m6
9244 pmulhrsw m5, m7
9245 packuswb m3, m5
9246 movu [r0 + 214 * 16], m3
9247 pmaddubsw m3, m1, m6
9248 pmulhrsw m3, m7
9249 pmaddubsw m5, m4, m6
9250 pmulhrsw m5, m7
9251 packuswb m3, m5
9252 movu [r0 + 215 * 16], m3
9253
9254 ; mode 5 [row 12]
9255 movu m6, [r5 + 29 * 16]
9256 pmaddubsw m3, m0, m6
9257 pmulhrsw m3, m7
9258 pmaddubsw m5, m2, m6
9259 pmulhrsw m5, m7
9260 packuswb m3, m5
9261 movu [r0 + 216 * 16], m3
9262
9263 ; mode 6 [row 16 - first half]
9264 movu [r0 + 288 * 16], m3
9265
9266 pmaddubsw m3, m1, m6
9267 pmulhrsw m3, m7
9268 pmaddubsw m5, m4, m6
9269 pmulhrsw m5, m7
9270 packuswb m3, m5
9271 movu [r0 + 217 * 16], m3
9272
9273 ; mode 6 [row 16 - second half]
9274 movu [r0 + 289 * 16], m3
9275
9276 ; mode 6 [row 14]
9277 movu m6, [r5 + 3 * 16]
9278 pmaddubsw m3, m0, m6
9279 pmulhrsw m3, m7
9280 pmaddubsw m5, m2, m6
9281 pmulhrsw m5, m7
9282 packuswb m3, m5
9283 movu [r0 + 284 * 16], m3
9284 pmaddubsw m3, m1, m6
9285 pmulhrsw m3, m7
9286 pmaddubsw m5, m4, m6
9287 pmulhrsw m5, m7
9288 packuswb m3, m5
9289 movu [r0 + 285 * 16], m3
9290
9291 ; mode 7 [row 21]
9292 movu m6, [r5 + 6 * 16]
9293 pmaddubsw m3, m0, m6
9294 pmulhrsw m3, m7
9295 pmaddubsw m5, m2, m6
9296 pmulhrsw m5, m7
9297 packuswb m3, m5
9298 movu [r0 + 362 * 16], m3
9299 pmaddubsw m3, m1, m6
9300 pmulhrsw m3, m7
9301 pmaddubsw m5, m4, m6
9302 pmulhrsw m5, m7
9303 packuswb m3, m5
9304 movu [r0 + 363 * 16], m3
9305
9306 ; mode 7 [row 22]
9307 movu m6, [r5 + 15 * 16]
9308 pmaddubsw m3, m0, m6
9309 pmulhrsw m3, m7
9310 pmaddubsw m5, m2, m6
9311 pmulhrsw m5, m7
9312 packuswb m3, m5
9313 movu [r0 + 364 * 16], m3
9314 pmaddubsw m3, m1, m6
9315 pmulhrsw m3, m7
9316 pmaddubsw m5, m4, m6
9317 pmulhrsw m5, m7
9318 packuswb m3, m5
9319 movu [r0 + 365 * 16], m3
9320
9321 ; mode 7 [row 23]
9322 movu m6, [r5 + 24 * 16]
9323 pmaddubsw m3, m0, m6
9324 pmulhrsw m3, m7
9325 pmaddubsw m5, m2, m6
9326 pmulhrsw m5, m7
9327 packuswb m3, m5
9328 movu [r0 + 366 * 16], m3
9329 pmaddubsw m3, m1, m6
9330 pmulhrsw m3, m7
9331 pmaddubsw m5, m4, m6
9332 pmulhrsw m5, m7
9333 packuswb m3, m5
9334 movu [r0 + 367 * 16], m3
9335
9336 ; mode 3 [row 8]
9337 movu m6, [r5 + 10 * 16]
9338 movu m0, [r4 + 8]
9339 movd m1, [r4 + 9]
9340 palignr m1, m0, 1
9341 punpcklbw m0, m1
9342 pmaddubsw m1, m0, m6
9343 pmulhrsw m1, m7
9344 movu m2, [r4 + 16]
9345 movd m3, [r4 + 17]
9346 palignr m3, m2, 1
9347 punpcklbw m2, m3
9348 pmaddubsw m3, m2, m6
9349 pmulhrsw m3, m7
9350 packuswb m1, m3
9351 movu [r0 + 80 * 16], m1
9352
9353 ; mode 6 [row 17 - first half]
9354 movu [r0 + 290 * 16], m1
9355
9356 ; mode 7 [row 25 - first half]
9357 movu [r0 + 370 * 16], m1
9358
9359 movu m1, [r4 + 24]
9360 movd m3, [r4 + 25]
9361 palignr m3, m1, 1
9362 punpcklbw m1, m3
9363 pmaddubsw m3, m1, m6
9364 pmulhrsw m3, m7
9365 movu m4, [r4 + 32]
9366 movd m5, [r4 + 33]
9367 palignr m5, m4, 1
9368 punpcklbw m4, m5
9369 pmaddubsw m5, m4, m6
9370 pmulhrsw m5, m7
9371 packuswb m3, m5
9372 movu [r0 + 81 * 16], m3
9373
9374 ; mode 6 [row 17 - second half]
9375 movu [r0 + 291 * 16], m3
9376
9377 ; mode 7 [row 25 - second half]
9378 movu [r0 + 371 * 16], m3
9379
9380 ; mode 4 [row 10]
9381 movu m6, [r5 + 7 * 16]
9382 pmaddubsw m3, m0, m6
9383 pmulhrsw m3, m7
9384 pmaddubsw m5, m2, m6
9385 pmulhrsw m5, m7
9386 packuswb m3, m5
9387 movu [r0 + 148 * 16], m3
9388 pmaddubsw m3, m1, m6
9389 pmulhrsw m3, m7
9390 pmaddubsw m5, m4, m6
9391 pmulhrsw m5, m7
9392 packuswb m3, m5
9393 movu [r0 + 149 * 16], m3
9394
9395 ; mode 4 [row 11]
9396 movu m6, [r5 + 28 * 16]
9397 pmaddubsw m3, m0, m6
9398 pmulhrsw m3, m7
9399 pmaddubsw m5, m2, m6
9400 pmulhrsw m5, m7
9401 packuswb m3, m5
9402 movu [r0 + 150 * 16], m3
9403
9404 ; mode 7 [row 27 - first half]
9405 movu [r0 + 374 * 16], m3
9406
9407 pmaddubsw m3, m1, m6
9408 pmulhrsw m3, m7
9409 pmaddubsw m5, m4, m6
9410 pmulhrsw m5, m7
9411 packuswb m3, m5
9412 movu [r0 + 151 * 16], m3
9413
9414 ; mode 7 [row 27 - second half]
9415 movu [r0 + 375 * 16], m3
9416
9417 ; mode 5 [row 13]
9418 movu m6, [r5 + 14 * 16]
9419 pmaddubsw m3, m0, m6
9420 pmulhrsw m3, m7
9421 pmaddubsw m5, m2, m6
9422 pmulhrsw m5, m7
9423 packuswb m3, m5
9424 movu [r0 + 218 * 16], m3
9425 pmaddubsw m3, m1, m6
9426 pmulhrsw m3, m7
9427 pmaddubsw m5, m4, m6
9428 pmulhrsw m5, m7
9429 packuswb m3, m5
9430 movu [r0 + 219 * 16], m3
9431
9432 ; mode 5 [row 14]
9433 movu m6, [r5 + 31 * 16]
9434 pmaddubsw m3, m0, m6
9435 pmulhrsw m3, m7
9436 pmaddubsw m5, m2, m6
9437 pmulhrsw m5, m7
9438 packuswb m3, m5
9439 movu [r0 + 220 * 16], m3
9440 pmaddubsw m3, m1, m6
9441 pmulhrsw m3, m7
9442 pmaddubsw m5, m4, m6
9443 pmulhrsw m5, m7
9444 packuswb m3, m5
9445 movu [r0 + 221 * 16], m3
9446
9447 ; mode 6 [row 18]
9448 movu m6, [r5 + 23 * 16]
9449 pmaddubsw m3, m0, m6
9450 pmulhrsw m3, m7
9451 pmaddubsw m5, m2, m6
9452 pmulhrsw m5, m7
9453 packuswb m3, m5
9454 movu [r0 + 292 * 16], m3
9455 pmaddubsw m3, m1, m6
9456 pmulhrsw m3, m7
9457 pmaddubsw m5, m4, m6
9458 pmulhrsw m5, m7
9459 packuswb m3, m5
9460 movu [r0 + 293 * 16], m3
9461
9462 ; mode 7 [row 24]
9463 movu m6, [r5 + 1 * 16]
9464 pmaddubsw m3, m0, m6
9465 pmulhrsw m3, m7
9466 pmaddubsw m5, m2, m6
9467 pmulhrsw m5, m7
9468 packuswb m3, m5
9469 movu [r0 + 368 * 16], m3
9470 pmaddubsw m3, m1, m6
9471 pmulhrsw m3, m7
9472 pmaddubsw m5, m4, m6
9473 pmulhrsw m5, m7
9474 packuswb m3, m5
9475 movu [r0 + 369 * 16], m3
9476
9477 ; mode 7 [row 26]
9478 movu m6, [r5 + 19 * 16]
9479 pmaddubsw m3, m0, m6
9480 pmulhrsw m3, m7
9481 pmaddubsw m5, m2, m6
9482 pmulhrsw m5, m7
9483 packuswb m3, m5
9484 movu [r0 + 372 * 16], m3
9485 pmaddubsw m3, m1, m6
9486 pmulhrsw m3, m7
9487 pmaddubsw m5, m4, m6
9488 pmulhrsw m5, m7
9489 packuswb m3, m5
9490 movu [r0 + 373 * 16], m3
9491
9492 ; mode 3 [row 9]
9493 movu m6, [r5 + 4 * 16]
9494 movu m0, [r4 + 9]
9495 movd m1, [r4 + 10]
9496 palignr m1, m0, 1
9497 punpcklbw m0, m1
9498 pmaddubsw m1, m0, m6
9499 pmulhrsw m1, m7
9500 movu m2, [r4 + 17]
9501 movd m3, [r4 + 18]
9502 palignr m3, m2, 1
9503 punpcklbw m2, m3
9504 pmaddubsw m3, m2, m6
9505 pmulhrsw m3, m7
9506 packuswb m1, m3
9507 movu [r0 + 82 * 16], m1
9508
9509 ; mode 6 [row 19 - first half]
9510 movu [r0 + 294 * 16], m1
9511
9512 movu m1, [r4 + 25]
9513 movd m3, [r4 + 26]
9514 palignr m3, m1, 1
9515 punpcklbw m1, m3
9516 pmaddubsw m3, m1, m6
9517 pmulhrsw m3, m7
9518 movu m4, [r4 + 33]
9519 movd m5, [r4 + 34]
9520 palignr m5, m4, 1
9521 punpcklbw m4, m5
9522 pmaddubsw m5, m4, m6
9523 pmulhrsw m5, m7
9524 packuswb m3, m5
9525 movu [r0 + 83 * 16], m3
9526
9527 ; mode 6 [row 19 - second half]
9528 movu [r0 + 295 * 16], m3
9529
9530 ; mode 4 [row 12]
9531 movu m6, [r5 + 17 * 16]
9532 pmaddubsw m3, m0, m6
9533 pmulhrsw m3, m7
9534 pmaddubsw m5, m2, m6
9535 pmulhrsw m5, m7
9536 packuswb m3, m5
9537 movu [r0 + 152 * 16], m3
9538
9539 ; mode 6 [row 20 - first half] (reuses the mode 4 row 12 first-half result still in m3)
9540 movu [r0 + 296 * 16], m3
9541
9542 pmaddubsw m3, m1, m6
9543 pmulhrsw m3, m7
9544 pmaddubsw m5, m4, m6
9545 pmulhrsw m5, m7
9546 packuswb m3, m5
9547 movu [r0 + 153 * 16], m3
9548
9549 ; mode 6 [row 20 - second half]
9550 movu [r0 + 297 * 16], m3
9551
9552 ; mode 3 [row 10]
9553 movu m6, [r5 + 30 * 16]
9554 pmaddubsw m3, m0, m6
9555 pmulhrsw m3, m7
9556 pmaddubsw m5, m2, m6
9557 pmulhrsw m5, m7
9558 packuswb m3, m5
9559 movu [r0 + 84 * 16], m3
9560
9561 ; mode 6 [row 21 - first half]
9562 movu [r0 + 298 * 16], m3
9563
9564 pmaddubsw m3, m1, m6
9565 pmulhrsw m3, m7
9566 pmaddubsw m5, m4, m6
9567 pmulhrsw m5, m7
9568 packuswb m3, m5
9569 movu [r0 + 85 * 16], m3
9570
9571 ; mode 6 [row 21 - second half]
9572 movu [r0 + 299 * 16], m3
9573
9574 ; mode 5 [row 15]
9575 movu m6, [r5 + 16 * 16]
9576 pmaddubsw m3, m0, m6
9577 pmulhrsw m3, m7
9578 pmaddubsw m5, m2, m6
9579 pmulhrsw m5, m7
9580 packuswb m3, m5
9581 movu [r0 + 222 * 16], m3
9582 pmaddubsw m3, m1, m6
9583 pmulhrsw m3, m7
9584 pmaddubsw m5, m4, m6
9585 pmulhrsw m5, m7
9586 packuswb m3, m5
9587 movu [r0 + 223 * 16], m3
9588
9589 ; mode 7 [row 28]
9590 movu m6, [r5 + 5 * 16]
9591 pmaddubsw m3, m0, m6
9592 pmulhrsw m3, m7
9593 pmaddubsw m5, m2, m6
9594 pmulhrsw m5, m7
9595 packuswb m3, m5
9596 movu [r0 + 376 * 16], m3
9597 pmaddubsw m3, m1, m6
9598 pmulhrsw m3, m7
9599 pmaddubsw m5, m4, m6
9600 pmulhrsw m5, m7
9601 packuswb m3, m5
9602 movu [r0 + 377 * 16], m3
9603
9604 ; mode 7 [row 29]
9605 movu m6, [r5 + 14 * 16]
9606 pmaddubsw m3, m0, m6
9607 pmulhrsw m3, m7
9608 pmaddubsw m5, m2, m6
9609 pmulhrsw m5, m7
9610 packuswb m3, m5
9611 movu [r0 + 378 * 16], m3
9612 pmaddubsw m3, m1, m6
9613 pmulhrsw m3, m7
9614 pmaddubsw m5, m4, m6
9615 pmulhrsw m5, m7
9616 packuswb m3, m5
9617 movu [r0 + 379 * 16], m3
9618
9619 ; mode 7 [row 30]
9620 movu m6, [r5 + 23 * 16]
9621 pmaddubsw m3, m0, m6
9622 pmulhrsw m3, m7
9623 pmaddubsw m5, m2, m6
9624 pmulhrsw m5, m7
9625 packuswb m3, m5
9626 movu [r0 + 380 * 16], m3
9627 pmaddubsw m3, m1, m6
9628 pmulhrsw m3, m7
9629 pmaddubsw m5, m4, m6
9630 pmulhrsw m5, m7
9631 packuswb m3, m5
9632 movu [r0 + 381 * 16], m3
9633
9634 ; mode 3 [row 11]
9635 movu m6, [r5 + 24 * 16]
9636 movu m0, [r4 + 10]
9637 movd m1, [r4 + 11]
9638 palignr m1, m0, 1
9639 punpcklbw m0, m1
9640 pmaddubsw m1, m0, m6
9641 pmulhrsw m1, m7
9642 movu m2, [r4 + 18]
9643 movd m3, [r4 + 19]
9644 palignr m3, m2, 1
9645 punpcklbw m2, m3
9646 pmaddubsw m3, m2, m6
9647 pmulhrsw m3, m7
9648 packuswb m1, m3
9649 movu [r0 + 86 * 16], m1
9650
9651 ; mode 6 [row 23 - first half]
9652 movu [r0 + 302 * 16], m1
9653
9654 movu m1, [r4 + 26]
9655 movd m3, [r4 + 27]
9656 palignr m3, m1, 1
9657 punpcklbw m1, m3
9658 pmaddubsw m3, m1, m6
9659 pmulhrsw m3, m7
9660 movu m4, [r4 + 34]
9661 movd m5, [r4 + 35]
9662 palignr m5, m4, 1
9663 punpcklbw m4, m5
9664 pmaddubsw m5, m4, m6
9665 pmulhrsw m5, m7
9666 packuswb m3, m5
9667 movu [r0 + 87 * 16], m3
9668
9669 ; mode 6 [row 23 - second half]
9670 movu [r0 + 303 * 16], m3
9671
9672 ; mode 4 [row 13]
9673 movu m6, [r5 + 6 * 16]
9674 pmaddubsw m3, m0, m6
9675 pmulhrsw m3, m7
9676 pmaddubsw m5, m2, m6
9677 pmulhrsw m5, m7
9678 packuswb m3, m5
9679 movu [r0 + 154 * 16], m3
9680 pmaddubsw m3, m1, m6
9681 pmulhrsw m3, m7
9682 pmaddubsw m5, m4, m6
9683 pmulhrsw m5, m7
9684 packuswb m3, m5
9685 movu [r0 + 155 * 16], m3
9686
9687 ; mode 4 [row 14]
9688 movu m6, [r5 + 27 * 16]
9689 pmaddubsw m3, m0, m6
9690 pmulhrsw m3, m7
9691 pmaddubsw m5, m2, m6
9692 pmulhrsw m5, m7
9693 packuswb m3, m5
9694 movu [r0 + 156 * 16], m3
9695 pmaddubsw m3, m1, m6
9696 pmulhrsw m3, m7
9697 pmaddubsw m5, m4, m6
9698 pmulhrsw m5, m7
9699 packuswb m3, m5
9700 movu [r0 + 157 * 16], m3
9701
9702 ; mode 5 [row 16]
9703 movu m6, [r5 + 1 * 16]
9704 pmaddubsw m3, m0, m6
9705 pmulhrsw m3, m7
9706 pmaddubsw m5, m2, m6
9707 pmulhrsw m5, m7
9708 packuswb m3, m5
9709 movu [r0 + 224 * 16], m3
9710 pmaddubsw m3, m1, m6
9711 pmulhrsw m3, m7
9712 pmaddubsw m5, m4, m6
9713 pmulhrsw m5, m7
9714 packuswb m3, m5
9715 movu [r0 + 225 * 16], m3
9716
9717 ; mode 5 [row 17]
9718 movu m6, [r5 + 18 * 16]
9719 pmaddubsw m3, m0, m6
9720 pmulhrsw m3, m7
9721 pmaddubsw m5, m2, m6
9722 pmulhrsw m5, m7
9723 packuswb m3, m5
9724 movu [r0 + 226 * 16], m3
9725 pmaddubsw m3, m1, m6
9726 pmulhrsw m3, m7
9727 pmaddubsw m5, m4, m6
9728 pmulhrsw m5, m7
9729 packuswb m3, m5
9730 movu [r0 + 227 * 16], m3
9731
9732 ; mode 6 [row 22]
9733 movu m6, [r5 + 11 * 16]
9734 pmaddubsw m3, m0, m6
9735 pmulhrsw m3, m7
9736 pmaddubsw m5, m2, m6
9737 pmulhrsw m5, m7
9738 packuswb m3, m5
9739 movu [r0 + 300 * 16], m3
9740 pmaddubsw m3, m1, m6
9741 pmulhrsw m3, m7
9742 pmaddubsw m5, m4, m6
9743 pmulhrsw m5, m7
9744 packuswb m3, m5
9745 movu [r0 + 301 * 16], m3
9746
9747 ; mode 3 [row 12]
9748 movu m6, [r5 + 18 * 16]
9749 movu m0, [r4 + 11]
9750 movd m1, [r4 + 12]
9751 palignr m1, m0, 1
9752 punpcklbw m0, m1
9753 pmaddubsw m1, m0, m6
9754 pmulhrsw m1, m7
9755 movu m2, [r4 + 19]
9756 movd m3, [r4 + 20]
9757 palignr m3, m2, 1
9758 punpcklbw m2, m3
9759 pmaddubsw m3, m2, m6
9760 pmulhrsw m3, m7
9761 packuswb m1, m3
9762 movu [r0 + 88 * 16], m1
9763
9764 ; mode 6 [row 25 - first half]
9765 movu [r0 + 306 * 16], m1
9766
9767 movu m1, [r4 + 27]
9768 movd m3, [r4 + 28]
9769 palignr m3, m1, 1
9770 punpcklbw m1, m3
9771 pmaddubsw m3, m1, m6
9772 pmulhrsw m3, m7
9773 movu m4, [r4 + 35]
9774 movd m5, [r4 + 36]
9775 palignr m5, m4, 1
9776 punpcklbw m4, m5
9777 pmaddubsw m5, m4, m6
9778 pmulhrsw m5, m7
9779 packuswb m3, m5
9780 movu [r0 + 89 * 16], m3
9781
9782 ; mode 6 [row 25 - second half]
9783 movu [r0 + 307 * 16], m3
9784
9785 ; mode 4 [row 15]
9786 movu m6, [r5 + 16 * 16]
9787 pmaddubsw m3, m0, m6
9788 pmulhrsw m3, m7
9789 pmaddubsw m5, m2, m6
9790 pmulhrsw m5, m7
9791 packuswb m3, m5
9792 movu [r0 + 158 * 16], m3
9793 pmaddubsw m3, m1, m6
9794 pmulhrsw m3, m7
9795 pmaddubsw m5, m4, m6
9796 pmulhrsw m5, m7
9797 packuswb m3, m5
9798 movu [r0 + 159 * 16], m3
9799
9800 ; mode 5 [row 18]
9801 movu m6, [r5 + 3 * 16]
9802 pmaddubsw m3, m0, m6
9803 pmulhrsw m3, m7
9804 pmaddubsw m5, m2, m6
9805 pmulhrsw m5, m7
9806 packuswb m3, m5
9807 movu [r0 + 228 * 16], m3
9808 pmaddubsw m3, m1, m6
9809 pmulhrsw m3, m7
9810 pmaddubsw m5, m4, m6
9811 pmulhrsw m5, m7
9812 packuswb m3, m5
9813 movu [r0 + 229 * 16], m3
9814
9815 ; mode 5 [row 19]
9816 movu m6, [r5 + 20 * 16]
9817 pmaddubsw m3, m0, m6
9818 pmulhrsw m3, m7
9819 pmaddubsw m5, m2, m6
9820 pmulhrsw m5, m7
9821 packuswb m3, m5
9822 movu [r0 + 230 * 16], m3
9823 pmaddubsw m3, m1, m6
9824 pmulhrsw m3, m7
9825 pmaddubsw m5, m4, m6
9826 pmulhrsw m5, m7
9827 packuswb m3, m5
9828 movu [r0 + 231 * 16], m3
9829
9830 ; mode 6 [row 24]
9831 movu m6, [r5 + 5 * 16]
9832 pmaddubsw m3, m0, m6
9833 pmulhrsw m3, m7
9834 pmaddubsw m5, m2, m6
9835 pmulhrsw m5, m7
9836 packuswb m3, m5
9837 movu [r0 + 304 * 16], m3
9838 pmaddubsw m3, m1, m6
9839 pmulhrsw m3, m7
9840 pmaddubsw m5, m4, m6
9841 pmulhrsw m5, m7
9842 packuswb m3, m5
9843 movu [r0 + 305 * 16], m3
9844
9845 ; mode 6 [row 26]
9846 movu m6, [r5 + 31 * 16]
9847 pmaddubsw m3, m0, m6
9848 pmulhrsw m3, m7
9849 pmaddubsw m5, m2, m6
9850 pmulhrsw m5, m7
9851 packuswb m3, m5
9852 movu [r0 + 308 * 16], m3
9853 pmaddubsw m3, m1, m6
9854 pmulhrsw m3, m7
9855 pmaddubsw m5, m4, m6
9856 pmulhrsw m5, m7
9857 packuswb m3, m5
9858 movu [r0 + 309 * 16], m3
9859
9860 ; mode 3 [row 13]
9861 movu m6, [r5 + 12 * 16]
9862 movu m0, [r4 + 12]
9863 movd m1, [r4 + 13]
9864 palignr m1, m0, 1
9865 punpcklbw m0, m1
9866 pmaddubsw m1, m0, m6
9867 pmulhrsw m1, m7
9868 movu m2, [r4 + 20]
9869 movd m3, [r4 + 21]
9870 palignr m3, m2, 1
9871 punpcklbw m2, m3
9872 pmaddubsw m3, m2, m6
9873 pmulhrsw m3, m7
9874 packuswb m1, m3
9875 movu [r0 + 90 * 16], m1
9876
9877 movu m1, [r4 + 28]
9878 movd m3, [r4 + 29]
9879 palignr m3, m1, 1
9880 punpcklbw m1, m3
9881 pmaddubsw m3, m1, m6
9882 pmulhrsw m3, m7
9883 movu m4, [r4 + 36]
9884 movd m5, [r4 + 37]
9885 palignr m5, m4, 1
9886 punpcklbw m4, m5
9887 pmaddubsw m5, m4, m6
9888 pmulhrsw m5, m7
9889 packuswb m3, m5
9890 movu [r0 + 91 * 16], m3
9891
9892 ; mode 4 [row 16]
9893 movu m6, [r5 + 5 * 16]
9894 pmaddubsw m3, m0, m6
9895 pmulhrsw m3, m7
9896 pmaddubsw m5, m2, m6
9897 pmulhrsw m5, m7
9898 packuswb m3, m5
9899 movu [r0 + 160 * 16], m3
9900
9901 ; mode 5 [row 20 - first half]
9902 movu [r0 + 232 * 16], m3
9903
9904 pmaddubsw m3, m1, m6
9905 pmulhrsw m3, m7
9906 pmaddubsw m5, m4, m6
9907 pmulhrsw m5, m7
9908 packuswb m3, m5
9909 movu [r0 + 161 * 16], m3
9910
9911 ; mode 5 [row 20 - second half]
9912 movu [r0 + 233 * 16], m3
9913
9914 ; mode 4 [row 17]
9915 movu m6, [r5 + 26 * 16]
9916 pmaddubsw m3, m0, m6
9917 pmulhrsw m3, m7
9918 pmaddubsw m5, m2, m6
9919 pmulhrsw m5, m7
9920 packuswb m3, m5
9921 movu [r0 + 162 * 16], m3
9922 pmaddubsw m3, m1, m6
9923 pmulhrsw m3, m7
9924 pmaddubsw m5, m4, m6
9925 pmulhrsw m5, m7
9926 packuswb m3, m5
9927 movu [r0 + 163 * 16], m3
9928
9929 ; mode 5 [row 21]
9930 movu m6, [r5 + 22 * 16]
9931 pmaddubsw m3, m0, m6
9932 pmulhrsw m3, m7
9933 pmaddubsw m5, m2, m6
9934 pmulhrsw m5, m7
9935 packuswb m3, m5
9936 movu [r0 + 234 * 16], m3
9937 pmaddubsw m3, m1, m6
9938 pmulhrsw m3, m7
9939 pmaddubsw m5, m4, m6
9940 pmulhrsw m5, m7
9941 packuswb m3, m5
9942 movu [r0 + 235 * 16], m3
9943
9944 ; mode 6 [row 27]
9945 movu m6, [r5 + 12 * 16]
9946 pmaddubsw m3, m0, m6
9947 pmulhrsw m3, m7
9948 pmaddubsw m5, m2, m6
9949 pmulhrsw m5, m7
9950 packuswb m3, m5
9951 movu [r0 + 310 * 16], m3
9952 pmaddubsw m3, m1, m6
9953 pmulhrsw m3, m7
9954 pmaddubsw m5, m4, m6
9955 pmulhrsw m5, m7
9956 packuswb m3, m5
9957 movu [r0 + 311 * 16], m3
9958
9959 ; mode 6 [row 28]
9960 movu m6, [r5 + 25 * 16]
9961 pmaddubsw m3, m0, m6
9962 pmulhrsw m3, m7
9963 pmaddubsw m5, m2, m6
9964 pmulhrsw m5, m7
9965 packuswb m3, m5
9966 movu [r0 + 312 * 16], m3
9967 pmaddubsw m3, m1, m6
9968 pmulhrsw m3, m7
9969 pmaddubsw m5, m4, m6
9970 pmulhrsw m5, m7
9971 packuswb m3, m5
9972 movu [r0 + 313 * 16], m3
9973
9974 ; mode 3 [row 14]
9975 movu m6, [r5 + 6 * 16]
9976 movu m0, [r4 + 13]
9977 movd m1, [r4 + 14]
9978 palignr m1, m0, 1
9979 punpcklbw m0, m1
9980 pmaddubsw m1, m0, m6
9981 pmulhrsw m1, m7
9982 movu m2, [r4 + 21]
9983 movd m3, [r4 + 22]
9984 palignr m3, m2, 1
9985 punpcklbw m2, m3
9986 pmaddubsw m3, m2, m6
9987 pmulhrsw m3, m7
9988 packuswb m1, m3
9989 movu [r0 + 92 * 16], m1
9990
9991 ; mode 6 [row 29 - first half]
9992 movu [r0 + 314 * 16], m1
9993
9994 movu m1, [r4 + 29]
9995 movd m3, [r4 + 30]
9996 palignr m3, m1, 1
9997 punpcklbw m1, m3
9998 pmaddubsw m3, m1, m6
9999 pmulhrsw m3, m7
10000 movu m4, [r4 + 37]
10001 movd m5, [r4 + 38]
10002 palignr m5, m4, 1
10003 punpcklbw m4, m5
10004 pmaddubsw m5, m4, m6
10005 pmulhrsw m5, m7
10006 packuswb m3, m5
10007 movu [r0 + 93 * 16], m3
10008
10009 ; mode 6 [row 29 - second half]
10010 movu [r0 + 315 * 16], m3
10011
10012 ; mode 4 [row 18]
10013 movu m6, [r5 + 15 * 16]
10014 pmaddubsw m3, m0, m6
10015 pmulhrsw m3, m7
10016 pmaddubsw m5, m2, m6
10017 pmulhrsw m5, m7
10018 packuswb m3, m5
10019 movu [r0 + 164 * 16], m3
10020 pmaddubsw m3, m1, m6
10021 pmulhrsw m3, m7
10022 pmaddubsw m5, m4, m6
10023 pmulhrsw m5, m7
10024 packuswb m3, m5
10025 movu [r0 + 165 * 16], m3
10026
10027 ; mode 5 [row 22]
10028 movu m6, [r5 + 7 * 16]
10029 pmaddubsw m3, m0, m6
10030 pmulhrsw m3, m7
10031 pmaddubsw m5, m2, m6
10032 pmulhrsw m5, m7
10033 packuswb m3, m5
10034 movu [r0 + 236 * 16], m3
10035 pmaddubsw m3, m1, m6
10036 pmulhrsw m3, m7
10037 pmaddubsw m5, m4, m6
10038 pmulhrsw m5, m7
10039 packuswb m3, m5
10040 movu [r0 + 237 * 16], m3
10041
10042 ; mode 5 [row 23]
10043 movu m6, [r5 + 24 * 16]
10044 pmaddubsw m3, m0, m6
10045 pmulhrsw m3, m7
10046 pmaddubsw m5, m2, m6
10047 pmulhrsw m5, m7
10048 packuswb m3, m5
10049 movu [r0 + 238 * 16], m3
10050 pmaddubsw m3, m1, m6
10051 pmulhrsw m3, m7
10052 pmaddubsw m5, m4, m6
10053 pmulhrsw m5, m7
10054 packuswb m3, m5
10055 movu [r0 + 239 * 16], m3
10056
10057 ; mode 6 [row 30]
10058 movu m6, [r5 + 19 * 16]
10059 pmaddubsw m3, m0, m6
10060 pmulhrsw m3, m7
10061 pmaddubsw m5, m2, m6
10062 pmulhrsw m5, m7
10063 packuswb m3, m5
10064 movu [r0 + 316 * 16], m3
10065 pmaddubsw m3, m1, m6
10066 pmulhrsw m3, m7
10067 pmaddubsw m5, m4, m6
10068 pmulhrsw m5, m7
10069 packuswb m3, m5
10070 movu [r0 + 317 * 16], m3
10071
10072 ; mode 3 [row 16]
10073 movu m6, [r5 + 26 * 16]
10074 movu m0, [r4 + 14]
10075 movd m1, [r4 + 15]
10076 palignr m1, m0, 1
10077 punpcklbw m0, m1
10078 pmaddubsw m1, m0, m6
10079 pmulhrsw m1, m7
10080 movu m2, [r4 + 22]
10081 movd m3, [r4 + 23]
10082 palignr m3, m2, 1
10083 punpcklbw m2, m3
10084 pmaddubsw m3, m2, m6
10085 pmulhrsw m3, m7
10086 packuswb m1, m3
10087 movu [r0 + 96 * 16], m1
10088
10089 ; mode 5 [row 25 - first half]
10090 movu [r0 + 242 * 16], m1
10091
10092 movu m1, [r4 + 30]
10093 movd m3, [r4 + 31]
10094 palignr m3, m1, 1
10095 punpcklbw m1, m3
10096 pmaddubsw m3, m1, m6
10097 pmulhrsw m3, m7
10098 movu m4, [r4 + 38]
10099 movd m5, [r4 + 39]
10100 palignr m5, m4, 1
10101 punpcklbw m4, m5
10102 pmaddubsw m5, m4, m6
10103 pmulhrsw m5, m7
10104 packuswb m3, m5
10105 movu [r0 + 97 * 16], m3
10106
10107 ; mode 5 [row 25 - second half]
10108 movu [r0 + 243 * 16], m3
10109
10110 ; mode 4 [row 19]
10111 movu m6, [r5 + 4 * 16]
10112 pmaddubsw m3, m0, m6
10113 pmulhrsw m3, m7
10114 pmaddubsw m5, m2, m6
10115 pmulhrsw m5, m7
10116 packuswb m3, m5
10117 movu [r0 + 166 * 16], m3
10118 pmaddubsw m3, m1, m6
10119 pmulhrsw m3, m7
10120 pmaddubsw m5, m4, m6
10121 pmulhrsw m5, m7
10122 packuswb m3, m5
10123 movu [r0 + 167 * 16], m3
10124
10125 ; mode 4 [row 20]
10126 movu m6, [r5 + 25 * 16]
10127 pmaddubsw m3, m0, m6
10128 pmulhrsw m3, m7
10129 pmaddubsw m5, m2, m6
10130 pmulhrsw m5, m7
10131 packuswb m3, m5
10132 movu [r0 + 168 * 16], m3
10133 pmaddubsw m3, m1, m6
10134 pmulhrsw m3, m7
10135 pmaddubsw m5, m4, m6
10136 pmulhrsw m5, m7
10137 packuswb m3, m5
10138 movu [r0 + 169 * 16], m3
10139
10140 ; mode 5 [row 24]
10141 movu m6, [r5 + 9 * 16]
10142 pmaddubsw m3, m0, m6
10143 pmulhrsw m3, m7
10144 pmaddubsw m5, m2, m6
10145 pmulhrsw m5, m7
10146 packuswb m3, m5
10147 movu [r0 + 240 * 16], m3
10148 pmaddubsw m3, m1, m6
10149 pmulhrsw m3, m7
10150 pmaddubsw m5, m4, m6
10151 pmulhrsw m5, m7
10152 packuswb m3, m5
10153 movu [r0 + 241 * 16], m3
10154
10155 ; mode 3 [row 17]
10156 movu m6, [r5 + 20 * 16]
10157 movu m0, [r4 + 15]
10158 movd m1, [r4 + 16]
10159 palignr m1, m0, 1
10160 punpcklbw m0, m1
10161 pmaddubsw m1, m0, m6
10162 pmulhrsw m1, m7
10163 movu m2, [r4 + 23]
10164 movd m3, [r4 + 24]
10165 palignr m3, m2, 1
10166 punpcklbw m2, m3
10167 pmaddubsw m3, m2, m6
10168 pmulhrsw m3, m7
10169 packuswb m1, m3
10170 movu [r0 + 98 * 16], m1
10171
10172 movu m1, [r4 + 31]
10173 movd m3, [r4 + 32]
10174 palignr m3, m1, 1
10175 punpcklbw m1, m3
10176 pmaddubsw m3, m1, m6
10177 pmulhrsw m3, m7
10178 movu m4, [r4 + 39]
10179 movd m5, [r4 + 40]
10180 palignr m5, m4, 1
10181 punpcklbw m4, m5
10182 pmaddubsw m5, m4, m6
10183 pmulhrsw m5, m7
10184 packuswb m3, m5
10185 movu [r0 + 99 * 16], m3
10186
10187 ; mode 4 [row 21]
10188 movu m6, [r5 + 14 * 16]
10189 pmaddubsw m3, m0, m6
10190 pmulhrsw m3, m7
10191 pmaddubsw m5, m2, m6
10192 pmulhrsw m5, m7
10193 packuswb m3, m5
10194 movu [r0 + 170 * 16], m3
10195 pmaddubsw m3, m1, m6
10196 pmulhrsw m3, m7
10197 pmaddubsw m5, m4, m6
10198 pmulhrsw m5, m7
10199 packuswb m3, m5
10200 movu [r0 + 171 * 16], m3
10201
10202 ; mode 5 [row 26]
10203 movu m6, [r5 + 11 * 16]
10204 pmaddubsw m3, m0, m6
10205 pmulhrsw m3, m7
10206 pmaddubsw m5, m2, m6
10207 pmulhrsw m5, m7
10208 packuswb m3, m5
10209 movu [r0 + 244 * 16], m3
10210 pmaddubsw m3, m1, m6
10211 pmulhrsw m3, m7
10212 pmaddubsw m5, m4, m6
10213 pmulhrsw m5, m7
10214 packuswb m3, m5
10215 movu [r0 + 245 * 16], m3
10216
10217 ; mode 5 [row 27]
10218 movu m6, [r5 + 28 * 16]
10219 pmaddubsw m3, m0, m6
10220 pmulhrsw m3, m7
10221 pmaddubsw m5, m2, m6
10222 pmulhrsw m5, m7
10223 packuswb m3, m5
10224 movu [r0 + 246 * 16], m3
10225 pmaddubsw m3, m1, m6
10226 pmulhrsw m3, m7
10227 pmaddubsw m5, m4, m6
10228 pmulhrsw m5, m7
10229 packuswb m3, m5
10230 movu [r0 + 247 * 16], m3
10231
10232 ; mode 3 [row 18]
10233 movu m6, [r5 + 14 * 16]
10234 movu m0, [r4 + 16]
10235 movd m1, [r4 + 17]
10236 palignr m1, m0, 1
10237 punpcklbw m0, m1
10238 pmaddubsw m1, m0, m6
10239 pmulhrsw m1, m7
10240 movu m2, [r4 + 24]
10241 movd m3, [r4 + 25]
10242 palignr m3, m2, 1
10243 punpcklbw m2, m3
10244 pmaddubsw m3, m2, m6
10245 pmulhrsw m3, m7
10246 packuswb m1, m3
10247 movu [r0 + 100 * 16], m1
10248
10249 movu m1, [r4 + 32]
10250 movd m3, [r4 + 33]
10251 palignr m3, m1, 1
10252 punpcklbw m1, m3
10253 pmaddubsw m3, m1, m6
10254 pmulhrsw m3, m7
10255 movu m4, [r4 + 40]
10256 movd m5, [r4 + 41]
10257 palignr m5, m4, 1
10258 punpcklbw m4, m5
10259 pmaddubsw m5, m4, m6
10260 pmulhrsw m5, m7
10261 packuswb m3, m5
10262 movu [r0 + 101 * 16], m3
10263
10264 ; mode 4 [row 22]
10265 movu m6, [r5 + 3 * 16]
10266 pmaddubsw m3, m0, m6
10267 pmulhrsw m3, m7
10268 pmaddubsw m5, m2, m6
10269 pmulhrsw m5, m7
10270 packuswb m3, m5
10271 movu [r0 + 172 * 16], m3
10272 pmaddubsw m3, m1, m6
10273 pmulhrsw m3, m7
10274 pmaddubsw m5, m4, m6
10275 pmulhrsw m5, m7
10276 packuswb m3, m5
10277 movu [r0 + 173 * 16], m3
10278
10279 ; mode 4 [row 23]
10280 movu m6, [r5 + 24 * 16]
10281 pmaddubsw m3, m0, m6
10282 pmulhrsw m3, m7
10283 pmaddubsw m5, m2, m6
10284 pmulhrsw m5, m7
10285 packuswb m3, m5
10286 movu [r0 + 174 * 16], m3
10287 pmaddubsw m3, m1, m6
10288 pmulhrsw m3, m7
10289 pmaddubsw m5, m4, m6
10290 pmulhrsw m5, m7
10291 packuswb m3, m5
10292 movu [r0 + 175 * 16], m3
10293
10294 ; mode 5 [row 28]
10295 movu m6, [r5 + 13 * 16]
10296 pmaddubsw m3, m0, m6
10297 pmulhrsw m3, m7
10298 pmaddubsw m5, m2, m6
10299 pmulhrsw m5, m7
10300 packuswb m3, m5
10301 movu [r0 + 248 * 16], m3
10302 pmaddubsw m3, m1, m6
10303 pmulhrsw m3, m7
10304 pmaddubsw m5, m4, m6
10305 pmulhrsw m5, m7
10306 packuswb m3, m5
10307 movu [r0 + 249 * 16], m3
10308
10309 ; mode 5 [row 29]
10310 movu m6, [r5 + 30 * 16]
10311 pmaddubsw m3, m0, m6
10312 pmulhrsw m3, m7
10313 pmaddubsw m5, m2, m6
10314 pmulhrsw m5, m7
10315 packuswb m3, m5
10316 movu [r0 + 250 * 16], m3
10317 pmaddubsw m3, m1, m6
10318 pmulhrsw m3, m7
10319 pmaddubsw m5, m4, m6
10320 pmulhrsw m5, m7
10321 packuswb m3, m5
10322 movu [r0 + 251 * 16], m3
10323
10324 ; mode 3 [row 19]
10325 movu m6, [r5 + 8 * 16]
10326 movu m0, [r4 + 17]
10327 movd m1, [r4 + 18]
10328 palignr m1, m0, 1
10329 punpcklbw m0, m1
10330 pmaddubsw m1, m0, m6
10331 pmulhrsw m1, m7
10332 movu m2, [r4 + 25]
10333 movd m3, [r4 + 26]
10334 palignr m3, m2, 1
10335 punpcklbw m2, m3
10336 pmaddubsw m3, m2, m6
10337 pmulhrsw m3, m7
10338 packuswb m1, m3
10339 movu [r0 + 102 * 16], m1
10340
10341 movu m1, [r4 + 33]
10342 movd m3, [r4 + 34]
10343 palignr m3, m1, 1
10344 punpcklbw m1, m3
10345 pmaddubsw m3, m1, m6
10346 pmulhrsw m3, m7
10347 movu m4, [r4 + 41]
10348 movd m5, [r4 + 42]
10349 palignr m5, m4, 1
10350 punpcklbw m4, m5
10351 pmaddubsw m5, m4, m6
10352 pmulhrsw m5, m7
10353 packuswb m3, m5
10354 movu [r0 + 103 * 16], m3
10355
10356 ; mode 4 [row 24]
10357 movu m6, [r5 + 13 * 16]
10358 pmaddubsw m3, m0, m6
10359 pmulhrsw m3, m7
10360 pmaddubsw m5, m2, m6
10361 pmulhrsw m5, m7
10362 packuswb m3, m5
10363 movu [r0 + 176 * 16], m3
10364 pmaddubsw m3, m1, m6
10365 pmulhrsw m3, m7
10366 pmaddubsw m5, m4, m6
10367 pmulhrsw m5, m7
10368 packuswb m3, m5
10369 movu [r0 + 177 * 16], m3
10370
10371 ; mode 5 [row 30]
10372 movu m6, [r5 + 15 * 16]
10373 pmaddubsw m3, m0, m6
10374 pmulhrsw m3, m7
10375 pmaddubsw m5, m2, m6
10376 pmulhrsw m5, m7
10377 packuswb m3, m5
10378 movu [r0 + 252 * 16], m3
10379 pmaddubsw m3, m1, m6
10380 pmulhrsw m3, m7
10381 pmaddubsw m5, m4, m6
10382 pmulhrsw m5, m7
10383 packuswb m3, m5
10384 movu [r0 + 253 * 16], m3
10385
10386 ; mode 3 [row 20]
10387 movu m6, [r5 + 2 * 16]
10388 movu m0, [r4 + 18]
10389 movd m1, [r4 + 19]
10390 palignr m1, m0, 1
10391 punpcklbw m0, m1
10392 pmaddubsw m1, m0, m6
10393 pmulhrsw m1, m7
10394 movu m2, [r4 + 26]
10395 movd m3, [r4 + 27]
10396 palignr m3, m2, 1
10397 punpcklbw m2, m3
10398 pmaddubsw m3, m2, m6
10399 pmulhrsw m3, m7
10400 packuswb m1, m3
10401 movu [r0 + 104 * 16], m1
10402
10403 movu m1, [r4 + 34]
10404 movd m3, [r4 + 35]
10405 palignr m3, m1, 1
10406 punpcklbw m1, m3
10407 pmaddubsw m3, m1, m6
10408 pmulhrsw m3, m7
10409 movu m4, [r4 + 42]
10410 movd m5, [r4 + 43]
10411 palignr m5, m4, 1
10412 punpcklbw m4, m5
10413 pmaddubsw m5, m4, m6
10414 pmulhrsw m5, m7
10415 packuswb m3, m5
10416 movu [r0 + 105 * 16], m3
10417
10418 ; mode 4 [row 25]
10419 pmaddubsw m3, m0, m6
10420 pmulhrsw m3, m7
10421 pmaddubsw m5, m2, m6
10422 pmulhrsw m5, m7
10423 packuswb m3, m5
10424 movu [r0 + 178 * 16], m3
10425 pmaddubsw m3, m1, m6
10426 pmulhrsw m3, m7
10427 pmaddubsw m5, m4, m6
10428 pmulhrsw m5, m7
10429 packuswb m3, m5
10430 movu [r0 + 179 * 16], m3
10431
10432 ; mode 4 [row 26]
10433 movu m6, [r5 + 23 * 16]
10434 pmaddubsw m3, m0, m6
10435 pmulhrsw m3, m7
10436 pmaddubsw m5, m2, m6
10437 pmulhrsw m5, m7
10438 packuswb m3, m5
10439 movu [r0 + 180 * 16], m3
10440 pmaddubsw m3, m1, m6
10441 pmulhrsw m3, m7
10442 pmaddubsw m5, m4, m6
10443 pmulhrsw m5, m7
10444 packuswb m3, m5
10445 movu [r0 + 181 * 16], m3
10446
10447 ; mode 3 [row 21]
10448 movu m6, [r5 + 28 * 16]
10449 pmaddubsw m3, m0, m6
10450 pmulhrsw m3, m7
10451 pmaddubsw m5, m2, m6
10452 pmulhrsw m5, m7
10453 packuswb m3, m5
10454 movu [r0 + 106 * 16], m3
10455 pmaddubsw m3, m1, m6
10456 pmulhrsw m3, m7
10457 pmaddubsw m5, m4, m6
10458 pmulhrsw m5, m7
10459 packuswb m3, m5
10460 movu [r0 + 107 * 16], m3
10461
10462 ; mode 3 [row 22]
10463 movu m6, [r5 + 22 * 16]
10464 movu m0, [r4 + 19]
10465 movd m1, [r4 + 20]
10466 palignr m1, m0, 1
10467 punpcklbw m0, m1
10468 pmaddubsw m1, m0, m6
10469 pmulhrsw m1, m7
10470 movu m2, [r4 + 27]
10471 movd m3, [r4 + 28]
10472 palignr m3, m2, 1
10473 punpcklbw m2, m3
10474 pmaddubsw m3, m2, m6
10475 pmulhrsw m3, m7
10476 packuswb m1, m3
10477 movu [r0 + 108 * 16], m1
10478
10479 movu m1, [r4 + 35]
10480 movd m3, [r4 + 36]
10481 palignr m3, m1, 1
10482 punpcklbw m1, m3
10483 pmaddubsw m3, m1, m6
10484 pmulhrsw m3, m7
10485 movu m4, [r4 + 43]
10486 movd m5, [r4 + 44]
10487 palignr m5, m4, 1
10488 punpcklbw m4, m5
10489 pmaddubsw m5, m4, m6
10490 pmulhrsw m5, m7
10491 packuswb m3, m5
10492 movu [r0 + 109 * 16], m3
10493
10494 ; mode 4 [row 27]
10495 movu m6, [r5 + 12 * 16]
10496 pmaddubsw m3, m0, m6
10497 pmulhrsw m3, m7
10498 pmaddubsw m5, m2, m6
10499 pmulhrsw m5, m7
10500 packuswb m3, m5
10501 movu [r0 + 182 * 16], m3
10502 pmaddubsw m3, m1, m6
10503 pmulhrsw m3, m7
10504 pmaddubsw m5, m4, m6
10505 pmulhrsw m5, m7
10506 packuswb m3, m5
10507 movu [r0 + 183 * 16], m3
10508
10509 ; mode 3 [row 23]
10510 movu m6, [r5 + 16 * 16]
10511 movu m0, [r4 + 20]
10512 movd m1, [r4 + 21]
10513 palignr m1, m0, 1
10514 punpcklbw m0, m1
10515 pmaddubsw m1, m0, m6
10516 pmulhrsw m1, m7
10517 movu m2, [r4 + 28]
10518 movd m3, [r4 + 29]
10519 palignr m3, m2, 1
10520 punpcklbw m2, m3
10521 pmaddubsw m3, m2, m6
10522 pmulhrsw m3, m7
10523 packuswb m1, m3
10524 movu [r0 + 110 * 16], m1
10525
10526 movu m1, [r4 + 36]
10527 movd m3, [r4 + 37]
10528 palignr m3, m1, 1
10529 punpcklbw m1, m3
10530 pmaddubsw m3, m1, m6
10531 pmulhrsw m3, m7
10532 movu m4, [r4 + 44]
10533 movd m5, [r4 + 45]
10534 palignr m5, m4, 1
10535 punpcklbw m4, m5
10536 pmaddubsw m5, m4, m6
10537 pmulhrsw m5, m7
10538 packuswb m3, m5
10539 movu [r0 + 111 * 16], m3
10540
10541 ; mode 4 [row 28]
10542 movu m6, [r5 + 1 * 16]
10543 pmaddubsw m3, m0, m6
10544 pmulhrsw m3, m7
10545 pmaddubsw m5, m2, m6
10546 pmulhrsw m5, m7
10547 packuswb m3, m5
10548 movu [r0 + 184 * 16], m3
10549 pmaddubsw m3, m1, m6
10550 pmulhrsw m3, m7
10551 pmaddubsw m5, m4, m6
10552 pmulhrsw m5, m7
10553 packuswb m3, m5
10554 movu [r0 + 185 * 16], m3
10555
10556 ; mode 4 [row 29]
10557 movu m6, [r5 + 22 * 16]
10558 pmaddubsw m3, m0, m6
10559 pmulhrsw m3, m7
10560 pmaddubsw m5, m2, m6
10561 pmulhrsw m5, m7
10562 packuswb m3, m5
10563 movu [r0 + 186 * 16], m3
10564 pmaddubsw m3, m1, m6
10565 pmulhrsw m3, m7
10566 pmaddubsw m5, m4, m6
10567 pmulhrsw m5, m7
10568 packuswb m3, m5
10569 movu [r0 + 187 * 16], m3
10570
10571 ; mode 3 [row 24]
10572 movu m6, [r5 + 10 * 16]
10573 movu m0, [r4 + 21]
10574 movd m1, [r4 + 22]
10575 palignr m1, m0, 1
10576 punpcklbw m0, m1
10577 pmaddubsw m1, m0, m6
10578 pmulhrsw m1, m7
10579 movu m2, [r4 + 29]
10580 movd m3, [r4 + 30]
10581 palignr m3, m2, 1
10582 punpcklbw m2, m3
10583 pmaddubsw m3, m2, m6
10584 pmulhrsw m3, m7
10585 packuswb m1, m3
10586 movu [r0 + 112 * 16], m1
10587
10588 movu m1, [r4 + 37]
10589 movd m3, [r4 + 38]
10590 palignr m3, m1, 1
10591 punpcklbw m1, m3
10592 pmaddubsw m3, m1, m6
10593 pmulhrsw m3, m7
10594 movu m4, [r4 + 45]
10595 movd m5, [r4 + 46]
10596 palignr m5, m4, 1
10597 punpcklbw m4, m5
10598 pmaddubsw m5, m4, m6
10599 pmulhrsw m5, m7
10600 packuswb m3, m5
10601 movu [r0 + 113 * 16], m3
10602
10603 ; mode 4 [row 30]
10604 movu m6, [r5 + 11 * 16]
10605 pmaddubsw m3, m0, m6
10606 pmulhrsw m3, m7
10607 pmaddubsw m5, m2, m6
10608 pmulhrsw m5, m7
10609 packuswb m3, m5
10610 movu [r0 + 188 * 16], m3
10611 pmaddubsw m3, m1, m6
10612 pmulhrsw m3, m7
10613 pmaddubsw m5, m4, m6
10614 pmulhrsw m5, m7
10615 packuswb m3, m5
10616 movu [r0 + 189 * 16], m3
10617
10618 ; mode 3 [row 25]
10619 movu m6, [r5 + 4 * 16]
10620 movu m0, [r4 + 22]
10621 movd m1, [r4 + 23]
10622 palignr m1, m0, 1
10623 punpcklbw m0, m1
10624 pmaddubsw m1, m0, m6
10625 pmulhrsw m1, m7
10626 movu m2, [r4 + 30]
10627 movd m3, [r4 + 31]
10628 palignr m3, m2, 1
10629 punpcklbw m2, m3
10630 pmaddubsw m3, m2, m6
10631 pmulhrsw m3, m7
10632 packuswb m1, m3
10633 movu [r0 + 114 * 16], m1
10634
10635 movu m1, [r4 + 38]
10636 movd m3, [r4 + 39]
10637 palignr m3, m1, 1
10638 punpcklbw m1, m3
10639 pmaddubsw m3, m1, m6
10640 pmulhrsw m3, m7
10641 movu m4, [r4 + 46]
10642 movd m5, [r4 + 47]
10643 palignr m5, m4, 1
10644 punpcklbw m4, m5
10645 pmaddubsw m5, m4, m6
10646 pmulhrsw m5, m7
10647 packuswb m3, m5
10648 movu [r0 + 115 * 16], m3
10649
10650 ; mode 3 [row 26]
10651 movu m6, [r5 + 30 * 16]
10652 pmaddubsw m3, m0, m6
10653 pmulhrsw m3, m7
10654 pmaddubsw m5, m2, m6
10655 pmulhrsw m5, m7
10656 packuswb m3, m5
10657 movu [r0 + 116 * 16], m3
10658 pmaddubsw m3, m1, m6
10659 pmulhrsw m3, m7
10660 pmaddubsw m5, m4, m6
10661 pmulhrsw m5, m7
10662 packuswb m3, m5
10663 movu [r0 + 117 * 16], m3
10664
10665 ; mode 3 [row 27]
10666 movu m6, [r5 + 24 * 16]
10667 movu m0, [r4 + 23]
10668 movd m1, [r4 + 24]
10669 palignr m1, m0, 1
10670 punpcklbw m0, m1
10671 pmaddubsw m1, m0, m6
10672 pmulhrsw m1, m7
10673 movu m2, [r4 + 31]
10674 movd m3, [r4 + 32]
10675 palignr m3, m2, 1
10676 punpcklbw m2, m3
10677 pmaddubsw m3, m2, m6
10678 pmulhrsw m3, m7
10679 packuswb m1, m3
10680 movu [r0 + 118 * 16], m1
10681
10682 movu m1, [r4 + 39]
10683 movd m3, [r4 + 40]
10684 palignr m3, m1, 1
10685 punpcklbw m1, m3
10686 pmaddubsw m3, m1, m6
10687 pmulhrsw m3, m7
10688 movu m4, [r4 + 47]
10689 movd m5, [r4 + 48]
10690 palignr m5, m4, 1
10691 punpcklbw m4, m5
10692 pmaddubsw m5, m4, m6
10693 pmulhrsw m5, m7
10694 packuswb m3, m5
10695 movu [r0 + 119 * 16], m3
10696
10697 ; mode 3 [row 28]
10698 movu m6, [r5 + 18 * 16]
10699 movu m0, [r4 + 24]
10700 movd m1, [r4 + 25]
10701 palignr m1, m0, 1
10702 punpcklbw m0, m1
10703 pmaddubsw m1, m0, m6
10704 pmulhrsw m1, m7
10705 movu m2, [r4 + 32]
10706 movd m3, [r4 + 33]
10707 palignr m3, m2, 1
10708 punpcklbw m2, m3
10709 pmaddubsw m3, m2, m6
10710 pmulhrsw m3, m7
10711 packuswb m1, m3
10712 movu [r0 + 120 * 16], m1
10713
10714 movu m1, [r4 + 40]
10715 movd m3, [r4 + 41]
10716 palignr m3, m1, 1
10717 punpcklbw m1, m3
10718 pmaddubsw m3, m1, m6
10719 pmulhrsw m3, m7
10720 movu m4, [r4 + 48]
10721 movd m5, [r4 + 49]
10722 palignr m5, m4, 1
10723 punpcklbw m4, m5
10724 pmaddubsw m5, m4, m6
10725 pmulhrsw m5, m7
10726 packuswb m3, m5
10727 movu [r0 + 121 * 16], m3
10728
10729 ; mode 3 [row 29]
10730 movu m6, [r5 + 12 * 16]
10731 movu m0, [r4 + 25]
10732 movd m1, [r4 + 26]
10733 palignr m1, m0, 1
10734 punpcklbw m0, m1
10735 pmaddubsw m1, m0, m6
10736 pmulhrsw m1, m7
10737 movu m2, [r4 + 33]
10738 movd m3, [r4 + 34]
10739 palignr m3, m2, 1
10740 punpcklbw m2, m3
10741 pmaddubsw m3, m2, m6
10742 pmulhrsw m3, m7
10743 packuswb m1, m3
10744 movu [r0 + 122 * 16], m1
10745
10746 movu m1, [r4 + 41]
10747 movd m3, [r4 + 42]
10748 palignr m3, m1, 1
10749 punpcklbw m1, m3
10750 pmaddubsw m3, m1, m6
10751 pmulhrsw m3, m7
10752 movu m4, [r4 + 49]
10753 movd m5, [r4 + 50]
10754 palignr m5, m4, 1
10755 punpcklbw m4, m5
10756 pmaddubsw m5, m4, m6
10757 pmulhrsw m5, m7
10758 packuswb m3, m5
10759 movu [r0 + 123 * 16], m3
10760
10761 ; mode 3 [row 30]
10762 movu m6, [r5 + 6 * 16]
10763 movu m0, [r4 + 26]
10764 movd m1, [r4 + 27]
10765 palignr m1, m0, 1
10766 punpcklbw m0, m1
10767 pmaddubsw m1, m0, m6
10768 pmulhrsw m1, m7
10769 movu m2, [r4 + 34]
10770 movd m3, [r4 + 35]
10771 palignr m3, m2, 1
10772 punpcklbw m2, m3
10773 pmaddubsw m3, m2, m6
10774 pmulhrsw m3, m7
10775 packuswb m1, m3
10776 movu [r0 + 124 * 16], m1
10777
10778 movu m1, [r4 + 42]
10779 movd m3, [r4 + 43]
10780 palignr m3, m1, 1
10781 punpcklbw m1, m3
10782 pmaddubsw m3, m1, m6
10783 pmulhrsw m3, m7
10784 movu m4, [r4 + 50]
10785 movd m5, [r4 + 51]
10786 palignr m5, m4, 1
10787 punpcklbw m4, m5
10788 pmaddubsw m5, m4, m6
10789 pmulhrsw m5, m7
10790 packuswb m3, m5
10791 movu [r0 + 125 * 16], m3
10792
10793 ; mode 10
10794 movu m1, [r2 + 1]
10795 movu m2, [r2 + 17]
10796 movu [r0 + 512 * 16], m1
10797 movu [r0 + 513 * 16], m2
10798 movu [r0 + 514 * 16], m1
10799 movu [r0 + 515 * 16], m2
10800 movu [r0 + 516 * 16], m1
10801 movu [r0 + 517 * 16], m2
10802 movu [r0 + 518 * 16], m1
10803 movu [r0 + 519 * 16], m2
10804 movu [r0 + 520 * 16], m1
10805 movu [r0 + 521 * 16], m2
10806 movu [r0 + 522 * 16], m1
10807 movu [r0 + 523 * 16], m2
10808 movu [r0 + 524 * 16], m1
10809 movu [r0 + 525 * 16], m2
10810 movu [r0 + 526 * 16], m1
10811 movu [r0 + 527 * 16], m2
10812
10813 movu [r0 + 528 * 16], m1
10814 movu [r0 + 529 * 16], m2
10815 movu [r0 + 530 * 16], m1
10816 movu [r0 + 531 * 16], m2
10817 movu [r0 + 532 * 16], m1
10818 movu [r0 + 533 * 16], m2
10819 movu [r0 + 534 * 16], m1
10820 movu [r0 + 535 * 16], m2
10821 movu [r0 + 536 * 16], m1
10822 movu [r0 + 537 * 16], m2
10823 movu [r0 + 538 * 16], m1
10824 movu [r0 + 539 * 16], m2
10825 movu [r0 + 540 * 16], m1
10826 movu [r0 + 541 * 16], m2
10827 movu [r0 + 542 * 16], m1
10828 movu [r0 + 543 * 16], m2
10829
10830 movu [r0 + 544 * 16], m1
10831 movu [r0 + 545 * 16], m2
10832 movu [r0 + 546 * 16], m1
10833 movu [r0 + 547 * 16], m2
10834 movu [r0 + 548 * 16], m1
10835 movu [r0 + 549 * 16], m2
10836 movu [r0 + 550 * 16], m1
10837 movu [r0 + 551 * 16], m2
10838 movu [r0 + 552 * 16], m1
10839 movu [r0 + 553 * 16], m2
10840 movu [r0 + 554 * 16], m1
10841 movu [r0 + 555 * 16], m2
10842 movu [r0 + 556 * 16], m1
10843 movu [r0 + 557 * 16], m2
10844 movu [r0 + 558 * 16], m1
10845 movu [r0 + 559 * 16], m2
10846
10847 movu [r0 + 560 * 16], m1
10848 movu [r0 + 561 * 16], m2
10849 movu [r0 + 562 * 16], m1
10850 movu [r0 + 563 * 16], m2
10851 movu [r0 + 564 * 16], m1
10852 movu [r0 + 565 * 16], m2
10853 movu [r0 + 566 * 16], m1
10854 movu [r0 + 567 * 16], m2
10855 movu [r0 + 568 * 16], m1
10856 movu [r0 + 569 * 16], m2
10857 movu [r0 + 570 * 16], m1
10858 movu [r0 + 571 * 16], m2
10859 movu [r0 + 572 * 16], m1
10860 movu [r0 + 573 * 16], m2
10861 movu [r0 + 574 * 16], m1
10862 movu [r0 + 575 * 16], m2
10863
10864 ; mode 11 [row 0]
10865 movu m0, [r4]
10866
10867 ; mode 11 [row 15 - first half]
10868 movu [r0 + 606 * 16], m0
10869
10870 movu [r0 + 606 * 16], m0
10871
10872 ; mode 12 [row 31]
10873 pslldq m6, m0, 4
10874 pinsrb m6, [r3 + 26], 0
10875 pinsrb m6, [r3 + 19], 1
10876 pinsrb m6, [r3 + 13], 2
10877 pinsrb m6, [r3 + 6], 3
10878 movu [r0 + 702 * 16], m6
10879 movu m6, [r4 + 12]
10880 movu [r0 + 703 * 16], m6
10881
10882 ; mode 11 [row 31]
10883 pslldq m6, m0, 1
10884 pinsrb m6, [r3 + 16], 0
10885 movu [r0 + 638 * 16], m6
10886 movu m6, [r4 + 15]
10887 movu [r0 + 639 * 16], m6
10888
10889 movd m1, [r4 + 1]
10890 palignr m1, m0, 1
10891 punpcklbw m0, m1
10892 pmaddubsw m1, m0, [r5 + 30 * 16]
10893 pmulhrsw m1, m7
10894 movu m2, [r4 + 8]
10895 movd m3, [r4 + 9]
10896 palignr m3, m2, 1
10897 punpcklbw m2, m3
10898 pmaddubsw m3, m2, [r5 + 30 * 16]
10899 pmulhrsw m3, m7
10900 packuswb m1, m3
10901 movu [r0 + 576 * 16], m1
10902
10903 movu m1, [r4 + 16]
10904
10905 ; mode 11 [row 15 - second half]
10906 movu [r0 + 607 * 16], m1
10907
10908 movd m3, [r4 + 17]
10909 palignr m3, m1, 1
10910 punpcklbw m1, m3
10911 pmaddubsw m3, m1, [r5 + 30 * 16]
10912 pmulhrsw m3, m7
10913 movu m4, [r4 + 24]
10914 movd m5, [r4 + 25]
10915 palignr m5, m4, 1
10916 punpcklbw m4, m5
10917 pmaddubsw m5, m4, [r5 + 30 * 16]
10918 pmulhrsw m5, m7
10919 packuswb m3, m5
10920 movu [r0 + 577 * 16], m3
10921
10922 ; mode 11 [row 1]
10923 pmaddubsw m3, m0, [r5 + 28 * 16]
10924 pmulhrsw m3, m7
10925 pmaddubsw m5, m2, [r5 + 28 * 16]
10926 pmulhrsw m5, m7
10927 packuswb m3, m5
10928 movu [r0 + 578 * 16], m3
10929 pmaddubsw m3, m1, [r5 + 28 * 16]
10930 pmulhrsw m3, m7
10931 pmaddubsw m5, m4, [r5 + 28 * 16]
10932 pmulhrsw m5, m7
10933 packuswb m3, m5
10934 movu [r0 + 579 * 16], m3
10935
10936 ; mode 11 [row 2]
10937 pmaddubsw m3, m0, [r5 + 26 * 16]
10938 pmulhrsw m3, m7
10939 pmaddubsw m5, m2, [r5 + 26 * 16]
10940 pmulhrsw m5, m7
10941 packuswb m3, m5
10942 movu [r0 + 580 * 16], m3
10943 pmaddubsw m3, m1, [r5 + 26 * 16]
10944 pmulhrsw m3, m7
10945 pmaddubsw m5, m4, [r5 + 26 * 16]
10946 pmulhrsw m5, m7
10947 packuswb m3, m5
10948 movu [r0 + 581 * 16], m3
10949
10950 ; mode 11 [row 3]
10951 pmaddubsw m3, m0, [r5 + 24 * 16]
10952 pmulhrsw m3, m7
10953 pmaddubsw m5, m2, [r5 + 24 * 16]
10954 pmulhrsw m5, m7
10955 packuswb m3, m5
10956 movu [r0 + 582 * 16], m3
10957 pmaddubsw m3, m1, [r5 + 24 * 16]
10958 pmulhrsw m3, m7
10959 pmaddubsw m5, m4, [r5 + 24 * 16]
10960 pmulhrsw m5, m7
10961 packuswb m3, m5
10962 movu [r0 + 583 * 16], m3
10963
10964 ; mode 11 [row 4]
10965 pmaddubsw m3, m0, [r5 + 22 * 16]
10966 pmulhrsw m3, m7
10967 pmaddubsw m5, m2, [r5 + 22 * 16]
10968 pmulhrsw m5, m7
10969 packuswb m3, m5
10970 movu [r0 + 584 * 16], m3
10971
10972 ; mode 12 [row 1 - first half]
10973 movu [r0 + 642 * 16], m3
10974
10975 pmaddubsw m3, m1, [r5 + 22 * 16]
10976 pmulhrsw m3, m7
10977 pmaddubsw m5, m4, [r5 + 22 * 16]
10978 pmulhrsw m5, m7
10979 packuswb m3, m5
10980 movu [r0 + 585 * 16], m3
10981
10982 ; mode 12 [row 1 - second half]
10983 movu [r0 + 643 * 16], m3
10984
10985 ; mode 11 [row 5]
10986 pmaddubsw m3, m0, [r5 + 20 * 16]
10987 pmulhrsw m3, m7
10988 pmaddubsw m5, m2, [r5 + 20 * 16]
10989 pmulhrsw m5, m7
10990 packuswb m3, m5
10991 movu [r0 + 586 * 16], m3
10992 pmaddubsw m3, m1, [r5 + 20 * 16]
10993 pmulhrsw m3, m7
10994 pmaddubsw m5, m4, [r5 + 20 * 16]
10995 pmulhrsw m5, m7
10996 packuswb m3, m5
10997 movu [r0 + 587 * 16], m3
10998
10999 ; mode 11 [row 6]
11000 pmaddubsw m3, m0, [r5 + 18 * 16]
11001 pmulhrsw m3, m7
11002 pmaddubsw m5, m2, [r5 + 18 * 16]
11003 pmulhrsw m5, m7
11004 packuswb m3, m5
11005 movu [r0 + 588 * 16], m3
11006 pmaddubsw m3, m1, [r5 + 18 * 16]
11007 pmulhrsw m3, m7
11008 pmaddubsw m5, m4, [r5 + 18 * 16]
11009 pmulhrsw m5, m7
11010 packuswb m3, m5
11011 movu [r0 + 589 * 16], m3
11012
11013 ; mode 11 [row 7]
11014 pmaddubsw m3, m0, [r5 + 16 * 16]
11015 pmulhrsw m3, m7
11016 pmaddubsw m5, m2, [r5 + 16 * 16]
11017 pmulhrsw m5, m7
11018 packuswb m3, m5
11019 movu [r0 + 590 * 16], m3
11020 pmaddubsw m3, m1, [r5 + 16 * 16]
11021 pmulhrsw m3, m7
11022 pmaddubsw m5, m4, [r5 + 16 * 16]
11023 pmulhrsw m5, m7
11024 packuswb m3, m5
11025 movu [r0 + 591 * 16], m3
11026
11027 ; mode 11 [row 8]
11028 pmaddubsw m3, m0, [r5 + 14 * 16]
11029 pmulhrsw m3, m7
11030 pmaddubsw m5, m2, [r5 + 14 * 16]
11031 pmulhrsw m5, m7
11032 packuswb m3, m5
11033 movu [r0 + 592 * 16], m3
11034
11035 ; mode 13 [row 1 - first half]
11036 movu [r0 + 706 * 16], m3
11037
11038 pmaddubsw m3, m1, [r5 + 14 * 16]
11039 pmulhrsw m3, m7
11040 pmaddubsw m5, m4, [r5 + 14 * 16]
11041 pmulhrsw m5, m7
11042 packuswb m3, m5
11043 movu [r0 + 593 * 16], m3
11044
11045 ; mode 13 [row 1 - second half]
11046 movu [r0 + 707 * 16], m3
11047
11048 ; mode 11 [row 9]
11049 pmaddubsw m3, m0, [r5 + 12 * 16]
11050 pmulhrsw m3, m7
11051 pmaddubsw m5, m2, [r5 + 12 * 16]
11052 pmulhrsw m5, m7
11053 packuswb m3, m5
11054 movu [r0 + 594 * 16], m3
11055
11056 ; mode 12 [row 3 - first half]
11057 movu [r0 + 646 * 16], m3
11058
11059 pmaddubsw m3, m1, [r5 + 12 * 16]
11060 pmulhrsw m3, m7
11061 pmaddubsw m5, m4, [r5 + 12 * 16]
11062 pmulhrsw m5, m7
11063 packuswb m3, m5
11064 movu [r0 + 595 * 16], m3
11065
11066 ; mode 12 [row 3 - second half]
11067 movu [r0 + 647 * 16], m3
11068
11069 ; mode 11 [row 10]
11070 pmaddubsw m3, m0, [r5 + 10 * 16]
11071 pmulhrsw m3, m7
11072 pmaddubsw m5, m2, [r5 + 10 * 16]
11073 pmulhrsw m5, m7
11074 packuswb m3, m5
11075 movu [r0 + 596 * 16], m3
11076 pmaddubsw m3, m1, [r5 + 10 * 16]
11077 pmulhrsw m3, m7
11078 pmaddubsw m5, m4, [r5 + 10 * 16]
11079 pmulhrsw m5, m7
11080 packuswb m3, m5
11081 movu [r0 + 597 * 16], m3
11082
11083 ; mode 11 [row 11]
11084 pmaddubsw m3, m0, [r5 + 8 * 16]
11085 pmulhrsw m3, m7
11086 pmaddubsw m5, m2, [r5 + 8 * 16]
11087 pmulhrsw m5, m7
11088 packuswb m3, m5
11089 movu [r0 + 598 * 16], m3
11090 pmaddubsw m3, m1, [r5 + 8 * 16]
11091 pmulhrsw m3, m7
11092 pmaddubsw m5, m4, [r5 + 8 * 16]
11093 pmulhrsw m5, m7
11094 packuswb m3, m5
11095 movu [r0 + 599 * 16], m3
11096
11097 ; mode 11 [row 12]
11098 pmaddubsw m3, m0, [r5 + 6 * 16]
11099 pmulhrsw m3, m7
11100 pmaddubsw m5, m2, [r5 + 6 * 16]
11101 pmulhrsw m5, m7
11102 packuswb m3, m5
11103 movu [r0 + 600 * 16], m3
11104
11105 ; mode 14 [row 1 - first half]
11106 movu [r0 + 770 * 16], m3
11107
11108 pmaddubsw m3, m1, [r5 + 6 * 16]
11109 pmulhrsw m3, m7
11110 pmaddubsw m5, m4, [r5 + 6 * 16]
11111 pmulhrsw m5, m7
11112 packuswb m3, m5
11113 movu [r0 + 601 * 16], m3
11114
11115 ; mode 14 [row 1 - second half]
11116 movu [r0 + 771 * 16], m3
11117
11118 ; mode 11 [row 13]
11119 pmaddubsw m3, m0, [r5 + 4 * 16]
11120 pmulhrsw m3, m7
11121 pmaddubsw m5, m2, [r5 + 4 * 16]
11122 pmulhrsw m5, m7
11123 packuswb m3, m5
11124 movu [r0 + 602 * 16], m3
11125 pmaddubsw m3, m1, [r5 + 4 * 16]
11126 pmulhrsw m3, m7
11127 pmaddubsw m5, m4, [r5 + 4 * 16]
11128 pmulhrsw m5, m7
11129 packuswb m3, m5
11130 movu [r0 + 603 * 16], m3
11131
11132 ; mode 11 [row 14]
11133 pmaddubsw m3, m0, [r5 + 2 * 16]
11134 pmulhrsw m3, m7
11135 pmaddubsw m5, m2, [r5 + 2 * 16]
11136 pmulhrsw m5, m7
11137 packuswb m3, m5
11138 movu [r0 + 604 * 16], m3
11139
11140 ; mode 12 [row 5 - first half]
11141 movu [r0 + 650 * 16], m3
11142
11143 pmaddubsw m3, m1, [r5 + 2 * 16]
11144 pmulhrsw m3, m7
11145 pmaddubsw m5, m4, [r5 + 2 * 16]
11146 pmulhrsw m5, m7
11147 packuswb m3, m5
11148 movu [r0 + 605 * 16], m3
11149
11150 ; mode 12 [row 5 - second half]
11151 movu [r0 + 651 * 16], m3
11152
11153 ; mode 12 [row 0]
11154 pmaddubsw m3, m0, [r5 + 27 * 16]
11155 pmulhrsw m3, m7
11156 pmaddubsw m5, m2, [r5 + 27 * 16]
11157 pmulhrsw m5, m7
11158 packuswb m3, m5
11159 movu [r0 + 640 * 16], m3
11160 pmaddubsw m3, m1, [r5 + 27 * 16]
11161 pmulhrsw m3, m7
11162 pmaddubsw m5, m4, [r5 + 27 * 16]
11163 pmulhrsw m5, m7
11164 packuswb m3, m5
11165 movu [r0 + 641 * 16], m3
11166
11167 ; mode 12 [row 2]
11168 pmaddubsw m3, m0, [r5 + 17 * 16]
11169 pmulhrsw m3, m7
11170 pmaddubsw m5, m2, [r5 + 17 * 16]
11171 pmulhrsw m5, m7
11172 packuswb m3, m5
11173 movu [r0 + 644 * 16], m3
11174 pmaddubsw m3, m1, [r5 + 17 * 16]
11175 pmulhrsw m3, m7
11176 pmaddubsw m5, m4, [r5 + 17 * 16]
11177 pmulhrsw m5, m7
11178 packuswb m3, m5
11179 movu [r0 + 645 * 16], m3
11180
11181 ; mode 12 [row 4]
11182 pmaddubsw m3, m0, [r5 + 7 * 16]
11183 pmulhrsw m3, m7
11184 pmaddubsw m5, m2, [r5 + 7 * 16]
11185 pmulhrsw m5, m7
11186 packuswb m3, m5
11187 movu [r0 + 648 * 16], m3
11188 pmaddubsw m3, m1, [r5 + 7 * 16]
11189 pmulhrsw m3, m7
11190 pmaddubsw m5, m4, [r5 + 7 * 16]
11191 pmulhrsw m5, m7
11192 packuswb m3, m5
11193 movu [r0 + 649 * 16], m3
11194
11195 ; mode 13 [row 0]
11196 pmaddubsw m3, m0, [r5 + 23 * 16]
11197 pmulhrsw m3, m7
11198 pmaddubsw m5, m2, [r5 + 23 * 16]
11199 pmulhrsw m5, m7
11200 packuswb m3, m5
11201 movu [r0 + 704 * 16], m3
11202 pmaddubsw m3, m1, [r5 + 23 * 16]
11203 pmulhrsw m3, m7
11204 pmaddubsw m5, m4, [r5 + 23 * 16]
11205 pmulhrsw m5, m7
11206 packuswb m3, m5
11207 movu [r0 + 705 * 16], m3
11208
11209 ; mode 13 [row 2]
11210 pmaddubsw m3, m0, [r5 + 5 * 16]
11211 pmulhrsw m3, m7
11212 pmaddubsw m5, m2, [r5 + 5 * 16]
11213 pmulhrsw m5, m7
11214 packuswb m3, m5
11215 movu [r0 + 708 * 16], m3
11216 pmaddubsw m3, m1, [r5 + 5 * 16]
11217 pmulhrsw m3, m7
11218 pmaddubsw m5, m4, [r5 + 5 * 16]
11219 pmulhrsw m5, m7
11220 packuswb m3, m5
11221 movu [r0 + 709 * 16], m3
11222
11223 ; mode 14 [row 0]
11224 pmaddubsw m3, m0, [r5 + 19 * 16]
11225 pmulhrsw m3, m7
11226 pmaddubsw m5, m2, [r5 + 19 * 16]
11227 pmulhrsw m5, m7
11228 packuswb m3, m5
11229 movu [r0 + 768 * 16], m3
11230 pmaddubsw m3, m1, [r5 + 19 * 16]
11231 pmulhrsw m3, m7
11232 pmaddubsw m5, m4, [r5 + 19 * 16]
11233 pmulhrsw m5, m7
11234 packuswb m3, m5
11235 movu [r0 + 769 * 16], m3
11236
11237 ; mode 15 [row 0]
11238 pmaddubsw m3, m0, [r5 + 15 * 16]
11239 pmulhrsw m3, m7
11240 pmaddubsw m5, m2, [r5 + 15 * 16]
11241 pmulhrsw m5, m7
11242 packuswb m3, m5
11243 movu [r0 + 832 * 16], m3
11244 pmaddubsw m3, m1, [r5 + 15 * 16]
11245 pmulhrsw m3, m7
11246 pmaddubsw m5, m4, [r5 + 15 * 16]
11247 pmulhrsw m5, m7
11248 packuswb m3, m5
11249 movu [r0 + 833 * 16], m3
11250
11251 ; mode 11 [row 16]
11252 pslldq m0, 2
11253 pinsrb m0, [r4 + 0], 1
11254 pinsrb m0, [r3 + 16], 0
11255 pmaddubsw m3, m0, [r5 + 30 * 16]
11256 pmulhrsw m3, m7
11257 pslldq m2, 2
11258 pinsrb m2, [r4 + 8], 1
11259 pinsrb m2, [r4 + 7], 0
11260 pmaddubsw m5, m2, [r5 + 30 * 16]
11261 pmulhrsw m5, m7
11262 packuswb m3, m5
11263 movu [r0 + 608 * 16], m3
11264 pslldq m1, 2
11265 pinsrb m1, [r4 + 16], 1
11266 pinsrb m1, [r4 + 15], 0
11267 pmaddubsw m3, m1, [r5 + 30 * 16]
11268 pmulhrsw m3, m7
11269 pslldq m4, 2
11270 pinsrb m4, [r4 + 24], 1
11271 pinsrb m4, [r4 + 23], 0
11272 pmaddubsw m5, m4, [r5 + 30 * 16]
11273 pmulhrsw m5, m7
11274 packuswb m3, m5
11275 movu [r0 + 609 * 16], m3
11276
11277 ; mode 11 [row 17]
11278 pmaddubsw m3, m0, [r5 + 28 * 16]
11279 pmulhrsw m3, m7
11280 pmaddubsw m5, m2, [r5 + 28 * 16]
11281 pmulhrsw m5, m7
11282 packuswb m3, m5
11283 movu [r0 + 610 * 16], m3
11284 pmaddubsw m3, m1, [r5 + 28 * 16]
11285 pmulhrsw m3, m7
11286 pmaddubsw m5, m4, [r5 + 28 * 16]
11287 pmulhrsw m5, m7
11288 packuswb m3, m5
11289 movu [r0 + 611 * 16], m3
11290
11291 ; mode 11 [row 18]
11292 pmaddubsw m3, m0, [r5 + 26 * 16]
11293 pmulhrsw m3, m7
11294 pmaddubsw m5, m2, [r5 + 26 * 16]
11295 pmulhrsw m5, m7
11296 packuswb m3, m5
11297 movu [r0 + 612 * 16], m3
11298 pmaddubsw m3, m1, [r5 + 26 * 16]
11299 pmulhrsw m3, m7
11300 pmaddubsw m5, m4, [r5 + 26 * 16]
11301 pmulhrsw m5, m7
11302 packuswb m3, m5
11303 movu [r0 + 613 * 16], m3
11304
11305 ; mode 11 [row 19]
11306 pmaddubsw m3, m0, [r5 + 24 * 16]
11307 pmulhrsw m3, m7
11308 pmaddubsw m5, m2, [r5 + 24 * 16]
11309 pmulhrsw m5, m7
11310 packuswb m3, m5
11311 movu [r0 + 614 * 16], m3
11312 pmaddubsw m3, m1, [r5 + 24 * 16]
11313 pmulhrsw m3, m7
11314 pmaddubsw m5, m4, [r5 + 24 * 16]
11315 pmulhrsw m5, m7
11316 packuswb m3, m5
11317 movu [r0 + 615 * 16], m3
11318
11319 ; mode 11 [row 20]
11320 pmaddubsw m3, m0, [r5 + 22 * 16]
11321 pmulhrsw m3, m7
11322 pmaddubsw m5, m2, [r5 + 22 * 16]
11323 pmulhrsw m5, m7
11324 packuswb m3, m5
11325 movu [r0 + 616 * 16], m3
11326 pmaddubsw m3, m1, [r5 + 22 * 16]
11327 pmulhrsw m3, m7
11328 pmaddubsw m5, m4, [r5 + 22 * 16]
11329 pmulhrsw m5, m7
11330 packuswb m3, m5
11331 movu [r0 + 617 * 16], m3
11332
11333 ; mode 11 [row 21]
11334 pmaddubsw m3, m0, [r5 + 20 * 16]
11335 pmulhrsw m3, m7
11336 pmaddubsw m5, m2, [r5 + 20 * 16]
11337 pmulhrsw m5, m7
11338 packuswb m3, m5
11339 movu [r0 + 618 * 16], m3
11340 pmaddubsw m3, m1, [r5 + 20 * 16]
11341 pmulhrsw m3, m7
11342 pmaddubsw m5, m4, [r5 + 20 * 16]
11343 pmulhrsw m5, m7
11344 packuswb m3, m5
11345 movu [r0 + 619 * 16], m3
11346
11347 ; mode 11 [row 22]
11348 pmaddubsw m3, m0, [r5 + 18 * 16]
11349 pmulhrsw m3, m7
11350 pmaddubsw m5, m2, [r5 + 18 * 16]
11351 pmulhrsw m5, m7
11352 packuswb m3, m5
11353 movu [r0 + 620 * 16], m3
11354 pmaddubsw m3, m1, [r5 + 18 * 16]
11355 pmulhrsw m3, m7
11356 pmaddubsw m5, m4, [r5 + 18 * 16]
11357 pmulhrsw m5, m7
11358 packuswb m3, m5
11359 movu [r0 + 621 * 16], m3
11360
11361 ; mode 11 [row 23]
11362 pmaddubsw m3, m0, [r5 + 16 * 16]
11363 pmulhrsw m3, m7
11364 pmaddubsw m5, m2, [r5 + 16 * 16]
11365 pmulhrsw m5, m7
11366 packuswb m3, m5
11367 movu [r0 + 622 * 16], m3
11368 pmaddubsw m3, m1, [r5 + 16 * 16]
11369 pmulhrsw m3, m7
11370 pmaddubsw m5, m4, [r5 + 16 * 16]
11371 pmulhrsw m5, m7
11372 packuswb m3, m5
11373 movu [r0 + 623 * 16], m3
11374
11375 ; mode 11 [row 24]
11376 pmaddubsw m3, m0, [r5 + 14 * 16]
11377 pmulhrsw m3, m7
11378 pmaddubsw m5, m2, [r5 + 14 * 16]
11379 pmulhrsw m5, m7
11380 packuswb m3, m5
11381 movu [r0 + 624 * 16], m3
11382 pmaddubsw m3, m1, [r5 + 14 * 16]
11383 pmulhrsw m3, m7
11384 pmaddubsw m5, m4, [r5 + 14 * 16]
11385 pmulhrsw m5, m7
11386 packuswb m3, m5
11387 movu [r0 + 625 * 16], m3
11388
11389 ; mode 11 [row 25]
11390 pmaddubsw m3, m0, [r5 + 12 * 16]
11391 pmulhrsw m3, m7
11392 pmaddubsw m5, m2, [r5 + 12 * 16]
11393 pmulhrsw m5, m7
11394 packuswb m3, m5
11395 movu [r0 + 626 * 16], m3
11396 pmaddubsw m3, m1, [r5 + 12 * 16]
11397 pmulhrsw m3, m7
11398 pmaddubsw m5, m4, [r5 + 12 * 16]
11399 pmulhrsw m5, m7
11400 packuswb m3, m5
11401 movu [r0 + 627 * 16], m3
11402
11403 ; mode 11 [row 26]
11404 pmaddubsw m3, m0, [r5 + 10 * 16]
11405 pmulhrsw m3, m7
11406 pmaddubsw m5, m2, [r5 + 10 * 16]
11407 pmulhrsw m5, m7
11408 packuswb m3, m5
11409 movu [r0 + 628 * 16], m3
11410 pmaddubsw m3, m1, [r5 + 10 * 16]
11411 pmulhrsw m3, m7
11412 pmaddubsw m5, m4, [r5 + 10 * 16]
11413 pmulhrsw m5, m7
11414 packuswb m3, m5
11415 movu [r0 + 629 * 16], m3
11416
11417 ; mode 11 [row 27]
11418 pmaddubsw m3, m0, [r5 + 8 * 16]
11419 pmulhrsw m3, m7
11420 pmaddubsw m5, m2, [r5 + 8 * 16]
11421 pmulhrsw m5, m7
11422 packuswb m3, m5
11423 movu [r0 + 630 * 16], m3
11424 pmaddubsw m3, m1, [r5 + 8 * 16]
11425 pmulhrsw m3, m7
11426 pmaddubsw m5, m4, [r5 + 8 * 16]
11427 pmulhrsw m5, m7
11428 packuswb m3, m5
11429 movu [r0 + 631 * 16], m3
11430
11431 ; mode 11 [row 28]
11432 pmaddubsw m3, m0, [r5 + 6 * 16]
11433 pmulhrsw m3, m7
11434 pmaddubsw m5, m2, [r5 + 6 * 16]
11435 pmulhrsw m5, m7
11436 packuswb m3, m5
11437 movu [r0 + 632 * 16], m3
11438 pmaddubsw m3, m1, [r5 + 6 * 16]
11439 pmulhrsw m3, m7
11440 pmaddubsw m5, m4, [r5 + 6 * 16]
11441 pmulhrsw m5, m7
11442 packuswb m3, m5
11443 movu [r0 + 633 * 16], m3
11444
11445 ; mode 11 [row 29]
11446 pmaddubsw m3, m0, [r5 + 4 * 16]
11447 pmulhrsw m3, m7
11448 pmaddubsw m5, m2, [r5 + 4 * 16]
11449 pmulhrsw m5, m7
11450 packuswb m3, m5
11451 movu [r0 + 634 * 16], m3
11452 pmaddubsw m3, m1, [r5 + 4 * 16]
11453 pmulhrsw m3, m7
11454 pmaddubsw m5, m4, [r5 + 4 * 16]
11455 pmulhrsw m5, m7
11456 packuswb m3, m5
11457 movu [r0 + 635 * 16], m3
11458
11459 ; mode 11 [row 30]
11460 pmaddubsw m3, m0, [r5 + 2 * 16]
11461 pmulhrsw m3, m7
11462 pmaddubsw m5, m2, [r5 + 2 * 16]
11463 pmulhrsw m5, m7
11464 packuswb m3, m5
11465 movu [r0 + 636 * 16], m3
11466 pmaddubsw m3, m1, [r5 + 2 * 16]
11467 pmulhrsw m3, m7
11468 pmaddubsw m5, m4, [r5 + 2 * 16]
11469 pmulhrsw m5, m7
11470 packuswb m3, m5
11471 movu [r0 + 637 * 16], m3
11472
11473 ; mode 12 [row 6]
11474 pinsrb m0, [r3 + 6], 0
11475 pmaddubsw m3, m0, [r5 + 29 * 16]
11476 pmulhrsw m3, m7
11477 pmaddubsw m5, m2, [r5 + 29 * 16]
11478 pmulhrsw m5, m7
11479 packuswb m3, m5
11480 movu [r0 + 652 * 16], m3
11481 pmaddubsw m3, m1, [r5 + 29 * 16]
11482 pmulhrsw m3, m7
11483 pmaddubsw m5, m4, [r5 + 29 * 16]
11484 pmulhrsw m5, m7
11485 packuswb m3, m5
11486 movu [r0 + 653 * 16], m3
11487
11488 ; mode 12 [row 7]
11489 pmaddubsw m3, m0, [r5 + 24 * 16]
11490 pmulhrsw m3, m7
11491 pmaddubsw m5, m2, [r5 + 24 * 16]
11492 pmulhrsw m5, m7
11493 packuswb m3, m5
11494 movu [r0 + 654 * 16], m3
11495 pmaddubsw m3, m1, [r5 + 24 * 16]
11496 pmulhrsw m3, m7
11497 pmaddubsw m5, m4, [r5 + 24 * 16]
11498 pmulhrsw m5, m7
11499 packuswb m3, m5
11500 movu [r0 + 655 * 16], m3
11501
11502 ; mode 12 [row 8]
11503 pmaddubsw m3, m0, [r5 + 19 * 16]
11504 pmulhrsw m3, m7
11505 pmaddubsw m5, m2, [r5 + 19 * 16]
11506 pmulhrsw m5, m7
11507 packuswb m3, m5
11508 movu [r0 + 656 * 16], m3
11509 pmaddubsw m3, m1, [r5 + 19 * 16]
11510 pmulhrsw m3, m7
11511 pmaddubsw m5, m4, [r5 + 19 * 16]
11512 pmulhrsw m5, m7
11513 packuswb m3, m5
11514 movu [r0 + 657 * 16], m3
11515
11516 ; mode 12 [row 9]
11517 pmaddubsw m3, m0, [r5 + 14 * 16]
11518 pmulhrsw m3, m7
11519 pmaddubsw m5, m2, [r5 + 14 * 16]
11520 pmulhrsw m5, m7
11521 packuswb m3, m5
11522 movu [r0 + 658 * 16], m3
11523 pmaddubsw m3, m1, [r5 + 14 * 16]
11524 pmulhrsw m3, m7
11525 pmaddubsw m5, m4, [r5 + 14 * 16]
11526 pmulhrsw m5, m7
11527 packuswb m3, m5
11528 movu [r0 + 659 * 16], m3
11529
11530 ; mode 12 [row 10]
11531 pmaddubsw m3, m0, [r5 + 9 * 16]
11532 pmulhrsw m3, m7
11533 pmaddubsw m5, m2, [r5 + 9 * 16]
11534 pmulhrsw m5, m7
11535 packuswb m3, m5
11536 movu [r0 + 660 * 16], m3
11537 pmaddubsw m3, m1, [r5 + 9 * 16]
11538 pmulhrsw m3, m7
11539 pmaddubsw m5, m4, [r5 + 9 * 16]
11540 pmulhrsw m5, m7
11541 packuswb m3, m5
11542 movu [r0 + 661 * 16], m3
11543
11544 ; mode 12 [row 11]
11545 pmaddubsw m3, m0, [r5 + 4 * 16]
11546 pmulhrsw m3, m7
11547 pmaddubsw m5, m2, [r5 + 4 * 16]
11548 pmulhrsw m5, m7
11549 packuswb m3, m5
11550 movu [r0 + 662 * 16], m3
11551 pmaddubsw m3, m1, [r5 + 4 * 16]
11552 pmulhrsw m3, m7
11553 pmaddubsw m5, m4, [r5 + 4 * 16]
11554 pmulhrsw m5, m7
11555 packuswb m3, m5
11556 movu [r0 + 663 * 16], m3
11557
11558 ; mode 13 [row 3]
11559 movu m6, m0
11560 pinsrb m6, [r3 + 4], 0
11561 pmaddubsw m3, m6, [r5 + 28 * 16]
11562 pmulhrsw m3, m7
11563 pmaddubsw m5, m2, [r5 + 28 * 16]
11564 pmulhrsw m5, m7
11565 packuswb m3, m5
11566 movu [r0 + 710 * 16], m3
11567 pmaddubsw m3, m1, [r5 + 28 * 16]
11568 pmulhrsw m3, m7
11569 pmaddubsw m5, m4, [r5 + 28 * 16]
11570 pmulhrsw m5, m7
11571 packuswb m3, m5
11572 movu [r0 + 711 * 16], m3
11573
11574 ; mode 13 [row 4]
11575 pmaddubsw m3, m6, [r5 + 19 * 16]
11576 pmulhrsw m3, m7
11577 pmaddubsw m5, m2, [r5 + 19 * 16]
11578 pmulhrsw m5, m7
11579 packuswb m3, m5
11580 movu [r0 + 712 * 16], m3
11581 pmaddubsw m3, m1, [r5 + 19 * 16]
11582 pmulhrsw m3, m7
11583 pmaddubsw m5, m4, [r5 + 19 * 16]
11584 pmulhrsw m5, m7
11585 packuswb m3, m5
11586 movu [r0 + 713 * 16], m3
11587
11588 ; mode 13 [row 5]
11589 pmaddubsw m3, m6, [r5 + 10 * 16]
11590 pmulhrsw m3, m7
11591 pmaddubsw m5, m2, [r5 + 10 * 16]
11592 pmulhrsw m5, m7
11593 packuswb m3, m5
11594 movu [r0 + 714 * 16], m3
11595 pmaddubsw m3, m1, [r5 + 10 * 16]
11596 pmulhrsw m3, m7
11597 pmaddubsw m5, m4, [r5 + 10 * 16]
11598 pmulhrsw m5, m7
11599 packuswb m3, m5
11600 movu [r0 + 715 * 16], m3
11601
11602 ; mode 13 [row 6]
11603 pmaddubsw m3, m6, [r5 + 1 * 16]
11604 pmulhrsw m3, m7
11605 pmaddubsw m5, m2, [r5 + 1 * 16]
11606 pmulhrsw m5, m7
11607 packuswb m3, m5
11608 movu [r0 + 716 * 16], m3
11609 pmaddubsw m3, m1, [r5 + 1 * 16]
11610 pmulhrsw m3, m7
11611 pmaddubsw m5, m4, [r5 + 1 * 16]
11612 pmulhrsw m5, m7
11613 packuswb m3, m5
11614 movu [r0 + 717 * 16], m3
11615
11616 ; mode 14 [row 2]
11617 movu m6, m0
11618 pinsrb m6, [r4 + 0], 1
11619 pinsrb m6, [r3 + 2], 0
11620 pmaddubsw m3, m6, [r5 + 25 * 16]
11621 pmulhrsw m3, m7
11622 pmaddubsw m5, m2, [r5 + 25 * 16]
11623 pmulhrsw m5, m7
11624 packuswb m3, m5
11625 movu [r0 + 772 * 16], m3
11626 pmaddubsw m3, m1, [r5 + 25 * 16]
11627 pmulhrsw m3, m7
11628 pmaddubsw m5, m4, [r5 + 25 * 16]
11629 pmulhrsw m5, m7
11630 packuswb m3, m5
11631 movu [r0 + 773 * 16], m3
11632
11633 ; mode 14 [row 3]
11634 pmaddubsw m3, m6, [r5 + 12 * 16]
11635 pmulhrsw m3, m7
11636 pmaddubsw m5, m2, [r5 + 12 * 16]
11637 pmulhrsw m5, m7
11638 packuswb m3, m5
11639 movu [r0 + 774 * 16], m3
11640 pmaddubsw m3, m1, [r5 + 12 * 16]
11641 pmulhrsw m3, m7
11642 pmaddubsw m5, m4, [r5 + 12 * 16]
11643 pmulhrsw m5, m7
11644 packuswb m3, m5
11645 movu [r0 + 775 * 16], m3
11646
11647 ; mode 15 [row 1]
11648 pmaddubsw m3, m6, [r5 + 30 * 16]
11649 pmulhrsw m3, m7
11650 pmaddubsw m5, m2, [r5 + 30 * 16]
11651 pmulhrsw m5, m7
11652 packuswb m3, m5
11653 movu [r0 + 834 * 16], m3
11654 pmaddubsw m3, m1, [r5 + 30 * 16]
11655 pmulhrsw m3, m7
11656 pmaddubsw m5, m4, [r5 + 30 * 16]
11657 pmulhrsw m5, m7
11658 packuswb m3, m5
11659 movu [r0 + 835 * 16], m3
11660
11661 ; mode 15 [row 2]
11662 pmaddubsw m3, m6, [r5 + 13 * 16]
11663 pmulhrsw m3, m7
11664 pmaddubsw m5, m2, [r5 + 13 * 16]
11665 pmulhrsw m5, m7
11666 packuswb m3, m5
11667 movu [r0 + 836 * 16], m3
11668 pmaddubsw m3, m1, [r5 + 13 * 16]
11669 pmulhrsw m3, m7
11670 pmaddubsw m5, m4, [r5 + 13 * 16]
11671 pmulhrsw m5, m7
11672 packuswb m3, m5
11673 movu [r0 + 837 * 16], m3
11674
11675 ; mode 15 [row 3]
11676 pslldq m6, 2
11677 pinsrb m6, [r3 + 2], 1
11678 pinsrb m6, [r3 + 4], 0
11679 pmaddubsw m3, m6, [r5 + 28 * 16]
11680 pmulhrsw m3, m7
11681 pslldq m2, 2
11682 pinsrb m2, [r4 + 7], 1
11683 pinsrb m2, [r4 + 6], 0
11684 pmaddubsw m5, m2, [r5 + 28 * 16]
11685 pmulhrsw m5, m7
11686 packuswb m3, m5
11687 movu [r0 + 838 * 16], m3
11688 pslldq m1, 2
11689 pinsrb m1, [r4 + 15], 1
11690 pinsrb m1, [r4 + 14], 0
11691 pmaddubsw m3, m1, [r5 + 28 * 16]
11692 pmulhrsw m3, m7
11693 pslldq m4, 2
11694 pinsrb m4, [r4 + 23], 1
11695 pinsrb m4, [r4 + 22], 0
11696 pmaddubsw m5, m4, [r5 + 28 * 16]
11697 pmulhrsw m5, m7
11698 packuswb m3, m5
11699 movu [r0 + 839 * 16], m3
11700
11701 ; mode 15 [row 4]
11702 pmaddubsw m3, m6, [r5 + 11 * 16]
11703 pmulhrsw m3, m7
11704 pmaddubsw m5, m2, [r5 + 11 * 16]
11705 pmulhrsw m5, m7
11706 packuswb m3, m5
11707 movu [r0 + 840 * 16], m3
11708 pmaddubsw m3, m1, [r5 + 11 * 16]
11709 pmulhrsw m3, m7
11710 pmaddubsw m5, m4, [r5 + 11 * 16]
11711 pmulhrsw m5, m7
11712 packuswb m3, m5
11713 movu [r0 + 841 * 16], m3
11714
11715 ; mode 15 [row 5, 0-7]
11716 pslldq m6, 2
11717 pinsrb m6, [r3 + 4], 1
11718 pinsrb m6, [r3 + 6], 0
11719 pmaddubsw m3, m6, [r5 + 26 * 16]
11720 pmulhrsw m3, m7
11721 packuswb m3, m3
11722 movh [r0 + 842 * 16], m3
11723
11724 ; mode 15 [row 6, 0-7]
11725 pmaddubsw m3, m6, [r5 + 9 * 16]
11726 pmulhrsw m3, m7
11727 packuswb m3, m3
11728 movh [r0 + 844 * 16], m3
11729
11730 ; mode 15 [row 7, 0-7]
11731 pslldq m6, 2
11732 pinsrb m6, [r3 + 6], 1
11733 pinsrb m6, [r3 + 8], 0
11734 pmaddubsw m3, m6, [r5 + 24 * 16]
11735 pmulhrsw m3, m7
11736 packuswb m3, m3
11737 movh [r0 + 846 * 16], m3
11738
11739 ; mode 15 [row 8, 0-7]
11740 pmaddubsw m3, m6, [r5 + 7 * 16]
11741 pmulhrsw m3, m7
11742 packuswb m3, m3
11743 movh [r0 + 848 * 16], m3
11744
11745 ; mode 15 [row 9, 0-7]
11746 pslldq m6, 2
11747 pinsrb m6, [r3 + 8], 1
11748 pinsrb m6, [r3 + 9], 0
11749 pmaddubsw m3, m6, [r5 + 22 * 16]
11750 pmulhrsw m3, m7
11751 packuswb m3, m3
11752 movh [r0 + 850 * 16], m3
11753
11754 ; mode 15 [row 10, 0-7]
11755 pmaddubsw m3, m6, [r5 + 5 * 16]
11756 pmulhrsw m3, m7
11757 packuswb m3, m3
11758 movh [r0 + 852 * 16], m3
11759
11760 ; mode 15 [row 11, 0-7]
11761 pslldq m6, 2
11762 pinsrb m6, [r3 + 9], 1
11763 pinsrb m6, [r3 + 11], 0
11764 pmaddubsw m3, m6, [r5 + 20 * 16]
11765 pmulhrsw m3, m7
11766 packuswb m3, m3
11767 movh [r0 + 854 * 16], m3
11768
11769 ; mode 15 [row 12, 0-7]
11770 pmaddubsw m3, m6, [r5 + 3 * 16]
11771 pmulhrsw m3, m7
11772 packuswb m3, m3
11773 movh [r0 + 856 * 16], m3
11774
11775 ; mode 15 [row 13, 0-7]
11776 pslldq m6, 2
11777 pinsrb m6, [r3 + 11], 1
11778 pinsrb m6, [r3 + 13], 0
11779 pmaddubsw m3, m6, [r5 + 18 * 16]
11780 pmulhrsw m3, m7
11781 packuswb m3, m3
11782 movh [r0 + 858 * 16], m3
11783
11784 ; mode 15 [row 14, 0-7]
11785 pmaddubsw m3, m6, [r5 + 1 * 16]
11786 pmulhrsw m3, m7
11787 packuswb m3, m3
11788 movh [r0 + 860 * 16], m3
11789
11790 ; mode 15 [row 15, 0-7]
11791 pslldq m6, 2
11792 pinsrb m6, [r3 + 13], 1
11793 pinsrb m6, [r3 + 15], 0
11794 pmaddubsw m3, m6, [r5 + 16 * 16]
11795 pmulhrsw m3, m7
11796 packuswb m3, m3
11797 movh [r0 + 862 * 16], m3
11798
11799 ; mode 15 [row 16, 0-7]
11800 pslldq m6, 2
11801 pinsrb m6, [r3 + 15], 1
11802 pinsrb m6, [r3 + 17], 0
11803 pmaddubsw m3, m6, [r5 + 31 * 16]
11804 pmulhrsw m3, m7
11805 packuswb m3, m3
11806 movh [r0 + 864 * 16], m3
11807
11808 ; mode 15 [row 17, 0-7]
11809 pmaddubsw m3, m6, [r5 + 14 * 16]
11810 pmulhrsw m3, m7
11811 packuswb m3, m3
11812 movh [r0 + 866 * 16], m3
11813
11814 ; mode 15 [row 18, 0-7]
11815 pslldq m6, 2
11816 pinsrb m6, [r3 + 17], 1
11817 pinsrb m6, [r3 + 19], 0
11818 pmaddubsw m3, m6, [r5 + 29 * 16]
11819 pmulhrsw m3, m7
11820 packuswb m3, m3
11821 movh [r0 + 868 * 16], m3
11822
11823 ; mode 15 [row 19, 0-7]
11824 pmaddubsw m3, m6, [r5 + 12 * 16]
11825 pmulhrsw m3, m7
11826 packuswb m3, m3
11827 movh [r0 + 870 * 16], m3
11828
11829 ; mode 15 [row 20, 0-7]
11830 pslldq m6, 2
11831 pinsrb m6, [r3 + 19], 1
11832 pinsrb m6, [r3 + 21], 0
11833 pmaddubsw m3, m6, [r5 + 27 * 16]
11834 pmulhrsw m3, m7
11835 packuswb m3, m3
11836 movh [r0 + 872 * 16], m3
11837
11838 ; mode 15 [row 21, 0-7]
11839 pmaddubsw m3, m6, [r5 + 10 * 16]
11840 pmulhrsw m3, m7
11841 packuswb m3, m3
11842 movh [r0 + 874 * 16], m3
11843
11844 ; mode 15 [row 22, 0-7]
11845 pslldq m6, 2
11846 pinsrb m6, [r3 + 21], 1
11847 pinsrb m6, [r3 + 23], 0
11848 pmaddubsw m3, m6, [r5 + 25 * 16]
11849 pmulhrsw m3, m7
11850 packuswb m3, m3
11851 movh [r0 + 876 * 16], m3
11852
11853 ; mode 15 [row 23, 0-7]
11854 pmaddubsw m3, m6, [r5 + 8 * 16]
11855 pmulhrsw m3, m7
11856 packuswb m3, m3
11857 movh [r0 + 878 * 16], m3
11858
11859 ; mode 15 [row 24, 0-7]
11860 pslldq m6, 2
11861 pinsrb m6, [r3 + 23], 1
11862 pinsrb m6, [r3 + 24], 0
11863 pmaddubsw m3, m6, [r5 + 23 * 16]
11864 pmulhrsw m3, m7
11865 packuswb m3, m3
11866 movh [r0 + 880 * 16], m3
11867
11868 ; mode 15 [row 25, 0-7]
11869 pmaddubsw m3, m6, [r5 + 6 * 16]
11870 pmulhrsw m3, m7
11871 packuswb m3, m3
11872 movh [r0 + 882 * 16], m3
11873
11874 ; mode 15 [row 26, 0-7]
11875 pslldq m6, 2
11876 pinsrb m6, [r3 + 24], 1
11877 pinsrb m6, [r3 + 26], 0
11878 pmaddubsw m3, m6, [r5 + 21 * 16]
11879 pmulhrsw m3, m7
11880 packuswb m3, m3
11881 movh [r0 + 884 * 16], m3
11882
11883 ; mode 15 [row 27, 0-7]
11884 pmaddubsw m3, m6, [r5 + 4 * 16]
11885 pmulhrsw m3, m7
11886 packuswb m3, m3
11887 movh [r0 + 886 * 16], m3
11888
11889 ; mode 15 [row 28, 0-7]
11890 pslldq m6, 2
11891 pinsrb m6, [r3 + 26], 1
11892 pinsrb m6, [r3 + 28], 0
11893 pmaddubsw m3, m6, [r5 + 19 * 16]
11894 pmulhrsw m3, m7
11895 packuswb m3, m3
11896 movh [r0 + 888 * 16], m3
11897
11898 ; mode 15 [row 29, 0-7]
11899 pmaddubsw m3, m6, [r5 + 2 * 16]
11900 pmulhrsw m3, m7
11901 packuswb m3, m3
11902 movh [r0 + 890 * 16], m3
11903
11904 ; mode 15 [row 30, 0-7]
11905 pslldq m6, 2
11906 pinsrb m6, [r3 + 28], 1
11907 pinsrb m6, [r3 + 30], 0
11908 pmaddubsw m3, m6, [r5 + 17 * 16]
11909 pmulhrsw m3, m7
11910 packuswb m3, m3
11911 movh [r0 + 892 * 16], m3
11912
11913 ; mode 15 [row 31, 0-7]
11914 pshufb m3, m6, [tab_S2]
11915 movh [r0 + 894 * 16], m3
11916
11917 ; mode 12 [row 12]
11918 pslldq m0, 2
11919 pinsrb m0, [r3 + 6], 1
11920 pinsrb m0, [r3 + 13], 0
11921 pmaddubsw m3, m0, [r5 + 31 * 16]
11922 pmulhrsw m3, m7
11923 pmaddubsw m5, m2, [r5 + 31 * 16]
11924 pmulhrsw m5, m7
11925 packuswb m3, m5
11926 movu [r0 + 664 * 16], m3
11927 pmaddubsw m3, m1, [r5 + 31 * 16]
11928 pmulhrsw m3, m7
11929 pmaddubsw m5, m4, [r5 + 31 * 16]
11930 pmulhrsw m5, m7
11931 packuswb m3, m5
11932 movu [r0 + 665 * 16], m3
11933
11934 ; mode 12 [row 13]
11935 pmaddubsw m3, m0, [r5 + 26 * 16]
11936 pmulhrsw m3, m7
11937 pmaddubsw m5, m2, [r5 + 26 * 16]
11938 pmulhrsw m5, m7
11939 packuswb m3, m5
11940 movu [r0 + 666 * 16], m3
11941 pmaddubsw m3, m1, [r5 + 26 * 16]
11942 pmulhrsw m3, m7
11943 pmaddubsw m5, m4, [r5 + 26 * 16]
11944 pmulhrsw m5, m7
11945 packuswb m3, m5
11946 movu [r0 + 667 * 16], m3
11947
11948 ; mode 12 [row 14]
11949 pmaddubsw m3, m0, [r5 + 21 * 16]
11950 pmulhrsw m3, m7
11951 pmaddubsw m5, m2, [r5 + 21 * 16]
11952 pmulhrsw m5, m7
11953 packuswb m3, m5
11954 movu [r0 + 668 * 16], m3
11955 pmaddubsw m3, m1, [r5 + 21 * 16]
11956 pmulhrsw m3, m7
11957 pmaddubsw m5, m4, [r5 + 21 * 16]
11958 pmulhrsw m5, m7
11959 packuswb m3, m5
11960 movu [r0 + 669 * 16], m3
11961
11962 ; mode 12 [row 15]
11963 pmaddubsw m3, m0, [r5 + 16 * 16]
11964 pmulhrsw m3, m7
11965 pmaddubsw m5, m2, [r5 + 16 * 16]
11966 pmulhrsw m5, m7
11967 packuswb m3, m5
11968 movu [r0 + 670 * 16], m3
11969 pmaddubsw m3, m1, [r5 + 16 * 16]
11970 pmulhrsw m3, m7
11971 pmaddubsw m5, m4, [r5 + 16 * 16]
11972 pmulhrsw m5, m7
11973 packuswb m3, m5
11974 movu [r0 + 671 * 16], m3
11975
11976 ; mode 12 [row 16]
11977 pmaddubsw m3, m0, [r5 + 11 * 16]
11978 pmulhrsw m3, m7
11979 pmaddubsw m5, m2, [r5 + 11 * 16]
11980 pmulhrsw m5, m7
11981 packuswb m3, m5
11982 movu [r0 + 672 * 16], m3
11983 pmaddubsw m3, m1, [r5 + 11 * 16]
11984 pmulhrsw m3, m7
11985 pmaddubsw m5, m4, [r5 + 11 * 16]
11986 pmulhrsw m5, m7
11987 packuswb m3, m5
11988 movu [r0 + 673 * 16], m3
11989
11990 ; mode 12 [row 17]
11991 pmaddubsw m3, m0, [r5 + 6 * 16]
11992 pmulhrsw m3, m7
11993 pmaddubsw m5, m2, [r5 + 6 * 16]
11994 pmulhrsw m5, m7
11995 packuswb m3, m5
11996 movu [r0 + 674 * 16], m3
11997 pmaddubsw m3, m1, [r5 + 6 * 16]
11998 pmulhrsw m3, m7
11999 pmaddubsw m5, m4, [r5 + 6 * 16]
12000 pmulhrsw m5, m7
12001 packuswb m3, m5
12002 movu [r0 + 675 * 16], m3
12003
12004 ; mode 12 [row 18]
12005 pmaddubsw m3, m0, [r5 + 1 * 16]
12006 pmulhrsw m3, m7
12007 pmaddubsw m5, m2, [r5 + 1 * 16]
12008 pmulhrsw m5, m7
12009 packuswb m3, m5
12010 movu [r0 + 676 * 16], m3
12011 pmaddubsw m3, m1, [r5 + 1 * 16]
12012 pmulhrsw m3, m7
12013 pmaddubsw m5, m4, [r5 + 1 * 16]
12014 pmulhrsw m5, m7
12015 packuswb m3, m5
12016 movu [r0 + 677 * 16], m3
12017
12018 ; mode 13 [row 7]
12019 movu m6, m0
12020 pinsrb m6, [r3 + 4], 2
12021 pinsrb m6, [r3 + 4], 1
12022 pinsrb m6, [r3 + 7], 0
12023 pmaddubsw m3, m6, [r5 + 24 * 16]
12024 pmulhrsw m3, m7
12025 pmaddubsw m5, m2, [r5 + 24 * 16]
12026 pmulhrsw m5, m7
12027 packuswb m3, m5
12028 movu [r0 + 718 * 16], m3
12029 pmaddubsw m3, m1, [r5 + 24 * 16]
12030 pmulhrsw m3, m7
12031 pmaddubsw m5, m4, [r5 + 24 * 16]
12032 pmulhrsw m5, m7
12033 packuswb m3, m5
12034 movu [r0 + 719 * 16], m3
12035
12036 ; mode 13 [row 8]
12037 pmaddubsw m3, m6, [r5 + 15 * 16]
12038 pmulhrsw m3, m7
12039 pmaddubsw m5, m2, [r5 + 15 * 16]
12040 pmulhrsw m5, m7
12041 packuswb m3, m5
12042 movu [r0 + 720 * 16], m3
12043 pmaddubsw m3, m1, [r5 + 15 * 16]
12044 pmulhrsw m3, m7
12045 pmaddubsw m5, m4, [r5 + 15 * 16]
12046 pmulhrsw m5, m7
12047 packuswb m3, m5
12048 movu [r0 + 721 * 16], m3
12049
12050 ; mode 13 [row 9]
12051 pmaddubsw m3, m6, [r5 + 6 * 16]
12052 pmulhrsw m3, m7
12053 pmaddubsw m5, m2, [r5 + 6 * 16]
12054 pmulhrsw m5, m7
12055 packuswb m3, m5
12056 movu [r0 + 722 * 16], m3
12057 pmaddubsw m3, m1, [r5 + 6 * 16]
12058 pmulhrsw m3, m7
12059 pmaddubsw m5, m4, [r5 + 6 * 16]
12060 pmulhrsw m5, m7
12061 packuswb m3, m5
12062 movu [r0 + 723 * 16], m3
12063
12064 ; mode 14 [row 4]
12065 pinsrb m6, [r3 + 2], 2
12066 pinsrb m6, [r3 + 2], 1
12067 pinsrb m6, [r3 + 5], 0
12068 pmaddubsw m3, m6, [r5 + 31 * 16]
12069 pmulhrsw m3, m7
12070 pmaddubsw m5, m2, [r5 + 31 * 16]
12071 pmulhrsw m5, m7
12072 packuswb m3, m5
12073 movu [r0 + 776 * 16], m3
12074 pmaddubsw m3, m1, [r5 + 31 * 16]
12075 pmulhrsw m3, m7
12076 pmaddubsw m5, m4, [r5 + 31 * 16]
12077 pmulhrsw m5, m7
12078 packuswb m3, m5
12079 movu [r0 + 777 * 16], m3
12080
12081 ; mode 14 [row 5]
12082 pmaddubsw m3, m6, [r5 + 18 * 16]
12083 pmulhrsw m3, m7
12084 pmaddubsw m5, m2, [r5 + 18 * 16]
12085 pmulhrsw m5, m7
12086 packuswb m3, m5
12087 movu [r0 + 778 * 16], m3
12088 pmaddubsw m3, m1, [r5 + 18 * 16]
12089 pmulhrsw m3, m7
12090 pmaddubsw m5, m4, [r5 + 18 * 16]
12091 pmulhrsw m5, m7
12092 packuswb m3, m5
12093 movu [r0 + 779 * 16], m3
12094
12095 ; mode 14 [row 6]
12096 pmaddubsw m3, m6, [r5 + 5 * 16]
12097 pmulhrsw m3, m7
12098 pmaddubsw m5, m2, [r5 + 5 * 16]
12099 pmulhrsw m5, m7
12100 packuswb m3, m5
12101 movu [r0 + 780 * 16], m3
12102 pmaddubsw m3, m1, [r5 + 5 * 16]
12103 pmulhrsw m3, m7
12104 pmaddubsw m5, m4, [r5 + 5 * 16]
12105 pmulhrsw m5, m7
12106 packuswb m3, m5
12107 movu [r0 + 781 * 16], m3
12108
12109 ; mode 14 [row 7]
12110 pslldq m6, 2
12111 pinsrb m6, [r3 + 5], 1
12112 pinsrb m6, [r3 + 7], 0
12113 pmaddubsw m3, m6, [r5 + 24 * 16]
12114 pmulhrsw m3, m7
12115 pslldq m2, 2
12116 pinsrw m2, [r4 + 5], 0
12117 pmaddubsw m5, m2, [r5 + 24 * 16]
12118 pmulhrsw m5, m7
12119 packuswb m3, m5
12120 movu [r0 + 782 * 16], m3
12121 pslldq m1, 2
12122 pinsrw m1, [r4 + 13], 0
12123 pmaddubsw m3, m1, [r5 + 24 * 16]
12124 pmulhrsw m3, m7
12125 pslldq m4, 2
12126 pinsrw m4, [r4 + 21], 0
12127 pmaddubsw m5, m4, [r5 + 24 * 16]
12128 pmulhrsw m5, m7
12129 packuswb m3, m5
12130 movu [r0 + 783 * 16], m3
12131
12132 ; mode 14 [row 8]
12133 pmaddubsw m3, m6, [r5 + 11 * 16]
12134 pmulhrsw m3, m7
12135 pmaddubsw m5, m2, [r5 + 11 * 16]
12136 pmulhrsw m5, m7
12137 packuswb m3, m5
12138 movu [r0 + 784 * 16], m3
12139 pmaddubsw m3, m1, [r5 + 11 * 16]
12140 pmulhrsw m3, m7
12141 pmaddubsw m5, m4, [r5 + 11 * 16]
12142 pmulhrsw m5, m7
12143 packuswb m3, m5
12144 movu [r0 + 785 * 16], m3
12145
12146 ; mode 15 [row 5, 8-31]
12147 pmaddubsw m5, m2, [r5 + 26 * 16]
12148 pmulhrsw m5, m7
12149 packuswb m5, m5
12150 movh [r0 + 842 * 16 + 8], m5
12151 pmaddubsw m3, m1, [r5 + 26 * 16]
12152 pmulhrsw m3, m7
12153 pmaddubsw m5, m4, [r5 + 26 * 16]
12154 pmulhrsw m5, m7
12155 packuswb m3, m5
12156 movu [r0 + 843 * 16], m3
12157
12158 ; mode 15 [row 6, 8-31]
12159 pmaddubsw m5, m2, [r5 + 9 * 16]
12160 pmulhrsw m5, m7
12161 packuswb m5, m5
12162 movh [r0 + 844 * 16 + 8], m5
12163 pmaddubsw m3, m1, [r5 + 9 * 16]
12164 pmulhrsw m3, m7
12165 pmaddubsw m5, m4, [r5 + 9 * 16]
12166 pmulhrsw m5, m7
12167 packuswb m3, m5
12168 movu [r0 + 845 * 16], m3
12169
12170 ; mode 12 [row 19]
12171 pslldq m0, 2
12172 pinsrb m0, [r3 + 13], 1
12173 pinsrb m0, [r3 + 19], 0
12174 pmaddubsw m3, m0, [r5 + 28 * 16]
12175 pmulhrsw m3, m7
12176 pmaddubsw m5, m2, [r5 + 28 * 16]
12177 pmulhrsw m5, m7
12178 packuswb m3, m5
12179 movu [r0 + 678 * 16], m3
12180 pmaddubsw m3, m1, [r5 + 28 * 16]
12181 pmulhrsw m3, m7
12182 pmaddubsw m5, m4, [r5 + 28 * 16]
12183 pmulhrsw m5, m7
12184 packuswb m3, m5
12185 movu [r0 + 679 * 16], m3
12186
12187 ; mode 12 [row 20]
12188 pmaddubsw m3, m0, [r5 + 23 * 16]
12189 pmulhrsw m3, m7
12190 pmaddubsw m5, m2, [r5 + 23 * 16]
12191 pmulhrsw m5, m7
12192 packuswb m3, m5
12193 movu [r0 + 680 * 16], m3
12194 pmaddubsw m3, m1, [r5 + 23 * 16]
12195 pmulhrsw m3, m7
12196 pmaddubsw m5, m4, [r5 + 23 * 16]
12197 pmulhrsw m5, m7
12198 packuswb m3, m5
12199 movu [r0 + 681 * 16], m3
12200
12201 ; mode 12 [row 21]
12202 pmaddubsw m3, m0, [r5 + 18 * 16]
12203 pmulhrsw m3, m7
12204 pmaddubsw m5, m2, [r5 + 18 * 16]
12205 pmulhrsw m5, m7
12206 packuswb m3, m5
12207 movu [r0 + 682 * 16], m3
12208 pmaddubsw m3, m1, [r5 + 18 * 16]
12209 pmulhrsw m3, m7
12210 pmaddubsw m5, m4, [r5 + 18 * 16]
12211 pmulhrsw m5, m7
12212 packuswb m3, m5
12213 movu [r0 + 683 * 16], m3
12214
12215 ; mode 12 [row 22]
12216 pmaddubsw m3, m0, [r5 + 13 * 16]
12217 pmulhrsw m3, m7
12218 pmaddubsw m5, m2, [r5 + 13 * 16]
12219 pmulhrsw m5, m7
12220 packuswb m3, m5
12221 movu [r0 + 684 * 16], m3
12222 pmaddubsw m3, m1, [r5 + 13 * 16]
12223 pmulhrsw m3, m7
12224 pmaddubsw m5, m4, [r5 + 13 * 16]
12225 pmulhrsw m5, m7
12226 packuswb m3, m5
12227 movu [r0 + 685 * 16], m3
12228
12229 ; mode 12 [row 23]
12230 pmaddubsw m3, m0, [r5 + 8 * 16]
12231 pmulhrsw m3, m7
12232 pmaddubsw m5, m2, [r5 + 8 * 16]
12233 pmulhrsw m5, m7
12234 packuswb m3, m5
12235 movu [r0 + 686 * 16], m3
12236 pmaddubsw m3, m1, [r5 + 8 * 16]
12237 pmulhrsw m3, m7
12238 pmaddubsw m5, m4, [r5 + 8 * 16]
12239 pmulhrsw m5, m7
12240 packuswb m3, m5
12241 movu [r0 + 687 * 16], m3
12242
12243 ; mode 12 [row 24]
12244 pmaddubsw m3, m0, [r5 + 3 * 16]
12245 pmulhrsw m3, m7
12246 pmaddubsw m5, m2, [r5 + 3 * 16]
12247 pmulhrsw m5, m7
12248 packuswb m3, m5
12249 movu [r0 + 688 * 16], m3
12250 pmaddubsw m3, m1, [r5 + 3 * 16]
12251 pmulhrsw m3, m7
12252 pmaddubsw m5, m4, [r5 + 3 * 16]
12253 pmulhrsw m5, m7
12254 packuswb m3, m5
12255 movu [r0 + 689 * 16], m3
12256
12257 ; mode 13 [row 10]
12258 movu m7, m6
12259 movu m6, m0
12260 pinsrb m6, [r3 + 4], 4
12261 pinsrb m6, [r3 + 4], 3
12262 pinsrb m6, [r3 + 7], 2
12263 pinsrb m6, [r3 + 7], 1
12264 pinsrb m6, [r3 + 11], 0
12265 pmaddubsw m3, m6, [r5 + 29 * 16]
12266 pmulhrsw m3, [pw_1024]
12267 pmaddubsw m5, m2, [r5 + 29 * 16]
12268 pmulhrsw m5, [pw_1024]
12269 packuswb m3, m5
12270 movu [r0 + 724 * 16], m3
12271 pmaddubsw m3, m1, [r5 + 29 * 16]
12272 pmulhrsw m3, [pw_1024]
12273 pmaddubsw m5, m4, [r5 + 29 * 16]
12274 pmulhrsw m5, [pw_1024]
12275 packuswb m3, m5
12276 movu [r0 + 725 * 16], m3
12277
12278 ; mode 13 [row 11]
12279 pmaddubsw m3, m6, [r5 + 20 * 16]
12280 pmulhrsw m3, [pw_1024]
12281 pmaddubsw m5, m2, [r5 + 20 * 16]
12282 pmulhrsw m5, [pw_1024]
12283 packuswb m3, m5
12284 movu [r0 + 726 * 16], m3
12285 pmaddubsw m3, m1, [r5 + 20 * 16]
12286 pmulhrsw m3, [pw_1024]
12287 pmaddubsw m5, m4, [r5 + 20 * 16]
12288 pmulhrsw m5, [pw_1024]
12289 packuswb m3, m5
12290 movu [r0 + 727 * 16], m3
12291
12292 ; mode 13 [row 12]
12293 pmaddubsw m3, m6, [r5 + 11 * 16]
12294 pmulhrsw m3, [pw_1024]
12295 pmaddubsw m5, m2, [r5 + 11 * 16]
12296 pmulhrsw m5, [pw_1024]
12297 packuswb m3, m5
12298 movu [r0 + 728 * 16], m3
12299 pmaddubsw m3, m1, [r5 + 11 * 16]
12300 pmulhrsw m3, [pw_1024]
12301 pmaddubsw m5, m4, [r5 + 11 * 16]
12302 pmulhrsw m5, [pw_1024]
12303 packuswb m3, m5
12304 movu [r0 + 729 * 16], m3
12305
12306 ; mode 13 [row 13]
12307 pmaddubsw m3, m6, [r5 + 2 * 16]
12308 pmulhrsw m3, [pw_1024]
12309 pmaddubsw m5, m2, [r5 + 2 * 16]
12310 pmulhrsw m5, [pw_1024]
12311 packuswb m3, m5
12312 movu [r0 + 730 * 16], m3
12313 pmaddubsw m3, m1, [r5 + 2 * 16]
12314 pmulhrsw m3, [pw_1024]
12315 pmaddubsw m5, m4, [r5 + 2 * 16]
12316 pmulhrsw m5, [pw_1024]
12317 packuswb m3, m5
12318 movu [r0 + 731 * 16], m3
12319
12320 ; mode 14 [row 9]
12321 pslldq m7, 2
12322 pinsrb m7, [r3 + 7], 1
12323 pinsrb m7, [r3 + 10], 0
12324 pmaddubsw m3, m7, [r5 + 30 * 16]
12325 pmulhrsw m3, [pw_1024]
12326 pslldq m2, 2
12327 pinsrw m2, [r4 + 4], 0
12328 pmaddubsw m5, m2, [r5 + 30 * 16]
12329 pmulhrsw m5, [pw_1024]
12330 packuswb m3, m5
12331 movu [r0 + 786 * 16], m3
12332 pslldq m1, 2
12333 pinsrw m1, [r4 + 12], 0
12334 pmaddubsw m3, m1, [r5 + 30 * 16]
12335 pmulhrsw m3, [pw_1024]
12336 pslldq m4, 2
12337 pinsrb m4, [r4 + 21], 1
12338 pinsrb m4, [r4 + 20], 0
12339 pmaddubsw m5, m4, [r5 + 30 * 16]
12340 pmulhrsw m5, [pw_1024]
12341 packuswb m3, m5
12342 movu [r0 + 787 * 16], m3
12343
12344 ; mode 14 [row 10]
12345 pmaddubsw m3, m7, [r5 + 17 * 16]
12346 pmulhrsw m3, [pw_1024]
12347 pmaddubsw m5, m2, [r5 + 17 * 16]
12348 pmulhrsw m5, [pw_1024]
12349 packuswb m3, m5
12350 movu [r0 + 788 * 16], m3
12351 pmaddubsw m3, m1, [r5 + 17 * 16]
12352 pmulhrsw m3, [pw_1024]
12353 pmaddubsw m5, m4, [r5 + 17 * 16]
12354 pmulhrsw m5, [pw_1024]
12355 packuswb m3, m5
12356 movu [r0 + 789 * 16], m3
12357
12358 ; mode 14 [row 11]
12359 pmaddubsw m3, m7, [r5 + 4 * 16]
12360 pmulhrsw m3, [pw_1024]
12361 pmaddubsw m5, m2, [r5 + 4 * 16]
12362 pmulhrsw m5, [pw_1024]
12363 packuswb m3, m5
12364 movu [r0 + 790 * 16], m3
12365 pmaddubsw m3, m1, [r5 + 4 * 16]
12366 pmulhrsw m3, [pw_1024]
12367 pmaddubsw m5, m4, [r5 + 4 * 16]
12368 pmulhrsw m5, [pw_1024]
12369 packuswb m3, m5
12370 movu [r0 + 791 * 16], m3
12371
12372 movu m6, [pw_1024]
12373
12374 ; mode 15 [row 7, 8-31]
12375 pmaddubsw m5, m2, [r5 + 24 * 16]
12376 pmulhrsw m5, m6
12377 packuswb m5, m5
12378 movh [r0 + 846 * 16 + 8], m5
12379 pmaddubsw m3, m1, [r5 + 24 * 16]
12380 pmulhrsw m3, m6
12381 pmaddubsw m5, m4, [r5 + 24 * 16]
12382 pmulhrsw m5, m6
12383 packuswb m3, m5
12384 movu [r0 + 847 * 16], m3
12385
12386 ; mode 15 [row 8, 8-31]
12387 pmaddubsw m5, m2, [r5 + 7 * 16]
12388 pmulhrsw m5, m6
12389 packuswb m5, m5
12390 movh [r0 + 848 * 16 + 8], m5
12391 pmaddubsw m3, m1, [r5 + 7 * 16]
12392 pmulhrsw m3, m6
12393 pmaddubsw m5, m4, [r5 + 7 * 16]
12394 pmulhrsw m5, m6
12395 packuswb m3, m5
12396 movu [r0 + 849 * 16], m3
12397
12398 ; mode 12 [row 25]
12399 pslldq m0, 2
12400 pinsrb m0, [r3 + 19], 1
12401 pinsrb m0, [r3 + 26], 0
12402 pmaddubsw m3, m0, [r5 + 30 * 16]
12403 pmulhrsw m3, [pw_1024]
12404 pmaddubsw m5, m2, [r5 + 30 * 16]
12405 pmulhrsw m5, [pw_1024]
12406 packuswb m3, m5
12407 movu [r0 + 690 * 16], m3
12408 pmaddubsw m3, m1, [r5 + 30 * 16]
12409 pmulhrsw m3, [pw_1024]
12410 pmaddubsw m5, m4, [r5 + 30 * 16]
12411 pmulhrsw m5, [pw_1024]
12412 packuswb m3, m5
12413 movu [r0 + 691 * 16], m3
12414
12415 ; mode 12 [row 26]
12416 pmaddubsw m3, m0, [r5 + 25 * 16]
12417 pmulhrsw m3, [pw_1024]
12418 pmaddubsw m5, m2, [r5 + 25 * 16]
12419 pmulhrsw m5, [pw_1024]
12420 packuswb m3, m5
12421 movu [r0 + 692 * 16], m3
12422 pmaddubsw m3, m1, [r5 + 25 * 16]
12423 pmulhrsw m3, [pw_1024]
12424 pmaddubsw m5, m4, [r5 + 25 * 16]
12425 pmulhrsw m5, [pw_1024]
12426 packuswb m3, m5
12427 movu [r0 + 693 * 16], m3
12428
12429 ; mode 12 [row 27]
12430 pmaddubsw m3, m0, [r5 + 20 * 16]
12431 pmulhrsw m3, [pw_1024]
12432 pmaddubsw m5, m2, [r5 + 20 * 16]
12433 pmulhrsw m5, [pw_1024]
12434 packuswb m3, m5
12435 movu [r0 + 694 * 16], m3
12436 pmaddubsw m3, m1, [r5 + 20 * 16]
12437 pmulhrsw m3, [pw_1024]
12438 pmaddubsw m5, m4, [r5 + 20 * 16]
12439 pmulhrsw m5, [pw_1024]
12440 packuswb m3, m5
12441 movu [r0 + 695 * 16], m3
12442
12443 ; mode 12 [row 28]
12444 pmaddubsw m3, m0, [r5 + 15 * 16]
12445 pmulhrsw m3, [pw_1024]
12446 pmaddubsw m5, m2, [r5 + 15 * 16]
12447 pmulhrsw m5, [pw_1024]
12448 packuswb m3, m5
12449 movu [r0 + 696 * 16], m3
12450 pmaddubsw m3, m1, [r5 + 15 * 16]
12451 pmulhrsw m3, [pw_1024]
12452 pmaddubsw m5, m4, [r5 + 15 * 16]
12453 pmulhrsw m5, [pw_1024]
12454 packuswb m3, m5
12455 movu [r0 + 697 * 16], m3
12456
12457 ; mode 12 [row 29]
12458 pmaddubsw m3, m0, [r5 + 10 * 16]
12459 pmulhrsw m3, [pw_1024]
12460 pmaddubsw m5, m2, [r5 + 10 * 16]
12461 pmulhrsw m5, [pw_1024]
12462 packuswb m3, m5
12463 movu [r0 + 698 * 16], m3
12464 pmaddubsw m3, m1, [r5 + 10 * 16]
12465 pmulhrsw m3, [pw_1024]
12466 pmaddubsw m5, m4, [r5 + 10 * 16]
12467 pmulhrsw m5, [pw_1024]
12468 packuswb m3, m5
12469 movu [r0 + 699 * 16], m3
12470
12471 ; mode 12 [row 30]
12472 pmaddubsw m3, m0, [r5 + 5 * 16]
12473 pmulhrsw m3, [pw_1024]
12474 pmaddubsw m5, m2, [r5 + 5 * 16]
12475 pmulhrsw m5, [pw_1024]
12476 packuswb m3, m5
12477 movu [r0 + 700 * 16], m3
12478 pmaddubsw m3, m1, [r5 + 5 * 16]
12479 pmulhrsw m3, [pw_1024]
12480 pmaddubsw m5, m4, [r5 + 5 * 16]
12481 pmulhrsw m5, [pw_1024]
12482 packuswb m3, m5
12483 movu [r0 + 701 * 16], m3
12484
12485 ; mode 13 [row 14]
12486 movu m6, m0
12487 pinsrb m6, [r3 + 4], 6
12488 pinsrb m6, [r3 + 4], 5
12489 pinsrb m6, [r3 + 7], 4
12490 pinsrb m6, [r3 + 7], 3
12491 pinsrb m6, [r3 + 11], 2
12492 pinsrb m6, [r3 + 11], 1
12493 pinsrb m6, [r3 + 14], 0
12494 pmaddubsw m3, m6, [r5 + 25 * 16]
12495 pmulhrsw m3, [pw_1024]
12496 pmaddubsw m5, m2, [r5 + 25 * 16]
12497 pmulhrsw m5, [pw_1024]
12498 packuswb m3, m5
12499 movu [r0 + 732 * 16], m3
12500 pmaddubsw m3, m1, [r5 + 25 * 16]
12501 pmulhrsw m3, [pw_1024]
12502 pmaddubsw m5, m4, [r5 + 25 * 16]
12503 pmulhrsw m5, [pw_1024]
12504 packuswb m3, m5
12505 movu [r0 + 733 * 16], m3
12506
12507 ; mode 13 [row 15]
12508 pmaddubsw m3, m6, [r5 + 16 * 16]
12509 pmulhrsw m3, [pw_1024]
12510 pmaddubsw m5, m2, [r5 + 16 * 16]
12511 pmulhrsw m5, [pw_1024]
12512 packuswb m3, m5
12513 movu [r0 + 734 * 16], m3
12514 pmaddubsw m3, m1, [r5 + 16 * 16]
12515 pmulhrsw m3, [pw_1024]
12516 pmaddubsw m5, m4, [r5 + 16 * 16]
12517 pmulhrsw m5, [pw_1024]
12518 packuswb m3, m5
12519 movu [r0 + 735 * 16], m3
12520
12521 ; mode 13 [row 16]
12522 pmaddubsw m3, m6, [r5 + 7 * 16]
12523 pmulhrsw m3, [pw_1024]
12524 pmaddubsw m5, m2, [r5 + 7 * 16]
12525 pmulhrsw m5, [pw_1024]
12526 packuswb m3, m5
12527 movu [r0 + 736 * 16], m3
12528 pmaddubsw m3, m1, [r5 + 7 * 16]
12529 pmulhrsw m3, [pw_1024]
12530 pmaddubsw m5, m4, [r5 + 7 * 16]
12531 pmulhrsw m5, [pw_1024]
12532 packuswb m3, m5
12533 movu [r0 + 737 * 16], m3
12534
12535 ; mode 13 [row 17]
12536 pslldq m6, 2
12537 pinsrb m6, [r3 + 14], 1
12538 pinsrb m6, [r3 + 18], 0
12539 pmaddubsw m3, m6, [r5 + 30 * 16]
12540 pmulhrsw m3, [pw_1024]
12541 pslldq m2, 2
12542 pinsrw m2, [r4 + 3], 0
12543 pmaddubsw m5, m2, [r5 + 30 * 16]
12544 pmulhrsw m5, [pw_1024]
12545 packuswb m3, m5
12546 movu [r0 + 738 * 16], m3
12547 pslldq m1, 2
12548 pinsrw m1, [r4 + 11], 0
12549 pmaddubsw m3, m1, [r5 + 30 * 16]
12550 pmulhrsw m3, [pw_1024]
12551 pslldq m4, 2
12552 pinsrw m4, [r4 + 19], 0
12553 pmaddubsw m5, m4, [r5 + 30 * 16]
12554 pmulhrsw m5, [pw_1024]
12555 packuswb m3, m5
12556 movu [r0 + 739 * 16], m3
12557
12558 ; mode 13 [row 18]
12559 pmaddubsw m3, m6, [r5 + 21 * 16]
12560 pmulhrsw m3, [pw_1024]
12561 pmaddubsw m5, m2, [r5 + 21 * 16]
12562 pmulhrsw m5, [pw_1024]
12563 packuswb m3, m5
12564 movu [r0 + 740 * 16], m3
12565 pmaddubsw m3, m1, [r5 + 21 * 16]
12566 pmulhrsw m3, [pw_1024]
12567 pmaddubsw m5, m4, [r5 + 21 * 16]
12568 pmulhrsw m5, [pw_1024]
12569 packuswb m3, m5
12570 movu [r0 + 741 * 16], m3
12571
12572 ; mode 13 [row 19]
12573 pmaddubsw m3, m6, [r5 + 12 * 16]
12574 pmulhrsw m3, [pw_1024]
12575 pmaddubsw m5, m2, [r5 + 12 * 16]
12576 pmulhrsw m5, [pw_1024]
12577 packuswb m3, m5
12578 movu [r0 + 742 * 16], m3
12579 pmaddubsw m3, m1, [r5 + 12 * 16]
12580 pmulhrsw m3, [pw_1024]
12581 pmaddubsw m5, m4, [r5 + 12 * 16]
12582 pmulhrsw m5, [pw_1024]
12583 packuswb m3, m5
12584 movu [r0 + 743 * 16], m3
12585
12586 ; mode 13 [row 20]
12587 pmaddubsw m3, m6, [r5 + 3 * 16]
12588 pmulhrsw m3, [pw_1024]
12589 pmaddubsw m5, m2, [r5 + 3 * 16]
12590 pmulhrsw m5, [pw_1024]
12591 packuswb m3, m5
12592 movu [r0 + 744 * 16], m3
12593 pmaddubsw m3, m1, [r5 + 3 * 16]
12594 pmulhrsw m3, [pw_1024]
12595 pmaddubsw m5, m4, [r5 + 3 * 16]
12596 pmulhrsw m5, [pw_1024]
12597 packuswb m3, m5
12598 movu [r0 + 745 * 16], m3
12599
12600 ; mode 14 [row 12]
12601 pslldq m7, 2
12602 pinsrb m7, [r3 + 10], 1
12603 pinsrb m7, [r3 + 12], 0
12604 pmaddubsw m3, m7, [r5 + 23 * 16]
12605 pmulhrsw m3, [pw_1024]
12606 pmaddubsw m5, m2, [r5 + 23 * 16]
12607 pmulhrsw m5, [pw_1024]
12608 packuswb m3, m5
12609 movu [r0 + 792 * 16], m3
12610 pmaddubsw m3, m1, [r5 + 23 * 16]
12611 pmulhrsw m3, [pw_1024]
12612 pmaddubsw m5, m4, [r5 + 23 * 16]
12613 pmulhrsw m5, [pw_1024]
12614 packuswb m3, m5
12615 movu [r0 + 793 * 16], m3
12616
12617 ; mode 14 [row 13]
12618 pmaddubsw m3, m7, [r5 + 10 * 16]
12619 pmulhrsw m3, [pw_1024]
12620 pmaddubsw m5, m2, [r5 + 10 * 16]
12621 pmulhrsw m5, [pw_1024]
12622 packuswb m3, m5
12623 movu [r0 + 794 * 16], m3
12624 pmaddubsw m3, m1, [r5 + 10 * 16]
12625 pmulhrsw m3, [pw_1024]
12626 pmaddubsw m5, m4, [r5 + 10 * 16]
12627 pmulhrsw m5, [pw_1024]
12628 packuswb m3, m5
12629 movu [r0 + 795 * 16], m3
12630
12631 ; mode 15 [row 9, 8-31]
12632 pmaddubsw m5, m2, [r5 + 22 * 16]
12633 pmulhrsw m5, [pw_1024]
12634 packuswb m5, m5
12635 movu [r0 + 850 * 16 + 8], m5
12636 pmaddubsw m3, m1, [r5 + 22 * 16]
12637 pmulhrsw m3, [pw_1024]
12638 pmaddubsw m5, m4, [r5 + 22 * 16]
12639 pmulhrsw m5, [pw_1024]
12640 packuswb m3, m5
12641 movu [r0 + 851 * 16], m3
12642
12643 ; mode 15 [row 10, 8-31]
12644 pmaddubsw m5, m2, [r5 + 5 * 16]
12645 pmulhrsw m5, [pw_1024]
12646 packuswb m5, m5
12647 movu [r0 + 852 * 16 + 8], m5
12648 pmaddubsw m3, m1, [r5 + 5 * 16]
12649 pmulhrsw m3, [pw_1024]
12650 pmaddubsw m5, m4, [r5 + 5 * 16]
12651 pmulhrsw m5, [pw_1024]
12652 packuswb m3, m5
12653 movu [r0 + 853 * 16], m3
12654
12655 ; mode 13 [row 21]
12656 pslldq m6, 2
12657 pinsrb m6, [r3 + 18], 1
12658 pinsrb m6, [r3 + 21], 0
12659 pmaddubsw m3, m6, [r5 + 26 * 16]
12660 pmulhrsw m3, [pw_1024]
12661 pslldq m2, 2
12662 pinsrw m2, [r4 + 2], 0
12663 pmaddubsw m5, m2, [r5 + 26 * 16]
12664 pmulhrsw m5, [pw_1024]
12665 packuswb m3, m5
12666 movu [r0 + 746 * 16], m3
12667 pslldq m1, 2
12668 pinsrw m1, [r4 + 10], 0
12669 pmaddubsw m3, m1, [r5 + 26 * 16]
12670 pmulhrsw m3, [pw_1024]
12671 pslldq m4, 2
12672 pinsrw m4, [r4 + 18], 0
12673 pmaddubsw m5, m4, [r5 + 26 * 16]
12674 pmulhrsw m5, [pw_1024]
12675 packuswb m3, m5
12676 movu [r0 + 747 * 16], m3
12677
12678 ; mode 13 [row 22]
12679 pmaddubsw m3, m6, [r5 + 17 * 16]
12680 pmulhrsw m3, [pw_1024]
12681 pmaddubsw m5, m2, [r5 + 17 * 16]
12682 pmulhrsw m5, [pw_1024]
12683 packuswb m3, m5
12684 movu [r0 + 748 * 16], m3
12685 pmaddubsw m3, m1, [r5 + 17 * 16]
12686 pmulhrsw m3, [pw_1024]
12687 pmaddubsw m5, m4, [r5 + 17 * 16]
12688 pmulhrsw m5, [pw_1024]
12689 packuswb m3, m5
12690 movu [r0 + 749 * 16], m3
12691
12692 ; mode 13 [row 23]
12693 pmaddubsw m3, m6, [r5 + 8 * 16]
12694 pmulhrsw m3, [pw_1024]
12695 pmaddubsw m5, m2, [r5 + 8 * 16]
12696 pmulhrsw m5, [pw_1024]
12697 packuswb m3, m5
12698 movu [r0 + 750 * 16], m3
12699 pmaddubsw m3, m1, [r5 + 8 * 16]
12700 pmulhrsw m3, [pw_1024]
12701 pmaddubsw m5, m4, [r5 + 8 * 16]
12702 pmulhrsw m5, [pw_1024]
12703 packuswb m3, m5
12704 movu [r0 + 751 * 16], m3
12705
12706 ; mode 14 [row 14]
12707 pslldq m7, 2
12708 pinsrb m7, [r3 + 12], 1
12709 pinsrb m7, [r3 + 15], 0
12710 pmaddubsw m3, m7, [r5 + 29 * 16]
12711 pmulhrsw m3, [pw_1024]
12712 pmaddubsw m5, m2, [r5 + 29 * 16]
12713 pmulhrsw m5, [pw_1024]
12714 packuswb m3, m5
12715 movu [r0 + 796 * 16], m3
12716 pmaddubsw m3, m1, [r5 + 29 * 16]
12717 pmulhrsw m3, [pw_1024]
12718 pmaddubsw m5, m4, [r5 + 29 * 16]
12719 pmulhrsw m5, [pw_1024]
12720 packuswb m3, m5
12721 movu [r0 + 797 * 16], m3
12722
12723 ; mode 14 [row 15]
12724 pmaddubsw m3, m7, [r5 + 16 * 16]
12725 pmulhrsw m3, [pw_1024]
12726 pmaddubsw m5, m2, [r5 + 16 * 16]
12727 pmulhrsw m5, [pw_1024]
12728 packuswb m3, m5
12729 movu [r0 + 798 * 16], m3
12730 pmaddubsw m3, m1, [r5 + 16 * 16]
12731 pmulhrsw m3, [pw_1024]
12732 pmaddubsw m5, m4, [r5 + 16 * 16]
12733 pmulhrsw m5, [pw_1024]
12734 packuswb m3, m5
12735 movu [r0 + 799 * 16], m3
12736
12737 ; mode 14 [row 16]
12738 pmaddubsw m3, m7, [r5 + 3 * 16]
12739 pmulhrsw m3, [pw_1024]
12740 pmaddubsw m5, m2, [r5 + 3 * 16]
12741 pmulhrsw m5, [pw_1024]
12742 packuswb m3, m5
12743 movu [r0 + 800 * 16], m3
12744 pmaddubsw m3, m1, [r5 + 3 * 16]
12745 pmulhrsw m3, [pw_1024]
12746 pmaddubsw m5, m4, [r5 + 3 * 16]
12747 pmulhrsw m5, [pw_1024]
12748 packuswb m3, m5
12749 movu [r0 + 801 * 16], m3
12750
12751 ; mode 15 [row 11, 8-31]
12752 pmaddubsw m5, m2, [r5 + 20 * 16]
12753 pmulhrsw m5, [pw_1024]
12754 packuswb m5, m5
12755 movh [r0 + 854 * 16 + 8], m5
12756 pmaddubsw m3, m1, [r5 + 20 * 16]
12757 pmulhrsw m3, [pw_1024]
12758 pmaddubsw m5, m4, [r5 + 20 * 16]
12759 pmulhrsw m5, [pw_1024]
12760 packuswb m3, m5
12761 movu [r0 + 855 * 16], m3
12762
12763 ; mode 15 [row 12, 8-31]
12764 pmaddubsw m5, m2, [r5 + 3 * 16]
12765 pmulhrsw m5, [pw_1024]
12766 packuswb m5, m5
12767 movh [r0 + 856 * 16 + 8], m5
12768 pmaddubsw m3, m1, [r5 + 3 * 16]
12769 pmulhrsw m3, [pw_1024]
12770 pmaddubsw m5, m4, [r5 + 3 * 16]
12771 pmulhrsw m5, [pw_1024]
12772 packuswb m3, m5
12773 movu [r0 + 857 * 16], m3
12774
12775 ; mode 13 [row 24]
12776 pslldq m6, 2
12777 pinsrb m6, [r3 + 21], 1
12778 pinsrb m6, [r3 + 25], 0
12779 pmaddubsw m3, m6, [r5 + 31 * 16]
12780 pmulhrsw m3, [pw_1024]
12781 pslldq m2, 2
12782 pinsrw m2, [r4 + 1], 0
12783 pmaddubsw m5, m2, [r5 + 31 * 16]
12784 pmulhrsw m5, [pw_1024]
12785 packuswb m3, m5
12786 movu [r0 + 752 * 16], m3
12787 pslldq m1, 2
12788 pinsrw m1, [r4 + 9], 0
12789 pmaddubsw m3, m1, [r5 + 31 * 16]
12790 pmulhrsw m3, [pw_1024]
12791 pslldq m4, 2
12792 pinsrw m4, [r4 + 17], 0
12793 pmaddubsw m5, m4, [r5 + 31 * 16]
12794 pmulhrsw m5, [pw_1024]
12795 packuswb m3, m5
12796 movu [r0 + 753 * 16], m3
12797
12798 ; mode 13 [row 25]
12799 pmaddubsw m3, m6, [r5 + 22 * 16]
12800 pmulhrsw m3, [pw_1024]
12801 pmaddubsw m5, m2, [r5 + 22 * 16]
12802 pmulhrsw m5, [pw_1024]
12803 packuswb m3, m5
12804 movu [r0 + 754 * 16], m3
12805 pmaddubsw m3, m1, [r5 + 22 * 16]
12806 pmulhrsw m3, [pw_1024]
12807 pmaddubsw m5, m4, [r5 + 22 * 16]
12808 pmulhrsw m5, [pw_1024]
12809 packuswb m3, m5
12810 movu [r0 + 755 * 16], m3
12811
12812 ; mode 13 [row 26]
12813 pmaddubsw m3, m6, [r5 + 13 * 16]
12814 pmulhrsw m3, [pw_1024]
12815 pmaddubsw m5, m2, [r5 + 13 * 16]
12816 pmulhrsw m5, [pw_1024]
12817 packuswb m3, m5
12818 movu [r0 + 756 * 16], m3
12819 pmaddubsw m3, m1, [r5 + 13 * 16]
12820 pmulhrsw m3, [pw_1024]
12821 pmaddubsw m5, m4, [r5 + 13 * 16]
12822 pmulhrsw m5, [pw_1024]
12823 packuswb m3, m5
12824 movu [r0 + 757 * 16], m3
12825
12826 ; mode 13 [row 27]
12827 pmaddubsw m3, m6, [r5 + 4 * 16]
12828 pmulhrsw m3, [pw_1024]
12829 pmaddubsw m5, m2, [r5 + 4 * 16]
12830 pmulhrsw m5, [pw_1024]
12831 packuswb m3, m5
12832 movu [r0 + 758 * 16], m3
12833 pmaddubsw m3, m1, [r5 + 4 * 16]
12834 pmulhrsw m3, [pw_1024]
12835 pmaddubsw m5, m4, [r5 + 4 * 16]
12836 pmulhrsw m5, [pw_1024]
12837 packuswb m3, m5
12838 movu [r0 + 759 * 16], m3
12839
12840 ; mode 14 [row 17]
12841 pslldq m7, 2
12842 pinsrb m7, [r3 + 15], 1
12843 pinsrb m7, [r3 + 17], 0
12844 pmaddubsw m3, m7, [r5 + 22 * 16]
12845 pmulhrsw m3, [pw_1024]
12846 pmaddubsw m5, m2, [r5 + 22 * 16]
12847 pmulhrsw m5, [pw_1024]
12848 packuswb m3, m5
12849 movu [r0 + 802 * 16], m3
12850 pmaddubsw m3, m1, [r5 + 22 * 16]
12851 pmulhrsw m3, [pw_1024]
12852 pmaddubsw m5, m4, [r5 + 22 * 16]
12853 pmulhrsw m5, [pw_1024]
12854 packuswb m3, m5
12855 movu [r0 + 803 * 16], m3
12856
12857 ; mode 14 [row 18]
12858 pmaddubsw m3, m7, [r5 + 9 * 16]
12859 pmulhrsw m3, [pw_1024]
12860 pmaddubsw m5, m2, [r5 + 9 * 16]
12861 pmulhrsw m5, [pw_1024]
12862 packuswb m3, m5
12863 movu [r0 + 804 * 16], m3
12864 pmaddubsw m3, m1, [r5 + 9 * 16]
12865 pmulhrsw m3, [pw_1024]
12866 pmaddubsw m5, m4, [r5 + 9 * 16]
12867 pmulhrsw m5, [pw_1024]
12868 packuswb m3, m5
12869 movu [r0 + 805 * 16], m3
12870
12871 ; mode 15 [row 13, 8-31]
12872 pmaddubsw m5, m2, [r5 + 18 * 16]
12873 pmulhrsw m5, [pw_1024]
12874 packuswb m5, m5
12875 movh [r0 + 858 * 16 + 8], m5
12876 pmaddubsw m3, m1, [r5 + 18 * 16]
12877 pmulhrsw m3, [pw_1024]
12878 pmaddubsw m5, m4, [r5 + 18 * 16]
12879 pmulhrsw m5, [pw_1024]
12880 packuswb m3, m5
12881 movu [r0 + 859 * 16], m3
12882
12883 ; mode 15 [row 14, 8-31]
12884 pmaddubsw m5, m2, [r5 + 1 * 16]
12885 pmulhrsw m5, [pw_1024]
12886 packuswb m5, m5
12887 movh [r0 + 860 * 16 + 8], m5
12888 pmaddubsw m3, m1, [r5 + 1 * 16]
12889 pmulhrsw m3, [pw_1024]
12890 pmaddubsw m5, m4, [r5 + 1 * 16]
12891 pmulhrsw m5, [pw_1024]
12892 packuswb m3, m5
12893 movu [r0 + 861 * 16], m3
12894
12895 ; mode 13 [row 28]
12896 pslldq m6, 2
12897 pinsrb m6, [r3 + 25], 1
12898 pinsrb m6, [r3 + 28], 0
12899 pmaddubsw m3, m6, [r5 + 27 * 16]
12900 pmulhrsw m3, [pw_1024]
12901 pslldq m2, 2
12902 pinsrw m2, [r4 + 0], 0
12903 pmaddubsw m5, m2, [r5 + 27 * 16]
12904 pmulhrsw m5, [pw_1024]
12905 packuswb m3, m5
12906 movu [r0 + 760 * 16], m3
12907 pslldq m1, 2
12908 pinsrw m1, [r4 + 8], 0
12909 pmaddubsw m3, m1, [r5 + 27 * 16]
12910 pmulhrsw m3, [pw_1024]
12911 pslldq m4, 2
12912 pinsrw m4, [r4 + 16], 0
12913 pmaddubsw m5, m4, [r5 + 27 * 16]
12914 pmulhrsw m5, [pw_1024]
12915 packuswb m3, m5
12916 movu [r0 + 761 * 16], m3
12917
12918 ; mode 13 [row 29]
12919 pmaddubsw m3, m6, [r5 + 18 * 16]
12920 pmulhrsw m3, [pw_1024]
12921 pmaddubsw m5, m2, [r5 + 18 * 16]
12922 pmulhrsw m5, [pw_1024]
12923 packuswb m3, m5
12924 movu [r0 + 762 * 16], m3
12925 pmaddubsw m3, m1, [r5 + 18 * 16]
12926 pmulhrsw m3, [pw_1024]
12927 pmaddubsw m5, m4, [r5 + 18 * 16]
12928 pmulhrsw m5, [pw_1024]
12929 packuswb m3, m5
12930 movu [r0 + 763 * 16], m3
12931
12932 ; mode 13 [row 30]
12933 pmaddubsw m3, m6, [r5 + 9 * 16]
12934 pmulhrsw m3, [pw_1024]
12935 pmaddubsw m5, m2, [r5 + 9 * 16]
12936 pmulhrsw m5, [pw_1024]
12937 packuswb m3, m5
12938 movu [r0 + 764 * 16], m3
12939 pmaddubsw m3, m1, [r5 + 9 * 16]
12940 pmulhrsw m3, [pw_1024]
12941 pmaddubsw m5, m4, [r5 + 9 * 16]
12942 pmulhrsw m5, [pw_1024]
12943 packuswb m3, m5
12944 movu [r0 + 765 * 16], m3
12945
12946 ; mode 14 [row 19]
12947 pslldq m7, 2
12948 pinsrb m7, [r3 + 17], 1
12949 pinsrb m7, [r3 + 20], 0
12950 pmaddubsw m3, m7, [r5 + 28 * 16]
12951 pmulhrsw m3, [pw_1024]
12952 pmaddubsw m5, m2, [r5 + 28 * 16]
12953 pmulhrsw m5, [pw_1024]
12954 packuswb m3, m5
12955 movu [r0 + 806 * 16], m3
12956 pmaddubsw m3, m1, [r5 + 28 * 16]
12957 pmulhrsw m3, [pw_1024]
12958 pmaddubsw m5, m4, [r5 + 28 * 16]
12959 pmulhrsw m5, [pw_1024]
12960 packuswb m3, m5
12961 movu [r0 + 807 * 16], m3
12962
12963 ; mode 14 [row 20]
12964 pmaddubsw m3, m7, [r5 + 15 * 16]
12965 pmulhrsw m3, [pw_1024]
12966 pmaddubsw m5, m2, [r5 + 15 * 16]
12967 pmulhrsw m5, [pw_1024]
12968 packuswb m3, m5
12969 movu [r0 + 808 * 16], m3
12970 pmaddubsw m3, m1, [r5 + 15 * 16]
12971 pmulhrsw m3, [pw_1024]
12972 pmaddubsw m5, m4, [r5 + 15 * 16]
12973 pmulhrsw m5, [pw_1024]
12974 packuswb m3, m5
12975 movu [r0 + 809 * 16], m3
12976
12977 ; mode 14 [row 21]
12978 pmaddubsw m3, m7, [r5 + 2 * 16]
12979 pmulhrsw m3, [pw_1024]
12980 pmaddubsw m5, m2, [r5 + 2 * 16]
12981 pmulhrsw m5, [pw_1024]
12982 packuswb m3, m5
12983 movu [r0 + 810 * 16], m3
12984 pmaddubsw m3, m1, [r5 + 2 * 16]
12985 pmulhrsw m3, [pw_1024]
12986 pmaddubsw m5, m4, [r5 + 2 * 16]
12987 pmulhrsw m5, [pw_1024]
12988 packuswb m3, m5
12989 movu [r0 + 811 * 16], m3
12990
12991 ; mode 15 [row 15, 8-31]
12992 pmaddubsw m5, m2, [r5 + 16 * 16]
12993 pmulhrsw m5, [pw_1024]
12994 packuswb m5, m5
12995 movh [r0 + 862 * 16 + 8], m5
12996 pmaddubsw m3, m1, [r5 + 16 * 16]
12997 pmulhrsw m3, [pw_1024]
12998 pmaddubsw m5, m4, [r5 + 16 * 16]
12999 pmulhrsw m5, [pw_1024]
13000 packuswb m3, m5
13001 movu [r0 + 863 * 16], m3
13002
13003 ; mode 14 [row 22]
13004 pslldq m7, 2
13005 pinsrb m7, [r3 + 20], 1
13006 pinsrb m7, [r3 + 22], 0
13007 pmaddubsw m3, m7, [r5 + 21 * 16]
13008 pmulhrsw m3, [pw_1024]
13009 pslldq m2, 2
13010 pinsrb m2, [r4 + 0], 1
13011 pinsrb m2, [r3 + 2], 0
13012 pmaddubsw m5, m2, [r5 + 21 * 16]
13013 pmulhrsw m5, [pw_1024]
13014 packuswb m3, m5
13015 movu [r0 + 812 * 16], m3
13016 pslldq m1, 2
13017 pinsrw m1, [r4 + 7], 0
13018 pmaddubsw m3, m1, [r5 + 21 * 16]
13019 pmulhrsw m3, [pw_1024]
13020 pslldq m4, 2
13021 pinsrw m4, [r4 + 15], 0
13022 pmaddubsw m5, m4, [r5 + 21 * 16]
13023 pmulhrsw m5, [pw_1024]
13024 packuswb m3, m5
13025 movu [r0 + 813 * 16], m3
13026
13027 ; mode 14 [row 23]
13028 pmaddubsw m3, m7, [r5 + 8 * 16]
13029 pmulhrsw m3, [pw_1024]
13030 pmaddubsw m5, m2, [r5 + 8 * 16]
13031 pmulhrsw m5, [pw_1024]
13032 packuswb m3, m5
13033 movu [r0 + 814 * 16], m3
13034 pmaddubsw m3, m1, [r5 + 8 * 16]
13035 pmulhrsw m3, [pw_1024]
13036 pmaddubsw m5, m4, [r5 + 8 * 16]
13037 pmulhrsw m5, [pw_1024]
13038 packuswb m3, m5
13039 movu [r0 + 815 * 16], m3
13040
13041 ; mode 15 [row 16, 8-31]
13042 pmaddubsw m5, m2, [r5 + 31 * 16]
13043 pmulhrsw m5, [pw_1024]
13044 packuswb m5, m5
13045 movh [r0 + 864 * 16 + 8], m5
13046 pmaddubsw m3, m1, [r5 + 31 * 16]
13047 pmulhrsw m3, [pw_1024]
13048 pmaddubsw m5, m4, [r5 + 31 * 16]
13049 pmulhrsw m5, [pw_1024]
13050 packuswb m3, m5
13051 movu [r0 + 865 * 16], m3
13052
13053 ; mode 15 [row 17, 8-31]
13054 pmaddubsw m5, m2, [r5 + 14 * 16]
13055 pmulhrsw m5, [pw_1024]
13056 packuswb m5, m5
13057 movh [r0 + 866 * 16 + 8], m5
13058 pmaddubsw m3, m1, [r5 + 14 * 16]
13059 pmulhrsw m3, [pw_1024]
13060 pmaddubsw m5, m4, [r5 + 14 * 16]
13061 pmulhrsw m5, [pw_1024]
13062 packuswb m3, m5
13063 movu [r0 + 867 * 16], m3
13064
13065 ; mode 14 [row 24]
13066 pslldq m7, 2
13067 pinsrb m7, [r3 + 22], 1
13068 pinsrb m7, [r3 + 25], 0
13069 pmaddubsw m3, m7, [r5 + 27 * 16]
13070 pmulhrsw m3, [pw_1024]
13071 pslldq m2, 2
13072 pinsrb m2, [r3 + 2], 1
13073 pinsrb m2, [r3 + 5], 0
13074 pmaddubsw m5, m2, [r5 + 27 * 16]
13075 pmulhrsw m5, [pw_1024]
13076 packuswb m3, m5
13077 movu [r0 + 816 * 16], m3
13078 pslldq m1, 2
13079 pinsrw m1, [r4 + 6], 0
13080 pmaddubsw m3, m1, [r5 + 27 * 16]
13081 pmulhrsw m3, [pw_1024]
13082 pslldq m4, 2
13083 pinsrw m4, [r4 + 14], 0
13084 pmaddubsw m5, m4, [r5 + 27 * 16]
13085 pmulhrsw m5, [pw_1024]
13086 packuswb m3, m5
13087 movu [r0 + 817 * 16], m3
13088
13089 ; mode 14 [row 25]
13090 pmaddubsw m3, m7, [r5 + 14 * 16]
13091 pmulhrsw m3, [pw_1024]
13092 pmaddubsw m5, m2, [r5 + 14 * 16]
13093 pmulhrsw m5, [pw_1024]
13094 packuswb m3, m5
13095 movu [r0 + 818 * 16], m3
13096 pmaddubsw m3, m1, [r5 + 14 * 16]
13097 pmulhrsw m3, [pw_1024]
13098 pmaddubsw m5, m4, [r5 + 14 * 16]
13099 pmulhrsw m5, [pw_1024]
13100 packuswb m3, m5
13101 movu [r0 + 819 * 16], m3
13102
13103 ; mode 14 [row 26]
13104 pmaddubsw m3, m7, [r5 + 1 * 16]
13105 pmulhrsw m3, [pw_1024]
13106 pmaddubsw m5, m2, [r5 + 1 * 16]
13107 pmulhrsw m5, [pw_1024]
13108 packuswb m3, m5
13109 movu [r0 + 820 * 16], m3
13110 pmaddubsw m3, m1, [r5 + 1 * 16]
13111 pmulhrsw m3, [pw_1024]
13112 pmaddubsw m5, m4, [r5 + 1 * 16]
13113 pmulhrsw m5, [pw_1024]
13114 packuswb m3, m5
13115 movu [r0 + 821 * 16], m3
13116
13117 ; mode 15 [row 18, 8-31]
13118 pinsrb m2, [r3 + 4], 0
13119 pmaddubsw m5, m2, [r5 + 29 * 16]
13120 pmulhrsw m5, [pw_1024]
13121 packuswb m5, m5
13122 movh [r0 + 868 * 16 + 8], m5
13123 pmaddubsw m3, m1, [r5 + 29 * 16]
13124 pmulhrsw m3, [pw_1024]
13125 pmaddubsw m5, m4, [r5 + 29 * 16]
13126 pmulhrsw m5, [pw_1024]
13127 packuswb m3, m5
13128 movu [r0 + 869 * 16], m3
13129
13130 ; mode 15 [row 19, 8-31]
13131 pmaddubsw m5, m2, [r5 + 12 * 16]
13132 pmulhrsw m5, [pw_1024]
13133 packuswb m5, m5
13134 movh [r0 + 870 * 16 + 8], m5
13135 pmaddubsw m3, m1, [r5 + 12 * 16]
13136 pmulhrsw m3, [pw_1024]
13137 pmaddubsw m5, m4, [r5 + 12 * 16]
13138 pmulhrsw m5, [pw_1024]
13139 packuswb m3, m5
13140 movu [r0 + 871 * 16], m3
13141
13142 ; mode 15 [row 20 - 8 to 15]
13143 pslldq m3, m2, 2
13144 pinsrb m3, [r3 + 4], 1
13145 pinsrb m3, [r3 + 6], 0
13146 pmaddubsw m5, m3, [r5 + 27 * 16]
13147 pmulhrsw m5, [pw_1024]
13148 packuswb m5, m5
13149 movh [r0 + 872 * 16 + 8], m5
13150
13151 ; mode 15 [row 21 - 8 to 15]
13152 pmaddubsw m5, m3, [r5 + 10 * 16]
13153 pmulhrsw m5, [pw_1024]
13154 packuswb m5, m5
13155 movh [r0 + 874 * 16 + 8], m5
13156
13157 ; mode 15 [row 22 - 8 to 15]
13158 pslldq m3, 2
13159 pinsrb m3, [r3 + 6], 1
13160 pinsrb m3, [r3 + 8], 0
13161 pmaddubsw m5, m3, [r5 + 25 * 16]
13162 pmulhrsw m5, [pw_1024]
13163 packuswb m5, m5
13164 movh [r0 + 876 * 16 + 8], m5
13165
13166 ; mode 15 [row 23 - 8 to 15]
13167 pmaddubsw m5, m3, [r5 + 8 * 16]
13168 pmulhrsw m5, [pw_1024]
13169 packuswb m5, m5
13170 movh [r0 + 878 * 16 + 8], m5
13171
13172 ; mode 15 [row 24 - 8 to 15]
13173 pslldq m3, 2
13174 pinsrb m3, [r3 + 8], 1
13175 pinsrb m3, [r3 + 9], 0
13176 pmaddubsw m5, m3, [r5 + 23 * 16]
13177 pmulhrsw m5, [pw_1024]
13178 packuswb m5, m5
13179 movh [r0 + 880 * 16 + 8], m5
13180
13181 ; mode 15 [row 25 - 8 to 15]
13182 pmaddubsw m5, m3, [r5 + 6 * 16]
13183 pmulhrsw m5, [pw_1024]
13184 packuswb m5, m5
13185 movh [r0 + 882 * 16 + 8], m5
13186
13187 ; mode 15 [row 26 - 8 to 15]
13188 pslldq m3, 2
13189 pinsrb m3, [r3 + 9], 1
13190 pinsrb m3, [r3 + 11], 0
13191 pmaddubsw m5, m3, [r5 + 21 * 16]
13192 pmulhrsw m5, [pw_1024]
13193 packuswb m5, m5
13194 movh [r0 + 884 * 16 + 8], m5
13195
13196 ; mode 15 [row 27 - 8 to 15]
13197 pmaddubsw m5, m3, [r5 + 4 * 16]
13198 pmulhrsw m5, [pw_1024]
13199 packuswb m5, m5
13200 movh [r0 + 886 * 16 + 8], m5
13201
13202 ; mode 15 [row 28 - 8 to 15]
13203 pslldq m3, 2
13204 pinsrb m3, [r3 + 11], 1
13205 pinsrb m3, [r3 + 13], 0
13206 pmaddubsw m5, m3, [r5 + 19 * 16]
13207 pmulhrsw m5, [pw_1024]
13208 packuswb m5, m5
13209 movh [r0 + 888 * 16 + 8], m5
13210
13211 ; mode 15 [row 29 - 8 to 15]
13212 pmaddubsw m5, m3, [r5 + 2 * 16]
13213 pmulhrsw m5, [pw_1024]
13214 packuswb m5, m5
13215 movh [r0 + 890 * 16 + 8], m5
13216
13217 ; mode 15 [row 30 - 8 to 15]
13218 pslldq m3, 2
13219 pinsrb m3, [r3 + 13], 1
13220 pinsrb m3, [r3 + 15], 0
13221 pmaddubsw m5, m3, [r5 + 17 * 16]
13222 pmulhrsw m5, [pw_1024]
13223 packuswb m5, m5
13224 movh [r0 + 892 * 16 + 8], m5
13225
13226 ; mode 15 [row 31 - 8 to 15]
13227 pshufb m5, m3, [tab_S2]
13228 movh [r0 + 894 * 16 + 8], m5
13229
13230 ; mode 14 [row 27]
13231 pinsrb m2, [r3 + 5], 0
13232 pslldq m7, 2
13233 pinsrb m7, [r3 + 25], 1
13234 pinsrb m7, [r3 + 27], 0
13235 pmaddubsw m3, m7, [r5 + 20 * 16]
13236 pmulhrsw m3, [pw_1024]
13237 pslldq m2, 2
13238 pinsrb m2, [r3 + 5], 1
13239 pinsrb m2, [r3 + 7], 0
13240 pmaddubsw m5, m2, [r5 + 20 * 16]
13241 pmulhrsw m5, [pw_1024]
13242 packuswb m3, m5
13243 movu [r0 + 822 * 16], m3
13244 pslldq m1, 2
13245 pinsrw m1, [r4 + 5], 0
13246 pmaddubsw m3, m1, [r5 + 20 * 16]
13247 pmulhrsw m3, [pw_1024]
13248 pslldq m4, 2
13249 pinsrw m4, [r4 + 13], 0
13250 pmaddubsw m5, m4, [r5 + 20 * 16]
13251 pmulhrsw m5, [pw_1024]
13252 packuswb m3, m5
13253 movu [r0 + 823 * 16], m3
13254
13255 ; mode 15 [row 20 - 16 to 31]
13256 pmaddubsw m3, m1, [r5 + 27 * 16]
13257 pmulhrsw m3, [pw_1024]
13258 pmaddubsw m5, m4, [r5 + 27 * 16]
13259 pmulhrsw m5, [pw_1024]
13260 packuswb m3, m5
13261 movu [r0 + 873 * 16], m3
13262
13263 ; mode 15 [row 21 - 16 to 31]
13264 pmaddubsw m3, m1, [r5 + 10 * 16]
13265 pmulhrsw m3, [pw_1024]
13266 pmaddubsw m5, m4, [r5 + 10 * 16]
13267 pmulhrsw m5, [pw_1024]
13268 packuswb m3, m5
13269 movu [r0 + 875 * 16], m3
13270
13271 ; mode 14 [row 28]
13272 pmaddubsw m3, m7, [r5 + 7 * 16]
13273 pmulhrsw m3, [pw_1024]
13274 pmaddubsw m5, m2, [r5 + 7 * 16]
13275 pmulhrsw m5, [pw_1024]
13276 packuswb m3, m5
13277 movu [r0 + 824 * 16], m3
13278 pmaddubsw m3, m1, [r5 + 7 * 16]
13279 pmulhrsw m3, [pw_1024]
13280 pmaddubsw m5, m4, [r5 + 7 * 16]
13281 pmulhrsw m5, [pw_1024]
13282 packuswb m3, m5
13283 movu [r0 + 825 * 16], m3
13284
13285 ; mode 14 [row 29]
13286 pslldq m7, 2
13287 pinsrb m7, [r3 + 27], 1
13288 pinsrb m7, [r3 + 30], 0
13289 pmaddubsw m3, m7, [r5 + 26 * 16]
13290 pmulhrsw m3, [pw_1024]
13291 pslldq m2, 2
13292 pinsrb m2, [r3 + 7], 1
13293 pinsrb m2, [r3 + 10], 0
13294 pmaddubsw m5, m2, [r5 + 26 * 16]
13295 pmulhrsw m5, [pw_1024]
13296 packuswb m3, m5
13297 movu [r0 + 826 * 16], m3
13298 pslldq m1, 2
13299 pinsrw m1, [r4 + 4], 0
13300 pmaddubsw m3, m1, [r5 + 26 * 16]
13301 pmulhrsw m3, [pw_1024]
13302 pslldq m4, 2
13303 pinsrw m4, [r4 + 12], 0
13304 pmaddubsw m5, m4, [r5 + 26 * 16]
13305 pmulhrsw m5, [pw_1024]
13306 packuswb m3, m5
13307 movu [r0 + 827 * 16], m3
13308
13309 ; mode 14 [row 30]
13310 pmaddubsw m3, m7, [r5 + 13 * 16]
13311 pmulhrsw m3, [pw_1024]
13312 pmaddubsw m5, m2, [r5 + 13 * 16]
13313 pmulhrsw m5, [pw_1024]
13314 packuswb m3, m5
13315 movu [r0 + 828 * 16], m3
13316 pmaddubsw m3, m1, [r5 + 13 * 16]
13317 pmulhrsw m3, [pw_1024]
13318 pmaddubsw m5, m4, [r5 + 13 * 16]
13319 pmulhrsw m5, [pw_1024]
13320 packuswb m3, m5
13321 movu [r0 + 829 * 16], m3
13322
13323 ; mode 15 [row 22 - 16 to 31]
13324 pmaddubsw m3, m1, [r5 + 25 * 16]
13325 pmulhrsw m3, [pw_1024]
13326 pmaddubsw m5, m4, [r5 + 25 * 16]
13327 pmulhrsw m5, [pw_1024]
13328 packuswb m3, m5
13329 movu [r0 + 877 * 16], m3
13330
13331 ; mode 15 [row 23 - 16 to 31]
13332 pmaddubsw m3, m1, [r5 + 8 * 16]
13333 pmulhrsw m3, [pw_1024]
13334 pmaddubsw m5, m4, [r5 + 8 * 16]
13335 pmulhrsw m5, [pw_1024]
13336 packuswb m3, m5
13337 movu [r0 + 879 * 16], m3
13338
13339 ; mode 14 [row 31]
13340 pshufb m3, m7, [tab_S2]
13341 movh [r0 + 830 * 16], m3
13342 pshufb m3, m2, [tab_S2]
13343 movh [r0 + 830 * 16 + 8], m3
13344 pshufb m3, m1, [tab_S2]
13345 movh [r0 + 831 * 16], m3
13346 pshufb m3, m4, [tab_S2]
13347 movh [r0 + 831 * 16 + 8], m3
13348
13349 ; mode 13 [row 31]
13350 pshufb m0, m6, [tab_S2]
13351 movh [r0 + 766 * 16], m0
13352 movh m0, [r4]
13353 movh [r0 + 766 * 16 + 8], m0
13354 movu m0, [r4 + 8]
13355 movu [r0 + 767 * 16], m0
13356
13357 ; mode 15 [row 24 - 16 to 31]
13358 pslldq m1, 2
13359 pinsrw m1, [r4 + 3], 0
13360 pmaddubsw m3, m1, [r5 + 23 * 16]
13361 pmulhrsw m3, [pw_1024]
13362 pslldq m4, 2
13363 pinsrw m4, [r4 + 11], 0
13364 pmaddubsw m5, m4, [r5 + 23 * 16]
13365 pmulhrsw m5, [pw_1024]
13366 packuswb m3, m5
13367 movu [r0 + 881 * 16], m3
13368
13369 ; mode 15 [row 25 - 16 to 31]
13370 pmaddubsw m3, m1, [r5 + 6 * 16]
13371 pmulhrsw m3, [pw_1024]
13372 pmaddubsw m5, m4, [r5 + 6 * 16]
13373 pmulhrsw m5, [pw_1024]
13374 packuswb m3, m5
13375 movu [r0 + 883 * 16], m3
13376
13377 ; mode 15 [row 26 - 16 to 31]
13378 pslldq m1, 2
13379 pinsrw m1, [r4 + 2], 0
13380 pmaddubsw m3, m1, [r5 + 21 * 16]
13381 pmulhrsw m3, [pw_1024]
13382 pslldq m4, 2
13383 pinsrw m4, [r4 + 10], 0
13384 pmaddubsw m5, m4, [r5 + 21 * 16]
13385 pmulhrsw m5, [pw_1024]
13386 packuswb m3, m5
13387 movu [r0 + 885 * 16], m3
13388
13389 ; mode 15 [row 27 - 16 to 31]
13390 pmaddubsw m3, m1, [r5 + 4 * 16]
13391 pmulhrsw m3, [pw_1024]
13392 pmaddubsw m5, m4, [r5 + 4 * 16]
13393 pmulhrsw m5, [pw_1024]
13394 packuswb m3, m5
13395 movu [r0 + 887 * 16], m3
13396
13397 ; mode 15 [row 28 - 16 to 31]
13398 pslldq m1, 2
13399 pinsrw m1, [r4 + 1], 0
13400 pmaddubsw m3, m1, [r5 + 19 * 16]
13401 pmulhrsw m3, [pw_1024]
13402 pslldq m4, 2
13403 pinsrw m4, [r4 + 9], 0
13404 pmaddubsw m5, m4, [r5 + 19 * 16]
13405 pmulhrsw m5, [pw_1024]
13406 packuswb m3, m5
13407 movu [r0 + 889 * 16], m3
13408
13409 ; mode 15 [row 29 - 16 to 31]
13410 pmaddubsw m3, m1, [r5 + 2 * 16]
13411 pmulhrsw m3, [pw_1024]
13412 pmaddubsw m5, m4, [r5 + 2 * 16]
13413 pmulhrsw m5, [pw_1024]
13414 packuswb m3, m5
13415 movu [r0 + 891 * 16], m3
13416
13417 ; mode 15 [row 30 - 16 to 31]
13418 pslldq m1, 2
13419 pinsrw m1, [r4 + 0], 0
13420 pmaddubsw m3, m1, [r5 + 17 * 16]
13421 pmulhrsw m3, [pw_1024]
13422 pslldq m4, 2
13423 pinsrw m4, [r4 + 8], 0
13424 pmaddubsw m5, m4, [r5 + 17 * 16]
13425 pmulhrsw m5, [pw_1024]
13426 packuswb m3, m5
13427 movu [r0 + 893 * 16], m3
13428
13429 ; mode 15 [row 31 - 16 to 31]
13430 pshufb m5, m1, [tab_S2]
13431 movh [r0 + 895 * 16], m5
13432 pshufb m5, m4, [tab_S2]
13433 movh [r0 + 895 * 16 + 8], m5
13434
13435 ; mode 16 [row 0]
13436 movu m6, [r5 + 11 * 16]
13437 movu m7, [pw_1024]
13438 movh m0, [r4 ]
13439 movh m1, [r4 + 1 ]
13440 punpcklbw m0, m1
13441 pmaddubsw m1, m0, m6
13442 pmulhrsw m1, m7
13443 movh m2, [r4 + 8]
13444 movh m3, [r4 + 9]
13445 punpcklbw m2, m3
13446 pmaddubsw m3, m2, m6
13447 pmulhrsw m3, m7
13448 packuswb m1, m3
13449 movu [r0 + 896 * 16], m1
13450
13451 movh m1, [r4 + 16]
13452 movh m3, [r4 + 17]
13453 punpcklbw m1, m3
13454 pmaddubsw m3, m1, m6
13455 pmulhrsw m3, m7
13456 movh m4, [r4 + 24]
13457 movh m5, [r4 + 25]
13458 punpcklbw m4, m5
13459 pmaddubsw m5, m4, m6
13460 pmulhrsw m5, m7
13461 packuswb m3, m5
13462 movu [r0 + 897 * 16], m3
13463
13464 ; mode16 [row 1]
13465 movu m6, [r5 + 22 * 16]
13466 pslldq m0, 2
13467 pinsrb m0, [r4], 1
13468 pinsrb m0, [r3 + 2], 0
13469 pmaddubsw m3, m0, m6
13470 pmulhrsw m3, m7
13471 pslldq m2, 2
13472 pinsrw m2, [r4 + 7], 0
13473 pmaddubsw m5, m2, m6
13474 pmulhrsw m5, m7
13475 packuswb m3, m5
13476 movu [r0 + 898 * 16], m3
13477
13478 pslldq m1, 2
13479 pinsrw m1, [r4 + 15], 0
13480 pmaddubsw m3, m1, m6
13481 pmulhrsw m3, m7
13482 pslldq m4, 2
13483 pinsrw m4, [r4 + 23], 0
13484 pmaddubsw m5, m4, m6
13485 pmulhrsw m5, m7
13486 packuswb m3, m5
13487 movu [r0 + 899 * 16], m3
13488
13489 ; mode16 [row 2]
13490 movu m6, [r5 + 1 * 16]
13491 pmaddubsw m3, m0, m6
13492 pmulhrsw m3, m7
13493 pmaddubsw m5, m2, m6
13494 pmulhrsw m5, m7
13495 packuswb m3, m5
13496 movu [r0 + 900 * 16], m3
13497
13498 pmaddubsw m3, m1, m6
13499 pmulhrsw m3, m7
13500 pmaddubsw m5, m4, m6
13501 pmulhrsw m5, m7
13502 packuswb m3, m5
13503 movu [r0 + 901 * 16], m3
13504
13505 ; mode16 [row 3]
13506 movu m6, [r5 + 12 * 16]
13507 pslldq m0, 2
13508 pinsrb m0, [r3 + 2], 1
13509 pinsrb m0, [r3 + 3], 0
13510 pmaddubsw m3, m0, m6
13511 pmulhrsw m3, m7
13512 pslldq m2, 2
13513 pinsrw m2, [r4 + 6], 0
13514 pmaddubsw m5, m2, m6
13515 pmulhrsw m5, m7
13516 packuswb m3, m5
13517 movu [r0 + 902 * 16], m3
13518
13519 pslldq m1, 2
13520 pinsrw m1, [r4 + 14], 0
13521 pmaddubsw m3, m1, m6
13522 pmulhrsw m3, m7
13523 pslldq m4, 2
13524 pinsrw m4, [r4 + 22], 0
13525 pmaddubsw m5, m4, m6
13526 pmulhrsw m5, m7
13527 packuswb m3, m5
13528 movu [r0 + 903 * 16], m3
13529
13530 ; mode16 [row 4]
13531 movu m6, [r5 + 23 * 16]
13532 pslldq m0, 2
13533 pinsrb m0, [r3 + 3], 1
13534 pinsrb m0, [r3 + 5], 0
13535 pmaddubsw m3, m0, m6
13536 pmulhrsw m3, m7
13537 pslldq m2, 2
13538 pinsrw m2, [r4 + 5], 0
13539 pmaddubsw m5, m2, m6
13540 pmulhrsw m5, m7
13541 packuswb m3, m5
13542 movu [r0 + 904 * 16], m3
13543
13544 pslldq m1, 2
13545 pinsrw m1, [r4 + 13], 0
13546 pmaddubsw m3, m1, m6
13547 pmulhrsw m3, m7
13548 pslldq m4, 2
13549 pinsrw m4, [r4 + 21], 0
13550 pmaddubsw m5, m4, m6
13551 pmulhrsw m5, m7
13552 packuswb m3, m5
13553 movu [r0 + 905 * 16], m3
13554
13555 ; mode16 [row 5]
13556 movu m6, [r5 + 2 * 16]
13557 pmaddubsw m3, m0, m6
13558 pmulhrsw m3, m7
13559 pmaddubsw m5, m2, m6
13560 pmulhrsw m5, m7
13561 packuswb m3, m5
13562 movu [r0 + 906 * 16], m3
13563
13564 pmaddubsw m3, m1, m6
13565 pmulhrsw m3, m7
13566 pmaddubsw m5, m4, m6
13567 pmulhrsw m5, m7
13568 packuswb m3, m5
13569 movu [r0 + 907 * 16], m3
13570
13571 ; mode16 [row 6]
13572 movu m6, [r5 + 13 * 16]
13573 pslldq m0, 2
13574 pinsrb m0, [r3 + 5], 1
13575 pinsrb m0, [r3 + 6], 0
13576 pmaddubsw m3, m0, m6
13577 pmulhrsw m3, m7
13578 pslldq m2, 2
13579 pinsrb m2, [r4 + 5], 1
13580 pinsrb m2, [r4 + 4], 0
13581 pmaddubsw m5, m2, m6
13582 pmulhrsw m5, m7
13583 packuswb m3, m5
13584 movu [r0 + 908 * 16], m3
13585 pslldq m1, 2
13586 pinsrw m1, [r4 + 12], 0
13587 pmaddubsw m3, m1, m6
13588 pmulhrsw m3, m7
13589 pslldq m4, 2
13590 pinsrw m4, [r4 + 20], 0
13591 pmaddubsw m5, m4, m6
13592 pmulhrsw m5, m7
13593 packuswb m3, m5
13594 movu [r0 + 909 * 16], m3
13595
13596 ; mode16 [row 7]
13597 movu m6, [r5 + 24 * 16]
13598 pslldq m0, 2
13599 pinsrb m0, [r3 + 6], 1
13600 pinsrb m0, [r3 + 8], 0
13601 pmaddubsw m3, m0, m6
13602 pmulhrsw m3, m7
13603 pslldq m2, 2
13604 pinsrw m2, [r4 + 3], 0
13605 pmaddubsw m5, m2, m6
13606 pmulhrsw m5, m7
13607 packuswb m3, m5
13608 movu [r0 + 910 * 16], m3
13609
13610 pslldq m1, 2
13611 pinsrw m1, [r4 + 11], 0
13612 pmaddubsw m3, m1, m6
13613 pmulhrsw m3, m7
13614 pslldq m4, 2
13615 pinsrw m4, [r4 + 19], 0
13616 pmaddubsw m5, m4, m6
13617 pmulhrsw m5, m7
13618 packuswb m3, m5
13619 movu [r0 + 911 * 16], m3
13620
13621 ; mode16 [row 8]
13622 movu m6, [r5 + 3 * 16]
13623 pmaddubsw m3, m0, m6
13624 pmulhrsw m3, m7
13625 pmaddubsw m5, m2, m6
13626 pmulhrsw m5, m7
13627 packuswb m3, m5
13628 movu [r0 + 912 * 16], m3
13629
13630 pmaddubsw m3, m1, m6
13631 pmulhrsw m3, m7
13632 pmaddubsw m5, m4, m6
13633 pmulhrsw m5, m7
13634 packuswb m3, m5
13635 movu [r0 + 913 * 16], m3
13636
13637 ; mode16 [row 9]
13638 movu m6, [r5 + 14 * 16]
13639 pslldq m0, 2
13640 pinsrb m0, [r3 + 8], 1
13641 pinsrb m0, [r3 + 9], 0
13642 pmaddubsw m3, m0, m6
13643 pmulhrsw m3, m7
13644 pslldq m2, 2
13645 pinsrw m2, [r4 + 2], 0
13646 pmaddubsw m5, m2, m6
13647 pmulhrsw m5, m7
13648 packuswb m3, m5
13649 movu [r0 + 914 * 16], m3
13650
13651 pslldq m1, 2
13652 pinsrw m1, [r4 + 10], 0
13653 pmaddubsw m3, m1, m6
13654 pmulhrsw m3, m7
13655 pslldq m4, 2
13656 pinsrw m4, [r4 + 18], 0
13657 pmaddubsw m5, m4, m6
13658 pmulhrsw m5, m7
13659 packuswb m3, m5
13660 movu [r0 + 915 * 16], m3
13661
13662 ; mode16 [row 10]
13663 movu m6, [r5 + 25 * 16]
13664 pslldq m0, 2
13665 pinsrb m0, [r3 + 9], 1
13666 pinsrb m0, [r3 + 11], 0
13667 pmaddubsw m3, m0, m6
13668 pmulhrsw m3, m7
13669 pslldq m2, 2
13670 pinsrw m2, [r4 + 1], 0
13671 pmaddubsw m5, m2, m6
13672 pmulhrsw m5, m7
13673 packuswb m3, m5
13674 movu [r0 + 916 * 16], m3
13675
13676 pslldq m1, 2
13677 pinsrw m1, [r4 + 9], 0
13678 pmaddubsw m3, m1, m6
13679 pmulhrsw m3, m7
13680 pslldq m4, 2
13681 pinsrb m4, [r4 + 18], 1
13682 pinsrb m4, [r4 + 17], 0
13683 pmaddubsw m5, m4, m6
13684 pmulhrsw m5, m7
13685 packuswb m3, m5
13686 movu [r0 + 917 * 16], m3
13687
13688 ; mode16 [row 11]
13689 movu m6, [r5 + 4 * 16]
13690 pmaddubsw m3, m0, m6
13691 pmulhrsw m3, m7
13692 pmaddubsw m5, m2, m6
13693 pmulhrsw m5, m7
13694 packuswb m3, m5
13695 movu [r0 + 918 * 16], m3
13696
13697 pmaddubsw m3, m1, m6
13698 pmulhrsw m3, m7
13699 pmaddubsw m5, m4, m6
13700 pmulhrsw m5, m7
13701 packuswb m3, m5
13702 movu [r0 + 919 * 16], m3
13703
13704 ; mode16 [row 12]
13705 movu m6, [r5 + 15 * 16]
13706 pslldq m0, 2
13707 pinsrb m0, [r3 + 11], 1
13708 pinsrb m0, [r3 + 12], 0
13709 pmaddubsw m3, m0, m6
13710 pmulhrsw m3, m7
13711 pslldq m2, 2
13712 pinsrw m2, [r4 + 0], 0
13713 pmaddubsw m5, m2, m6
13714 pmulhrsw m5, m7
13715 packuswb m3, m5
13716 movu [r0 + 920 * 16], m3
13717
13718 pslldq m1, 2
13719 pinsrw m1, [r4 + 8], 0
13720 pmaddubsw m3, m1, m6
13721 pmulhrsw m3, m7
13722 pslldq m4, 2
13723 pinsrw m4, [r4 + 16], 0
13724 pmaddubsw m5, m4, m6
13725 pmulhrsw m5, m7
13726 packuswb m3, m5
13727 movu [r0 + 921 * 16], m3
13728
13729 ; mode16 [row 13]
13730 movu m6, [r5 + 26 * 16]
13731 pslldq m0, 2
13732 pinsrb m0, [r3 + 12], 1
13733 pinsrb m0, [r3 + 14], 0
13734 pmaddubsw m3, m0, m6
13735 pmulhrsw m3, m7
13736 pslldq m2, 2
13737 pinsrb m2, [r4 + 0], 1
13738 pinsrb m2, [r3 + 2], 0
13739 pmaddubsw m5, m2, m6
13740 pmulhrsw m5, m7
13741 packuswb m3, m5
13742 movu [r0 + 922 * 16], m3
13743
13744 pslldq m1, 2
13745 pinsrw m1, [r4 + 7], 0
13746 pmaddubsw m3, m1, m6
13747 pmulhrsw m3, m7
13748 pslldq m4, 2
13749 pinsrw m4, [r4 + 15], 0
13750 pmaddubsw m5, m4, m6
13751 pmulhrsw m5, m7
13752 packuswb m3, m5
13753 movu [r0 + 923 * 16], m3
13754
13755 ; mode16 [row 14]
13756 movu m6, [r5 + 5 * 16]
13757 pmaddubsw m3, m0, m6
13758 pmulhrsw m3, m7
13759 pmaddubsw m5, m2, m6
13760 pmulhrsw m5, m7
13761 packuswb m3, m5
13762 movu [r0 + 924 * 16], m3
13763
13764 pmaddubsw m3, m1, m6
13765 pmulhrsw m3, m7
13766 pmaddubsw m5, m4, m6
13767 pmulhrsw m5, m7
13768 packuswb m3, m5
13769 movu [r0 + 925 * 16], m3
13770
13771 ; mode16 [row 15]
13772 movu m6, [r5 + 16 * 16]
13773 pslldq m0, 2
13774 pinsrb m0, [r3 + 14], 1
13775 pinsrb m0, [r3 + 15], 0
13776 pmaddubsw m3, m0, m6
13777 pmulhrsw m3, m7
13778 pslldq m2, 2
13779 pinsrb m2, [r3 + 2], 1
13780 pinsrb m2, [r3 + 3], 0
13781 pmaddubsw m5, m2, m6
13782 pmulhrsw m5, m7
13783 packuswb m3, m5
13784 movu [r0 + 926 * 16], m3
13785
13786 pslldq m1, 2
13787 pinsrw m1, [r4 + 6], 0
13788 pmaddubsw m3, m1, m6
13789 pmulhrsw m3, m7
13790 pslldq m4, 2
13791 pinsrw m4, [r4 + 14], 0
13792 pmaddubsw m5, m4, m6
13793 pmulhrsw m5, m7
13794 packuswb m3, m5
13795 movu [r0 + 927 * 16], m3
13796
13797 ; mode16 [row 16]
13798 movu m6, [r5 + 27 * 16]
13799 pslldq m0, 2
13800 pinsrb m0, [r3 + 15], 1
13801 pinsrb m0, [r3 + 17], 0
13802 pmaddubsw m3, m0, m6
13803 pmulhrsw m3, m7
13804 pslldq m2, 2
13805 pinsrb m2, [r3 + 3], 1
13806 pinsrb m2, [r3 + 5], 0
13807 pmaddubsw m5, m2, m6
13808 pmulhrsw m5, m7
13809 packuswb m3, m5
13810 movu [r0 + 928 * 16], m3
13811
13812 pslldq m1, 2
13813 pinsrw m1, [r4 + 5], 0
13814 pmaddubsw m3, m1, m6
13815 pmulhrsw m3, m7
13816 pslldq m4, 2
13817 pinsrw m4, [r4 + 13], 0
13818 pmaddubsw m5, m4, m6
13819 pmulhrsw m5, m7
13820 packuswb m3, m5
13821 movu [r0 + 929 * 16], m3
13822
13823 ; mode16 [row 17]
13824 movu m6, [r5 + 6 * 16]
13825 pmaddubsw m3, m0, m6
13826 pmulhrsw m3, m7
13827 pmaddubsw m5, m2, m6
13828 pmulhrsw m5, m7
13829 packuswb m3, m5
13830 movu [r0 + 930 * 16], m3
13831
13832 pmaddubsw m3, m1, m6
13833 pmulhrsw m3, m7
13834 pmaddubsw m5, m4, m6
13835 pmulhrsw m5, m7
13836 packuswb m3, m5
13837 movu [r0 + 931 * 16], m3
13838
13839 ; mode16 [row 18]
13840 movu m6, [r5 + 17 * 16]
13841 pslldq m0, 2
13842 pinsrb m0, [r3 + 17], 1
13843 pinsrb m0, [r3 + 18], 0
13844 pmaddubsw m3, m0, m6
13845 pmulhrsw m3, m7
13846 pslldq m2, 2
13847 pinsrb m2, [r3 + 5], 1
13848 pinsrb m2, [r3 + 6], 0
13849 pmaddubsw m5, m2, m6
13850 pmulhrsw m5, m7
13851 packuswb m3, m5
13852 movu [r0 + 932 * 16], m3
13853
13854 pslldq m1, 2
13855 pinsrw m1, [r4 + 4], 0
13856 pmaddubsw m3, m1, m6
13857 pmulhrsw m3, m7
13858 pslldq m4, 2
13859 pinsrw m4, [r4 + 12], 0
13860 pmaddubsw m5, m4, m6
13861 pmulhrsw m5, m7
13862 packuswb m3, m5
13863 movu [r0 + 933 * 16], m3
13864
13865 ; mode16 [row 19]
13866 movu m6, [r5 + 28 * 16]
13867 pslldq m0, 2
13868 pinsrb m0, [r3 + 18], 1
13869 pinsrb m0, [r3 + 20], 0
13870 pmaddubsw m3, m0, m6
13871 pmulhrsw m3, m7
13872 pslldq m2, 2
13873 pinsrb m2, [r3 + 6], 1
13874 pinsrb m2, [r3 + 8], 0
13875 pmaddubsw m5, m2, m6
13876 pmulhrsw m5, m7
13877 packuswb m3, m5
13878 movu [r0 + 934 * 16], m3
13879
13880 pslldq m1, 2
13881 pinsrw m1, [r4 + 3], 0
13882 pmaddubsw m3, m1, m6
13883 pmulhrsw m3, m7
13884 pslldq m4, 2
13885 pinsrw m4, [r4 + 11], 0
13886 pmaddubsw m5, m4, m6
13887 pmulhrsw m5, m7
13888 packuswb m3, m5
13889 movu [r0 + 935 * 16], m3
13890
13891 ; mode16 [row 20]
13892 movu m6, [r5 + 7 * 16]
13893 pmaddubsw m3, m0, m6
13894 pmulhrsw m3, m7
13895 pmaddubsw m5, m2, m6
13896 pmulhrsw m5, m7
13897 packuswb m3, m5
13898 movu [r0 + 936 * 16], m3
13899
13900 pmaddubsw m3, m1, m6
13901 pmulhrsw m3, m7
13902 pmaddubsw m5, m4, m6
13903 pmulhrsw m5, m7
13904 packuswb m3, m5
13905 movu [r0 + 937 * 16], m3
13906
13907 ; mode16 [row 21]
13908 movu m6, [r5 + 18 * 16]
13909 pslldq m0, 2
13910 pinsrb m0, [r3 + 20], 1
13911 pinsrb m0, [r3 + 21], 0
13912 pmaddubsw m3, m0, m6
13913 pmulhrsw m3, m7
13914 pslldq m2, 2
13915 pinsrb m2, [r3 + 8], 1
13916 pinsrb m2, [r3 + 9], 0
13917 pmaddubsw m5, m2, m6
13918 pmulhrsw m5, m7
13919 packuswb m3, m5
13920 movu [r0 + 938 * 16], m3
13921
13922 pslldq m1, 2
13923 pinsrw m1, [r4 + 2], 0
13924 pmaddubsw m3, m1, m6
13925 pmulhrsw m3, m7
13926 pslldq m4, 2
13927 pinsrw m4, [r4 + 10], 0
13928 pmaddubsw m5, m4, m6
13929 pmulhrsw m5, m7
13930 packuswb m3, m5
13931 movu [r0 + 939 * 16], m3
13932
13933 ; mode16 [row 22]
13934 movu m6, [r5 + 29 * 16]
13935 pslldq m0, 2
13936 pinsrb m0, [r3 + 21], 1
13937 pinsrb m0, [r3 + 23], 0
13938 pmaddubsw m3, m0, m6
13939 pmulhrsw m3, m7
13940 pslldq m2, 2
13941 pinsrb m2, [r3 + 9], 1
13942 pinsrb m2, [r3 + 11], 0
13943 pmaddubsw m5, m2, m6
13944 pmulhrsw m5, m7
13945 packuswb m3, m5
13946 movu [r0 + 940 * 16], m3
13947
13948 pslldq m1, 2
13949 pinsrw m1, [r4 + 1], 0
13950 pmaddubsw m3, m1, m6
13951 pmulhrsw m3, m7
13952 pslldq m4, 2
13953 pinsrw m4, [r4 + 9], 0
13954 pmaddubsw m5, m4, m6
13955 pmulhrsw m5, m7
13956 packuswb m3, m5
13957 movu [r0 + 941 * 16], m3
13958
13959 ; mode16 [row 23]
13960 movu m6, [r5 + 8 * 16]
13961 pmaddubsw m3, m0, m6
13962 pmulhrsw m3, m7
13963 pmaddubsw m5, m2, m6
13964 pmulhrsw m5, m7
13965 packuswb m3, m5
13966 movu [r0 + 942 * 16], m3
13967
13968 pmaddubsw m3, m1, m6
13969 pmulhrsw m3, m7
13970 pmaddubsw m5, m4, m6
13971 pmulhrsw m5, m7
13972 packuswb m3, m5
13973 movu [r0 + 943 * 16], m3
13974
13975 ; mode16 [row 24]
13976 movu m6, [r5 + 19 * 16]
13977 pslldq m0, 2
13978 pinsrb m0, [r3 + 23], 1
13979 pinsrb m0, [r3 + 24], 0
13980 pmaddubsw m3, m0, m6
13981 pmulhrsw m3, m7
13982 pslldq m2, 2
13983 pinsrb m2, [r3 + 11], 1
13984 pinsrb m2, [r3 + 12], 0
13985 pmaddubsw m5, m2, m6
13986 pmulhrsw m5, m7
13987 packuswb m3, m5
13988 movu [r0 + 944 * 16], m3
13989
13990 pslldq m1, 2
13991 pinsrw m1, [r4 + 0], 0
13992 pmaddubsw m3, m1, m6
13993 pmulhrsw m3, m7
13994 pslldq m4, 2
13995 pinsrw m4, [r4 + 8], 0
13996 pmaddubsw m5, m4, m6
13997 pmulhrsw m5, m7
13998 packuswb m3, m5
13999 movu [r0 + 945 * 16], m3
14000
14001 ; mode16 [row 25]
14002 movu m6, [r5 + 30 * 16]
14003 pslldq m0, 2
14004 pinsrb m0, [r3 + 24], 1
14005 pinsrb m0, [r3 + 26], 0
14006 pmaddubsw m3, m0, m6
14007 pmulhrsw m3, m7
14008 pslldq m2, 2
14009 pinsrb m2, [r3 + 12], 1
14010 pinsrb m2, [r3 + 14], 0
14011 pmaddubsw m5, m2, m6
14012 pmulhrsw m5, m7
14013 packuswb m3, m5
14014 movu [r0 + 946 * 16], m3
14015
14016 pslldq m1, 2
14017 pinsrb m1, [r4 + 0], 1
14018 pinsrb m1, [r3 + 2], 0
14019 pmaddubsw m3, m1, m6
14020 pmulhrsw m3, m7
14021 pslldq m4, 2
14022 pinsrw m4, [r4 + 7], 0
14023 pmaddubsw m5, m4, m6
14024 pmulhrsw m5, m7
14025 packuswb m3, m5
14026 movu [r0 + 947 * 16], m3
14027
14028 ; mode16 [row 26]
14029 movu m6, [r5 + 9 * 16]
14030 pmaddubsw m3, m0, m6
14031 pmulhrsw m3, m7
14032 pmaddubsw m5, m2, m6
14033 pmulhrsw m5, m7
14034 packuswb m3, m5
14035 movu [r0 + 948 * 16], m3
14036
14037 pmaddubsw m3, m1, m6
14038 pmulhrsw m3, m7
14039 pmaddubsw m5, m4, m6
14040 pmulhrsw m5, m7
14041 packuswb m3, m5
14042 movu [r0 + 949 * 16], m3
14043
14044 ; mode16 [row 27]
14045 movu m6, [r5 + 20 * 16]
14046 pslldq m0, 2
14047 pinsrb m0, [r3 + 26], 1
14048 pinsrb m0, [r3 + 27], 0
14049 pmaddubsw m3, m0, m6
14050 pmulhrsw m3, m7
14051 pslldq m2, 2
14052 pinsrb m2, [r3 + 14], 1
14053 pinsrb m2, [r3 + 15], 0
14054 pmaddubsw m5, m2, m6
14055 pmulhrsw m5, m7
14056 packuswb m3, m5
14057 movu [r0 + 950 * 16], m3
14058
14059 pslldq m1, 2
14060 pinsrb m1, [r3 + 2], 1
14061 pinsrb m1, [r3 + 3], 0
14062 pmaddubsw m3, m1, m6
14063 pmulhrsw m3, m7
14064 pslldq m4, 2
14065 pinsrw m4, [r4 + 6], 0
14066 pmaddubsw m5, m4, m6
14067 pmulhrsw m5, m7
14068 packuswb m3, m5
14069 movu [r0 + 951 * 16], m3
14070
14071 ; mode16 [row 28]
14072 movu m6, [r5 + 31 * 16]
14073 pslldq m0, 2
14074 pinsrb m0, [r3 + 27], 1
14075 pinsrb m0, [r3 + 29], 0
14076 pmaddubsw m3, m0, m6
14077 pmulhrsw m3, m7
14078 pslldq m2, 2
14079 pinsrb m2, [r3 + 15], 1
14080 pinsrb m2, [r3 + 17], 0
14081 pmaddubsw m5, m2, m6
14082 pmulhrsw m5, m7
14083 packuswb m3, m5
14084 movu [r0 + 952 * 16], m3
14085
14086 pslldq m1, 2
14087 pinsrb m1, [r3 + 3], 1
14088 pinsrb m1, [r3 + 5], 0
14089 pmaddubsw m3, m1, m6
14090 pmulhrsw m3, m7
14091 pslldq m4, 2
14092 pinsrw m4, [r4 + 5], 0
14093 pmaddubsw m5, m4, m6
14094 pmulhrsw m5, m7
14095 packuswb m3, m5
14096 movu [r0 + 953 * 16], m3
14097
14098 ; mode16 [row 29]
14099 movu m6, [r5 + 10 * 16]
14100 pmaddubsw m3, m0, m6
14101 pmulhrsw m3, m7
14102 pmaddubsw m5, m2, m6
14103 pmulhrsw m5, m7
14104 packuswb m3, m5
14105 movu [r0 + 954 * 16], m3
14106
14107 pmaddubsw m3, m1, m6
14108 pmulhrsw m3, m7
14109 pmaddubsw m5, m4, m6
14110 pmulhrsw m5, m7
14111 packuswb m3, m5
14112 movu [r0 + 955 * 16], m3
14113
14114 ; mode16 [row 30]
14115 movu m6, [r5 + 21 * 16]
14116 pslldq m0, 2
14117 pinsrb m0, [r3 + 29], 1
14118 pinsrb m0, [r3 + 30], 0
14119 pmaddubsw m3, m0, m6
14120 pmulhrsw m3, m7
14121 pslldq m2, 2
14122 pinsrb m2, [r3 + 17], 1
14123 pinsrb m2, [r3 + 18], 0
14124 pmaddubsw m5, m2, m6
14125 pmulhrsw m5, m7
14126 packuswb m3, m5
14127 movu [r0 + 956 * 16], m3
14128
14129 pslldq m1, 2
14130 pinsrb m1, [r3 + 5], 1
14131 pinsrb m1, [r3 + 6], 0
14132 pmaddubsw m3, m1, m6
14133 pmulhrsw m3, m7
14134 pslldq m4, 2
14135 pinsrw m4, [r4 + 4], 0
14136 pmaddubsw m5, m4, m6
14137 pmulhrsw m5, m7
14138 packuswb m3, m5
14139 movu [r0 + 957 * 16], m3
14140
14141 ; mode16 [row 31]
14142 pshufb m5, m0, [tab_S2]
14143 movh [r0 + 958 * 16], m5
14144 pshufb m5, m2, [tab_S2]
14145 movh [r0 + 958 * 16 + 8], m5
14146 pshufb m5, m1, [tab_S2]
14147 movh [r0 + 959 * 16], m5
14148 pshufb m5, m4, [tab_S2]
14149 movh [r0 + 959 * 16 + 8], m5
14150
14151 ; mode 17 [row 0]
14152 movu m6, [r5 + 6 * 16]
14153 movu m7, [pw_1024]
14154 movh m0, [r4 ]
14155 movh m1, [r4 + 1 ]
14156 punpcklbw m0, m1
14157 pmaddubsw m1, m0, m6
14158 pmulhrsw m1, m7
14159 movh m2, [r4 + 8]
14160 movh m3, [r4 + 9]
14161 punpcklbw m2, m3
14162 pmaddubsw m3, m2, m6
14163 pmulhrsw m3, m7
14164 packuswb m1, m3
14165 movu [r0 + 960 * 16], m1
14166
14167 movh m1, [r4 + 16]
14168 movh m3, [r4 + 17]
14169 punpcklbw m1, m3
14170 pmaddubsw m3, m1, m6
14171 pmulhrsw m3, m7
14172 movh m4, [r4 + 24]
14173 movh m5, [r4 + 25]
14174 punpcklbw m4, m5
14175 pmaddubsw m5, m4, m6
14176 pmulhrsw m5, m7
14177 packuswb m3, m5
14178 movu [r0 + 961 * 16], m3
14179
14180 ; mode17 [row 1]
14181 movu m6, [r5 + 12 * 16]
14182 pslldq m0, 2
14183 pinsrb m0, [r3 + 0], 1
14184 pinsrb m0, [r3 + 1], 0
14185 pmaddubsw m3, m0, m6
14186 pmulhrsw m3, m7
14187 pslldq m2, 2
14188 pinsrw m2, [r4 + 7], 0
14189 pmaddubsw m5, m2, m6
14190 pmulhrsw m5, m7
14191 packuswb m3, m5
14192 movu [r0 + 962 * 16], m3
14193
14194 pslldq m1, 2
14195 pinsrw m1, [r4 + 15], 0
14196 pmaddubsw m3, m1, m6
14197 pmulhrsw m3, m7
14198 pslldq m4, 2
14199 pinsrw m4, [r4 + 23], 0
14200 pmaddubsw m5, m4, m6
14201 pmulhrsw m5, m7
14202 packuswb m3, m5
14203 movu [r0 + 963 * 16], m3
14204
14205 ; mode17 [row 2]
14206 movu m6, [r5 + 18 * 16]
14207 pslldq m0, 2
14208 pinsrb m0, [r3 + 1], 1
14209 pinsrb m0, [r3 + 2], 0
14210 pmaddubsw m3, m0, m6
14211 pmulhrsw m3, m7
14212 pslldq m2, 2
14213 pinsrw m2, [r4 + 6], 0
14214 pmaddubsw m5, m2, m6
14215 pmulhrsw m5, m7
14216 packuswb m3, m5
14217 movu [r0 + 964 * 16], m3
14218
14219 pslldq m1, 2
14220 pinsrw m1, [r4 + 14], 0
14221 pmaddubsw m3, m1, m6
14222 pmulhrsw m3, m7
14223 pslldq m4, 2
14224 pinsrw m4, [r4 + 22], 0
14225 pmaddubsw m5, m4, m6
14226 pmulhrsw m5, m7
14227 packuswb m3, m5
14228 movu [r0 + 965 * 16], m3
14229
14230 ; mode17 [row 3]
14231 movu m6, [r5 + 24 * 16]
14232 pslldq m0, 2
14233 pinsrb m0, [r3 + 2], 1
14234 pinsrb m0, [r3 + 4], 0
14235 pmaddubsw m3, m0, m6
14236 pmulhrsw m3, m7
14237 pslldq m2, 2
14238 pinsrw m2, [r4 + 5], 0
14239 pmaddubsw m5, m2, m6
14240 pmulhrsw m5, m7
14241 packuswb m3, m5
14242 movu [r0 + 966 * 16], m3
14243
14244 pslldq m1, 2
14245 pinsrw m1, [r4 + 13], 0
14246 pmaddubsw m3, m1, m6
14247 pmulhrsw m3, m7
14248 pslldq m4, 2
14249 pinsrw m4, [r4 + 21], 0
14250 pmaddubsw m5, m4, m6
14251 pmulhrsw m5, m7
14252 packuswb m3, m5
14253 movu [r0 + 967 * 16], m3
14254
14255 ; mode17 [row 4]
14256 movu m6, [r5 + 30 * 16]
14257 pslldq m0, 2
14258 pinsrb m0, [r3 + 4], 1
14259 pinsrb m0, [r3 + 5], 0
14260 pmaddubsw m3, m0, m6
14261 pmulhrsw m3, m7
14262 pslldq m2, 2
14263 pinsrw m2, [r4 + 4], 0
14264 pmaddubsw m5, m2, m6
14265 pmulhrsw m5, m7
14266 packuswb m3, m5
14267 movu [r0 + 968 * 16], m3
14268
14269 pslldq m1, 2
14270 pinsrw m1, [r4 + 12], 0
14271 pmaddubsw m3, m1, m6
14272 pmulhrsw m3, m7
14273 pslldq m4, 2
14274 pinsrw m4, [r4 + 20], 0
14275 pmaddubsw m5, m4, m6
14276 pmulhrsw m5, m7
14277 packuswb m3, m5
14278 movu [r0 + 969 * 16], m3
14279
14280 ; mode17 [row 5]
14281 movu m6, [r5 + 4 * 16]
14282 pmaddubsw m3, m0, m6
14283 pmulhrsw m3, m7
14284 pmaddubsw m5, m2, m6
14285 pmulhrsw m5, m7
14286 packuswb m3, m5
14287 movu [r0 + 970 * 16], m3
14288
14289 pmaddubsw m3, m1, m6
14290 pmulhrsw m3, m7
14291 pmaddubsw m5, m4, m6
14292 pmulhrsw m5, m7
14293 packuswb m3, m5
14294 movu [r0 + 971 * 16], m3
14295
14296 ; mode17 [row 6]
14297 movu m6, [r5 + 10 * 16]
14298 pslldq m0, 2
14299 pinsrb m0, [r3 + 5], 1
14300 pinsrb m0, [r3 + 6], 0
14301 pmaddubsw m3, m0, m6
14302 pmulhrsw m3, m7
14303 pslldq m2, 2
14304 pinsrw m2, [r4 + 3], 0
14305 pmaddubsw m5, m2, m6
14306 pmulhrsw m5, m7
14307 packuswb m3, m5
14308 movu [r0 + 972 * 16], m3
14309
14310 pslldq m1, 2
14311 pinsrw m1, [r4 + 11], 0
14312 pmaddubsw m3, m1, m6
14313 pmulhrsw m3, m7
14314 pslldq m4, 2
14315 pinsrw m4, [r4 + 19], 0
14316 pmaddubsw m5, m4, m6
14317 pmulhrsw m5, m7
14318 packuswb m3, m5
14319 movu [r0 + 973 * 16], m3
14320
14321 ; mode17 [row 7]
14322 movu m6, [r5 + 16 * 16]
14323 pslldq m0, 2
14324 pinsrb m0, [r3 + 6], 1
14325 pinsrb m0, [r3 + 7], 0
14326 pmaddubsw m3, m0, m6
14327 pmulhrsw m3, m7
14328 pslldq m2, 2
14329 pinsrw m2, [r4 + 2], 0
14330 pmaddubsw m5, m2, m6
14331 pmulhrsw m5, m7
14332 packuswb m3, m5
14333 movu [r0 + 974 * 16], m3
14334
14335 pslldq m1, 2
14336 pinsrw m1, [r4 + 10], 0
14337 pmaddubsw m3, m1, m6
14338 pmulhrsw m3, m7
14339 pslldq m4, 2
14340 pinsrw m4, [r4 + 18], 0
14341 pmaddubsw m5, m4, m6
14342 pmulhrsw m5, m7
14343 packuswb m3, m5
14344 movu [r0 + 975 * 16], m3
14345
14346 ; mode17 [row 8]
14347 movu m6, [r5 + 22 * 16]
14348 pslldq m0, 2
14349 pinsrb m0, [r3 + 7], 1
14350 pinsrb m0, [r3 + 9], 0
14351 pmaddubsw m3, m0, m6
14352 pmulhrsw m3, m7
14353 pslldq m2, 2
14354 pinsrw m2, [r4 + 1], 0
14355 pmaddubsw m5, m2, m6
14356 pmulhrsw m5, m7
14357 packuswb m3, m5
14358 movu [r0 + 976 * 16], m3
14359
14360 pslldq m1, 2
14361 pinsrw m1, [r4 + 9], 0
14362 pmaddubsw m3, m1, m6
14363 pmulhrsw m3, m7
14364 pslldq m4, 2
14365 pinsrw m4, [r4 + 17], 0
14366 pmaddubsw m5, m4, m6
14367 pmulhrsw m5, m7
14368 packuswb m3, m5
14369 movu [r0 + 977 * 16], m3
14370
14371 ; mode17 [row 9]
14372 movu m6, [r5 + 28 * 16]
14373 pslldq m0, 2
14374 pinsrb m0, [r3 + 9], 1
14375 pinsrb m0, [r3 + 10], 0
14376 pmaddubsw m3, m0, m6
14377 pmulhrsw m3, m7
14378 pslldq m2, 2
14379 pinsrw m2, [r4 + 0], 0
14380 pmaddubsw m5, m2, m6
14381 pmulhrsw m5, m7
14382 packuswb m3, m5
14383 movu [r0 + 978 * 16], m3
14384
14385 pslldq m1, 2
14386 pinsrw m1, [r4 + 8], 0
14387 pmaddubsw m3, m1, m6
14388 pmulhrsw m3, m7
14389 pslldq m4, 2
14390 pinsrw m4, [r4 + 16], 0
14391 pmaddubsw m5, m4, m6
14392 pmulhrsw m5, m7
14393 packuswb m3, m5
14394 movu [r0 + 979 * 16], m3
14395
14396 ; mode17 [row 10]
14397 movu m6, [r5 + 2 * 16]
14398 pmaddubsw m3, m0, m6
14399 pmulhrsw m3, m7
14400 pmaddubsw m5, m2, m6
14401 pmulhrsw m5, m7
14402 packuswb m3, m5
14403 movu [r0 + 980 * 16], m3
14404
14405 pmaddubsw m3, m1, m6
14406 pmulhrsw m3, m7
14407 pmaddubsw m5, m4, m6
14408 pmulhrsw m5, m7
14409 packuswb m3, m5
14410 movu [r0 + 981 * 16], m3
14411
14412 ; mode17 [row 11]
14413 movu m6, [r5 + 8 * 16]
14414 pslldq m0, 2
14415 pinsrb m0, [r3 + 10], 1
14416 pinsrb m0, [r3 + 11], 0
14417 pmaddubsw m3, m0, m6
14418 pmulhrsw m3, m7
14419 pslldq m2, 2
14420 pinsrb m2, [r4 + 0], 1
14421 pinsrb m2, [r3 + 1], 0
14422 pmaddubsw m5, m2, m6
14423 pmulhrsw m5, m7
14424 packuswb m3, m5
14425 movu [r0 + 982 * 16], m3
14426
14427 pslldq m1, 2
14428 pinsrw m1, [r4 + 7], 0
14429 pmaddubsw m3, m1, m6
14430 pmulhrsw m3, m7
14431 pslldq m4, 2
14432 pinsrw m4, [r4 + 15], 0
14433 pmaddubsw m5, m4, m6
14434 pmulhrsw m5, m7
14435 packuswb m3, m5
14436 movu [r0 + 983 * 16], m3
14437
14438 ; mode17 [row 12]
14439 movu m6, [r5 + 14 * 16]
14440 pslldq m0, 2
14441 pinsrb m0, [r3 + 11], 1
14442 pinsrb m0, [r3 + 12], 0
14443 pmaddubsw m3, m0, m6
14444 pmulhrsw m3, m7
14445 pslldq m2, 2
14446 pinsrb m2, [r3 + 1], 1
14447 pinsrb m2, [r3 + 2], 0
14448 pmaddubsw m5, m2, m6
14449 pmulhrsw m5, m7
14450 packuswb m3, m5
14451 movu [r0 + 984 * 16], m3
14452
14453 pslldq m1, 2
14454 pinsrw m1, [r4 + 6], 0
14455 pmaddubsw m3, m1, m6
14456 pmulhrsw m3, m7
14457 pslldq m4, 2
14458 pinsrw m4, [r4 + 14], 0
14459 pmaddubsw m5, m4, m6
14460 pmulhrsw m5, m7
14461 packuswb m3, m5
14462 movu [r0 + 985 * 16], m3
14463
14464 ; mode17 [row 13]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14465 movu m6, [r5 + 20 * 16] ; multipliers for this row
14466 pslldq m0, 2 ; free lanes 0-1
14467 pinsrb m0, [r3 + 12], 1
14468 pinsrb m0, [r3 + 14], 0 ; the new pair skips [r3 + 13]
14469 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14470 pmulhrsw m3, m7 ; rounded scale by m7
14471 pslldq m2, 2
14472 pinsrb m2, [r3 + 2], 1
14473 pinsrb m2, [r3 + 4], 0 ; the new pair skips [r3 + 3]
14474 pmaddubsw m5, m2, m6
14475 pmulhrsw m5, m7
14476 packuswb m3, m5 ; words to saturated bytes
14477 movu [r0 + 986 * 16], m3 ; row bytes 0-15
14478
14479 pslldq m1, 2
14480 pinsrw m1, [r4 + 5], 0 ; one word insert fills lanes 0-1 from [r4 + 5], [r4 + 6]
14481 pmaddubsw m3, m1, m6
14482 pmulhrsw m3, m7
14483 pslldq m4, 2
14484 pinsrw m4, [r4 + 13], 0
14485 pmaddubsw m5, m4, m6
14486 pmulhrsw m5, m7
14487 packuswb m3, m5
14488 movu [r0 + 987 * 16], m3 ; row bytes 16-31
14489
14490 ; mode17 [row 14]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14491 movu m6, [r5 + 26 * 16] ; multipliers for this row
14492 pslldq m0, 2 ; free lanes 0-1
14493 pinsrb m0, [r3 + 14], 1
14494 pinsrb m0, [r3 + 15], 0
14495 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14496 pmulhrsw m3, m7 ; rounded scale by m7
14497 pslldq m2, 2
14498 pinsrb m2, [r3 + 4], 1
14499 pinsrb m2, [r3 + 5], 0
14500 pmaddubsw m5, m2, m6
14501 pmulhrsw m5, m7
14502 packuswb m3, m5 ; words to saturated bytes
14503 movu [r0 + 988 * 16], m3 ; row bytes 0-15
14504
14505 pslldq m1, 2
14506 pinsrw m1, [r4 + 4], 0 ; one word insert fills lanes 0-1 from [r4 + 4], [r4 + 5]
14507 pmaddubsw m3, m1, m6
14508 pmulhrsw m3, m7
14509 pslldq m4, 2
14510 pinsrw m4, [r4 + 12], 0
14511 pmaddubsw m5, m4, m6
14512 pmulhrsw m5, m7
14513 packuswb m3, m5
14514 movu [r0 + 989 * 16], m3 ; row bytes 16-31
14515
14516 ; mode17 [row 15]: no multiply; each source vector is byte-shuffled through tab_S2 and 8 bytes are stored
14517 pshufb m5, m0, [tab_S2] ; tab_S2 is defined outside this excerpt; presumably keeps one byte per pair - TODO confirm
14518 movh [r0 + 990 * 16], m5 ; row bytes 0-7
14519 pshufb m5, m2, [tab_S2]
14520 movh [r0 + 990 * 16 + 8], m5 ; row bytes 8-15
14521 pshufb m5, m1, [tab_S2]
14522 movh [r0 + 991 * 16], m5 ; row bytes 16-23
14523 pshufb m5, m4, [tab_S2]
14524 movh [r0 + 991 * 16 + 8], m5 ; row bytes 24-31
14525
14526 ; mode17 [row 16]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14527 movu m6, [r5 + 6 * 16] ; multipliers for this row
14528 pslldq m0, 2 ; free lanes 0-1
14529 pinsrb m0, [r3 + 15], 1
14530 pinsrb m0, [r3 + 16], 0
14531 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14532 pmulhrsw m3, m7 ; rounded scale by m7
14533 pslldq m2, 2
14534 pinsrb m2, [r3 + 5], 1
14535 pinsrb m2, [r3 + 6], 0
14536 pmaddubsw m5, m2, m6
14537 pmulhrsw m5, m7
14538 packuswb m3, m5 ; words to saturated bytes
14539 movu [r0 + 992 * 16], m3 ; row bytes 0-15
14540
14541 pslldq m1, 2
14542 pinsrw m1, [r4 + 3], 0 ; one word insert fills lanes 0-1 from [r4 + 3], [r4 + 4]
14543 pmaddubsw m3, m1, m6
14544 pmulhrsw m3, m7
14545 pslldq m4, 2
14546 pinsrw m4, [r4 + 11], 0
14547 pmaddubsw m5, m4, m6
14548 pmulhrsw m5, m7
14549 packuswb m3, m5
14550 movu [r0 + 993 * 16], m3 ; row bytes 16-31
14551
14552 ; mode17 [row 17]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14553 movu m6, [r5 + 12 * 16] ; multipliers for this row
14554 pslldq m0, 2 ; free lanes 0-1
14555 pinsrb m0, [r3 + 16], 1
14556 pinsrb m0, [r3 + 17], 0
14557 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14558 pmulhrsw m3, m7 ; rounded scale by m7
14559 pslldq m2, 2
14560 pinsrb m2, [r3 + 6], 1
14561 pinsrb m2, [r3 + 7], 0
14562 pmaddubsw m5, m2, m6
14563 pmulhrsw m5, m7
14564 packuswb m3, m5 ; words to saturated bytes
14565 movu [r0 + 994 * 16], m3 ; row bytes 0-15
14566
14567 pslldq m1, 2
14568 pinsrw m1, [r4 + 2], 0 ; one word insert fills lanes 0-1 from [r4 + 2], [r4 + 3]
14569 pmaddubsw m3, m1, m6
14570 pmulhrsw m3, m7
14571 pslldq m4, 2
14572 pinsrw m4, [r4 + 10], 0
14573 pmaddubsw m5, m4, m6
14574 pmulhrsw m5, m7
14575 packuswb m3, m5
14576 movu [r0 + 995 * 16], m3 ; row bytes 16-31
14577
14578 ; mode17 [row 18]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14579 movu m6, [r5 + 18 * 16] ; multipliers for this row
14580 pslldq m0, 2 ; free lanes 0-1
14581 pinsrb m0, [r3 + 17], 1
14582 pinsrb m0, [r3 + 18], 0
14583 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14584 pmulhrsw m3, m7 ; rounded scale by m7
14585 pslldq m2, 2
14586 pinsrb m2, [r3 + 7], 1
14587 pinsrb m2, [r3 + 9], 0 ; the new pair skips [r3 + 8]
14588 pmaddubsw m5, m2, m6
14589 pmulhrsw m5, m7
14590 packuswb m3, m5 ; words to saturated bytes
14591 movu [r0 + 996 * 16], m3 ; row bytes 0-15
14592
14593 pslldq m1, 2
14594 pinsrw m1, [r4 + 1], 0 ; one word insert fills lanes 0-1 from [r4 + 1], [r4 + 2]
14595 pmaddubsw m3, m1, m6
14596 pmulhrsw m3, m7
14597 pslldq m4, 2
14598 pinsrw m4, [r4 + 9], 0
14599 pmaddubsw m5, m4, m6
14600 pmulhrsw m5, m7
14601 packuswb m3, m5
14602 movu [r0 + 997 * 16], m3 ; row bytes 16-31
14603
14604 ; mode17 [row 19]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14605 movu m6, [r5 + 24 * 16] ; multipliers for this row
14606 pslldq m0, 2 ; free lanes 0-1
14607 pinsrb m0, [r3 + 18], 1
14608 pinsrb m0, [r3 + 20], 0 ; the new pair skips [r3 + 19]
14609 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14610 pmulhrsw m3, m7 ; rounded scale by m7
14611 pslldq m2, 2
14612 pinsrb m2, [r3 + 9], 1
14613 pinsrb m2, [r3 + 10], 0
14614 pmaddubsw m5, m2, m6
14615 pmulhrsw m5, m7
14616 packuswb m3, m5 ; words to saturated bytes
14617 movu [r0 + 998 * 16], m3 ; row bytes 0-15
14618
14619 pslldq m1, 2
14620 pinsrw m1, [r4 + 0], 0 ; one word insert fills lanes 0-1 from [r4 + 0], [r4 + 1]
14621 pmaddubsw m3, m1, m6
14622 pmulhrsw m3, m7
14623 pslldq m4, 2
14624 pinsrw m4, [r4 + 8], 0
14625 pmaddubsw m5, m4, m6
14626 pmulhrsw m5, m7
14627 packuswb m3, m5
14628 movu [r0 + 999 * 16], m3 ; row bytes 16-31
14629
14630 ; mode17 [row 20]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14631 movu m6, [r5 + 30 * 16] ; multipliers for this row
14632 pslldq m0, 2 ; free lanes 0-1
14633 pinsrb m0, [r3 + 20], 1
14634 pinsrb m0, [r3 + 21], 0
14635 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14636 pmulhrsw m3, m7 ; rounded scale by m7
14637 pslldq m2, 2
14638 pinsrb m2, [r3 + 10], 1
14639 pinsrb m2, [r3 + 11], 0
14640 pmaddubsw m5, m2, m6
14641 pmulhrsw m5, m7
14642 packuswb m3, m5 ; words to saturated bytes
14643 movu [r0 + 1000 * 16], m3 ; row bytes 0-15
14644
14645 pslldq m1, 2
14646 pinsrb m1, [r4 + 0], 1 ; from this row on m1 is refilled byte-wise; this pair takes lane 1 from r4, lane 0 from r3
14647 pinsrb m1, [r3 + 1], 0
14648 pmaddubsw m3, m1, m6
14649 pmulhrsw m3, m7
14650 pslldq m4, 2
14651 ; disabled byte-wise form of the word insert below: pinsrb m4, [r4 + 8], 1
14652 ; and: pinsrb m4, [r4 + 7], 0
14653 pinsrw m4, [r4 + 7], 0 ; one word insert fills lanes 0-1 from [r4 + 7], [r4 + 8]
14654 pmaddubsw m5, m4, m6
14655 pmulhrsw m5, m7
14656 packuswb m3, m5
14657 movu [r0 + 1001 * 16], m3 ; row bytes 16-31
14658
14659 ; mode17 [row 21]: no shift; m0/m2/m1/m4 carry over unchanged, only the multipliers change
14660 movu m6, [r5 + 4 * 16] ; multipliers for this row
14661 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14662 pmulhrsw m3, m7 ; rounded scale by m7
14663 pmaddubsw m5, m2, m6
14664 pmulhrsw m5, m7
14665 packuswb m3, m5 ; words to saturated bytes
14666 movu [r0 + 1002 * 16], m3 ; row bytes 0-15
14667
14668 pmaddubsw m3, m1, m6
14669 pmulhrsw m3, m7
14670 pmaddubsw m5, m4, m6
14671 pmulhrsw m5, m7
14672 packuswb m3, m5
14673 movu [r0 + 1003 * 16], m3 ; row bytes 16-31
14674
14675 ; mode17 [row 22]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14676 movu m6, [r5 + 10 * 16] ; multipliers for this row
14677 pslldq m0, 2 ; free lanes 0-1
14678 pinsrb m0, [r3 + 21], 1
14679 pinsrb m0, [r3 + 22], 0
14680 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14681 pmulhrsw m3, m7 ; rounded scale by m7
14682 pslldq m2, 2
14683 pinsrb m2, [r3 + 11], 1
14684 pinsrb m2, [r3 + 12], 0
14685 pmaddubsw m5, m2, m6
14686 pmulhrsw m5, m7
14687 packuswb m3, m5 ; words to saturated bytes
14688 movu [r0 + 1004 * 16], m3 ; row bytes 0-15
14689
14690 pslldq m1, 2
14691 pinsrb m1, [r3 + 1], 1
14692 pinsrb m1, [r3 + 2], 0
14693 pmaddubsw m3, m1, m6
14694 pmulhrsw m3, m7
14695 pslldq m4, 2
14696 pinsrw m4, [r4 + 6], 0 ; one word insert fills lanes 0-1 from [r4 + 6], [r4 + 7]
14697 pmaddubsw m5, m4, m6
14698 pmulhrsw m5, m7
14699 packuswb m3, m5
14700 movu [r0 + 1005 * 16], m3 ; row bytes 16-31
14701
14702 ; mode17 [row 23]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14703 movu m6, [r5 + 16 * 16] ; multipliers for this row
14704 pslldq m0, 2 ; free lanes 0-1
14705 pinsrb m0, [r3 + 22], 1
14706 pinsrb m0, [r3 + 23], 0
14707 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14708 pmulhrsw m3, m7 ; rounded scale by m7
14709 pslldq m2, 2
14710 pinsrb m2, [r3 + 12], 1
14711 pinsrb m2, [r3 + 14], 0 ; the new pair skips [r3 + 13]
14712 pmaddubsw m5, m2, m6
14713 pmulhrsw m5, m7
14714 packuswb m3, m5 ; words to saturated bytes
14715 movu [r0 + 1006 * 16], m3 ; row bytes 0-15
14716
14717 pslldq m1, 2
14718 pinsrb m1, [r3 + 2], 1
14719 pinsrb m1, [r3 + 4], 0 ; the new pair skips [r3 + 3]
14720 pmaddubsw m3, m1, m6
14721 pmulhrsw m3, m7
14722 pslldq m4, 2
14723 pinsrw m4, [r4 + 5], 0 ; one word insert fills lanes 0-1 from [r4 + 5], [r4 + 6]
14724 pmaddubsw m5, m4, m6
14725 pmulhrsw m5, m7
14726 packuswb m3, m5
14727 movu [r0 + 1007 * 16], m3 ; row bytes 16-31
14728
14729 ; mode17 [row 24]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14730 movu m6, [r5 + 22 * 16] ; multipliers for this row
14731 pslldq m0, 2 ; free lanes 0-1
14732 pinsrb m0, [r3 + 23], 1
14733 pinsrb m0, [r3 + 25], 0 ; the new pair skips [r3 + 24]
14734 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14735 pmulhrsw m3, m7 ; rounded scale by m7
14736 pslldq m2, 2
14737 pinsrb m2, [r3 + 14], 1
14738 pinsrb m2, [r3 + 15], 0
14739 pmaddubsw m5, m2, m6
14740 pmulhrsw m5, m7
14741 packuswb m3, m5 ; words to saturated bytes
14742 movu [r0 + 1008 * 16], m3 ; row bytes 0-15
14743
14744 pslldq m1, 2
14745 pinsrb m1, [r3 + 4], 1
14746 pinsrb m1, [r3 + 5], 0
14747 pmaddubsw m3, m1, m6
14748 pmulhrsw m3, m7
14749 pslldq m4, 2
14750 pinsrw m4, [r4 + 4], 0 ; one word insert fills lanes 0-1 from [r4 + 4], [r4 + 5]
14751 pmaddubsw m5, m4, m6
14752 pmulhrsw m5, m7
14753 packuswb m3, m5
14754 movu [r0 + 1009 * 16], m3 ; row bytes 16-31
14755
14756 ; mode17 [row 25]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14757 movu m6, [r5 + 28 * 16] ; multipliers for this row
14758 pslldq m0, 2 ; free lanes 0-1
14759 pinsrb m0, [r3 + 25], 1
14760 pinsrb m0, [r3 + 26], 0
14761 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14762 pmulhrsw m3, m7 ; rounded scale by m7
14763 pslldq m2, 2
14764 pinsrb m2, [r3 + 15], 1
14765 pinsrb m2, [r3 + 16], 0
14766 pmaddubsw m5, m2, m6
14767 pmulhrsw m5, m7
14768 packuswb m3, m5 ; words to saturated bytes
14769 movu [r0 + 1010 * 16], m3 ; row bytes 0-15
14770
14771 pslldq m1, 2
14772 pinsrb m1, [r3 + 5], 1
14773 pinsrb m1, [r3 + 6], 0
14774 pmaddubsw m3, m1, m6
14775 pmulhrsw m3, m7
14776 pslldq m4, 2
14777 pinsrw m4, [r4 + 3], 0 ; one word insert fills lanes 0-1 from [r4 + 3], [r4 + 4]
14778 pmaddubsw m5, m4, m6
14779 pmulhrsw m5, m7
14780 packuswb m3, m5
14781 movu [r0 + 1011 * 16], m3 ; row bytes 16-31
14782
14783 ; mode17 [row 26]: no shift; m0/m2/m1/m4 carry over unchanged, only the multipliers change
14784 movu m6, [r5 + 2 * 16] ; multipliers for this row
14785 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14786 pmulhrsw m3, m7 ; rounded scale by m7
14787 pmaddubsw m5, m2, m6
14788 pmulhrsw m5, m7
14789 packuswb m3, m5 ; words to saturated bytes
14790 movu [r0 + 1012 * 16], m3 ; row bytes 0-15
14791
14792 pmaddubsw m3, m1, m6
14793 pmulhrsw m3, m7
14794 pmaddubsw m5, m4, m6
14795 pmulhrsw m5, m7
14796 packuswb m3, m5
14797 movu [r0 + 1013 * 16], m3 ; row bytes 16-31
14798
14799 ; mode17 [row 27]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14800 movu m6, [r5 + 8 * 16] ; multipliers for this row
14801 pslldq m0, 2 ; free lanes 0-1
14802 pinsrb m0, [r3 + 26], 1
14803 pinsrb m0, [r3 + 27], 0
14804 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14805 pmulhrsw m3, m7 ; rounded scale by m7
14806 pslldq m2, 2
14807 pinsrb m2, [r3 + 16], 1
14808 pinsrb m2, [r3 + 17], 0
14809 pmaddubsw m5, m2, m6
14810 pmulhrsw m5, m7
14811 packuswb m3, m5 ; words to saturated bytes
14812 movu [r0 + 1014 * 16], m3 ; row bytes 0-15
14813
14814 pslldq m1, 2
14815 pinsrb m1, [r3 + 6], 1
14816 pinsrb m1, [r3 + 7], 0
14817 pmaddubsw m3, m1, m6
14818 pmulhrsw m3, m7
14819 pslldq m4, 2
14820 pinsrw m4, [r4 + 2], 0 ; one word insert fills lanes 0-1 from [r4 + 2], [r4 + 3]
14821 pmaddubsw m5, m4, m6
14822 pmulhrsw m5, m7
14823 packuswb m3, m5
14824 movu [r0 + 1015 * 16], m3 ; row bytes 16-31
14825
14826 ; mode17 [row 28]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14827 movu m6, [r5 + 14 * 16] ; multipliers for this row
14828 pslldq m0, 2 ; free lanes 0-1
14829 pinsrb m0, [r3 + 27], 1
14830 pinsrb m0, [r3 + 28], 0
14831 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14832 pmulhrsw m3, m7 ; rounded scale by m7
14833 pslldq m2, 2
14834 pinsrb m2, [r3 + 17], 1
14835 pinsrb m2, [r3 + 18], 0
14836 pmaddubsw m5, m2, m6
14837 pmulhrsw m5, m7
14838 packuswb m3, m5 ; words to saturated bytes
14839 movu [r0 + 1016 * 16], m3 ; row bytes 0-15
14840
14841 pslldq m1, 2
14842 pinsrb m1, [r3 + 7], 1
14843 pinsrb m1, [r3 + 9], 0 ; the new pair skips [r3 + 8]
14844 pmaddubsw m3, m1, m6
14845 pmulhrsw m3, m7
14846 pslldq m4, 2
14847 pinsrw m4, [r4 + 1], 0 ; one word insert fills lanes 0-1 from [r4 + 1], [r4 + 2]
14848 pmaddubsw m5, m4, m6
14849 pmulhrsw m5, m7
14850 packuswb m3, m5
14851 movu [r0 + 1017 * 16], m3 ; row bytes 16-31
14852
14853 ; mode17 [row 29]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14854 movu m6, [r5 + 20 * 16] ; multipliers for this row
14855 pslldq m0, 2 ; free lanes 0-1
14856 pinsrb m0, [r3 + 28], 1
14857 pinsrb m0, [r3 + 30], 0 ; the new pair skips [r3 + 29]
14858 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14859 pmulhrsw m3, m7 ; rounded scale by m7
14860 pslldq m2, 2
14861 pinsrb m2, [r3 + 18], 1
14862 pinsrb m2, [r3 + 20], 0 ; the new pair skips [r3 + 19]
14863 pmaddubsw m5, m2, m6
14864 pmulhrsw m5, m7
14865 packuswb m3, m5 ; words to saturated bytes
14866 movu [r0 + 1018 * 16], m3 ; row bytes 0-15
14867
14868 pslldq m1, 2
14869 pinsrb m1, [r3 + 9], 1
14870 pinsrb m1, [r3 + 10], 0
14871 pmaddubsw m3, m1, m6
14872 pmulhrsw m3, m7
14873 pslldq m4, 2
14874 pinsrw m4, [r4 + 0], 0 ; one word insert fills lanes 0-1 from [r4 + 0], [r4 + 1]
14875 pmaddubsw m5, m4, m6
14876 pmulhrsw m5, m7
14877 packuswb m3, m5
14878 movu [r0 + 1019 * 16], m3 ; row bytes 16-31
14879
14880 ; mode17 [row 30]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
14881 movu m6, [r5 + 26 * 16] ; multipliers for this row
14882 pslldq m0, 2 ; free lanes 0-1
14883 pinsrb m0, [r3 + 30], 1
14884 pinsrb m0, [r3 + 31], 0
14885 pmaddubsw m3, m0, m6 ; pairwise multiply-add to words
14886 pmulhrsw m3, m7 ; rounded scale by m7
14887 pslldq m2, 2
14888 pinsrb m2, [r3 + 20], 1
14889 pinsrb m2, [r3 + 21], 0
14890 pmaddubsw m5, m2, m6
14891 pmulhrsw m5, m7
14892 packuswb m3, m5 ; words to saturated bytes
14893 movu [r0 + 1020 * 16], m3 ; row bytes 0-15
14894
14895 pslldq m1, 2
14896 pinsrb m1, [r3 + 10], 1
14897 pinsrb m1, [r3 + 11], 0
14898 pmaddubsw m3, m1, m6
14899 pmulhrsw m3, m7
14900 pslldq m4, 2
14901 pinsrb m4, [r4 + 0], 1 ; this pair takes lane 1 from r4 and lane 0 from r3
14902 pinsrb m4, [r3 + 1], 0
14903 pmaddubsw m5, m4, m6
14904 pmulhrsw m5, m7
14905 packuswb m3, m5
14906 movu [r0 + 1021 * 16], m3 ; row bytes 16-31
14907
14908 ; mode17 [row 31]: no multiply; each source vector is byte-shuffled through tab_S2 and 8 bytes are stored
14909 pshufb m5, m0, [tab_S2] ; tab_S2 is defined outside this excerpt; presumably keeps one byte per pair - TODO confirm
14910 movh [r0 + 1022 * 16], m5 ; row bytes 0-7
14911 pshufb m5, m2, [tab_S2]
14912 movh [r0 + 1022 * 16 + 8], m5 ; row bytes 8-15
14913 pshufb m5, m1, [tab_S2]
14914 movh [r0 + 1023 * 16], m5 ; row bytes 16-23
14915 pshufb m5, m4, [tab_S2]
14916 movh [r0 + 1023 * 16 + 8], m5 ; row bytes 24-31
14917
14918 ;mode 18[row 0]: plain copy, no multiply in this mode; m0 = 16 bytes at r3, m1 = the next 16
14919 movu m0, [r3]
14920 movu [r0 + 1024 * 16], m0 ; row bytes 0-15
14921 movu m1, [r3 + 16]
14922 movu [r0 + 1025 * 16], m1 ; row bytes 16-31
14923
14924 ;mode 18[row 1]: each later row is the previous one moved up one lane with a new byte in lane 0
14925 pslldq m0, 1 ; lane 15 drops out, lane 0 is cleared
14926 pinsrb m0, [r4 + 1], 0 ; new leading byte comes from r4
14927 movu [r0 + 1026 * 16], m0
14928 pslldq m1, 1
14929 pinsrb m1, [r3 + 15], 0 ; reload the byte that just left m0 lane 15
14930 movu [r0 + 1027 * 16], m1
14931
14932 ;mode 18[row 2]: same step, m0 takes [r4 + 2], m1 takes [r3 + 14]
14933 pslldq m0, 1
14934 pinsrb m0, [r4 + 2], 0
14935 movu [r0 + 1028 * 16], m0
14936 pslldq m1, 1
14937 pinsrb m1, [r3 + 14], 0
14938 movu [r0 + 1029 * 16], m1
14939
14940 ;mode 18[row 3]: same step, m0 takes [r4 + 3], m1 takes [r3 + 13]
14941 pslldq m0, 1
14942 pinsrb m0, [r4 + 3], 0
14943 movu [r0 + 1030 * 16], m0
14944 pslldq m1, 1
14945 pinsrb m1, [r3 + 13], 0
14946 movu [r0 + 1031 * 16], m1
14947
14948 ;mode 18[row 4]: same step, m0 takes [r4 + 4], m1 takes [r3 + 12]
14949 pslldq m0, 1
14950 pinsrb m0, [r4 + 4], 0
14951 movu [r0 + 1032 * 16], m0
14952 pslldq m1, 1
14953 pinsrb m1, [r3 + 12], 0
14954 movu [r0 + 1033 * 16], m1
14955
14956 ;mode 18[row 5]: same step, m0 takes [r4 + 5], m1 takes [r3 + 11]
14957 pslldq m0, 1
14958 pinsrb m0, [r4 + 5], 0
14959 movu [r0 + 1034 * 16], m0
14960 pslldq m1, 1
14961 pinsrb m1, [r3 + 11], 0
14962 movu [r0 + 1035 * 16], m1
14963
14964 ;mode 18[row 6]: same step, m0 takes [r4 + 6], m1 takes [r3 + 10]
14965 pslldq m0, 1
14966 pinsrb m0, [r4 + 6], 0
14967 movu [r0 + 1036 * 16], m0
14968 pslldq m1, 1
14969 pinsrb m1, [r3 + 10], 0
14970 movu [r0 + 1037 * 16], m1
14971
14972 ;mode 18[row 7]: same step, m0 takes [r4 + 7], m1 takes [r3 + 9]
14973 pslldq m0, 1
14974 pinsrb m0, [r4 + 7], 0
14975 movu [r0 + 1038 * 16], m0
14976 pslldq m1, 1
14977 pinsrb m1, [r3 + 9], 0
14978 movu [r0 + 1039 * 16], m1
14979
14980 ;mode 18[row 8]: m0/m1 continue from row 7; move up one lane, insert a new byte in lane 0, store
14981 pslldq m0, 1 ; lane 15 drops out, lane 0 is cleared
14982 pinsrb m0, [r4 + 8], 0 ; new leading byte comes from r4
14983 movu [r0 + 1040 * 16], m0 ; row bytes 0-15
14984 pslldq m1, 1
14985 pinsrb m1, [r3 + 8], 0 ; m1 refills from r3 at a descending offset
14986 movu [r0 + 1041 * 16], m1 ; row bytes 16-31
14987
14988 ;mode 18[row 9]: same step, m0 takes [r4 + 9], m1 takes [r3 + 7]
14989 pslldq m0, 1
14990 pinsrb m0, [r4 + 9], 0
14991 movu [r0 + 1042 * 16], m0
14992 pslldq m1, 1
14993 pinsrb m1, [r3 + 7], 0
14994 movu [r0 + 1043 * 16], m1
14995
14996 ;mode 18[row 10]: same step, m0 takes [r4 + 10], m1 takes [r3 + 6]
14997 pslldq m0, 1
14998 pinsrb m0, [r4 + 10], 0
14999 movu [r0 + 1044 * 16], m0
15000 pslldq m1, 1
15001 pinsrb m1, [r3 + 6], 0
15002 movu [r0 + 1045 * 16], m1
15003
15004 ;mode 18[row 11]: same step, m0 takes [r4 + 11], m1 takes [r3 + 5]
15005 pslldq m0, 1
15006 pinsrb m0, [r4 + 11], 0
15007 movu [r0 + 1046 * 16], m0
15008 pslldq m1, 1
15009 pinsrb m1, [r3 + 5], 0
15010 movu [r0 + 1047 * 16], m1
15011
15012 ;mode 18[row 12]: same step, m0 takes [r4 + 12], m1 takes [r3 + 4]
15013 pslldq m0, 1
15014 pinsrb m0, [r4 + 12], 0
15015 movu [r0 + 1048 * 16], m0
15016 pslldq m1, 1
15017 pinsrb m1, [r3 + 4], 0
15018 movu [r0 + 1049 * 16], m1
15019
15020 ;mode 18[row 13]: same step, m0 takes [r4 + 13], m1 takes [r3 + 3]
15021 pslldq m0, 1
15022 pinsrb m0, [r4 + 13], 0
15023 movu [r0 + 1050 * 16], m0
15024 pslldq m1, 1
15025 pinsrb m1, [r3 + 3], 0
15026 movu [r0 + 1051 * 16], m1
15027
15028 ;mode 18[row 14]: same step, m0 takes [r4 + 14], m1 takes [r3 + 2]
15029 pslldq m0, 1
15030 pinsrb m0, [r4 + 14], 0
15031 movu [r0 + 1052 * 16], m0
15032 pslldq m1, 1
15033 pinsrb m1, [r3 + 2], 0
15034 movu [r0 + 1053 * 16], m1
15035
15036 ;mode 18[row 15]: same step, m0 takes [r4 + 15], m1 takes [r3 + 1]
15037 pslldq m0, 1
15038 pinsrb m0, [r4 + 15], 0
15039 movu [r0 + 1054 * 16], m0
15040 pslldq m1, 1
15041 pinsrb m1, [r3 + 1], 0
15042 movu [r0 + 1055 * 16], m1
15043
15044 ;mode 18[row 16]: m0/m1 continue from row 15; move up one lane, insert a new byte in lane 0, store
15045 pslldq m0, 1 ; lane 15 drops out, lane 0 is cleared
15046 pinsrb m0, [r4 + 16], 0 ; new leading byte comes from r4
15047 movu [r0 + 1056 * 16], m0 ; row bytes 0-15
15048 pslldq m1, 1
15049 pinsrb m1, [r3 + 0], 0 ; last refill of m1 from r3
15050 movu [r0 + 1057 * 16], m1 ; row bytes 16-31
15051
15052 ;mode 18[row 17]: from here m1 also refills from r4, 16 bytes behind m0
15053 pslldq m0, 1
15054 pinsrb m0, [r4 + 17], 0
15055 movu [r0 + 1058 * 16], m0
15056 pslldq m1, 1
15057 pinsrb m1, [r4 + 1], 0
15058 movu [r0 + 1059 * 16], m1
15059
15060 ;mode 18[row 18]: same step, m0 takes [r4 + 18], m1 takes [r4 + 2]
15061 pslldq m0, 1
15062 pinsrb m0, [r4 + 18], 0
15063 movu [r0 + 1060 * 16], m0
15064 pslldq m1, 1
15065 pinsrb m1, [r4 + 2], 0
15066 movu [r0 + 1061 * 16], m1
15067
15068 ;mode 18[row 19]: same step, m0 takes [r4 + 19], m1 takes [r4 + 3]
15069 pslldq m0, 1
15070 pinsrb m0, [r4 + 19], 0
15071 movu [r0 + 1062 * 16], m0
15072 pslldq m1, 1
15073 pinsrb m1, [r4 + 3], 0
15074 movu [r0 + 1063 * 16], m1
15075
15076 ;mode 18[row 20]: same step, m0 takes [r4 + 20], m1 takes [r4 + 4]
15077 pslldq m0, 1
15078 pinsrb m0, [r4 + 20], 0
15079 movu [r0 + 1064 * 16], m0
15080 pslldq m1, 1
15081 pinsrb m1, [r4 + 4], 0
15082 movu [r0 + 1065 * 16], m1
15083
15084 ;mode 18[row 21]: same step, m0 takes [r4 + 21], m1 takes [r4 + 5]
15085 pslldq m0, 1
15086 pinsrb m0, [r4 + 21], 0
15087 movu [r0 + 1066 * 16], m0
15088 pslldq m1, 1
15089 pinsrb m1, [r4 + 5], 0
15090 movu [r0 + 1067 * 16], m1
15091
15092 ;mode 18[row 22]: same step, m0 takes [r4 + 22], m1 takes [r4 + 6]
15093 pslldq m0, 1
15094 pinsrb m0, [r4 + 22], 0
15095 movu [r0 + 1068 * 16], m0
15096 pslldq m1, 1
15097 pinsrb m1, [r4 + 6], 0
15098 movu [r0 + 1069 * 16], m1
15099
15100 ;mode 18[row 23]: same step, m0 takes [r4 + 23], m1 takes [r4 + 7]
15101 pslldq m0, 1
15102 pinsrb m0, [r4 + 23], 0
15103 movu [r0 + 1070 * 16], m0
15104 pslldq m1, 1
15105 pinsrb m1, [r4 + 7], 0
15106 movu [r0 + 1071 * 16], m1
15107
15108 ;mode 18[row 24]: m0/m1 continue from row 23; move up one lane, insert a new byte in lane 0, store
15109 pslldq m0, 1 ; lane 15 drops out, lane 0 is cleared
15110 pinsrb m0, [r4 + 24], 0 ; new leading byte comes from r4
15111 movu [r0 + 1072 * 16], m0 ; row bytes 0-15
15112 pslldq m1, 1
15113 pinsrb m1, [r4 + 8], 0 ; m1 refills from r4, 16 bytes behind m0
15114 movu [r0 + 1073 * 16], m1 ; row bytes 16-31
15115
15116 ;mode 18[row 25]: same step, m0 takes [r4 + 25], m1 takes [r4 + 9]
15117 pslldq m0, 1
15118 pinsrb m0, [r4 + 25], 0
15119 movu [r0 + 1074 * 16], m0
15120 pslldq m1, 1
15121 pinsrb m1, [r4 + 9], 0
15122 movu [r0 + 1075 * 16], m1
15123
15124 ;mode 18[row 26]: same step, m0 takes [r4 + 26], m1 takes [r4 + 10]
15125 pslldq m0, 1
15126 pinsrb m0, [r4 + 26], 0
15127 movu [r0 + 1076 * 16], m0
15128 pslldq m1, 1
15129 pinsrb m1, [r4 + 10], 0
15130 movu [r0 + 1077 * 16], m1
15131
15132 ;mode 18[row 27]: same step, m0 takes [r4 + 27], m1 takes [r4 + 11]
15133 pslldq m0, 1
15134 pinsrb m0, [r4 + 27], 0
15135 movu [r0 + 1078 * 16], m0
15136 pslldq m1, 1
15137 pinsrb m1, [r4 + 11], 0
15138 movu [r0 + 1079 * 16], m1
15139
15140 ;mode 18[row 28]: same step, m0 takes [r4 + 28], m1 takes [r4 + 12]
15141 pslldq m0, 1
15142 pinsrb m0, [r4 + 28], 0
15143 movu [r0 + 1080 * 16], m0
15144 pslldq m1, 1
15145 pinsrb m1, [r4 + 12], 0
15146 movu [r0 + 1081 * 16], m1
15147
15148 ;mode 18[row 29]: same step, m0 takes [r4 + 29], m1 takes [r4 + 13]
15149 pslldq m0, 1
15150 pinsrb m0, [r4 + 29], 0
15151 movu [r0 + 1082 * 16], m0
15152 pslldq m1, 1
15153 pinsrb m1, [r4 + 13], 0
15154 movu [r0 + 1083 * 16], m1
15155
15156 ;mode 18[row 30]: same step, m0 takes [r4 + 30], m1 takes [r4 + 14]
15157 pslldq m0, 1
15158 pinsrb m0, [r4 + 30], 0
15159 movu [r0 + 1084 * 16], m0
15160 pslldq m1, 1
15161 pinsrb m1, [r4 + 14], 0
15162 movu [r0 + 1085 * 16], m1
15163
15164 ;mode 18[row 31]: last row of this mode, m0 takes [r4 + 31], m1 takes [r4 + 15]
15165 pslldq m0, 1
15166 pinsrb m0, [r4 + 31], 0
15167 movu [r0 + 1086 * 16], m0
15168 pslldq m1, 1
15169 pinsrb m1, [r4 + 15], 0
15170 movu [r0 + 1087 * 16], m1
15171
15172 ; mode 19 [row 0]: build four vectors of adjacent byte pairs from r3 (m0, m2, m1, m3) and blend each
15173 movu m6, [r5 + 6 * 16] ; multipliers for this row
15174 movu m0, [r3 ]
15175 movu m1, [r3 + 1 ]
15176 punpcklbw m0, m1 ; m0 = pairs ([r3 + i], [r3 + i + 1]) for i = 0..7
15177 pmaddubsw m1, m0, m6 ; pairwise multiply-add to words
15178 pmulhrsw m1, m7 ; rounded scale by m7
15179 movu m2, [r3 + 8]
15180 movu m3, [r3 + 9]
15181 punpcklbw m2, m3 ; m2 = pairs for i = 8..15
15182 pmaddubsw m3, m2, m6
15183 pmulhrsw m3, m7
15184 packuswb m1, m3 ; words to saturated bytes
15185 movu [r0 + 1088 * 16], m1 ; row bytes 0-15
15186
15187 movu m1, [r3 + 16]
15188 movu m3, [r3 + 17]
15189 punpcklbw m1, m3 ; m1 = pairs for i = 16..23
15190 pmaddubsw m4, m1, m6
15191 pmulhrsw m4, m7
15192 movu m3, [r3 + 24]
15193 movu m5, [r3 + 25]
15194 punpcklbw m3, m5 ; m3 = pairs for i = 24..31
15195 pmaddubsw m5, m3, m6
15196 pmulhrsw m5, m7
15197 packuswb m4, m5
15198 movu [r0 + 1089 * 16], m4 ; row bytes 16-31
15199
15200 ; mode 19 [row 1]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15201 movu m6, [r5 + 12 * 16] ; multipliers for this row
15202 pslldq m0, 2 ; free lanes 0-1
15203 pinsrb m0, [r4 + 0], 1
15204 pinsrb m0, [r4 + 1], 0
15205 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15206 pmulhrsw m4, m7 ; rounded scale by m7
15207 pslldq m2, 2
15208 pinsrw m2, [r3 + 7], 0 ; one word insert fills lanes 0-1 from [r3 + 7], [r3 + 8]
15209 pmaddubsw m5, m2, m6
15210 pmulhrsw m5, m7
15211 packuswb m4, m5 ; words to saturated bytes
15212 movu [r0 + 1090 * 16], m4 ; row bytes 0-15
15213 pslldq m1, 2
15214 pinsrw m1, [r3 + 15], 0
15215 pmaddubsw m4, m1, m6
15216 pmulhrsw m4, m7
15217 pslldq m3, 2
15218 pinsrw m3, [r3 + 23], 0
15219 pmaddubsw m5, m3, m6
15220 pmulhrsw m5, m7
15221 packuswb m4, m5
15222 movu [r0 + 1091 * 16], m4 ; row bytes 16-31
15223
15224 ; mode 19 [row 2]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15225 movu m6, [r5 + 18 * 16] ; multipliers for this row
15226 pslldq m0, 2 ; free lanes 0-1
15227 pinsrb m0, [r4 + 1], 1
15228 pinsrb m0, [r4 + 2], 0
15229 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15230 pmulhrsw m4, m7 ; rounded scale by m7
15231 pslldq m2, 2
15232 pinsrw m2, [r3 + 6], 0 ; one word insert fills lanes 0-1 from [r3 + 6], [r3 + 7]
15233 pmaddubsw m5, m2, m6
15234 pmulhrsw m5, m7
15235 packuswb m4, m5 ; words to saturated bytes
15236 movu [r0 + 1092 * 16], m4 ; row bytes 0-15
15237 pslldq m1, 2
15238 pinsrw m1, [r3 + 14], 0
15239 pmaddubsw m4, m1, m6
15240 pmulhrsw m4, m7
15241 pslldq m3, 2
15242 pinsrw m3, [r3 + 22], 0
15243 pmaddubsw m5, m3, m6
15244 pmulhrsw m5, m7
15245 packuswb m4, m5
15246 movu [r0 + 1093 * 16], m4 ; row bytes 16-31
15247
15248 ; mode 19 [row 3]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15249 movu m6, [r5 + 24 * 16] ; multipliers for this row
15250 pslldq m0, 2 ; free lanes 0-1
15251 pinsrb m0, [r4 + 2], 1
15252 pinsrb m0, [r4 + 4], 0 ; the new pair skips [r4 + 3]
15253 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15254 pmulhrsw m4, m7 ; rounded scale by m7
15255 pslldq m2, 2
15256 pinsrw m2, [r3 + 5], 0 ; one word insert fills lanes 0-1 from [r3 + 5], [r3 + 6]
15257 pmaddubsw m5, m2, m6
15258 pmulhrsw m5, m7
15259 packuswb m4, m5 ; words to saturated bytes
15260 movu [r0 + 1094 * 16], m4 ; row bytes 0-15
15261 pslldq m1, 2
15262 pinsrw m1, [r3 + 13], 0
15263 pmaddubsw m4, m1, m6
15264 pmulhrsw m4, m7
15265 pslldq m3, 2
15266 pinsrw m3, [r3 + 21], 0
15267 pmaddubsw m5, m3, m6
15268 pmulhrsw m5, m7
15269 packuswb m4, m5
15270 movu [r0 + 1095 * 16], m4 ; row bytes 16-31
15271
15272 ; mode 19 [row 4]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15273 movu m6, [r5 + 30 * 16] ; multipliers for this row
15274 pslldq m0, 2 ; free lanes 0-1
15275 pinsrb m0, [r4 + 4], 1
15276 pinsrb m0, [r4 + 5], 0
15277 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15278 pmulhrsw m4, m7 ; rounded scale by m7
15279 pslldq m2, 2
15280 pinsrw m2, [r3 + 4], 0 ; one word insert fills lanes 0-1 from [r3 + 4], [r3 + 5]
15281 pmaddubsw m5, m2, m6
15282 pmulhrsw m5, m7
15283 packuswb m4, m5 ; words to saturated bytes
15284 movu [r0 + 1096 * 16], m4 ; row bytes 0-15
15285 pslldq m1, 2
15286 pinsrw m1, [r3 + 12], 0
15287 pmaddubsw m4, m1, m6
15288 pmulhrsw m4, m7
15289 pslldq m3, 2
15290 pinsrw m3, [r3 + 20], 0
15291 pmaddubsw m5, m3, m6
15292 pmulhrsw m5, m7
15293 packuswb m4, m5
15294 movu [r0 + 1097 * 16], m4 ; row bytes 16-31
15295
15296 ; mode 19 [row 5]: no shift; m0/m2/m1/m3 carry over unchanged, only the multipliers change
15297 movu m6, [r5 + 4 * 16] ; multipliers for this row
15298 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15299 pmulhrsw m4, m7 ; rounded scale by m7
15300 pmaddubsw m5, m2, m6
15301 pmulhrsw m5, m7
15302 packuswb m4, m5 ; words to saturated bytes
15303 movu [r0 + 1098 * 16], m4 ; row bytes 0-15
15304 pmaddubsw m4, m1, m6
15305 pmulhrsw m4, m7
15306 pmaddubsw m5, m3, m6
15307 pmulhrsw m5, m7
15308 packuswb m4, m5
15309 movu [r0 + 1099 * 16], m4 ; row bytes 16-31
15310
15311 ; mode 19 [row 6]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15312 movu m6, [r5 + 10 * 16] ; multipliers for this row
15313 pslldq m0, 2 ; free lanes 0-1
15314 pinsrb m0, [r4 + 5], 1
15315 pinsrb m0, [r4 + 6], 0
15316 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15317 pmulhrsw m4, m7 ; rounded scale by m7
15318 pslldq m2, 2
15319 pinsrw m2, [r3 + 3], 0 ; one word insert fills lanes 0-1 from [r3 + 3], [r3 + 4]
15320 pmaddubsw m5, m2, m6
15321 pmulhrsw m5, m7
15322 packuswb m4, m5 ; words to saturated bytes
15323 movu [r0 + 1100 * 16], m4 ; row bytes 0-15
15324 pslldq m1, 2
15325 pinsrw m1, [r3 + 11], 0
15326 pmaddubsw m4, m1, m6
15327 pmulhrsw m4, m7
15328 pslldq m3, 2
15329 pinsrw m3, [r3 + 19], 0
15330 pmaddubsw m5, m3, m6
15331 pmulhrsw m5, m7
15332 packuswb m4, m5
15333 movu [r0 + 1101 * 16], m4 ; row bytes 16-31
15334
15335 ; mode 19 [row 7]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15336 movu m6, [r5 + 16 * 16] ; multipliers for this row
15337 pslldq m0, 2 ; free lanes 0-1
15338 pinsrb m0, [r4 + 6], 1
15339 pinsrb m0, [r4 + 7], 0
15340 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15341 pmulhrsw m4, m7 ; rounded scale by m7
15342 pslldq m2, 2
15343 pinsrw m2, [r3 + 2], 0 ; one word insert fills lanes 0-1 from [r3 + 2], [r3 + 3]
15344 pmaddubsw m5, m2, m6
15345 pmulhrsw m5, m7
15346 packuswb m4, m5 ; words to saturated bytes
15347 movu [r0 + 1102 * 16], m4 ; row bytes 0-15
15348 pslldq m1, 2
15349 pinsrw m1, [r3 + 10], 0
15350 pmaddubsw m4, m1, m6
15351 pmulhrsw m4, m7
15352 pslldq m3, 2
15353 pinsrw m3, [r3 + 18], 0
15354 pmaddubsw m5, m3, m6
15355 pmulhrsw m5, m7
15356 packuswb m4, m5
15357 movu [r0 + 1103 * 16], m4 ; row bytes 16-31
15358
15359 ; mode 19 [row 8]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15360 movu m6, [r5 + 22 * 16] ; multipliers for this row
15361 pslldq m0, 2 ; free lanes 0-1
15362 pinsrb m0, [r4 + 7], 1
15363 pinsrb m0, [r4 + 9], 0 ; the new pair skips [r4 + 8]
15364 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15365 pmulhrsw m4, m7 ; rounded scale by m7
15366 pslldq m2, 2
15367 pinsrw m2, [r3 + 1], 0 ; one word insert fills lanes 0-1 from [r3 + 1], [r3 + 2]
15368 pmaddubsw m5, m2, m6
15369 pmulhrsw m5, m7
15370 packuswb m4, m5 ; words to saturated bytes
15371 movu [r0 + 1104 * 16], m4 ; row bytes 0-15
15372 pslldq m1, 2
15373 pinsrw m1, [r3 + 9], 0
15374 pmaddubsw m4, m1, m6
15375 pmulhrsw m4, m7
15376 pslldq m3, 2
15377 pinsrw m3, [r3 + 17], 0
15378 pmaddubsw m5, m3, m6
15379 pmulhrsw m5, m7
15380 packuswb m4, m5
15381 movu [r0 + 1105 * 16], m4 ; row bytes 16-31
15382
15383 ; mode 19 [row 9]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15384 movu m6, [r5 + 28 * 16] ; multipliers for this row
15385 pslldq m0, 2 ; free lanes 0-1
15386 pinsrb m0, [r4 + 9], 1
15387 pinsrb m0, [r4 + 10], 0
15388 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15389 pmulhrsw m4, m7 ; rounded scale by m7
15390 pslldq m2, 2
15391 pinsrw m2, [r3 + 0], 0 ; one word insert fills lanes 0-1 from [r3 + 0], [r3 + 1]
15392 pmaddubsw m5, m2, m6
15393 pmulhrsw m5, m7
15394 packuswb m4, m5 ; words to saturated bytes
15395 movu [r0 + 1106 * 16], m4 ; row bytes 0-15
15396 pslldq m1, 2
15397 pinsrw m1, [r3 + 8], 0
15398 pmaddubsw m4, m1, m6
15399 pmulhrsw m4, m7
15400 pslldq m3, 2
15401 pinsrw m3, [r3 + 16], 0
15402 pmaddubsw m5, m3, m6
15403 pmulhrsw m5, m7
15404 packuswb m4, m5
15405 movu [r0 + 1107 * 16], m4 ; row bytes 16-31
15406
15407 ; mode 19 [row 10]: no shift; m0/m2/m1/m3 carry over unchanged, only the multipliers change
15408 movu m6, [r5 + 2 * 16] ; multipliers for this row
15409 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15410 pmulhrsw m4, m7 ; rounded scale by m7
15411 pmaddubsw m5, m2, m6
15412 pmulhrsw m5, m7
15413 packuswb m4, m5 ; words to saturated bytes
15414 movu [r0 + 1108 * 16], m4 ; row bytes 0-15
15415 pmaddubsw m4, m1, m6
15416 pmulhrsw m4, m7
15417 pmaddubsw m5, m3, m6
15418 pmulhrsw m5, m7
15419 packuswb m4, m5
15420 movu [r0 + 1109 * 16], m4 ; row bytes 16-31
15421
15422 ; mode 19 [row 11]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15423 movu m6, [r5 + 8 * 16] ; multipliers for this row
15424 pslldq m0, 2 ; free lanes 0-1
15425 pinsrb m0, [r4 + 10], 1
15426 pinsrb m0, [r4 + 11], 0
15427 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15428 pmulhrsw m4, m7 ; rounded scale by m7
15429 pslldq m2, 2
15430 pinsrb m2, [r3 + 0], 1 ; this pair takes lane 1 from r3 and lane 0 from r4
15431 pinsrb m2, [r4 + 1], 0
15432 pmaddubsw m5, m2, m6
15433 pmulhrsw m5, m7
15434 packuswb m4, m5 ; words to saturated bytes
15435 movu [r0 + 1110 * 16], m4 ; row bytes 0-15
15436 pslldq m1, 2
15437 pinsrw m1, [r3 + 7], 0 ; one word insert fills lanes 0-1 from [r3 + 7], [r3 + 8]
15438 pmaddubsw m4, m1, m6
15439 pmulhrsw m4, m7
15440 pslldq m3, 2
15441 pinsrw m3, [r3 + 15], 0
15442 pmaddubsw m5, m3, m6
15443 pmulhrsw m5, m7
15444 packuswb m4, m5
15445 movu [r0 + 1111 * 16], m4 ; row bytes 16-31
15446
15447 ; mode 19 [row 12]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15448 movu m6, [r5 + 14 * 16] ; multipliers for this row
15449 pslldq m0, 2 ; free lanes 0-1
15450 pinsrb m0, [r4 + 11], 1
15451 pinsrb m0, [r4 + 12], 0
15452 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15453 pmulhrsw m4, m7 ; rounded scale by m7
15454 pslldq m2, 2
15455 pinsrb m2, [r4 + 1], 1
15456 pinsrb m2, [r4 + 2], 0
15457 pmaddubsw m5, m2, m6
15458 pmulhrsw m5, m7
15459 packuswb m4, m5 ; words to saturated bytes
15460 movu [r0 + 1112 * 16], m4 ; row bytes 0-15
15461 pslldq m1, 2
15462 pinsrw m1, [r3 + 6], 0 ; one word insert fills lanes 0-1 from [r3 + 6], [r3 + 7]
15463 pmaddubsw m4, m1, m6
15464 pmulhrsw m4, m7
15465 pslldq m3, 2
15466 pinsrw m3, [r3 + 14], 0
15467 pmaddubsw m5, m3, m6
15468 pmulhrsw m5, m7
15469 packuswb m4, m5
15470 movu [r0 + 1113 * 16], m4 ; row bytes 16-31
15471
15472 ; mode 19 [row 13]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15473 movu m6, [r5 + 20 * 16] ; multipliers for this row
15474 pslldq m0, 2 ; free lanes 0-1
15475 pinsrb m0, [r4 + 12], 1
15476 pinsrb m0, [r4 + 14], 0 ; the new pair skips [r4 + 13]
15477 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15478 pmulhrsw m4, m7 ; rounded scale by m7
15479 pslldq m2, 2
15480 pinsrb m2, [r4 + 2], 1
15481 pinsrb m2, [r4 + 4], 0 ; the new pair skips [r4 + 3]
15482 pmaddubsw m5, m2, m6
15483 pmulhrsw m5, m7
15484 packuswb m4, m5 ; words to saturated bytes
15485 movu [r0 + 1114 * 16], m4 ; row bytes 0-15
15486 pslldq m1, 2
15487 pinsrw m1, [r3 + 5], 0 ; one word insert fills lanes 0-1 from [r3 + 5], [r3 + 6]
15488 pmaddubsw m4, m1, m6
15489 pmulhrsw m4, m7
15490 pslldq m3, 2
15491 pinsrw m3, [r3 + 13], 0
15492 pmaddubsw m5, m3, m6
15493 pmulhrsw m5, m7
15494 packuswb m4, m5
15495 movu [r0 + 1115 * 16], m4 ; row bytes 16-31
15496
15497 ; mode 19 [row 14]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15498 movu m6, [r5 + 26 * 16] ; multipliers for this row
15499 pslldq m0, 2 ; free lanes 0-1
15500 pinsrb m0, [r4 + 14], 1
15501 pinsrb m0, [r4 + 15], 0
15502 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15503 pmulhrsw m4, m7 ; rounded scale by m7
15504 pslldq m2, 2
15505 pinsrb m2, [r4 + 4], 1
15506 pinsrb m2, [r4 + 5], 0
15507 pmaddubsw m5, m2, m6
15508 pmulhrsw m5, m7
15509 packuswb m4, m5 ; words to saturated bytes
15510 movu [r0 + 1116 * 16], m4 ; row bytes 0-15
15511 pslldq m1, 2
15512 pinsrw m1, [r3 + 4], 0 ; one word insert fills lanes 0-1 from [r3 + 4], [r3 + 5]
15513 pmaddubsw m4, m1, m6
15514 pmulhrsw m4, m7
15515 pslldq m3, 2
15516 pinsrw m3, [r3 + 12], 0
15517 pmaddubsw m5, m3, m6
15518 pmulhrsw m5, m7
15519 packuswb m4, m5
15520 movu [r0 + 1117 * 16], m4 ; row bytes 16-31
15521
15522 ; mode19 [row 15]: no multiply; each source vector is byte-shuffled through tab_S2 and 8 bytes are stored
15523 pshufb m5, m0, [tab_S2] ; tab_S2 is defined outside this excerpt; presumably keeps one byte per pair - TODO confirm
15524 movh [r0 + 1118 * 16], m5 ; row bytes 0-7
15525 pshufb m5, m2, [tab_S2]
15526 movh [r0 + 1118 * 16 + 8], m5 ; row bytes 8-15
15527 pshufb m5, m1, [tab_S2]
15528 movh [r0 + 1119 * 16], m5 ; row bytes 16-23
15529 pshufb m5, m3, [tab_S2]
15530 movh [r0 + 1119 * 16 + 8], m5 ; row bytes 24-31
15531
15532 ; mode 19 [row 16]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15533 movu m6, [r5 + 6 * 16] ; multipliers for this row
15534 pslldq m0, 2 ; free lanes 0-1
15535 pinsrb m0, [r4 + 15], 1
15536 pinsrb m0, [r4 + 16], 0
15537 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15538 pmulhrsw m4, m7 ; rounded scale by m7
15539 pslldq m2, 2
15540 pinsrb m2, [r4 + 5], 1
15541 pinsrb m2, [r4 + 6], 0
15542 pmaddubsw m5, m2, m6
15543 pmulhrsw m5, m7
15544 packuswb m4, m5 ; words to saturated bytes
15545 movu [r0 + 1120 * 16], m4 ; row bytes 0-15
15546 pslldq m1, 2
15547 pinsrw m1, [r3 + 3], 0 ; one word insert fills lanes 0-1 from [r3 + 3], [r3 + 4]
15548 pmaddubsw m4, m1, m6
15549 pmulhrsw m4, m7
15550 pslldq m3, 2
15551 pinsrw m3, [r3 + 11], 0
15552 pmaddubsw m5, m3, m6
15553 pmulhrsw m5, m7
15554 packuswb m4, m5
15555 movu [r0 + 1121 * 16], m4 ; row bytes 16-31
15556
15557 ; mode 19 [row 17]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15558 movu m6, [r5 + 12 * 16] ; multipliers for this row
15559 pslldq m0, 2 ; free lanes 0-1
15560 pinsrb m0, [r4 + 16], 1
15561 pinsrb m0, [r4 + 17], 0
15562 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15563 pmulhrsw m4, m7 ; rounded scale by m7
15564 pslldq m2, 2
15565 pinsrb m2, [r4 + 6], 1
15566 pinsrb m2, [r4 + 7], 0
15567 pmaddubsw m5, m2, m6
15568 pmulhrsw m5, m7
15569 packuswb m4, m5 ; words to saturated bytes
15570 movu [r0 + 1122 * 16], m4 ; row bytes 0-15
15571 pslldq m1, 2
15572 pinsrw m1, [r3 + 2], 0 ; one word insert fills lanes 0-1 from [r3 + 2], [r3 + 3]
15573 pmaddubsw m4, m1, m6
15574 pmulhrsw m4, m7
15575 pslldq m3, 2
15576 pinsrw m3, [r3 + 10], 0
15577 pmaddubsw m5, m3, m6
15578 pmulhrsw m5, m7
15579 packuswb m4, m5
15580 movu [r0 + 1123 * 16], m4 ; row bytes 16-31
15581
15582 ; mode 19 [row 18]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15583 movu m6, [r5 + 18 * 16] ; multipliers for this row
15584 pslldq m0, 2 ; free lanes 0-1
15585 pinsrb m0, [r4 + 17], 1
15586 pinsrb m0, [r4 + 18], 0
15587 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15588 pmulhrsw m4, m7 ; rounded scale by m7
15589 pslldq m2, 2
15590 pinsrb m2, [r4 + 7], 1
15591 pinsrb m2, [r4 + 9], 0 ; the new pair skips [r4 + 8]
15592 pmaddubsw m5, m2, m6
15593 pmulhrsw m5, m7
15594 packuswb m4, m5 ; words to saturated bytes
15595 movu [r0 + 1124 * 16], m4 ; row bytes 0-15
15596 pslldq m1, 2
15597 pinsrw m1, [r3 + 1], 0 ; one word insert fills lanes 0-1 from [r3 + 1], [r3 + 2]
15598 pmaddubsw m4, m1, m6
15599 pmulhrsw m4, m7
15600 pslldq m3, 2
15601 pinsrw m3, [r3 + 9], 0
15602 pmaddubsw m5, m3, m6
15603 pmulhrsw m5, m7
15604 packuswb m4, m5
15605 movu [r0 + 1125 * 16], m4 ; row bytes 16-31
15606
15607 ; mode 19 [row 19]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15608 movu m6, [r5 + 24 * 16] ; multipliers for this row
15609 pslldq m0, 2 ; free lanes 0-1
15610 pinsrb m0, [r4 + 18], 1
15611 pinsrb m0, [r4 + 20], 0 ; the new pair skips [r4 + 19]
15612 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15613 pmulhrsw m4, m7 ; rounded scale by m7
15614 pslldq m2, 2
15615 pinsrb m2, [r4 + 9], 1
15616 pinsrb m2, [r4 + 10], 0
15617 pmaddubsw m5, m2, m6
15618 pmulhrsw m5, m7
15619 packuswb m4, m5 ; words to saturated bytes
15620 movu [r0 + 1126 * 16], m4 ; row bytes 0-15
15621 pslldq m1, 2
15622 pinsrw m1, [r3 + 0], 0 ; one word insert fills lanes 0-1 from [r3 + 0], [r3 + 1]
15623 pmaddubsw m4, m1, m6
15624 pmulhrsw m4, m7
15625 pslldq m3, 2
15626 pinsrw m3, [r3 + 8], 0
15627 pmaddubsw m5, m3, m6
15628 pmulhrsw m5, m7
15629 packuswb m4, m5
15630 movu [r0 + 1127 * 16], m4 ; row bytes 16-31
15631
15632 ; mode 19 [row 20]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15633 movu m6, [r5 + 30 * 16] ; multipliers for this row
15634 pslldq m0, 2 ; free lanes 0-1
15635 pinsrb m0, [r4 + 20], 1
15636 pinsrb m0, [r4 + 21], 0
15637 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15638 pmulhrsw m4, m7 ; rounded scale by m7
15639 pslldq m2, 2
15640 pinsrb m2, [r4 + 10], 1
15641 pinsrb m2, [r4 + 11], 0
15642 pmaddubsw m5, m2, m6
15643 pmulhrsw m5, m7
15644 packuswb m4, m5 ; words to saturated bytes
15645 movu [r0 + 1128 * 16], m4 ; row bytes 0-15
15646 pslldq m1, 2
15647 pinsrb m1, [r4 + 0], 1 ; from this row on m1 is refilled byte-wise from r4
15648 pinsrb m1, [r4 + 1], 0
15649 pmaddubsw m4, m1, m6
15650 pmulhrsw m4, m7
15651 pslldq m3, 2
15652 ; one word insert puts [r3 + 7] in lane 0 and [r3 + 8] in lane 1, as the neighbouring rows do
15653 pinsrw m3, [r3 + 7], 0
15654 pmaddubsw m5, m3, m6
15655 pmulhrsw m5, m7
15656 packuswb m4, m5
15657 movu [r0 + 1129 * 16], m4 ; row bytes 16-31
15658
15659 ; mode 19 [row 21]: no shift; m0/m2/m1/m3 carry over unchanged, only the multipliers change
15660 movu m6, [r5 + 4 * 16] ; multipliers for this row
15661 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15662 pmulhrsw m4, m7 ; rounded scale by m7
15663 pmaddubsw m5, m2, m6
15664 pmulhrsw m5, m7
15665 packuswb m4, m5 ; words to saturated bytes
15666 movu [r0 + 1130 * 16], m4 ; row bytes 0-15
15667 pmaddubsw m4, m1, m6
15668 pmulhrsw m4, m7
15669 pmaddubsw m5, m3, m6
15670 pmulhrsw m5, m7
15671 packuswb m4, m5
15672 movu [r0 + 1131 * 16], m4 ; row bytes 16-31
15673
15674 ; mode 19 [row 22]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15675 movu m6, [r5 + 10 * 16] ; multipliers for this row
15676 pslldq m0, 2 ; free lanes 0-1
15677 pinsrb m0, [r4 + 21], 1
15678 pinsrb m0, [r4 + 22], 0
15679 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15680 pmulhrsw m4, m7 ; rounded scale by m7
15681 pslldq m2, 2
15682 pinsrb m2, [r4 + 11], 1
15683 pinsrb m2, [r4 + 12], 0
15684 pmaddubsw m5, m2, m6
15685 pmulhrsw m5, m7
15686 packuswb m4, m5 ; words to saturated bytes
15687 movu [r0 + 1132 * 16], m4 ; row bytes 0-15
15688 pslldq m1, 2
15689 pinsrb m1, [r4 + 1], 1
15690 pinsrb m1, [r4 + 2], 0
15691 pmaddubsw m4, m1, m6
15692 pmulhrsw m4, m7
15693 pslldq m3, 2
15694 pinsrw m3, [r3 + 6], 0 ; one word insert fills lanes 0-1 from [r3 + 6], [r3 + 7]
15695 pmaddubsw m5, m3, m6
15696 pmulhrsw m5, m7
15697 packuswb m4, m5
15698 movu [r0 + 1133 * 16], m4 ; row bytes 16-31
15699
15700 ; mode 19 [row 23]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15701 movu m6, [r5 + 16 * 16] ; multipliers for this row
15702 pslldq m0, 2 ; free lanes 0-1
15703 pinsrb m0, [r4 + 22], 1
15704 pinsrb m0, [r4 + 23], 0
15705 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15706 pmulhrsw m4, m7 ; rounded scale by m7
15707 pslldq m2, 2
15708 pinsrb m2, [r4 + 12], 1
15709 pinsrb m2, [r4 + 14], 0 ; the new pair skips [r4 + 13]
15710 pmaddubsw m5, m2, m6
15711 pmulhrsw m5, m7
15712 packuswb m4, m5 ; words to saturated bytes
15713 movu [r0 + 1134 * 16], m4 ; row bytes 0-15
15714 pslldq m1, 2
15715 pinsrb m1, [r4 + 2], 1
15716 pinsrb m1, [r4 + 4], 0 ; the new pair skips [r4 + 3]
15717 pmaddubsw m4, m1, m6
15718 pmulhrsw m4, m7
15719 pslldq m3, 2
15720 pinsrw m3, [r3 + 5], 0 ; one word insert fills lanes 0-1 from [r3 + 5], [r3 + 6]
15721 pmaddubsw m5, m3, m6
15722 pmulhrsw m5, m7
15723 packuswb m4, m5
15724 movu [r0 + 1135 * 16], m4 ; row bytes 16-31
15725
15726 ; mode 19 [row 24]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15727 movu m6, [r5 + 22 * 16] ; multipliers for this row
15728 pslldq m0, 2 ; free lanes 0-1
15729 pinsrb m0, [r4 + 23], 1
15730 pinsrb m0, [r4 + 25], 0 ; the new pair skips [r4 + 24]
15731 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15732 pmulhrsw m4, m7 ; rounded scale by m7
15733 pslldq m2, 2
15734 pinsrb m2, [r4 + 14], 1
15735 pinsrb m2, [r4 + 15], 0
15736 pmaddubsw m5, m2, m6
15737 pmulhrsw m5, m7
15738 packuswb m4, m5 ; words to saturated bytes
15739 movu [r0 + 1136 * 16], m4 ; row bytes 0-15
15740 pslldq m1, 2
15741 pinsrb m1, [r4 + 4], 1
15742 pinsrb m1, [r4 + 5], 0
15743 pmaddubsw m4, m1, m6
15744 pmulhrsw m4, m7
15745 pslldq m3, 2
15746 pinsrw m3, [r3 + 4], 0 ; one word insert fills lanes 0-1 from [r3 + 4], [r3 + 5]
15747 pmaddubsw m5, m3, m6
15748 pmulhrsw m5, m7
15749 packuswb m4, m5
15750 movu [r0 + 1137 * 16], m4 ; row bytes 16-31
15751
15752 ; mode 19 [row 25]: shift every source vector up 2 bytes, refill lanes 0-1, blend with new multipliers
15753 movu m6, [r5 + 28 * 16] ; multipliers for this row
15754 pslldq m0, 2 ; free lanes 0-1
15755 pinsrb m0, [r4 + 25], 1
15756 pinsrb m0, [r4 + 26], 0
15757 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15758 pmulhrsw m4, m7 ; rounded scale by m7
15759 pslldq m2, 2
15760 pinsrb m2, [r4 + 15], 1
15761 pinsrb m2, [r4 + 16], 0
15762 pmaddubsw m5, m2, m6
15763 pmulhrsw m5, m7
15764 packuswb m4, m5 ; words to saturated bytes
15765 movu [r0 + 1138 * 16], m4 ; row bytes 0-15
15766 pslldq m1, 2
15767 pinsrb m1, [r4 + 5], 1
15768 pinsrb m1, [r4 + 6], 0
15769 pmaddubsw m4, m1, m6
15770 pmulhrsw m4, m7
15771 pslldq m3, 2
15772 pinsrw m3, [r3 + 3], 0 ; one word insert fills lanes 0-1 from [r3 + 3], [r3 + 4]
15773 pmaddubsw m5, m3, m6
15774 pmulhrsw m5, m7
15775 packuswb m4, m5
15776 movu [r0 + 1139 * 16], m4 ; row bytes 16-31
15777
15778 ; mode 19 [row 26]: no shift; m0/m2/m1/m3 carry over unchanged, only the multipliers change
15779 movu m6, [r5 + 2 * 16] ; multipliers for this row
15780 pmaddubsw m4, m0, m6 ; pairwise multiply-add to words
15781 pmulhrsw m4, m7 ; rounded scale by m7
15782 pmaddubsw m5, m2, m6
15783 pmulhrsw m5, m7
15784 packuswb m4, m5 ; words to saturated bytes
15785 movu [r0 + 1140 * 16], m4 ; row bytes 0-15
15786 pmaddubsw m4, m1, m6
15787 pmulhrsw m4, m7
15788 pmaddubsw m5, m3, m6
15789 pmulhrsw m5, m7
15790 packuswb m4, m5
15791 movu [r0 + 1141 * 16], m4 ; row bytes 16-31
15792
15793 ; mode 19 [row 27]
15794 movu m6, [r5 + 8 * 16]
15795 pslldq m0, 2
15796 pinsrb m0, [r4 + 26], 1
15797 pinsrb m0, [r4 + 27], 0
15798 pmaddubsw m4, m0, m6
15799 pmulhrsw m4, m7
15800 pslldq m2, 2
15801 pinsrb m2, [r4 + 16], 1
15802 pinsrb m2, [r4 + 17], 0
15803 pmaddubsw m5, m2, m6
15804 pmulhrsw m5, m7
15805 packuswb m4, m5
15806 movu [r0 + 1142 * 16], m4
15807 pslldq m1, 2
15808 pinsrb m1, [r4 + 6], 1
15809 pinsrb m1, [r4 + 7], 0
15810 pmaddubsw m4, m1, m6
15811 pmulhrsw m4, m7
15812 pslldq m3, 2
15813 pinsrw m3, [r3 + 2], 0
15814 pmaddubsw m5, m3, m6
15815 pmulhrsw m5, m7
15816 packuswb m4, m5
15817 movu [r0 + 1143 * 16], m4
15818
15819 ; mode 19 [row 28]
15820 movu m6, [r5 + 14 * 16]
15821 pslldq m0, 2
15822 pinsrb m0, [r4 + 27], 1
15823 pinsrb m0, [r4 + 28], 0
15824 pmaddubsw m4, m0, m6
15825 pmulhrsw m4, m7
15826 pslldq m2, 2
15827 pinsrb m2, [r4 + 17], 1
15828 pinsrb m2, [r4 + 18], 0
15829 pmaddubsw m5, m2, m6
15830 pmulhrsw m5, m7
15831 packuswb m4, m5
15832 movu [r0 + 1144 * 16], m4
15833 pslldq m1, 2
15834 pinsrb m1, [r4 + 7], 1
15835 pinsrb m1, [r4 + 9], 0
15836 pmaddubsw m4, m1, m6
15837 pmulhrsw m4, m7
15838 pslldq m3, 2
15839 pinsrw m3, [r3 + 1], 0
15840 pmaddubsw m5, m3, m6
15841 pmulhrsw m5, m7
15842 packuswb m4, m5
15843 movu [r0 + 1145 * 16], m4
15844
15845 ; mode 19 [row 29]
15846 movu m6, [r5 + 20 * 16]
15847 pslldq m0, 2
15848 pinsrb m0, [r4 + 28], 1
15849 pinsrb m0, [r4 + 30], 0
15850 pmaddubsw m4, m0, m6
15851 pmulhrsw m4, m7
15852 pslldq m2, 2
15853 pinsrb m2, [r4 + 18], 1
15854 pinsrb m2, [r4 + 20], 0
15855 pmaddubsw m5, m2, m6
15856 pmulhrsw m5, m7
15857 packuswb m4, m5
15858 movu [r0 + 1146 * 16], m4
15859 pslldq m1, 2
15860 pinsrb m1, [r4 + 9], 1
15861 pinsrb m1, [r4 + 10], 0
15862 pmaddubsw m4, m1, m6
15863 pmulhrsw m4, m7
15864 pslldq m3, 2
15865 pinsrw m3, [r3 + 0], 0
15866 pmaddubsw m5, m3, m6
15867 pmulhrsw m5, m7
15868 packuswb m4, m5
15869 movu [r0 + 1147 * 16], m4
15870
15871 ; mode 19 [row 30]
15872 movu m6, [r5 + 26 * 16]
15873 pslldq m0, 2
15874 pinsrb m0, [r4 + 30], 1
15875 pinsrb m0, [r4 + 31], 0
15876 pmaddubsw m4, m0, m6
15877 pmulhrsw m4, m7
15878 pslldq m2, 2
15879 pinsrb m2, [r4 + 20], 1
15880 pinsrb m2, [r4 + 21], 0
15881 pmaddubsw m5, m2, m6
15882 pmulhrsw m5, m7
15883 packuswb m4, m5
15884 movu [r0 + 1148 * 16], m4
15885 pslldq m1, 2
15886 pinsrb m1, [r4 + 10], 1
15887 pinsrb m1, [r4 + 11], 0
15888 pmaddubsw m4, m1, m6
15889 pmulhrsw m4, m7
15890 pslldq m3, 2
15891 pinsrb m3, [r4 + 0], 1
15892 pinsrb m3, [r4 + 1], 0
15893 pmaddubsw m5, m3, m6
15894 pmulhrsw m5, m7
15895 packuswb m4, m5
15896 movu [r0 + 1149 * 16], m4
15897
15898 ; mode 19 [row 31] -- final row of this mode: no weight multiply here, each source register is only byte-shuffled and stored
15899 pshufb m5, m0, [tab_S2] ; m5 = bytes of m0 selected by the tab_S2 mask (mask defined outside this chunk; presumably keeps one byte of each pair -- confirm)
15900 movh [r0 + 1150 * 16], m5 ; low 8 bytes -> output bytes 0-7 of this 32-byte row
15901 pshufb m5, m2, [tab_S2] ; same selection applied to m2
15902 movh [r0 + 1150 * 16 + 8], m5 ; output bytes 8-15
15903 pshufb m5, m1, [tab_S2] ; same selection applied to m1
15904 movh [r0 + 1151 * 16], m5 ; output bytes 16-23
15905 pshufb m5, m3, [tab_S2] ; same selection applied to m3
15906 movh [r0 + 1151 * 16 + 8], m5 ; output bytes 24-31
15907
15908 ; mode 20 [row 0] -- reload the four pair registers from r3 and emit the first 32-byte row at r0 + 1152*16
15909 movu m6, [r5 + 11 * 16] ; m6 = 16-byte entry 11 of the table at r5 (r5 is set outside this chunk; presumably the per-row interpolation weights -- confirm)
15910 movu m0, [r3 ] ; bytes r3[0..15]
15911 movu m1, [r3 + 1 ] ; bytes r3[1..16]
15912 punpcklbw m0, m1 ; m0 = adjacent-byte pairs (r3[i], r3[i+1]) for i = 0..7
15913 pmaddubsw m1, m0, m6 ; m1 = per pair: unsigned m0 bytes times signed m6 bytes, summed into 8 words (m0 is kept intact)
15914 pmulhrsw m1, m7 ; rounded scale by m7 (m7 is loaded outside this chunk; presumably the rounding/shift constant -- confirm)
15915 movu m2, [r3 + 8] ; bytes r3[8..23]
15916 movu m3, [r3 + 9] ; bytes r3[9..24]
15917 punpcklbw m2, m3 ; m2 = pairs (r3[i], r3[i+1]) for i = 8..15
15918 pmaddubsw m3, m2, m6 ; m3 is scratch here; m2 is kept intact
15919 pmulhrsw m3, m7
15920 packuswb m1, m3 ; saturate the two word vectors into 16 output bytes
15921 movu [r0 + 1152 * 16], m1 ; output bytes 0-15 of row 0
15922
15923 movu m1, [r3 + 16] ; m1 is reused from here on as a pair register
15924 movu m3, [r3 + 17]
15925 punpcklbw m1, m3 ; m1 = pairs (r3[i], r3[i+1]) for i = 16..23
15926 pmaddubsw m4, m1, m6 ; results now go to m4/m5 so that m1/m3 survive
15927 pmulhrsw m4, m7
15928 movu m3, [r3 + 24] ; m3 is reused from here on as a pair register
15929 movu m5, [r3 + 25]
15930 punpcklbw m3, m5 ; m3 = pairs (r3[i], r3[i+1]) for i = 24..31
15931 pmaddubsw m5, m3, m6
15932 pmulhrsw m5, m7
15933 packuswb m4, m5
15934 movu [r0 + 1153 * 16], m4 ; output bytes 16-31 of row 0; pair registers are now m0, m2, m1, m3 in source order
15935
15936 ; mode 20 [row 1]
15937 movu m6, [r5 + 22 * 16]
15938 pslldq m0, 2
15939 pinsrb m0, [r4 + 0], 1
15940 pinsrb m0, [r4 + 2], 0
15941 pmaddubsw m4, m0, m6
15942 pmulhrsw m4, m7
15943 pslldq m2, 2
15944 pinsrw m2, [r3 + 7], 0
15945 pmaddubsw m5, m2, m6
15946 pmulhrsw m5, m7
15947 packuswb m4, m5
15948 movu [r0 + 1154 * 16], m4
15949 pslldq m1, 2
15950 pinsrw m1, [r3 + 15], 0
15951 pmaddubsw m4, m1, m6
15952 pmulhrsw m4, m7
15953 pslldq m3, 2
15954 pinsrw m3, [r3 + 23], 0
15955 pmaddubsw m5, m3, m6
15956 pmulhrsw m5, m7
15957 packuswb m4, m5
15958 movu [r0 + 1155 * 16], m4
15959
15960 ; mode 20 [row 2]
15961 movu m6, [r5 + 1 * 16]
15962 pmaddubsw m4, m0, m6
15963 pmulhrsw m4, m7
15964 pmaddubsw m5, m2, m6
15965 pmulhrsw m5, m7
15966 packuswb m4, m5
15967 movu [r0 + 1156 * 16], m4
15968 pmaddubsw m4, m1, m6
15969 pmulhrsw m4, m7
15970 pmaddubsw m5, m3, m6
15971 pmulhrsw m5, m7
15972 packuswb m4, m5
15973 movu [r0 + 1157 * 16], m4
15974
15975 ; mode 20 [row 3]
15976 movu m6, [r5 + 12 * 16]
15977 pslldq m0, 2
15978 pinsrb m0, [r4 + 2], 1
15979 pinsrb m0, [r4 + 3], 0
15980 pmaddubsw m4, m0, m6
15981 pmulhrsw m4, m7
15982 pslldq m2, 2
15983 pinsrw m2, [r3 + 6], 0
15984 pmaddubsw m5, m2, m6
15985 pmulhrsw m5, m7
15986 packuswb m4, m5
15987 movu [r0 + 1158 * 16], m4
15988 pslldq m1, 2
15989 pinsrw m1, [r3 + 14], 0
15990 pmaddubsw m4, m1, m6
15991 pmulhrsw m4, m7
15992 pslldq m3, 2
15993 pinsrw m3, [r3 + 22], 0
15994 pmaddubsw m5, m3, m6
15995 pmulhrsw m5, m7
15996 packuswb m4, m5
15997 movu [r0 + 1159 * 16], m4
15998
15999 ; mode 20 [row 4]
16000 movu m6, [r5 + 23 * 16]
16001 pslldq m0, 2
16002 pinsrb m0, [r4 + 3], 1
16003 pinsrb m0, [r4 + 5], 0
16004 pmaddubsw m4, m0, m6
16005 pmulhrsw m4, m7
16006 pslldq m2, 2
16007 pinsrw m2, [r3 + 5], 0
16008 pmaddubsw m5, m2, m6
16009 pmulhrsw m5, m7
16010 packuswb m4, m5
16011 movu [r0 + 1160 * 16], m4
16012 pslldq m1, 2
16013 pinsrw m1, [r3 + 13], 0
16014 pmaddubsw m4, m1, m6
16015 pmulhrsw m4, m7
16016 pslldq m3, 2
16017 pinsrw m3, [r3 + 21], 0
16018 pmaddubsw m5, m3, m6
16019 pmulhrsw m5, m7
16020 packuswb m4, m5
16021 movu [r0 + 1161 * 16], m4
16022
16023 ; mode 20 [row 5]
16024 movu m6, [r5 + 2 * 16]
16025 pmaddubsw m4, m0, m6
16026 pmulhrsw m4, m7
16027 pmaddubsw m5, m2, m6
16028 pmulhrsw m5, m7
16029 packuswb m4, m5
16030 movu [r0 + 1162 * 16], m4
16031 pmaddubsw m4, m1, m6
16032 pmulhrsw m4, m7
16033 pmaddubsw m5, m3, m6
16034 pmulhrsw m5, m7
16035 packuswb m4, m5
16036 movu [r0 + 1163 * 16], m4
16037
16038 ; mode 20 [row 6]
16039 movu m6, [r5 + 13 * 16]
16040 pslldq m0, 2
16041 pinsrb m0, [r4 + 5], 1
16042 pinsrb m0, [r4 + 6], 0
16043 pmaddubsw m4, m0, m6
16044 pmulhrsw m4, m7
16045 pslldq m2, 2
16046 pinsrw m2, [r3 + 4], 0
16047 pmaddubsw m5, m2, m6
16048 pmulhrsw m5, m7
16049 packuswb m4, m5
16050 movu [r0 + 1164 * 16], m4
16051 pslldq m1, 2
16052 pinsrw m1, [r3 + 12], 0
16053 pmaddubsw m4, m1, m6
16054 pmulhrsw m4, m7
16055 pslldq m3, 2
16056 pinsrw m3, [r3 + 20], 0
16057 pmaddubsw m5, m3, m6
16058 pmulhrsw m5, m7
16059 packuswb m4, m5
16060 movu [r0 + 1165 * 16], m4
16061
16062 ; mode 20 [row 7]
16063 movu m6, [r5 + 24 * 16]
16064 pslldq m0, 2
16065 pinsrb m0, [r4 + 6], 1
16066 pinsrb m0, [r4 + 8], 0
16067 pmaddubsw m4, m0, m6
16068 pmulhrsw m4, m7
16069 pslldq m2, 2
16070 pinsrw m2, [r3 + 3], 0
16071 pmaddubsw m5, m2, m6
16072 pmulhrsw m5, m7
16073 packuswb m4, m5
16074 movu [r0 + 1166 * 16], m4
16075 pslldq m1, 2
16076 pinsrw m1, [r3 + 11], 0
16077 pmaddubsw m4, m1, m6
16078 pmulhrsw m4, m7
16079 pslldq m3, 2
16080 pinsrw m3, [r3 + 19], 0
16081 pmaddubsw m5, m3, m6
16082 pmulhrsw m5, m7
16083 packuswb m4, m5
16084 movu [r0 + 1167 * 16], m4
16085
16086 ; mode 20 [row 8]
16087 movu m6, [r5 + 3 * 16]
16088 pmaddubsw m4, m0, m6
16089 pmulhrsw m4, m7
16090 pmaddubsw m5, m2, m6
16091 pmulhrsw m5, m7
16092 packuswb m4, m5
16093 movu [r0 + 1168 * 16], m4
16094 pmaddubsw m4, m1, m6
16095 pmulhrsw m4, m7
16096 pmaddubsw m5, m3, m6
16097 pmulhrsw m5, m7
16098 packuswb m4, m5
16099 movu [r0 + 1169 * 16], m4
16100
16101 ; mode 20 [row 9] -- all four pair registers advance by one pair; 32 output bytes go to r0 + 1170*16
16102 movu m6, [r5 + 14 * 16] ; m6 = 16-byte entry 14 of the table at r5 (presumably this row's interpolation weights -- confirm)
16103 pslldq m0, 2 ; move the existing pairs up one slot, freeing the low word
16104 pinsrb m0, [r4 + 8], 1 ; new low pair comes from the r4 buffer: byte 1 = r4[8]
16105 pinsrb m0, [r4 + 9], 0 ; byte 0 = r4[9]
16106 pmaddubsw m4, m0, m6 ; per pair: unsigned source bytes times signed m6 bytes, summed into words
16107 pmulhrsw m4, m7 ; rounded scale by m7 (set outside this chunk)
16108 pslldq m2, 2
16109 pinsrw m2, [r3 + 2], 0 ; new low pair (r3[2], r3[3]) inserted as one little-endian word: byte 0 = r3[2], byte 1 = r3[3]
16111 pmaddubsw m5, m2, m6
16112 pmulhrsw m5, m7
16113 packuswb m4, m5 ; saturate to 16 output bytes
16114 movu [r0 + 1170 * 16], m4 ; output bytes 0-15 of row 9
16115 pslldq m1, 2
16116 pinsrw m1, [r3 + 10], 0 ; new low pair (r3[10], r3[11])
16117 pmaddubsw m4, m1, m6
16118 pmulhrsw m4, m7
16119 pslldq m3, 2
16120 pinsrw m3, [r3 + 18], 0 ; new low pair (r3[18], r3[19])
16121 pmaddubsw m5, m3, m6
16122 pmulhrsw m5, m7
16123 packuswb m4, m5
16124 movu [r0 + 1171 * 16], m4 ; output bytes 16-31 of row 9
16125
16126 ; mode 20 [row 10]
16127 movu m6, [r5 + 25 * 16]
16128 pslldq m0, 2
16129 pinsrb m0, [r4 + 9], 1
16130 pinsrb m0, [r4 + 11], 0
16131 pmaddubsw m4, m0, m6
16132 pmulhrsw m4, m7
16133 pslldq m2, 2
16134 pinsrw m2, [r3 + 1], 0
16135 pmaddubsw m5, m2, m6
16136 pmulhrsw m5, m7
16137 packuswb m4, m5
16138 movu [r0 + 1172 * 16], m4
16139 pslldq m1, 2
16140 pinsrw m1, [r3 + 9], 0
16141 pmaddubsw m4, m1, m6
16142 pmulhrsw m4, m7
16143 pslldq m3, 2
16144 pinsrw m3, [r3 + 17], 0
16145 pmaddubsw m5, m3, m6
16146 pmulhrsw m5, m7
16147 packuswb m4, m5
16148 movu [r0 + 1173 * 16], m4
16149
16150 ; mode 20 [row 11]
16151 movu m6, [r5 + 4 * 16]
16152 pmaddubsw m4, m0, m6
16153 pmulhrsw m4, m7
16154 pmaddubsw m5, m2, m6
16155 pmulhrsw m5, m7
16156 packuswb m4, m5
16157 movu [r0 + 1174 * 16], m4
16158 pmaddubsw m4, m1, m6
16159 pmulhrsw m4, m7
16160 pmaddubsw m5, m3, m6
16161 pmulhrsw m5, m7
16162 packuswb m4, m5
16163 movu [r0 + 1175 * 16], m4
16164
16165 ; mode 20 [row 12] -- all four pair registers advance by one pair; 32 output bytes go to r0 + 1176*16
16166 movu m6, [r5 + 15 * 16] ; m6 = 16-byte entry 15 of the table at r5 (presumably this row's interpolation weights -- confirm)
16167 pslldq m0, 2 ; move the existing pairs up one slot, freeing the low word
16168 pinsrb m0, [r4 + 11], 1 ; new low pair comes from the r4 buffer: byte 1 = r4[11]
16169 pinsrb m0, [r4 + 12], 0 ; byte 0 = r4[12]
16170 pmaddubsw m4, m0, m6 ; per pair: unsigned source bytes times signed m6 bytes, summed into words
16171 pmulhrsw m4, m7 ; rounded scale by m7 (set outside this chunk)
16172 pslldq m2, 2
16173 pinsrw m2, [r3 + 0], 0 ; new low pair (r3[0], r3[1]) inserted as one little-endian word: byte 0 = r3[0], byte 1 = r3[1]
16175 pmaddubsw m5, m2, m6
16176 pmulhrsw m5, m7
16177 packuswb m4, m5 ; saturate to 16 output bytes
16178 movu [r0 + 1176 * 16], m4 ; output bytes 0-15 of row 12
16179 pslldq m1, 2
16180 pinsrw m1, [r3 + 8], 0 ; new low pair (r3[8], r3[9])
16181 pmaddubsw m4, m1, m6
16182 pmulhrsw m4, m7
16183 pslldq m3, 2
16184 pinsrw m3, [r3 + 16], 0 ; new low pair (r3[16], r3[17])
16185 pmaddubsw m5, m3, m6
16186 pmulhrsw m5, m7
16187 packuswb m4, m5
16188 movu [r0 + 1177 * 16], m4 ; output bytes 16-31 of row 12
16189
16190 ; mode 20 [row 13]
16191 movu m6, [r5 + 26 * 16]
16192 pslldq m0, 2
16193 pinsrb m0, [r4 + 12], 1
16194 pinsrb m0, [r4 + 14], 0
16195 pmaddubsw m4, m0, m6
16196 pmulhrsw m4, m7
16197 pslldq m2, 2
16198 pinsrb m2, [r4 + 0], 1
16199 pinsrb m2, [r4 + 2], 0
16200 pmaddubsw m5, m2, m6
16201 pmulhrsw m5, m7
16202 packuswb m4, m5
16203 movu [r0 + 1178 * 16], m4
16204 pslldq m1, 2
16205 pinsrw m1, [r3 + 7], 0
16206 pmaddubsw m4, m1, m6
16207 pmulhrsw m4, m7
16208 pslldq m3, 2
16209 pinsrw m3, [r3 + 15], 0
16210 pmaddubsw m5, m3, m6
16211 pmulhrsw m5, m7
16212 packuswb m4, m5
16213 movu [r0 + 1179 * 16], m4
16214
16215 ; mode 20 [row 14]
16216 movu m6, [r5 + 5 * 16]
16217 pmaddubsw m4, m0, m6
16218 pmulhrsw m4, m7
16219 pmaddubsw m5, m2, m6
16220 pmulhrsw m5, m7
16221 packuswb m4, m5
16222 movu [r0 + 1180 * 16], m4
16223 pmaddubsw m4, m1, m6
16224 pmulhrsw m4, m7
16225 pmaddubsw m5, m3, m6
16226 pmulhrsw m5, m7
16227 packuswb m4, m5
16228 movu [r0 + 1181 * 16], m4
16229
16230 ; mode 20 [row 15]
16231 movu m6, [r5 + 16 * 16]
16232 pslldq m0, 2
16233 pinsrb m0, [r4 + 14], 1
16234 pinsrb m0, [r4 + 15], 0
16235 pmaddubsw m4, m0, m6
16236 pmulhrsw m4, m7
16237 pslldq m2, 2
16238 pinsrb m2, [r4 + 2], 1
16239 pinsrb m2, [r4 + 3], 0
16240 pmaddubsw m5, m2, m6
16241 pmulhrsw m5, m7
16242 packuswb m4, m5
16243 movu [r0 + 1182 * 16], m4
16244 pslldq m1, 2
16245 pinsrw m1, [r3 + 6], 0
16246 pmaddubsw m4, m1, m6
16247 pmulhrsw m4, m7
16248 pslldq m3, 2
16249 pinsrw m3, [r3 + 14], 0
16250 pmaddubsw m5, m3, m6
16251 pmulhrsw m5, m7
16252 packuswb m4, m5
16253 movu [r0 + 1183 * 16], m4
16254
16255 ; mode 20 [row 16]
16256 movu m6, [r5 + 27 * 16]
16257 pslldq m0, 2
16258 pinsrb m0, [r4 + 15], 1
16259 pinsrb m0, [r4 + 17], 0
16260 pmaddubsw m4, m0, m6
16261 pmulhrsw m4, m7
16262 pslldq m2, 2
16263 pinsrb m2, [r4 + 3], 1
16264 pinsrb m2, [r4 + 5], 0
16265 pmaddubsw m5, m2, m6
16266 pmulhrsw m5, m7
16267 packuswb m4, m5
16268 movu [r0 + 1184 * 16], m4
16269 pslldq m1, 2
16270 pinsrw m1, [r3 + 5], 0
16271 pmaddubsw m4, m1, m6
16272 pmulhrsw m4, m7
16273 pslldq m3, 2
16274 pinsrw m3, [r3 + 13], 0
16275 pmaddubsw m5, m3, m6
16276 pmulhrsw m5, m7
16277 packuswb m4, m5
16278 movu [r0 + 1185 * 16], m4
16279
16280 ; mode 20 [row 17]
16281 movu m6, [r5 + 6 * 16]
16282 pmaddubsw m4, m0, m6
16283 pmulhrsw m4, m7
16284 pmaddubsw m5, m2, m6
16285 pmulhrsw m5, m7
16286 packuswb m4, m5
16287 movu [r0 + 1186 * 16], m4
16288 pmaddubsw m4, m1, m6
16289 pmulhrsw m4, m7
16290 pmaddubsw m5, m3, m6
16291 pmulhrsw m5, m7
16292 packuswb m4, m5
16293 movu [r0 + 1187 * 16], m4
16294
16295 ; mode 20 [row 18]
16296 movu m6, [r5 + 17 * 16]
16297 pslldq m0, 2
16298 pinsrb m0, [r4 + 17], 1
16299 pinsrb m0, [r4 + 18], 0
16300 pmaddubsw m4, m0, m6
16301 pmulhrsw m4, m7
16302 pslldq m2, 2
16303 pinsrb m2, [r4 + 5], 1
16304 pinsrb m2, [r4 + 6], 0
16305 pmaddubsw m5, m2, m6
16306 pmulhrsw m5, m7
16307 packuswb m4, m5
16308 movu [r0 + 1188 * 16], m4
16309 pslldq m1, 2
16310 pinsrw m1, [r3 + 4], 0
16311 pmaddubsw m4, m1, m6
16312 pmulhrsw m4, m7
16313 pslldq m3, 2
16314 pinsrw m3, [r3 + 12], 0
16315 pmaddubsw m5, m3, m6
16316 pmulhrsw m5, m7
16317 packuswb m4, m5
16318 movu [r0 + 1189 * 16], m4
16319
16320 ; mode 20 [row 19]
16321 movu m6, [r5 + 28 * 16]
16322 pslldq m0, 2
16323 pinsrb m0, [r4 + 18], 1
16324 pinsrb m0, [r4 + 20], 0
16325 pmaddubsw m4, m0, m6
16326 pmulhrsw m4, m7
16327 pslldq m2, 2
16328 pinsrb m2, [r4 + 6], 1
16329 pinsrb m2, [r4 + 8], 0
16330 pmaddubsw m5, m2, m6
16331 pmulhrsw m5, m7
16332 packuswb m4, m5
16333 movu [r0 + 1190 * 16], m4
16334 pslldq m1, 2
16335 pinsrw m1, [r3 + 3], 0
16336 pmaddubsw m4, m1, m6
16337 pmulhrsw m4, m7
16338 pslldq m3, 2
16339 pinsrw m3, [r3 + 11], 0
16340 pmaddubsw m5, m3, m6
16341 pmulhrsw m5, m7
16342 packuswb m4, m5
16343 movu [r0 + 1191 * 16], m4
16344
16345 ; mode 20 [row 20]
16346 movu m6, [r5 + 7 * 16]
16347 pmaddubsw m4, m0, m6
16348 pmulhrsw m4, m7
16349 pmaddubsw m5, m2, m6
16350 pmulhrsw m5, m7
16351 packuswb m4, m5
16352 movu [r0 + 1192 * 16], m4
16353 pmaddubsw m4, m1, m6
16354 pmulhrsw m4, m7
16355 pmaddubsw m5, m3, m6
16356 pmulhrsw m5, m7
16357 packuswb m4, m5
16358 movu [r0 + 1193 * 16], m4
16359
16360 ; mode 20 [row 21]
16361 movu m6, [r5 + 18 * 16]
16362 pslldq m0, 2
16363 pinsrb m0, [r4 + 20], 1
16364 pinsrb m0, [r4 + 21], 0
16365 pmaddubsw m4, m0, m6
16366 pmulhrsw m4, m7
16367 pslldq m2, 2
16368 pinsrb m2, [r4 + 8], 1
16369 pinsrb m2, [r4 + 9], 0
16370 pmaddubsw m5, m2, m6
16371 pmulhrsw m5, m7
16372 packuswb m4, m5
16373 movu [r0 + 1194 * 16], m4
16374 pslldq m1, 2
16375 pinsrw m1, [r3 + 2], 0
16376 pmaddubsw m4, m1, m6
16377 pmulhrsw m4, m7
16378 pslldq m3, 2
16379 pinsrw m3, [r3 + 10], 0
16380 pmaddubsw m5, m3, m6
16381 pmulhrsw m5, m7
16382 packuswb m4, m5
16383 movu [r0 + 1195 * 16], m4
16384
16385 ; mode 20 [row 22]
16386 movu m6, [r5 + 29 * 16]
16387 pslldq m0, 2
16388 pinsrb m0, [r4 + 21], 1
16389 pinsrb m0, [r4 + 23], 0
16390 pmaddubsw m4, m0, m6
16391 pmulhrsw m4, m7
16392 pslldq m2, 2
16393 pinsrb m2, [r4 + 9], 1
16394 pinsrb m2, [r4 + 11], 0
16395 pmaddubsw m5, m2, m6
16396 pmulhrsw m5, m7
16397 packuswb m4, m5
16398 movu [r0 + 1196 * 16], m4
16399 pslldq m1, 2
16400 pinsrw m1, [r3 + 1], 0
16401 pmaddubsw m4, m1, m6
16402 pmulhrsw m4, m7
16403 pslldq m3, 2
16404 pinsrw m3, [r3 + 9], 0
16405 pmaddubsw m5, m3, m6
16406 pmulhrsw m5, m7
16407 packuswb m4, m5
16408 movu [r0 + 1197 * 16], m4
16409
16410 ; mode 20 [row 23]
16411 movu m6, [r5 + 8 * 16]
16412 pmaddubsw m4, m0, m6
16413 pmulhrsw m4, m7
16414 pmaddubsw m5, m2, m6
16415 pmulhrsw m5, m7
16416 packuswb m4, m5
16417 movu [r0 + 1198 * 16], m4
16418 pmaddubsw m4, m1, m6
16419 pmulhrsw m4, m7
16420 pmaddubsw m5, m3, m6
16421 pmulhrsw m5, m7
16422 packuswb m4, m5
16423 movu [r0 + 1199 * 16], m4
16424
16425 ; mode 20 [row 24]
16426 movu m6, [r5 + 19 * 16]
16427 pslldq m0, 2
16428 pinsrb m0, [r4 + 23], 1
16429 pinsrb m0, [r4 + 24], 0
16430 pmaddubsw m4, m0, m6
16431 pmulhrsw m4, m7
16432 pslldq m2, 2
16433 pinsrb m2, [r4 + 11], 1
16434 pinsrb m2, [r4 + 12], 0
16435 pmaddubsw m5, m2, m6
16436 pmulhrsw m5, m7
16437 packuswb m4, m5
16438 movu [r0 + 1200 * 16], m4
16439 pslldq m1, 2
16440 pinsrw m1, [r3 + 0], 0
16441 pmaddubsw m4, m1, m6
16442 pmulhrsw m4, m7
16443 pslldq m3, 2
16444 pinsrw m3, [r3 + 8], 0
16445 pmaddubsw m5, m3, m6
16446 pmulhrsw m5, m7
16447 packuswb m4, m5
16448 movu [r0 + 1201 * 16], m4
16449
16450 ; mode 20 [row 25]
16451 movu m6, [r5 + 30 * 16]
16452 pslldq m0, 2
16453 pinsrb m0, [r4 + 24], 1
16454 pinsrb m0, [r4 + 26], 0
16455 pmaddubsw m4, m0, m6
16456 pmulhrsw m4, m7
16457 pslldq m2, 2
16458 pinsrb m2, [r4 + 12], 1
16459 pinsrb m2, [r4 + 14], 0
16460 pmaddubsw m5, m2, m6
16461 pmulhrsw m5, m7
16462 packuswb m4, m5
16463 movu [r0 + 1202 * 16], m4
16464 pslldq m1, 2
16465 pinsrb m1, [r4 + 0], 1
16466 pinsrb m1, [r4 + 2], 0
16467 pmaddubsw m4, m1, m6
16468 pmulhrsw m4, m7
16469 pslldq m3, 2
16470 pinsrw m3, [r3 + 7], 0
16471 pmaddubsw m5, m3, m6
16472 pmulhrsw m5, m7
16473 packuswb m4, m5
16474 movu [r0 + 1203 * 16], m4
16475
16476 ; mode 20 [row 26]
16477 movu m6, [r5 + 9 * 16]
16478 pmaddubsw m4, m0, m6
16479 pmulhrsw m4, m7
16480 pmaddubsw m5, m2, m6
16481 pmulhrsw m5, m7
16482 packuswb m4, m5
16483 movu [r0 + 1204 * 16], m4
16484 pmaddubsw m4, m1, m6
16485 pmulhrsw m4, m7
16486 pmaddubsw m5, m3, m6
16487 pmulhrsw m5, m7
16488 packuswb m4, m5
16489 movu [r0 + 1205 * 16], m4
16490
16491 ; mode 20 [row 27]
16492 movu m6, [r5 + 20 * 16]
16493 pslldq m0, 2
16494 pinsrb m0, [r4 + 26], 1
16495 pinsrb m0, [r4 + 27], 0
16496 pmaddubsw m4, m0, m6
16497 pmulhrsw m4, m7
16498 pslldq m2, 2
16499 pinsrb m2, [r4 + 14], 1
16500 pinsrb m2, [r4 + 15], 0
16501 pmaddubsw m5, m2, m6
16502 pmulhrsw m5, m7
16503 packuswb m4, m5
16504 movu [r0 + 1206 * 16], m4
16505 pslldq m1, 2
16506 pinsrb m1, [r4 + 2], 1
16507 pinsrb m1, [r4 + 3], 0
16508 pmaddubsw m4, m1, m6
16509 pmulhrsw m4, m7
16510 pslldq m3, 2
16511 pinsrw m3, [r3 + 6], 0
16512 pmaddubsw m5, m3, m6
16513 pmulhrsw m5, m7
16514 packuswb m4, m5
16515 movu [r0 + 1207 * 16], m4
16516
16517 ; mode 20 [row 28]
16518 movu m6, [r5 + 31 * 16]
16519 pslldq m0, 2
16520 pinsrb m0, [r4 + 27], 1
16521 pinsrb m0, [r4 + 29], 0
16522 pmaddubsw m4, m0, m6
16523 pmulhrsw m4, m7
16524 pslldq m2, 2
16525 pinsrb m2, [r4 + 15], 1
16526 pinsrb m2, [r4 + 17], 0
16527 pmaddubsw m5, m2, m6
16528 pmulhrsw m5, m7
16529 packuswb m4, m5
16530 movu [r0 + 1208 * 16], m4
16531 pslldq m1, 2
16532 pinsrb m1, [r4 + 3], 1
16533 pinsrb m1, [r4 + 5], 0
16534 pmaddubsw m4, m1, m6
16535 pmulhrsw m4, m7
16536 pslldq m3, 2
16537 pinsrw m3, [r3 + 5], 0
16538 pmaddubsw m5, m3, m6
16539 pmulhrsw m5, m7
16540 packuswb m4, m5
16541 movu [r0 + 1209 * 16], m4
16542
16543 ; mode 20 [row 29]
16544 movu m6, [r5 + 10 * 16]
16545 pmaddubsw m4, m0, m6
16546 pmulhrsw m4, m7
16547 pmaddubsw m5, m2, m6
16548 pmulhrsw m5, m7
16549 packuswb m4, m5
16550 movu [r0 + 1210 * 16], m4
16551 pmaddubsw m4, m1, m6
16552 pmulhrsw m4, m7
16553 pmaddubsw m5, m3, m6
16554 pmulhrsw m5, m7
16555 packuswb m4, m5
16556 movu [r0 + 1211 * 16], m4
16557
16558 ; mode 20 [row 30]
16559 movu m6, [r5 + 21 * 16]
16560 pslldq m0, 2
16561 pinsrb m0, [r4 + 29], 1
16562 pinsrb m0, [r4 + 30], 0
16563 pmaddubsw m4, m0, m6
16564 pmulhrsw m4, m7
16565 pslldq m2, 2
16566 pinsrb m2, [r4 + 17], 1
16567 pinsrb m2, [r4 + 18], 0
16568 pmaddubsw m5, m2, m6
16569 pmulhrsw m5, m7
16570 packuswb m4, m5
16571 movu [r0 + 1212 * 16], m4
16572 pslldq m1, 2
16573 pinsrb m1, [r4 + 5], 1
16574 pinsrb m1, [r4 + 6], 0
16575 pmaddubsw m4, m1, m6
16576 pmulhrsw m4, m7
16577 pslldq m3, 2
16578 pinsrw m3, [r3 + 4], 0
16579 pmaddubsw m5, m3, m6
16580 pmulhrsw m5, m7
16581 packuswb m4, m5
16582 movu [r0 + 1213 * 16], m4
16583
16584 ; mode 20 [row 31] -- final row of this mode: no weight multiply here, each source register is only byte-shuffled and stored
16585 pshufb m5, m0, [tab_S2] ; m5 = bytes of m0 selected by the tab_S2 mask (mask defined outside this chunk; presumably keeps one byte of each pair -- confirm)
16586 movh [r0 + 1214 * 16], m5 ; low 8 bytes -> output bytes 0-7 of this 32-byte row
16587 pshufb m5, m2, [tab_S2] ; same selection applied to m2
16588 movh [r0 + 1214 * 16 + 8], m5 ; output bytes 8-15
16589 pshufb m5, m1, [tab_S2] ; same selection applied to m1
16590 movh [r0 + 1215 * 16], m5 ; output bytes 16-23
16591 pshufb m5, m3, [tab_S2] ; same selection applied to m3
16592 movh [r0 + 1215 * 16 + 8], m5 ; output bytes 24-31
16593
16594 ; mode 21 [row 0] -- reload the four pair registers from r3 and emit the first 32-byte row at r0 + 1216*16
16595 movu m6, [r5 + 15 * 16] ; m6 = 16-byte entry 15 of the table at r5 (r5 is set outside this chunk; presumably the per-row interpolation weights -- confirm)
16596 movu m0, [r3 ] ; bytes r3[0..15]
16597 movu m1, [r3 + 1 ] ; bytes r3[1..16]
16598 punpcklbw m0, m1 ; m0 = adjacent-byte pairs (r3[i], r3[i+1]) for i = 0..7
16599 pmaddubsw m1, m0, m6 ; m1 = per pair: unsigned m0 bytes times signed m6 bytes, summed into 8 words (m0 is kept intact)
16600 pmulhrsw m1, m7 ; rounded scale by m7 (m7 is loaded outside this chunk; presumably the rounding/shift constant -- confirm)
16601 movu m2, [r3 + 8] ; bytes r3[8..23]
16602 movu m3, [r3 + 9] ; bytes r3[9..24]
16603 punpcklbw m2, m3 ; m2 = pairs (r3[i], r3[i+1]) for i = 8..15
16604 pmaddubsw m3, m2, m6 ; m3 is scratch here; m2 is kept intact
16605 pmulhrsw m3, m7
16606 packuswb m1, m3 ; saturate the two word vectors into 16 output bytes
16607 movu [r0 + 1216 * 16], m1 ; output bytes 0-15 of row 0
16608
16609 movu m1, [r3 + 16] ; m1 is reused from here on as a pair register
16610 movu m3, [r3 + 17]
16611 punpcklbw m1, m3 ; m1 = pairs (r3[i], r3[i+1]) for i = 16..23
16612 pmaddubsw m4, m1, m6 ; results now go to m4/m5 so that m1/m3 survive
16613 pmulhrsw m4, m7
16614 movu m3, [r3 + 24] ; m3 is reused from here on as a pair register
16615 movu m5, [r3 + 25]
16616 punpcklbw m3, m5 ; m3 = pairs (r3[i], r3[i+1]) for i = 24..31
16617 pmaddubsw m5, m3, m6
16618 pmulhrsw m5, m7
16619 packuswb m4, m5
16620 movu [r0 + 1217 * 16], m4 ; output bytes 16-31 of row 0; pair registers are now m0, m2, m1, m3 in source order
16621
16622 ; mode 21 [row 1]
16623 movu m6, [r5 + 30 * 16]
16624 pslldq m0, 2
16625 pinsrb m0, [r4 + 0], 1
16626 pinsrb m0, [r4 + 2], 0
16627 pmaddubsw m4, m0, m6
16628 pmulhrsw m4, m7
16629 pslldq m2, 2
16630 pinsrw m2, [r3 + 7], 0
16631 pmaddubsw m5, m2, m6
16632 pmulhrsw m5, m7
16633 packuswb m4, m5
16634 movu [r0 + 1218 * 16], m4
16635 pslldq m1, 2
16636 pinsrw m1, [r3 + 15], 0
16637 pmaddubsw m4, m1, m6
16638 pmulhrsw m4, m7
16639 pslldq m3, 2
16640 pinsrw m3, [r3 + 23], 0
16641 pmaddubsw m5, m3, m6
16642 pmulhrsw m5, m7
16643 packuswb m4, m5
16644 movu [r0 + 1219 * 16], m4
16645
16646 ; mode 21 [row 2]
16647 movu m6, [r5 + 13 * 16]
16648 pmaddubsw m4, m0, m6
16649 pmulhrsw m4, m7
16650 pmaddubsw m5, m2, m6
16651 pmulhrsw m5, m7
16652 packuswb m4, m5
16653 movu [r0 + 1220 * 16], m4
16654 pmaddubsw m4, m1, m6
16655 pmulhrsw m4, m7
16656 pmaddubsw m5, m3, m6
16657 pmulhrsw m5, m7
16658 packuswb m4, m5
16659 movu [r0 + 1221 * 16], m4
16660
16661 ; mode 21 [row 3]
16662 movu m6, [r5 + 28 * 16]
16663 pslldq m0, 2
16664 pinsrb m0, [r4 + 2], 1
16665 pinsrb m0, [r4 + 4], 0
16666 pmaddubsw m4, m0, m6
16667 pmulhrsw m4, m7
16668 pslldq m2, 2
16669 pinsrw m2, [r3 + 6], 0
16670 pmaddubsw m5, m2, m6
16671 pmulhrsw m5, m7
16672 packuswb m4, m5
16673 movu [r0 + 1222 * 16], m4
16674 pslldq m1, 2
16675 pinsrw m1, [r3 + 14], 0
16676 pmaddubsw m4, m1, m6
16677 pmulhrsw m4, m7
16678 pslldq m3, 2
16679 pinsrw m3, [r3 + 22], 0
16680 pmaddubsw m5, m3, m6
16681 pmulhrsw m5, m7
16682 packuswb m4, m5
16683 movu [r0 + 1223 * 16], m4
16684
16685 ; mode 21 [row 4]
16686 movu m6, [r5 + 11 * 16]
16687 pmaddubsw m4, m0, m6
16688 pmulhrsw m4, m7
16689 pmaddubsw m5, m2, m6
16690 pmulhrsw m5, m7
16691 packuswb m4, m5
16692 movu [r0 + 1224 * 16], m4
16693 pmaddubsw m4, m1, m6
16694 pmulhrsw m4, m7
16695 pmaddubsw m5, m3, m6
16696 pmulhrsw m5, m7
16697 packuswb m4, m5
16698 movu [r0 + 1225 * 16], m4
16699
16700 ; mode 21 [row 5]
16701 movu m6, [r5 + 26 * 16]
16702 pslldq m0, 2
16703 pinsrb m0, [r4 + 4], 1
16704 pinsrb m0, [r4 + 6], 0
16705 pmaddubsw m4, m0, m6
16706 pmulhrsw m4, m7
16707 pslldq m2, 2
16708 pinsrw m2, [r3 + 5], 0
16709 pmaddubsw m5, m2, m6
16710 pmulhrsw m5, m7
16711 packuswb m4, m5
16712 movu [r0 + 1226 * 16], m4
16713 pslldq m1, 2
16714 pinsrw m1, [r3 + 13], 0
16715 pmaddubsw m4, m1, m6
16716 pmulhrsw m4, m7
16717 pslldq m3, 2
16718 pinsrw m3, [r3 + 21], 0
16719 pmaddubsw m5, m3, m6
16720 pmulhrsw m5, m7
16721 packuswb m4, m5
16722 movu [r0 + 1227 * 16], m4
16723
16724 ; mode 21 [row 6]
16725 movu m6, [r5 + 9 * 16]
16726 pmaddubsw m4, m0, m6
16727 pmulhrsw m4, m7
16728 pmaddubsw m5, m2, m6
16729 pmulhrsw m5, m7
16730 packuswb m4, m5
16731 movu [r0 + 1228 * 16], m4
16732 pmaddubsw m4, m1, m6
16733 pmulhrsw m4, m7
16734 pmaddubsw m5, m3, m6
16735 pmulhrsw m5, m7
16736 packuswb m4, m5
16737 movu [r0 + 1229 * 16], m4
16738
16739 ; mode 21 [row 7]
16740 movu m6, [r5 + 24 * 16]
16741 pslldq m0, 2
16742 pinsrb m0, [r4 + 6], 1
16743 pinsrb m0, [r4 + 8], 0
16744 pmaddubsw m4, m0, m6
16745 pmulhrsw m4, m7
16746 pslldq m2, 2
16747 pinsrw m2, [r3 + 4], 0
16748 pmaddubsw m5, m2, m6
16749 pmulhrsw m5, m7
16750 packuswb m4, m5
16751 movu [r0 + 1230 * 16], m4
16752 pslldq m1, 2
16753 pinsrw m1, [r3 + 12], 0
16754 pmaddubsw m4, m1, m6
16755 pmulhrsw m4, m7
16756 pslldq m3, 2
16757 pinsrw m3, [r3 + 20], 0
16758 pmaddubsw m5, m3, m6
16759 pmulhrsw m5, m7
16760 packuswb m4, m5
16761 movu [r0 + 1231 * 16], m4
16762
16763 ; mode 21 [row 8]
16764 movu m6, [r5 + 7 * 16]
16765 pmaddubsw m4, m0, m6
16766 pmulhrsw m4, m7
16767 pmaddubsw m5, m2, m6
16768 pmulhrsw m5, m7
16769 packuswb m4, m5
16770 movu [r0 + 1232 * 16], m4
16771 pmaddubsw m4, m1, m6
16772 pmulhrsw m4, m7
16773 pmaddubsw m5, m3, m6
16774 pmulhrsw m5, m7
16775 packuswb m4, m5
16776 movu [r0 + 1233 * 16], m4
16777
16778 ; mode 21 [row 9]
16779 movu m6, [r5 + 22 * 16]
16780 pslldq m0, 2
16781 pinsrb m0, [r4 + 8], 1
16782 pinsrb m0, [r4 + 9], 0
16783 pmaddubsw m4, m0, m6
16784 pmulhrsw m4, m7
16785 pslldq m2, 2
16786 pinsrw m2, [r3 + 3], 0
16787 pmaddubsw m5, m2, m6
16788 pmulhrsw m5, m7
16789 packuswb m4, m5
16790 movu [r0 + 1234 * 16], m4
16791 pslldq m1, 2
16792 pinsrw m1, [r3 + 11], 0
16793 pmaddubsw m4, m1, m6
16794 pmulhrsw m4, m7
16795 pslldq m3, 2
16796 pinsrw m3, [r3 + 19], 0
16797 pmaddubsw m5, m3, m6
16798 pmulhrsw m5, m7
16799 packuswb m4, m5
16800 movu [r0 + 1235 * 16], m4
16801
16802 ; mode 21 [row 10]
16803 movu m6, [r5 + 5 * 16]
16804 pmaddubsw m4, m0, m6
16805 pmulhrsw m4, m7
16806 pmaddubsw m5, m2, m6
16807 pmulhrsw m5, m7
16808 packuswb m4, m5
16809 movu [r0 + 1236 * 16], m4
16810 pmaddubsw m4, m1, m6
16811 pmulhrsw m4, m7
16812 pmaddubsw m5, m3, m6
16813 pmulhrsw m5, m7
16814 packuswb m4, m5
16815 movu [r0 + 1237 * 16], m4
16816
16817 ; mode 21 [row 11]
16818 movu m6, [r5 + 20 * 16]
16819 pslldq m0, 2
16820 pinsrb m0, [r4 + 9], 1
16821 pinsrb m0, [r4 + 11], 0
16822 pmaddubsw m4, m0, m6
16823 pmulhrsw m4, m7
16824 pslldq m2, 2
16825 pinsrw m2, [r3 + 2], 0
16826 pmaddubsw m5, m2, m6
16827 pmulhrsw m5, m7
16828 packuswb m4, m5
16829 movu [r0 + 1238 * 16], m4
16830 pslldq m1, 2
16831 pinsrw m1, [r3 + 10], 0
16832 pmaddubsw m4, m1, m6
16833 pmulhrsw m4, m7
16834 pslldq m3, 2
16835 pinsrw m3, [r3 + 18], 0
16836 pmaddubsw m5, m3, m6
16837 pmulhrsw m5, m7
16838 packuswb m4, m5
16839 movu [r0 + 1239 * 16], m4
16840
16841 ; mode 21 [row 12]
16842 movu m6, [r5 + 3 * 16]
16843 pmaddubsw m4, m0, m6
16844 pmulhrsw m4, m7
16845 pmaddubsw m5, m2, m6
16846 pmulhrsw m5, m7
16847 packuswb m4, m5
16848 movu [r0 + 1240 * 16], m4
16849 pmaddubsw m4, m1, m6
16850 pmulhrsw m4, m7
16851 pmaddubsw m5, m3, m6
16852 pmulhrsw m5, m7
16853 packuswb m4, m5
16854 movu [r0 + 1241 * 16], m4
16855
16856 ; mode 21 [row 13]
16857 movu m6, [r5 + 18 * 16]
16858 pslldq m0, 2
16859 pinsrb m0, [r4 + 11], 1
16860 pinsrb m0, [r4 + 13], 0
16861 pmaddubsw m4, m0, m6
16862 pmulhrsw m4, m7
16863 pslldq m2, 2
16864 pinsrw m2, [r3 + 1], 0
16865 pmaddubsw m5, m2, m6
16866 pmulhrsw m5, m7
16867 packuswb m4, m5
16868 movu [r0 + 1242 * 16], m4
16869 pslldq m1, 2
16870 pinsrw m1, [r3 + 9], 0
16871 pmaddubsw m4, m1, m6
16872 pmulhrsw m4, m7
16873 pslldq m3, 2
16874 pinsrw m3, [r3 + 17], 0
16875 pmaddubsw m5, m3, m6
16876 pmulhrsw m5, m7
16877 packuswb m4, m5
16878 movu [r0 + 1243 * 16], m4
16879
16880 ; mode 21 [row 14]
16881 movu m6, [r5 + 1 * 16]
16882 pmaddubsw m4, m0, m6
16883 pmulhrsw m4, m7
16884 pmaddubsw m5, m2, m6
16885 pmulhrsw m5, m7
16886 packuswb m4, m5
16887 movu [r0 + 1244 * 16], m4
16888 pmaddubsw m4, m1, m6
16889 pmulhrsw m4, m7
16890 pmaddubsw m5, m3, m6
16891 pmulhrsw m5, m7
16892 packuswb m4, m5
16893 movu [r0 + 1245 * 16], m4
16894
16895 ; mode 21 [row 15]
16896 movu m6, [r5 + 16 * 16]
16897 pslldq m0, 2
16898 pinsrb m0, [r4 + 13], 1
16899 pinsrb m0, [r4 + 15], 0
16900 pmaddubsw m4, m0, m6
16901 pmulhrsw m4, m7
16902 pslldq m2, 2
16903 pinsrw m2, [r3 + 0], 0
16904 pmaddubsw m5, m2, m6
16905 pmulhrsw m5, m7
16906 packuswb m4, m5
16907 movu [r0 + 1246 * 16], m4
16908 pslldq m1, 2
16909 pinsrw m1, [r3 + 8], 0
16910 pmaddubsw m4, m1, m6
16911 pmulhrsw m4, m7
16912 pslldq m3, 2
16913 pinsrw m3, [r3 + 16], 0
16914 pmaddubsw m5, m3, m6
16915 pmulhrsw m5, m7
16916 packuswb m4, m5
16917 movu [r0 + 1247 * 16], m4
16918
16919 ; mode 21 [row 16]
16920 movu m6, [r5 + 31 * 16]
16921 pslldq m0, 2
16922 pinsrb m0, [r4 + 15], 1
16923 pinsrb m0, [r4 + 17], 0
16924 pmaddubsw m4, m0, m6
16925 pmulhrsw m4, m7
16926 pslldq m2, 2
16927 pinsrb m2, [r4 + 0], 1
16928 pinsrb m2, [r4 + 2], 0
16929 pmaddubsw m5, m2, m6
16930 pmulhrsw m5, m7
16931 packuswb m4, m5
16932 movu [r0 + 1248 * 16], m4
16933 pslldq m1, 2
16934 pinsrw m1, [r3 + 7], 0
16935 pmaddubsw m4, m1, m6
16936 pmulhrsw m4, m7
16937 pslldq m3, 2
16938 pinsrw m3, [r3 + 15], 0
16939 pmaddubsw m5, m3, m6
16940 pmulhrsw m5, m7
16941 packuswb m4, m5
16942 movu [r0 + 1249 * 16], m4
16943
16944 ; mode 21 [row 17]
16945 movu m6, [r5 + 14 * 16]
16946 pmaddubsw m4, m0, m6
16947 pmulhrsw m4, m7
16948 pmaddubsw m5, m2, m6
16949 pmulhrsw m5, m7
16950 packuswb m4, m5
16951 movu [r0 + 1250 * 16], m4
16952 pmaddubsw m4, m1, m6
16953 pmulhrsw m4, m7
16954 pmaddubsw m5, m3, m6
16955 pmulhrsw m5, m7
16956 packuswb m4, m5
16957 movu [r0 + 1251 * 16], m4
16958
16959 ; mode 21 [row 18] -- all four pair registers advance by one pair; 32 output bytes go to r0 + 1252*16
16960 movu m6, [r5 + 29 * 16] ; m6 = 16-byte entry 29 of the table at r5 (presumably this row's interpolation weights -- confirm)
16961 pslldq m0, 2 ; move the existing pairs up one slot, freeing the low word
16962 pinsrb m0, [r4 + 17], 1 ; new low pair comes from the r4 buffer: byte 1 = r4[17]
16963 pinsrb m0, [r4 + 19], 0 ; byte 0 = r4[19] (non-adjacent bytes, so two byte inserts are required)
16964 pmaddubsw m4, m0, m6 ; per pair: unsigned source bytes times signed m6 bytes, summed into words
16965 pmulhrsw m4, m7 ; rounded scale by m7 (set outside this chunk)
16966 pslldq m2, 2
16967 pinsrb m2, [r4 + 2], 1 ; byte 1 = r4[2]
16968 pinsrb m2, [r4 + 4], 0 ; byte 0 = r4[4]
16969 pmaddubsw m5, m2, m6
16970 pmulhrsw m5, m7
16971 packuswb m4, m5 ; saturate to 16 output bytes
16972 movu [r0 + 1252 * 16], m4 ; output bytes 0-15 of row 18
16973 pslldq m1, 2
16974 pinsrw m1, [r3 + 6], 0 ; new low pair (r3[6], r3[7]) inserted as one little-endian word: byte 0 = r3[6], byte 1 = r3[7]
16976 pmaddubsw m4, m1, m6
16977 pmulhrsw m4, m7
16978 pslldq m3, 2
16979 pinsrw m3, [r3 + 14], 0 ; new low pair (r3[14], r3[15]) inserted as one little-endian word
16981 pmaddubsw m5, m3, m6
16982 pmulhrsw m5, m7
16983 packuswb m4, m5
16984 movu [r0 + 1253 * 16], m4 ; output bytes 16-31 of row 18
16985
16986 ; mode 21 [row 19]
16987 movu m6, [r5 + 12 * 16]
16988 pmaddubsw m4, m0, m6
16989 pmulhrsw m4, m7
16990 pmaddubsw m5, m2, m6
16991 pmulhrsw m5, m7
16992 packuswb m4, m5
16993 movu [r0 + 1254 * 16], m4
16994 pmaddubsw m4, m1, m6
16995 pmulhrsw m4, m7
16996 pmaddubsw m5, m3, m6
16997 pmulhrsw m5, m7
16998 packuswb m4, m5
16999 movu [r0 + 1255 * 16], m4
17000
17001 ; mode 21 [row 20]
17002 movu m6, [r5 + 27 * 16]
17003 pslldq m0, 2
17004 pinsrb m0, [r4 + 19], 1
17005 pinsrb m0, [r4 + 21], 0
17006 pmaddubsw m4, m0, m6
17007 pmulhrsw m4, m7
17008 pslldq m2, 2
17009 pinsrb m2, [r4 + 4], 1
17010 pinsrb m2, [r4 + 6], 0
17011 pmaddubsw m5, m2, m6
17012 pmulhrsw m5, m7
17013 packuswb m4, m5
17014 movu [r0 + 1256 * 16], m4
17015 pslldq m1, 2
17016 pinsrw m1, [r3 + 5], 0
17017 pmaddubsw m4, m1, m6
17018 pmulhrsw m4, m7
17019 pslldq m3, 2
17020 pinsrw m3, [r3 + 13], 0
17021 pmaddubsw m5, m3, m6
17022 pmulhrsw m5, m7
17023 packuswb m4, m5
17024 movu [r0 + 1257 * 16], m4
17025
17026 ; mode 21 [row 21]
17027 movu m6, [r5 + 10 * 16]
17028 pmaddubsw m4, m0, m6
17029 pmulhrsw m4, m7
17030 pmaddubsw m5, m2, m6
17031 pmulhrsw m5, m7
17032 packuswb m4, m5
17033 movu [r0 + 1258 * 16], m4
17034 pmaddubsw m4, m1, m6
17035 pmulhrsw m4, m7
17036 pmaddubsw m5, m3, m6
17037 pmulhrsw m5, m7
17038 packuswb m4, m5
17039 movu [r0 + 1259 * 16], m4
17040
17041 ; mode 21 [row 22]
17042 movu m6, [r5 + 25 * 16]
17043 pslldq m0, 2
17044 pinsrb m0, [r4 + 21], 1
17045 pinsrb m0, [r4 + 23], 0
17046 pmaddubsw m4, m0, m6
17047 pmulhrsw m4, m7
17048 pslldq m2, 2
17049 pinsrb m2, [r4 + 6], 1
17050 pinsrb m2, [r4 + 8], 0
17051 pmaddubsw m5, m2, m6
17052 pmulhrsw m5, m7
17053 packuswb m4, m5
17054 movu [r0 + 1260 * 16], m4
17055 pslldq m1, 2
17056 pinsrw m1, [r3 + 4], 0
17057 pmaddubsw m4, m1, m6
17058 pmulhrsw m4, m7
17059 pslldq m3, 2
17060 pinsrw m3, [r3 + 12], 0
17061 pmaddubsw m5, m3, m6
17062 pmulhrsw m5, m7
17063 packuswb m4, m5
17064 movu [r0 + 1261 * 16], m4
17065
17066 ; mode 21 [row 23]
17067 movu m6, [r5 + 8 * 16]
17068 pmaddubsw m4, m0, m6
17069 pmulhrsw m4, m7
17070 pmaddubsw m5, m2, m6
17071 pmulhrsw m5, m7
17072 packuswb m4, m5
17073 movu [r0 + 1262 * 16], m4
17074 pmaddubsw m4, m1, m6
17075 pmulhrsw m4, m7
17076 pmaddubsw m5, m3, m6
17077 pmulhrsw m5, m7
17078 packuswb m4, m5
17079 movu [r0 + 1263 * 16], m4
17080
17081 ; mode 21 [row 24]
17082 movu m6, [r5 + 23 * 16]
17083 pslldq m0, 2
17084 pinsrb m0, [r4 + 23], 1
17085 pinsrb m0, [r4 + 24], 0
17086 pmaddubsw m4, m0, m6
17087 pmulhrsw m4, m7
17088 pslldq m2, 2
17089 pinsrb m2, [r4 + 8], 1
17090 pinsrb m2, [r4 + 9], 0
17091 pmaddubsw m5, m2, m6
17092 pmulhrsw m5, m7
17093 packuswb m4, m5
17094 movu [r0 + 1264 * 16], m4
17095 pslldq m1, 2
17096 pinsrw m1, [r3 + 3], 0
17097 pmaddubsw m4, m1, m6
17098 pmulhrsw m4, m7
17099 pslldq m3, 2
17100 pinsrw m3, [r3 + 11], 0
17101 pmaddubsw m5, m3, m6
17102 pmulhrsw m5, m7
17103 packuswb m4, m5
17104 movu [r0 + 1265 * 16], m4
17105
17106 ; mode 21 [row 25]
17107 movu m6, [r5 + 6 * 16]
17108 pmaddubsw m4, m0, m6
17109 pmulhrsw m4, m7
17110 pmaddubsw m5, m2, m6
17111 pmulhrsw m5, m7
17112 packuswb m4, m5
17113 movu [r0 + 1266 * 16], m4
17114 pmaddubsw m4, m1, m6
17115 pmulhrsw m4, m7
17116 pmaddubsw m5, m3, m6
17117 pmulhrsw m5, m7
17118 packuswb m4, m5
17119 movu [r0 + 1267 * 16], m4
17120
17121 ; mode 21 [row 26]
17122 movu m6, [r5 + 21 * 16]
17123 pslldq m0, 2
17124 pinsrb m0, [r4 + 24], 1
17125 pinsrb m0, [r4 + 26], 0
17126 pmaddubsw m4, m0, m6
17127 pmulhrsw m4, m7
17128 pslldq m2, 2
17129 pinsrb m2, [r4 + 9], 1
17130 pinsrb m2, [r4 + 11], 0
17131 pmaddubsw m5, m2, m6
17132 pmulhrsw m5, m7
17133 packuswb m4, m5
17134 movu [r0 + 1268 * 16], m4
17135 pslldq m1, 2
17136 pinsrw m1, [r3 + 2], 0
17137 pmaddubsw m4, m1, m6
17138 pmulhrsw m4, m7
17139 pslldq m3, 2
17140 pinsrw m3, [r3 + 10], 0
17141 pmaddubsw m5, m3, m6
17142 pmulhrsw m5, m7
17143 packuswb m4, m5
17144 movu [r0 + 1269 * 16], m4
17145
17146 ; mode 21 [row 27]
17147 movu m6, [r5 + 4 * 16]
17148 pmaddubsw m4, m0, m6
17149 pmulhrsw m4, m7
17150 pmaddubsw m5, m2, m6
17151 pmulhrsw m5, m7
17152 packuswb m4, m5
17153 movu [r0 + 1270 * 16], m4
17154 pmaddubsw m4, m1, m6
17155 pmulhrsw m4, m7
17156 pmaddubsw m5, m3, m6
17157 pmulhrsw m5, m7
17158 packuswb m4, m5
17159 movu [r0 + 1271 * 16], m4
17160
17161 ; mode 21 [row 28]
17162 movu m6, [r5 + 19 * 16]
17163 pslldq m0, 2
17164 pinsrb m0, [r4 + 26], 1
17165 pinsrb m0, [r4 + 28], 0
17166 pmaddubsw m4, m0, m6
17167 pmulhrsw m4, m7
17168 pslldq m2, 2
17169 pinsrb m2, [r4 + 11], 1
17170 pinsrb m2, [r4 + 13], 0
17171 pmaddubsw m5, m2, m6
17172 pmulhrsw m5, m7
17173 packuswb m4, m5
17174 movu [r0 + 1272 * 16], m4
17175 pslldq m1, 2
17176 pinsrw m1, [r3 + 1], 0
17177 pmaddubsw m4, m1, m6
17178 pmulhrsw m4, m7
17179 pslldq m3, 2
17180 pinsrw m3, [r3 + 9], 0
17181 pmaddubsw m5, m3, m6
17182 pmulhrsw m5, m7
17183 packuswb m4, m5
17184 movu [r0 + 1273 * 16], m4
17185
17186 ; mode 21 [row 29]
17187 movu m6, [r5 + 2 * 16]
17188 pmaddubsw m4, m0, m6
17189 pmulhrsw m4, m7
17190 pmaddubsw m5, m2, m6
17191 pmulhrsw m5, m7
17192 packuswb m4, m5
17193 movu [r0 + 1274 * 16], m4
17194 pmaddubsw m4, m1, m6
17195 pmulhrsw m4, m7
17196 pmaddubsw m5, m3, m6
17197 pmulhrsw m5, m7
17198 packuswb m4, m5
17199 movu [r0 + 1275 * 16], m4
17200
17201 ; mode 21 [row 30]
17202 movu m6, [r5 + 17 * 16]
17203 pslldq m0, 2
17204 pinsrb m0, [r4 + 28], 1
17205 pinsrb m0, [r4 + 30], 0
17206 pmaddubsw m4, m0, m6
17207 pmulhrsw m4, m7
17208 pslldq m2, 2
17209 pinsrb m2, [r4 + 13], 1
17210 pinsrb m2, [r4 + 15], 0
17211 pmaddubsw m5, m2, m6
17212 pmulhrsw m5, m7
17213 packuswb m4, m5
17214 movu [r0 + 1276 * 16], m4
17215 pslldq m1, 2
17216 pinsrw m1, [r3 + 0], 0
17217 pmaddubsw m4, m1, m6
17218 pmulhrsw m4, m7
17219 pslldq m3, 2
17220 pinsrw m3, [r3 + 8], 0
17221 pmaddubsw m5, m3, m6
17222 pmulhrsw m5, m7
17223 packuswb m4, m5
17224 movu [r0 + 1277 * 16], m4
17225
17226 ; mode 21 [row 31] (no pmaddubsw/pmulhrsw here: each half-row is a pshufb of m0..m3 with tab_S2, stored with movh)
17227 pshufb m5, m0, [tab_S2]
17228 movh [r0 + 1278 * 16], m5
17229 pshufb m5, m2, [tab_S2]
17230 movh [r0 + 1278 * 16 + 8], m5
17231 pshufb m5, m1, [tab_S2]
17232 movh [r0 + 1279 * 16], m5
17233 pshufb m5, m3, [tab_S2]
17234 movh [r0 + 1279 * 16 + 8], m5
17235
17236 ; mode 22 [row 0]
17237 movu m6, [r5 + 19 * 16]
17238 movu m0, [r3 ]
17239 movu m1, [r3 + 1 ]
17240 punpcklbw m0, m1
17241 pmaddubsw m1, m0, m6
17242 pmulhrsw m1, m7
17243 movu m2, [r3 + 8]
17244 movu m3, [r3 + 9]
17245 punpcklbw m2, m3
17246 pmaddubsw m3, m2, m6
17247 pmulhrsw m3, m7
17248 packuswb m1, m3
17249 movu [r0 + 1280 * 16], m1
17250
17251 movu m1, [r3 + 16]
17252 movu m3, [r3 + 17]
17253 punpcklbw m1, m3
17254 pmaddubsw m4, m1, m6
17255 pmulhrsw m4, m7
17256 movu m3, [r3 + 24]
17257 movu m5, [r3 + 25]
17258 punpcklbw m3, m5
17259 pmaddubsw m5, m3, m6
17260 pmulhrsw m5, m7
17261 packuswb m4, m5
17262 movu [r0 + 1281 * 16], m4
17263
17264 ; mode 22 [row 1]
17265 movu m6, [r5 + 6 * 16]
17266 pmaddubsw m4, m0, m6
17267 pmulhrsw m4, m7
17268 pmaddubsw m5, m2, m6
17269 pmulhrsw m5, m7
17270 packuswb m4, m5
17271 movu [r0 + 1282 * 16], m4
17272 pmaddubsw m4, m1, m6
17273 pmulhrsw m4, m7
17274 pmaddubsw m5, m3, m6
17275 pmulhrsw m5, m7
17276 packuswb m4, m5
17277 movu [r0 + 1283 * 16], m4
17278
17279 ; mode 22 [row 2]
17280 movu m6, [r5 + 25 * 16]
17281 pslldq m0, 2
17282 pinsrb m0, [r4 + 0], 1
17283 pinsrb m0, [r4 + 2], 0
17284 pmaddubsw m4, m0, m6
17285 pmulhrsw m4, m7
17286 pslldq m2, 2
17287 pinsrw m2, [r3 + 7], 0
17288 pmaddubsw m5, m2, m6
17289 pmulhrsw m5, m7
17290 packuswb m4, m5
17291 movu [r0 + 1284 * 16], m4
17292 pslldq m1, 2
17293 pinsrw m1, [r3 + 15], 0
17294 pmaddubsw m4, m1, m6
17295 pmulhrsw m4, m7
17296 pslldq m3, 2
17297 pinsrw m3, [r3 + 23], 0
17298 pmaddubsw m5, m3, m6
17299 pmulhrsw m5, m7
17300 packuswb m4, m5
17301 movu [r0 + 1285 * 16], m4
17302
17303 ; mode 22 [row 3]
17304 movu m6, [r5 + 12 * 16]
17305 pmaddubsw m4, m0, m6
17306 pmulhrsw m4, m7
17307 pmaddubsw m5, m2, m6
17308 pmulhrsw m5, m7
17309 packuswb m4, m5
17310 movu [r0 + 1286 * 16], m4
17311 pmaddubsw m4, m1, m6
17312 pmulhrsw m4, m7
17313 pmaddubsw m5, m3, m6
17314 pmulhrsw m5, m7
17315 packuswb m4, m5
17316 movu [r0 + 1287 * 16], m4
17317
17318 ; mode 22 [row 4]
17319 movu m6, [r5 + 31 * 16]
17320 pslldq m0, 2
17321 pinsrb m0, [r4 + 2], 1
17322 pinsrb m0, [r4 + 5], 0
17323 pmaddubsw m4, m0, m6
17324 pmulhrsw m4, m7
17325 pslldq m2, 2
17326 pinsrw m2, [r3 + 6], 0
17327 pmaddubsw m5, m2, m6
17328 pmulhrsw m5, m7
17329 packuswb m4, m5
17330 movu [r0 + 1288 * 16], m4
17331 pslldq m1, 2
17332 pinsrw m1, [r3 + 14], 0
17333 pmaddubsw m4, m1, m6
17334 pmulhrsw m4, m7
17335 pslldq m3, 2
17336 pinsrw m3, [r3 + 22], 0
17337 pmaddubsw m5, m3, m6
17338 pmulhrsw m5, m7
17339 packuswb m4, m5
17340 movu [r0 + 1289 * 16], m4
17341
17342 ; mode 22 [row 5]
17343 movu m6, [r5 + 18 * 16]
17344 pmaddubsw m4, m0, m6
17345 pmulhrsw m4, m7
17346 pmaddubsw m5, m2, m6
17347 pmulhrsw m5, m7
17348 packuswb m4, m5
17349 movu [r0 + 1290 * 16], m4
17350 pmaddubsw m4, m1, m6
17351 pmulhrsw m4, m7
17352 pmaddubsw m5, m3, m6
17353 pmulhrsw m5, m7
17354 packuswb m4, m5
17355 movu [r0 + 1291 * 16], m4
17356
17357 ; mode 22 [row 6]
17358 movu m6, [r5 + 5 * 16]
17359 pmaddubsw m4, m0, m6
17360 pmulhrsw m4, m7
17361 pmaddubsw m5, m2, m6
17362 pmulhrsw m5, m7
17363 packuswb m4, m5
17364 movu [r0 + 1292 * 16], m4
17365 pmaddubsw m4, m1, m6
17366 pmulhrsw m4, m7
17367 pmaddubsw m5, m3, m6
17368 pmulhrsw m5, m7
17369 packuswb m4, m5
17370 movu [r0 + 1293 * 16], m4
17371
17372 ; mode 22 [row 7]
17373 movu m6, [r5 + 24 * 16]
17374 pslldq m0, 2
17375 pinsrb m0, [r4 + 5], 1
17376 pinsrb m0, [r4 + 7], 0
17377 pmaddubsw m4, m0, m6
17378 pmulhrsw m4, m7
17379 pslldq m2, 2
17380 pinsrw m2, [r3 + 5], 0
17381 pmaddubsw m5, m2, m6
17382 pmulhrsw m5, m7
17383 packuswb m4, m5
17384 movu [r0 + 1294 * 16], m4
17385 pslldq m1, 2
17386 pinsrw m1, [r3 + 13], 0
17387 pmaddubsw m4, m1, m6
17388 pmulhrsw m4, m7
17389 pslldq m3, 2
17390 pinsrw m3, [r3 + 21], 0
17391 pmaddubsw m5, m3, m6
17392 pmulhrsw m5, m7
17393 packuswb m4, m5
17394 movu [r0 + 1295 * 16], m4
17395
17396 ; mode 22 [row 8]
17397 movu m6, [r5 + 11 * 16]
17398 pmaddubsw m4, m0, m6
17399 pmulhrsw m4, m7
17400 pmaddubsw m5, m2, m6
17401 pmulhrsw m5, m7
17402 packuswb m4, m5
17403 movu [r0 + 1296 * 16], m4
17404 pmaddubsw m4, m1, m6
17405 pmulhrsw m4, m7
17406 pmaddubsw m5, m3, m6
17407 pmulhrsw m5, m7
17408 packuswb m4, m5
17409 movu [r0 + 1297 * 16], m4
17410
17411 ; mode 22 [row 9]
17412 movu m6, [r5 + 30 * 16]
17413 pslldq m0, 2
17414 pinsrb m0, [r4 + 7], 1
17415 pinsrb m0, [r4 + 10], 0
17416 pmaddubsw m4, m0, m6
17417 pmulhrsw m4, m7
17418 pslldq m2, 2
17419 pinsrw m2, [r3 + 4], 0
17420 pmaddubsw m5, m2, m6
17421 pmulhrsw m5, m7
17422 packuswb m4, m5
17423 movu [r0 + 1298 * 16], m4
17424 pslldq m1, 2
17425 pinsrw m1, [r3 + 12], 0
17426 pmaddubsw m4, m1, m6
17427 pmulhrsw m4, m7
17428 pslldq m3, 2
17429 pinsrw m3, [r3 + 20], 0
17430 pmaddubsw m5, m3, m6
17431 pmulhrsw m5, m7
17432 packuswb m4, m5
17433 movu [r0 + 1299 * 16], m4
17434
17435 ; mode 22 [row 10]
17436 movu m6, [r5 + 17 * 16]
17437 pmaddubsw m4, m0, m6
17438 pmulhrsw m4, m7
17439 pmaddubsw m5, m2, m6
17440 pmulhrsw m5, m7
17441 packuswb m4, m5
17442 movu [r0 + 1300 * 16], m4
17443 pmaddubsw m4, m1, m6
17444 pmulhrsw m4, m7
17445 pmaddubsw m5, m3, m6
17446 pmulhrsw m5, m7
17447 packuswb m4, m5
17448 movu [r0 + 1301 * 16], m4
17449
17450 ; mode 22 [row 11]
17451 movu m6, [r5 + 4 * 16]
17452 pmaddubsw m4, m0, m6
17453 pmulhrsw m4, m7
17454 pmaddubsw m5, m2, m6
17455 pmulhrsw m5, m7
17456 packuswb m4, m5
17457 movu [r0 + 1302 * 16], m4
17458 pmaddubsw m4, m1, m6
17459 pmulhrsw m4, m7
17460 pmaddubsw m5, m3, m6
17461 pmulhrsw m5, m7
17462 packuswb m4, m5
17463 movu [r0 + 1303 * 16], m4
17464
17465 ; mode 22 [row 12]
17466 movu m6, [r5 + 23 * 16]
17467 pslldq m0, 2
17468 pinsrb m0, [r4 + 10], 1
17469 pinsrb m0, [r4 + 12], 0
17470 pmaddubsw m4, m0, m6
17471 pmulhrsw m4, m7
17472 pslldq m2, 2
17473 pinsrw m2, [r3 + 3], 0
17474 pmaddubsw m5, m2, m6
17475 pmulhrsw m5, m7
17476 packuswb m4, m5
17477 movu [r0 + 1304 * 16], m4
17478 pslldq m1, 2
17479 pinsrw m1, [r3 + 11], 0
17480 pmaddubsw m4, m1, m6
17481 pmulhrsw m4, m7
17482 pslldq m3, 2
17483 pinsrw m3, [r3 + 19], 0
17484 pmaddubsw m5, m3, m6
17485 pmulhrsw m5, m7
17486 packuswb m4, m5
17487 movu [r0 + 1305 * 16], m4
17488
17489 ; mode 22 [row 13]
17490 movu m6, [r5 + 10 * 16]
17491 pmaddubsw m4, m0, m6
17492 pmulhrsw m4, m7
17493 pmaddubsw m5, m2, m6
17494 pmulhrsw m5, m7
17495 packuswb m4, m5
17496 movu [r0 + 1306 * 16], m4
17497 pmaddubsw m4, m1, m6
17498 pmulhrsw m4, m7
17499 pmaddubsw m5, m3, m6
17500 pmulhrsw m5, m7
17501 packuswb m4, m5
17502 movu [r0 + 1307 * 16], m4
17503
17504 ; mode 22 [row 14]
17505 movu m6, [r5 + 29 * 16]
17506 pslldq m0, 2
17507 pinsrb m0, [r4 + 12], 1
17508 pinsrb m0, [r4 + 15], 0
17509 pmaddubsw m4, m0, m6
17510 pmulhrsw m4, m7
17511 pslldq m2, 2
17512 pinsrw m2, [r3 + 2], 0
17513 pmaddubsw m5, m2, m6
17514 pmulhrsw m5, m7
17515 packuswb m4, m5
17516 movu [r0 + 1308 * 16], m4
17517 pslldq m1, 2
17518 pinsrw m1, [r3 + 10], 0
17519 pmaddubsw m4, m1, m6
17520 pmulhrsw m4, m7
17521 pslldq m3, 2
17522 pinsrw m3, [r3 + 18], 0
17523 pmaddubsw m5, m3, m6
17524 pmulhrsw m5, m7
17525 packuswb m4, m5
17526 movu [r0 + 1309 * 16], m4
17527
17528 ; mode 22 [row 15]
17529 movu m6, [r5 + 16 * 16]
17530 pmaddubsw m4, m0, m6
17531 pmulhrsw m4, m7
17532 pmaddubsw m5, m2, m6
17533 pmulhrsw m5, m7
17534 packuswb m4, m5
17535 movu [r0 + 1310 * 16], m4
17536 pmaddubsw m4, m1, m6
17537 pmulhrsw m4, m7
17538 pmaddubsw m5, m3, m6
17539 pmulhrsw m5, m7
17540 packuswb m4, m5
17541 movu [r0 + 1311 * 16], m4
17542
17543 ; mode 22 [row 16]
17544 movu m6, [r5 + 3 * 16]
17545 pmaddubsw m4, m0, m6
17546 pmulhrsw m4, m7
17547 pmaddubsw m5, m2, m6
17548 pmulhrsw m5, m7
17549 packuswb m4, m5
17550 movu [r0 + 1312 * 16], m4
17551 pmaddubsw m4, m1, m6
17552 pmulhrsw m4, m7
17553 pmaddubsw m5, m3, m6
17554 pmulhrsw m5, m7
17555 packuswb m4, m5
17556 movu [r0 + 1313 * 16], m4
17557
17558 ; mode 22 [row 17]
17559 movu m6, [r5 + 22 * 16]
17560 pslldq m0, 2
17561 pinsrb m0, [r4 + 15], 1
17562 pinsrb m0, [r4 + 17], 0
17563 pmaddubsw m4, m0, m6
17564 pmulhrsw m4, m7
17565 pslldq m2, 2
17566 pinsrw m2, [r3 + 1], 0
17567 pmaddubsw m5, m2, m6
17568 pmulhrsw m5, m7
17569 packuswb m4, m5
17570 movu [r0 + 1314 * 16], m4
17571 pslldq m1, 2
17572 pinsrw m1, [r3 + 9], 0
17573 pmaddubsw m4, m1, m6
17574 pmulhrsw m4, m7
17575 pslldq m3, 2
17576 pinsrw m3, [r3 + 17], 0
17577 pmaddubsw m5, m3, m6
17578 pmulhrsw m5, m7
17579 packuswb m4, m5
17580 movu [r0 + 1315 * 16], m4
17581
17582 ; mode 22 [row 18]
17583 movu m6, [r5 + 9 * 16]
17584 pmaddubsw m4, m0, m6
17585 pmulhrsw m4, m7
17586 pmaddubsw m5, m2, m6
17587 pmulhrsw m5, m7
17588 packuswb m4, m5
17589 movu [r0 + 1316 * 16], m4
17590 pmaddubsw m4, m1, m6
17591 pmulhrsw m4, m7
17592 pmaddubsw m5, m3, m6
17593 pmulhrsw m5, m7
17594 packuswb m4, m5
17595 movu [r0 + 1317 * 16], m4
17596
17597 ; mode 22 [row 19]
17598 movu m6, [r5 + 28 * 16]
17599 pslldq m0, 2
17600 pinsrb m0, [r4 + 17], 1
17601 pinsrb m0, [r4 + 20], 0
17602 pmaddubsw m4, m0, m6
17603 pmulhrsw m4, m7
17604 pslldq m2, 2
17605 pinsrw m2, [r3 + 0], 0
17606 pmaddubsw m5, m2, m6
17607 pmulhrsw m5, m7
17608 packuswb m4, m5
17609 movu [r0 + 1318 * 16], m4
17610 pslldq m1, 2
17611 pinsrw m1, [r3 + 8], 0
17612 pmaddubsw m4, m1, m6
17613 pmulhrsw m4, m7
17614 pslldq m3, 2
17615 pinsrw m3, [r3 + 16], 0
17616 pmaddubsw m5, m3, m6
17617 pmulhrsw m5, m7
17618 packuswb m4, m5
17619 movu [r0 + 1319 * 16], m4
17620
17621 ; mode 22 [row 20]
17622 movu m6, [r5 + 15 * 16]
17623 pmaddubsw m4, m0, m6
17624 pmulhrsw m4, m7
17625 pmaddubsw m5, m2, m6
17626 pmulhrsw m5, m7
17627 packuswb m4, m5
17628 movu [r0 + 1320 * 16], m4
17629 pmaddubsw m4, m1, m6
17630 pmulhrsw m4, m7
17631 pmaddubsw m5, m3, m6
17632 pmulhrsw m5, m7
17633 packuswb m4, m5
17634 movu [r0 + 1321 * 16], m4
17635
17636 ; mode 22 [row 21]
17637 movu m6, [r5 + 2 * 16]
17638 pmaddubsw m4, m0, m6
17639 pmulhrsw m4, m7
17640 pmaddubsw m5, m2, m6
17641 pmulhrsw m5, m7
17642 packuswb m4, m5
17643 movu [r0 + 1322 * 16], m4
17644 pmaddubsw m4, m1, m6
17645 pmulhrsw m4, m7
17646 pmaddubsw m5, m3, m6
17647 pmulhrsw m5, m7
17648 packuswb m4, m5
17649 movu [r0 + 1323 * 16], m4
17650
17651 ; mode 22 [row 22]
17652 movu m6, [r5 + 21 * 16]
17653 pslldq m0, 2
17654 pinsrb m0, [r4 + 20], 1
17655 pinsrb m0, [r4 + 22], 0
17656 pmaddubsw m4, m0, m6
17657 pmulhrsw m4, m7
17658 pslldq m2, 2
17659 pinsrb m2, [r4 + 0], 1
17660 pinsrb m2, [r4 + 2], 0
17661 pmaddubsw m5, m2, m6
17662 pmulhrsw m5, m7
17663 packuswb m4, m5
17664 movu [r0 + 1324 * 16], m4
17665 pslldq m1, 2
17666 pinsrw m1, [r3 + 7], 0
17667 pmaddubsw m4, m1, m6
17668 pmulhrsw m4, m7
17669 pslldq m3, 2
17670 pinsrw m3, [r3 + 15], 0
17671 pmaddubsw m5, m3, m6
17672 pmulhrsw m5, m7
17673 packuswb m4, m5
17674 movu [r0 + 1325 * 16], m4
17675
17676 ; mode 22 [row 23]
17677 movu m6, [r5 + 8 * 16]
17678 pmaddubsw m4, m0, m6
17679 pmulhrsw m4, m7
17680 pmaddubsw m5, m2, m6
17681 pmulhrsw m5, m7
17682 packuswb m4, m5
17683 movu [r0 + 1326 * 16], m4
17684 pmaddubsw m4, m1, m6
17685 pmulhrsw m4, m7
17686 pmaddubsw m5, m3, m6
17687 pmulhrsw m5, m7
17688 packuswb m4, m5
17689 movu [r0 + 1327 * 16], m4
17690
17691 ; mode 22 [row 24]
17692 movu m6, [r5 + 27 * 16]
17693 pslldq m0, 2
17694 pinsrb m0, [r4 + 22], 1
17695 pinsrb m0, [r4 + 25], 0
17696 pmaddubsw m4, m0, m6
17697 pmulhrsw m4, m7
17698 pslldq m2, 2
17699 pinsrb m2, [r4 + 2], 1
17700 pinsrb m2, [r4 + 5], 0
17701 pmaddubsw m5, m2, m6
17702 pmulhrsw m5, m7
17703 packuswb m4, m5
17704 movu [r0 + 1328 * 16], m4
17705 pslldq m1, 2
17706 pinsrw m1, [r3 + 6], 0
17707 pmaddubsw m4, m1, m6
17708 pmulhrsw m4, m7
17709 pslldq m3, 2
17710 pinsrw m3, [r3 + 14], 0
17711 pmaddubsw m5, m3, m6
17712 pmulhrsw m5, m7
17713 packuswb m4, m5
17714 movu [r0 + 1329 * 16], m4
17715
17716 ; mode 22 [row 25]
17717 movu m6, [r5 + 14 * 16]
17718 pmaddubsw m4, m0, m6
17719 pmulhrsw m4, m7
17720 pmaddubsw m5, m2, m6
17721 pmulhrsw m5, m7
17722 packuswb m4, m5
17723 movu [r0 + 1330 * 16], m4
17724 pmaddubsw m4, m1, m6
17725 pmulhrsw m4, m7
17726 pmaddubsw m5, m3, m6
17727 pmulhrsw m5, m7
17728 packuswb m4, m5
17729 movu [r0 + 1331 * 16], m4
17730
17731 ; mode 22 [row 26]
17732 movu m6, [r5 + 1 * 16]
17733 pmaddubsw m4, m0, m6
17734 pmulhrsw m4, m7
17735 pmaddubsw m5, m2, m6
17736 pmulhrsw m5, m7
17737 packuswb m4, m5
17738 movu [r0 + 1332 * 16], m4
17739 pmaddubsw m4, m1, m6
17740 pmulhrsw m4, m7
17741 pmaddubsw m5, m3, m6
17742 pmulhrsw m5, m7
17743 packuswb m4, m5
17744 movu [r0 + 1333 * 16], m4
17745
17746 ; mode 22 [row 27]
17747 movu m6, [r5 + 20 * 16]
17748 pslldq m0, 2
17749 pinsrb m0, [r4 + 25], 1
17750 pinsrb m0, [r4 + 27], 0
17751 pmaddubsw m4, m0, m6
17752 pmulhrsw m4, m7
17753 pslldq m2, 2
17754 pinsrb m2, [r4 + 5], 1
17755 pinsrb m2, [r4 + 7], 0
17756 pmaddubsw m5, m2, m6
17757 pmulhrsw m5, m7
17758 packuswb m4, m5
17759 movu [r0 + 1334 * 16], m4
17760 pslldq m1, 2
17761 pinsrw m1, [r3 + 5], 0
17762 pmaddubsw m4, m1, m6
17763 pmulhrsw m4, m7
17764 pslldq m3, 2
17765 pinsrw m3, [r3 + 13], 0
17766 pmaddubsw m5, m3, m6
17767 pmulhrsw m5, m7
17768 packuswb m4, m5
17769 movu [r0 + 1335 * 16], m4
17770
17771 ; mode 22 [row 28]
17772 movu m6, [r5 + 7 * 16]
17773 pmaddubsw m4, m0, m6
17774 pmulhrsw m4, m7
17775 pmaddubsw m5, m2, m6
17776 pmulhrsw m5, m7
17777 packuswb m4, m5
17778 movu [r0 + 1336 * 16], m4
17779 pmaddubsw m4, m1, m6
17780 pmulhrsw m4, m7
17781 pmaddubsw m5, m3, m6
17782 pmulhrsw m5, m7
17783 packuswb m4, m5
17784 movu [r0 + 1337 * 16], m4
17785
17786 ; mode 22 [row 29]
17787 movu m6, [r5 + 26 * 16]
17788 pslldq m0, 2
17789 pinsrb m0, [r4 + 27], 1
17790 pinsrb m0, [r4 + 30], 0
17791 pmaddubsw m4, m0, m6
17792 pmulhrsw m4, m7
17793 pslldq m2, 2
17794 pinsrb m2, [r4 + 7], 1
17795 pinsrb m2, [r4 + 10], 0
17796 pmaddubsw m5, m2, m6
17797 pmulhrsw m5, m7
17798 packuswb m4, m5
17799 movu [r0 + 1338 * 16], m4
17800 pslldq m1, 2
17801 pinsrw m1, [r3 + 4], 0
17802 pmaddubsw m4, m1, m6
17803 pmulhrsw m4, m7
17804 pslldq m3, 2
17805 pinsrw m3, [r3 + 12], 0
17806 pmaddubsw m5, m3, m6
17807 pmulhrsw m5, m7
17808 packuswb m4, m5
17809 movu [r0 + 1339 * 16], m4
17810
17811 ; mode 22 [row 30]
17812 movu m6, [r5 + 13 * 16]
17813 pmaddubsw m4, m0, m6
17814 pmulhrsw m4, m7
17815 pmaddubsw m5, m2, m6
17816 pmulhrsw m5, m7
17817 packuswb m4, m5
17818 movu [r0 + 1340 * 16], m4
17819 pmaddubsw m4, m1, m6
17820 pmulhrsw m4, m7
17821 pmaddubsw m5, m3, m6
17822 pmulhrsw m5, m7
17823 packuswb m4, m5
17824 movu [r0 + 1341 * 16], m4
17825
17826 ; mode 22 [row 31] (no pmaddubsw/pmulhrsw here: each half-row is a pshufb of m0..m3 with tab_S2, stored with movh)
17827 pshufb m5, m0, [tab_S2]
17828 movh [r0 + 1342 * 16], m5
17829 pshufb m5, m2, [tab_S2]
17830 movh [r0 + 1342 * 16 + 8], m5
17831 pshufb m5, m1, [tab_S2]
17832 movh [r0 + 1343 * 16], m5
17833 pshufb m5, m3, [tab_S2]
17834 movh [r0 + 1343 * 16 + 8], m5
17835
17836 ; mode 23 [row 0]
17837 movu m6, [r5 + 23 * 16]
17838 movu m0, [r3 ]
17839 movu m1, [r3 + 1 ]
17840 punpcklbw m0, m1
17841 pmaddubsw m1, m0, m6
17842 pmulhrsw m1, m7
17843 movu m2, [r3 + 8]
17844 movu m3, [r3 + 9]
17845 punpcklbw m2, m3
17846 pmaddubsw m3, m2, m6
17847 pmulhrsw m3, m7
17848 packuswb m1, m3
17849 movu [r0 + 1344 * 16], m1
17850
17851 movu m1, [r3 + 16]
17852 movu m3, [r3 + 17]
17853 punpcklbw m1, m3
17854 pmaddubsw m4, m1, m6
17855 pmulhrsw m4, m7
17856 movu m3, [r3 + 24]
17857 movu m5, [r3 + 25]
17858 punpcklbw m3, m5
17859 pmaddubsw m5, m3, m6
17860 pmulhrsw m5, m7
17861 packuswb m4, m5
17862 movu [r0 + 1345 * 16], m4
17863
17864 ; mode 23 [row 1]
17865 movu m6, [r5 + 14 * 16]
17866 pmaddubsw m4, m0, m6
17867 pmulhrsw m4, m7
17868 pmaddubsw m5, m2, m6
17869 pmulhrsw m5, m7
17870 packuswb m4, m5
17871 movu [r0 + 1346 * 16], m4
17872 pmaddubsw m4, m1, m6
17873 pmulhrsw m4, m7
17874 pmaddubsw m5, m3, m6
17875 pmulhrsw m5, m7
17876 packuswb m4, m5
17877 movu [r0 + 1347 * 16], m4
17878
17879 ; mode 23 [row 2]
17880 movu m6, [r5 + 5 * 16]
17881 pmaddubsw m4, m0, m6
17882 pmulhrsw m4, m7
17883 pmaddubsw m5, m2, m6
17884 pmulhrsw m5, m7
17885 packuswb m4, m5
17886 movu [r0 + 1348 * 16], m4
17887 pmaddubsw m4, m1, m6
17888 pmulhrsw m4, m7
17889 pmaddubsw m5, m3, m6
17890 pmulhrsw m5, m7
17891 packuswb m4, m5
17892 movu [r0 + 1349 * 16], m4
17893
17894 ; mode 23 [row 3]
17895 movu m6, [r5 + 28 * 16]
17896 pslldq m0, 2
17897 pinsrb m0, [r4 + 0], 1
17898 pinsrb m0, [r4 + 4], 0
17899 pmaddubsw m4, m0, m6
17900 pmulhrsw m4, m7
17901 pslldq m2, 2
17902 pinsrw m2, [r3 + 7], 0
17903 pmaddubsw m5, m2, m6
17904 pmulhrsw m5, m7
17905 packuswb m4, m5
17906 movu [r0 + 1350 * 16], m4
17907 pslldq m1, 2
17908 pinsrw m1, [r3 + 15], 0
17909 pmaddubsw m4, m1, m6
17910 pmulhrsw m4, m7
17911 pslldq m3, 2
17912 pinsrw m3, [r3 + 23], 0
17913 pmaddubsw m5, m3, m6
17914 pmulhrsw m5, m7
17915 packuswb m4, m5
17916 movu [r0 + 1351 * 16], m4
17917
17918 ; mode 23 [row 4]
17919 movu m6, [r5 + 19 * 16]
17920 pmaddubsw m4, m0, m6
17921 pmulhrsw m4, m7
17922 pmaddubsw m5, m2, m6
17923 pmulhrsw m5, m7
17924 packuswb m4, m5
17925 movu [r0 + 1352 * 16], m4
17926 pmaddubsw m4, m1, m6
17927 pmulhrsw m4, m7
17928 pmaddubsw m5, m3, m6
17929 pmulhrsw m5, m7
17930 packuswb m4, m5
17931 movu [r0 + 1353 * 16], m4
17932
17933 ; mode 23 [row 5]
17934 movu m6, [r5 + 10 * 16]
17935 pmaddubsw m4, m0, m6
17936 pmulhrsw m4, m7
17937 pmaddubsw m5, m2, m6
17938 pmulhrsw m5, m7
17939 packuswb m4, m5
17940 movu [r0 + 1354 * 16], m4
17941 pmaddubsw m4, m1, m6
17942 pmulhrsw m4, m7
17943 pmaddubsw m5, m3, m6
17944 pmulhrsw m5, m7
17945 packuswb m4, m5
17946 movu [r0 + 1355 * 16], m4
17947
17948 ; mode 23 [row 6]
17949 movu m6, [r5 + 1 * 16]
17950 pmaddubsw m4, m0, m6
17951 pmulhrsw m4, m7
17952 pmaddubsw m5, m2, m6
17953 pmulhrsw m5, m7
17954 packuswb m4, m5
17955 movu [r0 + 1356 * 16], m4
17956 pmaddubsw m4, m1, m6
17957 pmulhrsw m4, m7
17958 pmaddubsw m5, m3, m6
17959 pmulhrsw m5, m7
17960 packuswb m4, m5
17961 movu [r0 + 1357 * 16], m4
17962
17963 ; mode 23 [row 7]
17964 movu m6, [r5 + 24 * 16]
17965 pslldq m0, 2
17966 pinsrb m0, [r4 + 4], 1
17967 pinsrb m0, [r4 + 7], 0
17968 pmaddubsw m4, m0, m6
17969 pmulhrsw m4, m7
17970 pslldq m2, 2
17971 pinsrw m2, [r3 + 6], 0
17972 pmaddubsw m5, m2, m6
17973 pmulhrsw m5, m7
17974 packuswb m4, m5
17975 movu [r0 + 1358 * 16], m4
17976 pslldq m1, 2
17977 pinsrw m1, [r3 + 14], 0
17978 pmaddubsw m4, m1, m6
17979 pmulhrsw m4, m7
17980 pslldq m3, 2
17981 pinsrw m3, [r3 + 22], 0
17982 pmaddubsw m5, m3, m6
17983 pmulhrsw m5, m7
17984 packuswb m4, m5
17985 movu [r0 + 1359 * 16], m4
17986
17987 ; mode 23 [row 8]
17988 movu m6, [r5 + 15 * 16]
17989 pmaddubsw m4, m0, m6
17990 pmulhrsw m4, m7
17991 pmaddubsw m5, m2, m6
17992 pmulhrsw m5, m7
17993 packuswb m4, m5
17994 movu [r0 + 1360 * 16], m4
17995 pmaddubsw m4, m1, m6
17996 pmulhrsw m4, m7
17997 pmaddubsw m5, m3, m6
17998 pmulhrsw m5, m7
17999 packuswb m4, m5
18000 movu [r0 + 1361 * 16], m4
18001
18002 ; mode 23 [row 9]
18003 movu m6, [r5 + 6 * 16]
18004 pmaddubsw m4, m0, m6
18005 pmulhrsw m4, m7
18006 pmaddubsw m5, m2, m6
18007 pmulhrsw m5, m7
18008 packuswb m4, m5
18009 movu [r0 + 1362 * 16], m4
18010 pmaddubsw m4, m1, m6
18011 pmulhrsw m4, m7
18012 pmaddubsw m5, m3, m6
18013 pmulhrsw m5, m7
18014 packuswb m4, m5
18015 movu [r0 + 1363 * 16], m4
18016
18017 ; mode 23 [row 10]
18018 movu m6, [r5 + 29 * 16]
18019 pslldq m0, 2
18020 pinsrb m0, [r4 + 7], 1
18021 pinsrb m0, [r4 + 11], 0
18022 pmaddubsw m4, m0, m6
18023 pmulhrsw m4, m7
18024 pslldq m2, 2
18025 pinsrw m2, [r3 + 5], 0
18026 pmaddubsw m5, m2, m6
18027 pmulhrsw m5, m7
18028 packuswb m4, m5
18029 movu [r0 + 1364 * 16], m4
18030 pslldq m1, 2
18031 pinsrw m1, [r3 + 13], 0
18032 pmaddubsw m4, m1, m6
18033 pmulhrsw m4, m7
18034 pslldq m3, 2
18035 pinsrw m3, [r3 + 21], 0
18036 pmaddubsw m5, m3, m6
18037 pmulhrsw m5, m7
18038 packuswb m4, m5
18039 movu [r0 + 1365 * 16], m4
18040
18041 ; mode 23 [row 11]
18042 movu m6, [r5 + 20 * 16]
18043 pmaddubsw m4, m0, m6
18044 pmulhrsw m4, m7
18045 pmaddubsw m5, m2, m6
18046 pmulhrsw m5, m7
18047 packuswb m4, m5
18048 movu [r0 + 1366 * 16], m4
18049 pmaddubsw m4, m1, m6
18050 pmulhrsw m4, m7
18051 pmaddubsw m5, m3, m6
18052 pmulhrsw m5, m7
18053 packuswb m4, m5
18054 movu [r0 + 1367 * 16], m4
18055
18056 ; mode 23 [row 12]
18057 movu m6, [r5 + 11 * 16]
18058 pmaddubsw m4, m0, m6
18059 pmulhrsw m4, m7
18060 pmaddubsw m5, m2, m6
18061 pmulhrsw m5, m7
18062 packuswb m4, m5
18063 movu [r0 + 1368 * 16], m4
18064 pmaddubsw m4, m1, m6
18065 pmulhrsw m4, m7
18066 pmaddubsw m5, m3, m6
18067 pmulhrsw m5, m7
18068 packuswb m4, m5
18069 movu [r0 + 1369 * 16], m4
18070
18071 ; mode 23 [row 13]
18072 movu m6, [r5 + 2 * 16]
18073 pmaddubsw m4, m0, m6
18074 pmulhrsw m4, m7
18075 pmaddubsw m5, m2, m6
18076 pmulhrsw m5, m7
18077 packuswb m4, m5
18078 movu [r0 + 1370 * 16], m4
18079 pmaddubsw m4, m1, m6
18080 pmulhrsw m4, m7
18081 pmaddubsw m5, m3, m6
18082 pmulhrsw m5, m7
18083 packuswb m4, m5
18084 movu [r0 + 1371 * 16], m4
18085
18086 ; mode 23 [row 14]
18087 movu m6, [r5 + 25 * 16]
18088 pslldq m0, 2
18089 pinsrb m0, [r4 + 11], 1
18090 pinsrb m0, [r4 + 14], 0
18091 pmaddubsw m4, m0, m6
18092 pmulhrsw m4, m7
18093 pslldq m2, 2
18094 pinsrw m2, [r3 + 4], 0
18095 pmaddubsw m5, m2, m6
18096 pmulhrsw m5, m7
18097 packuswb m4, m5
18098 movu [r0 + 1372 * 16], m4
18099 pslldq m1, 2
18100 pinsrw m1, [r3 + 12], 0
18101 pmaddubsw m4, m1, m6
18102 pmulhrsw m4, m7
18103 pslldq m3, 2
18104 pinsrw m3, [r3 + 20], 0
18105 pmaddubsw m5, m3, m6
18106 pmulhrsw m5, m7
18107 packuswb m4, m5
18108 movu [r0 + 1373 * 16], m4
18109
18110 ; mode 23 [row 15]
18111 movu m6, [r5 + 16 * 16]
18112 pmaddubsw m4, m0, m6
18113 pmulhrsw m4, m7
18114 pmaddubsw m5, m2, m6
18115 pmulhrsw m5, m7
18116 packuswb m4, m5
18117 movu [r0 + 1374 * 16], m4
18118 pmaddubsw m4, m1, m6
18119 pmulhrsw m4, m7
18120 pmaddubsw m5, m3, m6
18121 pmulhrsw m5, m7
18122 packuswb m4, m5
18123 movu [r0 + 1375 * 16], m4
18124
18125 ; mode 23 [row 16]
18126 movu m6, [r5 + 7 * 16]
18127 pmaddubsw m4, m0, m6
18128 pmulhrsw m4, m7
18129 pmaddubsw m5, m2, m6
18130 pmulhrsw m5, m7
18131 packuswb m4, m5
18132 movu [r0 + 1376 * 16], m4
18133 pmaddubsw m4, m1, m6
18134 pmulhrsw m4, m7
18135 pmaddubsw m5, m3, m6
18136 pmulhrsw m5, m7
18137 packuswb m4, m5
18138 movu [r0 + 1377 * 16], m4
18139
18140 ; mode 23 [row 17]
18141 movu m6, [r5 + 30 * 16]
18142 pslldq m0, 2
18143 pinsrb m0, [r4 + 14], 1
18144 pinsrb m0, [r4 + 18], 0
18145 pmaddubsw m4, m0, m6
18146 pmulhrsw m4, m7
18147 pslldq m2, 2
18148 pinsrw m2, [r3 + 3], 0
18149 pmaddubsw m5, m2, m6
18150 pmulhrsw m5, m7
18151 packuswb m4, m5
18152 movu [r0 + 1378 * 16], m4
18153 pslldq m1, 2
18154 pinsrw m1, [r3 + 11], 0
18155 pmaddubsw m4, m1, m6
18156 pmulhrsw m4, m7
18157 pslldq m3, 2
18158 pinsrw m3, [r3 + 19], 0
18159 pmaddubsw m5, m3, m6
18160 pmulhrsw m5, m7
18161 packuswb m4, m5
18162 movu [r0 + 1379 * 16], m4
18163
18164 ; mode 23 [row 18]
18165 movu m6, [r5 + 21 * 16]
18166 pmaddubsw m4, m0, m6
18167 pmulhrsw m4, m7
18168 pmaddubsw m5, m2, m6
18169 pmulhrsw m5, m7
18170 packuswb m4, m5
18171 movu [r0 + 1380 * 16], m4
18172 pmaddubsw m4, m1, m6
18173 pmulhrsw m4, m7
18174 pmaddubsw m5, m3, m6
18175 pmulhrsw m5, m7
18176 packuswb m4, m5
18177 movu [r0 + 1381 * 16], m4
18178
18179 ; mode 23 [row 19]
18180 movu m6, [r5 + 12 * 16]
18181 pmaddubsw m4, m0, m6
18182 pmulhrsw m4, m7
18183 pmaddubsw m5, m2, m6
18184 pmulhrsw m5, m7
18185 packuswb m4, m5
18186 movu [r0 + 1382 * 16], m4
18187 pmaddubsw m4, m1, m6
18188 pmulhrsw m4, m7
18189 pmaddubsw m5, m3, m6
18190 pmulhrsw m5, m7
18191 packuswb m4, m5
18192 movu [r0 + 1383 * 16], m4
18193
18194 ; mode 23 [row 20]
18195 movu m6, [r5 + 3 * 16]
18196 pmaddubsw m4, m0, m6
18197 pmulhrsw m4, m7
18198 pmaddubsw m5, m2, m6
18199 pmulhrsw m5, m7
18200 packuswb m4, m5
18201 movu [r0 + 1384 * 16], m4
18202 pmaddubsw m4, m1, m6
18203 pmulhrsw m4, m7
18204 pmaddubsw m5, m3, m6
18205 pmulhrsw m5, m7
18206 packuswb m4, m5
18207 movu [r0 + 1385 * 16], m4
18208
18209 ; mode 23 [row 21]
18210 movu m6, [r5 + 26 * 16]
18211 pslldq m0, 2
18212 pinsrb m0, [r4 + 18], 1
18213 pinsrb m0, [r4 + 21], 0
18214 pmaddubsw m4, m0, m6
18215 pmulhrsw m4, m7
18216 pslldq m2, 2
18217 pinsrw m2, [r3 + 2], 0
18218 pmaddubsw m5, m2, m6
18219 pmulhrsw m5, m7
18220 packuswb m4, m5
18221 movu [r0 + 1386 * 16], m4
18222 pslldq m1, 2
18223 pinsrw m1, [r3 + 10], 0
18224 pmaddubsw m4, m1, m6
18225 pmulhrsw m4, m7
18226 pslldq m3, 2
18227 pinsrw m3, [r3 + 18], 0
18228 pmaddubsw m5, m3, m6
18229 pmulhrsw m5, m7
18230 packuswb m4, m5
18231 movu [r0 + 1387 * 16], m4
18232
18233 ; mode 23 [row 22]
18234 movu m6, [r5 + 17 * 16]
18235 pmaddubsw m4, m0, m6
18236 pmulhrsw m4, m7
18237 pmaddubsw m5, m2, m6
18238 pmulhrsw m5, m7
18239 packuswb m4, m5
18240 movu [r0 + 1388 * 16], m4
18241 pmaddubsw m4, m1, m6
18242 pmulhrsw m4, m7
18243 pmaddubsw m5, m3, m6
18244 pmulhrsw m5, m7
18245 packuswb m4, m5
18246 movu [r0 + 1389 * 16], m4
18247
18248 ; mode 23 [row 23]
18249 movu m6, [r5 + 8 * 16]
18250 pmaddubsw m4, m0, m6
18251 pmulhrsw m4, m7
18252 pmaddubsw m5, m2, m6
18253 pmulhrsw m5, m7
18254 packuswb m4, m5
18255 movu [r0 + 1390 * 16], m4
18256 pmaddubsw m4, m1, m6
18257 pmulhrsw m4, m7
18258 pmaddubsw m5, m3, m6
18259 pmulhrsw m5, m7
18260 packuswb m4, m5
18261 movu [r0 + 1391 * 16], m4
18262
18263 ; mode 23 [row 24]
18264 movu m6, [r5 + 31 * 16]
18265 pslldq m0, 2
18266 pinsrb m0, [r4 + 21], 1
18267 pinsrb m0, [r4 + 25], 0
18268 pmaddubsw m4, m0, m6
18269 pmulhrsw m4, m7
18270 pslldq m2, 2
18271 pinsrw m2, [r3 + 1], 0
18272 pmaddubsw m5, m2, m6
18273 pmulhrsw m5, m7
18274 packuswb m4, m5
18275 movu [r0 + 1392 * 16], m4
18276 pslldq m1, 2
18277 pinsrw m1, [r3 + 9], 0
18278 pmaddubsw m4, m1, m6
18279 pmulhrsw m4, m7
18280 pslldq m3, 2
18281 pinsrw m3, [r3 + 17], 0
18282 pmaddubsw m5, m3, m6
18283 pmulhrsw m5, m7
18284 packuswb m4, m5
18285 movu [r0 + 1393 * 16], m4
18286
18287 ; mode 23 [row 25]
18288 movu m6, [r5 + 22 * 16]
18289 pmaddubsw m4, m0, m6
18290 pmulhrsw m4, m7
18291 pmaddubsw m5, m2, m6
18292 pmulhrsw m5, m7
18293 packuswb m4, m5
18294 movu [r0 + 1394 * 16], m4
18295 pmaddubsw m4, m1, m6
18296 pmulhrsw m4, m7
18297 pmaddubsw m5, m3, m6
18298 pmulhrsw m5, m7
18299 packuswb m4, m5
18300 movu [r0 + 1395 * 16], m4
18301
18302 ; mode 23 [row 26]
18303 movu m6, [r5 + 13 * 16]
18304 pmaddubsw m4, m0, m6
18305 pmulhrsw m4, m7
18306 pmaddubsw m5, m2, m6
18307 pmulhrsw m5, m7
18308 packuswb m4, m5
18309 movu [r0 + 1396 * 16], m4
18310 pmaddubsw m4, m1, m6
18311 pmulhrsw m4, m7
18312 pmaddubsw m5, m3, m6
18313 pmulhrsw m5, m7
18314 packuswb m4, m5
18315 movu [r0 + 1397 * 16], m4
18316
18317 ; mode 23 [row 27]
18318 movu m6, [r5 + 4 * 16]
18319 pmaddubsw m4, m0, m6
18320 pmulhrsw m4, m7
18321 pmaddubsw m5, m2, m6
18322 pmulhrsw m5, m7
18323 packuswb m4, m5
18324 movu [r0 + 1398 * 16], m4
18325 pmaddubsw m4, m1, m6
18326 pmulhrsw m4, m7
18327 pmaddubsw m5, m3, m6
18328 pmulhrsw m5, m7
18329 packuswb m4, m5
18330 movu [r0 + 1399 * 16], m4
18331
18332 ; mode 23 [row 28]
18333 movu m6, [r5 + 27 * 16]
18334 pslldq m0, 2
18335 pinsrb m0, [r4 + 25], 1
18336 pinsrb m0, [r4 + 28], 0
18337 pmaddubsw m4, m0, m6
18338 pmulhrsw m4, m7
18339 pslldq m2, 2
18340 pinsrw m2, [r3 + 0], 0
18341 pmaddubsw m5, m2, m6
18342 pmulhrsw m5, m7
18343 packuswb m4, m5
18344 movu [r0 + 1400 * 16], m4
18345 pslldq m1, 2
18346 pinsrw m1, [r3 + 8], 0
18347 pmaddubsw m4, m1, m6
18348 pmulhrsw m4, m7
18349 pslldq m3, 2
18350 pinsrw m3, [r3 + 16], 0
18351 pmaddubsw m5, m3, m6
18352 pmulhrsw m5, m7
18353 packuswb m4, m5
18354 movu [r0 + 1401 * 16], m4
18355
18356 ; mode 23 [row 29]
18357 movu m6, [r5 + 18 * 16]
18358 pmaddubsw m4, m0, m6
18359 pmulhrsw m4, m7
18360 pmaddubsw m5, m2, m6
18361 pmulhrsw m5, m7
18362 packuswb m4, m5
18363 movu [r0 + 1402 * 16], m4
18364 pmaddubsw m4, m1, m6
18365 pmulhrsw m4, m7
18366 pmaddubsw m5, m3, m6
18367 pmulhrsw m5, m7
18368 packuswb m4, m5
18369 movu [r0 + 1403 * 16], m4
18370
18371 ; mode 23 [row 30]
18372 movu m6, [r5 + 9 * 16]
18373 pmaddubsw m4, m0, m6
18374 pmulhrsw m4, m7
18375 pmaddubsw m5, m2, m6
18376 pmulhrsw m5, m7
18377 packuswb m4, m5
18378 movu [r0 + 1404 * 16], m4
18379 pmaddubsw m4, m1, m6
18380 pmulhrsw m4, m7
18381 pmaddubsw m5, m3, m6
18382 pmulhrsw m5, m7
18383 packuswb m4, m5
18384 movu [r0 + 1405 * 16], m4
18385
18386 ; mode 23 [row 31] (no pmaddubsw/pmulhrsw here: each half-row is a pshufb of m0..m3 with tab_S2, stored with movh)
18387 pshufb m5, m0, [tab_S2]
18388 movh [r0 + 1406 * 16], m5
18389 pshufb m5, m2, [tab_S2]
18390 movh [r0 + 1406 * 16 + 8], m5
18391 pshufb m5, m1, [tab_S2]
18392 movh [r0 + 1407 * 16], m5
18393 pshufb m5, m3, [tab_S2]
18394 movh [r0 + 1407 * 16 + 8], m5
18395
18396 ; mode 24 [row 0]
18397 movu m6, [r5 + 27 * 16]
18398 movu m0, [r3 ]
18399 movu m1, [r3 + 1 ]
18400 punpcklbw m0, m1
18401 pmaddubsw m4, m0, m6
18402 pmulhrsw m4, m7
18403 movu m2, [r3 + 8]
18404 movu m3, [r3 + 9]
18405 punpcklbw m2, m3
18406 pmaddubsw m5, m2, m6
18407 pmulhrsw m5, m7
18408 packuswb m4, m5
18409 movu [r0 + 1408 * 16], m4
18410
18411 movu m1, [r3 + 16]
18412 movu m3, [r3 + 17]
18413 punpcklbw m1, m3
18414 pmaddubsw m4, m1, m6
18415 pmulhrsw m4, m7
18416 movu m3, [r3 + 24]
18417 movu m5, [r3 + 25]
18418 punpcklbw m3, m5
18419 pmaddubsw m5, m3, m6
18420 pmulhrsw m5, m7
18421 packuswb m4, m5
18422 movu [r0 + 1409 * 16], m4
18423
18424 ; mode 24 [row 1]
18425 movu m6, [r5 + 22 * 16]
18426 pmaddubsw m4, m0, m6
18427 pmulhrsw m4, m7
18428 pmaddubsw m5, m2, m6
18429 pmulhrsw m5, m7
18430 packuswb m4, m5
18431 movu [r0 + 1410 * 16], m4
18432 pmaddubsw m4, m1, m6
18433 pmulhrsw m4, m7
18434 pmaddubsw m5, m3, m6
18435 pmulhrsw m5, m7
18436 packuswb m4, m5
18437 movu [r0 + 1411 * 16], m4
18438
18439 ; mode 24 [row 2]
18440 movu m6, [r5 + 17 * 16]
18441 pmaddubsw m4, m0, m6
18442 pmulhrsw m4, m7
18443 pmaddubsw m5, m2, m6
18444 pmulhrsw m5, m7
18445 packuswb m4, m5
18446 movu [r0 + 1412 * 16], m4
18447 pmaddubsw m4, m1, m6
18448 pmulhrsw m4, m7
18449 pmaddubsw m5, m3, m6
18450 pmulhrsw m5, m7
18451 packuswb m4, m5
18452 movu [r0 + 1413 * 16], m4
18453
18454 ; mode 24 [row 3]
18455 movu m6, [r5 + 12 * 16]
18456 pmaddubsw m4, m0, m6
18457 pmulhrsw m4, m7
18458 pmaddubsw m5, m2, m6
18459 pmulhrsw m5, m7
18460 packuswb m4, m5
18461 movu [r0 + 1414 * 16], m4
18462 pmaddubsw m4, m1, m6
18463 pmulhrsw m4, m7
18464 pmaddubsw m5, m3, m6
18465 pmulhrsw m5, m7
18466 packuswb m4, m5
18467 movu [r0 + 1415 * 16], m4
18468
18469 ; mode 24 [row 4]
18470 movu m6, [r5 + 7 * 16]
18471 pmaddubsw m4, m0, m6
18472 pmulhrsw m4, m7
18473 pmaddubsw m5, m2, m6
18474 pmulhrsw m5, m7
18475 packuswb m4, m5
18476 movu [r0 + 1416 * 16], m4
18477 pmaddubsw m4, m1, m6
18478 pmulhrsw m4, m7
18479 pmaddubsw m5, m3, m6
18480 pmulhrsw m5, m7
18481 packuswb m4, m5
18482 movu [r0 + 1417 * 16], m4
18483
18484 ; mode 24 [row 5]
18485 movu m6, [r5 + 2 * 16]
18486 pmaddubsw m4, m0, m6
18487 pmulhrsw m4, m7
18488 pmaddubsw m5, m2, m6
18489 pmulhrsw m5, m7
18490 packuswb m4, m5
18491 movu [r0 + 1418 * 16], m4
18492 pmaddubsw m4, m1, m6
18493 pmulhrsw m4, m7
18494 pmaddubsw m5, m3, m6
18495 pmulhrsw m5, m7
18496 packuswb m4, m5
18497 movu [r0 + 1419 * 16], m4
18498
18499 ; mode 24 [row 6]
18500 movu m6, [r5 + 29 * 16]
18501 pslldq m0, 2
18502 pinsrb m0, [r4 + 0], 1
18503 pinsrb m0, [r4 + 6], 0
18504 pmaddubsw m4, m0, m6
18505 pmulhrsw m4, m7
18506 pslldq m2, 2
18507 pinsrw m2, [r3 + 7], 0
18508 pmaddubsw m5, m2, m6
18509 pmulhrsw m5, m7
18510 packuswb m4, m5
18511 movu [r0 + 1420 * 16], m4
18512 pslldq m1, 2
18513 pinsrw m1, [r3 + 15], 0
18514 pmaddubsw m4, m1, m6
18515 pmulhrsw m4, m7
18516 pslldq m3, 2
18517 pinsrw m3, [r3 + 23], 0
18518 pmaddubsw m5, m3, m6
18519 pmulhrsw m5, m7
18520 packuswb m4, m5
18521 movu [r0 + 1421 * 16], m4
18522
18523 ; mode 24 [row 7]
18524 movu m6, [r5 + 24 * 16]
18525 pmaddubsw m4, m0, m6
18526 pmulhrsw m4, m7
18527 pmaddubsw m5, m2, m6
18528 pmulhrsw m5, m7
18529 packuswb m4, m5
18530 movu [r0 + 1422 * 16], m4
18531 pmaddubsw m4, m1, m6
18532 pmulhrsw m4, m7
18533 pmaddubsw m5, m3, m6
18534 pmulhrsw m5, m7
18535 packuswb m4, m5
18536 movu [r0 + 1423 * 16], m4
18537
18538 ; mode 24 [row 8]
18539 movu m6, [r5 + 19 * 16]
18540 pmaddubsw m4, m0, m6
18541 pmulhrsw m4, m7
18542 pmaddubsw m5, m2, m6
18543 pmulhrsw m5, m7
18544 packuswb m4, m5
18545 movu [r0 + 1424 * 16], m4
18546 pmaddubsw m4, m1, m6
18547 pmulhrsw m4, m7
18548 pmaddubsw m5, m3, m6
18549 pmulhrsw m5, m7
18550 packuswb m4, m5
18551 movu [r0 + 1425 * 16], m4
18552
18553 ; mode 24 [row 9]
18554 movu m6, [r5 + 14 * 16]
18555 pmaddubsw m4, m0, m6
18556 pmulhrsw m4, m7
18557 pmaddubsw m5, m2, m6
18558 pmulhrsw m5, m7
18559 packuswb m4, m5
18560 movu [r0 + 1426 * 16], m4
18561 pmaddubsw m4, m1, m6
18562 pmulhrsw m4, m7
18563 pmaddubsw m5, m3, m6
18564 pmulhrsw m5, m7
18565 packuswb m4, m5
18566 movu [r0 + 1427 * 16], m4
18567
18568 ; mode 24 [row 10]
18569 movu m6, [r5 + 9 * 16]
18570 pmaddubsw m4, m0, m6
18571 pmulhrsw m4, m7
18572 pmaddubsw m5, m2, m6
18573 pmulhrsw m5, m7
18574 packuswb m4, m5
18575 movu [r0 + 1428 * 16], m4
18576 pmaddubsw m4, m1, m6
18577 pmulhrsw m4, m7
18578 pmaddubsw m5, m3, m6
18579 pmulhrsw m5, m7
18580 packuswb m4, m5
18581 movu [r0 + 1429 * 16], m4
18582
18583 ; mode 24 [row 11]
18584 movu m6, [r5 + 4 * 16]
18585 pmaddubsw m4, m0, m6
18586 pmulhrsw m4, m7
18587 pmaddubsw m5, m2, m6
18588 pmulhrsw m5, m7
18589 packuswb m4, m5
18590 movu [r0 + 1430 * 16], m4
18591 pmaddubsw m4, m1, m6
18592 pmulhrsw m4, m7
18593 pmaddubsw m5, m3, m6
18594 pmulhrsw m5, m7
18595 packuswb m4, m5
18596 movu [r0 + 1431 * 16], m4
18597
18598 ; mode 24 [row 12]
18599 movu m6, [r5 + 31 * 16]
18600 pslldq m0, 2
18601 pinsrb m0, [r4 + 6], 1
18602 pinsrb m0, [r4 + 13], 0
18603 pmaddubsw m4, m0, m6
18604 pmulhrsw m4, m7
18605 pslldq m2, 2
18606 pinsrw m2, [r3 + 6], 0
18607 pmaddubsw m5, m2, m6
18608 pmulhrsw m5, m7
18609 packuswb m4, m5
18610 movu [r0 + 1432 * 16], m4
18611 pslldq m1, 2
18612 pinsrw m1, [r3 + 14], 0
18613 pmaddubsw m4, m1, m6
18614 pmulhrsw m4, m7
18615 pslldq m3, 2
18616 pinsrw m3, [r3 + 22], 0
18617 pmaddubsw m5, m3, m6
18618 pmulhrsw m5, m7
18619 packuswb m4, m5
18620 movu [r0 + 1433 * 16], m4
18621
18622 ; mode 24 [row 13]
18623 movu m6, [r5 + 26 * 16]
18624 pmaddubsw m4, m0, m6
18625 pmulhrsw m4, m7
18626 pmaddubsw m5, m2, m6
18627 pmulhrsw m5, m7
18628 packuswb m4, m5
18629 movu [r0 + 1434 * 16], m4
18630 pmaddubsw m4, m1, m6
18631 pmulhrsw m4, m7
18632 pmaddubsw m5, m3, m6
18633 pmulhrsw m5, m7
18634 packuswb m4, m5
18635 movu [r0 + 1435 * 16], m4
18636
18637 ; mode 24 [row 14]
18638 movu m6, [r5 + 21 * 16]
18639 pmaddubsw m4, m0, m6
18640 pmulhrsw m4, m7
18641 pmaddubsw m5, m2, m6
18642 pmulhrsw m5, m7
18643 packuswb m4, m5
18644 movu [r0 + 1436 * 16], m4
18645 pmaddubsw m4, m1, m6
18646 pmulhrsw m4, m7
18647 pmaddubsw m5, m3, m6
18648 pmulhrsw m5, m7
18649 packuswb m4, m5
18650 movu [r0 + 1437 * 16], m4
18651
18652 ; mode 24 [row 15]
18653 movu m6, [r5 + 16 * 16]
18654 pmaddubsw m4, m0, m6
18655 pmulhrsw m4, m7
18656 pmaddubsw m5, m2, m6
18657 pmulhrsw m5, m7
18658 packuswb m4, m5
18659 movu [r0 + 1438 * 16], m4
18660 pmaddubsw m4, m1, m6
18661 pmulhrsw m4, m7
18662 pmaddubsw m5, m3, m6
18663 pmulhrsw m5, m7
18664 packuswb m4, m5
18665 movu [r0 + 1439 * 16], m4
18666
18667 ; mode 24 [row 16]
18668 movu m6, [r5 + 11 * 16]
18669 pmaddubsw m4, m0, m6
18670 pmulhrsw m4, m7
18671 pmaddubsw m5, m2, m6
18672 pmulhrsw m5, m7
18673 packuswb m4, m5
18674 movu [r0 + 1440 * 16], m4
18675 pmaddubsw m4, m1, m6
18676 pmulhrsw m4, m7
18677 pmaddubsw m5, m3, m6
18678 pmulhrsw m5, m7
18679 packuswb m4, m5
18680 movu [r0 + 1441 * 16], m4
18681
18682 ; mode 24 [row 17]
18683 movu m6, [r5 + 6 * 16]
18684 pmaddubsw m4, m0, m6
18685 pmulhrsw m4, m7
18686 pmaddubsw m5, m2, m6
18687 pmulhrsw m5, m7
18688 packuswb m4, m5
18689 movu [r0 + 1442 * 16], m4
18690 pmaddubsw m4, m1, m6
18691 pmulhrsw m4, m7
18692 pmaddubsw m5, m3, m6
18693 pmulhrsw m5, m7
18694 packuswb m4, m5
18695 movu [r0 + 1443 * 16], m4
18696
18697 ; mode 24 [row 18]
18698 movu m6, [r5 + 1 * 16]
18699 pmaddubsw m4, m0, m6
18700 pmulhrsw m4, m7
18701 pmaddubsw m5, m2, m6
18702 pmulhrsw m5, m7
18703 packuswb m4, m5
18704 movu [r0 + 1444 * 16], m4
18705 pmaddubsw m4, m1, m6
18706 pmulhrsw m4, m7
18707 pmaddubsw m5, m3, m6
18708 pmulhrsw m5, m7
18709 packuswb m4, m5
18710 movu [r0 + 1445 * 16], m4
18711
18712 ; mode 24 [row 19]
18713 movu m6, [r5 + 28 * 16]
18714 pslldq m0, 2
18715 pinsrb m0, [r4 + 13], 1
18716 pinsrb m0, [r4 + 19], 0
18717 pmaddubsw m4, m0, m6
18718 pmulhrsw m4, m7
18719 pslldq m2, 2
18720 pinsrw m2, [r3 + 5], 0
18721 pmaddubsw m5, m2, m6
18722 pmulhrsw m5, m7
18723 packuswb m4, m5
18724 movu [r0 + 1446 * 16], m4
18725 pslldq m1, 2
18726 pinsrw m1, [r3 + 13], 0
18727 pmaddubsw m4, m1, m6
18728 pmulhrsw m4, m7
18729 pslldq m3, 2
18730 pinsrw m3, [r3 + 21], 0
18731 pmaddubsw m5, m3, m6
18732 pmulhrsw m5, m7
18733 packuswb m4, m5
18734 movu [r0 + 1447 * 16], m4
18735
18736 ; mode 24 [row 20]
18737 movu m6, [r5 + 23 * 16]
18738 pmaddubsw m4, m0, m6
18739 pmulhrsw m4, m7
18740 pmaddubsw m5, m2, m6
18741 pmulhrsw m5, m7
18742 packuswb m4, m5
18743 movu [r0 + 1448 * 16], m4
18744 pmaddubsw m4, m1, m6
18745 pmulhrsw m4, m7
18746 pmaddubsw m5, m3, m6
18747 pmulhrsw m5, m7
18748 packuswb m4, m5
18749 movu [r0 + 1449 * 16], m4
18750
18751 ; mode 24 [row 21]
18752 movu m6, [r5 + 18 * 16]
18753 pmaddubsw m4, m0, m6
18754 pmulhrsw m4, m7
18755 pmaddubsw m5, m2, m6
18756 pmulhrsw m5, m7
18757 packuswb m4, m5
18758 movu [r0 + 1450 * 16], m4
18759 pmaddubsw m4, m1, m6
18760 pmulhrsw m4, m7
18761 pmaddubsw m5, m3, m6
18762 pmulhrsw m5, m7
18763 packuswb m4, m5
18764 movu [r0 + 1451 * 16], m4
18765
18766 ; mode 24 [row 22]
18767 movu m6, [r5 + 13 * 16]
18768 pmaddubsw m4, m0, m6
18769 pmulhrsw m4, m7
18770 pmaddubsw m5, m2, m6
18771 pmulhrsw m5, m7
18772 packuswb m4, m5
18773 movu [r0 + 1452 * 16], m4
18774 pmaddubsw m4, m1, m6
18775 pmulhrsw m4, m7
18776 pmaddubsw m5, m3, m6
18777 pmulhrsw m5, m7
18778 packuswb m4, m5
18779 movu [r0 + 1453 * 16], m4
18780
18781 ; mode 24 [row 23]
18782 movu m6, [r5 + 8 * 16]
18783 pmaddubsw m4, m0, m6
18784 pmulhrsw m4, m7
18785 pmaddubsw m5, m2, m6
18786 pmulhrsw m5, m7
18787 packuswb m4, m5
18788 movu [r0 + 1454 * 16], m4
18789 pmaddubsw m4, m1, m6
18790 pmulhrsw m4, m7
18791 pmaddubsw m5, m3, m6
18792 pmulhrsw m5, m7
18793 packuswb m4, m5
18794 movu [r0 + 1455 * 16], m4
18795
18796 ; mode 24 [row 24]
18797 movu m6, [r5 + 3 * 16]
18798 pmaddubsw m4, m0, m6
18799 pmulhrsw m4, m7
18800 pmaddubsw m5, m2, m6
18801 pmulhrsw m5, m7
18802 packuswb m4, m5
18803 movu [r0 + 1456 * 16], m4
18804 pmaddubsw m4, m1, m6
18805 pmulhrsw m4, m7
18806 pmaddubsw m5, m3, m6
18807 pmulhrsw m5, m7
18808 packuswb m4, m5
18809 movu [r0 + 1457 * 16], m4
18810
18811 ; mode 24 [row 25]
18812 movu m6, [r5 + 30 * 16]
18813 pslldq m0, 2
18814 pinsrb m0, [r4 + 19], 1
18815 pinsrb m0, [r4 + 26], 0
18816 pmaddubsw m4, m0, m6
18817 pmulhrsw m4, m7
18818 pslldq m2, 2
18819 pinsrw m2, [r3 + 4], 0
18820 pmaddubsw m5, m2, m6
18821 pmulhrsw m5, m7
18822 packuswb m4, m5
18823 movu [r0 + 1458 * 16], m4
18824 pslldq m1, 2
18825 pinsrw m1, [r3 + 12], 0
18826 pmaddubsw m4, m1, m6
18827 pmulhrsw m4, m7
18828 pslldq m3, 2
18829 pinsrw m3, [r3 + 20], 0
18830 pmaddubsw m5, m3, m6
18831 pmulhrsw m5, m7
18832 packuswb m4, m5
18833 movu [r0 + 1459 * 16], m4
18834
18835 ; mode 24 [row 26]
18836 movu m6, [r5 + 25 * 16]
18837 pmaddubsw m4, m0, m6
18838 pmulhrsw m4, m7
18839 pmaddubsw m5, m2, m6
18840 pmulhrsw m5, m7
18841 packuswb m4, m5
18842 movu [r0 + 1460 * 16], m4
18843 pmaddubsw m4, m1, m6
18844 pmulhrsw m4, m7
18845 pmaddubsw m5, m3, m6
18846 pmulhrsw m5, m7
18847 packuswb m4, m5
18848 movu [r0 + 1461 * 16], m4
18849
18850 ; mode 24 [row 27]
18851 movu m6, [r5 + 20 * 16]
18852 pmaddubsw m4, m0, m6
18853 pmulhrsw m4, m7
18854 pmaddubsw m5, m2, m6
18855 pmulhrsw m5, m7
18856 packuswb m4, m5
18857 movu [r0 + 1462 * 16], m4
18858 pmaddubsw m4, m1, m6
18859 pmulhrsw m4, m7
18860 pmaddubsw m5, m3, m6
18861 pmulhrsw m5, m7
18862 packuswb m4, m5
18863 movu [r0 + 1463 * 16], m4
18864
18865 ; mode 24 [row 28]
18866 movu m6, [r5 + 15 * 16]
18867 pmaddubsw m4, m0, m6
18868 pmulhrsw m4, m7
18869 pmaddubsw m5, m2, m6
18870 pmulhrsw m5, m7
18871 packuswb m4, m5
18872 movu [r0 + 1464 * 16], m4
18873 pmaddubsw m4, m1, m6
18874 pmulhrsw m4, m7
18875 pmaddubsw m5, m3, m6
18876 pmulhrsw m5, m7
18877 packuswb m4, m5
18878 movu [r0 + 1465 * 16], m4
18879
18880 ; mode 24 [row 29]
18881 movu m6, [r5 + 10 * 16]
18882 pmaddubsw m4, m0, m6
18883 pmulhrsw m4, m7
18884 pmaddubsw m5, m2, m6
18885 pmulhrsw m5, m7
18886 packuswb m4, m5
18887 movu [r0 + 1466 * 16], m4
18888 pmaddubsw m4, m1, m6
18889 pmulhrsw m4, m7
18890 pmaddubsw m5, m3, m6
18891 pmulhrsw m5, m7
18892 packuswb m4, m5
18893 movu [r0 + 1467 * 16], m4
18894
18895 ; mode 24 [row 30]
18896 movu m6, [r5 + 5 * 16]
18897 pmaddubsw m4, m0, m6
18898 pmulhrsw m4, m7
18899 pmaddubsw m5, m2, m6
18900 pmulhrsw m5, m7
18901 packuswb m4, m5
18902 movu [r0 + 1468 * 16], m4
18903 pmaddubsw m4, m1, m6
18904 pmulhrsw m4, m7
18905 pmaddubsw m5, m3, m6
18906 pmulhrsw m5, m7
18907 packuswb m4, m5
18908 movu [r0 + 1469 * 16], m4
18909
18910 ; mode 24 [row 31]
18911 pshufb m5, m0, [tab_S2]
18912 movh [r0 + 1470 * 16], m5
18913 pshufb m5, m2, [tab_S2]
18914 movh [r0 + 1470 * 16 + 8], m5
18915 pshufb m5, m1, [tab_S2]
18916 movh [r0 + 1471 * 16], m5
18917 pshufb m5, m3, [tab_S2]
18918 movh [r0 + 1471 * 16 + 8], m5
18919
18920 ; mode 25 [row 0]
18921 movu m6, [r5 + 30 * 16]
18922 movu m0, [r3 ]
18923 movu m1, [r3 + 1 ]
18924 punpcklbw m0, m1
18925 pmaddubsw m4, m0, m6
18926 pmulhrsw m4, m7
18927 movu m2, [r3 + 8]
18928 movu m3, [r3 + 9]
18929 punpcklbw m2, m3
18930 pmaddubsw m5, m2, m6
18931 pmulhrsw m5, m7
18932 packuswb m4, m5
18933 movu [r0 + 1472 * 16], m4
18934
18935 movu m1, [r3 + 16]
18936 movu m3, [r3 + 17]
18937 punpcklbw m1, m3
18938 pmaddubsw m4, m1, m6
18939 pmulhrsw m4, m7
18940 movu m3, [r3 + 24]
18941 movu m5, [r3 + 25]
18942 punpcklbw m3, m5
18943 pmaddubsw m5, m3, m6
18944 pmulhrsw m5, m7
18945 packuswb m4, m5
18946 movu [r0 + 1473 * 16], m4
18947
18948 ; mode 25 [row 1]
18949 movu m6, [r5 + 28 * 16]
18950 pmaddubsw m4, m0, m6
18951 pmulhrsw m4, m7
18952 pmaddubsw m5, m2, m6
18953 pmulhrsw m5, m7
18954 packuswb m4, m5
18955 movu [r0 + 1474 * 16], m4
18956 pmaddubsw m4, m1, m6
18957 pmulhrsw m4, m7
18958 pmaddubsw m5, m3, m6
18959 pmulhrsw m5, m7
18960 packuswb m4, m5
18961 movu [r0 + 1475 * 16], m4
18962
18963 ; mode 25 [row 2]
18964 movu m6, [r5 + 26 * 16]
18965 pmaddubsw m4, m0, m6
18966 pmulhrsw m4, m7
18967 pmaddubsw m5, m2, m6
18968 pmulhrsw m5, m7
18969 packuswb m4, m5
18970 movu [r0 + 1476 * 16], m4
18971 pmaddubsw m4, m1, m6
18972 pmulhrsw m4, m7
18973 pmaddubsw m5, m3, m6
18974 pmulhrsw m5, m7
18975 packuswb m4, m5
18976 movu [r0 + 1477 * 16], m4
18977
18978 ; mode 25 [row 3]
18979 movu m6, [r5 + 24 * 16]
18980 pmaddubsw m4, m0, m6
18981 pmulhrsw m4, m7
18982 pmaddubsw m5, m2, m6
18983 pmulhrsw m5, m7
18984 packuswb m4, m5
18985 movu [r0 + 1478 * 16], m4
18986 pmaddubsw m4, m1, m6
18987 pmulhrsw m4, m7
18988 pmaddubsw m5, m3, m6
18989 pmulhrsw m5, m7
18990 packuswb m4, m5
18991 movu [r0 + 1479 * 16], m4
18992
18993 ; mode 25 [row 4]
18994 movu m6, [r5 + 22 * 16]
18995 pmaddubsw m4, m0, m6
18996 pmulhrsw m4, m7
18997 pmaddubsw m5, m2, m6
18998 pmulhrsw m5, m7
18999 packuswb m4, m5
19000 movu [r0 + 1480 * 16], m4
19001 pmaddubsw m4, m1, m6
19002 pmulhrsw m4, m7
19003 pmaddubsw m5, m3, m6
19004 pmulhrsw m5, m7
19005 packuswb m4, m5
19006 movu [r0 + 1481 * 16], m4
19007
19008 ; mode 25 [row 5]
19009 movu m6, [r5 + 20 * 16]
19010 pmaddubsw m4, m0, m6
19011 pmulhrsw m4, m7
19012 pmaddubsw m5, m2, m6
19013 pmulhrsw m5, m7
19014 packuswb m4, m5
19015 movu [r0 + 1482 * 16], m4
19016 pmaddubsw m4, m1, m6
19017 pmulhrsw m4, m7
19018 pmaddubsw m5, m3, m6
19019 pmulhrsw m5, m7
19020 packuswb m4, m5
19021 movu [r0 + 1483 * 16], m4
19022
19023 ; mode 25 [row 6]
19024 movu m6, [r5 + 18 * 16]
19025 pmaddubsw m4, m0, m6
19026 pmulhrsw m4, m7
19027 pmaddubsw m5, m2, m6
19028 pmulhrsw m5, m7
19029 packuswb m4, m5
19030 movu [r0 + 1484 * 16], m4
19031 pmaddubsw m4, m1, m6
19032 pmulhrsw m4, m7
19033 pmaddubsw m5, m3, m6
19034 pmulhrsw m5, m7
19035 packuswb m4, m5
19036 movu [r0 + 1485 * 16], m4
19037
19038 ; mode 25 [row 7]
19039 movu m6, [r5 + 16 * 16]
19040 pmaddubsw m4, m0, m6
19041 pmulhrsw m4, m7
19042 pmaddubsw m5, m2, m6
19043 pmulhrsw m5, m7
19044 packuswb m4, m5
19045 movu [r0 + 1486 * 16], m4
19046 pmaddubsw m4, m1, m6
19047 pmulhrsw m4, m7
19048 pmaddubsw m5, m3, m6
19049 pmulhrsw m5, m7
19050 packuswb m4, m5
19051 movu [r0 + 1487 * 16], m4
19052
19053 ; mode 25 [row 8]
19054 movu m6, [r5 + 14 * 16]
19055 pmaddubsw m4, m0, m6
19056 pmulhrsw m4, m7
19057 pmaddubsw m5, m2, m6
19058 pmulhrsw m5, m7
19059 packuswb m4, m5
19060 movu [r0 + 1488 * 16], m4
19061 pmaddubsw m4, m1, m6
19062 pmulhrsw m4, m7
19063 pmaddubsw m5, m3, m6
19064 pmulhrsw m5, m7
19065 packuswb m4, m5
19066 movu [r0 + 1489 * 16], m4
19067
19068 ; mode 25 [row 9]
19069 movu m6, [r5 + 12 * 16]
19070 pmaddubsw m4, m0, m6
19071 pmulhrsw m4, m7
19072 pmaddubsw m5, m2, m6
19073 pmulhrsw m5, m7
19074 packuswb m4, m5
19075 movu [r0 + 1490 * 16], m4
19076 pmaddubsw m4, m1, m6
19077 pmulhrsw m4, m7
19078 pmaddubsw m5, m3, m6
19079 pmulhrsw m5, m7
19080 packuswb m4, m5
19081 movu [r0 + 1491 * 16], m4
19082
19083 ; mode 25 [row 10]
19084 movu m6, [r5 + 10 * 16]
19085 pmaddubsw m4, m0, m6
19086 pmulhrsw m4, m7
19087 pmaddubsw m5, m2, m6
19088 pmulhrsw m5, m7
19089 packuswb m4, m5
19090 movu [r0 + 1492 * 16], m4
19091 pmaddubsw m4, m1, m6
19092 pmulhrsw m4, m7
19093 pmaddubsw m5, m3, m6
19094 pmulhrsw m5, m7
19095 packuswb m4, m5
19096 movu [r0 + 1493 * 16], m4
19097
19098 ; mode 25 [row 11]
19099 movu m6, [r5 + 8 * 16]
19100 pmaddubsw m4, m0, m6
19101 pmulhrsw m4, m7
19102 pmaddubsw m5, m2, m6
19103 pmulhrsw m5, m7
19104 packuswb m4, m5
19105 movu [r0 + 1494 * 16], m4
19106 pmaddubsw m4, m1, m6
19107 pmulhrsw m4, m7
19108 pmaddubsw m5, m3, m6
19109 pmulhrsw m5, m7
19110 packuswb m4, m5
19111 movu [r0 + 1495 * 16], m4
19112
19113 ; mode 25 [row 12]
19114 movu m6, [r5 + 6 * 16]
19115 pmaddubsw m4, m0, m6
19116 pmulhrsw m4, m7
19117 pmaddubsw m5, m2, m6
19118 pmulhrsw m5, m7
19119 packuswb m4, m5
19120 movu [r0 + 1496 * 16], m4
19121 pmaddubsw m4, m1, m6
19122 pmulhrsw m4, m7
19123 pmaddubsw m5, m3, m6
19124 pmulhrsw m5, m7
19125 packuswb m4, m5
19126 movu [r0 + 1497 * 16], m4
19127
19128 ; mode 25 [row 13]
19129 movu m6, [r5 + 4 * 16]
19130 pmaddubsw m4, m0, m6
19131 pmulhrsw m4, m7
19132 pmaddubsw m5, m2, m6
19133 pmulhrsw m5, m7
19134 packuswb m4, m5
19135 movu [r0 + 1498 * 16], m4
19136 pmaddubsw m4, m1, m6
19137 pmulhrsw m4, m7
19138 pmaddubsw m5, m3, m6
19139 pmulhrsw m5, m7
19140 packuswb m4, m5
19141 movu [r0 + 1499 * 16], m4
19142
19143 ; mode 25 [row 14]
19144 movu m6, [r5 + 2 * 16]
19145 pmaddubsw m4, m0, m6
19146 pmulhrsw m4, m7
19147 pmaddubsw m5, m2, m6
19148 pmulhrsw m5, m7
19149 packuswb m4, m5
19150 movu [r0 + 1500 * 16], m4
19151 pmaddubsw m4, m1, m6
19152 pmulhrsw m4, m7
19153 pmaddubsw m5, m3, m6
19154 pmulhrsw m5, m7
19155 packuswb m4, m5
19156 movu [r0 + 1501 * 16], m4
19157
19158 ; mode 25 [row 15]
19159 pshufb m5, m0, [tab_S2]
19160 movh [r0 + 1502 * 16], m5
19161 pshufb m5, m2, [tab_S2]
19162 movh [r0 + 1502 * 16 + 8], m5
19163 pshufb m5, m1, [tab_S2]
19164 movh [r0 + 1503 * 16], m5
19165 pshufb m5, m3, [tab_S2]
19166 movh [r0 + 1503 * 16 + 8], m5
19167
19168 ; mode 25 [row 16]
19169 movu m6, [r5 + 30 * 16]
19170 pslldq m0, 2
19171 pinsrb m0, [r4 + 0], 1
19172 pinsrb m0, [r4 + 16], 0
19173 pmaddubsw m4, m0, m6
19174 pmulhrsw m4, m7
19175 pslldq m2, 2
19176 pinsrw m2, [r3 + 7], 0
19177 pmaddubsw m5, m2, m6
19178 pmulhrsw m5, m7
19179 packuswb m4, m5
19180 movu [r0 + 1504 * 16], m4
19181 pslldq m1, 2
19182 pinsrw m1, [r3 + 15], 0
19183 pmaddubsw m4, m1, m6
19184 pmulhrsw m4, m7
19185 pslldq m3, 2
19186 pinsrw m3, [r3 + 23], 0
19187 pmaddubsw m5, m3, m6
19188 pmulhrsw m5, m7
19189 packuswb m4, m5
19190 movu [r0 + 1505 * 16], m4
19191
19192 ; mode 25 [row 17]
19193 movu m6, [r5 + 28 * 16]
19194 pmaddubsw m4, m0, m6
19195 pmulhrsw m4, m7
19196 pmaddubsw m5, m2, m6
19197 pmulhrsw m5, m7
19198 packuswb m4, m5
19199 movu [r0 + 1506 * 16], m4
19200 pmaddubsw m4, m1, m6
19201 pmulhrsw m4, m7
19202 pmaddubsw m5, m3, m6
19203 pmulhrsw m5, m7
19204 packuswb m4, m5
19205 movu [r0 + 1507 * 16], m4
19206
19207 ; mode 25 [row 18]
19208 movu m6, [r5 + 26 * 16]
19209 pmaddubsw m4, m0, m6
19210 pmulhrsw m4, m7
19211 pmaddubsw m5, m2, m6
19212 pmulhrsw m5, m7
19213 packuswb m4, m5
19214 movu [r0 + 1508 * 16], m4
19215 pmaddubsw m4, m1, m6
19216 pmulhrsw m4, m7
19217 pmaddubsw m5, m3, m6
19218 pmulhrsw m5, m7
19219 packuswb m4, m5
19220 movu [r0 + 1509 * 16], m4
19221
19222 ; mode 25 [row 19]
19223 movu m6, [r5 + 24 * 16]
19224 pmaddubsw m4, m0, m6
19225 pmulhrsw m4, m7
19226 pmaddubsw m5, m2, m6
19227 pmulhrsw m5, m7
19228 packuswb m4, m5
19229 movu [r0 + 1510 * 16], m4
19230 pmaddubsw m4, m1, m6
19231 pmulhrsw m4, m7
19232 pmaddubsw m5, m3, m6
19233 pmulhrsw m5, m7
19234 packuswb m4, m5
19235 movu [r0 + 1511 * 16], m4
19236
19237 ; mode 25 [row 20]
19238 movu m6, [r5 + 22 * 16]
19239 pmaddubsw m4, m0, m6
19240 pmulhrsw m4, m7
19241 pmaddubsw m5, m2, m6
19242 pmulhrsw m5, m7
19243 packuswb m4, m5
19244 movu [r0 + 1512 * 16], m4
19245 pmaddubsw m4, m1, m6
19246 pmulhrsw m4, m7
19247 pmaddubsw m5, m3, m6
19248 pmulhrsw m5, m7
19249 packuswb m4, m5
19250 movu [r0 + 1513 * 16], m4
19251
19252 ; mode 25 [row 21]
19253 movu m6, [r5 + 20 * 16]
19254 pmaddubsw m4, m0, m6
19255 pmulhrsw m4, m7
19256 pmaddubsw m5, m2, m6
19257 pmulhrsw m5, m7
19258 packuswb m4, m5
19259 movu [r0 + 1514 * 16], m4
19260 pmaddubsw m4, m1, m6
19261 pmulhrsw m4, m7
19262 pmaddubsw m5, m3, m6
19263 pmulhrsw m5, m7
19264 packuswb m4, m5
19265 movu [r0 + 1515 * 16], m4
19266
19267 ; mode 25 [row 22]
19268 movu m6, [r5 + 18 * 16]
19269 pmaddubsw m4, m0, m6
19270 pmulhrsw m4, m7
19271 pmaddubsw m5, m2, m6
19272 pmulhrsw m5, m7
19273 packuswb m4, m5
19274 movu [r0 + 1516 * 16], m4
19275 pmaddubsw m4, m1, m6
19276 pmulhrsw m4, m7
19277 pmaddubsw m5, m3, m6
19278 pmulhrsw m5, m7
19279 packuswb m4, m5
19280 movu [r0 + 1517 * 16], m4
19281
19282 ; mode 25 [row 23]
19283 movu m6, [r5 + 16 * 16]
19284 pmaddubsw m4, m0, m6
19285 pmulhrsw m4, m7
19286 pmaddubsw m5, m2, m6
19287 pmulhrsw m5, m7
19288 packuswb m4, m5
19289 movu [r0 + 1518 * 16], m4
19290 pmaddubsw m4, m1, m6
19291 pmulhrsw m4, m7
19292 pmaddubsw m5, m3, m6
19293 pmulhrsw m5, m7
19294 packuswb m4, m5
19295 movu [r0 + 1519 * 16], m4
19296
19297 ; mode 25 [row 24]
19298 movu m6, [r5 + 14 * 16]
19299 pmaddubsw m4, m0, m6
19300 pmulhrsw m4, m7
19301 pmaddubsw m5, m2, m6
19302 pmulhrsw m5, m7
19303 packuswb m4, m5
19304 movu [r0 + 1520 * 16], m4
19305 pmaddubsw m4, m1, m6
19306 pmulhrsw m4, m7
19307 pmaddubsw m5, m3, m6
19308 pmulhrsw m5, m7
19309 packuswb m4, m5
19310 movu [r0 + 1521 * 16], m4
19311
19312 ; mode 25 [row 25]
19313 movu m6, [r5 + 12 * 16]
19314 pmaddubsw m4, m0, m6
19315 pmulhrsw m4, m7
19316 pmaddubsw m5, m2, m6
19317 pmulhrsw m5, m7
19318 packuswb m4, m5
19319 movu [r0 + 1522 * 16], m4
19320 pmaddubsw m4, m1, m6
19321 pmulhrsw m4, m7
19322 pmaddubsw m5, m3, m6
19323 pmulhrsw m5, m7
19324 packuswb m4, m5
19325 movu [r0 + 1523 * 16], m4
19326
19327 ; mode 25 [row 26]
19328 movu m6, [r5 + 10 * 16]
19329 pmaddubsw m4, m0, m6
19330 pmulhrsw m4, m7
19331 pmaddubsw m5, m2, m6
19332 pmulhrsw m5, m7
19333 packuswb m4, m5
19334 movu [r0 + 1524 * 16], m4
19335 pmaddubsw m4, m1, m6
19336 pmulhrsw m4, m7
19337 pmaddubsw m5, m3, m6
19338 pmulhrsw m5, m7
19339 packuswb m4, m5
19340 movu [r0 + 1525 * 16], m4
19341
19342 ; mode 25 [row 27]
19343 movu m6, [r5 + 8 * 16]
19344 pmaddubsw m4, m0, m6
19345 pmulhrsw m4, m7
19346 pmaddubsw m5, m2, m6
19347 pmulhrsw m5, m7
19348 packuswb m4, m5
19349 movu [r0 + 1526 * 16], m4
19350 pmaddubsw m4, m1, m6
19351 pmulhrsw m4, m7
19352 pmaddubsw m5, m3, m6
19353 pmulhrsw m5, m7
19354 packuswb m4, m5
19355 movu [r0 + 1527 * 16], m4
19356
19357 ; mode 25 [row 28]
19358 movu m6, [r5 + 6 * 16]
19359 pmaddubsw m4, m0, m6
19360 pmulhrsw m4, m7
19361 pmaddubsw m5, m2, m6
19362 pmulhrsw m5, m7
19363 packuswb m4, m5
19364 movu [r0 + 1528 * 16], m4
19365 pmaddubsw m4, m1, m6
19366 pmulhrsw m4, m7
19367 pmaddubsw m5, m3, m6
19368 pmulhrsw m5, m7
19369 packuswb m4, m5
19370 movu [r0 + 1529 * 16], m4
19371
19372 ; mode 25 [row 29]
19373 movu m6, [r5 + 4 * 16]
19374 pmaddubsw m4, m0, m6
19375 pmulhrsw m4, m7
19376 pmaddubsw m5, m2, m6
19377 pmulhrsw m5, m7
19378 packuswb m4, m5
19379 movu [r0 + 1530 * 16], m4
19380 pmaddubsw m4, m1, m6
19381 pmulhrsw m4, m7
19382 pmaddubsw m5, m3, m6
19383 pmulhrsw m5, m7
19384 packuswb m4, m5
19385 movu [r0 + 1531 * 16], m4
19386
19387 ; mode 25 [row 30]
19388 movu m6, [r5 + 2 * 16]
19389 pmaddubsw m4, m0, m6
19390 pmulhrsw m4, m7
19391 pmaddubsw m5, m2, m6
19392 pmulhrsw m5, m7
19393 packuswb m4, m5
19394 movu [r0 + 1532 * 16], m4
19395 pmaddubsw m4, m1, m6
19396 pmulhrsw m4, m7
19397 pmaddubsw m5, m3, m6
19398 pmulhrsw m5, m7
19399 packuswb m4, m5
19400 movu [r0 + 1533 * 16], m4
19401
19402 ; mode 25 [row 31]
19403 pshufb m5, m0, [tab_S2]
19404 movh [r0 + 1534 * 16], m5
19405 pshufb m5, m2, [tab_S2]
19406 movh [r0 + 1534 * 16 + 8], m5
19407 pshufb m5, m1, [tab_S2]
19408 movh [r0 + 1535 * 16], m5
19409 pshufb m5, m3, [tab_S2]
19410 movh [r0 + 1535 * 16 + 8], m5
19411
19412 ; mode 26 (m1 = [r1 + 1] and m2 = [r1 + 17] are stored unmodified, alternating, to output slots 1536..1599)
19413 movu m1, [r1 + 1]
19414 movu m2, [r1 + 17]
19415 movu [r0 + 1536 * 16], m1
19416 movu [r0 + 1537 * 16], m2
19417 movu [r0 + 1538 * 16], m1
19418 movu [r0 + 1539 * 16], m2
19419 movu [r0 + 1540 * 16], m1
19420 movu [r0 + 1541 * 16], m2
19421 movu [r0 + 1542 * 16], m1
19422 movu [r0 + 1543 * 16], m2
19423 movu [r0 + 1544 * 16], m1
19424 movu [r0 + 1545 * 16], m2
19425 movu [r0 + 1546 * 16], m1
19426 movu [r0 + 1547 * 16], m2
19427 movu [r0 + 1548 * 16], m1
19428 movu [r0 + 1549 * 16], m2
19429 movu [r0 + 1550 * 16], m1
19430 movu [r0 + 1551 * 16], m2
19431
19432 movu [r0 + 1552 * 16], m1
19433 movu [r0 + 1553 * 16], m2
19434 movu [r0 + 1554 * 16], m1
19435 movu [r0 + 1555 * 16], m2
19436 movu [r0 + 1556 * 16], m1
19437 movu [r0 + 1557 * 16], m2
19438 movu [r0 + 1558 * 16], m1
19439 movu [r0 + 1559 * 16], m2
19440 movu [r0 + 1560 * 16], m1
19441 movu [r0 + 1561 * 16], m2
19442 movu [r0 + 1562 * 16], m1
19443 movu [r0 + 1563 * 16], m2
19444 movu [r0 + 1564 * 16], m1
19445 movu [r0 + 1565 * 16], m2
19446 movu [r0 + 1566 * 16], m1
19447 movu [r0 + 1567 * 16], m2
19448
19449 movu [r0 + 1568 * 16], m1
19450 movu [r0 + 1569 * 16], m2
19451 movu [r0 + 1570 * 16], m1
19452 movu [r0 + 1571 * 16], m2
19453 movu [r0 + 1572 * 16], m1
19454 movu [r0 + 1573 * 16], m2
19455 movu [r0 + 1574 * 16], m1
19456 movu [r0 + 1575 * 16], m2
19457 movu [r0 + 1576 * 16], m1
19458 movu [r0 + 1577 * 16], m2
19459 movu [r0 + 1578 * 16], m1
19460 movu [r0 + 1579 * 16], m2
19461 movu [r0 + 1580 * 16], m1
19462 movu [r0 + 1581 * 16], m2
19463 movu [r0 + 1582 * 16], m1
19464 movu [r0 + 1583 * 16], m2
19465
19466 movu [r0 + 1584 * 16], m1
19467 movu [r0 + 1585 * 16], m2
19468 movu [r0 + 1586 * 16], m1
19469 movu [r0 + 1587 * 16], m2
19470 movu [r0 + 1588 * 16], m1
19471 movu [r0 + 1589 * 16], m2
19472 movu [r0 + 1590 * 16], m1
19473 movu [r0 + 1591 * 16], m2
19474 movu [r0 + 1592 * 16], m1
19475 movu [r0 + 1593 * 16], m2
19476 movu [r0 + 1594 * 16], m1
19477 movu [r0 + 1595 * 16], m2
19478 movu [r0 + 1596 * 16], m1
19479 movu [r0 + 1597 * 16], m2
19480 movu [r0 + 1598 * 16], m1
19481 movu [r0 + 1599 * 16], m2
19482
19483 ; mode 27 [row 0]
19484 movu m6, [r5 + 2 * 16]
19485 movu m0, [r3 + 1 ]
19486 movu m1, [r3 + 2 ]
19487 punpcklbw m0, m1
19488 pmaddubsw m4, m0, m6
19489 pmulhrsw m4, m7
19490 movu m2, [r3 + 9]
19491 movu m3, [r3 + 10]
19492 punpcklbw m2, m3
19493 pmaddubsw m5, m2, m6
19494 pmulhrsw m5, m7
19495 packuswb m4, m5
19496 movu [r0 + 1600 * 16], m4
19497
19498 movu m1, [r3 + 17]
19499 movu m3, [r3 + 18]
19500 punpcklbw m1, m3
19501 pmaddubsw m4, m1, m6
19502 pmulhrsw m4, m7
19503 movu m3, [r3 + 25]
19504 movu m5, [r3 + 26]
19505 punpcklbw m3, m5
19506 pmaddubsw m5, m3, m6
19507 pmulhrsw m5, m7
19508 packuswb m4, m5
19509 movu [r0 + 1601 * 16], m4
19510
19511 ; mode 27 [row 1]
19512 movu m6, [r5 + 4 * 16]
19513 pmaddubsw m4, m0, m6
19514 pmulhrsw m4, m7
19515 pmaddubsw m5, m2, m6
19516 pmulhrsw m5, m7
19517 packuswb m4, m5
19518 movu [r0 + 1602 * 16], m4
19519 pmaddubsw m4, m1, m6
19520 pmulhrsw m4, m7
19521 pmaddubsw m5, m3, m6
19522 pmulhrsw m5, m7
19523 packuswb m4, m5
19524 movu [r0 + 1603 * 16], m4
19525
19526 ; mode 27 [row 2]
19527 movu m6, [r5 + 6 * 16]
19528 pmaddubsw m4, m0, m6
19529 pmulhrsw m4, m7
19530 pmaddubsw m5, m2, m6
19531 pmulhrsw m5, m7
19532 packuswb m4, m5
19533 movu [r0 + 1604 * 16], m4
19534 pmaddubsw m4, m1, m6
19535 pmulhrsw m4, m7
19536 pmaddubsw m5, m3, m6
19537 pmulhrsw m5, m7
19538 packuswb m4, m5
19539 movu [r0 + 1605 * 16], m4
19540
19541 ; mode 27 [row 3]
19542 movu m6, [r5 + 8 * 16]
19543 pmaddubsw m4, m0, m6
19544 pmulhrsw m4, m7
19545 pmaddubsw m5, m2, m6
19546 pmulhrsw m5, m7
19547 packuswb m4, m5
19548 movu [r0 + 1606 * 16], m4
19549 pmaddubsw m4, m1, m6
19550 pmulhrsw m4, m7
19551 pmaddubsw m5, m3, m6
19552 pmulhrsw m5, m7
19553 packuswb m4, m5
19554 movu [r0 + 1607 * 16], m4
19555
19556 ; mode 27 [row 4]
19557 movu m6, [r5 + 10 * 16]
19558 pmaddubsw m4, m0, m6
19559 pmulhrsw m4, m7
19560 pmaddubsw m5, m2, m6
19561 pmulhrsw m5, m7
19562 packuswb m4, m5
19563 movu [r0 + 1608 * 16], m4
19564
19565 ; mode 28 [row 1 - first half]
19566 movu [r0 + 1666 * 16], m4
19567
19568 pmaddubsw m4, m1, m6
19569 pmulhrsw m4, m7
19570 pmaddubsw m5, m3, m6
19571 pmulhrsw m5, m7
19572 packuswb m4, m5
19573 movu [r0 + 1609 * 16], m4
19574
19575 ; mode 28 [row 1 - second half]
19576 movu [r0 + 1667 * 16], m4
19577
19578 ; mode 27 [row 5]
19579 movu m6, [r5 + 12 * 16]
19580 pmaddubsw m4, m0, m6
19581 pmulhrsw m4, m7
19582 pmaddubsw m5, m2, m6
19583 pmulhrsw m5, m7
19584 packuswb m4, m5
19585 movu [r0 + 1610 * 16], m4
19586
19587 pmaddubsw m4, m1, m6
19588 pmulhrsw m4, m7
19589 pmaddubsw m5, m3, m6
19590 pmulhrsw m5, m7
19591 packuswb m4, m5
19592 movu [r0 + 1611 * 16], m4
19593
19594 ; mode 27 [row 6]
19595 movu m6, [r5 + 14 * 16]
19596 pmaddubsw m4, m0, m6
19597 pmulhrsw m4, m7
19598 pmaddubsw m5, m2, m6
19599 pmulhrsw m5, m7
19600 packuswb m4, m5
19601 movu [r0 + 1612 * 16], m4
19602 pmaddubsw m4, m1, m6
19603 pmulhrsw m4, m7
19604 pmaddubsw m5, m3, m6
19605 pmulhrsw m5, m7
19606 packuswb m4, m5
19607 movu [r0 + 1613 * 16], m4
19608
19609 ; mode 27 [row 7]
19610 movu m6, [r5 + 16 * 16]
19611 pmaddubsw m4, m0, m6
19612 pmulhrsw m4, m7
19613 pmaddubsw m5, m2, m6
19614 pmulhrsw m5, m7
19615 packuswb m4, m5
19616 movu [r0 + 1614 * 16], m4
19617 pmaddubsw m4, m1, m6
19618 pmulhrsw m4, m7
19619 pmaddubsw m5, m3, m6
19620 pmulhrsw m5, m7
19621 packuswb m4, m5
19622 movu [r0 + 1615 * 16], m4
19623
19624 ; mode 27 [row 8]
19625 movu m6, [r5 + 18 * 16]
19626 pmaddubsw m4, m0, m6
19627 pmulhrsw m4, m7
19628 pmaddubsw m5, m2, m6
19629 pmulhrsw m5, m7
19630 packuswb m4, m5
19631 movu [r0 + 1616 * 16], m4
19632
19633 ; mode 29 [row 1 - first half]
19634 movu [r0 + 1730 * 16], m4
19635
19636 pmaddubsw m4, m1, m6
19637 pmulhrsw m4, m7
19638 pmaddubsw m5, m3, m6
19639 pmulhrsw m5, m7
19640 packuswb m4, m5
19641 movu [r0 + 1617 * 16], m4
19642
19643 ; mode 29 [row 1 - second half]
19644 movu [r0 + 1731 * 16], m4
19645
19646 ; mode 27 [row 9]
19647 movu m6, [r5 + 20 * 16]
19648 pmaddubsw m4, m0, m6
19649 pmulhrsw m4, m7
19650 pmaddubsw m5, m2, m6
19651 pmulhrsw m5, m7
19652 packuswb m4, m5
19653 movu [r0 + 1618 * 16], m4
19654
19655 ; mode 28 [row 3 - first half]
19656 movu [r0 + 1670 * 16], m4
19657
19658 pmaddubsw m4, m1, m6
19659 pmulhrsw m4, m7
19660 pmaddubsw m5, m3, m6
19661 pmulhrsw m5, m7
19662 packuswb m4, m5
19663 movu [r0 + 1619 * 16], m4
19664
19665 ; mode 28 [row 3 - second half]
19666 movu [r0 + 1671 * 16], m4
19667
19668 ; mode 27 [row 10]
19669 movu m6, [r5 + 22 * 16]
19670 pmaddubsw m4, m0, m6
19671 pmulhrsw m4, m7
19672 pmaddubsw m5, m2, m6
19673 pmulhrsw m5, m7
19674 packuswb m4, m5
19675 movu [r0 + 1620 * 16], m4
19676 pmaddubsw m4, m1, m6
19677 pmulhrsw m4, m7
19678 pmaddubsw m5, m3, m6
19679 pmulhrsw m5, m7
19680 packuswb m4, m5
19681 movu [r0 + 1621 * 16], m4
19682
19683 ; mode 27 [row 11]
19684 movu m6, [r5 + 24 * 16]
19685 pmaddubsw m4, m0, m6
19686 pmulhrsw m4, m7
19687 pmaddubsw m5, m2, m6
19688 pmulhrsw m5, m7
19689 packuswb m4, m5
19690 movu [r0 + 1622 * 16], m4
19691 pmaddubsw m4, m1, m6
19692 pmulhrsw m4, m7
19693 pmaddubsw m5, m3, m6
19694 pmulhrsw m5, m7
19695 packuswb m4, m5
19696 movu [r0 + 1623 * 16], m4
19697
19698 ; mode 27 [row 12]
19699 movu m6, [r5 + 26 * 16]
19700 pmaddubsw m4, m0, m6
19701 pmulhrsw m4, m7
19702 pmaddubsw m5, m2, m6
19703 pmulhrsw m5, m7
19704 packuswb m4, m5
19705 movu [r0 + 1624 * 16], m4
19706
19707 ; mode 30 [row 1 - first half]
19708 movu [r0 + 1794 * 16], m4
19709
19710 ; mode 33 [row 0 - first half]
19711 movu [r0 + 1984 * 16], m4
19712
19713 pmaddubsw m4, m1, m6
19714 pmulhrsw m4, m7
19715 pmaddubsw m5, m3, m6
19716 pmulhrsw m5, m7
19717 packuswb m4, m5
19718 movu [r0 + 1625 * 16], m4
19719
19720 ; mode 30 [row 1 - second half]
19721 movu [r0 + 1795 * 16], m4
19722
19723 ; mode 33 [row 0 - second half]
19724 movu [r0 + 1985 * 16], m4
19725
19726 ; mode 27 [row 13]
19727 movu m6, [r5 + 28 * 16]
19728 pmaddubsw m4, m0, m6
19729 pmulhrsw m4, m7
19730 pmaddubsw m5, m2, m6
19731 pmulhrsw m5, m7
19732 packuswb m4, m5
19733 movu [r0 + 1626 * 16], m4
19734 pmaddubsw m4, m1, m6
19735 pmulhrsw m4, m7
19736 pmaddubsw m5, m3, m6
19737 pmulhrsw m5, m7
19738 packuswb m4, m5
19739 movu [r0 + 1627 * 16], m4
19740
19741 ; mode 27 [row 14]
19742 movu m6, [r5 + 30 * 16]
19743 pmaddubsw m4, m0, m6
19744 pmulhrsw m4, m7
19745 pmaddubsw m5, m2, m6
19746 pmulhrsw m5, m7
19747 packuswb m4, m5
19748 movu [r0 + 1628 * 16], m4
19749
19750 ; mode 28 [row 5 - first half]
19751 movu [r0 + 1674 * 16], m4
19752
19753 pmaddubsw m4, m1, m6
19754 pmulhrsw m4, m7
19755 pmaddubsw m5, m3, m6
19756 pmulhrsw m5, m7
19757 packuswb m4, m5
19758 movu [r0 + 1629 * 16], m4
19759
19760 ; mode 28 [row 5 - second half]
19761 movu [r0 + 1675 * 16], m4
19762
19763 ; mode 28 [row 0]
19764 movu m6, [r5 + 5 * 16]
19765 pmaddubsw m4, m0, m6
19766 pmulhrsw m4, m7
19767 pmaddubsw m5, m2, m6
19768 pmulhrsw m5, m7
19769 packuswb m4, m5
19770 movu [r0 + 1664 * 16], m4
19771 pmaddubsw m4, m1, m6
19772 pmulhrsw m4, m7
19773 pmaddubsw m5, m3, m6
19774 pmulhrsw m5, m7
19775 packuswb m4, m5
19776 movu [r0 + 1665 * 16], m4
19777
19778 ; mode 28 [row 2]
19779 movu m6, [r5 + 15 * 16]
19780 pmaddubsw m4, m0, m6
19781 pmulhrsw m4, m7
19782 pmaddubsw m5, m2, m6
19783 pmulhrsw m5, m7
19784 packuswb m4, m5
19785 movu [r0 + 1668 * 16], m4
19786 pmaddubsw m4, m1, m6
19787 pmulhrsw m4, m7
19788 pmaddubsw m5, m3, m6
19789 pmulhrsw m5, m7
19790 packuswb m4, m5
19791 movu [r0 + 1669 * 16], m4
19792
19793 ; mode 28 [row 4]
19794 movu m6, [r5 + 25 * 16]
19795 pmaddubsw m4, m0, m6
19796 pmulhrsw m4, m7
19797 pmaddubsw m5, m2, m6
19798 pmulhrsw m5, m7
19799 packuswb m4, m5
19800 movu [r0 + 1672 * 16], m4
19801 pmaddubsw m4, m1, m6
19802 pmulhrsw m4, m7
19803 pmaddubsw m5, m3, m6
19804 pmulhrsw m5, m7
19805 packuswb m4, m5
19806 movu [r0 + 1673 * 16], m4
19807
19808 ; mode 30 [row 0]
19809 movu m6, [r5 + 13 * 16]
19810 pmaddubsw m4, m0, m6
19811 pmulhrsw m4, m7
19812 pmaddubsw m5, m2, m6
19813 pmulhrsw m5, m7
19814 packuswb m4, m5
19815 movu [r0 + 1792 * 16], m4
19816 pmaddubsw m4, m1, m6
19817 pmulhrsw m4, m7
19818 pmaddubsw m5, m3, m6
19819 pmulhrsw m5, m7
19820 packuswb m4, m5
19821 movu [r0 + 1793 * 16], m4
19822
19823 ; mode 29 [row 0]
19824 movu m6, [r5 + 9 * 16]
19825 pmaddubsw m4, m0, m6
19826 pmulhrsw m4, m7
19827 pmaddubsw m5, m2, m6
19828 pmulhrsw m5, m7
19829 packuswb m4, m5
19830 movu [r0 + 1728 * 16], m4
19831 pmaddubsw m4, m1, m6
19832 pmulhrsw m4, m7
19833 pmaddubsw m5, m3, m6
19834 pmulhrsw m5, m7
19835 packuswb m4, m5
19836 movu [r0 + 1729 * 16], m4
19837
19838 ; mode 29 [row 2]
19839 movu m6, [r5 + 27 * 16]
19840 pmaddubsw m4, m0, m6
19841 pmulhrsw m4, m7
19842 pmaddubsw m5, m2, m6
19843 pmulhrsw m5, m7
19844 packuswb m4, m5
19845 movu [r0 + 1732 * 16], m4
19846 pmaddubsw m4, m1, m6
19847 pmulhrsw m4, m7
19848 pmaddubsw m5, m3, m6
19849 pmulhrsw m5, m7
19850 packuswb m4, m5
19851 movu [r0 + 1733 * 16], m4
19852
19853 ; mode 31 [row 0]
19854 movu m6, [r5 + 17 * 16]
19855 pmaddubsw m4, m0, m6
19856 pmulhrsw m4, m7
19857 pmaddubsw m5, m2, m6
19858 pmulhrsw m5, m7
19859 packuswb m4, m5
19860 movu [r0 + 1856 * 16], m4
19861 pmaddubsw m4, m1, m6
19862 pmulhrsw m4, m7
19863 pmaddubsw m5, m3, m6
19864 pmulhrsw m5, m7
19865 packuswb m4, m5
19866 movu [r0 + 1857 * 16], m4
19867
19868 ; mode 32 [row 0]
19869 movu m6, [r5 + 21 * 16]
19870 pmaddubsw m4, m0, m6
19871 pmulhrsw m4, m7
19872 pmaddubsw m5, m2, m6
19873 pmulhrsw m5, m7
19874 packuswb m4, m5
19875 movu [r0 + 1920 * 16], m4
19876 pmaddubsw m4, m1, m6
19877 pmulhrsw m4, m7
19878 pmaddubsw m5, m3, m6
19879 pmulhrsw m5, m7
19880 packuswb m4, m5
19881 movu [r0 + 1921 * 16], m4
19882
19883 ; mode 27 [row 15]
19884 movu m0, [r3 + 2]
19885 movd m1, [r3 + 3]
19886 palignr m1, m0, 1
19887 punpcklbw m0, m1
19888 movu m2, [r3 + 10]
19889 movd m3, [r3 + 11]
19890 palignr m3, m2, 1
19891 punpcklbw m2, m3
19892 movu m1, [r3 + 18]
19893 movd m3, [r3 + 19]
19894 palignr m3, m1, 1
19895 punpcklbw m1, m3
19896 movu m4, [r3 + 26]
19897 movd m5, [r3 + 27]
19898 palignr m5, m4, 1
19899 punpcklbw m4, m5
19900
19901 pshufb m5, m0, [tab_S2]
19902 movh [r0 + 1630 * 16], m5
19903 pshufb m5, m2, [tab_S2]
19904 movh [r0 + 1630 * 16 + 8], m5
19905 pshufb m5, m1, [tab_S2]
19906 movh [r0 + 1631 * 16], m5
19907 pshufb m5, m4, [tab_S2]
19908 movh [r0 + 1631 * 16 + 8], m5
19909
19910 ; mode 27 [row 16]
19911 movu m6, [r5 + 2 * 16]
19912 pmaddubsw m3, m0, m6
19913 pmulhrsw m3, m7
19914 pmaddubsw m5, m2, m6
19915 pmulhrsw m5, m7
19916 packuswb m3, m5
19917 movu [r0 + 1632 * 16], m3
19918
19919 ; mode 31 [row 1 - first half]
19920 movu [r0 + 1858 * 16], m3
19921
19922 pmaddubsw m3, m1, m6
19923 pmulhrsw m3, m7
19924 pmaddubsw m5, m4, m6
19925 pmulhrsw m5, m7
19926 packuswb m3, m5
19927 movu [r0 + 1633 * 16], m3
19928
19929 ; mode 31 [row 1 - second half]
19930 movu [r0 + 1859 * 16], m3
19931
19932 ; mode 27 [row 17]
19933 movu m6, [r5 + 4 * 16]
19934 pmaddubsw m3, m0, m6
19935 pmulhrsw m3, m7
19936 pmaddubsw m5, m2, m6
19937 pmulhrsw m5, m7
19938 packuswb m3, m5
19939 movu [r0 + 1634 * 16], m3
19940
19941 ; mode 29 [row 3 - first half]
19942 movu [r0 + 1734 * 16], m3
19943
19944 pmaddubsw m3, m1, m6
19945 pmulhrsw m3, m7
19946 pmaddubsw m5, m4, m6
19947 pmulhrsw m5, m7
19948 packuswb m3, m5
19949 movu [r0 + 1635 * 16], m3
19950
19951 ; mode 29 [row 3 - second half]
19952 movu [r0 + 1735 * 16], m3
19953
19954 ; mode 27 [row 18]
19955 movu m6, [r5 + 6 * 16]
19956 pmaddubsw m3, m0, m6
19957 pmulhrsw m3, m7
19958 pmaddubsw m5, m2, m6
19959 pmulhrsw m5, m7
19960 packuswb m3, m5
19961 movu [r0 + 1636 * 16], m3
19962 pmaddubsw m3, m1, m6
19963 pmulhrsw m3, m7
19964 pmaddubsw m5, m4, m6
19965 pmulhrsw m5, m7
19966 packuswb m3, m5
19967 movu [r0 + 1637 * 16], m3
19968
19969 ; mode 27 [row 19]
19970 movu m6, [r5 + 8 * 16]
19971 pmaddubsw m3, m0, m6
19972 pmulhrsw m3, m7
19973 pmaddubsw m5, m2, m6
19974 pmulhrsw m5, m7
19975 packuswb m3, m5
19976 movu [r0 + 1638 * 16], m3
19977
19978 ; mode 28 [row 7 - first half]
19979 movu [r0 + 1678 * 16], m3
19980
19981 pmaddubsw m3, m1, m6
19982 pmulhrsw m3, m7
19983 pmaddubsw m5, m4, m6
19984 pmulhrsw m5, m7
19985 packuswb m3, m5
19986 movu [r0 + 1639 * 16], m3
19987
19988 ; mode 28 [row 7 - second half]
19989 movu [r0 + 1679 * 16], m3
19990
19991 ; mode 27 [row 20]
19992 movu m6, [r5 + 10 * 16]
19993 pmaddubsw m3, m0, m6
19994 pmulhrsw m3, m7
19995 pmaddubsw m5, m2, m6
19996 pmulhrsw m5, m7
19997 packuswb m3, m5
19998 movu [r0 + 1640 * 16], m3
19999
20000 ; mode 32 [row 1 - first half]
20001 movu [r0 + 1922 * 16], m3
20002
20003 pmaddubsw m3, m1, m6
20004 pmulhrsw m3, m7
20005 pmaddubsw m5, m4, m6
20006 pmulhrsw m5, m7
20007 packuswb m3, m5
20008 movu [r0 + 1641 * 16], m3
20009
20010 ; mode 32 [row 1 - second half]
20011 movu [r0 + 1923 * 16], m3
20012
20013 ; mode 27 [row 21]
20014 movu m6, [r5 + 12 * 16]
20015 pmaddubsw m3, m0, m6
20016 pmulhrsw m3, m7
20017 pmaddubsw m5, m2, m6
20018 pmulhrsw m5, m7
20019 packuswb m3, m5
20020 movu [r0 + 1642 * 16], m3
20021 pmaddubsw m3, m1, m6
20022 pmulhrsw m3, m7
20023 pmaddubsw m5, m4, m6
20024 pmulhrsw m5, m7
20025 packuswb m3, m5
20026 movu [r0 + 1643 * 16], m3
20027
20028 ; mode 27 [row 22]
20029 movu m6, [r5 + 14 * 16]
20030 pmaddubsw m3, m0, m6
20031 pmulhrsw m3, m7
20032 pmaddubsw m5, m2, m6
20033 pmulhrsw m5, m7
20034 packuswb m3, m5
20035 movu [r0 + 1644 * 16], m3
20036 pmaddubsw m3, m1, m6
20037 pmulhrsw m3, m7
20038 pmaddubsw m5, m4, m6
20039 pmulhrsw m5, m7
20040 packuswb m3, m5
20041 movu [r0 + 1645 * 16], m3
20042
20043 ; mode 27 [row 23]
20044 movu m6, [r5 + 16 * 16]
20045 pmaddubsw m3, m0, m6
20046 pmulhrsw m3, m7
20047 pmaddubsw m5, m2, m6
20048 pmulhrsw m5, m7
20049 packuswb m3, m5
20050 movu [r0 + 1646 * 16], m3
20051 pmaddubsw m3, m1, m6
20052 pmulhrsw m3, m7
20053 pmaddubsw m5, m4, m6
20054 pmulhrsw m5, m7
20055 packuswb m3, m5
20056 movu [r0 + 1647 * 16], m3
20057
20058 ; mode 27 [row 24]
20059 movu m6, [r5 + 18 * 16]
20060 pmaddubsw m3, m0, m6
20061 pmulhrsw m3, m7
20062 pmaddubsw m5, m2, m6
20063 pmulhrsw m5, m7
20064 packuswb m3, m5
20065 movu [r0 + 1648 * 16], m3
20066
20067 ; mode 28 [row 9 - first half]
20068 movu [r0 + 1682 * 16], m3
20069
20070 pmaddubsw m3, m1, m6
20071 pmulhrsw m3, m7
20072 pmaddubsw m5, m4, m6
20073 pmulhrsw m5, m7
20074 packuswb m3, m5
20075 movu [r0 + 1649 * 16], m3
20076
20077 ; mode 28 [row 9 - second half]
20078 movu [r0 + 1683 * 16], m3
20079
20080 ; mode 27 [row 25]
20081 movu m6, [r5 + 20 * 16]
20082 pmaddubsw m3, m0, m6
20083 pmulhrsw m3, m7
20084 pmaddubsw m5, m2, m6
20085 pmulhrsw m5, m7
20086 packuswb m3, m5
20087 movu [r0 + 1650 * 16], m3
20088
20089 ; mode 30 [row 3 - first half]
20090 movu [r0 + 1798 * 16], m3
20091
20092 ; mode 33 [row 1 - first half]
20093 movu [r0 + 1986 * 16], m3
20094
20095 pmaddubsw m3, m1, m6
20096 pmulhrsw m3, m7
20097 pmaddubsw m5, m4, m6
20098 pmulhrsw m5, m7
20099 packuswb m3, m5
20100 movu [r0 + 1651 * 16], m3
20101
20102 ; mode 30 [row 3 - second half]
20103 movu [r0 + 1799 * 16], m3
20104
20105 ; mode 33 [row 1 - second half]
20106 movu [r0 + 1987 * 16], m3
20107
20108 ; mode 27 [row 26]
20109 movu m6, [r5 + 22 * 16]
20110 pmaddubsw m3, m0, m6
20111 pmulhrsw m3, m7
20112 pmaddubsw m5, m2, m6
20113 pmulhrsw m5, m7
20114 packuswb m3, m5
20115 movu [r0 + 1652 * 16], m3
20116
20117 ; mode 29 [row 5 - first half]
20118 movu [r0 + 1738 * 16], m3
20119
20120 pmaddubsw m3, m1, m6
20121 pmulhrsw m3, m7
20122 pmaddubsw m5, m4, m6
20123 pmulhrsw m5, m7
20124 packuswb m3, m5
20125 movu [r0 + 1653 * 16], m3
20126
20127 ; mode 29 [row 5 - second half]
20128 movu [r0 + 1739 * 16], m3
20129
20130 ; mode 27 [row 27]
20131 movu m6, [r5 + 24 * 16]
20132 pmaddubsw m3, m0, m6
20133 pmulhrsw m3, m7
20134 pmaddubsw m5, m2, m6
20135 pmulhrsw m5, m7
20136 packuswb m3, m5
20137 movu [r0 + 1654 * 16], m3
20138 pmaddubsw m3, m1, m6
20139 pmulhrsw m3, m7
20140 pmaddubsw m5, m4, m6
20141 pmulhrsw m5, m7
20142 packuswb m3, m5
20143 movu [r0 + 1655 * 16], m3
20144
20145 ; mode 27 [row 28]
20146 movu m6, [r5 + 26 * 16]
20147 pmaddubsw m3, m0, m6
20148 pmulhrsw m3, m7
20149 pmaddubsw m5, m2, m6
20150 pmulhrsw m5, m7
20151 packuswb m3, m5
20152 movu [r0 + 1656 * 16], m3
20153 pmaddubsw m3, m1, m6
20154 pmulhrsw m3, m7
20155 pmaddubsw m5, m4, m6
20156 pmulhrsw m5, m7
20157 packuswb m3, m5
20158 movu [r0 + 1657 * 16], m3
20159
20160 ; mode 27 [row 29]
20161 movu m6, [r5 + 28 * 16]
20162 pmaddubsw m3, m0, m6
20163 pmulhrsw m3, m7
20164 pmaddubsw m5, m2, m6
20165 pmulhrsw m5, m7
20166 packuswb m3, m5
20167 movu [r0 + 1658 * 16], m3
20168
20169 ; mode 28 [row 11 - first half]
20170 movu [r0 + 1686 * 16], m3
20171
20172 pmaddubsw m3, m1, m6
20173 pmulhrsw m3, m7
20174 pmaddubsw m5, m4, m6
20175 pmulhrsw m5, m7
20176 packuswb m3, m5
20177 movu [r0 + 1659 * 16], m3
20178
20179 ; mode 28 [row 11 - second half]
20180 movu [r0 + 1687 * 16], m3
20181
20182 ; mode 27 [row 30]
20183 movu m6, [r5 + 30 * 16]
20184 pmaddubsw m3, m0, m6
20185 pmulhrsw m3, m7
20186 pmaddubsw m5, m2, m6
20187 pmulhrsw m5, m7
20188 packuswb m3, m5
20189 movu [r0 + 1660 * 16], m3
20190 pmaddubsw m3, m1, m6
20191 pmulhrsw m3, m7
20192 pmaddubsw m5, m4, m6
20193 pmulhrsw m5, m7
20194 packuswb m3, m5
20195 movu [r0 + 1661 * 16], m3
20196
20197 ; mode 28 [row 6]
20198 movu m6, [r5 + 3 * 16]
20199 pmaddubsw m3, m0, m6
20200 pmulhrsw m3, m7
20201 pmaddubsw m5, m2, m6
20202 pmulhrsw m5, m7
20203 packuswb m3, m5
20204 movu [r0 + 1676 * 16], m3
20205 pmaddubsw m3, m1, m6
20206 pmulhrsw m3, m7
20207 pmaddubsw m5, m4, m6
20208 pmulhrsw m5, m7
20209 packuswb m3, m5
20210 movu [r0 + 1677 * 16], m3
20211
20212 ; mode 28 [row 8]
20213 movu m6, [r5 + 13 * 16]
20214 pmaddubsw m3, m0, m6
20215 pmulhrsw m3, m7
20216 pmaddubsw m5, m2, m6
20217 pmulhrsw m5, m7
20218 packuswb m3, m5
20219 movu [r0 + 1680 * 16], m3
20220
20221 ; mode 29 [row 4 - first half]
20222 movu [r0 + 1736 * 16], m3
20223
20224 pmaddubsw m3, m1, m6
20225 pmulhrsw m3, m7
20226 pmaddubsw m5, m4, m6
20227 pmulhrsw m5, m7
20228 packuswb m3, m5
20229 movu [r0 + 1681 * 16], m3
20230
20231 ; mode 29 [row 4 - second half]
20232 movu [r0 + 1737 * 16], m3
20233
20234 ; mode 28 [row 10]
20235 movu m6, [r5 + 23 * 16]
20236 pmaddubsw m3, m0, m6
20237 pmulhrsw m3, m7
20238 pmaddubsw m5, m2, m6
20239 pmulhrsw m5, m7
20240 packuswb m3, m5
20241 movu [r0 + 1684 * 16], m3
20242 pmaddubsw m3, m1, m6
20243 pmulhrsw m3, m7
20244 pmaddubsw m5, m4, m6
20245 pmulhrsw m5, m7
20246 packuswb m3, m5
20247 movu [r0 + 1685 * 16], m3
20248
20249 ; mode 29 [row 6]
20250 movu m6, [r5 + 31 * 16]
20251 pmaddubsw m3, m0, m6
20252 pmulhrsw m3, m7
20253 pmaddubsw m5, m2, m6
20254 pmulhrsw m5, m7
20255 packuswb m3, m5
20256 movu [r0 + 1740 * 16], m3
20257
20258 ; mode 32 [row 2 - first half]
20259 movu [r0 + 1924 * 16], m3
20260
20261 pmaddubsw m3, m1, m6
20262 pmulhrsw m3, m7
20263 pmaddubsw m5, m4, m6
20264 pmulhrsw m5, m7
20265 packuswb m3, m5
20266 movu [r0 + 1741 * 16], m3
20267
20268 ; mode 32 [row 2 - second half]
20269 movu [r0 + 1925 * 16], m3
20270
20271 ; mode 30 [row 2]
20272 movu m6, [r5 + 7 * 16]
20273 pmaddubsw m3, m0, m6
20274 pmulhrsw m3, m7
20275 pmaddubsw m5, m2, m6
20276 pmulhrsw m5, m7
20277 packuswb m3, m5
20278 movu [r0 + 1796 * 16], m3
20279 pmaddubsw m3, m1, m6
20280 pmulhrsw m3, m7
20281 pmaddubsw m5, m4, m6
20282 pmulhrsw m5, m7
20283 packuswb m3, m5
20284 movu [r0 + 1797 * 16], m3
20285
20286 ; mode 31 [row 2]
20287 movu m6, [r5 + 19 * 16]
20288 pmaddubsw m3, m0, m6
20289 pmulhrsw m3, m7
20290 pmaddubsw m5, m2, m6
20291 pmulhrsw m5, m7
20292 packuswb m3, m5
20293 movu [r0 + 1860 * 16], m3
20294 pmaddubsw m3, m1, m6
20295 pmulhrsw m3, m7
20296 pmaddubsw m5, m4, m6
20297 pmulhrsw m5, m7
20298 packuswb m3, m5
20299 movu [r0 + 1861 * 16], m3
20300
20301 ; mode 27 [row 31]
20302 movu m0, [r3 + 3]
20303 movd m1, [r3 + 4]
20304 palignr m1, m0, 1
20305 punpcklbw m0, m1
20306 movu m2, [r3 + 11]
20307 movd m3, [r3 + 12]
20308 palignr m3, m2, 1
20309 punpcklbw m2, m3
20310 movu m1, [r3 + 19]
20311 movd m3, [r3 + 20]
20312 palignr m3, m1, 1
20313 punpcklbw m1, m3
20314 movu m4, [r3 + 27]
20315 movd m5, [r3 + 28]
20316 palignr m5, m4, 1
20317 punpcklbw m4, m5
20318
20319 pshufb m5, m0, [tab_S2]
20320 movh [r0 + 1662 * 16], m5
20321 pshufb m5, m2, [tab_S2]
20322 movh [r0 + 1662 * 16 + 8], m5
20323 pshufb m5, m1, [tab_S2]
20324 movh [r0 + 1663 * 16], m5
20325 pshufb m5, m4, [tab_S2]
20326 movh [r0 + 1663 * 16 + 8], m5
20327
20328 ; mode 28 [row 12]
20329 movu m6, [r5 + 1 * 16]
20330 pmaddubsw m3, m0, m6
20331 pmulhrsw m3, m7
20332 pmaddubsw m5, m2, m6
20333 pmulhrsw m5, m7
20334 packuswb m3, m5
20335 movu [r0 + 1688 * 16], m3
20336
20337 ; mode 30 [row 4 - first half]
20338 movu [r0 + 1800 * 16], m3
20339
20340 pmaddubsw m3, m1, m6
20341 pmulhrsw m3, m7
20342 pmaddubsw m5, m4, m6
20343 pmulhrsw m5, m7
20344 packuswb m3, m5
20345 movu [r0 + 1689 * 16], m3
20346
20347 ; mode 30 [row 4 - second half]
20348 movu [r0 + 1801 * 16], m3
20349
20350 ; mode 28 [row 13]
20351 movu m6, [r5 + 6 * 16]
20352 pmaddubsw m3, m0, m6
20353 pmulhrsw m3, m7
20354 pmaddubsw m5, m2, m6
20355 pmulhrsw m5, m7
20356 packuswb m3, m5
20357 movu [r0 + 1690 * 16], m3
20358 pmaddubsw m3, m1, m6
20359 pmulhrsw m3, m7
20360 pmaddubsw m5, m4, m6
20361 pmulhrsw m5, m7
20362 packuswb m3, m5
20363 movu [r0 + 1691 * 16], m3
20364
20365 ; mode 28 [row 14]
20366 movu m6, [r5 + 11 * 16]
20367 pmaddubsw m3, m0, m6
20368 pmulhrsw m3, m7
20369 pmaddubsw m5, m2, m6
20370 pmulhrsw m5, m7
20371 packuswb m3, m5
20372 movu [r0 + 1692 * 16], m3
20373 pmaddubsw m3, m1, m6
20374 pmulhrsw m3, m7
20375 pmaddubsw m5, m4, m6
20376 pmulhrsw m5, m7
20377 packuswb m3, m5
20378 movu [r0 + 1693 * 16], m3
20379
20380 ; mode 28 [row 15]
20381 movu m6, [r5 + 16 * 16]
20382 pmaddubsw m3, m0, m6
20383 pmulhrsw m3, m7
20384 pmaddubsw m5, m2, m6
20385 pmulhrsw m5, m7
20386 packuswb m3, m5
20387 movu [r0 + 1694 * 16], m3
20388 pmaddubsw m3, m1, m6
20389 pmulhrsw m3, m7
20390 pmaddubsw m5, m4, m6
20391 pmulhrsw m5, m7
20392 packuswb m3, m5
20393 movu [r0 + 1695 * 16], m3
20394
20395 ; mode 28 [row 16]
20396 movu m6, [r5 + 21 * 16]
20397 pmaddubsw m3, m0, m6
20398 pmulhrsw m3, m7
20399 pmaddubsw m5, m2, m6
20400 pmulhrsw m5, m7
20401 packuswb m3, m5
20402 movu [r0 + 1696 * 16], m3
20403
20404 ; mode 31 [row 4 - first half]
20405 movu [r0 + 1864 * 16], m3
20406
20407 pmaddubsw m3, m1, m6
20408 pmulhrsw m3, m7
20409 pmaddubsw m5, m4, m6
20410 pmulhrsw m5, m7
20411 packuswb m3, m5
20412 movu [r0 + 1697 * 16], m3
20413
20414 ; mode 31 [row 4 - second half]
20415 movu [r0 + 1865 * 16], m3
20416
20417 ; mode 28 [row 17]
20418 movu m6, [r5 + 26 * 16]
20419 pmaddubsw m3, m0, m6
20420 pmulhrsw m3, m7
20421 pmaddubsw m5, m2, m6
20422 pmulhrsw m5, m7
20423 packuswb m3, m5
20424 movu [r0 + 1698 * 16], m3
20425
20426 ; mode 29 [row 9 - first half]
20427 movu [r0 + 1746 * 16], m3
20428
20429 pmaddubsw m3, m1, m6
20430 pmulhrsw m3, m7
20431 pmaddubsw m5, m4, m6
20432 pmulhrsw m5, m7
20433 packuswb m3, m5
20434 movu [r0 + 1699 * 16], m3
20435
20436 ; mode 29 [row 9 - second half]
20437 movu [r0 + 1747 * 16], m3
20438
20439 ; mode 28 [row 18]
20440 movu m6, [r5 + 31 * 16]
20441 pmaddubsw m3, m0, m6
20442 pmulhrsw m3, m7
20443 pmaddubsw m5, m2, m6
20444 pmulhrsw m5, m7
20445 packuswb m3, m5
20446 movu [r0 + 1700 * 16], m3
20447 pmaddubsw m3, m1, m6
20448 pmulhrsw m3, m7
20449 pmaddubsw m5, m4, m6
20450 pmulhrsw m5, m7
20451 packuswb m3, m5
20452 movu [r0 + 1701 * 16], m3
20453
20454 ; mode 29 [row 7]
20455 movu m6, [r5 + 8 * 16]
20456 pmaddubsw m3, m0, m6
20457 pmulhrsw m3, m7
20458 pmaddubsw m5, m2, m6
20459 pmulhrsw m5, m7
20460 packuswb m3, m5
20461 movu [r0 + 1742 * 16], m3
20462 pmaddubsw m3, m1, m6
20463 pmulhrsw m3, m7
20464 pmaddubsw m5, m4, m6
20465 pmulhrsw m5, m7
20466 packuswb m3, m5
20467 movu [r0 + 1743 * 16], m3
20468
20469 ; mode 29 [row 8]
20470 movu m6, [r5 + 17 * 16]
20471 pmaddubsw m3, m0, m6
20472 pmulhrsw m3, m7
20473 pmaddubsw m5, m2, m6
20474 pmulhrsw m5, m7
20475 packuswb m3, m5
20476 movu [r0 + 1744 * 16], m3
20477 pmaddubsw m3, m1, m6
20478 pmulhrsw m3, m7
20479 pmaddubsw m5, m4, m6
20480 pmulhrsw m5, m7
20481 packuswb m3, m5
20482 movu [r0 + 1745 * 16], m3
20483
20484 ; mode 30 [row 5]
20485 movu m6, [r5 + 14 * 16]
20486 pmaddubsw m3, m0, m6
20487 pmulhrsw m3, m7
20488 pmaddubsw m5, m2, m6
20489 pmulhrsw m5, m7
20490 packuswb m3, m5
20491 movu [r0 + 1802 * 16], m3
20492
20493 ; mode 33 [row 2 - first half]
20494 movu [r0 + 1988 * 16], m3
20495
20496 pmaddubsw m3, m1, m6
20497 pmulhrsw m3, m7
20498 pmaddubsw m5, m4, m6
20499 pmulhrsw m5, m7
20500 packuswb m3, m5
20501 movu [r0 + 1803 * 16], m3
20502
20503 ; mode 33 [row 2 - second half]
20504 movu [r0 + 1989 * 16], m3
20505
20506 ; mode 30 [row 6]
20507 movu m6, [r5 + 27 * 16]
20508 pmaddubsw m3, m0, m6
20509 pmulhrsw m3, m7
20510 pmaddubsw m5, m2, m6
20511 pmulhrsw m5, m7
20512 packuswb m3, m5
20513 movu [r0 + 1804 * 16], m3
20514 pmaddubsw m3, m1, m6
20515 pmulhrsw m3, m7
20516 pmaddubsw m5, m4, m6
20517 pmulhrsw m5, m7
20518 packuswb m3, m5
20519 movu [r0 + 1805 * 16], m3
20520
20521 ; mode 31 [row 3]
20522 movu m6, [r5 + 4 * 16]
20523 pmaddubsw m3, m0, m6
20524 pmulhrsw m3, m7
20525 pmaddubsw m5, m2, m6
20526 pmulhrsw m5, m7
20527 packuswb m3, m5
20528 movu [r0 + 1862 * 16], m3
20529 pmaddubsw m3, m1, m6
20530 pmulhrsw m3, m7
20531 pmaddubsw m5, m4, m6
20532 pmulhrsw m5, m7
20533 packuswb m3, m5
20534 movu [r0 + 1863 * 16], m3
20535
20536 ; mode 32 [row 3]
20537 movu m6, [r5 + 20 * 16]
20538 pmaddubsw m3, m0, m6
20539 pmulhrsw m3, m7
20540 pmaddubsw m5, m2, m6
20541 pmulhrsw m5, m7
20542 packuswb m3, m5
20543 movu [r0 + 1926 * 16], m3
20544 pmaddubsw m3, m1, m6
20545 pmulhrsw m3, m7
20546 pmaddubsw m5, m4, m6
20547 pmulhrsw m5, m7
20548 packuswb m3, m5
20549 movu [r0 + 1927 * 16], m3
20550
20551 ; mode 28 [row 19]
20552 movu m6, [r5 + 4 * 16]
20553 movu m0, [r3 + 4]
20554 movd m1, [r3 + 5]
20555 palignr m1, m0, 1
20556 punpcklbw m0, m1
20557 pmaddubsw m3, m0, m6
20558 pmulhrsw m3, m7
20559 movu m2, [r3 + 12]
20560 movd m4, [r3 + 13]
20561 palignr m4, m2, 1
20562 punpcklbw m2, m4
20563 pmaddubsw m5, m2, m6
20564 pmulhrsw m5, m7
20565 packuswb m3, m5
20566 movu [r0 + 1702 * 16], m3
20567
20568 movu m1, [r3 + 20]
20569 movd m3, [r3 + 21]
20570 palignr m3, m1, 1
20571 punpcklbw m1, m3
20572 pmaddubsw m3, m1, m6
20573 pmulhrsw m3, m7
20574 movu m4, [r3 + 28]
20575 movd m5, [r3 + 29]
20576 palignr m5, m4, 1
20577 punpcklbw m4, m5
20578 pmaddubsw m5, m4, m6
20579 pmulhrsw m5, m7
20580 packuswb m3, m5
20581 movu [r0 + 1703 * 16], m3
20582
20583 ; mode 28 [row 20]
20584 movu m6, [r5 + 9 * 16]
20585 pmaddubsw m3, m0, m6
20586 pmulhrsw m3, m7
20587 pmaddubsw m5, m2, m6
20588 pmulhrsw m5, m7
20589 packuswb m3, m5
20590 movu [r0 + 1704 * 16], m3
20591
20592 ; mode 32 [row 4 - first half]
20593 movu [r0 + 1928 * 16], m3
20594
20595 pmaddubsw m3, m1, m6
20596 pmulhrsw m3, m7
20597 pmaddubsw m5, m4, m6
20598 pmulhrsw m5, m7
20599 packuswb m3, m5
20600 movu [r0 + 1705 * 16], m3
20601
20602 ; mode 32 [row 4 - second half]
20603 movu [r0 + 1929 * 16], m3
20604
20605 ; mode 28 [row 21]
20606 movu m6, [r5 + 14 * 16]
20607 pmaddubsw m3, m0, m6
20608 pmulhrsw m3, m7
20609 pmaddubsw m5, m2, m6
20610 pmulhrsw m5, m7
20611 packuswb m3, m5
20612 movu [r0 + 1706 * 16], m3
20613 pmaddubsw m3, m1, m6
20614 pmulhrsw m3, m7
20615 pmaddubsw m5, m4, m6
20616 pmulhrsw m5, m7
20617 packuswb m3, m5
20618 movu [r0 + 1707 * 16], m3
20619
20620 ; mode 28 [row 22]
20621 movu m6, [r5 + 19 * 16]
20622 pmaddubsw m3, m0, m6
20623 pmulhrsw m3, m7
20624 pmaddubsw m5, m2, m6
20625 pmulhrsw m5, m7
20626 packuswb m3, m5
20627 movu [r0 + 1708 * 16], m3
20628 pmaddubsw m3, m1, m6
20629 pmulhrsw m3, m7
20630 pmaddubsw m5, m4, m6
20631 pmulhrsw m5, m7
20632 packuswb m3, m5
20633 movu [r0 + 1709 * 16], m3
20634
20635 ; mode 28 [row 23]
20636 movu m6, [r5 + 24 * 16]
20637 pmaddubsw m3, m0, m6
20638 pmulhrsw m3, m7
20639 pmaddubsw m5, m2, m6
20640 pmulhrsw m5, m7
20641 packuswb m3, m5
20642 movu [r0 + 1710 * 16], m3
20643 pmaddubsw m3, m1, m6
20644 pmulhrsw m3, m7
20645 pmaddubsw m5, m4, m6
20646 pmulhrsw m5, m7
20647 packuswb m3, m5
20648 movu [r0 + 1711 * 16], m3
20649
20650 ; mode 28 [row 24]
20651 movu m6, [r5 + 29 * 16]
20652 pmaddubsw m3, m0, m6
20653 pmulhrsw m3, m7
20654 pmaddubsw m5, m2, m6
20655 pmulhrsw m5, m7
20656 packuswb m3, m5
20657 movu [r0 + 1712 * 16], m3
20658 pmaddubsw m3, m1, m6
20659 pmulhrsw m3, m7
20660 pmaddubsw m5, m4, m6
20661 pmulhrsw m5, m7
20662 packuswb m3, m5
20663 movu [r0 + 1713 * 16], m3
20664
20665 ; mode 29 [row 10]
20666 movu m6, [r5 + 3 * 16]
20667 pmaddubsw m3, m0, m6
20668 pmulhrsw m3, m7
20669 pmaddubsw m5, m2, m6
20670 pmulhrsw m5, m7
20671 packuswb m3, m5
20672 movu [r0 + 1748 * 16], m3
20673 pmaddubsw m3, m1, m6
20674 pmulhrsw m3, m7
20675 pmaddubsw m5, m4, m6
20676 pmulhrsw m5, m7
20677 packuswb m3, m5
20678 movu [r0 + 1749 * 16], m3
20679
20680 ; mode 29 [row 11]
20681 movu m6, [r5 + 12 * 16]
20682 pmaddubsw m3, m0, m6
20683 pmulhrsw m3, m7
20684 pmaddubsw m5, m2, m6
20685 pmulhrsw m5, m7
20686 packuswb m3, m5
20687 movu [r0 + 1750 * 16], m3
20688 pmaddubsw m3, m1, m6
20689 pmulhrsw m3, m7
20690 pmaddubsw m5, m4, m6
20691 pmulhrsw m5, m7
20692 packuswb m3, m5
20693 movu [r0 + 1751 * 16], m3
20694
20695 ; mode 29 [row 12]
20696 movu m6, [r5 + 21 * 16]
20697 pmaddubsw m3, m0, m6
20698 pmulhrsw m3, m7
20699 pmaddubsw m5, m2, m6
20700 pmulhrsw m5, m7
20701 packuswb m3, m5
20702 movu [r0 + 1752 * 16], m3
20703
20704 ; mode 30 [row 8 - first half]
20705 movu [r0 + 1808 * 16], m3
20706
20707 pmaddubsw m3, m1, m6
20708 pmulhrsw m3, m7
20709 pmaddubsw m5, m4, m6
20710 pmulhrsw m5, m7
20711 packuswb m3, m5
20712 movu [r0 + 1753 * 16], m3
20713
20714 ; mode 30 [row 8 - second half]
20715 movu [r0 + 1809 * 16], m3
20716
20717 ; mode 29 [row 13]
20718 movu m6, [r5 + 30 * 16]
20719 pmaddubsw m3, m0, m6
20720 pmulhrsw m3, m7
20721 pmaddubsw m5, m2, m6
20722 pmulhrsw m5, m7
20723 packuswb m3, m5
20724 movu [r0 + 1754 * 16], m3
20725
20726 ; mode 32 [row 5 - first half]
20727 movu [r0 + 1930 * 16], m3
20728
20729 pmaddubsw m3, m1, m6
20730 pmulhrsw m3, m7
20731 pmaddubsw m5, m4, m6
20732 pmulhrsw m5, m7
20733 packuswb m3, m5
20734 movu [r0 + 1755 * 16], m3
20735
20736 ; mode 32 [row 5 - second half]
20737 movu [r0 + 1931 * 16], m3
20738
20739 ; mode 30 [row 7]
20740 movu m6, [r5 + 8 * 16]
20741 pmaddubsw m3, m0, m6
20742 pmulhrsw m3, m7
20743 pmaddubsw m5, m2, m6
20744 pmulhrsw m5, m7
20745 packuswb m3, m5
20746 movu [r0 + 1806 * 16], m3
20747
20748 ; mode 33 [row 3 - first half]
20749 movu [r0 + 1990 * 16], m3
20750
20751 pmaddubsw m3, m1, m6
20752 pmulhrsw m3, m7
20753 pmaddubsw m5, m4, m6
20754 pmulhrsw m5, m7
20755 packuswb m3, m5
20756 movu [r0 + 1807 * 16], m3
20757
20758 ; mode 33 [row 3 - second half]
20759 movu [r0 + 1991 * 16], m3
20760
20761 ; mode 31 [row 5]
20762 movu m6, [r5 + 6 * 16]
20763 pmaddubsw m3, m0, m6
20764 pmulhrsw m3, m7
20765 pmaddubsw m5, m2, m6
20766 pmulhrsw m5, m7
20767 packuswb m3, m5
20768 movu [r0 + 1866 * 16], m3
20769 pmaddubsw m3, m1, m6
20770 pmulhrsw m3, m7
20771 pmaddubsw m5, m4, m6
20772 pmulhrsw m5, m7
20773 packuswb m3, m5
20774 movu [r0 + 1867 * 16], m3
20775
20776 ; mode 31 [row 6]
20777 movu m6, [r5 + 23 * 16]
20778 pmaddubsw m3, m0, m6
20779 pmulhrsw m3, m7
20780 pmaddubsw m5, m2, m6
20781 pmulhrsw m5, m7
20782 packuswb m3, m5
20783 movu [r0 + 1868 * 16], m3
20784 pmaddubsw m3, m1, m6
20785 pmulhrsw m3, m7
20786 pmaddubsw m5, m4, m6
20787 pmulhrsw m5, m7
20788 packuswb m3, m5
20789 movu [r0 + 1869 * 16], m3
20790
20791 ; mode 28 [row 25]
20792 movu m6, [r5 + 2 * 16]
20793 movu m0, [r3 + 5]
20794 movd m1, [r3 + 6]
20795 palignr m1, m0, 1
20796 punpcklbw m0, m1
20797 pmaddubsw m3, m0, m6
20798 pmulhrsw m3, m7
20799 movu m2, [r3 + 13]
20800 movd m4, [r3 + 14]
20801 palignr m4, m2, 1
20802 punpcklbw m2, m4
20803 pmaddubsw m5, m2, m6
20804 pmulhrsw m5, m7
20805 packuswb m3, m5
20806 movu [r0 + 1714 * 16], m3
20807
20808 movu m1, [r3 + 21]
20809 movd m3, [r3 + 22]
20810 palignr m3, m1, 1
20811 punpcklbw m1, m3
20812 pmaddubsw m3, m1, m6
20813 pmulhrsw m3, m7
20814 movu m4, [r3 + 29]
20815 movd m5, [r3 + 30]
20816 palignr m5, m4, 1
20817 punpcklbw m4, m5
20818 pmaddubsw m5, m4, m6
20819 pmulhrsw m5, m7
20820 packuswb m3, m5
20821 movu [r0 + 1715 * 16], m3
20822
20823 ; mode 28 [row 26]
20824 movu m6, [r5 + 7 * 16]
20825 pmaddubsw m3, m0, m6
20826 pmulhrsw m3, m7
20827 pmaddubsw m5, m2, m6
20828 pmulhrsw m5, m7
20829 packuswb m3, m5
20830 movu [r0 + 1716 * 16], m3
20831
20832 ; mode 29 [row 14 - first half]
20833 movu [r0 + 1756 * 16], m3
20834
20835 pmaddubsw m3, m1, m6
20836 pmulhrsw m3, m7
20837 pmaddubsw m5, m4, m6
20838 pmulhrsw m5, m7
20839 packuswb m3, m5
20840 movu [r0 + 1717 * 16], m3
20841
20842 ; mode 29 [row 14 - second half]
20843 movu [r0 + 1757 * 16], m3
20844
20845 ; mode 28 [row 27]
20846 movu m6, [r5 + 12 * 16]
20847 pmaddubsw m3, m0, m6
20848 pmulhrsw m3, m7
20849 pmaddubsw m5, m2, m6
20850 pmulhrsw m5, m7
20851 packuswb m3, m5
20852 movu [r0 + 1718 * 16], m3
20853 pmaddubsw m3, m1, m6
20854 pmulhrsw m3, m7
20855 pmaddubsw m5, m4, m6
20856 pmulhrsw m5, m7
20857 packuswb m3, m5
20858 movu [r0 + 1719 * 16], m3
20859
20860 ; mode 28 [row 28]
20861 movu m6, [r5 + 17 * 16]
20862 pmaddubsw m3, m0, m6
20863 pmulhrsw m3, m7
20864 pmaddubsw m5, m2, m6
20865 pmulhrsw m5, m7
20866 packuswb m3, m5
20867 movu [r0 + 1720 * 16], m3
20868 pmaddubsw m3, m1, m6
20869 pmulhrsw m3, m7
20870 pmaddubsw m5, m4, m6
20871 pmulhrsw m5, m7
20872 packuswb m3, m5
20873 movu [r0 + 1721 * 16], m3
20874
20875 ; mode 28 [row 29]
20876 movu m6, [r5 + 22 * 16]
20877 pmaddubsw m3, m0, m6
20878 pmulhrsw m3, m7
20879 pmaddubsw m5, m2, m6
20880 pmulhrsw m5, m7
20881 packuswb m3, m5
20882 movu [r0 + 1722 * 16], m3
20883 pmaddubsw m3, m1, m6
20884 pmulhrsw m3, m7
20885 pmaddubsw m5, m4, m6
20886 pmulhrsw m5, m7
20887 packuswb m3, m5
20888 movu [r0 + 1723 * 16], m3
20889
20890 ; mode 28 [row 30]
20891 movu m6, [r5 + 27 * 16]
20892 pmaddubsw m3, m0, m6
20893 pmulhrsw m3, m7
20894 pmaddubsw m5, m2, m6
20895 pmulhrsw m5, m7
20896 packuswb m3, m5
20897 movu [r0 + 1724 * 16], m3
20898 pmaddubsw m3, m1, m6
20899 pmulhrsw m3, m7
20900 pmaddubsw m5, m4, m6
20901 pmulhrsw m5, m7
20902 packuswb m3, m5
20903 movu [r0 + 1725 * 16], m3
20904
20905 ; mode 29 [row 15]
20906 movu m6, [r5 + 16 * 16]
20907 pmaddubsw m3, m0, m6
20908 pmulhrsw m3, m7
20909 pmaddubsw m5, m2, m6
20910 pmulhrsw m5, m7
20911 packuswb m3, m5
20912 movu [r0 + 1758 * 16], m3
20913 pmaddubsw m3, m1, m6
20914 pmulhrsw m3, m7
20915 pmaddubsw m5, m4, m6
20916 pmulhrsw m5, m7
20917 packuswb m3, m5
20918 movu [r0 + 1759 * 16], m3
20919
20920 ; mode 29 [row 16]
20921 movu m6, [r5 + 25 * 16]
20922 pmaddubsw m3, m0, m6
20923 pmulhrsw m3, m7
20924 pmaddubsw m5, m2, m6
20925 pmulhrsw m5, m7
20926 packuswb m3, m5
20927 movu [r0 + 1760 * 16], m3
20928 pmaddubsw m3, m1, m6
20929 pmulhrsw m3, m7
20930 pmaddubsw m5, m4, m6
20931 pmulhrsw m5, m7
20932 packuswb m3, m5
20933 movu [r0 + 1761 * 16], m3
20934
20935 ; mode 30 [row 9]
20936 movu m6, [r5 + 2 * 16]
20937 pmaddubsw m3, m0, m6
20938 pmulhrsw m3, m7
20939 pmaddubsw m5, m2, m6
20940 pmulhrsw m5, m7
20941 packuswb m3, m5
20942 movu [r0 + 1810 * 16], m3
20943
20944 ; mode 33 [row 4 - first half]
20945 movu [r0 + 1992 * 16], m3
20946
20947 pmaddubsw m3, m1, m6
20948 pmulhrsw m3, m7
20949 pmaddubsw m5, m4, m6
20950 pmulhrsw m5, m7
20951 packuswb m3, m5
20952 movu [r0 + 1811 * 16], m3
20953
20954 ; mode 33 [row 4 - second half]
20955 movu [r0 + 1993 * 16], m3
20956
20957 ; mode 30 [row 10]
20958 movu m6, [r5 + 15 * 16]
20959 pmaddubsw m3, m0, m6
20960 pmulhrsw m3, m7
20961 pmaddubsw m5, m2, m6
20962 pmulhrsw m5, m7
20963 packuswb m3, m5
20964 movu [r0 + 1812 * 16], m3
20965 pmaddubsw m3, m1, m6
20966 pmulhrsw m3, m7
20967 pmaddubsw m5, m4, m6
20968 pmulhrsw m5, m7
20969 packuswb m3, m5
20970 movu [r0 + 1813 * 16], m3
20971
20972 ; mode 31 [row 7]
20973 movu m6, [r5 + 8 * 16]
20974 pmaddubsw m3, m0, m6
20975 pmulhrsw m3, m7
20976 pmaddubsw m5, m2, m6
20977 pmulhrsw m5, m7
20978 packuswb m3, m5
20979 movu [r0 + 1870 * 16], m3
20980 pmaddubsw m3, m1, m6
20981 pmulhrsw m3, m7
20982 pmaddubsw m5, m4, m6
20983 pmulhrsw m5, m7
20984 packuswb m3, m5
20985 movu [r0 + 1871 * 16], m3
20986
20987 ; mode 31 [row 8]
20988 movu m6, [r5 + 25 * 16]
20989 pmaddubsw m3, m0, m6
20990 pmulhrsw m3, m7
20991 pmaddubsw m5, m2, m6
20992 pmulhrsw m5, m7
20993 packuswb m3, m5
20994 movu [r0 + 1872 * 16], m3
20995 pmaddubsw m3, m1, m6
20996 pmulhrsw m3, m7
20997 pmaddubsw m5, m4, m6
20998 pmulhrsw m5, m7
20999 packuswb m3, m5
21000 movu [r0 + 1873 * 16], m3
21001
21002 ; mode 32 [row 6]
21003 movu m6, [r5 + 19 * 16]
21004 pmaddubsw m3, m0, m6
21005 pmulhrsw m3, m7
21006 pmaddubsw m5, m2, m6
21007 pmulhrsw m5, m7
21008 packuswb m3, m5
21009 movu [r0 + 1932 * 16], m3
21010 pmaddubsw m3, m1, m6
21011 pmulhrsw m3, m7
21012 pmaddubsw m5, m4, m6
21013 pmulhrsw m5, m7
21014 packuswb m3, m5
21015 movu [r0 + 1933 * 16], m3
21016
21017 ; mode 30 [row 11]
21018 movu m6, [r5 + 28 * 16]
21019 pmaddubsw m3, m0, m6
21020 pmulhrsw m3, m7
21021 pmaddubsw m5, m2, m6
21022 pmulhrsw m5, m7
21023 packuswb m3, m5
21024 movu [r0 + 1814 * 16], m3
21025
21026 ; mode 33 [row 5 - first half]
21027 movu [r0 + 1994 * 16], m3
21028
21029 pmaddubsw m3, m1, m6
21030 pmulhrsw m3, m7
21031 pmaddubsw m5, m4, m6
21032 pmulhrsw m5, m7
21033 packuswb m3, m5
21034 movu [r0 + 1815 * 16], m3
21035
21036 ; mode 33 [row 5 - second half]
21037 movu [r0 + 1995 * 16], m3
21038
21039 ; mode 28 [row 31]
21040 movu m0, [r3 + 6]
21041 movd m1, [r3 + 7]
21042 palignr m1, m0, 1
21043 punpcklbw m0, m1
21044 movu m2, [r3 + 14]
21045 movd m3, [r3 + 15]
21046 palignr m3, m2, 1
21047 punpcklbw m2, m3
21048 movu m1, [r3 + 22]
21049 movd m3, [r3 + 23]
21050 palignr m3, m1, 1
21051 punpcklbw m1, m3
21052 movu m4, [r3 + 30]
21053 movd m5, [r3 + 31]
21054 palignr m5, m4, 1
21055 punpcklbw m4, m5
21056
21057 pshufb m5, m0, [tab_S2]
21058 movh [r0 + 1726 * 16], m5
21059 pshufb m5, m2, [tab_S2]
21060 movh [r0 + 1726 * 16 + 8], m5
21061 pshufb m5, m1, [tab_S2]
21062 movh [r0 + 1727 * 16], m5
21063 pshufb m5, m4, [tab_S2]
21064 movh [r0 + 1727 * 16 + 8], m5
21065
21066 ; mode 29 [row 17]
21067 movu m6, [r5 + 2 * 16]
21068 pmaddubsw m3, m0, m6
21069 pmulhrsw m3, m7
21070 pmaddubsw m5, m2, m6
21071 pmulhrsw m5, m7
21072 packuswb m3, m5
21073 movu [r0 + 1762 * 16], m3
21074 pmaddubsw m3, m1, m6
21075 pmulhrsw m3, m7
21076 pmaddubsw m5, m4, m6
21077 pmulhrsw m5, m7
21078 packuswb m3, m5
21079 movu [r0 + 1763 * 16], m3
21080
21081 ; mode 29 [row 18]
21082 movu m6, [r5 + 11 * 16]
21083 pmaddubsw m3, m0, m6
21084 pmulhrsw m3, m7
21085 pmaddubsw m5, m2, m6
21086 pmulhrsw m5, m7
21087 packuswb m3, m5
21088 movu [r0 + 1764 * 16], m3
21089 pmaddubsw m3, m1, m6
21090 pmulhrsw m3, m7
21091 pmaddubsw m5, m4, m6
21092 pmulhrsw m5, m7
21093 packuswb m3, m5
21094 movu [r0 + 1765 * 16], m3
21095
21096 ; mode 29 [row 19]
21097 movu m6, [r5 + 20 * 16]
21098 pmaddubsw m3, m0, m6
21099 pmulhrsw m3, m7
21100 pmaddubsw m5, m2, m6
21101 pmulhrsw m5, m7
21102 packuswb m3, m5
21103 movu [r0 + 1766 * 16], m3
21104 pmaddubsw m3, m1, m6
21105 pmulhrsw m3, m7
21106 pmaddubsw m5, m4, m6
21107 pmulhrsw m5, m7
21108 packuswb m3, m5
21109 movu [r0 + 1767 * 16], m3
21110
21111 ; mode 29 [row 20]
21112 movu m6, [r5 + 29 * 16]
21113 pmaddubsw m3, m0, m6
21114 pmulhrsw m3, m7
21115 pmaddubsw m5, m2, m6
21116 pmulhrsw m5, m7
21117 packuswb m3, m5
21118 movu [r0 + 1768 * 16], m3
21119
21120 ; mode 32 [row 8 - first half]
21121 movu [r0 + 1936 * 16], m3
21122
21123 pmaddubsw m3, m1, m6
21124 pmulhrsw m3, m7
21125 pmaddubsw m5, m4, m6
21126 pmulhrsw m5, m7
21127 packuswb m3, m5
21128 movu [r0 + 1769 * 16], m3
21129
21130 ; mode 32 [row 8 - second half]
21131 movu [r0 + 1937 * 16], m3
21132
21133 ; mode 30 [row 12]
21134 movu m6, [r5 + 9 * 16]
21135 pmaddubsw m3, m0, m6
21136 pmulhrsw m3, m7
21137 pmaddubsw m5, m2, m6
21138 pmulhrsw m5, m7
21139 packuswb m3, m5
21140 movu [r0 + 1816 * 16], m3
21141 pmaddubsw m3, m1, m6
21142 pmulhrsw m3, m7
21143 pmaddubsw m5, m4, m6
21144 pmulhrsw m5, m7
21145 packuswb m3, m5
21146 movu [r0 + 1817 * 16], m3
21147
21148 ; mode 30 [row 13]
21149 movu m6, [r5 + 22 * 16]
21150 pmaddubsw m3, m0, m6
21151 pmulhrsw m3, m7
21152 pmaddubsw m5, m2, m6
21153 pmulhrsw m5, m7
21154 packuswb m3, m5
21155 movu [r0 + 1818 * 16], m3
21156
21157 ; mode 33 [row 6 - first half]
21158 movu [r0 + 1996 * 16], m3
21159
21160 pmaddubsw m3, m1, m6
21161 pmulhrsw m3, m7
21162 pmaddubsw m5, m4, m6
21163 pmulhrsw m5, m7
21164 packuswb m3, m5
21165 movu [r0 + 1819 * 16], m3
21166
21167 ; mode 33 [row 6 - second half]
21168 movu [r0 + 1997 * 16], m3
21169
21170 ; mode 31 [row 9]
21171 movu m6, [r5 + 10 * 16]
21172 pmaddubsw m3, m0, m6
21173 pmulhrsw m3, m7
21174 pmaddubsw m5, m2, m6
21175 pmulhrsw m5, m7
21176 packuswb m3, m5
21177 movu [r0 + 1874 * 16], m3
21178 pmaddubsw m3, m1, m6
21179 pmulhrsw m3, m7
21180 pmaddubsw m5, m4, m6
21181 pmulhrsw m5, m7
21182 packuswb m3, m5
21183 movu [r0 + 1875 * 16], m3
21184
21185 ; mode 31 [row 10]
21186 movu m6, [r5 + 27 * 16]
21187 pmaddubsw m3, m0, m6
21188 pmulhrsw m3, m7
21189 pmaddubsw m5, m2, m6
21190 pmulhrsw m5, m7
21191 packuswb m3, m5
21192 movu [r0 + 1876 * 16], m3
21193 pmaddubsw m3, m1, m6
21194 pmulhrsw m3, m7
21195 pmaddubsw m5, m4, m6
21196 pmulhrsw m5, m7
21197 packuswb m3, m5
21198 movu [r0 + 1877 * 16], m3
21199
21200 ; mode 32 [row 7]
21201 movu m6, [r5 + 8 * 16]
21202 pmaddubsw m3, m0, m6
21203 pmulhrsw m3, m7
21204 pmaddubsw m5, m2, m6
21205 pmulhrsw m5, m7
21206 packuswb m3, m5
21207 movu [r0 + 1934 * 16], m3
21208 pmaddubsw m3, m1, m6
21209 pmulhrsw m3, m7
21210 pmaddubsw m5, m4, m6
21211 pmulhrsw m5, m7
21212 packuswb m3, m5
21213 movu [r0 + 1935 * 16], m3
21214
21215 ; mode 29 [row 21]
21216 movu m6, [r5 + 6 * 16]
21217 movu m0, [r3 + 7]
21218 movd m1, [r3 + 8]
21219 palignr m1, m0, 1
21220 punpcklbw m0, m1
21221 pmaddubsw m3, m0, m6
21222 pmulhrsw m3, m7
21223 movu m2, [r3 + 15]
21224 movd m4, [r3 + 16]
21225 palignr m4, m2, 1
21226 punpcklbw m2, m4
21227 pmaddubsw m5, m2, m6
21228 pmulhrsw m5, m7
21229 packuswb m3, m5
21230 movu [r0 + 1770 * 16], m3
21231
21232 movu m1, [r3 + 23]
21233 movd m3, [r3 + 24]
21234 palignr m3, m1, 1
21235 punpcklbw m1, m3
21236 pmaddubsw m3, m1, m6
21237 pmulhrsw m3, m7
21238 movu m4, [r3 + 31]
21239 movd m5, [r3 + 32]
21240 palignr m5, m4, 1
21241 punpcklbw m4, m5
21242 pmaddubsw m5, m4, m6
21243 pmulhrsw m5, m7
21244 packuswb m3, m5
21245 movu [r0 + 1771 * 16], m3
21246
21247 ; mode 29 [row 22]
21248 movu m6, [r5 + 15 * 16]
21249 pmaddubsw m3, m0, m6
21250 pmulhrsw m3, m7
21251 pmaddubsw m5, m2, m6
21252 pmulhrsw m5, m7
21253 packuswb m3, m5
21254 movu [r0 + 1772 * 16], m3
21255 pmaddubsw m3, m1, m6
21256 pmulhrsw m3, m7
21257 pmaddubsw m5, m4, m6
21258 pmulhrsw m5, m7
21259 packuswb m3, m5
21260 movu [r0 + 1773 * 16], m3
21261
21262 ; mode 29 [row 23]
21263 movu m6, [r5 + 24 * 16]
21264 pmaddubsw m3, m0, m6
21265 pmulhrsw m3, m7
21266 pmaddubsw m5, m2, m6
21267 pmulhrsw m5, m7
21268 packuswb m3, m5
21269 movu [r0 + 1774 * 16], m3
21270 pmaddubsw m3, m1, m6
21271 pmulhrsw m3, m7
21272 pmaddubsw m5, m4, m6
21273 pmulhrsw m5, m7
21274 packuswb m3, m5
21275 movu [r0 + 1775 * 16], m3
21276
21277 ; mode 30 [row 14]
21278 movu m6, [r5 + 3 * 16]
21279 pmaddubsw m3, m0, m6
21280 pmulhrsw m3, m7
21281 pmaddubsw m5, m2, m6
21282 pmulhrsw m5, m7
21283 packuswb m3, m5
21284 movu [r0 + 1820 * 16], m3
21285 pmaddubsw m3, m1, m6
21286 pmulhrsw m3, m7
21287 pmaddubsw m5, m4, m6
21288 pmulhrsw m5, m7
21289 packuswb m3, m5
21290 movu [r0 + 1821 * 16], m3
21291
21292 ; mode 30 [row 15]
21293 movu m6, [r5 + 16 * 16]
21294 pmaddubsw m3, m0, m6
21295 pmulhrsw m3, m7
21296 pmaddubsw m5, m2, m6
21297 pmulhrsw m5, m7
21298 packuswb m3, m5
21299 movu [r0 + 1822 * 16], m3
21300
21301 ; mode 33 [row 7 - first half]
21302 movu [r0 + 1998 * 16], m3
21303
21304 pmaddubsw m3, m1, m6
21305 pmulhrsw m3, m7
21306 pmaddubsw m5, m4, m6
21307 pmulhrsw m5, m7
21308 packuswb m3, m5
21309 movu [r0 + 1823 * 16], m3
21310
21311 ; mode 33 [row 7 - second half]
21312 movu [r0 + 1999 * 16], m3
21313
21314 ; mode 30 [row 16]
21315 movu m6, [r5 + 29 * 16]
21316 pmaddubsw m3, m0, m6
21317 pmulhrsw m3, m7
21318 pmaddubsw m5, m2, m6
21319 pmulhrsw m5, m7
21320 packuswb m3, m5
21321 movu [r0 + 1824 * 16], m3
21322
21323 ; mode 31 [row 12 - first half]
21324 movu [r0 + 1880 * 16], m3
21325
21326 pmaddubsw m3, m1, m6
21327 pmulhrsw m3, m7
21328 pmaddubsw m5, m4, m6
21329 pmulhrsw m5, m7
21330 packuswb m3, m5
21331 movu [r0 + 1825 * 16], m3
21332
21333 ; mode 31 [row 12 - second half]
21334 movu [r0 + 1881 * 16], m3
21335
21336 ; mode 31 [row 11]
21337 movu m6, [r5 + 12 * 16]
21338 pmaddubsw m3, m0, m6
21339 pmulhrsw m3, m7
21340 pmaddubsw m5, m2, m6
21341 pmulhrsw m5, m7
21342 packuswb m3, m5
21343 movu [r0 + 1878 * 16], m3
21344 pmaddubsw m3, m1, m6
21345 pmulhrsw m3, m7
21346 pmaddubsw m5, m4, m6
21347 pmulhrsw m5, m7
21348 packuswb m3, m5
21349 movu [r0 + 1879 * 16], m3
21350
21351 ; mode 32 [row 9]
21352 movu m6, [r5 + 18 * 16]
21353 pmaddubsw m3, m0, m6
21354 pmulhrsw m3, m7
21355 pmaddubsw m5, m2, m6
21356 pmulhrsw m5, m7
21357 packuswb m3, m5
21358 movu [r0 + 1938 * 16], m3
21359 pmaddubsw m3, m1, m6
21360 pmulhrsw m3, m7
21361 pmaddubsw m5, m4, m6
21362 pmulhrsw m5, m7
21363 packuswb m3, m5
21364 movu [r0 + 1939 * 16], m3
21365
21366 ; mode 29 [row 24]
21367 movu m6, [r5 + 1 * 16]
21368 movu m0, [r3 + 8]
21369 movd m1, [r3 + 9]
21370 palignr m1, m0, 1
21371 punpcklbw m0, m1
21372 pmaddubsw m3, m0, m6
21373 pmulhrsw m3, m7
21374 movu m2, [r3 + 16]
21375 movd m4, [r3 + 17]
21376 palignr m4, m2, 1
21377 punpcklbw m2, m4
21378 pmaddubsw m5, m2, m6
21379 pmulhrsw m5, m7
21380 packuswb m3, m5
21381 movu [r0 + 1776 * 16], m3
21382
21383 movu m1, [r3 + 24]
21384 movd m3, [r3 + 25]
21385 palignr m3, m1, 1
21386 punpcklbw m1, m3
21387 pmaddubsw m3, m1, m6
21388 pmulhrsw m3, m7
21389 movu m4, [r3 + 32]
21390 movd m5, [r3 + 33]
21391 palignr m5, m4, 1
21392 punpcklbw m4, m5
21393 pmaddubsw m5, m4, m6
21394 pmulhrsw m5, m7
21395 packuswb m3, m5
21396 movu [r0 + 1777 * 16], m3
21397
21398 ; mode 29 [row 25]
21399 movu m6, [r5 + 10 * 16]
21400 pmaddubsw m3, m0, m6
21401 pmulhrsw m3, m7
21402 pmaddubsw m5, m2, m6
21403 pmulhrsw m5, m7
21404 packuswb m3, m5
21405 movu [r0 + 1778 * 16], m3
21406
21407 ; mode 30 [row 17 - first half]
21408 movu [r0 + 1826 * 16], m3
21409
21410 ; mode 33 [row 8 - first half]
21411 movu [r0 + 2000 * 16], m3
21412
21413 pmaddubsw m3, m1, m6
21414 pmulhrsw m3, m7
21415 pmaddubsw m5, m4, m6
21416 pmulhrsw m5, m7
21417 packuswb m3, m5
21418 movu [r0 + 1779 * 16], m3
21419
21420 ; mode 30 [row 17 - second half]
21421 movu [r0 + 1827 * 16], m3
21422
21423 ; mode 33 [row 8 - second half]
21424 movu [r0 + 2001 * 16], m3
21425
21426 ; mode 29 [row 26]
21427 movu m6, [r5 + 19 * 16]
21428 pmaddubsw m3, m0, m6
21429 pmulhrsw m3, m7
21430 pmaddubsw m5, m2, m6
21431 pmulhrsw m5, m7
21432 packuswb m3, m5
21433 movu [r0 + 1780 * 16], m3
21434 pmaddubsw m3, m1, m6
21435 pmulhrsw m3, m7
21436 pmaddubsw m5, m4, m6
21437 pmulhrsw m5, m7
21438 packuswb m3, m5
21439 movu [r0 + 1781 * 16], m3
21440
21441 ; mode 29 [row 27]
21442 movu m6, [r5 + 28 * 16]
21443 pmaddubsw m3, m0, m6
21444 pmulhrsw m3, m7
21445 pmaddubsw m5, m2, m6
21446 pmulhrsw m5, m7
21447 packuswb m3, m5
21448 movu [r0 + 1782 * 16], m3
21449
21450 ; mode 32 [row 11 - first half]
21451 movu [r0 + 1942 * 16], m3
21452
21453 pmaddubsw m3, m1, m6
21454 pmulhrsw m3, m7
21455 pmaddubsw m5, m4, m6
21456 pmulhrsw m5, m7
21457 packuswb m3, m5
21458 movu [r0 + 1783 * 16], m3
21459
21460 ; mode 32 [row 11 - second half]
21461 movu [r0 + 1943 * 16], m3
21462
21463 ; mode 30 [row 18]
21464 movu m6, [r5 + 23 * 16]
21465 pmaddubsw m3, m0, m6
21466 pmulhrsw m3, m7
21467 pmaddubsw m5, m2, m6
21468 pmulhrsw m5, m7
21469 packuswb m3, m5
21470 movu [r0 + 1828 * 16], m3
21471 pmaddubsw m3, m1, m6
21472 pmulhrsw m3, m7
21473 pmaddubsw m5, m4, m6
21474 pmulhrsw m5, m7
21475 packuswb m3, m5
21476 movu [r0 + 1829 * 16], m3
21477
21478 ; mode 31 [row 13]
21479 movu m6, [r5 + 14 * 16]
21480 pmaddubsw m3, m0, m6
21481 pmulhrsw m3, m7
21482 pmaddubsw m5, m2, m6
21483 pmulhrsw m5, m7
21484 packuswb m3, m5
21485 movu [r0 + 1882 * 16], m3
21486 pmaddubsw m3, m1, m6
21487 pmulhrsw m3, m7
21488 pmaddubsw m5, m4, m6
21489 pmulhrsw m5, m7
21490 packuswb m3, m5
21491 movu [r0 + 1883 * 16], m3
21492
21493 ; mode 31 [row 14]
21494 movu m6, [r5 + 31 * 16]
21495 pmaddubsw m3, m0, m6
21496 pmulhrsw m3, m7
21497 pmaddubsw m5, m2, m6
21498 pmulhrsw m5, m7
21499 packuswb m3, m5
21500 movu [r0 + 1884 * 16], m3
21501 pmaddubsw m3, m1, m6
21502 pmulhrsw m3, m7
21503 pmaddubsw m5, m4, m6
21504 pmulhrsw m5, m7
21505 packuswb m3, m5
21506 movu [r0 + 1885 * 16], m3
21507
21508 ; mode 32 [row 10]
21509 movu m6, [r5 + 7 * 16]
21510 pmaddubsw m3, m0, m6
21511 pmulhrsw m3, m7
21512 pmaddubsw m5, m2, m6
21513 pmulhrsw m5, m7
21514 packuswb m3, m5
21515 movu [r0 + 1940 * 16], m3
21516 pmaddubsw m3, m1, m6
21517 pmulhrsw m3, m7
21518 pmaddubsw m5, m4, m6
21519 pmulhrsw m5, m7
21520 packuswb m3, m5
21521 movu [r0 + 1941 * 16], m3
21522
21523 ; mode 29 [row 28]
21524 movu m6, [r5 + 5 * 16]
21525 movu m0, [r3 + 9]
21526 movd m1, [r3 + 10]
21527 palignr m1, m0, 1
21528 punpcklbw m0, m1
21529 pmaddubsw m3, m0, m6
21530 pmulhrsw m3, m7
21531 movu m2, [r3 + 17]
21532 movd m4, [r3 + 18]
21533 palignr m4, m2, 1
21534 punpcklbw m2, m4
21535 pmaddubsw m5, m2, m6
21536 pmulhrsw m5, m7
21537 packuswb m3, m5
21538 movu [r0 + 1784 * 16], m3
21539
21540 movu m1, [r3 + 25]
21541 movd m3, [r3 + 26]
21542 palignr m3, m1, 1
21543 punpcklbw m1, m3
21544 pmaddubsw m3, m1, m6
21545 pmulhrsw m3, m7
21546 movu m4, [r3 + 33]
21547 movd m5, [r3 + 34]
21548 palignr m5, m4, 1
21549 punpcklbw m4, m5
21550 pmaddubsw m5, m4, m6
21551 pmulhrsw m5, m7
21552 packuswb m3, m5
21553 movu [r0 + 1785 * 16], m3
21554
21555 ; mode 29 [row 29]
21556 movu m6, [r5 + 14 * 16]
21557 pmaddubsw m3, m0, m6
21558 pmulhrsw m3, m7
21559 pmaddubsw m5, m2, m6
21560 pmulhrsw m5, m7
21561 packuswb m3, m5
21562 movu [r0 + 1786 * 16], m3
21563 pmaddubsw m3, m1, m6
21564 pmulhrsw m3, m7
21565 pmaddubsw m5, m4, m6
21566 pmulhrsw m5, m7
21567 packuswb m3, m5
21568 movu [r0 + 1787 * 16], m3
21569
21570 ; mode 29 [row 30]
21571 movu m6, [r5 + 23 * 16]
21572 pmaddubsw m3, m0, m6
21573 pmulhrsw m3, m7
21574 pmaddubsw m5, m2, m6
21575 pmulhrsw m5, m7
21576 packuswb m3, m5
21577 movu [r0 + 1788 * 16], m3
21578 pmaddubsw m3, m1, m6
21579 pmulhrsw m3, m7
21580 pmaddubsw m5, m4, m6
21581 pmulhrsw m5, m7
21582 packuswb m3, m5
21583 movu [r0 + 1789 * 16], m3
21584
21585 ; mode 30 [row 19]
21586 movu m6, [r5 + 4 * 16]
21587 pmaddubsw m3, m0, m6
21588 pmulhrsw m3, m7
21589 pmaddubsw m5, m2, m6
21590 pmulhrsw m5, m7
21591 packuswb m3, m5
21592 movu [r0 + 1830 * 16], m3
21593
21594 ; mode 33 [row 9 - first half]
21595 movu [r0 + 2002 * 16], m3
21596
21597 pmaddubsw m3, m1, m6
21598 pmulhrsw m3, m7
21599 pmaddubsw m5, m4, m6
21600 pmulhrsw m5, m7
21601 packuswb m3, m5
21602 movu [r0 + 1831 * 16], m3
21603
21604 ; mode 33 [row 9 - second half]
21605 movu [r0 + 2003 * 16], m3
21606
21607 ; mode 30 [row 20]
21608 movu m6, [r5 + 17 * 16]
21609 pmaddubsw m3, m0, m6
21610 pmulhrsw m3, m7
21611 pmaddubsw m5, m2, m6
21612 pmulhrsw m5, m7
21613 packuswb m3, m5
21614 movu [r0 + 1832 * 16], m3
21615
21616 ; mode 32 [row 12 - first half]
21617 movu [r0 + 1944 * 16], m3
21618
21619 pmaddubsw m3, m1, m6
21620 pmulhrsw m3, m7
21621 pmaddubsw m5, m4, m6
21622 pmulhrsw m5, m7
21623 packuswb m3, m5
21624 movu [r0 + 1833 * 16], m3
21625
21626 ; mode 32 [row 12 - second half]
21627 movu [r0 + 1945 * 16], m3
21628
21629 ; mode 30 [row 21]
21630 movu m6, [r5 + 30 * 16]
21631 pmaddubsw m3, m0, m6
21632 pmulhrsw m3, m7
21633 pmaddubsw m5, m2, m6
21634 pmulhrsw m5, m7
21635 packuswb m3, m5
21636 movu [r0 + 1834 * 16], m3
21637
21638 ; mode 33 [row 10 - first half]
21639 movu [r0 + 2004 * 16], m3
21640
21641 pmaddubsw m3, m1, m6
21642 pmulhrsw m3, m7
21643 pmaddubsw m5, m4, m6
21644 pmulhrsw m5, m7
21645 packuswb m3, m5
21646 movu [r0 + 1835 * 16], m3
21647
21648 ; mode 33 [row 10 - second half]
21649 movu [r0 + 2005 * 16], m3
21650
21651 ; mode 31 [row 15]
21652 movu m6, [r5 + 16 * 16]
21653 pmaddubsw m3, m0, m6
21654 pmulhrsw m3, m7
21655 pmaddubsw m5, m2, m6
21656 pmulhrsw m5, m7
21657 packuswb m3, m5
21658 movu [r0 + 1886 * 16], m3
21659 pmaddubsw m3, m1, m6
21660 pmulhrsw m3, m7
21661 pmaddubsw m5, m4, m6
21662 pmulhrsw m5, m7
21663 packuswb m3, m5
21664 movu [r0 + 1887 * 16], m3
21665
21666 ; mode 29 [row 31]
21667 movu m0, [r3 + 10]
21668 movd m1, [r3 + 11]
21669 palignr m1, m0, 1
21670 punpcklbw m0, m1
21671 movu m2, [r3 + 18]
21672 movd m3, [r3 + 19]
21673 palignr m3, m2, 1
21674 punpcklbw m2, m3
21675 movu m1, [r3 + 26]
21676 movd m3, [r3 + 27]
21677 palignr m3, m1, 1
21678 punpcklbw m1, m3
21679 movu m4, [r3 + 34]
21680 movd m5, [r3 + 35]
21681 palignr m5, m4, 1
21682 punpcklbw m4, m5
21683
21684 pshufb m5, m0, [tab_S2]
21685 movh [r0 + 1790 * 16], m5
21686 pshufb m5, m2, [tab_S2]
21687 movh [r0 + 1790 * 16 + 8], m5
21688 pshufb m5, m1, [tab_S2]
21689 movh [r0 + 1791 * 16], m5
21690 pshufb m5, m4, [tab_S2]
21691 movh [r0 + 1791 * 16 + 8], m5
21692
21693 ; mode 30 [row 22]
21694 movu m6, [r5 + 11 * 16]
21695 pmaddubsw m3, m0, m6
21696 pmulhrsw m3, m7
21697 pmaddubsw m5, m2, m6
21698 pmulhrsw m5, m7
21699 packuswb m3, m5
21700 movu [r0 + 1836 * 16], m3
21701 pmaddubsw m3, m1, m6
21702 pmulhrsw m3, m7
21703 pmaddubsw m5, m4, m6
21704 pmulhrsw m5, m7
21705 packuswb m3, m5
21706 movu [r0 + 1837 * 16], m3
21707
21708 ; mode 30 [row 23]
21709 movu m6, [r5 + 24 * 16]
21710 pmaddubsw m3, m0, m6
21711 pmulhrsw m3, m7
21712 pmaddubsw m5, m2, m6
21713 pmulhrsw m5, m7
21714 packuswb m3, m5
21715 movu [r0 + 1838 * 16], m3
21716
21717 ; mode 33 [row 11 - first half]
21718 movu [r0 + 2006 * 16], m3
21719
21720 pmaddubsw m3, m1, m6
21721 pmulhrsw m3, m7
21722 pmaddubsw m5, m4, m6
21723 pmulhrsw m5, m7
21724 packuswb m3, m5
21725 movu [r0 + 1839 * 16], m3
21726
21727 ; mode 33 [row 11 - second half]
21728 movu [r0 + 2007 * 16], m3
21729
21730 ; mode 31 [row 16]
21731 movu m6, [r5 + 1 * 16]
21732 pmaddubsw m3, m0, m6
21733 pmulhrsw m3, m7
21734 pmaddubsw m5, m2, m6
21735 pmulhrsw m5, m7
21736 packuswb m3, m5
21737 movu [r0 + 1888 * 16], m3
21738 pmaddubsw m3, m1, m6
21739 pmulhrsw m3, m7
21740 pmaddubsw m5, m4, m6
21741 pmulhrsw m5, m7
21742 packuswb m3, m5
21743 movu [r0 + 1889 * 16], m3
21744
21745 ; mode 31 [row 17]
21746 movu m6, [r5 + 18 * 16]
21747 pmaddubsw m3, m0, m6
21748 pmulhrsw m3, m7
21749 pmaddubsw m5, m2, m6
21750 pmulhrsw m5, m7
21751 packuswb m3, m5
21752 movu [r0 + 1890 * 16], m3
21753 pmaddubsw m3, m1, m6
21754 pmulhrsw m3, m7
21755 pmaddubsw m5, m4, m6
21756 pmulhrsw m5, m7
21757 packuswb m3, m5
21758 movu [r0 + 1891 * 16], m3
21759
21760 ; mode 32 [row 13]
21761 movu m6, [r5 + 6 * 16]
21762 pmaddubsw m3, m0, m6
21763 pmulhrsw m3, m7
21764 pmaddubsw m5, m2, m6
21765 pmulhrsw m5, m7
21766 packuswb m3, m5
21767 movu [r0 + 1946 * 16], m3
21768 pmaddubsw m3, m1, m6
21769 pmulhrsw m3, m7
21770 pmaddubsw m5, m4, m6
21771 pmulhrsw m5, m7
21772 packuswb m3, m5
21773 movu [r0 + 1947 * 16], m3
21774
21775 ; mode 32 [row 14]
21776 movu m6, [r5 + 27 * 16]
21777 pmaddubsw m3, m0, m6
21778 pmulhrsw m3, m7
21779 pmaddubsw m5, m2, m6
21780 pmulhrsw m5, m7
21781 packuswb m3, m5
21782 movu [r0 + 1948 * 16], m3
21783 pmaddubsw m3, m1, m6
21784 pmulhrsw m3, m7
21785 pmaddubsw m5, m4, m6
21786 pmulhrsw m5, m7
21787 packuswb m3, m5
21788 movu [r0 + 1949 * 16], m3
21789
21790 ; mode 30 [row 24]
21791 movu m6, [r5 + 5 * 16]
21792 movu m0, [r3 + 11]
21793 movd m1, [r3 + 12]
21794 palignr m1, m0, 1
21795 punpcklbw m0, m1
21796 pmaddubsw m3, m0, m6
21797 pmulhrsw m3, m7
21798 movu m2, [r3 + 19]
21799 movd m4, [r3 + 20]
21800 palignr m4, m2, 1
21801 punpcklbw m2, m4
21802 pmaddubsw m5, m2, m6
21803 pmulhrsw m5, m7
21804 packuswb m3, m5
21805 movu [r0 + 1840 * 16], m3
21806
21807 movu m1, [r3 + 27]
21808 movd m3, [r3 + 28]
21809 palignr m3, m1, 1
21810 punpcklbw m1, m3
21811 pmaddubsw m3, m1, m6
21812 pmulhrsw m3, m7
21813 movu m4, [r3 + 35]
21814 movd m5, [r3 + 36]
21815 palignr m5, m4, 1
21816 punpcklbw m4, m5
21817 pmaddubsw m5, m4, m6
21818 pmulhrsw m5, m7
21819 packuswb m3, m5
21820 movu [r0 + 1841 * 16], m3
21821
21822 ; mode 30 [row 25]
21823 movu m6, [r5 + 18 * 16]
21824 pmaddubsw m3, m0, m6
21825 pmulhrsw m3, m7
21826 pmaddubsw m5, m2, m6
21827 pmulhrsw m5, m7
21828 packuswb m3, m5
21829 movu [r0 + 1842 * 16], m3
21830
21831 ; mode 33 [row 12 - first half]
21832 movu [r0 + 2008 * 16], m3
21833
21834 pmaddubsw m3, m1, m6
21835 pmulhrsw m3, m7
21836 pmaddubsw m5, m4, m6
21837 pmulhrsw m5, m7
21838 packuswb m3, m5
21839 movu [r0 + 1843 * 16], m3
21840
21841 ; mode 33 [row 12 - second half]
21842 movu [r0 + 2009 * 16], m3
21843
21844 ; mode 30 [row 26]
21845 movu m6, [r5 + 31 * 16]
21846 pmaddubsw m3, m0, m6
21847 pmulhrsw m3, m7
21848 pmaddubsw m5, m2, m6
21849 pmulhrsw m5, m7
21850 packuswb m3, m5
21851 movu [r0 + 1844 * 16], m3
21852 pmaddubsw m3, m1, m6
21853 pmulhrsw m3, m7
21854 pmaddubsw m5, m4, m6
21855 pmulhrsw m5, m7
21856 packuswb m3, m5
21857 movu [r0 + 1845 * 16], m3
21858
21859 ; mode 31 [row 18]
21860 movu m6, [r5 + 3 * 16]
21861 pmaddubsw m3, m0, m6
21862 pmulhrsw m3, m7
21863 pmaddubsw m5, m2, m6
21864 pmulhrsw m5, m7
21865 packuswb m3, m5
21866 movu [r0 + 1892 * 16], m3
21867 pmaddubsw m3, m1, m6
21868 pmulhrsw m3, m7
21869 pmaddubsw m5, m4, m6
21870 pmulhrsw m5, m7
21871 packuswb m3, m5
21872 movu [r0 + 1893 * 16], m3
21873
21874 ; mode 31 [row 19]
21875 movu m6, [r5 + 20 * 16]
21876 pmaddubsw m3, m0, m6
21877 pmulhrsw m3, m7
21878 pmaddubsw m5, m2, m6
21879 pmulhrsw m5, m7
21880 packuswb m3, m5
21881 movu [r0 + 1894 * 16], m3
21882 pmaddubsw m3, m1, m6
21883 pmulhrsw m3, m7
21884 pmaddubsw m5, m4, m6
21885 pmulhrsw m5, m7
21886 packuswb m3, m5
21887 movu [r0 + 1895 * 16], m3
21888
21889 ; mode 32 [row 15]
21890 movu m6, [r5 + 16 * 16]
21891 pmaddubsw m3, m0, m6
21892 pmulhrsw m3, m7
21893 pmaddubsw m5, m2, m6
21894 pmulhrsw m5, m7
21895 packuswb m3, m5
21896 movu [r0 + 1950 * 16], m3
21897 pmaddubsw m3, m1, m6
21898 pmulhrsw m3, m7
21899 pmaddubsw m5, m4, m6
21900 pmulhrsw m5, m7
21901 packuswb m3, m5
21902 movu [r0 + 1951 * 16], m3
21903
21904 ; mode 30 [row 27]
21905 movu m6, [r5 + 12 * 16]
21906 movu m0, [r3 + 12]
21907 movd m1, [r3 + 13]
21908 palignr m1, m0, 1
21909 punpcklbw m0, m1
21910 pmaddubsw m3, m0, m6
21911 pmulhrsw m3, m7
21912 movu m2, [r3 + 20]
21913 movd m4, [r3 + 21]
21914 palignr m4, m2, 1
21915 punpcklbw m2, m4
21916 pmaddubsw m5, m2, m6
21917 pmulhrsw m5, m7
21918 packuswb m3, m5
21919 movu [r0 + 1846 * 16], m3
21920
21921 ; mode 33 [row 13 - first half]
21922 movu [r0 + 2010 * 16], m3
21923
21924 movu m1, [r3 + 28]
21925 movd m3, [r3 + 29]
21926 palignr m3, m1, 1
21927 punpcklbw m1, m3
21928 pmaddubsw m3, m1, m6
21929 pmulhrsw m3, m7
21930 movu m4, [r3 + 36]
21931 movd m5, [r3 + 37]
21932 palignr m5, m4, 1
21933 punpcklbw m4, m5
21934 pmaddubsw m5, m4, m6
21935 pmulhrsw m5, m7
21936 packuswb m3, m5
21937 movu [r0 + 1847 * 16], m3
21938
21939 ; mode 33 [row 13 - second half]
21940 movu [r0 + 2011 * 16], m3
21941
21942 ; mode 30 [row 28]
21943 movu m6, [r5 + 25 * 16]
21944 pmaddubsw m3, m0, m6
21945 pmulhrsw m3, m7
21946 pmaddubsw m5, m2, m6
21947 pmulhrsw m5, m7
21948 packuswb m3, m5
21949 movu [r0 + 1848 * 16], m3
21950 pmaddubsw m3, m1, m6
21951 pmulhrsw m3, m7
21952 pmaddubsw m5, m4, m6
21953 pmulhrsw m5, m7
21954 packuswb m3, m5
21955 movu [r0 + 1849 * 16], m3
21956
21957 ; mode 31 [row 20]
21958 movu m6, [r5 + 5 * 16]
21959 pmaddubsw m3, m0, m6
21960 pmulhrsw m3, m7
21961 pmaddubsw m5, m2, m6
21962 pmulhrsw m5, m7
21963 packuswb m3, m5
21964 movu [r0 + 1896 * 16], m3
21965
21966 ; mode 32 [row 16 - first half]
21967 movu [r0 + 1952 * 16], m3
21968
21969 pmaddubsw m3, m1, m6
21970 pmulhrsw m3, m7
21971 pmaddubsw m5, m4, m6
21972 pmulhrsw m5, m7
21973 packuswb m3, m5
21974 movu [r0 + 1897 * 16], m3
21975
21976 ; mode 32 [row 16 - second half]
21977 movu [r0 + 1953 * 16], m3
21978
21979 ; mode 31 [row 21]
21980 movu m6, [r5 + 22 * 16]
21981 pmaddubsw m3, m0, m6
21982 pmulhrsw m3, m7
21983 pmaddubsw m5, m2, m6
21984 pmulhrsw m5, m7
21985 packuswb m3, m5
21986 movu [r0 + 1898 * 16], m3
21987 pmaddubsw m3, m1, m6
21988 pmulhrsw m3, m7
21989 pmaddubsw m5, m4, m6
21990 pmulhrsw m5, m7
21991 packuswb m3, m5
21992 movu [r0 + 1899 * 16], m3
21993
21994 ; mode 32 [row 17]
21995 movu m6, [r5 + 26 * 16]
21996 pmaddubsw m3, m0, m6
21997 pmulhrsw m3, m7
21998 pmaddubsw m5, m2, m6
21999 pmulhrsw m5, m7
22000 packuswb m3, m5
22001 movu [r0 + 1954 * 16], m3
22002 pmaddubsw m3, m1, m6
22003 pmulhrsw m3, m7
22004 pmaddubsw m5, m4, m6
22005 pmulhrsw m5, m7
22006 packuswb m3, m5
22007 movu [r0 + 1955 * 16], m3
22008
22009 ; mode 30 [row 29]
22010 movu m6, [r5 + 6 * 16]
22011 movu m0, [r3 + 13]
22012 movd m1, [r3 + 14]
22013 palignr m1, m0, 1
22014 punpcklbw m0, m1
22015 pmaddubsw m3, m0, m6
22016 pmulhrsw m3, m7
22017 movu m2, [r3 + 21]
22018 movd m4, [r3 + 22]
22019 palignr m4, m2, 1
22020 punpcklbw m2, m4
22021 pmaddubsw m5, m2, m6
22022 pmulhrsw m5, m7
22023 packuswb m3, m5
22024 movu [r0 + 1850 * 16], m3
22025
22026 ; mode 33 [row 14 - first half]
22027 movu [r0 + 2012 * 16], m3
22028
22029 movu m1, [r3 + 29]
22030 movd m3, [r3 + 30]
22031 palignr m3, m1, 1
22032 punpcklbw m1, m3
22033 pmaddubsw m3, m1, m6
22034 pmulhrsw m3, m7
22035 movu m4, [r3 + 37]
22036 movd m5, [r3 + 38]
22037 palignr m5, m4, 1
22038 punpcklbw m4, m5
22039 pmaddubsw m5, m4, m6
22040 pmulhrsw m5, m7
22041 packuswb m3, m5
22042 movu [r0 + 1851 * 16], m3
22043
22044 ; mode 33 [row 14 - second half]
22045 movu [r0 + 2013 * 16], m3
22046
22047 ; mode 30 [row 30]
22048 movu m6, [r5 + 19 * 16]
22049 pmaddubsw m3, m0, m6
22050 pmulhrsw m3, m7
22051 pmaddubsw m5, m2, m6
22052 pmulhrsw m5, m7
22053 packuswb m3, m5
22054 movu [r0 + 1852 * 16], m3
22055 pmaddubsw m3, m1, m6
22056 pmulhrsw m3, m7
22057 pmaddubsw m5, m4, m6
22058 pmulhrsw m5, m7
22059 packuswb m3, m5
22060 movu [r0 + 1853 * 16], m3
22061
22062 ; mode 31 [row 22]
22063 movu m6, [r5 + 7 * 16]
22064 pmaddubsw m3, m0, m6
22065 pmulhrsw m3, m7
22066 pmaddubsw m5, m2, m6
22067 pmulhrsw m5, m7
22068 packuswb m3, m5
22069 movu [r0 + 1900 * 16], m3
22070 pmaddubsw m3, m1, m6
22071 pmulhrsw m3, m7
22072 pmaddubsw m5, m4, m6
22073 pmulhrsw m5, m7
22074 packuswb m3, m5
22075 movu [r0 + 1901 * 16], m3
22076
22077 ; mode 31 [row 23]
22078 movu m6, [r5 + 24 * 16]
22079 pmaddubsw m3, m0, m6
22080 pmulhrsw m3, m7
22081 pmaddubsw m5, m2, m6
22082 pmulhrsw m5, m7
22083 packuswb m3, m5
22084 movu [r0 + 1902 * 16], m3
22085 pmaddubsw m3, m1, m6
22086 pmulhrsw m3, m7
22087 pmaddubsw m5, m4, m6
22088 pmulhrsw m5, m7
22089 packuswb m3, m5
22090 movu [r0 + 1903 * 16], m3
22091
22092 ; mode 32 [row 18]
22093 movu m6, [r5 + 15 * 16]
22094 pmaddubsw m3, m0, m6
22095 pmulhrsw m3, m7
22096 pmaddubsw m5, m2, m6
22097 pmulhrsw m5, m7
22098 packuswb m3, m5
22099 movu [r0 + 1956 * 16], m3
22100 pmaddubsw m3, m1, m6
22101 pmulhrsw m3, m7
22102 pmaddubsw m5, m4, m6
22103 pmulhrsw m5, m7
22104 packuswb m3, m5
22105 movu [r0 + 1957 * 16], m3
22106
22107 ; mode 30 [row 31]
22108 movu m0, [r3 + 14]
22109 movd m1, [r3 + 15]
22110 palignr m1, m0, 1
22111 punpcklbw m0, m1
22112 movu m2, [r3 + 22]
22113 movd m3, [r3 + 23]
22114 palignr m3, m2, 1
22115 punpcklbw m2, m3
22116 movu m1, [r3 + 30]
22117 movd m3, [r3 + 31]
22118 palignr m3, m1, 1
22119 punpcklbw m1, m3
22120 movu m4, [r3 + 38]
22121 movd m5, [r3 + 39]
22122 palignr m5, m4, 1
22123 punpcklbw m4, m5
22124
22125 pshufb m5, m0, [tab_S2]
22126 movh [r0 + 1854 * 16], m5
22127
22128 ; mode 33 [row 15 - first eight]
22129 movh [r0 + 2014 * 16], m5
22130
22131 pshufb m5, m2, [tab_S2]
22132 movh [r0 + 1854 * 16 + 8], m5
22133
22134 ; mode 33 [row 15 - second eight]
22135 movh [r0 + 2014 * 16 + 8], m5
22136
22137 pshufb m5, m1, [tab_S2]
22138 movh [r0 + 1855 * 16], m5
22139
22140 ; mode 33 [row 15 - third eight]
22141 movh [r0 + 2015 * 16], m5
22142
22143 pshufb m5, m4, [tab_S2]
22144 movh [r0 + 1855 * 16 + 8], m5
22145
22146 ; mode 33 [row 15 - fourth eight]
22147 movh [r0 + 2015 * 16 + 8], m5
22148
22149 ; mode 31 [row 24]
22150 movu m6, [r5 + 9 * 16]
22151 pmaddubsw m3, m0, m6
22152 pmulhrsw m3, m7
22153 pmaddubsw m5, m2, m6
22154 pmulhrsw m5, m7
22155 packuswb m3, m5
22156 movu [r0 + 1904 * 16], m3
22157 pmaddubsw m3, m1, m6
22158 pmulhrsw m3, m7
22159 pmaddubsw m5, m4, m6
22160 pmulhrsw m5, m7
22161 packuswb m3, m5
22162 movu [r0 + 1905 * 16], m3
22163
22164 ; mode 31 [row 25]
22165 movu m6, [r5 + 26 * 16]
22166 pmaddubsw m3, m0, m6
22167 pmulhrsw m3, m7
22168 pmaddubsw m5, m2, m6
22169 pmulhrsw m5, m7
22170 packuswb m3, m5
22171 movu [r0 + 1906 * 16], m3
22172
22173 ; mode 33 [row 16 - first half]
22174 movu [r0 + 2016 * 16], m3
22175
22176 pmaddubsw m3, m1, m6
22177 pmulhrsw m3, m7
22178 pmaddubsw m5, m4, m6
22179 pmulhrsw m5, m7
22180 packuswb m3, m5
22181 movu [r0 + 1907 * 16], m3
22182
22183 ; mode 33 [row 16 - second half]
22184 movu [r0 + 2017 * 16], m3
22185
22186 ; mode 32 [row 19]
22187 movu m6, [r5 + 4 * 16]
22188 pmaddubsw m3, m0, m6
22189 pmulhrsw m3, m7
22190 pmaddubsw m5, m2, m6
22191 pmulhrsw m5, m7
22192 packuswb m3, m5
22193 movu [r0 + 1958 * 16], m3
22194 pmaddubsw m3, m1, m6
22195 pmulhrsw m3, m7
22196 pmaddubsw m5, m4, m6
22197 pmulhrsw m5, m7
22198 packuswb m3, m5
22199 movu [r0 + 1959 * 16], m3
22200
22201 ; mode 32 [row 20]
22202 movu m6, [r5 + 25 * 16]
22203 pmaddubsw m3, m0, m6
22204 pmulhrsw m3, m7
22205 pmaddubsw m5, m2, m6
22206 pmulhrsw m5, m7
22207 packuswb m3, m5
22208 movu [r0 + 1960 * 16], m3
22209 pmaddubsw m3, m1, m6
22210 pmulhrsw m3, m7
22211 pmaddubsw m5, m4, m6
22212 pmulhrsw m5, m7
22213 packuswb m3, m5
22214 movu [r0 + 1961 * 16], m3
22215
22216 ; mode 31 [row 26]
22217 movu m6, [r5 + 11 * 16]
22218 movu m0, [r3 + 15]
22219 movd m1, [r3 + 16]
22220 palignr m1, m0, 1
22221 punpcklbw m0, m1
22222 pmaddubsw m3, m0, m6
22223 pmulhrsw m3, m7
22224 movu m2, [r3 + 23]
22225 movd m4, [r3 + 24]
22226 palignr m4, m2, 1
22227 punpcklbw m2, m4
22228 pmaddubsw m5, m2, m6
22229 pmulhrsw m5, m7
22230 packuswb m3, m5
22231 movu [r0 + 1908 * 16], m3
22232
22233 movu m1, [r3 + 31]
22234 movd m3, [r3 + 32]
22235 palignr m3, m1, 1
22236 punpcklbw m1, m3
22237 pmaddubsw m3, m1, m6
22238 pmulhrsw m3, m7
22239 movu m4, [r3 + 39]
22240 movd m5, [r3 + 40]
22241 palignr m5, m4, 1
22242 punpcklbw m4, m5
22243 pmaddubsw m5, m4, m6
22244 pmulhrsw m5, m7
22245 packuswb m3, m5
22246 movu [r0 + 1909 * 16], m3
22247
22248 ; mode 31 [row 27]
22249 movu m6, [r5 + 28 * 16]
22250 pmaddubsw m3, m0, m6
22251 pmulhrsw m3, m7
22252 pmaddubsw m5, m2, m6
22253 pmulhrsw m5, m7
22254 packuswb m3, m5
22255 movu [r0 + 1910 * 16], m3
22256 pmaddubsw m3, m1, m6
22257 pmulhrsw m3, m7
22258 pmaddubsw m5, m4, m6
22259 pmulhrsw m5, m7
22260 packuswb m3, m5
22261 movu [r0 + 1911 * 16], m3
22262
22263 ; mode 32 [row 21]
22264 movu m6, [r5 + 14 * 16]
22265 pmaddubsw m3, m0, m6
22266 pmulhrsw m3, m7
22267 pmaddubsw m5, m2, m6
22268 pmulhrsw m5, m7
22269 packuswb m3, m5
22270 movu [r0 + 1962 * 16], m3
22271 pmaddubsw m3, m1, m6
22272 pmulhrsw m3, m7
22273 pmaddubsw m5, m4, m6
22274 pmulhrsw m5, m7
22275 packuswb m3, m5
22276 movu [r0 + 1963 * 16], m3
22277
22278 ; mode 33 [row 17]
22279 movu m6, [r5 + 20 * 16]
22280 pmaddubsw m3, m0, m6
22281 pmulhrsw m3, m7
22282 pmaddubsw m5, m2, m6
22283 pmulhrsw m5, m7
22284 packuswb m3, m5
22285 movu [r0 + 2018 * 16], m3
22286 pmaddubsw m3, m1, m6
22287 pmulhrsw m3, m7
22288 pmaddubsw m5, m4, m6
22289 pmulhrsw m5, m7
22290 packuswb m3, m5
22291 movu [r0 + 2019 * 16], m3
22292
22293 ; mode 31 [row 28]
22294 movu m6, [r5 + 13 * 16]
22295 movu m0, [r3 + 16]
22296 movd m1, [r3 + 17]
22297 palignr m1, m0, 1
22298 punpcklbw m0, m1
22299 pmaddubsw m3, m0, m6
22300 pmulhrsw m3, m7
22301 movu m2, [r3 + 24]
22302 movd m4, [r3 + 25]
22303 palignr m4, m2, 1
22304 punpcklbw m2, m4
22305 pmaddubsw m5, m2, m6
22306 pmulhrsw m5, m7
22307 packuswb m3, m5
22308 movu [r0 + 1912 * 16], m3
22309
22310 movu m1, [r3 + 32]
22311 movd m3, [r3 + 33]
22312 palignr m3, m1, 1
22313 punpcklbw m1, m3
22314 pmaddubsw m3, m1, m6
22315 pmulhrsw m3, m7
22316 movu m4, [r3 + 40]
22317 movd m5, [r3 + 41]
22318 palignr m5, m4, 1
22319 punpcklbw m4, m5
22320 pmaddubsw m5, m4, m6
22321 pmulhrsw m5, m7
22322 packuswb m3, m5
22323 movu [r0 + 1913 * 16], m3
22324
22325 ; mode 31 [row 29]
22326 movu m6, [r5 + 30 * 16]
22327 pmaddubsw m3, m0, m6
22328 pmulhrsw m3, m7
22329 pmaddubsw m5, m2, m6
22330 pmulhrsw m5, m7
22331 packuswb m3, m5
22332 movu [r0 + 1914 * 16], m3
22333 pmaddubsw m3, m1, m6
22334 pmulhrsw m3, m7
22335 pmaddubsw m5, m4, m6
22336 pmulhrsw m5, m7
22337 packuswb m3, m5
22338 movu [r0 + 1915 * 16], m3
22339
22340 ; mode 32 [row 22]
22341 movu m6, [r5 + 3 * 16]
22342 pmaddubsw m3, m0, m6
22343 pmulhrsw m3, m7
22344 pmaddubsw m5, m2, m6
22345 pmulhrsw m5, m7
22346 packuswb m3, m5
22347 movu [r0 + 1964 * 16], m3
22348 pmaddubsw m3, m1, m6
22349 pmulhrsw m3, m7
22350 pmaddubsw m5, m4, m6
22351 pmulhrsw m5, m7
22352 packuswb m3, m5
22353 movu [r0 + 1965 * 16], m3
22354
22355 ; mode 32 [row 23]
22356 movu m6, [r5 + 24 * 16]
22357 pmaddubsw m3, m0, m6
22358 pmulhrsw m3, m7
22359 pmaddubsw m5, m2, m6
22360 pmulhrsw m5, m7
22361 packuswb m3, m5
22362 movu [r0 + 1966 * 16], m3
22363 pmaddubsw m3, m1, m6
22364 pmulhrsw m3, m7
22365 pmaddubsw m5, m4, m6
22366 pmulhrsw m5, m7
22367 packuswb m3, m5
22368 movu [r0 + 1967 * 16], m3
22369
22370 ; mode 33 [row 18]
22371 movu m6, [r5 + 14 * 16]
22372 pmaddubsw m3, m0, m6
22373 pmulhrsw m3, m7
22374 pmaddubsw m5, m2, m6
22375 pmulhrsw m5, m7
22376 packuswb m3, m5
22377 movu [r0 + 2020 * 16], m3
22378 pmaddubsw m3, m1, m6
22379 pmulhrsw m3, m7
22380 pmaddubsw m5, m4, m6
22381 pmulhrsw m5, m7
22382 packuswb m3, m5
22383 movu [r0 + 2021 * 16], m3
22384
22385 ; mode 31 [row 30]
22386 movu m6, [r5 + 15 * 16]
22387 movu m0, [r3 + 17]
22388 movd m1, [r3 + 18]
22389 palignr m1, m0, 1
22390 punpcklbw m0, m1
22391 pmaddubsw m3, m0, m6
22392 pmulhrsw m3, m7
22393 movu m2, [r3 + 25]
22394 movd m4, [r3 + 26]
22395 palignr m4, m2, 1
22396 punpcklbw m2, m4
22397 pmaddubsw m5, m2, m6
22398 pmulhrsw m5, m7
22399 packuswb m3, m5
22400 movu [r0 + 1916 * 16], m3
22401
22402 movu m1, [r3 + 33]
22403 movd m3, [r3 + 34]
22404 palignr m3, m1, 1
22405 punpcklbw m1, m3
22406 pmaddubsw m3, m1, m6
22407 pmulhrsw m3, m7
22408 movu m4, [r3 + 41]
22409 movd m5, [r3 + 42]
22410 palignr m5, m4, 1
22411 punpcklbw m4, m5
22412 pmaddubsw m5, m4, m6
22413 pmulhrsw m5, m7
22414 packuswb m3, m5
22415 movu [r0 + 1917 * 16], m3
22416
22417 ; mode 32 [row 24]
22418 movu m6, [r5 + 13 * 16]
22419 pmaddubsw m3, m0, m6
22420 pmulhrsw m3, m7
22421 pmaddubsw m5, m2, m6
22422 pmulhrsw m5, m7
22423 packuswb m3, m5
22424 movu [r0 + 1968 * 16], m3
22425 pmaddubsw m3, m1, m6
22426 pmulhrsw m3, m7
22427 pmaddubsw m5, m4, m6
22428 pmulhrsw m5, m7
22429 packuswb m3, m5
22430 movu [r0 + 1969 * 16], m3
22431
22432 ; mode 33 [row 19]
22433 movu m6, [r5 + 8 * 16]
22434 pmaddubsw m3, m0, m6
22435 pmulhrsw m3, m7
22436 pmaddubsw m5, m2, m6
22437 pmulhrsw m5, m7
22438 packuswb m3, m5
22439 movu [r0 + 2022 * 16], m3
22440 pmaddubsw m3, m1, m6
22441 pmulhrsw m3, m7
22442 pmaddubsw m5, m4, m6
22443 pmulhrsw m5, m7
22444 packuswb m3, m5
22445 movu [r0 + 2023 * 16], m3
22446
22447 ; mode 31 [row 31]
22448 movu m0, [r3 + 18]
22449 movd m1, [r3 + 19]
22450 palignr m1, m0, 1
22451 punpcklbw m0, m1
22452 movu m2, [r3 + 26]
22453 movd m3, [r3 + 27]
22454 palignr m3, m2, 1
22455 punpcklbw m2, m3
22456 movu m1, [r3 + 34]
22457 movd m3, [r3 + 35]
22458 palignr m3, m1, 1
22459 punpcklbw m1, m3
22460 movu m4, [r3 + 42]
22461 movd m5, [r3 + 43]
22462 palignr m5, m4, 1
22463 punpcklbw m4, m5
22464
22465 pshufb m5, m0, [tab_S2]
22466 movh [r0 + 1918 * 16], m5
22467 pshufb m5, m2, [tab_S2]
22468 movh [r0 + 1918 * 16 + 8], m5
22469 pshufb m5, m1, [tab_S2]
22470 movh [r0 + 1919 * 16], m5
22471 pshufb m5, m4, [tab_S2]
22472 movh [r0 + 1919 * 16 + 8], m5
22473
22474 ; mode 32 [row 25]
22475 movu m6, [r5 + 2 * 16]
22476 pmaddubsw m3, m0, m6
22477 pmulhrsw m3, m7
22478 pmaddubsw m5, m2, m6
22479 pmulhrsw m5, m7
22480 packuswb m3, m5
22481 movu [r0 + 1970 * 16], m3
22482
22483 ; mode 33 [row 20 - first half]
22484 movu [r0 + 2024 * 16], m3
22485
22486 pmaddubsw m3, m1, m6
22487 pmulhrsw m3, m7
22488 pmaddubsw m5, m4, m6
22489 pmulhrsw m5, m7
22490 packuswb m3, m5
22491 movu [r0 + 1971 * 16], m3
22492
22493 ; mode 33 [row 20 - second half]
22494 movu [r0 + 2025 * 16], m3
22495
22496 ; mode 32 [row 26]
22497 movu m6, [r5 + 23 * 16]
22498 pmaddubsw m3, m0, m6
22499 pmulhrsw m3, m7
22500 pmaddubsw m5, m2, m6
22501 pmulhrsw m5, m7
22502 packuswb m3, m5
22503 movu [r0 + 1972 * 16], m3
22504 pmaddubsw m3, m1, m6
22505 pmulhrsw m3, m7
22506 pmaddubsw m5, m4, m6
22507 pmulhrsw m5, m7
22508 packuswb m3, m5
22509 movu [r0 + 1973 * 16], m3
22510
22511 ; mode 33 [row 21]
22512 movu m6, [r5 + 28 * 16]
22513 pmaddubsw m3, m0, m6
22514 pmulhrsw m3, m7
22515 pmaddubsw m5, m2, m6
22516 pmulhrsw m5, m7
22517 packuswb m3, m5
22518 movu [r0 + 2026 * 16], m3
22519 pmaddubsw m3, m1, m6
22520 pmulhrsw m3, m7
22521 pmaddubsw m5, m4, m6
22522 pmulhrsw m5, m7
22523 packuswb m3, m5
22524 movu [r0 + 2027 * 16], m3
22525
22526 ; mode 32 [row 27]
22527 movu m6, [r5 + 12 * 16]
22528 movu m0, [r3 + 19]
22529 movd m1, [r3 + 20]
22530 palignr m1, m0, 1
22531 punpcklbw m0, m1
22532 pmaddubsw m3, m0, m6
22533 pmulhrsw m3, m7
22534 movu m2, [r3 + 27]
22535 movd m4, [r3 + 28]
22536 palignr m4, m2, 1
22537 punpcklbw m2, m4
22538 pmaddubsw m5, m2, m6
22539 pmulhrsw m5, m7
22540 packuswb m3, m5
22541 movu [r0 + 1974 * 16], m3
22542
22543 movu m1, [r3 + 35]
22544 movd m3, [r3 + 36]
22545 palignr m3, m1, 1
22546 punpcklbw m1, m3
22547 pmaddubsw m3, m1, m6
22548 pmulhrsw m3, m7
22549 movu m4, [r3 + 43]
22550 movd m5, [r3 + 44]
22551 palignr m5, m4, 1
22552 punpcklbw m4, m5
22553 pmaddubsw m5, m4, m6
22554 pmulhrsw m5, m7
22555 packuswb m3, m5
22556 movu [r0 + 1975 * 16], m3
22557
22558 ; mode 33 [row 22]
22559 movu m6, [r5 + 22 * 16]
22560 pmaddubsw m3, m0, m6
22561 pmulhrsw m3, m7
22562 pmaddubsw m5, m2, m6
22563 pmulhrsw m5, m7
22564 packuswb m3, m5
22565 movu [r0 + 2028 * 16], m3
22566 pmaddubsw m3, m1, m6
22567 pmulhrsw m3, m7
22568 pmaddubsw m5, m4, m6
22569 pmulhrsw m5, m7
22570 packuswb m3, m5
22571 movu [r0 + 2029 * 16], m3
22572
22573 ; mode 32 [row 28]
22574 movu m6, [r5 + 1 * 16]
22575 movu m0, [r3 + 20]
22576 movd m1, [r3 + 21]
22577 palignr m1, m0, 1
22578 punpcklbw m0, m1
22579 pmaddubsw m3, m0, m6
22580 pmulhrsw m3, m7
22581 movu m2, [r3 + 28]
22582 movd m4, [r3 + 29]
22583 palignr m4, m2, 1
22584 punpcklbw m2, m4
22585 pmaddubsw m5, m2, m6
22586 pmulhrsw m5, m7
22587 packuswb m3, m5
22588 movu [r0 + 1976 * 16], m3
22589
22590 movu m1, [r3 + 36]
22591 movd m3, [r3 + 37]
22592 palignr m3, m1, 1
22593 punpcklbw m1, m3
22594 pmaddubsw m3, m1, m6
22595 pmulhrsw m3, m7
22596 movu m4, [r3 + 44]
22597 movd m5, [r3 + 45]
22598 palignr m5, m4, 1
22599 punpcklbw m4, m5
22600 pmaddubsw m5, m4, m6
22601 pmulhrsw m5, m7
22602 packuswb m3, m5
22603 movu [r0 + 1977 * 16], m3
22604
22605 ; mode 32 [row 29]
22606 movu m6, [r5 + 22 * 16]
22607 pmaddubsw m3, m0, m6
22608 pmulhrsw m3, m7
22609 pmaddubsw m5, m2, m6
22610 pmulhrsw m5, m7
22611 packuswb m3, m5
22612 movu [r0 + 1978 * 16], m3
22613 pmaddubsw m3, m1, m6
22614 pmulhrsw m3, m7
22615 pmaddubsw m5, m4, m6
22616 pmulhrsw m5, m7
22617 packuswb m3, m5
22618 movu [r0 + 1979 * 16], m3
22619
22620 ; mode 33 [row 23]
22621 movu m6, [r5 + 16 * 16]
22622 pmaddubsw m3, m0, m6
22623 pmulhrsw m3, m7
22624 pmaddubsw m5, m2, m6
22625 pmulhrsw m5, m7
22626 packuswb m3, m5
22627 movu [r0 + 2030 * 16], m3
22628 pmaddubsw m3, m1, m6
22629 pmulhrsw m3, m7
22630 pmaddubsw m5, m4, m6
22631 pmulhrsw m5, m7
22632 packuswb m3, m5
22633 movu [r0 + 2031 * 16], m3
22634
22635 ; mode 32 [row 30]
22636 movu m6, [r5 + 11 * 16]
22637 movu m0, [r3 + 21]
22638 movd m1, [r3 + 22]
22639 palignr m1, m0, 1
22640 punpcklbw m0, m1
22641 pmaddubsw m3, m0, m6
22642 pmulhrsw m3, m7
22643 movu m2, [r3 + 29]
22644 movd m4, [r3 + 30]
22645 palignr m4, m2, 1
22646 punpcklbw m2, m4
22647 pmaddubsw m5, m2, m6
22648 pmulhrsw m5, m7
22649 packuswb m3, m5
22650 movu [r0 + 1980 * 16], m3
22651
22652 movu m1, [r3 + 37]
22653 movd m3, [r3 + 38]
22654 palignr m3, m1, 1
22655 punpcklbw m1, m3
22656 pmaddubsw m3, m1, m6
22657 pmulhrsw m3, m7
22658 movu m4, [r3 + 45]
22659 movd m5, [r3 + 46]
22660 palignr m5, m4, 1
22661 punpcklbw m4, m5
22662 pmaddubsw m5, m4, m6
22663 pmulhrsw m5, m7
22664 packuswb m3, m5
22665 movu [r0 + 1981 * 16], m3
22666
22667 ; mode 33 [row 24]
22668 movu m6, [r5 + 10 * 16]
22669 pmaddubsw m3, m0, m6
22670 pmulhrsw m3, m7
22671 pmaddubsw m5, m2, m6
22672 pmulhrsw m5, m7
22673 packuswb m3, m5
22674 movu [r0 + 2032 * 16], m3
22675 pmaddubsw m3, m1, m6
22676 pmulhrsw m3, m7
22677 pmaddubsw m5, m4, m6
22678 pmulhrsw m5, m7
22679 packuswb m3, m5
22680 movu [r0 + 2033 * 16], m3
22681
22682 ; mode 32 [row 31]
22683 movu m0, [r3 + 22]
22684 movd m1, [r3 + 23]
22685 palignr m1, m0, 1
22686 punpcklbw m0, m1
22687 movu m2, [r3 + 30]
22688 movd m3, [r3 + 31]
22689 palignr m3, m2, 1
22690 punpcklbw m2, m3
22691 movu m1, [r3 + 38]
22692 movd m3, [r3 + 39]
22693 palignr m3, m1, 1
22694 punpcklbw m1, m3
22695 movu m4, [r3 + 46]
22696 movd m5, [r3 + 47]
22697 palignr m5, m4, 1
22698 punpcklbw m4, m5
22699
22700 pshufb m5, m0, [tab_S2]
22701 movh [r0 + 1982 * 16], m5
22702 pshufb m5, m2, [tab_S2]
22703 movh [r0 + 1982 * 16 + 8], m5
22704 pshufb m5, m1, [tab_S2]
22705 movh [r0 + 1983 * 16], m5
22706 pshufb m5, m4, [tab_S2]
22707 movh [r0 + 1983 * 16 + 8], m5
22708
22709 ; mode 33 [row 25]
22710 movu m6, [r5 + 4 * 16]
22711 pmaddubsw m3, m0, m6
22712 pmulhrsw m3, m7
22713 pmaddubsw m5, m2, m6
22714 pmulhrsw m5, m7
22715 packuswb m3, m5
22716 movu [r0 + 2034 * 16], m3
22717 pmaddubsw m3, m1, m6
22718 pmulhrsw m3, m7
22719 pmaddubsw m5, m4, m6
22720 pmulhrsw m5, m7
22721 packuswb m3, m5
22722 movu [r0 + 2035 * 16], m3
22723
22724 ; mode 33 [row 26]
22725 movu m6, [r5 + 30 * 16]
22726 pmaddubsw m3, m0, m6
22727 pmulhrsw m3, m7
22728 pmaddubsw m5, m2, m6
22729 pmulhrsw m5, m7
22730 packuswb m3, m5
22731 movu [r0 + 2036 * 16], m3
22732 pmaddubsw m3, m1, m6
22733 pmulhrsw m3, m7
22734 pmaddubsw m5, m4, m6
22735 pmulhrsw m5, m7
22736 packuswb m3, m5
22737 movu [r0 + 2037 * 16], m3
22738
22739 ; mode 33 [row 27]
22740 movu m6, [r5 + 24 * 16]
22741 movu m0, [r3 + 23]
22742 movd m1, [r3 + 24]
22743 palignr m1, m0, 1
22744 punpcklbw m0, m1
22745 pmaddubsw m3, m0, m6
22746 pmulhrsw m3, m7
22747 movu m2, [r3 + 31]
22748 movd m4, [r3 + 32]
22749 palignr m4, m2, 1
22750 punpcklbw m2, m4
22751 pmaddubsw m5, m2, m6
22752 pmulhrsw m5, m7
22753 packuswb m3, m5
22754 movu [r0 + 2038 * 16], m3
22755
22756 movu m1, [r3 + 39]
22757 movd m3, [r3 + 40]
22758 palignr m3, m1, 1
22759 punpcklbw m1, m3
22760 pmaddubsw m3, m1, m6
22761 pmulhrsw m3, m7
22762 movu m4, [r3 + 47]
22763 movd m5, [r3 + 48]
22764 palignr m5, m4, 1
22765 punpcklbw m4, m5
22766 pmaddubsw m5, m4, m6
22767 pmulhrsw m5, m7
22768 packuswb m3, m5
22769 movu [r0 + 2039 * 16], m3
22770
22771 ; mode 33 [row 28]
22772 movu m6, [r5 + 18 * 16]
22773 movu m0, [r3 + 24]
22774 movd m1, [r3 + 25]
22775 palignr m1, m0, 1
22776 punpcklbw m0, m1
22777 pmaddubsw m3, m0, m6
22778 pmulhrsw m3, m7
22779 movu m2, [r3 + 32]
22780 movd m4, [r3 + 33]
22781 palignr m4, m2, 1
22782 punpcklbw m2, m4
22783 pmaddubsw m5, m2, m6
22784 pmulhrsw m5, m7
22785 packuswb m3, m5
22786 movu [r0 + 2040 * 16], m3
22787
22788 movu m1, [r3 + 40]
22789 movd m3, [r3 + 41]
22790 palignr m3, m1, 1
22791 punpcklbw m1, m3
22792 pmaddubsw m3, m1, m6
22793 pmulhrsw m3, m7
22794 movu m4, [r3 + 48]
22795 movd m5, [r3 + 49]
22796 palignr m5, m4, 1
22797 punpcklbw m4, m5
22798 pmaddubsw m5, m4, m6
22799 pmulhrsw m5, m7
22800 packuswb m3, m5
22801 movu [r0 + 2041 * 16], m3
22802
22803 ; mode 33 [row 29]
22804 movu m6, [r5 + 12 * 16]
22805 movu m0, [r3 + 25]
22806 movd m1, [r3 + 26]
22807 palignr m1, m0, 1
22808 punpcklbw m0, m1
22809 pmaddubsw m3, m0, m6
22810 pmulhrsw m3, m7
22811 movu m2, [r3 + 33]
22812 movd m4, [r3 + 34]
22813 palignr m4, m2, 1
22814 punpcklbw m2, m4
22815 pmaddubsw m5, m2, m6
22816 pmulhrsw m5, m7
22817 packuswb m3, m5
22818 movu [r0 + 2042 * 16], m3
22819
22820 movu m1, [r3 + 41]
22821 movd m3, [r3 + 42]
22822 palignr m3, m1, 1
22823 punpcklbw m1, m3
22824 pmaddubsw m3, m1, m6
22825 pmulhrsw m3, m7
22826 movu m4, [r3 + 49]
22827 movd m5, [r3 + 50]
22828 palignr m5, m4, 1
22829 punpcklbw m4, m5
22830 pmaddubsw m5, m4, m6
22831 pmulhrsw m5, m7
22832 packuswb m3, m5
22833 movu [r0 + 2043 * 16], m3
22834
22835 ; mode 33 [row 30]
22836 movu m6, [r5 + 6 * 16]
22837 movu m0, [r3 + 26]
22838 movd m1, [r3 + 27]
22839 palignr m1, m0, 1
22840 punpcklbw m0, m1
22841 pmaddubsw m3, m0, m6
22842 pmulhrsw m3, m7
22843 movu m2, [r3 + 34]
22844 movd m4, [r3 + 35]
22845 palignr m4, m2, 1
22846 punpcklbw m2, m4
22847 pmaddubsw m5, m2, m6
22848 pmulhrsw m5, m7
22849 packuswb m3, m5
22850 movu [r0 + 2044 * 16], m3
22851
22852 movu m1, [r3 + 42]
22853 movd m3, [r3 + 43]
22854 palignr m3, m1, 1
22855 punpcklbw m1, m3
22856 pmaddubsw m3, m1, m6
22857 pmulhrsw m3, m7
22858 movu m4, [r3 + 50]
22859 movd m5, [r3 + 51]
22860 palignr m5, m4, 1
22861 punpcklbw m4, m5
22862 pmaddubsw m5, m4, m6
22863 pmulhrsw m5, m7
22864 packuswb m3, m5
22865 movu [r0 + 2045 * 16], m3
22866
22867 ; mode 33 [row 31]
22868 movu m5, [r3 + 27]
22869 movu [r0 + 2046 * 16], m5
22870 movu m5, [r3 + 43]
22871 movu [r0 + 2047 * 16], m5
22872
22873 ;mode 34 [row 0]
22874 movu m0, [r3 + 2]
22875 movu [r0 + 2048 * 16], m0
22876 movu m1, [r3 + 18]
22877 movu [r0 + 2049 * 16], m1
22878
22879 ;mode 34 [row 1]
22880 movu m2, [r3 + 34]
22881 palignr m3, m1, m0, 1
22882 movu [r0 + 2050 * 16], m3
22883 palignr m4, m2, m1, 1
22884 movu [r0 + 2051 * 16], m4
22885
22886 ;mode 34 [row 2]
22887 palignr m3, m1, m0, 2
22888 movu [r0 + 2052 * 16], m3
22889 palignr m4, m2, m1, 2
22890 movu [r0 + 2053 * 16], m4
22891
22892 ;mode 34 [row 3]
22893 palignr m3, m1, m0, 3
22894 movu [r0 + 2054 * 16], m3
22895 palignr m4, m2, m1, 3
22896 movu [r0 + 2055 * 16], m4
22897
22898 ;mode 34 [row 4]
22899 palignr m3, m1, m0, 4
22900 movu [r0 + 2056 * 16], m3
22901 palignr m4, m2, m1, 4
22902 movu [r0 + 2057 * 16], m4
22903
22904 ;mode 34 [row 5]
22905 palignr m3, m1, m0, 5
22906 movu [r0 + 2058 * 16], m3
22907 palignr m4, m2, m1, 5
22908 movu [r0 + 2059 * 16], m4
22909
22910 ;mode 34 [row 6]
22911 palignr m3, m1, m0, 6
22912 movu [r0 + 2060 * 16], m3
22913 palignr m4, m2, m1, 6
22914 movu [r0 + 2061 * 16], m4
22915
22916 ;mode 34 [row 7]
22917 palignr m3, m1, m0, 7
22918 movu [r0 + 2062 * 16], m3
22919 palignr m4, m2, m1, 7
22920 movu [r0 + 2063 * 16], m4
22921
22922 ;mode 34 [row 8]
22923 palignr m3, m1, m0, 8
22924 movu [r0 + 2064 * 16], m3
22925 palignr m4, m2, m1, 8
22926 movu [r0 + 2065 * 16], m4
22927
22928 ;mode 34 [row 9]
22929 palignr m3, m1, m0, 9
22930 movu [r0 + 2066 * 16], m3
22931 palignr m4, m2, m1, 9
22932 movu [r0 + 2067 * 16], m4
22933
22934 ;mode 34 [row 10]
22935 palignr m3, m1, m0, 10
22936 movu [r0 + 2068 * 16], m3
22937 palignr m4, m2, m1, 10
22938 movu [r0 + 2069 * 16], m4
22939
22940 ;mode 34 [row 11]
22941 palignr m3, m1, m0, 11
22942 movu [r0 + 2070 * 16], m3
22943 palignr m4, m2, m1, 11
22944 movu [r0 + 2071 * 16], m4
22945
22946 ;mode 34 [row 12]
22947 palignr m3, m1, m0, 12
22948 movu [r0 + 2072 * 16], m3
22949 palignr m4, m2, m1, 12
22950 movu [r0 + 2073 * 16], m4
22951
22952 ;mode 34 [row 13]
22953 palignr m3, m1, m0, 13
22954 movu [r0 + 2074 * 16], m3
22955 palignr m4, m2, m1, 13
22956 movu [r0 + 2075 * 16], m4
22957
22958 ;mode 34 [row 14]
22959 palignr m3, m1, m0, 14
22960 movu [r0 + 2076 * 16], m3
22961 palignr m4, m2, m1, 14
22962 movu [r0 + 2077 * 16], m4
22963
22964 ;mode 34 [row 15]
22965 palignr m3, m1, m0, 15
22966 movu [r0 + 2078 * 16], m3
22967 palignr m4, m2, m1, 15
22968 movu [r0 + 2079 * 16], m4
22969
22970 ;mode 34 [row 16]
22971 palignr m3, m1, m0, 16
22972 movu [r0 + 2080 * 16], m3
22973 palignr m4, m2, m1, 16
22974 movu [r0 + 2081 * 16], m4
22975
22976 ;mode 34 [row 17]
22977 movu m0, [r3 + 19]
22978 movu [r0 + 2082 * 16], m0
22979 movu m1, [r3 + 35]
22980 movu [r0 + 2083 * 16], m1
22981
22982 mov r2d, r6d
22983 mov [r4], r2b
22984 mov r2d, [rsp]
22985 mov [r1 + 64], r2b
22986
22987 ;mode 34 [row 18]
22988 movu m2, [r3 + 51]
22989 palignr m3, m1, m0, 1
22990 movu [r0 + 2084 * 16], m3
22991 palignr m4, m2, m1, 1
22992 movu [r0 + 2085 * 16], m4
22993
22994 ;mode 34 [row 19]
22995 palignr m3, m1, m0, 2
22996 movu [r0 + 2086 * 16], m3
22997 palignr m4, m2, m1, 2
22998 movu [r0 + 2087 * 16], m4
22999
23000 ;mode 34 [row 20]
23001 palignr m3, m1, m0, 3
23002 movu [r0 + 2088 * 16], m3
23003 palignr m4, m2, m1, 3
23004 movu [r0 + 2089 * 16], m4
23005
23006 ;mode 34 [row 21]
23007 palignr m3, m1, m0, 4
23008 movu [r0 + 2090 * 16], m3
23009 palignr m4, m2, m1, 4
23010 movu [r0 + 2091 * 16], m4
23011
23012 ;mode 34 [row 22]
23013 palignr m3, m1, m0, 5
23014 movu [r0 + 2092 * 16], m3
23015 palignr m4, m2, m1, 5
23016 movu [r0 + 2093 * 16], m4
23017
23018 ;mode 34 [row 23]
23019 palignr m3, m1, m0, 6
23020 movu [r0 + 2094 * 16], m3
23021 palignr m4, m2, m1, 6
23022 movu [r0 + 2095 * 16], m4
23023
23024 ;mode 34 [row 24]
23025 palignr m3, m1, m0, 7
23026 movu [r0 + 2096 * 16], m3
23027 palignr m4, m2, m1, 7
23028 movu [r0 + 2097 * 16], m4
23029
23030 ;mode 34 [row 25]
23031 palignr m3, m1, m0, 8
23032 movu [r0 + 2098 * 16], m3
23033 palignr m4, m2, m1, 8
23034 movu [r0 + 2099 * 16], m4
23035
23036 ;mode 34 [row 26]
23037 palignr m3, m1, m0, 9
23038 movu [r0 + 2100 * 16], m3
23039 palignr m4, m2, m1, 9
23040 movu [r0 + 2101 * 16], m4
23041
23042 ;mode 34 [row 27]
23043 palignr m3, m1, m0, 10
23044 movu [r0 + 2102 * 16], m3
23045 palignr m4, m2, m1, 10
23046 movu [r0 + 2103 * 16], m4
23047
23048 ;mode 34 [row 28]
23049 palignr m3, m1, m0, 11
23050 movu [r0 + 2104 * 16], m3
23051 palignr m4, m2, m1, 11
23052 movu [r0 + 2105 * 16], m4
23053
23054 ;mode 34 [row 29]
23055 palignr m3, m1, m0, 12
23056 movu [r0 + 2106 * 16], m3
23057 palignr m4, m2, m1, 12
23058 movu [r0 + 2107 * 16], m4
23059
23060 ;mode 34 [row 30]
23061 palignr m3, m1, m0, 13
23062 movu [r0 + 2108 * 16], m3
23063 palignr m4, m2, m1, 13
23064 movu [r0 + 2109 * 16], m4
23065
23066 ;mode 34 [row 31]
23067 palignr m3, m1, m0, 14
23068 movu [r0 + 2110 * 16], m3
23069 palignr m4, m2, m1, 14
23070 movu [r0 + 2111 * 16], m4
23071 RET
23072
23073
23074 ;-----------------------------------------------------------------------------
23075 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;
; AVX2 version: produces the 4x4 predictions for angular modes 2..34 in one
; straight-line pass.
;
; Output layout (from the store offsets below): mode N occupies the 16 bytes
; at dest + (N - 2) * 16, i.e. 33 modes * 16 bytes = 528 bytes are written.
;
; Register roles in this routine:
;   r0 = dest
;   r1 = refPix (only byte loads relative to r1 + 0, + 1 and + 9 are used)
;   r2 = cursor into all_ang4       (overwrites the filtPix argument)
;   r3 = cursor into all_ang4_shuff (overwrites the bLuma argument)
;   m0 = 16 reference bytes replicated in both 128-bit lanes:
;        refPix[9..24] for modes 2-10, refPix[0..15] for modes 11-34
;   m5 = rounding multiplier loaded from pw_1024
;   m1-m4 = scratch
;
; filtPix and bLuma are never read: r2/r3 are replaced by table addresses
; before any use, so the mode 10 / mode 26 edge filters run unconditionally.
;
; Common per-mode pattern:
;   pshufb    - gather reference bytes according to a 32-byte all_ang4_shuff row
;   pmaddubsw - multiply by the byte weights of an all_ang4 row and add
;               adjacent products into words
;   pmulhrsw  - with a multiplier of 1024 this is (x + 16) >> 5
;               (assumes pw_1024 holds words of 1024, as the name suggests;
;               the table is defined outside this view)
;   packuswb + vpermq 11011000b - pack two modes to bytes and regroup the
;               qwords so mode A is in the low 128 bits and mode B in the
;               high 128 bits; one 32-byte store then writes both modes.
23076 ;-----------------------------------------------------------------------------
23077 INIT_YMM avx2
23078 cglobal all_angs_pred_4x4, 4, 4, 6 ; x86inc: 4 args, 4 GPRs, 6 vector regs (m0-m5)
23079
23080 mova m5, [pw_1024] ; rounding multiplier for every pmulhrsw below
23081 lea r2, [all_ang4] ; r2 = weight table cursor (filtPix argument discarded)
23082 lea r3, [all_ang4_shuff] ; r3 = shuffle table cursor (bLuma argument discarded)
23083
23084 ; mode 2 (shuffle only, no weighting)
23085
23086 vbroadcasti128 m0, [r1 + 9] ; m0 = refPix[9..24] in both lanes
23087 mova xm1, xm0
23088 psrldq xm1, 1 ; drop byte 0, xm1 now starts at refPix[10]
23089 pshufb xm1, [r3] ; first shuffle row: four overlapping 4-byte windows
23090 movu [r0], xm1 ; mode 2 -> dest + 0
23091
23092 ; mode 3
23093
23094 pshufb m1, m0, [r3 + 1 * mmsize]
23095 pmaddubsw m1, [r2]
23096 pmulhrsw m1, m5
23097
23098 ; mode 4
23099
23100 pshufb m2, m0, [r3 + 2 * mmsize]
23101 pmaddubsw m2, [r2 + 1 * mmsize]
23102 pmulhrsw m2, m5
23103 packuswb m1, m2 ; per lane: [mode 3 half | mode 4 half]
23104 vpermq m1, m1, 11011000b ; qwords 0,2,1,3 -> low 128 = mode 3, high 128 = mode 4
23105 movu [r0 + (3 - 2) * 16], m1 ; 32-byte store: modes 3 and 4
23106
23107 ; mode 5 (same shuffle row as mode 4, different weights)
23108
23109 pshufb m1, m0, [r3 + 2 * mmsize]
23110 pmaddubsw m1, [r2 + 2 * mmsize]
23111 pmulhrsw m1, m5
23112
23113 ; mode 6
23114
23115 pshufb m2, m0, [r3 + 3 * mmsize]
23116 pmaddubsw m2, [r2 + 3 * mmsize]
23117 pmulhrsw m2, m5
23118 packuswb m1, m2
23119 vpermq m1, m1, 11011000b
23120 movu [r0 + (5 - 2) * 16], m1 ; modes 5 and 6
23121
23122 add r3, 4 * mmsize ; advance shuffle cursor by four 32-byte rows
23123 add r2, 4 * mmsize ; advance weight cursor by four 32-byte rows
23124
23125 ; mode 7
23126
23127 pshufb m1, m0, [r3 + 0 * mmsize]
23128 pmaddubsw m1, [r2 + 0 * mmsize]
23129 pmulhrsw m1, m5
23130
23131 ; mode 8
23132
23133 pshufb m2, m0, [r3 + 1 * mmsize]
23134 pmaddubsw m2, [r2 + 1 * mmsize]
23135 pmulhrsw m2, m5
23136 packuswb m1, m2
23137 vpermq m1, m1, 11011000b
23138 movu [r0 + (7 - 2) * 16], m1 ; modes 7 and 8
23139
23140 ; mode 9 (same shuffle row as mode 8, different weights)
23141
23142 pshufb m1, m0, [r3 + 1 * mmsize]
23143 pmaddubsw m1, [r2 + 2 * mmsize]
23144 pmulhrsw m1, m5
23145 packuswb m1, m1 ; single mode: pack against itself
23146 vpermq m1, m1, 11011000b ; low 128 bits = the 16 bytes of mode 9
23147 movu [r0 + (9 - 2) * 16], xm1 ; 16-byte store: mode 9 only
23148
23149 ; mode 10 (shuffle only, then first-line edge filter)
23150
23151 pshufb xm1, xm0, [r3 + 2 * mmsize]
23152 movu [r0 + (10 - 2) * 16], xm1 ; unfiltered mode 10 -> dest + 128
23153
; Edge filter: dest[128 + 4 * i] = clip8(refPix[9] + ((refPix[1 + i] - refPix[0]) >> 1)), i = 0..3
23154 pxor xm1, xm1 ; xm1 = 0 (zero-extension source and all-zero shuffle mask)
23155 movd xm2, [r1 + 1] ; refPix[1..4]
23156 pshufd xm3, xm2, 0
23157 punpcklbw xm3, xm1 ; xm3 = words refPix[1..4] (repeated)
23158 pinsrb xm2, [r1], 0 ; byte 0 = refPix[0]
23159 pshufb xm4, xm2, xm1 ; zero mask broadcasts byte 0 -> refPix[0] everywhere
23160 punpcklbw xm4, xm1 ; xm4 = words refPix[0]
23161 psubw xm3, xm4 ; refPix[1 + i] - refPix[0]
23162 psraw xm3, 1 ; arithmetic >> 1
23163 pshufb xm4, xm0, xm1 ; broadcast byte 0 of m0 = refPix[9]
23164 punpcklbw xm4, xm1
23165 paddw xm3, xm4
23166 packuswb xm3, xm1 ; saturate to unsigned bytes
23167
23168 pextrb [r0 + 128], xm3, 0 ; patch every 4th byte of the mode 10 block
23169 pextrb [r0 + 132], xm3, 1
23170 pextrb [r0 + 136], xm3, 2
23171 pextrb [r0 + 140], xm3, 3
23172
23173 ; mode 11
23174
23175 vbroadcasti128 m0, [r1] ; from here on m0 = refPix[0..15] in both lanes
23176 pshufb m1, m0, [r3 + 3 * mmsize]
23177 pmaddubsw m1, [r2 + 3 * mmsize]
23178 pmulhrsw m1, m5
23179
23180 ; mode 12 (same shuffle row as mode 11, different weights)
23181
23182 add r2, 4 * mmsize ; next group of four weight rows
23183
23184 pshufb m2, m0, [r3 + 3 * mmsize]
23185 pmaddubsw m2, [r2 + 0 * mmsize]
23186 pmulhrsw m2, m5
23187 packuswb m1, m2
23188 vpermq m1, m1, 11011000b
23189 movu [r0 + (11 - 2) * 16], m1 ; modes 11 and 12
23190
23191 ; mode 13
23192
23193 add r3, 4 * mmsize ; next group of four shuffle rows
23194
23195 pshufb m1, m0, [r3 + 0 * mmsize]
23196 pmaddubsw m1, [r2 + 1 * mmsize]
23197 pmulhrsw m1, m5
23198
23199 ; mode 14
23200
23201 pshufb m2, m0, [r3 + 1 * mmsize]
23202 pmaddubsw m2, [r2 + 2 * mmsize]
23203 pmulhrsw m2, m5
23204 packuswb m1, m2
23205 vpermq m1, m1, 11011000b
23206 movu [r0 + (13 - 2) * 16], m1 ; modes 13 and 14
23207
23208 ; mode 15
23209
23210 pshufb m1, m0, [r3 + 2 * mmsize]
23211 pmaddubsw m1, [r2 + 3 * mmsize]
23212 pmulhrsw m1, m5
23213
23214 ; mode 16
23215
23216 add r2, 4 * mmsize ; next group of four weight rows
23217
23218 pshufb m2, m0, [r3 + 3 * mmsize]
23219 pmaddubsw m2, [r2 + 0 * mmsize]
23220 pmulhrsw m2, m5
23221 packuswb m1, m2
23222 vpermq m1, m1, 11011000b
23223 movu [r0 + (15 - 2) * 16], m1 ; modes 15 and 16
23224
23225 ; mode 17
23226
23227 add r3, 4 * mmsize ; next group of four shuffle rows
23228
23229 pshufb m1, m0, [r3 + 0 * mmsize]
23230 pmaddubsw m1, [r2 + 1 * mmsize]
23231 pmulhrsw m1, m5
23232 packuswb m1, m1
23233 vpermq m1, m1, 11011000b ; low 128 bits = mode 17; high lane is replaced below
23234
23235 ; mode 18 (shuffle only, no weighting)
23236
23237 pshufb m2, m0, [r3 + 1 * mmsize]
23238 vinserti128 m1, m1, xm2, 1 ; high 128 bits = mode 18
23239 movu [r0 + (17 - 2) * 16], m1 ; modes 17 and 18
23240
23241 ; mode 19
23242
23243 pshufb m1, m0, [r3 + 2 * mmsize]
23244 pmaddubsw m1, [r2 + 2 * mmsize]
23245 pmulhrsw m1, m5
23246
23247 ; mode 20
23248
23249 pshufb m2, m0, [r3 + 3 * mmsize]
23250 pmaddubsw m2, [r2 + 3 * mmsize]
23251 pmulhrsw m2, m5
23252 packuswb m1, m2
23253 vpermq m1, m1, 11011000b
23254 movu [r0 + (19 - 2) * 16], m1 ; modes 19 and 20
23255
23256 ; mode 21
23257
23258 add r2, 4 * mmsize ; next group of four weight rows
23259 add r3, 4 * mmsize ; next group of four shuffle rows
23260
23261 pshufb m1, m0, [r3 + 0 * mmsize]
23262 pmaddubsw m1, [r2 + 0 * mmsize]
23263 pmulhrsw m1, m5
23264
23265 ; mode 22
23266
23267 pshufb m2, m0, [r3 + 1 * mmsize]
23268 pmaddubsw m2, [r2 + 1 * mmsize]
23269 pmulhrsw m2, m5
23270 packuswb m1, m2
23271 vpermq m1, m1, 11011000b
23272 movu [r0 + (21 - 2) * 16], m1 ; modes 21 and 22
23273
23274 ; mode 23
23275
23276 pshufb m1, m0, [r3 + 2 * mmsize]
23277 pmaddubsw m1, [r2 + 2 * mmsize]
23278 pmulhrsw m1, m5
23279
23280 ; mode 24
23281
23282 pshufb m2, m0, [r3 + 3 * mmsize]
23283 pmaddubsw m2, [r2 + 3 * mmsize]
23284 pmulhrsw m2, m5
23285 packuswb m1, m2
23286 vpermq m1, m1, 11011000b
23287 movu [r0 + (23 - 2) * 16], m1 ; modes 23 and 24
23288
23289 ; mode 25 (same shuffle row as mode 24, different weights)
23290
23291 add r2, 4 * mmsize ; next group of four weight rows
23292
23293 pshufb m1, m0, [r3 + 3 * mmsize]
23294 pmaddubsw m1, [r2 + 0 * mmsize]
23295 pmulhrsw m1, m5
23296 packuswb m1, m1
23297 vpermq m1, m1, 11011000b
23298 movu [r0 + (25 - 2) * 16], xm1 ; 16-byte store: mode 25 only
23299
23300 ; mode 26 (shuffle only, then first-line edge filter)
23301
23302 add r3, 4 * mmsize ; next group of four shuffle rows
23303
23304 pshufb xm1, xm0, [r3 + 0 * mmsize]
23305 movu [r0 + (26 - 2) * 16], xm1 ; unfiltered mode 26 -> dest + 384
23306
; Edge filter: dest[384 + 4 * i] = clip8(refPix[1] + ((refPix[9 + i] - refPix[0]) >> 1)), i = 0..3
23307 pxor xm1, xm1 ; xm1 = 0 (zero-extension source and all-zero shuffle mask)
23308 movd xm2, [r1 + 9] ; refPix[9..12]
23309 pshufd xm3, xm2, 0
23310 punpcklbw xm3, xm1 ; xm3 = words refPix[9..12] (repeated)
23311 pinsrb xm4, [r1 + 0], 0 ; byte 0 = refPix[0]; other bytes of xm4 are don't-care
23312 pshufb xm4, xm1 ; broadcast byte 0 -> refPix[0] everywhere
23313 punpcklbw xm4, xm1
23314 psubw xm3, xm4 ; refPix[9 + i] - refPix[0]
23315 psraw xm3, 1 ; arithmetic >> 1
23316 psrldq xm2, xm0, 1 ; byte 0 of xm2 = refPix[1] (m0 starts at refPix[0])
23317 pshufb xm2, xm1 ; broadcast refPix[1]
23318 punpcklbw xm2, xm1
23319 paddw xm3, xm2
23320 packuswb xm3, xm1 ; saturate to unsigned bytes
23321
23322 pextrb [r0 + 384], xm3, 0 ; patch every 4th byte of the mode 26 block
23323 pextrb [r0 + 388], xm3, 1
23324 pextrb [r0 + 392], xm3, 2
23325 pextrb [r0 + 396], xm3, 3
23326
23327 ; mode 27
23328
23329 pshufb m1, m0, [r3 + 1 * mmsize]
23330 pmaddubsw m1, [r2 + 1 * mmsize]
23331 pmulhrsw m1, m5
23332
23333 ; mode 28 (same shuffle row as mode 27, different weights)
23334
23335 pshufb m2, m0, [r3 + 1 * mmsize]
23336 pmaddubsw m2, [r2 + 2 * mmsize]
23337 pmulhrsw m2, m5
23338 packuswb m1, m2
23339 vpermq m1, m1, 11011000b
23340 movu [r0 + (27 - 2) * 16], m1 ; modes 27 and 28
23341
23342 ; mode 29
23343
23344 pshufb m1, m0, [r3 + 2 * mmsize]
23345 pmaddubsw m1, [r2 + 3 * mmsize]
23346 pmulhrsw m1, m5
23347
23348 ; mode 30
23349
23350 add r2, 4 * mmsize ; next group of four weight rows
23351
23352 pshufb m2, m0, [r3 + 3 * mmsize]
23353 pmaddubsw m2, [r2 + 0 * mmsize]
23354 pmulhrsw m2, m5
23355 packuswb m1, m2
23356 vpermq m1, m1, 11011000b
23357 movu [r0 + (29 - 2) * 16], m1 ; modes 29 and 30
23358
23359 ; mode 31
23360
23361 add r3, 4 * mmsize ; next group of four shuffle rows
23362
23363 pshufb m1, m0, [r3 + 0 * mmsize]
23364 pmaddubsw m1, [r2 + 1 * mmsize]
23365 pmulhrsw m1, m5
23366
23367 ; mode 32 (same shuffle row as mode 31, different weights)
23368
23369 pshufb m2, m0, [r3 + 0 * mmsize]
23370 pmaddubsw m2, [r2 + 2 * mmsize]
23371 pmulhrsw m2, m5
23372 packuswb m1, m2
23373 vpermq m1, m1, 11011000b
23374 movu [r0 + (31 - 2) * 16], m1 ; modes 31 and 32
23375
23376 ; mode 33
23377
23378 pshufb m1, m0, [r3 + 1 * mmsize]
23379 pmaddubsw m1, [r2 + 3 * mmsize]
23380 pmulhrsw m1, m5
23381 packuswb m1, m2 ; NOTE(review): m2 is the stale mode 32 value; its bytes end up in the high lane, which vinserti128 overwrites below (mode 17 uses "packuswb m1, m1" for the same job)
23382 vpermq m1, m1, 11011000b ; low 128 bits = mode 33
23383
23384 ; mode 34 (shuffle only, no weighting)
23385
23386 pshufb m0, [r3 + 2 * mmsize] ; m0 is shuffled in place; it is not needed afterwards
23387 vinserti128 m1, m1, xm0, 1 ; high 128 bits = mode 34
23388 movu [r0 + (33 - 2) * 16], m1 ; modes 33 and 34 (last 32 of the 528 output bytes)
23389 RET
23390
23391 ;-----------------------------------------------------------------------------
23392 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
23393 ;-----------------------------------------------------------------------------
23394 INIT_XMM sse2
23395 cglobal all_angs_pred_4x4, 4, 4, 8
23396
23397 ; mode 2
23398
23399 movh m6, [r1 + 9]
23400 mova m2, m6
23401 psrldq m2, 1
23402 movd [r0], m2 ;byte[A, B, C, D]
23403 psrldq m2, 1
23404 movd [r0 + 4], m2 ;byte[B, C, D, E]
23405 psrldq m2, 1
23406 movd [r0 + 8], m2 ;byte[C, D, E, F]
23407 psrldq m2, 1
23408 movd [r0 + 12], m2 ;byte[D, E, F, G]
23409
23410 ; mode 10/26
23411
23412 pxor m7, m7
23413 pshufd m5, m6, 0
23414 mova [r0 + 128], m5 ;mode 10 byte[9, A, B, C, 9, A, B, C, 9, A, B, C, 9, A, B, C]
23415
23416 movd m4, [r1 + 1]
23417 pshufd m4, m4, 0
23418 mova [r0 + 384], m4 ;mode 26 byte[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]
23419
23420 movd m1, [r1]
23421 punpcklbw m1, m7
23422 pshuflw m1, m1, 0x00
23423 punpcklqdq m1, m1 ;m1 = byte[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
23424
23425 punpckldq m4, m5
23426 punpcklbw m4, m7 ;m4 = word[1, 2, 3, 4, 9, A, B, C]
23427 pshuflw m2, m4, 0x00
23428 pshufhw m2, m2, 0x00 ;m2 = word[1, 1, 1, 1, 9, 9, 9, 9]
23429
23430 psubw m4, m1
23431 psraw m4, 1
23432
23433 pshufd m2, m2, q1032 ;m2 = word[9, 9, 9, 9, 1, 1, 1, 1]
23434 paddw m4, m2
23435 packuswb m4, m4
23436
23437 %if ARCH_X86_64
23438 movq r2, m4
23439
23440 mov [r0 + 128], r2b ;mode 10
23441 shr r2, 8
23442 mov [r0 + 132], r2b
23443 shr r2, 8
23444 mov [r0 + 136], r2b
23445 shr r2, 8
23446 mov [r0 + 140], r2b
23447 shr r2, 8
23448 mov [r0 + 384], r2b ;mode 26
23449 shr r2d, 8
23450 mov [r0 + 388], r2b
23451 shr r2d, 8
23452 mov [r0 + 392], r2b
23453 shr r2d, 8
23454 mov [r0 + 396], r2b
23455
23456 %else
23457 movd r2d, m4
23458
23459 mov [r0 + 128], r2b ;mode 10
23460 shr r2d, 8
23461 mov [r0 + 132], r2b
23462 shr r2d, 8
23463 mov [r0 + 136], r2b
23464 shr r2d, 8
23465 mov [r0 + 140], r2b
23466
23467 psrldq m4, 4
23468 movd r2d, m4
23469
23470 mov [r0 + 384], r2b ;mode 26
23471 shr r2d, 8
23472 mov [r0 + 388], r2b
23473 shr r2d, 8
23474 mov [r0 + 392], r2b
23475 shr r2d, 8
23476 mov [r0 + 396], r2b
23477 %endif
23478
23479 ; mode 3
23480
23481 mova m2, [pw_16]
23482 lea r3, [pw_ang_table + 7 * 16]
23483 lea r2, [pw_ang_table + 23 * 16]
23484 punpcklbw m6, m6
23485 psrldq m6, 1
23486 movh m1, m6
23487 psrldq m6, 2
23488 movh m0, m6
23489 psrldq m6, 2
23490 movh m3, m6
23491 psrldq m6, 2
23492 punpcklbw m1, m7 ;m1 = word[9, A, A, B, B, C, C, D]
23493 punpcklbw m0, m7 ;m0 = word[A, B, B, C, C, D, D, E]
23494 punpcklbw m3, m7 ;m3 = word[B, C, C, D, D, E, E, F]
23495 punpcklbw m6, m7 ;m6 = word[C, D, D, E, E, F, F, G]
23496
23497 mova m7, [r2 - 3 * 16]
23498
23499 pmaddwd m5, m1, [r2 + 3 * 16]
23500 pmaddwd m4, m0, m7
23501
23502 packssdw m5, m4
23503 paddw m5, m2
23504 psraw m5, 5
23505
23506 pmaddwd m4, m3, [r3 + 7 * 16]
23507 pmaddwd m6, [r3 + 1 * 16]
23508
23509 packssdw m4, m6
23510 paddw m4, m2
23511 psraw m4, 5
23512
23513 packuswb m5, m4
23514 mova [r0 + 16], m5
23515 movd [r0 + 68], m5 ;mode 6 row 1
23516 psrldq m5, 4
23517 movd [r0 + 76], m5 ;mode 6 row 3
23518
23519 ; mode 4
23520
23521 pmaddwd m4, m0, [r2 + 8 * 16]
23522 pmaddwd m6, m3, m7
23523
23524 packssdw m4, m6
23525 paddw m4, m2
23526 psraw m4, 5
23527
23528 pmaddwd m5, m1, [r2 - 2 * 16]
23529 pmaddwd m6, m0, [r3 + 3 * 16]
23530
23531 packssdw m5, m6
23532 paddw m5, m2
23533 psraw m5, 5
23534
23535 packuswb m5, m4
23536 mova [r0 + 32], m5
23537
23538 ; mode 5
23539
23540 pmaddwd m5, m1, [r2 - 6 * 16]
23541 pmaddwd m6, m0, [r3 - 5 * 16]
23542
23543 packssdw m5, m6
23544 paddw m5, m2
23545 psraw m5, 5
23546
23547 pmaddwd m4, m0, [r2 - 4 * 16]
23548 pmaddwd m3, [r3 - 3 * 16]
23549
23550 packssdw m4, m3
23551 paddw m4, m2
23552 psraw m4, 5
23553
23554 packuswb m5, m4
23555 mova [r0 + 48], m5
23556
23557 ; mode 6
23558
23559 pmaddwd m5, m1, [r3 + 6 * 16]
23560 pmaddwd m6, m0, [r3 + 0 * 16]
23561
23562 packssdw m5, m6
23563 paddw m5, m2
23564 psraw m5, 5
23565
23566 packuswb m5, m6
23567 movd [r0 + 64], m5
23568 psrldq m5, 4
23569 movd [r0 + 72], m5
23570
23571 ; mode 7
23572
23573 pmaddwd m5, m1, [r3 + 2 * 16]
23574 pmaddwd m6, m1, [r2 - 5 * 16]
23575
23576 packssdw m5, m6
23577 paddw m5, m2
23578 psraw m5, 5
23579
23580 mova m3, [r2 + 4 * 16]
23581 pmaddwd m4, m1, m3
23582 pmaddwd m0, [r3 - 3 * 16]
23583
23584 packssdw m4, m0
23585 paddw m4, m2
23586 psraw m4, 5
23587
23588 packuswb m5, m4
23589 mova [r0 + 80], m5
23590
23591 ; mode 8
23592
23593 mova m0, [r3 - 2 * 16]
23594 pmaddwd m5, m1, m0
23595 pmaddwd m6, m1, [r3 + 3 * 16]
23596
23597 packssdw m5, m6
23598 paddw m5, m2
23599 psraw m5, 5
23600
23601 pmaddwd m4, m1, [r3 + 8 * 16]
23602 pmaddwd m7, m1
23603
23604 packssdw m4, m7
23605 paddw m4, m2
23606 psraw m4, 5
23607
23608 packuswb m5, m4
23609 mova [r0 + 96], m5
23610
23611 ; mode 9
23612
23613 pmaddwd m5, m1, [r3 - 5 * 16]
23614 pmaddwd m6, m1, [r3 - 3 * 16]
23615
23616 packssdw m5, m6
23617 paddw m5, m2
23618 psraw m5, 5
23619
23620 pmaddwd m4, m1, [r3 - 1 * 16]
23621 pmaddwd m6, m1, [r3 + 1 * 16]
23622
23623 packssdw m4, m6
23624 paddw m4, m2
23625 psraw m4, 5
23626
23627 packuswb m5, m4
23628 mova [r0 + 112], m5
23629
23630 ; mode 11
23631
23632 movd m5, [r1]
23633 punpcklwd m5, m1
23634 pand m5, [pb_0000000000000F0F]
23635 pslldq m1, 4
23636 por m1, m5 ;m1 = word[0, 9, 9, A, A, B, B, C]
23637
23638 pmaddwd m5, m1, [r2 + 7 * 16]
23639 pmaddwd m6, m1, [r2 + 5 * 16]
23640
23641 packssdw m5, m6
23642 paddw m5, m2
23643 psraw m5, 5
23644
23645 pmaddwd m4, m1, [r2 + 3 * 16]
23646 pmaddwd m6, m1, [r2 + 1 * 16]
23647
23648 packssdw m4, m6
23649 paddw m4, m2
23650 psraw m4, 5
23651
23652 packuswb m5, m4
23653 mova [r0 + 144], m5
23654
23655 ; mode 12
23656
23657 pmaddwd m3, m1
23658 pmaddwd m6, m1, [r2 - 1 * 16]
23659
23660 packssdw m3, m6
23661 paddw m3, m2
23662 psraw m3, 5
23663
23664 pmaddwd m4, m1, [r2 - 6 * 16]
23665 pmaddwd m6, m1, [r3 + 5 * 16]
23666
23667 packssdw m4, m6
23668 paddw m4, m2
23669 psraw m4, 5
23670
23671 packuswb m3, m4
23672 mova [r0 + 160], m3
23673
23674 ; mode 13
23675
23676 mova m3, m1
23677 movd m7, [r1 + 4]
23678 punpcklwd m7, m1
23679 pand m7, [pb_0000000000000F0F]
23680 pslldq m3, 4
23681 por m3, m7 ;m3 = word[4, 0, 0, 9, 9, A, A, B]
23682
23683 pmaddwd m5, m1, [r2 + 0 * 16]
23684 pmaddwd m6, m1, [r3 + 7 * 16]
23685
23686 packssdw m5, m6
23687 paddw m5, m2
23688 psraw m5, 5
23689
23690 pmaddwd m4, m1, m0
23691 pmaddwd m6, m3, [r2 + 5 * 16]
23692
23693 packssdw m4, m6
23694 paddw m4, m2
23695 psraw m4, 5
23696
23697 packuswb m5, m4
23698 mova [r0 + 176], m5
23699
23700 ; mode 14
23701
23702 pmaddwd m5, m1, [r2 - 4 * 16]
23703 pmaddwd m6, m1, [r3 - 1 * 16]
23704
23705 packssdw m5, m6
23706 paddw m5, m2
23707 psraw m5, 5
23708
23709 movd m6, [r1 + 2]
23710 pand m3, [pw_FFFFFFFFFFFFFFF0]
23711 pand m6, [pb_000000000000000F]
23712 por m3, m6 ;m3 = word[2, 0, 0, 9, 9, A, A, B]
23713
23714 pmaddwd m4, m3, [r2 + 2 * 16]
23715 pmaddwd m6, m3, [r3 + 5 * 16]
23716
23717 packssdw m4, m6
23718 paddw m4, m2
23719 psraw m4, 5
23720
23721 packuswb m5, m4
23722 mova [r0 + 192], m5
23723 psrldq m5, 4
23724 movd [r0 + 240], m5 ;mode 17 row 0
23725
23726 ; mode 15
23727
23728 pmaddwd m5, m1, [r3 + 8 * 16]
23729 pmaddwd m6, m3, [r2 + 7 * 16]
23730
23731 packssdw m5, m6
23732 paddw m5, m2
23733 psraw m5, 5
23734
23735 pmaddwd m6, m3, [r3 + 6 * 16]
23736
23737 mova m0, m3
23738 punpcklwd m7, m3
23739 pslldq m0, 4
23740 pand m7, [pb_0000000000000F0F]
23741 por m0, m7 ;m0 = word[4, 2, 2, 0, 0, 9, 9, A]
23742
23743 pmaddwd m4, m0, [r2 + 5 * 16]
23744
23745 packssdw m6, m4
23746 paddw m6, m2
23747 psraw m6, 5
23748
23749 packuswb m5, m6
23750 mova [r0 + 208], m5
23751
23752 ; mode 16
23753
23754 pmaddwd m5, m1, [r3 + 4 * 16]
23755 pmaddwd m6, m3, [r2 - 1 * 16]
23756
23757 packssdw m5, m6
23758 paddw m5, m2
23759 psraw m5, 5
23760
23761 pmaddwd m3, [r3 - 6 * 16]
23762
23763 movd m6, [r1 + 3]
23764 pand m0, [pw_FFFFFFFFFFFFFFF0]
23765 pand m6, [pb_000000000000000F]
23766 por m0, m6 ;m0 = word[3, 2, 2, 0, 0, 9, 9, A]
23767
23768 pmaddwd m0, [r3 + 5 * 16]
23769 packssdw m3, m0
23770 paddw m3, m2
23771 psraw m3, 5
23772
23773 packuswb m5, m3
23774 mova [r0 + 224], m5
23775
23776 ; mode 17
23777
23778 movd m4, [r1 + 1]
23779 punpcklwd m4, m1
23780 pand m4, [pb_0000000000000F0F]
23781 pslldq m1, 4
23782 por m1, m4 ;m1 = word[1, 0, 0, 9, 9, A, A, B]
23783
23784 pmaddwd m6, m1, [r3 + 5 * 16]
23785
23786 packssdw m6, m6
23787 paddw m6, m2
23788 psraw m6, 5
23789
23790 movd m5, [r1 + 2]
23791 punpcklwd m5, m1
23792 pand m5, [pb_0000000000000F0F]
23793 pslldq m1, 4
23794 por m1, m5 ;m1 = word[2, 1, 1, 0, 0, 9, 9, A]
23795
23796 pmaddwd m4, m1, [r2 - 5 * 16]
23797
23798 punpcklwd m7, m1
23799 pand m7, [pb_0000000000000F0F]
23800 pslldq m1, 4
23801 por m1, m7 ;m1 = word[4, 2, 2, 1, 1, 0, 0, 9]
23802
23803 pmaddwd m1, [r2 + 1 * 16]
23804 packssdw m4, m1
23805 paddw m4, m2
23806 psraw m4, 5
23807
23808 packuswb m6, m4
23809 movd [r0 + 244], m6
23810 psrldq m6, 8
23811 movh [r0 + 248], m6
23812
23813 ; mode 18
23814
23815 movh m1, [r1]
23816 movd [r0 + 256], m1 ;byte[0, 1, 2, 3]
23817
23818 movh m3, [r1 + 2]
23819 punpcklqdq m3, m1
23820 psrldq m3, 7
23821 movd [r0 + 260], m3 ;byte[2, 1, 0, 9]
23822
23823 movh m4, [r1 + 3]
23824 punpcklqdq m4, m3
23825 psrldq m4, 7
23826 movd [r0 + 264], m4 ;byte[1, 0, 9, A]
23827
23828 movh m0, [r1 + 4]
23829 punpcklqdq m0, m4
23830 psrldq m0, 7
23831 movd [r0 + 268], m0 ;byte[0, 9, A, B]
23832
23833 ; mode 19
23834
23835 pxor m7, m7
23836 punpcklbw m4, m3
23837 punpcklbw m3, m1
23838 punpcklbw m1, m1
23839 punpcklbw m4, m7 ;m4 = word[A, 9, 9, 0, 0, 1, 1, 2]
23840 punpcklbw m3, m7 ;m3 = word[9, 0, 0, 1, 1, 2, 2, 3]
23841 psrldq m1, 1
23842 punpcklbw m1, m7 ;m1 = word[0, 1, 1, 2, 2, 3, 3, 4]
23843
23844 pmaddwd m6, m1, [r3 - 1 * 16]
23845 pmaddwd m7, m3, [r3 + 5 * 16]
23846
23847 packssdw m6, m7
23848 paddw m6, m2
23849 psraw m6, 5
23850
23851 pmaddwd m5, m4, [r2 - 5 * 16]
23852
23853 movd m7, [r1 + 12]
23854 punpcklwd m7, m4
23855 pand m7, [pb_0000000000000F0F]
23856 pslldq m4, 4
23857 por m4, m7 ;m4 = word[C, A, A, 9, 9, 0, 0, 1]
23858
23859 pmaddwd m4, [r2 + 1 * 16]
23860 packssdw m5, m4
23861 paddw m5, m2
23862 psraw m5, 5
23863
23864 packuswb m6, m5
23865 mova [r0 + 272], m6
23866 movd [r0 + 324], m6 ;mode 22 row 1
23867
23868 ; mode 20
23869
23870 pmaddwd m5, m1, [r3 + 4 * 16]
23871
23872 movd m4, [r1 + 10]
23873 pand m3, [pw_FFFFFFFFFFFFFFF0]
23874 pand m4, [pb_000000000000000F]
23875 por m3, m4 ;m3 = word[A, 0, 0, 1, 1, 2, 2, 3]
23876
23877 pmaddwd m6, m3, [r2 - 1 * 16]
23878
23879 packssdw m5, m6
23880 paddw m5, m2
23881 psraw m5, 5
23882
23883 pmaddwd m4, m3, [r3 - 6 * 16]
23884
23885 punpcklwd m0, m3
23886 pand m0, [pb_0000000000000F0F]
23887 mova m6, m3
23888 pslldq m6, 4
23889 por m0, m6 ;m0 = word[B, A, A, 0, 0, 1, 1, 2]
23890
23891 pmaddwd m6, m0, [r3 + 5 * 16]
23892
23893 packssdw m4, m6
23894 paddw m4, m2
23895 psraw m4, 5
23896
23897 packuswb m5, m4
23898 mova [r0 + 288], m5
23899
23900 ; mode 21
23901
23902 pmaddwd m4, m1, [r3 + 8 * 16]
23903 pmaddwd m6, m3, [r2 + 7 * 16]
23904
23905 packssdw m4, m6
23906 paddw m4, m2
23907 psraw m4, 5
23908
23909 pmaddwd m5, m3, [r3 + 6 * 16]
23910
23911 pand m0, [pw_FFFFFFFFFFFFFFF0]
23912 pand m7, [pb_000000000000000F]
23913 por m0, m7 ;m0 = word[C, A, A, 0, 0, 1, 1, 2]
23914
23915 pmaddwd m0, [r2 + 5 * 16]
23916 packssdw m5, m0
23917 paddw m5, m2
23918 psraw m5, 5
23919
23920 packuswb m4, m5
23921 mova [r0 + 304], m4
23922
23923 ; mode 22
23924
23925 pmaddwd m4, m1, [r2 - 4 * 16]
23926 packssdw m4, m4
23927 paddw m4, m2
23928 psraw m4, 5
23929
23930 mova m0, [r3 + 5 * 16]
23931 pmaddwd m5, m3, [r2 + 2 * 16]
23932 pmaddwd m6, m3, m0
23933
23934 packssdw m5, m6
23935 paddw m5, m2
23936 psraw m5, 5
23937
23938 packuswb m4, m5
23939 movd [r0 + 320], m4
23940 psrldq m4, 8
23941 movh [r0 + 328], m4
23942
23943 ; mode 23
23944
23945 pmaddwd m4, m1, [r2 + 0 * 16]
23946 pmaddwd m5, m1, [r3 + 7 * 16]
23947
23948 packssdw m4, m5
23949 paddw m4, m2
23950 psraw m4, 5
23951
23952 pmaddwd m6, m1, [r3 - 2 * 16]
23953
23954 pand m3, [pw_FFFFFFFFFFFFFFF0]
23955 por m3, m7 ;m3 = word[C, 0, 0, 1, 1, 2, 2, 3]
23956
23957 pmaddwd m3, [r2 + 5 * 16]
23958 packssdw m6, m3
23959 paddw m6, m2
23960 psraw m6, 5
23961
23962 packuswb m4, m6
23963 mova [r0 + 336], m4
23964
23965 ; mode 24
23966
23967 pmaddwd m4, m1, [r2 + 4 * 16]
23968 pmaddwd m5, m1, [r2 - 1 * 16]
23969
23970 packssdw m4, m5
23971 paddw m4, m2
23972 psraw m4, 5
23973
23974 pmaddwd m6, m1, [r2 - 6 * 16]
23975 pmaddwd m0, m1
23976
23977 packssdw m6, m0
23978 paddw m6, m2
23979 psraw m6, 5
23980
23981 packuswb m4, m6
23982 mova [r0 + 352], m4
23983
23984 ; mode 25
23985
23986 pmaddwd m4, m1, [r2 + 7 * 16]
23987 pmaddwd m5, m1, [r2 + 5 * 16]
23988
23989 packssdw m4, m5
23990 paddw m4, m2
23991 psraw m4, 5
23992
23993 pmaddwd m6, m1, [r2 + 3 * 16]
23994 pmaddwd m1, [r2 + 1 * 16]
23995
23996 packssdw m6, m1
23997 paddw m6, m2
23998 psraw m6, 5
23999
24000 packuswb m4, m6
24001 mova [r0 + 368], m4
24002
24003 ; mode 27
24004
24005 movh m0, [r1 + 1]
24006 pxor m7, m7
24007 punpcklbw m0, m0
24008 psrldq m0, 1
24009 movh m1, m0
24010 psrldq m0, 2
24011 movh m3, m0
24012 psrldq m0, 2
24013 punpcklbw m1, m7 ;m1 = word[1, 2, 2, 3, 3, 4, 4, 5]
24014 punpcklbw m3, m7 ;m3 = word[2, 3, 3, 4, 4, 5, 5, 6]
24015 punpcklbw m0, m7 ;m0 = word[3, 4, 4, 5, 5, 6, 6, 7]
24016
24017 mova m7, [r3 - 3 * 16]
24018
24019 pmaddwd m4, m1, [r3 - 5 * 16]
24020 pmaddwd m5, m1, m7
24021
24022 packssdw m4, m5
24023 paddw m4, m2
24024 psraw m4, 5
24025
24026 pmaddwd m6, m1, [r3 - 1 * 16]
24027 pmaddwd m5, m1, [r3 + 1 * 16]
24028
24029 packssdw m6, m5
24030 paddw m6, m2
24031 psraw m6, 5
24032
24033 packuswb m4, m6
24034 mova [r0 + 400], m4
24035
24036 ; mode 28
24037
24038 pmaddwd m4, m1, [r3 - 2 * 16]
24039 pmaddwd m5, m1, [r3 + 3 * 16]
24040
24041 packssdw m4, m5
24042 paddw m4, m2
24043 psraw m4, 5
24044
24045 pmaddwd m6, m1, [r3 + 8 * 16]
24046 pmaddwd m5, m1, [r2 - 3 * 16]
24047
24048 packssdw m6, m5
24049 paddw m6, m2
24050 psraw m6, 5
24051
24052 packuswb m4, m6
24053 mova [r0 + 416], m4
24054
24055 ; mode 29
24056
24057 pmaddwd m4, m1, [r3 + 2 * 16]
24058 pmaddwd m6, m1, [r2 - 5 * 16]
24059
24060 packssdw m4, m6
24061 paddw m4, m2
24062 psraw m4, 5
24063
24064 pmaddwd m6, m1, [r2 + 4 * 16]
24065 pmaddwd m5, m3, m7
24066
24067 packssdw m6, m5
24068 paddw m6, m2
24069 psraw m6, 5
24070
24071 packuswb m4, m6
24072 mova [r0 + 432], m4
24073
24074 ; mode 30
24075
24076 pmaddwd m4, m1, [r3 + 6 * 16]
24077 pmaddwd m5, m1, [r2 + 3 * 16]
24078
24079 packssdw m4, m5
24080 paddw m4, m2
24081 psraw m4, 5
24082
24083 pmaddwd m6, m3, [r3 + 0 * 16]
24084 pmaddwd m5, m3, [r2 - 3 * 16]
24085
24086 packssdw m6, m5
24087 paddw m6, m2
24088 psraw m6, 5
24089
24090 packuswb m4, m6
24091 mova [r0 + 448], m4
24092 psrldq m4, 4
24093 movh [r0 + 496], m4 ;mode 33 row 0
24094 psrldq m4, 8
24095 movd [r0 + 500], m4 ;mode 33 row 1
24096
24097 ; mode 31
24098
24099 pmaddwd m4, m1, [r2 - 6 * 16]
24100 pmaddwd m5, m3, [r3 - 5 * 16]
24101
24102 packssdw m4, m5
24103 paddw m4, m2
24104 psraw m4, 5
24105
24106 pmaddwd m6, m3, [r2 - 4 * 16]
24107 pmaddwd m7, m0
24108
24109 packssdw m6, m7
24110 paddw m6, m2
24111 psraw m6, 5
24112
24113 packuswb m4, m6
24114 mova [r0 + 464], m4
24115
24116 ; mode 32
24117
24118 pmaddwd m1, [r2 - 2 * 16]
24119 pmaddwd m5, m3, [r3 + 3 * 16]
24120
24121 packssdw m1, m5
24122 paddw m1, m2
24123 psraw m1, 5
24124
24125 pmaddwd m3, [r2 + 8 * 16]
24126 pmaddwd m5, m0, [r2 - 3 * 16]
24127 packssdw m3, m5
24128 paddw m3, m2
24129 psraw m3, 5
24130
24131 packuswb m1, m3
24132 mova [r0 + 480], m1
24133
24134 ; mode 33
24135
24136 pmaddwd m0, [r3 + 7 * 16]
24137 pxor m7, m7
24138 movh m4, [r1 + 4]
24139 punpcklbw m4, m4
24140 psrldq m4, 1
24141 punpcklbw m4, m7
24142
24143 pmaddwd m4, [r3 + 1 * 16]
24144
24145 packssdw m0, m4
24146 paddw m0, m2
24147 psraw m0, 5
24148
24149 packuswb m0, m0
24150 movh [r0 + 504], m0
24151
24152 ; mode 34
24153
24154 movh m7, [r1 + 2]
24155 movd [r0 + 512], m7 ;byte[2, 3, 4, 5]
24156
24157 psrldq m7, 1
24158 movd [r0 + 516], m7 ;byte[3, 4, 5, 6]
24159
24160 psrldq m7, 1
24161 movd [r0 + 520], m7 ;byte[4, 5, 6, 7]
24162
24163 psrldq m7, 1
24164 movd [r0 + 524], m7 ;byte[5, 6, 7, 8]
24165
24166 RET