0
|
1 ;*****************************************************************************
|
|
2 ;* Copyright (C) 2013 x265 project
|
|
3 ;*
|
|
4 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
|
|
5 ;*
|
|
6 ;* This program is free software; you can redistribute it and/or modify
|
|
7 ;* it under the terms of the GNU General Public License as published by
|
|
8 ;* the Free Software Foundation; either version 2 of the License, or
|
|
9 ;* (at your option) any later version.
|
|
10 ;*
|
|
11 ;* This program is distributed in the hope that it will be useful,
|
|
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 ;* GNU General Public License for more details.
|
|
15 ;*
|
|
16 ;* You should have received a copy of the GNU General Public License
|
|
17 ;* along with this program; if not, write to the Free Software
|
|
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
|
19 ;*
|
|
20 ;* This program is also available under a commercial proprietary license.
|
|
21 ;* For more information, contact us at license @ x265.com.
|
|
22 ;*****************************************************************************/
|
|
23
|
|
24 %include "x86inc.asm"
|
|
25 %include "x86util.asm"
|
|
26
|
|
; 32-byte aligned read-only data section (required for AVX2 constants, none local yet)
SECTION_RODATA 32

SECTION .text

; pw_pixel_max: vector of words holding the maximum pixel value for the
; configured bit depth; defined in a shared constants file elsewhere in x265.
cextern pw_pixel_max
|
|
32
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Reconstruct a 4x4 block: dest = clip(src0 + src1), where src0 is the
; prediction (pixel type) and src1 the 16-bit residual.
; Register roles: r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m1, [pw_pixel_max]      ; m1 = upper clamp bound
    pxor    m0, m0                  ; m0 = 0, lower clamp bound
    add     r4, r4                  ; all strides are in elements: double them
    add     r5, r5                  ; to byte strides (16-bit samples)
    add     r1, r1
    movh    m2, [r2]                ; m2 = src0 rows 0/1 (low/high halves)
    movhps  m2, [r2 + r4]
    movh    m3, [r3]                ; m3 = src1 rows 0/1
    movhps  m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]       ; advance sources two rows
    lea     r3, [r3 + r5 * 2]
    movh    m4, [r2]                ; m4 = src0 rows 2/3
    movhps  m4, [r2 + r4]
    movh    m5, [r3]                ; m5 = src1 rows 2/3
    movhps  m5, [r3 + r5]

    paddw   m2, m3                  ; sum rows 0/1
    paddw   m4, m5                  ; sum rows 2/3
    CLIPW2  m2, m4, m0, m1          ; clamp both to [0, pixel_max]

    movh    [r0], m2                ; store rows 0..3 of dest
    movhps  [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]
    movh    [r0], m4
    movhps  [r0 + r1], m4

    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    add      r5, r5                 ; only src1 (int16_t) needs a byte stride;
                                    ; src0/dest are 8-bit so r4/r1 stay as-is
    pmovzxbw m0, [r2]               ; widen src0 row 0 bytes to words
    pmovzxbw m2, [r2 + r4]          ; row 1
    movh     m1, [r3]               ; src1 row 0 (already words)
    movh     m3, [r3 + r5]          ; row 1
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]
    pmovzxbw m4, [r2]               ; rows 2/3 of src0
    pmovzxbw m6, [r2 + r4]
    movh     m5, [r3]               ; rows 2/3 of src1
    movh     m7, [r3 + r5]

    paddw    m0, m1
    paddw    m2, m3
    paddw    m4, m5
    paddw    m6, m7
    packuswb m0, m0                 ; packuswb saturates to [0,255] — this is
    packuswb m2, m2                 ; both the narrowing and the clamp
    packuswb m4, m4
    packuswb m6, m6

    movd     [r0], m0               ; 4 bytes per row
    movd     [r0 + r1], m2
    lea      r0, [r0 + r1 * 2]
    movd     [r0], m4
    movd     [r0 + r1], m6

    RET
%endif
|
|
98
|
|
99
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-4 variant looping over %2 rows, 4 rows per iteration.
; %1 = width (always 4), %2 = height (multiple of 4).
; r6d = remaining 4-row groups.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W4_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m1, [pw_pixel_max]      ; clamp bounds
    pxor    m0, m0
    mov     r6d, %2/4               ; loop count: 4 rows per iteration
    add     r4, r4                  ; element strides -> byte strides
    add     r5, r5
    add     r1, r1
.loop:
    movh    m2, [r2]                ; src0 rows 0/1
    movhps  m2, [r2 + r4]
    movh    m3, [r3]                ; src1 rows 0/1
    movhps  m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    movh    m4, [r2]                ; src0 rows 2/3
    movhps  m4, [r2 + r4]
    movh    m5, [r3]                ; src1 rows 2/3
    movhps  m5, [r3 + r5]
    dec     r6d                     ; decrement early; flags survive lea/mov
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m2, m3
    paddw   m4, m5
    CLIPW2  m2, m4, m0, m1          ; clamp to [0, pixel_max]

    movh    [r0], m2
    movhps  [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]
    movh    [r0], m4
    movhps  [r0 + r1], m4
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, %2/4              ; 4 rows per iteration
    add      r5, r5                 ; only the int16_t src1 stride is doubled
.loop:
    pmovzxbw m0, [r2]               ; widen src0 rows 0/1 to words
    pmovzxbw m2, [r2 + r4]
    movh     m1, [r3]
    movh     m3, [r3 + r5]
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]
    pmovzxbw m4, [r2]               ; rows 2/3
    pmovzxbw m6, [r2 + r4]
    movh     m5, [r3]
    movh     m7, [r3 + r5]
    dec      r6d
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    paddw    m0, m1
    paddw    m2, m3
    paddw    m4, m5
    paddw    m6, m7
    packuswb m0, m0                 ; saturating narrow = clamp to [0,255]
    packuswb m2, m2
    packuswb m4, m4
    packuswb m6, m6

    movd     [r0], m0
    movd     [r0 + r1], m2
    lea      r0, [r0 + r1 * 2]
    movd     [r0], m4
    movd     [r0 + r1], m6
    lea      r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W4_H4 4, 8
|
|
183
|
|
184
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-8 variant, 4 rows per loop iteration; %2 = height (multiple of 4).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W8_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp bounds
    pxor    m4, m4
    mov     r6d, %2/4               ; 4 rows per iteration
    add     r4, r4                  ; element strides -> byte strides
    add     r5, r5
    add     r1, r1
.loop:
    ; rows 0/1: one full xmm (8 words) per row
    movu    m0, [r2]
    movu    m2, [r2 + r4]
    movu    m1, [r3]
    movu    m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + r1], m2

    ; rows 2/3
    movu    m0, [r2]
    movu    m2, [r2 + r4]
    movu    m1, [r3]
    movu    m3, [r3 + r5]
    dec     r6d
    lea     r0, [r0 + r1 * 2]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0], m0
    movu    [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, %2/4
    add      r5, r5                 ; only int16_t src1 stride doubled
.loop:
    pmovzxbw m0, [r2]               ; widen 8 src0 bytes per row to words
    pmovzxbw m2, [r2 + r4]
    movu     m1, [r3]
    movu     m3, [r3 + r5]
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]
    pmovzxbw m4, [r2]               ; rows 2/3
    pmovzxbw m6, [r2 + r4]
    movu     m5, [r3]
    movu     m7, [r3 + r5]
    dec      r6d
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    paddw    m0, m1
    paddw    m2, m3
    paddw    m4, m5
    paddw    m6, m7
    packuswb m0, m0                 ; saturating narrow = clamp to [0,255]
    packuswb m2, m2
    packuswb m4, m4
    packuswb m6, m6

    movh     [r0], m0               ; 8 bytes per row
    movh     [r0 + r1], m2
    lea      r0, [r0 + r1 * 2]
    movh     [r0], m4
    movh     [r0 + r1], m6
    lea      r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W8_H4 8, 8
PIXEL_ADD_PS_W8_H4 8, 16
|
|
275
|
|
276
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-16 SSE variant, 4 rows per loop iteration; %2 = height (multiple of 4).
; In HBD each 16-sample row spans two xmm loads (offsets 0 and 16 bytes).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W16_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp bounds
    pxor    m4, m4
    mov     r6d, %2/4               ; 4 rows per iteration
    add     r4, r4                  ; element strides -> byte strides
    add     r5, r5
    add     r1, r1
.loop:
    ; row 0 (two 8-word halves)
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 1
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2

    ; row 2 (sources already advanced)
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]
    lea     r0, [r0 + r1 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 3
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]
    dec     r6d
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, %2/4
    add      r5, r5                 ; only int16_t src1 stride doubled
.loop:
    ; rows 0/1: 16 src0 bytes widen into two xmm word vectors each
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m4, [r2 + r4]
    pmovzxbw m5, [r2 + r4 + 8]
    movu     m2, [r3]
    movu     m3, [r3 + 16]
    movu     m6, [r3 + r5]
    movu     m7, [r3 + r5 + 16]
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    paddw    m0, m2
    paddw    m1, m3
    paddw    m4, m6
    paddw    m5, m7
    packuswb m0, m1                 ; saturating pack: 16 result bytes per row
    packuswb m4, m5

    movu     [r0], m0
    movu     [r0 + r1], m4

    ; rows 2/3
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m4, [r2 + r4]
    pmovzxbw m5, [r2 + r4 + 8]
    movu     m2, [r3]
    movu     m3, [r3 + 16]
    movu     m6, [r3 + r5]
    movu     m7, [r3 + r5 + 16]
    dec      r6d
    lea      r0, [r0 + r1 * 2]
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    paddw    m0, m2
    paddw    m1, m3
    paddw    m4, m6
    paddw    m5, m7
    packuswb m0, m1
    packuswb m4, m5

    movu     [r0], m0
    movu     [r0 + r1], m4
    lea      r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro
PIXEL_ADD_PS_W16_H4 16, 16
PIXEL_ADD_PS_W16_H4 16, 32
|
|
405
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_16x%1(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; AVX2 width-16 variant; %1 = height (multiple of 4), 4 rows per iteration.
; HBD path needs x86-64 (uses r7-r9 for 3*stride offsets).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W16_H4_avx2 1
%if HIGH_BIT_DEPTH
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m3, [pw_pixel_max]      ; clamp bounds
    pxor    m2, m2
    mov     r6d, %1/4               ; 4 rows per iteration
    add     r4d, r4d                ; element strides -> byte strides
    add     r5d, r5d                ; (32-bit add zero-extends; strides fit 32 bits)
    add     r1d, r1d
    lea     r7, [r4 * 3]            ; precomputed 3*stride for row 3 addressing
    lea     r8, [r5 * 3]
    lea     r9, [r1 * 3]

.loop:
    ; each 16-sample HBD row is exactly one ymm
    movu    m0, [r2]
    movu    m1, [r3]
    paddw   m0, m1
    CLIPW   m0, m2, m3              ; clamp to [0, pixel_max]
    movu    [r0], m0

    movu    m0, [r2 + r4]
    movu    m1, [r3 + r5]
    paddw   m0, m1
    CLIPW   m0, m2, m3
    movu    [r0 + r1], m0

    movu    m0, [r2 + r4 * 2]
    movu    m1, [r3 + r5 * 2]
    paddw   m0, m1
    CLIPW   m0, m2, m3
    movu    [r0 + r1 * 2], m0

    movu    m0, [r2 + r7]           ; row 3 via 3*stride
    movu    m1, [r3 + r8]
    paddw   m0, m1
    CLIPW   m0, m2, m3
    movu    [r0 + r9], m0

    dec     r6d
    lea     r0, [r0 + r1 * 4]       ; advance all pointers 4 rows
    lea     r2, [r2 + r4 * 4]
    lea     r3, [r3 + r5 * 4]
    jnz .loop
    RET
%endif
%else
INIT_YMM avx2
cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, %1/4
    add      r5, r5                 ; only int16_t src1 stride doubled
.loop:

    pmovzxbw m0, [r2]               ; row 0 of src0
    pmovzxbw m1, [r2 + r4]          ; row 1 of src0
    movu     m2, [r3]               ; row 0 of src1
    movu     m3, [r3 + r5]          ; row 1 of src1
    paddw    m0, m2
    paddw    m1, m3
    packuswb m0, m1                 ; per-128-lane pack: bytes interleave as
                                    ; [row0 lo, row1 lo | row0 hi, row1 hi]

    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    pmovzxbw m2, [r2]               ; row 2 of src0
    pmovzxbw m3, [r2 + r4]          ; row 3 of src0
    movu     m4, [r3]               ; row 2 of src1
    movu     m5, [r3 + r5]          ; row 3 of src1
    paddw    m2, m4
    paddw    m3, m5
    packuswb m2, m3

    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    vpermq   m0, m0, 11011000b      ; 0xD8: reorder qwords so low xmm = row 0,
                                    ; high xmm = row 1 (undo lane interleave)
    movu     [r0], xm0              ; row 0 of dst
    vextracti128 xm3, m0, 1
    movu     [r0 + r1], xm3         ; row 1 of dst

    lea      r0, [r0 + r1 * 2]
    vpermq   m2, m2, 11011000b
    movu     [r0], xm2              ; row 2 of dst
    vextracti128 xm3, m2, 1
    movu     [r0 + r1], xm3         ; row 3 of dst

    lea      r0, [r0 + r1 * 2]

    dec      r6d
    jnz .loop

    RET
%endif
%endmacro

PIXEL_ADD_PS_W16_H4_avx2 16
PIXEL_ADD_PS_W16_H4_avx2 32
|
|
507
|
|
508
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-32 SSE variant, 2 rows per loop iteration; %2 = height (even).
; In HBD a 32-sample row is 64 bytes = four xmm loads.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W32_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp bounds
    pxor    m4, m4
    mov     r6d, %2/2               ; 2 rows per iteration
    add     r4, r4                  ; element strides -> byte strides
    add     r5, r5
    add     r1, r1
.loop:
    ; row 0, first 16 samples
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 0, second 16 samples
    movu    m0, [r2 + 32]
    movu    m2, [r2 + 48]
    movu    m1, [r3 + 32]
    movu    m3, [r3 + 48]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 32], m0
    movu    [r0 + 48], m2

    ; row 1, first 16 samples
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2

    ; row 1, second 16 samples
    movu    m0, [r2 + r4 + 32]
    movu    m2, [r2 + r4 + 48]
    movu    m1, [r3 + r5 + 32]
    movu    m3, [r3 + r5 + 48]
    dec     r6d
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 32], m0
    movu    [r0 + r1 + 48], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, %2/2
    add      r5, r5                 ; only int16_t src1 stride doubled
.loop:
    ; row 0: 32 src0 bytes widen into four xmm word vectors
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m2, [r2 + 16]
    pmovzxbw m3, [r2 + 24]
    movu     m4, [r3]
    movu     m5, [r3 + 16]
    movu     m6, [r3 + 32]
    movu     m7, [r3 + 48]

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1                 ; saturating pack: 32 result bytes
    packuswb m2, m3

    movu     [r0], m0
    movu     [r0 + 16], m2

    ; row 1
    pmovzxbw m0, [r2 + r4]
    pmovzxbw m1, [r2 + r4 + 8]
    pmovzxbw m2, [r2 + r4 + 16]
    pmovzxbw m3, [r2 + r4 + 24]
    movu     m4, [r3 + r5]
    movu     m5, [r3 + r5 + 16]
    movu     m6, [r3 + r5 + 32]
    movu     m7, [r3 + r5 + 48]
    dec      r6d
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu     [r0 + r1], m0
    movu     [r0 + r1 + 16], m2
    lea      r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro
PIXEL_ADD_PS_W32_H2 32, 32
PIXEL_ADD_PS_W32_H2 32, 64
|
|
631
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_32x%1(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; AVX2 width-32 variant; %1 = height (multiple of 4), 4 rows per iteration.
; Both paths need x86-64 (use r7-r9 as precomputed 3*stride offsets).
; Fix: removed a stray doubled '+' in the row-2 src1 address
; ([r3 + + r5 * 2 + 32] -> [r3 + r5 * 2 + 32]); NASM parsed it as a unary
; plus so the generated code is unchanged.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W32_H4_avx2 1
%if HIGH_BIT_DEPTH
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp bounds
    pxor    m4, m4
    mov     r6d, %1/4               ; 4 rows per iteration
    add     r4d, r4d                ; element strides -> byte strides
    add     r5d, r5d
    add     r1d, r1d
    lea     r7, [r4 * 3]            ; precomputed 3*stride for row 3
    lea     r8, [r5 * 3]
    lea     r9, [r1 * 3]

.loop:
    ; row 0: 32 HBD samples = two ymm halves at offsets 0 and 32
    movu    m0, [r2]
    movu    m2, [r2 + 32]
    movu    m1, [r3]
    movu    m3, [r3 + 32]
    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + 32], m2

    ; row 1
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 32]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 32]
    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 32], m2

    ; row 2
    movu    m0, [r2 + r4 * 2]
    movu    m2, [r2 + r4 * 2 + 32]
    movu    m1, [r3 + r5 * 2]
    movu    m3, [r3 + r5 * 2 + 32]
    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 * 2], m0
    movu    [r0 + r1 * 2 + 32], m2

    ; row 3 via 3*stride
    movu    m0, [r2 + r7]
    movu    m2, [r2 + r7 + 32]
    movu    m1, [r3 + r8]
    movu    m3, [r3 + r8 + 32]
    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r9], m0
    movu    [r0 + r9 + 32], m2

    dec     r6d
    lea     r0, [r0 + r1 * 4]       ; advance all pointers 4 rows
    lea     r2, [r2 + r4 * 4]
    lea     r3, [r3 + r5 * 4]
    jnz .loop
    RET
%endif
%else
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, %1/4
    add      r5, r5                 ; only int16_t src1 stride doubled
    lea      r7, [r4 * 3]           ; precomputed 3*stride for row 3
    lea      r8, [r5 * 3]
    lea      r9, [r1 * 3]
.loop:
    pmovzxbw m0, [r2]               ; first half of row 0 of src0
    pmovzxbw m1, [r2 + 16]          ; second half of row 0 of src0
    movu     m2, [r3]               ; first half of row 0 of src1
    movu     m3, [r3 + 32]          ; second half of row 0 of src1

    paddw    m0, m2
    paddw    m1, m3
    packuswb m0, m1                 ; saturating pack; lanes interleave
    vpermq   m0, m0, 11011000b      ; 0xD8 restores in-order bytes
    movu     [r0], m0               ; row 0 of dst

    pmovzxbw m0, [r2 + r4]          ; first half of row 1 of src0
    pmovzxbw m1, [r2 + r4 + 16]     ; second half of row 1 of src0
    movu     m2, [r3 + r5]          ; first half of row 1 of src1
    movu     m3, [r3 + r5 + 32]     ; second half of row 1 of src1

    paddw    m0, m2
    paddw    m1, m3
    packuswb m0, m1
    vpermq   m0, m0, 11011000b
    movu     [r0 + r1], m0          ; row 1 of dst

    pmovzxbw m0, [r2 + r4 * 2]      ; first half of row 2 of src0
    pmovzxbw m1, [r2 + r4 * 2 + 16] ; second half of row 2 of src0
    movu     m2, [r3 + r5 * 2]      ; first half of row 2 of src1
    movu     m3, [r3 + r5 * 2 + 32] ; second half of row 2 of src1

    paddw    m0, m2
    paddw    m1, m3
    packuswb m0, m1
    vpermq   m0, m0, 11011000b
    movu     [r0 + r1 * 2], m0      ; row 2 of dst

    pmovzxbw m0, [r2 + r7]          ; first half of row 3 of src0
    pmovzxbw m1, [r2 + r7 + 16]     ; second half of row 3 of src0
    movu     m2, [r3 + r8]          ; first half of row 3 of src1
    movu     m3, [r3 + r8 + 32]     ; second half of row 3 of src1

    paddw    m0, m2
    paddw    m1, m3
    packuswb m0, m1
    vpermq   m0, m0, 11011000b
    movu     [r0 + r9], m0          ; row 3 of dst

    lea      r2, [r2 + r4 * 4]      ; advance all pointers 4 rows
    lea      r3, [r3 + r5 * 4]
    lea      r0, [r0 + r1 * 4]

    dec      r6d
    jnz .loop
    RET
%endif
%endif
%endmacro

PIXEL_ADD_PS_W32_H4_avx2 32
PIXEL_ADD_PS_W32_H4_avx2 64
|
|
769
|
|
770
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-64 SSE variant, 2 rows per loop iteration; %2 = height (even).
; In HBD a 64-sample row is 128 bytes = eight xmm loads (offsets 0..112).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W64_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp bounds
    pxor    m4, m4
    mov     r6d, %2/2               ; 2 rows per iteration
    add     r4, r4                  ; element strides -> byte strides
    add     r5, r5
    add     r1, r1
.loop:
    ; row 0, samples 0-15
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 0, samples 16-31
    movu    m0, [r2 + 32]
    movu    m2, [r2 + 48]
    movu    m1, [r3 + 32]
    movu    m3, [r3 + 48]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 32], m0
    movu    [r0 + 48], m2

    ; row 0, samples 32-47
    movu    m0, [r2 + 64]
    movu    m2, [r2 + 80]
    movu    m1, [r3 + 64]
    movu    m3, [r3 + 80]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 64], m0
    movu    [r0 + 80], m2

    ; row 0, samples 48-63
    movu    m0, [r2 + 96]
    movu    m2, [r2 + 112]
    movu    m1, [r3 + 96]
    movu    m3, [r3 + 112]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 96], m0
    movu    [r0 + 112], m2

    ; row 1, samples 0-15
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2

    ; row 1, samples 16-31
    movu    m0, [r2 + r4 + 32]
    movu    m2, [r2 + r4 + 48]
    movu    m1, [r3 + r5 + 32]
    movu    m3, [r3 + r5 + 48]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 32], m0
    movu    [r0 + r1 + 48], m2

    ; row 1, samples 32-47
    movu    m0, [r2 + r4 + 64]
    movu    m2, [r2 + r4 + 80]
    movu    m1, [r3 + r5 + 64]
    movu    m3, [r3 + r5 + 80]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 64], m0
    movu    [r0 + r1 + 80], m2

    ; row 1, samples 48-63
    movu    m0, [r2 + r4 + 96]
    movu    m2, [r2 + r4 + 112]
    movu    m1, [r3 + r5 + 96]
    movu    m3, [r3 + r5 + 112]
    dec     r6d
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 96], m0
    movu    [r0 + r1 + 112], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, %2/2
    add      r5, r5                 ; only int16_t src1 stride doubled
.loop:
    ; row 0, samples 0-31
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m2, [r2 + 16]
    pmovzxbw m3, [r2 + 24]
    movu     m4, [r3]
    movu     m5, [r3 + 16]
    movu     m6, [r3 + 32]
    movu     m7, [r3 + 48]

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1                 ; saturating pack: 32 result bytes
    packuswb m2, m3

    movu     [r0], m0
    movu     [r0 + 16], m2

    ; row 0, samples 32-63 (src1 words are at byte offsets 64..112)
    pmovzxbw m0, [r2 + 32]
    pmovzxbw m1, [r2 + 40]
    pmovzxbw m2, [r2 + 48]
    pmovzxbw m3, [r2 + 56]
    movu     m4, [r3 + 64]
    movu     m5, [r3 + 80]
    movu     m6, [r3 + 96]
    movu     m7, [r3 + 112]

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu     [r0 + 32], m0
    movu     [r0 + 48], m2

    ; row 1, samples 0-31
    pmovzxbw m0, [r2 + r4]
    pmovzxbw m1, [r2 + r4 + 8]
    pmovzxbw m2, [r2 + r4 + 16]
    pmovzxbw m3, [r2 + r4 + 24]
    movu     m4, [r3 + r5]
    movu     m5, [r3 + r5 + 16]
    movu     m6, [r3 + r5 + 32]
    movu     m7, [r3 + r5 + 48]

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu     [r0 + r1], m0
    movu     [r0 + r1 + 16], m2

    ; row 1, samples 32-63
    pmovzxbw m0, [r2 + r4 + 32]
    pmovzxbw m1, [r2 + r4 + 40]
    pmovzxbw m2, [r2 + r4 + 48]
    pmovzxbw m3, [r2 + r4 + 56]
    movu     m4, [r3 + r5 + 64]
    movu     m5, [r3 + r5 + 80]
    movu     m6, [r3 + r5 + 96]
    movu     m7, [r3 + r5 + 112]
    dec      r6d
    lea      r2, [r2 + r4 * 2]
    lea      r3, [r3 + r5 * 2]

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu     [r0 + r1 + 32], m0
    movu     [r0 + r1 + 48], m2
    lea      r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro
PIXEL_ADD_PS_W64_H2 64, 64
|
|
978
|
|
;-----------------------------------------------------------------------------
; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;
; AVX2 64x64 kernel. HBD path processes 4 rows/iteration (16 iterations) and
; needs x86-64 for r7-r9 (3*stride offsets); the 8-bit path processes
; 2 rows/iteration (32 iterations).
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_add_ps_64x64, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp bounds
    pxor    m4, m4
    mov     r6d, 16                 ; 64 rows / 4 rows per iteration
    add     r4d, r4d                ; element strides -> byte strides
    add     r5d, r5d
    add     r1d, r1d
    lea     r7, [r4 * 3]            ; precomputed 3*stride for row 3
    lea     r8, [r5 * 3]
    lea     r9, [r1 * 3]

.loop:
    ; row 0, samples 0-31 (each ymm = 16 HBD samples)
    movu    m0, [r2]
    movu    m1, [r2 + 32]
    movu    m2, [r3]
    movu    m3, [r3 + 32]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5          ; clamp to [0, pixel_max]
    movu    [r0], m0
    movu    [r0 + 32], m1

    ; row 0, samples 32-63
    movu    m0, [r2 + 64]
    movu    m1, [r2 + 96]
    movu    m2, [r3 + 64]
    movu    m3, [r3 + 96]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5
    movu    [r0 + 64], m0
    movu    [r0 + 96], m1

    ; row 1, samples 0-31
    movu    m0, [r2 + r4]
    movu    m1, [r2 + r4 + 32]
    movu    m2, [r3 + r5]
    movu    m3, [r3 + r5 + 32]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5
    movu    [r0 + r1], m0
    movu    [r0 + r1 + 32], m1

    ; row 1, samples 32-63
    movu    m0, [r2 + r4 + 64]
    movu    m1, [r2 + r4 + 96]
    movu    m2, [r3 + r5 + 64]
    movu    m3, [r3 + r5 + 96]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5
    movu    [r0 + r1 + 64], m0
    movu    [r0 + r1 + 96], m1

    ; row 2, samples 0-31
    movu    m0, [r2 + r4 * 2]
    movu    m1, [r2 + r4 * 2 + 32]
    movu    m2, [r3 + r5 * 2]
    movu    m3, [r3 + r5 * 2 + 32]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5
    movu    [r0 + r1 * 2], m0
    movu    [r0 + r1 * 2 + 32], m1

    ; row 2, samples 32-63
    movu    m0, [r2 + r4 * 2 + 64]
    movu    m1, [r2 + r4 * 2 + 96]
    movu    m2, [r3 + r5 * 2 + 64]
    movu    m3, [r3 + r5 * 2 + 96]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5
    movu    [r0 + r1 * 2 + 64], m0
    movu    [r0 + r1 * 2 + 96], m1

    ; row 3 (3*stride), samples 0-31
    movu    m0, [r2 + r7]
    movu    m1, [r2 + r7 + 32]
    movu    m2, [r3 + r8]
    movu    m3, [r3 + r8 + 32]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5
    movu    [r0 + r9], m0
    movu    [r0 + r9 + 32], m1

    ; row 3, samples 32-63
    movu    m0, [r2 + r7 + 64]
    movu    m1, [r2 + r7 + 96]
    movu    m2, [r3 + r8 + 64]
    movu    m3, [r3 + r8 + 96]
    paddw   m0, m2
    paddw   m1, m3

    CLIPW2  m0, m1, m4, m5
    movu    [r0 + r9 + 64], m0
    movu    [r0 + r9 + 96], m1

    dec     r6d
    lea     r0, [r0 + r1 * 4]       ; advance all pointers 4 rows
    lea     r2, [r2 + r4 * 4]
    lea     r3, [r3 + r5 * 4]
    jnz .loop
    RET
%endif
%else
INIT_YMM avx2
cglobal pixel_add_ps_64x64, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov      r6d, 32                ; 64 rows / 2 rows per iteration
    add      r5, r5                 ; only int16_t src1 stride doubled
.loop:
    pmovzxbw m0, [r2]               ; first 16 of row 0 of src0
    pmovzxbw m1, [r2 + 16]          ; second 16 of row 0 of src0
    pmovzxbw m2, [r2 + 32]          ; third 16 of row 0 of src0
    pmovzxbw m3, [r2 + 48]          ; fourth 16 of row 0 of src0
    movu     m4, [r3]               ; first 16 of row 0 of src1
    movu     m5, [r3 + 32]          ; second 16 of row 0 of src1
    movu     m6, [r3 + 64]          ; third 16 of row 0 of src1
    movu     m7, [r3 + 96]          ; fourth 16 of row 0 of src1

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1                 ; saturating pack; lanes interleave
    packuswb m2, m3
    vpermq   m0, m0, 11011000b      ; 0xD8 restores in-order bytes
    movu     [r0], m0               ; first 32 of row 0 of dst
    vpermq   m2, m2, 11011000b
    movu     [r0 + 32], m2          ; second 32 of row 0 of dst

    pmovzxbw m0, [r2 + r4]          ; first 16 of row 1 of src0
    pmovzxbw m1, [r2 + r4 + 16]     ; second 16 of row 1 of src0
    pmovzxbw m2, [r2 + r4 + 32]     ; third 16 of row 1 of src0
    pmovzxbw m3, [r2 + r4 + 48]     ; fourth 16 of row 1 of src0
    movu     m4, [r3 + r5]          ; first 16 of row 1 of src1
    movu     m5, [r3 + r5 + 32]     ; second 16 of row 1 of src1
    movu     m6, [r3 + r5 + 64]     ; third 16 of row 1 of src1
    movu     m7, [r3 + r5 + 96]     ; fourth 16 of row 1 of src1

    paddw    m0, m4
    paddw    m1, m5
    paddw    m2, m6
    paddw    m3, m7
    packuswb m0, m1
    packuswb m2, m3
    vpermq   m0, m0, 11011000b
    movu     [r0 + r1], m0          ; first 32 of row 1 of dst
    vpermq   m2, m2, 11011000b
    movu     [r0 + r1 + 32], m2     ; second 32 of row 1 of dst

    lea      r2, [r2 + r4 * 2]      ; advance all pointers 2 rows
    lea      r3, [r3 + r5 * 2]
    lea      r0, [r0 + r1 * 2]

    dec      r6d
    jnz .loop
    RET

%endif
|