diff x265/source/common/x86/pixeladd8.asm @ 0:772086c29cc7
Initial import.
author  Matti Hamalainen <ccr@tnsp.org>
date    Wed, 16 Nov 2016 11:16:33 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/x265/source/common/x86/pixeladd8.asm	Wed Nov 16 11:16:33 2016 +0200
@@ -0,0 +1,1146 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at license @ x265.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+SECTION .text
+
+cextern pw_pixel_max
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_4x4(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_add_ps_4x4, 6, 6, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m1, [pw_pixel_max]
+    pxor      m0, m0
+    add       r4, r4
+    add       r5, r5
+    add       r1, r1
+    movh      m2, [r2]
+    movhps    m2, [r2 + r4]
+    movh      m3, [r3]
+    movhps    m3, [r3 + r5]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+    movh      m4, [r2]
+    movhps    m4, [r2 + r4]
+    movh      m5, [r3]
+    movhps    m5, [r3 + r5]
+
+    paddw     m2, m3
+    paddw     m4, m5
+    CLIPW2    m2, m4, m0, m1
+
+    movh      [r0], m2
+    movhps    [r0 + r1], m2
+    lea       r0, [r0 + r1 * 2]
+    movh      [r0], m4
+    movhps    [r0 + r1], m4
+
+    RET
+%else
+INIT_XMM sse4
+cglobal pixel_add_ps_4x4, 6, 6, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    add       r5, r5
+    pmovzxbw  m0, [r2]
+    pmovzxbw  m2, [r2 + r4]
+    movh      m1, [r3]
+    movh      m3, [r3 + r5]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+    pmovzxbw  m4, [r2]
+    pmovzxbw  m6, [r2 + r4]
+    movh      m5, [r3]
+    movh      m7, [r3 + r5]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    paddw     m4, m5
+    paddw     m6, m7
+    packuswb  m0, m0
+    packuswb  m2, m2
+    packuswb  m4, m4
+    packuswb  m6, m6
+
+    movd      [r0], m0
+    movd      [r0 + r1], m2
+    lea       r0, [r0 + r1 * 2]
+    movd      [r0], m4
+    movd      [r0 + r1], m6
+
+    RET
+%endif
+
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_4x%2(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_ADD_PS_W4_H4 2
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m1, [pw_pixel_max]
+    pxor      m0, m0
+    mov       r6d, %2/4
+    add       r4, r4
+    add       r5, r5
+    add       r1, r1
+.loop:
+    movh      m2, [r2]
+    movhps    m2, [r2 + r4]
+    movh      m3, [r3]
+    movhps    m3, [r3 + r5]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+    movh      m4, [r2]
+    movhps    m4, [r2 + r4]
+    movh      m5, [r3]
+    movhps    m5, [r3 + r5]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m2, m3
+    paddw     m4, m5
+    CLIPW2    m2, m4, m0, m1
+
+    movh      [r0], m2
+    movhps    [r0 + r1], m2
+    lea       r0, [r0 + r1 * 2]
+    movh      [r0], m4
+    movhps    [r0 + r1], m4
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%else
+INIT_XMM sse4
+cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, %2/4
+    add       r5, r5
+.loop:
+    pmovzxbw  m0, [r2]
+    pmovzxbw  m2, [r2 + r4]
+    movh      m1, [r3]
+    movh      m3, [r3 + r5]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+    pmovzxbw  m4, [r2]
+    pmovzxbw  m6, [r2 + r4]
+    movh      m5, [r3]
+    movh      m7, [r3 + r5]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    paddw     m4, m5
+    paddw     m6, m7
+    packuswb  m0, m0
+    packuswb  m2, m2
+    packuswb  m4, m4
+    packuswb  m6, m6
+
+    movd      [r0], m0
+    movd      [r0 + r1], m2
+    lea       r0, [r0 + r1 * 2]
+    movd      [r0], m4
+    movd      [r0 + r1], m6
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%endif
+%endmacro
+
+PIXEL_ADD_PS_W4_H4 4, 8
+
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_8x%2(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_ADD_PS_W8_H4 2
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m5, [pw_pixel_max]
+    pxor      m4, m4
+    mov       r6d, %2/4
+    add       r4, r4
+    add       r5, r5
+    add       r1, r1
+.loop:
+    movu      m0, [r2]
+    movu      m2, [r2 + r4]
+    movu      m1, [r3]
+    movu      m3, [r3 + r5]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + r1], m2
+
+    movu      m0, [r2]
+    movu      m2, [r2 + r4]
+    movu      m1, [r3]
+    movu      m3, [r3 + r5]
+    dec       r6d
+    lea       r0, [r0 + r1 * 2]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + r1], m2
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%else
+INIT_XMM sse4
+cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, %2/4
+    add       r5, r5
+.loop:
+    pmovzxbw  m0, [r2]
+    pmovzxbw  m2, [r2 + r4]
+    movu      m1, [r3]
+    movu      m3, [r3 + r5]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+    pmovzxbw  m4, [r2]
+    pmovzxbw  m6, [r2 + r4]
+    movu      m5, [r3]
+    movu      m7, [r3 + r5]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    paddw     m4, m5
+    paddw     m6, m7
+    packuswb  m0, m0
+    packuswb  m2, m2
+    packuswb  m4, m4
+    packuswb  m6, m6
+
+    movh      [r0], m0
+    movh      [r0 + r1], m2
+    lea       r0, [r0 + r1 * 2]
+    movh      [r0], m4
+    movh      [r0 + r1], m6
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%endif
+%endmacro
+
+PIXEL_ADD_PS_W8_H4 8, 8
+PIXEL_ADD_PS_W8_H4 8, 16
+
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_16x%2(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_ADD_PS_W16_H4 2
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m5, [pw_pixel_max]
+    pxor      m4, m4
+    mov       r6d, %2/4
+    add       r4, r4
+    add       r5, r5
+    add       r1, r1
+.loop:
+    movu      m0, [r2]
+    movu      m2, [r2 + 16]
+    movu      m1, [r3]
+    movu      m3, [r3 + 16]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + 16], m2
+
+    movu      m0, [r2 + r4]
+    movu      m2, [r2 + r4 + 16]
+    movu      m1, [r3 + r5]
+    movu      m3, [r3 + r5 + 16]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 16], m2
+
+    movu      m0, [r2]
+    movu      m2, [r2 + 16]
+    movu      m1, [r3]
+    movu      m3, [r3 + 16]
+    lea       r0, [r0 + r1 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + 16], m2
+
+    movu      m0, [r2 + r4]
+    movu      m2, [r2 + r4 + 16]
+    movu      m1, [r3 + r5]
+    movu      m3, [r3 + r5 + 16]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 16], m2
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%else
+INIT_XMM sse4
+cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, %2/4
+    add       r5, r5
+.loop:
+    pmovzxbw  m0, [r2]
+    pmovzxbw  m1, [r2 + 8]
+    pmovzxbw  m4, [r2 + r4]
+    pmovzxbw  m5, [r2 + r4 + 8]
+    movu      m2, [r3]
+    movu      m3, [r3 + 16]
+    movu      m6, [r3 + r5]
+    movu      m7, [r3 + r5 + 16]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m2
+    paddw     m1, m3
+    paddw     m4, m6
+    paddw     m5, m7
+    packuswb  m0, m1
+    packuswb  m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + r1], m4
+
+    pmovzxbw  m0, [r2]
+    pmovzxbw  m1, [r2 + 8]
+    pmovzxbw  m4, [r2 + r4]
+    pmovzxbw  m5, [r2 + r4 + 8]
+    movu      m2, [r3]
+    movu      m3, [r3 + 16]
+    movu      m6, [r3 + r5]
+    movu      m7, [r3 + r5 + 16]
+    dec       r6d
+    lea       r0, [r0 + r1 * 2]
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m2
+    paddw     m1, m3
+    paddw     m4, m6
+    paddw     m5, m7
+    packuswb  m0, m1
+    packuswb  m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + r1], m4
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%endif
+%endmacro
+PIXEL_ADD_PS_W16_H4 16, 16
+PIXEL_ADD_PS_W16_H4 16, 32
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_16x%1(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_ADD_PS_W16_H4_avx2 1
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m3, [pw_pixel_max]
+    pxor      m2, m2
+    mov       r6d, %1/4
+    add       r4d, r4d
+    add       r5d, r5d
+    add       r1d, r1d
+    lea       r7, [r4 * 3]
+    lea       r8, [r5 * 3]
+    lea       r9, [r1 * 3]
+
+.loop:
+    movu      m0, [r2]
+    movu      m1, [r3]
+    paddw     m0, m1
+    CLIPW     m0, m2, m3
+    movu      [r0], m0
+
+    movu      m0, [r2 + r4]
+    movu      m1, [r3 + r5]
+    paddw     m0, m1
+    CLIPW     m0, m2, m3
+    movu      [r0 + r1], m0
+
+    movu      m0, [r2 + r4 * 2]
+    movu      m1, [r3 + r5 * 2]
+    paddw     m0, m1
+    CLIPW     m0, m2, m3
+    movu      [r0 + r1 * 2], m0
+
+    movu      m0, [r2 + r7]
+    movu      m1, [r3 + r8]
+    paddw     m0, m1
+    CLIPW     m0, m2, m3
+    movu      [r0 + r9], m0
+
+    dec       r6d
+    lea       r0, [r0 + r1 * 4]
+    lea       r2, [r2 + r4 * 4]
+    lea       r3, [r3 + r5 * 4]
+    jnz       .loop
+    RET
+%endif
+%else
+INIT_YMM avx2
+cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, %1/4
+    add       r5, r5
+.loop:
+
+    pmovzxbw  m0, [r2]                ; row 0 of src0
+    pmovzxbw  m1, [r2 + r4]           ; row 1 of src0
+    movu      m2, [r3]                ; row 0 of src1
+    movu      m3, [r3 + r5]           ; row 1 of src1
+    paddw     m0, m2
+    paddw     m1, m3
+    packuswb  m0, m1
+
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    pmovzxbw  m2, [r2]                ; row 2 of src0
+    pmovzxbw  m3, [r2 + r4]           ; row 3 of src0
+    movu      m4, [r3]                ; row 2 of src1
+    movu      m5, [r3 + r5]           ; row 3 of src1
+    paddw     m2, m4
+    paddw     m3, m5
+    packuswb  m2, m3
+
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    vpermq    m0, m0, 11011000b
+    movu      [r0], xm0               ; row 0 of dst
+    vextracti128 xm3, m0, 1
+    movu      [r0 + r1], xm3          ; row 1 of dst
+
+    lea       r0, [r0 + r1 * 2]
+    vpermq    m2, m2, 11011000b
+    movu      [r0], xm2               ; row 2 of dst
+    vextracti128 xm3, m2, 1
+    movu      [r0 + r1], xm3          ; row 3 of dst
+
+    lea       r0, [r0 + r1 * 2]
+
+    dec       r6d
+    jnz       .loop
+
+    RET
+%endif
+%endmacro
+
+PIXEL_ADD_PS_W16_H4_avx2 16
+PIXEL_ADD_PS_W16_H4_avx2 32
+
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_32x%2(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_ADD_PS_W32_H2 2
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m5, [pw_pixel_max]
+    pxor      m4, m4
+    mov       r6d, %2/2
+    add       r4, r4
+    add       r5, r5
+    add       r1, r1
+.loop:
+    movu      m0, [r2]
+    movu      m2, [r2 + 16]
+    movu      m1, [r3]
+    movu      m3, [r3 + 16]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + 16], m2
+
+    movu      m0, [r2 + 32]
+    movu      m2, [r2 + 48]
+    movu      m1, [r3 + 32]
+    movu      m3, [r3 + 48]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + 32], m0
+    movu      [r0 + 48], m2
+
+    movu      m0, [r2 + r4]
+    movu      m2, [r2 + r4 + 16]
+    movu      m1, [r3 + r5]
+    movu      m3, [r3 + r5 + 16]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 16], m2
+
+    movu      m0, [r2 + r4 + 32]
+    movu      m2, [r2 + r4 + 48]
+    movu      m1, [r3 + r5 + 32]
+    movu      m3, [r3 + r5 + 48]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1 + 32], m0
+    movu      [r0 + r1 + 48], m2
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%else
+INIT_XMM sse4
+cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, %2/2
+    add       r5, r5
+.loop:
+    pmovzxbw  m0, [r2]
+    pmovzxbw  m1, [r2 + 8]
+    pmovzxbw  m2, [r2 + 16]
+    pmovzxbw  m3, [r2 + 24]
+    movu      m4, [r3]
+    movu      m5, [r3 + 16]
+    movu      m6, [r3 + 32]
+    movu      m7, [r3 + 48]
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+
+    movu      [r0], m0
+    movu      [r0 + 16], m2
+
+    pmovzxbw  m0, [r2 + r4]
+    pmovzxbw  m1, [r2 + r4 + 8]
+    pmovzxbw  m2, [r2 + r4 + 16]
+    pmovzxbw  m3, [r2 + r4 + 24]
+    movu      m4, [r3 + r5]
+    movu      m5, [r3 + r5 + 16]
+    movu      m6, [r3 + r5 + 32]
+    movu      m7, [r3 + r5 + 48]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 16], m2
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%endif
+%endmacro
+PIXEL_ADD_PS_W32_H2 32, 32
+PIXEL_ADD_PS_W32_H2 32, 64
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_32x%1(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_ADD_PS_W32_H4_avx2 1
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m5, [pw_pixel_max]
+    pxor      m4, m4
+    mov       r6d, %1/4
+    add       r4d, r4d
+    add       r5d, r5d
+    add       r1d, r1d
+    lea       r7, [r4 * 3]
+    lea       r8, [r5 * 3]
+    lea       r9, [r1 * 3]
+
+.loop:
+    movu      m0, [r2]
+    movu      m2, [r2 + 32]
+    movu      m1, [r3]
+    movu      m3, [r3 + 32]
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + 32], m2
+
+    movu      m0, [r2 + r4]
+    movu      m2, [r2 + r4 + 32]
+    movu      m1, [r3 + r5]
+    movu      m3, [r3 + r5 + 32]
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 32], m2
+
+    movu      m0, [r2 + r4 * 2]
+    movu      m2, [r2 + r4 * 2 + 32]
+    movu      m1, [r3 + r5 * 2]
+    movu      m3, [r3 + r5 * 2 + 32]
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1 * 2], m0
+    movu      [r0 + r1 * 2 + 32], m2
+
+    movu      m0, [r2 + r7]
+    movu      m2, [r2 + r7 + 32]
+    movu      m1, [r3 + r8]
+    movu      m3, [r3 + r8 + 32]
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r9], m0
+    movu      [r0 + r9 + 32], m2
+
+    dec       r6d
+    lea       r0, [r0 + r1 * 4]
+    lea       r2, [r2 + r4 * 4]
+    lea       r3, [r3 + r5 * 4]
+    jnz       .loop
+    RET
+%endif
+%else
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, %1/4
+    add       r5, r5
+    lea       r7, [r4 * 3]
+    lea       r8, [r5 * 3]
+    lea       r9, [r1 * 3]
+.loop:
+    pmovzxbw  m0, [r2]                ; first half of row 0 of src0
+    pmovzxbw  m1, [r2 + 16]           ; second half of row 0 of src0
+    movu      m2, [r3]                ; first half of row 0 of src1
+    movu      m3, [r3 + 32]           ; second half of row 0 of src1
+
+    paddw     m0, m2
+    paddw     m1, m3
+    packuswb  m0, m1
+    vpermq    m0, m0, 11011000b
+    movu      [r0], m0                ; row 0 of dst
+
+    pmovzxbw  m0, [r2 + r4]           ; first half of row 1 of src0
+    pmovzxbw  m1, [r2 + r4 + 16]      ; second half of row 1 of src0
+    movu      m2, [r3 + r5]           ; first half of row 1 of src1
+    movu      m3, [r3 + r5 + 32]      ; second half of row 1 of src1
+
+    paddw     m0, m2
+    paddw     m1, m3
+    packuswb  m0, m1
+    vpermq    m0, m0, 11011000b
+    movu      [r0 + r1], m0           ; row 1 of dst
+
+    pmovzxbw  m0, [r2 + r4 * 2]       ; first half of row 2 of src0
+    pmovzxbw  m1, [r2 + r4 * 2 + 16]  ; second half of row 2 of src0
+    movu      m2, [r3 + r5 * 2]       ; first half of row 2 of src1
+    movu      m3, [r3 + r5 * 2 + 32]  ; second half of row 2 of src1
+
+    paddw     m0, m2
+    paddw     m1, m3
+    packuswb  m0, m1
+    vpermq    m0, m0, 11011000b
+    movu      [r0 + r1 * 2], m0       ; row 2 of dst
+
+    pmovzxbw  m0, [r2 + r7]           ; first half of row 3 of src0
+    pmovzxbw  m1, [r2 + r7 + 16]      ; second half of row 3 of src0
+    movu      m2, [r3 + r8]           ; first half of row 3 of src1
+    movu      m3, [r3 + r8 + 32]      ; second half of row 3 of src1
+
+    paddw     m0, m2
+    paddw     m1, m3
+    packuswb  m0, m1
+    vpermq    m0, m0, 11011000b
+    movu      [r0 + r9], m0           ; row 3 of dst
+
+    lea       r2, [r2 + r4 * 4]
+    lea       r3, [r3 + r5 * 4]
+    lea       r0, [r0 + r1 * 4]
+
+    dec       r6d
+    jnz       .loop
+    RET
+%endif
+%endif
+%endmacro
+
+PIXEL_ADD_PS_W32_H4_avx2 32
+PIXEL_ADD_PS_W32_H4_avx2 64
+
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_64x%2(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PIXEL_ADD_PS_W64_H2 2
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m5, [pw_pixel_max]
+    pxor      m4, m4
+    mov       r6d, %2/2
+    add       r4, r4
+    add       r5, r5
+    add       r1, r1
+.loop:
+    movu      m0, [r2]
+    movu      m2, [r2 + 16]
+    movu      m1, [r3]
+    movu      m3, [r3 + 16]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0], m0
+    movu      [r0 + 16], m2
+
+    movu      m0, [r2 + 32]
+    movu      m2, [r2 + 48]
+    movu      m1, [r3 + 32]
+    movu      m3, [r3 + 48]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + 32], m0
+    movu      [r0 + 48], m2
+
+    movu      m0, [r2 + 64]
+    movu      m2, [r2 + 80]
+    movu      m1, [r3 + 64]
+    movu      m3, [r3 + 80]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + 64], m0
+    movu      [r0 + 80], m2
+
+    movu      m0, [r2 + 96]
+    movu      m2, [r2 + 112]
+    movu      m1, [r3 + 96]
+    movu      m3, [r3 + 112]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + 96], m0
+    movu      [r0 + 112], m2
+
+    movu      m0, [r2 + r4]
+    movu      m2, [r2 + r4 + 16]
+    movu      m1, [r3 + r5]
+    movu      m3, [r3 + r5 + 16]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 16], m2
+
+    movu      m0, [r2 + r4 + 32]
+    movu      m2, [r2 + r4 + 48]
+    movu      m1, [r3 + r5 + 32]
+    movu      m3, [r3 + r5 + 48]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1 + 32], m0
+    movu      [r0 + r1 + 48], m2
+
+    movu      m0, [r2 + r4 + 64]
+    movu      m2, [r2 + r4 + 80]
+    movu      m1, [r3 + r5 + 64]
+    movu      m3, [r3 + r5 + 80]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1 + 64], m0
+    movu      [r0 + r1 + 80], m2
+
+    movu      m0, [r2 + r4 + 96]
+    movu      m2, [r2 + r4 + 112]
+    movu      m1, [r3 + r5 + 96]
+    movu      m3, [r3 + r5 + 112]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m1
+    paddw     m2, m3
+    CLIPW2    m0, m2, m4, m5
+
+    movu      [r0 + r1 + 96], m0
+    movu      [r0 + r1 + 112], m2
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%else
+INIT_XMM sse4
+cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, %2/2
+    add       r5, r5
+.loop:
+    pmovzxbw  m0, [r2]
+    pmovzxbw  m1, [r2 + 8]
+    pmovzxbw  m2, [r2 + 16]
+    pmovzxbw  m3, [r2 + 24]
+    movu      m4, [r3]
+    movu      m5, [r3 + 16]
+    movu      m6, [r3 + 32]
+    movu      m7, [r3 + 48]
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+
+    movu      [r0], m0
+    movu      [r0 + 16], m2
+
+    pmovzxbw  m0, [r2 + 32]
+    pmovzxbw  m1, [r2 + 40]
+    pmovzxbw  m2, [r2 + 48]
+    pmovzxbw  m3, [r2 + 56]
+    movu      m4, [r3 + 64]
+    movu      m5, [r3 + 80]
+    movu      m6, [r3 + 96]
+    movu      m7, [r3 + 112]
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+
+    movu      [r0 + 32], m0
+    movu      [r0 + 48], m2
+
+    pmovzxbw  m0, [r2 + r4]
+    pmovzxbw  m1, [r2 + r4 + 8]
+    pmovzxbw  m2, [r2 + r4 + 16]
+    pmovzxbw  m3, [r2 + r4 + 24]
+    movu      m4, [r3 + r5]
+    movu      m5, [r3 + r5 + 16]
+    movu      m6, [r3 + r5 + 32]
+    movu      m7, [r3 + r5 + 48]
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 16], m2
+
+    pmovzxbw  m0, [r2 + r4 + 32]
+    pmovzxbw  m1, [r2 + r4 + 40]
+    pmovzxbw  m2, [r2 + r4 + 48]
+    pmovzxbw  m3, [r2 + r4 + 56]
+    movu      m4, [r3 + r5 + 64]
+    movu      m5, [r3 + r5 + 80]
+    movu      m6, [r3 + r5 + 96]
+    movu      m7, [r3 + r5 + 112]
+    dec       r6d
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+
+    movu      [r0 + r1 + 32], m0
+    movu      [r0 + r1 + 48], m2
+    lea       r0, [r0 + r1 * 2]
+
+    jnz       .loop
+    RET
+%endif
+%endmacro
+PIXEL_ADD_PS_W64_H2 64, 64
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_64x64(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_add_ps_64x64, 6, 10, 6, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mova      m5, [pw_pixel_max]
+    pxor      m4, m4
+    mov       r6d, 16
+    add       r4d, r4d
+    add       r5d, r5d
+    add       r1d, r1d
+    lea       r7, [r4 * 3]
+    lea       r8, [r5 * 3]
+    lea       r9, [r1 * 3]
+
+.loop:
+    movu      m0, [r2]
+    movu      m1, [r2 + 32]
+    movu      m2, [r3]
+    movu      m3, [r3 + 32]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0], m0
+    movu      [r0 + 32], m1
+
+    movu      m0, [r2 + 64]
+    movu      m1, [r2 + 96]
+    movu      m2, [r3 + 64]
+    movu      m3, [r3 + 96]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0 + 64], m0
+    movu      [r0 + 96], m1
+
+    movu      m0, [r2 + r4]
+    movu      m1, [r2 + r4 + 32]
+    movu      m2, [r3 + r5]
+    movu      m3, [r3 + r5 + 32]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0 + r1], m0
+    movu      [r0 + r1 + 32], m1
+
+    movu      m0, [r2 + r4 + 64]
+    movu      m1, [r2 + r4 + 96]
+    movu      m2, [r3 + r5 + 64]
+    movu      m3, [r3 + r5 + 96]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0 + r1 + 64], m0
+    movu      [r0 + r1 + 96], m1
+
+    movu      m0, [r2 + r4 * 2]
+    movu      m1, [r2 + r4 * 2 + 32]
+    movu      m2, [r3 + r5 * 2]
+    movu      m3, [r3 + r5 * 2 + 32]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0 + r1 * 2], m0
+    movu      [r0 + r1 * 2 + 32], m1
+
+    movu      m0, [r2 + r4 * 2 + 64]
+    movu      m1, [r2 + r4 * 2 + 96]
+    movu      m2, [r3 + r5 * 2 + 64]
+    movu      m3, [r3 + r5 * 2 + 96]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0 + r1 * 2 + 64], m0
+    movu      [r0 + r1 * 2 + 96], m1
+
+    movu      m0, [r2 + r7]
+    movu      m1, [r2 + r7 + 32]
+    movu      m2, [r3 + r8]
+    movu      m3, [r3 + r8 + 32]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0 + r9], m0
+    movu      [r0 + r9 + 32], m1
+
+    movu      m0, [r2 + r7 + 64]
+    movu      m1, [r2 + r7 + 96]
+    movu      m2, [r3 + r8 + 64]
+    movu      m3, [r3 + r8 + 96]
+    paddw     m0, m2
+    paddw     m1, m3
+
+    CLIPW2    m0, m1, m4, m5
+    movu      [r0 + r9 + 64], m0
+    movu      [r0 + r9 + 96], m1
+
+    dec       r6d
+    lea       r0, [r0 + r1 * 4]
+    lea       r2, [r2 + r4 * 4]
+    lea       r3, [r3 + r5 * 4]
+    jnz       .loop
+    RET
+%endif
+%else
+INIT_YMM avx2
+cglobal pixel_add_ps_64x64, 6, 7, 8, dest, dstStride, src0, src1, srcStride0, srcStride1
+    mov       r6d, 32
+    add       r5, r5
+.loop:
+    pmovzxbw  m0, [r2]                ; first 16 of row 0 of src0
+    pmovzxbw  m1, [r2 + 16]           ; second 16 of row 0 of src0
+    pmovzxbw  m2, [r2 + 32]           ; third 16 of row 0 of src0
+    pmovzxbw  m3, [r2 + 48]           ; fourth 16 of row 0 of src0
+    movu      m4, [r3]                ; first 16 of row 0 of src1
+    movu      m5, [r3 + 32]           ; second 16 of row 0 of src1
+    movu      m6, [r3 + 64]           ; third 16 of row 0 of src1
+    movu      m7, [r3 + 96]           ; fourth 16 of row 0 of src1
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+    vpermq    m0, m0, 11011000b
+    movu      [r0], m0                ; first 32 of row 0 of dst
+    vpermq    m2, m2, 11011000b
+    movu      [r0 + 32], m2           ; second 32 of row 0 of dst
+
+    pmovzxbw  m0, [r2 + r4]           ; first 16 of row 1 of src0
+    pmovzxbw  m1, [r2 + r4 + 16]      ; second 16 of row 1 of src0
+    pmovzxbw  m2, [r2 + r4 + 32]      ; third 16 of row 1 of src0
+    pmovzxbw  m3, [r2 + r4 + 48]      ; fourth 16 of row 1 of src0
+    movu      m4, [r3 + r5]           ; first 16 of row 1 of src1
+    movu      m5, [r3 + r5 + 32]      ; second 16 of row 1 of src1
+    movu      m6, [r3 + r5 + 64]      ; third 16 of row 1 of src1
+    movu      m7, [r3 + r5 + 96]      ; fourth 16 of row 1 of src1
+
+    paddw     m0, m4
+    paddw     m1, m5
+    paddw     m2, m6
+    paddw     m3, m7
+    packuswb  m0, m1
+    packuswb  m2, m3
+    vpermq    m0, m0, 11011000b
+    movu      [r0 + r1], m0           ; first 32 of row 1 of dst
+    vpermq    m2, m2, 11011000b
+    movu      [r0 + r1 + 32], m2      ; second 32 of row 1 of dst
+
+    lea       r2, [r2 + r4 * 2]
+    lea       r3, [r3 + r5 * 2]
+    lea       r0, [r0 + r1 * 2]
+
+    dec       r6d
+    jnz       .loop
+    RET
+
+%endif
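For reference, every pixel_add_ps kernel in this file computes the same thing and differs only in block size and instruction set: it adds a block of int16_t residuals (src1, stride srcStride1) to a block of prediction pixels (src0, stride srcStride0) and stores the saturated sums to dest. A minimal scalar sketch of that contract follows; the pixel typedef, the PIXEL_MAX value, and the name add_ps_ref are illustrative assumptions for an 8-bit build, not x265 API (a HIGH_BIT_DEPTH build would use uint16_t pixels and the per-lane value of pw_pixel_max, e.g. 1023 for 10-bit).

#include <stdint.h>

typedef uint8_t pixel;   /* 8-bit build; HIGH_BIT_DEPTH would use uint16_t */
#define PIXEL_MAX 255    /* per-lane value of pw_pixel_max for this depth  */

static void add_ps_ref(pixel *dest, intptr_t dstStride,
                       const pixel *src0, const int16_t *src1,
                       intptr_t srcStride0, intptr_t srcStride1,
                       int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            /* Widen, add the residual, then clamp back to the pixel range.
             * packuswb performs this saturation in the 8-bit SIMD paths;
             * CLIPW/CLIPW2 perform it in the high-bit-depth paths. */
            int v = src0[x] + src1[x];
            dest[x] = (pixel)(v < 0 ? 0 : (v > PIXEL_MAX ? PIXEL_MAX : v));
        }
        dest += dstStride;
        src0 += srcStride0;
        src1 += srcStride1;
    }
}

One non-obvious detail in the AVX2 8-bit paths: packuswb on ymm registers packs within each 128-bit lane, so the two packed rows (or row halves) come out interleaved by lane. The vpermq m0, m0, 11011000b that precedes each store swaps the two middle qwords to restore linear order before the result is written out.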