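; Provenance (captured from the Mercurial web view this file was taken from):
;   Repository : Mercurial > hg > forks > libbpg
;   View       : x265/source/common/x86/loopfilter.asm @ 0:772086c29cc7
;   Summary    : Initial import.
;   Author     : Matti Hamalainen <ccr@tnsp.org>
;   Date       : Wed, 16 Nov 2016 11:16:33 +0200
;   Parents    : (none listed)
;   Children   : (none listed)
;   Viewer     : line wrap: on; line source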
;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm001@163.com>
;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*          Nabajit Deka <nabajit@multicorewareinc.com>
;*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
;*          Murugan Vairavel <murugan@multicorewareinc.com>
;*          Yuvaraj Venkatesh <yuvaraj@multicorewareinc.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"

SECTION_RODATA 32

; Byte constants private to this file; each is repeated 32 times so it can be used
; as either an XMM or a YMM memory operand.
pb_31:              times 32 db 31      ; 5-bit band-index mask
pb_124:             times 32 db 124     ; keeps bits 2..6 of a byte (band index * 4)
pb_15:              times 32 db 15      ; threshold selecting the upper half of the 32-entry band table
; 32 x 0x00 followed by 32 x 0xFF. The 8-bit AVX2 calSign reads a 32-byte window from
; (pb_movemask_32 + 32 - remaining) and uses it as a blend mask for the tail.
pb_movemask_32:     times 32 db 0x00
                    times 32 db 0xFF

SECTION .text

; Constants defined in another object file.
cextern pb_1
cextern pb_128
cextern pb_2
cextern pw_2
cextern pw_pixel_max
cextern pb_movemask
cextern pw_1
cextern hmul_16p
cextern pb_4

;============================================================================================================
; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
;
; SAO edge offset, class 0 (left / right neighbours). Every variant filters two rows per call:
; the row at rec (seeded with signLeft[0]) and the row at rec + stride (seeded with signLeft[1]).
; Per pixel:
;   signRight = sign(rec[x] - rec[x + 1])            two pcmpgt results merged into -1 / 0 / +1
;   signLeft  = -(signRight of the pixel at x - 1)   carried from vector to vector in m0
;   edgeType  = signRight + signLeft + 2
;   rec[x]   += offsetEo[edgeType]                   pshufb table lookup
; HIGH_BIT_DEPTH builds clamp the sum to [0, pw_pixel_max]; 8-bit builds saturate via packuswb.
; The width is consumed 16 pixels per iteration with no tail path, so lcuWidth is presumably a
; non-zero multiple of 16 -- confirm against the caller.
;============================================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
; SSE4, 16-bit pixels: first row in .loop, second row in .loopH.
cglobal saoCuOrgE0, 4,5,9
    mov             r4d, r4m
    movh            m6,  [r1]                   ; m6 = offsetEo (only the low 8 bytes are loaded)
    movzx           r1d, byte [r3]
    pxor            m5, m5                      ; m5 = 0
    neg             r1b
    movd            m0, r1d                     ; m0 byte 0 = -signLeft[0]; the psignb below negates it back
    lea             r1, [r0 + r4 * 2]           ; r1 = second row (stride is in pixels, 2 bytes each)
    mov             r4d, r2d                    ; r4d = width counter for the second row

.loop:
    movu            m7, [r0]
    movu            m8, [r0 + 16]
    movu            m2, [r0 + 2]
    movu            m1, [r0 + 18]

    pcmpgtw         m3, m7, m2
    pcmpgtw         m2, m7
    pcmpgtw         m4, m8, m1
    pcmpgtw         m1, m8

    packsswb        m3, m4
    packsswb        m2, m1

    pand            m3, [pb_1]
    por             m3, m2                      ; m3 = signRight for 16 pixels

    palignr         m2, m3, m5, 15              ; signRight shifted up by one byte (zero shifted in)
    por             m2, m0                      ; insert the carry into byte 0

    mova            m4, [pw_pixel_max]
    psignb          m2, [pb_128]                ; m2 = signLeft (negated: pb_128 is negative in every byte)
    pxor            m0, m0
    palignr         m0, m3, 15                  ; carry = last signRight of this vector, in byte 0
    paddb           m3, m2
    paddb           m3, [pb_2]                  ; m3 = uiEdgeType
    pshufb          m2, m6, m3
    pmovsxbw        m3, m2                      ; offsetEo, pixels 0-7
    punpckhbw       m2, m2
    psraw           m2, 8                       ; offsetEo, pixels 8-15 (sign-extended)

    paddw           m7, m3
    paddw           m8, m2

    pmaxsw          m7, m5
    pmaxsw          m8, m5
    pminsw          m7, m4
    pminsw          m8, m4

    movu            [r0], m7
    movu            [r0 + 16], m8

    add             r0q, 32
    sub             r2d, 16
    jnz             .loop

    movzx           r3d, byte [r3 + 1]
    neg             r3b
    movd            m0, r3d                     ; m0 byte 0 = -signLeft[1]

.loopH:
    movu            m7, [r1]
    movu            m8, [r1 + 16]
    movu            m2, [r1 + 2]
    movu            m1, [r1 + 18]

    pcmpgtw         m3, m7, m2
    pcmpgtw         m2, m7
    pcmpgtw         m4, m8, m1
    pcmpgtw         m1, m8

    packsswb        m3, m4
    packsswb        m2, m1

    pand            m3, [pb_1]
    por             m3, m2

    palignr         m2, m3, m5, 15
    por             m2, m0

    mova            m4, [pw_pixel_max]
    psignb          m2, [pb_128]                ; m2 = signLeft
    pxor            m0, m0
    palignr         m0, m3, 15
    paddb           m3, m2
    paddb           m3, [pb_2]                  ; m3 = uiEdgeType
    pshufb          m2, m6, m3
    pmovsxbw        m3, m2                      ; offsetEo
    punpckhbw       m2, m2
    psraw           m2, 8

    paddw           m7, m3
    paddw           m8, m2

    pmaxsw          m7, m5
    pmaxsw          m8, m5
    pminsw          m7, m4
    pminsw          m8, m4

    movu            [r1], m7
    movu            [r1 + 16], m8

    add             r1q, 32
    sub             r4d, 16
    jnz             .loopH
    RET

%else ; HIGH_BIT_DEPTH
; SSE4, 8-bit pixels: first row in .loop, second row in .loopH.
cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
    mov             r4d, r4m
    mova            m4,  [pb_128]               ; m4 = 0x80 bytes: bias so signed pcmpgtb orders unsigned pixels
    pxor            m5,  m5                     ; m5 = 0
    movu            m6,  [r1]                   ; m6 = offsetEo

    movzx           r1d, byte [r3]
    inc             r3                          ; r3 -> signLeft[1]
    neg             r1b
    movd            m0, r1d                     ; m0 byte 0 = -signLeft[0]
    lea             r1, [r0 + r4]               ; r1 = second row
    mov             r4d, r2d                    ; r4d = width counter for the second row

.loop:
    movu            m7, [r0]                    ; m7 = rec[x]
    movu            m2, [r0 + 1]                ; m2 = rec[x+1]

    pxor            m1, m7, m4
    pxor            m3, m2, m4
    pcmpgtb         m2, m1, m3
    pcmpgtb         m3, m1
    pand            m2, [pb_1]
    por             m2, m3                      ; m2 = signRight

    pslldq          m3, m2, 1
    por             m3, m0

    psignb          m3, m4                      ; m3 = signLeft
    pxor            m0, m0
    palignr         m0, m2, 15                  ; carry = last signRight of this vector
    paddb           m2, m3
    paddb           m2, [pb_2]                  ; m2 = uiEdgeType
    pshufb          m3, m6, m2
    pmovzxbw        m2, m7                      ; rec
    punpckhbw       m7, m5
    pmovsxbw        m1, m3                      ; offsetEo
    punpckhbw       m3, m3
    psraw           m3, 8
    paddw           m2, m1
    paddw           m7, m3
    packuswb        m2, m7
    movu            [r0], m2

    add             r0q, 16
    sub             r2d, 16
    jnz             .loop

    movzx           r3d, byte [r3]
    neg             r3b
    movd            m0, r3d                     ; m0 byte 0 = -signLeft[1]

.loopH:
    movu            m7, [r1]                    ; m7 = rec[x]
    movu            m2, [r1 + 1]                ; m2 = rec[x+1]

    pxor            m1, m7, m4
    pxor            m3, m2, m4
    pcmpgtb         m2, m1, m3
    pcmpgtb         m3, m1
    pand            m2, [pb_1]
    por             m2, m3

    pslldq          m3, m2, 1
    por             m3, m0

    psignb          m3, m4                      ; m3 = signLeft
    pxor            m0, m0
    palignr         m0, m2, 15
    paddb           m2, m3
    paddb           m2, [pb_2]                  ; m2 = uiEdgeType
    pshufb          m3, m6, m2
    pmovzxbw        m2, m7                      ; rec
    punpckhbw       m7, m5
    pmovsxbw        m1, m3                      ; offsetEo
    punpckhbw       m3, m3
    psraw           m3, 8
    paddw           m2, m1
    paddw           m7, m3
    packuswb        m2, m7
    movu            [r1], m2

    add             r1q, 16
    sub             r4d, 16
    jnz             .loopH
    RET
%endif

INIT_YMM avx2
%if HIGH_BIT_DEPTH
; AVX2, 16-bit pixels: both rows per iteration (m7 = row 0, m8 = row 1). After the packs and
; vpermq the low 128-bit lane holds row-0 signs and the high lane row-1 signs, so the per-lane
; byte shifts keep the two rows' carries separate.
cglobal saoCuOrgE0, 4,4,9
    vbroadcasti128  m6, [r1]                    ; offsetEo in both lanes
    movzx           r1d, byte [r3]
    neg             r1b
    movd            xm0, r1d
    movzx           r1d, byte [r3 + 1]
    neg             r1b
    movd            xm1, r1d
    vinserti128     m0, m0, xm1, 1              ; m0 = { -signLeft[0] | -signLeft[1] } in byte 0 of each lane
    mova            m5, [pw_pixel_max]
    mov             r1d, r4m
    add             r1d, r1d                    ; r1 = stride in bytes
    shr             r2d, 4                      ; r2d = number of 16-pixel groups

.loop:
    movu            m7, [r0]
    movu            m8, [r0 + r1]
    movu            m2, [r0 + 2]
    movu            m1, [r0 + r1 + 2]

    pcmpgtw         m3, m7, m2
    pcmpgtw         m2, m7
    pcmpgtw         m4, m8, m1
    pcmpgtw         m1, m8

    packsswb        m3, m4
    packsswb        m2, m1
    vpermq          m3, m3, 11011000b
    vpermq          m2, m2, 11011000b

    pand            m3, [pb_1]
    por             m3, m2                      ; m3 = signRight

    pslldq          m2, m3, 1
    por             m2, m0

    psignb          m2, [pb_128]                ; m2 = signLeft
    pxor            m0, m0
    palignr         m0, m3, 15                  ; per-lane carry for the next iteration
    paddb           m3, m2
    paddb           m3, [pb_2]                  ; m3 = uiEdgeType
    pshufb          m2, m6, m3
    pmovsxbw        m3, xm2                     ; offsetEo, row 0
    vextracti128    xm2, m2, 1
    pmovsxbw        m2, xm2                     ; offsetEo, row 1
    pxor            m4, m4
    paddw           m7, m3
    paddw           m8, m2
    pmaxsw          m7, m4
    pmaxsw          m8, m4
    pminsw          m7, m5
    pminsw          m8, m5
    movu            [r0], m7
    movu            [r0 + r1], m8

    add             r0q, 32
    dec             r2d
    jnz             .loop
    RET

%else ; HIGH_BIT_DEPTH
; AVX2, 8-bit pixels: row 0 in the low lane, row 1 in the high lane.
cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
    mov                 r4d,        r4m
    vbroadcasti128      m4,         [pb_128]    ; m4 = 0x80 bytes (unsigned-compare bias)
    vbroadcasti128      m6,         [r1]        ; m6 = offsetEo
    movzx               r1d,        byte [r3]
    neg                 r1b
    movd                xm0,        r1d
    movzx               r1d,        byte [r3 + 1]
    neg                 r1b
    movd                xm1,        r1d
    vinserti128         m0,         m0,        xm1,           1

.loop:
    movu                xm5,        [r0]        ; xm5 = rec[x]
    movu                xm2,        [r0 + 1]    ; xm2 = rec[x + 1]
    vinserti128         m5,         m5,        [r0 + r4],     1
    vinserti128         m2,         m2,        [r0 + r4 + 1], 1

    pxor                m1,         m5,        m4
    pxor                m3,         m2,        m4
    pcmpgtb             m2,         m1,        m3
    pcmpgtb             m3,         m1
    pand                m2,         [pb_1]
    por                 m2,         m3          ; m2 = signRight

    pslldq              m3,         m2,        1
    por                 m3,         m0

    psignb              m3,         m4          ; m3 = signLeft
    pxor                m0,         m0
    palignr             m0,         m2,        15
    paddb               m2,         m3
    paddb               m2,         [pb_2]      ; m2 = uiEdgeType
    pshufb              m3,         m6,        m2
    pmovzxbw            m2,         xm5         ; rec
    vextracti128        xm5,        m5,        1
    pmovzxbw            m5,         xm5
    pmovsxbw            m1,         xm3         ; offsetEo
    vextracti128        xm3,        m3,        1
    pmovsxbw            m3,         xm3
    paddw               m2,         m1
    paddw               m5,         m3
    packuswb            m2,         m5
    vpermq              m2,         m2,        11011000b
    movu                [r0],       xm2
    vextracti128        [r0 + r4],  m2,        1

    add                 r0q,        16
    sub                 r2d,        16
    jnz                 .loop
    RET
%endif

;==================================================================================================
; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
;
; SAO edge offset, class 1 (up / down neighbours), one row per call. Per pixel:
;   signDown      = sign(pRec[x] - pRec[x + iStride])
;   edgeType      = signDown + m_iUpBuff1[x] + 2
;   pRec[x]      += m_iOffsetEo[edgeType]
;   m_iUpBuff1[x] = -signDown                    (stored back for the following row)
; The loop runs iLcuWidth >> 4 times over 16 pixels each; there is no tail path, and a width
; below 16 would make the dec/jnz counter wrap -- presumably never passed, confirm with caller.
;==================================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
cglobal saoCuOrgE1, 4,5,8
    add             r3d, r3d                    ; stride in bytes
    mov             r4d, r4m
    pxor            m0, m0                      ; m0 = 0
    mova            m6, [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    shr             r4d, 4

.loop:
    movu            m7, [r0]
    movu            m5, [r0 + 16]
    movu            m3, [r0 + r3]
    movu            m1, [r0 + r3 + 16]

    pcmpgtw         m2, m7, m3
    pcmpgtw         m3, m7
    pcmpgtw         m4, m5, m1
    pcmpgtw         m1, m5

    packsswb        m2, m4
    packsswb        m3, m1

    pand            m2, [pb_1]
    por             m2, m3                      ; m2 = signDown

    movu            m3, [r1]                    ; m3 = m_iUpBuff1

    paddb           m3, m2
    paddb           m3, m6                      ; m3 = edgeType

    movu            m4, [r2]                    ; m4 = m_iOffsetEo
    pshufb          m1, m4, m3

    psubb           m3, m0, m2
    movu            [r1], m3                    ; m_iUpBuff1 = -signDown

    pmovsxbw        m3, m1
    punpckhbw       m1, m1
    psraw           m1, 8

    paddw           m7, m3
    paddw           m5, m1

    pmaxsw          m7, m0
    pmaxsw          m5, m0
    pminsw          m7, [pw_pixel_max]
    pminsw          m5, [pw_pixel_max]

    movu            [r0], m7
    movu            [r0 + 16],  m5

    add             r0, 32
    add             r1, 16
    dec             r4d
    jnz             .loop
    RET

%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE1, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
    mov         r3d, r3m
    mov         r4d, r4m
    pxor        m0,    m0                       ; m0 = 0
    mova        m6,    [pb_2]                   ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    mova        m7,    [pb_128]                 ; unsigned-compare bias
    shr         r4d,   4

.loop:
    movu        m1,    [r0]                     ; m1 = pRec[x]
    movu        m2,    [r0 + r3]                ; m2 = pRec[x + iStride]

    pxor        m3,    m1,    m7
    pxor        m4,    m2,    m7
    pcmpgtb     m2,    m3,    m4
    pcmpgtb     m4,    m3
    pand        m2,    [pb_1]
    por         m2,    m4                       ; m2 = signDown

    movu        m3,    [r1]                     ; m3 = m_iUpBuff1

    paddb       m3,    m2
    paddb       m3,    m6                       ; m3 = edgeType

    movu        m4,    [r2]                     ; m4 = m_iOffsetEo
    pshufb      m5,    m4,    m3

    psubb       m3,    m0,    m2
    movu        [r1],  m3                       ; m_iUpBuff1 = -signDown

    pmovzxbw    m2,    m1
    punpckhbw   m1,    m0
    pmovsxbw    m3,    m5
    punpckhbw   m5,    m5
    psraw       m5,    8

    paddw       m2,    m3
    paddw       m1,    m5
    packuswb    m2,    m1
    movu        [r0],  m2

    add         r0,    16
    add         r1,    16
    dec         r4d
    jnz         .loop
    RET
%endif

INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal saoCuOrgE1, 4,5,6
    add         r3d, r3d                        ; stride in bytes
    mov         r4d, r4m
    mova        m4, [pb_2]
    shr         r4d, 4
    mova        m0, [pw_pixel_max]

.loop:
    movu        m5, [r0]
    movu        m3, [r0 + r3]

    pcmpgtw     m2, m5, m3
    pcmpgtw     m3, m5

    packsswb    m2, m3
    vpermq      m3, m2, 11011101b               ; low 128 bits = "below > current" masks for 16 pixels
    vpermq      m2, m2, 10001000b               ; low 128 bits = "current > below" masks for 16 pixels

    pand        xm2, [pb_1]
    por         xm2, xm3                        ; xm2 = signDown

    movu        xm3, [r1]                       ; xm3 = m_iUpBuff1

    paddb       xm3, xm2
    paddb       xm3, xm4                        ; xm3 = edgeType

    movu        xm1, [r2]                       ; xm1 = m_iOffsetEo
    pshufb      xm1, xm3
    pmovsxbw    m3, xm1

    paddw       m5, m3
    pxor        m3, m3
    pmaxsw      m5, m3
    pminsw      m5, m0
    movu        [r0], m5

    psubb       xm3, xm2
    movu        [r1], xm3                       ; m_iUpBuff1 = -signDown

    add         r0, 32
    add         r1, 16
    dec         r4d
    jnz         .loop
    RET

%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE1, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
    mov           r3d,    r3m
    mov           r4d,    r4m
    movu          xm0,    [r2]                  ; xm0 = m_iOffsetEo
    mova          xm6,    [pb_2]                ; xm6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    mova          xm7,    [pb_128]
    shr           r4d,    4

.loop:
    movu          xm1,    [r0]                  ; xm1 = pRec[x]
    movu          xm2,    [r0 + r3]             ; xm2 = pRec[x + iStride]

    pxor          xm3,    xm1,    xm7
    pxor          xm4,    xm2,    xm7
    pcmpgtb       xm2,    xm3,    xm4
    pcmpgtb       xm4,    xm3
    pand          xm2,    [pb_1]
    por           xm2,    xm4                   ; xm2 = signDown

    movu          xm3,    [r1]                  ; xm3 = m_iUpBuff1

    paddb         xm3,    xm2
    paddb         xm3,    xm6                   ; xm3 = edgeType

    pshufb        xm5,    xm0,    xm3
    pxor          xm4,    xm4
    psubb         xm3,    xm4,    xm2
    movu          [r1],   xm3                   ; m_iUpBuff1 = -signDown

    pmovzxbw      m2,     xm1
    pmovsxbw      m3,     xm5

    paddw         m2,     m3
    vextracti128  xm3,    m2,     1
    packuswb      xm2,    xm3
    movu          [r0],   xm2

    add           r0,     16
    add           r1,     16
    dec           r4d
    jnz           .loop
    RET
%endif

;========================================================================================================
; void saoCuOrgE1_2Rows(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
;
; Same filter as saoCuOrgE1, applied to two consecutive rows (pRec and pRec + iStride) in one
; call. Row 0 uses the incoming m_iUpBuff1; row 1 uses -signDown of row 0; m_iUpBuff1 ends up
; holding -signDown of row 1. Width handling is identical to saoCuOrgE1 (iLcuWidth >> 4 groups).
;========================================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
; Row 0 is filtered completely (.loop), then m_iUpBuff1 is rewound and row 1 is filtered (.loopH).
cglobal saoCuOrgE1_2Rows, 4,7,8
    add             r3d, r3d                    ; stride in bytes
    mov             r4d, r4m
    pxor            m0, m0                      ; m0 = 0
    mova            m6, [pw_pixel_max]
    mov             r5d, r4d                    ; r5d = iLcuWidth (rewind distance / row-1 counter)
    shr             r4d, 4
    mov             r6, r0                      ; r6 = start of row 0

.loop:
    movu            m7, [r0]
    movu            m5, [r0 + 16]
    movu            m3, [r0 + r3]
    movu            m1, [r0 + r3 + 16]

    pcmpgtw         m2, m7, m3
    pcmpgtw         m3, m7
    pcmpgtw         m4, m5, m1
    pcmpgtw         m1, m5
    packsswb        m2, m4
    packsswb        m3, m1
    pand            m2, [pb_1]
    por             m2, m3

    movu            m3, [r1]                    ; m3 = m_iUpBuff1

    paddb           m3, m2
    paddb           m3, [pb_2]

    movu            m4, [r2]                    ; m4 = m_iOffsetEo
    pshufb          m1, m4, m3

    psubb           m3, m0, m2
    movu            [r1], m3

    pmovsxbw        m3, m1
    punpckhbw       m1, m1
    psraw           m1, 8

    paddw           m7, m3
    paddw           m5, m1

    pmaxsw          m7, m0
    pmaxsw          m5, m0
    pminsw          m7, m6
    pminsw          m5, m6

    movu            [r0], m7
    movu            [r0 + 16],  m5

    add             r0, 32
    add             r1, 16
    dec             r4d
    jnz             .loop

    sub             r1, r5                      ; rewind m_iUpBuff1 by iLcuWidth bytes
    shr             r5d, 4
    lea             r0, [r6 + r3]               ; r0 = start of row 1

.loopH:
    movu            m7, [r0]
    movu            m5, [r0 + 16]
    movu            m3, [r0 + r3]
    movu            m1, [r0 + r3 + 16]

    pcmpgtw         m2, m7, m3
    pcmpgtw         m3, m7
    pcmpgtw         m4, m5, m1
    pcmpgtw         m1, m5
    packsswb        m2, m4
    packsswb        m3, m1
    pand            m2, [pb_1]
    por             m2, m3

    movu            m3, [r1]                    ; m3 = m_iUpBuff1

    paddb           m3, m2
    paddb           m3, [pb_2]

    movu            m4, [r2]                    ; m4 = m_iOffsetEo
    pshufb          m1, m4, m3

    psubb           m3, m0, m2
    movu            [r1], m3

    pmovsxbw        m3, m1
    punpckhbw       m1, m1
    psraw           m1, 8

    paddw           m7, m3
    paddw           m5, m1

    pmaxsw          m7, m0
    pmaxsw          m5, m0
    pminsw          m7, m6
    pminsw          m5, m6

    movu            [r0], m7
    movu            [r0 + 16],  m5

    add             r0, 32
    add             r1, 16
    dec             r5d
    jnz             .loopH
    RET

%else ; HIGH_BIT_DEPTH
; Both rows are filtered inside the same iteration; m6 = signDown of row 0, m5 = signDown of row 1.
cglobal saoCuOrgE1_2Rows, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
    mov         r3d,        r3m
    mov         r4d,        r4m
    pxor        m0,         m0                  ; m0 = 0
    mova        m7,         [pb_128]
    shr         r4d,        4

.loop:
    movu        m1,         [r0]                ; m1 = pRec[x]
    movu        m2,         [r0 + r3]           ; m2 = pRec[x + iStride]

    pxor        m3,         m1,         m7
    pxor        m4,         m2,         m7
    pcmpgtb     m6,         m3,         m4
    pcmpgtb     m5,         m4,         m3
    pand        m6,         [pb_1]
    por         m6,         m5                  ; m6 = signDown, row 0

    movu        m5,         [r0 + r3 * 2]       ; pRec[x + 2 * iStride]
    pxor        m3,         m5,         m7
    pcmpgtb     m5,         m4,         m3
    pcmpgtb     m3,         m4
    pand        m5,         [pb_1]
    por         m5,         m3                  ; m5 = signDown, row 1

    movu        m3,         [r1]                ; m3 = m_iUpBuff1
    paddb       m3,         m6
    paddb       m3,         [pb_2]

    movu        m4,         [r2]                ; m4 = m_iOffsetEo
    pshufb      m4,         m3

    psubb       m3,         m0,         m6
    movu        [r1],       m3                  ; m_iUpBuff1 = -signDown(row 0)

    pmovzxbw    m6,         m1
    punpckhbw   m1,         m0
    pmovsxbw    m3,         m4
    punpckhbw   m4,         m4
    psraw       m4,         8

    paddw       m6,         m3
    paddw       m1,         m4
    packuswb    m6,         m1
    movu        [r0],       m6

    movu        m3,         [r1]                ; m3 = m_iUpBuff1 (just written for row 0)
    paddb       m3,         m5
    paddb       m3,         [pb_2]

    movu        m4,         [r2]                ; m4 = m_iOffsetEo
    pshufb      m4,         m3
    psubb       m3,         m0,         m5
    movu        [r1],       m3                  ; m_iUpBuff1 = -signDown(row 1)

    pmovzxbw    m5,         m2
    punpckhbw   m2,         m0
    pmovsxbw    m3,         m4
    punpckhbw   m4,         m4
    psraw       m4,         8

    paddw       m5,         m3
    paddw       m2,         m4
    packuswb    m5,         m2
    movu        [r0 + r3],  m5

    add         r0,         16
    add         r1,         16
    dec         r4d
    jnz         .loop
    RET
%endif

INIT_YMM avx2
%if HIGH_BIT_DEPTH
; m7 = row 0, m5 = row 1, m1 = row 2. After the packs/vpermq the low lane carries row-0 signs
; and the high lane row-1 signs.
cglobal saoCuOrgE1_2Rows, 4,5,8
    add             r3d, r3d                    ; stride in bytes
    mov             r4d, r4m
    mova            m4, [pw_pixel_max]
    vbroadcasti128  m6, [r2]                    ; m6 = m_iOffsetEo
    shr             r4d, 4

.loop:
    movu            m7, [r0]
    movu            m5, [r0 + r3]
    movu            m1, [r0 + r3 * 2]

    pcmpgtw         m2, m7, m5
    pcmpgtw         m3, m5, m7
    pcmpgtw         m0, m5, m1
    pcmpgtw         m1, m5

    packsswb        m2, m0
    packsswb        m3, m1
    vpermq          m2, m2, 11011000b
    vpermq          m3, m3, 11011000b

    pand            m2, [pb_1]
    por             m2, m3                      ; m2 = { signDown row 0 | signDown row 1 }

    movu            xm3, [r1]                   ; xm3 = m_iUpBuff1
    pxor            m0, m0
    psubb           m1, m0, m2                  ; m1 = -signDown for both rows
    vinserti128     m3, m3, xm1, 1              ; high lane: row 1 uses -signDown(row 0)
    vextracti128    [r1], m1, 1                 ; m_iUpBuff1 = -signDown(row 1)

    paddb           m3, m2
    paddb           m3, [pb_2]                  ; m3 = edgeType

    pshufb          m1, m6, m3
    pmovsxbw        m3, xm1
    vextracti128    xm1, m1, 1
    pmovsxbw        m1, xm1

    paddw           m7, m3
    paddw           m5, m1

    pmaxsw          m7, m0
    pmaxsw          m5, m0
    pminsw          m7, m4
    pminsw          m5, m4

    movu            [r0], m7
    movu            [r0 + r3],  m5

    add             r0, 32
    add             r1, 16
    dec             r4d
    jnz             .loop
    RET

%else ; HIGH_BIT_DEPTH
; Low lane = row 0 vs row 1, high lane = row 1 vs row 2.
cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
    mov             r3d,        r3m
    mov             r4d,        r4m
    pxor            m0,         m0              ; m0 = 0
    vbroadcasti128  m5,         [pb_128]
    vbroadcasti128  m6,         [r2]            ; m6 = m_iOffsetEo
    shr             r4d,        4

.loop:
    movu            xm1,        [r0]            ; m1 = pRec[x]
    movu            xm2,        [r0 + r3]       ; m2 = pRec[x + iStride]
    vinserti128     m1,         m1,       xm2,            1
    vinserti128     m2,         m2,       [r0 + r3 * 2],  1

    pxor            m3,         m1,       m5
    pxor            m4,         m2,       m5
    pcmpgtb         m2,         m3,       m4
    pcmpgtb         m4,         m3
    pand            m2,         [pb_1]
    por             m2,         m4              ; m2 = signDown for both rows

    movu            xm3,        [r1]            ; xm3 = m_iUpBuff1
    psubb           m4,         m0,       m2    ; m4 = -signDown for both rows
    vinserti128     m3,         m3,       xm4,  1

    paddb           m3,         m2
    paddb           m3,         [pb_2]          ; m3 = edgeType
    pshufb          m2,         m6,       m3
    vextracti128    [r1],       m4,       1     ; m_iUpBuff1 = -signDown(row 1)

    pmovzxbw        m4,         xm1
    vextracti128    xm3,        m1,       1
    pmovzxbw        m3,         xm3
    pmovsxbw        m1,         xm2
    vextracti128    xm2,        m2,       1
    pmovsxbw        m2,         xm2

    paddw           m4,         m1
    paddw           m3,         m2
    packuswb        m4,         m3
    vpermq          m4,         m4,       11011000b
    movu            [r0],       xm4
    vextracti128    [r0 + r3],  m4,       1

    add             r0,         16
    add             r1,         16
    dec             r4d
    jnz             .loop
    RET
%endif

;======================================================================================================================================================
; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
;
; SAO edge offset using the neighbour at rec[x + stride + 1]. Per pixel:
;   signDown     = sign(rec[x] - rec[x + stride + 1])
;   edgeType     = signDown + buff1[x] + 2
;   rec[x]      += offsetEo[edgeType]
;   bufft[x + 1] = -signDown                     (bufft is advanced by one before the loop)
; The loops work on whole vectors and may run past lcuWidth, so 8 bytes at rec[lcuWidth] and at
; bufft[lcuWidth + 1] are saved before the loop and written back afterwards.
;======================================================================================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
cglobal saoCuOrgE2, 6,6,8
    mov         r4d, r4m
    add         r5d, r5d                        ; stride in bytes
    pxor        m0, m0
    inc         r1                              ; r1 = &bufft[1]
    movh        m6, [r0 + r4 * 2]               ; save rec[lcuWidth ..]
    movhps      m6, [r1 + r4]                   ; save bufft[lcuWidth + 1 ..]

.loop:
    movu        m7, [r0]
    movu        m5, [r0 + 16]
    movu        m3, [r0 + r5 + 2]
    movu        m1, [r0 + r5 + 18]

    pcmpgtw     m2, m7, m3
    pcmpgtw     m3, m7
    pcmpgtw     m4, m5, m1
    pcmpgtw     m1, m5
    packsswb    m2, m4
    packsswb    m3, m1
    pand        m2, [pb_1]
    por         m2, m3                          ; m2 = signDown

    movu        m3, [r2]                        ; m3 = buff1

    paddb       m3, m2
    paddb       m3, [pb_2]                      ; m3 = edgeType

    movu        m4, [r3]                        ; m4 = offsetEo
    pshufb      m4, m3

    psubb       m3, m0, m2
    movu        [r1], m3                        ; bufft[x + 1] = -signDown

    pmovsxbw    m3, m4
    punpckhbw   m4, m4
    psraw       m4, 8

    paddw       m7, m3
    paddw       m5, m4
    pmaxsw      m7, m0
    pmaxsw      m5, m0
    pminsw      m7, [pw_pixel_max]
    pminsw      m5, [pw_pixel_max]
    movu        [r0], m7
    movu        [r0 + 16], m5

    add         r0, 32
    add         r1, 16
    add         r2, 16
    sub         r4, 16
    jg          .loop

    ; r4 <= 0 here, so r0 + r4 * 2 and r1 + r4 are the addresses saved above
    movh        [r0 + r4 * 2], m6
    movhps      [r1 + r4], m6
    RET

%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE2, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
    mov         r4d,   r4m
    mov         r5d,   r5m
    pxor        m0,    m0                       ; m0 = 0
    mova        m6,    [pb_2]                   ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    mova        m7,    [pb_128]
    inc         r1                              ; r1 = &bufft[1]
    movh        m5,    [r0 + r4]                ; save rec[lcuWidth ..]
    movhps      m5,    [r1 + r4]                ; save bufft[lcuWidth + 1 ..]

.loop:
    movu        m1,    [r0]                     ; m1 = rec[x]
    movu        m2,    [r0 + r5 + 1]            ; m2 = rec[x + stride + 1]
    pxor        m3,    m1,    m7
    pxor        m4,    m2,    m7
    pcmpgtb     m2,    m3,    m4
    pcmpgtb     m4,    m3
    pand        m2,    [pb_1]
    por         m2,    m4                       ; m2 = signDown
    movu        m3,    [r2]                     ; m3 = buff1

    paddb       m3,    m2
    paddb       m3,    m6                       ; m3 = edgeType

    movu        m4,    [r3]                     ; m4 = offsetEo
    pshufb      m4,    m3

    psubb       m3,    m0,    m2
    movu        [r1],  m3                       ; bufft[x + 1] = -signDown

    pmovzxbw    m2,    m1
    punpckhbw   m1,    m0
    pmovsxbw    m3,    m4
    punpckhbw   m4,    m4
    psraw       m4,    8

    paddw       m2,    m3
    paddw       m1,    m4
    packuswb    m2,    m1
    movu        [r0],  m2

    add         r0,    16
    add         r1,    16
    add         r2,    16
    sub         r4,    16
    jg          .loop

    movh        [r0 + r4], m5
    movhps      [r1 + r4], m5
    RET
%endif

INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal saoCuOrgE2, 6,6,7
    mov             r4d, r4m
    add             r5d, r5d                    ; stride in bytes
    inc             r1                          ; r1 = &bufft[1]
    movq            xm4, [r0 + r4 * 2]          ; save rec[lcuWidth ..]
    movhps          xm4, [r1 + r4]              ; save bufft[lcuWidth + 1 ..]
    vbroadcasti128  m5, [r3]                    ; m5 = offsetEo
    mova            m6, [pw_pixel_max]

.loop:
    movu            m1, [r0]
    movu            m3, [r0 + r5 + 2]

    pcmpgtw         m2, m1, m3
    pcmpgtw         m3, m1

    packsswb        m2, m3
    vpermq          m3, m2, 11011101b
    vpermq          m2, m2, 10001000b

    pand            xm2, [pb_1]
    por             xm2, xm3                    ; xm2 = signDown

    movu            xm3, [r2]                   ; xm3 = buff1

    paddb           xm3, xm2
    paddb           xm3, [pb_2]                 ; xm3 = edgeType
    pshufb          xm0, xm5, xm3
    pmovsxbw        m3, xm0

    pxor            m0, m0
    paddw           m1, m3
    pmaxsw          m1, m0
    pminsw          m1, m6
    movu            [r0], m1

    psubb           xm0, xm2
    movu            [r1], xm0                   ; bufft[x + 1] = -signDown

    add             r0, 32
    add             r1, 16
    add             r2, 16
    sub             r4, 16
    jg              .loop

    movq            [r0 + r4 * 2], xm4
    movhps          [r1 + r4], xm4
    RET

%else ; HIGH_BIT_DEPTH
; 8-bit AVX2 variant: straight-line code that filters exactly one 16-pixel vector (no loop);
; the wider case is handled by saoCuOrgE2_32.
cglobal saoCuOrgE2, 5, 6, 7, rec, bufft, buff1, offsetEo, lcuWidth
    mov            r4d,   r4m
    mov            r5d,   r5m
    pxor           xm0,   xm0                   ; xm0 = 0
    mova           xm5,   [pb_128]
    inc            r1                           ; r1 = &bufft[1]
    movq           xm6,   [r0 + r4]             ; save rec[lcuWidth ..]
    movhps         xm6,   [r1 + r4]             ; save bufft[lcuWidth + 1 ..]

    movu           xm1,   [r0]                  ; xm1 = rec[x]
    movu           xm2,   [r0 + r5 + 1]         ; xm2 = rec[x + stride + 1]
    pxor           xm3,   xm1,   xm5
    pxor           xm4,   xm2,   xm5
    pcmpgtb        xm2,   xm3,   xm4
    pcmpgtb        xm4,   xm3
    pand           xm2,   [pb_1]
    por            xm2,   xm4                   ; xm2 = signDown
    movu           xm3,   [r2]                  ; xm3 = buff1

    paddb          xm3,   xm2
    paddb          xm3,   [pb_2]                ; xm3 = edgeType

    movu           xm4,   [r3]                  ; xm4 = offsetEo
    pshufb         xm4,   xm3

    psubb          xm3,   xm0,   xm2
    movu           [r1],  xm3                   ; bufft[x + 1] = -signDown

    pmovzxbw       m2,    xm1
    pmovsxbw       m3,    xm4

    paddw          m2,    m3
    vextracti128   xm3,   m2,    1
    packuswb       xm2,   xm3
    movu           [r0],  xm2

    movq           [r0 + r4], xm6
    movhps         [r1 + r4], xm6
    RET
%endif

; saoCuOrgE2_32: same signature and filter as saoCuOrgE2, stepping 32 pixels per iteration.
INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal saoCuOrgE2_32, 6,6,8
    mov             r4d, r4m
    add             r5d, r5d                    ; stride in bytes
    inc             r1                          ; r1 = &bufft[1]
    movq            xm4, [r0 + r4 * 2]          ; save rec[lcuWidth ..]
    movhps          xm4, [r1 + r4]              ; save bufft[lcuWidth + 1 ..]
    vbroadcasti128  m5, [r3]                    ; m5 = offsetEo

.loop:
    movu            m1, [r0]
    movu            m7, [r0 + 32]
    movu            m3, [r0 + r5 + 2]
    movu            m6, [r0 + r5 + 34]

    pcmpgtw         m2, m1, m3
    pcmpgtw         m0, m7, m6
    pcmpgtw         m3, m1
    pcmpgtw         m6, m7

    packsswb        m2, m0
    packsswb        m3, m6
    vpermq          m3, m3, 11011000b
    vpermq          m2, m2, 11011000b

    pand            m2, [pb_1]
    por             m2, m3                      ; m2 = signDown for 32 pixels

    movu            m3, [r2]                    ; m3 = buff1

    paddb           m3, m2
    paddb           m3, [pb_2]                  ; m3 = edgeType
    pshufb          m0, m5, m3

    pmovsxbw        m3, xm0
    vextracti128    xm0, m0, 1
    pmovsxbw        m6, xm0

    pxor            m0, m0
    paddw           m1, m3
    paddw           m7, m6
    pmaxsw          m1, m0
    pmaxsw          m7, m0
    pminsw          m1, [pw_pixel_max]
    pminsw          m7, [pw_pixel_max]
    movu            [r0], m1
    movu            [r0 + 32], m7

    psubb           m0, m2
    movu            [r1], m0                    ; bufft[x + 1] = -signDown

    add             r0, 64
    add             r1, 32
    add             r2, 32
    sub             r4, 32
    jg              .loop

    movq            [r0 + r4 * 2], xm4
    movhps          [r1 + r4], xm4
    RET

%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE2_32, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
    mov             r4d,   r4m
    mov             r5d,   r5m
    pxor            m0,    m0                   ; m0 = 0
    vbroadcasti128  m7,    [pb_128]
    vbroadcasti128  m5,    [r3]                 ; m5 = offsetEo
    inc             r1                          ; r1 = &bufft[1]
    movq            xm6,   [r0 + r4]            ; save rec[lcuWidth ..]
    movhps          xm6,   [r1 + r4]            ; save bufft[lcuWidth + 1 ..]

.loop:
    movu            m1,    [r0]                 ; m1 = rec[x]
    movu            m2,    [r0 + r5 + 1]        ; m2 = rec[x + stride + 1]
    pxor            m3,    m1,    m7
    pxor            m4,    m2,    m7
    pcmpgtb         m2,    m3,    m4
    pcmpgtb         m4,    m3
    pand            m2,    [pb_1]
    por             m2,    m4                   ; m2 = signDown
    movu            m3,    [r2]                 ; m3 = buff1

    paddb           m3,    m2
    paddb           m3,    [pb_2]               ; m3 = edgeType

    pshufb          m4,    m5,    m3

    psubb           m3,    m0,    m2
    movu            [r1],  m3                   ; bufft[x + 1] = -signDown

    pmovzxbw        m2,    xm1
    vextracti128    xm1,   m1,    1
    pmovzxbw        m1,    xm1
    pmovsxbw        m3,    xm4
    vextracti128    xm4,   m4,    1
    pmovsxbw        m4,    xm4

    paddw           m2,    m3
    paddw           m1,    m4
    packuswb        m2,    m1
    vpermq          m2,    m2,    11011000b
    movu            [r0],  m2

    add             r0,    32
    add             r1,    32
    add             r2,    32
    sub             r4,    32
    jg              .loop

    movq            [r0 + r4], xm6
    movhps          [r1 + r4], xm6
    RET
%endif
;=======================================================================================================
;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
;
; SAO edge offset using the neighbour at rec[x + stride] (stride exactly as passed in).
; Processing starts at x = startX + 1. Per pixel:
;   signDown       = sign(rec[x] - rec[x + stride])
;   edgeType       = signDown + upBuff1[x] + 2
;   rec[x]        += m_offsetEo[edgeType]
;   upBuff1[x - 1] = -signDown
; The loops work on whole vectors and may run past endX, so 8 bytes at rec[endX] and at
; upBuff1[endX - 1] are saved before the loop and written back afterwards.
;=======================================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
cglobal saoCuOrgE3, 4,6,8
    add             r3d, r3d                    ; stride in bytes
    mov             r4d, r4m
    mov             r5d, r5m

    ; save latest 2 pixels for case startX=1 or left_endX=15
    movh            m6, [r0 + r5 * 2]
    movhps          m6, [r1 + r5 - 1]

    ; move to startX+1
    inc             r4d
    lea             r0, [r0 + r4 * 2]           ; x = startX + 1
    add             r1, r4
    sub             r5d, r4d                    ; r5 = number of pixels left
    pxor            m0, m0

.loop:
    movu            m7, [r0]
    movu            m5, [r0 + 16]
    movu            m3, [r0 + r3]
    movu            m1, [r0 + r3 + 16]

    pcmpgtw         m2, m7, m3
    pcmpgtw         m3, m7
    pcmpgtw         m4, m5, m1
    pcmpgtw         m1, m5
    packsswb        m2, m4
    packsswb        m3, m1
    pand            m2, [pb_1]
    por             m2, m3                      ; m2 = signDown

    movu            m3, [r1]                    ; m3 = m_iUpBuff1

    paddb           m3, m2
    paddb           m3, [pb_2]                  ; m3 = uiEdgeType

    movu            m4, [r2]                    ; m4 = m_iOffsetEo
    pshufb          m4, m3

    psubb           m3, m0, m2
    movu            [r1 - 1], m3                ; upBuff1[x - 1] = -signDown

    pmovsxbw        m3, m4
    punpckhbw       m4, m4
    psraw           m4, 8

    paddw           m7, m3
    paddw           m5, m4
    pmaxsw          m7, m0
    pmaxsw          m5, m0
    pminsw          m7, [pw_pixel_max]
    pminsw          m5, [pw_pixel_max]
    movu            [r0], m7
    movu            [r0 + 16], m5

    add             r0, 32
    add             r1, 16

    sub             r5, 16
    jg              .loop

    ; restore last pixels (up to 2)
    movh            [r0 + r5 * 2], m6
    movhps          [r1 + r5 - 1], m6
    RET

%else ; HIGH_BIT_DEPTH
; 8-bit pixels: the sign is built from saturating subtractions instead of biased compares:
;   m3 = 0x01 where below <= current, m4 = 0xFF where current <= below,
;   (m3 | m4) & ~equal  ->  +1 / -1 / 0.
cglobal saoCuOrgE3, 3,6,8
    mov             r3d, r3m
    mov             r4d, r4m
    mov             r5d, r5m

    ; save latest 2 pixels for case startX=1 or left_endX=15
    movh            m7, [r0 + r5]
    movhps          m7, [r1 + r5 - 1]

    ; move to startX+1
    inc             r4d
    add             r0, r4
    add             r1, r4
    sub             r5d, r4d                    ; r5 = number of pixels left
    pxor            m0, m0                      ; m0 = 0
    movu            m6, [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

.loop:
    movu            m1, [r0]                    ; m1 = pRec[x]
    movu            m2, [r0 + r3]               ; m2 = pRec[x + iStride]

    psubusb         m3, m2, m1
    psubusb         m4, m1, m2
    pcmpeqb         m3, m0
    pcmpeqb         m4, m0
    pcmpeqb         m2, m1

    pabsb           m3, m3
    por             m4, m3
    pandn           m2, m4                      ; m2 = iSignDown

    movu            m3, [r1]                    ; m3 = m_iUpBuff1

    paddb           m3, m2
    paddb           m3, m6                      ; m3 = uiEdgeType

    movu            m4, [r2]                    ; m4 = m_iOffsetEo
    pshufb          m5, m4, m3

    psubb           m3, m0, m2
    movu            [r1 - 1], m3                ; upBuff1[x - 1] = -signDown

    pmovzxbw        m2, m1
    punpckhbw       m1, m0
    pmovsxbw        m3, m5
    punpckhbw       m5, m5
    psraw           m5, 8

    paddw           m2, m3
    paddw           m1, m5
    packuswb        m2, m1
    movu            [r0], m2

    add             r0, 16
    add             r1, 16

    sub             r5, 16
    jg              .loop

    ; restore last pixels (up to 2)
    movh            [r0 + r5], m7
    movhps          [r1 + r5 - 1], m7
    RET
%endif

INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal saoCuOrgE3, 4,6,6
    add             r3d, r3d                    ; stride in bytes
    mov             r4d, r4m
    mov             r5d, r5m

    ; save latest 2 pixels for case startX=1 or left_endX=15
    movq            xm5, [r0 + r5 * 2]
    movhps          xm5, [r1 + r5 - 1]

    ; move to startX+1
    inc             r4d
    lea             r0, [r0 + r4 * 2]           ; x = startX + 1
    add             r1, r4
    sub             r5d, r4d
    movu            xm4, [r2]                   ; xm4 = m_offsetEo

.loop:
    movu            m1, [r0]
    movu            m0, [r0 + r3]

    pcmpgtw         m2, m1, m0
    pcmpgtw         m0, m1
    packsswb        m2, m0
    vpermq          m0, m2, 11011101b
    vpermq          m2, m2, 10001000b
    pand            m2, [pb_1]
    por             m2, m0                      ; low 128 bits = signDown

    movu            xm0, [r1]                   ; xm0 = upBuff1
    paddb           xm0, xm2
    paddb           xm0, [pb_2]                 ; xm0 = edgeType

    pshufb          xm3, xm4, xm0
    pmovsxbw        m3, xm3

    paddw           m1, m3
    pxor            m0, m0
    pmaxsw          m1, m0
    pminsw          m1, [pw_pixel_max]
    movu            [r0], m1

    psubb           xm0, xm2
    movu            [r1 - 1], xm0               ; upBuff1[x - 1] = -signDown

    add             r0, 32
    add             r1, 16
    sub             r5, 16
    jg              .loop

    ; restore last pixels (up to 2)
    movq            [r0 + r5 * 2], xm5
    movhps          [r1 + r5 - 1], xm5
    RET

%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE3, 3, 6, 8
    mov             r3d,  r3m
    mov             r4d,  r4m
    mov             r5d,  r5m

    ; save latest 2 pixels for case startX=1 or left_endX=15
    movq            xm7,  [r0 + r5]
    movhps          xm7,  [r1 + r5 - 1]

    ; move to startX+1
    inc             r4d
    add             r0,   r4
    add             r1,   r4
    sub             r5d,  r4d
    pxor            xm0,  xm0                   ; xm0 = 0
    mova            xm6,  [pb_2]                ; xm6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    movu            xm5,  [r2]                  ; xm5 = m_iOffsetEo

.loop:
    movu            xm1,  [r0]                  ; xm1 = pRec[x]
    movu            xm2,  [r0 + r3]             ; xm2 = pRec[x + iStride]

    psubusb         xm3,  xm2,  xm1
    psubusb         xm4,  xm1,  xm2
    pcmpeqb         xm3,  xm0
    pcmpeqb         xm4,  xm0
    pcmpeqb         xm2,  xm1

    pabsb           xm3,  xm3
    por             xm4,  xm3
    pandn           xm2,  xm4                   ; xm2 = iSignDown

    movu            xm3,  [r1]                  ; xm3 = m_iUpBuff1

    paddb           xm3,  xm2
    paddb           xm3,  xm6                   ; xm3 = uiEdgeType

    pshufb          xm4,  xm5,  xm3

    psubb           xm3,  xm0,  xm2
    movu            [r1 - 1],   xm3             ; upBuff1[x - 1] = -signDown

    pmovzxbw        m2,   xm1
    pmovsxbw        m3,   xm4

    paddw           m2,   m3
    vextracti128    xm3,  m2,   1
    packuswb        xm2,  xm3
    movu            [r0], xm2

    add             r0,   16
    add             r1,   16

    sub             r5,   16
    jg              .loop

    ; restore last pixels (up to 2)
    movq            [r0 + r5],     xm7
    movhps          [r1 + r5 - 1], xm7
    RET
%endif

; saoCuOrgE3_32: same signature and filter as saoCuOrgE3, stepping 32 pixels per iteration.
INIT_YMM avx2
%if HIGH_BIT_DEPTH
; Declares 4 loaded arguments (the original declared 3) because r3 (stride) is read by the
; very first instruction; this matches the HIGH_BIT_DEPTH saoCuOrgE3 variants.
cglobal saoCuOrgE3_32, 4,6,8
    add             r3d, r3d                    ; stride in bytes
    mov             r4d, r4m
    mov             r5d, r5m

    ; save latest 2 pixels for case startX=1 or left_endX=15
    movq            xm5, [r0 + r5 * 2]
    movhps          xm5, [r1 + r5 - 1]

    ; move to startX+1
    inc             r4d
    lea             r0, [r0 + r4 * 2]           ; x = startX + 1
    add             r1, r4
    sub             r5d, r4d
    vbroadcasti128  m4, [r2]                    ; m4 = m_offsetEo

.loop:
    movu            m1, [r0]
    movu            m7, [r0 + 32]
    movu            m0, [r0 + r3]
    movu            m6, [r0 + r3 + 32]

    pcmpgtw         m2, m1, m0
    pcmpgtw         m3, m7, m6
    pcmpgtw         m0, m1
    pcmpgtw         m6, m7

    packsswb        m2, m3
    packsswb        m0, m6
    vpermq          m2, m2, 11011000b
    vpermq          m0, m0, 11011000b
    pand            m2, [pb_1]
    por             m2, m0                      ; m2 = signDown for 32 pixels

    movu            m0, [r1]                    ; m0 = upBuff1
    paddb           m0, m2
    paddb           m0, [pb_2]                  ; m0 = edgeType

    pshufb          m3, m4, m0
    vextracti128    xm6, m3, 1
    pmovsxbw        m3, xm3
    pmovsxbw        m6, xm6

    paddw           m1, m3
    paddw           m7, m6
    pxor            m0, m0
    pmaxsw          m1, m0
    pmaxsw          m7, m0
    pminsw          m1, [pw_pixel_max]
    pminsw          m7, [pw_pixel_max]
    movu            [r0], m1
    movu            [r0 + 32], m7

    psubb           m0, m2
    movu            [r1 - 1], m0                ; upBuff1[x - 1] = -signDown

    add             r0, 64
    add             r1, 32
    sub             r5, 32
    jg              .loop

    ; restore last pixels (up to 2)
    movq            [r0 + r5 * 2], xm5
    movhps          [r1 + r5 - 1], xm5
    RET

%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE3_32, 3, 6, 8
    mov             r3d,  r3m
    mov             r4d,  r4m
    mov             r5d,  r5m

    ; save latest 2 pixels for case startX=1 or left_endX=15
    movq            xm7,  [r0 + r5]
    movhps          xm7,  [r1 + r5 - 1]

    ; move to startX+1
    inc             r4d
    add             r0,   r4
    add             r1,   r4
    sub             r5d,  r4d
    pxor            m0,   m0                    ; m0 = 0
    mova            m6,   [pb_2]                ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    vbroadcasti128  m5,   [r2]                  ; m5 = m_iOffsetEo

.loop:
    movu            m1,   [r0]                  ; m1 = pRec[x]
    movu            m2,   [r0 + r3]             ; m2 = pRec[x + iStride]

    psubusb         m3,   m2,   m1
    psubusb         m4,   m1,   m2
    pcmpeqb         m3,   m0
    pcmpeqb         m4,   m0
    pcmpeqb         m2,   m1

    pabsb           m3,   m3
    por             m4,   m3
    pandn           m2,   m4                    ; m2 = iSignDown

    movu            m3,   [r1]                  ; m3 = m_iUpBuff1

    paddb           m3,   m2
    paddb           m3,   m6                    ; m3 = uiEdgeType

    pshufb          m4,   m5,   m3

    psubb           m3,   m0,   m2
    movu            [r1 - 1],   m3              ; upBuff1[x - 1] = -signDown

    pmovzxbw        m2,   xm1
    vextracti128    xm1,  m1,   1
    pmovzxbw        m1,   xm1
    pmovsxbw        m3,   xm4
    vextracti128    xm4,  m4,   1
    pmovsxbw        m4,   xm4

    paddw           m2,   m3
    paddw           m1,   m4
    packuswb        m2,   m1
    vpermq          m2,   m2,   11011000b
    movu            [r0], m2

    add             r0,   32
    add             r1,   32
    sub             r5,   32
    jg              .loop

    ; restore last pixels (up to 2)
    movq            [r0 + r5],     xm7
    movhps          [r1 + r5 - 1], xm7
    RET
%endif

;=====================================================================================
; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
;
; SAO band offset. For every pixel the band index is the top five bits of the sample
; (rec >> (BIT_DEPTH - 5), i.e. rec >> 3 for 8-bit). The code treats offset as a table of 32
; signed bytes: it is loaded as two 16-byte halves, both are looked up with pshufb, and the
; half is chosen by (index > 15). The looked-up value is sign-extended, added to the pixel and
; clamped (HIGH_BIT_DEPTH) or saturated (8-bit).
; Each row is processed in lcuWidth >> 4 groups of 16 pixels; there is no tail path.
;=====================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
cglobal saoCuOrgB0, 5,7,8
    add         r4d, r4d                        ; stride in bytes

    shr         r2d, 4                          ; 16-pixel groups per row
    movu        m3, [r1]                        ; offset[0-15]
    movu        m4, [r1 + 16]                   ; offset[16-31]
    pxor        m7, m7

.loopH:
    mov         r5d, r2d
    xor         r6,  r6                         ; r6 = byte offset inside the row

.loopW:
    movu        m2, [r0 + r6]
    movu        m5, [r0 + r6 + 16]
    psrlw       m0, m2, (BIT_DEPTH - 5)
    psrlw       m6, m5, (BIT_DEPTH - 5)
    packuswb    m0, m6
    pand        m0, [pb_31]                     ; m0 = [index]

    pshufb      m6, m3, m0
    pshufb      m1, m4, m0
    pcmpgtb     m0, [pb_15]                     ; m0 = [mask]

    pblendvb    m6, m6, m1, m0                  ; NOTE: don't use 3 parameters style, x264 macro have some bug!

    pmovsxbw    m0, m6                          ; offset
    punpckhbw   m6, m6
    psraw       m6, 8

    paddw       m2, m0
    paddw       m5, m6
    pmaxsw      m2, m7
    pmaxsw      m5, m7
    pminsw      m2, [pw_pixel_max]
    pminsw      m5, [pw_pixel_max]

    movu        [r0 + r6], m2
    movu        [r0 + r6 + 16], m5
    add         r6d, 32
    dec         r5d
    jnz         .loopW

    lea         r0, [r0 + r4]

    dec         r3d
    jnz         .loopH
    RET

%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgB0, 4, 7, 8
    mov         r3d, r3m
    mov         r4d, r4m

    shr         r2d, 4                          ; 16-pixel groups per row
    movu        m3, [r1 + 0]                    ; offset[0-15]
    movu        m4, [r1 + 16]                   ; offset[16-31]
    pxor        m7, m7                          ; m7 = [0]

.loopH:
    mov         r5d, r2d
    xor         r6,  r6                         ; r6 = byte offset inside the row

.loopW:
    movu        m2, [r0 + r6]                   ; m2 = [rec]
    psrlw       m1, m2, 3
    pand        m1, [pb_31]                     ; m1 = [index]
    pcmpgtb     m0, m1, [pb_15]                 ; m0 = [mask]

    pshufb      m6, m3, m1
    pshufb      m5, m4, m1

    pblendvb    m6, m6, m5, m0                  ; NOTE: don't use 3 parameters style, x264 macro have some bug!

    pmovzxbw    m1, m2                          ; rec
    punpckhbw   m2, m7

    pmovsxbw    m0, m6                          ; offset
    punpckhbw   m6, m6
    psraw       m6, 8

    paddw       m1, m0
    paddw       m2, m6
    packuswb    m1, m2

    movu        [r0 + r6], m1
    add         r6d, 16
    dec         r5d
    jnz         .loopW

    lea         r0, [r0 + r4]

    dec         r3d
    jnz         .loopH
    RET
%endif

INIT_YMM avx2
%if HIGH_BIT_DEPTH
; Two rows per .loopH iteration (lcuHeight >> 1 iterations); if lcuHeight is odd the last row is
; handled by .loopW1. NOTE(review): with lcuHeight < 2 the .loopH counter starts at 0 and the
; dec/jnz pair would wrap -- presumably never called that way, confirm against the caller.
cglobal saoCuOrgB0, 5,7,8
    vbroadcasti128  m3, [r1]                    ; offset[0-15]
    vbroadcasti128  m4, [r1 + 16]               ; offset[16-31]
    add             r4d, r4d                    ; stride in bytes
    lea             r1, [r4 * 2]
    sub             r1d, r2d
    sub             r1d, r2d                    ; r1 = 2 * stride bytes - 2 * lcuWidth: step to the next row pair
    shr             r2d, 4
    mova            m7, [pw_pixel_max]

    mov             r6d, r3d                    ; keep lcuHeight for the odd-row test
    shr             r3d, 1

.loopH:
    mov             r5d, r2d

.loopW:
    movu            m2, [r0]
    movu            m5, [r0 + r4]
    psrlw           m0, m2, (BIT_DEPTH - 5)
    psrlw           m6, m5, (BIT_DEPTH - 5)
    packuswb        m0, m6
    vpermq          m0, m0, 11011000b
    pand            m0, [pb_31]                 ; m0 = [index]

    pshufb          m6, m3, m0
    pshufb          m1, m4, m0
    pcmpgtb         m0, [pb_15]                 ; m0 = [mask]

    pblendvb        m6, m6, m1, m0              ; NOTE: don't use 3 parameters style, x264 macro have some bug!

    pmovsxbw        m0, xm6
    vextracti128    xm6, m6, 1
    pmovsxbw        m6, xm6

    paddw           m2, m0
    paddw           m5, m6
    pxor            m1, m1
    pmaxsw          m2, m1
    pmaxsw          m5, m1
    pminsw          m2, m7
    pminsw          m5, m7

    movu            [r0], m2
    movu            [r0 + r4], m5

    add             r0, 32
    dec             r5d
    jnz             .loopW

    add             r0, r1
    dec             r3d
    jnz             .loopH

    test            r6b, 1                      ; odd lcuHeight -> one row left
    jz              .end
    xor             r1, r1

.loopW1:
    movu            m2, [r0 + r1]
    psrlw           m0, m2, (BIT_DEPTH - 5)
    packuswb        m0, m0
    vpermq          m0, m0, 10001000b
    pand            m0, [pb_31]                 ; m0 = [index]

    pshufb          m6, m3, m0
    pshufb          m1, m4, m0
    pcmpgtb         m0, [pb_15]                 ; m0 = [mask]

    pblendvb        m6, m6, m1, m0              ; NOTE: don't use 3 parameters style, x264 macro have some bug!
    pmovsxbw        m0, xm6                     ; offset

    paddw           m2, m0
    pxor            m0, m0
    pmaxsw          m2, m0
    pminsw          m2, m7

    movu            [r0 + r1], m2
    add             r1d, 32
    dec             r2d
    jnz             .loopW1

.end:
    RET

%else ; HIGH_BIT_DEPTH
; Two rows per .loopH iteration (low lane = row y, high lane = row y + 1); an odd final row is
; handled by .loopW1 with XMM registers. The lcuHeight < 2 caveat noted for the HIGH_BIT_DEPTH
; AVX2 variant applies here as well.
cglobal saoCuOrgB0, 4, 7, 8
    mov             r3d,        r3m
    mov             r4d,        r4m
    mova            m7,         [pb_31]
    vbroadcasti128  m3,         [r1 + 0]        ; offset[0-15]
    vbroadcasti128  m4,         [r1 + 16]       ; offset[16-31]
    lea             r6,         [r4 * 2]
    sub             r6d,        r2d             ; r6 = 2 * stride - lcuWidth: step to the next row pair
    shr             r2d,        4
    mov             r1d,        r3d             ; keep lcuHeight for the odd-row test
    shr             r3d,        1

.loopH:
    mov             r5d,        r2d

.loopW:
    movu            xm2,        [r0]            ; m2 = [rec]
    vinserti128     m2,         m2,  [r0 + r4],  1
    psrlw           m1,         m2,  3
    pand            m1,         m7              ; m1 = [index]
    pcmpgtb         m0,         m1,  [pb_15]    ; m0 = [mask]

    pshufb          m6,         m3,  m1
    pshufb          m5,         m4,  m1

    pblendvb        m6,         m6,  m5,  m0    ; NOTE: don't use 3 parameters style, x264 macro have some bug!

    pmovzxbw        m1,         xm2             ; rec
    vextracti128    xm2,        m2,  1
    pmovzxbw        m2,         xm2
    pmovsxbw        m0,         xm6             ; offset
    vextracti128    xm6,        m6,  1
    pmovsxbw        m6,         xm6

    paddw           m1,         m0
    paddw           m2,         m6
    packuswb        m1,         m2
    vpermq          m1,         m1,  11011000b

    movu            [r0],       xm1
    vextracti128    [r0 + r4],  m1,  1
    add             r0,         16
    dec             r5d
    jnz             .loopW

    add             r0,         r6
    dec             r3d
    jnz             .loopH
    test            r1b,        1               ; odd lcuHeight -> one row left
    jz              .end
    mov             r5d,        r2d

.loopW1:
    movu            xm2,        [r0]            ; m2 = [rec]
    psrlw           xm1,        xm2,  3
    pand            xm1,        xm7             ; m1 = [index]
    pcmpgtb         xm0,        xm1,  [pb_15]   ; m0 = [mask]

    pshufb          xm6,        xm3,  xm1
    pshufb          xm5,        xm4,  xm1

    pblendvb        xm6,        xm6,  xm5,  xm0 ; NOTE: don't use 3 parameters style, x264 macro have some bug!

    pmovzxbw        m1,         xm2             ; rec
    pmovsxbw        m0,         xm6             ; offset

    paddw           m1,         m0
    vextracti128    xm0,        m1,  1
    packuswb        xm1,        xm0

    movu            [r0],       xm1
    add             r0,         16
    dec             r5d
    jnz             .loopW1

.end:
    RET
%endif

;============================================================================================================
; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
;
; dst[x] = sign(src1[x] - src2[x]) as an int8 (-1, 0 or +1) for x in [0, width).
; HIGH_BIT_DEPTH variants run (width >> 4) + 1 iterations of 16 outputs, which can write past
; dst + width; the mmsize bytes that start at dst + width are therefore loaded first and stored
; back after the loop.
; 8-bit variants process whole vectors, then merge a partial vector into dst through a blend
; mask taken from pb_movemask / pb_movemask_32 so bytes beyond width keep their old value.
;============================================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
cglobal calSign, 4, 7, 5
    mova            m0, [pw_1]
    mov             r4d, r3d                    ; r4 = width
    shr             r3d, 4
    add             r3d, 1                      ; (width >> 4) + 1 iterations
    mov             r5, r0                      ; r5 = original dst
    movu            m4, [r0 + r4]               ; save the bytes that follow dst[width - 1]

.loop:
    movu            m1, [r1]                    ; m1 = src1[x] (pRec)
    movu            m2, [r2]                    ; m2 = src2[x] (pTmpU)

    pcmpgtw         m3, m1, m2
    pcmpgtw         m2, m1
    pand            m3, m0
    por             m3, m2
    packsswb        m3, m3
    movh            [r0], xm3

    movu            m1, [r1 + 16]               ; m1 = src1[x + 8]
    movu            m2, [r2 + 16]               ; m2 = src2[x + 8]

    pcmpgtw         m3, m1, m2
    pcmpgtw         m2, m1
    pand            m3, m0
    por             m3, m2
    packsswb        m3, m3
    movh            [r0 + 8], xm3

    add             r0, 16
    add             r1, 32
    add             r2, 32
    dec             r3d
    jnz             .loop

    mov             r6, r0
    sub             r6, r5                      ; bytes written
    sub             r4, r6                      ; r0 + r4 = original dst + width
    movu            [r0 + r4], m4               ; restore the saved bytes
    RET

%else ; HIGH_BIT_DEPTH
cglobal calSign, 4,5,6
    mova        m0,     [pb_128]                ; unsigned-compare bias
    mova        m1,     [pb_1]

    sub         r1,     r0                      ; address src1 / src2 relative to dst
    sub         r2,     r0

    mov         r4d,    r3d
    shr         r3d,    4                       ; whole 16-byte vectors
    jz          .next

.loop:
    movu        m2,     [r0 + r1]               ; m2 = pRec[x]
    movu        m3,     [r0 + r2]               ; m3 = pTmpU[x]
    pxor        m4,     m2,     m0
    pxor        m3,     m0
    pcmpgtb     m5,     m4,     m3
    pcmpgtb     m3,     m4
    pand        m5,     m1
    por         m5,     m3
    movu        [r0],   m5

    add         r0,     16
    dec         r3d
    jnz         .loop

    ; process partial
.next:
    and         r4d, 15                         ; r4 = remaining outputs
    jz          .end

    movu        m2,     [r0 + r1]               ; m2 = pRec[x]
    movu        m3,     [r0 + r2]               ; m3 = pTmpU[x]
    pxor        m4,     m2,     m0
    pxor        m3,     m0
    pcmpgtb     m5,     m4,     m3
    pcmpgtb     m3,     m4
    pand        m5,     m1
    por         m5,     m3

    lea         r3,     [pb_movemask + 16]
    sub         r3,     r4
    movu        xmm0,   [r3]                    ; blend mask (pb_movemask is defined elsewhere; presumably 0x00.. then 0xFF..)
    movu        m3,     [r0]                    ; current dst contents
    pblendvb    m5,     m5,     m3,     xmm0    ; take old dst bytes where the mask is set
    movu        [r0],   m5

.end:
    RET
%endif

INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal calSign, 4, 7, 5
    mova            m0, [pw_1]
    mov             r4d, r3d                    ; r4 = width
    shr             r3d, 4
    add             r3d, 1                      ; (width >> 4) + 1 iterations
    mov             r5, r0                      ; r5 = original dst
    movu            m4, [r0 + r4]               ; save the 32 bytes that follow dst[width - 1]

.loop:
    movu            m1, [r1]                    ; m1 = src1[x] (pRec)
    movu            m2, [r2]                    ; m2 = src2[x] (pTmpU)

    pcmpgtw         m3, m1, m2
    pcmpgtw         m2, m1

    pand            m3, m0
    por             m3, m2
    packsswb        m3, m3
    vpermq          m3, m3, q3220               ; gather both lanes' packed bytes into the low 128 bits
    movu            [r0], xm3

    add             r0, 16
    add             r1, 32
    add             r2, 32
    dec             r3d
    jnz             .loop

    mov             r6, r0
    sub             r6, r5                      ; bytes written
    sub             r4, r6                      ; r0 + r4 = original dst + width
    movu            [r0 + r4], m4               ; restore the saved bytes
    RET

%else ; HIGH_BIT_DEPTH
cglobal calSign, 4, 5, 6
    vbroadcasti128  m0,     [pb_128]            ; unsigned-compare bias
    mova            m1,     [pb_1]

    sub             r1,     r0                  ; address src1 / src2 relative to dst
    sub             r2,     r0

    mov             r4d,    r3d
    shr             r3d,    5                   ; whole 32-byte vectors
    jz              .next

.loop:
    movu            m2,     [r0 + r1]           ; m2 = pRec[x]
    movu            m3,     [r0 + r2]           ; m3 = pTmpU[x]
    pxor            m4,     m2,     m0
    pxor            m3,     m0
    pcmpgtb         m5,     m4,     m3
    pcmpgtb         m3,     m4
    pand            m5,     m1
    por             m5,     m3
    movu            [r0],   m5

    add             r0,     mmsize
    dec             r3d
    jnz             .loop

    ; process partial
.next:
    and             r4d,    31                  ; r4 = remaining outputs
    jz              .end

    movu            m2,     [r0 + r1]           ; m2 = pRec[x]
    movu            m3,     [r0 + r2]           ; m3 = pTmpU[x]
    pxor            m4,     m2,     m0
    pxor            m3,     m0
    pcmpgtb         m5,     m4,     m3
    pcmpgtb         m3,     m4
    pand            m5,     m1
    por             m5,     m3

    lea             r3,     [pb_movemask_32 + 32]
    sub             r3,     r4
    movu            m0,     [r3]                ; first `remaining` bytes 0x00, the rest 0xFF
    movu            m3,     [r0]                ; current dst contents
    pblendvb        m5,     m5,     m3,     m0  ; keep old dst bytes beyond the remaining count
    movu            [r0],   m5

.end:
    RET
%endif

;--------------------------------------------------------------------------------------------------------------------------
; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
;
; Band-offset statistics over endX columns by endY rows (8-bit pixel layout, x86-64 only).
; For every pixel, m1 holds ((rec >> 3) + 1) * 4, used directly as a byte offset into the int32
; arrays: count[...] is incremented and stats[...] receives the pmaddubsw result of the
; interleaved (fenc, rec) bytes with hmul_16p + 16 (per the original comments this is
; fenc[x] - rec[x]; hmul_16p is defined elsewhere).
; The 16-way %rep block handles one pixel per step and leaves through .next as soon as the row's
; endX pixels are done.
;--------------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse4
cglobal saoCuStatsBO, 7,12,6
    mova        m3, [hmul_16p + 16]
    mova        m4, [pb_124]
    mova        m5, [pb_4]
    xor         r7d, r7d

.loopH:
    mov         r10, r0                         ; r10 = fenc cursor for this row
    mov         r11, r1                         ; r11 = rec cursor for this row
    mov         r9d, r3d                        ; r9d = pixels left in this row

.loopL:
    movu        m1, [r11]
    movu        m0, [r10]

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    psrlw       m1, 1                           ; rec[x] >> boShift (kept scaled by 4 after the mask below)
    pmaddubsw   m2, m3
    pmaddubsw   m0, m3
    pand        m1, m4
    paddb       m1, m5

%assign x 0
%rep 16
    pextrb      r7d, m1, x

%if (x < 8)
    pextrw      r8d, m0, (x % 8)
%else
    pextrw      r8d, m2, (x % 8)
%endif
    movsx       r8d, r8w
    inc         dword  [r6 + r7]                ; count[classIdx]++
    add         [r5 + r7], r8d                  ; stats[classIdx] += (fenc[x] - rec[x]);
    dec         r9d
    jz          .next
%assign x x+1
%endrep

    add         r10, 16
    add         r11, 16
    jmp         .loopL

.next:
    add         r0, r2
    add         r1, r2
    dec         r4d
    jnz         .loopH
    RET
%endif

;-----------------------------------------------------------------------------------------------------------------------
;
; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
;
;   Horizontal edge-offset statistics over endX * endY pixels (endX >= 1).
;   For every pixel: edgeType = sign(rec[x] - rec[x-1]) + sign(rec[x] - rec[x+1]) + 2;
;   a 16-bit counter and a 32-bit sum of (fenc[x] - rec[x]) are accumulated per
;   edgeType in a 32-byte stack buffer and added to count[] / stats[] at the end
;   through the s_eoTable permutation {1, 2, 0, 3, 4}.
;
;   r0 = fenc, r1 = rec, r2 = stride (rewritten to the row step, see below),
;   r3 = endX, r4 = endY, r5 = pixels left in the row, r6 = scratch,
;   r7 = edgeType, r8 = stats
;-----------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse4
cglobal saoCuStatsE0, 5,9,8, 0-32
    mov         r3d, r3m
    mov         r8, r5mp

    ; Within a row r0/r1 are advanced by 16 after every block except the one
    ; that finishes the row, i.e. by ((endX - 1) & ~15) bytes in total.
    ; Fold that into the stride so that 'add r0, r2' lands on the next row.
    ; (Deriving it from endX & 15 is off by 16 when endX is a multiple of 16.)
    lea         r6d, [r3 - 1]
    and         r6d, -16
    sub         r2, r6

    ; clear internal temporary buffer
    pxor        m0, m0
    mova        [rsp], m0
    mova        [rsp + mmsize], m0
    mova        m4, [pb_128]
    mova        m5, [hmul_16p + 16]
    mova        m6, [pb_2]
    xor         r7d, r7d

.loopH:
    mov         r5d, r3d

    ; calculate signLeft
    ; byte 15 of m0 = -sign(rec[0] - rec[-1]); psignb below negates it again
    mov         r7b, [r1]
    sub         r7b, [r1 - 1]
    seta        r7b
    setb        r6b
    sub         r7b, r6b
    neg         r7b
    pinsrb      m0, r7d, 15

.loopL:
    movu        m7, [r1]                ; rec[0-15]
    movu        m2, [r1 + 1]            ; rec[1-16]

    pxor        m1, m7, m4              ; flip sign bits: signed compare orders unsigned bytes
    pxor        m3, m2, m4
    pcmpgtb     m2, m1, m3
    pcmpgtb     m3, m1
    pand        m2, [pb_1]
    por         m2, m3                  ; signRight
    palignr     m3, m2, m0, 15          ; previous sign followed by signRight[0-14]
    psignb      m3, m4                  ; signLeft (negated, m4 bytes are negative)
    mova        m0, m2                  ; carry signRight[15] into the next block
    paddb       m2, m3
    paddb       m2, m6                  ; edgeType

    ; stats[edgeType]
    movu        m3, [r0]                ; fenc[0-15]
    punpckhbw   m1, m3, m7
    punpcklbw   m3, m7
    pmaddubsw   m1, m5
    pmaddubsw   m3, m5

    ; unrolled per-pixel accumulation; leaves the block as soon as the row is exhausted
%assign x 0
%rep 16
    pextrb      r7d, m2, x
%if (x < 8)
    pextrw      r6d, m3, (x % 8)
%else
    pextrw      r6d, m1, (x % 8)
%endif
    movsx       r6d, r6w
    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
    add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
    dec         r5d
    jz          .next
%assign x x+1
%endrep

    add         r0q, 16
    add         r1q, 16
    jmp         .loopL

.next:
    ; r2 = stride - ((endX - 1) & ~15): step from the last block to the next row
    add         r0, r2
    add         r1, r2

    dec         r4d
    jnz         .loopH

    ; sum to global buffer
    mov         r0, r6mp                ; r0 = count

    ; s_eoTable = {1, 2, 0, 3, 4}
    movzx       r5d, word [rsp + 0 * 2]
    add         [r0 + 1 * 4], r5d
    movzx       r6d, word [rsp + 1 * 2]
    add         [r0 + 2 * 4], r6d
    movzx       r5d, word [rsp + 2 * 2]
    add         [r0 + 0 * 4], r5d
    movzx       r6d, word [rsp + 3 * 2]
    add         [r0 + 3 * 4], r6d
    movzx       r5d, word [rsp + 4 * 2]
    add         [r0 + 4 * 4], r5d

    mov         r6d, [rsp + 5 * 2 + 0 * 4]
    add         [r8 + 1 * 4], r6d
    mov         r5d, [rsp + 5 * 2 + 1 * 4]
    add         [r8 + 2 * 4], r5d
    mov         r6d, [rsp + 5 * 2 + 2 * 4]
    add         [r8 + 0 * 4], r6d
    mov         r5d, [rsp + 5 * 2 + 3 * 4]
    add         [r8 + 3 * 4], r5d
    mov         r6d, [rsp + 5 * 2 + 4 * 4]
    add         [r8 + 4 * 4], r6d
    RET
%endif
;-------------------------------------------------------------------------------------------------------------------------------------------
; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
;
;   Vertical edge-offset statistics over endX * endY pixels.
;   For every pixel: signDown = sign(rec[x] - rec[x + stride]),
;   edgeType = signDown + upBuff1[x] + 2, then upBuff1[x] = -signDown for the
;   next row. A 16-bit counter and a 32-bit sum per edgeType are accumulated
;   in a 32-byte stack buffer and added to count[] / stats[] at the end
;   through the s_eoTable permutation {1, 2, 0, 3, 4}.
;
;   r0 = fenc, r1 = rec, r2 = stride, r3 = upBuff1, r4 = endX, r5 = endY,
;   r6 = pixels left in the row, r7 = edgeType, r8 = scratch,
;   r9 / r10 / r11 = fenc / rec / upBuff1 cursors within the row
;-------------------------------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse4
cglobal saoCuStatsE1, 4,12,9,0-32    ; Stack: 5 of stats and 5 of count
    mov         r5d, r5m
    mov         r4d, r4m

    ; clear internal temporary buffer
    pxor        m0, m0
    mova        [rsp], m0
    mova        [rsp + mmsize], m0
    mova        m0, [pb_128]
    mova        m5, [pb_1]
    mova        m6, [pb_2]
    mova        m8, [hmul_16p + 16]

    ; the loop stores whole 16-byte blocks to upBuff1; keep the bytes behind endX
    ; NOTE(review): only 8 bytes are saved here while a block store can reach up
    ; to 15 bytes past endX - confirm the callers do not rely on those bytes.
    movh        m7, [r3 + r4]

.loopH:
    mov         r6d, r4d
    mov         r9, r0
    mov         r10, r1
    mov         r11, r3

.loopW:
    movu        m1, [r10]               ; rec[x]
    movu        m2, [r10 + r2]          ; rec[x + stride]

    ; signDown
    pxor        m1, m0                  ; flip sign bits: signed compare orders unsigned bytes
    pxor        m2, m0
    pcmpgtb     m3, m1, m2
    pand        m3, m5
    pcmpgtb     m2, m1
    por         m2, m3
    pxor        m3, m3
    psubb       m3, m2                  ; -signDown

    ; edgeType
    movu        m4, [r11]
    paddb       m4, m6
    paddb       m2, m4

    ; update upBuff1
    movu        [r11], m3

    ; stats[edgeType]
    pxor        m1, m0                  ; undo the sign-bit flip: m1 = rec[x] again
    movu        m3, [r9]                ; fenc[x]
    punpckhbw   m4, m3, m1
    punpcklbw   m3, m1
    pmaddubsw   m3, m8
    pmaddubsw   m4, m8

    ; 16 pixels; leaves the block as soon as the row is exhausted
%assign x 0
%rep 16
    pextrb      r7d, m2, x
    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
%if (x < 8)
    pextrw      r8d, m3, (x % 8)
%else
    pextrw      r8d, m4, (x % 8)
%endif
    movsx       r8d, r8w
    add         [rsp + 5 * 2 + r7 * 4], r8d     ; tmp_stats[edgeType] += word from pmaddubsw
    dec         r6d
    jz          .next
%assign x x+1
%endrep

    add         r9, 16
    add         r10, 16
    add         r11, 16
    jmp         .loopW

.next:
    ; advance fenc and rec to the next row (the cursors are reloaded at .loopH)
    add         r0, r2
    add         r1, r2

    dec         r5d
    jg          .loopH

    ; restore unavailable pixels
    movh        [r3 + r4], m7

    ; sum to global buffer
    mov         r1, r6m                 ; r1 = stats
    mov         r0, r7m                 ; r0 = count

    ; s_eoTable = {1,2,0,3,4}
    movzx       r6d, word [rsp + 0 * 2]
    add         [r0 + 1 * 4], r6d
    movzx       r6d, word [rsp + 1 * 2]
    add         [r0 + 2 * 4], r6d
    movzx       r6d, word [rsp + 2 * 2]
    add         [r0 + 0 * 4], r6d
    movzx       r6d, word [rsp + 3 * 2]
    add         [r0 + 3 * 4], r6d
    movzx       r6d, word [rsp + 4 * 2]
    add         [r0 + 4 * 4], r6d

    mov         r6d, [rsp + 5 * 2 + 0 * 4]
    add         [r1 + 1 * 4], r6d
    mov         r6d, [rsp + 5 * 2 + 1 * 4]
    add         [r1 + 2 * 4], r6d
    mov         r6d, [rsp + 5 * 2 + 2 * 4]
    add         [r1 + 0 * 4], r6d
    mov         r6d, [rsp + 5 * 2 + 3 * 4]
    add         [r1 + 3 * 4], r6d
    mov         r6d, [rsp + 5 * 2 + 4 * 4]
    add         [r1 + 4 * 4], r6d
    RET
%endif ; ARCH_X86_64