view x265/source/encoder/sao.cpp @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
parents
children
line wrap: on
line source

/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Min Chen <chenm003@163.com>
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "sao.h"

namespace {

inline int32_t roundIBDI(int32_t num, int32_t den)
{
    return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2));
}

/* get the sign of input variable (TODO: this is a dup, make common) */
inline int8_t signOf(int x)
{
    return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
}

inline int signOf2(const int a, const int b)
{
    // NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
    int r = 0;
    if (a < b)
        r = -1;
    if (a > b)
        r = 1;
    return r;
}

inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
{
    return (count * offset - offsetOrg * 2) * offset;
}
} // end anonymous namespace


namespace X265_NS {

const uint32_t SAO::s_eoTable[NUM_EDGETYPE] =
{
    1, // 0
    2, // 1
    0, // 2
    3, // 3
    4  // 4
};

SAO::SAO()
{
    m_count = NULL;
    m_offset = NULL;
    m_offsetOrg = NULL;
    m_countPreDblk = NULL;
    m_offsetOrgPreDblk = NULL;
    m_refDepth = 0;
    m_lumaLambda = 0;
    m_chromaLambda = 0;
    m_param = NULL;
    m_clipTable = NULL;
    m_clipTableBase = NULL;
    m_tmpU1[0] = NULL;
    m_tmpU1[1] = NULL;
    m_tmpU1[2] = NULL;
    m_tmpU2[0] = NULL;
    m_tmpU2[1] = NULL;
    m_tmpU2[2] = NULL;
    m_tmpL1 = NULL;
    m_tmpL2 = NULL;

    m_depthSaoRate[0][0] = 0;
    m_depthSaoRate[0][1] = 0;
    m_depthSaoRate[0][2] = 0;
    m_depthSaoRate[0][3] = 0;
    m_depthSaoRate[1][0] = 0;
    m_depthSaoRate[1][1] = 0;
    m_depthSaoRate[1][2] = 0;
    m_depthSaoRate[1][3] = 0;
}

bool SAO::create(x265_param* param)
{
    m_param = param;
    if (param->internalCsp != X265_CSP_I400) {
        m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
        m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);
        m_numPlanes = 3;
    } else {
        m_numPlanes = 1;
    }
  
    m_numCuInWidth =  (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
    m_numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;

    const pixel maxY = (1 << X265_DEPTH) - 1;
    const pixel rangeExt = maxY >> 1;
    int numCtu = m_numCuInWidth * m_numCuInHeight;

    CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);

    CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
    CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);

    for (int i = 0; i < 3; i++)
    {
        // SAO asm code will read 1 pixel before and after, so pad by 2
        CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
        m_tmpU1[i] += 1;
        CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
        m_tmpU2[i] += 1;
    }

    CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
    CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
    CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);

    CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
    CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);

    m_clipTable = &(m_clipTableBase[rangeExt]);

    for (int i = 0; i < rangeExt; i++)
        m_clipTableBase[i] = 0;

    for (int i = 0; i < maxY; i++)
        m_clipTable[i] = (pixel)i;

    for (int i = maxY; i < maxY + rangeExt; i++)
        m_clipTable[i] = maxY;

    return true;

fail:
    return false;
}

void SAO::destroy()
{
    X265_FREE(m_clipTableBase);

    X265_FREE(m_tmpL1);
    X265_FREE(m_tmpL2);

    for (int i = 0; i < 3; i++)
    {
        if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
        if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
    }

    X265_FREE(m_count);
    X265_FREE(m_offset);
    X265_FREE(m_offsetOrg);
    X265_FREE(m_countPreDblk);
    X265_FREE(m_offsetOrgPreDblk);
}

/* allocate memory for SAO parameters */
void SAO::allocSaoParam(SAOParam* saoParam) const
{
    saoParam->numCuInWidth  = m_numCuInWidth;

    saoParam->ctuParam[0] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
    saoParam->ctuParam[1] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
    saoParam->ctuParam[2] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
}

void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
{
    Slice* slice = frame->m_encData->m_slice;
    int qpCb = qp;
    if (m_param->internalCsp == X265_CSP_I420)
        qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
    else
        qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
    m_lumaLambda = x265_lambda2_tab[qp];
    m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
    m_frame = frame;

    switch (slice->m_sliceType)
    {
    case I_SLICE:
        m_refDepth = 0;
        break;
    case P_SLICE:
        m_refDepth = 1;
        break;
    case B_SLICE:
        m_refDepth = 2 + !IS_REFERENCED(frame);
        break;
    }

    resetStats();

    m_entropyCoder.load(initState);
    m_rdContexts.next.load(initState);
    m_rdContexts.cur.load(initState);

    SAOParam* saoParam = frame->m_encData->m_saoParam;
    if (!saoParam)
    {
        saoParam = new SAOParam;
        allocSaoParam(saoParam);
        frame->m_encData->m_saoParam = saoParam;
    }

    saoParam->bSaoFlag[0] = true;
    saoParam->bSaoFlag[1] = (m_numPlanes > 1);

    m_numNoSao[0] = 0; // Luma
    m_numNoSao[1] = 0; // Chroma

    // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
    if (m_param->frameNumThreads == 1)
    {
        if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE)
            saoParam->bSaoFlag[0] = false;
        if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
            saoParam->bSaoFlag[1] = false;
    }
}

// CTU-based SAO process without slice granularity
void SAO::processSaoCu(int addr, int typeIdx, int plane)
{
    int x, y;
    PicYuv* reconPic = m_frame->m_reconPic;
    pixel* rec = reconPic->getPlaneAddr(plane, addr);
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
    uint32_t picWidth  = m_param->sourceWidth;
    uint32_t picHeight = m_param->sourceHeight;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    int ctuWidth = g_maxCUSize;
    int ctuHeight = g_maxCUSize;
    uint32_t lpelx = cu->m_cuPelX;
    uint32_t tpely = cu->m_cuPelY;
    if (plane)
    {
        picWidth  >>= m_hChromaShift;
        picHeight >>= m_vChromaShift;
        ctuWidth  >>= m_hChromaShift;
        ctuHeight >>= m_vChromaShift;
        lpelx     >>= m_hChromaShift;
        tpely     >>= m_vChromaShift;
    }
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
    ctuWidth  = rpelx - lpelx;
    ctuHeight = bpely - tpely;

    int startX;
    int startY;
    int endX;
    int endY;
    pixel* tmpL;
    pixel* tmpU;

    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
    int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;

    memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */

    {
        const pixel* recR = &rec[ctuWidth - 1];
        for (int i = 0; i < ctuHeight + 1; i++)
        {
            m_tmpL2[i] = *recR;
            recR += stride;
        }

        tmpL = m_tmpL1;
        tmpU = &(m_tmpU1[plane][lpelx]);
    }

    switch (typeIdx)
    {
    case SAO_EO_0: // dir: -
    {
        pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
        startX = !lpelx;
        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
        if (ctuWidth & 15)
        {
            for (y = 0; y < ctuHeight; y++)
            {
                int signLeft = signOf(rec[startX] - tmpL[y]);
                for (x = startX; x < endX; x++)
                {
                    int signRight = signOf(rec[x] - rec[x + 1]);
                    int edgeType = signRight + signLeft + 2;
                    signLeft = -signRight;

                    rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
                }

                rec += stride;
            }
        }
        else
        {
            for (y = 0; y < ctuHeight; y += 2)
            {
                signLeft1[0] = signOf(rec[startX] - tmpL[y]);
                signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);

                if (!lpelx)
                {
                    firstPxl = rec[0];
                    row1FirstPxl = rec[stride];
                }

                if (rpelx == picWidth)
                {
                    lastPxl = rec[ctuWidth - 1];
                    row1LastPxl = rec[stride + ctuWidth - 1];
                }

                primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft1, stride);

                if (!lpelx)
                {
                    rec[0] = firstPxl;
                    rec[stride] = row1FirstPxl;
                }

                if (rpelx == picWidth)
                {
                    rec[ctuWidth - 1] = lastPxl;
                    rec[stride + ctuWidth - 1] = row1LastPxl;
                }

                rec += 2 * stride;
            }
        }
        break;
    }
    case SAO_EO_1: // dir: |
    {
        startY = !tpely;
        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
        if (!tpely)
            rec += stride;

        if (ctuWidth & 15)
        {
            for (x = 0; x < ctuWidth; x++)
                upBuff1[x] = signOf(rec[x] - tmpU[x]);

            for (y = startY; y < endY; y++)
            {
                for (x = 0; x < ctuWidth; x++)
                {
                    int8_t signDown = signOf(rec[x] - rec[x + stride]);
                    int edgeType = signDown + upBuff1[x] + 2;
                    upBuff1[x] = -signDown;

                    rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
                }

                rec += stride;
            }
        }
        else
        {
            primitives.sign(upBuff1, rec, tmpU, ctuWidth);

            int diff = (endY - startY) % 2;
            for (y = startY; y < endY - diff; y += 2)
            {
                primitives.saoCuOrgE1_2Rows(rec, upBuff1, m_offsetEo, stride, ctuWidth);
                rec += 2 * stride;
            }
            if (diff & 1)
                primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth);
        }

        break;
    }
    case SAO_EO_2: // dir: 135
    {
        startX = !lpelx;
        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;

        startY = !tpely;
        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;

        if (!tpely)
            rec += stride;

        if (!(ctuWidth & 15))
        {
            int8_t firstSign, lastSign;

            if (!lpelx)
                firstSign = upBuff1[0];

            if (rpelx == picWidth)
                lastSign = upBuff1[ctuWidth - 1];

            primitives.sign(upBuff1, rec, &tmpU[- 1], ctuWidth);

            if (!lpelx)
                upBuff1[0] = firstSign;

            if (rpelx == picWidth)
                upBuff1[ctuWidth - 1] = lastSign;
        }
        else
        {
            for (x = startX; x < endX; x++)
                upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
        }

        if (ctuWidth & 15)
        {
             for (y = startY; y < endY; y++)
             {
                 upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
                 for (x = startX; x < endX; x++)
                 {
                     int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
                     int edgeType = signDown + upBuff1[x] + 2;
                     upBufft[x + 1] = -signDown;
                     rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
                 }

                 std::swap(upBuff1, upBufft);

                 rec += stride;
             }
        }
        else
        {
            for (y = startY; y < endY; y++)
            {
                int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);

                primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, m_offsetEo, endX - startX, stride);

                upBufft[startX] = iSignDown2;

                std::swap(upBuff1, upBufft);
                rec += stride;
            }
        }
        break;
    }
    case SAO_EO_3: // dir: 45
    {
        startX = !lpelx;
        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;

        startY = !tpely;
        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;

        if (!tpely)
            rec += stride;

        if (ctuWidth & 15)
        {
            for (x = startX - 1; x < endX; x++)
                upBuff1[x] = signOf(rec[x] - tmpU[x + 1]);

            for (y = startY; y < endY; y++)
            {
                x = startX;
                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
                int edgeType = signDown + upBuff1[x] + 2;
                upBuff1[x - 1] = -signDown;
                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];

                for (x = startX + 1; x < endX; x++)
                {
                    signDown = signOf(rec[x] - rec[x + stride - 1]);
                    edgeType = signDown + upBuff1[x] + 2;
                    upBuff1[x - 1] = -signDown;
                    rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
                }

                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);

                rec += stride;
            }
        }
        else
        {
            int8_t firstSign, lastSign;

            if (lpelx)
                firstSign = signOf(rec[-1] - tmpU[0]);
            if (rpelx == picWidth)
                lastSign = upBuff1[ctuWidth - 1];

            primitives.sign(upBuff1, rec, &tmpU[1], ctuWidth);

            if (lpelx)
                upBuff1[-1] = firstSign;
            if (rpelx == picWidth)
                upBuff1[ctuWidth - 1] = lastSign;

            for (y = startY; y < endY; y++)
            {
                x = startX;
                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
                int edgeType = signDown + upBuff1[x] + 2;
                upBuff1[x - 1] = -signDown;
                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];

                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, m_offsetEo, stride - 1, startX, endX);

                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);

                rec += stride;
            }
        }

        break;
    }
    case SAO_BO:
    {
        const int8_t* offsetBo = m_offsetBo;

        if (ctuWidth & 15)
        {
            #define SAO_BO_BITS 5
            const int boShift = X265_DEPTH - SAO_BO_BITS;
            for (y = 0; y < ctuHeight; y++)
            {
                for (x = 0; x < ctuWidth; x++)
                {
                     int val = rec[x] + offsetBo[rec[x] >> boShift];
                     if (val < 0)
                         val = 0;
                     else if (val > ((1 << X265_DEPTH) - 1))
                         val = ((1 << X265_DEPTH) - 1);
                     rec[x] = (pixel)val;
                }
                rec += stride;
            }
        }
        else
        {
            primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
        }
        break;
    }
    default: break;
    }

//   if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1)
    std::swap(m_tmpL1, m_tmpL2);
}

/* Process SAO all units */
void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
{
    PicYuv* reconPic = m_frame->m_reconPic;
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
    uint32_t picWidth  = m_param->sourceWidth;
    int ctuWidth  = g_maxCUSize;
    int ctuHeight = g_maxCUSize;
    if (plane)
    {
        picWidth  >>= m_hChromaShift;
        ctuWidth  >>= m_hChromaShift;
        ctuHeight >>= m_vChromaShift;
    }

    if (!idxY)
    {
        pixel* rec = reconPic->m_picOrg[plane];
        memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
    }

    int addr = idxY * m_numCuInWidth;
    pixel* rec = plane ? reconPic->getChromaAddr(plane, addr) : reconPic->getLumaAddr(addr);

    for (int i = 0; i < ctuHeight + 1; i++)
    {
        m_tmpL1[i] = rec[0];
        rec += stride;
    }

    rec -= (stride << 1);

    memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);

    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
    {
        addr = idxY * m_numCuInWidth + idxX;

        bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT;
        int typeIdx = ctuParam[addr].typeIdx;

        if (typeIdx >= 0)
        {
            if (!mergeLeftFlag)
            {
                if (typeIdx == SAO_BO)
                {
                    memset(m_offsetBo, 0, sizeof(m_offsetBo));

                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
                        m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
                }
                else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
                {
                    int offset[NUM_EDGETYPE];
                    offset[0] = 0;
                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
                        offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;

                    for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
                        m_offsetEo[edgeType] = (int8_t)offset[s_eoTable[edgeType]];
                }
            }
            processSaoCu(addr, typeIdx, plane);
        }
        else if (idxX != (m_numCuInWidth - 1))
        {
            rec = plane ? reconPic->getChromaAddr(plane, addr) : reconPic->getLumaAddr(addr);

            for (int i = 0; i < ctuHeight + 1; i++)
            {
                m_tmpL1[i] = rec[ctuWidth - 1];
                rec += stride;
            }
        }
    }

    std::swap(m_tmpU1[plane], m_tmpU2[plane]);
}

void SAO::resetSaoUnit(SaoCtuParam* saoUnit)
{
    saoUnit->mergeMode  = SAO_MERGE_NONE;
    saoUnit->typeIdx    = -1;
    saoUnit->bandPos    = 0;

    for (int i = 0; i < SAO_NUM_OFFSET; i++)
        saoUnit->offset[i] = 0;
}

void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc)
{
    saoUnitDst->mergeMode   = saoUnitSrc->mergeMode;
    saoUnitDst->typeIdx     = saoUnitSrc->typeIdx;
    saoUnitDst->bandPos     = saoUnitSrc->bandPos;

    for (int i = 0; i < SAO_NUM_OFFSET; i++)
        saoUnitDst->offset[i] = saoUnitSrc->offset[i];
}

/* Calculate SAO statistics for current CTU without non-crossing slice */
void SAO::calcSaoStatsCu(int addr, int plane)
{
    const PicYuv* reconPic = m_frame->m_reconPic;
    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
    const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
    const pixel* rec0  = reconPic->getPlaneAddr(plane, addr);
    const pixel* fenc;
    const pixel* rec;
    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
    uint32_t picWidth  = m_param->sourceWidth;
    uint32_t picHeight = m_param->sourceHeight;
    int ctuWidth  = g_maxCUSize;
    int ctuHeight = g_maxCUSize;
    uint32_t lpelx = cu->m_cuPelX;
    uint32_t tpely = cu->m_cuPelY;
    if (plane)
    {
        picWidth  >>= m_hChromaShift;
        picHeight >>= m_vChromaShift;
        ctuWidth  >>= m_hChromaShift;
        ctuHeight >>= m_vChromaShift;
        lpelx     >>= m_hChromaShift;
        tpely     >>= m_vChromaShift;
    }
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
    ctuWidth  = rpelx - lpelx;
    ctuHeight = bpely - tpely;

    int startX;
    int startY;
    int endX;
    int endY;

    int skipB = plane ? 2 : 4;
    int skipR = plane ? 3 : 5;

    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
    int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;

    // SAO_BO:
    {
        if (m_param->bSaoNonDeblocked)
        {
            skipB = plane ? 1 : 3;
            skipR = plane ? 2 : 4;
        }

        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;

        primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
    }

    {
        // SAO_EO_0: // dir: -
        {
            if (m_param->bSaoNonDeblocked)
            {
                skipB = plane ? 1 : 3;
                skipR = plane ? 3 : 5;
            }

            startX = !lpelx;
            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;

            primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
        }

        // SAO_EO_1: // dir: |
        {
            if (m_param->bSaoNonDeblocked)
            {
                skipB = plane ? 2 : 4;
                skipR = plane ? 2 : 4;
            }

            fenc = fenc0;
            rec  = rec0;

            startY = !tpely;
            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            if (!tpely)
            {
                fenc += stride;
                rec += stride;
            }

            primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);

            primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
        }

        // SAO_EO_2: // dir: 135
        {
            if (m_param->bSaoNonDeblocked)
            {
                skipB = plane ? 2 : 4;
                skipR = plane ? 3 : 5;
            }

            fenc = fenc0;
            rec  = rec0;

            startX = !lpelx;
            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;

            startY = !tpely;
            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            if (!tpely)
            {
                fenc += stride;
                rec += stride;
            }

            primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));

            primitives.saoCuStatsE2(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, upBufft + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
        }

        // SAO_EO_3: // dir: 45
        {
            if (m_param->bSaoNonDeblocked)
            {
                skipB = plane ? 2 : 4;
                skipR = plane ? 3 : 5;
            }

            fenc = fenc0;
            rec  = rec0;

            startX = !lpelx;
            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;

            startY = !tpely;
            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;

            if (!tpely)
            {
                fenc += stride;
                rec += stride;
            }

            primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));

            primitives.saoCuStatsE3(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
        }
    }
}

void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY)
{
    int addr = idxX + m_numCuInWidth * idxY;

    int x, y;
    const CUData* cu = frame->m_encData->getPicCTU(addr);
    const PicYuv* reconPic = m_frame->m_reconPic;
    const pixel* fenc;
    const pixel* rec;
    intptr_t stride = reconPic->m_stride;
    uint32_t picWidth  = m_param->sourceWidth;
    uint32_t picHeight = m_param->sourceHeight;
    int ctuWidth  = g_maxCUSize;
    int ctuHeight = g_maxCUSize;
    uint32_t lpelx = cu->m_cuPelX;
    uint32_t tpely = cu->m_cuPelY;
    uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
    uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
    ctuWidth  = rpelx - lpelx;
    ctuHeight = bpely - tpely;

    int startX;
    int startY;
    int endX;
    int endY;
    int firstX, firstY;
    int32_t* stats;
    int32_t* count;

    int skipB, skipR;

    int32_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
    int32_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;

    const int boShift = X265_DEPTH - SAO_BO_BITS;

    memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
    memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));

    for (int plane = 0; plane < NUM_PLANE; plane++)
    {
        if (plane == 1)
        {
            stride = reconPic->m_strideC;
            picWidth  >>= m_hChromaShift;
            picHeight >>= m_vChromaShift;
            ctuWidth  >>= m_hChromaShift;
            ctuHeight >>= m_vChromaShift;
            lpelx     >>= m_hChromaShift;
            tpely     >>= m_vChromaShift;
            rpelx     >>= m_hChromaShift;
            bpely     >>= m_vChromaShift;
        }

        // SAO_BO:

        skipB = plane ? 1 : 3;
        skipR = plane ? 2 : 4;

        stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
        count = m_countPreDblk[addr][plane][SAO_BO];

        const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
        const pixel* rec0 = reconPic->getPlaneAddr(plane, addr);
        fenc = fenc0;
        rec  = rec0;

        startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
        startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;

        for (y = 0; y < ctuHeight; y++)
        {
            for (x = (y < startY ? startX : 0); x < ctuWidth; x++)
            {
                int classIdx = 1 + (rec[x] >> boShift);
                stats[classIdx] += (fenc[x] - rec[x]);
                count[classIdx]++;
            }

            fenc += stride;
            rec += stride;
        }

        // SAO_EO_0: // dir: -
        {
            skipB = plane ? 1 : 3;
            skipR = plane ? 3 : 5;

            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
            count = m_countPreDblk[addr][plane][SAO_EO_0];

            fenc = fenc0;
            rec  = rec0;

            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
            startY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
            firstX = !lpelx;
            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
            endX   = ctuWidth - 1;  // not refer right CTU

            for (y = 0; y < ctuHeight; y++)
            {
                x = (y < startY ? startX : firstX);
                int signLeft = signOf(rec[x] - rec[x - 1]);
                for (; x < endX; x++)
                {
                    int signRight = signOf(rec[x] - rec[x + 1]);
                    int edgeType = signRight + signLeft + 2;
                    signLeft = -signRight;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_1: // dir: |
        {
            skipB = plane ? 2 : 4;
            skipR = plane ? 2 : 4;

            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
            count = m_countPreDblk[addr][plane][SAO_EO_1];

            fenc = fenc0;
            rec  = rec0;

            startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            firstY = !tpely;
            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
            endY   = ctuHeight - 1; // not refer below CTU
            if (!tpely)
            {
                fenc += stride;
                rec += stride;
            }

            for (x = startX; x < ctuWidth; x++)
                upBuff1[x] = signOf(rec[x] - rec[x - stride]);

            for (y = firstY; y < endY; y++)
            {
                for (x = (y < startY - 1 ? startX : 0); x < ctuWidth; x++)
                {
                    int signDown = signOf(rec[x] - rec[x + stride]);
                    int edgeType = signDown + upBuff1[x] + 2;
                    upBuff1[x] = -signDown;

                    if (x < startX && y < startY)
                        continue;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                fenc += stride;
                rec += stride;
            }
        }

        // SAO_EO_2: // dir: 135
        {
            skipB = plane ? 2 : 4;
            skipR = plane ? 3 : 5;

            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
            count = m_countPreDblk[addr][plane][SAO_EO_2];

            fenc = fenc0;
            rec  = rec0;

            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            firstX = !lpelx;
            firstY = !tpely;
            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
            endX   = ctuWidth - 1;  // not refer right CTU
            endY   = ctuHeight - 1; // not refer below CTU
            if (!tpely)
            {
                fenc += stride;
                rec += stride;
            }

            for (x = startX; x < endX; x++)
                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);

            for (y = firstY; y < endY; y++)
            {
                x = (y < startY - 1 ? startX : firstX);
                upBufft[x] = signOf(rec[x + stride] - rec[x - 1]);
                for (; x < endX; x++)
                {
                    int signDown = signOf(rec[x] - rec[x + stride + 1]);
                    int edgeType = signDown + upBuff1[x] + 2;
                    upBufft[x + 1] = -signDown;

                    if (x < startX && y < startY)
                        continue;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                std::swap(upBuff1, upBufft);

                rec += stride;
                fenc += stride;
            }
        }

        // SAO_EO_3: // dir: 45
        {
            skipB = plane ? 2 : 4;
            skipR = plane ? 3 : 5;

            stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
            count = m_countPreDblk[addr][plane][SAO_EO_3];

            fenc = fenc0;
            rec  = rec0;

            startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
            startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
            firstX = !lpelx;
            firstY = !tpely;
            // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
            // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
            endX   = ctuWidth - 1;  // not refer right CTU
            endY   = ctuHeight - 1; // not refer below CTU
            if (!tpely)
            {
                fenc += stride;
                rec += stride;
            }

            for (x = startX - 1; x < endX; x++)
                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);

            for (y = firstY; y < endY; y++)
            {
                for (x = (y < startY - 1 ? startX : firstX); x < endX; x++)
                {
                    int signDown = signOf(rec[x] - rec[x + stride - 1]);
                    int edgeType = signDown + upBuff1[x] + 2;
                    upBuff1[x - 1] = -signDown;

                    if (x < startX && y < startY)
                        continue;

                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
                    count[s_eoTable[edgeType]]++;
                }

                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);

                rec += stride;
                fenc += stride;
            }
        }
    }
}

/* reset offset statistics */
void SAO::resetStats()
{
    memset(m_count, 0, sizeof(PerClass) * NUM_PLANE);
    memset(m_offset, 0, sizeof(PerClass) * NUM_PLANE);
    memset(m_offsetOrg, 0, sizeof(PerClass) * NUM_PLANE);
}

void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
{
    if (!saoParam->bSaoFlag[0])
        m_depthSaoRate[0][m_refDepth] = 1.0;
    else
        m_depthSaoRate[0][m_refDepth] = m_numNoSao[0] / ((double)numctus);

    if (!saoParam->bSaoFlag[1])
        m_depthSaoRate[1][m_refDepth] = 1.0;
    else
        m_depthSaoRate[1][m_refDepth] = m_numNoSao[1] / ((double)numctus);
}

void SAO::rdoSaoUnitRow(SAOParam* saoParam, int idxY)
{
    SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2];
    double mergeDist[NUM_MERGE_MODE];
    bool allowMerge[2]; // left, up
    allowMerge[1] = (idxY > 0);

    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
    {
        int addr     = idxX + idxY * m_numCuInWidth;
        int addrUp   = idxY ? addr - m_numCuInWidth : -1;
        int addrLeft = idxX ? addr - 1 : -1;
        allowMerge[0] = (idxX > 0);

        m_entropyCoder.load(m_rdContexts.cur);
        if (allowMerge[0])
            m_entropyCoder.codeSaoMerge(0);
        if (allowMerge[1])
            m_entropyCoder.codeSaoMerge(0);
        m_entropyCoder.store(m_rdContexts.temp);
        // reset stats Y, Cb, Cr
        for (int plane = 0; plane < m_numPlanes; plane++)
        {
            for (int j = 0; j < MAX_NUM_SAO_TYPE; j++)
            {
                for (int k = 0; k < MAX_NUM_SAO_CLASS; k++)
                {
                    m_offset[plane][j][k] = 0;
                    if (m_param->bSaoNonDeblocked)
                    {
                        m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k];
                        m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k];
                    }
                    else
                    {
                        m_count[plane][j][k] = 0;
                        m_offsetOrg[plane][j][k] = 0;
                    }
                }
            }

            saoParam->ctuParam[plane][addr].mergeMode = SAO_MERGE_NONE;
            saoParam->ctuParam[plane][addr].typeIdx   = -1;
            saoParam->ctuParam[plane][addr].bandPos   = 0;
            if (saoParam->bSaoFlag[plane > 0])
                calcSaoStatsCu(addr, plane);
        }

        saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);

        if (m_numPlanes > 1) {
            sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
        }

        if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
        {
            // Cost of new SAO_params
            m_entropyCoder.load(m_rdContexts.cur);
            m_entropyCoder.resetBits();
            if (allowMerge[0])
                m_entropyCoder.codeSaoMerge(0);
            if (allowMerge[1])
                m_entropyCoder.codeSaoMerge(0);
            for (int plane = 0; plane < m_numPlanes; plane++)
            {
                if (saoParam->bSaoFlag[plane > 0])
                    m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
            }

            uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
            double bestCost = mergeDist[0] + (double)rate;
            m_entropyCoder.store(m_rdContexts.temp);

            // Cost of Merge
            for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
            {
                if (!allowMerge[mergeIdx])
                    continue;

                m_entropyCoder.load(m_rdContexts.cur);
                m_entropyCoder.resetBits();
                if (allowMerge[0])
                    m_entropyCoder.codeSaoMerge(1 - mergeIdx);
                if (allowMerge[1] && (mergeIdx == 1))
                    m_entropyCoder.codeSaoMerge(1);

                rate = m_entropyCoder.getNumberOfWrittenBits();
                double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
                if (mergeCost < bestCost)
                {
                    SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
                    bestCost = mergeCost;
                    m_entropyCoder.store(m_rdContexts.temp);
                    for (int plane = 0; plane < m_numPlanes; plane++)
                    {
                        mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode;
                        if (saoParam->bSaoFlag[plane > 0])
                            copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]);
                    }
                }
            }

            if (saoParam->ctuParam[0][addr].typeIdx < 0)
                m_numNoSao[0]++;
            if (saoParam->ctuParam[1][addr].typeIdx < 0)
                m_numNoSao[1]++;
            m_entropyCoder.load(m_rdContexts.temp);
            m_entropyCoder.store(m_rdContexts.cur);
        }
    }
}

/** rate distortion optimization of SAO unit */
inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
{
    int64_t estDist = 0;

    for (int classIdx = 1; classIdx < ((typeIdx < SAO_BO) ?  SAO_EO_LEN + 1 : SAO_NUM_BO_CLASSES + 1); classIdx++)
    {
        int32_t  count = m_count[plane][typeIdx][classIdx];
        int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
        int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];

        if (typeIdx == SAO_BO)
        {
            currentDistortionTableBo[classIdx - 1] = 0;
            currentRdCostTableBo[classIdx - 1] = lambda;
        }
        if (count)
        {
            int offset = roundIBDI(offsetOrg << (X265_DEPTH - 8), count);
            offset = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset);
            if (typeIdx < SAO_BO)
            {
                if (classIdx < 3)
                    offset = X265_MAX(offset, 0);
                else
                    offset = X265_MIN(offset, 0);
            }
            offsetOut = estIterOffset(typeIdx, classIdx, lambda, offset, count, offsetOrg, currentDistortionTableBo, currentRdCostTableBo);
        }
        else
        {
            offsetOrg = 0;
            offsetOut = 0;
        }
        if (typeIdx != SAO_BO)
            estDist += estSaoDist(count, (int)offsetOut << SAO_BIT_INC, offsetOrg);
    }

    return estDist;
}

inline int SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
{
    int offsetOut = 0;

    // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. entropy coder can be used to measure the exact rate here.
    double tempMinCost = lambda;
    while (offset != 0)
    {
        // Calculate the bits required for signalling the offset
        int tempRate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
        if (abs(offset) == OFFSET_THRESH - 1)
            tempRate--;

        // Do the dequntization before distorion calculation
        int tempOffset = offset << SAO_BIT_INC;
        int64_t tempDist  = estSaoDist(count, tempOffset, offsetOrg);
        double tempCost   = ((double)tempDist + lambda * (double)tempRate);
        if (tempCost < tempMinCost)
        {
            tempMinCost = tempCost;
            offsetOut = offset;
            if (typeIdx == SAO_BO)
            {
                currentDistortionTableBo[classIdx - 1] = (int)tempDist;
                currentRdCostTableBo[classIdx - 1] = tempCost;
            }
        }
        offset = (offset > 0) ? (offset - 1) : (offset + 1);
    }

    return offsetOut;
}

void SAO::saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam* mergeSaoParam, double* mergeDist)
{
    int64_t bestDist = 0;

    SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];

    double bestRDCostTableBo = MAX_DOUBLE;
    int    bestClassTableBo  = 0;
    int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
    double currentRdCostTableBo[MAX_NUM_SAO_CLASS];

    resetSaoUnit(lclCtuParam);
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
    double dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;

    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
    {
        int64_t estDist = estSaoTypeDist(0, typeIdx, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo);

        if (typeIdx == SAO_BO)
        {
            // Estimate Best Position
            for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
            {
                double currentRDCost = 0.0;
                for (int j = i; j < i + SAO_BO_LEN; j++)
                    currentRDCost += currentRdCostTableBo[j];

                if (currentRDCost < bestRDCostTableBo)
                {
                    bestRDCostTableBo = currentRDCost;
                    bestClassTableBo  = i;
                }
            }

            // Re code all Offsets
            // Code Center
            estDist = 0;
            for (int classIdx = bestClassTableBo; classIdx < bestClassTableBo + SAO_BO_LEN; classIdx++)
                estDist += currentDistortionTableBo[classIdx];
        }
        SaoCtuParam  ctuParamRdo;
        ctuParamRdo.mergeMode = SAO_MERGE_NONE;
        ctuParamRdo.typeIdx = typeIdx;
        ctuParamRdo.bandPos = (typeIdx == SAO_BO) ? bestClassTableBo : 0;
        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
            ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.bandPos + 1];

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();
        m_entropyCoder.codeSaoOffset(ctuParamRdo, 0);

        uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
        double cost = (double)estDist + m_lumaLambda * (double)estRate;

        if (cost < dCostPartBest)
        {
            dCostPartBest = cost;
            copySaoUnit(lclCtuParam, &ctuParamRdo);
            bestDist = estDist;
        }
    }

    mergeDist[0] = ((double)bestDist / m_lumaLambda);
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
    m_entropyCoder.store(m_rdContexts.temp);

    // merge left or merge up

    for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
    {
        SaoCtuParam* mergeSrcParam = NULL;
        if (addrLeft >= 0 && mergeIdx == 0)
            mergeSrcParam = &(saoParam->ctuParam[0][addrLeft]);
        else if (addrUp >= 0 && mergeIdx == 1)
            mergeSrcParam = &(saoParam->ctuParam[0][addrUp]);
        if (mergeSrcParam)
        {
            int64_t estDist = 0;
            int typeIdx = mergeSrcParam->typeIdx;
            if (typeIdx >= 0)
            {
                int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;
                for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                {
                    int mergeOffset = mergeSrcParam->offset[classIdx];
                    estDist += estSaoDist(m_count[0][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + bandPos + 1]);
                }
            }

            copySaoUnit(&mergeSaoParam[mergeIdx], mergeSrcParam);
            mergeSaoParam[mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;

            mergeDist[mergeIdx + 1] = ((double)estDist / m_lumaLambda);
        }
        else
            resetSaoUnit(&mergeSaoParam[mergeIdx]);
    }
}

void SAO::sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist)
{
    int64_t bestDist = 0;

    SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };

    double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
    int    bestClassTableBo[2] = { 0, 0 };
    int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];

    resetSaoUnit(lclCtuParam[0]);
    resetSaoUnit(lclCtuParam[1]);
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.resetBits();
    m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
    m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);

    double costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda;

    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
    {
        int64_t estDist[2];
        if (typeIdx == SAO_BO)
        {
            // Estimate Best Position
            for (int compIdx = 0; compIdx < 2; compIdx++)
            {
                double bestRDCostTableBo = MAX_DOUBLE;
                estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
                for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
                {
                    double currentRDCost = 0.0;
                    for (int j = i; j < i + SAO_BO_LEN; j++)
                        currentRDCost += currentRdCostTableBo[j];

                    if (currentRDCost < bestRDCostTableBo)
                    {
                        bestRDCostTableBo = currentRDCost;
                        bestClassTableBo[compIdx]  = i;
                    }
                }

                // Re code all Offsets
                // Code Center
                estDist[compIdx] = 0;
                for (int classIdx = bestClassTableBo[compIdx]; classIdx < bestClassTableBo[compIdx] + SAO_BO_LEN; classIdx++)
                    estDist[compIdx] += currentDistortionTableBo[classIdx];
            }
        }
        else
        {
            estDist[0] = estSaoTypeDist(1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
            estDist[1] = estSaoTypeDist(2, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
        }

        m_entropyCoder.load(m_rdContexts.temp);
        m_entropyCoder.resetBits();

        SaoCtuParam  ctuParamRdo[2];
        for (int compIdx = 0; compIdx < 2; compIdx++)
        {
            ctuParamRdo[compIdx].mergeMode = SAO_MERGE_NONE;
            ctuParamRdo[compIdx].typeIdx = typeIdx;
            ctuParamRdo[compIdx].bandPos = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0;
            for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].bandPos + 1];

            m_entropyCoder.codeSaoOffset(ctuParamRdo[compIdx], compIdx + 1);
        }

        uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
        double cost = (double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate;

        if (cost < costPartBest)
        {
            costPartBest = cost;
            copySaoUnit(lclCtuParam[0], &ctuParamRdo[0]);
            copySaoUnit(lclCtuParam[1], &ctuParamRdo[1]);
            bestDist = (estDist[0] + estDist[1]);
        }
    }

    mergeDist[0] += ((double)bestDist / m_chromaLambda);
    m_entropyCoder.load(m_rdContexts.temp);
    m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
    m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
    m_entropyCoder.store(m_rdContexts.temp);

    // merge left or merge up

    for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
    {
        for (int compIdx = 0; compIdx < 2; compIdx++)
        {
            int plane = compIdx + 1;
            SaoCtuParam* mergeSrcParam = NULL;
            if (addrLeft >= 0 && mergeIdx == 0)
                mergeSrcParam = &(saoParam->ctuParam[plane][addrLeft]);
            else if (addrUp >= 0 && mergeIdx == 1)
                mergeSrcParam = &(saoParam->ctuParam[plane][addrUp]);
            if (mergeSrcParam)
            {
                int64_t estDist = 0;
                int typeIdx = mergeSrcParam->typeIdx;
                if (typeIdx >= 0)
                {
                    int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;
                    for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
                    {
                        int mergeOffset = mergeSrcParam->offset[classIdx];
                        estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos + 1]);
                    }
                }

                copySaoUnit(&mergeSaoParam[plane][mergeIdx], mergeSrcParam);
                mergeSaoParam[plane][mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
                mergeDist[mergeIdx + 1] += ((double)estDist / m_chromaLambda);
            }
            else
                resetSaoUnit(&mergeSaoParam[plane][mergeIdx]);
        }
    }
}

// NOTE: must put in namespace X265_NS since we need class SAO
void saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
{
    int x, y;
    const int boShift = X265_DEPTH - SAO_BO_BITS;

    for (y = 0; y < endY; y++)
    {
        for (x = 0; x < endX; x++)
        {
            int classIdx = 1 + (rec[x] >> boShift);
            stats[classIdx] += (fenc[x] - rec[x]);
            count[classIdx]++;
        }

        fenc += stride;
        rec += stride;
    }
}

void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
{
    int x, y;
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
    int32_t tmp_count[SAO::NUM_EDGETYPE];

    memset(tmp_stats, 0, sizeof(tmp_stats));
    memset(tmp_count, 0, sizeof(tmp_count));

    for (y = 0; y < endY; y++)
    {
        int signLeft = signOf(rec[0] - rec[-1]);
        for (x = 0; x < endX; x++)
        {
            int signRight = signOf2(rec[x], rec[x + 1]);
            X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), "signDown check failure\n");
            uint32_t edgeType = signRight + signLeft + 2;
            signLeft = -signRight;

            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
            tmp_stats[edgeType] += (fenc[x] - rec[x]);
            tmp_count[edgeType]++;
        }

        fenc += stride;
        rec += stride;
    }

    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
        count[SAO::s_eoTable[x]] += tmp_count[x];
    }
}

void saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
{
    X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
    X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");

    int x, y;
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
    int32_t tmp_count[SAO::NUM_EDGETYPE];

    memset(tmp_stats, 0, sizeof(tmp_stats));
    memset(tmp_count, 0, sizeof(tmp_count));

    for (y = 0; y < endY; y++)
    {
        for (x = 0; x < endX; x++)
        {
            int signDown = signOf2(rec[x], rec[x + stride]);
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), "signDown check failure\n");
            uint32_t edgeType = signDown + upBuff1[x] + 2;
            upBuff1[x] = (int8_t)(-signDown);

            tmp_stats[edgeType] += (fenc[x] - rec[x]);
            tmp_count[edgeType]++;
        }
        fenc += stride;
        rec += stride;
    }

    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
        count[SAO::s_eoTable[x]] += tmp_count[x];
    }
}

void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
{
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");

    int x, y;
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
    int32_t tmp_count[SAO::NUM_EDGETYPE];

    memset(tmp_stats, 0, sizeof(tmp_stats));
    memset(tmp_count, 0, sizeof(tmp_count));

    for (y = 0; y < endY; y++)
    {
        upBufft[0] = signOf(rec[stride] - rec[-1]);
        for (x = 0; x < endX; x++)
        {
            int signDown = signOf2(rec[x], rec[x + stride + 1]);
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
            uint32_t edgeType = signDown + upBuff1[x] + 2;
            upBufft[x + 1] = (int8_t)(-signDown);
            tmp_stats[edgeType] += (fenc[x] - rec[x]);
            tmp_count[edgeType]++;
        }

        std::swap(upBuff1, upBufft);

        rec += stride;
        fenc += stride;
    }

    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
        count[SAO::s_eoTable[x]] += tmp_count[x];
    }
}

void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
{
    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");

    int x, y;
    int32_t tmp_stats[SAO::NUM_EDGETYPE];
    int32_t tmp_count[SAO::NUM_EDGETYPE];

    memset(tmp_stats, 0, sizeof(tmp_stats));
    memset(tmp_count, 0, sizeof(tmp_count));

    for (y = 0; y < endY; y++)
    {
        for (x = 0; x < endX; x++)
        {
            int signDown = signOf2(rec[x], rec[x + stride - 1]);
            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
            X265_CHECK(abs(upBuff1[x]) <= 1, "upBuffer1 check failure\n");

            uint32_t edgeType = signDown + upBuff1[x] + 2;
            upBuff1[x - 1] = (int8_t)(-signDown);
            tmp_stats[edgeType] += (fenc[x] - rec[x]);
            tmp_count[edgeType]++;
        }

        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);

        rec += stride;
        fenc += stride;
    }

    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
        stats[SAO::s_eoTable[x]] += tmp_stats[x];
        count[SAO::s_eoTable[x]] += tmp_count[x];
    }
}

void setupSaoPrimitives_c(EncoderPrimitives &p)
{
    // TODO: move other sao functions to here
    p.saoCuStatsBO = saoCuStatsBO_c;
    p.saoCuStatsE0 = saoCuStatsE0_c;
    p.saoCuStatsE1 = saoCuStatsE1_c;
    p.saoCuStatsE2 = saoCuStatsE2_c;
    p.saoCuStatsE3 = saoCuStatsE3_c;
}
}