Mercurial > hg > forks > libbpg
diff x265/source/encoder/search.h @ 0:772086c29cc7
Initial import.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 16 Nov 2016 11:16:33 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x265/source/encoder/search.h Wed Nov 16 11:16:33 2016 +0200 @@ -0,0 +1,468 @@ +/***************************************************************************** +* Copyright (C) 2013 x265 project +* +* Authors: Steve Borho <steve@borho.org> +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_SEARCH_H +#define X265_SEARCH_H + +#include "common.h" +#include "predict.h" +#include "quant.h" +#include "bitcost.h" +#include "framedata.h" +#include "yuv.h" +#include "threadpool.h" + +#include "rdcost.h" +#include "entropy.h" +#include "motion.h" + +#if DETAILED_CU_STATS +#define ProfileCUScopeNamed(name, cu, acc, count) \ + m_stats[cu.m_encData->m_frameEncoderID].count++; \ + ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc) +#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count) +#define ProfileCounter(cu, count) m_stats[cu.m_encData->m_frameEncoderID].count++; +#else +#define ProfileCUScopeNamed(name, cu, acc, count) +#define ProfileCUScope(cu, acc, count) +#define ProfileCounter(cu, count) +#endif + +namespace X265_NS { +// private namespace + +class Entropy; +struct ThreadLocalData; + +/* All the CABAC contexts that Analysis needs to keep track of at each depth + * and temp buffers for residual, coeff, and recon for use during residual + * quad-tree depth recursion */ +struct RQTData +{ + Entropy cur; /* starting context for current CU */ + + /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32 + * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts + * which are reconstructed at each depth are valid. At the end, the transform depth table + * is walked and the coeff and recon at the final split depths are collected */ + Entropy rqtRoot; /* residual quad-tree start context */ + Entropy rqtTemp; /* residual quad-tree temp context */ + Entropy rqtTest; /* residual quad-tree test context */ + coeff_t* coeffRQT[3]; /* coeff storage for entire CTU for each RQT layer */ + Yuv reconQtYuv; /* recon storage for entire CTU for each RQT layer (intra) */ + ShortYuv resiQtYuv; /* residual storage for entire CTU for each RQT layer (inter) */ + + /* per-depth temp buffers for inter prediction */ + ShortYuv tmpResiYuv; + Yuv tmpPredYuv; + Yuv bidirPredYuv[2]; +}; + +struct MotionData +{ + MV mv; + MV mvp; + int mvpIdx; + int ref; + uint32_t cost; + int bits; +}; + +struct Mode +{ + CUData cu; + const Yuv* fencYuv; + Yuv predYuv; + Yuv reconYuv; + Entropy contexts; + + enum { MAX_INTER_PARTS = 2 }; + + MotionData bestME[MAX_INTER_PARTS][2]; + MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS]; + + // Neighbour MVs of the current partition. 5 spatial candidates and the + // temporal candidate. + InterNeighbourMV interNeighbours[6]; + + uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits) + uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits) + uint32_t sa8dBits; // signal bits used in sa8dCost calculation + uint32_t psyEnergy; // sum of partition psycho-visual energy difference + sse_ret_t resEnergy; // sum of partition residual energy after motion prediction + sse_ret_t lumaDistortion; + sse_ret_t chromaDistortion; + sse_ret_t distortion; // sum of partition SSE distortion + uint32_t totalBits; // sum of partition bits (mv + coeff) + uint32_t mvBits; // Mv bits + Ref + block type (or intra mode) + uint32_t coeffBits; // Texture bits (DCT Coeffs) + + void initCosts() + { + rdCost = 0; + sa8dCost = 0; + sa8dBits = 0; + psyEnergy = 0; + resEnergy = 0; + lumaDistortion = 0; + chromaDistortion = 0; + distortion = 0; + totalBits = 0; + mvBits = 0; + coeffBits = 0; + } + + void invalidate() + { + /* set costs to invalid data, catch uninitialized re-use */ + rdCost = UINT64_MAX / 2; + sa8dCost = UINT64_MAX / 2; + sa8dBits = MAX_UINT / 2; + psyEnergy = MAX_UINT / 2; +#if X265_DEPTH <= 10 + resEnergy = MAX_UINT / 2; + lumaDistortion = MAX_UINT / 2; + chromaDistortion = MAX_UINT / 2; + distortion = MAX_UINT / 2; +#else + resEnergy = UINT64_MAX / 2; + lumaDistortion = UINT64_MAX / 2; + chromaDistortion = UINT64_MAX / 2; + distortion = UINT64_MAX / 2; +#endif + totalBits = MAX_UINT / 2; + mvBits = MAX_UINT / 2; + coeffBits = MAX_UINT / 2; + } + + bool ok() const + { +#if X265_DEPTH <= 10 + return !(rdCost >= UINT64_MAX / 2 || + sa8dCost >= UINT64_MAX / 2 || + sa8dBits >= MAX_UINT / 2 || + psyEnergy >= MAX_UINT / 2 || + resEnergy >= MAX_UINT / 2 || + lumaDistortion >= MAX_UINT / 2 || + chromaDistortion >= MAX_UINT / 2 || + distortion >= MAX_UINT / 2 || + totalBits >= MAX_UINT / 2 || + mvBits >= MAX_UINT / 2 || + coeffBits >= MAX_UINT / 2); +#else + return !(rdCost >= UINT64_MAX / 2 || + sa8dCost >= UINT64_MAX / 2 || + sa8dBits >= MAX_UINT / 2 || + psyEnergy >= MAX_UINT / 2 || + resEnergy >= UINT64_MAX / 2 || + lumaDistortion >= UINT64_MAX / 2 || + chromaDistortion >= UINT64_MAX / 2 || + distortion >= UINT64_MAX / 2 || + totalBits >= MAX_UINT / 2 || + mvBits >= MAX_UINT / 2 || + coeffBits >= MAX_UINT / 2); +#endif + } + + void addSubCosts(const Mode& subMode) + { + X265_CHECK(subMode.ok(), "sub-mode not initialized"); + + rdCost += subMode.rdCost; + sa8dCost += subMode.sa8dCost; + sa8dBits += subMode.sa8dBits; + psyEnergy += subMode.psyEnergy; + resEnergy += subMode.resEnergy; + lumaDistortion += subMode.lumaDistortion; + chromaDistortion += subMode.chromaDistortion; + distortion += subMode.distortion; + totalBits += subMode.totalBits; + mvBits += subMode.mvBits; + coeffBits += subMode.coeffBits; + } +}; + +#if DETAILED_CU_STATS +/* This structure is intended for performance debugging and we make no attempt + * to handle dynamic range overflows. Care should be taken to avoid long encodes + * if you care about the accuracy of these elapsed times and counters. This + * profiling is orthogonal to PPA/VTune and can be enabled independently from + * either of them */ +struct CUStats +{ + int64_t intraRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in intra RDO per CU depth + int64_t interRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in inter RDO per CU depth + int64_t intraAnalysisElapsedTime; // elapsed worker time in intra sa8d analysis + int64_t motionEstimationElapsedTime; // elapsed worker time in predInterSearch() + int64_t loopFilterElapsedTime; // elapsed worker time in deblock and SAO and PSNR/SSIM + int64_t pmeTime; // elapsed worker time processing ME slave jobs + int64_t pmeBlockTime; // elapsed worker time blocked for pme batch completion + int64_t pmodeTime; // elapsed worker time processing pmode slave jobs + int64_t pmodeBlockTime; // elapsed worker time blocked for pmode batch completion + int64_t weightAnalyzeTime; // elapsed worker time analyzing reference weights + int64_t totalCTUTime; // elapsed worker time in compressCTU (includes pmode master) + + uint32_t skippedMotionReferences[NUM_CU_DEPTH]; + uint32_t totalMotionReferences[NUM_CU_DEPTH]; + uint32_t skippedIntraCU[NUM_CU_DEPTH]; + uint32_t totalIntraCU[NUM_CU_DEPTH]; + + uint64_t countIntraRDO[NUM_CU_DEPTH]; + uint64_t countInterRDO[NUM_CU_DEPTH]; + uint64_t countIntraAnalysis; + uint64_t countMotionEstimate; + uint64_t countLoopFilter; + uint64_t countPMETasks; + uint64_t countPMEMasters; + uint64_t countPModeTasks; + uint64_t countPModeMasters; + uint64_t countWeightAnalyze; + uint64_t totalCTUs; + + CUStats() { clear(); } + + void clear() + { + memset(this, 0, sizeof(*this)); + } + + void accumulate(CUStats& other) + { + for (uint32_t i = 0; i <= g_maxCUDepth; i++) + { + intraRDOElapsedTime[i] += other.intraRDOElapsedTime[i]; + interRDOElapsedTime[i] += other.interRDOElapsedTime[i]; + countIntraRDO[i] += other.countIntraRDO[i]; + countInterRDO[i] += other.countInterRDO[i]; + skippedMotionReferences[i] += other.skippedMotionReferences[i]; + totalMotionReferences[i] += other.totalMotionReferences[i]; + skippedIntraCU[i] += other.skippedIntraCU[i]; + totalIntraCU[i] += other.totalIntraCU[i]; + } + + intraAnalysisElapsedTime += other.intraAnalysisElapsedTime; + motionEstimationElapsedTime += other.motionEstimationElapsedTime; + loopFilterElapsedTime += other.loopFilterElapsedTime; + pmeTime += other.pmeTime; + pmeBlockTime += other.pmeBlockTime; + pmodeTime += other.pmodeTime; + pmodeBlockTime += other.pmodeBlockTime; + weightAnalyzeTime += other.weightAnalyzeTime; + totalCTUTime += other.totalCTUTime; + + countIntraAnalysis += other.countIntraAnalysis; + countMotionEstimate += other.countMotionEstimate; + countLoopFilter += other.countLoopFilter; + countPMETasks += other.countPMETasks; + countPMEMasters += other.countPMEMasters; + countPModeTasks += other.countPModeTasks; + countPModeMasters += other.countPModeMasters; + countWeightAnalyze += other.countWeightAnalyze; + totalCTUs += other.totalCTUs; + + other.clear(); + } +}; +#endif + +inline int getTUBits(int idx, int numIdx) +{ + return idx + (idx < numIdx - 1); +} + +class Search : public Predict +{ +public: + + static const int16_t zeroShort[MAX_CU_SIZE]; + + MotionEstimate m_me; + Quant m_quant; + RDCost m_rdCost; + const x265_param* m_param; + Frame* m_frame; + const Slice* m_slice; + + Entropy m_entropyCoder; + RQTData m_rqt[NUM_FULL_DEPTH]; + + uint8_t* m_qtTempCbf[3]; + uint8_t* m_qtTempTransformSkipFlag[3]; + + pixel* m_fencScaled; /* 32x32 buffer for down-scaled version of 64x64 CU fenc */ + pixel* m_fencTransposed; /* 32x32 buffer for transposed copy of fenc */ + pixel* m_intraPred; /* 32x32 buffer for individual intra predictions */ + pixel* m_intraPredAngs; /* allocation for 33 consecutive (all angular) 32x32 intra predictions */ + + coeff_t* m_tsCoeff; /* transform skip coeff 32x32 */ + int16_t* m_tsResidual; /* transform skip residual 32x32 */ + pixel* m_tsRecon; /* transform skip reconstructed pixels 32x32 */ + + bool m_bFrameParallel; + bool m_bEnableRDOQ; + uint32_t m_numLayers; + uint32_t m_refLagPixels; + +#if DETAILED_CU_STATS + /* Accumulate CU statistics separately for each frame encoder */ + CUStats m_stats[X265_MAX_FRAME_THREADS]; +#endif + + Search(); + ~Search(); + + bool initSearch(const x265_param& param, ScalingList& scalingList); + int setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */ + + // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores + void invalidateContexts(int fromDepth); + + // full RD search of intra modes. if sharedModes is not NULL, it directly uses them + void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes); + + // select best intra mode using only sa8d costs, cannot measure NxN intra + void checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom); + // encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode + void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom); + + // estimation inter prediction (non-skip) + void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]); + + // encode residual and compute rd-cost for inter mode + void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom); + void encodeResAndCalcRdSkipCU(Mode& interMode); + + // encode residual without rd-cost + void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]); + void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]); + void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth); + + // pick be chroma mode from available using just sa8d costs + void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom); + + /* update CBF flags and QP values to be internally consistent */ + void checkDQP(Mode& mode, const CUGeom& cuGeom); + void checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom); + + MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref); + + class PME : public BondedTaskGroup + { + public: + + Search& master; + Mode& mode; + const CUGeom& cuGeom; + const PredictionUnit& pu; + int puIdx; + + struct { + int ref[2][MAX_NUM_REF]; + int refCnt[2]; + } m_jobs; + + PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {} + + void processTasks(int workerThreadId); + + protected: + + PME operator=(const PME&); + }; + + void processPME(PME& pme, Search& slave); + void singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref); + +protected: + + /* motion estimation distribution */ + ThreadLocalData* m_tld; + + uint32_t m_listSelBits[3]; + Lock m_meLock; + + void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth); + + // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned + uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes); + + // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned + uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes); + + void codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx); + void codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]); + void codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype); + + struct Cost + { + uint64_t rdcost; + uint32_t bits; + sse_ret_t distortion; + uint32_t energy; + Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; } + }; + + uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId); + void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]); + + // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits + void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]); + void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs); + void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx); + + // generate chroma prediction, generate residual and recon + uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy); + uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy); + void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth); + + // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks + void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx); + + /* output of mergeEstimation, best merge candidate */ + struct MergeData + { + MVField mvField[2]; + uint32_t dir; + uint32_t index; + uint32_t bits; + }; + + /* inter/ME helper functions */ + int selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref); + const MV& checkBestMVP(const MV amvpCand[2], const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const; + void setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const; + uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m); + static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]); + + /* intra helper functions */ + enum { MAX_RD_INTRA_MODES = 16 }; + static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList); + + // get most probable luma modes for CU part, and bit cost of all non mpm modes + uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const; + + void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); } +}; +} + +#endif // ifndef X265_SEARCH_H