comparison x265/source/encoder/search.h @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:772086c29cc7
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
23
24 #ifndef X265_SEARCH_H
25 #define X265_SEARCH_H
26
27 #include "common.h"
28 #include "predict.h"
29 #include "quant.h"
30 #include "bitcost.h"
31 #include "framedata.h"
32 #include "yuv.h"
33 #include "threadpool.h"
34
35 #include "rdcost.h"
36 #include "entropy.h"
37 #include "motion.h"
38
/* CU profiling helpers. With DETAILED_CU_STATS enabled they update the m_stats
 * entry selected by the CU's m_encData->m_frameEncoderID; otherwise every
 * macro expands to nothing. The macros reference m_stats by name, so they
 * only work where an m_stats array is in scope. */
#if DETAILED_CU_STATS
/* Increments 'count' and then declares a ScopedElapsedTime object called
 * 'name', constructed from the 'acc' field. The expansion is two statements
 * (the second one a declaration), so it must not be used as the unbraced body
 * of an if/else/loop. 'cu' is parenthesized so that expression arguments
 * (e.g. *ptr) bind correctly. */
#define ProfileCUScopeNamed(name, cu, acc, count) \
    m_stats[(cu).m_encData->m_frameEncoderID].count++; \
    ScopedElapsedTime name(m_stats[(cu).m_encData->m_frameEncoderID].acc)
/* Same as ProfileCUScopeNamed, with the scope object always named timedScope */
#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count)
/* Increments 'count' only. The trailing ';' is part of the expansion. */
#define ProfileCounter(cu, count) m_stats[(cu).m_encData->m_frameEncoderID].count++;
#else
#define ProfileCUScopeNamed(name, cu, acc, count)
#define ProfileCUScope(cu, acc, count)
#define ProfileCounter(cu, count)
#endif
50
51 namespace X265_NS {
52 // private namespace
53
54 class Entropy;
55 struct ThreadLocalData;
56
57 /* All the CABAC contexts that Analysis needs to keep track of at each depth
58 * and temp buffers for residual, coeff, and recon for use during residual
59 * quad-tree depth recursion */
60 struct RQTData
61 {
62 Entropy cur; /* starting context for current CU */
63
64 /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
65 * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
66 * which are reconstructed at each depth are valid. At the end, the transform depth table
67 * is walked and the coeff and recon at the final split depths are collected */
68 Entropy rqtRoot; /* residual quad-tree start context */
69 Entropy rqtTemp; /* residual quad-tree temp context */
70 Entropy rqtTest; /* residual quad-tree test context */
71 coeff_t* coeffRQT[3]; /* coeff storage for entire CTU for each RQT layer */
72 Yuv reconQtYuv; /* recon storage for entire CTU for each RQT layer (intra) */
73 ShortYuv resiQtYuv; /* residual storage for entire CTU for each RQT layer (inter) */
74
75 /* per-depth temp buffers for inter prediction */
76 ShortYuv tmpResiYuv;
77 Yuv tmpPredYuv;
78 Yuv bidirPredYuv[2];
79 };
80
81 struct MotionData
82 {
83 MV mv;
84 MV mvp;
85 int mvpIdx;
86 int ref;
87 uint32_t cost;
88 int bits;
89 };
90
91 struct Mode
92 {
93 CUData cu;
94 const Yuv* fencYuv;
95 Yuv predYuv;
96 Yuv reconYuv;
97 Entropy contexts;
98
99 enum { MAX_INTER_PARTS = 2 };
100
101 MotionData bestME[MAX_INTER_PARTS][2];
102 MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
103
104 // Neighbour MVs of the current partition. 5 spatial candidates and the
105 // temporal candidate.
106 InterNeighbourMV interNeighbours[6];
107
108 uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits)
109 uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits)
110 uint32_t sa8dBits; // signal bits used in sa8dCost calculation
111 uint32_t psyEnergy; // sum of partition psycho-visual energy difference
112 sse_ret_t resEnergy; // sum of partition residual energy after motion prediction
113 sse_ret_t lumaDistortion;
114 sse_ret_t chromaDistortion;
115 sse_ret_t distortion; // sum of partition SSE distortion
116 uint32_t totalBits; // sum of partition bits (mv + coeff)
117 uint32_t mvBits; // Mv bits + Ref + block type (or intra mode)
118 uint32_t coeffBits; // Texture bits (DCT Coeffs)
119
120 void initCosts()
121 {
122 rdCost = 0;
123 sa8dCost = 0;
124 sa8dBits = 0;
125 psyEnergy = 0;
126 resEnergy = 0;
127 lumaDistortion = 0;
128 chromaDistortion = 0;
129 distortion = 0;
130 totalBits = 0;
131 mvBits = 0;
132 coeffBits = 0;
133 }
134
135 void invalidate()
136 {
137 /* set costs to invalid data, catch uninitialized re-use */
138 rdCost = UINT64_MAX / 2;
139 sa8dCost = UINT64_MAX / 2;
140 sa8dBits = MAX_UINT / 2;
141 psyEnergy = MAX_UINT / 2;
142 #if X265_DEPTH <= 10
143 resEnergy = MAX_UINT / 2;
144 lumaDistortion = MAX_UINT / 2;
145 chromaDistortion = MAX_UINT / 2;
146 distortion = MAX_UINT / 2;
147 #else
148 resEnergy = UINT64_MAX / 2;
149 lumaDistortion = UINT64_MAX / 2;
150 chromaDistortion = UINT64_MAX / 2;
151 distortion = UINT64_MAX / 2;
152 #endif
153 totalBits = MAX_UINT / 2;
154 mvBits = MAX_UINT / 2;
155 coeffBits = MAX_UINT / 2;
156 }
157
158 bool ok() const
159 {
160 #if X265_DEPTH <= 10
161 return !(rdCost >= UINT64_MAX / 2 ||
162 sa8dCost >= UINT64_MAX / 2 ||
163 sa8dBits >= MAX_UINT / 2 ||
164 psyEnergy >= MAX_UINT / 2 ||
165 resEnergy >= MAX_UINT / 2 ||
166 lumaDistortion >= MAX_UINT / 2 ||
167 chromaDistortion >= MAX_UINT / 2 ||
168 distortion >= MAX_UINT / 2 ||
169 totalBits >= MAX_UINT / 2 ||
170 mvBits >= MAX_UINT / 2 ||
171 coeffBits >= MAX_UINT / 2);
172 #else
173 return !(rdCost >= UINT64_MAX / 2 ||
174 sa8dCost >= UINT64_MAX / 2 ||
175 sa8dBits >= MAX_UINT / 2 ||
176 psyEnergy >= MAX_UINT / 2 ||
177 resEnergy >= UINT64_MAX / 2 ||
178 lumaDistortion >= UINT64_MAX / 2 ||
179 chromaDistortion >= UINT64_MAX / 2 ||
180 distortion >= UINT64_MAX / 2 ||
181 totalBits >= MAX_UINT / 2 ||
182 mvBits >= MAX_UINT / 2 ||
183 coeffBits >= MAX_UINT / 2);
184 #endif
185 }
186
187 void addSubCosts(const Mode& subMode)
188 {
189 X265_CHECK(subMode.ok(), "sub-mode not initialized");
190
191 rdCost += subMode.rdCost;
192 sa8dCost += subMode.sa8dCost;
193 sa8dBits += subMode.sa8dBits;
194 psyEnergy += subMode.psyEnergy;
195 resEnergy += subMode.resEnergy;
196 lumaDistortion += subMode.lumaDistortion;
197 chromaDistortion += subMode.chromaDistortion;
198 distortion += subMode.distortion;
199 totalBits += subMode.totalBits;
200 mvBits += subMode.mvBits;
201 coeffBits += subMode.coeffBits;
202 }
203 };
204
#if DETAILED_CU_STATS
/* Performance-debugging counters and elapsed times. No attempt is made to
 * handle dynamic range overflows, so avoid long encodes if the accuracy of
 * these times and counters matters. This profiling is orthogonal to
 * PPA/VTune and can be enabled independently from either of them */
struct CUStats
{
    int64_t  intraRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in intra RDO per CU depth
    int64_t  interRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in inter RDO per CU depth
    int64_t  intraAnalysisElapsedTime;          // elapsed worker time in intra sa8d analysis
    int64_t  motionEstimationElapsedTime;       // elapsed worker time in predInterSearch()
    int64_t  loopFilterElapsedTime;             // elapsed worker time in deblock and SAO and PSNR/SSIM
    int64_t  pmeTime;                           // elapsed worker time processing ME slave jobs
    int64_t  pmeBlockTime;                      // elapsed worker time blocked for pme batch completion
    int64_t  pmodeTime;                         // elapsed worker time processing pmode slave jobs
    int64_t  pmodeBlockTime;                    // elapsed worker time blocked for pmode batch completion
    int64_t  weightAnalyzeTime;                 // elapsed worker time analyzing reference weights
    int64_t  totalCTUTime;                      // elapsed worker time in compressCTU (includes pmode master)

    uint32_t skippedMotionReferences[NUM_CU_DEPTH];
    uint32_t totalMotionReferences[NUM_CU_DEPTH];
    uint32_t skippedIntraCU[NUM_CU_DEPTH];
    uint32_t totalIntraCU[NUM_CU_DEPTH];

    uint64_t countIntraRDO[NUM_CU_DEPTH];
    uint64_t countInterRDO[NUM_CU_DEPTH];
    uint64_t countIntraAnalysis;
    uint64_t countMotionEstimate;
    uint64_t countLoopFilter;
    uint64_t countPMETasks;
    uint64_t countPMEMasters;
    uint64_t countPModeTasks;
    uint64_t countPModeMasters;
    uint64_t countWeightAnalyze;
    uint64_t totalCTUs;

    /* a new instance starts with all fields zeroed */
    CUStats()
    {
        clear();
    }

    /* zero every field in one go; the struct holds only integers and
     * integer arrays */
    void clear()
    {
        memset(this, 0, sizeof(CUStats));
    }

    /* fold the figures of 'other' into this instance, then reset 'other'
     * so the same values are not counted a second time */
    void accumulate(CUStats& other)
    {
        /* scalar elapsed times */
        intraAnalysisElapsedTime    += other.intraAnalysisElapsedTime;
        motionEstimationElapsedTime += other.motionEstimationElapsedTime;
        loopFilterElapsedTime       += other.loopFilterElapsedTime;
        pmeTime                     += other.pmeTime;
        pmeBlockTime                += other.pmeBlockTime;
        pmodeTime                   += other.pmodeTime;
        pmodeBlockTime              += other.pmodeBlockTime;
        weightAnalyzeTime           += other.weightAnalyzeTime;
        totalCTUTime                += other.totalCTUTime;

        /* scalar counters */
        countIntraAnalysis  += other.countIntraAnalysis;
        countMotionEstimate += other.countMotionEstimate;
        countLoopFilter     += other.countLoopFilter;
        countPMETasks       += other.countPMETasks;
        countPMEMasters     += other.countPMEMasters;
        countPModeTasks     += other.countPModeTasks;
        countPModeMasters   += other.countPModeMasters;
        countWeightAnalyze  += other.countWeightAnalyze;
        totalCTUs           += other.totalCTUs;

        /* per-depth figures, for depths 0 .. g_maxCUDepth inclusive */
        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
        {
            intraRDOElapsedTime[depth]     += other.intraRDOElapsedTime[depth];
            interRDOElapsedTime[depth]     += other.interRDOElapsedTime[depth];
            countIntraRDO[depth]           += other.countIntraRDO[depth];
            countInterRDO[depth]           += other.countInterRDO[depth];
            skippedMotionReferences[depth] += other.skippedMotionReferences[depth];
            totalMotionReferences[depth]   += other.totalMotionReferences[depth];
            skippedIntraCU[depth]          += other.skippedIntraCU[depth];
            totalIntraCU[depth]            += other.totalIntraCU[depth];
        }

        other.clear();
    }
};
#endif
287
/* Returns idx, plus one more when idx is not the last of the numIdx
 * possible indices (idx < numIdx - 1). */
inline int getTUBits(int idx, int numIdx)
{
    const bool notLastIndex = idx < numIdx - 1;

    return notLastIndex ? idx + 1 : idx;
}
292
293 class Search : public Predict
294 {
295 public:
296
297 static const int16_t zeroShort[MAX_CU_SIZE];
298
299 MotionEstimate m_me;
300 Quant m_quant;
301 RDCost m_rdCost;
302 const x265_param* m_param;
303 Frame* m_frame;
304 const Slice* m_slice;
305
306 Entropy m_entropyCoder;
307 RQTData m_rqt[NUM_FULL_DEPTH];
308
309 uint8_t* m_qtTempCbf[3];
310 uint8_t* m_qtTempTransformSkipFlag[3];
311
312 pixel* m_fencScaled; /* 32x32 buffer for down-scaled version of 64x64 CU fenc */
313 pixel* m_fencTransposed; /* 32x32 buffer for transposed copy of fenc */
314 pixel* m_intraPred; /* 32x32 buffer for individual intra predictions */
315 pixel* m_intraPredAngs; /* allocation for 33 consecutive (all angular) 32x32 intra predictions */
316
317 coeff_t* m_tsCoeff; /* transform skip coeff 32x32 */
318 int16_t* m_tsResidual; /* transform skip residual 32x32 */
319 pixel* m_tsRecon; /* transform skip reconstructed pixels 32x32 */
320
321 bool m_bFrameParallel;
322 bool m_bEnableRDOQ;
323 uint32_t m_numLayers;
324 uint32_t m_refLagPixels;
325
326 #if DETAILED_CU_STATS
327 /* Accumulate CU statistics separately for each frame encoder */
328 CUStats m_stats[X265_MAX_FRAME_THREADS];
329 #endif
330
331 Search();
332 ~Search();
333
334 bool initSearch(const x265_param& param, ScalingList& scalingList);
335 int setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
336
337 // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
338 void invalidateContexts(int fromDepth);
339
340 // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
341 void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
342
343 // select best intra mode using only sa8d costs, cannot measure NxN intra
344 void checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
345 // encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode
346 void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
347
348 // estimation inter prediction (non-skip)
349 void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
350
351 // encode residual and compute rd-cost for inter mode
352 void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
353 void encodeResAndCalcRdSkipCU(Mode& interMode);
354
355 // encode residual without rd-cost
356 void residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
357 void residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
358 void residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth);
359
360 // pick be chroma mode from available using just sa8d costs
361 void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
362
363 /* update CBF flags and QP values to be internally consistent */
364 void checkDQP(Mode& mode, const CUGeom& cuGeom);
365 void checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom);
366
367 MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
368
369 class PME : public BondedTaskGroup
370 {
371 public:
372
373 Search& master;
374 Mode& mode;
375 const CUGeom& cuGeom;
376 const PredictionUnit& pu;
377 int puIdx;
378
379 struct {
380 int ref[2][MAX_NUM_REF];
381 int refCnt[2];
382 } m_jobs;
383
384 PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {}
385
386 void processTasks(int workerThreadId);
387
388 protected:
389
390 PME operator=(const PME&);
391 };
392
393 void processPME(PME& pme, Search& slave);
394 void singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref);
395
396 protected:
397
398 /* motion estimation distribution */
399 ThreadLocalData* m_tld;
400
401 uint32_t m_listSelBits[3];
402 Lock m_meLock;
403
404 void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
405
406 // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
407 uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
408
409 // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
410 uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
411
412 void codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
413 void codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
414 void codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
415
416 struct Cost
417 {
418 uint64_t rdcost;
419 uint32_t bits;
420 sse_ret_t distortion;
421 uint32_t energy;
422 Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
423 };
424
425 uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
426 void estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
427
428 // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
429 void codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
430 void codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);
431 void extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
432
433 // generate chroma prediction, generate residual and recon
434 uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
435 uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
436 void extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
437
438 // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
439 void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx);
440
441 /* output of mergeEstimation, best merge candidate */
442 struct MergeData
443 {
444 MVField mvField[2];
445 uint32_t dir;
446 uint32_t index;
447 uint32_t bits;
448 };
449
450 /* inter/ME helper functions */
451 int selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref);
452 const MV& checkBestMVP(const MV amvpCand[2], const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const;
453 void setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
454 uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
455 static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
456
457 /* intra helper functions */
458 enum { MAX_RD_INTRA_MODES = 16 };
459 static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
460
461 // get most probable luma modes for CU part, and bit cost of all non mpm modes
462 uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const;
463
464 void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
465 };
466 }
467
468 #endif // ifndef X265_SEARCH_H