comparison x265/source/encoder/analysis.cpp @ 0:772086c29cc7

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 16 Nov 2016 11:16:33 +0200
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 *
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
24
25 #include "common.h"
26 #include "frame.h"
27 #include "framedata.h"
28 #include "picyuv.h"
29 #include "primitives.h"
30 #include "threading.h"
31
32 #include "analysis.h"
33 #include "rdcost.h"
34 #include "encoder.h"
35
36 using namespace X265_NS;
37
38 /* An explanation of rate distortion levels (--rd-level)
39 *
40 * rd-level 0 generates no recon per CU (NO RDO or Quant)
41 *
42 * sa8d selection between merge / skip / inter / intra and split
43 * no recon pixels generated until CTU analysis is complete, requiring
44 * intra predictions to use source pixels
45 *
46 * rd-level 1 uses RDO for merge and skip, sa8d for all else
47 *
48 * RDO selection between merge and skip
49 * sa8d selection between (merge/skip) / inter modes / intra and split
50 * intra prediction uses reconstructed pixels
51 *
52 * rd-level 2 uses RDO for merge/skip and split
53 *
54 * RDO selection between merge and skip
55 * sa8d selection between (merge/skip) / inter modes / intra
56 * RDO split decisions
57 *
58 * rd-level 3 uses RDO for merge/skip/best inter/intra
59 *
60 * RDO selection between merge and skip
61 * sa8d selection of best inter mode
62 * sa8d decisions include chroma residual cost
63 * RDO selection between (merge/skip) / best inter mode / intra / split
64 *
65 * rd-level 4 enables RDOQuant
66 * chroma residual cost included in satd decisions, including subpel refine
67 * (as a result of --subme 3 being used by preset slow)
68 *
69 * rd-levels 5 and 6 do RDO for each inter mode
70 */
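/* Editor's sketch (not part of the original source): for P/B slices,
 * compressCTU() below dispatches roughly as follows on rdLevel, assuming
 * --pmode (bDistributeModeAnalysis) is disabled:
 *
 *     if (!rdLevel)            compressInterCU_rd0_4(..);  // sa8d only; recon deferred to encodeResidue()
 *     else if (rdLevel <= 4)   compressInterCU_rd0_4(..);  // RDO for merge/skip, more RDO as rdLevel rises
 *     else                     compressInterCU_rd5_6(..);  // RDO for every evaluated inter mode
 */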
71
72 Analysis::Analysis()
73 {
74 m_reuseIntraDataCTU = NULL;
75 m_reuseInterDataCTU = NULL;
76 m_reuseRef = NULL;
77 m_reuseBestMergeCand = NULL;
78 }
79
80 bool Analysis::create(ThreadLocalData *tld)
81 {
82 m_tld = tld;
83 m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
84 m_bChromaSa8d = m_param->rdLevel >= 3;
85
86 int csp = m_param->internalCsp;
87 uint32_t cuSize = g_maxCUSize;
88
89 bool ok = true;
90 for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1)
91 {
92 ModeDepth &md = m_modeDepth[depth];
93
94 md.cuMemPool.create(depth, csp, MAX_PRED_TYPES);
95 ok &= md.fencYuv.create(cuSize, csp);
96
97 for (int j = 0; j < MAX_PRED_TYPES; j++)
98 {
99 md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j);
100 ok &= md.pred[j].predYuv.create(cuSize, csp);
101 ok &= md.pred[j].reconYuv.create(cuSize, csp);
102 md.pred[j].fencYuv = &md.fencYuv;
103 }
104 }
105
106 return ok;
107 }
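/* Editor's note (not in the original source): the loop above gives each of
 * the g_maxCUDepth + 1 recursion depths its own working set -- a source Yuv
 * plus MAX_PRED_TYPES candidate Modes, each with prediction and recon Yuv
 * buffers sized for that depth's CU. */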
108
109 void Analysis::destroy()
110 {
111 for (uint32_t i = 0; i <= g_maxCUDepth; i++)
112 {
113 m_modeDepth[i].cuMemPool.destroy();
114 m_modeDepth[i].fencYuv.destroy();
115
116 for (int j = 0; j < MAX_PRED_TYPES; j++)
117 {
118 m_modeDepth[i].pred[j].predYuv.destroy();
119 m_modeDepth[i].pred[j].reconYuv.destroy();
120 }
121 }
122 }
123
124 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
125 {
126 m_slice = ctu.m_slice;
127 m_frame = &frame;
128
129 #if _DEBUG || CHECKED_BUILD
130 for (uint32_t i = 0; i <= g_maxCUDepth; i++)
131 for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
132 m_modeDepth[i].pred[j].invalidate();
133 invalidateContexts(0);
134 #endif
135
136 int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
137 ctu.setQPSubParts((int8_t)qp, 0, 0);
138
139 m_rqt[0].cur.load(initialContext);
140 m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
141
142 uint32_t numPartition = ctu.m_numPartitions;
143 if (m_param->analysisMode)
144 {
145 if (m_slice->m_sliceType == I_SLICE)
146 m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
147 else
148 {
149 int numPredDir = m_slice->isInterP() ? 1 : 2;
150 m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
151 m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
152 m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
153 }
154 }
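/* Editor's note (not in the original source): the reuse pointers above index
 * flat per-frame arrays with one block per CTU -- e.g. m_reuseRef is given
 * X265_MAX_PRED_MODE_PER_CTU * numPredDir entries per CTU, hence the
 * ctu.m_cuAddr multiplier in the offsets. */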
155
156 ProfileCUScope(ctu, totalCTUTime, totalCTUs);
157
158 uint32_t zOrder = 0;
159 if (m_slice->m_sliceType == I_SLICE)
160 {
161 compressIntraCU(ctu, cuGeom, zOrder, qp);
162 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
163 {
164 CUData* bestCU = &m_modeDepth[0].bestMode->cu;
165 memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
166 memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
167 memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
168 memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
169 }
170 }
171 else
172 {
173 if (!m_param->rdLevel)
174 {
175 /* In RD Level 0/1, copy source pixels into the reconstructed block so
176 * they are available for intra predictions */
177 m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
178
179 compressInterCU_rd0_4(ctu, cuGeom, qp);
180
181 /* generate residual for entire CTU at once and copy to reconPic */
182 encodeResidue(ctu, cuGeom);
183 }
184 else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
185 compressInterCU_dist(ctu, cuGeom, qp);
186 else if (m_param->rdLevel <= 4)
187 compressInterCU_rd0_4(ctu, cuGeom, qp);
188 else
189 {
190 compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
191 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
192 {
193 CUData* bestCU = &m_modeDepth[0].bestMode->cu;
194 memcpy(&m_reuseInterDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
195 memcpy(&m_reuseInterDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_predMode, sizeof(uint8_t) * numPartition);
196 }
197 }
198 }
199
200 return *m_modeDepth[0].bestMode;
201 }
202
203 void Analysis::tryLossless(const CUGeom& cuGeom)
204 {
205 ModeDepth& md = m_modeDepth[cuGeom.depth];
206
207 if (!md.bestMode->distortion)
208 /* already lossless */
209 return;
210 else if (md.bestMode->cu.isIntra(0))
211 {
212 md.pred[PRED_LOSSLESS].initCosts();
213 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
214 PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
215 uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
216 checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
217 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
218 }
219 else
220 {
221 md.pred[PRED_LOSSLESS].initCosts();
222 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
223 md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
224 encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
225 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
226 }
227 }
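/* Editor's note (not in the original source): tryLossless() re-evaluates the
 * current best mode as a lossless CU (initLosslessCU presumably sets the
 * transquant-bypass flag) and lets checkBestMode() keep whichever RD cost
 * wins; a best mode that already has zero distortion is left untouched. */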
228
229 void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
230 {
231 uint32_t depth = cuGeom.depth;
232 ModeDepth& md = m_modeDepth[depth];
233 md.bestMode = NULL;
234
235 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
236 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
237
238 if (m_param->analysisMode == X265_ANALYSIS_LOAD)
239 {
240 uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
241 uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
242 char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
243 uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
244
245 if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
246 {
247 PartSize size = (PartSize)reusePartSizes[zOrder];
248 Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
249 mode.cu.initSubCU(parentCTU, cuGeom, qp);
250 checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
251 checkBestMode(mode, depth);
252
253 if (m_bTryLossless)
254 tryLossless(cuGeom);
255
256 if (mightSplit)
257 addSplitFlagCost(*md.bestMode, cuGeom.depth);
258
259 // increment zOrder offset to point to next best depth in sharedDepth buffer
260 zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
261 mightSplit = false;
262 }
263 }
264 else if (mightNotSplit)
265 {
266 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
267 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
268 checkBestMode(md.pred[PRED_INTRA], depth);
269
270 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
271 {
272 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
273 checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
274 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
275 }
276
277 if (m_bTryLossless)
278 tryLossless(cuGeom);
279
280 if (mightSplit)
281 addSplitFlagCost(*md.bestMode, cuGeom.depth);
282 }
283
284 if (mightSplit)
285 {
286 Mode* splitPred = &md.pred[PRED_SPLIT];
287 splitPred->initCosts();
288 CUData* splitCU = &splitPred->cu;
289 splitCU->initSubCU(parentCTU, cuGeom, qp);
290
291 uint32_t nextDepth = depth + 1;
292 ModeDepth& nd = m_modeDepth[nextDepth];
293 invalidateContexts(nextDepth);
294 Entropy* nextContext = &m_rqt[depth].cur;
295 int32_t nextQP = qp;
296
297 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
298 {
299 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
300 if (childGeom.flags & CUGeom::PRESENT)
301 {
302 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
303 m_rqt[nextDepth].cur.load(*nextContext);
304
305 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
306 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
307
308 compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
309
310 // Save best CU and pred data for this sub CU
311 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
312 splitPred->addSubCosts(*nd.bestMode);
313 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
314 nextContext = &nd.bestMode->contexts;
315 }
316 else
317 {
318 /* record the depth of this non-present sub-CU */
319 splitCU->setEmptyPart(childGeom, subPartIdx);
320 zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
321 }
322 }
323 nextContext->store(splitPred->contexts);
324 if (mightNotSplit)
325 addSplitFlagCost(*splitPred, cuGeom.depth);
326 else
327 updateModeCost(*splitPred);
328
329 checkDQPForSplitPred(*splitPred, cuGeom);
330 checkBestMode(*splitPred, depth);
331 }
332
333 /* Copy best data to encData CTU and recon */
334 md.bestMode->cu.copyToPic(depth);
335 if (md.bestMode != &md.pred[PRED_SPLIT])
336 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
337 }
338
339 void Analysis::PMODE::processTasks(int workerThreadId)
340 {
341 #if DETAILED_CU_STATS
342 int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
343 master.m_stats[fe].countPModeTasks++;
344 ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
345 #endif
346 ProfileScopeEvent(pmode);
347 master.processPmode(*this, master.m_tld[workerThreadId].analysis);
348 }
349
350 /* process pmode jobs until none remain; may be called by the master thread or by
351 * a bonded peer (slave) thread via pmodeTasks() */
352 void Analysis::processPmode(PMODE& pmode, Analysis& slave)
353 {
354 /* acquire a mode task, else exit early */
355 int task;
356 pmode.m_lock.acquire();
357 if (pmode.m_jobTotal > pmode.m_jobAcquired)
358 {
359 task = pmode.m_jobAcquired++;
360 pmode.m_lock.release();
361 }
362 else
363 {
364 pmode.m_lock.release();
365 return;
366 }
367
368 ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];
369
370 /* setup slave Analysis */
371 if (&slave != this)
372 {
373 slave.m_slice = m_slice;
374 slave.m_frame = m_frame;
375 slave.m_param = m_param;
376 slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
377 slave.invalidateContexts(0);
378 slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
379 }
380
381 /* perform Mode task, repeat until no more work is available */
382 do
383 {
384 uint32_t refMasks[2] = { 0, 0 };
385
386 if (m_param->rdLevel <= 4)
387 {
388 switch (pmode.modes[task])
389 {
390 case PRED_INTRA:
391 slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
392 if (m_param->rdLevel > 2)
393 slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
394 break;
395
396 case PRED_2Nx2N:
397 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
398
399 slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
400 if (m_slice->m_sliceType == B_SLICE)
401 slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
402 break;
403
404 case PRED_Nx2N:
405 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
406 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
407
408 slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
409 break;
410
411 case PRED_2NxN:
412 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
413 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
414
415 slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
416 break;
417
418 case PRED_2NxnU:
419 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
420 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
421
422 slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
423 break;
424
425 case PRED_2NxnD:
426 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
427 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
428
429 slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
430 break;
431
432 case PRED_nLx2N:
433 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
434 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
435
436 slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
437 break;
438
439 case PRED_nRx2N:
440 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
441 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
442
443 slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
444 break;
445
446 default:
447 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
448 break;
449 }
450 }
451 else
452 {
453 switch (pmode.modes[task])
454 {
455 case PRED_INTRA:
456 slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N, NULL, NULL);
457 if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
458 slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN, NULL, NULL);
459 break;
460
461 case PRED_2Nx2N:
462 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
463
464 slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
465 md.pred[PRED_BIDIR].rdCost = MAX_INT64;
466 if (m_slice->m_sliceType == B_SLICE)
467 {
468 slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
469 if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
470 slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
471 }
472 break;
473
474 case PRED_Nx2N:
475 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
476 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
477
478 slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
479 break;
480
481 case PRED_2NxN:
482 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
483 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
484
485 slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
486 break;
487
488 case PRED_2NxnU:
489 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
490 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
491
492 slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
493 break;
494
495 case PRED_2NxnD:
496 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
497 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
498 slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
499 break;
500
501 case PRED_nLx2N:
502 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
503 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
504
505 slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
506 break;
507
508 case PRED_nRx2N:
509 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
510 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
511 slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
512 break;
513
514 default:
515 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
516 break;
517 }
518 }
519
520 task = -1;
521 pmode.m_lock.acquire();
522 if (pmode.m_jobTotal > pmode.m_jobAcquired)
523 task = pmode.m_jobAcquired++;
524 pmode.m_lock.release();
525 }
526 while (task >= 0);
527 }
528
529 uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
530 {
531 uint32_t depth = cuGeom.depth;
532 uint32_t cuAddr = parentCTU.m_cuAddr;
533 ModeDepth& md = m_modeDepth[depth];
534 md.bestMode = NULL;
535
536 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
537 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
538 uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
539 uint32_t splitRefs[4] = { 0, 0, 0, 0 };
540
541 X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");
542
543 PMODE pmode(*this, cuGeom);
544
545 if (mightNotSplit && depth >= minDepth)
546 {
547 /* Initialize all prediction CUs based on parentCTU */
548 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
549 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
550
551 if (m_param->rdLevel <= 4)
552 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
553 else
554 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
555 }
556
557 bool bNoSplit = false;
558 bool splitIntra = true;
559 if (md.bestMode)
560 {
561 bNoSplit = md.bestMode->cu.isSkipped(0);
562 if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
563 bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
564 }
565
566 if (mightSplit && !bNoSplit)
567 {
568 Mode* splitPred = &md.pred[PRED_SPLIT];
569 splitPred->initCosts();
570 CUData* splitCU = &splitPred->cu;
571 splitCU->initSubCU(parentCTU, cuGeom, qp);
572
573 uint32_t nextDepth = depth + 1;
574 ModeDepth& nd = m_modeDepth[nextDepth];
575 invalidateContexts(nextDepth);
576 Entropy* nextContext = &m_rqt[depth].cur;
577 int nextQP = qp;
578 splitIntra = false;
579
580 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
581 {
582 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
583 if (childGeom.flags & CUGeom::PRESENT)
584 {
585 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
586 m_rqt[nextDepth].cur.load(*nextContext);
587
588 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
589 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
590
591 splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);
592
593 // Save best CU and pred data for this sub CU
594 splitIntra |= nd.bestMode->cu.isIntra(0);
595 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
596 splitPred->addSubCosts(*nd.bestMode);
597
598 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
599 nextContext = &nd.bestMode->contexts;
600 }
601 else
602 splitCU->setEmptyPart(childGeom, subPartIdx);
603 }
604 nextContext->store(splitPred->contexts);
605
606 if (mightNotSplit)
607 addSplitFlagCost(*splitPred, cuGeom.depth);
608 else
609 updateModeCost(*splitPred);
610
611 checkDQPForSplitPred(*splitPred, cuGeom);
612 }
613
614 if (mightNotSplit && depth >= minDepth)
615 {
616 int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
617 int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra);
618
619 if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
620 setLambdaFromQP(parentCTU, qp);
621
622 if (bTryIntra)
623 {
624 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
625 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
626 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
627 pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
628 }
629 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
630 md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
631 if (m_param->bEnableRectInter)
632 {
633 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
634 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
635 }
636 if (bTryAmp)
637 {
638 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
639 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
640 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
641 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
642 }
643
644 m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];
645
646 pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);
647
648 /* participate in processing jobs, until all are distributed */
649 processPmode(pmode, *this);
650
651 /* the master worker thread (this one) does merge analysis. By doing
652 * merge after all the other jobs are at least started, we usually avoid
653 * blocking on another thread */
654
655 if (m_param->rdLevel <= 4)
656 {
657 {
658 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
659 pmode.waitForExit();
660 }
661
662 /* select best inter mode based on sa8d cost */
663 Mode *bestInter = &md.pred[PRED_2Nx2N];
664
665 if (m_param->bEnableRectInter)
666 {
667 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
668 bestInter = &md.pred[PRED_Nx2N];
669 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
670 bestInter = &md.pred[PRED_2NxN];
671 }
672
673 if (bTryAmp)
674 {
675 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
676 bestInter = &md.pred[PRED_2NxnU];
677 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
678 bestInter = &md.pred[PRED_2NxnD];
679 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
680 bestInter = &md.pred[PRED_nLx2N];
681 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
682 bestInter = &md.pred[PRED_nRx2N];
683 }
684
685 if (m_param->rdLevel > 2)
686 {
687 /* RD selection between merge, inter, bidir and intra */
688 if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
689 {
690 uint32_t numPU = bestInter->cu.getNumPartInter(0);
691 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
692 {
693 PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
694 motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
695 }
696 }
697 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
698 checkBestMode(*bestInter, depth);
699
700 /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
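/* (Editor's note: the 17/16 margin means BIDIR may be at most 1/16, roughly
 * 6.25%, more expensive in sa8d terms than the best inter candidate and
 * still receive a full RDO evaluation.) */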
701 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
702 md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
703 {
704 encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
705 checkBestMode(md.pred[PRED_BIDIR], depth);
706 }
707
708 if (bTryIntra)
709 checkBestMode(md.pred[PRED_INTRA], depth);
710 }
711 else /* m_param->rdLevel == 2 */
712 {
713 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
714 md.bestMode = bestInter;
715
716 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
717 md.bestMode = &md.pred[PRED_BIDIR];
718
719 if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
720 {
721 md.bestMode = &md.pred[PRED_INTRA];
722 encodeIntraInInter(*md.bestMode, cuGeom);
723 }
724 else if (!md.bestMode->cu.m_mergeFlag[0])
725 {
726 /* finally code the best mode selected from SA8D costs */
727 uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
728 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
729 {
730 PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
731 motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
732 }
733 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
734 }
735 }
736 }
737 else
738 {
739 {
740 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
741 pmode.waitForExit();
742 }
743
744 checkBestMode(md.pred[PRED_2Nx2N], depth);
745 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
746 checkBestMode(md.pred[PRED_BIDIR], depth);
747
748 if (m_param->bEnableRectInter)
749 {
750 checkBestMode(md.pred[PRED_Nx2N], depth);
751 checkBestMode(md.pred[PRED_2NxN], depth);
752 }
753
754 if (bTryAmp)
755 {
756 checkBestMode(md.pred[PRED_2NxnU], depth);
757 checkBestMode(md.pred[PRED_2NxnD], depth);
758 checkBestMode(md.pred[PRED_nLx2N], depth);
759 checkBestMode(md.pred[PRED_nRx2N], depth);
760 }
761
762 if (bTryIntra)
763 {
764 checkBestMode(md.pred[PRED_INTRA], depth);
765 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
766 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
767 }
768 }
769
770 if (m_bTryLossless)
771 tryLossless(cuGeom);
772
773 if (mightSplit)
774 addSplitFlagCost(*md.bestMode, cuGeom.depth);
775 }
776
777 /* compare split RD cost against best cost */
778 if (mightSplit && !bNoSplit)
779 checkBestMode(md.pred[PRED_SPLIT], depth);
780
781 /* determine which motion references the parent CU should search */
782 uint32_t refMask;
783 if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
784 refMask = 0;
785 else if (md.bestMode == &md.pred[PRED_SPLIT])
786 refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
787 else
788 {
789 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
790 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
791 uint32_t numPU = cu.getNumPartInter(0);
792 refMask = 0;
793 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
794 refMask |= cu.getBestRefIdx(subPartIdx);
795 }
796
797 if (mightNotSplit)
798 {
799 /* early-out statistics */
800 FrameData& curEncData = *m_frame->m_encData;
801 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
802 uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
803 cuStat.count[depth] += 1;
804 cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
805 }
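/* Editor's note (not in the original source): the block above keeps a running
 * average of RD cost per depth,
 *     avgCost[d] = (avgCost[d] * count[d] + rdCost) / (count[d] + 1),
 * which the recursion-depth early-out (recursionDepthCheck) consults when
 * deciding whether deeper analysis is worth attempting. */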
806
807 /* Copy best data to encData CTU and recon */
808 X265_CHECK(md.bestMode->ok(), "best mode is not ok");
809 md.bestMode->cu.copyToPic(depth);
810 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
811
812 return refMask;
813 }
814
815 uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
816 {
817 uint32_t depth = cuGeom.depth;
818 uint32_t cuAddr = parentCTU.m_cuAddr;
819 ModeDepth& md = m_modeDepth[depth];
820 md.bestMode = NULL;
821
822 PicYuv& reconPic = *m_frame->m_reconPic;
823
824 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
825 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
826 uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
827 bool earlyskip = false;
828 bool splitIntra = true;
829 uint32_t splitRefs[4] = { 0, 0, 0, 0 };
830 /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
831 if (mightNotSplit && depth >= minDepth)
832 {
833 /* Compute Merge Cost */
834 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
835 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
836 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
837 if (m_param->rdLevel)
838 earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
839 }
840
841 bool bNoSplit = false;
842 if (md.bestMode)
843 {
844 bNoSplit = md.bestMode->cu.isSkipped(0);
845 if (mightSplit && depth && depth >= minDepth && !bNoSplit)
846 bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
847 }
848
849 /* Step 2. Evaluate each of the 4 split sub-blocks in series */
850 if (mightSplit && !bNoSplit)
851 {
852 Mode* splitPred = &md.pred[PRED_SPLIT];
853 splitPred->initCosts();
854 CUData* splitCU = &splitPred->cu;
855 splitCU->initSubCU(parentCTU, cuGeom, qp);
856
857 uint32_t nextDepth = depth + 1;
858 ModeDepth& nd = m_modeDepth[nextDepth];
859 invalidateContexts(nextDepth);
860 Entropy* nextContext = &m_rqt[depth].cur;
861 int nextQP = qp;
862 splitIntra = false;
863
864 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
865 {
866 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
867 if (childGeom.flags & CUGeom::PRESENT)
868 {
869 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
870 m_rqt[nextDepth].cur.load(*nextContext);
871
872 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
873 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
874
875 splitRefs[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
876
877 // Save best CU and pred data for this sub CU
878 splitIntra |= nd.bestMode->cu.isIntra(0);
879 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
880 splitPred->addSubCosts(*nd.bestMode);
881
882 if (m_param->rdLevel)
883 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
884 else
885 nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
886 if (m_param->rdLevel > 1)
887 nextContext = &nd.bestMode->contexts;
888 }
889 else
890 splitCU->setEmptyPart(childGeom, subPartIdx);
891 }
892 nextContext->store(splitPred->contexts);
893
894 if (mightNotSplit)
895 addSplitFlagCost(*splitPred, cuGeom.depth);
896 else if (m_param->rdLevel > 1)
897 updateModeCost(*splitPred);
898 else
899 splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
900 }
901
902 /* Split CUs
903 * 0 1
904 * 2 3 */
905 uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
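/* Editor's note (not in the original source): using the sub-CU numbering in
 * the comment above (0 1 / 2 3), the rectangular and asymmetric partitions
 * below build their reference masks by OR-ing the relevant quadrants, e.g.
 * left = splitRefs[0] | splitRefs[2], right = splitRefs[1] | splitRefs[3],
 * top = splitRefs[0] | splitRefs[1], bottom = splitRefs[2] | splitRefs[3]. */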
906 /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
907 if (mightNotSplit && depth >= minDepth)
908 {
909 if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
910 setLambdaFromQP(parentCTU, qp);
911
912 if (!earlyskip)
913 {
914 uint32_t refMasks[2];
915 refMasks[0] = allSplitRefs;
916 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
917 checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
918
919 if (m_param->limitReferences & X265_REF_LIMIT_CU)
920 {
921 CUData& cu = md.pred[PRED_2Nx2N].cu;
922 uint32_t refMask = cu.getBestRefIdx(0);
923 allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2] = splitRefs[3] = refMask;
924 }
925
926 if (m_slice->m_sliceType == B_SLICE)
927 {
928 md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
929 checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
930 }
931
932 Mode *bestInter = &md.pred[PRED_2Nx2N];
933 if (m_param->bEnableRectInter)
934 {
935 refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */
936 refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */
937 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
938 checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
939 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
940 bestInter = &md.pred[PRED_Nx2N];
941
942 refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */
943 refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */
944 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
945 checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
946 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
947 bestInter = &md.pred[PRED_2NxN];
948 }
949
950 if (m_slice->m_sps->maxAMPDepth > depth)
951 {
952 bool bHor = false, bVer = false;
953 if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
954 bHor = true;
955 else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
956 bVer = true;
957 else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
958 md.bestMode && md.bestMode->cu.getQtRootCbf(0))
959 {
960 bHor = true;
961 bVer = true;
962 }
963
964 if (bHor)
965 {
966 refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top */
967 refMasks[1] = allSplitRefs; /* 75% bot */
968 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
969 checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
970 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
971 bestInter = &md.pred[PRED_2NxnU];
972
973 refMasks[0] = allSplitRefs; /* 75% top */
974 refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot */
975 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
976 checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
977 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
978 bestInter = &md.pred[PRED_2NxnD];
979 }
980 if (bVer)
981 {
982 refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25% left */
983 refMasks[1] = allSplitRefs; /* 75% right */
984 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
985 checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
986 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
987 bestInter = &md.pred[PRED_nLx2N];
988
989 refMasks[0] = allSplitRefs; /* 75% left */
990 refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25% right */
991 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
992 checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
993 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
994 bestInter = &md.pred[PRED_nRx2N];
995 }
996 }
997 bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
998 if (m_param->rdLevel >= 3)
999 {
1000 /* Calculate RD cost of best inter option */
1001 if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1002 {
1003 uint32_t numPU = bestInter->cu.getNumPartInter(0);
1004 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1005 {
1006 PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1007 motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1008 }
1009 }
1010 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1011 checkBestMode(*bestInter, depth);
1012
1013 /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1014 if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1015 md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1016 {
1017 encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1018 checkBestMode(md.pred[PRED_BIDIR], depth);
1019 }
1020
1021 if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
1022 md.bestMode->sa8dCost == MAX_INT64)
1023 {
1024 if (!m_param->limitReferences || splitIntra)
1025 {
1026 ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1027 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1028 checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1029 encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
1030 checkBestMode(md.pred[PRED_INTRA], depth);
1031 }
1032 else
1033 {
1034 ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1035 }
1036 }
1037 }
1038 else
1039 {
1040 /* SA8D choice between merge/skip, inter, bidir, and intra */
1041 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1042 md.bestMode = bestInter;
1043
1044 if (m_slice->m_sliceType == B_SLICE &&
1045 md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1046 md.bestMode = &md.pred[PRED_BIDIR];
1047
1048 if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
1049 {
1050 if (!m_param->limitReferences || splitIntra)
1051 {
1052 ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1053 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1054 checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1055 if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1056 md.bestMode = &md.pred[PRED_INTRA];
1057 }
1058 else
1059 {
1060 ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1061 }
1062 }
1063
1064 /* finally code the best mode selected by SA8D costs:
1065 * RD level 2 - fully encode the best mode
1066 * RD level 1 - generate recon pixels
1067 * RD level 0 - generate chroma prediction */
1068 if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
1069 {
1070 /* prediction already generated for this CU, and if rd level
1071 * is not 0, it is already fully encoded */
1072 }
1073 else if (md.bestMode->cu.isInter(0))
1074 {
1075 uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1076 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1077 {
1078 PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1079 motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1080 }
1081 if (m_param->rdLevel == 2)
1082 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1083 else if (m_param->rdLevel == 1)
1084 {
1085 /* generate recon pixels with no rate distortion considerations */
1086 CUData& cu = md.bestMode->cu;
1087
1088 uint32_t tuDepthRange[2];
1089 cu.getInterTUQtDepthRange(tuDepthRange, 0);
1090
1091 m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
1092 residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1093 if (cu.getQtRootCbf(0))
1094 md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
1095 else
1096 {
1097 md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
1098 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
1099 cu.setPredModeSubParts(MODE_SKIP);
1100 }
1101 }
1102 }
1103 else
1104 {
1105 if (m_param->rdLevel == 2)
1106 encodeIntraInInter(*md.bestMode, cuGeom);
1107 else if (m_param->rdLevel == 1)
1108 {
1109 /* generate recon pixels with no rate distortion considerations */
1110 CUData& cu = md.bestMode->cu;
1111
1112 uint32_t tuDepthRange[2];
1113 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1114
1115 residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1116 getBestIntraModeChroma(*md.bestMode, cuGeom);
1117 residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
1118 md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
1119 }
1120 }
1121 }
1122 } // !earlyskip
1123
1124 if (m_bTryLossless)
1125 tryLossless(cuGeom);
1126
1127 if (mightSplit)
1128 addSplitFlagCost(*md.bestMode, cuGeom.depth);
1129 }
1130
1131 if (mightSplit && !bNoSplit)
1132 {
1133 Mode* splitPred = &md.pred[PRED_SPLIT];
1134 if (!md.bestMode)
1135 md.bestMode = splitPred;
1136 else if (m_param->rdLevel > 1)
1137 checkBestMode(*splitPred, cuGeom.depth);
1138 else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
1139 md.bestMode = splitPred;
1140
1141 checkDQPForSplitPred(*md.bestMode, cuGeom);
1142 }
1143
1144 /* determine which motion references the parent CU should search */
1145 uint32_t refMask;
1146 if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
1147 refMask = 0;
1148 else if (md.bestMode == &md.pred[PRED_SPLIT])
1149 refMask = allSplitRefs;
1150 else
1151 {
1152 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1153 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1154 uint32_t numPU = cu.getNumPartInter(0);
1155 refMask = 0;
1156 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1157 refMask |= cu.getBestRefIdx(subPartIdx);
1158 }
1159
1160 if (mightNotSplit)
1161 {
1162 /* early-out statistics */
1163 FrameData& curEncData = *m_frame->m_encData;
1164 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1165 uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1166 cuStat.count[depth] += 1;
1167 cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1168 }
1169
1170 /* Copy best data to encData CTU and recon */
1171 X265_CHECK(md.bestMode->ok(), "best mode is not ok");
1172 md.bestMode->cu.copyToPic(depth);
1173 if (m_param->rdLevel)
1174 md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
1175
1176 return refMask;
1177 }
1178
1179 uint32_t Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)
1180 {
1181 uint32_t depth = cuGeom.depth;
1182 ModeDepth& md = m_modeDepth[depth];
1183 md.bestMode = NULL;
1184
1185 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1186 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1187
1188 if (m_param->analysisMode == X265_ANALYSIS_LOAD)
1189 {
1190 uint8_t* reuseDepth = &m_reuseInterDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
1191 uint8_t* reuseModes = &m_reuseInterDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
1192 if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx && reuseModes[zOrder] == MODE_SKIP)
1193 {
1194 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1195 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1196 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, true);
1197
1198 if (m_bTryLossless)
1199 tryLossless(cuGeom);
1200
1201 if (mightSplit)
1202 addSplitFlagCost(*md.bestMode, cuGeom.depth);
1203
1204 // increment zOrder offset to point to next best depth in sharedDepth buffer
1205 zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
1206
1207 mightSplit = false;
1208 mightNotSplit = false;
1209 }
1210 }
1211
1212 bool foundSkip = false;
1213 bool splitIntra = true;
1214 uint32_t splitRefs[4] = { 0, 0, 0, 0 };
1215 /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
1216 if (mightNotSplit)
1217 {
1218 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1219 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1220 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
1221 foundSkip = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
1222 }
1223
1224 // estimate split cost
1225 /* Step 2. Evaluate each of the 4 split sub-blocks in series */
1226 if (mightSplit && !foundSkip)
1227 {
1228 Mode* splitPred = &md.pred[PRED_SPLIT];
1229 splitPred->initCosts();
1230 CUData* splitCU = &splitPred->cu;
1231 splitCU->initSubCU(parentCTU, cuGeom, qp);
1232
1233 uint32_t nextDepth = depth + 1;
1234 ModeDepth& nd = m_modeDepth[nextDepth];
1235 invalidateContexts(nextDepth);
1236 Entropy* nextContext = &m_rqt[depth].cur;
1237 int nextQP = qp;
1238 splitIntra = false;
1239
1240 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1241 {
1242 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1243 if (childGeom.flags & CUGeom::PRESENT)
1244 {
1245 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
1246 m_rqt[nextDepth].cur.load(*nextContext);
1247
1248 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
1249 nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
1250
1251 splitRefs[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, zOrder, nextQP);
1252
1253 // Save best CU and pred data for this sub CU
1254 splitIntra |= nd.bestMode->cu.isIntra(0);
1255 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
1256 splitPred->addSubCosts(*nd.bestMode);
1257 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
1258 nextContext = &nd.bestMode->contexts;
1259 }
1260 else
1261 {
1262 splitCU->setEmptyPart(childGeom, subPartIdx);
1263 zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
1264 }
1265 }
1266 nextContext->store(splitPred->contexts);
1267 if (mightNotSplit)
1268 addSplitFlagCost(*splitPred, cuGeom.depth);
1269 else
1270 updateModeCost(*splitPred);
1271
1272 checkDQPForSplitPred(*splitPred, cuGeom);
1273 }
1274
1275 /* Split CUs
1276 * 0 1
1277 * 2 3 */
1278 uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
1279 /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
1280 if (mightNotSplit)
1281 {
1282 if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
1283 setLambdaFromQP(parentCTU, qp);
1284
1285 if (!(foundSkip && m_param->bEnableEarlySkip))
1286 {
1287 uint32_t refMasks[2];
1288 refMasks[0] = allSplitRefs;
1289 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1290 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1291 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1292
1293 if (m_param->limitReferences & X265_REF_LIMIT_CU)
1294 {
1295 CUData& cu = md.pred[PRED_2Nx2N].cu;
1296 uint32_t refMask = cu.getBestRefIdx(0);
1297 allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2] = splitRefs[3] = refMask;
1298 }
1299
1300 if (m_slice->m_sliceType == B_SLICE)
1301 {
1302 md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
1303 checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
1304 if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
1305 {
1306 encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1307 checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
1308 }
1309 }
1310
1311 if (m_param->bEnableRectInter)
1312 {
1313 refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */
1314 refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */
1315 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1316 checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1317 checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
1318
1319 refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */
1320 refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */
1321 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1322 checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1323 checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
1324 }
1325
1326 // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
1327 if (m_slice->m_sps->maxAMPDepth > depth)
1328 {
1329 bool bHor = false, bVer = false;
1330 if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
1331 bHor = true;
1332 else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
1333 bVer = true;
1334 else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
1335 {
1336 bHor = true;
1337 bVer = true;
1338 }
1339
1340 if (bHor)
1341 {
1342 refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top */
1343 refMasks[1] = allSplitRefs; /* 75% bot */
1344 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1345 checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1346 checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
1347
1348 refMasks[0] = allSplitRefs; /* 75% top */
1349 refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot */
1350 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1351 checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1352 checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
1353 }
1354 if (bVer)
1355 {
1356 refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25% left */
1357 refMasks[1] = allSplitRefs; /* 75% right */
1358 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1359 checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1360 checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
1361
1362 refMasks[0] = allSplitRefs; /* 75% left */
1363 refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25% right */
1364 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1365 checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1366 checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
1367 }
1368 }
1369
1370 if (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames)
1371 {
1372 if (!m_param->limitReferences || splitIntra)
1373 {
1374 ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1375 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1376 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
1377 checkBestMode(md.pred[PRED_INTRA], depth);
1378
1379 if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
1380 {
1381 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1382 checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
1383 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
1384 }
1385 }
1386 else
1387 {
1388 ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1389 }
1390 }
1391 }
1392
1393 if (m_bTryLossless)
1394 tryLossless(cuGeom);
1395
1396 if (mightSplit)
1397 addSplitFlagCost(*md.bestMode, cuGeom.depth);
1398 }
1399
1400 /* compare split RD cost against best cost */
1401 if (mightSplit && !foundSkip)
1402 checkBestMode(md.pred[PRED_SPLIT], depth);
1403
1404 /* determine which motion references the parent CU should search */
1405 uint32_t refMask;
1406 if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
1407 refMask = 0;
1408 else if (md.bestMode == &md.pred[PRED_SPLIT])
1409 refMask = allSplitRefs;
1410 else
1411 {
1412 /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1413 CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1414 uint32_t numPU = cu.getNumPartInter(0);
1415 refMask = 0;
1416 for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1417 refMask |= cu.getBestRefIdx(subPartIdx);
1418 }
1419
1420 /* Copy best data to encData CTU and recon */
1421 X265_CHECK(md.bestMode->ok(), "best mode is not ok");
1422 md.bestMode->cu.copyToPic(depth);
1423 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
1424
1425 return refMask;
1426 }
1427
1428 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1429 void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
1430 {
1431 uint32_t depth = cuGeom.depth;
1432 ModeDepth& md = m_modeDepth[depth];
1433 Yuv *fencYuv = &md.fencYuv;
1434
1435 /* Note that these two Mode instances are named MERGE and SKIP but they may
1436 * hold the reverse when the function returns. We toggle between the two modes */
1437 Mode* tempPred = &merge;
1438 Mode* bestPred = &skip;
1439
1440 X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
1441
1442 tempPred->initCosts();
1443 tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1444 tempPred->cu.setPredModeSubParts(MODE_INTER);
1445 tempPred->cu.m_mergeFlag[0] = true;
1446
1447 bestPred->initCosts();
1448 bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1449 bestPred->cu.setPredModeSubParts(MODE_INTER);
1450 bestPred->cu.m_mergeFlag[0] = true;
1451
1452 MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1453 uint8_t candDir[MRG_MAX_NUM_CANDS];
1454 uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
1455 PredictionUnit pu(merge.cu, cuGeom, 0);
1456
1457 bestPred->sa8dCost = MAX_INT64;
1458 int bestSadCand = -1;
1459 int sizeIdx = cuGeom.log2CUSize - 2;
1460
1461 for (uint32_t i = 0; i < numMergeCand; ++i)
1462 {
1463 if (m_bFrameParallel &&
1464 (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1465 candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1466 continue;
1467
1468 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
1469 X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
1470 tempPred->cu.m_interDir[0] = candDir[i];
1471 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
1472 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
1473 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
1474 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
1475
1476 motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d);
1477
1478 tempPred->sa8dBits = getTUBits(i, numMergeCand);
1479 tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
1480 if (m_bChromaSa8d)
1481 {
1482 tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
1483 tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
1484 }
1485 tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);
1486
1487 if (tempPred->sa8dCost < bestPred->sa8dCost)
1488 {
1489 bestSadCand = i;
1490 std::swap(tempPred, bestPred);
1491 }
1492 }
1493
1494 /* force mode decision to take inter or intra */
1495 if (bestSadCand < 0)
1496 return;
1497
1498 /* calculate the motion compensation for chroma for the best mode selected */
1499 if (!m_bChromaSa8d) /* Chroma MC was done above */
1500 motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
1501
1502 if (m_param->rdLevel)
1503 {
1504 if (m_param->bLossless)
1505 bestPred->rdCost = MAX_INT64;
1506 else
1507 encodeResAndCalcRdSkipCU(*bestPred);
1508
1509 /* Encode with residual */
1510 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
1511 tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
1512 tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
1513 tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
1514 tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
1515 tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
1516 tempPred->sa8dCost = bestPred->sa8dCost;
1517 tempPred->sa8dBits = bestPred->sa8dBits;
1518 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1519
1520 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1521
1522 md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
1523 }
1524 else
1525 md.bestMode = bestPred;
1526
1527 /* broadcast sets of MV field data */
1528 md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
1529 md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
1530 md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
1531 md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
1532 md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
1533 checkDQP(*md.bestMode, cuGeom);
1534 X265_CHECK(md.bestMode->ok(), "Merge mode not ok\n");
1535 }
1536
1537 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1538 void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand)
1539 {
1540 uint32_t depth = cuGeom.depth;
1541
1542 /* Note that these two Mode instances are named MERGE and SKIP but they may
1543 * hold the reverse when the function returns. We toggle between the two modes */
1544 Mode* tempPred = &merge;
1545 Mode* bestPred = &skip;
1546
1547 merge.initCosts();
1548 merge.cu.setPredModeSubParts(MODE_INTER);
1549 merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
1550 merge.cu.m_mergeFlag[0] = true;
1551
1552 skip.initCosts();
1553 skip.cu.setPredModeSubParts(MODE_INTER);
1554 skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
1555 skip.cu.m_mergeFlag[0] = true;
1556
1557 MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1558 uint8_t candDir[MRG_MAX_NUM_CANDS];
1559 uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
1560 PredictionUnit pu(merge.cu, cuGeom, 0);
1561
1562 bool foundCbf0Merge = false;
1563 bool triedPZero = false, triedBZero = false;
1564 bestPred->rdCost = MAX_INT64;
1565
1566 uint32_t first = 0, last = numMergeCand;
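    /* when analysis reuse supplies a previously chosen best merge candidate,
     * restrict the evaluation to that single index */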
1567 if (isShareMergeCand)
1568 {
1569 first = *m_reuseBestMergeCand;
1570 last = first + 1;
1571 }
1572
1573 for (uint32_t i = first; i < last; i++)
1574 {
1575 if (m_bFrameParallel &&
1576 (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1577 candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1578 continue;
1579
1580 /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
1581 if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
1582 {
1583 if (triedPZero)
1584 continue;
1585 triedPZero = true;
1586 }
1587 else if (candDir[i] == 3 &&
1588 !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
1589 !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
1590 {
1591 if (triedBZero)
1592 continue;
1593 triedBZero = true;
1594 }
1595
1596 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
1597 tempPred->cu.m_interDir[0] = candDir[i];
1598 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
1599 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
1600 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
1601 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
1602 tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
1603
1604 motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true);
1605
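        /* each surviving candidate is evaluated up to twice: as merge with
         * residual coding (until a zero-CBF merge has been found) and as a skip
         * with no residual; std::swap keeps bestPred pointing at the cheaper Mode */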
1606 uint8_t hasCbf = true;
1607 bool swapped = false;
1608 if (!foundCbf0Merge)
1609 {
1610 /* until a candidate is found whose coded residual is entirely zero (CBF == 0),
1611  * evaluate merge with residual coding */
1612 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1613 hasCbf = tempPred->cu.getQtRootCbf(0);
1614 foundCbf0Merge = !hasCbf;
1615
1616 if (tempPred->rdCost < bestPred->rdCost)
1617 {
1618 std::swap(tempPred, bestPred);
1619 swapped = true;
1620 }
1621 }
1622 if (!m_param->bLossless && hasCbf)
1623 {
1624 /* try merge without residual (skip), if not lossless coding */
1625
1626 if (swapped)
1627 {
1628 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
1629 tempPred->cu.m_interDir[0] = candDir[i];
1630 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
1631 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
1632 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
1633 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
1634 tempPred->cu.setPredModeSubParts(MODE_INTER);
1635 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1636 }
1637
1638 encodeResAndCalcRdSkipCU(*tempPred);
1639
1640 if (tempPred->rdCost < bestPred->rdCost)
1641 std::swap(tempPred, bestPred);
1642 }
1643 }
1644
1645 if (bestPred->rdCost < MAX_INT64)
1646 {
1647 m_modeDepth[depth].bestMode = bestPred;
1648
1649 /* broadcast sets of MV field data */
1650 uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
1651 bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
1652 bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
1653 bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
1654 bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
1655 bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
1656 checkDQP(*bestPred, cuGeom);
1657 X265_CHECK(bestPred->ok(), "merge mode is not ok\n");
1658 }
1659
1660 if (m_param->analysisMode)
1661 {
1662 m_reuseBestMergeCand++;
1663 if (m_param->analysisMode == X265_ANALYSIS_SAVE)
1664 *m_reuseBestMergeCand = bestPred->cu.m_mvpIdx[0][0];
1665 }
1666 }
1667
1668 void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
1669 {
1670 interMode.initCosts();
1671 interMode.cu.setPartSizeSubParts(partSize);
1672 interMode.cu.setPredModeSubParts(MODE_INTER);
1673 int numPredDir = m_slice->isInterP() ? 1 : 2;
1674
1675 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
1676 {
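        /* analysis reuse (load): seed each PU's best reference index per
         * direction from the saved analysis data for reuse by the motion search */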
1677 uint32_t numPU = interMode.cu.getNumPartInter(0);
1678 for (uint32_t part = 0; part < numPU; part++)
1679 {
1680 MotionData* bestME = interMode.bestME[part];
1681 for (int32_t i = 0; i < numPredDir; i++)
1682 {
1683 bestME[i].ref = *m_reuseRef;
1684 m_reuseRef++;
1685 }
1686 }
1687 }
1688
1689 predInterSearch(interMode, cuGeom, m_bChromaSa8d, refMask);
1690
1691 /* predInterSearch sets interMode.sa8dBits */
1692 const Yuv& fencYuv = *interMode.fencYuv;
1693 Yuv& predYuv = interMode.predYuv;
1694 int part = partitionFromLog2Size(cuGeom.log2CUSize);
1695 interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
1696 if (m_bChromaSa8d)
1697 {
1698 interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
1699 interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
1700 }
1701 interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
1702
1703 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
1704 {
1705 uint32_t numPU = interMode.cu.getNumPartInter(0);
1706 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1707 {
1708 MotionData* bestME = interMode.bestME[puIdx];
1709 for (int32_t i = 0; i < numPredDir; i++)
1710 {
1711 *m_reuseRef = bestME[i].ref;
1712 m_reuseRef++;
1713 }
1714 }
1715 }
1716 }
1717
1718 void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
1719 {
1720 interMode.initCosts();
1721 interMode.cu.setPartSizeSubParts(partSize);
1722 interMode.cu.setPredModeSubParts(MODE_INTER);
1723 int numPredDir = m_slice->isInterP() ? 1 : 2;
1724
1725 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
1726 {
1727 uint32_t numPU = interMode.cu.getNumPartInter(0);
1728 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1729 {
1730 MotionData* bestME = interMode.bestME[puIdx];
1731 for (int32_t i = 0; i < numPredDir; i++)
1732 {
1733 bestME[i].ref = *m_reuseRef;
1734 m_reuseRef++;
1735 }
1736 }
1737 }
1738
1739 predInterSearch(interMode, cuGeom, true, refMask);
1740
1741 /* predInterSearch sets interMode.sa8dBits, but this is ignored */
1742 encodeResAndCalcRdInterCU(interMode, cuGeom);
1743
1744 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
1745 {
1746 uint32_t numPU = interMode.cu.getNumPartInter(0);
1747 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1748 {
1749 MotionData* bestME = interMode.bestME[puIdx];
1750 for (int32_t i = 0; i < numPredDir; i++)
1751 {
1752 *m_reuseRef = bestME[i].ref;
1753 m_reuseRef++;
1754 }
1755 }
1756 }
1757 }
1758
1759 void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
1760 {
1761 CUData& cu = bidir2Nx2N.cu;
1762
1763 if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
1764 {
1765 bidir2Nx2N.sa8dCost = MAX_INT64;
1766 bidir2Nx2N.rdCost = MAX_INT64;
1767 return;
1768 }
1769
1770 const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
1771 MV mvzero(0, 0);
1772 int partEnum = cuGeom.log2CUSize - 2;
1773
1774 bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
1775 bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
1776 MotionData* bestME = bidir2Nx2N.bestME[0];
1777 int ref0 = bestME[0].ref;
1778 MV mvp0 = bestME[0].mvp;
1779 int mvpIdx0 = bestME[0].mvpIdx;
1780 int ref1 = bestME[1].ref;
1781 MV mvp1 = bestME[1].mvp;
1782 int mvpIdx1 = bestME[1].mvpIdx;
1783
1784 bidir2Nx2N.initCosts();
1785 cu.setPartSizeSubParts(SIZE_2Nx2N);
1786 cu.setPredModeSubParts(MODE_INTER);
1787 cu.setPUInterDir(3, 0, 0);
1788 cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
1789 cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
1790 cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
1791 cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
1792 cu.m_mergeFlag[0] = 0;
1793
1794 /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
1795 cu.setPUMv(0, bestME[0].mv, 0, 0);
1796 cu.m_mvd[0][0] = bestME[0].mv - mvp0;
1797
1798 cu.setPUMv(1, bestME[1].mv, 0, 0);
1799 cu.m_mvd[1][0] = bestME[1].mv - mvp1;
1800
1801 PredictionUnit pu(cu, cuGeom, 0);
1802 motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d);
1803
1804 int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
1805 if (m_bChromaSa8d)
1806 {
1807 /* Add in chroma distortion */
1808 sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
1809 sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
1810 }
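    /* bestME[l].bits already carries the per-list selection bits, so those are
     * swapped here for the bidirectional list-selection cost */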
1811 bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1812 bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
1813
1814 bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
1815 if (bTryZero)
1816 {
1817 /* Do not try zero MV if unidir motion predictors are beyond
1818 * valid search area */
1819 MV mvmin, mvmax;
1820 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
1821 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
1822 mvmax.y += 2; // there is some pad for subpel refine
1823 mvmin <<= 2;
1824 mvmax <<= 2;
1825
1826 bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
1827 bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
1828 }
1829 if (bTryZero)
1830 {
1831 /* Estimate cost of BIDIR using coincident blocks */
1832 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1833
1834 int zsa8d;
1835
1836 if (m_bChromaSa8d)
1837 {
1838 cu.m_mv[0][0] = mvzero;
1839 cu.m_mv[1][0] = mvzero;
1840
1841 motionCompensation(cu, pu, tmpPredYuv, true, true);
1842
1843 zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1844 zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
1845 zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
1846 }
1847 else
1848 {
1849 pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
1850 pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
1851 intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
1852
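            /* luma-only fast path: approximate the zero-MV bidir prediction by
             * averaging the co-located blocks of the two references directly;
             * the final argument of 32 gives both references equal weight */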
1853 primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
1854 zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1855 }
1856
1857 uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
1858 uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
1859 uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
1860
1861 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
1862 mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
1863 mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);
1864
1865 uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1866 zcost = zsa8d + m_rdCost.getCost(zbits);
1867
1868 if (zcost < bidir2Nx2N.sa8dCost)
1869 {
1870 bidir2Nx2N.sa8dBits = zbits;
1871 bidir2Nx2N.sa8dCost = zcost;
1872
1873 cu.setPUMv(0, mvzero, 0, 0);
1874 cu.m_mvd[0][0] = mvzero - mvp0;
1875 cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
1876
1877 cu.setPUMv(1, mvzero, 0, 0);
1878 cu.m_mvd[1][0] = mvzero - mvp1;
1879 cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
1880
1881 if (m_bChromaSa8d)
1882 /* real MC was already performed */
1883 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
1884 else
1885 motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, true);
1886 }
1887 else if (m_bChromaSa8d)
1888 {
1889 /* recover overwritten motion vectors */
1890 cu.m_mv[0][0] = bestME[0].mv;
1891 cu.m_mv[1][0] = bestME[1].mv;
1892 }
1893 }
1894 }
1895
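/* Final encode pass for the case where analysis produced no per-CU recon
 * (the rd-level 0 path): re-walk the CTU's chosen partitioning, regenerate the
 * residual from the accumulated prediction, and write recon pixels to the
 * frame reconstruction buffer. */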
1896 void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
1897 {
1898 if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < g_maxCUDepth)
1899 {
1900 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1901 {
1902 const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1903 if (childGeom.flags & CUGeom::PRESENT)
1904 encodeResidue(ctu, childGeom);
1905 }
1906 return;
1907 }
1908
1909 uint32_t absPartIdx = cuGeom.absPartIdx;
1910 int sizeIdx = cuGeom.log2CUSize - 2;
1911
1912 /* reuse the bestMode data structures at the current depth */
1913 Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
1914 CUData& cu = bestMode->cu;
1915
1916 cu.copyFromPic(ctu, cuGeom);
1917
1918 PicYuv& reconPic = *m_frame->m_reconPic;
1919
1920 Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
1921 if (cuGeom.depth)
1922 m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
1923 X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");
1924
1925 if (cu.isIntra(0))
1926 {
1927 ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough
1928
1929 uint32_t tuDepthRange[2];
1930 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1931
1932 residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
1933 getBestIntraModeChroma(*bestMode, cuGeom);
1934 residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
1935 }
1936 else // if (cu.isInter(0))
1937 {
1938 ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough
1939
1940 X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");
1941
1942 /* Calculate residual for current CU part into depth sized resiYuv */
1943
1944 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1945
1946 /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
1947 Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
1948 pixel* predY = predYuv.getLumaAddr(absPartIdx);
1949 pixel* predU = predYuv.getCbAddr(absPartIdx);
1950 pixel* predV = predYuv.getCrAddr(absPartIdx);
1951
1952 primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
1953 fencYuv.m_buf[0], predY,
1954 fencYuv.m_size, predYuv.m_size);
1955
1956 primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
1957 fencYuv.m_buf[1], predU,
1958 fencYuv.m_csize, predYuv.m_csize);
1959
1960 primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
1961 fencYuv.m_buf[2], predV,
1962 fencYuv.m_csize, predYuv.m_csize);
1963
1964 uint32_t tuDepthRange[2];
1965 cu.getInterTUQtDepthRange(tuDepthRange, 0);
1966
1967 residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);
1968
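        /* a 2Nx2N merge whose residual quantized to all zeros is signalled as a skip CU */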
1969 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
1970 cu.setPredModeSubParts(MODE_SKIP);
1971
1972 /* residualTransformQuantInter() wrote transformed residual back into
1973 * resiYuv. Generate the recon pixels by adding it to the prediction */
1974
1975 if (cu.m_cbf[0][0])
1976 primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
1977 predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
1978 else
1979 primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
1980 predY, predYuv.m_size);
1981
1982 if (cu.m_cbf[1][0])
1983 primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1984 predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
1985 else
1986 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1987 predU, predYuv.m_csize);
1988
1989 if (cu.m_cbf[2][0])
1990 primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1991 predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
1992 else
1993 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1994 predV, predYuv.m_csize);
1995 }
1996
1997 cu.updatePic(cuGeom.depth);
1998 }
1999
2000 void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
2001 {
2002 if (m_param->rdLevel >= 3)
2003 {
2004 /* code the split flag (0 or 1) and update bit costs */
2005 mode.contexts.resetBits();
2006 mode.contexts.codeSplitFlag(mode.cu, 0, depth);
2007 uint32_t bits = mode.contexts.getNumberOfWrittenBits();
2008 mode.mvBits += bits;
2009 mode.totalBits += bits;
2010 updateModeCost(mode);
2011 }
2012 else if (m_param->rdLevel <= 1)
2013 {
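        /* rd-levels 0 and 1 track costs in the sa8d domain, so the split flag is
         * approximated there as a single extra bit */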
2014 mode.sa8dBits++;
2015 mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
2016 }
2017 else
2018 {
2019 mode.mvBits++;
2020 mode.totalBits++;
2021 updateModeCost(mode);
2022 }
2023 }
2024
2025 uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
2026 {
2027 /* Do not attempt to code a block larger than the largest block in the
2028 * co-located CTUs in L0 and L1 */
2029 int currentQP = parentCTU.m_qp[0];
2030 int previousQP = currentQP;
2031 uint32_t minDepth0 = 4, minDepth1 = 4;
2032 uint32_t sum = 0;
2033 int numRefs = 0;
2034 if (m_slice->m_numRefIdx[0])
2035 {
2036 numRefs++;
2037 const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
2038 previousQP = cu.m_qp[0];
2039 if (!cu.m_cuDepth[cuGeom.absPartIdx])
2040 return 0;
2041 for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
2042 {
2043 uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
2044 minDepth0 = X265_MIN(d, minDepth0);
2045 sum += d;
2046 }
2047 }
2048 if (m_slice->m_numRefIdx[1])
2049 {
2050 numRefs++;
2051 const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
2052 if (!cu.m_cuDepth[cuGeom.absPartIdx])
2053 return 0;
2054 for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
2055 {
2056 uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
2057 minDepth1 = X265_MIN(d, minDepth1);
2058 sum += d;
2059 }
2060 }
2061 if (!numRefs)
2062 return 0;
2063
2064 uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
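    /* depths were sampled every fourth partition for each reference, so thresh
     * is the sum that would result if every sample equalled minDepth */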
2065 uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
2066
2067 /* allow one level of block size growth when QP is not decreasing and the
2068  * average sampled depth is at most 1.5x the minimum depth */
2069 if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
2070 minDepth -= 1;
2071
2072 return minDepth;
2073 }
2074
2075 /* returns true if recursion should be stopped */
2076 bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
2077 {
2078 /* early exit when the RD cost of the best mode at depth "n" is already below
2079  * a weighted average of the costs previously recorded at depth "n" for this
2080  * CTU and its available neighbours (above, above-left, above-right, left) */
2082
2083 uint32_t depth = cuGeom.depth;
2084 FrameData& curEncData = *m_frame->m_encData;
2085 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
2086 uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
2087 uint64_t cuCount = cuStat.count[depth];
2088
2089 uint64_t neighCost = 0, neighCount = 0;
2090 const CUData* above = parentCTU.m_cuAbove;
2091 if (above)
2092 {
2093 FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
2094 neighCost += astat.avgCost[depth] * astat.count[depth];
2095 neighCount += astat.count[depth];
2096
2097 const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
2098 if (aboveLeft)
2099 {
2100 FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
2101 neighCost += lstat.avgCost[depth] * lstat.count[depth];
2102 neighCount += lstat.count[depth];
2103 }
2104
2105 const CUData* aboveRight = parentCTU.m_cuAboveRight;
2106 if (aboveRight)
2107 {
2108 FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
2109 neighCost += rstat.avgCost[depth] * rstat.count[depth];
2110 neighCount += rstat.count[depth];
2111 }
2112 }
2113 const CUData* left = parentCTU.m_cuLeft;
2114 if (left)
2115 {
2116 FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
2117 neighCost += nstat.avgCost[depth] * nstat.count[depth];
2118 neighCount += nstat.count[depth];
2119 }
2120
2121 // weight this CTU's accumulated cost 60% (x3) and the neighbouring CTUs' costs 40% (x2)
2122 if (neighCount + cuCount)
2123 {
2124 uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
2125 uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
2126 if (curCost < avgCost && avgCost)
2127 return true;
2128 }
2129
2130 return false;
2131 }
2132
2133 int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom)
2134 {
2135 FrameData& curEncData = *m_frame->m_encData;
2136 double qp = curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
2137
2138 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
2139 bool isReferenced = IS_REFERENCED(m_frame);
2140 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
2141 if (qpoffs)
2142 {
2143 uint32_t width = m_frame->m_fencPic->m_picWidth;
2144 uint32_t height = m_frame->m_fencPic->m_picHeight;
2145 uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
2146 uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
2147 uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
2148 uint32_t blockSize = g_maxCUSize >> cuGeom.depth;
2149 double qp_offset = 0;
2150 uint32_t cnt = 0;
2151 uint32_t idx;
2152
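        /* qpoffs holds one offset per 16x16 lowres block; average the offsets of
         * every 16x16 block covered by this CU, clipped to the picture edges */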
2153 for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += 16)
2154 {
2155 for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += 16)
2156 {
2157 idx = ((block_yy / 16) * (maxCols)) + (block_xx / 16);
2158 qp_offset += qpoffs[idx];
2159 cnt++;
2160 }
2161 }
2162
2163 qp_offset /= cnt;
2164 qp += qp_offset;
2165 }
2166
2167 return x265_clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5));
2168 }