0
|
1 /*****************************************************************************
|
|
2 * Copyright (C) 2013 x265 project
|
|
3 *
|
|
4 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
|
|
5 * Steve Borho <steve@borho.org>
|
|
6 *
|
|
7 * This program is free software; you can redistribute it and/or modify
|
|
8 * it under the terms of the GNU General Public License as published by
|
|
9 * the Free Software Foundation; either version 2 of the License, or
|
|
10 * (at your option) any later version.
|
|
11 *
|
|
12 * This program is distributed in the hope that it will be useful,
|
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 * GNU General Public License for more details.
|
|
16 *
|
|
17 * You should have received a copy of the GNU General Public License
|
|
18 * along with this program; if not, write to the Free Software
|
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
|
20 *
|
|
21 * This program is also available under a commercial proprietary license.
|
|
22 * For more information, contact us at license @ x265.com.
|
|
23 *****************************************************************************/
|
|
24
|
|
25 #include "common.h"
|
|
26 #include "frame.h"
|
|
27 #include "framedata.h"
|
|
28 #include "picyuv.h"
|
|
29 #include "primitives.h"
|
|
30 #include "threading.h"
|
|
31
|
|
32 #include "analysis.h"
|
|
33 #include "rdcost.h"
|
|
34 #include "encoder.h"
|
|
35
|
|
36 using namespace X265_NS;
|
|
37
|
|
38 /* An explanation of rate distortion levels (--rd-level)
|
|
39 *
|
|
40 * rd-level 0 generates no recon per CU (NO RDO or Quant)
|
|
41 *
|
|
42 * sa8d selection between merge / skip / inter / intra and split
|
|
43 * no recon pixels generated until CTU analysis is complete, requiring
|
|
44 * intra predictions to use source pixels
|
|
45 *
|
|
46 * rd-level 1 uses RDO for merge and skip, sa8d for all else
|
|
47 *
|
|
48 * RDO selection between merge and skip
|
|
49 * sa8d selection between (merge/skip) / inter modes / intra and split
|
|
50 * intra prediction uses reconstructed pixels
|
|
51 *
|
|
52 * rd-level 2 uses RDO for merge/skip and split
|
|
53 *
|
|
54 * RDO selection between merge and skip
|
|
55 * sa8d selection between (merge/skip) / inter modes / intra
|
|
56 * RDO split decisions
|
|
57 *
|
|
58 * rd-level 3 uses RDO for merge/skip/best inter/intra
|
|
59 *
|
|
60 * RDO selection between merge and skip
|
|
61 * sa8d selection of best inter mode
|
|
62 * sa8d decisions include chroma residual cost
|
|
63 * RDO selection between (merge/skip) / best inter mode / intra / split
|
|
64 *
|
|
65 * rd-level 4 enables RDOQuant
|
|
66 * chroma residual cost included in satd decisions, including subpel refine
|
|
67 * (as a result of --subme 3 being used by preset slow)
|
|
68 *
|
|
69 * rd-level 5,6 does RDO for each inter mode
|
|
70 */
|
|
71
|
|
72 Analysis::Analysis()
|
|
73 {
|
|
74 m_reuseIntraDataCTU = NULL;
|
|
75 m_reuseInterDataCTU = NULL;
|
|
76 m_reuseRef = NULL;
|
|
77 m_reuseBestMergeCand = NULL;
|
|
78 }
|
|
79
|
|
80 bool Analysis::create(ThreadLocalData *tld)
|
|
81 {
|
|
82 m_tld = tld;
|
|
83 m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
|
|
84 m_bChromaSa8d = m_param->rdLevel >= 3;
|
|
85
|
|
86 int csp = m_param->internalCsp;
|
|
87 uint32_t cuSize = g_maxCUSize;
|
|
88
|
|
89 bool ok = true;
|
|
90 for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1)
|
|
91 {
|
|
92 ModeDepth &md = m_modeDepth[depth];
|
|
93
|
|
94 md.cuMemPool.create(depth, csp, MAX_PRED_TYPES);
|
|
95 ok &= md.fencYuv.create(cuSize, csp);
|
|
96
|
|
97 for (int j = 0; j < MAX_PRED_TYPES; j++)
|
|
98 {
|
|
99 md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j);
|
|
100 ok &= md.pred[j].predYuv.create(cuSize, csp);
|
|
101 ok &= md.pred[j].reconYuv.create(cuSize, csp);
|
|
102 md.pred[j].fencYuv = &md.fencYuv;
|
|
103 }
|
|
104 }
|
|
105
|
|
106 return ok;
|
|
107 }
|
|
108
|
|
109 void Analysis::destroy()
|
|
110 {
|
|
111 for (uint32_t i = 0; i <= g_maxCUDepth; i++)
|
|
112 {
|
|
113 m_modeDepth[i].cuMemPool.destroy();
|
|
114 m_modeDepth[i].fencYuv.destroy();
|
|
115
|
|
116 for (int j = 0; j < MAX_PRED_TYPES; j++)
|
|
117 {
|
|
118 m_modeDepth[i].pred[j].predYuv.destroy();
|
|
119 m_modeDepth[i].pred[j].reconYuv.destroy();
|
|
120 }
|
|
121 }
|
|
122 }
|
|
123
|
|
/* Top-level entry point for analyzing one CTU.
 *
 * Sets up per-CTU state (lambda/QP, entropy context, source pixels and
 * analysis-reuse buffers), dispatches to the slice-type / rd-level
 * specific recursive compress function, optionally saves the winning
 * decisions for analysis reuse, and returns the best Mode at depth 0. */
Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;

#if _DEBUG || CHECKED_BUILD
    /* poison all Mode costs and entropy contexts so stale data from the
     * previous CTU cannot be silently reused in checked builds */
    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
        for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
            m_modeDepth[i].pred[j].invalidate();
    invalidateContexts(0);
#endif

    /* per-CTU QP: adaptive QP for this CU size when the PPS allows dQP,
     * otherwise the flat slice QP; also configures RD lambda */
    int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
    ctu.setQPSubParts((int8_t)qp, 0, 0);

    /* start entropy coding from the caller-provided context, and pull the
     * CTU's source pixels into the depth-0 fenc buffer */
    m_rqt[0].cur.load(initialContext);
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_param->analysisMode)
    {
        /* hook up the analysis save/load buffers for this CTU */
        if (m_slice->m_sliceType == I_SLICE)
            m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
        else
        {
            int numPredDir = m_slice->isInterP() ? 1 : 2;
            m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
        }
    }

    ProfileCUScope(ctu, totalCTUTime, totalCTUs);

    uint32_t zOrder = 0;
    if (m_slice->m_sliceType == I_SLICE)
    {
        compressIntraCU(ctu, cuGeom, zOrder, qp);
        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
        {
            /* store the winning intra decisions (depth, luma/chroma modes,
             * part sizes) so a later encode can reuse them */
            CUData* bestCU = &m_modeDepth[0].bestMode->cu;
            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
        }
    }
    else
    {
        if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom, qp);

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            /* --pmode: distribute per-mode analysis jobs across worker threads */
            compressInterCU_dist(ctu, cuGeom, qp);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom, qp);
        else
        {
            compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
            if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
            {
                /* store inter decisions (depth and prediction modes) for reuse */
                CUData* bestCU = &m_modeDepth[0].bestMode->cu;
                memcpy(&m_reuseInterDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
                memcpy(&m_reuseInterDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_predMode, sizeof(uint8_t) * numPartition);
            }
        }
    }

    /* depth-0 best mode is guaranteed to be set by the compress calls above */
    return *m_modeDepth[0].bestMode;
}
|
|
202
|
|
203 void Analysis::tryLossless(const CUGeom& cuGeom)
|
|
204 {
|
|
205 ModeDepth& md = m_modeDepth[cuGeom.depth];
|
|
206
|
|
207 if (!md.bestMode->distortion)
|
|
208 /* already lossless */
|
|
209 return;
|
|
210 else if (md.bestMode->cu.isIntra(0))
|
|
211 {
|
|
212 md.pred[PRED_LOSSLESS].initCosts();
|
|
213 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
|
|
214 PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
|
|
215 uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
|
|
216 checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
|
|
217 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
|
|
218 }
|
|
219 else
|
|
220 {
|
|
221 md.pred[PRED_LOSSLESS].initCosts();
|
|
222 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
|
|
223 md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
|
|
224 encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
|
|
225 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
|
|
226 }
|
|
227 }
|
|
228
|
|
/* Recursively analyze one intra CU (I-slice path).
 *
 * Evaluates the not-split candidates (2Nx2N intra, plus NxN at the
 * smallest CU size), then recurses into the four sub-CUs when splitting
 * is possible, and keeps the RD winner in md.bestMode. When analysis
 * load is active, previously saved depths/modes short-circuit the search.
 *
 * zOrder tracks the current position in the CTU-wide reuse arrays and is
 * advanced past all partitions covered by a decided CU; it is passed by
 * reference so sibling recursions continue from the right offset. */
void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    /* a LEAF geometry cannot split further; SPLIT_MANDATORY forces a split
     * (CU partially outside the picture) */
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
    {
        /* reuse buffers were attached in compressCTU; indexed per-CTU */
        uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];

        /* only take the saved decision when this CU exactly matches the
         * saved depth at the current reuse position */
        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
        {
            PartSize size = (PartSize)reusePartSizes[zOrder];
            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
            mode.cu.initSubCU(parentCTU, cuGeom, qp);
            /* saved luma/chroma modes seed the (reduced) intra search */
            checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
            checkBestMode(mode, depth);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
            /* decision is final; do not also evaluate the split */
            mightSplit = false;
        }
    }
    else if (mightNotSplit)
    {
        /* normal search: 2Nx2N intra at this depth */
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
        checkBestMode(md.pred[PRED_INTRA], depth);

        /* NxN partitions only exist for 8x8 CUs, and only when 4x4 TUs
         * are permitted by the SPS */
        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    if (mightSplit)
    {
        /* evaluate the four-way split as a single PRED_SPLIT candidate */
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        /* sub-CUs are coded in sequence: each starts from the entropy
         * context left by the previous sub-CU's best mode */
        Entropy* nextContext = &m_rqt[depth].cur;
        int32_t nextQP = qp;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                compressIntraCU(parentCTU, childGeom, zOrder, nextQP);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);
                /* keep the reuse cursor in step even though nothing was coded */
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            /* split is mandatory: no flag to code, just finalize the cost */
            updateModeCost(*splitPred);

        checkDQPForSplitPred(*splitPred, cuGeom);
        checkBestMode(*splitPred, depth);
    }

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    /* split recon was already written piecewise by the sub-CUs */
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
}
|
|
338
|
|
/* Bonded-peer entry point for distributed mode analysis (--pmode).
 *
 * Called on a worker thread; forwards to the master Analysis object's
 * processPmode() using the worker's own thread-local Analysis as the
 * slave, so each thread searches with its own scratch state. */
void Analysis::PMODE::processTasks(int workerThreadId)
{
#if DETAILED_CU_STATS
    /* attribute the elapsed time to the frame encoder that owns this CTU */
    int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
    master.m_stats[fe].countPModeTasks++;
    ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
#endif
    ProfileScopeEvent(pmode);
    master.processPmode(*this, master.m_tld[workerThreadId].analysis);
}
|
|
349
|
|
350 /* process pmode jobs until none remain; may be called by the master thread or by
|
|
351 * a bonded peer (slave) thread via pmodeTasks() */
|
|
352 void Analysis::processPmode(PMODE& pmode, Analysis& slave)
|
|
353 {
|
|
354 /* acquire a mode task, else exit early */
|
|
355 int task;
|
|
356 pmode.m_lock.acquire();
|
|
357 if (pmode.m_jobTotal > pmode.m_jobAcquired)
|
|
358 {
|
|
359 task = pmode.m_jobAcquired++;
|
|
360 pmode.m_lock.release();
|
|
361 }
|
|
362 else
|
|
363 {
|
|
364 pmode.m_lock.release();
|
|
365 return;
|
|
366 }
|
|
367
|
|
368 ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];
|
|
369
|
|
370 /* setup slave Analysis */
|
|
371 if (&slave != this)
|
|
372 {
|
|
373 slave.m_slice = m_slice;
|
|
374 slave.m_frame = m_frame;
|
|
375 slave.m_param = m_param;
|
|
376 slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
|
|
377 slave.invalidateContexts(0);
|
|
378 slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
|
|
379 }
|
|
380
|
|
381 /* perform Mode task, repeat until no more work is available */
|
|
382 do
|
|
383 {
|
|
384 uint32_t refMasks[2] = { 0, 0 };
|
|
385
|
|
386 if (m_param->rdLevel <= 4)
|
|
387 {
|
|
388 switch (pmode.modes[task])
|
|
389 {
|
|
390 case PRED_INTRA:
|
|
391 slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
|
|
392 if (m_param->rdLevel > 2)
|
|
393 slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
|
|
394 break;
|
|
395
|
|
396 case PRED_2Nx2N:
|
|
397 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
|
|
398
|
|
399 slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
|
|
400 if (m_slice->m_sliceType == B_SLICE)
|
|
401 slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
|
|
402 break;
|
|
403
|
|
404 case PRED_Nx2N:
|
|
405 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
|
|
406 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
|
|
407
|
|
408 slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
|
|
409 break;
|
|
410
|
|
411 case PRED_2NxN:
|
|
412 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
|
|
413 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
|
|
414
|
|
415 slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
|
|
416 break;
|
|
417
|
|
418 case PRED_2NxnU:
|
|
419 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
|
|
420 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
|
|
421
|
|
422 slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
|
|
423 break;
|
|
424
|
|
425 case PRED_2NxnD:
|
|
426 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
|
|
427 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
|
|
428
|
|
429 slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
|
|
430 break;
|
|
431
|
|
432 case PRED_nLx2N:
|
|
433 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
|
|
434 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
|
|
435
|
|
436 slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
|
|
437 break;
|
|
438
|
|
439 case PRED_nRx2N:
|
|
440 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
|
|
441 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
|
|
442
|
|
443 slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
|
|
444 break;
|
|
445
|
|
446 default:
|
|
447 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
|
|
448 break;
|
|
449 }
|
|
450 }
|
|
451 else
|
|
452 {
|
|
453 switch (pmode.modes[task])
|
|
454 {
|
|
455 case PRED_INTRA:
|
|
456 slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N, NULL, NULL);
|
|
457 if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
|
|
458 slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN, NULL, NULL);
|
|
459 break;
|
|
460
|
|
461 case PRED_2Nx2N:
|
|
462 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
|
|
463
|
|
464 slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
|
|
465 md.pred[PRED_BIDIR].rdCost = MAX_INT64;
|
|
466 if (m_slice->m_sliceType == B_SLICE)
|
|
467 {
|
|
468 slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
|
|
469 if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
|
|
470 slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
|
|
471 }
|
|
472 break;
|
|
473
|
|
474 case PRED_Nx2N:
|
|
475 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
|
|
476 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
|
|
477
|
|
478 slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
|
|
479 break;
|
|
480
|
|
481 case PRED_2NxN:
|
|
482 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
|
|
483 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
|
|
484
|
|
485 slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
|
|
486 break;
|
|
487
|
|
488 case PRED_2NxnU:
|
|
489 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
|
|
490 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
|
|
491
|
|
492 slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
|
|
493 break;
|
|
494
|
|
495 case PRED_2NxnD:
|
|
496 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
|
|
497 refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
|
|
498 slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
|
|
499 break;
|
|
500
|
|
501 case PRED_nLx2N:
|
|
502 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
|
|
503 refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
|
|
504
|
|
505 slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
|
|
506 break;
|
|
507
|
|
508 case PRED_nRx2N:
|
|
509 refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
|
|
510 refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
|
|
511 slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
|
|
512 break;
|
|
513
|
|
514 default:
|
|
515 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
|
|
516 break;
|
|
517 }
|
|
518 }
|
|
519
|
|
520 task = -1;
|
|
521 pmode.m_lock.acquire();
|
|
522 if (pmode.m_jobTotal > pmode.m_jobAcquired)
|
|
523 task = pmode.m_jobAcquired++;
|
|
524 pmode.m_lock.release();
|
|
525 }
|
|
526 while (task >= 0);
|
|
527 }
|
|
528
|
|
/* Recursively analyze one inter CU with distributed mode analysis (--pmode).
 *
 * Merge/skip is evaluated first for early-outs, then the four-way split
 * is recursed, then the remaining prediction modes (intra, 2Nx2N, rect,
 * AMP) are queued as parallel jobs consumed via processPmode(); this
 * (master) thread participates and then does the final mode selection.
 *
 * Returns a bitmask of the motion references used by the best mode, so
 * the parent CU can restrict its own search (X265_REF_LIMIT_DEPTH). */
uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    /* below minDepth, skip whole-CU analysis (lookahead/neighbor hint) */
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
    uint32_t splitRefs[4] = { 0, 0, 0, 0 };

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    PMODE pmode(*this, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);

        /* merge/skip runs on the master thread before jobs are queued */
        if (m_param->rdLevel <= 4)
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        else
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
    }

    bool bNoSplit = false;
    bool splitIntra = true;
    if (md.bestMode)
    {
        /* a skipped best mode, or the recursion-depth heuristic, lets us
         * avoid evaluating the split entirely */
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        /* each sub-CU starts from the entropy context left by its
         * predecessor's best mode */
        Entropy* nextContext = &m_rqt[depth].cur;
        int nextQP = qp;
        /* track whether any sub-CU chose intra (drives bTryIntra below) */
        splitIntra = false;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                /* recurse; collect each sub-CU's reference mask */
                splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);

                // Save best CU and pred data for this sub CU
                splitIntra |= nd.bestMode->cu.isIntra(0);
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            /* split is mandatory: no flag to code, just finalize the cost */
            updateModeCost(*splitPred);

        checkDQPForSplitPred(*splitPred, cuGeom);
    }

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
        /* with --limit-refs, only try intra here if a sub-CU went intra */
        int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra);

        /* restore this depth's lambda after the recursion above may have
         * switched it for the sub-CUs */
        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
            setLambdaFromQP(parentCTU, qp);

        /* queue one job per candidate prediction mode */
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
        }
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
        }

        /* publish the sub-CU reference masks for the worker jobs to read */
        m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];

        pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);

        /* participate in processing jobs, until all are distributed */
        processPmode(pmode, *this);

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            {
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                /* pure sa8d selection; only the winner gets residual coding */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            {
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            /* RD 5/6: every job already produced full RD costs; just pick */
            checkBestMode(md.pred[PRED_2Nx2N], depth);
            if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* compare split RD cost against best cost */
    if (mightSplit && !bNoSplit)
        checkBestMode(md.pred[PRED_SPLIT], depth);

    /* determine which motion references the parent CU should search */
    uint32_t refMask;
    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
        refMask = 0;
    else if (md.bestMode == &md.pred[PRED_SPLIT])
        refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
    else
    {
        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
        uint32_t numPU = cu.getNumPartInter(0);
        refMask = 0;
        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
            refMask |= cu.getBestRefIdx(subPartIdx);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    /* Copy best data to encData CTU and recon */
    X265_CHECK(md.bestMode->ok(), "best mode is not ok");
    md.bestMode->cu.copyToPic(depth);
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);

    return refMask;
}
|
|
814
|
|
/* Recursive mode decision for an inter-coded CU at rd-levels 0 through 4.
 *
 * Evaluates the four split sub-CUs (recursively) and the not-split candidates
 * at this depth (merge/skip, 2Nx2N, rectangular, AMP, and optionally intra),
 * selecting mostly by sa8d cost; full RDO is applied only where rdLevel
 * permits (see the rd-level explanation at the top of this file).
 *
 * parentCTU - the CTU this CU belongs to (source of reconstruction context)
 * cuGeom    - geometry (depth, size, partition offsets) of the CU to analyze
 * qp        - QP to use for this CU (may be overridden per sub-CU when DQP
 *             is enabled)
 *
 * On return md.bestMode holds the winning mode, its CU data and recon have
 * been copied into the frame's encData/recon picture, and the return value
 * is the bitmask of motion reference indices used by the best mode, so the
 * parent CU can restrict its own motion search (X265_REF_LIMIT_DEPTH). */
uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    PicYuv& reconPic = *m_frame->m_reconPic;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    /* minimum depth at which skip/merge is even considered, derived from the
     * co-located CU in the previous frame (see topSkipMinDepth) */
    uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
    bool earlyskip = false;
    bool splitIntra = true;
    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
    if (mightNotSplit && depth >= minDepth)
    {
        /* Compute Merge Cost */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
        checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        if (m_param->rdLevel)
            earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
    }

    /* decide whether recursion into sub-CUs can be skipped entirely */
    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    /* Step 2. Evaluate each of the 4 split sub-blocks in series */
    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int nextQP = qp;
        splitIntra = false;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                /* seed the child's source pixels and entropy context, then recurse */
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                splitRefs[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);

                // Save best CU and pred data for this sub CU
                splitIntra |= nd.bestMode->cu.isIntra(0);
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                /* at rd-level 0 no recon exists yet, so propagate prediction pixels instead */
                if (m_param->rdLevel)
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                else
                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                if (m_param->rdLevel > 1)
                    nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else if (m_param->rdLevel > 1)
            updateModeCost(*splitPred);
        else
            splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
    }

    /* Split CUs
     *   0  1
     *   2  3 */
    uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
    /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
    if (mightNotSplit && depth >= minDepth)
    {
        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
            setLambdaFromQP(parentCTU, qp);

        if (!earlyskip)
        {
            /* refMasks limit each prediction's motion search to references the
             * corresponding split sub-CUs actually used */
            uint32_t refMasks[2];
            refMasks[0] = allSplitRefs;
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
            checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);

            if (m_param->limitReferences & X265_REF_LIMIT_CU)
            {
                CUData& cu = md.pred[PRED_2Nx2N].cu;
                uint32_t refMask = cu.getBestRefIdx(0);
                allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2] = splitRefs[3] = refMask;
            }

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
            }

            Mode *bestInter = &md.pred[PRED_2Nx2N];
            if (m_param->bEnableRectInter)
            {
                refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */
                refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */
                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];

                refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */
                refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */
                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (m_slice->m_sps->maxAMPDepth > depth)
            {
                /* only try AMP splits in the orientation suggested by the best
                 * rectangular mode, or both if 2Nx2N (non-skip) won so far */
                bool bHor = false, bVer = false;
                if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
                         md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top */
                    refMasks[1] = allSplitRefs;                /* 75% bot */
                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
                    if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnU];

                    refMasks[0] = allSplitRefs;                /* 75% top */
                    refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot */
                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                    if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnD];
                }
                if (bVer)
                {
                    refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25% left */
                    refMasks[1] = allSplitRefs;                /* 75% right */
                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
                    if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nLx2N];

                    refMasks[0] = allSplitRefs;                /* 75% left */
                    refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25% right */
                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                    if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nRx2N];
                }
            }
            bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
            if (m_param->rdLevel >= 3)
            {
                /* Calculate RD cost of best inter option */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                    md.bestMode->sa8dCost == MAX_INT64)
                {
                    /* when limiting references, skip intra unless one of the
                     * split sub-CUs chose intra (splitIntra) */
                    if (!m_param->limitReferences || splitIntra)
                    {
                        ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
                        checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                        encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                        checkBestMode(md.pred[PRED_INTRA], depth);
                    }
                    else
                    {
                        ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                    }
                }
            }
            else
            {
                /* SA8D choice between merge/skip, inter, bidir, and intra */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE &&
                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                {
                    if (!m_param->limitReferences || splitIntra)
                    {
                        ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
                        checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                        if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                            md.bestMode = &md.pred[PRED_INTRA];
                    }
                    else
                    {
                        ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                    }
                }

                /* finally code the best mode selected by SA8D costs:
                 * RD level 2 - fully encode the best mode
                 * RD level 1 - generate recon pixels
                 * RD level 0 - generate chroma prediction */
                if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
                {
                    /* prediction already generated for this CU, and if rd level
                     * is not 0, it is already fully encoded */
                }
                else if (md.bestMode->cu.isInter(0))
                {
                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                    }
                    if (m_param->rdLevel == 2)
                        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;

                        uint32_t tuDepthRange[2];
                        cu.getInterTUQtDepthRange(tuDepthRange, 0);

                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                        if (cu.getQtRootCbf(0))
                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
                        else
                        {
                            /* all-zero residual: recon is the prediction, and a
                             * 2Nx2N merge with no coefficients is coded as skip */
                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
                                cu.setPredModeSubParts(MODE_SKIP);
                        }
                    }
                }
                else
                {
                    if (m_param->rdLevel == 2)
                        encodeIntraInInter(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;

                        uint32_t tuDepthRange[2];
                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

                        residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                        getBestIntraModeChroma(*md.bestMode, cuGeom);
                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                        md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
                    }
                }
            }
        } // !earlyskip

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* compare the split cost against the best not-split mode; at rd-levels
     * below 2 the comparison is by sa8d cost rather than full RD cost */
    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        if (!md.bestMode)
            md.bestMode = splitPred;
        else if (m_param->rdLevel > 1)
            checkBestMode(*splitPred, cuGeom.depth);
        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
            md.bestMode = splitPred;

        checkDQPForSplitPred(*md.bestMode, cuGeom);
    }

    /* determine which motion references the parent CU should search */
    uint32_t refMask;
    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
        refMask = 0;
    else if (md.bestMode == &md.pred[PRED_SPLIT])
        refMask = allSplitRefs;
    else
    {
        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
        uint32_t numPU = cu.getNumPartInter(0);
        refMask = 0;
        /* subPartIdx walks the first partition of each PU */
        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
            refMask |= cu.getBestRefIdx(subPartIdx);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    /* Copy best data to encData CTU and recon */
    X265_CHECK(md.bestMode->ok(), "best mode is not ok");
    md.bestMode->cu.copyToPic(depth);
    if (m_param->rdLevel)
        md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);

    return refMask;
}
|
|
1178
|
|
/* Recursive mode decision for an inter-coded CU at rd-levels 5 and 6.
 *
 * Unlike the rd0_4 path, every candidate mode here is evaluated by full
 * rate-distortion optimization (checkBestMode on RD cost), so recon pixels
 * and entropy contexts are always generated per candidate.
 *
 * parentCTU - the CTU this CU belongs to
 * cuGeom    - geometry of the CU being analyzed
 * zOrder    - in/out z-scan cursor into the shared analysis-reuse buffers;
 *             advanced as CUs (or absent parts) are consumed
 * qp        - QP for this CU (may be overridden per sub-CU when DQP enabled)
 *
 * Returns the bitmask of motion references used by the best mode so the
 * parent CU can restrict its motion search (X265_REF_LIMIT_DEPTH). */
uint32_t Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    /* analysis-load: if a prior pass recorded this exact CU as SKIP, evaluate
     * only the shared merge candidate and suppress all other evaluation */
    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
    {
        uint8_t* reuseDepth = &m_reuseInterDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* reuseModes = &m_reuseInterDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx && reuseModes[zOrder] == MODE_SKIP)
        {
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, true);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];

            /* disable all further evaluation of this CU */
            mightSplit = false;
            mightNotSplit = false;
        }
    }

    bool foundSkip = false;
    bool splitIntra = true;
    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
    if (mightNotSplit)
    {
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
        /* a merge with no coded residual behaves as skip; used to short-circuit the split */
        foundSkip = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
    }

    // estimate split cost
    /* Step 2. Evaluate each of the 4 split sub-blocks in series */
    if (mightSplit && !foundSkip)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int nextQP = qp;
        splitIntra = false;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                /* seed the child's source pixels and entropy context, then recurse */
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                splitRefs[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, zOrder, nextQP);

                // Save best CU and pred data for this sub CU
                splitIntra |= nd.bestMode->cu.isIntra(0);
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                splitCU->setEmptyPart(childGeom, subPartIdx);
                /* absent part: keep the analysis-reuse cursor in sync */
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkDQPForSplitPred(*splitPred, cuGeom);
    }

    /* Split CUs
     *   0  1
     *   2  3 */
    uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
    /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
    if (mightNotSplit)
    {
        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
            setLambdaFromQP(parentCTU, qp);

        if (!(foundSkip && m_param->bEnableEarlySkip))
        {
            /* refMasks limit each prediction's motion search to references the
             * corresponding split sub-CUs actually used */
            uint32_t refMasks[2];
            refMasks[0] = allSplitRefs;
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);

            if (m_param->limitReferences & X265_REF_LIMIT_CU)
            {
                CUData& cu = md.pred[PRED_2Nx2N].cu;
                uint32_t refMask = cu.getBestRefIdx(0);
                allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2] = splitRefs[3] = refMask;
            }

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
                }
            }

            if (m_param->bEnableRectInter)
            {
                refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */
                refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */
                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
                checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);

                refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */
                refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */
                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
            }

            // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
            if (m_slice->m_sps->maxAMPDepth > depth)
            {
                /* only try AMP splits in the orientation suggested by the best
                 * mode so far; both if a non-merge 2Nx2N is currently best */
                bool bHor = false, bVer = false;
                if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top */
                    refMasks[1] = allSplitRefs;                /* 75% bot */
                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
                    checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);

                    refMasks[0] = allSplitRefs;                /* 75% top */
                    refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot */
                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                    checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                }
                if (bVer)
                {
                    refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25% left */
                    refMasks[1] = allSplitRefs;                /* 75% right */
                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
                    checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);

                    refMasks[0] = allSplitRefs;                /* 75% left */
                    refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25% right */
                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                    checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                }
            }

            if (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames)
            {
                /* when limiting references, skip intra unless one of the
                 * split sub-CUs chose intra (splitIntra) */
                if (!m_param->limitReferences || splitIntra)
                {
                    ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
                    checkBestMode(md.pred[PRED_INTRA], depth);

                    /* NxN intra is only legal at the smallest CU size (8x8 with 4x4 TUs) */
                    if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    {
                        md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                        checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
                        checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                    }
                }
                else
                {
                    ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                }
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* compare split RD cost against best cost */
    if (mightSplit && !foundSkip)
        checkBestMode(md.pred[PRED_SPLIT], depth);

    /* determine which motion references the parent CU should search */
    uint32_t refMask;
    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
        refMask = 0;
    else if (md.bestMode == &md.pred[PRED_SPLIT])
        refMask = allSplitRefs;
    else
    {
        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
        uint32_t numPU = cu.getNumPartInter(0);
        refMask = 0;
        /* subPartIdx walks the first partition of each PU */
        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
            refMask |= cu.getBestRefIdx(subPartIdx);
    }

    /* Copy best data to encData CTU and recon */
    X265_CHECK(md.bestMode->ok(), "best mode is not ok");
    md.bestMode->cu.copyToPic(depth);
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);

    return refMask;
}
|
|
1427
|
|
/* Evaluate 2Nx2N merge/skip candidates at rd-levels 0-4.
 *
 * Sets md.bestMode if a valid merge candidate is found, else leaves it NULL.
 * The best merge candidate is chosen by sa8d cost across the candidate list;
 * at rd-level >= 1 the winner is then fully encoded both as skip (no
 * residual) and as merge (with residual), and the cheaper by RD cost wins.
 *
 * skip  - candidate Mode evaluated as skip (no residual)
 * merge - candidate Mode evaluated as merge (with residual)
 * Note the two Mode instances may be swapped on return (see below). */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");

    tempPred->initCosts();
    tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    tempPred->cu.setPredModeSubParts(MODE_INTER);
    tempPred->cu.m_mergeFlag[0] = true;

    bestPred->initCosts();
    bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    bestPred->cu.setPredModeSubParts(MODE_INTER);
    bestPred->cu.m_mergeFlag[0] = true;

    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
    /* single PU covering the whole CU; built from merge.cu but presumably
     * valid for either Mode since both CUs share this CU's geometry — confirm */
    PredictionUnit pu(merge.cu, cuGeom, 0);

    bestPred->sa8dCost = MAX_INT64;
    int bestSadCand = -1;
    int sizeIdx = cuGeom.log2CUSize - 2;

    for (uint32_t i = 0; i < numMergeCand; ++i)
    {
        /* with frame-parallelism, reject candidates whose vertical motion
         * exceeds the rows guaranteed reconstructed in the reference frame */
        if (m_bFrameParallel &&
            (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
             candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
            continue;

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
        X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
        tempPred->cu.m_interDir[0] = candDir[i];
        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;

        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d);

        tempPred->sa8dBits = getTUBits(i, numMergeCand);
        tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
        if (m_bChromaSa8d)
        {
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
        }
        tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);

        if (tempPred->sa8dCost < bestPred->sa8dCost)
        {
            bestSadCand = i;
            /* keep the winner in bestPred; tempPred becomes scratch for the next candidate */
            std::swap(tempPred, bestPred);
        }
    }

    /* force mode decision to take inter or intra */
    if (bestSadCand < 0)
        return;

    /* calculate the motion compensation for chroma for the best mode selected */
    if (!m_bChromaSa8d) /* Chroma MC was done above */
        motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);

    if (m_param->rdLevel)
    {
        if (m_param->bLossless)
            bestPred->rdCost = MAX_INT64; /* skip (no residual) is invalid in lossless coding */
        else
            encodeResAndCalcRdSkipCU(*bestPred);

        /* Encode with residual */
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
        tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
        tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
        tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
        tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
        tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
        tempPred->sa8dCost = bestPred->sa8dCost;
        tempPred->sa8dBits = bestPred->sa8dBits;
        tempPred->predYuv.copyFromYuv(bestPred->predYuv);

        encodeResAndCalcRdInterCU(*tempPred, cuGeom);

        /* choose skip vs. merge-with-residual by full RD cost */
        md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
    }
    else
        md.bestMode = bestPred;

    /* broadcast sets of MV field data */
    md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
    md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
    md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
    md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
    md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
    checkDQP(*md.bestMode, cuGeom);
    X265_CHECK(md.bestMode->ok(), "Merge mode not ok\n");
}
|
|
1536
|
|
1537 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
|
|
1538 void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand)
|
|
1539 {
|
|
1540 uint32_t depth = cuGeom.depth;
|
|
1541
|
|
1542 /* Note that these two Mode instances are named MERGE and SKIP but they may
|
|
1543 * hold the reverse when the function returns. We toggle between the two modes */
|
|
1544 Mode* tempPred = &merge;
|
|
1545 Mode* bestPred = &skip;
|
|
1546
|
|
1547 merge.initCosts();
|
|
1548 merge.cu.setPredModeSubParts(MODE_INTER);
|
|
1549 merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
|
|
1550 merge.cu.m_mergeFlag[0] = true;
|
|
1551
|
|
1552 skip.initCosts();
|
|
1553 skip.cu.setPredModeSubParts(MODE_INTER);
|
|
1554 skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
|
|
1555 skip.cu.m_mergeFlag[0] = true;
|
|
1556
|
|
1557 MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
|
|
1558 uint8_t candDir[MRG_MAX_NUM_CANDS];
|
|
1559 uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
|
|
1560 PredictionUnit pu(merge.cu, cuGeom, 0);
|
|
1561
|
|
1562 bool foundCbf0Merge = false;
|
|
1563 bool triedPZero = false, triedBZero = false;
|
|
1564 bestPred->rdCost = MAX_INT64;
|
|
1565
|
|
1566 uint32_t first = 0, last = numMergeCand;
|
|
1567 if (isShareMergeCand)
|
|
1568 {
|
|
1569 first = *m_reuseBestMergeCand;
|
|
1570 last = first + 1;
|
|
1571 }
|
|
1572
|
|
1573 for (uint32_t i = first; i < last; i++)
|
|
1574 {
|
|
1575 if (m_bFrameParallel &&
|
|
1576 (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
|
|
1577 candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
|
|
1578 continue;
|
|
1579
|
|
1580 /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
|
|
1581 if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
|
|
1582 {
|
|
1583 if (triedPZero)
|
|
1584 continue;
|
|
1585 triedPZero = true;
|
|
1586 }
|
|
1587 else if (candDir[i] == 3 &&
|
|
1588 !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
|
|
1589 !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
|
|
1590 {
|
|
1591 if (triedBZero)
|
|
1592 continue;
|
|
1593 triedBZero = true;
|
|
1594 }
|
|
1595
|
|
1596 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
|
|
1597 tempPred->cu.m_interDir[0] = candDir[i];
|
|
1598 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
|
|
1599 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
|
|
1600 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
|
|
1601 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
|
|
1602 tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
|
|
1603
|
|
1604 motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true);
|
|
1605
|
|
1606 uint8_t hasCbf = true;
|
|
1607 bool swapped = false;
|
|
1608 if (!foundCbf0Merge)
|
|
1609 {
|
|
1610 /* if the best prediction has CBF (not a skip) then try merge with residual */
|
|
1611
|
|
1612 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
|
|
1613 hasCbf = tempPred->cu.getQtRootCbf(0);
|
|
1614 foundCbf0Merge = !hasCbf;
|
|
1615
|
|
1616 if (tempPred->rdCost < bestPred->rdCost)
|
|
1617 {
|
|
1618 std::swap(tempPred, bestPred);
|
|
1619 swapped = true;
|
|
1620 }
|
|
1621 }
|
|
1622 if (!m_param->bLossless && hasCbf)
|
|
1623 {
|
|
1624 /* try merge without residual (skip), if not lossless coding */
|
|
1625
|
|
1626 if (swapped)
|
|
1627 {
|
|
1628 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
|
|
1629 tempPred->cu.m_interDir[0] = candDir[i];
|
|
1630 tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
|
|
1631 tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
|
|
1632 tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
|
|
1633 tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
|
|
1634 tempPred->cu.setPredModeSubParts(MODE_INTER);
|
|
1635 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
|
|
1636 }
|
|
1637
|
|
1638 encodeResAndCalcRdSkipCU(*tempPred);
|
|
1639
|
|
1640 if (tempPred->rdCost < bestPred->rdCost)
|
|
1641 std::swap(tempPred, bestPred);
|
|
1642 }
|
|
1643 }
|
|
1644
|
|
1645 if (bestPred->rdCost < MAX_INT64)
|
|
1646 {
|
|
1647 m_modeDepth[depth].bestMode = bestPred;
|
|
1648
|
|
1649 /* broadcast sets of MV field data */
|
|
1650 uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
|
|
1651 bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
|
|
1652 bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
|
|
1653 bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
|
|
1654 bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
|
|
1655 bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
|
|
1656 checkDQP(*bestPred, cuGeom);
|
|
1657 X265_CHECK(bestPred->ok(), "merge mode is not ok");
|
|
1658 }
|
|
1659
|
|
1660 if (m_param->analysisMode)
|
|
1661 {
|
|
1662 m_reuseBestMergeCand++;
|
|
1663 if (m_param->analysisMode == X265_ANALYSIS_SAVE)
|
|
1664 *m_reuseBestMergeCand = bestPred->cu.m_mvpIdx[0][0];
|
|
1665 }
|
|
1666 }
|
|
1667
|
|
1668 void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
|
|
1669 {
|
|
1670 interMode.initCosts();
|
|
1671 interMode.cu.setPartSizeSubParts(partSize);
|
|
1672 interMode.cu.setPredModeSubParts(MODE_INTER);
|
|
1673 int numPredDir = m_slice->isInterP() ? 1 : 2;
|
|
1674
|
|
1675 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
|
|
1676 {
|
|
1677 uint32_t numPU = interMode.cu.getNumPartInter(0);
|
|
1678 for (uint32_t part = 0; part < numPU; part++)
|
|
1679 {
|
|
1680 MotionData* bestME = interMode.bestME[part];
|
|
1681 for (int32_t i = 0; i < numPredDir; i++)
|
|
1682 {
|
|
1683 bestME[i].ref = *m_reuseRef;
|
|
1684 m_reuseRef++;
|
|
1685 }
|
|
1686 }
|
|
1687 }
|
|
1688
|
|
1689 predInterSearch(interMode, cuGeom, m_bChromaSa8d, refMask);
|
|
1690
|
|
1691 /* predInterSearch sets interMode.sa8dBits */
|
|
1692 const Yuv& fencYuv = *interMode.fencYuv;
|
|
1693 Yuv& predYuv = interMode.predYuv;
|
|
1694 int part = partitionFromLog2Size(cuGeom.log2CUSize);
|
|
1695 interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
|
|
1696 if (m_bChromaSa8d)
|
|
1697 {
|
|
1698 interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
|
|
1699 interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
|
|
1700 }
|
|
1701 interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
|
|
1702
|
|
1703 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
|
|
1704 {
|
|
1705 uint32_t numPU = interMode.cu.getNumPartInter(0);
|
|
1706 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
|
|
1707 {
|
|
1708 MotionData* bestME = interMode.bestME[puIdx];
|
|
1709 for (int32_t i = 0; i < numPredDir; i++)
|
|
1710 {
|
|
1711 *m_reuseRef = bestME[i].ref;
|
|
1712 m_reuseRef++;
|
|
1713 }
|
|
1714 }
|
|
1715 }
|
|
1716 }
|
|
1717
|
|
1718 void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
|
|
1719 {
|
|
1720 interMode.initCosts();
|
|
1721 interMode.cu.setPartSizeSubParts(partSize);
|
|
1722 interMode.cu.setPredModeSubParts(MODE_INTER);
|
|
1723 int numPredDir = m_slice->isInterP() ? 1 : 2;
|
|
1724
|
|
1725 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
|
|
1726 {
|
|
1727 uint32_t numPU = interMode.cu.getNumPartInter(0);
|
|
1728 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
|
|
1729 {
|
|
1730 MotionData* bestME = interMode.bestME[puIdx];
|
|
1731 for (int32_t i = 0; i < numPredDir; i++)
|
|
1732 {
|
|
1733 bestME[i].ref = *m_reuseRef;
|
|
1734 m_reuseRef++;
|
|
1735 }
|
|
1736 }
|
|
1737 }
|
|
1738
|
|
1739 predInterSearch(interMode, cuGeom, true, refMask);
|
|
1740
|
|
1741 /* predInterSearch sets interMode.sa8dBits, but this is ignored */
|
|
1742 encodeResAndCalcRdInterCU(interMode, cuGeom);
|
|
1743
|
|
1744 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
|
|
1745 {
|
|
1746 uint32_t numPU = interMode.cu.getNumPartInter(0);
|
|
1747 for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
|
|
1748 {
|
|
1749 MotionData* bestME = interMode.bestME[puIdx];
|
|
1750 for (int32_t i = 0; i < numPredDir; i++)
|
|
1751 {
|
|
1752 *m_reuseRef = bestME[i].ref;
|
|
1753 m_reuseRef++;
|
|
1754 }
|
|
1755 }
|
|
1756 }
|
|
1757 }
|
|
1758
|
|
void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
    CUData& cu = bidir2Nx2N.cu;

    /* bail out when bi-prediction is restricted for this CU, or either
     * unidirectional 2Nx2N search failed to produce a usable candidate */
    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
    {
        bidir2Nx2N.sa8dCost = MAX_INT64;
        bidir2Nx2N.rdCost = MAX_INT64;
        return;
    }

    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
    MV mvzero(0, 0);
    int partEnum = cuGeom.log2CUSize - 2;

    /* seed the bidir candidate from the best unidirectional L0/L1 results */
    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
    MotionData* bestME = bidir2Nx2N.bestME[0];
    int ref0 = bestME[0].ref;
    MV mvp0 = bestME[0].mvp;
    int mvpIdx0 = bestME[0].mvpIdx;
    int ref1 = bestME[1].ref;
    MV mvp1 = bestME[1].mvp;
    int mvpIdx1 = bestME[1].mvpIdx;

    bidir2Nx2N.initCosts();
    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTER);
    cu.setPUInterDir(3, 0, 0);   /* 3 == both lists */
    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
    cu.m_mergeFlag[0] = 0;

    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
    cu.setPUMv(0, bestME[0].mv, 0, 0);
    cu.m_mvd[0][0] = bestME[0].mv - mvp0;

    cu.setPUMv(1, bestME[1].mv, 0, 0);
    cu.m_mvd[1][0] = bestME[1].mv - mvp1;

    PredictionUnit pu(cu, cuGeom, 0);
    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d);

    int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
    if (m_bChromaSa8d)
    {
        /* Add in chroma distortion */
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
    }
    /* bidir pays the both-lists selection bits instead of the per-list
     * selection bits that were counted in the unidirectional searches */
    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);

    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
    if (bTryZero)
    {
        /* Do not try zero MV if unidir motion predictors are beyond
         * valid search area */
        MV mvmin, mvmax;
        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
        mvmax.y += 2; // there is some pad for subpel refine
        mvmin <<= 2;  /* convert to quarter-pel units */
        mvmax <<= 2;

        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
    }
    if (bTryZero)
    {
        /* Estimate cost of BIDIR using coincident blocks */
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

        int zsa8d;

        if (m_bChromaSa8d)
        {
            /* temporarily overwrite the CU MVs with zero; recovered below if
             * the zero-MV candidate loses */
            cu.m_mv[0][0] = mvzero;
            cu.m_mv[1][0] = mvzero;

            motionCompensation(cu, pu, tmpPredYuv, true, true);

            zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
        }
        else
        {
            /* luma-only estimate: average the two co-located reference blocks
             * directly instead of running full motion compensation */
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;

            primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }

        /* replace the measured MVD cost with the cost of signaling zero MVs */
        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
        mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
        mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            /* zero-MV bidir wins: commit its bits, cost and motion data */
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d)
                /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, true);
        }
        else if (m_bChromaSa8d)
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
|
|
1895
|
|
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    /* recurse until we reach the depth at which this CU was actually coded */
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < g_maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }

    uint32_t absPartIdx = cuGeom.absPartIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom);

    PicYuv& reconPic = *m_frame->m_reconPic;

    /* depth 0 fencYuv already holds the CTU source; deeper depths copy their
     * sub-block down from it */
    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough

        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        /* transform/quantize luma, pick chroma mode, then code chroma */
        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
        getBestIntraModeChroma(*bestMode, cuGeom);
        residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
    }
    else // if (cu.isInter(0))
    {
        ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough

        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);
        pixel* predU = predYuv.getCbAddr(absPartIdx);
        pixel* predV = predYuv.getCrAddr(absPartIdx);

        /* residual = source - prediction, per plane */
        primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
                                      fencYuv.m_buf[0], predY,
                                      fencYuv.m_size, predYuv.m_size);

        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
                                                    fencYuv.m_buf[1], predU,
                                                    fencYuv.m_csize, predYuv.m_csize);

        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
                                                    fencYuv.m_buf[2], predV,
                                                    fencYuv.m_csize, predYuv.m_csize);

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);

        /* a 2Nx2N merge with no coded residual is, by definition, a skip */
        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */

        if (cu.m_cbf[0][0])
            primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                          predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        else
            /* no coded luma residual: recon is the prediction itself */
            primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                           predY, predYuv.m_size);

        if (cu.m_cbf[1][0])
            primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                        predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predU, predYuv.m_csize);

        if (cu.m_cbf[2][0])
            primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                        predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predV, predYuv.m_csize);
    }

    cu.updatePic(cuGeom.depth);
}
|
|
1999
|
|
2000 void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
|
|
2001 {
|
|
2002 if (m_param->rdLevel >= 3)
|
|
2003 {
|
|
2004 /* code the split flag (0 or 1) and update bit costs */
|
|
2005 mode.contexts.resetBits();
|
|
2006 mode.contexts.codeSplitFlag(mode.cu, 0, depth);
|
|
2007 uint32_t bits = mode.contexts.getNumberOfWrittenBits();
|
|
2008 mode.mvBits += bits;
|
|
2009 mode.totalBits += bits;
|
|
2010 updateModeCost(mode);
|
|
2011 }
|
|
2012 else if (m_param->rdLevel <= 1)
|
|
2013 {
|
|
2014 mode.sa8dBits++;
|
|
2015 mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
|
|
2016 }
|
|
2017 else
|
|
2018 {
|
|
2019 mode.mvBits++;
|
|
2020 mode.totalBits++;
|
|
2021 updateModeCost(mode);
|
|
2022 }
|
|
2023 }
|
|
2024
|
|
2025 uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
|
|
2026 {
|
|
2027 /* Do not attempt to code a block larger than the largest block in the
|
|
2028 * co-located CTUs in L0 and L1 */
|
|
2029 int currentQP = parentCTU.m_qp[0];
|
|
2030 int previousQP = currentQP;
|
|
2031 uint32_t minDepth0 = 4, minDepth1 = 4;
|
|
2032 uint32_t sum = 0;
|
|
2033 int numRefs = 0;
|
|
2034 if (m_slice->m_numRefIdx[0])
|
|
2035 {
|
|
2036 numRefs++;
|
|
2037 const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
|
|
2038 previousQP = cu.m_qp[0];
|
|
2039 if (!cu.m_cuDepth[cuGeom.absPartIdx])
|
|
2040 return 0;
|
|
2041 for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
|
|
2042 {
|
|
2043 uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
|
|
2044 minDepth0 = X265_MIN(d, minDepth0);
|
|
2045 sum += d;
|
|
2046 }
|
|
2047 }
|
|
2048 if (m_slice->m_numRefIdx[1])
|
|
2049 {
|
|
2050 numRefs++;
|
|
2051 const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
|
|
2052 if (!cu.m_cuDepth[cuGeom.absPartIdx])
|
|
2053 return 0;
|
|
2054 for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
|
|
2055 {
|
|
2056 uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
|
|
2057 minDepth1 = X265_MIN(d, minDepth1);
|
|
2058 sum += d;
|
|
2059 }
|
|
2060 }
|
|
2061 if (!numRefs)
|
|
2062 return 0;
|
|
2063
|
|
2064 uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
|
|
2065 uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
|
|
2066
|
|
2067 /* allow block size growth if QP is raising or avg depth is
|
|
2068 * less than 1.5 of min depth */
|
|
2069 if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
|
|
2070 minDepth -= 1;
|
|
2071
|
|
2072 return minDepth;
|
|
2073 }
|
|
2074
|
|
2075 /* returns true if recursion should be stopped */
|
|
2076 bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
|
|
2077 {
|
|
2078 /* early exit when the RD cost of best mode at depth n is less than the sum
|
|
2079 * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright,
|
|
2080 * left, colocated) and avg cost of that CU at depth "n" with weightage for
|
|
2081 * each quantity */
|
|
2082
|
|
2083 uint32_t depth = cuGeom.depth;
|
|
2084 FrameData& curEncData = *m_frame->m_encData;
|
|
2085 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
|
|
2086 uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
|
|
2087 uint64_t cuCount = cuStat.count[depth];
|
|
2088
|
|
2089 uint64_t neighCost = 0, neighCount = 0;
|
|
2090 const CUData* above = parentCTU.m_cuAbove;
|
|
2091 if (above)
|
|
2092 {
|
|
2093 FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
|
|
2094 neighCost += astat.avgCost[depth] * astat.count[depth];
|
|
2095 neighCount += astat.count[depth];
|
|
2096
|
|
2097 const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
|
|
2098 if (aboveLeft)
|
|
2099 {
|
|
2100 FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
|
|
2101 neighCost += lstat.avgCost[depth] * lstat.count[depth];
|
|
2102 neighCount += lstat.count[depth];
|
|
2103 }
|
|
2104
|
|
2105 const CUData* aboveRight = parentCTU.m_cuAboveRight;
|
|
2106 if (aboveRight)
|
|
2107 {
|
|
2108 FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
|
|
2109 neighCost += rstat.avgCost[depth] * rstat.count[depth];
|
|
2110 neighCount += rstat.count[depth];
|
|
2111 }
|
|
2112 }
|
|
2113 const CUData* left = parentCTU.m_cuLeft;
|
|
2114 if (left)
|
|
2115 {
|
|
2116 FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
|
|
2117 neighCost += nstat.avgCost[depth] * nstat.count[depth];
|
|
2118 neighCount += nstat.count[depth];
|
|
2119 }
|
|
2120
|
|
2121 // give 60% weight to all CU's and 40% weight to neighbour CU's
|
|
2122 if (neighCount + cuCount)
|
|
2123 {
|
|
2124 uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
|
|
2125 uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
|
|
2126 if (curCost < avgCost && avgCost)
|
|
2127 return true;
|
|
2128 }
|
|
2129
|
|
2130 return false;
|
|
2131 }
|
|
2132
|
|
2133 int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom)
|
|
2134 {
|
|
2135 FrameData& curEncData = *m_frame->m_encData;
|
|
2136 double qp = curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
|
|
2137
|
|
2138 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
|
|
2139 bool isReferenced = IS_REFERENCED(m_frame);
|
|
2140 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
|
|
2141 if (qpoffs)
|
|
2142 {
|
|
2143 uint32_t width = m_frame->m_fencPic->m_picWidth;
|
|
2144 uint32_t height = m_frame->m_fencPic->m_picHeight;
|
|
2145 uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
|
|
2146 uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
|
|
2147 uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
|
|
2148 uint32_t blockSize = g_maxCUSize >> cuGeom.depth;
|
|
2149 double qp_offset = 0;
|
|
2150 uint32_t cnt = 0;
|
|
2151 uint32_t idx;
|
|
2152
|
|
2153 for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += 16)
|
|
2154 {
|
|
2155 for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += 16)
|
|
2156 {
|
|
2157 idx = ((block_yy / 16) * (maxCols)) + (block_xx / 16);
|
|
2158 qp_offset += qpoffs[idx];
|
|
2159 cnt++;
|
|
2160 }
|
|
2161 }
|
|
2162
|
|
2163 qp_offset /= cnt;
|
|
2164 qp += qp_offset;
|
|
2165 }
|
|
2166
|
|
2167 return x265_clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5));
|
|
2168 }
|