说明:该函数的内部原理与 WelsMdI4x4 函数类似,具体可以参考《openh264 帧内预测编码原理:WelsMdI4x4 函数》。功能:针对 4x4 像素块的帧内模式决策的快速实现逻辑。原型:int32_t WelsMdI4x4Fast (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache)
参数 : pEncCtx: 指向编码上下文的指针。 pWelsMd: 指向模式决策结构的指针。 pCurMb: 指向当前宏块的指针。 pMbCache: 指向宏块缓存的指针。   
 
过程 : 初始化变量; for 循环遍历 16 个 4x4 块,逐块进行预测分析; 计算当前 4x4 块在编码宏块 pEncMb 和解码宏块 pDecMb 中的位置,这是通过 iCoordinateX 和 iCoordinateY(4x4 块在宏块中的相对位置)以及宏块的行大小 kiLineSizeEnc 和 kiLineSizeDec 来实现的; PredIntra4x4Mode 函数根据当前块的扫描顺序索引 kpCache48CountScan4[i] 和缓存的帧内预测模式 pMbCache->iIntraPredMode 来预测当前块的模式 iPredMode; 根据 kiOffset(当前块的邻居关系)确定有多少可用的预测模式(iAvailCount),并获取这些模式的数组 kpAvailMode; 如果 iAvailCount 等于 9 或者等于 7: 初始化最佳模式 iBestMode 为 DC,pfGetLumaI4x4Pred 计算 DC 模式下的预测块,pfMdCost 函数计算出 DC 模式的代价并加上 lambda 项,作为初始最佳代价 iBestCost; 初始化当前模式 iCurMode 为水平 H,pfGetLumaI4x4Pred 计算 H 模式下的预测块,pfMdCost 函数计算出当前代价 iCurCost(同时记为水平代价 iCostH),比较 iCurCost 和 iBestCost,更新最佳代价 iBestCost 和最佳模式 iBestMode; 初始化当前模式 iCurMode 为垂直 V,pfGetLumaI4x4Pred 计算 V 模式下的预测块,pfMdCost 函数计算出当前代价 iCurCost(同时记为垂直代价 iCostV),比较 iCurCost 和 iBestCost,更新最佳代价 iBestCost 和最佳模式 iBestMode; 如果垂直代价 iCostV 小于水平代价 iCostH: 若可预测模式数量 iAvailCount 等于 9,先计算 I4_PRED_VR、I4_PRED_VL 的代价并更新最佳模式;只有当 VR 或 VL 的代价小于 iCostV 时,才继续在 I4_PRED_DDR(iCostVR 小于 iCostVL 时)和 I4_PRED_DDL(否则)中择一计算,选择最佳模式 iBestMode 和最佳代价 iBestCost; 若可预测模式数量 iAvailCount 等于 7,依次计算 I4_PRED_DDR、I4_PRED_VR 模式下的代价,选择最佳模式 iBestMode 和最佳代价 iBestCost; 否则(iCostV 不小于 iCostH): 先计算 I4_PRED_HD、I4_PRED_HU 的代价并更新最佳模式;只有当 HD 或 HU 的代价小于 iCostH 时,才继续计算:若 iCostHD 小于 iCostHU 则计算 I4_PRED_DDR,否则仅在 iAvailCount 等于 9 时计算 I4_PRED_DDL,选择最佳模式 iBestMode 和最佳代价 iBestCost; 否则(iAvailCount 既不等于 9 也不等于 7): 将 iBestCost 初始化为 INT_MAX、iBestMode 初始化为 I4_PRED_INVALID,for 循环遍历每种可用模式; 根据索引 j 从可用模式数组 kpAvailMode 中取出当前模式 iCurMode; 根据 iBestPredBufferNum 的当前值,让 pDst 指向另一个缓冲区(即当前未保存最佳预测块的那个),以便为该模式生成预测; pfGetLumaI4x4Pred[iCurMode] 函数根据当前模式生成预测块; 使用 pfMdCost[BLOCK_4x4] 函数计算预测块和原始编码块之间的代价,并加上 lambda 项,作为当前预测模式的代价 iCurCost; 比较 iCurCost 和 iBestCost,更新 iBestMode、iBestCost,并切换 iBestPredBufferNum 以指向当前最佳的预测块; pBestPredI4x4Blk4 被设置为指向当前最佳预测块的内存地址; 将当前块的最佳代价 iBestCost 加到总代价 iCosti4x4 上; 如果到目前为止累积的代价 iCosti4x4 大于或等于之前计算的亮度代价 iBestCostLuma,则 break 退出循环。这是因为此时 I4x4 的总代价已经不可能低于已有的最佳亮度代价,继续处理剩余的 4x4 块没有意义,提前退出可以节省计算资源; 更新预测模式缓存; iFinalMode 是将 iBestMode 通过映射表 g_kiMapModeI4x4 转换后的模式值; 根据预测模式 iPredMode 和最终模式 iFinalMode 的比较结果,更新 pPrevIntra4x4PredModeFlag 缓存:如果它们相等,则写入 true;否则写入 false,并向 pRemIntra4x4PredModeFlag 当前位置写入剩余模式值(iFinalMode 小于 iPredMode 时为 iFinalMode,否则为 iFinalMode - 1);无论是否相等,pRemIntra4x4PredModeFlag 指针都会递增,为下一个 4x4 块做准备; 将 iFinalMode 更新到 pMbCache->iIntraPredMode 中,这是当前 4x4 块的帧内预测模式缓存,用于后续块的模式预测和编码过程; 调用 WelsEncRecI4x4Y 函数对已确定最佳预测模式的当前 4x4 块进行编码,这个函数会进行实际的变换、量化和重建操作; 循环结束后,调用宏 ST32、LD32 将 pMbCache->iIntraPredMode 数组中从索引 33 开始的 32 位值复制到 pCurMb->pIntra4x4PredMode 的起始位置,即把计算得到的帧内预测模式复制到宏块的正式存储区域; 再将缓存 pMbCache->iIntraPredMode 中索引 12、20、28 处的值分别赋给 pCurMb->pIntra4x4PredMode[4]、[5]、[6],因为宏块中的这些位置需要保存对应位置的预测模式值; iCosti4x4 再加上 (iLambda << 4) + (iLambda << 3),即 24 × iLambda; 返回总代价 iCosti4x4。
原理图 :int32_t  WelsMdI4x4Fast  ( sWelsEncCtx*  pEncCtx,  SWelsMD*  pWelsMd,  SMB*  pCurMb,  SMbCache*  pMbCache)  { SWelsFuncPtrList*  pFunc       =  pEncCtx-> pFuncList; SDqLayer*  pCurDqLayer         =  pEncCtx-> pCurDqLayer; int32_t  iLambda               =  pWelsMd-> iLambda; int32_t  iBestCostLuma         =  pWelsMd-> iCostLuma; uint8_t *  pEncMb               =  pMbCache-> SPicData. pEncMb[ 0 ] ; uint8_t *  pDecMb               =  pMbCache-> SPicData. pCsMb[ 0 ] ; const  int32_t  kiLineSizeEnc   =  pCurDqLayer-> iEncStride[ 0 ] ; const  int32_t  kiLineSizeDec   =  pCurDqLayer-> iCsStride[ 0 ] ; uint8_t *  pCurEnc,  * pCurDec,  * pDst; int8_t  iPredMode,  iCurMode,  iBestMode,  iFinalMode; int32_t  iCurCost,  iBestCost; int32_t  iAvailCount; const  uint8_t *  kpAvailMode; int32_t  i,  j,  iCoordinateX,  iCoordinateY,  iIdxStrideEnc,  iIdxStrideDec; int32_t  iCostH,  iCostV,  iCostVR,  iCostHD,  iCostVL,  iCostHU,  iBestModeFake; int32_t  lambda[ 2 ]  =  { iLambda <<  2 ,  iLambda} ; bool *  pPrevIntra4x4PredModeFlag       =  pMbCache-> pPrevIntra4x4PredModeFlag; int8_t *  pRemIntra4x4PredModeFlag      =  pMbCache-> pRemIntra4x4PredModeFlag; const  uint8_t *  kpIntra4x4AvailCount   =  & g_kiIntra4AvailCount[ 0 ] ; const  uint8_t *  kpCache48CountScan4    =  & g_kuiCache48CountScan4Idx[ 0 ] ; const  int8_t *  kpNeighborIntraToI4x4   =  g_kiNeighborIntraToI4x4[ pMbCache-> uiNeighborIntra] ; const  int8_t *  kpCoordinateIdxX        =  & g_kiCoordinateIdx4x4X[ 0 ] ; const  int8_t *  kpCoordinateIdxY        =  & g_kiCoordinateIdx4x4Y[ 0 ] ; int32_t  iBestPredBufferNum            =  0 ; int32_t  iCosti4x4                     =  0 ; 
# if  defined ( X86_ASM) WelsPrefetchZero_mmx  ( g_kiMapModeI4x4) ; WelsPrefetchZero_mmx  ( ( int8_t * ) & pFunc-> pfGetLumaI4x4Pred) ; 
# endif for  ( i =  0 ;  i <  16 ;  i++ )  { const  int32_t  kiOffset =  kpNeighborIntraToI4x4[ i] ; 
iCoordinateX =  kpCoordinateIdxX[ i] ; iCoordinateY =  kpCoordinateIdxY[ i] ; iIdxStrideEnc =  ( iCoordinateY *  kiLineSizeEnc)  +  iCoordinateX; pCurEnc =  pEncMb +  iIdxStrideEnc; iIdxStrideDec =  ( iCoordinateY *  kiLineSizeDec)  +  iCoordinateX; pCurDec =  pDecMb +  iIdxStrideDec; iPredMode =  PredIntra4x4Mode  ( pMbCache-> iIntraPredMode,  kpCache48CountScan4[ i] ) ; iAvailCount =  kpIntra4x4AvailCount[ kiOffset] ; kpAvailMode =  g_kiIntra4AvailMode[ kiOffset] ; if  ( iAvailCount ==  9  ||  iAvailCount ==  7 )  { iBestMode =  I4_PRED_DC; pDst =  & pMbCache-> pMemPredBlk4[ iBestPredBufferNum <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ I4_PRED_DC]  ( pDst,  pCurDec,  kiLineSizeDec) ; iBestCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iBestMode] ] ; iCurMode =  I4_PRED_H; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCostH =  iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } iCurMode =  I4_PRED_V; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCostV =  iCurCost =  pFunc-> sSampleDealingFuncs. 
pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } if  ( iCostV <  iCostH)  { if  ( iAvailCount ==  9 )  { iBestModeFake =  true ;  iCurMode =  I4_PRED_VR; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCostVR =  iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } if  ( iCurCost <  iCostV) iBestModeFake =  false ; iCurMode =  I4_PRED_VL; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCostVL =  iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } if  ( iCurCost <  iCostV) iBestModeFake =  false ; if  ( ! iBestModeFake)  {  if  ( iCostVR <  iCostVL)  { iCurMode =  I4_PRED_DDR; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCurCost =  pFunc-> sSampleDealingFuncs. 
pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } }  else  { iCurMode =  I4_PRED_DDL; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } } } }  else  if  ( iAvailCount ==  7 )  { iCurMode =  I4_PRED_DDR; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } iCurMode =  I4_PRED_VR; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } } }  else  { iBestModeFake =  true ;  iCurMode =  I4_PRED_HD; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCostHD =  iCurCost =  pFunc-> sSampleDealingFuncs. 
pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } if  ( iCurCost <  iCostH) iBestModeFake =  false ; iCurMode =  I4_PRED_HU; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCostHU =  iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } if  ( iCurCost <  iCostH) iBestModeFake =  false ; if  ( ! iBestModeFake)  {  if  ( iCostHD <  iCostHU)  { iCurMode =  I4_PRED_DDR; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } }  else  if  ( iAvailCount ==  9 )  { iCurMode =  I4_PRED_DDL; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCurCost =  pFunc-> sSampleDealingFuncs. 
pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } } } } }  else  { iBestCost =  INT_MAX; iBestMode =  I4_PRED_INVALID; for  ( j =  0 ;  j <  iAvailCount;  j++ )  { iCurMode =  kpAvailMode[ j] ; pDst =  & pMbCache-> pMemPredBlk4[  ( 1  -  iBestPredBufferNum)  <<  4 ] ; pFunc-> pfGetLumaI4x4Pred[ iCurMode]  ( pDst,  pCurDec,  kiLineSizeDec) ; iCurCost =  pFunc-> sSampleDealingFuncs. pfMdCost[ BLOCK_4x4]  ( pDst,  4 ,  pCurEnc,  kiLineSizeEnc)  + lambda[ iPredMode ==  g_kiMapModeI4x4[ iCurMode] ] ; if  ( iCurCost <  iBestCost)  { iBestMode =  iCurMode; iBestCost =  iCurCost; iBestPredBufferNum =  1  -  iBestPredBufferNum; } } } pMbCache-> pBestPredI4x4Blk4 =  & pMbCache-> pMemPredBlk4[ iBestPredBufferNum <<  4 ] ; iCosti4x4 +=  iBestCost; if  ( iCosti4x4 >=  iBestCostLuma)  { break ; } iFinalMode =  g_kiMapModeI4x4[ iBestMode] ; if  ( iPredMode ==  iFinalMode)  { * pPrevIntra4x4PredModeFlag++  =  true ; }  else  { * pPrevIntra4x4PredModeFlag++  =  false ; * pRemIntra4x4PredModeFlag  =  ( iFinalMode <  iPredMode ?  iFinalMode :  ( iFinalMode -  1 ) ) ; } pRemIntra4x4PredModeFlag++ ; pMbCache-> iIntraPredMode[ kpCache48CountScan4[ i] ]  =  iFinalMode; WelsEncRecI4x4Y  ( pEncCtx,  pCurMb,  pMbCache,  i) ; } ST32  ( pCurMb-> pIntra4x4PredMode,  LD32  ( & pMbCache-> iIntraPredMode[ 33 ] ) ) ; pCurMb-> pIntra4x4PredMode[ 4 ]  =  pMbCache-> iIntraPredMode[ 12 ] ; pCurMb-> pIntra4x4PredMode[ 5 ]  =  pMbCache-> iIntraPredMode[ 20 ] ; pCurMb-> pIntra4x4PredMode[ 6 ]  =  pMbCache-> iIntraPredMode[ 28 ] ; iCosti4x4 +=  ( iLambda <<  4 )  +  ( iLambda <<  3 ) ;  return  iCosti4x4; 
}