media/libstagefright/codecs/m4v_h263/enc/src/fastcodemb.cpp - third_party/android/platform/frameworks/av - Git at Google

 /* ------------------------------------------------------------------
  * Copyright (C) 1998-2009 PacketVideo
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  * express or implied.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  * -------------------------------------------------------------------
  */
 #include "mp4def.h"
 #include "mp4lib_int.h"
 #include "mp4enc_lib.h"
 #include "dct.h"
 #include "m4venc_oscl.h"

 /* ======================================================================== */
 /*  Function : CodeMB_H263( )                                               */
 /*  Date     : 8/15/2001                                                    */
 /*  Purpose  : Perform residue calc (only zero MV), DCT, H263 Quant/Dequant,*/
 /*              IDCT and motion compensation.Modified from FastCodeMB()     */
 /*  Input    :                                                              */
 /*      video       Video encoder data structure                            */
 /*      function    Approximate DCT function, scaling and threshold         */
 /*      ncoefblck   Array for last nonzero coeff for speedup in VlcEncode   */
 /*      QP      Combined offset from the origin to the current          */
 /*                  macroblock  and QP  for current MB.                     */
 /*    Output     :                                                          */
 /*      video->outputMB     Quantized DCT coefficients.                     */
 /*      currVop->yChan,uChan,vChan  Reconstructed pixels                    */
 /*                                                                          */
 /*  Return   :   PV_STATUS                                                  */
 /*  Modified :                                                              */
 /*           2/26/01
             -modified threshold based on correlation coeff 0.75 only for mode H.263
             -ncoefblck[] as input,  to keep position of last non-zero coeff*/
 /*           8/10/01
             -modified threshold based on correlation coeff 0.5
             -used column threshold to speedup column DCT.
             -used bitmap zigzag to speedup RunLevel().                      */
 /* ======================================================================== */

 /* Encode the six 8x8 blocks of macroblock video->mbnum using the H.263
  * quantizer.  Blocks k = 0..3 are read from the Y plane, k = 4 from U and
  * k = 5 from V.  For each block the SAD selects how much of the DCT is
  * computed (nothing, DC only, 2x2, 4x4 or full 8x8); the block is then
  * quantized/dequantized and reconstructed into currVop.
  *
  *   video      encoder state; supplies input frame, prediction, work
  *              buffers and receives the reconstructed pixels and CBP.
  *   function   unused (see OSCL_UNUSED_ARG below).
  *   QP         packed value: bits above the low five are the byte offset
  *              of the macroblock in the Y plane, the low five bits are
  *              the quantizer.
  *   ncoefblck  per-block output, set to 0, 1, 6, 26 or 64 depending on
  *              the DCT size chosen for block k.
  *
  * Writes quantized coefficients to video->outputMB->block[], the coded
  * block pattern to video->headerInfo.CBP[mbnum], and always returns
  * PV_SUCCESS.
  */
 PV_STATUS CodeMB_H263(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
 {
     Int sad, k, CBP, mbnum = video->mbnum;
     Short *output, *dataBlock;
     UChar Mode = video->headerInfo.Mode[mbnum];
     UChar *bitmapcol, *bitmaprow = video->bitmaprow;
     UInt  *bitmapzz ;
     UChar shortHeader = video->vol[video->currLayer]->shortVideoHeader;
     Int dc_scaler = 8;
     Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
     struct QPstruct QuantParam;
     Int dctMode, DctTh1;
     Int ColTh;
     /* Intra/inter specific routines are bound once below and then called
        through these pointers inside the block loop. */
     Int(*BlockQuantDequantH263)(Short *, Short *, struct QPstruct *,
                                 UChar[], UChar *, UInt *, Int, Int, Int, UChar);
     Int(*BlockQuantDequantH263DC)(Short *, Short *, struct QPstruct *,
                                   UChar *, UInt *, Int, UChar);
     void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
     void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
     void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
     void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);

     /* motion comp. related var. */
     Vop *currVop = video->currVop;
     VideoEncFrameIO *inputFrame = video->input;
     Int ind_x = video->outputMB->mb_x;
     Int ind_y = video->outputMB->mb_y;
     Int lx = currVop->pitch;
     Int width = currVop->width;
     UChar *rec, *input, *pred;
     Int offset = QP >> 5;  /* QP is combined offset and QP */
     Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
     /*****************************/

     OSCL_UNUSED_ARG(function);

     output = video->outputMB->block[0];
     CBP = 0;
     QP = QP & 0x1F; /* from here on QP is only the 5-bit quantizer */
 //  M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero , 7/24/01*/

     /* Precompute the QP-derived constants handed to the quantizer. */
     QuantParam.QPx2 = QP << 1;
     QuantParam.QP = QP;
     QuantParam.QPdiv2 = QP >> 1;
     QuantParam.QPx2plus = QuantParam.QPx2 + QuantParam.QPdiv2;
     QuantParam.Addition = QP - 1 + (QP & 0x1);

     if (intra)
     {
         BlockDCT1x1 = &Block1x1DCTIntra;
         BlockDCT2x2 = &Block2x2DCT_AANIntra;
         BlockDCT4x4 = &Block4x4DCT_AANIntra;
         BlockDCT8x8 = &BlockDCT_AANIntra;
         BlockQuantDequantH263 = &BlockQuantDequantH263Intra;
         BlockQuantDequantH263DC = &BlockQuantDequantH263DCIntra;
         if (shortHeader)
         {
             dc_scaler = 8;
         }
         else
         {
             dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
         }
         DctTh1 = (Int)(dc_scaler * 3);//*1.829
         ColTh = ColThIntra[QP];
     }
     else
     {
         BlockDCT1x1 = &Block1x1DCTwSub;
         BlockDCT2x2 = &Block2x2DCT_AANwSub;
         BlockDCT4x4 = &Block4x4DCT_AANwSub;
         BlockDCT8x8 = &BlockDCT_AANwSub;

         BlockQuantDequantH263 = &BlockQuantDequantH263Inter;
         BlockQuantDequantH263DC = &BlockQuantDequantH263DCInter;
         ColTh = ColThInter[QP];
         DctTh1 = (Int)(16 * QP);  //9*QP;
     }

     rec = currVop->yChan + offset;
     input = inputFrame->yChan + offset;
     if (lx != width) input -= (ind_y << 9);  /* non-padded offset */

     dataBlock = video->dataBlock;
     pred = video->predictedMB;

     for (k = 0; k < 6; k++)
     {
         CBP <<= 1; /* make room for this block's coded bit */
         bitmapcol = video->bitmapcol[k];
         bitmapzz = video->bitmapzz[k];  /*  7/30/01 */
         if (k < 4)
         {
             /* Luma: the SAD is taken from the stored motion data,
                entries 1..4 of mot[mbnum]. */
             sad = video->mot[mbnum][k+1].sad;
             if (k&1)
             {
                 /* odd block: 8 pixels to the right of the previous one */
                 rec += 8;
                 input += 8;
             }
             else if (k == 2)
             {
                 /* Block 2: 8 rows down and 8 pixels back to the left.
                    dctMode is only scratch storage here; it is assigned
                    its real value further below. */
                 dctMode = ((width << 3) - 8);
                 input += dctMode;
                 dctMode = ((lx << 3) - 8);
                 rec += dctMode;
             }
         }
         else
         {
             if (k == 4)
             {
                 rec = currVop->uChan + offsetc;
                 input = inputFrame->uChan + offsetc;
                 if (lx != width) input -= (ind_y << 7);
                 /* Halve the strides for the chroma planes; they stay
                    halved for k == 5 as well. */
                 lx >>= 1;
                 width >>= 1;
                 if (intra)
                 {
                     sad = getBlockSum(input, width);
                     if (shortHeader)
                         dc_scaler = 8;
                     else
                     {
                         dc_scaler = cal_dc_scalerENC(QP, 2); /* chrominance blocks */
                     }
                     DctTh1 = (Int)(dc_scaler * 3);//*1.829
                 }
                 else
                     sad = Sad8x8(input, pred, width);
             }
             else
             {
                 rec = currVop->vChan + offsetc;
                 input = inputFrame->vChan + offsetc;
                 if (lx != width) input -= (ind_y << 7);
                 if (intra)
                 {
                     sad = getBlockSum(input, width);
                 }
                 else
                     sad = Sad8x8(input, pred, width);
             }
         }

         /* Choose the DCT size from the SAD: below DctTh1 nothing is coded,
            then 18*QP, 22*QP and 32*QP separate DC-only, 2x2, 4x4 and
            full 8x8. */
         if (sad < DctTh1 && !(shortHeader && intra)) /* all-zero */
         {                       /* For shortHeader intra block, DC value cannot be zero */
             dctMode = 0;
             CBP |= 0;
             ncoefblck[k] = 0;
         }
         else if (sad < 18*QP/*(QP<<4)*/) /* DC-only */
         {
             dctMode = 1;
             BlockDCT1x1(dataBlock, input, pred, width);

             CBP |= (*BlockQuantDequantH263DC)(dataBlock, output, &QuantParam,
                                               bitmaprow + k, bitmapzz, dc_scaler, shortHeader);
             ncoefblck[k] = 1;
         }
         else
         {

             /* The column threshold is stored in element 64 of dataBlock,
                presumably for the DCT routine called next -- TODO confirm
                against dct.h. */
             dataBlock[64] = ColTh;

             if (sad < 22*QP/*(QP<<4)+(QP<<1)*/)  /* 2x2 DCT */
             {
                 dctMode = 2;
                 BlockDCT2x2(dataBlock, input, pred, width);
                 ncoefblck[k] = 6;
             }
             else if (sad < (QP << 5)) /* 4x4 DCT */
             {
                 dctMode = 4;
                 BlockDCT4x4(dataBlock, input, pred, width);
                 ncoefblck[k] = 26;
             }
             else /* Full-DCT */
             {
                 dctMode = 8;
                 BlockDCT8x8(dataBlock, input, pred, width);
                 ncoefblck[k] = 64;
             }

             CBP |= (*BlockQuantDequantH263)(dataBlock, output, &QuantParam,
                                             bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler, shortHeader);
         }
         /* Reconstruct into rec; the last argument packs the reconstruction
            pitch (shifted left by one) with the intra flag in bit 0. */
         BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | intra);
         output += 64;
         /* Advance the prediction pointer: +8 after an even block and +120
            after an odd one, i.e. 128 bytes per pair of blocks.
            NOTE(review): this looks like a 16-byte row stride in
            predictedMB -- confirm against its allocation. */
         if (!(k&1))
         {
             pred += 8;
         }
         else
         {
             pred += 120;
         }
     }

     video->headerInfo.CBP[mbnum] = CBP; /*  5/18/2001 */
     return PV_SUCCESS;
 }

 #ifndef NO_MPEG_QUANT
 /* ======================================================================== */
 /*  Function : CodeMB_MPEG( )                                               */
 /*  Date     : 8/15/2001                                                    */
 /*  Purpose  : Perform residue calc (only zero MV), DCT, MPEG Quant/Dequant,*/
 /*              IDCT and motion compensation.Modified from FastCodeMB()     */
 /*  Input    :                                                              */
 /*      video       Video encoder data structure                            */
 /*      function    Approximate DCT function, scaling and threshold         */
 /*      ncoefblck   Array for last nonzero coeff for speedup in VlcEncode   */
 /*      QP      Combined offset from the origin to the current          */
 /*                  macroblock  and QP  for current MB.                     */
 /*    Output     :                                                          */
 /*      video->outputMB     Quantized DCT coefficients.                     */
 /*      currVop->yChan,uChan,vChan  Reconstructed pixels                    */
 /*                                                                          */
 /*  Return   :   PV_STATUS                                                  */
 /*  Modified :                                                              */
 /*           2/26/01
             -modified threshold based on correlation coeff 0.75 only for mode H.263
             -ncoefblck[] as input, keep position of last non-zero coeff*/
 /*           8/10/01
             -modified threshold based on correlation coeff 0.5
             -used column threshold to speedup column DCT.
             -used bitmap zigzag to speedup RunLevel().                      */
 /* ======================================================================== */

 /* Encode the six 8x8 blocks of macroblock video->mbnum using the MPEG
  * quantizer (quantization matrix taken from the current Vol).  Blocks
  * k = 0..3 are read from the Y plane, k = 4 from U and k = 5 from V.
  * For each block the SAD is compared against DctTh1..DctTh4 to decide how
  * much of the DCT is computed; the block is then quantized/dequantized and
  * reconstructed into currVop.
  *
  *   video      encoder state; supplies input frame, prediction, work
  *              buffers and receives the reconstructed pixels and CBP.
  *   function   unused (see OSCL_UNUSED_ARG below).
  *   QP         packed value: bits above the low five are the byte offset
  *              of the macroblock in the Y plane, the low five bits are
  *              the quantizer.
  *   ncoefblck  per-block output, set to 0, 1, 6, 26 or 64 depending on
  *              the DCT size chosen for block k.
  *
  * Writes quantized coefficients to video->outputMB->block[], the coded
  * block pattern to video->headerInfo.CBP[mbnum], and always returns
  * PV_SUCCESS.
  */
 PV_STATUS CodeMB_MPEG(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
 {
     Int sad, k, CBP, mbnum = video->mbnum;
     Short *output, *dataBlock;
     UChar Mode = video->headerInfo.Mode[mbnum];
     UChar *bitmapcol, *bitmaprow = video->bitmaprow;
     UInt  *bitmapzz ;
     Int dc_scaler = 8;
     Vol *currVol = video->vol[video->currLayer];
     Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
     Int *qmat;
     Int dctMode, DctTh1, DctTh2, DctTh3, DctTh4;
     Int ColTh;

     /* Intra/inter specific routines are bound once below and then called
        through these pointers inside the block loop. */
     Int(*BlockQuantDequantMPEG)(Short *, Short *, Int, Int *,
                                 UChar [], UChar *, UInt *, Int,  Int, Int);
     Int(*BlockQuantDequantMPEGDC)(Short *, Short *, Int, Int *,
                                   UChar [], UChar *, UInt *, Int);

     void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
     void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
     void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
     void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);

     /* motion comp. related var. */
     Vop *currVop = video->currVop;
     VideoEncFrameIO *inputFrame = video->input;
     Int ind_x = video->outputMB->mb_x;
     Int ind_y = video->outputMB->mb_y;
     Int lx = currVop->pitch;
     Int width = currVop->width;
     UChar *rec, *input, *pred;
     Int offset = QP >> 5; /* upper bits of QP carry the luma offset */
     Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
     /*****************************/

     OSCL_UNUSED_ARG(function);

     output = video->outputMB->block[0];
     CBP = 0;
     QP = QP & 0x1F; /* from here on QP is only the 5-bit quantizer */
 //  M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero ,  7/24/01*/

     /* Bind the routines, pick the quantization matrix (iqmat for intra,
        niqmat for inter) and derive the SAD thresholds from QP and selected
        matrix entries; the floating-point products are truncated to Int. */
     if (intra)
     {
         BlockDCT1x1 = &Block1x1DCTIntra;
         BlockDCT2x2 = &Block2x2DCT_AANIntra;
         BlockDCT4x4 = &Block4x4DCT_AANIntra;
         BlockDCT8x8 = &BlockDCT_AANIntra;

         BlockQuantDequantMPEG = &BlockQuantDequantMPEGIntra;
         BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCIntra;
         dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
         qmat = currVol->iqmat;
         DctTh1 = (Int)(3 * dc_scaler);//2*dc_scaler);
         DctTh2 = (Int)((1.25 * QP - 1) * qmat[1] * 0.45);//0.567);//0.567);
         DctTh3 = (Int)((1.25 * QP - 1) * qmat[2] * 0.55);//1.162); /*  8/2/2001 */
         DctTh4 = (Int)((1.25 * QP - 1) * qmat[32] * 0.8);//1.7583);//0.7942);
         ColTh = ColThIntra[QP];
     }
     else
     {
         BlockDCT1x1 = &Block1x1DCTwSub;
         BlockDCT2x2 = &Block2x2DCT_AANwSub;
         BlockDCT4x4 = &Block4x4DCT_AANwSub;
         BlockDCT8x8 = &BlockDCT_AANwSub;

         BlockQuantDequantMPEG = &BlockQuantDequantMPEGInter;
         BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCInter;
         qmat = currVol->niqmat;
         DctTh1 = (Int)(((QP << 1) - 0.5) * qmat[0] * 0.4);//0.2286);//0.3062);
         DctTh2 = (Int)(((QP << 1) - 0.5) * qmat[1] * 0.45);//0.567);//0.4);
         DctTh3 = (Int)(((QP << 1) - 0.5) * qmat[2] * 0.55);//1.162); /*  8/2/2001 */
         DctTh4 = (Int)(((QP << 1) - 0.5) * qmat[32] * 0.8);//1.7583);//0.7942);
         ColTh = ColThInter[QP];
     }// get qmat, DctTh1, DctTh2, DctTh3

     rec = currVop->yChan + offset;
     input = inputFrame->yChan + offset;
     if (lx != width) input -= (ind_y << 9);  /* non-padded offset */

     dataBlock = video->dataBlock;
     pred = video->predictedMB;

     for (k = 0; k < 6; k++)
     {
         CBP <<= 1; /* make room for this block's coded bit */
         bitmapcol = video->bitmapcol[k];
         bitmapzz = video->bitmapzz[k];  /*  8/2/01 */
         if (k < 4)
         {//Y block
             /* The SAD is taken from the stored motion data, entries 1..4
                of mot[mbnum]. */
             sad = video->mot[mbnum][k+1].sad;
             if (k&1)
             {
                 /* odd block: 8 pixels to the right of the previous one */
                 rec += 8;
                 input += 8;
             }
             else if (k == 2)
             {
                 /* Block 2: 8 rows down and 8 pixels back to the left.
                    dctMode is only scratch storage here; it is assigned
                    its real value further below. */
                 dctMode = ((width << 3) - 8);
                 input += dctMode;
                 dctMode = ((lx << 3) - 8);
                 rec += dctMode;
             }
         }
         else
         {// U, V block
             if (k == 4)
             {
                 rec = currVop->uChan + offsetc;
                 input = inputFrame->uChan + offsetc;
                 if (lx != width) input -= (ind_y << 7);
                 /* Halve the strides for the chroma planes; they stay
                    halved for k == 5 as well. */
                 lx >>= 1;
                 width >>= 1;
                 if (intra)
                 {
                     /* Only DctTh1 is recomputed for chroma; DctTh2..DctTh4
                        keep the values derived above. */
                     dc_scaler = cal_dc_scalerENC(QP, 2); /* chrominance blocks */
                     DctTh1 = dc_scaler * 3;
                     sad = getBlockSum(input, width);
                 }
                 else
                     sad = Sad8x8(input, pred, width);
             }
             else
             {
                 rec = currVop->vChan + offsetc;
                 input = inputFrame->vChan + offsetc;
                 if (lx != width) input -= (ind_y << 7);
                 if (intra)
                     sad = getBlockSum(input, width);
                 else
                     sad = Sad8x8(input, pred, width);
             }
         }

         if (sad < DctTh1) /* all-zero */
         {
             dctMode = 0;
             CBP |= 0;
             ncoefblck[k] = 0;
         }
         else if (sad < DctTh2) /* DC-only */
         {
             dctMode = 1;
             BlockDCT1x1(dataBlock, input, pred, width);

             CBP |= (*BlockQuantDequantMPEGDC)(dataBlock, output, QP, qmat,
                                               bitmapcol, bitmaprow + k, bitmapzz, dc_scaler);
             ncoefblck[k] = 1;
         }
         else
         {
             /* The column threshold is stored in element 64 of dataBlock,
                presumably for the DCT routine called next -- TODO confirm
                against dct.h. */
             dataBlock[64] = ColTh;

             if (sad < DctTh3) /* 2x2-DCT */
             {
                 dctMode = 2;
                 BlockDCT2x2(dataBlock, input, pred, width);
                 ncoefblck[k] = 6;
             }
             else if (sad < DctTh4) /* 4x4 DCT */
             {
                 dctMode = 4;
                 BlockDCT4x4(dataBlock, input, pred, width);
                 ncoefblck[k] = 26;
             }
             else /* full-DCT */
             {
                 dctMode = 8;
                 BlockDCT8x8(dataBlock, input, pred, width);
                 ncoefblck[k] = 64;
             }

             CBP |= (*BlockQuantDequantMPEG)(dataBlock, output, QP, qmat,
                                             bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler); //
         }
         /* Reconstruction is always requested with dctMode 8, whatever size
            was used for the forward transform above. */
         dctMode = 8; /* for mismatch handle */
         /* The last argument packs the reconstruction pitch (shifted left by
            one) with the intra flag in bit 0. */
         BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | (intra));

         output += 64;
         /* Advance the prediction pointer: +8 after an even block and +120
            after an odd one, i.e. 128 bytes per pair of blocks.
            NOTE(review): this looks like a 16-byte row stride in
            predictedMB -- confirm against its allocation. */
         if (!(k&1))
         {
             pred += 8;
         }
         else
         {
             pred += 120;
         }
     }

     video->headerInfo.CBP[mbnum] = CBP; /*  5/18/2001 */
     return PV_SUCCESS;
 }

 #endif

 /* ======================================================================== */
 /*  Function : getBlockSAV( )                                               */
 /*  Date     : 8/10/2000                                                    */
 /*  Purpose  : Get SAV for one block                                        */
 /*  In/out   : block[64] contain one block data                             */
 /*  Return   :                                                              */
 /*  Modified :                                                              */
 /* ======================================================================== */
 /* can be written in MMX or SSE,  2/22/2001 */
 Int getBlockSAV(Short block[])
 {
     /* Sum of absolute values over the 64 entries of block[]. */
     Int total = 0;
     Int n;

     for (n = 0; n < 64; n++)
     {
         Int coeff = block[n];

         /* Zero contributes nothing either way, so it may take the
            negated branch just as in a plain abs(). */
         total += (coeff > 0) ? coeff : -coeff;
     }

     return total;
 }

 /* ======================================================================== */
 /*  Function : Sad8x8( )                                                    */
 /*  Date     : 8/10/2000                                                    */
 /*  Purpose  : Find SAD between prev block and current block                */
 /*  In/out   : Previous and current frame block pointers, and frame width   */
 /*  Return   :                                                              */
 /*  Modified :                                                              */
 /*      8/15/01,  - do 4 pixel at a time    assuming 32 bit register        */
 /* ======================================================================== */
 /* The clang integer sanitizer is switched off for this function,
    presumably because the packed-byte arithmetic below intentionally
    wraps. */
 #ifdef __clang__
 __attribute((no_sanitize("integer")))
 #endif
 /* Sum of absolute differences between an 8-byte-wide strip of cur and prev.
  * Each row is read as two Int words (four bytes per word are processed at
  * once).  cur advances by (width >> 2) Ints per row and the loop stops once
  * it reaches cur + 8 * width; prev advances by a fixed 4 Ints per row,
  * independent of width.
  *
  * NOTE(review): both pointers are dereferenced as Int*, so the code assumes
  * they are suitably aligned and that Int/UInt are 32 bits wide (sgn_msk
  * covers exactly four bytes) -- confirm with the callers and type headers.
  */
 Int Sad8x8(UChar *cur, UChar *prev, Int width)
 {
     UChar *end = cur + (width << 3);
     Int sad = 0;
     Int *curInt = (Int*) cur;
     Int *prevInt = (Int*) prev;
     Int cur1, cur2, prev1, prev2;
     UInt mask, sgn_msk = 0x80808080;
     Int  sum2 = 0, sum4 = 0;
     Int  tmp;
     do
     {
         mask    = ~(0xFF00);
         cur1    = curInt[1];        /* load cur[4..7] */
         cur2    = curInt[0];        /* load cur[0..3] */
         curInt += (width >> 2);     /* step cur to the next row */
         prev1   = prevInt[1];
         prev2   = prevInt[0];
         prevInt += 4;               /* step prev to its next row */

         /* Per-byte |prev - cur| for the low word, done on all four bytes
            at once: subtract the words, detect which bytes borrowed, then
            fix those bytes up and invert them. */
         tmp     = prev2 ^ cur2;
         cur2    = prev2 - cur2;
         tmp     = tmp ^ cur2;       /* (^)^(-) last bit is one if carry */
         tmp     = sgn_msk & ((UInt)tmp >> 1); /* check the sign of each byte */
         if (cur2 < 0)   tmp = tmp | 0x80000000; /* correct sign of first byte */
         tmp     = (tmp << 8) - tmp;     /* carry borrowed bytes are marked with 0x1FE */
         cur2    = cur2 + (tmp >> 7);     /* negative bytes is added with 0xFF, -1 */
         cur2    = cur2 ^(tmp >> 7); /* take absolute by inverting bits (EOR) */

         /* Same sequence for the high word. */
         tmp     = prev1 ^ cur1;
         cur1    = prev1 - cur1;
         tmp     = tmp ^ cur1;       /* (^)^(-) last bit is one if carry */
         tmp     = sgn_msk & ((UInt)tmp >> 1); /* check the sign of each byte */
         if (cur1 < 0)   tmp = tmp | 0x80000000; /* correct sign of first byte */
         tmp     = (tmp << 8) - tmp;     /* carry borrowed bytes are marked with 0x1FE */
         cur1    = cur1 + (tmp >> 7);     /* negative bytes is added with 0xFF, -1 */
         cur1    = cur1 ^(tmp >> 7); /* take absolute by inverting bits (EOR) */

         /* sum4 accumulates the packed words as they are; sum2 accumulates
            only bytes 1 and 3 of each word, shifted down into two 16-bit
            lanes (mask << 8 keeps bits 8-15 and 24-31 for a 32-bit UInt). */
         sum4    = sum4 + cur1;
         cur1    = cur1 & (mask << 8);   /* keep bytes 1 and 3 of the word */
         sum2    = sum2 + ((UInt)cur1 >> 8);
         sum4    = sum4 + cur2;
         cur2    = cur2 & (mask << 8);   /* keep bytes 1 and 3 of the word */
         sum2    = sum2 + ((UInt)cur2 >> 8);
     }
     while ((uintptr_t)curInt < (uintptr_t)end);

     /* Separate the byte-0/byte-2 lane sums from sum4, then fold everything
        into the upper 16 bits. */
     cur1 = sum4 - (sum2 << 8);  /* get even-sum */
     cur1 = cur1 + sum2;         /* add 16 bit even-sum and odd-sum*/
     cur1 = cur1 + (cur1 << 16); /* add upper and lower 16 bit sum */
     sad  = ((UInt)cur1 >> 16);  /* take upper 16 bit */
     return sad;
 }

 /* ======================================================================== */
 /*  Function : getBlockSum( )                                               */
 /*  Date     : 8/10/2000                                                    */
 /*  Purpose  : Find summation of value within a block.                      */
 /*  In/out   : Pointer to current block in a frame and frame width          */
 /*  Return   :                                                              */
 /*  Modified :                                                              */
 /*          8/15/01,  - SIMD 4 pixels at a time                         */
 /* ======================================================================== */
 #ifdef __clang__
 __attribute((no_sanitize("integer")))
 #endif
 /* Sum of the pixel values in an 8-byte-wide strip starting at cur.
  * Each row is read as two Int words; rows are (width >> 2) Ints apart and
  * reading stops once the row pointer reaches cur + 8 * width.  The return
  * value is the low 16 bits of the total.
  */
 Int getBlockSum(UChar *cur, Int width)
 {
     /* Selects bytes 1 and 3 of a packed word. */
     const UInt oddByteMask = ((UInt)~(0xFF00)) << 8;
     const uintptr_t stop = (uintptr_t)(cur + (width << 3));
     const Int rowStep = width >> 2;
     Int *row = (Int*)cur;
     Int packedSum = 0;   /* running sum of the raw packed words */
     Int oddLaneSum = 0;  /* bytes 1 and 3, accumulated in 16-bit lanes */
     Int folded;

     do
     {
         Int high = row[1];
         Int low = row[0];

         row += rowStep;

         packedSum += high;
         oddLaneSum += ((UInt)(high & oddByteMask) >> 8);
         packedSum += low;
         oddLaneSum += ((UInt)(low & oddByteMask) >> 8);
     }
     while ((uintptr_t)row < stop);

     /* Strip the odd bytes out of the packed sum to get the even-byte lanes,
        add the odd lanes back in, then fold both 16-bit lanes together. */
     folded = packedSum - (oddLaneSum << 8);
     folded = folded + oddLaneSum;
     folded = folded + (folded << 16);

     return (Int)((UInt)folded >> 16);
 }
	/* ------------------------------------------------------------------
	* Copyright (C) 1998-2009 PacketVideo
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
	* express or implied.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	* -------------------------------------------------------------------
	*/
	#include "mp4def.h"
	#include "mp4lib_int.h"
	#include "mp4enc_lib.h"
	#include "dct.h"
	#include "m4venc_oscl.h"

	/* ======================================================================== */
	/* Function : CodeMB_H263( ) */
	/* Date : 8/15/2001 */
	/* Purpose : Perform residue calc (only zero MV), DCT, H263 Quant/Dequant,*/
	/* IDCT and motion compensation.Modified from FastCodeMB() */
	/* Input : */
	/* video Video encoder data structure */
	/* function Approximate DCT function, scaling and threshold */
	/* ncoefblck Array for last nonzero coeff for speedup in VlcEncode */
	/* QP Combined offset from the origin to the current */
	/* macroblock and QP for current MB. */
	/* Output : */
	/* video->outputMB Quantized DCT coefficients. */
	/* currVop->yChan,uChan,vChan Reconstructed pixels */
	/* */
	/* Return : PV_STATUS */
	/* Modified : */
	/* 2/26/01
	-modified threshold based on correlation coeff 0.75 only for mode H.263
	-ncoefblck[] as input, to keep position of last non-zero coeff*/
	/* 8/10/01
	-modified threshold based on correlation coeff 0.5
	-used column threshold to speedup column DCT.
	-used bitmap zigzag to speedup RunLevel(). */
	/* ======================================================================== */

	/* Encode the six 8x8 blocks of macroblock video->mbnum using the H.263
	 * quantizer.  Blocks k = 0..3 are read from the Y plane, k = 4 from U and
	 * k = 5 from V.  For each block the SAD selects how much of the DCT is
	 * computed (nothing, DC only, 2x2, 4x4 or full 8x8); the block is then
	 * quantized/dequantized and reconstructed into currVop.
	 *
	 *   video      encoder state; supplies input frame, prediction, work
	 *              buffers and receives the reconstructed pixels and CBP.
	 *   function   unused (see OSCL_UNUSED_ARG below).
	 *   QP         packed value: bits above the low five are the byte offset
	 *              of the macroblock in the Y plane, the low five bits are
	 *              the quantizer.
	 *   ncoefblck  per-block output, set to 0, 1, 6, 26 or 64 depending on
	 *              the DCT size chosen for block k.
	 *
	 * Writes quantized coefficients to video->outputMB->block[], the coded
	 * block pattern to video->headerInfo.CBP[mbnum], and always returns
	 * PV_SUCCESS.
	 */
	PV_STATUS CodeMB_H263(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
	{
	    Int sad, k, CBP, mbnum = video->mbnum;
	    Short *output, *dataBlock;
	    UChar Mode = video->headerInfo.Mode[mbnum];
	    UChar *bitmapcol, *bitmaprow = video->bitmaprow;
	    UInt  *bitmapzz ;
	    UChar shortHeader = video->vol[video->currLayer]->shortVideoHeader;
	    Int dc_scaler = 8;
	    Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
	    struct QPstruct QuantParam;
	    Int dctMode, DctTh1;
	    Int ColTh;
	    /* Intra/inter specific routines are bound once below and then called
	       through these pointers inside the block loop. */
	    Int(*BlockQuantDequantH263)(Short *, Short *, struct QPstruct *,
	                                UChar[], UChar *, UInt *, Int, Int, Int, UChar);
	    Int(*BlockQuantDequantH263DC)(Short *, Short *, struct QPstruct *,
	                                  UChar *, UInt *, Int, UChar);
	    void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
	    void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
	    void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
	    void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);

	    /* motion comp. related var. */
	    Vop *currVop = video->currVop;
	    VideoEncFrameIO *inputFrame = video->input;
	    Int ind_x = video->outputMB->mb_x;
	    Int ind_y = video->outputMB->mb_y;
	    Int lx = currVop->pitch;
	    Int width = currVop->width;
	    UChar *rec, *input, *pred;
	    Int offset = QP >> 5; /* QP is combined offset and QP */
	    Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
	    /*****************************/

	    OSCL_UNUSED_ARG(function);

	    output = video->outputMB->block[0];
	    CBP = 0;
	    QP = QP & 0x1F; /* from here on QP is only the 5-bit quantizer */
	//  M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero , 7/24/01*/

	    /* Precompute the QP-derived constants handed to the quantizer. */
	    QuantParam.QPx2 = QP << 1;
	    QuantParam.QP = QP;
	    QuantParam.QPdiv2 = QP >> 1;
	    QuantParam.QPx2plus = QuantParam.QPx2 + QuantParam.QPdiv2;
	    QuantParam.Addition = QP - 1 + (QP & 0x1);

	    if (intra)
	    {
	        BlockDCT1x1 = &Block1x1DCTIntra;
	        BlockDCT2x2 = &Block2x2DCT_AANIntra;
	        BlockDCT4x4 = &Block4x4DCT_AANIntra;
	        BlockDCT8x8 = &BlockDCT_AANIntra;
	        BlockQuantDequantH263 = &BlockQuantDequantH263Intra;
	        BlockQuantDequantH263DC = &BlockQuantDequantH263DCIntra;
	        if (shortHeader)
	        {
	            dc_scaler = 8;
	        }
	        else
	        {
	            dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
	        }
	        DctTh1 = (Int)(dc_scaler * 3);//*1.829
	        ColTh = ColThIntra[QP];
	    }
	    else
	    {
	        BlockDCT1x1 = &Block1x1DCTwSub;
	        BlockDCT2x2 = &Block2x2DCT_AANwSub;
	        BlockDCT4x4 = &Block4x4DCT_AANwSub;
	        BlockDCT8x8 = &BlockDCT_AANwSub;

	        BlockQuantDequantH263 = &BlockQuantDequantH263Inter;
	        BlockQuantDequantH263DC = &BlockQuantDequantH263DCInter;
	        ColTh = ColThInter[QP];
	        DctTh1 = (Int)(16 * QP); //9*QP;
	    }

	    rec = currVop->yChan + offset;
	    input = inputFrame->yChan + offset;
	    if (lx != width) input -= (ind_y << 9); /* non-padded offset */

	    dataBlock = video->dataBlock;
	    pred = video->predictedMB;

	    for (k = 0; k < 6; k++)
	    {
	        CBP <<= 1; /* make room for this block's coded bit */
	        bitmapcol = video->bitmapcol[k];
	        bitmapzz = video->bitmapzz[k]; /* 7/30/01 */
	        if (k < 4)
	        {
	            /* Luma: the SAD is taken from the stored motion data,
	               entries 1..4 of mot[mbnum]. */
	            sad = video->mot[mbnum][k+1].sad;
	            if (k&1)
	            {
	                /* odd block: 8 pixels to the right of the previous one */
	                rec += 8;
	                input += 8;
	            }
	            else if (k == 2)
	            {
	                /* Block 2: 8 rows down and 8 pixels back to the left.
	                   dctMode is only scratch storage here; it is assigned
	                   its real value further below. */
	                dctMode = ((width << 3) - 8);
	                input += dctMode;
	                dctMode = ((lx << 3) - 8);
	                rec += dctMode;
	            }
	        }
	        else
	        {
	            if (k == 4)
	            {
	                rec = currVop->uChan + offsetc;
	                input = inputFrame->uChan + offsetc;
	                if (lx != width) input -= (ind_y << 7);
	                /* Halve the strides for the chroma planes; they stay
	                   halved for k == 5 as well. */
	                lx >>= 1;
	                width >>= 1;
	                if (intra)
	                {
	                    sad = getBlockSum(input, width);
	                    if (shortHeader)
	                        dc_scaler = 8;
	                    else
	                    {
	                        dc_scaler = cal_dc_scalerENC(QP, 2); /* chrominance blocks */
	                    }
	                    DctTh1 = (Int)(dc_scaler * 3);//*1.829
	                }
	                else
	                    sad = Sad8x8(input, pred, width);
	            }
	            else
	            {
	                rec = currVop->vChan + offsetc;
	                input = inputFrame->vChan + offsetc;
	                if (lx != width) input -= (ind_y << 7);
	                if (intra)
	                {
	                    sad = getBlockSum(input, width);
	                }
	                else
	                    sad = Sad8x8(input, pred, width);
	            }
	        }

	        /* Choose the DCT size from the SAD: below DctTh1 nothing is coded,
	           then 18*QP, 22*QP and 32*QP separate DC-only, 2x2, 4x4 and
	           full 8x8. */
	        if (sad < DctTh1 && !(shortHeader && intra)) /* all-zero */
	        { /* For shortHeader intra block, DC value cannot be zero */
	            dctMode = 0;
	            CBP |= 0;
	            ncoefblck[k] = 0;
	        }
	        else if (sad < 18*QP/*(QP<<4)*/) /* DC-only */
	        {
	            dctMode = 1;
	            BlockDCT1x1(dataBlock, input, pred, width);

	            CBP |= (*BlockQuantDequantH263DC)(dataBlock, output, &QuantParam,
	                                              bitmaprow + k, bitmapzz, dc_scaler, shortHeader);
	            ncoefblck[k] = 1;
	        }
	        else
	        {
	            /* The column threshold is stored in element 64 of dataBlock,
	               presumably for the DCT routine called next -- TODO confirm
	               against dct.h. */
	            dataBlock[64] = ColTh;

	            if (sad < 22*QP/*(QP<<4)+(QP<<1)*/) /* 2x2 DCT */
	            {
	                dctMode = 2;
	                BlockDCT2x2(dataBlock, input, pred, width);
	                ncoefblck[k] = 6;
	            }
	            else if (sad < (QP << 5)) /* 4x4 DCT */
	            {
	                dctMode = 4;
	                BlockDCT4x4(dataBlock, input, pred, width);
	                ncoefblck[k] = 26;
	            }
	            else /* Full-DCT */
	            {
	                dctMode = 8;
	                BlockDCT8x8(dataBlock, input, pred, width);
	                ncoefblck[k] = 64;
	            }

	            CBP |= (*BlockQuantDequantH263)(dataBlock, output, &QuantParam,
	                                            bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler, shortHeader);
	        }
	        /* Reconstruct into rec; the last argument packs the reconstruction
	           pitch (shifted left by one) with the intra flag in bit 0. */
	        BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | intra);
	        output += 64;
	        /* Advance the prediction pointer: +8 after an even block and +120
	           after an odd one, i.e. 128 bytes per pair of blocks.
	           NOTE(review): this looks like a 16-byte row stride in
	           predictedMB -- confirm against its allocation. */
	        if (!(k&1))
	        {
	            pred += 8;
	        }
	        else
	        {
	            pred += 120;
	        }
	    }

	    video->headerInfo.CBP[mbnum] = CBP; /* 5/18/2001 */
	    return PV_SUCCESS;
	}

	#ifndef NO_MPEG_QUANT
	/* ======================================================================== */
	/*  Function : CodeMB_MPEG( )                                               */
	/*  Date     : 8/15/2001                                                    */
	/*  Purpose  : Code the six 8x8 blocks of one macroblock with MPEG-style    */
	/*             quantization. For every block: pick an approximate DCT size */
	/*             from the block's SAD (or pixel sum for intra chroma),        */
	/*             transform, quantize/dequantize, then reconstruct with        */
	/*             BlockIDCTMotionComp(). Modified from FastCodeMB().           */
	/*  Input    :                                                              */
	/*      video       Video encoder data structure                            */
	/*      function    Unused (kept for interface compatibility)               */
	/*      QP          Packed value: low 5 bits are the quantizer for this MB, */
	/*                  the remaining upper bits (QP >> 5) are the offset of    */
	/*                  the macroblock in the luminance plane                   */
	/*      ncoefblck   Out: per-block coefficient count hint (0, 1, 6, 26 or   */
	/*                  64) used to speed up VlcEncode                          */
	/*  Output   :                                                              */
	/*      video->outputMB             Quantized DCT coefficients              */
	/*      video->headerInfo.CBP[]     Coded block pattern of this MB          */
	/*      currVop->yChan,uChan,vChan  Reconstructed pixels                    */
	/*  Return   : PV_STATUS (always PV_SUCCESS)                                */
	/*  Modified :                                                              */
	/*      2/26/01 - modified threshold based on correlation coeff 0.75 only   */
	/*                for mode H.263                                            */
	/*              - ncoefblck[] as input, keep position of last non-zero coeff*/
	/*      8/10/01 - modified threshold based on correlation coeff 0.5         */
	/*              - used column threshold to speedup column DCT               */
	/*              - used bitmap zigzag to speedup RunLevel()                  */
	/* ======================================================================== */

	PV_STATUS CodeMB_MPEG(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
	{
	    Int sad, k, CBP, mbnum = video->mbnum;
	    Short *output, *dataBlock;
	    UChar Mode = video->headerInfo.Mode[mbnum];
	    UChar *bitmapcol, *bitmaprow = video->bitmaprow;
	    UInt *bitmapzz;
	    Int dc_scaler = 8;
	    Vol *currVol = video->vol[video->currLayer];
	    Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
	    Int *qmat;
	    Int dctMode, DctTh1, DctTh2, DctTh3, DctTh4;
	    Int ColTh;

	    /* quantizer and DCT kernels are selected once per MB (intra vs. inter) */
	    Int(*BlockQuantDequantMPEG)(Short *, Short *, Int, Int *,
	                                UChar [], UChar *, UInt *, Int, Int, Int);
	    Int(*BlockQuantDequantMPEGDC)(Short *, Short *, Int, Int *,
	                                  UChar [], UChar *, UInt *, Int);

	    void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
	    void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
	    void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
	    void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);

	    /* motion comp. related var. */
	    Vop *currVop = video->currVop;
	    VideoEncFrameIO *inputFrame = video->input;
	    Int ind_x = video->outputMB->mb_x;
	    Int ind_y = video->outputMB->mb_y;
	    Int lx = currVop->pitch;
	    Int width = currVop->width;
	    UChar *rec, *input, *pred;
	    Int offset = QP >> 5;                       /* luminance offset packed above the quantizer */
	    Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */

	    OSCL_UNUSED_ARG(function);

	    output = video->outputMB->block[0];
	    CBP = 0;
	    QP = QP & 0x1F;                             /* keep only the 5-bit quantizer */

	    if (intra)
	    {
	        BlockDCT1x1 = &Block1x1DCTIntra;
	        BlockDCT2x2 = &Block2x2DCT_AANIntra;
	        BlockDCT4x4 = &Block4x4DCT_AANIntra;
	        BlockDCT8x8 = &BlockDCT_AANIntra;

	        BlockQuantDequantMPEG = &BlockQuantDequantMPEGIntra;
	        BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCIntra;
	        dc_scaler = cal_dc_scalerENC(QP, 1);    /* luminance blocks */
	        qmat = currVol->iqmat;
	        DctTh1 = (Int)(3 * dc_scaler);
	        DctTh2 = (Int)((1.25 * QP - 1) * qmat[1] * 0.45);
	        DctTh3 = (Int)((1.25 * QP - 1) * qmat[2] * 0.55);
	        DctTh4 = (Int)((1.25 * QP - 1) * qmat[32] * 0.8);
	        ColTh = ColThIntra[QP];
	    }
	    else
	    {
	        BlockDCT1x1 = &Block1x1DCTwSub;
	        BlockDCT2x2 = &Block2x2DCT_AANwSub;
	        BlockDCT4x4 = &Block4x4DCT_AANwSub;
	        BlockDCT8x8 = &BlockDCT_AANwSub;

	        BlockQuantDequantMPEG = &BlockQuantDequantMPEGInter;
	        BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCInter;
	        qmat = currVol->niqmat;
	        DctTh1 = (Int)(((QP << 1) - 0.5) * qmat[0] * 0.4);
	        DctTh2 = (Int)(((QP << 1) - 0.5) * qmat[1] * 0.45);
	        DctTh3 = (Int)(((QP << 1) - 0.5) * qmat[2] * 0.55);
	        DctTh4 = (Int)(((QP << 1) - 0.5) * qmat[32] * 0.8);
	        ColTh = ColThInter[QP];
	    }

	    rec = currVop->yChan + offset;
	    input = inputFrame->yChan + offset;
	    if (lx != width) input -= (ind_y << 9);     /* non-padded offset */

	    dataBlock = video->dataBlock;
	    pred = video->predictedMB;

	    for (k = 0; k < 6; k++)
	    {
	        CBP <<= 1;                              /* one CBP bit per block, block 0 ends up as the MSB */
	        bitmapcol = video->bitmapcol[k];
	        bitmapzz = video->bitmapzz[k];
	        if (k < 4)
	        {
	            /* Y block: SAD is taken from the stored motion data */
	            sad = video->mot[mbnum][k+1].sad;
	            if (k & 1)
	            {
	                /* right-hand block of the current row of blocks */
	                rec += 8;
	                input += 8;
	            }
	            else if (k == 2)
	            {
	                /* down 8 rows and back 8 columns: lower-left block */
	                input += (width << 3) - 8;
	                rec += (lx << 3) - 8;
	            }
	        }
	        else
	        {
	            /* U, V block */
	            if (k == 4)
	            {
	                rec = currVop->uChan + offsetc;
	                input = inputFrame->uChan + offsetc;
	                if (lx != width) input -= (ind_y << 7);
	                /* chroma planes use half the pitch/width from here on */
	                lx >>= 1;
	                width >>= 1;
	                if (intra)
	                {
	                    dc_scaler = cal_dc_scalerENC(QP, 2);    /* chrominance blocks */
	                    DctTh1 = dc_scaler * 3;
	                    sad = getBlockSum(input, width);
	                }
	                else
	                {
	                    sad = Sad8x8(input, pred, width);
	                }
	            }
	            else
	            {
	                rec = currVop->vChan + offsetc;
	                input = inputFrame->vChan + offsetc;
	                if (lx != width) input -= (ind_y << 7);
	                if (intra)
	                {
	                    sad = getBlockSum(input, width);
	                }
	                else
	                {
	                    sad = Sad8x8(input, pred, width);
	                }
	            }
	        }

	        if (sad < DctTh1)                       /* all-zero: CBP bit stays 0 */
	        {
	            dctMode = 0;
	            ncoefblck[k] = 0;
	        }
	        else if (sad < DctTh2)                  /* DC-only */
	        {
	            dctMode = 1;
	            BlockDCT1x1(dataBlock, input, pred, width);

	            CBP |= (*BlockQuantDequantMPEGDC)(dataBlock, output, QP, qmat,
	                                              bitmapcol, bitmaprow + k, bitmapzz, dc_scaler);
	            ncoefblck[k] = 1;
	        }
	        else
	        {
	            /* column threshold is handed to the DCT through dataBlock[64] */
	            dataBlock[64] = ColTh;

	            if (sad < DctTh3)                   /* 2x2 DCT */
	            {
	                dctMode = 2;
	                BlockDCT2x2(dataBlock, input, pred, width);
	                ncoefblck[k] = 6;
	            }
	            else if (sad < DctTh4)              /* 4x4 DCT */
	            {
	                dctMode = 4;
	                BlockDCT4x4(dataBlock, input, pred, width);
	                ncoefblck[k] = 26;
	            }
	            else                                /* full DCT */
	            {
	                dctMode = 8;
	                BlockDCT8x8(dataBlock, input, pred, width);
	                ncoefblck[k] = 64;
	            }

	            CBP |= (*BlockQuantDequantMPEG)(dataBlock, output, QP, qmat,
	                                            bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler);
	        }

	        /* reconstruction always runs the full-size IDCT (mismatch handling);
	           the last argument packs the pitch and the intra flag together */
	        dctMode = 8;
	        BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | intra);

	        output += 64;
	        /* prediction pointer: +8 after an even-numbered block, +120 after an odd one */
	        if (!(k & 1))
	        {
	            pred += 8;
	        }
	        else
	        {
	            pred += 120;
	        }
	    }

	    video->headerInfo.CBP[mbnum] = CBP;
	    return PV_SUCCESS;
	}

	#endif

	/* ======================================================================== */
	/*  Function : getBlockSAV( )                                               */
	/*  Date     : 8/10/2000                                                    */
	/*  Purpose  : Sum of absolute values (SAV) of one 8x8 block                */
	/*  In/out   : block[64] holds the 64 values of the block (read only)       */
	/*  Return   : The accumulated sum of |block[i]| for i = 0..63              */
	/*  Modified :                                                              */
	/* ======================================================================== */
	/* can be written in MMX or SSE, 2/22/2001 */
	Int getBlockSAV(Short block[])
	{
	    const Short *coef = block;
	    const Short *stop = block + 64;     /* 8 rows of 8 values */
	    Int total = 0;

	    for (; coef != stop; coef++)
	    {
	        Int magnitude = *coef;

	        /* negate in Int so the most negative Short is handled too */
	        if (magnitude <= 0)
	        {
	            magnitude = -magnitude;
	        }
	        total += magnitude;
	    }

	    return total;
	}

	/* ======================================================================== */
	/*  Function : Sad8x8( )                                                    */
	/*  Date     : 8/10/2000                                                    */
	/*  Purpose  : Sum of absolute differences between an 8x8 block of the      */
	/*             current frame and an 8x8 predicted block, computed on four   */
	/*             packed bytes at a time.                                      */
	/*  In/out   :                                                              */
	/*      cur     Top-left pixel of the current block; rows are `width` bytes */
	/*              apart                                                       */
	/*      prev    Top-left pixel of the predicted block; the pointer advances */
	/*              4 words per row (16 bytes when Int is 4 bytes)              */
	/*      width   Row stride of `cur` in bytes; the word stride is width >> 2 */
	/*              so it has to be a multiple of 4                             */
	/*  Return   : The SAD of the 64 pixel pairs                                */
	/*  Notes    : Both blocks are read through Int pointers, so `cur` and      */
	/*             `prev` must be suitably aligned for Int access. The masks    */
	/*             assume Int/UInt are 32 bits wide. The packed arithmetic      */
	/*             relies on wrap-around, hence the sanitizer opt-out below.    */
	/*  Modified :                                                              */
	/*      8/15/01 - do 4 pixel at a time assuming 32 bit register             */
	/* ======================================================================== */
	#ifdef __clang__
	__attribute((no_sanitize("integer")))
	#endif
	Int Sad8x8(UChar *cur, UChar *prev, Int width)
	{
	    UChar *end = cur + (width << 3);    /* one past the 8th row of cur */
	    Int sad = 0;
	    Int *curInt = (Int*) cur;
	    Int *prevInt = (Int*) prev;
	    Int cur1, cur2, prev1, prev2;
	    UInt mask = ~(0xFF00);              /* loop invariant */
	    UInt sgn_msk = 0x80808080;
	    Int sum2 = 0, sum4 = 0;
	    Int tmp;

	    do
	    {
	        cur1 = curInt[1];               /* load cur[4..7] */
	        cur2 = curInt[0];               /* load cur[0..3] */
	        curInt += (width >> 2);         /* next row of cur */
	        prev1 = prevInt[1];
	        prev2 = prevInt[0];
	        prevInt += 4;                   /* next row of prev */

	        /* NOTE(review): the top byte's sign is taken from the signed result
	           (cur2 < 0 / cur1 < 0) rather than from the borrow out of the
	           subtraction; this looks inexact when that byte's difference
	           exceeds 127 - confirm before relying on exact SAD values. */
	        tmp = prev2 ^ cur2;
	        cur2 = prev2 - cur2;
	        tmp = tmp ^ cur2;                           /* (^)^(-) last bit is one if carry */
	        tmp = sgn_msk & ((UInt)tmp >> 1);           /* check the sign of each byte */
	        if (cur2 < 0) tmp = tmp | 0x80000000;       /* correct sign of first byte */
	        tmp = (tmp << 8) - tmp;                     /* carry borrowed bytes are marked with 0x1FE */
	        cur2 = cur2 + (tmp >> 7);                   /* negative bytes is added with 0xFF, -1 */
	        cur2 = cur2 ^ (tmp >> 7);                   /* take absolute by inverting bits (EOR) */

	        tmp = prev1 ^ cur1;
	        cur1 = prev1 - cur1;
	        tmp = tmp ^ cur1;                           /* (^)^(-) last bit is one if carry */
	        tmp = sgn_msk & ((UInt)tmp >> 1);           /* check the sign of each byte */
	        if (cur1 < 0) tmp = tmp | 0x80000000;       /* correct sign of first byte */
	        tmp = (tmp << 8) - tmp;                     /* carry borrowed bytes are marked with 0x1FE */
	        cur1 = cur1 + (tmp >> 7);                   /* negative bytes is added with 0xFF, -1 */
	        cur1 = cur1 ^ (tmp >> 7);                   /* take absolute by inverting bits (EOR) */

	        /* sum4 accumulates whole words; sum2 accumulates bytes 1 and 3 of
	           each word in two 16-bit lanes so they can be separated later */
	        sum4 = sum4 + cur1;
	        cur1 = cur1 & (mask << 8);
	        sum2 = sum2 + ((UInt)cur1 >> 8);
	        sum4 = sum4 + cur2;
	        cur2 = cur2 & (mask << 8);
	        sum2 = sum2 + ((UInt)cur2 >> 8);
	    }
	    while ((uintptr_t)curInt < (uintptr_t)end);

	    cur1 = sum4 - (sum2 << 8);      /* remove bytes 1 and 3: leaves the byte 0 / byte 2 lanes */
	    cur1 = cur1 + sum2;             /* add the two pairs of 16-bit lane sums */
	    cur1 = cur1 + (cur1 << 16);     /* add upper and lower 16 bit sum */
	    sad = ((UInt)cur1 >> 16);       /* take upper 16 bit */
	    return sad;
	}

	/* ======================================================================== */
	/*  Function : getBlockSum( )                                               */
	/*  Date     : 8/10/2000                                                    */
	/*  Purpose  : Sum of the 64 pixel values of an 8x8 block, accumulated four */
	/*             packed bytes at a time.                                      */
	/*  In/out   :                                                              */
	/*      cur     Top-left pixel of the block; rows are `width` bytes apart   */
	/*      width   Row stride in bytes; the word stride is width >> 2, so it   */
	/*              has to be a multiple of 4                                   */
	/*  Return   : The sum of the 64 pixels                                     */
	/*  Notes    : The block is read through an Int pointer, so `cur` must be   */
	/*             suitably aligned for Int access. The masks assume Int/UInt   */
	/*             are 32 bits wide. The packed sums rely on wrap-around, hence */
	/*             the sanitizer opt-out below.                                 */
	/*  Modified :                                                              */
	/*      8/15/01 - SIMD 4 pixels at a time                                   */
	/* ======================================================================== */
	#ifdef __clang__
	__attribute((no_sanitize("integer")))
	#endif
	Int getBlockSum(UChar *cur, Int width)
	{
	    Int sum = 0, sum4 = 0, sum2 = 0;
	    UChar *end = cur + (width << 3);    /* one past the 8th row */
	    Int *curInt = (Int*)cur;
	    UInt mask = ~(0xFF00);
	    Int load1, load2;

	    do
	    {
	        load1 = curInt[1];              /* pixels 4..7 of this row */
	        load2 = curInt[0];              /* pixels 0..3 of this row */
	        curInt += (width >> 2);         /* next row */
	        sum4 += load1;                  /* whole-word sum, all four bytes */
	        load1 = load1 & (mask << 8);    /* keep bytes 1 and 3 */
	        sum2 += ((UInt)load1 >> 8);     /* sum them in two 16-bit lanes */
	        sum4 += load2;
	        load2 = load2 & (mask << 8);    /* keep bytes 1 and 3 */
	        sum2 += ((UInt)load2 >> 8);     /* sum them in two 16-bit lanes */
	    }
	    while ((uintptr_t)curInt < (uintptr_t)end);

	    load1 = sum4 - (sum2 << 8);     /* remove bytes 1 and 3: leaves the byte 0 / byte 2 lanes */
	    load1 = load1 + sum2;           /* add the two pairs of 16-bit lane sums */
	    load1 = load1 + (load1 << 16);  /* add upper and lower 16 bit sum */
	    sum = ((UInt)load1 >> 16);      /* take upper 16 bit */

	    return sum;
	}