| /* ------------------------------------------------------------------ |
| * Copyright (C) 1998-2009 PacketVideo |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either |
| * express or implied. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * ------------------------------------------------------------------- |
| */ |
| #include "avcenc_lib.h" |
| /* 3/29/01 fast half-pel search based on neighboring guess */ |
| /* value ranging from 0 to 4, high complexity (more accurate) to |
| low complexity (less accurate) */ |
| #define HP_DISTANCE_TH 5 // 2 /* half-pel distance threshold */ |
| |
| #define PREF_16_VEC 129 /* 1MV bias versus 4MVs*/ |
| |
| #define CLIP_RESULT(x) if((uint)x > 0xFF){ \ |
| x = 0xFF & (~(x>>31));} |
| |
| #define CLIP_UPPER16(x) if((uint)x >= 0x20000000){ \ |
| x = 0xFF0000 & (~(x>>31));} \ |
| else { \ |
| x = (x>>5)&0xFF0000; \ |
| } |
| |
| /*===================================================================== |
| Function: AVCFindHalfPelMB |
| Date: 10/31/2007 |
| Purpose: Find half pel resolution MV surrounding the full-pel MV |
| =====================================================================*/ |
| |
| int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand, |
| int xpos, int ypos, int hp_guess, int cmvx, int cmvy) |
| { |
| AVCPictureData *currPic = encvid->common->currPic; |
| int lx = currPic->pitch; |
| int d, dmin, satd_min; |
| uint8* cand; |
| int lambda_motion = encvid->lambda_motion; |
| uint8 *mvbits = encvid->mvbits; |
| int mvcost; |
| /* list of candidate to go through for half-pel search*/ |
| uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions |
| uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */ |
| |
| int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2}; |
| int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2}; |
| int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1}; |
| int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1}; |
| int h, hmin, q, qmin; |
| |
| OSCL_UNUSED_ARG(xpos); |
| OSCL_UNUSED_ARG(ypos); |
| OSCL_UNUSED_ARG(hp_guess); |
| |
| GenerateHalfPelPred(subpel_pred, ncand, lx); |
| |
| cur = encvid->currYMB; // pre-load current original MB |
| |
| cand = hpel_cand[0]; |
| |
| // find cost for the current full-pel position |
| dmin = SATD_MB(cand, cur, 65535); // get Hadamaard transform SAD |
| mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy); |
| satd_min = dmin; |
| dmin += mvcost; |
| hmin = 0; |
| |
| /* find half-pel */ |
| for (h = 1; h < 9; h++) |
| { |
| d = SATD_MB(hpel_cand[h], cur, dmin); |
| mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy); |
| d += mvcost; |
| |
| if (d < dmin) |
| { |
| dmin = d; |
| hmin = h; |
| satd_min = d - mvcost; |
| } |
| } |
| |
| mot->sad = dmin; |
| mot->x += xh[hmin]; |
| mot->y += yh[hmin]; |
| encvid->best_hpel_pos = hmin; |
| |
| /*** search for quarter-pel ****/ |
| GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin); |
| |
| encvid->best_qpel_pos = qmin = -1; |
| |
| for (q = 0; q < 8; q++) |
| { |
| d = SATD_MB(encvid->qpel_cand[q], cur, dmin); |
| mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy); |
| d += mvcost; |
| if (d < dmin) |
| { |
| dmin = d; |
| qmin = q; |
| satd_min = d - mvcost; |
| } |
| } |
| |
| if (qmin != -1) |
| { |
| mot->sad = dmin; |
| mot->x += xq[qmin]; |
| mot->y += yq[qmin]; |
| encvid->best_qpel_pos = qmin; |
| } |
| |
| return satd_min; |
| } |
| |
| |
| |
| /** This function generates sub-pel prediction around the full-pel candidate. |
| Each sub-pel position array is 20 pixel wide (for word-alignment) and 17 pixel tall. */ |
| /** The sub-pel position is labeled in spiral manner from the center. */ |
| |
void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
{
    /* Build the half-pel interpolated planes around the full-pel candidate
       ncand using the 6-tap filter [1,-5,20,20,-5,1]. subpel_pred holds the
       sub-pel planes (SUBPEL_PRED_BLK_SIZE bytes apart, internal pitch 24);
       lx is the pitch of the reference frame. */
    /* let's do straightforward way first */
    uint8 *ref;
    uint8 *dst;
    uint8 tmp8;
    int32 tmp32;
    int16 tmp_horz[18*22], *dst_16, *src_16; /* unclipped 16-bit horizontal results, stride 18 */
    int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp
    int i, j;

    /* first copy full-pel to the first array */
    /* to be optimized later based on byte-offset load */
    ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
    dst = subpel_pred;

    dst -= 4; /* offset */
    for (j = 0; j < 22; j++) /* 24x22 */
    {
        i = 6;
        while (i > 0)
        {
            /* assemble 4 bytes (little-endian) into one word; ref may be
               unaligned, the store assumes dst is word-aligned --
               NOTE(review): confirm subpel_pred alignment at allocation */
            tmp32 = *ref++;
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 8);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 16);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 24);
            *((uint32*)(dst += 4)) = tmp32;
            i--;
        }
        ref += (lx - 24);
    }

    /* from the first array, we do horizontal interp */
    ref = subpel_pred + 2;
    dst_16 = tmp_horz; /* 17 x 22 */

    /* top 4 rows: keep only the unclipped 16-bit horizontal results; they
       feed the mid-point (diagonal) vertical filter later */
    for (j = 4; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            /* 6-tap horizontal filter, unrolled by 4; a..f slide along the row */
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
            a = ref[4];
            *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
            b = ref[5];
            *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
            c = ref[6];
            *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);

            ref += 4;
        }
        /* do the 17th column here, using the taps left over in a..f */
        d = ref[3];
        *dst_16 = e + d - 5 * (f + c) + 20 * (a + b);
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8; /* stride for ref is 24 */
        if (j == 3) // move 18 lines down
        {
            dst_16 += 324;//18*18;
            ref += 432;//18*24;
        }
    }

    ref -= 480;//20*24;
    dst_16 -= 360;//20*18;
    dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18*/

    /* middle 18 rows: emit both the 16-bit intermediate (for the mid-point
       filter) and the rounded, clipped 8-bit horizontal half-pel plane */
    for (j = 18; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5; /* round and normalize (filter gain 32) */
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            a = ref[4];
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            b = ref[5];
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            c = ref[6];
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        *dst_16 = tmp32;
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        *dst = tmp32;

        dst += 8; /* stride for dst is 24 */
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8; /* stride for ref is 24 */
    }


    /* Do middle point filtering*/
    /* vertical 6-tap over the 16-bit horizontal results; combined gain is
       32*32 = 1024, hence the +512 rounding and >>10 */
    src_16 = tmp_horz; /* 17 x 22 */
    dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17*/
    dst -= 24; // offset
    for (i = 0; i < 17; i++) /* one column per iteration */
    {
        for (j = 16; j > 0; j -= 4)
        {
            /* a..f slide down the column (stride 18), unrolled by 4 */
            a = *src_16;
            b = *(src_16 += 18);
            c = *(src_16 += 18);
            d = *(src_16 += 18);
            e = *(src_16 += 18);
            f = *(src_16 += 18);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(src_16 += 18);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(src_16 += 18);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(src_16 += 18);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            src_16 -= (18 << 2); /* net advance: 4 rows per unroll */
        }

        /* 17th output of the column, from the residual taps in a..f */
        d = src_16[90]; // 18*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 512) >> 10;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        src_16 -= ((18 << 4) - 1); /* back to the top, over one column */
        dst -= ((24 << 4) - 1);
    }

    /* do vertical interpolation */
    /* 6-tap vertical filter on the full-pel plane; the first two columns are
       done scalar here, the remaining 16 by the packed loop below */
    ref = subpel_pred + 2;
    dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */
    dst -= 24; // offset

    for (i = 2; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            /* a..f slide down the column (stride 24), unrolled by 4 */
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32; // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    // note that using SIMD here doesn't help much, the cycle almost stays the same
    // one can just use the above code and change the for(i=2 to for(i=18
    /* packed variant: process 4 columns at a time, two pixels per 16-bit lane
       inside each 32-bit word; CLIP_UPPER16 clips/normalizes each lane */
    for (i = 16; i > 0; i -= 4)
    {
        for (j = 17; j > 0; j--)
        {
            /* NOTE(review): these uint32 loads assume ref is word-aligned
               and little-endian here -- verify on target platforms */
            a = *((uint32*)ref); /* load 4 bytes */
            b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
            a &= 0xFF00FF;

            c = *((uint32*)(ref + 120)); /* row +5 (taps with weight +1) */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            a += c;
            b += d;

            e = *((uint32*)(ref + 72)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 48)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a += 20 * c; /* center taps, weight +20 */
            b += 20 * d;
            a += 0x100010; /* rounding constant 16 in each lane */
            b += 0x100010;

            e = *((uint32*)(ref += 24)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 72)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a -= 5 * c; /* outer-neighbor taps, weight -5 */
            b -= 5 * d;

            c = a << 16; /* isolate low lanes for independent clipping */
            d = b << 16;
            CLIP_UPPER16(a)
            CLIP_UPPER16(c)
            CLIP_UPPER16(b)
            CLIP_UPPER16(d)

            a |= (c >> 16); /* recombine clipped high/low lanes */
            b |= (d >> 16);
            // a>>=5;
            // b>>=5;
            /* clip */
            // msk |= b; msk|=a;
            // a &= 0xFF00FF;
            // b &= 0xFF00FF;
            a |= (b << 8); /* pack it back */

            *((uint16*)(dst += 24)) = a & 0xFFFF; //dst is not word-aligned.
            *((uint16*)(dst + 2)) = a >> 16;

        }
        dst -= 404; // 24*17-4 /* move back up and over to the next 4 columns */
        ref -= 404;
        /* if(msk & 0xFF00FF00) // need clipping
        {
            VertInterpWClip(dst,ref); // re-do 4 column with clip
        }*/
    }

    return ;
}
| |
void VertInterpWClip(uint8 *dst, uint8 *ref)
{
    /* Re-run the vertical 6-tap interpolation ([1,-5,20,20,-5,1]) for a
       4-pixel-wide strip with full per-pixel clipping to [0,255]. Intended
       as the slow path when the packed vertical loop in GenerateHalfPelPred
       may have overflowed its 16-bit lanes (see the commented-out caller
       there). Both dst and ref have pitch 24 and are expected to point
       4 bytes past the strip start (pre-decremented below). */
    int i, j;
    int a, b, c, d, e, f;
    int32 tmp32;

    dst -= 4;
    ref -= 4;

    for (i = 4; i > 0; i--) /* 4 columns */
    {
        for (j = 16; j > 0; j -= 4) /* 16 rows, unrolled by 4 */
        {
            /* a..f slide down the column (stride 24) */
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5; /* round, normalize (gain 32) */
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32; // 10th

            ref -= (24 << 2); /* net advance: 4 rows per unroll */
        }

        /* 17th output of the column, from the residual taps left in a..f */
        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32; // 10th

        dst -= ((24 << 4) - 1); /* back to the top, over one column */
        ref -= ((24 << 4) - 1);
    }

    return ;
}
| |
| |
| void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos) |
| { |
| // for even value of hpel_pos, start with pattern 1, otherwise, start with pattern 2 |
| int i, j; |
| |
| uint8 *c1 = qpel_cand; |
| uint8 *tl = bilin_base[0]; |
| uint8 *tr = bilin_base[1]; |
| uint8 *bl = bilin_base[2]; |
| uint8 *br = bilin_base[3]; |
| int a, b, c, d; |
| int offset = 1 - (384 * 7); |
| |
| if (!(hpel_pos&1)) // diamond pattern |
| { |
| j = 16; |
| while (j--) |
| { |
| i = 16; |
| while (i--) |
| { |
| d = tr[24]; |
| a = *tr++; |
| b = bl[1]; |
| c = *br++; |
| |
| *c1 = (c + a + 1) >> 1; |
| *(c1 += 384) = (b + a + 1) >> 1; /* c2 */ |
| *(c1 += 384) = (b + c + 1) >> 1; /* c3 */ |
| *(c1 += 384) = (b + d + 1) >> 1; /* c4 */ |
| |
| b = *bl++; |
| |
| *(c1 += 384) = (c + d + 1) >> 1; /* c5 */ |
| *(c1 += 384) = (b + d + 1) >> 1; /* c6 */ |
| *(c1 += 384) = (b + c + 1) >> 1; /* c7 */ |
| *(c1 += 384) = (b + a + 1) >> 1; /* c8 */ |
| |
| c1 += offset; |
| } |
| // advance to the next line, pitch is 24 |
| tl += 8; |
| tr += 8; |
| bl += 8; |
| br += 8; |
| c1 += 8; |
| } |
| } |
| else // star pattern |
| { |
| j = 16; |
| while (j--) |
| { |
| i = 16; |
| while (i--) |
| { |
| a = *br++; |
| b = *tr++; |
| c = tl[1]; |
| *c1 = (a + b + 1) >> 1; |
| b = bl[1]; |
| *(c1 += 384) = (a + c + 1) >> 1; /* c2 */ |
| c = tl[25]; |
| *(c1 += 384) = (a + b + 1) >> 1; /* c3 */ |
| b = tr[23]; |
| *(c1 += 384) = (a + c + 1) >> 1; /* c4 */ |
| c = tl[24]; |
| *(c1 += 384) = (a + b + 1) >> 1; /* c5 */ |
| b = *bl++; |
| *(c1 += 384) = (a + c + 1) >> 1; /* c6 */ |
| c = *tl++; |
| *(c1 += 384) = (a + b + 1) >> 1; /* c7 */ |
| *(c1 += 384) = (a + c + 1) >> 1; /* c8 */ |
| |
| c1 += offset; |
| } |
| // advance to the next line, pitch is 24 |
| tl += 8; |
| tr += 8; |
| bl += 8; |
| br += 8; |
| c1 += 8; |
| } |
| } |
| |
| return ; |
| } |
| |
| |
| /* assuming cand always has a pitch of 24 */ |
| int SATD_MB(uint8 *cand, uint8 *cur, int dmin) |
| { |
| int cost; |
| |
| |
| dmin = (dmin << 16) | 24; |
| cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL); |
| |
| return cost; |
| } |
| |
| |
| |
| |
| |