| ;// |
| ;// Copyright (C) 2007-2008 ARM Limited |
| ;// |
| ;// Licensed under the Apache License, Version 2.0 (the "License"); |
| ;// you may not use this file except in compliance with the License. |
| ;// You may obtain a copy of the License at |
| ;// |
| ;// http://www.apache.org/licenses/LICENSE-2.0 |
| ;// |
| ;// Unless required by applicable law or agreed to in writing, software |
| ;// distributed under the License is distributed on an "AS IS" BASIS, |
| ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ;// See the License for the specific language governing permissions and |
| ;// limitations under the License. |
| ;// |
| ;// |
| ;// |
| ;// File Name: omxVCM4P2_MCReconBlock_s.s |
| ;// OpenMAX DL: v1.0.2 |
| ;// Revision: 9641 |
| ;// Date: Thursday, February 7, 2008 |
| ;// |
| ;// |
| ;// |
| ;// |
| ;// Description: |
| ;// |
| ;// |
| |
| ;// Include standard headers |
| INCLUDE omxtypes_s.h |
| INCLUDE armCOMM_s.h |
| |
| ;// Import symbols required from other files |
| |
| M_VARIANTS ARM1136JS |
| |
| ;// *************************************************************************** |
| ;// ARM1136JS implementation |
| ;// *************************************************************************** |
| IF ARM1136JS |
| |
| ;// *************************************************************************** |
| ;// MACRO DEFINITIONS |
| ;// *************************************************************************** |
| ;// Description: |
| ;// |
| ;// dest[j] = (x[j] + y[j] + round) >> 1, j=0..3 |
| ;// |
| ;// Similar to UHADD8 instruction, but with a rounding value of 1 added to |
| ;// each sum before dividing by two, if round is 1 |
| ;// |
| ;// Syntax: |
| ;// M_UHADD8R $dest, $x, $y, $round, $mask |
| ;// |
| ;// Inputs: |
| ;// $x four packed bytes, x[3] : x[2] : x[1] : x[0] |
| ;// $y four packed bytes, y[3] : y[2] : y[1] : y[0] |
| ;// $round 0 if no rounding to be added, 1 if rounding to be done |
| ;// $mask some register set to 0x80808080 |
| ;// |
| ;// Outputs: |
| ;// $dest four packed bytes, z[3] : z[2] : z[1] : z[0] |
| |
| MACRO |
| M_UHADD8R $dest, $x, $y, $round, $mask |
| ;// Any $round other than 1 selects the plain truncating byte average |
| IF $round /= 1 |
| UHADD8 $dest, $x, $y |
| ELSE |
| ;// Rounded average: per byte, ~q = 255 - q, so the halving subtract |
| ;// p - ~q yields ((p + q + 1) >> 1) - 128 (mod 256); flipping bit 7 with |
| ;// $mask adds the 128 back. The operand that is complemented is chosen |
| ;// so that $dest may alias $y without losing an input. |
| IF $dest = $y |
| MVN $dest, $y |
| UHSUB8 $dest, $x, $dest |
| ELSE |
| MVN $dest, $x |
| UHSUB8 $dest, $y, $dest |
| ENDIF |
| EOR $dest, $dest, $mask |
| ENDIF |
| MEND |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Load 8 bytes from $pSrc (aligned or unaligned locations) |
| ;// |
| ;// Syntax: |
| ;// M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset |
| ;// |
| ;// Inputs: |
| ;// $pSrc 4 byte aligned source pointer to an address just less than |
| ;// or equal to the data location |
| ;// $srcStep The stride on source |
| ;// $scratch A scratch register, used internally for temp calculations |
| ;// $offset Difference of source data location to the source pointer |
| ;// Use when $offset != 0 (unaligned load) |
| ;// |
| ;// Outputs: |
| ;// $pSrc In case the macro accepts stride, it increments the pSrc by |
| ;// that value, else unchanged |
| ;// $out0 four packed bytes, z[3] : z[2] : z[1] : z[0] |
| ;// $out1 four packed bytes, z[7] : z[6] : z[5] : z[4] |
| ;// |
| ;// Note: {$out0, $out1, $scratch} should be registers with ascending |
| ;// register numbering. In case offset is 0, $scratch is not modified. |
| |
| MACRO |
| M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset |
| ;// Fetch the row: two words when it is word aligned, otherwise three |
| ;// because the eight bytes straddle a third word |
| IF $offset /= 0 |
| LDM $pSrc, {$out0, $out1, $scratch} |
| ELSE |
| LDM $pSrc, {$out0, $out1} |
| ENDIF |
| ;// Step the source pointer on to the next row |
| ADD $pSrc, $pSrc, $srcStep |
| IF $offset /= 0 |
| ;// Drop the $offset leading bytes and pull the missing bytes in from |
| ;// the following word, so $out0:$out1 hold the eight wanted bytes |
| MOV $out0, $out0, LSR #8 * $offset |
| ORR $out0, $out0, $out1, LSL #(32 - 8 * ($offset)) |
| MOV $out1, $out1, LSR #8 * $offset |
| ORR $out1, $out1, $scratch, LSL #(32 - 8 * ($offset)) |
| ENDIF |
| MEND |
| |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Loads three words for X interpolation, update pointer to next row. For |
| ;// X interpolation, given a truncated-4byteAligned source pointer, |
| ;// invariably three continuous words are required from there to get the |
| ;// nine bytes from the source pointer for filtering. |
| ;// |
| ;// Syntax: |
| ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3 |
| ;// |
| ;// Inputs: |
| ;// $pSrc 4 byte aligned source pointer to an address just less than |
| ;// or equal to the data location |
| ;// |
| ;// $srcStep The stride on source |
| ;// |
| ;// $offset Difference of source data location to the source pointer |
| ;// Use when $offset != 0 (unaligned load) |
| ;// |
| ;// Outputs: |
| ;// $pSrc Incremented by $srcStep |
| ;// |
| ;// $word0, $word1, $word2, $word3 |
| ;// Three of these are outputs based on the $offset parameter. |
| ;// The outputs are specifically generated to be processed by |
| ;// the M_EXT_XINT macro. Following is the illustration to show |
| ;// how the nine bytes are spanned for different offsets from |
| ;// notTruncatedForAlignmentSourcePointer. |
| ;// |
| ;// ------------------------------------------------------ |
| ;// | Offset | Aligned Ptr | word0 | word1 | word2 | word3 | |
| ;// |------------------------------------------------------| |
| ;// | 0 | 0 | 0123 | 4567 | 8xxx | | |
| ;// | 1 | -1 | x012 | 3456 | 78xx | | |
| ;// | 2 | -2 | xx01 | 2345 | 678x | | |
| ;// | 3 | -3 | xxx0 | | 1234 | 5678 | |
| ;// ------------------------------------------------------ |
| ;// |
| ;// where the numbering (0-8) is to designate the 9 bytes from |
| ;// start of a particular row. The illustration doesn't take in |
| ;// account the positioning of bytes with in the word and the |
| ;// macro combination with M_EXT_XINT will work only in little |
| ;// endian environs |
| ;// |
| ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending |
| ;// register numbering |
| |
| MACRO |
| M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3 |
| ;// Always three consecutive words; only the destination registers differ. |
| ;// For offset 3 the second and third words go straight to $word2/$word3, |
| ;// where M_EXT_XINT expects them. |
| IF $offset = 3 |
| LDM $pSrc, {$word0, $word2, $word3} |
| ELSE |
| LDM $pSrc, {$word0, $word1, $word2} |
| ENDIF |
| ;// Advance to the next source row |
| ADD $pSrc, $pSrc, $srcStep |
| MEND |
| |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Extract four registers of four pixels for X interpolation |
| ;// |
| ;// Syntax: |
| ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3 |
| ;// |
| ;// Inputs: |
| ;// $offset Difference of source data location to the source pointer |
| ;// Use when $offset != 0 (unaligned load) |
| ;// |
| ;// $word0, $word1, $word2, $word3 |
| ;// Three of these are inputs based on the $offset parameter. |
| ;// The inputs are specifically selected to be processed by |
| ;// the M_EXT_XINT macro. |
| ;// |
| ;// ------------------------------------------------------ |
| ;// | Offset | Aligned Ptr | word0 | word1 | word2 | word3 | |
| ;// |------------------------------------------------------| |
| ;// | 0 | 0 | 0123 | 4567 | 8xxx | yyyy | |
| ;// | 1 | -1 | x012 | 3456 | 78xx | yyyy | |
| ;// | 2 | -2 | xx01 | 2345 | 678x | yyyy | |
| ;// | 3 | -3 | xxx0 | yyyy | 1234 | 5678 | |
| ;// ------------------------------------------------------ |
| ;// |
| ;// Outputs: |
| ;// $word0, $word1, $word2, $word3 |
| ;// Bytes from the original source pointer (not truncated for |
| ;// 4 byte alignment) as shown in the table. |
| ;// ------------------------------- |
| ;// | word0 | word1 | word2 | word3 | |
| ;// |-------------------------------| |
| ;// | 0123 | 4567 | 1234 | 5678 | |
| ;// ------------------------------- |
| ;// |
| ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending |
| ;// register numbering |
| |
| MACRO |
| M_EXT_XINT $offset, $word0, $word1, $word2, $word3 |
| ;// Turns the words fetched by M_LOAD_XINT into $word0:$word1 = pixels 0-7 |
| ;// and $word2:$word3 = pixels 1-8. Statement order matters in every branch: |
| ;// each input word is read before it is overwritten. |
| IF $offset = 0 |
| ;// $word0 and $word1 already hold pixels 0-7 |
| ;// $word3 then $word2 become the same row shifted along by one byte |
| MOV $word3, $word1, LSR #8 |
| ORR $word3, $word3, $word2, LSL #24 |
| MOV $word2, $word0, LSR #8 |
| ORR $word2, $word2, $word1, LSL #24 |
| ELIF $offset = 3 |
| ;// $word2 and $word3 already hold pixels 1-8 (placed there by the load) |
| ;// build pixels 0-7: top byte of the preceding word plus three bytes of the next |
| MOV $word0, $word0, LSR #24 |
| ORR $word0, $word0, $word2, LSL #8 |
| MOV $word1, $word2, LSR #24 |
| ORR $word1, $word1, $word3, LSL #8 |
| ELSE |
| ;// offsets 1 and 2: first realign pixels 0-7 into $word0:$word1 |
| MOV $word0, $word0, LSR #8 * $offset |
| ORR $word0, $word0, $word1, LSL #(32 - 8 * ($offset)) |
| MOV $word1, $word1, LSR #8 * $offset |
| ORR $word1, $word1, $word2, LSL #(32 - 8 * ($offset)) |
| |
| ;// then derive pixels 1-8. $word2 still holds the raw third word; shifted |
| ;// so that pixel 8 lands in the top byte, its lower bytes overlap pixels |
| ;// already present in $word3 with identical values, so ORR is harmless |
| MOV $word3, $word1, LSR #8 |
| ORR $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1)) |
| MOV $word2, $word0, LSR #8 |
| ORR $word2, $word2, $word1, LSL #24 |
| ENDIF |
| MEND |
| |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Computes half-sum and xor of two inputs and puts them in the input |
| ;// registers in that order |
| ;// |
| ;// Syntax: |
| ;// M_HSUM_XOR $v0, $v1, $tmp |
| ;// |
| ;// Inputs: |
| ;// $v0 a, first input |
| ;// $v1 b, second input |
| ;// $tmp scratch register |
| ;// |
| ;// Outputs: |
| ;// $v0 (a + b)/2 |
| ;// $v1 a ^ b |
| |
| MACRO |
| M_HSUM_XOR $v0, $v1, $tmp |
| ;// The half-sum goes through $tmp because both inputs are still needed |
| ;// for the XOR; $tmp is left holding a copy of the half-sum. |
| UHADD8 $tmp, $v0, $v1 ;// s0 = (a + b) >> 1, per byte |
| EOR $v1, $v0, $v1 ;// l0 = a ^ b (bit 0 of each byte is the bit the halving dropped) |
| MOV $v0, $tmp ;// s0 |
| MEND |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in |
| ;// mcReconBlock module. Very specific to the implementation of |
| ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and |
| ;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are |
| ;// not significant and are used by the callee for row counter (y) |
| ;// |
| ;// Some points to note are: |
| ;// 1. Input is pair of pair-averages and Xors |
| ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another |
| ;// running average |
| ;// 3. Output is in the first argument |
| ;// |
| ;// Syntax: |
| ;// M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal |
| ;// |
| ;// Inputs: |
| ;// $sum0 (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged |
| ;// $lsb0 (a ^ b) |
| ;// $sum1 (c + d) >> 1. Not modified |
| ;// $lsb1 (c ^ d) Not modified |
| ;// $rndVal Assembler Variable. 0 for rounding, 1 for no rounding |
| ;// |
| ;// Outputs: |
| ;// $sum0 (a + b + c + d + 1) / 4 : If no rounding |
| ;// (a + b + c + d + 2) / 4 : If rounding |
| |
| MACRO |
| M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal |
| ;// Register use: "tmp" collects the per-byte correction bit, $lsb0 is |
| ;// clobbered (it first holds the combined xor term, then the half-sum of |
| ;// the two half-sums) and the result is written to $sum0. |
| ;// $sum1 and $lsb1 are only read. |
| IF $rndVal = 0 ;// rounding case |
| ;// carry = (s0 ^ s1) | (l0 & l1) |
| AND $lsb0, $lsb0, $lsb1 ;// e0 = l0 & l1 |
| EOR tmp, $sum0, $sum1 ;// e2 = s0 ^ s1 |
| ORR tmp, tmp, $lsb0 ;// e2 = e2 | e0 |
| ELSE ;// Not rounding case |
| ;// carry = (s0 ^ s1) & (l0 | l1) |
| ORR $lsb0, $lsb0, $lsb1 ;// e0 = l0 | l1 |
| EOR tmp, $sum0, $sum1 ;// e2 = s0 ^ s1 |
| AND tmp, tmp, $lsb0 ;// e2 = e2 & e0 |
| ENDIF |
| |
| ;// Keep only bit 0 of every byte; the byte mask sits above the row-counter nibble of yMask |
| AND tmp, tmp, yMask, LSR # 4 ;// e2 = e2 & 0x01010101 |
| UHADD8 $lsb0, $sum0, $sum1 ;// s2 = (s0 + s1)/2 |
| UADD8 $sum0, $lsb0, tmp ;// result = s2 + e2 |
| MEND |
| ;// *************************************************************************** |
| ;// Motion compensation handler macros |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Implement motion compensation routines using the named registers in |
| ;// callee function. Each of the following 4 implement the 4 predict type |
| ;// Each handles 8 cases each ie all the combinations of 4 types of source |
| ;// alignment offsets and 2 types of rounding flag |
| ;// |
| ;// Syntax: |
| ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset |
| ;// M_MCRECONBLOCK_HalfPixelX $rndVal, $offset |
| ;// M_MCRECONBLOCK_HalfPixelY $rndVal, $offset |
| ;// M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset |
| ;// |
| ;// Inputs: |
| ;// $rndVal Assembler Variable. 0 for rounding, 1 for no rounding |
| ;// $offset $pSrc MOD 4 value. Offset from 4 byte aligned location. |
| ;// |
| ;// Outputs: |
| ;// Outputs come in the named registers of the callee functions |
| ;// The macro loads the data from the source pointer, processes it and |
| ;// stores in the destination pointer. Does the whole prediction cycle |
| ;// of Motion Compensation routine for a particular predictType |
| ;// After this only residue addition to the predicted values remain |
| |
| MACRO |
| M_MCRECONBLOCK_IntegerPixel $rndVal, $offset |
| ;// Algorithmic Description: |
| ;// This handles motion compensation for IntegerPixel predictType. Both |
| ;// rounding cases are handled by the same code base (hence the two entry |
| ;// labels below). It is just a copy from source to destination. Two lines |
| ;// are done per loop to reduce stalls, and the loop is software pipelined: |
| ;// the rows stored in one pass were loaded in the previous one. |
| ;// |
| ;// M_LOAD_X loads a whole row in two registers and then they are stored. |
| ;// |
| ;// The loop is left right after the last pair of stores, before the next |
| ;// pair of loads, so exactly "y" rows are read. Loading first and testing |
| ;// afterwards would fetch two rows beyond the block being copied. |
| ;// |
| ;// Uses: pSrc, srcStep, pDst, dstStep, y (row counter, decremented to 0) |
| ;// Clobbers: tmp1-tmp5, flags |
| |
| CaseIntegerPixelRnd0Offset$offset |
| CaseIntegerPixelRnd1Offset$offset |
| M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset |
| M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset |
| YloopIntegerPixelOffset$offset |
| SUBS y, y, #2 |
| STRD tmp1, tmp2, [pDst], dstStep |
| STRD tmp3, tmp4, [pDst], dstStep |
| BLE SwitchPredictTypeEnd ;// all rows stored: do not read past the block |
| M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset |
| M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset |
| B YloopIntegerPixelOffset$offset |
| MEND |
| ;// *************************************************************************** |
| MACRO |
| M_MCRECONBLOCK_HalfPixelX $rndVal, $offset |
| ;// Algorithmic Description: |
| ;// This handles motion compensation for HalfPixelX predictType. The two |
| ;// rounding cases are handled by the different code base and spanned by |
| ;// different macro calls. Loop has been software pipelined to reduce |
| ;// stalls: the row processed in one pass was loaded in the previous one. |
| ;// |
| ;// Filtering involves averaging a pixel with the next horizontal pixel. |
| ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with |
| ;// all pixels in a row with 4 pixel in each register and another 2 |
| ;// registers with pixels corresponding to one horizontally shifted pixel |
| ;// corresponding to the initial row pixels. These are set of packed |
| ;// registers appropriate to do 4 lane SIMD. |
| ;// After that M_UHADD8R macro does the averaging taking care of the |
| ;// rounding as required |
| ;// |
| ;// The loop is left right after the last store, before the next row is |
| ;// fetched, so exactly "y" source rows are read. Loading first and testing |
| ;// afterwards would fetch one row beyond the block being predicted. |
| ;// |
| ;// Uses: pSrc, srcStep, pDst, dstStep, y (row counter, decremented to 0) |
| ;// Clobbers: tmp1-tmp6, mask (only when $rndVal = 0), flags |
| |
| CaseHalfPixelXRnd$rndVal.Offset$offset |
| IF $rndVal = 0 |
| ;// Only the rounded average in M_UHADD8R needs the bit-7 mask |
| LDR mask, =0x80808080 |
| ENDIF |
| |
| M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4 |
| YloopHalfPixelXRnd$rndVal.Offset$offset |
| SUBS y, y, #1 ;// nothing below alters N/Z/V before the BLE |
| M_EXT_XINT $offset, tmp1, tmp2, tmp3, tmp4 |
| M_UHADD8R tmp5, tmp1, tmp3, (1-$rndVal), mask |
| M_UHADD8R tmp6, tmp2, tmp4, (1-$rndVal), mask |
| STRD tmp5, tmp6, [pDst], dstStep |
| BLE SwitchPredictTypeEnd ;// last row stored: do not read past the block |
| M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4 |
| B YloopHalfPixelXRnd$rndVal.Offset$offset |
| MEND |
| ;// *************************************************************************** |
| MACRO |
| M_MCRECONBLOCK_HalfPixelY $rndVal, $offset |
| ;// Algorithmic Description: |
| ;// This handles motion compensation for HalfPixelY predictType. The two |
| ;// rounding cases are handled by the different code base and spanned by |
| ;// different macro calls. PreLoading is used to avoid reload of same data. |
| ;// |
| ;// Filtering involves averaging a pixel with the next vertical pixel. |
| ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in |
| ;// each register. These are set of packed registers appropriate to do |
| ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care |
| ;// of the rounding as required |
| ;// |
| ;// Two output rows are produced per pass. The row pairs tmp1:tmp2 and |
| ;// tmp3:tmp4 alternate between "previous row" and "current row", so every |
| ;// source row is loaded once; one pre-load plus two loads per pass reads |
| ;// exactly the y + 1 rows the vertical filter needs. |
| |
| CaseHalfPixelYRnd$rndVal.Offset$offset |
| IF $rndVal = 0 |
| ;// Only the rounded average in M_UHADD8R needs the bit-7 mask |
| LDR mask, =0x80808080 |
| ENDIF |
| |
| M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load |
| YloopHalfPixelYRnd$rndVal.Offset$offset |
| SUBS y, y, #2 ;// flags stay untouched until the BGT below |
| ;// Processing one line |
| M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset |
| M_UHADD8R tmp1, tmp1, tmp3, (1-$rndVal), mask |
| M_UHADD8R tmp2, tmp2, tmp4, (1-$rndVal), mask |
| STRD tmp1, tmp2, [pDst], dstStep |
| ;// Processing another line |
| ;// tmp3:tmp4 (the row just loaded) now act as the upper row |
| M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset |
| M_UHADD8R tmp3, tmp3, tmp1, (1-$rndVal), mask |
| M_UHADD8R tmp4, tmp4, tmp2, (1-$rndVal), mask |
| STRD tmp3, tmp4, [pDst], dstStep |
| |
| BGT YloopHalfPixelYRnd$rndVal.Offset$offset |
| |
| B SwitchPredictTypeEnd |
| MEND |
| ;// *************************************************************************** |
| MACRO |
| M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset |
| ;// Algorithmic Description: |
| ;// This handles motion compensation for HalfPixelXY predictType. The two |
| ;// rounding cases are handled by the different code base and spanned by |
| ;// different macro calls. PreLoading is used to avoid reload of same data. |
| ;// |
| ;// Filtering involves averaging a pixel with the next vertical, horizontal |
| ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT |
| ;// and M_EXT_XINT combination generates 4 registers with a row and its |
| ;// 1 pixel right shifted version, with 4 pixels in one register. Another |
| ;// call of that macro-combination gets another row. Then M_HSUM_XOR is |
| ;// called to get mutual half-sum and xor combinations of a row with its |
| ;// shifted version as they are inputs to the M_AVG4 macro which computes |
| ;// the 4 element average with rounding. Note that it is the half-sum/xor |
| ;// values that are preserved for next row as they can be re-used in the |
| ;// next call to the M_AVG4 and saves recomputation. |
| ;// Due to lack of register, the row counter and a masking value required |
| ;// in M_AVG4 are packed into a single register yMask where the last nibble |
| ;// holds the row counter values and rest holds the masking variable left |
| ;// shifted by 4 |
| ;// |
| ;// Loop termination: the counter nibble starts at 8 and drops by 2 per |
| ;// pass; TST yMask, #7 sets Z once it reaches 0. TST updates N and Z but |
| ;// leaves V alone, and no instruction on this path writes V, so the loop |
| ;// branch must be NE: a signed condition such as GT (Z clear and N == V) |
| ;// would depend on whatever V happened to hold before this code ran. |
| |
| CaseHalfPixelXYRnd$rndVal.Offset$offset |
| LDR yMask, =((0x01010101 << 4) + 8) |
| |
| M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b' |
| M_EXT_XINT $offset, t00, t01, t10, t11 |
| M_HSUM_XOR t00, t10, tmp ;// s0, l0 |
| M_HSUM_XOR t01, t11, tmp ;// s0', l0' |
| |
| YloopHalfPixelXYRnd$rndVal.Offset$offset |
| ;// Processing one line |
| ;// t00, t01, t10, t11 required from previous loop |
| M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d' |
| SUB yMask, yMask, #2 ;// two rows per pass; only the counter nibble changes |
| M_EXT_XINT $offset, t20, t21, t30, t31 |
| M_HSUM_XOR t20, t30, tmp ;// s1, l1 |
| M_HSUM_XOR t21, t31, tmp ;// s1', l1' |
| M_AVG4 t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1 |
| M_AVG4 t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1' |
| STRD t00, t01, [pDst], dstStep ;// store the average |
| |
| ;// Processing another line |
| ;// t20, t21, t30, t31 required from above |
| M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b' |
| TST yMask, #7 ;// Z set once the row counter nibble reaches 0 |
| M_EXT_XINT $offset, t00, t01, t10, t11 |
| M_HSUM_XOR t00, t10, tmp |
| M_HSUM_XOR t01, t11, tmp |
| M_AVG4 t20, t30, t00, t10, $rndVal |
| M_AVG4 t21, t31, t01, t11, $rndVal |
| STRD t20, t21, [pDst], dstStep |
| |
| BNE YloopHalfPixelXYRnd$rndVal.Offset$offset ;// rows remain |
| |
| ;// The handler expanded last (rounding 1, offset 3) falls through into |
| ;// SwitchPredictTypeEnd instead of branching to it |
| IF $offset/=3 :LOR: $rndVal/=1 |
| B SwitchPredictTypeEnd |
| ENDIF |
| MEND |
| ;// *************************************************************************** |
| ;// Motion compensation handler macros end here |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal |
| ;// combination in the "switch" to prediction processing code segment |
| ;// |
| ;// Syntax: |
| ;// M_CASE_OFFSET $rnd, $predictType |
| ;// |
| ;// Inputs: |
| ;// $rnd 0 for rounding, 1 for no rounding |
| ;// $predictType The prediction mode |
| ;// |
| ;// Outputs: |
| ;// Populated list of "M_CASE"s for the "M_SWITCH" macro |
| |
| MACRO |
| M_CASE_OFFSET $rnd, $predictType |
| ;// One switch entry per source alignment offset, in ascending offset order. |
| ;// The order must match the index built in the function body, whose two |
| ;// least significant bits are (pSrc AND 3). |
| M_CASE Case$predictType.Rnd$rnd.Offset0 |
| M_CASE Case$predictType.Rnd$rnd.Offset1 |
| M_CASE Case$predictType.Rnd$rnd.Offset2 |
| M_CASE Case$predictType.Rnd$rnd.Offset3 |
| MEND |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Populates all 2 kinds of rounding "cases" for each predictType in the |
| ;// "switch" to prediction processing code segment |
| ;// |
| ;// Syntax: |
| ;// M_CASE_MCRECONBLOCK $predictType |
| ;// |
| ;// Inputs: |
| ;// $predictType The prediction mode |
| ;// |
| ;// Outputs: |
| ;// Populated list of "M_CASE_OFFSET" macros |
| |
| MACRO |
| M_CASE_MCRECONBLOCK $predictType |
| ;// Eight switch entries for one predictType: the four offsets for rndVal 0 |
| ;// followed by the four offsets for rndVal 1. This matches the switch index |
| ;// built in the function body, where rndVal is bit 2 and the offset is |
| ;// bits 1:0. |
| M_CASE_OFFSET 0, $predictType ;// 0 for rounding |
| M_CASE_OFFSET 1, $predictType ;// 1 for no rounding |
| MEND |
| ;// *************************************************************************** |
| ;// Description: |
| ;// Populates all 8 kinds of rounding and offset combinations handling macros |
| ;// for the specified predictType. In case of "IntegerPixel" predictType, |
| ;// rounding is not required so same code segment handles both cases |
| ;// |
| ;// Syntax: |
| ;// M_MCRECONBLOCK $predictType |
| ;// |
| ;// Inputs: |
| ;// $predictType The prediction mode |
| ;// |
| ;// Outputs: |
| ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified |
| ;// predictType. Each |
| ;// M_MCRECONBLOCK_<predictType> $rnd, $offset |
| ;// is a code segment (starting with a label indicating the predictType, |
| ;// rounding and offset combination) |
| ;// Four calls of this macro with the 4 prediction modes populate all the 32 |
| ;// handlers |
| |
| MACRO |
| M_MCRECONBLOCK $predictType |
| ;// Expands the handler code for every (rndVal, offset) pair of one |
| ;// predictType. Expansion order matters for HalfPixelXY: its (1, 3) handler |
| ;// is emitted last and ends without a branch, so it relies on being the |
| ;// final handler placed before the SwitchPredictTypeEnd label. |
| M_MCRECONBLOCK_$predictType 0, 0 |
| M_MCRECONBLOCK_$predictType 0, 1 |
| M_MCRECONBLOCK_$predictType 0, 2 |
| M_MCRECONBLOCK_$predictType 0, 3 |
| ;// The IntegerPixel handler defines both its Rnd0 and Rnd1 labels itself, |
| ;// so expanding it again for rndVal 1 would only duplicate labels and code |
| IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference |
| M_MCRECONBLOCK_$predictType 1, 0 |
| M_MCRECONBLOCK_$predictType 1, 1 |
| M_MCRECONBLOCK_$predictType 1, 2 |
| M_MCRECONBLOCK_$predictType 1, 3 |
| ENDIF |
| MEND |
| ;// *************************************************************************** |
| ;// Input/Output Registers |
| ;// Several names below deliberately share one physical register; each |
| ;// alias is only valid while the value of the other alias is dead. |
| pSrc RN 0 |
| srcStep RN 1 |
| arg_pSrcResidue RN 2 ;// incoming argument; spilled to the stack before r2 is reused as dstStep |
| pSrcResidue RN 12 ;// reloaded from the stack for the residue pass |
| pDst RN 3 |
| dstStep RN 2 |
| predictType RN 10 ;// also carries the combined switch index |
| rndVal RN 11 |
| mask RN 11 ;// reuses rndVal's register inside the handlers |
| |
| ;// Local Scratch Registers |
| zero RN 12 |
| y RN 14 ;// row counter |
| |
| tmp1 RN 4 |
| tmp2 RN 5 |
| tmp3 RN 6 |
| tmp4 RN 7 |
| tmp5 RN 8 |
| tmp6 RN 9 |
| tmp7 RN 10 |
| tmp8 RN 11 |
| tmp9 RN 12 |
| |
| ;// HalfPixelXY working set: the same registers as tmp1..tmp8 under names |
| ;// that reflect their role in that handler, plus "tmp" as scratch |
| t00 RN 4 |
| t01 RN 5 |
| t10 RN 6 |
| t11 RN 7 |
| t20 RN 8 |
| t21 RN 9 |
| t30 RN 10 |
| t31 RN 11 |
| tmp RN 12 |
| |
| yMask RN 14 ;// same register as y: row counter in bits 3:0, byte mask above it |
| |
| dst RN 1 ;// reuses srcStep's register during the residue pass |
| return RN 0 |
| |
| ;// Allocate memory on stack |
| ;// (two 4-byte slots: pDst and pSrcResidue are needed again after the |
| ;// prediction pass has advanced/reused their registers) |
| M_ALLOC4 Stk_pDst, 4 |
| M_ALLOC4 Stk_pSrcResidue, 4 |
| ;// Function header |
| ;// NOTE(review): the "r11" operand presumably makes M_START preserve the |
| ;// callee-saved registers up to r11 - confirm against armCOMM_s.h |
| M_START omxVCM4P2_MCReconBlock, r11 |
| ;// Define stack arguments |
| M_ARG Arg_dstStep, 4 |
| M_ARG Arg_predictType, 4 |
| M_ARG Arg_rndVal, 4 |
| ;// Save on stack |
| M_STR pDst, Stk_pDst |
| M_STR arg_pSrcResidue, Stk_pSrcResidue |
| ;// Load argument from the stack |
| ;// (dstStep overwrites r2, which held pSrcResidue until the store above) |
| M_LDR dstStep, Arg_dstStep |
| M_LDR predictType, Arg_predictType |
| M_LDR rndVal, Arg_rndVal |
| |
| MOV y, #8 ;// rows per block |
| |
| ;// Switch index = predictType * 8 + rndVal * 4 + (pSrc AND 3) |
| AND tmp1, pSrc, #3 |
| ORR predictType, tmp1, predictType, LSL #3 |
| ORR predictType, predictType, rndVal, LSL #2 |
| ;// Truncating source pointer to align to 4 byte location |
| BIC pSrc, pSrc, #3 |
| |
| ;// Implementation takes care of all combinations of different |
| ;// predictTypes, rounding cases and source pointer offsets to alignment |
| ;// of 4 bytes in different code bases unless one of these parameter wasn't |
| ;// making any difference to the implementation. Below M_CASE_MCRECONBLOCK |
| ;// macros branch into 8 M_CASE macros for all combinations of the 2 |
| ;// rounding cases and 4 offsets of the pSrc pointer to the 4 byte |
| ;// alignment. |
| ;// NOTE(review): the case order assumes the predictType argument is |
| ;// numbered 0..3 in the order listed here - confirm against the enum. |
| M_SWITCH predictType |
| M_CASE_MCRECONBLOCK IntegerPixel |
| M_CASE_MCRECONBLOCK HalfPixelX |
| M_CASE_MCRECONBLOCK HalfPixelY |
| M_CASE_MCRECONBLOCK HalfPixelXY |
| M_ENDSWITCH |
| |
| ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8 |
| ;// particular macros (4 in case of IntegerPixel as rounding makes no |
| ;// difference there) to generate the code for all cases of rounding and |
| ;// offsets. LTORG is used to segment the code as code size bloated beyond |
| ;// 4KB. |
| ;// HalfPixelXY must stay last: its final handler falls through into |
| ;// SwitchPredictTypeEnd. |
| M_MCRECONBLOCK IntegerPixel |
| M_MCRECONBLOCK HalfPixelX |
| LTORG |
| M_MCRECONBLOCK HalfPixelY |
| M_MCRECONBLOCK HalfPixelXY |
| SwitchPredictTypeEnd |
| |
| ;// Residue Addition |
| ;// This is done in 2 lane SIMD though loads are further optimized and |
| ;// 4 bytes are loaded in case of destination buffer. Algorithmic |
| ;// details are in inlined comments |
| ;// The pass is skipped entirely when the saved pSrcResidue pointer is 0. |
| M_LDR pSrcResidue, Stk_pSrcResidue |
| CMP pSrcResidue, #0 |
| BEQ pSrcResidueConditionEnd |
| pSrcResidueNotNull |
| M_LDR pDst, Stk_pDst ;// rewind to the first row of the predicted block |
| MOV y, #8 |
| SUB dstStep, dstStep, #4 ;// the first store of each row already advances pDst by 4 |
| Yloop_pSrcResidueNotNull |
| SUBS y, y, #1 ;// flags are not modified again before the BGT |
| ;// Pixels 0-3 of the row. Letters name 16-bit residues (upper case) and |
| ;// 8-bit pixels (lower case), written most significant first. |
| LDR dst, [pDst] ;// dst = [dcba] |
| LDMIA pSrcResidue!, {tmp1, tmp2} ;// tmp1=[BA] tmp2=[DC] (little-endian) |
| PKHBT tmp3, tmp1, tmp2, LSL #16 ;// Deltaval1 = [C A] |
| PKHTB tmp4, tmp2, tmp1, ASR #16 ;// DeltaVal2 = [D B] |
| UXTB16 tmp1, dst ;// tmp1 = [0c0a] |
| UXTB16 tmp2, dst, ROR #8 ;// tmp2 = [0d0b] |
| QADD16 tmp1, tmp1, tmp3 ;// Add and saturate to 16 bits |
| QADD16 tmp2, tmp2, tmp4 |
| USAT16 tmp1, #8, tmp1 ;// clip both halfwords to 0..255 |
| USAT16 tmp2, #8, tmp2 ;// armClip(0, 255, tmp2) |
| ORR tmp1, tmp1, tmp2, LSL #8 ;// tmp1 = [dcba] |
| STR tmp1, [pDst], #4 |
| |
| ;// Pixels 4-7 of the row: same sequence on the next four residues |
| LDR dst, [pDst] |
| LDMIA pSrcResidue!, {tmp1, tmp2} |
| PKHBT tmp3, tmp1, tmp2, LSL #16 |
| PKHTB tmp4, tmp2, tmp1, ASR #16 |
| UXTB16 tmp1, dst |
| UXTB16 tmp2, dst, ROR #8 |
| QADD16 tmp1, tmp1, tmp3 |
| QADD16 tmp2, tmp2, tmp4 |
| USAT16 tmp1, #8, tmp1 |
| USAT16 tmp2, #8, tmp2 |
| ORR tmp1, tmp1, tmp2, LSL #8 |
| STR tmp1, [pDst], dstStep ;// dstStep was reduced by 4 above: lands on the next row |
| |
| BGT Yloop_pSrcResidueNotNull |
| pSrcResidueConditionEnd |
| |
| MOV return, #OMX_Sts_NoErr |
| |
| M_END |
| ENDIF ;// ARM1136JS |
| |
| ;// *************************************************************************** |
| ;// CortexA8 implementation |
| ;// *************************************************************************** |
| END |
| ;// *************************************************************************** |
| ;// omxVCM4P2_MCReconBlock ends |
| ;// *************************************************************************** |