| ;// |
| ;// Copyright (C) 2007-2008 ARM Limited |
| ;// |
| ;// Licensed under the Apache License, Version 2.0 (the "License"); |
| ;// you may not use this file except in compliance with the License. |
| ;// You may obtain a copy of the License at |
| ;// |
| ;// http://www.apache.org/licenses/LICENSE-2.0 |
| ;// |
| ;// Unless required by applicable law or agreed to in writing, software |
| ;// distributed under the License is distributed on an "AS IS" BASIS, |
| ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ;// See the License for the specific language governing permissions and |
| ;// limitations under the License. |
| ;// |
| ;// |
| ;// |
| ;// File Name: armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s |
| ;// OpenMAX DL: v1.0.2 |
| ;// Revision: 12290 |
| ;// Date: Wednesday, April 9, 2008 |
| ;// |
| ;// |
| ;// |
| ;// |
| |
| INCLUDE omxtypes_s.h |
| INCLUDE armCOMM_s.h |
| |
| M_VARIANTS CortexA8 |
| |
| EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe |
| |
| IF CortexA8 |
| |
| M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11 |
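;//
;// Description:
;// Computes the H.264 half-pel vertical interpolation for a 4x4 block
;// using the 6-tap filter (1, -5, 20, 20, -5, 1). Nine consecutive
;// source rows (labelled a..i below) are loaded; for each output row,
;// with p0..p5 denoting the six source rows centred on it:
;//
;// Acc = (p0 + p5) - 5*(p1 + p4) + 20*(p2 + p3)
;// dst = clip8((Acc + 16) >> 5)
;//
;// Each load is eight bytes wide, but only the low four 16-bit lanes are
;// carried through the full filter, so each result row holds four valid
;// pixels. "_unsafe" means no argument or bounds checking: the caller
;// must guarantee that the two rows above and three rows below the block
;// are readable. Note that nothing is stored through pDst here; the four
;// result rows are left in dAcc0-dAcc3 (d0, d2, d4, d6) for the caller.
;//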
| |
| ;// Declare input registers |
| pSrc RN 0 |
| srcStep RN 1 |
| pDst RN 2 |
| dstStep RN 3 |
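
;// pDst/dstStep follow the common interpolation prototype but are not
;// referenced below; the results are returned in registers (see above).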
| |
| Temp RN 12 |
| |
| ;// Declare Neon registers |
| dCoeff5 DN 30.S16 |
| dCoeff20 DN 31.S16 |
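
;// The caller preloads the filter constants before entry:
;// dCoeff5 = 5, dCoeff20 = 20 (they are not initialized in this routine).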
| |
| dSrc0 DN 7.U8 |
| dSrc1 DN 8.U8 |
| dSrc2 DN 9.U8 |
| dSrc3 DN 10.U8 |
| dSrc4 DN 11.U8 |
| dSrc5 DN 12.U8 |
| dSrc6 DN 13.U8 |
| dSrc7 DN 14.U8 |
| dSrc8 DN 15.U8 |
| |
| qSumBE01 QN 8.S16 |
| qSumCD01 QN 9.S16 |
| dSumBE0 DN 16.S16 |
| dSumCD0 DN 18.S16 |
| |
| qAcc01 QN 0.S16 |
| qAcc23 QN 1.S16 |
| qAcc45 QN 2.S16 |
| qAcc67 QN 3.S16 |
| |
| dRes0 DN 0.S16 |
| dRes1 DN 2.S16 |
| dRes2 DN 4.S16 |
| dRes3 DN 6.S16 |
| |
| dAcc0 DN 0.U8 |
| dAcc1 DN 2.U8 |
| dAcc2 DN 4.U8 |
| dAcc3 DN 6.U8 |
| |
| |
| dTmp0 DN 20.S16 |
| dTmp1 DN 21.S16 |
| dTmp2 DN 22.S16 |
| dTmp3 DN 23.S16 |
| |
| |
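;// Nine rows are loaded through two pointers: pSrc walks rows a..e and
;// Temp (set to pSrc + 4*srcStep once pSrc has advanced to row b) walks
;// rows f..i. The loads are interleaved with the arithmetic for
;// Row 0: Acc = (a + f) - 5*(b + e) + 20*(c + d)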
VLD1 dSrc0, [pSrc], srcStep ;// [a0 a1 a2 a3 .. ]
ADD Temp, pSrc, srcStep, LSL #2 ;// Temp -> row f (pSrc now points at row b)
VLD1 dSrc1, [pSrc], srcStep ;// [b0 b1 b2 b3 .. ]
;// One cycle stall
VLD1 dSrc5, [Temp], srcStep ;// [f0 f1 f2 f3 .. ]
;// One cycle stall
VLD1 dSrc2, [pSrc], srcStep ;// [c0 c1 c2 c3 .. ]
VADDL qAcc01, dSrc0, dSrc5 ;// Acc = a+f
VLD1 dSrc3, [pSrc], srcStep ;// [d0 d1 d2 d3 .. ]
;// One cycle stall
VLD1 dSrc6, [Temp], srcStep ;// [g0 g1 g2 g3 .. ] (load hoisted to hide latency)

VLD1 dSrc4, [pSrc], srcStep ;// [e0 e1 e2 e3 .. ]
VLD1 dSrc7, [Temp], srcStep ;// [h0 h1 h2 h3 .. ] (hoisted)
VADDL qSumBE01, dSrc1, dSrc4 ;// b+e
VADDL qSumCD01, dSrc2, dSrc3 ;// c+d
VLD1 dSrc8, [Temp], srcStep ;// [i0 i1 i2 i3 .. ] (hoisted)
VMLS dRes0, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e)
; VMLA dRes0, dSumCD0, dCoeff20 ;// (disabled; replaced by VMUL + deferred VADD)
VMUL dTmp0, dSumCD0, dCoeff20 ;// Tmp = 20*(c+d)
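;// For rows 0-2 the 20*(p2+p3) term is computed with VMUL into a dTmp
;// register and added to the accumulator later (the VADD batch before the
;// row-3 VMLA), instead of a VMLA straight after the VMLS; presumably
;// this schedules better on the Cortex-A8 NEON pipeline by avoiding a
;// dependent multiply-accumulate chain.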
| |
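;// Row 1: Acc = (b + g) - 5*(c + f) + 20*(d + e)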
; VLD1 dSrc6, [Temp], srcStep ;// (disabled: load hoisted above)
VADDL qSumBE01, dSrc2, dSrc5 ;// c+f
VADDL qSumCD01, dSrc3, dSrc4 ;// d+e
VADDL qAcc23, dSrc1, dSrc6 ;// Acc = b+g
VMLS dRes1, dSumBE0, dCoeff5 ;// Acc -= 5*(c+f)
; VMLA dRes1, dSumCD0, dCoeff20 ;// (disabled; replaced by VMUL + deferred VADD)
VMUL dTmp1, dSumCD0, dCoeff20 ;// Tmp = 20*(d+e)
| |
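;// Row 2: Acc = (c + h) - 5*(d + g) + 20*(e + f)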
; VLD1 dSrc7, [Temp], srcStep ;// (disabled: load hoisted above)
VADDL qSumBE01, dSrc3, dSrc6 ;// d+g
VADDL qSumCD01, dSrc4, dSrc5 ;// e+f
VADDL qAcc45, dSrc2, dSrc7 ;// Acc = c+h
VMLS dRes2, dSumBE0, dCoeff5 ;// Acc -= 5*(d+g)
; VMLA dRes2, dSumCD0, dCoeff20 ;// (disabled; replaced by VMUL + deferred VADD)
VMUL dTmp2, dSumCD0, dCoeff20 ;// Tmp = 20*(e+f)
| |
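;// Row 3: Acc = (d + i) - 5*(e + h) + 20*(f + g)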
; VLD1 dSrc8, [Temp], srcStep ;// [i0 i1 i2 i3 .. ] (disabled: load hoisted above)
VADDL qSumBE01, dSrc4, dSrc7 ;// e+h
VADDL qAcc67, dSrc3, dSrc8 ;// Acc = d+i
VADDL qSumCD01, dSrc5, dSrc6 ;// f+g
VMLS dRes3, dSumBE0, dCoeff5 ;// Acc -= 5*(e+h)
VADD dRes0, dRes0, dTmp0 ;// deferred Acc += 20*(c+d), row 0
VADD dRes1, dRes1, dTmp1 ;// deferred Acc += 20*(d+e), row 1
VADD dRes2, dRes2, dTmp2 ;// deferred Acc += 20*(e+f), row 2
VMLA dRes3, dSumCD0, dCoeff20 ;// Acc += 20*(f+g)
; VMUL dTmp3, dSumCD0, dCoeff20 ;// (disabled; row 3 keeps the VMLA form)
; VADD dRes3, dRes3, dTmp3
| |
VQRSHRUN dAcc0, qAcc01, #5 ;// dAcc0 = clip8((Acc + 16) >> 5), row 0
VQRSHRUN dAcc1, qAcc23, #5 ;// row 1
VQRSHRUN dAcc2, qAcc45, #5 ;// row 2
VQRSHRUN dAcc3, qAcc67, #5 ;// row 3
| |
| M_END |
| |
| ENDIF |
| |
| |
| |
| END |
| |