| ;// |
| ;// Copyright (C) 2007-2008 ARM Limited |
| ;// |
| ;// Licensed under the Apache License, Version 2.0 (the "License"); |
| ;// you may not use this file except in compliance with the License. |
| ;// You may obtain a copy of the License at |
| ;// |
| ;// http://www.apache.org/licenses/LICENSE-2.0 |
| ;// |
| ;// Unless required by applicable law or agreed to in writing, software |
| ;// distributed under the License is distributed on an "AS IS" BASIS, |
| ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ;// See the License for the specific language governing permissions and |
| ;// limitations under the License. |
| ;// |
| ;// |
| ;// |
| ;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s |
| ;// OpenMAX DL: v1.0.2 |
| ;// Revision: 12290 |
| ;// Date: Wednesday, April 9, 2008 |
| ;// |
| ;// |
| ;// |
| ;// |
| ;// Description: |
| ;// H.264 inverse quantize and transform module |
| ;// |
| ;// |
| |
| ;// Include standard headers |
| |
| INCLUDE omxtypes_s.h |
| INCLUDE armCOMM_s.h |
| |
| ;// Import/Export symbols required from/to other files |
| ;// (For example tables) |
| |
| IMPORT armVCM4P10_UnpackBlock4x4 |
| IMPORT armVCM4P10_QPDivTable |
| IMPORT armVCM4P10_VMatrixQPModTable |
| |
| M_VARIANTS CortexA8 |
| |
| ;// Set debugging level |
| ;//DEBUG_ON SETL {TRUE} |
| |
| |
| ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4 |
| |
| |
| ;// Guarding implementation by the processor name |
| |
| |
| |
| ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4 - in-place 4x4 add/subtract (butterfly) inverse transform followed by dequantization of the 16-bit block at pData (r0), scaled according to QP (r1) |
| |
| ;// Guarding implementation by the processor name (the body below is assembled only when the CortexA8 variant is selected) |
| |
| IF CortexA8 |
| |
| ;//Input Registers: pData (r0) = address of the 16 x 16-bit block, read and then overwritten in place; QP (r1) = byte index used for both look-up tables |
| pData RN 0 |
| QP RN 1 |
| |
| |
| ;//Local Scratch Registers |
| |
| ;// ARM Registers (r2/r3 hold the table base addresses; r4/r5 hold Shift and the final scale factor) |
| |
| pQPDivTable RN 2 |
| pQPModTable RN 3 |
| Shift RN 4 |
| Scale RN 5 |
| |
| ;// NEON Registers (NOTE: the aliases below map successive stages onto the same D0-D7 registers; each stage overwrites the previous one) |
| |
| ;// Packed Input pixels (D0-D3, filled by the de-interleaving VLD4 load) |
| dIn0 DN D0.S16 |
| dIn1 DN D1.S16 |
| dIn2 DN D2.S16 |
| dIn3 DN D3.S16 |
| |
| ;// Intermediate calculations: pairwise sums and differences of the row stage (D4-D7) |
| dRowSum1 DN D4.S16 |
| dRowSum2 DN D5.S16 |
| dRowDiff1 DN D6.S16 |
| dRowDiff2 DN D7.S16 |
| |
| ;// Row operated pixels (written back over D0-D3; the Q0/Q1 views are used for the 32-bit step of the transpose) |
| dRowOp0 DN D0.S16 |
| dRowOp1 DN D1.S16 |
| dRowOp2 DN D2.S16 |
| dRowOp3 DN D3.S16 |
| qRowOp01 QN Q0.32 |
| qRowOp23 QN Q1.32 |
| |
| ;// Intermediate calculations: pairwise sums and differences of the column stage (reuses D4-D7) |
| dColSum1 DN D4.S16 |
| dColSum2 DN D5.S16 |
| dColDiff1 DN D6.S16 |
| dColDiff2 DN D7.S16 |
| |
| ;// Column operated pixels (written back over D0-D3) |
| dColOp0 DN D0.S16 |
| dColOp1 DN D1.S16 |
| dColOp2 DN D2.S16 |
| dColOp3 DN D3.S16 |
| |
| ;// Temporary scratch variables: dScale reuses D5; qRound0-qRound3 are Q3-Q6 (D6-D13) and serve as 32-bit accumulators |
| |
| dScale DN D5.S16 |
| qRound0 QN Q3.S32 |
| qRound1 QN Q4.S32 |
| qRound2 QN Q5.S32 |
| qRound3 QN Q6.S32 |
| |
| ;// InvTransformed and Dequantized pixels (narrowed back into D0-D3 before the store) |
| dOut0 DN D0.S16 |
| dOut1 DN D1.S16 |
| dOut2 DN D2.S16 |
| dOut3 DN D3.S16 |
| |
| |
| ;// Allocate stack memory required by the function (nothing is allocated here; all work is done in registers) |
| |
| |
| ;// Write function header; r5 and d13 are the highest ARM/NEON registers written below - presumably M_START uses them to decide what to save and restore (confirm against armCOMM_s.h) |
| M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13 |
| |
| ;****************************************************************** |
| ;// The strategy used in implementing the transform is as follows:* |
| ;// Load the 4x4 block into 4 D-registers * |
| ;// Transpose the 4x4 matrix (done by the same VLD4 load) * |
| ;// Perform the row operations (on columns) using SIMD * |
| ;// Transpose the 4x4 result matrix * |
| ;// Perform the column operations * |
| ;****************************************************************** |
| |
| ;// Load all the 4x4 pixels in Transposed form: VLD4 de-interleaves the 16-bit elements, so dIn0..dIn3 receive elements {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15} |
| |
| VLD4 {dIn0,dIn1,dIn2,dIn3},[pData] |
| LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer |
| LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer |
| |
| ;**************************************** |
| ;// Row Operations (Performed on columns) |
| ;**************************************** |
| ;// Scale factor calculation is done using ARM instructions |
| ;// interleaved with NEON instructions in order to dual issue; keep this instruction order when editing |
| |
| VADD dRowSum1,dIn0,dIn1 |
| VADD dRowSum2,dIn2,dIn3 |
| VSUB dRowDiff1,dIn0,dIn1 |
| LDRSB Shift, [pQPDivTable, QP] ;// ARM CODE: Shift = pQPDivTable[QP] (sign-extended byte) |
| VSUB dRowDiff2,dIn2,dIn3 |
| LDRSB Scale, [pQPModTable, QP] ;// ARM CODE: Scale = pQPModTable[QP] (sign-extended byte) |
| VADD dRowOp0,dRowSum1,dRowSum2 |
| VSUB dRowOp1,dRowSum1,dRowSum2 |
| VSUB dRowOp2,dRowDiff1,dRowDiff2 |
| LSL Scale, Scale, Shift ;// ARM CODE: Scale = Scale << Shift (final multiplier used by the dequantize step) |
| VADD dRowOp3,dRowDiff1,dRowDiff2 |
| |
| ;**************************************** |
| ;// Transpose the resultant matrix: VTRN on the D-register pairs, then a 32-bit VTRN on the Q0/Q1 views |
| ;**************************************** |
| |
| VTRN dRowOp0,dRowOp1 |
| VTRN dRowOp2,dRowOp3 |
| VTRN qRowOp01,qRowOp23 |
| |
| ;**************************************** |
| ;// Column Operations (same add/subtract pattern as the row stage, applied to the transposed data) |
| ;**************************************** |
| |
| VADD dColSum1,dRowOp0,dRowOp1 |
| VADD dColSum2,dRowOp2,dRowOp3 |
| VSUB dColDiff1,dRowOp0,dRowOp1 |
| VSUB dColDiff2,dRowOp2,dRowOp3 |
| VADD dColOp0,dColSum1,dColSum2 |
| VSUB dColOp1,dColSum1,dColSum2 |
| VSUB dColOp2,dColDiff1,dColDiff2 |
| VADD dColOp3,dColDiff1,dColDiff2 |
| |
| ;//---------------------------------------------------------------------- |
| ;// |
| ;// <Dequantize> improves on the c-reference code |
| ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together |
| ;// We do not subtract 2 from Shift as in C reference, instead perform a |
| ;// Scale << Shift once in the beginning and do a right shift by a |
| ;// constant 2 after the Multiplication. The value of Round would be 2 |
| ;// |
| ;// By doing this we avoid the branches required and also |
| ;// reduce the code size substantially |
| ;// Net effect per element: out = (OMX_S16)((value * (Scale << Shift) + 2) >> 2) |
| ;//---------------------------------------------------------------------- |
| |
| |
| VDUP dScale, Scale ;// ARM -> NEON copy 'scale' to every lane of the vector |
| |
| |
| VMOV qRound0,#2 ;// Set the Round Value (2) in every 32-bit accumulator lane |
| VMOV qRound1,#2 |
| VMOV qRound2,#2 |
| VMOV qRound3,#2 |
| |
| VMLAL qRound0,dColOp0,dScale ;// pDst[i] * Scale + Round (widening multiply-accumulate into 32 bits) |
| VMLAL qRound1,dColOp1,dScale |
| VMLAL qRound2,dColOp2,dScale |
| VMLAL qRound3,dColOp3,dScale |
| |
| VSHRN dOut0,qRound0,#2 ;// Right shift by 2 & (OMX_S16)Value (narrowing keeps the low 16 bits, no saturation) |
| VSHRN dOut1,qRound1,#2 |
| VSHRN dOut2,qRound2,#2 |
| VSHRN dOut3,qRound3,#2 |
| |
| ;*************************** |
| ;// Store all the 4x4 pixels back over the input block at pData |
| ;*************************** |
| |
| VST1 {dOut0,dOut1,dOut2,dOut3}, [pData] |
| |
| |
| ;// Set return value (none: no result register is written by this function) |
| |
| ;// Write function tail |
| M_END |
| |
| ENDIF ;//CortexA8 |
| |
| |
| |
| ;// Function: omxVCM4P10_TransformDequantLumaDCFromPair |
| ;// |
| ;// Calls armVCM4P10_UnpackBlock4x4 with the incoming r0/r1 (ppSrc, pDst), |
| ;// then armVCM4P10_InvTransformDequantLumaDC4x4 with r0 = pDst, r1 = QP. |
| ;// Always returns OMX_Sts_NoErr in r0. |
| ;// NOTE(review): the unpack helper presumably expands the packed |
| ;// coefficient stream at *ppSrc into the 4x4 block at pDst - confirm |
| ;// against its own source file. |
| |
| ;// Arguments as received from the caller |
| ppSrc RN 0 |
| pDst RN 1 |
| QPR2 RN 2 |
| |
| ;// Value handed back to the caller |
| result RN 0 |
| |
| ;// Copies of pDst and QP, taken before the first call and read back after it |
| pDstKeep RN 4 |
| QPKeep RN 5 |
| |
| ;// Argument registers of the second call |
| pBlockArg RN 0 |
| QPArg RN 1 |
| |
| ;// Assemble the body only for the CortexA8 variant |
| |
| IF CortexA8 |
| |
| ;// No stack memory is allocated by this function |
| |
| ;// Function header; r4 and r5 are written below |
| M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5 |
| |
| ;// Stage 1: r0/r1 still hold ppSrc/pDst, so only r1 and r2 need copying |
| MOV QPKeep,QPR2 ;// r5 = QP |
| MOV pDstKeep,pDst ;// r4 = pDst |
| BL armVCM4P10_UnpackBlock4x4 |
| |
| ;// Stage 2: rebuild the argument registers from the copies |
| MOV QPArg,QPKeep ;// r1 = QP |
| MOV pBlockArg,pDstKeep ;// r0 = pDst |
| BL armVCM4P10_InvTransformDequantLumaDC4x4 |
| |
| ;// Report success unconditionally |
| MOV result,#OMX_Sts_NoErr |
| |
| ;// Function tail |
| M_END |
| |
| ENDIF ;//CortexA8 |
| |
| |
| END |