| /* |
| * Loongson LASX optimized h264idct |
| * |
| * Copyright (c) 2023 Loongson Technology Corporation Limited |
| * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "loongson_asm.S" |
| |
| /* |
| * 4x4 inverse transform of a residual block, added to 8-bit pixels (LSX). |
| * |
| * C counterpart: |
| * #define FUNC2(a, b, c) FUNC3(a, b, c) |
| * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
| * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride) |
| * |
| * a0 = _dst, a1 = _block, a2 = stride. |
| * Reads the 32 bytes of coefficients at a1 and overwrites them with zero, |
| * then reads and rewrites 4 bytes at a0, a0 + a2, a0 + 2 * a2 and a0 + 3 * a2. |
| * LSX optimization is enough for this function. |
| */ |
| function ff_h264_idct_add_8_lsx |
| fld.d f0, a1, 0 /* f0..f3 = block rows 0..3 (four halfwords each), used below as vr0..vr3 */ |
| fld.d f1, a1, 8 |
| fld.d f2, a1, 16 |
| fld.d f3, a1, 24 |
| vxor.v vr7, vr7, vr7 /* vr7 = 0 */ |
| add.d t2, a2, a2 /* t2 = 2 * stride */ |
| add.d t3, t2, a2 /* t3 = 3 * stride */ |
| vst vr7, a1, 0 /* zero the 32-byte block now that it is in registers */ |
| vst vr7, a1, 16 |
| /* first 1-D pass */ |
| vadd.h vr4, vr0, vr2 /* r0 + r2 */ |
| vsub.h vr5, vr0, vr2 /* r0 - r2 */ |
| vsrai.h vr6, vr1, 1 |
| vsrai.h vr7, vr3, 1 |
| vsub.h vr6, vr6, vr3 /* (r1 >> 1) - r3 */ |
| vadd.h vr7, vr1, vr7 /* r1 + (r3 >> 1) */ |
| LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 /* macro from loongson_asm.S; last four operands are presumably the outputs -- confirm */ |
| LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5 /* in-place 4x4 halfword transpose; vr4, vr5 presumably scratch */ |
| vadd.h vr4, vr0, vr2 /* second 1-D pass, same arithmetic as the first */ |
| vsub.h vr5, vr0, vr2 |
| vsrai.h vr6, vr1, 1 |
| vsrai.h vr7, vr3, 1 |
| vsub.h vr6, vr6, vr3 |
| vadd.h vr7, vr1, vr7 |
| LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 |
| |
| fld.s f4, a0, 0 /* four pixels from each of the four picture rows */ |
| fldx.s f5, a0, a2 |
| fldx.s f6, a0, t2 |
| fldx.s f7, a0, t3 |
| |
| vsrari.h vr0, vr0, 6 /* rounding shift: (x + 32) >> 6 */ |
| vsrari.h vr1, vr1, 6 |
| vsrari.h vr2, vr2, 6 |
| vsrari.h vr3, vr3, 6 |
| |
| vsllwil.hu.bu vr4, vr4, 0 /* zero-extend the pixels to 16 bit */ |
| vsllwil.hu.bu vr5, vr5, 0 |
| vsllwil.hu.bu vr6, vr6, 0 |
| vsllwil.hu.bu vr7, vr7, 0 |
| vadd.h vr0, vr0, vr4 /* residual + pixel */ |
| vadd.h vr1, vr1, vr5 |
| vadd.h vr2, vr2, vr6 |
| vadd.h vr3, vr3, vr7 |
| vssrarni.bu.h vr1, vr0, 0 /* saturate to unsigned bytes: vr0 -> low half of vr1, vr1 -> high half */ |
| vssrarni.bu.h vr3, vr2, 0 /* same for vr2 (low half) and vr3 (high half) */ |
| |
| vbsrl.v vr0, vr1, 8 /* move the high half down so it can be stored through f0 */ |
| vbsrl.v vr2, vr3, 8 |
| fst.s f1, a0, 0 /* row 0 */ |
| fstx.s f0, a0, a2 /* row 1 */ |
| fstx.s f3, a0, t2 /* row 2 */ |
| fstx.s f2, a0, t3 /* row 3 */ |
| endfunc |
| |
| /* |
| * 8x8 inverse transform of a residual block, added to 8-bit pixels (LSX). |
| * |
| * C counterpart: |
| * #define FUNC2(a, b, c) FUNC3(a, b, c) |
| * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
| * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride) |
| * |
| * a0 = _dst, a1 = _block, a2 = stride. |
| * The first 1-D pass runs on 16-bit lanes, the second one on 32-bit lanes. |
| * The 128-byte block at a1 is overwritten with zero once it has been loaded. |
| * Exactly 8 bytes are read from and written to each of the 8 picture rows. |
| */ |
| function ff_h264_idct8_add_8_lsx |
| ld.h t0, a1, 0 /* t0 = block[0] */ |
| add.d t2, a2, a2 /* t2..t7 = 2..7 * stride */ |
| add.d t3, t2, a2 |
| add.d t4, t3, a2 |
| add.d t5, t4, a2 |
| add.d t6, t5, a2 |
| add.d t7, t6, a2 |
| addi.w t0, t0, 32 /* rounding bias for the plain ">> 6" at the end */ |
| st.h t0, a1, 0 /* written back before the vector loads below */ |
| |
| vld vr0, a1, 0 /* vr0..vr7 = block rows 0..7, eight halfwords each */ |
| vld vr1, a1, 16 |
| vld vr2, a1, 32 |
| vld vr3, a1, 48 |
| vld vr4, a1, 64 |
| vld vr5, a1, 80 |
| vld vr6, a1, 96 |
| vld vr7, a1, 112 |
| vxor.v vr8, vr8, vr8 /* vr8 = 0 */ |
| vst vr8, a1, 0 /* zero the whole 128-byte block */ |
| vst vr8, a1, 16 |
| vst vr8, a1, 32 |
| vst vr8, a1, 48 |
| vst vr8, a1, 64 |
| vst vr8, a1, 80 |
| vst vr8, a1, 96 |
| vst vr8, a1, 112 |
| |
| vadd.h vr18, vr0, vr4 /* first pass, even part: r0 + r4 */ |
| vsub.h vr19, vr0, vr4 /* r0 - r4 */ |
| vsrai.h vr20, vr2, 1 |
| vsrai.h vr21, vr6, 1 |
| vsub.h vr20, vr20, vr6 /* (r2 >> 1) - r6 */ |
| vadd.h vr21, vr21, vr2 /* (r6 >> 1) + r2 */ |
| LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16 |
| vsrai.h vr11, vr7, 1 /* first pass, odd part */ |
| vsrai.h vr13, vr3, 1 |
| vsrai.h vr15, vr5, 1 |
| vsrai.h vr17, vr1, 1 |
| vsub.h vr11, vr5, vr11 |
| vsub.h vr13, vr7, vr13 |
| vadd.h vr15, vr7, vr15 |
| vadd.h vr17, vr5, vr17 |
| vsub.h vr11, vr11, vr7 |
| vsub.h vr13, vr13, vr3 |
| vadd.h vr15, vr15, vr5 |
| vadd.h vr17, vr17, vr1 |
| vsub.h vr11, vr11, vr3 /* a1 = r5 - (r7 >> 1) - r7 - r3 */ |
| vadd.h vr13, vr13, vr1 /* a3 = r7 - (r3 >> 1) - r3 + r1 */ |
| vsub.h vr15, vr15, vr1 /* a5 = r7 + (r5 >> 1) + r5 - r1 */ |
| vadd.h vr17, vr17, vr3 /* a7 = r5 + (r1 >> 1) + r1 + r3 */ |
| vsrai.h vr18, vr11, 2 |
| vsrai.h vr19, vr13, 2 |
| vsrai.h vr20, vr15, 2 |
| vsrai.h vr21, vr17, 2 |
| vadd.h vr11, vr11, vr21 /* a1 + (a7 >> 2) */ |
| vadd.h vr13, vr13, vr20 /* a3 + (a5 >> 2) */ |
| vsub.h vr15, vr19, vr15 /* (a3 >> 2) - a5 */ |
| vsub.h vr17, vr17, vr18 /* a7 - (a1 >> 2) */ |
| LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \ |
| vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7 |
| |
| LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
| vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
| vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17 |
| vexth.w.h vr20, vr0 /* sign-extend the high four halfwords of each row to words */ |
| vexth.w.h vr21, vr1 |
| vexth.w.h vr22, vr2 |
| vexth.w.h vr23, vr3 |
| vexth.w.h vr8, vr4 |
| vexth.w.h vr9, vr5 |
| vexth.w.h vr18, vr6 |
| vexth.w.h vr19, vr7 |
| vsllwil.w.h vr0, vr0, 0 /* sign-extend the low four halfwords of each row to words */ |
| vsllwil.w.h vr1, vr1, 0 |
| vsllwil.w.h vr2, vr2, 0 |
| vsllwil.w.h vr3, vr3, 0 |
| vsllwil.w.h vr4, vr4, 0 |
| vsllwil.w.h vr5, vr5, 0 |
| vsllwil.w.h vr6, vr6, 0 |
| vsllwil.w.h vr7, vr7, 0 |
| |
| vadd.w vr11, vr0, vr4 /* second pass on the low halves (vr0..vr7), 32-bit lanes */ |
| vsub.w vr13, vr0, vr4 |
| vsrai.w vr15, vr2, 1 |
| vsrai.w vr17, vr6, 1 |
| vsub.w vr15, vr15, vr6 |
| vadd.w vr17, vr17, vr2 |
| LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16 |
| vsrai.w vr11, vr7, 1 |
| vsrai.w vr13, vr3, 1 |
| vsrai.w vr15, vr5, 1 |
| vsrai.w vr17, vr1, 1 |
| vsub.w vr11, vr5, vr11 |
| vsub.w vr13, vr7, vr13 |
| vadd.w vr15, vr7, vr15 |
| vadd.w vr17, vr5, vr17 |
| vsub.w vr11, vr11, vr7 |
| vsub.w vr13, vr13, vr3 |
| vadd.w vr15, vr15, vr5 |
| vadd.w vr17, vr17, vr1 |
| vsub.w vr11, vr11, vr3 |
| vadd.w vr13, vr13, vr1 |
| vsub.w vr15, vr15, vr1 |
| vadd.w vr17, vr17, vr3 |
| vsrai.w vr0, vr11, 2 |
| vsrai.w vr1, vr13, 2 |
| vsrai.w vr2, vr15, 2 |
| vsrai.w vr3, vr17, 2 |
| vadd.w vr11, vr11, vr3 |
| vadd.w vr13, vr13, vr2 |
| vsub.w vr15, vr1, vr15 |
| vsub.w vr17, vr17, vr0 |
| LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \ |
| vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 |
| |
| vadd.w vr11, vr20, vr8 /* second pass on the high halves (vr20..vr23, vr8, vr9, vr18, vr19) */ |
| vsub.w vr13, vr20, vr8 |
| vsrai.w vr15, vr22, 1 |
| vsrai.w vr17, vr18, 1 |
| vsub.w vr15, vr15, vr18 |
| vadd.w vr17, vr17, vr22 |
| LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16 |
| vsrai.w vr11, vr19, 1 |
| vsrai.w vr13, vr23, 1 |
| vsrai.w vr15, vr9, 1 |
| vsrai.w vr17, vr21, 1 |
| vsub.w vr11, vr9, vr11 |
| vsub.w vr13, vr19, vr13 |
| vadd.w vr15, vr19, vr15 |
| vadd.w vr17, vr9, vr17 |
| vsub.w vr11, vr11, vr19 |
| vsub.w vr13, vr13, vr23 |
| vadd.w vr15, vr15, vr9 |
| vadd.w vr17, vr17, vr21 |
| vsub.w vr11, vr11, vr23 |
| vadd.w vr13, vr13, vr21 |
| vsub.w vr15, vr15, vr21 |
| vadd.w vr17, vr17, vr23 |
| vsrai.w vr20, vr11, 2 |
| vsrai.w vr21, vr13, 2 |
| vsrai.w vr22, vr15, 2 |
| vsrai.w vr23, vr17, 2 |
| vadd.w vr11, vr11, vr23 |
| vadd.w vr13, vr13, vr22 |
| vsub.w vr15, vr21, vr15 |
| vsub.w vr17, vr17, vr20 |
| LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \ |
| vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19 |
| |
| fld.d f10, a0, 0 /* 8 pixels per picture row; a 16-byte vld would read 8 bytes past the block */ |
| fldx.d f11, a0, a2 |
| fldx.d f12, a0, t2 |
| fldx.d f13, a0, t3 |
| fldx.d f14, a0, t4 |
| fldx.d f15, a0, t5 |
| fldx.d f16, a0, t6 |
| fldx.d f17, a0, t7 |
| vsrani.h.w vr20, vr0, 6 /* >> 6 and narrow to 16 bit: vr0 -> low half, vr20 -> high half */ |
| vsrani.h.w vr21, vr1, 6 |
| vsrani.h.w vr22, vr2, 6 |
| vsrani.h.w vr23, vr3, 6 |
| vsrani.h.w vr8, vr4, 6 |
| vsrani.h.w vr9, vr5, 6 |
| vsrani.h.w vr18, vr6, 6 |
| vsrani.h.w vr19, vr7, 6 |
| vsllwil.hu.bu vr10, vr10, 0 /* zero-extend the low 8 pixel bytes of each row to 16 bit */ |
| vsllwil.hu.bu vr11, vr11, 0 |
| vsllwil.hu.bu vr12, vr12, 0 |
| vsllwil.hu.bu vr13, vr13, 0 |
| vsllwil.hu.bu vr14, vr14, 0 |
| vsllwil.hu.bu vr15, vr15, 0 |
| vsllwil.hu.bu vr16, vr16, 0 |
| vsllwil.hu.bu vr17, vr17, 0 |
| |
| vadd.h vr0, vr20, vr10 /* residual + pixel, rows 0..7 in vr0..vr7 */ |
| vadd.h vr1, vr21, vr11 |
| vadd.h vr2, vr22, vr12 |
| vadd.h vr3, vr23, vr13 |
| vadd.h vr4, vr8, vr14 |
| vadd.h vr5, vr9, vr15 |
| vadd.h vr6, vr18, vr16 |
| vadd.h vr7, vr19, vr17 |
| vssrarni.bu.h vr1, vr0, 0 /* saturate to unsigned bytes: even row -> low half, odd row -> high half */ |
| vssrarni.bu.h vr3, vr2, 0 |
| vssrarni.bu.h vr5, vr4, 0 |
| vssrarni.bu.h vr7, vr6, 0 |
| vbsrl.v vr0, vr1, 8 /* move each odd row down so it can be stored through an f register */ |
| vbsrl.v vr2, vr3, 8 |
| vbsrl.v vr4, vr5, 8 |
| vbsrl.v vr6, vr7, 8 |
| fst.d f1, a0, 0 /* row 0 */ |
| fstx.d f0, a0, a2 /* row 1 */ |
| fstx.d f3, a0, t2 /* row 2 */ |
| fstx.d f2, a0, t3 /* row 3 */ |
| fstx.d f5, a0, t4 /* row 4 */ |
| fstx.d f4, a0, t5 /* row 5 */ |
| fstx.d f7, a0, t6 /* row 6 */ |
| fstx.d f6, a0, t7 /* row 7 */ |
| endfunc |
| |
| /* |
| * 8x8 inverse transform of a residual block, added to 8-bit pixels (LASX). |
| * |
| * C counterpart: |
| * #define FUNC2(a, b, c) FUNC3(a, b, c) |
| * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
| * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride) |
| * |
| * a0 = _dst, a1 = _block, a2 = stride. |
| * The first 1-D pass uses 128-bit LSX operations on 16-bit lanes; the second |
| * pass widens each row to eight 32-bit words and uses 256-bit LASX operations. |
| * The 128-byte block at a1 is overwritten with zero once it has been loaded. |
| * Exactly 8 bytes are read from and written to each of the 8 picture rows. |
| */ |
| function ff_h264_idct8_add_8_lasx |
| ld.h t0, a1, 0 /* t0 = block[0] */ |
| add.d t2, a2, a2 /* t2..t7 = 2..7 * stride */ |
| add.d t3, t2, a2 |
| add.d t4, t3, a2 |
| add.d t5, t4, a2 |
| add.d t6, t5, a2 |
| add.d t7, t6, a2 |
| addi.w t0, t0, 32 /* rounding bias for the plain ">> 6" at the end */ |
| st.h t0, a1, 0 /* written back before the vector loads below */ |
| |
| vld vr0, a1, 0 /* vr0..vr7 = block rows 0..7, eight halfwords each */ |
| vld vr1, a1, 16 |
| vld vr2, a1, 32 |
| vld vr3, a1, 48 |
| vld vr4, a1, 64 |
| vld vr5, a1, 80 |
| vld vr6, a1, 96 |
| vld vr7, a1, 112 |
| xvxor.v xr8, xr8, xr8 /* xr8 = 0 */ |
| xvst xr8, a1, 0 /* zero the whole 128-byte block, 32 bytes per store */ |
| xvst xr8, a1, 32 |
| xvst xr8, a1, 64 |
| xvst xr8, a1, 96 |
| |
| vadd.h vr18, vr0, vr4 /* first pass, even part: r0 + r4 */ |
| vsub.h vr19, vr0, vr4 /* r0 - r4 */ |
| vsrai.h vr20, vr2, 1 |
| vsrai.h vr21, vr6, 1 |
| vsub.h vr20, vr20, vr6 /* (r2 >> 1) - r6 */ |
| vadd.h vr21, vr21, vr2 /* (r6 >> 1) + r2 */ |
| LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16 |
| vsrai.h vr11, vr7, 1 /* first pass, odd part */ |
| vsrai.h vr13, vr3, 1 |
| vsrai.h vr15, vr5, 1 |
| vsrai.h vr17, vr1, 1 |
| vsub.h vr11, vr5, vr11 |
| vsub.h vr13, vr7, vr13 |
| vadd.h vr15, vr7, vr15 |
| vadd.h vr17, vr5, vr17 |
| vsub.h vr11, vr11, vr7 |
| vsub.h vr13, vr13, vr3 |
| vadd.h vr15, vr15, vr5 |
| vadd.h vr17, vr17, vr1 |
| vsub.h vr11, vr11, vr3 /* a1 = r5 - (r7 >> 1) - r7 - r3 */ |
| vadd.h vr13, vr13, vr1 /* a3 = r7 - (r3 >> 1) - r3 + r1 */ |
| vsub.h vr15, vr15, vr1 /* a5 = r7 + (r5 >> 1) + r5 - r1 */ |
| vadd.h vr17, vr17, vr3 /* a7 = r5 + (r1 >> 1) + r1 + r3 */ |
| vsrai.h vr18, vr11, 2 |
| vsrai.h vr19, vr13, 2 |
| vsrai.h vr20, vr15, 2 |
| vsrai.h vr21, vr17, 2 |
| vadd.h vr11, vr11, vr21 /* a1 + (a7 >> 2) */ |
| vadd.h vr13, vr13, vr20 /* a3 + (a5 >> 2) */ |
| vsub.h vr15, vr19, vr15 /* (a3 >> 2) - a5 */ |
| vsub.h vr17, vr17, vr18 /* a7 - (a1 >> 2) */ |
| LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \ |
| vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7 |
| |
| LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
| vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
| vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17 |
| vext2xv.w.h xr0, xr0 /* sign-extend the eight halfwords of each row to eight 32-bit words */ |
| vext2xv.w.h xr1, xr1 |
| vext2xv.w.h xr2, xr2 |
| vext2xv.w.h xr3, xr3 |
| vext2xv.w.h xr4, xr4 |
| vext2xv.w.h xr5, xr5 |
| vext2xv.w.h xr6, xr6 |
| vext2xv.w.h xr7, xr7 |
| |
| xvadd.w xr11, xr0, xr4 /* second pass, same arithmetic on 32-bit lanes */ |
| xvsub.w xr13, xr0, xr4 |
| xvsrai.w xr15, xr2, 1 |
| xvsrai.w xr17, xr6, 1 |
| xvsub.w xr15, xr15, xr6 |
| xvadd.w xr17, xr17, xr2 |
| LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17, xr10, xr12, xr14, xr16 |
| xvsrai.w xr11, xr7, 1 |
| xvsrai.w xr13, xr3, 1 |
| xvsrai.w xr15, xr5, 1 |
| xvsrai.w xr17, xr1, 1 |
| xvsub.w xr11, xr5, xr11 |
| xvsub.w xr13, xr7, xr13 |
| xvadd.w xr15, xr7, xr15 |
| xvadd.w xr17, xr5, xr17 |
| xvsub.w xr11, xr11, xr7 |
| xvsub.w xr13, xr13, xr3 |
| xvadd.w xr15, xr15, xr5 |
| xvadd.w xr17, xr17, xr1 |
| xvsub.w xr11, xr11, xr3 |
| xvadd.w xr13, xr13, xr1 |
| xvsub.w xr15, xr15, xr1 |
| xvadd.w xr17, xr17, xr3 |
| xvsrai.w xr0, xr11, 2 |
| xvsrai.w xr1, xr13, 2 |
| xvsrai.w xr2, xr15, 2 |
| xvsrai.w xr3, xr17, 2 |
| xvadd.w xr11, xr11, xr3 |
| xvadd.w xr13, xr13, xr2 |
| xvsub.w xr15, xr1, xr15 |
| xvsub.w xr17, xr17, xr0 |
| LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \ |
| xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7 |
| |
| fld.d f10, a0, 0 /* 8 pixels per picture row; a 16-byte vld would read 8 bytes past the block */ |
| fldx.d f11, a0, a2 |
| fldx.d f12, a0, t2 |
| fldx.d f13, a0, t3 |
| fldx.d f14, a0, t4 |
| fldx.d f15, a0, t5 |
| fldx.d f16, a0, t6 |
| fldx.d f17, a0, t7 |
| xvldi xr8, 0x806 /* every 32-bit word of xr8 = 6, i.e. "xvldi.w xr8 6": the shift count */ |
| xvsran.h.w xr0, xr0, xr8 /* >> 6 and narrow to 16 bit, results in the low 64 bits of each 128-bit lane */ |
| xvsran.h.w xr1, xr1, xr8 |
| xvsran.h.w xr2, xr2, xr8 |
| xvsran.h.w xr3, xr3, xr8 |
| xvsran.h.w xr4, xr4, xr8 |
| xvsran.h.w xr5, xr5, xr8 |
| xvsran.h.w xr6, xr6, xr8 |
| xvsran.h.w xr7, xr7, xr8 |
| xvpermi.d xr0, xr0, 0x08 /* gather doublewords 0 and 2 into the low 128 bits: one full row in vrN */ |
| xvpermi.d xr1, xr1, 0x08 |
| xvpermi.d xr2, xr2, 0x08 |
| xvpermi.d xr3, xr3, 0x08 |
| xvpermi.d xr4, xr4, 0x08 |
| xvpermi.d xr5, xr5, 0x08 |
| xvpermi.d xr6, xr6, 0x08 |
| xvpermi.d xr7, xr7, 0x08 |
| |
| vsllwil.hu.bu vr10, vr10, 0 /* zero-extend the low 8 pixel bytes of each row to 16 bit */ |
| vsllwil.hu.bu vr11, vr11, 0 |
| vsllwil.hu.bu vr12, vr12, 0 |
| vsllwil.hu.bu vr13, vr13, 0 |
| vsllwil.hu.bu vr14, vr14, 0 |
| vsllwil.hu.bu vr15, vr15, 0 |
| vsllwil.hu.bu vr16, vr16, 0 |
| vsllwil.hu.bu vr17, vr17, 0 |
| |
| vadd.h vr0, vr0, vr10 /* residual + pixel, rows 0..7 in vr0..vr7 */ |
| vadd.h vr1, vr1, vr11 |
| vadd.h vr2, vr2, vr12 |
| vadd.h vr3, vr3, vr13 |
| vadd.h vr4, vr4, vr14 |
| vadd.h vr5, vr5, vr15 |
| vadd.h vr6, vr6, vr16 |
| vadd.h vr7, vr7, vr17 |
| vssrarni.bu.h vr1, vr0, 0 /* saturate to unsigned bytes: even row -> low half, odd row -> high half */ |
| vssrarni.bu.h vr3, vr2, 0 |
| vssrarni.bu.h vr5, vr4, 0 |
| vssrarni.bu.h vr7, vr6, 0 |
| vbsrl.v vr0, vr1, 8 /* move each odd row down so it can be stored through an f register */ |
| vbsrl.v vr2, vr3, 8 |
| vbsrl.v vr4, vr5, 8 |
| vbsrl.v vr6, vr7, 8 |
| fst.d f1, a0, 0 /* row 0 */ |
| fstx.d f0, a0, a2 /* row 1 */ |
| fstx.d f3, a0, t2 /* row 2 */ |
| fstx.d f2, a0, t3 /* row 3 */ |
| fstx.d f5, a0, t4 /* row 4 */ |
| fstx.d f4, a0, t5 /* row 5 */ |
| fstx.d f7, a0, t6 /* row 6 */ |
| fstx.d f6, a0, t7 /* row 7 */ |
| endfunc |
| |
| /* |
| * DC-only 4x4 case: add one rounded DC value to a 4x4 block of 8-bit pixels. |
| * |
| * C counterpart: |
| * #define FUNC2(a, b, c) FUNC3(a, b, c) |
| * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
| * void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride) |
| * |
| * a0 = _dst, a1 = _block, a2 = stride. block[0] is cleared after it is read. |
| * LSX optimization is enough for this function. |
| */ |
| function ff_h264_idct_dc_add_8_lsx |
| add.d t2, a2, a2 /* t2 = 2 * stride */ |
| add.d t3, a2, t2 /* t3 = 3 * stride */ |
| vldrepl.h vr4, a1, 0 /* broadcast block[0] to every halfword */ |
| fld.s f0, a0, 0 /* four pixels from each of the four rows */ |
| fldx.s f1, a0, a2 |
| fldx.s f2, a0, t2 |
| fldx.s f3, a0, t3 |
| st.h zero, a1, 0 /* block[0] = 0 */ |
| |
| vilvl.w vr0, vr1, vr0 /* low 64 bits of vr0 = rows 0 and 1 */ |
| vilvl.w vr1, vr3, vr2 /* low 64 bits of vr1 = rows 2 and 3 */ |
| vsrari.h vr4, vr4, 6 /* dc = (block[0] + 32) >> 6 */ |
| vsllwil.hu.bu vr0, vr0, 0 /* pixels to 16 bit, then add the DC */ |
| vadd.h vr0, vr0, vr4 |
| vsllwil.hu.bu vr1, vr1, 0 |
| vadd.h vr1, vr1, vr4 |
| vssrarni.bu.h vr1, vr0, 0 /* saturate to bytes; vr1 = rows 0..3, four bytes each */ |
| |
| fst.s f1, a0, 0 /* row 0 is already in the low word */ |
| vbsrl.v vr2, vr1, 4 /* row 1 */ |
| fstx.s f2, a0, a2 |
| vbsrl.v vr3, vr1, 8 /* row 2 */ |
| fstx.s f3, a0, t2 |
| vbsrl.v vr4, vr1, 12 /* row 3 */ |
| fstx.s f4, a0, t3 |
| endfunc |
| |
| /* |
| * DC-only 8x8 case: add one rounded DC value to an 8x8 block of 8-bit pixels (LSX). |
| * |
| * C counterpart: |
| * #define FUNC2(a, b, c) FUNC3(a, b, c) |
| * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
| * void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride) |
| * |
| * a0 = _dst, a1 = _block, a2 = stride. block[0] is cleared after it is read. |
| */ |
| function ff_h264_idct8_dc_add_8_lsx |
| add.d t2, a2, a2 /* t2..t7 = 2..7 * stride */ |
| add.d t3, a2, t2 |
| add.d t4, a2, t3 |
| add.d t5, a2, t4 |
| add.d t6, a2, t5 |
| add.d t7, a2, t6 |
| |
| vldrepl.h vr8, a1, 0 /* broadcast block[0] to every halfword */ |
| fld.d f0, a0, 0 /* eight pixels from each of the eight rows */ |
| fldx.d f1, a0, a2 |
| fldx.d f2, a0, t2 |
| fldx.d f3, a0, t3 |
| fldx.d f4, a0, t4 |
| fldx.d f5, a0, t5 |
| fldx.d f6, a0, t6 |
| fldx.d f7, a0, t7 |
| st.h zero, a1, 0 /* block[0] = 0 */ |
| |
| vsrari.h vr8, vr8, 6 /* dc = (block[0] + 32) >> 6 */ |
| |
| /* rows 0 and 1: widen, add dc, saturate to bytes, store */ |
| vsllwil.hu.bu vr0, vr0, 0 |
| vsllwil.hu.bu vr1, vr1, 0 |
| vadd.h vr0, vr0, vr8 |
| vadd.h vr1, vr1, vr8 |
| vssrarni.bu.h vr1, vr0, 0 /* row 0 -> low half, row 1 -> high half */ |
| vbsrl.v vr0, vr1, 8 /* row 1 down to the low half of vr0 */ |
| fst.d f1, a0, 0 |
| fstx.d f0, a0, a2 |
| |
| /* rows 2 and 3 */ |
| vsllwil.hu.bu vr2, vr2, 0 |
| vsllwil.hu.bu vr3, vr3, 0 |
| vadd.h vr2, vr2, vr8 |
| vadd.h vr3, vr3, vr8 |
| vssrarni.bu.h vr3, vr2, 0 |
| vbsrl.v vr2, vr3, 8 |
| fstx.d f3, a0, t2 |
| fstx.d f2, a0, t3 |
| |
| /* rows 4 and 5 */ |
| vsllwil.hu.bu vr4, vr4, 0 |
| vsllwil.hu.bu vr5, vr5, 0 |
| vadd.h vr4, vr4, vr8 |
| vadd.h vr5, vr5, vr8 |
| vssrarni.bu.h vr5, vr4, 0 |
| vbsrl.v vr4, vr5, 8 |
| fstx.d f5, a0, t4 |
| fstx.d f4, a0, t5 |
| |
| /* rows 6 and 7 */ |
| vsllwil.hu.bu vr6, vr6, 0 |
| vsllwil.hu.bu vr7, vr7, 0 |
| vadd.h vr6, vr6, vr8 |
| vadd.h vr7, vr7, vr8 |
| vssrarni.bu.h vr7, vr6, 0 |
| vbsrl.v vr6, vr7, 8 |
| fstx.d f7, a0, t6 |
| fstx.d f6, a0, t7 |
| endfunc |
| function ff_h264_idct8_dc_add_8_lasx /* LASX variant: add one rounded DC value to an 8x8 block of 8-bit pixels; a0 = dst, a1 = block, a2 = stride (presumably the same C prototype as ff_h264_idct8_dc_add_8_lsx -- confirm) */ |
| xvldrepl.h xr8, a1, 0 /* broadcast block[0] to every halfword */ |
| add.d t2, a2, a2 /* t2..t7 = 2..7 * stride */ |
| add.d t3, t2, a2 |
| add.d t4, t3, a2 |
| add.d t5, t4, a2 |
| add.d t6, t5, a2 |
| add.d t7, t6, a2 |
| |
| fld.d f0, a0, 0 /* eight pixels from each of the eight rows */ |
| fldx.d f1, a0, a2 |
| fldx.d f2, a0, t2 |
| fldx.d f3, a0, t3 |
| fldx.d f4, a0, t4 |
| fldx.d f5, a0, t5 |
| fldx.d f6, a0, t6 |
| fldx.d f7, a0, t7 |
| st.h zero, a1, 0 /* block[0] = 0 */ |
| |
| xvsrari.h xr8, xr8, 6 /* dc = (block[0] + 32) >> 6 */ |
| xvpermi.q xr1, xr0, 0x20 /* xr1 = row 0 in the low 128-bit lane, row 1 in the high lane */ |
| xvpermi.q xr3, xr2, 0x20 /* xr3 = rows 2 | 3 */ |
| xvpermi.q xr5, xr4, 0x20 /* xr5 = rows 4 | 5 */ |
| xvpermi.q xr7, xr6, 0x20 /* xr7 = rows 6 | 7 */ |
| xvsllwil.hu.bu xr1, xr1, 0 /* zero-extend the low 8 bytes of each lane to 16 bit */ |
| xvsllwil.hu.bu xr3, xr3, 0 |
| xvsllwil.hu.bu xr5, xr5, 0 |
| xvsllwil.hu.bu xr7, xr7, 0 |
| xvadd.h xr1, xr1, xr8 /* pixel + dc */ |
| xvadd.h xr3, xr3, xr8 |
| xvadd.h xr5, xr5, xr8 |
| xvadd.h xr7, xr7, xr8 |
| |
| xvssrarni.bu.h xr3, xr1, 0 /* saturate to bytes: xr3 = rows 0,2 in the low lane, rows 1,3 in the high lane */ |
| xvssrarni.bu.h xr7, xr5, 0 /* xr7 = rows 4,6 in the low lane, rows 5,7 in the high lane */ |
| |
| xvpermi.q xr1, xr3, 0x11 /* both lanes of xr1 = high lane of xr3 (rows 1,3) */ |
| xvpermi.q xr5, xr7, 0x11 /* both lanes of xr5 = high lane of xr7 (rows 5,7) */ |
| xvbsrl.v xr0, xr1, 8 /* row 3 down to the low 64 bits */ |
| xvbsrl.v xr2, xr3, 8 /* row 2 */ |
| xvbsrl.v xr4, xr5, 8 /* row 7 */ |
| xvbsrl.v xr6, xr7, 8 /* row 6 */ |
| |
| fst.d f3, a0, 0 /* row 0 */ |
| fstx.d f1, a0, a2 /* row 1 */ |
| fstx.d f2, a0, t2 /* row 2 */ |
| fstx.d f0, a0, t3 /* row 3 */ |
| fstx.d f7, a0, t4 /* row 4 */ |
| fstx.d f5, a0, t5 /* row 5 */ |
| fstx.d f6, a0, t6 /* row 6 */ |
| fstx.d f4, a0, t7 /* row 7 */ |
| endfunc |
| |
| /* |
| * IDCT transforms the 16 dc values and dequantizes them. |
| * @param qmul quantization parameter |
| * void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul) |
| * |
| * a0 = _output, a1 = _input, a2 = qmul. |
| * Reads exactly the 16 input values (32 bytes) as four 8-byte rows. |
| * Each result is (value * qmul + 128) >> 8, narrowed to 16 bit, and the 16 |
| * results are written 32 bytes (16 int16_t) apart, starting at a0. |
| * LSX optimization is enough for this function. |
| */ |
| function ff_h264_luma_dc_dequant_idct_8_lsx |
| fld.d f0, a1, 0 /* one 4-halfword row per register; 16-byte loads at these offsets would overlap and read past the 32-byte input */ |
| fld.d f1, a1, 8 |
| fld.d f2, a1, 16 |
| fld.d f3, a1, 24 |
| vreplgr2vr.w vr8, a2 /* broadcast qmul to the four 32-bit words */ |
| LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10 /* only the low four halfwords of each register carry data from here on */ |
| LSX_BUTTERFLY_4_H vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1 |
| LSX_BUTTERFLY_4_H vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5 |
| LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10 |
| LSX_BUTTERFLY_4_H vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5 |
| LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 |
| vsllwil.w.h vr0, vr0, 0 /* sign-extend the low four halfwords to 32 bit */ |
| vsllwil.w.h vr1, vr1, 0 |
| vsllwil.w.h vr2, vr2, 0 |
| vsllwil.w.h vr3, vr3, 0 |
| vmul.w vr0, vr0, vr8 /* dequantize: value * qmul */ |
| vmul.w vr1, vr1, vr8 |
| vmul.w vr2, vr2, vr8 |
| vmul.w vr3, vr3, vr8 |
| vsrarni.h.w vr1, vr0, 8 /* rounding >> 8 and narrow: vr0 -> halfwords 0..3, vr1 -> halfwords 4..7 of vr1 */ |
| vsrarni.h.w vr3, vr2, 8 /* same for vr2 (halfwords 0..3) and vr3 (halfwords 4..7) */ |
| |
| vstelm.h vr1, a0, 0, 0 /* scatter single halfwords, one every 32 bytes */ |
| vstelm.h vr1, a0, 32, 4 |
| vstelm.h vr1, a0, 64, 1 |
| vstelm.h vr1, a0, 96, 5 |
| vstelm.h vr3, a0, 128, 0 |
| vstelm.h vr3, a0, 160, 4 |
| vstelm.h vr3, a0, 192, 1 |
| vstelm.h vr3, a0, 224, 5 |
| addi.d a0, a0, 256 /* the remaining eight stores are relative to output + 256 bytes */ |
| vstelm.h vr1, a0, 0, 2 |
| vstelm.h vr1, a0, 32, 6 |
| vstelm.h vr1, a0, 64, 3 |
| vstelm.h vr1, a0, 96, 7 |
| vstelm.h vr3, a0, 128, 2 |
| vstelm.h vr3, a0, 160, 6 |
| vstelm.h vr3, a0, 192, 3 |
| vstelm.h vr3, a0, 224, 7 |
| endfunc |