/*
 * Loongson LSX/LASX optimized h264chroma
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"
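
/* H.264 chroma MC is a bilinear interpolation with eighth-pel weights.
 * For a fractional offset (x, y), 0 <= x, y < 8, the weights are
 *     A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy   (A+B+C+D == 64)
 * and each output pixel is
 *     dst[i] = (A*src[i] + B*src[i+1] +
 *               C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * When only one of x/y is nonzero this degenerates to the two-tap filter
 *     dst[i] = (A*src[i] + E*src[i+step] + 32) >> 6,  E = B + C,
 * with step = 1 (x != 0) or step = stride (y != 0); when x == y == 0 it
 * is a plain copy. The avg variants additionally average the result with
 * the previous destination: (tmp + dst[i] + 1) >> 1.
 * Every function below dispatches between these three cases.
 */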

/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
function ff_put_h264_chroma_mc8_lsx
    li.d t8, 8
    sub.d t1, t8, a4 // 8-x
    sub.d t2, t8, a5 // 8-y
    mul.d t3, t1, t2 // A
    mul.d t4, a4, t2 // B
    mul.d t5, t1, a5 // C
    mul.d t6, a4, a5 // D
    add.d t0, t4, t5 // E
    vreplgr2vr.b vr0, t3
    vreplgr2vr.b vr1, t4
    vreplgr2vr.b vr2, t5
    vreplgr2vr.b vr3, t6
    vreplgr2vr.b vr4, t0
    slli.d t2, a2, 1 // t2 = 2*stride
    add.d t3, t2, a2 // t3 = 3*stride
    slli.d t4, a2, 2 // t4 = 4*stride

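    // Dispatch: fall through only when D != 0, i.e. both x and y are
    // nonzero and the full four-tap filter is needed (4 rows per pass).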
    bge zero, t6, .ENDLOOP_D
    move t1, a3
    vilvl.b vr9, vr1, vr0
    vilvl.b vr10, vr3, vr2
.LOOP_D:
    vld vr5, a1, 0
    vld vr6, a1, 1
    add.d a1, a1, a2
    vld vr7, a1, 0
    vld vr8, a1, 1
    vilvl.b vr11, vr6, vr5
    vilvl.b vr12, vr8, vr7
    vmulwev.h.bu vr13, vr9, vr11
    vmaddwod.h.bu vr13, vr9, vr11
    vmulwev.h.bu vr14, vr10, vr12
    vmaddwod.h.bu vr14, vr10, vr12
    vadd.h vr13, vr13, vr14
    vsrarni.b.h vr13, vr13, 6
    vstelm.d vr13, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vld vr6, a1, 1
    vilvl.b vr11, vr8, vr7
    vilvl.b vr12, vr6, vr5
    vmulwev.h.bu vr13, vr9, vr11
    vmaddwod.h.bu vr13, vr9, vr11
    vmulwev.h.bu vr14, vr10, vr12
    vmaddwod.h.bu vr14, vr10, vr12
    vadd.h vr13, vr13, vr14
    vsrarni.b.h vr13, vr13, 6
    vstelm.d vr13, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr7, a1, 0
    vld vr8, a1, 1
    vilvl.b vr11, vr6, vr5
    vilvl.b vr12, vr8, vr7
    vmulwev.h.bu vr13, vr9, vr11
    vmaddwod.h.bu vr13, vr9, vr11
    vmulwev.h.bu vr14, vr10, vr12
    vmaddwod.h.bu vr14, vr10, vr12
    vadd.h vr13, vr13, vr14
    vsrarni.b.h vr13, vr13, 6
    vstelm.d vr13, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vld vr6, a1, 1
    vilvl.b vr11, vr8, vr7
    vilvl.b vr12, vr6, vr5
    vmulwev.h.bu vr13, vr9, vr11
    vmaddwod.h.bu vr13, vr9, vr11
    vmulwev.h.bu vr14, vr10, vr12
    vmaddwod.h.bu vr14, vr10, vr12
    vadd.h vr13, vr13, vr14
    vsrarni.b.h vr13, vr13, 6
    vstelm.d vr13, a0, 0, 0
    add.d a0, a0, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOP_D
    b .ENDLOOP
.ENDLOOP_D:

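    // D == 0 but E != 0: exactly one of x/y is nonzero, two-tap filter.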
    bge zero, t0, .ENDLOOP_E
    move t1, a3
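    // t7 = offset of the second tap: stride if C > 0 (y != 0), else 1.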
    li.d t7, 1
    slt t8, zero, t5
    maskeqz t5, a2, t8
    masknez t7, t7, t8
    or t7, t7, t5
    vilvl.b vr7, vr4, vr0
.LOOP_E:
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrarni.b.h vr6, vr6, 6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrarni.b.h vr6, vr6, 6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrarni.b.h vr6, vr6, 6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrarni.b.h vr6, vr6, 6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOP_E
    b .ENDLOOP
.ENDLOOP_E:

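    // x == y == 0: A == 64 and (64*p + 32) >> 6 == p, so this path is a
    // plain copy of 8-byte rows.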
    move t1, a3
.LOOP:
    vld vr5, a1, 0
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vsrarni.b.h vr6, vr6, 6
    vsrarni.b.h vr7, vr7, 6
    vilvl.b vr6, vr7, vr6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    vldx vr5, a1, a2
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vsrarni.b.h vr6, vr6, 6
    vsrarni.b.h vr7, vr7, 6
    vilvl.b vr6, vr7, vr6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    vldx vr5, a1, t2
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vsrarni.b.h vr6, vr6, 6
    vsrarni.b.h vr7, vr7, 6
    vilvl.b vr6, vr7, vr6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    vldx vr5, a1, t3
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vsrarni.b.h vr6, vr6, 6
    vsrarni.b.h vr7, vr7, 6
    vilvl.b vr6, vr7, vr6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, t4

    addi.d t1, t1, -4
    blt zero, t1, .LOOP
.ENDLOOP:
endfunc

/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
function ff_avg_h264_chroma_mc8_lsx
    li.d t8, 8
    sub.d t1, t8, a4 // 8-x
    sub.d t2, t8, a5 // 8-y
    mul.d t3, t1, t2 // A
    mul.d t4, a4, t2 // B
    mul.d t5, t1, a5 // C
    mul.d t6, a4, a5 // D
    add.d t0, t4, t5 // E
    vreplgr2vr.b vr0, t3
    vreplgr2vr.b vr1, t4
    vreplgr2vr.b vr2, t5
    vreplgr2vr.b vr3, t6
    vreplgr2vr.b vr4, t0
    slli.d t2, a2, 1
    add.d t3, t2, a2
    slli.d t4, a2, 2

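    // Same three-way dispatch as the put version above, but each filtered
    // row is averaged with the existing destination: (tmp + dst + 1) >> 1.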
    bge zero, t6, .ENDLOOPD
    move t1, a3
    vilvl.b vr9, vr1, vr0
    vilvl.b vr10, vr3, vr2
.LOOPD:
    vld vr5, a1, 0
    vld vr6, a1, 1
    add.d a1, a1, a2
    vld vr7, a1, 0
    vld vr8, a1, 1
    vld vr11, a0, 0
    vilvl.b vr12, vr6, vr5
    vilvl.b vr13, vr8, vr7
    vmulwev.h.bu vr14, vr9, vr12
    vmaddwod.h.bu vr14, vr9, vr12
    vmulwev.h.bu vr15, vr10, vr13
    vmaddwod.h.bu vr15, vr10, vr13
    vadd.h vr14, vr14, vr15
    vsrari.h vr14, vr14, 6
    vsllwil.hu.bu vr11, vr11, 0
    vadd.h vr11, vr14, vr11
    vsrarni.b.h vr11, vr11, 1
    vstelm.d vr11, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vld vr6, a1, 1
    vld vr11, a0, 0
    vilvl.b vr12, vr8, vr7
    vilvl.b vr13, vr6, vr5
    vmulwev.h.bu vr14, vr9, vr12
    vmaddwod.h.bu vr14, vr9, vr12
    vmulwev.h.bu vr15, vr10, vr13
    vmaddwod.h.bu vr15, vr10, vr13
    vadd.h vr14, vr14, vr15
    vsrari.h vr14, vr14, 6
    vsllwil.hu.bu vr11, vr11, 0
    vadd.h vr11, vr14, vr11
    vsrarni.b.h vr11, vr11, 1
    vstelm.d vr11, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr7, a1, 0
    vld vr8, a1, 1
    vld vr11, a0, 0
    vilvl.b vr12, vr6, vr5
    vilvl.b vr13, vr8, vr7
    vmulwev.h.bu vr14, vr9, vr12
    vmaddwod.h.bu vr14, vr9, vr12
    vmulwev.h.bu vr15, vr10, vr13
    vmaddwod.h.bu vr15, vr10, vr13
    vadd.h vr14, vr14, vr15
    vsrari.h vr14, vr14, 6
    vsllwil.hu.bu vr11, vr11, 0
    vadd.h vr11, vr14, vr11
    vsrarni.b.h vr11, vr11, 1
    vstelm.d vr11, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vld vr6, a1, 1
    vld vr11, a0, 0
    vilvl.b vr12, vr8, vr7
    vilvl.b vr13, vr6, vr5
    vmulwev.h.bu vr14, vr9, vr12
    vmaddwod.h.bu vr14, vr9, vr12
    vmulwev.h.bu vr15, vr10, vr13
    vmaddwod.h.bu vr15, vr10, vr13
    vadd.h vr14, vr14, vr15
    vsrari.h vr14, vr14, 6
    vsllwil.hu.bu vr11, vr11, 0
    vadd.h vr11, vr14, vr11
    vsrarni.b.h vr11, vr11, 1
    vstelm.d vr11, a0, 0, 0
    add.d a0, a0, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOPD
    b .ENDLOOPELSE
.ENDLOOPD:

    bge zero, t0, .ENDLOOPE
    move t1, a3
    li.d t7, 1
    slt t8, zero, t5
    maskeqz t5, a2, t8
    masknez t7, t7, t8
    or t7, t7, t5
    vilvl.b vr7, vr4, vr0
.LOOPE:
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vld vr8, a0, 0
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vld vr8, a0, 0
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vld vr8, a0, 0
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vld vr8, a0, 0
    vilvl.b vr5, vr6, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOPE
    b .ENDLOOPELSE
.ENDLOOPE:

    move t1, a3
.LOOPELSE:
    vld vr5, a1, 0
    vld vr8, a0, 0
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vilvl.h vr6, vr7, vr6
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    vldx vr5, a1, a2
    vld vr8, a0, 0
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vilvl.h vr6, vr7, vr6
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    vldx vr5, a1, t2
    vld vr8, a0, 0
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vilvl.h vr6, vr7, vr6
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    vldx vr5, a1, t3
    vld vr8, a0, 0
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vilvl.h vr6, vr7, vr6
    vsrari.h vr6, vr6, 6
    vsllwil.hu.bu vr8, vr8, 0
    vadd.h vr8, vr6, vr8
    vsrarni.b.h vr8, vr8, 1
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    add.d a1, a1, t4

    addi.d t1, t1, -4
    blt zero, t1, .LOOPELSE
.ENDLOOPELSE:
endfunc

/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
function ff_put_h264_chroma_mc4_lsx
    li.d t8, 8
    sub.d t1, t8, a4 // 8-x
    sub.d t2, t8, a5 // 8-y
    mul.d t3, t1, t2 // A
    mul.d t4, a4, t2 // B
    mul.d t5, t1, a5 // C
    mul.d t6, a4, a5 // D
    add.d t0, t4, t5 // E
    slli.d t8, a2, 1
    vreplgr2vr.b vr0, t3
    vreplgr2vr.b vr1, t4
    vreplgr2vr.b vr2, t5
    vreplgr2vr.b vr3, t6
    vreplgr2vr.b vr4, t0

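    // mc4: rows are 4 pixels wide, so two rows are packed into one vector
    // (vilvl.d) and two output rows are produced per iteration.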
    bge zero, t6, .ENDPUT_D
    move t1, a3
    vilvl.b vr9, vr1, vr0
    vilvl.b vr10, vr3, vr2
.PUT_D:
    vld vr5, a1, 0
    vld vr6, a1, 1
    add.d a1, a1, a2
    vld vr7, a1, 0
    vld vr8, a1, 1
    add.d a1, a1, a2
    vld vr11, a1, 0
    vld vr12, a1, 1
    vilvl.b vr5, vr6, vr5
    vilvl.b vr7, vr8, vr7
    vilvl.b vr13, vr12, vr11
    vilvl.d vr5, vr7, vr5
    vilvl.d vr13, vr13, vr7
    vmulwev.h.bu vr14, vr9, vr5
    vmaddwod.h.bu vr14, vr9, vr5
    vmulwev.h.bu vr15, vr10, vr13
    vmaddwod.h.bu vr15, vr10, vr13
    vadd.h vr14, vr14, vr15
    vsrarni.b.h vr14, vr14, 6
    vstelm.w vr14, a0, 0, 0
    add.d a0, a0, a2
    vstelm.w vr14, a0, 0, 1
    add.d a0, a0, a2
    addi.d t1, t1, -2
    blt zero, t1, .PUT_D
    b .ENDPUT
.ENDPUT_D:

    bge zero, t0, .ENDPUT_E
    move t1, a3
    li.d t7, 1
    slt t8, zero, t5
    maskeqz t5, a2, t8
    masknez t7, t7, t8
    or t7, t7, t5
    vilvl.b vr7, vr4, vr0
.PUT_E:
    vld vr5, a1, 0
    vldx vr6, a1, t7
    vilvl.b vr5, vr6, vr5
    add.d a1, a1, a2
    vld vr8, a1, 0
    vldx vr9, a1, t7
    vilvl.b vr8, vr9, vr8
    vilvl.d vr5, vr8, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrarni.b.h vr6, vr6, 6
    vstelm.w vr6, a0, 0, 0
    add.d a0, a0, a2
    vstelm.w vr6, a0, 0, 1
    add.d a0, a0, a2
    add.d a1, a1, a2
    addi.d t1, t1, -2
    blt zero, t1, .PUT_E
    b .ENDPUT
.ENDPUT_E:

    move t1, a3
.PUT:
    vld vr5, a1, 0
    vldx vr8, a1, a2
    vilvl.w vr5, vr8, vr5
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vsrarni.b.h vr6, vr6, 6
    vsrarni.b.h vr7, vr7, 6
    vilvl.b vr6, vr7, vr6
    vstelm.w vr6, a0, 0, 0
    add.d a0, a0, a2
    vstelm.w vr6, a0, 0, 1
    add.d a0, a0, a2
    add.d a1, a1, t8
    addi.d t1, t1, -2
    blt zero, t1, .PUT
.ENDPUT:
endfunc

/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
function ff_put_h264_chroma_mc8_lasx
    li.d t8, 8
    sub.d t1, t8, a4 // 8-x
    sub.d t2, t8, a5 // 8-y
    mul.d t3, t1, t2 // A
    mul.d t4, a4, t2 // B
    mul.d t5, t1, a5 // C
    mul.d t6, a4, a5 // D
    add.d t0, t4, t5 // E
    xvreplgr2vr.b xr0, t3
    xvreplgr2vr.b xr1, t4
    xvreplgr2vr.b xr2, t5
    xvreplgr2vr.b xr3, t6
    xvreplgr2vr.b xr4, t0
    slli.d t2, a2, 1
    add.d t3, t2, a2
    slli.d t4, a2, 2

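    // LASX: xvpermi.q packs two interleaved rows into the two 128-bit
    // lanes of a 256-bit register, so four rows are filtered per pass.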
    bge zero, t6, .ENDLOOP_DA
    move t1, a3
    xvilvl.b xr9, xr1, xr0
    xvilvl.b xr10, xr3, xr2
.LOOP_DA:
    fld.d f5, a1, 0
    fld.d f6, a1, 1
    add.d a1, a1, a2
    fld.d f7, a1, 0
    fld.d f8, a1, 1
    add.d a1, a1, a2
    fld.d f13, a1, 0
    fld.d f14, a1, 1
    add.d a1, a1, a2
    fld.d f15, a1, 0
    fld.d f16, a1, 1
    add.d a1, a1, a2
    fld.d f17, a1, 0
    fld.d f18, a1, 1
    vilvl.b vr11, vr6, vr5
    vilvl.b vr12, vr8, vr7
    vilvl.b vr14, vr14, vr13
    vilvl.b vr15, vr16, vr15
    vilvl.b vr16, vr18, vr17
    xvpermi.q xr11, xr12, 0x02
    xvpermi.q xr12, xr14, 0x02
    xvpermi.q xr14, xr15, 0x02
    xvpermi.q xr15, xr16, 0x02

    xvmulwev.h.bu xr19, xr9, xr11
    xvmaddwod.h.bu xr19, xr9, xr11
    xvmulwev.h.bu xr20, xr10, xr12
    xvmaddwod.h.bu xr20, xr10, xr12
    xvadd.h xr21, xr19, xr20
    xvsrarni.b.h xr21, xr21, 6
    vstelm.d vr21, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr21, a0, 0, 2
    add.d a0, a0, a2
    xvmulwev.h.bu xr13, xr9, xr14
    xvmaddwod.h.bu xr13, xr9, xr14
    xvmulwev.h.bu xr14, xr10, xr15
    xvmaddwod.h.bu xr14, xr10, xr15
    xvadd.h xr13, xr13, xr14
    xvsrarni.b.h xr13, xr13, 6
    vstelm.d vr13, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr13, a0, 0, 2
    add.d a0, a0, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOP_DA
    b .ENDLOOPA
.ENDLOOP_DA:

    bge zero, t0, .ENDLOOP_EA
    move t1, a3
    li.d t7, 1
    slt t8, zero, t5
    maskeqz t5, a2, t8
    masknez t7, t7, t8
    or t7, t7, t5
    xvilvl.b xr7, xr4, xr0
.LOOP_EA:
    fld.d f5, a1, 0
    fldx.d f6, a1, t7
    add.d a1, a1, a2
    fld.d f9, a1, 0
    fldx.d f10, a1, t7
    add.d a1, a1, a2
    fld.d f11, a1, 0
    fldx.d f12, a1, t7
    add.d a1, a1, a2
    fld.d f13, a1, 0
    fldx.d f14, a1, t7
    vilvl.b vr5, vr6, vr5
    vilvl.b vr9, vr10, vr9
    vilvl.b vr11, vr12, vr11
    vilvl.b vr13, vr14, vr13
    xvpermi.q xr5, xr9, 0x02
    xvpermi.q xr11, xr13, 0x02

    xvmulwev.h.bu xr8, xr7, xr5
    xvmaddwod.h.bu xr8, xr7, xr5
    xvmulwev.h.bu xr6, xr7, xr11
    xvmaddwod.h.bu xr6, xr7, xr11
    xvsrarni.b.h xr8, xr8, 6
    vstelm.d vr8, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr8, a0, 0, 2
    add.d a0, a0, a2
    xvsrarni.b.h xr6, xr6, 6
    vstelm.d vr6, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr6, a0, 0, 2
    add.d a0, a0, a2
    add.d a1, a1, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOP_EA
    b .ENDLOOPA
.ENDLOOP_EA:

    move t1, a3
.LOOPA:
    fld.d f5, a1, 0
    fldx.d f6, a1, a2
    fldx.d f7, a1, t2
    fldx.d f8, a1, t3
    vilvl.d vr5, vr6, vr5
    vilvl.d vr7, vr8, vr7
    xvpermi.q xr5, xr7, 0x02
    xvmulwev.h.bu xr6, xr0, xr5
    xvmulwod.h.bu xr7, xr0, xr5
    xvilvl.h xr8, xr7, xr6
    xvilvh.h xr9, xr7, xr6
    xvsrarni.b.h xr9, xr8, 6
    vstelm.d vr9, a0, 0, 0
    add.d a0, a0, a2
    vstelm.d vr9, a0, 0, 1
    add.d a0, a0, a2
    xvstelm.d xr9, a0, 0, 2
    add.d a0, a0, a2
    xvstelm.d xr9, a0, 0, 3
    add.d a0, a0, a2
    add.d a1, a1, t4

    addi.d t1, t1, -4
    blt zero, t1, .LOOPA
.ENDLOOPA:
endfunc

/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
function ff_avg_h264_chroma_mc8_lasx
    li.d t8, 8
    sub.d t1, t8, a4 // 8-x
    sub.d t2, t8, a5 // 8-y
    mul.d t3, t1, t2 // A
    mul.d t4, a4, t2 // B
    mul.d t5, t1, a5 // C
    mul.d t6, a4, a5 // D
    add.d t0, t4, t5 // E
    xvreplgr2vr.b xr0, t3
    xvreplgr2vr.b xr1, t4
    xvreplgr2vr.b xr2, t5
    xvreplgr2vr.b xr3, t6
    xvreplgr2vr.b xr4, t0
    slli.d t2, a2, 1
    add.d t3, t2, a2
    slli.d t4, a2, 2

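    // Mirrors the put version above; destination rows are widened with
    // xvsllwil.hu.bu and averaged in before the final narrowing.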
    bge zero, t6, .ENDLOOPDA
    move t1, a3
    xvilvl.b xr9, xr1, xr0
    xvilvl.b xr10, xr3, xr2
.LOOPDA:
    fld.d f5, a1, 0
    fld.d f6, a1, 1
    add.d a1, a1, a2
    fld.d f7, a1, 0
    fld.d f8, a1, 1
    add.d a1, a1, a2
    fld.d f11, a1, 0
    fld.d f12, a1, 1
    add.d a1, a1, a2
    fld.d f13, a1, 0
    fld.d f14, a1, 1
    add.d a1, a1, a2
    fld.d f15, a1, 0
    fld.d f16, a1, 1
    fld.d f17, a0, 0
    fldx.d f18, a0, a2
    fldx.d f19, a0, t2
    fldx.d f20, a0, t3
    vilvl.b vr5, vr6, vr5
    vilvl.b vr7, vr8, vr7
    vilvl.b vr11, vr12, vr11
    vilvl.b vr13, vr14, vr13
    vilvl.b vr16, vr16, vr15
    xvpermi.q xr5, xr7, 0x02
    xvpermi.q xr7, xr11, 0x02
    xvpermi.q xr11, xr13, 0x02
    xvpermi.q xr13, xr16, 0x02
    xvpermi.q xr17, xr18, 0x02
    xvpermi.q xr19, xr20, 0x02

    xvmulwev.h.bu xr14, xr9, xr5
    xvmaddwod.h.bu xr14, xr9, xr5
    xvmulwev.h.bu xr15, xr10, xr7
    xvmaddwod.h.bu xr15, xr10, xr7
    xvadd.h xr14, xr14, xr15
    xvsrari.h xr14, xr14, 6
    xvsllwil.hu.bu xr17, xr17, 0
    xvadd.h xr20, xr14, xr17
    xvsrarni.b.h xr20, xr20, 1
    xvstelm.d xr20, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr20, a0, 0, 2
    add.d a0, a0, a2
    xvmulwev.h.bu xr14, xr9, xr11
    xvmaddwod.h.bu xr14, xr9, xr11
    xvmulwev.h.bu xr15, xr10, xr13
    xvmaddwod.h.bu xr15, xr10, xr13
    xvadd.h xr14, xr14, xr15
    xvsrari.h xr14, xr14, 6
    xvsllwil.hu.bu xr19, xr19, 0
    xvadd.h xr21, xr14, xr19
    xvsrarni.b.h xr21, xr21, 1
    xvstelm.d xr21, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr21, a0, 0, 2
    add.d a0, a0, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOPDA
    b .ENDLOOPELSEA
.ENDLOOPDA:

    bge zero, t0, .ENDLOOPEA
    move t1, a3
    li.d t7, 1
    slt t8, zero, t5
    maskeqz t5, a2, t8
    masknez t7, t7, t8
    or t7, t7, t5
    xvilvl.b xr7, xr4, xr0
.LOOPEA:
    fld.d f5, a1, 0
    fldx.d f6, a1, t7
    add.d a1, a1, a2
    fld.d f8, a1, 0
    fldx.d f9, a1, t7
    add.d a1, a1, a2
    fld.d f10, a1, 0
    fldx.d f11, a1, t7
    add.d a1, a1, a2
    fld.d f12, a1, 0
    fldx.d f13, a1, t7
    add.d a1, a1, a2
    fld.d f14, a0, 0
    fldx.d f15, a0, a2
    fldx.d f16, a0, t2
    fldx.d f17, a0, t3
    vilvl.b vr5, vr6, vr5
    vilvl.b vr8, vr9, vr8
    vilvl.b vr10, vr11, vr10
    vilvl.b vr12, vr13, vr12
    xvpermi.q xr5, xr8, 0x02
    xvpermi.q xr10, xr12, 0x02
    xvpermi.q xr14, xr15, 0x02
    xvpermi.q xr16, xr17, 0x02

    xvmulwev.h.bu xr6, xr7, xr5
    xvmaddwod.h.bu xr6, xr7, xr5
    xvsrari.h xr6, xr6, 6
    xvsllwil.hu.bu xr14, xr14, 0
    xvadd.h xr8, xr6, xr14
    xvsrarni.b.h xr8, xr8, 1
    xvstelm.d xr8, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr8, a0, 0, 2
    add.d a0, a0, a2
    xvmulwev.h.bu xr6, xr7, xr10
    xvmaddwod.h.bu xr6, xr7, xr10
    xvsrari.h xr6, xr6, 6
    xvsllwil.hu.bu xr16, xr16, 0
    xvadd.h xr8, xr6, xr16
    xvsrarni.b.h xr8, xr8, 1
    xvstelm.d xr8, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr8, a0, 0, 2
    add.d a0, a0, a2

    addi.d t1, t1, -4
    blt zero, t1, .LOOPEA
    b .ENDLOOPELSEA
.ENDLOOPEA:

    move t1, a3
.LOOPELSEA:
    fld.d f5, a1, 0
    fldx.d f6, a1, a2
    fldx.d f7, a1, t2
    fldx.d f8, a1, t3
    fld.d f9, a0, 0
    fldx.d f10, a0, a2
    fldx.d f11, a0, t2
    fldx.d f12, a0, t3
    xvpermi.q xr5, xr6, 0x02
    xvpermi.q xr7, xr8, 0x02
    xvpermi.q xr9, xr10, 0x02
    xvpermi.q xr11, xr12, 0x02

    xvmulwev.h.bu xr12, xr0, xr5
    xvmulwod.h.bu xr13, xr0, xr5
    xvilvl.h xr12, xr13, xr12
    xvsrari.h xr12, xr12, 6
    xvsllwil.hu.bu xr9, xr9, 0
    xvadd.h xr9, xr12, xr9
    xvsrarni.b.h xr9, xr9, 1
    xvstelm.d xr9, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr9, a0, 0, 2
    add.d a0, a0, a2
    xvmulwev.h.bu xr12, xr0, xr7
    xvmulwod.h.bu xr13, xr0, xr7
    xvilvl.h xr12, xr13, xr12
    xvsrari.h xr12, xr12, 6
    xvsllwil.hu.bu xr11, xr11, 0
    xvadd.h xr13, xr12, xr11
    xvsrarni.b.h xr13, xr13, 1
    xvstelm.d xr13, a0, 0, 0
    add.d a0, a0, a2
    xvstelm.d xr13, a0, 0, 2
    add.d a0, a0, a2
    add.d a1, a1, t4

    addi.d t1, t1, -4
    blt zero, t1, .LOOPELSEA
.ENDLOOPELSEA:
endfunc

/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
function ff_put_h264_chroma_mc4_lasx
    li.d t8, 8
    sub.d t1, t8, a4 // 8-x
    sub.d t2, t8, a5 // 8-y
    mul.d t3, t1, t2 // A
    mul.d t4, a4, t2 // B
    mul.d t5, t1, a5 // C
    mul.d t6, a4, a5 // D
    add.d t0, t4, t5 // E
    slli.d t8, a2, 1
    vreplgr2vr.b vr0, t3
    vreplgr2vr.b vr1, t4
    vreplgr2vr.b vr2, t5
    vreplgr2vr.b vr3, t6
    vreplgr2vr.b vr4, t0

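    // 4-pixel rows leave the 256-bit lanes mostly idle, so this variant
    // keeps the LSX data path of ff_put_h264_chroma_mc4_lsx.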
    bge zero, t6, .ENDPUT_DA
    move t1, a3
    vilvl.b vr9, vr1, vr0
    vilvl.b vr10, vr3, vr2
.PUT_DA:
    fld.d f5, a1, 0
    fld.d f6, a1, 1
    add.d a1, a1, a2
    fld.d f7, a1, 0
    fld.d f8, a1, 1
    add.d a1, a1, a2
    fld.d f11, a1, 0
    fld.d f12, a1, 1
    vilvl.b vr5, vr6, vr5
    vilvl.b vr7, vr8, vr7
    vilvl.b vr13, vr12, vr11
    vilvl.d vr5, vr7, vr5
    vilvl.d vr13, vr13, vr7
    vmulwev.h.bu vr14, vr9, vr5
    vmaddwod.h.bu vr14, vr9, vr5
    vmulwev.h.bu vr15, vr10, vr13
    vmaddwod.h.bu vr15, vr10, vr13
    vadd.h vr14, vr14, vr15
    vsrarni.b.h vr16, vr14, 6
    vstelm.w vr16, a0, 0, 0
    add.d a0, a0, a2
    vstelm.w vr16, a0, 0, 1
    add.d a0, a0, a2
    addi.d t1, t1, -2
    blt zero, t1, .PUT_DA
    b .ENDPUTA
.ENDPUT_DA:

    bge zero, t0, .ENDPUT_EA
    move t1, a3
    li.d t7, 1
    slt t8, zero, t5
    maskeqz t5, a2, t8
    masknez t7, t7, t8
    or t7, t7, t5
    vilvl.b vr7, vr4, vr0
.PUT_EA:
    fld.d f5, a1, 0
    fldx.d f6, a1, t7
    vilvl.b vr5, vr6, vr5
    add.d a1, a1, a2
    fld.d f8, a1, 0
    fldx.d f9, a1, t7
    vilvl.b vr8, vr9, vr8
    vilvl.d vr5, vr8, vr5
    vmulwev.h.bu vr6, vr7, vr5
    vmaddwod.h.bu vr6, vr7, vr5
    vsrarni.b.h vr6, vr6, 6
    vstelm.w vr6, a0, 0, 0
    add.d a0, a0, a2
    vstelm.w vr6, a0, 0, 1
    add.d a0, a0, a2
    add.d a1, a1, a2
    addi.d t1, t1, -2
    blt zero, t1, .PUT_EA
    b .ENDPUTA
.ENDPUT_EA:

    move t1, a3
.PUTA:
    fld.d f5, a1, 0
    fldx.d f8, a1, a2
    vilvl.w vr5, vr8, vr5
    vmulwev.h.bu vr6, vr0, vr5
    vmulwod.h.bu vr7, vr0, vr5
    vilvl.h vr6, vr7, vr6
    vsrarni.b.h vr6, vr6, 6
    vstelm.w vr6, a0, 0, 0
    add.d a0, a0, a2
    vstelm.w vr6, a0, 0, 1
    add.d a0, a0, a2
    add.d a1, a1, t8
    addi.d t1, t1, -2
    blt zero, t1, .PUTA
.ENDPUTA:
endfunc