| /* |
| * Loongson LSX optimized add_residual functions for HEVC decoding |
| * |
| * Copyright (c) 2023 Loongson Technology Corporation Limited |
| * Contributed by jinbo <jinbo@loongson.cn> |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "loongson_asm.S" |
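
/*
 * All four routines below perform the same reconstruction step for 8-bit
 * HEVC content. As a rough scalar sketch of that operation (reference
 * only; the function name is illustrative and the authoritative generic
 * version is FFmpeg's C hevcdsp implementation):
 *
 *   static void add_residual_c(uint8_t *dst, const int16_t *res,
 *                              ptrdiff_t stride, int size)
 *   {
 *       for (int y = 0; y < size; y++) {
 *           for (int x = 0; x < size; x++) {
 *               int v = dst[x] + res[x];                 // prediction + residual
 *               dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  // clip to [0, 255]
 *           }
 *           dst += stride;   // next picture row
 *           res += size;     // residuals are stored densely, size per row
 *       }
 *   }
 */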
| |
| /* |
 * void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
| */ |
| .macro ADD_RES_LSX_4x4_8 |
| vldrepl.w vr0, a0, 0 |
| add.d t0, a0, a2 |
| vldrepl.w vr1, t0, 0 |
| vld vr2, a1, 0 |
| |
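/* Interleave the two rows into the low 8 bytes, widen to halfwords,
 * add the residuals and narrow back to unsigned bytes with saturation. */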
| vilvl.w vr1, vr1, vr0 |
| vsllwil.hu.bu vr1, vr1, 0 |
| vadd.h vr1, vr1, vr2 |
| vssrani.bu.h vr1, vr1, 0 |
| |
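/* Store the two reconstructed 4-pixel rows. */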
| vstelm.w vr1, a0, 0, 0 |
| vstelm.w vr1, t0, 0, 1 |
| .endm |
| |
| function ff_hevc_add_residual4x4_8_lsx |
| ADD_RES_LSX_4x4_8 |
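/* Advance to rows 2-3: dst += 2 * stride, res += 8 coefficients. */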
| alsl.d a0, a2, a0, 1 |
| addi.d a1, a1, 16 |
| ADD_RES_LSX_4x4_8 |
| endfunc |
| |
| /* |
| * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) |
| */ |
| .macro ADD_RES_LSX_8x8_8 |
| vldrepl.d vr0, a0, 0 |
| add.d t0, a0, a2 |
| vldrepl.d vr1, t0, 0 |
| add.d t1, t0, a2 |
| vldrepl.d vr2, t1, 0 |
| add.d t2, t1, a2 |
| vldrepl.d vr3, t2, 0 |
| |
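/* Load the 32 matching residuals (four rows of 8 int16). */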
| vld vr4, a1, 0 |
| addi.d t3, zero, 16 |
| vldx vr5, a1, t3 |
| addi.d t4, a1, 32 |
| vld vr6, t4, 0 |
| vldx vr7, t4, t3 |
| |
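/* Widen each row to halfwords, add the residuals, then pack rows 0-1
 * into vr1 and rows 2-3 into vr3 with unsigned saturation. */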
| vsllwil.hu.bu vr0, vr0, 0 |
| vsllwil.hu.bu vr1, vr1, 0 |
| vsllwil.hu.bu vr2, vr2, 0 |
| vsllwil.hu.bu vr3, vr3, 0 |
| vadd.h vr0, vr0, vr4 |
| vadd.h vr1, vr1, vr5 |
| vadd.h vr2, vr2, vr6 |
| vadd.h vr3, vr3, vr7 |
| vssrani.bu.h vr1, vr0, 0 |
| vssrani.bu.h vr3, vr2, 0 |
| |
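/* Store the four reconstructed 8-pixel rows. */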
| vstelm.d vr1, a0, 0, 0 |
| vstelm.d vr1, t0, 0, 1 |
| vstelm.d vr3, t1, 0, 0 |
| vstelm.d vr3, t2, 0, 1 |
| .endm |
| |
| function ff_hevc_add_residual8x8_8_lsx |
| ADD_RES_LSX_8x8_8 |
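/* Advance to rows 4-7: dst += 4 * stride, res += 32 coefficients. */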
| alsl.d a0, a2, a0, 2 |
| addi.d a1, a1, 64 |
| ADD_RES_LSX_8x8_8 |
| endfunc |
| |
| /* |
| * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) |
| */ |
| function ff_hevc_add_residual16x16_8_lsx |
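/* Each of the 8 iterations reconstructs two 16-pixel rows. */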
| .rept 8 |
| vld vr0, a0, 0 |
| vldx vr2, a0, a2 |
| |
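/* Load the 32 matching residuals (two rows of 16 int16). */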
| vld vr4, a1, 0 |
| addi.d t0, zero, 16 |
| vldx vr5, a1, t0 |
| addi.d t1, a1, 32 |
| vld vr6, t1, 0 |
| vldx vr7, t1, t0 |
| |
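/* Split each row into low/high 8 pixels, widen to halfwords, add the
 * residuals and re-pack each full row with unsigned saturation. */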
| vexth.hu.bu vr1, vr0 |
| vsllwil.hu.bu vr0, vr0, 0 |
| vexth.hu.bu vr3, vr2 |
| vsllwil.hu.bu vr2, vr2, 0 |
| vadd.h vr0, vr0, vr4 |
| vadd.h vr1, vr1, vr5 |
| vadd.h vr2, vr2, vr6 |
| vadd.h vr3, vr3, vr7 |
| |
| vssrani.bu.h vr1, vr0, 0 |
| vssrani.bu.h vr3, vr2, 0 |
| |
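/* Store the two reconstructed rows and advance to the next pair. */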
| vst vr1, a0, 0 |
| vstx vr3, a0, a2 |
| |
| alsl.d a0, a2, a0, 1 |
| addi.d a1, a1, 64 |
| .endr |
| endfunc |
| |
| /* |
| * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) |
| */ |
| function ff_hevc_add_residual32x32_8_lsx |
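/* Each of the 32 iterations reconstructs one 32-pixel row,
 * handled as two 16-byte halves. */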
| .rept 32 |
| vld vr0, a0, 0 |
| addi.w t0, zero, 16 |
| vldx vr2, a0, t0 |
| |
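/* Load the 32 matching residuals (one row of 32 int16). */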
| vld vr4, a1, 0 |
| vldx vr5, a1, t0 |
| addi.d t1, a1, 32 |
| vld vr6, t1, 0 |
| vldx vr7, t1, t0 |
| |
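/* Widen each 16-byte half to halfwords, add the residuals and
 * re-pack with unsigned saturation. */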
| vexth.hu.bu vr1, vr0 |
| vsllwil.hu.bu vr0, vr0, 0 |
| vexth.hu.bu vr3, vr2 |
| vsllwil.hu.bu vr2, vr2, 0 |
| vadd.h vr0, vr0, vr4 |
| vadd.h vr1, vr1, vr5 |
| vadd.h vr2, vr2, vr6 |
| vadd.h vr3, vr3, vr7 |
| |
| vssrani.bu.h vr1, vr0, 0 |
| vssrani.bu.h vr3, vr2, 0 |
| |
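/* Store both halves of the reconstructed row and advance to the next one. */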
| vst vr1, a0, 0 |
| vstx vr3, a0, t0 |
| |
| add.d a0, a0, a2 |
| addi.d a1, a1, 64 |
| .endr |
| endfunc |