| /* |
| * Loongson LSX optimized h264intrapred |
| * |
| * Copyright (c) 2023 Loongson Technology Corporation Limited |
| * Contributed by Lu Wang <wanglu@loongson.cn> |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "loongson_asm.S" |
| |
| /* Byte-shuffle indices used by PRED16X16_PLANE to reverse the first seven |
| * bytes of a 64-bit lane. The table is read with an 8-byte fld.d, so it is |
| * stored as eight bytes; the last index is a filler (only seven lanes carry |
| * a weight from mulk). |
| */ |
| const shufa |
| .byte 6, 5, 4, 3, 2, 1, 0, 0 |
| endconst |
| |
| /* 16-bit weights 2..8, written as little-endian byte pairs, applied to the |
| * horizontal differences in PRED16X16_PLANE. The table is read with a |
| * 16-byte vld, so an explicit zero eighth weight is stored: the eighth |
| * product must not contribute to the sum, and this no longer depends on |
| * whatever follows the table in the section. |
| */ |
| const mulk |
| .byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 0, 0 |
| endconst |
| |
| /* Halfword ramp 0..15 (same bytes as the little-endian pairs 0,0, 1,0, ...). |
| * PRED16X16_PLANE_END loads the first eight entries with vld, |
| * PRED16X16_PLANE_END_LASX loads all sixteen with xvld. |
| */ |
| const mulh |
| .hword 0, 1, 2, 3, 4, 5, 6, 7 |
| .hword 8, 9, 10, 11, 12, 13, 14, 15 |
| endconst |
| |
| /* PRED16X16_PLANE: weighted gradient sums for 16x16 plane prediction. |
| * In: a0 = block pointer, a1 = stride (both left unchanged). |
| * Out: t3 = sum over k = 1..8 of k * (top[7 + k] - top[7 - k]) |
| * t4 = sum over k = 1..8 of k * (left[7 + k] - left[7 - k]) |
| * t1 = a0 + 15 * a1 - 1 |
| * t2 = a0 - a1 - 1 |
| * where top[x] is the byte at a0 - a1 + x and left[y] the byte at |
| * a0 + y * a1 - 1. |
| * Scratch: t0, t5, t6, t7, t8, vr0-vr3. |
| */ |
| .macro PRED16X16_PLANE |
| // t0 = &top[7], t1 = &left[8], t2 = &left[6], t6 = 2 * stride |
| sub.d t0, a0, a1 |
| addi.d t0, t0, 7 |
| slli.d t6, a1, 1 |
| alsl.d t1, a1, a0, 3 |
| addi.d t1, t1, -1 |
| sub.d t2, t1, t6 |
| |
| // k = 1 terms of both sums |
| ld.bu t3, t0, 1 |
| ld.bu t7, t0, -1 |
| ld.bu t4, t1, 0 |
| ld.bu t8, t2, 0 |
| sub.d t3, t3, t7 |
| sub.d t4, t4, t8 |
| |
| // Horizontal k = 2..8: vr1 = top[9..16], vr2 = top[-1..6] with its first |
| // seven bytes reversed by shufa, so lane i pairs top[9 + i] with top[5 - i]. |
| la.local t5, shufa |
| fld.d f3, t5, 0 |
| la.local t5, mulk |
| vld vr0, t5, 0 |
| fld.d f2, t0, -8 |
| fld.d f1, t0, 2 |
| vshuf.b vr2, vr2, vr2, vr3 |
| vilvl.b vr1, vr1, vr2 |
| // halfword lane i = top[9 + i] - top[5 - i] |
| vhsubw.hu.bu vr1, vr1, vr1 |
| // scale by the weights in mulk, then reduce the lanes to one sum |
| vmul.h vr0, vr0, vr1 |
| vhaddw.w.h vr1, vr0, vr0 |
| vhaddw.d.w vr0, vr1, vr1 |
| vhaddw.q.d vr1, vr0, vr0 |
| vpickve2gr.w t5, vr1, 0 |
| add.d t3, t3, t5 |
| |
| // Vertical k = 2..8: t1 walks down the left column two rows at a time |
| // (odd rows are reached with ldx.bu), t2 walks up one row at a time. |
| // Each step forms d = left[7 + k] - left[7 - k] in t7 and adds k * d to t4. |
| |
| // k = 2: left[9] - left[5] |
| sub.d t2, t2, a1 |
| ldx.bu t7, t1, a1 |
| ld.bu t8, t2, 0 |
| sub.d t7, t7, t8 |
| alsl.d t4, t7, t4, 1 |
| |
| // k = 3: left[10] - left[4] |
| add.d t1, t1, t6 |
| sub.d t2, t2, a1 |
| ld.bu t7, t1, 0 |
| ld.bu t8, t2, 0 |
| sub.d t7, t7, t8 |
| alsl.d t8, t7, t7, 1 |
| add.d t4, t4, t8 |
| |
| // k = 4: left[11] - left[3] |
| sub.d t2, t2, a1 |
| ldx.bu t7, t1, a1 |
| ld.bu t8, t2, 0 |
| sub.d t7, t7, t8 |
| alsl.d t4, t7, t4, 2 |
| |
| // k = 5: left[12] - left[2] |
| add.d t1, t1, t6 |
| sub.d t2, t2, a1 |
| ld.bu t7, t1, 0 |
| ld.bu t8, t2, 0 |
| sub.d t7, t7, t8 |
| alsl.d t8, t7, t7, 2 |
| add.d t4, t4, t8 |
| |
| // k = 6: left[13] - left[1], added as 2 * (3 * d) |
| sub.d t2, t2, a1 |
| ldx.bu t7, t1, a1 |
| ld.bu t8, t2, 0 |
| sub.d t7, t7, t8 |
| alsl.d t8, t7, t7, 1 |
| alsl.d t4, t8, t4, 1 |
| |
| // k = 7: left[14] - left[0], added as 8 * d - d |
| add.d t1, t1, t6 |
| sub.d t2, t2, a1 |
| ld.bu t7, t1, 0 |
| ld.bu t8, t2, 0 |
| sub.d t7, t7, t8 |
| slli.d t8, t7, 3 |
| sub.d t8, t8, t7 |
| add.d t4, t4, t8 |
| |
| // k = 8: left[15] - left[-1] |
| sub.d t2, t2, a1 |
| ldx.bu t7, t1, a1 |
| ld.bu t8, t2, 0 |
| sub.d t7, t7, t8 |
| alsl.d t4, t7, t4, 3 |
| |
| // leave t1 on left[15] for the store macros |
| add.d t1, t1, a1 |
| .endm |
| |
| /* PRED16X16_PLANE_END: write the 16x16 predicted block with LSX stores. |
| * In: a0 = block pointer, a1 = stride, t1 and t2 as left by |
| * PRED16X16_PLANE, t3 = per-column step, t4 = per-row step. |
| * Each output byte is (base + x * t3 + y * t4) >> 5, saturated to 0..255, |
| * with base = 16 * (byte[t1] + byte[t2 + 16] + 1) - 7 * (t3 + t4); the |
| * vector arithmetic is done in 16-bit lanes. |
| * a0 is advanced by 16 rows. Scratch: t5, t6, t7, t8, vr3-vr9. |
| */ |
| .macro PRED16X16_PLANE_END |
| ld.bu t7, t1, 0 |
| ld.bu t8, t2, 16 |
| add.d t6, t3, t4 |
| add.d t5, t7, t8 |
| addi.d t5, t5, 1 |
| // base = 16 * (sum + 1) + s - 8 * s, with s = t3 + t4 |
| alsl.d t5, t5, t6, 4 |
| slli.d t6, t6, 3 |
| sub.d t5, t5, t6 |
| |
| // vr5 = t3 * {0..7} for columns 0-7, vr9 = t3 * {8..15} for columns 8-15 |
| la.local t8, mulh |
| vld vr3, t8, 0 |
| vreplgr2vr.h vr4, t3 |
| vmul.h vr5, vr3, vr4 |
| slli.d t8, t3, 3 |
| vreplgr2vr.h vr9, t8 |
| vadd.h vr9, vr9, vr5 |
| |
| .rept 16 |
| // one row: broadcast the row base, add both column ramps, narrow, store |
| vreplgr2vr.h vr6, t5 |
| add.d t5, t5, t4 |
| vadd.h vr7, vr6, vr5 |
| vadd.h vr8, vr6, vr9 |
| vssrani.bu.h vr8, vr7, 5 |
| vst vr8, a0, 0 |
| add.d a0, a0, a1 |
| .endr |
| .endm |
| |
| /* PRED16X16_PLANE_END_LASX: write the 16x16 predicted block, two rows per |
| * iteration, using 256-bit LASX registers. |
| * In: a0 = block pointer, a1 = stride, t1 and t2 as left by |
| * PRED16X16_PLANE, t3 = per-column step, t4 = per-row step. |
| * Each output byte is (base + x * t3 + y * t4) >> 5, saturated to 0..255, |
| * with base = 16 * (byte[t1] + byte[t2 + 16] + 1) - 7 * (t3 + t4); the |
| * vector arithmetic is done in 16-bit lanes. |
| * a0 is advanced by 16 rows. Scratch: t5, t6, t7, t8, xr3-xr9. |
| */ |
| .macro PRED16X16_PLANE_END_LASX |
| ld.bu t7, t1, 0 |
| ld.bu t8, t2, 16 |
| add.d t6, t3, t4 |
| add.d t5, t7, t8 |
| addi.d t5, t5, 1 |
| // base = 16 * (sum + 1) + s - 8 * s, with s = t3 + t4 |
| alsl.d t5, t5, t6, 4 |
| slli.d t6, t6, 3 |
| sub.d t5, t5, t6 |
| |
| // xr5 = t3 * {0..15}, one halfword per column |
| la.local t8, mulh |
| xvld xr3, t8, 0 |
| xvreplgr2vr.h xr4, t3 |
| xvmul.h xr5, xr3, xr4 |
| |
| .rept 8 |
| // t5 = base of the even row, t7 = base of the odd row |
| add.d t7, t5, t4 |
| xvreplgr2vr.h xr6, t5 |
| xvreplgr2vr.h xr8, t7 |
| add.d t5, t7, t4 |
| xvadd.h xr7, xr6, xr5 |
| xvadd.h xr9, xr8, xr5 |
| |
| // After narrowing, each 128-bit half of xr9 holds eight bytes of the even |
| // row followed by eight bytes of the odd row: doublewords 0 and 2 make up |
| // the even row, doublewords 1 and 3 the odd row. |
| xvssrani.bu.h xr9, xr7, 5 |
| vstelm.d vr9, a0, 0, 0 |
| xvstelm.d xr9, a0, 8, 2 |
| add.d a0, a0, a1 |
| vstelm.d vr9, a0, 0, 1 |
| xvstelm.d xr9, a0, 8, 3 |
| add.d a0, a0, a1 |
| .endr |
| .endm |
| |
| /* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride) |
| * |
| * a0 = src, a1 = stride. The gradient sums that PRED16X16_PLANE leaves in |
| * t3 and t4 are each scaled as (5 * x + 32) >> 6 (arithmetic shift) and then |
| * handed to PRED16X16_PLANE_END, which stores the 16 rows. |
| */ |
| function ff_h264_pred16x16_plane_h264_8_lsx |
| PRED16X16_PLANE |
| |
| // x = 5 * x, formed as (x << 2) + x |
| alsl.d t3, t3, t3, 2 |
| alsl.d t4, t4, t4, 2 |
| // round and scale |
| addi.d t3, t3, 32 |
| addi.d t4, t4, 32 |
| srai.d t3, t3, 6 |
| srai.d t4, t4, 6 |
| |
| PRED16X16_PLANE_END |
| endfunc |
| |
| /* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride) |
| * |
| * a0 = src, a1 = stride. The gradient sums that PRED16X16_PLANE leaves in |
| * t3 and t4 are each scaled as (x + (x >> 2)) >> 4 (arithmetic shifts) and |
| * then handed to PRED16X16_PLANE_END, which stores the 16 rows. |
| */ |
| function ff_h264_pred16x16_plane_rv40_8_lsx |
| PRED16X16_PLANE |
| |
| // quarter of each sum, kept in separate temporaries |
| srai.d t7, t3, 2 |
| srai.d t8, t4, 2 |
| add.d t3, t3, t7 |
| add.d t4, t4, t8 |
| srai.d t3, t3, 4 |
| srai.d t4, t4, 4 |
| |
| PRED16X16_PLANE_END |
| endfunc |
| |
| /* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride) |
| * |
| * a0 = src, a1 = stride. The gradient sums that PRED16X16_PLANE leaves in |
| * t3 and t4 are each scaled as (5 * (x / 4)) / 16 using signed div.d, and |
| * the two results are exchanged (t3 receives the value derived from t4 and |
| * vice versa) before PRED16X16_PLANE_END stores the 16 rows. |
| */ |
| function ff_h264_pred16x16_plane_svq3_8_lsx |
| PRED16X16_PLANE |
| |
| li.d t6, 4 |
| li.d t8, 16 |
| // t5 = t3 / 4, t7 = t4 / 4 |
| div.d t5, t3, t6 |
| div.d t7, t4, t6 |
| // multiply by 5 as (x << 2) + x |
| alsl.d t5, t5, t5, 2 |
| alsl.d t7, t7, t7, 2 |
| // final divide writes straight into the exchanged registers |
| div.d t4, t5, t8 |
| div.d t3, t7, t8 |
| |
| PRED16X16_PLANE_END |
| endfunc |
| |
| /* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride) |
| * |
| * a0 = src, a1 = stride. The gradient sums that PRED16X16_PLANE leaves in |
| * t3 and t4 are each scaled as (5 * x + 32) >> 6 (arithmetic shift) and then |
| * handed to PRED16X16_PLANE_END_LASX, which stores the 16 rows. |
| */ |
| function ff_h264_pred16x16_plane_h264_8_lasx |
| PRED16X16_PLANE |
| |
| // x = 5 * x, formed as (x << 2) + x |
| alsl.d t3, t3, t3, 2 |
| alsl.d t4, t4, t4, 2 |
| // round and scale |
| addi.d t3, t3, 32 |
| addi.d t4, t4, 32 |
| srai.d t3, t3, 6 |
| srai.d t4, t4, 6 |
| |
| PRED16X16_PLANE_END_LASX |
| endfunc |
| |
| /* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride) |
| * |
| * a0 = src, a1 = stride. The gradient sums that PRED16X16_PLANE leaves in |
| * t3 and t4 are each scaled as (x + (x >> 2)) >> 4 (arithmetic shifts) and |
| * then handed to PRED16X16_PLANE_END_LASX, which stores the 16 rows. |
| */ |
| function ff_h264_pred16x16_plane_rv40_8_lasx |
| PRED16X16_PLANE |
| |
| // quarter of each sum, kept in separate temporaries |
| srai.d t7, t3, 2 |
| srai.d t8, t4, 2 |
| add.d t3, t3, t7 |
| add.d t4, t4, t8 |
| srai.d t3, t3, 4 |
| srai.d t4, t4, 4 |
| |
| PRED16X16_PLANE_END_LASX |
| endfunc |
| |
| /* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride) |
| * |
| * a0 = src, a1 = stride. The gradient sums that PRED16X16_PLANE leaves in |
| * t3 and t4 are each scaled as (5 * (x / 4)) / 16 using signed div.d, and |
| * the two results are exchanged (t3 receives the value derived from t4 and |
| * vice versa) before PRED16X16_PLANE_END_LASX stores the 16 rows. |
| */ |
| function ff_h264_pred16x16_plane_svq3_8_lasx |
| PRED16X16_PLANE |
| |
| li.d t5, 4 |
| li.d t8, 16 |
| // t7 = t3 / 4, t6 = t4 / 4 |
| div.d t7, t3, t5 |
| div.d t6, t4, t5 |
| // multiply by 5 as (x << 2) + x |
| alsl.d t7, t7, t7, 2 |
| alsl.d t6, t6, t6, 2 |
| // final divide writes straight into the exchanged registers |
| div.d t4, t7, t8 |
| div.d t3, t6, t8 |
| |
| PRED16X16_PLANE_END_LASX |
| endfunc |