| /* |
| * Loongson asm helper. |
| * |
| * Copyright (c) 2022 Loongson Technology Corporation Limited |
| * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn) |
| * Shiyou Yin(yinshiyou-hf@loongson.cn) |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| /** |
| * Version of this helper macro collection, split into three components: |
| * MAJOR version: Macro usage changes. |
| * MINOR version: Add new functions, or bug fixes. |
| * MICRO version: Comment changes or implementation changes. |
| */ |
| #define LML_VERSION_MAJOR 0 |
| #define LML_VERSION_MINOR 2 |
| #define LML_VERSION_MICRO 0 |
| |
| /* |
| *============================================================================ |
| * Macros for the specific project; set them as needed. |
| * The following LoongML macros are provided for reference. |
| * ASM_PREF : text pasted in front of every symbol name emitted by |
| * the function macro (defined empty here). |
| * DEFAULT_ALIGN : default argument handed to .align by the function and |
| * const macros. NOTE(review): presumably a power-of-two |
| * exponent (2^5 = 32 bytes) on this target - confirm. |
| *============================================================================ |
| */ |
| #define ASM_PREF |
| #define DEFAULT_ALIGN 5 |
| |
| .macro function name, align=DEFAULT_ALIGN |
| /* Open the symbol: code section, alignment, ELF type and visibility. */ |
| .text |
| .align \align |
| .type ASM_PREF\name, @function |
| .globl ASM_PREF\name |
| ASM_PREF\name: |
| /* |
| * endfunc is created afresh for every function so that its .size directive |
| * can name the symbol being closed; it emits the return jump and then |
| * removes itself so the next function can define it again. |
| */ |
| .macro endfunc |
| jirl $r0, $r1, 0 |
| .size ASM_PREF\name, . - ASM_PREF\name |
| .purgem endfunc |
| .endm |
| .endm |
| |
| /** |
| * alloc_stack size, align=0 |
| * |
| * Reserve size bytes of stack and define a clean_stack macro that gives |
| * back exactly what was reserved. |
| * |
| * size : number of bytes to reserve; it is used directly as the |
| * immediate of addi.d, so it has to fit that field. |
| * align : 0 for no extra alignment. Otherwise sp is additionally moved |
| * down by (sp & (align - 1)) after the reservation, which only |
| * yields an aligned sp when align is a power of two; |
| * align - 1 is used directly as the immediate of andi. |
| * |
| * Attention: If align is not zero, the macro will use |
| * t7 until the end of function: t7 holds size plus the alignment |
| * padding and is what clean_stack adds back to sp. |
| * |
| * NOTE(review): clean_stack is defined here and never purged, so a second |
| * alloc_stack in the same assembly unit would define it again - confirm |
| * how the assembler treats that before using it more than once per file. |
| */ |
| .macro alloc_stack size, align=0 |
| .if \align |
| .macro clean_stack |
| add.d sp, sp, t7 |
| .endm |
| addi.d sp, sp, - \size |
| /* t7 = misalignment of sp; the immediate AND has no .d form (was andi.d) */ |
| andi t7, sp, \align - 1 |
| sub.d sp, sp, t7 |
| /* t7 = padding + size = total amount clean_stack must release */ |
| addi.d t7, t7, \size |
| .else |
| .macro clean_stack |
| addi.d sp, sp, \size |
| .endm |
| addi.d sp, sp, - \size |
| .endif |
| .endm |
| |
| .macro const name, align=DEFAULT_ALIGN |
| /* Place the object in read-only data at the requested alignment. */ |
| .section .rodata |
| .align \align |
| \name: |
| /* |
| * endconst records the object size for the label opened above and then |
| * removes itself so the next const can define it again. |
| */ |
| .macro endconst |
| .size \name, . - \name |
| .purgem endconst |
| .endm |
| .endm |
| |
| /* |
| *============================================================================ |
| * LoongArch register alias |
| *============================================================================ |
| */ |
| |
| /* a0-a7: let code in this file write the bare name instead of the $-prefixed one. */ |
| #define a0 $a0 |
| #define a1 $a1 |
| #define a2 $a2 |
| #define a3 $a3 |
| #define a4 $a4 |
| #define a5 $a5 |
| #define a6 $a6 |
| #define a7 $a7 |
| |
| /* t0-t8: bare-name aliases. t7 is the register alloc_stack uses when align != 0. */ |
| #define t0 $t0 |
| #define t1 $t1 |
| #define t2 $t2 |
| #define t3 $t3 |
| #define t4 $t4 |
| #define t5 $t5 |
| #define t6 $t6 |
| #define t7 $t7 |
| #define t8 $t8 |
| |
| /* s0-s8: bare-name aliases for the $s registers. */ |
| #define s0 $s0 |
| #define s1 $s1 |
| #define s2 $s2 |
| #define s3 $s3 |
| #define s4 $s4 |
| #define s5 $s5 |
| #define s6 $s6 |
| #define s7 $s7 |
| #define s8 $s8 |
| |
| /* |
| * zero / sp / ra: bare-name aliases. Because these are preprocessor macros, |
| * the words zero, sp and ra are rewritten wherever they appear as a token in |
| * this file, including as macro parameter names (several macros in this file |
| * have a parameter called ra). |
| */ |
| #define zero $zero |
| #define sp $sp |
| #define ra $ra |
| |
| /* f0-f31: bare-name aliases for the $f registers (used by the fld/fldx load macros). */ |
| #define f0 $f0 |
| #define f1 $f1 |
| #define f2 $f2 |
| #define f3 $f3 |
| #define f4 $f4 |
| #define f5 $f5 |
| #define f6 $f6 |
| #define f7 $f7 |
| #define f8 $f8 |
| #define f9 $f9 |
| #define f10 $f10 |
| #define f11 $f11 |
| #define f12 $f12 |
| #define f13 $f13 |
| #define f14 $f14 |
| #define f15 $f15 |
| #define f16 $f16 |
| #define f17 $f17 |
| #define f18 $f18 |
| #define f19 $f19 |
| #define f20 $f20 |
| #define f21 $f21 |
| #define f22 $f22 |
| #define f23 $f23 |
| #define f24 $f24 |
| #define f25 $f25 |
| #define f26 $f26 |
| #define f27 $f27 |
| #define f28 $f28 |
| #define f29 $f29 |
| #define f30 $f30 |
| #define f31 $f31 |
| |
| /* |
| * vr0-vr31: bare-name aliases for the $vr registers. |
| * NOTE(review): presumably the operands of the v* (LSX) instructions used by |
| * the LSX_ macros in this file - confirm. |
| */ |
| #define vr0 $vr0 |
| #define vr1 $vr1 |
| #define vr2 $vr2 |
| #define vr3 $vr3 |
| #define vr4 $vr4 |
| #define vr5 $vr5 |
| #define vr6 $vr6 |
| #define vr7 $vr7 |
| #define vr8 $vr8 |
| #define vr9 $vr9 |
| #define vr10 $vr10 |
| #define vr11 $vr11 |
| #define vr12 $vr12 |
| #define vr13 $vr13 |
| #define vr14 $vr14 |
| #define vr15 $vr15 |
| #define vr16 $vr16 |
| #define vr17 $vr17 |
| #define vr18 $vr18 |
| #define vr19 $vr19 |
| #define vr20 $vr20 |
| #define vr21 $vr21 |
| #define vr22 $vr22 |
| #define vr23 $vr23 |
| #define vr24 $vr24 |
| #define vr25 $vr25 |
| #define vr26 $vr26 |
| #define vr27 $vr27 |
| #define vr28 $vr28 |
| #define vr29 $vr29 |
| #define vr30 $vr30 |
| #define vr31 $vr31 |
| |
| /* |
| * xr0-xr31: bare-name aliases for the $xr registers. |
| * NOTE(review): presumably the operands of the xv* (LASX) instructions used by |
| * the LASX_ macros in this file - confirm. |
| */ |
| #define xr0 $xr0 |
| #define xr1 $xr1 |
| #define xr2 $xr2 |
| #define xr3 $xr3 |
| #define xr4 $xr4 |
| #define xr5 $xr5 |
| #define xr6 $xr6 |
| #define xr7 $xr7 |
| #define xr8 $xr8 |
| #define xr9 $xr9 |
| #define xr10 $xr10 |
| #define xr11 $xr11 |
| #define xr12 $xr12 |
| #define xr13 $xr13 |
| #define xr14 $xr14 |
| #define xr15 $xr15 |
| #define xr16 $xr16 |
| #define xr17 $xr17 |
| #define xr18 $xr18 |
| #define xr19 $xr19 |
| #define xr20 $xr20 |
| #define xr21 $xr21 |
| #define xr22 $xr22 |
| #define xr23 $xr23 |
| #define xr24 $xr24 |
| #define xr25 $xr25 |
| #define xr26 $xr26 |
| #define xr27 $xr27 |
| #define xr28 $xr28 |
| #define xr29 $xr29 |
| #define xr30 $xr30 |
| #define xr31 $xr31 |
| |
| /* |
| *============================================================================ |
| * LSX/LASX synthesize instructions |
| *============================================================================ |
| */ |
| |
| /* |
| * Description : Dot product of adjacent vector element pairs |
| * (byte inputs for the .h.* forms, halfword inputs for .w.h) |
| * Arguments : Inputs - vj, vk |
| * Outputs - vd |
| * Return Type - twice the input element size (halfword or word) |
| * Note : vd is overwritten by the first instruction while vj/vk are |
| * read again by the second, so vd must not alias vj or vk. |
| */ |
| .macro vdp2.h.bu vd, vj, vk |
| /* vd = widened products of the even-indexed elements */ |
| vmulwev.h.bu \vd, \vj, \vk |
| /* vd += widened products of the odd-indexed elements */ |
| vmaddwod.h.bu \vd, \vj, \vk |
| .endm |
| |
| /* |
| * Pairwise dot product, .h.bu.b instruction variant (halfword results). |
| * NOTE(review): by the suffix, vj holds unsigned and vk signed bytes - confirm. |
| * vd must not alias vj or vk: it is written before they are read again. |
| */ |
| .macro vdp2.h.bu.b vd, vj, vk |
| /* even-indexed products start the sum ... */ |
| vmulwev.h.bu.b \vd, \vj, \vk |
| /* ... odd-indexed products are accumulated onto it */ |
| vmaddwod.h.bu.b \vd, \vj, \vk |
| .endm |
| |
| /* |
| * Pairwise dot product, .w.h instruction variant (halfword inputs, word |
| * results going by the suffix). |
| * vd must not alias vj or vk: it is written before they are read again. |
| */ |
| .macro vdp2.w.h vd, vj, vk |
| /* even-indexed products start the sum ... */ |
| vmulwev.w.h \vd, \vj, \vk |
| /* ... odd-indexed products are accumulated onto it */ |
| vmaddwod.w.h \vd, \vj, \vk |
| .endm |
| |
| /* |
| * xv* counterpart of vdp2.h.bu: pairwise dot product, halfword results. |
| * xd must not alias xj or xk: it is written before they are read again. |
| */ |
| .macro xvdp2.h.bu xd, xj, xk |
| /* even-indexed products start the sum ... */ |
| xvmulwev.h.bu \xd, \xj, \xk |
| /* ... odd-indexed products are accumulated onto it */ |
| xvmaddwod.h.bu \xd, \xj, \xk |
| .endm |
| |
| /* |
| * xv* counterpart of vdp2.h.bu.b: pairwise dot product, halfword results. |
| * NOTE(review): by the suffix, xj holds unsigned and xk signed bytes - confirm. |
| * xd must not alias xj or xk: it is written before they are read again. |
| */ |
| .macro xvdp2.h.bu.b xd, xj, xk |
| /* even-indexed products start the sum ... */ |
| xvmulwev.h.bu.b \xd, \xj, \xk |
| /* ... odd-indexed products are accumulated onto it */ |
| xvmaddwod.h.bu.b \xd, \xj, \xk |
| .endm |
| |
| /* |
| * xv* counterpart of vdp2.w.h: pairwise dot product, halfword inputs and |
| * word results going by the suffix. |
| * xd must not alias xj or xk: it is written before they are read again. |
| */ |
| .macro xvdp2.w.h xd, xj, xk |
| /* even-indexed products start the sum ... */ |
| xvmulwev.w.h \xd, \xj, \xk |
| /* ... odd-indexed products are accumulated onto it */ |
| xvmaddwod.w.h \xd, \xj, \xk |
| .endm |
| |
| /* |
| * Description : Dot product of adjacent element pairs, accumulated onto the |
| * value already held in vd |
| * Arguments : Inputs - vj, vk (and vd as the running sum) |
| * Outputs - vd |
| * Return Type - twice size of input |
| * Note : both instructions accumulate into vd, so vd is an input as |
| * well as the output and must not alias vj or vk. |
| */ |
| .macro vdp2add.h.bu vd, vj, vk |
| /* vd += widened products of the even-indexed elements */ |
| vmaddwev.h.bu \vd, \vj, \vk |
| /* vd += widened products of the odd-indexed elements */ |
| vmaddwod.h.bu \vd, \vj, \vk |
| .endm |
| |
| /* |
| * Accumulating pairwise dot product, .h.bu.b instruction variant. |
| * vd is read and written (running sum) and must not alias vj or vk. |
| */ |
| .macro vdp2add.h.bu.b vd, vj, vk |
| /* add the even-indexed products ... */ |
| vmaddwev.h.bu.b \vd, \vj, \vk |
| /* ... then the odd-indexed products */ |
| vmaddwod.h.bu.b \vd, \vj, \vk |
| .endm |
| |
| /* |
| * Accumulating pairwise dot product, .w.h instruction variant. |
| * vd is read and written (running sum) and must not alias vj or vk. |
| */ |
| .macro vdp2add.w.h vd, vj, vk |
| /* add the even-indexed products ... */ |
| vmaddwev.w.h \vd, \vj, \vk |
| /* ... then the odd-indexed products */ |
| vmaddwod.w.h \vd, \vj, \vk |
| .endm |
| |
| /* |
| * xv* counterpart of vdp2add.h.bu.b: accumulating pairwise dot product. |
| * xd is read and written (running sum) and must not alias xj or xk. |
| */ |
| .macro xvdp2add.h.bu.b xd, xj, xk |
| /* add the even-indexed products ... */ |
| xvmaddwev.h.bu.b \xd, \xj, \xk |
| /* ... then the odd-indexed products */ |
| xvmaddwod.h.bu.b \xd, \xj, \xk |
| .endm |
| |
| /* |
| * xv* counterpart of vdp2add.w.h: accumulating pairwise dot product. |
| * xd is read and written (running sum) and must not alias xj or xk. |
| */ |
| .macro xvdp2add.w.h xd, xj, xk |
| /* add the even-indexed products ... */ |
| xvmaddwev.w.h \xd, \xj, \xk |
| /* ... then the odd-indexed products */ |
| xvmaddwod.w.h \xd, \xj, \xk |
| .endm |
| |
| /* |
| * Description : Clamp each element of a vector to a range |
| * clip: vd = min(max(vj, vk), va) (vk = lower, va = upper bound) |
| * clip255: vd = min(max(vj, 0), 255) |
| * Note : for clip, va is read after vd has been written, so vd must |
| * not alias va; vd may alias vj. |
| */ |
| .macro vclip.h vd, vj, vk, va |
| /* raise to the lower bound */ |
| vmax.h \vd, \vj, \vk |
| /* cap at the upper bound */ |
| vmin.h \vd, \vd, \va |
| .endm |
| |
| /* Clamp each word element of vj to 0..255 and store the result in vd. */ |
| .macro vclip255.w vd, vj |
| /* negative values become 0 */ |
| vmaxi.w \vd, \vj, 0 |
| /* unsigned saturation with immediate 7 supplies the 255 upper bound */ |
| vsat.wu \vd, \vd, 7 |
| .endm |
| |
| /* Clamp each halfword element of vj to 0..255 and store the result in vd. */ |
| .macro vclip255.h vd, vj |
| /* negative values become 0 */ |
| vmaxi.h \vd, \vj, 0 |
| /* unsigned saturation with immediate 7 supplies the 255 upper bound */ |
| vsat.hu \vd, \vd, 7 |
| .endm |
| |
| /* |
| * xv* counterpart of vclip.h: xd = min(max(xj, xk), xa) per halfword. |
| * xa is read after xd has been written, so xd must not alias xa. |
| */ |
| .macro xvclip.h xd, xj, xk, xa |
| /* raise to the lower bound */ |
| xvmax.h \xd, \xj, \xk |
| /* cap at the upper bound */ |
| xvmin.h \xd, \xd, \xa |
| .endm |
| |
| /* Clamp each halfword element of xj to 0..255 and store the result in xd. */ |
| .macro xvclip255.h xd, xj |
| /* negative values become 0 */ |
| xvmaxi.h \xd, \xj, 0 |
| /* unsigned saturation with immediate 7 supplies the 255 upper bound */ |
| xvsat.hu \xd, \xd, 7 |
| .endm |
| |
| /* Clamp each word element of xj to 0..255 and store the result in xd. */ |
| .macro xvclip255.w xd, xj |
| /* negative values become 0 */ |
| xvmaxi.w \xd, \xj, 0 |
| /* unsigned saturation with immediate 7 supplies the 255 upper bound */ |
| xvsat.wu \xd, \xd, 7 |
| .endm |
| |
| /* |
| * Description : Store one element of a vector at address + offset |
| * vd : Data vector to be stored |
| * rk : Address of data storage; updated in place to rk + ra |
| * ra : Offset of address (a register) |
| * si : Index of data in vd |
| * Note : rk is modified by the macro, so successive invocations walk |
| * through memory in steps of ra. |
| * NOTE(review): the parameter name ra is also a register alias #define in |
| * this file; the preprocessor presumably rewrites both the |
| * parameter and its uses consistently - confirm. |
| */ |
| .macro vstelmx.b vd, rk, ra, si |
| /* advance the address first ... */ |
| add.d \rk, \rk, \ra |
| /* ... then store byte element si at the new address */ |
| vstelm.b \vd, \rk, 0, \si |
| .endm |
| |
| /* |
| * Store halfword element si of vd at rk + ra. |
| * rk is updated in place (rk += ra) before the store. |
| */ |
| .macro vstelmx.h vd, rk, ra, si |
| add.d \rk, \rk, \ra |
| vstelm.h \vd, \rk, 0, \si |
| .endm |
| |
| /* |
| * Store word element si of vd at rk + ra. |
| * rk is updated in place (rk += ra) before the store. |
| */ |
| .macro vstelmx.w vd, rk, ra, si |
| add.d \rk, \rk, \ra |
| vstelm.w \vd, \rk, 0, \si |
| .endm |
| |
| /* |
| * Store double-word element si of vd at rk + ra. |
| * rk is updated in place (rk += ra) before the store. |
| */ |
| .macro vstelmx.d vd, rk, ra, si |
| add.d \rk, \rk, \ra |
| vstelm.d \vd, \rk, 0, \si |
| .endm |
| |
| /* Register copy xd = xj, expressed as xj OR xj with the v* (128-bit) form. */ |
| .macro vmov xd, xj |
| vor.v \xd, \xj, \xj |
| .endm |
| |
| /* Register copy xd = xj, expressed as xj OR xj with the xv* form. */ |
| .macro xmov xd, xj |
| xvor.v \xd, \xj, \xj |
| .endm |
| |
| /* |
| * Store double-word element si of xd at rk + ra (xv* form of vstelmx.d). |
| * rk is updated in place (rk += ra) before the store. |
| */ |
| .macro xvstelmx.d xd, rk, ra, si |
| add.d \rk, \rk, \ra |
| xvstelm.d \xd, \rk, 0, \si |
| .endm |
| |
| /* |
| *============================================================================ |
| * LSX/LASX custom macros |
| *============================================================================ |
| */ |
| |
| /* |
| * Load 4 elements (single float, double, 128-bit vector or 256-bit vector, |
| * depending on the macro) from four strided addresses: |
| * out0 <- [src], out1 <- [src + stride], |
| * out2 <- [src + stride2], out3 <- [src + stride3] |
| * The three offsets are separate arguments; the macros do not derive |
| * stride2 / stride3 from stride, and src is left unchanged. |
| */ |
| .macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 |
| fld.s \out0, \src, 0 |
| fldx.s \out1, \src, \stride |
| fldx.s \out2, \src, \stride2 |
| fldx.s \out3, \src, \stride3 |
| .endm |
| |
| /* |
| * Load four doubles: out0 from [src], out1..out3 from src plus the offsets |
| * stride, stride2 and stride3 (each passed in separately). src is unchanged. |
| */ |
| .macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 |
| fld.d \out0, \src, 0 |
| fldx.d \out1, \src, \stride |
| fldx.d \out2, \src, \stride2 |
| fldx.d \out3, \src, \stride3 |
| .endm |
| |
| /* |
| * Load four vectors with vld/vldx: out0 from [src], out1..out3 from src plus |
| * the offsets stride, stride2 and stride3 (each passed in separately). |
| * src is unchanged. |
| */ |
| .macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 |
| vld \out0, \src, 0 |
| vldx \out1, \src, \stride |
| vldx \out2, \src, \stride2 |
| vldx \out3, \src, \stride3 |
| .endm |
| |
| /* |
| * Load four vectors with xvld/xvldx: out0 from [src], out1..out3 from src |
| * plus the offsets stride, stride2 and stride3 (each passed in separately). |
| * src is unchanged. |
| */ |
| .macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 |
| xvld \out0, \src, 0 |
| xvldx \out1, \src, \stride |
| xvldx \out2, \src, \stride2 |
| xvldx \out3, \src, \stride3 |
| .endm |
| |
| /* |
| * Description : Transpose 4x4 block with half-word elements in vectors |
| * Arguments : Inputs - in0, in1, in2, in3 |
| * Outputs - out0, out1, out2, out3 |
| * Scratch - tmp0, tmp1 |
| * Details : Only the low halves of the inputs are interleaved, and each |
| * transposed row ends up in the low 64 bits of its output. |
| * Both inputs pairs are consumed by the first two instructions, |
| * so outputs may reuse input registers; tmp0 must not alias |
| * in2/in3 and out0 must not alias tmp0/tmp1. |
| */ |
| .macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ |
| tmp0, tmp1 |
| /* interleave rows 0/1 and rows 2/3 at halfword granularity */ |
| vilvl.h \tmp0, \in1, \in0 |
| vilvl.h \tmp1, \in3, \in2 |
| /* word interleave: out0 = columns 0 and 1, out2 = columns 2 and 3 */ |
| vilvl.w \out0, \tmp1, \tmp0 |
| vilvh.w \out2, \tmp1, \tmp0 |
| /* move the upper 64-bit halves down to form the odd columns */ |
| vilvh.d \out1, \out0, \out0 |
| vilvh.d \out3, \out0, \out2 |
| .endm |
| |
| /* |
| * Description : Transpose 4x4 block with word elements in vectors |
| * Arguments : Inputs - in0, in1, in2, in3 |
| * Outputs - out0, out1, out2, out3 |
| * Scratch - tmp0, tmp1 |
| * Details : out1 and out3 hold intermediate values during the macro. |
| * out1 is written before in2/in3 are read and must not alias |
| * them; tmp0/tmp1 should be distinct from all inputs. |
| * Example : |
| * 1, 2, 3, 4 1, 5, 9,13 |
| * 5, 6, 7, 8 to 2, 6,10,14 |
| * 9,10,11,12 =====> 3, 7,11,15 |
| * 13,14,15,16 4, 8,12,16 |
| */ |
| .macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \ |
| _tmp0, _tmp1 |
| |
| /* word interleave of rows 0/1 (low -> tmp0, high -> out1) and rows 2/3 */ |
| vilvl.w \_tmp0, \_in1, \_in0 |
| vilvh.w \_out1, \_in1, \_in0 |
| vilvl.w \_tmp1, \_in3, \_in2 |
| vilvh.w \_out3, \_in3, \_in2 |
| |
| /* 64-bit interleave; out3 and out1 are consumed before being overwritten */ |
| vilvl.d \_out0, \_tmp1, \_tmp0 |
| vilvl.d \_out2, \_out3, \_out1 |
| vilvh.d \_out3, \_out3, \_out1 |
| vilvh.d \_out1, \_tmp1, \_tmp0 |
| .endm |
| |
| /* |
| * Description : Transpose 8x8 block with half-word elements in vectors |
| * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| * Scratch - tmp0 ... tmp7 |
| * Details : The inputs are read twice (low halves first, high halves in |
| * the third group), so no temporary may alias an input. |
| * out0..out3 are written before tmp0..tmp3 are read for the |
| * last time and must not alias them. Outputs may reuse input |
| * registers, since all inputs are consumed before out0 is |
| * written. |
| */ |
| .macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ |
| out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \ |
| tmp3, tmp4, tmp5, tmp6, tmp7 |
| /* low halves: interleave rows two apart */ |
| vilvl.h \tmp0, \in6, \in4 |
| vilvl.h \tmp1, \in7, \in5 |
| vilvl.h \tmp2, \in2, \in0 |
| vilvl.h \tmp3, \in3, \in1 |
| |
| /* second interleave level for the low halves */ |
| vilvl.h \tmp4, \tmp1, \tmp0 |
| vilvh.h \tmp5, \tmp1, \tmp0 |
| vilvl.h \tmp6, \tmp3, \tmp2 |
| vilvh.h \tmp7, \tmp3, \tmp2 |
| |
| /* high halves of the inputs: last read of in0..in7 */ |
| vilvh.h \tmp0, \in6, \in4 |
| vilvh.h \tmp1, \in7, \in5 |
| vilvh.h \tmp2, \in2, \in0 |
| vilvh.h \tmp3, \in3, \in1 |
| |
| /* combine 64-bit halves into output rows 0..3 */ |
| vpickev.d \out0, \tmp4, \tmp6 |
| vpickod.d \out1, \tmp4, \tmp6 |
| vpickev.d \out2, \tmp5, \tmp7 |
| vpickod.d \out3, \tmp5, \tmp7 |
| |
| /* second interleave level for the high halves (tmp4..tmp7 reused) */ |
| vilvl.h \tmp4, \tmp1, \tmp0 |
| vilvh.h \tmp5, \tmp1, \tmp0 |
| vilvl.h \tmp6, \tmp3, \tmp2 |
| vilvh.h \tmp7, \tmp3, \tmp2 |
| |
| /* combine 64-bit halves into output rows 4..7 */ |
| vpickev.d \out4, \tmp4, \tmp6 |
| vpickod.d \out5, \tmp4, \tmp6 |
| vpickev.d \out6, \tmp5, \tmp7 |
| vpickod.d \out7, \tmp5, \tmp7 |
| .endm |
| |
| /* |
| * Description : Transpose 16x8 block with byte elements in vectors |
| * Arguments : Inputs - in0, in1, in2, ... in15 (16 rows) |
| * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| * Scratch - tmp0 ... tmp7 |
| * Details : Only xvilvl.b is applied to the inputs, i.e. only the low |
| * part of each input row is used. |
| * NOTE(review): the xv* interleaves presumably act on each |
| * 128-bit half independently - confirm against the LASX manual. |
| * The temporaries are written while inputs are still being |
| * read, so they should be distinct from all inputs. The |
| * outputs are used as intermediates but are first written only |
| * after all inputs have been consumed, and must not alias the |
| * temporaries. |
| */ |
| .macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \ |
| in8, in9, in10, in11, in12, in13, in14, in15, \ |
| out0, out1, out2, out3, out4, out5, out6, out7,\ |
| tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 |
| /* level 1: byte interleave of rows two apart (last read of the inputs) */ |
| xvilvl.b \tmp0, \in2, \in0 |
| xvilvl.b \tmp1, \in3, \in1 |
| xvilvl.b \tmp2, \in6, \in4 |
| xvilvl.b \tmp3, \in7, \in5 |
| xvilvl.b \tmp4, \in10, \in8 |
| xvilvl.b \tmp5, \in11, \in9 |
| xvilvl.b \tmp6, \in14, \in12 |
| xvilvl.b \tmp7, \in15, \in13 |
| /* level 2: byte interleave of the level-1 results into the out registers */ |
| xvilvl.b \out0, \tmp1, \tmp0 |
| xvilvh.b \out1, \tmp1, \tmp0 |
| xvilvl.b \out2, \tmp3, \tmp2 |
| xvilvh.b \out3, \tmp3, \tmp2 |
| xvilvl.b \out4, \tmp5, \tmp4 |
| xvilvh.b \out5, \tmp5, \tmp4 |
| xvilvl.b \out6, \tmp7, \tmp6 |
| xvilvh.b \out7, \tmp7, \tmp6 |
| /* level 3: word interleave back into the temporaries */ |
| xvilvl.w \tmp0, \out2, \out0 |
| xvilvh.w \tmp2, \out2, \out0 |
| xvilvl.w \tmp4, \out3, \out1 |
| xvilvh.w \tmp6, \out3, \out1 |
| xvilvl.w \tmp1, \out6, \out4 |
| xvilvh.w \tmp3, \out6, \out4 |
| xvilvl.w \tmp5, \out7, \out5 |
| xvilvh.w \tmp7, \out7, \out5 |
| /* level 4: 64-bit interleave produces the final rows */ |
| xvilvl.d \out0, \tmp1, \tmp0 |
| xvilvh.d \out1, \tmp1, \tmp0 |
| xvilvl.d \out2, \tmp3, \tmp2 |
| xvilvh.d \out3, \tmp3, \tmp2 |
| xvilvl.d \out4, \tmp5, \tmp4 |
| xvilvh.d \out5, \tmp5, \tmp4 |
| xvilvl.d \out6, \tmp7, \tmp6 |
| xvilvh.d \out7, \tmp7, \tmp6 |
| .endm |
| |
| /* |
| * Description : Transpose 16x8 block with byte elements in vectors |
| * Arguments : Inputs - in0, in1, in2, ... in15 (16 rows) |
| * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| * Scratch - tmp0 ... tmp7 |
| * Details : Only vilvl.b is applied to the inputs, i.e. only the low |
| * 8 bytes of each input row are used. |
| * The temporaries are written while inputs are still being |
| * read, so they should be distinct from all inputs. The |
| * outputs are used as intermediates but are first written only |
| * after all inputs have been consumed, and must not alias the |
| * temporaries. |
| */ |
| .macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \ |
| in8, in9, in10, in11, in12, in13, in14, in15, \ |
| out0, out1, out2, out3, out4, out5, out6, out7,\ |
| tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 |
| /* level 1: byte interleave of rows two apart (last read of the inputs) */ |
| vilvl.b \tmp0, \in2, \in0 |
| vilvl.b \tmp1, \in3, \in1 |
| vilvl.b \tmp2, \in6, \in4 |
| vilvl.b \tmp3, \in7, \in5 |
| vilvl.b \tmp4, \in10, \in8 |
| vilvl.b \tmp5, \in11, \in9 |
| vilvl.b \tmp6, \in14, \in12 |
| vilvl.b \tmp7, \in15, \in13 |
| |
| /* level 2: byte interleave of the level-1 results into the out registers */ |
| vilvl.b \out0, \tmp1, \tmp0 |
| vilvh.b \out1, \tmp1, \tmp0 |
| vilvl.b \out2, \tmp3, \tmp2 |
| vilvh.b \out3, \tmp3, \tmp2 |
| vilvl.b \out4, \tmp5, \tmp4 |
| vilvh.b \out5, \tmp5, \tmp4 |
| vilvl.b \out6, \tmp7, \tmp6 |
| vilvh.b \out7, \tmp7, \tmp6 |
| /* level 3: word interleave back into the temporaries */ |
| vilvl.w \tmp0, \out2, \out0 |
| vilvh.w \tmp2, \out2, \out0 |
| vilvl.w \tmp4, \out3, \out1 |
| vilvh.w \tmp6, \out3, \out1 |
| vilvl.w \tmp1, \out6, \out4 |
| vilvh.w \tmp3, \out6, \out4 |
| vilvl.w \tmp5, \out7, \out5 |
| vilvh.w \tmp7, \out7, \out5 |
| /* level 4: 64-bit interleave produces the final rows */ |
| vilvl.d \out0, \tmp1, \tmp0 |
| vilvh.d \out1, \tmp1, \tmp0 |
| vilvl.d \out2, \tmp3, \tmp2 |
| vilvh.d \out3, \tmp3, \tmp2 |
| vilvl.d \out4, \tmp5, \tmp4 |
| vilvh.d \out5, \tmp5, \tmp4 |
| vilvl.d \out6, \tmp7, \tmp6 |
| vilvh.d \out7, \tmp7, \tmp6 |
| .endm |
| |
| /* |
| * Description : Transpose 4x4 block with half-word elements in vectors |
| * (xv* form; same instruction sequence as LSX_TRANSPOSE4x4_H) |
| * Arguments : Inputs - in0, in1, in2, in3 |
| * Outputs - out0, out1, out2, out3 |
| * Scratch - tmp0, tmp1 |
| * Details : NOTE(review): the xv* interleaves presumably act on each |
| * 128-bit half independently, giving one 4x4 transpose per |
| * half - confirm against the LASX manual. |
| * Outputs may reuse input registers; tmp0 must not alias |
| * in2/in3 and out0 must not alias tmp0/tmp1. |
| */ |
| .macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ |
| tmp0, tmp1 |
| /* interleave rows 0/1 and rows 2/3 at halfword granularity */ |
| xvilvl.h \tmp0, \in1, \in0 |
| xvilvl.h \tmp1, \in3, \in2 |
| /* word interleave: out0 = columns 0 and 1, out2 = columns 2 and 3 */ |
| xvilvl.w \out0, \tmp1, \tmp0 |
| xvilvh.w \out2, \tmp1, \tmp0 |
| /* move the upper 64-bit parts down to form the odd columns */ |
| xvilvh.d \out1, \out0, \out0 |
| xvilvh.d \out3, \out0, \out2 |
| .endm |
| |
| /* |
| * Description : Transpose 4x8 block with half-word elements in vectors |
| * Arguments : Inputs - in0, in1, in2, in3 |
| * Outputs - out0, out1, out2, out3 |
| * Scratch - tmp0, tmp1 |
| * Details : Only the low parts of the inputs are interleaved. The final |
| * four instructions interleave a register with itself, so |
| * every output carries one 64-bit group duplicated into both |
| * 64-bit positions. out2 and out3 hold intermediates first. |
| * tmp0 must not alias in1/in3. |
| */ |
| .macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \ |
| tmp0, tmp1 |
| /* two halfword interleave levels: rows (0,2), (1,3), then both results */ |
| xvilvl.h \tmp0, \in2, \in0 |
| xvilvl.h \tmp1, \in3, \in1 |
| xvilvl.h \out2, \tmp1, \tmp0 |
| xvilvh.h \out3, \tmp1, \tmp0 |
| |
| /* split each intermediate into its low / high 64-bit group, duplicated */ |
| xvilvl.d \out0, \out2, \out2 |
| xvilvh.d \out1, \out2, \out2 |
| xvilvl.d \out2, \out3, \out3 |
| xvilvh.d \out3, \out3, \out3 |
| .endm |
| |
| /* |
| * Description : Transpose 8x8 block with half-word elements in vectors |
| * (xv* form; same instruction sequence as LSX_TRANSPOSE8x8_H) |
| * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| * Scratch - tmp0 ... tmp7 |
| * Details : The inputs are read twice (low parts first, high parts in |
| * the third group), so no temporary may alias an input. |
| * out0..out3 are written before tmp0..tmp3 are read for the |
| * last time and must not alias them. Outputs may reuse input |
| * registers. |
| * NOTE(review): the xv* instructions presumably act on each |
| * 128-bit half independently - confirm against the LASX manual. |
| */ |
| .macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \ |
| out0, out1, out2, out3, out4, out5, out6, out7, \ |
| tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 |
| /* low parts: interleave rows two apart */ |
| xvilvl.h \tmp0, \in6, \in4 |
| xvilvl.h \tmp1, \in7, \in5 |
| xvilvl.h \tmp2, \in2, \in0 |
| xvilvl.h \tmp3, \in3, \in1 |
| |
| /* second interleave level for the low parts */ |
| xvilvl.h \tmp4, \tmp1, \tmp0 |
| xvilvh.h \tmp5, \tmp1, \tmp0 |
| xvilvl.h \tmp6, \tmp3, \tmp2 |
| xvilvh.h \tmp7, \tmp3, \tmp2 |
| |
| /* high parts of the inputs: last read of in0..in7 */ |
| xvilvh.h \tmp0, \in6, \in4 |
| xvilvh.h \tmp1, \in7, \in5 |
| xvilvh.h \tmp2, \in2, \in0 |
| xvilvh.h \tmp3, \in3, \in1 |
| |
| /* combine 64-bit groups into output rows 0..3 */ |
| xvpickev.d \out0, \tmp4, \tmp6 |
| xvpickod.d \out1, \tmp4, \tmp6 |
| xvpickev.d \out2, \tmp5, \tmp7 |
| xvpickod.d \out3, \tmp5, \tmp7 |
| |
| /* second interleave level for the high parts (tmp4..tmp7 reused) */ |
| xvilvl.h \tmp4, \tmp1, \tmp0 |
| xvilvh.h \tmp5, \tmp1, \tmp0 |
| xvilvl.h \tmp6, \tmp3, \tmp2 |
| xvilvh.h \tmp7, \tmp3, \tmp2 |
| |
| /* combine 64-bit groups into output rows 4..7 */ |
| xvpickev.d \out4, \tmp4, \tmp6 |
| xvpickod.d \out5, \tmp4, \tmp6 |
| xvpickev.d \out6, \tmp5, \tmp7 |
| xvpickod.d \out7, \tmp5, \tmp7 |
| .endm |
| |
| /* |
| * Description : Transpose 2x4x4 block with half-word elements in vectors |
| * Arguments : Inputs - in0, in1, in2, in3 |
| * Outputs - out0, out1, out2, out3 |
| * Scratch - tmp0, tmp1, tmp2 |
| * Details : out1 and out3 hold intermediate values during the macro. |
| * tmp1 and out1 are written while inputs are still being read: |
| * tmp1 must not alias any input, out1 and tmp0 must not alias |
| * in2/in3. |
| * NOTE(review): the interleave operand order here (in0 before |
| * in1) differs from the other transpose macros in this file; |
| * presumably intentional for this layout - confirm with callers. |
| */ |
| .macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ |
| tmp0, tmp1, tmp2 |
| /* halfword interleave of rows 0/1 and rows 2/3, high and low parts */ |
| xvilvh.h \tmp1, \in0, \in1 |
| xvilvl.h \out1, \in0, \in1 |
| xvilvh.h \tmp0, \in2, \in3 |
| xvilvl.h \out3, \in2, \in3 |
| |
| /* word interleave of the low-part results; out3 is reused in place */ |
| xvilvh.w \tmp2, \out3, \out1 |
| xvilvl.w \out3, \out3, \out1 |
| |
| /* word interleave of the high-part results; tmp1 is reused in place */ |
| xvilvl.w \out2, \tmp0, \tmp1 |
| xvilvh.w \tmp1, \tmp0, \tmp1 |
| |
| /* 64-bit interleave produces the four outputs */ |
| xvilvh.d \out0, \out2, \out3 |
| xvilvl.d \out2, \out2, \out3 |
| xvilvh.d \out1, \tmp1, \tmp2 |
| xvilvl.d \out3, \tmp1, \tmp2 |
| .endm |
| |
| /* |
| * Description : Transpose 4x4 block with word elements in vectors |
| * (xv* form; the example shows both halves transposed alike) |
| * Arguments : Inputs - in0, in1, in2, in3 |
| * Outputs - out0, out1, out2, out3 |
| * Scratch - tmp0, tmp1 |
| * Details : out1 and out3 hold intermediate values during the macro. |
| * out1 is written before in2/in3 are read and must not alias |
| * them; tmp0/tmp1 should be distinct from all inputs. |
| * Example : |
| * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13 |
| * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14 |
| * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15 |
| * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16 |
| */ |
| .macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \ |
| _tmp0, _tmp1 |
| |
| /* word interleave of rows 0/1 (low -> tmp0, high -> out1) and rows 2/3 */ |
| xvilvl.w \_tmp0, \_in1, \_in0 |
| xvilvh.w \_out1, \_in1, \_in0 |
| xvilvl.w \_tmp1, \_in3, \_in2 |
| xvilvh.w \_out3, \_in3, \_in2 |
| |
| /* 64-bit interleave; out3 and out1 are consumed before being overwritten */ |
| xvilvl.d \_out0, \_tmp1, \_tmp0 |
| xvilvl.d \_out2, \_out3, \_out1 |
| xvilvh.d \_out3, \_out3, \_out1 |
| xvilvh.d \_out1, \_tmp1, \_tmp0 |
| .endm |
| |
| /* |
| * Description : Transpose 8x8 block with word elements in vectors |
| * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 |
| * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, |
| * _out7 |
| * Scratch - _tmp0, _tmp1, _tmp2, _tmp3 |
| * Details : _out0.._out3 are written before _in4.._in7 are read, so |
| * they must not alias any of _in4.._in7. The temporaries are |
| * reused for both input groups and for the final recombination |
| * and should be distinct from all inputs and outputs. |
| * Example : LASX_TRANSPOSE8x8_W |
| * _in0 : 1,2,3,4,5,6,7,8 |
| * _in1 : 2,2,3,4,5,6,7,8 |
| * _in2 : 3,2,3,4,5,6,7,8 |
| * _in3 : 4,2,3,4,5,6,7,8 |
| * _in4 : 5,2,3,4,5,6,7,8 |
| * _in5 : 6,2,3,4,5,6,7,8 |
| * _in6 : 7,2,3,4,5,6,7,8 |
| * _in7 : 8,2,3,4,5,6,7,8 |
| * |
| * _out0 : 1,2,3,4,5,6,7,8 |
| * _out1 : 2,2,2,2,2,2,2,2 |
| * _out2 : 3,3,3,3,3,3,3,3 |
| * _out3 : 4,4,4,4,4,4,4,4 |
| * _out4 : 5,5,5,5,5,5,5,5 |
| * _out5 : 6,6,6,6,6,6,6,6 |
| * _out6 : 7,7,7,7,7,7,7,7 |
| * _out7 : 8,8,8,8,8,8,8,8 |
| */ |
| .macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\ |
| _tmp0, _tmp1, _tmp2, _tmp3 |
| /* rows 0..3: two word-interleave levels into out0..out3 */ |
| xvilvl.w \_tmp0, \_in2, \_in0 |
| xvilvl.w \_tmp1, \_in3, \_in1 |
| xvilvh.w \_tmp2, \_in2, \_in0 |
| xvilvh.w \_tmp3, \_in3, \_in1 |
| xvilvl.w \_out0, \_tmp1, \_tmp0 |
| xvilvh.w \_out1, \_tmp1, \_tmp0 |
| xvilvl.w \_out2, \_tmp3, \_tmp2 |
| xvilvh.w \_out3, \_tmp3, \_tmp2 |
| |
| /* rows 4..7: the same two levels into out4..out7 */ |
| xvilvl.w \_tmp0, \_in6, \_in4 |
| xvilvl.w \_tmp1, \_in7, \_in5 |
| xvilvh.w \_tmp2, \_in6, \_in4 |
| xvilvh.w \_tmp3, \_in7, \_in5 |
| xvilvl.w \_out4, \_tmp1, \_tmp0 |
| xvilvh.w \_out5, \_tmp1, \_tmp0 |
| xvilvl.w \_out6, \_tmp3, \_tmp2 |
| xvilvh.w \_out7, \_tmp3, \_tmp2 |
| |
| /* keep copies of out0..out3: they are overwritten next but needed for out4..out7 */ |
| xmov \_tmp0, \_out0 |
| xmov \_tmp1, \_out1 |
| xmov \_tmp2, \_out2 |
| xmov \_tmp3, \_out3 |
| /* |
| * Recombine 128-bit halves of the two row groups. |
| * NOTE(review): selector 0x02 presumably keeps the destination's low half |
| * and takes the source's low half as the new high half; 0x31 presumably |
| * does the same with the high halves - confirm against the LASX manual. |
| */ |
| xvpermi.q \_out0, \_out4, 0x02 |
| xvpermi.q \_out1, \_out5, 0x02 |
| xvpermi.q \_out2, \_out6, 0x02 |
| xvpermi.q \_out3, \_out7, 0x02 |
| xvpermi.q \_out4, \_tmp0, 0x31 |
| xvpermi.q \_out5, \_tmp1, 0x31 |
| xvpermi.q \_out6, \_tmp2, 0x31 |
| xvpermi.q \_out7, \_tmp3, 0x31 |
| .endm |
| |
| /* |
| * Description : Transpose 4x4 block with double-word elements in vectors |
| * Arguments : Inputs - _in0, _in1, _in2, _in3 |
| * Outputs - _out0, _out1, _out2, _out3 |
| * Scratch - _tmp0, _tmp1 |
| * Details : _out1 is written before _in2/_in3 are read and must not |
| * alias them; _tmp0 must not alias any input and _tmp1 must |
| * not alias _in2/_in3. _out1 and _out2 hold intermediates |
| * until the final xvpermi.q group. |
| * Example : LASX_TRANSPOSE4x4_D |
| * _in0 : 1,2,3,4 |
| * _in1 : 1,2,3,4 |
| * _in2 : 1,2,3,4 |
| * _in3 : 1,2,3,4 |
| * |
| * _out0 : 1,1,1,1 |
| * _out1 : 2,2,2,2 |
| * _out2 : 3,3,3,3 |
| * _out3 : 4,4,4,4 |
| */ |
| .macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \ |
| _tmp0, _tmp1 |
| /* 64-bit interleave of rows 0/1 and rows 2/3 */ |
| xvilvl.d \_tmp0, \_in1, \_in0 |
| xvilvh.d \_out1, \_in1, \_in0 |
| xvilvh.d \_tmp1, \_in3, \_in2 |
| xvilvl.d \_out2, \_in3, \_in2 |
| |
| /* seed out0 / out3 with copies so tmp0 / tmp1 stay available below */ |
| xvor.v \_out0, \_tmp0, \_tmp0 |
| xvor.v \_out3, \_tmp1, \_tmp1 |
| |
| /* recombine 128-bit halves; each source is read before it is overwritten */ |
| xvpermi.q \_out0, \_out2, 0x02 |
| xvpermi.q \_out2, \_tmp0, 0x31 |
| xvpermi.q \_out3, \_out1, 0x31 |
| xvpermi.q \_out1, \_tmp1, 0x02 |
| .endm |
| |
| /* |
| * Description : Butterfly of 4 input vectors |
| * Arguments : Inputs - _in0, _in1, _in2, _in3 |
| * Outputs - _out0, _out1, _out2, _out3 |
| * Details : Butterfly operation; the _B/_H/_W/_D suffix selects the |
| * element width of the add/sub instructions. |
| * Results are written in the order _out0.._out3 while the |
| * inputs are still being read: _out0 and _out1 must not alias |
| * any input, _out2 may reuse _in1/_in2, _out3 may reuse any. |
| * Example : LSX_BUTTERFLY_4 |
| * _out0 = _in0 + _in3; |
| * _out1 = _in1 + _in2; |
| * _out2 = _in1 - _in2; |
| * _out3 = _in0 - _in3; |
| */ |
| .macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| vadd.b \_out0, \_in0, \_in3 |
| vadd.b \_out1, \_in1, \_in2 |
| vsub.b \_out2, \_in1, \_in2 |
| vsub.b \_out3, \_in0, \_in3 |
| .endm |
| /* |
| * 4-input butterfly on halfword elements: sums of the outer and inner pairs, |
| * then their differences. _out0/_out1 must not alias any input. |
| */ |
| .macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| vadd.h \_out0, \_in0, \_in3 |
| vadd.h \_out1, \_in1, \_in2 |
| vsub.h \_out2, \_in1, \_in2 |
| vsub.h \_out3, \_in0, \_in3 |
| .endm |
| /* |
| * 4-input butterfly on word elements: sums of the outer and inner pairs, |
| * then their differences. _out0/_out1 must not alias any input. |
| */ |
| .macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| vadd.w \_out0, \_in0, \_in3 |
| vadd.w \_out1, \_in1, \_in2 |
| vsub.w \_out2, \_in1, \_in2 |
| vsub.w \_out3, \_in0, \_in3 |
| .endm |
| /* |
| * 4-input butterfly on double-word elements: sums of the outer and inner |
| * pairs, then their differences. _out0/_out1 must not alias any input. |
| */ |
| .macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| vadd.d \_out0, \_in0, \_in3 |
| vadd.d \_out1, \_in1, \_in2 |
| vsub.d \_out2, \_in1, \_in2 |
| vsub.d \_out3, \_in0, \_in3 |
| .endm |
| |
| /* |
| * xv* form of the 4-input butterfly on byte elements: |
| * _out0 = _in0 + _in3, _out1 = _in1 + _in2, |
| * _out2 = _in1 - _in2, _out3 = _in0 - _in3. |
| * _out0/_out1 are written before all inputs are read and must not alias any |
| * input. |
| */ |
| .macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| xvadd.b \_out0, \_in0, \_in3 |
| xvadd.b \_out1, \_in1, \_in2 |
| xvsub.b \_out2, \_in1, \_in2 |
| xvsub.b \_out3, \_in0, \_in3 |
| .endm |
| /* |
| * xv* form of the 4-input butterfly on halfword elements. |
| * _out0/_out1 must not alias any input. |
| */ |
| .macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| xvadd.h \_out0, \_in0, \_in3 |
| xvadd.h \_out1, \_in1, \_in2 |
| xvsub.h \_out2, \_in1, \_in2 |
| xvsub.h \_out3, \_in0, \_in3 |
| .endm |
| /* |
| * xv* form of the 4-input butterfly on word elements. |
| * _out0/_out1 must not alias any input. |
| */ |
| .macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| xvadd.w \_out0, \_in0, \_in3 |
| xvadd.w \_out1, \_in1, \_in2 |
| xvsub.w \_out2, \_in1, \_in2 |
| xvsub.w \_out3, \_in0, \_in3 |
| .endm |
| /* |
| * xv* form of the 4-input butterfly on double-word elements. |
| * _out0/_out1 must not alias any input. |
| */ |
| .macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3 |
| xvadd.d \_out0, \_in0, \_in3 |
| xvadd.d \_out1, \_in1, \_in2 |
| xvsub.d \_out2, \_in1, \_in2 |
| xvsub.d \_out3, \_in0, \_in3 |
| .endm |
| |
| /* |
| * Description : Butterfly of 8 input vectors |
| * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ |
| * Outputs - _out0, _out1, _out2, _out3, ~ |
| * Details : Butterfly operation; the _B/_H/_W/_D suffix selects the |
| * element width of the add/sub instructions. |
| * The four sums are written before the differences read the |
| * inputs again, so _out0.._out3 must not alias any input. |
| * Example : LSX_BUTTERFLY_8 / LASX_BUTTERFLY_8 |
| * _out0 = _in0 + _in7; |
| * _out1 = _in1 + _in6; |
| * _out2 = _in2 + _in5; |
| * _out3 = _in3 + _in4; |
| * _out4 = _in3 - _in4; |
| * _out5 = _in2 - _in5; |
| * _out6 = _in1 - _in6; |
| * _out7 = _in0 - _in7; |
| */ |
| .macro LSX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7 |
| vadd.b \_out0, \_in0, \_in7 |
| vadd.b \_out1, \_in1, \_in6 |
| vadd.b \_out2, \_in2, \_in5 |
| vadd.b \_out3, \_in3, \_in4 |
| vsub.b \_out4, \_in3, \_in4 |
| vsub.b \_out5, \_in2, \_in5 |
| vsub.b \_out6, \_in1, \_in6 |
| vsub.b \_out7, \_in0, \_in7 |
| .endm |
| |
| /* |
| * 8-input butterfly on halfword elements: _out0.._out3 are the sums of the |
| * mirrored pairs (0,7) (1,6) (2,5) (3,4), _out4.._out7 the differences in |
| * reverse pair order. _out0.._out3 must not alias any input. |
| */ |
| .macro LSX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7 |
| vadd.h \_out0, \_in0, \_in7 |
| vadd.h \_out1, \_in1, \_in6 |
| vadd.h \_out2, \_in2, \_in5 |
| vadd.h \_out3, \_in3, \_in4 |
| vsub.h \_out4, \_in3, \_in4 |
| vsub.h \_out5, \_in2, \_in5 |
| vsub.h \_out6, \_in1, \_in6 |
| vsub.h \_out7, \_in0, \_in7 |
| .endm |
| |
| /* |
| * 8-input butterfly on word elements: _out0.._out3 are the sums of the |
| * mirrored pairs (0,7) (1,6) (2,5) (3,4), _out4.._out7 the differences in |
| * reverse pair order. _out0.._out3 must not alias any input. |
| */ |
| .macro LSX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7 |
| vadd.w \_out0, \_in0, \_in7 |
| vadd.w \_out1, \_in1, \_in6 |
| vadd.w \_out2, \_in2, \_in5 |
| vadd.w \_out3, \_in3, \_in4 |
| vsub.w \_out4, \_in3, \_in4 |
| vsub.w \_out5, \_in2, \_in5 |
| vsub.w \_out6, \_in1, \_in6 |
| vsub.w \_out7, \_in0, \_in7 |
| .endm |
| |
| /* |
| * 8-input butterfly on double-word elements: _out0.._out3 are the sums of |
| * the mirrored pairs (0,7) (1,6) (2,5) (3,4), _out4.._out7 the differences |
| * in reverse pair order. _out0.._out3 must not alias any input. |
| */ |
| .macro LSX_BUTTERFLY_8_D _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7 |
| vadd.d \_out0, \_in0, \_in7 |
| vadd.d \_out1, \_in1, \_in6 |
| vadd.d \_out2, \_in2, \_in5 |
| vadd.d \_out3, \_in3, \_in4 |
| vsub.d \_out4, \_in3, \_in4 |
| vsub.d \_out5, \_in2, \_in5 |
| vsub.d \_out6, \_in1, \_in6 |
| vsub.d \_out7, \_in0, \_in7 |
| .endm |
| |
| /* |
| * xv* form of the 8-input butterfly on byte elements: _out0.._out3 are the |
| * sums of the mirrored pairs (0,7) (1,6) (2,5) (3,4), _out4.._out7 the |
| * differences in reverse pair order. _out0.._out3 must not alias any input. |
| */ |
| .macro LASX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7 |
| xvadd.b \_out0, \_in0, \_in7 |
| xvadd.b \_out1, \_in1, \_in6 |
| xvadd.b \_out2, \_in2, \_in5 |
| xvadd.b \_out3, \_in3, \_in4 |
| xvsub.b \_out4, \_in3, \_in4 |
| xvsub.b \_out5, \_in2, \_in5 |
| xvsub.b \_out6, \_in1, \_in6 |
| xvsub.b \_out7, \_in0, \_in7 |
| .endm |
| |
| /* |
| * xv* form of the 8-input butterfly on halfword elements: _out0.._out3 are |
| * the sums of the mirrored pairs (0,7) (1,6) (2,5) (3,4), _out4.._out7 the |
| * differences in reverse pair order. _out0.._out3 must not alias any input. |
| */ |
| .macro LASX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7 |
| xvadd.h \_out0, \_in0, \_in7 |
| xvadd.h \_out1, \_in1, \_in6 |
| xvadd.h \_out2, \_in2, \_in5 |
| xvadd.h \_out3, \_in3, \_in4 |
| xvsub.h \_out4, \_in3, \_in4 |
| xvsub.h \_out5, \_in2, \_in5 |
| xvsub.h \_out6, \_in1, \_in6 |
| xvsub.h \_out7, \_in0, \_in7 |
| .endm |
| |
| /* |
| * xv* form of the 8-input butterfly on word elements: _out0.._out3 are the |
| * sums of the mirrored pairs (0,7) (1,6) (2,5) (3,4), _out4.._out7 the |
| * differences in reverse pair order. _out0.._out3 must not alias any input. |
| */ |
| .macro LASX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ |
| _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7 |
| xvadd.w \_out0, \_in0, \_in7 |
| xvadd.w \_out1, \_in1, \_in6 |
| xvadd.w \_out2, \_in2, \_in5 |
| xvadd.w \_out3, \_in3, \_in4 |
| xvsub.w \_out4, \_in3, \_in4 |
| xvsub.w \_out5, \_in2, \_in5 |
| xvsub.w \_out6, \_in1, \_in6 |
| xvsub.w \_out7, \_in0, \_in7 |
| .endm |