libavcodec/riscv/lpc_rvv.S - third_party/ffmpeg - Git at Google

 /*
  * Copyright © 2023 Rémi Denis-Courmont.
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */

 #include "libavutil/riscv/asm.S"

 #if __riscv_xlen >= 64
 /*
  * Multiply a block of integer samples by the LPC Welch-style window and
  * store the products as double-precision floats.
  *
  * In:   a0 = input, read as signed 32-bit integers (vle32.v + vfwcvt.f.x.v)
  *       a1 = len, the number of samples
  *       a2 = output, written as 64-bit floats (vse64.v)
  * Uses: t0-t4, ft0-ft3, fa1, v0-v31, vl/vtype; a0 and a2 are advanced.
  *
  * With c = 2. / (len - 1) and n2 = len >> 1, the routine computes:
  *   out[i]            = in[i]            * (1. - (c - i - 1.)^2), 0 <= i < n2
  *   out[n2]           = 0.                                  (only if len is odd)
  *   out[len - n2 + i] = in[len - n2 + i] * (1. - (c - n2 + i)^2), 0 <= i < n2
  *
  * NOTE(review): presumably the C prototype is
  * (const int32_t *data, ptrdiff_t len, double *w_data) - confirm against the
  * declaration used by the init code.
  */
 func ff_lpc_apply_welch_window_rvv, zve64d
         vsetvli   t0, zero, e64, m8, ta, ma  # vl = VLMAX at e64/m8
         vid.v     v0               # v0[i] = i
         addi      t2, a1, -1       # t2 = len - 1
         vfcvt.f.xu.v v0, v0        # v0[i] = (double)i
         li        t3, 2
         fcvt.d.l  ft2, t2          # ft2 = (double)(len - 1)
         srai      t1, a1, 1        # t1 = n2 = len >> 1, first half count
         fcvt.d.l  ft3, t3          # ft3 = 2.
         li        t4, 1
         fdiv.d    ft0, ft3, ft2    # ft0 = c = 2. / (len - 1)
         fcvt.d.l  fa1, t4         # fa1 = 1.
         fsub.d    ft1, ft0, fa1    # ft1 = c - 1.
         vfrsub.vf v0, v0, ft1    # v0[i] = c - i - 1.
 1:
         /* First half: v0 holds the window argument w for the next vl samples. */
         vsetvli   t0, t1, e64, m8, ta, ma  # t0 = vl = samples in this pass
         vfmul.vv  v16, v0, v0  # no fused multiply-add as v0 is reused
         sub       t1, t1, t0       # t1 = samples left in this half
         vle32.v   v8, (a0)         # load vl 32-bit samples
         fcvt.d.l  ft2, t0          # ft2 = (double)vl
         vfrsub.vf v16, v16, fa1  # v16 = 1. - w * w
         sh2add    a0, t0, a0       # a0 += 4 * vl
         vsetvli   zero, zero, e32, m4, ta, ma  # vl kept (same SEW/LMUL ratio)
         vfwcvt.f.x.v v24, v8       # v24 = samples widened to double
         vsetvli   zero, zero, e64, m8, ta, ma  # back to 64-bit elements
         vfsub.vf  v0, v0, ft2     # v0 -= vl
         vfmul.vv  v8, v24, v16     # v8 = sample * (1. - w * w)
         vse64.v   v8, (a2)
         sh3add    a2, t0, a2       # a2 += 8 * vl
         bnez      t1, 1b

         andi      t1, a1, 1
         beqz      t1, 2f           # even len: there is no middle sample

         sd        zero, (a2)       # odd len: middle output = 0. (all-zero bits)
         addi      a0, a0, 4        # skip the middle input sample
         addi      a2, a2, 8
 2:
         /* Second half: rebuild the index ramp, this time counting upwards. */
         vsetvli   t0, zero, e64, m8, ta, ma  # vl = VLMAX at e64/m8
         vid.v     v0               # v0[i] = i
         srai      t1, a1, 1        # t1 = n2 = len >> 1, second half count
         vfcvt.f.xu.v v0, v0        # v0[i] = (double)i
         fcvt.d.l  ft1, t1          # ft1 = (double)n2
         fsub.d    ft1, ft0, ft1    # ft1 = c - (len / 2)
         vfadd.vf  v0, v0, ft1     # v0[i] = c - (len / 2) + i
 3:
         vsetvli   t0, t1, e64, m8, ta, ma  # t0 = vl = samples in this pass
         vfmul.vv  v16, v0, v0      # v16 = w * w
         sub       t1, t1, t0       # t1 = samples left in this half
         vle32.v   v8, (a0)         # load vl 32-bit samples
         fcvt.d.l  ft2, t0          # ft2 = (double)vl
         vfrsub.vf v16, v16, fa1  # v16 = 1. - w * w
         sh2add    a0, t0, a0       # a0 += 4 * vl
         vsetvli   zero, zero, e32, m4, ta, ma  # vl kept (same SEW/LMUL ratio)
         vfwcvt.f.x.v v24, v8       # v24 = samples widened to double
         vsetvli   zero, zero, e64, m8, ta, ma  # back to 64-bit elements
         vfadd.vf  v0, v0, ft2     # v0 += vl
         vfmul.vv  v8, v24, v16     # v8 = sample * (1. - w * w)
         vse64.v   v8, (a2)
         sh3add    a2, t0, a2       # a2 += 8 * vl
         bnez      t1, 3b

         ret
 endfunc

 /*
  * Compute autocorrelation sums of a block of doubles for every lag from 0
  * to lag inclusive, i.e. lag + 1 results:
  *   autoc[j] = 1. + sum of data[i] * data[i + j] over all i with i + j < len
  *
  * In:   a0 = data (64-bit floats), a1 = len, a2 = lag, a3 = autoc
  * Uses: t0, ft0, ft1, v0-v7, v16-v23, vl/vtype; a0, a1 and a2 are modified.
  *
  * The vector length is lag + 1 rather than lag because lag 0 is included;
  * using lag would leave autoc[lag] neither accumulated nor stored.
  *
  * Constraints visible in the code:
  *  - all results live in a single e64/m8 register group, so lag + 1 must not
  *    exceed VLMAX. NOTE(review): confirm that the dispatch guard in the init
  *    code accounts for the + 1.
  *  - len must be positive: the tail loop is a do-while counting len down to
  *    exactly zero.
  *  - the first lag + 1 doubles of data are loaded unconditionally.
  */
 func ff_lpc_compute_autocorr_rvv, zve64d
         addi      a2, a2, 1    # a2 = lag + 1 = number of autoc[] entries
         li        t0, 1
         vsetvli   zero, a2, e64, m8, ta, ma
         fcvt.d.l  ft0, t0      # ft0 = 1.
         vle64.v   v0, (a0)     # v0[j] = data[j], 0 <= j <= lag
         sh3add    a0, a2, a0   # data += lag + 1
         vfmv.v.f  v16, ft0     # v16[j] = 1., start value of every sum
         bge       a2, a1, 2f   # window already reaches the end of data
 1:
         /* Full windows: v0 = data[i .. i + lag], all lag + 1 products valid. */
         vfmv.f.s  ft0, v0      # ft0 = data[i]
         fld       ft1, (a0)    # ft1 = data[i + lag + 1]
         vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
         addi      a1, a1, -1
         vfslide1down.vf v0, v0, ft1  # drop data[i], append ft1
         addi      a0, a0, 8
         bgt       a1, a2, 1b   # while (len > lag + 1);
 2:
         /*
          * Tail: only a1 samples remain, so only the first a1 sums still get
          * a contribution. Tail-undisturbed keeps the completed sums intact.
          */
         vfmv.f.s  ft0, v0      # ft0 = data[i]
         vsetvli   zero, a1, e64, m8, tu, ma
         vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j], j < a1
         addi      a1, a1, -1
         vslide1down.vx v0, v0, zero  # drop data[i], append 0.
         bnez      a1, 2b       # while (len > 0);

         vsetvli   zero, a2, e64, m8, ta, ma
         vse64.v   v16, (a3)    # store autoc[0 .. lag]
         ret
 endfunc
 #endif
	/*
	* Copyright © 2023 Rémi Denis-Courmont.
	*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include "libavutil/riscv/asm.S"

	#if __riscv_xlen >= 64
	/*
	 * Multiply a block of integer samples by the LPC Welch-style window and
	 * store the products as double-precision floats.
	 *
	 * In:   a0 = input, read as signed 32-bit integers (vle32.v + vfwcvt.f.x.v)
	 *       a1 = len, the number of samples
	 *       a2 = output, written as 64-bit floats (vse64.v)
	 * Uses: t0-t4, ft0-ft3, fa1, v0-v31, vl/vtype; a0 and a2 are advanced.
	 *
	 * With c = 2. / (len - 1) and n2 = len >> 1, the routine computes:
	 *   out[i]            = in[i]            * (1. - (c - i - 1.)^2), 0 <= i < n2
	 *   out[n2]           = 0.                                  (only if len is odd)
	 *   out[len - n2 + i] = in[len - n2 + i] * (1. - (c - n2 + i)^2), 0 <= i < n2
	 *
	 * NOTE(review): presumably the C prototype is
	 * (const int32_t *data, ptrdiff_t len, double *w_data) - confirm against the
	 * declaration used by the init code.
	 */
	func ff_lpc_apply_welch_window_rvv, zve64d
	vsetvli t0, zero, e64, m8, ta, ma # vl = VLMAX at e64/m8
	vid.v v0 # v0[i] = i
	addi t2, a1, -1 # t2 = len - 1
	vfcvt.f.xu.v v0, v0 # v0[i] = (double)i
	li t3, 2
	fcvt.d.l ft2, t2 # ft2 = (double)(len - 1)
	srai t1, a1, 1 # t1 = n2 = len >> 1, first half count
	fcvt.d.l ft3, t3 # ft3 = 2.
	li t4, 1
	fdiv.d ft0, ft3, ft2 # ft0 = c = 2. / (len - 1)
	fcvt.d.l fa1, t4 # fa1 = 1.
	fsub.d ft1, ft0, fa1 # ft1 = c - 1.
	vfrsub.vf v0, v0, ft1 # v0[i] = c - i - 1.
	1:
	/* First half: v0 holds the window argument w for the next vl samples. */
	vsetvli t0, t1, e64, m8, ta, ma # t0 = vl = samples in this pass
	vfmul.vv v16, v0, v0 # no fused multiply-add as v0 is reused
	sub t1, t1, t0 # t1 = samples left in this half
	vle32.v v8, (a0) # load vl 32-bit samples
	fcvt.d.l ft2, t0 # ft2 = (double)vl
	vfrsub.vf v16, v16, fa1 # v16 = 1. - w * w
	sh2add a0, t0, a0 # a0 += 4 * vl
	vsetvli zero, zero, e32, m4, ta, ma # vl kept (same SEW/LMUL ratio)
	vfwcvt.f.x.v v24, v8 # v24 = samples widened to double
	vsetvli zero, zero, e64, m8, ta, ma # back to 64-bit elements
	vfsub.vf v0, v0, ft2 # v0 -= vl
	vfmul.vv v8, v24, v16 # v8 = sample * (1. - w * w)
	vse64.v v8, (a2)
	sh3add a2, t0, a2 # a2 += 8 * vl
	bnez t1, 1b

	andi t1, a1, 1
	beqz t1, 2f # even len: there is no middle sample

	sd zero, (a2) # odd len: middle output = 0. (all-zero bits)
	addi a0, a0, 4 # skip the middle input sample
	addi a2, a2, 8
	2:
	/* Second half: rebuild the index ramp, this time counting upwards. */
	vsetvli t0, zero, e64, m8, ta, ma # vl = VLMAX at e64/m8
	vid.v v0 # v0[i] = i
	srai t1, a1, 1 # t1 = n2 = len >> 1, second half count
	vfcvt.f.xu.v v0, v0 # v0[i] = (double)i
	fcvt.d.l ft1, t1 # ft1 = (double)n2
	fsub.d ft1, ft0, ft1 # ft1 = c - (len / 2)
	vfadd.vf v0, v0, ft1 # v0[i] = c - (len / 2) + i
	3:
	vsetvli t0, t1, e64, m8, ta, ma # t0 = vl = samples in this pass
	vfmul.vv v16, v0, v0 # v16 = w * w
	sub t1, t1, t0 # t1 = samples left in this half
	vle32.v v8, (a0) # load vl 32-bit samples
	fcvt.d.l ft2, t0 # ft2 = (double)vl
	vfrsub.vf v16, v16, fa1 # v16 = 1. - w * w
	sh2add a0, t0, a0 # a0 += 4 * vl
	vsetvli zero, zero, e32, m4, ta, ma # vl kept (same SEW/LMUL ratio)
	vfwcvt.f.x.v v24, v8 # v24 = samples widened to double
	vsetvli zero, zero, e64, m8, ta, ma # back to 64-bit elements
	vfadd.vf v0, v0, ft2 # v0 += vl
	vfmul.vv v8, v24, v16 # v8 = sample * (1. - w * w)
	vse64.v v8, (a2)
	sh3add a2, t0, a2 # a2 += 8 * vl
	bnez t1, 3b

	ret
	endfunc

	/*
	 * Compute autocorrelation sums of a block of doubles for every lag from 0
	 * to lag inclusive, i.e. lag + 1 results:
	 *   autoc[j] = 1. + sum of data[i] * data[i + j] over all i with i + j < len
	 *
	 * In:   a0 = data (64-bit floats), a1 = len, a2 = lag, a3 = autoc
	 * Uses: t0, ft0, ft1, v0-v7, v16-v23, vl/vtype; a0, a1 and a2 are modified.
	 *
	 * The vector length is lag + 1 rather than lag because lag 0 is included;
	 * using lag would leave autoc[lag] neither accumulated nor stored.
	 *
	 * Constraints visible in the code:
	 *  - all results live in a single e64/m8 register group, so lag + 1 must not
	 *    exceed VLMAX. NOTE(review): confirm that the dispatch guard in the init
	 *    code accounts for the + 1.
	 *  - len must be positive: the tail loop is a do-while counting len down to
	 *    exactly zero.
	 *  - the first lag + 1 doubles of data are loaded unconditionally.
	 */
	func ff_lpc_compute_autocorr_rvv, zve64d
	addi a2, a2, 1 # a2 = lag + 1 = number of autoc[] entries
	li t0, 1
	vsetvli zero, a2, e64, m8, ta, ma
	fcvt.d.l ft0, t0 # ft0 = 1.
	vle64.v v0, (a0) # v0[j] = data[j], 0 <= j <= lag
	sh3add a0, a2, a0 # data += lag + 1
	vfmv.v.f v16, ft0 # v16[j] = 1., start value of every sum
	bge a2, a1, 2f # window already reaches the end of data
	1:
	/* Full windows: v0 = data[i .. i + lag], all lag + 1 products valid. */
	vfmv.f.s ft0, v0 # ft0 = data[i]
	fld ft1, (a0) # ft1 = data[i + lag + 1]
	vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
	addi a1, a1, -1
	vfslide1down.vf v0, v0, ft1 # drop data[i], append ft1
	addi a0, a0, 8
	bgt a1, a2, 1b # while (len > lag + 1);
	2:
	/*
	 * Tail: only a1 samples remain, so only the first a1 sums still get
	 * a contribution. Tail-undisturbed keeps the completed sums intact.
	 */
	vfmv.f.s ft0, v0 # ft0 = data[i]
	vsetvli zero, a1, e64, m8, tu, ma
	vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j], j < a1
	addi a1, a1, -1
	vslide1down.vx v0, v0, zero # drop data[i], append 0.
	bnez a1, 2b # while (len > 0);

	vsetvli zero, a2, e64, m8, ta, ma
	vse64.v v16, (a3) # store autoc[0 .. lag]
	ret
	endfunc
	#endif