;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Double-precision constants for lpc_apply_welch_window.
; Keep the order: the section is 32-byte aligned and the four-element tables
; come first, so every table that is loaded with an aligned ymm/xmm access
; stays on a suitable boundary.

; 1.0 in every lane (kept in m6, the "1.0 -" part of 1.0 - x*x)
one_tab:        dq  1.0,  1.0,  1.0,  1.0
; per-lane start offsets for the even-length path, AVX2 width
seq_tab_avx2:   dq  3.0,  2.0,  1.0,  0.0
; per-lane start offsets for the odd-length path (SSE2 uses the first two);
; the low lane is also added when setting up the even-length scalar tail
sub_tab:        dq -1.0, -2.0, -3.0, -4.0
; per-iteration step of the even-length vector loop, AVX2 width
add_tab_avx2:   dq  4.0,  4.0,  4.0,  4.0
; per-iteration step of the odd-length vector loop, AVX2 width
dec_tab_avx2:   dq -4.0, -4.0, -4.0, -4.0
; per-iteration step of the even-length vector loop, SSE2 width
add_tab_sse2:   dq  2.0,  2.0
; per-iteration step of the odd-length vector loop, SSE2 width
dec_tab_sse2:   dq -2.0, -2.0
; step used by both scalar tail loops
dec_tab_scalar: dq -1.0, -1.0
; per-lane start offsets for the even-length path, SSE2 width
seq_tab_sse2:   dq  1.0,  0.0

SECTION .text

;------------------------------------------------------------------------------
; APPLY_WELCH_FN
;
; Emits lpc_apply_welch_window for the instruction set selected by the
; preceding INIT_XMM / INIT_YMM.
;
; Arguments (cglobal: 3 arguments, 5 GPRs, 8 vector registers):
;   data - input samples, read as packed 32-bit integers
;   len  - number of samples; used as a full-width register (lenq)
;   out  - output, written as 64-bit doubles
; Scratch:
;   off1/off2 - byte offsets into data (doubled when indexing out)
;   m0 - running window argument x
;   m6 - 1.0 in every lane
;   m7 - per-iteration step applied to x
;
; Every sample is scaled by w = 1.0 - x*x, with x derived from 2.0/(len - 1):
;   odd len  - x = 2/(len-1) - 1 - i for the i-th sample counted from either
;              end, walking inwards; the centre sample is stored as 0.0.
;   even len - x = 2/(len-1) - len/2 + i for the i-th sample on either side
;              of the centre, walking outwards.
; len == 0 stores nothing; len == 1 and len == 2 store only zeros.
;
; No more than len samples are read from data and no more than len doubles
; are written to out.
;------------------------------------------------------------------------------
%macro APPLY_WELCH_FN 0
cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
    ; Trivial lengths are handled without touching the vector constants
    test          lenq, lenq
    jz            .end_e
    cmp           lenq, 2
    je            .two
    cmp           lenq, 1
    je            .one

    movapd        m6, [one_tab]

    movd          xm1, lend
    cvtdq2pd      xm1, xm1              ; len
%if cpuflag(avx2)
    vbroadcastsd  m1, xm1
%else
    shufpd        m1, m1, 00b
%endif

    addpd         m0, m6, m6            ; 2.0
    subpd         m1, m6                ; len - 1
    divpd         m0, m1                ; 2.0 / (len - 1)

    mov           off1q, lenq
    and           off1q, 1
    je            .even

    ;--------------------------------------------------------------------------
    ; Odd length: off1 walks up from the first sample, off2 walks down from
    ; the last one, both ends share the same weights (mirrored per vector).
    ;--------------------------------------------------------------------------
    addpd         m0, [sub_tab]         ; x = 2/(len-1) - 1 - lane

    lea           off2q, [lenq*4 - mmsize/2] ; last full vector of data
    sub           lenq, mmsize/4        ; avoid overwriting
    xor           off1q, off1q

    cmp           lenq, mmsize/4
    jl            .scalar_o

%if cpuflag(avx2)
    movapd        m7, [dec_tab_avx2]
%else
    movapd        m7, [dec_tab_sse2]
%endif

.loop_o:
    movapd        m1, m6
%if cpuflag(avx2)
    fnmaddpd      m1, m0, m0, m1        ; 1.0 - x*x
    vpermpd       m2, m1, q0123         ; lane-reversed copy for the back end
%else
    mulpd         m2, m0, m0
    subpd         m1, m2                ; 1.0 - x*x
    shufpd        m2, m1, m1, 01b       ; lane-swapped copy for the back end
%endif

    cvtdq2pd      m3, [dataq + off1q]
    cvtdq2pd      m4, [dataq + off2q]

    mulpd         m1, m3
    mulpd         m2, m4

    movupd        [outq + off1q*2], m1
    movupd        [outq + off2q*2], m2

    addpd         m0, m7
    add           off1q, mmsize/2
    sub           off2q, mmsize/2
    sub           lenq, mmsize/4
    jg            .loop_o

    ; lenq is now in (-mmsize/4, 0]; the add leaves zero (and sets ZF) exactly
    ; when only the centre sample is left.
    add           lend, (mmsize/4 - 1)
    jz            .end_o
    sub           lenq, (mmsize/4 - 1)

.scalar_o:
    movapd        xm7, [dec_tab_scalar]

    ; Set offsets
    add           off2q, (mmsize/4) + 4*cpuflag(avx2) ; last unprocessed sample
    add           lenq, mmsize/4 - 2

.loop_o_scalar:
    movapd        xm1, xm6
%if cpuflag(avx2)
    fnmaddpd      xm1, xm0, xm0, xm1
%else
    mulpd         xm2, xm0, xm0
    subpd         xm1, xm2
%endif

    ; Only the low lane of either product is stored.  off1 is always followed
    ; by at least the centre sample, so an 8-byte load is fine there; off2 can
    ; address the very last sample, so load exactly 4 bytes to stay inside
    ; data.
    cvtdq2pd      xm3, [dataq + off1q]
    movd          xm4, [dataq + off2q]
    cvtdq2pd      xm4, xm4

    mulpd         xm3, xm1
    mulpd         xm4, xm1

    movlpd        [outq + off1q*2], xm3
    movlpd        [outq + off2q*2], xm4

    addpd         xm0, xm7

    add           off1q, 4
    sub           off2q, 4

    sub           lenq, 2
    jg            .loop_o_scalar

.end_o:
    ; off1 now addresses the centre sample
    xorpd         xm3, xm3
    movlpd        [outq + off1q*2], xm3
    RET

    ;--------------------------------------------------------------------------
    ; Even length: off1 walks down from just below the centre, off2 walks up
    ; from the centre.
    ;--------------------------------------------------------------------------
.even:
%if cpuflag(avx2)
    addpd         m0, [seq_tab_avx2]
%else
    addpd         m0, [seq_tab_sse2]
%endif

    mov           off1d, lend
    shr           off1d, 1
    movd          xm1, off1d
    cvtdq2pd      xm1, xm1              ; len/2
%if cpuflag(avx2)
    vbroadcastsd  m1, xm1
%else
    shufpd        m1, m1, 00b
%endif
    subpd         m0, m1                ; x = 2/(len-1) + seq - len/2

%if cpuflag(avx2)
    movapd        m7, [add_tab_avx2]
%else
    movapd        m7, [add_tab_sse2]
%endif

    lea           off2q, [lenq*2]             ; first sample of the upper half
    lea           off1q, [lenq*2 - mmsize/2]  ; last vector of the lower half
    sub           lenq, mmsize/4

    cmp           lenq, mmsize/4
    jl            .scalar_e

.loop_e:
    movapd        m1, m6
%if cpuflag(avx2)
    fnmaddpd      m1, m0, m0, m1        ; 1.0 - x*x
    vpermpd       m2, m1, q0123         ; lane-reversed copy for the upper half
%else
    mulpd         m2, m0, m0
    subpd         m1, m2                ; 1.0 - x*x
    shufpd        m2, m1, m1, 01b       ; lane-swapped copy for the upper half
%endif

    cvtdq2pd      m3, [dataq + off1q]
    cvtdq2pd      m4, [dataq + off2q]

    mulpd         m1, m3
    mulpd         m2, m4

    movupd        [outq + off1q*2], m1
    movupd        [outq + off2q*2], m2

    addpd         m0, m7
    add           off2q, mmsize/2
    sub           off1q, mmsize/2
    sub           lenq, mmsize/4
    jge           .loop_e

.scalar_e:
    ; The tail starts one sample pair closer to the centre than the first pair
    ; still missing, so its first iteration only re-stores positions that the
    ; vector loop already wrote or that the following iteration writes again.
    subpd         xm0, xm7
    movapd        xm7, [dec_tab_scalar]
    subpd         xm0, xm7

    add           off1q, (mmsize/2)
    sub           off2q, (mmsize/2) - 8*cpuflag(avx2)
    add           lenq, 6 + 4*cpuflag(avx2)

    addpd         xm0, [sub_tab]

.loop_e_scalar:
    movapd        xm1, xm6
%if cpuflag(avx2)
    fnmaddpd      xm1, xm0, xm0, xm1
%else
    mulpd         xm2, xm0, xm0
    subpd         xm1, xm2
%endif

    cvtdq2pd      xm3, [dataq + off1q]
    cvtdq2pd      xm4, [dataq + off2q]

    mulpd         xm3, xm1
    shufpd        xm1, xm1, 00b         ; same weight in both lanes
    mulpd         xm4, xm1

    ; lower half takes the low lane, upper half the high lane (one sample
    ; above off2)
    movlpd        [outq + off1q*2], xm3
    movhpd        [outq + off2q*2 + 8], xm4

    subpd         xm0, xm7              ; xm7 = -1.0, so x advances by +1.0

    add           off2q, 4
    sub           off1q, 4
    sub           lenq, 2
    jg            .loop_e_scalar
    RET

    ; len == 2: both outputs are zero; falls through to store out[0]
.two:
    xorpd         xm0, xm0
    movlpd        [outq + 8], xm0
    ; len == 1: the single output is zero
.one:
    xorpd         xm0, xm0
    movlpd        [outq], xm0
.end_e:
    RET
%endmacro

; SSE2 flavour: 128-bit registers, always assembled.
INIT_XMM sse2
APPLY_WELCH_FN

; AVX2 flavour: 256-bit registers, assembled only when HAVE_AVX2_EXTERNAL is
; non-zero.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
APPLY_WELCH_FN
%endif