; libavfilter/x86/af_afir.asm - third_party/ffmpeg - Git at Google

 ;*****************************************************************************
 ;* x86-optimized functions for afir filter
 ;* Copyright (c) 2017 Paul B Mahol
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

 %include "libavutil/x86/x86util.asm"

 SECTION .text

 ;------------------------------------------------------------------------------
 ; void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
 ;------------------------------------------------------------------------------

 INIT_XMM sse3
 cglobal fcmul_add, 4,4,6, sum, t, c, len
     ; Complex multiply-accumulate over interleaved (re, im) float pairs:
     ;     sum[i] += t[i] * c[i]
     ; cglobal (x86inc) declares 4 arguments, 4 general-purpose registers and
     ; 6 vector registers (m0-m5).
     ;
     ; Every memory operand below is an aligned-access form, and the effective
     ; addresses are base + k*2*mmsize, so sum, t and c must be aligned to the
     ; vector width (mmsize; 16 bytes for XMM).
     ;
     ; NOTE(review): the loop spans len*8 + 2*mmsize bytes of each buffer,
     ; rounded up to whole 2*mmsize steps, i.e. it runs past len*8 bytes.
     ; Presumably the caller sizes/pads the buffers for this -- confirm.

     ; Byte span = len*8 + 2*mmsize. Writing the 32-bit lend zero-extends into
     ; lenq on x86-64, so lenq is a valid pointer offset afterwards.
     shl       lend, 3
     add       lend, mmsize*2
     ; Move each pointer to the end of its span, then count a negative offset
     ; up towards zero so one register is both index and loop counter.
     add       sumq, lenq
     add         tq, lenq
     add         cq, lenq
     neg       lenq
 ALIGN 16
 .loop:
     ; ---- first vector (two complex pairs at offset lenq) ----
     movsldup  m0, [tq + lenq]              ; m0 = t.re, t.re (even lanes duplicated)
     movshdup  m1, [tq + lenq]              ; m1 = t.im, t.im (odd lanes duplicated)
     movaps    m2, [cq + lenq]              ; m2 = c.re, c.im
     mulps     m0, m2                       ; m0 = t.re*c.re, t.re*c.im
     shufps    m2, m2, 0xb1                 ; swap lanes within each pair: c.im, c.re
     mulps     m1, m2                       ; m1 = t.im*c.im, t.im*c.re
     addsubps  m0, m1                       ; even lanes subtract, odd lanes add
     addps     m0, [sumq + lenq]            ; accumulate onto the existing sum
     ; ---- second vector (two complex pairs at offset lenq + mmsize) ----
     movsldup  m3, [tq + lenq + mmsize]
     movshdup  m4, [tq + lenq + mmsize]
     movaps    m5, [cq + lenq + mmsize]
     mulps     m3, m5
     shufps    m5, m5, 0xb1
     mulps     m4, m5
     addsubps  m3, m4
     addps     m3, [sumq + lenq + mmsize]
     ; Stores are issued only after every load of this iteration.
     movaps    [sumq + lenq], m0
     movaps    [sumq + lenq + mmsize], m3
     add       lenq, mmsize*2               ; advance by the two vectors just handled
     jl        .loop                        ; continue while the offset is still negative
     REP_RET
	;*****************************************************************************
	;* x86-optimized functions for afir filter
	;* Copyright (c) 2017 Paul B Mahol
	;*
	;* This file is part of FFmpeg.
	;*
	;* FFmpeg is free software; you can redistribute it and/or
	;* modify it under the terms of the GNU Lesser General Public
	;* License as published by the Free Software Foundation; either
	;* version 2.1 of the License, or (at your option) any later version.
	;*
	;* FFmpeg is distributed in the hope that it will be useful,
	;* but WITHOUT ANY WARRANTY; without even the implied warranty of
	;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	;* Lesser General Public License for more details.
	;*
	;* You should have received a copy of the GNU Lesser General Public
	;* License along with FFmpeg; if not, write to the Free Software
	;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	;******************************************************************************

	%include "libavutil/x86/x86util.asm"

	SECTION .text

	;------------------------------------------------------------------------------
	; void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
	;------------------------------------------------------------------------------

	INIT_XMM sse3
	cglobal fcmul_add, 4,4,6, sum, t, c, len
	    ; sum[i] += t[i] * c[i], where the buffers hold interleaved (re, im)
	    ; float pairs. Two vectors (2*mmsize bytes) are handled per iteration.
	    ;
	    ; NOTE(review): a `cglobal fcmul_add` with the same name also appears
	    ; earlier in this file; assembling both would presumably redefine the
	    ; symbol -- confirm whether this second copy should exist.
	    ;
	    ; NOTE(review): the loop covers len*8 + 2*mmsize bytes (rounded up to a
	    ; whole 2*mmsize step), which is beyond len*8; presumably the caller
	    ; pads the buffers -- confirm. All memory operands are aligned-access
	    ; forms, so sum, t and c need vector-width alignment.

	    ; lenq = len*8 + 2*mmsize (the 32-bit writes zero-extend on x86-64).
	    shl       lend, 3
	    add       lend, mmsize*2
	    ; Point each buffer at the end of its span; index with -span .. 0.
	    add       cq, lenq
	    add       tq, lenq
	    add       sumq, lenq
	    neg       lenq
	ALIGN 16
	.loop:
	    ; Phase 1: load coefficients and the split real/imag parts of t.
	    movaps    m0, [cq + lenq]               ; c, first vector
	    movaps    m1, [cq + lenq + mmsize]      ; c, second vector
	    movsldup  m2, [tq + lenq]               ; t.re duplicated into both lanes of a pair
	    movsldup  m3, [tq + lenq + mmsize]
	    movshdup  m4, [tq + lenq]               ; t.im duplicated into both lanes of a pair
	    movshdup  m5, [tq + lenq + mmsize]
	    ; Phase 2: partial products of the complex multiply.
	    mulps     m2, m0                        ; t.re*c.re, t.re*c.im
	    mulps     m3, m1
	    shufps    m0, m0, 0xb1                  ; swap re/im inside each pair of c
	    shufps    m1, m1, 0xb1
	    mulps     m4, m0                        ; t.im*c.im, t.im*c.re
	    mulps     m5, m1
	    ; Phase 3: combine (even lanes subtract, odd lanes add) and accumulate.
	    addsubps  m2, m4
	    addsubps  m3, m5
	    addps     m2, [sumq + lenq]
	    addps     m3, [sumq + lenq + mmsize]
	    ; Phase 4: write back; every load above precedes these stores.
	    movaps    [sumq + lenq], m2
	    movaps    [sumq + lenq + mmsize], m3
	    add       lenq, mmsize*2                ; next pair of vectors
	    jl        .loop                         ; loop while the offset is negative
	    REP_RET