; libavcodec/x86/fmtconvert.asm - third_party/ffmpeg - Git at Google

 ;******************************************************************************
 ;* x86 optimized Format Conversion Utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

 %include "libavutil/x86/x86util.asm"

 SECTION .text

 ;------------------------------------------------------------------------------
 ; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
 ;                                    int len);
 ;
 ; Converts len 32-bit integers from src to float, multiplies each by mul and
 ; stores the products to dst.
 ;
 ; Macro argument %1: XMM register count forwarded to cglobal.
 ;
 ; One loop iteration handles 8 elements (32 bytes), so len is expected to be
 ; a multiple of 8. len == 0 returns without touching src or dst.
 ; NOTE(review): results are stored with mova and the SSE2 path feeds src
 ; straight into cvtdq2ps as a memory operand, which implies 16-byte aligned
 ; dst (and src on SSE2) - confirm against the callers.
 ;------------------------------------------------------------------------------
 %macro INT32_TO_FLOAT_FMUL_SCALAR 1
 ; mul is a float argument. On UNIX64 it is passed in xmm0 and takes no
 ; integer argument slot, so only dst/src/len are named there. On the other
 ; targets it occupies the third argument position.
 %if UNIX64
 cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
 %else
 cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
 %endif
 %if WIN64
     SWAP 0, 2                     ; third argument arrives in xmm2: call it m0
 %elif ARCH_X86_32
     movss   m0, mulm              ; fetch mul from its argument slot
 %endif
     SPLATD  m0                    ; m0 = mul in every lane
     ; Turn the element count into a byte count. The 32-bit shift also clears
     ; the upper half of lenq on x86-64.
     shl     lend, 2
     ; Point both pointers at the end of their buffers and index them with a
     ; negative offset that counts up to zero.
     add     srcq, lenq
     add     dstq, lenq
     neg     lenq
     ; The loop below tests its condition only after the first pass, so an
     ; empty request must be filtered out here to avoid converting 8 elements
     ; past the end of both buffers. ZF still reflects the result of neg.
     jz      .end
 .loop:
 %if cpuflag(sse2)
     cvtdq2ps  m1, [srcq+lenq   ]  ; 4 x int32 -> 4 x float
     cvtdq2ps  m2, [srcq+lenq+16]
 %else
     ; cvtpi2ps converts only two integers, into the low half of its
     ; destination; movlhps then joins two halves into one full vector.
     cvtpi2ps  m1, [srcq+lenq   ]
     cvtpi2ps  m3, [srcq+lenq+ 8]
     cvtpi2ps  m2, [srcq+lenq+16]
     cvtpi2ps  m4, [srcq+lenq+24]
     movlhps   m1, m3
     movlhps   m2, m4
 %endif
     mulps     m1, m0
     mulps     m2, m0
     mova  [dstq+lenq   ], m1
     mova  [dstq+lenq+16], m2
     add     lenq, 32              ; 8 elements done
     jl .loop                      ; offset still negative: more to convert
 .end:
 %if notcpuflag(sse2)
     ;; The documentation says cvtpi2ps switches to MMX state even when its
     ;; source is a memory location. That is possibly a documentation error,
     ;; since every tested CPU disagrees with it. emms is issued anyway; its
     ;; cost hardly matters because the vast majority of machines will use
     ;; the SSE2 variant.
     emms
 %endif
     RET
 %endmacro

 ; Instantiate the function once per instruction set. The macro argument is
 ; the XMM register count handed to cglobal: the SSE build takes the cvtpi2ps
 ; path, which touches m0-m4, while the SSE2 build only touches m0-m2.
 INIT_XMM sse
 INT32_TO_FLOAT_FMUL_SCALAR 5
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_SCALAR 3

 ;------------------------------------------------------------------------------
 ; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
 ;                                    const float *mul, int len);
 ;
 ; Converts len 32-bit integers from src to float and stores them to dst,
 ; scaling every group of 8 consecutive elements by the next entry of mul[].
 ; The context argument c is named but never read.
 ;
 ; One loop iteration handles 8 elements (32 bytes), so len is expected to be
 ; a multiple of 8. len == 0 returns without touching src, dst or mul.
 ; NOTE(review): results are stored with mova and the SSE2 path feeds src
 ; straight into cvtdq2ps as a memory operand, which implies 16-byte aligned
 ; dst (and src on SSE2) - confirm against the callers.
 ;------------------------------------------------------------------------------
 %macro INT32_TO_FLOAT_FMUL_ARRAY8 0
 cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
     ; Turn the element count into a byte count. The 32-bit shift also clears
     ; the upper half of lenq on x86-64.
     shl     lend, 2
     ; Point both pointers at the end of their buffers and index them with a
     ; negative offset that counts up to zero.
     add     srcq, lenq
     add     dstq, lenq
     neg     lenq
     ; The loop below tests its condition only after the first pass, so an
     ; empty request must be filtered out here to avoid reading mul[0] and
     ; converting 8 elements past the end of both buffers. ZF still reflects
     ; the result of neg.
     jz      .end
 .loop:
     movss     m0, [mulq]          ; multiplier for this group of 8
     SPLATD    m0                  ; m0 = multiplier in every lane
 %if cpuflag(sse2)
     cvtdq2ps  m1, [srcq+lenq   ]  ; 4 x int32 -> 4 x float
     cvtdq2ps  m2, [srcq+lenq+16]
 %else
     ; cvtpi2ps converts only two integers, into the low half of its
     ; destination; movlhps then joins two halves into one full vector.
     cvtpi2ps  m1, [srcq+lenq   ]
     cvtpi2ps  m3, [srcq+lenq+ 8]
     cvtpi2ps  m2, [srcq+lenq+16]
     cvtpi2ps  m4, [srcq+lenq+24]
     movlhps   m1, m3
     movlhps   m2, m4
 %endif
     mulps     m1, m0
     mulps     m2, m0
     mova  [dstq+lenq   ], m1
     mova  [dstq+lenq+16], m2
     add     mulq, 4               ; next multiplier
     add     lenq, 32              ; 8 elements done
     jl .loop                      ; offset still negative: more to convert
 .end:
 %if notcpuflag(sse2)
     ;; The documentation says cvtpi2ps switches to MMX state even when its
     ;; source is a memory location. That is possibly a documentation error,
     ;; since every tested CPU disagrees with it. emms is issued anyway; its
     ;; cost hardly matters because the vast majority of machines will use
     ;; the SSE2 variant.
     emms
 %endif
     RET
 %endmacro

 ; Instantiate the function once per instruction set. The macro takes no
 ; argument: its cglobal line always requests 5 XMM registers, which covers
 ; both the cvtpi2ps path (m0-m4) and the SSE2 path (m0-m2).
 INIT_XMM sse
 INT32_TO_FLOAT_FMUL_ARRAY8
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_ARRAY8
	;******************************************************************************
	;* x86 optimized Format Conversion Utils
	;* Copyright (c) 2008 Loren Merritt
	;*
	;* This file is part of FFmpeg.
	;*
	;* FFmpeg is free software; you can redistribute it and/or
	;* modify it under the terms of the GNU Lesser General Public
	;* License as published by the Free Software Foundation; either
	;* version 2.1 of the License, or (at your option) any later version.
	;*
	;* FFmpeg is distributed in the hope that it will be useful,
	;* but WITHOUT ANY WARRANTY; without even the implied warranty of
	;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	;* Lesser General Public License for more details.
	;*
	;* You should have received a copy of the GNU Lesser General Public
	;* License along with FFmpeg; if not, write to the Free Software
	;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	;******************************************************************************

	%include "libavutil/x86/x86util.asm"

	SECTION .text

	;------------------------------------------------------------------------------
	; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
	;                                    int len);
	;
	; Converts len 32-bit integers from src to float, multiplies each by mul and
	; stores the products to dst.
	;
	; Macro argument %1: XMM register count forwarded to cglobal.
	;
	; One loop iteration handles 8 elements (32 bytes), so len is expected to be
	; a multiple of 8. len == 0 returns without touching src or dst.
	; NOTE(review): results are stored with mova and the SSE2 path feeds src
	; straight into cvtdq2ps as a memory operand, which implies 16-byte aligned
	; dst (and src on SSE2) - confirm against the callers.
	;------------------------------------------------------------------------------
	%macro INT32_TO_FLOAT_FMUL_SCALAR 1
	; mul is a float argument. On UNIX64 it is passed in xmm0 and takes no
	; integer argument slot, so only dst/src/len are named there. On the other
	; targets it occupies the third argument position.
	%if UNIX64
	cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
	%else
	cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
	%endif
	%if WIN64
	    SWAP 0, 2                     ; third argument arrives in xmm2: call it m0
	%elif ARCH_X86_32
	    movss   m0, mulm              ; fetch mul from its argument slot
	%endif
	    SPLATD  m0                    ; m0 = mul in every lane
	    ; Turn the element count into a byte count. The 32-bit shift also clears
	    ; the upper half of lenq on x86-64.
	    shl     lend, 2
	    ; Point both pointers at the end of their buffers and index them with a
	    ; negative offset that counts up to zero.
	    add     srcq, lenq
	    add     dstq, lenq
	    neg     lenq
	    ; The loop below tests its condition only after the first pass, so an
	    ; empty request must be filtered out here to avoid converting 8 elements
	    ; past the end of both buffers. ZF still reflects the result of neg.
	    jz      .end
	.loop:
	%if cpuflag(sse2)
	    cvtdq2ps  m1, [srcq+lenq   ]  ; 4 x int32 -> 4 x float
	    cvtdq2ps  m2, [srcq+lenq+16]
	%else
	    ; cvtpi2ps converts only two integers, into the low half of its
	    ; destination; movlhps then joins two halves into one full vector.
	    cvtpi2ps  m1, [srcq+lenq   ]
	    cvtpi2ps  m3, [srcq+lenq+ 8]
	    cvtpi2ps  m2, [srcq+lenq+16]
	    cvtpi2ps  m4, [srcq+lenq+24]
	    movlhps   m1, m3
	    movlhps   m2, m4
	%endif
	    mulps     m1, m0
	    mulps     m2, m0
	    mova  [dstq+lenq   ], m1
	    mova  [dstq+lenq+16], m2
	    add     lenq, 32              ; 8 elements done
	    jl .loop                      ; offset still negative: more to convert
	.end:
	%if notcpuflag(sse2)
	    ;; The documentation says cvtpi2ps switches to MMX state even when its
	    ;; source is a memory location. That is possibly a documentation error,
	    ;; since every tested CPU disagrees with it. emms is issued anyway; its
	    ;; cost hardly matters because the vast majority of machines will use
	    ;; the SSE2 variant.
	    emms
	%endif
	    RET
	%endmacro

	; Instantiate the function once per instruction set. The macro argument is
	; the XMM register count handed to cglobal: the SSE build takes the cvtpi2ps
	; path, which touches m0-m4, while the SSE2 build only touches m0-m2.
	; NOTE(review): the same macro is also defined and instantiated earlier in
	; this file, so these lines emit int32_to_float_fmul_scalar a second time
	; for each instruction set - one of the two copies should be dropped.
	INIT_XMM sse
	INT32_TO_FLOAT_FMUL_SCALAR 5
	INIT_XMM sse2
	INT32_TO_FLOAT_FMUL_SCALAR 3

	;------------------------------------------------------------------------------
	; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
	;                                    const float *mul, int len);
	;
	; Converts len 32-bit integers from src to float and stores them to dst,
	; scaling every group of 8 consecutive elements by the next entry of mul[].
	; The context argument c is named but never read.
	;
	; One loop iteration handles 8 elements (32 bytes), so len is expected to be
	; a multiple of 8. len == 0 returns without touching src, dst or mul.
	; NOTE(review): results are stored with mova and the SSE2 path feeds src
	; straight into cvtdq2ps as a memory operand, which implies 16-byte aligned
	; dst (and src on SSE2) - confirm against the callers.
	;------------------------------------------------------------------------------
	%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
	cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
	    ; Turn the element count into a byte count. The 32-bit shift also clears
	    ; the upper half of lenq on x86-64.
	    shl     lend, 2
	    ; Point both pointers at the end of their buffers and index them with a
	    ; negative offset that counts up to zero.
	    add     srcq, lenq
	    add     dstq, lenq
	    neg     lenq
	    ; The loop below tests its condition only after the first pass, so an
	    ; empty request must be filtered out here to avoid reading mul[0] and
	    ; converting 8 elements past the end of both buffers. ZF still reflects
	    ; the result of neg.
	    jz      .end
	.loop:
	    movss     m0, [mulq]          ; multiplier for this group of 8
	    SPLATD    m0                  ; m0 = multiplier in every lane
	%if cpuflag(sse2)
	    cvtdq2ps  m1, [srcq+lenq   ]  ; 4 x int32 -> 4 x float
	    cvtdq2ps  m2, [srcq+lenq+16]
	%else
	    ; cvtpi2ps converts only two integers, into the low half of its
	    ; destination; movlhps then joins two halves into one full vector.
	    cvtpi2ps  m1, [srcq+lenq   ]
	    cvtpi2ps  m3, [srcq+lenq+ 8]
	    cvtpi2ps  m2, [srcq+lenq+16]
	    cvtpi2ps  m4, [srcq+lenq+24]
	    movlhps   m1, m3
	    movlhps   m2, m4
	%endif
	    mulps     m1, m0
	    mulps     m2, m0
	    mova  [dstq+lenq   ], m1
	    mova  [dstq+lenq+16], m2
	    add     mulq, 4               ; next multiplier
	    add     lenq, 32              ; 8 elements done
	    jl .loop                      ; offset still negative: more to convert
	.end:
	%if notcpuflag(sse2)
	    ;; The documentation says cvtpi2ps switches to MMX state even when its
	    ;; source is a memory location. That is possibly a documentation error,
	    ;; since every tested CPU disagrees with it. emms is issued anyway; its
	    ;; cost hardly matters because the vast majority of machines will use
	    ;; the SSE2 variant.
	    emms
	%endif
	    RET
	%endmacro

	; Instantiate the function once per instruction set. The macro takes no
	; argument: its cglobal line always requests 5 XMM registers, which covers
	; both the cvtpi2ps path (m0-m4) and the SSE2 path (m0-m2).
	; NOTE(review): the same macro is also defined and instantiated earlier in
	; this file, so these lines emit int32_to_float_fmul_array8 a second time
	; for each instruction set - one of the two copies should be dropped.
	INIT_XMM sse
	INT32_TO_FLOAT_FMUL_ARRAY8
	INIT_XMM sse2
	INT32_TO_FLOAT_FMUL_ARRAY8