| ;****************************************************************************** |
| ;* SIMD-optimized JPEG2000 DSP functions |
| ;* Copyright (c) 2014 Nicolas Bertrand |
| ;* Copyright (c) 2015 James Almer |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or |
| ;* modify it under the terms of the GNU Lesser General Public |
| ;* License as published by the Free Software Foundation; either |
| ;* version 2.1 of the License, or (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ;* Lesser General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU Lesser General Public |
| ;* License along with FFmpeg; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ;****************************************************************************** |
| |
| %include "libavutil/x86/x86util.asm" |
| |
SECTION_RODATA 32

; Multipliers consumed by ICT_FLOAT. Every value is replicated eight times
; (8 x 4 bytes = 32 bytes), so one aligned load fills a whole vector register
; in the XMM builds as well as in the YMM builds.
pf_ict0:    times 8 dd 1.402        ; applied to src2 when forming the new src0
pf_ict1:    times 8 dd 0.34413      ; applied to src1 when forming the new src1
pf_ict2:    times 8 dd 0.71414      ; applied to src2 when forming the new src1
pf_ict3:    times 8 dd 1.772        ; applied to src1 when forming the new src2

SECTION .text
| |
;***********************************************************************
; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
;
; Transforms three float planes in place, one vector (mmsize bytes) per
; loop iteration:
;     src0' = src0 + pf_ict0 * src2
;     src1' = src0 - pf_ict1 * src1 - pf_ict2 * src2
;     src2' = src0 + pf_ict3 * src1
;
; The single macro argument is the vector register count given to cglobal.
;
; Every load and store is an aligned move (movaps), and the loop is tested
; at the bottom, so one full vector is always processed.
; NOTE(review): presumably the callers guarantee csize > 0 and buffers that
; are suitably aligned and padded to a multiple of the vector size; this
; code does not check either -- confirm against the callers.
;***********************************************************************
%macro ICT_FLOAT 1
cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
    ; csize arrives as an element count; scale it to bytes (4 per float).
    ; The 32-bit shift also clears the upper half of csizeq on x86-64.
    shl       csized, 2
    ; Point each plane at its end and walk a negative byte offset up to 0.
    add       src2q, csizeq
    add       src1q, csizeq
    add       src0q, csizeq
    neg       csizeq

    ; The first two multipliers always live in registers.
    %define ICT0 m6
    %define ICT1 m7
    movaps    m6, [pf_ict0]
    movaps    m7, [pf_ict1]

    ; Third multiplier: register on x86-64, memory operand on x86-32.
%if ARCH_X86_64
    movaps    m8, [pf_ict2]
    %define ICT2 m8
%else
    %define ICT2 [pf_ict2]
%endif

    ; Fourth multiplier: only the plain SSE path below uses m3 as a
    ; temporary, so AVX and newer builds can park the constant there.
    ; Otherwise use m9 on x86-64, or a memory operand on x86-32.
%if cpuflag(avx)
    movaps    m3, [pf_ict3]
    %define ICT3 m3
%elif ARCH_X86_64
    movaps    m9, [pf_ict3]
    %define ICT3 m9
%else
    %define ICT3 [pf_ict3]
%endif

align 16
.loop:
    movaps    m0, [src0q+csizeq]        ; m0 = src0
    movaps    m1, [src1q+csizeq]        ; m1 = src1
    movaps    m2, [src2q+csizeq]        ; m2 = src2

    ; Each variant leaves: m4 = new src0, m5 = new src1, m0 = new src2.
    ; fmaddps  d, a, b, c  ->  d = a * b + c
    ; fnmaddps d, a, b, c  ->  d = c - a * b
%if cpuflag(fma4)
    fnmaddps  m5, m1, ICT1, m0          ; m5 = src0 - ICT1 * src1
    fmaddps   m4, m2, ICT0, m0          ; m4 = src0 + ICT0 * src2
    fmaddps   m0, m1, ICT3, m0          ; m0 = src0 + ICT3 * src1
    fnmaddps  m5, m2, ICT2, m5          ; m5 = m5   - ICT2 * src2
%elif cpuflag(fma3)
    ; Same maths as the fma4 variant, but the destination is first made a
    ; copy of one of the inputs and then reused as a source operand.
    movaps    m5, m1
    movaps    m4, m2
    fnmaddps  m5, m5, ICT1, m0          ; m5 = src0 - ICT1 * src1
    fmaddps   m4, m4, ICT0, m0          ; m4 = src0 + ICT0 * src2
    fmaddps   m0, m1, ICT3, m0          ; m0 = src0 + ICT3 * src1
    fnmaddps  m5, m2, ICT2, m5          ; m5 = m5   - ICT2 * src2
%elif cpuflag(avx)
    mulps     m5, m1, ICT1              ; m5 = ICT1 * src1
    mulps     m4, m2, ICT0              ; m4 = ICT0 * src2
    mulps     m1, m1, ICT3              ; m1 = ICT3 * src1
    mulps     m2, m2, ICT2              ; m2 = ICT2 * src2
    subps     m5, m0, m5                ; m5 = src0 - ICT1 * src1
    addps     m4, m4, m0                ; m4 = ICT0 * src2 + src0
    addps     m0, m0, m1                ; m0 = src0 + ICT3 * src1
    subps     m5, m5, m2                ; m5 = m5 - ICT2 * src2
%else ; sse
    movaps    m3, m1
    movaps    m4, m2
    movaps    m5, m0
    mulps     m3, ICT1                  ; m3 = ICT1 * src1
    mulps     m4, ICT0                  ; m4 = ICT0 * src2
    mulps     m1, ICT3                  ; m1 = ICT3 * src1
    mulps     m2, ICT2                  ; m2 = ICT2 * src2
    subps     m5, m3                    ; m5 = src0 - ICT1 * src1
    addps     m4, m0                    ; m4 = ICT0 * src2 + src0
    addps     m0, m1                    ; m0 = src0 + ICT3 * src1
    subps     m5, m2                    ; m5 = m5 - ICT2 * src2
%endif

    movaps    [src0q+csizeq], m4
    movaps    [src2q+csizeq], m0
    movaps    [src1q+csizeq], m5
    add       csizeq, mmsize            ; advance by one vector
    jl        .loop                     ; continue while the offset is negative
    RET
%endmacro

; Register counts: the SSE build on x86-64 keeps all four multipliers in
; m6-m9, hence 10; the AVX/FMA builds keep the fourth one in m3, so m8 is
; the highest register touched, hence 9.
INIT_XMM sse
ICT_FLOAT 10
INIT_YMM avx
ICT_FLOAT 9
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
ICT_FLOAT 9
%endif
INIT_YMM fma3
ICT_FLOAT 9
| |
;***************************************************************************
; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
;
; Transforms three int32 planes in place, one vector (mmsize bytes) per
; loop iteration:
;     t     = src0 - ((src1 + src2) >> 2)       (arithmetic shift)
;     src0' = src2 + t
;     src1' = t
;     src2' = src1 + t
;
; Loads and stores go through mova, x86inc's aligned move, and the loop is
; tested at the bottom, so one full vector is always processed.
; NOTE(review): presumably the callers guarantee csize > 0 and buffers that
; are suitably aligned and padded to a multiple of the vector size; this
; code does not check either -- confirm against the callers.
;***************************************************************************
%macro RCT_INT 0
cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
    ; csize arrives as an element count; scale it to bytes (4 per int32).
    ; The 32-bit shift also clears the upper half of csizeq on x86-64.
    shl       csized, 2
    ; Point each plane at its end and walk a negative byte offset up to 0.
    add       src2q, csizeq
    add       src1q, csizeq
    add       src0q, csizeq
    neg       csizeq

align 16
.loop:
    mova      m0, [src0q+csizeq]        ; m0 = src0
    mova      m1, [src1q+csizeq]        ; m1 = src1
    mova      m2, [src2q+csizeq]        ; m2 = src2
    paddd     m3, m1, m2                ; m3 = src1 + src2
    psrad     m3, 2                     ; m3 = (src1 + src2) >> 2, sign kept
    psubd     m0, m3                    ; m0 = t
    paddd     m1, m0                    ; m1 = src1 + t
    paddd     m2, m0                    ; m2 = src2 + t
    mova      [src1q+csizeq], m0        ; src1' = t
    mova      [src2q+csizeq], m1        ; src2' = src1 + t
    mova      [src0q+csizeq], m2        ; src0' = src2 + t
    add       csizeq, mmsize            ; advance by one vector
    jl        .loop                     ; continue while the offset is negative
    RET
%endmacro

INIT_XMM sse2
RCT_INT
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
RCT_INT
%endif