;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

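; rounding bias for the 8-tap filters: the taps sum to 128 (7-bit filter
; precision), so results are normalized as (sum + 64) >> 7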
pd_64: times 8 dd 64

cextern pw_1023
cextern pw_4095

SECTION .text

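; Horizontal 8-tap filter, 4 pixels wide, for 10/12-bit content. filtery
; points at four 32-byte blocks holding the tap pairs (0/1, 2/3, 4/5, 6/7)
; interleaved for pmaddwd; the _12 entry point below only swaps the clamp
; maximum (pw_4095 instead of pw_1023) and jumps into the shared .body.
; Per output sample this computes, roughly (a scalar sketch; bpp stands
; for the bit depth):
;
;   sum = 0;
;   for (k = 0; k < 8; k++)
;       sum += filter[k] * src[x + k - 3];
;   dst[x] = av_clip((sum + 64) >> 7, 0, (1 << bpp) - 1);
;
; where the avg variants additionally average the result with dst[x].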
%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
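    ; for 4 output pixels we need input words x-3 .. x+4; the staggered
    ; loads plus punpcklwd pair up consecutive samples so that each
    ; pmaddwd applies one interleaved tap pair to all 4 pixels at once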
    movh        m0, [srcq-6]
    movh        m1, [srcq-4]
    movh        m2, [srcq-2]
    movh        m3, [srcq+0]
    movh        m4, [srcq+2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    movu        m1, [srcq+4]
    movu        m3, [srcq+6]
    paddd       m0, m2
    movu        m2, [srcq+8]
    add       srcq, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
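    ; normalize: add the 64 bias, shift by FILTER_BITS (7), then pack and
    ; clamp to the pixel maximum in m5; sse4's packusdw already clips the
    ; low end to zero, while sse2 needs packssdw plus an explicit pmaxsw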
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg

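; Full-width horizontal 8-tap filter: mmsize/2 pixels per row (8 with xmm,
; 16 with ymm). Same math as filter_h4_fn, but partial sums for even output
; positions accumulate in m0 and odd positions in m1, with the same
; normalization and clamping at the end.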
%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
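    ; overlapping unaligned loads, each shifted by one sample, feed the
    ; pmaddwds; even output positions accumulate in m0, odd ones in m1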
    movu        m0, [srcq-6]
    movu        m1, [srcq-4]
    movu        m2, [srcq-2]
    movu        m3, [srcq+0]
    movu        m4, [srcq+2]
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
    pmaddwd     m4, m9
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
    pmaddwd     m4, [filteryq+64]
%endif
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4
    movu        m2, [srcq+4]
    movu        m3, [srcq+6]
    movu        m4, [srcq+8]
    add       srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m9
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m2, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4
    paddd       m0, m6
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
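    ; merge the even (m0) and odd (m1) halves back into pixel order; on
    ; avx2 punpcklwd works per 128-bit lane, which matches the order in
    ; which the loads above filled the lanes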
    punpcklwd   m0, m1
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif

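; Vertical 8-tap filter, 4 pixels wide. The 8 source rows are addressed
; through two pointers: srcq for taps 0-3 and src4q (= srcq + 4 rows) for
; taps 4-7, with sstride3q = 3*sstride. On x86-32 there are not enough
; registers to keep h resident, so it is accessed from the stack (r4mp).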
%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
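    ; one column of 8 input rows per output row: taps 0-3 come via srcq,
    ; taps 4-7 via src4q; both pointers advance one row per iteration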
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+ 32]
%endif
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m3, [filteryq+ 96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

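; Full-width vertical 8-tap filter (mmsize/2 pixels per row). On x86-64 the
; rounding constant stays in m11 (hence the 13-register default) and m12 is
; the zero register for the sse2 clamp; on x86-32 pd_64 is re-read from
; memory inside the loop instead.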
%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m12, m12
%endif
%if ARCH_X86_64
    mova       m11, [pd_64]
%endif
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movu        m4, [src4q]
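    ; SBUTTERFLY wd interleaves the words of two adjacent rows, so each
    ; pmaddwd below applies one interleaved tap pair across a whole
    ; register of pixel columns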
    SBUTTERFLY  wd, 0, 1, 6
    SBUTTERFLY  wd, 2, 3, 6
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [filteryq+ 32]
    pmaddwd     m3, [filteryq+ 32]
%endif
    paddd       m0, m2
    paddd       m1, m3
    movu        m2, [src4q+sstrideq]
    movu        m3, [src4q+sstrideq*2]
    SBUTTERFLY  wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m2, m9
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m2, [filteryq+ 64]
%endif
    paddd       m0, m4
    paddd       m1, m2
    movu        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    SBUTTERFLY  wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m3, [filteryq+ 96]
    pmaddwd     m4, [filteryq+ 96]
%endif
    paddd       m0, m3
    paddd       m1, m4
%if ARCH_X86_64
    paddd       m0, m11
    paddd       m1, m11
%else
    paddd       m0, [pd_64]
    paddd       m1, [pd_64]
%endif
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m1
%else
    packssdw    m0, m1
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m12
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif