| /* |
| * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
| * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "libavutil/aarch64/asm.S" |
| #include "neon.S" |
| |
| /* H.264 qpel MC */ |
| |
// Load the two filter multipliers used by the lowpass macros into v6:
// r is set to 0x00140005 and copied to v6.s[0], giving v6.h[0] = 5 and
// v6.h[1] = 20.  r must be a 32-bit (wN) scratch register; it is clobbered.
.macro  lowpass_const   r
        movz            \r,  #5                 // bits 15:0  = 5
        movk            \r,  #20, lsl #16       // bits 31:16 = 20
        mov             v6.s[0], \r
.endm
| |
// Horizontal 6-tap lowpass of two rows, 8 output pixels each (8-bit samples).
//   r0:r1 -> d0 and r2:r3 -> d1; each register pair supplies the source
//   bytes of one row, the first register holding the lower addresses.
//   out[i] = p[i] + p[i+5] + v6.h[1] * (p[i+2] + p[i+3])
//                          - v6.h[0] * (p[i+1] + p[i+4])
// narrow=1 (default): round, shift right by 5 and saturate to unsigned 8 bit.
// narrow=0: leave the 16-bit sums in d0.8h / d1.8h.
// Expects v6 from lowpass_const.  Work on the two rows is interleaved; the
// callers in this file pass d0 = r0 and d1 = r1 or r2.
// trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b            // row 0: p[i+2] + p[i+3]
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b            // row 0: p[i+1] + p[i+4]
        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v1.8b            // row 0: p[i] + p[i+5]
        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
        uaddl           v0.8h,      v0.8b,     v1.8b            // row 1: p[i+2] + p[i+3]
        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
        uaddl           v1.8h,      v1.8b,     v3.8b            // row 1: p[i+1] + p[i+4]
        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
        uaddl           \d1\().8h,  \r2\().8b, v2.8b            // row 1: p[i] + p[i+5]
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
| |
// Vertical 6-tap lowpass producing two output rows of 8 pixels (8-bit).
//   r0..r6 are seven consecutive source rows (.8b each):
//   d0 = r0 + r5 + v6.h[1] * (r2 + r3) - v6.h[0] * (r1 + r4)
//   d1 = r1 + r6 + v6.h[1] * (r3 + r4) - v6.h[0] * (r2 + r5)
// narrow=1 (default): round, shift right by 5 and saturate to unsigned 8 bit.
// narrow=0: leave the 16-bit sums in d0.8h / d1.8h.
// Expects v6 from lowpass_const.  The callers in this file pass d0 = r0 and
// d1 = r1, which works because the inner sums are formed before d0/d1 are
// written.
// trashes v0-v4
.macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
        uaddl           v2.8h,      \r2\().8b, \r3\().8b        // centre pair, row 0
        uaddl           v0.8h,      \r3\().8b, \r4\().8b        // centre pair, row 1
        uaddl           v4.8h,      \r1\().8b, \r4\().8b        // inner pair,  row 0
        uaddl           v1.8h,      \r2\().8b, \r5\().8b        // inner pair,  row 1
        uaddl           \d0\().8h,  \r0\().8b, \r5\().8b        // outer pair,  row 0
        uaddl           \d1\().8h,  \r1\().8b, \r6\().8b        // outer pair,  row 1
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
| |
// Horizontal 6-tap lowpass of two rows, in place and without narrowing.
//   On entry r0 and r1 each hold 16 source bytes of one row (.16b); on exit
//   r0.8h and r1.8h hold the eight unscaled 16-bit filter sums of that row.
//   The shifted taps are taken with ext of a register against itself; only
//   the low 8 bytes of each rotated copy are consumed.
// Expects v6 from lowpass_const.
// trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
        uaddl           v0.8h,      v0.8b,      v1.8b           // r0: p[i+2] + p[i+3]
        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
        uaddl           v2.8h,      v2.8b,      v3.8b           // r0: p[i+1] + p[i+4]
        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
        uaddl           \r0\().8h,  \r0\().8b,  v30.8b          // r0: p[i] + p[i+5]
        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
        mla             \r0\().8h,  v0.8h,      v6.h[1]
        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
        uaddl           v4.8h,      v4.8b,      v5.8b           // r1: p[i+2] + p[i+3]
        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
        mls             \r0\().8h,  v2.8h,      v6.h[0]
        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
        uaddl           v7.8h,      v7.8b,      v0.8b           // r1: p[i+1] + p[i+4]
        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
        uaddl           \r1\().8h,  \r1\().8b,  v31.8b          // r1: p[i] + p[i+5]
        mla             \r1\().8h,  v4.8h,      v6.h[1]
        mls             \r1\().8h,  v7.8h,      v6.h[0]
.endm
| |
// Horizontal 6-tap lowpass of a single row, 8 output pixels (8-bit samples).
//   r0:r1 supply the source bytes of the row, the result goes to d0:
//   out[i] = p[i] + p[i+5] + v6.h[1] * (p[i+2] + p[i+3])
//                          - v6.h[0] * (p[i+1] + p[i+4])
// narrow=1 (default): round, shift right by 5 and saturate to unsigned 8 bit.
// narrow=0: leave the 16-bit sums in d0.8h.
// Expects v6 from lowpass_const.
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b            // p[i+2] + p[i+3]
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b            // p[i+1] + p[i+4]
        ext             v30.8b,     \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v30.8b           // p[i] + p[i+5]
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        mls             \d0\().8h,  v4.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
  .endif
.endm
| |
// Vertical 6-tap lowpass over signed 16-bit intermediate rows, 8 pixels.
//   r0..r5 are six consecutive rows of 16-bit sums (.8h each); the result is
//   written to r0.8b:
//   r0 = sat_u8((r0 + r5 + 20 * (r2 + r3) - 5 * (r1 + r4) + 512) >> 10)
// The sums are widened to 32 bit and the multiplications are done with
// shifts and adds: 20 * x = (x << 4) + (x << 2), 5 * x = x + (x << 2).
// trashes v0-v7 (v6 included, so the lowpass_const value is lost)
.macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
        saddl           v5.4s,      \r2\().4h,  \r3\().4h       // c = r2 + r3, low half
        saddl2          v1.4s,      \r2\().8h,  \r3\().8h       // c, high half
        saddl           v6.4s,      \r1\().4h,  \r4\().4h       // b = r1 + r4, low half
        saddl2          v2.4s,      \r1\().8h,  \r4\().8h       // b, high half
        saddl           v0.4s,      \r0\().4h,  \r5\().4h       // a = r0 + r5, low half
        saddl2          v4.4s,      \r0\().8h,  \r5\().8h       // a, high half

        // low half: v5 = 20 * c, v6 = 5 * b
        shl             v3.4s,      v5.4s,      #4
        shl             v5.4s,      v5.4s,      #2
        shl             v7.4s,      v6.4s,      #2
        add             v5.4s,      v5.4s,      v3.4s
        add             v6.4s,      v6.4s,      v7.4s

        // high half: v1 = 20 * c, v2 = 5 * b
        shl             v3.4s,      v1.4s,      #4
        shl             v1.4s,      v1.4s,      #2
        shl             v7.4s,      v2.4s,      #2
        add             v1.4s,      v1.4s,      v3.4s
        add             v2.4s,      v2.4s,      v7.4s

        add             v5.4s,      v5.4s,      v0.4s           // 20c + a
        sub             v5.4s,      v5.4s,      v6.4s           // 20c + a - 5b

        add             v1.4s,      v1.4s,      v4.4s
        sub             v1.4s,      v1.4s,      v2.4s

        rshrn           v5.4h,      v5.4s,      #10
        rshrn2          v5.8h,      v1.4s,      #10

        sqxtun          \r0\().8b,  v5.8h
.endm
| |
// 16x16 horizontal lowpass (put) into a packed temporary buffer.
//   x0 = buffer, written with a fixed 8-byte row stride
//   x1 = src (the caller in this file has already moved it 2 pixels left)
//   x2 = src stride
// The left 8 columns are filtered first (16 rows), then the right 8 columns.
// x0 is not rewound in between, so the right half is stored directly behind
// the left half.  Uses x3, x4 and x12 in addition to the registers of the
// 8-wide function.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30                // keep LR across the bl
        mov             x3,  #8                 // packed destination stride
        mov             x12, #16                // rows
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x30, x4
        sub             x1,  x1,  x2, lsl #4    // back up 16 source rows
        add             x1,  x1,  #8            // right half
        mov             x12, #16
        b               put_h264_qpel8_h_lowpass_neon
endfunc
| |
// Horizontal half-pel lowpass, type = put or avg (avg additionally averages
// the result with what is already stored at dst).
//   x0 = dst, x3 = dst stride
//   x1 = src (moved 2 pixels left by the callers), x2 = src stride
//   v6 = filter constants from lowpass_const
.macro  h264_qpel_h_lowpass type
// 16 pixels wide, 16 rows.  The left 8 columns are done with a call, then
// both pointers are rewound by 16 rows and moved 8 pixels to the right.
// There is no return or branch at the end: execution falls through into the
// 8-wide function that follows, which handles the right half and returns to
// the original caller.  Uses x12 and x13.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30                // keep LR across the bl
        mov             x12, #16                // rows, left half
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4
        sub             x0,  x0,  x3, lsl #4
        add             x1,  x1,  #8
        add             x0,  x0,  #8
        mov             x30, x13
        mov             x12, #16                // rows, right half
endfunc

// 8 pixels wide, x12 rows.  Two rows are handled per iteration and the loop
// ends when x12 reaches exactly zero, so x12 must be even and non-zero.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8b, v29.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        subs            x12, x12, #2            // flags are consumed by b.ne below
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8b},  [x0], x3
        ld1             {v3.8b},  [x0]
        urhadd          v28.8b, v28.8b, v2.8b
        urhadd          v16.8b, v16.8b, v3.8b
        sub             x0,  x0,  x3            // undo the post-increment above
  .endif
        st1             {v28.8b}, [x0], x3
        st1             {v16.8b}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
| |
// Horizontal lowpass averaged with a second source ("l2"), type = put or avg.
//   x0 = dst, x1 = src (moved 2 pixels left by the callers),
//   x3 = second source that the filtered rows are averaged with,
//   x2 = stride, used for dst, src and the second source alike
//   v6 = filter constants from lowpass_const
.macro  h264_qpel_h_lowpass_l2 type
// 16 pixels wide, 16 rows.  Left half by call, then all three pointers are
// rewound by 16 rows and moved 8 pixels right; execution then falls through
// into the 8-wide function below for the right half.  Uses x12 and x13.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30                // keep LR across the bl
        mov             x12, #16                // rows, left half
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        sub             x1,  x1,  x2, lsl #4
        add             x1,  x1,  #8
        sub             x3,  x3,  x2, lsl #4
        add             x3,  x3,  #8
        mov             x30, x13
        mov             x12, #16                // rows, right half
endfunc

// 8 pixels wide, x12 rows (even, non-zero), two rows per iteration.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8b, v27.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        ld1             {v28.8b}, [x3], x2
        ld1             {v29.8b}, [x3], x2
        subs            x12, x12, #2            // flags are consumed by b.ne below
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8b, v26.8b, v28.8b
        urhadd          v27.8b, v27.8b, v29.8b
  .ifc \type,avg
        ld1             {v2.8b},  [x0], x2
        ld1             {v3.8b},  [x0]
        urhadd          v26.8b, v26.8b, v2.8b
        urhadd          v27.8b, v27.8b, v3.8b
        sub             x0,  x0,  x2            // undo the post-increment above
  .endif
        st1             {v26.8b}, [x0], x2
        st1             {v27.8b}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
| |
// 16x16 vertical lowpass (put) into a packed temporary buffer.
//   x0 = buffer, written with a fixed 8-byte row stride (x2 is set to 8 here)
//   x1 = src, x3 = src stride
// Four 8x8 blocks are produced in the order top-left, bottom-left,
// top-right, bottom-right.  x0 simply keeps advancing, so the right column
// is stored behind the left one.  Each 8x8 call leaves x1 12 rows further
// down, hence the 4-row corrections.  Uses x4 for the return address.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30                // keep LR across the calls
        mov             x2,  #8                 // packed destination stride
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4 = 8 rows down
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #4    // back to the first row:
        sub             x1,  x1,  x3, lsl #2    //   20 rows up
        add             x1,  x1,  #8            // right 8 columns
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc
| |
// Vertical half-pel lowpass, type = put or avg (avg additionally averages
// the result with what is already stored at dst).
//   x0 = dst, x2 = dst stride
//   x1 = src (moved 2 rows up by the callers), x3 = src stride
//   v6 = filter constants from lowpass_const
.macro  h264_qpel_v_lowpass type
// 16x16 as four 8x8 blocks: top-left, bottom-left, top-right and, by falling
// through into the 8x8 function below, bottom-right.  Uses x4 for the
// return address.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30                // keep LR across the calls
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // src is 12 rows down, want 8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2, lsl #4    // dst: 16 rows up,
        add             x0,  x0,  #8            //      8 pixels right
        sub             x1,  x1,  x3, lsl #4    // src: 20 rows up,
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8            //      8 pixels right
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8: reads 13 source rows and writes 8 rows.  On return x1 points at the
// 13th source row and x0 at the row after the last one written.
function \type\()_h264_qpel8_v_lowpass_neon
  .irp row, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b, v24.8b, v25.8b, v26.8b, v27.8b
        ld1             {\row}, [x1], x3
  .endr
        ld1             {v28.8b}, [x1]

        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
  .ifc \type,avg
        ld1             {v24.8b}, [x0], x2
        ld1             {v25.8b}, [x0], x2
        ld1             {v26.8b}, [x0], x2
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x2
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x2
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x2
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x2
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x2
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x2, lsl #3    // back to the first dst row
  .endif

  .irp row, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b
        st1             {\row}, [x0], x2
  .endr

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
| |
// Vertical lowpass averaged with a second source ("l2"), type = put or avg.
//   x0  = dst, x1 = src (moved 2 rows up by the callers),
//   x3  = stride used for both dst and src
//   x12 = second source that the filtered rows are averaged with,
//   x2  = stride of the second source
//   v6  = filter constants from lowpass_const
.macro  h264_qpel_v_lowpass_l2 type
// 16x16 as four 8x8 blocks: top-left, bottom-left, top-right and, by falling
// through into the 8x8 function below, bottom-right.  Uses x4 for the
// return address.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30                // keep LR across the calls
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // src is 12 rows down, want 8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3, lsl #4    // dst: 16 rows up,
        add             x0,  x0,  #8            //      8 pixels right
        sub             x12, x12, x2, lsl #4    // second source likewise
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4    // src: 20 rows up,
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8            //      8 pixels right
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8: reads 13 source rows and 8 rows of the second source, writes 8 rows.
function \type\()_h264_qpel8_v_lowpass_l2_neon
  .irp row, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b, v24.8b, v25.8b, v26.8b, v27.8b
        ld1             {\row}, [x1], x3
  .endr
        ld1             {v28.8b}, [x1]

        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23

        // average with the second source
        ld1             {v24.8b}, [x12], x2
        ld1             {v25.8b}, [x12], x2
        ld1             {v26.8b}, [x12], x2
        ld1             {v27.8b}, [x12], x2
        ld1             {v28.8b}, [x12], x2
        urhadd          v16.8b, v24.8b, v16.8b
        urhadd          v17.8b, v25.8b, v17.8b
        ld1             {v29.8b}, [x12], x2
        urhadd          v18.8b, v26.8b, v18.8b
        urhadd          v19.8b, v27.8b, v19.8b
        ld1             {v30.8b}, [x12], x2
        urhadd          v20.8b, v28.8b, v20.8b
        urhadd          v21.8b, v29.8b, v21.8b
        ld1             {v31.8b}, [x12], x2
        urhadd          v22.8b, v30.8b, v22.8b
        urhadd          v23.8b, v31.8b, v23.8b

  .ifc \type,avg
        ld1             {v24.8b}, [x0], x3
        ld1             {v25.8b}, [x0], x3
        ld1             {v26.8b}, [x0], x3
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x3
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x3
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x3
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x3
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x3
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x3, lsl #3    // back to the first dst row
  .endif

  .irp row, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b
        st1             {\row}, [x0], x3
  .endr

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
| |
// Shared core of the 8x8 two-dimensional (horizontal then vertical) lowpass.
//   x1 = src, x3 = src stride; 13 rows of 16 bytes are read and x1 is left
//   pointing at the 13th row.
// Result: the eight filtered rows in v16.8b .. v23.8b; nothing is stored.
// The filter constants are (re)loaded here through w12 because lowpass_8.16
// overwrites v6.  v29 is never loaded; it only rides along in the last
// lowpass_8H pair and its result is not used.
// Clobbers w12, v0-v7 and v16-v31.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
  .irp row, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h
        ld1             {\row}, [x1], x3
  .endr
        ld1             {v28.8h}, [x1]

        // horizontal pass: every row becomes eight 16-bit sums
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        lowpass_8H      v28, v29

        // vertical pass: one output row per invocation, written to the
        // first register of each six-row window
        lowpass_8.16    v16, v17, v18, v19, v20, v21
        lowpass_8.16    v17, v18, v19, v20, v21, v22

        lowpass_8.16    v18, v19, v20, v21, v22, v23
        lowpass_8.16    v19, v20, v21, v22, v23, v24

        lowpass_8.16    v20, v21, v22, v23, v24, v25
        lowpass_8.16    v21, v22, v23, v24, v25, v26

        lowpass_8.16    v22, v23, v24, v25, v26, v27
        lowpass_8.16    v23, v24, v25, v26, v27, v28

        ret
endfunc
| |
// 8x8 two-dimensional lowpass, type = put or avg.
//   x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
// The filtering is done by put_h264_qpel8_hv_lowpass_neon_top, which leaves
// the rows in v16-v23; this function only averages with dst (avg) and
// stores.  The return address is kept in x10 and used directly by ret.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8b},  [x0], x2
        ld1             {v1.8b},  [x0], x2
        ld1             {v2.8b},  [x0], x2
        urhadd          v16.8b, v16.8b, v0.8b
        ld1             {v3.8b},  [x0], x2
        urhadd          v17.8b, v17.8b, v1.8b
        ld1             {v4.8b},  [x0], x2
        urhadd          v18.8b, v18.8b, v2.8b
        ld1             {v5.8b},  [x0], x2
        urhadd          v19.8b, v19.8b, v3.8b
        ld1             {v6.8b},  [x0], x2
        urhadd          v20.8b, v20.8b, v4.8b
        ld1             {v7.8b},  [x0], x2
        urhadd          v21.8b, v21.8b, v5.8b
        urhadd          v22.8b, v22.8b, v6.8b
        urhadd          v23.8b, v23.8b, v7.8b
        sub             x0,  x0,  x2, lsl #3    // back to the first dst row
  .endif

  .irp row, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b
        st1             {\row}, [x0], x2
  .endr

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
| |
// 8x8 two-dimensional lowpass averaged with a second source ("l2"),
// type = put or avg.
//   x0 = dst, x3 = stride used for dst and for the src reads of the core
//   x1 = src
//   x2 = second source: a packed 8x8 block, 8 bytes per row; 64 bytes are
//        read and x2 is advanced past them
// The return address is kept in x10 and used directly by ret.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        // average the filtered rows (v16-v23) with the packed block
        ld1             {v0.8b, v1.8b}, [x2], #16
        ld1             {v2.8b, v3.8b}, [x2], #16
        urhadd          v0.8b,  v0.8b,  v16.8b
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v4.8b, v5.8b}, [x2], #16
        urhadd          v2.8b,  v2.8b,  v18.8b
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v6.8b, v7.8b}, [x2], #16
        urhadd          v4.8b,  v4.8b,  v20.8b
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
  .ifc \type,avg
        ld1             {v16.8b}, [x0], x3
        ld1             {v17.8b}, [x0], x3
        ld1             {v18.8b}, [x0], x3
        urhadd          v0.8b,  v0.8b,  v16.8b
        ld1             {v19.8b}, [x0], x3
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v20.8b}, [x0], x3
        urhadd          v2.8b,  v2.8b,  v18.8b
        ld1             {v21.8b}, [x0], x3
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v22.8b}, [x0], x3
        urhadd          v4.8b,  v4.8b,  v20.8b
        ld1             {v23.8b}, [x0], x3
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
        sub             x0,  x0,  x3, lsl #3    // back to the first dst row
  .endif
  .irp row, v0.8b, v1.8b, v2.8b, v3.8b, v4.8b, v5.8b, v6.8b, v7.8b
        st1             {\row}, [x0], x3
  .endr

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
| |
// 16x16 two-dimensional lowpass built from four 8x8 calls in the order
// top-left, bottom-left, top-right, bottom-right; type = put or avg.
// The return address is parked in x13 because the 8x8 functions use x10.
.macro  h264_qpel16_hv type
//   x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // src is 12 rows down, want 8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x0,  x0,  x2, lsl #4    // dst: 16 rows up,
        add             x0,  x0,  #8            //      8 pixels right
        sub             x1,  x1,  x3, lsl #4    // src: 20 rows up,
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8            //      8 pixels right
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

//   x0 = dst, x1 = src, x3 = stride for both
//   x4 = address 256 bytes past the start of the packed second source; the
//        8x8 calls consume that block 64 bytes at a time through x2
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256          // start of the packed block
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // src is 12 rows down, want 8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x0,  x0,  x3, lsl #4    // dst: 16 rows up,
        add             x0,  x0,  #8            //      8 pixels right
        sub             x1,  x1,  x3, lsl #4    // src: 20 rows up,
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8            //      8 pixels right
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
| |
// Exported 8x8 quarter-pel entry points, type = put or avg.  In mcXY, X is
// the horizontal and Y the vertical quarter-pel position.
//   x0 = dst, x1 = src, x2 = stride (used for dst and src)
// Conventions shared by the functions below:
//   x14     return address (the helpers keep theirs in x4, x10 or x13)
//   x8, x9  dst and src saved for the second filtering step
//   x11     stack pointer on entry, restored before returning
//   w3      scratch for lowpass_const, so x3 is only set up afterwards
// Several functions only prepare registers and branch to a label inside
// another one (mc01, mc11, mc12, mc21).
.macro  h264_qpel8 type
// horizontal half-pel, averaged with the pixel at src
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x12, #8                 // rows
        mov             x3,  x1                 // second source: src itself
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// horizontal half-pel
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        mov             x12, #8                 // rows
        mov             x3,  x2                 // dst stride
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// horizontal half-pel, averaged with the pixel at src + 1
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        mov             x12, #8                 // rows
        add             x3,  x1,  #1            // second source: src + 1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// vertical half-pel, averaged with the pixel at src (x12 = second source)
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // two rows up
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// Average of a horizontal and a vertical half-pel result.  At the inner
// label x1 is the source of the horizontal filter and x9 the source of the
// vertical filter.  The horizontal result goes to a 64-byte stack buffer.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8                 // buffer stride
        mov             x12, #8                 // rows
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp                 // second source: the buffer
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8                 // ... read with stride 8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Average of a horizontal half-pel result and the two-dimensional result.
// At the inner label x1 is the source of the horizontal filter and x9 the
// source of the two-dimensional filter.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8                 // buffer stride
        mov             x0,  sp
        mov             x12, #8                 // rows
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                 // x0 is now 64 bytes past sp
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64           // start of the packed block
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical filter taken one pixel to the right
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

// vertical half-pel
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1    // two rows up
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// Average of a vertical half-pel result and the two-dimensional result.
// At the inner label x1 is the source of the vertical filter and x9 the
// source of the two-dimensional filter.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #8                 // buffer stride
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                 // x0 is now 64 bytes past sp
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64           // start of the packed block
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// two-dimensional half-pel; the filter constants are loaded by the core
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc12, with the vertical filter taken one pixel to the right
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// as mc01, averaged with the pixel one row below src
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

// as mc11, with the horizontal filter taken one row further down
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// as mc21, with the horizontal filter taken one row further down
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// as mc11, horizontal filter one row down, vertical filter one pixel right
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
| |
// Exported 16x16 quarter-pel entry points, type = put or avg.  In mcXY, X is
// the horizontal and Y the vertical quarter-pel position.
//   x0 = dst, x1 = src, x2 = stride (used for dst and src)
// Conventions shared by the functions below:
//   x14     return address (the helpers keep theirs in x4, x10 or x13)
//   x8, x9  dst and src saved for the second filtering step
//   x11     stack pointer on entry, restored before returning
//   w3      scratch for lowpass_const, so x3 is only set up afterwards
// The row counter x12 is set by the 16-wide helpers themselves.
.macro  h264_qpel16 type
// horizontal half-pel, averaged with the pixel at src
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                 // second source: src itself
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// horizontal half-pel
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        mov             x3,  x2                 // dst stride
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// horizontal half-pel, averaged with the pixel at src + 1
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1            // second source: src + 1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// vertical half-pel, averaged with the pixel at src (x12 = second source)
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // two rows up
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// Average of a horizontal and a vertical half-pel result.  At the inner
// label x1 is the source of the horizontal filter and x9 the source of the
// vertical filter.  The horizontal result goes to a 256-byte stack buffer.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16                // buffer stride
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp                 // second source: the buffer
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16                // ... read with stride 16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Average of a horizontal half-pel result and the two-dimensional result.
// At the inner label x1 is the source of the horizontal filter and x9 the
// source of the two-dimensional filter.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                 // end of the packed block
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical filter taken one pixel to the right
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

// vertical half-pel
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1    // two rows up
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// Average of a vertical half-pel result and the two-dimensional result.
// At the inner label x1 is the source of the vertical filter and x9 the
// source of the two-dimensional filter.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x0,  sp
        mov             x3,  x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                 // end of the packed block
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        mov             x2,  x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// two-dimensional half-pel
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11                // restore stack
        ret             x14
endfunc

// as mc12, with the vertical filter taken one pixel to the right
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// as mc01, averaged with the pixel one row below src
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel16_mc01
endfunc

// as mc11, with the horizontal filter taken one row further down
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// as mc21, with the horizontal filter taken one row further down
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// as mc11, horizontal filter one row down, vertical filter one pixel right
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg
| |
// Horizontal 6-tap lowpass of two rows, 8 output pixels each, for samples
// stored as 16-bit values and clipped to 1023.
//   r0:r1 -> d0 and r2:r3 -> d1; each pair holds 16 consecutive samples of
//   one row (.8h each), so the ext offsets are in bytes, 2 per sample.
//   sum    = p[i] + p[i+5] + v6.h[1] * (p[i+2] + p[i+3])
//   out[i] = min(1023, (max(0, sum - v6.h[0] * (p[i+1] + p[i+4])) + 16) >> 5)
// The clamp at zero comes from the saturating unsigned subtract.
// Expects v6 from lowpass_const.  The callers in this file pass d0 = r0 and
// d1 = r1 or an otherwise unused register.
// trashes v0-v5
.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
        ext             v2.16b,     \r0\().16b, \r1\().16b, #4
        ext             v3.16b,     \r0\().16b, \r1\().16b, #6
        add             v2.8h,      v2.8h,      v3.8h           // row 0: p[i+2] + p[i+3]
        ext             v4.16b,     \r0\().16b, \r1\().16b, #2
        ext             v5.16b,     \r0\().16b, \r1\().16b, #8
        add             v4.8h,      v4.8h,      v5.8h           // row 0: p[i+1] + p[i+4]
        ext             v1.16b,     \r0\().16b, \r1\().16b, #10

        add             \d0\().8h,  \r0\().8h,  v1.8h           // row 0: p[i] + p[i+5]
        ext             v0.16b,     \r2\().16b, \r3\().16b, #4
        mla             \d0\().8h,  v2.8h,      v6.h[1]
        ext             v1.16b,     \r2\().16b, \r3\().16b, #6
        add             v0.8h,      v0.8h,      v1.8h           // row 1: p[i+2] + p[i+3]
        ext             v1.16b,     \r2\().16b, \r3\().16b, #2
        mul             v5.8h,      v4.8h,      v6.h[0]
        uqsub           \d0\().8h,  \d0\().8h,  v5.8h
        urshr           \d0\().8h,  \d0\().8h,  #5

        ext             v3.16b,     \r2\().16b, \r3\().16b, #8
        add             v1.8h,      v1.8h,      v3.8h           // row 1: p[i+1] + p[i+4]
        ext             v2.16b,     \r2\().16b, \r3\().16b, #10

        add             \d1\().8h,  \r2\().8h,  v2.8h           // row 1: p[i] + p[i+5]
        mla             \d1\().8h,  v0.8h,      v6.h[1]
        mul             v5.8h,      v1.8h,      v6.h[0]
        uqsub           \d1\().8h,  \d1\().8h,  v5.8h
        mvni            v5.8h,      #0xFC,      lsl #8          // 1023 for clipping
        urshr           \d1\().8h,  \d1\().8h,  #5

        umin            \d0\().8h,  \d0\().8h,  v5.8h
        umin            \d1\().8h,  \d1\().8h,  v5.8h
.endm
| |
// Vertical 6-tap lowpass producing two output rows of 8 pixels, for samples
// stored as 16-bit values and clipped to 1023.
//   r0..r6 are seven consecutive source rows (.8h each):
//   d0 from r0 + r5 + v6.h[1] * (r2 + r3) - v6.h[0] * (r1 + r4)
//   d1 from r1 + r6 + v6.h[1] * (r3 + r4) - v6.h[0] * (r2 + r5)
//   each clamped at zero by the saturating subtract, then rounded, shifted
//   right by 5 and limited to 1023.
// Expects v6 from lowpass_const.  The callers in this file pass d0 = r0 and
// d1 = r1.
// trashes v0-v4
.macro  lowpass_8_10_v  r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1
        add             v2.8h,      \r2\().8h,  \r3\().8h       // centre pair, row 0
        add             v0.8h,      \r3\().8h,  \r4\().8h       // centre pair, row 1
        add             v4.8h,      \r1\().8h,  \r4\().8h       // inner pair,  row 0
        add             v1.8h,      \r2\().8h,  \r5\().8h       // inner pair,  row 1

        add             \d0\().8h,  \r0\().8h,  \r5\().8h       // outer pair,  row 0
        add             \d1\().8h,  \r1\().8h,  \r6\().8h       // outer pair,  row 1
        mla             \d0\().8h,  v2.8h,      v6.h[1]
        mla             \d1\().8h,  v0.8h,      v6.h[1]
        mul             v2.8h,      v4.8h,      v6.h[0]
        mul             v0.8h,      v1.8h,      v6.h[0]
        uqsub           \d0\().8h,  \d0\().8h,  v2.8h
        uqsub           \d1\().8h,  \d1\().8h,  v0.8h

        mvni            v0.8h,      #0xFC,      lsl #8          // 1023 for clipping

        urshr           \d0\().8h,  \d0\().8h,  #5
        urshr           \d1\().8h,  \d1\().8h,  #5

        umin            \d0\().8h,  \d0\().8h,  v0.8h
        umin            \d1\().8h,  \d1\().8h,  v0.8h
.endm
| |
// 16x16 horizontal lowpass (put) into a packed temporary buffer, 16-bit
// samples.
//   x0 = buffer, written with a fixed 16-byte row stride (8 samples)
//   x1 = src, x2 = src stride
// The left 8 columns are filtered first (16 rows), then the right 8 columns
// (16 bytes further right).  x0 is not rewound in between, so the right half
// is stored directly behind the left half.  x12 counts 2 per row in the
// 10-bit helpers, hence 32.  Uses x3, x4 and x12.
function put_h264_qpel16_h_lowpass_neon_packed_10
        mov             x4,  x30                // keep LR across the bl
        mov             x3,  #16                // packed destination stride
        mov             x12, #32                // 16 rows
        bl              put_h264_qpel8_h_lowpass_neon_10
        mov             x30, x4
        sub             x1,  x1,  x2, lsl #4    // back up 16 source rows
        add             x1,  x1,  #16           // right half
        mov             x12, #32
        b               put_h264_qpel8_h_lowpass_neon_10
endfunc
| |
// Horizontal half-pel lowpass for 16-bit samples, type = put or avg (avg
// additionally averages the result with what is already stored at dst).
//   x0 = dst, x3 = dst stride
//   x1 = src (moved 2 samples = 4 bytes left by the callers), x2 = src stride
//   v6 = filter constants from lowpass_const
.macro  h264_qpel_h_lowpass_10 type
// 16 samples wide, 16 rows.  Left half by call, then both pointers are
// rewound by 16 rows and moved 8 samples (16 bytes) right; execution then
// falls through into the 8-wide function below for the right half.
// Uses x12 and x13.
function \type\()_h264_qpel16_h_lowpass_neon_10
        mov             x13, x30                // keep LR across the bl
        mov             x12, #32                // 16 rows, left half
        bl              \type\()_h264_qpel8_h_lowpass_neon_10
        sub             x1,  x1,  x2, lsl #4
        sub             x0,  x0,  x3, lsl #4
        add             x1,  x1,  #16
        add             x0,  x0,  #16
        mov             x30, x13
        mov             x12, #32                // 16 rows, right half
endfunc

// 8 samples wide.  x12 is decremented by 4 for every two rows and the loop
// ends when it reaches exactly zero, so x12 = 2 * rows with an even row
// count.
function \type\()_h264_qpel8_h_lowpass_neon_10
1:      ld1             {v28.8h, v29.8h}, [x1], x2
        ld1             {v16.8h, v17.8h}, [x1], x2
        subs            x12, x12, #4            // flags are consumed by b.ne below
        lowpass_8_10    v28, v29, v16, v17, v28, v20
  .ifc \type,avg
        ld1             {v2.8h},  [x0], x3
        ld1             {v3.8h},  [x0]
        urhadd          v28.8h, v28.8h, v2.8h
        urhadd          v20.8h, v20.8h, v3.8h
        sub             x0,  x0,  x3            // undo the post-increment above
  .endif
        st1             {v28.8h}, [x0], x3
        st1             {v20.8h}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_10 put
        h264_qpel_h_lowpass_10 avg
| |
// Horizontal lowpass averaged with a second source ("l2") for 16-bit
// samples, type = put or avg.
//   x0 = dst, x1 = src (moved 4 bytes left by the callers),
//   x3 = second source that the filtered rows are averaged with,
//   x2 = stride, used for dst, src and the second source alike
//   v6 = filter constants from lowpass_const
.macro  h264_qpel_h_lowpass_l2_10 type
// 16 samples wide, 16 rows.  Left half by call, then all three pointers are
// rewound by 16 rows and moved 16 bytes right; execution then falls through
// into the 8-wide function below for the right half.  Uses x12 and x13.
function \type\()_h264_qpel16_h_lowpass_l2_neon_10
        mov             x13, x30                // keep LR across the bl
        mov             x12, #32                // 16 rows, left half
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #16
        sub             x1,  x1,  x2, lsl #4
        add             x1,  x1,  #16
        sub             x3,  x3,  x2, lsl #4
        add             x3,  x3,  #16
        mov             x30, x13
        mov             x12, #32                // 16 rows, right half
endfunc

// 8 samples wide; x12 = 2 * rows, decremented by 4 for every two rows.
function \type\()_h264_qpel8_h_lowpass_l2_neon_10
1:      ld1             {v26.8h, v27.8h}, [x1], x2
        ld1             {v16.8h, v17.8h}, [x1], x2
        ld1             {v28.8h}, [x3], x2
        ld1             {v29.8h}, [x3], x2
        subs            x12, x12, #4            // flags are consumed by b.ne below
        lowpass_8_10    v26, v27, v16, v17, v26, v27
        urhadd          v26.8h, v26.8h, v28.8h
        urhadd          v27.8h, v27.8h, v29.8h
  .ifc \type,avg
        ld1             {v2.8h},  [x0], x2
        ld1             {v3.8h},  [x0]
        urhadd          v26.8h, v26.8h, v2.8h
        urhadd          v27.8h, v27.8h, v3.8h
        sub             x0,  x0,  x2            // undo the post-increment above
  .endif
        st1             {v26.8h}, [x0], x2
        st1             {v27.8h}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2_10 put
        h264_qpel_h_lowpass_l2_10 avg
| |
// 16x16 vertical lowpass (put) into a packed temporary buffer, 16-bit
// samples.
//   x0 = buffer, written with a fixed 16-byte row stride (x2 is set here)
//   x1 = src, x3 = src stride
// Four 8x8 blocks are produced in the order top-left, bottom-left,
// top-right, bottom-right; x0 keeps advancing, so the right column is stored
// behind the left one.  Each 8x8 call leaves x1 12 rows further down, hence
// the 4-row corrections.  Uses x4 for the return address.
//
// This used to call the 8-bit put_h264_qpel8_v_lowpass_neon with an 8-byte
// stride and an 8-byte column step, which does not match 16-bit samples.  It
// now uses the 10-bit helper with 16-byte units, in line with
// put_h264_qpel16_h_lowpass_neon_packed_10.
function put_h264_qpel16_v_lowpass_neon_packed_10
        mov             x4,  x30                // keep LR across the calls
        mov             x2,  #16                // packed stride: 8 samples * 2 bytes
        bl              put_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #2    // 12 - 4 = 8 rows down
        bl              put_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #4    // back to the first row:
        sub             x1,  x1,  x3, lsl #2    //   20 rows up
        add             x1,  x1,  #16           // right 8 samples
        bl              put_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon_10
endfunc
| |
// Vertical half-pel lowpass for 16-bit samples, type = put or avg (avg
// additionally averages the result with what is already stored at dst).
//   x0 = dst, x2 = dst stride
//   x1 = src, x3 = src stride
//   v6 = filter constants from lowpass_const
.macro  h264_qpel_v_lowpass_10 type
// 16x16 as four 8x8 blocks: top-left, bottom-left, top-right and, by falling
// through into the 8x8 function below, bottom-right.  The column step is 16
// bytes (8 samples).  Uses x4 for the return address.
function \type\()_h264_qpel16_v_lowpass_neon_10
        mov             x4,  x30                // keep LR across the calls
        bl              \type\()_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #2    // src is 12 rows down, want 8
        bl              \type\()_h264_qpel8_v_lowpass_neon_10
        sub             x0,  x0,  x2, lsl #4    // dst: 16 rows up,
        add             x0,  x0,  #16           //      8 samples right
        sub             x1,  x1,  x3, lsl #4    // src: 20 rows up,
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #16           //      8 samples right
        bl              \type\()_h264_qpel8_v_lowpass_neon_10
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8: reads 13 source rows and writes 8 rows.  On return x1 points at the
// 13th source row and x0 at the row after the last one written.
function \type\()_h264_qpel8_v_lowpass_neon_10
  .irp row, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h
        ld1             {\row}, [x1], x3
  .endr
        ld1             {v28.8h}, [x1]

        lowpass_8_10_v  v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_10_v  v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_10_v  v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_10_v  v22, v23, v24, v25, v26, v27, v28, v22, v23

  .ifc \type,avg
        ld1             {v24.8h}, [x0], x2
        ld1             {v25.8h}, [x0], x2
        ld1             {v26.8h}, [x0], x2
        urhadd          v16.8h, v16.8h, v24.8h
        ld1             {v27.8h}, [x0], x2
        urhadd          v17.8h, v17.8h, v25.8h
        ld1             {v28.8h}, [x0], x2
        urhadd          v18.8h, v18.8h, v26.8h
        ld1             {v29.8h}, [x0], x2
        urhadd          v19.8h, v19.8h, v27.8h
        ld1             {v30.8h}, [x0], x2
        urhadd          v20.8h, v20.8h, v28.8h
        ld1             {v31.8h}, [x0], x2
        urhadd          v21.8h, v21.8h, v29.8h
        urhadd          v22.8h, v22.8h, v30.8h
        urhadd          v23.8h, v23.8h, v31.8h
        sub             x0,  x0,  x2, lsl #3    // back to the first dst row
  .endif

  .irp row, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        st1             {\row}, [x0], x2
  .endr

        ret
endfunc
.endm

        h264_qpel_v_lowpass_10 put
        h264_qpel_v_lowpass_10 avg
| |
// Vertical lowpass followed by a rounding average with a second source,
// operating on rows of eight 16-bit samples (16 bytes per row).
// The macro is expanded once per \type (put, avg); the avg flavour also
// averages the result with what is already stored at the destination.
//
// Registers, as used by both routines below:
//   x0   destination pointer, advanced by x3 for every row written
//   x1   source pointer, advanced by x3 for every row read
//   x12  second-source pointer, advanced by x2 for every row read
//   x2   byte stride of the second source
//   x3   byte stride of source and destination
//   x4   keeps the caller's x30 while the 16-wide routine runs
// v16-v31 are overwritten here.  lowpass_8_10_v is defined elsewhere in this
// file and may use further registers.
.macro  h264_qpel_v_lowpass_l2_10 type
function \type\()_h264_qpel16_v_lowpass_l2_neon_10
        // The 16x16 block is processed as four tiles of 8 rows x 16 bytes:
        // upper-left, lower-left, upper-right, lower-right.
        mov             x4,  x30                // the bl instructions overwrite x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10

        // The 8-wide routine leaves x1 12 rows below where it started; go
        // back 4 rows so the next tile starts 8 rows below the previous one.
        sub             x1,  x1,  x3,  lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10

        // Destination and second source: 16 rows up, 16 bytes to the right.
        sub             x0,  x0,  x3,  lsl #4
        add             x0,  x0,  #16
        sub             x12, x12, x2,  lsl #4
        add             x12, x12, #16
        // Source: x1 is 20 rows below its initial value, so 16 + 4 rows up,
        // then 16 bytes to the right.
        sub             x1,  x1,  x3,  lsl #4
        sub             x1,  x1,  x3,  lsl #2
        add             x1,  x1,  #16
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10

        mov             x30, x4                 // caller's return address
        sub             x1,  x1,  x3,  lsl #2
        // There is deliberately no ret here: the last tile is produced by
        // running on into the 8-wide routine emitted directly below, so the
        // two functions have to stay adjacent and in this order.
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon_10
        // Read 13 source rows into v16-v28.  Only the first 12 loads
        // post-increment, so x1 ends up 12 rows below its entry value.
.irp    row, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h
        ld1             {\row}, [x1], x3
.endr
        ld1             {v28.8h}, [x1]

        // Each expansion is handed seven consecutive rows and names the first
        // two of them again as trailing operands; afterwards v16-v23 are used
        // as the eight filtered rows.  The order matters: a later expansion
        // reads rows that an earlier one has not written to.
        lowpass_8_10_v  v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_10_v  v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_10_v  v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_10_v  v22, v23, v24, v25, v26, v27, v28, v22, v23

        // Rounding average with eight rows of the second source; loads are
        // interleaved with the arithmetic.
        ld1             {v24.8h}, [x12], x2
        ld1             {v25.8h}, [x12], x2
        ld1             {v26.8h}, [x12], x2
        ld1             {v27.8h}, [x12], x2
        ld1             {v28.8h}, [x12], x2
        urhadd          v16.8h,  v16.8h,  v24.8h
        urhadd          v17.8h,  v17.8h,  v25.8h
        ld1             {v29.8h}, [x12], x2
        urhadd          v18.8h,  v18.8h,  v26.8h
        urhadd          v19.8h,  v19.8h,  v27.8h
        ld1             {v30.8h}, [x12], x2
        urhadd          v20.8h,  v20.8h,  v28.8h
        urhadd          v21.8h,  v21.8h,  v29.8h
        ld1             {v31.8h}, [x12], x2
        urhadd          v22.8h,  v22.8h,  v30.8h
        urhadd          v23.8h,  v23.8h,  v31.8h

  .ifc \type,avg
        // avg only: average once more with the eight rows currently at the
        // destination, then rewind x0 by those eight rows for the stores.
        ld1             {v24.8h}, [x0], x3
        ld1             {v25.8h}, [x0], x3
        ld1             {v26.8h}, [x0], x3
        urhadd          v16.8h,  v16.8h,  v24.8h
        ld1             {v27.8h}, [x0], x3
        urhadd          v17.8h,  v17.8h,  v25.8h
        ld1             {v28.8h}, [x0], x3
        urhadd          v18.8h,  v18.8h,  v26.8h
        ld1             {v29.8h}, [x0], x3
        urhadd          v19.8h,  v19.8h,  v27.8h
        ld1             {v30.8h}, [x0], x3
        urhadd          v20.8h,  v20.8h,  v28.8h
        ld1             {v31.8h}, [x0], x3
        sub             x0,  x0,  x3,  lsl #3
        urhadd          v21.8h,  v21.8h,  v29.8h
        urhadd          v22.8h,  v22.8h,  v30.8h
        urhadd          v23.8h,  v23.8h,  v31.8h
  .endif

        // Write the eight result rows; x0 finishes 8 rows below its entry value.
.irp    row, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        st1             {\row}, [x0], x3
.endr

        ret
endfunc
.endm
| |
// Emit the vertical lowpass + second-source average helpers for both
// variants; put comes first, avg second.
        h264_qpel_v_lowpass_l2_10 put
        h264_qpel_v_lowpass_l2_10 avg
| |
// Exported 8x8 entry points for 16-bit samples (_neon_10 suffix), expanded
// for \type = put and \type = avg.  Each one only prepares registers and
// hands over to a lowpass helper defined outside this macro.
//
// Register usage visible here:
//   x0   destination, x1 source, x2 line stride in bytes (as received)
//   x3   first clobbered by lowpass_const, then set up as a helper argument
//   x12  helper argument: a second-source pointer, or the constant 16 for the
//        h_lowpass helpers (presumably their loop count; see their definition)
//   x8, x9   copies of x0 and of a source pointer
//   x11  caller's sp while a stack buffer is in use
//   x14  caller's x30 wherever bl is used
// lowpass_const writes its register argument and v6.s[0] (h[0] = 5,
// h[1] = 20), so it always has to run before x3 receives its real value.
// The mc01 and mc11 bodies carry a local label that sibling entry points
// branch to after doing their own set-up.
.macro  h264_qpel8_10 type
// Horizontal lowpass via the _l2 helper, with x3 = the unmodified source.
function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
        lowpass_const   w3
        mov             x12, #16
        mov             x3,  x1
        sub             x1,  x1,  #4            // 4 bytes = two 16-bit samples back
        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
endfunc

// Horizontal lowpass only.
function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
        lowpass_const   w3
        mov             x3,  x2
        mov             x12, #16
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel8_h_lowpass_neon_10
endfunc

// As mc10, but x3 = source + 2 bytes (one sample further right).
function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
        lowpass_const   w3
        mov             x12, #16
        add             x3,  x1,  #2
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
endfunc

// Vertical lowpass averaged with the source itself (x12 = x1).
function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
        mov             x12, x1
        mov             x14, x30
\type\()_h264_qpel8_mc01_10:
        // Shared tail; mc03 arrives here with x12 and x14 already set.
        sub             x1,  x1,  x2,  lsl #1   // begin two rows above
        lowpass_const   w3
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
        ret             x14
endfunc

// Horizontal lowpass into a stack buffer, then vertical lowpass averaged
// with that buffer.
function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
\type\()_h264_qpel8_mc11_10:
        // Shared tail for mc31/mc13/mc33.  On entry: x1 = source for the
        // horizontal pass, x9 = source for the vertical pass, x8 = final
        // destination, x14 = return address.
        mov             x11, sp
        sub             sp,  sp,  #128          // 8 rows x 16 bytes
        lowpass_const   w3
        mov             x3,  #16                // row stride of the buffer
        mov             x12, #16
        mov             x0,  sp
        sub             x1,  x1,  #4
        bl              put_h264_qpel8_h_lowpass_neon_10
        mov             x12, sp                 // buffer becomes the second source
        mov             x0,  x8
        sub             x1,  x9,  x2,  lsl #1   // vertical pass begins two rows above
        mov             x3,  x2
        mov             x2,  #16                // second-source stride = buffer stride
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
        mov             sp,  x11
        ret             x14
endfunc

// mc11 tail with the vertical source moved one sample (2 bytes) right.
function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x1,  x1,  #2
        mov             x9,  x1
        sub             x1,  x1,  #2            // horizontal source stays unshifted
        b               \type\()_h264_qpel8_mc11_10
endfunc

// Vertical lowpass only.
function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
        sub             x1,  x1,  x2,  lsl #1
        mov             x14, x30
        lowpass_const   w3
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon_10
        ret             x14
endfunc

// mc01 tail with the second source one row further down.
function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
        add             x12, x1,  x2
        mov             x14, x30
        b               \type\()_h264_qpel8_mc01_10
endfunc

// mc11 tail with the horizontal source moved one row down.
function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
        mov             x9,  x1
        add             x1,  x1,  x2
        mov             x8,  x0
        mov             x14, x30
        b               \type\()_h264_qpel8_mc11_10
endfunc

// mc11 tail with the vertical source one sample right and the horizontal
// source one row down.
function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
        mov             x8,  x0
        mov             x14, x30
        add             x1,  x1,  #2
        mov             x9,  x1
        sub             x1,  x1,  #2
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11_10
endfunc
.endm
| |
// Generate the exported 8x8 entry points: the put set first, then the avg set.
        h264_qpel8_10   put
        h264_qpel8_10   avg
| |
// Exported 16x16 entry points for 16-bit samples (_neon_10 suffix), expanded
// for \type = put and \type = avg.  Each one only prepares registers and
// hands over to a 16-wide lowpass helper defined outside this macro.
//
// Register usage visible here:
//   x0   destination, x1 source, x2 line stride in bytes (as received)
//   x3   first clobbered by lowpass_const, then set up as a helper argument
//   x12  second-source pointer for the vertical _l2 helper
//   x8, x9   copies of x0 and of a source pointer
//   x11  caller's sp while a stack buffer is in use
//   x14  caller's x30 wherever bl is used
// lowpass_const writes its register argument and v6.s[0] (h[0] = 5,
// h[1] = 20), so it always has to run before x3 receives its real value.
// The mc01 and mc11 bodies carry a local label that sibling entry points
// branch to after doing their own set-up.
.macro  h264_qpel16_10 type
// Horizontal lowpass via the _l2 helper, with x3 = the unmodified source.
function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #4            // 4 bytes = two 16-bit samples back
        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
endfunc

// Horizontal lowpass only.
function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel16_h_lowpass_neon_10
endfunc

// As mc10, but x3 = source + 2 bytes (one sample further right).
function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
        lowpass_const   w3
        add             x3,  x1,  #2
        sub             x1,  x1,  #4
        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
endfunc

// Vertical lowpass averaged with the source itself (x12 = x1).
function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
        mov             x12, x1
        mov             x14, x30
\type\()_h264_qpel16_mc01_10:
        // Shared tail; mc03 arrives here with x12 and x14 already set.
        sub             x1,  x1,  x2,  lsl #1   // begin two rows above
        lowpass_const   w3
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
        ret             x14
endfunc

// Horizontal lowpass into a stack buffer, then vertical lowpass averaged
// with that buffer.
function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
\type\()_h264_qpel16_mc11_10:
        // Shared tail for mc31/mc13/mc33.  On entry: x1 = source for the
        // horizontal pass, x9 = source for the vertical pass, x8 = final
        // destination, x14 = return address.
        mov             x11, sp
        sub             sp,  sp,  #512          // 16 rows x 32 bytes
        lowpass_const   w3
        mov             x3,  #32                // row stride of the buffer
        mov             x0,  sp
        sub             x1,  x1,  #4
        bl              put_h264_qpel16_h_lowpass_neon_10
        mov             x12, sp                 // buffer becomes the second source
        mov             x0,  x8
        sub             x1,  x9,  x2,  lsl #1   // vertical pass begins two rows above
        mov             x3,  x2
        mov             x2,  #32                // second-source stride = buffer stride
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
        mov             sp,  x11
        ret             x14
endfunc

// mc11 tail with the vertical source moved one sample (2 bytes) right.
function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x1,  x1,  #2
        mov             x9,  x1
        sub             x1,  x1,  #2            // horizontal source stays unshifted
        b               \type\()_h264_qpel16_mc11_10
endfunc

// Vertical lowpass only.
function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
        sub             x1,  x1,  x2,  lsl #1
        mov             x14, x30
        lowpass_const   w3
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon_10
        ret             x14
endfunc

// mc01 tail with the second source one row further down.
function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
        add             x12, x1,  x2
        mov             x14, x30
        b               \type\()_h264_qpel16_mc01_10
endfunc

// mc11 tail with the horizontal source moved one row down.
function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
        mov             x9,  x1
        add             x1,  x1,  x2
        mov             x8,  x0
        mov             x14, x30
        b               \type\()_h264_qpel16_mc11_10
endfunc

// mc11 tail with the vertical source one sample right and the horizontal
// source one row down.
function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
        mov             x8,  x0
        mov             x14, x30
        add             x1,  x1,  #2
        mov             x9,  x1
        sub             x1,  x1,  #2
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11_10
endfunc
.endm
| |
// Generate the exported 16x16 entry points: the put set first, then the avg set.
        h264_qpel16_10  put
        h264_qpel16_10  avg