| /* |
| * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "libavutil/aarch64/asm.S" |
| |
| #define VVC_MAX_PB_SIZE 128 |
| #define BDOF_BLOCK_SIZE 16 |
| #define BDOF_MIN_BLOCK_SIZE 4 |
| |
| .macro vvc_w_avg bit_depth |
| |
| .macro vvc_w_avg_\bit_depth\()_2_4 tap |
| .if \tap == 2 |
| ldr s0, [src0] |
| ldr s2, [src1] |
| .else |
| ldr d0, [src0] |
| ldr d2, [src1] |
| .endif |
| mov v4.16b, v16.16b |
| smlal v4.4s, v0.4h, v19.4h |
| smlal v4.4s, v2.4h, v20.4h |
| sqshl v4.4s, v4.4s, v22.4s |
| sqxtun v4.4h, v4.4s |
| |
| .if \bit_depth == 8 |
| sqxtun v4.8b, v4.8h |
| .if \tap == 2 |
| str h4, [dst] |
| .else // tap == 4 |
| str s4, [dst] |
| .endif |
| |
| .else // bit_depth > 8 |
| umin v4.4h, v4.4h, v17.4h |
| .if \tap == 2 |
| str s4, [dst] |
| .else |
| str d4, [dst] |
| .endif |
| .endif |
| add src0, src0, x10 |
| add src1, src1, x10 |
| add dst, dst, dst_stride |
| .endm |
| |
| function ff_vvc_w_avg_\bit_depth\()_neon, export=1 |
| dst .req x0 |
| dst_stride .req x1 |
| src0 .req x2 |
| src1 .req x3 |
| width .req w4 |
| height .req w5 |
| |
| mov x10, #(VVC_MAX_PB_SIZE * 2) |
| cmp width, #8 |
| lsr x11, x6, #32 // weight0 |
| mov w12, w6 // weight1 |
| lsr x13, x7, #32 // offset |
| mov w14, w7 // shift |
| |
| dup v19.8h, w11 |
| neg w14, w14 // so we can use sqshl |
| dup v20.8h, w12 |
| dup v16.4s, w13 |
| dup v22.4s, w14 |
| |
| .if \bit_depth >= 10 |
| // clip pixel |
| mov w6, #((1 << \bit_depth) - 1) |
| dup v17.8h, w6 |
| .endif |
| |
| b.eq 8f |
| b.hi 16f |
| cmp width, #4 |
| b.eq 4f |
| 2: // width == 2 |
| subs height, height, #1 |
| vvc_w_avg_\bit_depth\()_2_4 2 |
| b.ne 2b |
| b 32f |
| 4: // width == 4 |
| subs height, height, #1 |
| vvc_w_avg_\bit_depth\()_2_4 4 |
| b.ne 4b |
| b 32f |
| 8: // width == 8 |
| ld1 {v0.8h}, [src0], x10 |
| ld1 {v2.8h}, [src1], x10 |
| mov v4.16b, v16.16b |
| mov v5.16b, v16.16b |
| smlal v4.4s, v0.4h, v19.4h |
| smlal v4.4s, v2.4h, v20.4h |
| smlal2 v5.4s, v0.8h, v19.8h |
| smlal2 v5.4s, v2.8h, v20.8h |
| sqshl v4.4s, v4.4s, v22.4s |
| sqshl v5.4s, v5.4s, v22.4s |
| sqxtun v4.4h, v4.4s |
| sqxtun2 v4.8h, v5.4s |
| subs height, height, #1 |
| .if \bit_depth == 8 |
| sqxtun v4.8b, v4.8h |
| st1 {v4.8b}, [dst], dst_stride |
| .else |
| umin v4.8h, v4.8h, v17.8h |
| st1 {v4.8h}, [dst], dst_stride |
| .endif |
| b.ne 8b |
| b 32f |
| 16: // width >= 16 |
| mov w6, width |
| mov x7, src0 |
| mov x8, src1 |
| mov x9, dst |
| 17: |
| ldp q0, q1, [x7], #32 |
| ldp q2, q3, [x8], #32 |
| mov v4.16b, v16.16b |
| mov v5.16b, v16.16b |
| mov v6.16b, v16.16b |
| mov v7.16b, v16.16b |
| smlal v4.4s, v0.4h, v19.4h |
| smlal v4.4s, v2.4h, v20.4h |
| smlal2 v5.4s, v0.8h, v19.8h |
| smlal2 v5.4s, v2.8h, v20.8h |
| smlal v6.4s, v1.4h, v19.4h |
| smlal v6.4s, v3.4h, v20.4h |
| smlal2 v7.4s, v1.8h, v19.8h |
| smlal2 v7.4s, v3.8h, v20.8h |
| sqshl v4.4s, v4.4s, v22.4s |
| sqshl v5.4s, v5.4s, v22.4s |
| sqshl v6.4s, v6.4s, v22.4s |
| sqshl v7.4s, v7.4s, v22.4s |
| sqxtun v4.4h, v4.4s |
| sqxtun v6.4h, v6.4s |
| sqxtun2 v4.8h, v5.4s |
| sqxtun2 v6.8h, v7.4s |
| subs w6, w6, #16 |
| .if \bit_depth == 8 |
| sqxtun v4.8b, v4.8h |
| sqxtun2 v4.16b, v6.8h |
| str q4, [x9], #16 |
| .else |
| umin v4.8h, v4.8h, v17.8h |
| umin v6.8h, v6.8h, v17.8h |
| stp q4, q6, [x9], #32 |
| .endif |
| b.ne 17b |
| |
| subs height, height, #1 |
| add src0, src0, x10 |
| add src1, src1, x10 |
| add dst, dst, dst_stride |
| b.ne 16b |
| 32: |
| ret |
| |
| .unreq dst |
| .unreq dst_stride |
| .unreq src0 |
| .unreq src1 |
| .unreq width |
| .unreq height |
| endfunc |
| .endm |
| |
| vvc_w_avg 8 |
| vvc_w_avg 10 |
| vvc_w_avg 12 |
| |
| .macro vvc_avg bit_depth |
| function ff_vvc_avg_\bit_depth\()_neon, export=1 |
| mov x10, #(VVC_MAX_PB_SIZE * 2) |
| movi v16.8h, #0 |
| movi v17.16b, #255 |
| ushr v17.8h, v17.8h, #(16 - \bit_depth) |
| |
| cmp w4, #8 |
| b.gt 16f |
| b.eq 8f |
| cmp w4, #4 |
| b.eq 4f |
| |
| 2: // width == 2 |
| ldr s0, [x2] |
| subs w5, w5, #1 |
| ldr s1, [x3] |
| .if \bit_depth == 8 |
| shadd v0.4h, v0.4h, v1.4h |
| sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) |
| str h0, [x0] |
| .else |
| shadd v0.4h, v0.4h, v1.4h |
| srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth) |
| smax v0.4h, v0.4h, v16.4h |
| smin v0.4h, v0.4h, v17.4h |
| str s0, [x0] |
| .endif |
| add x2, x2, #(VVC_MAX_PB_SIZE * 2) |
| add x3, x3, #(VVC_MAX_PB_SIZE * 2) |
| add x0, x0, x1 |
| b.ne 2b |
| ret |
| |
| 4: // width == 4 |
| ldr d0, [x2] |
| subs w5, w5, #1 |
| ldr d1, [x3] |
| .if \bit_depth == 8 |
| shadd v0.4h, v0.4h, v1.4h |
| sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) |
| str s0, [x0] |
| .else |
| shadd v0.4h, v0.4h, v1.4h |
| srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth) |
| smax v0.4h, v0.4h, v16.4h |
| smin v0.4h, v0.4h, v17.4h |
| str d0, [x0] |
| .endif |
| add x2, x2, #(VVC_MAX_PB_SIZE * 2) |
| add x3, x3, #(VVC_MAX_PB_SIZE * 2) |
| add x0, x0, x1 |
| b.ne 4b |
| ret |
| |
| 8: // width == 8 |
| ldr q0, [x2] |
| subs w5, w5, #1 |
| ldr q1, [x3] |
| .if \bit_depth == 8 |
| shadd v0.8h, v0.8h, v1.8h |
| sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) |
| str d0, [x0] |
| .else |
| shadd v0.8h, v0.8h, v1.8h |
| srshr v0.8h, v0.8h, #(15 - 1 - \bit_depth) |
| smax v0.8h, v0.8h, v16.8h |
| smin v0.8h, v0.8h, v17.8h |
| str q0, [x0] |
| .endif |
| add x2, x2, #(VVC_MAX_PB_SIZE * 2) |
| add x3, x3, #(VVC_MAX_PB_SIZE * 2) |
| add x0, x0, x1 |
| b.ne 8b |
| ret |
| |
| 16: // width >= 16 |
| .if \bit_depth == 8 |
| sub x1, x1, w4, sxtw |
| .else |
| sub x1, x1, w4, sxtw #1 |
| .endif |
| sub x10, x10, w4, sxtw #1 |
| 3: |
| mov w6, w4 // width |
| 1: |
| ldp q0, q1, [x2], #32 |
| subs w6, w6, #16 |
| ldp q2, q3, [x3], #32 |
| .if \bit_depth == 8 |
| shadd v4.8h, v0.8h, v2.8h |
| shadd v5.8h, v1.8h, v3.8h |
| sqrshrun v0.8b, v4.8h, #6 |
| sqrshrun2 v0.16b, v5.8h, #6 |
| st1 {v0.16b}, [x0], #16 |
| .else |
| shadd v4.8h, v0.8h, v2.8h |
| shadd v5.8h, v1.8h, v3.8h |
| srshr v0.8h, v4.8h, #(15 - 1 - \bit_depth) |
| srshr v1.8h, v5.8h, #(15 - 1 - \bit_depth) |
| smax v0.8h, v0.8h, v16.8h |
| smax v1.8h, v1.8h, v16.8h |
| smin v0.8h, v0.8h, v17.8h |
| smin v1.8h, v1.8h, v17.8h |
| stp q0, q1, [x0], #32 |
| .endif |
| b.ne 1b |
| |
| subs w5, w5, #1 |
| add x2, x2, x10 |
| add x3, x3, x10 |
| add x0, x0, x1 |
| b.ne 3b |
| ret |
| endfunc |
| .endm |
| |
| vvc_avg 8 |
| vvc_avg 10 |
| vvc_avg 12 |
| |
| /* x0: int16_t *dst |
| * x1: const uint8_t *_src |
| * x2: ptrdiff_t _src_stride |
| * w3: int height |
| * x4: intptr_t mx |
| * x5: intptr_t my |
| * w6: int width |
| */ |
| function ff_vvc_dmvr_8_neon, export=1 |
| dst .req x0 |
| src .req x1 |
| src_stride .req x2 |
| height .req w3 |
| mx .req x4 |
| my .req x5 |
| width .req w6 |
| |
| sxtw x6, w6 |
| mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) |
| cmp width, #16 |
| sub src_stride, src_stride, x6 |
| cset w15, gt // width > 16 |
| movi v16.8h, #2 // DMVR_SHIFT |
| sub x7, x7, x6, lsl #1 |
| 1: |
| cbz w15, 2f |
| ldr q0, [src], #16 |
| ushll v1.8h, v0.8b, #2 |
| ushll2 v2.8h, v0.16b, #2 |
| stp q1, q2, [dst], #32 |
| b 3f |
| 2: |
| ldr d0, [src], #8 |
| ushll v1.8h, v0.8b, #2 |
| str q1, [dst], #16 |
| 3: |
| subs height, height, #1 |
| ldr s3, [src], #4 |
| ushll v4.8h, v3.8b, #2 |
| st1 {v4.4h}, [dst], x7 |
| |
| add src, src, src_stride |
| b.ne 1b |
| |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_12_neon, export=1 |
| sxtw x6, w6 |
| mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) |
| cmp width, #16 |
| sub src_stride, src_stride, x6, lsl #1 |
| cset w15, gt // width > 16 |
| sub x7, x7, x6, lsl #1 |
| 1: |
| cbz w15, 2f |
| ldp q0, q1, [src], #32 |
| urshr v0.8h, v0.8h, #2 |
| urshr v1.8h, v1.8h, #2 |
| |
| stp q0, q1, [dst], #32 |
| b 3f |
| 2: |
| ldr q0, [src], #16 |
| urshr v0.8h, v0.8h, #2 |
| str q0, [dst], #16 |
| 3: |
| subs height, height, #1 |
| ldr d0, [src], #8 |
| urshr v0.4h, v0.4h, #2 |
| st1 {v0.4h}, [dst], x7 |
| |
| add src, src, src_stride |
| b.ne 1b |
| |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_v_8_neon, export=1 |
| movrel x7, X(ff_vvc_inter_luma_dmvr_filters) |
| add x7, x7, x5, lsl #1 |
| ld2r {v0.16b, v1.16b}, [x7] |
| tbz w6, #4, 12f |
| |
| ldr s16, [x1, #16] |
| ld1 {v2.16b}, [x1], x2 |
| 20: |
| ldr s17, [x1, #16] |
| umull v4.8h, v0.8b, v2.8b |
| umull2 v5.8h, v0.16b, v2.16b |
| ld1 {v3.16b}, [x1], x2 |
| umull v16.8h, v0.8b, v16.8b |
| umull v6.8h, v1.8b, v3.8b |
| umull2 v7.8h, v1.16b, v3.16b |
| add v4.8h, v4.8h, v6.8h |
| umull v18.8h, v1.8b, v17.8b |
| add v5.8h, v5.8h, v7.8h |
| urshr v4.8h, v4.8h, #2 |
| add v19.4h, v16.4h, v18.4h |
| urshr v5.8h, v5.8h, #2 |
| urshr v19.4h, v19.4h, #2 |
| st1 {v4.8h, v5.8h}, [x0], #32 |
| subs w3, w3, #1 |
| mov v2.16b, v3.16b |
| st1 {v19.4h}, [x0], #8 |
| mov v16.16b, v17.16b |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) |
| b.ne 20b |
| ret |
| |
| 12: |
| ldr s16, [x1, #8] |
| ld1 {v2.8b}, [x1], x2 |
| 2: |
| ldr s17, [x1, #8] |
| umull v4.8h, v0.8b, v2.8b |
| ld1 {v3.8b}, [x1], x2 |
| umull v16.8h, v0.8b, v16.8b |
| umull v6.8h, v1.8b, v3.8b |
| add v4.8h, v4.8h, v6.8h |
| umull v18.8h, v1.8b, v17.8b |
| srshr v4.8h, v4.8h, #2 |
| add v19.4h, v16.4h, v18.4h |
| srshr v19.4h, v19.4h, #2 |
| st1 {v4.8h}, [x0], #16 |
| subs w3, w3, #1 |
| mov v2.16b, v3.16b |
| st1 {v19.4h}, [x0], #8 |
| mov v16.16b, v17.16b |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) |
| b.ne 2b |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_h_8_neon, export=1 |
| movrel x7, X(ff_vvc_inter_luma_dmvr_filters) |
| add x7, x7, x4, lsl #1 |
| ld2r {v0.16b, v1.16b}, [x7] |
| tbz w6, #4, 12f |
| 20: |
| ldur q3, [x1, #1] |
| ldr q2, [x1] |
| umull v4.8h, v0.8b, v2.8b |
| umull2 v5.8h, v0.16b, v2.16b |
| ldur s17, [x1, #17] |
| umull v6.8h, v1.8b, v3.8b |
| ldr s16, [x1, #16] |
| umull2 v7.8h, v1.16b, v3.16b |
| add v4.8h, v4.8h, v6.8h |
| umull v17.8h, v1.8b, v17.8b |
| add v5.8h, v5.8h, v7.8h |
| umull v16.8h, v0.8b, v16.8b |
| srshr v4.8h, v4.8h, #2 |
| add v16.4h, v16.4h, v17.4h |
| srshr v5.8h, v5.8h, #2 |
| srshr v16.4h, v16.4h, #2 |
| st1 {v4.8h, v5.8h}, [x0], #32 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) |
| b.ne 20b |
| ret |
| |
| 12: |
| ldur d3, [x1, #1] |
| ldr d2, [x1] |
| umull v4.8h, v0.8b, v2.8b |
| ldur s17, [x1, #9] |
| umull v6.8h, v1.8b, v3.8b |
| ldr s16, [x1, #8] |
| add v4.8h, v4.8h, v6.8h |
| umull v17.8h, v1.8b, v17.8b |
| umull v16.8h, v0.8b, v16.8b |
| srshr v4.8h, v4.8h, #2 |
| add v16.4h, v16.4h, v17.4h |
| srshr v16.4h, v16.4h, #2 |
| st1 {v4.8h}, [x0], #16 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) |
| b.ne 12b |
| ret |
| endfunc |
| |
| .macro vvc_dmvr_h_10 bit_depth |
| function ff_vvc_dmvr_h_\bit_depth\()_neon, export=1 |
| movrel x7, X(ff_vvc_inter_luma_dmvr_filters) |
| add x7, x7, x4, lsl #1 |
| ld2r {v0.16b, v1.16b}, [x7] |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| tbz w6, #4, 12f |
| 20: |
| ldur q3, [x1, #2] |
| ldr q2, [x1] |
| ldr q22, [x1, #16] |
| mul v4.8h, v0.8h, v2.8h |
| mul v6.8h, v1.8h, v3.8h |
| ldur q23, [x1, #18] |
| mul v5.8h, v0.8h, v22.8h |
| ldur d17, [x1, #34] |
| mul v7.8h, v1.8h, v23.8h |
| uhadd v4.8h, v4.8h, v6.8h |
| ldr d16, [x1, #32] |
| uhadd v5.8h, v5.8h, v7.8h |
| mul v17.4h, v1.4h, v17.4h |
| mul v16.4h, v0.4h, v16.4h |
| urshr v4.8h, v4.8h, #(\bit_depth - 6 - 1) |
| urshr v5.8h, v5.8h, #(\bit_depth - 6 - 1) |
| uhadd v16.4h, v16.4h, v17.4h |
| urshr v16.4h, v16.4h, #(\bit_depth - 6 - 1) |
| st1 {v4.8h, v5.8h}, [x0], #32 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) |
| b.ne 20b |
| ret |
| |
| 12: |
| ldur q3, [x1, #2] |
| ldr q2, [x1] |
| mul v4.8h, v0.8h, v2.8h |
| ldur d17, [x1, #18] |
| mul v6.8h, v1.8h, v3.8h |
| ldr d16, [x1, #16] |
| uhadd v4.8h, v4.8h, v6.8h |
| mul v17.4h, v1.4h, v17.4h |
| mul v16.4h, v0.4h, v16.4h |
| urshr v4.8h, v4.8h, #(\bit_depth - 6 - 1) |
| uhadd v16.4h, v16.4h, v17.4h |
| urshr v16.4h, v16.4h, #(\bit_depth - 6 - 1) |
| st1 {v4.8h}, [x0], #16 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) |
| b.ne 12b |
| ret |
| endfunc |
| .endm |
| |
| vvc_dmvr_h_10 10 |
| vvc_dmvr_h_10 12 |
| |
| function ff_vvc_dmvr_hv_8_neon, export=1 |
| tmp0 .req x7 |
| tmp1 .req x8 |
| |
| sub sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| |
| movrel x9, X(ff_vvc_inter_luma_dmvr_filters) |
| add x12, x9, mx, lsl #1 |
| mov tmp0, sp |
| add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2) |
| // We know the value are positive |
| ld2r {v0.16b, v1.16b}, [x12] |
| |
| add x12, x9, my, lsl #1 |
| ldrb w10, [x12] |
| ldrb w11, [x12, #1] |
| sxtw x6, w6 |
| dup v2.8h, w10 // filter_y[0] |
| dup v3.8h, w11 // filter_y[1] |
| |
| // Valid value for width can only be 8 + 4, 16 + 4 |
| cmp width, #16 |
| mov w10, #0 // start filter_y or not |
| add height, height, #1 |
| sub dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| sub src_stride, src_stride, x6 |
| cset w15, gt // width > 16 |
| 1: |
| mov x12, tmp0 |
| mov x13, tmp1 |
| mov x14, dst |
| cbz w15, 2f |
| |
| // width > 16 |
| ldur q5, [src, #1] |
| ldr q4, [src], #16 |
| umull v6.8h, v4.8b, v0.8b |
| umull2 v16.8h, v4.16b, v0.16b |
| umlal v6.8h, v5.8b, v1.8b |
| umlal2 v16.8h, v5.16b, v1.16b |
| urshr v6.8h, v6.8h, #(8 - 6) |
| urshr v7.8h, v16.8h, #(8 - 6) |
| stp q6, q7, [x13], #32 |
| |
| cbz w10, 3f |
| |
| ldp q16, q17, [x12], #32 |
| mul v16.8h, v16.8h, v2.8h |
| mul v17.8h, v17.8h, v2.8h |
| mla v16.8h, v6.8h, v3.8h |
| mla v17.8h, v7.8h, v3.8h |
| urshr v16.8h, v16.8h, #4 |
| urshr v17.8h, v17.8h, #4 |
| stp q16, q17, [x14], #32 |
| b 3f |
| 2: |
| // width > 8 |
| ldur d5, [src, #1] |
| ldr d4, [src], #8 |
| umull v6.8h, v4.8b, v0.8b |
| umlal v6.8h, v5.8b, v1.8b |
| urshr v6.8h, v6.8h, #(8 - 6) |
| str q6, [x13], #16 |
| |
| cbz w10, 3f |
| |
| ldr q16, [x12], #16 |
| mul v16.8h, v16.8h, v2.8h |
| mla v16.8h, v6.8h, v3.8h |
| urshr v16.8h, v16.8h, #4 |
| str q16, [x14], #16 |
| 3: |
| ldur s5, [src, #1] |
| ldr s4, [src], #4 |
| umull v6.8h, v4.8b, v0.8b |
| umlal v6.8h, v5.8b, v1.8b |
| urshr v6.4h, v6.4h, #(8 - 6) |
| str d6, [x13], #8 |
| |
| cbz w10, 4f |
| |
| ldr d16, [x12], #8 |
| mul v16.4h, v16.4h, v2.4h |
| mla v16.4h, v6.4h, v3.4h |
| urshr v16.4h, v16.4h, #4 |
| str d16, [x14], #8 |
| 4: |
| subs height, height, #1 |
| mov w10, #1 |
| add src, src, src_stride |
| add dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| eor tmp0, tmp0, tmp1 |
| eor tmp1, tmp0, tmp1 |
| eor tmp0, tmp0, tmp1 |
| b.ne 1b |
| |
| add sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_hv_12_neon, export=1 |
| mvni v29.4s, #(12 - 6 - 1) |
| b 0f |
| endfunc |
| |
| function ff_vvc_dmvr_hv_10_neon, export=1 |
| mvni v29.4s, #(10 - 6 - 1) |
| 0: |
| sub sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| |
| movrel x9, X(ff_vvc_inter_luma_dmvr_filters) |
| add x12, x9, mx, lsl #1 |
| ldrb w10, [x12] |
| ldrb w11, [x12, #1] |
| mov tmp0, sp |
| add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2) |
| // We know the value are positive |
| dup v0.8h, w10 // filter_x[0] |
| dup v1.8h, w11 // filter_x[1] |
| |
| add x12, x9, my, lsl #1 |
| ldrb w10, [x12] |
| ldrb w11, [x12, #1] |
| dup v2.8h, w10 // filter_y[0] |
| dup v3.8h, w11 // filter_y[1] |
| |
| // Valid value for width can only be 8 + 4, 16 + 4 |
| cmp width, #16 |
| mov w10, #0 // start filter_y or not |
| add height, height, #1 |
| sub dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| sub src_stride, src_stride, w6, sxtw #1 |
| cset w15, gt // width > 16 |
| 1: |
| mov x12, tmp0 |
| mov x13, tmp1 |
| mov x14, dst |
| cbz w15, 2f |
| |
| // width > 16 |
| add x16, src, #2 |
| ldp q6, q16, [src], #32 |
| ldp q7, q17, [x16] |
| umull v4.4s, v6.4h, v0.4h |
| umull2 v5.4s, v6.8h, v0.8h |
| umull v18.4s, v16.4h, v0.4h |
| umull2 v19.4s, v16.8h, v0.8h |
| umlal v4.4s, v7.4h, v1.4h |
| umlal2 v5.4s, v7.8h, v1.8h |
| umlal v18.4s, v17.4h, v1.4h |
| umlal2 v19.4s, v17.8h, v1.8h |
| |
| urshl v4.4s, v4.4s, v29.4s |
| urshl v5.4s, v5.4s, v29.4s |
| urshl v18.4s, v18.4s, v29.4s |
| urshl v19.4s, v19.4s, v29.4s |
| uqxtn v6.4h, v4.4s |
| uqxtn2 v6.8h, v5.4s |
| uqxtn v7.4h, v18.4s |
| uqxtn2 v7.8h, v19.4s |
| stp q6, q7, [x13], #32 |
| |
| cbz w10, 3f |
| |
| ldp q4, q5, [x12], #32 |
| umull v17.4s, v4.4h, v2.4h |
| umull2 v18.4s, v4.8h, v2.8h |
| umull v19.4s, v5.4h, v2.4h |
| umull2 v20.4s, v5.8h, v2.8h |
| umlal v17.4s, v6.4h, v3.4h |
| umlal2 v18.4s, v6.8h, v3.8h |
| umlal v19.4s, v7.4h, v3.4h |
| umlal2 v20.4s, v7.8h, v3.8h |
| uqrshrn v6.4h, v17.4s, #4 |
| uqrshrn2 v6.8h, v18.4s, #4 |
| uqrshrn v7.4h, v19.4s, #4 |
| uqrshrn2 v7.8h, v20.4s, #4 |
| stp q6, q7, [x14], #32 |
| b 3f |
| 2: |
| // width > 8 |
| ldur q7, [src, #2] |
| ldr q6, [src], #16 |
| umull v4.4s, v6.4h, v0.4h |
| umull2 v5.4s, v6.8h, v0.8h |
| umlal v4.4s, v7.4h, v1.4h |
| umlal2 v5.4s, v7.8h, v1.8h |
| |
| urshl v4.4s, v4.4s, v29.4s |
| urshl v5.4s, v5.4s, v29.4s |
| uqxtn v6.4h, v4.4s |
| uqxtn2 v6.8h, v5.4s |
| str q6, [x13], #16 |
| |
| cbz w10, 3f |
| |
| ldr q16, [x12], #16 |
| umull v17.4s, v16.4h, v2.4h |
| umull2 v18.4s, v16.8h, v2.8h |
| umlal v17.4s, v6.4h, v3.4h |
| umlal2 v18.4s, v6.8h, v3.8h |
| urshr v17.4s, v17.4s, #4 |
| urshr v18.4s, v18.4s, #4 |
| uqxtn v16.4h, v17.4s |
| uqxtn2 v16.8h, v18.4s |
| str q16, [x14], #16 |
| 3: |
| ldur d7, [src, #2] |
| ldr d6, [src], #8 |
| umull v4.4s, v7.4h, v1.4h |
| umlal v4.4s, v6.4h, v0.4h |
| urshl v4.4s, v4.4s, v29.4s |
| uqxtn v6.4h, v4.4s |
| str d6, [x13], #8 |
| |
| cbz w10, 4f |
| |
| ldr d16, [x12], #8 |
| umull v17.4s, v16.4h, v2.4h |
| umlal v17.4s, v6.4h, v3.4h |
| urshr v17.4s, v17.4s, #4 |
| uqxtn v16.4h, v17.4s |
| str d16, [x14], #8 |
| 4: |
| subs height, height, #1 |
| mov w10, #1 |
| add src, src, src_stride |
| add dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| eor tmp0, tmp0, tmp1 |
| eor tmp1, tmp0, tmp1 |
| eor tmp0, tmp0, tmp1 |
| b.ne 1b |
| |
| add sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| ret |
| |
| .unreq dst |
| .unreq src |
| .unreq src_stride |
| .unreq height |
| .unreq mx |
| .unreq my |
| .unreq width |
| .unreq tmp0 |
| .unreq tmp1 |
| endfunc |
| |
| function ff_vvc_prof_grad_filter_8x_neon, export=1 |
| gh .req x0 |
| gv .req x1 |
| gstride .req x2 |
| src .req x3 |
| src_stride .req x4 |
| width .req w5 |
| height .req w6 |
| |
| lsl src_stride, src_stride, #1 |
| neg x7, src_stride |
| 1: |
| mov x10, src |
| mov w11, width |
| mov x12, gh |
| mov x13, gv |
| 2: |
| ldur q0, [x10, #2] |
| ldur q1, [x10, #-2] |
| subs w11, w11, #8 |
| ldr q2, [x10, src_stride] |
| ldr q3, [x10, x7] |
| sshr v0.8h, v0.8h, #6 |
| sshr v1.8h, v1.8h, #6 |
| sshr v2.8h, v2.8h, #6 |
| sshr v3.8h, v3.8h, #6 |
| sub v0.8h, v0.8h, v1.8h |
| sub v2.8h, v2.8h, v3.8h |
| st1 {v0.8h}, [x12], #16 |
| st1 {v2.8h}, [x13], #16 |
| add x10, x10, #16 |
| b.ne 2b |
| |
| subs height, height, #1 |
| add gh, gh, gstride, lsl #1 |
| add gv, gv, gstride, lsl #1 |
| add src, src, src_stride |
| b.ne 1b |
| ret |
| |
| .unreq gh |
| .unreq gv |
| .unreq gstride |
| .unreq src |
| .unreq src_stride |
| .unreq width |
| .unreq height |
| endfunc |
| |
| function vvc_bdof_grad_filter_8x_neon, export=0 |
| gh0 .req x0 |
| gh1 .req x1 |
| gv0 .req x2 |
| gv1 .req x3 |
| src0 .req x4 |
| src1 .req x5 |
| width .req w6 |
| height .req w7 |
| tbnz w6, #4, 16f |
| |
| 8: |
| ldur q0, [src0, #2] |
| ldur q1, [src0, #-2] |
| ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)] |
| ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)] |
| sshr v0.8h, v0.8h, #6 |
| sshr v1.8h, v1.8h, #6 |
| ldur q4, [src1, #2] |
| ldur q5, [src1, #-2] |
| sshr v2.8h, v2.8h, #6 |
| sshr v3.8h, v3.8h, #6 |
| ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)] |
| ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)] |
| // results of gradient_h0 |
| sub v0.8h, v0.8h, v1.8h |
| // results of gradient_v0 |
| sub v2.8h, v2.8h, v3.8h |
| |
| sshr v4.8h, v4.8h, #6 |
| sshr v5.8h, v5.8h, #6 |
| sshr v6.8h, v6.8h, #6 |
| sshr v7.8h, v7.8h, #6 |
| // results of gradient_h1 |
| sub v4.8h, v4.8h, v5.8h |
| // results of gradient_v1 |
| sub v6.8h, v6.8h, v7.8h |
| |
| // (gradient_h0 + gradient_h1) >> 1 |
| shadd v1.8h, v0.8h, v4.8h |
| // gradient_h0 - gradient_h1 |
| sub v5.8h, v0.8h, v4.8h |
| |
| // (gradient_v0 + gradient_v1) >> 1 |
| shadd v3.8h, v2.8h, v6.8h |
| // gradient_v0 - gradient_v1 |
| sub v7.8h, v2.8h, v6.8h |
| |
| st1 {v1.8h}, [gh0] |
| st1 {v5.8h}, [gh1] |
| st1 {v3.8h}, [gv0] |
| st1 {v7.8h}, [gv1] |
| |
| subs height, height, #1 |
| add gh0, gh0, #(BDOF_BLOCK_SIZE << 1) |
| add gv0, gv0, #(BDOF_BLOCK_SIZE << 1) |
| add src0, src0, #(VVC_MAX_PB_SIZE << 1) |
| add gh1, gh1, #(BDOF_BLOCK_SIZE << 1) |
| add gv1, gv1, #(BDOF_BLOCK_SIZE << 1) |
| add src1, src1, #(VVC_MAX_PB_SIZE << 1) |
| b.ne 8b |
| ret |
| |
| 16: |
| ldur q0, [src0, #2] |
| ldur q1, [src0, #18] |
| ldur q16, [src0, #-2] |
| sshr v0.8h, v0.8h, #6 |
| ldur q17, [src0, #14] |
| sshr v1.8h, v1.8h, #6 |
| ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)] |
| sshr v16.8h, v16.8h, #6 |
| ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]! |
| ldur q20, [src1, #2] |
| sshr v17.8h, v17.8h, #6 |
| ldur q21, [src1, #18] |
| sshr v2.8h, v2.8h, #6 |
| ldur q22, [src1, #-2] |
| sshr v3.8h, v3.8h, #6 |
| ldur q23, [src1, #14] |
| sshr v18.8h, v18.8h, #6 |
| ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)] |
| sshr v19.8h, v19.8h, #6 |
| ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]! |
| |
| // results of gradient_h0 |
| sub v0.8h, v0.8h, v16.8h |
| sub v1.8h, v1.8h, v17.8h |
| |
| // results of gradient_v0 |
| sub v2.8h, v2.8h, v18.8h |
| sub v3.8h, v3.8h, v19.8h |
| |
| sshr v20.8h, v20.8h, #6 |
| sshr v21.8h, v21.8h, #6 |
| sshr v22.8h, v22.8h, #6 |
| sshr v23.8h, v23.8h, #6 |
| |
| // results of gradient_h1 |
| sub v20.8h, v20.8h, v22.8h |
| sub v21.8h, v21.8h, v23.8h |
| |
| sshr v24.8h, v24.8h, #6 |
| sshr v25.8h, v25.8h, #6 |
| |
| // gradient_h0 - gradient_h1 |
| sub v22.8h, v0.8h, v20.8h |
| sub v23.8h, v1.8h, v21.8h |
| |
| // (gradient_h0 + gradient_h1) >> 1 |
| shadd v16.8h, v0.8h, v20.8h |
| shadd v17.8h, v1.8h, v21.8h |
| |
| st1 {v22.8h, v23.8h}, [gh1], #32 |
| |
| sshr v26.8h, v26.8h, #6 |
| sshr v27.8h, v27.8h, #6 |
| |
| st1 {v16.8h, v17.8h}, [gh0], #32 |
| |
| // results of gradient_v1 |
| sub v24.8h, v24.8h, v26.8h |
| sub v25.8h, v25.8h, v27.8h |
| |
| // (gradient_v0 + gradient_v1) >> 1 |
| shadd v18.8h, v2.8h, v24.8h |
| shadd v19.8h, v3.8h, v25.8h |
| |
| // gradient_v0 - gradient_v1 |
| sub v26.8h, v2.8h, v24.8h |
| sub v27.8h, v3.8h, v25.8h |
| |
| st1 {v18.8h,v19.8h}, [gv0], #32 |
| |
| subs height, height, #1 |
| st1 {v26.8h,v27.8h}, [gv1], #32 |
| |
| b.ne 16b |
| ret |
| |
| .unreq gh0 |
| .unreq gh1 |
| .unreq gv0 |
| .unreq gv1 |
| .unreq src0 |
| .unreq src1 |
| .unreq width |
| .unreq height |
| endfunc |
| |
| .macro vvc_apply_bdof_block_8x bit_depth |
| dst .req x0 |
| dst_stride .req x1 |
| src0 .req x2 |
| src1 .req x3 |
| gh .req x4 |
| gv .req x5 |
| vx .req x6 |
| vy .req x7 |
| |
| ldr w8, [sp] |
| mov x12, #(BDOF_BLOCK_SIZE * 2) |
| mov x14, #(VVC_MAX_PB_SIZE * 2) |
| .if \bit_depth >= 10 |
| // clip pixel |
| mov w15, #((1 << \bit_depth) - 1) |
| dup v19.8h, w15 |
| .endif |
| |
| 0: |
| ldr s0, [vx], #(2 * BDOF_MIN_BLOCK_SIZE) |
| ldr s1, [vy], #(2 * BDOF_MIN_BLOCK_SIZE) |
| mov w13, #(BDOF_MIN_BLOCK_SIZE) |
| 1: |
| ld1 {v5.8h}, [src0], x14 |
| ld1 {v6.8h}, [src1], x14 |
| |
| saddl v17.4s, v5.4h, v6.4h |
| ld1 {v4.8h}, [gv], x12 |
| saddl2 v16.4s, v5.8h, v6.8h |
| ld1 {v2.8h}, [gh], x12 |
| smlal v17.4s, v4.4h, v1.h[0] |
| smlal2 v16.4s, v4.8h, v1.h[1] |
| smlal v17.4s, v2.4h, v0.h[0] |
| smlal2 v16.4s, v2.8h, v0.h[1] |
| |
| sqrshrun v5.4h, v17.4s, #(15 - \bit_depth) |
| sqrshrun2 v5.8h, v16.4s, #(15 - \bit_depth) |
| subs w13, w13, #1 |
| .if \bit_depth == 8 |
| sqxtun v5.8b, v5.8h |
| st1 {v5.8b}, [dst], dst_stride |
| .else |
| smin v5.8h, v5.8h, v19.8h |
| st1 {v5.8h}, [dst], dst_stride |
| .endif |
| b.ne 1b |
| |
| subs w8, w8, #(BDOF_MIN_BLOCK_SIZE) |
| b.ne 0b |
| ret |
| |
| .unreq dst |
| .unreq dst_stride |
| .unreq src0 |
| .unreq src1 |
| .unreq gh |
| .unreq gv |
| .unreq vx |
| .unreq vy |
| .endm |
| |
| function vvc_apply_bdof_block_8x_8_neon, export=0 |
| vvc_apply_bdof_block_8x 8 |
| endfunc |
| |
| function vvc_apply_bdof_block_8x_10_neon, export=0 |
| vvc_apply_bdof_block_8x 10 |
| endfunc |
| |
| function vvc_apply_bdof_block_8x_12_neon, export=0 |
| vvc_apply_bdof_block_8x 12 |
| endfunc |
| |
| .macro vvc_apply_bdof_block_16x bit_depth |
| dst .req x0 |
| dst_stride .req x1 |
| src0 .req x2 |
| src1 .req x3 |
| gh .req x4 |
| gv .req x5 |
| vx .req x6 |
| vy .req x7 |
| |
| ldr w8, [sp] |
| movi v7.4s, #(1 << (14 - \bit_depth)) |
| .if \bit_depth >= 10 |
| // clip pixel |
| mov w15, #((1 << \bit_depth) - 1) |
| movi v18.8h, #0 |
| dup v19.8h, w15 |
| .endif |
| |
| 0: |
| ld1r {v0.8h}, [vx], #2 |
| ld1r {v1.8h}, [vy], #2 |
| ld1r {v2.8h}, [vx], #2 |
| ld1r {v3.8h}, [vy], #2 |
| |
| mov w13, #(BDOF_MIN_BLOCK_SIZE) |
| |
| ld1r {v20.8h}, [vx], #2 |
| ld1r {v21.8h}, [vy], #2 |
| ld1r {v22.8h}, [vx], #2 |
| ld1r {v23.8h}, [vy], #2 |
| |
| ins v0.d[1], v2.d[1] |
| ins v1.d[1], v3.d[1] |
| ins v20.d[1], v22.d[1] |
| ins v21.d[1], v23.d[1] |
| 1: |
| ldp q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2) |
| ldp q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2) |
| smull v3.4s, v0.4h, v2.4h |
| smull2 v16.4s, v0.8h, v2.8h |
| smlal v3.4s, v1.4h, v4.4h |
| smlal2 v16.4s, v1.8h, v4.8h |
| |
| ldp q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2) |
| ldp q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2) |
| |
| smull v23.4s, v20.4h, v22.4h |
| smull2 v27.4s, v20.8h, v22.8h |
| smlal v23.4s, v21.4h, v24.4h |
| smlal2 v27.4s, v21.8h, v24.8h |
| |
| saddl v2.4s, v5.4h, v6.4h |
| add v2.4s, v2.4s, v7.4s |
| add v2.4s, v2.4s, v3.4s |
| saddl2 v4.4s, v5.8h, v6.8h |
| add v4.4s, v4.4s, v7.4s |
| add v4.4s, v4.4s, v16.4s |
| |
| saddl v22.4s, v25.4h, v26.4h |
| add v22.4s, v22.4s, v7.4s |
| add v22.4s, v22.4s, v23.4s |
| saddl2 v24.4s, v25.8h, v26.8h |
| add v24.4s, v24.4s, v7.4s |
| add v24.4s, v24.4s, v27.4s |
| |
| sqshrn v5.4h, v2.4s, #(15 - \bit_depth) |
| sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth) |
| sqshrn v25.4h, v22.4s, #(15 - \bit_depth) |
| sqshrn2 v25.8h, v24.4s, #(15 - \bit_depth) |
| |
| subs w13, w13, #1 |
| .if \bit_depth == 8 |
| sqxtun v5.8b, v5.8h |
| sqxtun2 v5.16b, v25.8h |
| str q5, [dst] |
| .else |
| smin v5.8h, v5.8h, v19.8h |
| smax v5.8h, v5.8h, v18.8h |
| smin v25.8h, v25.8h, v19.8h |
| smax v25.8h, v25.8h, v18.8h |
| stp q5, q25, [dst] |
| .endif |
| add dst, dst, dst_stride |
| b.ne 1b |
| |
| subs w8, w8, #(BDOF_MIN_BLOCK_SIZE) |
| b.ne 0b |
| ret |
| |
| .unreq dst |
| .unreq dst_stride |
| .unreq src0 |
| .unreq src1 |
| .unreq gh |
| .unreq gv |
| .unreq vx |
| .unreq vy |
| .endm |
| |
| function vvc_apply_bdof_block_16x_8_neon, export=0 |
| vvc_apply_bdof_block_16x 8 |
| endfunc |
| |
| function vvc_apply_bdof_block_16x_10_neon, export=0 |
| vvc_apply_bdof_block_16x 10 |
| endfunc |
| |
| function vvc_apply_bdof_block_16x_12_neon, export=0 |
| vvc_apply_bdof_block_16x 12 |
| endfunc |
| |
| const bdof_vx_vy_8x_tbl |
| .byte 0, 1, 16, 16, 16, 16, 8, 9 |
| .byte 6, 7, 16, 16, 16, 16, 14, 15 |
| endconst |
| |
| const bdof_vx_vy_16x_tbl |
| .byte 0, 1, 64, 64, 64, 64, 8, 9 |
| .byte 6, 7, 64, 64, 64, 64, 16, 17 |
| .byte 14, 15, 64, 64, 64, 64, 24, 25 |
| .byte 22, 23, 64, 64, 64, 64, 30, 31 |
| endconst |
| |
| // line(-1), line0, line1, line2, line3, line4 |
// line3 and line4 become line(-1) and line0 in the next block.
| .macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| mov \tmp0\().16b, v28.16b |
| mov \tmp1\().16b, v29.16b |
| mov \tmp2\().16b, v30.16b |
| mov \tmp3\().16b, v31.16b |
| mov \tmp4\().16b, v8.16b |
| .endm |
| |
| .macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| add v25.4s, v25.4s, \tmp0\().4s |
| add v27.4s, v27.4s, \tmp1\().4s |
| add v23.4s, v23.4s, \tmp2\().4s |
| sub v26.4s, v26.4s, \tmp3\().4s |
| sub v24.4s, v24.4s, \tmp4\().4s |
| .endm |
| |
| .macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst |
| tbl \tmp0\().16b, { \src\().16b }, v0.16b |
| saddl \tmp1\().4s, \tmp0\().4h, \src\().4h |
| saddl2 \dst\().4s, \tmp0\().8h, \src\().8h |
| addp \dst\().4s, \tmp1\().4s, \dst\().4s |
| .endm |
| |
| .macro bdof_vx_vy_sign src, tmp0, tmp1, dst |
| cmlt \tmp0\().8h, \src\().8h, #0 |
| cmgt \tmp1\().8h, \src\().8h, #0 |
| sub \dst\().8h, \tmp0\().8h, \tmp1\().8h |
| .endm |
| |
| .macro bdof_vx_vy_clip_mask src, max, min, mask, dst |
| smin \src\().4s, \src\().4s, \max\().4s |
| smax \src\().4s, \src\().4s, \min\().4s |
| cmgt \mask\().4s, \mask\().4s, #0 |
| and \dst\().16b, \src\().16b, \mask\().16b |
| .endm |
| |
| .macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| mov \tmp0\().16b, v29.16b |
| mov \tmp1\().16b, v30.16b |
| mov \tmp2\().16b, v31.16b |
| mov \tmp3\().16b, v8.16b |
| mov \tmp4\().16b, v9.16b |
| .endm |
| |
| .macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| add v25.4s, v25.4s, \tmp0\().4s |
| add v24.4s, v24.4s, \tmp1\().4s |
| add v26.4s, v26.4s, \tmp2\().4s |
| sub v28.4s, v28.4s, \tmp3\().4s |
| sub v27.4s, v27.4s, \tmp4\().4s |
| .endm |
| |
| .macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst |
| tbl \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b |
| tbl v2.16b, {\src0\().16b, \src1\().16b}, v1.16b |
| saddl \tmp1\().4s, \tmp0\().4h, \src0\().4h |
| saddl \tmp2\().4s, v2.4h, \src1\().4h |
| saddl2 \tmp0\().4s, \tmp0\().8h, \src0\().8h |
| saddl2 \dst\().4s, v2.8h, \src1\().8h |
| addp \tmp0\().4s, \tmp1\().4s, \tmp0\().4s |
| addp \dst\().4s, \tmp2\().4s, \dst\().4s |
| addp \dst\().4s, \tmp0\().4s, \dst\().4s |
| .endm |
| |
| /* |
| * Line tricks: |
 * We need 6 lines of information, from 4N-1, 4N, 4N+1 to 4N+4. 4N-1
 * and 4N+0 were processed in the previous group, so they can be reused.
| * |
| * (4N-1) [xxxxxxxxxxxxx] <--- reuse |
| * (4N) [xxxxxxxxxxxxx] <--- reuse |
| * (4N+1) [xxxxxxxxxxxxx] |
| * (4N+2) [xxxxxxxxxxxxx] |
| * (4N+3) [xxxxxxxxxxxxx] ---> save for reuse |
| * (4N+4) [xxxxxxxxxxxxx] ---> save for reuse |
| * |
| * Special case: |
| * 1. Line -1 needs to duplicate line 0. |
| * 2. Last line +1 needs to duplicate the last line. |
| * |
| * --------------------------------------------------------------------- |
| * Pixel tricks: |
| * |
| * [C-1, C0, C1, C2, ... C16] |
| * |
| * For each line, we need to sum parameters for 4 * 6 pixels: |
| * - C-1 + C0 + C1 + C2 + C3 + C4 |
| * - C3 + C4 + C5 + C6 + C7 + C8 |
| * - C7 + C8 + C9 + C10 + C11 + C12 |
| * - C11 + C12 + C13 + C14 + C15 + C16 |
| * |
| * C-1 is C0, C16 is C15, so we can do: |
| * |
| * [C0, C1, C2, C3, | C4, C5, C6, C7, | C8, ... C15] |
| * + | + | |
| * [C0, 0, 0, C4, | C3, 0, 0, C8, | C7, ... C15] |
| * |
| * 8x is similar. |
| * ---------------------------------------------------------------------- |
| * x0: const int16_t *_src0, |
| * x1: const int16_t *_src1, |
| * x2: const int16_t *gradient_h, |
| * x3: const int16_t *gradient_v, |
| * x4: int16_t vx[16], |
| * x5: int16_t vy[16], |
| * w6: int block_h |
| */ |
// Derive the BDOF refinement vectors (vx, vy) for an 8-pixel-wide block,
// one 4-line group per outer-loop iteration (see line/pixel tricks above).
// Accumulators (one .4s lane pair per 4x4 sub-block):
//   v25 = sum|gh|, v27 = sum|gv|, v26 = sum(sign(gh)*diff),
//   v24 = sum(sign(gv)*diff), v23 = sum(sign(gv)*gh)
// Reuse registers: v18-v22 = line(-1)/line0 terms, v5-v7/v16/v17 = saved line3/4.
function vvc_derive_bdof_vx_vy_8x_neon, export=0
stp d11, d10, [sp, #-0x20]!             // save callee-saved d8-d11 (AAPCS64)
stp d9, d8, [sp, #0x10]

movrel x11, bdof_vx_vy_8x_tbl
ldr q0, [x11]                           // table
mvni v2.4s, #30                         // -31, for log2
movi v3.4s, #15                         // clip to 15
mvni v4.4s, #14                         // clip to -15

mov w11, #0x8                           // vx/vy output stride in bytes
mov w12, w6                             // y = block_h
b 4f

1:
// save line4 results
bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
2:
// horizontal reduction of the five accumulators
addp v25.4s, v25.4s, v25.4s
addp v27.4s, v27.4s, v27.4s
addp v26.4s, v26.4s, v26.4s
addp v23.4s, v23.4s, v23.4s
addp v24.4s, v24.4s, v24.4s

// vx = clip((sum_sign_gh_diff << 2) >> log2(sum_abs_gh), -15, 15)
clz v28.4s, v25.4s
add v28.4s, v28.4s, v2.4s               // log2
shl v26.4s, v26.4s, #0x2
sshl v26.4s, v26.4s, v28.4s

bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
sqxtn v26.4h, v25.4s
st1 {v26.s}[0], [x4], x11               // store vx pair

subs x12, x12, #(BDOF_MIN_BLOCK_SIZE)   // y -= BDOF_MIN_BLOCK_SIZE

// vy = clip(((sum_sign_gv_diff << 2) - ((vx * sum_sign_gv_gh) >> 1))
//           >> log2(sum_abs_gv), -15, 15)
clz v26.4s, v27.4s
add v26.4s, v26.4s, v2.4s
shl v24.4s, v24.4s, #0x2
mul v23.4s, v25.4s, v23.4s
sshr v23.4s, v23.4s, #0x1
sub v23.4s, v24.4s, v23.4s
sshl v23.4s, v23.4s, v26.4s

bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
sqxtn v23.4h, v23.4s
st1 {v23.s}[0], [x5], x11               // store vy pair

b.eq 16f                                // y == 0: done
4:
mov x15, #0x0                           // dy, inner loop

// clear the five per-group accumulators
movi v25.2d, #0
movi v27.2d, #0
movi v23.2d, #0
movi v26.2d, #0
movi v24.2d, #0
b 8f

5:
// add line(-1) and line0 from previous results
bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
add x15, x15, #1
8:
cmp w12, w6
b.hs 9f
// y < block_h && dy == 0, reuse previous results
cbz x15, 5b
9:
ldr q28, [x0]                           // src0
ldr q29, [x1]                           // src1
ldr q30, [x2], #(BDOF_BLOCK_SIZE * 2)   // (gh0 + gh1) >> 1
ldr q31, [x3], #(BDOF_BLOCK_SIZE * 2)   // (gv0 + gv1) >> 1
add x0, x0, #(VVC_MAX_PB_SIZE * 2)
add x1, x1, #(VVC_MAX_PB_SIZE * 2)

sshr v28.8h, v28.8h, #0x4
sshr v29.8h, v29.8h, #0x4
sub v8.8h, v28.8h, v29.8h               // diff

abs v28.8h, v30.8h                      // |gh|
abs v29.8h, v31.8h                      // |gv|

bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29

bdof_vx_vy_sign v30, v9, v10, v9        // sign(gh)
bdof_vx_vy_sign v31, v10, v31, v31      // sign(gv)

mul v30.8h, v31.8h, v30.8h              // sign(gv) * gh
mul v9.8h, v9.8h, v8.8h                 // sign(gh) * diff
mul v8.8h, v31.8h, v8.8h                // sign(gv) * diff

bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8

bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8

cmp w12, w6
b.ne 10f
cbnz x15, 10f

// y == block_h && dy == 0, duplicate first line results
bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
add x15, x15, #0x1
b 9b
10:
cmp x15, #(BDOF_MIN_BLOCK_SIZE - 1)
b.eq 11f
cmp x15, #(BDOF_MIN_BLOCK_SIZE)
b.ne 12f
b 1b                                    // dy == 4: save line4 then reduce
11:
// dy == BDOF_MIN_BLOCK_SIZE - 1:
// last group (y == BDOF_MIN_BLOCK_SIZE) -> pad bottom and break,
// otherwise save line3 results for the next group
cmp x12, #(BDOF_MIN_BLOCK_SIZE)
b.eq 13f
bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
12:
add x15, x15, #1                        // dy++
b 8b
13:
// y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
// padding bottom then break
bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
b 2b
16:
ldp d9, d8, [sp, #0x10]                 // restore callee-saved d8-d11
ldp d11, d10, [sp], #0x20
ret
endfunc
| |
| /* |
| * x0: const int16_t *_src0, |
| * x1: const int16_t *_src1, |
| * x2: const int16_t *gradient_h, |
| * x3: const int16_t *gradient_v, |
| * x4: int16_t vx[16], |
| * x5: int16_t vy[16], |
| * w6: int block_h |
| */ |
// Derive the BDOF refinement vectors (vx, vy) for a 16-pixel-wide block.
// Same control flow as the 8x variant above, but processes four 4x4
// sub-blocks per line pass (accumulators v24-v28, reuse lines in
// v19-v23 (line -1/0) and v6/v7/v16-v18 (saved line3/4)).
function vvc_derive_bdof_vx_vy_16x_neon, export=0
stp d15, d14, [sp, #-0x40]!             // save callee-saved d8-d15 (AAPCS64)
stp d13, d12, [sp, #0x10]
stp d11, d10, [sp, #0x20]
stp d9, d8, [sp, #0x30]

movrel x12, bdof_vx_vy_16x_tbl
ldp q0, q1, [x12]                       // table
mov w13, w6                             // y = block_h
b 4f

1:
// save line4
bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
2:
// vx = clip((v28 << 2) >> log2(v25), -15, 15)
clz v3.4s, v25.4s
mvni v5.4s, #0x1e                       // -31
add v3.4s, v3.4s, v5.4s                 // -log2()
shl v4.4s, v28.4s, #0x2
sshl v3.4s, v4.4s, v3.4s

movi v28.4s, #0xf                       // clip to 15
mvni v29.4s, #0xe                       // clip to -15
bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
sqxtn v4.4h, v3.4s
st1 {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)     // store 4 vx values

subs x13, x13, #(BDOF_MIN_BLOCK_SIZE)   // y -= BDOF_MIN_BLOCK_SIZE

// vy = clip(((v27 << 2) - ((vx * v26) >> 1)) >> log2(v24), -15, 15)
clz v4.4s, v24.4s
add v4.4s, v4.4s, v5.4s                 // -log2()
shl v5.4s, v27.4s, #0x2
mul v3.4s, v3.4s, v26.4s
sshr v3.4s, v3.4s, #0x1
sub v3.4s, v5.4s, v3.4s
sshl v3.4s, v3.4s, v4.4s

bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
sqxtn v3.4h, v3.4s
st1 {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)     // store 4 vy values
b.eq 16f                                // y == 0: done
4:
mov w14, #0x0                           // dy, inner loop

// clear the five per-group accumulators
movi v25.2d, #0
movi v24.2d, #0
movi v26.2d, #0
movi v28.2d, #0
movi v27.2d, #0
b 8f

5:
// add line(-1) and line0 from previous results
bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
add w14, w14, #0x1

8:
cmp w13, w6
b.hs 9f
// y < block_h && dy == 0, reuse previous results
cbz w14, 5b
9:
ld1 {v29.8h, v30.8h}, [x0]              // src0
sshr v31.8h, v29.8h, #0x4
ld1 {v8.8h, v9.8h}, [x1]                // src1
sshr v10.8h, v8.8h, #0x4
ldp q13, q8, [x2], #32                  // (gh0 + gh1) >> 1
sshr v29.8h, v30.8h, #0x4
sshr v30.8h, v9.8h, #0x4
ldp q5, q3, [x3], #32                   // (gv0 + gv1) >> 1
sub v31.8h, v31.8h, v10.8h              // diff, left half
sub v4.8h, v29.8h, v30.8h               // diff, right half

abs v29.8h, v13.8h                      // |gh|, left
abs v30.8h, v8.8h                       // |gh|, right
abs v9.8h, v5.8h                        // |gv|, left
abs v10.8h, v3.8h                       // |gv|, right

add x0, x0, #(VVC_MAX_PB_SIZE * 2)      // advance src pointers one line
add x1, x1, #(VVC_MAX_PB_SIZE * 2)

bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30

bdof_vx_vy_sign v13, v9, v10, v9        // sign(gh), left/right
bdof_vx_vy_sign v8, v10, v11, v10
bdof_vx_vy_sign v5, v11, v5, v5         // sign(gv), left/right
bdof_vx_vy_sign v3, v11, v3, v3

mul v11.8h, v5.8h, v13.8h               // sign(gv) * gh, left
mul v12.8h, v3.8h, v8.8h                // sign(gv) * gh, right
mul v8.8h, v9.8h, v31.8h                // sign(gh) * diff, left
mul v9.8h, v10.8h, v4.8h                // sign(gh) * diff, right
mul v13.8h, v5.8h, v31.8h               // sign(gv) * diff, left
mul v14.8h, v3.8h, v4.8h                // sign(gv) * diff, right

bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9

bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
// check whether padding top
cmp w13, w6
b.ne 10f
cbnz w14, 10f
// y == block_h && dy == 0, padding top
bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
add w14, w14, #0x1
b 9b
10:
cmp w14, #(BDOF_MIN_BLOCK_SIZE - 1)
b.eq 11f
cmp w14, #(BDOF_MIN_BLOCK_SIZE)
b.ne 12f
// save line4
b 1b
11:
// dy == BDOF_MIN_BLOCK_SIZE - 1:
// last group (y == BDOF_MIN_BLOCK_SIZE) -> padding bottom,
// otherwise save line3 results for the next group
cmp x13, #(BDOF_MIN_BLOCK_SIZE)
b.eq 13f
// save line3
bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
12:
add w14, w14, #0x1                      // dy++
b 8b
13:
// padding bottom
bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
b 2b
16:
// restore
ldp d9, d8, [sp, #0x30]
ldp d11, d10, [sp, #0x20]
ldp d13, d12, [sp, #0x10]
ldp d15, d14, [sp], #0x40
ret
endfunc
| |
// 10-bit entry point: set the bit depth in w6 and fall into the shared
// body at label 0: inside ff_vvc_apply_bdof_8_neon below.
function ff_vvc_apply_bdof_10_neon, export=1
mov w6, #10
b 0f
endfunc
| |
// 12-bit entry point: set the bit depth in w6 and fall into the shared
// body at label 0: inside ff_vvc_apply_bdof_8_neon below.
function ff_vvc_apply_bdof_12_neon, export=1
mov w6, #12
b 0f
endfunc
| |
| // int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2] |
| // int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2] |
| // int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE]; |
| #define APPLY_BDOF_STACK_SIZE ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4) |
| #define GRADIENT_H0_OFFSET 2 |
| #define GRADIENT_H1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2) |
| #define GRADIENT_V0_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2) |
| #define GRADIENT_V1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2) |
| #define VX_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8) |
| #define VY_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2) |
// Apply BDOF: compute gradients into stack buffers, derive (vx, vy),
// then run the bit-depth-specific apply kernel.
// Shared body: the 10/12-bit entry points above branch to label 0: with
// the bit depth already loaded in w6.
// In: x0 dst, x1 dst_stride, x2 src0, x3 src1, w4 block_w, w5 block_h,
//     w6 bit_depth.
function ff_vvc_apply_bdof_8_neon, export=1
mov w6, #8
0:
stp x19, x20, [sp, #-0x40]!             // save callee-saved x19-x25, lr
stp x21, x22, [sp, #0x10]
stp x23, x24, [sp, #0x20]
stp x25, x30, [sp, #0x30]

sub sp, sp, #APPLY_BDOF_STACK_SIZE      // gradient/vx/vy scratch (see defines)
mov w19, w6                             // bit_depth
mov x20, x0                             // dst
mov x21, x1                             // dst_stride
mov x22, x2                             // src0
mov x23, x3                             // src1
mov w24, w4                             // block_w
mov w25, w5                             // block_h

// int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
add x0, sp, #GRADIENT_H0_OFFSET
add x1, sp, #GRADIENT_H1_OFFSET
add x2, sp, #GRADIENT_V0_OFFSET
add x3, sp, #GRADIENT_V1_OFFSET
mov x4, x22
mov x5, x23
mov w6, w24
mov w7, w25
bl vvc_bdof_grad_filter_8x_neon

// set up args for the derive call, shared by both width paths
cmp w24, #8
mov x0, x22                             // src0
mov x1, x23                             // src1
add x2, sp, #GRADIENT_H0_OFFSET         // gh0
add x3, sp, #GRADIENT_V0_OFFSET         // gv0
add x4, sp, #VX_OFFSET                  // vx
add x5, sp, #VY_OFFSET                  // vy
mov w6, w25                             // block_h

b.gt 16f                                // block_w > 8: 16x path

bl vvc_derive_bdof_vx_vy_8x_neon
cmp w19, #10                            // check bitdepth
mov x0, x20                             // dst
mov x1, x21                             // dst_stride
mov x2, x22                             // src0
mov x3, x23                             // src1
add x4, sp, #GRADIENT_H1_OFFSET         // gh1
add x5, sp, #GRADIENT_V1_OFFSET         // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
str w25, [sp]                           // block_h passed on the stack
b.eq 1f
b.gt 2f
// 8bit
0:
bl vvc_apply_bdof_block_8x_8_neon
b 32f
1:
// 10bit
bl vvc_apply_bdof_block_8x_10_neon
b 32f
2:
// 12bit
bl vvc_apply_bdof_block_8x_12_neon
b 32f
16:
bl vvc_derive_bdof_vx_vy_16x_neon

cmp w19, #10                            // check bitdepth
mov x0, x20                             // dst
mov x1, x21                             // dst_stride
mov x2, x22                             // src0
mov x3, x23                             // src1
add x4, sp, #GRADIENT_H1_OFFSET         // gh1
add x5, sp, #GRADIENT_V1_OFFSET         // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
str w25, [sp]                           // block_h passed on the stack
b.eq 17f
b.gt 18f
// 8bit
bl vvc_apply_bdof_block_16x_8_neon
b 32f
17:
// 10bit
bl vvc_apply_bdof_block_16x_10_neon
b 32f
18:
// 12bit
bl vvc_apply_bdof_block_16x_12_neon
32:
add sp, sp, #APPLY_BDOF_STACK_SIZE      // release scratch, restore regs
ldp x25, x30, [sp, #0x30]
ldp x23, x24, [sp, #0x20]
ldp x21, x22, [sp, #0x10]
ldp x19, x20, [sp], #0x40
ret
endfunc
| |
| #undef APPLY_BDOF_STACK_SIZE |
| #undef GRADIENT_H0_OFFSET |
| #undef GRADIENT_H1_OFFSET |
| #undef GRADIENT_V0_OFFSET |
| #undef GRADIENT_V1_OFFSET |
| #undef VX_OFFSET |
| #undef VY_OFFSET |
| |
| #define VVC_MAX_PB_SIZE 128 |
| |
.macro put_luma_h_x8_vector_filter shift
// 8-tap horizontal luma filter for 8 output pixels.
// In:  v0.8h  - the 8 filter coefficients (sign-extended from 8 bytes)
//      v20.8h, v21.8h - 16 consecutive source samples; v21 only feeds the
//      shifted windows v1..v6, v17 built with ext below
// Out: v24.4h, v25.4h - 8 filtered results, narrowed with sqshrn #\shift
// Clobbers: v1-v6, v17
ext v1.16b, v20.16b, v21.16b, #2        // window shifted by 1 sample
ext v2.16b, v20.16b, v21.16b, #4        // ... by 2
ext v3.16b, v20.16b, v21.16b, #6
ext v4.16b, v20.16b, v21.16b, #8
ext v5.16b, v20.16b, v21.16b, #10
ext v6.16b, v20.16b, v21.16b, #12
ext v17.16b, v20.16b, v21.16b, #14      // ... by 7
smull v24.4s, v20.4h, v0.h[0]           // accumulate taps in 32 bits
smull2 v25.4s, v20.8h, v0.h[0]
smlal v24.4s, v1.4h, v0.h[1]
smlal2 v25.4s, v1.8h, v0.h[1]
smlal v24.4s, v2.4h, v0.h[2]
smlal2 v25.4s, v2.8h, v0.h[2]
smlal v24.4s, v3.4h, v0.h[3]
smlal2 v25.4s, v3.8h, v0.h[3]
smlal v24.4s, v4.4h, v0.h[4]
smlal2 v25.4s, v4.8h, v0.h[4]
smlal v24.4s, v5.4h, v0.h[5]
smlal2 v25.4s, v5.8h, v0.h[5]
smlal v24.4s, v6.4h, v0.h[6]
smlal2 v25.4s, v6.8h, v0.h[6]
smlal v24.4s, v17.4h, v0.h[7]
smlal2 v25.4s, v17.8h, v0.h[7]
sqshrn v24.4h, v24.4s, #(\shift)        // saturating narrow to int16
sqshrn v25.4h, v25.4s, #(\shift)
.endm
| |
// Horizontal 8-tap luma filter, width 8.
// x0 dst (int16_t, stride VVC_MAX_PB_SIZE*2), x1 src, x2 src_stride,
// w3 height, x4 filter coefficients (8 x int8).
.macro put_luma_h8_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)          // dst stride in bytes
ld1 {v0.8b}, [x4]
sub x1, x1, #6                          // back up 3 samples for taps 0..2
sxtl v0.8h, v0.8b                       // widen coefficients to int16
1:
ld1 {v20.8h, v21.8h}, [x1], x2
put_luma_h_x8_vector_filter \shift
subs w3, w3, #1
st1 {v24.4h, v25.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Horizontal 8-tap luma filter, width 16: two 8-wide passes per row,
// sliding the source window (v20/v21) between them.
.macro put_luma_h16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
ld1 {v0.8b}, [x4]
sub x9, x9, #16                         // second store advances 16 already
sub x1, x1, #6                          // back up 3 samples for taps 0..2
sxtl v0.8h, v0.8b
1:
ld1 {v20.8h, v21.8h, v22.8h}, [x1], x2
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b                    // slide window to second half
mov v21.16b, v22.16b
st1 {v24.4h, v25.4h}, [x0], #16
put_luma_h_x8_vector_filter \shift
subs w3, w3, #1
st1 {v24.4h, v25.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Horizontal 8-tap luma filter for widths that are a multiple of 16
// (w6 = width): inner loop produces 16 outputs per iteration.
.macro put_luma_h_x16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
ld1 {v0.8b}, [x4]
sub x9, x9, w6, uxtw #1                 // dst row advance minus bytes stored
sub x2, x2, w6, uxtw #1                 // src row advance minus bytes read
sxtl v0.8h, v0.8b
sub x1, x1, #6                          // back up 3 samples for taps 0..2
sub x2, x2, #16                         // account for the priming load below
1:
ld1 {v20.8h}, [x1], #16                 // prime the sliding window
mov w8, w6                              // x = width
2:
ld1 {v21.8h, v22.8h}, [x1], #32
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b                    // slide window to next 8 outputs
mov v21.16b, v22.16b
st1 {v24.4h, v25.4h}, [x0], #16
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b
subs w8, w8, #16
st1 {v24.4h, v25.4h}, [x0], #16
b.gt 2b
subs w3, w3, #1
add x0, x0, x9                          // move to next row
add x1, x1, x2
b.gt 1b
ret
.endm
| |
// Width-8 horizontal luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_h8_10_neon, export=1
put_luma_h8_xx_neon 2
endfunc

function ff_vvc_put_luma_h8_12_neon, export=1
put_luma_h8_xx_neon 4
endfunc
| |
// Width-16 horizontal luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_h16_10_neon, export=1
put_luma_h16_xx_neon 2
endfunc

function ff_vvc_put_luma_h16_12_neon, export=1
put_luma_h16_xx_neon 4
endfunc
| |
// Multiple-of-16-width horizontal luma filter, instantiated with
// shift 2 (10-bit) and shift 4 (12-bit).
function ff_vvc_put_luma_h_x16_10_neon, export=1
put_luma_h_x16_xx_neon 2
endfunc

function ff_vvc_put_luma_h_x16_12_neon, export=1
put_luma_h_x16_xx_neon 4
endfunc
| |
// Vertical 8-tap luma filter, width 4, 4 rows per loop iteration.
// x0 dst (stride VVC_MAX_PB_SIZE*2), x1 src, x2 src_stride, w3 height,
// x5 filter coefficients (8 x int8).
// v20-v26 hold the 7-row history; each new row (v27..v30) completes an
// 8-row window, then the history is shifted down by 4 rows.
.macro put_luma_v4_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
ld1 {v20.4h}, [x1], x2                  // preload rows 0..6
ld1 {v21.4h}, [x1], x2
ld1 {v22.4h}, [x1], x2
ld1 {v23.4h}, [x1], x2
ld1 {v24.4h}, [x1], x2
ld1 {v25.4h}, [x1], x2
ld1 {v26.4h}, [x1], x2
1:
ld1 {v27.4h}, [x1], x2

// output row 0: taps over rows v20..v27, split over two accumulators
smull v1.4s, v20.4h, v0.h[0]
smull v2.4s, v21.4h, v0.h[1]
smlal v1.4s, v22.4h, v0.h[2]
smlal v2.4s, v23.4h, v0.h[3]
smlal v1.4s, v24.4h, v0.h[4]
smlal v2.4s, v25.4h, v0.h[5]
smlal v1.4s, v26.4h, v0.h[6]
smlal v2.4s, v27.4h, v0.h[7]

ld1 {v28.4h}, [x1], x2

// output row 1: rows v21..v28
smull v3.4s, v21.4h, v0.h[0]
smull v4.4s, v22.4h, v0.h[1]
smlal v3.4s, v23.4h, v0.h[2]
smlal v4.4s, v24.4h, v0.h[3]
smlal v3.4s, v25.4h, v0.h[4]
smlal v4.4s, v26.4h, v0.h[5]
smlal v3.4s, v27.4h, v0.h[6]
smlal v4.4s, v28.4h, v0.h[7]
add v1.4s, v1.4s, v2.4s
add v3.4s, v3.4s, v4.4s
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v3.4h, v3.4s, #(\shift)

st1 {v1.4h}, [x0], x9
ld1 {v29.4h}, [x1], x2
st1 {v3.4h}, [x0], x9

// output row 2: rows v22..v29
smull v1.4s, v22.4h, v0.h[0]
smull v2.4s, v23.4h, v0.h[1]
smlal v1.4s, v24.4h, v0.h[2]
smlal v2.4s, v25.4h, v0.h[3]
smlal v1.4s, v26.4h, v0.h[4]
smlal v2.4s, v27.4h, v0.h[5]
smlal v1.4s, v28.4h, v0.h[6]
smlal v2.4s, v29.4h, v0.h[7]

ld1 {v30.4h}, [x1], x2

// output row 3: rows v23..v30
smull v3.4s, v23.4h, v0.h[0]
smull v4.4s, v24.4h, v0.h[1]
smlal v3.4s, v25.4h, v0.h[2]
smlal v4.4s, v26.4h, v0.h[3]
smlal v3.4s, v27.4h, v0.h[4]
smlal v4.4s, v28.4h, v0.h[5]
smlal v3.4s, v29.4h, v0.h[6]
smlal v4.4s, v30.4h, v0.h[7]
add v1.4s, v1.4s, v2.4s
add v3.4s, v3.4s, v4.4s
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v3.4h, v3.4s, #(\shift)

st1 {v1.4h}, [x0], x9

// rotate the 7-row history down by 4 rows
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v22.16b, v26.16b
mov v23.16b, v27.16b
mov v24.16b, v28.16b
mov v25.16b, v29.16b
mov v26.16b, v30.16b

subs w3, w3, #4
st1 {v3.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Width-4 vertical luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_v4_10_neon, export=1
put_luma_v4_xx_neon 2
endfunc

function ff_vvc_put_luma_v4_12_neon, export=1
put_luma_v4_xx_neon 4
endfunc
| |
// Vertical 8-tap luma filter, width 8, 4 rows per loop iteration.
// Same structure as put_luma_v4_xx_neon but each row is 8 samples wide,
// so every tap needs an smull/smull2 (low/high half) pair.
.macro put_luma_v8_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
ld1 {v20.8h}, [x1], x2                  // preload rows 0..6
ld1 {v21.8h}, [x1], x2
ld1 {v22.8h}, [x1], x2
ld1 {v23.8h}, [x1], x2
ld1 {v24.8h}, [x1], x2
ld1 {v25.8h}, [x1], x2
ld1 {v26.8h}, [x1], x2
1:
ld1 {v27.8h}, [x1], x2

// output row 0: rows v20..v27
smull v1.4s, v20.4h, v0.h[0]
smull2 v2.4s, v20.8h, v0.h[0]
smlal v1.4s, v21.4h, v0.h[1]
smlal2 v2.4s, v21.8h, v0.h[1]
smlal v1.4s, v22.4h, v0.h[2]
smlal2 v2.4s, v22.8h, v0.h[2]
smlal v1.4s, v23.4h, v0.h[3]
smlal2 v2.4s, v23.8h, v0.h[3]
smlal v1.4s, v24.4h, v0.h[4]
smlal2 v2.4s, v24.8h, v0.h[4]
smlal v1.4s, v25.4h, v0.h[5]
smlal2 v2.4s, v25.8h, v0.h[5]
smlal v1.4s, v26.4h, v0.h[6]
smlal2 v2.4s, v26.8h, v0.h[6]
smlal v1.4s, v27.4h, v0.h[7]
smlal2 v2.4s, v27.8h, v0.h[7]
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v2.4h, v2.4s, #(\shift)

ld1 {v28.8h}, [x1], x2
st1 {v1.4h-v2.4h}, [x0], x9

// output row 1: rows v21..v28
smull v3.4s, v21.4h, v0.h[0]
smull2 v4.4s, v21.8h, v0.h[0]
smlal v3.4s, v22.4h, v0.h[1]
smlal2 v4.4s, v22.8h, v0.h[1]
smlal v3.4s, v23.4h, v0.h[2]
smlal2 v4.4s, v23.8h, v0.h[2]
smlal v3.4s, v24.4h, v0.h[3]
smlal2 v4.4s, v24.8h, v0.h[3]
smlal v3.4s, v25.4h, v0.h[4]
smlal2 v4.4s, v25.8h, v0.h[4]
smlal v3.4s, v26.4h, v0.h[5]
smlal2 v4.4s, v26.8h, v0.h[5]
smlal v3.4s, v27.4h, v0.h[6]
smlal2 v4.4s, v27.8h, v0.h[6]
smlal v3.4s, v28.4h, v0.h[7]
smlal2 v4.4s, v28.8h, v0.h[7]
sqshrn v3.4h, v3.4s, #(\shift)
sqshrn v4.4h, v4.4s, #(\shift)

ld1 {v29.8h}, [x1], x2
st1 {v3.4h-v4.4h}, [x0], x9

// output row 2: rows v22..v29
smull v1.4s, v22.4h, v0.h[0]
smull2 v2.4s, v22.8h, v0.h[0]
smlal v1.4s, v23.4h, v0.h[1]
smlal2 v2.4s, v23.8h, v0.h[1]
smlal v1.4s, v24.4h, v0.h[2]
smlal2 v2.4s, v24.8h, v0.h[2]
smlal v1.4s, v25.4h, v0.h[3]
smlal2 v2.4s, v25.8h, v0.h[3]
smlal v1.4s, v26.4h, v0.h[4]
smlal2 v2.4s, v26.8h, v0.h[4]
smlal v1.4s, v27.4h, v0.h[5]
smlal2 v2.4s, v27.8h, v0.h[5]
smlal v1.4s, v28.4h, v0.h[6]
smlal2 v2.4s, v28.8h, v0.h[6]
smlal v1.4s, v29.4h, v0.h[7]
smlal2 v2.4s, v29.8h, v0.h[7]
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v2.4h, v2.4s, #(\shift)

ld1 {v30.8h}, [x1], x2
st1 {v1.4h-v2.4h}, [x0], x9

// output row 3: rows v23..v30
smull v3.4s, v23.4h, v0.h[0]
smull2 v4.4s, v23.8h, v0.h[0]
smlal v3.4s, v24.4h, v0.h[1]
smlal2 v4.4s, v24.8h, v0.h[1]
smlal v3.4s, v25.4h, v0.h[2]
smlal2 v4.4s, v25.8h, v0.h[2]
smlal v3.4s, v26.4h, v0.h[3]
smlal2 v4.4s, v26.8h, v0.h[3]
smlal v3.4s, v27.4h, v0.h[4]
smlal2 v4.4s, v27.8h, v0.h[4]
smlal v3.4s, v28.4h, v0.h[5]
smlal2 v4.4s, v28.8h, v0.h[5]
smlal v3.4s, v29.4h, v0.h[6]
smlal2 v4.4s, v29.8h, v0.h[6]
smlal v3.4s, v30.4h, v0.h[7]
smlal2 v4.4s, v30.8h, v0.h[7]
sqshrn v3.4h, v3.4s, #(\shift)
sqshrn v4.4h, v4.4s, #(\shift)

// rotate the 7-row history down by 4 rows
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v22.16b, v26.16b
mov v23.16b, v27.16b
mov v24.16b, v28.16b
mov v25.16b, v29.16b
mov v26.16b, v30.16b

subs w3, w3, #4
st1 {v3.4h-v4.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Width-8 vertical luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_v8_10_neon, export=1
put_luma_v8_xx_neon 2
endfunc

function ff_vvc_put_luma_v8_12_neon, export=1
put_luma_v8_xx_neon 4
endfunc
| |
// 8-tap vertical filter for one 16-wide output row.
// In:  v1.8h - filter coefficients (callers rotate v0 into v1 with ext
//      so the tap order matches the rotated row registers)
//      v16/v17 .. v30/v31 - eight source rows, two registers per row
//      (left half in the even-position register, right half in the odd)
// Out: v6.8h, v7.8h - 16 filtered results, narrowed with sqshrn #\shift
// Clobbers: v2-v5
.macro put_luma_v_x16_vector_filter shift
// left half (v16, v18, ..., v30)
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
smlal v2.4s, v18.4h, v1.h[1]
smlal2 v3.4s, v18.8h, v1.h[1]
smlal v2.4s, v20.4h, v1.h[2]
smlal2 v3.4s, v20.8h, v1.h[2]
smlal v2.4s, v22.4h, v1.h[3]
smlal2 v3.4s, v22.8h, v1.h[3]
smlal v2.4s, v24.4h, v1.h[4]
smlal2 v3.4s, v24.8h, v1.h[4]
smlal v2.4s, v26.4h, v1.h[5]
smlal2 v3.4s, v26.8h, v1.h[5]
smlal v2.4s, v28.4h, v1.h[6]
smlal2 v3.4s, v28.8h, v1.h[6]
smlal v2.4s, v30.4h, v1.h[7]
smlal2 v3.4s, v30.8h, v1.h[7]

// right half (v17, v19, ..., v31)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v4.4s, v19.4h, v1.h[1]
smlal2 v5.4s, v19.8h, v1.h[1]
smlal v4.4s, v21.4h, v1.h[2]
smlal2 v5.4s, v21.8h, v1.h[2]
smlal v4.4s, v23.4h, v1.h[3]
smlal2 v5.4s, v23.8h, v1.h[3]
smlal v4.4s, v25.4h, v1.h[4]
smlal2 v5.4s, v25.8h, v1.h[4]
smlal v4.4s, v27.4h, v1.h[5]
smlal2 v5.4s, v27.8h, v1.h[5]
smlal v4.4s, v29.4h, v1.h[6]
smlal2 v5.4s, v29.8h, v1.h[6]
smlal v4.4s, v31.4h, v1.h[7]
smlal2 v5.4s, v31.8h, v1.h[7]

sqshrn v6.4h, v2.4s, #(\shift)          // saturating narrow to int16
sqshrn v7.4h, v4.4s, #(\shift)
sqshrn2 v6.8h, v3.4s, #(\shift)
sqshrn2 v7.8h, v5.4s, #(\shift)
.endm
| |
// Vertical 8-tap luma filter, width 16, 4 rows per loop iteration.
// Instead of moving 8 row-register pairs each step, the coefficient
// vector is rotated with ext (v1 = ror(v0, dy coeffs)) so the same
// register set can be reused; the history rotation happens once per
// 4-row group at the bottom of the loop.
.macro put_luma_v16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
ld1 {v16.8h-v17.8h}, [x1], x2           // preload rows 0..6
ld1 {v18.8h-v19.8h}, [x1], x2
ld1 {v20.8h-v21.8h}, [x1], x2
ld1 {v22.8h-v23.8h}, [x1], x2
ld1 {v24.8h-v25.8h}, [x1], x2
ld1 {v26.8h-v27.8h}, [x1], x2
ld1 {v28.8h-v29.8h}, [x1], x2
1:
mov v1.16b, v0.16b                      // dy == 0: unrotated coefficients
ld1 {v30.8h-v31.8h}, [x1], x2

put_luma_v_x16_vector_filter \shift

ld1 {v16.8h-v17.8h}, [x1], x2           // new row overwrites oldest slot
ext v1.16b, v0.16b, v0.16b, #14         // rotate coefficients for dy == 1
st1 {v6.8h-v7.8h}, [x0], x9

put_luma_v_x16_vector_filter \shift

ld1 {v18.8h-v19.8h}, [x1], x2
ext v1.16b, v0.16b, v0.16b, #12         // rotate coefficients for dy == 2
st1 {v6.8h-v7.8h}, [x0], x9

put_luma_v_x16_vector_filter \shift

ld1 {v20.8h-v21.8h}, [x1], x2
ext v1.16b, v0.16b, v0.16b, #10         // rotate coefficients for dy == 3
st1 {v6.8h-v7.8h}, [x0], x9

put_luma_v_x16_vector_filter \shift

subs w3, w3, #4
st1 {v6.8h-v7.8h}, [x0], x9

// swap slots so the register/tap correspondence matches the unrotated
// coefficient order again for the next group
mov v2.16b, v16.16b
mov v3.16b, v17.16b
mov v16.16b, v24.16b
mov v17.16b, v25.16b
mov v24.16b, v2.16b
mov v25.16b, v3.16b

mov v2.16b, v18.16b
mov v3.16b, v19.16b
mov v18.16b, v26.16b
mov v19.16b, v27.16b
mov v26.16b, v2.16b
mov v27.16b, v3.16b

mov v2.16b, v20.16b
mov v3.16b, v21.16b
mov v20.16b, v28.16b
mov v21.16b, v29.16b
mov v28.16b, v2.16b
mov v29.16b, v3.16b

mov v22.16b, v30.16b
mov v23.16b, v31.16b
b.gt 1b
ret
.endm
| |
// Width-16 vertical luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_v16_10_neon, export=1
put_luma_v16_xx_neon 2
endfunc

function ff_vvc_put_luma_v16_12_neon, export=1
put_luma_v16_xx_neon 4
endfunc
| |
| |
// Vertical 8-tap luma filter for widths that are a multiple of 16
// (w6 = width). Processes a 16-wide, 4-row tile per inner iteration,
// reloading the 8-row window per column strip (x11/x10 are the strip's
// src/dst cursors; w8 is the column offset in samples).
// Uses the same coefficient-rotation trick as put_luma_v16_xx_neon.
.macro put_luma_v_x16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
1:
mov w8, #0
2:
add x11, x1, x8, lsl #1                 // src cursor for this 16-col strip
add x10, x0, x8, lsl #1                 // dst cursor for this strip
ld1 {v16.8h-v17.8h}, [x11], x2          // load the 8-row window
add x8, x8, #16
ld1 {v18.8h-v19.8h}, [x11], x2
cmp w8, w6
ld1 {v20.8h-v21.8h}, [x11], x2
mov v1.16b, v0.16b                      // dy == 0: unrotated coefficients
ld1 {v22.8h-v23.8h}, [x11], x2
ld1 {v24.8h-v25.8h}, [x11], x2
ld1 {v26.8h-v27.8h}, [x11], x2
ld1 {v28.8h-v29.8h}, [x11], x2
ld1 {v30.8h-v31.8h}, [x11], x2

put_luma_v_x16_vector_filter \shift

ld1 {v16.8h-v17.8h}, [x11], x2          // new row overwrites oldest slot
ext v1.16b, v0.16b, v0.16b, #14         // rotate coefficients for dy == 1
st1 {v6.8h-v7.8h}, [x10], x9

put_luma_v_x16_vector_filter \shift

st1 {v6.8h-v7.8h}, [x10], x9
ext v1.16b, v0.16b, v0.16b, #12         // rotate coefficients for dy == 2
ld1 {v18.8h-v19.8h}, [x11], x2

put_luma_v_x16_vector_filter \shift

ld1 {v20.8h-v21.8h}, [x11], x2
ext v1.16b, v0.16b, v0.16b, #10         // rotate coefficients for dy == 3
st1 {v6.8h-v7.8h}, [x10], x9

put_luma_v_x16_vector_filter \shift

st1 {v6.8h-v7.8h}, [x10], x9
b.lt 2b                                 // next 16-col strip
add x0, x0, x9, lsl #2                  // advance 4 output rows
subs w3, w3, #4
add x1, x1, x2, lsl #2                  // advance 4 source rows
b.gt 1b
ret
.endm
| |
// Multiple-of-16-width vertical luma filter, instantiated with
// shift 2 (10-bit) and shift 4 (12-bit).
function ff_vvc_put_luma_v_x16_10_neon, export=1
put_luma_v_x16_xx_neon 2
endfunc

function ff_vvc_put_luma_v_x16_12_neon, export=1
put_luma_v_x16_xx_neon 4
endfunc