| /* |
| * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com> |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "libavutil/aarch64/asm.S" |
| |
| #define VVC_MAX_PB_SIZE 128 |
| #define BDOF_BLOCK_SIZE 16 |
| #define BDOF_MIN_BLOCK_SIZE 4 |
| |
| .macro vvc_w_avg bit_depth |
| |
| .macro vvc_w_avg_\bit_depth\()_2_4 tap |
| .if \tap == 2 |
| ldr s0, [src0] |
| ldr s2, [src1] |
| .else |
| ldr d0, [src0] |
| ldr d2, [src1] |
| .endif |
| mov v4.16b, v16.16b |
| smlal v4.4s, v0.4h, v19.4h |
| smlal v4.4s, v2.4h, v20.4h |
| sqshl v4.4s, v4.4s, v22.4s |
| sqxtun v4.4h, v4.4s |
| |
| .if \bit_depth == 8 |
| sqxtun v4.8b, v4.8h |
| .if \tap == 2 |
| str h4, [dst] |
| .else // tap == 4 |
| str s4, [dst] |
| .endif |
| |
| .else // bit_depth > 8 |
| umin v4.4h, v4.4h, v17.4h |
| .if \tap == 2 |
| str s4, [dst] |
| .else |
| str d4, [dst] |
| .endif |
| .endif |
| add src0, src0, x10 |
| add src1, src1, x10 |
| add dst, dst, dst_stride |
| .endm |
| |
| function ff_vvc_w_avg_\bit_depth\()_neon, export=1 |
| dst .req x0 |
| dst_stride .req x1 |
| src0 .req x2 |
| src1 .req x3 |
| width .req w4 |
| height .req w5 |
| |
| mov x10, #(VVC_MAX_PB_SIZE * 2) |
| cmp width, #8 |
| lsr x11, x6, #32 // weight0 |
| mov w12, w6 // weight1 |
| lsr x13, x7, #32 // offset |
| mov w14, w7 // shift |
| |
| dup v19.8h, w11 |
| neg w14, w14 // so we can use sqshl |
| dup v20.8h, w12 |
| dup v16.4s, w13 |
| dup v22.4s, w14 |
| |
| .if \bit_depth >= 10 |
| // clip pixel |
| mov w6, #((1 << \bit_depth) - 1) |
| dup v17.8h, w6 |
| .endif |
| |
| b.eq 8f |
| b.hi 16f |
| cmp width, #4 |
| b.eq 4f |
| 2: // width == 2 |
| subs height, height, #1 |
| vvc_w_avg_\bit_depth\()_2_4 2 |
| b.ne 2b |
| b 32f |
| 4: // width == 4 |
| subs height, height, #1 |
| vvc_w_avg_\bit_depth\()_2_4 4 |
| b.ne 4b |
| b 32f |
| 8: // width == 8 |
| ld1 {v0.8h}, [src0], x10 |
| ld1 {v2.8h}, [src1], x10 |
| mov v4.16b, v16.16b |
| mov v5.16b, v16.16b |
| smlal v4.4s, v0.4h, v19.4h |
| smlal v4.4s, v2.4h, v20.4h |
| smlal2 v5.4s, v0.8h, v19.8h |
| smlal2 v5.4s, v2.8h, v20.8h |
| sqshl v4.4s, v4.4s, v22.4s |
| sqshl v5.4s, v5.4s, v22.4s |
| sqxtun v4.4h, v4.4s |
| sqxtun2 v4.8h, v5.4s |
| subs height, height, #1 |
| .if \bit_depth == 8 |
| sqxtun v4.8b, v4.8h |
| st1 {v4.8b}, [dst], dst_stride |
| .else |
| umin v4.8h, v4.8h, v17.8h |
| st1 {v4.8h}, [dst], dst_stride |
| .endif |
| b.ne 8b |
| b 32f |
| 16: // width >= 16 |
| mov w6, width |
| mov x7, src0 |
| mov x8, src1 |
| mov x9, dst |
| 17: |
| ldp q0, q1, [x7], #32 |
| ldp q2, q3, [x8], #32 |
| mov v4.16b, v16.16b |
| mov v5.16b, v16.16b |
| mov v6.16b, v16.16b |
| mov v7.16b, v16.16b |
| smlal v4.4s, v0.4h, v19.4h |
| smlal v4.4s, v2.4h, v20.4h |
| smlal2 v5.4s, v0.8h, v19.8h |
| smlal2 v5.4s, v2.8h, v20.8h |
| smlal v6.4s, v1.4h, v19.4h |
| smlal v6.4s, v3.4h, v20.4h |
| smlal2 v7.4s, v1.8h, v19.8h |
| smlal2 v7.4s, v3.8h, v20.8h |
| sqshl v4.4s, v4.4s, v22.4s |
| sqshl v5.4s, v5.4s, v22.4s |
| sqshl v6.4s, v6.4s, v22.4s |
| sqshl v7.4s, v7.4s, v22.4s |
| sqxtun v4.4h, v4.4s |
| sqxtun v6.4h, v6.4s |
| sqxtun2 v4.8h, v5.4s |
| sqxtun2 v6.8h, v7.4s |
| subs w6, w6, #16 |
| .if \bit_depth == 8 |
| sqxtun v4.8b, v4.8h |
| sqxtun2 v4.16b, v6.8h |
| str q4, [x9], #16 |
| .else |
| umin v4.8h, v4.8h, v17.8h |
| umin v6.8h, v6.8h, v17.8h |
| stp q4, q6, [x9], #32 |
| .endif |
| b.ne 17b |
| |
| subs height, height, #1 |
| add src0, src0, x10 |
| add src1, src1, x10 |
| add dst, dst, dst_stride |
| b.ne 16b |
| 32: |
| ret |
| |
| .unreq dst |
| .unreq dst_stride |
| .unreq src0 |
| .unreq src1 |
| .unreq width |
| .unreq height |
| endfunc |
| .endm |
| |
| vvc_w_avg 8 |
| vvc_w_avg 10 |
| vvc_w_avg 12 |
| |
| .macro vvc_avg bit_depth |
| function ff_vvc_avg_\bit_depth\()_neon, export=1 |
| mov x10, #(VVC_MAX_PB_SIZE * 2) |
| movi v16.8h, #0 |
| movi v17.16b, #255 |
| ushr v17.8h, v17.8h, #(16 - \bit_depth) |
| |
| cmp w4, #8 |
| b.gt 16f |
| b.eq 8f |
| cmp w4, #4 |
| b.eq 4f |
| |
| 2: // width == 2 |
| ldr s0, [x2] |
| subs w5, w5, #1 |
| ldr s1, [x3] |
| .if \bit_depth == 8 |
| shadd v0.4h, v0.4h, v1.4h |
| sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) |
| str h0, [x0] |
| .else |
| shadd v0.4h, v0.4h, v1.4h |
| srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth) |
| smax v0.4h, v0.4h, v16.4h |
| smin v0.4h, v0.4h, v17.4h |
| str s0, [x0] |
| .endif |
| add x2, x2, #(VVC_MAX_PB_SIZE * 2) |
| add x3, x3, #(VVC_MAX_PB_SIZE * 2) |
| add x0, x0, x1 |
| b.ne 2b |
| ret |
| |
| 4: // width == 4 |
| ldr d0, [x2] |
| subs w5, w5, #1 |
| ldr d1, [x3] |
| .if \bit_depth == 8 |
| shadd v0.4h, v0.4h, v1.4h |
| sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) |
| str s0, [x0] |
| .else |
| shadd v0.4h, v0.4h, v1.4h |
| srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth) |
| smax v0.4h, v0.4h, v16.4h |
| smin v0.4h, v0.4h, v17.4h |
| str d0, [x0] |
| .endif |
| add x2, x2, #(VVC_MAX_PB_SIZE * 2) |
| add x3, x3, #(VVC_MAX_PB_SIZE * 2) |
| add x0, x0, x1 |
| b.ne 4b |
| ret |
| |
| 8: // width == 8 |
| ldr q0, [x2] |
| subs w5, w5, #1 |
| ldr q1, [x3] |
| .if \bit_depth == 8 |
| shadd v0.8h, v0.8h, v1.8h |
| sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth) |
| str d0, [x0] |
| .else |
| shadd v0.8h, v0.8h, v1.8h |
| srshr v0.8h, v0.8h, #(15 - 1 - \bit_depth) |
| smax v0.8h, v0.8h, v16.8h |
| smin v0.8h, v0.8h, v17.8h |
| str q0, [x0] |
| .endif |
| add x2, x2, #(VVC_MAX_PB_SIZE * 2) |
| add x3, x3, #(VVC_MAX_PB_SIZE * 2) |
| add x0, x0, x1 |
| b.ne 8b |
| ret |
| |
| 16: // width >= 16 |
| .if \bit_depth == 8 |
| sub x1, x1, w4, sxtw |
| .else |
| sub x1, x1, w4, sxtw #1 |
| .endif |
| sub x10, x10, w4, sxtw #1 |
| 3: |
| mov w6, w4 // width |
| 1: |
| ldp q0, q1, [x2], #32 |
| subs w6, w6, #16 |
| ldp q2, q3, [x3], #32 |
| .if \bit_depth == 8 |
| shadd v4.8h, v0.8h, v2.8h |
| shadd v5.8h, v1.8h, v3.8h |
| sqrshrun v0.8b, v4.8h, #6 |
| sqrshrun2 v0.16b, v5.8h, #6 |
| st1 {v0.16b}, [x0], #16 |
| .else |
| shadd v4.8h, v0.8h, v2.8h |
| shadd v5.8h, v1.8h, v3.8h |
| srshr v0.8h, v4.8h, #(15 - 1 - \bit_depth) |
| srshr v1.8h, v5.8h, #(15 - 1 - \bit_depth) |
| smax v0.8h, v0.8h, v16.8h |
| smax v1.8h, v1.8h, v16.8h |
| smin v0.8h, v0.8h, v17.8h |
| smin v1.8h, v1.8h, v17.8h |
| stp q0, q1, [x0], #32 |
| .endif |
| b.ne 1b |
| |
| subs w5, w5, #1 |
| add x2, x2, x10 |
| add x3, x3, x10 |
| add x0, x0, x1 |
| b.ne 3b |
| ret |
| endfunc |
| .endm |
| |
| vvc_avg 8 |
| vvc_avg 10 |
| vvc_avg 12 |
| |
| /* x0: int16_t *dst |
| * x1: const uint8_t *_src |
| * x2: ptrdiff_t _src_stride |
| * w3: int height |
| * x4: intptr_t mx |
| * x5: intptr_t my |
| * w6: int width |
| */ |
| function ff_vvc_dmvr_8_neon, export=1 |
| dst .req x0 |
| src .req x1 |
| src_stride .req x2 |
| height .req w3 |
| mx .req x4 |
| my .req x5 |
| width .req w6 |
| |
| sxtw x6, w6 |
| mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) |
| cmp width, #16 |
| sub src_stride, src_stride, x6 |
| cset w15, gt // width > 16 |
| movi v16.8h, #2 // DMVR_SHIFT |
| sub x7, x7, x6, lsl #1 |
| 1: |
| cbz w15, 2f |
| ldr q0, [src], #16 |
| ushll v1.8h, v0.8b, #2 |
| ushll2 v2.8h, v0.16b, #2 |
| stp q1, q2, [dst], #32 |
| b 3f |
| 2: |
| ldr d0, [src], #8 |
| ushll v1.8h, v0.8b, #2 |
| str q1, [dst], #16 |
| 3: |
| subs height, height, #1 |
| ldr s3, [src], #4 |
| ushll v4.8h, v3.8b, #2 |
| st1 {v4.4h}, [dst], x7 |
| |
| add src, src, src_stride |
| b.ne 1b |
| |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_12_neon, export=1 |
| sxtw x6, w6 |
| mov x7, #(VVC_MAX_PB_SIZE * 2 + 8) |
| cmp width, #16 |
| sub src_stride, src_stride, x6, lsl #1 |
| cset w15, gt // width > 16 |
| sub x7, x7, x6, lsl #1 |
| 1: |
| cbz w15, 2f |
| ldp q0, q1, [src], #32 |
| urshr v0.8h, v0.8h, #2 |
| urshr v1.8h, v1.8h, #2 |
| |
| stp q0, q1, [dst], #32 |
| b 3f |
| 2: |
| ldr q0, [src], #16 |
| urshr v0.8h, v0.8h, #2 |
| str q0, [dst], #16 |
| 3: |
| subs height, height, #1 |
| ldr d0, [src], #8 |
| urshr v0.4h, v0.4h, #2 |
| st1 {v0.4h}, [dst], x7 |
| |
| add src, src, src_stride |
| b.ne 1b |
| |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_v_8_neon, export=1 |
| movrel x7, X(ff_vvc_inter_luma_dmvr_filters) |
| add x7, x7, x5, lsl #1 |
| ld2r {v0.16b, v1.16b}, [x7] |
| tbz w6, #4, 12f |
| |
| ldr s16, [x1, #16] |
| ld1 {v2.16b}, [x1], x2 |
| 20: |
| ldr s17, [x1, #16] |
| umull v4.8h, v0.8b, v2.8b |
| umull2 v5.8h, v0.16b, v2.16b |
| ld1 {v3.16b}, [x1], x2 |
| umull v16.8h, v0.8b, v16.8b |
| umull v6.8h, v1.8b, v3.8b |
| umull2 v7.8h, v1.16b, v3.16b |
| add v4.8h, v4.8h, v6.8h |
| umull v18.8h, v1.8b, v17.8b |
| add v5.8h, v5.8h, v7.8h |
| urshr v4.8h, v4.8h, #2 |
| add v19.4h, v16.4h, v18.4h |
| urshr v5.8h, v5.8h, #2 |
| urshr v19.4h, v19.4h, #2 |
| st1 {v4.8h, v5.8h}, [x0], #32 |
| subs w3, w3, #1 |
| mov v2.16b, v3.16b |
| st1 {v19.4h}, [x0], #8 |
| mov v16.16b, v17.16b |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) |
| b.ne 20b |
| ret |
| |
| 12: |
| ldr s16, [x1, #8] |
| ld1 {v2.8b}, [x1], x2 |
| 2: |
| ldr s17, [x1, #8] |
| umull v4.8h, v0.8b, v2.8b |
| ld1 {v3.8b}, [x1], x2 |
| umull v16.8h, v0.8b, v16.8b |
| umull v6.8h, v1.8b, v3.8b |
| add v4.8h, v4.8h, v6.8h |
| umull v18.8h, v1.8b, v17.8b |
| srshr v4.8h, v4.8h, #2 |
| add v19.4h, v16.4h, v18.4h |
| srshr v19.4h, v19.4h, #2 |
| st1 {v4.8h}, [x0], #16 |
| subs w3, w3, #1 |
| mov v2.16b, v3.16b |
| st1 {v19.4h}, [x0], #8 |
| mov v16.16b, v17.16b |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) |
| b.ne 2b |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_h_8_neon, export=1 |
| movrel x7, X(ff_vvc_inter_luma_dmvr_filters) |
| add x7, x7, x4, lsl #1 |
| ld2r {v0.16b, v1.16b}, [x7] |
| tbz w6, #4, 12f |
| 20: |
| ldur q3, [x1, #1] |
| ldr q2, [x1] |
| umull v4.8h, v0.8b, v2.8b |
| umull2 v5.8h, v0.16b, v2.16b |
| ldur s17, [x1, #17] |
| umull v6.8h, v1.8b, v3.8b |
| ldr s16, [x1, #16] |
| umull2 v7.8h, v1.16b, v3.16b |
| add v4.8h, v4.8h, v6.8h |
| umull v17.8h, v1.8b, v17.8b |
| add v5.8h, v5.8h, v7.8h |
| umull v16.8h, v0.8b, v16.8b |
| srshr v4.8h, v4.8h, #2 |
| add v16.4h, v16.4h, v17.4h |
| srshr v5.8h, v5.8h, #2 |
| srshr v16.4h, v16.4h, #2 |
| st1 {v4.8h, v5.8h}, [x0], #32 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) |
| b.ne 20b |
| ret |
| |
| 12: |
| ldur d3, [x1, #1] |
| ldr d2, [x1] |
| umull v4.8h, v0.8b, v2.8b |
| ldur s17, [x1, #9] |
| umull v6.8h, v1.8b, v3.8b |
| ldr s16, [x1, #8] |
| add v4.8h, v4.8h, v6.8h |
| umull v17.8h, v1.8b, v17.8b |
| umull v16.8h, v0.8b, v16.8b |
| srshr v4.8h, v4.8h, #2 |
| add v16.4h, v16.4h, v17.4h |
| srshr v16.4h, v16.4h, #2 |
| st1 {v4.8h}, [x0], #16 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) |
| b.ne 12b |
| ret |
| endfunc |
| |
| .macro vvc_dmvr_h_10 bit_depth |
| function ff_vvc_dmvr_h_\bit_depth\()_neon, export=1 |
| movrel x7, X(ff_vvc_inter_luma_dmvr_filters) |
| add x7, x7, x4, lsl #1 |
| ld2r {v0.16b, v1.16b}, [x7] |
| uxtl v0.8h, v0.8b |
| uxtl v1.8h, v1.8b |
| tbz w6, #4, 12f |
| 20: |
| ldur q3, [x1, #2] |
| ldr q2, [x1] |
| ldr q22, [x1, #16] |
| mul v4.8h, v0.8h, v2.8h |
| mul v6.8h, v1.8h, v3.8h |
| ldur q23, [x1, #18] |
| mul v5.8h, v0.8h, v22.8h |
| ldur d17, [x1, #34] |
| mul v7.8h, v1.8h, v23.8h |
| uhadd v4.8h, v4.8h, v6.8h |
| ldr d16, [x1, #32] |
| uhadd v5.8h, v5.8h, v7.8h |
| mul v17.4h, v1.4h, v17.4h |
| mul v16.4h, v0.4h, v16.4h |
| urshr v4.8h, v4.8h, #(\bit_depth - 6 - 1) |
| urshr v5.8h, v5.8h, #(\bit_depth - 6 - 1) |
| uhadd v16.4h, v16.4h, v17.4h |
| urshr v16.4h, v16.4h, #(\bit_depth - 6 - 1) |
| st1 {v4.8h, v5.8h}, [x0], #32 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) |
| b.ne 20b |
| ret |
| |
| 12: |
| ldur q3, [x1, #2] |
| ldr q2, [x1] |
| mul v4.8h, v0.8h, v2.8h |
| ldur d17, [x1, #18] |
| mul v6.8h, v1.8h, v3.8h |
| ldr d16, [x1, #16] |
| uhadd v4.8h, v4.8h, v6.8h |
| mul v17.4h, v1.4h, v17.4h |
| mul v16.4h, v0.4h, v16.4h |
| urshr v4.8h, v4.8h, #(\bit_depth - 6 - 1) |
| uhadd v16.4h, v16.4h, v17.4h |
| urshr v16.4h, v16.4h, #(\bit_depth - 6 - 1) |
| st1 {v4.8h}, [x0], #16 |
| subs w3, w3, #1 |
| st1 {v16.4h}, [x0], #8 |
| add x1, x1, x2 |
| add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) |
| b.ne 12b |
| ret |
| endfunc |
| .endm |
| |
| vvc_dmvr_h_10 10 |
| vvc_dmvr_h_10 12 |
| |
| function ff_vvc_dmvr_hv_8_neon, export=1 |
| tmp0 .req x7 |
| tmp1 .req x8 |
| |
| sub sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| |
| movrel x9, X(ff_vvc_inter_luma_dmvr_filters) |
| add x12, x9, mx, lsl #1 |
| mov tmp0, sp |
| add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2) |
| // We know the value are positive |
| ld2r {v0.16b, v1.16b}, [x12] |
| |
| add x12, x9, my, lsl #1 |
| ldrb w10, [x12] |
| ldrb w11, [x12, #1] |
| sxtw x6, w6 |
| dup v2.8h, w10 // filter_y[0] |
| dup v3.8h, w11 // filter_y[1] |
| |
| // Valid value for width can only be 8 + 4, 16 + 4 |
| cmp width, #16 |
| mov w10, #0 // start filter_y or not |
| add height, height, #1 |
| sub dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| sub src_stride, src_stride, x6 |
| cset w15, gt // width > 16 |
| 1: |
| mov x12, tmp0 |
| mov x13, tmp1 |
| mov x14, dst |
| cbz w15, 2f |
| |
| // width > 16 |
| ldur q5, [src, #1] |
| ldr q4, [src], #16 |
| umull v6.8h, v4.8b, v0.8b |
| umull2 v16.8h, v4.16b, v0.16b |
| umlal v6.8h, v5.8b, v1.8b |
| umlal2 v16.8h, v5.16b, v1.16b |
| urshr v6.8h, v6.8h, #(8 - 6) |
| urshr v7.8h, v16.8h, #(8 - 6) |
| stp q6, q7, [x13], #32 |
| |
| cbz w10, 3f |
| |
| ldp q16, q17, [x12], #32 |
| mul v16.8h, v16.8h, v2.8h |
| mul v17.8h, v17.8h, v2.8h |
| mla v16.8h, v6.8h, v3.8h |
| mla v17.8h, v7.8h, v3.8h |
| urshr v16.8h, v16.8h, #4 |
| urshr v17.8h, v17.8h, #4 |
| stp q16, q17, [x14], #32 |
| b 3f |
| 2: |
| // width > 8 |
| ldur d5, [src, #1] |
| ldr d4, [src], #8 |
| umull v6.8h, v4.8b, v0.8b |
| umlal v6.8h, v5.8b, v1.8b |
| urshr v6.8h, v6.8h, #(8 - 6) |
| str q6, [x13], #16 |
| |
| cbz w10, 3f |
| |
| ldr q16, [x12], #16 |
| mul v16.8h, v16.8h, v2.8h |
| mla v16.8h, v6.8h, v3.8h |
| urshr v16.8h, v16.8h, #4 |
| str q16, [x14], #16 |
| 3: |
| ldur s5, [src, #1] |
| ldr s4, [src], #4 |
| umull v6.8h, v4.8b, v0.8b |
| umlal v6.8h, v5.8b, v1.8b |
| urshr v6.4h, v6.4h, #(8 - 6) |
| str d6, [x13], #8 |
| |
| cbz w10, 4f |
| |
| ldr d16, [x12], #8 |
| mul v16.4h, v16.4h, v2.4h |
| mla v16.4h, v6.4h, v3.4h |
| urshr v16.4h, v16.4h, #4 |
| str d16, [x14], #8 |
| 4: |
| subs height, height, #1 |
| mov w10, #1 |
| add src, src, src_stride |
| add dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| eor tmp0, tmp0, tmp1 |
| eor tmp1, tmp0, tmp1 |
| eor tmp0, tmp0, tmp1 |
| b.ne 1b |
| |
| add sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| ret |
| endfunc |
| |
| function ff_vvc_dmvr_hv_12_neon, export=1 |
| mvni v29.4s, #(12 - 6 - 1) |
| b 0f |
| endfunc |
| |
| function ff_vvc_dmvr_hv_10_neon, export=1 |
| mvni v29.4s, #(10 - 6 - 1) |
| 0: |
| sub sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| |
| movrel x9, X(ff_vvc_inter_luma_dmvr_filters) |
| add x12, x9, mx, lsl #1 |
| ldrb w10, [x12] |
| ldrb w11, [x12, #1] |
| mov tmp0, sp |
| add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2) |
| // We know the value are positive |
| dup v0.8h, w10 // filter_x[0] |
| dup v1.8h, w11 // filter_x[1] |
| |
| add x12, x9, my, lsl #1 |
| ldrb w10, [x12] |
| ldrb w11, [x12, #1] |
| dup v2.8h, w10 // filter_y[0] |
| dup v3.8h, w11 // filter_y[1] |
| |
| // Valid value for width can only be 8 + 4, 16 + 4 |
| cmp width, #16 |
| mov w10, #0 // start filter_y or not |
| add height, height, #1 |
| sub dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| sub src_stride, src_stride, w6, sxtw #1 |
| cset w15, gt // width > 16 |
| 1: |
| mov x12, tmp0 |
| mov x13, tmp1 |
| mov x14, dst |
| cbz w15, 2f |
| |
| // width > 16 |
| add x16, src, #2 |
| ldp q6, q16, [src], #32 |
| ldp q7, q17, [x16] |
| umull v4.4s, v6.4h, v0.4h |
| umull2 v5.4s, v6.8h, v0.8h |
| umull v18.4s, v16.4h, v0.4h |
| umull2 v19.4s, v16.8h, v0.8h |
| umlal v4.4s, v7.4h, v1.4h |
| umlal2 v5.4s, v7.8h, v1.8h |
| umlal v18.4s, v17.4h, v1.4h |
| umlal2 v19.4s, v17.8h, v1.8h |
| |
| urshl v4.4s, v4.4s, v29.4s |
| urshl v5.4s, v5.4s, v29.4s |
| urshl v18.4s, v18.4s, v29.4s |
| urshl v19.4s, v19.4s, v29.4s |
| uqxtn v6.4h, v4.4s |
| uqxtn2 v6.8h, v5.4s |
| uqxtn v7.4h, v18.4s |
| uqxtn2 v7.8h, v19.4s |
| stp q6, q7, [x13], #32 |
| |
| cbz w10, 3f |
| |
| ldp q4, q5, [x12], #32 |
| umull v17.4s, v4.4h, v2.4h |
| umull2 v18.4s, v4.8h, v2.8h |
| umull v19.4s, v5.4h, v2.4h |
| umull2 v20.4s, v5.8h, v2.8h |
| umlal v17.4s, v6.4h, v3.4h |
| umlal2 v18.4s, v6.8h, v3.8h |
| umlal v19.4s, v7.4h, v3.4h |
| umlal2 v20.4s, v7.8h, v3.8h |
| uqrshrn v6.4h, v17.4s, #4 |
| uqrshrn2 v6.8h, v18.4s, #4 |
| uqrshrn v7.4h, v19.4s, #4 |
| uqrshrn2 v7.8h, v20.4s, #4 |
| stp q6, q7, [x14], #32 |
| b 3f |
| 2: |
| // width > 8 |
| ldur q7, [src, #2] |
| ldr q6, [src], #16 |
| umull v4.4s, v6.4h, v0.4h |
| umull2 v5.4s, v6.8h, v0.8h |
| umlal v4.4s, v7.4h, v1.4h |
| umlal2 v5.4s, v7.8h, v1.8h |
| |
| urshl v4.4s, v4.4s, v29.4s |
| urshl v5.4s, v5.4s, v29.4s |
| uqxtn v6.4h, v4.4s |
| uqxtn2 v6.8h, v5.4s |
| str q6, [x13], #16 |
| |
| cbz w10, 3f |
| |
| ldr q16, [x12], #16 |
| umull v17.4s, v16.4h, v2.4h |
| umull2 v18.4s, v16.8h, v2.8h |
| umlal v17.4s, v6.4h, v3.4h |
| umlal2 v18.4s, v6.8h, v3.8h |
| urshr v17.4s, v17.4s, #4 |
| urshr v18.4s, v18.4s, #4 |
| uqxtn v16.4h, v17.4s |
| uqxtn2 v16.8h, v18.4s |
| str q16, [x14], #16 |
| 3: |
| ldur d7, [src, #2] |
| ldr d6, [src], #8 |
| umull v4.4s, v7.4h, v1.4h |
| umlal v4.4s, v6.4h, v0.4h |
| urshl v4.4s, v4.4s, v29.4s |
| uqxtn v6.4h, v4.4s |
| str d6, [x13], #8 |
| |
| cbz w10, 4f |
| |
| ldr d16, [x12], #8 |
| umull v17.4s, v16.4h, v2.4h |
| umlal v17.4s, v6.4h, v3.4h |
| urshr v17.4s, v17.4s, #4 |
| uqxtn v16.4h, v17.4s |
| str d16, [x14], #8 |
| 4: |
| subs height, height, #1 |
| mov w10, #1 |
| add src, src, src_stride |
| add dst, dst, #(VVC_MAX_PB_SIZE * 2) |
| eor tmp0, tmp0, tmp1 |
| eor tmp1, tmp0, tmp1 |
| eor tmp0, tmp0, tmp1 |
| b.ne 1b |
| |
| add sp, sp, #(VVC_MAX_PB_SIZE * 4) |
| ret |
| |
| .unreq dst |
| .unreq src |
| .unreq src_stride |
| .unreq height |
| .unreq mx |
| .unreq my |
| .unreq width |
| .unreq tmp0 |
| .unreq tmp1 |
| endfunc |
| |
| function ff_vvc_prof_grad_filter_8x_neon, export=1 |
| gh .req x0 |
| gv .req x1 |
| gstride .req x2 |
| src .req x3 |
| src_stride .req x4 |
| width .req w5 |
| height .req w6 |
| |
| lsl src_stride, src_stride, #1 |
| neg x7, src_stride |
| 1: |
| mov x10, src |
| mov w11, width |
| mov x12, gh |
| mov x13, gv |
| 2: |
| ldur q0, [x10, #2] |
| ldur q1, [x10, #-2] |
| subs w11, w11, #8 |
| ldr q2, [x10, src_stride] |
| ldr q3, [x10, x7] |
| sshr v0.8h, v0.8h, #6 |
| sshr v1.8h, v1.8h, #6 |
| sshr v2.8h, v2.8h, #6 |
| sshr v3.8h, v3.8h, #6 |
| sub v0.8h, v0.8h, v1.8h |
| sub v2.8h, v2.8h, v3.8h |
| st1 {v0.8h}, [x12], #16 |
| st1 {v2.8h}, [x13], #16 |
| add x10, x10, #16 |
| b.ne 2b |
| |
| subs height, height, #1 |
| add gh, gh, gstride, lsl #1 |
| add gv, gv, gstride, lsl #1 |
| add src, src, src_stride |
| b.ne 1b |
| ret |
| |
| .unreq gh |
| .unreq gv |
| .unreq gstride |
| .unreq src |
| .unreq src_stride |
| .unreq width |
| .unreq height |
| endfunc |
| |
| function vvc_bdof_grad_filter_8x_neon, export=0 |
| gh0 .req x0 |
| gh1 .req x1 |
| gv0 .req x2 |
| gv1 .req x3 |
| src0 .req x4 |
| src1 .req x5 |
| width .req w6 |
| height .req w7 |
| tbnz w6, #4, 16f |
| |
| 8: |
| ldur q0, [src0, #2] |
| ldur q1, [src0, #-2] |
| ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)] |
| ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)] |
| sshr v0.8h, v0.8h, #6 |
| sshr v1.8h, v1.8h, #6 |
| ldur q4, [src1, #2] |
| ldur q5, [src1, #-2] |
| sshr v2.8h, v2.8h, #6 |
| sshr v3.8h, v3.8h, #6 |
| ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)] |
| ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)] |
| // results of gradient_h0 |
| sub v0.8h, v0.8h, v1.8h |
| // results of gradient_v0 |
| sub v2.8h, v2.8h, v3.8h |
| |
| sshr v4.8h, v4.8h, #6 |
| sshr v5.8h, v5.8h, #6 |
| sshr v6.8h, v6.8h, #6 |
| sshr v7.8h, v7.8h, #6 |
| // results of gradient_h1 |
| sub v4.8h, v4.8h, v5.8h |
| // results of gradient_v1 |
| sub v6.8h, v6.8h, v7.8h |
| |
| // (gradient_h0 + gradient_h1) >> 1 |
| shadd v1.8h, v0.8h, v4.8h |
| // gradient_h0 - gradient_h1 |
| sub v5.8h, v0.8h, v4.8h |
| |
| // (gradient_v0 + gradient_v1) >> 1 |
| shadd v3.8h, v2.8h, v6.8h |
| // gradient_v0 - gradient_v1 |
| sub v7.8h, v2.8h, v6.8h |
| |
| st1 {v1.8h}, [gh0] |
| st1 {v5.8h}, [gh1] |
| st1 {v3.8h}, [gv0] |
| st1 {v7.8h}, [gv1] |
| |
| subs height, height, #1 |
| add gh0, gh0, #(BDOF_BLOCK_SIZE << 1) |
| add gv0, gv0, #(BDOF_BLOCK_SIZE << 1) |
| add src0, src0, #(VVC_MAX_PB_SIZE << 1) |
| add gh1, gh1, #(BDOF_BLOCK_SIZE << 1) |
| add gv1, gv1, #(BDOF_BLOCK_SIZE << 1) |
| add src1, src1, #(VVC_MAX_PB_SIZE << 1) |
| b.ne 8b |
| ret |
| |
| 16: |
| ldur q0, [src0, #2] |
| ldur q1, [src0, #18] |
| ldur q16, [src0, #-2] |
| sshr v0.8h, v0.8h, #6 |
| ldur q17, [src0, #14] |
| sshr v1.8h, v1.8h, #6 |
| ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)] |
| sshr v16.8h, v16.8h, #6 |
| ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]! |
| ldur q20, [src1, #2] |
| sshr v17.8h, v17.8h, #6 |
| ldur q21, [src1, #18] |
| sshr v2.8h, v2.8h, #6 |
| ldur q22, [src1, #-2] |
| sshr v3.8h, v3.8h, #6 |
| ldur q23, [src1, #14] |
| sshr v18.8h, v18.8h, #6 |
| ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)] |
| sshr v19.8h, v19.8h, #6 |
| ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]! |
| |
| // results of gradient_h0 |
| sub v0.8h, v0.8h, v16.8h |
| sub v1.8h, v1.8h, v17.8h |
| |
| // results of gradient_v0 |
| sub v2.8h, v2.8h, v18.8h |
| sub v3.8h, v3.8h, v19.8h |
| |
| sshr v20.8h, v20.8h, #6 |
| sshr v21.8h, v21.8h, #6 |
| sshr v22.8h, v22.8h, #6 |
| sshr v23.8h, v23.8h, #6 |
| |
| // results of gradient_h1 |
| sub v20.8h, v20.8h, v22.8h |
| sub v21.8h, v21.8h, v23.8h |
| |
| sshr v24.8h, v24.8h, #6 |
| sshr v25.8h, v25.8h, #6 |
| |
| // gradient_h0 - gradient_h1 |
| sub v22.8h, v0.8h, v20.8h |
| sub v23.8h, v1.8h, v21.8h |
| |
| // (gradient_h0 + gradient_h1) >> 1 |
| shadd v16.8h, v0.8h, v20.8h |
| shadd v17.8h, v1.8h, v21.8h |
| |
| st1 {v22.8h, v23.8h}, [gh1], #32 |
| |
| sshr v26.8h, v26.8h, #6 |
| sshr v27.8h, v27.8h, #6 |
| |
| st1 {v16.8h, v17.8h}, [gh0], #32 |
| |
| // results of gradient_v1 |
| sub v24.8h, v24.8h, v26.8h |
| sub v25.8h, v25.8h, v27.8h |
| |
| // (gradient_v0 + gradient_v1) >> 1 |
| shadd v18.8h, v2.8h, v24.8h |
| shadd v19.8h, v3.8h, v25.8h |
| |
| // gradient_v0 - gradient_v1 |
| sub v26.8h, v2.8h, v24.8h |
| sub v27.8h, v3.8h, v25.8h |
| |
| st1 {v18.8h,v19.8h}, [gv0], #32 |
| |
| subs height, height, #1 |
| st1 {v26.8h,v27.8h}, [gv1], #32 |
| |
| b.ne 16b |
| ret |
| |
| .unreq gh0 |
| .unreq gh1 |
| .unreq gv0 |
| .unreq gv1 |
| .unreq src0 |
| .unreq src1 |
| .unreq width |
| .unreq height |
| endfunc |
| |
| .macro vvc_apply_bdof_block_8x bit_depth |
| dst .req x0 |
| dst_stride .req x1 |
| src0 .req x2 |
| src1 .req x3 |
| gh .req x4 |
| gv .req x5 |
| vx .req x6 |
| vy .req x7 |
| |
| ldr w8, [sp] |
| mov x12, #(BDOF_BLOCK_SIZE * 2) |
| mov x14, #(VVC_MAX_PB_SIZE * 2) |
| .if \bit_depth >= 10 |
| // clip pixel |
| mov w15, #((1 << \bit_depth) - 1) |
| dup v19.8h, w15 |
| .endif |
| |
| 0: |
| ldr s0, [vx], #(2 * BDOF_MIN_BLOCK_SIZE) |
| ldr s1, [vy], #(2 * BDOF_MIN_BLOCK_SIZE) |
| mov w13, #(BDOF_MIN_BLOCK_SIZE) |
| 1: |
| ld1 {v5.8h}, [src0], x14 |
| ld1 {v6.8h}, [src1], x14 |
| |
| saddl v17.4s, v5.4h, v6.4h |
| ld1 {v4.8h}, [gv], x12 |
| saddl2 v16.4s, v5.8h, v6.8h |
| ld1 {v2.8h}, [gh], x12 |
| smlal v17.4s, v4.4h, v1.h[0] |
| smlal2 v16.4s, v4.8h, v1.h[1] |
| smlal v17.4s, v2.4h, v0.h[0] |
| smlal2 v16.4s, v2.8h, v0.h[1] |
| |
| sqrshrun v5.4h, v17.4s, #(15 - \bit_depth) |
| sqrshrun2 v5.8h, v16.4s, #(15 - \bit_depth) |
| subs w13, w13, #1 |
| .if \bit_depth == 8 |
| sqxtun v5.8b, v5.8h |
| st1 {v5.8b}, [dst], dst_stride |
| .else |
| smin v5.8h, v5.8h, v19.8h |
| st1 {v5.8h}, [dst], dst_stride |
| .endif |
| b.ne 1b |
| |
| subs w8, w8, #(BDOF_MIN_BLOCK_SIZE) |
| b.ne 0b |
| ret |
| |
| .unreq dst |
| .unreq dst_stride |
| .unreq src0 |
| .unreq src1 |
| .unreq gh |
| .unreq gv |
| .unreq vx |
| .unreq vy |
| .endm |
| |
| function vvc_apply_bdof_block_8x_8_neon, export=0 |
| vvc_apply_bdof_block_8x 8 |
| endfunc |
| |
| function vvc_apply_bdof_block_8x_10_neon, export=0 |
| vvc_apply_bdof_block_8x 10 |
| endfunc |
| |
| function vvc_apply_bdof_block_8x_12_neon, export=0 |
| vvc_apply_bdof_block_8x 12 |
| endfunc |
| |
| .macro vvc_apply_bdof_block_16x bit_depth |
| dst .req x0 |
| dst_stride .req x1 |
| src0 .req x2 |
| src1 .req x3 |
| gh .req x4 |
| gv .req x5 |
| vx .req x6 |
| vy .req x7 |
| |
| ldr w8, [sp] |
| movi v7.4s, #(1 << (14 - \bit_depth)) |
| .if \bit_depth >= 10 |
| // clip pixel |
| mov w15, #((1 << \bit_depth) - 1) |
| movi v18.8h, #0 |
| dup v19.8h, w15 |
| .endif |
| |
| 0: |
| ld1r {v0.8h}, [vx], #2 |
| ld1r {v1.8h}, [vy], #2 |
| ld1r {v2.8h}, [vx], #2 |
| ld1r {v3.8h}, [vy], #2 |
| |
| mov w13, #(BDOF_MIN_BLOCK_SIZE) |
| |
| ld1r {v20.8h}, [vx], #2 |
| ld1r {v21.8h}, [vy], #2 |
| ld1r {v22.8h}, [vx], #2 |
| ld1r {v23.8h}, [vy], #2 |
| |
| ins v0.d[1], v2.d[1] |
| ins v1.d[1], v3.d[1] |
| ins v20.d[1], v22.d[1] |
| ins v21.d[1], v23.d[1] |
| 1: |
| ldp q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2) |
| ldp q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2) |
| smull v3.4s, v0.4h, v2.4h |
| smull2 v16.4s, v0.8h, v2.8h |
| smlal v3.4s, v1.4h, v4.4h |
| smlal2 v16.4s, v1.8h, v4.8h |
| |
| ldp q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2) |
| ldp q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2) |
| |
| smull v23.4s, v20.4h, v22.4h |
| smull2 v27.4s, v20.8h, v22.8h |
| smlal v23.4s, v21.4h, v24.4h |
| smlal2 v27.4s, v21.8h, v24.8h |
| |
| saddl v2.4s, v5.4h, v6.4h |
| add v2.4s, v2.4s, v7.4s |
| add v2.4s, v2.4s, v3.4s |
| saddl2 v4.4s, v5.8h, v6.8h |
| add v4.4s, v4.4s, v7.4s |
| add v4.4s, v4.4s, v16.4s |
| |
| saddl v22.4s, v25.4h, v26.4h |
| add v22.4s, v22.4s, v7.4s |
| add v22.4s, v22.4s, v23.4s |
| saddl2 v24.4s, v25.8h, v26.8h |
| add v24.4s, v24.4s, v7.4s |
| add v24.4s, v24.4s, v27.4s |
| |
| sqshrn v5.4h, v2.4s, #(15 - \bit_depth) |
| sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth) |
| sqshrn v25.4h, v22.4s, #(15 - \bit_depth) |
| sqshrn2 v25.8h, v24.4s, #(15 - \bit_depth) |
| |
| subs w13, w13, #1 |
| .if \bit_depth == 8 |
| sqxtun v5.8b, v5.8h |
| sqxtun2 v5.16b, v25.8h |
| str q5, [dst] |
| .else |
| smin v5.8h, v5.8h, v19.8h |
| smax v5.8h, v5.8h, v18.8h |
| smin v25.8h, v25.8h, v19.8h |
| smax v25.8h, v25.8h, v18.8h |
| stp q5, q25, [dst] |
| .endif |
| add dst, dst, dst_stride |
| b.ne 1b |
| |
| subs w8, w8, #(BDOF_MIN_BLOCK_SIZE) |
| b.ne 0b |
| ret |
| |
| .unreq dst |
| .unreq dst_stride |
| .unreq src0 |
| .unreq src1 |
| .unreq gh |
| .unreq gv |
| .unreq vx |
| .unreq vy |
| .endm |
| |
| function vvc_apply_bdof_block_16x_8_neon, export=0 |
| vvc_apply_bdof_block_16x 8 |
| endfunc |
| |
| function vvc_apply_bdof_block_16x_10_neon, export=0 |
| vvc_apply_bdof_block_16x 10 |
| endfunc |
| |
| function vvc_apply_bdof_block_16x_12_neon, export=0 |
| vvc_apply_bdof_block_16x 12 |
| endfunc |
| |
| const bdof_vx_vy_8x_tbl |
| .byte 0, 1, 16, 16, 16, 16, 8, 9 |
| .byte 6, 7, 16, 16, 16, 16, 14, 15 |
| endconst |
| |
| const bdof_vx_vy_16x_tbl |
| .byte 0, 1, 64, 64, 64, 64, 8, 9 |
| .byte 6, 7, 64, 64, 64, 64, 16, 17 |
| .byte 14, 15, 64, 64, 64, 64, 24, 25 |
| .byte 22, 23, 64, 64, 64, 64, 30, 31 |
| endconst |
| |
| // line(-1), line0, line1, line2, line3, line4 |
// line3 and line4 become line(-1) and line0 in the next block.
| .macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| mov \tmp0\().16b, v28.16b |
| mov \tmp1\().16b, v29.16b |
| mov \tmp2\().16b, v30.16b |
| mov \tmp3\().16b, v31.16b |
| mov \tmp4\().16b, v8.16b |
| .endm |
| |
| .macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| add v25.4s, v25.4s, \tmp0\().4s |
| add v27.4s, v27.4s, \tmp1\().4s |
| add v23.4s, v23.4s, \tmp2\().4s |
| sub v26.4s, v26.4s, \tmp3\().4s |
| sub v24.4s, v24.4s, \tmp4\().4s |
| .endm |
| |
| .macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst |
| tbl \tmp0\().16b, { \src\().16b }, v0.16b |
| saddl \tmp1\().4s, \tmp0\().4h, \src\().4h |
| saddl2 \dst\().4s, \tmp0\().8h, \src\().8h |
| addp \dst\().4s, \tmp1\().4s, \dst\().4s |
| .endm |
| |
| .macro bdof_vx_vy_sign src, tmp0, tmp1, dst |
| cmlt \tmp0\().8h, \src\().8h, #0 |
| cmgt \tmp1\().8h, \src\().8h, #0 |
| sub \dst\().8h, \tmp0\().8h, \tmp1\().8h |
| .endm |
| |
| .macro bdof_vx_vy_clip_mask src, max, min, mask, dst |
| smin \src\().4s, \src\().4s, \max\().4s |
| smax \src\().4s, \src\().4s, \min\().4s |
| cmgt \mask\().4s, \mask\().4s, #0 |
| and \dst\().16b, \src\().16b, \mask\().16b |
| .endm |
| |
| .macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| mov \tmp0\().16b, v29.16b |
| mov \tmp1\().16b, v30.16b |
| mov \tmp2\().16b, v31.16b |
| mov \tmp3\().16b, v8.16b |
| mov \tmp4\().16b, v9.16b |
| .endm |
| |
| .macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4 |
| add v25.4s, v25.4s, \tmp0\().4s |
| add v24.4s, v24.4s, \tmp1\().4s |
| add v26.4s, v26.4s, \tmp2\().4s |
| sub v28.4s, v28.4s, \tmp3\().4s |
| sub v27.4s, v27.4s, \tmp4\().4s |
| .endm |
| |
| .macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst |
| tbl \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b |
| tbl v2.16b, {\src0\().16b, \src1\().16b}, v1.16b |
| saddl \tmp1\().4s, \tmp0\().4h, \src0\().4h |
| saddl \tmp2\().4s, v2.4h, \src1\().4h |
| saddl2 \tmp0\().4s, \tmp0\().8h, \src0\().8h |
| saddl2 \dst\().4s, v2.8h, \src1\().8h |
| addp \tmp0\().4s, \tmp1\().4s, \tmp0\().4s |
| addp \dst\().4s, \tmp2\().4s, \dst\().4s |
| addp \dst\().4s, \tmp0\().4s, \dst\().4s |
| .endm |
| |
| /* |
| * Line tricks: |
 * We need 6 lines of information, from 4N-1, 4N, 4N+1 to 4N+4. 4N-1
 * and 4N+0 were processed in the previous group, so they can be reused.
| * |
| * (4N-1) [xxxxxxxxxxxxx] <--- reuse |
| * (4N) [xxxxxxxxxxxxx] <--- reuse |
| * (4N+1) [xxxxxxxxxxxxx] |
| * (4N+2) [xxxxxxxxxxxxx] |
| * (4N+3) [xxxxxxxxxxxxx] ---> save for reuse |
| * (4N+4) [xxxxxxxxxxxxx] ---> save for reuse |
| * |
| * Special case: |
| * 1. Line -1 needs to duplicate line 0. |
| * 2. Last line +1 needs to duplicate the last line. |
| * |
| * --------------------------------------------------------------------- |
| * Pixel tricks: |
| * |
| * [C-1, C0, C1, C2, ... C16] |
| * |
| * For each line, we need to sum parameters for 4 * 6 pixels: |
| * - C-1 + C0 + C1 + C2 + C3 + C4 |
| * - C3 + C4 + C5 + C6 + C7 + C8 |
| * - C7 + C8 + C9 + C10 + C11 + C12 |
| * - C11 + C12 + C13 + C14 + C15 + C16 |
| * |
| * C-1 is C0, C16 is C15, so we can do: |
| * |
| * [C0, C1, C2, C3, | C4, C5, C6, C7, | C8, ... C15] |
| * + | + | |
| * [C0, 0, 0, C4, | C3, 0, 0, C8, | C7, ... C15] |
| * |
| * 8x is similar. |
| * ---------------------------------------------------------------------- |
| * x0: const int16_t *_src0, |
| * x1: const int16_t *_src1, |
| * x2: const int16_t *gradient_h, |
| * x3: const int16_t *gradient_v, |
| * x4: int16_t vx[16], |
| * x5: int16_t vy[16], |
| * w6: int block_h |
| */ |
// Derive the BDOF refinement vectors (vx, vy) for an 8-pixel-wide block,
// one 4-line group per outer-loop iteration (see line/pixel tricks above).
// Accumulators (one .4s lane pair per 4x4 sub-block):
//   v25 = sum|gh|, v27 = sum|gv|, v26 = sum(sign(gh)*diff),
//   v24 = sum(sign(gv)*diff), v23 = sum(sign(gv)*gh)
// Reuse registers: v18-v22 = line(-1)/line0 terms, v5-v7/v16/v17 = saved line3/4.
function vvc_derive_bdof_vx_vy_8x_neon, export=0
stp d11, d10, [sp, #-0x20]!             // save callee-saved d8-d11 (AAPCS64)
stp d9, d8, [sp, #0x10]

movrel x11, bdof_vx_vy_8x_tbl
ldr q0, [x11]                           // table
mvni v2.4s, #30                         // -31, for log2
movi v3.4s, #15                         // clip to 15
mvni v4.4s, #14                         // clip to -15

mov w11, #0x8                           // vx/vy output stride in bytes
mov w12, w6                             // y = block_h
b 4f

1:
// save line4 results
bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
2:
// horizontal reduction of the five accumulators
addp v25.4s, v25.4s, v25.4s
addp v27.4s, v27.4s, v27.4s
addp v26.4s, v26.4s, v26.4s
addp v23.4s, v23.4s, v23.4s
addp v24.4s, v24.4s, v24.4s

// vx = clip((sum_sign_gh_diff << 2) >> log2(sum_abs_gh), -15, 15)
clz v28.4s, v25.4s
add v28.4s, v28.4s, v2.4s               // log2
shl v26.4s, v26.4s, #0x2
sshl v26.4s, v26.4s, v28.4s

bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
sqxtn v26.4h, v25.4s
st1 {v26.s}[0], [x4], x11               // store vx pair

subs x12, x12, #(BDOF_MIN_BLOCK_SIZE)   // y -= BDOF_MIN_BLOCK_SIZE

// vy = clip(((sum_sign_gv_diff << 2) - ((vx * sum_sign_gv_gh) >> 1))
//           >> log2(sum_abs_gv), -15, 15)
clz v26.4s, v27.4s
add v26.4s, v26.4s, v2.4s
shl v24.4s, v24.4s, #0x2
mul v23.4s, v25.4s, v23.4s
sshr v23.4s, v23.4s, #0x1
sub v23.4s, v24.4s, v23.4s
sshl v23.4s, v23.4s, v26.4s

bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
sqxtn v23.4h, v23.4s
st1 {v23.s}[0], [x5], x11               // store vy pair

b.eq 16f                                // y == 0: done
4:
mov x15, #0x0                           // dy, inner loop

// clear the five per-group accumulators
movi v25.2d, #0
movi v27.2d, #0
movi v23.2d, #0
movi v26.2d, #0
movi v24.2d, #0
b 8f

5:
// add line(-1) and line0 from previous results
bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
add x15, x15, #1
8:
cmp w12, w6
b.hs 9f
// y < block_h && dy == 0, reuse previous results
cbz x15, 5b
9:
ldr q28, [x0]                           // src0
ldr q29, [x1]                           // src1
ldr q30, [x2], #(BDOF_BLOCK_SIZE * 2)   // (gh0 + gh1) >> 1
ldr q31, [x3], #(BDOF_BLOCK_SIZE * 2)   // (gv0 + gv1) >> 1
add x0, x0, #(VVC_MAX_PB_SIZE * 2)
add x1, x1, #(VVC_MAX_PB_SIZE * 2)

sshr v28.8h, v28.8h, #0x4
sshr v29.8h, v29.8h, #0x4
sub v8.8h, v28.8h, v29.8h               // diff

abs v28.8h, v30.8h                      // |gh|
abs v29.8h, v31.8h                      // |gv|

bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29

bdof_vx_vy_sign v30, v9, v10, v9        // sign(gh)
bdof_vx_vy_sign v31, v10, v31, v31      // sign(gv)

mul v30.8h, v31.8h, v30.8h              // sign(gv) * gh
mul v9.8h, v9.8h, v8.8h                 // sign(gh) * diff
mul v8.8h, v31.8h, v8.8h                // sign(gv) * diff

bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8

bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8

cmp w12, w6
b.ne 10f
cbnz x15, 10f

// y == block_h && dy == 0, duplicate first line results
bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
add x15, x15, #0x1
b 9b
10:
cmp x15, #(BDOF_MIN_BLOCK_SIZE - 1)
b.eq 11f
cmp x15, #(BDOF_MIN_BLOCK_SIZE)
b.ne 12f
b 1b                                    // dy == 4: save line4 then reduce
11:
// dy == BDOF_MIN_BLOCK_SIZE - 1:
// last group (y == BDOF_MIN_BLOCK_SIZE) -> pad bottom and break,
// otherwise save line3 results for the next group
cmp x12, #(BDOF_MIN_BLOCK_SIZE)
b.eq 13f
bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
12:
add x15, x15, #1                        // dy++
b 8b
13:
// y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
// padding bottom then break
bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
b 2b
16:
ldp d9, d8, [sp, #0x10]                 // restore callee-saved d8-d11
ldp d11, d10, [sp], #0x20
ret
endfunc
| |
| /* |
| * x0: const int16_t *_src0, |
| * x1: const int16_t *_src1, |
| * x2: const int16_t *gradient_h, |
| * x3: const int16_t *gradient_v, |
| * x4: int16_t vx[16], |
| * x5: int16_t vy[16], |
| * w6: int block_h |
| */ |
// Derive the BDOF refinement vectors (vx, vy) for a 16-pixel-wide block.
// Same control flow as the 8x variant above, but processes four 4x4
// sub-blocks per line pass (accumulators v24-v28, reuse lines in
// v19-v23 (line -1/0) and v6/v7/v16-v18 (saved line3/4)).
function vvc_derive_bdof_vx_vy_16x_neon, export=0
stp d15, d14, [sp, #-0x40]!             // save callee-saved d8-d15 (AAPCS64)
stp d13, d12, [sp, #0x10]
stp d11, d10, [sp, #0x20]
stp d9, d8, [sp, #0x30]

movrel x12, bdof_vx_vy_16x_tbl
ldp q0, q1, [x12]                       // table
mov w13, w6                             // y = block_h
b 4f

1:
// save line4
bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
2:
// vx = clip((v28 << 2) >> log2(v25), -15, 15)
clz v3.4s, v25.4s
mvni v5.4s, #0x1e                       // -31
add v3.4s, v3.4s, v5.4s                 // -log2()
shl v4.4s, v28.4s, #0x2
sshl v3.4s, v4.4s, v3.4s

movi v28.4s, #0xf                       // clip to 15
mvni v29.4s, #0xe                       // clip to -15
bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
sqxtn v4.4h, v3.4s
st1 {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)     // store 4 vx values

subs x13, x13, #(BDOF_MIN_BLOCK_SIZE)   // y -= BDOF_MIN_BLOCK_SIZE

// vy = clip(((v27 << 2) - ((vx * v26) >> 1)) >> log2(v24), -15, 15)
clz v4.4s, v24.4s
add v4.4s, v4.4s, v5.4s                 // -log2()
shl v5.4s, v27.4s, #0x2
mul v3.4s, v3.4s, v26.4s
sshr v3.4s, v3.4s, #0x1
sub v3.4s, v5.4s, v3.4s
sshl v3.4s, v3.4s, v4.4s

bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
sqxtn v3.4h, v3.4s
st1 {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)     // store 4 vy values
b.eq 16f                                // y == 0: done
4:
mov w14, #0x0                           // dy, inner loop

// clear the five per-group accumulators
movi v25.2d, #0
movi v24.2d, #0
movi v26.2d, #0
movi v28.2d, #0
movi v27.2d, #0
b 8f

5:
// add line(-1) and line0 from previous results
bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
add w14, w14, #0x1

8:
cmp w13, w6
b.hs 9f
// y < block_h && dy == 0, reuse previous results
cbz w14, 5b
9:
ld1 {v29.8h, v30.8h}, [x0]              // src0
sshr v31.8h, v29.8h, #0x4
ld1 {v8.8h, v9.8h}, [x1]                // src1
sshr v10.8h, v8.8h, #0x4
ldp q13, q8, [x2], #32                  // (gh0 + gh1) >> 1
sshr v29.8h, v30.8h, #0x4
sshr v30.8h, v9.8h, #0x4
ldp q5, q3, [x3], #32                   // (gv0 + gv1) >> 1
sub v31.8h, v31.8h, v10.8h              // diff, left half
sub v4.8h, v29.8h, v30.8h               // diff, right half

abs v29.8h, v13.8h                      // |gh|, left
abs v30.8h, v8.8h                       // |gh|, right
abs v9.8h, v5.8h                        // |gv|, left
abs v10.8h, v3.8h                       // |gv|, right

add x0, x0, #(VVC_MAX_PB_SIZE * 2)      // advance src pointers one line
add x1, x1, #(VVC_MAX_PB_SIZE * 2)

bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30

bdof_vx_vy_sign v13, v9, v10, v9        // sign(gh), left/right
bdof_vx_vy_sign v8, v10, v11, v10
bdof_vx_vy_sign v5, v11, v5, v5         // sign(gv), left/right
bdof_vx_vy_sign v3, v11, v3, v3

mul v11.8h, v5.8h, v13.8h               // sign(gv) * gh, left
mul v12.8h, v3.8h, v8.8h                // sign(gv) * gh, right
mul v8.8h, v9.8h, v31.8h                // sign(gh) * diff, left
mul v9.8h, v10.8h, v4.8h                // sign(gh) * diff, right
mul v13.8h, v5.8h, v31.8h               // sign(gv) * diff, left
mul v14.8h, v3.8h, v4.8h                // sign(gv) * diff, right

bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9

bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
// check whether padding top
cmp w13, w6
b.ne 10f
cbnz w14, 10f
// y == block_h && dy == 0, padding top
bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
add w14, w14, #0x1
b 9b
10:
cmp w14, #(BDOF_MIN_BLOCK_SIZE - 1)
b.eq 11f
cmp w14, #(BDOF_MIN_BLOCK_SIZE)
b.ne 12f
// save line4
b 1b
11:
// dy == BDOF_MIN_BLOCK_SIZE - 1:
// last group (y == BDOF_MIN_BLOCK_SIZE) -> padding bottom,
// otherwise save line3 results for the next group
cmp x13, #(BDOF_MIN_BLOCK_SIZE)
b.eq 13f
// save line3
bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
12:
add w14, w14, #0x1                      // dy++
b 8b
13:
// padding bottom
bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
b 2b
16:
// restore
ldp d9, d8, [sp, #0x30]
ldp d11, d10, [sp, #0x20]
ldp d13, d12, [sp, #0x10]
ldp d15, d14, [sp], #0x40
ret
endfunc
| |
// 10-bit entry point: set the bit depth in w6 and fall into the shared
// body at label 0: inside ff_vvc_apply_bdof_8_neon below.
function ff_vvc_apply_bdof_10_neon, export=1
mov w6, #10
b 0f
endfunc
| |
// 12-bit entry point: set the bit depth in w6 and fall into the shared
// body at label 0: inside ff_vvc_apply_bdof_8_neon below.
function ff_vvc_apply_bdof_12_neon, export=1
mov w6, #12
b 0f
endfunc
| |
| // int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2] |
| // int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2] |
| // int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE]; |
| #define APPLY_BDOF_STACK_SIZE ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4) |
| #define GRADIENT_H0_OFFSET 2 |
| #define GRADIENT_H1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2) |
| #define GRADIENT_V0_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2) |
| #define GRADIENT_V1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2) |
| #define VX_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8) |
| #define VY_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2) |
// Apply BDOF: compute gradients into stack buffers, derive (vx, vy),
// then run the bit-depth-specific apply kernel.
// Shared body: the 10/12-bit entry points above branch to label 0: with
// the bit depth already loaded in w6.
// In: x0 dst, x1 dst_stride, x2 src0, x3 src1, w4 block_w, w5 block_h,
//     w6 bit_depth.
function ff_vvc_apply_bdof_8_neon, export=1
mov w6, #8
0:
stp x19, x20, [sp, #-0x40]!             // save callee-saved x19-x25, lr
stp x21, x22, [sp, #0x10]
stp x23, x24, [sp, #0x20]
stp x25, x30, [sp, #0x30]

sub sp, sp, #APPLY_BDOF_STACK_SIZE      // gradient/vx/vy scratch (see defines)
mov w19, w6                             // bit_depth
mov x20, x0                             // dst
mov x21, x1                             // dst_stride
mov x22, x2                             // src0
mov x23, x3                             // src1
mov w24, w4                             // block_w
mov w25, w5                             // block_h

// int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
add x0, sp, #GRADIENT_H0_OFFSET
add x1, sp, #GRADIENT_H1_OFFSET
add x2, sp, #GRADIENT_V0_OFFSET
add x3, sp, #GRADIENT_V1_OFFSET
mov x4, x22
mov x5, x23
mov w6, w24
mov w7, w25
bl vvc_bdof_grad_filter_8x_neon

// set up args for the derive call, shared by both width paths
cmp w24, #8
mov x0, x22                             // src0
mov x1, x23                             // src1
add x2, sp, #GRADIENT_H0_OFFSET         // gh0
add x3, sp, #GRADIENT_V0_OFFSET         // gv0
add x4, sp, #VX_OFFSET                  // vx
add x5, sp, #VY_OFFSET                  // vy
mov w6, w25                             // block_h

b.gt 16f                                // block_w > 8: 16x path

bl vvc_derive_bdof_vx_vy_8x_neon
cmp w19, #10                            // check bitdepth
mov x0, x20                             // dst
mov x1, x21                             // dst_stride
mov x2, x22                             // src0
mov x3, x23                             // src1
add x4, sp, #GRADIENT_H1_OFFSET         // gh1
add x5, sp, #GRADIENT_V1_OFFSET         // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
str w25, [sp]                           // block_h passed on the stack
b.eq 1f
b.gt 2f
// 8bit
0:
bl vvc_apply_bdof_block_8x_8_neon
b 32f
1:
// 10bit
bl vvc_apply_bdof_block_8x_10_neon
b 32f
2:
// 12bit
bl vvc_apply_bdof_block_8x_12_neon
b 32f
16:
bl vvc_derive_bdof_vx_vy_16x_neon

cmp w19, #10                            // check bitdepth
mov x0, x20                             // dst
mov x1, x21                             // dst_stride
mov x2, x22                             // src0
mov x3, x23                             // src1
add x4, sp, #GRADIENT_H1_OFFSET         // gh1
add x5, sp, #GRADIENT_V1_OFFSET         // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
str w25, [sp]                           // block_h passed on the stack
b.eq 17f
b.gt 18f
// 8bit
bl vvc_apply_bdof_block_16x_8_neon
b 32f
17:
// 10bit
bl vvc_apply_bdof_block_16x_10_neon
b 32f
18:
// 12bit
bl vvc_apply_bdof_block_16x_12_neon
32:
add sp, sp, #APPLY_BDOF_STACK_SIZE      // release scratch, restore regs
ldp x25, x30, [sp, #0x30]
ldp x23, x24, [sp, #0x20]
ldp x21, x22, [sp, #0x10]
ldp x19, x20, [sp], #0x40
ret
endfunc
| |
| #undef APPLY_BDOF_STACK_SIZE |
| #undef GRADIENT_H0_OFFSET |
| #undef GRADIENT_H1_OFFSET |
| #undef GRADIENT_V0_OFFSET |
| #undef GRADIENT_V1_OFFSET |
| #undef VX_OFFSET |
| #undef VY_OFFSET |
| |
| #define VVC_MAX_PB_SIZE 128 |
| |
.macro put_luma_h_x8_vector_filter shift
// 8-tap horizontal luma filter for 8 output pixels.
// In:  v0.8h  - the 8 filter coefficients (sign-extended from 8 bytes)
//      v20.8h, v21.8h - 16 consecutive source samples; v21 only feeds the
//      shifted windows v1..v6, v17 built with ext below
// Out: v24.4h, v25.4h - 8 filtered results, narrowed with sqshrn #\shift
// Clobbers: v1-v6, v17
ext v1.16b, v20.16b, v21.16b, #2        // window shifted by 1 sample
ext v2.16b, v20.16b, v21.16b, #4        // ... by 2
ext v3.16b, v20.16b, v21.16b, #6
ext v4.16b, v20.16b, v21.16b, #8
ext v5.16b, v20.16b, v21.16b, #10
ext v6.16b, v20.16b, v21.16b, #12
ext v17.16b, v20.16b, v21.16b, #14      // ... by 7
smull v24.4s, v20.4h, v0.h[0]           // accumulate taps in 32 bits
smull2 v25.4s, v20.8h, v0.h[0]
smlal v24.4s, v1.4h, v0.h[1]
smlal2 v25.4s, v1.8h, v0.h[1]
smlal v24.4s, v2.4h, v0.h[2]
smlal2 v25.4s, v2.8h, v0.h[2]
smlal v24.4s, v3.4h, v0.h[3]
smlal2 v25.4s, v3.8h, v0.h[3]
smlal v24.4s, v4.4h, v0.h[4]
smlal2 v25.4s, v4.8h, v0.h[4]
smlal v24.4s, v5.4h, v0.h[5]
smlal2 v25.4s, v5.8h, v0.h[5]
smlal v24.4s, v6.4h, v0.h[6]
smlal2 v25.4s, v6.8h, v0.h[6]
smlal v24.4s, v17.4h, v0.h[7]
smlal2 v25.4s, v17.8h, v0.h[7]
sqshrn v24.4h, v24.4s, #(\shift)        // saturating narrow to int16
sqshrn v25.4h, v25.4s, #(\shift)
.endm
| |
// Horizontal 8-tap luma filter, width 8.
// x0 dst (int16_t, stride VVC_MAX_PB_SIZE*2), x1 src, x2 src_stride,
// w3 height, x4 filter coefficients (8 x int8).
.macro put_luma_h8_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)          // dst stride in bytes
ld1 {v0.8b}, [x4]
sub x1, x1, #6                          // back up 3 samples for taps 0..2
sxtl v0.8h, v0.8b                       // widen coefficients to int16
1:
ld1 {v20.8h, v21.8h}, [x1], x2
put_luma_h_x8_vector_filter \shift
subs w3, w3, #1
st1 {v24.4h, v25.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Horizontal 8-tap luma filter, width 16: two 8-wide passes per row,
// sliding the source window (v20/v21) between them.
.macro put_luma_h16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
ld1 {v0.8b}, [x4]
sub x9, x9, #16                         // second store advances 16 already
sub x1, x1, #6                          // back up 3 samples for taps 0..2
sxtl v0.8h, v0.8b
1:
ld1 {v20.8h, v21.8h, v22.8h}, [x1], x2
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b                    // slide window to second half
mov v21.16b, v22.16b
st1 {v24.4h, v25.4h}, [x0], #16
put_luma_h_x8_vector_filter \shift
subs w3, w3, #1
st1 {v24.4h, v25.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Horizontal 8-tap luma filter for widths that are a multiple of 16
// (w6 = width): inner loop produces 16 outputs per iteration.
.macro put_luma_h_x16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
ld1 {v0.8b}, [x4]
sub x9, x9, w6, uxtw #1                 // dst row advance minus bytes stored
sub x2, x2, w6, uxtw #1                 // src row advance minus bytes read
sxtl v0.8h, v0.8b
sub x1, x1, #6                          // back up 3 samples for taps 0..2
sub x2, x2, #16                         // account for the priming load below
1:
ld1 {v20.8h}, [x1], #16                 // prime the sliding window
mov w8, w6                              // x = width
2:
ld1 {v21.8h, v22.8h}, [x1], #32
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b                    // slide window to next 8 outputs
mov v21.16b, v22.16b
st1 {v24.4h, v25.4h}, [x0], #16
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b
subs w8, w8, #16
st1 {v24.4h, v25.4h}, [x0], #16
b.gt 2b
subs w3, w3, #1
add x0, x0, x9                          // move to next row
add x1, x1, x2
b.gt 1b
ret
.endm
| |
// Width-8 horizontal luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_h8_10_neon, export=1
put_luma_h8_xx_neon 2
endfunc

function ff_vvc_put_luma_h8_12_neon, export=1
put_luma_h8_xx_neon 4
endfunc
| |
// Width-16 horizontal luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_h16_10_neon, export=1
put_luma_h16_xx_neon 2
endfunc

function ff_vvc_put_luma_h16_12_neon, export=1
put_luma_h16_xx_neon 4
endfunc
| |
// Multiple-of-16-width horizontal luma filter, instantiated with
// shift 2 (10-bit) and shift 4 (12-bit).
function ff_vvc_put_luma_h_x16_10_neon, export=1
put_luma_h_x16_xx_neon 2
endfunc

function ff_vvc_put_luma_h_x16_12_neon, export=1
put_luma_h_x16_xx_neon 4
endfunc
| |
// Vertical 8-tap luma filter, width 4, 4 rows per loop iteration.
// x0 dst (stride VVC_MAX_PB_SIZE*2), x1 src, x2 src_stride, w3 height,
// x5 filter coefficients (8 x int8).
// v20-v26 hold the 7-row history; each new row (v27..v30) completes an
// 8-row window, then the history is shifted down by 4 rows.
.macro put_luma_v4_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
ld1 {v20.4h}, [x1], x2                  // preload rows 0..6
ld1 {v21.4h}, [x1], x2
ld1 {v22.4h}, [x1], x2
ld1 {v23.4h}, [x1], x2
ld1 {v24.4h}, [x1], x2
ld1 {v25.4h}, [x1], x2
ld1 {v26.4h}, [x1], x2
1:
ld1 {v27.4h}, [x1], x2

// output row 0: taps over rows v20..v27, split over two accumulators
smull v1.4s, v20.4h, v0.h[0]
smull v2.4s, v21.4h, v0.h[1]
smlal v1.4s, v22.4h, v0.h[2]
smlal v2.4s, v23.4h, v0.h[3]
smlal v1.4s, v24.4h, v0.h[4]
smlal v2.4s, v25.4h, v0.h[5]
smlal v1.4s, v26.4h, v0.h[6]
smlal v2.4s, v27.4h, v0.h[7]

ld1 {v28.4h}, [x1], x2

// output row 1: rows v21..v28
smull v3.4s, v21.4h, v0.h[0]
smull v4.4s, v22.4h, v0.h[1]
smlal v3.4s, v23.4h, v0.h[2]
smlal v4.4s, v24.4h, v0.h[3]
smlal v3.4s, v25.4h, v0.h[4]
smlal v4.4s, v26.4h, v0.h[5]
smlal v3.4s, v27.4h, v0.h[6]
smlal v4.4s, v28.4h, v0.h[7]
add v1.4s, v1.4s, v2.4s
add v3.4s, v3.4s, v4.4s
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v3.4h, v3.4s, #(\shift)

st1 {v1.4h}, [x0], x9
ld1 {v29.4h}, [x1], x2
st1 {v3.4h}, [x0], x9

// output row 2: rows v22..v29
smull v1.4s, v22.4h, v0.h[0]
smull v2.4s, v23.4h, v0.h[1]
smlal v1.4s, v24.4h, v0.h[2]
smlal v2.4s, v25.4h, v0.h[3]
smlal v1.4s, v26.4h, v0.h[4]
smlal v2.4s, v27.4h, v0.h[5]
smlal v1.4s, v28.4h, v0.h[6]
smlal v2.4s, v29.4h, v0.h[7]

ld1 {v30.4h}, [x1], x2

// output row 3: rows v23..v30
smull v3.4s, v23.4h, v0.h[0]
smull v4.4s, v24.4h, v0.h[1]
smlal v3.4s, v25.4h, v0.h[2]
smlal v4.4s, v26.4h, v0.h[3]
smlal v3.4s, v27.4h, v0.h[4]
smlal v4.4s, v28.4h, v0.h[5]
smlal v3.4s, v29.4h, v0.h[6]
smlal v4.4s, v30.4h, v0.h[7]
add v1.4s, v1.4s, v2.4s
add v3.4s, v3.4s, v4.4s
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v3.4h, v3.4s, #(\shift)

st1 {v1.4h}, [x0], x9

// rotate the 7-row history down by 4 rows
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v22.16b, v26.16b
mov v23.16b, v27.16b
mov v24.16b, v28.16b
mov v25.16b, v29.16b
mov v26.16b, v30.16b

subs w3, w3, #4
st1 {v3.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Width-4 vertical luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_v4_10_neon, export=1
put_luma_v4_xx_neon 2
endfunc

function ff_vvc_put_luma_v4_12_neon, export=1
put_luma_v4_xx_neon 4
endfunc
| |
// Vertical 8-tap luma filter, width 8, 4 rows per loop iteration.
// Same structure as put_luma_v4_xx_neon but each row is 8 samples wide,
// so every tap needs an smull/smull2 (low/high half) pair.
.macro put_luma_v8_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
ld1 {v20.8h}, [x1], x2                  // preload rows 0..6
ld1 {v21.8h}, [x1], x2
ld1 {v22.8h}, [x1], x2
ld1 {v23.8h}, [x1], x2
ld1 {v24.8h}, [x1], x2
ld1 {v25.8h}, [x1], x2
ld1 {v26.8h}, [x1], x2
1:
ld1 {v27.8h}, [x1], x2

// output row 0: rows v20..v27
smull v1.4s, v20.4h, v0.h[0]
smull2 v2.4s, v20.8h, v0.h[0]
smlal v1.4s, v21.4h, v0.h[1]
smlal2 v2.4s, v21.8h, v0.h[1]
smlal v1.4s, v22.4h, v0.h[2]
smlal2 v2.4s, v22.8h, v0.h[2]
smlal v1.4s, v23.4h, v0.h[3]
smlal2 v2.4s, v23.8h, v0.h[3]
smlal v1.4s, v24.4h, v0.h[4]
smlal2 v2.4s, v24.8h, v0.h[4]
smlal v1.4s, v25.4h, v0.h[5]
smlal2 v2.4s, v25.8h, v0.h[5]
smlal v1.4s, v26.4h, v0.h[6]
smlal2 v2.4s, v26.8h, v0.h[6]
smlal v1.4s, v27.4h, v0.h[7]
smlal2 v2.4s, v27.8h, v0.h[7]
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v2.4h, v2.4s, #(\shift)

ld1 {v28.8h}, [x1], x2
st1 {v1.4h-v2.4h}, [x0], x9

// output row 1: rows v21..v28
smull v3.4s, v21.4h, v0.h[0]
smull2 v4.4s, v21.8h, v0.h[0]
smlal v3.4s, v22.4h, v0.h[1]
smlal2 v4.4s, v22.8h, v0.h[1]
smlal v3.4s, v23.4h, v0.h[2]
smlal2 v4.4s, v23.8h, v0.h[2]
smlal v3.4s, v24.4h, v0.h[3]
smlal2 v4.4s, v24.8h, v0.h[3]
smlal v3.4s, v25.4h, v0.h[4]
smlal2 v4.4s, v25.8h, v0.h[4]
smlal v3.4s, v26.4h, v0.h[5]
smlal2 v4.4s, v26.8h, v0.h[5]
smlal v3.4s, v27.4h, v0.h[6]
smlal2 v4.4s, v27.8h, v0.h[6]
smlal v3.4s, v28.4h, v0.h[7]
smlal2 v4.4s, v28.8h, v0.h[7]
sqshrn v3.4h, v3.4s, #(\shift)
sqshrn v4.4h, v4.4s, #(\shift)

ld1 {v29.8h}, [x1], x2
st1 {v3.4h-v4.4h}, [x0], x9

// output row 2: rows v22..v29
smull v1.4s, v22.4h, v0.h[0]
smull2 v2.4s, v22.8h, v0.h[0]
smlal v1.4s, v23.4h, v0.h[1]
smlal2 v2.4s, v23.8h, v0.h[1]
smlal v1.4s, v24.4h, v0.h[2]
smlal2 v2.4s, v24.8h, v0.h[2]
smlal v1.4s, v25.4h, v0.h[3]
smlal2 v2.4s, v25.8h, v0.h[3]
smlal v1.4s, v26.4h, v0.h[4]
smlal2 v2.4s, v26.8h, v0.h[4]
smlal v1.4s, v27.4h, v0.h[5]
smlal2 v2.4s, v27.8h, v0.h[5]
smlal v1.4s, v28.4h, v0.h[6]
smlal2 v2.4s, v28.8h, v0.h[6]
smlal v1.4s, v29.4h, v0.h[7]
smlal2 v2.4s, v29.8h, v0.h[7]
sqshrn v1.4h, v1.4s, #(\shift)
sqshrn v2.4h, v2.4s, #(\shift)

ld1 {v30.8h}, [x1], x2
st1 {v1.4h-v2.4h}, [x0], x9

// output row 3: rows v23..v30
smull v3.4s, v23.4h, v0.h[0]
smull2 v4.4s, v23.8h, v0.h[0]
smlal v3.4s, v24.4h, v0.h[1]
smlal2 v4.4s, v24.8h, v0.h[1]
smlal v3.4s, v25.4h, v0.h[2]
smlal2 v4.4s, v25.8h, v0.h[2]
smlal v3.4s, v26.4h, v0.h[3]
smlal2 v4.4s, v26.8h, v0.h[3]
smlal v3.4s, v27.4h, v0.h[4]
smlal2 v4.4s, v27.8h, v0.h[4]
smlal v3.4s, v28.4h, v0.h[5]
smlal2 v4.4s, v28.8h, v0.h[5]
smlal v3.4s, v29.4h, v0.h[6]
smlal2 v4.4s, v29.8h, v0.h[6]
smlal v3.4s, v30.4h, v0.h[7]
smlal2 v4.4s, v30.8h, v0.h[7]
sqshrn v3.4h, v3.4s, #(\shift)
sqshrn v4.4h, v4.4s, #(\shift)

// rotate the 7-row history down by 4 rows
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v22.16b, v26.16b
mov v23.16b, v27.16b
mov v24.16b, v28.16b
mov v25.16b, v29.16b
mov v26.16b, v30.16b

subs w3, w3, #4
st1 {v3.4h-v4.4h}, [x0], x9
b.gt 1b
ret
.endm
| |
// Width-8 vertical luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_v8_10_neon, export=1
put_luma_v8_xx_neon 2
endfunc

function ff_vvc_put_luma_v8_12_neon, export=1
put_luma_v8_xx_neon 4
endfunc
| |
// 8-tap vertical filter for one 16-wide output row.
// In:  v1.8h - filter coefficients (callers rotate v0 into v1 with ext
//      so the tap order matches the rotated row registers)
//      v16/v17 .. v30/v31 - eight source rows, two registers per row
//      (left half in the even-position register, right half in the odd)
// Out: v6.8h, v7.8h - 16 filtered results, narrowed with sqshrn #\shift
// Clobbers: v2-v5
.macro put_luma_v_x16_vector_filter shift
// left half (v16, v18, ..., v30)
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
smlal v2.4s, v18.4h, v1.h[1]
smlal2 v3.4s, v18.8h, v1.h[1]
smlal v2.4s, v20.4h, v1.h[2]
smlal2 v3.4s, v20.8h, v1.h[2]
smlal v2.4s, v22.4h, v1.h[3]
smlal2 v3.4s, v22.8h, v1.h[3]
smlal v2.4s, v24.4h, v1.h[4]
smlal2 v3.4s, v24.8h, v1.h[4]
smlal v2.4s, v26.4h, v1.h[5]
smlal2 v3.4s, v26.8h, v1.h[5]
smlal v2.4s, v28.4h, v1.h[6]
smlal2 v3.4s, v28.8h, v1.h[6]
smlal v2.4s, v30.4h, v1.h[7]
smlal2 v3.4s, v30.8h, v1.h[7]

// right half (v17, v19, ..., v31)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v4.4s, v19.4h, v1.h[1]
smlal2 v5.4s, v19.8h, v1.h[1]
smlal v4.4s, v21.4h, v1.h[2]
smlal2 v5.4s, v21.8h, v1.h[2]
smlal v4.4s, v23.4h, v1.h[3]
smlal2 v5.4s, v23.8h, v1.h[3]
smlal v4.4s, v25.4h, v1.h[4]
smlal2 v5.4s, v25.8h, v1.h[4]
smlal v4.4s, v27.4h, v1.h[5]
smlal2 v5.4s, v27.8h, v1.h[5]
smlal v4.4s, v29.4h, v1.h[6]
smlal2 v5.4s, v29.8h, v1.h[6]
smlal v4.4s, v31.4h, v1.h[7]
smlal2 v5.4s, v31.8h, v1.h[7]

sqshrn v6.4h, v2.4s, #(\shift)          // saturating narrow to int16
sqshrn v7.4h, v4.4s, #(\shift)
sqshrn2 v6.8h, v3.4s, #(\shift)
sqshrn2 v7.8h, v5.4s, #(\shift)
.endm
| |
// Vertical 8-tap luma filter, width 16, 4 rows per loop iteration.
// Instead of moving 8 row-register pairs each step, the coefficient
// vector is rotated with ext (v1 = ror(v0, dy coeffs)) so the same
// register set can be reused; the history rotation happens once per
// 4-row group at the bottom of the loop.
.macro put_luma_v16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
ld1 {v16.8h-v17.8h}, [x1], x2           // preload rows 0..6
ld1 {v18.8h-v19.8h}, [x1], x2
ld1 {v20.8h-v21.8h}, [x1], x2
ld1 {v22.8h-v23.8h}, [x1], x2
ld1 {v24.8h-v25.8h}, [x1], x2
ld1 {v26.8h-v27.8h}, [x1], x2
ld1 {v28.8h-v29.8h}, [x1], x2
1:
mov v1.16b, v0.16b                      // dy == 0: unrotated coefficients
ld1 {v30.8h-v31.8h}, [x1], x2

put_luma_v_x16_vector_filter \shift

ld1 {v16.8h-v17.8h}, [x1], x2           // new row overwrites oldest slot
ext v1.16b, v0.16b, v0.16b, #14         // rotate coefficients for dy == 1
st1 {v6.8h-v7.8h}, [x0], x9

put_luma_v_x16_vector_filter \shift

ld1 {v18.8h-v19.8h}, [x1], x2
ext v1.16b, v0.16b, v0.16b, #12         // rotate coefficients for dy == 2
st1 {v6.8h-v7.8h}, [x0], x9

put_luma_v_x16_vector_filter \shift

ld1 {v20.8h-v21.8h}, [x1], x2
ext v1.16b, v0.16b, v0.16b, #10         // rotate coefficients for dy == 3
st1 {v6.8h-v7.8h}, [x0], x9

put_luma_v_x16_vector_filter \shift

subs w3, w3, #4
st1 {v6.8h-v7.8h}, [x0], x9

// swap slots so the register/tap correspondence matches the unrotated
// coefficient order again for the next group
mov v2.16b, v16.16b
mov v3.16b, v17.16b
mov v16.16b, v24.16b
mov v17.16b, v25.16b
mov v24.16b, v2.16b
mov v25.16b, v3.16b

mov v2.16b, v18.16b
mov v3.16b, v19.16b
mov v18.16b, v26.16b
mov v19.16b, v27.16b
mov v26.16b, v2.16b
mov v27.16b, v3.16b

mov v2.16b, v20.16b
mov v3.16b, v21.16b
mov v20.16b, v28.16b
mov v21.16b, v29.16b
mov v28.16b, v2.16b
mov v29.16b, v3.16b

mov v22.16b, v30.16b
mov v23.16b, v31.16b
b.gt 1b
ret
.endm
| |
// Width-16 vertical luma filter, instantiated with shift 2 (10-bit)
// and shift 4 (12-bit).
function ff_vvc_put_luma_v16_10_neon, export=1
put_luma_v16_xx_neon 2
endfunc

function ff_vvc_put_luma_v16_12_neon, export=1
put_luma_v16_xx_neon 4
endfunc
| |
| |
// Vertical 8-tap luma filter for widths that are a multiple of 16
// (w6 = width). Processes a 16-wide, 4-row tile per inner iteration,
// reloading the 8-row window per column strip (x11/x10 are the strip's
// src/dst cursors; w8 is the column offset in samples).
// Uses the same coefficient-rotation trick as put_luma_v16_xx_neon.
.macro put_luma_v_x16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
sub x1, x1, x2, lsl #1                  // back up 3 rows for taps 0..2
ld1 {v0.8b}, [x5]
sub x1, x1, x2
sxtl v0.8h, v0.8b
1:
mov w8, #0
2:
add x11, x1, x8, lsl #1                 // src cursor for this 16-col strip
add x10, x0, x8, lsl #1                 // dst cursor for this strip
ld1 {v16.8h-v17.8h}, [x11], x2          // load the 8-row window
add x8, x8, #16
ld1 {v18.8h-v19.8h}, [x11], x2
cmp w8, w6
ld1 {v20.8h-v21.8h}, [x11], x2
mov v1.16b, v0.16b                      // dy == 0: unrotated coefficients
ld1 {v22.8h-v23.8h}, [x11], x2
ld1 {v24.8h-v25.8h}, [x11], x2
ld1 {v26.8h-v27.8h}, [x11], x2
ld1 {v28.8h-v29.8h}, [x11], x2
ld1 {v30.8h-v31.8h}, [x11], x2

put_luma_v_x16_vector_filter \shift

ld1 {v16.8h-v17.8h}, [x11], x2          // new row overwrites oldest slot
ext v1.16b, v0.16b, v0.16b, #14         // rotate coefficients for dy == 1
st1 {v6.8h-v7.8h}, [x10], x9

put_luma_v_x16_vector_filter \shift

st1 {v6.8h-v7.8h}, [x10], x9
ext v1.16b, v0.16b, v0.16b, #12         // rotate coefficients for dy == 2
ld1 {v18.8h-v19.8h}, [x11], x2

put_luma_v_x16_vector_filter \shift

ld1 {v20.8h-v21.8h}, [x11], x2
ext v1.16b, v0.16b, v0.16b, #10         // rotate coefficients for dy == 3
st1 {v6.8h-v7.8h}, [x10], x9

put_luma_v_x16_vector_filter \shift

st1 {v6.8h-v7.8h}, [x10], x9
b.lt 2b                                 // next 16-col strip
add x0, x0, x9, lsl #2                  // advance 4 output rows
subs w3, w3, #4
add x1, x1, x2, lsl #2                  // advance 4 source rows
b.gt 1b
ret
.endm
| |
// Multiple-of-16-width vertical luma filter, instantiated with
// shift 2 (10-bit) and shift 4 (12-bit).
function ff_vvc_put_luma_v_x16_10_neon, export=1
put_luma_v_x16_xx_neon 2
endfunc

function ff_vvc_put_luma_v_x16_12_neon, export=1
put_luma_v_x16_xx_neon 4
endfunc