/*
* Copyright 2024 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/row_sve.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
#define RGBTOARGB8_SVE_2X \
/* Inputs: B: z16.h/z20.h, G: z17.h/z21.h, R: z18.h/z22.h, A: z19.b */ \
"uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \
"uqshrnb z17.b, z17.h, #6 \n" /* G0 */ \
"uqshrnb z18.b, z18.h, #6 \n" /* R0 */ \
"uqshrnt z16.b, z20.h, #6 \n" /* B1 */ \
"uqshrnt z17.b, z21.h, #6 \n" /* G1 */ \
"uqshrnt z18.b, z22.h, #6 \n" /* R1 */
__arm_locally_streaming void I444ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
uint64_t vl;
asm volatile(
"cntb %[vl] \n"
"ptrue p0.b \n" //
YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // A
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.b \n"
"1: \n" //
READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
"subs %w[width], %w[width], %w[vl] \n"
"st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
"incb %[dst_argb], all, mul #4 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.b, wzr, %w[width] \n" //
READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
"st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_SVE_REGS);
}
__arm_locally_streaming void I444ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
I444ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
}
__arm_locally_streaming void I400ToARGBRow_SME(
const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
I400ToARGBRow_SVE_SC(src_y, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I422ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
I422ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I422ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
}
__arm_locally_streaming void I422ToRGB565Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGB565Row_SVE_SC(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
}
__arm_locally_streaming void I422ToARGB1555Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
I422ToARGB1555Row_SVE_SC(src_y, src_u, src_v, dst_argb1555, yuvconstants,
width);
}
__arm_locally_streaming void I422ToARGB4444Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
I422ToARGB4444Row_SVE_SC(src_y, src_u, src_v, dst_argb4444, yuvconstants,
width);
}
__arm_locally_streaming void I422ToRGBARow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
}
__arm_locally_streaming void I422ToAR30Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I422ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void I422AlphaToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I422AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void I444AlphaToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I444AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void NV12ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
NV12ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void NV21ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
NV21ToARGBRow_SVE_SC(src_y, src_vu, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void NV12ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
NV12ToRGB24Row_SVE_SC(src_y, src_uv, dst_rgb24, yuvconstants, width);
}
__arm_locally_streaming void NV21ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
__arm_locally_streaming void YUY2ToARGBRow_SME(
const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
YUY2ToARGBRow_SVE_SC(src_yuy2, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void UYVYToARGBRow_SME(
const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
UYVYToARGBRow_SVE_SC(src_uyvy, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I210ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I210ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I210AlphaToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I210AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void I210ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I210ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void P210ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
P210ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void P210ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
P210ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void I410ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I410ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I410AlphaToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I410AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void I410ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I410ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void P410ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
P410ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void P410ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
P410ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void I212ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I212ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void I212ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void MultiplyRow_16_SME(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cnth %x[vl] \n"
"mov z0.h, %w[scale] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"incb %[src_y] \n"
"mul z1.h, z0.h, z1.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1h {z1.h}, p0, [%[dst_y]] \n"
"incb %[dst_y] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"mul z1.h, z0.h, z1.h \n"
"st1h {z1.h}, p0, [%[dst_y]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[dst_y] "+r"(dst_y), // %[dst_y]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [scale] "r"(scale) // %[scale]
: "memory", "cc", "z0", "z1", "p0");
}
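// Rough scalar sketch (illustrative only, not part of the libyuv API; the
// name is hypothetical) of the per-element operation in MultiplyRow_16_SME
// above: a plain 16-bit multiply, keeping the low 16 bits as the MUL
// instruction does.
static inline void MultiplyRow_16_ScalarSketch(const uint16_t* src_y,
                                               uint16_t* dst_y,
                                               int scale,
                                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_y[i] = (uint16_t)(src_y[i] * scale);
  }
}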
__arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
// Streaming-SVE only, no use of ZA tile.
width *= 4;
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z0.b}, p0/z, [%[src_argb]] \n"
"ld1b {z1.b}, p0/z, [%[src_argb1]] \n"
"incb %[src_argb] \n"
"incb %[src_argb1] \n"
"umullb z2.h, z0.b, z1.b \n"
"umullt z1.h, z0.b, z1.b \n"
"rshrnb z0.b, z2.h, #8 \n"
"rshrnt z0.b, z1.h, #8 \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z0.b}, p0, [%[dst_argb]] \n"
"incb %[dst_argb] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z0.b}, p0/z, [%[src_argb]] \n"
"ld1b {z1.b}, p0/z, [%[src_argb1]] \n"
"umullb z2.h, z0.b, z1.b \n"
"umullt z1.h, z0.b, z1.b \n"
"rshrnb z0.b, z2.h, #8 \n"
"rshrnt z0.b, z1.h, #8 \n"
"st1b {z0.b}, p0, [%[dst_argb]] \n"
"99: \n"
: [src_argb] "+r"(src_argb), // %[src_argb]
[src_argb1] "+r"(src_argb1), // %[src_argb1]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "z1", "z2", "p0", "p1");
}
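// Rough scalar sketch (illustrative only; hypothetical name) of the per-byte
// operation in ARGBMultiplyRow_SME above. It mirrors the UMULLB/UMULLT plus
// RSHRNB/RSHRNT pair: a widening multiply followed by a rounding shift right
// by 8, applied to all four channels including alpha.
static inline void ARGBMultiplyRow_ScalarSketch(const uint8_t* src_argb,
                                                const uint8_t* src_argb1,
                                                uint8_t* dst_argb,
                                                int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst_argb[i] = (uint8_t)((src_argb[i] * src_argb1[i] + 128) >> 8);
  }
}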
__arm_locally_streaming void MergeUVRow_SME(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "z1", "z2", "p0");
}
__arm_locally_streaming void MergeUVRow_16_SME(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cnth %x[vl] \n"
"mov z0.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "memory", "cc", "z0", "z1", "z2", "p0");
}
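// Rough scalar sketch (illustrative only; hypothetical name) of
// MergeUVRow_16_SME above: interleave U and V and left-shift each sample by
// (16 - depth) so that depth-bit data becomes msb-aligned in 16 bits.
static inline void MergeUVRow_16_ScalarSketch(const uint16_t* src_u,
                                              const uint16_t* src_v,
                                              uint16_t* dst_uv,
                                              int depth,
                                              int width) {
  const int shift = 16 - depth;
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = (uint16_t)(src_u[i] << shift);
    dst_uv[2 * i + 1] = (uint16_t)(src_v[i] << shift);
  }
}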
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 32768 = 9 bits = shr 1
// 16384 = 10 bits = shr 2
// 4096 = 12 bits = shr 4
// 256 = 16 bits = shr 8
__arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
// The left shift is (15 - clz(scale)) + 8: the "+ 8" places the result in the
// high half of the lane so the shift saturates there, letting us narrow with
// a single UZP2 rather than a pair of saturating narrow instructions.
const int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
"cntb %x[vl] \n"
"dup z0.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"ld1h {z2.h}, p0/z, [%[src_y], #1, mul vl] \n"
"incb %[src_y], all, mul #2 \n"
"uqshl z1.h, p0/m, z1.h, z0.h \n"
"uqshl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"uzp2 z1.b, z1.b, z2.b \n"
"st1b {z1.b}, p0, [%[dst_y]] \n"
"incb %[dst_y] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
// We need separate predicates for the load and store instructions since
// they are operating on different element sizes (.b vs .h).
"cnth %x[vl] \n"
"whilelt p0.h, wzr, %w[width] \n"
"whilelt p1.h, %w[vl], %w[width] \n"
"whilelt p2.b, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"ld1h {z2.h}, p1/z, [%[src_y], #1, mul vl] \n"
"uqshl z1.h, p0/m, z1.h, z0.h \n"
"uqshl z2.h, p1/m, z2.h, z0.h \n"
"uzp2 z1.b, z1.b, z2.b \n"
"st1b {z1.b}, p2, [%[dst_y]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[dst_y] "+r"(dst_y), // %[dst_y]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "p0", "p1", "p2");
}
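// Rough scalar sketch (illustrative only; hypothetical name) of
// Convert16To8Row_SME above: a saturating left shift by (23 - clz(scale))
// puts the significant bits in the high byte of the 16-bit lane, and taking
// that high byte narrows the result (the UQSHL + UZP2 pair).
static inline void Convert16To8Row_ScalarSketch(const uint16_t* src_y,
                                                uint8_t* dst_y,
                                                int scale,
                                                int width) {
  const int shift = 23 - __builtin_clz((int32_t)scale);
  for (int i = 0; i < width; ++i) {
    uint32_t v = (uint32_t)src_y[i] << shift;
    if (v > 0xffff) v = 0xffff;  // UQSHL saturates within the 16-bit lane.
    dst_y[i] = (uint8_t)(v >> 8);
  }
}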
__arm_locally_streaming void CopyRow_SME(const uint8_t* src,
uint8_t* dst,
int width) {
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z0.b}, p0/z, [%[src]] \n"
"incb %[src] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z0.b}, p0, [%[dst]] \n"
"incb %[dst] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z0.b}, p0/z, [%[src]] \n"
"st1b {z0.b}, p0, [%[dst]] \n"
"99: \n"
: [src] "+r"(src), // %[src]
[dst] "+r"(dst), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "p0");
}
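// Plain-C sketch (illustrative only; hypothetical name and vl parameter) of
// the loop structure shared by the kernels in this file: process whole
// vectors of vl elements under an all-true predicate, then finish with a
// single predicated tail iteration covering the remaining elements.
static inline void CopyRow_BulkTailSketch(const uint8_t* src,
                                          uint8_t* dst,
                                          int width,
                                          int vl) {
  int i = 0;
  // Bulk: full vectors, every lane active (the "ptrue" path above).
  for (; i + vl <= width; i += vl) {
    for (int j = 0; j < vl; ++j) {
      dst[i + j] = src[i + j];
    }
  }
  // Tail: one partial vector; lanes past the end of the row are masked off
  // (the "whilelt" path above).
  for (int j = 0; i + j < width; ++j) {
    dst[i + j] = src[i + j];
  }
}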
__arm_locally_streaming static void HalfRow_SME(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"urhadd z2.b, p0/m, z2.b, z3.b \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z2.b}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"urhadd z2.b, p0/m, z2.b, z3.b \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z2.b}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "cc", "memory", "z0", "z1", "z2", "z3", "p0");
}
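// Rough scalar sketch (illustrative only; hypothetical name) of HalfRow_SME
// above: URHADD computes a rounding average of the two source rows.
static inline void HalfRow_ScalarSketch(uint8_t* dst_ptr,
                                        const uint8_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        int width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  for (int i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8_t)((src_ptr[i] + src_ptr1[i] + 1) >> 1);
  }
}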
__arm_locally_streaming void InterpolateRow_SME(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
if (y0_fraction == 0) {
CopyRow_SME(src_ptr1, dst_ptr, width);
return;
}
if (y0_fraction == 128) {
HalfRow_SME(dst_ptr, src_ptr, src_stride, width);
return;
}
if (y0_fraction == 256) {
CopyRow_SME(src_ptr, dst_ptr, width);
return;
}
int vl;
asm volatile(
"cntb %x[vl] \n"
"dup z0.b, %w[y0_fraction] \n"
"dup z1.b, %w[y1_fraction] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"umullb z4.h, z2.b, z0.b \n"
"umullt z2.h, z2.b, z0.b \n"
"subs %w[width], %w[width], %w[vl] \n"
"umlalb z4.h, z3.b, z1.b \n"
"umlalt z2.h, z3.b, z1.b \n"
"rshrnb z3.b, z4.h, #8 \n"
"rshrnt z3.b, z2.h, #8 \n"
"st1b {z3.b}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"umullb z4.h, z2.b, z0.b \n"
"umullt z2.h, z2.b, z0.b \n"
"umlalb z4.h, z3.b, z1.b \n"
"umlalt z2.h, z3.b, z1.b \n"
"rshrnb z3.b, z4.h, #8 \n"
"rshrnt z3.b, z2.h, #8 \n"
"st1b {z3.b}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [y0_fraction] "r"(y0_fraction), // %[y0_fraction]
[y1_fraction] "r"(y1_fraction) // %[y1_fraction]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");
}
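// Rough scalar sketch (illustrative only; hypothetical name) of the blend in
// InterpolateRow_SME above: y0_fraction + y1_fraction == 256, so each output
// byte is a fixed-point weighted average with rounding (the UMULLB/UMLALB and
// RSHRNB #8 sequence).
static inline void InterpolateRow_ScalarSketch(uint8_t* dst_ptr,
                                               const uint8_t* src_ptr,
                                               ptrdiff_t src_stride,
                                               int width,
                                               int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  for (int i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8_t)(
        (src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction + 128) >> 8);
  }
}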
__arm_locally_streaming static void HalfRow_16_SME(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
int vl;
asm volatile(
"cnth %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1h {z2.h}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"st1h {z2.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "cc", "memory", "z0", "z1", "z2", "z3", "p0");
}
__arm_locally_streaming void InterpolateRow_16_SME(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
if (y0_fraction == 0) {
CopyRow_SME((const uint8_t*)src_ptr1, (uint8_t*)dst_ptr,
width * sizeof(uint16_t));
return;
}
if (y0_fraction == 128) {
HalfRow_16_SME(dst_ptr, src_ptr, src_stride, width);
return;
}
if (y0_fraction == 256) {
CopyRow_SME((const uint8_t*)src_ptr, (uint8_t*)dst_ptr,
width * sizeof(uint16_t));
return;
}
int vl;
asm volatile(
"cnth %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"dup z0.h, %w[y0_fraction] \n"
"dup z1.h, %w[y1_fraction] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"st1h {z3.h}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"st1h {z3.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [y0_fraction] "r"(y0_fraction), // %[y0_fraction]
[y1_fraction] "r"(y1_fraction) // %[y1_fraction]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");
}
__arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
// The left shift is (15 - clz(scale)) + 8: the "+ 8" places the result in the
// high half of the lane so the shift saturates there, letting us narrow by
// simply taking the high byte with SHRN #8 rather than a pair of saturating
// narrow instructions.
const int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
"cnth %x[vl] \n"
"dup z31.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"uqshl z2.h, p0/m, z2.h, z31.h \n"
"shrnb z2.b, z2.h, #8 \n"
"st1b {z2.h}, p0, [%[dst_ptr]] \n"
"inch %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"uqshl z2.h, p0/m, z2.h, z31.h \n"
"shrnb z2.b, z2.h, #8 \n"
"st1b {z2.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "z3", "z31", "p0");
}
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
__arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction) {
const int y1_fraction = source_y_fraction;
const int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
// This function is never called with y0_fraction == 0.
if (y0_fraction == 128) {
HalfRow_16To8_SME(dst_ptr, src_ptr, src_stride, scale, width);
return;
}
if (y0_fraction == 256) {
Convert16To8Row_SME(src_ptr, dst_ptr, scale, width);
return;
}
// The left shift is (15 - clz(scale)) + 8: the "+ 8" places the result in the
// high half of the lane so the shift saturates there, letting us narrow by
// simply taking the high byte with SHRN #8 rather than a pair of saturating
// narrow instructions.
const int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
"cnth %x[vl] \n"
"dup z31.h, %w[shift] \n"
"dup z0.h, %w[y0_fraction] \n"
"dup z1.h, %w[y1_fraction] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"uqshl z3.h, p0/m, z3.h, z31.h \n"
"shrnb z3.b, z3.h, #8 \n"
"st1b {z3.h}, p0, [%[dst_ptr]] \n"
"inch %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"uqshl z3.h, p0/m, z3.h, z31.h \n"
"shrnb z3.b, z3.h, #8 \n"
"st1b {z3.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [y0_fraction] "r"(y0_fraction), // %[y0_fraction]
[y1_fraction] "r"(y1_fraction), // %[y1_fraction]
[shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0");
}
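// Rough scalar sketch (illustrative only; hypothetical name) of
// InterpolateRow_16To8_SME above: blend the two 16-bit rows with rounding,
// then apply the same saturating shift and high-byte narrow as
// Convert16To8Row_SME.
static inline void InterpolateRow_16To8_ScalarSketch(uint8_t* dst_ptr,
                                                     const uint16_t* src_ptr,
                                                     ptrdiff_t src_stride,
                                                     int scale,
                                                     int width,
                                                     int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const int shift = 23 - __builtin_clz((int32_t)scale);
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  for (int i = 0; i < width; ++i) {
    uint32_t blended = ((uint32_t)src_ptr[i] * y0_fraction +
                        (uint32_t)src_ptr1[i] * y1_fraction + 128) >> 8;
    uint32_t v = blended << shift;
    if (v > 0xffff) v = 0xffff;  // UQSHL saturates within the 16-bit lane.
    dst_ptr[i] = (uint8_t)(v >> 8);
  }
}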
__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
#define CONVERT8TO16_SVE \
"ld1b {z0.h}, p0/z, [%[src]] \n" \
"ld1b {z1.h}, p1/z, [%[src], #1, mul vl] \n" \
"incb %[src] \n" \
"subs %w[width], %w[width], %w[vl], lsl #1 \n" \
"trn1 z0.b, z0.b, z0.b \n" \
"trn1 z1.b, z1.b, z1.b \n" \
"lsr z0.h, p0/m, z0.h, z2.h \n" \
"lsr z1.h, p1/m, z1.h, z2.h \n" \
"prfm pldl1keep, [%[src], 448] \n" \
"st1h {z0.h}, p0, [%[dst]] \n" \
"st1h {z1.h}, p1, [%[dst], #1, mul vl] \n" \
"incb %[dst], all, mul #2 \n"
__arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// (src * 0x0101 * scale) >> 16.
// Since scale is a power of two, compute an equivalent right shift so that we
// avoid having to widen to int32.
const int shift = __builtin_clz(scale) - 15;
uint64_t vl;
asm volatile(
"dup z2.h, %w[shift] \n"
"cnth %[vl] \n"
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
"b.lt 2f \n"
// Run bulk of computation with all-true predicates to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"ptrue p1.h \n"
"1: \n" //
CONVERT8TO16_SVE
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl], lsl #1 \n"
"b.eq 99f \n"
// Calculate predicates for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"whilelt p1.h, %w[vl], %w[width] \n" //
CONVERT8TO16_SVE
"99: \n"
: [src] "+r"(src_y), // %[src]
[dst] "+r"(dst_y), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
}
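// Rough scalar sketch (illustrative only; hypothetical name) of
// Convert8To16Row_SME above: replicating the byte into both halves of the
// lane (the TRN1) multiplies by 0x0101, and the right shift by
// (clz(scale) - 15) is equivalent to the "* scale >> 16" in the comment.
static inline void Convert8To16Row_ScalarSketch(const uint8_t* src_y,
                                                uint16_t* dst_y,
                                                int scale,
                                                int width) {
  const int shift = __builtin_clz(scale) - 15;
  for (int i = 0; i < width; ++i) {
    dst_y[i] = (uint16_t)((src_y[i] * 0x0101) >> shift);
  }
}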
__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
}
__arm_locally_streaming void ARGBToUVJRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients);
}
__arm_locally_streaming void ABGRToUVJRow_SME(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_uj,
uint8_t* dst_vj,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
kABGRToUVJCoefficients);
}
__arm_locally_streaming void BGRAToUVRow_SME(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
}
__arm_locally_streaming void ABGRToUVRow_SME(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
}
__arm_locally_streaming void RGBAToUVRow_SME(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif