[AArch64] Add SVE2 implementations for ARGBToUVRow and similar

By keeping the data in its interleaved format we can use a common kernel
for all input channel orderings and simply pass a different vector of
constants for each ordering (sketched below).
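
As an illustration, the per-format wrappers added later in this patch all
reduce to one shared matrix kernel that differs only in the coefficient
table it receives; a condensed sketch of that dispatch (signatures taken
from the patch, bodies abbreviated):

    // One SVE2 kernel handles every channel ordering; only the packed
    // vector of dot-product coefficients differs per source format.
    void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                                int src_stride_argb,
                                uint8_t* dst_u, uint8_t* dst_v, int width,
                                const int16_t* uvconstants);

    void ARGBToUVRow_SVE2(const uint8_t* src_argb, int src_stride_argb,
                          uint8_t* dst_u, uint8_t* dst_v, int width) {
      ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v,
                             width, kArgbToUvArr);  // ARGB coefficients
    }

    void ABGRToUVRow_SVE2(const uint8_t* src_abgr, int src_stride_abgr,
                          uint8_t* dst_u, uint8_t* dst_v, int width) {
      ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v,
                             width, kAbgrToUvArr);  // same kernel, ABGR
    }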

A similar approach is possible with Neon alone by using multiplies and
repeated applications of ADDP to combine channels; however, this is
slower on older cores such as the Cortex-A53, so it is not pursued
further.

For odd widths the final element needs slightly different handling, so
introduce an "any" kernel (sketched below) to cover that case rather
than bloating the code for the common case.
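
A rough sketch of how the odd-width wrapper behaves, assuming the usual
libyuv approach of running the vector kernel on the even portion and
finishing the last pixel via a small temporary buffer (the real code uses
the ANY12S macro with MASK=1; the helper below and its buffer layout are
illustrative only):

    #include <stdint.h>
    #include <string.h>  // memcpy

    // Illustrative only; not the ANY12S macro expansion used by the patch.
    static void ARGBToUVRow_Odd_Sketch(const uint8_t* src, int src_stride,
                                       uint8_t* dst_u, uint8_t* dst_v,
                                       int width) {
      int awidth = width & ~1;  // even part: handled by the SVE2 kernel
      if (awidth > 0) {
        ARGBToUVRow_SVE2(src, src_stride, dst_u, dst_v, awidth);
      }
      if (width & 1) {
        // Duplicate the final ARGB pixel from both rows into a small
        // temporary, run the kernel on a width of 2, and keep one U/V pair.
        uint8_t tmp_in[16] = {0};  // 2 rows x 2 pixels x 4 bytes, stride 8
        uint8_t tmp_u[64];  // oversized scratch: the predicated store may
        uint8_t tmp_v[64];  // write more than the single byte we consume
        memcpy(tmp_in + 0, src + awidth * 4, 4);
        memcpy(tmp_in + 4, src + awidth * 4, 4);
        memcpy(tmp_in + 8, src + src_stride + awidth * 4, 4);
        memcpy(tmp_in + 12, src + src_stride + awidth * 4, 4);
        ARGBToUVRow_SVE2(tmp_in, 8, tmp_u, tmp_v, 2);
        dst_u[awidth / 2] = tmp_u[0];
        dst_v[awidth / 2] = tmp_v[0];
      }
    }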

Observed effect on runtimes compared to the existing Neon kernels:

             | Cortex-A510 | Cortex-A720 | Cortex-X2
ABGRToUVJRow |      -15.5% |       +5.4% |    -33.1%
 ABGRToUVRow |      -15.6% |       +5.3% |    -35.9%
ARGBToUVJRow |      -10.1% |       +5.4% |    -32.7%
 ARGBToUVRow |      -10.1% |       +5.4% |    -29.3%
 BGRAToUVRow |      -15.5% |       +4.6% |    -32.8%
 RGBAToUVRow |      -10.1% |       +4.2% |    -36.0%

Bug: libyuv:973
Change-Id: I041ca44db0ae8a2adffcdf24e822eebe962baf33
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5505537
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 93feb0c..43ffe24 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -564,10 +564,16 @@
 
 // The following are available on AArch64 SVE platforms:
 #if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
-#define HAS_I444TOARGBROW_SVE2
+#define HAS_ABGRTOUVJROW_SVE2
+#define HAS_ABGRTOUVROW_SVE2
+#define HAS_ARGBTOUVJROW_SVE2
+#define HAS_ARGBTOUVROW_SVE2
+#define HAS_BGRATOUVROW_SVE2
+#define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
-#define HAS_I422ALPHATOARGBROW_SVE2
+#define HAS_I444TOARGBROW_SVE2
+#define HAS_RGBATOUVROW_SVE2
 #endif
 
 // The following are available on AArch64 platforms:
@@ -1489,6 +1495,11 @@
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ARGBToUVRow_SVE2(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ARGBToUV444Row_MSA(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@@ -1521,26 +1532,51 @@
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);
+void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                        int src_stride_abgr,
                        uint8_t* dst_uj,
                        uint8_t* dst_vj,
                        int width);
+void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_uj,
+                       uint8_t* dst_vj,
+                       int width);
 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                       int src_stride_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                        int src_stride_rgb24,
                        uint8_t* dst_u,
@@ -1966,6 +2002,11 @@
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -1998,26 +2039,51 @@
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+                           int src_stride,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
                            int src_stride,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+                           int src_stride,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
                            int src_stride,
                            uint8_t* dst_u,
diff --git a/source/convert.cc b/source/convert.cc
index e852a90..fdd0cb6 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1920,6 +1920,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2090,6 +2098,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2261,6 +2277,14 @@
     }
   }
 #endif
+#if defined(HAS_BGRATOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    BGRAToUVRow = BGRAToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      BGRAToUVRow = BGRAToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_BGRATOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     BGRAToYRow = BGRAToYRow_Any_SSSE3;
@@ -2428,6 +2452,14 @@
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -2545,6 +2577,14 @@
     }
   }
 #endif
+#if defined(HAS_RGBATOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RGBAToUVRow = RGBAToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      RGBAToUVRow = RGBAToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RGBAToYRow = RGBAToYRow_Any_MSA;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 6c361c0..c684ac0 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -255,6 +255,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -363,6 +371,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -593,6 +609,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -800,6 +824,14 @@
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -996,6 +1028,14 @@
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -1197,6 +1237,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -1401,6 +1449,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -2336,6 +2392,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVJRow = ARGBToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYJROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@@ -2512,6 +2576,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVJRow = ARGBToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -2818,6 +2890,14 @@
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYJRow = ABGRToYJRow_Any_MSA;
@@ -2956,6 +3036,14 @@
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYJRow = ABGRToYJRow_Any_MSA;
diff --git a/source/row_any.cc b/source/row_any.cc
index 8ed5a49..a466e34 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2225,6 +2225,9 @@
 #ifdef HAS_ARGBTOUVROW_NEON
 ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVROW_SVE2
+ANY12S(ARGBToUVRow_Any_SVE2, ARGBToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ARGBTOUVROW_MSA
 ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
 #endif
@@ -2237,9 +2240,15 @@
 #ifdef HAS_ARGBTOUVJROW_NEON
 ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVJROW_SVE2
+ANY12S(ARGBToUVJRow_Any_SVE2, ARGBToUVJRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ABGRTOUVJROW_NEON
 ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVJROW_SVE2
+ANY12S(ABGRToUVJRow_Any_SVE2, ABGRToUVJRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ARGBTOUVJROW_MSA
 ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
 #endif
@@ -2252,6 +2261,9 @@
 #ifdef HAS_BGRATOUVROW_NEON
 ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_SVE2
+ANY12S(BGRAToUVRow_Any_SVE2, BGRAToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_BGRATOUVROW_MSA
 ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
 #endif
@@ -2261,6 +2273,9 @@
 #ifdef HAS_ABGRTOUVROW_NEON
 ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVROW_SVE2
+ANY12S(ABGRToUVRow_Any_SVE2, ABGRToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ABGRTOUVROW_MSA
 ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
 #endif
@@ -2270,6 +2285,9 @@
 #ifdef HAS_RGBATOUVROW_NEON
 ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_RGBATOUVROW_SVE2
+ANY12S(RGBAToUVRow_Any_SVE2, RGBAToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_RGBATOUVROW_MSA
 ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
 #endif
diff --git a/source/row_sve.cc b/source/row_sve.cc
index a7048b6..ff65af0 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -262,6 +262,243 @@
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+// Dot-product constants are stored as four-tuples with the two innermost
+// elements flipped to account for the interleaving nature of the widening
+// addition instructions.
+
+static const int16_t kArgbToUvArr[] = {
+    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
+    56, -19, -37, 0, -9, 56, -47, 0,
+};
+
+static const int16_t kRgbaToUvArr[] = {
+    // 0, -UG, UB, -UR, 0, -VG, -VB, VR
+    0, -37, 56, -19, 0, -47, -9, 56,
+};
+
+static const int16_t kBgraToUvArr[] = {
+    // 0, -UG, -UR, UB, 0, -VG, VR, -VB
+    0, -37, -19, 56, 0, -47, 56, -9,
+};
+
+static const int16_t kAbgrToUvArr[] = {
+    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
+    -19, 56, -37, 0, 56, -9, -47, 0,
+};
+
+static const int16_t kArgbToUvjArr[] = {
+    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
+    63, -21, -42, 0, -10, 63, -53, 0,
+};
+
+static const int16_t kAbgrToUvjArr[] = {
+    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
+    -21, 63, -42, 0, 63, -10, -53, 0,
+};
+
+void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const int16_t* uvconstants) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+  uint64_t vl;
+  asm volatile(
+      "ptrue    p0.b                                \n"
+      "ld1rd    {z24.d}, p0/z, [%[uvconstants]]     \n"
+      "ld1rd    {z25.d}, p0/z, [%[uvconstants], #8] \n"
+      "mov      z26.b, #0x80                        \n"
+
+      "cntb     %[vl]                               \n"
+      "subs     %w[width], %w[width], %w[vl]        \n"
+      "b.lt     2f                                  \n"
+
+      // Process 4x vectors from each input row per iteration.
+      // Cannot use predication here due to unrolling.
+      "1:                                           \n"  // e.g.
+      "ld1b     {z0.b}, p0/z, [%[src0], #0, mul vl] \n"  // bgrabgra
+      "ld1b     {z4.b}, p0/z, [%[src1], #0, mul vl] \n"  // bgrabgra
+      "ld1b     {z1.b}, p0/z, [%[src0], #1, mul vl] \n"  // bgrabgra
+      "ld1b     {z5.b}, p0/z, [%[src1], #1, mul vl] \n"  // bgrabgra
+      "ld1b     {z2.b}, p0/z, [%[src0], #2, mul vl] \n"  // bgrabgra
+      "ld1b     {z6.b}, p0/z, [%[src1], #2, mul vl] \n"  // bgrabgra
+      "ld1b     {z3.b}, p0/z, [%[src0], #3, mul vl] \n"  // bgrabgra
+      "ld1b     {z7.b}, p0/z, [%[src1], #3, mul vl] \n"  // bgrabgra
+      "incb     %[src0], all, mul #4                \n"
+      "incb     %[src1], all, mul #4                \n"
+
+      "uaddlb   z16.h, z0.b, z4.b                   \n"  // brbrbrbr
+      "uaddlt   z17.h, z0.b, z4.b                   \n"  // gagagaga
+      "uaddlb   z18.h, z1.b, z5.b                   \n"  // brbrbrbr
+      "uaddlt   z19.h, z1.b, z5.b                   \n"  // gagagaga
+      "uaddlb   z20.h, z2.b, z6.b                   \n"  // brbrbrbr
+      "uaddlt   z21.h, z2.b, z6.b                   \n"  // gagagaga
+      "uaddlb   z22.h, z3.b, z7.b                   \n"  // brbrbrbr
+      "uaddlt   z23.h, z3.b, z7.b                   \n"  // gagagaga
+
+      "trn1     z0.s, z16.s, z17.s                  \n"  // brgabgra
+      "trn2     z1.s, z16.s, z17.s                  \n"  // brgabgra
+      "trn1     z2.s, z18.s, z19.s                  \n"  // brgabgra
+      "trn2     z3.s, z18.s, z19.s                  \n"  // brgabgra
+      "trn1     z4.s, z20.s, z21.s                  \n"  // brgabgra
+      "trn2     z5.s, z20.s, z21.s                  \n"  // brgabgra
+      "trn1     z6.s, z22.s, z23.s                  \n"  // brgabgra
+      "trn2     z7.s, z22.s, z23.s                  \n"  // brgabgra
+
+      "subs     %w[width], %w[width], %w[vl]        \n"  // 4*VL per loop
+
+      "urhadd   z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
+      "urhadd   z2.h, p0/m, z2.h, z3.h              \n"  // brgabrga
+      "urhadd   z4.h, p0/m, z4.h, z5.h              \n"  // brgabrga
+      "urhadd   z6.h, p0/m, z6.h, z7.h              \n"  // brgabrga
+
+      "movi     v16.8h, #0                          \n"
+      "movi     v17.8h, #0                          \n"
+      "movi     v18.8h, #0                          \n"
+      "movi     v19.8h, #0                          \n"
+
+      "movi     v20.8h, #0                          \n"
+      "movi     v21.8h, #0                          \n"
+      "movi     v22.8h, #0                          \n"
+      "movi     v23.8h, #0                          \n"
+
+      "sdot     z16.d, z0.h, z24.h                  \n"  // UUxxxxxx
+      "sdot     z17.d, z2.h, z24.h                  \n"  // UUxxxxxx
+      "sdot     z18.d, z4.h, z24.h                  \n"  // UUxxxxxx
+      "sdot     z19.d, z6.h, z24.h                  \n"  // UUxxxxxx
+
+      "sdot     z20.d, z0.h, z25.h                  \n"  // VVxxxxxx
+      "sdot     z21.d, z2.h, z25.h                  \n"  // VVxxxxxx
+      "sdot     z22.d, z4.h, z25.h                  \n"  // VVxxxxxx
+      "sdot     z23.d, z6.h, z25.h                  \n"  // VVxxxxxx
+
+      "uzp1     z16.s, z16.s, z17.s                 \n"  // UUxx
+      "uzp1     z18.s, z18.s, z19.s                 \n"  // UUxx
+      "uzp1     z20.s, z20.s, z21.s                 \n"  // VVxx
+      "uzp1     z22.s, z22.s, z23.s                 \n"  // VVxx
+
+      "uzp1     z16.h, z16.h, z18.h                 \n"  // UU
+      "uzp1     z20.h, z20.h, z22.h                 \n"  // VV
+
+      "addhnb   z16.b, z16.h, z26.h                 \n"  // U
+      "addhnb   z20.b, z20.h, z26.h                 \n"  // V
+
+      "st1b     {z16.h}, p0, [%[dst_u]]             \n"  // U
+      "st1b     {z20.h}, p0, [%[dst_v]]             \n"  // V
+      "inch     %[dst_u]                            \n"
+      "inch     %[dst_v]                            \n"
+
+      "b.ge     1b                                  \n"
+
+      "2:                                           \n"
+      "adds     %w[width], %w[width], %w[vl]        \n"  // VL per loop
+      "b.le     99f                                 \n"
+
+      // Process remaining pixels from each input row.
+      // Use predication to do one vector from each input array, so may loop up
+      // to three iterations.
+      "cntw     %x[vl]                              \n"
+
+      "3:                                           \n"
+      "whilelt  p1.s, wzr, %w[width]                \n"
+      "ld1d     {z0.d}, p1/z, [%[src0]]             \n"  // bgrabgra
+      "ld1d     {z4.d}, p1/z, [%[src1]]             \n"  // bgrabgra
+      "incb     %[src0]                             \n"
+      "incb     %[src1]                             \n"
+
+      "uaddlb   z16.h, z0.b, z4.b                   \n"  // brbrbrbr
+      "uaddlt   z17.h, z0.b, z4.b                   \n"  // gagagaga
+
+      "trn1     z0.s, z16.s, z17.s                  \n"  // brgabgra
+      "trn2     z1.s, z16.s, z17.s                  \n"  // brgabgra
+
+      "urhadd   z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
+
+      "subs     %w[width], %w[width], %w[vl]        \n"  // VL per loop
+
+      "movi     v16.8h, #0                          \n"
+      "movi     v20.8h, #0                          \n"
+
+      "sdot     z16.d, z0.h, z24.h                  \n"
+      "sdot     z20.d, z0.h, z25.h                  \n"
+
+      "addhnb   z16.b, z16.h, z26.h                 \n"  // U
+      "addhnb   z20.b, z20.h, z26.h                 \n"  // V
+
+      "st1b     {z16.d}, p0, [%[dst_u]]             \n"  // U
+      "st1b     {z20.d}, p0, [%[dst_v]]             \n"  // V
+      "incd     %[dst_u]                            \n"
+      "incd     %[dst_v]                            \n"
+      "b.gt     3b                                  \n"
+
+      "99:                                          \n"
+      : [src0] "+r"(src_argb),    // %[src0]
+        [src1] "+r"(src_argb_1),  // %[src1]
+        [dst_u] "+r"(dst_u),      // %[dst_u]
+        [dst_v] "+r"(dst_v),      // %[dst_v]
+        [width] "+r"(width),      // %[width]
+        [vl] "=&r"(vl)            // %[vl]
+      : [uvconstants] "r"(uvconstants)
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
+        "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
+        "p0");
+}
+
+void ARGBToUVRow_SVE2(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
+                         kArgbToUvArr);
+}
+
+void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
+                         kArgbToUvjArr);
+}
+
+void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_uj,
+                       uint8_t* dst_vj,
+                       int width) {
+  ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
+                         kAbgrToUvjArr);
+}
+
+void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+                         kBgraToUvArr);
+}
+
+void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                         kAbgrToUvArr);
+}
+
+void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+                         kRgbaToUvArr);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus