[AArch64] Add SVE2 implementations for ARGBToUVRow and similar
By maintaining the interleaved format of the data we can use a common
kernel for all input channel orderings and simply pass a different
vector of constants instead.
A similar approach is possible with only Neon by making use of
multiplies and repeated application of ADDP to combine channels, however
this is slower on older cores like Cortex-A53 so is not pursued further.
For odd problem sizes we need a slightly different implementation for
the final element, so introduce an "any" kernel to address that rather
than bloating the code for the common case.
Observed effect on runtimes compared to the existing Neon kernels:
             | Cortex-A510 | Cortex-A720 | Cortex-X2
ABGRToUVJRow | -15.5% | +5.4% | -33.1%
ABGRToUVRow | -15.6% | +5.3% | -35.9%
ARGBToUVJRow | -10.1% | +5.4% | -32.7%
ARGBToUVRow | -10.1% | +5.4% | -29.3%
BGRAToUVRow | -15.5% | +4.6% | -32.8%
RGBAToUVRow | -10.1% | +4.2% | -36.0%
Bug: libyuv:973
Change-Id: I041ca44db0ae8a2adffcdf24e822eebe962baf33
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5505537
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 93feb0c..43ffe24 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -564,10 +564,16 @@
// The following are available on AArch64 SVE platforms:
#if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
-#define HAS_I444TOARGBROW_SVE2
+#define HAS_ABGRTOUVJROW_SVE2
+#define HAS_ABGRTOUVROW_SVE2
+#define HAS_ARGBTOUVJROW_SVE2
+#define HAS_ARGBTOUVROW_SVE2
+#define HAS_BGRATOUVROW_SVE2
+#define HAS_I422ALPHATOARGBROW_SVE2
#define HAS_I422TOARGBROW_SVE2
#define HAS_I444ALPHATOARGBROW_SVE2
-#define HAS_I422ALPHATOARGBROW_SVE2
+#define HAS_I444TOARGBROW_SVE2
+#define HAS_RGBATOUVROW_SVE2
#endif
// The following are available on AArch64 platforms:
@@ -1489,6 +1495,11 @@
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_SVE2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1521,26 +1532,51 @@
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_uj,
uint8_t* dst_vj,
int width);
+void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width);
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_u,
@@ -1966,6 +2002,11 @@
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1998,26 +2039,51 @@
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
diff --git a/source/convert.cc b/source/convert.cc
index e852a90..fdd0cb6 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1920,6 +1920,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVRow = ARGBToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2090,6 +2098,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVRow = ARGBToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2261,6 +2277,14 @@
}
}
#endif
+#if defined(HAS_BGRATOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ BGRAToUVRow = BGRAToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ BGRAToUVRow = BGRAToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_BGRATOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
BGRAToYRow = BGRAToYRow_Any_SSSE3;
@@ -2428,6 +2452,14 @@
}
}
#endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ABGRToUVRow = ABGRToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -2545,6 +2577,14 @@
}
}
#endif
+#if defined(HAS_RGBATOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ RGBAToUVRow = RGBAToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ RGBAToUVRow = RGBAToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGBAToYRow = RGBAToYRow_Any_MSA;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 6c361c0..c684ac0 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -255,6 +255,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVRow = ARGBToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -363,6 +371,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVRow = ARGBToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -593,6 +609,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVRow = ARGBToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -800,6 +824,14 @@
}
}
#endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ABGRToUVRow = ABGRToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -996,6 +1028,14 @@
}
}
#endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ABGRToUVRow = ABGRToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -1197,6 +1237,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVRow = ARGBToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -1401,6 +1449,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVRow = ARGBToUVRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -2336,6 +2392,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVJRow = ARGBToUVJRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@@ -2512,6 +2576,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBToUVJRow = ARGBToUVJRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -2818,6 +2890,14 @@
}
}
#endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ABGRToUVJRow = ABGRToUVJRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYJRow = ABGRToYJRow_Any_MSA;
@@ -2956,6 +3036,14 @@
}
}
#endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+ if (IS_ALIGNED(width, 2)) {
+ ABGRToUVJRow = ABGRToUVJRow_SVE2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYJRow = ABGRToYJRow_Any_MSA;
diff --git a/source/row_any.cc b/source/row_any.cc
index 8ed5a49..a466e34 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2225,6 +2225,9 @@
#ifdef HAS_ARGBTOUVROW_NEON
ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_ARGBTOUVROW_SVE2
+ANY12S(ARGBToUVRow_Any_SVE2, ARGBToUVRow_SVE2, 0, 4, 1)
+#endif
#ifdef HAS_ARGBTOUVROW_MSA
ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#endif
@@ -2237,9 +2240,15 @@
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_ARGBTOUVJROW_SVE2
+ANY12S(ARGBToUVJRow_Any_SVE2, ARGBToUVJRow_SVE2, 0, 4, 1)
+#endif
#ifdef HAS_ABGRTOUVJROW_NEON
ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_ABGRTOUVJROW_SVE2
+ANY12S(ABGRToUVJRow_Any_SVE2, ABGRToUVJRow_SVE2, 0, 4, 1)
+#endif
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
@@ -2252,6 +2261,9 @@
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_BGRATOUVROW_SVE2
+ANY12S(BGRAToUVRow_Any_SVE2, BGRAToUVRow_SVE2, 0, 4, 1)
+#endif
#ifdef HAS_BGRATOUVROW_MSA
ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
#endif
@@ -2261,6 +2273,9 @@
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_ABGRTOUVROW_SVE2
+ANY12S(ABGRToUVRow_Any_SVE2, ABGRToUVRow_SVE2, 0, 4, 1)
+#endif
#ifdef HAS_ABGRTOUVROW_MSA
ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
#endif
@@ -2270,6 +2285,9 @@
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_RGBATOUVROW_SVE2
+ANY12S(RGBAToUVRow_Any_SVE2, RGBAToUVRow_SVE2, 0, 4, 1)
+#endif
#ifdef HAS_RGBATOUVROW_MSA
ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
#endif
diff --git a/source/row_sve.cc b/source/row_sve.cc
index a7048b6..ff65af0 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -262,6 +262,243 @@
: "cc", "memory", YUVTORGB_SVE_REGS);
}
+// Dot-product constants are stored as four-tuples with the two innermost
+// elements flipped to account for the interleaving nature of the widening
+// addition instructions.
+
+static const int16_t kArgbToUvArr[] = {
+ // UB, -UR, -UG, 0, -VB, VR, -VG, 0
+ 56, -19, -37, 0, -9, 56, -47, 0,
+};
+
+static const int16_t kRgbaToUvArr[] = {
+ // 0, -UG, UB, -UR, 0, -VG, -VB, VR
+ 0, -37, 56, -19, 0, -47, -9, 56,
+};
+
+static const int16_t kBgraToUvArr[] = {
+ // 0, -UG, -UR, UB, 0, -VG, VR, -VB
+ 0, -37, -19, 56, 0, -47, 56, -9,
+};
+
+static const int16_t kAbgrToUvArr[] = {
+ // -UR, UB, -UG, 0, VR, -VB, -VG, 0
+ -19, 56, -37, 0, 56, -9, -47, 0,
+};
+
+static const int16_t kArgbToUvjArr[] = {
+ // UB, -UR, -UG, 0, -VB, VR, -VG, 0
+ 63, -21, -42, 0, -10, 63, -53, 0,
+};
+
+static const int16_t kAbgrToUvjArr[] = {
+ // -UR, UB, -UG, 0, VR, -VB, -VG, 0
+ -21, 63, -42, 0, 63, -10, -53, 0,
+};
+
+void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width,
+ const int16_t* uvconstants) {
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+ uint64_t vl;
+ asm volatile(
+ "ptrue p0.b \n"
+ "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
+ "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"
+ "mov z26.b, #0x80 \n"
+
+ "cntb %[vl] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Process 4x vectors from each input row per iteration.
+ // Cannot use predication here due to unrolling.
+ "1: \n" // e.g.
+ "ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra
+ "ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra
+ "ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra
+ "ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra
+ "ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra
+ "ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra
+ "ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra
+ "ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra
+ "incb %[src0], all, mul #4 \n"
+ "incb %[src1], all, mul #4 \n"
+
+ "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
+ "uaddlt z17.h, z0.b, z4.b \n" // gagagaga
+ "uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr
+ "uaddlt z19.h, z1.b, z5.b \n" // gagagaga
+ "uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr
+ "uaddlt z21.h, z2.b, z6.b \n" // gagagaga
+ "uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr
+ "uaddlt z23.h, z3.b, z7.b \n" // gagagaga
+
+ "trn1 z0.s, z16.s, z17.s \n" // brgabgra
+ "trn2 z1.s, z16.s, z17.s \n" // brgabgra
+ "trn1 z2.s, z18.s, z19.s \n" // brgabgra
+ "trn2 z3.s, z18.s, z19.s \n" // brgabgra
+ "trn1 z4.s, z20.s, z21.s \n" // brgabgra
+ "trn2 z5.s, z20.s, z21.s \n" // brgabgra
+ "trn1 z6.s, z22.s, z23.s \n" // brgabgra
+ "trn2 z7.s, z22.s, z23.s \n" // brgabgra
+
+ "subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop
+
+ "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga
+ "urhadd z2.h, p0/m, z2.h, z3.h \n" // brgabrga
+ "urhadd z4.h, p0/m, z4.h, z5.h \n" // brgabrga
+ "urhadd z6.h, p0/m, z6.h, z7.h \n" // brgabrga
+
+ "movi v16.8h, #0 \n"
+ "movi v17.8h, #0 \n"
+ "movi v18.8h, #0 \n"
+ "movi v19.8h, #0 \n"
+
+ "movi v20.8h, #0 \n"
+ "movi v21.8h, #0 \n"
+ "movi v22.8h, #0 \n"
+ "movi v23.8h, #0 \n"
+
+ "sdot z16.d, z0.h, z24.h \n" // UUxxxxxx
+ "sdot z17.d, z2.h, z24.h \n" // UUxxxxxx
+ "sdot z18.d, z4.h, z24.h \n" // UUxxxxxx
+ "sdot z19.d, z6.h, z24.h \n" // UUxxxxxx
+
+ "sdot z20.d, z0.h, z25.h \n" // VVxxxxxx
+ "sdot z21.d, z2.h, z25.h \n" // VVxxxxxx
+ "sdot z22.d, z4.h, z25.h \n" // VVxxxxxx
+ "sdot z23.d, z6.h, z25.h \n" // VVxxxxxx
+
+ "uzp1 z16.s, z16.s, z17.s \n" // UUxx
+ "uzp1 z18.s, z18.s, z19.s \n" // UUxx
+ "uzp1 z20.s, z20.s, z21.s \n" // VVxx
+ "uzp1 z22.s, z22.s, z23.s \n" // VVxx
+
+ "uzp1 z16.h, z16.h, z18.h \n" // UU
+ "uzp1 z20.h, z20.h, z22.h \n" // VV
+
+ "addhnb z16.b, z16.h, z26.h \n" // U
+ "addhnb z20.b, z20.h, z26.h \n" // V
+
+ "st1b {z16.h}, p0, [%[dst_u]] \n" // U
+ "st1b {z20.h}, p0, [%[dst_v]] \n" // V
+ "inch %[dst_u] \n"
+ "inch %[dst_v] \n"
+
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n" // VL per loop
+ "b.le 99f \n"
+
+ // Process remaining pixels from each input row.
+ // Use predication to do one vector from each input array, so may loop up
+ // to three iterations.
+ "cntw %x[vl] \n"
+
+ "3: \n"
+ "whilelt p1.s, wzr, %w[width] \n"
+ "ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra
+ "ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra
+ "incb %[src0] \n"
+ "incb %[src1] \n"
+
+ "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
+ "uaddlt z17.h, z0.b, z4.b \n" // gagagaga
+
+ "trn1 z0.s, z16.s, z17.s \n" // brgabgra
+ "trn2 z1.s, z16.s, z17.s \n" // brgabgra
+
+ "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga
+
+ "subs %w[width], %w[width], %w[vl] \n" // VL per loop
+
+ "movi v16.8h, #0 \n"
+ "movi v20.8h, #0 \n"
+
+ "sdot z16.d, z0.h, z24.h \n"
+ "sdot z20.d, z0.h, z25.h \n"
+
+ "addhnb z16.b, z16.h, z26.h \n" // U
+ "addhnb z20.b, z20.h, z26.h \n" // V
+
+ "st1b {z16.d}, p0, [%[dst_u]] \n" // U
+ "st1b {z20.d}, p0, [%[dst_v]] \n" // V
+ "incd %[dst_u] \n"
+ "incd %[dst_v] \n"
+ "b.gt 3b \n"
+
+ "99: \n"
+ : [src0] "+r"(src_argb), // %[src0]
+ [src1] "+r"(src_argb_1), // %[src1]
+ [dst_u] "+r"(dst_u), // %[dst_u]
+ [dst_v] "+r"(dst_v), // %[dst_v]
+ [width] "+r"(width), // %[width]
+ [vl] "=&r"(vl) // %[vl]
+ : [uvconstants] "r"(uvconstants)
+ : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
+ "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
+ "p0");
+}
+
+void ARGBToUVRow_SVE2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
+ kArgbToUvArr);
+}
+
+void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
+ kArgbToUvjArr);
+}
+
+void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width) {
+ ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
+ kAbgrToUvjArr);
+}
+
+void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+ kBgraToUvArr);
+}
+
+void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+ kAbgrToUvArr);
+}
+
+void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+ kRgbaToUvArr);
+}
+
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus