[AArch64] Add Neon impls for I212To{ARGB,AR30}Row_NEON

There are existing x86 implementations for these kernels, but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | I210ToAR30Row | I210ToARGBRow
 Cortex-A55 |        -40.8% |        -54.4%
Cortex-A510 |        -26.2% |        -22.7%
 Cortex-A76 |        -49.2% |        -44.5%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I967951a6b453ac0023a30d96b754c85c2a3bf14a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607762
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 33a304e..0340db6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -561,6 +561,8 @@
 #define HAS_I410TOARGBROW_NEON
 #define HAS_I210TOAR30ROW_NEON
 #define HAS_I410TOAR30ROW_NEON
+#define HAS_I212TOARGBROW_NEON
+#define HAS_I212TOAR30ROW_NEON
 
 #define HAS_ABGRTOYJROW_NEON_DOTPROD
 #define HAS_ABGRTOYROW_NEON_DOTPROD
@@ -1122,6 +1124,18 @@
                         uint8_t* rgb_buf,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I212ToARGBRow_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I212ToAR30Row_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I422ToARGBRow_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -5183,6 +5197,18 @@
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I212ToARGBRow_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I212ToAR30Row_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
                                  const uint8_t* u_buf,
                                  const uint8_t* v_buf,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 0670f2d..5c844fd 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -1138,6 +1138,14 @@
     }
   }
 #endif
+#if defined(HAS_I212TOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I212ToAR30Row = I212ToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I212ToAR30Row = I212ToAR30Row_NEON;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
     dst_ar30 += dst_stride_ar30;
@@ -1593,6 +1601,14 @@
     }
   }
 #endif
+#if defined(HAS_I212TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I212ToARGBRow = I212ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I212ToARGBRow = I212ToARGBRow_NEON;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
diff --git a/source/row_any.cc b/source/row_any.cc
index abd731f..46e11a5 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -528,6 +528,12 @@
 #ifdef HAS_I410TOAR30ROW_NEON
 ANY31CT(I410ToAR30Row_Any_NEON, I410ToAR30Row_NEON, 0, 0, uint16_t, 2, 4, 7)
 #endif
+#ifdef HAS_I212TOARGBROW_NEON
+ANY31CT(I212ToARGBRow_Any_NEON, I212ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOAR30ROW_NEON
+ANY31CT(I212ToAR30Row_Any_NEON, I212ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7)
+#endif
 #undef ANY31CT
 
 // Any 3 planes to 1 plane with parameter
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 70b44d2..c17b586 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -52,6 +52,21 @@
   "prfm       pldl1keep, [%[src_u], 128]     \n" \
   "prfm       pldl1keep, [%[src_v], 128]     \n"
 
+// Read 8 Y, 4 U and 4 V from 212
+#define READYUV212                               \
+  "ldr        q2, [%[src_y]], #16            \n" \
+  "ldr        d1, [%[src_u]], #8             \n" \
+  "ldr        d3, [%[src_v]], #8             \n" \
+  "shl        v0.8h, v2.8h, #4               \n" \
+  "usra       v0.8h, v2.8h, #8               \n" \
+  "prfm       pldl1keep, [%[src_y], 448]     \n" \
+  "zip1       v2.8h, v3.8h, v3.8h            \n" \
+  "zip1       v3.8h, v1.8h, v1.8h            \n" \
+  "uqshrn     v1.8b, v3.8h, #4               \n" \
+  "uqshrn2    v1.16b, v2.8h, #4              \n" \
+  "prfm       pldl1keep, [%[src_u], 128]     \n" \
+  "prfm       pldl1keep, [%[src_v], 128]     \n"
+
 // Read 8 Y, 8 U and 8 V from 410
 #define READYUV410                               \
   "ldr        q1, [%[src_y]], #16            \n" \
@@ -307,6 +322,32 @@
       : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
 }
 
+void I212ToAR30Row_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  const uint16_t limit = 0x3ff0;
+  asm(YUVTORGB_SETUP
+      "dup      v22.8h, %w[limit]                  \n"
+      "movi     v23.8h, #0xc0, lsl #8              \n"  // A
+      "1:                                          \n" READYUV212 NVTORGB
+      "subs     %w[width], %w[width], #8           \n" STOREAR30
+      "b.gt     1b                                 \n"
+      : [src_y] "+r"(src_y),             // %[src_y]
+        [src_u] "+r"(src_u),             // %[src_u]
+        [src_v] "+r"(src_v),             // %[src_v]
+        [dst_ar30] "+r"(dst_ar30),       // %[dst_ar30]
+        [width] "+r"(width)              // %[width]
+      : [kUVCoeff] "r"(uv_coeff),        // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff),  // %[kRGBCoeffBias]
+        [limit] "r"(limit)               // %[limit]
+      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
+}
+
 void I210ToARGBRow_NEON(const uint16_t* src_y,
                         const uint16_t* src_u,
                         const uint16_t* src_v,
@@ -351,6 +392,30 @@
       : "cc", "memory", YUVTORGB_REGS, "v19");
 }
 
+void I212ToARGBRow_NEON(const uint16_t* src_y,
+                        const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  asm(YUVTORGB_SETUP
+      "movi        v19.8b, #255             \n"
+      "1:                                   \n" READYUV212 NVTORGB RGBTORGB8
+      "subs        %w[width], %w[width], #8 \n"
+      "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+      "b.gt        1b                       \n"
+      : [src_y] "+r"(src_y),            // %[src_y]
+        [src_u] "+r"(src_u),            // %[src_u]
+        [src_v] "+r"(src_v),            // %[src_v]
+        [dst_argb] "+r"(dst_argb),      // %[dst_argb]
+        [width] "+r"(width)             // %[width]
+      : [kUVCoeff] "r"(uv_coeff),       // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
 void I422ToARGBRow_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,