[AArch64] Unroll and use TBL in ScaleRowDown34_NEON

ST3 is known to be slow on a number of modern micro-architectures. By
unrolling the code we are able to use TBL to shuffle elements into the
correct indices without needing to use LD4 and ST3, giving a good
improvement in performance across the board.

Reduction in runtimes observed compared to the existing Neon
implementation:

  Cortex-A55:  -14.4%
  Cortex-A510: -66.0%
  Cortex-A520: -50.8%
  Cortex-A76:  -60.5%
  Cortex-A715: -63.9%
  Cortex-A720: -64.2%
  Cortex-X1:   -74.3%
  Cortex-X2:   -75.4%
  Cortex-X3:   -75.5%
  Cortex-X4:   -48.1%

Bug: b/42280945
Change-Id: Ia1efb03af2d6ec00bc5a4b72168963fede9f0c83
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5785971
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 0291ba1..1433446 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -249,13 +249,13 @@
       23)
 #endif
 #ifdef HAS_SCALEROWDOWN34_NEON
+#ifdef __aarch64__
 SDANY(ScaleRowDown34_Any_NEON,
       ScaleRowDown34_NEON,
       ScaleRowDown34_C,
       4 / 3,
       1,
-      23)
-#ifdef __aarch64__
+      47)
 SDANY(ScaleRowDown34_0_Box_Any_NEON,
       ScaleRowDown34_0_Box_NEON,
       ScaleRowDown34_0_Box_C,
@@ -269,6 +269,12 @@
       1,
       47)
 #else
+SDANY(ScaleRowDown34_Any_NEON,
+      ScaleRowDown34_NEON,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
 SDANY(ScaleRowDown34_0_Box_Any_NEON,
       ScaleRowDown34_0_Box_NEON,
       ScaleRowDown34_0_Box_C,
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index c7b4c0f..15a1fe1 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -155,27 +155,42 @@
       : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }

-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
+static const uvec8 kShuf34_0 = {
+    0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20,
+};
+static const uvec8 kShuf34_1 = {
+    5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25,
+};
+static const uvec8 kShuf34_2 = {
+    11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28, 29, 31,
+};
+
+// Down scale from 4 to 3 pixels. Point samples 64 pixels to 48 pixels.
 void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
   (void)src_stride;
-  asm volatile (
-      "1:                                        \n"
-      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
-      "subs        %w2, %w2, #24                 \n"
-      "mov         v2.16b, v3.16b                \n"  // order v0,v1,v2
-      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
-      "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
-      "b.gt        1b                            \n"
-      : "+r"(src_ptr),   // %0
-        "+r"(dst_ptr),   // %1
-        "+r"(dst_width)  // %2
-      :
-      : "memory", "cc", "v0", "v1", "v2", "v3");
+  asm volatile(
+      "ld1         {v29.16b}, [%[kShuf34_0]]     \n"
+      "ld1         {v30.16b}, [%[kShuf34_1]]     \n"
+      "ld1         {v31.16b}, [%[kShuf34_2]]     \n"
+      "1:                                        \n"
+      "ld1         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n"
+      "subs        %w[width], %w[width], #48     \n"
+      "tbl         v0.16b, {v0.16b, v1.16b}, v29.16b \n"
+      "prfm        pldl1keep, [%[src_ptr], 448]  \n"
+      "tbl         v1.16b, {v1.16b, v2.16b}, v30.16b \n"
+      "tbl         v2.16b, {v2.16b, v3.16b}, v31.16b \n"
+      "st1         {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n"
+      "b.gt        1b                            \n"
+      : [src_ptr] "+r"(src_ptr),      // %[src_ptr]
+        [dst_ptr] "+r"(dst_ptr),      // %[dst_ptr]
+        [width] "+r"(dst_width)       // %[width]
+      : [kShuf34_0] "r"(&kShuf34_0),  // %[kShuf34_0]
+        [kShuf34_1] "r"(&kShuf34_1),  // %[kShuf34_1]
+        [kShuf34_2] "r"(&kShuf34_2)   // %[kShuf34_2]
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v29", "v30", "v31");
 }

 void
ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,