ARM64 NEON: Fix another ABI conformance issue
Based on
https://github.com/mayeut/libjpeg-turbo/commit/98a5a9dc899aa9265858a3cbe0a96289a31a1322
with wordsmithing by DRC.
In the AArch64 ABI, as in many others, it's forbidden to read/store data
below the stack pointer. Some SIMD functions were doing just that
(stack pointer misuse) when trying to preserve callee-saved registers,
and this resulted in those registers being restored with incorrect
contents under certain circumstances.
This patch fixes that behavior, and callee-saved registers are now
stored above the stack pointer throughout the function call. The patch
also removes register saving in places where it is unnecessary for this
ABI, or it makes use of unused scratch regiters instead of callee-saved
registers.
Fixes #97. Closes #101.
Refer also to https://bugzilla.redhat.com/show_bug.cgi?id=1368569
diff --git a/ChangeLog.md b/ChangeLog.md
index 9cd195f..7f417a4 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -83,6 +83,12 @@
warning makes it easier to detect actual security issues, should they arise in
the future.
+10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
+code. Some of the routines were incorrectly reading and storing data below the
+stack pointer, which caused segfaults in certain applications under specific
+circumstances.
+
+
1.5.0
=====
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 6c1a959..3309858 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -217,8 +217,9 @@
sub sp, sp, #64
adr x15, Ljsimd_idct_islow_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
ld1 {v0.8h, v1.8h}, [x15]
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
@@ -243,7 +244,6 @@
shl v10.8h, v2.8h, #(PASS1_BITS)
sqxtn v16.8b, v15.8h
mov TMP1, v16.d[0]
- sub sp, sp, #64
mvn TMP2, TMP1
cbnz TMP2, 2f
@@ -1117,18 +1117,12 @@
uxtw x3, w3
/* Save all used NEON registers */
- sub sp, sp, 272
- str x15, [sp], 16
+ sub sp, sp, 64
+ mov x9, sp
/* Load constants (v3.4h is just used for padding) */
adr TMP4, Ljsimd_idct_4x4_neon_consts
- st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1237,16 +1231,8 @@
#endif
/* vpop {v8.4h - v15.4h} ;not available */
- sub sp, sp, #272
- ldr x15, [sp], 16
- ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1320,18 +1306,13 @@
uxtw x3, w3
/* vpush {v8.4h - v15.4h} ; not available */
- sub sp, sp, 208
- str x15, [sp], 16
+ sub sp, sp, 64
+ mov x9, sp
/* Load constants */
adr TMP2, Ljsimd_idct_2x2_neon_consts
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- st1 {v21.8b, v22.8b}, [sp], 16
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- st1 {v30.8b, v31.8b}, [sp], 16
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1431,15 +1412,8 @@
st1 {v26.b}[1], [TMP2], 1
st1 {v27.b}[5], [TMP2], 1
- sub sp, sp, #208
- ldr x15, [sp], 16
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- ld1 {v21.8b, v22.8b}, [sp], 16
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- ld1 {v30.8b, v31.8b}, [sp], 16
blr x30
.unreq DCT_TABLE
@@ -1719,13 +1693,13 @@
INPUT_BUF2 .req x1
RGB .req x7
- Y .req x8
- U .req x9
- V .req x10
+ Y .req x9
+ U .req x10
+ V .req x11
N .req w15
- sub sp, sp, 336
- str x15, [sp], 16
+ sub sp, sp, 64
+ mov x9, sp
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
.if \fast_st3 == 1
@@ -1735,23 +1709,11 @@
.endif
/* Save NEON registers */
- st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v0.4h, v1.4h}, [x15], 16
ld1 {v2.8h}, [x15]
- /* Save ARM registers and handle input arguments */
- /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
- stp x4, x5, [sp], 16
- stp x6, x7, [sp], 16
- stp x8, x9, [sp], 16
- stp x10, x30, [sp], 16
ldr INPUT_BUF0, [INPUT_BUF]
ldr INPUT_BUF1, [INPUT_BUF, #8]
ldr INPUT_BUF2, [INPUT_BUF, #16]
@@ -1818,21 +1780,8 @@
b.gt 0b
9:
/* Restore all registers and return */
- sub sp, sp, #336
- ldr x15, [sp], 16
- ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
- /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
- ldp x4, x5, [sp], 16
- ldp x6, x7, [sp], 16
- ldp x8, x9, [sp], 16
- ldp x10, x30, [sp], 16
br x30
.unreq OUTPUT_WIDTH
.unreq INPUT_ROW
@@ -2101,8 +2050,9 @@
/* Save NEON registers */
sub sp, sp, #64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ mov x9, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
/* Outer loop over scanlines */
cmp NUM_ROWS, #1
@@ -2155,7 +2105,6 @@
b.gt 0b
9:
/* Restore all registers and return */
- sub sp, sp, #64
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
br x30
@@ -2359,8 +2308,9 @@
/* Save NEON registers */
sub sp, sp, #64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
/* Load all DATA into NEON registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
@@ -2590,7 +2540,6 @@
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
/* Restore NEON registers */
- sub sp, sp, #64
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
@@ -3104,7 +3053,7 @@
sub sp, sp, 272
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
/* Save ARM registers */
- stp x19, x20, [sp], 16
+ stp x19, x20, [sp]
.if \fast_tbl == 1
adr x15, Ljsimd_huff_encode_one_block_neon_consts
.else
@@ -3318,7 +3267,7 @@
and v18.16b, v18.16b, v23.16b
add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
and v20.16b, v20.16b, v23.16b
- add x15, sp, #0x80 /* x15 = t2 */
+ add x15, sp, #0x90 /* x15 = t2 */
and v22.16b, v22.16b, v23.16b
ldr w10, [x4, x12, lsl #2]
addp v16.16b, v16.16b, v18.16b
@@ -3341,7 +3290,7 @@
rbit x9, x9 /* x9 = index0 */
ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
cmp w12, #(64-8)
- mov x11, sp
+ add x11, sp, #16
b.lt 4f
cbz x9, 6f
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
@@ -3445,7 +3394,7 @@
put_bits x3, x11
cbnz x9, 1b
6:
- add x13, sp, #0xfe
+ add x13, sp, #0x10e
cmp x15, x13
b.hs 1f
ldr w12, [x5]
@@ -3453,7 +3402,6 @@
checkbuf47
put_bits x12, x14
1:
- sub sp, sp, 16
str PUT_BUFFER, [x0, #0x10]
str PUT_BITSw, [x0, #0x18]
ldp x19, x20, [sp], 16