Fix several mathematical issues discovered in the ARM64 NEON code while running the extended regression tests introduced in r1267. Specific comments can be found in the original patches:
https://sourceforge.net/p/libjpeg-turbo/patches/64/
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1389 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S
index 0ef770a..f488b0f 100644
--- a/simd/jsimd_arm_neon_64.S
+++ b/simd/jsimd_arm_neon_64.S
@@ -35,7 +35,6 @@
#define RESPECT_STRICT_ALIGNMENT 1
-
/*****************************************************************************/
/* Supplementary macro for setting function attributes */
@@ -363,6 +362,7 @@
orr x0, x0, x4
add v4.4s, v10.4s, v12.4s
orr x0, x0, x5
+ cmp x0, #0 /* orrs instruction removed */
sub v2.4s, v10.4s, v12.4s
add v12.4s, v4.4s, v14.4s
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
@@ -377,7 +377,6 @@
rshrn ROW3L.4h, v10.4s, #11
rshrn ROW0L.4h, v12.4s, #11
rshrn ROW4L.4h, v6.4s, #11
- cmp x0, #0 /* orrs instruction removed */
beq 3f /* Go to do some special handling for the sparse right 4x8 half */
@@ -439,7 +438,7 @@
add v12.4s, v4.4s, v14.4s
sub v4.4s, v4.4s, v14.4s
add v10.4s, v2.4s, v8.4s
- sub v12.4s, v2.4s, v8.4s
+ sub v6.4s, v2.4s, v8.4s
rshrn ROW7R.4h, v4.4s, #11
rshrn ROW3R.4h, v10.4s, #11
rshrn ROW0R.4h, v12.4s, #11
@@ -1220,7 +1219,7 @@
mul v12.4h, v12.4h, v26.4h
mul v13.4h, v13.4h, v27.4h
ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
- ld1 {v30.8h}, [DCT_TABLE], 16
+ ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
mul v14.4h, v14.4h, v28.4h
mul v15.4h, v15.4h, v29.4h
ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
@@ -1327,17 +1326,17 @@
.balign 8
jsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* d0[0] */
- .short FIX_0_850430095 /* d0[1] */
- .short -FIX_1_272758580 /* d0[2] */
- .short FIX_3_624509785 /* d0[3] */
+ .short -FIX_0_720959822 /* v14[0] */
+ .short FIX_0_850430095 /* v14[1] */
+ .short -FIX_1_272758580 /* v14[2] */
+ .short FIX_3_624509785 /* v14[3] */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
sshll v15.4s, \x4, #15
- smull v26.4s, \x6, v0.4h[3]
- smlal v26.4s, \x10, v0.4h[2]
- smlal v26.4s, \x12, v0.4h[1]
- smlal v26.4s, \x16, v0.4h[0]
+ smull v26.4s, \x6, v14.4h[3]
+ smlal v26.4s, \x10, v14.4h[2]
+ smlal v26.4s, \x12, v14.4h[1]
+ smlal v26.4s, \x16, v14.4h[0]
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
@@ -1399,26 +1398,26 @@
ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
/* Dequantize */
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.8h, v4.8h, v18.8h
- mul v5.8h, v5.8h, v18.8h
+ mul v4.4h, v4.4h, v18.4h
+ mul v5.4h, v5.4h, v19.4h
ins v4.2d[1], v5.2d[0]
- mul v6.8h, v6.8h, v20.8h
- mul v7.8h, v7.8h, v21.8h
+ mul v6.4h, v6.4h, v20.4h
+ mul v7.4h, v7.4h, v21.4h
ins v6.2d[1], v7.2d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
- mul v10.8h, v10.8h, v24.8h
- mul v11.8h, v11.8h, v25.8h
+ mul v10.4h, v10.4h, v24.4h
+ mul v11.4h, v11.4h, v25.4h
ins v10.2d[1], v11.2d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
- mul v12.8h, v12.8h, v26.8h
- mul v13.8h, v13.8h, v27.8h
+ mul v12.4h, v12.4h, v26.4h
+ mul v13.4h, v13.4h, v27.4h
ins v12.2d[1], v13.2d[0]
add DCT_TABLE, DCT_TABLE, #16
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v16.8h, v16.8h, v30.8h
- mul v17.8h, v17.8h, v31.8h
+ mul v16.4h, v16.4h, v30.4h
+ mul v17.4h, v17.4h, v31.4h
ins v16.2d[1], v17.2d[0]
/* Pass 1 */
@@ -1446,8 +1445,12 @@
sub v15.4s, v30.4s, v24.4s
rshrn v5.4h, v20.4s, #13
rshrn v7.4h, v15.4s, #13
+ ins v4.2d[1], v5.2d[0]
+ ins v6.2d[1], v7.2d[0]
transpose v4, v6, v3, .16b, .8h
transpose v6, v10, v3, .16b, .4s
+ ins v11.2d[0], v10.2d[1]
+ ins v7.2d[0], v6.2d[1]
#endif
/* Pass 2 */
@@ -1515,11 +1518,11 @@
prfm PLDL1KEEP, [V, #64]
prfm PLDL1KEEP, [Y, #64]
.elseif \size == 4
- ld1 {v4.b}[0], [U]
- ld1 {v4.b}[1], [U]
- ld1 {v4.b}[2], [U]
- ld1 {v4.b}[3], [U]
- ld1 {v5.b}[0], [V]
+ ld1 {v4.b}[0], [U], 1
+ ld1 {v4.b}[1], [U], 1
+ ld1 {v4.b}[2], [U], 1
+ ld1 {v4.b}[3], [U], 1
+ ld1 {v5.b}[0], [V], 1
ld1 {v5.b}[1], [V], 1
ld1 {v5.b}[2], [V], 1
ld1 {v5.b}[3], [V], 1
@@ -1554,7 +1557,7 @@
st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
.elseif \size == 2
st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
- st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
+ st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
.elseif \size == 1
st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
.else
@@ -1751,7 +1754,7 @@
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
movi v10.16b, #255
- movi v12.16b, #255
+ movi v13.16b, #255
/* Outer loop over scanlines */
cmp NUM_ROWS, #1