Use consistent formatting in ARM NEON SIMD code

There aren't really any established best practices to follow here, so I
did my best to adopt a standard that will ease any future maintenance
burden.  The basic tenets of that standard are:

* Assembly instructions always start on Column 5, and operands always
  start on Column 21, except:
  - The instruction and operand can be indented (usually by 2 spaces)
    to indicate a separate instruction stream.
  - If the instruction is within an enclosing .if block in a macro,
    it should always be indented relative to the .if block.
* Comments are placed with an eye toward readability.  There are always
  at least 2 spaces between the end of a line of code and the associated
  in-line comment.  Where it made sense, I tried to line up the comments
  in blocks, and some were shifted right to avoid overlap with
  neighboring instruction lines.  This is not an exact science.
* Assembler directives and macros use 2-space indenting rules.  .if
  blocks are indented relative to the macro, and code within the .if
  blocks is indented relative to the .if directive.
* No extraneous spaces between operands.  Lining up the operands
  vertically did not really improve readability; personally, I think it
  made it worse, since my eye would tend to lose its place in the
  uniform columns of characters.  Also, code with a lot of vertical
  alignment is hard to maintain, since changing one line can
  necessitate changing a bunch of other lines just to preserve the
  alignment.
* No extraneous spaces in #defines or other directives.  In general, the
  only extraneous spaces (other than indenting spaces) are between:
  - Instructions and operands
  - Operands and in-line comments
This standard should be more or less in keeping with the other
formatting standards used within the project.  A short illustrative
fragment follows.
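
The fragment below is purely hypothetical (the macro name, registers,
and comments are placeholders rather than code from the patch that
follows); it simply shows the intended layout:

/* hypothetical example; not part of the patch */
.macro load_rows count
  .if \count == 2
    ld1             {v0.8h, v1.8h}, [x0], #32  /* two rows; indented relative to .if */
  .else
    ld1             {v0.8h}, [x0], #16         /* one row */
  .endif
.endm

    add             v2.8h, v0.8h, v1.8h  /* instruction on Column 5, operands on Column 21 */
      ldp             x1, x2, [x3], 16   /* indented 2 spaces: separate instruction stream */
    sub             v3.8h, v0.8h, v1.8h  /* at least 2 spaces before in-line comments */
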
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 5acb713..c13a859 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -28,7 +28,7 @@
  */
 
 #if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
 .text
@@ -55,73 +55,74 @@
 .endm
 
 /* Transpose elements of single 128 bit registers */
-.macro transpose_single x0,x1,xi,xilen,literal
-    ins  \xi\xilen[0],  \x0\xilen[0]
-    ins  \x1\xilen[0],  \x0\xilen[1]
-    trn1 \x0\literal,   \x0\literal, \x1\literal
-    trn2 \x1\literal,   \xi\literal, \x1\literal
+.macro transpose_single x0, x1, xi, xilen, literal
+    ins             \xi\xilen[0], \x0\xilen[0]
+    ins             \x1\xilen[0], \x0\xilen[1]
+    trn1            \x0\literal, \x0\literal, \x1\literal
+    trn2            \x1\literal, \xi\literal, \x1\literal
 .endm
 
 /* Transpose elements of 2 differnet registers */
-.macro transpose x0,x1,xi,xilen,literal
-    mov  \xi\xilen,     \x0\xilen
-    trn1 \x0\literal,   \x0\literal, \x1\literal
-    trn2 \x1\literal,   \xi\literal, \x1\literal
+.macro transpose x0, x1, xi, xilen, literal
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\literal, \x0\literal, \x1\literal
+    trn2            \x1\literal, \xi\literal, \x1\literal
 .endm
 
 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
-    mov  \xi\xilen, \x0\xilen
-    trn1 \x0\x0len, \x0\x0len, \x2\x2len
-    trn2 \x2\x2len, \xi\x0len, \x2\x2len
-    mov  \xi\xilen, \x1\xilen
-    trn1 \x1\x1len, \x1\x1len, \x3\x3len
-    trn2 \x3\x3len, \xi\x1len, \x3\x3len
+.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\x0len, \x0\x0len, \x2\x2len
+    trn2            \x2\x2len, \xi\x0len, \x2\x2len
+    mov             \xi\xilen, \x1\xilen
+    trn1            \x1\x1len, \x1\x1len, \x3\x3len
+    trn2            \x3\x3len, \xi\x1len, \x3\x3len
 .endm
 
-.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
-    mov  \xi\xilen, \x0\xilen
-    trn1 \x0\x0len, \x0\x0len, \x1\x1len
-    trn2 \x1\x2len, \xi\x0len, \x1\x2len
-    mov  \xi\xilen, \x2\xilen
-    trn1 \x2\x2len, \x2\x2len, \x3\x3len
-    trn2 \x3\x2len, \xi\x1len, \x3\x3len
+.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\x0len, \x0\x0len, \x1\x1len
+    trn2            \x1\x2len, \xi\x0len, \x1\x2len
+    mov             \xi\xilen, \x2\xilen
+    trn1            \x2\x2len, \x2\x2len, \x3\x3len
+    trn2            \x3\x2len, \xi\x1len, \x3\x3len
 .endm
 
-.macro transpose_4x4 x0, x1, x2, x3,x5
-    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
-    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
+.macro transpose_4x4 x0, x1, x2, x3, x5
+    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
+    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
 .endm
 
 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
-    trn1 \t0\().8h, \l0\().8h, \l1\().8h
-    trn1 \t1\().8h, \l2\().8h, \l3\().8h
-    trn1 \t2\().8h, \l4\().8h, \l5\().8h
-    trn1 \t3\().8h, \l6\().8h, \l7\().8h
-    trn2 \l1\().8h, \l0\().8h, \l1\().8h
-    trn2 \l3\().8h, \l2\().8h, \l3\().8h
-    trn2 \l5\().8h, \l4\().8h, \l5\().8h
-    trn2 \l7\().8h, \l6\().8h, \l7\().8h
+    trn1            \t0\().8h, \l0\().8h, \l1\().8h
+    trn1            \t1\().8h, \l2\().8h, \l3\().8h
+    trn1            \t2\().8h, \l4\().8h, \l5\().8h
+    trn1            \t3\().8h, \l6\().8h, \l7\().8h
+    trn2            \l1\().8h, \l0\().8h, \l1\().8h
+    trn2            \l3\().8h, \l2\().8h, \l3\().8h
+    trn2            \l5\().8h, \l4\().8h, \l5\().8h
+    trn2            \l7\().8h, \l6\().8h, \l7\().8h
 
-    trn1 \l4\().4s, \t2\().4s, \t3\().4s
-    trn2 \t3\().4s, \t2\().4s, \t3\().4s
-    trn1 \t2\().4s, \t0\().4s, \t1\().4s
-    trn2 \l2\().4s, \t0\().4s, \t1\().4s
-    trn1 \t0\().4s, \l1\().4s, \l3\().4s
-    trn2 \l3\().4s, \l1\().4s, \l3\().4s
-    trn2 \t1\().4s, \l5\().4s, \l7\().4s
-    trn1 \l5\().4s, \l5\().4s, \l7\().4s
+    trn1            \l4\().4s, \t2\().4s, \t3\().4s
+    trn2            \t3\().4s, \t2\().4s, \t3\().4s
+    trn1            \t2\().4s, \t0\().4s, \t1\().4s
+    trn2            \l2\().4s, \t0\().4s, \t1\().4s
+    trn1            \t0\().4s, \l1\().4s, \l3\().4s
+    trn2            \l3\().4s, \l1\().4s, \l3\().4s
+    trn2            \t1\().4s, \l5\().4s, \l7\().4s
+    trn1            \l5\().4s, \l5\().4s, \l7\().4s
 
-    trn2 \l6\().2d, \l2\().2d, \t3\().2d
-    trn1 \l0\().2d, \t2\().2d, \l4\().2d
-    trn1 \l1\().2d, \t0\().2d, \l5\().2d
-    trn2 \l7\().2d, \l3\().2d, \t1\().2d
-    trn1 \l2\().2d, \l2\().2d, \t3\().2d
-    trn2 \l4\().2d, \t2\().2d, \l4\().2d
-    trn1 \l3\().2d, \l3\().2d, \t1\().2d
-    trn2 \l5\().2d, \t0\().2d, \l5\().2d
+    trn2            \l6\().2d, \l2\().2d, \t3\().2d
+    trn1            \l0\().2d, \t2\().2d, \l4\().2d
+    trn1            \l1\().2d, \t0\().2d, \l5\().2d
+    trn2            \l7\().2d, \l3\().2d, \t1\().2d
+    trn1            \l2\().2d, \l2\().2d, \t3\().2d
+    trn2            \l4\().2d, \t2\().2d, \l4\().2d
+    trn1            \l3\().2d, \l3\().2d, \t1\().2d
+    trn2            \l5\().2d, \t0\().2d, \l5\().2d
 .endm
 
+
 #define CENTERJSAMPLE 128
 
 /*****************************************************************************/
@@ -134,41 +135,40 @@
  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
-#define CENTERJSAMPLE 128
-#define CONST_BITS    13
-#define PASS1_BITS    2
+#define CONST_BITS 13
+#define PASS1_BITS 2
 
-#define F_0_298      2446           /* FIX(0.298631336) */
-#define F_0_390      3196           /* FIX(0.390180644) */
-#define F_0_541      4433           /* FIX(0.541196100) */
-#define F_0_765      6270           /* FIX(0.765366865) */
-#define F_0_899      7373           /* FIX(0.899976223) */
-#define F_1_175      9633           /* FIX(1.175875602) */
-#define F_1_501     12299           /* FIX(1.501321110) */
-#define F_1_847     15137           /* FIX(1.847759065) */
-#define F_1_961     16069           /* FIX(1.961570560) */
-#define F_2_053     16819           /* FIX(2.053119869) */
-#define F_2_562     20995           /* FIX(2.562915447) */
-#define F_3_072     25172           /* FIX(3.072711026) */
+#define F_0_298  2446  /* FIX(0.298631336) */
+#define F_0_390  3196  /* FIX(0.390180644) */
+#define F_0_541  4433  /* FIX(0.541196100) */
+#define F_0_765  6270  /* FIX(0.765366865) */
+#define F_0_899  7373  /* FIX(0.899976223) */
+#define F_1_175  9633  /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
 
 .balign 16
 Ljsimd_idct_islow_neon_consts:
-    .short F_0_298
-    .short -F_0_390
-    .short F_0_541
-    .short F_0_765
-    .short - F_0_899
-    .short F_1_175
-    .short F_1_501
-    .short - F_1_847
-    .short - F_1_961
-    .short F_2_053
-    .short - F_2_562
-    .short F_3_072
-    .short 0  /* padding */
-    .short 0
-    .short 0
-    .short 0
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
 
 #undef F_0_298
 #undef F_0_390
@@ -212,45 +212,45 @@
 
     sub             sp, sp, #64
     adr             x15, Ljsimd_idct_islow_neon_consts
-    st1             { v8.8b,  v9.8b, v10.8b, v11.8b}, [sp], #32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
-    ld1             { v0.8h,  v1.8h}, [x15]
-    ld1             { v2.8h,  v3.8h,  v4.8h,  v5.8h}, [COEF_BLOCK], #64
-    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE],  #64
-    ld1             { v6.8h,  v7.8h,  v8.8h,  v9.8h}, [COEF_BLOCK], #64
-    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE],  #64
+    ld1             {v0.8h, v1.8h}, [x15]
+    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
+    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
 
-    cmeq            v16.8h,    v3.8h,   #0
-    cmeq            v26.8h,    v4.8h,   #0
-    cmeq            v27.8h,    v5.8h,   #0
-    cmeq            v28.8h,    v6.8h,   #0
-    cmeq            v29.8h,    v7.8h,   #0
-    cmeq            v30.8h,    v8.8h,   #0
-    cmeq            v31.8h,    v9.8h,   #0
+    cmeq            v16.8h, v3.8h, #0
+    cmeq            v26.8h, v4.8h, #0
+    cmeq            v27.8h, v5.8h, #0
+    cmeq            v28.8h, v6.8h, #0
+    cmeq            v29.8h, v7.8h, #0
+    cmeq            v30.8h, v8.8h, #0
+    cmeq            v31.8h, v9.8h, #0
 
-    and            v10.16b,  v16.16b,   v26.16b
-    and            v11.16b,  v27.16b,   v28.16b
-    and            v12.16b,  v29.16b,   v30.16b
-    and            v13.16b,  v31.16b,   v10.16b
-    and            v14.16b,  v11.16b,   v12.16b
-    mul              v2.8h,    v2.8h,   v18.8h
-    and            v15.16b,  v13.16b,   v14.16b
-    shl             v10.8h,    v2.8h,   #(PASS1_BITS)
-    sqxtn           v16.8b,   v15.8h
-    mov               TMP1,  v16.d[0]
-    sub                 sp,       sp,   #64
-    mvn               TMP2,  TMP1
+    and             v10.16b, v16.16b, v26.16b
+    and             v11.16b, v27.16b, v28.16b
+    and             v12.16b, v29.16b, v30.16b
+    and             v13.16b, v31.16b, v10.16b
+    and             v14.16b, v11.16b, v12.16b
+    mul             v2.8h, v2.8h, v18.8h
+    and             v15.16b, v13.16b, v14.16b
+    shl             v10.8h, v2.8h, #(PASS1_BITS)
+    sqxtn           v16.8b, v15.8h
+    mov             TMP1, v16.d[0]
+    sub             sp, sp, #64
+    mvn             TMP2, TMP1
 
-    cbnz              TMP2,  2f
+    cbnz            TMP2, 2f
     /* case all AC coeffs are zeros */
-    dup              v2.2d, v10.d[0]
-    dup              v6.2d, v10.d[1]
-    mov             v3.16b,   v2.16b
-    mov             v7.16b,   v6.16b
-    mov             v4.16b,   v2.16b
-    mov             v8.16b,   v6.16b
-    mov             v5.16b,   v2.16b
-    mov             v9.16b,   v6.16b
+    dup             v2.2d, v10.d[0]
+    dup             v6.2d, v10.d[1]
+    mov             v3.16b, v2.16b
+    mov             v7.16b, v6.16b
+    mov             v4.16b, v2.16b
+    mov             v8.16b, v6.16b
+    mov             v5.16b, v2.16b
+    mov             v9.16b, v6.16b
 1:
     /* for this transpose, we should organise data like this:
      * 00, 01, 02, 03, 40, 41, 42, 43
@@ -262,177 +262,177 @@
      * 24, 25, 26, 27, 64, 65, 66, 67
      * 34, 35, 36, 37, 74, 75, 76, 77
      */
-    trn1            v28.8h,    v2.8h,   v3.8h
-    trn1            v29.8h,    v4.8h,   v5.8h
-    trn1            v30.8h,    v6.8h,   v7.8h
-    trn1            v31.8h,    v8.8h,   v9.8h
-    trn2            v16.8h,    v2.8h,   v3.8h
-    trn2            v17.8h,    v4.8h,   v5.8h
-    trn2            v18.8h,    v6.8h,   v7.8h
-    trn2            v19.8h,    v8.8h,   v9.8h
-    trn1             v2.4s,   v28.4s,  v29.4s
-    trn1             v6.4s,   v30.4s,  v31.4s
-    trn1             v3.4s,   v16.4s,  v17.4s
-    trn1             v7.4s,   v18.4s,  v19.4s
-    trn2             v4.4s,   v28.4s,  v29.4s
-    trn2             v8.4s,   v30.4s,  v31.4s
-    trn2             v5.4s,   v16.4s,  v17.4s
-    trn2             v9.4s,   v18.4s,  v19.4s
+    trn1            v28.8h, v2.8h, v3.8h
+    trn1            v29.8h, v4.8h, v5.8h
+    trn1            v30.8h, v6.8h, v7.8h
+    trn1            v31.8h, v8.8h, v9.8h
+    trn2            v16.8h, v2.8h, v3.8h
+    trn2            v17.8h, v4.8h, v5.8h
+    trn2            v18.8h, v6.8h, v7.8h
+    trn2            v19.8h, v8.8h, v9.8h
+    trn1            v2.4s, v28.4s, v29.4s
+    trn1            v6.4s, v30.4s, v31.4s
+    trn1            v3.4s, v16.4s, v17.4s
+    trn1            v7.4s, v18.4s, v19.4s
+    trn2            v4.4s, v28.4s, v29.4s
+    trn2            v8.4s, v30.4s, v31.4s
+    trn2            v5.4s, v16.4s, v17.4s
+    trn2            v9.4s, v18.4s, v19.4s
     /* Even part: reverse the even part of the forward DCT. */
-    add             v18.8h,    v4.8h,   v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.8h,    v2.8h,   v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull2          v19.4s,   v18.8h,   XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sub             v26.8h,    v2.8h,   v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull           v18.4s,   v18.4h,   XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sshll2          v23.4s,   v22.8h,   #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov            v21.16b,  v19.16b /* tmp3 = z1 */
-    mov            v20.16b,  v18.16b /* tmp3 = z1 */
-    smlal2          v19.4s,    v8.8h,   XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    smlal           v18.4s,    v8.4h,   XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    sshll2          v27.4s,   v26.8h,   #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal2          v21.4s,    v4.8h,   XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    smlal           v20.4s,    v4.4h,   XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    sshll           v22.4s,   v22.4h,   #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    sshll           v26.4s,   v26.4h,   #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    add              v2.4s,   v22.4s,   v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
-    sub              v6.4s,   v22.4s,   v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
-    add              v8.4s,   v26.4s,   v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
-    sub              v4.4s,   v26.4s,   v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
-    add             v28.4s,   v23.4s,   v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
-    sub             v31.4s,   v23.4s,   v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
-    add             v29.4s,   v27.4s,   v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
-    sub             v30.4s,   v27.4s,   v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    add             v22.8h,    v9.8h,   v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.8h,    v7.8h,   v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.8h,    v9.8h,   v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.8h,    v7.8h,   v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.8h,   v22.8h,   v24.8h /* z5 = z3 + z4 */
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 
-    smull2          v11.4s,    v9.8h,   XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull2          v13.4s,    v7.8h,   XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull2          v15.4s,    v5.8h,   XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull2          v17.4s,    v3.8h,   XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull2          v27.4s,   v26.8h,   XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s,   v22.8h,   XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s,   v24.8h,   XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s,   v18.8h,   XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s,   v20.8h,   XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 
-    smull           v10.4s,    v9.4h,   XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull           v12.4s,    v7.4h,   XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull           v14.4s,    v5.4h,   XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull           v16.4s,    v3.4h,   XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull           v26.4s,   v26.4h,   XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s,   v22.4h,   XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s,   v24.4h,   XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s,   v18.4h,   XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s,   v20.4h,   XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 
-    add             v23.4s,   v23.4s,   v27.4s /* z3 += z5 */
-    add             v22.4s,   v22.4s,   v26.4s /* z3 += z5 */
-    add             v25.4s,   v25.4s,   v27.4s /* z4 += z5 */
-    add             v24.4s,   v24.4s,   v26.4s /* z4 += z5 */
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 
-    add             v11.4s,   v11.4s,   v19.4s /* tmp0 += z1 */
-    add             v10.4s,   v10.4s,   v18.4s /* tmp0 += z1 */
-    add             v13.4s,   v13.4s,   v21.4s /* tmp1 += z2 */
-    add             v12.4s,   v12.4s,   v20.4s /* tmp1 += z2 */
-    add             v15.4s,   v15.4s,   v21.4s /* tmp2 += z2 */
-    add             v14.4s,   v14.4s,   v20.4s /* tmp2 += z2 */
-    add             v17.4s,   v17.4s,   v19.4s /* tmp3 += z1 */
-    add             v16.4s,   v16.4s,   v18.4s /* tmp3 += z1 */
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
 
-    add             v11.4s,   v11.4s,   v23.4s /* tmp0 += z3 */
-    add             v10.4s,   v10.4s,   v22.4s /* tmp0 += z3 */
-    add             v13.4s,   v13.4s,   v25.4s /* tmp1 += z4 */
-    add             v12.4s,   v12.4s,   v24.4s /* tmp1 += z4 */
-    add             v17.4s,   v17.4s,   v25.4s /* tmp3 += z4 */
-    add             v16.4s,   v16.4s,   v24.4s /* tmp3 += z4 */
-    add             v15.4s,   v15.4s,   v23.4s /* tmp2 += z3 */
-    add             v14.4s,   v14.4s,   v22.4s /* tmp2 += z3 */
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    add             v18.4s,    v2.4s,   v16.4s /* tmp10 + tmp3 */
-    add             v19.4s,   v28.4s,   v17.4s /* tmp10 + tmp3 */
-    sub             v20.4s,    v2.4s,   v16.4s /* tmp10 - tmp3 */
-    sub             v21.4s,   v28.4s,   v17.4s /* tmp10 - tmp3 */
-    add             v22.4s,    v8.4s,   v14.4s /* tmp11 + tmp2 */
-    add             v23.4s,   v29.4s,   v15.4s /* tmp11 + tmp2 */
-    sub             v24.4s,    v8.4s,   v14.4s /* tmp11 - tmp2 */
-    sub             v25.4s,   v29.4s,   v15.4s /* tmp11 - tmp2 */
-    add             v26.4s,    v4.4s,   v12.4s /* tmp12 + tmp1 */
-    add             v27.4s,   v30.4s,   v13.4s /* tmp12 + tmp1 */
-    sub             v28.4s,    v4.4s,   v12.4s /* tmp12 - tmp1 */
-    sub             v29.4s,   v30.4s,   v13.4s /* tmp12 - tmp1 */
-    add             v14.4s,    v6.4s,   v10.4s /* tmp13 + tmp0 */
-    add             v15.4s,   v31.4s,   v11.4s /* tmp13 + tmp0 */
-    sub             v16.4s,    v6.4s,   v10.4s /* tmp13 - tmp0 */
-    sub             v17.4s,   v31.4s,   v11.4s /* tmp13 - tmp0 */
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 
-    shrn             v2.4h,   v18.4s,   #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn             v9.4h,   v20.4s,   #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn             v3.4h,   v22.4s,   #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn             v8.4h,   v24.4s,   #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn             v4.4h,   v26.4s,   #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn             v7.4h,   v28.4s,   #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn             v5.4h,   v14.4s,   #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn             v6.4h,   v16.4s,   #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v2.8h,   v19.4s,   #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v9.8h,   v21.4s,   #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v3.8h,   v23.4s,   #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v8.8h,   v25.4s,   #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v4.8h,   v27.4s,   #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v7.8h,   v29.4s,   #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v5.8h,   v15.4s,   #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2            v6.8h,   v17.4s,   #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
-    movi            v0.16b,   #(CENTERJSAMPLE)
-/* Prepare pointers (dual-issue with NEON instructions) */
-      ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    sqrshrn         v28.8b,    v2.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      ldp             TMP3,     TMP4,     [OUTPUT_BUF], 16
-    sqrshrn         v29.8b,    v3.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP1,     TMP1,     OUTPUT_COL
-    sqrshrn         v30.8b,    v4.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP2,     TMP2,     OUTPUT_COL
-    sqrshrn         v31.8b,    v5.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      add             TMP3,     TMP3,     OUTPUT_COL
-    sqrshrn2        v28.16b,   v6.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      add              TMP4,    TMP4,     OUTPUT_COL
-    sqrshrn2        v29.16b,   v7.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      ldp              TMP5,    TMP6,     [OUTPUT_BUF], 16
-    sqrshrn2        v30.16b,   v8.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      ldp              TMP7,    TMP8,     [OUTPUT_BUF], 16
-    sqrshrn2        v31.16b,   v9.8h,   #(CONST_BITS+PASS1_BITS+3-16)
-      add              TMP5,    TMP5,     OUTPUT_COL
-    add             v16.16b, v28.16b,   v0.16b
-      add              TMP6,    TMP6,     OUTPUT_COL
-    add             v18.16b, v29.16b,   v0.16b
-      add              TMP7,    TMP7,     OUTPUT_COL
-    add             v20.16b, v30.16b,   v0.16b
-      add              TMP8,    TMP8,     OUTPUT_COL
-    add             v22.16b, v31.16b,   v0.16b
+    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    movi            v0.16b, #(CENTERJSAMPLE)
+    /* Prepare pointers (dual-issue with NEON instructions) */
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP1, TMP1, OUTPUT_COL
+    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP2, TMP2, OUTPUT_COL
+    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP3, TMP3, OUTPUT_COL
+    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP4, TMP4, OUTPUT_COL
+    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
 
     /* Transpose the final 8-bit samples */
-    trn1            v28.16b, v16.16b,   v18.16b
-    trn1            v30.16b, v20.16b,   v22.16b
-    trn2            v29.16b, v16.16b,   v18.16b
-    trn2            v31.16b, v20.16b,   v22.16b
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
 
-    trn1            v16.8h,   v28.8h,   v30.8h
-    trn2            v18.8h,   v28.8h,   v30.8h
-    trn1            v20.8h,   v29.8h,   v31.8h
-    trn2            v22.8h,   v29.8h,   v31.8h
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
 
-    uzp1            v28.4s,   v16.4s,   v18.4s
-    uzp2            v30.4s,   v16.4s,   v18.4s
-    uzp1            v29.4s,   v20.4s,   v22.4s
-    uzp2            v31.4s,   v20.4s,   v22.4s
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
 
     /* Store results to the output buffer */
     st1             {v28.d}[0], [TMP1]
@@ -443,294 +443,294 @@
     st1             {v31.d}[0], [TMP6]
     st1             {v30.d}[1], [TMP7]
     st1             {v31.d}[1], [TMP8]
-    ld1             { v8.8b,  v9.8b, v10.8b, v11.8b}, [sp], #32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
     blr             x30
 
 .balign 16
 2:
-    mul              v3.8h,    v3.8h,   v19.8h
-    mul              v4.8h,    v4.8h,   v20.8h
-    mul              v5.8h,    v5.8h,   v21.8h
-    add               TMP4,      xzr,   TMP2,  LSL #32
-    mul              v6.8h,    v6.8h,   v22.8h
-    mul              v7.8h,    v7.8h,   v23.8h
-    adds              TMP3,      xzr,   TMP2,  LSR #32
-    mul              v8.8h,    v8.8h,   v24.8h
-    mul              v9.8h,    v9.8h,   v25.8h
-    b.ne             3f
+    mul             v3.8h, v3.8h, v19.8h
+    mul             v4.8h, v4.8h, v20.8h
+    mul             v5.8h, v5.8h, v21.8h
+    add             TMP4, xzr, TMP2, LSL #32
+    mul             v6.8h, v6.8h, v22.8h
+    mul             v7.8h, v7.8h, v23.8h
+    adds            TMP3, xzr, TMP2, LSR #32
+    mul             v8.8h, v8.8h, v24.8h
+    mul             v9.8h, v9.8h, v25.8h
+    b.ne            3f
     /* Right AC coef is zero */
     dup             v15.2d, v10.d[1]
     /* Even part: reverse the even part of the forward DCT. */
-    add             v18.4h,    v4.4h,   v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.4h,    v2.4h,   v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    sub             v26.4h,    v2.4h,   v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull           v18.4s,   v18.4h,   XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sshll           v22.4s,   v22.4h,   #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov            v20.16b,  v18.16b /* tmp3 = z1 */
-    sshll           v26.4s,   v26.4h,   #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal           v18.4s,    v8.4h,   XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    smlal           v20.4s,    v4.4h,   XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    add              v2.4s,   v22.4s,   v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
-    sub              v6.4s,   v22.4s,   v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
-    add              v8.4s,   v26.4s,   v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
-    sub              v4.4s,   v26.4s,   v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
 
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    add             v22.4h,    v9.4h,   v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.4h,    v7.4h,   v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.4h,    v9.4h,   v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.4h,    v7.4h,   v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.4h,   v22.4h,   v24.4h /* z5 = z3 + z4 */
+    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
 
-    smull           v10.4s,    v9.4h,   XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull           v12.4s,    v7.4h,   XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull           v14.4s,    v5.4h,   XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull           v16.4s,    v3.4h,   XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull           v26.4s,   v26.4h,   XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s,   v22.4h,   XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s,   v24.4h,   XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s,   v18.4h,   XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s,   v20.4h,   XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 
-    add             v22.4s,   v22.4s,   v26.4s /* z3 += z5 */
-    add             v24.4s,   v24.4s,   v26.4s /* z4 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 
-    add             v10.4s,   v10.4s,   v18.4s /* tmp0 += z1 */
-    add             v12.4s,   v12.4s,   v20.4s /* tmp1 += z2 */
-    add             v14.4s,   v14.4s,   v20.4s /* tmp2 += z2 */
-    add             v16.4s,   v16.4s,   v18.4s /* tmp3 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
 
-    add             v10.4s,   v10.4s,   v22.4s /* tmp0 += z3 */
-    add             v12.4s,   v12.4s,   v24.4s /* tmp1 += z4 */
-    add             v16.4s,   v16.4s,   v24.4s /* tmp3 += z4 */
-    add             v14.4s,   v14.4s,   v22.4s /* tmp2 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    add             v18.4s,    v2.4s,   v16.4s /* tmp10 + tmp3 */
-    sub             v20.4s,    v2.4s,   v16.4s /* tmp10 - tmp3 */
-    add             v22.4s,    v8.4s,   v14.4s /* tmp11 + tmp2 */
-    sub             v24.4s,    v8.4s,   v14.4s /* tmp11 - tmp2 */
-    add             v26.4s,    v4.4s,   v12.4s /* tmp12 + tmp1 */
-    sub             v28.4s,    v4.4s,   v12.4s /* tmp12 - tmp1 */
-    add             v14.4s,    v6.4s,   v10.4s /* tmp13 + tmp0 */
-    sub             v16.4s,    v6.4s,   v10.4s /* tmp13 - tmp0 */
+    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
 
-    rshrn            v2.4h,   v18.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn            v3.4h,   v22.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn            v4.4h,   v26.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn            v5.4h,   v14.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2           v2.8h,   v16.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2           v3.8h,   v28.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2           v4.8h,   v24.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2           v5.8h,   v20.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    mov             v6.16b,  v15.16b
-    mov             v7.16b,  v15.16b
-    mov             v8.16b,  v15.16b
-    mov             v9.16b,  v15.16b
-    b                1b
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    mov             v6.16b, v15.16b
+    mov             v7.16b, v15.16b
+    mov             v8.16b, v15.16b
+    mov             v9.16b, v15.16b
+    b               1b
 
 .balign 16
 3:
-    cbnz              TMP4,    4f
+    cbnz            TMP4, 4f
     /* Left AC coef is zero */
     dup             v14.2d, v10.d[0]
     /* Even part: reverse the even part of the forward DCT. */
-    add             v18.8h,    v4.8h,   v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.8h,    v2.8h,   v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull2          v19.4s,   v18.8h,   XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sub             v26.8h,    v2.8h,   v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    sshll2          v23.4s,   v22.8h,   #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov            v21.16b,  v19.16b /* tmp3 = z1 */
-    smlal2          v19.4s,    v8.8h,   XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    sshll2          v27.4s,   v26.8h,   #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal2          v21.4s,    v4.8h,   XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    add             v28.4s,   v23.4s,   v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
-    sub             v31.4s,   v23.4s,   v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
-    add             v29.4s,   v27.4s,   v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
-    sub             v30.4s,   v27.4s,   v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    add             v22.8h,    v9.8h,   v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.8h,    v7.8h,   v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.8h,    v9.8h,   v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.8h,    v7.8h,   v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.8h,   v22.8h,   v24.8h /* z5 = z3 + z4 */
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 
-    smull2          v11.4s,    v9.8h,   XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull2          v13.4s,    v7.8h,   XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull2          v15.4s,    v5.8h,   XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull2          v17.4s,    v3.8h,   XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull2          v27.4s,   v26.8h,   XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s,   v22.8h,   XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s,   v24.8h,   XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s,   v18.8h,   XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s,   v20.8h,   XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 
-    add             v23.4s,   v23.4s,   v27.4s /* z3 += z5 */
-    add             v22.4s,   v22.4s,   v26.4s /* z3 += z5 */
-    add             v25.4s,   v25.4s,   v27.4s /* z4 += z5 */
-    add             v24.4s,   v24.4s,   v26.4s /* z4 += z5 */
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 
-    add             v11.4s,   v11.4s,   v19.4s /* tmp0 += z1 */
-    add             v13.4s,   v13.4s,   v21.4s /* tmp1 += z2 */
-    add             v15.4s,   v15.4s,   v21.4s /* tmp2 += z2 */
-    add             v17.4s,   v17.4s,   v19.4s /* tmp3 += z1 */
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
 
-    add             v11.4s,   v11.4s,   v23.4s /* tmp0 += z3 */
-    add             v13.4s,   v13.4s,   v25.4s /* tmp1 += z4 */
-    add             v17.4s,   v17.4s,   v25.4s /* tmp3 += z4 */
-    add             v15.4s,   v15.4s,   v23.4s /* tmp2 += z3 */
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    add             v19.4s,   v28.4s,   v17.4s /* tmp10 + tmp3 */
-    sub             v21.4s,   v28.4s,   v17.4s /* tmp10 - tmp3 */
-    add             v23.4s,   v29.4s,   v15.4s /* tmp11 + tmp2 */
-    sub             v25.4s,   v29.4s,   v15.4s /* tmp11 - tmp2 */
-    add             v27.4s,   v30.4s,   v13.4s /* tmp12 + tmp1 */
-    sub             v29.4s,   v30.4s,   v13.4s /* tmp12 - tmp1 */
-    add             v15.4s,   v31.4s,   v11.4s /* tmp13 + tmp0 */
-    sub             v17.4s,   v31.4s,   v11.4s /* tmp13 - tmp0 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 
-    mov             v2.16b,  v14.16b
-    mov             v3.16b,  v14.16b
-    mov             v4.16b,  v14.16b
-    mov             v5.16b,  v14.16b
-    rshrn            v6.4h,   v19.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn            v7.4h,   v23.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn            v8.4h,   v27.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn            v9.4h,   v15.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2           v6.8h,   v17.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2           v7.8h,   v29.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2           v8.8h,   v25.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2           v9.8h,   v21.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    b                1b
+    mov             v2.16b, v14.16b
+    mov             v3.16b, v14.16b
+    mov             v4.16b, v14.16b
+    mov             v5.16b, v14.16b
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
 
 .balign 16
 4:
     /* "No" AC coef is zero */
     /* Even part: reverse the even part of the forward DCT. */
-    add             v18.8h,    v4.8h,   v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
-    add             v22.8h,    v2.8h,   v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull2          v19.4s,   v18.8h,   XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sub             v26.8h,    v2.8h,   v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
-    smull           v18.4s,   v18.4h,   XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
-    sshll2          v23.4s,   v22.8h,   #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    mov            v21.16b,  v19.16b /* tmp3 = z1 */
-    mov            v20.16b,  v18.16b /* tmp3 = z1 */
-    smlal2          v19.4s,    v8.8h,   XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    smlal           v18.4s,    v8.4h,   XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    sshll2          v27.4s,   v26.8h,   #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal2          v21.4s,    v4.8h,   XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    smlal           v20.4s,    v4.4h,   XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
-    sshll           v22.4s,   v22.4h,   #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
-    sshll           v26.4s,   v26.4h,   #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    add              v2.4s,   v22.4s,   v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
-    sub              v6.4s,   v22.4s,   v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
-    add              v8.4s,   v26.4s,   v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
-    sub              v4.4s,   v26.4s,   v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
-    add             v28.4s,   v23.4s,   v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
-    sub             v31.4s,   v23.4s,   v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
-    add             v29.4s,   v27.4s,   v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
-    sub             v30.4s,   v27.4s,   v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    add             v22.8h,    v9.8h,   v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v24.8h,    v7.8h,   v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v18.8h,    v9.8h,   v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
-    add             v20.8h,    v7.8h,   v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
-    add             v26.8h,   v22.8h,   v24.8h /* z5 = z3 + z4 */
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 
-    smull2          v11.4s,    v9.8h,   XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull2          v13.4s,    v7.8h,   XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull2          v15.4s,    v5.8h,   XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull2          v17.4s,    v3.8h,   XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull2          v27.4s,   v26.8h,   XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s,   v22.8h,   XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s,   v24.8h,   XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s,   v18.8h,   XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s,   v20.8h,   XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 
-    smull           v10.4s,    v9.4h,   XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
-    smull           v12.4s,    v7.4h,   XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
-    smull           v14.4s,    v5.4h,   XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
-    smull           v16.4s,    v3.4h,   XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
-    smull           v26.4s,   v26.4h,   XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s,   v22.4h,   XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s,   v24.4h,   XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s,   v18.4h,   XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s,   v20.4h,   XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 
-    add             v23.4s,   v23.4s,   v27.4s /* z3 += z5 */
-    add             v22.4s,   v22.4s,   v26.4s /* z3 += z5 */
-    add             v25.4s,   v25.4s,   v27.4s /* z4 += z5 */
-    add             v24.4s,   v24.4s,   v26.4s /* z4 += z5 */
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 
-    add             v11.4s,   v11.4s,   v19.4s /* tmp0 += z1 */
-    add             v10.4s,   v10.4s,   v18.4s /* tmp0 += z1 */
-    add             v13.4s,   v13.4s,   v21.4s /* tmp1 += z2 */
-    add             v12.4s,   v12.4s,   v20.4s /* tmp1 += z2 */
-    add             v15.4s,   v15.4s,   v21.4s /* tmp2 += z2 */
-    add             v14.4s,   v14.4s,   v20.4s /* tmp2 += z2 */
-    add             v17.4s,   v17.4s,   v19.4s /* tmp3 += z1 */
-    add             v16.4s,   v16.4s,   v18.4s /* tmp3 += z1 */
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
 
-    add             v11.4s,   v11.4s,   v23.4s /* tmp0 += z3 */
-    add             v10.4s,   v10.4s,   v22.4s /* tmp0 += z3 */
-    add             v13.4s,   v13.4s,   v25.4s /* tmp1 += z4 */
-    add             v12.4s,   v12.4s,   v24.4s /* tmp1 += z4 */
-    add             v17.4s,   v17.4s,   v25.4s /* tmp3 += z4 */
-    add             v16.4s,   v16.4s,   v24.4s /* tmp3 += z4 */
-    add             v15.4s,   v15.4s,   v23.4s /* tmp2 += z3 */
-    add             v14.4s,   v14.4s,   v22.4s /* tmp2 += z3 */
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    add             v18.4s,    v2.4s,   v16.4s /* tmp10 + tmp3 */
-    add             v19.4s,   v28.4s,   v17.4s /* tmp10 + tmp3 */
-    sub             v20.4s,    v2.4s,   v16.4s /* tmp10 - tmp3 */
-    sub             v21.4s,   v28.4s,   v17.4s /* tmp10 - tmp3 */
-    add             v22.4s,    v8.4s,   v14.4s /* tmp11 + tmp2 */
-    add             v23.4s,   v29.4s,   v15.4s /* tmp11 + tmp2 */
-    sub             v24.4s,    v8.4s,   v14.4s /* tmp11 - tmp2 */
-    sub             v25.4s,   v29.4s,   v15.4s /* tmp11 - tmp2 */
-    add             v26.4s,    v4.4s,   v12.4s /* tmp12 + tmp1 */
-    add             v27.4s,   v30.4s,   v13.4s /* tmp12 + tmp1 */
-    sub             v28.4s,    v4.4s,   v12.4s /* tmp12 - tmp1 */
-    sub             v29.4s,   v30.4s,   v13.4s /* tmp12 - tmp1 */
-    add             v14.4s,    v6.4s,   v10.4s /* tmp13 + tmp0 */
-    add             v15.4s,   v31.4s,   v11.4s /* tmp13 + tmp0 */
-    sub             v16.4s,    v6.4s,   v10.4s /* tmp13 - tmp0 */
-    sub             v17.4s,   v31.4s,   v11.4s /* tmp13 - tmp0 */
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 
-    rshrn            v2.4h,   v18.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn            v3.4h,   v22.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn            v4.4h,   v26.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn            v5.4h,   v14.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn            v6.4h,   v19.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn            v7.4h,   v23.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn            v8.4h,   v27.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn            v9.4h,   v15.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2           v2.8h,   v16.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2           v3.8h,   v28.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2           v4.8h,   v24.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2           v5.8h,   v20.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn2           v6.8h,   v17.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2           v7.8h,   v29.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2           v8.8h,   v25.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2           v9.8h,   v21.4s,   #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    b                1b
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
@@ -787,10 +787,10 @@
 
 .balign 16
 Ljsimd_idct_ifast_neon_consts:
-    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
-    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
-    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
-    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
 
 asm_function jsimd_idct_ifast_neon
 
@@ -811,8 +811,8 @@
      * with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
-     *   0 | d16     | d17     ( v16.8h  )
-     *   1 | d18     | d19     ( v17.8h  )
+     *   0 | d16     | d17     ( v16.8h )
+     *   1 | d18     | d19     ( v17.8h )
      *   2 | d20     | d21     ( v18.8h )
      *   3 | d22     | d23     ( v19.8h )
      *   4 | d24     | d25     ( v20.8h )
@@ -825,9 +825,9 @@
     ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
     ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
-    mul             v16.8h,  v16.8h,  v0.8h
+    mul             v16.8h, v16.8h, v0.8h
     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v17.8h,  v17.8h,  v1.8h
+    mul             v17.8h, v17.8h, v1.8h
     ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
     mul             v18.8h, v18.8h, v2.8h
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
@@ -837,135 +837,135 @@
     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
     mul             v22.8h, v22.8h, v2.8h
     mul             v21.8h, v21.8h, v1.8h
-    ld1             {v0.4h}, [TMP5]      /* load constants */
+    ld1             {v0.4h}, [TMP5]        /* load constants */
     mul             v23.8h, v23.8h, v3.8h
 
     /* 1-D IDCT, pass 1 */
-    sub             v2.8h,    v18.8h,   v22.8h
-    add             v22.8h,   v18.8h,   v22.8h
-    sub             v1.8h,    v19.8h,   v21.8h
-    add             v21.8h,   v19.8h,   v21.8h
-    sub             v5.8h,    v17.8h,   v23.8h
-    add             v23.8h,   v17.8h,   v23.8h
-    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
-    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
-    add             v3.8h,    v1.8h,    v1.8h
-    sub             v1.8h,    v5.8h,    v1.8h
-    add             v18.8h,   v2.8h,    v4.8h
-    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
-    sub             v2.8h,    v23.8h,   v21.8h
-    add             v3.8h,    v3.8h,    v6.8h
-    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
-    add             v1.8h,    v1.8h,    v4.8h
-    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
-    sub             v18.8h,   v18.8h,   v22.8h
-    add             v2.8h,    v2.8h,    v6.8h
-    sub             v6.8h,    v16.8h,   v20.8h
-    add             v20.8h,   v16.8h,   v20.8h
-    add             v17.8h,   v5.8h,    v4.8h
-    add             v5.8h,    v6.8h,    v18.8h
-    sub             v18.8h,   v6.8h,    v18.8h
-    add             v6.8h,    v23.8h,   v21.8h
-    add             v16.8h,   v20.8h,   v22.8h
-    sub             v3.8h,    v6.8h,    v3.8h
-    sub             v20.8h,   v20.8h,   v22.8h
-    sub             v3.8h,    v3.8h,    v1.8h
-    sub             v1.8h,    v17.8h,   v1.8h
-    add             v2.8h,    v3.8h,    v2.8h
-    sub             v23.8h,   v16.8h,   v6.8h
-    add             v1.8h,    v1.8h,    v2.8h
-    add             v16.8h,   v16.8h,   v6.8h
-    add             v22.8h,   v5.8h,    v3.8h
-    sub             v17.8h,   v5.8h,    v3.8h
-    sub             v21.8h,   v18.8h,   v2.8h
-    add             v18.8h,   v18.8h,   v2.8h
-    sub             v19.8h,   v20.8h,   v1.8h
-    add             v20.8h,   v20.8h,   v1.8h
+    sub             v2.8h, v18.8h, v22.8h
+    add             v22.8h, v18.8h, v22.8h
+    sub             v1.8h, v19.8h, v21.8h
+    add             v21.8h, v19.8h, v21.8h
+    sub             v5.8h, v17.8h, v23.8h
+    add             v23.8h, v17.8h, v23.8h
+    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
+    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
+    add             v3.8h, v1.8h, v1.8h
+    sub             v1.8h, v5.8h, v1.8h
+    add             v18.8h, v2.8h, v4.8h
+    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
+    sub             v2.8h, v23.8h, v21.8h
+    add             v3.8h, v3.8h, v6.8h
+    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
+    add             v1.8h, v1.8h, v4.8h
+    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
+    sub             v18.8h, v18.8h, v22.8h
+    add             v2.8h, v2.8h, v6.8h
+    sub             v6.8h, v16.8h, v20.8h
+    add             v20.8h, v16.8h, v20.8h
+    add             v17.8h, v5.8h, v4.8h
+    add             v5.8h, v6.8h, v18.8h
+    sub             v18.8h, v6.8h, v18.8h
+    add             v6.8h, v23.8h, v21.8h
+    add             v16.8h, v20.8h, v22.8h
+    sub             v3.8h, v6.8h, v3.8h
+    sub             v20.8h, v20.8h, v22.8h
+    sub             v3.8h, v3.8h, v1.8h
+    sub             v1.8h, v17.8h, v1.8h
+    add             v2.8h, v3.8h, v2.8h
+    sub             v23.8h, v16.8h, v6.8h
+    add             v1.8h, v1.8h, v2.8h
+    add             v16.8h, v16.8h, v6.8h
+    add             v22.8h, v5.8h, v3.8h
+    sub             v17.8h, v5.8h, v3.8h
+    sub             v21.8h, v18.8h, v2.8h
+    add             v18.8h, v18.8h, v2.8h
+    sub             v19.8h, v20.8h, v1.8h
+    add             v20.8h, v20.8h, v1.8h
     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
     /* 1-D IDCT, pass 2 */
-    sub             v2.8h,    v18.8h,   v22.8h
-    add             v22.8h,   v18.8h,   v22.8h
-    sub             v1.8h,    v19.8h,   v21.8h
-    add             v21.8h,   v19.8h,   v21.8h
-    sub             v5.8h,    v17.8h,   v23.8h
-    add             v23.8h,   v17.8h,   v23.8h
-    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
-    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
-    add             v3.8h,    v1.8h,    v1.8h
-    sub             v1.8h,    v5.8h,    v1.8h
-    add             v18.8h,   v2.8h,    v4.8h
-    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
-    sub             v2.8h,    v23.8h,   v21.8h
-    add             v3.8h,    v3.8h,    v6.8h
-    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
-    add             v1.8h,    v1.8h,    v4.8h
-    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
-    sub             v18.8h,   v18.8h,   v22.8h
-    add             v2.8h,    v2.8h,    v6.8h
-    sub             v6.8h,    v16.8h,   v20.8h
-    add             v20.8h,   v16.8h,   v20.8h
-    add             v17.8h,   v5.8h,    v4.8h
-    add             v5.8h,    v6.8h,    v18.8h
-    sub             v18.8h,   v6.8h,    v18.8h
-    add             v6.8h,    v23.8h,   v21.8h
-    add             v16.8h,   v20.8h,   v22.8h
-    sub             v3.8h,    v6.8h,    v3.8h
-    sub             v20.8h,   v20.8h,   v22.8h
-    sub             v3.8h,    v3.8h,    v1.8h
-    sub             v1.8h,    v17.8h,   v1.8h
-    add             v2.8h,    v3.8h,    v2.8h
-    sub             v23.8h,   v16.8h,   v6.8h
-    add             v1.8h,    v1.8h,    v2.8h
-    add             v16.8h,   v16.8h,   v6.8h
-    add             v22.8h,   v5.8h,    v3.8h
-    sub             v17.8h,   v5.8h,    v3.8h
-    sub             v21.8h,   v18.8h,   v2.8h
-    add             v18.8h,   v18.8h,   v2.8h
-    sub             v19.8h,   v20.8h,   v1.8h
-    add             v20.8h,   v20.8h,   v1.8h
+    sub             v2.8h, v18.8h, v22.8h
+    add             v22.8h, v18.8h, v22.8h
+    sub             v1.8h, v19.8h, v21.8h
+    add             v21.8h, v19.8h, v21.8h
+    sub             v5.8h, v17.8h, v23.8h
+    add             v23.8h, v17.8h, v23.8h
+    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
+    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
+    add             v3.8h, v1.8h, v1.8h
+    sub             v1.8h, v5.8h, v1.8h
+    add             v18.8h, v2.8h, v4.8h
+    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
+    sub             v2.8h, v23.8h, v21.8h
+    add             v3.8h, v3.8h, v6.8h
+    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
+    add             v1.8h, v1.8h, v4.8h
+    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
+    sub             v18.8h, v18.8h, v22.8h
+    add             v2.8h, v2.8h, v6.8h
+    sub             v6.8h, v16.8h, v20.8h
+    add             v20.8h, v16.8h, v20.8h
+    add             v17.8h, v5.8h, v4.8h
+    add             v5.8h, v6.8h, v18.8h
+    sub             v18.8h, v6.8h, v18.8h
+    add             v6.8h, v23.8h, v21.8h
+    add             v16.8h, v20.8h, v22.8h
+    sub             v3.8h, v6.8h, v3.8h
+    sub             v20.8h, v20.8h, v22.8h
+    sub             v3.8h, v3.8h, v1.8h
+    sub             v1.8h, v17.8h, v1.8h
+    add             v2.8h, v3.8h, v2.8h
+    sub             v23.8h, v16.8h, v6.8h
+    add             v1.8h, v1.8h, v2.8h
+    add             v16.8h, v16.8h, v6.8h
+    add             v22.8h, v5.8h, v3.8h
+    sub             v17.8h, v5.8h, v3.8h
+    sub             v21.8h, v18.8h, v2.8h
+    add             v18.8h, v18.8h, v2.8h
+    sub             v19.8h, v20.8h, v1.8h
+    add             v20.8h, v20.8h, v1.8h
     /* Descale to 8-bit and range limit */
-    movi            v0.16b,   #0x80
+    movi            v0.16b, #0x80
       /* Prepare pointers (dual-issue with NEON instructions) */
-      ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    sqshrn          v28.8b,   v16.8h,   #5
-      ldp             TMP3,     TMP4,     [OUTPUT_BUF], 16
-    sqshrn          v29.8b,   v17.8h,   #5
-      add             TMP1,     TMP1,     OUTPUT_COL
-    sqshrn          v30.8b,   v18.8h,   #5
-      add             TMP2,     TMP2,     OUTPUT_COL
-    sqshrn          v31.8b,   v19.8h,   #5
-      add             TMP3,     TMP3,     OUTPUT_COL
-    sqshrn2         v28.16b,  v20.8h,   #5
-      add             TMP4,     TMP4,     OUTPUT_COL
-    sqshrn2         v29.16b,  v21.8h,   #5
-      ldp             TMP5,     TMP6,     [OUTPUT_BUF], 16
-    sqshrn2         v30.16b,  v22.8h,   #5
-      ldp             TMP7,     TMP8,     [OUTPUT_BUF], 16
-    sqshrn2         v31.16b,  v23.8h,   #5
-      add             TMP5,     TMP5,     OUTPUT_COL
-    add             v16.16b,  v28.16b,  v0.16b
-      add             TMP6,     TMP6,     OUTPUT_COL
-    add             v18.16b,  v29.16b,  v0.16b
-      add             TMP7,     TMP7,     OUTPUT_COL
-    add             v20.16b,  v30.16b,  v0.16b
-      add             TMP8,     TMP8,     OUTPUT_COL
-    add             v22.16b,  v31.16b,  v0.16b
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqshrn          v28.8b, v16.8h, #5
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqshrn          v29.8b, v17.8h, #5
+      add             TMP1, TMP1, OUTPUT_COL
+    sqshrn          v30.8b, v18.8h, #5
+      add             TMP2, TMP2, OUTPUT_COL
+    sqshrn          v31.8b, v19.8h, #5
+      add             TMP3, TMP3, OUTPUT_COL
+    sqshrn2         v28.16b, v20.8h, #5
+      add             TMP4, TMP4, OUTPUT_COL
+    sqshrn2         v29.16b, v21.8h, #5
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqshrn2         v30.16b, v22.8h, #5
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqshrn2         v31.16b, v23.8h, #5
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
 
     /* Transpose the final 8-bit samples */
-    trn1            v28.16b,  v16.16b,  v18.16b
-    trn1            v30.16b,  v20.16b,  v22.16b
-    trn2            v29.16b,  v16.16b,  v18.16b
-    trn2            v31.16b,  v20.16b,  v22.16b
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
 
-    trn1            v16.8h,   v28.8h,   v30.8h
-    trn2            v18.8h,   v28.8h,   v30.8h
-    trn1            v20.8h,   v29.8h,   v31.8h
-    trn2            v22.8h,   v29.8h,   v31.8h
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
 
-    uzp1            v28.4s,   v16.4s,   v18.4s
-    uzp2            v30.4s,   v16.4s,   v18.4s
-    uzp1            v29.4s,   v20.4s,   v22.4s
-    uzp2            v31.4s,   v20.4s,   v22.4s
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
 
     /* Store results to the output buffer */
     st1             {v28.d}[0], [TMP1]
@@ -1014,81 +1014,80 @@
 
 #define CONST_BITS  13
 
-#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
 
 .balign 16
 Ljsimd_idct_4x4_neon_consts:
-    .short     FIX_1_847759065     /* v0.h[0] */
-    .short     -FIX_0_765366865    /* v0.h[1] */
-    .short     -FIX_0_211164243    /* v0.h[2] */
-    .short     FIX_1_451774981     /* v0.h[3] */
-    .short     -FIX_2_172734803    /* d1[0] */
-    .short     FIX_1_061594337     /* d1[1] */
-    .short     -FIX_0_509795579    /* d1[2] */
-    .short     -FIX_0_601344887    /* d1[3] */
-    .short     FIX_0_899976223     /* v2.h[0] */
-    .short     FIX_2_562915447     /* v2.h[1] */
-    .short     1 << (CONST_BITS+1) /* v2.h[2] */
-    .short     0                   /* v2.h[3] */
+  .short FIX_1_847759065      /* v0.h[0] */
+  .short -FIX_0_765366865     /* v0.h[1] */
+  .short -FIX_0_211164243     /* v0.h[2] */
+  .short FIX_1_451774981      /* v0.h[3] */
+  .short -FIX_2_172734803     /* d1[0] */
+  .short FIX_1_061594337      /* d1[1] */
+  .short -FIX_0_509795579     /* d1[2] */
+  .short -FIX_0_601344887     /* d1[3] */
+  .short FIX_0_899976223      /* v2.h[0] */
+  .short FIX_2_562915447      /* v2.h[1] */
+  .short 1 << (CONST_BITS+1)  /* v2.h[2] */
+  .short 0                    /* v2.h[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4,    v2.h[2]
-    smlal           v28.4s, \x8,    v0.h[0]
-    smlal           v28.4s, \x14,   v0.h[1]
+    smull           v28.4s, \x4, v2.h[2]
+    smlal           v28.4s, \x8, v0.h[0]
+    smlal           v28.4s, \x14, v0.h[1]
 
-    smull           v26.4s, \x16,   v1.h[2]
-    smlal           v26.4s, \x12,   v1.h[3]
-    smlal           v26.4s, \x10,   v2.h[0]
-    smlal           v26.4s, \x6,    v2.h[1]
+    smull           v26.4s, \x16, v1.h[2]
+    smlal           v26.4s, \x12, v1.h[3]
+    smlal           v26.4s, \x10, v2.h[0]
+    smlal           v26.4s, \x6, v2.h[1]
 
-    smull           v30.4s, \x4,    v2.h[2]
-    smlsl           v30.4s, \x8,    v0.h[0]
-    smlsl           v30.4s, \x14,   v0.h[1]
+    smull           v30.4s, \x4, v2.h[2]
+    smlsl           v30.4s, \x8, v0.h[0]
+    smlsl           v30.4s, \x14, v0.h[1]
 
-    smull           v24.4s, \x16,   v0.h[2]
-    smlal           v24.4s, \x12,   v0.h[3]
-    smlal           v24.4s, \x10,   v1.h[0]
-    smlal           v24.4s, \x6,    v1.h[1]
+    smull           v24.4s, \x16, v0.h[2]
+    smlal           v24.4s, \x12, v0.h[3]
+    smlal           v24.4s, \x10, v1.h[0]
+    smlal           v24.4s, \x6, v1.h[1]
 
     add             v20.4s, v28.4s, v26.4s
     sub             v28.4s, v28.4s, v26.4s
 
-.if \shift > 16
+  .if \shift > 16
     srshr           v20.4s, v20.4s, #\shift
     srshr           v28.4s, v28.4s, #\shift
-    xtn             \y26,   v20.4s
-    xtn             \y29,   v28.4s
-.else
-    rshrn           \y26,   v20.4s, #\shift
-    rshrn           \y29,   v28.4s, #\shift
-.endif
+    xtn             \y26, v20.4s
+    xtn             \y29, v28.4s
+  .else
+    rshrn           \y26, v20.4s, #\shift
+    rshrn           \y29, v28.4s, #\shift
+  .endif
 
     add             v20.4s, v30.4s, v24.4s
     sub             v30.4s, v30.4s, v24.4s
 
-.if \shift > 16
+  .if \shift > 16
     srshr           v20.4s, v20.4s, #\shift
     srshr           v30.4s, v30.4s, #\shift
-    xtn             \y27,   v20.4s
-    xtn             \y28,   v30.4s
-.else
-    rshrn           \y27,   v20.4s, #\shift
-    rshrn           \y28,   v30.4s, #\shift
-.endif
-
+    xtn             \y27, v20.4s
+    xtn             \y28, v30.4s
+  .else
+    rshrn           \y27, v20.4s, #\shift
+    rshrn           \y28, v30.4s, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_4x4_neon
@@ -1138,39 +1137,43 @@
     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
     mul             v4.4h, v4.4h, v18.4h
     mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]    /* 128 bit q4 */
+    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
     ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
     mul             v6.4h, v6.4h, v20.4h
     mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]    /* 128 bit q6 */
+    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
     mul             v8.4h, v8.4h, v22.4h
     mul             v9.4h, v9.4h, v23.4h
-    ins             v8.d[1], v9.d[0]    /* 128 bit q8 */
+    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
     mul             v10.4h, v10.4h, v24.4h
     mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]  /* 128 bit q10 */
+    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
     mul             v12.4h, v12.4h, v26.4h
     mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]  /* 128 bit q12 */
+    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
     mul             v14.4h, v14.4h, v28.4h
     mul             v15.4h, v15.4h, v29.4h
-    ins             v14.d[1], v15.d[0]  /* 128 bit q14 */
+    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
     mul             v16.4h, v16.4h, v30.4h
     mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]  /* 128 bit q16 */
+    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
 
     /* Pass 1 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
+                    v4.4h, v6.4h, v8.4h, v10.4h
     transpose_4x4   v4, v6, v8, v10, v3
     ins             v10.d[1], v11.d[0]
-    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
+    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
+                    v5.4h, v7.4h, v9.4h, v11.4h
     transpose_4x4   v5, v7, v9, v11, v3
     ins             v10.d[1], v11.d[0]
+
     /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
+                    v26.4h, v27.4h, v28.4h, v29.4h
     transpose_4x4   v26, v27, v28, v29, v3
 
     /* Range limit */
@@ -1261,31 +1264,30 @@
 
 .balign 8
 Ljsimd_idct_2x2_neon_consts:
-    .short     -FIX_0_720959822    /* v14[0] */
-    .short     FIX_0_850430095     /* v14[1] */
-    .short     -FIX_1_272758580    /* v14[2] */
-    .short     FIX_3_624509785     /* v14[3] */
+  .short -FIX_0_720959822  /* v14[0] */
+  .short FIX_0_850430095   /* v14[1] */
+  .short -FIX_1_272758580  /* v14[2] */
+  .short FIX_3_624509785   /* v14[3] */
 
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll      v15.4s, \x4,    #15
-    smull      v26.4s, \x6,    v14.h[3]
-    smlal      v26.4s, \x10,   v14.h[2]
-    smlal      v26.4s, \x12,   v14.h[1]
-    smlal      v26.4s, \x16,   v14.h[0]
+    sshll           v15.4s, \x4, #15
+    smull           v26.4s, \x6, v14.h[3]
+    smlal           v26.4s, \x10, v14.h[2]
+    smlal           v26.4s, \x12, v14.h[1]
+    smlal           v26.4s, \x16, v14.h[0]
 
-    add        v20.4s, v15.4s, v26.4s
-    sub        v15.4s, v15.4s, v26.4s
+    add             v20.4s, v15.4s, v26.4s
+    sub             v15.4s, v15.4s, v26.4s
 
-.if \shift > 16
-    srshr      v20.4s, v20.4s, #\shift
-    srshr      v15.4s, v15.4s, #\shift
-    xtn        \y26,   v20.4s
-    xtn        \y27,   v15.4s
-.else
-    rshrn      \y26,   v20.4s, #\shift
-    rshrn      \y27,   v15.4s, #\shift
-.endif
-
+  .if \shift > 16
+    srshr           v20.4s, v20.4s, #\shift
+    srshr           v15.4s, v15.4s, #\shift
+    xtn             \y26, v20.4s
+    xtn             \y27, v15.4s
+  .else
+    rshrn           \y26, v20.4s, #\shift
+    rshrn           \y27, v15.4s, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_2x2_neon
@@ -1358,28 +1360,28 @@
     /* Pass 1 */
 #if 0
     idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
-    transpose_4x4   v4.4h, v6.4h, v8.4h,  v10.4h
+    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
     idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
-    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
+    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
 #else
-    smull           v26.4s, v6.4h,  v14.h[3]
+    smull           v26.4s, v6.4h, v14.h[3]
     smlal           v26.4s, v10.4h, v14.h[2]
     smlal           v26.4s, v12.4h, v14.h[1]
     smlal           v26.4s, v16.4h, v14.h[0]
-    smull           v24.4s, v7.4h,  v14.h[3]
+    smull           v24.4s, v7.4h, v14.h[3]
     smlal           v24.4s, v11.4h, v14.h[2]
     smlal           v24.4s, v13.4h, v14.h[1]
     smlal           v24.4s, v17.4h, v14.h[0]
-    sshll           v15.4s, v4.4h,  #15
-    sshll           v30.4s, v5.4h,  #15
+    sshll           v15.4s, v4.4h, #15
+    sshll           v30.4s, v5.4h, #15
     add             v20.4s, v15.4s, v26.4s
     sub             v15.4s, v15.4s, v26.4s
-    rshrn           v4.4h,  v20.4s, #13
-    rshrn           v6.4h,  v15.4s, #13
+    rshrn           v4.4h, v20.4s, #13
+    rshrn           v6.4h, v15.4s, #13
     add             v20.4s, v30.4s, v24.4s
     sub             v15.4s, v30.4s, v24.4s
-    rshrn           v5.4h,  v20.4s, #13
-    rshrn           v7.4h,  v15.4s, #13
+    rshrn           v5.4h, v20.4s, #13
+    rshrn           v7.4h, v15.4s, #13
     ins             v4.d[1], v5.d[0]
     ins             v6.d[1], v7.d[0]
     transpose       v4, v6, v3, .16b, .8h
@@ -1450,125 +1452,125 @@
 #endif
 
 .macro do_load size
-    .if \size == 8
-        ld1  {v4.8b}, [U], 8
-        ld1  {v5.8b}, [V], 8
-        ld1  {v0.8b}, [Y], 8
-        prfm pldl1keep, [U, #64]
-        prfm pldl1keep, [V, #64]
-        prfm pldl1keep, [Y, #64]
-    .elseif \size == 4
-        ld1  {v4.b}[0], [U], 1
-        ld1  {v4.b}[1], [U], 1
-        ld1  {v4.b}[2], [U], 1
-        ld1  {v4.b}[3], [U], 1
-        ld1  {v5.b}[0], [V], 1
-        ld1  {v5.b}[1], [V], 1
-        ld1  {v5.b}[2], [V], 1
-        ld1  {v5.b}[3], [V], 1
-        ld1  {v0.b}[0], [Y], 1
-        ld1  {v0.b}[1], [Y], 1
-        ld1  {v0.b}[2], [Y], 1
-        ld1  {v0.b}[3], [Y], 1
-    .elseif \size == 2
-        ld1  {v4.b}[4], [U], 1
-        ld1  {v4.b}[5], [U], 1
-        ld1  {v5.b}[4], [V], 1
-        ld1  {v5.b}[5], [V], 1
-        ld1  {v0.b}[4], [Y], 1
-        ld1  {v0.b}[5], [Y], 1
-    .elseif \size == 1
-        ld1  {v4.b}[6], [U], 1
-        ld1  {v5.b}[6], [V], 1
-        ld1  {v0.b}[6], [Y], 1
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    ld1             {v4.8b}, [U], 8
+    ld1             {v5.8b}, [V], 8
+    ld1             {v0.8b}, [Y], 8
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+  .elseif \size == 4
+    ld1             {v4.b}[0], [U], 1
+    ld1             {v4.b}[1], [U], 1
+    ld1             {v4.b}[2], [U], 1
+    ld1             {v4.b}[3], [U], 1
+    ld1             {v5.b}[0], [V], 1
+    ld1             {v5.b}[1], [V], 1
+    ld1             {v5.b}[2], [V], 1
+    ld1             {v5.b}[3], [V], 1
+    ld1             {v0.b}[0], [Y], 1
+    ld1             {v0.b}[1], [Y], 1
+    ld1             {v0.b}[2], [Y], 1
+    ld1             {v0.b}[3], [Y], 1
+  .elseif \size == 2
+    ld1             {v4.b}[4], [U], 1
+    ld1             {v4.b}[5], [U], 1
+    ld1             {v5.b}[4], [V], 1
+    ld1             {v5.b}[5], [V], 1
+    ld1             {v0.b}[4], [Y], 1
+    ld1             {v0.b}[5], [Y], 1
+  .elseif \size == 1
+    ld1             {v4.b}[6], [U], 1
+    ld1             {v5.b}[6], [V], 1
+    ld1             {v0.b}[6], [Y], 1
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 .macro do_store bpp, size
-    .if \bpp == 24
-        .if \size == 8
+  .if \bpp == 24
+    .if \size == 8
 #ifdef ST3_IS_FAST
-            st3  {v10.8b, v11.8b, v12.8b}, [RGB], 24
+      st3           {v10.8b, v11.8b, v12.8b}, [RGB], 24
 #else
-            st1  {v10.b}[0], [RGB], #1
-            st1  {v11.b}[0], [RGB], #1
-            st1  {v12.b}[0], [RGB], #1
+      st1           {v10.b}[0], [RGB], #1
+      st1           {v11.b}[0], [RGB], #1
+      st1           {v12.b}[0], [RGB], #1
 
-            st1  {v10.b}[1], [RGB], #1
-            st1  {v11.b}[1], [RGB], #1
-            st1  {v12.b}[1], [RGB], #1
+      st1           {v10.b}[1], [RGB], #1
+      st1           {v11.b}[1], [RGB], #1
+      st1           {v12.b}[1], [RGB], #1
 
-            st1  {v10.b}[2], [RGB], #1
-            st1  {v11.b}[2], [RGB], #1
-            st1  {v12.b}[2], [RGB], #1
+      st1           {v10.b}[2], [RGB], #1
+      st1           {v11.b}[2], [RGB], #1
+      st1           {v12.b}[2], [RGB], #1
 
-            st1  {v10.b}[3], [RGB], #1
-            st1  {v11.b}[3], [RGB], #1
-            st1  {v12.b}[3], [RGB], #1
+      st1           {v10.b}[3], [RGB], #1
+      st1           {v11.b}[3], [RGB], #1
+      st1           {v12.b}[3], [RGB], #1
 
-            st1  {v10.b}[4], [RGB], #1
-            st1  {v11.b}[4], [RGB], #1
-            st1  {v12.b}[4], [RGB], #1
+      st1           {v10.b}[4], [RGB], #1
+      st1           {v11.b}[4], [RGB], #1
+      st1           {v12.b}[4], [RGB], #1
 
-            st1  {v10.b}[5], [RGB], #1
-            st1  {v11.b}[5], [RGB], #1
-            st1  {v12.b}[5], [RGB], #1
+      st1           {v10.b}[5], [RGB], #1
+      st1           {v11.b}[5], [RGB], #1
+      st1           {v12.b}[5], [RGB], #1
 
-            st1  {v10.b}[6], [RGB], #1
-            st1  {v11.b}[6], [RGB], #1
-            st1  {v12.b}[6], [RGB], #1
+      st1           {v10.b}[6], [RGB], #1
+      st1           {v11.b}[6], [RGB], #1
+      st1           {v12.b}[6], [RGB], #1
 
-            st1  {v10.b}[7], [RGB], #1
-            st1  {v11.b}[7], [RGB], #1
-            st1  {v12.b}[7], [RGB], #1
+      st1           {v10.b}[7], [RGB], #1
+      st1           {v11.b}[7], [RGB], #1
+      st1           {v12.b}[7], [RGB], #1
 #endif
-        .elseif \size == 4
-            st3  {v10.b, v11.b, v12.b}[0], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[1], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[2], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[3], [RGB], 3
-        .elseif \size == 2
-            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[5], [RGB], 3
-        .elseif \size == 1
-            st3  {v10.b, v11.b, v12.b}[6], [RGB], 3
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            st4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
-        .elseif \size == 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
-        .elseif \size == 2
-            st4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
-        .elseif \size == 1
-            st4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp==16
-        .if \size == 8
-            st1  {v25.8h}, [RGB],16
-        .elseif \size == 4
-            st1  {v25.4h}, [RGB],8
-        .elseif \size == 2
-            st1  {v25.h}[4], [RGB],2
-            st1  {v25.h}[5], [RGB],2
-        .elseif \size == 1
-            st1  {v25.h}[6], [RGB],2
-        .else
-            .error unsupported macroblock size
-        .endif
-     .else
-        .error unsupported bpp
+    .elseif \size == 4
+      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
+    .elseif \size == 2
+      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
+    .elseif \size == 1
+      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
+    .else
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+    .elseif \size == 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
+    .elseif \size == 2
+      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
+    .elseif \size == 1
+      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 16
+    .if \size == 8
+      st1           {v25.8h}, [RGB], 16
+    .elseif \size == 4
+      st1           {v25.4h}, [RGB], 8
+    .elseif \size == 2
+      st1           {v25.h}[4], [RGB], 2
+      st1           {v25.h}[5], [RGB], 2
+    .elseif \size == 1
+      st1           {v25.h}[6], [RGB], 2
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
@@ -1578,92 +1580,91 @@
  */
 
 .macro do_yuv_to_rgb_stage1
-    uaddw        v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb_stage2
-    rshrn        v20.4h, v20.4s, #15
-    rshrn2       v20.8h, v22.4s, #15
-    rshrn        v24.4h, v24.4s, #14
-    rshrn2       v24.8h, v26.4s, #14
-    rshrn        v28.4h, v28.4s, #14
-    rshrn2       v28.8h, v30.4s, #14
-    uaddw        v20.8h, v20.8h, v0.8b
-    uaddw        v24.8h, v24.8h, v0.8b
-    uaddw        v28.8h, v28.8h, v0.8b
-.if \bpp != 16
-    sqxtun       v1\g_offs\defsize, v20.8h
-    sqxtun       v1\r_offs\defsize, v24.8h
-    sqxtun       v1\b_offs\defsize, v28.8h
-.else
-    sqshlu       v21.8h, v20.8h, #8
-    sqshlu       v25.8h, v24.8h, #8
-    sqshlu       v29.8h, v28.8h, #8
-    sri          v25.8h, v21.8h, #5
-    sri          v25.8h, v29.8h, #11
-.endif
-
+    rshrn           v20.4h, v20.4s, #15
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16
+    sqxtun          v1\g_offs\defsize, v20.8h
+    sqxtun          v1\r_offs\defsize, v24.8h
+    sqxtun          v1\b_offs\defsize, v28.8h
+  .else
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    sri             v25.8h, v21.8h, #5
+    sri             v25.8h, v29.8h, #11
+  .endif
 .endm
 
 .macro do_yuv_to_rgb_stage2_store_load_stage1
-    rshrn        v20.4h, v20.4s, #15
-    rshrn        v24.4h, v24.4s, #14
-    rshrn        v28.4h, v28.4s, #14
-    ld1          {v4.8b}, [U], 8
-    rshrn2       v20.8h, v22.4s, #15
-    rshrn2       v24.8h, v26.4s, #14
-    rshrn2       v28.8h, v30.4s, #14
-    ld1          {v5.8b}, [V], 8
-    uaddw        v20.8h, v20.8h, v0.8b
-    uaddw        v24.8h, v24.8h, v0.8b
-    uaddw        v28.8h, v28.8h, v0.8b
-.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
-    sqxtun       v1\g_offs\defsize, v20.8h
-    ld1          {v0.8b}, [Y], 8
-    sqxtun       v1\r_offs\defsize, v24.8h
-    prfm         pldl1keep, [U, #64]
-    prfm         pldl1keep, [V, #64]
-    prfm         pldl1keep, [Y, #64]
-    sqxtun       v1\b_offs\defsize, v28.8h
-    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-.else /**************************** rgb565 ***********************************/
-    sqshlu       v21.8h, v20.8h, #8
-    sqshlu       v25.8h, v24.8h, #8
-    sqshlu       v29.8h, v28.8h, #8
-    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    ld1          {v0.8b}, [Y], 8
-    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
-    sri          v25.8h, v21.8h, #5
-    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
-    prfm         pldl1keep, [U, #64]
-    prfm         pldl1keep, [V, #64]
-    prfm         pldl1keep, [Y, #64]
-    sri          v25.8h, v29.8h, #11
-.endif
-    do_store     \bpp, 8
-    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+    rshrn           v20.4h, v20.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    ld1             {v4.8b}, [U], 8
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    ld1             {v5.8b}, [V], 8
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
+    sqxtun          v1\g_offs\defsize, v20.8h
+    ld1             {v0.8b}, [Y], 8
+    sqxtun          v1\r_offs\defsize, v24.8h
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sqxtun          v1\b_offs\defsize, v28.8h
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+  .else  /**************************** rgb565 ********************************/
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    ld1             {v0.8b}, [Y], 8
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    sri             v25.8h, v21.8h, #5
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sri             v25.8h, v29.8h, #11
+  .endif
+    do_store        \bpp, 8
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb
@@ -1677,10 +1678,10 @@
 
 .balign 16
 Ljsimd_ycc_\colorid\()_neon_consts:
-    .short          0,      0,     0,      0
-    .short          22971, -11277, -23401, 29033
-    .short          -128,  -128,   -128,   -128
-    .short          -128,  -128,   -128,   -128
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
 
 asm_function jsimd_ycc_\colorid\()_convert_neon
     OUTPUT_WIDTH    .req x0
@@ -1701,8 +1702,10 @@
 
     sub             sp, sp, 336
     str             x15, [sp], 16
+
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
     adr             x15, Ljsimd_ycc_\colorid\()_neon_consts
+
     /* Save NEON registers */
     st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
     st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
@@ -1821,19 +1824,22 @@
 .purgem do_yuv_to_rgb_stage1
 .purgem do_yuv_to_rgb_stage2
 .purgem do_yuv_to_rgb_stage2_store_load_stage1
+
 .endm
 
-/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
-generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b
+/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize */
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b
+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b
+
 .purgem do_load
 .purgem do_store
 
+
 /*****************************************************************************/
 
 /*
@@ -1848,37 +1854,37 @@
  */
 
 .macro do_store size
-    .if \size == 8
-        st1  {v20.8b}, [Y], #8
-        st1  {v21.8b}, [U], #8
-        st1  {v22.8b}, [V], #8
-    .elseif \size == 4
-        st1  {v20.b}[0], [Y], #1
-        st1  {v20.b}[1], [Y], #1
-        st1  {v20.b}[2], [Y], #1
-        st1  {v20.b}[3], [Y], #1
-        st1  {v21.b}[0], [U], #1
-        st1  {v21.b}[1], [U], #1
-        st1  {v21.b}[2], [U], #1
-        st1  {v21.b}[3], [U], #1
-        st1  {v22.b}[0], [V], #1
-        st1  {v22.b}[1], [V], #1
-        st1  {v22.b}[2], [V], #1
-        st1  {v22.b}[3], [V], #1
-    .elseif \size == 2
-        st1  {v20.b}[4], [Y], #1
-        st1  {v20.b}[5], [Y], #1
-        st1  {v21.b}[4], [U], #1
-        st1  {v21.b}[5], [U], #1
-        st1  {v22.b}[4], [V], #1
-        st1  {v22.b}[5], [V], #1
-    .elseif \size == 1
-        st1  {v20.b}[6], [Y], #1
-        st1  {v21.b}[6], [U], #1
-        st1  {v22.b}[6], [V], #1
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+  .elseif \size == 4
+    st1             {v20.b}[0], [Y], #1
+    st1             {v20.b}[1], [Y], #1
+    st1             {v20.b}[2], [Y], #1
+    st1             {v20.b}[3], [Y], #1
+    st1             {v21.b}[0], [U], #1
+    st1             {v21.b}[1], [U], #1
+    st1             {v21.b}[2], [U], #1
+    st1             {v21.b}[3], [U], #1
+    st1             {v22.b}[0], [V], #1
+    st1             {v22.b}[1], [V], #1
+    st1             {v22.b}[2], [V], #1
+    st1             {v22.b}[3], [V], #1
+  .elseif \size == 2
+    st1             {v20.b}[4], [Y], #1
+    st1             {v20.b}[5], [Y], #1
+    st1             {v21.b}[4], [U], #1
+    st1             {v21.b}[5], [U], #1
+    st1             {v22.b}[4], [V], #1
+    st1             {v22.b}[5], [V], #1
+  .elseif \size == 1
+    st1             {v20.b}[6], [Y], #1
+    st1             {v21.b}[6], [U], #1
+    st1             {v22.b}[6], [V], #1
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 #if defined(__APPLE__) || defined(__ANDROID__)
@@ -1888,77 +1894,77 @@
 #endif
 
 .macro do_load bpp, size
-    .if \bpp == 24
-        .if \size == 8
+  .if \bpp == 24
+    .if \size == 8
 #ifdef LD3_IS_FAST
-            ld3  {v10.8b, v11.8b, v12.8b}, [RGB], #24
+      ld3           {v10.8b, v11.8b, v12.8b}, [RGB], #24
 #else
-            ld1  {v10.b}[0], [RGB], #1
-            ld1  {v11.b}[0], [RGB], #1
-            ld1  {v12.b}[0], [RGB], #1
+      ld1           {v10.b}[0], [RGB], #1
+      ld1           {v11.b}[0], [RGB], #1
+      ld1           {v12.b}[0], [RGB], #1
 
-            ld1  {v10.b}[1], [RGB], #1
-            ld1  {v11.b}[1], [RGB], #1
-            ld1  {v12.b}[1], [RGB], #1
+      ld1           {v10.b}[1], [RGB], #1
+      ld1           {v11.b}[1], [RGB], #1
+      ld1           {v12.b}[1], [RGB], #1
 
-            ld1  {v10.b}[2], [RGB], #1
-            ld1  {v11.b}[2], [RGB], #1
-            ld1  {v12.b}[2], [RGB], #1
+      ld1           {v10.b}[2], [RGB], #1
+      ld1           {v11.b}[2], [RGB], #1
+      ld1           {v12.b}[2], [RGB], #1
 
-            ld1  {v10.b}[3], [RGB], #1
-            ld1  {v11.b}[3], [RGB], #1
-            ld1  {v12.b}[3], [RGB], #1
+      ld1           {v10.b}[3], [RGB], #1
+      ld1           {v11.b}[3], [RGB], #1
+      ld1           {v12.b}[3], [RGB], #1
 
-            ld1  {v10.b}[4], [RGB], #1
-            ld1  {v11.b}[4], [RGB], #1
-            ld1  {v12.b}[4], [RGB], #1
+      ld1           {v10.b}[4], [RGB], #1
+      ld1           {v11.b}[4], [RGB], #1
+      ld1           {v12.b}[4], [RGB], #1
 
-            ld1  {v10.b}[5], [RGB], #1
-            ld1  {v11.b}[5], [RGB], #1
-            ld1  {v12.b}[5], [RGB], #1
+      ld1           {v10.b}[5], [RGB], #1
+      ld1           {v11.b}[5], [RGB], #1
+      ld1           {v12.b}[5], [RGB], #1
 
-            ld1  {v10.b}[6], [RGB], #1
-            ld1  {v11.b}[6], [RGB], #1
-            ld1  {v12.b}[6], [RGB], #1
+      ld1           {v10.b}[6], [RGB], #1
+      ld1           {v11.b}[6], [RGB], #1
+      ld1           {v12.b}[6], [RGB], #1
 
-            ld1  {v10.b}[7], [RGB], #1
-            ld1  {v11.b}[7], [RGB], #1
-            ld1  {v12.b}[7], [RGB], #1
+      ld1           {v10.b}[7], [RGB], #1
+      ld1           {v11.b}[7], [RGB], #1
+      ld1           {v12.b}[7], [RGB], #1
 #endif
-            prfm pldl1keep, [RGB, #128]
-        .elseif \size == 4
-            ld3  {v10.b, v11.b, v12.b}[0], [RGB], #3
-            ld3  {v10.b, v11.b, v12.b}[1], [RGB], #3
-            ld3  {v10.b, v11.b, v12.b}[2], [RGB], #3
-            ld3  {v10.b, v11.b, v12.b}[3], [RGB], #3
-        .elseif \size == 2
-            ld3  {v10.b, v11.b, v12.b}[4], [RGB], #3
-            ld3  {v10.b, v11.b, v12.b}[5], [RGB], #3
-        .elseif \size == 1
-            ld3  {v10.b, v11.b, v12.b}[6], [RGB], #3
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            ld4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
-            prfm pldl1keep, [RGB, #128]
-        .elseif \size == 4
-            ld4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
-            ld4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
-            ld4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
-            ld4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
-        .elseif \size == 2
-            ld4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
-            ld4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
-        .elseif \size == 1
-            ld4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
-        .else
-            .error unsupported macroblock size
-        .endif
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4
+      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
+    .elseif \size == 2
+      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
+    .elseif \size == 1
+      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
     .else
-        .error unsupported bpp
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
+    .elseif \size == 2
+      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
+    .elseif \size == 1
+      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
@@ -1968,43 +1974,43 @@
  */
 
 .macro do_rgb_to_yuv_stage1
-    ushll       v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
-    ushll       v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
-    ushll       v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
-    rev64       v18.4s, v1.4s
-    rev64       v26.4s, v1.4s
-    rev64       v28.4s, v1.4s
-    rev64       v30.4s, v1.4s
-    umull       v14.4s, v4.4h, v0.h[0]
-    umull2      v16.4s, v4.8h, v0.h[0]
-    umlsl       v18.4s, v4.4h, v0.h[3]
-    umlsl2      v26.4s, v4.8h, v0.h[3]
-    umlal       v28.4s, v4.4h, v0.h[5]
-    umlal2      v30.4s, v4.8h, v0.h[5]
-    umlal       v14.4s, v6.4h, v0.h[1]
-    umlal2      v16.4s, v6.8h, v0.h[1]
-    umlsl       v18.4s, v6.4h, v0.h[4]
-    umlsl2      v26.4s, v6.8h, v0.h[4]
-    umlsl       v28.4s, v6.4h, v0.h[6]
-    umlsl2      v30.4s, v6.8h, v0.h[6]
-    umlal       v14.4s, v8.4h, v0.h[2]
-    umlal2      v16.4s, v8.8h, v0.h[2]
-    umlal       v18.4s, v8.4h, v0.h[5]
-    umlal2      v26.4s, v8.8h, v0.h[5]
-    umlsl       v28.4s, v8.4h, v0.h[7]
-    umlsl2      v30.4s, v8.8h, v0.h[7]
+    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
+    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
+    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
+    rev64           v18.4s, v1.4s
+    rev64           v26.4s, v1.4s
+    rev64           v28.4s, v1.4s
+    rev64           v30.4s, v1.4s
+    umull           v14.4s, v4.4h, v0.h[0]
+    umull2          v16.4s, v4.8h, v0.h[0]
+    umlsl           v18.4s, v4.4h, v0.h[3]
+    umlsl2          v26.4s, v4.8h, v0.h[3]
+    umlal           v28.4s, v4.4h, v0.h[5]
+    umlal2          v30.4s, v4.8h, v0.h[5]
+    umlal           v14.4s, v6.4h, v0.h[1]
+    umlal2          v16.4s, v6.8h, v0.h[1]
+    umlsl           v18.4s, v6.4h, v0.h[4]
+    umlsl2          v26.4s, v6.8h, v0.h[4]
+    umlsl           v28.4s, v6.4h, v0.h[6]
+    umlsl2          v30.4s, v6.8h, v0.h[6]
+    umlal           v14.4s, v8.4h, v0.h[2]
+    umlal2          v16.4s, v8.8h, v0.h[2]
+    umlal           v18.4s, v8.4h, v0.h[5]
+    umlal2          v26.4s, v8.8h, v0.h[5]
+    umlsl           v28.4s, v8.4h, v0.h[7]
+    umlsl2          v30.4s, v8.8h, v0.h[7]
 .endm
 
 .macro do_rgb_to_yuv_stage2
-    rshrn       v20.4h, v14.4s, #16
-    shrn        v22.4h, v18.4s, #16
-    shrn        v24.4h, v28.4s, #16
-    rshrn2      v20.8h, v16.4s, #16
-    shrn2       v22.8h, v26.4s, #16
-    shrn2       v24.8h, v30.4s, #16
-    xtn         v20.8b, v20.8h      /* v20 = y */
-    xtn         v21.8b, v22.8h      /* v21 = u */
-    xtn         v22.8b, v24.8h      /* v22 = v */
+    rshrn           v20.4h, v14.4s, #16
+    shrn            v22.4h, v18.4s, #16
+    shrn            v24.4h, v28.4s, #16
+    rshrn2          v20.8h, v16.4s, #16
+    shrn2           v22.8h, v26.4s, #16
+    shrn2           v24.8h, v30.4s, #16
+    xtn             v20.8b, v20.8h       /* v20 = y */
+    xtn             v21.8b, v22.8h       /* v21 = u */
+    xtn             v22.8b, v24.8h       /* v22 = v */
 .endm
 
 .macro do_rgb_to_yuv
@@ -2016,19 +2022,19 @@
  *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
 .macro do_rgb_to_yuv_stage2_store_load_stage1
     do_rgb_to_yuv_stage2
-    do_load     \bpp, 8
-    st1         {v20.8b}, [Y], #8
-    st1         {v21.8b}, [U], #8
-    st1         {v22.8b}, [V], #8
+    do_load         \bpp, 8
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
     do_rgb_to_yuv_stage1
 .endm
 
 .balign 16
 Ljsimd_\colorid\()_ycc_neon_consts:
-    .short          19595, 38470, 7471,  11059
-    .short          21709, 32768, 27439, 5329
-    .short          32767, 128,   32767, 128
-    .short          32767, 128,   32767, 128
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
 
 asm_function jsimd_\colorid\()_ycc_convert_neon
     OUTPUT_WIDTH    .req w0
@@ -2148,6 +2154,7 @@
 .purgem do_load
 .purgem do_store
 
+
 /*****************************************************************************/
 
 /*
@@ -2171,7 +2178,6 @@
     TMP8            .req x4
     TMPDUP          .req w3
 
-
     mov             TMPDUP, #128
     ldp             TMP1, TMP2, [SAMPLE_DATA], 16
     ldp             TMP3, TMP4, [SAMPLE_DATA], 16
@@ -2234,43 +2240,43 @@
  *       rid of a bunch of VLD1.16 instructions
  */
 
-#define CONST_BITS      13
-#define PASS1_BITS      2
+#define CONST_BITS 13
+#define PASS1_BITS 2
 
-#define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-#define DESCALE_P2      (CONST_BITS+PASS1_BITS)
+#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
 
-#define F_0_298      2446           /* FIX(0.298631336) */
-#define F_0_390      3196           /* FIX(0.390180644) */
-#define F_0_541      4433           /* FIX(0.541196100) */
-#define F_0_765      6270           /* FIX(0.765366865) */
-#define F_0_899      7373           /* FIX(0.899976223) */
-#define F_1_175      9633           /* FIX(1.175875602) */
-#define F_1_501     12299           /* FIX(1.501321110) */
-#define F_1_847     15137           /* FIX(1.847759065) */
-#define F_1_961     16069           /* FIX(1.961570560) */
-#define F_2_053     16819           /* FIX(2.053119869) */
-#define F_2_562     20995           /* FIX(2.562915447) */
-#define F_3_072     25172           /* FIX(3.072711026) */
+#define F_0_298  2446  /* FIX(0.298631336) */
+#define F_0_390  3196  /* FIX(0.390180644) */
+#define F_0_541  4433  /* FIX(0.541196100) */
+#define F_0_765  6270  /* FIX(0.765366865) */
+#define F_0_899  7373  /* FIX(0.899976223) */
+#define F_1_175  9633  /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
 
 .balign 16
 Ljsimd_fdct_islow_neon_consts:
-    .short F_0_298
-    .short -F_0_390
-    .short F_0_541
-    .short F_0_765
-    .short - F_0_899
-    .short F_1_175
-    .short F_1_501
-    .short - F_1_847
-    .short - F_1_961
-    .short F_2_053
-    .short - F_2_562
-    .short F_3_072
-    .short 0  /* padding */
-    .short 0
-    .short 0
-    .short 0
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short -F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short -F_1_847
+  .short -F_1_961
+  .short F_2_053
+  .short -F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
 
 #undef F_0_298
 #undef F_0_390
@@ -2299,17 +2305,17 @@
 
 asm_function jsimd_fdct_islow_neon
 
-    DATA    .req x0
-    TMP     .req x9
+    DATA            .req x0
+    TMP             .req x9
 
     /* Load constants */
-    adr     TMP, Ljsimd_fdct_islow_neon_consts
-    ld1     {v0.8h, v1.8h}, [TMP]
+    adr             TMP, Ljsimd_fdct_islow_neon_consts
+    ld1             {v0.8h, v1.8h}, [TMP]
 
     /* Save NEON registers */
-    sub     sp, sp, #64
-    st1     {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1     {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    sub             sp, sp, #64
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
 
     /* Load all DATA into NEON registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
@@ -2324,225 +2330,219 @@
      *   7 | d30     | d31    | v23.8h
      */
 
-    ld1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    ld1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-    sub     DATA, DATA, #64
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    sub             DATA, DATA, #64
 
     /* Transpose */
-    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
     /* 1-D FDCT */
-    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
-    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
-    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
-    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
-    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
-    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
-    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
-    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
 
     /* even part */
 
-    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
-    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
-    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
-    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
 
-    add v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
-    sub v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
 
-    add v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
 
-    shl v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
-    shl v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
 
-    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    smull  v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    mov    v22.16b, v18.16b
-    mov    v25.16b, v24.16b
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
 
-    smlal  v18.4s,  v9.4h, XFIX_P_0_765  /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal2 v24.4s,  v9.8h, XFIX_P_0_765  /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal  v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
 
-    rshrn  v18.4h, v18.4s, #DESCALE_P1
-    rshrn  v22.4h, v22.4s, #DESCALE_P1
-    rshrn2 v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765),
-                                                                          CONST_BITS-PASS1_BITS); */
-    rshrn2 v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847),
-                                                                          CONST_BITS-PASS1_BITS); */
+    rshrn           v18.4h, v18.4s, #DESCALE_P1
+    rshrn           v22.4h, v22.4s, #DESCALE_P1
+    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
 
     /* Odd part */
 
-    add  v8.8h, v28.8h, v31.8h  /* z1 = tmp4 + tmp7; */
-    add  v9.8h, v29.8h, v30.8h  /* z2 = tmp5 + tmp6; */
-    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
-    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
-    smull  v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
-    smull2 v5.4s, v10.8h, XFIX_P_1_175
-    smlal  v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
-    smlal2 v5.4s, v11.8h, XFIX_P_1_175
+    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
 
-    smull2 v24.4s, v28.8h, XFIX_P_0_298
-    smull2 v25.4s, v29.8h, XFIX_P_2_053
-    smull2 v26.4s, v30.8h, XFIX_P_3_072
-    smull2 v27.4s, v31.8h, XFIX_P_1_501
-    smull  v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
-    smull  v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
-    smull  v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
-    smull  v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
 
-    smull2 v12.4s,  v8.8h, XFIX_N_0_899
-    smull2 v13.4s,  v9.8h, XFIX_N_2_562
-    smull2 v14.4s, v10.8h, XFIX_N_1_961
-    smull2 v15.4s, v11.8h, XFIX_N_0_390
-    smull   v8.4s,  v8.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
-    smull   v9.4s,  v9.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
-    smull  v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
-    smull  v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
 
-    add v10.4s, v10.4s, v4.4s  /* z3 += z5 */
-    add v14.4s, v14.4s, v5.4s
-    add v11.4s, v11.4s, v4.4s  /* z4 += z5 */
-    add v15.4s, v15.4s, v5.4s
+    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add             v15.4s, v15.4s, v5.4s
 
-    add v28.4s, v28.4s,  v8.4s  /* tmp4 += z1 */
-    add v24.4s, v24.4s, v12.4s
-    add v29.4s, v29.4s,  v9.4s  /* tmp5 += z2 */
-    add v25.4s, v25.4s, v13.4s
-    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
-    add v26.4s, v26.4s, v14.4s
-    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
-    add v27.4s, v27.4s, v15.4s
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
 
-    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
-    add v24.4s, v24.4s, v14.4s
-    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
-    add v25.4s, v25.4s, v15.4s
-    add v30.4s, v30.4s,  v9.4s  /* tmp6 += z2 */
-    add v26.4s, v26.4s, v13.4s
-    add v31.4s, v31.4s,  v8.4s  /* tmp7 += z1 */
-    add v27.4s, v27.4s, v12.4s
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
 
-    rshrn  v23.4h, v28.4s, #DESCALE_P1
-    rshrn  v21.4h, v29.4s, #DESCALE_P1
-    rshrn  v19.4h, v30.4s, #DESCALE_P1
-    rshrn  v17.4h, v31.4s, #DESCALE_P1
-    rshrn2 v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2 v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2 v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2 v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn           v23.4h, v28.4s, #DESCALE_P1
+    rshrn           v21.4h, v29.4s, #DESCALE_P1
+    rshrn           v19.4h, v30.4s, #DESCALE_P1
+    rshrn           v17.4h, v31.4s, #DESCALE_P1
+    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
 
     /* Transpose */
-    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
 
     /* 1-D FDCT */
-    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
-    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
-    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
-    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
-    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
-    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
-    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
-    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
 
     /* even part */
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
 
-    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
-    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
-    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
-    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
 
-    add v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
-    sub v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
 
-    add v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
 
-    srshr  v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
-    srshr  v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
 
-    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    smull  v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    mov    v22.16b, v18.16b
-    mov    v25.16b, v24.16b
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
 
-    smlal  v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal2 v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal  v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-
-    rshrn  v18.4h, v18.4s, #DESCALE_P2
-    rshrn  v22.4h, v22.4s, #DESCALE_P2
-    rshrn2 v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765),
-                                                                          CONST_BITS-PASS1_BITS); */
-    rshrn2 v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847),
-                                                                          CONST_BITS-PASS1_BITS); */
+    rshrn           v18.4h, v18.4s, #DESCALE_P2
+    rshrn           v22.4h, v22.4s, #DESCALE_P2
+    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
 
     /* Odd part */
+    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
 
-    add  v8.8h, v28.8h, v31.8h  /* z1 = tmp4 + tmp7; */
-    add  v9.8h, v29.8h, v30.8h  /* z2 = tmp5 + tmp6; */
-    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
-    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
 
-    smull  v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
-    smull2 v5.4s, v10.8h, XFIX_P_1_175
-    smlal  v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
-    smlal2 v5.4s, v11.8h, XFIX_P_1_175
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
 
-    smull2 v24.4s, v28.8h, XFIX_P_0_298
-    smull2 v25.4s, v29.8h, XFIX_P_2_053
-    smull2 v26.4s, v30.8h, XFIX_P_3_072
-    smull2 v27.4s, v31.8h, XFIX_P_1_501
-    smull  v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
-    smull  v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
-    smull  v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
-    smull  v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
 
-    smull2 v12.4s,  v8.8h, XFIX_N_0_899
-    smull2 v13.4s,  v9.8h, XFIX_N_2_562
-    smull2 v14.4s, v10.8h, XFIX_N_1_961
-    smull2 v15.4s, v11.8h, XFIX_N_0_390
-    smull   v8.4s,  v8.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
-    smull   v9.4s,  v9.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
-    smull  v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
-    smull  v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+    add             v10.4s, v10.4s, v4.4s
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s
+    add             v15.4s, v15.4s, v5.4s
 
-    add v10.4s, v10.4s, v4.4s
-    add v14.4s, v14.4s, v5.4s
-    add v11.4s, v11.4s, v4.4s
-    add v15.4s, v15.4s, v5.4s
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
 
-    add v28.4s, v28.4s,  v8.4s  /* tmp4 += z1 */
-    add v24.4s, v24.4s, v12.4s
-    add v29.4s, v29.4s,  v9.4s  /* tmp5 += z2 */
-    add v25.4s, v25.4s, v13.4s
-    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
-    add v26.4s, v26.4s, v14.4s
-    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
-    add v27.4s, v27.4s, v15.4s
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
 
-    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
-    add v24.4s, v24.4s, v14.4s
-    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
-    add v25.4s, v25.4s, v15.4s
-    add v30.4s, v30.4s,  v9.4s  /* tmp6 += z2 */
-    add v26.4s, v26.4s, v13.4s
-    add v31.4s, v31.4s,  v8.4s  /* tmp7 += z1 */
-    add v27.4s, v27.4s, v12.4s
-
-    rshrn  v23.4h, v28.4s, #DESCALE_P2
-    rshrn  v21.4h, v29.4s, #DESCALE_P2
-    rshrn  v19.4h, v30.4s, #DESCALE_P2
-    rshrn  v17.4h, v31.4s, #DESCALE_P2
-    rshrn2 v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2 v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2 v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2 v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn           v23.4h, v28.4s, #DESCALE_P2
+    rshrn           v21.4h, v29.4s, #DESCALE_P2
+    rshrn           v19.4h, v30.4s, #DESCALE_P2
+    rshrn           v17.4h, v31.4s, #DESCALE_P2
+    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
 
     /* store results */
-    st1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    st1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
 
     /* Restore NEON registers */
     sub             sp, sp, #64
@@ -2567,6 +2567,7 @@
 #undef XFIX_N_2_562
 #undef XFIX_P_3_072
 
+
 /*****************************************************************************/
 
 /*
@@ -2574,7 +2575,7 @@
  *
  * This function contains a fast, not so accurate integer implementation of
  * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG''s original 'jpeg_fdct_ifast'
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
  * function from jfdctfst.c
  *
  * TODO: can be combined with 'jsimd_convsamp_neon' to get
@@ -2589,19 +2590,19 @@
 
 .balign 16
 Ljsimd_fdct_ifast_neon_consts:
-    .short (98 * 128)               /* XFIX_0_382683433 */
-    .short (139 * 128)              /* XFIX_0_541196100 */
-    .short (181 * 128)              /* XFIX_0_707106781 */
-    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
 
 asm_function jsimd_fdct_ifast_neon
 
-    DATA    .req x0
-    TMP     .req x9
+    DATA            .req x0
+    TMP             .req x9
 
     /* Load constants */
-    adr     TMP, Ljsimd_fdct_ifast_neon_consts
-    ld1     {v0.4h}, [TMP]
+    adr             TMP, Ljsimd_fdct_ifast_neon_consts
+    ld1             {v0.4h}, [TMP]
 
     /* Load all DATA into NEON registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
@@ -2616,56 +2617,56 @@
      *   7 | d30     | d31    | q15
      */
 
-    ld1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    ld1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-    mov     TMP, #2
-    sub     DATA, DATA, #64
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    mov             TMP, #2
+    sub             DATA, DATA, #64
 1:
     /* Transpose */
-    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
-    subs    TMP, TMP, #1
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
+    subs            TMP, TMP, #1
     /* 1-D FDCT */
-    add      v4.8h, v19.8h, v20.8h
-    sub     v20.8h, v19.8h, v20.8h
-    sub     v28.8h, v18.8h, v21.8h
-    add     v18.8h, v18.8h, v21.8h
-    sub     v29.8h, v17.8h, v22.8h
-    add     v17.8h, v17.8h, v22.8h
-    sub     v21.8h, v16.8h, v23.8h
-    add     v16.8h, v16.8h, v23.8h
-    sub      v6.8h, v17.8h, v18.8h
-    sub      v7.8h, v16.8h, v4.8h
-    add      v5.8h, v17.8h, v18.8h
-    add      v6.8h,  v6.8h, v7.8h
-    add      v4.8h, v16.8h, v4.8h
-    sqdmulh  v6.8h,  v6.8h, XFIX_0_707106781
-    add     v19.8h, v20.8h, v28.8h
-    add     v16.8h,  v4.8h, v5.8h
-    sub     v20.8h,  v4.8h, v5.8h
-    add      v5.8h, v28.8h, v29.8h
-    add     v29.8h, v29.8h, v21.8h
-    sqdmulh  v5.8h,  v5.8h, XFIX_0_707106781
-    sub     v28.8h, v19.8h, v29.8h
-    add     v18.8h,  v7.8h, v6.8h
-    sqdmulh v28.8h, v28.8h, XFIX_0_382683433
-    sub     v22.8h,  v7.8h, v6.8h
-    sqdmulh v19.8h, v19.8h, XFIX_0_541196100
-    sqdmulh  v7.8h, v29.8h, XFIX_1_306562965
-    add      v6.8h, v21.8h, v5.8h
-    sub      v5.8h, v21.8h, v5.8h
-    add     v29.8h, v29.8h, v28.8h
-    add     v19.8h, v19.8h, v28.8h
-    add     v29.8h, v29.8h, v7.8h
-    add     v21.8h,  v5.8h, v19.8h
-    sub     v19.8h,  v5.8h, v19.8h
-    add     v17.8h,  v6.8h, v29.8h
-    sub     v23.8h,  v6.8h, v29.8h
+    add             v4.8h, v19.8h, v20.8h
+    sub             v20.8h, v19.8h, v20.8h
+    sub             v28.8h, v18.8h, v21.8h
+    add             v18.8h, v18.8h, v21.8h
+    sub             v29.8h, v17.8h, v22.8h
+    add             v17.8h, v17.8h, v22.8h
+    sub             v21.8h, v16.8h, v23.8h
+    add             v16.8h, v16.8h, v23.8h
+    sub             v6.8h, v17.8h, v18.8h
+    sub             v7.8h, v16.8h, v4.8h
+    add             v5.8h, v17.8h, v18.8h
+    add             v6.8h, v6.8h, v7.8h
+    add             v4.8h, v16.8h, v4.8h
+    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
+    add             v19.8h, v20.8h, v28.8h
+    add             v16.8h, v4.8h, v5.8h
+    sub             v20.8h, v4.8h, v5.8h
+    add             v5.8h, v28.8h, v29.8h
+    add             v29.8h, v29.8h, v21.8h
+    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
+    sub             v28.8h, v19.8h, v29.8h
+    add             v18.8h, v7.8h, v6.8h
+    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
+    sub             v22.8h, v7.8h, v6.8h
+    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
+    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
+    add             v6.8h, v21.8h, v5.8h
+    sub             v5.8h, v21.8h, v5.8h
+    add             v29.8h, v29.8h, v28.8h
+    add             v19.8h, v19.8h, v28.8h
+    add             v29.8h, v29.8h, v7.8h
+    add             v21.8h, v5.8h, v19.8h
+    sub             v19.8h, v5.8h, v19.8h
+    add             v17.8h, v6.8h, v29.8h
+    sub             v23.8h, v6.8h, v29.8h
 
-    b.ne    1b
+    b.ne            1b
 
     /* store results */
-    st1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    st1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
 
     br              x30
 
@@ -2676,6 +2677,7 @@
 #undef XFIX_0_707106781
 #undef XFIX_1_306562965
 
+
 /*****************************************************************************/
 
 /*
@@ -2732,10 +2734,10 @@
     neg             v25.8h, v25.8h
     neg             v26.8h, v26.8h
     neg             v27.8h, v27.8h
-    sshr            v0.8h,  v0.8h,  #15  /* extract sign */
-    sshr            v1.8h,  v1.8h,  #15
-    sshr            v2.8h,  v2.8h,  #15
-    sshr            v3.8h,  v3.8h,  #15
+    sshr            v0.8h, v0.8h, #15  /* extract sign */
+    sshr            v1.8h, v1.8h, #15
+    sshr            v2.8h, v2.8h, #15
+    sshr            v3.8h, v3.8h, #15
     ushl            v4.8h, v4.8h, v24.8h  /* shift */
     ushl            v5.8h, v5.8h, v25.8h
     ushl            v6.8h, v6.8h, v26.8h
@@ -2763,6 +2765,7 @@
     .unreq          SHIFT
     .unreq          LOOP_COUNT
 
+
 /*****************************************************************************/
 
 /*
@@ -2772,44 +2775,45 @@
  *
  * GLOBAL(void)
  * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+ *                             JDIMENSION v_samp_factor,
+ *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
+ *                             JSAMPARRAY output_data);
  */
 
 .balign 16
 Ljsimd_h2_downsample_neon_consts:
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-          0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
-          0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
-          0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
-          0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
-    .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
-          0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
-    .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
-          0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
-    .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
-          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
-    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
-          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
+        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
+        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
+        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
+  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
+  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
 
 asm_function jsimd_h2v1_downsample_neon
     IMAGE_WIDTH     .req x0
@@ -2825,39 +2829,39 @@
     TMP3            .req x13
     TMPDUP          .req w15
 
-    mov    TMPDUP, #0x10000
-    lsl    TMP2, BLOCK_WIDTH, #4
-    sub    TMP2, TMP2, IMAGE_WIDTH
-    adr    TMP3, Ljsimd_h2_downsample_neon_consts
-    add    TMP3, TMP3, TMP2, lsl #4
-    dup    v16.4s, TMPDUP
-    ld1    {v18.16b}, [TMP3]
+    mov             TMPDUP, #0x10000
+    lsl             TMP2, BLOCK_WIDTH, #4
+    sub             TMP2, TMP2, IMAGE_WIDTH
+    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    add             TMP3, TMP3, TMP2, lsl #4
+    dup             v16.4s, TMPDUP
+    ld1             {v18.16b}, [TMP3]
 
 1:  /* row loop */
-    ldr    INPTR, [INPUT_DATA], #8
-    ldr    OUTPTR, [OUTPUT_DATA], #8
-    subs   TMP1, BLOCK_WIDTH, #1
-    b.eq   3f
+    ldr             INPTR, [INPUT_DATA], #8
+    ldr             OUTPTR, [OUTPUT_DATA], #8
+    subs            TMP1, BLOCK_WIDTH, #1
+    b.eq            3f
 2:  /* columns */
-    ld1    {v0.16b}, [INPTR], #16
-    mov    v4.16b, v16.16b
-    subs   TMP1, TMP1, #1
-    uadalp v4.8h, v0.16b
-    shrn   v6.8b, v4.8h, #1
-    st1    {v6.8b}, [OUTPTR], #8
-    b.ne   2b
+    ld1             {v0.16b}, [INPTR], #16
+    mov             v4.16b, v16.16b
+    subs            TMP1, TMP1, #1
+    uadalp          v4.8h, v0.16b
+    shrn            v6.8b, v4.8h, #1
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            2b
 3:  /* last columns */
-    ld1    {v0.16b}, [INPTR]
-    mov    v4.16b, v16.16b
-    subs   V_SAMP, V_SAMP, #1
+    ld1             {v0.16b}, [INPTR]
+    mov             v4.16b, v16.16b
+    subs            V_SAMP, V_SAMP, #1
     /* expand right */
-    tbl    v2.16b, {v0.16b}, v18.16b
-    uadalp v4.8h, v2.16b
-    shrn   v6.8b, v4.8h, #1
-    st1    {v6.8b}, [OUTPTR], #8
-    b.ne   1b
+    tbl             v2.16b, {v0.16b}, v18.16b
+    uadalp          v4.8h, v2.16b
+    shrn            v6.8b, v4.8h, #1
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            1b
 
-    br     x30
+    br              x30
 
     .unreq          IMAGE_WIDTH
     .unreq          MAX_V_SAMP
@@ -2872,6 +2876,7 @@
     .unreq          TMP3
     .unreq          TMPDUP
 
+
 /*****************************************************************************/
 
 /*
@@ -2901,47 +2906,47 @@
     TMP3            .req x13
     TMPDUP          .req w15
 
-    mov    TMPDUP, #1
-    lsl    TMP2, BLOCK_WIDTH, #4
-    lsl    TMPDUP, TMPDUP, #17
-    sub    TMP2, TMP2, IMAGE_WIDTH
-    adr    TMP3, Ljsimd_h2_downsample_neon_consts
-    orr    TMPDUP, TMPDUP, #1
-    add    TMP3, TMP3, TMP2, lsl #4
-    dup    v16.4s, TMPDUP
-    ld1    {v18.16b}, [TMP3]
+    mov             TMPDUP, #1
+    lsl             TMP2, BLOCK_WIDTH, #4
+    lsl             TMPDUP, TMPDUP, #17
+    sub             TMP2, TMP2, IMAGE_WIDTH
+    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    orr             TMPDUP, TMPDUP, #1
+    add             TMP3, TMP3, TMP2, lsl #4
+    dup             v16.4s, TMPDUP
+    ld1             {v18.16b}, [TMP3]
 
 1:  /* row loop */
-    ldr    INPTR0, [INPUT_DATA], #8
-    ldr    OUTPTR, [OUTPUT_DATA], #8
-    ldr    INPTR1, [INPUT_DATA], #8
-    subs   TMP1, BLOCK_WIDTH, #1
-    b.eq   3f
+    ldr             INPTR0, [INPUT_DATA], #8
+    ldr             OUTPTR, [OUTPUT_DATA], #8
+    ldr             INPTR1, [INPUT_DATA], #8
+    subs            TMP1, BLOCK_WIDTH, #1
+    b.eq            3f
 2:  /* columns */
-    ld1    {v0.16b}, [INPTR0], #16
-    ld1    {v1.16b}, [INPTR1], #16
-    mov    v4.16b, v16.16b
-    subs   TMP1, TMP1, #1
-    uadalp v4.8h, v0.16b
-    uadalp v4.8h, v1.16b
-    shrn   v6.8b, v4.8h, #2
-    st1    {v6.8b}, [OUTPTR], #8
-    b.ne   2b
+    ld1             {v0.16b}, [INPTR0], #16
+    ld1             {v1.16b}, [INPTR1], #16
+    mov             v4.16b, v16.16b
+    subs            TMP1, TMP1, #1
+    uadalp          v4.8h, v0.16b
+    uadalp          v4.8h, v1.16b
+    shrn            v6.8b, v4.8h, #2
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            2b
 3:  /* last columns */
-    ld1    {v0.16b}, [INPTR0], #16
-    ld1    {v1.16b}, [INPTR1], #16
-    mov    v4.16b, v16.16b
-    subs   V_SAMP, V_SAMP, #1
+    ld1             {v0.16b}, [INPTR0], #16
+    ld1             {v1.16b}, [INPTR1], #16
+    mov             v4.16b, v16.16b
+    subs            V_SAMP, V_SAMP, #1
     /* expand right */
-    tbl    v2.16b, {v0.16b}, v18.16b
-    tbl    v3.16b, {v1.16b}, v18.16b
-    uadalp v4.8h, v2.16b
-    uadalp v4.8h, v3.16b
-    shrn   v6.8b, v4.8h, #2
-    st1    {v6.8b}, [OUTPTR], #8
-    b.ne   1b
+    tbl             v2.16b, {v0.16b}, v18.16b
+    tbl             v3.16b, {v1.16b}, v18.16b
+    uadalp          v4.8h, v2.16b
+    uadalp          v4.8h, v3.16b
+    shrn            v6.8b, v4.8h, #2
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            1b
 
-    br     x30
+    br              x30
 
     .unreq          IMAGE_WIDTH
     .unreq          MAX_V_SAMP
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index ecbdea8..c63e513 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -27,7 +27,7 @@
  */
 
 #if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
 .text
@@ -59,10 +59,10 @@
 
 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
 .macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16 \x0, \x1
-    vtrn.16 \x2, \x3
-    vtrn.32 \x0, \x2
-    vtrn.32 \x1, \x3
+    vtrn.16         \x0, \x1
+    vtrn.16         \x2, \x3
+    vtrn.32         \x0, \x2
+    vtrn.32         \x1, \x3
 .endm
 
 
@@ -78,18 +78,18 @@
  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
-#define FIX_0_298631336  (2446)
-#define FIX_0_390180644  (3196)
-#define FIX_0_541196100  (4433)
-#define FIX_0_765366865  (6270)
-#define FIX_0_899976223  (7373)
-#define FIX_1_175875602  (9633)
-#define FIX_1_501321110  (12299)
-#define FIX_1_847759065  (15137)
-#define FIX_1_961570560  (16069)
-#define FIX_2_053119869  (16819)
-#define FIX_2_562915447  (20995)
-#define FIX_3_072711026  (25172)
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
 
 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
@@ -171,34 +171,34 @@
     tmp13 = q1;                                                               \
 }
 
-#define XFIX_0_899976223                    d0[0]
-#define XFIX_0_541196100                    d0[1]
-#define XFIX_2_562915447                    d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
-#define XFIX_1_175875602                    d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+#define XFIX_0_899976223                   d0[0]
+#define XFIX_0_541196100                   d0[1]
+#define XFIX_2_562915447                   d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865  d1[2]
+#define XFIX_1_175875602                   d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
 
 .balign 16
 jsimd_idct_islow_neon_consts:
-    .short FIX_0_899976223                    /* d0[0] */
-    .short FIX_0_541196100                    /* d0[1] */
-    .short FIX_2_562915447                    /* d0[2] */
-    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-    .short FIX_1_175875602                    /* d1[3] */
-    /* reloadable constants */
-    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
 
 asm_function jsimd_idct_islow_neon
 
@@ -257,140 +257,141 @@
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
     vmul.s16        q14, q14, q2
     vmul.s16        q13, q13, q1
-    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
     add             ip, ip, #16
     vmul.s16        q15, q15, q3
-    vpush           {d8-d15} /* save NEON registers */
+    vpush           {d8-d15}                      /* save NEON registers */
     /* 1-D IDCT, pass 1, left 4x8 half */
-    vadd.s16        d4,    ROW7L, ROW3L
-    vadd.s16        d5,    ROW5L, ROW1L
-    vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    d5,    XFIX_1_175875602
-    vmull.s16       q7,    d4,    XFIX_1_175875602
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
       /* Check for the zero coefficients in the right 4x8 half */
       push            {r4, r5}
-    vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3,    ROW0L, ROW4L
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
-      orr             r0,    r4,    r5
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3,    q3,    #13
-      orr             r0,    r0,    r4
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-      orr             r0,    r0,    r5
-    vadd.s32        q1,    q3,    q2
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-      orr             r0,    r0,    r4
-    vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
-      orr             r0,    r0,    r5
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1L, q1,    #11
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
-      orr             r0,    r0,    r4
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-      orr             r0,    r0,    r5
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    vmlal.s16       q6,    ROW6L, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-      orr             r0,    r0,    r4
-    vrshrn.s32      ROW6L, q1,    #11
-      orr             r0,    r0,    r5
-    vadd.s32        q1,    q3,    q5
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0L, ROW4L
-      orr             r0,    r0,    r4
-    vrshrn.s32      ROW2L, q1,    #11
-      orr             r0,    r0,    r5
-    vrshrn.s32      ROW5L, q3,    #11
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
-      orr             r0,    r0,    r4
-    vadd.s32        q2,    q5,    q6
-      orrs            r0,    r0,    r5
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-      orr             r0,    r4,    r5
-    vsub.s32        q3,    q1,    q4
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5
+    vsub.s32        q3, q1, q4
       pop             {r4, r5}
-    vrshrn.s32      ROW7L, q2,    #11
-    vrshrn.s32      ROW3L, q5,    #11
-    vrshrn.s32      ROW0L, q6,    #11
-    vrshrn.s32      ROW4L, q3,    #11
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
 
-      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
 
     /* 1-D IDCT, pass 1, right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vadd.s16        d10,   ROW7R, ROW3R
-    vadd.s16        d8,    ROW5R, ROW1R
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
       /* Transpose left 4x8 half */
       vtrn.16         ROW6L, ROW7L
-    vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    d8,    XFIX_1_175875602
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
       vtrn.16         ROW2L, ROW3L
-    vmull.s16       q7,    d10,   XFIX_1_175875602
-    vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
       vtrn.16         ROW0L, ROW1L
-    vsubl.s16       q3,    ROW0R, ROW4R
-    vmull.s16       q2,    ROW2R, XFIX_0_541196100
-    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
       vtrn.16         ROW4L, ROW5L
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
-    vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
       vtrn.32         ROW1L, ROW3L
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
       vtrn.32         ROW4L, ROW6L
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
       vtrn.32         ROW0L, ROW2L
-    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
-    vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1R, q1,    #11
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
       vtrn.32         ROW5L, ROW7L
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-    vrshrn.s32      ROW6R, q1,    #11
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0R, ROW4R
-    vrshrn.s32      ROW2R, q1,    #11
-    vrshrn.s32      ROW5R, q3,    #11
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vrshrn.s32      ROW7R, q2,    #11
-    vrshrn.s32      ROW3R, q5,    #11
-    vrshrn.s32      ROW0R, q6,    #11
-    vrshrn.s32      ROW4R, q3,    #11
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
     /* Transpose right 4x8 half */
     vtrn.16         ROW6R, ROW7R
     vtrn.16         ROW2R, ROW3R
@@ -402,122 +403,122 @@
     vtrn.32         ROW5R, ROW7R
 
 1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW1L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
-    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW3L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
-    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-    vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32       ROW1L, q1,    #16
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW2L, q1,    #16
-    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5,    #16
-    vshrn.s32       ROW0L, q6,    #16
-    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2, right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW5R, XFIX_1_175875602
-    vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
-    vmull.s16       q7,    ROW7R, XFIX_1_175875602
-    vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
-    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
-    vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
-    vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
-    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
-    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
-    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
-    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
-    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW6R, q1,    #16
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3,    #16
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW7R, q2,    #16
-    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3,    #16
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
 
 2:  /* Descale to 8-bit and range limit */
-    vqrshrn.s16     d16,   q8,    #2
-    vqrshrn.s16     d17,   q9,    #2
-    vqrshrn.s16     d18,   q10,   #2
-    vqrshrn.s16     d19,   q11,   #2
-    vpop            {d8-d15} /* restore NEON registers */
-    vqrshrn.s16     d20,   q12,   #2
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8-d15}                      /* restore NEON registers */
+    vqrshrn.s16     d20, q12, #2
       /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-      vtrn.16         q8,    q9
-    vqrshrn.s16     d21,   q13,   #2
-    vqrshrn.s16     d22,   q14,   #2
-      vmov.u8         q0,    #(CENTERJSAMPLE)
-    vqrshrn.s16     d23,   q15,   #2
-      vtrn.8          d16,   d17
-      vtrn.8          d18,   d19
-      vadd.u8         q8,    q8,    q0
-      vadd.u8         q9,    q9,    q0
-      vtrn.16         q10,   q11
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
         /* Store results to the output buffer */
         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
         add             TMP1, TMP1, OUTPUT_COL
@@ -529,7 +530,7 @@
         add             TMP1, TMP1, OUTPUT_COL
         add             TMP2, TMP2, OUTPUT_COL
         vst1.8          {d18}, [TMP1]
-      vadd.u8         q10,   q10,   q0
+      vadd.u8         q10, q10, q0
         vst1.8          {d19}, [TMP2]
         ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
         add             TMP1, TMP1, OUTPUT_COL
@@ -538,7 +539,7 @@
         add             TMP4, TMP4, OUTPUT_COL
       vtrn.8          d22, d23
         vst1.8          {d20}, [TMP1]
-      vadd.u8         q11,   q11,   q0
+      vadd.u8         q11, q11, q0
         vst1.8          {d21}, [TMP2]
         vst1.8          {d22}, [TMP3]
         vst1.8          {d23}, [TMP4]
@@ -551,14 +552,15 @@
     vtrn.16         ROW2L, ROW3L
     vtrn.16         ROW0L, ROW1L
     vtrn.16         ROW4L, ROW5L
-    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
     vtrn.32         ROW1L, ROW3L
     vtrn.32         ROW4L, ROW6L
     vtrn.32         ROW0L, ROW2L
     vtrn.32         ROW5L, ROW7L
 
     cmp             r0, #0
-    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
 
     /* Only row 0 is non-zero for the right 4x8 half  */
     vdup.s16        ROW1R, ROW0R[1]
@@ -569,83 +571,83 @@
     vdup.s16        ROW6R, ROW0R[2]
     vdup.s16        ROW7R, ROW0R[3]
     vdup.s16        ROW0R, ROW0R[0]
-    b               1b /* Go to 'normal' second pass */
+    b               1b  /* Go to 'normal' second pass */
 
 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW1L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW3L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vshll.s16       q3,    ROW0L, #13
-    vmov            q4,    q6
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1,    q1,    q6
-    vadd.s32        q6,    q6,    q6
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-    vshrn.s32       ROW1L, q1,    #16
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vshll.s16       q5,    ROW0L, #13
-    vshrn.s32       ROW2L, q1,    #16
-    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5,    #16
-    vshrn.s32       ROW0L, q6,    #16
-    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW5L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW7L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2,    ROW6L, XFIX_0_541196100
-    vshll.s16       q3,    ROW4L, #13
-    vmov            q4,    q6
-    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1,    q1,    q6
-    vadd.s32        q6,    q6,    q6
-    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
-    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW6R, q1,    #16
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vshll.s16       q5,    ROW4L, #13
-    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3,    #16
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW7R, q2,    #16
-    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3,    #16
-    b               2b /* Go to epilogue */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
@@ -699,10 +701,10 @@
 
 .balign 16
 jsimd_idct_ifast_neon_consts:
-    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
-    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
-    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
-    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
 
 asm_function jsimd_idct_ifast_neon
 
@@ -732,9 +734,9 @@
     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8,  q8,  q0
+    vmul.s16        q8, q8, q0
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9,  q9,  q1
+    vmul.s16        q9, q9, q1
     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
     vmul.s16        q10, q10, q2
     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
@@ -744,124 +746,124 @@
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
     vmul.s16        q14, q14, q2
     vmul.s16        q13, q13, q1
-    vld1.16         {d0}, [ip, :64] /* load constants */
+    vld1.16         {d0}, [ip, :64]  /* load constants */
     vmul.s16        q15, q15, q3
-    vpush           {d8-d13}        /* save NEON registers */
+    vpush           {d8-d13}         /* save NEON registers */
     /* 1-D IDCT, pass 1 */
-    vsub.s16        q2,  q10, q14
+    vsub.s16        q2, q10, q14
     vadd.s16        q14, q10, q14
-    vsub.s16        q1,  q11, q13
+    vsub.s16        q1, q11, q13
     vadd.s16        q13, q11, q13
-    vsub.s16        q5,  q9,  q15
-    vadd.s16        q15, q9,  q15
-    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
-    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
-    vadd.s16        q3,  q1,  q1
-    vsub.s16        q1,  q5,  q1
-    vadd.s16        q10, q2,  q4
-    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
-    vsub.s16        q2,  q15, q13
-    vadd.s16        q3,  q3,  q6
-    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
-    vadd.s16        q1,  q1,  q4
-    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
     vsub.s16        q10, q10, q14
-    vadd.s16        q2,  q2,  q6
-    vsub.s16        q6,  q8,  q12
-    vadd.s16        q12, q8,  q12
-    vadd.s16        q9,  q5,  q4
-    vadd.s16        q5,  q6,  q10
-    vsub.s16        q10, q6,  q10
-    vadd.s16        q6,  q15, q13
-    vadd.s16        q8,  q12, q14
-    vsub.s16        q3,  q6,  q3
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
     vsub.s16        q12, q12, q14
-    vsub.s16        q3,  q3,  q1
-    vsub.s16        q1,  q9,  q1
-    vadd.s16        q2,  q3,  q2
-    vsub.s16        q15, q8,  q6
-    vadd.s16        q1,  q1,  q2
-    vadd.s16        q8,  q8,  q6
-    vadd.s16        q14, q5,  q3
-    vsub.s16        q9,  q5,  q3
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
     vsub.s16        q13, q10, q2
     vadd.s16        q10, q10, q2
       /* Transpose */
-      vtrn.16         q8,  q9
+      vtrn.16         q8, q9
     vsub.s16        q11, q12, q1
       vtrn.16         q14, q15
     vadd.s16        q12, q12, q1
       vtrn.16         q10, q11
       vtrn.16         q12, q13
-      vtrn.32         q9,  q11
+      vtrn.32         q9, q11
       vtrn.32         q12, q14
-      vtrn.32         q8,  q10
+      vtrn.32         q8, q10
       vtrn.32         q13, q15
       vswp            d28, d21
       vswp            d26, d19
     /* 1-D IDCT, pass 2 */
-    vsub.s16        q2,  q10, q14
+    vsub.s16        q2, q10, q14
       vswp            d30, d23
     vadd.s16        q14, q10, q14
       vswp            d24, d17
-    vsub.s16        q1,  q11, q13
+    vsub.s16        q1, q11, q13
     vadd.s16        q13, q11, q13
-    vsub.s16        q5,  q9,  q15
-    vadd.s16        q15, q9,  q15
-    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
-    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
-    vadd.s16        q3,  q1,  q1
-    vsub.s16        q1,  q5,  q1
-    vadd.s16        q10, q2,  q4
-    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
-    vsub.s16        q2,  q15, q13
-    vadd.s16        q3,  q3,  q6
-    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
-    vadd.s16        q1,  q1,  q4
-    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
     vsub.s16        q10, q10, q14
-    vadd.s16        q2,  q2,  q6
-    vsub.s16        q6,  q8,  q12
-    vadd.s16        q12, q8,  q12
-    vadd.s16        q9,  q5,  q4
-    vadd.s16        q5,  q6,  q10
-    vsub.s16        q10, q6,  q10
-    vadd.s16        q6,  q15, q13
-    vadd.s16        q8,  q12, q14
-    vsub.s16        q3,  q6,  q3
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
     vsub.s16        q12, q12, q14
-    vsub.s16        q3,  q3,  q1
-    vsub.s16        q1,  q9,  q1
-    vadd.s16        q2,  q3,  q2
-    vsub.s16        q15, q8,  q6
-    vadd.s16        q1,  q1,  q2
-    vadd.s16        q8,  q8,  q6
-    vadd.s16        q14, q5,  q3
-    vsub.s16        q9,  q5,  q3
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
     vsub.s16        q13, q10, q2
-    vpop            {d8-d13}        /* restore NEON registers */
+    vpop            {d8-d13}      /* restore NEON registers */
     vadd.s16        q10, q10, q2
     vsub.s16        q11, q12, q1
     vadd.s16        q12, q12, q1
     /* Descale to 8-bit and range limit */
-    vmov.u8         q0,  #0x80
-    vqshrn.s16      d16, q8,  #5
-    vqshrn.s16      d17, q9,  #5
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
     vqshrn.s16      d18, q10, #5
     vqshrn.s16      d19, q11, #5
     vqshrn.s16      d20, q12, #5
     vqshrn.s16      d21, q13, #5
     vqshrn.s16      d22, q14, #5
     vqshrn.s16      d23, q15, #5
-    vadd.u8         q8,  q8,  q0
-    vadd.u8         q9,  q9,  q0
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
     vadd.u8         q10, q10, q0
     vadd.u8         q11, q11, q0
     /* Transpose the final 8-bit samples */
-    vtrn.16         q8,  q9
+    vtrn.16         q8, q9
     vtrn.16         q10, q11
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
     vtrn.8          d16, d17
     vtrn.8          d18, d19
       /* Store results to the output buffer */
@@ -920,81 +922,80 @@
 
 #define CONST_BITS  13
 
-#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
 
 .balign 16
 jsimd_idct_4x4_neon_consts:
-    .short     FIX_1_847759065     /* d0[0] */
-    .short     -FIX_0_765366865    /* d0[1] */
-    .short     -FIX_0_211164243    /* d0[2] */
-    .short     FIX_1_451774981     /* d0[3] */
-    .short     -FIX_2_172734803    /* d1[0] */
-    .short     FIX_1_061594337     /* d1[1] */
-    .short     -FIX_0_509795579    /* d1[2] */
-    .short     -FIX_0_601344887    /* d1[3] */
-    .short     FIX_0_899976223     /* d2[0] */
-    .short     FIX_2_562915447     /* d2[1] */
-    .short     1 << (CONST_BITS+1) /* d2[2] */
-    .short     0                   /* d2[3] */
+  .short FIX_1_847759065      /* d0[0] */
+  .short -FIX_0_765366865     /* d0[1] */
+  .short -FIX_0_211164243     /* d0[2] */
+  .short FIX_1_451774981      /* d0[3] */
+  .short -FIX_2_172734803     /* d1[0] */
+  .short FIX_1_061594337      /* d1[1] */
+  .short -FIX_0_509795579     /* d1[2] */
+  .short -FIX_0_601344887     /* d1[3] */
+  .short FIX_0_899976223      /* d2[0] */
+  .short FIX_2_562915447      /* d2[1] */
+  .short 1 << (CONST_BITS+1)  /* d2[2] */
+  .short 0                    /* d2[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    vmull.s16       q14, \x4,  d2[2]
-    vmlal.s16       q14, \x8,  d0[0]
+    vmull.s16       q14, \x4, d2[2]
+    vmlal.s16       q14, \x8, d0[0]
     vmlal.s16       q14, \x14, d0[1]
 
     vmull.s16       q13, \x16, d1[2]
     vmlal.s16       q13, \x12, d1[3]
     vmlal.s16       q13, \x10, d2[0]
-    vmlal.s16       q13, \x6,  d2[1]
+    vmlal.s16       q13, \x6, d2[1]
 
-    vmull.s16       q15, \x4,  d2[2]
-    vmlsl.s16       q15, \x8,  d0[0]
+    vmull.s16       q15, \x4, d2[2]
+    vmlsl.s16       q15, \x8, d0[0]
     vmlsl.s16       q15, \x14, d0[1]
 
     vmull.s16       q12, \x16, d0[2]
     vmlal.s16       q12, \x12, d0[3]
     vmlal.s16       q12, \x10, d1[0]
-    vmlal.s16       q12, \x6,  d1[1]
+    vmlal.s16       q12, \x6, d1[1]
 
     vadd.s32        q10, q14, q13
     vsub.s32        q14, q14, q13
 
-.if \shift > 16
-    vrshr.s32       q10,  q10, #\shift
-    vrshr.s32       q14,  q14, #\shift
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q14, q14, #\shift
     vmovn.s32       \y26, q10
     vmovn.s32       \y29, q14
-.else
+  .else
     vrshrn.s32      \y26, q10, #\shift
     vrshrn.s32      \y29, q14, #\shift
-.endif
+  .endif
 
     vadd.s32        q10, q15, q12
     vsub.s32        q15, q15, q12
 
-.if \shift > 16
-    vrshr.s32       q10,  q10, #\shift
-    vrshr.s32       q15,  q15, #\shift
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q15, q15, #\shift
     vmovn.s32       \y27, q10
     vmovn.s32       \y28, q15
-.else
+  .else
     vrshrn.s32      \y27, q10, #\shift
     vrshrn.s32      \y28, q15, #\shift
-.endif
-
+  .endif
 .endm
 
 asm_function jsimd_idct_4x4_neon
@@ -1130,31 +1131,30 @@
 
 .balign 8
 jsimd_idct_2x2_neon_consts:
-    .short     -FIX_0_720959822    /* d0[0] */
-    .short     FIX_0_850430095     /* d0[1] */
-    .short     -FIX_1_272758580    /* d0[2] */
-    .short     FIX_3_624509785     /* d0[3] */
+  .short -FIX_0_720959822  /* d0[0] */
+  .short FIX_0_850430095   /* d0[1] */
+  .short -FIX_1_272758580  /* d0[2] */
+  .short FIX_3_624509785   /* d0[3] */
 
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    vshll.s16  q14,  \x4,  #15
-    vmull.s16  q13,  \x6,  d0[3]
-    vmlal.s16  q13,  \x10, d0[2]
-    vmlal.s16  q13,  \x12, d0[1]
-    vmlal.s16  q13,  \x16, d0[0]
+    vshll.s16       q14, \x4, #15
+    vmull.s16       q13, \x6, d0[3]
+    vmlal.s16       q13, \x10, d0[2]
+    vmlal.s16       q13, \x12, d0[1]
+    vmlal.s16       q13, \x16, d0[0]
 
-    vadd.s32   q10,  q14,  q13
-    vsub.s32   q14,  q14,  q13
+    vadd.s32        q10, q14, q13
+    vsub.s32        q14, q14, q13
 
-.if \shift > 16
-    vrshr.s32  q10,  q10,  #\shift
-    vrshr.s32  q14,  q14,  #\shift
-    vmovn.s32  \y26, q10
-    vmovn.s32  \y27, q14
-.else
-    vrshrn.s32 \y26, q10,  #\shift
-    vrshrn.s32 \y27, q14,  #\shift
-.endif
-
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q14, q14, #\shift
+    vmovn.s32       \y26, q10
+    vmovn.s32       \y27, q14
+  .else
+    vrshrn.s32      \y26, q10, #\shift
+    vrshrn.s32      \y27, q14, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_2x2_neon
@@ -1208,30 +1208,30 @@
     /* Pass 1 */
 #if 0
     idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
-    transpose_4x4   d4, d6, d8,  d10
+    transpose_4x4   d4, d6, d8, d10
     idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
-    transpose_4x4   d5, d7, d9,  d11
+    transpose_4x4   d5, d7, d9, d11
 #else
-    vmull.s16       q13, d6,  d0[3]
+    vmull.s16       q13, d6, d0[3]
     vmlal.s16       q13, d10, d0[2]
     vmlal.s16       q13, d12, d0[1]
     vmlal.s16       q13, d16, d0[0]
-    vmull.s16       q12, d7,  d0[3]
+    vmull.s16       q12, d7, d0[3]
     vmlal.s16       q12, d11, d0[2]
     vmlal.s16       q12, d13, d0[1]
     vmlal.s16       q12, d17, d0[0]
-    vshll.s16       q14, d4,  #15
-    vshll.s16       q15, d5,  #15
+    vshll.s16       q14, d4, #15
+    vshll.s16       q15, d5, #15
     vadd.s32        q10, q14, q13
     vsub.s32        q14, q14, q13
-    vrshrn.s32      d4,  q10, #13
-    vrshrn.s32      d6,  q14, #13
+    vrshrn.s32      d4, q10, #13
+    vrshrn.s32      d6, q14, #13
     vadd.s32        q10, q15, q12
     vsub.s32        q14, q15, q12
-    vrshrn.s32      d5,  q10, #13
-    vrshrn.s32      d7,  q14, #13
-    vtrn.16         q2,  q3
-    vtrn.32         q3,  q5
+    vrshrn.s32      d5, q10, #13
+    vrshrn.s32      d7, q14, #13
+    vtrn.16         q2, q3
+    vtrn.32         q3, q5
 #endif
 
     /* Pass 2 */
@@ -1281,110 +1281,110 @@
 
 
 .macro do_load size
-    .if \size == 8
-        vld1.8  {d4}, [U, :64]!
-        vld1.8  {d5}, [V, :64]!
-        vld1.8  {d0}, [Y, :64]!
-        pld     [U, #64]
-        pld     [V, #64]
-        pld     [Y, #64]
-    .elseif \size == 4
-        vld1.8  {d4[0]}, [U]!
-        vld1.8  {d4[1]}, [U]!
-        vld1.8  {d4[2]}, [U]!
-        vld1.8  {d4[3]}, [U]!
-        vld1.8  {d5[0]}, [V]!
-        vld1.8  {d5[1]}, [V]!
-        vld1.8  {d5[2]}, [V]!
-        vld1.8  {d5[3]}, [V]!
-        vld1.8  {d0[0]}, [Y]!
-        vld1.8  {d0[1]}, [Y]!
-        vld1.8  {d0[2]}, [Y]!
-        vld1.8  {d0[3]}, [Y]!
-    .elseif \size == 2
-        vld1.8  {d4[4]}, [U]!
-        vld1.8  {d4[5]}, [U]!
-        vld1.8  {d5[4]}, [V]!
-        vld1.8  {d5[5]}, [V]!
-        vld1.8  {d0[4]}, [Y]!
-        vld1.8  {d0[5]}, [Y]!
-    .elseif \size == 1
-        vld1.8  {d4[6]}, [U]!
-        vld1.8  {d5[6]}, [V]!
-        vld1.8  {d0[6]}, [Y]!
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    vld1.8          {d4}, [U, :64]!
+    vld1.8          {d5}, [V, :64]!
+    vld1.8          {d0}, [Y, :64]!
+    pld             [U, #64]
+    pld             [V, #64]
+    pld             [Y, #64]
+  .elseif \size == 4
+    vld1.8          {d4[0]}, [U]!
+    vld1.8          {d4[1]}, [U]!
+    vld1.8          {d4[2]}, [U]!
+    vld1.8          {d4[3]}, [U]!
+    vld1.8          {d5[0]}, [V]!
+    vld1.8          {d5[1]}, [V]!
+    vld1.8          {d5[2]}, [V]!
+    vld1.8          {d5[3]}, [V]!
+    vld1.8          {d0[0]}, [Y]!
+    vld1.8          {d0[1]}, [Y]!
+    vld1.8          {d0[2]}, [Y]!
+    vld1.8          {d0[3]}, [Y]!
+  .elseif \size == 2
+    vld1.8          {d4[4]}, [U]!
+    vld1.8          {d4[5]}, [U]!
+    vld1.8          {d5[4]}, [V]!
+    vld1.8          {d5[5]}, [V]!
+    vld1.8          {d0[4]}, [Y]!
+    vld1.8          {d0[5]}, [Y]!
+  .elseif \size == 1
+    vld1.8          {d4[6]}, [U]!
+    vld1.8          {d5[6]}, [V]!
+    vld1.8          {d0[6]}, [Y]!
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 .macro do_store bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            vst3.8  {d10, d11, d12}, [RGB]!
-        .elseif \size == 4
-            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
-            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
-            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
-            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
-        .elseif \size == 2
-            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
-            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
-        .elseif \size == 1
-            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            vst4.8  {d10, d11, d12, d13}, [RGB]!
-        .elseif \size == 4
-            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-        .elseif \size == 2
-            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-        .elseif \size == 1
-            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 16
-        .if \size == 8
-            vst1.16  {q15}, [RGB]!
-        .elseif \size == 4
-            vst1.16  {d30}, [RGB]!
-        .elseif \size == 2
-            vst1.16  {d31[0]}, [RGB]!
-            vst1.16  {d31[1]}, [RGB]!
-        .elseif \size == 1
-            vst1.16  {d31[2]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
+  .if \bpp == 24
+    .if \size == 8
+      vst3.8        {d10, d11, d12}, [RGB]!
+    .elseif \size == 4
+      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
     .else
-        .error unsupported bpp
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vst4.8        {d10, d11, d12, d13}, [RGB]!
+    .elseif \size == 4
+      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 16
+    .if \size == 8
+      vst1.16       {q15}, [RGB]!
+    .elseif \size == 4
+      vst1.16       {d30}, [RGB]!
+    .elseif \size == 2
+      vst1.16       {d31[0]}, [RGB]!
+      vst1.16       {d31[1]}, [RGB]!
+    .elseif \size == 1
+      vst1.16       {d31[2]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 
 /*
- * 2 stage pipelined YCbCr->RGB conversion
+ * 2-stage pipelined YCbCr->RGB conversion
  */
 
 .macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5     /* q2 = v - 128 */
-    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
-    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
-    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
+    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
+    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
+    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
+    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
+    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
+    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
+    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
+    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb_stage2
@@ -1397,17 +1397,17 @@
     vaddw.u8        q11, q10, d0
     vaddw.u8        q12, q12, d0
     vaddw.u8        q14, q14, d0
-.if \bpp != 16
+  .if \bpp != 16
     vqmovun.s16     d1\g_offs, q11
     vqmovun.s16     d1\r_offs, q12
     vqmovun.s16     d1\b_offs, q14
-.else /* rgb565 */
+  .else  /* rgb565 */
     vqshlu.s16      q13, q11, #8
     vqshlu.s16      q15, q12, #8
     vqshlu.s16      q14, q14, #8
     vsri.u16        q15, q13, #5
     vsri.u16        q15, q14, #11
-.endif
+  .endif
 .endm
 
 .macro do_yuv_to_rgb_stage2_store_load_stage1
@@ -1423,27 +1423,27 @@
                                        vrshrn.s32      d28, q14, #14
     vld1.8          {d5}, [V, :64]!
                                        vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5     /* q2 = v - 128 */
+    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
+    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
                                        vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
+    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
                                        vaddw.u8        q12, q12, d0
                                        vaddw.u8        q14, q14, d0
-.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
                                        vqmovun.s16     d1\g_offs, q11
     pld             [Y, #64]
                                        vqmovun.s16     d1\r_offs, q12
     vld1.8          {d0}, [Y, :64]!
                                        vqmovun.s16     d1\b_offs, q14
-    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
+    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
                                        do_store        \bpp, 8
-    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
-.else /**************************** rgb565 ***********************************/
+    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
+    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
+    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
+    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
+  .else  /**************************** rgb565 ********************************/
                                        vqshlu.s16      q13, q11, #8
     pld             [Y, #64]
                                        vqshlu.s16      q15, q12, #8
@@ -1458,7 +1458,7 @@
     vmull.s16       q14, d6, d1[3]
                                        do_store        \bpp, 8
     vmull.s16       q15, d7, d1[3]
-.endif
+  .endif
 .endm
 
 .macro do_yuv_to_rgb
@@ -1472,10 +1472,10 @@
 
 .balign 16
 jsimd_ycc_\colorid\()_neon_consts:
-    .short          0,      0,     0,      0
-    .short          22971, -11277, -23401, 29033
-    .short          -128,  -128,   -128,   -128
-    .short          -128,  -128,   -128,   -128
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
 
 asm_function jsimd_ycc_\colorid\()_convert_neon
     OUTPUT_WIDTH    .req r0
@@ -1620,123 +1620,123 @@
  */
 
 .macro do_store size
-    .if \size == 8
-        vst1.8  {d20}, [Y]!
-        vst1.8  {d21}, [U]!
-        vst1.8  {d22}, [V]!
-    .elseif \size == 4
-        vst1.8  {d20[0]}, [Y]!
-        vst1.8  {d20[1]}, [Y]!
-        vst1.8  {d20[2]}, [Y]!
-        vst1.8  {d20[3]}, [Y]!
-        vst1.8  {d21[0]}, [U]!
-        vst1.8  {d21[1]}, [U]!
-        vst1.8  {d21[2]}, [U]!
-        vst1.8  {d21[3]}, [U]!
-        vst1.8  {d22[0]}, [V]!
-        vst1.8  {d22[1]}, [V]!
-        vst1.8  {d22[2]}, [V]!
-        vst1.8  {d22[3]}, [V]!
-    .elseif \size == 2
-        vst1.8  {d20[4]}, [Y]!
-        vst1.8  {d20[5]}, [Y]!
-        vst1.8  {d21[4]}, [U]!
-        vst1.8  {d21[5]}, [U]!
-        vst1.8  {d22[4]}, [V]!
-        vst1.8  {d22[5]}, [V]!
-    .elseif \size == 1
-        vst1.8  {d20[6]}, [Y]!
-        vst1.8  {d21[6]}, [U]!
-        vst1.8  {d22[6]}, [V]!
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 .macro do_load bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            vld3.8  {d10, d11, d12}, [RGB]!
-            pld     [RGB, #128]
-        .elseif \size == 4
-            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
-            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
-            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
-            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
-        .elseif \size == 2
-            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
-            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
-        .elseif \size == 1
-            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            vld4.8  {d10, d11, d12, d13}, [RGB]!
-            pld     [RGB, #128]
-        .elseif \size == 4
-            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-        .elseif \size == 2
-            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-        .elseif \size == 1
-            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
     .else
-        .error unsupported bpp
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 
 /*
- * 2 stage pipelined RGB->YCbCr conversion
+ * 2-stage pipelined RGB->YCbCr conversion
  */
 
 .macro do_rgb_to_yuv_stage1
-    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
-    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
-    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
-    vmull.u16   q7, d4, d0[0]
-    vmlal.u16   q7, d6, d0[1]
-    vmlal.u16   q7, d8, d0[2]
-    vmull.u16   q8, d5, d0[0]
-    vmlal.u16   q8, d7, d0[1]
-    vmlal.u16   q8, d9, d0[2]
-    vrev64.32   q9,  q1
-    vrev64.32   q13, q1
-    vmlsl.u16   q9,  d4, d0[3]
-    vmlsl.u16   q9,  d6, d1[0]
-    vmlal.u16   q9,  d8, d1[1]
-    vmlsl.u16   q13, d5, d0[3]
-    vmlsl.u16   q13, d7, d1[0]
-    vmlal.u16   q13, d9, d1[1]
-    vrev64.32   q14, q1
-    vrev64.32   q15, q1
-    vmlal.u16   q14, d4, d1[1]
-    vmlsl.u16   q14, d6, d1[2]
-    vmlsl.u16   q14, d8, d1[3]
-    vmlal.u16   q15, d5, d1[1]
-    vmlsl.u16   q15, d7, d1[2]
-    vmlsl.u16   q15, d9, d1[3]
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
 .endm
 
 .macro do_rgb_to_yuv_stage2
-    vrshrn.u32  d20, q7,  #16
-    vrshrn.u32  d21, q8,  #16
-    vshrn.u32   d22, q9,  #16
-    vshrn.u32   d23, q13, #16
-    vshrn.u32   d24, q14, #16
-    vshrn.u32   d25, q15, #16
-    vmovn.u16   d20, q10      /* d20 = y */
-    vmovn.u16   d21, q11      /* d21 = u */
-    vmovn.u16   d22, q12      /* d22 = v */
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
 .endm
 
 .macro do_rgb_to_yuv
@@ -1745,52 +1745,52 @@
 .endm
 
 .macro do_rgb_to_yuv_stage2_store_load_stage1
-      vrshrn.u32  d20, q7,  #16
-      vrshrn.u32  d21, q8,  #16
-      vshrn.u32   d22, q9,  #16
-    vrev64.32   q9,  q1
-      vshrn.u32   d23, q13, #16
-    vrev64.32   q13, q1
-      vshrn.u32   d24, q14, #16
-      vshrn.u32   d25, q15, #16
-    do_load     \bpp, 8
-      vmovn.u16   d20, q10      /* d20 = y */
-    vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
-      vmovn.u16   d21, q11      /* d21 = u */
-    vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
-      vmovn.u16   d22, q12      /* d22 = v */
-    vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
-    vmull.u16   q7, d4, d0[0]
-    vmlal.u16   q7, d6, d0[1]
-    vmlal.u16   q7, d8, d0[2]
-      vst1.8      {d20}, [Y]!
-    vmull.u16   q8, d5, d0[0]
-    vmlal.u16   q8, d7, d0[1]
-    vmlal.u16   q8, d9, d0[2]
-    vmlsl.u16   q9,  d4, d0[3]
-    vmlsl.u16   q9,  d6, d1[0]
-    vmlal.u16   q9,  d8, d1[1]
-      vst1.8      {d21}, [U]!
-    vmlsl.u16   q13, d5, d0[3]
-    vmlsl.u16   q13, d7, d1[0]
-    vmlal.u16   q13, d9, d1[1]
-    vrev64.32   q14, q1
-    vrev64.32   q15, q1
-    vmlal.u16   q14, d4, d1[1]
-    vmlsl.u16   q14, d6, d1[2]
-    vmlsl.u16   q14, d8, d1[3]
-      vst1.8      {d22}, [V]!
-    vmlal.u16   q15, d5, d1[1]
-    vmlsl.u16   q15, d7, d1[2]
-    vmlsl.u16   q15, d9, d1[3]
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
 .endm
 
 .balign 16
 jsimd_\colorid\()_ycc_neon_consts:
-    .short          19595, 38470, 7471,  11059
-    .short          21709, 32768, 27439, 5329
-    .short          32767, 128,   32767, 128
-    .short          32767, 128,   32767, 128
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
 
 asm_function jsimd_\colorid\()_ycc_convert_neon
     OUTPUT_WIDTH    .req r0
@@ -2000,10 +2000,10 @@
 
 .balign 16
 jsimd_fdct_ifast_neon_consts:
-    .short (98 * 128)              /* XFIX_0_382683433 */
-    .short (139 * 128)             /* XFIX_0_541196100 */
-    .short (181 * 128)             /* XFIX_0_707106781 */
-    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
 
 asm_function jsimd_fdct_ifast_neon
 
@@ -2040,52 +2040,52 @@
     /* Transpose */
     vtrn.16         q12, q13
     vtrn.16         q10, q11
-    vtrn.16         q8,  q9
+    vtrn.16         q8, q9
     vtrn.16         q14, q15
-    vtrn.32         q9,  q11
+    vtrn.32         q9, q11
     vtrn.32         q13, q15
-    vtrn.32         q8,  q10
+    vtrn.32         q8, q10
     vtrn.32         q12, q14
     vswp            d30, d23
     vswp            d24, d17
     vswp            d26, d19
       /* 1-D FDCT */
-      vadd.s16        q2,  q11, q12
+      vadd.s16        q2, q11, q12
     vswp            d28, d21
       vsub.s16        q12, q11, q12
-      vsub.s16        q6,  q10, q13
+      vsub.s16        q6, q10, q13
       vadd.s16        q10, q10, q13
-      vsub.s16        q7,  q9,  q14
-      vadd.s16        q9,  q9,  q14
-      vsub.s16        q1,  q8,  q15
-      vadd.s16        q8,  q8,  q15
-      vsub.s16        q4,  q9,  q10
-      vsub.s16        q5,  q8,  q2
-      vadd.s16        q3,  q9,  q10
-      vadd.s16        q4,  q4,  q5
-      vadd.s16        q2,  q8,  q2
-      vqdmulh.s16     q4,  q4,  XFIX_0_707106781
+      vsub.s16        q7, q9, q14
+      vadd.s16        q9, q9, q14
+      vsub.s16        q1, q8, q15
+      vadd.s16        q8, q8, q15
+      vsub.s16        q4, q9, q10
+      vsub.s16        q5, q8, q2
+      vadd.s16        q3, q9, q10
+      vadd.s16        q4, q4, q5
+      vadd.s16        q2, q8, q2
+      vqdmulh.s16     q4, q4, XFIX_0_707106781
       vadd.s16        q11, q12, q6
-      vadd.s16        q8,  q2,  q3
-      vsub.s16        q12, q2,  q3
-      vadd.s16        q3,  q6,  q7
-      vadd.s16        q7,  q7,  q1
-      vqdmulh.s16     q3,  q3,  XFIX_0_707106781
-      vsub.s16        q6,  q11, q7
-      vadd.s16        q10, q5,  q4
-      vqdmulh.s16     q6,  q6,  XFIX_0_382683433
-      vsub.s16        q14, q5,  q4
+      vadd.s16        q8, q2, q3
+      vsub.s16        q12, q2, q3
+      vadd.s16        q3, q6, q7
+      vadd.s16        q7, q7, q1
+      vqdmulh.s16     q3, q3, XFIX_0_707106781
+      vsub.s16        q6, q11, q7
+      vadd.s16        q10, q5, q4
+      vqdmulh.s16     q6, q6, XFIX_0_382683433
+      vsub.s16        q14, q5, q4
       vqdmulh.s16     q11, q11, XFIX_0_541196100
-      vqdmulh.s16     q5,  q7,  XFIX_1_306562965
-      vadd.s16        q4,  q1,  q3
-      vsub.s16        q3,  q1,  q3
-      vadd.s16        q7,  q7,  q6
+      vqdmulh.s16     q5, q7, XFIX_1_306562965
+      vadd.s16        q4, q1, q3
+      vsub.s16        q3, q1, q3
+      vadd.s16        q7, q7, q6
       vadd.s16        q11, q11, q6
-      vadd.s16        q7,  q7,  q5
-      vadd.s16        q13, q3,  q11
-      vsub.s16        q11, q3,  q11
-      vadd.s16        q9,  q4,  q7
-      vsub.s16        q15, q4,  q7
+      vadd.s16        q7, q7, q5
+      vadd.s16        q13, q3, q11
+      vsub.s16        q11, q3, q11
+      vadd.s16        q9, q4, q7
+      vsub.s16        q15, q4, q7
     subs            TMP, TMP, #1
     bne             1b
 
@@ -2134,22 +2134,22 @@
     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
     vabs.s16        q13, q1
     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10 /* add correction */
+    vadd.u16        q12, q12, q10  /* add correction */
     vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
     vmull.u16       q11, d25, d17
-    vmull.u16       q8,  d26, d18
-    vmull.u16       q9,  d27, d19
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
     vshrn.u32       d20, q10, #16
     vshrn.u32       d21, q11, #16
-    vshrn.u32       d22, q8,  #16
-    vshrn.u32       d23, q9,  #16
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
     vneg.s16        q12, q12
     vneg.s16        q13, q13
-    vshr.s16        q2,  q0,  #15 /* extract sign */
-    vshr.s16        q3,  q1,  #15
-    vshl.u16        q14, q10, q12 /* shift */
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
     vshl.u16        q15, q11, q13
 
     push            {r4, r5}
@@ -2162,25 +2162,25 @@
     vabs.s16        q13, q1
       veor.u16        q15, q15, q3
     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10 /* add correction */
+    vadd.u16        q12, q12, q10  /* add correction */
     vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
     vmull.u16       q11, d25, d17
-    vmull.u16       q8,  d26, d18
-    vmull.u16       q9,  d27, d19
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
       vsub.u16        q14, q14, q2
     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
       vsub.u16        q15, q15, q3
     vshrn.u32       d20, q10, #16
     vshrn.u32       d21, q11, #16
       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-    vshrn.u32       d22, q8,  #16
-    vshrn.u32       d23, q9,  #16
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
     vneg.s16        q12, q12
     vneg.s16        q13, q13
-    vshr.s16        q2,  q0,  #15 /* extract sign */
-    vshr.s16        q3,  q1,  #15
-    vshl.u16        q14, q10, q12 /* shift */
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
     vshl.u16        q15, q11, q13
     subs            LOOP_COUNT, LOOP_COUNT, #1
     bne             1b
@@ -2192,7 +2192,7 @@
       vsub.u16        q15, q15, q3
       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
 
-    bx              lr /* return */
+    bx              lr  /* return */
 
     .unreq          COEF_BLOCK
     .unreq          DIVISORS
@@ -2207,9 +2207,9 @@
 
 /*
  * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
- *                                 JDIMENSION   downsampled_width,
- *                                 JSAMPARRAY   input_data,
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ *                                 JDIMENSION downsampled_width,
+ *                                 JSAMPARRAY input_data,
  *                                 JSAMPARRAY * output_data_ptr);
  *
  * Note: the use of unaligned writes is the main remaining bottleneck in
@@ -2224,22 +2224,22 @@
  * Register d28 is used for multiplication by 3. Register q15 is used
  * for adding +1 bias.
  */
-.macro upsample16   OUTPTR, INPTR
+.macro upsample16 OUTPTR, INPTR
     vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-    vmov            q1,  q0       /* backup source pixels to q1 */
-    vrshrn.u16      d6,  q8,  #2
-    vrshrn.u16      d7,  q9,  #2
-    vshrn.u16       d8,  q10, #2
-    vshrn.u16       d9,  q11, #2
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vmov            q1, q0        /* backup source pixels to q1 */
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
 .endm
 
@@ -2250,39 +2250,39 @@
  * Also this unrolling allows to reorder loads and stores to compensate
  * multiplication latency and reduce stalls.
  */
-.macro upsample32   OUTPTR, INPTR
+.macro upsample32 OUTPTR, INPTR
     /* even 16 pixels group */
     vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-        /* odd 16 pixels group */
-        vld1.8          {q1}, [\INPTR]!
-    vrshrn.u16      d6,  q8,  #2
-    vrshrn.u16      d7,  q9,  #2
-    vshrn.u16       d8,  q10, #2
-    vshrn.u16       d9,  q11, #2
-        vmovl.u8        q8,  d2
-        vext.8          q2,  q0,  q1, #15
-        vmovl.u8        q9,  d3
-        vaddw.u8        q10, q15, d4
-        vaddw.u8        q11, q15, d5
-        vmlal.u8        q8,  d4,  d28
-        vmlal.u8        q9,  d5,  d28
-        vmlal.u8        q10, d2,  d28
-        vmlal.u8        q11, d3,  d28
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+      /* odd 16 pixels group */
+      vld1.8          {q1}, [\INPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+      vmovl.u8        q8, d2
+      vext.8          q2, q0, q1, #15
+      vmovl.u8        q9, d3
+      vaddw.u8        q10, q15, d4
+      vaddw.u8        q11, q15, d5
+      vmlal.u8        q8, d4, d28
+      vmlal.u8        q9, d5, d28
+      vmlal.u8        q10, d2, d28
+      vmlal.u8        q11, d3, d28
     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-        vrshrn.u16      d6,  q8,  #2
-        vrshrn.u16      d7,  q9,  #2
-        vshrn.u16       d8,  q10, #2
-        vshrn.u16       d9,  q11, #2
-        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+      vrshrn.u16      d6, q8, #2
+      vrshrn.u16      d7, q9, #2
+      vshrn.u16       d8, q10, #2
+      vshrn.u16       d9, q11, #2
+      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
 .endm
 
 /*
@@ -2343,21 +2343,21 @@
 2:
     tst             \WIDTH, #8
     beq             2f
-    vmov            d1,  d0
+    vmov            d1, d0
     sub             \INPTR, \INPTR, #8
     vld1.8          {d0}, [\INPTR]
 2:  /* upsample the remaining pixels */
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-    vrshrn.u16      d10, q8,  #2
-    vrshrn.u16      d12, q9,  #2
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vrshrn.u16      d10, q8, #2
+    vrshrn.u16      d12, q9, #2
     vshrn.u16       d11, q10, #2
     vshrn.u16       d13, q11, #2
     vzip.8          d10, d11
@@ -2366,12 +2366,12 @@
     tst             \WIDTH, #8
     beq             2f
     vst1.8          {d10, d11}, [\OUTPTR]!
-    vmov            q5,  q6
+    vmov            q5, q6
 2:
     tst             \WIDTH, #4
     beq             2f
     vst1.8          {d10}, [\OUTPTR]!
-    vmov            d10,  d11
+    vmov            d10, d11
 2:
     tst             \WIDTH, #2
     beq             2f
@@ -2435,11 +2435,11 @@
     .unreq          WIDTH
     .unreq          TMP
 
-
 .purgem upsample16
 .purgem upsample32
 .purgem upsample_row
 
+
 /*****************************************************************************/
 
 /*
@@ -2451,424 +2451,426 @@
  */
 
 .macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-    sub \PUT_BITS, \PUT_BITS, #0x8
-    lsr \TMP, \PUT_BUFFER, \PUT_BITS
-    uxtb \TMP, \TMP
-    strb \TMP, [\BUFFER, #1]!
-    cmp \TMP, #0xff
+    sub             \PUT_BITS, \PUT_BITS, #0x8
+    lsr             \TMP, \PUT_BUFFER, \PUT_BITS
+    uxtb            \TMP, \TMP
+    strb            \TMP, [\BUFFER, #1]!
+    cmp             \TMP, #0xff
     /*it eq*/
-    streqb \ZERO, [\BUFFER, #1]!
+    streqb          \ZERO, [\BUFFER, #1]!
 .endm
+
 .macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
-    /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
-    add \PUT_BITS, \SIZE
-    /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
-    orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+    /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
+    add             \PUT_BITS, \SIZE
+    /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
+    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
 .endm
+
 .macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-  cmp \PUT_BITS, #0x10
-  blt 15f
-    eor \ZERO, \ZERO, \ZERO
-    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+  cmp               \PUT_BITS, #0x10
+  blt               15f
+    eor               \ZERO, \ZERO, \ZERO
+    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
 15:
 .endm
 
 .balign 16
 jsimd_huff_encode_one_block_neon_consts:
-    .byte 0x01
-    .byte 0x02
-    .byte 0x04
-    .byte 0x08
-    .byte 0x10
-    .byte 0x20
-    .byte 0x40
-    .byte 0x80
+  .byte 0x01
+  .byte 0x02
+  .byte 0x04
+  .byte 0x08
+  .byte 0x10
+  .byte 0x20
+  .byte 0x40
+  .byte 0x80
 
 asm_function jsimd_huff_encode_one_block_neon
-    push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-    add r7, sp, #0x1c
-    sub r4, sp, #0x40
-    bfc r4, #0, #5
-    mov sp, r4 /* align sp on 32 bytes */
-    vst1.64 {d8, d9, d10, d11}, [r4, :128]!
-    vst1.64 {d12, d13, d14, d15}, [r4, :128]
-    sub sp, #0x140 /* reserve 320 bytes */
-    str r0, [sp, #0x18] /* working state > sp + Ox18 */
-    add r4, sp, #0x20   /* r4 = t1 */
-    ldr lr, [r7, #0x8]  /* lr = dctbl */
-    sub r10, r1, #0x1   /* r10=buffer-- */
-    ldrsh r1, [r2]
-    mov r9, #0x10
-    mov r8, #0x1
-    adr r5, jsimd_huff_encode_one_block_neon_consts
+    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+    add             r7, sp, #0x1c
+    sub             r4, sp, #0x40
+    bfc             r4, #0, #5
+    mov             sp, r4           /* align sp on 32 bytes */
+    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
+    vst1.64         {d12, d13, d14, d15}, [r4, :128]
+    sub             sp, #0x140       /* reserve 320 bytes */
+    str             r0, [sp, #0x18]  /* working state > sp + 0x18 */
+    add             r4, sp, #0x20    /* r4 = t1 */
+    ldr             lr, [r7, #0x8]   /* lr = dctbl */
+    sub             r10, r1, #0x1    /* r10=buffer-- */
+    ldrsh           r1, [r2]
+    mov             r9, #0x10
+    mov             r8, #0x1
+    adr             r5, jsimd_huff_encode_one_block_neon_consts
     /* prepare data */
-    vld1.8 {d26}, [r5, :64]
-    veor q8, q8, q8
-    veor q9, q9, q9
-    vdup.16 q14, r9
-    vdup.16 q15, r8
-    veor q10, q10, q10
-    veor q11, q11, q11
-    sub r1, r1, r3
-    add r9, r2, #0x22
-    add r8, r2, #0x18
-    add r3, r2, #0x36
-    vmov.16 d0[0], r1
-    vld1.16 {d2[0]}, [r9, :16]
-    vld1.16 {d4[0]}, [r8, :16]
-    vld1.16 {d6[0]}, [r3, :16]
-    add r1, r2, #0x2
-    add r9, r2, #0x30
-    add r8, r2, #0x26
-    add r3, r2, #0x28
-    vld1.16 {d0[1]}, [r1, :16]
-    vld1.16 {d2[1]}, [r9, :16]
-    vld1.16 {d4[1]}, [r8, :16]
-    vld1.16 {d6[1]}, [r3, :16]
-    add r1, r2, #0x10
-    add r9, r2, #0x40
-    add r8, r2, #0x34
-    add r3, r2, #0x1a
-    vld1.16 {d0[2]}, [r1, :16]
-    vld1.16 {d2[2]}, [r9, :16]
-    vld1.16 {d4[2]}, [r8, :16]
-    vld1.16 {d6[2]}, [r3, :16]
-    add r1, r2, #0x20
-    add r9, r2, #0x32
-    add r8, r2, #0x42
-    add r3, r2, #0xc
-    vld1.16 {d0[3]}, [r1, :16]
-    vld1.16 {d2[3]}, [r9, :16]
-    vld1.16 {d4[3]}, [r8, :16]
-    vld1.16 {d6[3]}, [r3, :16]
-    add r1, r2, #0x12
-    add r9, r2, #0x24
-    add r8, r2, #0x50
-    add r3, r2, #0xe
-    vld1.16 {d1[0]}, [r1, :16]
-    vld1.16 {d3[0]}, [r9, :16]
-    vld1.16 {d5[0]}, [r8, :16]
-    vld1.16 {d7[0]}, [r3, :16]
-    add r1, r2, #0x4
-    add r9, r2, #0x16
-    add r8, r2, #0x60
-    add r3, r2, #0x1c
-    vld1.16 {d1[1]}, [r1, :16]
-    vld1.16 {d3[1]}, [r9, :16]
-    vld1.16 {d5[1]}, [r8, :16]
-    vld1.16 {d7[1]}, [r3, :16]
-    add r1, r2, #0x6
-    add r9, r2, #0x8
-    add r8, r2, #0x52
-    add r3, r2, #0x2a
-    vld1.16 {d1[2]}, [r1, :16]
-    vld1.16 {d3[2]}, [r9, :16]
-    vld1.16 {d5[2]}, [r8, :16]
-    vld1.16 {d7[2]}, [r3, :16]
-    add r1, r2, #0x14
-    add r9, r2, #0xa
-    add r8, r2, #0x44
-    add r3, r2, #0x38
-    vld1.16 {d1[3]}, [r1, :16]
-    vld1.16 {d3[3]}, [r9, :16]
-    vld1.16 {d5[3]}, [r8, :16]
-    vld1.16 {d7[3]}, [r3, :16]
-    vcgt.s16 q8, q8, q0
-    vcgt.s16 q9, q9, q1
-    vcgt.s16 q10, q10, q2
-    vcgt.s16 q11, q11, q3
-    vabs.s16 q0, q0
-    vabs.s16 q1, q1
-    vabs.s16 q2, q2
-    vabs.s16 q3, q3
-    veor q8, q8, q0
-    veor q9, q9, q1
-    veor q10, q10, q2
-    veor q11, q11, q3
-    add r9, r4, #0x20
-    add r8, r4, #0x80
-    add r3, r4, #0xa0
-    vclz.i16 q0, q0
-    vclz.i16 q1, q1
-    vclz.i16 q2, q2
-    vclz.i16 q3, q3
-    vsub.i16 q0, q14, q0
-    vsub.i16 q1, q14, q1
-    vsub.i16 q2, q14, q2
-    vsub.i16 q3, q14, q3
-    vst1.16 {d0, d1, d2, d3}, [r4, :256]
-    vst1.16 {d4, d5, d6, d7}, [r9, :256]
-    vshl.s16 q0, q15, q0
-    vshl.s16 q1, q15, q1
-    vshl.s16 q2, q15, q2
-    vshl.s16 q3, q15, q3
-    vsub.i16 q0, q0, q15
-    vsub.i16 q1, q1, q15
-    vsub.i16 q2, q2, q15
-    vsub.i16 q3, q3, q15
-    vand q8, q8, q0
-    vand q9, q9, q1
-    vand q10, q10, q2
-    vand q11, q11, q3
-    vst1.16 {d16, d17, d18, d19}, [r8, :256]
-    vst1.16 {d20, d21, d22, d23}, [r3, :256]
-    add r1, r2, #0x46
-    add r9, r2, #0x3a
-    add r8, r2, #0x74
-    add r3, r2, #0x6a
-    vld1.16 {d8[0]}, [r1, :16]
-    vld1.16 {d10[0]}, [r9, :16]
-    vld1.16 {d12[0]}, [r8, :16]
-    vld1.16 {d14[0]}, [r3, :16]
-    veor q8, q8, q8
-    veor q9, q9, q9
-    veor q10, q10, q10
-    veor q11, q11, q11
-    add r1, r2, #0x54
-    add r9, r2, #0x2c
-    add r8, r2, #0x76
-    add r3, r2, #0x78
-    vld1.16 {d8[1]}, [r1, :16]
-    vld1.16 {d10[1]}, [r9, :16]
-    vld1.16 {d12[1]}, [r8, :16]
-    vld1.16 {d14[1]}, [r3, :16]
-    add r1, r2, #0x62
-    add r9, r2, #0x1e
-    add r8, r2, #0x68
-    add r3, r2, #0x7a
-    vld1.16 {d8[2]}, [r1, :16]
-    vld1.16 {d10[2]}, [r9, :16]
-    vld1.16 {d12[2]}, [r8, :16]
-    vld1.16 {d14[2]}, [r3, :16]
-    add r1, r2, #0x70
-    add r9, r2, #0x2e
-    add r8, r2, #0x5a
-    add r3, r2, #0x6c
-    vld1.16 {d8[3]}, [r1, :16]
-    vld1.16 {d10[3]}, [r9, :16]
-    vld1.16 {d12[3]}, [r8, :16]
-    vld1.16 {d14[3]}, [r3, :16]
-    add r1, r2, #0x72
-    add r9, r2, #0x3c
-    add r8, r2, #0x4c
-    add r3, r2, #0x5e
-    vld1.16 {d9[0]}, [r1, :16]
-    vld1.16 {d11[0]}, [r9, :16]
-    vld1.16 {d13[0]}, [r8, :16]
-    vld1.16 {d15[0]}, [r3, :16]
-    add r1, r2, #0x64
-    add r9, r2, #0x4a
-    add r8, r2, #0x3e
-    add r3, r2, #0x6e
-    vld1.16 {d9[1]}, [r1, :16]
-    vld1.16 {d11[1]}, [r9, :16]
-    vld1.16 {d13[1]}, [r8, :16]
-    vld1.16 {d15[1]}, [r3, :16]
-    add r1, r2, #0x56
-    add r9, r2, #0x58
-    add r8, r2, #0x4e
-    add r3, r2, #0x7c
-    vld1.16 {d9[2]}, [r1, :16]
-    vld1.16 {d11[2]}, [r9, :16]
-    vld1.16 {d13[2]}, [r8, :16]
-    vld1.16 {d15[2]}, [r3, :16]
-    add r1, r2, #0x48
-    add r9, r2, #0x66
-    add r8, r2, #0x5c
-    add r3, r2, #0x7e
-    vld1.16 {d9[3]}, [r1, :16]
-    vld1.16 {d11[3]}, [r9, :16]
-    vld1.16 {d13[3]}, [r8, :16]
-    vld1.16 {d15[3]}, [r3, :16]
-    vcgt.s16 q8, q8, q4
-    vcgt.s16 q9, q9, q5
-    vcgt.s16 q10, q10, q6
-    vcgt.s16 q11, q11, q7
-    vabs.s16 q4, q4
-    vabs.s16 q5, q5
-    vabs.s16 q6, q6
-    vabs.s16 q7, q7
-    veor q8, q8, q4
-    veor q9, q9, q5
-    veor q10, q10, q6
-    veor q11, q11, q7
-    add r1, r4, #0x40
-    add r9, r4, #0x60
-    add r8, r4, #0xc0
-    add r3, r4, #0xe0
-    vclz.i16 q4, q4
-    vclz.i16 q5, q5
-    vclz.i16 q6, q6
-    vclz.i16 q7, q7
-    vsub.i16 q4, q14, q4
-    vsub.i16 q5, q14, q5
-    vsub.i16 q6, q14, q6
-    vsub.i16 q7, q14, q7
-    vst1.16 {d8, d9, d10, d11}, [r1, :256]
-    vst1.16 {d12, d13, d14, d15}, [r9, :256]
-    vshl.s16 q4, q15, q4
-    vshl.s16 q5, q15, q5
-    vshl.s16 q6, q15, q6
-    vshl.s16 q7, q15, q7
-    vsub.i16 q4, q4, q15
-    vsub.i16 q5, q5, q15
-    vsub.i16 q6, q6, q15
-    vsub.i16 q7, q7, q15
-    vand q8, q8, q4
-    vand q9, q9, q5
-    vand q10, q10, q6
-    vand q11, q11, q7
-    vst1.16 {d16, d17, d18, d19}, [r8, :256]
-    vst1.16 {d20, d21, d22, d23}, [r3, :256]
-    ldr r12, [r7, #0xc]  /* r12 = actbl */
-    add r1, lr, #0x400   /* r1 = dctbl->ehufsi */
-    mov r9, r12          /* r9 = actbl */
-    add r6, r4, #0x80    /* r6 = t2 */
-    ldr r11, [r0, #0x8]  /* r11 = put_buffer */
-    ldr r4, [r0, #0xc]   /* r4  = put_bits */
-    ldrh r2, [r6, #-128] /* r2  = nbits */
-    ldrh r3, [r6]        /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
-    ldr r0, [lr, r2, lsl #2]
-    ldrb r5, [r1, r2]
-    put_bits r11, r4, r0, r5
-    checkbuf15 r10, r11, r4, r5, r0
-    put_bits r11, r4, r3, r2
-    checkbuf15 r10, r11, r4, r5, r0
-    mov lr, r6            /* lr = t2 */
-    add r5, r9, #0x400    /* r5 = actbl->ehufsi */
-    ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
-    veor q8, q8, q8
-    vceq.i16 q0, q0, q8
-    vceq.i16 q1, q1, q8
-    vceq.i16 q2, q2, q8
-    vceq.i16 q3, q3, q8
-    vceq.i16 q4, q4, q8
-    vceq.i16 q5, q5, q8
-    vceq.i16 q6, q6, q8
-    vceq.i16 q7, q7, q8
-    vmovn.i16 d0, q0
-    vmovn.i16 d2, q1
-    vmovn.i16 d4, q2
-    vmovn.i16 d6, q3
-    vmovn.i16 d8, q4
-    vmovn.i16 d10, q5
-    vmovn.i16 d12, q6
-    vmovn.i16 d14, q7
-    vand d0, d0, d26
-    vand d2, d2, d26
-    vand d4, d4, d26
-    vand d6, d6, d26
-    vand d8, d8, d26
-    vand d10, d10, d26
-    vand d12, d12, d26
-    vand d14, d14, d26
-    vpadd.i8 d0, d0, d2
-    vpadd.i8 d4, d4, d6
-    vpadd.i8 d8, d8, d10
-    vpadd.i8 d12, d12, d14
-    vpadd.i8 d0, d0, d4
-    vpadd.i8 d8, d8, d12
-    vpadd.i8 d0, d0, d8
-    vmov.32 r1, d0[1]
-    vmov.32 r8, d0[0]
-    mvn r1, r1
-    mvn r8, r8
-    lsrs r1, r1, #0x1
-    rrx r8, r8  /* shift in last r1 bit while shifting out DC bit */
-    rbit r1, r1 /* r1 = index1 */
-    rbit r8, r8 /* r8 = index0 */
-    ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
-    str r1, [sp, #0x14]  /* index1 > sp + 0x14 */
-    cmp r8, #0x0
-    beq 6f
+    vld1.8          {d26}, [r5, :64]
+    veor            q8, q8, q8
+    veor            q9, q9, q9
+    vdup.16         q14, r9
+    vdup.16         q15, r8
+    veor            q10, q10, q10
+    veor            q11, q11, q11
+    sub             r1, r1, r3
+    add             r9, r2, #0x22
+    add             r8, r2, #0x18
+    add             r3, r2, #0x36
+    vmov.16         d0[0], r1
+    vld1.16         {d2[0]}, [r9, :16]
+    vld1.16         {d4[0]}, [r8, :16]
+    vld1.16         {d6[0]}, [r3, :16]
+    add             r1, r2, #0x2
+    add             r9, r2, #0x30
+    add             r8, r2, #0x26
+    add             r3, r2, #0x28
+    vld1.16         {d0[1]}, [r1, :16]
+    vld1.16         {d2[1]}, [r9, :16]
+    vld1.16         {d4[1]}, [r8, :16]
+    vld1.16         {d6[1]}, [r3, :16]
+    add             r1, r2, #0x10
+    add             r9, r2, #0x40
+    add             r8, r2, #0x34
+    add             r3, r2, #0x1a
+    vld1.16         {d0[2]}, [r1, :16]
+    vld1.16         {d2[2]}, [r9, :16]
+    vld1.16         {d4[2]}, [r8, :16]
+    vld1.16         {d6[2]}, [r3, :16]
+    add             r1, r2, #0x20
+    add             r9, r2, #0x32
+    add             r8, r2, #0x42
+    add             r3, r2, #0xc
+    vld1.16         {d0[3]}, [r1, :16]
+    vld1.16         {d2[3]}, [r9, :16]
+    vld1.16         {d4[3]}, [r8, :16]
+    vld1.16         {d6[3]}, [r3, :16]
+    add             r1, r2, #0x12
+    add             r9, r2, #0x24
+    add             r8, r2, #0x50
+    add             r3, r2, #0xe
+    vld1.16         {d1[0]}, [r1, :16]
+    vld1.16         {d3[0]}, [r9, :16]
+    vld1.16         {d5[0]}, [r8, :16]
+    vld1.16         {d7[0]}, [r3, :16]
+    add             r1, r2, #0x4
+    add             r9, r2, #0x16
+    add             r8, r2, #0x60
+    add             r3, r2, #0x1c
+    vld1.16         {d1[1]}, [r1, :16]
+    vld1.16         {d3[1]}, [r9, :16]
+    vld1.16         {d5[1]}, [r8, :16]
+    vld1.16         {d7[1]}, [r3, :16]
+    add             r1, r2, #0x6
+    add             r9, r2, #0x8
+    add             r8, r2, #0x52
+    add             r3, r2, #0x2a
+    vld1.16         {d1[2]}, [r1, :16]
+    vld1.16         {d3[2]}, [r9, :16]
+    vld1.16         {d5[2]}, [r8, :16]
+    vld1.16         {d7[2]}, [r3, :16]
+    add             r1, r2, #0x14
+    add             r9, r2, #0xa
+    add             r8, r2, #0x44
+    add             r3, r2, #0x38
+    vld1.16         {d1[3]}, [r1, :16]
+    vld1.16         {d3[3]}, [r9, :16]
+    vld1.16         {d5[3]}, [r8, :16]
+    vld1.16         {d7[3]}, [r3, :16]
+    vcgt.s16        q8, q8, q0
+    vcgt.s16        q9, q9, q1
+    vcgt.s16        q10, q10, q2
+    vcgt.s16        q11, q11, q3
+    vabs.s16        q0, q0
+    vabs.s16        q1, q1
+    vabs.s16        q2, q2
+    vabs.s16        q3, q3
+    veor            q8, q8, q0
+    veor            q9, q9, q1
+    veor            q10, q10, q2
+    veor            q11, q11, q3
+    add             r9, r4, #0x20
+    add             r8, r4, #0x80
+    add             r3, r4, #0xa0
+    vclz.i16        q0, q0
+    vclz.i16        q1, q1
+    vclz.i16        q2, q2
+    vclz.i16        q3, q3
+    vsub.i16        q0, q14, q0
+    vsub.i16        q1, q14, q1
+    vsub.i16        q2, q14, q2
+    vsub.i16        q3, q14, q3
+    vst1.16         {d0, d1, d2, d3}, [r4, :256]
+    vst1.16         {d4, d5, d6, d7}, [r9, :256]
+    vshl.s16        q0, q15, q0
+    vshl.s16        q1, q15, q1
+    vshl.s16        q2, q15, q2
+    vshl.s16        q3, q15, q3
+    vsub.i16        q0, q0, q15
+    vsub.i16        q1, q1, q15
+    vsub.i16        q2, q2, q15
+    vsub.i16        q3, q3, q15
+    vand            q8, q8, q0
+    vand            q9, q9, q1
+    vand            q10, q10, q2
+    vand            q11, q11, q3
+    vst1.16         {d16, d17, d18, d19}, [r8, :256]
+    vst1.16         {d20, d21, d22, d23}, [r3, :256]
+    add             r1, r2, #0x46
+    add             r9, r2, #0x3a
+    add             r8, r2, #0x74
+    add             r3, r2, #0x6a
+    vld1.16         {d8[0]}, [r1, :16]
+    vld1.16         {d10[0]}, [r9, :16]
+    vld1.16         {d12[0]}, [r8, :16]
+    vld1.16         {d14[0]}, [r3, :16]
+    veor            q8, q8, q8
+    veor            q9, q9, q9
+    veor            q10, q10, q10
+    veor            q11, q11, q11
+    add             r1, r2, #0x54
+    add             r9, r2, #0x2c
+    add             r8, r2, #0x76
+    add             r3, r2, #0x78
+    vld1.16         {d8[1]}, [r1, :16]
+    vld1.16         {d10[1]}, [r9, :16]
+    vld1.16         {d12[1]}, [r8, :16]
+    vld1.16         {d14[1]}, [r3, :16]
+    add             r1, r2, #0x62
+    add             r9, r2, #0x1e
+    add             r8, r2, #0x68
+    add             r3, r2, #0x7a
+    vld1.16         {d8[2]}, [r1, :16]
+    vld1.16         {d10[2]}, [r9, :16]
+    vld1.16         {d12[2]}, [r8, :16]
+    vld1.16         {d14[2]}, [r3, :16]
+    add             r1, r2, #0x70
+    add             r9, r2, #0x2e
+    add             r8, r2, #0x5a
+    add             r3, r2, #0x6c
+    vld1.16         {d8[3]}, [r1, :16]
+    vld1.16         {d10[3]}, [r9, :16]
+    vld1.16         {d12[3]}, [r8, :16]
+    vld1.16         {d14[3]}, [r3, :16]
+    add             r1, r2, #0x72
+    add             r9, r2, #0x3c
+    add             r8, r2, #0x4c
+    add             r3, r2, #0x5e
+    vld1.16         {d9[0]}, [r1, :16]
+    vld1.16         {d11[0]}, [r9, :16]
+    vld1.16         {d13[0]}, [r8, :16]
+    vld1.16         {d15[0]}, [r3, :16]
+    add             r1, r2, #0x64
+    add             r9, r2, #0x4a
+    add             r8, r2, #0x3e
+    add             r3, r2, #0x6e
+    vld1.16         {d9[1]}, [r1, :16]
+    vld1.16         {d11[1]}, [r9, :16]
+    vld1.16         {d13[1]}, [r8, :16]
+    vld1.16         {d15[1]}, [r3, :16]
+    add             r1, r2, #0x56
+    add             r9, r2, #0x58
+    add             r8, r2, #0x4e
+    add             r3, r2, #0x7c
+    vld1.16         {d9[2]}, [r1, :16]
+    vld1.16         {d11[2]}, [r9, :16]
+    vld1.16         {d13[2]}, [r8, :16]
+    vld1.16         {d15[2]}, [r3, :16]
+    add             r1, r2, #0x48
+    add             r9, r2, #0x66
+    add             r8, r2, #0x5c
+    add             r3, r2, #0x7e
+    vld1.16         {d9[3]}, [r1, :16]
+    vld1.16         {d11[3]}, [r9, :16]
+    vld1.16         {d13[3]}, [r8, :16]
+    vld1.16         {d15[3]}, [r3, :16]
+    vcgt.s16        q8, q8, q4
+    vcgt.s16        q9, q9, q5
+    vcgt.s16        q10, q10, q6
+    vcgt.s16        q11, q11, q7
+    vabs.s16        q4, q4
+    vabs.s16        q5, q5
+    vabs.s16        q6, q6
+    vabs.s16        q7, q7
+    veor            q8, q8, q4
+    veor            q9, q9, q5
+    veor            q10, q10, q6
+    veor            q11, q11, q7
+    add             r1, r4, #0x40
+    add             r9, r4, #0x60
+    add             r8, r4, #0xc0
+    add             r3, r4, #0xe0
+    vclz.i16        q4, q4
+    vclz.i16        q5, q5
+    vclz.i16        q6, q6
+    vclz.i16        q7, q7
+    vsub.i16        q4, q14, q4
+    vsub.i16        q5, q14, q5
+    vsub.i16        q6, q14, q6
+    vsub.i16        q7, q14, q7
+    vst1.16         {d8, d9, d10, d11}, [r1, :256]
+    vst1.16         {d12, d13, d14, d15}, [r9, :256]
+    vshl.s16        q4, q15, q4
+    vshl.s16        q5, q15, q5
+    vshl.s16        q6, q15, q6
+    vshl.s16        q7, q15, q7
+    vsub.i16        q4, q4, q15
+    vsub.i16        q5, q5, q15
+    vsub.i16        q6, q6, q15
+    vsub.i16        q7, q7, q15
+    vand            q8, q8, q4
+    vand            q9, q9, q5
+    vand            q10, q10, q6
+    vand            q11, q11, q7
+    vst1.16         {d16, d17, d18, d19}, [r8, :256]
+    vst1.16         {d20, d21, d22, d23}, [r3, :256]
+    ldr             r12, [r7, #0xc]       /* r12 = actbl */
+    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
+    mov             r9, r12               /* r9 = actbl */
+    add             r6, r4, #0x80         /* r6 = t2 */
+    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
+    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
+    ldrh            r2, [r6, #-128]       /* r2  = nbits */
+    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
+    ldr             r0, [lr, r2, lsl #2]
+    ldrb            r5, [r1, r2]
+    put_bits        r11, r4, r0, r5
+    checkbuf15      r10, r11, r4, r5, r0
+    put_bits        r11, r4, r3, r2
+    checkbuf15      r10, r11, r4, r5, r0
+    mov             lr, r6                /* lr = t2 */
+    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
+    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
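+    /* Build a 64-bit bitmap of nonzero coefficients: compare every value
+     * against zero (q8 is zeroed below), narrow to bytes, AND with the
+     * per-lane bit masks presumably held in d26, and fold with pairwise
+     * adds until two 32-bit words remain.  Inverting, shifting out the DC
+     * bit, and bit-reversing those words yields index0/index1 for the
+     * clz-driven scans that follow. */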
+    veor            q8, q8, q8
+    vceq.i16        q0, q0, q8
+    vceq.i16        q1, q1, q8
+    vceq.i16        q2, q2, q8
+    vceq.i16        q3, q3, q8
+    vceq.i16        q4, q4, q8
+    vceq.i16        q5, q5, q8
+    vceq.i16        q6, q6, q8
+    vceq.i16        q7, q7, q8
+    vmovn.i16       d0, q0
+    vmovn.i16       d2, q1
+    vmovn.i16       d4, q2
+    vmovn.i16       d6, q3
+    vmovn.i16       d8, q4
+    vmovn.i16       d10, q5
+    vmovn.i16       d12, q6
+    vmovn.i16       d14, q7
+    vand            d0, d0, d26
+    vand            d2, d2, d26
+    vand            d4, d4, d26
+    vand            d6, d6, d26
+    vand            d8, d8, d26
+    vand            d10, d10, d26
+    vand            d12, d12, d26
+    vand            d14, d14, d26
+    vpadd.i8        d0, d0, d2
+    vpadd.i8        d4, d4, d6
+    vpadd.i8        d8, d8, d10
+    vpadd.i8        d12, d12, d14
+    vpadd.i8        d0, d0, d4
+    vpadd.i8        d8, d8, d12
+    vpadd.i8        d0, d0, d8
+    vmov.32         r1, d0[1]
+    vmov.32         r8, d0[0]
+    mvn             r1, r1
+    mvn             r8, r8
+    lsrs            r1, r1, #0x1
+    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
+    rbit            r1, r1            /* r1 = index1 */
+    rbit            r8, r8            /* r8 = index0 */
+    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
+    str             r1, [sp, #0x14]   /* index1 -> sp + 0x14 */
+    cmp             r8, #0x0
+    beq             6f
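+    /* Encode the lower half of the AC coefficients: 1: uses clz on the
+     * index0 bitmap to find the next nonzero value and advances lr (= t2),
+     * 2: emits the ZRL code cached in r0/r6 for each run of 16 zeroes, and
+     * 3: emits actbl->ehufco/ehufsi for (run << 4) + nbits followed by the
+     * value bits (put_bits presumably takes put_buffer, put_bits, code,
+     * size). */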
 1:
-    clz r2, r8
-    add lr, lr, r2, lsl #1
-    lsl r8, r8, r2
-    ldrh r1, [lr, #-126]
+    clz             r2, r8
+    add             lr, lr, r2, lsl #1
+    lsl             r8, r8, r2
+    ldrh            r1, [lr, #-126]
 2:
-    cmp r2, #0x10
-    blt 3f
-    sub r2, r2, #0x10
-    put_bits r11, r4, r0, r6
-    cmp r4, #0x10
-    blt 2b
-    eor r3, r3, r3
-    emit_byte r10, r11, r4, r3, r12
-    emit_byte r10, r11, r4, r3, r12
-    b 2b
+    cmp             r2, #0x10
+    blt             3f
+    sub             r2, r2, #0x10
+    put_bits        r11, r4, r0, r6
+    cmp             r4, #0x10
+    blt             2b
+    eor             r3, r3, r3
+    emit_byte       r10, r11, r4, r3, r12
+    emit_byte       r10, r11, r4, r3, r12
+    b               2b
 3:
-    add r2, r1, r2, lsl #4
-    ldrh r3, [lr, #2]!
-    ldr r12, [r9, r2, lsl #2]
-    ldrb r2, [r5, r2]
-    put_bits r11, r4, r12, r2
-    checkbuf15 r10, r11, r4, r2, r12
-    put_bits r11, r4, r3, r1
-    checkbuf15 r10, r11, r4, r2, r12
-    lsls r8, r8, #0x1
-    bne 1b
+    add             r2, r1, r2, lsl #4
+    ldrh            r3, [lr, #2]!
+    ldr             r12, [r9, r2, lsl #2]
+    ldrb            r2, [r5, r2]
+    put_bits        r11, r4, r12, r2
+    checkbuf15      r10, r11, r4, r2, r12
+    put_bits        r11, r4, r3, r1
+    checkbuf15      r10, r11, r4, r2, r12
+    lsls            r8, r8, #0x1
+    bne             1b
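+    /* The same scan is repeated below for the upper half of the block,
+     * restarting from index1 (reloaded from sp + 0x14) and resynchronizing
+     * lr with the position of the first set bit. */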
 6:
-    add r12, sp, #0x20   /* r12 = t1 */
-    ldr r8,[sp, #0x14]   /* r8 = index1 */
-    adds r12, #0xc0      /* r12 = t2 + (DCTSIZE2/2) */
-    cmp r8, #0x0
-    beq 6f
-    clz r2, r8
-    sub r12, r12, lr
-    lsl r8, r8, r2
-    add r2, r2, r12, lsr #1
-    add lr, lr, r2, lsl #1
-    b 7f
+    add             r12, sp, #0x20   /* r12 = t1 */
+    ldr             r8, [sp, #0x14]  /* r8 = index1 */
+    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
+    cmp             r8, #0x0
+    beq             6f
+    clz             r2, r8
+    sub             r12, r12, lr
+    lsl             r8, r8, r2
+    add             r2, r2, r12, lsr #1
+    add             lr, lr, r2, lsl #1
+    b               7f
 1:
-    clz r2, r8
-    add lr, lr, r2, lsl #1
-    lsl r8, r8, r2
+    clz             r2, r8
+    add             lr, lr, r2, lsl #1
+    lsl             r8, r8, r2
 7:
-    ldrh r1, [lr, #-126]
+    ldrh            r1, [lr, #-126]
 2:
-    cmp r2, #0x10
-    blt 3f
-    sub r2, r2, #0x10
-    put_bits r11, r4, r0, r6
-    cmp r4, #0x10
-    blt 2b
-    eor r3, r3, r3
-    emit_byte r10, r11, r4, r3, r12
-    emit_byte r10, r11, r4, r3, r12
-    b 2b
+    cmp             r2, #0x10
+    blt             3f
+    sub             r2, r2, #0x10
+    put_bits        r11, r4, r0, r6
+    cmp             r4, #0x10
+    blt             2b
+    eor             r3, r3, r3
+    emit_byte       r10, r11, r4, r3, r12
+    emit_byte       r10, r11, r4, r3, r12
+    b               2b
 3:
-    add r2, r1, r2, lsl #4
-    ldrh r3, [lr, #2]!
-    ldr r12, [r9, r2, lsl #2]
-    ldrb r2, [r5, r2]
-    put_bits r11, r4, r12, r2
-    checkbuf15 r10, r11, r4, r2, r12
-    put_bits r11, r4, r3, r1
-    checkbuf15 r10, r11, r4, r2, r12
-    lsls r8, r8, #0x1
-    bne 1b
+    add             r2, r1, r2, lsl #4
+    ldrh            r3, [lr, #2]!
+    ldr             r12, [r9, r2, lsl #2]
+    ldrb            r2, [r5, r2]
+    put_bits        r11, r4, r12, r2
+    checkbuf15      r10, r11, r4, r2, r12
+    put_bits        r11, r4, r3, r1
+    checkbuf15      r10, r11, r4, r2, r12
+    lsls            r8, r8, #0x1
+    bne             1b
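+    /* Emit the end-of-block code (actbl->ehufco[0]/ehufsi[0]) below unless
+     * the scan ended on the final coefficient, i.e. lr has already passed
+     * the last t2 entry. */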
 6:
-    add r0, sp, #0x20
-    add r0, #0xfe
-    cmp lr, r0
-    bhs 1f
-    ldr r1, [r9]
-    ldrb r0, [r5]
-    put_bits r11, r4, r1, r0
-    checkbuf15 r10, r11, r4, r0, r1
+    add             r0, sp, #0x20
+    add             r0, #0xfe
+    cmp             lr, r0
+    bhs             1f
+    ldr             r1, [r9]
+    ldrb            r0, [r5]
+    put_bits        r11, r4, r1, r0
+    checkbuf15      r10, r11, r4, r0, r1
 1:
-    ldr r12, [sp, #0x18]
-    str r11, [r12, #0x8]
-    str r4, [r12, #0xc]
-    add r0, r10, #0x1
-    add r4, sp, #0x140
-    vld1.64 {d8, d9, d10, d11}, [r4, :128]!
-    vld1.64 {d12, d13, d14, d15}, [r4, :128]
-    sub r4, r7, #0x1c
-    mov sp, r4
-    pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
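+    /* Epilogue: write put_buffer/put_bits back into the working state saved
+     * at sp + 0x18, return the updated output pointer (r10 appears to be
+     * the buffer cursor used by emit_byte, so r0 = r10 + 1), restore the
+     * callee-saved d8-d15 from the stack, and pop. */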
+    ldr             r12, [sp, #0x18]
+    str             r11, [r12, #0x8]
+    str             r4, [r12, #0xc]
+    add             r0, r10, #0x1
+    add             r4, sp, #0x140
+    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
+    vld1.64         {d12, d13, d14, d15}, [r4, :128]
+    sub             r4, r7, #0x1c
+    mov             sp, r4
+    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 
 .purgem emit_byte
 .purgem put_bits