ARM32 NEON SIMD implementation of Huffman encoding

Full-color compression speedups relative to libjpeg-turbo 1.4.2:

800 MHz ARM Cortex-A9, iOS, 32-bit:  26-44% (avg. 32%)

Refer to #42 and #47 for discussion.

This commit also removes the unnecessary

    if (simd_support & JSIMD_ARM_NEON)

statements from the jsimd* algorithm functions.  Since the jsimd_can*()
functions check for the existence of NEON, the corresponding algorithm
functions will never be called if NEON isn't available.  Removing those
if statements improved performance across the board by a couple of
percent.
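
For reference, the library only installs a jsimd_*() routine after the matching
jsimd_can_*() probe has returned nonzero, which is why the per-call NEON check
is dead weight.  A minimal sketch of that dispatch pattern (fdct_method is an
illustrative local name using the library's types, not the exact libjpeg-turbo
code):

    /* The probe runs once at setup time, so the hot path needs no NEON check. */
    void (*fdct_method) (DCTELEM * data);

    if (jsimd_can_fdct_ifast())        /* nonzero only if NEON was detected */
      fdct_method = jsimd_fdct_ifast;  /* NEON implementation */
    else
      fdct_method = jpeg_fdct_ifast;   /* portable C implementation */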

Based on:
https://github.com/mayeut/libjpeg-turbo/commit/fc023c880ce1d6c908fb78ccc25f5d5fd910ccc5
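
For readers of the assembly below: the put_bits, emit_byte, and checkbuf15
macros implement the usual Huffman bit-packing.  A rough C model of their
behavior (the bit_writer struct and the *_c names are illustrative, not
library code):

    typedef struct {
      unsigned char *buffer;    /* next output byte */
      unsigned int put_buffer;  /* pending-bit accumulator */
      int put_bits;             /* number of valid bits in put_buffer */
    } bit_writer;

    /* put_bits: append the `size` low bits of `code` to the accumulator */
    static void put_bits_c (bit_writer * bw, unsigned int code, int size)
    {
      bw->put_buffer = (bw->put_buffer << size) | code;
      bw->put_bits += size;
    }

    /* emit_byte: pop the top 8 pending bits; a 0xFF byte is followed by a
       stuffed 0x00 so it cannot be mistaken for a JPEG marker */
    static void emit_byte_c (bit_writer * bw)
    {
      unsigned char b;
      bw->put_bits -= 8;
      b = (unsigned char) (bw->put_buffer >> bw->put_bits);
      *bw->buffer++ = b;
      if (b == 0xFF)
        *bw->buffer++ = 0;
    }

    /* checkbuf15: once 16 or more bits are pending, flush two bytes */
    static void checkbuf15_c (bit_writer * bw)
    {
      if (bw->put_bits >= 16) {
        emit_byte_c(bw);
        emit_byte_c(bw);
      }
    }
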
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 2b0cd32..f79660e 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -67,6 +67,10 @@
 or regression testing, SIMD-accelerated Huffman encoding can be disabled by
 setting the JSIMD_NOHUFFENC environment variable to 1.
 
+[13] Added SIMD acceleration for Huffman encoding on NEON-capable ARM 32-bit
+platforms.  This speeds up the compression of full-color JPEGs by about 30% on
+average.
+
 
 1.4.2
 =====
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 04277fc..e259fea 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -5,7 +5,7 @@
  * Copyright (C) 2011, 2014-2016 D. R. Commander
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California
  * Copyright (C) 2014 Linaro Limited
- * Copyright (C) 2015 Matthieu Darbois
+ * Copyright (C) 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -835,3 +835,7 @@
 EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
         (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
          c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
+        (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index e715291..635cbd7 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -2,8 +2,8 @@
  * jsimd_arm.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
- * Copyright 2015 Matthieu Darbois
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -228,8 +228,7 @@
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -274,8 +273,7 @@
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
 GLOBAL(void)
@@ -283,9 +281,8 @@
                           JSAMPIMAGE input_buf, JDIMENSION input_row,
                           JSAMPARRAY output_buf, int num_rows)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                  output_buf, num_rows);
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
 }
 
 GLOBAL(int)
@@ -387,10 +384,9 @@
                            JSAMPARRAY input_data,
                            JSAMPARRAY * output_data_ptr)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
-                                   compptr->downsampled_width, input_data,
-                                   output_data_ptr);
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
@@ -458,8 +454,7 @@
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
                 DCTELEM * workspace)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_convsamp_neon(sample_data, start_col, workspace);
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
@@ -509,8 +504,7 @@
 GLOBAL(void)
 jsimd_fdct_ifast (DCTELEM * data)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_fdct_ifast_neon(data);
+  jsimd_fdct_ifast_neon(data);
 }
 
 GLOBAL(void)
@@ -549,8 +543,7 @@
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
                 DCTELEM * workspace)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_quantize_neon(coef_block, divisors, workspace);
+  jsimd_quantize_neon(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
@@ -610,9 +603,8 @@
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(void)
@@ -620,9 +612,8 @@
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(int)
@@ -686,9 +677,8 @@
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
@@ -696,9 +686,8 @@
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
@@ -711,6 +700,16 @@
 GLOBAL(int)
 jsimd_can_huff_encode_one_block (void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -719,5 +718,6 @@
                              int last_dc_val, c_derived_tbl *dctbl,
                              c_derived_tbl *actbl)
 {
-  return NULL;
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
 }
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index c83e1c7..ecbdea8 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -7,6 +7,7 @@
  * Copyright (C) 2014 Siarhei Siamashka.  All Rights Reserved.
  * Copyright (C) 2014 Linaro Limited.  All Rights Reserved.
  * Copyright (C) 2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016 Matthieu Darbois.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -2438,3 +2439,437 @@
 .purgem upsample16
 .purgem upsample32
 .purgem upsample_row
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block (working_state * state, JOCTET *buffer,
+ *                              JCOEFPTR block, int last_dc_val,
+ *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+    sub \PUT_BITS, \PUT_BITS, #0x8
+    lsr \TMP, \PUT_BUFFER, \PUT_BITS
+    uxtb \TMP, \TMP
+    strb \TMP, [\BUFFER, #1]!
+    cmp \TMP, #0xff
+    /*it eq*/
+    streqb \ZERO, [\BUFFER, #1]!
+.endm
+.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
+    /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
+    add \PUT_BITS, \SIZE
+    /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
+    orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+.endm
+.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+  cmp \PUT_BITS, #0x10
+  blt 15f
+    eor \ZERO, \ZERO, \ZERO
+    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+15:
+.endm
+
+.balign 16
+jsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01
+    .byte 0x02
+    .byte 0x04
+    .byte 0x08
+    .byte 0x10
+    .byte 0x20
+    .byte 0x40
+    .byte 0x80
+
+asm_function jsimd_huff_encode_one_block_neon
+    push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+    add r7, sp, #0x1c
+    sub r4, sp, #0x40
+    bfc r4, #0, #5
+    mov sp, r4 /* align sp on 32 bytes */
+    vst1.64 {d8, d9, d10, d11}, [r4, :128]!
+    vst1.64 {d12, d13, d14, d15}, [r4, :128]
+    sub sp, #0x140 /* reserve 320 bytes */
+    str r0, [sp, #0x18] /* working state > sp + 0x18 */
+    add r4, sp, #0x20   /* r4 = t1 */
+    ldr lr, [r7, #0x8]  /* lr = dctbl */
+    sub r10, r1, #0x1   /* r10 = buffer - 1 */
+    ldrsh r1, [r2]
+    mov r9, #0x10
+    mov r8, #0x1
+    adr r5, jsimd_huff_encode_one_block_neon_consts
+    /* prepare data */
+    vld1.8 {d26}, [r5, :64]
+    veor q8, q8, q8
+    veor q9, q9, q9
+    vdup.16 q14, r9
+    vdup.16 q15, r8
+    veor q10, q10, q10
+    veor q11, q11, q11
+    sub r1, r1, r3
+    add r9, r2, #0x22
+    add r8, r2, #0x18
+    add r3, r2, #0x36
+    vmov.16 d0[0], r1
+    vld1.16 {d2[0]}, [r9, :16]
+    vld1.16 {d4[0]}, [r8, :16]
+    vld1.16 {d6[0]}, [r3, :16]
+    add r1, r2, #0x2
+    add r9, r2, #0x30
+    add r8, r2, #0x26
+    add r3, r2, #0x28
+    vld1.16 {d0[1]}, [r1, :16]
+    vld1.16 {d2[1]}, [r9, :16]
+    vld1.16 {d4[1]}, [r8, :16]
+    vld1.16 {d6[1]}, [r3, :16]
+    add r1, r2, #0x10
+    add r9, r2, #0x40
+    add r8, r2, #0x34
+    add r3, r2, #0x1a
+    vld1.16 {d0[2]}, [r1, :16]
+    vld1.16 {d2[2]}, [r9, :16]
+    vld1.16 {d4[2]}, [r8, :16]
+    vld1.16 {d6[2]}, [r3, :16]
+    add r1, r2, #0x20
+    add r9, r2, #0x32
+    add r8, r2, #0x42
+    add r3, r2, #0xc
+    vld1.16 {d0[3]}, [r1, :16]
+    vld1.16 {d2[3]}, [r9, :16]
+    vld1.16 {d4[3]}, [r8, :16]
+    vld1.16 {d6[3]}, [r3, :16]
+    add r1, r2, #0x12
+    add r9, r2, #0x24
+    add r8, r2, #0x50
+    add r3, r2, #0xe
+    vld1.16 {d1[0]}, [r1, :16]
+    vld1.16 {d3[0]}, [r9, :16]
+    vld1.16 {d5[0]}, [r8, :16]
+    vld1.16 {d7[0]}, [r3, :16]
+    add r1, r2, #0x4
+    add r9, r2, #0x16
+    add r8, r2, #0x60
+    add r3, r2, #0x1c
+    vld1.16 {d1[1]}, [r1, :16]
+    vld1.16 {d3[1]}, [r9, :16]
+    vld1.16 {d5[1]}, [r8, :16]
+    vld1.16 {d7[1]}, [r3, :16]
+    add r1, r2, #0x6
+    add r9, r2, #0x8
+    add r8, r2, #0x52
+    add r3, r2, #0x2a
+    vld1.16 {d1[2]}, [r1, :16]
+    vld1.16 {d3[2]}, [r9, :16]
+    vld1.16 {d5[2]}, [r8, :16]
+    vld1.16 {d7[2]}, [r3, :16]
+    add r1, r2, #0x14
+    add r9, r2, #0xa
+    add r8, r2, #0x44
+    add r3, r2, #0x38
+    vld1.16 {d1[3]}, [r1, :16]
+    vld1.16 {d3[3]}, [r9, :16]
+    vld1.16 {d5[3]}, [r8, :16]
+    vld1.16 {d7[3]}, [r3, :16]
+    vcgt.s16 q8, q8, q0
+    vcgt.s16 q9, q9, q1
+    vcgt.s16 q10, q10, q2
+    vcgt.s16 q11, q11, q3
+    vabs.s16 q0, q0
+    vabs.s16 q1, q1
+    vabs.s16 q2, q2
+    vabs.s16 q3, q3
+    veor q8, q8, q0
+    veor q9, q9, q1
+    veor q10, q10, q2
+    veor q11, q11, q3
+    add r9, r4, #0x20
+    add r8, r4, #0x80
+    add r3, r4, #0xa0
+    vclz.i16 q0, q0
+    vclz.i16 q1, q1
+    vclz.i16 q2, q2
+    vclz.i16 q3, q3
+    vsub.i16 q0, q14, q0
+    vsub.i16 q1, q14, q1
+    vsub.i16 q2, q14, q2
+    vsub.i16 q3, q14, q3
+    vst1.16 {d0, d1, d2, d3}, [r4, :256]
+    vst1.16 {d4, d5, d6, d7}, [r9, :256]
+    vshl.s16 q0, q15, q0
+    vshl.s16 q1, q15, q1
+    vshl.s16 q2, q15, q2
+    vshl.s16 q3, q15, q3
+    vsub.i16 q0, q0, q15
+    vsub.i16 q1, q1, q15
+    vsub.i16 q2, q2, q15
+    vsub.i16 q3, q3, q15
+    vand q8, q8, q0
+    vand q9, q9, q1
+    vand q10, q10, q2
+    vand q11, q11, q3
+    vst1.16 {d16, d17, d18, d19}, [r8, :256]
+    vst1.16 {d20, d21, d22, d23}, [r3, :256]
+    add r1, r2, #0x46
+    add r9, r2, #0x3a
+    add r8, r2, #0x74
+    add r3, r2, #0x6a
+    vld1.16 {d8[0]}, [r1, :16]
+    vld1.16 {d10[0]}, [r9, :16]
+    vld1.16 {d12[0]}, [r8, :16]
+    vld1.16 {d14[0]}, [r3, :16]
+    veor q8, q8, q8
+    veor q9, q9, q9
+    veor q10, q10, q10
+    veor q11, q11, q11
+    add r1, r2, #0x54
+    add r9, r2, #0x2c
+    add r8, r2, #0x76
+    add r3, r2, #0x78
+    vld1.16 {d8[1]}, [r1, :16]
+    vld1.16 {d10[1]}, [r9, :16]
+    vld1.16 {d12[1]}, [r8, :16]
+    vld1.16 {d14[1]}, [r3, :16]
+    add r1, r2, #0x62
+    add r9, r2, #0x1e
+    add r8, r2, #0x68
+    add r3, r2, #0x7a
+    vld1.16 {d8[2]}, [r1, :16]
+    vld1.16 {d10[2]}, [r9, :16]
+    vld1.16 {d12[2]}, [r8, :16]
+    vld1.16 {d14[2]}, [r3, :16]
+    add r1, r2, #0x70
+    add r9, r2, #0x2e
+    add r8, r2, #0x5a
+    add r3, r2, #0x6c
+    vld1.16 {d8[3]}, [r1, :16]
+    vld1.16 {d10[3]}, [r9, :16]
+    vld1.16 {d12[3]}, [r8, :16]
+    vld1.16 {d14[3]}, [r3, :16]
+    add r1, r2, #0x72
+    add r9, r2, #0x3c
+    add r8, r2, #0x4c
+    add r3, r2, #0x5e
+    vld1.16 {d9[0]}, [r1, :16]
+    vld1.16 {d11[0]}, [r9, :16]
+    vld1.16 {d13[0]}, [r8, :16]
+    vld1.16 {d15[0]}, [r3, :16]
+    add r1, r2, #0x64
+    add r9, r2, #0x4a
+    add r8, r2, #0x3e
+    add r3, r2, #0x6e
+    vld1.16 {d9[1]}, [r1, :16]
+    vld1.16 {d11[1]}, [r9, :16]
+    vld1.16 {d13[1]}, [r8, :16]
+    vld1.16 {d15[1]}, [r3, :16]
+    add r1, r2, #0x56
+    add r9, r2, #0x58
+    add r8, r2, #0x4e
+    add r3, r2, #0x7c
+    vld1.16 {d9[2]}, [r1, :16]
+    vld1.16 {d11[2]}, [r9, :16]
+    vld1.16 {d13[2]}, [r8, :16]
+    vld1.16 {d15[2]}, [r3, :16]
+    add r1, r2, #0x48
+    add r9, r2, #0x66
+    add r8, r2, #0x5c
+    add r3, r2, #0x7e
+    vld1.16 {d9[3]}, [r1, :16]
+    vld1.16 {d11[3]}, [r9, :16]
+    vld1.16 {d13[3]}, [r8, :16]
+    vld1.16 {d15[3]}, [r3, :16]
+    vcgt.s16 q8, q8, q4
+    vcgt.s16 q9, q9, q5
+    vcgt.s16 q10, q10, q6
+    vcgt.s16 q11, q11, q7
+    vabs.s16 q4, q4
+    vabs.s16 q5, q5
+    vabs.s16 q6, q6
+    vabs.s16 q7, q7
+    veor q8, q8, q4
+    veor q9, q9, q5
+    veor q10, q10, q6
+    veor q11, q11, q7
+    add r1, r4, #0x40
+    add r9, r4, #0x60
+    add r8, r4, #0xc0
+    add r3, r4, #0xe0
+    vclz.i16 q4, q4
+    vclz.i16 q5, q5
+    vclz.i16 q6, q6
+    vclz.i16 q7, q7
+    vsub.i16 q4, q14, q4
+    vsub.i16 q5, q14, q5
+    vsub.i16 q6, q14, q6
+    vsub.i16 q7, q14, q7
+    vst1.16 {d8, d9, d10, d11}, [r1, :256]
+    vst1.16 {d12, d13, d14, d15}, [r9, :256]
+    vshl.s16 q4, q15, q4
+    vshl.s16 q5, q15, q5
+    vshl.s16 q6, q15, q6
+    vshl.s16 q7, q15, q7
+    vsub.i16 q4, q4, q15
+    vsub.i16 q5, q5, q15
+    vsub.i16 q6, q6, q15
+    vsub.i16 q7, q7, q15
+    vand q8, q8, q4
+    vand q9, q9, q5
+    vand q10, q10, q6
+    vand q11, q11, q7
+    vst1.16 {d16, d17, d18, d19}, [r8, :256]
+    vst1.16 {d20, d21, d22, d23}, [r3, :256]
+    ldr r12, [r7, #0xc]  /* r12 = actbl */
+    add r1, lr, #0x400   /* r1 = dctbl->ehufsi */
+    mov r9, r12          /* r9 = actbl */
+    add r6, r4, #0x80    /* r6 = t2 */
+    ldr r11, [r0, #0x8]  /* r11 = put_buffer */
+    ldr r4, [r0, #0xc]   /* r4  = put_bits */
+    ldrh r2, [r6, #-128] /* r2  = nbits */
+    ldrh r3, [r6]        /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
+    ldr r0, [lr, r2, lsl #2]
+    ldrb r5, [r1, r2]
+    put_bits r11, r4, r0, r5
+    checkbuf15 r10, r11, r4, r5, r0
+    put_bits r11, r4, r3, r2
+    checkbuf15 r10, r11, r4, r5, r0
+    mov lr, r6            /* lr = t2 */
+    add r5, r9, #0x400    /* r5 = actbl->ehufsi */
+    ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
+    veor q8, q8, q8
+    vceq.i16 q0, q0, q8
+    vceq.i16 q1, q1, q8
+    vceq.i16 q2, q2, q8
+    vceq.i16 q3, q3, q8
+    vceq.i16 q4, q4, q8
+    vceq.i16 q5, q5, q8
+    vceq.i16 q6, q6, q8
+    vceq.i16 q7, q7, q8
+    vmovn.i16 d0, q0
+    vmovn.i16 d2, q1
+    vmovn.i16 d4, q2
+    vmovn.i16 d6, q3
+    vmovn.i16 d8, q4
+    vmovn.i16 d10, q5
+    vmovn.i16 d12, q6
+    vmovn.i16 d14, q7
+    vand d0, d0, d26
+    vand d2, d2, d26
+    vand d4, d4, d26
+    vand d6, d6, d26
+    vand d8, d8, d26
+    vand d10, d10, d26
+    vand d12, d12, d26
+    vand d14, d14, d26
+    vpadd.i8 d0, d0, d2
+    vpadd.i8 d4, d4, d6
+    vpadd.i8 d8, d8, d10
+    vpadd.i8 d12, d12, d14
+    vpadd.i8 d0, d0, d4
+    vpadd.i8 d8, d8, d12
+    vpadd.i8 d0, d0, d8
+    vmov.32 r1, d0[1]
+    vmov.32 r8, d0[0]
+    mvn r1, r1
+    mvn r8, r8
+    lsrs r1, r1, #0x1
+    rrx r8, r8  /* shift in last r1 bit while shifting out DC bit */
+    rbit r1, r1 /* r1 = index1 */
+    rbit r8, r8 /* r8 = index0 */
+    ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
+    str r1, [sp, #0x14]  /* index1 > sp + 0x14 */
+    cmp r8, #0x0
+    beq 6f
+1:
+    clz r2, r8
+    add lr, lr, r2, lsl #1
+    lsl r8, r8, r2
+    ldrh r1, [lr, #-126]
+2:
+    cmp r2, #0x10
+    blt 3f
+    sub r2, r2, #0x10
+    put_bits r11, r4, r0, r6
+    cmp r4, #0x10
+    blt 2b
+    eor r3, r3, r3
+    emit_byte r10, r11, r4, r3, r12
+    emit_byte r10, r11, r4, r3, r12
+    b 2b
+3:
+    add r2, r1, r2, lsl #4
+    ldrh r3, [lr, #2]!
+    ldr r12, [r9, r2, lsl #2]
+    ldrb r2, [r5, r2]
+    put_bits r11, r4, r12, r2
+    checkbuf15 r10, r11, r4, r2, r12
+    put_bits r11, r4, r3, r1
+    checkbuf15 r10, r11, r4, r2, r12
+    lsls r8, r8, #0x1
+    bne 1b
+6:
+    add r12, sp, #0x20   /* r12 = t1 */
+    ldr r8,[sp, #0x14]   /* r8 = index1 */
+    adds r12, #0xc0      /* r12 = t2 + (DCTSIZE2/2) */
+    cmp r8, #0x0
+    beq 6f
+    clz r2, r8
+    sub r12, r12, lr
+    lsl r8, r8, r2
+    add r2, r2, r12, lsr #1
+    add lr, lr, r2, lsl #1
+    b 7f
+1:
+    clz r2, r8
+    add lr, lr, r2, lsl #1
+    lsl r8, r8, r2
+7:
+    ldrh r1, [lr, #-126]
+2:
+    cmp r2, #0x10
+    blt 3f
+    sub r2, r2, #0x10
+    put_bits r11, r4, r0, r6
+    cmp r4, #0x10
+    blt 2b
+    eor r3, r3, r3
+    emit_byte r10, r11, r4, r3, r12
+    emit_byte r10, r11, r4, r3, r12
+    b 2b
+3:
+    add r2, r1, r2, lsl #4
+    ldrh r3, [lr, #2]!
+    ldr r12, [r9, r2, lsl #2]
+    ldrb r2, [r5, r2]
+    put_bits r11, r4, r12, r2
+    checkbuf15 r10, r11, r4, r2, r12
+    put_bits r11, r4, r3, r1
+    checkbuf15 r10, r11, r4, r2, r12
+    lsls r8, r8, #0x1
+    bne 1b
+6:
+    add r0, sp, #0x20
+    add r0, #0xfe
+    cmp lr, r0
+    bhs 1f
+    ldr r1, [r9]
+    ldrb r0, [r5]
+    put_bits r11, r4, r1, r0
+    checkbuf15 r10, r11, r4, r0, r1
+1:
+    ldr r12, [sp, #0x18]
+    str r11, [r12, #0x8]
+    str r4, [r12, #0xc]
+    add r0, r10, #0x1
+    add r4, sp, #0x140
+    vld1.64 {d8, d9, d10, d11}, [r4, :128]!
+    vld1.64 {d12, d13, d14, d15}, [r4, :128]
+    sub r4, r7, #0x1c
+    mov sp, r4
+    pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf15