ARM64: Avoid tbl instruction on Cortex-A53/A57

Full-color compression speedups relative to previous commits:
Cortex-A53 (Nexus 5X), Android, 64-bit: 0.91% to 3.0% (avg. 1.8%)
Cortex-A57 (Nexus 5X), Android, 64-bit: -0.35% to 1.5% (avg. 0.65%)
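
For context, the run-time detection added below keys off the "CPU part" field
in /proc/cpuinfo (0xd03 = Cortex-A53, 0xd07 = Cortex-A57) and clears a new
JSIMD_FASTTBL feature bit when either part is found.  The following is a
minimal, self-contained sketch of that idea only; the helper and flag names
are illustrative and are not libjpeg-turbo's actual API (the library uses its
own check_cpuinfo() helper, shown in the diff):

    /* Illustrative sketch: parse /proc/cpuinfo for the "CPU part" field and
     * clear a hypothetical "fast tbl" bit when a Cortex-A53 (0xd03) or
     * Cortex-A57 (0xd07) core is reported.  Names are placeholders. */
    #include <stdio.h>
    #include <string.h>

    #define EXAMPLE_FASTTBL 4              /* stands in for JSIMD_FASTTBL */

    static unsigned int example_features = EXAMPLE_FASTTBL;

    /* Return nonzero if a "CPU part : <value>" line names the given part. */
    static int cpu_part_is(const char *line, const char *value)
    {
      const char *colon;

      if (strncmp(line, "CPU part", 8) != 0)
        return 0;
      colon = strchr(line, ':');
      return colon != NULL && strstr(colon, value) != NULL;
    }

    static void detect_slow_tbl(void)
    {
      FILE *fp = fopen("/proc/cpuinfo", "r");
      char line[256];

      if (fp == NULL)
        return;                            /* keep the optimistic default */
      while (fgets(line, sizeof(line), fp)) {
        if (cpu_part_is(line, "0xd03") ||  /* Cortex-A53 */
            cpu_part_is(line, "0xd07")) {  /* Cortex-A57 */
          example_features &= ~EXAMPLE_FASTTBL;
          break;
        }
      }
      fclose(fp);
    }

    int main(void)
    {
      detect_slow_tbl();
      printf("fast tbl path: %s\n",
             (example_features & EXAMPLE_FASTTBL) ? "enabled" : "disabled");
      return 0;
    }

As in the patch, a detection failure simply falls through and keeps the
default feature set, which includes the fast-tbl bit, so the tbl-based path
remains the default when /proc/cpuinfo cannot be read.
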
diff --git a/simd/jsimd.h b/simd/jsimd.h
index a312930..d05a2ec 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -865,3 +865,7 @@
 EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
         (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
          c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
+        (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
index 8633162..cb48258 100644
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -28,10 +28,12 @@
 
 #define JSIMD_FASTLD3 1
 #define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
 
 static unsigned int simd_support = ~0;
 static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+                                    JSIMD_FASTTBL;
 
 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
 
@@ -83,7 +85,13 @@
         free(buffer);
         return 0;
       }
-      if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+          check_cpuinfo(buffer, "CPU part", "0xd07"))
+        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
+           percent speedup by disabling the use of that instruction.  The
+           speedup on Cortex-A57 is more subtle but still measurable. */
+        simd_features &= ~JSIMD_FASTTBL;
+      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
         /* The SIMD version of Huffman encoding is slower than the C version on
            Cavium ThunderX.  Also, ld3 and st3 are abyssmally slow on that
            CPU. */
@@ -785,6 +793,10 @@
                              int last_dc_val, c_derived_tbl *dctbl,
                              c_derived_tbl *actbl)
 {
-  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
-                                          dctbl, actbl);
+  if (simd_features & JSIMD_FASTTBL)
+    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                            dctbl, actbl);
+  else
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
 }
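
The wrapper above is the only dispatch point: the new JSIMD_FASTTBL bit sits
alongside JSIMD_FASTLD3 (1) and JSIMD_FASTST3 (2) in the simd_features mask,
so clearing it disables only the tbl-based Huffman kernel while leaving the
ld3/st3 fast paths intact.  A hedged sketch of the same idea with placeholder
names, using a function pointer bound once at init rather than a branch per
call (a design alternative for illustration, not what the patch does):

    /* Illustrative only: compose independent fast-path bits in one mask and
     * bind the Huffman entry point once, based on whether the "fast tbl" bit
     * survived CPU detection.  Names are placeholders, not libjpeg-turbo's. */
    #include <stdio.h>

    #define FEAT_FASTLD3 1
    #define FEAT_FASTST3 2
    #define FEAT_FASTTBL 4

    static int encode_tbl(int coef)    { return coef * 2; }    /* stand-in for the tbl kernel */
    static int encode_no_tbl(int coef) { return coef + coef; } /* stand-in for the tbl-free kernel */

    static int (*huff_encode)(int);

    static void bind_huffman(unsigned int features)
    {
      /* Only the FASTTBL bit matters for this kernel; the ld3/st3 bits gate
       * other routines and are left untouched here. */
      huff_encode = (features & FEAT_FASTTBL) ? encode_tbl : encode_no_tbl;
    }

    int main(void)
    {
      unsigned int features = FEAT_FASTLD3 | FEAT_FASTST3 | FEAT_FASTTBL;

      features &= ~FEAT_FASTTBL;       /* pretend a Cortex-A53/A57 was detected */
      bind_huffman(features);
      printf("%d\n", huff_encode(21)); /* both kernels must agree on the result */
      return 0;
    }
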
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 0df1c4a..b9bb5de 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -2986,8 +2986,6 @@
 
 /*****************************************************************************/
 
-#define TBL_IS_FAST
-
 /*
  * GLOBAL(JOCTET*)
  * jsimd_huff_encode_one_block (working_state * state, JOCTET *buffer,
@@ -3037,11 +3035,17 @@
 47:
 .endm
 
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
 .balign 16
+.if \fast_tbl == 1
 Ljsimd_huff_encode_one_block_neon_consts:
+.else
+Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
+.endif
     .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
           0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-#if defined(TBL_IS_FAST)
+.if \fast_tbl == 1
     .byte    0,   1,   2,   3,  16,  17,  32,  33, \
             18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
     .byte   34,  35,  48,  49, 255, 255,  50,  51, \
@@ -3066,19 +3070,27 @@
            255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
     .byte    4,   5,   6,   7, 255, 255, 255, 255, \
            255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
-#endif
+.endif
 
+.if \fast_tbl == 1
 asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
     sub             sp, sp, 272
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
     /* Save ARM registers */
     stp             x19, x20, [sp], 16
+.if \fast_tbl == 1
     adr             x15, Ljsimd_huff_encode_one_block_neon_consts
+.else
+    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
+.endif
     ldr             PUT_BUFFER, [x0, #0x10]
     ldr             PUT_BITSw, [x0, #0x18]
     ldrsh           w12, [x2]               /* load DC coeff in w12 */
     /* prepare data */
-#if defined(TBL_IS_FAST)
+.if \fast_tbl == 1
     ld1             {v23.16b}, [x15], #16
     ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
     ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
@@ -3100,7 +3112,7 @@
     tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
     tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
     tbx             v6.16b, {v31.16b}, v19.16b
-#else
+.else
       add             x13, x2, #0x22
       sub             w12, w12, w3    /* last_dc_val, not used afterwards */
     ld1             {v23.16b}, [x15]
@@ -3230,7 +3242,7 @@
     ld1             {v5.h}[7], [x15]
     ld1             {v6.h}[7], [x19]
     ld1             {v7.h}[7], [x20]
-#endif
+.endif
     cmlt            v24.8h, v0.8h, #0
     cmlt            v25.8h, v1.8h, #0
     cmlt            v26.8h, v2.8h, #0
@@ -3425,6 +3437,11 @@
     add             sp, sp, 256
     br              x30
 
+.endm
+
+generate_jsimd_huff_encode_one_block 1
+generate_jsimd_huff_encode_one_block 0
+
     .unreq          BUFFER
     .unreq          PUT_BUFFER
     .unreq          PUT_BITS
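
For readers unfamiliar with the instruction being avoided: tbl performs a
parallel byte-wise table lookup, where each byte of an index vector selects a
byte from a table held in up to four source registers, and any out-of-range
index (such as the 255 entries in the constant tables above) produces zero;
tbx performs the same lookup but leaves the destination byte unchanged for
out-of-range indices.  The slowtbl variant reproduces the same data movement
with explicit ld1 lane loads instead.  Below is a plain-C model of those
semantics, assuming nothing beyond the standard library; it describes the
instructions' behaviour and is not the actual SIMD code:

    /* Plain-C model of the AArch64 tbl/tbx byte-lookup semantics.  Purely
     * illustrative; the real kernels operate on 16-byte vector registers. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* tbl: out[i] = table[idx[i]], or 0 if idx[i] is out of range. */
    static void model_tbl(uint8_t *out, const uint8_t *table, size_t table_len,
                          const uint8_t *idx, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        out[i] = (idx[i] < table_len) ? table[idx[i]] : 0;
    }

    /* tbx: same lookup, but out-of-range indices leave out[i] unchanged. */
    static void model_tbx(uint8_t *out, const uint8_t *table, size_t table_len,
                          const uint8_t *idx, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        if (idx[i] < table_len)
          out[i] = table[idx[i]];
    }

    int main(void)
    {
      const uint8_t table[16] = { 10, 11, 12, 13, 14, 15, 16, 17,
                                  18, 19, 20, 21, 22, 23, 24, 25 };
      const uint8_t idx[8] = { 0, 2, 4, 255, 6, 255, 1, 3 }; /* 255 = no source */
      uint8_t a[8], b[8];

      memset(a, 0xEE, sizeof(a));      /* pre-existing destination contents */
      memset(b, 0xEE, sizeof(b));

      model_tbl(a, table, sizeof(table), idx, sizeof(idx)); /* 255 lanes -> 0x00 */
      model_tbx(b, table, sizeof(table), idx, sizeof(idx)); /* 255 lanes stay 0xEE */

      for (size_t i = 0; i < sizeof(idx); i++)
        printf("%02x/%02x ", a[i], b[i]);
      printf("\n");  /* 0a/0a 0c/0c 0e/0e 00/ee 10/10 00/ee 0b/0b 0d/0d */
      return 0;
    }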