Builds a test corpus for fuzzing (#1184)

* Limit size of inputs for fuzz targets

* Build a test corpus for fuzzing
diff --git a/suite/fuzz/fuzz_disasm.c b/suite/fuzz/fuzz_disasm.c
index c497148..8496470 100644
--- a/suite/fuzz/fuzz_disasm.c
+++ b/suite/fuzz/fuzz_disasm.c
@@ -20,110 +20,161 @@
 
 struct platform platforms[] = {
     {
+        // item 0
         CS_ARCH_X86,
         CS_MODE_32,
         "X86 32 (Intel syntax)"
     },
     {
+        // item 1
         CS_ARCH_X86,
         CS_MODE_64,
         "X86 64 (Intel syntax)"
     },
     {
+        // item 2
         CS_ARCH_ARM,
         CS_MODE_ARM,
         "ARM"
     },
     {
-        CS_ARCH_ARM,
-        CS_MODE_THUMB,
-        "THUMB-2"
-    },
-    {
-        CS_ARCH_ARM,
-        CS_MODE_ARM,
-        "ARM: Cortex-A15 + NEON"
-    },
-    {
+        // item 3
         CS_ARCH_ARM,
         CS_MODE_THUMB,
         "THUMB"
     },
     {
-        CS_ARCH_ARM,
-        (cs_mode)(CS_MODE_THUMB + CS_MODE_MCLASS),
-        "Thumb-MClass"
-    },
-    {
+        // item 4
         CS_ARCH_ARM,
         (cs_mode)(CS_MODE_ARM + CS_MODE_V8),
         "Arm-V8"
     },
     {
+        // item 5
+        CS_ARCH_ARM,
+        (cs_mode)(CS_MODE_THUMB+CS_MODE_V8),
+        "THUMB+V8"
+    },
+    {
+        // item 6
+        CS_ARCH_ARM,
+        (cs_mode)(CS_MODE_THUMB + CS_MODE_MCLASS),
+        "Thumb-MClass"
+    },
+    {
+        // item 7
+        CS_ARCH_ARM64,
+        (cs_mode)0,
+        "ARM-64"
+    },
+    {
+        // item 8
         CS_ARCH_MIPS,
         (cs_mode)(CS_MODE_MIPS32 + CS_MODE_BIG_ENDIAN),
         "MIPS-32 (Big-endian)"
     },
     {
+        // item 9
         CS_ARCH_MIPS,
-        (cs_mode)(CS_MODE_MIPS64 + CS_MODE_LITTLE_ENDIAN),
+        (cs_mode)(CS_MODE_MIPS32 + CS_MODE_MICRO),
+        "MIPS-32 (micro)"
+    },
+    {
+        //item 10
+        CS_ARCH_MIPS,
+        CS_MODE_MIPS64,
         "MIPS-64-EL (Little-endian)"
     },
     {
+        //item 11
         CS_ARCH_MIPS,
-        (cs_mode)(CS_MODE_MIPS32R6 + CS_MODE_MICRO + CS_MODE_BIG_ENDIAN),
-        "MIPS-32R6 | Micro (Big-endian)"
+        CS_MODE_MIPS32,
+        "MIPS-32-EL (Little-endian)"
     },
     {
+        //item 12
         CS_ARCH_MIPS,
-        (cs_mode)(CS_MODE_MIPS32R6 + CS_MODE_BIG_ENDIAN),
-        "MIPS-32R6 (Big-endian)"
+        (cs_mode)(CS_MODE_MIPS64 + CS_MODE_BIG_ENDIAN),
+        "MIPS-64 (Big-endian)"
     },
     {
-        CS_ARCH_ARM64,
-        CS_MODE_ARM,
-        "ARM-64"
+        //item 13
+        CS_ARCH_MIPS,
+        (cs_mode)(CS_MODE_MIPS32 + CS_MODE_MICRO + CS_MODE_BIG_ENDIAN),
+        "MIPS-32 | Micro (Big-endian)"
     },
     {
+        //item 14
         CS_ARCH_PPC,
         CS_MODE_BIG_ENDIAN,
         "PPC-64"
     },
     {
+        //item 15
         CS_ARCH_SPARC,
         CS_MODE_BIG_ENDIAN,
         "Sparc"
     },
     {
+        //item 16
         CS_ARCH_SPARC,
         (cs_mode)(CS_MODE_BIG_ENDIAN + CS_MODE_V9),
         "SparcV9"
     },
     {
+        //item 17
         CS_ARCH_SYSZ,
         (cs_mode)0,
         "SystemZ"
     },
     {
+        //item 18
         CS_ARCH_XCORE,
         (cs_mode)0,
         "XCore"
-	},
-	{
-		CS_ARCH_M68K,
-		(cs_mode)0,
-		"M68K"
-	},
-	{
-		CS_ARCH_M680X,
-		(cs_mode)CS_MODE_M680X_6809,
-		"M680X_M6809"
-	},
-	{
-		CS_ARCH_EVM,
-		(cs_mode)0,
-		"EVM"
-	},
+    },
+    {
+        //item 19
+        CS_ARCH_MIPS,
+        (cs_mode)(CS_MODE_MIPS32R6 + CS_MODE_BIG_ENDIAN),
+        "MIPS-32R6 (Big-endian)"
+    },
+    {
+        //item 20
+        CS_ARCH_MIPS,
+        (cs_mode)(CS_MODE_MIPS32R6 + CS_MODE_MICRO + CS_MODE_BIG_ENDIAN),
+        "MIPS-32R6 (Micro+Big-endian)"
+    },
+    {
+        //item 21
+        CS_ARCH_MIPS,
+        CS_MODE_MIPS32R6,
+        "MIPS-32R6 (Little-endian)"
+    },
+    {
+        //item 22
+        CS_ARCH_MIPS,
+        (cs_mode)(CS_MODE_MIPS32R6 + CS_MODE_MICRO),
+        "MIPS-32R6 (Micro+Little-endian)"
+    },
+    {
+        //item 23
+        CS_ARCH_M68K,
+        (cs_mode)0,
+        "M68K"
+    },
+    {
+        //item 24
+        CS_ARCH_M680X,
+        (cs_mode)CS_MODE_M680X_6809,
+        "M680X_M6809"
+    },
+    {
+        //item 25
+        CS_ARCH_EVM,
+        (cs_mode)0,
+        "EVM"
+    },
 };
 
 int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
@@ -135,6 +186,9 @@
     if (Size < 1) {
         // 1 byte for arch choice
         return 0;
+    } else if (Size > 0x1000) {
+        //limit input to 4kb
+        Size = 0x1000;
     }
     if (outfile == NULL) {
         // we compute the output
diff --git a/suite/test_corpus.py b/suite/test_corpus.py
new file mode 100755
index 0000000..43bebda
--- /dev/null
+++ b/suite/test_corpus.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+# Test tool to compare Capstone output with llvm-mc. By Nguyen Anh Quynh, 2014
+import sys
+import os
+from capstone import *
+
+def test_file(fname):
+    print("Test %s" %fname);
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+
+    if not lines[0].startswith('# '):
+        print("ERROR: decoding information is missing")
+        return
+
+    # skip '# ' at the front, then split line to get out hexcode
+    # Note: option can be '', or 'None'
+    #print lines[0]
+    #print lines[0][2:].split(', ')
+    (arch, mode, option) = lines[0][2:].split(', ')
+    mode = mode.replace(' ', '')
+    option = option.strip()
+
+    archs = {
+        "CS_ARCH_ARM": CS_ARCH_ARM,
+        "CS_ARCH_ARM64": CS_ARCH_ARM64,
+        "CS_ARCH_MIPS": CS_ARCH_MIPS,
+        "CS_ARCH_PPC": CS_ARCH_PPC,
+        "CS_ARCH_SPARC": CS_ARCH_SPARC,
+        "CS_ARCH_SYSZ": CS_ARCH_SYSZ,
+        "CS_ARCH_X86": CS_ARCH_X86,
+        "CS_ARCH_XCORE": CS_ARCH_XCORE,
+    }
+    
+    modes = {
+        "CS_MODE_16": CS_MODE_16,
+        "CS_MODE_32": CS_MODE_32,
+        "CS_MODE_64": CS_MODE_64,
+        "CS_MODE_MIPS32": CS_MODE_MIPS32,
+        "CS_MODE_MIPS64": CS_MODE_MIPS64,
+        "0": CS_MODE_ARM,
+        "CS_MODE_ARM": CS_MODE_ARM,
+        "CS_MODE_THUMB": CS_MODE_THUMB,
+        "CS_MODE_ARM+CS_MODE_V8": CS_MODE_ARM+CS_MODE_V8,
+        "CS_MODE_THUMB+CS_MODE_V8": CS_MODE_THUMB+CS_MODE_V8,
+        "CS_MODE_THUMB+CS_MODE_MCLASS": CS_MODE_THUMB+CS_MODE_MCLASS,
+        "CS_MODE_LITTLE_ENDIAN": CS_MODE_LITTLE_ENDIAN,
+        "CS_MODE_BIG_ENDIAN": CS_MODE_BIG_ENDIAN,
+        "CS_MODE_64+CS_MODE_LITTLE_ENDIAN": CS_MODE_64+CS_MODE_LITTLE_ENDIAN,
+        "CS_MODE_64+CS_MODE_BIG_ENDIAN": CS_MODE_64+CS_MODE_BIG_ENDIAN,
+        "CS_MODE_MIPS32+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO,
+        "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
+        "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
+        "CS_MODE_BIG_ENDIAN+CS_MODE_V9": CS_MODE_BIG_ENDIAN + CS_MODE_V9,
+        "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN,
+        "CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN,
+        "CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN,
+        "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN,
+    }
+
+    mc_modes = {
+        ("CS_ARCH_X86", "CS_MODE_32"): 0,
+        ("CS_ARCH_X86", "CS_MODE_64"): 1,
+        ("CS_ARCH_ARM", "CS_MODE_ARM"): 2,
+        ("CS_ARCH_ARM", "CS_MODE_THUMB"): 3,
+        ("CS_ARCH_ARM", "CS_MODE_ARM+CS_MODE_V8"): 4,
+        ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_V8"): 5,
+        ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_MCLASS"): 6,
+        ("CS_ARCH_ARM64", "0"): 7,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN"): 8,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO"): 9,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS64"): 10,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32"): 11,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN"): 12,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN"): 13,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO"): 13,
+        ("CS_ARCH_PPC", "CS_MODE_BIG_ENDIAN"): 14,
+        ("CS_ARCH_SPARC", "CS_MODE_BIG_ENDIAN"): 15,
+        ("CS_ARCH_SPARC", "CS_MODE_BIG_ENDIAN+CS_MODE_V9"): 16,
+        ("CS_ARCH_SYSZ", "0"): 17,
+        ("CS_ARCH_XCORE", "0"): 18,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32R6+CS_MODE_BIG_ENDIAN"): 19,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32R6+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN"): 20,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32R6"): 21,
+        ("CS_ARCH_MIPS", "CS_MODE_MIPS32R6+CS_MODE_MICRO"): 22,
+        ("CS_ARCH_M68K", "0"): 23,
+        ("CS_ARCH_M680X", "CS_MODE_M680X_6809"): 24,
+        ("CS_ARCH_EVM", "0"): 25,
+    }
+
+    #if not option in ('', 'None'):
+    #    print archs[arch], modes[mode], options[option]
+
+    for line in lines[1:]:
+        # ignore all the input lines having # in front.
+        if line.startswith('#'):
+            continue
+        #print("Check %s" %line)
+        code = line.split(' = ')[0]
+        if len(code) < 2:
+            continue
+        asm  = ''.join(line.split(' = ')[1:])
+        hex_code = code.replace('0x', '')
+        hex_code = hex_code.replace(',', '')
+        hex_data = hex_code.decode('hex')
+        fout = open("fuzz/corpus/%s_%s" % (os.path.basename(fname), hex_code), 'w')
+        if (arch, mode) not in mc_modes:
+            print "fail", arch, mode
+        fout.write(unichr(mc_modes[(arch, mode)]))
+        fout.write(hex_data)
+        fout.close()
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 1:
+        fnames = sys.stdin.readlines()
+        for fname in fnames:
+            test_file(fname.strip())
+    else:
+        #print("Usage: ./test_mc.py <input-file.s.cs>")
+        test_file(sys.argv[1])
+