Add AMD TBM (trailing bit manipulation) instructions.

Reference: http://support.amd.com/us/Processor_TechDocs/24594_APM_v3.pdf

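Also add the corresponding CPU feature bit (CPU_TBM) and the tbm/notbm
CPU directive keywords. bextr is converted into its own instruction group
so that it can carry both the BMI1 form and the TBM immediate form.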
diff --git a/modules/arch/x86/gen_x86_insn.py b/modules/arch/x86/gen_x86_insn.py
index 4de9d6c..f3ced2a 100755
--- a/modules/arch/x86/gen_x86_insn.py
+++ b/modules/arch/x86/gen_x86_insn.py
@@ -39,7 +39,7 @@
     "SSE3", "SVM", "PadLock", "SSSE3", "SSE41", "SSE42", "SSE4a", "SSE5",
     "AVX", "FMA", "AES", "CLMUL", "MOVBE", "XOP", "FMA4", "F16C",
     "FSGSBASE", "RDRAND", "XSAVEOPT", "EPTVPID", "SMX", "AVX2", "BMI1",
-    "BMI2", "INVPCID", "LZCNT"]
+    "BMI2", "INVPCID", "LZCNT", "TBM"]
 unordered_cpu_features = ["Priv", "Prot", "Undoc", "Obs"]
 
 # Predefined VEX prefix field values
@@ -7961,8 +7961,6 @@
 
 add_insn("bzhi", "vex_gpr_reg_rm_nds_0F", modifiers=[0x00, 0x38, 0xF5],
          cpu=["BMI2"])
-add_insn("bextr","vex_gpr_reg_rm_nds_0F", modifiers=[0x00, 0x38, 0xF7],
-         cpu=["BMI1"])
 add_insn("shlx", "vex_gpr_reg_rm_nds_0F", modifiers=[0x66, 0x38, 0xF7],
          cpu=["BMI2"])
 add_insn("shrx", "vex_gpr_reg_rm_nds_0F", modifiers=[0xF2, 0x38, 0xF7],
@@ -7973,7 +7971,31 @@
 add_insn("mulx", "vex_gpr_reg_nds_rm_0F", modifiers=[0xF2, 0x38, 0xF6],
          cpu=["BMI2"])
 
+for sfx, sz in zip("lq", [32, 64]):  # 32/64-bit only; no 16-bit forms
+    add_group("bextr",
+        cpu=["BMI1"],
+        suffix=sfx,
+        opersize=sz,
+        prefix=0x00,
+        opcode=[0x0F, 0x38, 0xF7],
+        vex=0,
+        operands=[Operand(type="Reg", size=sz, dest="Spare"),
+                  Operand(type="RM", size=sz, relaxed=True, dest="EA"),
+                  Operand(type="Reg", size=sz, dest="VEX")])
+    add_group("bextr", # TBM alternate form: XOP-encoded, immediate operand
+        cpu=["TBM"],
+        suffix=sfx,
+        opersize=sz,
+        prefix=0x00,
+        opcode=[0x0A, 0x10],
+        xop=128,
+        xopw=(sz==64),
+        onlyavx=True,
+        operands=[Operand(type="Reg", size=sz, dest="Spare"),
+                  Operand(type="RM", size=sz, relaxed=True, dest="EA"),
+                  Operand(type="Imm", size=32, relaxed=True, dest="Imm")])
 
+add_insn("bextr", "bextr")
 
 #####################################################################
 # Intel INVPCID instruction
@@ -7996,6 +8018,33 @@
 add_insn("invpcid", "invpcid")
 
 #####################################################################
+# AMD trailing bit manipulation (TBM)
+#####################################################################
+
+for sfx, sz in zip("lq", [32, 64]):  # 32/64-bit only; no 16-bit forms
+    add_group("xop_gpr_reg_rm_09",
+        cpu=["TBM"],
+        suffix=sfx,
+        modifiers=["Op1Add","SpAdd"],
+        opersize=sz,
+        prefix=0x00,
+        opcode=[0x09, 0x00],
+        xop=128,
+        xopw=(sz==64),
+        operands=[Operand(type="Reg", size=sz, dest="VEX"),
+                  Operand(type="RM", size=sz, relaxed=True, dest="EA")])
+
+add_insn("blcfill", "xop_gpr_reg_rm_09", modifiers=[0x01, 1])
+add_insn("blci",    "xop_gpr_reg_rm_09", modifiers=[0x02, 6])
+add_insn("blcic",   "xop_gpr_reg_rm_09", modifiers=[0x01, 5])
+add_insn("blcmsk",  "xop_gpr_reg_rm_09", modifiers=[0x02, 1])
+add_insn("blcs",    "xop_gpr_reg_rm_09", modifiers=[0x01, 3])
+add_insn("blsfill", "xop_gpr_reg_rm_09", modifiers=[0x01, 2])
+add_insn("blsic",   "xop_gpr_reg_rm_09", modifiers=[0x01, 6])
+add_insn("t1mskc",  "xop_gpr_reg_rm_09", modifiers=[0x01, 7])
+add_insn("tzmsk",   "xop_gpr_reg_rm_09", modifiers=[0x01, 4])
+
+#####################################################################
 # AMD 3DNow! instructions
 #####################################################################
 
diff --git a/modules/arch/x86/x86arch.h b/modules/arch/x86/x86arch.h
index 0c387f7..13f6c0e 100644
--- a/modules/arch/x86/x86arch.h
+++ b/modules/arch/x86/x86arch.h
@@ -83,6 +83,7 @@
 #define CPU_BMI2    49      /* Intel BMI2 instructions */
 #define CPU_INVPCID 50      /* Intel INVPCID instruction */
 #define CPU_LZCNT   51      /* Intel LZCNT instruction */
+#define CPU_TBM     52      /* AMD TBM instructions */
 
 enum x86_parser_type {
     X86_PARSER_NASM = 0,
diff --git a/modules/arch/x86/x86cpu.gperf b/modules/arch/x86/x86cpu.gperf
index 669a588..cbdcf58 100644
--- a/modules/arch/x86/x86cpu.gperf
+++ b/modules/arch/x86/x86cpu.gperf
@@ -398,6 +398,8 @@
 noinvpcid,	x86_cpu_clear,	CPU_INVPCID
 lzcnt,		x86_cpu_set,	CPU_LZCNT
 nolzcnt,	x86_cpu_clear,	CPU_LZCNT
+tbm,		x86_cpu_set,	CPU_TBM
+notbm,	x86_cpu_clear,	CPU_TBM
 # Change NOP patterns
 basicnop,	x86_nop,	X86_NOP_BASIC
 intelnop,	x86_nop,	X86_NOP_INTEL
diff --git a/modules/arch/x86/x86id.c b/modules/arch/x86/x86id.c
index 3f645b8..dc70026 100644
--- a/modules/arch/x86/x86id.c
+++ b/modules/arch/x86/x86id.c
@@ -1603,8 +1603,9 @@
              * Leave R=X=B=1 for now.
              */
             if (insn->opcode.opcode[0] != 0x08 &&
-                insn->opcode.opcode[0] != 0x09)
-                yasm_internal_error(N_("first opcode byte of XOP must be 0x08 or 0x09"));
+                insn->opcode.opcode[0] != 0x09 &&
+                insn->opcode.opcode[0] != 0x0A)
+                yasm_internal_error(N_("first opcode byte of XOP must be 0x08, 0x09, or 0x0A"));
             vex1 |= insn->opcode.opcode[0];
             /* Move opcode byte back one byte to make room for XOP prefix. */
             insn->opcode.opcode[2] = insn->opcode.opcode[1];