Merge pull request #61 from fischman/master

Silence is golden: genperf emits too much stdout in success path
diff --git a/modules/arch/x86/gen_x86_insn.py b/modules/arch/x86/gen_x86_insn.py
index 4de9d6c..6b7a333 100755
--- a/modules/arch/x86/gen_x86_insn.py
+++ b/modules/arch/x86/gen_x86_insn.py
@@ -39,7 +39,7 @@
     "SSE3", "SVM", "PadLock", "SSSE3", "SSE41", "SSE42", "SSE4a", "SSE5",
     "AVX", "FMA", "AES", "CLMUL", "MOVBE", "XOP", "FMA4", "F16C",
     "FSGSBASE", "RDRAND", "XSAVEOPT", "EPTVPID", "SMX", "AVX2", "BMI1",
-    "BMI2", "INVPCID", "LZCNT"]
+    "BMI2", "INVPCID", "LZCNT", "TBM", "TSX"]
 unordered_cpu_features = ["Priv", "Prot", "Undoc", "Obs"]
 
 # Predefined VEX prefix field values
@@ -2264,7 +2264,7 @@
 
 add_group("call",
     opersize=16,
-    def_opersize_64=64,
+    not64=True, # there should be no 16-bit call in 64-bit mode
     opcode=[0xE8],
     operands=[Operand(type="Imm", size=16, tmod="Near", dest="JmpRel")])
 add_group("call",
@@ -7116,6 +7116,45 @@
 add_insn("vpgatherqd", "gather_32x_32y_128", modifiers=[0x91])
 
 #####################################################################
+# Intel TSX instructions
+#####################################################################
+add_prefix("xacquire",     "ACQREL",  0xF2)
+add_prefix("xrelease",     "ACQREL",  0xF3)
+
+add_group("tsx_xabort",
+    cpu=["TSX"],
+    opcode=[0xC6, 0xF8],
+    operands=[Operand(type="Imm", size=8, relaxed=True, dest="Imm")])
+add_insn("xabort", "tsx_xabort")
+
+
+
+add_group("tsx_xbegin",
+    cpu=["TSX"],
+    opcode=[0xC7, 0xF8],
+    operands=[Operand(type="Imm", size=32,  tmod="Near", dest="JmpRel")])
+
+add_group("tsx_xbegin",
+    cpu=["TSX"],
+    opersize=16,
+    not64=True, # there should be no 16-bit xbegin in 64-bit mode
+    opcode=[0xC7, 0xF8],
+    operands=[Operand(type="Imm", size=16,  tmod="Near", dest="JmpRel")])
+add_insn("xbegin", "tsx_xbegin")
+
+add_group("tsx_0x0F_0x01",
+    cpu=["TSX"],
+    modifiers=["Op2Add"],
+    opcode=[0x0F, 0x01, 0x00],
+    operands=[])
+add_insn("xend", "tsx_0x0F_0x01", modifiers=[0xD5])
+add_insn("xtest", "tsx_0x0F_0x01", modifiers=[0xD6])
+
+
+
+
+
+#####################################################################
 # Intel FMA instructions
 #####################################################################
 
@@ -7961,8 +8000,6 @@
 
 add_insn("bzhi", "vex_gpr_reg_rm_nds_0F", modifiers=[0x00, 0x38, 0xF5],
          cpu=["BMI2"])
-add_insn("bextr","vex_gpr_reg_rm_nds_0F", modifiers=[0x00, 0x38, 0xF7],
-         cpu=["BMI1"])
 add_insn("shlx", "vex_gpr_reg_rm_nds_0F", modifiers=[0x66, 0x38, 0xF7],
          cpu=["BMI2"])
 add_insn("shrx", "vex_gpr_reg_rm_nds_0F", modifiers=[0xF2, 0x38, 0xF7],
@@ -7973,7 +8010,31 @@
 add_insn("mulx", "vex_gpr_reg_nds_rm_0F", modifiers=[0xF2, 0x38, 0xF6],
          cpu=["BMI2"])
 
+for sfx, sz in zip("lq", [32, 64]):  # no 16-bit forms
+    add_group("bextr",
+        cpu=["BMI1"],
+        suffix=sfx,
+        opersize=sz,
+        prefix=0x00,
+        opcode=[0x0F, 0x38, 0xF7],
+        vex=0,
+        operands=[Operand(type="Reg", size=sz, dest="Spare"),
+                  Operand(type="RM", size=sz, relaxed=True, dest="EA"),
+                  Operand(type="Reg", size=sz, dest="VEX")])
+    add_group("bextr", # TBM alternate form of bextr
+        cpu=["TBM"],
+        suffix=sfx,
+        opersize=sz,
+        prefix=0x00,
+        opcode=[0x0A, 0x10],
+        xop=128,
+        xopw=(sz==64),
+        onlyavx=True,
+        operands=[Operand(type="Reg", size=sz, dest="Spare"),
+                  Operand(type="RM", size=sz, relaxed=True, dest="EA"),
+                  Operand(type="Imm", size=32, relaxed=True, dest="Imm")])
 
+add_insn("bextr", "bextr")
 
 #####################################################################
 # Intel INVPCID instruction
@@ -7996,6 +8057,33 @@
 add_insn("invpcid", "invpcid")
 
 #####################################################################
+# AMD trailing bit manipulation (TBM)
+#####################################################################
+
+for sfx, sz in zip("lq", [32, 64]):  # no 16-bit forms
+    add_group("xop_gpr_reg_rm_09",
+        cpu=["TBM"],
+        suffix=sfx,
+        modifiers=["Op1Add","SpAdd"],
+        opersize=sz,
+        prefix=0x00,
+        opcode=[0x09, 0x00],
+        xop=128,
+        xopw=(sz==64),
+        operands=[Operand(type="Reg", size=sz, dest="VEX"),
+                  Operand(type="RM", size=sz, relaxed=True, dest="EA")])
+
+add_insn("blcfill", "xop_gpr_reg_rm_09", modifiers=[0x01, 1])
+add_insn("blci",    "xop_gpr_reg_rm_09", modifiers=[0x02, 6])
+add_insn("blcic",   "xop_gpr_reg_rm_09", modifiers=[0x01, 5])
+add_insn("blcmsk",  "xop_gpr_reg_rm_09", modifiers=[0x02, 1])
+add_insn("blcs",    "xop_gpr_reg_rm_09", modifiers=[0x01, 3])
+add_insn("blsfill", "xop_gpr_reg_rm_09", modifiers=[0x01, 2])
+add_insn("blsic",   "xop_gpr_reg_rm_09", modifiers=[0x01, 6])
+add_insn("t1mskc",  "xop_gpr_reg_rm_09", modifiers=[0x01, 7])
+add_insn("tzmsk",   "xop_gpr_reg_rm_09", modifiers=[0x01, 4])
+
+#####################################################################
 # AMD 3DNow! instructions
 #####################################################################
 
diff --git a/modules/arch/x86/x86arch.h b/modules/arch/x86/x86arch.h
index 0c387f7..5c3a6cf 100644
--- a/modules/arch/x86/x86arch.h
+++ b/modules/arch/x86/x86arch.h
@@ -83,6 +83,8 @@
 #define CPU_BMI2    49      /* Intel BMI2 instructions */
 #define CPU_INVPCID 50      /* Intel INVPCID instruction */
 #define CPU_LZCNT   51      /* Intel LZCNT instruction */
+#define CPU_TBM     52      /* AMD TBM instructions */
+#define CPU_TSX     53      /* Intel TSX instructions */
 
 enum x86_parser_type {
     X86_PARSER_NASM = 0,
@@ -139,7 +141,8 @@
     X86_ADDRSIZE = 2<<8,
     X86_OPERSIZE = 3<<8,
     X86_SEGREG = 4<<8,
-    X86_REX = 5<<8
+    X86_REX = 5<<8,
+    X86_ACQREL = 6<<8     /*TSX hint prefixes*/
 } x86_parse_insn_prefix;
 
 typedef enum {
@@ -219,6 +222,8 @@
     unsigned char addrsize;         /* 0 or =mode_bits => no override */
     unsigned char opersize;         /* 0 or =mode_bits => no override */
     unsigned char lockrep_pre;      /* 0 indicates no prefix */
+    unsigned char acqrel_pre;      /* 0 indicates no prefix. We need this because
+                                   xacquire/xrelease might require an F0 (LOCK) prefix */
 
     unsigned char mode_bits;
 } x86_common;
diff --git a/modules/arch/x86/x86bc.c b/modules/arch/x86/x86bc.c
index 1670df7..a668155 100644
--- a/modules/arch/x86/x86bc.c
+++ b/modules/arch/x86/x86bc.c
@@ -279,6 +279,24 @@
 
     for (i=0; i<num_prefixes; i++) {
         switch ((x86_parse_insn_prefix)(prefixes[i] & 0xff00)) {
+            /* Strictly speaking, TSX hints are valid only on a predefined
+             * set of instructions, and in most cases only together with the
+             * F0 (LOCK) prefix; elsewhere the same bytes have completely
+             * different semantics.  However, the F0 prefix is likewise valid
+             * only on a predefined set of instructions (the CPU raises #UD
+             * otherwise), and its applicability is not currently enforced
+             * here.  Presumably that was a deliberate decision: the user is
+             * expected to know what they are doing with the LOCK prefix.
+             * For consistency, TSX hint applicability is not enforced
+             * either; correct usage of the hints is left to the user.
+             */
+            case X86_ACQREL:
+                if (common->acqrel_pre != 0)
+                    yasm_warn_set(YASM_WARN_GENERAL,
+                        N_("multiple XACQUIRE/XRELEASE prefixes, "
+                        "using leftmost"));
+                common->acqrel_pre = (unsigned char)prefixes[i] & 0xff;
+                break;
             case X86_LOCKREP:
                 if (common->lockrep_pre != 0)
                     yasm_warn_set(YASM_WARN_GENERAL,
@@ -395,11 +413,13 @@
 static void
 x86_common_print(const x86_common *common, FILE *f, int indent_level)
 {
-    fprintf(f, "%*sAddrSize=%u OperSize=%u LockRepPre=%02x BITS=%u\n",
+    fprintf(f, "%*sAddrSize=%u OperSize=%u LockRepPre=%02x "
+        "ACQREL_Pre=%02x BITS=%u\n",
             indent_level, "",
             (unsigned int)common->addrsize,
             (unsigned int)common->opersize,
             (unsigned int)common->lockrep_pre,
+            (unsigned int)common->acqrel_pre,
             (unsigned int)common->mode_bits);
 }
 
@@ -515,6 +535,9 @@
         len++;
     if (common->lockrep_pre != 0)
         len++;
+    if (common->acqrel_pre != 0)
+        len++;
+
 
     return len;
 }
@@ -791,6 +814,9 @@
         ((common->mode_bits != 64 && common->opersize != common->mode_bits) ||
          (common->mode_bits == 64 && common->opersize == 16)))
         YASM_WRITE_8(*bufp, 0x66);
+    /* TSX hint prefix is emitted before the LOCK/REP prefix */
+    if (common->acqrel_pre != 0)
+        YASM_WRITE_8(*bufp, common->acqrel_pre);
     if (common->lockrep_pre != 0)
         YASM_WRITE_8(*bufp, common->lockrep_pre);
 }
diff --git a/modules/arch/x86/x86cpu.gperf b/modules/arch/x86/x86cpu.gperf
index 669a588..3d49574 100644
--- a/modules/arch/x86/x86cpu.gperf
+++ b/modules/arch/x86/x86cpu.gperf
@@ -390,6 +390,8 @@
 nosmx,		x86_cpu_clear,	CPU_SMX
 avx2,		x86_cpu_set,	CPU_AVX2
 noavx2,		x86_cpu_clear,	CPU_AVX2
+tsx,		x86_cpu_set,	CPU_TSX
+notsx,		x86_cpu_clear,	CPU_TSX
 bmi1,		x86_cpu_set,	CPU_BMI1
 nobmi1,		x86_cpu_clear,	CPU_BMI1
 bmi2,		x86_cpu_set,	CPU_BMI2
@@ -398,6 +400,8 @@
 noinvpcid,	x86_cpu_clear,	CPU_INVPCID
 lzcnt,		x86_cpu_set,	CPU_LZCNT
 nolzcnt,	x86_cpu_clear,	CPU_LZCNT
+tbm,		x86_cpu_set,	CPU_TBM
+notbm,	x86_cpu_clear,	CPU_TBM
 # Change NOP patterns
 basicnop,	x86_nop,	X86_NOP_BASIC
 intelnop,	x86_nop,	X86_NOP_INTEL
diff --git a/modules/arch/x86/x86id.c b/modules/arch/x86/x86id.c
index 3f645b8..b07c9fc 100644
--- a/modules/arch/x86/x86id.c
+++ b/modules/arch/x86/x86id.c
@@ -388,6 +388,7 @@
     common->addrsize = 0;
     common->opersize = info->opersize;
     common->lockrep_pre = 0;
+    common->acqrel_pre = 0;
     common->mode_bits = (unsigned char)mode_bits;
 }
 
@@ -1603,8 +1604,9 @@
              * Leave R=X=B=1 for now.
              */
             if (insn->opcode.opcode[0] != 0x08 &&
-                insn->opcode.opcode[0] != 0x09)
-                yasm_internal_error(N_("first opcode byte of XOP must be 0x08 or 0x09"));
+                insn->opcode.opcode[0] != 0x09 &&
+                insn->opcode.opcode[0] != 0x0A)
+                yasm_internal_error(N_("first opcode byte of XOP must be 0x08, 0x09, or 0x0A"));
             vex1 |= insn->opcode.opcode[0];
             /* Move opcode byte back one byte to make room for XOP prefix. */
             insn->opcode.opcode[2] = insn->opcode.opcode[1];