[ARM NEON] Define vfms_f32 on ARM, and all vfms using vfma.

r259537 added vfma/vfms to armv7, but the builtin was only lowered
on the AArch64 side. Instead of also supporting it on ARM, get rid of
the builtin entirely.

The vfms builtin lowered to:
  %nb = fsub float -0.0, %b
  %r = @llvm.fma.f32(%a, %nb, %c)

Instead, define the operation in terms of vfma, and swap the
multiplicands. It now lowers to:
  %na = fsub float -0.0, %a
  %r = @llvm.fma.f32(%na, %b, %c)

This matches the instruction more closely, and lets current LLVM
generate the "natural" operand ordering:
  fmls.2s v0, v1, v2
instead of the crooked (but equivalent):
  fmls.2s v0, v2, v1
Except for these changes, the generated assembly is identical.
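
For illustration, a C-level sketch of what the new OP_FMLS expansion
amounts to (the real arm_neon.h is generated from arm_neon.td; the
helper name below is hypothetical):
  #include <arm_neon.h>  /* needs __ARM_FEATURE_FMA (VFPv4 / AArch64) */
  /* vfms(a, b, c) == a - b*c == vfma(a, -b, c): negating the first
     multiplicand keeps the addend in place, so LLVM sees
     fma(-b, c, a) and can pick the natural fmls operand order. */
  static inline float32x2_t sketch_vfms_f32(float32x2_t a, float32x2_t b,
                                            float32x2_t c) {
    return vfma_f32(a, vneg_f32(b), c);
  }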

LLVM accepts both commutations, and the LLVM tests in:
  test/CodeGen/AArch64/arm64-fmadd.ll
  test/CodeGen/AArch64/fp-dp3.ll
  test/CodeGen/AArch64/neon-fma.ll
  test/CodeGen/ARM/fusedMAC.ll
already check either only the new ordering, or both.

Also verified against the test-suite unittests.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@266807 91177308-0d34-0410-b5e6-96231b3b80d8
(cherry picked from commit fa8ab2562a582a60fb7dff9d873b65d84ab864f4)

Conflicts:
	test/CodeGen/aarch64-neon-2velem.c
	test/CodeGen/aarch64-neon-fma.c
	test/CodeGen/aarch64-neon-intrinsics.c
	test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index 4863566..6641ed2 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -339,6 +339,7 @@
                                          (call "vget_high", $p2))>;
 def OP_MLALHi_N : Op<(call "vmlal_n", $p0, (call "vget_high", $p1), $p2)>;
 def OP_MLS      : Op<(op "-", $p0, (op "*", $p1, $p2))>;
+def OP_FMLS     : Op<(call "vfma", $p0, (op "-", $p1), $p2)>;
 def OP_MLSL     : Op<(op "-", $p0, (call "vmull", $p1, $p2))>;
 def OP_MLSLHi   : Op<(call "vmlsl", $p0, (call "vget_high", $p1),
                                          (call "vget_high", $p2))>;
@@ -347,7 +348,7 @@
 def OP_MLA_N    : Op<(op "+", $p0, (op "*", $p1, (dup $p2)))>;
 def OP_MLS_N    : Op<(op "-", $p0, (op "*", $p1, (dup $p2)))>;
 def OP_FMLA_N   : Op<(call "vfma", $p0, $p1, (dup $p2))>;
-def OP_FMLS_N   : Op<(call "vfms", $p0, $p1, (dup $p2))>;
+def OP_FMLS_N   : Op<(call "vfma", $p0, (op "-", $p1), (dup $p2))>;
 def OP_MLAL_N   : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>;
 def OP_MLSL_N   : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>;
 def OP_MUL_LN   : Op<(op "*", $p0, (splat $p1, $p2))>;
@@ -377,8 +378,8 @@
 def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>;
 def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
 def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
-def OP_FMS_LN   : Op<(call "vfma_lane", $p0, $p1, (op "-", $p2), $p3)>;
-def OP_FMS_LNQ  : Op<(call "vfma_laneq", $p0, $p1, (op "-", $p2), $p3)>;
+def OP_FMS_LN   : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>;
+def OP_FMS_LNQ  : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>;
 def OP_TRN1     : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2),
                                                     (decimate mask1, 2)))>;
 def OP_ZIP1     : Op<(shuffle $p0, $p1, (lowhalf (interleave mask0, mask1)))>;
@@ -826,7 +827,7 @@
 
 let ArchGuard = "defined(__ARM_FEATURE_FMA)" in {
   def VFMA : SInst<"vfma", "dddd", "fQf">;
-  def VFMS : SInst<"vfms", "dddd", "fQf">;
+  def VFMS : SOpInst<"vfms", "dddd", "fQf", OP_FMLS>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -911,7 +912,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Vector fused multiply-add operations
 def FMLA : SInst<"vfma", "dddd", "dQd">;
-def FMLS : SInst<"vfms", "dddd", "dQd">;
+def FMLS : SOpInst<"vfms", "dddd", "dQd", OP_FMLS>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // MUL, MLA, MLS, FMA, FMS definitions with scalar argument
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index c73aa22..55f919e 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -5148,22 +5148,6 @@
     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]});
   }
-  case NEON::BI__builtin_neon_vfms_v:
-  case NEON::BI__builtin_neon_vfmsq_v: {  // Only used for FP types
-    // FIXME: probably remove when we no longer support aarch64_simd.h
-    // (arm_neon.h delegates to vfma).
-
-    // The ARM builtins (and instructions) have the addend as the first
-    // operand, but the 'fma' intrinsics have it last. Swap it around here.
-    Value *Subtrahend = Ops[0];
-    Value *Multiplicand = Ops[2];
-    Ops[0] = Multiplicand;
-    Ops[2] = Subtrahend;
-    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
-    Ops[1] = Builder.CreateFNeg(Ops[1]);
-    Int = Intrinsic::fma;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmls");
-  }
   case NEON::BI__builtin_neon_vmull_v:
     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
     Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
diff --git a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
index 4c2f4d7..a9d46cd 100644
--- a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
+++ b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
@@ -99,7 +99,6 @@
   // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
-// CHECK-LABEL: test_vfmss_lane_f32
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
   // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
@@ -111,7 +110,6 @@
   // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vfms_lane_f64
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
   // CHECK: {{fmls|fmsub}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
@@ -123,7 +121,6 @@
   // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
-// CHECK-LABEL: test_vfms_laneq_f64
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
   // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
diff --git a/test/CodeGen/arm64_vfma.c b/test/CodeGen/arm64_vfma.c
index bfa5687..12f3111 100644
--- a/test/CodeGen/arm64_vfma.c
+++ b/test/CodeGen/arm64_vfma.c
@@ -82,7 +82,7 @@
   // CHECK: test_vfms_f32
   return vfms_f32(a1, a2, a3);
   // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
-  // CHECK: llvm.fma.v2f32(<2 x float> %a3, <2 x float> [[NEG]], <2 x float> %a1)
+  // CHECK: llvm.fma.v2f32(<2 x float> [[NEG]], <2 x float> %a3, <2 x float> %a1)
   // CHECK-NEXT: ret
 }
 
@@ -90,7 +90,7 @@
   // CHECK: test_vfmsq_f32
   return vfmsq_f32(a1, a2, a3);
   // CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
-  // CHECK: llvm.fma.v4f32(<4 x float> %a3, <4 x float> [[NEG]], <4 x float> %a1)
+  // CHECK: llvm.fma.v4f32(<4 x float> [[NEG]], <4 x float> %a3, <4 x float> %a1)
   // CHECK-NEXT: ret
 }
 
@@ -98,7 +98,7 @@
   // CHECK: test_vfmsq_f64
   return vfmsq_f64(a1, a2, a3);
   // CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
-  // CHECK: llvm.fma.v2f64(<2 x double> %a3, <2 x double> [[NEG]], <2 x double> %a1)
+  // CHECK: llvm.fma.v2f64(<2 x double> [[NEG]], <2 x double> %a3, <2 x double> %a1)
   // CHECK-NEXT: ret
 }
 
@@ -107,9 +107,9 @@
   return vfms_lane_f32(a1, a2, a3, 1);
   // NB: the test below is deliberately lose, so that we don't depend too much
   // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
-  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
-  // CHECK: llvm.fma.v2f32(<2 x float> {{.*}}, <2 x float> [[LANE]], <2 x float> %a1)
+  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
+  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> %a3
+  // CHECK: llvm.fma.v2f32(<2 x float> [[NEG]], <2 x float> [[LANE]], <2 x float> %a1)
   // CHECK-NEXT: ret
 }
 
@@ -118,9 +118,9 @@
   return vfmsq_lane_f32(a1, a2, a3, 1);
   // NB: the test below is deliberately lose, so that we don't depend too much
   // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
-  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
-  // CHECK: llvm.fma.v4f32(<4 x float> {{.*}}, <4 x float> [[LANE]], <4 x float> %a1)
+  // CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
+  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> %a3
+  // CHECK: llvm.fma.v4f32(<4 x float> [[NEG]], <4 x float> [[LANE]], <4 x float> %a1)
   // CHECK-NEXT: ret
 }
 
@@ -129,8 +129,8 @@
   return vfmsq_lane_f64(a1, a2, a3, 0);
   // NB: the test below is deliberately lose, so that we don't depend too much
   // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <1 x double> {{.*}}, %a3
-  // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[NEG]]
-  // CHECK: llvm.fma.v2f64(<2 x double> {{.*}}, <2 x double> [[LANE]], <2 x double> %a1)
+  // CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
+  // CHECK: [[LANE:%.*]] = shufflevector <1 x double> %a3
+  // CHECK: llvm.fma.v2f64(<2 x double> [[NEG]], <2 x double> [[LANE]], <2 x double> %a1)
   // CHECK-NEXT: ret
 }
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index d92c32c..3a87211 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -2283,6 +2283,34 @@
   return vfmaq_f32(a, b, c);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
+// CHECK:   ret <2 x float> [[TMP6]]
+float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
+  return vfms_f32(a, b, c);
+}
+
+// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
+// CHECK:   ret <4 x float> [[TMP6]]
+float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+  return vfmsq_f32(a, b, c);
+}
+
 
 // CHECK-LABEL: test_vget_high_s8
 int8x8_t test_vget_high_s8(int8x16_t a) {