[ARM] MVE VMLAS

This adds extra patterns for the VMLAS MVE instruction, which performs
Qda = Qda * Qn + Rm, a pattern similar to the existing VMLA. The sinking
of splat(Rm) into the loop is already performed, so all that is needed
here are extra Pat records in tablegen.
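
For example, IR of roughly this shape (a sketch following the vmlasu32
tests updated below; the value names are illustrative) now selects to a
single vmlas:

  define arm_aapcs_vfpcc <4 x i32> @vmlasu32(<4 x i32> %A, <4 x i32> %B, i32 %X) {
  entry:
    %splatinsert = insertelement <4 x i32> undef, i32 %X, i32 0
    %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
    %mul = mul <4 x i32> %A, %B
    %add = add <4 x i32> %mul, %splat
    ret <4 x i32> %add
  }

which now compiles to

  vmlas.u32 q0, q1, r0
  bx lr

instead of a vmul followed by a vadd.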

Differential Revision: https://reviews.llvm.org/D75115
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 07cea41..ddd9d29 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5101,6 +5101,19 @@
                         (v16i8 (mul (v16i8 MQPR:$src2),
                                     (v16i8 (ARMvdup (i32 rGPR:$x))))))),
             (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;
+
+  def : Pat<(v4i32 (add (v4i32 (ARMvdup (i32 rGPR:$x))),
+                        (v4i32 (mul (v4i32 MQPR:$src1),
+                                    (v4i32 MQPR:$src2))))),
+            (v4i32 (MVE_VMLAS_qr_u32 $src1, $src2, $x))>;
+  def : Pat<(v8i16 (add (v8i16 (ARMvdup (i32 rGPR:$x))),
+                        (v8i16 (mul (v8i16 MQPR:$src1),
+                                    (v8i16 MQPR:$src2))))),
+            (v8i16 (MVE_VMLAS_qr_u16 $src1, $src2, $x))>;
+  def : Pat<(v16i8 (add (v16i8 (ARMvdup (i32 rGPR:$x))),
+                        (v16i8 (mul (v16i8 MQPR:$src1),
+                                    (v16i8 MQPR:$src2))))),
+            (v16i8 (MVE_VMLAS_qr_u8 $src1, $src2, $x))>;
 }
 
 let Predicates = [HasMVEFloat] in {
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 0224c2f9..5fd03a7 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -404,9 +404,8 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
 ; CHECK-NEXT:    vldrb.u32 q1, [r1], #4
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, r2
-; CHECK-NEXT:    vstrw.32 q0, [r3], #16
+; CHECK-NEXT:    vmlas.u32 q1, q0, r2
+; CHECK-NEXT:    vstrw.32 q1, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB5_5
 ; CHECK-NEXT:    b .LBB5_11
 ; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader.new
@@ -609,9 +608,8 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.s32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.s32 q1, [r1], #8
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, r2
-; CHECK-NEXT:    vstrw.32 q0, [r3], #16
+; CHECK-NEXT:    vmlas.u32 q1, q0, r2
+; CHECK-NEXT:    vstrw.32 q1, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB6_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -697,9 +695,8 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
 ; CHECK-NEXT:    vldrb.u32 q1, [r1], #4
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, r2
-; CHECK-NEXT:    vstrw.32 q0, [r3], #16
+; CHECK-NEXT:    vmlas.u32 q1, q0, r2
+; CHECK-NEXT:    vstrw.32 q1, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB7_5
 ; CHECK-NEXT:    b .LBB7_11
 ; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader.new
@@ -902,9 +899,8 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.u32 q1, [r1], #8
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, r2
-; CHECK-NEXT:    vstrw.32 q0, [r3], #16
+; CHECK-NEXT:    vmlas.u32 q1, q0, r2
+; CHECK-NEXT:    vstrw.32 q1, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -990,9 +986,8 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, r2
-; CHECK-NEXT:    vstrw.32 q0, [r3], #16
+; CHECK-NEXT:    vmlas.u32 q1, q0, r2
+; CHECK-NEXT:    vstrw.32 q1, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB9_5
 ; CHECK-NEXT:    b .LBB9_11
 ; CHECK-NEXT:  .LBB9_6: @ %for.body.preheader.new
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmla.ll b/llvm/test/CodeGen/Thumb2/mve-vmla.ll
index 925d477..5b0bdeb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmla.ll
@@ -197,8 +197,7 @@
 define arm_aapcs_vfpcc <4 x i32> @vmlasu32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
 ; CHECK-LABEL: vmlasu32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmlas.u32 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <4 x i32> undef, i32 %X, i32 0
@@ -211,8 +210,7 @@
 define arm_aapcs_vfpcc <4 x i32> @vmlasu32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
 ; CHECK-LABEL: vmlasu32b:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmlas.u32 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <4 x i32> undef, i32 %X, i32 0
@@ -225,8 +223,7 @@
 define arm_aapcs_vfpcc <8 x i16> @vmlasu16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
 ; CHECK-LABEL: vmlasu16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i16 q0, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q0, r0
+; CHECK-NEXT:    vmlas.u16 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <8 x i16> undef, i16 %X, i32 0
@@ -239,8 +236,7 @@
 define arm_aapcs_vfpcc <8 x i16> @vmlasu16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
 ; CHECK-LABEL: vmlasu16b:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i16 q0, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q0, r0
+; CHECK-NEXT:    vmlas.u16 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <8 x i16> undef, i16 %X, i32 0
@@ -253,8 +249,7 @@
 define arm_aapcs_vfpcc <16 x i8> @vmlasu8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
 ; CHECK-LABEL: vmlasu8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i8 q0, q0, q1
-; CHECK-NEXT:    vadd.i8 q0, q0, r0
+; CHECK-NEXT:    vmlas.u8 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <16 x i8> undef, i8 %X, i32 0
@@ -267,8 +262,7 @@
 define arm_aapcs_vfpcc <16 x i8> @vmlasu8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
 ; CHECK-LABEL: vmlasu8b:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i8 q0, q0, q1
-; CHECK-NEXT:    vadd.i8 q0, q0, r0
+; CHECK-NEXT:    vmlas.u8 q0, q1, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = insertelement <16 x i8> undef, i8 %X, i32 0
@@ -286,9 +280,8 @@
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, r1
-; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    vmlas.u32 q1, q0, r1
+; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB15_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    bx lr
@@ -325,9 +318,8 @@
 ; CHECK-NEXT:    vldrh.u16 q0, [r2]
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
 ; CHECK-NEXT:    subs r3, #8
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vadd.i16 q0, q0, r1
-; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    vmlas.u16 q1, q0, r1
+; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB16_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    bx lr
@@ -364,9 +356,8 @@
 ; CHECK-NEXT:    vldrh.u16 q0, [r2]
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
 ; CHECK-NEXT:    subs r3, #16
-; CHECK-NEXT:    vmul.i8 q0, q1, q0
-; CHECK-NEXT:    vadd.i8 q0, q0, r1
-; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    vmlas.u8 q1, q0, r1
+; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB17_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    bx lr