llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll - third_party/github.com/llvm/llvm-project - Git at Google

 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

 ; The following functions should all fail to become tail-predicated.
 ; CHECK-NOT: call i32 @llvm.arm.vctp

 ; trip.count.minus.1 has been inserted into element 1, not 0.
 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; The insert isn't using an undef for operand 0.
 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; The shuffle uses a defined value for operand 1.
 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; The shuffle uses a non zero value for operand 2.
 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; %N - 2
 define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.2 = add i32 %N, -2
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 1
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; index has been inserted at element 1, not 0.
 define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %incorrect = add i32 %index, 1
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; Now using ult, not ule for the vector icmp
 define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; The add in the body uses 1, 2, 3, 4
 define void @wrong_body_broadcast_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; Using a variable for the loop body broadcast.
 define void @wrong_body_broadcast_splat_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, %offsets
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 4
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 ; adding 5, instead of 4, to index.
 define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
   %tmp8 = add i32 %N, 3
   %tmp9 = lshr i32 %tmp8, 2
   %tmp10 = shl nuw i32 %tmp9, 2
   %tmp11 = add i32 %tmp10, -4
   %tmp12 = lshr i32 %tmp11, 2
   %tmp13 = add nuw nsw i32 %tmp12, 1
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
   br label %vector.body

 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
   %index.next = add i32 %index, 5
   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup

 for.cond.cleanup:                                 ; preds = %vector.body, %entry
   ret void
 }

 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
 declare void @llvm.set.loop.iterations.i32(i32) #3
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3