; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
; RUN: opt < %s -loop-vectorize -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
; RUN: opt < %s -loop-vectorize -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
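; For reference only (an informational comment, not a FileCheck directive):
; the masked intrinsics exercised by the checks below have these typed-pointer
; signatures; the v16 and addrspace(1) ("p1") variants are mangled analogously.
; No declarations are needed in this file, since the calls appear only inside
; CHECK comments.
;   declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32 immarg, <8 x i1>, <8 x i32>)
;   declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>)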
; The source code:
;
; void foo1(int *A, int *B, int *trigger) {
;   for (int i=0; i<10000; i++) {
;     if (trigger[i] < 100) {
;       A[i] = B[i] + trigger[i];
;     }
;   }
; }
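;
; An informal sketch of the expected transformation (the AVX1 checks below are
; the authoritative pattern; value names here are illustrative): the if-guard
; becomes a <8 x i1> lane mask that predicates both the load of B and the
; store to A, while the unconditional load of trigger stays a plain wide load:
;   %mask = icmp slt <8 x i32> %trig, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
;   %b    = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %pb, i32 4, <8 x i1> %mask, <8 x i32> poison)
;   %sum  = add nsw <8 x i32> %b, %trig
;   call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %sum, <8 x i32>* %pa, i32 4, <8 x i1> %mask)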
define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX1-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8*
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000
; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000
; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX1-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX1-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0
; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !3
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo1(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX2-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8*
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000
; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000
; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX2-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX2-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !3
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !3
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !3
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 24
; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !3
; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 8
; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 16
; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 24
; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP50:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo1(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX512-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8*
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000
; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000
; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !3
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16
; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !3
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 32
; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !3
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 48
; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !3
; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 16
; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 32
; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 48
; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX18]], 0
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[TMP50]], i32 0
; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP52]], align 4
; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison)
; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]]
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[TMP58]], i32 0
; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]])
; AVX512-NEXT: [[INDEX_NEXT19]] = add i64 [[INDEX18]], 8
; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 10000
; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N20:%.*]] = icmp eq i64 10000, 10000
; AVX512-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP62:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP62]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP63:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP63]], [[TMP62]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP12:!llvm.loop !.*]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
%1 = load i32, i32* %arrayidx3, align 4
%add = add nsw i32 %1, %0
%arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
store i32 %add, i32* %arrayidx7, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 10000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.inc
ret void
}
; The same as @foo1 but all the pointers are address space 1 pointers.
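; The expected codegen is identical apart from the address space: pointer
; operands live in addrspace(1) and the intrinsic names carry the p1 mangling,
; e.g. (names illustrative):
;   call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* %pb, i32 4, <8 x i1> %mask, <8 x i32> poison)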
define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* nocapture readonly %B, i32 addrspace(1)* nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1_addrspace1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)*
; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)*
; AVX1-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)*
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000
; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)*
; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000
; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)*
; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000
; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)*
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]]
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]]
; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]]
; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX1-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX1-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11
; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP5]], i32 0
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !14
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP9]], i32 0
; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo1_addrspace1(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)*
; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)*
; AVX2-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)*
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000
; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)*
; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000
; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)*
; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000
; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)*
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]]
; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX2-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX2-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0
; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 8
; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 24
; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 0
; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !14
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 8
; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !14
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !14
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 24
; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !14
; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 0
; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 8
; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 16
; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 24
; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP50:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo1_addrspace1(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)*
; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)*
; AVX512-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)*
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000
; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)*
; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000
; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)*
; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000
; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]]
; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !13
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16
; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !13
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 32
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !13
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 48
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !13
; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !16
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 16
; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !16
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 32
; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !16
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 48
; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !16
; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 0
; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !18, !noalias !20
; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 16
; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !18, !noalias !20
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 32
; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !18, !noalias !20
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 48
; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !18, !noalias !20
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP21:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX18]], 0
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP50]], i32 0
; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32 addrspace(1)* [[TMP51]] to <8 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP52]], align 4
; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP54]], i32 0
; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32 addrspace(1)* [[TMP55]] to <8 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison)
; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]]
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP58]], i32 0
; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP57]], <8 x i32> addrspace(1)* [[TMP60]], i32 4, <8 x i1> [[TMP53]])
; AVX512-NEXT: [[INDEX_NEXT19]] = add i64 [[INDEX18]], 8
; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 10000
; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N20:%.*]] = icmp eq i64 10000, 10000
; AVX512-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP62:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP62]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP63:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP63]], [[TMP62]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP23:!llvm.loop !.*]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %trigger, i64 %indvars.iv
%0 = load i32, i32 addrspace(1)* %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %indvars.iv
%1 = load i32, i32 addrspace(1)* %arrayidx3, align 4
%add = add nsw i32 %1, %0
%arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %indvars.iv
store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 10000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.inc
ret void
}
; The source code:
;
;void foo2(float *A, float *B, int *trigger) {
;
; for (int i=0; i<10000; i++) {
; if (trigger[i] < 100) {
; A[i] = B[i] + trigger[i];
; }
; }
;}
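; Note: unlike the i32 case, B is a float array here, so trigger[i] is
; widened with 'sitofp' before the 'fadd'; the masked load/store pattern in
; the checks below is otherwise the same.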
define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo2(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8*
; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX1-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8*
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000
; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000
; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8*
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX1-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX1-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21
; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison), !alias.scope !24
; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>*
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP14]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP14]] to float
; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP15]], [[CONV]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo2(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8*
; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX2-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8*
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000
; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000
; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8*
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX2-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX2-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison), !alias.scope !24
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 8
; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison), !alias.scope !24
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison), !alias.scope !24
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 24
; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison), !alias.scope !24
; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float>
; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float>
; AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x float>
; AVX2-NEXT: [[TMP36:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP32]]
; AVX2-NEXT: [[TMP37:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]]
; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]]
; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]]
; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 0
; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 8
; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 16
; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 24
; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to float
; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP54]], [[CONV]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo2(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8*
; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX512-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8*
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000
; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000
; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !24
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !24
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !24
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !24
; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison), !alias.scope !27
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 16
; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison), !alias.scope !27
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 32
; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison), !alias.scope !27
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 48
; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison), !alias.scope !27
; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float>
; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD13]] to <16 x float>
; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD14]] to <16 x float>
; AVX512-NEXT: [[TMP36:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP32]]
; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]]
; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]]
; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]]
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 0
; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !29, !noalias !31
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 16
; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !29, !noalias !31
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 32
; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !29, !noalias !31
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 48
; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !29, !noalias !31
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP32:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP53:%.*]] = add i64 [[INDEX18]], 0
; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP53]]
; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP56]], align 4
; AVX512-NEXT: [[TMP57:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP53]]
; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP58]], i32 0
; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to <8 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP60]], i32 4, <8 x i1> [[TMP57]], <8 x float> poison)
; AVX512-NEXT: [[TMP61:%.*]] = sitofp <8 x i32> [[WIDE_LOAD21]] to <8 x float>
; AVX512-NEXT: [[TMP62:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD22]], [[TMP61]]
; AVX512-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP53]]
; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 0
; AVX512-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <8 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP62]], <8 x float>* [[TMP65]], i32 4, <8 x i1> [[TMP57]])
; AVX512-NEXT: [[INDEX_NEXT19]] = add i64 [[INDEX18]], 8
; AVX512-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 10000
; AVX512-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP33:!llvm.loop !.*]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N20:%.*]] = icmp eq i64 10000, 10000
; AVX512-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP67]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP68:%.*]] = load float, float* [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP67]] to float
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP68]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP34:!llvm.loop !.*]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds float, float* %B, i64 %indvars.iv
%1 = load float, float* %arrayidx3, align 4
%conv = sitofp i32 %0 to float
%add = fadd float %1, %conv
%arrayidx7 = getelementptr inbounds float, float* %A, i64 %indvars.iv
store float %add, float* %arrayidx7, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 10000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.inc
ret void
}
; The source code:
;
;void foo3(double *A, double *B, int *trigger) {
;
; for (int i=0; i<10000; i++) {
; if (trigger[i] < 100) {
; A[i] = B[i] + trigger[i];
; }
; }
;}
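; foo3 is the double variant: the lane counts in the checks below halve
; accordingly (<4 x double> on AVX, <8 x double> on AVX512), with 8-byte
; alignment on the masked loads and stores.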
define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX-LABEL: @foo3(
; AVX-NEXT: entry:
; AVX-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
; AVX-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX: vector.memcheck:
; AVX-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000
; AVX-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
; AVX-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000
; AVX-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
; AVX-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX: vector.ph:
; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX: vector.body:
; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; AVX-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
; AVX-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
; AVX-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !31
; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
; AVX-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !31
; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !31
; AVX-NEXT: [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP19:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]]
; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]]
; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]]
; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP3]]
; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0
; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !34
; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 4
; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !34
; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8
; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !34
; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 12
; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !34
; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
; AVX-NEXT: [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double>
; AVX-NEXT: [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double>
; AVX-NEXT: [[TMP35:%.*]] = sitofp <4 x i32> [[WIDE_LOAD14]] to <4 x double>
; AVX-NEXT: [[TMP36:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP32]]
; AVX-NEXT: [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]]
; AVX-NEXT: [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]]
; AVX-NEXT: [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
; AVX-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
; AVX-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
; AVX-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]]
; AVX-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP3]]
; AVX-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 0
; AVX-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !36, !noalias !38
; AVX-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 4
; AVX-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !36, !noalias !38
; AVX-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 8
; AVX-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !36, !noalias !38
; AVX-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 12
; AVX-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38
; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]]
; AVX: middle.block:
; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX: scalar.ph:
; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX-NEXT: br label [[FOR_BODY:%.*]]
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100
; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX: if.then:
; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to double
; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]]
; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
; AVX-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX-NEXT: br label [[FOR_INC]]
; AVX: for.inc:
; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]]
; AVX: for.end:
; AVX-NEXT: ret void
;
; AVX512-LABEL: @foo3(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX512-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000
; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000
; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !35
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !35
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !35
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !35
; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !38
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8
; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !38
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 16
; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !38
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 24
; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !38
; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double>
; AVX512-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double>
; AVX512-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x double>
; AVX512-NEXT: [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP32]]
; AVX512-NEXT: [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]]
; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]]
; AVX512-NEXT: [[TMP39:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 0
; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !40, !noalias !42
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 8
; AVX512-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !40, !noalias !42
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 16
; AVX512-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !40, !noalias !42
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 24
; AVX512-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !40, !noalias !42
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP43:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP44:!llvm.loop !.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds double, double* %B, i64 %indvars.iv
%1 = load double, double* %arrayidx3, align 8
%conv = sitofp i32 %0 to double
%add = fadd double %1, %conv
%arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv
store double %add, double* %arrayidx7, align 8
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 10000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.inc
ret void
}
; The source code:
;
;void foo4(double *A, double *B, int *trigger) {
;
; for (int i=0; i<10000; i += 16) {
; if (trigger[i] < 100) {
; A[i] = B[i*2] + trigger[i]; << non-consecutive access
; }
; }
;}
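; With a step of 16, none of the accesses are consecutive, so the AVX targets
; leave this loop scalar (see the AVX checks below), while AVX512 vectorizes
; it using masked gather/scatter intrinsics (@llvm.masked.gather.v8i32.v8p0i32
; and friends) driven by a <8 x i64> vector induction variable.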
define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX-LABEL: @foo4(
; AVX-NEXT: entry:
; AVX-NEXT: br label [[FOR_BODY:%.*]]
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX: if.then:
; AVX-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
; AVX-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX-NEXT: br label [[FOR_INC]]
; AVX: for.inc:
; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; AVX: for.end:
; AVX-NEXT: ret void
;
; AVX512-LABEL: @foo4(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX512-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985
; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969
; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !45
; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !48
; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !50, !noalias !52
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP53:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]]
; AVX512-NEXT: [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP8]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP54:!llvm.loop !.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %entry, %for.inc
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%1 = shl nuw nsw i64 %indvars.iv, 1
%arrayidx3 = getelementptr inbounds double, double* %B, i64 %1
%2 = load double, double* %arrayidx3, align 8
%conv = sitofp i32 %0 to double
%add = fadd double %2, %conv
%arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv
store double %add, double* %arrayidx7, align 8
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
%cmp = icmp ult i64 %indvars.iv.next, 10000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.inc
ret void
}
@a = common global [1 x i32*] zeroinitializer, align 8
@c = common global i32* null, align 8
; The loop here should not be vectorized: the stored value is a trapping
; constant expression (the 'sdiv' below divides by zero unless the pointer
; comparison holds), so it cannot be speculated.
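; A hedged reconstruction of the C source (the original test omits it; the
; globals @a and @c above play the roles of 'a' and 'c'):
;
;int *a[1];
;int *c;
;void foo5(int *A, int *B, int *trigger) {
;
; for (int i=0; i<10000; i++) {
; if (trigger[i] < 100) {
; A[i] = 1 / (a + 1 == &c);
; }
; }
;}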
define void @foo5(i32* nocapture %A, i32* nocapture readnone %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX-LABEL: @foo5(
; AVX-NEXT: entry:
; AVX-NEXT: br label [[FOR_BODY:%.*]]
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX: if.then:
; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
; AVX-NEXT: br label [[FOR_INC]]
; AVX: for.inc:
; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; AVX: for.end:
; AVX-NEXT: ret void
;
; AVX512-LABEL: @foo5(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then:                                          ; preds = %for.body
  %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* %arrayidx7, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 10000
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc
  ret void
}
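
; Note: in both the AVX and AVX512 check lines above, the scalar loop is kept
; as-is and no vector.body is formed, which is what "not vectorized" means here.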
; Reverse loop
;void foo6(double *in, double *out, unsigned size, int *trigger) {
;
; for (int i=SIZE-1; i>=0; i--) { // SIZE == 4096 in this test
; if (trigger[i] > 0) {
; out[i] = in[i] + (double) 0.5;
; }
; }
;}
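; For this reverse loop the vectorizer is expected to load whole vectors at
; negative offsets and reverse both the data and the mask with shufflevector,
; as the check lines below verify.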
define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %size, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo6(
; AVX1-NEXT: entry:
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX1-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX1-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo6(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8*
; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX2-NEXT: [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8*
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096
; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096
; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096
; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]]
; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]]
; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX2-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX2-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -8
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -12
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3
; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3
; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3
; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE15]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE17]], zeroinitializer
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 0
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -3
; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !44
; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -4
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -3
; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !44
; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -3
; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !44
; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -12
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -3
; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !44
; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP42:%.*]] = fadd <4 x double> [[REVERSE25]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP43:%.*]] = fadd <4 x double> [[REVERSE28]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX2-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 -3
; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -4
; AVX2-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -3
; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8
; AVX2-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -3
; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -12
; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -3
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP50:!llvm.loop !.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo6(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8*
; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
; AVX512-NEXT: [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8*
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096
; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096
; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096
; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]]
; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -8
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -16
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -24
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -7
; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !55
; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !55
; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7
; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !55
; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7
; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !55
; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE15]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE17]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 0
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -7
; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !58
; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -7
; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !58
; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -16
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -7
; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !58
; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -24
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -7
; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !58
; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP42:%.*]] = fadd <8 x double> [[REVERSE25]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP43:%.*]] = fadd <8 x double> [[REVERSE28]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP40]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 -7
; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8
; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -7
; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -16
; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -7
; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -24
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -7
; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP63:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP64:!llvm.loop !.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 4095, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:                                          ; preds = %for.body
  %arrayidx3 = getelementptr inbounds double, double* %in, i64 %indvars.iv
  %1 = load double, double* %arrayidx3, align 8
  %add = fadd double %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
  store double %add, double* %arrayidx5, align 8
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.then
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp eq i64 %indvars.iv, 0
  br i1 %cmp, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc
  ret void
}
; void foo7 (double * __restrict__ out, double ** __restrict__ in,
; bool * __restrict__ trigger, unsigned size) {
;
; for (unsigned i=0; i<size; i++)
; if (trigger[i] && (in[i] != 0))
; out[i] = (double) 0.5;
; }
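; The condition chains a bool test with a pointer-vs-null test, so the inner
; double* load must itself be masked (llvm.masked.load.v4p0f64 /
; llvm.masked.load.v8p0f64 below), and the constant 0.5 splat is stored under
; the select-combined mask.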
define void @foo7(double* noalias nocapture %out, double** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
; AVX1-LABEL: @foo7(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX1: for.body.preheader:
; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]]
; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0
; AVX1-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4
; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8
; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12
; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1
; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX1: land.lhs.true:
; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
; AVX1-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP42:!llvm.loop !.*]]
; AVX1: for.end.loopexit:
; AVX1-NEXT: br label [[FOR_END]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo7(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX2: for.body.preheader:
; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; AVX2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
; AVX2-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0
; AVX2-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4
; AVX2-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8
; AVX2-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12
; AVX2-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP51:!llvm.loop !.*]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX2-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1
; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX2: land.lhs.true:
; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
; AVX2-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP52:!llvm.loop !.*]]
; AVX2: for.end.loopexit:
; AVX2-NEXT: br label [[FOR_END]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo7(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX512: for.body.preheader:
; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
; AVX512-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1
; AVX512-NEXT: [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x double*> poison)
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8
; AVX512-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x double*> poison)
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 16
; AVX512-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x double*> poison)
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 24
; AVX512-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x double*> poison)
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 16
; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 24
; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP65:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX512: land.lhs.true:
; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP66:!llvm.loop !.*]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  %cmp5 = icmp eq i32 %size, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %size to i64
  br label %for.body

for.body:                                         ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx, align 1
  %1 = and i8 %0, 1
  %tobool = icmp eq i8 %1, 0
  br i1 %tobool, label %for.inc, label %land.lhs.true

land.lhs.true:                                    ; preds = %for.body
  %arrayidx2 = getelementptr inbounds double*, double** %in, i64 %indvars.iv
  %2 = load double*, double** %arrayidx2, align 8
  %cmp3 = icmp eq double* %2, null
  br i1 %cmp3, label %for.inc, label %if.then

if.then:                                          ; preds = %land.lhs.true
  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
  store double 5.000000e-01, double* %arrayidx5, align 8
  br label %for.inc

for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc, %entry
  ret void
}
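; Note on the vector.body checks above: the AVX512 run uses VF=8 with an
; interleave factor of 4, so the induction variable advances by 32 per
; iteration. The store mask is the conjunction of the two guards; a minimal
; per-lane sketch in C-like pseudocode (a hedged illustration only, the
; variable names are not from this test):
;
;   bool live = (trigger[i] & 1) != 0;   // the vector 'and' + 'icmp eq 0' + 'xor true'
;   double *p = live ? in[i] : 0;        // llvm.masked.load: masked-off lanes are
;                                        //   not accessed (passthru here is poison)
;   bool do_store = live && (p != 0);    // select(live, p != null, false)
;   if (do_store) out[i] = 0.5;          // llvm.masked.store with the combined mask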
;typedef int (*fp)();
;void foo8 (double* __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
;
;  for (unsigned i=0; i<size; i++)
;    if (trigger[i] && (in[i] != 0))
;      out[i] = (double) 0.5;
;}
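;
; A minimal, hypothetical usage sketch of foo8 under the signature above
; ('some_fn' is illustrative and not part of this test). The load of in[i]
; must be masked because the C source only evaluates in[i] on iterations
; where trigger[i] is true:
;
;   int some_fn(void) { return 0; }
;   double out[4] = {0};
;   fp in[4] = { 0, some_fn, 0, some_fn };
;   bool trig[4] = { true, true, false, true };
;   foo8(out, in, trig, 4);   // out becomes {0, 0.5, 0, 0.5}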
define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
; AVX1-LABEL: @foo8(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX1: for.body.preheader:
; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0
; AVX1-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4
; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8
; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12
; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP44:!llvm.loop !.*]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1
; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX1: land.lhs.true:
; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
; AVX1-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP45:!llvm.loop !.*]]
; AVX1: for.end.loopexit:
; AVX1-NEXT: br label [[FOR_END]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo8(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX2: for.body.preheader:
; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; AVX2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
; AVX2-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0
; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4
; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8
; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12
; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP54:!llvm.loop !.*]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX2-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1
; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX2: land.lhs.true:
; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
; AVX2-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP55:!llvm.loop !.*]]
; AVX2: for.end.loopexit:
; AVX2-NEXT: br label [[FOR_END]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo8(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX512: for.body.preheader:
; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
; AVX512-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1
; AVX512-NEXT: [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0
; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x i32 ()*> poison)
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8
; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x i32 ()*> poison)
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 16
; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x i32 ()*> poison)
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 24
; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x i32 ()*> poison)
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 16
; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 24
; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP67:!llvm.loop !.*]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX512: land.lhs.true:
; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP68:!llvm.loop !.*]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  %cmp5 = icmp eq i32 %size, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %size to i64
  br label %for.body

for.body:                                         ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx, align 1
  %1 = and i8 %0, 1
  %tobool = icmp eq i8 %1, 0
  br i1 %tobool, label %for.inc, label %land.lhs.true

land.lhs.true:                                    ; preds = %for.body
  %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %in, i64 %indvars.iv
  %2 = load i32 ()*, i32 ()** %arrayidx2, align 8
  %cmp3 = icmp eq i32 ()* %2, null
  br i1 %cmp3, label %for.inc, label %if.then

if.then:                                          ; preds = %land.lhs.true
  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
  store double 5.000000e-01, double* %arrayidx5, align 8
  br label %for.inc

for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc, %entry
  ret void
}
attributes #0 = { norecurse nounwind }