Index: llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1670,6 +1670,22 @@
     if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
       return nullptr;
 
+    // Try to reassociate loop invariant GEP chains to enable LICM.
+    if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2) {
+      if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+        Value *GO1 = GEP.getOperand(1);
+        Value *SO1 = Src->getOperand(1);
+        // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+        // invariant: this breaks the dependence between GEPs and allows LICM
+        // to hoist the invariant part out of the loop.
+        if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+          Src->setOperand(1, GO1);
+          GEP.setOperand(1, SO1);
+          return &GEP;
+        }
+      }
+    }
+
     // Note that if our source is a gep chain itself then we wait for that
     // chain to be resolved before we perform this transformation. This
     // avoids us creating a TON of code in some cases.
Index: llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @foo(i8* nocapture readnone %match, i32 %cur_match, i32 %best_len, i32 %scan_end, i32* nocapture readonly %prev, i32 %limit, i32 %chain_length, i8* nocapture readonly %win, i32 %wmask) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[IDX_EXT2:%.*]] = zext i32 [[CUR_MATCH:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR4:%.*]] = getelementptr inbounds i8, i8* [[WIN:%.*]], i64 [[IDX_EXT2]]
+; CHECK-NEXT: [[IDX_EXT1:%.*]] = zext i32 [[BEST_LEN:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR25:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR4]], i64 [[IDX_EXT1]]
+; CHECK-NEXT: [[ADD_PTR36:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR25]], i64 -1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ADD_PTR36]] to i32*
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT: [[CMP7:%.*]] = icmp eq i32 [[TMP1]], [[SCAN_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP7]], label [[DO_END:%.*]], label [[IF_THEN_LR_PH:%.*]]
+; CHECK: if.then.lr.ph:
+; CHECK-NEXT: br label [[IF_THEN:%.*]]
+; CHECK: do.body:
+; CHECK-NEXT: [[IDX_EXT:%.*]] = zext i32 [[TMP4:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[WIN]], i64 [[IDX_EXT1]]
+; CHECK-NEXT: [[ADD_PTR2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 -1
+; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR2]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ADD_PTR3]] to i32*
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP3]], [[SCAN_END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[DO_END]], label [[IF_THEN]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CUR_MATCH_ADDR_09:%.*]] = phi i32 [ [[CUR_MATCH]], [[IF_THEN_LR_PH]] ], [ [[TMP4]], [[DO_BODY:%.*]] ]
+; CHECK-NEXT: [[CHAIN_LENGTH_ADDR_08:%.*]] = phi i32 [ [[CHAIN_LENGTH:%.*]], [[IF_THEN_LR_PH]] ], [ [[DEC:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT: [[AND:%.*]] = and i32 
[[CUR_MATCH_ADDR_09]], [[WMASK:%.*]] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[AND]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREV:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP4]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP4]], [[LIMIT:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[LAND_LHS_TRUE:%.*]], label [[DO_END]] +; CHECK: land.lhs.true: +; CHECK-NEXT: [[DEC]] = add i32 [[CHAIN_LENGTH_ADDR_08]], -1 +; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-NEXT: br i1 [[CMP5]], label [[DO_END]], label [[DO_BODY]] +; CHECK: do.end: +; CHECK-NEXT: [[CONT_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 0, [[IF_THEN]] ], [ 0, [[LAND_LHS_TRUE]] ], [ 1, [[DO_BODY]] ] +; CHECK-NEXT: ret i32 [[CONT_0]] +; +entry: + %idx.ext2 = zext i32 %cur_match to i64 + %add.ptr4 = getelementptr inbounds i8, i8* %win, i64 %idx.ext2 + %idx.ext1 = zext i32 %best_len to i64 + %add.ptr25 = getelementptr inbounds i8, i8* %add.ptr4, i64 %idx.ext1 + %add.ptr36 = getelementptr inbounds i8, i8* %add.ptr25, i64 -1 + %0 = bitcast i8* %add.ptr36 to i32* + %1 = load i32, i32* %0, align 4 + %cmp7 = icmp eq i32 %1, %scan_end + br i1 %cmp7, label %do.end, label %if.then.lr.ph + +if.then.lr.ph: ; preds = %entry + br label %if.then + +do.body: ; preds = %land.lhs.true + %chain_length.addr.0 = phi i32 [ %dec, %land.lhs.true ] + %cur_match.addr.0 = phi i32 [ %4, %land.lhs.true ] + %idx.ext = zext i32 %cur_match.addr.0 to i64 + %add.ptr = getelementptr inbounds i8, i8* %win, i64 %idx.ext + %add.ptr2 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext1 + %add.ptr3 = getelementptr inbounds i8, i8* %add.ptr2, i64 -1 + %2 = bitcast i8* %add.ptr3 to i32* + %3 = load i32, i32* %2, align 4 + %cmp = icmp eq i32 %3, %scan_end + br i1 %cmp, label %do.end, label %if.then + +if.then: ; preds = %if.then.lr.ph, %do.body + %cur_match.addr.09 = phi i32 [ %cur_match, %if.then.lr.ph ], [ %cur_match.addr.0, %do.body ] + %chain_length.addr.08 = phi i32 [ %chain_length, %if.then.lr.ph ], [ %chain_length.addr.0, %do.body ] + %and = and i32 %cur_match.addr.09, %wmask + %idxprom = zext i32 %and to i64 + %arrayidx = getelementptr inbounds i32, i32* %prev, i64 %idxprom + %4 = load i32, i32* %arrayidx, align 4 + %cmp4 = icmp ugt i32 %4, %limit + br i1 %cmp4, label %land.lhs.true, label %do.end + +land.lhs.true: ; preds = %if.then + %dec = add i32 %chain_length.addr.08, -1 + %cmp5 = icmp eq i32 %dec, 0 + br i1 %cmp5, label %do.end, label %do.body + +do.end: ; preds = %do.body, %land.lhs.true, %if.then, %entry + %cont.0 = phi i32 [ 1, %entry ], [ 0, %if.then ], [ 0, %land.lhs.true ], [ 1, %do.body ] + ret i32 %cont.0 +} Index: llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -105,103 +105,105 @@ ; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] ; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label 
[[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX2: vector.body.preheader: +; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 8 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 24 ; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP0]], i64 16 +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 32 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 24 -; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], -; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], -; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 8 -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 16 -; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 24 -; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] -; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] -; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP20]], <8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i64 8 -; AVX2-NEXT: [[TMP27:%.*]] = bitcast 
i32* [[TMP26]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP21]], <8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i64 16 -; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP22]], <8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP24]], i64 24 -; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP23]], <8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] -; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP33]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP32]], i64 8 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[TMP32]], i64 16 -; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP37]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[TMP32]], i64 24 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP39]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], -; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], -; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], -; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] -; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP44]], i64 8 -; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP44]], i64 16 +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[B]], i64 24 +; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>* +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[B]], i64 32 +; AVX2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <8 x i32>* +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds 
i32, i32* [[A]], i64 8 +; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[A]], i64 24 +; AVX2-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <8 x i32>* +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[A]], i64 32 +; AVX2-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <8 x i32>* +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 8 +; AVX2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <8 x i32>* +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 24 +; AVX2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <8 x i32>* +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 32 +; AVX2-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <8 x i32>* +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; AVX2-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <8 x i32>* +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[B]], i64 24 +; AVX2-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <8 x i32>* +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[B]], i64 32 +; AVX2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <8 x i32>* +; AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; AVX2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <8 x i32>* +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[A]], i64 24 +; AVX2-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to <8 x i32>* +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 32 +; AVX2-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to <8 x i32>* +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP40]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP10]], i32 4, <8 x i1> [[TMP6]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP9]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 4, <8 x i1> [[TMP7]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP14]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP44:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX2-NEXT: [[TMP46:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX2-NEXT: [[TMP47:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP44]], <8 x i32>* [[TMP16]], i32 4, <8 x i1> [[TMP6]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP15]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP49:%.*]] = 
bitcast i32* [[TMP48]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP44]], i64 24 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP45]], <8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP46]], <8 x i32>* [[TMP18]], i32 4, <8 x i1> [[TMP7]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP47]], <8 x i32>* [[TMP20]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 +; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP22]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP21]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] -; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] -; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] -; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] -; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP51]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP24]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP26]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP52:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], +; AVX2-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], +; AVX2-NEXT: [[TMP54:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], +; AVX2-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], +; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP28]], i32 4, <8 x i1> [[TMP52]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[TMP27]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP52]], <8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[TMP56]], i64 8 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP53]], <8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[TMP56]], i64 16 -; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP54]], <8 x i32>* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP56]], i64 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP53]], <8 x i32> 
undef), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP30]], i32 4, <8 x i1> [[TMP54]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP32]], i32 4, <8 x i1> [[TMP55]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP58:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX2-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX2-NEXT: [[TMP60:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX2-NEXT: [[TMP61:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP58]], <8 x i32>* [[TMP34]], i32 4, <8 x i1> [[TMP52]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP33]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP55]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP59]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP53]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP60]], <8 x i32>* [[TMP36]], i32 4, <8 x i1> [[TMP54]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP61]], <8 x i32>* [[TMP38]], i32 4, <8 x i1> [[TMP55]]), !alias.scope !5, !noalias !7 ; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 ; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8 ; AVX2: for.body.preheader: -; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] @@ -274,103 +276,105 @@ ; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] ; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX512: vector.body.preheader: +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 16 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 48 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* 
[[TMP3]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP0]], i64 32 +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 64 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 48 -; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], -; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], -; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 16 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 32 -; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 48 -; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] -; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] -; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP20]], <16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i64 16 -; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP21]], <16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i64 32 -; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP22]], <16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope 
!5, !noalias !7 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP24]], i64 48 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP23]], <16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP33]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP32]], i64 16 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[TMP32]], i64 32 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP37]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[TMP32]], i64 48 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP39]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], -; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], -; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], -; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] -; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP44]], i64 16 -; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP44]], i64 32 +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP6:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP7:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <16 x i32>* +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[B]], i64 48 +; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <16 x i32>* +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[B]], i64 64 +; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>* +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <16 x i32>* +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[A]], i64 48 +; AVX512-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <16 x i32>* +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[A]], i64 64 +; AVX512-NEXT: [[TMP20:%.*]] = bitcast 
i32* [[TMP19]] to <16 x i32>* +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 16 +; AVX512-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <16 x i32>* +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 48 +; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>* +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 64 +; AVX512-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <16 x i32>* +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; AVX512-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <16 x i32>* +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[B]], i64 48 +; AVX512-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <16 x i32>* +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[B]], i64 64 +; AVX512-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <16 x i32>* +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>* +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[A]], i64 48 +; AVX512-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to <16 x i32>* +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 64 +; AVX512-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to <16 x i32>* +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP40]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP10]], i32 4, <16 x i1> [[TMP6]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP9]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP12]], i32 4, <16 x i1> [[TMP7]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP14]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP44:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX512-NEXT: [[TMP46:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX512-NEXT: [[TMP47:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP44]], <16 x i32>* [[TMP16]], i32 4, <16 x i1> [[TMP6]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP15]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), 
!alias.scope !3 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP44]], i64 48 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP45]], <16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP46]], <16 x i32>* [[TMP18]], i32 4, <16 x i1> [[TMP7]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP47]], <16 x i32>* [[TMP20]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 +; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP22]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP21]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] -; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] -; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] -; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] -; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP51]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP24]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP26]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP52:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], +; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], +; AVX512-NEXT: [[TMP54:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], +; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP28]], i32 4, <16 x i1> [[TMP52]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[TMP27]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP52]], <16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[TMP56]], i64 16 -; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP53]], <16 x i32>* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[TMP56]], i64 32 -; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP54]], <16 x i32>* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP56]], i64 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP53]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: 
[[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP30]], i32 4, <16 x i1> [[TMP54]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP32]], i32 4, <16 x i1> [[TMP55]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP58:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX512-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX512-NEXT: [[TMP60:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX512-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP58]], <16 x i32>* [[TMP34]], i32 4, <16 x i1> [[TMP52]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP33]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP55]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP59]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP53]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP60]], <16 x i32>* [[TMP36]], i32 4, <16 x i1> [[TMP54]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP61]], <16 x i32>* [[TMP38]], i32 4, <16 x i1> [[TMP55]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 ; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8 ; AVX512: for.body.preheader: -; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] @@ -578,103 +582,105 @@ ; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] ; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX2: vector.body.preheader: +; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 8 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 24 ; AVX2-NEXT: 
[[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 32 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 24 -; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], -; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], -; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 8 -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 16 -; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 24 -; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] -; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] -; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP20]], <8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 8 -; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x 
i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP21]], <8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 16 -; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP22]], <8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 24 -; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP23]], <8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] -; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 8 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 16 -; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 24 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], -; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], -; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], -; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] -; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 8 -; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 16 +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; 
AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 8 +; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32 addrspace(1)* [[TMP9]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 24 +; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(1)* [[TMP11]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 32 +; AVX2-NEXT: [[TMP14:%.*]] = bitcast i32 addrspace(1)* [[TMP13]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 8 +; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32 addrspace(1)* [[TMP15]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 24 +; AVX2-NEXT: [[TMP18:%.*]] = bitcast i32 addrspace(1)* [[TMP17]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 32 +; AVX2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(1)* [[TMP19]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 8 +; AVX2-NEXT: [[TMP22:%.*]] = bitcast i32 addrspace(1)* [[TMP21]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 24 +; AVX2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(1)* [[TMP23]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 32 +; AVX2-NEXT: [[TMP26:%.*]] = bitcast i32 addrspace(1)* [[TMP25]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 8 +; AVX2-NEXT: [[TMP28:%.*]] = bitcast i32 addrspace(1)* [[TMP27]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 24 +; AVX2-NEXT: [[TMP30:%.*]] = bitcast i32 addrspace(1)* [[TMP29]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 32 +; AVX2-NEXT: [[TMP32:%.*]] = bitcast i32 addrspace(1)* [[TMP31]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 8 +; AVX2-NEXT: [[TMP34:%.*]] = bitcast i32 addrspace(1)* [[TMP33]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 24 +; AVX2-NEXT: [[TMP36:%.*]] = bitcast i32 addrspace(1)* [[TMP35]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 32 +; AVX2-NEXT: [[TMP38:%.*]] = bitcast i32 addrspace(1)* [[TMP37]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP40:%.*]] = bitcast i32 addrspace(1)* [[TMP39]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP40]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP10]], i32 4, <8 x i1> [[TMP6]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP9]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x 
i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP12]], i32 4, <8 x i1> [[TMP7]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP14]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP44:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX2-NEXT: [[TMP46:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX2-NEXT: [[TMP47:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP44]], <8 x i32> addrspace(1)* [[TMP16]], i32 4, <8 x i1> [[TMP6]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP15]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 24 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP45]], <8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP46]], <8 x i32> addrspace(1)* [[TMP18]], i32 4, <8 x i1> [[TMP7]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP47]], <8 x i32> addrspace(1)* [[TMP20]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 +; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP22]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP21]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] -; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] -; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] -; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] -; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP51]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP24]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP26]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP52:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], +; AVX2-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> 
[[WIDE_LOAD22_1]], +; AVX2-NEXT: [[TMP54:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], +; AVX2-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], +; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP28]], i32 4, <8 x i1> [[TMP52]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP27]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP52]], <8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 8 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP53]], <8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 16 -; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP54]], <8 x i32> addrspace(1)* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP53]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP30]], i32 4, <8 x i1> [[TMP54]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP32]], i32 4, <8 x i1> [[TMP55]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP58:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX2-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX2-NEXT: [[TMP60:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX2-NEXT: [[TMP61:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP58]], <8 x i32> addrspace(1)* [[TMP34]], i32 4, <8 x i1> [[TMP52]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP33]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP55]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP59]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP53]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP60]], <8 x i32> addrspace(1)* [[TMP36]], i32 4, <8 x i1> [[TMP54]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP61]], <8 x i32> addrspace(1)* [[TMP38]], i32 4, <8 x i1> [[TMP55]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 ; AVX2-NEXT: 
[[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 ; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19 ; AVX2: for.body.preheader: -; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] @@ -747,103 +753,105 @@ ; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] ; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX512: vector.body.preheader: +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 16 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 48 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 32 +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 64 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 48 -; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], -; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], -; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 16 -; AVX512-NEXT: [[TMP15:%.*]] = 
bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 32 -; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 48 -; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] -; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] -; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP20]], <16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 16 -; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP21]], <16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 32 -; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP22]], <16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 48 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP23]], <16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 16 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP36:%.*]] = 
getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 32 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 48 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], -; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], -; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], -; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] -; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 16 -; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 32 +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP6:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP7:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 16 +; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32 addrspace(1)* [[TMP9]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 48 +; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(1)* [[TMP11]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 64 +; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32 addrspace(1)* [[TMP13]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 16 +; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32 addrspace(1)* [[TMP15]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 48 +; AVX512-NEXT: [[TMP18:%.*]] = bitcast i32 addrspace(1)* [[TMP17]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 64 +; AVX512-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(1)* [[TMP19]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 16 +; AVX512-NEXT: [[TMP22:%.*]] = bitcast i32 addrspace(1)* [[TMP21]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 48 +; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(1)* [[TMP23]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, 
i32 addrspace(1)* [[TRIGGER]], i64 64 +; AVX512-NEXT: [[TMP26:%.*]] = bitcast i32 addrspace(1)* [[TMP25]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 16 +; AVX512-NEXT: [[TMP28:%.*]] = bitcast i32 addrspace(1)* [[TMP27]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 48 +; AVX512-NEXT: [[TMP30:%.*]] = bitcast i32 addrspace(1)* [[TMP29]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 64 +; AVX512-NEXT: [[TMP32:%.*]] = bitcast i32 addrspace(1)* [[TMP31]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 16 +; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32 addrspace(1)* [[TMP33]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 48 +; AVX512-NEXT: [[TMP36:%.*]] = bitcast i32 addrspace(1)* [[TMP35]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 64 +; AVX512-NEXT: [[TMP38:%.*]] = bitcast i32 addrspace(1)* [[TMP37]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP40:%.*]] = bitcast i32 addrspace(1)* [[TMP39]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP40]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP10]], i32 4, <16 x i1> [[TMP6]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP9]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP12]], i32 4, <16 x i1> [[TMP7]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP14]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP44:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX512-NEXT: [[TMP46:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX512-NEXT: [[TMP47:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP44]], <16 x i32> addrspace(1)* [[TMP16]], i32 4, <16 x i1> [[TMP6]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP15]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> 
@llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 48 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP45]], <16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP46]], <16 x i32> addrspace(1)* [[TMP18]], i32 4, <16 x i1> [[TMP7]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP47]], <16 x i32> addrspace(1)* [[TMP20]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 +; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP22]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP21]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] -; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] -; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] -; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] -; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP51]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP24]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP26]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP52:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], +; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], +; AVX512-NEXT: [[TMP54:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], +; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP28]], i32 4, <16 x i1> [[TMP52]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP27]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP52]], <16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 16 -; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP53]], <16 x i32> addrspace(1)* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 32 -; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <16 x i32> addrspace(1)* 
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP54]], <16 x i32> addrspace(1)* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP53]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP30]], i32 4, <16 x i1> [[TMP54]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP32]], i32 4, <16 x i1> [[TMP55]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP58:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX512-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX512-NEXT: [[TMP60:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX512-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP58]], <16 x i32> addrspace(1)* [[TMP34]], i32 4, <16 x i1> [[TMP52]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP33]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP55]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP59]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP53]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP60]], <16 x i32> addrspace(1)* [[TMP36]], i32 4, <16 x i1> [[TMP54]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP61]], <16 x i32> addrspace(1)* [[TMP38]], i32 4, <16 x i1> [[TMP55]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 ; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19 ; AVX512: for.body.preheader: -; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] @@ -988,62 +996,64 @@ ; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] ; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX1: vector.body: -; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: br i1 
[[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX1: vector.body.preheader: +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 8 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 24 ; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 32 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 -; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], -; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], -; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] -; AVX1-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 -; AVX1-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 8 -; AVX1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 16 -; AVX1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 24 +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 8 +; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[B]], i64 24 +; AVX1-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr float, float* [[B]], i64 32 +; AVX1-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <8 x float>* +; AVX1-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX1-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> +; AVX1-NEXT: [[TMP19:%.*]] = sitofp 
<8 x i32> [[WIDE_LOAD24]] to <8 x float> +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 8 ; AVX1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 -; AVX1-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX1-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> -; AVX1-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> -; AVX1-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float> -; AVX1-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] -; AVX1-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] -; AVX1-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] -; AVX1-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] -; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] -; AVX1-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 8 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 16 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 24 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[A]], i64 24 +; AVX1-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[A]], i64 32 +; AVX1-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* +; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP27]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP28:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[TMP11]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP30]], i32 4, <8 x i1> [[TMP28]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> 
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP16]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP31:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> +; AVX1-NEXT: [[TMP32:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP17]] +; AVX1-NEXT: [[TMP33:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP31]] +; AVX1-NEXT: [[TMP34:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP18]] +; AVX1-NEXT: [[TMP35:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP19]] +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP32]], <8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP20]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP33]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP28]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP34]], <8 x float>* [[TMP23]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP35]], <8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 ; AVX1: for.body.preheader: -; AVX1-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX1-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ] @@ -1094,62 +1104,64 @@ ; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] ; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX2: vector.body.preheader: +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 8 ; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 24 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP6:%.*]] = 
getelementptr i32, i32* [[TRIGGER]], i64 32 ; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 -; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], -; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], -; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 8 -; AVX2-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 16 -; AVX2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 24 +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 8 +; AVX2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[B]], i64 24 +; AVX2-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr float, float* [[B]], i64 32 +; AVX2-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <8 x float>* +; AVX2-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX2-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> +; AVX2-NEXT: [[TMP19:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float> +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 8 ; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 -; AVX2-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX2-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> -; AVX2-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> -; AVX2-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float> -; AVX2-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] -; AVX2-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] -; AVX2-NEXT: 
[[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] -; AVX2-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 8 -; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 16 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 24 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[A]], i64 24 +; AVX2-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[A]], i64 32 +; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP27]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP28:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[TMP11]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP30]], i32 4, <8 x i1> [[TMP28]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP16]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP31:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> +; AVX2-NEXT: [[TMP32:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP17]] +; AVX2-NEXT: [[TMP33:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP31]] +; AVX2-NEXT: [[TMP34:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP18]] +; AVX2-NEXT: [[TMP35:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP19]] +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP32]], <8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP20]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* -; AVX2-NEXT: call void 
@llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP33]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP28]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP34]], <8 x float>* [[TMP23]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP35]], <8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 ; AVX2: for.body.preheader: -; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] @@ -1228,62 +1240,64 @@ ; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] ; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX512: vector.body.preheader: +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 16 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 48 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 32 +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 64 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 48 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], -; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 
[[INDEX]] -; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 16 -; AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 32 -; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 48 +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 16 +; AVX512-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <16 x float>* +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[B]], i64 48 +; AVX512-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <16 x float>* +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr float, float* [[B]], i64 64 +; AVX512-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <16 x float>* +; AVX512-NEXT: [[TMP17:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> +; AVX512-NEXT: [[TMP18:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float> +; AVX512-NEXT: [[TMP19:%.*]] = sitofp <16 x i32> [[WIDE_LOAD24]] to <16 x float> +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 16 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24 -; AVX512-NEXT: [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> -; AVX512-NEXT: [[TMP23:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float> -; AVX512-NEXT: [[TMP24:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float> -; AVX512-NEXT: [[TMP25:%.*]] = sitofp <16 x i32> [[WIDE_LOAD24]] to <16 x float> -; AVX512-NEXT: [[TMP26:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] -; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] -; AVX512-NEXT: [[TMP29:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP26]], <16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 16 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>* -; AVX512-NEXT: call void 
@llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 32 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP28]], <16 x float>* [[TMP35]], i32 4, <16 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 48 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[A]], i64 48 +; AVX512-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <16 x float>* +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[A]], i64 64 +; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>* +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP27]], align 4, !alias.scope !21 +; AVX512-NEXT: [[TMP28:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP12]], i32 4, <16 x i1> [[TMP8]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[TMP11]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <16 x float>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP30]], i32 4, <16 x i1> [[TMP28]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP16]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[TMP31:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float> +; AVX512-NEXT: [[TMP32:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP17]] +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD25]], [[TMP31]] +; AVX512-NEXT: [[TMP34:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD26]], [[TMP18]] +; AVX512-NEXT: [[TMP35:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD27]], [[TMP19]] +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP32]], <16 x float>* [[TMP21]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP20]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP29]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP33]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP28]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP34]], <16 x float>* [[TMP23]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void 
@llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP35]], <16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 ; AVX512: for.body.preheader: -; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] @@ -1433,62 +1447,64 @@ ; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] ; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] -; AVX1: vector.body: -; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX1: vector.body.preheader: +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 4 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 4 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 12 ; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 16 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 12 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 -; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], -; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], -; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] -; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 -; AVX1-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 4 -; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x 
double> undef), !alias.scope !34 -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 8 -; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34 -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 12 +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 +; AVX1-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], i64 4 +; AVX1-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>* +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[B]], i64 12 +; AVX1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[B]], i64 16 +; AVX1-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <4 x double>* +; AVX1-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX1-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double> +; AVX1-NEXT: [[TMP19:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double> +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[A]], i64 4 ; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34 -; AVX1-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> -; AVX1-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double> -; AVX1-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double> -; AVX1-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double> -; AVX1-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]] -; AVX1-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]] -; AVX1-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]] -; AVX1-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]] -; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] -; AVX1-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 4 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 8 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 12 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[A]], i64 12 +; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* 
[[TMP22]] to <4 x double>* +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[A]], i64 16 +; AVX1-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !alias.scope !31 +; AVX1-NEXT: [[TMP28:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP12]], i32 8, <4 x i1> [[TMP8]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP11]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[TMP28]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP16]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[TMP31:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double> +; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP17]] +; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP31]] +; AVX1-NEXT: [[TMP34:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP18]] +; AVX1-NEXT: [[TMP35:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP19]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP8]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP20]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP28]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP34]], <4 x double>* [[TMP23]], i32 8, <4 x i1> [[TMP9]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP35]], <4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; 
AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 @@ -1536,62 +1552,64 @@ ; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] ; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX2: vector.body.preheader: +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 4 ; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 12 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 16 ; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 12 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 -; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], -; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], -; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 4 -; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 8 -; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 12 +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> 
[[WIDE_LOAD24]], +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], i64 4 +; AVX2-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>* +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[B]], i64 12 +; AVX2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[B]], i64 16 +; AVX2-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <4 x double>* +; AVX2-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX2-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double> +; AVX2-NEXT: [[TMP19:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double> +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[A]], i64 4 ; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34 -; AVX2-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> -; AVX2-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double> -; AVX2-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double> -; AVX2-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double> -; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]] -; AVX2-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]] -; AVX2-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]] -; AVX2-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] -; AVX2-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 4 -; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 8 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 12 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[A]], i64 12 +; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[A]], i64 16 +; AVX2-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !alias.scope !31 +; AVX2-NEXT: [[TMP28:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> 
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP12]], i32 8, <4 x i1> [[TMP8]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP11]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[TMP28]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP16]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[TMP31:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double> +; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP17]] +; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP31]] +; AVX2-NEXT: [[TMP34:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP18]] +; AVX2-NEXT: [[TMP35:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP19]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP8]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP20]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP28]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP34]], <4 x double>* [[TMP23]], i32 8, <4 x i1> [[TMP9]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP35]], <4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 0, [[ENTRY]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 @@ -1667,62 +1685,64 @@ ; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] ; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] -; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label 
[[VECTOR_BODY_PREHEADER:%.*]] +; AVX512: vector.body.preheader: +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 8 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !31 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 24 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 32 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31 -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31 -; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], -; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 8 -; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 16 -; AVX512-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 24 +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], i64 8 +; AVX512-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <8 x double>* +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[B]], i64 24 +; AVX512-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <8 x double>* +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[B]], i64 32 +; AVX512-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <8 x double>* +; AVX512-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> +; AVX512-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> 
[[WIDE_LOAD23]] to <8 x double> +; AVX512-NEXT: [[TMP19:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x double> +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[A]], i64 8 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34 -; AVX512-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> -; AVX512-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double> -; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x double> -; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x double> -; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP22]] -; AVX512-NEXT: [[TMP27:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]] -; AVX512-NEXT: [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]] -; AVX512-NEXT: [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]] -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 8 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 16 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP28]], <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 24 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[A]], i64 24 +; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>* +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[A]], i64 32 +; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP27]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP28:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP12]], i32 8, <8 x i1> [[TMP8]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP11]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[TMP28]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 
x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP14]], i32 8, <8 x i1> [[TMP9]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP16]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[TMP31:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double> +; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP17]] +; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD25]], [[TMP31]] +; AVX512-NEXT: [[TMP34:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD26]], [[TMP18]] +; AVX512-NEXT: [[TMP35:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD27]], [[TMP19]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP21]], i32 8, <8 x i1> [[TMP8]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP20]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP29]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP28]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP34]], <8 x double>* [[TMP23]], i32 8, <8 x i1> [[TMP9]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP35]], <8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !39 ; AVX512: for.body.preheader: -; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 9984, [[VECTOR_BODY]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] @@ -2421,102 +2441,95 @@ ; AVX1-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] ; AVX1-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] ; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] -; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] -; AVX1: vector.body: -; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -3 +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX1: vector.body.preheader: +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 -3 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -10 ; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 -; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> 
[[WIDE_LOAD]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 -; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -14 +; AVX1-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -18 +; AVX1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer -; AVX1-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer -; AVX1-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer -; AVX1-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 -; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> -; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 -; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX1-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[IN]], i64 -3 +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[IN]], i64 -10 +; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr double, 
double* [[IN]], i64 -14 +; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <4 x double>* +; AVX1-NEXT: [[TMP17:%.*]] = getelementptr double, double* [[IN]], i64 -18 +; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>* +; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 -3 +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[OUT]], i64 -10 +; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[OUT]], i64 -14 ; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 -; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> -; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 -; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> -; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], -; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], -; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], -; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[OUT]], i64 -18 +; AVX1-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP28:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP12]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP28]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* 
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP14]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP16]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP18]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], +; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], +; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], +; AVX1-NEXT: [[TMP34:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], +; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP19]], i64 [[OFFSET_IDX]] ; AVX1-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX1-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 -; AVX1-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 -; AVX1-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP21]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP34]], <4 x double>* [[TMP25]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX1-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX1-NEXT: br i1 [[TMP37]], label 
[[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX1-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP38]], 0 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX1-NEXT: [[TMP39:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP39]], 5.000000e-01 ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] -; AVX1-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX1-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP40]], 0 ; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; AVX1: if.then.1: ; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] -; AVX1-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 -; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX1-NEXT: [[TMP41:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP41]], 5.000000e-01 ; AVX1-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] ; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 ; AVX1-NEXT: br label [[FOR_INC_1]] @@ -2539,128 +2552,121 @@ ; AVX2-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] ; AVX2-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] -; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -3 +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX2: vector.body.preheader: +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 -3 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -10 ; AVX2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; AVX2-NEXT: 
[[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 -; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 -; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -14 +; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -18 +; AVX2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer -; AVX2-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer -; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 -; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 -; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[IN]], i64 -3 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[IN]], i64 -10 +; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> 
undef, <4 x i32> +; AVX2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[IN]], i64 -14 +; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <4 x double>* +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr double, double* [[IN]], i64 -18 +; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>* +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 -3 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[OUT]], i64 -10 +; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[OUT]], i64 -14 ; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 -; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 -; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], -; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], -; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], -; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[OUT]], i64 -18 +; AVX2-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP12]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: 
[[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP28]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP14]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP16]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP18]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], +; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], +; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], +; AVX2-NEXT: [[TMP34:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP19]], i64 [[OFFSET_IDX]] ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX2-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 -; AVX2-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 -; AVX2-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP21]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP34]], <4 x double>* [[TMP25]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP46]], label 
[[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX2-NEXT: br i1 [[TMP37]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX2-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP38]], 0 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX2-NEXT: [[TMP39:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP39]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] -; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX2-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP40]], 0 ; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; AVX2: if.then.1: ; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] -; AVX2-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 -; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX2-NEXT: [[TMP41:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP41]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] ; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 ; AVX2-NEXT: br label [[FOR_INC_1]] ; AVX2: for.inc.1: ; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 ; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] -; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 -; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP42]], 0 ; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] ; AVX2: if.then.2: ; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] -; AVX2-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 -; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX2-NEXT: [[TMP43:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX2-NEXT: [[ADD_2:%.*]] = 
fadd double [[TMP43]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] ; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 ; AVX2-NEXT: br label [[FOR_INC_2]] ; AVX2: for.inc.2: ; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 ; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] -; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 -; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX2-NEXT: [[TMP44:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP44]], 0 ; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] ; AVX2: if.then.3: ; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] -; AVX2-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 -; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01 +; AVX2-NEXT: [[TMP45:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP45]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] ; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 ; AVX2-NEXT: br label [[FOR_INC_3]] @@ -2683,128 +2689,121 @@ ; AVX512-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] ; AVX512-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] -; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -7 +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY_PREHEADER:%.*]] +; AVX512: vector.body.preheader: +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 -7 +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -22 ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53 -; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -7 -; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53 +; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -16 -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -7 -; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53 +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* 
[[TRIGGER]], i64 -30 +; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -24 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -7 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53 +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 -38 +; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -7 -; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> -; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -7 -; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP9:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer +; AVX512-NEXT: [[TMP10:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[IN]], i64 -7 +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[IN]], i64 -22 +; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <8 x double>* +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[IN]], i64 -30 +; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <8 x double>* +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr double, double* [[IN]], i64 -38 +; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <8 x double>* +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 -7 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[OUT]], i64 -22 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>* +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[OUT]], i64 -30 ; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>* -; 
AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -16 -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -7 -; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> -; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -24 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -7 -; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> -; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], -; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], -; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], -; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -7 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[OUT]], i64 -38 +; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP2]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP27]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP28:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP12]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP28]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP14]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP16]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP18]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56 +; 
AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], +; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], +; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], +; AVX512-NEXT: [[TMP34:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP19]], i64 [[OFFSET_IDX]] ; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -7 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -16 -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -7 -; AVX512-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -24 -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -7 -; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP21]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP23]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP34]], <8 x double>* [[TMP25]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; AVX512-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX512-NEXT: br i1 [[TMP37]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX512-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP38]], 0 ; AVX512-NEXT: br i1 [[CMP1]], label 
[[IF_THEN:%.*]], label [[FOR_INC:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX512-NEXT: [[TMP39:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP39]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] -; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX512-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP40]], 0 ; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; AVX512: if.then.1: ; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] -; AVX512-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 -; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX512-NEXT: [[TMP41:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP41]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] ; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 ; AVX512-NEXT: br label [[FOR_INC_1]] ; AVX512: for.inc.1: ; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 ; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] -; AVX512-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 -; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX512-NEXT: [[TMP42:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP42]], 0 ; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] ; AVX512: if.then.2: ; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] -; AVX512-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 -; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX512-NEXT: [[TMP43:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP43]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] ; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 ; AVX512-NEXT: br label [[FOR_INC_2]] ; AVX512: for.inc.2: ; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 ; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] -; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 -; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX512-NEXT: [[TMP44:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP44]], 0 ; AVX512-NEXT: br i1 
[[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] ; AVX512: if.then.3: ; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] -; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 -; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01 +; AVX512-NEXT: [[TMP45:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP45]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] ; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 ; AVX512-NEXT: br label [[FOR_INC_3]] @@ -2886,61 +2885,61 @@ ; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] ; AVX: vector.ph: ; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 -; AVX-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX: vector.body: -; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 4 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 12 ; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 16 ; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>* -; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 12 -; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* -; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], -; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], -; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], -; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer -; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 +; AVX-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP9:%.*]] = icmp ne <4 x i8> [[TMP6]], zeroinitializer +; AVX-NEXT: [[TMP10:%.*]] = icmp ne <4 x i8> [[TMP7]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 4 +; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> 
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr double*, double** [[IN]], i64 12 +; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr double*, double** [[IN]], i64 16 ; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr double*, double** [[TMP16]], i64 4 -; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr double*, double** [[TMP16]], i64 8 -; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr double*, double** [[TMP16]], i64 12 -; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef) -; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] -; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] -; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] -; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 4 -; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 8 -; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef) +; AVX-NEXT: [[TMP18:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP19:%.*]] = icmp ne <4 x double*> 
[[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 4 +; AVX-NEXT: [[TMP22:%.*]] = and <4 x i1> [[TMP18]], [[TMP9]] +; AVX-NEXT: [[TMP23:%.*]] = and <4 x i1> [[TMP19]], [[TMP10]] +; AVX-NEXT: [[TMP24:%.*]] = and <4 x i1> [[TMP20]], [[TMP11]] +; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP21]] to <4 x double>* +; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[OUT]], i64 12 +; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>* +; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[OUT]], i64 16 +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[INDEX]] +; AVX-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP31]], align 1 +; AVX-NEXT: [[TMP32:%.*]] = and <4 x i8> [[WIDE_LOAD10]], +; AVX-NEXT: [[TMP33:%.*]] = icmp ne <4 x i8> [[TMP32]], zeroinitializer +; AVX-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP12]], i64 [[INDEX]] +; AVX-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP33]], <4 x double*> undef) +; AVX-NEXT: [[TMP36:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP37:%.*]] = and <4 x i1> [[TMP36]], [[TMP33]] +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP22]]) +; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP21]], i64 [[INDEX]] ; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP37]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP23]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP24]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 @@ -2983,61 +2982,61 @@ ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264 -; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 8 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX512-NEXT: 
[[TMP2:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 24 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 16 +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 32 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 24 -; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], -; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], -; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 +; AVX512-NEXT: [[TMP6:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP9:%.*]] = icmp ne <8 x i8> [[TMP6]], zeroinitializer +; AVX512-NEXT: [[TMP10:%.*]] = icmp ne <8 x i8> [[TMP7]], zeroinitializer +; AVX512-NEXT: [[TMP11:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 8 +; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr double*, double** [[IN]], i64 24 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double*, double** [[IN]], i64 32 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double*, double** [[TMP16]], i64 8 -; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double*, double** [[TMP16]], i64 16 -; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> 
[[TMP14]], <8 x double*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double*, double** [[TMP16]], i64 24 -; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef) -; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] -; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] -; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] -; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 8 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP19:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 8 +; AVX512-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP9]] +; AVX512-NEXT: [[TMP23:%.*]] = and <8 x i1> [[TMP19]], [[TMP10]] +; AVX512-NEXT: [[TMP24:%.*]] = and <8 x i1> [[TMP20]], [[TMP11]] +; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP21]] to <8 x double>* +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[OUT]], i64 24 +; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[OUT]], i64 32 +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP31]], align 1 +; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i8> [[WIDE_LOAD10]], +; AVX512-NEXT: [[TMP33:%.*]] 
= icmp ne <8 x i8> [[TMP32]], zeroinitializer +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP12]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP33]], <8 x double*> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = and <8 x i1> [[TMP36]], [[TMP33]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP22]]) +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP21]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP37]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP23]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP24]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 @@ -3147,61 +3146,61 @@ ; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] ; AVX: vector.ph: ; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 -; AVX-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX: vector.body: -; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 4 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 12 ; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 16 ; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>* -; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 12 -; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* -; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], -; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], -; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], -; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer -; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = icmp 
ne <4 x i8> [[TMP11]], zeroinitializer -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 +; AVX-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP9:%.*]] = icmp ne <4 x i8> [[TMP6]], zeroinitializer +; AVX-NEXT: [[TMP10:%.*]] = icmp ne <4 x i8> [[TMP7]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 4 +; AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 12 +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 16 ; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 4 -; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 8 -; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 12 -; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] -; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] -; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] -; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 4 
-; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 8 -; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP18:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 4 +; AVX-NEXT: [[TMP22:%.*]] = and <4 x i1> [[TMP18]], [[TMP9]] +; AVX-NEXT: [[TMP23:%.*]] = and <4 x i1> [[TMP19]], [[TMP10]] +; AVX-NEXT: [[TMP24:%.*]] = and <4 x i1> [[TMP20]], [[TMP11]] +; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP21]] to <4 x double>* +; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[OUT]], i64 12 +; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>* +; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[OUT]], i64 16 +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[INDEX]] +; AVX-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP31]], align 1 +; AVX-NEXT: [[TMP32:%.*]] = and <4 x i8> [[WIDE_LOAD10]], +; AVX-NEXT: [[TMP33:%.*]] = icmp ne <4 x i8> [[TMP32]], zeroinitializer +; AVX-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP12]], i64 [[INDEX]] +; AVX-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP33]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP37:%.*]] = and <4 x i1> [[TMP36]], [[TMP33]] +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP22]]) +; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP21]], i64 [[INDEX]] ; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP37]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP23]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP24]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX-NEXT: br i1 [[TMP40]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 @@ -3244,61 +3243,61 @@ ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264 -; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 8 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 24 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 16 +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TRIGGER]], i64 32 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 24 -; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], -; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], -; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 +; AVX512-NEXT: [[TMP6:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP9:%.*]] = icmp ne <8 x i8> [[TMP6]], zeroinitializer +; AVX512-NEXT: [[TMP10:%.*]] = icmp ne <8 x i8> [[TMP7]], zeroinitializer +; AVX512-NEXT: [[TMP11:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 8 +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 24 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 32 ; AVX512-NEXT: 
[[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 8 -; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 16 -; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 24 -; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] -; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] -; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] -; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 8 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 8 +; AVX512-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP9]] +; AVX512-NEXT: [[TMP23:%.*]] = and <8 x i1> [[TMP19]], [[TMP10]] +; AVX512-NEXT: [[TMP24:%.*]] = and <8 x 
i1> [[TMP20]], [[TMP11]] +; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP21]] to <8 x double>* +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[OUT]], i64 24 +; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[OUT]], i64 32 +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP31]], align 1 +; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i8> [[WIDE_LOAD10]], +; AVX512-NEXT: [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP32]], zeroinitializer +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP12]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP33]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = and <8 x i1> [[TMP36]], [[TMP33]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP22]]) +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP21]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP37]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP23]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP24]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 Index: llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -48,8 +48,8 @@ ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %offset.idx = sub i64 %n, %index ; CHECK-NOT: getelementptr -; CHECK: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %offset.idx -; CHECK: getelementptr i32, i32* %[[G0]], i64 -3 +; CHECK: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 -3 +; CHECK: getelementptr i32, i32* %[[G0]], i64 %offset.idx ; CHECK-NOT: getelementptr ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body ; @@ -311,16 +311,16 @@ ; INTER: vector.body ; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; INTER: %[[I0:.+]] = shl i64 %index, 2 -; INTER: %next.gep = getelementptr i32, i32* %a, i64 %[[I0]] +; INTER: %next.gep = getelementptr i32, i32* %a, i64 4 ; INTER: %[[S1:.+]] = shl i64 %index, 2 
; INTER: %[[I1:.+]] = or i64 %[[S1]], 4 -; INTER: %next.gep2 = getelementptr i32, i32* %a, i64 %[[I1]] +; INTER: %next.gep2 = getelementptr i32, i32* %a, i64 2 ; INTER: %[[S2:.+]] = shl i64 %index, 2 ; INTER: %[[I2:.+]] = or i64 %[[S2]], 8 -; INTER: %next.gep3 = getelementptr i32, i32* %a, i64 %[[I2]] +; INTER: %next.gep3 = getelementptr i32, i32* %a, i64 2 ; INTER: %[[S3:.+]] = shl i64 %index, 2 ; INTER: %[[I3:.+]] = or i64 %[[S3]], 12 -; INTER: %next.gep4 = getelementptr i32, i32* %a, i64 %[[I3]] +; INTER: %next.gep4 = getelementptr i32, i32* %a, i64 2 ; INTER: br i1 {{.*}}, label %middle.block, label %vector.body ; define void @pointer_iv_non_uniform_0(i32* %a, i64 %n) { Index: llvm/test/Transforms/LoopVectorize/float-induction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/float-induction.ll +++ llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -51,10 +51,10 @@ ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]] -; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 4 ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4 -; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 [[INDEX]] ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>* ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 Index: llvm/test/Transforms/LoopVectorize/induction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/induction.ll +++ llvm/test/Transforms/LoopVectorize/induction.ll @@ -106,8 +106,8 @@ ; UNROLL: vector.body: ; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; UNROLL-NOT: add i64 {{.*}}, 4 -; UNROLL: %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index -; UNROLL: getelementptr i64, i64* %[[g1]], i64 2 +; UNROLL: %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 2 +; UNROLL: getelementptr i64, i64* %[[g1]], i64 %index define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { entry: @@ -352,8 +352,8 @@ ; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ] ; UNROLL: %[[I2:.+]] = or i32 %index, 2 ; UNROLL: %[[E0:.+]] = sext i32 %index to i64 -; UNROLL: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]] -; UNROLL: getelementptr i32, i32* %[[G0]], i64 2 +; UNROLL: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 2 +; UNROLL: getelementptr i32, i32* %[[G0]], i64 %[[E0]] ; UNROLL: pred.udiv.if: ; UNROLL: udiv i32 {{.*}}, %index ; UNROLL: pred.udiv.if{{[0-9]+}}: @@ -593,7 +593,7 @@ %c = icmp ult i32 %idx.b, %len br i1 %c, label %loop, label %exit - exit: + exit: ret void } @@ -748,10 +748,10 @@ ; UNROLL: %step.add = add <2 x i32> %vec.ind, ; UNROLL: %[[A1:.*]] = add i32 %index, %i ; UNROLL: %[[S1:.*]] = sext i32 %[[A1]] to i64 -; UNROLL: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, 
i64 %[[S1]] +; UNROLL: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 2 ; UNROLL: %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>* ; UNROLL: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]] -; UNROLL: %[[G2:.*]] = getelementptr i32, i32* %[[G1]], i64 2 +; UNROLL: %[[G2:.*]] = getelementptr i32, i32* %[[G1]], i64 %[[S1]] ; UNROLL: %[[B2:.*]] = bitcast i32* %[[G2]] to <2 x i32>* ; UNROLL: store <2 x i32> %step.add, <2 x i32>* %[[B2]] ; UNROLL: %index.next = add i32 %index, 4 Index: llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll +++ llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll @@ -10,10 +10,10 @@ ; CHECK: %offset.idx = or i64 %index, 1 ; CHECK: %[[T2:.+]] = add nuw nsw i64 %offset.idx, %tmp0 ; CHECK: %[[T3:.+]] = sub nsw i64 %[[T2]], %x -; CHECK: %[[T4:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T3]] +; CHECK: %[[T4:.+]] = getelementptr inbounds i32, i32* %a, i64 4 ; CHECK: %[[T5:.+]] = bitcast i32* %[[T4]] to <4 x i32>* ; CHECK: load <4 x i32>, <4 x i32>* %[[T5]], align 4 -; CHECK: %[[T6:.+]] = getelementptr i32, i32* %[[T4]], i64 4 +; CHECK: %[[T6:.+]] = getelementptr i32, i32* %[[T4]], i64 %[[T3]] ; CHECK: %[[T7:.+]] = bitcast i32* %[[T6]] to <4 x i32>* ; CHECK: load <4 x i32>, <4 x i32>* %[[T7]], align 4 ; CHECK: br {{.*}}, label %middle.block, label %vector.body
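
For reference, the updated CHECK lines above all follow one pattern: the loop-invariant GEP index is moved onto the inner GEP so that the invariant address can be computed once outside the loop, and only the index-dependent GEP remains in the loop body. Below is a minimal standalone sketch of the kind of input this exercises; the function and value names (%sketch, %base, %inv, %i) are illustrative and not taken from the tests above.

define void @sketch(i8* %base, i64 %inv, i64 %n) {
entry:
  br label %loop

loop:                                             ; preds = %loop, %entry
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  ; The inner GEP uses the loop-variant index %i and the outer GEP uses the
  ; invariant index %inv. After reassociating the pair, the invariant part
  ; (%base + %inv) no longer depends on %i, so LICM can hoist it and only an
  ; %i-dependent GEP stays in the loop, mirroring the vector.body changes above.
  %p.var = getelementptr inbounds i8, i8* %base, i64 %i
  %p = getelementptr inbounds i8, i8* %p.var, i64 %inv
  store i8 0, i8* %p, align 1
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:                                             ; preds = %loop
  ret void
}

Running opt -instcombine -licm over a sketch like this is expected to leave a single %i-dependent GEP inside %loop with the invariant GEP hoisted to the preheader; the exact output depends on the surrounding code, so treat this as an illustration rather than an additional test.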