diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -135,10 +135,12 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/LoopVersioning.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
@@ -9980,6 +9982,62 @@
   return true;
 }
 
+static bool sinkAndHoistInstsInLoop(Loop *L, DominatorTree &DT, LoopInfo &LI,
+                                    TargetTransformInfo &TTI,
+                                    ScalarEvolution &SE) {
+  bool Changed = false;
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+  SmallVector<BasicBlock *> WorkList(reverse(L->blocks()));
+  while (!WorkList.empty()) {
+    BasicBlock *BB = WorkList.pop_back_val();
+    // Skip blocks that may have been removed earlier.
+    if (LI.getLoopFor(BB) != L)
+      continue;
+
+    // Remove block if it became dead.
+    if (pred_begin(BB) == pred_end(BB)) {
+      LI.removeBlock(BB);
+      DeleteDeadBlock(BB, &DTU);
+      continue;
+    }
+
+    // If BB has a conditional branch and its successors are in the same loop,
+    // try to hoist common code to BB.
+    auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+    if (BI && BI->isConditional() &&
+        BI->getSuccessor(0)->getSinglePredecessor() &&
+        BI->getSuccessor(1)->getSinglePredecessor() &&
+        all_of(successors(BB), [&LI, L](BasicBlock *Succ) {
+          return LI.getLoopFor(Succ) == L;
+        })) {
+      auto *OldThen = BI->getSuccessor(0);
+      auto *OldElse = BI->getSuccessor(1);
+      if (hoistThenElseCodeToIf(BI, TTI, false, &DTU)) {
+        Changed = true;
+        if (OldThen->size() == 1)
+          WorkList.push_back(OldThen);
+        if (OldElse->size() == 1)
+          WorkList.push_back(OldElse);
+      }
+    }
+
+    // If all predecessors are in the current loop, try to sink instructions
+    // to BB.
+    if (all_of(predecessors(BB), [&LI, L](BasicBlock *Pred) {
+          return LI.getLoopFor(Pred) == L;
+        }))
+      Changed |= sinkCommonCodeFromPredecessors(BB, &DTU, &LI);
+  }
+
+  if (Changed) {
+    SE.forgetTopmostLoop(L);
+#ifdef EXPENSIVE_CHECKS
+    assert(DTU.getDomTree().verify(DominatorTree::VerificationLevel::Full));
+    LI.verify(DT);
+#endif
+  }
+  return Changed;
+}
+
 LoopVectorizeResult LoopVectorizePass::runImpl(
     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
@@ -10035,6 +10093,8 @@
   while (!Worklist.empty()) {
     Loop *L = Worklist.pop_back_val();
 
+    Changed |= CFGChanged |= sinkAndHoistInstsInLoop(L, *DT, *LI, *TTI, *SE);
+
     // For the inner loops we actually process, form LCSSA to simplify the
     // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
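The pre-processing leans on the two SimplifyCFG helpers included above. As a rough, hand-written sketch of the hoisting half (the function @hoist_example and its body are illustrative only, not taken from the patch or the test suite), hoistThenElseCodeToIf moves instructions that are identical at the head of both arms of a then/else diamond up into the branching block:

define float @hoist_example(float* %p, i1 %c) {
entry:
  br i1 %c, label %then, label %else

then:                                       ; both arms start with the same load
  %l0 = load float, float* %p, align 4
  %r0 = fadd float %l0, 1.000000e+00
  br label %merge

else:
  %l1 = load float, float* %p, align 4
  %r1 = fmul float %l1, 2.000000e+00
  br label %merge

merge:
  %res = phi float [ %r0, %then ], [ %r1, %else ]
  ret float %res
}

; After hoisting, the load lives in %entry and the branch guards only the
; fadd/fmul that actually differ, which is the shape the vectorizer can
; flatten into a vector select, as the @test update below checks.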
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/prepare-hoist-sink.ll b/llvm/test/Transforms/LoopVectorize/AArch64/prepare-hoist-sink.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/prepare-hoist-sink.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/prepare-hoist-sink.ll
@@ -1,36 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -verify-dom-info -verify-loop-info -S %s | FileCheck %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -verify-dom-info -verify-loop-info -force-vector-interleave=1 -S %s | FileCheck %s
 
 define void @test(float* %A, float* %B, float %x) {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[X:%.*]], 2.000000e+01
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
-; CHECK:       then:
-; CHECK-NEXT:    [[A_GEP_0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[A_LV_0:%.*]] = load float, float* [[A_GEP_0]], align 4
-; CHECK-NEXT:    [[MUL2_I81_I:%.*]] = fmul float [[A_LV_0]], [[X]]
-; CHECK-NEXT:    [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store float [[MUL2_I81_I]], float* [[B_GEP_0]], align 4
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       else:
-; CHECK-NEXT:    [[A_GEP_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[A_LV_1:%.*]] = load float, float* [[A_GEP_1]], align 4
-; CHECK-NEXT:    [[MUL2:%.*]] = fmul float [[A_LV_1]], [[X]]
-; CHECK-NEXT:    [[B_GEP_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[B_LV:%.*]] = load float, float* [[B_GEP_1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL2]], [[B_LV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[B_GEP_1]], align 4
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT_I85_I:%.*]] = icmp eq i64 [[IV_NEXT]], 10000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT_I85_I]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float %x, i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i1> poison, i1 %cmp, i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT7]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* %B, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP4]], [[WIDE_LOAD6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT8]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT8]], <4 x float> [[TMP4]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; CHECK-NEXT:    br i1 [[TMP11]], label %middle.block, label %vector.body
 ;
 entry:
   %cmp = fcmp olt float %x, 20.0
@@ -74,11 +72,7 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    br label [[IF_END:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    br i1 true, label [[IF_ELSE29:%.*]], label [[IF_THEN9:%.*]]
-; CHECK:       if.then9:
 ; CHECK-NEXT:    br i1 true, label [[IF_END38:%.*]], label [[IF_END38]]
-; CHECK:       if.else29:
-; CHECK-NEXT:    br i1 true, label [[IF_END38]], label [[IF_END38]]
 ; CHECK:       if.end38:
 ; CHECK-NEXT:    br label [[CLEANUP:%.*]]
 ; CHECK:       cleanup:
@@ -106,33 +100,54 @@
   br label %for.body
 }
 
-@rd = external unnamed_addr global i32, align 4
+@rd = external global i32, align 4
 
 define void @sink_with_new_split_block(i32* %ptr) {
 ; CHECK-LABEL: @sink_with_new_split_block(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    [[C_0:%.*]] = icmp eq i32 [[LV]], 20
-; CHECK-NEXT:    br i1 [[C_0]], label [[THEN_0:%.*]], label [[ELSE_0:%.*]]
-; CHECK:       then.0:
-; CHECK-NEXT:    store i32 281701264, i32* @rd, align 4
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       else.0:
-; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[LV]], 99
-; CHECK-NEXT:    br i1 [[C_1]], label [[THEN_1:%.*]], label [[LOOP_LATCH]]
-; CHECK:       then.1:
-; CHECK-NEXT:    store i32 0, i32* @rd, align 4
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[C_2:%.*]] = icmp ne i32 [[IV_NEXT]], 2000
-; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP]], label [[EXIT:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %pred.store.continue8 ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* %ptr, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 20, i32 20, i32 20, i32 20>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 99, i32 99, i32 99, i32 99>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> <i32 281701264, i32 281701264, i32 281701264, i32 281701264>, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 0
+; CHECK-NEXT:    store i32 [[TMP10]], i32* @rd, align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 1
+; CHECK-NEXT:    store i32 [[TMP12]], i32* @rd, align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 2
+; CHECK-NEXT:    store i32 [[TMP14]], i32* @rd, align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label %pred.store.continue8
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    store i32 [[TMP16]], i32* @rd, align 4
+; CHECK-NEXT:    br label %pred.store.continue8
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2000
+; CHECK-NEXT:    br i1 [[TMP17]], label %middle.block, label %vector.body
 ;
 entry:
   br label %loop
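The sinking half works in the opposite direction; @sink_with_new_split_block above exercises it. A minimal hand-written sketch (again illustrative only, assuming the shape sinkCommonCodeFromPredecessors handles): stores common to all predecessors of a join block move into the join, with a new phi feeding the stored value:

define void @sink_example(i32* %p, i1 %c) {
entry:
  br i1 %c, label %then, label %else

then:
  store i32 1, i32* %p, align 4
  br label %merge

else:
  store i32 2, i32* %p, align 4
  br label %merge

merge:
  ret void
}

; Expected result: a single store in %merge fed by a blend phi,
;   %v = phi i32 [ 1, %then ], [ 2, %else ]
;   store i32 %v, i32* %p, align 4
; leaving %then and %else empty so the diamond can fold away later.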
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -387,15 +387,11 @@
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp ult i64 [[IV]], 500
-; CHECK-NEXT:    br i1 [[C_1]], label [[IF_TRUEBB:%.*]], label [[IF_FALSEBB:%.*]]
-; CHECK:       if.truebb:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       if.falsebb:
+; CHECK-NEXT:    [[DOTFOR:%.*]] = select i1 [[C_1]], i32 20, i32 [[FOR]]
 ; CHECK-NEXT:    br label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[FIRST_TIME_1:%.*]] = phi i32 [ 20, [[IF_TRUEBB]] ], [ [[FOR]], [[IF_FALSEBB]] ]
 ; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i64 [[IV]], 800
-; CHECK-NEXT:    [[FOR_NEXT]] = select i1 [[C_2]], i32 30, i32 [[FIRST_TIME_1]]
+; CHECK-NEXT:    [[FOR_NEXT]] = select i1 [[C_2]], i32 30, i32 [[DOTFOR]]
 ; CHECK-NEXT:    [[PTR_IDX:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    store i32 [[FOR_NEXT]], i32* [[PTR_IDX]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
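The test above shows the follow-on effect: once both arms of a diamond are emptied, the conditional branch folds and the blend phi degenerates into a select, which the vectorizer handles directly. Schematically (hand-written sketch, not taken from the test):

; Before:
;   br i1 %c, label %a, label %b
; a:
;   br label %latch
; b:
;   br label %latch
; latch:
;   %x = phi i32 [ 20, %a ], [ %for, %b ]
;
; After the empty diamond folds:
; latch:
;   %x = select i1 %c, i32 20, i32 %for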
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -126,10 +126,9 @@
 ; if (b[i] == k)
 ;   a = ntrunc
 ; else a = k;
-; TODO: We could vectorize this once we support multiple uniform stores to the
-; same address.
+; The function is vectorized by sinking the conditional stores into the common successor.
 ; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values(
-; CHECK-NOT: load <4 x i32>
+; CHECK: load <4 x i32>
 define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
 entry:
   %ntrunc = trunc i64 %n to i32
@@ -210,15 +209,10 @@
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
 ; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
-; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
-; CHECK:       cond_store:
-; CHECK-NEXT:    br label [[LATCH]]
-; CHECK:       cond_store_k:
-; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
-; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    [[STOREVAL:%.*]] = select i1 [[CMP]], i32 [[NTRUNC]], i32 [[K]]
 ; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
@@ -266,43 +260,42 @@
 ; into the uniform address within the loop.
 ; Since the condition and the phi is loop invariant, they are LICM'ed after
 ; vectorization.
-; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_inv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]]
-; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    [[SMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX4]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
 ; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
-; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A3]], i64 1
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i32 3
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32> [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX4]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = select i1 [[CMP]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], align 4, !alias.scope !36, !noalias !39
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[A]], align 4, !alias.scope !39
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
@@ -311,17 +304,12 @@
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
 ; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
-; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
-; CHECK:       cond_store:
-; CHECK-NEXT:    br label [[LATCH]]
-; CHECK:       cond_store_k:
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
-; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
-; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[K]], i32* [[A]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP42:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
@@ -363,42 +351,42 @@
 define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
 ; CHECK-LABEL: @variant_val_store_to_inv_address(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    [[SMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX4]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
 ; CHECK-NEXT:    [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
 ; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1
-; CHECK-NEXT:    [[SMAX3:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX3]]
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX]]
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX4]], 9223372036854775804
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8, !alias.scope !36
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4, !alias.scope !39, !noalias !36
-; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !43
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[A]], align 4, !alias.scope !46, !noalias !43
+; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -409,12 +397,12 @@
 ; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], [[LOOP42:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP49:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[RDX_LCSSA:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP3_LCSSA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[RDX_LCSSA:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[TMP3_LCSSA]], [[FOR_END_LOOPEXIT]] ]
 ; CHECK-NEXT:    ret i32 [[RDX_LCSSA]]
 ;
 entry:
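With the conditional stores rewritten into a select, a store of a choice between two loop-invariant values to an invariant address becomes vectorizable: the blend is computed as a vector select and only the last lane is stored each iteration, matching the extractelement/store pairs in the CHECK lines above. Schematically (hand-written; the names are illustrative):

;   Scalar loop:
;     %storeval = select i1 %cmp, i32 %ntrunc, i32 %k
;     store i32 %storeval, i32* %a
;   Vector body (conceptually):
;     %blend = select i1 %cmp, <4 x i32> %ntrunc.splat, <4 x i32> %k.splat
;     %last  = extractelement <4 x i32> %blend, i32 3
;     store i32 %last, i32* %a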
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
@@ -43,10 +43,10 @@
 ; Make sure a loop is successfully vectorized with fold-tail when the backedge
 ; taken count is constant and used inside the loop. Issue revealed by D76992.
 ;
-define void @reuse_const_btc(i8* %A) optsize {
+define void @reuse_const_btc(i8* %A, i32* noalias %B) optsize {
 ; CHECK-LABEL: @reuse_const_btc
 ; CHECK: {{%.*}} = icmp ule <4 x i32> {{%.*}}, <i32 13, i32 13, i32 13, i32 13>
-; CHECK: {{%.*}} = select <4 x i1> {{%.*}}, <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32> <i32 12, i32 12, i32 12, i32 12>
+; CHECK: {{%.*}} = select <4 x i1> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> <i32 12, i32 12, i32 12, i32 12>
 ;
 entry:
   br label %loop
@@ -58,11 +58,13 @@
   %cond0 = icmp eq i32 %riv, 7
   br i1 %cond0, label %then, label %else
 then:
+  %B.gep = getelementptr inbounds i32, i32* %B, i32 %sub
+  %lv = load i32, i32* %B.gep
   br label %merge
 else:
   br label %merge
 merge:
-  %blend = phi i32 [ 13, %then ], [ 12, %else ]
+  %blend = phi i32 [ %lv, %then ], [ 12, %else ]
   %trunc = trunc i32 %blend to i8
   store i8 %trunc, i8* %arrayidx, align 1
   %rivMinus1 = add nuw nsw i32 %riv, -1