diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -111,10 +111,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -390,6 +392,18 @@ cl::desc( "Override cost based safe divisor widening for div/rem instructions")); +static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; +static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; +static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; +static constexpr uint32_t EpilogueMinItersBypassWeights[] = {1, 127}; + +static void setBranchWeights(BranchInst &BI, ArrayRef Weights) { + assert(BI.getNumSuccessors() == 2 && "expected conditional branch"); + MDBuilder MDB(BI.getContext()); + MDNode *BranchWeights = MDB.createBranchWeights(Weights); + BI.setMetadata(LLVMContext::MD_prof, BranchWeights); +} + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -1917,12 +1931,14 @@ SCEVExpander MemCheckExp; bool CostTooHigh = false; + const bool AddBranchWeights; public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - TargetTransformInfo *TTI, const DataLayout &DL) + TargetTransformInfo *TTI, const DataLayout &DL, + bool AddBranchWeights) : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), - MemCheckExp(SE, DL, "scev.check") {} + MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -2115,8 +2131,10 @@ DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); + BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); + if (AddBranchWeights) + setBranchWeights(BI, SCEVCheckBypassWeights); + ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); return SCEVCheckBlock; } @@ -2140,9 +2158,12 @@ if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) PL->addBasicBlockToLoop(MemCheckBlock, *LI); - ReplaceInstWithInst( - MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); + if (AddBranchWeights) { + setBranchWeights(BI, MemCheckBypassWeights); + } + ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); MemCheckBlock->getTerminator()->setDebugLoc( Pred->getTerminator()->getDebugLoc()); @@ -2847,9 +2868,11 @@ // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); LoopBypassBlocks.push_back(TCCheckBlock); } @@ -3079,7 +3102,15 @@ IRBuilder<> B(LoopMiddleBlock->getTerminator()); B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); - cast(LoopMiddleBlock->getTerminator())->setCondition(CmpN); + BranchInst &BI = *cast(LoopMiddleBlock->getTerminator()); + BI.setCondition(CmpN); + if (hasBranchWeightMD(*ScalarLatchTerm)) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = UF * VF.getKnownMinValue(); + MDBuilder MDB(ScalarLatchTerm->getContext()); + MDNode *BranchWeights = MDB.createBranchWeights(1, TripCount - 1); + BI.setMetadata(LLVMContext::MD_prof, BranchWeights); + } } #ifdef EXPENSIVE_CHECKS @@ -7801,9 +7832,11 @@ EPI.TripCount = Count; } - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); return TCCheckBlock; } @@ -7947,9 +7980,11 @@ EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); - ReplaceInstWithInst( - Insert->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, EpilogueMinItersBypassWeights); + ReplaceInstWithInst(Insert->getTerminator(), &BI); LoopBypassBlocks.push_back(Insert); return Insert; @@ -9650,8 +9685,10 @@ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -9986,8 +10023,10 @@ VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll @@ -0,0 +1,81 @@ +; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 | FileCheck %s + +; CHECK-LABEL: @f0( +; +; CHECK: entry: +; CHECK: br i1 %cmp.entry, label %iter.check, label %exit, !prof [[PROF_F0_ENTRY:![0-9]+]] +; +; CHECK: iter.check: +; CHECK: br i1 %min.iters.check, label %vec.epilog.scalar.ph, label %vector.scevcheck, !prof [[PROF_F0_UNLIKELY:![0-9]+]] +; +; CHECK: vector.scevcheck: +; CHECK: br i1 %4, label %vec.epilog.scalar.ph, label %vector.main.loop.iter.check, !prof [[PROF_F0_UNLIKELY]] +; +; CHECK: vector.main.loop.iter.check: +; CHECK: br i1 %min.iters.check1, label %vec.epilog.ph, label %vector.ph, !prof [[PROF_F0_UNLIKELY]] +; +; CHECK: vector.ph: +; CHECK: br label %vector.body +; +; CHECK: vector.body: +; CHECK: br i1 %8, label %middle.block, label %vector.body, !prof [[PROF_F0_VECTOR_BODY:![0-9]+]] +; +; CHECK: middle.block: +; CHECK: br i1 %cmp.n, label %exit.loopexit, label %vec.epilog.iter.check, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]] +; +; CHECK: vec.epilog.iter.check: +; CHECK: br i1 %min.epilog.iters.check, label %vec.epilog.scalar.ph, label %vec.epilog.ph, !prof [[PROF_F0_UNLIKELY]] +; +; CHECK: vec.epilog.ph: +; CHECK: br label %vec.epilog.vector.body +; +; CHECK: vec.epilog.vector.body: +; CHECK: br i1 %12, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]] +; +; CHECK: vec.epilog.middle.block: +; CHECK: br i1 %cmp.n7, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]] +; +; CHECK: vec.epilog.scalar.ph: +; CHECK: br label %loop +; +; CHECK: loop: +; CHECK: br i1 %cmp.loop, label %loop, label %exit.loopexit, !prof [[PROF_F0_LOOP:![0-9]+]] +; +; CHECK: exit.loopexit: +; CHECK: br label %exit +; +; CHECK: exit: +; CHECK: ret void + +define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 { +entry: + %cmp.entry = icmp sgt i32 %len, 0 + br i1 %cmp.entry, label %loop, label %exit, !prof !1 + +loop: + %i8 = phi i8 [0, %entry], [%i8.inc, %loop] + %i32 = phi i32 [0, %entry], [%i32.inc, %loop] + + %ptr = getelementptr inbounds i32, ptr %p, i8 %i8 + store i32 %i32, ptr %ptr + + %i8.inc = add i8 %i8, 1 + %i32.inc = add i32 %i32, 1 + + %cmp.loop = icmp ult i32 %i32, %len + br i1 %cmp.loop, label %loop, label %exit, !prof !2 + +exit: + ret void +} + +!0 = !{!"function_entry_count", i64 13} +!1 = !{!"branch_weights", i32 12, i32 1} +!2 = !{!"branch_weights", i32 1234, i32 1} + +; CHECK: [[PROF_F0_ENTRY]] = !{!"branch_weights", i32 12, i32 1} +; CHECK: [[PROF_F0_UNLIKELY]] = !{!"branch_weights", i32 1, i32 127} +; CEHCK: [[PROF_F0_VECTOR_BODY]] = !{!"branch_weights", i32 1, i32 307} +; CHECK: [[PROF_F0_MIDDLE_BLOCKS]] = !{!"branch_weights", i32 1, i32 3} +; CHECK: [[PROF_F0_VEC_EPILOG_VECTOR_BODY]] = !{!"branch_weights", i32 0, i32 0} +; CEHCK: [[PROF_F0_LOOP]] = !{!"branch_weights", i32 2, i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -2758,7 +2758,7 @@ ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-IC-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26:![0-9]+]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7 ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8 @@ -2860,7 +2860,7 @@ ; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]] ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -2882,14 +2882,14 @@ ; UNROLL-NO-IC-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28:![0-9]+]], !llvm.loop [[LOOP29:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @sink_into_replication_region( ; UNROLL-NO-VF-NEXT: bb: ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-VF-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26:![0-9]+]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 @@ -2927,7 +2927,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] @@ -2947,14 +2947,14 @@ ; UNROLL-NO-VF-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28:![0-9]+]], !llvm.loop [[LOOP29:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region( ; SINK-AFTER-NEXT: bb: ; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; SINK-AFTER-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; SINK-AFTER-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26:![0-9]+]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3 ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 @@ -3014,7 +3014,7 @@ ; SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 @@ -3035,7 +3035,7 @@ ; SINK-AFTER-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28:![0-9]+]], !llvm.loop [[LOOP29:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; bb: br label %bb2 @@ -3063,7 +3063,7 @@ ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-IC-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7 ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8 @@ -3228,7 +3228,7 @@ ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP73]], [[TMP72]] ; UNROLL-NO-IC-NEXT: [[TMP75:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -3255,14 +3255,14 @@ ; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28]], !llvm.loop [[LOOP31:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @sink_into_replication_region_multiple( ; UNROLL-NO-VF-NEXT: bb: ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-VF-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 @@ -3314,7 +3314,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI2]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] @@ -3339,14 +3339,14 @@ ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28]], !llvm.loop [[LOOP31:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region_multiple( ; SINK-AFTER-NEXT: bb: ; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; SINK-AFTER-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; SINK-AFTER-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3 ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 @@ -3437,7 +3437,7 @@ ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; SINK-AFTER-NEXT: [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP37]]) ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 @@ -3463,7 +3463,7 @@ ; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; SINK-AFTER-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28]], !llvm.loop [[LOOP31:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; bb: br label %bb2 @@ -3520,7 +3520,7 @@ ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 @@ -3542,7 +3542,7 @@ ; UNROLL-NO-IC-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; UNROLL-NO-IC-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; UNROLL-NO-IC-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; UNROLL-NO-IC-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL-NO-IC: for.end: ; UNROLL-NO-IC-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[FOR_LCSSA]] @@ -3570,7 +3570,7 @@ ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP9]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3590,7 +3590,7 @@ ; UNROLL-NO-VF-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; UNROLL-NO-VF-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[FOR_LCSSA]] @@ -3616,7 +3616,7 @@ ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], ; SINK-AFTER-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; SINK-AFTER-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 @@ -3638,7 +3638,7 @@ ; SINK-AFTER-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; SINK-AFTER-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; SINK-AFTER-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; SINK-AFTER: for.end: ; SINK-AFTER-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: ret i32 [[FOR_LCSSA]] @@ -3688,7 +3688,7 @@ ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3703,7 +3703,7 @@ ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-IC: for.end: ; UNROLL-NO-IC-NEXT: ret void ; @@ -3725,7 +3725,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP5]] = add i16 [[TMP3]], 5 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028 -; UNROLL-NO-VF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3739,7 +3739,7 @@ ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -3758,7 +3758,7 @@ ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], ; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028 -; SINK-AFTER-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 ; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3773,7 +3773,7 @@ ; SINK-AFTER-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; SINK-AFTER-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP35:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; SINK-AFTER: for.end: ; SINK-AFTER-NEXT: ret void ;