diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -169,7 +169,14 @@ SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI, nullptr, PreserveLCSSA); // Add the branch to the exit block (around the unrolled loop) - B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader); + MDNode *BranchWeights = nullptr; + if (hasBranchWeightMD(*Latch->getTerminator())) { + // Assume loop is nearly always entered. + MDBuilder MDB(B.getContext()); + BranchWeights = MDB.createBranchWeights(1, 127); + } + B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader, + BranchWeights); InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(OriginalLoopLatchExit, @@ -194,8 +201,8 @@ BasicBlock *Exit, BasicBlock *PreHeader, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA, - ScalarEvolution &SE) { + LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, + unsigned Count) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast(VMap[Latch]); @@ -292,7 +299,13 @@ SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr, PreserveLCSSA); // Add the branch to the exit block (around the unrolling loop) - B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit); + MDNode *BranchWeights = nullptr; + if (hasBranchWeightMD(*Latch->getTerminator())) { + // Assume equal distribution in interval [0;Count]. + MDBuilder MDB(B.getContext()); + BranchWeights = MDB.createBranchWeights(1, Count - 1); + } + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit); @@ -316,8 +329,9 @@ const bool UnrollRemainder, BasicBlock *InsertTop, BasicBlock *InsertBot, BasicBlock *Preheader, - std::vector &NewBlocks, LoopBlocksDFS &LoopBlocks, - ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) { + std::vector &NewBlocks, + LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, + DominatorTree *DT, LoopInfo *LI, unsigned Count) { StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -370,7 +384,20 @@ auto *One = ConstantInt::get(NewIdx->getType(), 1); Value *IdxNext = Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next"); Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp"); - Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot); + MDNode *BranchWeights = nullptr; + uint64_t OrigExitWeight; + uint64_t OrigBackedgeWeight; + if (extractBranchWeights(*LatchBR, OrigExitWeight, OrigBackedgeWeight)) { + if (LatchBR->getSuccessor(0) == L->getHeader()) + std::swap(OrigExitWeight, OrigBackedgeWeight); + // Assign the maximum possible trip count as the back edge weight for + // the remainder loop if the original loop comes with a branch weight. + assert(Count > 1 && "unroll factor must be 2 or more"); + uint64_t BackEdgeWeight = (Count - 1) * OrigExitWeight; + MDBuilder MDB(Builder.getContext()); + BranchWeights = MDB.createBranchWeights(BackEdgeWeight, OrigExitWeight); + } + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); NewIdx->addIncoming(Zero, InsertTop); NewIdx->addIncoming(IdxNext, NewBB); LatchBR->eraseFromParent(); @@ -464,32 +491,6 @@ // know of kinds of multiexit loops that would benefit from unrolling. } -// Assign the maximum possible trip count as the back edge weight for the -// remainder loop if the original loop comes with a branch weight. -static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop, - Loop *RemainderLoop, - uint64_t UnrollFactor) { - uint64_t TrueWeight, FalseWeight; - BranchInst *LatchBR = - cast(OrigLoop->getLoopLatch()->getTerminator()); - if (!extractBranchWeights(*LatchBR, TrueWeight, FalseWeight)) - return; - uint64_t ExitWeight = LatchBR->getSuccessor(0) == OrigLoop->getHeader() - ? FalseWeight - : TrueWeight; - assert(UnrollFactor > 1); - uint64_t BackEdgeWeight = (UnrollFactor - 1) * ExitWeight; - BasicBlock *Header = RemainderLoop->getHeader(); - BasicBlock *Latch = RemainderLoop->getLoopLatch(); - auto *RemainderLatchBR = cast(Latch->getTerminator()); - unsigned HeaderIdx = (RemainderLatchBR->getSuccessor(0) == Header ? 0 : 1); - MDBuilder MDB(RemainderLatchBR->getContext()); - MDNode *WeightNode = - HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight) - : MDB.createBranchWeights(BackEdgeWeight, ExitWeight); - RemainderLatchBR->setMetadata(LLVMContext::MD_prof, WeightNode); -} - /// Calculate ModVal = (BECount + 1) % Count on the abstract integer domain /// accounting for the possibility of unsigned overflow in the 2s complement /// domain. Preconditions: @@ -775,7 +776,13 @@ BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader; BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. - B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop); + MDNode *BranchWeights = nullptr; + if (hasBranchWeightMD(*Latch->getTerminator())) { + // Assume loop is nearly always entered. + MDBuilder MDB(B.getContext()); + BranchWeights = MDB.createBranchWeights(1, 127); + } + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) @@ -804,12 +811,7 @@ BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; Loop *remainderLoop = CloneLoopBlocks( L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, - NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); - - // Assign the maximum possible trip count as the back edge weight for the - // remainder loop if the original loop comes with a branch weight. - if (remainderLoop && !UnrollRemainder) - updateLatchBranchWeightsForRemainderLoop(L, remainderLoop, Count); + NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); // Insert the cloned blocks into the function. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -903,7 +905,7 @@ // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll --- a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll @@ -165,7 +165,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -1 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP2]], 7 ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[INNER_1_HEADER_PROL_PREHEADER:%.*]], label [[INNER_1_HEADER_PROL_LOOPEXIT:%.*]] +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[INNER_1_HEADER_PROL_PREHEADER:%.*]], label [[INNER_1_HEADER_PROL_LOOPEXIT:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK: inner.1.header.prol.preheader: ; CHECK-NEXT: br label [[INNER_1_HEADER_PROL:%.*]] ; CHECK: inner.1.header.prol: @@ -180,7 +180,7 @@ ; CHECK-NEXT: [[CMP_2_PROL:%.*]] = icmp sgt i64 [[INNER_1_IV_PROL]], 0 ; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]] -; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label [[INNER_1_HEADER_PROL]], label [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label [[INNER_1_HEADER_PROL]], label [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !prof [[PROF4:![0-9]+]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: inner.1.header.prol.loopexit.unr-lcssa: ; CHECK-NEXT: [[L_1_LCSSA_UNR_PH:%.*]] = phi i32 [ [[L_1_PROL]], [[INNER_1_LATCH_PROL]] ] ; CHECK-NEXT: [[INNER_1_IV_UNR_PH:%.*]] = phi i64 [ [[INNER_1_IV_NEXT_PROL]], [[INNER_1_LATCH_PROL]] ] @@ -189,7 +189,7 @@ ; CHECK-NEXT: [[L_1_LCSSA_UNR:%.*]] = phi i32 [ undef, [[OUTER_HEADER]] ], [ [[L_1_LCSSA_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[INNER_1_IV_UNR:%.*]] = phi i64 [ [[X]], [[OUTER_HEADER]] ], [ [[INNER_1_IV_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 7 -; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]] +; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF3]] ; CHECK: outer.header.new: ; CHECK-NEXT: br label [[INNER_1_HEADER:%.*]] ; CHECK: inner.1.header: @@ -233,7 +233,7 @@ ; CHECK-NEXT: store i32 [[L_1_7]], ptr [[DST]], align 8 ; CHECK-NEXT: [[INNER_1_IV_NEXT_7]] = add i64 [[INNER_1_IV]], 8 ; CHECK-NEXT: [[CMP_2_7:%.*]] = icmp sgt i64 [[INNER_1_IV_NEXT_6]], 0 -; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF6:![0-9]+]] ; CHECK: outer.middle.unr-lcssa: ; CHECK-NEXT: [[L_1_LCSSA_PH:%.*]] = phi i32 [ [[L_1_7]], [[INNER_1_LATCH_7]] ] ; CHECK-NEXT: br label [[OUTER_MIDDLE]] diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll --- a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll @@ -18,41 +18,54 @@ ; COMMON-LABEL: @test( -; EPILOG: %xtraiter = and i32 %n -; EPILOG: %lcmp.mod = icmp ne i32 %xtraiter, 0 -; EPILOG: br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end.loopexit +; EPILOG: entry: +; EPILOG: br i1 %cmp1, label %for.end, label %for.body.preheader, !prof [[EPILOG_PROF_0:![0-9]+]] +; EPILOG: for.body.preheader: +; EPILOG: %xtraiter = and i32 %n +; EPILOG: br i1 %1, label %for.end.loopexit.unr-lcssa, label %for.body.preheader.new, !prof [[EPILOG_PROF_1:![0-9]+]] + +; EPILOG: for.end.loopexit.unr-lcssa: +; EPILOG: %lcmp.mod = icmp ne i32 %xtraiter, 0 +; EPILOG: br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end.loopexit, !prof [[EPILOG_PROF_2:![0-9]+]] ; NOEPILOG-NOT: %xtraiter = and i32 %n -; PROLOG: %xtraiter = and i32 %n -; PROLOG: %lcmp.mod = icmp ne i32 %xtraiter, 0 -; PROLOG: br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit +; PROLOG: entry: +; PROLOG: br i1 %cmp1, label %for.end, label %for.body.preheader, !prof [[PROLOG_PROF_0:![0-9]+]] + +; PROLOG: for.body.preheader: +; PROLOG: %xtraiter = and i32 %n +; PROLOG: %lcmp.mod = icmp ne i32 %xtraiter, 0 +; PROLOG: br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit, !prof [[PROLOG_PROF_1:![0-9]+]] ; NOPROLOG-NOT: %xtraiter = and i32 %n ; EPILOG: for.body.epil: -; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ], [ %indvars.iv.unr, %for.body.epil.preheader ] -; EPILOG: %epil.iter.next = add i32 %epil.iter, 1 -; EPILOG: %epil.iter.cmp = icmp ne i32 %epil.iter.next, %xtraiter -; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0 +; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ], [ %indvars.iv.unr, %for.body.epil.preheader ] +; EPILOG: %epil.iter.next = add i32 %epil.iter, 1 +; EPILOG: %epil.iter.cmp = icmp ne i32 %epil.iter.next, %xtraiter +; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !prof [[EPILOG_PROF_3:![0-9]+]], !llvm.loop [[EPILOG_LOOP:![0-9]+]] ; NOEPILOG: for.body: ; NOEPILOG-NOT: for.body.epil: ; PROLOG: for.body.prol: -; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ] -; PROLOG: %prol.iter.next = add i32 %prol.iter, 1 -; PROLOG: %prol.iter.cmp = icmp ne i32 %prol.iter.next, %xtraiter -; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop !0 +; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ] +; PROLOG: %prol.iter.next = add i32 %prol.iter, 1 +; PROLOG: %prol.iter.cmp = icmp ne i32 %prol.iter.next, %xtraiter +; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !prof [[PROLOG_PROF_2:![0-9]+]], !llvm.loop [[PROLOG_LOOP:![0-9]+]] + +; PROLOG: for.body.prol.loopexit: +; PROLOG: br i1 %2, label %for.end.loopexit, label %for.body.preheader.new, !prof [[PROLOG_PROF_1:![0-9]+]] ; NOPROLOG: for.body: ; NOPROLOG-NOT: for.body.prol: -define i32 @test(ptr nocapture %a, i32 %n) nounwind uwtable readonly { +define i32 @test(ptr nocapture %a, i32 %n) nounwind uwtable readonly !prof !2 { entry: %cmp1 = icmp eq i32 %n, 0 - br i1 %cmp1, label %for.end, label %for.body + br i1 %cmp1, label %for.end, label %for.body, !prof !3 for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] @@ -63,7 +76,7 @@ %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %for.end, label %for.body + br i1 %exitcond, label %for.end, label %for.body, !prof !4 for.end: ; preds = %for.body, %entry %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] @@ -274,12 +287,24 @@ !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.unroll.runtime.disable"} +!2 = !{!"function_entry_count", i64 1} +!3 = !{!"branch_weights", i32 1, i32 11} +!4 = !{!"branch_weights", i32 1, i32 42} ; need to use LABEL here to separate function IR matching from metadata matching ; COMMON-LABEL: {{^}}!0 = -; EPILOG-SAME: distinct !{!0, !1} -; EPILOG: !1 = !{!"llvm.loop.unroll.disable"} +; EPILOG: [[EPILOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} +; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 1, i32 127} +; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1, i32 7} +; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 7, i32 1} + +; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_LOOP_1:![0-9]+]]} +; EPILOG: [[EPILOG_LOOP_1]] = !{!"llvm.loop.unroll.disable"} + +; PROLOG: [[PROLOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} +; PROLOG: [[PROLOG_PROF_1]] = !{!"branch_weights", i32 1, i32 127} +; PROLOG: [[PROLOG_PROF_2]] = !{!"branch_weights", i32 7, i32 1} -; PROLOG-SAME: distinct !{!0, !1} -; PROLOG: !1 = !{!"llvm.loop.unroll.disable"} +; PROLOG: distinct !{[[PROLOG_LOOP]], [[PROLOG_LOOP_1:![0-9]+]]} +; PROLOG: [[PROLOG_LOOP_1]] = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll --- a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -4,13 +4,13 @@ ; CHECK-LABEL: @bar_prof ; CHECK: loop: -; CHECK: %mul = mul -; CHECK: %mul.1 = mul -; CHECK: %mul.2 = mul -; CHECK: %mul.3 = mul -; CHECK: br i1 %niter.ncmp.7, label %loop.end.unr-lcssa.loopexit, label %loop, !prof !1 +; CHECK: %mul = mul +; CHECK: %mul.1 = mul +; CHECK: %mul.2 = mul +; CHECK: %mul.3 = mul +; CHECK: br i1 %niter.ncmp.7, label %loop.end.unr-lcssa.loopexit, label %loop, !prof [[PROF0:![0-9]+]] ; CHECK: loop.epil: -; CHECK: br i1 %epil.iter.cmp, label %loop.epil, label %loop.end.epilog-lcssa, !prof !2, !llvm.loop !3 +; CHECK: br i1 %epil.iter.cmp, label %loop.epil, label %loop.end.epilog-lcssa, !prof [[PROF1:![0-9]+]], !llvm.loop {{![0-9]+}} define i32 @bar_prof(ptr noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop @@ -60,5 +60,5 @@ !1 = !{!"function_entry_count", i64 1} !2 = !{!"branch_weights", i32 1, i32 1000} -; CHECK: !1 = !{!"branch_weights", i32 1, i32 124} -; CHECK: !2 = !{!"branch_weights", i32 7, i32 1} \ No newline at end of file +; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 124} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 7, i32 1}