diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -78,6 +79,8 @@ STATISTIC( NumCostMultiplierSkipped, "Number of unswitch candidates that had their cost multiplier skipped"); +STATISTIC(NumInvariantConditionsInjected, + "Number of invariant conditions injected and unswitched"); static cl::opt EnableNonTrivialUnswitch( "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, @@ -118,15 +121,53 @@ cl::desc("If enabled, the freeze instruction will be added to condition " "of loop unswitch to prevent miscompilation.")); +static cl::opt InjectInvariantConditions( + "simple-loop-unswitch-inject-invariant-conditions", cl::Hidden, + cl::desc("Whether we should inject new invariants and unswitch them to " + "eliminate some existing (non-invariant) conditions."), + cl::init(true)); + +static cl::opt InjectInvariantConditionHotnesThreshold( + "simple-loop-unswitch-inject-invariant-condition-hotness-threshold", + cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " + "unswitch on them to eliminate branches that are " + "not-taken 1/ times or less."), + cl::init(16)); + namespace { +struct CompareDesc { + BranchInst *Term; + Value *Invariant; + BasicBlock *InLoopSucc; + + CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc) + : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {} +}; + +struct InjectedInvariant { + ICmpInst::Predicate Pred; + Value *LHS; + Value *RHS; + BasicBlock *InLoopSucc; + + InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, + BasicBlock *InLoopSucc) + : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {} +}; + struct NonTrivialUnswitchCandidate { Instruction *TI = nullptr; TinyPtrVector Invariants; std::optional Cost; + std::optional PendingInjection; NonTrivialUnswitchCandidate( Instruction *TI, ArrayRef Invariants, - std::optional Cost = std::nullopt) - : TI(TI), Invariants(Invariants), Cost(Cost){}; + std::optional Cost = std::nullopt, + std::optional PendingInjection = std::nullopt) + : TI(TI), Invariants(Invariants), Cost(Cost), + PendingInjection(PendingInjection) {}; + + bool hasPendingInjection() const { return PendingInjection.has_value(); } }; } // end anonymous namespace. @@ -2844,6 +2885,252 @@ return !UnswitchCandidates.empty(); } +/// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS ) +/// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by +/// injecting a loop-invariant condition. +static bool shouldTryInjectInvariantCondition( + const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS, + const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) { + if (L.isLoopInvariant(LHS) || !L.isLoopInvariant(RHS)) + return false; + // TODO: Support other predicates. + if (Pred != ICmpInst::ICMP_ULT) + return false; + // TODO: Support non-loop-exiting branches? + if (!L.contains(IfTrue) || L.contains(IfFalse)) + return false; + // FIXME: For some reason this causes problems with MSSA updates, need to + // investigate why. So far, just don't unswitch latch. + if (L.getHeader() == IfTrue) + return false; + return true; +} + +/// Returns true, if metadata on \p BI allows us to optimize branching into \p +/// TakenSucc via injection of invariant conditions. The branch should be not +/// enough and not previously unswitched, the information about this comes from +/// the metadata. +bool shouldTryInjectBasingOnMetadata(const BranchInst *BI, + const BasicBlock *TakenSucc) { + // Skip branches that have already been unswithed this way. After successful + // unswitching of injected condition, we will still have a copy of this loop + // which looks exactly the same as original one. To prevent the 2nd attempt + // of unswitching it in the same pass, mark this branch as "nothing to do + // here". + if (BI->hasMetadata("llvm.invariant.condition.injection.disabled")) + return false; + SmallVector Weights; + if (!extractBranchWeights(*BI, Weights)) + return false; + unsigned T = InjectInvariantConditionHotnesThreshold; + BranchProbability LikelyTaken(T - 1, T); + + assert(Weights.size() == 2 && "Unexpected profile data!"); + size_t Idx = BI->getSuccessor(0) == TakenSucc ? 0 : 1; + auto Num = Weights[Idx]; + auto Denom = Weights[0] + Weights[1]; + // Degenerate metadata. + if (Denom == 0) + return false; + BranchProbability ActualTaken(Num, Denom); + if (LikelyTaken > ActualTaken) + return false; + return true; +} + +/// Materialize pending invariant condition of the given candidate into IR. The +/// injected loop-invariant condition implies the original loop-variant branch +/// condition, so the materialization turns +/// +/// loop_block: +/// ... +/// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc +/// +/// into +/// +/// preheader: +/// %invariant_cond = LHS pred RHS +/// ... +/// loop_block: +/// br i1 %invariant_cond, label InLoopSucc, label OriginalCheck +/// OriginalCheck: +/// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc +/// ... +static NonTrivialUnswitchCandidate +injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, + DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, MemorySSAUpdater *MSSAU) { + assert(Candidate.hasPendingInjection() && "Nothing to inject!"); + BasicBlock *Preheader = L.getLoopPreheader(); + assert(Preheader && "Loop is not in simplified form?"); + + auto Pred = Candidate.PendingInjection->Pred; + auto *LHS = Candidate.PendingInjection->LHS; + auto *RHS = Candidate.PendingInjection->RHS; + auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc; + auto *TI = cast(Candidate.TI); + auto *BB = Candidate.TI->getParent(); + assert(InLoopSucc == TI->getSuccessor(0)); + auto *OutOfLoopSucc = TI->getSuccessor(1); + // FIXME: Remove this once limitation on successors is lifted. + assert(L.contains(InLoopSucc) && "Not supported yet!"); + assert(!L.contains(OutOfLoopSucc) && "Not supported yet!"); + auto &Ctx = BB->getContext(); + + assert(LHS->getType() == RHS->getType() && "Type mismatch!"); + // Do not use builder here: CreateICmp may simplify this intro a constant and + // unswitching will break. Better optimize it away later. + auto *InjectedCond = + ICmpInst::Create(Instruction::ICmp, Pred, LHS, RHS, "injected.cond", + Preheader->getTerminator()); + auto *OldCond = TI->getCondition(); + + BasicBlock *CheckBlock = BasicBlock::Create(Ctx, BB->getName() + ".check", + BB->getParent(), InLoopSucc); + IRBuilder<> Builder(TI); + auto *InvariantBr = + Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + + Builder.SetInsertPoint(CheckBlock); + auto *NewTerm = Builder.CreateCondBr(OldCond, InLoopSucc, OutOfLoopSucc); + + TI->eraseFromParent(); + // Prevent infinite unswitching. + NewTerm->setMetadata("llvm.invariant.condition.injection.disabled", + MDNode::get(BB->getContext(), {})); + + // Fixup phis. + for (auto &I : *InLoopSucc) { + auto *PN = dyn_cast(&I); + if (!PN) + break; + auto *Inc = PN->getIncomingValueForBlock(BB); + PN->addIncoming(Inc, CheckBlock); + } + OutOfLoopSucc->replacePhiUsesWith(BB, CheckBlock); + + SmallVector DTUpdates = { + { DominatorTree::Insert, BB, CheckBlock }, + { DominatorTree::Insert, CheckBlock, InLoopSucc }, + { DominatorTree::Insert, CheckBlock, OutOfLoopSucc }, + { DominatorTree::Delete, BB, OutOfLoopSucc } + }; + + DT.applyUpdates(DTUpdates); + if (MSSAU) + MSSAU->applyUpdates(DTUpdates, DT); + L.addBasicBlockToLoop(CheckBlock, LI); + +#ifdef EXPENSIVE_CHECKS + DT.verify(); + LI.verify(DT); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); +#endif + + // TODO: In fact, cost of unswitching a new invariant candidate is *slightly* + // higher because we have just inserted a new block. Need to think how to + // adjust the cost of injected candidates when it was first computed. + LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr + << " and considering it for unswitching."); + ++NumInvariantConditionsInjected; + return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond }, + Candidate.Cost); +} + +/// Given chain of loop branch conditions looking like: +/// br (Variant < Invariant1) +/// br (Variant < Invariant2) +/// br (Variant < Invariant3) +/// ... +/// collect set of invariant conditions on which we want to unswitch, which +/// look like: +/// Invariant1 <= Invariant2 +/// Invariant2 <= Invariant3 +/// ... +/// Though they might not immediately exist in the IR, we can still inject them. +static bool insertCandidatesWithPendingInjections( + SmallVectorImpl &UnswitchCandidates, Loop &L, + ICmpInst::Predicate Pred, ArrayRef Compares, + const DominatorTree &DT) { + + assert(ICmpInst::isRelational(Pred)); + assert(ICmpInst::isStrictPredicate(Pred)); + if (Compares.size() < 2) + return false; + ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(Pred); + for (auto Prev = Compares.begin(), Next = Compares.begin() + 1; + Next != Compares.end(); ++Prev, ++Next) { + Value *LHS = Next->Invariant; + Value *RHS = Prev->Invariant; + BasicBlock *InLoopSucc = Prev->InLoopSucc; + InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc); + NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS }, + std::nullopt, std::move(ToInject)); + UnswitchCandidates.push_back(std::move(Candidate)); + } + return true; +} + +/// Collect unswitch candidates by invariant conditions that are not immediately +/// present in the loop. However, they can be injected into the code if we +/// decide it's profitable. +/// An example of such conditions is following: +/// +/// for (...) { +/// x = load ... +/// if (! x +/// } +/// +/// We can unswitch by condition "C1 <=u C2". If that is true, then "x &UnswitchCandidates, + IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L, + const DominatorTree &DT, const LoopInfo &LI, AAResults &AA, + const MemorySSAUpdater *MSSAU) { + if (!InjectInvariantConditions) + return false; + + if (!DT.isReachableFromEntry(L.getHeader())) + return false; + auto *Latch = L.getLoopLatch(); + // Need to have a single latch and a preheader. + if (!Latch) + return false; + assert(L.getLoopPreheader() && "Must have a preheader!"); + + DenseMap > CandidatesULT; + // Traverse the conditions that dominate latch (and therefore dominate each + // other). + for (auto *DTN = DT.getNode(Latch); L.contains(DTN->getBlock()); + DTN = DTN->getIDom()) { + ICmpInst::Predicate Pred; + Value *LHS = nullptr, *RHS = nullptr; + BasicBlock *IfTrue = nullptr, *IfFalse = nullptr; + auto *BB = DTN->getBlock(); + auto *Term = BB->getTerminator(); + if (!match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), + m_BasicBlock(IfTrue), m_BasicBlock(IfFalse)))) + continue; + if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L)) + continue; + if (!shouldTryInjectBasingOnMetadata(cast(Term), IfTrue)) + continue; + CompareDesc Desc(cast(Term), RHS, IfTrue); + CandidatesULT[LHS].push_back(Desc); + } + + bool Found = false; + for (auto &It : CandidatesULT) + Found |= insertCandidatesWithPendingInjections( + UnswitchCandidates, L, ICmpInst::ICMP_ULT, It.second, DT); + return Found; +} + static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { if (!L.isSafeToClone()) return false; @@ -3003,10 +3290,11 @@ Instruction &TI = *Candidate.TI; ArrayRef Invariants = Candidate.Invariants; BranchInst *BI = dyn_cast(&TI); - InstructionCost CandidateCost = ComputeUnswitchedCost( - TI, /*FullUnswitch*/ !BI || - (Invariants.size() == 1 && - Invariants[0] == skipTrivialSelect(BI->getCondition()))); + bool FullUnswitch = + !BI || Candidate.hasPendingInjection() || + (Invariants.size() == 1 && + Invariants[0] == skipTrivialSelect(BI->getCondition())); + InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch); // Calculate cost multiplier which is a tool to limit potentially // exponential behavior of loop-unswitch. if (EnableUnswitchCostMultiplier) { @@ -3044,9 +3332,13 @@ SmallVector UnswitchCandidates; IVConditionInfo PartialIVInfo; Instruction *PartialIVCondBranch = nullptr; + collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, LI, AA, MSSAU); + collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, DT, LI, AA, + MSSAU); // If we didn't find any candidates, we're done. - if (!collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, - PartialIVCondBranch, L, LI, AA, MSSAU)) + if (UnswitchCandidates.empty()) return false; LLVM_DEBUG( @@ -3065,6 +3357,11 @@ return false; } + if (Best.hasPendingInjection()) + Best = injectPendingInvariantConditions(Best, L, DT, LI, AC, MSSAU); + assert(!Best.hasPendingInjection() && + "All injections should have been done by now!"); + if (Best.TI != PartialIVCondBranch) PartialIVInfo.InstToDuplicate.clear(); diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -0,0 +1,481 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s +; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch),simplifycfg" -verify-memoryssa | FileCheck %s + +define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_01( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[LEN]] +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] +; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 +; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: guarded.us: +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LEN]] +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] +; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK: guarded: +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !llvm.invariant.condition.injection.disabled !0 +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i32, ptr %p, i32 %iv + %el = load i32, ptr %el.ptr, align 4 + %bound_check = icmp ult i32 %el, %limit + br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +guarded: ; preds = %loop + %range_check = icmp ult i32 %el, %len + br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, %n + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +} + +; Should not optimize: profile metadata is missing. +define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_01_neg_void_profile( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i32, ptr %p, i32 %iv + %el = load i32, ptr %el.ptr, align 4 + %bound_check = icmp ult i32 %el, %limit + br i1 %bound_check, label %guarded, label %bound_check_failed + +guarded: ; preds = %loop + %range_check = icmp ult i32 %el, %len + br i1 %range_check, label %backedge, label %range_check_failed + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, %n + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +} + +; Same as test_01, but n and limit are constants. +define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_01_constants( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] +; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 +; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200 +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK: guarded.us: +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300 +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] +; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], 1000 +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200 +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i32, ptr %p, i32 %iv + %el = load i32, ptr %el.ptr, align 4 + %bound_check = icmp ult i32 %el, 200 + br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +guarded: ; preds = %loop + %range_check = icmp ult i32 %el, 300 + br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, 1000 + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +} + +define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_01_neg_degenerate_profile( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK: guarded: +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF2:![0-9]+]] +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i32, ptr %p, i32 %iv + %el = load i32, ptr %el.ptr, align 4 + %bound_check = icmp ult i32 %el, %limit + br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +guarded: ; preds = %loop + %range_check = icmp ult i32 %el, %len + br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 0, i32 0} + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, %n + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +} + +; Should not optimize: cold branch. +define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_01_neg_cold( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK: guarded: +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3:![0-9]+]] +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i32, ptr %p, i32 %iv + %el = load i32, ptr %el.ptr, align 4 + %bound_check = icmp ult i32 %el, %limit + br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +guarded: ; preds = %loop + %range_check = icmp ult i32 %el, %len + br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 2, i32 3} + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, %n + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +} + +define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_02( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK: guarded: +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i32, ptr %p, i32 %iv + %el = load i32, ptr %el.ptr, align 4 + %bound_check = icmp sge i32 %el, 0 + br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +guarded: ; preds = %loop + %range_check = icmp ult i32 %el, %len + br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, %n + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +} + +define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_03( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET:%.*]], label [[GUARDED:%.*]], !prof [[PROF4:![0-9]+]] +; CHECK: guarded: +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i32, ptr %p, i32 %iv + %el = load i32, ptr %el.ptr, align 4 + %bound_check = icmp slt i32 %el, 0 + br i1 %bound_check, label %bound_check_failed, label %guarded, !prof !{!"branch_weights", i32 1, i32 100} + +guarded: ; preds = %loop + %range_check = icmp ult i32 %el, %len + br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, %n + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +} + +define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) { +; CHECK-LABEL: @test_04( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET:%.*]], label [[GUARDED:%.*]], !prof [[PROF4]] +; CHECK: guarded: +; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[LEN]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK: backedge: +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_WIDE]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; +entry: + %len = load i32, ptr %len_p, align 4, !noundef !{} + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %el.ptr = getelementptr i8, ptr %p, i32 %iv + %el = load i8, ptr %el.ptr, align 4 + %bound_check = icmp slt i8 %el, 0 + br i1 %bound_check, label %bound_check_failed, label %guarded, !prof !{!"branch_weights", i32 1, i32 100} + +guarded: ; preds = %loop + %el.wide = zext i8 %el to i32 + %range_check = icmp ult i32 %el.wide, %len + br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1} + +backedge: ; preds = %guarded + %arr.ptr = getelementptr i32, ptr %arr, i32 %el.wide + store i32 %iv, ptr %arr.ptr, align 4 + %iv.next = add i32 %iv, 1 + %loop_cond = icmp slt i32 %iv.next, %n + br i1 %loop_cond, label %loop, label %exit + +exit: ; preds = %backedge + ret i32 0 + +bound_check_failed: ; preds = %loop + ret i32 -1 + +range_check_failed: ; preds = %guarded + ret i32 -2 +}