diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h --- a/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/llvm/include/llvm/Analysis/CodeMetrics.h @@ -20,6 +20,7 @@ namespace llvm { class AssumptionCache; class BasicBlock; +class Instruction; class Loop; class Function; template class SmallPtrSetImpl; @@ -45,6 +46,9 @@ /// True if this function contains a call to a convergent function. bool convergent = false; + /// True if the code contains an uncontrolled convergent operation. + bool convergentUncontrolled = false; + /// True if this function calls alloca (in the C sense). bool usesDynamicAlloca = false; @@ -54,6 +58,9 @@ /// Number of analyzed blocks. unsigned NumBlocks = false; + /// Keeps track of loop heart intrinsics and their convergencectrl token use. + std::vector> convergenceHearts; + /// Keeps track of basic block code size estimates. DenseMap NumBBInsts; @@ -77,7 +84,7 @@ /// Add information about a block to the current state. void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl &EphValues, - bool PrepareForLTO = false); + bool PrepareForLTO = false, const Loop *L = nullptr); /// Collect a loop's ephemeral values (those used only by an assume /// or similar intrinsics in the loop). 
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -72,6 +72,7 @@ bool AllowExpensiveTripCount; bool UnrollRemainder; bool ForgetAllSCEV; + const Instruction *Heart = nullptr; }; LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, @@ -126,9 +127,24 @@ std::optional UserUpperBound, std::optional UserFullUnrollMaxCount); -InstructionCost ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, - const SmallPtrSetImpl &EphValues, unsigned BEInsns); +enum class LoopConvergenceKind { + // No convergent operations at all. + None, + + // All convergent operations are controlled and anchored inside the loop. + AnchoredInLoop, + + // Some convergent operations, unrolling is possible subject to constraints + // (no remainder loop). + Some, +}; + +InstructionCost +ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, + LoopConvergenceKind &Convergent, const Instruction *&Heart, + const TargetTransformInfo &TTI, + const SmallPtrSetImpl &EphValues, + unsigned BEInsns); } // end namespace llvm diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InstructionCost.h" @@ -111,11 +112,35 @@ completeEphemeralValues(Visited, Worklist, EphValues); } +static bool isConvergenceCtrlIntr(const Instruction &I) { + auto *Intrinsic = dyn_cast(&I); + if (!Intrinsic) + return false; + switch (Intrinsic->getIntrinsicID()) { + case Intrinsic::experimental_convergence_entry: + case 
Intrinsic::experimental_convergence_anchor: + case Intrinsic::experimental_convergence_loop: + return true; + } + return false; +} + +static bool isUsedOutsideOfLoop(const Instruction &I, const Loop &L) { + for (const auto *U : I.users()) { + if (auto *UserInst = dyn_cast(U)) { + if (!L.contains(UserInst->getParent())) + return true; + } + } + return false; +} + /// Fill in the current structure with information gleaned from the specified /// block. void CodeMetrics::analyzeBasicBlock( const BasicBlock *BB, const TargetTransformInfo &TTI, - const SmallPtrSetImpl &EphValues, bool PrepareForLTO) { + const SmallPtrSetImpl &EphValues, bool PrepareForLTO, + const Loop *L) { ++NumBlocks; InstructionCost NumInstsBeforeThisBB = NumInsts; for (const Instruction &I : *BB) { @@ -163,19 +188,36 @@ if (isa(I) || I.getType()->isVectorTy()) ++NumVectorInsts; - if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) - notDuplicatable = true; + if (I.getType()->isTokenTy()) { /* OR into the flag: a later token must not reset an earlier hit */ + if (L && isConvergenceCtrlIntr(I)) { + notDuplicatable |= isUsedOutsideOfLoop(I, *L); + } else { + notDuplicatable |= I.isUsedOutsideOfBlock(BB); + } + } - if (const CallInst *CI = dyn_cast(&I)) { - if (CI->cannotDuplicate()) + if (const CallBase *CB = dyn_cast(&I)) { + if (CB->cannotDuplicate()) notDuplicatable = true; - if (CI->isConvergent()) + if (CB->isConvergent()) { convergent = true; - } - if (const InvokeInst *InvI = dyn_cast(&I)) - if (InvI->cannotDuplicate()) - notDuplicatable = true; + auto *intrinsicInst = dyn_cast(CB); + auto control = CB->getOperandBundle(LLVMContext::OB_convergencectrl); + if (intrinsicInst && intrinsicInst->getIntrinsicID() == + Intrinsic::experimental_convergence_loop) { + assert(control && + "invalid IR: loop heart without convergencectrl bundle"); + Value *token = control->Inputs[0].get(); + convergenceHearts.emplace_back(intrinsicInst, token); + } else if (!control && + (!intrinsicInst || + intrinsicInst->getIntrinsicID() != + Intrinsic::experimental_convergence_anchor)) { +
convergentUncontrolled = true; + } + } + } NumInsts += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); } diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3350,14 +3350,16 @@ unsigned NumInlineCandidates; bool NotDuplicatable; - bool Convergent; + LoopConvergenceKind Convergent; + const Instruction *Heart; InstructionCost LoopSizeIC = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, - TTI, EphValues, UP.BEInsns); + Heart, TTI, EphValues, UP.BEInsns); LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSizeIC << "\n"); // Loop is not unrollable if the loop contains certain instructions. - if (NotDuplicatable || Convergent || !LoopSizeIC.isValid()) { + if (NotDuplicatable || !LoopSizeIC.isValid() || + Convergent != LoopConvergenceKind::None) { LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n"); return 1; } diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -320,16 +320,17 @@ // Approximate the loop size and collect useful info unsigned NumInlineCandidates; bool NotDuplicatable; - bool Convergent; SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); Loop *SubLoop = L->getSubLoops()[0]; + LoopConvergenceKind Convergent; + const Instruction *Heart; InstructionCost InnerLoopSizeIC = ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable, - Convergent, TTI, EphValues, UP.BEInsns); + Convergent, Heart, TTI, EphValues, UP.BEInsns); InstructionCost OuterLoopSizeIC = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, - TTI, EphValues, UP.BEInsns); + Heart, TTI, EphValues, UP.BEInsns); LLVM_DEBUG(dbgs() << " Outer Loop Size: " << 
OuterLoopSizeIC << "\n"); LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSizeIC << "\n"); @@ -350,7 +351,7 @@ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return LoopUnrollResult::Unmodified; } - if (Convergent) { + if (Convergent != LoopConvergenceKind::None) { LLVM_DEBUG( dbgs() << " Not unrolling loop with convergent instructions.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -664,15 +664,38 @@ /// ApproximateLoopSize - Approximate the size of the loop. InstructionCost llvm::ApproximateLoopSize( - const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, + const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, + LoopConvergenceKind &Convergent, const Instruction *&Heart, const TargetTransformInfo &TTI, const SmallPtrSetImpl &EphValues, unsigned BEInsns) { CodeMetrics Metrics; - for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, TTI, EphValues); + bool convergenceControlledByOutside = false; + + Heart = nullptr; + + for (BasicBlock *BB : L->blocks()) { + Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false, + L); + + for (const auto &heart : Metrics.convergenceHearts) { + BasicBlock *defBlock = cast(heart.second)->getParent(); + if (!L->contains(defBlock)) { + convergenceControlledByOutside = true; + assert(!Heart && "invalid IR: loop has multiple relevant hearts"); + Heart = heart.first; + } + } + Metrics.convergenceHearts.clear(); + } NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; - Convergent = Metrics.convergent; + + if (Metrics.convergentUncontrolled || convergenceControlledByOutside) + Convergent = LoopConvergenceKind::Some; + else if (Metrics.convergent) + Convergent = LoopConvergenceKind::AnchoredInLoop; + else + Convergent = 
LoopConvergenceKind::None; InstructionCost LoopSize = Metrics.NumInsts; @@ -1179,7 +1202,6 @@ bool OptForSize = L->getHeader()->getParent()->hasOptSize(); unsigned NumInlineCandidates; bool NotDuplicatable; - bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, @@ -1196,9 +1218,11 @@ SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + LoopConvergenceKind Convergent; + const Instruction *Heart; InstructionCost LoopSizeIC = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, - TTI, EphValues, UP.BEInsns); + Heart, TTI, EphValues, UP.BEInsns); LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSizeIC << "\n"); if (!LoopSizeIC.isValid()) { @@ -1254,15 +1278,18 @@ // is unsafe -- it adds a control-flow dependency to the convergent // operation. Therefore restrict remainder loop (try unrolling without). // - // TODO: This is quite conservative. In practice, convergent_op() - // is likely to be called unconditionally in the loop. In this - // case, the program would be ill-formed (on most architectures) - // unless n were the same on all threads in a thread group. - // Assuming n is the same on all threads, any kind of unrolling is - // safe. But currently llvm's notion of convergence isn't powerful - // enough to express this. - if (Convergent) + // TODO: This is still somewhat conservative, as we could allow the remainder + // if the trip count is uniform (and we don't have an unnatural heart). 
+ bool ConvergentAllowsRuntime = true; + switch (Convergent) { + case LoopConvergenceKind::None: + case LoopConvergenceKind::AnchoredInLoop: + break; // no convergence-related restrictions + case LoopConvergenceKind::Some: UP.AllowRemainder = false; + ConvergentAllowsRuntime = false; + break; + } // Try to find the trip count upper bound if we cannot find the exact trip // count. @@ -1282,6 +1309,8 @@ if (!UP.Count) return LoopUnrollResult::Unmodified; + UP.Runtime &= ConvergentAllowsRuntime; + if (PP.PeelCount) { assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step"); LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName() @@ -1324,11 +1353,16 @@ // Unroll the loop. Loop *RemainderLoop = nullptr; + UnrollLoopOptions ULO; + ULO.Count = UP.Count; + ULO.Force = UP.Force; + ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount; + ULO.UnrollRemainder = UP.UnrollRemainder; + ULO.Runtime = UP.Runtime; + ULO.ForgetAllSCEV = ForgetAllSCEV; + ULO.Heart = Heart; LoopUnrollResult UnrollResult = UnrollLoop( - L, - {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, - UP.UnrollRemainder, ForgetAllSCEV}, - LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); + L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -395,20 +395,37 @@ return LoopUnrollResult::Unmodified; } - // Loops containing convergent instructions cannot use runtime unrolling, - // as the prologue/epilogue may add additional control-dependencies to - // convergent operations. 
- LLVM_DEBUG( - { - bool HasConvergent = false; - for (auto &BB : L->blocks()) - for (auto &I : *BB) - if (auto *CB = dyn_cast(&I)) - HasConvergent |= CB->isConvergent(); - assert((!HasConvergent || !ULO.Runtime) && - "Can't runtime unroll if loop contains a convergent operation."); - }); - + // Loops containing convergent instructions that are uncontrolled or + // controlled from outside the loop must have a count that divides + // their TripMultiple. + LLVM_DEBUG({ + bool HasOutsideConvergenceControl = false; + for (auto &BB : L->blocks()) { + for (auto &I : *BB) { + if (auto *CB = dyn_cast(&I)) { + if (CB->isConvergent()) { + auto control = + CB->getOperandBundle(LLVMContext::OB_convergencectrl); + if (!control) { + auto intrinsicInst = dyn_cast(CB); + if (intrinsicInst && + intrinsicInst->getIntrinsicID() == + Intrinsic::experimental_convergence_anchor) { + continue; + } + HasOutsideConvergenceControl = true; + break; + } + Value *token = control->Inputs[0].get(); + if (!L->contains(cast(token))) + HasOutsideConvergenceControl = true; + } + } + } + } + assert((!HasOutsideConvergenceControl || !ULO.Runtime) && + "Can't runtime unroll if loop contains a convergent operation."); + }); bool EpilogProfitability = UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog : isEpilogProfitable(L); @@ -536,6 +553,9 @@ // latch. This is a reasonable default placement if we don't have block // frequencies, and if we do, well the layout will be adjusted later. auto BlockInsertPt = std::next(LatchBlock->getIterator()); + + assert(ULO.Heart == nullptr || ULO.Heart->getParent() == Header); + for (unsigned It = 1; It != ULO.Count; ++It) { SmallVector NewBlocks; SmallDenseMap NewLoops; @@ -553,7 +573,7 @@ if (OldLoop) LoopsToSimplify.insert(NewLoops[OldLoop]); - if (*BB == Header) + if (*BB == Header) { // Loop over all of the PHI nodes in the block, changing them to use // the incoming values from the previous block. 
for (PHINode *OrigPHI : OrigPHINode) { @@ -566,6 +586,16 @@ NewPHI->eraseFromParent(); } + // Eliminate copies of the loop heart intrinsic, if any. + if (ULO.Heart) { + auto it = VMap.find(ULO.Heart); + assert(it != VMap.end()); + Instruction *heartCopy = cast(it->second); + heartCopy->eraseFromParent(); + VMap.erase(it); + } + } + // Update our running map of newest clones LastValueMap[*BB] = New; for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -993,12 +993,15 @@ auto UnrollResult = LoopUnrollResult::Unmodified; if (remainderLoop && UnrollRemainder) { LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n"); - UnrollResult = - UnrollLoop(remainderLoop, - {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false, - /*AllowExpensiveTripCount*/ false, - /*UnrollRemainder*/ false, ForgetAllSCEV}, - LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA); + UnrollLoopOptions ULO; + ULO.Count = Count - 1; + ULO.Force = false; + ULO.Runtime = false; + ULO.AllowExpensiveTripCount = false; + ULO.UnrollRemainder = false; + ULO.ForgetAllSCEV = ForgetAllSCEV; + UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI, + /*ORE*/ nullptr, PreserveLCSSA); } if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled) diff --git a/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll @@ -0,0 +1,576 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s + +declare void @f() convergent +declare void @g() + +; Although this loop contains a convergent instruction, it 
should be +; fully unrolled. +define i32 @full_unroll() { +; CHECK-LABEL: @full_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: a: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_1:%.*]] +; CHECK: a.1: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_2:%.*]] +; CHECK: a.2: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 3 + br label %a + +a: + call void @f() [ "convergencectrl"(token %tok.loop) ] + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, but it should be partially +; unrolled. The unroll count is the largest power of 2 that divides the +; multiple -- 4, in this case. 
+define i32 @runtime_unroll(i32 %n) { +; CHECK-LABEL: @runtime_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 12 +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_3:%.*]], [[A_3:%.*]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: a: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: br label [[A_1:%.*]] +; CHECK: a.1: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[INC]], 1 +; CHECK-NEXT: br label [[A_2:%.*]] +; CHECK: a.2: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[INC_1]], 1 +; CHECK-NEXT: br label [[A_3]] +; CHECK: a.3: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_3]] = add nsw i32 [[INC_2]], 1 +; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[EXIT:%.*]], label [[L3]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 12 + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + br label %a + +a: + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, so its partial unroll +; count must divide its trip multiple. 
This overrides its unroll +; pragma -- we unroll exactly 8 times, even though 16 is requested. +define i32 @pragma_unroll(i32 %n) { +; CHECK-LABEL: @pragma_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 24 +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[A_7:%.*]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: a: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: br label [[A_1:%.*]] +; CHECK: a.1: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[INC]], 1 +; CHECK-NEXT: br label [[A_2:%.*]] +; CHECK: a.2: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[INC_1]], 1 +; CHECK-NEXT: br label [[A_3:%.*]] +; CHECK: a.3: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_3:%.*]] = add nuw nsw i32 [[INC_2]], 1 +; CHECK-NEXT: br label [[A_4:%.*]] +; CHECK: a.4: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_4:%.*]] = add nuw nsw i32 [[INC_3]], 1 +; CHECK-NEXT: br label [[A_5:%.*]] +; CHECK: a.5: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_5:%.*]] = add nuw nsw i32 [[INC_4]], 1 +; CHECK-NEXT: br label [[A_6:%.*]] +; CHECK: a.6: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_6:%.*]] = add nuw nsw i32 [[INC_5]], 1 +; CHECK-NEXT: br label [[A_7]] +; CHECK: a.7: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token 
[[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_7]] = add nsw i32 [[INC_6]], 1 +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp eq i32 [[INC_7]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 24 + br label %l3, !llvm.loop !0 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + br label %a + +a: + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3, !llvm.loop !0 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip count 4. The loop unroll should respect the pragma. +define void @pragma_unroll_divisible_trip_count() { +; CHECK-LABEL: @pragma_unroll_divisible_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1 +; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], 4 +; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, 
!llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 4 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret void +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip multiple 2. The loop unroll should respect the pragma. +define i32 @pragma_unroll_divisible_trip_multiple(i32 %n) { +; CHECK-LABEL: @pragma_unroll_divisible_trip_multiple( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 2 +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1]] = add nsw i32 [[INC]], 1 +; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 2 + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, 
label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 is unknown to divide runtime trip count, the loop is not unrolled +; since remainder is forbidden for unrolling convergent loop. +define i32 @pragma_unroll_indivisible_runtime_trip_count(i32 %n) { +; CHECK-LABEL: @pragma_unroll_indivisible_runtime_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 does not divide trip count 5, the loop is not unrolled by 2 +; since remainder is forbidden for unrolling convergent loop. Instead, the +; loop gets fully unrolled. 
+define i32 @pragma_unroll_indivisible_trip_count() { +; CHECK-LABEL: @pragma_unroll_indivisible_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 5 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction that is anchored inside the loop +; itself. It is unrolled by 2 with remainder, as requested by the loop metadata. 
+define i32 @pragma_unroll_with_remainder(i32 %n) { +; CHECK-LABEL: @pragma_unroll_with_remainder( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[N:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 1 +; CHECK-NEXT: br i1 [[TMP2]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]] +; CHECK: entry.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[TMP0]], [[XTRAITER]] +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[NITER_NEXT:%.*]] = add nuw nsw i32 [[NITER]], 1 +; CHECK-NEXT: [[TOK_LOOP_1:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP_1]]) ] +; CHECK-NEXT: [[INC_1]] = add nsw i32 [[INC]], 1 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i32 [[NITER_NEXT]], 1 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i32 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[L3]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: exit.unr-lcssa.loopexit: +; CHECK-NEXT: br label [[EXIT_UNR_LCSSA]] +; CHECK: exit.unr-lcssa: +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[L3_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: l3.epil.preheader: +; CHECK-NEXT: br label [[L3_EPIL:%.*]] +; CHECK: l3.epil: +; CHECK-NEXT: [[TOK_LOOP_EPIL:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token 
[[TOK_LOOP_EPIL]]) ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; Don't unroll a loop that is extended by convergence controls. +; +; We could theoretically duplicate the extension part, but this is not +; implemented. +define i32 @extended_loop(i32 %n) { +; CHECK-LABEL: @extended_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]] +; CHECK: exit: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + call void @f() [ "convergencectrl"(token %tok.loop) ] + ret i32 0 +} + +; Inner loop is extended beyond the outer loop. No unrolling possible. 
+
+define i32 @extended_inner_loop_1(i32 %n, i1 %cond) {
+; CHECK-LABEL: @extended_inner_loop_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 4
+; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2:
+; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP4]]
+; CHECK: latch:
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]]
+; CHECK: exit:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: ret i32 0
+;
+; Input: outer loop %l3 -> %l2 -> %latch, left once %inc equals 4; the inner
+; loop is the %l2 self-loop controlled by %cond. The expected output above has
+; the same blocks and instructions as the input.
+entry:
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  ; This anchor's token has no consumer anywhere in the function.
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br label %l2, !llvm.loop !1
+
+l2:
+  %tok.l2 = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ; %tok.l2 is defined in the inner loop but consumed here, after both loops.
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  ret i32 0
+}
+
+; Inner loop is extended inside the outer loop. Outer loop is unrolled.
+
+define i32 @extended_inner_loop_2(i32 %n, i1 %cond) {
+; CHECK-LABEL: @extended_inner_loop_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2:
+; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br label [[L2_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1:
+; CHECK-NEXT: [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch.1:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT: br label [[L2_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.2:
+; CHECK-NEXT: [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_2]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch.2:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT: br label [[L2_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.3:
+; CHECK-NEXT: [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_3]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch.3:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT: ret i32 0
+;
+; Input: outer loop %l3 -> %l2 -> %latch, left once %inc equals 4; the inner
+; loop is the %l2 self-loop controlled by %cond. The expected output above has
+; four copies of the l2/latch pair (l2 ... l2.3), no branch back to l3, and
+; every l2 copy still branching to itself.
+entry:
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  ; This anchor's token has no consumer; it does not appear in the expected
+  ; output above.
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br label %l2, !llvm.loop !1
+
+l2:
+  %tok.l2 = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+  ; %tok.l2 is consumed outside the inner loop but still inside the outer loop.
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; No extension. Both loops unrolled.
+
+define i32 @unroll_nest(i32 %n, i1 %cond) {
+; CHECK-LABEL: @unroll_nest(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2:
+; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2_1:%.*]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1:
+; CHECK-NEXT: [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: latch:
+; CHECK-NEXT: br label [[L2_12:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.12:
+; CHECK-NEXT: [[TOK_L2_11:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_11]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1_1:%.*]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1.1:
+; CHECK-NEXT: [[TOK_L2_1_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_1]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_12]], label [[LATCH_1]], !llvm.loop [[LOOP9]]
+; CHECK: latch.1:
+; CHECK-NEXT: br label [[L2_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.2:
+; CHECK-NEXT: [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1_2:%.*]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1.2:
+; CHECK-NEXT: [[TOK_L2_1_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_2]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_2]], label [[LATCH_2]], !llvm.loop [[LOOP9]]
+; CHECK: latch.2:
+; CHECK-NEXT: br label [[L2_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.3:
+; CHECK-NEXT: [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1_3:%.*]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1.3:
+; CHECK-NEXT: [[TOK_L2_1_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_3]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_3]], label [[LATCH_3]], !llvm.loop [[LOOP9]]
+; CHECK: latch.3:
+; CHECK-NEXT: ret i32 0
+;
+; Input: outer loop %l3 -> %l2 -> %latch, left once %inc equals 4, with the
+; inner self-loop %l2 controlled by %cond; %tok.l2 is only consumed in %l2, the
+; block that defines it. The expected output above has four copies of the outer
+; body and no branch back to l3; in each copy the inner loop is a pair of
+; blocks (e.g. l2 and l2.1), each with its own anchor and call.
+entry:
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  ; This anchor's token has no consumer; it does not appear in the expected
+  ; output above.
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br label %l2, !llvm.loop !1
+
+l2:
+  %tok.l2 = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; Convergence control intrinsics.
+; NOTE(review): convergence.loop is not referenced in this part of the file;
+; presumably earlier tests use it - confirm before removing.
+declare token @llvm.experimental.convergence.anchor()
+declare token @llvm.experimental.convergence.loop()
+
+; Loop metadata attached through !llvm.loop: !0 carries an unroll count of 16,
+; !1 carries an unroll count of 2.
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
+!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}}