diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h --- a/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/llvm/include/llvm/Analysis/CodeMetrics.h @@ -19,6 +19,7 @@ namespace llvm { class AssumptionCache; class BasicBlock; +class Instruction; class Loop; class Function; template class SmallPtrSetImpl; @@ -44,6 +45,9 @@ /// True if this function contains a call to a convergent function. bool convergent = false; + /// True if the code contains an uncontrolled convergent operation. + bool convergentUncontrolled = false; + /// True if this function calls alloca (in the C sense). bool usesDynamicAlloca = false; @@ -53,6 +57,9 @@ /// Number of analyzed blocks. unsigned NumBlocks = false; + /// Keeps track of loop heart intrinsics and their convergencectrl token use. + std::vector> convergenceHearts; + /// Keeps track of basic block code size estimates. DenseMap NumBBInsts; diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -76,6 +76,7 @@ unsigned PeelCount; bool UnrollRemainder; bool ForgetAllSCEV; + const Instruction *Heart = nullptr; }; LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, @@ -127,8 +128,25 @@ Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, Optional UserFullUnrollMaxCount); +enum class LoopConvergenceKind { + // No convergent operations at all. + None, + + // All convergent operations are controlled and anchored inside the loop. + AnchoredInLoop, + + // Some convergent operations, unrolling is possible subject to constraints + // (no remainder loop). + Some, + + // Heart controlling the loop outside of its header block. + NonHeaderHeart, +}; + unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, bool &Convergent, + bool &NotDuplicatable, + LoopConvergenceKind &Convergent, + const Instruction *&Heart, const TargetTransformInfo &TTI, const SmallPtrSetImpl &EphValues, unsigned BEInsns); diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "code-metrics" @@ -156,19 +157,34 @@ if (isa(I) || I.getType()->isVectorTy()) ++NumVectorInsts; - if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) + if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) { + // TODO: This seems misleading, and leads to potentially too conservative + // behavior for loop unrolling decisions. notDuplicatable = true; + } - if (const CallInst *CI = dyn_cast(&I)) { - if (CI->cannotDuplicate()) + if (const CallBase *CB = dyn_cast(&I)) { + if (CB->cannotDuplicate()) notDuplicatable = true; - if (CI->isConvergent()) + if (CB->isConvergent()) { convergent = true; - } - if (const InvokeInst *InvI = dyn_cast(&I)) - if (InvI->cannotDuplicate()) - notDuplicatable = true; + auto *intrinsicInst = dyn_cast(CB); + auto control = CB->getOperandBundle(LLVMContext::OB_convergencectrl); + if (intrinsicInst && intrinsicInst->getIntrinsicID() == + Intrinsic::experimental_convergence_loop) { + assert(control && + "invalid IR: loop heart without convergencectrl bundle"); + Value *token = control.getValue().Inputs[0].get(); + convergenceHearts.emplace_back(intrinsicInst, token); + } else if (!control && + (!intrinsicInst || + intrinsicInst->getIntrinsicID() != + Intrinsic::experimental_convergence_anchor)) { + convergentUncontrolled = true; + } + } + } NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); } diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -322,16 +322,17 @@ // Approximate the loop size and collect useful info unsigned NumInlineCandidates; bool NotDuplicatable; - bool Convergent; SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); Loop *SubLoop = L->getSubLoops()[0]; + LoopConvergenceKind Convergent; + const Instruction *Heart; unsigned InnerLoopSize = ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable, - Convergent, TTI, EphValues, UP.BEInsns); + Convergent, Heart, TTI, EphValues, UP.BEInsns); unsigned OuterLoopSize = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, - TTI, EphValues, UP.BEInsns); + Heart, TTI, EphValues, UP.BEInsns); LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n"); LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n"); if (NotDuplicatable) { @@ -343,7 +344,7 @@ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return LoopUnrollResult::Unmodified; } - if (Convergent) { + if (Convergent != LoopConvergenceKind::None) { LLVM_DEBUG( dbgs() << " Not unrolling loop with convergent instructions.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -632,15 +632,42 @@ /// ApproximateLoopSize - Approximate the size of the loop. unsigned llvm::ApproximateLoopSize( - const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, + const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, + LoopConvergenceKind &Convergent, const Instruction *&Heart, const TargetTransformInfo &TTI, const SmallPtrSetImpl &EphValues, unsigned BEInsns) { CodeMetrics Metrics; - for (BasicBlock *BB : L->blocks()) + bool convergenceControlledByOutside = false; + bool nonHeaderHeart = false; + + Heart = nullptr; + + for (BasicBlock *BB : L->blocks()) { Metrics.analyzeBasicBlock(BB, TTI, EphValues); + + for (const auto &heart : Metrics.convergenceHearts) { + BasicBlock *defBlock = cast(heart.second)->getParent(); + if (!L->contains(defBlock)) { + convergenceControlledByOutside = true; + if (BB != L->getHeader()) + nonHeaderHeart = true; + assert(!Heart && "invalid IR: loop has multiple relevant hearts"); + Heart = heart.first; + } + } + Metrics.convergenceHearts.clear(); + } NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; - Convergent = Metrics.convergent; + + if (nonHeaderHeart) + Convergent = LoopConvergenceKind::NonHeaderHeart; + else if (Metrics.convergentUncontrolled || convergenceControlledByOutside) + Convergent = LoopConvergenceKind::Some; + else if (Metrics.convergent) + Convergent = LoopConvergenceKind::AnchoredInLoop; + else + Convergent = LoopConvergenceKind::None; unsigned LoopSize = Metrics.NumInsts; @@ -1036,7 +1063,6 @@ bool OptForSize = L->getHeader()->getParent()->hasOptSize(); unsigned NumInlineCandidates; bool NotDuplicatable; - bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, @@ -1053,9 +1079,11 @@ SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + LoopConvergenceKind Convergent; + const Instruction *Heart; unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, - TTI, EphValues, UP.BEInsns); + Heart, TTI, EphValues, UP.BEInsns); LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NotDuplicatable) { LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" @@ -1092,15 +1120,18 @@ // is unsafe -- it adds a control-flow dependency to the convergent // operation. Therefore restrict remainder loop (try unrollig without). // - // TODO: This is quite conservative. In practice, convergent_op() - // is likely to be called unconditionally in the loop. In this - // case, the program would be ill-formed (on most architectures) - // unless n were the same on all threads in a thread group. - // Assuming n is the same on all threads, any kind of unrolling is - // safe. But currently llvm's notion of convergence isn't powerful - // enough to express this. - if (Convergent) + // TODO: This is still somewhat conservative, as we could allow the remainder + // if the trip count is uniform (and we don't have an unnatural heart). + switch (Convergent) { + case LoopConvergenceKind::None: + case LoopConvergenceKind::AnchoredInLoop: + break; // no convergence-related restrictions + case LoopConvergenceKind::Some: UP.AllowRemainder = false; + break; + case LoopConvergenceKind::NonHeaderHeart: + return LoopUnrollResult::Unmodified; + } // Try to find the trip count upper bound if we cannot find the exact trip // count. @@ -1128,12 +1159,21 @@ // Unroll the loop. Loop *RemainderLoop = nullptr; + UnrollLoopOptions ULO; + ULO.Count = UP.Count; + ULO.TripCount = TripCount; + ULO.Force = UP.Force; + ULO.AllowRuntime = UP.Runtime; + ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount; + ULO.PreserveCondBr = UseUpperBound; + ULO.PreserveOnlyFirst = MaxOrZero; + ULO.TripMultiple = TripMultiple; + ULO.PeelCount = PP.PeelCount; + ULO.UnrollRemainder = UP.UnrollRemainder; + ULO.ForgetAllSCEV = ForgetAllSCEV; + ULO.Heart = Heart; LoopUnrollResult UnrollResult = UnrollLoop( - L, - {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, - UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder, - ForgetAllSCEV}, - LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); + L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -409,19 +409,33 @@ } } - // Loops containing convergent instructions must have a count that divides + // Loops containing convergent instructions that are uncontrolled or + // controlled from outside the loop must have a count that divides // their TripMultiple. - LLVM_DEBUG( - { - bool HasConvergent = false; - for (auto &BB : L->blocks()) - for (auto &I : *BB) - if (auto *CB = dyn_cast(&I)) - HasConvergent |= CB->isConvergent(); - assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) && - "Unroll count must divide trip multiple if loop contains a " - "convergent operation."); - }); + LLVM_DEBUG({ + bool HasOutsideConvergenceControl = false; + for (auto &BB : L->blocks()) { + for (auto &I : *BB) { + if (auto *CB = dyn_cast(&I)) { + if (CB->isConvergent()) { + auto control = + CB->getOperandBundle(LLVMContext::OB_convergencectrl); + if (!control) { + HasOutsideConvergenceControl = true; + break; + } + Value *token = control.getValue().Inputs[0].get(); + if (!L->contains(cast(token))) + HasOutsideConvergenceControl = true; + } + } + } + } + assert( + (!HasOutsideConvergenceControl || ULO.TripMultiple % ULO.Count == 0) && + "Unroll count must divide trip multiple if loop contains an " + "outside-controlled convergent operation."); + }); bool EpilogProfitability = UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog @@ -585,6 +599,8 @@ << DIL->getFilename() << " Line: " << DIL->getLine()); } + assert(ULO.Heart == nullptr || ULO.Heart->getParent() == Header); + for (unsigned It = 1; It != ULO.Count; ++It) { SmallVector NewBlocks; SmallDenseMap NewLoops; @@ -602,7 +618,7 @@ if (OldLoop) LoopsToSimplify.insert(NewLoops[OldLoop]); - if (*BB == Header) + if (*BB == Header) { // Loop over all of the PHI nodes in the block, changing them to use // the incoming values from the previous block. for (PHINode *OrigPHI : OrigPHINode) { @@ -615,6 +631,16 @@ New->getInstList().erase(NewPHI); } + // Eliminate copies of the loop heart intrinsic, if any. + if (ULO.Heart) { + auto it = VMap.find(ULO.Heart); + assert(it != VMap.end()); + Instruction *heartCopy = cast(it->second); + heartCopy->eraseFromParent(); + VMap.erase(it); + } + } + // Update our running map of newest clones LastValueMap[*BB] = New; for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -973,14 +973,20 @@ auto UnrollResult = LoopUnrollResult::Unmodified; if (remainderLoop && UnrollRemainder) { LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n"); - UnrollResult = - UnrollLoop(remainderLoop, - {/*Count*/ Count - 1, /*TripCount*/ Count - 1, - /*Force*/ false, /*AllowRuntime*/ false, - /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true, - /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1, - /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV}, - LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA); + UnrollLoopOptions ULO; + ULO.Count = Count - 1; + ULO.TripCount = Count - 1; + ULO.Force = false; + ULO.AllowRuntime = false; + ULO.AllowExpensiveTripCount = false; + ULO.PreserveCondBr = true; + ULO.PreserveOnlyFirst = false; + ULO.TripMultiple = 1; + ULO.PeelCount = 0; + ULO.UnrollRemainder = false; + ULO.ForgetAllSCEV = ForgetAllSCEV; + UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI, + /*ORE*/ nullptr, PreserveLCSSA); } if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled) diff --git a/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll @@ -0,0 +1,413 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s + +declare void @f() convergent +declare void @g() + +; Although this loop contains a convergent instruction, it should be +; fully unrolled. +define i32 @full_unroll() { +; CHECK-LABEL: @full_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 3 + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, but it should be partially +; unrolled. The unroll count is the largest power of 2 that divides the +; multiple -- 4, in this case. +define i32 @runtime_unroll(i32 %n) { +; CHECK-LABEL: @runtime_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 12 +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_3:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[INC]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[INC_1]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_3]] = add nsw i32 [[INC_2]], 1 +; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[EXIT:%.*]], label [[L3]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 12 + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, so its partial unroll +; count must divide its trip multiple. This overrides its unroll +; pragma -- we unroll exactly 8 times, even though 16 is requested. +define i32 @pragma_unroll(i32 %n) { +; CHECK-LABEL: @pragma_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 24 +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop !0 +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1:%.*]] = add nuw nsw i32 [[INC]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_2:%.*]] = add nuw nsw i32 [[INC_1]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_3:%.*]] = add nuw nsw i32 [[INC_2]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_4:%.*]] = add nuw nsw i32 [[INC_3]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_5:%.*]] = add nuw nsw i32 [[INC_4]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_6:%.*]] = add nuw nsw i32 [[INC_5]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_7]] = add nsw i32 [[INC_6]], 1 +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp eq i32 [[INC_7]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !2 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 24 + br label %l3, !llvm.loop !0 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3, !llvm.loop !0 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip count 4. The loop unroll should respect the pragma. +define void @pragma_unroll_divisible_trip_count() { +; CHECK-LABEL: @pragma_unroll_divisible_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop !4 +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1 +; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], 4 +; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !6 +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 4 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret void +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip multiple 2. The loop unroll should respect the pragma. +define i32 @pragma_unroll_divisible_trip_multiple(i32 %n) { +; CHECK-LABEL: @pragma_unroll_divisible_trip_multiple( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 2 +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop !4 +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1]] = add nsw i32 [[INC]], 1 +; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !7 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 2 + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 is unknown to divide runtime trip count, the loop is not unrolled +; since remainder is forbidden for unrolling convergent loop. +define i32 @pragma_unroll_indivisible_runtime_trip_count(i32 %n) { +; CHECK-LABEL: @pragma_unroll_indivisible_runtime_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop !4 +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !4 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %anchor) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 does not divide trip count 5, the loop is not unrolled by 2 +; since remainder is forbidden for unrolling convergent loop. Instead, the +; loop gets fully unrolled. +define i32 @pragma_unroll_indivisible_trip_count() { +; CHECK-LABEL: @pragma_unroll_indivisible_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop !4 +; CHECK: l3: +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 5 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction that is anchored inside the loop +; itself. It is unrolled by 2 with remainder, as requested by the loop metadata. +define i32 @pragma_unroll_with_remainder(i32 %n) { +; CHECK-LABEL: @pragma_unroll_with_remainder( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]] +; CHECK: entry.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop !4 +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[ENTRY_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1 +; CHECK-NEXT: [[TOK_LOOP_1:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP_1]]) ] +; CHECK-NEXT: [[INC_1]] = add nsw i32 [[INC]], 1 +; CHECK-NEXT: [[NITER_NSUB_1]] = sub i32 [[NITER_NSUB]], 1 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i32 [[NITER_NSUB_1]], 0 +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[L3]], !llvm.loop !8 +; CHECK: exit.unr-lcssa.loopexit: +; CHECK-NEXT: [[X_0_UNR_PH:%.*]] = phi i32 [ [[INC_1]], [[L3]] ] +; CHECK-NEXT: br label [[EXIT_UNR_LCSSA]] +; CHECK: exit.unr-lcssa: +; CHECK-NEXT: [[X_0_UNR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[X_0_UNR_PH]], [[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[L3_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: l3.epil.preheader: +; CHECK-NEXT: br label [[L3_EPIL:%.*]] +; CHECK: l3.epil: +; CHECK-NEXT: [[X_0_EPIL:%.*]] = phi i32 [ [[X_0_UNR]], [[L3_EPIL_PREHEADER]] ] +; CHECK-NEXT: [[TOK_LOOP_EPIL:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP_EPIL]]) ] +; CHECK-NEXT: [[INC_EPIL:%.*]] = add nsw i32 [[X_0_EPIL]], 1 +; CHECK-NEXT: [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[INC_EPIL]], [[N]] +; CHECK-NEXT: br label [[EXIT_EPILOG_LCSSA:%.*]] +; CHECK: exit.epilog-lcssa: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; Don't unroll a loop that is extended by convergence controls. +; +; We could theoretically duplicate the extension part, but this is not +; implemented. +define i32 @extended_loop(i32 %n) { +; CHECK-LABEL: @extended_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop !4 +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !4 +; CHECK: exit: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + call void @f() [ "convergencectrl"(token %tok.loop) ] + ret i32 0 +} + +; The heart of this loop does not dominate all latches. It cannot be unrolled. +define i32 @unusual_heart(i32 %v) { +; CHECK-LABEL: @unusual_heart( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[BRANCHCOND:%.*]] = icmp eq i32 [[X_0]], [[V:%.*]] +; CHECK-NEXT: br i1 [[BRANCHCOND]], label [[THEN:%.*]], label [[LATCH]] +; CHECK: then: +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: call void @g() +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ] + %branchcond = icmp eq i32 %x.0, %v + br i1 %branchcond, label %then, label %latch + +then: + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + br label %latch + +latch: + call void @g() + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 3 + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +declare token @llvm.experimental.convergence.anchor() +declare token @llvm.experimental.convergence.loop() + +!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}} +!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}}