diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h
--- a/llvm/include/llvm/Analysis/CodeMetrics.h
+++ b/llvm/include/llvm/Analysis/CodeMetrics.h
@@ -19,6 +19,7 @@
 namespace llvm {
 class AssumptionCache;
 class BasicBlock;
+class Instruction;
 class Loop;
 class Function;
 template <class T> class SmallPtrSetImpl;
@@ -44,6 +45,9 @@
   /// True if this function contains a call to a convergent function.
   bool convergent = false;
 
+  /// True if the code contains an uncontrolled convergent operation.
+  bool convergentUncontrolled = false;
+
   /// True if this function calls alloca (in the C sense).
   bool usesDynamicAlloca = false;
 
@@ -53,6 +57,9 @@
   /// Number of analyzed blocks.
   unsigned NumBlocks = false;
 
+  /// Keeps track of loop heart intrinsics and their convergencectrl token use.
+  std::vector<std::pair<const Instruction *, Value *>> convergenceHearts;
+
   /// Keeps track of basic block code size estimates.
   DenseMap<const BasicBlock *, unsigned> NumBBInsts;
 
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -76,6 +76,7 @@
   unsigned PeelCount;
   bool UnrollRemainder;
   bool ForgetAllSCEV;
+  const Instruction *Heart = nullptr;
 };
 
 LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
@@ -127,8 +128,25 @@
     Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
     Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount);
 
+enum class LoopConvergenceKind {
+  // No convergent operations at all.
+  None,
+
+  // All convergent operations are controlled and anchored inside the loop.
+  AnchoredInLoop,
+
+  // Some convergent operations, unrolling is possible subject to constraints
+  // (no remainder loop).
+  Some,
+
+  // Heart controlling the loop outside of its header block.
+  NonHeaderHeart,
+};
+
 unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
-                             bool &NotDuplicatable, bool &Convergent,
+                             bool &NotDuplicatable,
+                             LoopConvergenceKind &Convergent,
+                             const Instruction *&Heart,
                              const TargetTransformInfo &TTI,
                              const SmallPtrSetImpl<const Value *> &EphValues,
                              unsigned BEInsns);
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "code-metrics"
@@ -156,19 +157,34 @@
     if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
       ++NumVectorInsts;
 
-    if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+    if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) {
+      // TODO: This seems misleading, and leads to potentially too conservative
+      //       behavior for loop unrolling decisions.
       notDuplicatable = true;
+    }
 
-    if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
-      if (CI->cannotDuplicate())
+    if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+      if (CB->cannotDuplicate())
         notDuplicatable = true;
-      if (CI->isConvergent())
+      if (CB->isConvergent()) {
         convergent = true;
-    }
 
-    if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I))
-      if (InvI->cannotDuplicate())
-        notDuplicatable = true;
+        auto *intrinsicInst = dyn_cast<IntrinsicInst>(CB);
+        auto control = CB->getOperandBundle(LLVMContext::OB_convergencectrl);
+        if (intrinsicInst && intrinsicInst->getIntrinsicID() ==
+                                 Intrinsic::experimental_convergence_loop) {
+          assert(control &&
+                 "invalid IR: loop heart without convergencectrl bundle");
+          Value *token = control.getValue().Inputs[0].get();
+          convergenceHearts.emplace_back(intrinsicInst, token);
+        } else if (!control &&
+                   (!intrinsicInst ||
+                    intrinsicInst->getIntrinsicID() !=
+                        Intrinsic::experimental_convergence_anchor)) {
+          convergentUncontrolled = true;
+        }
+      }
+    }
 
     NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
   }
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -322,16 +322,17 @@
   // Approximate the loop size and collect useful info
   unsigned NumInlineCandidates;
   bool NotDuplicatable;
-  bool Convergent;
   SmallPtrSet<const Value *, 32> EphValues;
   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
   Loop *SubLoop = L->getSubLoops()[0];
+  LoopConvergenceKind Convergent;
+  const Instruction *Heart;
   unsigned InnerLoopSize =
       ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
-                          Convergent, TTI, EphValues, UP.BEInsns);
+                          Convergent, Heart, TTI, EphValues, UP.BEInsns);
   unsigned OuterLoopSize =
       ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
-                          TTI, EphValues, UP.BEInsns);
+                          Heart, TTI, EphValues, UP.BEInsns);
   LLVM_DEBUG(dbgs() << "  Outer Loop Size: " << OuterLoopSize << "\n");
   LLVM_DEBUG(dbgs() << "  Inner Loop Size: " << InnerLoopSize << "\n");
   if (NotDuplicatable) {
@@ -343,7 +344,7 @@
     LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
     return LoopUnrollResult::Unmodified;
   }
-  if (Convergent) {
+  if (Convergent != LoopConvergenceKind::None) {
     LLVM_DEBUG(
         dbgs() << "  Not unrolling loop with convergent instructions.\n");
     return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -632,15 +632,42 @@
 
 /// ApproximateLoopSize - Approximate the size of the loop.
 unsigned llvm::ApproximateLoopSize(
-    const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
+    const Loop *L, unsigned &NumCalls, bool &NotDuplicatable,
+    LoopConvergenceKind &Convergent, const Instruction *&Heart,
     const TargetTransformInfo &TTI,
     const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
   CodeMetrics Metrics;
-  for (BasicBlock *BB : L->blocks())
+  bool convergenceControlledByOutside = false;
+  bool nonHeaderHeart = false;
+
+  Heart = nullptr;
+
+  for (BasicBlock *BB : L->blocks()) {
     Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+
+    for (const auto &heart : Metrics.convergenceHearts) {
+      BasicBlock *defBlock = cast<Instruction>(heart.second)->getParent();
+      if (!L->contains(defBlock)) {
+        convergenceControlledByOutside = true;
+        if (BB != L->getHeader())
+          nonHeaderHeart = true;
+        assert(!Heart && "invalid IR: loop has multiple relevant hearts");
+        Heart = heart.first;
+      }
+    }
+    Metrics.convergenceHearts.clear();
+  }
   NumCalls = Metrics.NumInlineCandidates;
   NotDuplicatable = Metrics.notDuplicatable;
-  Convergent = Metrics.convergent;
+
+  if (nonHeaderHeart)
+    Convergent = LoopConvergenceKind::NonHeaderHeart;
+  else if (Metrics.convergentUncontrolled || convergenceControlledByOutside)
+    Convergent = LoopConvergenceKind::Some;
+  else if (Metrics.convergent)
+    Convergent = LoopConvergenceKind::AnchoredInLoop;
+  else
+    Convergent = LoopConvergenceKind::None;
 
   unsigned LoopSize = Metrics.NumInsts;
 
@@ -1036,7 +1063,6 @@
   bool OptForSize = L->getHeader()->getParent()->hasOptSize();
   unsigned NumInlineCandidates;
   bool NotDuplicatable;
-  bool Convergent;
   TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
       L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
       ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
@@ -1053,9 +1079,11 @@
   SmallPtrSet<const Value *, 32> EphValues;
   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
 
+  LoopConvergenceKind Convergent;
+  const Instruction *Heart;
   unsigned LoopSize =
       ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
-                          TTI, EphValues, UP.BEInsns);
+                          Heart, TTI, EphValues, UP.BEInsns);
   LLVM_DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
   if (NotDuplicatable) {
     LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
@@ -1092,15 +1120,18 @@
   // is unsafe -- it adds a control-flow dependency to the convergent
   // operation.  Therefore restrict remainder loop (try unrollig without).
   //
-  // TODO: This is quite conservative.  In practice, convergent_op()
-  // is likely to be called unconditionally in the loop.  In this
-  // case, the program would be ill-formed (on most architectures)
-  // unless n were the same on all threads in a thread group.
-  // Assuming n is the same on all threads, any kind of unrolling is
-  // safe.  But currently llvm's notion of convergence isn't powerful
-  // enough to express this.
-  if (Convergent)
+  // TODO: This is still somewhat conservative, as we could allow the remainder
+  // if the trip count is uniform (and we don't have an unnatural heart).
+  switch (Convergent) {
+  case LoopConvergenceKind::None:
+  case LoopConvergenceKind::AnchoredInLoop:
+    break; // no convergence-related restrictions
+  case LoopConvergenceKind::Some:
     UP.AllowRemainder = false;
+    break;
+  case LoopConvergenceKind::NonHeaderHeart:
+    return LoopUnrollResult::Unmodified;
+  }
 
   // Try to find the trip count upper bound if we cannot find the exact trip
   // count.
@@ -1128,12 +1159,21 @@
 
   // Unroll the loop.
   Loop *RemainderLoop = nullptr;
+  UnrollLoopOptions ULO;
+  ULO.Count = UP.Count;
+  ULO.TripCount = TripCount;
+  ULO.Force = UP.Force;
+  ULO.AllowRuntime = UP.Runtime;
+  ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
+  ULO.PreserveCondBr = UseUpperBound;
+  ULO.PreserveOnlyFirst = MaxOrZero;
+  ULO.TripMultiple = TripMultiple;
+  ULO.PeelCount = PP.PeelCount;
+  ULO.UnrollRemainder = UP.UnrollRemainder;
+  ULO.ForgetAllSCEV = ForgetAllSCEV;
+  ULO.Heart = Heart;
   LoopUnrollResult UnrollResult = UnrollLoop(
-      L,
-      {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
-       UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder,
-       ForgetAllSCEV},
-      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
+      L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
   if (UnrollResult == LoopUnrollResult::Unmodified)
     return LoopUnrollResult::Unmodified;
 
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -409,19 +409,33 @@
     }
   }
 
-  // Loops containing convergent instructions must have a count that divides
+  // Loops containing convergent instructions that are uncontrolled or
+  // controlled from outside the loop must have a count that divides
   // their TripMultiple.
-  LLVM_DEBUG(
-      {
-        bool HasConvergent = false;
-        for (auto &BB : L->blocks())
-          for (auto &I : *BB)
-            if (auto *CB = dyn_cast<CallBase>(&I))
-              HasConvergent |= CB->isConvergent();
-        assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
-               "Unroll count must divide trip multiple if loop contains a "
-               "convergent operation.");
-      });
+  LLVM_DEBUG({
+    bool HasOutsideConvergenceControl = false;
+    for (auto &BB : L->blocks()) {
+      for (auto &I : *BB) {
+        if (auto *CB = dyn_cast<CallBase>(&I)) {
+          if (CB->isConvergent()) {
+            auto control =
+                CB->getOperandBundle(LLVMContext::OB_convergencectrl);
+            if (!control) {
+              HasOutsideConvergenceControl = true;
+              break;
+            }
+            Value *token = control.getValue().Inputs[0].get();
+            if (!L->contains(cast<Instruction>(token)))
+              HasOutsideConvergenceControl = true;
+          }
+        }
+      }
+    }
+    assert(
+        (!HasOutsideConvergenceControl || ULO.TripMultiple % ULO.Count == 0) &&
+        "Unroll count must divide trip multiple if loop contains an "
+        "outside-controlled convergent operation.");
+  });
 
   bool EpilogProfitability =
       UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
@@ -585,6 +599,8 @@
                          << DIL->getFilename() << " Line: " << DIL->getLine());
           }
 
+  assert(ULO.Heart == nullptr || ULO.Heart->getParent() == Header);
+
   for (unsigned It = 1; It != ULO.Count; ++It) {
     SmallVector<BasicBlock *, 8> NewBlocks;
     SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -602,7 +618,7 @@
       if (OldLoop)
         LoopsToSimplify.insert(NewLoops[OldLoop]);
 
-      if (*BB == Header)
+      if (*BB == Header) {
         // Loop over all of the PHI nodes in the block, changing them to use
         // the incoming values from the previous block.
         for (PHINode *OrigPHI : OrigPHINode) {
@@ -615,6 +631,16 @@
           New->getInstList().erase(NewPHI);
         }
 
+        // Eliminate copies of the loop heart intrinsic, if any.
+        if (ULO.Heart) {
+          auto it = VMap.find(ULO.Heart);
+          assert(it != VMap.end());
+          Instruction *heartCopy = cast<Instruction>(it->second);
+          heartCopy->eraseFromParent();
+          VMap.erase(it);
+        }
+      }
+
       // Update our running map of newest clones
       LastValueMap[*BB] = New;
       for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -973,14 +973,20 @@
   auto UnrollResult = LoopUnrollResult::Unmodified;
   if (remainderLoop && UnrollRemainder) {
     LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
-    UnrollResult =
-        UnrollLoop(remainderLoop,
-                   {/*Count*/ Count - 1, /*TripCount*/ Count - 1,
-                    /*Force*/ false, /*AllowRuntime*/ false,
-                    /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
-                    /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
-                    /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
-                   LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
+    UnrollLoopOptions ULO;
+    ULO.Count = Count - 1;
+    ULO.TripCount = Count - 1;
+    ULO.Force = false;
+    ULO.AllowRuntime = false;
+    ULO.AllowExpensiveTripCount = false;
+    ULO.PreserveCondBr = true;
+    ULO.PreserveOnlyFirst = false;
+    ULO.TripMultiple = 1;
+    ULO.PeelCount = 0;
+    ULO.UnrollRemainder = false;
+    ULO.ForgetAllSCEV = ForgetAllSCEV;
+    UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI,
+                              /*ORE*/ nullptr, PreserveLCSSA);
   }
 
   if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
diff --git a/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll
@@ -0,0 +1,413 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s
+
+declare void @f() convergent
+declare void @g()
+
+; Although this loop contains a convergent instruction, it should be
+; fully unrolled.
+define i32 @full_unroll() {
+; CHECK-LABEL: @full_unroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 3
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction, but it should be partially
+; unrolled.  The unroll count is the largest power of 2 that divides the
+; multiple -- 4, in this case.
+define i32 @runtime_unroll(i32 %n) {
+; CHECK-LABEL: @runtime_unroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 12
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_3:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_1:%.*]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_2:%.*]] = add nuw nsw i32 [[INC_1]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_3]] = add nsw i32 [[INC_2]], 1
+; CHECK-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[LOOP_CTL]]
+; CHECK-NEXT:    br i1 [[EXITCOND_3]], label [[EXIT:%.*]], label [[L3]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  %loop_ctl = mul nsw i32 %n, 12
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction, so its partial unroll
+; count must divide its trip multiple.  This overrides its unroll
+; pragma -- we unroll exactly 8 times, even though 16 is requested.
+define i32 @pragma_unroll(i32 %n) {
+; CHECK-LABEL: @pragma_unroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 24
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop !0
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_1:%.*]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_2:%.*]] = add nuw nsw i32 [[INC_1]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_3:%.*]] = add nuw nsw i32 [[INC_2]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_4:%.*]] = add nuw nsw i32 [[INC_3]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_5:%.*]] = add nuw nsw i32 [[INC_4]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_6:%.*]] = add nuw nsw i32 [[INC_5]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_7]] = add nsw i32 [[INC_6]], 1
+; CHECK-NEXT:    [[EXITCOND_7:%.*]] = icmp eq i32 [[INC_7]], [[LOOP_CTL]]
+; CHECK-NEXT:    br i1 [[EXITCOND_7]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !2
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  %loop_ctl = mul nsw i32 %n, 24
+  br label %l3, !llvm.loop !0
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !0
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 divides trip count 4. The loop unroll should respect the pragma.
+define void @pragma_unroll_divisible_trip_count() {
+; CHECK-LABEL: @pragma_unroll_divisible_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop !4
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], 4
+; CHECK-NEXT:    br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !6
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 divides trip multiple 2. The loop unroll should respect the pragma.
+define i32 @pragma_unroll_divisible_trip_multiple(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_divisible_trip_multiple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 2
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop !4
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_1]] = add nsw i32 [[INC]], 1
+; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], [[LOOP_CTL]]
+; CHECK-NEXT:    br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !7
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  %loop_ctl = mul nsw i32 %n, 2
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 is unknown to divide runtime trip count, the loop is not unrolled
+; since remainder is forbidden for unrolling convergent loop.
+define i32 @pragma_unroll_indivisible_runtime_trip_count(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_indivisible_runtime_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop !4
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !4
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %anchor) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 does not divide trip count 5, the loop is not unrolled by 2
+; since remainder is forbidden for unrolling convergent loop. Instead, the
+; loop gets fully unrolled.
+define i32 @pragma_unroll_indivisible_trip_count() {
+; CHECK-LABEL: @pragma_unroll_indivisible_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop !4
+; CHECK:       l3:
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 5
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction that is anchored inside the loop
+; itself. It is unrolled by 2 with remainder, as requested by the loop metadata.
+define i32 @pragma_unroll_with_remainder(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_with_remainder(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i32 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK:       entry.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i32 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop !4
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[ENTRY_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[X_0]], 1
+; CHECK-NEXT:    [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
+; CHECK-NEXT:    [[TOK_LOOP_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP_1]]) ]
+; CHECK-NEXT:    [[INC_1]] = add nsw i32 [[INC]], 1
+; CHECK-NEXT:    [[NITER_NSUB_1]] = sub i32 [[NITER_NSUB]], 1
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i32 [[NITER_NSUB_1]], 0
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[L3]], !llvm.loop !8
+; CHECK:       exit.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[X_0_UNR_PH:%.*]] = phi i32 [ [[INC_1]], [[L3]] ]
+; CHECK-NEXT:    br label [[EXIT_UNR_LCSSA]]
+; CHECK:       exit.unr-lcssa:
+; CHECK-NEXT:    [[X_0_UNR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[X_0_UNR_PH]], [[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[L3_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       l3.epil.preheader:
+; CHECK-NEXT:    br label [[L3_EPIL:%.*]]
+; CHECK:       l3.epil:
+; CHECK-NEXT:    [[X_0_EPIL:%.*]] = phi i32 [ [[X_0_UNR]], [[L3_EPIL_PREHEADER]] ]
+; CHECK-NEXT:    [[TOK_LOOP_EPIL:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP_EPIL]]) ]
+; CHECK-NEXT:    [[INC_EPIL:%.*]] = add nsw i32 [[X_0_EPIL]], 1
+; CHECK-NEXT:    [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[INC_EPIL]], [[N]]
+; CHECK-NEXT:    br label [[EXIT_EPILOG_LCSSA:%.*]]
+; CHECK:       exit.epilog-lcssa:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; Don't unroll a loop that is extended by convergence controls.
+;
+; We could theoretically duplicate the extension part, but this is not
+; implemented.
+define i32 @extended_loop(i32 %n) {
+; CHECK-LABEL: @extended_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop !4
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop !4
+; CHECK:       exit:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  ret i32 0
+}
+
+; The heart of this loop does not dominate all latches. It cannot be unrolled.
+define i32 @unusual_heart(i32 %v) {
+; CHECK-LABEL: @unusual_heart(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[BRANCHCOND:%.*]] = icmp eq i32 [[X_0]], [[V:%.*]]
+; CHECK-NEXT:    br i1 [[BRANCHCOND]], label [[THEN:%.*]], label [[LATCH]]
+; CHECK:       then:
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    call void @g()
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  %branchcond = icmp eq i32 %x.0, %v
+  br i1 %branchcond, label %then, label %latch
+
+then:
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  br label %latch
+
+latch:
+  call void @g()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 3
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+declare token @llvm.experimental.convergence.anchor()
+declare token @llvm.experimental.convergence.loop()
+
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
+!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}}