Index: include/llvm/Analysis/LoopInfo.h
===================================================================
--- include/llvm/Analysis/LoopInfo.h
+++ include/llvm/Analysis/LoopInfo.h
@@ -546,6 +546,7 @@
   /// from being unrolled more than is directed by a pragma if the loop
   /// unrolling pass is run more than once (which it generally is).
   void setLoopAlreadyUnrolled();
+  void setLoopAlreadyFlattened();
 
   /// Return true if no exit block for the loop has a predecessor that is
   /// outside the loop.
Index: lib/Analysis/LoopInfo.cpp
===================================================================
--- lib/Analysis/LoopInfo.cpp
+++ lib/Analysis/LoopInfo.cpp
@@ -301,6 +301,41 @@
   setLoopID(NewLoopID);
 }
 
+void Loop::setLoopAlreadyFlattened() {
+  MDNode *LoopID = getLoopID();
+  // First remove any existing loop flattening metadata.
+  SmallVector<Metadata *, 4> MDs;
+  // Reserve first location for self reference to the LoopID metadata node.
+  MDs.push_back(nullptr);
+
+  if (LoopID) {
+    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+      bool IsFlattenMetadata = false;
+      MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+      if (MD) {
+        const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+        IsFlattenMetadata =
+            S && S->getString().startswith("llvm.loop.flatten.");
+      }
+      if (!IsFlattenMetadata)
+        MDs.push_back(LoopID->getOperand(i));
+    }
+  }
+
+  // Add flatten(disable) metadata to disable future flattening.
+  LLVMContext &Context = getHeader()->getContext();
+  SmallVector<Metadata *, 1> DisableOperands;
+  DisableOperands.push_back(
+      MDString::get(Context, "llvm.loop.flatten.disable"));
+  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+  MDs.push_back(DisableNode);
+
+  MDNode *NewLoopID = MDNode::get(Context, MDs);
+  // Set operand 0 to refer to the loop id itself.
+  NewLoopID->replaceOperandWith(0, NewLoopID);
+  setLoopID(NewLoopID);
+}
+
 bool Loop::isAnnotatedParallel() const {
   MDNode *DesiredLoopIdMetadata = getLoopID();
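Note: any flatten-related entries already present on the loop ID are dropped and replaced. The metadata this leaves on the fallback loop's latch branch looks roughly like the following hand-written sketch (the !0/!1 numbering is illustrative, and self-referential loop IDs print as distinct nodes):

    br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
    ...
    !0 = distinct !{!0, !1}
    !1 = !{!"llvm.loop.flatten.disable"}
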
Index: lib/Transforms/Scalar/LoopFlatten.cpp
===================================================================
--- lib/Transforms/Scalar/LoopFlatten.cpp
+++ lib/Transforms/Scalar/LoopFlatten.cpp
@@ -45,6 +45,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
 
 #define DEBUG_TYPE "loop-flatten"
 
@@ -62,6 +64,12 @@
                  cl::desc("Assume that the product of the two iteration "
                           "limits will never overflow"));
 
+static MDNode *GetMetadataForLoop(const Loop *L, StringRef Name) {
+  if (MDNode *LoopID = L->getLoopID())
+    return GetUnrollMetadata(LoopID, Name);
+  return nullptr;
+}
+
 // Finds the induction variable, increment and limit for a simple loop that we
 // can flatten.
 static bool findLoopComponents(
@@ -395,7 +403,8 @@
 
 static bool FlattenLoopPair(Loop *OuterLoop, Loop *InnerLoop, DominatorTree *DT,
                             LoopInfo *LI, ScalarEvolution *SE,
-                            AssumptionCache *AC, TargetTransformInfo *TTI,
+                            const LoopAccessInfo *LAI, AssumptionCache *AC,
+                            TargetTransformInfo *TTI,
                             std::function<void(Loop *)> markLoopAsDeleted) {
   Function *F = OuterLoop->getHeader()->getParent();
 
@@ -458,7 +467,6 @@
   // Check if the new iteration variable might overflow. In this case, we
   // need to version the loop, and select the original version at runtime if
   // the iteration space is too large.
-  // TODO: We currently don't version the loop.
   // TODO: it might be worth using a wider iteration variable rather than
   // versioning the loop, if a wide enough type is legal.
   bool MustVersionLoop = true;
@@ -468,14 +476,15 @@
     DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
     return false;
   } else if (OR == OverflowResult::MayOverflow) {
-    DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+    DEBUG(dbgs() << "Multiply might overflow, versioning loop\n");
   } else {
     DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
     MustVersionLoop = false;
   }
 
-  // We cannot safely flatten the loop. Exit now.
-  if (MustVersionLoop)
+  // Flattening only saves code size if we don't need to version the loop,
+  // so don't do it when optimising for size.
+  if (F->optForSize() && MustVersionLoop)
     return false;
 
   // Do the actual transformation.
@@ -485,15 +494,42 @@
     OptimizationRemark Remark(DEBUG_TYPE, "Flattened", InnerLoop->getStartLoc(),
                               InnerLoop->getHeader());
     OptimizationRemarkEmitter ORE(F);
-    Remark << "Flattened into outer loop";
+    if (!MustVersionLoop)
+      Remark << "Flattened into outer loop in-place";
+    else
+      Remark << "Flattened into outer loop, leaving original version when "
+                "overflow occurs";
     ORE.emit(Remark);
   }
 
-  Value *NewTripCount =
-      BinaryOperator::CreateMul(InnerLimit, OuterLimit, "flatten.tripcount",
-                                OuterLoop->getLoopPreheader()->getTerminator());
-  DEBUG(dbgs() << "Created new trip count in preheader: ";
-        NewTripCount->dump());
+  // TODO: Widen IV type if legal to prevent overflow.
+  Type *NewIVType = InnerInductionPHI->getType();
+  Value *NewTripCount;
+  Loop *FallbackLoop = nullptr;
+  if (MustVersionLoop) {
+    IRBuilder<> Builder(OuterLoop->getLoopPreheader()->getTerminator());
+    // FIXME: signedness of this check?
+    Value *M = Intrinsic::getDeclaration(
+        F->getParent(), Intrinsic::umul_with_overflow, NewIVType);
+    CallInst *Call = Builder.CreateCall(M, {InnerLimit, OuterLimit}, "limit");
+    Value *OverflowBit = Builder.CreateExtractValue(Call, 1, "overflow");
+    NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount");
+
+    LoopVersioning LV(*LAI, OuterLoop, LI, DT, SE, false);
+    SCEVUnionPredicate Pred;
+    Pred.add(
+        SE->getEqualPredicate(SE->getSCEV(OverflowBit),
+                              SE->getZero(Type::getInt1Ty(M->getContext()))));
+    LV.setSCEVChecks(Pred);
+    LV.versionLoop();
+    FallbackLoop = LV.getNonVersionedLoop();
+  } else {
+    NewTripCount = BinaryOperator::CreateMul(
+        InnerLimit, OuterLimit, "flatten.tripcount",
+        OuterLoop->getLoopPreheader()->getTerminator());
+    DEBUG(dbgs() << "Created new trip count in preheader: ";
+          NewTripCount->dump());
+  }
 
   // Fix up PHI nodes that take values from the inner loop back-edge, which
   // we are about to remove.
@@ -517,6 +553,12 @@
   for (Value *V : LinearIVUses)
     V->replaceAllUsesWith(OuterInductionPHI);
 
+  // If we made a fallback copy of the loop, it will still be flattenable if
+  // this pass is run again, but that wouldn't be profitable, so disable
+  // flattening of that loop.
+  if (FallbackLoop)
+    FallbackLoop->setLoopAlreadyFlattened();
+
   // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
   // deleted, and any information that we have about the outer loop invalidated.
   markLoopAsDeleted(InnerLoop);
@@ -530,16 +572,21 @@
 PreservedAnalyses LoopFlattenPass::run(Loop &L, LoopAnalysisManager &AM,
                                        LoopStandardAnalysisResults &AR,
                                        LPMUpdater &Updater) {
-  if (L.getSubLoops().size() != 1)
+  if (L.getSubLoops().size() != 1 ||
+      GetMetadataForLoop(&L, "llvm.loop.flatten.disable"))
     return PreservedAnalyses::all();
 
+  const LoopAccessInfo *LAI = &AM.getResult<LoopAccessAnalysis>(L, AR);
+
   Loop *InnerLoop = *L.begin();
   std::string LoopName = InnerLoop->getName();
 
   if (!FlattenLoopPair(
-          &L, InnerLoop, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI,
+          &L, InnerLoop, &AR.DT, &AR.LI, &AR.SE, LAI, &AR.AC, &AR.TTI,
           [&](Loop *L) { Updater.markLoopAsDeleted(*L, LoopName); }))
     return PreservedAnalyses::all();
-  return getLoopPassPreservedAnalyses();
+  PreservedAnalyses PA = getLoopPassPreservedAnalyses();
+  PA.preserve<LoopAccessAnalysis>();
+  return PA;
 }
 
 namespace {
@@ -559,6 +606,8 @@
     AU.addPreserved<TargetTransformInfoWrapperPass>();
     AU.addRequired<AssumptionCacheTracker>();
     AU.addPreserved<AssumptionCacheTracker>();
+    AU.addRequired<LoopAccessLegacyAnalysis>();
+    AU.addPreserved<LoopAccessLegacyAnalysis>();
   }
 };
 } // namespace
@@ -568,6 +617,7 @@
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
                     false, false)
@@ -578,7 +628,8 @@
   if (skipLoop(L))
     return false;
 
-  if (L->getSubLoops().size() != 1)
+  if (L->getSubLoops().size() != 1 ||
+      GetMetadataForLoop(L, "llvm.loop.flatten.disable"))
     return false;
 
   ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -590,8 +641,10 @@
   AssumptionCache *AC =
       &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
           *L->getHeader()->getParent());
+  const LoopAccessInfo *LAI =
+      &getAnalysis<LoopAccessLegacyAnalysis>().getInfo(L);
 
   Loop *InnerLoop = *L->begin();
-  return FlattenLoopPair(L, InnerLoop, DT, LI, SE, AC, TTI,
+  return FlattenLoopPair(L, InnerLoop, DT, LI, SE, LAI, AC, TTI,
                          [&](Loop *L) { LPM.markLoopAsDeleted(*L); });
 }
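Note: at the source level, the new versioning path corresponds to the following sketch (illustrative C++ only; f, N and M are stand-ins, and __builtin_umul_overflow plays the role of the llvm.umul.with.overflow intrinsic the pass emits):

    // Before: a 2-deep nest whose inner IV uses are linear in i*M+j.
    for (unsigned i = 0; i < N; ++i)
      for (unsigned j = 0; j < M; ++j)
        f(i * M + j);

    // After: if N*M fits in the IV type, run the flattened loop; otherwise
    // fall back to the original nest, which is marked with
    // llvm.loop.flatten.disable so a later run does not flatten it again.
    unsigned Limit;
    if (!__builtin_umul_overflow(N, M, &Limit)) {
      for (unsigned k = 0; k < Limit; ++k)
        f(k); // linear uses of i*M+j are rewritten to the new IV
    } else {
      for (unsigned i = 0; i < N; ++i)
        for (unsigned j = 0; j < M; ++j)
          f(i * M + j);
    }
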
Index: lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- lib/Transforms/Utils/CloneFunction.cpp
+++ lib/Transforms/Utils/CloneFunction.cpp
@@ -728,6 +728,18 @@
                    RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 }
 
+static void cloneLoopStructure(Loop *OrigLoop, Loop *ParentLoop, LoopInfo *LI,
+                               DenseMap<Loop *, Loop *> &LoopMap) {
+  Loop *NewLoop = LI->AllocateLoop();
+  LoopMap[OrigLoop] = NewLoop;
+  if (ParentLoop)
+    ParentLoop->addChildLoop(NewLoop);
+  else
+    LI->addTopLevelLoop(NewLoop);
+  for (Loop *Child : *OrigLoop)
+    cloneLoopStructure(Child, NewLoop, LI, LoopMap);
+}
+
 /// \brief Clones a loop \p OrigLoop.  Returns the loop and the blocks in \p
 /// Blocks.
 ///
@@ -738,16 +750,11 @@
                              const Twine &NameSuffix, LoopInfo *LI,
                              DominatorTree *DT,
                              SmallVectorImpl<BasicBlock *> &Blocks) {
-  assert(OrigLoop->getSubLoops().empty() &&
-         "Loop to be cloned cannot have inner loop");
   Function *F = OrigLoop->getHeader()->getParent();
   Loop *ParentLoop = OrigLoop->getParentLoop();
 
-  Loop *NewLoop = LI->AllocateLoop();
-  if (ParentLoop)
-    ParentLoop->addChildLoop(NewLoop);
-  else
-    LI->addTopLevelLoop(NewLoop);
+  DenseMap<Loop *, Loop *> LoopMap;
+  cloneLoopStructure(OrigLoop, ParentLoop, LI, LoopMap);
 
   BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
   assert(OrigPH && "No preheader");
@@ -768,6 +775,8 @@
     VMap[BB] = NewBB;
 
     // Update LoopInfo.
+    Loop *NewLoop = LoopMap[LI->getLoopFor(BB)];
+    assert(NewLoop);
     NewLoop->addBasicBlockToLoop(NewBB, *LI);
 
     // Add DominatorTree node. After seeing all blocks, update to correct IDom.
@@ -787,9 +796,10 @@
   F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
                                 NewPH);
   F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
-                                NewLoop->getHeader()->getIterator(), F->end());
+                                LoopMap[OrigLoop]->getHeader()->getIterator(),
+                                F->end());
 
-  return NewLoop;
+  return LoopMap[OrigLoop];
 }
 
 /// \brief Duplicate non-Phi instructions from the beginning of block up to
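Note: cloneLoopWithPreheader() previously asserted that the loop had no sub-loops; LoopFlatten versions the outer loop of a nest, so the clone must now reproduce the whole loop tree. A minimal stand-alone model of the recursion (a toy loop type, not llvm::Loop):

    #include <map>
    #include <vector>

    struct ToyLoop { std::vector<ToyLoop *> Children; };

    // Mirror of cloneLoopStructure: allocate a clone of L, attach it under
    // Parent (or treat it as a new root), then recurse into sub-loops. The
    // Orig->Clone map lets block insertion find the right new loop later.
    // (Leaks are ignored; this is a sketch of the traversal only.)
    ToyLoop *cloneStructure(const ToyLoop *L, ToyLoop *Parent,
                            std::map<const ToyLoop *, ToyLoop *> &Map) {
      ToyLoop *Clone = new ToyLoop;
      Map[L] = Clone;
      if (Parent)
        Parent->Children.push_back(Clone);
      for (const ToyLoop *Child : L->Children)
        cloneStructure(Child, Clone, Map);
      return Clone;
    }
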
Index: lib/Transforms/Utils/LoopVersioning.cpp
===================================================================
--- lib/Transforms/Utils/LoopVersioning.cpp
+++ lib/Transforms/Utils/LoopVersioning.cpp
@@ -64,11 +64,10 @@
   std::tie(FirstCheckInst, MemRuntimeCheck) =
       LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks);
 
-  const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate();
   SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
                    "scev.check");
   SCEVRuntimeCheck =
-      Exp.expandCodeForPredicate(&Pred, RuntimeCheckBB->getTerminator());
+      Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator());
   auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
 
   // Discard the SCEV runtime check if it is always true.
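Note: putting the pieces together, the check block that versioning creates computes both the flattened trip count and the overflow bit from a single intrinsic call, roughly as in this hand-written IR sketch (%InnerLimit and %OuterLimit are stand-ins; the value names match the "limit", "overflow" and "flatten.tripcount" strings used in FlattenLoopPair). The tests below check for this shape:

    lver.check:
      %limit = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %InnerLimit, i32 %OuterLimit)
      %overflow = extractvalue { i32, i1 } %limit, 1
      %flatten.tripcount = extractvalue { i32, i1 } %limit, 0
      ; branch expanded from the SCEV predicate "%overflow == 0"
      br i1 %overflow, label %loop.ph.lver.orig, label %loop.ph
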
Index: test/Transforms/LoopFlatten/loop-flatten.ll
===================================================================
--- test/Transforms/LoopFlatten/loop-flatten.ll
+++ test/Transforms/LoopFlatten/loop-flatten.ll
@@ -529,5 +529,256 @@
   ret i16 %ret.0.lcssa
 }
 
+; CHECK-LABEL: test8
+; Versioned loop
+define void @test8(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+  %cmp25 = icmp sgt i32 %N, 0
+  br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+; Entry block still contains the zero-iteration check
+; CHECK: entry:
+; CHECK: %[[ZERO_CHECK:.*]] = icmp sgt i32 %N, 0
+; CHECK: br i1 %[[ZERO_CHECK]], label %for.body4.lr.ph.lver.check, label %for.cond.cleanup
+
+; Loop versioning check block
+; CHECK: for.body4.lr.ph.lver.check:
+; CHECK: call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %N, i32 %N)
+; CHECK: extractvalue
+; CHECK: br i1 %{{.*}}, label %for.body4.lr.ph.ph.lver.orig, label %for.body4.lr.ph.ph
+
+; Pre-header for the original loop
+; CHECK: for.body4.lr.ph.ph.lver.orig:
+; CHECK: br label %for.body4.lr.ph.lver.orig
+
+
+; Original loop kept as-is (just with blocks renamed)
+for.body4.lr.ph:
+  %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 0, %entry ]
+  %mul = mul nsw i32 %i.026, %N
+  br label %for.body4
+; CHECK: for.body4.lr.ph.lver.orig:
+; CHECK: br label %for.body4.lver.orig
+
+for.body4:
+  %j.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+  %add = add nsw i32 %j.024, %mul
+  %use = add i32 %add, 10
+  %inc = add nuw nsw i32 %j.024, 1
+  %exitcond = icmp ne i32 %inc, %N
+  br i1 %exitcond, label %for.body4, label %for.cond.cleanup3
+; CHECK: for.body4.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.body4.lver.orig, label %for.cond.cleanup3.lver.orig
+
+for.cond.cleanup3:
+  %inc10 = add nuw nsw i32 %i.026, 1
+  %exitcond27 = icmp ne i32 %inc10, %N
+  br i1 %exitcond27, label %for.body4.lr.ph, label %for.cond.cleanup
+; CHECK: for.cond.cleanup3.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.body4.lr.ph.lver.orig, label %for.cond.cleanup.loopexit
+
+
+; New, flattened loop
+; Pre-header
+; CHECK: for.body4.lr.ph.ph:
+; CHECK: br label %for.body4.lr.ph
+
+; Header
+; The next 3 blocks now have straight-line control flow, and will get merged by SimplifyCFG
+; CHECK: for.body4.lr.ph:
+; CHECK: %[[OUTER_IV:.*]] = phi i32
+; CHECK: br label %for.body4
+
+; (Former) inner loop body
+; CHECK: for.body4:
+; Operands of %use have been replaced with the (formerly) outer iteration variable
+; CHECK: add i32 %[[OUTER_IV]], 10
+; Branch to tail portion of outer loop body is now unconditional
+; CHECK: br label %for.cond.cleanup3
+
+; Latch block
+; CHECK: for.cond.cleanup3:
+; CHECK: br i1 %exitcond27, label %for.body4.lr.ph, label %for.cond.cleanup.loopexit
+
+
+; Exit block, shared between both loop versions
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK: br label %for.cond.cleanup
+
+; Function exit block (no change)
+for.cond.cleanup:
+  ret void
+; CHECK: for.cond.cleanup:
+; CHECK: ret void
+}
+
+; As above, but with PHI operands re-ordered
+define void @test9(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+; CHECK-LABEL: test9
+entry:
+  %cmp25 = icmp sgt i32 %N, 0
+  br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+; Entry block still contains the zero-iteration check
+; CHECK: entry:
+; CHECK: %[[ZERO_CHECK:.*]] = icmp sgt i32 %N, 0
+; CHECK: br i1 %[[ZERO_CHECK]], label %for.body4.lr.ph.lver.check, label %for.cond.cleanup
+
+; Loop versioning check block
+; CHECK: for.body4.lr.ph.lver.check:
+; CHECK: call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %N, i32 %N)
+; CHECK: extractvalue
+; CHECK: br i1 %{{.*}}, label %for.body4.lr.ph.ph.lver.orig, label %for.body4.lr.ph.ph
+
+; Pre-header for the original loop
+; CHECK: for.body4.lr.ph.ph.lver.orig:
+; CHECK: br label %for.body4.lr.ph.lver.orig
+
+
+; Original loop kept as-is (just with blocks renamed)
+for.body4.lr.ph:
+  %i.026 = phi i32 [ 0, %entry ], [ %inc10, %for.cond.cleanup3 ]
+  %mul = mul nsw i32 %i.026, %N
+  br label %for.body4
+; CHECK: for.body4.lr.ph.lver.orig:
+; CHECK: br label %for.body4.lver.orig
+
+for.body4:
+  %j.024 = phi i32 [ %inc, %for.body4 ], [ 0, %for.body4.lr.ph ]
+  %add = add nsw i32 %j.024, %mul
+  %use = add i32 %add, 10
+  %inc = add nuw nsw i32 %j.024, 1
+  %exitcond = icmp ne i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+; CHECK: for.body4.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.cond.cleanup3.lver.orig, label %for.body4.lver.orig
+
+for.cond.cleanup3:
+  %inc10 = add nuw nsw i32 %i.026, 1
+  %exitcond27 = icmp ne i32 %inc10, %N
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
+; CHECK: for.cond.cleanup3.lver.orig:
+; CHECK: br i1 %{{.*}}, label %for.cond.cleanup.loopexit, label %for.body4.lr.ph.lver.orig
+
+
+; New, flattened loop
+; Pre-header
+; CHECK: for.body4.lr.ph.ph:
+; CHECK: br label %for.body4.lr.ph
+
+; Header
+; CHECK: for.body4.lr.ph:
+; CHECK: %[[OUTER_IV:.*]] = phi i32
+; CHECK: br label %for.body4
+
+; (Former) inner loop body
+; CHECK: for.body4:
+; CHECK: add i32 %[[OUTER_IV]], 10
+; CHECK: br label %for.cond.cleanup3
+
+; Latch block
+; CHECK: for.cond.cleanup3:
+; CHECK: br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body4.lr.ph
+
+
+; Exit block, shared between both loop versions
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK: br label %for.cond.cleanup
+
+; Function exit block (no change)
+for.cond.cleanup:
+  ret void
+; CHECK: for.cond.cleanup:
+; CHECK: ret void
+}
+
+; The two loops have different bounds
+define void @test10(i32 %N, i32 %M, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+; CHECK-LABEL: test10
+entry:
+  %cmp24 = icmp eq i32 %N, 0
+  br i1 %cmp24, label %for.cond.cleanup, label %for.body.lr.ph
+; CHECK: entry:
+; CHECK: br i1 {{.*}}, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp222 = icmp eq i32 %M, 0
+  br i1 %cmp222, label %for.body.lr.ph.split, label %for.body.us
+; CHECK: for.body.lr.ph:
+; CHECK: br i1 {{.*}}, label %for.body.lr.ph.split, label %for.body.us.lver.check
+
+; Overflow check block
+; CHECK: for.body.us.lver.check:
+; CHECK: call { i32, i1 } @llvm.umul.with.overflow.i32
+; CHECK: br i1 %{{.*}}, label %for.body.us.ph.lver.orig, label %for.body.us.ph
+
+; Preheader for original outer loop
+; CHECK: for.body.us.ph.lver.orig:
+; CHECK: br label %for.body.us.lver.orig
+
+; Original loop
+for.body.us:                                      ; preds = %for.body.lr.ph, %for.cond1.for.cond.cleanup3_crit_edge.us
+  %i.025.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ]
+  %mul.us = mul i32 %i.025.us, %M
+  br label %for.body4.us
+; CHECK: for.body.us.lver.orig:
+; CHECK: br label %for.body4.us.lver.orig
+
+for.body4.us:                                     ; preds = %for.body.us, %for.body4.us
+  %j.023.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add i32 %j.023.us, %mul.us
+  %arrayidx.us = getelementptr i32, i32* %A, i32 %add.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %mul5.us = mul nsw i32 %0, %scale
+  %arrayidx8.us = getelementptr i32, i32* %C, i32 %add.us
+  store i32 %mul5.us, i32* %arrayidx8.us, align 4
+  %inc.us = add nuw nsw i32 %j.023.us, 1
+  %exitcond = icmp ne i32 %inc.us, %M
+  br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK: for.body4.us.lver.orig:
+; CHECK: br i1 %exitcond.lver.orig, label %for.body4.us.lver.orig, label %for.cond1.for.cond.cleanup3_crit_edge.us.lver.orig
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
+  %inc10.us = add nuw nsw i32 %i.025.us, 1
+  %exitcond27 = icmp ne i32 %inc10.us, %N
+  br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.lver.orig:
+; CHECK: br i1 %exitcond27.lver.orig, label %for.body.us.lver.orig, label %for.cond.cleanup.loopexit
+
+; New loop preheader
+; CHECK: for.body.us.ph:
+; CHECK: br label %for.body.us
+
+; Body of new loop
+; CHECK: for.body.us:
+; CHECK: %[[OUTER_IV:.*]] = phi i32
+; CHECK: br label %for.body4.us
+; CHECK: for.body4.us:
+; CHECK: getelementptr i32, i32* %A, i32 %[[OUTER_IV]]
+; CHECK: getelementptr i32, i32* %C, i32 %[[OUTER_IV]]
+; CHECK: br label %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
+; CHECK: br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+
+
+for.body.lr.ph.split:                             ; preds = %for.body.lr.ph
+  br label %for.cond.cleanup.loopexit26
+; CHECK: for.body.lr.ph.split:
+; CHECK: br label %for.cond.cleanup.loopexit26
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us
+  br label %for.cond.cleanup
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK: br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit26:                      ; preds = %for.body.lr.ph.split
+  br label %for.cond.cleanup
+; CHECK: for.cond.cleanup.loopexit26:
+; CHECK: br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit26, %for.cond.cleanup.loopexit, %entry
+  ret void
+; CHECK: for.cond.cleanup:
+; CHECK: ret void
+}
+
 declare i32 @func(i32)