diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -106,6 +106,20 @@
     cl::desc("The maximum number of SCEV checks allowed for Loop "
              "Distribution"));
 
+static cl::opt<unsigned> DistributeRuntimePointerChecksThreshold(
+    "loop-distribute-runtime-check-threshold", cl::init(32), cl::Hidden,
+    cl::desc("The maximum number of runtime pointer aliasing checks for Loop "
+             "Distribution"));
+
+static cl::opt<bool> DistributeVectorizableLoops(
+    "loop-distribute-vectorizable-loops", cl::init(true), cl::Hidden,
+    cl::desc("Consider vectorizable loops for loop distribution."));
+
+static cl::opt<bool> DistributeMergeVectorizablePartitions(
+    "loop-distribute-merge-vectorizable-partitions", cl::init(true), cl::Hidden,
+    cl::desc("Merge adjacent partitions that are already vectorizable."));
+
 static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
     "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
     cl::Hidden,
@@ -130,6 +144,7 @@
 public:
   InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
       : DepCycle(DepCycle), OrigLoop(L) {
+    SeedInstructions.insert(I);
     Set.insert(I);
   }
 
@@ -137,7 +152,10 @@
   bool hasDepCycle() const { return DepCycle; }
 
   /// Adds an instruction to this partition.
-  void add(Instruction *I) { Set.insert(I); }
+  void addSeed(Instruction *I) {
+    SeedInstructions.insert(I);
+    Set.insert(I);
+  }
 
   /// Collection accessors.
   InstructionSet::iterator begin() { return Set.begin(); }
@@ -149,6 +167,9 @@
   /// Moves this partition into \p Other.  This partition becomes empty
   /// after this.
   void moveTo(InstPartition &Other) {
+    Other.SeedInstructions.insert(SeedInstructions.begin(),
+                                  SeedInstructions.end());
+    SeedInstructions.clear();
     Other.Set.insert(Set.begin(), Set.end());
     Set.clear();
     Other.DepCycle |= DepCycle;
@@ -207,6 +228,14 @@
 
   /// Remaps the cloned instructions using VMap.
   void remapInstructions() {
+    // Seed instructions may be used outside the loop; remap those uses too.
+    for (Instruction *I : SeedInstructions) {
+      I->replaceUsesWithIf(VMap[I], [&](Use &U) {
+        auto *UserInst = cast<Instruction>(U.getUser());
+        return !OrigLoop->contains(UserInst->getParent());
+      });
+    }
+
     remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
   }
 
@@ -253,6 +282,9 @@
   /// Instructions from OrigLoop selected for this partition.
   InstructionSet Set;
 
+  /// Instructions from OrigLoop used to seed this partition.
+  InstructionSet SeedInstructions;
+
   /// Whether this partition contains a dependence cycle.
   bool DepCycle;
 
@@ -292,7 +324,7 @@
     if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
       PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
     else
-      PartitionContainer.back().add(Inst);
+      PartitionContainer.back().addSeed(Inst);
   }
 
   /// Adds \p Inst into a partition that is not marked to contain
@@ -336,7 +368,8 @@
 
   /// Merges the partitions according to various heuristics.
   void mergeBeforePopulating() {
-    mergeAdjacentNonCyclic();
+    if (DistributeMergeVectorizablePartitions)
+      mergeAdjacentNonCyclic();
     if (!DistributeNonIfConvertible)
       mergeNonIfConvertible();
   }
@@ -652,6 +685,15 @@
   AccessesType Accesses;
 };
 
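+/// Returns true if \p Dependences contains a possibly-backward dependence.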
+static bool hasPossiblyBackwardDependences(
+    const SmallVectorImpl<MemoryDepChecker::Dependence> &Dependences) {
+  for (auto &Dep : Dependences)
+    if (Dep.isPossiblyBackward())
+      return true;
+
+  return false;
+}
+
 /// The actual class performing the per-loop work.
 class LoopDistributeForLoop {
 public:
@@ -684,13 +726,30 @@
 
     // Currently, we only distribute to isolate the part of the loop with
     // dependence cycles to enable partial vectorization.
-    if (LAI->canVectorizeMemory())
+    if (LAI->canVectorizeMemory() && !DistributeVectorizableLoops)
       return fail("MemOpsCanBeVectorized",
                   "memory operations are safe for vectorization");
 
     auto *Dependences = LAI->getDepChecker().getDependences();
-    if (!Dependences || Dependences->empty())
-      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
+    if (!Dependences)
+      return fail("NoDeps", "dependency analysis failed");
+
+    LLVM_DEBUG(dbgs() << "NumDependences: " << Dependences->size() << "\n");
+
+    // If there are possibly-backward dependences that do not prevent
+    // vectorization, loop distribution would trigger spuriously; bail out.
+    if (LAI->canVectorizeMemory() &&
+        hasPossiblyBackwardDependences(*Dependences)) {
+      return fail("VectorizableDependences",
+                  "dependences that won't block vectorization found");
+    }
+
+    // If the loop is not vectorizable and the set of dependences is empty,
+    // Loop Access Analysis gave up and its results are invalid; do not base
+    // loop distribution on them.
+    if (!LAI->canVectorizeMemory() && Dependences->empty()) {
+      return fail("NoDeps", "dependency analysis failed");
+    }
 
     InstPartitionContainer Partitions(L, LI, DT);
 
@@ -749,6 +808,7 @@
     // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
     // should be able to vectorize these together.
     Partitions.mergeBeforePopulating();
+
     LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
     if (Partitions.getSize() < 2)
       return fail("CantIsolateUnsafeDeps",
@@ -776,6 +836,8 @@
                   "may not insert runtime check with convergent operation");
     }
 
+    LLVM_DEBUG(dbgs() << "LD: SCEV predicate complexity: "
+                      << Pred.getComplexity() << "\n");
     if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                     ? PragmaDistributeSCEVCheckThreshold
                                     : DistributeSCEVCheckThreshold))
@@ -794,9 +856,16 @@
     auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
     const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
     const auto &AllChecks = RtPtrChecking->getChecks();
+
     auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                   RtPtrChecking);
 
+    // Runtime pointer checks could be quadratic in the number of pointers.
+    if (Checks.size() > DistributeRuntimePointerChecksThreshold) {
+      return fail("TooManyRuntimePointerChecks",
+                  "too many runtime pointer-alias checks needed.\n");
+    }
+
     if (LAI->hasConvergentOp() && !Checks.empty()) {
       return fail("RuntimeCheckWithConvergent",
                   "may not insert runtime check with convergent operation");
diff --git a/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-distribute -enable-loop-distribute -loop-distribute-merge-vectorizable-partitions=false -verify-loop-info -verify-dom-info -S < %s \
+; RUN:   | FileCheck %s
+
+; for (i = 0; i < n; i ++) {
+;   sumA += A[i]
+;  =========================
+;   sumB += B[i]
+; }
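+; The use of sumA outside the loop must be remapped to the value produced by
+; the first distributed loop (the .ldist1 copy); see the for.end CHECK lines.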
+
+define i64 @f(i64 %n, i32* %a) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_LDIST1:%.*]]
+; CHECK:       entry.split.ldist1:
+; CHECK-NEXT:    br label [[FOR_BODY_LDIST1:%.*]]
+; CHECK:       for.body.ldist1:
+; CHECK-NEXT:    [[INDEX_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[INDEX_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT:    [[SUMA_LDIST1:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[SUMA_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT:    [[IDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX_LDIST1]]
+; CHECK-NEXT:    [[LOADA_LDIST1:%.*]] = load i32, i32* [[IDXA_LDIST1]], align 4
+; CHECK-NEXT:    [[SUMA_NEXT_LDIST1]] = add nuw nsw i32 [[LOADA_LDIST1]], [[SUMA_LDIST1]]
+; CHECK-NEXT:    [[INDEX_NEXT_LDIST1]] = add nuw nsw i64 [[INDEX_LDIST1]], 1
+; CHECK-NEXT:    [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[INDEX_NEXT_LDIST1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUMIDXSQ:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[SUMIDXSQ_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IDXSQ:%.*]] = mul i64 [[INDEX]], [[INDEX]]
+; CHECK-NEXT:    [[SUMIDXSQ_NEXT]] = add nuw nsw i64 [[IDXSQ]], [[SUMIDXSQ]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[SUMA_NEXT_LDIST1]] to i64
+; CHECK-NEXT:    [[RET:%.*]] = add nuw nsw i64 [[ZEXT]], [[SUMIDXSQ_NEXT]]
+; CHECK-NEXT:    ret i64 [[RET]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %for.body ]
+  %sumA = phi i32 [ 0, %entry ], [ %sumA.next, %for.body ]
+  %sumIdxSq = phi i64 [ 0, %entry ], [ %sumIdxSq.next, %for.body ]
+
+  %idxA = getelementptr inbounds i32, i32* %a, i64 %index
+  %loadA = load i32, i32* %idxA, align 4
+  %sumA.next = add nuw nsw i32 %loadA, %sumA
+
+  %idxSq = mul i64 %index, %index
+  %sumIdxSq.next = add nuw nsw i64 %idxSq, %sumIdxSq
+
+  %index.next = add nuw nsw i64 %index, 1
+
+  %exitcond = icmp eq i64 %index.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  %zext = zext i32 %sumA.next to i64
+  %ret = add nuw nsw i64 %zext, %sumIdxSq.next
+  ret i64 %ret
+}
diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
--- a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
+++ b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
@@ -25,9 +25,7 @@
 target triple = "x86_64-apple-macosx10.11.0"
 
 ; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info (hotness: 300)
-; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization (hotness: 300)
 ; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info{{$}}
-; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization{{$}}
 
 define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 !prof !22 {
 entry:
diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics.ll b/llvm/test/Transforms/LoopDistribute/diagnostics.ll
--- a/llvm/test/Transforms/LoopDistribute/diagnostics.ll
+++ b/llvm/test/Transforms/LoopDistribute/diagnostics.ll
@@ -36,7 +36,6 @@
 target triple = "x86_64-apple-macosx10.11.0"
 
 ; MISSED_REMARKS: remark:  /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info
-; ALWAYS:         remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization
 ; ALWAYS:         warning: /tmp/t.c:3:3: loop not distributed: failed explicitly specified loop distribution
 
 define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 {
@@ -67,7 +66,6 @@
 
 ; NO_REMARKS-NOT: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization
 ; MISSED_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info
-; ANALYSIS_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization
 ; ALWAYS-NOT: warning: /tmp/t.c:9:3: loop not distributed: failed explicitly specified loop distribution
 
 define void @not_forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !22 {
diff --git a/llvm/test/Transforms/LoopDistribute/vectorizable-dependences.ll b/llvm/test/Transforms/LoopDistribute/vectorizable-dependences.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopDistribute/vectorizable-dependences.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basic-aa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
+; RUN:   < %s | FileCheck %s
+
+@A = global [2 x [16 x [16 x i32]]] zeroinitializer
+@B = global [2 x [16 x [16 x i32]]] zeroinitializer
+@C = global [16 x [16 x i32]] zeroinitializer
+@D = global [16 x [16 x i32]] zeroinitializer
+
+define void @backward_vectorizable(i32 %j) {
+; CHECK-LABEL: @backward_vectorizable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[J:%.*]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 1, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 1, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @D, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @C, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARRAYIDX22]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %idxprom1 = sext i32 %j to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx2 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 0, i64 %indvars.iv, i64 %idxprom1
+  %0 = load i32, i32* %arrayidx2, align 4
+  %arrayidx6 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 0, i64 %indvars.iv, i64 %idxprom1
+  store i32 %0, i32* %arrayidx6, align 4
+  %arrayidx10 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 1, i64 %indvars.iv, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx10, align 4
+  %arrayidx14 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 1, i64 %indvars.iv, i64 %idxprom1
+  store i32 %1, i32* %arrayidx14, align 4
+  %arrayidx18 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @D, i64 0, i64 %indvars.iv, i64 %idxprom1
+  %2 = load i32, i32* %arrayidx18, align 4
+  %arrayidx22 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %idxprom1
+  store i32 %2, i32* %arrayidx22, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
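+
+; A second case: two stores into adjacent members of a struct, indexed by a
+; runtime value; distribution should again leave the loop intact.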
+
+%struct = type {  [361 x i32], [361 x i32] }
+define void @backward_vectorizable2(i32 %a, i32 %b, %struct* nocapture %S) {
+; CHECK-LABEL: @backward_vectorizable2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1, [[A:%.*]]
+; CHECK-NEXT:    [[CMP_NOT12:%.*]] = icmp sgt i32 [[A]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_NOT12]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B]], 1
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SUB]], [[TMP2]]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[S:%.*]], i64 0, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    store i32 2, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT]], %struct* [[S]], i64 0, i32 1, i64 [[IDXPROM]]
+; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[TMP1]], [[LFTR_WIDEIV]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %sub = sub i32 1, %a
+  %cmp.not12 = icmp sgt i32 %a, %b
+  br i1 %cmp.not12, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %a to i64
+  %1 = add i32 %b, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %2 = trunc i64 %indvars.iv to i32
+  %add = add i32 %sub, %2
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds %struct, %struct* %S, i64 0, i32 0, i64 %idxprom
+  store i32 2, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds %struct, %struct* %S, i64 0, i32 1, i64 %idxprom
+  store i32 1, i32* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond.not = icmp eq i32 %1, %lftr.wideiv
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}