diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -106,6 +106,20 @@ cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution")); +static cl::opt<unsigned> DistributeRuntimePointerChecksThreshold( + "loop-distribute-runtime-check-threshold", cl::init(32), cl::Hidden, + cl::desc("The maximum number of runtime pointer aliasing checks for Loop " + "Distribution")); + +static cl::opt<bool> DistributeVectorizableLoops( + "loop-distribute-vectorizable-loops", cl::init(true), cl::Hidden, + cl::desc( + "Consider loops that are already vectorizable for loop distribution.")); + +static cl::opt<bool> DistributeMergeVectorizablePartitions( + "loop-distribute-merge-vectorizable-partitions", cl::init(true), cl::Hidden, + cl::desc("Merge adjacent partitions that are already vectorizable.")); + static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), cl::Hidden, @@ -130,6 +144,7 @@ public: InstPartition(Instruction *I, Loop *L, bool DepCycle = false) : DepCycle(DepCycle), OrigLoop(L) { + SeedInstructions.insert(I); Set.insert(I); } @@ -137,7 +152,10 @@ bool hasDepCycle() const { return DepCycle; } /// Adds an instruction to this partition. - void add(Instruction *I) { Set.insert(I); } + void addSeed(Instruction *I) { + SeedInstructions.insert(I); + Set.insert(I); + } /// Collection accessors. InstructionSet::iterator begin() { return Set.begin(); } @@ -149,6 +167,9 @@ /// Moves this partition into \p Other. This partition becomes empty /// after this. void moveTo(InstPartition &Other) { + Other.SeedInstructions.insert(SeedInstructions.begin(), + SeedInstructions.end()); + SeedInstructions.clear(); Other.Set.insert(Set.begin(), Set.end()); Set.clear(); Other.DepCycle |= DepCycle; @@ -207,6 +228,14 @@ /// Remaps the cloned instructions using VMap. 
void remapInstructions() { + // Seed instructions might be used outside the loop. + for (Instruction *I : SeedInstructions) { + I->replaceUsesWithIf(VMap[I], [&](Use &U) { + Instruction *Use = cast<Instruction>(U.getUser()); + return !OrigLoop->contains(Use->getParent()); + }); + } + remapInstructionsInBlocks(ClonedLoopBlocks, VMap); } @@ -253,6 +282,9 @@ /// Instructions from OrigLoop selected for this partition. InstructionSet Set; + /// Instructions from OrigLoop used to seed this partition. + InstructionSet SeedInstructions; + /// Whether this partition contains a dependence cycle. bool DepCycle; @@ -292,7 +324,7 @@ if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); else - PartitionContainer.back().add(Inst); + PartitionContainer.back().addSeed(Inst); } /// Adds \p Inst into a partition that is not marked to contain @@ -336,7 +368,8 @@ /// Merges the partitions according to various heuristics. void mergeBeforePopulating() { - mergeAdjacentNonCyclic(); + if (DistributeMergeVectorizablePartitions) + mergeAdjacentNonCyclic(); if (!DistributeNonIfConvertible) mergeNonIfConvertible(); } @@ -684,13 +717,20 @@ // Currently, we only distribute to isolate the part of the loop with // dependence cycles to enable partial vectorization. - if (LAI->canVectorizeMemory()) + if (LAI->canVectorizeMemory() && !DistributeVectorizableLoops) return fail("MemOpsCanBeVectorized", "memory operations are safe for vectorization"); auto *Dependences = LAI->getDepChecker().getDependences(); - if (!Dependences || Dependences->empty()) - return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); + if (Dependences) + LLVM_DEBUG(dbgs() << "NumDependences: " << Dependences->size() << "\n"); + + // If we can't vectorize and the set of dependencies is empty, then that means + // that Loop Access Analysis gave up and the results are invalid. Don't try + // to do loop distribution based off it, or Bad Things happen. 
+ if (!Dependences || (!LAI->canVectorizeMemory() && Dependences->empty())) { + return fail("NoDeps", "dependency analysis failed"); + } InstPartitionContainer Partitions(L, LI, DT); @@ -749,6 +789,7 @@ // Run the merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); + LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", @@ -776,6 +817,8 @@ "may not insert runtime check with convergent operation"); } + LLVM_DEBUG(dbgs() << "LD: SCEV predicate complexity: " + << Pred.getComplexity() << "\n"); if (Pred.getComplexity() > (IsForced.getValueOr(false) ? PragmaDistributeSCEVCheckThreshold : DistributeSCEVCheckThreshold)) @@ -794,9 +837,16 @@ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); const auto &AllChecks = RtPtrChecking->getChecks(); + auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, RtPtrChecking); + // Runtime pointer checks could be quadratic in the number of pointers. 
+ if (Checks.size() > DistributeRuntimePointerChecksThreshold) { + return fail("TooManyRuntimePointerChecks", + "too many runtime pointer-alias checks needed.\n"); + } + if (LAI->hasConvergentOp() && !Checks.empty()) { return fail("RuntimeCheckWithConvergent", "may not insert runtime check with convergent operation"); diff --git a/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-distribute -enable-loop-distribute --loop-distribute-merge-vectorizable-partitions=false -verify-loop-info -verify-dom-info -S < %s \ +; RUN: | FileCheck %s + +; for (i = 0; i < n; i ++) { +; sumA += A[i] +; ========================= +; sumB += B[i] +; } + + +define i64 @f(i64 %n, i32* %a) { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[ENTRY_SPLIT_LDIST1:%.*]] +; CHECK: entry.split.ldist1: +; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]] +; CHECK: for.body.ldist1: +; CHECK-NEXT: [[INDEX_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[INDEX_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ] +; CHECK-NEXT: [[SUMA_LDIST1:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[SUMA_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ] +; CHECK-NEXT: [[IDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX_LDIST1]] +; CHECK-NEXT: [[LOADA_LDIST1:%.*]] = load i32, i32* [[IDXA_LDIST1]], align 4 +; CHECK-NEXT: [[SUMA_NEXT_LDIST1]] = add nuw nsw i32 [[LOADA_LDIST1]], [[SUMA_LDIST1]] +; CHECK-NEXT: [[INDEX_NEXT_LDIST1]] = add nuw nsw i64 [[INDEX_LDIST1]], 1 +; CHECK-NEXT: [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[INDEX_NEXT_LDIST1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]] +; CHECK: entry.split: +; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMIDXSQ:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[SUMIDXSQ_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDXSQ:%.*]] = mul i64 [[INDEX]], [[INDEX]] +; CHECK-NEXT: [[SUMIDXSQ_NEXT]] = add nuw nsw i64 [[IDXSQ]], [[SUMIDXSQ]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SUMA_NEXT_LDIST1]] to i64 +; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i64 [[ZEXT]], [[SUMIDXSQ_NEXT]] +; CHECK-NEXT: ret i64 [[RET]] +; +entry: + br label %for.body + +for.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %for.body ] + %sumA = phi i32 [ 0, %entry ], [ %sumA.next, %for.body ] + %sumIdxSq = phi i64 [ 0, %entry ], [ %sumIdxSq.next, %for.body ] + + %idxA = getelementptr inbounds i32, i32* %a, i64 %index + %loadA = load i32, i32* %idxA, align 4 + %sumA.next = add nuw nsw i32 %loadA, %sumA + + %idxSq = mul i64 %index, %index + %sumIdxSq.next = add nuw nsw i64 %idxSq, %sumIdxSq + + %index.next = add nuw nsw i64 %index, 1 + + %exitcond = icmp eq i64 %index.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %zext = zext i32 %sumA.next to i64 + %ret = add nuw nsw i64 %zext, %sumIdxSq.next + ret i64 %ret +} diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll --- a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll +++ b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll @@ -25,9 +25,7 @@ target triple = "x86_64-apple-macosx10.11.0" ; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info (hotness: 300) -; HOTNESS: remark: /tmp/t.c:3:3: loop 
not distributed: memory operations are safe for vectorization (hotness: 300) ; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info{{$}} -; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization{{$}} define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 !prof !22 { entry: diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics.ll b/llvm/test/Transforms/LoopDistribute/diagnostics.ll --- a/llvm/test/Transforms/LoopDistribute/diagnostics.ll +++ b/llvm/test/Transforms/LoopDistribute/diagnostics.ll @@ -36,7 +36,6 @@ target triple = "x86_64-apple-macosx10.11.0" ; MISSED_REMARKS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info -; ALWAYS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization ; ALWAYS: warning: /tmp/t.c:3:3: loop not distributed: failed explicitly specified loop distribution define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 { @@ -67,7 +66,6 @@ ; NO_REMARKS-NOT: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization ; MISSED_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info -; ANALYSIS_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization ; ALWAYS-NOT: warning: /tmp/t.c:9:3: loop not distributed: failed explicitly specified loop distribution define void @not_forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !22 {