Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -522,6 +522,11 @@
   /// no memory dependence cycles.
   bool canVectorizeMemory() const { return CanVecMem; }
 
+  /// Return true if it's legal to insert a runtime pointer check. There may
+  /// still be reported runtime pointer checks that would be required, but it is
+  /// not legal to insert them.
+  bool canInsertRuntimeCheck() const { return CanInsertRuntimeCheck; }
+
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return PtrRtChecking.get();
   }
@@ -642,6 +647,7 @@
 
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
+  bool CanInsertRuntimeCheck;
 
   /// Indicator that there are non vectorizable stores to a uniform address.
   bool HasDependenceInvolvingLoopInvariantAddress;
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1778,15 +1778,25 @@
   unsigned NumReads = 0;
   unsigned NumReadWrites = 0;
 
+  // A runtime check is only legal to insert if there are no convergent calls.
+  // At first assume there are.
+  CanInsertRuntimeCheck = false;
+
   PtrRtChecking->Pointers.clear();
   PtrRtChecking->Need = false;
 
+  bool HasConvergentOp = false;
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
 
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     // Scan the BB and collect legal loads and stores.
     for (Instruction &I : *BB) {
+      if (auto *Call = dyn_cast<CallBase>(&I)) {
+        if (Call->isConvergent())
+          HasConvergentOp = true;
+      }
+
       // If this is a load, save it. If this instruction can read from memory
       // but is not a load, then we quit. Notice that we don't handle function
       // calls that read or write.
@@ -1845,6 +1855,8 @@
     } // Next instr.
   } // Next block.
 
+  CanInsertRuntimeCheck = !HasConvergentOp;
+
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
 
@@ -1997,6 +2009,16 @@
     }
   }
 
+  if (HasConvergentOp) {
+    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
+      << "cannot add control dependency to convergent operation";
+
+    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
+               "would be needed with a convergent operation\n");
+    CanVecMem = false;
+    return;
+  }
+
   if (CanVecMem)
     LLVM_DEBUG(
         dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
@@ -2285,6 +2307,7 @@
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
+      CanInsertRuntimeCheck(true),
       HasDependenceInvolvingLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
Index: lib/Transforms/Scalar/LoopDistribute.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDistribute.cpp
+++ lib/Transforms/Scalar/LoopDistribute.cpp
@@ -690,6 +690,12 @@
     if (!Dependences || Dependences->empty())
       return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
 
+    if (!LAI->canInsertRuntimeCheck() &&
+        LAI->getNumRuntimePointerChecks() != 0) {
+      return fail("RuntimeCheckWithConvergent",
+                  "may not insert runtime check with convergent operation");
+    }
+
     InstPartitionContainer Partitions(L, LI, DT);
 
     // First, go through each memory operation and assign them to consecutive
Index: test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll
===================================================================
--- /dev/null
+++ test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck %s
+
+; Analyze this loop:
+;   for (i = 0; i < n; i++)
+;    A[i + 1] = A[i] * B[i] * C[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK: Report: cannot add control dependency to convergent operation
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT:   Backward:
+; CHECK-NEXT:     %loadA = load i16, i16* %arrayidxA, align 2 ->
+; CHECK-NEXT:     store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: 0:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT:   %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT:   %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+; CHECK-NEXT: 1:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT:   %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT:   %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+
+@B = common global i16* null, align 8
+@A = common global i16* null, align 8
+@C = common global i16* null, align 8
+
+define void @f() {
+entry:
+  %a = load i16*, i16** @A, align 8
+  %b = load i16*, i16** @B, align 8
+  %c = load i16*, i16** @C, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+  %loadA = load i16, i16* %arrayidxA, align 2
+
+  %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+  %loadB = load i16, i16* %arrayidxB, align 2
+
+  %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+  %loadC = load i16, i16* %arrayidxC, align 2
+
+  call void @llvm.convergent()
+
+  %mul = mul i16 %loadB, %loadA
+  %mul1 = mul i16 %mul, %loadC
+
+  %add = add nuw nsw i64 %storemerge3, 1
+  %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+  store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+
+
Index: test/Transforms/LoopDistribute/basic-with-memchecks.ll
===================================================================
--- test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -5,6 +5,9 @@
 ; RUN:   -verify-loop-info -verify-dom-info -S < %s | \
 ; RUN:   FileCheck --check-prefix=VECTORIZE %s
 
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
+; RUN:   -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS
+
 ; The memcheck version of basic.ll.  We should distribute and vectorize the
 ; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
 ;
@@ -108,3 +111,60 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+declare i32 @llvm.convergent(i32) #0
+
+; This is the same as f, and would require the same bounds
+; check. However, it is not OK to introduce new control dependencies
+; on the convergent call.
+
+; CHECK-LAEBL: @f_with_convergent(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+
+; ANALYSIS: for.body:
+; ANALYSIS: Report: cannot add control dependency to convergent operation
+define void @f_with_convergent() {
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
Index: test/Transforms/LoopDistribute/basic.ll
===================================================================
--- test/Transforms/LoopDistribute/basic.ll
+++ test/Transforms/LoopDistribute/basic.ll
@@ -18,6 +18,7 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
 
+; CHECK-LABEL: @f(
 define void @f(i32* noalias %a,
                i32* noalias %b,
                i32* noalias %c,
@@ -81,3 +82,77 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+declare i32 @llvm.convergent(i32) #0
+
+; It is OK to distribute with a convergent operation, since in each
+; new loop the convergent operation has the ssame control dependency.
+; CHECK-LABEL: @f_with_convergent(
+define void @f_with_convergent(i32* noalias %a,
+                               i32* noalias %b,
+                               i32* noalias %c,
+                               i32* noalias %d,
+                               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; Verify the two distributed loops.
+
+; CHECK: entry.split.ldist1:
+; CHECK:    br label %for.body.ldist1
+; CHECK: for.body.ldist1:
+; CHECK:    %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK:    br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1
+
+; CHECK: entry.split:
+; CHECK:    br label %for.body
+; CHECK: for.body:
+; CHECK:    %convergentD = call i32 @llvm.convergent(i32 %loadD)
+; CHECK:    %mulC = mul i32 %convergentD, %loadE
+; CHECK: for.end:
+
+
+; ANALYSIS: for.body:
+; ANALYSIS-NEXT: Report: cannot add control dependency to convergent operation
+; ANALYSIS: for.body.ldist1:
+; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
+
+; convergent instruction happens to block vectorization
+; VECTORIZE: call i32 @llvm.convergent
+; VECTORIZE: mul i32
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
Index: test/Transforms/LoopDistribute/diagnostics.ll
===================================================================
--- test/Transforms/LoopDistribute/diagnostics.ll
+++ test/Transforms/LoopDistribute/diagnostics.ll
@@ -131,6 +131,50 @@
   ret void, !dbg !34
 }
 
+; MISSED_REMARKS: /tmp/t.c:27:5: loop not distributed: use -Rpass-analysis=loop-distribute for more info
+; ANALYSIS_REMARKS: /tmp/t.c:27:5: loop not distributed: may not insert runtime check with convergent operation
+; ALWAYS: warning: /tmp/t.c:27:5: loop not distributed: failed explicitly specified loop distribution
+define void @convergent(i8* %A, i8* %B, i8* %C, i8* %D, i8* %E, i32 %N) #1 !dbg !45 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !46
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !47
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !49
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !49, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !50
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !50, !tbaa !13
+  %add = add i8 %1, %0, !dbg !51
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !57
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !52
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !53, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !54
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !54, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !55
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !55, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !56
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !57
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !58, !tbaa !13
+  call void @llvm.convergent()
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !57
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !57
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !20, !dbg !57
+
+for.cond.cleanup:
+  ret void, !dbg !58
+}
+
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
@@ -177,3 +221,17 @@
 !42 = !DILocation(line: 17, column: 17, scope: !31)
 !43 = !DILocation(line: 17, column: 5, scope: !31)
 !44 = !DILocation(line: 17, column: 10, scope: !31)
+!45 = distinct !DISubprogram(name: "convergent", scope: !1, file: !1, line: 24, type: !8, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!46 = !DILocation(line: 25, column: 20, scope: !45)
+!47 = !DILocation(line: 25, column: 3, scope: !45)
+!48 = !DILocation(line: 29, column: 1, scope: !45)
+!49 = !DILocation(line: 26, column: 16, scope: !45)
+!50 = !DILocation(line: 26, column: 23, scope: !45)
+!51 = !DILocation(line: 26, column: 21, scope: !45)
+!52 = !DILocation(line: 26, column: 5, scope: !45)
+!53 = !DILocation(line: 26, column: 14, scope: !45)
+!54 = !DILocation(line: 27, column: 12, scope: !45)
+!55 = !DILocation(line: 27, column: 19, scope: !45)
+!56 = !DILocation(line: 27, column: 17, scope: !45)
+!57 = !DILocation(line: 27, column: 5, scope: !45)
+!58 = !DILocation(line: 27, column: 10, scope: !45)