diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -516,11 +516,19 @@
            SI->getSyncScopeID() == cast<StoreInst>(I2)->getSyncScopeID();
   if (const CmpInst *CI = dyn_cast<CmpInst>(I1))
     return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate();
-  if (const CallInst *CI = dyn_cast<CallInst>(I1))
-    return CI->isTailCall() == cast<CallInst>(I2)->isTailCall() &&
-           CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() &&
-           CI->getAttributes() == cast<CallInst>(I2)->getAttributes() &&
-           CI->hasIdenticalOperandBundleSchema(*cast<CallInst>(I2));
+  if (const CallInst *CI = dyn_cast<CallInst>(I1)) {
+    if (CI->isTailCall() != cast<CallInst>(I2)->isTailCall() ||
+        CI->getCallingConv() != cast<CallInst>(I2)->getCallingConv() ||
+        CI->getAttributes() != cast<CallInst>(I2)->getAttributes() ||
+        !CI->hasIdenticalOperandBundleSchema(*cast<CallInst>(I2)))
+      return false;
+    // Convergent calls implicitly depend on the set of threads that is
+    // currently executing, so conservatively return false if they are in
+    // different basic blocks.
+    if (CI->isConvergent() && CI->getParent() != I2->getParent())
+      return false;
+    return true;
+  }
   if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1))
     return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() &&
            CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes() &&
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -318,6 +318,16 @@
     return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
                         GCR->getBasePtr(), GCR->getDerivedPtr());
 
+  if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+    // Don't CSE convergent calls in different basic blocks, because they
+    // implicitly depend on the set of threads that is currently executing.
+    if (CI->isConvergent()) {
+      return hash_combine(
+          Inst->getOpcode(), Inst->getParent(),
+          hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+    }
+  }
+
   // Mix in the opcode.
   return hash_combine(
       Inst->getOpcode(),
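For illustration only, not part of the patch: a minimal IR sketch of the hazard both hunks above guard against, using @llvm.amdgcn.readfirstlane as a stand-in for any convergent call (the function and value names here are hypothetical). The two calls are textually identical, but %b executes with only the lanes for which %cond is true, so the "first active lane" it reads may differ from the one %a read; replacing %b with the dominating %a would change the result.

  declare i32 @llvm.amdgcn.readfirstlane(i32) #0

  define i32 @no_cse_across_blocks(i32 %x, i1 %cond) {
  entry:
    ; Executes with the full set of active lanes.
    %a = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    br i1 %cond, label %if, label %end

  if:
    ; Executes with a subset of the lanes, so it is not
    ; interchangeable with %a even though the IR is identical.
    %b = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    br label %end

  end:
    %r = phi i32 [ %a, %entry ], [ %b, %if ]
    ret i32 %r
  }

  attributes #0 = { convergent nounwind readnone }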
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -21,15 +21,25 @@
 ; GCN-NEXT: s_or_saveexec_b32 s4, -1
 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT: s_mov_b32 exec_lo, s4
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_mov_b32_e32 v4, v2
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
 ; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mov_b32_e32 v4, v0
+; GCN-NEXT: s_or_saveexec_b32 s5, -1
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_mov_b32 exec_lo, s5
+; GCN-NEXT: v_mov_b32_e32 v3, v0
+; GCN-NEXT: s_not_b32 exec_lo, exec_lo
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_not_b32 exec_lo, exec_lo
+; GCN-NEXT: s_or_saveexec_b32 s5, -1
+; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GCN-NEXT: s_mov_b32 exec_lo, s5
+; GCN-NEXT: v_mov_b32_e32 v5, v2
 ; GCN-NEXT: ; %bb.2: ; %end
 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v4
+; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5
 ; GCN-NEXT: s_xor_saveexec_b32 s4, -1
 ; GCN-NEXT: s_clause 0x1
 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32
diff --git a/llvm/test/Transforms/SimplifyCFG/convergent.ll b/llvm/test/Transforms/SimplifyCFG/convergent.ll
--- a/llvm/test/Transforms/SimplifyCFG/convergent.ll
+++ b/llvm/test/Transforms/SimplifyCFG/convergent.ll
@@ -82,6 +82,8 @@
 ; SINK-NEXT: [[TMP0:%.*]] = tail call i32 @tid()
 ; SINK-NEXT: [[REM:%.*]] = and i32 [[TMP0]], 1
 ; SINK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[REM]], 0
+; SINK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP0]] to i64
+; SINK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[Y_COERCE:%.*]], i64 [[IDXPROM4]]
 ; SINK-NEXT: br i1 [[CMP_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
 ; SINK: if.then:
 ; SINK-NEXT: [[TMP1:%.*]] = tail call i32 @mbcnt(i32 -1, i32 0)
@@ -101,8 +103,6 @@
 ; SINK-NEXT: br label [[IF_END]]
 ; SINK: if.end:
 ; SINK-NEXT: [[DOTSINK:%.*]] = phi i32 [ [[TMP6]], [[IF_ELSE]] ], [ [[TMP3]], [[IF_THEN]] ]
-; SINK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP0]] to i64
-; SINK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[Y_COERCE:%.*]], i64 [[IDXPROM4]]
 ; SINK-NEXT: store i32 [[DOTSINK]], ptr [[ARRAYIDX5]], align 4
 ; SINK-NEXT: ret void
 ;
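Conversely, because the EarlyCSE hash above mixes in the parent basic block, two identical convergent calls in the same block should still be combined: the set of active threads cannot change between them. A sketch under the same assumptions as the example above (hypothetical function name, readfirstlane as a stand-in convergent intrinsic); running it through, e.g., opt -passes=early-cse -S should still fold %b into %a.

  declare i32 @llvm.amdgcn.readfirstlane(i32) #0

  define i32 @cse_within_block(i32 %x) {
  entry:
    ; Same block, same operands: both the new haveSameSpecialState
    ; check and the block-sensitive hash agree, so EarlyCSE may
    ; still eliminate %b in favor of %a.
    %a = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    %b = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    %r = add i32 %a, %b
    ret i32 %r
  }

  attributes #0 = { convergent nounwind readnone }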