diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -516,11 +516,19 @@
            SI->getSyncScopeID() == cast<StoreInst>(I2)->getSyncScopeID();
   if (const CmpInst *CI = dyn_cast<CmpInst>(I1))
     return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate();
-  if (const CallInst *CI = dyn_cast<CallInst>(I1))
-    return CI->isTailCall() == cast<CallInst>(I2)->isTailCall() &&
-           CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() &&
-           CI->getAttributes() == cast<CallInst>(I2)->getAttributes() &&
-           CI->hasIdenticalOperandBundleSchema(*cast<CallInst>(I2));
+  if (const CallInst *CI = dyn_cast<CallInst>(I1)) {
+    if (CI->isTailCall() != cast<CallInst>(I2)->isTailCall() ||
+        CI->getCallingConv() != cast<CallInst>(I2)->getCallingConv() ||
+        CI->getAttributes() != cast<CallInst>(I2)->getAttributes() ||
+        !CI->hasIdenticalOperandBundleSchema(*cast<CallInst>(I2)))
+      return false;
+    // Convergent calls implicitly depend on the set of threads that is
+    // currently executing, so conservatively return false if they are in
+    // different basic blocks.
+    if (CI->isConvergent() && CI->getParent() != I2->getParent())
+      return false;
+    return true;
+  }
   if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1))
     return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() &&
            CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes() &&
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -318,6 +318,16 @@
     return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
                         GCR->getBasePtr(), GCR->getDerivedPtr());
 
+  if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+    // Don't CSE convergent calls in different basic blocks, because they
+    // implicitly depend on the set of threads that is currently executing.
+    if (CI->isConvergent()) {
+      return hash_combine(
+          Inst->getOpcode(), Inst->getParent(),
+          hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+    }
+  }
+
   // Mix in the opcode.
   return hash_combine(
       Inst->getOpcode(),
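For illustration only, not part of the patch: a minimal IR sketch of the hazard both hunks above guard against, using @llvm.amdgcn.readfirstlane as a stand-in for any convergent call (the function and value names here are hypothetical). The two calls are textually identical, but %b executes with only the lanes for which %cond is true, so the "first active lane" it reads may differ from the one %a read; replacing %b with the dominating %a would change the result.

  declare i32 @llvm.amdgcn.readfirstlane(i32) #0

  define i32 @no_cse_across_blocks(i32 %x, i1 %cond) {
  entry:
    ; Executes with the full set of active lanes.
    %a = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    br i1 %cond, label %if, label %end

  if:
    ; Executes with a subset of the lanes, so it is not
    ; interchangeable with %a even though the IR is identical.
    %b = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    br label %end

  end:
    %r = phi i32 [ %a, %entry ], [ %b, %if ]
    ret i32 %r
  }

  attributes #0 = { convergent nounwind readnone }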
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -21,15 +21,25 @@
 ; GCN-NEXT: s_or_saveexec_b32 s4, -1
 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT: s_mov_b32 exec_lo, s4
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_mov_b32_e32 v4, v2
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
 ; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mov_b32_e32 v4, v0
+; GCN-NEXT: s_or_saveexec_b32 s5, -1
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_mov_b32 exec_lo, s5
+; GCN-NEXT: v_mov_b32_e32 v3, v0
+; GCN-NEXT: s_not_b32 exec_lo, exec_lo
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_not_b32 exec_lo, exec_lo
+; GCN-NEXT: s_or_saveexec_b32 s5, -1
+; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GCN-NEXT: s_mov_b32 exec_lo, s5
+; GCN-NEXT: v_mov_b32_e32 v5, v2
 ; GCN-NEXT: ; %bb.2: ; %end
 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v4
+; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5
 ; GCN-NEXT: s_xor_saveexec_b32 s4, -1
 ; GCN-NEXT: s_clause 0x1
 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32
diff --git a/llvm/test/Transforms/SimplifyCFG/convergent.ll b/llvm/test/Transforms/SimplifyCFG/convergent.ll
--- a/llvm/test/Transforms/SimplifyCFG/convergent.ll
+++ b/llvm/test/Transforms/SimplifyCFG/convergent.ll
@@ -82,6 +82,8 @@
 ; SINK-NEXT: [[TMP0:%.*]] = tail call i32 @tid()
 ; SINK-NEXT: [[REM:%.*]] = and i32 [[TMP0]], 1
 ; SINK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[REM]], 0
+; SINK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP0]] to i64
+; SINK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[Y_COERCE:%.*]], i64 [[IDXPROM4]]
 ; SINK-NEXT: br i1 [[CMP_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
 ; SINK: if.then:
 ; SINK-NEXT: [[TMP1:%.*]] = tail call i32 @mbcnt(i32 -1, i32 0)
@@ -101,8 +103,6 @@
 ; SINK-NEXT: br label [[IF_END]]
 ; SINK: if.end:
 ; SINK-NEXT: [[DOTSINK:%.*]] = phi i32 [ [[TMP6]], [[IF_ELSE]] ], [ [[TMP3]], [[IF_THEN]] ]
-; SINK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP0]] to i64
-; SINK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[Y_COERCE:%.*]], i64 [[IDXPROM4]]
 ; SINK-NEXT: store i32 [[DOTSINK]], ptr [[ARRAYIDX5]], align 4
 ; SINK-NEXT: ret void
 ;
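Conversely, because the EarlyCSE hash above mixes in the parent basic block, two identical convergent calls in the same block should still be combined: the set of active threads cannot change between them. A sketch under the same assumptions as the example above (hypothetical function name, readfirstlane as a stand-in convergent intrinsic); running it through, e.g., opt -passes=early-cse -S should still fold %b into %a.

  declare i32 @llvm.amdgcn.readfirstlane(i32) #0

  define i32 @cse_within_block(i32 %x) {
  entry:
    ; Same block, same operands: both the new haveSameSpecialState
    ; check and the block-sensitive hash agree, so EarlyCSE may
    ; still eliminate %b in favor of %a.
    %a = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    %b = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
    %r = add i32 %a, %b
    ret i32 %r
  }

  attributes #0 = { convergent nounwind readnone }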