diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -344,7 +344,7 @@ auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); assert(DstOpnd && DstOpnd->isReg()); auto DPPMovReg = DstOpnd->getReg(); - if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg, MovMI)) { + if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" " for all uses\n"); return false; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -650,7 +650,7 @@ if (execMayBeModifiedBeforeUse(*MRI, UseMI->getOperand(UseOpIdx).getReg(), *OpToFold.getParent(), - UseMI)) + *UseMI)) return; UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); @@ -669,7 +669,7 @@ if (execMayBeModifiedBeforeUse(*MRI, UseMI->getOperand(UseOpIdx).getReg(), *OpToFold.getParent(), - UseMI)) + *UseMI)) return; // %vgpr = COPY %sgpr0 diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1016,13 +1016,19 @@ MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p -/// DefMI and uses. If \p UseMI is not specified, this checks all uses of \p -/// VReg. Should be run on SSA. Currently does not attempt to track between -/// blocks. +/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not +/// attempt to track between blocks. bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, - unsigned VReg, + Register VReg, const MachineInstr &DefMI, - const MachineInstr *UseMI = nullptr); + const MachineInstr &UseMI); + +/// \brief Return false if EXEC is not changed between the def of \p VReg at \p +/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to +/// track between blocks. +bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI); namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6269,47 +6269,75 @@ } bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, - unsigned VReg, + Register VReg, const MachineInstr &DefMI, - const MachineInstr *UseMI) { + const MachineInstr &UseMI) { assert(MRI.isSSA() && "Must be run on SSA"); auto *TRI = MRI.getTargetRegisterInfo(); auto *DefBB = DefMI.getParent(); - if (UseMI) { + // Don't bother searching between blocks, although it is possible this block + // doesn't modify exec. + if (UseMI.getParent() != DefBB) + return true; + + const int MaxInstScan = 20; + int NumInst = 0; + + // Stop scan at the use. + auto E = UseMI.getIterator(); + for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { + if (I->isDebugInstr()) + continue; + + if (++NumInst > MaxInstScan) + return true; + + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + return true; + } + + return false; +} + +bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI) { + assert(MRI.isSSA() && "Must be run on SSA"); + + auto *TRI = MRI.getTargetRegisterInfo(); + auto *DefBB = DefMI.getParent(); + + const int MaxUseInstScan = 10; + int NumUseInst = 0; + + for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) { // Don't bother searching between blocks, although it is possible this block // doesn't modify exec. - if (UseMI->getParent() != DefBB) + if (UseInst.getParent() != DefBB) return true; - } else { - int NumUse = 0; - const int MaxUseScan = 10; - - for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) { - if (UseInst.getParent() != DefBB) - return true; - if (NumUse++ > MaxUseScan) - return true; - } + if (++NumUseInst > MaxUseInstScan) + return true; } const int MaxInstScan = 20; - int NumScan = 0; + int NumInst = 0; - // Stop scan at the use if known. - auto E = UseMI ? UseMI->getIterator() : DefBB->end(); - for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { + // Stop scan when we have seen all the uses. + for (auto I = std::next(DefMI.getIterator()); ; ++I) { if (I->isDebugInstr()) continue; - if (NumScan++ > MaxInstScan) + if (++NumInst > MaxInstScan) return true; + if (I->readsRegister(VReg)) + if (--NumUseInst == 0) + return false; + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) return true; } - - return false; } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -42,6 +42,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}} +; GFX8MORE: v_add_u32_dpp +; GFX8MORE: v_add_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] @@ -133,6 +135,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -45,6 +45,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX8MORE: v_add_u32_dpp +; GFX8MORE: v_add_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] @@ -136,6 +138,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -39,6 +39,8 @@ ; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 ; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 ; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GFX8MORE: v_add_u32_dpp +; GFX8MORE: v_add_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -44,6 +44,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}} +; GFX8MORE: v_add_u32_dpp +; GFX8MORE: v_add_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] @@ -104,6 +106,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -44,6 +44,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}} +; GFX8MORE: v_add_u32_dpp +; GFX8MORE: v_add_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] @@ -117,6 +119,8 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp +; GFX8MORE: v_sub{{(rev)?}}_u32_dpp ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir @@ -380,6 +380,38 @@ %9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec ... +# tests on sequences of dpp consumers followed by control flow +# CHECK-LABEL: name: dpp_seq_cf +# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec +# CHECK: %5:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec +# CHECK: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec + +name: dpp_seq_cf +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec + %4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec + %5:vgpr_32 = V_SUB_I32_e32 %1, %3, implicit-def $vcc, implicit $exec + %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec + + %7:sreg_64 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec + %8:sreg_64 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + bb.2: + SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec +... + # old reg def is in diff BB - cannot combine # CHECK-LABEL: name: old_in_diff_bb # CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec