Index: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -40,6 +40,7 @@ int checkSMRDHazards(MachineInstr *SMRD); int checkVMEMHazards(MachineInstr* VMEM); + int checkDPPHazards(MachineInstr *DPP); public: GCNHazardRecognizer(const MachineFunction &MF); // We can only issue one instruction per cycle. Index: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -47,6 +47,9 @@ if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0) return NoopHazard; + if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) + return NoopHazard; + return NoHazard; } @@ -61,6 +64,9 @@ if (SIInstrInfo::isVMEM(*MI)) return std::max(0, checkVMEMHazards(MI)); + if (SIInstrInfo::isDPP(*MI)) + return std::max(0, checkDPPHazards(MI)); + return 0; } @@ -175,3 +181,23 @@ } return WaitStatesNeeded; } + +int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + + // Check for DPP VGPR read after VALU VGPR write. 
+ int DppVgprWaitStates = 2; + int WaitStatesNeeded = 0; + + for (const MachineOperand &Use : DPP->uses()) { + if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + int WaitStatesNeededForUse = + DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg()); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -127,18 +127,6 @@ /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - /// \param DPP The DPP instruction - /// \param SearchI The iterator to start look for hazards. - /// \param SearchMBB The basic block we are operating on. - /// \param WaitStates Then number of wait states that need to be inserted - /// When a hazard is detected. - void insertDPPWaitStates(MachineBasicBlock::iterator DPP, - MachineBasicBlock::reverse_iterator SearchI, - MachineBasicBlock *SearchMBB, - unsigned WaitStates); - - void insertDPPWaitStates(MachineBasicBlock::iterator DPP); - /// Return true if there are LGKM instrucitons that haven't been waited on /// yet. bool hasOutstandingLGKM() const; @@ -522,45 +510,6 @@ } } -void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP, - MachineBasicBlock::reverse_iterator SearchI, - MachineBasicBlock *SearchMBB, - unsigned WaitStates) { - - MachineBasicBlock::reverse_iterator E = SearchMBB->rend(); - - for (; WaitStates > 0; --WaitStates, ++SearchI) { - - // If we have reached the start of the block, we need to check predecessors. - if (SearchI == E) { - for (MachineBasicBlock *Pred : SearchMBB->predecessors()) { - // We only need to check fall-through blocks. Branch instructions - // give us enough wait states. 
- if (Pred->getFirstTerminator() == Pred->end()) { - insertDPPWaitStates(DPP, Pred->rbegin(), Pred, WaitStates); - break; - } - } - return; - } - - for (MachineOperand &Op : SearchI->operands()) { - if (!Op.isReg() || !Op.isDef()) - continue; - - if (DPP->readsRegister(Op.getReg(), TRI)) { - TII->insertWaitStates(*DPP->getParent(), DPP, WaitStates); - return; - } - } - } -} - -void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP) { - MachineBasicBlock::reverse_iterator I(DPP); - insertDPPWaitStates(DPP, I, DPP->getParent(), 2); -} - // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { @@ -630,10 +579,6 @@ } } - if (TII->isDPP(*I)) { - insertDPPWaitStates(I); - } - // Record pre-existing, explicitly requested waits if (I->getOpcode() == AMDGPU::S_WAITCNT) { handleExistingWait(*I); Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s ; FIXME: The register allocator / scheduler should be able to avoid these hazards. 
@@ -26,7 +27,10 @@ } ; VI-LABEL: {{^}}dpp_first_in_bb: -; VI: s_nop 1 +; VI: ; %endif +; VI-OPT: s_mov_b32 +; VI-OPT: s_mov_b32 +; VI-NOOPT: s_nop 1 ; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0