Index: llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -783,13 +783,130 @@ } int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { + int WaitStatesNeeded = 0; + + if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { + const int TransDefWaitstates = 1; + + auto IsTransDefFn = [this, VALU] (const MachineInstr &MI) { + if (!SIInstrInfo::isTRANS(MI)) + return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) + return true; + } + + return false; + }; + + int WaitStatesNeededForDef = + TransDefWaitstates - getWaitStatesSince(IsTransDefFn, TransDefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + if (ST.hasDstSelForwardingHazard()) { + const int Shift16DefWaitstates = 1; + + auto IsShift16BitDefFn = [this, VALU] (const MachineInstr &MI) { + if (!SIInstrInfo::isVALU(MI)) + return false; + const SIInstrInfo *TII = ST.getInstrInfo(); + if (SIInstrInfo::isSDWA(MI)) { + if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) + if (DstSel->getImm() == AMDGPU::SDWA::DWORD) + return false; + } else { + if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::op_sel) == -1) || + !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers) + ->getImm() & + SISrcMods::DST_OP_SEL)) + return false; + } + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + Register Def = Dst->getReg(); + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) + return 
true; + } + } + + return false; + }; + + int WaitStatesNeededForDef = Shift16DefWaitstates - + getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + + } + + if (ST.hasVDecCoExecHazard()) { + const int VALUWriteSGPRVALUReadWaitstates = 2; + const int VALUWriteEXECRWLane = 4; + const int VALUWriteVGPRReadlaneRead = 1; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + Register UseReg; + auto IsVALUDefSGPRFn = [&UseReg, TRI] (const MachineInstr &MI) { + if (!SIInstrInfo::isVALU(MI)) + return false; + return MI.modifiesRegister(UseReg, TRI); + }; + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (!Use.isReg()) + continue; + + UseReg = Use.getReg(); + if (TRI->isSGPRReg(MRI, UseReg)) { + int WaitStatesNeededForDef = VALUWriteSGPRVALUReadWaitstates - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + } + + if (VALU->readsRegister(AMDGPU::VCC, TRI)) { + UseReg = AMDGPU::VCC; + int WaitStatesNeededForDef = VALUWriteSGPRVALUReadWaitstates - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + switch (VALU->getOpcode()) { + case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READFIRSTLANE_B32: { + MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); + UseReg = Src->getReg(); + int WaitStatesNeededForDef = VALUWriteVGPRReadlaneRead - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + LLVM_FALLTHROUGH; + case AMDGPU::V_WRITELANE_B32: { + UseReg = AMDGPU::EXEC; + int WaitStatesNeededForDef = VALUWriteEXECRWLane - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane); + WaitStatesNeeded = std::max(WaitStatesNeeded, 
WaitStatesNeededForDef); + break; + } + default: + break; + } + } + // This checks for the hazard where VMEM instructions that store more than // 8 bytes can have there store data over written by the next instruction. if (!ST.has12DWordStoreHazard()) - return 0; + return WaitStatesNeeded; const MachineRegisterInfo &MRI = MF.getRegInfo(); - int WaitStatesNeeded = 0; for (const MachineOperand &Def : VALU->defs()) { WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -966,11 +966,28 @@ return HasLdsBranchVmemWARHazard; } + // Has one cycle hazard on transcendental instruction feeding a + // non-transcendental VALU. + bool hasTransForwardingHazard() const { + return GFX940Insts; + } + + // Has one cycle hazard on a VALU instruction partially writing dst with + // a shift of result bits feeding another VALU instruction. + bool hasDstSelForwardingHazard() const { + return GFX940Insts; + } + // Cannot use op_sel with v_dot instructions. bool hasDOTOpSelHazard() const { return GFX940Insts; } + // Does not have HW interlocks for VALU writing and then reading SGPRs. 
+ bool hasVDecCoExecHazard() const { + return GFX940Insts; + } + bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; } Index: llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir @@ -0,0 +1,217 @@ +# RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: trans32_write_non_trans32_read +# GCN: V_RCP_F32 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MUL_F32 +name: trans32_write_non_trans32_read +body: | + bb.0: + $vgpr1 = V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: trans32_write_trans_read +# GCN: V_SIN_F32 +# GCN-NEXT: V_COS_F32 +name: trans32_write_trans_read +body: | + bb.0: + $vgpr0 = V_SIN_F32_e32 $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: trans64_write_non_trans_read +# GCN: V_RCP_F64 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_OR_B32 +name: trans64_write_non_trans_read +body: | + bb.0: + $vgpr0_vgpr1 = V_RCP_F64_e32 $vgpr2_vgpr3, implicit $mode, implicit $exec + $vgpr4 = V_OR_B32_e32 $vgpr1, $vgpr5, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: trans32_write_non_trans64_read +# GCN: V_EXP_F32 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MUL_F64 +name: trans32_write_non_trans64_read +body: | + bb.0: + $vgpr1 = V_EXP_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec +... 
+ +# GCN-LABEL: name: opsel_hi16_write_valu_read +# GCN: V_ADD_I16 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MUL_F64 +name: opsel_hi16_write_valu_read +body: | + bb.0: + $vgpr0 = V_ADD_I16_e64 8, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec + $vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: opsel_lo16_write_valu_read +# GCN: V_ADD_I16 +# GCN-NEXT: V_MUL_F64 +name: opsel_lo16_write_valu_read +body: | + bb.0: + $vgpr0 = V_ADD_I16_e64 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec + $vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: sdwa_hi16_write_valu_read +# GCN: V_MOV_B32_sdwa +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MOV_B32_e32 +name: sdwa_hi16_write_valu_read +body: | + bb.0: + $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 5, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec +... + +# GCN-LABEL: name: sdwa_lo16_write_valu_read +# GCN: V_MOV_B32_sdwa +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MOV_B32_e32 +name: sdwa_lo16_write_valu_read +body: | + bb.0: + $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 4, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec +... + +# GCN-LABEL: name: sdwa_dword_write_valu_read +# GCN: V_MOV_B32_sdwa +# GCN-NEXT: V_MOV_B32_e32 +name: sdwa_dword_write_valu_read +body: | + bb.0: + $vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 6, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec +... + +# GCN-LABEL: name: sdwa_lo16_no_write_valu_read +# GCN: V_CMP_EQ_U32_sdwa +# GCN-NEXT: V_MOV_B32_e32 +name: sdwa_lo16_no_write_valu_read +body: | + bb.0: + $vcc = V_CMP_EQ_U32_sdwa 0, $vgpr1, 0, $vgpr0, 0, 5, 2, implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec +... 
+ +# GCN-LABEL: name: valu_write_sgpr_valu_read_as_constant +# GCN: V_READFIRSTLANE_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MOV_B32_e32 +name: valu_write_sgpr_valu_read_as_constant +body: | + bb.0: + $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec +... + +# GCN-LABEL: name: valu_write_vcc_valu_read_as_constant +# GCN: V_CMP_NE_U32_e32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_ADDC_U32_e32 +name: valu_write_vcc_valu_read_as_constant +body: | + bb.0: + V_CMP_NE_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $exec + $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: valu_write_sgpr_readlane_read_as_laneselect +# GCN: V_READFIRSTLANE_B32 +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_READLANE_B32 +name: valu_write_sgpr_readlane_read_as_laneselect +body: | + bb.0: + $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec + $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec +... + +# GCN-LABEL: name: valu_write_sgpr_writelane_read_as_laneselect +# GCN: V_ADD_CO_U32_e64 +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_WRITELANE_B32 +name: valu_write_sgpr_writelane_read_as_laneselect +body: | + bb.0: + $vgpr0, $sgpr0_sgpr1 = V_ADD_CO_U32_e64 $vgpr0, 1, 0, implicit $exec + $vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec +... + +# GCN-LABEL: name: vcmpx_write_exec_valu_read_as_constant +# GCN: V_CMPX_EQ_I32_e32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MOV_B32_e32 +name: vcmpx_write_exec_valu_read_as_constant +body: | + bb.0: + implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 $exec_lo, implicit $exec +... + +# GCN-LABEL: name: vcmpx_write_exec_readlane +# GCN: V_CMPX_EQ_I32_e32 +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_READLANE_B32 +name: vcmpx_write_exec_readlane +body: | + bb.0: + implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec + $sgpr1 = V_READLANE_B32 $vgpr1, 0, implicit $exec +... 
+ +# GCN-LABEL: name: vcmpx_write_exec_readfirstlane +# GCN: V_CMPX_EQ_I32_e32 +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_READFIRSTLANE_B32 +name: vcmpx_write_exec_readfirstlane +body: | + bb.0: + implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec + $sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec +... + +# GCN-LABEL: name: vcmpx_write_exec_writelane +# GCN: V_CMPX_EQ_I32_e32 +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_WRITELANE_B32 +name: vcmpx_write_exec_writelane +body: | + bb.0: + implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec +... + +# GCN-LABEL: name: valu_write_vgpr_readlane_read +# GCN: V_ADD_CO_U32_e32 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_READLANE_B32 +name: valu_write_vgpr_readlane_read +body: | + bb.0: + $vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec + $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec +... + +# GCN-LABEL: name: valu_write_vgpr_readfirstlane_read +# GCN: V_ADD_CO_U32_e32 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_READFIRSTLANE_B32 +name: valu_write_vgpr_readfirstlane_read +body: | + bb.0: + $vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec + $sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec +...