Index: lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.h +++ lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -31,6 +31,9 @@ class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { + // Distinguish if we are called from scheduler or hazard recognizer + bool IsHazardRecognizerMode; + // This variable stores the instruction that has been emitted this cycle. It // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is // called. @@ -54,11 +57,13 @@ void addClauseInst(const MachineInstr &MI); - int getWaitStatesSince(function_ref IsHazard); + int getWaitStatesSince(function_ref IsHazard, + int Limit); int getWaitStatesSinceDef(unsigned Reg, - function_ref IsHazardDef = - [](MachineInstr *) { return true; }); - int getWaitStatesSinceSetReg(function_ref IsHazard); + function_ref IsHazardDef, + int Limit); + int getWaitStatesSinceSetReg(function_ref IsHazard, + int Limit); int checkSoftClauseHazards(MachineInstr *SMEM); int checkSMRDHazards(MachineInstr *SMRD); @@ -85,6 +90,7 @@ void EmitNoop() override; unsigned PreEmitNoops(SUnit *SU) override; unsigned PreEmitNoops(MachineInstr *) override; + unsigned PreEmitNoopsCommon(MachineInstr *); void AdvanceCycle() override; void RecedeCycle() override; }; Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -38,6 +38,7 @@ //===----------------------------------------------------------------------===// GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : + IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget()), @@ -173,10 +174,19 @@ } unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { - return PreEmitNoops(SU->getInstr()); + IsHazardRecognizerMode = false; + return 
PreEmitNoopsCommon(SU->getInstr()); } unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { + IsHazardRecognizerMode = true; + CurrCycleInstr = MI; + unsigned W = PreEmitNoopsCommon(MI); + CurrCycleInstr = nullptr; + return W; +} + +unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { int WaitStates = std::max(0, checkAnyInstHazards(MI)); if (SIInstrInfo::isSMRD(*MI)) @@ -266,8 +276,77 @@ // Helper Functions //===----------------------------------------------------------------------===// +// Returns the minimum wait states since \p I, walking all predecessors. +// Stops scanning a path as soon as \p IsExpired returns true. +// Can only be run in hazard recognizer mode. +static int getWaitStatesSince( + function_ref IsHazard, + MachineBasicBlock *MBB, + MachineBasicBlock::reverse_instr_iterator I, + int WaitStates, + function_ref IsExpired, + llvm::DenseSet &Visited) { + + for (auto E = MBB->rend() ; I != E; ++I) { + if (IsHazard(&*I)) + return WaitStates; + + unsigned Opcode = I->getOpcode(); + if (Opcode == AMDGPU::INLINEASM || + I->getOpcode() == AMDGPU::IMPLICIT_DEF || + I->isDebugInstr()) + continue; + + WaitStates += SIInstrInfo::getNumWaitStates(*I); + + if (IsExpired(&*I, WaitStates)) + return std::numeric_limits::max(); + } + + int MinWaitStates = WaitStates; + bool Found = false; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (!Visited.insert(Pred).second) + continue; + + int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), + WaitStates, IsExpired, Visited); + + if (W == std::numeric_limits::max()) + continue; + + MinWaitStates = Found ? 
std::min(MinWaitStates, W) : W; + if (IsExpired(nullptr, MinWaitStates)) + return MinWaitStates; + + Found = true; + } + + if (Found) + return MinWaitStates; + + return std::numeric_limits::max(); +} + +static int getWaitStatesSince( + function_ref IsHazard, + MachineInstr *MI, + function_ref IsExpired) { + llvm::DenseSet Visited; + return getWaitStatesSince(IsHazard, MI->getParent(), + std::next(MI->getReverseIterator()), + 0, IsExpired, Visited); +} + int GCNHazardRecognizer::getWaitStatesSince( - function_ref IsHazard) { + function_ref IsHazard, int Limit) { + if (IsHazardRecognizerMode) { + auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) { + return WaitStates >= Limit; + }; + return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); + } + int WaitStates = 0; for (MachineInstr *MI : EmittedInstrs) { if (MI) { @@ -279,28 +358,31 @@ continue; } ++WaitStates; + + if (WaitStates >= Limit) + break; } return std::numeric_limits::max(); } int GCNHazardRecognizer::getWaitStatesSinceDef( - unsigned Reg, function_ref IsHazardDef) { + unsigned Reg, function_ref IsHazardDef, int Limit) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) { return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI); }; - return getWaitStatesSince(IsHazardFn); + return getWaitStatesSince(IsHazardFn, Limit); } int GCNHazardRecognizer::getWaitStatesSinceSetReg( - function_ref IsHazard) { + function_ref IsHazard, int Limit) { auto IsHazardFn = [IsHazard] (MachineInstr *MI) { return isSSetReg(MI->getOpcode()) && IsHazard(MI); }; - return getWaitStatesSince(IsHazardFn); + return getWaitStatesSince(IsHazardFn, Limit); } //===----------------------------------------------------------------------===// @@ -398,7 +480,8 @@ if (!Use.isReg()) continue; int WaitStatesNeededForUse = - SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 
IsHazardDefFn, + SmrdSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); // This fixes what appears to be undocumented hardware behavior in SI where @@ -411,7 +494,8 @@ if (IsBufferSMRD) { int WaitStatesNeededForUse = SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), - IsBufferHazardDefFn); + IsBufferHazardDefFn, + SmrdSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } } @@ -435,7 +519,8 @@ continue; int WaitStatesNeededForUse = - VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, + VmemSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } return WaitStatesNeeded; @@ -455,13 +540,16 @@ if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = - DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg()); + DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(), + [](MachineInstr *) { return true; }, + DppVgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } WaitStatesNeeded = std::max( WaitStatesNeeded, - DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn)); + DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, + DppExecWaitStates)); return WaitStatesNeeded; } @@ -473,7 +561,8 @@ // instruction. 
const int DivFMasWaitStates = 4; auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; - int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn); + int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, + DivFMasWaitStates); return DivFMasWaitStates - WaitStatesNeeded; } @@ -486,7 +575,7 @@ auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) { return GetRegHWReg == getHWReg(TII, *MI); }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); return GetRegWaitStates - WaitStatesNeeded; } @@ -500,7 +589,7 @@ auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) { return HWReg == getHWReg(TII, *MI); }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); return SetRegWaitStates - WaitStatesNeeded; } @@ -571,7 +660,7 @@ TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); }; int WaitStatesNeededForDef = - VALUWaitStates - getWaitStatesSince(IsHazardFn); + VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); return WaitStatesNeeded; @@ -636,7 +725,8 @@ }; const int RWLaneWaitStates = 4; - int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn); + int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, + RWLaneWaitStates); return RWLaneWaitStates - WaitStatesSince; } @@ -651,7 +741,7 @@ auto IsHazardFn = [TII] (MachineInstr *MI) { return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS; }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); return RFEWaitStates - WaitStatesNeeded; } @@ -675,7 +765,8 @@ return MI->getOpcode() == AMDGPU::S_MOV_FED_B32; }; int WaitStatesNeededForUse = - MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), 
IsHazardFn); + MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn, + MovFedWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } @@ -688,5 +779,6 @@ auto IsHazardFn = [TII] (MachineInstr *MI) { return TII->isSALU(*MI); }; - return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn); + return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, + SMovRelWaitStates); } Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -838,7 +838,7 @@ void insertReturn(MachineBasicBlock &MBB) const; /// Return the number of wait states that result from executing this /// instruction. - unsigned getNumWaitStates(const MachineInstr &MI) const; + static unsigned getNumWaitStates(const MachineInstr &MI); /// Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1154,7 +1154,7 @@ } } -unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { +unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? 
Index: test/CodeGen/AMDGPU/vmem-vcc-hazard.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/vmem-vcc-hazard.mir @@ -0,0 +1,230 @@ +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: vmem_vcc_fallthrough +# GCN: bb.1: +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_fallthrough +body: | + bb.0: + successors: %bb.1 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + + bb.1: + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec +... +# GCN-LABEL: name: vmem_vcc_branch_to_next +# GCN: bb.1: +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_branch_to_next +body: | + bb.0: + successors: %bb.1 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec +... +# GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far +# GCN: bb.1: +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_fallthrough_no_hazard_too_far +body: | + bb.0: + successors: %bb.1 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + + bb.1: + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec +... 
+# GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops +# GCN: bb.1: +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_fallthrough_no_hazard_nops +body: | + bb.0: + successors: %bb.1 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_NOP 4 + + bb.1: + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec +... +# GCN-LABEL: name: vmem_vcc_branch_around +# GCN: bb.2: +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_branch_around +body: | + bb.0: + successors: %bb.2 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + + bb.2: + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec +... +# GCN-LABEL: name: vmem_vcc_branch_backedge +# GCN: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_branch_backedge +body: | + bb.0: + successors: %bb.1 + + $vgpr0 = IMPLICIT_DEF + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec + + bb.1: + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_BRANCH %bb.0 +... 
+# GCN-LABEL: name: vmem_vcc_min_of_two +# GCN: bb.2: +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_min_of_two +body: | + bb.0: + successors: %bb.2 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_NOP 0 + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + + bb.2: + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec +... +# GCN-LABEL: name: vmem_vcc_self_loop +# GCN: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_self_loop +body: | + bb.0: + successors: %bb.0 + + $vgpr0 = IMPLICIT_DEF + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_BRANCH %bb.0 +... +# GCN-LABEL: name: vmem_vcc_min_of_two_self_loop1 +# GCN: bb.1: +# GCN: $sgpr0 = S_MOV_B32 0 +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_min_of_two_self_loop1 +body: | + bb.0: + successors: %bb.1 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + + bb.1: + successors: %bb.1 + + $sgpr0 = S_MOV_B32 0 + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec + $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + S_BRANCH %bb.1 +... 
+# GCN-LABEL: name: vmem_vcc_min_of_two_self_loop2 +# GCN: bb.1: +# GCN: $sgpr0 = S_MOV_B32 0 +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_min_of_two_self_loop2 +body: | + bb.0: + successors: %bb.1 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_NOP 0 + + bb.1: + successors: %bb.1 + + $sgpr0 = S_MOV_B32 0 + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec + $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + S_BRANCH %bb.1 +...