diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -60,6 +60,10 @@ void addClauseInst(const MachineInstr &MI); + // Advance over a MachineInstr bundle. Look for hazards in the bundled + // instructions. + void processBundle(); + int getWaitStatesSince(IsHazardFn IsHazard, int Limit); int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit); int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -124,6 +124,8 @@ ScheduleHazardRecognizer::HazardType GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { MachineInstr *MI = SU->getInstr(); + if (MI->isBundle()) + return NoHazard; if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) return NoopHazard; @@ -179,6 +181,29 @@ return NoHazard; } +void GCNHazardRecognizer::processBundle() { + MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); + MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); + // Check bundled MachineInstr's for hazards. + for (; MI != E && MI->isInsideBundle(); ++MI) { + CurrCycleInstr = &*MI; + unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); + + for (unsigned i = 0; i < WaitStates; ++i) + TII.insertNoop(*MI->getParent(), MI); + + // It's unnecessary to track more than MaxLookAhead instructions. Since we + // include the bundled MI directly after, only add a maximum of + // (MaxLookAhead - 1) noops to EmittedInstrs. 
+ for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) + EmittedInstrs.push_front(nullptr); + + EmittedInstrs.push_front(CurrCycleInstr); + EmittedInstrs.resize(MaxLookAhead); + } + CurrCycleInstr = nullptr; +} + unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { IsHazardRecognizerMode = false; return PreEmitNoopsCommon(SU->getInstr()); @@ -199,6 +224,9 @@ } unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { + if (MI->isBundle()) + return 0; + int WaitStates = std::max(0, checkAnyInstHazards(MI)); if (SIInstrInfo::isSMRD(*MI)) @@ -264,6 +292,12 @@ CurrCycleInstr->isKill()) return; + if (CurrCycleInstr->isBundle()) { + processBundle(); + return; + } + + unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); // Keep track of emitted instructions @@ -304,8 +338,11 @@ int WaitStates, IsExpiredFn IsExpired, DenseSet &Visited) { + for (auto E = MBB->instr_rend(); I != E; ++I) { + // Don't add WaitStates for parent BUNDLE instructions. + if (I->isBundle()) + continue; - for (auto E = MBB->rend() ; I != E; ++I) { if (IsHazard(&*I)) return WaitStates; @@ -437,9 +474,9 @@ // instructions in this group may return out of order and/or may be // replayed (i.e. the same instruction issued more than once). // - // In order to handle these situations correctly we need to make sure - // that when a clause has more than one instruction, no instruction in the - // clause writes to a register that is read another instruction in the clause + // In order to handle these situations correctly we need to make sure that + // when a clause has more than one instruction, no instruction in the clause + // writes to a register that is read by another instruction in the clause // (including itself). If we encounter this situaion, we need to break the // clause by inserting a non SMEM instruction. @@ -525,7 +562,6 @@ // SGPR was written by a VALU Instruction. 
const int VmemSgprWaitStates = 5; auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; - for (const MachineOperand &Use : VMEM->uses()) { if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) continue; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -843,9 +843,17 @@ void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI, int Count) const; + void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::instr_iterator MI, + int Count) const; + void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + // Insert a noop using an instr_iterator. If MI is in a bundle the inserted + // noop will be included in the same bundle. + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator MI) const; + void insertReturn(MachineBasicBlock &MBB) const; /// Return the number of wait states that result from executing this /// instruction. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1165,11 +1165,32 @@ } } +void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator MI, + int Count) const { + DebugLoc DL = MBB.findDebugLoc(MI); + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { insertWaitStates(MBB, MI, 1); } +void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator MI) const { + insertWaitStates(MBB, MI, 1); +} + void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { auto MF = MBB.getParent(); SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); diff --git a/llvm/test/CodeGen/AMDGPU/hazard-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-bundle.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hazard-bundle.mir @@ -0,0 +1,78 @@ +# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s + +# GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_before_bundle +# GCN: BUNDLE +# XNACK-NEXT: S_NOP +# NOXNACK-NOT: S_NOP +# GCN: } +# XNACK-NEXT: S_NOP +# NOXNACK-NOT: S_NOP +# GCN: S_LOAD_DWORDX2_IMM +--- +name: break_smem_clause_simple_load_smrd8_ptr_before_bundle + +body: | + bb.0: + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0 + BUNDLE implicit-def $sgpr6_sgpr7 { + $sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM $sgpr12_sgpr13, 0, 0, 0 + } + $sgpr12_sgpr13 = S_LOAD_DWORDX2_IMM $sgpr14_sgpr15, 0, 0, 0 + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: vmem_vcc_hazard_ignore_bundle_instr +# GCN: BUNDLE +# GCN: BUNDLE +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_hazard_ignore_bundle_instr +body: | + bb.0: + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + BUNDLE implicit-def $vgpr1, implicit $vgpr0, implicit $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec { + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + } + BUNDLE implicit-def $vgpr1, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vcc_lo, implicit $exec { + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + } + S_ENDPGM 0 +... + +# GCN-LABEL: name: vmem_vcc_min_of_two_in_bundle +# GCN: BUNDLE +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: S_NOP +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_min_of_two_in_bundle +body: | + bb.0: + successors: %bb.2 + + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + S_NOP 0 + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + + bb.2: + BUNDLE implicit-def $vgpr1, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vcc_lo, implicit $exec { + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + } +...