Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -806,10 +806,14 @@ return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasReadM0Hazard() const { + bool hasReadM0MovRelInterpHazard() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } + bool hasReadM0SendMsgHazard() const { + return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + } + unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; Index: lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.h +++ lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -58,7 +58,7 @@ [](MachineInstr *) { return true; }); int getWaitStatesSinceSetReg(function_ref IsHazard); - int checkSMEMSoftClauseHazards(MachineInstr *SMEM); + int checkSoftClauseHazards(MachineInstr *SMEM); int checkSMRDHazards(MachineInstr *SMRD); int checkVMEMHazards(MachineInstr* VMEM); int checkDPPHazards(MachineInstr *DPP); Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -87,6 +87,18 @@ } } +static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: + case AMDGPU::S_TTRACEDATA: + return true; + default: + // TODO: GDS + return false; + } +} + static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); @@ -100,7 +112,10 @@ if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) return NoopHazard; - if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0) + // FIXME: Should flat be considered vmem? + if ((SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI)) + && checkVMEMHazards(MI) > 0) return NoopHazard; if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) @@ -124,7 +139,12 @@ if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) return NoopHazard; - if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && + if (ST.hasReadM0MovRelInterpHazard() && + (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && + checkReadM0Hazards(MI) > 0) + return NoopHazard; + + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) && checkReadM0Hazards(MI) > 0) return NoopHazard; @@ -144,26 +164,20 @@ if (SIInstrInfo::isSMRD(*MI)) return std::max(WaitStates, checkSMRDHazards(MI)); - if (SIInstrInfo::isVALU(*MI)) { - WaitStates = std::max(WaitStates, checkVALUHazards(MI)); + if (SIInstrInfo::isVALU(*MI)) + WaitStates = std::max(WaitStates, checkVALUHazards(MI)); - if (SIInstrInfo::isVMEM(*MI)) - WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); + if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) + WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); - if (SIInstrInfo::isDPP(*MI)) - WaitStates = std::max(WaitStates, checkDPPHazards(MI)); + if (SIInstrInfo::isDPP(*MI)) + WaitStates = std::max(WaitStates, checkDPPHazards(MI)); - if (isDivFMas(MI->getOpcode())) - WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); + if (isDivFMas(MI->getOpcode())) + WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); - if (isRWLane(MI->getOpcode())) - WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); - - if (TII.isVINTRP(*MI)) - WaitStates = std::max(WaitStates, checkReadM0Hazards(MI)); - - return WaitStates; - } + if (isRWLane(MI->getOpcode())) + WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); if (isSGetReg(MI->getOpcode())) return std::max(WaitStates, checkGetRegHazards(MI)); @@ -174,7 +188,11 @@ if (isRFE(MI->getOpcode())) return std::max(WaitStates, checkRFEHazards(MI)); - if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) + if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) || + isSMovRel(MI->getOpcode()))) + return std::max(WaitStates, checkReadM0Hazards(MI)); + + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI)) return std::max(WaitStates, checkReadM0Hazards(MI)); return WaitStates; @@ -282,12 +300,14 @@ addRegsToSet(TRI, MI.uses(), ClauseUses); } -int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) { +int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { // SMEM soft clause are only present on VI+, and only matter if xnack is // enabled. if (!ST.isXNACKEnabled()) return 0; + bool IsSMRD = TII.isSMRD(*MEM); + resetClause(); // A soft-clause is any group of consecutive SMEM instructions. The @@ -303,7 +323,10 @@ for (MachineInstr *MI : EmittedInstrs) { // When we hit a non-SMEM instruction then we have passed the start of the // clause and we can stop. - if (!MI || !SIInstrInfo::isSMRD(*MI)) + if (!MI) + break; + + if (IsSMRD != SIInstrInfo::isSMRD(*MI)) break; addClauseInst(*MI); @@ -312,13 +335,13 @@ if (ClauseDefs.none()) return 0; - // FIXME: When we support stores, we need to make sure not to put loads and - // stores in the same clause if they use the same address. For now, just - // start a new clause whenever we see a store. - if (SMEM->mayStore()) + // We need to make sure not to put loads and stores in the same clause if they + // use the same address. For now, just start a new clause whenever we see a + // store. + if (MEM->mayStore()) return 1; - addClauseInst(*SMEM); + addClauseInst(*MEM); // If the set of defs and uses intersect then we cannot add this instruction // to the clause, so we have a hazard. @@ -329,7 +352,7 @@ const SISubtarget &ST = MF.getSubtarget(); int WaitStatesNeeded = 0; - WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD); + WaitStatesNeeded = checkSoftClauseHazards(SMRD); // This SMRD hazard only affects SI. if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS) @@ -369,18 +392,15 @@ } int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { - const SIInstrInfo *TII = ST.getInstrInfo(); - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return 0; - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int WaitStatesNeeded = checkSoftClauseHazards(VMEM); // A read of an SGPR by a VMEM instruction requires 5 wait states when the // SGPR was written by a VALU Instruction. - int VmemSgprWaitStates = 5; - int WaitStatesNeeded = 0; - auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + const int VmemSgprWaitStates = 5; + auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; for (const MachineOperand &Use : VMEM->uses()) { if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) @@ -598,11 +618,8 @@ } int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { - if (!ST.hasReadM0Hazard()) - return 0; - const SIInstrInfo *TII = ST.getInstrInfo(); - int SMovRelWaitStates = 1; + const int SMovRelWaitStates = 1; auto IsHazardFn = [TII] (MachineInstr *MI) { return TII->isSALU(*MI); }; Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1522,8 +1522,6 @@ ScoreBrackets->dump(); }); - bool InsertNOP = false; - // Walk over the instructions. for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); Iter != E;) { @@ -1624,58 +1622,6 @@ VCCZBugHandledSet.insert(&Inst); } - if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - - // This avoids a s_nop after a waitcnt has just been inserted. - if (!SWaitInst && InsertNOP) { - BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); - } - InsertNOP = false; - - // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - bool IsSMEM = false; - bool IsVMEM = false; - if (TII->isSMRD(Inst)) - IsSMEM = true; - else if (TII->usesVM_CNT(Inst)) - IsVMEM = true; - - ++Iter; - if (Iter == E) - break; - - MachineInstr &Next = *Iter; - - // TODO: How about consecutive SMEM instructions? - // The comments above says break the clause but the code does not. - // if ((TII->isSMRD(next) && isSMEM) || - if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM && - // TODO: Enable this check when hasSoftClause is upstreamed. - // ST->hasSoftClauses() && - ST->isXNACKEnabled()) { - // Insert a NOP to break the clause. - InsertNOP = true; - continue; - } - - // There must be "S_NOP 0" between an instruction writing M0 and - // S_SENDMSG. - if ((Next.getOpcode() == AMDGPU::S_SENDMSG || - Next.getOpcode() == AMDGPU::S_SENDMSGHALT) && - Inst.definesRegister(AMDGPU::M0)) - InsertNOP = true; - - continue; - } - ++Iter; } Index: test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -0,0 +1,580 @@ +# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s +--- +# Trivial clause at beginning of program +name: trivial_clause_load_flat4_x1 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_clause_load_flat4_x1 + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Trivial clause at beginning of program +name: trivial_clause_load_flat4_x2 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_clause_load_flat4_x2 + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Trivial clause at beginning of program +name: trivial_clause_load_flat4_x3 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_clause_load_flat4_x3 + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr5_vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Trivial clause at beginning of program +name: trivial_clause_load_flat4_x4 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_clause_load_flat4_x4 + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr8_vgpr9, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr10_vgpr11, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Reuse of same input pointer is OK + +name: trivial_clause_load_flat4_x2_sameptr +body: | + bb.0: + ; GCN-LABEL: name: trivial_clause_load_flat4_x2_sameptr + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# 32-bit load partially clobbers its own ptr reg +name: flat_load4_overwrite_ptr_lo + +body: | + bb.0: + ; GCN-LABEL: name: flat_load4_overwrite_ptr_lo + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# 32-bit load partially clobbers its own ptr reg +name: flat_load4_overwrite_ptr_hi + +body: | + bb.0: + ; GCN-LABEL: name: flat_load4_overwrite_ptr_hi + ; GCN: %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr1 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# 64-bit load clobbers its own ptr reg +name: flat_load8_overwrite_ptr + +body: | + bb.0: + ; GCN-LABEL: name: flat_load8_overwrite_ptr + ; GCN: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# vmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt +# breaks the clause. + + +name: break_clause_at_max_clause_size_flat_load4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4 + ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18 + ; GCN-NEXT: S_ENDPGM + + %vgpr2 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr4 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr5 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr6 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr7 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr9 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr10 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr11 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr12 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr13 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr14 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr15 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr16 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr17 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr0 = S_MOV_B32 %sgpr0, implicit %vgpr2, implicit %vgpr3, implicit %vgpr4, implicit %vgpr5, implicit %vgpr6, implicit %vgpr7, implicit %vgpr8, implicit %vgpr9, implicit %vgpr10, implicit %vgpr11, implicit %vgpr12, implicit %vgpr13, implicit %vgpr14, implicit %vgpr15, implicit %vgpr16, implicit %vgpr17, implicit %vgpr18 + S_ENDPGM +... +--- + +name: break_clause_simple_load_flat4_lo_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_simple_load_flat4_lo_ptr + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +name: break_clause_simple_load_flat4_hi_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_simple_load_flat4_hi_ptr + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +name: break_clause_simple_load_flat8_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_simple_load_flat8_ptr + ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + + +name: break_clause_simple_load_flat16_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_simple_load_flat16_ptr + ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +# The clause is broken by the waitcnt inserted at the end of the +# block, so no nop is needed. + + +name: break_clause_block_boundary_load_flat8_ptr + +body: | + ; GCN-LABEL: name: break_clause_block_boundary_load_flat8_ptr + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN: bb.1: + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + bb.0: + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + + bb.1: + %vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# The load clobbers the pointer of the store, so it needs to break. + +name: break_clause_store_load_into_ptr_flat4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_store_load_into_ptr_flat4 + ; GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# The load clobbers the data of the store, so it needs to break. +# FIXME: Would it be better to s_nop and wait later? + +name: break_clause_store_load_into_data_flat4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_store_load_into_data_flat4 + ; GCN: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Regular VALU instruction breaks clause, no nop needed + +name: valu_inst_breaks_clause + +body: | + bb.0: + ; GCN-LABEL: name: valu_inst_breaks_clause + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr8 = V_MOV_B32_e32 0, implicit %exec + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = V_MOV_B32_e32 0, implicit %exec + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Regular SALU instruction breaks clause, no nop needed + +name: salu_inst_breaks_clause + +body: | + bb.0: + ; GCN-LABEL: name: salu_inst_breaks_clause + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %sgpr8 = S_MOV_B32 0 + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr8 = S_MOV_B32 0 + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +name: ds_inst_breaks_clause + +body: | + bb.0: + ; GCN-LABEL: name: ds_inst_breaks_clause + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +name: smrd_inst_breaks_clause + +body: | + bb.0: + ; GCN-LABEL: name: smrd_inst_breaks_clause + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %sgpr8 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 0, 0 + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# FIXME: Should this be handled? +name: implicit_use_breaks_clause + +body: | + bb.0: + ; GCN-LABEL: name: implicit_use_breaks_clause + ; GCN: %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5 + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr, implicit %vgpr4_vgpr5 + %vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 %vgpr6_vgpr7, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +name: trivial_clause_load_mubuf4_x2 + +body: | + bb.0: + ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2 + ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: S_ENDPGM + + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr3 = BUFFER_LOAD_DWORD_OFFEN %vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... +--- +name: break_clause_simple_load_mubuf_offen_ptr + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr + ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: S_ENDPGM + + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... +--- +# BUFFER instructions overwriting their own inputs is supposedly OK. + +name: mubuf_load4_overwrite_ptr + +body: | + bb.0: + ; GCN-LABEL: name: mubuf_load4_overwrite_ptr + ; GCN: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: %vgpr1 = V_MOV_B32_e32 0, implicit %exec + ; GCN-NEXT: %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec + ; GCN-NEXT: S_ENDPGM + %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr1 = V_MOV_B32_e32 0, implicit %exec + %vgpr2 = V_MOV_B32_e32 %vgpr0, implicit %exec + S_ENDPGM +... +--- +# Break a clause from interference between mubuf and flat instructions + +name: break_clause_flat_load_mubuf_load + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_flat_load_mubuf_load + ; GCN: %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: S_ENDPGM + + %vgpr0 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... +# Break a clause from interference between mubuf and flat instructions + +# GCN-LABEL: name: break_clause_mubuf_load_flat_load +# GCN: bb.0: +# GCN-NEXT: %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4 +# XNACK-NEXT: S_NOP 0 +# GCN-NEXT: %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3 +# GCN-NEXT: S_ENDPGM +name: break_clause_mubuf_load_flat_load + +body: | + bb.0: + %vgpr0 = BUFFER_LOAD_DWORD_OFFEN %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr1 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + + S_ENDPGM +... +--- + +name: break_clause_atomic_rtn_into_ptr_flat4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_flat4 + ; GCN: %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + %vgpr2 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr4 = FLAT_ATOMIC_ADD_RTN %vgpr5_vgpr6, %vgpr7, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +name: break_clause_atomic_nortn_ptr_load_flat4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_flat4 + ; GCN: FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: S_ENDPGM + + FLAT_ATOMIC_ADD %vgpr0_vgpr1, %vgpr2, 0, 0, implicit %exec, implicit %flat_scr + %vgpr2 = FLAT_LOAD_DWORD %vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- + +name: break_clause_atomic_rtn_into_ptr_mubuf4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4 + ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: %vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec + ; GCN-NEXT: S_ENDPGM + + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN %vgpr2, %vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec + S_ENDPGM +... +--- + +name: break_clause_atomic_nortn_ptr_load_mubuf4 + +body: | + bb.0: + ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4 + ; GCN: BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec + ; GCN-NEXT: %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: S_ENDPGM + + BUFFER_ATOMIC_ADD_OFFEN %vgpr0, %vgpr1, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, implicit %exec + %vgpr1 = BUFFER_LOAD_DWORD_OFFEN %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... +--- +# Make sure there is no assert on mubuf instructions which do not have +# vaddr, and don't add register to track. +name: no_break_clause_mubuf_load_novaddr + +body: | + bb.0: + ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr + ; GCN: %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + ; GCN-NEXT: S_ENDPGM + %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %vgpr3 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... +--- +# Loads and stores using different addresses theoretically does not +# need a nop +name: mix_load_store_clause +body: | + bb.0: + ; GCN-LABEL: name: mix_load_store_clause + ; GCN: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + + FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr2_vgpr3, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# Loads and stores using the same address needs a nop. + +name: mix_load_store_clause_same_address +body: | + bb.0: + ; GCN-LABEL: name: mix_load_store_clause_same_address + ; GCN: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + ; XNACK-NEXT: S_NOP 0 + ; GCN-NEXT: FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr + ; GCN-NEXT: %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + + FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr10 = FLAT_LOAD_DWORD %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr0_vgpr1, %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr11 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... Index: test/CodeGen/AMDGPU/inserted-wait-states.mir =================================================================== --- test/CodeGen/AMDGPU/inserted-wait-states.mir +++ test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -437,22 +437,22 @@ # GCN-LABEL: bb.0: # GCN: S_MOV_B32 -# GFX9: S_NOP +# GFX9-NEXT: S_NOP # GCN-NEXT: V_INTERP_P1_F32 # GCN-LABEL: bb.1: # GCN: S_MOV_B32 -# GFX9: S_NOP +# GFX9-NEXT: S_NOP # GCN-NEXT: V_INTERP_P2_F32 # GCN-LABEL: bb.2: # GCN: S_MOV_B32 -# GFX9: S_NOP +# GFX9-NEXT: S_NOP # GCN-NEXT: V_INTERP_P1_F32_16bank # GCN-LABEL: bb.3: # GCN: S_MOV_B32 -# GFX9: S_NOP +# GFX9-NEXT: S_NOP # GCN-NEXT: V_INTERP_MOV_F32 name: v_interp Index: test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sendmsg-m0-hazard.mir @@ -0,0 +1,49 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX9 %s +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,VI %s +# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,CI %s +# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,SI %s + +--- +name: m0_sendmsg +body: | + ; GCN-LABEL: name: m0_sendmsg + ; GCN: %m0 = S_MOV_B32 -1 + ; VI-NEXT: S_NOP 0 + ; GFX9-NEXT: S_NOP 0 + ; GCN-NEXT: S_SENDMSG 3, implicit %exec, implicit %m0 + + bb.0: + %m0 = S_MOV_B32 -1 + S_SENDMSG 3, implicit %exec, implicit %m0 + S_ENDPGM +... +--- + +name: m0_sendmsghalt +body: | + ; GCN-LABEL: name: m0_sendmsghalt + ; GCN: %m0 = S_MOV_B32 -1 + ; VI-NEXT: S_NOP 0 + ; GFX9-NEXT: S_NOP 0 + ; GCN-NEXT: S_SENDMSGHALT 3, implicit %exec, implicit %m0 + + bb.0: + %m0 = S_MOV_B32 -1 + S_SENDMSGHALT 3, implicit %exec, implicit %m0 + S_ENDPGM +... +--- + +name: m0_ttracedata +body: | + ; GCN-LABEL: name: m0_ttracedata + ; GCN: %m0 = S_MOV_B32 -1 + ; VI-NEXT: S_NOP 0 + ; GFX9-NEXT: S_NOP 0 + ; GCN-NEXT: S_TTRACEDATA implicit %m0 + + bb.0: + %m0 = S_MOV_B32 -1 + S_TTRACEDATA implicit %m0 + S_ENDPGM +...