diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -96,6 +96,8 @@
   bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
   bool fixVcmpxExecWARHazard(MachineInstr *MI);
   bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
+  bool fixLdsDirectVALUHazard(MachineInstr *MI);
+  bool fixLdsDirectVMEMHazard(MachineInstr *MI);
   bool fixVALUPartialForwardingHazard(MachineInstr *MI);
   bool fixVALUTransUseHazard(MachineInstr *MI);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -427,6 +427,7 @@
 typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
 
 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
+typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
 
 // Search for a hazard in a block and its predecessors.
 template <typename StateT>
@@ -473,11 +474,11 @@
 // Returns a minimum wait states since \p I walking all predecessors.
 // Only scans until \p IsExpired does not return true.
 // Can only be run in a hazard recognizer mode.
-static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
-                              const MachineBasicBlock *MBB,
-                              MachineBasicBlock::const_reverse_instr_iterator I,
-                              int WaitStates, IsExpiredFn IsExpired,
-                              DenseSet<const MachineBasicBlock *> &Visited) {
+static int getWaitStatesSince(
+    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
+    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
+    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
+    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
   for (auto E = MBB->instr_rend(); I != E; ++I) {
     // Don't add WaitStates for parent BUNDLE instructions.
     if (I->isBundle())
@@ -489,7 +490,7 @@
     if (I->isInlineAsm())
       continue;
 
-    WaitStates += SIInstrInfo::getNumWaitStates(*I);
+    WaitStates += GetNumWaitStates(*I);
 
     if (IsExpired(*I, WaitStates))
       return std::numeric_limits<int>::max();
@@ -500,8 +501,8 @@
     if (!Visited.insert(Pred).second)
       continue;
 
-    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
-                               WaitStates, IsExpired, Visited);
+    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
+                               IsExpired, Visited, GetNumWaitStates);
 
     MinWaitStates = std::min(MinWaitStates, W);
   }
@@ -1075,6 +1076,10 @@
   fixSMEMtoVectorWriteHazards(MI);
   fixVcmpxExecWARHazard(MI);
   fixLdsBranchVmemWARHazard(MI);
+  if (ST.hasLdsDirect()) {
+    fixLdsDirectVALUHazard(MI);
+    fixLdsDirectVMEMHazard(MI);
+  }
   fixVALUPartialForwardingHazard(MI);
   fixVALUTransUseHazard(MI);
 }
@@ -1366,6 +1371,102 @@
   return true;
 }
 
+/// Resolve VALU hazards (WAR and WAW) against the destination VGPR of an
+/// LDSDIR instruction by programming its waitvdst operand.
+///
+/// Searches backwards from \p MI with a wait-state function that counts only
+/// VALU instructions, looking for a VALU that reads or writes vdst. The
+/// resulting count, clamped to NoHazardWaitStates, is written to waitvdst.
+///
+/// \returns false if \p MI is not an LDSDIR instruction; otherwise true, as
+/// waitvdst is rewritten unconditionally.
+bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
+  if (!SIInstrInfo::isLDSDIR(*MI))
+    return false;
+
+  const int NoHazardWaitStates = 15;
+  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+  const Register VDSTReg = VDST->getReg();
+
+  bool VisitedTrans = false;
+  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
+    if (!SIInstrInfo::isVALU(I))
+      return false;
+    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
+    // Cover both WAR and WAW
+    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
+  };
+  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
+    if (WaitStates >= NoHazardWaitStates)
+      return true;
+    // Instructions which cause va_vdst==0 expire hazard
+    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
+  };
+  auto GetWaitStatesFn = [](const MachineInstr &I) {
+    return SIInstrInfo::isVALU(I) ? 1 : 0;
+  };
+
+  DenseSet<const MachineBasicBlock *> Visited;
+  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
+                                    std::next(MI->getReverseIterator()), 0,
+                                    IsExpiredFn, Visited, GetWaitStatesFn);
+
+  // Transcendentals can execute in parallel to other VALUs.
+  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
+  if (VisitedTrans)
+    Count = 0;
+
+  MachineOperand *WaitVdstOp =
+      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
+  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
+
+  return true;
+}
+
+/// Resolve VMEM, FLAT and DS hazards (WAR and WAW) against the destination
+/// VGPR of an LDSDIR instruction by inserting an S_WAITCNT_DEPCTR before it.
+///
+/// Nothing is inserted when the backwards search from \p MI reports no
+/// outstanding hazard, e.g. because it expired at a VALU, an EXP, an
+/// S_WAITCNT 0 or an S_WAITCNT_DEPCTR carrying the same immediate.
+///
+/// \returns true if an S_WAITCNT_DEPCTR was inserted.
+bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
+  if (!SIInstrInfo::isLDSDIR(*MI))
+    return false;
+
+  // Immediate shared by the expiry check and the inserted wait so that the two
+  // cannot drift apart.
+  // NOTE(review): 0xffe3 presumably selects vm_vsrc == 0; confirm against ISA.
+  const int WaitDepCtrImm = 0xffe3;
+  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+  const Register VDSTReg = VDST->getReg();
+
+  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
+    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
+        !SIInstrInfo::isDS(I))
+      return false;
+    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
+  };
+  auto IsExpiredFn = [&](const MachineInstr &I, int) {
+    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
+           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
+           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+            I.getOperand(0).getImm() == WaitDepCtrImm);
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(WaitDepCtrImm);
+
+  return true;
+}
+
 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
   if (!ST.isWave64())
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir b/llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir
@@ -0,0 +1,409 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs 
-run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: lds_param_load_no_war +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_no_war + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_va_vdst0_war +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_va_vdst0_war + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_va_vdst0_war_salu +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_va_vdst0_war_salu + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $m0 = S_MOV_B32 killed $sgpr0 + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $m0 = S_MOV_B32 killed $sgpr0 + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_va_vdst1_war +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_va_vdst1_war + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 1, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_va_vdst10_war +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_va_vdst10_war + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 10, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit 
$mode, implicit $exec + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_va_vdst10_waw +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_va_vdst10_waw + ; GCN: $vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 10, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr4 = V_MUL_F32_e32 $vgpr2, 
$vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_va_vdst20_war +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_va_vdst20_war + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec 
+ ; GCN-NEXT: $vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr20 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr20 
= V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_valu_war_trans +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_valu_war_trans + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 4095 + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_trans_war_valu +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_trans_war_valu + ; GCN: $vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_valu_war_vmem +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_valu_war_vmem + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_valu_war_lds +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_valu_war_lds + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_valu_war_ldsdir +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_valu_war_ldsdir + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = LDS_PARAM_LOAD 0, 1, 15, implicit $m0, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 4, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr10 = LDS_PARAM_LOAD 0, 1, 15, implicit $m0, implicit $exec + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 4, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_vmem_war +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_vmem_war + ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) + ; GCN-NEXT: S_WAITCNT_DEPCTR 65507 + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_vmem_war_valu +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_vmem_war_valu + ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) + ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_vmem_war_exp +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_vmem_war_exp + ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) + ; GCN-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_vmem_war_waitcnt +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt + ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + S_WAITCNT 0 + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_param_load_vmem_war_waitcnt_depctr +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt_depctr + ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) + ; GCN-NEXT: S_WAITCNT_DEPCTR 65507 + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + S_WAITCNT_DEPCTR 65507 + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: lds_param_load_vmem_war_waitcnt_depctr2 +body: | + bb.0: + ; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt_depctr2 + ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) + ; GCN-NEXT: S_WAITCNT_DEPCTR 65535 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65507 + ; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + S_WAITCNT_DEPCTR 65535 + $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_direct_load_no_war +body: | + bb.0: + ; GCN-LABEL: name: lds_direct_load_no_war + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_DIRECT_LOAD 15, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + $vgpr1 = LDS_DIRECT_LOAD 0, implicit $m0, implicit $exec + S_ENDPGM 0 +... + +--- +name: lds_direct_load_va_vdst0_war +body: | + bb.0: + ; GCN-LABEL: name: lds_direct_load_va_vdst0_war + ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr1 = LDS_DIRECT_LOAD 0, implicit $m0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr1 = LDS_DIRECT_LOAD 15, implicit $m0, implicit $exec + S_ENDPGM 0 +...