diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -96,6 +96,8 @@
   bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
   bool fixVcmpxExecWARHazard(MachineInstr *MI);
   bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
+  bool fixVALUPartialForwardingHazard(MachineInstr *MI);
+  bool fixVALUTransUseHazard(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
   int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -424,8 +424,61 @@
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
+typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
+
 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
 
+// Search backwards from \p I through \p MBB and its predecessors for a hazard.
+//
+// \p IsHazard is called on every non-BUNDLE instruction with the running
+// \p State: HazardFound makes the search return true, HazardExpired stops the
+// search along the current path, and any other result continues the scan.
+// \p UpdateState is then applied, except for inline asm and meta instructions.
+// \p State is taken by value, so every predecessor is searched starting from a
+// copy of the state reached at the top of \p MBB.
+// \p Visited records the predecessor blocks already entered; a block already
+// in the set is not searched again.
+template <typename StateT>
+static bool
+hasHazard(StateT State,
+          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
+          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
+          const MachineBasicBlock *MBB,
+          MachineBasicBlock::const_reverse_instr_iterator I,
+          DenseSet<const MachineBasicBlock *> &Visited) {
+  for (auto E = MBB->instr_rend(); I != E; ++I) {
+    // No need to look at parent BUNDLE instructions.
+    if (I->isBundle())
+      continue;
+
+    switch (IsHazard(State, *I)) {
+    case HazardFound:
+      return true;
+    case HazardExpired:
+      return false;
+    default:
+      // Continue search
+      break;
+    }
+
+    if (I->isInlineAsm() || I->isMetaInstruction())
+      continue;
+
+    UpdateState(State, *I);
+  }
+
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
+                  Visited))
+      return true;
+  }
+
+  return false;
+}
+
 // Returns a minimum wait states since \p I walking all predecessors.
 // Only scans until \p IsExpired does not return true.
 // Can only be run in a hazard recognizer mode.
@@ -1031,6 +1084,8 @@
   fixSMEMtoVectorWriteHazards(MI);
   fixVcmpxExecWARHazard(MI);
   fixLdsBranchVmemWARHazard(MI);
+  fixVALUPartialForwardingHazard(MI);
+  fixVALUTransUseHazard(MI);
 }
 
 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -1320,6 +1375,242 @@
   return true;
 }
 
+// Insert "S_WAITCNT_DEPCTR 0x0fff" before \p MI if the backwards search finds
+// the VALU / EXEC write / VALU pattern described below feeding \p MI.
+// Only wave64 VALU instructions with at least two distinct VGPR sources are
+// checked, and only if the subtarget has the partial forwarding hazard.
+// Returns true if a wait instruction was inserted.
+bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
+  if (!ST.isWave64())
+    return false;
+  if (!ST.hasVALUPartialForwardingHazard())
+    return false;
+  if (!SIInstrInfo::isVALU(*MI))
+    return false;
+
+  SmallSetVector<Register, 4> SrcVGPRs;
+
+  for (const MachineOperand &Use : MI->explicit_uses()) {
+    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+      SrcVGPRs.insert(Use.getReg());
+  }
+
+  // Only applies with >= 2 unique VGPR sources
+  if (SrcVGPRs.size() <= 1)
+    return false;
+
+  // Look for the following pattern:
+  //   Va <- VALU [PreExecPos]
+  //   intv1
+  //   Exec <- SALU [ExecPos]
+  //   intv2
+  //   Vb <- VALU [PostExecPos]
+  //   intv3
+  //   MI Va, Vb (WaitState = 0)
+  //
+  // Where:
+  //   intv1 + intv2 <= 2 VALUs
+  //   intv3 <= 4 VALUs
+  //
+  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
+
+  const int Intv1plus2MaxVALUs = 2;
+  const int Intv3MaxVALUs = 4;
+  const int IntvMaxVALUs = 6;
+  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
+
+  struct StateType {
+    SmallDenseMap<Register, int, 4> DefPos;
+    int ExecPos = std::numeric_limits<int>::max();
+    int VALUs = 0;
+  };
+
+  StateType State;
+
+  // This overloads expiry testing with all the hazard detection
+  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
+    // Too many VALU states have passed
+    if (State.VALUs > NoHazardVALUWaitStates)
+      return HazardExpired;
+
+    // Instructions which cause va_vdst==0 expire hazard
+    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+         I.getOperand(0).getImm() == 0x0fff))
+      return HazardExpired;
+
+    // Track register writes
+    bool Changed = false;
+    if (SIInstrInfo::isVALU(I)) {
+      for (Register Src : SrcVGPRs) {
+        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
+          State.DefPos[Src] = State.VALUs;
+          Changed = true;
+        }
+      }
+    } else if (SIInstrInfo::isSALU(I)) {
+      if (State.ExecPos == std::numeric_limits<int>::max()) {
+        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
+          State.ExecPos = State.VALUs;
+          Changed = true;
+        }
+      }
+    }
+
+    // Early expiration: too many VALUs in intv3
+    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
+      return HazardExpired;
+
+    // Only evaluate state if something changed
+    if (!Changed)
+      return NoHazardFound;
+
+    // Determine positions of VALUs pre/post exec change
+    if (State.ExecPos == std::numeric_limits<int>::max())
+      return NoHazardFound;
+
+    int PreExecPos = std::numeric_limits<int>::max();
+    int PostExecPos = std::numeric_limits<int>::max();
+
+    for (auto Entry : State.DefPos) {
+      int DefVALUs = Entry.second;
+      if (DefVALUs != std::numeric_limits<int>::max()) {
+        if (DefVALUs >= State.ExecPos)
+          PreExecPos = std::min(PreExecPos, DefVALUs);
+        else if (DefVALUs < State.ExecPos)
+          PostExecPos = std::min(PostExecPos, DefVALUs);
+      }
+    }
+
+    // Need a VALU post exec change
+    if (PostExecPos == std::numeric_limits<int>::max())
+      return NoHazardFound;
+
+    // Too many VALUs in intv3?
+    int Intv3VALUs = PostExecPos;
+    if (Intv3VALUs > Intv3MaxVALUs)
+      return HazardExpired;
+
+    // Too many VALUs in intv2?
+    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
+    if (Intv2VALUs > Intv1plus2MaxVALUs)
+      return HazardExpired;
+
+    // Need a VALU pre exec change
+    if (PreExecPos == std::numeric_limits<int>::max())
+      return NoHazardFound;
+
+    // Too many VALUs in intv1?
+    int Intv1VALUs = PreExecPos - State.ExecPos;
+    if (Intv1VALUs > Intv1plus2MaxVALUs)
+      return HazardExpired;
+
+    // Too many VALUs in intv1 + intv2
+    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
+      return HazardExpired;
+
+    return HazardFound;
+  };
+  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
+    if (SIInstrInfo::isVALU(MI))
+      State.VALUs += 1;
+  };
+
+  DenseSet<const MachineBasicBlock *> Visited;
+  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
+                            std::next(MI->getReverseIterator()), Visited))
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(0x0fff);
+
+  return true;
+}
+
+// Insert "S_WAITCNT_DEPCTR 0x0fff" before \p MI if one of its explicit VGPR
+// uses is written by a TRANS instruction found by the backwards search before
+// the search expires (see the pattern below).
+// Returns true if a wait instruction was inserted.
+bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
+  if (!ST.hasVALUTransUseHazard())
+    return false;
+  if (!SIInstrInfo::isVALU(*MI))
+    return false;
+
+  SmallSet<Register, 4> SrcVGPRs;
+
+  for (const MachineOperand &Use : MI->explicit_uses()) {
+    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+      SrcVGPRs.insert(Use.getReg());
+  }
+
+  // Look for the following pattern:
+  //   Va <- TRANS VALU
+  //   intv
+  //   MI Va (WaitState = 0)
+  //
+  // Where:
+  //   intv <= 5 VALUs / 1 TRANS
+  //
+  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
+
+  const int IntvMaxVALUs = 5;
+  const int IntvMaxTRANS = 1;
+
+  struct StateType {
+    int VALUs = 0;
+    int TRANS = 0;
+  };
+
+  StateType State;
+
+  // This overloads expiry testing with all the hazard detection
+  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
+    // Too many VALU states have passed
+    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
+      return HazardExpired;
+
+    // Instructions which cause va_vdst==0 expire hazard
+    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
+        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+         I.getOperand(0).getImm() == 0x0fff))
+      return HazardExpired;
+
+    // Track register writes
+    if (SIInstrInfo::isTRANS(I)) {
+      for (Register Src : SrcVGPRs) {
+        if (I.modifiesRegister(Src, &TRI)) {
+          return HazardFound;
+        }
+      }
+    }
+
+    return NoHazardFound;
+  };
+  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
+    if (SIInstrInfo::isVALU(MI))
+      State.VALUs += 1;
+    if (SIInstrInfo::isTRANS(MI))
+      State.TRANS += 1;
+  };
+
+  DenseSet<const MachineBasicBlock *> Visited;
+  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
+                            std::next(MI->getReverseIterator()), Visited))
+    return false;
+
+  // Hazard is observed - insert a wait on va_vdst counter to ensure hazard is
+  // avoided (mask 0x0fff achieves this).
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(0x0fff);
+
+  return true;
+}
+
 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
   int NSAtoVMEMWaitStates = 1;
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -3117,6 +3117,7 @@
 ; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
 ; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
diff --git a/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir b/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir
@@ -0,0 +1,399 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name:            partial_forwarding_1_hazard
+body:            |
+  bb.0:
+    ; GCN-LABEL: name: partial_forwarding_1_hazard
+    ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN: $exec = S_MOV_B64 -1
+    ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN: S_WAITCNT_DEPCTR 4095
+    ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
+    ; GCN: S_ENDPGM 0
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $exec = S_MOV_B64 -1
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+ +--- +name: partial_forwarding_2_hazard +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_2_hazard + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $sgpr0 = S_MOV_B32 0 + ; GCN: $sgpr1 = S_MOV_B32 0 + ; GCN: $sgpr2 = S_MOV_B32 0 + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $sgpr3 = S_MOV_B32 0 + ; GCN: $sgpr4 = S_MOV_B32 0 + ; GCN: $sgpr5 = S_MOV_B32 0 + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $sgpr6 = S_MOV_B32 0 + ; GCN: $sgpr7 = S_MOV_B32 0 + ; GCN: $sgpr8 = S_MOV_B32 0 + ; GCN: $sgpr9 = S_MOV_B32 0 + ; GCN: $sgpr10 = S_MOV_B32 0 + ; GCN: S_WAITCNT_DEPCTR 4095 + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2 = S_MOV_B32 0 + $exec = S_MOV_B64 -1 + $sgpr3 = S_MOV_B32 0 + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + $sgpr8 = S_MOV_B32 0 + $sgpr9 = S_MOV_B32 0 + $sgpr10 = S_MOV_B32 0 + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_3_hazard +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_3_hazard + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_WAITCNT_DEPCTR 4095 + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_3_no_hazard_1 +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_3_no_hazard_1 + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr20 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr20 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_3_no_hazard_2 +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_3_no_hazard_2 + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr20 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr20 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_3_no_hazard_3 +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_3_no_hazard_3 + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr20 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr20 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_4_hazard +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_4_hazard + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_WAITCNT_DEPCTR 4095 + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_4_no_hazard +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_4_no_hazard + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr21 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr21 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_5_hazard +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_5_hazard + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_WAITCNT_DEPCTR 4095 + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_5_no_hazard +body: | + bb.0: + ; GCN-LABEL: name: partial_forwarding_5_no_hazard + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr21 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr21 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_branching_1a +body: | + ; GCN-LABEL: name: partial_forwarding_branching_1a + ; GCN: bb.0: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: S_BRANCH %bb.2 + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr30 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr31 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_BRANCH %bb.2 + ; GCN: bb.2: + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_WAITCNT_DEPCTR 4095 + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + bb.0: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + S_BRANCH %bb.2 + bb.1: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr30 = V_MOV_B32_e32 0, implicit $exec + $vgpr31 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.2 + bb.2: + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: partial_forwarding_branching_1b +body: | + ; GCN-LABEL: name: partial_forwarding_branching_1b + ; GCN: bb.0: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr30 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr31 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_BRANCH %bb.2 + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $exec = S_MOV_B64 -1 + ; GCN: S_BRANCH %bb.2 + ; GCN: bb.2: + ; GCN: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_WAITCNT_DEPCTR 4095 + ; GCN: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; GCN: S_ENDPGM 0 + bb.0: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr30 = V_MOV_B32_e32 0, implicit $exec + $vgpr31 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.2 + bb.1: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $exec = S_MOV_B64 -1 + S_BRANCH %bb.2 + bb.2: + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir b/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir @@ -0,0 +1,334 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: trans_use_1_hazard +body: | + bb.0: + ; GCN-LABEL: name: trans_use_1_hazard + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 4095 + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: trans_use_1_no_hazard_1 +body: | + bb.0: + ; GCN-LABEL: name: trans_use_1_no_hazard_1 + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 4095 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + S_WAITCNT_DEPCTR 4095 + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: trans_use_2_hazard +body: | + bb.0: + ; GCN-LABEL: name: trans_use_2_hazard + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr3 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr4 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr5 = S_MOV_B32 0 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr6 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr7 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr8 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr9 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr10 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 4095 + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2 = S_MOV_B32 0 + $sgpr3 = S_MOV_B32 0 + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + $sgpr8 = S_MOV_B32 0 + $sgpr9 = S_MOV_B32 0 + $sgpr10 = S_MOV_B32 0 + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: trans_use_3_hazard +body: | + bb.0: + ; GCN-LABEL: name: trans_use_3_hazard + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 4095 + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: trans_use_3_no_hazard_1 +body: | + bb.0: + ; GCN-LABEL: name: trans_use_3_no_hazard_1 + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: trans_use_3_no_hazard_2 +body: | + bb.0: + ; GCN-LABEL: name: trans_use_3_no_hazard_2 + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr11 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr15 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: trans_use_3_no_hazard_3 +body: | + bb.0: + ; GCN-LABEL: name: trans_use_3_no_hazard_3 + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = V_SQRT_F32_e32 $vgpr11, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr12 = V_SQRT_F32_e32 $vgpr13, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr10 = V_SQRT_F32_e32 $vgpr11, implicit $mode, implicit $exec + $vgpr12 = V_SQRT_F32_e32 $vgpr13, implicit $mode, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: trans_use_4_one_depctr_1 +body: | + bb.0: + ; GCN-LABEL: name: trans_use_4_one_depctr_1 + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 4095 + ; GCN-NEXT: $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr4, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr6, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr4, implicit $mode, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr6, implicit $mode, implicit $exec + S_ENDPGM 0 +... + +--- +name: trans_use_4_one_depctr_2 +body: | + bb.0: + ; GCN-LABEL: name: trans_use_4_one_depctr_2 + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 4095 + ; GCN-NEXT: $vgpr5 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr7 = V_ADD_F32_e32 $vgpr1, $vgpr6, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr1, $vgpr6, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
+
+# trans_use_4
+# V_SQRT_F32 defines $vgpr1. One more V_SQRT_F32 (writing $vgpr10) and one
+# V_MOV_B32 (writing $vgpr2) follow before V_ADD_F32 reads $vgpr1 and $vgpr2.
+# An S_WAITCNT_DEPCTR 4095 is expected directly before the V_ADD_F32.
+---
+name: trans_use_4
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_use_4
+    ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr10 = V_SQRT_F32_e32 $vgpr11, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: S_WAITCNT_DEPCTR 4095
+    ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr10 = V_SQRT_F32_e32 $vgpr11, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+# trans_use_branching_1a
+# Cross-block case. bb.0 holds the V_SQRT_F32 that defines $vgpr1 and
+# branches to bb.2. bb.1 holds five V_MOV_B32 (the first defines $vgpr2) and
+# also branches to bb.2. In bb.2 one V_MOV_B32 precedes the V_ADD_F32 that
+# reads $vgpr1 and $vgpr2. An S_WAITCNT_DEPCTR 4095 is expected before that
+# V_ADD_F32. Presumably the path through bb.0 is the one that triggers the
+# wait; confirm against fixVALUTransUseHazard.
+---
+name: trans_use_branching_1a
+body: |
+  ; GCN-LABEL: name: trans_use_branching_1a
+  ; GCN: bb.0:
+  ; GCN-NEXT: successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+  ; GCN-NEXT: S_BRANCH %bb.2
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: $vgpr30 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: $vgpr31 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: $vgpr32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: $vgpr33 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: S_BRANCH %bb.2
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: S_WAITCNT_DEPCTR 4095
+  ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+  ; GCN-NEXT: S_ENDPGM 0
+  bb.0:
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    S_BRANCH %bb.2
+  bb.1:
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr30 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr31 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr32 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr33 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+  bb.2:
+    $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+# trans_use_branching_1b
+# Variant of trans_use_branching_1a in which bb.1 has only two V_MOV_B32 and
+# already ends with an S_WAITCNT_DEPCTR 4095 in the input. bb.0 (the block
+# with the V_SQRT_F32 defining $vgpr1) carries no wait. The expected output
+# keeps the existing wait in bb.1 and additionally has an
+# S_WAITCNT_DEPCTR 4095 before the V_ADD_F32 in bb.2.
+---
+name: trans_use_branching_1b
+body: |
+  ; GCN-LABEL: name: trans_use_branching_1b
+  ; GCN: bb.0:
+  ; GCN-NEXT: successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+  ; GCN-NEXT: S_BRANCH %bb.2
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: $vgpr30 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: S_WAITCNT_DEPCTR 4095
+  ; GCN-NEXT: S_BRANCH %bb.2
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: S_WAITCNT_DEPCTR 4095
+  ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+  ; GCN-NEXT: S_ENDPGM 0
+  bb.0:
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    S_BRANCH %bb.2
+  bb.1:
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr30 = V_MOV_B32_e32 0, implicit $exec
+    S_WAITCNT_DEPCTR 4095
+    S_BRANCH %bb.2
+  bb.2:
+    $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+# trans_use_branching_1c_no_hazard_1
+# Cross-block case with the wait already present on the producer side. In
+# bb.0 the V_SQRT_F32 that defines $vgpr1 is followed directly by an
+# S_WAITCNT_DEPCTR 4095 in the input, and bb.1 holds two V_MOV_B32. The
+# expected output is identical to the input, so no further
+# S_WAITCNT_DEPCTR may be inserted before the V_ADD_F32 in bb.2.
+---
+name: trans_use_branching_1c_no_hazard_1
+body: |
+  ; GCN-LABEL: name: trans_use_branching_1c_no_hazard_1
+  ; GCN: bb.0:
+  ; GCN-NEXT: successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+  ; GCN-NEXT: S_WAITCNT_DEPCTR 4095
+  ; GCN-NEXT: S_BRANCH %bb.2
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: $vgpr30 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: S_BRANCH %bb.2
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+  ; GCN-NEXT: S_ENDPGM 0
+  bb.0:
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    S_WAITCNT_DEPCTR 4095
+    S_BRANCH %bb.2
+  bb.1:
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr30 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+  bb.2:
+    $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...