diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -299,6 +299,9 @@
 void initializeSIModeRegisterPass(PassRegistry&);
 extern char &SIModeRegisterID;
 
+void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
+extern char &AMDGPUInsertDelayAluID;
+
 void initializeSIInsertHardClausesPass(PassRegistry &);
 extern char &SIInsertHardClausesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -0,0 +1,485 @@
+//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SetVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-delay-alu"
+
+namespace {
+
+// Machine function pass that tracks which regunits were recently written by
+// VALU, TRANS or SALU instructions and inserts s_delay_alu instructions before
+// the instructions that read them.
+class AMDGPUInsertDelayAlu : public MachineFunctionPass {
+public:
+  static char ID;
+
+  const SIInstrInfo *SII;
+  const TargetRegisterInfo *TRI;
+
+  TargetSchedModel SchedModel;
+
+  AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  // Return true if MI waits for all outstanding VALU instructions to complete.
+  static bool instructionWaitsForVALU(const MachineInstr &MI) {
+    // These instruction types wait for VA_VDST==0 before issuing.
+    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
+                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
+                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
+    if (MI.getDesc().TSFlags & VA_VDST_0)
+      return true;
+    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
+        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
+      return true;
+    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        (MI.getOperand(0).getImm() & 0xf000) == 0)
+      return true;
+    return false;
+  }
+
+  // Types of delay that can be encoded in an s_delay_alu instruction.
+  enum DelayType { VALU, TRANS, SALU, OTHER };
+
+  // Get the delay type for an instruction with the specified TSFlags.
+  static DelayType getDelayType(uint64_t TSFlags) {
+    if (TSFlags & SIInstrFlags::TRANS)
+      return TRANS;
+    if (TSFlags & SIInstrFlags::VALU)
+      return VALU;
+    if (TSFlags & SIInstrFlags::SALU)
+      return SALU;
+    return OTHER;
+  }
+
+  // Information about the last instruction(s) that wrote to a particular
+  // regunit. In straight-line code there will only be one such instruction, but
+  // when control flow converges we merge the delay information from each path
+  // to represent the union of the worst-case delays of each type.
+  struct DelayInfo {
+    // One larger than the maximum number of (non-TRANS) VALU instructions we
+    // can encode in an s_delay_alu instruction.
+    static const unsigned VALU_MAX = 5;
+
+    // One larger than the maximum number of TRANS instructions we can encode in
+    // an s_delay_alu instruction.
+    static const unsigned TRANS_MAX = 4;
+
+    // One larger than the maximum number of SALU cycles we can encode in an
+    // s_delay_alu instruction.
+    static const unsigned SALU_CYCLES_MAX = 4;
+
+    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
+    // are left until it completes, and how many other (non-TRANS) VALU we have
+    // seen since it was issued.
+    uint8_t VALUCycles = 0;
+    uint8_t VALUNum = VALU_MAX;
+
+    // If it was written by a TRANS, remember how many clock cycles are left
+    // until it completes, and how many other TRANS we have seen since it was
+    // issued.
+    uint8_t TRANSCycles = 0;
+    uint8_t TRANSNum = TRANS_MAX;
+    // Also remember how many other (non-TRANS) VALU we have seen since it was
+    // issued. When an instruction depends on both a prior TRANS and a prior
+    // non-TRANS VALU, this is used to decide whether to encode a wait for just
+    // one or both of them.
+    uint8_t TRANSNumVALU = VALU_MAX;
+
+    // If it was written by an SALU, remember how many clock cycles are left
+    // until it completes.
+    uint8_t SALUCycles = 0;
+
+    DelayInfo() = default;
+
+    DelayInfo(DelayType Type, unsigned Cycles) {
+      switch (Type) {
+      default:
+        llvm_unreachable("unexpected type");
+      case VALU:
+        VALUCycles = Cycles;
+        VALUNum = 0;
+        break;
+      case TRANS:
+        TRANSCycles = Cycles;
+        TRANSNum = 0;
+        TRANSNumVALU = 0;
+        break;
+      case SALU:
+        // Guard against instructions that are marked as SALU but report a very
+        // high latency: clamp to the first value that cannot be encoded so the
+        // count still fits in a uint8_t.
+        SALUCycles = Cycles < SALU_CYCLES_MAX ? Cycles : SALU_CYCLES_MAX;
+        break;
+      }
+    }
+
+    bool operator==(const DelayInfo &RHS) const {
+      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
+             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
+             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
+    }
+
+    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
+
+    // Merge another DelayInfo into this one, to represent the union of the
+    // worst-case delays of each type.
+    void merge(const DelayInfo &RHS) {
+      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
+      VALUNum = std::min(VALUNum, RHS.VALUNum);
+      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
+      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
+      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
+      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
+    }
+
+    // Update this DelayInfo after issuing an instruction. Type is the delay
+    // type of the issued instruction (only VALU and TRANS bump the counters).
+    // Cycles is the number of cycles it takes to issue the instruction. Return
+    // true if there is no longer any useful delay info.
+    bool advance(DelayType Type, unsigned Cycles) {
+      bool Erase = true;
+
+      VALUNum += (Type == VALU);
+      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
+        // Forget about the VALU instruction. It was too far back or has
+        // definitely completed by now.
+        VALUNum = VALU_MAX;
+        VALUCycles = 0;
+      } else {
+        VALUCycles -= Cycles;
+        Erase = false;
+      }
+
+      TRANSNum += (Type == TRANS);
+      TRANSNumVALU += (Type == VALU);
+      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
+        // Forget about any TRANS instruction. It was too far back or has
+        // definitely completed by now.
+        TRANSNum = TRANS_MAX;
+        TRANSNumVALU = VALU_MAX;
+        TRANSCycles = 0;
+      } else {
+        TRANSCycles -= Cycles;
+        Erase = false;
+      }
+
+      if (SALUCycles <= Cycles) {
+        // Forget about any SALU instruction. It has definitely completed by
+        // now.
+        SALUCycles = 0;
+      } else {
+        SALUCycles -= Cycles;
+        Erase = false;
+      }
+
+      return Erase;
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump() const {
+      if (VALUCycles)
+        dbgs() << " VALUCycles=" << (int)VALUCycles;
+      if (VALUNum < VALU_MAX)
+        dbgs() << " VALUNum=" << (int)VALUNum;
+      if (TRANSCycles)
+        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
+      if (TRANSNum < TRANS_MAX)
+        dbgs() << " TRANSNum=" << (int)TRANSNum;
+      if (TRANSNumVALU < VALU_MAX)
+        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
+      if (SALUCycles)
+        dbgs() << " SALUCycles=" << (int)SALUCycles;
+    }
+#endif
+  };
+
+  // A map from regunits to the delay info for that regunit.
+  struct DelayState : DenseMap<unsigned, DelayInfo> {
+    // Merge another DelayState into this one by merging the delay info for each
+    // regunit.
+    void merge(const DelayState &RHS) {
+      for (const auto &KV : RHS) {
+        iterator It;
+        bool Inserted;
+        std::tie(It, Inserted) = insert(KV);
+        if (!Inserted)
+          It->second.merge(KV.second);
+      }
+    }
+
+    // Advance the delay info for each regunit, erasing any that are no longer
+    // useful.
+    void advance(DelayType Type, unsigned Cycles) {
+      iterator Next;
+      for (auto I = begin(), E = end(); I != E; I = Next) {
+        Next = std::next(I);
+        if (I->second.advance(Type, Cycles))
+          erase(I);
+      }
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump(const TargetRegisterInfo *TRI) const {
+      if (empty()) {
+        dbgs() << " empty\n";
+        return;
+      }
+
+      // Dump DelayInfo for each RegUnit in numerical order.
+      SmallVector<const_iterator, 8> Order;
+      Order.reserve(size());
+      for (const_iterator I = begin(), E = end(); I != E; ++I)
+        Order.push_back(I);
+      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
+        return A->first < B->first;
+      });
+      for (const_iterator I : Order) {
+        dbgs() << " " << printRegUnit(I->first, TRI);
+        I->second.dump();
+        dbgs() << "\n";
+      }
+    }
+#endif
+  };
+
+  // The saved delay state at the end of each basic block.
+  DenseMap<MachineBasicBlock *, DelayState> BlockState;
+
+  // Emit an s_delay_alu instruction if necessary before MI, or fold the delay
+  // into LastDelayAlu when that instruction still has a free slot. Changed is
+  // set to true if the function was modified. Return the s_delay_alu that can
+  // still take another delay, or nullptr if there is none.
+  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
+                             MachineInstr *LastDelayAlu, bool &Changed) {
+    unsigned Imm = 0;
+
+    // Wait for a TRANS instruction.
+    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
+      Imm |= 4 + Delay.TRANSNum;
+
+    // Wait for a VALU instruction (if it's more recent than any TRANS
+    // instruction that we're also waiting for).
+    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
+        Delay.VALUNum <= Delay.TRANSNumVALU) {
+      if (Imm & 0xf)
+        Imm |= Delay.VALUNum << 7;
+      else
+        Imm |= Delay.VALUNum;
+    }
+
+    // Wait for an SALU instruction.
+    if (Delay.SALUCycles) {
+      // The delay is encoded as SALUCycles + 8 in a 4-bit field, so never
+      // encode more than the largest cycle count that the field can express.
+      unsigned SALUCycles = Delay.SALUCycles;
+      if (SALUCycles >= DelayInfo::SALU_CYCLES_MAX)
+        SALUCycles = DelayInfo::SALU_CYCLES_MAX - 1;
+      if (Imm & 0x780) {
+        // We have already encoded a VALU and a TRANS delay. There's no room in
+        // the encoding for an SALU delay as well, so just drop it.
+      } else if (Imm & 0xf) {
+        Imm |= (SALUCycles + 8) << 7;
+      } else {
+        Imm |= SALUCycles + 8;
+      }
+    }
+
+    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
+    if (!Imm)
+      return LastDelayAlu;
+
+    // Everything below modifies the function, either by rewriting the operand
+    // of LastDelayAlu or by inserting a new instruction.
+    Changed = true;
+
+    // If we only need to wait for one instruction, try encoding it in the last
+    // s_delay_alu that we emitted.
+    if (!(Imm & 0x780) && LastDelayAlu) {
+      unsigned Skip = 0;
+      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
+                E = MachineBasicBlock::instr_iterator(MI);
+           ++I != E;) {
+        if (!I->isBundle() && !I->isMetaInstruction())
+          ++Skip;
+      }
+      if (Skip < 6) {
+        MachineOperand &Op = LastDelayAlu->getOperand(0);
+        unsigned LastImm = Op.getImm();
+        assert((LastImm & ~0xf) == 0 &&
+               "Remembered an s_delay_alu with no room for another delay!");
+        LastImm |= Imm << 7 | Skip << 4;
+        Op.setImm(LastImm);
+        return nullptr;
+      }
+    }
+
+    auto &MBB = *MI.getParent();
+    MachineInstr *DelayAlu =
+        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
+    // Remember the s_delay_alu for next time if there is still room in it to
+    // encode another delay.
+    return (Imm & 0x780) ? nullptr : DelayAlu;
+  }
+
+  // Compute the delay state at the end of MBB, starting from the merged saved
+  // states of its predecessors. If Emit is false, save the new state and return
+  // true if it differs from the previously saved one. If Emit is true, insert
+  // s_delay_alu instructions and return true if the function was modified.
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
+    DelayState State;
+    for (auto *Pred : MBB.predecessors())
+      State.merge(BlockState[Pred]);
+
+    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
+                      << "\n";
+               State.dump(TRI););
+
+    bool Changed = false;
+    MachineInstr *LastDelayAlu = nullptr;
+
+    // Iterate over the contents of bundles, but don't emit any instructions
+    // inside a bundle.
+    for (auto &MI : MBB.instrs()) {
+      if (MI.isBundle() || MI.isMetaInstruction())
+        continue;
+
+      // Ignore some more instructions that do not generate any code.
+      switch (MI.getOpcode()) {
+      case AMDGPU::SI_RETURN_TO_EPILOG:
+        continue;
+      }
+
+      DelayType Type = getDelayType(MI.getDesc().TSFlags);
+
+      if (instructionWaitsForVALU(MI)) {
+        // Forget about all outstanding VALU delays.
+        State = DelayState();
+      } else if (Type != OTHER) {
+        DelayInfo Delay;
+        // TODO: Scan implicit uses too?
+        for (const auto &Op : MI.explicit_uses()) {
+          if (Op.isReg()) {
+            // One of the operands of the writelane is also the output operand.
+            // This creates the insertion of redundant delays. Hence, we have to
+            // ignore this operand.
+            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
+              continue;
+            for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
+              auto It = State.find(*UI);
+              if (It != State.end()) {
+                Delay.merge(It->second);
+                State.erase(*UI);
+              }
+            }
+          }
+        }
+        if (Emit && !MI.isBundledWithPred()) {
+          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
+          // just ignore them?
+          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu, Changed);
+        }
+      }
+
+      if (Type != OTHER) {
+        // TODO: Scan implicit defs too?
+        for (const auto &Op : MI.defs()) {
+          unsigned Latency = SchedModel.computeOperandLatency(
+              &MI, MI.getOperandNo(&Op), nullptr, 0);
+          for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
+            State[*UI] = DelayInfo(Type, Latency);
+        }
+      }
+
+      // Advance by the number of cycles it takes to issue this instruction.
+      // TODO: Use a more advanced model that accounts for instructions that
+      // take multiple cycles to issue on a particular pipeline.
+      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
+      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
+      // instructions on the assumption that they will usually have to be issued
+      // twice?
+      State.advance(Type, Cycles);
+
+      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
+    }
+
+    if (Emit) {
+      assert(State == BlockState[&MBB] &&
+             "Basic block state should not have changed on final pass!");
+    } else if (State != BlockState[&MBB]) {
+      BlockState[&MBB] = std::move(State);
+      Changed = true;
+    }
+    return Changed;
+  }
+
+  // Run the pass on MF. Return true if any s_delay_alu instruction was
+  // inserted or updated.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
+                      << "\n");
+
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    if (!ST.hasDelayAlu())
+      return false;
+
+    SII = ST.getInstrInfo();
+    TRI = ST.getRegisterInfo();
+
+    SchedModel.init(&ST);
+
+    // Calculate the delay state for each basic block, iterating until we reach
+    // a fixed point.
+    SetVector<MachineBasicBlock *> WorkList;
+    for (auto &MBB : reverse(MF))
+      WorkList.insert(&MBB);
+    while (!WorkList.empty()) {
+      auto &MBB = *WorkList.pop_back_val();
+      bool Changed = runOnMachineBasicBlock(MBB, false);
+      if (Changed)
+        WorkList.insert(MBB.succ_begin(), MBB.succ_end());
+    }
+
+    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
+
+    // Make one last pass over all basic blocks to emit s_delay_alu
+    // instructions.
+    bool Changed = false;
+    for (auto &MBB : MF)
+      Changed |= runOnMachineBasicBlock(MBB, true);
+    return Changed;
+  }
+};
+
+} // namespace
+
+char AMDGPUInsertDelayAlu::ID = 0;
+
+char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
+
+INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
+                false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -272,6 +272,12 @@
   cl::init(true),
   cl::Hidden);
 
+// Enable GFX11+ s_delay_alu insertion
+static cl::opt<bool>
+    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
+                         cl::desc("Enable s_delay_alu insertion"),
+                         cl::init(true), cl::Hidden);
+
 // Option is used in lit tests to prevent deadcoding of patterns inspected.
 static cl::opt<bool>
 EnableDCEInRA("amdgpu-dce-in-ra",
@@ -363,6 +369,7 @@
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
+  initializeAMDGPUInsertDelayAluPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
   initializeSIModeRegisterPass(*PR);
@@ -1413,6 +1420,10 @@
   // Here we add a stand-alone hazard recognizer pass which can handle all
   // cases.
addPass(&PostRAHazardRecognizerID); + + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less)) + addPass(&AMDGPUInsertDelayAluID); + addPass(&BranchRelaxationPassID); } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -57,6 +57,7 @@ AMDGPUFrameLowering.cpp AMDGPUGlobalISelUtils.cpp AMDGPUHSAMetadataStreamer.cpp + AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti 
-o - %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10NSA %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -589,6 +589,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX11-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x 
float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -12,9 +12,11 @@ ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 ; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 ; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done ; GCN-NEXT: s_endpgm @@ -42,13 +44,16 @@ ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 ; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 ; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done ; GCN-NEXT: s_endpgm @@ -86,8 +91,10 @@ ; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 ; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done @@ -123,9 +130,11 @@ ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 ; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GCN-NEXT: v_add_f16_e32 v0, v3, v0 ; GCN-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -68,8 +68,10 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v7, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v8, v11 ; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v10, v9 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 @@ -133,8 +135,10 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v7 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v8, 0xffff, v9, v12 ; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v11, v10 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 @@ -235,8 +239,10 @@ ; GFX11-NEXT: v_readfirstlane_b32 s5, v12 ; GFX11-NEXT: v_readfirstlane_b32 s6, v13 ; GFX11-NEXT: v_readfirstlane_b32 s7, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7] @@ -359,8 +365,10 @@ ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v13, v14, v[15:17], v[4:6]], s[4:7] a16 @@ -474,8 +482,10 @@ ; GFX11-NEXT: v_readfirstlane_b32 s5, v13 ; GFX11-NEXT: v_readfirstlane_b32 s6, v14 ; GFX11-NEXT: v_readfirstlane_b32 s7, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7] @@ -605,8 +615,10 @@ ; GFX11-NEXT: v_readfirstlane_b32 s5, v11 ; GFX11-NEXT: v_readfirstlane_b32 s6, v12 ; GFX11-NEXT: v_readfirstlane_b32 s7, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[14:15], v16, v[17:19], v[4:6]], s[4:7] a16 @@ -984,6 +996,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0xb36211c7 ; GFX11-NEXT: s_movk_i32 s5, 0x102 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v10, s5 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo @@ -1123,6 +1136,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0xb36211c6 ; GFX11-NEXT: s_movk_i32 s5, 0x102 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v7, s5 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | 
FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-LABEL: v_saddsat_i7: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-LABEL: v_ssubsat_i7: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-LABEL: v_uaddsat_i7: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -3,7 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-LABEL: v_usubsat_i7: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -173,6 +173,7 @@ ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 @@ -180,6 +181,7 @@ ; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_mul_i32 s2, s2, 5 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 @@ -190,6 +192,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -203,12 +206,14 @@ ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 @@ -219,6 +224,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -404,6 +410,7 @@ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB1_2 @@ -412,6 +419,7 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 @@ -423,6 +431,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 ; 
GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1] ; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 @@ -437,6 +446,7 @@ ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: @@ -444,6 +454,7 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 @@ -456,6 +467,7 @@ ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] ; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1132-NEXT: s_endpgm @@ -700,17 +712,23 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -719,12 +737,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -747,6 +767,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -760,25 +781,31 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -796,6 +823,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -962,20 +990,27 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 
exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 @@ -997,15 +1032,20 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 @@ -1196,6 +1236,7 @@ ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 @@ -1204,6 +1245,7 @@ ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1214,6 +1256,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 @@ -1228,12 +1271,14 @@ ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: 
v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1244,6 +1289,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 @@ -1461,6 +1507,7 @@ ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 @@ -1485,10 +1532,12 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm @@ -1500,6 +1549,7 @@ ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-NEXT: ; %bb.1: @@ -1522,10 +1572,12 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm @@ -1775,6 +1827,7 @@ ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB7_2 @@ -1782,6 +1835,7 @@ ; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_mul_i32 s2, s2, 5 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1793,6 +1847,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,12 +1861,14 @@ ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; 
%bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1823,6 +1880,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -2010,6 +2068,7 @@ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_2 @@ -2018,6 +2077,7 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2031,6 +2091,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1164-NEXT: s_endpgm @@ -2044,6 +2105,7 @@ ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132-NEXT: ; %bb.1: @@ -2051,6 +2113,7 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2064,6 +2127,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1132-NEXT: s_endpgm @@ -2308,17 +2372,23 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -2327,12 +2397,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -2355,6 +2427,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -2368,25 +2441,31 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -2404,6 +2483,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -2570,20 +2650,27 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 @@ -2605,15 +2692,20 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 @@ -2812,6 +2904,7 @@ ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 @@ -2820,6 +2913,7 @@ ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2832,6 +2926,7 @@ ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -2847,12 +2942,14 @@ ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2865,6 +2962,7 @@ ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 
s3, v1 ; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -3092,6 +3190,7 @@ ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 @@ -3122,6 +3221,7 @@ ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v5 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -3134,6 +3234,7 @@ ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: @@ -3157,10 +3258,12 @@ ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v5 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1132-NEXT: buffer_store_b64 
v[0:1], off, s[0:3], 0 @@ -3482,17 +3585,23 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -3501,12 +3610,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -3529,6 +3640,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -3542,25 +3654,31 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -3578,6 +3696,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -3825,17 +3944,23 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -3844,12 +3969,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -3872,6 +3999,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -3885,25 +4013,31 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, 
v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -3921,6 +4055,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -4168,17 +4303,23 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -4187,12 +4328,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -4215,6 +4358,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -4228,25 +4372,31 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -4264,6 +4414,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ 
-4511,17 +4662,23 @@ ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -4530,12 +4687,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -4558,6 +4717,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -4571,25 +4731,31 @@ ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: 
s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -4607,6 +4773,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -4797,6 +4964,7 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4817,6 +4985,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc @@ -4830,6 +4999,7 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -4849,6 +5019,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, 
s3, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo @@ -5098,17 +5269,23 @@ ; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -5117,12 +5294,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, 
s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -5145,6 +5324,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -5158,25 +5338,31 @@ ; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; 
GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -5194,6 +5380,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -5384,6 +5571,7 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5404,6 +5592,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc @@ -5417,6 +5606,7 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -5436,6 +5626,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_lt_i64_e32 
vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo @@ -5685,17 +5876,23 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -5704,12 +5901,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; 
GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -5732,6 +5931,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -5745,25 +5945,31 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -5781,6 +5987,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -5968,6 +6175,7 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5988,6 +6196,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc @@ -6001,6 +6210,7 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -6020,6 +6230,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: 
v_cndmask_b32_e64 v0, 5, 0, vcc_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo @@ -6269,17 +6480,23 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -6288,12 +6505,14 @@ ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: 
s_or_saveexec_b64 s[2:3], -1 ; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 @@ -6316,6 +6535,7 @@ ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6329,25 +6549,31 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -6365,6 +6591,7 @@ ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6552,6 +6779,7 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6572,6 +6800,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc @@ -6585,6 +6814,7 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -6604,6 +6834,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; 
GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -476,8 +476,10 @@ ; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v10, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8 ; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v7, v4 diff --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll --- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll +++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll @@ -14,24 +14,31 @@ ; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15 ; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7 ; GCN-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6 ; GCN-NEXT: v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7 ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6] ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GCN-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6] ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: exp dual_src_blend0 v3, v2, off, off diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -67,6 +67,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc @@ -154,6 +155,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc @@ -241,6 +243,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc @@ -311,6 +314,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -327,6 +331,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 @@ -401,6 +406,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -418,6 +424,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 @@ -492,6 +499,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc ; GFX11-SDAG-NEXT: 
s_waitcnt_vscnt null, 0x0 @@ -509,6 +517,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 @@ -580,6 +589,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc @@ -597,6 +607,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 @@ -671,6 +682,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc @@ -689,6 +701,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 @@ -763,6 +776,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -780,6 +794,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -54,6 +54,7 @@ ; GFX11-LABEL: zero_init_kernel: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -169,6 +170,7 @@ ; GFX11-PAL-LABEL: zero_init_kernel: ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -231,6 +233,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -304,6 +307,7 @@ ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -681,6 +685,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -743,6 +748,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -810,6 +816,7 @@ ; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -865,6 +872,7 @@ ; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1021,6 +1029,7 @@ ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -1148,6 +1157,7 @@ ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -1219,6 +1229,7 @@ ; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; 
GFX11-NEXT: s_mov_b32 s3, s0 @@ -1300,6 +1311,7 @@ ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -4217,6 +4229,7 @@ ; GFX11-LABEL: large_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4317,6 +4330,7 @@ ; GFX11-PAL-LABEL: large_offset: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir @@ -0,0 +1,561 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s + +--- +name: valu_dep_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... 
+ +--- +name: valu_dep_2 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_2: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: valu_dep_3 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_3: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: valu_dep_4 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_4: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec + $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +# There's no encoding for VALU_DEP_5. A normal VALU instruction will have +# completed already. 
+--- +name: valu_dep_5 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_5: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 + ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec + $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec + $vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: trans32_dep_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}trans32_dep_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: trans32_dep_2 +body: | + bb.0: + ; CHECK-LABEL: {{^}}trans32_dep_2: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode + $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... 
+ +--- +name: trans32_dep_3 +body: | + bb.0: + ; CHECK-LABEL: {{^}}trans32_dep_3: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: v_exp_f32_e32 v2, v2 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode + $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode + $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +# There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have +# completed already. +--- +name: trans32_dep_4 +body: | + bb.0: + ; CHECK-LABEL: {{^}}trans32_dep_4: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: v_exp_f32_e32 v2, v2 + ; CHECK-NEXT: v_exp_f32_e32 v3, v3 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode + $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode + $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode + $vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: salu_cycle_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}salu_cycle_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: s_mov_b32 s0, 0 + ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 + $sgpr0 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec +... + +# There's no need for SALU_CYCLE_2 here because the s_mov will have completed +# already. 
+--- +name: salu_cycle_2 +body: | + bb.0: + ; CHECK-LABEL: {{^}}salu_cycle_2: + ; CHECK: %bb.0: + ; CHECK-NEXT: s_mov_b32 s0, 0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 + $sgpr0 = S_MOV_B32 0 + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec +... + +--- +name: valu_dep_1_same_trans32_dep_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 + $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec +... + +# There's no need to encode the VALU dependency because it will complete before +# the TRANS. +--- +name: trans32_dep_1_only +body: | + bb.0: + ; CHECK-LABEL: {{^}}trans32_dep_1_only: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec +... + +--- +name: valu_dep_1_same_salu_cycle_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_mov_b32 s0, 0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec +...
+ +--- +name: valu_dep_1_next_valu_dep_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: valu_dep_2_next_valu_dep_2 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec +... + +# There's no need to encode a dependency for the second mul, because the +# dependency for the first mul has already guaranteed that the add has +# completed. +--- +name: valu_dep_1_no_next_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_1_no_next_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0 + ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0 + $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode + $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode + $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode +... + +# There's no need to encode a dependency for the second add, because the +# dependency for the second mul has already guaranteed that a later VALU has +# completed. 
+--- +name: valu_dep_1_no_next_2 +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_1_no_next_2: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 + ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 + $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode + $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode + $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode + $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode +... + +# There are no wait states between an add/sub/cmp generating carry and an +# add/sub/cndmask that consumes it, so no need to encode a dependency. + +--- +name: implicit_cmp_cndmask +body: | + bb.0: + ; CHECK-LABEL: {{^}}implicit_cmp_cndmask: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1 + ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc + implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec +... + +# TODO: There should be no s_delay_alu here. +--- +name: explicit_cmp_cndmask +body: | + bb.0: + ; CHECK-LABEL: {{^}}explicit_cmp_cndmask: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] + $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec +... + +--- +name: implicit_addc_addc +body: | + bb.0: + ; CHECK-LABEL: {{^}}implicit_addc_addc: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc + ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc + $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec +... 
+ +--- +name: explicit_addc_addc +body: | + bb.0: + ; CHECK-LABEL: {{^}}explicit_addc_addc: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0 + ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc + $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec + $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec +... + +--- +name: valu_dep_3_bundle +body: | + bb.0: + ; CHECK-LABEL: {{^}}valu_dep_3_bundle: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + BUNDLE { + $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec + } + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: if +body: | + bb.0: + ; CHECK-LABEL: {{^}}if: + ; CHECK: %bb.0: + ; CHECK-NEXT: s_cbranch_vccz .LBB23_2 + ; CHECK-NEXT: %bb.1: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: .LBB23_2: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + S_CBRANCH_VCCZ %bb.2, implicit $vcc + bb.1: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + bb.2: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: else +body: | + bb.0: + ; CHECK-LABEL: {{^}}else: + ; CHECK: %bb.0: + ; CHECK-NEXT: s_cbranch_vccz .LBB24_2 + ; CHECK-NEXT: %bb.1 + ; CHECK-NEXT: s_branch .LBB24_3 + ; CHECK-NEXT: .LBB24_2: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: .LBB24_3: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + S_CBRANCH_VCCZ %bb.2, implicit $vcc + bb.1: + S_BRANCH %bb.3 + bb.2: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + bb.3: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... 
+ +--- +name: if_else +body: | + bb.0: + ; CHECK-LABEL: {{^}}if_else: + ; CHECK: %bb.0: + ; CHECK-NEXT: s_cbranch_vccz .LBB25_2 + ; CHECK-NEXT: %bb.1: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_branch .LBB25_3 + ; CHECK-NEXT: .LBB25_2: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1 + ; CHECK-NEXT: .LBB25_3: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + S_CBRANCH_VCCZ %bb.2, implicit $vcc + bb.1: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + S_BRANCH %bb.3 + bb.2: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + bb.3: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +# Dependency from outside the loop. +--- +name: loop_1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}loop_1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: .LBB26_1: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0 + ; CHECK-NEXT: s_cbranch_vccz .LBB26_1 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + bb.1: + $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + S_CBRANCH_VCCZ %bb.1, implicit $vcc + bb.2: +... + +# Dependency from inside the loop. +--- +name: loop_2 +body: | + bb.0: + ; CHECK-LABEL: {{^}}loop_2: + ; CHECK: %bb.0: + ; CHECK-NEXT: .LBB27_1: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_cbranch_vccz .LBB27_1 + bb.1: + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + S_CBRANCH_VCCZ %bb.1, implicit $vcc + bb.2: +... + +# No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU +# to complete. 
+--- +name: sendmsg_rtn +body: | + bb.0: + ; CHECK-LABEL: {{^}}sendmsg_rtn: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_mov_b32_e32 v0, 0 + ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) + ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) + ; CHECK-NEXT: s_add_u32 s0, s0, s0 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_SENDMSG_RTN_B32 128 + $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +# No VALU delay before or across FLAT because it waits for all outstanding VALU +# to complete. +--- +name: flat_load +body: | + bb.0: + ; CHECK-LABEL: {{^}}flat_load: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_mov_b32_e32 v0, 0 + ; CHECK-NEXT: v_mov_b32_e32 v1, 0 + ; CHECK-NEXT: v_mov_b32_e32 v2, 0 + ; CHECK-NEXT: flat_load_b32 v0, v[0:1] + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec +... + +# No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU +# to complete. +--- +name: waitcnt_depctr +body: | + bb.0: + ; CHECK-LABEL: {{^}}waitcnt_depctr: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_mov_b32_e32 v0, 0 + ; CHECK-NEXT: s_waitcnt_depctr 0xfff + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT_DEPCTR 4095 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... + +# Check that no delays are emitted for writelane instructions. 
+--- +name: writelane1 +body: | + bb.0: + ; CHECK-LABEL: {{^}}writelane1: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_writelane_b32 v0, s0, 0 + ; CHECK-NEXT: v_writelane_b32 v0, s0, 1 + ; CHECK-NEXT: v_writelane_b32 v0, s0, 2 + ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 + $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0 + $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0 + $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0 + $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0 +... + +# Check if a VALU delay is added after writelane. +--- +name: writelane2 +body: | + bb.0: + ; CHECK-LABEL: {{^}}writelane2: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -388,6 +388,7 @@ ; GCN-O1-NEXT: SI Final Branch Preparation ; GCN-O1-NEXT: SI peephole optimizations ; GCN-O1-NEXT: Post RA hazard recognizer +; GCN-O1-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-NEXT: Branch relaxation pass ; GCN-O1-NEXT: Register Usage Information Collector Pass ; GCN-O1-NEXT: Live DEBUG_VALUE analysis @@ -676,6 +677,7 @@ ; GCN-O1-OPTS-NEXT: SI Final Branch Preparation ; GCN-O1-OPTS-NEXT: SI peephole optimizations ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer +; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-OPTS-NEXT: Branch relaxation pass ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis @@ -966,6 +968,7 @@ ; GCN-O2-NEXT: SI Final Branch Preparation ; GCN-O2-NEXT: SI peephole optimizations ; GCN-O2-NEXT: Post RA hazard recognizer +; GCN-O2-NEXT: AMDGPU Insert Delay ALU ; GCN-O2-NEXT: Branch relaxation pass ; GCN-O2-NEXT: Register Usage Information Collector Pass ; GCN-O2-NEXT: Live 
DEBUG_VALUE analysis @@ -1268,6 +1271,7 @@ ; GCN-O3-NEXT: SI Final Branch Preparation ; GCN-O3-NEXT: SI peephole optimizations ; GCN-O3-NEXT: Post RA hazard recognizer +; GCN-O3-NEXT: AMDGPU Insert Delay ALU ; GCN-O3-NEXT: Branch relaxation pass ; GCN-O3-NEXT: Register Usage Information Collector Pass ; GCN-O3-NEXT: Live DEBUG_VALUE analysis diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -86,6 +86,7 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 m0, s0 ; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -15,6 +15,7 @@ ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm @@ -43,6 +44,7 @@ ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) { ; VERDE-LABEL: load_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_2d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn 
-mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { ; GFX9-LABEL: sample_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; TONGA-LABEL: image_sample_2d_f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX6789 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { ; VERDE-LABEL: sample_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -34,6 +34,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] ; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00] @@ -62,8 +63,10 @@ ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf] ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: 
[0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] ; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf] ; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] ; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] @@ -105,6 +108,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04] ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04] ; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06] @@ -147,6 +151,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] ; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: 
[0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06] @@ -193,6 +198,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04] ; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04] ; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08] @@ -226,6 +232,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] ; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] ; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08] @@ -259,6 +266,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX11-NEXT: v_lshl_or_b32 v5, 
v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] ; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] ; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -12,9 +12,11 @@ ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 ; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 ; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done ; GCN-NEXT: s_endpgm @@ -42,13 +44,16 @@ ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 
v4, s0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 ; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 ; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done ; GCN-NEXT: s_endpgm @@ -86,8 +91,10 @@ ; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 ; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done @@ -123,9 +130,11 @@ ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 ; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GCN-NEXT: v_add_f16_e32 v0, v3, v0 ; GCN-NEXT: ; return to shader part epilog diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -233,6 +233,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 @@ -325,6 +326,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 @@ -428,6 +430,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v10, 0x102 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: flat_load_b32 v11, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 @@ -515,6 +518,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v7, 0x102 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -14,6 +14,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm @@ -28,6 +29,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x63 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -83,20 +83,25 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v5, v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v8, v1 ; GFX11-NEXT: v_mul_lo_u32 v5, v5, v2 ; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6 ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v1, v1, v4, v5 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 
v2, 0, 1, vcc_lo @@ -223,31 +228,40 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0 ; GFX11-NEXT: v_mad_i64_i32 v[11:12], null, v5, v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v8, v1 ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6 ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo ; GFX11-NEXT: v_mul_lo_u32 v8, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v9 ; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo ; GFX11-NEXT: v_mul_lo_u32 v9, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11 ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2 ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 ; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo ; 
GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] @@ -372,6 +386,7 @@ ; GFX11-NEXT: s_add_i32 s1, s1, s6 ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off @@ -548,8 +563,10 @@ ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_lt_i32 s3, 0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo ; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_add_i32 s1, s8, s7 @@ -558,7 +575,9 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo @@ -617,9 +636,11 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] ; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5] ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -677,8 +698,10 @@ ; GFX11-NEXT: v_mov_b32_e32 v6, v0 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] ; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -34,6 +34,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 @@ -71,6 +72,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 @@ -108,6 +110,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %sext0 = zext i32 %arg0 to i64 @@ -145,6 +148,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] 
%sext0 = zext i32 %arg0 to i64 @@ -248,22 +252,29 @@ ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0 ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8] ; GFX11-NEXT: v_mov_b32_e32 v7, v10 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10] ; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10] ; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8] ; GFX11-NEXT: v_mov_b32_e32 v7, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12 ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i128 @@ -301,6 +312,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i63 @@ -346,6 +358,7 @@ ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_bfe_i32 v4, v1, 0, 31 ; GFX11-NEXT: v_bfe_i32 v5, v0, 0, 31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i31 %arg0 to i63 @@ -394,9 +407,11 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] ; GFX11-NEXT: s_setpc_b64 s[30:31] %ext0 = sext i32 %arg0 to i64 @@ -433,6 +448,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 @@ -481,8 +497,10 @@ ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v5, 1, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -532,9 +550,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v4, 1, v3 ; 
GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] ; GFX11-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 @@ -571,6 +591,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl.lhs = shl i64 %arg0, 32 @@ -610,6 +631,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -731,6 +753,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3] ; GFX11-NEXT: v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_xor_b32_e32 v0, v6, v2 ; GFX11-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -794,14 +817,17 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_i64_i32 v[8:9], null, v0, v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, 
v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -852,8 +878,10 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_i64_i32 v[4:5], null, v0, v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -908,9 +936,11 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5] ; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3 ; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %m = mul i48 %arg0, %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck 
--check-prefixes=GCN,GFX11 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GFX9-LABEL: mad_i32_vvv: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -269,6 +269,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc @@ -283,6 +284,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc @@ -561,6 +563,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 @@ -575,6 +578,7 @@ ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -165,6 +165,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc @@ -180,6 +181,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc @@ -359,6 +361,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 @@ -374,6 +377,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0