Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h @@ -221,6 +221,9 @@ void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; +void initializeGCNRegBankReassignPass(PassRegistry &); +extern char &GCNRegBankReassignID; + void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -234,6 +234,7 @@ initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); + initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); } @@ -937,6 +938,7 @@ bool GCNPassConfig::addPreRewrite() { if (EnableRegReassign) { addPass(&GCNNSAReassignID); + addPass(&GCNRegBankReassignID); } return true; } Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt @@ -116,6 +116,7 @@ SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp + GCNRegBankReassign.cpp GCNNSAReassign.cpp GCNDPPCombine.cpp SIModeRegister.cpp Index: llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -0,0 +1,797 @@ +//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Try to reassign registers on GFX10+ to reduce register bank +/// conflicts. +/// +/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in +/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to +/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, +/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. +/// +/// The shader can read one dword from each of these banks once per cycle. +/// If an instruction has to read more register operands from the same bank +/// an additional cycle is needed. HW attempts to pre-load registers through +/// input operand gathering, but a stall cycle may occur if that fails. For +/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, +/// potentially incuring 2 stall cycles. +/// +/// The pass tries to reassign registers to reduce bank conflicts. +/// +/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so +/// that 4 has to be subtracted from an SGPR bank number to get the real value. +/// This also corresponds to bit numbers in bank masks used in the pass. 
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +static cl::opt VerifyStallCycles("amdgpu-verify-regbanks-reassign", + cl::desc("Verify stall cycles in the regbanks reassign pass"), + cl::value_desc("0|1|2"), + cl::init(0), cl::Hidden); + +#define DEBUG_TYPE "amdgpu-regbanks-reassign" + +#define NUM_VGPR_BANKS 4 +#define NUM_SGPR_BANKS 8 +#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) +#define SGPR_BANK_OFFSET NUM_VGPR_BANKS +#define VGPR_BANK_MASK 0xf +#define SGPR_BANK_MASK 0xff0 +#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) + +STATISTIC(NumStallsDetected, + "Number of operand read stalls detected"); +STATISTIC(NumStallsRecovered, + "Number of operand read stalls recovered"); + +namespace { + +class GCNRegBankReassign : public MachineFunctionPass { + + class OperandMask { + public: + OperandMask(unsigned r, unsigned s, unsigned m) + : Reg(r), SubReg(s), Mask(m) {} + unsigned Reg; + unsigned SubReg; + unsigned Mask; + }; + + class Candidate { + public: + Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, + unsigned weight) + : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} + + bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(const GCNRegBankReassign *P) const { + MI->dump(); + dbgs() << P->printReg(Reg) << " to banks "; + dumpFreeBanks(FreeBanks); + dbgs() 
<< " weight " << Weight << '\n'; + } +#endif + + MachineInstr *MI; + unsigned Reg; + unsigned FreeBanks; + unsigned Weight; + }; + + class CandidateList : public std::list { + public: + // Speedup subsequent sort. + void push(const Candidate&& C) { + if (C.Weight) push_back(C); + else push_front(C); + } + }; + +public: + static char ID; + +public: + GCNRegBankReassign() : MachineFunctionPass(ID) { + initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN RegBank Reassign"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + const GCNSubtarget *ST; + + const MachineRegisterInfo *MRI; + + const SIRegisterInfo *TRI; + + MachineLoopInfo *MLI; + + VirtRegMap *VRM; + + LiveRegMatrix *LRM; + + LiveIntervals *LIS; + + unsigned MaxNumVGPRs; + + unsigned MaxNumSGPRs; + + BitVector RegsUsed; + + SmallVector OperandMasks; + + CandidateList Candidates; + + const MCPhysReg *CSRegs; + + // Returns bank for a phys reg. + unsigned getPhysRegBank(unsigned Reg) const; + + // Return a bit set for each register bank used. 4 banks for VGPRs and + // 8 banks for SGPRs. + // Registers already processed and recorded in RegsUsed are excluded. + // If Bank is not -1 assume Reg:SubReg to belong to that Bank. + unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); + + // Return number of stalls in the instructions. + // UsedBanks has bits set for the banks used by all operands. + // If Reg and Bank provided substitute the Reg with the Bank. + unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, + unsigned Reg = AMDGPU::NoRegister, int Bank = -1); + + // Return true if register is regular VGPR or SGPR or their tuples. 
+ // Returns false for special registers like m0, vcc etc. + bool isReassignable(unsigned Reg) const; + + // Check if registers' defs are old and may be pre-loaded. + // Returns 0 if both registers are old enough, 1 or 2 if one or both + // registers will not likely be pre-loaded. + unsigned getOperandGatherWeight(const MachineInstr& MI, + unsigned Reg1, + unsigned Reg2, + unsigned StallCycles) const; + + + // Find all bank bits in UsedBanks where Mask can be relocated to. + unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; + + // Find all bank bits in UsedBanks where Mask can be relocated to. + // Bank is relative to the register and not its subregister component. + // Returns 0 is a register is not reassignable. + unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, + unsigned UsedBanks) const; + + // Add cadidate instruction to the work list. + void collectCandidates(MachineInstr& MI, unsigned UsedBanks, + unsigned StallCycles); + + // Collect cadidate instructions across function. Returns a number stall + // cycles detected. Only counts stalls if Collect is false. + unsigned collectCandidates(MachineFunction &MF, bool Collect = true); + + // Remove all candidates that read specified register. + void removeCandidates(unsigned Reg); + + // Compute stalls within the uses of SrcReg replaced by a register from + // Bank. If Bank is -1 does not perform substitution. If Collect is set + // candidates are collected and added to work list. + unsigned computeStallCycles(unsigned SrcReg, + unsigned Reg = AMDGPU::NoRegister, + int Bank = -1, bool Collect = false); + + // Search for a register in Bank unused within LI. + // Returns phys reg or NoRegister. + unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; + + // Try to reassign candidate. Returns number or stall cycles saved. 
+ unsigned tryReassign(Candidate &C); + + bool verifyCycles(MachineFunction &MF, + unsigned OriginalCycles, unsigned CyclesSaved); + + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +public: + Printable printReg(unsigned Reg, unsigned SubReg = 0) const { + return Printable([Reg, SubReg, this](raw_ostream &OS) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + OS << llvm::printReg(Reg, TRI); + return; + } + if (!VRM->isAssignedReg(Reg)) + OS << " " << llvm::printReg(Reg, TRI); + else + OS << llvm::printReg(Reg, TRI) << '(' + << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; + if (SubReg) + OS << ':' << TRI->getSubRegIndexName(SubReg); + }); + } + + static Printable printBank(unsigned Bank) { + return Printable([Bank](raw_ostream &OS) { + OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); + }); + } + + static void dumpFreeBanks(unsigned FreeBanks) { + for (unsigned L = 0; L < NUM_BANKS; ++L) + if (FreeBanks & (1 << L)) + dbgs() << printBank(L) << ' '; + } +#endif +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", + false, false) + + +char GCNRegBankReassign::ID = 0; + +char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; + +unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { + assert (TargetRegisterInfo::isPhysicalRegister(Reg)); + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + unsigned Size = TRI->getRegSizeInBits(*RC); + if (Size > 32) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + + if (TRI->hasVGPRs(RC)) { + Reg -= AMDGPU::VGPR0; + return Reg % NUM_VGPR_BANKS; + } + + Reg = TRI->getEncodingValue(Reg) / 2; + return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; +} + +unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, + int Bank) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!VRM->isAssignedReg(Reg)) + return 0; + + Reg = VRM->getPhys(Reg); + if (!Reg) + return 0; + if (SubReg) + Reg = TRI->getSubReg(Reg, SubReg); + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + unsigned Size = TRI->getRegSizeInBits(*RC) / 32; + if (Size > 1) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + + if (TRI->hasVGPRs(RC)) { + // VGPRs have 4 banks assigned in a round-robin fashion. + Reg -= AMDGPU::VGPR0; + unsigned Mask = (1 << Size) - 1; + unsigned Used = 0; + // Bitmask lacks an extract method + for (unsigned I = 0; I < Size; ++I) + if (RegsUsed.test(Reg + I)) + Used |= 1 << I; + RegsUsed.set(Reg, Reg + Size); + Mask &= ~Used; + Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank); + return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; + } + + // SGPRs have 8 banks holding 2 consequitive registers each. 
+ Reg = TRI->getEncodingValue(Reg) / 2; + unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); + if (Reg + StartBit >= RegsUsed.size()) + return 0; + + if (Size > 1) + Size /= 2; + unsigned Mask = (1 << Size) - 1; + unsigned Used = 0; + for (unsigned I = 0; I < Size; ++I) + if (RegsUsed.test(StartBit + Reg + I)) + Used |= 1 << I; + RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); + Mask &= ~Used; + Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS + : unsigned(Bank - SGPR_BANK_OFFSET); + Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; + // Reserve 4 bank ids for VGPRs. + return Mask << SGPR_BANK_OFFSET; +} + +unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, + unsigned& UsedBanks, + unsigned Reg, + int Bank) { + unsigned StallCycles = 0; + UsedBanks = 0; + + if (MI.isDebugValue()) + return 0; + + RegsUsed.reset(); + OperandMasks.clear(); + for (const auto& Op : MI.explicit_uses()) { + // Undef can be assigned to any register, so two vregs can be assigned + // the same phys reg within the same instruction. + if (!Op.isReg() || Op.isUndef()) + continue; + + unsigned R = Op.getReg(); + unsigned ShiftedBank = Bank; + + if (Bank != -1 && R == Reg && Op.getSubReg()) { + unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); + if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) { + // If a register spans all banks we cannot shift it to avoid conflict. + if (countPopulation(LM) >= NUM_VGPR_BANKS) + continue; + ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; + } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { + // If a register spans all banks we cannot shift it to avoid conflict. + if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) + continue; + ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + + (countTrailingZeros(LM) >> 1)) % + NUM_SGPR_BANKS; + } + } + + unsigned Mask = getRegBankMask(R, Op.getSubReg(), + (Reg == R) ? 
ShiftedBank : -1); + StallCycles += countPopulation(UsedBanks & Mask); + UsedBanks |= Mask; + OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); + } + + return StallCycles; +} + +unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, + unsigned Reg1, + unsigned Reg2, + unsigned StallCycles) const +{ + unsigned Defs = 0; + MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); + MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); + for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { + if (MI.isDebugInstr()) + continue; + --Def; + if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) + continue; + if (Def->modifiesRegister(Reg1, TRI)) + Defs |= 1; + if (Def->modifiesRegister(Reg2, TRI)) + Defs |= 2; + } + return countPopulation(Defs); +} + +bool GCNRegBankReassign::isReassignable(unsigned Reg) const { + if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + return false; + + const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); + + unsigned PhysReg = VRM->getPhys(Reg); + + if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) + return false; + + for (auto U : MRI->use_nodbg_operands(Reg)) { + if (U.isImplicit()) + return false; + const MachineInstr *UseInst = U.getParent(); + if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) + return false; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); + if (TRI->hasVGPRs(RC)) + return true; + + unsigned Size = TRI->getRegSizeInBits(*RC); + if (Size > 32) + PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); + + return AMDGPU::SGPR_32RegClass.contains(PhysReg); +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, + unsigned UsedBanks) const { + unsigned Size = countPopulation(Mask); + unsigned FreeBanks = 0; + unsigned Bank = findFirstSet(Mask); + + UsedBanks &= ~Mask; + + // Find free VGPR banks + if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { + 
for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { + if (Bank == I) + continue; + unsigned NewMask = ((1 << Size) - 1) << I; + NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; + if (!(UsedBanks & NewMask)) + FreeBanks |= 1 << I; + } + return FreeBanks; + } + + // Find free SGPR banks + // SGPR tuples must be aligned, so step is size in banks it + // crosses. + Bank -= SGPR_BANK_OFFSET; + for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { + if (Bank == I) + continue; + unsigned NewMask = ((1 << Size) - 1) << I; + NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; + if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) + FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; + } + + return FreeBanks; +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, + unsigned SubReg, + unsigned Mask, + unsigned UsedBanks) const { + if (!isReassignable(Reg)) + return 0; + + unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); + + unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); + if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { + unsigned Shift = countTrailingZeros(LM); + if (Shift >= NUM_VGPR_BANKS) + return 0; + unsigned VB = FreeBanks & VGPR_BANK_MASK; + FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & + VGPR_BANK_MASK; + } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { + unsigned Shift = countTrailingZeros(LM) >> 1; + if (Shift >= NUM_SGPR_BANKS) + return 0; + unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; + FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & + SGPR_BANK_SHIFTED_MASK; + FreeBanks <<= SGPR_BANK_OFFSET; + } + + LLVM_DEBUG(if (FreeBanks) { + dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) + << " to banks: "; dumpFreeBanks(FreeBanks); + dbgs() << '\n'; }); + + return FreeBanks; +} + +void GCNRegBankReassign::collectCandidates(MachineInstr& MI, + unsigned UsedBanks, + unsigned StallCycles) { + LLVM_DEBUG(MI.dump()); + + if (!StallCycles) + return; + + LLVM_DEBUG(dbgs() << 
"Stall cycles = " << StallCycles << '\n'); + + for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { + for (unsigned J = I + 1; J != E; ++J) { + if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) + continue; + + unsigned Reg1 = OperandMasks[I].Reg; + unsigned Reg2 = OperandMasks[J].Reg; + unsigned SubReg1 = OperandMasks[I].SubReg; + unsigned SubReg2 = OperandMasks[J].SubReg; + unsigned Mask1 = OperandMasks[I].Mask; + unsigned Mask2 = OperandMasks[J].Mask; + unsigned Size1 = countPopulation(Mask1); + unsigned Size2 = countPopulation(Mask2); + + LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << + " and " << printReg(Reg2, SubReg2) << '\n'); + + unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); + Weight += MLI->getLoopDepth(MI.getParent()) * 10; + + LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); + + unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); + unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); + if (FreeBanks1) + Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight + + ((Size2 > Size1) ? 1 : 0))); + if (FreeBanks2) + Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight + + ((Size1 > Size2) ? 1 : 0))); + } + } +} + +unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, + unsigned Reg, int Bank, + bool Collect) { + unsigned TotalStallCycles = 0; + unsigned UsedBanks = 0; + SmallSet Visited; + + for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { + if (MI.isBundle()) + continue; + if (!Visited.insert(&MI).second) + continue; + unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); + TotalStallCycles += StallCycles; + if (Collect) + collectCandidates(MI, UsedBanks, StallCycles); + } + + return TotalStallCycles; +} + +unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, + unsigned Bank) const { + const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); + unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? 
MaxNumVGPRs + : MaxNumSGPRs; + unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 + : AMDGPU::SGPR0); + + for (unsigned Reg : RC->getRegisters()) { + // Check occupancy limit. + if (TRI->isSubRegisterEq(Reg, MaxReg)) + break; + + if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) + continue; + + for (unsigned I = 0; CSRegs[I]; ++I) + if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && + !LRM->isPhysRegUsed(CSRegs[I])) + return AMDGPU::NoRegister; + + LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); + + if (!LRM->checkInterference(LI, Reg)) + return Reg; + } + + return AMDGPU::NoRegister; +} + +unsigned GCNRegBankReassign::tryReassign(Candidate &C) { + if (!LIS->hasInterval(C.Reg)) + return 0; + + LiveInterval &LI = LIS->getInterval(C.Reg); + LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); + LI.dump()); + + // For each candidate bank walk all instructions in the range of live + // interval and check if replacing the register with one belonging to + // the candidate bank reduces conflicts. 
+ + unsigned OrigStalls = computeStallCycles(C.Reg); + LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); + if (!OrigStalls) + return 0; + + struct BankStall { + BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; + bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } + unsigned Bank; + unsigned Stalls; + }; + SmallVector BankStalls; + + for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { + if (C.FreeBanks & (1 << Bank)) { + LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); + unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); + if (Stalls < OrigStalls) { + LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " + << Stalls << '\n'); + BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); + } + } + } + std::sort(BankStalls.begin(), BankStalls.end()); + + unsigned OrigReg = VRM->getPhys(C.Reg); + LRM->unassign(LI); + while (!BankStalls.empty()) { + BankStall BS = BankStalls.pop_back_val(); + unsigned Reg = scavengeReg(LI, BS.Bank); + if (Reg == AMDGPU::NoRegister) { + LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) + << '\n'); + continue; + } + LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) + << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") + << " in bank " << printBank(BS.Bank) << '\n'); + + LRM->assign(LI, Reg); + + LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); + + return OrigStalls - BS.Stalls; + } + LRM->assign(LI, OrigReg); + + return 0; +} + +unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, + bool Collect) { + unsigned TotalStallCycles = 0; + + for (MachineBasicBlock &MBB : MF) { + + LLVM_DEBUG(if (Collect) { + if (MBB.getName().empty()) dbgs() << "bb." 
<< MBB.getNumber(); + else dbgs() << MBB.getName(); dbgs() << ":\n"; + }); + + for (MachineInstr &MI : MBB.instrs()) { + if (MI.isBundle()) + continue; // we analyze the instructions inside the bundle individually + + unsigned UsedBanks = 0; + unsigned StallCycles = analyzeInst(MI, UsedBanks); + + if (Collect) + collectCandidates(MI, UsedBanks, StallCycles); + + TotalStallCycles += StallCycles; + } + + LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); + } + + return TotalStallCycles; +} + +void GCNRegBankReassign::removeCandidates(unsigned Reg) { + Candidates.remove_if([Reg, this](const Candidate& C) { + return C.MI->readsRegister(Reg, TRI); + }); +} + +bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, + unsigned OriginalCycles, + unsigned CyclesSaved) { + unsigned StallCycles = collectCandidates(MF, false); + LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles + << " stall cycles left\n"); + return StallCycles + CyclesSaved == OriginalCycles; +} + +bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget(); + if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + TRI = ST->getRegisterInfo(); + MLI = &getAnalysis(); + VRM = &getAnalysis(); + LRM = &getAnalysis(); + LIS = &getAnalysis(); + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned Occupancy = MFI->getOccupancy(); + MaxNumVGPRs = ST->getMaxNumVGPRs(MF); + MaxNumSGPRs = ST->getMaxNumSGPRs(MF); + MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); + MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); + + CSRegs = TRI->getCalleeSavedRegs(&MF); + + RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + + TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); + + LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() + << '\n'); + + unsigned StallCycles = collectCandidates(MF); + NumStallsDetected += StallCycles; + + LLVM_DEBUG(dbgs() << 
"=== " << StallCycles << " stall cycles detected in " + "function " << MF.getName() << '\n'); + + Candidates.sort(); + + LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; + for (auto C : Candidates) C.dump(this); + dbgs() << "\n\n"); + + unsigned CyclesSaved = 0; + while (!Candidates.empty()) { + Candidate C = Candidates.back(); + unsigned LocalCyclesSaved = tryReassign(C); + CyclesSaved += LocalCyclesSaved; + + if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) + report_fatal_error("RegBank reassign stall cycles verification failed."); + + Candidates.pop_back(); + if (LocalCyclesSaved) { + removeCandidates(C.Reg); + computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); + Candidates.sort(); + + LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; + for (auto C : Candidates) + C.dump(this); + dbgs() << "\n\n"); + } + } + NumStallsRecovered += CyclesSaved; + + LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved + << " cycles saved in function " << MF.getName() << '\n'); + + Candidates.clear(); + + if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) + report_fatal_error("RegBank reassign stall cycles verification failed."); + + RegsUsed.clear(); + + return CyclesSaved > 0; +} Index: llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir +++ llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -0,0 +1,336 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: v1_vs_v5{{$}} +# GCN: V_AND_B32_e32 killed $vgpr3, killed $vgpr1, +--- +name: v1_vs_v5 +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = 
V_AND_B32_e32 %1, %0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: v0_1_vs_v4{{$}} +# GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr3, +--- +name: v0_1_vs_v4 +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr4' } + - { id: 1, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + GLOBAL_STORE_DWORD %1, %0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: v1_2_vs_v4_5{{$}} +# GCN: GLOBAL_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5, +--- +name: v1_2_vs_v4_5 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64, preferred-register: '$vgpr4_vgpr5' } + - { id: 1, class: vreg_64, preferred-register: '$vgpr1_vgpr2' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: s0_vs_s16{{$}} +# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr0, +--- +name: s0_vs_s16 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc + S_ENDPGM 0 +... + +# GCN-LABEL: s1_vs_s16{{$}} +# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr1, +--- +name: s1_vs_s16 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + %1 = S_AND_B32 %0, $sgpr1, implicit-def $scc + S_ENDPGM 0 +... 
+ +# GCN-LABEL: s12_vs_null{{$}} +# GCN: S_AND_B32 $sgpr_null, killed renamable $sgpr14, +--- +name: s12_vs_null +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_32, preferred-register: '$sgpr12' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = S_AND_B32 $sgpr_null, %0, implicit-def $scc + S_ENDPGM 0 +... + +# GCN-LABEL: s13_vs_m0{{$}} +# GCN: S_AND_B32 $m0, killed renamable $sgpr14, +--- +name: s13_vs_m0 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_32, preferred-register: '$sgpr13' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = S_AND_B32 $m0, %0, implicit-def $scc + S_ENDPGM 0 +... + +# GCN-LABEL: s12_13_vs_s28_s29{{$}} +# GCN: S_AND_B64 $sgpr28_sgpr29, killed renamable $sgpr14_sgpr15, +--- +name: s12_13_vs_s28_s29 +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64, preferred-register: '$sgpr12_sgpr13' } + - { id: 1, class: sreg_64 } +body: | + bb.0: + %0 = IMPLICIT_DEF + $sgpr28_sgpr29 = IMPLICIT_DEF + %1 = S_AND_B64 $sgpr28_sgpr29, %0, implicit-def $scc + S_ENDPGM 0 +... + +# GCN-LABEL: livein{{$}} +# GCN: V_AND_B32_e32 killed $vgpr4, killed $vgpr0, +--- +name: livein +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } + - { id: 2, class: vgpr_32 } +liveins: + - { reg: '$vgpr0', virtual-reg: '' } + - { reg: '$vgpr4', virtual-reg: '' } +body: | + bb.0: + liveins: $vgpr0, $vgpr4 + + %0 = COPY $vgpr0 + %1 = COPY $vgpr4 + %2 = V_AND_B32_e32 %1, %0, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: liveout{{$}} +# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, +--- +name: liveout +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = V_AND_B32_e32 %1, %0, implicit $exec + $vgpr0 = COPY %0 + $vgpr4 = COPY %1 + S_ENDPGM 0 +... + +# GCN-LABEL: implicit{{$}} +# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0 +--- +name: implicit +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_128, preferred-register: '$vgpr4_vgpr5_vgpr6_vgpr7' } +body: | + bb.0: + %1 = IMPLICIT_DEF + V_MOV_B32_indirect undef %1.sub0:vreg_128, undef $vgpr0, implicit $exec, implicit-def %0:vreg_128, implicit %1:vreg_128, implicit $m0 + S_ENDPGM 0 +... + +# GCN-LABEL: occupancy_limit{{$}} +# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, +--- +name: occupancy_limit +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } + - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' } + - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } + - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' } + - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } + - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } + - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } + - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } + - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %3 = IMPLICIT_DEF + %4 = IMPLICIT_DEF + %5 = IMPLICIT_DEF + %6 = IMPLICIT_DEF + %7 = IMPLICIT_DEF 
+ %8 = IMPLICIT_DEF + %9 = IMPLICIT_DEF + %2 = V_AND_B32_e32 %1, %0, implicit $exec + GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: csr{{$}} +# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, +--- +name: csr +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } + - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' } + - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } + - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' } + - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } + - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } + - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } + - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } + - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } + - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } + - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } + - { id: 12, class: vgpr_32, preferred-register: '$vgpr33' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %3 = IMPLICIT_DEF + %4 = IMPLICIT_DEF + %5 = IMPLICIT_DEF + %6 = IMPLICIT_DEF + %7 = IMPLICIT_DEF + %8 = IMPLICIT_DEF + %9 = IMPLICIT_DEF + %10 = IMPLICIT_DEF + %11 = IMPLICIT_DEF + %12 = IMPLICIT_DEF + %2 = V_AND_B32_e32 %1, %0, implicit $exec + GLOBAL_STORE_DWORD %3, %0, 0, 
0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %12, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 +... + +# Do not touch undefs +# GCN-LABEL: s0_vs_s16_undef{{$}} +# GCN: S_AND_B32 killed renamable $sgpr16, undef $sgpr0, +--- +name: s0_vs_s16_undef +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = S_AND_B32 %0, undef $sgpr0, implicit-def $scc + S_ENDPGM 0 +... + +# GCN-LABEL: smem_bundle{{$}} +# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0 +# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0, 0 +--- +name: smem_bundle +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_128, preferred-register: '$sgpr0_sgpr1_sgpr2_sgpr3' } + - { id: 1, class: sreg_32_xm0_xexec, preferred-register: '$sgpr16' } + - { id: 2, class: sreg_32_xm0_xexec, preferred-register: '$sgpr17' } + - { id: 3, class: sreg_32_xm0_xexec, preferred-register: '$sgpr4' } + - { id: 4, class: sreg_32_xm0_xexec, preferred-register: '$sgpr5' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + early-clobber %3, early-clobber %4 = BUNDLE %0, %1, %2 { + %3 = S_BUFFER_LOAD_DWORD_SGPR %0, %1, 0, 0 + %4 = S_BUFFER_LOAD_DWORD_SGPR %0, %2, 0, 0 + } + S_ENDPGM 0 +... 
+
+# GCN-LABEL: vreg_512_subs{{$}}
+# Do not care about the final register assignment: this input used to trigger an infinite loop in the pass.
+---
+name: vreg_512_subs
+tracksRegLiveness: true
+registers:
+  - { id: 1, class: vreg_512, preferred-register: '$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr28' }
+body: |
+  bb.0:
+    %1 = IMPLICIT_DEF
+    %2 = IMPLICIT_DEF
+    DS_WRITE2_B32_gfx9 %2, %1.sub0, %1.sub1, 0, 1, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub2, %1.sub3, 2, 3, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec
+    S_ENDPGM 0
+...