Index: llvm/include/llvm/CodeGen/Passes.h
===================================================================
--- llvm/include/llvm/CodeGen/Passes.h
+++ llvm/include/llvm/CodeGen/Passes.h
@@ -15,6 +15,8 @@
 #define LLVM_CODEGEN_PASSES_H
 
 #include "llvm/Support/CodeGen.h"
+#include "llvm/CodeGen/RegAllocCommon.h"
+
 #include <functional>
 #include <string>
@@ -169,16 +171,20 @@
   /// possible. It is best suited for debug code where live ranges are short.
   ///
   FunctionPass *createFastRegisterAllocator();
+  FunctionPass *createFastRegisterAllocator(RegClassFilterFunc F,
+                                            bool ClearVirtRegs);
 
   /// BasicRegisterAllocation Pass - This pass implements a degenerate global
   /// register allocator using the basic regalloc framework.
   ///
   FunctionPass *createBasicRegisterAllocator();
+  FunctionPass *createBasicRegisterAllocator(RegClassFilterFunc F);
 
   /// Greedy register allocation pass - This pass implements a global register
   /// allocator for optimized builds.
   ///
   FunctionPass *createGreedyRegisterAllocator();
+  FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc F);
 
   /// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean
   /// Quadratic Prograaming (PBQP) based register allocator.
Index: llvm/include/llvm/CodeGen/RegAllocCommon.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -0,0 +1,32 @@
+//===- RegAllocCommon.h - Utilities shared between allocators ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCCOMMON_H
+#define LLVM_CODEGEN_REGALLOCCOMMON_H
+
+#include <functional>
+
+namespace llvm {
+
+class TargetRegisterClass;
+class TargetRegisterInfo;
+
+typedef std::function<bool(const TargetRegisterInfo &TRI,
+                           const TargetRegisterClass &RC)> RegClassFilterFunc;
+
+/// Default register class filter function for register allocation. All virtual
+/// registers should be allocated.
+static inline bool allocateAllRegClasses(const TargetRegisterInfo &,
+                                         const TargetRegisterClass &) {
+  return true;
+}
+
+}
+
+#endif // LLVM_CODEGEN_REGALLOCCOMMON_H
Index: llvm/include/llvm/CodeGen/RegAllocRegistry.h
===================================================================
--- llvm/include/llvm/CodeGen/RegAllocRegistry.h
+++ llvm/include/llvm/CodeGen/RegAllocRegistry.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CODEGEN_REGALLOCREGISTRY_H
 #define LLVM_CODEGEN_REGALLOCREGISTRY_H
 
+#include "llvm/CodeGen/RegAllocCommon.h"
 #include "llvm/CodeGen/MachinePassRegistry.h"
 
 namespace llvm {
Index: llvm/lib/CodeGen/LiveIntervals.cpp
===================================================================
--- llvm/lib/CodeGen/LiveIntervals.cpp
+++ llvm/lib/CodeGen/LiveIntervals.cpp
@@ -713,10 +713,15 @@
     if (LI.empty())
       continue;
 
+    // Target may not have allocated this yet.
+    Register PhysReg = VRM->getPhys(Reg);
+    if (!PhysReg)
+      continue;
+
     // Find the regunit intervals for the assigned register. They may overlap
     // the virtual register live range, cancelling any kills.
     RU.clear();
-    for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid();
+    for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid();
          ++Unit) {
       const LiveRange &RURange = getRegUnit(*Unit);
       if (RURange.empty())
Index: llvm/lib/CodeGen/RegAllocBase.h
===================================================================
--- llvm/lib/CodeGen/RegAllocBase.h
+++ llvm/lib/CodeGen/RegAllocBase.h
@@ -37,6 +37,7 @@
 #define LLVM_LIB_CODEGEN_REGALLOCBASE_H
 
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/RegAllocCommon.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 
 namespace llvm {
@@ -67,6 +68,7 @@
   LiveIntervals *LIS = nullptr;
   LiveRegMatrix *Matrix = nullptr;
   RegisterClassInfo RegClassInfo;
+  const RegClassFilterFunc ShouldAllocateClass;
 
   /// Inst which is a def of an original reg and whose defs are already all
   /// dead after remat is saved in DeadRemats. The deletion of such inst is
@@ -74,7 +76,9 @@
   /// always available for the remat of all the siblings of the original reg.
   SmallPtrSet<MachineInstr *, 32> DeadRemats;
 
-  RegAllocBase() = default;
+  RegAllocBase(const RegClassFilterFunc F = allocateAllRegClasses) :
+    ShouldAllocateClass(F) {}
+
   virtual ~RegAllocBase() = default;
 
   // A RegAlloc pass should call this before allocatePhysRegs.
@@ -92,7 +96,10 @@
   virtual Spiller &spiller() = 0;
 
   /// enqueue - Add VirtReg to the priority queue of unassigned registers.
-  virtual void enqueue(LiveInterval *LI) = 0;
+  virtual void enqueueImpl(LiveInterval *LI) = 0;
+
+  /// enqueue - Add VirtReg to the priority queue of unassigned registers.
+  void enqueue(LiveInterval *LI);
 
   /// dequeue - Return the next unassigned register, or NULL.
   virtual LiveInterval *dequeue() = 0;
Index: llvm/lib/CodeGen/RegAllocBase.cpp
===================================================================
--- llvm/lib/CodeGen/RegAllocBase.cpp
+++ llvm/lib/CodeGen/RegAllocBase.cpp
@@ -176,3 +176,21 @@
   }
   DeadRemats.clear();
 }
+
+void RegAllocBase::enqueue(LiveInterval *LI) {
+  const Register Reg = LI->reg();
+
+  assert(Reg.isVirtual() && "Can only enqueue virtual registers");
+
+  if (VRM->hasPhys(Reg))
+    return;
+
+  const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
+  if (ShouldAllocateClass(*TRI, RC)) {
+    LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n');
+    enqueueImpl(LI);
+  } else {
+    LLVM_DEBUG(dbgs() << "Not enqueueing " << printReg(Reg, TRI)
+                      << " in skipped register class\n");
+  }
+}
Index: llvm/lib/CodeGen/RegAllocBasic.cpp
===================================================================
--- llvm/lib/CodeGen/RegAllocBasic.cpp
+++ llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -76,7 +76,7 @@
   void LRE_WillShrinkVirtReg(Register) override;
 
 public:
-  RABasic();
+  RABasic(const RegClassFilterFunc F = allocateAllRegClasses);
 
   /// Return the pass name.
   StringRef getPassName() const override { return "Basic Register Allocator"; }
@@ -88,7 +88,7 @@
   Spiller &spiller() override { return *SpillerInstance; }
 
-  void enqueue(LiveInterval *LI) override {
+  void enqueueImpl(LiveInterval *LI) override {
     Queue.push(LI);
   }
 
@@ -171,7 +171,9 @@
   enqueue(&LI);
 }
 
-RABasic::RABasic(): MachineFunctionPass(ID) {
+RABasic::RABasic(RegClassFilterFunc F):
+  MachineFunctionPass(ID),
+  RegAllocBase(F) {
 }
 
 void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -332,7 +334,10 @@
   return true;
 }
 
-FunctionPass* llvm::createBasicRegisterAllocator()
-{
+FunctionPass* llvm::createBasicRegisterAllocator() {
   return new RABasic();
 }
+
+FunctionPass* llvm::createBasicRegisterAllocator(RegClassFilterFunc F) {
+  return new RABasic(F);
+}
Index: llvm/lib/CodeGen/RegAllocFast.cpp
===================================================================
--- llvm/lib/CodeGen/RegAllocFast.cpp
+++ llvm/lib/CodeGen/RegAllocFast.cpp
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocCommon.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -69,7 +70,13 @@
 public:
   static char ID;
 
-  RegAllocFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {}
+  RegAllocFast(const RegClassFilterFunc F = allocateAllRegClasses,
+               bool ClearVirtRegs_ = true) :
+    MachineFunctionPass(ID),
+    ShouldAllocateClass(F),
+    StackSlotForVirtReg(-1),
+    ClearVirtRegs(ClearVirtRegs_) {
+  }
 
 private:
   MachineFrameInfo *MFI;
@@ -77,6 +84,7 @@
   const TargetRegisterInfo *TRI;
   const TargetInstrInfo *TII;
   RegisterClassInfo RegClassInfo;
+  const RegClassFilterFunc ShouldAllocateClass;
 
   /// Basic block currently being allocated.
   MachineBasicBlock *MBB;
@@ -84,6 +92,8 @@
   /// Maps virtual regs to the frame index where these values are spilled.
   IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
 
+  bool ClearVirtRegs;
+
   /// Everything we know about a live virtual register.
   struct LiveReg {
     MachineInstr *LastUse = nullptr; ///< Last instr to use reg.
@@ -213,8 +223,12 @@
   }
 
   MachineFunctionProperties getSetProperties() const override {
-    return MachineFunctionProperties().set(
+    if (ClearVirtRegs) {
+      return MachineFunctionProperties().set(
         MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    return MachineFunctionProperties();
   }
 
   MachineFunctionProperties getClearedProperties() const override {
@@ -1541,9 +1555,11 @@
   for (MachineBasicBlock &MBB : MF)
     allocateBasicBlock(MBB);
 
-  // All machine operands and other references to virtual registers have been
-  // replaced. Remove the virtual registers.
-  MRI->clearVirtRegs();
+  if (ClearVirtRegs) {
+    // All machine operands and other references to virtual registers have been
+    // replaced. Remove the virtual registers.
+    MRI->clearVirtRegs();
+  }
 
   StackSlotForVirtReg.clear();
   LiveDbgValueMap.clear();
@@ -1553,3 +1569,9 @@
 FunctionPass *llvm::createFastRegisterAllocator() {
   return new RegAllocFast();
 }
+
+FunctionPass *llvm::createFastRegisterAllocator(
+  std::function<bool(const TargetRegisterInfo &TRI,
+                     const TargetRegisterClass &RC)> Ftor, bool ClearVirtRegs) {
+  return new RegAllocFast(Ftor, ClearVirtRegs);
+}
Index: llvm/lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -412,7 +412,7 @@
   ArrayRef<uint8_t> RegCosts;
 
 public:
-  RAGreedy();
+  RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses);
 
   /// Return the pass name.
   StringRef getPassName() const override { return "Greedy Register Allocator"; }
@@ -421,7 +421,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override;
   void releaseMemory() override;
   Spiller &spiller() override { return *SpillerInstance; }
-  void enqueue(LiveInterval *LI) override;
+  void enqueueImpl(LiveInterval *LI) override;
   LiveInterval *dequeue() override;
   MCRegister selectOrSplit(LiveInterval &,
                            SmallVectorImpl<Register> &) override;
@@ -636,7 +636,22 @@
   return new RAGreedy();
 }
 
-RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
+namespace llvm {
+FunctionPass* createGreedyRegisterAllocator(
+  std::function<bool(const TargetRegisterInfo &TRI,
+                     const TargetRegisterClass &RC)> Ftor);
+
+}
+
+FunctionPass* llvm::createGreedyRegisterAllocator(
+  std::function<bool(const TargetRegisterInfo &TRI,
+                     const TargetRegisterClass &RC)> Ftor) {
+  return new RAGreedy(Ftor);
+}
+
+RAGreedy::RAGreedy(RegClassFilterFunc F):
+  MachineFunctionPass(ID),
+  RegAllocBase(F) {
 }
 
 void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -693,7 +708,7 @@
   // Register is assigned, put it back on the queue for reassignment.
   LiveInterval &LI = LIS->getInterval(VirtReg);
   Matrix->unassign(LI);
-  enqueue(&LI);
+  RegAllocBase::enqueue(&LI);
 }
 
 void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) {
@@ -716,7 +731,7 @@
   GlobalCand.clear();
 }
 
-void RAGreedy::enqueue(LiveInterval *LI) { enqueue(Queue, LI); }
+void RAGreedy::enqueueImpl(LiveInterval *LI) { enqueue(Queue, LI); }
 
 void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
   // Prioritize live ranges by size, assigning larger ranges first.
@@ -2936,7 +2951,12 @@
     if (Register::isPhysicalRegister(Reg))
       continue;
 
-    assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
+    // This may be a skipped class
+    if (!VRM->hasPhys(Reg)) {
+      assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
+             "We have an unallocated variable which should have been handled");
+      continue;
+    }
 
     // Get the live interval mapped with this virtual register to be able
     // to check for the interference with the new color.
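The generic changes above are the whole mechanism: each allocator now takes a RegClassFilterFunc, and RegAllocBase::enqueue() silently skips any virtual register whose class the filter rejects, leaving it for a later allocator run. The following sketch is not part of the patch; it only illustrates how a target could use the new createGreedyRegisterAllocator/createFastRegisterAllocator overloads to stage allocation in two runs. The wide/narrow split and all function names here (onlyAllocateWideClasses, onlyAllocateNarrowClasses, createFirstPhaseRegAlloc, createFinalPhaseRegAlloc) are made up for illustration; the real AMDGPU filters further down test SIRegisterInfo::isSGPRClass instead.

// Illustrative sketch only, assuming the new RegClassFilterFunc overloads
// added by this patch. A real target would key the filter on a meaningful
// property of its register classes rather than on register width.
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocCommon.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// First-phase filter: only register classes wider than 32 bits are enqueued.
static bool onlyAllocateWideClasses(const TargetRegisterInfo &TRI,
                                    const TargetRegisterClass &RC) {
  return TRI.getRegSizeInBits(RC) > 32;
}

// Second-phase filter: the complement of the first one.
static bool onlyAllocateNarrowClasses(const TargetRegisterInfo &TRI,
                                      const TargetRegisterClass &RC) {
  return TRI.getRegSizeInBits(RC) <= 32;
}

// First run: allocate only the filtered classes. The fast allocator must be
// told not to clear virtual registers so the skipped vregs survive.
static FunctionPass *createFirstPhaseRegAlloc(bool Optimized) {
  return Optimized ? createGreedyRegisterAllocator(onlyAllocateWideClasses)
                   : createFastRegisterAllocator(onlyAllocateWideClasses,
                                                 /*ClearVirtRegs=*/false);
}

// Final run: allocate whatever is left and clear virtual registers as a
// normal register allocator run would.
static FunctionPass *createFinalPhaseRegAlloc(bool Optimized) {
  return Optimized ? createGreedyRegisterAllocator(onlyAllocateNarrowClasses)
                   : createFastRegisterAllocator(onlyAllocateNarrowClasses,
                                                 /*ClearVirtRegs=*/true);
}

When the greedy allocator is used for the first run, a VirtRegRewriter pass has to commit its assignments before the second run starts; the GCNPassConfig::addRegAssignAndRewriteOptimized change below shows that wiring.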
Index: llvm/lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- llvm/lib/CodeGen/TargetPassConfig.cpp
+++ llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1316,8 +1316,8 @@
 }
 
 bool TargetPassConfig::addRegAssignAndRewriteFast() {
-  if (RegAlloc != &useDefaultRegisterAllocator &&
-      RegAlloc != &createFastRegisterAllocator)
+  if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator &&
+      RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator)
     report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
 
   addPass(createRegAllocPass(false));
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -32,6 +32,8 @@
 #include "llvm/CodeGen/GlobalISel/Localizer.h"
 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
 #include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -52,6 +54,115 @@
 
 using namespace llvm;
 
+namespace {
+class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
+public:
+  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+    : RegisterRegAllocBase(N, D, C) {}
+};
+
+class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
+public:
+  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+    : RegisterRegAllocBase(N, D, C) {}
+};
+
+static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
+                              const TargetRegisterClass &RC) {
+  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
+                              const TargetRegisterClass &RC) {
+  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+
+/// -{sgpr|vgpr}-regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+
+static SGPRRegisterRegAlloc
+defaultSGPRRegAlloc("default",
+                    "pick SGPR register allocator based on -O option",
+                    useDefaultRegisterAllocator);
+
+static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<SGPRRegisterRegAlloc>>
+SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+             cl::desc("Register allocator to use for SGPRs"));
+
+static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<VGPRRegisterRegAlloc>>
+VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+             cl::desc("Register allocator to use for VGPRs"));
+
+
+static void initializeDefaultSGPRRegisterAllocatorOnce() {
+  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = SGPRRegAlloc;
+    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
+  }
+}
+
+static void initializeDefaultVGPRRegisterAllocatorOnce() {
+  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = VGPRRegAlloc;
+    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
+  }
+}
+
+static FunctionPass *createBasicSGPRRegisterAllocator() {
+  return createBasicRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createGreedySGPRRegisterAllocator() {
+  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createFastSGPRRegisterAllocator() {
+  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+static FunctionPass *createBasicVGPRRegisterAllocator() {
+  return createBasicRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createGreedyVGPRRegisterAllocator() {
+  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createFastVGPRRegisterAllocator() {
+  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
+}
+
+static SGPRRegisterRegAlloc basicRegAllocSGPR(
+  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc greedyRegAllocSGPR(
+  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
+
+static SGPRRegisterRegAlloc fastRegAllocSGPR(
+  "fast", "fast register allocator", createFastSGPRRegisterAllocator);
+
+
+static VGPRRegisterRegAlloc basicRegAllocVGPR(
+  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc greedyRegAllocVGPR(
+  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
+
+static VGPRRegisterRegAlloc fastRegAllocVGPR(
+  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
+}
+
+
 static cl::opt<bool> EnableR600StructurizeCFG(
   "r600-ir-structurize",
   cl::desc("Use StructurizeCFG IR pass"),
@@ -816,6 +927,14 @@
   bool addGlobalInstructionSelect() override;
   void addFastRegAlloc() override;
   void addOptimizedRegAlloc() override;
+
+  FunctionPass *createSGPRAllocPass(bool Optimized);
+  FunctionPass *createVGPRAllocPass(bool Optimized);
+  FunctionPass *createRegAllocPass(bool Optimized) override;
+
+  bool addRegAssignAndRewriteFast() override;
+  bool addRegAssignAndRewriteOptimized() override;
+
   void addPreRegAlloc() override;
   bool addPreRewrite() override;
   void addPostRegAlloc() override;
@@ -1186,14 +1305,84 @@
   return true;
 }
 
+FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
+  // Initialize the global default.
+  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
+                  initializeDefaultSGPRRegisterAllocatorOnce);
+
+  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  if (Optimized)
+    return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+
+  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
+  // Initialize the global default.
+  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
+                  initializeDefaultVGPRRegisterAllocatorOnce);
+
+  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  if (Optimized)
+    return createGreedyVGPRRegisterAllocator();
+
+  return createFastVGPRRegisterAllocator();
+}
+
+FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
+  llvm_unreachable("should not be used");
+}
+
+static const char RegAllocOptNotSupportedMessage[] =
+  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
+
+bool GCNPassConfig::addRegAssignAndRewriteFast() {
+  if (!usingDefaultRegAlloc())
+    report_fatal_error(RegAllocOptNotSupportedMessage);
+
+  addPass(createSGPRAllocPass(false));
+
+  // Equivalent of PEI for SGPRs.
+  addPass(&SILowerSGPRSpillsID);
+
+  addPass(createVGPRAllocPass(false));
+  return true;
+}
+
+bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
+  if (!usingDefaultRegAlloc())
+    report_fatal_error(RegAllocOptNotSupportedMessage);
+
+  addPass(createSGPRAllocPass(true));
+
+  // Commit allocated register changes. This is mostly necessary because too
+  // many things rely on the use lists of the physical registers, such as the
+  // verifier. This is only necessary with allocators which use LiveIntervals,
+  // since FastRegAlloc does the replacements itself.
+  addPass(createVirtRegRewriter(false));
+
+  // Equivalent of PEI for SGPRs.
+  addPass(&SILowerSGPRSpillsID);
+
+  addPass(createVGPRAllocPass(true));
+
+  addPreRewrite();
+  addPass(&VirtRegRewriterID);
+
+  return true;
+}
+
 void GCNPassConfig::addPostRegAlloc() {
   addPass(&SIFixVGPRCopiesID);
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIOptimizeExecMaskingID);
   TargetPassConfig::addPostRegAlloc();
-
-  // Equivalent of PEI for SGPRs.
-  addPass(&SILowerSGPRSpillsID);
 }
 
 void GCNPassConfig::addPreSched2() {
Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -20,6 +20,12 @@
 
 #define DEBUG_TYPE "frame-info"
 
+static cl::opt<bool> EnableSpillVGPRToAGPR(
+  "amdgpu-spill-vgpr-to-agpr",
+  cl::desc("Enable spilling VGPRs to AGPRs"),
+  cl::ReallyHidden,
+  cl::init(true));
+
 // Find a scratch register that we can use in the prologue. We avoid using
 // callee-save registers since they may appear to be free when this is called
 // from canUseAsPrologue (during shrink wrapping), but then no longer be free
@@ -1125,9 +1131,73 @@
 
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
+  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
+    && EnableSpillVGPRToAGPR;
+
+  if (SpillVGPRToAGPR) {
+    // To track the spill frame indices handled in this pass.
+    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
+
+    bool SeenDbgInstr = false;
+
+    for (MachineBasicBlock &MBB : MF) {
+      MachineBasicBlock::iterator Next;
+      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+        MachineInstr &MI = *I;
+        Next = std::next(I);
+
+        if (MI.isDebugInstr())
+          SeenDbgInstr = true;
+
+        if (TII->isVGPRSpill(MI)) {
+          // Try to eliminate stack used by VGPR spills before frame
+          // finalization.
+          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                     AMDGPU::OpName::vaddr);
+          int FI = MI.getOperand(FIOp).getIndex();
+          Register VReg =
+            TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
+          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
+                                                TRI->isAGPR(MRI, VReg))) {
+            // FIXME: change to enterBasicBlockEnd()
+            RS->enterBasicBlock(MBB);
+            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
+            SpillFIs.set(FI);
+            continue;
+          }
+        }
+      }
+    }
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
+        MBB.addLiveIn(Reg);
+
+      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
+        MBB.addLiveIn(Reg);
+
+      MBB.sortUniqueLiveIns();
+
+      if (!SpillFIs.empty() && SeenDbgInstr) {
+        // FIXME: The dead frame indices are replaced with a null register from
+        // the debug value instructions. We should instead, update it with the
+        // correct register value. But not sure the register value alone is
+        // adequate to lower the DIExpression. It should be worked out later.
+        for (MachineInstr &MI : MBB) {
+          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
+              SpillFIs[MI.getOperand(0).getIndex()]) {
+            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
+            MI.getOperand(0).setIsDebug();
+          }
+        }
+      }
+    }
+  }
+
   FuncInfo->removeDeadFrameIndices(MFI);
   assert(allSGPRSpillsAreDead(MF) &&
          "SGPR spill should have been removed in SILowerSGPRSpills");
Index: llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -31,12 +31,6 @@
 
 namespace {
 
-static cl::opt<bool> EnableSpillVGPRToAGPR(
-  "amdgpu-spill-vgpr-to-agpr",
-  cl::desc("Enable spilling VGPRs to AGPRs"),
-  cl::ReallyHidden,
-  cl::init(true));
-
 class SILowerSGPRSpills : public MachineFunctionPass {
 private:
   const SIRegisterInfo *TRI = nullptr;
@@ -71,6 +65,7 @@
 
 INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
                       "SI lower SGPR spill instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
 INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
                     "SI lower SGPR spill instructions", false, false)
@@ -295,6 +290,7 @@
   TRI = &TII->getRegisterInfo();
 
   VRM = getAnalysisIfAvailable<VirtRegMap>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
 
   assert(SaveBlocks.empty() && RestoreBlocks.empty());
@@ -318,21 +314,14 @@
     return false;
   }
 
-  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
-    && EnableSpillVGPRToAGPR;
-
   bool MadeChange = false;
-
-  const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
-  std::unique_ptr<RegScavenger> RS;
-
   bool NewReservedRegs = false;
 
   // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
   // handled as SpilledToReg in regular PrologEpilogInserter.
   const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() &&
                                   (HasCSRs || FuncInfo->hasSpilledSGPRs());
-  if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) {
+  if (HasSGPRSpillToVGPR) {
     // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
     // are spilled to VGPRs, in which case we can eliminate the stack usage.
     //
@@ -350,36 +339,15 @@
         MachineInstr &MI = *I;
         Next = std::next(I);
 
-        if (SpillToAGPR && TII->isVGPRSpill(MI)) {
-          // Try to eliminate stack used by VGPR spills before frame
-          // finalization.
-          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                                     AMDGPU::OpName::vaddr);
-          int FI = MI.getOperand(FIOp).getIndex();
-          Register VReg =
-            TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
-          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
-                                                TRI->isAGPR(MRI, VReg))) {
-            NewReservedRegs = true;
-            if (!RS)
-              RS.reset(new RegScavenger());
-
-            // FIXME: change to enterBasicBlockEnd()
-            RS->enterBasicBlock(MBB);
-            TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get());
-            SpillFIs.set(FI);
-            continue;
-          }
-        }
-
-        if (!TII->isSGPRSpill(MI) || !TRI->spillSGPRToVGPR())
+        if (!TII->isSGPRSpill(MI))
           continue;
 
         int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
         assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
         if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
           NewReservedRegs = true;
-          bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
+          bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI,
+                                                                 nullptr, LIS);
           (void)Spilled;
           assert(Spilled && "failed to spill SGPR to VGPR when allocated");
           SpillFIs.set(FI);
@@ -387,16 +355,10 @@
       }
     }
 
+    // FIXME: Adding to live-ins redundant with reserving registers.
     for (MachineBasicBlock &MBB : MF) {
       for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
         MBB.addLiveIn(SSpill.VGPR);
-
-      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
-        MBB.addLiveIn(Reg);
-
-      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
-        MBB.addLiveIn(Reg);
-
       MBB.sortUniqueLiveIns();
 
       // FIXME: The dead frame indices are replaced with a null register from
Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -8,7 +8,22 @@
 
 #include "SIMachineFunctionInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
+#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include <cassert>
+#include <vector>
 
 #define MAX_LANES 64
 
@@ -312,6 +327,13 @@
     // partially spill the SGPR to VGPRs.
     SGPRToVGPRSpills.erase(FI);
     NumVGPRSpillLanes -= I;
+
+#if 0
+    DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
+                                              "VGPRs for SGPR spilling",
+                                              0, DS_Error);
+    MF.getFunction().getContext().diagnose(DiagOutOfRegs);
+#endif
     return false;
   }
 
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -114,13 +114,16 @@
   void buildSGPRSpillLoadStore(SGPRSpillBuilder &SB, int Offset,
                                int64_t VGPRLanes) const;
 
-  /// If \p OnlyToVGPR is true, this will only succeed if this
+  /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a
+  /// free VGPR lane to spill.
   bool spillSGPR(MachineBasicBlock::iterator MI,
                  int FI, RegScavenger *RS,
+                 LiveIntervals *LIS = nullptr,
                  bool OnlyToVGPR = false) const;
 
   bool restoreSGPR(MachineBasicBlock::iterator MI,
                    int FI, RegScavenger *RS,
+                   LiveIntervals *LIS = nullptr,
                    bool OnlyToVGPR = false) const;
 
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
@@ -128,7 +131,8 @@
                            RegScavenger *RS) const override;
 
   bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
-                                          int FI, RegScavenger *RS) const;
+                                          int FI, RegScavenger *RS,
+                                          LiveIntervals *LIS = nullptr) const;
 
   StringRef getRegAsmName(MCRegister Reg) const override;
 
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -572,6 +572,17 @@
     reserveRegisterTuples(Reserved, Reg.first);
   }
 
+  // Reserve VGPRs used for SGPR spilling.
+  // Note we treat freezeReservedRegs unusually because we run register
+  // allocation in two phases. It's OK to re-freeze with new registers for the
+  // second run.
+#if 0
+  for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) {
+    for (auto &SpilledVGPR : SpilledFI.second)
+      reserveRegisterTuples(Reserved, SpilledVGPR.VGPR);
+  }
+#endif
+
   // FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) reserveRegisterTuples(Reserved, Reg); @@ -1304,6 +1315,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); @@ -1333,6 +1345,12 @@ .addReg(SubReg, getKillRegState(UseKill)) .addImm(Spill.Lane) .addReg(Spill.VGPR); + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } if (i == 0 && SB.NumSubRegs > 1) { // We may be spilling a super-register which is only partially defined, @@ -1376,6 +1394,13 @@ .addReg(SB.TmpVGPR, TmpVGPRFlags); TmpVGPRFlags = 0; + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane); + else + LIS->InsertMachineInstrInMaps(*WriteLane); + } + // There could be undef components of a spilled super register. // TODO: Can we detect this and skip the spill? if (SB.NumSubRegs > 1) { @@ -1396,12 +1421,17 @@ MI->eraseFromParent(); SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); + return true; } bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); @@ -1425,6 +1455,13 @@ .addImm(Spill.Lane); if (SB.NumSubRegs > 1 && i == 0) MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } + } } else { SB.prepare(); @@ -1452,6 +1489,12 @@ .addImm(i); if (SB.NumSubRegs > 1 && i == 0) MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } } } @@ -1459,6 +1502,10 @@ } MI->eraseFromParent(); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); + return true; } @@ -1468,7 +1515,8 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, - RegScavenger *RS) const { + RegScavenger *RS, + LiveIntervals *LIS) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -1479,7 +1527,7 @@ case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, true); + return spillSGPR(MI, FI, RS, LIS, true); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -1489,7 +1537,7 @@ case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, true); + return restoreSGPR(MI, FI, RS, LIS, true); default: llvm_unreachable("not an SGPR spill instruction"); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -12,20 +12,22 @@ ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, 
off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 @@ -44,8 +46,8 @@ ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -59,44 +61,25 @@ ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 
4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -115,6 +98,7 @@ ; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 @@ -155,26 +139,26 @@ ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, 
off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v12, v20 ; GCN-NEXT: v_mov_b32_e32 v13, v21 @@ -192,31 +176,6 @@ ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte 
Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -234,29 +193,35 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v8, v11 +; GCN-NEXT: v_mov_b32_e32 v9, v12 +; GCN-NEXT: v_mov_b32_e32 v10, v13 +; GCN-NEXT: v_mov_b32_e32 v11, v14 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte 
Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -273,20 +238,22 @@ ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 @@ -305,8 +272,8 @@ ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 ; GCN-NEXT: 
global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -320,44 +287,25 @@ ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword 
v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -376,6 +324,7 @@ ; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 @@ -416,26 +365,26 @@ ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v12, v20 ; GCN-NEXT: v_mov_b32_e32 v13, v21 @@ -453,31 +402,6 @@ ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -495,34 +419,40 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 -; GCN-NEXT: v_lshrrev_b32_e64 v15, 6, s33 -; GCN-NEXT: v_add_u32_e32 v15, 0x100, v15 -; GCN-NEXT: v_add_u32_e32 v0, v15, v0 +; GCN-NEXT: v_mov_b32_e32 v8, v11 +; GCN-NEXT: v_mov_b32_e32 v9, v12 +; GCN-NEXT: v_mov_b32_e32 v10, v13 +; GCN-NEXT: v_mov_b32_e32 v11, v14 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_lshrrev_b32_e64 v11, 6, s33 +; GCN-NEXT: v_add_u32_e32 v11, 0x100, v11 +; GCN-NEXT: v_add_u32_e32 v0, v11, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: s_waitcnt vmcnt(16) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -539,20 +469,22 @@ ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 
offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 @@ -571,8 +503,8 @@ ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -586,44 +518,25 @@ ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 
offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -642,6 +555,7 @@ ; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 @@ -682,26 +596,26 @@ ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 
s33 offset:404 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v12, v20 ; GCN-NEXT: v_mov_b32_e32 v13, v21 @@ -719,31 
+633,6 @@ ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -761,30 +650,36 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v8, v11 +; GCN-NEXT: v_mov_b32_e32 v9, v12 +; GCN-NEXT: v_mov_b32_e32 v10, v13 +; GCN-NEXT: v_mov_b32_e32 v11, v14 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 
offset:508 ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/agpr-csr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/agpr-csr.ll +++ llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -35,11 +35,11 @@ ; GCN-LABEL: {{^}}func_areg_33: ; GCN-NOT: a32 -; GFX90A: buffer_store_dword a32, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A: v_accvgpr_read_b32 v0, a32 ; Reload Reuse ; GCN-NOT: a32 ; GCN: use agpr32 ; GCN-NOT: a32 -; GFX90A: buffer_load_dword a32, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A: v_accvgpr_write_b32 a32, v0 ; Reload Reuse ; GCN-NOT: a32 ; GCN: s_setpc_b64 define void @func_areg_33() #0 { @@ -50,9 +50,9 @@ ; GCN-LABEL: {{^}}func_areg_64: ; GFX908-NOT: buffer_ ; GCN-NOT: v_accvgpr -; GFX90A: buffer_store_dword a63, +; GFX90A: 
v_accvgpr_read_b32 v0, a63 ; Reload Reuse ; GCN: use agpr63 -; GFX90A: buffer_load_dword a63, +; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse ; GCN-NOT: v_accvgpr ; GCN: s_setpc_b64 define void @func_areg_64() #0 { @@ -62,12 +62,13 @@ ; GCN-LABEL: {{^}}func_areg_31_63: ; GFX908-NOT: buffer_ -; GCN-NOT: v_accvgpr -; GFX90A: buffer_store_dword a63, +; GFX908-NOT: v_accvgpr +; GFX908-NOT: buffer +; GFX90A: v_accvgpr_read_b32 v0, a63 ; Reload Reuse ; GCN: use agpr31, agpr63 -; GFX90A: buffer_load_dword a63, -; GCN-NOT: buffer_ -; GCN-NOT: v_accvgpr +; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse +; GFX908-NOT: v_accvgpr +; GFX908-NOT: buffer ; GCN: s_setpc_b64 define void @func_areg_31_63() #0 { call void asm sideeffect "; use agpr31, agpr63", "~{a31},~{a63}" () Index: llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir +++ llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s --- # GCN-LABEL: name: alloc_vgpr_64 Index: llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir +++ llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s # Using the unaligned vector tuples are OK as long as they aren't used # in a real instruction. 
Index: llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -1,11 +1,11 @@ ; -enable-misched=false makes the register usage more predictable ; -regalloc=fast just makes the test run faster -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 -; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A -; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64 +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A +; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false --sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64 define internal void @use256vgprs() { %v0 = call i32 asm sideeffect "; def $0", "=v"() Index: llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -273,22 +273,22 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v1, s33, 63 -; GCN-COUNT-60: v_writelane_b32 v1 +; GCN-NEXT: v_writelane_b32 v0, s33, 63 +; GCN-COUNT-60: v_writelane_b32 v0 ; GCN: s_mov_b32 s33, s32 -; GCN-COUNT-2: v_writelane_b32 v1 +; GCN-COUNT-2: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: 
scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1 +; GCN: v_writelane_b32 v0 ; MUBUF: s_add_u32 s32, s32, 0x400 ; MUBUF: s_sub_u32 s32, s32, 0x400 ; FLATSCR: s_add_u32 s32, s32, 16 ; FLATSCR: s_sub_u32 s32, s32, 16 -; GCN-NEXT: v_readlane_b32 s33, v1, 63 +; GCN-NEXT: v_readlane_b32 s33, v0, 63 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload @@ -318,21 +318,21 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-62: v_writelane_b32 v1, +; GCN-COUNT-62: v_writelane_b32 v0, ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: v_writelane_b32 v1, +; GCN: v_writelane_b32 v0, ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1, +; GCN: v_writelane_b32 v0, ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_add_u32 s32, s32, 0x400 ; FLATSCR: s_add_u32 s32, s32, 16 -; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 +; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] @@ -388,24 +388,24 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v1, s33, 2 -; GCN-NEXT: v_writelane_b32 v1, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s33, 2 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN: v_writelane_b32 v1, s31, 1 +; GCN: v_writelane_b32 v0, s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: ;;#ASMSTART ; MUBUF: s_add_u32 s32, s32, 0x300 -; MUBUF-NEXT: v_readlane_b32 s4, v1, 0 -; MUBUF-NEXT: v_readlane_b32 s5, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s4, v0, 0 +; MUBUF-NEXT: v_readlane_b32 s5, v0, 1 ; FLATSCR: s_add_u32 s32, s32, 12 -; FLATSCR-NEXT: v_readlane_b32 s0, v1, 0 -; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s0, v0, 0 +; FLATSCR-NEXT: v_readlane_b32 s1, v0, 1 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 -; GCN-NEXT: v_readlane_b32 s33, v1, 2 +; GCN-NEXT: v_readlane_b32 s33, v0, 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload Index: llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2870,32 +2870,32 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v42, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: global_store_dword v[40:41], v0, off +; GFX9-NEXT: global_store_dword v[41:42], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v42, 0 -; GFX9-NEXT: v_readlane_b32 s5, v42, 1 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: v_readlane_b32 s33, v42, 2 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -2905,34 +2905,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v42, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_mov_b32_e32 v40, v0 -; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, 
off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v1 -; GFX10-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: global_store_dword v[40:41], v0, off +; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s4, v42, 0 -; GFX10-NEXT: v_readlane_b32 s5, v42, 1 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 -; GFX10-NEXT: v_readlane_b32 s33, v42, 2 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -184,33 +184,33 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v41, s33, 2 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v41, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: v_mov_b32_e32 v31, v41 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 -; GFX9-NEXT: v_readlane_b32 s5, v41, 1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: v_readlane_b32 s33, v41, 2 +; GFX9-NEXT: v_readlane_b32 s33, 
v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -220,34 +220,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v40, v31 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v31 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v31, v40 +; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v41, 0 -; GFX10-NEXT: v_readlane_b32 s5, v41, 1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 -; GFX10-NEXT: v_readlane_b32 s33, v41, 2 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -762,14 +762,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v41, s33, 3 -; GFX9-NEXT: v_writelane_b32 v41, s40, 0 -; GFX9-NEXT: v_writelane_b32 v41, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -779,23 +779,23 @@ ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, 
external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v41, s31, 2 -; GFX9-NEXT: v_mov_b32_e32 v40, v32 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_mov_b32_e32 v41, v32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s40 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v40 +; GFX9-NEXT: ; use v41 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v41, 1 -; GFX9-NEXT: v_readlane_b32 s5, v41, 2 -; GFX9-NEXT: v_readlane_b32 s40, v41, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: v_readlane_b32 s5, v40, 2 +; GFX9-NEXT: v_readlane_b32 s40, v40, 0 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: v_readlane_b32 s33, v41, 3 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -805,41 +805,41 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v41, s40, 0 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v40, v32 -; GFX10-NEXT: v_writelane_b32 v41, s30, 1 -; GFX10-NEXT: v_writelane_b32 v41, s31, 2 +; GFX10-NEXT: v_mov_b32_e32 v41, v32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v40 +; GFX10-NEXT: ; use v41 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v41, 1 -; GFX10-NEXT: v_readlane_b32 s5, v41, 2 -; GFX10-NEXT: v_readlane_b32 s40, v41, 0 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_readlane_b32 s5, v40, 2 +; GFX10-NEXT: v_readlane_b32 s40, v40, 0 ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 -; GFX10-NEXT: v_readlane_b32 s33, v41, 3 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte 
Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -202,32 +202,32 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -235,13 +235,13 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: 
s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -250,36 +250,36 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB2_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -292,32 +292,32 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; 
GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -325,13 +325,13 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] @@ -341,36 +341,36 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB3_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 
-; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -383,32 +383,32 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: 
v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -416,13 +416,13 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -431,37 +431,37 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB4_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -475,32 +475,32 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 19 +; GCN-NEXT: v_writelane_b32 v40, s33, 19 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s50, 15 -; GCN-NEXT: v_writelane_b32 v43, s51, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s50, 15 +; GCN-NEXT: v_writelane_b32 v40, s51, 16 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -508,20 +508,20 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NEXT: v_mov_b32_e32 v42, v0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc 
; GCN-NEXT: s_cbranch_execz BB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: v_writelane_b32 v43, s30, 17 -; GCN-NEXT: v_writelane_b32 v43, s31, 18 +; GCN-NEXT: v_writelane_b32 v40, s30, 17 +; GCN-NEXT: v_writelane_b32 v40, s31, 18 ; GCN-NEXT: s_mov_b64 s[48:49], exec ; GCN-NEXT: BB5_2: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v42 +; GCN-NEXT: v_readfirstlane_b32 s17, v43 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[42:43] ; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -530,40 +530,40 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] ; GCN-NEXT: s_cbranch_execnz BB5_2 ; GCN-NEXT: ; %bb.3: ; GCN-NEXT: s_mov_b64 exec, s[48:49] -; GCN-NEXT: v_readlane_b32 s30, v43, 17 -; GCN-NEXT: v_readlane_b32 s31, v43, 18 +; GCN-NEXT: v_readlane_b32 s30, v40, 17 +; GCN-NEXT: v_readlane_b32 s31, v40, 18 ; GCN-NEXT: BB5_4: ; %bb2 ; GCN-NEXT: s_or_b64 exec, exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s51, v43, 16 -; GCN-NEXT: v_readlane_b32 s50, v43, 15 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s51, v40, 16 +; GCN-NEXT: v_readlane_b32 s50, v40, 15 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x800 -; GCN-NEXT: v_readlane_b32 s33, v43, 19 +; GCN-NEXT: v_readlane_b32 s33, v40, 19 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] 
; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -583,26 +583,26 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v42, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 6 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v42, s34, 0 -; GCN-NEXT: v_writelane_b32 v42, s35, 1 -; GCN-NEXT: v_writelane_b32 v42, s36, 2 -; GCN-NEXT: v_writelane_b32 v42, s37, 3 -; GCN-NEXT: v_writelane_b32 v42, s30, 4 -; GCN-NEXT: v_writelane_b32 v42, s31, 5 -; GCN-NEXT: v_mov_b32_e32 v41, v1 -; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 +; GCN-NEXT: v_mov_b32_e32 v42, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[34:35], exec ; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v40 -; GCN-NEXT: v_readfirstlane_b32 s7, v41 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41] +; GCN-NEXT: v_readfirstlane_b32 s6, v41 +; GCN-NEXT: v_readfirstlane_b32 s7, v42 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[41:42] ; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc ; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] @@ -610,18 +610,18 @@ ; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_readlane_b32 s4, v42, 4 -; GCN-NEXT: v_readlane_b32 s5, v42, 5 -; GCN-NEXT: v_readlane_b32 s37, v42, 3 -; GCN-NEXT: v_readlane_b32 s36, v42, 2 -; GCN-NEXT: v_readlane_b32 s35, v42, 1 -; GCN-NEXT: v_readlane_b32 s34, v42, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v42, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -136,8 +136,9 @@ ; 
GCN-O0-NEXT: Live Register Matrix ; GCN-O0-NEXT: SI Pre-allocate WWM Registers ; GCN-O0-NEXT: Fast Register Allocator -; GCN-O0-NEXT: SI Fix VGPR copies ; GCN-O0-NEXT: SI lower SGPR spill instructions +; GCN-O0-NEXT: Fast Register Allocator +; GCN-O0-NEXT: SI Fix VGPR copies ; GCN-O0-NEXT: Fixup Statepoint Caller Saved ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O0-NEXT: Machine Optimization Remark Emitter @@ -363,6 +364,12 @@ ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Greedy Register Allocator +; GCN-O1-NEXT: Virtual Register Rewriter +; GCN-O1-NEXT: SI lower SGPR spill instructions +; GCN-O1-NEXT: Virtual Register Map +; GCN-O1-NEXT: Live Register Matrix +; GCN-O1-NEXT: Machine Optimization Remark Emitter +; GCN-O1-NEXT: Greedy Register Allocator ; GCN-O1-NEXT: GCN NSA Reassign ; GCN-O1-NEXT: Virtual Register Rewriter ; GCN-O1-NEXT: Stack Slot Coloring @@ -370,7 +377,6 @@ ; GCN-O1-NEXT: Machine Loop Invariant Code Motion ; GCN-O1-NEXT: SI Fix VGPR copies ; GCN-O1-NEXT: SI optimize exec mask operations -; GCN-O1-NEXT: SI lower SGPR spill instructions ; GCN-O1-NEXT: Fixup Statepoint Caller Saved ; GCN-O1-NEXT: PostRA Machine Sink ; GCN-O1-NEXT: MachineDominator Tree Construction @@ -647,6 +653,12 @@ ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Greedy Register Allocator +; GCN-O1-OPTS-NEXT: Virtual Register Rewriter +; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions +; GCN-O1-OPTS-NEXT: Virtual Register Map +; GCN-O1-OPTS-NEXT: Live Register Matrix +; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter +; GCN-O1-OPTS-NEXT: Greedy Register Allocator ; GCN-O1-OPTS-NEXT: GCN NSA Reassign ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: Stack Slot Coloring @@ -654,7 +666,6 @@ ; GCN-O1-OPTS-NEXT: Machine Loop Invariant Code Motion ; GCN-O1-OPTS-NEXT: SI Fix VGPR copies ; GCN-O1-OPTS-NEXT: SI optimize exec mask operations -; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions ; GCN-O1-OPTS-NEXT: Fixup Statepoint Caller Saved ; GCN-O1-OPTS-NEXT: PostRA Machine Sink ; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction @@ -931,6 +942,12 @@ ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Greedy Register Allocator +; GCN-O2-NEXT: Virtual Register Rewriter +; GCN-O2-NEXT: SI lower SGPR spill instructions +; GCN-O2-NEXT: Virtual Register Map +; GCN-O2-NEXT: Live Register Matrix +; GCN-O2-NEXT: Machine Optimization Remark Emitter +; GCN-O2-NEXT: Greedy Register Allocator ; GCN-O2-NEXT: GCN NSA Reassign ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: Stack Slot Coloring @@ -938,7 +955,6 @@ ; GCN-O2-NEXT: Machine Loop Invariant Code Motion ; GCN-O2-NEXT: SI Fix VGPR copies ; GCN-O2-NEXT: SI optimize exec mask operations -; GCN-O2-NEXT: SI lower SGPR spill instructions ; GCN-O2-NEXT: Fixup Statepoint Caller Saved ; GCN-O2-NEXT: PostRA Machine Sink ; GCN-O2-NEXT: MachineDominator Tree Construction @@ -1228,6 +1244,12 @@ ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Greedy Register Allocator +; GCN-O3-NEXT: Virtual Register Rewriter +; GCN-O3-NEXT: SI lower SGPR spill instructions +; GCN-O3-NEXT: Virtual Register Map +; GCN-O3-NEXT: Live Register Matrix +; GCN-O3-NEXT: Machine Optimization Remark Emitter +; GCN-O3-NEXT: Greedy Register Allocator ; GCN-O3-NEXT: 
GCN NSA Reassign ; GCN-O3-NEXT: Virtual Register Rewriter ; GCN-O3-NEXT: Stack Slot Coloring @@ -1235,7 +1257,6 @@ ; GCN-O3-NEXT: Machine Loop Invariant Code Motion ; GCN-O3-NEXT: SI Fix VGPR copies ; GCN-O3-NEXT: SI optimize exec mask operations -; GCN-O3-NEXT: SI lower SGPR spill instructions ; GCN-O3-NEXT: Fixup Statepoint Caller Saved ; GCN-O3-NEXT: PostRA Machine Sink ; GCN-O3-NEXT: MachineDominator Tree Construction Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -189,44 +189,44 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v43, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s35, 1 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 -; GFX9-NEXT: v_writelane_b32 v43, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 -; GFX9-NEXT: v_writelane_b32 v43, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 +; GFX9-NEXT: v_mov_b32_e32 v0, v41 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 +; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v43, 2 -; GFX9-NEXT: v_readlane_b32 s5, v43, 3 -; GFX9-NEXT: v_readlane_b32 s35, v43, 1 -; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 
; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 3 +; GFX9-NEXT: v_readlane_b32 s35, v40, 1 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s33, v43, 4 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] Index: llvm/test/CodeGen/AMDGPU/pei-build-spill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-build-spill.mir +++ llvm/test/CodeGen/AMDGPU/pei-build-spill.mir @@ -1,12 +1,12 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=1 
-run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s --- name: test_spill_v1 Index: llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -0,0 +1,108 @@ +; REQUIRES: asserts + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s +; RUN: llc -sgpr-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s + +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=O0 %s + +; RUN: llc -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s +; RUN: llc -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s +; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s + +; RUN: not --crash llc -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s +; RUN: not --crash llc -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s + + +; REGALLOC: -regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc + +; DEFAULT: Greedy Register Allocator +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: SI lower SGPR spill instructions +; DEFAULT-NEXT: Virtual Register Map +; DEFAULT-NEXT: Live Register Matrix +; DEFAULT-NEXT: Machine Optimization Remark Emitter +; DEFAULT-NEXT: Greedy Register Allocator +; DEFAULT-NEXT: GCN NSA Reassign +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: Stack Slot Coloring + +; O0: Fast Register Allocator +; O0-NEXT: SI lower SGPR spill instructions +; O0-NEXT: Fast Register Allocator +; O0-NEXT: SI Fix VGPR copies + + + + +; BASIC-DEFAULT: Debug Variable Analysis +; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis +; BASIC-DEFAULT-NEXT: Machine Natural Loop Construction +; BASIC-DEFAULT-NEXT: Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Basic Register Allocator +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges +; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis +; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter +; BASIC-DEFAULT-NEXT: Greedy Register Allocator +; BASIC-DEFAULT-NEXT: GCN NSA Reassign +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: Stack Slot Coloring + + + +; DEFAULT-BASIC: Greedy Register Allocator +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions +; DEFAULT-BASIC-NEXT: Virtual Register Map +; DEFAULT-BASIC-NEXT: Live Register Matrix +; DEFAULT-BASIC-NEXT: Basic Register Allocator +; DEFAULT-BASIC-NEXT: GCN NSA Reassign +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: Stack Slot Coloring + + + +; BASIC-BASIC: Debug Variable 
Analysis +; BASIC-BASIC-NEXT: Live Stack Slot Analysis +; BASIC-BASIC-NEXT: Machine Natural Loop Construction +; BASIC-BASIC-NEXT: Machine Block Frequency Analysis +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: SI lower SGPR spill instructions +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: GCN NSA Reassign +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: Stack Slot Coloring + + +declare void @bar() + +; Something with some CSR SGPR spills +define void @foo() { + call void asm sideeffect "; clobber", "~{s33}"() + call void @bar() + ret void +} + +; Block live out spills with fast regalloc +define amdgpu_kernel void @control_flow(i1 %cond) { + %s33 = call i32 asm sideeffect "; clobber", "={s33}"() + br i1 %cond, label %bb0, label %bb1 + +bb0: + call void asm sideeffect "; use %0", "s"(i32 %s33) + br label %bb1 + +bb1: + ret void +} Index: llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; The first 64 SGPR spills can go to a VGPR, but there isn't a second +; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. + +define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { +; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 0 +; GCN-NEXT: v_writelane_b32 v23, s9, 1 +; GCN-NEXT: v_writelane_b32 v23, s10, 2 +; GCN-NEXT: v_writelane_b32 v23, s11, 3 +; GCN-NEXT: v_writelane_b32 v23, s12, 4 +; GCN-NEXT: v_writelane_b32 v23, s13, 5 +; GCN-NEXT: v_writelane_b32 v23, s14, 6 +; GCN-NEXT: v_writelane_b32 v23, s15, 7 +; GCN-NEXT: v_writelane_b32 v23, s16, 8 +; GCN-NEXT: v_writelane_b32 v23, s17, 9 +; GCN-NEXT: v_writelane_b32 v23, s18, 10 +; GCN-NEXT: v_writelane_b32 v23, s19, 11 +; GCN-NEXT: v_writelane_b32 v23, s20, 12 +; GCN-NEXT: v_writelane_b32 v23, s21, 13 +; GCN-NEXT: v_writelane_b32 v23, s22, 14 +; GCN-NEXT: v_writelane_b32 v23, s23, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 16 +; GCN-NEXT: v_writelane_b32 v23, s9, 17 +; GCN-NEXT: v_writelane_b32 v23, s10, 18 +; GCN-NEXT: v_writelane_b32 v23, s11, 19 +; GCN-NEXT: v_writelane_b32 v23, s12, 20 +; GCN-NEXT: v_writelane_b32 v23, s13, 21 +; GCN-NEXT: v_writelane_b32 v23, s14, 22 +; GCN-NEXT: v_writelane_b32 v23, s15, 23 +; GCN-NEXT: v_writelane_b32 v23, s16, 24 +; GCN-NEXT: v_writelane_b32 v23, s17, 25 +; GCN-NEXT: v_writelane_b32 v23, s18, 26 +; GCN-NEXT: v_writelane_b32 v23, s19, 27 +; GCN-NEXT: v_writelane_b32 v23, 
s20, 28 +; GCN-NEXT: v_writelane_b32 v23, s21, 29 +; GCN-NEXT: v_writelane_b32 v23, s22, 30 +; GCN-NEXT: v_writelane_b32 v23, s23, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 32 +; GCN-NEXT: v_writelane_b32 v23, s9, 33 +; GCN-NEXT: v_writelane_b32 v23, s10, 34 +; GCN-NEXT: v_writelane_b32 v23, s11, 35 +; GCN-NEXT: v_writelane_b32 v23, s12, 36 +; GCN-NEXT: v_writelane_b32 v23, s13, 37 +; GCN-NEXT: v_writelane_b32 v23, s14, 38 +; GCN-NEXT: v_writelane_b32 v23, s15, 39 +; GCN-NEXT: v_writelane_b32 v23, s16, 40 +; GCN-NEXT: v_writelane_b32 v23, s17, 41 +; GCN-NEXT: v_writelane_b32 v23, s18, 42 +; GCN-NEXT: v_writelane_b32 v23, s19, 43 +; GCN-NEXT: v_writelane_b32 v23, s20, 44 +; GCN-NEXT: v_writelane_b32 v23, s21, 45 +; GCN-NEXT: v_writelane_b32 v23, s22, 46 +; GCN-NEXT: v_writelane_b32 v23, s23, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 48 +; GCN-NEXT: v_writelane_b32 v23, s9, 49 +; GCN-NEXT: v_writelane_b32 v23, s10, 50 +; GCN-NEXT: v_writelane_b32 v23, s11, 51 +; GCN-NEXT: v_writelane_b32 v23, s12, 52 +; GCN-NEXT: v_writelane_b32 v23, s13, 53 +; GCN-NEXT: v_writelane_b32 v23, s14, 54 +; GCN-NEXT: v_writelane_b32 v23, s15, 55 +; GCN-NEXT: v_writelane_b32 v23, s16, 56 +; GCN-NEXT: v_writelane_b32 v23, s17, 57 +; GCN-NEXT: v_writelane_b32 v23, s18, 58 +; GCN-NEXT: v_writelane_b32 v23, s19, 59 +; GCN-NEXT: v_writelane_b32 v23, s20, 60 +; GCN-NEXT: v_writelane_b32 v23, s21, 61 +; GCN-NEXT: v_writelane_b32 v23, s22, 62 +; GCN-NEXT: v_writelane_b32 v23, s23, 63 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[6:7] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_writelane_b32 v0, s6, 0 +; GCN-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s4, s5 +; GCN-NEXT: s_cbranch_scc1 BB0_2 +; GCN-NEXT: ; %bb.1: ; %bb0 +; GCN-NEXT: v_readlane_b32 s4, v23, 0 +; GCN-NEXT: v_readlane_b32 s5, v23, 1 +; GCN-NEXT: v_readlane_b32 s6, v23, 2 +; GCN-NEXT: v_readlane_b32 s7, v23, 3 +; GCN-NEXT: v_readlane_b32 s8, v23, 4 +; GCN-NEXT: v_readlane_b32 s9, v23, 5 +; GCN-NEXT: v_readlane_b32 s10, v23, 6 +; GCN-NEXT: v_readlane_b32 s11, v23, 7 +; GCN-NEXT: v_readlane_b32 s12, v23, 8 +; GCN-NEXT: v_readlane_b32 s13, v23, 9 +; GCN-NEXT: v_readlane_b32 s14, v23, 10 +; GCN-NEXT: v_readlane_b32 s15, v23, 11 +; GCN-NEXT: v_readlane_b32 s16, v23, 12 +; GCN-NEXT: v_readlane_b32 s17, v23, 13 +; GCN-NEXT: v_readlane_b32 s18, v23, 14 +; GCN-NEXT: v_readlane_b32 s19, v23, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s4, v23, 16 +; GCN-NEXT: v_readlane_b32 s5, v23, 17 +; GCN-NEXT: v_readlane_b32 s6, v23, 18 +; GCN-NEXT: v_readlane_b32 s7, v23, 19 +; GCN-NEXT: v_readlane_b32 s8, v23, 20 +; GCN-NEXT: v_readlane_b32 s9, v23, 21 +; GCN-NEXT: v_readlane_b32 s10, v23, 22 +; GCN-NEXT: v_readlane_b32 s11, v23, 23 +; GCN-NEXT: v_readlane_b32 s12, v23, 24 +; GCN-NEXT: v_readlane_b32 s13, v23, 25 +; GCN-NEXT: v_readlane_b32 s14, v23, 26 +; GCN-NEXT: v_readlane_b32 s15, v23, 27 +; GCN-NEXT: v_readlane_b32 s16, v23, 28 +; GCN-NEXT: v_readlane_b32 s17, v23, 29 +; GCN-NEXT: 
v_readlane_b32 s18, v23, 30 +; GCN-NEXT: v_readlane_b32 s19, v23, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s4, v23, 32 +; GCN-NEXT: v_readlane_b32 s5, v23, 33 +; GCN-NEXT: v_readlane_b32 s6, v23, 34 +; GCN-NEXT: v_readlane_b32 s7, v23, 35 +; GCN-NEXT: v_readlane_b32 s8, v23, 36 +; GCN-NEXT: v_readlane_b32 s9, v23, 37 +; GCN-NEXT: v_readlane_b32 s10, v23, 38 +; GCN-NEXT: v_readlane_b32 s11, v23, 39 +; GCN-NEXT: v_readlane_b32 s12, v23, 40 +; GCN-NEXT: v_readlane_b32 s13, v23, 41 +; GCN-NEXT: v_readlane_b32 s14, v23, 42 +; GCN-NEXT: v_readlane_b32 s15, v23, 43 +; GCN-NEXT: v_readlane_b32 s16, v23, 44 +; GCN-NEXT: v_readlane_b32 s17, v23, 45 +; GCN-NEXT: v_readlane_b32 s18, v23, 46 +; GCN-NEXT: v_readlane_b32 s19, v23, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s8, v23, 48 +; GCN-NEXT: v_readlane_b32 s9, v23, 49 +; GCN-NEXT: v_readlane_b32 s10, v23, 50 +; GCN-NEXT: v_readlane_b32 s11, v23, 51 +; GCN-NEXT: v_readlane_b32 s12, v23, 52 +; GCN-NEXT: v_readlane_b32 s13, v23, 53 +; GCN-NEXT: v_readlane_b32 s14, v23, 54 +; GCN-NEXT: v_readlane_b32 s15, v23, 55 +; GCN-NEXT: v_readlane_b32 s16, v23, 56 +; GCN-NEXT: v_readlane_b32 s17, v23, 57 +; GCN-NEXT: v_readlane_b32 s18, v23, 58 +; GCN-NEXT: v_readlane_b32 s19, v23, 59 +; GCN-NEXT: v_readlane_b32 s20, v23, 60 +; GCN-NEXT: v_readlane_b32 s21, v23, 61 +; GCN-NEXT: v_readlane_b32 s22, v23, 62 +; GCN-NEXT: v_readlane_b32 s23, v23, 63 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:5] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: BB0_2: ; %ret +; GCN-NEXT: s_endpgm + call void asm sideeffect "", "~{v[0:7]}" () #0 + call void asm sideeffect "", "~{v[8:15]}" () #0 + call void asm sideeffect "", "~{v[16:19]}"() #0 + call void asm sideeffect "", "~{v[20:21]}"() #0 + call void asm sideeffect "", "~{v22}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } Index: llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -1,5 +1,8 
@@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s + +# -run-pass is used to artificially avoid using split register allocation, which would otherwise avoid stressing StackSlotColoring. + # Make sure that stack slot coloring doesn't try to merge frame # indexes used for SGPR spilling with those that aren't. Index: llvm/test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -197,15 +197,15 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v42, s34, 0 -; GCN-DAG: v_writelane_b32 v42, s35, 1 +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 [[CSRV]], s34, 0 +; GCN-DAG: v_writelane_b32 [[CSRV]], s35, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -214,20 +214,20 @@ ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-DAG: v_readlane_b32 s34, v42, 0 -; GCN-DAG: v_readlane_b32 s35, v42, 1 +; GCN-DAG: v_readlane_b32 s34, [[CSRV]], 0 +; GCN-DAG: v_readlane_b32 s35, [[CSRV]], 1 ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s33, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { Index: llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir =================================================================== ---
llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-dce-in-ra=0 -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-dce-in-ra=0 -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy,1 -o - %s | FileCheck %s # https://bugs.llvm.org/show_bug.cgi?id=33620 --- Index: llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s -; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck --check-prefix=CHECK %s ; RUN: llc -march=amdgcn -mattr=-xnack -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s ; Index: llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -22,12 +22,11 @@ } ; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v1, s98, 63 -; CHECK-DAG: v_writelane_b32 v2, s99, 0 +; CHECK-DAG: v_writelane_b32 v0, s98, 63 +; CHECK-DAG: v_writelane_b32 v1, s99, 0 ; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v2, 0 -; CHECK-DAG: v_readlane_b32 s98, v1, 63 - +; CHECK-DAG: v_readlane_b32 s99, v1, 0 +; CHECK-DAG: v_readlane_b32 s98, v0, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca Index: llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -1,4 +1,7 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -stress-regalloc=1 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck %s +# Note we are NOT using the normal register allocator pipeline. We are +# forcing allocation of VGPRs and SGPRs at the same time.
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -stress-regalloc=1 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck %s
+
---
# CHECK-LABEL: name: no_merge_sgpr_vgpr_spill_slot{{$}}
Index: llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -89,7 +89,6 @@
; SI-OPT: bb12:
; SI-OPT-NEXT: store float 0.000000e+00, float addrspace(1)* null, align 8
; SI-OPT-NEXT: ret void
-;
bb:
  %tmp = load i32, i32 addrspace(1)* null, align 16
  %tmp1 = icmp slt i32 %tmp, 21
@@ -187,30 +186,30 @@
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_writelane_b32 v43, s33, 4
+; GCN-NEXT: v_writelane_b32 v40, s33, 4
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x800
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_writelane_b32 v43, s34, 0
-; GCN-NEXT: v_writelane_b32 v43, s35, 1
-; GCN-NEXT: v_writelane_b32 v43, s36, 2
-; GCN-NEXT: v_writelane_b32 v43, s37, 3
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: v_writelane_b32 v40, s34, 0
+; GCN-NEXT: v_writelane_b32 v40, s35, 1
+; GCN-NEXT: v_writelane_b32 v40, s36, 2
+; GCN-NEXT: v_writelane_b32 v40, s37, 3
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-NEXT: flat_load_dword v40, v[1:2]
-; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: flat_load_dword v41, v[1:2]
+; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: s_getpc_b64 s[36:37]
; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4
; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12
-; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40
+; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v41
; GCN-NEXT: s_branch BB1_3
; GCN-NEXT: BB1_1: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
@@ -229,7 +228,7 @@
; GCN-NEXT: BB1_4: ; %bb2
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
-; GCN-NEXT: flat_load_dword v0, v[41:42]
+; GCN-NEXT: flat_load_dword v0, v[42:43]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(1)
@@ -273,7 +272,7 @@
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: BB1_10: ; %bb17
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
; GCN-NEXT: s_branch BB1_2
bb:
  %tmp = load float, float* null, align 16
Index: llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -8,11 +8,11 @@
; preserved across the call and should get 8 scratch registers.
; GFX9-LABEL: non_preserved_vgpr_tuple8:
-; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
@@ -21,28 +21,28 @@
; GFX9-NEXT: v_mov_b32_e32 v32, v12
; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1
+; GFX9: image_gather4_c_b_cl v[41:44], v[32:39], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_writelane_b32 v44, s30, 0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
-; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
@@ -53,7 +53,7 @@
; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
-; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10: image_gather4_c_b_cl v[41:44], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
@@ -62,12 +62,12 @@
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10: buffer_load_dword v43, off, s[0:3], s33
-; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12
+; GFX10: buffer_load_dword v44, off, s[0:3], s33
+; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
+; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
-; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10: s_setpc_b64 s[4:5]
main_body:
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
@@ -86,20 +86,20 @@
; The upper 3 sub-registers are unused.
; GFX9-LABEL: call_preserved_vgpr_tuple8:
-; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
-
-; GFX9: v_mov_b32_e32 v44, v16
-; GFX9-NEXT: v_mov_b32_e32 v43, v15
-; GFX9-NEXT: v_mov_b32_e32 v42, v14
-; GFX9-NEXT: v_mov_b32_e32 v41, v13
-; GFX9-NEXT: v_mov_b32_e32 v40, v12
-
-; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
+; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9: buffer_store_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill
+
+; GFX9: v_mov_b32_e32 v60, v16
+; GFX9-NEXT: v_mov_b32_e32 v59, v15
+; GFX9-NEXT: v_mov_b32_e32 v58, v14
+; GFX9-NEXT: v_mov_b32_e32 v57, v13
+; GFX9-NEXT: v_mov_b32_e32 v56, v12
+
+; GFX9: image_gather4_c_b_cl v[0:3], v[56:63], s[36:43], s[4:7] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -108,24 +108,24 @@
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[56:63], s[36:43], s[4:7] dmask:0x1
-; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9: buffer_load_dword v60, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: call_preserved_vgpr_tuple8:
-; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
@@ -133,24 +133,24 @@
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v40, v16
+; GFX10-NEXT: v_mov_b32_e32 v41, v16
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v41, v15
-; GFX10-NEXT: v_mov_b32_e32 v42, v14
-; GFX10-NEXT: v_mov_b32_e32 v43, v13
-; GFX10-NEXT: v_mov_b32_e32 v44, v12
+; GFX10-NEXT: v_mov_b32_e32 v42, v15
+; GFX10-NEXT: v_mov_b32_e32 v43, v14
+; GFX10-NEXT: v_mov_b32_e32 v44, v13
+; GFX10-NEXT: v_mov_b32_e32 v45, v12
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-
-; GFX10: buffer_load_dword v44, off, s[0:3], s33
-; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
-; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16
-; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20
+; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+
+; GFX10: buffer_load_dword v45, off, s[0:3], s33{{$}}
+; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4
+; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8
+; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
+; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:20
; GFX10: s_setpc_b64 s[4:5]
main_body:
  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
Index: llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
+++ llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck %s
# The undef copy of %4 is allocated to $vgpr3, and the identity copy
# was deleted, and $vgpr3 was considered undef. The code to replace
@@ -31,7 +31,7 @@
; CHECK-LABEL: name: undef_identity_copy
; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1)
; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc
- ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95
+ ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc
; CHECK: $sgpr4 = COPY $sgpr95
; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @foo, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4
; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95